From 137a886b658477caab16b5efd3c8bfcf2f1888c0 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 14 Jun 2019 14:42:00 +0200 Subject: [PATCH 001/238] REF: Refactor signature of RangeIndex._simple_new (#26722) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/indexes/range.py | 71 ++++++++++++++---------------- pandas/tests/indexes/test_range.py | 8 +++- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b991f53df3a0d0..78d3d8fcb3d011 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -525,7 +525,7 @@ Performance Improvements - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) -- Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`) +- Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) - Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4fb9c4197109fc..5bf97f44edeed3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -273,8 +273,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if isinstance(data, RangeIndex): return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) elif isinstance(data, range): - return RangeIndex.from_range(data, copy=copy, dtype=dtype, - name=name) + return RangeIndex.from_range(data, dtype=dtype, name=name) # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7daeb9b644a9b1..ab39969af8db02 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -6,7 +6,7 @@ import numpy as np -from pandas._libs import index as libindex, lib +from pandas._libs import index as libindex import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly @@ -82,16 +82,15 @@ def __new__(cls, start=None, stop=None, step=None, "removed in a future version.", FutureWarning, stacklevel=2) if fastpath: - return cls._simple_new(start, stop, step, name=name) + return cls._simple_new(range(start, stop, step), name=name) cls._validate_dtype(dtype) # RangeIndex if isinstance(start, RangeIndex): - if name is None: - name = start.name - return cls._simple_new(name=name, - **dict(start._get_data_as_items())) + name = start.name if name is None else name + start = start._range + return cls._simple_new(start, dtype=dtype, name=name) # validate the arguments if com._all_none(start, stop, step): @@ -108,10 +107,11 @@ def __new__(cls, start=None, stop=None, step=None, if step == 0: raise ValueError("Step must not be zero") - return cls._simple_new(start, stop, 
step, name) + rng = range(start, stop, step) + return cls._simple_new(rng, dtype=dtype, name=name) @classmethod - def from_range(cls, data, name=None, dtype=None, **kwargs): + def from_range(cls, data, name=None, dtype=None): """ Create RangeIndex from a range object. @@ -124,26 +124,21 @@ def from_range(cls, data, name=None, dtype=None, **kwargs): '{0}(...) must be called with object coercible to a ' 'range, {1} was passed'.format(cls.__name__, repr(data))) - start, stop, step = data.start, data.stop, data.step - return cls(start, stop, step, dtype=dtype, name=name, **kwargs) + cls._validate_dtype(dtype) + return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, start, stop=None, step=None, name=None, - dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None, **kwargs): result = object.__new__(cls) # handle passed None, non-integers - if start is None and stop is None: + if values is None: # empty - start, stop, step = 0, 0, 1 + values = range(0, 0, 1) + elif not isinstance(values, range): + return Index(values, dtype=dtype, name=name, **kwargs) - if start is None or not is_integer(start): - try: - return cls(start, stop, step, name=name, **kwargs) - except TypeError: - return Index(start, stop, step, name=name, **kwargs) - - result._range = range(start, stop or 0, step or 1) + result._range = values result.name = name for k, v in kwargs.items(): @@ -360,8 +355,7 @@ def tolist(self): def _shallow_copy(self, values=None, **kwargs): if values is None: name = kwargs.get("name", self.name) - return self._simple_new( - name=name, **dict(self._get_data_as_items())) + return self._simple_new(self._range, name=name) else: kwargs.setdefault('name', self.name) return self._int64index._shallow_copy(values, **kwargs) @@ -480,11 +474,13 @@ def intersection(self, other, sort=False): tmp_start = first.start + (second.start - first.start) * \ first.step // gcd * s new_step = first.step * second.step // gcd - new_index = self._simple_new(tmp_start, int_high, new_step) + new_range = range(tmp_start, int_high, new_step) + new_index = self._simple_new(new_range) # adjust index to limiting interval new_start = new_index._min_fitting_element(int_low) - new_index = self._simple_new(new_start, new_index.stop, new_index.step) + new_range = range(new_start, new_index.stop, new_index.step) + new_index = self._simple_new(new_range) if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] @@ -609,12 +605,10 @@ def __getitem__(self, key): """ Conserve RangeIndex type for scalar and slice keys. 
""" - if is_scalar(key): - if not lib.is_integer(key): - raise IndexError("only integers, slices (`:`), " - "ellipsis (`...`), numpy.newaxis (`None`) " - "and integer or boolean " - "arrays are valid indices") + if isinstance(key, slice): + new_range = self._range[key] + return self._simple_new(new_range, name=self.name) + elif is_integer(key): new_key = int(key) try: return self._range[new_key] @@ -622,10 +616,11 @@ def __getitem__(self, key): raise IndexError("index {key} is out of bounds for axis 0 " "with size {size}".format(key=key, size=len(self))) - if isinstance(key, slice): - new_range = self._range[key] - return self.from_range(new_range, name=self.name) - + elif is_scalar(key): + raise IndexError("only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices") # fall back to Int64Index return super().__getitem__(key) @@ -640,10 +635,12 @@ def __floordiv__(self, other): start = self.start // other step = self.step // other stop = start + len(self) * step - return self._simple_new(start, stop, step, name=self.name) + new_range = range(start, stop, step or 1) + return self._simple_new(new_range, name=self.name) if len(self) == 1: start = self.start // other - return self._simple_new(start, start + 1, 1, name=self.name) + new_range = range(start, start + 1, 1) + return self._simple_new(new_range, name=self.name) return self._int64index // other def all(self) -> bool: diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 6eece0ed8efeec..3f474b0166b159 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -94,8 +94,9 @@ def test_constructor_same(self): def test_constructor_range(self): - with pytest.raises(TypeError): - RangeIndex(range(1, 5, 2)) + msg = "Value needs to be a scalar value, was type " + with pytest.raises(TypeError, match=msg): + result = RangeIndex(range(1, 5, 2)) result = RangeIndex.from_range(range(1, 5, 2)) expected = RangeIndex(1, 5, 2) @@ -120,6 +121,9 @@ def test_constructor_range(self): with pytest.raises(TypeError): Index(range(1, 5, 2), dtype='float64') + msg = r'^from_range\(\) got an unexpected keyword argument' + with pytest.raises(TypeError, match=msg): + pd.RangeIndex.from_range(range(10), copy=True) def test_constructor_name(self): # GH12288 From 7ecfa8eb1242a78ebb71da2f18a5b380488b1c2e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 14 Jun 2019 14:51:05 +0200 Subject: [PATCH 002/238] TST: test custom _formatter for ExtensionArray + revert ExtensionArrayFormatter removal (#26845) * TST: test custom _formatter for ExtensionArray * Revert "REF: remove ExtensionArrayFormatter (#26833)" This reverts commit a00659a82c9bbce29554d75c91d8e897d064ac18. 
--- pandas/core/indexes/interval.py | 8 ++-- pandas/io/formats/format.py | 38 +++++++++++++------ pandas/tests/extension/decimal/array.py | 5 +++ .../tests/extension/decimal/test_decimal.py | 8 +++- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 18aa0580d7df76..24fcb32d09d276 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1061,9 +1061,11 @@ def _format_with_header(self, header, **kwargs): def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs): """ actually format my specific types """ - from pandas.io.formats.format import format_array - return format_array(values=self, na_rep=na_rep, justify='all', - leading_space=False) + from pandas.io.formats.format import ExtensionArrayFormatter + return ExtensionArrayFormatter(values=self, + na_rep=na_rep, + justify='all', + leading_space=False).get_result() def _format_data(self, name=None): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 66af6c2172344c..f632bc13a5b241 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -849,7 +849,7 @@ def _get_column_name_list(self): # Array formatters -def format_array(values, formatter=None, float_format=None, na_rep='NaN', +def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', leading_space=None): """ @@ -879,23 +879,14 @@ def format_array(values, formatter=None, float_format=None, na_rep='NaN', List[str] """ - if is_extension_array_dtype(values.dtype): - if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values._values - - if is_categorical_dtype(values.dtype): - # Categorical is special for now, so that we can preserve tzinfo - values = values.get_values() - - if not is_datetime64tz_dtype(values.dtype): - values = np.asarray(values) - if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter + elif is_extension_array_dtype(values.dtype): + fmt_klass = ExtensionArrayFormatter elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_integer_dtype(values.dtype): @@ -1190,6 +1181,29 @@ def _format_strings(self): return fmt_values.tolist() +class ExtensionArrayFormatter(GenericArrayFormatter): + def _format_strings(self): + values = self.values + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values + + formatter = values._formatter(boxed=True) + + if is_categorical_dtype(values.dtype): + # Categorical is special for now, so that we can preserve tzinfo + array = values.get_values() + else: + array = np.asarray(values) + + fmt_values = format_array(array, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, digits=self.digits, + space=self.space, justify=self.justify, + leading_space=self.leading_space) + return fmt_values + + def format_percentiles(percentiles): """ Outputs rounded and formatted percentiles. 
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 1823eeb4d7fc08..3b95c8d919eb1a 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -137,6 +137,11 @@ def isna(self): def _na_value(self): return decimal.Decimal('NaN') + def _formatter(self, boxed=False): + if boxed: + return "Decimal: {0}".format + return repr + @classmethod def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 97fae41bcc7200..94c0b61c6382a2 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -200,7 +200,13 @@ class TestSetitem(BaseDecimal, base.BaseSetitemTests): class TestPrinting(BaseDecimal, base.BasePrintingTests): - pass + + def test_series_repr(self, data): + # Overriding this base test to explicitly test that + # the custom _formatter is used + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + assert "Decimal: " in repr(ser) # TODO(extension) From f6e33a010fa30f25b032618cbb9677fb9dbbe8fd Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 14 Jun 2019 14:56:32 +0100 Subject: [PATCH 003/238] DOC: Fixing even more warnings (not many left) (#26850) --- doc/source/index.rst.template | 6 ++---- doc/source/whatsnew/v0.17.1.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 4 ++-- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/indexes/base.py | 12 ++++++++++++ 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index f18c61b5e2f951..b57ce83cfc33c9 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -38,8 +38,7 @@ See the :ref:`overview` for more detail about what's in the library. :maxdepth: 3 :hidden: {% endif %} - - {% if not single_doc -%} +{% if not single_doc %} What's New in 0.25.0 install getting_started/index @@ -52,8 +51,7 @@ See the :ref:`overview` for more detail about what's in the library. {% if not single_doc -%} development/index whatsnew/index -{% endif -%} - +{% endif %} * :doc:`whatsnew/v0.25.0` * :doc:`install` diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index c4dc442bd7354d..9de49699b96523 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -61,7 +61,7 @@ We can render the HTML to get the following table. :file: whatsnew_0171_html_table.html :class:`~pandas.core.style.Styler` interacts nicely with the Jupyter Notebook. -See the :ref:`documentation ` for more. +See the :ref:`documentation ` for more. .. _whatsnew_0171.enhancements: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 8e6ad07ec84358..51c8c488fb9d90 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -387,7 +387,7 @@ For example, after running the following, ``styled.xlsx`` renders as below: import os os.remove('styled.xlsx') -See the :ref:`Style documentation ` for more detail. +See the :ref:`Style documentation ` for more detail. .. _whatsnew_0200.enhancements.intervalindex: @@ -497,7 +497,7 @@ Other Enhancements - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) - ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). 
- HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) -- :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) +- :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) - :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) - Compatibility with Jupyter notebook 5.0; MultiIndex column labels are left-aligned and MultiIndex row-labels are top-aligned (:issue:`15379`) - ``TimedeltaIndex`` now has a custom date-tick formatter specifically designed for nanosecond level precision (:issue:`8711`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 78d3d8fcb3d011..207d16afd350f9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -283,7 +283,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t s s.str.startswith(b'a') -.. _whatsnew_0250.api_breaking.incompatible_index_unions +.. _whatsnew_0250.api_breaking.incompatible_index_unions: Incompatible Index Type Unions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5bf97f44edeed3..9f0f89a0e34f59 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1410,6 +1410,9 @@ def rename(self, name, inplace=False): @property def nlevels(self): + """ + Number of levels. + """ return 1 def _sort_levels_monotonic(self): @@ -1739,6 +1742,9 @@ def is_mixed(self): return self.inferred_type in ['mixed'] def holds_integer(self): + """ + Whether the type is an integer type. + """ return self.inferred_type in ['integer', 'mixed-integer'] @cache_readonly @@ -3965,6 +3971,9 @@ def _is_memory_usage_qualified(self): return self.is_object() def is_type_compatible(self, kind): + """ + Whether the index type is compatible with the provided type. + """ return kind == self.inferred_type _index_shared_docs['contains'] = """ @@ -4337,6 +4346,9 @@ def sort_values(self, return_indexer=False, ascending=True): return sorted_index def sort(self, *args, **kwargs): + """ + Use sort_values instead. 
+ """ raise TypeError("cannot sort an Index object in-place, use " "sort_values instead") From 430f0fd04646a0ab44ccbfbad706dfe22e9fabfe Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 14 Jun 2019 16:22:42 +0200 Subject: [PATCH 004/238] CLN: remove util._decorators.make_signature and make related changes (#26819) --- pandas/_typing.py | 4 +- pandas/core/groupby/base.py | 69 ---------------------------------- pandas/core/groupby/generic.py | 52 +++++++++++++++++++++++-- pandas/tests/util/test_util.py | 18 --------- pandas/util/_decorators.py | 30 --------------- 5 files changed, 52 insertions(+), 121 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index a2bb168c1e2daf..0044b269eb7b58 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( - ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries) + ABCDataFrame, ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries) AnyArrayLike = TypeVar('AnyArrayLike', ABCExtensionArray, @@ -22,3 +22,5 @@ Timedelta) Dtype = Union[str, np.dtype, ExtensionDtype] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] + +FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 823a4155bc2b84..cffe0e589c6bc9 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -3,11 +3,6 @@ hold the whitelist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. """ - -import types - -from pandas.util._decorators import make_signature - from pandas.core.dtypes.common import is_list_like, is_scalar @@ -91,67 +86,3 @@ def _gotitem(self, key, ndim, subset=None): cython_cast_blacklist = frozenset(['rank', 'count', 'size', 'idxmin', 'idxmax']) - - -def whitelist_method_generator(base, klass, whitelist): - """ - Yields all GroupBy member defs for DataFrame/Series names in whitelist. - - Parameters - ---------- - base : class - base class - klass : class - class where members are defined. - Should be Series or DataFrame - whitelist : list - list of names of klass methods to be constructed - - Returns - ------- - The generator yields a sequence of strings, each suitable for exec'ing, - that define implementations of the named methods for DataFrameGroupBy - or SeriesGroupBy. - - Since we don't want to override methods explicitly defined in the - base class, any such name is skipped. - """ - - method_wrapper_template = \ - """def %(name)s(%(sig)s) : - \""" - %(doc)s - \""" - f = %(self)s.__getattr__('%(name)s') - return f(%(args)s)""" - property_wrapper_template = \ - """@property -def %(name)s(self) : - \"""%(doc)s\""" - return self.__getattr__('%(name)s')""" - - for name in whitelist: - # don't override anything that was explicitly defined - # in the base class - if hasattr(base, name): - continue - # ugly, but we need the name string itself in the method. - f = getattr(klass, name) - doc = f.__doc__ - doc = doc if type(doc) == str else '' - if isinstance(f, types.MethodType): - wrapper_template = method_wrapper_template - decl, args = make_signature(f) - # pass args by name to f because otherwise - # GroupBy._make_wrapper won't know whether - # we passed in an axis parameter. 
- args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]] - params = {'name': name, - 'doc': doc, - 'sig': ','.join(decl), - 'self': args[0], - 'args': ','.join(args_by_name)} - else: - wrapper_template = property_wrapper_template - params = {'name': name, 'doc': doc} - yield wrapper_template % params diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 57d14cb4c15d79..35ffa552913aeb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -11,7 +11,7 @@ from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, List, Union +from typing import Any, Callable, FrozenSet, Iterator, List, Type, Union import warnings import numpy as np @@ -27,6 +27,7 @@ is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com @@ -48,6 +49,51 @@ AggScalar = Union[str, Callable[..., Any]] +def whitelist_method_generator(base_class: Type[GroupBy], + klass: Type[FrameOrSeries], + whitelist: FrozenSet[str], + ) -> Iterator[str]: + """ + Yields all GroupBy member defs for DataFrame/Series names in whitelist. + + Parameters + ---------- + base_class : Groupby class + base class + klass : DataFrame or Series class + class where members are defined. + whitelist : frozenset + Set of names of klass methods to be constructed + + Returns + ------- + The generator yields a sequence of strings, each suitable for exec'ing, + that define implementations of the named methods for DataFrameGroupBy + or SeriesGroupBy. + + Since we don't want to override methods explicitly defined in the + base class, any such name is skipped. + """ + property_wrapper_template = \ + """@property +def %(name)s(self) : + \"""%(doc)s\""" + return self.__getattr__('%(name)s')""" + + for name in whitelist: + # don't override anything that was explicitly defined + # in the base class + if hasattr(base_class, name): + continue + # ugly, but we need the name string itself in the method. + f = getattr(klass, name) + doc = f.__doc__ + doc = doc if type(doc) == str else '' + wrapper_template = property_wrapper_template + params = {'name': name, 'doc': doc} + yield wrapper_template % params + + class NDFrameGroupBy(GroupBy): def _iterate_slices(self): @@ -685,7 +731,7 @@ class SeriesGroupBy(GroupBy): # Make class defs of attributes on SeriesGroupBy whitelist _apply_whitelist = base.series_apply_whitelist - for _def_str in base.whitelist_method_generator( + for _def_str in whitelist_method_generator( GroupBy, Series, _apply_whitelist): exec(_def_str) @@ -1289,7 +1335,7 @@ class DataFrameGroupBy(NDFrameGroupBy): # # Make class defs of attributes on DataFrameGroupBy whitelist. 
- for _def_str in base.whitelist_method_generator( + for _def_str in whitelist_method_generator( GroupBy, DataFrame, _apply_whitelist): exec(_def_str) diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index a3b82ecc12a1be..88ce48245dc70c 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -5,8 +5,6 @@ import pandas.compat as compat from pandas.compat import raise_with_traceback -from pandas.util._decorators import deprecate_kwarg, make_signature -from pandas.util._validators import validate_kwargs import pandas.util.testing as tm @@ -37,22 +35,6 @@ def test_numpy_err_state_is_default(): assert np.geterr() == expected -@pytest.mark.parametrize("func,expected", [ - # Case where the func does not have default kwargs. - (validate_kwargs, (["fname", "kwargs", "compat_args"], - ["fname", "kwargs", "compat_args"])), - - # Case where the func does have default kwargs. - (deprecate_kwarg, (["old_arg_name", "new_arg_name", - "mapping=None", "stacklevel=2"], - ["old_arg_name", "new_arg_name", - "mapping", "stacklevel"])) -]) -def test_make_signature(func, expected): - # see gh-17608 - assert make_signature(func) == expected - - def test_raise_with_traceback(): with pytest.raises(LookupError, match="error_text"): try: diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index ac23fa5d7b0add..cdda02324ba066 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -319,33 +319,3 @@ def indent(text, indents=1): return '' jointext = ''.join(['\n'] + [' '] * indents) return jointext.join(text.split('\n')) - - -def make_signature(func): - """ - Returns a tuple containing the paramenter list with defaults - and parameter list. - - Examples - -------- - >>> def f(a, b, c=2): - >>> return a * b * c - >>> print(make_signature(f)) - (['a', 'b', 'c=2'], ['a', 'b', 'c']) - """ - - spec = inspect.getfullargspec(func) - if spec.defaults is None: - n_wo_defaults = len(spec.args) - defaults = ('',) * n_wo_defaults - else: - n_wo_defaults = len(spec.args) - len(spec.defaults) - defaults = ('',) * n_wo_defaults + tuple(spec.defaults) - args = [] - for var, default in zip(spec.args, defaults): - args.append(var if default == '' else var + '=' + repr(default)) - if spec.varargs: - args.append('*' + spec.varargs) - if spec.varkw: - args.append('**' + spec.varkw) - return args, spec.args From 2115bf3d58ecc20aa2b10e05574eaa06de1e56ec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 16 Jun 2019 01:53:06 -0700 Subject: [PATCH 005/238] small cleanups (#26874) Thanks, @jbrockmendel --- pandas/core/arrays/datetimelike.py | 4 ---- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/dtypes/concat.py | 2 +- pandas/core/internals/blocks.py | 5 ++--- pandas/core/internals/construction.py | 3 +-- pandas/core/internals/managers.py | 4 ++-- pandas/io/formats/format.py | 4 ++++ pandas/tests/frame/test_repr_info.py | 4 +++- pandas/tests/series/test_api.py | 3 ++- 9 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b0c91543dabaca..ebf1f692ccde60 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -401,10 +401,6 @@ def __array__(self, dtype=None): return np.array(list(self), dtype=object) return self._data - @property - def shape(self): - return (len(self),) - @property def size(self) -> int: """The number of elements in this array.""" diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py 
index 47aef3c84c9f72..d415dbbdaf0a37 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -265,8 +265,8 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, 'normalize', 'strftime', 'round', 'floor', 'ceil', 'month_name', 'day_name'] - # Needed so that Timestamp.__richcmp__(DateTimeArray) operates pointwise - ndim = 1 + # ndim is inherited from ExtensionArray, must exist to ensure + # Timestamp.__richcmp__(DateTimeArray) operates pointwise # ensure that operations with numpy arrays defer to our implementation __array_priority__ = 1000 diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index e2c6fba322be02..a01ba7fc94f229 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -132,7 +132,7 @@ def is_nonempty(x): _contains_period = any(typ.startswith('period') for typ in typs) if 'category' in typs: - # this must be priort to _concat_datetime, + # this must be prior to _concat_datetime, # to support Categorical + datetime-like return _concat_categorical(to_concat, axis=axis) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f9178959d8272b..4cc6c86417b3b8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -36,7 +36,6 @@ Categorical, DatetimeArray, ExtensionArray, PandasDtype, TimedeltaArray) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexing import check_setitem_lengths from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing @@ -2091,7 +2090,7 @@ def _astype(self, dtype, **kwargs): if is_datetime64tz_dtype(dtype): values = self.values if getattr(values, 'tz', None) is None: - values = DatetimeIndex(values).tz_localize('UTC') + values = DatetimeArray(values).tz_localize('UTC') values = values.tz_convert(dtype.tz) return self.make_block(values) @@ -2420,7 +2419,7 @@ def setitem(self, indexer, value): except (ValueError, TypeError): newb = make_block(self.values.astype(object), placement=self.mgr_locs, - klass=ObjectBlock,) + klass=ObjectBlock) return newb.setitem(indexer, value) def equals(self, other): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 863b9f7fb16d7e..2616f0aa97d0d4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -131,8 +131,7 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): index, columns = _get_axes(len(values), 1, index, columns) return arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif (is_datetime64tz_dtype(values) or - is_extension_array_dtype(values)): + elif is_extension_array_dtype(values): # GH#19157 if columns is None: columns = [0] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6f0e8a909d36f0..a1e5468e2f8718 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -15,7 +15,8 @@ maybe_promote) from pandas.core.dtypes.common import ( _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, - is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar) + is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar, + is_sparse) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries @@ -770,7 +771,6 @@ def _interleave(self): Return ndarray from 
blocks with specified item order Items must be contained in the blocks """ - from pandas.core.dtypes.common import is_sparse dtype = _interleaved_dtype(self.blocks) # TODO: https://github.com/pandas-dev/pandas/issues/22791 diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f632bc13a5b241..8655fb05f34e25 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -191,6 +191,8 @@ def _chk_truncate(self): series = concat((series.iloc[:row_num], series.iloc[-row_num:])) self.tr_row_num = row_num + else: + self.tr_row_num = None self.tr_series = series self.truncate_v = truncate_v @@ -499,6 +501,8 @@ def _chk_truncate(self): frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) self.tr_row_num = row_num + else: + self.tr_row_num = None self.tr_frame = frame self.truncate_h = truncate_h diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 32594c856a236f..24dba8cb964cc2 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -511,9 +511,11 @@ def test_repr_categorical_dates_periods(self): 3 2011-01-01 12:00:00-05:00 2011-04 4 2011-01-01 13:00:00-05:00 2011-05""" - df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)}) assert repr(df) == exp + df2 = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)}) + assert repr(df2) == exp + @pytest.mark.parametrize('arg', [np.datetime64, np.timedelta64]) @pytest.mark.parametrize('box, expected', [ [Series, '0 NaT\ndtype: object'], diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 9b4f1f5fd0fe5f..6c577304d5ef44 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -361,7 +361,8 @@ def test_copy(self): assert np.isnan(s2[0]) assert np.isnan(s[0]) - # GH 11794 + def test_copy_tzaware(self): + # GH#11794 # copy of tz-aware expected = Series([Timestamp('2012/01/01', tz='UTC')]) expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) From 3381c6410c978c5808d4b864a9cd3b6a6bd9a412 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 16 Jun 2019 16:05:20 +0200 Subject: [PATCH 006/238] Add type hints for (NDFrame|Series)._data (#26871) --- pandas/core/generic.py | 12 +++++++++--- pandas/core/series.py | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 903fd7ffe706ab..463659ca8ea44d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Callable, FrozenSet, List, Set +from typing import Callable, FrozenSet, List, Optional, Set import warnings import weakref @@ -35,6 +35,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd +from pandas._typing import Dtype from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin @@ -118,12 +119,17 @@ class NDFrame(PandasObject, SelectionMixin): ]) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None + _data = None # type: BlockManager # ---------------------------------------------------------------------- # Constructors - def __init__(self, data, axes=None, copy=False, dtype=None, - fastpath=False): + def __init__(self, + data: BlockManager, + axes: Optional[List[Index]] = None, + copy: bool = False, + dtype: Optional[Dtype] = None, + fastpath: bool = False): if not fastpath: if dtype is not None: diff --git a/pandas/core/series.py 
b/pandas/core/series.py index f0362596920a63..eaef1f525f3e4a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -142,6 +142,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Override cache_readonly bc Series is mutable hasnans = property(base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__) + _data = None # type: SingleBlockManager # ---------------------------------------------------------------------- # Constructors From 21fe224627a07e9c913d8788026a5bdc32b28b8c Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sun, 16 Jun 2019 16:30:39 +0200 Subject: [PATCH 007/238] Add reader for SPSS (.sav) files (#26537) --- LICENSES/HAVEN_LICENSE | 2 + LICENSES/HAVEN_MIT | 32 ++++++++++ ci/deps/azure-macos-35.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + ci/deps/travis-37.yaml | 1 + doc/source/install.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 1 + environment.yml | 2 + pandas/__init__.py | 2 +- pandas/io/api.py | 1 + pandas/io/spss.py | 41 +++++++++++++ pandas/tests/api/test_api.py | 2 +- pandas/tests/io/data/labelled-num-na.sav | Bin 0 -> 535 bytes pandas/tests/io/data/labelled-num.sav | Bin 0 -> 507 bytes pandas/tests/io/data/labelled-str.sav | Bin 0 -> 525 bytes pandas/tests/io/data/umlauts.sav | Bin 0 -> 567 bytes pandas/tests/io/test_spss.py | 74 +++++++++++++++++++++++ requirements-dev.txt | 3 +- 18 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 LICENSES/HAVEN_LICENSE create mode 100644 LICENSES/HAVEN_MIT create mode 100644 pandas/io/spss.py create mode 100755 pandas/tests/io/data/labelled-num-na.sav create mode 100755 pandas/tests/io/data/labelled-num.sav create mode 100755 pandas/tests/io/data/labelled-str.sav create mode 100755 pandas/tests/io/data/umlauts.sav create mode 100644 pandas/tests/io/test_spss.py diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE new file mode 100644 index 00000000000000..2f444cb44d5059 --- /dev/null +++ b/LICENSES/HAVEN_LICENSE @@ -0,0 +1,2 @@ +YEAR: 2013-2016 +COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT new file mode 100644 index 00000000000000..b03d0e640627ae --- /dev/null +++ b/LICENSES/HAVEN_MIT @@ -0,0 +1,32 @@ +Based on http://opensource.org/licenses/MIT + +This is a template. Complete and ship as file LICENSE the following 2 +lines (only) + +YEAR: +COPYRIGHT HOLDER: + +and specify as + +License: MIT + file LICENSE + +Copyright (c) , + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 8ed48b46b5b5a9..24c753e16d98dd 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -23,6 +23,7 @@ dependencies: - xlsxwriter - xlwt - pip: + - pyreadstat # universal - pytest==4.5.0 - pytest-xdist diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 04e4f74f85e4dd..5bdc29e0eec802 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - pyreadstat diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 722a35111ab012..c9a8c274fb1442 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -19,5 +19,6 @@ dependencies: - hypothesis>=3.58.0 - s3fs - pip + - pyreadstat - pip: - moto diff --git a/doc/source/install.rst b/doc/source/install.rst index db31d75e3013e8..1c1f0c1d4cf8eb 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy +pyreadstat SPSS files (.sav) reading qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 207d16afd350f9..f7faeea7a646f7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -99,6 +99,7 @@ Other Enhancements - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) +- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) .. 
_whatsnew_0250.api_breaking: diff --git a/environment.yml b/environment.yml index 7db2ec72ccb3b5..de9bd67dd9f062 100644 --- a/environment.yml +++ b/environment.yml @@ -79,3 +79,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - pip: + - pyreadstat # pandas.read_spss diff --git a/pandas/__init__.py b/pandas/__init__.py index a2fa14be839981..b95c312f12eedc 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ # misc read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas) + read_html, read_json, read_stata, read_sas, read_spss) from pandas.util._tester import test import pandas.testing diff --git a/pandas/io/api.py b/pandas/io/api.py index 8c8d7cf73b37a0..725e82604ca7f0 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -16,5 +16,6 @@ from pandas.io.pickle import read_pickle, to_pickle from pandas.io.pytables import HDFStore, read_hdf from pandas.io.sas import read_sas +from pandas.io.spss import read_spss from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata diff --git a/pandas/io/spss.py b/pandas/io/spss.py new file mode 100644 index 00000000000000..b1b92fc2b84399 --- /dev/null +++ b/pandas/io/spss.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import Optional, Sequence, Union + +from pandas.compat._optional import import_optional_dependency + +from pandas.api.types import is_list_like +from pandas.core.api import DataFrame + + +def read_spss(path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True) -> DataFrame: + """ + Load an SPSS file from the file path, returning a DataFrame. + + .. versionadded 0.25.0 + + Parameters + ---------- + path : string or Path + File path + usecols : list-like, optional + Return a subset of the columns. If None, return all columns. + convert_categoricals : bool, default is True + Convert categorical columns into pd.Categorical. 
+ + Returns + ------- + DataFrame + """ + pyreadstat = import_optional_dependency("pyreadstat") + + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + else: + usecols = list(usecols) # pyreadstat requires a list + + df, _ = pyreadstat.read_sav(path, usecols=usecols, + apply_value_formats=convert_categoricals) + return df diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index aa42484bf95130..b57c7a0cf0625f 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -81,7 +81,7 @@ class TestPDApi(Base): 'read_gbq', 'read_hdf', 'read_html', 'read_json', 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet'] + 'read_table', 'read_feather', 'read_parquet', 'read_spss'] # top-level to_* funcs funcs_to = ['to_datetime', 'to_msgpack', diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav new file mode 100755 index 0000000000000000000000000000000000000000..fbe6ee77672406ba5e28289d88621faf68c72ee3 GIT binary patch literal 535 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyPy#D{=L2Lm$&x6~vBLsLT&D`OKYBO^4uBy)iV!_0@<$-&CN0K{RAK?Xoz z#0OO4pO;gqke`>TP?nfenyR1xagUM%Gmrt&2LT`KxkEBCixq$fq!tKRfD$ls*@0}3 zdXN|g5Q_mZ3|s&@0;Ue+o|k_a7(nV_YC-z`gTZ?U_5XeSKPVrhjvJ~Lqz(i?=7Y=w z$?-zjfw86p26; W0mF_VG{jBU;yKhUskuNAAOHZej8AF+ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav new file mode 100755 index 0000000000000000000000000000000000000000..bfab052089d7e62d2e9747c51434cb3a42156278 GIT binary patch literal 507 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#Xt(o2GI-*kq!oC3T~-M3WlbJCRWDAR>o#%dP(L2O$FJ90ytP17=Sp;F~|T2 zjF^FvAooDP2Yc?2jLc#MAj;261=+^}lz_RH9moc$2Z?b2u^14;zy+X1Fm)h1U;bra z0I7$m1?l?_2Ja!%|M&I(pnQ-zZm3$2IuHPv4>AuV#|vc(0%@4LZD9T$&(7B2Y!Z@L>oIanrSU4s}Z^Pyhq~H&;mv literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav new file mode 100755 index 0000000000000000000000000000000000000000..b96a9c00fcec10b33cf35b2d3d87bef396fcd3ad GIT binary patch literal 525 zcmbVIT}uK%6dlV-j3CMfA1^4V55|6&#DcJCt}L_>&PMcDID}vt8Tja5^k0Ne{sOam z+PTnkXE@xsbI-kJW~$Mx7uG8cin|Hvd#y>Q*J-TNxTmSzYQBs=Dbe&eo{naVIeD!M z2a5!IN~xSB2ZcPtQ|S7n%{#em*AF}=xb&szzmW%vpSY+TyE6yv0&F zx95qW#OC<~8Bv~fa_=MFqYq~VW|=8iv7zYTz1}JXy=c+5`^6>;yUp_3=FlBmEp(WJ z`2cDsOq?NR_wT%#>BxMbc*=zM?}M=iP(Nd$`J9<`1=Vmko0xjdsTCWLl&s`{<3k!X RufA{##+Dxe$fw9>{Qxb9P$&QZ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav new file mode 100755 index 0000000000000000000000000000000000000000..e99cf1267bebebd16bdfe881579e6e319aa10986 GIT binary patch literal 567 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyRI1w?>Bq=S*Mf?H~mf}yFQk(G(Dm5CXeUXr;$b3yij0LV@-;9zB70OBym zAVZLOB0#~AjLc#MAWFj2vPtHTZYgOH(iV8Q2SHUQqzHA3;+QaUGV?_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py new file mode 100644 index 00000000000000..b9f58f9bf6cf65 --- /dev/null +++ b/pandas/tests/io/test_spss.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + +pyreadstat = pytest.importorskip("pyreadstat") + + +def test_spss_labelled_num(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = 
datapath("io", "data", "labelled-num.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_num_na(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-num-na.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": ["This is one", None]}) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_str(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-str.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"gender": ["Male", "Female"]}) + expected["gender"] = pd.Categorical(expected["gender"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"gender": ["M", "F"]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_umlauts(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "umlauts.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"var1": ["the ä umlaut", + "the ü umlaut", + "the ä umlaut", + "the ö umlaut"]}) + expected["var1"] = pd.Categorical(expected["var1"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_usecols(datapath): + # usecols must be list-like + fname = datapath("io", "data", "labelled-num.sav") + + with pytest.raises(TypeError, match="usecols must be list-like."): + pd.read_spss(fname, usecols="VAR00002") diff --git a/requirements-dev.txt b/requirements-dev.txt index b40aa86e946b69..169af7da5e037b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,4 +52,5 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt +pyreadstat \ No newline at end of file From 9326c1e20171a76af37251356726aa799904bb07 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 16 Jun 2019 09:32:13 -0500 Subject: [PATCH 008/238] Convert Sparse ASVs (#26704) --- asv_bench/benchmarks/sparse.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ca4469e64c335e..281e81f21ba9c6 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,9 +1,8 @@ -import itertools - import numpy as np import scipy.sparse -from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, - date_range, MultiIndex) + +import pandas as pd +from pandas import MultiIndex, Series, SparseArray, date_range def make_array(size, dense_proportion, fill_value, dtype): @@ -25,10 +24,10 @@ def setup(self): data = np.random.randn(N)[:-i] idx = rng[:-i] data[100:] = np.nan - self.series[i] = SparseSeries(data, index=idx) + self.series[i] = 
pd.Series(pd.SparseArray(data), index=idx) def time_series_to_frame(self): - SparseDataFrame(self.series) + pd.DataFrame(self.series) class SparseArrayConstructor: @@ -51,16 +50,9 @@ def setup(self): N = 1000 self.arr = np.arange(N) self.sparse = scipy.sparse.rand(N, N, 0.005) - self.dict = dict(zip(range(N), itertools.repeat([0]))) - - def time_constructor(self): - SparseDataFrame(columns=self.arr, index=self.arr) def time_from_scipy(self): - SparseDataFrame(self.sparse) - - def time_from_dict(self): - SparseDataFrame(self.dict) + pd.DataFrame.sparse.from_spmatrix(self.sparse) class FromCoo: @@ -71,7 +63,7 @@ def setup(self): shape=(100, 100)) def time_sparse_series_from_coo(self): - SparseSeries.from_coo(self.matrix) + pd.Series.sparse.from_coo(self.matrix) class ToCoo: @@ -82,12 +74,12 @@ def setup(self): s[100] = -1.0 s[999] = 12.1 s.index = MultiIndex.from_product([range(10)] * 4) - self.ss = s.to_sparse() + self.ss = s.astype("Sparse") def time_sparse_series_to_coo(self): - self.ss.to_coo(row_levels=[0, 1], - column_levels=[2, 3], - sort_labels=True) + self.ss.sparse.to_coo(row_levels=[0, 1], + column_levels=[2, 3], + sort_labels=True) class Arithmetic: From adc656433ed959a9454ee6f90b5b6b1b9b9c3e42 Mon Sep 17 00:00:00 2001 From: Christian Haege <42995291+Kischy@users.noreply.github.com> Date: Sun, 16 Jun 2019 22:47:33 +0200 Subject: [PATCH 009/238] DOC: Changed string to give intended explanation (#26891) Thanks, @Kischy! --- doc/source/user_guide/missing_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 417eead3a2b332..cd70a109b3c77a 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -606,7 +606,7 @@ list of regex -> list of regex: .. ipython:: python - df.replace([r'\.', r'(a)'], ['dot', '\1stuff'], regex=True) + df.replace([r'\.', r'(a)'], ['dot', r'\1stuff'], regex=True) Only search in column ``'b'`` (dict -> dict): From a890caf47e797c853b790b468b33de90134514bd Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 17 Jun 2019 08:24:29 +0100 Subject: [PATCH 010/238] DOC/CI: Failing documentation build on warnings (#26852) --- azure-pipelines.yml | 6 ++++- doc/source/getting_started/10min.rst | 1 + doc/source/user_guide/io.rst | 33 ++++++++++++++++++---------- doc/sphinxext/contributors.py | 7 ++++-- pandas/core/dtypes/base.py | 16 +++++++++----- pandas/core/indexes/base.py | 13 ++++++++++- 6 files changed, 56 insertions(+), 20 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b40d46bdebe02a..9238c270023376 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,6 +5,7 @@ jobs: parameters: name: macOS vmImage: xcode9-macos10.13 + - template: ci/azure/posix.yml parameters: name: Linux @@ -134,7 +135,10 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev - doc/make.py + # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547) + doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]} + grep -B1 "^<<<-------------------------------------------------------------------------$" sphinx.log ; IPY_RET=$(( $? 
!= 1 )) + exit $(( $SPHINX_RET + $IPY_RET )) displayName: 'Build documentation' - script: | diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index fdf1f05b8e61fa..8bb188419cb595 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -712,6 +712,7 @@ See the :ref:`Plotting ` docs. plt.close('all') .. ipython:: python + :okwarning: ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 725af8ef8769b5..30a42de2ab2879 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3249,24 +3249,35 @@ And then import the data directly to a ``DataFrame`` by calling: .. code-block:: python - clipdf = pd.read_clipboard() - -.. ipython:: python - - clipdf - + >>> clipdf = pd.read_clipboard() + >>> clipdf + A B C + x 1 4 p + y 2 5 q + z 3 6 r The ``to_clipboard`` method can be used to write the contents of a ``DataFrame`` to the clipboard. Following which you can paste the clipboard contents into other applications (CTRL-V on many operating systems). Here we illustrate writing a ``DataFrame`` into clipboard and reading it back. -.. ipython:: python +.. code-block:: python - df = pd.DataFrame(np.random.randn(5, 3)) - df - df.to_clipboard() - pd.read_clipboard() + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 'C': ['p', 'q', 'r']}, + ... index=['x', 'y', 'z']) + >>> df + A B C + x 1 4 p + y 2 5 q + z 3 6 r + >>> df.to_clipboard() + >>> pd.read_clipboard() + A B C + x 1 4 p + y 2 5 q + z 3 6 r We can see that we got the same content back, which we had earlier written to the clipboard. diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 179ba19a0908a4..7794a24dad89b7 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -21,12 +21,15 @@ class ContributorsDirective(Directive): def run(self): range_ = self.arguments[0] + if range_.endswith('x..HEAD'): + return [nodes.paragraph(), nodes.bullet_list()] try: components = build_components(range_) - except git.GitCommandError: + except git.GitCommandError as exc: return [ self.state.document.reporter.warning( - "Cannot find contributors for range '{}'".format(range_), + "Cannot find contributors for range '{}': {}".format( + range_, exc), line=self.lineno) ] else: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d1d48f9810419d..e7191136a7d538 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -68,11 +68,6 @@ class property**. ``pandas.errors.AbstractMethodError`` and no ``register`` method is provided for registering virtual subclasses. """ - # na_value is the default NA value to use for this type. This is used in - # e.g. ExtensionArray.take. This should be the user-facing "boxed" version - # of the NA value, not the physical NA value for storage. - # e.g. for JSONArray, this is an empty dictionary. - na_value = np.nan _metadata = () # type: Tuple[str, ...] def __str__(self): @@ -114,6 +109,17 @@ def __hash__(self): def __ne__(self, other): return not self.__eq__(other) + @property + def na_value(self): + """ + Default NA value to use for this type. + + This is used in e.g. ExtensionArray.take. This should be the + user-facing "boxed" version of the NA value, not the physical NA value + for storage. e.g. for JSONArray, this is an empty dictionary. 
+ """ + return np.nan + @property def type(self) -> Type: """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9f0f89a0e34f59..4601d63f2d27ed 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -229,7 +229,6 @@ def _outer_indexer(self, left, right): _data = None _id = None name = None - asi8 = None _comparables = ['name'] _attributes = ['name'] _is_numeric_dtype = False @@ -501,6 +500,18 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, See each method's docstring. """ + @property + def asi8(self): + """ + Integer representation of the values. + + Returns + ------- + ndarray + An ndarray with int64 dtype. + """ + return None + @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ From baa77c33fb71c29acea21ba06adaf426ed4cb561 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Jun 2019 04:44:39 -0700 Subject: [PATCH 011/238] DEPR: deprecate Timedelta.resolution (#26839) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/_libs/tslibs/timedeltas.pyx | 53 ++++++++++++++++++- pandas/core/indexes/timedeltas.py | 4 +- pandas/tests/scalar/test_nat.py | 4 +- .../tests/scalar/timedelta/test_timedelta.py | 16 ++++++ 5 files changed, 72 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f7faeea7a646f7..2b1a61186dca65 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -502,7 +502,7 @@ Other Deprecations Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`). - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - +- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index ad60165e98d4f3..6a32553fe2d385 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -950,7 +950,7 @@ cdef class _Timedelta(timedelta): return np.int64(self.value).view('m8[ns]') @property - def resolution(self): + def resolution_string(self): """ Return a string representing the lowest timedelta resolution. @@ -991,7 +991,6 @@ cdef class _Timedelta(timedelta): >>> td.resolution 'U' """ - self._ensure_components() if self._ns: return "N" @@ -1008,6 +1007,56 @@ cdef class _Timedelta(timedelta): else: return "D" + @property + def resolution(self): + """ + Return a string representing the lowest timedelta resolution. + + Each timedelta has a defined resolution that represents the lowest OR + most granular level of precision. Each level of resolution is + represented by a short string as defined below: + + Resolution: Return value + + * Days: 'D' + * Hours: 'H' + * Minutes: 'T' + * Seconds: 'S' + * Milliseconds: 'L' + * Microseconds: 'U' + * Nanoseconds: 'N' + + Returns + ------- + str + Timedelta resolution. 
+ + Examples + -------- + >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') + >>> td.resolution + 'N' + + >>> td = pd.Timedelta('1 days 2 min 3 us') + >>> td.resolution + 'U' + + >>> td = pd.Timedelta('2 min 3 s') + >>> td.resolution + 'S' + + >>> td = pd.Timedelta(36, unit='us') + >>> td.resolution + 'U' + """ + # See GH#21344 + warnings.warn("Timedelta.resolution is deprecated, in a future " + "version will behave like the standard library " + "datetime.timedelta.resolution attribute. " + "Use Timedelta.resolution_string instead.", + FutureWarning) + return self.resolution_string + @property def nanoseconds(self): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index f5362c0b6bb5df..ba5507fa71e8c6 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -551,11 +551,11 @@ def _maybe_cast_slice_bound(self, label, side, kind): if isinstance(label, str): parsed = Timedelta(label) - lbound = parsed.round(parsed.resolution) + lbound = parsed.round(parsed.resolution_string) if side == 'left': return lbound else: - return (lbound + to_offset(parsed.resolution) - + return (lbound + to_offset(parsed.resolution_string) - Timedelta(1, 'ns')) elif ((is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label)): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 0ae4d107d85bd9..19426c3bf3ffbb 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -142,8 +142,8 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize("klass,expected", [ (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), - (Timedelta, ["components", "delta", "is_populated", "to_pytimedelta", - "to_timedelta64", "view"]) + (Timedelta, ["components", "delta", "is_populated", "resolution_string", + "to_pytimedelta", "to_timedelta64", "view"]) ]) def test_missing_public_nat_methods(klass, expected): # see gh-17327 diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 57b3705640202b..f10876531e66a0 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -742,6 +742,22 @@ def test_components(self): assert not result.iloc[0].isna().all() assert result.iloc[1].isna().all() + def test_resolution_string(self): + assert Timedelta(days=1).resolution_string == 'D' + assert Timedelta(days=1, hours=6).resolution_string == 'H' + assert Timedelta(days=1, minutes=6).resolution_string == 'T' + assert Timedelta(days=1, seconds=6).resolution_string == 'S' + assert Timedelta(days=1, milliseconds=6).resolution_string == 'L' + assert Timedelta(days=1, microseconds=6).resolution_string == 'U' + assert Timedelta(days=1, nanoseconds=6).resolution_string == 'N' + + def test_resolution_deprecated(self): + # GH#21344 + td = Timedelta(days=4, hours=3) + with tm.assert_produces_warning(FutureWarning) as w: + td.resolution + assert "Use Timedelta.resolution_string instead" in str(w[0].message) + @pytest.mark.parametrize('value, expected', [ (Timedelta('10S'), True), From 77e6556ab55bb2746e1d1bb1f7c3da66bba2a4a1 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Tue, 18 Jun 2019 10:07:16 +0000 Subject: [PATCH 012/238] CLN: introduce test decorator skip_if_np_lt(ver_string) (#26901) --- pandas/compat/numpy/__init__.py | 5 +++-- pandas/tests/arrays/sparse/test_array.py | 9 ++++----- pandas/tests/frame/test_analytics.py | 16 
++++++++-------- pandas/tests/series/test_analytics.py | 6 +++--- pandas/util/_test_decorators.py | 12 +++++++++--- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3499d631376d82..c738cc74e46a49 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,9 +1,9 @@ """ support numpy compatiblitiy across versions """ -import re -import numpy as np from distutils.version import LooseVersion +import re +import numpy as np # numpy versioning _np_version = np.__version__ @@ -62,6 +62,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = ['np', + '_np_version', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 659f2b97485a98..c0a1b320790444 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -6,7 +6,6 @@ import pytest from pandas._libs.sparse import IntIndex -from pandas.compat.numpy import _np_version_under1p16 import pandas.util._test_decorators as td import pandas as pd @@ -175,8 +174,8 @@ def test_constructor_inferred_fill_value(self, data, fill_value): @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) @pytest.mark.parametrize('size', [ pytest.param(0, - marks=pytest.mark.skipif(_np_version_under1p16, - reason='NumPy-11383')), + marks=td.skip_if_np_lt("1.16", + reason='NumPy-11383')), 10 ]) @td.skip_if_no_scipy @@ -870,7 +869,7 @@ def test_all(self, data, pos, neg): ([1, 2, 1], 1, 0), ([1.0, 2.0, 1.0], 1.0, 0.0) ]) - @td.skip_if_np_lt_115 # prior didn't dispatch + @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_all(self, data, pos, neg): # GH 17570 out = np.all(SparseArray(data)) @@ -916,7 +915,7 @@ def test_any(self, data, pos, neg): ([0, 2, 0], 2, 0), ([0.0, 2.0, 0.0], 2.0, 0.0) ]) - @td.skip_if_np_lt_115 # prior didn't dispatch + @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_any(self, data, pos, neg): # GH 17570 out = np.any(SparseArray(data)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 18d8d351e48c18..01a398584b5e1a 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1565,21 +1565,21 @@ def test_any_all_bool_only(self): (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, 
True, - marks=[td.skip_if_np_lt_115]), + marks=[td.skip_if_np_lt("1.15")]), (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e5eb7d19dc649d..aed08b78fe6406 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1105,7 +1105,7 @@ def test_value_counts_categorical_not_ordered(self): dict(keepdims=True), dict(out=object()), ]) - @td.skip_if_np_lt_115 + @td.skip_if_np_lt("1.15") def test_validate_any_all_out_keepdims_raises(self, kwargs, func): s = pd.Series([1, 2]) param = list(kwargs)[0] @@ -1117,7 +1117,7 @@ def test_validate_any_all_out_keepdims_raises(self, kwargs, func): with pytest.raises(ValueError, match=msg): func(s, **kwargs) - @td.skip_if_np_lt_115 + @td.skip_if_np_lt("1.15") def test_validate_sum_initial(self): s = pd.Series([1, 2]) msg = (r"the 'initial' parameter is not " @@ -1136,7 +1136,7 @@ def test_validate_median_initial(self): # method instead of the ufunc. s.median(overwrite_input=True) - @td.skip_if_np_lt_115 + @td.skip_if_np_lt("1.15") def test_validate_stat_keepdims(self): s = pd.Series([1, 2]) msg = (r"the 'keepdims' parameter is not " diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 4cc316ffdd7abd..0cb82c0028c90d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,6 +23,7 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ +from distutils.version import LooseVersion import locale from typing import Optional @@ -30,7 +31,7 @@ def test_foo(): import pytest from pandas.compat import is_platform_32bit, is_platform_windows -from pandas.compat.numpy import _np_version_under1p15 +from pandas.compat.numpy import _np_version from pandas.core.computation.expressions import ( _NUMEXPR_INSTALLED, _USE_NUMEXPR) @@ -142,8 +143,6 @@ def skip_if_no( skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") -skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15, - reason="NumPy 1.15 or greater required") skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), @@ -168,6 +167,13 @@ def skip_if_no( installed=_NUMEXPR_INSTALLED)) +def skip_if_np_lt(ver_str, reason=None, *args, **kwds): + if reason is None: + reason = "NumPy %s or greater required" % ver_str + return pytest.mark.skipif(_np_version < LooseVersion(ver_str), + reason=reason, *args, **kwds) + + def parametrize_fixture_doc(*args): """ Intended for use as a decorator for parametrized fixture, From 49f33f0d49e65421db12babd50c019a28f422783 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 18 Jun 2019 12:38:28 +0100 Subject: [PATCH 013/238] DOC: Replacing travis doc deps by environment.yml (#26829) --- .travis.yml | 4 ++-- azure-pipelines.yml | 2 +- ci/deps/travis-36-doc.yaml | 46 -------------------------------------- 3 files changed, 3 insertions(+), 49 deletions(-) delete mode 100644 ci/deps/travis-36-doc.yaml diff --git a/.travis.yml b/.travis.yml index fd59544d9b3c6d..4d1281819e2cd4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,14 +51,14 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true + - JOB="3.6, doc" ENV_FILE="environment.yml" 
DOC=true allow_failures: - dist: trusty env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true + - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true before_install: - echo "before_install" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9238c270023376..597b5f92796c0d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -123,7 +123,7 @@ jobs: timeoutInMinutes: 90 steps: - script: | - echo '##vso[task.setvariable variable=ENV_FILE]ci/deps/travis-36-doc.yaml' + echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' displayName: 'Setting environment variables' - script: | diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml deleted file mode 100644 index 9419543e601e24..00000000000000 --- a/ci/deps/travis-36-doc.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - beautifulsoup4 - - bottleneck - - cython>=0.28.2 - - fastparquet>=0.2.1 - - gitpython - - html5lib - - hypothesis>=3.58.0 - - ipykernel - - ipython - - ipywidgets - - lxml - - matplotlib - - nbconvert>=5.4.1 - - nbformat - - nbsphinx - - notebook>=5.7.5 - - numexpr - - numpy - - numpydoc - - openpyxl - - pandoc - - pyarrow - - pyqt - - pytables - - python-dateutil - - python-snappy - - python=3.6.* - - pytz - - scipy - - seaborn - # some styling is broken with sphinx >= 2 (https://github.com/pandas-dev/pandas/issues/26058) - - sphinx=1.8.5 - - sqlalchemy - - statsmodels - - xarray - - xlrd - - xlsxwriter - - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist From d432f654efa3c9160309bbe17cb44ce863ebc328 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 18 Jun 2019 14:42:21 +0100 Subject: [PATCH 014/238] DOC: Remove travis doc build (#26856) --- .travis.yml | 8 ------- azure-pipelines.yml | 1 - ci/build_docs.sh | 56 --------------------------------------------- 3 files changed, 65 deletions(-) delete mode 100755 ci/build_docs.sh diff --git a/.travis.yml b/.travis.yml index 4d1281819e2cd4..8335a6ee92bef0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,17 +48,10 @@ matrix: env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - # In allow_failures - - dist: trusty - env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true allow_failures: - dist: trusty env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - - dist: trusty - env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true before_install: - echo "before_install" @@ -97,7 +90,6 @@ before_script: script: - echo "script start" - source activate pandas-dev - - ci/build_docs.sh - ci/run_tests.sh after_script: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 597b5f92796c0d..11284254c7a0f6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -176,7 +176,6 @@ jobs: cd doc/build/html git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git git push -f origin master - exit 0 # FIXME this will leave the build green even if the step fails. To be removed when we are confident with this. 
displayName: 'Publish docs to GitHub pages' condition : | and(not(eq(variables['Build.Reason'], 'PullRequest')), diff --git a/ci/build_docs.sh b/ci/build_docs.sh deleted file mode 100755 index bf22f0764144c3..00000000000000 --- a/ci/build_docs.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -set -e - -if [ "${TRAVIS_OS_NAME}" != "linux" ]; then - echo "not doing build_docs on non-linux" - exit 0 -fi - -cd "$TRAVIS_BUILD_DIR"/doc -echo "inside $0" - -if [ "$DOC" ]; then - - echo "Will build docs" - - echo ############################### - echo # Log file for the doc build # - echo ############################### - - echo ./make.py - ./make.py - - echo ######################## - echo # Create and send docs # - echo ######################## - - echo "Only uploading docs when TRAVIS_PULL_REQUEST is 'false'" - echo "TRAVIS_PULL_REQUEST: ${TRAVIS_PULL_REQUEST}" - - if [ "${TRAVIS_PULL_REQUEST}" == "false" ]; then - cd build/html - git config --global user.email "pandas-docs-bot@localhost.foo" - git config --global user.name "pandas-docs-bot" - - # create the repo - git init - - touch README - git add README - git commit -m "Initial commit" --allow-empty - git branch gh-pages - git checkout gh-pages - touch .nojekyll - git add --all . - git commit -m "Version" --allow-empty - - git remote add origin "https://${PANDAS_GH_TOKEN}@github.com/pandas-dev/pandas-docs-travis.git" - git fetch origin - git remote -v - - git push origin gh-pages -f - fi -fi - -exit 0 From bd09a5904236646778490ca9bc433d6f2d47add8 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 18 Jun 2019 15:21:24 +0100 Subject: [PATCH 015/238] DOC: Update development documentation urls (#26857) --- README.md | 2 +- doc/source/development/contributing.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e8bfd28cc82083..aeeea1464e1fdd 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ Most development discussion is taking place on github in this repo. Further, the All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. -A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas-docs.github.io/pandas-docs-travis/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. +A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b5c7ae7a213cb2..c9c76f307d93f3 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -460,7 +460,7 @@ Building master branch documentation When pull requests are merged into the *pandas* ``master`` branch, the main parts of the documentation are also built by Travis-CI. 
These docs are then hosted `here -`__, see also +`__, see also the :ref:`Continuous Integration ` section. .. _contributing.code: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2b1a61186dca65..833a9b1c342df7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -411,7 +411,7 @@ If installed, we now require: | pytest (dev) | 4.0.2 | | +-----------------+-----------------+----------+ -For `optional libraries `_ the general recommendation is to use the latest version. +For `optional libraries `_ the general recommendation is to use the latest version. The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. From 7375c73cf56196cc8a2290a56dee01f019b98017 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Tue, 18 Jun 2019 23:59:34 +0800 Subject: [PATCH 016/238] Add typing annotation to IntervalIndex.intersection (#26870) --- pandas/core/indexes/interval.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 24fcb32d09d276..896935fa72adbd 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1140,7 +1140,10 @@ def overlaps(self, other): @Appender(_index_shared_docs['intersection']) @SetopCheck(op_name='intersection') - def intersection(self, other, sort=False): + def intersection(self, + other: 'IntervalIndex', + sort: bool = False + ) -> 'IntervalIndex': if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) elif (other.left.is_unique and other.right.is_unique and @@ -1157,7 +1160,9 @@ def intersection(self, other, sort=False): return taken - def _intersection_unique(self, other): + def _intersection_unique(self, + other: 'IntervalIndex' + ) -> 'IntervalIndex': """ Used when the IntervalIndex does not have any common endpoint, no mater left or right. @@ -1179,7 +1184,9 @@ def _intersection_unique(self, other): return self.take(indexer) - def _intersection_non_unique(self, other): + def _intersection_non_unique(self, + other: 'IntervalIndex' + ) -> 'IntervalIndex': """ Used when the IntervalIndex does have some common endpoints, on either sides. 
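The annotations added above only document what ``intersection`` already does: both operands and the result are ``IntervalIndex`` objects. As a minimal, illustrative sketch of the annotated method in use (not taken from the patch itself; assumes any recent pandas where ``pd.interval_range`` is available):

    import pandas as pd

    left = pd.interval_range(start=0, end=4)    # (0, 1], (1, 2], (2, 3], (3, 4]
    right = pd.interval_range(start=2, end=6)   # (2, 3], (3, 4], (4, 5], (5, 6]

    # Both inputs and the result are IntervalIndex, matching the new type hints.
    common = left.intersection(right)
    print(common)   # expected to contain the shared intervals (2, 3] and (3, 4]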
From d39a6de66803b2a13d6bbb8aa98c56e90f95526c Mon Sep 17 00:00:00 2001 From: Gaibo Zhang Date: Tue, 18 Jun 2019 11:00:41 -0500 Subject: [PATCH 017/238] Fix miscellaneous small holiday errors (#26912) --- pandas/tseries/holiday.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 030d2744cd7f5e..7171a6a182bdc3 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -148,8 +148,8 @@ class from pandas.tseries.offsets -------- >>> from pandas.tseries.holiday import Holiday, nearest_workday >>> from dateutil.relativedelta import MO - >>> USMemorialDay = Holiday('MemorialDay', month=5, day=24, - offset=pd.DateOffset(weekday=MO(1))) + >>> USMemorialDay = Holiday('Memorial Day', month=5, day=31, + offset=pd.DateOffset(weekday=MO(-1))) >>> USLaborDay = Holiday('Labor Day', month=9, day=1, offset=pd.DateOffset(weekday=MO(1))) >>> July3rd = Holiday('July 3rd', month=7, day=3,) @@ -464,7 +464,7 @@ def merge(self, other, inplace=False): return holidays -USMemorialDay = Holiday('MemorialDay', month=5, day=31, +USMemorialDay = Holiday('Memorial Day', month=5, day=31, offset=DateOffset(weekday=MO(-1))) USLaborDay = Holiday('Labor Day', month=9, day=1, offset=DateOffset(weekday=MO(1))) @@ -472,10 +472,10 @@ def merge(self, other, inplace=False): offset=DateOffset(weekday=MO(2))) USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1, offset=DateOffset(weekday=TH(4))) -USMartinLutherKingJr = Holiday('Dr. Martin Luther King Jr.', +USMartinLutherKingJr = Holiday('Martin Luther King Jr. Day', start_date=datetime(1986, 1, 1), month=1, day=1, offset=DateOffset(weekday=MO(3))) -USPresidentsDay = Holiday('President''s Day', month=2, day=1, +USPresidentsDay = Holiday('Presidents Day', month=2, day=1, offset=DateOffset(weekday=MO(3))) GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)]) From 5828b314f12f4f59d3d65ba182cd069cfdae5625 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 19 Jun 2019 00:57:07 +0100 Subject: [PATCH 018/238] DOC: Disallowing search engines in the development docs (#26931) --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 11284254c7a0f6..cfd7f6546833dd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -146,6 +146,7 @@ jobs: git init touch .nojekyll echo "dev.pandas.io" > CNAME + printf "User-agent: *\nDisallow: /" > robots.txt git add --all . 
git config user.email "pandas-dev@python.org" git config user.name "pandas-docs-bot" From 9ceb0295e3dbd3b94c70ef90d8d64731b851b07e Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 19 Jun 2019 02:19:15 +0200 Subject: [PATCH 019/238] Add type hints for (BlockManager|SingleBlockManager).blocks (#26888) --- pandas/core/internals/managers.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a1e5468e2f8718..907498c7ff3508 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -3,7 +3,7 @@ import itertools import operator import re -from typing import List, Optional, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np @@ -95,9 +95,12 @@ class BlockManager(PandasObject): __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', '_is_consolidated', '_blknos', '_blklocs'] - def __init__(self, blocks, axes, do_integrity_check=True): + def __init__(self, + blocks: Sequence[Block], + axes: Sequence[Index], + do_integrity_check: bool = True): self.axes = [ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) + self.blocks = tuple(blocks) # type: Tuple[Block, ...] for block in blocks: if block.is_sparse: @@ -1415,8 +1418,11 @@ class SingleBlockManager(BlockManager): _known_consolidated = True __slots__ = () - def __init__(self, block, axis, do_integrity_check=False, fastpath=False): - + def __init__(self, + block: Block, + axis: Union[Index, List[Index]], + do_integrity_check: bool = False, + fastpath: bool = False): if isinstance(axis, list): if len(axis) != 1: raise ValueError("cannot create SingleBlockManager with more " @@ -1455,7 +1461,7 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=False): if not isinstance(block, Block): block = make_block(block, placement=slice(0, len(axis)), ndim=1) - self.blocks = [block] + self.blocks = tuple([block]) def _post_setstate(self): pass From 376a05e4d5b11f91bba8a3bcd422cd881a7b3511 Mon Sep 17 00:00:00 2001 From: Vikramjeet Das <41290641+intEll1gent@users.noreply.github.com> Date: Wed, 19 Jun 2019 06:11:02 +0530 Subject: [PATCH 020/238] DEPR: Deprecate Series/Dataframe.to_dense/to_sparse (#26684) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/frame.py | 13 ++++++++++--- pandas/core/generic.py | 5 +++++ pandas/core/groupby/ops.py | 6 +++--- pandas/core/series.py | 13 ++++++++++--- .../tests/arrays/sparse/test_arithmetics.py | 1 + pandas/tests/generic/test_generic.py | 14 ++++++++++++++ pandas/tests/io/json/test_pandas.py | 2 ++ pandas/tests/io/test_packers.py | 2 ++ pandas/tests/io/test_pytables.py | 10 ++++++++++ pandas/tests/series/test_api.py | 1 + pandas/tests/series/test_combine_concat.py | 13 +++++++------ pandas/tests/series/test_missing.py | 2 ++ pandas/tests/sparse/frame/test_apply.py | 2 ++ pandas/tests/sparse/frame/test_frame.py | 16 ++++++++++++++++ pandas/tests/sparse/frame/test_to_csv.py | 1 + .../tests/sparse/frame/test_to_from_scipy.py | 1 + pandas/tests/sparse/series/test_series.py | 19 +++++++++++++++++++ pandas/tests/sparse/test_combine_concat.py | 1 + pandas/tests/sparse/test_format.py | 2 ++ pandas/tests/sparse/test_groupby.py | 2 ++ pandas/tests/sparse/test_indexing.py | 3 +++ pandas/tests/sparse/test_pivot.py | 2 ++ 23 files changed, 117 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 833a9b1c342df7..d6c397679a0f36 100644 --- 
a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -503,6 +503,7 @@ Other Deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) +- :meth:`Series.to_sparse`, :meth:`DataFrame.to_sparse`, :meth:`Series.to_dense` and :meth:`DataFrame.to_dense` are deprecated and will be removed in a future version. (:issue:`26557`). .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d2d0525a0a0ff8..6746844f4b1fa1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1889,6 +1889,8 @@ def to_sparse(self, fill_value=None, kind='block'): """ Convert to SparseDataFrame. + .. deprecated:: 0.25.0 + Implement the sparse version of the DataFrame meaning that any data matching a specific value it's omitted in the representation. The sparse DataFrame allows for a more efficient storage. @@ -1939,10 +1941,15 @@ def to_sparse(self, fill_value=None, kind='block'): >>> type(sdf) # doctest: +SKIP """ + warnings.warn("DataFrame.to_sparse is deprecated and will be removed " + "in a future version", FutureWarning, stacklevel=2) + from pandas.core.sparse.api import SparseDataFrame - return SparseDataFrame(self._series, index=self.index, - columns=self.columns, default_kind=kind, - default_fill_value=fill_value) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="SparseDataFrame") + return SparseDataFrame(self._series, index=self.index, + columns=self.columns, default_kind=kind, + default_fill_value=fill_value) @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) def to_stata(self, fname, convert_dates=None, write_index=True, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 463659ca8ea44d..dba88495d8128b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1940,11 +1940,16 @@ def to_dense(self): """ Return dense representation of NDFrame (as opposed to sparse). + .. deprecated:: 0.25.0 + Returns ------- %(klass)s Dense %(klass)s. 
""" + warnings.warn("DataFrame/Series.to_dense is deprecated " + "and will be removed in a future version", + FutureWarning, stacklevel=2) # compat return self diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ee9d57a537340d..010047a8be4ed8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -630,9 +630,9 @@ def _aggregate_series_fast(self, obj, func): group_index, _, ngroups = self.group_info # avoids object / Series creation overhead - dummy = obj._get_values(slice(None, 0)).to_dense() + dummy = obj._get_values(slice(None, 0)) indexer = get_group_index_sorter(group_index, ngroups) - obj = obj._take(indexer).to_dense() + obj = obj._take(indexer) group_index = algorithms.take_nd( group_index, indexer, allow_fill=False) grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, @@ -879,7 +879,7 @@ def apply(self, f): class SeriesSplitter(DataSplitter): def _chop(self, sdata, slice_obj): - return sdata._get_values(slice_obj).to_dense() + return sdata._get_values(slice_obj) class FrameSplitter(DataSplitter): diff --git a/pandas/core/series.py b/pandas/core/series.py index eaef1f525f3e4a..c4a449154860f4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1592,6 +1592,8 @@ def to_sparse(self, kind='block', fill_value=None): """ Convert Series to SparseSeries. + .. deprecated:: 0.25.0 + Parameters ---------- kind : {'block', 'integer'}, default 'block' @@ -1603,12 +1605,17 @@ def to_sparse(self, kind='block', fill_value=None): SparseSeries Sparse representation of the Series. """ + + warnings.warn("Series.to_sparse is deprecated and will be removed " + "in a future version", FutureWarning, stacklevel=2) from pandas.core.sparse.series import SparseSeries values = SparseArray(self, kind=kind, fill_value=fill_value) - return SparseSeries( - values, index=self.index, name=self.name - ).__finalize__(self) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="SparseSeries") + return SparseSeries( + values, index=self.index, name=self.name + ).__finalize__(self) def _set_name(self, name, inplace=False): """ diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index eb3af4e6dea730..31a8f13571d16a 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -9,6 +9,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseArrayArithmetics: _base = np.array diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index e6d9851b1bb990..b1a083213debd6 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -918,3 +918,17 @@ def test_axis_classmethods(self, box): assert obj._get_axis_name(v) == box._get_axis_name(v) assert obj._get_block_manager_axis(v) == \ box._get_block_manager_axis(v) + + def test_deprecated_to_dense(self): + # GH 26557: DEPR + # Deprecated 0.25.0 + + df = pd.DataFrame({"A": [1, 2, 3]}) + with tm.assert_produces_warning(FutureWarning): + result = df.to_dense() + tm.assert_frame_equal(result, df) + + ser = pd.Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + result = ser.to_dense() + tm.assert_series_equal(result, ser) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6b4c6a398962a7..a935a731ccba62 100644 --- a/pandas/tests/io/json/test_pandas.py +++ 
b/pandas/tests/io/json/test_pandas.py @@ -1013,6 +1013,8 @@ def test_datetime_tz(self): assert stz.to_json() == s_naive.to_json() @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index f568d717211cc3..9337d5916acc67 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -551,6 +551,8 @@ def test_dataframe_duplicate_column_names(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5c9c3ae46df235..299c0feb502be3 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -51,6 +51,12 @@ "ignore:object name:tables.exceptions.NaturalNameWarning" ) ignore_sparse = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +ignore_dataframe_tosparse = pytest.mark.filterwarnings( + "ignore:DataFrame.to_sparse:FutureWarning" +) +ignore_series_tosparse = pytest.mark.filterwarnings( + "ignore:Series.to_sparse:FutureWarning" +) # contextmanager to ensure the file cleanup @@ -2245,6 +2251,7 @@ def test_series(self): check_index_type=False) @ignore_sparse + @ignore_series_tosparse def test_sparse_series(self): s = tm.makeStringSeries() @@ -2262,6 +2269,7 @@ def test_sparse_series(self): check_series_type=True) @ignore_sparse + @ignore_dataframe_tosparse def test_sparse_frame(self): s = tm.makeDataFrame() @@ -2601,6 +2609,7 @@ def test_overwrite_node(self): tm.assert_series_equal(store['a'], ts) @ignore_sparse + @ignore_dataframe_tosparse def test_sparse_with_compression(self): # GH 2931 @@ -3746,6 +3755,7 @@ def test_start_stop_multiple(self): tm.assert_frame_equal(result, expected) @ignore_sparse + @ignore_dataframe_tosparse def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 6c577304d5ef44..fac796fbf325a6 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -123,6 +123,7 @@ def test_sort_index_name(self): assert result.name == self.ts.name @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_to_sparse_pass_name(self): result = self.ts.to_sparse() assert result.name == self.ts.name diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index e9e87e5bb07a74..d03c29ad79469c 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -212,6 +212,7 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): assert_series_equal(exp, result) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_concat_empty_series_dtypes(self): # booleans @@ -244,16 +245,16 @@ def test_concat_empty_series_dtypes(self): # sparse # TODO: move? 
- result = pd.concat([Series(dtype='float64').to_sparse(), Series( - dtype='float64').to_sparse()]) + result = pd.concat([Series(dtype='float64').to_sparse(), + Series(dtype='float64').to_sparse()]) assert result.dtype == 'Sparse[float64]' # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): assert result.ftype == 'float64:sparse' - result = pd.concat([Series(dtype='float64').to_sparse(), Series( - dtype='float64')]) + result = pd.concat([Series(dtype='float64').to_sparse(), + Series(dtype='float64')]) # TODO: release-note: concat sparse dtype expected = pd.core.sparse.api.SparseDtype(np.float64) assert result.dtype == expected @@ -262,8 +263,8 @@ def test_concat_empty_series_dtypes(self): with tm.assert_produces_warning(FutureWarning): assert result.ftype == 'float64:sparse' - result = pd.concat([Series(dtype='float64').to_sparse(), Series( - dtype='object')]) + result = pd.concat([Series(dtype='float64').to_sparse(), + Series(dtype='object')]) # TODO: release-note: concat sparse dtype expected = pd.core.sparse.api.SparseDtype('object') assert result.dtype == expected diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 77b43c1414f77e..5328a58e3fbff1 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -781,6 +781,7 @@ def test_series_fillna_limit(self): assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_sparse_series_fillna_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) @@ -809,6 +810,7 @@ def test_sparse_series_fillna_limit(self): assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_sparse_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index afb54a9fa62646..4e677f5055e797 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -38,6 +38,7 @@ def fill_frame(frame): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_apply(frame): applied = frame.apply(np.sqrt) assert isinstance(applied, SparseDataFrame) @@ -72,6 +73,7 @@ def test_apply_empty(empty): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_apply_nonuq(): orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 050526aecd2bb8..2d0b338ef53c00 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -25,6 +25,8 @@ def test_deprecated(): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrame(SharedWithSparse): klass = SparseDataFrame @@ -348,6 +350,18 @@ def test_dense_to_sparse(self): assert sdf.default_fill_value == 0 tm.assert_frame_equal(sdf.to_dense(), df) + def test_deprecated_dense_to_sparse(self): + # GH 26557 + # Deprecated 0.25.0 + + df 
= pd.DataFrame({"A": [1, np.nan, 3]}) + sparse_df = pd.SparseDataFrame({"A": [1, np.nan, 3]}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = df.to_sparse() + tm.assert_frame_equal(result, sparse_df) + def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) assert df.density == 0.7 @@ -1294,6 +1308,7 @@ def test_default_fill_value_with_no_data(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameArithmetic: def test_numeric_op_scalar(self): @@ -1324,6 +1339,7 @@ def test_comparison_op_scalar(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameAnalytics: def test_cumsum(self, float_frame): diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py index 0dda6b5cbbdaef..41d7bfabed44aa 100644 --- a/pandas/tests/sparse/frame/test_to_csv.py +++ b/pandas/tests/sparse/frame/test_to_csv.py @@ -6,6 +6,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameToCsv: fill_values = [np.nan, 0, None, 1] diff --git a/pandas/tests/sparse/frame/test_to_from_scipy.py b/pandas/tests/sparse/frame/test_to_from_scipy.py index 269d67976b5670..881d8d31e51627 100644 --- a/pandas/tests/sparse/frame/test_to_from_scipy.py +++ b/pandas/tests/sparse/frame/test_to_from_scipy.py @@ -174,6 +174,7 @@ def test_from_scipy_fillna(spmatrix): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_index_names_multiple_nones(): # https://github.com/pandas-dev/pandas/pull/24092 sparse = pytest.importorskip("scipy.sparse") diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index f7c8a84720d0a9..9ce1133cb39ca5 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -61,6 +61,7 @@ def _test_data2_zero(): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeries(SharedWithSparse): series_klass = SparseSeries @@ -1045,6 +1046,7 @@ def test_memory_usage_deep(self, deep, fill_value): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseHandlingMultiIndexes: def setup_method(self, method): @@ -1076,6 +1078,7 @@ def test_round_trip_preserve_multiindex_names(self): "ignore:the matrix subclass:PendingDeprecationWarning" ) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesScipyInteraction: # Issue 8048: add SparseSeries coo methods @@ -1444,6 +1447,7 @@ def _dense_series_compare(s, f): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesAnalytics: def setup_method(self, method): @@ -1538,6 +1542,7 @@ def test_constructor_dict_datetime64_index(datetime_type): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_to_sparse(): # https://github.com/pandas-dev/pandas/issues/22389 arr 
= pd.SparseArray([1, 2, None, 3]) @@ -1546,6 +1551,20 @@ def test_to_sparse(): tm.assert_sp_array_equal(result.values, arr, check_kind=False) +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +def test_deprecated_to_sparse(): + # GH 26557 + # Deprecated 0.25.0 + + ser = Series([1, np.nan, 3]) + sparse_ser = pd.SparseSeries([1, np.nan, 3]) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = ser.to_sparse() + tm.assert_series_equal(result, sparse_ser) + + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_constructor_mismatched_raises(): msg = "Length of passed values is 2, index implies 3" diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index ed29f24ae677fa..4fed878a10ca64 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -180,6 +180,7 @@ def test_concat_sparse_dense(self, kind): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameConcat: def setup_method(self, method): diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 7ed8c48fce333e..805f77eb21c2f3 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -13,6 +13,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesFormatting: @property @@ -110,6 +111,7 @@ def test_sparse_int(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameFormatting: def test_sparse_frame(self): diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 7abc1530618b88..531a4360c78a2f 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -6,6 +6,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseGroupBy: def setup_method(self, method): @@ -61,6 +62,7 @@ def test_aggfuncs(self): @pytest.mark.parametrize("fill_value", [0, np.nan]) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_groupby_includes_fill_value(fill_value): # https://github.com/pandas-dev/pandas/issues/5078 df = pd.DataFrame({'a': [fill_value, 1, fill_value, fill_value], diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index 21c303fa2a064d..df59f1dfe7b135 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -7,6 +7,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesIndexing: def setup_method(self, method): @@ -602,6 +603,8 @@ def test_reindex(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseDataFrameIndexing: def test_getitem(self): diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 48d0719bc7f2b6..114e7b4bacd94f 100644 --- a/pandas/tests/sparse/test_pivot.py +++ 
b/pandas/tests/sparse/test_pivot.py @@ -6,6 +6,8 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestPivotTable: def setup_method(self, method): From d47947a79099d650b4bb981a38ce09881ec6f43d Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 19 Jun 2019 03:05:33 +0200 Subject: [PATCH 021/238] ENH: better MultiIndex.__repr__ (#22511) --- doc/source/user_guide/advanced.rst | 9 +- doc/source/whatsnew/v0.25.0.rst | 32 +++ pandas/core/indexes/base.py | 41 ++-- pandas/core/indexes/multi.py | 220 ++++++++++++------- pandas/core/strings.py | 5 +- pandas/io/formats/printing.py | 108 +++++++-- pandas/tests/indexes/multi/conftest.py | 26 +++ pandas/tests/indexes/multi/test_format.py | 149 +++++++++++-- pandas/tests/util/test_assert_index_equal.py | 7 +- 9 files changed, 451 insertions(+), 146 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 3235e3c2a8b2e6..eb1ca97e465f87 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -182,15 +182,15 @@ on a deeper level. Defined Levels ~~~~~~~~~~~~~~ -The repr of a ``MultiIndex`` shows all the defined levels of an index, even +The :class:`MultiIndex` keeps all the defined levels of an index, even if they are not actually used. When slicing an index, you may notice this. For example: .. ipython:: python -   df.columns # original MultiIndex +   df.columns.levels # original MultiIndex - df[['foo','qux']].columns # sliced + df[['foo','qux']].columns.levels # sliced This is done to avoid a recomputation of the levels in order to make slicing highly performant. If you want to see only the used levels, you can use the @@ -210,7 +210,8 @@ To reconstruct the ``MultiIndex`` with only the used levels, the .. ipython:: python - df[['foo', 'qux']].columns.remove_unused_levels() + new_mi = df[['foo', 'qux']].columns.remove_unused_levels() + new_mi.levels Data alignment and using ``reindex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d6c397679a0f36..b458b0f9982550 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -74,6 +74,38 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca See :ref:`groupby.aggregate.named` for more. + +.. _whatsnew_0250.enhancements.multi_index_repr: + +Better repr for MultiIndex +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Printing of :class:`MultiIndex` instances now shows tuples of each row and ensures +that the tuple items are vertically aligned, so it's now easier to understand +the structure of the ``MultiIndex``. (:issue:`13480`): + +The repr now looks like this: + +.. ipython:: python + + pd.MultiIndex.from_product([['a', 'abc'], range(500)]) + +Previously, outputting a :class:`MultiIndex` printed all the ``levels`` and +``codes`` of the ``MultiIndex``, which was visually unappealing and made +the output more difficult to navigate. For example (limiting the range to 5): + +.. code-block:: ipython + + In [1]: pd.MultiIndex.from_product([['a', 'abc'], range(5)]) + Out[1]: MultiIndex(levels=[['a', 'abc'], [0, 1, 2, 3]], + ...: codes=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]]) + +In the new repr, all values will be shown, if the number of rows is smaller +than :attr:`options.display.max_seq_items` (default: 100 items). 
Horizontally, +the output will truncate, if it's wider than :attr:`options.display.width` +(default: 80 characters). + + .. _whatsnew_0250.enhancements.other: Other Enhancements diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4601d63f2d27ed..68faa3eb3e8836 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1332,16 +1332,23 @@ def set_names(self, names, level=None, inplace=False): >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]]) >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]]) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) >>> idx.set_names(['kind', 'year'], inplace=True) >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], names=['kind', 'year']) >>> idx.set_names('species', level=0) - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], names=['species', 'year']) """ @@ -1403,12 +1410,16 @@ def rename(self, name, inplace=False): ... [2018, 2019]], ... names=['kind', 'year']) >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], names=['kind', 'year']) >>> idx.rename(['species', 'year']) - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], names=['species', 'year']) >>> idx.rename('species') Traceback (most recent call last): @@ -5442,8 +5453,8 @@ def ensure_index_from_sequences(sequences, names=None): >>> ensure_index_from_sequences([['a', 'a'], ['a', 'b']], names=['L1', 'L2']) - MultiIndex(levels=[['a'], ['a', 'b']], - codes=[[0, 0], [0, 1]], + MultiIndex([('a', 'a'), + ('a', 'b')], names=['L1', 'L2']) See Also @@ -5483,8 +5494,10 @@ def ensure_index(index_like, copy=False): Index([('a', 'a'), ('b', 'c')], dtype='object') >>> ensure_index([['a', 'a'], ['b', 'c']]) - MultiIndex(levels=[['a'], ['b', 'c']], - codes=[[0, 0], [0, 1]]) + MultiIndex([('a', 'b'), + ('a', 'c')], + dtype='object') + ) See Also -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9217b388ce86bc..0f457ba799928e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -29,7 +29,8 @@ from pandas.core.indexes.frozen import FrozenList, _ensure_frozen import pandas.core.missing as missing -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + format_object_attrs, format_object_summary, pprint_thing) _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -193,8 +194,10 @@ class MultiIndex(Index): >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex(levels=[[1, 2], ['blue', 'red']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], names=['number', 'color']) See further examples for how to construct a MultiIndex in the doc strings @@ -359,8 +362,10 @@ def from_arrays(cls, arrays, sortorder=None, names=None): -------- >>> arrays = 
[[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex(levels=[[1, 2], ['blue', 'red']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], names=['number', 'color']) """ error_msg = "Input must be a list / sequence of array-likes." @@ -420,8 +425,10 @@ def from_tuples(cls, tuples, sortorder=None, names=None): >>> tuples = [(1, 'red'), (1, 'blue'), ... (2, 'red'), (2, 'blue')] >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - MultiIndex(levels=[[1, 2], ['blue', 'red']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], names=['number', 'color']) """ if not is_list_like(tuples): @@ -477,8 +484,12 @@ def from_product(cls, iterables, sortorder=None, names=None): >>> colors = ['green', 'purple'] >>> pd.MultiIndex.from_product([numbers, colors], ... names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], ['green', 'purple']], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], names=['number', 'color']) """ from pandas.core.arrays.categorical import _factorize_from_iterables @@ -537,15 +548,19 @@ def from_frame(cls, df, sortorder=None, names=None): 3 NJ Precip >>> pd.MultiIndex.from_frame(df) - MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], names=['a', 'b']) Using explicit names, instead of the column names >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) - MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], names=['state', 'observation']) """ if not isinstance(df, ABCDataFrame): @@ -663,21 +678,29 @@ def set_levels(self, levels, level=None, inplace=False, >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a','b'], [1,2]]) - MultiIndex(levels=[['a', 'b'], [1, 2]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + >>> idx.set_levels([['a', 'b'], [1, 2]]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a','b'], level=0) - MultiIndex(levels=[['a', 'b'], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + >>> idx.set_levels(['a', 'b'], level=0) + MultiIndex([('a', 'one'), + ('a', 'two'), + ('b', 'one'), + ('b', 'two')], names=['foo', 'bar']) - >>> idx.set_levels(['a','b'], level='bar') - MultiIndex(levels=[[1, 2], ['a', 'b']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + >>> idx.set_levels(['a', 'b'], level='bar') + MultiIndex([(1, 'a'), + (1, 'b'), + (2, 'a'), + (2, 'b')], names=['foo', 'bar']) - >>> idx.set_levels([['a','b'], [1,2]], level=[0,1]) - MultiIndex(levels=[['a', 'b'], [1, 2]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2)], names=['foo', 'bar']) """ if is_list_like(levels) and not isinstance(levels, Index): @@ -779,24 +802,34 @@ def set_codes(self, codes, level=None, inplace=False, Examples -------- - >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), + (1, 'two'), + (2, 'one'), + 
(2, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([[1,0,1,0], [0,0,1,1]]) - MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[1, 0, 1, 0], [0, 0, 1, 1]], + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([1,0,1,0], level=0) - MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[1, 0, 1, 0], [0, 1, 0, 1]], + >>> idx.set_codes([1, 0, 1, 0], level=0) + MultiIndex([(2, 'one'), + (1, 'two'), + (2, 'one'), + (1, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([0,0,1,1], level='bar') - MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 0, 1, 1]], + >>> idx.set_codes([0, 0, 1, 1], level='bar') + MultiIndex([(1, 'one'), + (1, 'one'), + (2, 'two'), + (2, 'two')], names=['foo', 'bar']) - >>> idx.set_codes([[1,0,1,0], [0,0,1,1]], level=[0,1]) - MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[1, 0, 1, 0], [0, 0, 1, 1]], + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], names=['foo', 'bar']) """ if level is not None and not is_list_like(level): @@ -947,28 +980,25 @@ def _nbytes(self, deep=False): # -------------------------------------------------------------------- # Rendering Methods - - def _format_attrs(self): + def _formatter_func(self, tup): """ - Return a list of tuples of the (attr,formatted_value) + Formats each item in tup according to its level's formatter function. """ - attrs = [ - ('levels', ibase.default_pprint(self._levels, - max_seq_items=False)), - ('codes', ibase.default_pprint(self._codes, - max_seq_items=False))] - if com._any_not_none(*self.names): - attrs.append(('names', ibase.default_pprint(self.names))) - if self.sortorder is not None: - attrs.append(('sortorder', ibase.default_pprint(self.sortorder))) - return attrs - - def _format_space(self): - return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) + formatter_funcs = [level._formatter_func for level in self.levels] + return tuple(func(val) for func, val in zip(formatter_funcs, tup)) def _format_data(self, name=None): - # we are formatting thru the attributes - return None + """ + Return the formatted data as a unicode string + """ + return format_object_summary(self, self._formatter_func, + name=name, line_break_each_value=True) + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value). 
+ """ + return format_object_attrs(self, include_dtype=False) def _format_native_types(self, na_rep='nan', **kwargs): new_levels = [] @@ -1555,9 +1585,19 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]) >>> idx.to_hierarchical(3) - MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + MultiIndex([(1, 'one'), + (1, 'one'), + (1, 'one'), + (1, 'two'), + (1, 'two'), + (1, 'two'), + (2, 'one'), + (2, 'one'), + (2, 'one'), + (2, 'two'), + (2, 'two'), + (2, 'two')], + ) """ levels = self.levels codes = [np.repeat(level_codes, n_repeat) for @@ -1648,16 +1688,21 @@ def _sort_levels_monotonic(self): Examples -------- - >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> i - MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - - >>> i.sort_monotonic() - MultiIndex(levels=[['a', 'b'], ['aa', 'bb']], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]]) + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + >>> mi.sort_values() + MultiIndex([('a', 'aa'), + ('a', 'bb'), + ('b', 'aa'), + ('b', 'bb')], + ) """ if self.is_lexsorted() and self.is_monotonic: @@ -1706,20 +1751,25 @@ def remove_unused_levels(self): Examples -------- - >>> i = pd.MultiIndex.from_product([range(2), list('ab')]) - MultiIndex(levels=[[0, 1], ['a', 'b']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi + MultiIndex([(0, 'a'), + (0, 'b'), + (1, 'a'), + (1, 'b')], + ) - >>> i[2:] - MultiIndex(levels=[[0, 1], ['a', 'b']], - codes=[[1, 1], [0, 1]]) + >>> mi[2:] + MultiIndex([(1, 'a'), + (1, 'b')], + ) The 0 from the first level is not represented and can be removed - >>> i[2:].remove_unused_levels() - MultiIndex(levels=[[1], ['a', 'b']], - codes=[[0, 0], [0, 1]]) + >>> mi2 = mi[2:].remove_unused_levels() + >>> mi2.levels + FrozenList([[1], ['a', 'b']]) """ new_levels = [] @@ -2026,11 +2076,17 @@ def swaplevel(self, i=-2, j=-1): >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], ... 
codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) >>> mi - MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) >>> mi.swaplevel(0, 1) - MultiIndex(levels=[['bb', 'aa'], ['a', 'b']], - codes=[[0, 1, 0, 1], [0, 0, 1, 1]]) + MultiIndex([('bb', 'a'), + ('aa', 'a'), + ('bb', 'b'), + ('aa', 'b')], + ) """ new_levels = list(self.levels) new_codes = list(self.codes) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 413c0e73f8410f..6ebfbc8bb0ee0f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2587,8 +2587,9 @@ def rsplit(self, pat=None, n=-1, expand=False): Which will create a MultiIndex: >>> idx.str.partition() - MultiIndex(levels=[['X', 'Y'], [' '], ['123', '999']], - codes=[[0, 1], [0, 0], [0, 1]]) + MultiIndex([('X', ' ', '123'), + ('Y', ' ', '999')], + dtype='object') Or an index with tuples with ``expand=False``: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index bee66fcbfaa82e..73d8586a0a8c9a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -265,7 +265,7 @@ class TableSchemaFormatter(BaseFormatter): def format_object_summary(obj, formatter, is_justify=True, name=None, - indent_for_name=True): + indent_for_name=True, line_break_each_value=False): """ Return the formatted obj as a unicode string @@ -282,6 +282,12 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, indent_for_name : bool, default True Whether subsequent lines should be be indented to align with the name. + line_break_each_value : bool, default False + If True, inserts a line break for each value of ``obj``. + If False, only break lines when the a line of values gets wider + than the display width. + + .. versionadded:: 0.25.0 Returns ------- @@ -306,7 +312,12 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, space2 = "\n " # space for the opening '[' n = len(obj) - sep = ',' + if line_break_each_value: + # If we want to vertically align on each value of obj, we need to + # separate values by a line break and indent the values + sep = ',\n ' + ' ' * len(name) + else: + sep = ',' max_seq_items = get_option('display.max_seq_items') or n # are we a truncated display @@ -334,10 +345,10 @@ def best_len(values): if n == 0: summary = '[]{}'.format(close) - elif n == 1: + elif n == 1 and not line_break_each_value: first = formatter(obj[0]) summary = '[{}]{}'.format(first, close) - elif n == 2: + elif n == 2 and not line_break_each_value: first = formatter(obj[0]) last = formatter(obj[-1]) summary = '[{}, {}]{}'.format(first, last, close) @@ -353,21 +364,39 @@ def best_len(values): # adjust all values to max length if needed if is_justify: - - # however, if we are not truncated and we are only a single + if line_break_each_value: + # Justify each string in the values of head and tail, so the + # strings will right align when head and tail are stacked + # vertically. 
+ head, tail = _justify(head, tail) + elif (is_truncated or not (len(', '.join(head)) < display_width and + len(', '.join(tail)) < display_width)): + # Each string in head and tail should align with each other + max_length = max(best_len(head), best_len(tail)) + head = [x.rjust(max_length) for x in head] + tail = [x.rjust(max_length) for x in tail] + # If we are not truncated and we are only a single # line, then don't justify - if (is_truncated or - not (len(', '.join(head)) < display_width and - len(', '.join(tail)) < display_width)): - max_len = max(best_len(head), best_len(tail)) - head = [x.rjust(max_len) for x in head] - tail = [x.rjust(max_len) for x in tail] + + if line_break_each_value: + # Now head and tail are of type List[Tuple[str]]. Below we + # convert them into List[str], so there will be one string per + # value. Also truncate items horizontally if wider than + # max_space + max_space = display_width - len(space2) + value = tail[0] + for max_items in reversed(range(1, len(value) + 1)): + pprinted_seq = _pprint_seq(value, max_seq_items=max_items) + if len(pprinted_seq) < max_space: + break + head = [_pprint_seq(x, max_seq_items=max_items) for x in head] + tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail] summary = "" line = space2 - for i in range(len(head)): - word = head[i] + sep + ' ' + for max_items in range(len(head)): + word = head[max_items] + sep + ' ' summary, line = _extend_line(summary, line, word, display_width, space2) @@ -376,8 +405,8 @@ def best_len(values): summary += line.rstrip() + space2 + '...' line = space2 - for i in range(len(tail) - 1): - word = tail[i] + sep + ' ' + for max_items in range(len(tail) - 1): + word = tail[max_items] + sep + ' ' summary, line = _extend_line(summary, line, word, display_width, space2) @@ -391,7 +420,7 @@ def best_len(values): close = ']' + close.rstrip(' ') summary += close - if len(summary) > (display_width): + if len(summary) > (display_width) or line_break_each_value: summary += space1 else: # one row summary += ' ' @@ -402,7 +431,44 @@ def best_len(values): return summary -def format_object_attrs(obj): +def _justify(head, tail): + """ + Justify items in head and tail, so they are right-aligned when stacked. + + Parameters + ---------- + head : list-like of list-likes of strings + tail : list-like of list-likes of strings + + Returns + ------- + tuple of list of tuples of strings + Same as head and tail, but items are right aligned when stacked + vertically. + + Examples + -------- + >>> _justify([['a', 'b']], [['abc', 'abcd']]) + ([(' a', ' b')], [('abc', 'abcd')]) + """ + combined = head + tail + + # For each position for the sequences in ``combined``, + # find the length of the largest string. 
+ max_length = [0] * len(combined[0]) + for inner_seq in combined: + length = [len(item) for item in inner_seq] + max_length = [max(x, y) for x, y in zip(max_length, length)] + + # justify each item in each list-like in head and tail using max_length + head = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) + for seq in head] + tail = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) + for seq in tail] + return head, tail + + +def format_object_attrs(obj, include_dtype=True): """ Return a list of tuples of the (attr, formatted_value) for common attrs, including dtype, name, length @@ -411,6 +477,8 @@ def format_object_attrs(obj): ---------- obj : object must be iterable + include_dtype : bool + If False, dtype won't be in the returned list Returns ------- @@ -418,10 +486,12 @@ def format_object_attrs(obj): """ attrs = [] - if hasattr(obj, 'dtype'): + if hasattr(obj, 'dtype') and include_dtype: attrs.append(('dtype', "'{}'".format(obj.dtype))) if getattr(obj, 'name', None) is not None: attrs.append(('name', default_pprint(obj.name))) + elif getattr(obj, 'names', None) is not None and any(obj.names): + attrs.append(('names', default_pprint(obj.names))) max_seq_items = get_option('display.max_seq_items') or len(obj) if len(obj) > max_seq_items: attrs.append(('length', len(obj))) diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 956d2e6cc17e30..307772347e8f5b 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Index, MultiIndex @@ -52,3 +53,28 @@ def holder(): def compat_props(): # a MultiIndex must have these properties associated with it return ['shape', 'ndim', 'size'] + + +@pytest.fixture +def narrow_multi_index(): + """ + Return a MultiIndex that is narrower than the display (<80 characters). + """ + n = 1000 + ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) + dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) + return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], + names=['a', 'b', 'dti']) + + +@pytest.fixture +def wide_multi_index(): + """ + Return a MultiIndex that is wider than the display (>80 characters). 
+ """ + n = 1000 + ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) + dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3'] + return pd.MultiIndex.from_arrays(levels, names=names) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index c320cb32b856cd..8315478d85125e 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -55,31 +55,11 @@ def test_repr_with_unicode_data(): assert "\\" not in repr(index) # we don't want unicode-escaped -@pytest.mark.skip(reason="#22511 will remove this test") -def test_repr_roundtrip(): - +def test_repr_roundtrip_raises(): mi = MultiIndex.from_product([list('ab'), range(3)], names=['first', 'second']) - str(mi) - - tm.assert_index_equal(eval(repr(mi)), mi, exact=True) - - mi_u = MultiIndex.from_product( - [list('ab'), range(3)], names=['first', 'second']) - result = eval(repr(mi_u)) - tm.assert_index_equal(result, mi_u, exact=True) - - # formatting - str(mi) - - # long format - mi = MultiIndex.from_product([list('abcdefg'), range(10)], - names=['first', 'second']) - - tm.assert_index_equal(eval(repr(mi)), mi, exact=True) - - result = eval(repr(mi_u)) - tm.assert_index_equal(result, mi_u, exact=True) + with pytest.raises(TypeError): + eval(repr(mi)) def test_unicode_string_with_unicode(): @@ -94,3 +74,126 @@ def test_repr_max_seq_item_setting(idx): with pd.option_context("display.max_seq_items", None): repr(idx) assert '...' not in str(idx) + + +class TestRepr: + + def test_repr(self, idx): + result = idx[:1].__repr__() + expected = """\ +MultiIndex([('foo', 'one')], + names=['first', 'second'])""" + assert result == expected + + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ('bar', 'one'), + ('baz', 'two'), + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'])""" + assert result == expected + + with pd.option_context('display.max_seq_items', 5): + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ... + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'], length=6)""" + assert result == expected + + def test_rjust(self, narrow_multi_index): + mi = narrow_multi_index + result = mi[:1].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi[::500].__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:08:20'), + ('abc', 10, '2000-01-01 00:16:40'), + ('abc', 10, '2000-01-01 00:25:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:00:01'), + ( 'a', 9, '2000-01-01 00:00:02'), + ( 'a', 9, '2000-01-01 00:00:03'), + ( 'a', 9, '2000-01-01 00:00:04'), + ( 'a', 9, '2000-01-01 00:00:05'), + ( 'a', 9, '2000-01-01 00:00:06'), + ( 'a', 9, '2000-01-01 00:00:07'), + ( 'a', 9, '2000-01-01 00:00:08'), + ( 'a', 9, '2000-01-01 00:00:09'), + ... 
+ ('abc', 10, '2000-01-01 00:33:10'), + ('abc', 10, '2000-01-01 00:33:11'), + ('abc', 10, '2000-01-01 00:33:12'), + ('abc', 10, '2000-01-01 00:33:13'), + ('abc', 10, '2000-01-01 00:33:14'), + ('abc', 10, '2000-01-01 00:33:15'), + ('abc', 10, '2000-01-01 00:33:16'), + ('abc', 10, '2000-01-01 00:33:17'), + ('abc', 10, '2000-01-01 00:33:18'), + ('abc', 10, '2000-01-01 00:33:19')], + names=['a', 'b', 'dti'], length=2000)""" + assert result == expected + + def test_tuple_width(self, wide_multi_index): + mi = wide_multi_index + result = mi[:1].__repr__() + expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" + assert result == expected + + result = mi[:10].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ( 'a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ( 'a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ( 'a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ( 'a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ( 'a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ( 'a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ( 'a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ( 'a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ( 'a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...), + ... 
+ ('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...), + ('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...), + ('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...), + ('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...), + ('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...), + ('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...), + ('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...), + ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...), + ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...), + ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" # noqa + assert result == expected diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index ec9cbd104d7514..445d9c4e482b09 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -10,8 +10,11 @@ def test_index_equal_levels_mismatch(): Index levels are different \\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) -\\[right\\]: 2, MultiIndex\\(levels=\\[\\['A', 'B'\\], \\[1, 2, 3, 4\\]\\], - codes=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" +\\[right\\]: 2, MultiIndex\\(\\[\\('A', 1\\), + \\('A', 2\\), + \\('B', 3\\), + \\('B', 4\\)\\], + \\)""" idx1 = Index([1, 2, 3]) idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), From baeb1bf92b763861212d0dee951bd8ea658deadb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Jun 2019 12:57:41 -0500 Subject: [PATCH 022/238] BUG: modfy(SparseArray) (#26947) Closes #26946 (cherry picked from commit 430f664ddbb4dab542b34b2c75b6d086fdef4934) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/sparse.py | 11 +++++++++++ pandas/tests/arrays/sparse/test_array.py | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b458b0f9982550..e6bc422b52e89b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -769,6 +769,7 @@ Sparse - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) +- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned. Other ^^^^^ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 5e636b5105e568..3dda6868a80dac 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1697,6 +1697,17 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # No alignment necessary. sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) + + if isinstance(sp_values, tuple): + # multiple outputs. e.g. 
modf + arrays = tuple( + self._simple_new(sp_value, + self.sp_index, + SparseDtype(sp_value.dtype, fv)) + for sp_value, fv in zip(sp_values, fill_value) + ) + return arrays + return self._simple_new(sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c0a1b320790444..231b5a92dbb3ad 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1071,6 +1071,16 @@ def test_ufunc_args(self): result = SparseArray([2, 0, 1, -1], fill_value=1) tm.assert_sp_array_equal(np.add(sparse, 1), result) + @pytest.mark.parametrize('fill_value', [0.0, np.nan]) + def test_modf(self, fill_value): + # https://github.com/pandas-dev/pandas/issues/26946 + sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], + fill_value=fill_value) + r1, r2 = np.modf(sparse) + e1, e2 = np.modf(np.asarray(sparse)) + tm.assert_sp_array_equal(r1, pd.SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) + def test_nbytes_integer(self): arr = SparseArray([1, 0, 0, 0, 2], kind='integer') result = arr.nbytes From d150f17384f53c3269189daccf2276d1aded7936 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 20 Jun 2019 08:58:52 +0200 Subject: [PATCH 023/238] TST: fix class method of test BoolArray (#26957) --- pandas/tests/extension/arrow/bool.py | 1 + pandas/tests/extension/arrow/test_bool.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index 435ea4e3ec2b5c..2263f53544e417 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/bool.py @@ -114,6 +114,7 @@ def copy(self, deep=False): else: return type(self)(copy.copy(self._data)) + @classmethod def _concat_same_type(cls, to_concat): chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 01163064b09180..a7f28310b7554e 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -36,8 +36,7 @@ def test_array_type_with_arg(self, data, dtype): class TestInterface(BaseArrowTests, base.BaseInterfaceTests): - def test_repr(self, data): - raise pytest.skip("TODO") + pass class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): From cfd65e98e694b2ad40e97d06ffdd9096a3dea909 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 06:48:13 -0500 Subject: [PATCH 024/238] TST: Fix flaky import test (#26953) * TST: Fix flaky import test I'm not sure what, but the missing depedency test is causing issues. Now we check that things work by running it in a subprocess with site-packages disabled. 
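A minimal sketch of the approach (illustrative only, assuming it is run from a
pandas checkout whose required dependencies are installed only in
site-packages; the actual test added below uses subprocess.check_output
together with pytest.raises instead):

    import subprocess
    import sys

    # -S skips "import site", so site-packages is left off sys.path and
    # the required dependencies (numpy, pytz, dateutil) cannot be imported.
    proc = subprocess.run(
        [sys.executable, '-S', '-c', 'import pandas'],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    # The import is expected to fail, with the message naming the
    # missing dependencies.
    assert proc.returncode != 0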
Closes https://github.com/pandas-dev/pandas/issues/26952 --- pandas/tests/test_downstream.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 14d3ee5ac4fe26..9fe8b0f9563ef7 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -1,7 +1,6 @@ """ Testing that we work in the downstream packages """ -import builtins import importlib import subprocess import sys @@ -134,30 +133,13 @@ def test_pyarrow(df): tm.assert_frame_equal(result, df) -def test_missing_required_dependency(monkeypatch): +def test_missing_required_dependency(): # GH 23868 - original_import = __import__ - - def mock_import_fail(name, *args, **kwargs): - if name == "numpy": - raise ImportError("cannot import name numpy") - elif name == "pytz": - raise ImportError("cannot import name some_dependency") - elif name == "dateutil": - raise ImportError("cannot import name some_other_dependency") - else: - return original_import(name, *args, **kwargs) - - expected_msg = ( - "Unable to import required dependencies:" - "\nnumpy: cannot import name numpy" - "\npytz: cannot import name some_dependency" - "\ndateutil: cannot import name some_other_dependency" - ) - - import pandas as pd - - with monkeypatch.context() as m: - m.setattr(builtins, "__import__", mock_import_fail) - with pytest.raises(ImportError, match=expected_msg): - importlib.reload(pd) + # use the -S flag to disable site-packages + call = ['python', '-S', '-c', 'import pandas'] + + with pytest.raises(subprocess.CalledProcessError) as exc: + subprocess.check_output(call, stderr=subprocess.STDOUT) + + output = exc.value.stdout.decode() + assert all(x in output for x in ['numpy', 'pytz', 'dateutil']) From a4a18a9a694ba2641ec3ba98afc20615b2d39ad7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 20 Jun 2019 17:51:18 -0700 Subject: [PATCH 025/238] Assorted cleanups (#26975) --- pandas/core/internals/managers.py | 19 ------------------- pandas/io/formats/format.py | 2 +- pandas/io/sql.py | 2 +- pandas/tests/frame/test_constructors.py | 5 +++-- pandas/tests/frame/test_missing.py | 15 ++++++++++----- 5 files changed, 15 insertions(+), 28 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 907498c7ff3508..7fe34279c04826 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -23,7 +23,6 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import _maybe_to_sparse from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.indexing import maybe_convert_indices @@ -1727,10 +1726,6 @@ def form_blocks(arrays, names, axes): object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) blocks.extend(object_blocks) - if len(items_dict['SparseBlock']) > 0: - sparse_blocks = _sparse_blockify(items_dict['SparseBlock']) - blocks.extend(sparse_blocks) - if len(items_dict['CategoricalBlock']) > 0: cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) for i, _, array in items_dict['CategoricalBlock']] @@ -1797,20 +1792,6 @@ def _multi_blockify(tuples, dtype=None): return new_blocks -def _sparse_blockify(tuples, dtype=None): - """ return an array of blocks that potentially have different dtypes (and - are sparse) - """ - - new_blocks = [] - for i, names, array in tuples: - array = _maybe_to_sparse(array) - 
block = make_block(array, placement=[i]) - new_blocks.append(block) - - return new_blocks - - def _stack_arrays(tuples, dtype): # fml diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8655fb05f34e25..b2ef45b15e5494 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1567,7 +1567,7 @@ def __call__(self, num): formatted = format_str.format(mant=mant, prefix=prefix) - return formatted # .strip() + return formatted def set_eng_float_format(accuracy=3, use_eng_prefix=False): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 1e3fe2ade6ab7a..6cb57077be76a4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -623,7 +623,7 @@ def insert_data(self): # GH 9086: Ensure we return datetimes with timezone info # Need to return 2-D data; DatetimeIndex is 1D d = b.values.to_pydatetime() - d = np.expand_dims(d, axis=0) + d = np.atleast_2d(d) else: # convert to microsecond resolution for datetime.datetime d = b.values.astype('M8[us]').astype(object) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 68017786eb6a6c..7dc74961a2adcb 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -5,6 +5,7 @@ import numpy as np import numpy.ma as ma +import numpy.ma.mrecords as mrecords import pytest from pandas.compat import PY36, is_platform_little_endian @@ -839,7 +840,7 @@ def test_constructor_maskedrecarray_dtype(self): data = np.ma.array( np.ma.zeros(5, dtype=[('date', ' Date: Thu, 20 Jun 2019 20:11:55 -0500 Subject: [PATCH 026/238] Surface NumPy FutureWarning about comparisons (#26966) --- pandas/core/indexes/base.py | 9 ++------- pandas/tests/indexes/test_numpy_compat.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 68faa3eb3e8836..73abd708415a13 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -76,13 +76,8 @@ def cmp_method(self, other): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) else: - - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(record=True): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all='ignore'): - result = op(self.values, np.asarray(other)) + with np.errstate(all='ignore'): + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 460faaaf092ec9..349d10f5079e86 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -80,3 +80,16 @@ def test_numpy_ufuncs_other(indices, func): else: with pytest.raises(Exception): func(idx) + + +def test_elementwise_comparison_warning(): + # https://github.com/pandas-dev/pandas/issues/22698#issuecomment-458968300 + # np.array([1, 2]) == 'a' returns False, and produces a + # FutureWarning that it'll be [False, False] in the future. + # We just want to ensure that comes through. + # When NumPy dev actually enforces this change, we'll need to skip + # this test. 
+ idx = Index([1, 2]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + idx == 'a' From 58cbf81f472932d5190a88141fd2e8079fa6b021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Fri, 21 Jun 2019 04:19:27 +0300 Subject: [PATCH 027/238] BUG: Fix skiplist init error with empty window (#26940) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 9 +++++++++ pandas/tests/test_window.py | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e6bc422b52e89b..8767a0c2d5ea10 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -742,6 +742,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) +- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 48b554ca02a9d4..3305fea06f0030 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1099,6 +1099,10 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, use_mock=False) output = np.empty(N, dtype=float) + if win == 0: + output[:] = NaN + return output + sl = skiplist_init(win) if sl == NULL: raise MemoryError("skiplist_init failed") @@ -1486,6 +1490,11 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, minp, index, closed, use_mock=False) output = np.empty(N, dtype=float) + + if win == 0: + output[:] = NaN + return output + skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index bc6946cbade4c6..31baf4475214f2 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -608,6 +608,17 @@ def tests_empty_df_rolling(self, roller): result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() tm.assert_frame_equal(result, expected) + def test_empty_window_median_quantile(self): + # GH 26005 + expected = pd.Series([np.nan, np.nan, np.nan]) + roll = pd.Series(np.arange(3)).rolling(0) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.1) + tm.assert_series_equal(result, expected) + def test_missing_minp_zero(self): # https://github.com/pandas-dev/pandas/pull/18921 # minp=0 From fa92585678c0d80a484f5a6e1b561106002fef78 Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Thu, 20 Jun 2019 21:48:23 -0400 Subject: [PATCH 028/238] Add type hint for (core.arrays).ranges (#26936) --- pandas/core/arrays/_ranges.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 4fbb8ae9f9aee1..7a83b7960a6e7f 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -3,14 +3,19 @@ (and possibly TimedeltaArray/PeriodArray) """ +from typing import Tuple + import numpy as np from pandas._libs.tslibs import 
OutOfBoundsDatetime, Timestamp -from pandas.tseries.offsets import Tick, generate_range +from pandas.tseries.offsets import DateOffset, Tick, generate_range -def generate_regular_range(start, end, periods, freq): +def generate_regular_range(start: Timestamp, + end: Timestamp, + periods: int, + freq: DateOffset) -> Tuple[np.ndarray, str]: """ Generate a range of dates with the spans between dates described by the given `freq` DateOffset. @@ -79,7 +84,10 @@ def generate_regular_range(start, end, periods, freq): return values, tz -def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): +def _generate_range_overflow_safe(endpoint: int, + periods: int, + stride: int, + side: str = 'start') -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -146,7 +154,10 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): return _generate_range_overflow_safe(midpoint, remaining, stride, side) -def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): +def _generate_range_overflow_safe_signed(endpoint: int, + periods: int, + stride: int, + side: str) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. From 7f8dd723a594b3b8ea03d6b87d7b031699ba9250 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 20:48:36 -0500 Subject: [PATCH 029/238] CLN: Deduplicate show_versions (#26816) --- doc/source/install.rst | 1 + pandas/compat/_optional.py | 1 + pandas/io/pytables.py | 7 +- pandas/tests/io/test_pytables_missing.py | 14 ++++ pandas/util/_print_versions.py | 102 ++++++++++------------- pandas/util/_test_decorators.py | 17 ++++ 6 files changed, 81 insertions(+), 61 deletions(-) create mode 100644 pandas/tests/io/test_pytables_missing.py diff --git a/doc/source/install.rst b/doc/source/install.rst index 1c1f0c1d4cf8eb..ee4b36f898e314 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -286,6 +286,7 @@ psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading +pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 4a7b8c4e88649c..875edb3d3f1dd1 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -19,6 +19,7 @@ "s3fs": "0.0.8", "scipy": "0.19.0", "sqlalchemy": "1.1.4", + "tables": "3.4.2", "xarray": "0.8.2", "xlrd": "1.1.0", "xlwt": "1.2.0", diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 983b1286eec91f..79d6d8563a162e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -19,6 +19,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones +from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( @@ -448,11 +449,7 @@ def __init__(self, path, mode=None, complevel=None, complib=None, if 'format' in kwargs: raise ValueError('format is not a defined argument for HDFStore') - try: - import tables # noqa - except ImportError as ex: # pragma: no cover - raise ImportError('HDFStore requires PyTables, "{ex!s}" problem ' - 'importing'.format(ex=ex)) + tables = import_optional_dependency("tables") if complib is not None 
and complib not in tables.filters.all_complibs: raise ValueError( diff --git a/pandas/tests/io/test_pytables_missing.py b/pandas/tests/io/test_pytables_missing.py new file mode 100644 index 00000000000000..4ceb80889c989d --- /dev/null +++ b/pandas/tests/io/test_pytables_missing.py @@ -0,0 +1,14 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas.util.testing as tm + + +@td.skip_if_installed("tables") +def test_pytables_raises(): + df = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(ImportError, match="tables"): + with tm.ensure_clean("foo.h5") as path: + df.to_hdf(path, "df") diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index a5c86c2cc80b39..5e2e013c4afcc3 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -1,5 +1,4 @@ import codecs -import importlib import locale import os import platform @@ -7,6 +6,9 @@ import subprocess import sys +from pandas.compat._optional import ( + VERSIONS, _get_version, import_optional_dependency) + def get_sys_info(): "Returns system information as a dict" @@ -58,60 +60,49 @@ def get_sys_info(): def show_versions(as_json=False): sys_info = get_sys_info() - deps = [ - # (MODULE_NAME, f(mod) -> mod version) - ("pandas", lambda mod: mod.__version__), - ("pytest", lambda mod: mod.__version__), - ("pip", lambda mod: mod.__version__), - ("setuptools", lambda mod: mod.__version__), - ("Cython", lambda mod: mod.__version__), - ("numpy", lambda mod: mod.version.version), - ("scipy", lambda mod: mod.version.version), - ("pyarrow", lambda mod: mod.__version__), - ("xarray", lambda mod: mod.__version__), - ("IPython", lambda mod: mod.__version__), - ("sphinx", lambda mod: mod.__version__), - ("patsy", lambda mod: mod.__version__), - ("dateutil", lambda mod: mod.__version__), - ("pytz", lambda mod: mod.VERSION), - ("blosc", lambda mod: mod.__version__), - ("bottleneck", lambda mod: mod.__version__), - ("tables", lambda mod: mod.__version__), - ("numexpr", lambda mod: mod.__version__), - ("feather", lambda mod: mod.__version__), - ("matplotlib", lambda mod: mod.__version__), - ("openpyxl", lambda mod: mod.__version__), - ("xlrd", lambda mod: mod.__VERSION__), - ("xlwt", lambda mod: mod.__VERSION__), - ("xlsxwriter", lambda mod: mod.__version__), - ("lxml.etree", lambda mod: mod.__version__), - ("bs4", lambda mod: mod.__version__), - ("html5lib", lambda mod: mod.__version__), - ("sqlalchemy", lambda mod: mod.__version__), - ("pymysql", lambda mod: mod.__version__), - ("psycopg2", lambda mod: mod.__version__), - ("jinja2", lambda mod: mod.__version__), - ("s3fs", lambda mod: mod.__version__), - ("fastparquet", lambda mod: mod.__version__), - ("pandas_gbq", lambda mod: mod.__version__), - ("pandas_datareader", lambda mod: mod.__version__), - ("gcsfs", lambda mod: mod.__version__), + 'pandas', + # required + 'numpy', + 'pytz', + 'dateutil', + # install / build, + 'pip', + 'setuptools', + 'Cython', + # test + 'pytest', + 'hypothesis', + # docs + "sphinx", + # Other, need a min version + "blosc", + "feather", + "xlsxwriter", + "lxml.etree", + "html5lib", + "pymysql", + "psycopg2", + "jinja2", + # Other, not imported. 
+ "IPython", + "pandas_datareader", ] - deps_blob = list() - for (modname, ver_f) in deps: - try: - if modname in sys.modules: - mod = sys.modules[modname] - else: - mod = importlib.import_module(modname) - ver = ver_f(mod) - deps_blob.append((modname, ver)) - except ImportError: - deps_blob.append((modname, None)) + deps.extend(list(VERSIONS)) + deps_blob = [] - if (as_json): + for modname in deps: + mod = import_optional_dependency(modname, + raise_on_missing=False, + on_version="ignore") + if mod: + ver = _get_version(mod) + else: + ver = None + deps_blob.append((modname, ver)) + + if as_json: try: import json except ImportError: @@ -126,16 +117,15 @@ def show_versions(as_json=False): json.dump(j, f, indent=2) else: - + maxlen = max(len(x) for x in deps) + tpl = '{{k:<{maxlen}}}: {{stat}}'.format(maxlen=maxlen) print("\nINSTALLED VERSIONS") print("------------------") - for k, stat in sys_info: - print("{k}: {stat}".format(k=k, stat=stat)) - + print(tpl.format(k=k, stat=stat)) print("") for k, stat in deps_blob: - print("{k}: {stat}".format(k=k, stat=stat)) + print(tpl.format(k=k, stat=stat)) def main(): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0cb82c0028c90d..fd9c9d07a974e0 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -100,6 +100,23 @@ def _skip_if_no_scipy(): safe_import('scipy.signal')) +def skip_if_installed( + package: str, +) -> MarkDecorator: + """ + Skip a test if a package is installed. + + Parameters + ---------- + package : str + The name of the package. + """ + return pytest.mark.skipif( + safe_import(package), + reason="Skipping because {} is installed.".format(package) + ) + + def skip_if_no( package: str, min_version: Optional[str] = None From 388d22c3d1e6804dbc1390e41db1d7277b1d8c66 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 20 Jun 2019 19:01:59 -0700 Subject: [PATCH 030/238] BUG: avoid overflow in Bday generate_range, closes #24252 (#26651) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 9 +++++++++ pandas/tests/arithmetic/test_timedelta64.py | 9 +++++---- pandas/tests/indexes/datetimes/test_date_range.py | 13 +++++++++++++ pandas/tests/scalar/timestamp/test_timestamp.py | 7 +++++++ pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tseries/offsets.py | 14 +++++++++++++- 7 files changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8767a0c2d5ea10..a6b74865f6619c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -600,6 +600,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. 
(:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) +- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 04bb4454462a7d..0a3f4ed3cc91d5 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -275,6 +275,10 @@ cdef convert_to_tsobject(object ts, object tz, object unit, - iso8601 string object - python datetime object - another timestamp object + + Raises + ------ + OutOfBoundsDatetime : ts cannot be converted within implementation bounds """ cdef: _TSObject obj @@ -294,6 +298,11 @@ cdef convert_to_tsobject(object ts, object tz, object unit, if obj.value != NPY_NAT: dt64_to_dtstruct(obj.value, &obj.dts) elif is_integer_object(ts): + try: + ts = ts + except OverflowError: + # GH#26651 re-raise as OutOfBoundsDatetime + raise OutOfBoundsDatetime(ts) if ts == NPY_NAT: obj.value = NPY_NAT else: diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ead9876e7c2a84..2dff9a6088de8f 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -5,7 +5,8 @@ import numpy as np import pytest -from pandas.errors import NullFrequencyError, PerformanceWarning +from pandas.errors import ( + NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning) import pandas as pd from pandas import ( @@ -479,10 +480,10 @@ def test_tdi_add_timestamp_nat_masking(self): def test_tdi_add_overflow(self): # See GH#14068 - msg = "too (big|large) to convert" - with pytest.raises(OverflowError, match=msg): + # preliminary test scalar analogue of vectorized tests below + with pytest.raises(OutOfBoundsDatetime): pd.to_timedelta(106580, 'D') + Timestamp('2000') - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsDatetime): Timestamp('2000') + pd.to_timedelta(106580, 'D') _NaT = int(pd.NaT) + 1 diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 7f03793d880b03..1545cc52eb1f44 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -740,6 +740,19 @@ def test_bdays_and_open_boundaries(self, closed): expected = pd.date_range(bday_start, bday_end, freq='D') tm.assert_index_equal(result, expected) + def test_bday_near_overflow(self): + # GH#24252 avoid doing unnecessary addition that _would_ overflow + start = pd.Timestamp.max.floor("D").to_pydatetime() + rng = pd.date_range(start, end=None, periods=1, freq='B') + expected = pd.DatetimeIndex([start], freq='B') + tm.assert_index_equal(rng, expected) + + def test_bday_overflow_error(self): + # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError + start = pd.Timestamp.max.floor("D").to_pydatetime() + with pytest.raises(OutOfBoundsDatetime): + pd.date_range(start, periods=2, freq='B') + class TestCustomDateRange: diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 773b4e6f21a190..4b6b0dac916c62 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ 
b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -463,6 +463,13 @@ def test_invalid_date_kwarg_with_string_input(self, arg): with pytest.raises(ValueError): Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + def test_out_of_bounds_integer_value(self): + # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.min.value * 2) + def test_out_of_bounds_value(self): one_us = np.timedelta64(1).astype('timedelta64[us]') diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 8c8a2f75c4a47e..a1ad792e57bde1 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -115,7 +115,7 @@ def test_apply_out_of_range(self, tz_naive_fixture): assert t.tzinfo == result.tzinfo except OutOfBoundsDatetime: - raise + pass except (ValueError, KeyError): # we are creating an invalid offset # so ignore diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index c1764b3845fce6..00837d36d9508e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -97,6 +97,8 @@ def wrapper(self, other): if tz is not None and result.tzinfo is None: result = conversion.localize_pydatetime(result, tz) + result = Timestamp(result) + return result return wrapper @@ -2330,7 +2332,7 @@ def apply(self, other): # an exception, when we call using the + operator, # we directly call the known method result = other.__add__(self) - if result == NotImplemented: + if result is NotImplemented: raise OverflowError return result elif isinstance(other, (datetime, np.datetime64, date)): @@ -2467,6 +2469,11 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): while cur <= end: yield cur + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: @@ -2477,6 +2484,11 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): while cur >= end: yield cur + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: From 984514ef76166be37b19a6166c1868fa7d98f904 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Fri, 21 Jun 2019 04:03:02 +0200 Subject: [PATCH 031/238] BENCH: fix noisy asv benchmarks that were running on exhausted generators (#26772) --- asv_bench/benchmarks/ctors.py | 7 +++++++ asv_bench/benchmarks/frame_ctor.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 1c6841a296377d..42adede631a010 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -55,7 +55,14 @@ class SeriesConstructors: [False, True], ['float', 'int']] + # Generators get exhausted on use, so run setup before every call + number = 1 + repeat = (3, 250, 10) + def setup(self, data_fmt, with_index, dtype): + if data_fmt in (gen_of_str, gen_of_tuples) and with_index: + raise NotImplementedError('Series constructors do not support ' + 'using generators with indexes') N = 10**4 if dtype == 'float': arr = np.random.randn(N) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 19c2a913e8494a..9533938b30faca 100644 --- 
a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -72,6 +72,10 @@ class FromRecords: params = [None, 1000] param_names = ['nrows'] + # Generators get exhausted on use, so run setup before every call + number = 1 + repeat = (3, 250, 10) + def setup(self, nrows): N = 100000 self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) From 4850b287b4134885d0ca8f63650326d3525e274c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Jun 2019 21:06:34 -0500 Subject: [PATCH 032/238] Fix matplotlib converter registering warning (#26770) --- pandas/plotting/_core.py | 17 ++++----- pandas/plotting/_matplotlib/__init__.py | 6 ++++ pandas/plotting/_matplotlib/boxplot.py | 6 +++- pandas/plotting/_matplotlib/core.py | 7 ++-- pandas/plotting/_matplotlib/hist.py | 5 ++- pandas/plotting/_matplotlib/misc.py | 8 ++++- pandas/plotting/_matplotlib/style.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 4 +-- pandas/plotting/_matplotlib/tools.py | 3 +- pandas/tests/plotting/test_converter.py | 25 ++++++++++++-- pandas/tests/plotting/test_datetimelike.py | 40 ++++------------------ pandas/util/_test_decorators.py | 2 +- 12 files changed, 69 insertions(+), 56 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 81f5b5cb0f74c3..78c7082c69b6b1 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -5,19 +5,16 @@ from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas from pandas.core.base import PandasObject from pandas.core.generic import _shared_doc_kwargs, _shared_docs -# Automatically registering converters was deprecated in 0.21, but -# the deprecation warning wasn't showing until 0.24 -# This block will be eventually removed, but it's not clear when -if pandas.get_option('plotting.matplotlib.register_converters'): - try: - from .misc import register - register(explicit=False) - except ImportError: - pass +# Trigger matplotlib import, which implicitly registers our +# converts. Implicit registration is deprecated, and when enforced +# we can lazily import matplotlib. 
+try: + import pandas.plotting._matplotlib # noqa +except ImportError: + pass df_kind = """- 'scatter' : scatter plot - 'hexbin' : hexbin plot""" diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 5cfb6843db9eda..1b775d03349d01 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,3 +1,5 @@ +from pandas._config import get_option + from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, boxplot_frame, boxplot_frame_groupby) from pandas.plotting._matplotlib.converter import deregister, register @@ -11,6 +13,10 @@ from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table +if get_option("plotting.matplotlib.register_converters"): + register(explicit=False) + + __all__ = ['LinePlot', 'BarPlot', 'BarhPlot', 'HistPlot', 'BoxPlot', 'KdePlot', 'AreaPlot', 'PiePlot', 'ScatterPlot', 'HexBinPlot', 'hist_series', 'hist_frame', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index b8a7da5270fc02..f8bc531e3c344d 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -1,7 +1,6 @@ from collections import namedtuple import warnings -from matplotlib import pyplot as plt from matplotlib.artist import setp import numpy as np @@ -11,6 +10,7 @@ import pandas as pd from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import _flatten, _subplots @@ -215,6 +215,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): + import matplotlib.pyplot as plt # validate return_type: if return_type not in BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") @@ -296,6 +297,8 @@ def plot_group(keys, values, ax): def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): + import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, grid=grid, rot=rot, figsize=figsize, layout=layout, return_type=return_type, **kwds) @@ -306,6 +309,7 @@ def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, rot=0, grid=True, ax=None, figsize=None, layout=None, sharex=False, sharey=True, **kwds): + converter._WARN = False # no warning for pandas plots if subplots is True: naxes = len(grouped) fig, axes = _subplots(naxes=naxes, squeeze=False, diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index a7049afee80b0e..5fb4d201223bd1 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -2,7 +2,6 @@ from typing import Optional # noqa import warnings -import matplotlib.pyplot as plt import numpy as np from pandas._config import get_option @@ -61,6 +60,8 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, secondary_y=False, colormap=None, table=False, layout=None, **kwds): + import matplotlib.pyplot as plt + converter._WARN = False # no warning 
for pandas plots self.data = data self.by = by @@ -103,7 +104,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else self.plt.rcParams['axes.grid'] + grid = False if secondary_y else plt.rcParams['axes.grid'] self.grid = grid self.legend = legend @@ -618,6 +619,8 @@ def _get_ax(self, i): @classmethod def get_default_ax(cls, ax): + import matplotlib.pyplot as plt + if ax is None and len(plt.get_fignums()) > 0: with plt.rc_context(): ax = plt.gca() diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 585c407e33311c..d34c0cb6a3889f 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,6 +1,5 @@ import warnings -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -10,6 +9,7 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import ( _flatten, _set_ticks_props, _subplots) @@ -203,6 +203,7 @@ def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) + converter._WARN = False # no warning for pandas plots xrot = xrot or rot fig, axes = _grouped_plot(plot_group, data, column=column, @@ -220,6 +221,7 @@ def plot_group(group, ax): def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds): + import matplotlib.pyplot as plt if by is None: if kwds.get('layout', None) is not None: raise ValueError("The 'layout' keyword is not supported when " @@ -261,6 +263,7 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10, **kwds): + converter._WARN = False # no warning for pandas plots if by is not None: axes = _grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index dacc9ef04f8199..663a3c5153fac9 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -2,7 +2,6 @@ import matplotlib.lines as mlines import matplotlib.patches as patches -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.missing import notna @@ -105,6 +104,7 @@ def _get_marker_compat(marker): def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): + import matplotlib.pyplot as plt def normalize(series): a = min(series) @@ -169,6 +169,7 @@ def normalize(series): def andrews_curves(frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds): + import matplotlib.pyplot as plt def function(amplitudes): def f(t): @@ -224,6 +225,7 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): + import matplotlib.pyplot as plt # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] @@ -270,6 +272,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, use_columns=False, 
xticks=None, colormap=None, axvlines=True, axvlines_kwds=None, sort_labels=False, **kwds): + import matplotlib.pyplot as plt if axvlines_kwds is None: axvlines_kwds = {'linewidth': 1, 'color': 'black'} @@ -336,6 +339,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, def lag_plot(series, lag=1, ax=None, **kwds): # workaround because `c='b'` is hardcoded in matplotlibs scatter method + import matplotlib.pyplot as plt kwds.setdefault('c', plt.rcParams['patch.facecolor']) data = series.values @@ -350,6 +354,8 @@ def lag_plot(series, lag=1, ax=None, **kwds): def autocorrelation_plot(series, ax=None, **kwds): + import matplotlib.pyplot as plt + n = len(series) data = np.asarray(series) if ax is None: diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 80a15942a2867f..8c9e3ea330dd30 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -3,7 +3,6 @@ import matplotlib.cm as cm import matplotlib.colors -import matplotlib.pyplot as plt import numpy as np from pandas.core.dtypes.common import is_list_like @@ -13,6 +12,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): + import matplotlib.pyplot as plt if color is None and colormap is not None: if isinstance(colormap, str): cmap = colormap diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 30038b599a386a..e36ffed10d94f9 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -3,8 +3,6 @@ import functools import warnings -from matplotlib import pylab -import matplotlib.pyplot as plt import numpy as np from pandas._libs.tslibs.frequencies import ( @@ -42,6 +40,7 @@ def tsplot(series, plotf, ax=None, **kwargs): .. deprecated:: 0.23.0 Use Series.plot() instead """ + import matplotlib.pyplot as plt warnings.warn("'tsplot' is deprecated and will be removed in a " "future version. Please use Series.plot() instead.", FutureWarning, stacklevel=2) @@ -323,6 +322,7 @@ def format_dateaxis(subplot, freq, index): default, changing the limits of the x axis will intelligently change the positions of the ticks. 
""" + from matplotlib import pylab # handle index specific formatting # Note: DatetimeIndex does not use this diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index f6393fc76892f4..e491cfc3309a0a 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -2,7 +2,6 @@ from math import ceil import warnings -import matplotlib.pyplot as plt import matplotlib.table import matplotlib.ticker as ticker import numpy as np @@ -168,6 +167,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, # Four polar axes plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ + import matplotlib.pyplot as plt if subplot_kw is None: subplot_kw = {} @@ -345,6 +345,7 @@ def _get_xlim(lines): def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): + import matplotlib.pyplot as plt for ax in _flatten(axes): if xlabelsize is not None: plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 39cd48ff35f96a..92d207e46b7ab8 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -12,11 +12,30 @@ from pandas import Index, Period, Series, Timestamp, date_range import pandas.util.testing as tm +from pandas.plotting import ( + deregister_matplotlib_converters, register_matplotlib_converters) from pandas.tseries.offsets import Day, Micro, Milli, Second -converter = pytest.importorskip('pandas.plotting._converter') -from pandas.plotting import (deregister_matplotlib_converters, # isort:skip - register_matplotlib_converters) +try: + from pandas.plotting._matplotlib import converter +except ImportError: + # try / except, rather than skip, to avoid internal refactoring + # causing an improprer skip + pass + +pytest.importorskip('matplotlib.pyplot') + + +def test_initial_warning(): + code = ( + "import pandas as pd; import matplotlib.pyplot as plt; " + "s = pd.Series(1, pd.date_range('2000', periods=12)); " + "fig, ax = plt.subplots(); " + "ax.plot(s.index, s.values)" + ) + call = [sys.executable, '-c', code] + out = subprocess.check_output(call, stderr=subprocess.STDOUT).decode() + assert 'Using an implicitly' in out def test_timtetonum_accepts_unicode(): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 10743ca95e29e0..c3d824389aa4db 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -374,7 +374,6 @@ def test_axis_limits(self): def _test(ax): xlim = ax.get_xlim() ax.set_xlim(xlim[0] - 5, xlim[1] + 10) - ax.get_figure().canvas.draw() result = ax.get_xlim() assert result[0] == xlim[0] - 5 assert result[1] == xlim[1] + 10 @@ -383,7 +382,6 @@ def _test(ax): expected = (Period('1/1/2000', ax.freq), Period('4/1/2000', ax.freq)) ax.set_xlim('1/1/2000', '4/1/2000') - ax.get_figure().canvas.draw() result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal @@ -392,7 +390,6 @@ def _test(ax): expected = (Period('1/1/2000', ax.freq), Period('4/1/2000', ax.freq)) ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) - ax.get_figure().canvas.draw() result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal @@ -429,12 +426,7 @@ def test_get_finder(self): def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] - if self.mpl_ge_3_0_0 or not 
self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) - else: # 2.2.3, 2.2.4 - xpl1 = [7565, 7564, 7553, 7546, 7518, 7428, 7066] - xpl2 = [7566, 7564, 7554, 7546, 7519, 7429, 7066] - + xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) rs1 = [] rs2 = [] for i, n in enumerate(day_lst): @@ -457,12 +449,7 @@ def test_finder_daily(self): def test_finder_quarterly(self): yrs = [3.5, 11] - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) - else: # 2.2.3, 2.2.4 - xpl1 = [68, 68] - xpl2 = [72, 68] - + xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): @@ -485,12 +472,7 @@ def test_finder_quarterly(self): def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) - else: # 2.2.3, 2.2.4 - xpl1 = [216, 216, 204, 204] - xpl2 = [216, 216, 216, 204] - + xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): @@ -521,11 +503,7 @@ def test_finder_monthly_long(self): @pytest.mark.slow def test_finder_annual(self): - if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: - xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - else: # 2.2.3, 2.2.4 - xp = [1986, 1986, 1990, 1990, 1995, 2020, 1970, 1970] - + xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] xp = [Period(x, freq='A').ordinal for x in xp] rs = [] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): @@ -1093,7 +1071,6 @@ def test_time(self): df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1120,7 +1097,6 @@ def test_time_change_xlim(self): df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1138,7 +1114,6 @@ def test_time_change_xlim(self): ax.set_xlim('1:30', '5:00') # check tick labels again - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1165,7 +1140,6 @@ def test_time_musec(self): ax = df.plot(ax=ax) # verify tick labels - fig.canvas.draw() ticks = ax.get_xticks() labels = ax.get_xticklabels() for t, l in zip(ticks, labels): @@ -1432,7 +1406,7 @@ def test_format_timedelta_ticks_narrow(self): df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) - fig.canvas.draw() + self.plt.draw() labels = ax.get_xticklabels() result_labels = [x.get_text() for x in labels] @@ -1456,7 +1430,7 @@ def test_format_timedelta_ticks_wide(self): df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() ax = df.plot(fontsize=2, ax=ax) - fig.canvas.draw() + self.plt.draw() labels = ax.get_xticklabels() result_labels = [x.get_text() for x in labels] @@ -1529,7 +1503,7 @@ def test_matplotlib_scatter_datetime64(self): df["time"] = date_range("2018-01-01", periods=10, freq="D") fig, ax = self.plt.subplots() ax.scatter(x="time", y="y", data=df) - fig.canvas.draw() + self.plt.draw() label = ax.get_xticklabels()[0] if self.mpl_ge_3_0_0: expected = "2017-12-08" diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index fd9c9d07a974e0..ab22539f4530f5 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -76,7 +76,7 @@ def safe_import(mod_name, min_version=None): def _skip_if_no_mpl(): mod = 
safe_import("matplotlib") if mod: - mod.use("Agg", warn=False) + mod.use("Agg", warn=True) else: return True From b9b081dc6b510c8290ded12fe751b1216843527e Mon Sep 17 00:00:00 2001 From: killerontherun1 Date: Fri, 21 Jun 2019 07:40:48 +0530 Subject: [PATCH 033/238] Docstring GL01 GL02 fixes (#26526) --- pandas/core/accessor.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/interval.py | 4 ++-- pandas/core/base.py | 8 ++++---- pandas/core/computation/eval.py | 19 ++++++++++++------- pandas/core/dtypes/dtypes.py | 16 ++++++++++++---- pandas/core/dtypes/inference.py | 3 ++- pandas/core/generic.py | 6 ++++-- pandas/core/groupby/groupby.py | 5 +++-- pandas/core/indexes/interval.py | 3 ++- pandas/core/indexes/multi.py | 10 +++++++--- pandas/core/indexing.py | 3 ++- pandas/core/resample.py | 3 +-- pandas/core/reshape/merge.py | 6 ++++-- 14 files changed, 57 insertions(+), 33 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 4353e0b3edd081..b092541da93e64 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -196,7 +196,7 @@ def decorator(accessor): return decorator -_doc = """\ +_doc = """ Register a custom accessor on %(klass)s objects. Parameters diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924a..155638aca55603 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -196,7 +196,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -_codes_doc = """\ +_codes_doc = """ The category codes of this categorical. Level codes are an array if integer which are the positions of the real diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4f628eff431674..71f4cbae7c58d5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -987,7 +987,7 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result - _interval_shared_docs['to_tuples'] = """\ + _interval_shared_docs['to_tuples'] = """ Return an %(return_type)s of tuples of the form (left, right) Parameters @@ -1002,7 +1002,7 @@ def __array__(self, dtype=None): ------- tuples: %(return_type)s %(examples)s\ - """ + """ @Appender(_interval_shared_docs['to_tuples'] % dict( return_type='ndarray', diff --git a/pandas/core/base.py b/pandas/core/base.py index e4274e48d32277..ab9d8b9d778e5e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -638,8 +638,8 @@ def _is_builtin_func(self, arg): class IndexOpsMixin: - """ common ops mixin to support a unified interface / docs for Series / - Index + """ + Common ops mixin to support a unified interface / docs for Series / Index """ # ndarray compatibility @@ -656,8 +656,8 @@ def transpose(self, *args, **kwargs): nv.validate_transpose(args, kwargs) return self - T = property(transpose, doc="Return the transpose, which is by " - "definition self.") + T = property(transpose, doc="""\nReturn the transpose, which is by + definition self.\n""") @property def _is_homogeneous_type(self): diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 8f6c271af4a584..ef4639a3afe4c9 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -"""Top level ``eval`` module. +""" +Top level ``eval`` module. """ import tokenize @@ -15,7 +16,8 @@ def _check_engine(engine): - """Make sure a valid engine is passed. + """ + Make sure a valid engine is passed. 
Parameters ---------- @@ -31,7 +33,6 @@ def _check_engine(engine): Returns ------- string engine - """ from pandas.core.computation.check import _NUMEXPR_INSTALLED @@ -60,7 +61,8 @@ def _check_engine(engine): def _check_parser(parser): - """Make sure a valid parser is passed. + """ + Make sure a valid parser is passed. Parameters ---------- @@ -88,7 +90,8 @@ def _check_resolvers(resolvers): def _check_expression(expr): - """Make sure an expression is not an empty string + """ + Make sure an expression is not an empty string Parameters ---------- @@ -105,7 +108,8 @@ def _check_expression(expr): def _convert_expression(expr): - """Convert an object to an expression. + """ + Convert an object to an expression. Thus function converts an object to an expression (a unicode string) and checks to make sure it isn't empty after conversion. This is used to @@ -155,7 +159,8 @@ def _check_for_locals(expr, stack_level, parser): def eval(expr, parser='pandas', engine=None, truediv=True, local_dict=None, global_dict=None, resolvers=(), level=0, target=None, inplace=False): - """Evaluate a Python expression as a string using various backends. + """ + Evaluate a Python expression as a string using various backends. The following arithmetic operations are supported: ``+``, ``-``, ``*``, ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a56ee72cf1910a..7fe8ce7d716832 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -631,12 +631,16 @@ def __init__(self, unit="ns", tz=None): @property def unit(self): - """The precision of the datetime data.""" + """ + The precision of the datetime data. + """ return self._unit @property def tz(self): - """The timezone.""" + """ + The timezone. + """ return self._tz @classmethod @@ -777,7 +781,9 @@ def __new__(cls, freq=None): @property def freq(self): - """The frequency object of this PeriodDtype.""" + """ + The frequency object of this PeriodDtype. + """ return self._freq @classmethod @@ -944,7 +950,9 @@ def __new__(cls, subtype=None): @property def subtype(self): - """The dtype of the Interval bounds.""" + """ + The dtype of the Interval bounds. + """ return self._subtype @classmethod diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 63cb4d85ca308e..02ee777bbe7f3c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -427,7 +427,8 @@ def is_named_tuple(obj): def is_hashable(obj): - """Return True if hash(obj) will succeed, False otherwise. + """ + Return True if hash(obj) will succeed, False otherwise. Some types will pass a test against collections.abc.Hashable but fail when they are actually hashed with hash(). diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dba88495d8128b..360576ffdb00a2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1838,7 +1838,8 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): - """Get the 'info axis' (see Indexing for more) + """ + Get the 'info axis' (see Indexing for more) This is index for Series, columns for DataFrame. @@ -1850,7 +1851,8 @@ def keys(self): return self._info_axis def iteritems(self): - """Iterate over (label, values) on info axis + """ + Iterate over (label, values) on info axis This is index for Series, columns for DataFrame and so on. 
""" diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2b190c53da53d0..43950f2f503c85 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -172,7 +172,7 @@ class providing the base-class of operations. {examples} """) -_pipe_template = """\ +_pipe_template = """ Apply a function `func` with arguments to this %(klass)s object and return the function's result. @@ -223,7 +223,8 @@ class providing the base-class of operations. Examples -------- -%(examples)s""" +%(examples)s +""" _transform_template = """ Call function producing a like-indexed %(klass)s on each group and diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 896935fa72adbd..577d0221cd8da8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -769,7 +769,8 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc(self, key, method=None): - """Get integer location, slice or boolean mask for requested label. + """ + Get integer location, slice or boolean mask for requested label. Parameters ---------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0f457ba799928e..0d6e75f95f8637 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1149,7 +1149,7 @@ def _set_names(self, names, level=None, validate=True): self.levels[l].rename(name, inplace=True) names = property(fset=_set_names, fget=_get_names, - doc="Names of levels in MultiIndex") + doc="""\nNames of levels in MultiIndex\n""") @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): @@ -1823,12 +1823,16 @@ def remove_unused_levels(self): @property def nlevels(self): - """Integer number of levels in this MultiIndex.""" + """ + Integer number of levels in this MultiIndex. + """ return len(self.levels) @property def levshape(self): - """A tuple with the length of each level.""" + """ + A tuple with the length of each level. + """ return tuple(len(x) for x in self.levels) def __reduce__(self): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7f4827be6dff77..6a21adb1d16ae8 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1311,7 +1311,8 @@ def _get_slice_axis(self, slice_obj, axis=None): class _IXIndexer(_NDFrameIndexer): - """A primarily label-location based indexer, with integer position + """ + A primarily label-location based indexer, with integer position fallback. Warning: Starting in 0.20.0, the .ix indexer is deprecated, in diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 874973846a0068..d1d99d28e59b65 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -204,8 +204,7 @@ def _assure_grouper(self): >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) A 2012-08-02 1 - 2012-08-04 1 - """) + 2012-08-04 1""") @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super().pipe(func, *args, **kwargs) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1a80b35629356b..d21ad58e752c29 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -133,7 +133,8 @@ def merge_ordered(left, right, on=None, left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y'), how='outer'): - """Perform merge with optional filling/interpolation designed for ordered + """ + Perform merge with optional filling/interpolation designed for ordered data like time series data. 
Optionally perform group-wise merge (see examples) @@ -240,7 +241,8 @@ def merge_asof(left, right, on=None, tolerance=None, allow_exact_matches=True, direction='backward'): - """Perform an asof merge. This is similar to a left-join except that we + """ + Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. Both DataFrames must be sorted by the key. From c275dbfcee0fa9644fa2718ad76fd91ca056069b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Jun 2019 09:05:08 +0200 Subject: [PATCH 034/238] BUG: catch out-of-bounds datetime64 in Series/DataFrame constructor (#26848) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/dtypes/cast.py | 4 ++- pandas/core/internals/construction.py | 5 +++- pandas/tests/test_base.py | 38 +++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6b74865f6619c..a897f364d80667 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -600,6 +600,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) +- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`). 
- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) Timedelta diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f66e9ed46aa0e..c68d469d291e7f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1038,6 +1038,8 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): .tz_convert(dtype.tz)) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values + except OutOfBoundsDatetime: + raise except (AttributeError, ValueError, TypeError): pass @@ -1063,7 +1065,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): dtype = value.dtype if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) + value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == 'm' and dtype != _TD_DTYPE: value = to_timedelta(value) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2616f0aa97d0d4..f564ac13dc41d2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -8,7 +8,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import IncompatibleFrequency +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( @@ -700,6 +700,9 @@ def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): elif not is_extension_type(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) + except OutOfBoundsDatetime: + # in case of out of bound datetime64 -> always raise + raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3b4f85e680f6e2..d24ed9433f4f71 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1341,3 +1341,41 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) + + +class TestConstruction: + # test certain constructor behaviours on dtype inference across Series, + # Index and DataFrame + + @pytest.mark.parametrize("klass", [ + Series, + lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], + pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], + marks=pytest.mark.xfail), + Index, + ]) + @pytest.mark.parametrize("a", [ + np.array(['2263-01-01'], dtype='datetime64[D]'), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64('2263-01-01', 'D')], dtype=object), + np.array(["2263-01-01"], dtype=object) + ], ids=['datetime64[D]', 'object-datetime.datetime', + 'object-numpy-scalar', 'object-string']) + def test_constructor_datetime_outofbound(self, a, klass): + # GH-26853 (+ bug GH-26206 out of bound non-ns unit) + + # No dtype specified (dtype inference) + # datetime64[non-ns] raise error, other cases result in object dtype + # and preserve original data + if a.dtype.kind == 'M': + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a) + else: + result = klass(a) + assert result.dtype == 'object' + tm.assert_numpy_array_equal(result.to_numpy(), a) + + # Explicit dtype specified + # Forced conversion fails for all -> all cases raise error + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a, dtype='datetime64[ns]') From 224362951942e1f4e05fb8948596620aedac26d9 Mon Sep 17 00:00:00 2001 From: 
Marc Garcia Date: Fri, 21 Jun 2019 10:37:17 +0100 Subject: [PATCH 035/238] PLOT: Add option to specify the plotting backend (#26753) --- doc/source/user_guide/options.rst | 6 +++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/config_init.py | 36 +++++++++++++++++++++++++++ pandas/plotting/_core.py | 11 ++++---- pandas/plotting/_misc.py | 9 +------ pandas/tests/plotting/test_backend.py | 33 ++++++++++++++++++++++++ pandas/tests/plotting/test_misc.py | 2 +- 7 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 pandas/tests/plotting/test_backend.py diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 4b466c2c44d491..4d0def435cb1e0 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -431,6 +431,12 @@ compute.use_bottleneck True Use the bottleneck library computation if it is installed. compute.use_numexpr True Use the numexpr library to accelerate computation if it is installed. +plotting.backend matplotlib Change the plotting backend to a different + backend than the current matplotlib one. + Backends can be implemented as third-party + libraries implementing the pandas plotting + API. They can use other plotting libraries + like Bokeh, Altair, etc. plotting.matplotlib.register_converters True Register custom converters with matplotlib. Set to False to de-register. ======================================= ============ ================================== diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a897f364d80667..77b689569d57fb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -132,6 +132,7 @@ Other Enhancements - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) +- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. 
Use ``pandas.set_option('plotting.backend', '')`` where `` Date: Fri, 21 Jun 2019 16:19:43 +0200 Subject: [PATCH 036/238] COMPAT: reading generic PyTables Table format fails with sub-selection (#26818) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/pytables.py | 36 ++++++++++-- pandas/tests/io/pytables/test_compat.py | 76 +++++++++++++++++++++++++ pandas/tests/io/test_pytables.py | 2 +- 4 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 pandas/tests/io/pytables/test_compat.py diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 77b689569d57fb..467cb5a40213c7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -699,6 +699,7 @@ I/O - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) +- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 79d6d8563a162e..17d580bae5cf1d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1624,7 +1624,8 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1813,10 +1814,29 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding, errors): - """ set the values from this selection: take = take ownership """ + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): + """ set the values from this selection: take = take ownership + + Parameters + ---------- + + values : np.ndarray + nan_rep : str + encoding : str + errors : str + start : int, optional + Table row number: the start of the sub-selection. + stop : int, optional + Table row number: the end of the sub-selection. Values larger than + the underlying table's row count are normalized to that. 
+ """ + + start = start if start is not None else 0 + stop = (min(stop, self.table.nrows) + if stop is not None else self.table.nrows) + self.values = Int64Index(np.arange(stop - start)) - self.values = Int64Index(np.arange(self.table.nrows)) return self def get_attr(self): @@ -2159,7 +2179,8 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing " "items dtype in table!") - def convert(self, values, nan_rep, encoding, errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -3431,8 +3452,11 @@ def read_axes(self, where, **kwargs): # convert the data for a in self.axes: a.set_info(self.info) + # `kwargs` may contain `start` and `stop` arguments if passed to + # `store.select()`. If set they determine the index size. a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=self.errors) + errors=self.errors, start=kwargs.get('start'), + stop=kwargs.get('stop')) return True diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py new file mode 100644 index 00000000000000..d74e1218ebdb05 --- /dev/null +++ b/pandas/tests/io/pytables/test_compat.py @@ -0,0 +1,76 @@ +import pytest + +import pandas as pd +from pandas.tests.io.test_pytables import ensure_clean_path +from pandas.util.testing import assert_frame_equal + +tables = pytest.importorskip('tables') + + +@pytest.fixture +def pytables_hdf5_file(): + """Use PyTables to create a simple HDF5 file.""" + + table_schema = { + 'c0': tables.Time64Col(pos=0), + 'c1': tables.StringCol(5, pos=1), + 'c2': tables.Int64Col(pos=2), + } + + t0 = 1561105000.0 + + testsamples = [ + {'c0': t0, 'c1': 'aaaaa', 'c2': 1}, + {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2}, + {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5}, + {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295}, + ] + + objname = 'pandas_test_timeseries' + + with ensure_clean_path('written_with_pytables.h5') as path: + # The `ensure_clean_path` context mgr removes the temp file upon exit. + with tables.open_file(path, mode='w') as f: + t = f.create_table('/', name=objname, description=table_schema) + for sample in testsamples: + for key, value in sample.items(): + t.row[key] = value + t.row.append() + + yield path, objname, pd.DataFrame(testsamples) + + +class TestReadPyTablesHDF5: + """ + A group of tests which covers reading HDF5 files written by plain PyTables + (not written by pandas). + + Was introduced for regression-testing issue 11188. 
+ """ + + def test_read_complete(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + result = pd.read_hdf(path, key=objname) + expected = df + assert_frame_equal(result, expected) + + def test_read_with_start(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1) + expected = df[1:].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_stop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, stop=1) + expected = df[:1].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_startstop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1, stop=2) + expected = df[1:2].reset_index(drop=True) + assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 299c0feb502be3..ef9dbc63d873d4 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -105,7 +105,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, def ensure_clean_path(path): """ return essentially a named temporary file that is not opened - and deleted on existing; if path is a list, then create and + and deleted on exiting; if path is a list, then create and return list of filenames """ try: From f2aea09e7ce6fd6beb20d2cdffa44edab6f285cc Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 21 Jun 2019 17:52:24 +0200 Subject: [PATCH 037/238] TST: tests for maybe_promote (precursor to #23982) (#25637) --- pandas/conftest.py | 44 ++ pandas/tests/dtypes/cast/test_promote.py | 677 +++++++++++++++++++++++ 2 files changed, 721 insertions(+) create mode 100644 pandas/tests/dtypes/cast/test_promote.py diff --git a/pandas/conftest.py b/pandas/conftest.py index c4285e9db038af..4bcd0ea8442e66 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -399,6 +399,10 @@ def tz_aware_fixture(request): return request.param +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + # ---------------------------------------------------------------- # Dtypes # ---------------------------------------------------------------- @@ -438,6 +442,46 @@ def string_dtype(request): return request.param +@pytest.fixture(params=BYTES_DTYPES) +def bytes_dtype(request): + """Parametrized fixture for bytes dtypes. + + * bytes + * 'bytes' + """ + return request.param + + +@pytest.fixture(params=OBJECT_DTYPES) +def object_dtype(request): + """Parametrized fixture for object dtypes. + + * object + * 'object' + """ + return request.param + + +@pytest.fixture(params=DATETIME64_DTYPES) +def datetime64_dtype(request): + """Parametrized fixture for datetime64 dtypes. + + * 'datetime64[ns]' + * 'M8[ns]' + """ + return request.param + + +@pytest.fixture(params=TIMEDELTA64_DTYPES) +def timedelta64_dtype(request): + """Parametrized fixture for timedelta64 dtypes. 
+ + * 'timedelta64[ns]' + * 'm8[ns]' + """ + return request.param + + @pytest.fixture(params=FLOAT_DTYPES) def float_dtype(request): """ diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py new file mode 100644 index 00000000000000..5a5b5d47b3ccca --- /dev/null +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -0,0 +1,677 @@ +""" +These test the method maybe_promote from core/dtypes/cast.py +""" + +import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import NaT, iNaT +from pandas.compat import is_platform_windows + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, + is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype + +import pandas as pd + + +@pytest.fixture(params=[bool, 'uint8', 'int32', 'uint64', 'float32', 'float64', + 'complex64', 'complex128', 'M8[ns]', 'm8[ns]', str, + bytes, object]) +def any_numpy_dtype_reduced(request): + """ + Parameterized fixture for numpy dtypes, reduced from any_numpy_dtype. + + * bool + * 'int32' + * 'uint64' + * 'float32' + * 'float64' + * 'complex64' + * 'complex128' + * 'M8[ns]' + * 'M8[ns]' + * str + * bytes + * object + """ + return request.param + + +@pytest.fixture(params=[(True, None), (True, object), (False, None)], + ids=['True-None', 'True-object', 'False-None']) +def box(request): + """ + Parametrized fixture determining whether/how to transform fill_value. + + Since fill_value is defined on a per-test basis, the actual transformation + (based on this fixture) is executed in _check_promote. + + Returns + ------- + boxed : Boolean + Whether fill_value should be wrapped in an np.array. + box_dtype : dtype + The dtype to pass to np.array([fill_value], dtype=box_dtype). If None, + then this is passed on unmodified, and corresponds to the numpy default + dtype for the given fill_value. + + * (True, None) # fill_value wrapped in array with default dtype + * (True, object) # fill_value wrapped in array with object dtype + * (False, None) # fill_value passed on as scalar + """ + return request.param + + +def _safe_dtype_assert(left_dtype, right_dtype): + """ + Compare two dtypes without raising TypeError. + """ + if isinstance(right_dtype, PandasExtensionDtype): + # switch order of equality check because numpy dtypes (e.g. if + # left_dtype is np.object_) do not know some expected dtypes (e.g. + # DatetimeTZDtype) and would raise a TypeError in their __eq__-method. + assert right_dtype == left_dtype + else: + assert left_dtype == right_dtype + + +def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar=None, exp_val_for_array=None): + """ + Auxiliary function to unify testing of scalar/array promotion. + + Parameters + ---------- + dtype : dtype + The value to pass on as the first argument to maybe_promote. + fill_value : scalar + The value to pass on as the second argument to maybe_promote, either as + a scalar, or boxed into an array (depending on the parameter `boxed`). + boxed : Boolean + Parameter whether fill_value should be passed to maybe_promote + directly, or wrapped in an array (of dtype box_dtype). + box_dtype : dtype + The dtype to enforce when wrapping fill_value into an np.array. 
+ expected_dtype : dtype + The expected dtype returned by maybe_promote (by design this is the + same regardless of whether fill_value was passed as a scalar or in an + array!). + exp_val_for_scalar : scalar + The expected value for the (potentially upcast) fill_value returned by + maybe_promote. + exp_val_for_array : scalar + The expected missing value marker for the expected_dtype (which is + returned by maybe_promote when it receives an array). + """ + assert is_scalar(fill_value) + + if boxed: + # in this case, we pass on fill_value wrapped in an array of specified + # box_dtype; the expected value returned from maybe_promote is the + # missing value marker for the returned dtype. + fill_array = np.array([fill_value], dtype=box_dtype) + result_dtype, result_fill_value = maybe_promote(dtype, fill_array) + expected_fill_value = exp_val_for_array + else: + # here, we pass on fill_value as a scalar directly; the expected value + # returned from maybe_promote is fill_value, potentially upcast to the + # returned dtype. + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) + expected_fill_value = exp_val_for_scalar + + _safe_dtype_assert(result_dtype, expected_dtype) + + # for equal values, also check type (relevant e.g. for int vs float, resp. + # for different datetimes and timedeltas) + match_value = (result_fill_value == expected_fill_value + # disabled type check due to too many xfails; GH 23982/25425 + # and type(result_fill_value) == type(expected_fill_value) + ) + + # for missing values, None == None and iNaT == iNaT (which is checked + # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT + match_missing = ((result_fill_value is np.nan + and expected_fill_value is np.nan) + or (result_fill_value is NaT + and expected_fill_value is NaT)) + + assert match_value or match_missing + + +def test_maybe_promote_int_with_int(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (False, None)]) +def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): + dtype = np.dtype(any_int_dtype) + fill_dtype = np.dtype(float_dtype) + boxed, box_dtype = box # read from parametrized fixture + + if float_dtype == 'float32' and not boxed: + pytest.xfail('falsely upcasts to float64') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling int with float always upcasts to float64 + expected_dtype = np.float64 + # fill_value can be different float type + exp_val_for_scalar = np.float64(fill_value) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (False, None)]) +def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): + + dtype = np.dtype(float_dtype) + fill_dtype = np.dtype(any_int_dtype) + boxed, box_dtype = box # read from parametrized fixture + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling float with int always keeps float dtype + # because: np.finfo('float32').max > np.iinfo('uint64').max + expected_dtype = dtype + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], 
dtype=expected_dtype)[0] + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_float_with_float(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): + dtype = np.dtype(bool) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if boxed and fill_dtype == bool: + pytest.xfail('falsely upcasts to object') + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrongly casts fill_value') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bool with anything but bool casts to object + expected_dtype = np.dtype(object) if fill_dtype != bool else fill_dtype + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan if fill_dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_value = True + boxed, box_dtype = box # read from parametrized fixture + + if boxed and dtype == bool: + pytest.xfail('falsely upcasts to object') + if boxed and dtype not in (str, object) and box_dtype is None: + pytest.xfail('falsely upcasts to object') + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # filling anything but bool with bool casts to object + expected_dtype = np.dtype(object) if dtype != bool else dtype + # output is not a generic bool, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + exp_val_for_array = np.nan if dtype != bool else None + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_bytes_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_any_with_bytes(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +def test_maybe_promote_datetime64_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +# override parametrization of box to add special case for dt_dtype +@pytest.mark.parametrize('box', [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype + # (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, + datetime64_dtype, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_datetime64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and (box_dtype == 'dt_dtype' + or (box_dtype is None + and 
is_datetime64_dtype(type(fill_value))))): + pytest.xfail('mix of lack of upcasting, resp. wrong missing value') + if not boxed and is_timedelta64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' + else box_dtype) + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, object)]) +def test_maybe_promote_datetimetz_with_any_numpy_dtype( + tz_aware_fixture, any_numpy_dtype_reduced, box): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if box_dtype != object: + pytest.xfail('does not upcast correctly') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetimetz with any numpy dtype casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(True, None), (True, object)]) +def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, + tz_aware_fixture2, box): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) + boxed, box_dtype = box # read from parametrized fixture + + from dateutil.tz import tzlocal + if is_platform_windows() and tz_aware_fixture2 == tzlocal(): + pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + if dtype.tz == fill_dtype.tz and boxed: + pytest.xfail('falsely upcasts') + if dtype.tz != fill_dtype.tz and not boxed: + pytest.xfail('falsely upcasts') + + # create array of given dtype; casts "1" to correct dtype + fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] + + # filling datetimetz with datetimetz casts to object, unless tz matches + exp_val_for_scalar = fill_value + if dtype.tz == fill_dtype.tz: + expected_dtype = dtype + exp_val_for_array = NaT + else: + expected_dtype = np.dtype(object) + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(False, None)]) +def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): + + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and (box_dtype == object + or (box_dtype is None + and (fill_value is None or fill_value is NaT)))): + pytest.xfail('false upcasts to object') + # takes the opinion that DatetimeTZ should have single na-marker + # using iNaT would lead to 
errors elsewhere -> NaT + if not boxed and fill_value == iNaT: + pytest.xfail('wrong missing value marker') + + expected_dtype = dtype + # DatetimeTZDtype does not use iNaT as missing value marker + exp_val_for_scalar = NaT + exp_val_for_array = NaT + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [ + pd.Timestamp('now'), np.datetime64('now'), + datetime.datetime.now(), datetime.date.today() +], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) +def test_maybe_promote_any_numpy_dtype_with_datetimetz( + any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime_or_timedelta_dtype(dtype) and not boxed: + pytest.xfail('raises error') + + fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] + + # filling any numpy dtype with datetimetz casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_timedelta64_with_any(): + # placeholder due to too many xfails; see GH 23982 / 25425 + pass + + +@pytest.mark.parametrize('fill_value', [ + pd.Timedelta(days=1), np.timedelta64(24, 'h'), datetime.timedelta(1) +], ids=['pd.Timedelta', 'np.timedelta64', 'datetime.timedelta']) +# override parametrization of box to add special case for td_dtype +@pytest.mark.parametrize('box', [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +def test_maybe_promote_any_with_timedelta64( + any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if is_timedelta64_dtype(dtype): + if (boxed and (box_dtype == object + or (box_dtype is None + and not is_timedelta64_dtype(type(fill_value))))): + pytest.xfail('falsely upcasts to object') + else: + if (boxed and box_dtype is None + and is_timedelta64_dtype(type(fill_value))): + pytest.xfail('does not upcast correctly') + if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + if box_dtype == 'td_dtype': + pytest.xfail('falsely upcasts') + if not boxed and is_datetime64_dtype(dtype): + pytest.xfail('raises error') + + # special case for box_dtype + box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' + else box_dtype) + + # filling anything but timedelta with timedelta casts to object + if is_timedelta64_dtype(dtype): + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).value + exp_val_for_array = iNaT + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def 
test_maybe_promote_string_with_any(string_dtype, + any_numpy_dtype_reduced, box): + dtype = np.dtype(string_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +# override parametrization of box to add special case for str +@pytest.mark.parametrize('box', [ + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, None), # fill_value wrapped in array with default dtype + # (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None) # fill_value passed on as scalar +]) +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, + string_dtype, box): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_dtype = np.dtype(string_dtype) + boxed, box_dtype = box # read from parametrized fixture + + if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: + pytest.xfail('does not upcast or raises') + if (boxed and box_dtype in (None, 'str') and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype) + or issubclass(dtype.type, np.bytes_))): + pytest.xfail('does not upcast correctly') + + # create array of given dtype + fill_value = 'abc' + + # special case for box_dtype (cannot use fixture in parametrization) + box_dtype = fill_dtype if box_dtype == 'str' else box_dtype + + # filling anything with a string casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_object_with_any(object_dtype, + any_numpy_dtype_reduced, box): + dtype = np.dtype(object_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (boxed and box_dtype is None + and is_datetime_or_timedelta_dtype(fill_dtype)): + pytest.xfail('wrong missing value marker') + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, + object_dtype, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if not boxed and is_datetime_or_timedelta_dtype(dtype): + pytest.xfail('raises error') + + # create array of object dtype from a scalar value (i.e. passing + # dtypes.common.is_scalar), which can however not be cast to int/float etc. 
+ fill_value = pd.DateOffset(1) + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], + ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +# override parametrization due to to many xfails; see GH 23982 / 25425 +@pytest.mark.parametrize('box', [(False, None)]) +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, + fill_value, box): + dtype = np.dtype(any_numpy_dtype_reduced) + boxed, box_dtype = box # read from parametrized fixture + + if (dtype == bytes and not boxed + and fill_value is not None and fill_value is not NaT): + pytest.xfail('does not upcast to object') + elif dtype == 'uint64' and not boxed and fill_value == iNaT: + pytest.xfail('does not upcast correctly') + elif is_datetime_or_timedelta_dtype(dtype) and boxed: + pytest.xfail('falsely upcasts to object') + elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) + or is_complex_dtype(dtype)) + and fill_value is not NaT and dtype != 'uint64'): + pytest.xfail('falsely upcasts to object') + elif (boxed and dtype == 'uint64' + and (fill_value is np.nan or fill_value is None)): + pytest.xfail('falsely upcasts to object') + # below: opinionated that iNaT should be interpreted as missing value + elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) + and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + elif ((is_string_dtype(dtype) or dtype == bool) + and not boxed and fill_value == iNaT): + pytest.xfail('does not cast to missing value marker correctly') + + if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + # uint64 + negative int casts to object; iNaT is considered as missing + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + elif is_integer_dtype(dtype) and fill_value == iNaT: + # other integer + iNaT casts to int64 + expected_dtype = np.int64 + exp_val_for_scalar = iNaT + elif is_integer_dtype(dtype) and fill_value is not NaT: + # integer + other missing value (np.nan / None) casts to float + expected_dtype = np.float64 + exp_val_for_scalar = np.nan + elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT): + # inserting into object does not cast the value + # but *does* cast None to np.nan + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif is_datetime_or_timedelta_dtype(dtype): + # datetime / timedelta cast all missing values to iNaT + expected_dtype = dtype + exp_val_for_scalar = iNaT + elif fill_value is NaT: + # NaT upcasts everything that's not datetime/timedelta to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = NaT + elif is_float_dtype(dtype) or is_complex_dtype(dtype): + # float / complex + missing value (!= NaT) stays the same + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + # all other cases cast to object, and use np.nan as missing value + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + + # array case has same expected_dtype; but returns corresponding na-marker + if is_integer_dtype(expected_dtype): + # integers cannot hold NaNs; maybe_promote_with_array returns None + exp_val_for_array = None + elif is_datetime_or_timedelta_dtype(expected_dtype): + exp_val_for_array = iNaT + else: # expected_dtype = float / complex / object + 
exp_val_for_array = np.nan + + _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, + exp_val_for_scalar, exp_val_for_array) + + +@pytest.mark.parametrize('dim', [0, 2, 3]) +def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create 0-dim array of given dtype; casts "1" to correct dtype + fill_array = np.array(1, dtype=dtype) + + # expand to desired dimension: + for _ in range(dim): + fill_array = np.expand_dims(fill_array, 0) + + # test against 1-dimensional case + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype)) + + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + + assert result_dtype == expected_dtype + # None == None, iNaT == iNaT, but np.nan != np.nan + assert ((result_missing_value == expected_missing_value) + or (result_missing_value is np.nan + and expected_missing_value is np.nan)) From ba69f95a88e372e748fb6d7f29aa4b06bad578ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Jun 2019 10:58:28 -0500 Subject: [PATCH 038/238] Additional tests for ufunc(Series) (#26951) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/series/test_ufunc.py | 253 ++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/series/test_ufunc.py diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 467cb5a40213c7..05978d500fa82b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -774,7 +774,7 @@ Sparse - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`) - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) -- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned. +- Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). Other ^^^^^ diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py new file mode 100644 index 00000000000000..05d19452b1eace --- /dev/null +++ b/pandas/tests/series/test_ufunc.py @@ -0,0 +1,253 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + +UNARY_UFUNCS = [np.positive, np.floor, np.exp] +BINARY_UFUNCS = [ + np.add, # dunder op + np.logaddexp, +] +SPARSE = [ + pytest.param(True, + marks=pytest.mark.xfail(reason="Series.__array_ufunc__")), + False, +] +SPARSE_IDS = ['sparse', 'dense'] +SHUFFLE = [ + pytest.param(True, marks=pytest.mark.xfail(reason="GH-26945", + strict=False)), + False +] + + +@pytest.fixture +def arrays_for_binary_ufunc(): + """ + A pair of random, length-100 integer-dtype arrays, that are mostly 0. 
+ """ + a1 = np.random.randint(0, 10, 100, dtype='int64') + a2 = np.random.randint(0, 10, 100, dtype='int64') + a1[::3] = 0 + a2[::4] = 0 + return a1, a2 + + +@pytest.mark.parametrize("ufunc", UNARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_unary_ufunc(ufunc, sparse): + # Test that ufunc(Series) == Series(ufunc) + array = np.random.randint(0, 10, 10, dtype='int64') + array[::2] = 0 + if sparse: + array = pd.SparseArray(array, dtype=pd.SparseDtype('int', 0)) + + index = list(string.ascii_letters[:10]) + name = "name" + series = pd.Series(array, index=index, name=name) + + result = ufunc(series) + expected = pd.Series(ufunc(array), index=index, name=name) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = a2 + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [ + pytest.param(True, marks=pytest.mark.xfail(reason="Index should defer")), + False +], ids=['flipped', 'straight']) +def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # * ufunc(Index, Series) dispatches to Series (returns a Series) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. 
+ series = pd.Series(a1, name=name) + other = pd.Index(a2, name=name).astype("int64") + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", [True, False], ids=['unaligned', + 'aligned']) +@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, + arrays_for_binary_ufunc): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # with alignment between the indices + + if flip and shuffle: + pytest.xfail(reason="Fix with Series.__array_ufunc__") + + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = pd.Series(a2, name=name) + + idx = np.random.permutation(len(a1)) + + if shuffle: + other = other.take(idx) + a2 = a2.take(idx) + # alignment, so the expected index is the first index in the op. + if flip: + index = other.align(series)[0].index + else: + index = series.align(other)[0].index + else: + index = series.index + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = tuple(reversed(array_args)) + series_args = tuple(reversed(series_args)) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), index=index, name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False]) +def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): + # Test that + # * ufunc(Series, scalar) == Series(ufunc(array, scalar)) + # * ufunc(Series, scalar) == ufunc(scalar, Series) + array, _ = arrays_for_binary_ufunc + if sparse: + array = pd.SparseArray(array) + other = 2 + series = pd.Series(array, name="name") + + series_args = (series, other) + array_args = (array, other) + + if flip: + series_args = tuple(reversed(series_args)) + array_args = tuple(reversed(array_args)) + + expected = pd.Series(ufunc(*array_args), name="name") + result = ufunc(*series_args) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.divmod]) # any others? +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", SHUFFLE) +@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") +def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, + arrays_for_binary_ufunc): + # Test that + # the same conditions from binary_ufunc_scalar apply to + # ufuncs with multiple outputs. 
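A rough sketch (not part of the patch) of the dense, non-flipped behaviour these ufunc tests pin down; the sparse and reversed variants are xfailed above pending Series.__array_ufunc__:

import numpy as np
import pandas as pd

s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
s2 = pd.Series([10, 20, 30], index=['c', 'b', 'a'], name='x')

# ufunc(Series, Series) should come back as a Series, aligned on the index
# and keeping the shared name
print(np.add(s1, s2))   # a=31, b=22, c=13 after alignment, name='x'

# multiple-output ufuncs such as np.divmod should return a tuple whose
# elements are each wrapped back into a Series
div, mod = np.divmod(s1, pd.Series([2, 2, 2], index=['a', 'b', 'c']))
print(type(div), type(mod))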
+ if sparse and ufunc is np.divmod: + pytest.skip("sparse divmod not implemented.") + + a1, a2 = arrays_for_binary_ufunc + + if sparse: + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + + s1 = pd.Series(a1) + s2 = pd.Series(a2) + + if shuffle: + # ensure we align before applying the ufunc + s2 = s2.sample(frac=1) + + expected = ufunc(a1, a2) + assert isinstance(expected, tuple) + + result = ufunc(s1, s2) + assert isinstance(result, tuple) + tm.assert_series_equal(result[0], pd.Series(expected[0])) + tm.assert_series_equal(result[1], pd.Series(expected[1])) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): + # Test that the same conditions from unary input apply to multi-output + # ufuncs + array, _ = arrays_for_binary_ufunc + + if sparse: + array = pd.SparseArray(array) + + series = pd.Series(array, name="name") + result = np.modf(series) + expected = np.modf(array) + + assert isinstance(result, tuple) + assert isinstance(expected, tuple) + + tm.assert_series_equal(result[0], pd.Series(expected[0], name="name")) + tm.assert_series_equal(result[1], pd.Series(expected[1], name="name")) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.xfail(reason="Series.__array_ufunc__") +def test_binary_ufunc_drops_series_name(ufunc, sparse, + arrays_for_binary_ufunc): + # Drop the names when they differ. + a1, a2 = arrays_for_binary_ufunc + s1 = pd.Series(a1, name='a') + s2 = pd.Series(a2, name='b') + + result = ufunc(s1, s2) + assert result.name is None From dfcd2b2c575d474014908802d9cedf1ac3259635 Mon Sep 17 00:00:00 2001 From: robbuckley <20515024+robbuckley@users.noreply.github.com> Date: Fri, 21 Jun 2019 17:25:32 +0100 Subject: [PATCH 039/238] BLD: fix build error for PyPy on macOS (#26536) (#26862) --- doc/source/whatsnew/v0.25.0.rst | 5 +++++ setup.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 05978d500fa82b..275be4ff58e993 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -776,6 +776,11 @@ Sparse - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) - Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). +Build Changes +^^^^^^^^^^^^^ + +- Fix install error with PyPy on macOS (:issue:`26536`) + Other ^^^^^ diff --git a/setup.py b/setup.py index 4579bbfa597970..389e8553eb3a31 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ import pkg_resources import platform -from distutils.sysconfig import get_config_var +from distutils.sysconfig import get_config_vars import sys import shutil from distutils.version import LooseVersion @@ -442,19 +442,19 @@ def run(self): if debugging_symbols_requested: extra_compile_args.append('-g') -# For mac, ensure extensions are built for macos 10.9 when compiling on a -# 10.9 system or above, overriding distuitls behaviour which is to target -# the version that python was built for. This may be overridden by setting +# Build for at least macOS 10.9 when compiling on a 10.9 system or above, +# overriding CPython distuitls behaviour which is to target the version that +# python was built for. 
This may be overridden by setting # MACOSX_DEPLOYMENT_TARGET before calling setup.py if is_platform_mac(): if 'MACOSX_DEPLOYMENT_TARGET' not in os.environ: - current_system = LooseVersion(platform.mac_ver()[0]) - python_target = LooseVersion( - get_config_var('MACOSX_DEPLOYMENT_TARGET')) - if python_target < '10.9' and current_system >= '10.9': + current_system = platform.mac_ver()[0] + python_target = get_config_vars().get('MACOSX_DEPLOYMENT_TARGET', + current_system) + if (LooseVersion(python_target) < '10.9' and + LooseVersion(current_system) >= '10.9'): os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' - # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled From 9088f5ebbaf098bd7113bfba7eaa6dcdbf0b4c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Fri, 21 Jun 2019 20:01:34 +0300 Subject: [PATCH 040/238] BUG: Fix rolling median and quantile with closed='left' and closed='neither' (#26005) (#26910) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 50 +++++++++++++++++---------------- pandas/tests/test_window.py | 19 +++++++++++++ 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 275be4ff58e993..5e5a2aed3ac03f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -747,6 +747,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) +- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 3305fea06f0030..df86f395d60977 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1116,21 +1116,15 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, if i == 0: # setup - val = values[i] - if notnan(val): - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - - else: - - # calculate deletes - for j in range(start[i - 1], s): + for j in range(s, e): val = values[j] if notnan(val): - skiplist_remove(sl, val) - nobs -= 1 + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + else: # calculate adds for j in range(end[i - 1], e): @@ -1141,6 +1135,13 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, if err: break + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(sl, val) + nobs -= 1 + if nobs >= minp: midpoint = (nobs / 2) if nobs % 2: @@ -1507,19 +1508,13 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, if i == 0: # setup - val = values[i] - if notnan(val): - nobs += 1 - skiplist_insert(skiplist, val) - - else: - - # calculate deletes - for j in range(start[i - 1], s): + for j in range(s, e): val = values[j] if notnan(val): - skiplist_remove(skiplist, val) - nobs -= 1 + nobs += 1 + skiplist_insert(skiplist, val) + + else: # calculate adds for j in range(end[i - 1], e): @@ -1528,6 +1523,13 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, nobs += 1 skiplist_insert(skiplist, val) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 + if nobs >= minp: if nobs == 1: # Single value in skip list diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 31baf4475214f2..9524a78dae16ce 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -594,6 +594,25 @@ def test_closed_min_max_minp(self, func, closed, expected): expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("closed,expected", [ + ('right', [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ('both', [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ('neither', [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ('left', [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]) + ]) + def test_closed_median_quantile(self, closed, expected): + # GH 26005 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + roll = ser.rolling('3D', closed=closed) + expected = pd.Series(expected, index=ser.index) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.5) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('roller', ['1s', 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be From 171615a35574a1fc9c4e8260edca7d1e08e9c302 Mon Sep 17 00:00:00 2001 From: Jan-Philip Gehrcke Date: Fri, 21 Jun 2019 19:55:28 +0200 Subject: [PATCH 041/238] CLN: move pytables tests to tests/io/pytables 
dir (#26986) --- pandas/tests/io/pytables/__init__.py | 0 pandas/tests/io/pytables/test_compat.py | 2 +- pandas/tests/io/{ => pytables}/test_pytables.py | 0 pandas/tests/io/{ => pytables}/test_pytables_missing.py | 0 4 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 pandas/tests/io/pytables/__init__.py rename pandas/tests/io/{ => pytables}/test_pytables.py (100%) rename pandas/tests/io/{ => pytables}/test_pytables_missing.py (100%) diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index d74e1218ebdb05..34ed066dd37488 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -from pandas.tests.io.test_pytables import ensure_clean_path +from pandas.tests.io.pytables.test_pytables import ensure_clean_path from pandas.util.testing import assert_frame_equal tables = pytest.importorskip('tables') diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py similarity index 100% rename from pandas/tests/io/test_pytables.py rename to pandas/tests/io/pytables/test_pytables.py diff --git a/pandas/tests/io/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py similarity index 100% rename from pandas/tests/io/test_pytables_missing.py rename to pandas/tests/io/pytables/test_pytables_missing.py From 9aef32db29925bec7a0372b92a63cfc4e78398c2 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Fri, 21 Jun 2019 21:19:58 +0000 Subject: [PATCH 042/238] BUG: Handle NA values for ExtensionArrays in Series.count (#26836) --- doc/source/whatsnew/v0.25.0.rst | 7 ++++++- pandas/core/series.py | 2 +- pandas/tests/extension/base/methods.py | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5e5a2aed3ac03f..19636f42c6129e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -782,11 +782,16 @@ Build Changes - Fix install error with PyPy on macOS (:issue:`26536`) +ExtensionArray +^^^^^^^^^^^^^^ + +- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). +- :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) + Other ^^^^^ - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) -- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) .. 
_whatsnew_0.250.contributors: diff --git a/pandas/core/series.py b/pandas/core/series.py index c4a449154860f4..11e578e74f6e75 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1659,7 +1659,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(com.values_from_object(self)).sum() + return notna(self.array).sum() if isinstance(level, str): level = self.index._get_level_number(level) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1852edaa9e7485..c8fd4d1b708e5c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -30,6 +30,13 @@ def test_count(self, data_missing): expected = pd.Series([0, 1]) self.assert_series_equal(result, expected) + def test_series_count(self, data_missing): + # GH#26835 + ser = pd.Series(data_missing) + result = ser.count() + expected = 1 + assert result == expected + def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) From a14874f3e85bae66c32ab1ae4c1cd8a72e741ffe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Jun 2019 17:10:44 -0500 Subject: [PATCH 043/238] xfail test_missing_required_dependency test (#26993) --- pandas/tests/test_downstream.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 9fe8b0f9563ef7..bb662e99664e2c 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -133,13 +133,20 @@ def test_pyarrow(df): tm.assert_frame_equal(result, df) +@pytest.mark.xfail(reason="pandas-wheels-50", strict=False) def test_missing_required_dependency(): # GH 23868 - # use the -S flag to disable site-packages - call = ['python', '-S', '-c', 'import pandas'] + # To ensure proper isolation, we pass these flags + # -S : disable site-packages + # -s : disable user site-packages + # -E : disable PYTHON* env vars, especially PYTHONPATH + # And, that's apparently not enough, so we give up. 
+ # https://github.com/MacPython/pandas-wheels/pull/50 + call = ['python', '-sSE', '-c', 'import pandas'] with pytest.raises(subprocess.CalledProcessError) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() - assert all(x in output for x in ['numpy', 'pytz', 'dateutil']) + for name in ['numpy', 'pytz', 'dateutil']: + assert name in output From 2b9b58dadf8a4e02b94747b6c8b22bec4b6eeefd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 22 Jun 2019 10:19:03 -0700 Subject: [PATCH 044/238] BLD: use unsigned instead of signed for lengths, avoid build warnings (#26759) --- pandas/_libs/parsers.pyx | 18 ++++++++-------- pandas/_libs/src/parser/tokenizer.c | 32 +++++++++++++++++------------ pandas/_libs/src/parser/tokenizer.h | 18 ++++++++-------- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 88b918e9cc5156..b73b70caf15976 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -119,24 +119,24 @@ cdef extern from "parser/tokenizer.h": # where to write out tokenized data char *stream - int64_t stream_len - int64_t stream_cap + uint64_t stream_len + uint64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words int64_t *word_starts # where we are in the stream - int64_t words_len - int64_t words_cap - int64_t max_words_cap # maximum word cap encountered + uint64_t words_len + uint64_t words_cap + uint64_t max_words_cap # maximum word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line - int64_t lines # Number of lines observed - int64_t file_lines # Number of lines observed (with bad/skipped) - int64_t lines_cap # Vector capacity + uint64_t lines # Number of lines observed + uint64_t file_lines # Number of lines observed (with bad/skipped) + uint64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -168,7 +168,7 @@ cdef extern from "parser/tokenizer.h": int header # Boolean: 1: has header, 0: no header int64_t header_start # header row start - int64_t header_end # header row end + uint64_t header_end # header row end void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 723bf56a795123..3146e49455609f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -71,9 +71,9 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity, +static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, int64_t space, int64_t elsize, int *error) { - int64_t cap = *capacity; + uint64_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -248,7 +248,7 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int64_t i, cap, length; + uint64_t i, cap, length; int status; void *orig_ptr, *newptr; @@ -263,7 +263,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("\n\nmake_stream_space: nbytes = %zu. 
grow_buffer(self->stream...)\n", nbytes)) self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - (int64_t*)&self->stream_cap, nbytes * 2, + &self->stream_cap, nbytes * 2, sizeof(char), &status); TRACE( ("make_stream_space: self->stream=%p, self->stream_len = %zu, " @@ -305,7 +305,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words = (char **)grow_buffer((void *)self->words, length, - (int64_t*)&self->words_cap, nbytes, + &self->words_cap, nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " @@ -336,7 +336,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { cap = self->lines_cap; self->line_start = (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - (int64_t*)&self->lines_cap, nbytes, + &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", @@ -471,7 +471,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= (int64_t) self->header_end + 1) && + if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -507,7 +507,7 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= (int64_t) self->header_end + 1) && + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { @@ -651,7 +651,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -666,7 +666,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -737,7 +737,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { - int64_t i, slen; + int64_t i; + uint64_t slen; int should_skip; char c; char *stream; @@ -1203,7 +1204,8 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t i, offset, word_deletions, char_count; + int64_t offset, word_deletions; + uint64_t char_count, i; if (nrows > self->lines) { nrows = self->lines; @@ -1229,6 +1231,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->stream_len -= char_count; /* move token metadata */ + // Note: We should always have words_len < word_deletions, so this + // subtraction will remain appropriately-typed. for (i = 0; i < self->words_len - word_deletions; ++i) { offset = i + word_deletions; @@ -1242,6 +1246,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ + // Note: We should always have self->lines - nrows + 1 >= 0, so this + // subtraction will remain appropriately-typed. 
for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; @@ -1265,7 +1271,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - int64_t i; + uint64_t i; /** * Before we free up space and trim, we should diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b6d5d6937f4dbb..66ef1887d6bc3c 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -104,24 +104,24 @@ typedef struct parser_t { // where to write out tokenized data char *stream; - int64_t stream_len; - int64_t stream_cap; + uint64_t stream_len; + uint64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; int64_t *word_starts; // where we are in the stream - int64_t words_len; - int64_t words_cap; - int64_t max_words_cap; // maximum word cap encountered + uint64_t words_len; + uint64_t words_cap; + uint64_t max_words_cap; // maximum word cap encountered char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field int64_t *line_start; // position in words for start of line int64_t *line_fields; // Number of fields in each line - int64_t lines; // Number of (good) lines observed - int64_t file_lines; // Number of lines (including bad or skipped) - int64_t lines_cap; // Vector capacity + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -153,7 +153,7 @@ typedef struct parser_t { int header; // Boolean: 1: has header, 0: no header int64_t header_start; // header row start - int64_t header_end; // header row end + uint64_t header_end; // header row end void *skipset; PyObject *skipfunc; From b4d4ec5a36acda40f13a8c3c3e19262c095d4c41 Mon Sep 17 00:00:00 2001 From: Steven Date: Sat, 22 Jun 2019 15:19:37 -0400 Subject: [PATCH 045/238] DOC: df.astype example using dictionary (#26994) * DOC: df.astype example using dictionary (#26990) --- pandas/core/generic.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 360576ffdb00a2..b08c1013561579 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5622,6 +5622,31 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): Examples -------- + Create a DataFrame: + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + >>> ser = pd.Series([1, 2], dtype='int32') >>> ser 0 1 From e27eea8635b73082f93d44f0003f6ec5b92596a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 23 Jun 2019 02:05:26 -0700 Subject: [PATCH 046/238] TST: fix flaky test (#27004) --- pandas/tests/series/indexing/test_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index cba1444846d0c3..a8120ec9c5c58b 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -425,7 +425,7 @@ def test_datetime_indexing(): """ 
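The one-line scope change below is the fix, presumably because a module-scoped fixture is built once and shared by every test in the module, so an in-place mutation made by one test leaks into the next and the outcome becomes order-dependent. A minimal, hypothetical illustration of that failure mode (fixture and test names invented for the sketch):

import pandas as pd
import pytest

@pytest.fixture(scope='module')        # built once, then shared across tests
def shared_series():
    return pd.Series([1, 2, 3])

def test_mutates(shared_series):
    shared_series.iloc[0] = 99         # in-place edit is visible to later tests

def test_sees_stale_state(shared_series):
    assert shared_series.iloc[0] == 1  # passes or fails depending on test order

With the default function scope the fixture is rebuilt for each test, which is what the change below switches to.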
-@pytest.fixture(scope='module') +@pytest.fixture def dups(): dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3), From cf74b0272af2e13e5b9ce40c8bf42df750ddc560 Mon Sep 17 00:00:00 2001 From: 1_x7 Date: Sun, 23 Jun 2019 04:58:06 -0700 Subject: [PATCH 047/238] DOC: Do not mention private classes in the documentation (#26997) --- ci/code_checks.sh | 4 ++-- pandas/core/generic.py | 10 +++++----- pandas/core/groupby/groupby.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a16580679ff547..ac86815569a0c2 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -256,8 +256,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b08c1013561579..0e2a6a0cac4141 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,7 +56,7 @@ # able to share _shared_docs = dict() _shared_doc_kwargs = dict( - axes='keywords for axes', klass='NDFrame', + axes='keywords for axes', klass='Series/DataFrame', axes_single_arg='int or labels for object', args_transpose='axes to permute (int or label for object)', optional_by=""" @@ -1940,7 +1940,7 @@ def __array_wrap__(self, result, context=None): def to_dense(self): """ - Return dense representation of NDFrame (as opposed to sparse). + Return dense representation of Series/DataFrame (as opposed to sparse). .. deprecated:: 0.25.0 @@ -9036,7 +9036,7 @@ def tshift(self, periods=1, freq=None, axis=0): Returns ------- - shifted : NDFrame + shifted : Series/DataFrame Notes ----- @@ -10272,12 +10272,12 @@ def _find_valid_index(self, how): return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', - 'klass': 'NDFrame'}) + 'klass': 'Series/DataFrame'}) def first_valid_index(self): return self._find_valid_index('first') @Appender(_shared_docs['valid_index'] % {'position': 'last', - 'klass': 'NDFrame'}) + 'klass': 'Series/DataFrame'}) def last_valid_index(self): return self._find_valid_index('last') diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43950f2f503c85..64cacd60da30f5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -630,14 +630,14 @@ def curried(x): def get_group(self, name, obj=None): """ - Construct NDFrame from group with provided name. + Construct DataFrame from group with provided name. Parameters ---------- name : object the name of the group to get as a DataFrame - obj : NDFrame, default None - the NDFrame to take the DataFrame out of. If + obj : DataFrame, default None + the DataFrame to take the DataFrame out of. 
If it is None, the object groupby was called on will be used From 83fe8d78b6b086f3ceabe81cd420a3c7affe9aba Mon Sep 17 00:00:00 2001 From: Min ho Kim Date: Tue, 25 Jun 2019 01:52:41 +1000 Subject: [PATCH 048/238] CLN: Fix typos (mainly in docs and comments) (#27007) --- asv_bench/benchmarks/offset.py | 2 +- .../comparison/comparison_with_sas.rst | 2 +- .../comparison/comparison_with_stata.rst | 2 +- doc/source/user_guide/io.rst | 6 +++--- doc/source/user_guide/timeseries.rst | 2 +- doc/source/whatsnew/v0.10.1.rst | 2 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.14.1.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.21.0.rst | 4 ++-- doc/source/whatsnew/v0.23.0.rst | 2 +- doc/source/whatsnew/v0.23.1.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 10 +++++----- doc/source/whatsnew/v0.25.0.rst | 4 ++-- doc/source/whatsnew/v0.8.0.rst | 2 +- pandas/_libs/tslibs/fields.pyx | 2 +- pandas/compat/numpy/__init__.py | 2 +- pandas/conftest.py | 2 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/sparse.py | 4 ++-- pandas/core/base.py | 2 +- pandas/core/computation/pytables.py | 2 +- pandas/core/dtypes/common.py | 4 ++-- pandas/core/frame.py | 12 ++++++------ pandas/core/generic.py | 12 ++++++------ pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/base.py | 6 +++--- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexes/range.py | 2 +- pandas/core/indexing.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/missing.py | 2 +- pandas/core/panel.py | 2 +- pandas/core/resample.py | 4 ++-- pandas/core/reshape/tile.py | 2 +- pandas/core/series.py | 2 +- pandas/core/sparse/scipy_sparse.py | 2 +- pandas/core/strings.py | 8 ++++---- pandas/core/tools/datetimes.py | 2 +- pandas/io/formats/excel.py | 2 +- pandas/io/json/json.py | 2 +- pandas/io/json/normalize.py | 2 +- pandas/io/json/table_schema.py | 2 +- pandas/io/pytables.py | 10 +++++----- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/arithmetic/test_numeric.py | 4 ++-- pandas/tests/arithmetic/test_object.py | 2 +- pandas/tests/arithmetic/test_period.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/arrays/test_integer.py | 4 ++-- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/extension/base/ops.py | 2 +- pandas/tests/extension/json/test_json.py | 2 +- pandas/tests/frame/test_combine_concat.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_indexing.py | 4 ++-- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 2 +- .../tests/indexes/datetimes/test_partial_slicing.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 6 +++--- pandas/tests/indexes/interval/test_construction.py | 2 +- pandas/tests/indexes/multi/test_missing.py | 2 +- pandas/tests/indexes/test_category.py | 2 +- pandas/tests/indexing/multiindex/test_xs.py | 2 +- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_floats.py | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- 
pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/pytables/test_pytables.py | 10 +++++----- pandas/tests/io/test_parquet.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/plotting/test_frame.py | 2 +- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 6 +++--- pandas/tests/reshape/test_concat.py | 4 ++-- pandas/tests/scalar/timedelta/test_timedelta.py | 2 +- pandas/tests/scalar/timestamp/test_unary_ops.py | 2 +- pandas/tests/series/test_missing.py | 2 +- pandas/tests/test_algos.py | 2 +- pandas/tests/test_base.py | 2 +- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_window.py | 6 +++--- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tseries/offsets.py | 2 +- pandas/util/testing.py | 6 +++--- scripts/validate_docstrings.py | 2 +- 99 files changed, 148 insertions(+), 148 deletions(-) diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 26e344758596fb..9b738e699a5b3d 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -9,7 +9,7 @@ pass hcal = pd.tseries.holiday.USFederalHolidayCalendar() -# These offests currently raise a NotImplimentedError with .apply_index() +# These offsets currently raise a NotImplimentedError with .apply_index() non_apply = [pd.offsets.Day(), pd.offsets.BYearEnd(), pd.offsets.BYearBegin(), diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index fc12c8524d3bfd..cbedeec737ec05 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -660,7 +660,7 @@ example, to subtract the mean for each observation by smoker group. run; -pandas ``groubpy`` provides a ``transform`` mechanism that allows +pandas ``groupby`` provides a ``transform`` mechanism that allows these type of operations to be succinctly expressed in one operation. diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index bf2b03176ecd8e..c354ed7872cb4b 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -634,7 +634,7 @@ For example, to subtract the mean for each observation by smoker group. generate adj_total_bill = total_bill - group_bill -pandas ``groubpy`` provides a ``transform`` mechanism that allows +pandas ``groupby`` provides a ``transform`` mechanism that allows these type of operations to be succinctly expressed in one operation. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 30a42de2ab2879..7caaec62c0a8a5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -488,7 +488,7 @@ specification: .. versionadded:: 0.21.0 -Specifying ``dtype='cateogry'`` will result in an unordered ``Categorical`` +Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. 
For more control on the categories and order, create a :class:`~pandas.api.types.CategoricalDtype` ahead of time, and pass that for @@ -1679,7 +1679,7 @@ S3 URLs are handled as well but require installing the `S3Fs df = pd.read_csv('s3://pandas-test/tips.csv') -If your S3 bucket requires cedentials you will need to set them as environment +If your S3 bucket requires credentials you will need to set them as environment variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs documentation on credentials `_. @@ -2078,7 +2078,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: json = dfj2.to_json(date_unit='ns') - # Try to parse timestamps as millseconds -> Won't Work + # Try to parse timestamps as milliseconds -> Won't Work dfju = pd.read_json(json, date_unit='ms') dfju diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f27e9c677d9257..7bdec001a688f5 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1133,7 +1133,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine pd.Timestamp('2014-08-01 17:00') + bh pd.Timestamp('2014-08-01 23:00') + bh - # Although 2014-08-02 is Satuaday, + # Although 2014-08-02 is Saturday, # it is valid because it starts from 08-01 (Friday). pd.Timestamp('2014-08-02 04:00') + bh diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index b5b2b889732cde..7d51ded1cad195 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -170,7 +170,7 @@ combined result, by using ``where`` on a selector table. df_mt, selector='df1_mt') store - # indiviual tables were created + # individual tables were created store.select('df1_mt') store.select('df2_mt') diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index d61b9a40438f89..f049006808c0fa 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -816,7 +816,7 @@ Enhancements - Implemented ``Panel.pct_change`` (:issue:`6904`) - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max, :func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`) -- ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) +- ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of quantiles. 
- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 98ebbd6a523442..fcfb22d0745548 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -247,7 +247,7 @@ Bug Fixes - Bug in ``DatetimeIndex`` comparison doesn't handle ``NaT`` properly (:issue:`7529`) - Bug in passing input with ``tzinfo`` to some offsets ``apply``, ``rollforward`` or ``rollback`` resets ``tzinfo`` or raises ``ValueError`` (:issue:`7465`) - Bug in ``DatetimeIndex.to_period``, ``PeriodIndex.asobject``, ``PeriodIndex.to_timestamp`` doesn't preserve ``name`` (:issue:`7485`) -- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestanp`` handle ``NaT`` incorrectly (:issue:`7228`) +- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestamp`` handle ``NaT`` incorrectly (:issue:`7228`) - Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may return normal ``datetime`` (:issue:`7502`) - Bug in ``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`) - Bug in ``Timestamp.tz_localize`` resets ``nanosecond`` info (:issue:`7534`) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index de29a1eb937091..fe9fdd7448923d 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1513,7 +1513,7 @@ Bug Fixes - Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) - Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) -- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) +- Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN and Inf from formatting (:issue:`11981`) - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Clean some compile time warnings in datetime parsing (:issue:`13607`) diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 5c6f1d1af6b544..44b50437a6dfe8 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -263,7 +263,7 @@ Now, to find prices per store/product, we can simply do: See the :ref:`documentation ` for more. -.. _whatsnew_0210.enhancements.reanme_categories: +.. _whatsnew_0210.enhancements.rename_categories: ``Categorical.rename_categories`` accepts a dict-like ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -418,7 +418,7 @@ New Behavior, without regard to the bottleneck installation: s.sum() -Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottlenck`` installation: +Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation: .. 
code-block:: ipython diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 98479fa30eb154..51efa37b55adde 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -276,7 +276,7 @@ To show only observed values: df.groupby(['A', 'B', 'C'], observed=True).count() -For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword: +For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword: .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index f6af2990c935b8..0218c3b02a4132 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -26,7 +26,7 @@ Fixed Regressions **Comparing Series with datetime.date** We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`). -In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comapring. +In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing. This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal. In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning. diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 05d6a03639a2d1..086519ad751924 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1061,7 +1061,7 @@ The affected cases are: .. code-block:: ipython - # Comparison operations and arithmetic opeartions both raise ValueError. + # Comparison operations and arithmetic operations both raise ValueError. In [6]: df == (1, 2, 3) ... ValueError: Unable to coerce to Series, length must be 2: given 3 @@ -1324,7 +1324,7 @@ Deprecations - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) - :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`) - Passing an integer to :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtypes is deprecated, will raise ``TypeError`` in a future version. Use ``obj.fillna(pd.Timedelta(...))`` instead (:issue:`24694`) -- ``Series.cat.categorical``, ``Series.cat.name`` and ``Sersies.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`). +- ``Series.cat.categorical``, ``Series.cat.name`` and ``Series.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`). - Passing a dtype without a precision like ``np.dtype('datetime64')`` or ``timedelta64`` to :class:`Index`, :class:`DatetimeIndex` and :class:`TimedeltaIndex` is now deprecated. Use the nanosecond-precision dtype instead (:issue:`24753`). .. 
_whatsnew_0240.deprecations.datetimelike_int_ops: @@ -1604,7 +1604,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) -- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`) +- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`) - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`) - Bug in :class:`DatetimeIndex` where constructing a :class:`DatetimeIndex` from a :class:`Categorical` or :class:`CategoricalIndex` would incorrectly drop timezone information (:issue:`18664`) - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would incorrectly lose the index's ``freq`` attribute (:issue:`21282`) @@ -1670,7 +1670,7 @@ Timezones Offsets ^^^^^^^ -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) - Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) - Bug in adding :class:`DateOffset` with :class:`DataFrame` or :class:`PeriodIndex` incorrectly raising ``TypeError`` (:issue:`23215`) - Bug in comparing :class:`DateOffset` objects with non-DateOffset objects, particularly strings, raising ``ValueError`` instead of returning ``False`` for equality checks and ``True`` for not-equal checks (:issue:`23524`) @@ -1838,7 +1838,7 @@ Groupby/Resample/Rolling ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). - Multiple bugs in :func:`pandas.core.window.Rolling.min` with ``closed='left'`` and a datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). +- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). - Bug in :meth:`pandas.core.resample.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 19636f42c6129e..109005364fca60 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -119,7 +119,7 @@ Other Enhancements - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) -- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) +- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) - Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`) @@ -694,7 +694,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) - Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) -- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) +- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 575ec6b7d19f4c..664325ac063c00 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -77,7 +77,7 @@ Time series changes and improvements interface while enabling working with nanosecond-resolution data. Also provides :ref:`easy time zone conversions `. - Enhanced support for :ref:`time zones `. Add - `tz_convert` and ``tz_lcoalize`` methods to TimeSeries and DataFrame. 
All + `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. User needs to know very little about pytz library now; only diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 4ebf5e587a727d..2a41b5ff2339ca 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -171,7 +171,7 @@ def get_start_end_field(int64_t[:] dtindex, object field, # YearBegin(), BYearBegin() use month = starting month of year. # QuarterBegin(), BQuarterBegin() use startingMonth = starting - # month of year. Other offests use month, startingMonth as ending + # month of year. Other offsets use month, startingMonth as ending # month of year. if (freqstr[0:2] in ['MS', 'QS', 'AS']) or ( diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index c738cc74e46a49..22bfab8b7c6d63 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,4 @@ -""" support numpy compatiblitiy across versions """ +""" support numpy compatibility across versions """ from distutils.version import LooseVersion import re diff --git a/pandas/conftest.py b/pandas/conftest.py index 4bcd0ea8442e66..058361af343b63 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -122,7 +122,7 @@ def observed(request): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which - appear in the grouper [True]. [None] is supported for future compatiblity + appear in the grouper [True]. [None] is supported for future compatibility if we decide to change the default (and would need to warn if this parameter is not passed)""" return request.param diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c709cd9e9f0b2f..20fd582179dc6a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -653,7 +653,7 @@ def factorize( ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ - # Impelmentor note: There are two ways to override the behavior of + # Implementer note: There are two ways to override the behavior of # pandas.factorize # 1. _values_for_factorize and _from_factorize. # Specify the values passed to pandas' internal factorization diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d415dbbdaf0a37..6e7217762a3fb2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -676,7 +676,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): def _has_same_tz(self, other): zzone = self._timezone - # vzone sholdn't be None if value is non-datetime like + # vzone shouldn't be None if value is non-datetime like if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 589e98f016f695..07d5664f987149 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,7 @@ class _IntegerDtype(ExtensionDtype): An ExtensionDtype to hold a single size & kind of integer dtype. These specific implementations are subclasses of the non-public - _IntegerDtype. For example we have Int8Dtype to represnt signed int 8s. + _IntegerDtype. 
For example we have Int8Dtype to represent signed int 8s. The attributes name & type are set when these subclasses are created. """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ece05567d33436..3a9322773fc691 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -854,7 +854,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): ------- ordinals : ndarray[int] freq : Tick - The frequencey extracted from the Series or DatetimeIndex if that's + The frequency extracted from the Series or DatetimeIndex if that's used. """ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3dda6868a80dac..d692fe6d7cabef 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -562,7 +562,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): * 'block': Stores a `block` and `block_length` for each contiguous *span* of sparse values. This is best when sparse data tends to be clumped together, with large - regsions of ``fill-value`` values between sparse values. + regions of ``fill-value`` values between sparse values. * 'integer': uses an integer to store the location of each sparse value. @@ -1316,7 +1316,7 @@ def _concat_same_type(cls, to_concat): sp_index = IntIndex(length, indices) else: - # when concatentating block indices, we don't claim that you'll + # when concatenating block indices, we don't claim that you'll # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting diff --git a/pandas/core/base.py b/pandas/core/base.py index ab9d8b9d778e5e..30e800cb9bd732 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1137,7 +1137,7 @@ def __iter__(self): ------- iterator """ - # We are explicity making element iterators. + # We are explicitly making element iterators. if is_datetimelike(self._values): return map(com.maybe_box_datetimelike, self._values) elif is_extension_array_dtype(self._values): diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 2a762b5ee24b6d..25cfa8fe17697b 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -410,7 +410,7 @@ def visit_Assign(self, node, **kwargs): return self.visit(cmpr) def visit_Subscript(self, node, **kwargs): - # only allow simple suscripts + # only allow simple subscripts value = self.visit(node.value) slobj = self.visit(node.slice) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ce99d150880c68..b2b74e2a70ca99 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1835,7 +1835,7 @@ def is_complex_dtype(arr_or_dtype): Returns ------- boolean - Whether or not the array or dtype is of a compex dtype. + Whether or not the array or dtype is of a complex dtype. Examples -------- @@ -1929,7 +1929,7 @@ def _is_dtype_type(arr_or_dtype, condition): Returns ------- - bool : if the condition is satisifed for the arr_or_dtype + bool : if the condition is satisfied for the arr_or_dtype """ if arr_or_dtype is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6746844f4b1fa1..fd2e1e3e41ced3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2145,7 +2145,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy', col_space='The minimum width of each column in CSS length ' 'units. An int is assumed to be px units.\n\n' ' .. 
versionadded:: 0.25.0\n' - ' Abillity to use str') + ' Ability to use str') @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_html(self, buf=None, columns=None, col_space=None, header=True, @@ -5312,7 +5312,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): this_mask = isna(series) other_mask = isna(otherSeries) - # don't overwrite columns unecessarily + # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): result[col] = this[col].copy() @@ -5572,7 +5572,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, else: mask = notna(this) - # don't overwrite columns unecessarily + # don't overwrite columns unnecessarily if mask.all(): continue @@ -6508,7 +6508,7 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 2 13 dtype: int64 - Retuning a list-like will result in a Series + Returning a list-like will result in a Series >>> df.apply(lambda x: [1, 2], axis=1) 0 [1, 2] @@ -6993,7 +6993,7 @@ def round(self, decimals=0, *args, **kwargs): 3 0.2 0.2 With a dict, the number of places for specific columns can be - specfified with the column names as key and the number of decimal + specified with the column names as key and the number of decimal places as value >>> df.round({'dogs': 1, 'cats': 0}) @@ -7004,7 +7004,7 @@ def round(self, decimals=0, *args, **kwargs): 3 0.2 0.0 Using a Series, the number of places for specific columns can be - specfified with the column names as index and the number of + specified with the column names as index and the number of decimal places as value >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e2a6a0cac4141..992c83e66090e8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3193,7 +3193,7 @@ def _slice(self, slobj, axis=0, kind=None): result = result.__finalize__(self) # this could be a view - # but only in a single-dtyped view slicable case + # but only in a single-dtyped view sliceable case is_copy = axis != 0 or result._is_view result._set_is_copy(self, copy=is_copy) return result @@ -3243,7 +3243,7 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): force : boolean, default False if True, then force showing an error - validate if we are doing a settitem on a chained copy. + validate if we are doing a setitem on a chained copy. If you call this function, be sure to set the stacklevel such that the user will see the error *at the level of setting* @@ -3644,7 +3644,7 @@ class animal locomotion result.index = new_index # this could be a view - # but only in a single-dtyped view slicable case + # but only in a single-dtyped view sliceable case result._set_is_copy(self, copy=not result._is_view) return result @@ -6488,7 +6488,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for c, src in to_replace.items(): if c in value and c in self: # object conversion is handled in - # series.replace which is called recursivelly + # series.replace which is called recursively res[c] = res[c].replace(to_replace=src, value=value[c], inplace=False, @@ -6724,7 +6724,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, Note how the last entry in column 'a' is interpolated differently, because there is no entry after it to use for interpolation. 
Note how the first entry in column 'b' remains ``NaN``, because there - is no entry befofe it to use for interpolation. + is no entry before it to use for interpolation. >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), ... (np.nan, 2.0, np.nan, np.nan), @@ -9576,7 +9576,7 @@ def describe(self, percentiles=None, include=None, exclude=None): DataFrame.max: Maximum of the values in the object. DataFrame.min: Minimum of the values in the object. DataFrame.mean: Mean of the values. - DataFrame.std: Standard deviation of the obersvations. + DataFrame.std: Standard deviation of the observations. DataFrame.select_dtypes: Subset of a DataFrame including/excluding columns based on their dtype. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ffa552913aeb..91be320a3e674d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1316,7 +1316,7 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calcuate pct_change of each value to previous entry in group""" + """Calculate pct_change of each value to previous entry in group""" # TODO: Remove this conditional when #23918 is fixed if freq: return self.apply(lambda x: x.pct_change(periods=periods, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index febfdc7bdf908c..d0f28bed4399ba 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -494,7 +494,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # not an iterable of keys. In the meantime, we attempt to provide # a warning. We can assume that the user wanted a list of keys when # the key is not in the index. We just have to be careful with - # unhashble elements of `key`. Any unhashable elements implies that + # unhashable elements of `key`. Any unhashable elements implies that # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 is_tuple = isinstance(key, tuple) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73abd708415a13..cb5b4a6c8993c0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1131,7 +1131,7 @@ def to_flat_index(self): .. versionadded:: 0.24.0 - This is implemented for compatability with subclass implementations + This is implemented for compatibility with subclass implementations when chaining. Returns @@ -1486,7 +1486,7 @@ def _get_level_values(self, level): Return an Index of values for requested level. This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. + MultiIndex, but is provided on Index as well for compatibility. 
Parameters ---------- @@ -3885,7 +3885,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): from .numeric import Int64Index, UInt64Index if not is_unsigned_integer_dtype(dtype): # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired + # this could return Int64Index when UInt64Index is what's desired try: res = data.astype('i8', copy=False) if (res == data).all(): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 122c30ae7dfd50..3d3774ce48e8b6 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -534,7 +534,7 @@ def _can_reindex(self, indexer): @Appender(_index_shared_docs['where']) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with - # 1. copy the underyling Categorical + # 1. copy the underlying Categorical # 2. setitem with `cond` and `other` # 3. Rebuild CategoricalIndex. if other is None: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e6d876436c9868..5ce670d9fe33e1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -376,7 +376,7 @@ def _is_dates_only(self): def __reduce__(self): - # we use a special reudce here because we need + # we use a special reduce here because we need # to simply set the .tz (and not reinterpret it) d = dict(data=self._data) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 577d0221cd8da8..49f657332bbbf3 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -390,7 +390,7 @@ def itemsize(self): 'a future version') warnings.warn(msg, FutureWarning, stacklevel=2) - # supress the warning from the underlying left/right itemsize + # suppress the warning from the underlying left/right itemsize with warnings.catch_warnings(): warnings.simplefilter('ignore') return self.left.itemsize + self.right.itemsize diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0d6e75f95f8637..a06d304fb5a229 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -947,7 +947,7 @@ def f(l): def memory_usage(self, deep=False): # we are overwriting our base class to avoid # computing .values here which could materialize - # a tuple representation uncessarily + # a tuple representation unnecessarily return self._nbytes(deep) @cache_readonly @@ -1074,7 +1074,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, sentinel = '' # GH3547 # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value + # "Truthy" value if sparsify not in [True, 1]: sentinel = sparsify # little bit of a kludge job for #1217 @@ -2729,7 +2729,7 @@ def convert_indexer(start, stop, step, indexer=indexer, return m if isinstance(key, slice): - # handle a slice, returnig a slice if we can + # handle a slice, returning a slice if we can # otherwise a boolean indexer try: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ab39969af8db02..47dad1788e0219 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -525,7 +525,7 @@ def _union(self, other, sort): sort : False or None, default None Whether to sort resulting index. ``sort=None`` returns a - mononotically increasing ``RangeIndex`` if possible or a sorted + monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not. 
``sort=False`` always returns an unsorted ``Int64Index`` diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6a21adb1d16ae8..f6aa54f4836d9a 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -955,7 +955,7 @@ def _getitem_lowerdim(self, tup): def _getitem_nested_tuple(self, tup): # we have a nested tuple so have at least 1 multi-index level - # we should be able to match up the dimensionaility here + # we should be able to match up the dimensionality here # we have too many indexers for our dim, but have at least 1 # multi-index dimension, try to see if we have something like diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f564ac13dc41d2..d766d7f06d34a1 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -588,7 +588,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = data # everything else in this block must also handle ndarray's, - # becuase we've unwrapped PandasArray into an ndarray. + # because we've unwrapped PandasArray into an ndarray. if dtype is not None: subarr = data.astype(dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index cdb3b775678299..4230b212f567a1 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -619,7 +619,7 @@ def mask_zero_div_zero(x, y, result, copy=False): def dispatch_missing(op, left, right, result): """ - Fill nulls caused by division by zero, casting to a diffferent dtype + Fill nulls caused by division by zero, casting to a different dtype if necessary. Parameters diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c65a73bd0d3f09..9d6b7333ca39fd 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1392,7 +1392,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, Parameters ---------- other : Panel, or object coercible to Panel - The object from which the caller will be udpated. + The object from which the caller will be updated. join : {'left', 'right', 'outer', 'inner'}, default 'left' How individual DataFrames are joined. overwrite : bool, default True diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d1d99d28e59b65..632b5a9c5e0024 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -48,7 +48,7 @@ class Resampler(_GroupBy): groupby : a TimeGrouper object axis : int, default 0 kind : str or None - 'period', 'timestamp' to override default index treatement + 'period', 'timestamp' to override default index treatment Returns ------- @@ -1602,7 +1602,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): """ - Adjust the `first` Timestamp to the preceeding Timestamp that resides on + Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following Timestamp that resides on the provided offset. 
Input Timestamps that already reside on the offset will be adjusted depending on the type of diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 8c29bdc2a974c5..96124331e43ef8 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -429,7 +429,7 @@ def _convert_bin_to_numeric_type(bins, dtype): def _convert_bin_to_datelike_type(bins, dtype): """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the orginal dtype is + Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is datelike Parameters diff --git a/pandas/core/series.py b/pandas/core/series.py index 11e578e74f6e75..730a96f5435a12 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1481,7 +1481,7 @@ def iteritems(self): Lazily iterate over (index, value) tuples. This method returns an iterable tuple (index, value). This is - convienient if you want to create a lazy iterator. Note that the + convenient if you want to create a lazy iterator. Note that the methods Series.items and Series.iteritems are the same methods. Returns diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 0dd8958e93c133..7ff0f465756613 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -42,7 +42,7 @@ def get_indexers(levels): values_ilabels = [x[0] for x in values_ilabels] # # performance issues with groupby ################################### - # TODO: these two lines can rejplace the code below but + # TODO: these two lines can replace the code below but # groupby is too slow (in some cases at least) # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first() # labels_to_i[:] = np.arange(labels_to_i.shape[0]) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6ebfbc8bb0ee0f..710b29c6a6536c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2058,7 +2058,7 @@ def _get_series_list(self, others, ignore_index=False): # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index - err_msg = ('others must be Series, Index, DataFrame, np.ndarrary or ' + err_msg = ('others must be Series, Index, DataFrame, np.ndarray or ' 'list-like (either containing only strings or containing ' 'only objects of type Series/Index/list-like/np.ndarray)') @@ -2155,7 +2155,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Parameters ---------- - others : Series, Index, DataFrame, np.ndarrary or list-like + others : Series, Index, DataFrame, np.ndarray or list-like Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and other list-likes of strings must have the same length as the calling Series/Index, with the exception of indexed objects (i.e. @@ -2571,7 +2571,7 @@ def rsplit(self, pat=None, n=-1, expand=False): 0 Linda van der Berg 1 George Pitt - Rivers - To return a Series containining tuples instead of a DataFrame: + To return a Series containing tuples instead of a DataFrame: >>> s.str.partition('-', expand=False) 0 (Linda van der Berg, , ) @@ -3292,7 +3292,7 @@ def rindex(self, sub, start=0, end=None): The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters seperated by + assumed to be as any sequence of non-numeric characters separated by whitespace characters. 
>>> s5.str.istitle() diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8e6331fe44e6ba..5893ff0e0dd8fe 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -363,7 +363,7 @@ def _adjust_to_origin(arg, origin, unit): raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") - # premptively check this for a nice range + # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 4db00e34b39e25..5792f6e2a5a08a 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -557,7 +557,7 @@ def _format_hierarchical_rows(self): # MultiIndex columns require an extra row # with index names (blank if None) for - # unambigous round-trip, unless not merging, + # unambiguous round-trip, unless not merging, # in which case the names all go on one row Issue #11328 if isinstance(self.columns, ABCMultiIndex) and self.merge_cells: self.rowcounter += 1 diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7bafa15bb1979f..f14b615471ccc5 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -180,7 +180,7 @@ def __init__(self, obj, orient, date_format, double_precision, self.schema = build_table_schema(obj, index=self.index) - # NotImplementd on a column MultiIndex + # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): raise NotImplementedError( "orient='table' is not supported for MultiIndex") diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index fa4e35b08bf6e6..2d8bc20b1195e5 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -79,7 +79,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0): else: newkey = prefix + sep + k - # only dicts gets recurse-flattend + # only dicts gets recurse-flattened # only at level>1 do we rename the rest of the keys if not isinstance(v, dict): if level != 0: # so we skip copying for top level, common case diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 7742bc717b1849..a54f5cdf723a39 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -142,7 +142,7 @@ def convert_json_field_to_pandas_type(field): 'int64' >>> convert_json_field_to_pandas_type({'name': 'a_categorical', 'type': 'any', - 'contraints': {'enum': [ + 'constraints': {'enum': [ 'a', 'b', 'c']}, 'ordered': True}) 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)' diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 17d580bae5cf1d..97d5b1dd2a1e5f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -824,7 +824,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, raise ValueError( "all tables must have exactly the same nrows!") - # axis is the concentation axes + # axis is the concentration axes axis = list({t.non_index_axes[0][0] for t in tbls})[0] def func(_start, _stop, _where): @@ -948,7 +948,7 @@ def append(self, key, value, format=None, append=True, columns=None, of the object are indexed. See `here `__. 
min_itemsize : dict of columns that specify minimum string sizes - nan_rep : string to use as string nan represenation + nan_rep : string to use as string nan representation chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings @@ -1343,7 +1343,7 @@ def error(t): else: - # distiguish between a frame/table + # distinguish between a frame/table tt = 'legacy_panel' try: fields = group.table._v_attrs.fields @@ -3316,7 +3316,7 @@ def validate_version(self, where=None): warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): - """validate the min_itemisze doesn't contain items that are not in the + """validate the min_itemsize doesn't contain items that are not in the axes this needs data_columns to be defined """ if min_itemsize is None: @@ -3500,7 +3500,7 @@ def validate_data_columns(self, data_columns, min_itemsize): def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes - leagcy tables create an indexable column, indexable index, + legacy tables create an indexable column, indexable index, non-indexable fields Parameters diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d8dfd15477974f..00b7a29b27b63f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2708,7 +2708,7 @@ class StataWriter117(StataWriter): Each label must be 80 characters or smaller. convert_strl : list List of columns names to convert to Stata StrL format. Columns with - more than 2045 characters are aautomatically written as StrL. + more than 2045 characters are automatically written as StrL. Smaller columns can be converted by including the column name. Using StrLs can reduce output file size when strings are longer than 8 characters, and either frequently repeated or sparse. diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index e491cfc3309a0a..acb5ab7b8e04be 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -133,7 +133,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, Number of rows and columns of the subplot grid. If not specified, calculated from naxes and layout_type - layout_type : {'box', 'horziontal', 'vertical'}, default 'box' + layout_type : {'box', 'horizontal', 'vertical'}, default 'box' Specify how to layout the subplot grid. fig_kw : Other keyword arguments to be passed to the figure() call. diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 256ee930b4cdac..f58f8981317dfc 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. 
# Specifically for numeric dtypes from collections import abc @@ -587,7 +587,7 @@ def test_operators_frame(self): tm.assert_series_equal(ts / ts, ts / df['A'], check_names=False) - # TODO: this came from tests.series.test_analytics, needs cleannup and + # TODO: this came from tests.series.test_analytics, needs cleanup and # de-duplication with test_modulo above def test_modulo2(self): with np.errstate(all='ignore'): diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 8d67e02d514ffe..dd931939ddf51e 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. # Specifically for object dtype from decimal import Decimal diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index e254312e397247..bc1b78bf944d15 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. # Specifically for Period dtype import operator diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 2dff9a6088de8f..047900c3d7586d 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1,4 +1,4 @@ -# Arithmetc tests for DataFrame/Series/Index/Array classes that should +# Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. from datetime import datetime, timedelta @@ -48,7 +48,7 @@ def test_compare_timedelta64_zerodim(self): tdi >= np.array(4) def test_compare_timedelta_series(self): - # regresssion test for GH#5963 + # regression test for GH#5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) actual = s > timedelta(days=1) expected = pd.Series([False, True]) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 2f42ec5bae2b04..2337d8363155cf 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -450,7 +450,7 @@ def test_concat_same_type_invalid(self, datetime_index): arr._concat_same_type([arr, other]) def test_concat_same_type_different_freq(self): - # we *can* concatentate DTI with different freqs. + # we *can* concatenate DTI with different freqs. 
a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', tz='US/Central')) b = DatetimeArray(pd.date_range('2000', periods=2, freq='H', diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 066eadc9b68bc0..65f7628370ad4a 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -164,13 +164,13 @@ def _check_op(self, s, op_name, other, exc=None): self._check_op_integer(result, expected, mask, s, op_name, other) def _check_op_float(self, result, expected, mask, s, op_name, other): - # check comparisions that are resulting in float dtypes + # check comparisons that are resulting in float dtypes expected[mask] = np.nan tm.assert_series_equal(result, expected) def _check_op_integer(self, result, expected, mask, s, op_name, other): - # check comparisions that are resulting in integer dtypes + # check comparisons that are resulting in integer dtypes # to compare properly, we convert the expected # to float, mask to nans and convert infs diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c7a62dfe77c37d..675abec661b5ad 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -417,7 +417,7 @@ def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) - # TODO(jreback), this is sligthly suspect + # TODO(jreback), this is slightly suspect assert not com.is_datetime_or_timedelta_dtype( DatetimeTZDtype("ns", "US/Eastern")) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 2ac68c52d53c71..708eb9c7c8c439 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -36,7 +36,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): op(s, other) def _check_divmod_op(self, s, op, other, exc=Exception): - # divmod has multiple return values, so check separatly + # divmod has multiple return values, so check separately if exc is None: result_div, result_mod = op(s, other) if op is divmod: diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 97c329e0a5c92b..89d30b0a3cc06d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -25,7 +25,7 @@ def data(): # Why the while loop? NumPy is unable to construct an ndarray from # equal-length ndarrays. Many of our operations involve coercing the # EA to an ndarray of objects. To avoid random test failures, we ensure - # that our data is coercable to an ndarray. Several tests deal with only + # that our data is coercible to an ndarray. Several tests deal with only # the first two elements, so that's what we'll check. 
while len(data[0]) == len(data[1]): diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 9683beb20def54..faa86acb1584f2 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -664,7 +664,7 @@ def test_combine_first_mixed_bug(self): expected = Series([True, True, False], name=2) assert_series_equal(result, expected) - # GH 3593, converting datetime64[ns] incorrecly + # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame({"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7dc74961a2adcb..c6508072cb8c7a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -149,7 +149,7 @@ def _check_mixed_dtypes(df, dtypes=None): if d in df: assert(df.dtypes[d] == d) - # mixed floating and integer coexinst in the same frame + # mixed floating and integer coexist in the same frame df = _make_mixed_dtypes_df('float') _check_mixed_dtypes(df) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 40785c6a1d3214..3c9558d5cbd108 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2729,7 +2729,7 @@ def _check_set(df, cond, check_dtypes=True): cond = df >= 0 _check_set(df, cond) - # aligining + # aligning cond = (df >= 0)[1:] _check_set(df, cond) @@ -3691,7 +3691,7 @@ def test_assigning_ops(self): df.at["j", "cats"] = "c" # Assigning a Category to parts of a int/... column uses the values of - # the Catgorical + # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index d46ce41fc7f038..e7583adff403bf 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -307,7 +307,7 @@ def check(result, expected=None): with pytest.raises(ValueError, match=msg): df[df.A > 6] - # dup aligining operations should work + # dup aligning operations should work # GH 5185 df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3]) df2 = DataFrame([1, 2, 3], index=[1, 2, 3]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5bea749febc766..0fb8673e6274a4 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -133,7 +133,7 @@ def f(g): def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 - # This test should ensure that a function is only evaluted + # This test should ensure that a function is only evaluated # once per group. 
Previously the function has been evaluated twice # on the first group to check if the Cython index slider is safe to use # This test ensures that the side effect (append to list) is only triggered diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 87b57b0609b362..3da3ab22b643bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -130,7 +130,7 @@ def func(dataf): assert isinstance(result, DataFrame) # GH5592 - # inconcistent return type + # inconsistent return type df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony'], B=Series( np.arange(7), dtype='int64'), C=date_range( diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ef05e6ada4890a..4ca470d316e5c8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -92,7 +92,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): def test_timegrouper_with_reg_groups(self): # GH 3794 - # allow combinateion of timegrouper/reg groups + # allow combination of timegrouper/reg groups df_original = DataFrame({ 'Branch': 'A A A A A A A B'.split(), diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 085e62ed9341e6..6ec8568ce72428 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -94,7 +94,7 @@ def test_slice_duplicate_monotonic(self): def test_monotone_DTI_indexing_bug(self): # GH 19362 - # Testing accessing the first element in a montononic descending + # Testing accessing the first element in a monotonic descending # partial string indexing. df = pd.DataFrame(list(range(5))) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index ea33e563b31bef..2a5ae92cb59f50 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -201,7 +201,7 @@ def test_to_datetime_with_non_exact(self, cache): def test_parse_nanoseconds_with_formula(self, cache): # GH8989 - # trunctaing the nanoseconds when a format was provided + # truncating the nanoseconds when a format was provided for v in ["2012-01-01 09:00:00.000000001", "2012-01-01 09:00:00.000001", "2012-01-01 09:00:00.001", @@ -383,7 +383,7 @@ def test_to_datetime_now(self): def test_to_datetime_today(self): # See GH#18666 # Test with one timezone far ahead of UTC and another far behind, so - # one of these will _almost_ alawys be in a different day from UTC. + # one of these will _almost_ always be in a different day from UTC. # Unfortunately this test between 12 and 1 AM Samoa time # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
@@ -606,7 +606,7 @@ def test_to_datetime_tz_psycopg2(self, cache): ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) assert is_datetime64_ns_dtype(i) - # tz coerceion + # tz coercion result = pd.to_datetime(i, errors='coerce', cache=cache) tm.assert_index_equal(result, i) diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 7a54ad5c180a44..eb9b573cce91d1 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -272,7 +272,7 @@ def test_constructor_errors(self): IntervalIndex.from_tuples(tuples) def test_na_tuples(self): - # tuple (NA, NA) evaluates the same as NA as an elemenent + # tuple (NA, NA) evaluates the same as NA as an element na_tuple = [(0, 1), (np.nan, np.nan), (2, 3)] idx_na_tuple = IntervalIndex.from_tuples(na_tuple) idx_na_element = IntervalIndex.from_tuples([(0, 1), np.nan, (2, 3)]) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 518c12bb20e131..1928c303a1bcdd 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -74,7 +74,7 @@ def test_dropna(): idx.dropna(how='xxx') # GH26408 - # test if missing values are dropped for mutiindex constructed + # test if missing values are dropped for multiindex constructed # from codes and values idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], [np.nan, None, pd.NaT, "128", 2]], diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d38fa20a9335cf..d89d282fb785b8 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -676,7 +676,7 @@ def test_get_loc(self): with pytest.raises(KeyError): i.get_loc('NOT-EXIST') - # non-unique, slicable + # non-unique, sliceable cidx3 = CategoricalIndex(list('aabbb'), categories=list('abc')) idx3 = Index(list('aabbb')) diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index f9117341e3a783..bbc55c75c5b77b 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -52,7 +52,7 @@ def test_xs_loc_equality(multiindex_dataframe_random_data): def test_xs_missing_values_in_index(): # see gh-6574 - # missing values in returned index should be preserrved + # missing values in returned index should be preserved acc = [ ('a', 'abcde', 1), ('b', 'bbcde', 2), diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 8b2b0b349e2031..e9c1b85e7d40c8 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -665,7 +665,7 @@ def test_where_index_period(self): class TestFillnaSeriesCoercion(CoercionBase): - # not indexing, but place here for consisntency + # not indexing, but place here for consistency method = 'fillna' diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 9a2aae08dbb154..ada613110d9bf9 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -35,7 +35,7 @@ def test_scalar_error(self): # float_indexers should raise exceptions # on appropriate Index types & accessors # this duplicates the code below - # but is spefically testing for the error + # but is specifically testing for the error # message for index in [tm.makeStringIndex, tm.makeUnicodeIndex, diff --git a/pandas/tests/indexing/test_iloc.py 
b/pandas/tests/indexing/test_iloc.py index 4fa26dc67ba0c2..6b5ad66e268df2 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -20,7 +20,7 @@ def test_iloc_exceeds_bounds(self): # iloc should allow indexers that exceed the bounds df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) - # lists of positions should raise IndexErrror! + # lists of positions should raise IndexError! msg = 'positional indexers are out-of-bounds' with pytest.raises(IndexError, match=msg): df.iloc[:, [0, 1, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5f5718fe3eac35..11d0fa2602baac 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,7 +16,7 @@ class TestLoc(Base): def test_loc_getitem_dups(self): # GH 5678 - # repeated gettitems on a dup index returning a ndarray + # repeated getitems on a dup index returning a ndarray df = DataFrame( np.random.random_sample((20, 5)), index=['ABCDE' [x % 5] for x in range(20)]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 961d781764b671..ea75e97bace0bc 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -730,7 +730,7 @@ def test_to_excel_multiindex_dates( assert recons.index.names == ('time', 'foo') def test_to_excel_multiindex_no_write_index(self, engine, ext): - # Test writing and re-reading a MI witout the index. GH 5616. + # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index edb7c2136825da..0eeb0e6eb2f2df 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -533,7 +533,7 @@ def test_to_string_with_formatters_unicode(self): assert result == ' c/\u03c3\n' + '0 1\n1 2\n2 3' def test_east_asian_unicode_false(self): - # not alighned properly because of east asian width + # not aligned properly because of east asian width # mid col df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index ef9dbc63d873d4..413c11ba2f9fe8 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -225,7 +225,7 @@ def test_long_strings(self): def test_api(self): # GH4584 - # API issue when to_hdf doesn't acdept append AND format args + # API issue when to_hdf doesn't accept append AND format args with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() @@ -2656,7 +2656,7 @@ def test_select(self): expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) - # equivalentsly + # equivalently result = store.select('df', [("columns=['A', 'B']")]) expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) @@ -3284,7 +3284,7 @@ def test_frame_select_complex2(self): expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]') - # sccope with list like + # scope with list like l = selection.index.tolist() # noqa store = HDFStore(hh) result = store.select('df', where='l1=l') @@ -3308,7 +3308,7 @@ def test_frame_select_complex2(self): result = read_hdf(hh, 'df', where='l1=list(selection.index)') assert_frame_equal(result, expected) - # sccope with index + # scope with index store = HDFStore(hh) result = store.select('df', 
where='l1=index') @@ -5164,7 +5164,7 @@ def test_legacy_datetimetz_object(self, datapath): assert_frame_equal(result, expected) def test_dst_transitions(self): - # make sure we are not failing on transaitions + # make sure we are not failing on transitions with ensure_clean_store(self.path) as store: times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index afdd83ba9bb8cd..db5c92fb681a2d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -305,7 +305,7 @@ def test_write_index(self, engine): check_round_trip(df, engine) def test_write_multiindex(self, pa): - # Not suppoprted in fastparquet as of 0.1.3 or older pyarrow version + # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa df = pd.DataFrame({'A': [1, 2, 3]}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e651892bde0a0d..b053afa4dd7d5d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1338,7 +1338,7 @@ def check(col): # this is parsed on Travis (linux), but not on macosx for some reason # even with the same versions of psycopg2 & sqlalchemy, possibly a - # Postgrsql server version difference + # Postgresql server version difference col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4ee918fa48dab3..06c753d1b8e21f 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2952,7 +2952,7 @@ def test_plain_axes(self): fig.add_axes([0.2, 0.2, 0.2, 0.2]) Series(rand(10)).plot(ax=ax) - # suppliad ax itself is a plain Axes, but because the cmap keyword + # supplied ax itself is a plain Axes, but because the cmap keyword # a new ax is created for the colorbar -> also multiples axes (GH11520) df = DataFrame({'a': randn(8), 'b': randn(8)}) fig = self.plt.figure() diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9dabb351967414..9a954b522333dd 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -529,7 +529,7 @@ def test_df_series_secondary_legend(self): assert ax.right_ax.get_yaxis().get_visible() tm.close() - # seconcary -> secondary (without passing ax) + # secondary -> secondary (without passing ax) _, ax = self.plt.subplots() ax = df.plot(secondary_y=True, ax=ax) s.plot(legend=True, secondary_y=True, ax=ax) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b487f865b68a4b..8eb41415552602 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -603,7 +603,7 @@ def test_other_datetime_unit(self): 'datetime64[ns]']: df2 = s.astype(dtype).to_frame('days') - # coerces to datetime64[ns], thus sholuld not be affected + # coerces to datetime64[ns], thus should not be affected assert df2['days'].dtype == 'datetime64[ns]' result = df1.merge(df2, left_on='entity_id', right_index=True) @@ -1243,9 +1243,9 @@ def test_merge_incompat_infer_boolean_object(self): ([0, 1], pd.Series([False, True], dtype=bool)), ]) def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): - # these are explicity allowed incompat merges, that pass thru + # these are explicitly allowed incompat merges, that pass thru # the result type is dependent on if the values on the rhs are - # inferred, otherwise these will be coereced 
to object + # inferred, otherwise these will be coerced to object df1 = DataFrame({'A': df1_vals}) df2 = DataFrame({'A': df2_vals}) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 1420d4420e430c..4f65251ebd9237 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2198,7 +2198,7 @@ def test_categorical_concat(self, sort): def test_categorical_concat_gh7864(self): # GH 7864 - # make sure ordering is preserverd + # make sure ordering is preserved df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')}) df["grade"] = Categorical(df["raw_grade"]) df['grade'].cat.set_categories(['e', 'a', 'b']) @@ -2265,7 +2265,7 @@ def test_categorical_index_preserver(self): }).set_index('B') tm.assert_frame_equal(result, expected) - # wrong catgories + # wrong categories df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe')) }).set_index('B') msg = "categories must match existing categories when appending" diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index f10876531e66a0..469072970133d0 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -213,7 +213,7 @@ def test_conversion(self): assert isinstance(td64, np.timedelta64) - # this is NOT equal and cannot be roundtriped (because of the nanos) + # this is NOT equal and cannot be roundtripped (because of the nanos) td = Timedelta('1 days, 10:11:12.012345678') assert td != td.to_pytimedelta() diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 657008856482f4..8b13458050ce8d 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -112,7 +112,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): ]) @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) def test_round_minute_freq(self, test_input, freq, expected, rounder): - # Ensure timestamps that shouldnt round dont! + # Ensure timestamps that shouldn't round dont! 
# GH#21262 dt = Timestamp(test_input) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5328a58e3fbff1..94050f75264445 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -634,7 +634,7 @@ def test_timedelta64_nan(self): # td np.float64 -> another float-object somewher on + # casting to -> np.float64 -> another float-object somewhere on # the way could lead jepardize this behavior comps = [np.nan] # could be casted to float64 values = [np.nan] diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d24ed9433f4f71..d82b205803b098 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -206,7 +206,7 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): result = getattr(o, op) - # these couuld be series, arrays or scalars + # these could be series, arrays or scalars if isinstance(result, Series) and isinstance(expected, Series): tm.assert_series_equal(result, expected) elif isinstance(result, Index) and isinstance(expected, Index): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e8d6b3bcaa77f7..aa9c9bb05f8774 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1840,7 +1840,7 @@ def test_sort_index_and_reconstruction(self): # 15622 # lexsortedness should be identical - # across MultiIndex consruction methods + # across MultiIndex construction methods df = DataFrame([[1, 1], [2, 2]], index=list('ab')) expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 9524a78dae16ce..4dfdd1c96728bd 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1387,7 +1387,7 @@ def quantile_func(x): def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior - # is analogus to Numpy's percentile + # is analogous to Numpy's percentile row = 10 col = 5 idx = pd.date_range('20100101', periods=row, freq='B') @@ -2003,7 +2003,7 @@ def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True # note that we may construct the 1st level of the MI - # in a non-motononic way, so compare accordingly + # in a non-monotonic way, so compare accordingly results = [] for i, df in enumerate(self.df1s): result = f(df) @@ -2154,7 +2154,7 @@ def is_constant(x): def no_nans(x): return x.notna().all().all() - # data is a tuple(object, is_contant, no_nans) + # data is a tuple(object, is_constant, no_nans) data = create_series() + create_dataframes() return [(x, is_constant(x), no_nans(x)) for x in data] diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index a1ad792e57bde1..151cd2a42ecef6 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -813,7 +813,7 @@ def test_call(self): assert self.offset4(self.d) == datetime(2014, 6, 30, 14) def test_sub(self): - # we have to override test_sub here becasue self.offset2 is not + # we have to override test_sub here because self.offset2 is not # defined as self._offset(2) off = self.offset2 msg = "Cannot subtract datetime from offset" diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 00837d36d9508e..ac20ad16696386 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -116,7 +116,7 @@ class DateOffset(BaseOffset): off specifying n in the keywords you use, but regardless 
it is there for you. n is needed for DateOffset subclasses. - DateOffets work as follows. Each offset specify a set of dates + DateOffset work as follows. Each offset specify a set of dates that conform to the DateOffset. For example, Bday defines this set to be the set of dates that are weekdays (M-F). To test if a date is in the set of a DateOffset dateOffset we can use the diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 107c17c5253fb2..f14b202b034d6d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1666,7 +1666,7 @@ def index_subclass_makers_generator(): def all_timeseries_index_generator(k=10): """Generator which can be iterated over to get instances of all the classes - which represent time-seires. + which represent time-series. Parameters ---------- @@ -1793,7 +1793,7 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, # pass None to index constructor for no name names = None - # make singelton case uniform + # make singleton case uniform if isinstance(names, str) and nlevels == 1: names = [names] @@ -1872,7 +1872,7 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, N < idx_nlevels, for just the first N levels. If ndupe doesn't divide nrows/ncol, the last label might have lower multiplicity. dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjuncion with a custom `data_gen_f` + have more control in conjunction with a custom `data_gen_f` r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". If idx_type is not None, `idx_nlevels` must be 1. "i"/"f" creates an integer/float index, diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 64eaf45376b2fa..dddd5eb1f1eab7 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -150,7 +150,7 @@ def error(code, **kwargs): code : str Error code. message : str - Error message with varaibles replaced. + Error message with variables replaced. """ return (code, ERROR_MSGS[code].format(**kwargs)) From 8ea2d087cda0f40a4e41ce108a32859d51b4d69f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Jun 2019 02:47:46 +0200 Subject: [PATCH 049/238] BUG: fix empty Series repr for subclasses (#27001) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/format.py | 3 ++- pandas/tests/series/test_subclass.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 109005364fca60..d10f9567188d14 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -793,6 +793,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) +- Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`). .. 
_whatsnew_0.250.contributors: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b2ef45b15e5494..152e9a2e9ab3d3 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -257,7 +257,8 @@ def to_string(self): footer = self._get_footer() if len(series) == 0: - return 'Series([], ' + footer + ')' + return "{name}([], {footer})".format( + name=self.series.__class__.__name__, footer=footer) fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 563a94f4588cbc..b47d339f5a5f20 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -39,6 +39,9 @@ def test_subclass_unstack(self): tm.assert_frame_equal(res, exp) + def test_subclass_empty_repr(self): + assert 'SubclassedSeries' in repr(tm.SubclassedSeries()) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesSubclassing: From 2da45994b63062396a2b75ead738b5df8ecc8070 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 05:19:44 -0700 Subject: [PATCH 050/238] TST: parametrize pytable test (#27032) --- pandas/core/arrays/base.py | 7 ++- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/tests/io/pytables/test_pytables.py | 74 +++++++++++------------ 4 files changed, 41 insertions(+), 44 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 20fd582179dc6a..51ad01dd6b369c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -115,6 +115,7 @@ class ExtensionArray: # ------------------------------------------------------------------------ # Constructors # ------------------------------------------------------------------------ + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): """ @@ -286,6 +287,7 @@ def __iter__(self): # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ + @property def dtype(self) -> ExtensionDtype: """ @@ -319,6 +321,7 @@ def nbytes(self) -> int: # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): """ Cast to a NumPy array with 'dtype'. @@ -479,8 +482,7 @@ def dropna(self): def shift( self, periods: int = 1, - fill_value: object = None, - ) -> ABCExtensionArray: + fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. 
@@ -836,6 +838,7 @@ def copy(self, deep: bool = False) -> ABCExtensionArray: # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ + def __repr__(self): from pandas.io.formats.printing import format_object_summary diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 91be320a3e674d..1b4e001620286d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -480,7 +480,7 @@ def first_not_none(values): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here so = self._selected_obj - if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()): + if so.ndim == 2 and so.dtypes.apply(is_datetimelike).any(): result = result.apply( lambda x: to_numeric(x, errors='ignore')) date_cols = self._selected_obj.select_dtypes( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7fe34279c04826..592c385dd87ec0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1027,7 +1027,7 @@ def set(self, item, value): value_is_extension_type = (is_extension_type(value) or is_extension_array_dtype(value)) - # categorical/spares/datetimetz + # categorical/sparse/datetimetz if value_is_extension_type: def value_getitem(placement): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 413c11ba2f9fe8..be318ede2df4a9 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -1070,47 +1070,41 @@ def test_encoding(self): result = store.select('df', Term('columns=A', encoding='ascii')) tm.assert_frame_equal(result, expected) - def test_latin_encoding(self): - - values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] - - def _try_decode(x, encoding='latin-1'): - try: - return x.decode(encoding) - except AttributeError: - return x - # not sure how to remove latin-1 from code in python 2 and 3 - values = [[_try_decode(x) for x in y] for y in values] - - examples = [] - for dtype in ['category', object]: - for val in values: - examples.append(pd.Series(val, dtype=dtype)) - - def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): - with ensure_clean_path(self.path) as store: - s.to_hdf(store, key, format='table', encoding=encoding, - nan_rep=nan_rep) - retr = read_hdf(store, key) - s_nan = s.replace(nan_rep, np.nan) - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - assert_series_equal(s_nan, retr, check_dtype=False, - check_categorical=False) - else: - assert_series_equal(s_nan, retr) - - for s in examples: - roundtrip(s) + @pytest.mark.parametrize('val', [ + [b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c'] + ]) + @pytest.mark.parametrize('dtype', ['category', object]) + def test_latin_encoding(self, dtype, val): + enc = 'latin-1' + nan_rep = '' + 
key = 'data' + + val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] + ser = pd.Series(val, dtype=dtype) + + with ensure_clean_path(self.path) as store: + ser.to_hdf(store, key, format='table', encoding=enc, + nan_rep=nan_rep) + retr = read_hdf(store, key) + + s_nan = ser.replace(nan_rep, np.nan) + + if is_categorical_dtype(s_nan): + assert is_categorical_dtype(retr) + assert_series_equal(s_nan, retr, check_dtype=False, + check_categorical=False) + else: + assert_series_equal(s_nan, retr) + # FIXME: don't leave commented-out # fails: # for x in examples: # roundtrip(s, nan_rep=b'\xf8\xfc') From 606178a91c4003f589ec64b08f853164fd45ada2 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Tue, 25 Jun 2019 20:34:25 +0800 Subject: [PATCH 051/238] Remove pandas.core.index.datetimelike from MyPy Blacklist (#26280) --- mypy.ini | 5 +---- pandas/core/algorithms.py | 3 ++- pandas/core/arrays/base.py | 4 ++-- pandas/core/indexes/datetimelike.py | 24 ++++++++++++++---------- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mypy.ini b/mypy.ini index eea6a3b5516778..f8b37ee5b86638 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,7 +3,4 @@ ignore_missing_imports=True follow_imports=silent [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True - -[mypy-pandas.core.indexes.datetimelike] -ignore_errors=True +ignore_errors=True \ No newline at end of file diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 932ac71a23ed03..ff1313c21d96f6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,6 +3,7 @@ intended for public consumption """ from textwrap import dedent +from typing import Dict from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -27,7 +28,7 @@ from pandas.core import common as com -_shared_docs = {} +_shared_docs = {} # type: Dict[str, str] # --------------- # diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 51ad01dd6b369c..d1dfb6b5e8599d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,7 +6,7 @@ without warning. """ import operator -from typing import Any, Callable, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np @@ -26,7 +26,7 @@ _not_implemented_message = "{} does not implement {}." -_extension_array_shared_docs = dict() +_extension_array_shared_docs = dict() # type: Dict[str, str] class ExtensionArray: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index abe2853c75c874..7c90fb11aa1bf1 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,6 +2,7 @@ Base and utility classes for tseries type pandas objects. """ import operator +from typing import Set import warnings import numpy as np @@ -62,14 +63,17 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. 
They can be made into cache_readonly for Index # subclasses bc they are immutable - inferred_freq = cache_readonly(DatetimeLikeArrayMixin.inferred_freq.fget) - _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) - hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) + inferred_freq = cache_readonly( + DatetimeLikeArrayMixin.inferred_freq.fget) # type: ignore + _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore + hasnans = cache_readonly( + DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget) - resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) + _resolution = cache_readonly( + DatetimeLikeArrayMixin._resolution.fget) # type: ignore + resolution = cache_readonly( + DatetimeLikeArrayMixin.resolution.fget) # type: ignore - _box_values = ea_passthrough(DatetimeLikeArrayMixin._box_values) _maybe_mask_results = ea_passthrough( DatetimeLikeArrayMixin._maybe_mask_results) __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) @@ -131,11 +135,11 @@ def _ndarray_values(self): # Abstract data attributes @property - def values(self) -> np.ndarray: + def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data - @property + @property # type: ignore # https://github.com/python/mypy/issues/1362 @Appender(DatetimeLikeArrayMixin.asi8.__doc__) def asi8(self): return self._data.asi8 @@ -762,9 +766,9 @@ class DatetimelikeDelegateMixin(PandasDelegate): boxed in an index, after being returned from the array """ # raw_methods : dispatch methods that shouldn't be boxed in an Index - _raw_methods = set() + _raw_methods = set() # type: Set[str] # raw_properties : dispatch properties that shouldn't be boxed in an Index - _raw_properties = set() + _raw_properties = set() # type: Set[str] name = None _data = None From f0919f272d9614058b5ebb5e0664d1ac6f23540f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 05:48:13 -0700 Subject: [PATCH 052/238] BUG: Fix timedelta64+Timestamp, closes #24775 (#26916) --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/_libs/tslibs/c_timestamp.pyx | 12 +++++++++ .../tests/scalar/timestamp/test_arithmetic.py | 20 +++++++++++++++ .../scalar/timestamp/test_comparisons.py | 12 +++++++++ .../tests/scalar/timestamp/test_timestamp.py | 25 ------------------- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d10f9567188d14..a58cdc8c93ab7b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -603,6 +603,8 @@ Datetimelike - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) - Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`). 
- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) +- Bug where adding :class:`Timestamp` to a ``np.timedelta64`` object would raise instead of returning a :class:`Timestamp` (:issue:`24775`) +- Bug where comparing a zero-dimensional numpy array containing a ``np.datetime64`` object to a :class:`Timestamp` would incorrect raise ``TypeError`` (:issue:`26916`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 6bf6b6dcea8dd5..f9d1a906207fe1 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -55,6 +55,9 @@ def maybe_integer_op_deprecated(obj): cdef class _Timestamp(datetime): + # higher than np.ndarray and np.matrix + __array_priority__ = 100 + def __hash__(_Timestamp self): if self.nanosecond: return hash(self.value) @@ -85,6 +88,15 @@ cdef class _Timestamp(datetime): if ndim == 0: if is_datetime64_object(other): other = self.__class__(other) + elif is_array(other): + # zero-dim array, occurs if try comparison with + # datetime64 scalar on the left hand side + # Unfortunately, for datetime64 values, other.item() + # incorrectly returns an integer, so we need to use + # the numpy C api to extract it. + other = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), + other) + other = self.__class__(other) else: return NotImplemented elif is_array(other): diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 21e1dccaefc4bf..8310b140b50e00 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -112,3 +112,23 @@ def test_addition_subtraction_preserve_frequency(self): td64 = np.timedelta64(1, 'D') assert (ts + td64).freq == original_freq assert (ts - td64).freq == original_freq + + @pytest.mark.parametrize('td', [Timedelta(hours=3), + np.timedelta64(3, 'h'), + timedelta(hours=3)]) + def test_radd_tdscalar(self, td): + # GH#24775 timedelta64+Timestamp should not raise + ts = Timestamp.now() + assert td + ts == ts + td + + @pytest.mark.parametrize('other,expected_difference', [ + (np.timedelta64(-123, 'ns'), -123), + (np.timedelta64(1234567898, 'ns'), 1234567898), + (np.timedelta64(-123, 'us'), -123000), + (np.timedelta64(-123, 'ms'), -123000000) + ]) + def test_timestamp_add_timedelta64_unit(self, other, expected_difference): + ts = Timestamp(datetime.utcnow()) + result = ts + other + valdiff = result.value - ts.value + assert valdiff == expected_difference diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 763cfc23ea8322..b572b4607108cd 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -156,6 +156,18 @@ def test_timestamp_compare_with_early_datetime(self): assert stamp < datetime(2700, 1, 1) assert stamp <= datetime(2700, 1, 1) + def test_compare_zerodim_array(self): + # GH#26916 + ts = Timestamp.now() + dt64 = np.datetime64('2016-01-01', 'ns') + arr = np.array(dt64) + assert arr.ndim == 0 + + result = arr < ts + assert result is True + result = arr > ts + assert result is False + def test_rich_comparison_with_unsupported_type(): # Comparisons with unsupported objects should return NotImplemented diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4b6b0dac916c62..b9946796a4e1fb 100644 --- 
a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -796,31 +796,6 @@ def test_tz_conversion_freq(self, tz_naive_fixture): class TestTimestampNsOperations: - def setup_method(self, method): - self.timestamp = Timestamp(datetime.utcnow()) - - def assert_ns_timedelta(self, modified_timestamp, expected_value): - value = self.timestamp.value - modified_value = modified_timestamp.value - - assert modified_value - value == expected_value - - def test_timedelta_ns_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'), - -123) - - def test_timedelta_ns_based_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64( - 1234567898, 'ns'), 1234567898) - - def test_timedelta_us_arithmetic(self): - self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'), - -123000) - - def test_timedelta_ms_arithmetic(self): - time = self.timestamp + np.timedelta64(-123, 'ms') - self.assert_ns_timedelta(time, -123000000) - def test_nanosecond_string_parsing(self): ts = Timestamp('2013-05-01 07:15:45.123456789') # GH 7878 From f5587633eec08212737158df98e3afbe3afd06f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 09:11:51 -0700 Subject: [PATCH 053/238] TST/REF: parametrize arithmetic tests, simplify parts of core.ops (#26799) --- pandas/core/ops.py | 53 +++++++++---------- pandas/tests/arithmetic/test_datetime64.py | 32 +++++++---- pandas/tests/arithmetic/test_period.py | 18 +++++-- pandas/tests/arithmetic/test_timedelta64.py | 19 +++++-- .../offsets/test_offsets_properties.py | 8 ++- 5 files changed, 81 insertions(+), 49 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 86a255321f8277..0b9e56fd19556a 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1077,7 +1077,7 @@ def fill_binop(left, right, fill_value): return left, right -def mask_cmp_op(x, y, op, allowed_types): +def mask_cmp_op(x, y, op): """ Apply the function `op` to only non-null points in x and y. @@ -1086,16 +1086,14 @@ def mask_cmp_op(x, y, op, allowed_types): x : array-like y : array-like op : binary operation - allowed_types : class or tuple of classes Returns ------- result : ndarray[bool] """ - # TODO: Can we make the allowed_types arg unnecessary? xrav = x.ravel() result = np.empty(x.size, dtype=bool) - if isinstance(y, allowed_types): + if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notna(xrav) & notna(yrav) result[mask] = op(np.array(list(xrav[mask])), @@ -1633,39 +1631,38 @@ def _arith_method_SERIES(cls, op, special): if op in [divmod, rdivmod] else _construct_result) def na_op(x, y): - import pandas.core.computation.expressions as expressions - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result - - def safe_na_op(lvalues, rvalues): """ - return the result of evaluating na_op on the passed in values + Return the result of evaluating op on the passed in values. - try coercion to object type if the native types are not compatible + If native types are not compatible, try coersion to object dtype. 
Parameters ---------- - lvalues : array-like - rvalues : array-like + x : array-like + y : array-like or scalar + + Returns + ------- + array-like Raises ------ - TypeError: invalid operation + TypeError : invalid operation """ + import pandas.core.computation.expressions as expressions try: - with np.errstate(all='ignore'): - return na_op(lvalues, rvalues) - except Exception: - if is_object_dtype(lvalues): - return libalgos.arrmap_object(lvalues, - lambda x: op(x, rvalues)) + result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) + except TypeError: + result = masked_arith_op(x, y, op) + except Exception: # TODO: more specific? + if is_object_dtype(x): + return libalgos.arrmap_object(x, + lambda val: op(val, y)) raise + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) + return result + def wrapper(left, right): if isinstance(right, ABCDataFrame): return NotImplemented @@ -1713,7 +1710,8 @@ def wrapper(left, right): if isinstance(rvalues, ABCSeries): rvalues = rvalues.values - result = safe_na_op(lvalues, rvalues) + with np.errstate(all='ignore'): + result = na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None) @@ -2136,7 +2134,6 @@ def na_op(x, y): result = masked_arith_op(x, y, op) result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result if op_name in _op_descriptions: @@ -2183,7 +2180,7 @@ def na_op(x, y): with np.errstate(invalid='ignore'): result = op(x, y) except TypeError: - result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) + result = mask_cmp_op(x, y, op) return result doc = _flex_comp_doc_FRAME.format(op_name=op_name, diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index afd29852fea7e1..64b4e162483f11 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -37,6 +37,27 @@ def assert_all(obj): # ------------------------------------------------------------------ # Comparisons +class TestDatetime64ArrayLikeComparisons: + # Comparison tests for datetime64 vectors fully parametrized over + # DataFrame/Series/DatetimeIndex/DateteimeArray. Ideally all comparison + # tests will eventually end up here. 
+ + def test_compare_zerodim(self, tz_naive_fixture, box_with_array): + # Test comparison with zero-dimensional array is unboxed + tz = tz_naive_fixture + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + dti = date_range('20130101', periods=3, tz=tz) + + other = np.array(dti.to_numpy()[0]) + + # FIXME: ValueError with transpose on tzaware + dtarr = tm.box_expected(dti, box, transpose=False) + result = dtarr <= other + expected = np.array([True, False, False]) + expected = tm.box_expected(expected, xbox, transpose=False) + tm.assert_equal(result, expected) + class TestDatetime64DataFrameComparison: @pytest.mark.parametrize('timestamps', [ @@ -339,17 +360,6 @@ def test_comparison_tzawareness_compat(self, op): class TestDatetimeIndexComparisons: - # TODO: parametrize over box - def test_compare_zerodim(self, tz_naive_fixture): - # Test comparison with zero-dimensional array is unboxed - tz = tz_naive_fixture - dti = date_range('20130101', periods=3, tz=tz) - - other = np.array(dti.to_numpy()[0]) - result = dti <= other - expected = np.array([True, False, False]) - tm.assert_numpy_array_equal(result, expected) - # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate @pytest.mark.parametrize("op", [ operator.eq, operator.ne, operator.gt, operator.lt, diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index bc1b78bf944d15..413d58d9429e7e 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -20,17 +20,27 @@ # Comparisons -class TestPeriodIndexComparisons: +class TestPeriodArrayLikeComparisons: + # Comparison tests for PeriodDtype vectors fully parametrized over + # DataFrame/Series/PeriodIndex/PeriodArray. Ideally all comparison + # tests will eventually end up here. - # TODO: parameterize over boxes - def test_compare_zerodim(self): + def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + pi = pd.period_range('2000', periods=4) other = np.array(pi.to_numpy()[0]) + pi = tm.box_expected(pi, box_with_array) result = pi <= other expected = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(result, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + +class TestPeriodIndexComparisons: + # TODO: parameterize over boxes @pytest.mark.parametrize("other", ["2017", 2017]) def test_eq(self, other): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 047900c3d7586d..22b5fd452d6615 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -31,22 +31,33 @@ def get_upcast_box(box, vector): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons -class TestTimedelta64ArrayComparisons: - # TODO: All of these need to be parametrized over box +class TestTimedelta64ArrayLikeComparisons: + # Comparison tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all comparison + # tests will eventually end up here. 
- def test_compare_timedelta64_zerodim(self): + def test_compare_timedelta64_zerodim(self, box_with_array): # GH#26689 should unbox when comparing with zerodim array + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + tdi = pd.timedelta_range('2H', periods=4) other = np.array(tdi.to_numpy()[0]) + tdi = tm.box_expected(tdi, box) res = tdi <= other expected = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(res, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) with pytest.raises(TypeError): # zero-dim of wrong dtype should still raise tdi >= np.array(4) + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box + def test_compare_timedelta_series(self): # regression test for GH#5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 50be2deca4d30c..271f4ceef5f49a 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -71,7 +71,10 @@ def test_on_offset_implementations(dt, offset): assert offset.onOffset(dt) == (compare == dt) -@pytest.mark.xfail +@pytest.mark.xfail(reason="res_v2 below is incorrect, needs to use the " + "commented-out version with tz_localize. " + "But with that fix in place, hypothesis then " + "has errors in timezone generation.") @given(gen_yqm_offset, gen_date_range) def test_apply_index_implementations(offset, rng): # offset.apply_index(dti)[i] should match dti[i] + offset @@ -82,6 +85,7 @@ def test_apply_index_implementations(offset, rng): res = rng + offset res_v2 = offset.apply_index(rng) + # res_v2 = offset.apply_index(rng.tz_localize(None)).tz_localize(rng.tz) assert (res == res_v2).all() assert res[0] == rng[0] + offset @@ -93,7 +97,7 @@ def test_apply_index_implementations(offset, rng): # TODO: Check randomly assorted entries, not just first/last -@pytest.mark.xfail +@pytest.mark.xfail # TODO: reason? 
@given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and From c9182df84736ce060c30d386c9f3a97614ca7778 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Jun 2019 09:13:19 -0700 Subject: [PATCH 054/238] CLN: Remove never-True Block.is_sparse (#27037) --- pandas/core/dtypes/concat.py | 5 +---- pandas/core/internals/blocks.py | 34 +++++++++---------------------- pandas/core/internals/concat.py | 2 -- pandas/core/internals/managers.py | 20 +++++++----------- 4 files changed, 18 insertions(+), 43 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a01ba7fc94f229..242885c7a96793 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -73,10 +73,7 @@ def _get_series_result_type(result, objs=None): return DataFrame # otherwise it is a SingleBlockManager (axis = 0) - if result._block.is_sparse: - return SparseSeries - else: - return objs[0]._constructor + return objs[0]._constructor def _get_frame_result_type(result, objs): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b8..92ea936944a3ca 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -62,9 +62,7 @@ class Block(PandasObject): is_bool = False is_object = False is_categorical = False - is_sparse = False is_extension = False - _box_to_block_values = True _can_hold_na = False _can_consolidate = True _verify_integrity = True @@ -182,10 +180,6 @@ def get_values(self, dtype=None): def to_dense(self): return self.values.view() - @property - def _na_value(self): - return np.nan - @property def fill_value(self): return np.nan @@ -1189,8 +1183,6 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): # sparse is treated like an ndarray, but needs .get_values() shaping values = self.values - if self.is_sparse: - values = self.get_values() if fill_tuple is None: fill_value = self.fill_value @@ -1411,6 +1403,9 @@ def quantile(self, qs, interpolation='linear', axis=0): ------- Block """ + # We should always have ndim == 2 becase Series dispatches to DataFrame + assert self.ndim == 2 + if self.is_datetimetz: # TODO: cleanup this special case. # We need to operate on i8 values for datetimetz @@ -1420,8 +1415,7 @@ def quantile(self, qs, interpolation='linear', axis=0): # TODO: NonConsolidatableMixin shape # Usual shape inconsistencies for ExtensionBlocks - if self.ndim > 1: - values = values[None, :] + values = values[None, :] else: values = self.get_values() values, _ = self._try_coerce_args(values, values) @@ -1433,14 +1427,11 @@ def quantile(self, qs, interpolation='linear', axis=0): qs = [qs] if is_empty: - if self.ndim == 1: - result = self._na_value - else: - # create the array of na_values - # 2d len(values) * len(qs) - result = np.repeat(np.array([self.fill_value] * len(qs)), - len(values)).reshape(len(values), - len(qs)) + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat(np.array([self.fill_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: # asarray needed for Sparse, see GH#24600 # TODO: Why self.values and not values? 
@@ -1451,8 +1442,7 @@ def quantile(self, qs, interpolation='linear', axis=0): interpolation=interpolation) result = np.array(result, copy=False) - if self.ndim > 1: - result = result.T + result = result.T if orig_scalar and not lib.is_scalar(result): # result could be scalar in case with is_empty and self.ndim == 1 @@ -2024,10 +2014,6 @@ class DatetimeLikeBlockMixin: def _holder(self): return DatetimeArray - @property - def _na_value(self): - return tslibs.NaT - @property def fill_value(self): return tslibs.iNaT diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d92c15e1d6f93f..8f699ae24230db 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -187,8 +187,6 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_categorical', False): pass - elif getattr(self.block, 'is_sparse', False): - pass elif getattr(self.block, 'is_extension', False): pass else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 592c385dd87ec0..26b6920c119dd0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -102,16 +102,11 @@ def __init__(self, self.blocks = tuple(blocks) # type: Tuple[Block, ...] for block in blocks: - if block.is_sparse: - if len(block.mgr_locs) != 1: - raise AssertionError("Sparse block refers to multiple " - "items") - else: - if self.ndim != block.ndim: - raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) + if self.ndim != block.ndim: + raise AssertionError( + 'Number of Block dimensions ({block}) must equal ' + 'number of axes ({self})'.format(block=block.ndim, + self=self.ndim)) if do_integrity_check: self._verify_integrity() @@ -966,7 +961,7 @@ def iget(self, i, fastpath=True): """ block = self.blocks[self._blknos[i]] values = block.iget(self._blklocs[i]) - if not fastpath or not block._box_to_block_values or values.ndim != 1: + if not fastpath or values.ndim != 1: return values # fastpath shortcut for select a single-dim from a 2-dim BM @@ -1820,8 +1815,7 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block] -) -> Optional[Union[np.dtype, ExtensionDtype]]: + blocks: List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. Parameters From 6119d0253e013b31de7d050a52af2226e864d304 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 26 Jun 2019 06:21:36 -0500 Subject: [PATCH 055/238] CI: conda 4.7.1 (#26595) CI compat with conda 4.7.x Bump minimum deps for removal of free. 
--- ci/azure/windows.yml | 4 +++- ci/deps/travis-36-locale.yaml | 12 +++++------- doc/source/install.rst | 6 +++--- doc/source/whatsnew/v0.25.0.rst | 6 ++++++ pandas/compat/_optional.py | 3 ++- pandas/io/html.py | 2 +- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 6d4afccb578651..20cad1bb4af962 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -21,7 +21,9 @@ jobs: displayName: 'Add conda to PATH' - script: conda update -q -n base conda displayName: Update conda - - script: conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml + - script: | + call activate + conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml displayName: 'Create anaconda environment' - script: | call activate pandas-dev diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index badf4e6932da80..75e3348adab7c7 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -8,24 +8,22 @@ dependencies: - python-blosc - cython>=0.28.2 - fastparquet=0.2.1 - - gcsfs=0.1.0 + - gcsfs=0.2.2 - html5lib - ipython - jinja2 - - lxml=3.7.0 - - matplotlib=3.0.0 + - lxml=3.8.0 + - matplotlib=3.0.* - nomkl - numexpr - numpy - openpyxl - pandas-gbq=0.8.0 - psycopg2=2.6.2 - - pymysql=0.7.9 + - pymysql=0.7.11 - pytables - python-dateutil - # cannot go past python=3.6.6 for matplotlib=3.0.0 due to - # https://github.com/matplotlib/matplotlib/issues/12626 - - python=3.6.6 + - python=3.6.* - pytz - s3fs=0.0.8 - scipy diff --git a/doc/source/install.rst b/doc/source/install.rst index ee4b36f898e314..013a27c980e977 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -276,15 +276,15 @@ SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing blosc Compression for msgpack fastparquet 0.2.1 Parquet reading / writing -gcsfs 0.1.0 Google Cloud Storage access +gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) -lxml HTML parser for read_html (see :ref:`note `) +lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization openpyxl 2.4.8 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing -pymysql MySQL engine for sqlalchemy +pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a58cdc8c93ab7b..18a3785867714c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -455,12 +455,18 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+ | fastparquet | 0.2.1 | +-----------------+-----------------+ +| gcsfs | 0.2.2 | ++-----------------+-----------------+ +| lxml | 3.8.0 | ++-----------------+-----------------+ | matplotlib | 2.2.2 | +-----------------+-----------------+ | openpyxl | 2.4.8 | +-----------------+-----------------+ | pyarrow | 0.9.0 | +-----------------+-----------------+ +| pymysql | 0.7.1 | ++-----------------+-----------------+ | pytables | 3.4.2 | +-----------------+-----------------+ | scipy | 0.19.0 | diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 875edb3d3f1dd1..31746dc3d6c164 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -9,7 +9,8 @@ "bs4": "4.6.0", 
"bottleneck": "1.2.1", "fastparquet": "0.2.1", - "gcsfs": "0.1.0", + "gcsfs": "0.2.2", + "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", "openpyxl": "2.4.8", diff --git a/pandas/io/html.py b/pandas/io/html.py index 2e2327a35f2c74..15b9d25f6be6c3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -40,7 +40,7 @@ def _importers(): on_version="ignore") _HAS_BS4 = bs4 is not None - lxml = import_optional_dependency("lxml", raise_on_missing=False, + lxml = import_optional_dependency("lxml.etree", raise_on_missing=False, on_version="ignore") _HAS_LXML = lxml is not None From b4aa1d66ecc8d391e8003a0a398bd96fe956bfdf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 26 Jun 2019 07:18:54 -0500 Subject: [PATCH 056/238] PKG: Add test extra (#27039) --- setup.cfg | 1 + setup.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/setup.cfg b/setup.cfg index 68d042ecfc4b8d..eb687c1f546d43 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ split_penalty_after_opening_bracket = 1000000 split_penalty_logical_operator = 30 [tool:pytest] +# sync minversion with setup.cfg & install.rst minversion = 4.0.2 testpaths = pandas markers = diff --git a/setup.py b/setup.py index 389e8553eb3a31..0380c717ecb415 100755 --- a/setup.py +++ b/setup.py @@ -784,4 +784,12 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): classifiers=CLASSIFIERS, platforms='any', python_requires='>=3.5', + extras_require={ + 'test': [ + # sync with setup.cfg minversion & install.rst + 'pytest>=4.0.2', + 'pytest-xdist', + 'hypothesis>=3.58', + ] + }, **setuptools_kwargs) From a7f1d69b135bbbf649cf1af9a62d79acb963e47c Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Wed, 26 Jun 2019 21:28:15 +0800 Subject: [PATCH 057/238] [ENH] nargsort handles EA with its _values_for_argsort (#26854) --- pandas/core/sorting.py | 17 +++++++---------- pandas/tests/extension/base/methods.py | 10 ++++++++++ pandas/tests/test_sorting.py | 10 +--------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 21c0c8f747b10e..750a4c903176f8 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,6 +1,4 @@ """ miscellaneous sorting / groupby utilities """ -import warnings - import numpy as np from pandas._libs import algos, hashtable, lib @@ -238,13 +236,15 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): handles NaNs. It adds ascending and na_position parameters. 
GH #6399, #5231 """ + from pandas.core.internals.arrays import extract_array + items = extract_array(items) + mask = np.asarray(isna(items)) # specially handle Categorical if is_categorical_dtype(items): if na_position not in {'first', 'last'}: raise ValueError('invalid na_position: {!r}'.format(na_position)) - mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) if ascending and na_position == 'last': @@ -255,15 +255,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx - with warnings.catch_warnings(): - # https://github.com/pandas-dev/pandas/issues/25439 - # can be removed once ExtensionArrays are properly handled by nargsort - warnings.filterwarnings( - "ignore", category=FutureWarning, - message="Converting timezone-aware DatetimeArray to") + if is_extension_array_dtype(items): + items = items._values_for_argsort() + else: items = np.asanyarray(items) + idx = np.arange(len(items)) - mask = isna(items) non_nans = items[~mask] non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c8fd4d1b708e5c..94069994079e3a 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +from pandas.core.sorting import nargsort import pandas.util.testing as tm from .base import BaseExtensionTests @@ -51,6 +52,15 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) + @pytest.mark.parametrize('na_position, expected', [ + ('last', np.array([2, 0, 1], dtype='int64')), + ('first', np.array([1, 2, 0], dtype='int64')) + ]) + def test_nargsort(self, data_missing_for_sorting, na_position, expected): + # GH 25439 + result = nargsort(data_missing_for_sorting, na_position=na_position) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 75fa37eb9af095..f198fb6ae57b1e 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -6,8 +6,7 @@ from numpy import nan import pytest -from pandas import ( - DataFrame, MultiIndex, Series, array, concat, merge, to_datetime) +from pandas import DataFrame, MultiIndex, Series, array, concat, merge from pandas.core import common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, @@ -181,13 +180,6 @@ def test_nargsort(self): exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - def test_nargsort_datetimearray_warning(self): - # https://github.com/pandas-dev/pandas/issues/25439 - # can be removed once the FutureWarning for np.array(DTA) is removed - data = to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels') - with tm.assert_produces_warning(None): - nargsort(data) - class TestMerge: From ff50b46045886604dd70438f73df7bf9da3da89b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 26 Jun 2019 08:56:34 -0700 Subject: [PATCH 058/238] CLN: Remove no-longer-used BlockManager.xs (#27043) * Remove no-longer-used BlockManager.xs --- pandas/core/internals/blocks.py | 17 ++-------- 
pandas/core/internals/managers.py | 42 ------------------------ pandas/core/panel.py | 8 +---- pandas/tests/internals/test_internals.py | 15 --------- 4 files changed, 4 insertions(+), 78 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 92ea936944a3ca..c6be56df7ae0c9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -647,24 +647,13 @@ def _try_cast_result(self, result, dtype=None): if self.is_integer or self.is_bool or self.is_datetime: pass elif self.is_float and result.dtype == self.dtype: - # protect against a bool/object showing up here if isinstance(dtype, str) and dtype == 'infer': return result - if not isinstance(dtype, type): - dtype = dtype.type - if issubclass(dtype, (np.bool_, np.object_)): - if issubclass(dtype, np.bool_): - if isna(result).all(): - return result.astype(np.bool_) - else: - result = result.astype(np.object_) - result[result == 1] = True - result[result == 0] = False - return result - else: - return result.astype(np.object_) + # This is only reached via Block.setitem, where dtype is always + # either "infer", self.dtype, or values.dtype. + assert dtype == self.dtype, (dtype, self.dtype) return result # may need to change the dtype here diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 26b6920c119dd0..aff39d765dc95f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -816,48 +816,6 @@ def to_dict(self, copy=True): return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} - def xs(self, key, axis=1, copy=True, takeable=False): - if axis < 1: - raise AssertionError( - 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis)) - - # take by position - if takeable: - loc = key - else: - loc = self.axes[axis].get_loc(key) - - slicer = [slice(None, None) for _ in range(self.ndim)] - slicer[axis] = loc - slicer = tuple(slicer) - - new_axes = list(self.axes) - - # could be an array indexer! 
- if isinstance(loc, (slice, np.ndarray)): - new_axes[axis] = new_axes[axis][loc] - else: - new_axes.pop(axis) - - new_blocks = [] - if len(self.blocks) > 1: - # we must copy here as we are mixed type - for blk in self.blocks: - newb = make_block(values=blk.values[slicer], - klass=blk.__class__, - placement=blk.mgr_locs) - new_blocks.append(newb) - elif len(self.blocks) == 1: - block = self.blocks[0] - vals = block.values[slicer] - if copy: - vals = vals.copy() - new_blocks = [make_block(values=vals, - placement=block.mgr_locs, - klass=block.__class__)] - - return self.__class__(new_blocks, new_axes) - def fast_xs(self, loc): """ get a cross sectional for a given location in the diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 9d6b7333ca39fd..c0340fc975a7e0 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -856,13 +856,7 @@ def xs(self, key, axis=1): if axis == 0: return self[key] - self._consolidate_inplace() - axis_number = self._get_axis_number(axis) - new_data = self._data.xs(key, axis=axis_number, copy=False) - result = self._construct_return_type(new_data) - copy = new_data.is_mixed_type - result._set_is_copy(self, copy=copy) - return result + raise NotImplementedError("Panel is removed in pandas 0.25.0") _xs = xs diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fbd821f8ec3422..b997e2b6eec8fc 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -705,21 +705,6 @@ def test_reindex_items(self): mgr.get('d').internal_values(), reindexed.get('d').internal_values()) - def test_multiindex_xs(self): - mgr = create_mgr('a,b,c: f8; d,e,f: i8') - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - - mgr.set_axis(1, index) - result = mgr.xs('bar', axis=1) - assert result.shape == (6, 2) - assert result.axes[1][0] == ('bar', 'one') - assert result.axes[1][1] == ('bar', 'two') - def test_get_numeric_data(self): mgr = create_mgr('int: int; float: float; complex: complex;' 'str: object; bool: bool; obj: object; dt: datetime', From 19d5d53988dbfa58ce1ecc3d83b7025e4171a6e5 Mon Sep 17 00:00:00 2001 From: Pav A Date: Wed, 26 Jun 2019 20:24:45 +0100 Subject: [PATCH 059/238] DEV: Install pyreadstat using conda instead (#27061) --- environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index de9bd67dd9f062..c21a0949fc4039 100644 --- a/environment.yml +++ b/environment.yml @@ -79,5 +79,4 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - pip: - - pyreadstat # pandas.read_spss + - pyreadstat # pandas.read_spss From d94146cf5d5d9f899a31308d0a4d9e5d132f6442 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Wed, 26 Jun 2019 20:48:43 +0100 Subject: [PATCH 060/238] BUG: Raise a ValueError when index and data lengths don't match (#26911) * Raise a ValueError when index and data lengths don't match --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/core/indexing.py | 49 ++++++++++++++++++++++-------- pandas/tests/indexing/test_iloc.py | 14 +++++++-- pandas/tests/indexing/test_loc.py | 12 +++++++- 4 files changed, 62 insertions(+), 15 deletions(-) diff --git 
a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 18a3785867714c..901e4f69428976 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -673,6 +673,8 @@ Indexing ^^^^^^^^ - Improved exception message when calling :meth:`DataFrame.iloc` with a list of non-numeric objects (:issue:`25753`). +- Improved exception message when calling ``.iloc`` or ``.loc`` with a boolean indexer with different length (:issue:`26658`). +- Bug in ``.iloc`` and ``.loc`` with a boolean indexer not raising an ``IndexError`` when too few items are passed (:issue:`26658`). - Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`). - Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f6aa54f4836d9a..1539feb2e0856c 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2407,29 +2407,54 @@ def convert_to_index_sliceable(obj, key): return None -def check_bool_indexer(ax, key): - # boolean indexing, need to check that the data are aligned, otherwise - # disallowed +def check_bool_indexer(index: Index, key) -> np.ndarray: + """ + Check if key is a valid boolean indexer for an object with such index and + perform reindexing or conversion if needed. + + This function assumes that is_bool_indexer(key) == True. 
+ + Parameters + ---------- + index : Index + Index of the object on which the indexing is done + key : list-like + Boolean indexer to check - # this function assumes that is_bool_indexer(key) == True + Returns + ------- + result: np.array + Resulting key + Raises + ------ + IndexError + If the key does not have the same length as index + + IndexingError + If the index of the key is unalignable to index + + """ result = key - if isinstance(key, ABCSeries) and not key.index.equals(ax): - result = result.reindex(ax) + if isinstance(key, ABCSeries) and not key.index.equals(index): + result = result.reindex(index) mask = isna(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series provided as ' 'indexer (index of the boolean Series and of ' - 'the indexed object do not match') + 'the indexed object do not match).') result = result.astype(bool)._values - elif is_sparse(result): - result = result.to_dense() - result = np.asarray(result, dtype=bool) else: - # is_bool_indexer has already checked for nulls in the case of an - # object array key, so no check needed here + if is_sparse(result): + result = result.to_dense() result = np.asarray(result, dtype=bool) + # GH26658 + if len(result) != len(index): + raise IndexError( + 'Item wrong length {} instead of {}.'.format(len(result), + len(index))) + return result diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 6b5ad66e268df2..8b54907131b8c2 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -265,6 +265,16 @@ def test_iloc_getitem_bool(self): typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) + @pytest.mark.parametrize('index', [[True, False], + [True, False, True, False]]) + def test_iloc_getitem_bool_diff_len(self, index): + # GH26658 + s = Series([1, 2, 3]) + with pytest.raises(IndexError, + match=('Item wrong length {} instead of {}.'.format( + len(index), len(s)))): + _ = s.iloc[index] + def test_iloc_getitem_slice(self): # slices @@ -614,10 +624,10 @@ def test_iloc_mask(self): 'cannot use an indexable as a mask'), ('locs', ''): 'Unalignable boolean Series provided as indexer ' '(index of the boolean Series and of the indexed ' - 'object do not match', + 'object do not match).', ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' '(index of the boolean Series and of the ' - 'indexed object do not match', + 'indexed object do not match).', ('locs', '.iloc'): ('iLocation based boolean indexing on an ' 'integer type is not available'), } diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 11d0fa2602baac..2f6e908717071f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -194,7 +194,17 @@ def test_loc_getitem_bool(self): typs=['ints', 'uints', 'labels', 'mixed', 'ts', 'floats']) self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], - fails=KeyError) + fails=IndexError) + + @pytest.mark.parametrize('index', [[True, False], + [True, False, True, False]]) + def test_loc_getitem_bool_diff_len(self, index): + # GH26658 + s = Series([1, 2, 3]) + with pytest.raises(IndexError, + match=('Item wrong length {} instead of {}.'.format( + len(index), len(s)))): + _ = s.loc[index] def test_loc_getitem_int_slice(self): From edf5ae8585a5cc45ab0a95d924555fd9ec2576d7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 26 Jun 2019 22:17:02 -0400 Subject: [PATCH 061/238] Admin: Disable codecov comments and re-enable results in the checks 
(#27066) --- codecov.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codecov.yml b/codecov.yml index 512bc2e82a7363..1644bf315e0ac7 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,13 +1,13 @@ codecov: branch: master +comment: off + coverage: status: project: default: - enabled: no target: '82' patch: default: - enabled: no target: '50' From e9555159afd8caabc88b45ca6c06a16efe2f6afc Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 26 Jun 2019 21:49:01 -0500 Subject: [PATCH 062/238] Used default follow import strategy (#27048) --- mypy.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/mypy.ini b/mypy.ini index f8b37ee5b86638..d29beeca73f1b8 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,5 @@ [mypy] ignore_missing_imports=True -follow_imports=silent [mypy-pandas.conftest,pandas.tests.*] ignore_errors=True \ No newline at end of file From 1452e714bd3484d8da1bc060357e619e9de97c11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Jun 2019 10:44:06 -0500 Subject: [PATCH 063/238] COMPAT: 32-bit nargsort (#27064) --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 94069994079e3a..d9e61e6a227e63 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -53,8 +53,8 @@ def test_argsort_missing(self, data_missing_for_sorting): self.assert_series_equal(result, expected) @pytest.mark.parametrize('na_position, expected', [ - ('last', np.array([2, 0, 1], dtype='int64')), - ('first', np.array([1, 2, 0], dtype='int64')) + ('last', np.array([2, 0, 1], dtype=np.dtype('intp'))), + ('first', np.array([1, 2, 0], dtype=np.dtype('intp'))) ]) def test_nargsort(self, data_missing_for_sorting, na_position, expected): # GH 25439 From c1673cf42e2827c06b16c3f9bc2affd9df7c311a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 11:37:36 -0500 Subject: [PATCH 064/238] CLN/PERF: remove gc.collect from the setting_with_copy checks as not needed (#27031) * CLN/PERF: remove gc.collect from the setting_with_copy checks as not needed * fixup whatsnew * add asv --- asv_bench/benchmarks/indexing.py | 18 ++++++- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 86 +++++++++++++++----------------- 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 4c932cf3600e85..4e82fa55925292 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -5,7 +5,7 @@ from pandas import (Series, DataFrame, MultiIndex, Int64Index, UInt64Index, Float64Index, IntervalIndex, CategoricalIndex, - IndexSlice, concat, date_range) + IndexSlice, concat, date_range, option_context) class NumericSeriesIndexing: @@ -335,4 +335,20 @@ def time_assign_with_setitem(self): self.df[i] = np.random.randn(self.N) +class ChainIndexing: + + params = [None, 'warn'] + param_names = ['mode'] + + def setup(self, mode): + self.N = 1000000 + + def time_chained_indexing(self, mode): + with warnings.catch_warnings(record=True): + with option_context('mode.chained_assignment', mode): + df = DataFrame({'A': np.arange(self.N), 'B': 'foo'}) + df2 = df[df.A > self.N // 2] + df2['C'] = 1.0 + + from .pandas_vb_common import setup # noqa: F401 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 901e4f69428976..ffd5ba19cd0746 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ 
b/doc/source/whatsnew/v0.25.0.rst @@ -579,6 +579,7 @@ Performance Improvements - Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`) - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`) - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`) +- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 992c83e66090e8..1af3e9449f3dab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3263,58 +3263,50 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): """ - if force or self._is_copy: + # return early if the check is not needed + if not (force or self._is_copy): + return - value = config.get_option('mode.chained_assignment') - if value is None: - return - - # see if the copy is not actually referred; if so, then dissolve - # the copy weakref - try: - gc.collect(2) - if not gc.get_referents(self._is_copy()): - self._is_copy = None - return - except Exception: - pass + value = config.get_option('mode.chained_assignment') + if value is None: + return - # we might be a false positive - try: - if self._is_copy().shape == self.shape: - self._is_copy = None - return - except Exception: - pass + # see if the copy is not actually referred; if so, then dissolve + # the copy weakref + if self._is_copy is not None and not isinstance(self._is_copy, str): + r = self._is_copy() + if not gc.get_referents(r) or r.shape == self.shape: + self._is_copy = None + return - # a custom message - if isinstance(self._is_copy, str): - t = self._is_copy + # a custom message + if isinstance(self._is_copy, str): + t = self._is_copy - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) + elif t == 'referant': + t = ("\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame\n\n" + "See the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) - else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value " - "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) - - if value == 'raise': - raise com.SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, com.SettingWithCopyWarning, - stacklevel=stacklevel) + else: + t = ("\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value " + "instead\n\nSee the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + if value == 'raise': + raise com.SettingWithCopyError(t) + elif value == 'warn': + warnings.warn(t, com.SettingWithCopyWarning, + stacklevel=stacklevel) def __delitem__(self, key): """ From b80df7b7ea216f119cab87eb6fd2c4c6b81b3303 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Jun 2019 
12:24:53 -0500 Subject: [PATCH 065/238] API/REGR: Convert to float for index union (#27034) --- doc/source/user_guide/indexing.rst | 23 ++++++++++---- doc/source/whatsnew/v0.25.0.rst | 5 +++ pandas/core/indexes/numeric.py | 46 ++++++++++++++++++++++++++-- pandas/tests/indexes/test_numeric.py | 26 ++++++++++++++++ pandas/tests/indexes/test_setops.py | 39 ++++++++++++++++++++--- 5 files changed, 126 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4ea7c656fd197a..02522e95a2d79e 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1559,11 +1559,11 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. index.levels[1] index.set_levels(["a", "b"], level=1) +.. _indexing.set_ops: + Set operations on Index objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _indexing.set_ops: - The two main operations are ``union (|)`` and ``intersection (&)``. These can be directly called as instance methods or used via overloaded operators. Difference is provided via the ``.difference()`` method. @@ -1592,11 +1592,22 @@ with duplicates dropped. The resulting index from a set operation will be sorted in ascending order. -Missing values -~~~~~~~~~~~~~~ +When performing :meth:`Index.union` between indexes with different dtypes, the indexes +must be cast to a common dtype. Typically, though not always, this is object dtype. The +exception is when performing a union between integer and float data. In this case, the +integer values are converted to float + +.. ipython:: python + + idx1 = pd.Index([0, 1, 2]) + idx2 = pd.Index([0.5, 1.5]) + idx1 | idx2 .. _indexing.missing: +Missing values +~~~~~~~~~~~~~~ + .. important:: Even though ``Index`` can hold missing values (``NaN``), it should be avoided @@ -1624,11 +1635,11 @@ Occasionally you will load or create a data set into a DataFrame and want to add an index after you've already done so. There are a couple of different ways. +.. _indexing.set_index: + Set an index ~~~~~~~~~~~~ -.. _indexing.set_index: - DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name (for a regular ``Index``) or a list of column names (for a ``MultiIndex``). To create a new, re-indexed DataFrame: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ffd5ba19cd0746..e697c34c85c181 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -347,6 +347,11 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) +Note that integer- and floating-dtype indexes are considered "compatible". The integer +values are coerced to floating point, which may result in loss of precision. See +:ref:`indexing.set_ops` for more. 
+ + ``DataFrame`` groupby ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b6c8ba588f9d66..a228895e527aa9 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,9 +7,11 @@ from pandas.core.dtypes.common import ( is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, - is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) + is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion, + pandas_dtype) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex +from pandas.core.dtypes.generic import ( + ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index) from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -123,6 +125,24 @@ def insert(self, loc, item): item = self._na_value return super().insert(loc, item) + def _union(self, other, sort): + # Right now, we treat union(int, float) a bit special. + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # We may change union(int, float) to go to object. + # float | [u]int -> float (the special case) + # | -> T + # | -> object + needs_cast = ( + (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or + (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype)) + ) + if needs_cast: + first = self.astype("float") + second = other.astype("float") + return first._union(second, sort) + else: + return super()._union(other, sort) + _num_index_shared_docs['class_descr'] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -225,7 +245,9 @@ def _assert_safe_casting(cls, data, subarr): def _is_compatible_with_other(self, other): return ( super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex)) + or all(isinstance(type(obj), (ABCInt64Index, + ABCFloat64Index, + ABCRangeIndex)) for obj in [self, other]) ) @@ -301,6 +323,14 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCUInt64Index, + ABCFloat64Index)) + for obj in [self, other]) + ) + UInt64Index._add_numeric_methods() UInt64Index._add_logical_methods() @@ -447,6 +477,16 @@ def isin(self, values, level=None): self._validate_index_level(level) return algorithms.isin(np.array(self), values) + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCInt64Index, + ABCFloat64Index, + ABCUInt64Index, + ABCRangeIndex)) + for obj in [self, other]) + ) + Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index c61e0fa6d60210..3437f501aa9109 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1118,3 +1118,29 @@ def test_join_outer(self): tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) + + +@pytest.mark.parametrize("dtype", ['int64', 'uint64']) +def test_int_float_union_dtype(dtype): + # https://github.com/pandas-dev/pandas/issues/26778 + # [u]int | float -> float + index = pd.Index([0, 2, 3], dtype=dtype) + other 
= pd.Float64Index([0.5, 1.5]) + expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0]) + result = index.union(other) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + +def test_range_float_union_dtype(): + # https://github.com/pandas-dev/pandas/issues/26778 + index = pd.RangeIndex(start=0, stop=3) + other = pd.Float64Index([0.5, 1.5]) + result = index.union(other) + expected = pd.Float64Index([0.0, 0.5, 1, 1.5, 2.0]) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index b626ced2ccb1b1..8c0762c7e7e5a3 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,6 +2,7 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. ''' +from collections import OrderedDict import itertools as it import numpy as np @@ -10,13 +11,17 @@ from pandas.core.dtypes.common import is_dtype_equal import pandas as pd -from pandas import Int64Index, RangeIndex +from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +from pandas.api.types import pandas_dtype from pandas.tests.indexes.conftest import indices_list import pandas.util.testing as tm -COMPATIBLE_INCONSISTENT_PAIRS = { - (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex) -} +COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([ + ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), + ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), +]) @pytest.fixture(params=list(it.combinations(indices_list, 2)), @@ -74,3 +79,29 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): assert res1.dtype in (idx1.dtype, idx2.dtype) assert res2.dtype in (idx1.dtype, idx2.dtype) + + +@pytest.mark.parametrize('left, right, expected', [ + ('int64', 'int64', 'int64'), + ('int64', 'uint64', 'object'), + ('int64', 'float64', 'float64'), + ('uint64', 'float64', 'float64'), + ('uint64', 'uint64', 'uint64'), + ('float64', 'float64', 'float64'), + ('datetime64[ns]', 'int64', 'object'), + ('datetime64[ns]', 'uint64', 'object'), + ('datetime64[ns]', 'float64', 'object'), + ('datetime64[ns, CET]', 'int64', 'object'), + ('datetime64[ns, CET]', 'uint64', 'object'), + ('datetime64[ns, CET]', 'float64', 'object'), + ('Period[D]', 'int64', 'object'), + ('Period[D]', 'uint64', 'object'), + ('Period[D]', 'float64', 'object'), +]) +def test_union_dtypes(left, right, expected): + left = pandas_dtype(left) + right = pandas_dtype(right) + a = pd.Index([], dtype=left) + b = pd.Index([], dtype=right) + result = (a | b).dtype + assert result == expected From f8b0b9fa2b5811a8c49d7096dbe47254e341054e Mon Sep 17 00:00:00 2001 From: Inevitable-Marzipan <43890311+Inevitable-Marzipan@users.noreply.github.com> Date: Thu, 27 Jun 2019 19:16:27 +0100 Subject: [PATCH 066/238] BUG/TST: fillna limit checks and tests (#27077) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/internals/blocks.py | 12 ++++++------ pandas/tests/frame/test_missing.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e697c34c85c181..0853a5962272a5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -693,6 
+693,7 @@ Missing - Fixed misleading exception message in :meth:`Series.interpolate` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`). - Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`) +- A ``ValueError`` will now be thrown by :meth:`DataFrame.fillna` when ``limit`` is not a positive integer (:issue:`27042`) - MultiIndex diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c6be56df7ae0c9..0b2af9391784c5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -344,12 +344,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): """ inplace = validate_bool_kwarg(inplace, 'inplace') - if not self._can_hold_na: - if inplace: - return self - else: - return self.copy() - mask = isna(self.values) if limit is not None: if not is_integer(limit): @@ -361,6 +355,12 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): "is currently limited to 2") mask[mask.cumsum(self.ndim - 1) > limit] = False + if not self._can_hold_na: + if inplace: + return self + else: + return self.copy() + # fillna, but if we cannot coerce, then try again as an ObjectBlock try: values, _ = self._try_coerce_args(self.values, value) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index a1dbeba3642406..c72951ac4cdfa6 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -516,6 +516,22 @@ def test_fillna_skip_certain_blocks(self): # it works! df.fillna(np.nan) + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_positive_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be greater than 0" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=-5) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_integer_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be an integer" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=0.5) + def test_fillna_inplace(self): df = DataFrame(np.random.randn(10, 4)) df[1][:4] = np.nan From 38ab7523c15d09e23c538755703aeb338ec35a1c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 13:33:19 -0500 Subject: [PATCH 067/238] BUG: fix index validation in algos.take (#27079) --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/sparse.py | 9 --------- pandas/tests/test_take.py | 5 +++++ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ff1313c21d96f6..77664b3fa73d06 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1526,7 +1526,7 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): if allow_fill: # Pandas style, -1 means NA - validate_indices(indices, len(arr)) + validate_indices(indices, arr.shape[axis]) result = take_1d(arr, indices, axis=axis, allow_fill=True, fill_value=fill_value) else: diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index d692fe6d7cabef..1c4ab70fa9332d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1868,15 +1868,6 @@ def _maybe_to_dense(obj): return obj -def _maybe_to_sparse(array): - """ - array must be SparseSeries or SparseArray - """ - if isinstance(array, ABCSparseSeries): - array = array.array.copy() - return array - - def make_sparse(arr, kind='block', fill_value=None, 
dtype=None, copy=False): """ Convert ndarray to sparse format diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index a60fce99eb9a7c..afcc90a1c8e74b 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -420,6 +420,11 @@ def test_take_axis_1(self): expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]]) tm.assert_numpy_array_equal(result, expected) + # GH#26976 make sure we validate along the correct axis + with pytest.raises(IndexError, match="indices are out-of-bounds"): + algos.take(arr, [0, 3], axis=1, allow_fill=True, + fill_value=0) + class TestExtensionTake: # The take method found in pd.api.extensions From 27f9d05aa3852741571e274681512423c441f72d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 15:39:24 -0500 Subject: [PATCH 068/238] BUG: fix tzaware dataframe transpose bug (#26825) --- doc/source/whatsnew/v0.25.0.rst | 2 + pandas/core/groupby/generic.py | 49 +++++-- pandas/core/internals/construction.py | 23 +++- pandas/tests/arithmetic/test_datetime64.py | 134 +++++++++----------- pandas/tests/arithmetic/test_timedelta64.py | 6 +- pandas/tests/frame/test_constructors.py | 29 +++++ pandas/tests/frame/test_dtypes.py | 6 +- pandas/tests/frame/test_operators.py | 41 ++++++ pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_groupby.py | 8 +- 10 files changed, 207 insertions(+), 97 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0853a5962272a5..2487acfe5579a0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -784,6 +784,7 @@ Reshaping - Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`) - Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) - Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`) +- Bug in :meth:`DataFrame.transpose` where transposing a DataFrame with a timezone-aware datetime column would incorrectly raise ``ValueError`` (:issue:`26825`) Sparse ^^^^^^ @@ -811,6 +812,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) - Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`). +- Bug in :class:`DataFrame` where passing an object array of timezone-aware `datetime` objects would incorrectly raise ``ValueError`` (:issue:`13287`) .. 
_whatsnew_0.250.contributors: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1b4e001620286d..a10920b7a5afb4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,10 +21,12 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import ( + maybe_convert_objects, maybe_downcast_to_dtype) from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_bool, is_datetimelike, - is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar) + is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_object_dtype, + is_scalar) from pandas.core.dtypes.missing import isna, notna from pandas._typing import FrameOrSeries @@ -334,7 +336,6 @@ def _decide_output_index(self, output, labels): def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same - from pandas.core.tools.numeric import to_numeric if len(keys) == 0: return DataFrame(index=keys) @@ -406,7 +407,6 @@ def first_not_none(values): # provide a reduction (Frame -> Series) if groups are # unique if self.squeeze: - # assign the name to this series if singular_series: values[0].name = keys[0] @@ -481,14 +481,7 @@ def first_not_none(values): # as we are stacking can easily have object dtypes here so = self._selected_obj if so.ndim == 2 and so.dtypes.apply(is_datetimelike).any(): - result = result.apply( - lambda x: to_numeric(x, errors='ignore')) - date_cols = self._selected_obj.select_dtypes( - include=['datetime', 'timedelta']).columns - date_cols = date_cols.intersection(result.columns) - result[date_cols] = (result[date_cols] - ._convert(datetime=True, - coerce=True)) + result = _recast_datetimelike_result(result) else: result = result._convert(datetime=True) @@ -1710,3 +1703,35 @@ def _normalize_keyword_aggregation(kwargs): order.append((column, com.get_callable_name(aggfunc) or aggfunc)) return aggspec, columns, order + + +def _recast_datetimelike_result(result: DataFrame) -> DataFrame: + """ + If we have date/time like in the original, then coerce dates + as we are stacking can easily have object dtypes here. 
+ + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + + Notes + ----- + - Assumes Groupby._selected_obj has ndim==2 and at least one + datetimelike column + """ + result = result.copy() + + obj_cols = [idx for idx in range(len(result.columns)) + if is_object_dtype(result.dtypes[idx])] + + # See GH#26285 + for n in obj_cols: + converted = maybe_convert_objects(result.iloc[:, n].values, + convert_numeric=False) + + result.iloc[:, n] = converted + return result diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d766d7f06d34a1..ecdf8a1f77b94e 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -159,9 +159,28 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values): - values = maybe_infer_to_datetimelike(values) - return create_block_manager_from_blocks([values], [columns, index]) + if values.ndim == 2 and values.shape[0] != 1: + # transpose and separate blocks + + dvals_list = [maybe_infer_to_datetimelike(row) for row in values] + for n in range(len(dvals_list)): + if isinstance(dvals_list[n], np.ndarray): + dvals_list[n] = dvals_list[n].reshape(1, -1) + + from pandas.core.internals.blocks import make_block + + # TODO: What about re-joining object columns? + block_values = [make_block(dvals_list[n], placement=[n]) + for n in range(len(dvals_list))] + + else: + datelike_vals = maybe_infer_to_datetimelike(values) + block_values = [datelike_vals] + else: + block_values = [values] + + return create_block_manager_from_blocks(block_values, [columns, index]) def init_dict(data, index, columns, dtype=None): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 64b4e162483f11..b1091d38c10d01 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -448,8 +448,7 @@ def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other, # GH#19301 tz = tz_naive_fixture dti = pd.date_range('2016-01-01', periods=2, tz=tz) - # FIXME: ValueError with transpose - dtarr = tm.box_expected(dti, box_with_array, transpose=False) + dtarr = tm.box_expected(dti, box_with_array) msg = 'Invalid comparison between' with pytest.raises(TypeError, match=msg): dtarr < other @@ -597,49 +596,63 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): @pytest.mark.parametrize('op', [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le]) - def test_comparison_tzawareness_compat(self, op, box_with_array): + def test_comparison_tzawareness_compat(self, op, box_df_fail): # GH#18162 + box = box_df_fail + dr = pd.date_range('2016-01-01', periods=6) dz = dr.tz_localize('US/Pacific') - # FIXME: ValueError with transpose - dr = tm.box_expected(dr, box_with_array, transpose=False) - dz = tm.box_expected(dz, box_with_array, transpose=False) + dr = tm.box_expected(dr, box) + dz = tm.box_expected(dz, box) msg = 'Cannot compare tz-naive and tz-aware' with pytest.raises(TypeError, match=msg): op(dr, dz) - if box_with_array is not pd.DataFrame: - # DataFrame op is invalid until transpose bug is fixed - with pytest.raises(TypeError, match=msg): - op(dr, list(dz)) - with pytest.raises(TypeError, match=msg): - op(dr, np.array(list(dz), dtype=object)) + # FIXME: DataFrame case fails to raise for == and !=, wrong + # message for inequalities + with 
pytest.raises(TypeError, match=msg): + op(dr, list(dz)) + with pytest.raises(TypeError, match=msg): + op(dr, np.array(list(dz), dtype=object)) with pytest.raises(TypeError, match=msg): op(dz, dr) - if box_with_array is not pd.DataFrame: - # DataFrame op is invalid until transpose bug is fixed - with pytest.raises(TypeError, match=msg): - op(dz, list(dr)) - with pytest.raises(TypeError, match=msg): - op(dz, np.array(list(dr), dtype=object)) + + # FIXME: DataFrame case fails to raise for == and !=, wrong + # message for inequalities + with pytest.raises(TypeError, match=msg): + op(dz, list(dr)) + with pytest.raises(TypeError, match=msg): + op(dz, np.array(list(dr), dtype=object)) # Check that there isn't a problem aware-aware and naive-naive do not # raise assert_all(dr == dr) assert_all(dz == dz) - if box_with_array is not pd.DataFrame: - # DataFrame doesn't align the lists correctly unless we transpose, - # which we cannot do at the moment - assert (dr == list(dr)).all() - assert (dz == list(dz)).all() + + # FIXME: DataFrame case fails to raise for == and !=, wrong + # message for inequalities + assert (dr == list(dr)).all() + assert (dz == list(dz)).all() + + @pytest.mark.parametrize('op', [operator.eq, operator.ne, + operator.gt, operator.ge, + operator.lt, operator.le]) + def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): + # GH#18162 + dr = pd.date_range('2016-01-01', periods=6) + dz = dr.tz_localize('US/Pacific') + + dr = tm.box_expected(dr, box_with_array) + dz = tm.box_expected(dz, box_with_array) # Check comparisons against scalar Timestamps ts = pd.Timestamp('2000-03-14 01:59') ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam') assert_all(dr > ts) + msg = 'Cannot compare tz-naive and tz-aware' with pytest.raises(TypeError, match=msg): op(dr, ts_tz) @@ -662,8 +675,7 @@ def test_scalar_comparison_tzawareness(self, op, other, tz_aware_fixture, tz = tz_aware_fixture dti = pd.date_range('2016-01-01', periods=2, tz=tz) - # FIXME: ValueError with transpose - dtarr = tm.box_expected(dti, box_with_array, transpose=False) + dtarr = tm.box_expected(dti, box_with_array) msg = 'Cannot compare tz-naive and tz-aware' with pytest.raises(TypeError, match=msg): op(dtarr, other) @@ -725,17 +737,16 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, xbox = box_with_array if box_with_array is not pd.Index else np.ndarray rng = date_range('1/1/2000', periods=10, tz=tz) - # FIXME: ValueError with transpose - rng = tm.box_expected(rng, box_with_array, transpose=False) + rng = tm.box_expected(rng, box_with_array) result = rng == other expected = np.array([False] * 10) - expected = tm.box_expected(expected, xbox, transpose=False) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) result = rng != other expected = np.array([True] * 10) - expected = tm.box_expected(expected, xbox, transpose=False) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) msg = 'Invalid comparison between' with pytest.raises(TypeError, match=msg): @@ -826,9 +837,8 @@ def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture, expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - # FIXME: calling with transpose=True raises ValueError - rng = tm.box_expected(rng, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = rng + two_hours 
tm.assert_equal(result, expected) @@ -841,9 +851,8 @@ def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture, expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - # FIXME: calling with transpose=True raises ValueError - rng = tm.box_expected(rng, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) rng += two_hours tm.assert_equal(rng, expected) @@ -856,9 +865,8 @@ def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture, expected = pd.date_range('1999-12-31 22:00', '2000-01-31 22:00', tz=tz) - # FIXME: calling with transpose=True raises ValueError - rng = tm.box_expected(rng, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = rng - two_hours tm.assert_equal(result, expected) @@ -871,9 +879,8 @@ def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture, expected = pd.date_range('1999-12-31 22:00', '2000-01-31 22:00', tz=tz) - # FIXME: calling with transpose=True raises ValueError - rng = tm.box_expected(rng, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) rng -= two_hours tm.assert_equal(rng, expected) @@ -928,9 +935,6 @@ def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): - if box_with_array is pd.DataFrame: - pytest.xfail("FIXME: ValueError with transpose; " - "alignment error without") tz = tz_naive_fixture dti = pd.date_range('2016-01-01', periods=3, tz=tz) @@ -952,7 +956,7 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, result = dtarr - tdarr tm.assert_equal(result, expected) - msg = 'cannot subtract' + msg = 'cannot subtract|bad operand type for unary -' with pytest.raises(TypeError, match=msg): tdarr - dtarr @@ -997,13 +1001,11 @@ def test_dt64arr_sub_timestamp(self, box_with_array): tz='US/Eastern') ts = ser[0] - # FIXME: transpose raises ValueError - ser = tm.box_expected(ser, box_with_array, transpose=False) + ser = tm.box_expected(ser, box_with_array) delta_series = pd.Series([np.timedelta64(0, 'D'), np.timedelta64(1, 'D')]) - expected = tm.box_expected(delta_series, box_with_array, - transpose=False) + expected = tm.box_expected(delta_series, box_with_array) tm.assert_equal(ser - ts, expected) tm.assert_equal(ts - ser, -expected) @@ -1011,20 +1013,19 @@ def test_dt64arr_sub_timestamp(self, box_with_array): def test_dt64arr_sub_NaT(self, box_with_array): # GH#18808 dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp('19900315')]) - ser = tm.box_expected(dti, box_with_array, transpose=False) + ser = tm.box_expected(dti, box_with_array) result = ser - pd.NaT expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]') - # FIXME: raises ValueError with transpose - expected = tm.box_expected(expected, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) dti_tz = dti.tz_localize('Asia/Tokyo') - ser_tz = tm.box_expected(dti_tz, box_with_array, transpose=False) + ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - pd.NaT expected = pd.Series([pd.NaT, pd.NaT], 
dtype='timedelta64[ns]') - expected = tm.box_expected(expected, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) # ------------------------------------------------------------- @@ -1044,16 +1045,13 @@ def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, box_with_array): - if box_with_array is pd.DataFrame: - pytest.xfail("FIXME: ValueError with transpose; " - "alignment error without") tz = tz_aware_fixture dti = pd.date_range('2016-01-01', periods=3, tz=tz) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = 'DatetimeArray subtraction must have the same timezones or' + msg = 'subtraction must have the same timezones or' with pytest.raises(TypeError, match=msg): dtarr - dt64vals with pytest.raises(TypeError, match=msg): @@ -1064,9 +1062,6 @@ def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): - if box_with_array is pd.DataFrame: - pytest.xfail("FIXME: ValueError with transpose; " - "alignment error without") tz = tz_naive_fixture dti = pd.date_range('2016-01-01', periods=3, tz=tz) @@ -1214,9 +1209,8 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', '2010-11-01 07:00'], freq='H', tz=tz) - # FIXME: these raise ValueError with transpose=True - dates = tm.box_expected(dates, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + dates = tm.box_expected(dates, box_with_array) + expected = tm.box_expected(expected, box_with_array) # TODO: parametrize over the scalar being added? radd? sub? 
offset = dates + pd.offsets.Hour(5) @@ -1369,26 +1363,25 @@ def test_dt64arr_add_sub_DateOffset(self, box_with_array): s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), Timestamp('2000-02-15', tz='US/Central')], name='a') - # FIXME: ValueError with tzaware DataFrame transpose - s = tm.box_expected(s, box_with_array, transpose=False) + s = tm.box_expected(s, box_with_array) result = s + pd.offsets.Day() result2 = pd.offsets.Day() + s exp = DatetimeIndex([Timestamp('2000-01-16 00:15:00', tz='US/Central'), Timestamp('2000-02-16', tz='US/Central')], name='a') - exp = tm.box_expected(exp, box_with_array, transpose=False) + exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), Timestamp('2000-02-15', tz='US/Central')], name='a') - s = tm.box_expected(s, box_with_array, transpose=False) + s = tm.box_expected(s, box_with_array) result = s + pd.offsets.MonthEnd() result2 = pd.offsets.MonthEnd() + s exp = DatetimeIndex([Timestamp('2000-01-31 00:15:00', tz='US/Central'), Timestamp('2000-02-29', tz='US/Central')], name='a') - exp = tm.box_expected(exp, box_with_array, transpose=False) + exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -1425,9 +1418,6 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): # GH#18849 - if box_with_array is pd.DataFrame: - pytest.xfail("FIXME: ValueError with transpose; " - "alignment error without") tz = tz_naive_fixture dti = pd.date_range('2017-01-01', periods=2, tz=tz) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 22b5fd452d6615..0ae325cfce7877 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -869,10 +869,8 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): idx = TimedeltaIndex(['1 day', '2 day']) expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz) - # FIXME: fails with transpose=True because of tz-aware DataFrame - # transpose bug - idx = tm.box_expected(idx, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = idx + other tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c6508072cb8c7a..434ee2f8bf0afe 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2400,3 +2400,32 @@ def test_nested_dict_construction(self): index=pd.Index([2001, 2002, 2003]) ) tm.assert_frame_equal(result, expected) + + def test_from_tzaware_object_array(self): + # GH#26825 2D object array of tzaware timestamps should not raise + dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + data = dti._data.astype(object).reshape(1, -1) + df = pd.DataFrame(data) + assert df.shape == (1, 3) + assert (df.dtypes == dti.dtype).all() + assert (df == dti).all().all() + + def test_from_tzaware_mixed_object_array(self): + # GH#26825 + arr = np.array([ + [Timestamp('2013-01-01 00:00:00'), + Timestamp('2013-01-02 00:00:00'), + Timestamp('2013-01-03 00:00:00')], + [Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern'), + pd.NaT, + Timestamp('2013-01-03 00:00:00-0500', 
tz='US/Eastern')], + [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), + pd.NaT, + Timestamp('2013-01-03 00:00:00+0100', tz='CET')]], + dtype=object).T + res = DataFrame(arr, columns=['A', 'B', 'C']) + + expected_dtypes = ['datetime64[ns]', + 'datetime64[ns, US/Eastern]', + 'datetime64[ns, CET]'] + assert (res.dtypes == expected_dtypes).all() diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 96cf70483d4e7f..7ed601e4f70461 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -978,9 +978,11 @@ def test_astype(self): Timestamp('2013-01-03 00:00:00+0100', tz='CET')]], dtype=object).T + expected = DataFrame(expected, + index=self.tzframe.index, + columns=self.tzframe.columns, dtype=object) result = self.tzframe.astype(object) - assert_frame_equal(result, DataFrame( - expected, index=self.tzframe.index, columns=self.tzframe.columns)) + assert_frame_equal(result, expected) result = self.tzframe.astype('datetime64[ns]') expected = DataFrame({'A': date_range('20130101', periods=3), diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index f1c8445bf98e0a..1e932879e9ad0f 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -793,3 +793,44 @@ def test_no_warning(self, all_arithmetic_operators): b = df['B'] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b, 0) + + +class TestTranspose: + def test_transpose_tzaware_1col_single_tz(self): + # GH#26825 + dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + + df = pd.DataFrame(dti) + assert (df.dtypes == dti.dtype).all() + res = df.T + assert (res.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_single_tz(self): + # GH#26825 + dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + + df3 = pd.DataFrame({'A': dti, 'B': dti}) + assert (df3.dtypes == dti.dtype).all() + res3 = df3.T + assert (res3.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_mixed_tz(self): + # GH#26825 + dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti2 = dti.tz_convert('US/Pacific') + + df4 = pd.DataFrame({'A': dti, 'B': dti2}) + assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() + assert (df4.T.dtypes == object).all() + tm.assert_frame_equal(df4.T.T, df4) + + def test_transpose_object_to_tzaware_mixed_tz(self): + # GH#26825 + dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti2 = dti.tz_convert('US/Pacific') + + # mixed all-tzaware dtypes + df2 = pd.DataFrame([dti, dti2]) + assert (df2.dtypes == object).all() + res2 = df2.T + assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3d9bfcd126377d..14f27f0c4c7d87 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -963,12 +963,14 @@ def test_count(): df['9th'] = df['9th'].astype('category') - for key in '1st', '2nd', ['1st', '2nd']: + for key in ['1st', '2nd', ['1st', '2nd']]: left = df.groupby(key).count() right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) - # GH5610 + +def test_count_non_nulls(): + # GH#5610 # count counts non-nulls df = pd.DataFrame([[1, 2, 'foo'], [1, np.nan, 'bar'], diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3da3ab22b643bc..dcd0d3938c6a57 100644 --- a/pandas/tests/groupby/test_groupby.py +++ 
b/pandas/tests/groupby/test_groupby.py @@ -129,12 +129,14 @@ def func(dataf): result = df.groupby('X', squeeze=False).count() assert isinstance(result, DataFrame) + +def test_inconsistent_return_type(): # GH5592 # inconsistent return type df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', - 'Pony', 'Pony'], B=Series( - np.arange(7), dtype='int64'), C=date_range( - '20130101', periods=7))) + 'Pony', 'Pony'], + B=Series(np.arange(7), dtype='int64'), + C=date_range('20130101', periods=7))) def f(grp): return grp.iloc[0] From b387da894076c7427ba7a2fb467c3e821a2cef27 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 16:22:37 -0500 Subject: [PATCH 069/238] API: remove deep keyword from EA.copy (#27083) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/base.py | 7 +------ pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 11 +++-------- pandas/core/arrays/interval.py | 11 +++-------- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/sparse.py | 8 ++------ pandas/core/indexes/interval.py | 4 +++- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/construction.py | 4 +++- pandas/core/sparse/series.py | 4 +++- pandas/tests/arrays/sparse/test_array.py | 6 +++--- pandas/tests/extension/arrow/bool.py | 7 ++----- pandas/tests/extension/arrow/test_bool.py | 9 ++++++--- pandas/tests/extension/base/interface.py | 8 ++++++++ pandas/tests/extension/conftest.py | 2 +- pandas/tests/extension/decimal/array.py | 6 ++---- pandas/tests/extension/json/array.py | 2 +- pandas/tests/extension/test_sparse.py | 4 ++++ 19 files changed, 49 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2487acfe5579a0..ff84ceae417cd5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -805,6 +805,7 @@ ExtensionArray - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) +- Keyword argument ``deep`` has been removed from :method:`ExtensionArray.copy` (:issue:`27083`) Other ^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d1dfb6b5e8599d..6340cc732d6c1c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -820,15 +820,10 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self, deep: bool = False) -> ABCExtensionArray: + def copy(self) -> ABCExtensionArray: """ Return a copy of the array. - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. 
- Returns ------- ExtensionArray diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ebf1f692ccde60..93166759d8dbdd 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -605,7 +605,7 @@ def _concat_same_type(cls, to_concat): values = np.concatenate([x.asi8 for x in to_concat]) return cls(values, dtype=dtype) - def copy(self, deep=False): + def copy(self): values = self.asi8.copy() return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 07d5664f987149..88de497a3329fa 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,4 +1,3 @@ -import copy import sys from typing import Type import warnings @@ -375,14 +374,10 @@ def take(self, indexer, allow_fill=False, fill_value=None): return type(self)(result, mask, copy=False) - def copy(self, deep=False): + def copy(self): data, mask = self._data, self._mask - if deep: - data = copy.deepcopy(data) - mask = copy.deepcopy(mask) - else: - data = data.copy() - mask = mask.copy() + data = data.copy() + mask = mask.copy() return type(self)(data, mask, copy=False) def __setitem__(self, key, value): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 71f4cbae7c58d5..aaa41241825982 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -680,21 +680,16 @@ def _shallow_copy(self, left=None, right=None, closed=None): return self._simple_new( left, right, closed=closed, verify_integrity=False) - def copy(self, deep=False): + def copy(self): """ Return a copy of the array. - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. - Returns ------- IntervalArray """ - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right + left = self.left.copy(deep=True) + right = self.right.copy(deep=True) closed = self.closed # TODO: Could skip verify_integrity here. 
return type(self).from_arrays(left, right, closed=closed) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index f651f89fab8340..1c5dc7666c3a15 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -285,7 +285,7 @@ def take(self, indices, allow_fill=False, fill_value=None): fill_value=fill_value) return type(self)(result) - def copy(self, deep=False): + def copy(self): return type(self)(self._ndarray.copy()) def _values_for_argsort(self): diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 1c4ab70fa9332d..3512d4e9e29db2 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1262,12 +1262,8 @@ def searchsorted(self, v, side="left", sorter=None): v, side, sorter ) - def copy(self, deep=False): - if deep: - values = self.sp_values.copy() - else: - values = self.sp_values - + def copy(self): + values = self.sp_values.copy() return self._simple_new(values, self.sp_index, self.dtype) @classmethod diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 49f657332bbbf3..777fa2eadd289f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -429,7 +429,9 @@ def __reduce__(self): @Appender(_index_shared_docs['copy']) def copy(self, deep=False, name=None): - array = self._data.copy(deep=deep) + array = self._data + if deep: + array = array.copy() attributes = self._get_attributes_dict() if name is not None: attributes.update(name=name) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0b2af9391784c5..db0eb44eabbfef 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2212,7 +2212,7 @@ def copy(self, deep=True): """ copy constructor """ values = self.values if deep: - values = values.copy(deep=True) + values = values.copy() return self.make_block_same_class(values) def get_values(self, dtype=None): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ecdf8a1f77b94e..96b4ab7f3fbc6c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -218,8 +218,10 @@ def init_dict(data, index, columns, dtype=None): arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies + arrays = [arr if not isinstance(arr, ABCIndexClass) else arr._data + for arr in arrays] arrays = [arr if not is_datetime64tz_dtype(arr) else - arr.copy(deep=True) for arr in arrays] + arr.copy() for arr in arrays] return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 3e3bae64440825..2e740c0acc465a 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -450,7 +450,9 @@ def copy(self, deep=True): """ # TODO: https://github.com/pandas-dev/pandas/issues/22314 # We skip the block manager till that is resolved. 
- new_data = self.values.copy(deep=deep) + new_data = self.values + if deep: + new_data = new_data.copy() return self._constructor(new_data, sparse_index=self.sp_index, fill_value=self.fill_value, index=self.index.copy(), diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 231b5a92dbb3ad..fbf86f66e437fc 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -591,9 +591,9 @@ def test_set_fill_invalid_non_scalar(self, val): with pytest.raises(ValueError, match=msg): arr.fill_value = val - def test_copy_shallow(self): - arr2 = self.arr.copy(deep=False) - assert arr2.sp_values is self.arr.sp_values + def test_copy(self): + arr2 = self.arr.copy() + assert arr2.sp_values is not self.arr.sp_values assert arr2.sp_index is self.arr.sp_index def test_values_asarray(self): diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index 2263f53544e417..0d6396033fac7c 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/bool.py @@ -108,11 +108,8 @@ def take(self, indices, allow_fill=False, fill_value=None): allow_fill=allow_fill) return self._from_sequence(result, dtype=self.dtype) - def copy(self, deep=False): - if deep: - return type(self)(copy.deepcopy(self._data)) - else: - return type(self)(copy.copy(self._data)) + def copy(self): + return type(self)(copy.copy(self._data)) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index a7f28310b7554e..21ce5e999334eb 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -17,8 +17,9 @@ def dtype(): @pytest.fixture def data(): - return ArrowBoolArray.from_scalars(np.random.randint(0, 2, size=100, - dtype=bool)) + values = np.random.randint(0, 2, size=100, dtype=bool) + values[1] = ~values[0] + return ArrowBoolArray.from_scalars(values) @pytest.fixture @@ -36,7 +37,9 @@ def test_array_type_with_arg(self, data, dtype): class TestInterface(BaseArrowTests, base.BaseInterfaceTests): - pass + def test_copy(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.copy() class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6388902e456272..fd47ae6f312907 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -66,3 +66,11 @@ def test_isna_extension_array(self, data_missing): assert not na.all() assert na.dtype._is_boolean + + def test_copy(self, data): + # GH#27083 removing deep keyword from EA.copy + assert data[0] != data[1] + result = data.copy() + + data[1] = data[0] + assert result[1] != result[0] diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index b6e839f250e4e9..6fbd43e46495f2 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -16,7 +16,7 @@ def data(): """Length-100 array for this type. 
* data[0] and data[1] should both be non missing - * data[0] and data[1] should not gbe equal + * data[0] and data[1] should not be equal """ raise NotImplementedError diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 3b95c8d919eb1a..2b1bb53e962bee 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -101,10 +101,8 @@ def take(self, indexer, allow_fill=False, fill_value=None): allow_fill=allow_fill) return self._from_sequence(result) - def copy(self, deep=False): - if deep: - return type(self)(self._data.copy()) - return type(self)(self) + def copy(self): + return type(self)(self._data.copy()) def astype(self, dtype, copy=True): if isinstance(dtype, type(self.dtype)): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4b93f0e12e32a9..1b5009830303bc 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -143,7 +143,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): return self._from_sequence(output) - def copy(self, deep=False): + def copy(self): return type(self)(self.data[:]) def astype(self, dtype, copy=True): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index faf1905ea1763a..86ca3e230ddd5d 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -98,6 +98,10 @@ class TestInterface(BaseSparseTests, base.BaseInterfaceTests): def test_no_values_attribute(self, data): pytest.skip("We have values") + def test_copy(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.copy() + class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): pass From 46cd7f0dd56a7189c46412cd873abc0735f38748 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 27 Jun 2019 16:23:02 -0500 Subject: [PATCH 070/238] Removed Panel Kludge from Pickle/Msgpack tests (#27082) --- doc/source/user_guide/io.rst | 15 ++--- doc/source/whatsnew/v0.25.0.rst | 5 ++ pandas/io/packers.py | 5 ++ pandas/io/pickle.py | 4 ++ .../0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack | Bin 4445 -> 0 bytes .../0.16.2_AMD64_windows_2.7.10.msgpack | Bin 4745 -> 0 bytes .../0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack | Bin 4745 -> 0 bytes .../0.16.2_x86_64_darwin_2.7.10.msgpack | Bin 6196 -> 0 bytes .../0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack | Bin 4745 -> 0 bytes .../0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack | Bin 6196 -> 0 bytes .../0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack | Bin 4684 -> 0 bytes .../0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack | Bin 4684 -> 0 bytes .../0.17.0_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes .../0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack | Bin 9300 -> 0 bytes .../0.17.0_x86_64_darwin_2.7.11.msgpack | Bin 10177 -> 0 bytes .../0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack | Bin 9300 -> 0 bytes .../0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack | Bin 10177 -> 0 bytes .../0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes .../0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes .../0.17.1_AMD64_windows_2.7.11.msgpack | Bin 10177 -> 0 bytes .../0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack | Bin 9300 -> 0 bytes .../0.17.1_x86_64_darwin_2.7.11.msgpack | Bin 11323 -> 0 bytes .../0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack | Bin 9300 -> 0 bytes .../0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack | Bin 10307 -> 0 bytes 
.../0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack | Bin 9300 -> 0 bytes .../0.18.0_AMD64_windows_2.7.11.msgpack | Bin 8386 -> 0 bytes .../0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack | Bin 8341 -> 0 bytes .../0.18.0_x86_64_darwin_2.7.11.msgpack | Bin 8386 -> 0 bytes .../0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack | Bin 8341 -> 0 bytes .../0.18.1_x86_64_darwin_2.7.12.msgpack | Bin 119258 -> 0 bytes .../0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack | Bin 119206 -> 0 bytes .../0.19.2_x86_64_darwin_2.7.12.msgpack | Bin 12325 -> 0 bytes .../0.20.3_x86_64_darwin_3.5.2.msgpack} | Bin 119196 -> 118654 bytes .../0.10.1/AMD64_windows_2.7.3.pickle | Bin 4381 -> 0 bytes .../0.10.1/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes .../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin 8978 -> 0 bytes .../0.11.0/x86_64_linux_2.7.3.pickle | Bin 4338 -> 0 bytes .../0.11.0/x86_64_linux_3.3.0.pickle | Bin 5822 -> 0 bytes .../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin 8692 -> 0 bytes .../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin 8768 -> 0 bytes .../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin 7208 -> 0 bytes .../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin 7143 -> 0 bytes .../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin 7123 -> 0 bytes .../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin 10019 -> 0 bytes .../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin 7278 -> 0 bytes .../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin 7445 -> 0 bytes .../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin 7278 -> 0 bytes .../0.13.0/0.13.0_x86_64_linux_2.7.8.pickle | Bin 7639 -> 0 bytes .../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin 10049 -> 0 bytes .../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin 8159 -> 0 bytes .../0.14.0/0.14.0_x86_64_linux_2.7.8.pickle | Bin 9309 -> 0 bytes .../0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle | Bin 191074 -> 0 bytes .../0.14.1/0.14.1_x86_64_linux_2.7.8.pickle | Bin 11930 -> 0 bytes .../0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle | Bin 127687 -> 0 bytes .../0.15.0/0.15.0_x86_64_linux_2.7.8.pickle | Bin 15162 -> 0 bytes .../0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle | Bin 14892 -> 0 bytes .../0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes .../0.16.2/0.16.2_AMD64_windows_2.7.10.pickle | Bin 15173 -> 0 bytes .../0.16.2/0.16.2_AMD64_windows_2.7.14.pickle | Bin 132692 -> 0 bytes .../0.16.2/0.16.2_AMD64_windows_3.4.3.pickle | Bin 13766 -> 0 bytes .../0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle | Bin 16598 -> 0 bytes .../0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle | Bin 15013 -> 0 bytes .../0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle | Bin 15444 -> 0 bytes .../0.16.2/0.16.2_x86_64_linux_2.7.10.pickle | Bin 14893 -> 0 bytes .../0.16.2/0.16.2_x86_64_linux_3.4.3.pickle | Bin 14116 -> 0 bytes .../0.17.0/0.17.0_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes .../0.17.0/0.17.0_AMD64_windows_3.4.4.pickle | Bin 16236 -> 0 bytes .../0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes .../0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle | Bin 16026 -> 0 bytes .../0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle | Bin 129175 -> 0 bytes .../0.17.0/0.17.0_x86_64_linux_2.7.11.pickle | Bin 18089 -> 0 bytes .../0.17.0/0.17.0_x86_64_linux_3.4.4.pickle | Bin 16581 -> 0 bytes .../0.17.0/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes .../0.17.1/0.17.1_AMD64_windows_2.7.11.pickle | Bin 18269 -> 0 bytes .../0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle | Bin 18089 -> 0 bytes .../0.18.0/0.18.0_AMD64_windows_2.7.11.pickle | Bin 16875 -> 0 bytes .../0.18.0/0.18.0_AMD64_windows_3.5.1.pickle | Bin 14674 -> 0 bytes 
.../0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle | Bin 16718 -> 0 bytes .../0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle | Bin 14671 -> 0 bytes .../0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle | Bin 127584 -> 0 bytes .../0.19.2/0.19.2_AMD64_windows_2.7.14.pickle | Bin 133468 -> 0 bytes .../0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle | Bin 127525 -> 0 bytes .../0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle | Bin 132762 -> 0 bytes .../0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle | Bin 126076 -> 0 bytes .../0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle | Bin 132857 -> 0 bytes .../0.20.3_x86_64_darwin_3.5.2.pickle} | Bin 127853 -> 127923 bytes .../tests/io/generate_legacy_storage_files.py | 45 ++++--------- pandas/tests/io/test_packers.py | 40 +---------- pandas/tests/io/test_pickle.py | 62 ++---------------- 90 files changed, 40 insertions(+), 136 deletions(-) delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.9.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_3.4.3.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_3.4.3.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_AMD64_windows_3.4.4.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_darwin_3.4.4.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_3.4.4.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.1_AMD64_windows_3.5.1.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_AMD64_windows_3.5.1.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_darwin_3.5.1.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.17.1/0.17.1_x86_64_linux_3.4.4.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack delete mode 100644 
pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack delete mode 100644 pandas/tests/io/data/legacy_msgpack/0.19.2/0.19.2_x86_64_darwin_2.7.12.msgpack rename pandas/tests/io/data/legacy_msgpack/{0.19.2/0.19.2_x86_64_darwin_3.6.1.msgpack => 0.20.3/0.20.3_x86_64_darwin_3.5.2.msgpack} (92%) delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.8.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_linux_2.7.8.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.15.2/0.15.2_x86_64_darwin_2.7.9.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.10.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_3.4.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_3.4.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_2.7.10.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_AMD64_windows_3.4.4.pickle delete mode 100644 
pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.1_AMD64_windows_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_AMD64_windows_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.17.1/0.17.1_x86_64_darwin_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.12.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle delete mode 100644 pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle rename pandas/tests/io/data/legacy_pickle/{0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle => 0.20.3/0.20.3_x86_64_darwin_3.5.2.pickle} (85%) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7caaec62c0a8a5..6b3edd92ab5a9e 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3319,16 +3319,7 @@ any pickled pandas object (or any other pickled object) from file: .. warning:: - Several internal refactoring have been done while still preserving - compatibility with pickles created with older versions of pandas. However, - for such cases, pickled ``DataFrames``, ``Series`` etc, must be read with - ``pd.read_pickle``, rather than ``pickle.load``. - - See `here `__ - and `here `__ - for some examples of compatibility-breaking changes. See - `this question `__ - for a detailed explanation. + :func:`read_pickle` is only guaranteed backwards compatible back to pandas version 0.20.3 .. _io.pickle.compression: @@ -3406,6 +3397,10 @@ both on the writing (serialization), and reading (deserialization). optimizations in the io of the ``msgpack`` data. Since this is marked as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. +.. warning:: + + :func:`read_msgpack` is only guaranteed backwards compatible back to pandas version 0.20.3 + .. ipython:: python df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ff84ceae417cd5..5a5de357e17780 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -13,6 +13,11 @@ What's New in 0.25.0 (April XX, 2019) `Panel` has been fully removed. For N-D labeled data structures, please use `xarray `_ +.. 
warning:: + + :func:`read_pickle` and :func:`read_msgpack` are only guaranteed backwards compatible back to + pandas version 0.20.3 (:issue:`27082`) + {{ header }} These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog diff --git a/pandas/io/packers.py b/pandas/io/packers.py index e3d45548e49789..4a273bfe2decb1 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -134,6 +134,11 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): Returns ------- obj : same type as object stored in file + + Notes + ----- + read_msgpack is only guaranteed to be backwards compatible to pandas + 0.20.3. """ path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index ce2ed247c158bf..afe1622d99eac3 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -116,6 +116,10 @@ def read_pickle(path, compression='infer'): read_sql : Read SQL query or database table into a DataFrame. read_parquet : Load a parquet object, returning a DataFrame. + Notes + ----- + read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3. + Examples -------- >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) diff --git a/pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.0/0.16.0_x86_64_darwin_2.7.9.msgpack deleted file mode 100644 index 554f8a6e0742ac7f253c0fb108195b411440f99d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4445 zcmdT{J8#oa6n4|}0TijJnm@n-10Ycz0|TT`Ayo)b#A8Knas!4q4!$mwf>c$MA_iWD zN(eDDO&lHt76wL&kQgX}35q&^SP+5%#8fu!@i{kfo0{4ckCNej&UNhj-S3?9otq&? z%cfZGnPN({*tt=wZ@idQbvEv#tdltwhiP?6=~9$upQh?xwGT zJ78q(n)WvW_dI@b32>zAz&pUNA3m?16u5==p8~uPZGAr3;vCJWoZs&di&~n_I)Otm zZL(wej)GYD0kDA^2VHgrEo+@cMh$QqIpaI-c4nLIw z2Es|RFebXG(BmA&3>-wor&lII(q|=;P<0UQ@V0_5Y zwO*Dwhq?w=dKC$z%%?PJh9eN8b#EsPJ!dk`$8o=E;x_?yH=1_P{@5xvi78=>jE|~F%ZP(P=HBmcK zYe&Rb=YV^1yI^El!8&1tOZkwjn-ilJC)SJAy2A+|D?{`f{LE)8K`mT_cc5spLv3Vk zZvD(3-RpLtTQwn=cp)MR7kzT34`OVC-QWydxHHFkdT|CW#0;f{lh#ab3HM7H*_4wu zMXn5jKMj_;Ge+_#zcS=#7Sp+xrMKC6s@#SuM|C|m$`7kKCepCv`aO3$_e>>cD?9r; zHJ~a>myy6SxGlQwHlR#hxv2JvtPe-~kM$Yw#`~_I2X!^89%3evz9Wts>JAVcE$c6+ zj_Ue(=XC$C?>8s#{&YW?&kk@}kM2$XneL(trRa9>RKFq{SyiIfR1!USyK;}7cdMif z!^f}i=Hep@M;6xXQ|NCmsDR!!9L(E<+d0-@ld-uhEI_5XCrcUhYhgsD)5~uNx_|wI z;j@sSWBn>+H z9z!A^3;nY{$+E? 
zsacj8Sa^?dI>i~F-5q^$?rNkiQf~1hGN}+{vyrGI=sux5xJ#@w8T_4JQ4F^i3D{)DxBvhE diff --git a/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_2.7.10.msgpack deleted file mode 100644 index 7f05cc605333d7b2fc4243bf21464efb3d190968..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4745 zcmds4OK;Oa5O&kFv`{2cHGhB$8~};(IB}Rj--I@7jzS$Hl zC7&TfCo37*Ag6DnHMO7f)wM}%rVZfg<~tn z1svNrwsXWV-0I64>(|;b0IxA@)dJBKEpsZwqBu99g<~tn1svNrwsTy_aS_K3j*B^> zMN|Fu=C(=5$9AynHrj-2T&HO{C+l?dCNM+|SuKnKweVtX$~q$F2Fdfz7p=1&9!pYH z%VsGtrmK`~KxolYl_8Qjg)WCge>o4D6c+uTM72Je2@k*HGtLIVjwHseK7Dqs=14iK zX)`Gooz-2oZx9FWCZyKi@zI~dz9>3Qi;FO4GJ}Oi&iKMv>M3n5Z7KDZdXNyyli~GT zba=R!l^i?KoG+WWSt-Oej!ROwxIgGtHmAvdDeQB0=i5k-JCmv+qK@+9i?`X-Y&ONs zShE>pV+*f>?SnZ(aUwsZ!YvlC1+quYvX&=h6kPqxBuU71@)ouR}kwz_zTma zU+V(?3}%#%R&=tzyV#iVD`t5>)`35~U?jrDu$UPJT1`+WoPi5_=J4PU&cFqmiJB@_ zR?(@!L@ll5Ggek-`7;LEHdY!Fb;DEAkMliMcx9`p%y@j;0A4gpJ zxd7VA9Xs6nR5>s2Cpx}-lbxmHtw0G1-yh`!V;`i^-$s7^d`SwQ3-P_ z6;{lG5;IDfTnuWPz9MO=6^WBzVFBj}<_mN*%c7QVBg>+t{ydjuWBnYDrk_kvE#rVr z2oG9lM|H_$gnG5lz#=&=N24I%Of?V55nP4QESrzy!1R?I?H2dQfHyg!+0~05hxQzI z(teuwexWNNNgKM)y#+k;Xz8jX!`Hj-p8#C;e%nkkflE7oXX~UZ8O^vYziQlgg?01Q zFm%JP28OjTtizS2b_^UO>3R`OL=e%-;>_OIoPVRe734h0jn1XZS$*tu(qoSiv-DD| zCFUGKCPW${igqVZO7Z6$#a|SGwKj+E1+_)7j`&WX_+DTa-$$$yT(~y#mt5@mPG#Y- Yaee=L4mYkXd?p4T9D_vG>W5tU0frG**#H0l diff --git a/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_AMD64_windows_3.4.3.msgpack deleted file mode 100644 index b10d175c1d68ff481a8baf39da089e9be2f351df..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4745 zcmd5=OHb5L6z&WIf@mTH{(u`@sEPPkxKL1HLW~jPV@t}+9j#8MV{cJJ6G9}Y3sDLr z#<*aZ=_sIXT(}ft;zEqB6f+Br8)I~#aiu%&x%YI+bXr;vA1wOa+i7Rc>-)~R3|SdD zO;Q6^GNsAJV9q$7x!;}A;-|Y*IW3=kdZ>In_gGQJRp-{=V%ITN|ZT{1~y; zh3jztqhCB{<5J(Zzo)6<`z(4=&XCb)byPUCMmRLRA40Ylf|Zwh2_602*aW>_lr{<% zB`Jc7C}I_23{eA;kssDv7_n&EZh zY%QMJSBGnsPUvu&SVdImO8U9hVidPtU+9bSc_PPxEiq@aSh^H#4`J4I=N9i zZ@?{b1 z3=cnkavgA@^Y}Z!uOB{dnrFD1_Fn+J7HfJw-zbc17A0}N9nIlpT1lO0!r-K;_$!GS zQb|U#ZQ_@L^rJ)ykz_aTZM|66DtK&0s>SzK(m1qbUKDWxUsAS`Wr1|%3RAKSd3%@f z){z%8h%bKJC4%7EaRk3NLi+)55jMWg5c6W#fn7U52XRT_b-jmO~yy194AUCsE`m;f|8`T_sGQ3 z=k7fgQ51{l52R|5?z#dUK+x97$ci=oW#nXW!umCGhtT5vAN{VJR1%x_=iejHU(SlE z^pb=>Q^rC|)yj%!Dc1i`MBR+F_bz#NGc$C=+RMbucWhVAcM$BoAOf=7b|%r2S-Lahb}@TSoI!R9dls2G`5_p46545Td0)Dv zD?lAjcgE-Js7Abxat(vGUc4MGKg#8ppv<-6i)FH8d&&Q&@c($ZAbN`@jdeMXq_IH4MryBReB0&ef4BGZHhSRc4f3afn12UNAR%W+}A&?5N{y|3F?RCt8+sAh;Rs bK)^~u91&o_j|gXadc_@BusdaivcThChs;;m diff --git a/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_darwin_2.7.10.msgpack deleted file mode 100644 index ed606295b0830c7db08e191b9b2037740ff5c52c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6196 zcmds5Piz!b7@uype~_B6wHxD2z34$PRwU7bZs|hO7_99$nh>H%x4W}K3x-4!kFFu%Knh^W7p& zSMvrwT+UYL#aCbn&qth*U?8+44NKDHJ4>C{dL=1c()R<(o0C<*uRlNiCtzahvjf*D z?)mQf&j4qK-~0ve_glX{bA#e5Wa2%*Pm_IDZuC|)q~dx-(adRf$~@`vT=r&`oL?IQ zDZ#OwW0GSB$4-u29J@Jg;>f}dF@RkN_wir7$e`ACRIN$x@i$4-u29J@Jg z;`k89%^Z6;KFpDMv@q;U?zqJAu^T))&wOIp_>$yM0U2cS9At=?NG(o5YT-rimi;y= zjNu=9uXQfnxhzRFJ)bAotk(#g0F$E-^doG+A+dj&hfE5K{R%PViKY(n`MnG?tJ+@2 zEuMd#^_H2NJrUP)sv_3HMo4+%J7djQZS7WD(n_>j33}|{uI?Tyn2MYJ1r;t?jBVJN z#H#2Tu9)xW@%-;;_MiFr$E)1m48v}YTQYXOprZ|&j+o22H$ihSVwOZkR_0MljyWBq 
zW@fM1?n<#80hF2FzxTPFh?v;W0?6~|OFyH>1$oNZXh!&|dNH=qH=}?56W8c^;_mU+ zV;k-Hd4O$S z+buS%^u}9HOd`^{qAKH9_Vcv0#r|(ZTy(P&+{IE;XMVVPOcF$_SMs{PnBj(V@`f#) zX|EGRQ%!54q!da^Nc7XFjZJ=@5>Q}pf2D|m474)VHPPj)@q@e@Y zU<)}%RXo}SDPaQQKg|gXeUPTPH?i{bEgANliUsF`WPTSYpk-O9>F6)*)|i(5^>Zd#{iGGuHV+yY^Pq)(C^VGG zaK4_W3fbw+R3{U$N7k>Do>_%%tv`r1mBF3+e3kwq?ROs=`MB;13|RdA_A=Y63vZ0R zg>%bGY3czerA)~RrksjavjRl-8dTHZwiV5@wWy}7-b?P^GUA$TgXU@)&95voZvIRO ziP&$me;w~jNz%*x$8G|Cbm_^hX^KDX`}Tdnr*6Ntm`<^w-G3gJ_4;%)=LX|CH&dnZ z_jx$5z6HHvENS5cjJ`^Fouz3~PG6xE0M%|2%5 zH^1-cyrX4PY~)%grCRL9Ow!PECgXfz+8SMOhB7MW^S$DUrRl5>UUX8{vuPIDWN@10JA5^39a#nC)JE<^6^=)5Vur4Hvy)F7rk4~MKv?b{_0$6U0Hpu zC}ks^=FD0sbFl&Ac}LeqSn4_+xq5foEe}RX*hsLl20#cAs3V zY#e;R*u;ZB{=DfOu$VTC#RM$`*V?XUY$M;UQMYnvIgy7+4d?0|Cgw?rvK4v?M+!#^ zy@ei>*714x=4{d7=ZSRw`HpBM5ZeiNC!BP~P`9n@n>^0i*tD)m)yX;idfQFSc2m@j zG}{p|wonwD7|vK)Mfo`$E-`=|NPFBa8d+AfE*YQL9Mcvj{@cG4Y)%RpL830_lVIZetTsXIgL3iO*v8LyC-C{&7@hkVWbSoV>G&U5MFg`n zi5(_P?gW+1njmTNf~4YLjRX%8IvlgG;aiWH48M(Ce~ZJie*T;cmp?^Db*%FyVL&U4w154+)7tRC$J=GRCM{pE|W40B}f$pnRa<{xj3;3NQ9Iswn?C(MwDE4mZ zzuR3gMftY-<}1J-fBSNOT;SiQf4&O%)tlps@fd!(@^^M!>WYUWcXYGlCQG84AAq47 zhJ!F1g5hiY(%kn0Kd|KbBpM5mMDNL-y*{1oQQmTDmSx62rph^eqC4qv$B;Q`4H0Fzlc*%7%Q6y7*?lF}g6X)MGIxWEC52&EDnAZgM{5h}Q9%hj5A$L&(D*WKAbFi25E zNQ;2dZX2npK#F6pX_FQqfy5Cbq#g(=r?$vKpkAs94j?Wq7hvAbjQ7V`cegdg)QA0c z*E=)cyzjj?Z)U+#3NoH~yH>=yqREG=GBQv@O2=c8{Y!A{MxI9q4+$ReG~A5`0%$^ zhJeqfFoL0|nAwXNWP-6uIW(EM7{BuqyopV5*(lYmA}ZkgoR!OK$e6F98B9(vE1K4% zfz>hlx;=RQNq7I2-LFvLF>btDsu0{e7rVu}oML&b9E}=`bbczYWe;XlR6zT&UbCOb z@dd|XwbxL78k2^!BqqS{_ZPeyg)tB|Pdz z_Wb}=&(fTLc7iOBL0Jm82Q?8Zflw#tS*#S|fB!Sj`Yhx+q{$h&Y`EggQN&V}87zA= zISIO#$IBDJN%Ox+$$_zQa=p~rTpO4k!Lj0vfgVo`950VWvO-G)v}GndAjo69p5SK| zZCzr_ntl+#>(`|}j}0d!>A8`kKLft|%_Cb<6o1(Ju_2sEoad!Q1SymNKteBB=BwJG! z&B_$gD_Q}w8{>5iZc5@!sUqH3#sM8;n8vC)oi`L#iaDi(m_L-P#I$mXPK+ACS*;** zXWRwN?6N5^fXr5lH>8jOv}u%4;5CBjcZa(XKq7_ui*LrPKfpQeD1ex8okV(0HZpcD z%@`%p5g&0WAQDB;JcGSFsJsnyg>}l*(@yn;FRGtC1l8BdzCBOwK*Yd$0lI3f!|v5K zn|p6On0c=bHmHp6*xP>c|EG^dBC#v|4wW74QYlSRx`N4(iacFB%yzy!H;Vcr;@S&R z@J{{daggum>3)`;EQC*N_5nd{k4=>AC_GGEUHv-|+R5qAV&6TTWrOg!bpH`T8)d~= z4O6zcKf0A1VNYc|CWGBeSLpAZbkm*1qyx)IxBfKT=}OfV)8V@b`y^}8SvMYd_D2+I zttn49jY2O_?nYls{q=X$kuJw>9D6BpquuX6{Bg)e9@k^rwC+=Dt?6q*g5><^?=Bvd z+MLi-XxY9t${TuSNvmtm6c^2&Ub81|#(K>dy(Soy@4D0Cai|-cQXF~ D`dh1> diff --git a/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack b/pandas/tests/io/data/legacy_msgpack/0.16.2/0.16.2_x86_64_linux_2.7.10.msgpack deleted file mode 100644 index 6bf1b9b9afaaafbd8feabf50954cabe53309b6e5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4684 zcmds4OHUI~6dp=>DJGQAKj20eY9gWw7g`=6A%=+Yu{8s|m61-T&J2oZLWl^u5M`9a z7#Eaw8WnZp!lf7!7h-g!7#A8h#^^%h3U}T!=iHV~J9IE;h+W+8&g0(s?shFqE5W80i;bCttg`vEk}$_{icD(ql=EMfoL>ck6d53MeD8nd76sicT32O*z3F`81fdULk#k6!d2PB4l#6M z$Z_5DLhi$35Poo&yAW80DXU5Qv>YE~uR~+EUcQ=?q^uT;8O)qd$24srLJJ`}*6SH- zC_Ia-n>)0ah{Hq&lcf$5%9|3+mTFD4rP@;+sa7`J&#<6Y2cePqZS4wS)~3 zI~LdEb!+xj9%pG}R#k-6DLCEswkj&Eil7y!v;usrqAchhjGIc%@^dO&Vh-A%T!xj_ z5-e?=(f+bKW=(_muYWaPohd{H@zR37a{A;~;JM33GderbJYL%4H?4H9tb_k>!kq|{ zVR2^|+-iXm;T@PbcTNos;T@RpJAOmOjw!lfa^>vP5>Y#*^Y|G9Z5zw>#cGYBF0_orzRmo)))bPYz0H#!0WUw>yd%u<-I2%V0k?cU zun-R6*13PA%cLvpk6im|$xUZ?HQxzCGYq?6Xn|oj?lgC>_ay6EE}|hHB6>sg?4{|f zkMfq36D&UZCsoew;@wHBGy06u>w&63t_7J9VTdTooj@r<&k4m}5rO$KhwpinMX`@q uCs3>x*hJqE`*Q|eJ>7SvwG?V&`^ z@syhEVSHNYq_q_JvTC0t8F_#CXChVFCepj@(%S{Gax{zSe7-Sq_%Suj>`(+(z0>AK zx*byDB$9^uzvTpT@ofj?WG7-H;g5d&JlFRtz56gH%gb{z-VQNa$gGU%+5)vn*3NgS 
zX5L6KJh4t2aMCBQ^yTFh$2w**HmeiI_x2<$twOI%=BZ^BeUqj*1@cZeHs0EttqA`+ggU-173cz zdPB2|Zx22^2e{_ro~h;-@@g*Uba zp+YR@GO@4gIP3OHX(qRup1}IAxs848A|&ZNI16@(0wQdN_yvFZX}aOnDn)7TGY$d9 zmxT*}4{qJ~4j9_7zV)?>`yV~N3OL$%_#NQa51-e)adA8EKMQyz-1O|tDs&^rzzcE( zvAhptAOWbCV^-f}Az`wUxwc}OnVJ#5C>e&yBt!@s2&06Jgi8pQ5-uZLPPl>)0}urR zbr)ZS6iR79B71QY6P<#N*aB`2{F{^sp97+oDCIgCp~h(-j1o2yE+Hg#`0ko)nUhvi zEIIId4>LlZ$?0$nRI4~HI)6D|Tj%$sqZCX6>2V|~JLwBT>4_w+r0dcnNeueDT&Kym z9%i{_Pc`qXp*W3%A|Wg&rxR+$^&_Un_|TZey)L7Oc)I?l>R3q6;ujqZPw{{VOi2kq zBq&!J?5!aAt||@ear@w^Tjt zh=J~9Z1Cpu7sHAIVw`t>u%@VP6)e}-%ndB)q*-$|TqX8pLpoM()AEE+Dy*nXspDQQ z<^+0%7%Q6kh+V^VgszN~9h*pa?;QLPXOvK>17Bs8WO~g7~>06m8;hYifJl-3_<_ zDXJPJaG(&Uk*bPAT*pN{bKpoSR*gieaw?8`0C7PT9H=;z8#6oa?QS-^>)j--+!7!5 zThDrD-n@A~-@Nf$C7nx>!iANjq><0{=1Do5SBR>v>QQZaRTs}>BvsYhN9a>6oh75} z$GVu*&gF^z1bc_I4i0>fQ$N(>DTSQXBO`i@KDM5Y8zbm(`fF)zOb~>d_wFX{bP7WE zwEP~!WLtcN;dkF${)b_-@A=_hD8Bu}jjtFkkG=mZ!@qz3v$sm|b#>-bhF`|JeyVor z@tl+;tL1b~n}d54P^4{x1NKQ-LQpyAC&EF90>%L2fFfW5unDjkumunm)jG;X+n6B;7ZBF#Bww$6Tt>cH z->Q>RMpD%C*V+22nv~7Ya*Hr#)nc~l;>S{Eiu~MpCvo@QO+nBTaw??~ZS|T;MF?bU4BT2=@M33=ipGO2BGxTRtwR*b}p(O5A`uOjx_dU7hG zrET8QmSqhPFhu6pYOyTmNLl+tcC6*j=eK_d<#1cgtD1`Qdz8bXLu(H|BD(3el1&!~ zjfRa86+EqW9sELw_{YAE*(^Ta(`CqIz1&lZ7ZxSWUU4q$tukXl4;P zVWe@@;Ai}GXX^9%4h+K_%lRGW*G#F=F{USn^l_;`qx>$Z%-M}syt8s3P7B3$GC&mH zi(wdisNFDt@vM}SP7wuPX`!rHI>GFOQ-jCUP`J5xvk3p^`-k{*i*2-?U_5G*shIKW z|6nYXS(%nn3>#7we6zwf14@-I5xrs{iMR)nh!sdI592R;*@wl#eAYd@7U~dIt7eGX zjuwK(nTHf6YclrogrqR)T{Lbe|7JJS*fPx+xzSS5PAi09$P2{iE2*@iYIK7*Dd&>h zkZJ}{gE5+JaVfk(_>rXsbc;aYFuWET)-5o?pqt=#G%OFd2(N(=>~Z(_R#iaK5H2wM zwfyhbT^)jOwEyC748QoXr>~ph#=GBt#_-uc-dykQKzYp?+@{Rvpu6GqaUA>KO?@Dl z3apQ%6}n|Q#4i2pa+qBP*yRX{*Y=iCTi7&t4Vb@z{-k>y+Z5PcL7<#v{ zXw16~ad7PbdaIcsodytd=L<25FBHr^_zhN}4h-?{J%9%Rkxw9hd=lE927C@s0PF&M z3h)`gUb711!$0(J7FJW!4QgN4jVwEEx3axVOZOs!1L5Q9UZ&m4v~ZGU)SZg7WLJmS zyI~$(;gJ>Q+?utXD=fdd=G@&aeC}T6Uc$`nr^yD5t#ns^enlat`sc9zh3v(?`(S6J zm-&{f*4~RQ9^Z|z8)G+icB;Tz*bO@|fz#XYo3ct?%8|^4$Bl0+#>A*u>AWv&{ve@1 z!hi$>2?i1fBqT^UkWlmhp+EwH1Oo{R5|Uo{T_4~JfQJD40S^NX03HDx1bh+jCBT;f zj{?2|_$uIQ!HLeI(}%oFweg6KQ*w_2uv>|o|J7j|bzX6dd5a5q3SYgejMDRiI;4r9 z6}-iW%|qCKHS8bs5JdD8MD!R$^qdDXyqEgaH0q}^tVLi(buW5)s0?35jM_Cx@39@x z>p2E-j{*qanFMy&BG*hI*pncZEXJBAo8V@YHvlOsiJa`miQW7zGs*oNuUQDA08&CD-d-J|;W;)F5zNJe`o9xXMht7JsM3{{%OhGQs}sXmDJrrlTlv2H5&KkH zHVk_}RmhmFm}j!2qz%7MM$7^|w^M96{I6H{OZbVwNgOZ%SO>_Tu{5BXJ>9+s(YwVL6v^H|e$W0e18dCvjH{fQHgpae0lX8~i;zdLW2d{C{@cZDkUl*q16=#=*GO{Y4 zAUfA+v7egr7_<6fCN)NsA!RgM*7UQ?^!cOt{(RrgbASus4zP|pCxPK()fs;2SXz@S zn?5JBC1CiNYj`gjxR^Iz1#x>hHUi$7;09etGHH6cqjW`g*qW|FPM#Br{xX`PjG(C- zn0W1X-6%aVS8PqsE7b9cnaYrg5uMBT3wU~{vVYJ0_eE>7BpvQL|2xA^e`xP)qqyd)8K+M2ntU(>|u2-MU4u4=p&uJwk5LGEVAEv2&M*PTpVQFn_*x=5UZ_HH)2 zR7`5x0e0wOhlA|U%?^jw(8RIHl9o&w#9XnR4ERg)0rvqu3D^;!WtF{!&wtLbNm7KhH_tn8 z_k%>KUgDnyz+HfifV%;KPFEThF{(~J;n}B{)XC|5zsKYGJ>2HdVdW0DcOvlWjq459 zS|q7$Tzi|LvZt=d@S86${>w1l`Apxh6yNyn`_CE94Ziam!+-wxt7DzwE5^h}3_q)D z{c*h|wC`hE>$ZZN__G$MlqGsfQ>Oi59}kMXJm~!BHlgcaSL*>A0Cxd40`3NU1aJ@F zqkxYAa-lue`V|+9OS)i;$Z1(O&U*b@u*++?)^PpC)vzwl&dMrDp9{{RX2Si)vIpo8 zt`CkoXA~JBt>D0PXY_c&o5~Z^Osqb>d5Xz{5;G8@)C<81h#3sA3D$2-@ZSaNR8;ee z85}Zql>t^ZkM~tpKdFagu4@J)kCVBKAq_0R%_x17{wF4quMLLS~7U^pT{H_!5 zS-=B;U4RDxy8#aY_5eNy_&neXfQJEJ1bhkbWkFiZJ)TaY&lYuZqDy3yd{JzaOxt=s zTxKWEaQXD;7++y0bOlJU=#1H*43Zer^NI;nGsrPZZ#uu=oS413663_Y9|(`2+DB?6 zv!-)1gcYa9`szZA`zF4`WknK|F%1!AwxMr! 
yYEm#9CtpPeq=>$)k-^XKox=sm5Oy;f$^1Y)uakd6!CFEsB)W^Ei8#M+l8(m zRnw{#4h)*E(ll{smS$*Y4jd`MS|y6$G+ONd;({hPFmWn3-pBv%xNe--S%T8h9Q?DN z?dSjdfBxQkStuqmaWZpxEv^{kW3zQiO=mTt>+5FFm|ZvJ^C?Bw&5lue*GQ(x7<<^1 zZ2ibo<`j$enIU>YY{J zVi=Ex*BE~F#kIc~1_z!V`I+MD-+cQS!`1P(e_{C7Z-4Y}QG8jS`;g(M;hyiey3KG# zNt5+LGGi>jISMG#w&5ZBph`keIp{ASKnene0Kp zQ>ekKeU5-utJ??1<)U}gHy1i0B(l2$xOe0YE~~?Bgs<$C1%*T3!Rulor7HVvtSww< z8YIrqic#Hv1x6PW*b2BacO#uFnH3U3M4fz(Oc{0!^KMsiflw|G&ILocAU(>K)y_<( zjHE4EqG{Fu2}5Rat>+4Ah7^ns)Y@!WZuQK;;Xv0-eZZ(8)ml=X8;mMjHDsc*@|6-`qvLNm+A5njf1 z&d~h#t^G9PQM+8Fl$ibp z&O(`$(NuZQSN)aYrcJq(UmXwz_ zYS99w&uV$*KT4BT8GB(;(HQm4@e|6w*~u)n%rZvKTZ+axjS$O*UKo8bk<@g9P7tTm zOk6lp)db2JqZJlc{X0Y@veblO;VB*Z*Fw{V1!fow6TC*patRCn8W_bM%aG=p{QyaW zzr^t8>OY_NbV<^Q!7INq{N%g7fnJK+Z+`s|!zX`#b+fk%<+aw}c2!0PkM*yQQ`mn! z^?^ywvp!bV=(Ol4I}EbJF?JYYhvO(-^fkV=uxs$htNl^Pt2lweuJ?|-NogdTE?POz z_iSUCkGlbRaN{sWt5qSL4iGBO7eXbzFk|&0PH?5^a7mypNCb|bd&nmEna0O>|SKc*uWVJ*cz?l@@l=__AG&F4;98 z_ikHTSGZ+`HMi8$?JnnunJF5vnXvKKq+ zWXpHg#FVSr+>07d%*NOWpN(CZo)InVgzcEX_Ex*^Nm((gWJv1r!^St3Lvqks>AWxP z{vn}2!hi$>2?i1fBqT^UkWlmip+EwH1Oo{R5|Vy+ZUFEZz@vbJfX4ub0FMI>13nA* z9N_bSCjegnd=c;^??h+W8AEQa+I&REDY?r6aJeON@mGf$)_LV|7A^21CwghjnnvSPhHDX6QOhrSx~L3aMhx0D zNzbtz(d|AaahC-UF*ETTusNZvYC7{6MfKZ8aemM(;Z)l(tFRj*qV$1xVr&X*0L3qt zmnv=DEeCD!y6SK-j8jC6Q$&nY#J%w7%g5MDb!O({r8lt?{rs3HUr3K31y0Oij~qe>B~2;%30P_&KLt*PyGcQ=p* zq^N3?!hu3IjZ{?};@Imn3FX3pBSA=wL=c=xq8>n8Pz47nPT|IzeKYpk^{(yMZjw@b z*l&03x1M?Py>H&kPG@yRiz@o1ba*Uz>H4S|Pil&;uVxZ*Tv@fEiLf$0U2r>WoKGrQ z_0+q{s9~Lr$+~V0!7npm*^uW_{i%V}@zh|dKh1u}pJCZU79%lLHu`$`nC$EJU)zJ$ zIwj}l?#p-`;@E`P9D;|U)mA(@u7tN|HnS&Z!SWEC=S>%^2;kob5$h1^5#6piW<&IB zZXHKRfhp@fIX0&J)G~GLhaWFXQdW&bbj4U*FeX;vc}qq#uIPpwPo^WPx*ih#3LRdE zM@xIl=h|V=^2xE75zRX~X9nubK)o5PGlT5dU_rQbGp#0+wDGDG{UMmSK91NGvh*`@QZefqtf-;Hbzv$otryMTvv&S2k-L{qKqemm z=QHf!{Eu)n9^f|2)bZePhVe{)P@)f!h;l4U^kO0`YnptXtNIdCb@>ELcSas(394FG ze8!6lpeB+Cp4_mhF%p*(@@YlmTNUEV6%W3Yy93&q#P|j(i|Yx6z*pu2@#3i0o$L?g@#J^!8re>Tt% ztSIHrSVOF-SdjE9(|oVi3b#`#)k4Zo3`K|#xj@g!^{PvK&jqp86SI9y^qLg1y)Xe< z!@wrHJv!^dbjOBFcvEZgP`U%^^AypMnlwp${>Rr^XITm};VcSE@1Ev!oZu(4hzMyB z5z-^=g>Gp8b?Uu38{royS3cn3|6=}#fSVIYY0J=rz%y5PwhyBE2@E*Fv3mpI#$%;dM9S(};%=dk~Kx_97ld>_dD8@ma*@ v5RV}~kN5)Oi`YAyt*emZd+GgD6*Mn!W0whDY$0t*^bzwA8^PYH6zcH5h>M%d diff --git a/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.17.0/0.17.0_x86_64_linux_2.7.11.msgpack deleted file mode 100644 index 16706a2229384b318f6405b328bc84d9444ef720..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10177 zcmeHNO>Em#9CtpPeq=>$)k-^XKox=sm5Oy;f$^1Y)uakd6!CFEsB)W^Ei8#M+l8(m zRnw{#4h)*E(ll{smS$*Y4jd`MS|y6$G+ONd;({hPFmWn3-pBv%xNe--S%T8h9Q?DN z?dSjdfBxQkStuqmaWZpxEv^{kW3zQiO=mTt>+5FFm|ZvJ^C?Bw&5lue*GQ(x7<<^1 zZ2ibo<`j$enIU>YY{J zVi=Ex*BE~F#kIc~1_z!V`I+MD-+cQS!`1P(e_{C7Z-4Y}QG8jS`;g(M;hyiey3KG# zNt5+LGGi>jISMG#w&5ZBph`keIp{ASKnene0Kp zQ>ekKeU5-utJ??1<)U}gHy1i0B(l2$xOe0YE~~?Bgs<$C1%*T3!Rulor7HVvtSww< z8YIrqic#Hv1x6PW*b2BacO#uFnH3U3M4fz(Oc{0!^KMsiflw|G&ILocAU(>K)y_<( zjHE4EqG{Fu2}5Rat>+4Ah7^ns)Y@!WZuQK;;Xv0-eZZ(8)ml=X8;mMjHDsc*@|6-`qvLNm+A5njf1 z&d~h#t^G9PQM+8Fl$ibp z&O(`$(NuZQSN)aYrcJq(UmXwz_ zYS99w&uV$*KT4BT8GB(;(HQm4@e|6w*~u)n%rZvKTZ+axjS$O*UKo8bk<@g9P7tTm zOk6lp)db2JqZJlc{X0Y@veblO;VB*Z*Fw{V1!fow6TC*patRCn8W_bM%aG=p{QyaW zzr^t8>OY_NbV<^Q!7INq{N%g7fnJK+Z+`s|!zX`#b+fk%<+aw}c2!0PkM*yQQ`mn! 
z^?^ywvp!bV=(Ol4I}EbJF?JYYhvO(-^fkV=uxs$htNl^Pt2lweuJ?|-NogdTE?POz z_iSUCkGlbRaN{sWt5qSL4iGBO7eXbzFk|&0PH?5^a7mypNCb|bd&nmEna0O>|SKc*uWVJ*cz?l@@l=__AG&F4;98 z_ikHTSGZ+`HMi8$?JnnunJF5vnXvKKq+ zWXpHg#FVSr+>07d%*NOWpN(CZo)InVgzcEX_Ex*^Nm((gWJv1r!^St3Lvqks>AWxP z{vn}2!hi$>2?i1fBqT^UkWlmip+EwH1Oo{R5|Vy+ZUFEZz@vbJfX4ub0FMI>13nA* z9N_bSCjegnd=c;^??h+W8AEQa+I&REDY?r6aJeON@mGf$)_LV|7A^21CwghjnnvSPhHDX6QOhrSx~L3aMhx0D zNzbtz(d|AaahC-UF*ETTusNZvYC7{6MfKZ8aemM(;Z)l(tFRj*qV$1xVr&X*0L3qt zmnv=DEeCD!y6SK-j8jC6Q$&nY#J%wEm_7ljFF)k-^Xm?{JlDi!Ox0^={;s!0`^DB|aWP~|4h7M8@B?Lt?O zs%cdR2L^Q)X_`1RO`NVPMM$ee5u8S+9Y9>r1P3Ng<;M5iqxjDsfAy>rd_|r5h~j5WojgkyLfX+5y=F`9v59QNS2r6JRr73*a8WR=~Z0YzIUq+CabcHY1ghMJbJ zHg9|&X2zvo+NZDn^z$V_sK}|5Drqa!sY@wYUIRM`9K|hU)1jn2IiAtdrlsgMmb3KI z7PrrqU^n0s zfKLJ*1biB>2e23LS-?YpeSn7n`vH#t4gfv}_&neXfJXse1bhkb<#?H>nW)dy^K6XT zW-*r(6-7M9vete9KWJ5~HSE?c=aT7c#In(nnW;v%>i4=aOsqNxN3%MrN~!_9a<$%y zqqb}5G)kE^s&>hBcAe|&k@}r|)>m`r?6i=zf6&!;6A`d0UrZ&M3C7(pR{rk z&1cfX*6`D&A!u+~e2$C)Y7w?~EI_)l^JPUk)rT9a(RN+YOJkx!8!L7~x+{7zi7S(| z-_mJ6$X4iyxpH=Vg00XKvf{|G(61S(w4!QcsFRR$Nz2$qN+IpsjIHusA~PQG1YI?& zZ@qS2e=w?VZUw<5Jw4WJQ-oPPHKtmgaIqa023QSu$Jq^7?IMdIJ7Kkq)iGAX0{jly z>i}G1_1x2-s+G-GAjyXJm&-T^a`>rb3JjM<_w`(lxy-9M^+??wGs~TRVQ#bvapo21 z4QOrjo`#l@9+HNvBDtl0wVX)9OkQi`iPzgw8G=nWN7%NS*+W>@oFj1UTpZ#!wO#7J z=l}bnvt1C5_Fej&;-^1!_jVDy{noc1Q+(>r*Vek)Yf6pT%8|`o4JJsv{JJ@6^_Ll% zvf#2F_m{O$CK>7wJ@nDTVS4DNha)vb!-VW7T2=2Ho{4IYq<6UpgV4lXE~@FeNM?Sn zyURrh`^;m=-j2H`lzBrbev4&uu9twG?ol+{cY)?xhw$0E|2V6g1WPs{$_V}t2x xCA=sM%YY!Jr^A?ra@c)0yld$0k-)-5Z#bi)4cs7%Q6kh+V^VgszN~9h*pa?;QLPXOvK>17Bs8WO~g7~>06m8;hYifJl-3_<_ zDXJPJaG(&Uk*bPAT*pN{bKpoSR*gieaw?8`0C7PT9H=;z8#6oa?QS-^>)j--+!7!5 zThDrD-n@A~-@Nf$C7nx>!iANjq><0{=1Do5SBR>v>QQZaRTs}>BvsYhN9a>6oh75} z$GVu*&gF^z1bc_I4i0>fQ$N(>DTSQXBO`i@KDM5Y8zbm(`fF)zOb~>d_wFX{bP7WE zwEP~!WLtcN;dkF${)b_-@A=_hD8Bu}jjtFkkG=mZ!@qz3v$sm|b#>-bhF`|JeyVor z@tl+;tL1b~n}d54P^4{x1NKQ-LQpyAC&EF90>%L2fFfW5unDjkumunm)jG;X+n6B;7ZBF#Bww$6Tt>cH z->Q>RMpD%C*V+22nv~7Ya*Hr#)nc~l;>S{Eiu~MpCvo@QO+nBTaw??~ZS|T;MF?bU4BT2=@M33=ipGO2BGxTRtwR*b}p(O5A`uOjx_dU7hG zrET8QmSqhPFhu6pYOyTmNLl+tcC6*j=eK_d<#1cgtD1`Qdz8bXLu(H|BD(3el1&!~ zjfRa86+EqW9sELw_{YAE*(^Ta(`CqIz1&lZ7ZxSWUU4q$tukXl4;P zVWe@@;Ai}GXX^9%4h+K_%lRGW*G#F=F{USn^l_;`qx>$Z%-M}syt8s3P7B3$GC&mH zi(wdisNFDt@vM}SP7wuPX`!rHI>GFOQ-jCUP`J5xvk3p^`-k{*i*2-?U_5G*shIKW z|6nYXS(%nn3>#7we6zwf14@-I5xrs{iMR)nh!sdI592R;*@wl#eAYd@7U~dIt7eGX zjuwK(nTHf6YclrogrqR)T{Lbe|7JJS*fPx+xzSS5PAi09$P2{iE2*@iYIK7*Dd&>h zkZJ}{gE5+JaVfk(_>rXsbc;aYFuWET)-5o?pqt=#G%OFd2(N(=>~Z(_R#iaK5H2wM zwfyhbT^)jOwEyC748QoXr>~ph#=GBt#_-uc-dykQKzYp?+@{Rvpu6GqaUA>KO?@Dl z3apQ%6}n|Q#4i2pa+qBP*yRX{*Y=iCTi7&t4Vb@z{-k>y+Z5PcL7<#v{ zXw16~ad7PbdaIcsodytd=L<25FBHr^_zhN}4h-?{J%9%Rkxw9hd=lE927C@s0PF&M z3h)`gUb711!$0(J7FJW!4QgN4jVwEEx3axVOZOs!1L5Q9UZ&m4v~ZGU)SZg7WLJmS zyI~$(;gJ>Q+?utXD=fdd=G@&aeC}T6Uc$`nr^yD5t#ns^enlat`sc9zh3v(?`(S6J zm-&{f*4~RQ9^Z|z8)G+icB;Tz*bO@|fz#XYo3ct?%8|^4$Bl0+#>A*u>AWv&{ve@1 z!hi$>2?i1fBqT^UkWlmhp+EwH1Oo{R5|Uo{T_4~JfQJD40S^NX03HDx1bh+jCBT;f zj{?2|_$uIQ!HLeI(}%oFweg6KQ*w_2uv>|o|J7j|bzX6dd5a5q3SYgejMDRiI;4r9 z6}-iW%|qCKHS8bs5JdD8MD!R$^qdDXyqEgaH0q}^tVLi(buW5)s0?35jM_Cx@39@x z>p2E-j{*qanFMy&BG*hI*pncZEXaM z(zF3xSmjcZrg6bAbB9u_8y7B1V^R_^x~eo?Xxx}a7dEbN=Xw19Jny`?7X+2XeD|FD zI_Lkt|9P2SqT^~rHR$h| zttfU}jgiQOlr|x6x-k~f$IpyL)tGvU7#T65vomQnOQ+K@EgbdP8hbdg!!fhB=i0a) zjT?m0RsP1Mn5G(rdd{Y&)i{|Aq&3x4*A6Q9=Ys=3pdB_zlup}cA}Xc!a7-f!`S6TB z_AVJW3!l#3$Sz37)F@dMPo7KmCkK-K^7iW|BD!i0@YQF;7U|jpd@taGm*JpKViRC9 zU>o2;z()ZO0d@dB2KYGOVZbK=I{~`@p9VYv*bR6Tum|uMU@zb^fX@Ox2Y4LtdB7I{ zU+k08TMBVe(7Y=Ux$e92gH)4M78tIjt^{$3B^O(vOwoT 
z^RAd*zlGKBh%N4jZQ*r9||>Thz?4m6}We)aj4KRNol zp6dUR;cGW`#$>-=C6XC_l(CZy|f zh98KTvl9t9!w=Yu9WvO`ra3eBjMEf;UXZnha0@ighS%aTs4RQFcw8ymTK=Ha;xaQD z^#qrs$Yw1t@-4H4yocB&HA;mtJZBbC|EPD=EAphdnjU?@-BEV1T-VtL z=5|_KN)+b^6@8n}Yrcy7-1|^CUt1_-!O=EP8-&zSa$k0*V#zSp)^gWbn)QIr)yimV z57HUMi}c2E6*{}qDcX)}oeoK}&F0w!?E-GQzc2jtSzC*u9PhsLGslm=>F8=_c=wI3 zKjiqtudl4Px5&+AlkeipT=oODtX-^kF&o16m&@H-$wB1w%B!Nen!o$2X_lrQ;XmE{ z=P3W_;XlXLQN_OMJ>`~*SHJuIlA@&8jq81Iqm))jZVgiMJdK+dJ>kLRP_log!t)+$ zeB^R?hn9J+%9ZLX7pt)WQSLnEbXA?8=LCRQCj#`g_oQvnL@uA4{U6P$ek8(XIffLM zer~poT;_E#BR{w8wHBjuGReO$!$$Ky0gFbbG+T5aNHsJxOrUCQ%kqYt zlNfhOn{{o!wBkfxhT_2SqP7cmRW@lZqg}qe+S=H>qgFv{_4fnw5=p6%(y!_j7L8Kp zSUyLR=|hEOZ+`+K{+FX9J9pmMy!&E6d?WnlH|)7CH>X&W&Us(*0|amVi>|*+Ot<#! VVczluEMJ4=4_JQo8{9Tx{telmo6G7%Q6kh+V^VgszN~9h*pa?;QLPXOvK>17Bs8WO~g7~>06m8;hYifJl-3_<_ zDXJPJaG(&Uk*bPAT*pN{bKpoSR*gieaw?8`0C7PT9H=;z8#6oa?QS-^>)j--+!7!5 zThDrD-n@A~-@Nf$C7nx>!iANjq><0{=1Do5SBR>v>QQZaRTs}>BvsYhN9a>6oh75} z$GVu*&gF^z1bc_I4i0>fQ$N(>DTSQXBO`i@KDM5Y8zbm(`fF)zOb~>d_wFX{bP7WE zwEP~!WLtcN;dkF${)b_-@A=_hD8Bu}jjtFkkG=mZ!@qz3v$sm|b#>-bhF`|JeyVor z@tl+;tL1b~n}d54P^4{x1NKQ-LQpyAC&EF90>%L2fFfW5unDjkumunm)jG;X+n6B;7ZBF#Bww$6Tt>cH z->Q>RMpD%C*V+22nv~7Ya*Hr#)nc~l;>S{Eiu~MpCvo@QO+nBTaw??~ZS|T;MF?bU4BT2=@M33=ipGO2BGxTRtwR*b}p(O5A`uOjx_dU7hG zrET8QmSqhPFhu6pYOyTmNLl+tcC6*j=eK_d<#1cgtD1`Qdz8bXLu(H|BD(3el1&!~ zjfRa86+EqW9sELw_{YAE*(^Ta(`CqIz1&lZ7ZxSWUU4q$tukXl4;P zVWe@@;Ai}GXX^9%4h+K_%lRGW*G#F=F{USn^l_;`qx>$Z%-M}syt8s3P7B3$GC&mH zi(wdisNFDt@vM}SP7wuPX`!rHI>GFOQ-jCUP`J5xvk3p^`-k{*i*2-?U_5G*shIKW z|6nYXS(%nn3>#7we6zwf14@-I5xrs{iMR)nh!sdI592R;*@wl#eAYd@7U~dIt7eGX zjuwK(nTHf6YclrogrqR)T{Lbe|7JJS*fPx+xzSS5PAi09$P2{iE2*@iYIK7*Dd&>h zkZJ}{gE5+JaVfk(_>rXsbc;aYFuWET)-5o?pqt=#G%OFd2(N(=>~Z(_R#iaK5H2wM zwfyhbT^)jOwEyC748QoXr>~ph#=GBt#_-uc-dykQKzYp?+@{Rvpu6GqaUA>KO?@Dl z3apQ%6}n|Q#4i2pa+qBP*yRX{*Y=iCTi7&t4Vb@z{-k>y+Z5PcL7<#v{ zXw16~ad7PbdaIcsodytd=L<25FBHr^_zhN}4h-?{J%9%Rkxw9hd=lE927C@s0PF&M z3h)`gUb711!$0(J7FJW!4QgN4jVwEEx3axVOZOs!1L5Q9UZ&m4v~ZGU)SZg7WLJmS zyI~$(;gJ>Q+?utXD=fdd=G@&aeC}T6Uc$`nr^yD5t#ns^enlat`sc9zh3v(?`(S6J zm-&{f*4~RQ9^Z|z8)G+icB;Tz*bO@|fz#XYo3ct?%8|^4$Bl0+#>A*u>AWv&{ve@1 z!hi$>2?i1fBqT^UkWlmhp+EwH1Oo{R5|Uo{T_4~JfQJD40S^NX03HDx1bh+jCBT;f zj{?2|_$uIQ!HLeI(}%oFweg6KQ*w_2uv>|o|J7j|bzX6dd5a5q3SYgejMDRiI;4r9 z6}-iW%|qCKHS8bs5JdD8MD!R$^qdDXyqEgaH0q}^tVLi(buW5)s0?35jM_Cx@39@x z>p2E-j{*qanFMy&BG*hI*pncZEXaM z(zF3xSmjcZrg6bAbB9u_8y7B1V^R_^x~eo?Xxx}a7dEbN=Xw19Jny`?7X+2XeD|FD zI_Lkt|9P2SqT^~rHR$h| zttfU}jgiQOlr|x6x-k~f$IpyL)tGvU7#T65vomQnOQ+K@EgbdP8hbdg!!fhB=i0a) zjT?m0RsP1Mn5G(rdd{Y&)i{|Aq&3x4*A6Q9=Ys=3pdB_zlup}cA}Xc!a7-f!`S6TB z_AVJW3!l#3$Sz37)F@dMPo7KmCkK-K^7iW|BD!i0@YQF;7U|jpd@taGm*JpKViRC9 zU>o2;z()ZO0d@dB2KYGOVZbK=I{~`@p9VYv*bR6Tum|uMU@zb^fX@Ox2Y4LtdB7I{ zU+k08TMBVe(7Y=Ux$e92gH)4M78tIjt^{$3B^O(vOwoT z^RAd*zlGKBh%N4jZQ*r9||>Thz?4m6}We)aj4KRNol zp6dUR;cGW`#$>-=C6XC_l(CZy|f zh98KTvl9t9!w=Yu9WvO`ra3eBjMEf;UXZnha0@ighS%aTs4RQFcw8ymTK=Ha;xaQD z^#qrs$Yw1t@-4H4yocB&HA;mtJZBbC|EPD=EAphdnjU?@-BEV1T-VtL z=5|_KN)+b^6@8n}Yrcy7-1|^CUt1_-!O=EP8-&zSa$k0*V#zSp)^gWbn)QIr)yimV z57HUMi}c2E6*{}qDcX)}oeoK}&F0w!?E-GQzc2jtSzC*u9PhsLGslm=>F8=_c=wI3 zKjiqtudl4Px5&+AlkeipT=oODtX-^kF&o16m&@H-$wB1w%B!Nen!o$2X_lrQ;XmE{ z=P3W_;XlXLQN_OMJ>`~*SHJuIlA@&8jq81Iqm))jZVgiMJdK+dJ>kLRP_log!t)+$ zeB^R?hn9J+%9ZLX7pt)WQSLnEbXA?8=LCRQCj#`g_oQvnL@uA4{U6P$ek8(XIffLM zer~poT;_E#BR{w8wHBjuGReO$!$$Ky0gFbbG+T5aNHsJxOrUCQ%kqYt zlNfhOn{{o!wBkfxhT_2SqP7cmRW@lZqg}qe+S=H>qgFv{_4fnw5=p6%(y!_j7L8Kp zSUyLR=|hEOZ+`+K{+FX9J9pmMy!&E6d?WnlH|)7CH>X&W&Us(*0|amVi>|*+Ot<#! 
VVczluEMJ4=4_JQo8{9Tx{telmo6G7%Q6dwPqr;)0PL(;^p%9R60idYqiA~=eb#E{cOQsZ6)uqX3LJjIpZRVCPosP=1s&#~Z zM~lRiQTT~2g|xFtMSlW*3AKm&--@em>+Wz$Iiov9bQk@&@aZ^vj_#)4mcN}KPKV1o zMhJm2K?@#ni>5eEn26zCG@7AOhy z2y7DQ71%5g&Dk;v{ouA_wxS)QcD$dKG!>V&-qN>P=j(*PO0Z^0yk7mjwNR~o7h;J^R>G4t{k0Ger&tz zx&QDsA-X3K4y%fG>$*yxq^K8Q4ln53I2)VG0Yw#=t5VT2GB|g9tyXd#K_Jo>u395}FcIjj`UI5*eU`4H?WtqEBPb2t{HJ zLvS{KnGLjy=?On2Ka?(7@WSY9SVF zr}3B^mrp7w{7Lf&O+bzT?YM&4<{k2iqqVNG|J)zopXbtO>6T-lP}ec9;8q`V?oo}J zCtjvHhc|B<`^-yj*>3Wp{LQj?!+SH?^l>-txU=(NSHG01Syl@GwJ4{itH)(^Ks*mvJ6Y)@;QeK1F=M%!P;q#7S! zHqNHf_prT?_w7|?l?%)!jrQ+7OlX0?f&zQhc%e%%(#KU=c)1F~>mqA&nl=RGA~u*u z+%N46@TzWsJpxgfp|0O0>h}oTCy)qi7uY3mufX0)gq&UtU(3$Tref<&rbWh!IlHY# z2)zW4kgwZ%ge`1v5>N(c!3&**)qC{o+j@jq=zpU}GIS@q(ILjXb%`MbkFQagl-;_# z3M!l8=ZEgMeEvq?FwM9)8d+HAokIrE$U$Ego@c-N>Kf^wXec3S#r z*-glU&&iW(#OrVG+||{+XK&BG-u=%U=sVbd=9Sb_mT^?}rw(RMNX^bJjGuXXmi~i|-QJQk2J|kW<*MD}&5qKeuto<5^L{0o z1cO59{mKPL5vM1ZE*Yc0gff9GJBAo8V@YHvlOsiJa`miQW7zGY9T#nyqHE0HlFb2_s#ddJ6~sZc9>ZTp}L28 z=4a=ddH>({nJF`&%4sq+pH5B|^z+&4!zpFt>`+?H$ft=~l*fo>OtA}QG7~3b=Gm02 zY36{nAE&2DNf~~RjOfLvZ~7GijGQI8h`lyHV?-M4Q>FLi)F{zb58+*acVB^>UXD$G&At4I3kmjWJ+a`j zx(vx>y*7{uENKMZF{Gq;dlZ(b1FXko1>G*DlVc=46d%pj?#{E!o%1L1{cgLvd}fKN zr6(r1vtVk$XnvyD1{eLy<+_)Hn{)5D?&idU=eppj651b+#k3IIru~)O@fB7=7nw~oT~23BE1Un^GI{mKpDsyKNl7F$qOTO_jcdJdBi9y`*8(jZwI?+1 z`YcxWolRxA;0{os$p~AUHqc7t$OPZtlJ!>yUAN;2XQ1OTI-A$Am=FrvQpIMhD&`8A zU9Mn(&F&#a_y%nCAlj;Ido&LCK%FZC)VVXj>R5RhzJndu!Crpc0eA=yJGN#xeiZO= zz^4Euz&60g0G|NtbaZ&8)%nz(BrapuV@oeI^1GKP{_lCJOkO$n+!Zz@u9w3ha2p_5 zKTK`>NGdOf$6|^Ln*4M5|6FIG}T* zWBd%;p=azssYC@)Mo3K(ZJNI8G;7J>m`79T4eKKJXCzK6vPPKT{d6Nd-(tJMC}Y-$ z)ESX_BU)!f=~ieuKqe+J6P6cdHXQg!Q=N^#%GTo#uP)#JrCkYfz%3;3=I%-jez?An*jF%I;H;@$%L|2MN+-Y{0Ky3 z`H!4$;p%5Ex;6Dn((B)T_t`CKj=90Ne_{C7Z+~>IQ+!#Q_>keJ^=&__w-ysBIp^-q zAS$e`z(}Jvgh@=VJ2EBwC2!705PNxF{BVNq!8Vq z#TN>CKdysi$tzU(NOh6*oMf+cuB9wx1C+Scw0_C=#a4TH(7G%%+V>6u^$^t!fQ^9r o0Gj~!13m1 zXYTzT=X*?6n4Zce&4nv0o-`cuW7a-vWpa7bwzpW+S=?g!g|uPYtb2mrby68~Qv5a3 zlg_1_$sQ4N#OUbAJ6Zc(7N5(T7g=P2#rW;w-D&v>`37TqO4GEPckd)__i9@IqV<-* zWLNx4fnR-b{V##hp{Gy%#PRiSzWq$#=G5Ci3;grf-v@U&zHBdjDDcyG-}k${ES@zo z<`#>kvd#)FXoY-t&ge+#uEajoD)>V9FQPz;62=JQggRk@u#K>tu!9hi>YNmBRAQ21 zPy%L&oS9Eq$<@FMjMIvA!USO(VLM?5VJBf1;UU6qLM*UrN-jjLkTiByY3z_%Bk8Nn zo;A`&-oDH_gpO^;$mEulRmhDMN16VLu_7+vd`iA>6~eMYWZCuh5pp6t)u{9%H(kS==Sbw}#X0mm!&^=5;z(3_i{jN}nOO zdw+@BJhx!w#|N4rLy4`-4Fr3uAI#oQg@Ji9mYmN)x?X1RbsH8r#UkS@a#}Je8<*6FIIKO~Vc`9N7`9f!)5Tehe^0p&_;;fZTa$?W((6S=K^)gG$ zq^;Qt_L@+mV`glxIs5IcK6yn9#fr@{!SY z=C%UB(OlOR&an8(R&mFuVFwO{Lp=98JwH9S1e#y}NrRw(&kE#1Si-2FA5m{H6 zy%fXE|HEQ9O}ge%sFz&Us>l0~%(AQ0#E)QF4x(^|S zaXhv{+7ns~AiFSPz+4ceUt0g45oXQmtG*aKV=OY=IiEK<>5)Ri1leG;D)M62OX?26 z>&TF+<;Ee6Wc!HgEMASnYGRe0CsuTx1@{^11l8q{F?Bsac$5&%1wwt4#vdnqicll$ zBYceT3Bti3&dBNu-9d!ive8=>EUuwl3&BH8FtKLRaiCBfLJTcS#~S_Mthfe=`Ot9) z5lrBbjf_Z$BH)%_f%%Wkzd!Hm(X^ApSAP-s$#(-o{T%n+{Q4t-PyY7mPJa*9>-M+H z@-k077(yVY@q$JQ1mQKXKsI=1-m;F#L-@c$UgSMWUmqy@Bv3~yWY=w)2c`kp8E64s%J7w^-Tt<<}wCW6mb&MJKZ zeM&eDtB|OK=%?wEyt&^5Iu2UItw&1Sr1*tVlRvl#5bgqm+W_G{Xv{XL+H0Q7P^BSs zRW5gYWQNJWs1l-k#5mCJKJpcx(opqB0=sR2L0mN~Eubqk?h}Xig?_|S`-@g>XI1|+ z;dWZ;Zt0avu?bmqC5Ec@b!|Ob;F;=!XBlah$hQ-$=P@nRZ1*m%O5@6G%DnAw?Kc6Vk1G+__( z&HsJB_rCXizxUou7geGpi8_;yk7Z_zxOO>1785C1)OvgJ^Z+j38kMDtLR59tNT%at ze45``lt5Z9OKvo*c{`pSkP1`GyVIWOZ*8h@z=B{L`zDfx98oo`d%TSkW0A%7QptE zVKYK!B)13PK`WxQAzG?C_wO~{Z54#}3Hfb?@gofdhTnX7<6nm1u4npwrTE5o-+#_< ze(0Uw82K2zUsv3Ggr= z)S{I1)AB3QSV~KlueIfK?3z_&b9yMEhZ^*7L=V$#)L=|d^}L)WdF?|v2uj2?2xhxM zc~e(sv&EtEWL_R*9J7M4(Y0EGRBThm_}VB0uQ2;&-OaUEp#jfKsMMLaCLuH_IZomb 
znoU`DmY_X|Y;GOH3QZXwh^aC1OUu;FpMJh32#jP)OkPw?lrlv*kx+@Y%6fG^AXC{YdVWMym@@DkDi3UD0?$k^ z!O&T?@-uA5JacJmoS$JkOjNkW7>T5!YP5Ps<#gQ0h-s3#YM8xgip!2M#Llr6ygibV zM=uUbVp=>;l%jd0W*wc*GKiwd0d;NOQ5{HlL?H$@0HT{AHpBU?Nq!$&75`vd`T1b2=C33-MJJpBK|{QBlOpR&^E9@uUXJ5unX{6z!QMofF}Wa08at-0zL=$Jm3p} zrvYCCd%3ZS(bU+Vq4=*9EUCLv0QLZI{?u=&@DP*`~tcG_BPB&&`r=i&@HexqWc_!_n^CUK8&iaN1`V2bPw6U zo`^b4+-oOGbqQueTl){mfQ^mhX%4V)vpBir!LBRGsve3~QB^Ba)F4z#=fq54YKgg` zr#P{crAkTT^vqY?k{&nP>lyXU)i&y*<3K~Zpb%1SaOcV6ci5(kg>Ih~tcW%3-}C={(bgger@OEH&hXP8I=b2^Zoc*H#|)qP z^R@N%79Q~ST(HlSCpX|D2SjOcbz+DbG6z?*Hw$#80z5wzNfW#?X0|R zE(-Xo8=bBccprIf3MjxhBM!9$^9pq4*|XHS)-iS~BP^M$szK7KPgq%_fohVr^L0$` zf|Nd+qj>N*_dqi^0@zBnYfW#Ck84lP4%|###UZcd+fMg#9v`@wpBdrop19c@bFQ*( z7N8krb7y=QHCmC&s6AWIb#r2pE~WEQ?Uo9$^{U!62w5` z>(1k#`mask53FZV9TiieqF%9GyJoabNrJQYZCq|e!2Xo7%M~Ov$oF{hJ*&_LQ%j*z(asd TfQJDe0XzcuDBxp&T#x<-G>Dtb diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_2.7.11.msgpack deleted file mode 100644 index 9185ea93acbf2aab94451543bfedba175e029e92..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8386 zcmeHMT}&KR6rSA$O06aZA=bnPBt{#9XemWo|DdH!h^eKP`k-mZz|PXOyF1&NK`EM$ zrUYu#T9%SDjV3I+vlM96q$Vc5h%qUNnEKR08l&;WH2PrUOY4hz&)jonhhb)!g{9c& zJj^$9XLjzn_k8Dk=iVLA!?B>$b0QrSHR9iNI~JEdfw!Tpq5k!ldMF)Iq@&5^6nkVoX*a$LpA~0X1wpuc<9f-}azUuP>XBM!-$3gHT1`vx7ixEo@*CCc7u1DlEGuZ;Y8Pfto=QEj! zOG;P{4tg=G2<<3FEJ0j{Sck{YMStPrB!wWeynjK}Trn zSTszA@{Tfc#~V_oW@E}o!WT&RiW2@n!q3jdX#3voh!!^8JQamoZo3Y)tBI5xlTzAS zveoC~p@=LxkTE+LZuHF;(fCG=rHC6F-3Fr@U#9W!K36*NE4B7^pR2V&tzGLathHY* zNIx}I2M;{CTV!lgVux%Vong~mCH}De0hVH_K8s$F0RR50<8#25Pv7#h6jSx_!|(6E zR_-={F-=A5@3!=4rQ@B=jEf^skA{1sU}0(C62!9@P=e1rrIq!c`|irAY)VUoWO)`N zPjF*GD%c%gID(wXB-xbQncS6ZPB!rm>ShYzIn(N%mBycYwipW*I7gcb&$Ztn;5=%* z+cfJTg=N)2P?!EpM?~&Cq7DtX=mUGIGVKY5qdu26pa7XCMwkVFt{xWS(vTlCVmuZU z6-7M8+gRGq7_z4tWefkMU1!|xQHRZL?Du@g=ULsb6ULPmdzgAGs9>Bqod7O5vuFj% z3xmICOmKTtjEP;6!n>GgW^=X&oWP~V?ga<3-XTA5Z@5Y3?Y`X4H#YL;BvzAqK%(ys zPp}Gx+>pRvShgGH_%yIHhJnu^418u7=rgdduJ7>mGo_11rQux`+{M87W*EPB|9z5Y2`L6OGRNOKIc~ zooh@Z3Kyak-P>q{R=Vq>w$X!By#@oFoDm={||(yYqb0;f7SZG}@koVIb(ruG?&viuVA87i~* zH|#~$s>E-97- zJnZ*8mdwS|e^74c&5dg_zj#>3o&$+>l-&aDlq^fPM@YBbwntVqKG|~Wcv_LV>RbyT z*f%)qt!`)2%PV=fFe>?@-95YoE?5i7Kz;?i9v5R$l- diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_AMD64_windows_3.5.1.msgpack deleted file mode 100644 index 792ef27268aafb788e0daf9e74eba1d36245312a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8341 zcmeHMU1%It6rSD9rm0m(VuBTXSc7Ph7;TzHTmLp~S_o22ZR&#}VUoFO0t&p&bxoB<}G$ zX-|jPGwp5ZH^s;x(dIV^_Rp>7;XcT*2Cz1mqVsBFeByDBb6z7X(^*rp6pnh#^)JFa zg1{CqElZ-Riid&exh}Z`Pd;BROAk!z1@r+{0hY}YC$zHU%Ae_^YTV6)of3zrj)A*j znnQ}|ear2b+~Hg{lp8w0w)2bZ1f2#ti*{N{G{N*!@M*VtS*ZEDjLW zFpUAtXdEf#n31=n8FhpB5ek{n(g=T|(C@WD{`=C1O_5%wpmWUaI=)$0JB4bxsOkbl zikC%sfLKf;rRhq{Pp>5AF*TvG2e=F8(L^*nL?qP2se+oQoI^V#{I5GS>JQb2_cl9l z!xAT8LUx3mi`1iu9*&aYG4c}{zs<+m+g_8k*R4V5x3hnqY^W22?X4$%V)*ViO)ZTS zulIcM4#P)&d10Zku58X2#qw~TeaYNsiZh0|1577VbTiq^F0Jgcg#+d>F9VzTf&v2Eh4^0)Ef~R!w;mrt&&gNp%+_z1Iwfz51|*I>g=gQ*_~; zS1m3vo}!4NoS|YCmISmyEDbi&0*{*W;TOC5ug$a-J^Tq z@6LOh?Em)}%nWx~OW3`W#h?WVg3T80WHGEZ&2>n77ITJWEP1n-1(d62bV?tdODmy} zM)WzuENQ>58$I*gl@q+|oMFix4K&eMl1{b|gg@G};l?x3wXw>qcyye^bSSY}yXv&o zFGfUFJCc=2KIzq8t-f*dvQv6IK|7kHaTZGxTEVxsS?Zkv2@2iC#QEIViUP&N7X|AF 
z4OIiK1FQjD5136oSglQt^DQcq-xuNKNbXZ1l^jZRGCMM>c9ZMnn>VUyRv&(aFMV|S4~E{B$2zW2-22tnA2K}K{mPFFfB*T{=0%FnX(Mkl{Gh7g+r@guHS<

>-MquC~$5#AqH-Oq3Z0;zpNFq8;Ydc2u!!zbu!4{fvYehy8@YRU2=@ zSK2ggf#wmMDs`Fwst)^n*vAWii8KLXUl03s*ayTuUo$+{0{8^rX24d!Er4x+TLIev zp9Fjga2w!uz^4J90ep5<`-dyAxk~m9>1kxiBN(sSBbeL<2KF4!I}gMrJf#gTTk{C! z2MrFf{8(hGTHW*z_>0{fz~);J4@MYTv&+Fv6%ncjd|76bE%kfZ%fSg*l)!@>UO=tD O9IYX+GNpH=L*PGxDVgU0 diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_2.7.11.msgpack deleted file mode 100644 index e3a57986656abe75f6c0e6060140e9af03720576..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8386 zcmeHMT}&KR6rSA$O06b<5NqNC5~Gblw3MQ)e@jc75K~Jn^+7|(z|PXOyF1&NK`EM$ zrUYu#T9%SDjV4`oXDQICNli?A5o1ykG4-ik(in{|rqKr*Us_+(d*_@pvn;dA?81VL z?!$aDcV_OKd(Y4J-MhnjAncR+Pb7SzCY`*{BZnf2q^h$M+Q6)SBq*xtrB-%F3xuRL z{-EyDjzy$T_|v?tx#{(=+L`bx($QFJoZT|-bQ%5fZpFEFK@hIoyisXBM!Z3qWuqdJszxOA*TuHz1ZHZbXEcnQr4>Gp2;lj16;g=X9;7Znt{aWsWcasN=&!u}9{JI6bVfmFB z$j@{=5ekr@&`nbAenaZftWOy|jz?XdsH-IE_C($6SckDcD)6Fv>ehIeg+;%k@ zm%~zAdrQWsbU2hre^3@#0A&sajK1|E8t(-xN38Jb#(Oj#_-64ZzA$U=46^y|(gxPr z+AkLssQTJN2cO&{vUpU&o#u4cia+dlfR&!w&q7xvj(`8v{W-^%Pv3U4(o_5K!|xxs zUNxbqX#L$*9`g%FsGTr7ykNtRG6^e;P;kWz1^Ol5T89CqIVAVIIi~ETu5#$ycUMnY zr7rH5<#{i@829x>7LOojF^RRrcExtbT4OCxM|87X0H_WO z4Zw@FGYz2o6;@S8IJ*obyMuDi5p{Gpz51tBZFW~>+~W&`Tv?vrA73EPn*u;r4~r3L z)Xm9gGVBu-MLY&=tn4QZ+EWeL0&h|lP(RtODNE>n<~0Pu1*RKz@o^=@e#Rb)6pTGF zMR_dQN6`wHmk<7uF~MCSF)a2<3VdR!mCe~+?gXHkOfOIZor}uIKzB@d@6b!i`ipfy08&WV0?1*XLvPlD%nFf~3g7>mbKT{@|)4H;>O=YI5 zQ#&)$FO#&u&ecuBoKz5vcZ&*FiEzTCB0m|RmH{pvjTT)ytVj~hMrZz|VPun?>l{XuE)-UDoe@T8Wd{4@T_jVc#&VpUzn%Z{baka5>})*s z6UTSIscWcVxX|&%I~*VT<)!(WN-)=4&Kq?PZC=Vc&WJT-uB4sVI-X!^YAZi9^20WM zXyS+MVA{-nV^Nl0K|Vt=3;M_uSw*GCi)?LU=B#QLSG!m*dfjM|W8`Bl`)Fa>2H z&4K^ZZTce1PF+$i3HZyrhLkr|+kcR57fiaP4Qou@oa@+goMLThcf~s*;F5xPdz^SX z+xAFN^OG&7jwckUw=t^#;`;_?L9Q>S+u8JjlIJIel0Vwl4=wzJwZJ@0=O^g(h!~cF zC)S?@#yq8N+7r1&*@;MHp2|4U0+oF#>qHYo161bgywDNR0+oHD5u%BDe69iUDa5Ua zjfmS2n-I4nHX}Zb_zdC>#GQ!GB0h)sd=~l{!|=?t>?;=iq-TaZzdu{Di4MJ;$J$cX zA~9RA>~ONzFy^x2%i4Jsj331pA;lRX#Ty~T9qk3J%WBeoK*VQ+ZWy6?q;|q=_cM6V zP(x0_F{-y|Qm6%w##YTLu9h6+Lyld{+;O>%+mK^!D>l!Yu7VSaNkWQALW)VkrLnoV cA9|0F-Xo;<38}W`i1mVkWA%5Miy0jM06esussI20 diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.0/0.18.0_x86_64_darwin_3.5.1.msgpack deleted file mode 100644 index 33b71200258b25aa256e55a0bb3d358d674686cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8341 zcmeHMU1%It6rSD9rm0m(VuBTXh(WYSj5ggyTmLp~S_o2`+SCU{!X$e)UAw!potbDF zg-{x{6}2{-21-$AvOAk5ZB?is_+mt8z({>+Q;I0QNYMw2FO4ti{W)iUW@mrgnHmf3 z!+bM$cJ7?Nd+vA6#EqaTN61kl7*=F`+&B=HHEph2*5#dSJUtLr1`lWx@$?&Vc$jF5 zy;A<~=JRmv7gz_lGA6Ez-LKF5&(QYEF_Lh5Q}k6@JQWRuA}%pF;XXU4L}Dt@v;`xe zABqtp6xBO@NqRaP4all0A4&zNtk=ZL=+OmdLXr8MOsj272l4zYSF_WXotMpZfos_x zk)!euQB(2=(GqcXgT5|6M#Nl34?E93C?-rR1Fw@oJr$+KTHvxIrn{!M#SKlDRb66} zoKxlK5b5Vrr8HfMc|o|B*f^gR9@lgdbIn9T#oUm{xM4#^VR2I2%wS%Zm3dueZHkz8 zIrFxP&1xYjy2cH&c>rqwosksZA|yTA6U}wM&@Zd??Q%Yg1}$L(L#n2yH8Q9~17Nvb z!(lyCDw24_MA*L|Vpx1nC6WWOnpC1hoak>V>|_*2Mw2*V==)Vdl609A3nHtvj7CF0 z7rRE+;eGS>wY{0* zwZ6~aX86d@FDx}TI3p$QKtjq3l=Toj6d^f$;bWS8`Bmb$Y-EQH zcG$!YK6cpbjEIBjY1x*15#qsl&pfps-0h8I6P^*cj%`(s^KDen35s}C5S>3#VJhQIyvOWQKV z=d`i67`|WA^v!Z(I!4rx5;)3Ud0w207s4jn%WsRqoa!f25o*AHUcg$wHGp-1YXR>9 zTnBhJ;5~rZ{W_OlLmPlwX&0;E3Yrk}w4UwsnapZ21&E!AeK>))OGtb$8ohA87H|z< z9pG9(+M76>Qpr$9+;F~mXSUw_RAYyM7%gRZYr9-PFz!fx2GgKj~5^_*POf?exa2&DNeX-rzRtaAt z99M?<6k)&BhhxI%(MhCB97{%4>0WTv>w(knyqkG)i-;JDh=nmJ 
zCifxwyIu=An7UU1B6gK}6_wZmE3YekS7uhd?@amh@OX-``X z5$)JaK<6w9uEkR@98Hm65nGrD3Y7mNl6* z6JH^66}r zvb?M<^7mWrqorr2Kl#_61D~<$7f;-B)6%o$qX*yHbG6YdYl$|ZL%}Y-U6fW}%63!4 OtXJM=a1ksvobWeKVVUOu diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack deleted file mode 100644 index 978c2c5045314bbac06fd0965346f15cfb9db120..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 119258 zcmeEqWq965x}{3Y%<#owW@ct)2FH*%i6OSb%uEh*97C9ynVFfHnQ86V>7LuSd+y9! z*xhGWKXgj-C9YqLx)i zdJ`QI8dNp;!-T--9uYyylfO>xD^#F)SX9gS;K-oPu@&NyUj6o__0MsWhxI>D2ZEse z4=>~AWC+rFhBr*cKn9IZ#x=91y-r3rSI)BMld$fNUE`8*UY$nQlkvr^$2l$}VX3Ho zy^?VZsvf=|$A|fc2JMUs35y<@@|Yr5(qf_u75IG>zfYeeLDB_M0D@ByNkudju~fuU zkw`@{6{%EA(ahqi$+P`bOY+iFG?^F?6d4j8IP4!03r&>+OGP{tiBu$0kxE566`53I zQ;|!>6tR}n`6r{PM&75R@1m92^xCy)!W?X@a1r;mPVBma^(pE5t`dheu!uN#jLELrIOG z2#~6(dj4VUQamVmiY1ZB%~TgjF=bM3>bSp~FXi9F`ah@pTMMVyHA|{Bez)tdj$d~E z)t(<3@=MT^VG_gJb_fcH{(Y*SBTkM1NDM$@U?c`kx?`!b*X+_MI^;JqF9}U?ie+!g(k{4p}D3^D?^V5CF76p zZPzAat??h=q}1a#JMHMY=jHj)QK{1Z-hAw~ax zRT@9``}^<3GAZwqiqcPJySP$v0sghXwuv(fqRp|8k>A|A)HKpG4>X zW;gm3F2CIBSNjiclq%&Ps^34IBrp8NvHs!cd@}FTcvXU+V)@42NX8L+(&f^U@M41v zy^}HX{pv?FDMj4hw(~#pUaHLhzUusAh4sR3JL!K!*2(cnY$|W^Eno62fAXzB@~vQs zXiIAU+?4%u33<{lnw2ufA4%4KYk6Fnw72;gQ%O6Vs9$mQD?R_6Ey%y%6TkUf3UT?9 zOgQ0h6ZUU=nqPaHKgSfEtw^f(KTmTSH2YhpSd&DZem~^N;a9$5gO>k-q>$x#a(agzE3m$pgPv#9xd1MfCo>(){T`(SMw;llL3Hyy_PN{+EmX z0rQs&{Z>_f37u*gU@PioaOGFVf|v_Gd`NY^eyP zqMC|+shBAhbNqW&@ef;h;?Lah7u&D@*37UdB`y4o0snuw{o75}pRvndu>_@Fe{83O z{BNx7|G{#5@wamO>%clSbpP%?c1CiE{nN7hpFR;uG0DGFwcka?Xr6>6zsV7BGi~mVNAK)K7DEU+_ zf4*S}0pWo`|Nf*SZ=t^_$jK5PtmT&!Js`I>9h=&A7gd|9Y z6i9_MNQVr_ge=I09LR+{$Ool?{GhZ@Iw(Dq0m=wvf-*x{psY|fC_9t`$_eFyazlBb zyih(UKU4rJ2o-_~Lq(vXP%)@DR01jqm4ZqqCG@gTIjB5T0jda9f+|B*psG+cs5(>w zstMJCYD0CPx==l+KGXne2sMHlLrtKjP&24G)Br-)DP+p4S)thgP_6C z5NIeg3>prNfJQ>2pwZA6Xe=}i8V^l?VxWo8Bxo`;1)2&?gQi0>pqbDtXf`wlnhVW? 
zVxjp^925^FK#9--Xd$!+S`00LmO{&*<0p!?7R=ppn7dJH{*o+Ww4>Ev^AQQ+8vVg208^{iFfSe!~$PMy;YXbsu`f6x{LfItugfsu{1Hm9L7z_bJ!7wl!i~u9SC@>m~0b{{9 zFdj?*F<>H?1SW$iU@Djfrh^$^CYS|ggE?R>m|nw3&29K2rLFmz*4Xb zEC(yVO0Wv725Z1tunw#T8^A`e32X*iz*evgYzI5QPOuB?27AC>un+792f#sa2pk4S zz)^4v90w=BNpK3B24}!oa1NXY7r;eu30wwOz*TS!Tn9J6O>hg`26w<+a1Y!E55Pn4 z2s{Q)z*F!HJO?kpOYjQ325-Py@D98OAHYZO348`$z*q1Md<6cX)4}QC3~)v`6Py{& z0%wJ@!P(&)a85WEoEy#q=Y{jZ`QZX^LAVfH7%l=Ag^R((;Sz93xD;F(E(4c^%faR0 z3UEcZ5?mRs0#}8r!PVg!a80-tTpO+f*M;lB_2C9^L%0##7;XYLg`2_6;TCX9xE0(Q zZUg(nZQ%eo5DtQa;dXF)I0WthcZ55^p>P-+4oAS9;Yc_Nj)uFyUEywUcen@K6Yd4~ zhWo&M;eK#`cmO;Q9t01DhrmPOVeoKx1UwQR1&@Zuz+>TY@OXFv90N~;C&829DezQy z8ay4I0ndbI!L#8x@LYHv91G8fh=G`hh1iILxQK`NkTi%Nk`_saq(?F!8Iep#W+V%e z70HHVM{*!Jkz7b_BoC4o$%o`e3LpiMLP%kx2vQU&h7?CiASIDfNNJ=DQWhzPlt(Hc z6_H9vWuyvH6{&_)M`|E7ky=P?qz+OSsfW}@8XygkMo43%3DOj4hBQZ7AT5zrNNc1G z;*YdN0+2u?2nj~oA?=Y6qyy3s>4bzLVMsU4J1cx*^?>9!O857t$N) zgY-rEA^nj7$UtNeG8h?x3`K?^!;ullNMsZ;8X1F(MaCiHkqJl)G7*`COh%?4Q;})N zbYuoH6Pbm~M&=-Mk$FfgG9QUU;*kU-5m|sNL>3{7ktN7dWErv?S%IuXRw1jAHON|I z9kL$TfNVrIA)Apc$W~+_m1UyOBM}USuD#A31;=L=GW`kt4`a?iJktfJg zi$XDbW@*Vk+RQLf3qX>$k7>c6=N}?1> zqYTQT9Ll2tDxwl9qYA2`8mglPYN8fuqYmn#9_mBWpnhmtG##2A&46Y^GohK$ENE6V z8=4)>f#yVWp}EmKXkIiQnjbBI7DNl7h0!8tQM4FZ94&#CL`$Ki(K2XRv>aL>xedvDl0D2HTgdRqZphwYT=yCJ}dJ;W_ zo<`50XVG)$dGrE$5xs<7Mz5e((QD{+^agqpy@lRJ@1S?ld+2@i0s0Vqgg!=}pij|f z=yUW1`VxJGzDD1mZ_#(?d-MbP5&eXIM!%q6(QoK?^al!I0E00ELop1)F#;no3ZpRw zV=)fnF#!`X36n7eQ!x$GF#|I(3$rl?b1@I|VQDZwEG?D}OOIv1GGdvq%vcsIE0zt* zj^)5|V!5!~SRO1dmJiF16~GE&g|Na{5v(Xy3@eV6z)E7Ju+mr=tSnXzE00ycDq@we z%2*YwDpn1vj@7_wVzsc^SRJe`Ru8L>HNYBTjj+a86RauL3~P?Hz*=Ihu+~@`%pYru z1z>?#5EhKJ!`fpZSO=^l)(H#6!mw~G0_%)LVo_K$)&=W|b;G)2J+PiwFRVA#2kVRV z!}?DUZx zCN>M3jm^R4V)L+AY(5r;#bXIrBDMfqh%LevV@t55*fMN6wgOv;t-@AgYp}K0I&3|* z0o#ae!Zu@Du&vlOY&*6C+llSMc4K?6z1Ti%KXw2+h#kTXV@I%~*fH!lb^<$zox)CI zXRx!_IqW=k0lSD@!Y*T1u&dZL>^gP>yNTVxZew?_yVyPKKK1~6h&{p{V^6TB*fZ=o z_5yo}y~18&Z?L!6JM2C70sDx3!aiePu&>xR>^t@YgK&VuID(@%hT}MalQ@ObID@k| zhx53Ai@1c#xPq&=hU>V2o4AGBxP!a6hx_m}xF4PtPlu<+GvFEVOn7EI3!W9vhG)lf z;5qSJcy2roo)^!D=f?}+1@S_7VY~=l6fcGs$4lTP@ltqcybN9zFNc@ME8rFJN_b_w z3SJejhF8aH;5G4Dcx}86UKg*2*T);+4e>^JW4sC86mNz%$6Men@m6?iybbP;x5WeS zKs*Q!#@pfT@esTN-VyJFhvH#)I39s_#v}14JR0wUcg4Hm-SHlHPrMi28}Ebn#rxs? z@d5Zid=NeuAA%3XhvCEV5%@@a6h0asgOA0>;p6cMcnm%fpM+1wr{GiZY4~(}20jy? 
[binary patch data elided]

diff --git a/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack b/pandas/tests/io/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack
deleted file mode 100644
index ea8efdc86dd2d45a151ae49e78790a694a115d4f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 119206
[binary patch data elided]

diff --git a/pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle
deleted file mode 100644
index f0787f30c14533a5fd96d1fdb0b2707545b169a5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4338
[binary patch data elided]

diff --git a/pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle
deleted file mode 100644
index e057576b6894b010a78308041957610d51e609a8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8978
[binary patch data elided]
zrhs!uV;q6@L5)=&PSLDuKQ=<|{y?J}frh;YP*9iwIuJEhHwI0$>LviAgFxLCI2GiA zZtB`>2I}kPpxFCp9(!MnCL7Ya1?sZ8C8&2)hg@+D;=$&JS4Hwbc8Kd>v>m!gw*n-x zx;4RF#wX}D)bdezJX{w#ggji?7HD)kw&V_!)9opMEAB0%JAkvgBWOMd2{u5_p+KWM zfrdSYQNW(OtLx6-tPTf707?NycVUCzWYk?*hL+u&WCSR+bj#^T3b62Yg_45b&u`W` z3O%jv4w~|H4}f!hobE}D1+I@K57T=AjgA2gXK`-|q8xBe?1LJs`-0}(-;a&Zdw-zO zGSIO1SPIyi#~h=l<4|LDJSaji0buk1HVQ*r6<*+2n%@|@5{a8{c@5t?RzQDK8HG|DoxWSqo;QVX)9g@XSnJC1_} zt8+jRg1LZ}wzJ4RqJvjKaG2^mG{7OG^MM@D@j#;sKslh2oMtJ&EmKcmT^$2i$2C{S zY^&qOleW4LfL;4UfUk={87^L*^dvCUdos}IDWKGg9LP}+eGE|_oI9|!)ipfTHC)G6 z5nW6PKaA)S_CY@5+lZcqvahFuT3renB-gKd#nNR?{TZMY9xg|^9Hp$D2^w9I^eix5 z+toNU=<@8Dlm=Ra8&JA*sO*f3ytHI~lae56{RXUsr*qJg>ma z^{Anl8-PY{1P#sHL;=m*%(}M3gUVY_1C_S|e7y}6Pd>dJ7%i_mSccL&fky8FbxO;- zmKU|^-4r50RtCY3Xteu87%K}UAx2*YB8(mqYZnIXBK)QOBR$cBB+|2Xq27a$TD=!E zm8{hJIPSLjxt}b2d;nV-?j%p55mN3cKwX$lPorF^&$!-g!+jP!hx;7R=<}2Y>GT2x z9PW#(Yd6m?p$5Z!8Q|+Hpeb)EX8Ki>qM3e;WvG1}X!H%xaHf5IlOkMc^QR2>cE8T} z=*k<(JM#J#Bv#jea$epB7R<{#WHF+5fkuA|8anzO1$6XxtZV1!`>27V9{_y)Jt*hp zLn59MbzzME06vQGM<`e7A3+m!Tr-}BxO~h)c))zZGDh(ypwUl3^Kto%qV+$0qkQa` z*FRkc`Z+XP{Q{J6`7`abCBR?EVnlxh8vPP9jLTOPFfM;%UAwq^jT*$|?*L!_0m`_1 zLri(iQCz+SAI0T6lq+>DXyI1&J$Rb_0ciA}lm@x?FA8Y-->hqA`ah_F=^p{U^4-TH zji1l73m9rE0vas_r8WpkD4@2kC|K=ANu_ot6vUwiTcLYTAf7Ok27~EE0lRO&I`?fR zl~mmjT*%DV-e9l+_F)}gU^TUhf4%eto6~-vzV-*jQ1jdPM&M~~0MO__P?{^w>Bbb$ z+$OAZ7Vtq%2Z3{{t*7atIms##;g5 zwfj(}TZ7>kwgDO)LJ`K$HK*HBfF+t}w=TgT>h9MObvyL1x;<$AMz#Z$K>LnBqY2PZ z`%nrBwS#Z>i}=Wj?t~hv!$A3APIpEjQOD@k(MPyG9N=pyD4pB|n0InlFdW8iK%*ll zf|GF5*O3%q#dUDA^A)9|&;zd>(7c;_Pzm(!2{bwyG_QXziV7W#974VOb^JD`V<54* zHz@ts2UwV*`+}#b{eVXIr%U)*?^|ftvZ>Q@>WIby#jo+-VZ^!Qm23xj{i{bG-`lG zr&5aX<1#yq0vbJxb#3dt-DP$Rboap)0XsP{-9o;sk^i#$G>f?qTN zmFQvh7*Lp+3Gnq;P|ijYn9r&d7@DjC8m*>?Q^BlCQ?&kBh5JMedRUzWn$IeaN}#_M zXtWMAuYWd0>vAeuLs6DRyw;3@^cGny@>@7Zk^WO?A%;rsxAN;y%021brA(rbukL= zfry2D3Cd7)DZtmuKnqo^1W#3$1M$)T4OLx90aaavf?K0dbv4RRbq&DRYeBh3Tt`f` zc+FW|g`%4UP0t#k*Mo}=?i*N!og0BhZvv$qtm~U8pq*P#uzD*cm3kYYAb7X46}sO6 zGhqUh^GpoLL- z7(C6d1{!??l;-gsdz1p2e~fkcgZy!D`IGwzFfj5Yz#U~deF{Zip9W>lJp;_|InRQj zrRRXSs#63@J#zX21=tbFq7@%uv=c;#cJ3EpbpAxBFG6DVB~Z8+ToPUeL#wXKmZ$cz=^N+8yt2p~lxWY>;Uw{Vytj`!fIl diff --git a/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle deleted file mode 100644 index f0787f30c14533a5fd96d1fdb0b2707545b169a5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4338 zcmb_fTWl0n7~X3yQ*Omp5U@f887^H*DTs*DmP#ojOQ9URz*=W^r=69(^z6(^TUZO~ zZmr728=`_%^u;$56MZo;#>7NJWt&zeoZx*-&X9n#v{zheOeHMhzvi2_r6NGV-v{mrmG@=8M9^ex1U z;XzfPRm8VzljM=SU{Ebe0pi^f4u^OC4G;T}JUNm0QaVh9KciqME54lX z@zjr`31X}!@D}2`=gwY1^o5$*zXIIz;YX(sCwlf@M*Q{rpVwXjyh|TFf_NfO_t~ZS zMz}PUbbr6D8agC1EKp(9aaxn3vqVzaXAaQWQ6vh}6!0UaCJb4ib09{Khea%hqwbyF3OPjRVzG;_4x+HPB@tJIeZaJC$jL#0F6my^vaw}- zb8EBX5yD}h{zl^OPp4@^R~z@E8d^}ZIlYO;Q#VdTYo%)u#)t?P3W^51r`fqJy^60= z-`2Z3nq^nB$nfzZ#CSIL_ucha=nSGSyg*$_Y6m&^6j} z!cXqxXkjm1YcbhpQYmyK($Pb_wlqm|fofLX1t{LEtkY1D)>|=WA3Ln^R$i8y%W8+M zilG|{FlT#N$GtRv_j`1U_(ML z(*WEn@^&;8=td~Ww~0BrNhIEOtw|DGtKW{Rd-EEs9>B_165M9TFb2m0yTj#S3z}3i zacu$%$1`(r8A^e8w@nG|PRHu#waBXNJu;Ab#bUrf=&YZQR&BUb~ z8{1gW9aDmC$0b1(xSu!$?~RhcI~*oRI$kv__-(G>JJIa_V77BgkaweNzU{DhX@J`u z!)E-T$1YcJJJ6(pG24tD?C09d0&`irhPf_w)&_MKkJwIJC6Lz^!DT0*fQh#QltR3n zDm(2ElHD`igT}Dg&NgY+%-f{T8$NCSz1HJtcqeiiG1-3QGU5kc6q^13tcfi?ia5F8 zn}I(8yWUQ|hj?bQFE|PK!^Jbbh^>(|7ba`Vw@DpYJ)Tl^owZA0IGww5v_~Y>sdOeO zC*mV2-7`)qc<;1Vn}b~v>yvslSBKO!I%$$lf(jNby3c?j;Y-FT$D!JW`}4*i>ML$AlRf+D(xeF<8Ycj 
zr1fdVe^oW$R*h$LgUZnS=5-Z*wLafdV>o?aSuA9Dzep|ASaeGD*=z;*3txZaMiK;`qmnp?bit z_q~4N0 zJ#T*Lps(?LUZI@?I$n3tF~RAOd}CM#CafdsPPmj9Pn$Vz{ZO#bCc+$_9CiD#=2>XX hv(TDvq1CoqemKEzxCp-B$o@@HJFoG4kzPF;`UmelFZlof diff --git a/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle deleted file mode 100644 index e6ed07d75da64251660e5e7471229b899047b758..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5822 zcmbtY2Y6J)7T$z}vM5auY@pZ{m8ggsv5*iHVI|^#8cW2z$qjcyvdK9+ks@nE-5aqW zHtdSM_uhNOUcidIzxVX{o*G}{{b%mXf*78U&%59E|5MIA=gc`XvorVjZaIo2YD#0V zSOce;eJaXPv8^|?q%?J_YfWY9Q`xe*7WT`GOniJwr}h)+IF*c;Po=*SNoTarnN(9Y zKCY>yZsz1vCN;z7ICY0)k6P-PsLQl9x6X<5HMceCbc#7OCqAA1x|U2aHU*^9RNwJR}ATYC$t5^&Qz;;%e1 z8hN@!6f3bz+i2Pr9wtH>;rX?+9lSIVjw5X*vb4RpwS%}dQc;-+SF$vu9j#*x<+M}e z*v^VZ!Li`CF6{!BrdW%Xz8`A~6%qHe+}R%0*Vf;!AV;G$?dqj!X}6$PVA8s~I9aY} z51Z35!3N06o{3;^Aeq<;!>eY{i$jrBY;ZJX1x{d%ehWF}SyGRwx*QdPof94{L8 zRkWXIEXmUZqj7(u(d4*_CK?`3lVBkU3kQTW8MzKjHml|R0~JlN**qxNa$S<6G);xd zX(1gf$vH&)GU+%}^wuakO!Q)&gmic?YBkOOJBFR@bcCiO#Y>~DRO@<;BEO)m!A_OQB6sJ0$L~qHMt( zZIalQsF6j`(9*Iq9pj3lXqGGPLbF|QG|h3vU1=`ze5_<+USXEx#Kdum+Kq`k9d8WG z4+bMHCm??(hIEqnVDYjgD%0`mwf3Ng}&=Tx?YT%nSO)F->B#&k;lGubC)w{t=rHonr?;4 z+d{fsVta?1nsnWWk0>r|CTmW`wbh+r*S}_+?$vakY$o@+JUt+mA5`>^ z4Zy=D0FSr;JPL0g3+Zv$OrG$(wwmsUXx8VnpQR^r^i(3~w#J4Zb;hR}nK(Ty+v+o7 zie9y9Uo*bHo|xcPY_iDxL+Wf( z+KW$1H~U$gYHp3w8>jS)#TLH&;)N&mudeQ(Hx<2QW(yVyEt^nbHDD1ecd36{ti7Y? zU2E;VbxZyGnm#} zGA-rl5m5}uHID>Bo_IvD-j>(1yuRf=mg#3rok-I!A^pQ!%ONaVmugDku=dYjRF$=Z zznXuA^lwG~aT9KO6y)f9XHPyWlYKF2`7dJmbHTv52&}jmD8nz#b2sU*XQ9ztf|~g& z;Zh^m-3WGZ_TV1q!M}&`M+V0|(V)4PG&`elRK#d+u;TTA*68{uHi~X2kO)e&xew}^ z`vScV?gvubA6O#-UNm=jfRWh%81g`%m#q;GF!jtKmcmy=+{2LSEMlTmap%L3qPG-y5$i1AMWDLx43#;>FdMN>`BG@u>-!P4=E z#(xMjXg(B(@yn?^n6&!%50e&HJ{+w02%sIpktn($NaVQ|4VsSv7REmv6?yq+u;LWZ zzU-msUN#e`4h@>?ff&CJQrrM^bUNZ(Q4Gmde<10m5 zHl6Xa+41-W!Ffnxiw;3K?f@ptsW6h|lhIIDZ=)8M(AE1*Iu+*y=s+y+>)=yRR(vYZ z991$c+?;Ak&-M8u zORf^W^WyV=5H2noS-A{+(4&u^A-rVDAuEKx`}mt3mV@`nwjV2edda{SmJi@7jryyA zYfY|6>eY}yQm+B!_*!7X{LxRhS-lP|jV6OdQ3kI^gW?;cIm%OGl%5;WV$yRH3L)PN zjE=c-*yCGJmB`)-R(u=KM)r2;=<3GvM-P5{2Q+KG6Nm+W7bxVrfjA_~*%9CE{3gB! z4LQCSxayc5o zbjVKhn6w$w6Sd!5$&aI6sE*?&P{scABna+41q%6Tpcf1a`uMf{49aegeOAga_8eI8 z^FU)PS(+^F;1{G5`|OfvVK{XAtm9aP?Xm2>PdV;q+uipn$LspO)8lRhPETh$FNLR? zUj*V}$}frEc3*#4${64)V8yQjZ8Tq#4vFUL($*+3Y_ySk17$?+O;E^h0daHXWy077 zWHL3Uc{!>%ej8}6eC*HQcTjO=-<2}Vz6VzPKG2%|KsvqZNi-fwG$tyV!+c-A*TXDff8qa2{-ax8RabY_tLpCRevcE&a#9jc zR5rCWq~}R#PRinmiiV_3OL=`;N9)}ARrT$fu4?UQu}w+Ueu@vciQ_?B2!wbO)mfQLNx#2LgD z(`J4|{Q8qGcYGJPPj=Dq#HAGjUwn6f46?FO?V546H#TO|Hj7B-$7JK-C&}QPY!XlO zoE0i$(|Tr+If#)oq+4tNPYkm7l(|@ z$tWzg)~0#s1{u9ZaABX=bhnOTvKu4YossQfWQ>cfGB0~FvV+IS*f6rajEr-UEtI{x zMmFBcKB>iZGCn>zdL}HY)sxuLl#0#3yku>%b#6?mmmq>=ue@~8sr_p)1rv-+Oq4ga zx68hhtE1s*E0Z!68JF<=_*nZJnd~3yfXs-+iMV(4pRy=ZtQ?5!)C6)+sy2|Rfz+ll znCy%T`e0h0X5l35w2P!O;(zHY;)+g#Yn4n+xCBpt=-C8#3q3#W6K6&Q;D+HCh72j9ZL_#8JXuj z%+EBg^V9aLmIYRhUt^VrTX-t#X|#nGa-rww=>#K-=n3+PnQ>UWCs{c;UMHu-eaNS> zTi!_E9;hrYoea6(nhQ>TVP$cmyuB?gOY(ABAg8Cig{8Ex%*YweLU~@6)575HHoBZ? 
z<*Y=xJ+58O&dWK0oSRD5$$5C7sB)wDyJJDlXBZdI`h`X=@(H`xRie>yNw^X&HF8`JP8+-W1KyKn}-yFy-m~EStTi2MOs1C6@_*!%uW4oPx?=W(w^NYjlE@lqLP5AS^ zyW2|ZTsrTzat~s4XXd^1d7qK{z0U_Seg8j>xCgB~RGZ;3_Ha%fiDx#=6i%u~jXajY z8TEL^A5c#O@+4FER3J~IxcAP^ofYOxp2^9x@eJyMFSOaYNzJCKqLfZ|l27V$OzQIt z@C73;x&V9UWhHmm0Y0fAc2^JjRfqb45e|+p9^nqv@{*O83%lnPI(yZ~YtGpQd3l{P zI4?93E;sW3?F_~ugMA`gFT(XB>>HuHVdYH}ptk~fJJro?D_fszNoq{q$uuT#MDd3z z_--KY8F}Alc~+R_*u<9h`Z-Zm`JiS4zhQpKus<~ND;GA3#z)*R&Ke(0KDP4f!tVNn zoquEGx4!f5TxTBiZbyA;<+D_${NBjtiJqxw-oLQ&WguUrGDUl7ZBt*or~bg{|0BKq z$;j7!`oH<#PXC{+e9P%~o9Qof_E#g{`RV_j#R(VAAHw3a%E*t=h4ZI1ic?Ph7G602 z?k}AG2;`quen!c{`gwK83lqNUUs5vU_PXW7)(PK!M11&-V5jeaeHu1fNL;n?`?G%l zPQIb_UgCLU%PLm^KV5m=bmHi`5myqo>i@;kAAxhP%za3FY}9}eKLPtbbM1U$-ORn7 z|Ea&SKw2#Y?e=;)u_N2mmd<8*P1Q2=9Db6P19Dmc8c+0YYuDCfOVfh1R-$2XqDOdt z)gF*g1aMi!bya&pGTMu6Lvq@i7*zv}K?t-DXe!fXmFjvil30kaf3YzNF%>YI> z2Tk-&*=)Febqm*J2xy>Nf}*PRLRD+jWNTW7q9LbSfo5j;9TzQb*C7^`HN5g9_ve6^ z4ntq$d22vCr`r$`r4w{pn)sk57VS&jjxvJS9%wYqZhS2&&FgS#5JUx!W!(Xi)e)eD z_;+Ln4BZK6vh|EU#0kK?({d ze3*xIPeIs!^Xd?XMmVfmYhl^y!JtUZG(e(sq8>tbMX52A@nF+|MrVM!2P?~Kf*Qo< zDmdT7bS7G?9tv7W&tdF@;fDi_9s%lyA4v^|b8#`fdK6l$)`22LM+1z`Vy7rY$B>7q zq|>B8ohhu6dTRcwRnho|?g6k-=1@~uB`uJw zwt^PY(8dlJ+72{27t{|GYBE&^#zJ#nVv(2sGRYH2MorXBZn{F*WX_)Foiu zVX3FNZl}9${6?co!LE5~>N4v2rAE&H7oDBD9DJZ>f?7QbH2ebbzdc0tY-j!)(AwrQ zP!!~I0XaPnG~QeZy12Oq=;~AMd`LzwU>i=k3yJ=ey9h#{7lWoIFIp8&bfPlF3zuGk zR;!m%m(6tK^fGXX3Qf-*sh5LAg87Q4E65|kR{)J(3CaXRaTOJuxU0cfy@r}Py_S%v zX!a9-9eZK;^+2OHP|MQ-6MrK$9DfrV6XmI->dj!uEznyWhi@5rtK+0&p56uyi{W-Q zR5ms#zUJKlKCgFz26`7L)-*X0-Z=FEuEjUS>dU>B3PTB`-?50rs5;_!$4*`Mx z3Y3}T$1nW|0u6r*H2P~$XSg!2pHPE0s(6${H#y`vIs9>Q`179=^*0z{^|zq#^*cbI zpMr8xd|1tP`S3iLQkt-9lfl7#t5t5f)+gdg+}1vuRx>UffjlAo=UvhBZjY5{J4A| zF8O0eF8i3c_^Dx4gMjbNxNjkG`_F1t4(hKzV1(6Gpal;<(g-~K1T^|L&>|0ir;@Y& z54I;NP!V?3e}Z@HNd4Kfqm=XIT1RWCXLr*wuvp{e0M1VZuz0;yLZFVN@)px#zLYG|uJ7p`_6wv5s(9o{iRJ&2Rw(nbh ziDS8R`FR=R+^;)XcSjGadw`~rbvlNAR?qyN6!A=BfkyWN^|6klhOzF=#xAk$gBHX( z9uR0XXe!yM6NstIET89zkbR!_1z)F=K#TLdA7uL8A82$kwNaiApoYGuu(6Bp1JMHC zHGn`50!?LFk>{!4e4cB`!|uUAqtif(@_Y!D>;9IpYEhWo?uwiLB!(hqVA<+)Q093C z{jAP&f+C)2CeY}ipgz{as9~&!v$0F8N1z3<9tjBaC{X6Pj+n}H_&gsC+2?r{_&PlX zv^dX6$n>288m*@`%5wuX^qpp77vGI&f$!OXKr^7MdhTA;1cBz7fkx+m(i{$|7Ak11 z6^zw3YU;F|P?U+e?1kY1G^*4_Pm`sF<1HJ*dsPS6@LqK+IPA0I0PYTu*Lh$Aoex@E zGZsLm{o{c~7lJzb<$29fL;EMNF}yr20-LC`8BM44MDqOft|x(muag1ps*%@Izy^9M zC~I*Zh?v3)Q~-evJAt^!Q^kOAJ};&QH=VV9WY;CoT0ISvsdaa;(>Va9mI93~1NElP zpvId5?|wvXdtDCB>Y1SKqI4FRt{0`V(Guu6u0x%kOTG7b9t8S4ABf91Reb#LsK1cP zbwBF+RqI6zQ$Gm^ z^eIpj)MgfAB=>26UnkFyhpT6SMxO%>U16o;Z0ywMsmD@T5zSi!zHoXhsg4v8`hRQm zxVoT*{kki_-i{yV(U14&|E5=DghGSKKNpgyCoQp1eC e#>Qr*vN_7(>)?^YHvoaY3CcqK7BQ9WsQMRGdHVAJ diff --git a/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle b/pandas/tests/io/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle deleted file mode 100644 index 3049e9479158173effed407441fb6b50274a87cc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8768 zcmcIq2b5G*wyjQtLP>&RLd8^wv?LJ`r9l+2rNs+{v2ClmtEp}}om00ZNQtQRdQeeJ zm=!VSoCD^BIqNt&&Ww&>W^|lU(Fy*&-@C7wIySo@4>DeK@jP#CG7F9-b5)FJr z>+GVoR8;!Ji)T)dO)4XmMIExKk-o9Y(sXFhws$6Es{why7agxkgRMOi}$Pv z6tYz*JQRVf7 z|LGOZQp(D-bZOep`~mFXfkvjg4jz;qu{<6NUH(!!Wrmf5k)}#d4oOydGSic)WEz8> z_M<VI{4tJ)G_*qlk4Z9p^F7iwOX&#ZlhTtggO^!3IaV#F#KM42WwV?S zkIjm6(!qfHcb#hTD=W+6C9N$fIV~rrdvZoHG_ZmO&NOls4V2_$B@GPeZmG-JR?dl+ z*afX}ZcfhgR9@;^to8J*LZR*NBKHWuE!|bjNH&= zXxt{my&>F)Ze&FWg}>*%XrPG02jEevc#^0V|5`9k-VZk9UmN+rZOFg*UmNm=*pPj{3_?D#^4t8f{Fr8c zXXN+J>>tt}pe?TgOrU&X<y7;2?xR0;$yZkX5!^@r 
ziA?+V(SLdJZ!13`i`7Vpjb z9z6Yq=6lJjCKi{i2Y<3|)ok*(>Je+mI}G@I#Sh@NHQ5iyYsU^8@gunJ)7LH{SI^z& zxgQ58C$`lhV7Ir|sqLADmQ*IgTdWqNnT0VdO(CbMqhYc>TM?F&qAn7Z1J<(?hN zbIs@gU@+NqAQhZ%TuYprx*2M$4g!WVR0p#Wp57d6bPJ$+dI%MTvsJf5jn%Dy$qwBb zWON%~ym!)Og1M{P`b~xcJ>3q7jjtCrzIshGr*wN%WOWB%x+YwfZesflqQTCG*P+CL zd?2dB&=y(V5fsbnPNaCz6y2F7-l~kc#jCrpj2LzW8;!9UXI4&!Q-K&tdCKT+C|Ml= z%tyaF8{o;2V58+g_v9W_@JX&!9fgwB(Lh8#24r+B8w7{1j-w1Kd-|I3Kw9aU)4iy` zRy_em3Vs{ERqF&;u(~%e>FGpJyl8*jhh_`npTsi!?+Z4%AJ9cHnF>Zh=URc1)hWPy z{8QNgPwo#kIt}QaJb(&5$&FeMM9J!OAmTp=WON1_xcCpI3@eqs<`5vQL~}Zm3dEm3 z{evWIeEdQCZ7KrZ-tn*st%7B%hXRq8S)h2)R6UID3i4uDM!#l*jm`nGU&T3%Q-SFG zmBE{a&P9#Y!-4tS9Kl9-`bev_Yx4^LBc(y)9XJ!sc04ra6;;dV0L6F*yu?>cBC|?%c#IBgr3a0 zMs6^ToK1~fYK_ci%IYbgs+^t*@-zozns^$jhXU<(fQ|kNNV`}7%c<~BMm-Ipe>m#t zeycP5R{Uh6DOCV2Q1|~BY75}pF3QAdh6&Op8>$Ecjca4Hu#%pMVDgHXBQ$u`JOleD^ zDHFUj%uKW-=A|^MZv^e%1Xb%>z{0uqHparXQj}yB~p#{uW5PWjXzr3gkH!CKqM@*L4)b50m;kw6OYnAYAf*DuvC@r2bs0rOMm&ol!2e*qi) z3YfS5HB~$(El%2|FCipUIZqN zIrAg(@+XIt4+g(G=e{N6T|cc{H+X=4k0-3I2j=bnKqIjKBiQIafO-4>q>6+6FV@CO zv5Absen4lcKNU1J0D^xS!qh;>Ftr)T(?P&O zQ-e{asm;N7mjF6bL#UvsEg|^FDNJnz8K$-ddAbd-(A2gl)6`I~(d~fF)b>=+)D95* z;~A!gL58UvL7wgeOr|T4TV%tPV$+PIm(v9YHO!irsp5 zD(G}1>o%-ij8Hjh;Cc^`r=x($^x+t=(U8K8c?@Nk9Sb%(4(OZZI@*&e%=7#O8S|pk zue&my^M9efYgxym#_C?cWTILpfD6WaZx+#`iD0Aq09~Mys9>P`vaVC0`=JJbP6l~e z0Zb-3bP73{t_gEI73DC;`$MkQX~4o9AAmAl9|$%&o!TJB2T?)SGg#Nj^}(ot>q?NP zhX9l5X5@G#q%g--lwtNzu+dq-e2x#JYU5u*)-Me*+udFkeow=mW?Tun}<+ru0mi*lIb8pzdp9I!CQ z36$wN2{u|wZII(SD(E`Jx=ya^Q3KcWK%S<7+~)ktR|5*P)(AE_A4qFBlA5TXwPpxb zTd1klR#HJ8+Sm$@3)rYq8+0Z^1s}Jp3oc*n5QEFt@sO~*7J&S-KBo&Idb$W$xKS)d zndVOb8(jjV`I4Mwsi652Sr^=}mO_k|*|erodJ<)RtJh_a;ObQe=OIER;0QFvsz6`DV)!lfFro`xr^o(^QL{X5(lGy>}@z(&sm zI_qaqQD`0IPQS36x4sfJR?i0dx2AI-biOs6iyBYQ^BYv_`P7HqScL++aRC^&da4kl zfAU{M)yALv{VVihJYn?`U_Q)CX$1B!0~@^@n74lgRfU0u0ioXig_qI%b!Ig*R<8td zu-1SJr{7g5)78~rqt{T&P;z=L6`UW}LGYIX4wCC3V-?&0^7KYv;Z(Z`WtzGfZ1fhO zGj%H!G<6$<@T9yQGECh8^7Kw1&!oG^_$XF}{|Jz3(z{X2>OH^>U&Q>kS6>~S#YR!R z7kb=J?*oPF{(i{S`T(#n)(25$2Ok0(eVAIriu3CcD%inCSr;#xVN=ahbS(;jJXIfq zj5Y8$2%UZcA7%hP+{;4cc|LF^6`-=Ryvj4>|k;NN5Ke`19zg)fa%=z}JBbHm(<0M2s(i pjlK+Y*?ffxX7g3nH8PEjE`P53!D{I8NYY;-hB)#u*)JS$kWFla?=HckYsly0dHJU7$Eu*ggkF)d3`Gt5f!QE3#q(8Ip(%sDf2E_ZqOlV1%#?l&{% zJKucY|9r1=_UbB^L{%-GEo98K1RknYEtWAHlf*g;rF>6sqO(ZNM81@BvIeC_uhK!y z&SJrKXsOdtNW8PhC}a#f;n*h4ns$N-QWk;)C!05ypjd^}BqM!VG`;ysQmZDTTcS#z zCtS%my*;Kv>a?n*bCZ!|6gcYDWK4_BPo+|qy$TomNk&{%t18&ZDE@Q$o%DdVEE2ir z*~e=hZi+<4tSf3rGs9vxBE9v7>kc5TO3Z9~1n3pJ@4Oc2&Sfk2BYp9y=g-{-^kRGC zdZb%p&G+qVa#H2C6uY`?)3L#tUWF*rH<4j38Ln!z9o_(G=tQK@P6dB5X3jAbG6Hmr zBqLzTNG-N{O~xjpTrFBiMoa0`=Ki!+oh>+|(ejaF0DG`4DL5C-=9o44n2qL`SpZX3 zz|^cJtB-_<0~29a)FY89POCYMaT@2ehSOS3>o~3F)JgUAr%PImx58wMOHKig$!FJ^ z88TL2#;ou-l;zA76fzDVJJltp0e1%=Wsva_H{f8>f|0Y8<=H}a&TP*WJ6A6?3P!g{ z6*2)PpRSXMR@@4pn8cun_gT#Yn#$FS9Lr9ozoC=KfI0JN1w5AdGMVCoJXJ^rkS`Y? 
zPh%i&kbQ9hfUJ`<)J5X^$*3^Pv2)q9vI2a;c8q+FLeAU*U~YTh{_C%4Or^jh&eE#8 zibZnvqEZu&rI|4tpsRpq0U;oD_1le(!x}q<&xP#^$T=zp``kpqkV5EcNNPx zS5p0rLKLi9msq!XLk5TMG@cbNE#TI)dbEzy!Hvwii>x`mbyfK+na@mnlS?iZrY+D( zYsFAriW!rKa+~jE?Wk>}&zd%%JE~4DQ6*azGFz&ccaIWq3pkv##QSE(Tg`aK4Hel= z+SJLV@<$eh0L#wiNq6y(fHiY~Sd^ho_`gh-B|_xl0gdqs-I1dHphrq4m#JEHv0##< zR_&}QlDC*nKRW+X$*z=;H7^w5AOfzT7Hd>1Tdd2i_F%Cbn=E6Cb%jf=6pOW7CoAA% zK{e9Y$CN&+{-o3s3v0$om%LTA8myo!yv>pd0zmL~34(VBtstE5Lul)iQN8fFs;KS@4<3tm(ozjiJ>)k zGlaM_PhFil$$*7H8aJ7RBboEAeZ4JIxWHkY4(GIi(-EACe9{%fFE`jL4$GOnZC;`1 z_CwfWy4Uwv;Gm1SC{BiUP>-Rx)uZD$RXIJU)mgN0&>IujcX+xetSIt4l1+}39Ay<# z1@Z;N9n_gf1WhfA&DCKZtngkuLM_Vmv+&DHD^vLYSs5sZN>?_Q>$u9ul}u9fA+n}x zwQ26LJzA^-eUJF<35Dvp%tH6HUE-*gp*3-@#2!t!W{JDzRXSM%00b#vtpEV(iQkj< z9%oMqzWm_CJ#7aYBaw->=-ZLz+YanUddEZkGhYTeEHi!s(!sHhb-x01(arhWknWgQ z6(0op^xhrIk_R%F@wu&sfcETipFp~MR@02bKu6qtQ!i3|k6P>S_Qgh=WZGsFM~4 zR=KmkFYL%eIQ#pv3Oz30d;-`m|$nFW+nt3;epI z)j_WAqSz&Vz$F7mWNJ2~c!>+PV8*4$s*o+P>>qT=hcFn1^I-6#Y_*1J$>nBz8%VfT zC)=Tw5D7}!wy<5^hy5J+5e96G^*Q9Dsupe2qU~Dr5-qyWWNBAghL16BsKVip`}lE& z92CgcH{_g@g<-P8C7)0&cfbnA#C38^Ky&p;Y_5W!xSp9(%ZkHXbkg8D{^Af60|){I zd$t^QG*;fvwVqs%h(z7!t!RsfvgW>60t_si(GLBxJV@WRf7d=&!L z5OEstd@aoLb%^K+Jl_cO+yWtAf#;iHo^L@;@OVr}fJCdug90x(-KsToWXvuDc2XUy z;edpETZ^ahjDUPc8|{yTT7=vD7NL~3t@$pwU5in(yM=rgYz;PH-^1?*o3QW8`=cK) z?Rf6G1GX$_gOVR2Sx0{4ZQplm)ugnR{1`UxQgq(Qrh?leh(F=Gox8l<&QF0EV82`b z*gb4U9gFA5!Y#1X7dt2H)-X6K#QM+5=ZY;tCFt}L}3vIqMmILFJ7`*0g( z!+Ta%_PXSL)sjzj*a#l!iF)#L$d3b@KZK=}@=TcLPkt(ZFvsI9^r2^&4?U-o z=VdKlU^96f{h9j^ZyXY^@PCmO?5`d-TlY#kX0b>jEXtq7j#`YDKeNMy7bVkP3M=?G zNkM4i(pC#I?*KFJ?>hO1Y~I1}?3bn4)*LYJ70JB8ka@>n{{}aDW*)1+@Pn>Hl6i;2 z3jR}4keN51nfEVd-oJJ7AKAQDL$fI+GD!e@$j`(yilgCVOsnus*nV2gd7_>_&=`}5 zS((P+mL+Cd19xTFnAVDka7YHBL+iYWZ~{f^abn!Ds0k;{c#j8CbQs*RsG-A=RnnpX z-{7AvAXkY~E<_k!`~T(8r6Vd#5x#{Kr?Rw6M`U};9s|!6GYdVXjB_mG3fXDz3!D{I8NYY;-hB)#u*9J}Qk;7kb$DojGS7m%Ff({A%{c{buHT z=bP{QpYL@JmsdD6s%r5}K5cHI@KC90v9w{EG}f6f=DK^DI}6Ng&K0wE#$e3oQ978} zS;$*9D|Xrnjdyk%`Ltm*+m^{Prq#>^Da$~DoynQYQLI9%5|KVFn%Z_LtyUA!gwp5A zR?>D)x2ez?tztz>B9e##L#>*KY0>kO$>hba!Nq?P5l7W3@)k^mKfB*f4QMMPk-MIK zyz1eGNTg|VK|`7z5xXAg&DULX5NSp8?6yaMuG)M1)kt@*TzdfNi%&g&)_$NDSiM`2 zZjUwIyT8FsmYP!N>atAR0%Lj(rXwxQ5bpkpKv z0aHe6u?-v379HczdM%nyM2o59w*Hh>naSIXJx>pp}Bg1dR(?C1|yvHGFr2lOVWu@>*j77UxtlCMya`7uxL=QvSW7VMREYILWUb_^aUY+(N%n%E_LW?xlU_zs=>GX60{b- zrSy^76H-J77K*nx^sOqXBYrJ-8-XhD_COT?2;LEb;GIG(NNrHt+)E-VU16y8K4u0- z#C5V(LRHjit?mwy&)?zG<;zzEq*R2Svxw&m_b;x+Z zyB<}q?IYlx7YRv%4DFsSLu0E;#|x?oI#u1 zXJuNf1KrN@+7TP-3zFaBb~3$+JkgT{d3z70qx%7Jb`rYyoOnafsVTKh90E)+J$!?t{=KrT~f3%dDF6Z z&$<;raUA*{RjbVxSkB01Hk-6}K&uiB?B$T4pjG9~>J^xv-WyI(?*pX#Ah;5dI$=RT zN+yh`D4mo&j%C>}yUqvhePF>V$ zAV+sGYzjZ%&<`Gwq4|*FW-Qom85bg}Lbt=B?{MhV7zraqB)C#`lA-$VaOeFYknmxh zegyglnVLdd6uzbVsFxf+hPcLfXG3?XTC`1zF43Y(wdgXFr`ymXd>omf_y)u6nqh_; zl*QLK-0UR55cz~duT@F!013v!Covwb3&aD|vrl11@p zOLhMea#eBN&qg8sb!^>_;saj}d>loV==PPsunRr-z*j*xaMrJ(v-(l`b-@bz4|)>> zsUc(2XZc2e<(m-1Wms+wuzU-GybQ~?11z^d5^z~eNO+c3i>Cl?GP+f(>qwhj25go( zHo)Nr{f-t-;Q;{st~S;i3G@Ts^ZJ2e$|8##`h6|N%yo0=ZD6Or&-wv=-rr~a5K-`B zYA=q8T=gT^n4}CwZ%4A3{@C4|-=S5~;wJhN*qVoC-ib58(-FVB#Fpl#?w00m;03_% z34iKdoKVBVce3;b?C0g~2-`A@hccnPuVkv+29%@jChI;pV(H2-%cl2ZV+03uY5D-} zv@CefOUQ!`{h3O_$2EKe&+26D_&H?5hY05XU+DCgVF!9R0No?y)4bp7??>kUJE|0uNTH*q~@`S+F5j1&W_o1DQ(335gyW>fu z19wbqZUXw~np?Lbo&HQ>UsHXFkS9DL=t;kZ(^fbVI)@(MbS{08(|PnaoX)3Dae6NO zt?OgI3$MxVxiHr$pDrKN&jeWhP(G-i4X`}tMJ(Xwr-tVPEH8L*4?&G*Rp>Q;>$S5d%mYZZbUi!?>3;r51>u&+= zUJhvo4O)uK} ztU4^Ph6}_r%xdA5XJV$nT}k?7Bg8y7_=31$Bi(s$uEgqaUfd?kG}$P;$73co8t!<= zFg#B~qkJ9e@eQUFo}7nq(#0B%oFTmnDU3p{fWs~}mVfMi!gH^wvqt_L{CJZPb(|Ru 
zkGn!foD`H(E~!ZcvA*CXPyMl0v2j8#FT&Y)DbEG?WOyu_P}nJ9#tEEJ^a_{|idr20 zOUTT!i7?`@Q{mU&y6}AjA68Z(lfov!I5^;G$cCOc8E$ko1%AVxI2GUco_IPwYVqZ^ znQLRy;K99L!p?wu;hQbw5SV%3!D{I8NYY;-hB)#u*)JS$kWFla?=HckYsly0dHJU73ekTL9pW1BQ<+6g8|SqKuGY~EagVii)8jPz>J^p?v>t(uG`m0nM_ zl5u*vO@-8HRZHh4BgrT*)T_yu7M-6;r7n2|F7}g*xT;oFuwg3vIekvLUt1Q5-2KdB zH4imKB4gGUHKdtgu^W)ya{aXjkX9vTwml5=vfX!FgLLPzmHUyt@Z@u6?*n?Fy=eo| zt+D2N_cb}Ga#M<3UAF1iU`&rflP-v!tKN&OU7z!ByI!2Na zFlD3`TeCJ}lToe~EhM9*bZSdqTC2_$9MWj{z%hJ1)Rq*S^JjC&ntaGcdyrXxQdXeU ztR}0EM2X`PVOP{6kt$BBIgN1|=d^~?T2AXYt>@H9_4cJpT8+29WQE5H$K$H;dpiNybb#2Y|T zVEQ>nktB>17>j47_qkRA^tMoAE#8$Y8V+e;iqCS%Y@zshIyqlbe9lomH<+i2C7dfM z{>EU$$Vzmn_P0CFl>QNS}Vrz zQp}V*hTD7xYe!`xz1FmT-BERNkt&(85J#(+Uyl-43ox9t#QJ7rt!Av_28(DXZR+G= z`4fu*w6em(7Y~Y7Ge?I-7ixt6$~0NRB`)sQ7`sp$DJlC(=U~bN^w#1LU9ce;2Lb5Mzykay3}e9*2%HSGPX{ax#V)OPRn(&0=^X#APv8z z^jh^Nq=Z;lCRV!St*X^v1(o1!mQ(`(fVWEkyn`zRi4E$ScL_X&BMi3M$IRb|_d6|3 z09DfJ?d}eq&EMq`UA0<74zs%rYMTj>$k~%f=kEr~I$ZJ|EP!??4V9Z1T7x%3h)eT) z)v1#V7#L)36Ac{6+;-ioZJ@#h4&!tZt@tY9})pD7Io@u+pQ7uDj;tq)onefUI_sT1DvKH(Qa=|)of4Kkb8DjqlZv4Q+ zJ#7aYBaw+W>)Vm$+YanUdi#TYGhYHaEHi#1(t)v$u6h~hqMP!!BHb~sDn0=8sl7Xv zBb}|!*oAaTuhsD8^}UdupiO0s>ZT*o)0Kcz5CWSKrxO+eR=Jk*^Tq{Qd^irS1UMdKO>(G)~RuSf+e+JjkD?(5K!2J~vA9UoOLh(_S z=3`j&%ToOHp<#4=2e;Lq2Dg`V{LG2f@v|sCREn?(DbsUjhy~5SeJ9iPbLG13LQe1u zsIY$b1I%y@9Z!#cLaRq7b2>FtX^+79^Y~egGfpeh|Ax>owirzJFO=(kBXZR+-OoWG z!F6oikKzMg41F9$mg)AT(69$R_`pq|8#wEi(OH8S{R(G=od@|Ugs34RGhq2zh~?`L z!WCG)5n{O+Lc9XYH$yDnf)wDfn2_#>R*(CCFBRRQHFRXmE(5kn9c$p&gM3?yr*Z2~ zzN3xyM?$^8t$r_1O54_am)xessJW_zd>8Btc39uT&j&lK@5|etAK)m@RJX&HByCXg zLnQ0TkGyU9ZmpV>){!5>rd*1~J8&j=I^y>ezL~kx+synFcmeRcj80UJpF=LZ-@@$wfKGlPJJ2sf&^=f=$p?M@uaNnFIiUd@e=Xzq5Q+j}eV7-H zWBV?x-ow6-N@qslhoPKkgx@@stX^<{9QS&+6nkS;OaXA`hECbFbk| zK?2tLFVZUgl`-Q|SK0uJl@U=;{wy}gV!ZsBofNzvne}2wyT3`=L3@_AT9|1Em}!64 z$v*PPOX|Dv6DJCpwK77Vcw={~Qu*81GrB(PQY&)&yJf0iS7!!!8m&W0iWnWqYcV#J; z*7A99%muMR>%4hzibU&iUfi*$31`H3kB3Th7~HWip~I0?(xCz0U>acwc?joPbi|P} zghwHTQOFN)v_(gx;a74)IuYOa`GZcvM=id@G4pM7GCX+q?Q{y<^Q1DzItZpt zg%=WX({L(EO2(5_TdPJoVp-xY*>JO@FlQ%HWW}~OV#VGS zvG?A4@4a`i_g(`Jey$pYQVUzH{cxoS8W@b7t<{9Nr&M@kMFfxmkBnx@)4ms61ZK@t}=( zwV69B6W84;i_!Yzk$R2tSb0&iUQ_AriSm-lBIJ(JJt`_HcK;iXh4NTl_e_j0Zpi6g zFou6d_xAN#M)yfC96u!%d+hVKI=$LE7VA5&aUXJZw~|Z9H=lRbkL2RgEylhE-s}Dc zPa-d#vhRE3@0NbK`Rm}(xdn%kPblg0)a$)6niGx}Q<==6Hi=}r0~PN>e*nN(c&M~7qOv1Yx#(i>Qx&zxPI(*yE)Lp$0AJ+Qo} zDP7U52l;w%y0QZ-);1Wukrz4K&P(9!NeyPZ5uvL$rmHvc;O9-19uoRF;RUX(9jS)~ zuH8)OVTtk%p=&o^#kEFn5jYXe;-UZfaCZ2xH+;BO_;3yJ=@d|mQpjg+Cb*a%? 
zRuuGBzTP_BRLSDOs?o#qdYfP^*4NI-RO=DW({_|Wq4-q0!s2>cIC{Ih-ae>M1^=e> z$dxL<9_oxBpydTSKDwX@Qr>8*|h zJ;ochaC{={^S=&Wml-`4gBs`S@#$1jPrww}ofF5tuscTY!a3QMtxQyUx3HB-;hgL_ zLhl~TNx9N{xH*}xC$BswV~nmyFHGt^6BAv}%8T5fO|Gss9p4-4D`!&mv*LQnvGDzg z&pmU_(Y-3Li1t!i*)>$t*r=yYEDJ4UjNY4F-p3oAYS#Ov?Lurw)o1j66-{YxBafc0 z%y#;byJ3BYm}Z5n)TW= z>s;VgrjvE8(OKG?=}p6s>e3!ZVuM^wU1Q4VdON0uyl!-3nw8YrjY-&ePH7W1-qdE} z?QH{nh|#k#ra8Wz%RbC=V>*;(=PR8L%^ns`ea%R{Aej2Yl|I5v{V09p%2OXR`Y3G2 zXhjBt{eDb1=(2#jguHBlKHBI4JK%c;PG}CDz;4&q>SL@27UuP_&I89K_3>@SmC20y zVO%+ZYxqPCC(kJ@q}y^u0#kM|<~s zi!k*Mw3>R8(+}FIe<-gXc2oaIQa_58?fLu|TYg;WC)hF$=O?}H{~rhGQ${~s;qkD1 zCa<4OcRMQZfvULL6)0U6&_meD`hheDwD5*cjpzQ(m2?zbD(w}kAJqr4B4th>F=6^k) zR`+y{hX%Wayhh0EBcJX8#q<|Oe~B>nm9M|1kG^S%9iaaW@7AdF?d zl=pD!=~s5?H>H=cO9)lVxtVcx#$p`4USag_(Z>ISW&c$AudwXjwk$6@TyMgOFj53e zH%l={Ne7@WB^EnUkxEJ@a5%Bjnd#2W(gm!fD=?U0iBs|4oMh>S9vE2zSW%EQLB4bc zrc=$*gIvkwQd3)3S9M6Lt|=o7?~(Jp{p=k$9qa|^1)aTHRAJRiZ%Qq9KUoVhZW|AJ z*d)>iLSFg;6W-EFT)(m3?DpX0Iou<%Hq^m3k#&$vN0FpfxWPmNWp)DA`Ysf%ItDFyc8N?!d{Nt=@p9&xFNZSTtD{5_sI9Wa@Z2`pk7~`upB&VN&X%3{oC{&wgTM!(aYwNll$)U#EM?BsqMxPV7u=s&sVeap24owiF+(H3O11@ZhPoDHJ1YK@Awjl>MI$4DEo-x7 z_sb4!5IxxutYjx(u(f3r6|EzI?2H^Eqk%3S$QTwv?J}^EvB0qQI4W2>iVfqDV`Ksl zv0)dGl3jtmOtiQg74Xs|uv^EwGY#wIU?qD1!*x6v$t;)ae{-Q#pnW5I0xPmyJr&I| z1?0(U8 zgKrI~U0ES6`=TtC^M1A>-)$L07|6pQ64>3J7TbGaTK{8`x1_-g( z1Nl-5RYW1xC4MhsDV8jQu4nJ|U z%y(UBW&>#X7_gFsK(<^|kYlMp04d=GR*r*Y4x*!09w8P ztmHx7*ujvOOb0O5`+ zK}xOy`f|0!Yp8JUxE2DVQquZp1j`egwR40&*w+z87? zZUVOHrkmL?dUOj|$*n;42&?5bDmdq~D7Qm0atAPS(VZ-Sns5e@%YNJgFmM#F!e1yJ(^u#y*nVa=DQU`@NWUWR1k6(ENH zDoDv|Kwn0Jb*uBV?NT39RI2U})+WDro9gB(gOCnEDL_Q_DcUEC;4D z&9Z`QPkMX3bkiK^2?57@FHp2( zdqZGTYk`&Y0kSEC#J*IpskNEsJ}SvNklc4A=?4J@*9CFGuE&yY4E-U{@cLjS8&Czq zT?#UQ3hae}ZqLGnG5$xs{m&?BkE0EdV`LyO>h2&2v^p59WFuf`bz>@=Ra)cI7Rx3m zX=GC%KbXl7+6cbe)(U=XNwG4%fF>VI(Wf(A>3rgFYivNIL<1`vJsb$&y>eK;F@+|MvF z8X6;GfM}`=cBKm8J8Cp&qP_6FY=KNf4I{e& zqvbG(Mqqz;u#$3MWPcB;+8R{QPRrtOvP?z|BNf2N{+=`f`%}P5_5w!s6;=3diT;1J zc=@6}cs1T}_=)e67d||=v@iILy>FjS9`b4V(|vo%RMarCH!!ll4~@Y7zF;N$0VDhT zh0;#;{>-&81{->7IRK)QhRcDiq;2G&R?-Mbwvx7$X^^lysz98kgTbv2mlS07Bn?(F zom$RRL8__XqRSxR4wo9FG4nG(zIZ^4C^}qfA=6YAtYjuIG*w3hP1Pge4wnX`VX6`2 z%Pe4PQyMZ&30O%E7@9Iv&{Pu=?nF2QX_%S~@?{RNwW+y~X=)x=$)Ui|)O;#vDvyLa zrw&6JrWSyFIULBt}LcG<_V%m*atHZyF}-1f*PSJ&|dcJqfI25zw0DwmO+AEc@uv==QQ(=IzO- zXCJ3uRhCnbW8_pIKOme2ZZ+qpQ$&x>04q5Y7!LF-Dmc)ynYYqF&p{3bdM?P9^MHJt zKcAfT>Ya}-fb4vHA<{`%3~cS=iy*VC2FVrY{Hias@E$ z8Tj~0q@0hhVj5jFY$FTz3dMW=7nN9rxR`Msc97;j{ zqJs1NHxhOw<0XR^Q7qyjkT1nRbU8ZcIzXl=emqgq5g3~4LEnNc*=bRvn+hLUHL1JuZ+vKyK&uL^>(GfUSqy z8#4R278v(?YB5~i@2OxP*JhsWz$BBYA1muXj>K`&43!D{I8NYY;-hB)#u*)JS$kWFlvca+fA<6DaBHp;5F!Y?(xqD}s3-__-%v@oC z38{8zEtFnlWkqC`JuORnSY)P&RF~s_o@9Z}6X~S-EY?Ee8yNNMUmI4PSlQWkiTZL35BE4EPwc`p>ttO%grPt%F zq@A8_Qz124#fpWANFoXZwQ3@!MHeTN$;)1bi~UJNTve;c+prY=Iekv5Ut1N4-1GFK zRSz~qBI7m}G{p3X*o}y{-f;b4#EPcbZ4UunzVFWK5cjNFdkFFQC!Rh3AmByzwk?P| zV~zJ6Y;cmLsua4qY}2tpnI45GGq;lwE*Yt6)g9goQrC%wLNyiqlQy%Cp^#A^VV#w=Y%HsyzRaaV|Lx;w6{aWTwe@ zVKSzKN1?1{fuN8HV6oF(at2U$fT0XBQKAMjm^5!>ZDn;PzbokY%{TFM&KOL*QkZxK zGx1hA5SM_7b#j)vOng2O6-qgFHj`4;Kp@zTk?U5-*~{lP&u^YLr)5s)Qj$rCjB~Wg zu0nyFyR4P-F`X31s9IBu9VsoAH*zMKnJlKPdJC<1o=aw_)~$`=NGP^MC#_}E zc?A|tp3ZH4thJ-CkzQ*?zwW3yxmcA{S&DBfScs1iwid>4QWELwk+hPLP8ck?os_AQ zOXN>1&u5>1-^Jgz5BQX|)H=a%{4aIct?mE*H+aLMN-?TR}rwgWpnmt=d!4QY>s6 znoHKIR-F}ehBsPLPXGhFNix8jxlrID=yTpB>?`a!n9GN(=V0fW&frBWrISujFsS5d zRB$wl-A%8yf(mCig5yYzbsR@=6t%<*l9%i2^@r6=-8QdRbou$a4n?o+wIDbz<(xPS z9-JOQW2=V~II0}aYju`w8}Qx<%XfI9D6A{;8e%0zOHq(Hr}cxDJ=WLJ z7A3j7fpb~kuhrlTfbr$vsFv+xshN`;m;b+U95hO$E0fK3TxDd7CMozPv}S76DIU+= 
zTC4*jllb!sg|52N!r--C;;7b@S_6++oZ&>^7J1-arIU@Ie^4$qas9#O{>a)h%#jr$ z{r<`O+m6&nB9m{?cOmB54j)3ic4?nM>x0l+5@ z>|Tx7qR-lkIIaGfoksz?_qvZE?wi*z>lomuyKm}2)Yo2k@3H#9N89$IoynWF%|_hK zV3aK`*{W)_`2x)u+015>xcyocpLgE@6$~a@HrsAHu7X_+b_%j&JKE`#Wr4$t!)uL@$v|Wo{tVNfatR72F_z)69Zw@EhhliPL&^KS* zWOI@hCdh7=d_=X}ek+^}*UQ-eGuKCP<_gl`W2i|rYZ?nNK7(ucO(SR$5dI3V4<`4I zBl~Et1vNZ|b3sXozajJ*L!XuRPe9m9GJf*Z%J?Z{58Vp;462WFN{O6%LL{gLZdIAA zpDvYkFH%BeKoj-{Bp`-s`1bVZC$)MwmE-hKGd|AFpTW;^JL9qv`ELxp#wmo!{@GI5 zZ$hdnCi@&@5?n*eeu5bITBqo$QqSfM&fme!d)#^IZW|slmsg4bB zQbN9=#Z!1%K)$Js_1}bMf!q99pqR3)#V)yBi&1l3Gx-+i8SKQqjh_#8V&9RsN#Dh{ zyi(l(yOxwe$@dU8lka=m_kCI=DQ+S^fX%y);+IJL z-y8~oAATkK;Xz~t%=)mdoWwDK`OtC=k^@k)d=(4qULKK zN9@0Adeb<-hu7S`74h7s5(mcBmoRzQXM&OV2psIfq0?OQD8pv*Ylid4V+>o!Zx~)c z4tX*9xa^bPGG<;-KT*CY{4PZD`|?HM40iKl*h8iUOX7Vbna$qUIT$ zJS$8194_Q(_a`1Te5hywU;kP1(q9-cy1LvRd#KD1$tS#8`pga^o|n{mAtc>jCFx+i zOIgiKwZlxczv<-fvT8>{i(ix$TMIz7mn78&eAR~E9S62~Di76x_@UTQNws4k>HZ-} z$5dO)RQo4W?O!_ix2)RB!D5QXPttn$j9;#46yJi2{fbR1@RzXew35?!VL)SyA(n0$ zhg(*?X%*a+lx$kf*THcbCCWn9bM_}$?eLsUQnn$? zr(D4k0*d}5{`&#@ARWu)@&^mrDEN7f7zdAKiw!+ZCLGTQ`5c19qNv5;f2PbVod9oK zdOG|KHbrcQ!d@$@kVv6t!26lo=|m)hrbH*fjZV*mzf!o;$@q(3XXq4s)Z)t>GuK9^ z!h?6;PN%^=FFp&bBcSSZ_(8JV3|xww67fXE4(vF4xDPpqTof2-3!GF{8K2pm*~eWL*kutEL_md$zy`}gv_9BH6meoi;9{@ac4y}U8Wlzh}9u}EtDwbs_YKB>9J}Qk;7kcRToqNuG472P~@~he3Z~phr z{mys4_xGK%+bbLzRke5~pEfsBxT#dNSlX~n8tcjzb3MJyT?J+~=ZaZ7V=!j)DxJ*i zD&#Gj6}xPO#=ClqeA=*@ZOddC(`x37lx4ud&g9G$$X20MiAbLoP3>4stJOp_q4c@D zm9*X4V=A;pt5~@>5lKXWpjJ)9wCFj>Wb)!y;KRQX5l7W3@)j(GKfB*f4QQ((k-MIL zwCcgeNM!t$f`*u`i(QX+^L5uALab<>+wlKS4;}1TKpSS>gh#1Xc^I5m+lw5m+a1l)!p{c2bM3?C(z%wJJCI=tPH}4AGp+ zY&O&M6zNcIVwXZ$!^2delfa*o9eOHIcY+%YIweF6PBK~E$Xd#pOn!aVT$(L(ZCq*O zjrAr|=xMNcs!peoIPv{79W{;j5ztf>wu=ns1a4eIr>BEidAcc(Ogzo#8J-Ex6pSGN zE|CE+g9pIYaGETE0MO}7b-8?fA}W=#t!yTxtc5VOY$MmBP<6%pmPIWK=C#fXe3WDo z;&zr+*RowdfHYo)U^ovwp-`R(#D ze3#NkYEMYRBiPj5n$YIvA_Ut4H@8VLMSZhL)mjEmk--*(B3+^Od6`F z)mlBBqPoA`p`9vOC~KP4V=&8XmW7?)$@2aV(9CdX3fr|+Ou?K%yR>?Dg=Ci&MGJzk z8x-^_`Fd1v43FK-ueE}TU|1(`l)!p{qXo)ZLj2?v`nvsL4Oh3r?G+g>e>b4$wS5GF z^CH1Xz|g_z5;V2DI8mS~@T_)w`L;p#nY4VTD~iIqqNpJo6JtVAl0`|NR^ZsuDu%n< zkY$fg9cYV^T+RzFxdE*PClt(A8wbe(^nD1Go{;0{|Bd6IQ7YY;Y_{`qBU>~n^GrzQ zXw@kZ&z2VJ#K^S0`K1V5bv41@z1*RjRMMq2ikQU-QwHv45xBiN-2(di<>CsVKiJ$G zsJD)ApoU1lf66@_FE&IXQ*P3CA?7*`9YnnCf&RHK0oJ7_ZABcM@W}d?0hiyHy9IG~ zTSa^j@X7tV*C4j)v-cuSYj|eoVZfff&SQxC7BtR20yz548+sA-b!Xpwq+#g6duh?i zwxw@bQ`%6H#AtOL*ZSUvMmwd?Y zkS=O9P{g|#4vz10==+Z<;rx-}mNeKp8W$p|LU)48uXgA)mZA_NwPc(5^8){m|X27VXfYOSS0vT6CGo>v6~lA4Ov5&B0{5 zc7(|Wee<++010NpC&JkP?e039x%_naBx+L4o5o^{PyaLWrr|dU z2!93G`;+^pkbSg|Kn;)KTu@TtKOK0Cq0h_v9tiu8jGsBNGVVq8(5w zg&hk08lj zo81O%r#d%g@@aaj7Ej@60sXc%&U+G=1-|3W0>zX?mN@jgT8x?NTj=*dPk$$N8@}G( ziG3ff;OEp2@TsU&`(W3SG8p|K!WMeFyM6zWR!NJS=^e0n53T$$u7pTO``sz_J9oMJ zou2?L82s+=tM0)CHGCRw_oKnRlG`eYayunaZUZEi$Z~HP){>Qc)kyC_blAb|hx457 z46|%{9}ZP;QkkYd#ciAgzw-w4Gl$->i`pa+t{wiSl z2g^4Je=G4268|TMLg0r3VLv>KtbkcR)|KNpCJ1@|5syi5gFT#$OWgLT#{~AJAjxxk z4)1D$d9JPH>cLf$?rKcH>m$mz9>8yAbF~MQTStkQ2F;vV4U%wFY(qB0-y1LvRd$`Py$tS&9de09d z4u#bETR^(Mhopn?E=3k{)n4SPy`SDi79S2RCT*bFk&tTt@KhUlcO2a2syti= z5{F{{45{|7fOP*3Nyk-N!c}_(RbzO-q_Ze|hZJNL_>1R0R*ARnaOjPNUfpIfLF|=o z78eY?>dmUc468Xqtl_K%E_oSewQyBZyO|=^!MPeF4Xbn4!HE?ch3n!rVW!FI@fr`X z*l4)o`NGB^Y0wc25AaZ_lHybvTtLM?62n={8p?B#At$-Xd!GvwYu1R1$$f+uXj5n7 z_UG($|@vM*s1Uw0(Ac%otlww^uGsV>T- zlo)9j&jrd8Yl&mU?=q<+9`Aa|N1j_rK5h|UGhIb|m#I<_*JZQtb_3dCgKq?^2v)KA ERs0kyq5uE@ diff --git a/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle 
b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle deleted file mode 100644 index 0ff7f6c68f34e468b08a9516076f0e07072a8357..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7278 zcmd5>3!D{I8NYY;-hB)#u*)JS$kWFlvca+fA<6DaBHp;5F!Y?(xqD}s3-__-%v@oC z38{8zEtFnlWkqC`JuORnSY)P&RF~s_o@9Z}6X~S-EY?Ee8yNNMUmI4PSlQWkiTZL35BE4EPwc`p>ttO%grPt%F zq@A8_Qz124#fpWANFoXZwQ3@!MHeTN$;)1bi~UJNTve;c+prY=Iekv5Ut1N4-1GFK zRSz~qBI7m}G{p3X*o}y{-f;b4#EPcbZ4UunzVFWK5cjNFdkFFQC!Rh3AmByzwk?P| zV~zJ6Y;cmLsua4qY}2tpnI45GGq;lwE*Yt6)g9goQrC%wLNyiqlQy%Cp^#A^VV#w=Y%HsyzRaaV|Lx;w6{aWTwe@ zVKSzKN1?1{fuN8HV6oF(at2U$fT0XBQKAMjm^5!>ZDn;PzbokY%{TFM&KOL*QkZxK zGx1hA5SM_7b#j)vOng2O6-qgFHj`4;Kp@zTk?U5-*~{lP&u^YLr)5s)Qj$rCjB~Wg zu0nyFyR4P-F`X31s9IBu9VsoAH*zMKnJlKPdJC<1o=aw_)~$`=NGP^MC#_}E zc?A|tp3ZH4thJ-CkzQ*?zwW3yxmcA{S&DBfScs1iwid>4QWELwk+hPLP8ck?os_AQ zOXN>1&u5>1-^Jgz5BQX|)H=a%{4aIct?mE*H+aLMN-?TR}rwgWpnmt=d!4QY>s6 znoHKIR-F}ehBsPLPXGhFNix8jxlrID=yTpB>?`a!n9GN(=V0fW&frBWrISujFsS5d zRB$wl-A%8yf(mCig5yYzbsR@=6t%<*l9%i2^@r6=-8QdRbou$a4n?o+wIDbz<(xPS z9-JOQW2=V~II0}aYju`w8}Qx<%XfI9D6A{;8e%0zOHq(Hr}cxDJ=WLJ z7A3j7fpb~kuhrlTfbr$vsFv+xshN`;m;b+U95hO$E0fK3TxDd7CMozPv}S76DIU+= zTC4*jllb!sg|52N!r--C;;7b@S_6++oZ&>^7J1-arIU@Ie^4$qas9#O{>a)h%#jr$ z{r<`O+m6&nB9m{?cOmB54j)3ic4?nM>x0l+5@ z>|Tx7qR-lkIIaGfoksz?_qvZE?wi*z>lomuyKm}2)Yo2k@3H#9N89$IoynWF%|_hK zV3aK`*{W)_`2x)u+015>xcyocpLgE@6$~a@HrsAHu7X_+b_%j&JKE`#Wr4$t!)uL@$v|Wo{tVNfatR72F_z)69Zw@EhhliPL&^KS* zWOI@hCdh7=d_=X}ek+^}*UQ-eGuKCP<_gl`W2i|rYZ?nNK7(ucO(SR$5dI3V4<`4I zBl~Et1vNZ|b3sXozajJ*L!XuRPe9m9GJf*Z%J?Z{58Vp;462WFN{O6%LL{gLZdIAA zpDvYkFH%BeKoj-{Bp`-s`1bVZC$)MwmE-hKGd|AFpTW;^JL9qv`ELxp#wmo!{@GI5 zZ$hdnCi@&@5?n*eeu5bITBqo$QqSfM&fme!d)#^IZW|slmsg4bB zQbN9=#Z!1%K)$Js_1}bMf!q99pqR3)#V)yBi&1l3Gx-+i8SKQqjh_#8V&9RsN#Dh{ zyi(l(yOxwe$@dU8lka=m_kCI=DQ+S^fX%y);+IJL z-y8~oAATkK;Xz~t%=)mdoWwDK`OtC=k^@k)d=(4qULKK zN9@0Adeb<-hu7S`74h7s5(mcBmoRzQXM&OV2psIfq0?OQD8pv*Ylid4V+>o!Zx~)c z4tX*9xa^bPGG<;-KT*CY{4PZD`|?HM40iKl*h8iUOX7Vbna$qUIT$ zJS$8194_Q(_a`1Te5hywU;kP1(q9-cy1LvRd#KD1$tS#8`pga^o|n{mAtc>jCFx+i zOIgiKwZlxczv<-fvT8>{i(ix$TMIz7mn78&eAR~E9S62~Di76x_@UTQNws4k>HZ-} z$5dO)RQo4W?O!_ix2)RB!D5QXPttn$j9;#46yJi2{fbR1@RzXew35?!VL)SyA(n0$ zhg(*?X%*a+lx$kf*THcbCCWn9bM_}$?eLsUQnn$? zr(D4k0*d}5{`&#@ARWu)@&^mrDEN7f7zdAKiw!+ZCLGTQ`5c19qNv5;f2PbVod9oK zdOG|KHbrcQ!d@$@kVv6t!26lo=|m)hrbH*fjZV*mzf!o;$@q(3XXq4s)Z)t>GuK9^ z!h?6;PN%^=FFp&bBcSSZ_(8JV3|xww67fXE4(vF4xDPpqTof2-349z?8Q;zB=9n~Xlcu!LQrZIT^a@Ry)>@7vrBK4!rL>*mHOATOq_gcFd2eQE zQaT3IBnB+htD>UTiYT5a3La=h1c9n3N(+cZRFFdjt3X;0{JuBu&9U7yDbgQKe!u{oaDmfli!sjFsiHl!>gZ;!qwyc!pOn4RktN|-Iq^u2v?s@vr z@`vg|p~>6w3Sz1{dL!bkH(Y-Rv8-WH+kU`H_uhFO;_kH@46M5y?waLj4&DnI1 zo+76RhcXj86w-PYrV^P3{+w=;Q-Qhz+^CTm0cvoPPIFqul-8$nn=<;!Oulo=8ZD=7 z(y2sFgO_KjWR?*#JU`7wO=JBAXex8tg+_A%Gp?qR)4{AP-6TjRmS*G(*Mw(s#t;CP zhya+w0$_VEO)xlj(MvKpV3u@IjqY%PpoB?F`vA!&q> zPA#^RC_PqES4n&D6Bl!9`Ll?dz|AJ{X@Nzbj+bB|NQ+r{((4J8yh&Cf`J7JHC=qLG zo+NZ@03-jBfM#ns`*VGH?yw!nSrc2Cvo19*@SJ6tWF2$XdYfzz&Qese5x(WM%ggXv zQom7kLK>cdP3_G#d5dh+7+&9cs}bl}V1Ty;4Dfa?6qfXn>^YLlN9{IfZ`HWWHB=;3 z=qB%Amg=y{JB6iy3gS&vH(yxui0*VFI>p@u5lwVi3AZWXl}h-0CAV-j~ug*(@7Ng}i3BMoqmzv?O*V8q*f=Nya8w95l>A5+)>)Q)-+y z1iL&hx7{isAh_3qC<;D-WnIr}?HY+QtmZh5V-3gg9EC|u&p%vWr?ahR>b5yu&T{=! 
zK+&uE4M?ktI42Gxrl0cQi zfvcI1r9C4SY&I{UEsCm8FXz%Tq*UT;g_Xv_5qKL+ED*j|1RS^h|Hg69;-#*1Cev}b zmMQ4Oc1>t3R4S4@p5LWJJ1{c)+{x=omZ^- z(BCU`16+Twxw{k?9OF^|BK?6G_qDxP8w$<1Mcs**Z98-j@s0-v7QF;motm~Cad^rj zn_dQ7eRK9U#9b@OV#9z>?%TBPiq z``#n9BQF_N7R+=`H%+#5cpn&L$R=0IN>wgTvsxy-O()k3Ddl_}@qVbiuwr0qh8;)M z-fO{5UY1;kb~<5MVic#sPG1mXB9(~q5BNv|asELk&Oa3J^M|>g2Zxl`(QU>MF2%4? z2g185X_-90KVtM55Yh#u5{h^i#gTWHO+I>531^Qar=-E6SGy2NC2~Eu{9`uxIOf4P zo(B$>PZ%RDG=fWv8-T+nRq`pEj=W~}X*6Gyjb~(cpqV+HEMV);ATbON{+!w~#+-u2 z{@Ui0l`t?5K5LU3W#gJ5!yojY3kE$5g*TzvUetdcHK|~|VL67OcZ|I^c)b9kTmtsq z(&7uqKHP6WsgB~{QdG%r_C3Z3XI1=*5ZwV8zjR_{{4%nKriAST)yF}qNY1bLNKg%2 z=`&fsS}g0=kP;#TdaXOb0Wln7x2Hotz16{)9B2D_?ooEW1wYH}jIR~R|8?JE95$Hj z-zb*-R-`ItvM)v^-Z8rD$B2P%`aX^oD>Jn@4$?y zv4IF;#Y^Mw`bfS9^Joc@JA5SHhk3OG$q#%adtsh+NOY)beM%MXX*d<(LZqYp9^ebIUpNc0Uji){ z{K4R-euXbovZ1%tiw5URtosCt^`1bnP7zon%dbnY7Ocdl#(E8+!xHl$*l_AfQ_~{* zadv{O)D$^@>r4~wvp(~%O&*br;N}&3g1f4s!8{7J^fv}JiN{p(+h715^qKzg(oMo! zG5ijRU(K!+_~D75AAXOlfY|`nmE+jQ2zdVwE))L_z-Trua@!wWCa`b@NuJw%cxOF~ zZ7Z6teH?M&@hR9Bhh4Eo@+XE(!;h1{4L{uEVH# zkwwi*DtS35;V{0))9x^j8a`Asg0GJRy!1~-jIJ)V2{2k_h~yJqExu;ECI1Sj^>3ea z{|QJ3Q(V$m%2ay=Rin5~q|z|_1{9=ac*oU|M)1^b5E{K+ZdIq_Ik8*V=>*QuotG%?{rMQyGKN^;X&s!hB2Op5Sy5%D_53~97KD_dlb!cq z!;PMV@5L;G>N=f*=eYkyr@|S_CVDcG2K;gg-oa|a>V8qR35{{9kWMSjMFgY>WN{7D z>|mvwPG>^xH(2?G*g2J5`vyxogR?)*qKNy20cB6~^C?vqg@B@siGT87yQDL@TwZj| z68s!T%!bR7HHn@cBs_x=@^J-gN-kw~I*;W-4V zjAR(G^Wa3K^Wirbu?z5y7qJWRQi-)&dbW)&f(z$-C2fFnk-VERzd_rx00PEa3^y3F zcr0Evh`nhycOwUpqXI*%`~gR?=Pqy}zb|rJ;PJjM_=x>0!H3s`Mn@6PYfVBC$7@UQ Qaxd;kCd diff --git a/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle b/pandas/tests/io/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle deleted file mode 100644 index 5f27dd2ff056dea94b1e1d2fa458e3e517aa2a84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10049 zcmc&)2Xs_L7v6-vC>_B{v8@F}K*b)4qC_GNmRQ!fOEzql6z<&@DY7E!z8Jf|y<_ja zSL_XY@4a`iVgJ8x-n>T$U^&0%{F`&W@6OzrJ9pZhnFoioD@ZI+*&!B-O;0ss>ZGI~ zr8Zep*O>C8-4tY3r}9ImG-_t3EfSx^f^liT!93}(l)ART%V=^hx4S9`OjXX4VMvK0Nkpc9tLs1511D9$?$)Lcc!A8~&UD`(0S=yzTC+ntL#wOsd{n2m! 
z*em|p1r%%VSl07oeR!DgWdnGawZpU_eb~sz#%_#;WP78RLlbrsM$0Ca$H*}BZd249 z?#qaDCMhG)Lpw`xOuL;O*^FJQcStsmOyvy462&Xeuasic&Jqd1z~9%E#z z8^v*U6n98u`(?Qsvd9bhEo`j@uGrUb?s8!}RDD|#~;C4-7JTq08=SF)^K zYb3|oor^LpyLQW1H>z;P{leuNa=45`4wrMtQs+s1YUR6_DS66jls;(&`2alh-IPkV| z<<>=})xw3jHV&m3P2~_PS5?Ef%*!&S9-5THI2U%mAI@I|?iJXkU(L716wWMr$+qp=cPv7M;j=we=adLXYK&JdSM~Z{!5F5iiPr zTmYCjN=^&{z)40>tUG40QT+-5%u4tA^ zeYuP~=W@(e=$2bIL5vO7&FCQBUcsThl0$u!k*hh>or-czcAJ*5qk@GJV%xwM2F$hr zu78&bP|LJM7}3VaAoYd6o=m&e1{a06Yq(a247p`<7`ho{&`xh0tGTaDbd{BVCe z=CwUHwA|szozZ5#ize?jvcQ?VCwm*pbz-@DJ-IL4EcY9Ez?TPod5D$lP&~{^j~IE> z4aH;s+fXd@dR}sydHJt4OV>9$Xl$4E%Y`!gB{TN_xI;DsI^8}zylo*E|YgW zd3U*;e2?wFZ{!2Ek4gQ|&H(GnN1l8fb?_6G`_#y1uH5IgT%6^;@Z?K$?kgi-`|^!1 zi=xhb%SzuF`JRX#Zy;zc{CUUE=(2p8Os;{|Cxx z3}=JU5@1kTOCj0{oK9_rl&9^1>1OQ!GTIU7YbT4HsmLU?3pk9i8b^YryMm3bM3o&y zU6~5pue2M5O;VnAhfq<}9w1+P0_hwBwf15W80if*+6U;2^reDER-rHqM*2a3k^UfG zR|Q5!R$~zuSsiS24WKi!CKWWY7KIiV82|xB27-JY1dNOfW)T=!8*FqPpfj>A6*RIQ zg(G2PeF!j;0QtHBFr8`E4at?lVmcvIyset=f0cl84UZ+J>ohnWdgPItn$> z_?DogZUw~5^Z3-QDPz310UO;G$nox8)a|GUo(L01>Gnw3_^M^L;AmTLIcKVlp@zq* zj%A(TWYuwyecb`*>5f2z;+g;U5gUsQ>iNkrKjFxoetzL~@`4e)Hu@Rd?ZJy@k&}CF z{m9Qfb-Z; zHsg(1Q^@f&4MgCY0y0_+^fhC#h6;?*RB*WPvXo)H7Hl*JbPI19#2nYx|8t4dp?y#5 zffYF}kBVk(0QuSo#LG&jlXEt<^uh?Z*uo207%BuMayGC8_jlf7Rgu}0&bG_BXpl#` zpdK)hvq6AEnsyn?L%*pS^d084~$pbmFgTKrt>P{`h9#uUkMdor4xVEdUX(4+i-< z7sw@X2sqq0^C+VShk}hB24oMgaSo^AU)?xGSoG8fE_dTJvq5-r1lVW`&@KHVsaR^` z9EBWDj|N5?=NJ}3?PI}4j{~~e$5XM?#yJ5wp8f}jjdLQ%=t)3dPqugp6&R&c!Qnig zMj6&m2OB*D=;rxMD*mtYd=@tzuNHy1c1J?8}}ercWr+m`G&JjTSP7$ zI%M>V;N2G7dmMTGq&?pxfA!9%8@~kJEU_h z1Pnfu)Ys0;RO`b?FLfmfzJK7Of&22IkDvsG?NJavR_J38lDZHWU8f(X4n2MXZ1hR0 zFm~;W`VjqlUHEMNPsjJcm2Ruy$G$~wxqtA`zTlU4zhw@2{SV6*_U)-(puDGF z0;49rLV`_v4L14>Flu5ERh;8*nQOxhK2&b1-ys>MhU@pOQX}+-R;iKtW2@9=x)>>} zke@)Dsh`2oLi+^?y74R6=x|cS7#ZoxA~3QN*yzeYXQUeyG}4^{pIkj4z(`M!uf2eg zk=`rkk1&Rt5RG8j$a{)ye7Xgq+)>Yang+=$a7las)<0 zH-I{{H4toc5LFmDTx$nY!M4_B9``6le;vqZZe5VC>jCLAKW^*#NYG9KY;*%4?Le_1 z6|}Pv1Wz}nCaFV6;YJ(EBB;9w*yu2-0v|S|f^~;8&qh^z0M`*nxx{Q8i3B{`3`A$_ z&fOdd8r%YGbQF*V+ZA<7Drj&k=2;u~2(DWrWsgZOtC@^$!z91;;Ip`Hiv;X%2jaTh z9vnGdh6LLg4K_N4Dzt;3G?t3i`#0RDScdl3mUSFzc)9~H>cx&U0_)?!Mt1@_>pN4? z+B(up{lawEicUa|r{zF?6W3iJB(hCed;!;qkbSMNs-*5}HEtLuAwdUr0~<9|p?m0{ zual|z`vI^=neL7np6&sRI=Cl|!2Vufqx=RC+25P0)?Ey{f_(c6|1lxFXX`%Dc)Bl; z)3hHrTH{G1*v|f7Jie%6KZ<$)68uS885V(&8nDr+KxZUN z1&!2FV0g$ufRSk+U+aL}l=Wo3R@|m+K-zA~MhHor4oqY>&JKX{LY12XrHp3tV51&O z!S_;>W&BxvVYFdh;>+QD8uo{E38)J{MRJT!o4>O_#Q zCjq(cP6mflcM4^+cq-WFX~1aePNxcOMpG9q?_dgI%bf3_9`)>RMQJ?)B|JS7$f-Mv z?YOBsn=;%v2W<3Qpc}~fRB#~AW1btx^C6>m7l3@d5Xh;!h>Y7~t4H|7$SUY1z(j7D zOJndPiBEQ4?Jh+r+!QafRW7$xHbX}$vh%{1_!Y=Z>XpE7+ODDuudfChy$0x}?OH1S z(X?HM8lGMcM8|Fb`FbOe<8%`^oVJ@Oqs3dmMsEeW7H^}1<8(Xo#)nf?UJ_oEy?kA5 zI=(x;Tjae|{q(rrfr3ZkrGM;`kDqaP&k8&e?nI8!yMXq9!Hr1oMq+%Kvxb1Y0BQPk z4>-pTW8m*)UQ;@^DTXWOhVee+74&{!@VTSaFg}3Hq&^5lr|hT6hbW^{4}*<90%WJ~ zuzM7uy9r`3{>x9W$6(ykg}{oOduBb3L_wbbM$fD#k;pCmw0a6zMxO@Sr`0nMe0>(k zfq4!bPS*33(XSW4coqO%zh0tZc{>dG_Sg0R`MV!#^krDU^8rX_UIphko|vWAn1?S? z6T&Ha9T^3E1Gvl-y@|}Ez6C^A><#K|%IL~FV59E>*%jQN-lKx+?tKWJen3r9KO}|2 z@DYok?#E!GpHLM9$xo@kF!X2}F!VEMJpCLPoyT9WFsytDHu@EiR&Z_lnhIFyzoJ3Y zZ&1S1ML2rm0E~S8nMPp! 
z7qHP^fzJAGRJ67pT4~z~%Gsa(4$0F$fcBEa|Aw>VidceFbSErD0z0)G$XEU|C|5!U zaM)8X1Vo1 zW>R|svjbqJ7X^32=uH{E_5tIy0d&5uLdDLkiM?U`UBwTYRo|NvPZu2 zb#>$wbPeD#zOIQ(yi$O<3!D{I8NYY;-hIrnz%GlRAOb23uN9W%A&~5@B;t+hDhxfRZSLM#=E8mKIWt#S zWJ06eZ8L;kWMxHUmOU*?dst+qsZ^Gws2OIZ`KUBXUF@OXcjlaVahF~A$**RAzx`(B zeDj^}`+tw~opY}$vq@A|;+cF}-%8-2TvlRf&C*G%GhfVg_cV4EsNR?>X041ysn#QP zP`$H|H!WK1v?LPm?AG#W&1|$xon~~ikqMGkfdng))7PL_iBu#ay-GCIe>tg?6VZg! z>&TYUR!_Gskt(HZ?b1Xf5e0^7IT2H$E0W3NC9lH8{v{%|td!+V_!R!EJ}Wh#v_~R$ zKmBOMgY}Wf_-zFRX}TtM1JYZrzwQXqvc`F>4*|XGz#Y4h?rYz080qs*JUjOg&I>QBy;p?EbCT}fn=8)C9kd1R7GXSNG zK&e4alpl){$0fq9s6!%UoR)JM<224`1*esqR&iR*sg>*ntt-00y@fOx?YDw$@)4bM;0(b9OY0hY=fyU1`(VBl&hIU8VQ=_Wxku{0y+xDcMp z1w#N_Dgs~z3xMsxG+7A&ppu#LYB7BxDvYwsY$heGhcGoQE!Qm(dCh{R#Z3$6H_!K7 zQZfl~J4-3=Dip}<)h%3(ZlyrSIRR_up0u_7DJ7QIayprpET)V)13l4blk;R_ih;n+ zSIL6FR81#6l}Z+ZBoE2v<46*i373gyX8WRmLW_?hhKFV(oNkV%`%?y5t;D;s1R3cvW~bR?BMXiN{&a1L#sF)+`;f^ z3FzE<(mD%#%5oXf#=yDICaVNEZ&b-eUj6YE&b7Ea@zS%}&6|r+jYzLCV?ecJm8_8i za9o0;Wh`CC3ZDz-S*ZYP0$Ix$>%_2MMH#hHx=P*@oMLSW95{?_We`&f8k#y;LWE!q zi}JMBlPbAXR>})`owO_E)|LXfOt<8n|e*r6OgwuL!-J&5s;R#1$G z^oAe|Z^qG)A^jF)9SPEJ4NmbkZim12hT##g2^vEB?IlDA*087`((jOscEiEfjbFBO z_E3fol?B5#6BvR<0E5h_$6%99HU}}thA`lMD=O7ycL(n!ayH4!#uBlDn%x>T^+wS~ zuqV+>6aeUMn-FXc&0-3=T0)gt=L;duiC_Zm<>F6NRr8M=Hp3=J&~oye)o>FgG3 z^^QSjOd#Lks3Nf*j92+>i7|noy%IHn8iP%8Gv7*VA98VHJuYGqssT2avVB0Q!hRU@ z+`?_kyP?a3$lVce+*SWKj)Rs;>dIuZ9am`CqE4=MAvESGl_?(2?@?kM7@0fWZuz~? zf2jrrZ?8@IWP>X8JZ5n}BLcUd2k!e+G7!kz_jCIJb9ei8%?P(|5b5_%xwrMjx=3Wo z&FUVcxz;0xk>375-@KQA)}$wGM>;s+k&Q0{U42vTR-}8EmBj~vK6!BOI;73&oc&0r z)jhN8DA4Zx_G3s7EUcgN3eeGa-PnUv-LU+gSL%k|=Cu{gOkOulw%z*xfU?Ub*UCzD zzCd$YHnUA9y9bmCzW3V$1r>IAY@heRP%%--hX5xpOFoP^ow6)3!m02PUl3y=m5B3? z`d9*SzSoKKj|KdE9ryD!1IquWe-r-~zGzcJMQF}QR5 z97wo9C7*}(MU*#RAIpo|jc$Sc0zw~SEiCz>tVCOtXqytfP>HV6S;-Fg=_X``Zo;3B zUm9UP!b<=8=A)Hl4*RlAzH*Giz8Z8GG|peccFxO~ucIZEY+YH3A?#fvUst?!1TsVd z@!mG@W)vUoHK5GLu#*X`0^jfrV|K6=;G2*c0Ud8SwK{$a#lwPzOA(rnomEKBZ~Iu# z3_RIky51Vr^)}>$$bfa)ZRvm+u94&E@K0`W=u}Rp`_}7YaQ+TX%W=lfLi&H#H;j!6 z)BSs4-M^1q6-@W}D8##l*Znv#aJz4E6dBU(2fkqkdguav2)aSA9>8Gra`Z=>6}Fk= z4oFc=WTwaRV;{?%kisQce&S=f3sSrU%iTVfpF#<6Sac|NJC$lY!*WW|JxXmyTJO?e zPu#INlTVX-m3Rt|ugHDMICsR?E*x~*g<{Gy+HCSOB}Vm)P2}fbr?<)b1y1j6^6p0z zyqx+aj`B+NE7*FbG)f*ovW@)OIZyhHQcj9n$%Am36!`KX{1PG^@jJvftq(h!)<=LB z0Dm+%)o<~GD%LQ!c+ue8i5;7uVy7mk*tiH9(&e!dqJ=0i)kvQ~b=Zdg4h|T*GSsxl zVeGr$=sZmx$Gx}-@7V(Mdz(BV8^Oa$HiD<J z@OB$dBlF3eQi30z3Hsq#6a~Wiu&$iIc~QXo&$&YUd-vfm4!P|wt`OKsgC@`IJG!R< zTE1mX*FKJP;LhodEb8rr)A?WL9YEhGx9f*X}qBdZ@VD&ZT?@EUyd%Q9Ag z;iKSf!0=k}6_583uA_rH95aVotDJLGI@+1mGklDg)-fDUDg(+g!`qnQb@(kBUJo}a z9fzy|!yE98XZU!03>faK_VhH)>=t}FfeUnNKAjj8csdh^m7ksgx2*WnNpKgc_;fP= z4i5St)##bdcW_uqr{H&S%b>bW&%%2=yQEX$j%690hO7aK$z#?1Ee*KY#Y#LqhbeW~U}YU*=Un#eTO8>OF8&0IC7!hgjGgJ1Q>tDHg~jqF ze(b|#l*(K$FS=$4d5$Ay!y^R5v8@N46J(sr7i4%==Va7I1>|c fcuhFpF~sxQGGU10wdMG@3E|k};ed6)EH=IdbIz@n diff --git a/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle b/pandas/tests/io/data/legacy_pickle/0.14.0/0.14.0_x86_64_linux_2.7.8.pickle deleted file mode 100644 index 19cbcddc4ded850c7590344bd1ea85e11387a42d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9309 zcmd5?3wT^b72ZvD^SEu=rnFFAr4%R&Z9|);586VLP$*^T5@>IEUgB={rgz(YWM=O| zTY4?1Nr(#67owt8L=>MW3O-Oo1bL_^%2NbUK^`iI0tFv<&dizn%7%n|^wZ`0`rovYw0O*v!zhU+qpF7>1|p! 
zH+xFOvdt?!w#MQ;ed%I0U2d_y{-W$tVakCXz8dx-ykYo$?0!#6QW1lhA95W#|h3*#mZFNbib7 z?tJFax(B94BGb2*bj0k0*tLi^UVYU|h&3(qJ01i){np#AK-}H6@o~f#o_ucJLx9JZ zcV2{eS!~)p4^6dGVN*)Iy=BubgE8AQrX6(&o8YjC3BA5seqc>KNEDi>;Xhe3Z>Kdj z3Fa6{M!=NGdTh&u*)p5r=+R;_TFIma2QqqXu4uDn%Oy_mD_L7xw3oCIWm8?s_K}oX zz*0`JG%b;=9nKQ*5)r?sgh&lxEn$o>PFP1+PuM`%NNA_}Vb*nh;NDV}P4_0QJTAXz zT|$6nzpp#{{b=^nz-2L-OPsKdu%57iu#r$BoIp5{u!+!4>Ctrq1DT3mC!>$eaM=D3 z&4t`nGs_N84iywj32nUyQ;i)6{yfNG2ZMAsxG~LU2Bg7BCNHM*Wo><~xG8UT=1V&-m&qy!0D~QwSfhqdMwL-^IiJgD8z4-}cDm4~vBcVitxH-LEofWd z`)R2Z#O+afZEvZ>j$YGFdQ3Y5bDSHncHX$H9n9#lV!B|m`Ke0AYPQf5Ee<;-Va>FV z*aZe#7#ON`+(Q{`5oq$5YlHLaX(7Z@!ea z*-~NrGKU?jj9+fB6@jrru_c8Lvydv*$b z+5>a$826kNKQ)CM>lEai;IP$-oD&Upl9zvciE|xpPn3Grxa;O*oJOSInlohB34^Uo z1n4*gzt)K5Iz{9`(T0*Za6;0Nq*V-4B*y*M{fUEe7fMx4R{VHG1l4?hC_>^|x@C~5GV_HWw z9@86xG@OZF$Hw$oC^{CV&khc84%y+)ygGJRgc>arxiZv5J1 zW{CJb-(lS$e*BNkygQ|#RP_+72qJZ_c_??uq{Cj@*xhC-)gg!K2{KpD>|&C9M@+sRyFZZ&f#v( zDN__d&S7>!k9O$MPCa^p9$gL3rrPc+0m3h!K5uyuUI=Qup||>m-ZsL}WM@ThEcbO& z&9vQN7bUEvs!}NTrFq$GQRR|osiJ8I1eD{jcVZD-u4JHkXFK&K*+Z$jn6~jt4E8RV zpI0Bf8|OD!WYMi}cU)J%A9UD<64p+AD#a|G+f?9wm;(1B2D=jM_ps70q&49BV%YqOr)hEwS3JQ>6{!qA>RNRzHYE@V43D^Q8yyPF;S7TZw9s~xitpd z@hudF;>f@3ZW`mVgTm$QU3PX#_~mAYeS4T+z7zBdoCCg#W_t_cduU0$*glqFqxW7E~YYw;-FxurLcH`|tZc<024={s*uC0&~1|@6Pdus2=t$+;Px+ zEEGfYyv-+qX5dAYnCp+ibG;oUAu?b)b}Lj6hTquvl=26)OPobG+qW$blS)rq+;KqcO9^tr!D4BX`#994$qcDL`dq#ic=dth!5toH`j z=ub%$E7+ew(V7-D6U2(QjDPMExep55Dk8t|iTn}@;3^{b`$T>P1+o+|A*=1u8}Uj` zW}IK^P2E|uHw{Pp?k%}umOY@yGk6=v9@O`7zxb+xhuo^5k||rA4trRSadT5EdjyQ| z&a98(@ZOpAH^_pwrXItul&O9TC+SR@v&Rv(vnS;B*zfdOR@us)gv+r&&);KDh;-!d z4|E3qqdbE@1+qZ+)4`$sgdG}0UEJ`w%K!7Ucp}ivg~=hS}4PNvCsV3VSh{9 zQc9~5e|S!j-w$t>IET3|P!xOoTSkxwV@o!%VM6Z7kVAmwp< zp~v9Pd?BxcXYts|>(%2T-k=`ac%yn;%r$rv8G%oLr-0x1M0|tfiFLr6uvfKQ%qIl} zCkw$U8JXiHr%z+E5TvB*4>>+HD7cRhqy!Gx8L#Xi;`lUpk3q+$!-K*1#a;nD`{5ff zLFPg}1Do?H2D7THsI|8R4!w*-zXt|8B zve~m!nmJmQk{9*-2sx~0_#8E?H2g?x`X?(4?-Yh7urC^Z6g(LGXcP??J{RA3hR?&M zUl$0w`!AnQ!tNH#TS%e13-e=w3Ks~4xEJ$<@F=!p-U?5lJ(w?|esUM#ZL%NShV#YP zFK%1hH2D&I#>;WO6rSWtT!x|nzZ{Eiz^KqZL(%1iAxMdz!cfv!UR{d{PNhw?JuR^5 z@)g+Gdd?CXEyT`B(e{-q{5Vqo7IQ6L<_C;D-mj-xu2mCM1&8`;G8Q&`70t_wu6CtQ zMpp+kAs~hqgS;~+c!Ch5G=@p1qQ~KXG%)jgHGFaSi6|Z_GQ@YS3#BxE5(q$~t^p7s zdony2e68r_+h6%9_y)X7k&?uw3|aVh_!h}eCHYV^z+3SfkskabIxy|-wR{~Zg6s%9 zPf-D&6p5l6C{4kOfQL?_eznPXvStwX)fZJ?iK8Wg@Qvj7%N03ZOS{AcgbN8<2^SHz z5iTZNLb#N08R4;n%L!Kyt|UB8lu5ix661y1>53Q`YG+_GFT|{B{mNz;EZm3!%*Q+wMr< zr~IZcLiQ?bptiy()K(jS!nfLA=}~c^^w5TJo;1W`#ZZPwp3leTW~5`Y#{&{wskHnT DWQl2+ diff --git a/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle deleted file mode 100644 index 917ad2b0ff1a3534bd34a71568b241121a769cf8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 191074 zcmd4a1(;UlzBp{88M?crySuw{0O=YSns;VshVHfy8xc`V5JXWF5kW#hQ9w{JK|m2i zNm0Ic4fm*c_C9BS=lcJz@B8iRntN8fYd!0U^~5{4XC#de-BB_!#jw$Xqb7#J8_7yW zCLcT?E-I9K(CBfa#!f0fXiQ90`BCFW#tj<~6Ek2^M8BA*L1RY8#>I>q6c-UnF=*_7 z(SrxXmXC{#iWwFaTR!1J5lzEah#NL4s#S2=h)~KlN#;i0F>rq8P^yw`?r0M+_tv!| z2FFbr8x;{s9htOU%{EEe+z~z@P02RNBk!o&zJ2@FH^Y(eqfL_dl95SA$A&)|{$Jd@ zxPc2JJ0?l8`}FaYhqEL}l6~@+$iTsAldlcj^vJ4^lGLq`NZ21kvI8xRr77*27LHc7${mMJp%hzWyZLz&|v z?-XMrTR9N`P^mvlqx64s;eXyJaXW5c_^M@YOJCXl<}%7A$v^Act@Oo* z%0;HESiVx#@|8m6<3km~@16k@qhdo9ql-tT92jIxY^YM7Q04egm6DOEM-7eX7v2{e zsye7&zk%b1g69XV`J_$5*;+|E)Yyh5r+rW`+D zqX&$N3e{{sZeV26QNu#DqJwg(9UrPwGP-DVknMHD*ioBM*e-bYsmb;K%)17` zLnoOVw8IXG?XY1&JDhs^yQ@LC9rlR}U%gSdETaF){Kmlrk|vaDrhk@Z+`!vTZ4#WC 
zEaB8#|IEn5h8s69s!yot?YC(5FA65H920N(Pycc2=Ktani68mrr4q|JoRs!`LM=-6 z2+}%cbX2Hi!f$aC#)Mi$#mx&UQ0v>CuZ^38^KwXHUbg+W@3cSLz^(gVXdoR8Z<(5z z8YWcLcHukyRch^nJB3TF;jJR+aC?4r437S1Mbara^*H04&k$CDqFLis7M3#(h8GWk=;zQk{!!53R$>@sF^`gUVtVew4u9DGJqyK6~ zJ)>(xx4G4Pq7xcUiXkJ%42TQ$N@zR1<3oKC+fLtb?d})O=f7TA{r{UZCZ_41H97H> zHQ=_oG0>IT{_W*=>nZJ43!^hc|1~}t7yakRWYE}2ag!tZjg1J6FBusT*`-3W0kJ{1 z6`Ig`!`7QO+a!rh)~rj{(8RdTp-JHmas#v+^U!7;gi0Ihp;P8~h!!>JF{uhU* zI$XI%wd()kaJ<7+s{j4Ne_niAR6=1-zb#K^{IBygJ~T5Z>{&r!j}8iZc6?~g?S(xz zT-c$%7k1LP*wDO$lAa$QT98=M3&SP7D4eRlhQ5n~R7K2SAsA!qpQyZp9#vqHLpG1&&CNqn)otzwUM7{R zSFTd=?=QuQx5oKf5=JRQ#*7KQ+M=P4+rhDYp;5Mi%M-!^ezyUF#PyIGI zbnI5WIey!nKl_jG93T2T$bc_`3^){Iz=`-f+&C8K*KPCR4BhUdajG7sE>; z;ZFFM+Y0X2|KAi`eCSe8*DnWk{Y+5Tuf&J0Cf4=v=rsHU*TQxE+^xDE88Kwo$dUaL zruU)ie`a@d%^txs4^;nui%i}xsL40}T)6pt+`#C0(LrVZBR+JqWb~=XEb4f$IHaP- z1T(N(bv@>e@UJEN#3T*&{A~~ylPuVOtI~(dD{<8_CV9f{|7hVd+)xe{*xTq*<|Ii9 zZ0NHha8#pgLwHFuSk#P36KwuDZ@4wQjEM+#rilKtt;M7b zuJV7gxkdk@O~s^h$8fU>8br8m{EIddlm7Oz{@!e2GK7yMw49iX;ookn1Hp35tvWC~ zDEEbt{~s3VaK)85oRIjKEJ4b`#WnmN#gsL?{~yH^ACoQESw21{d%}k*4o|4-mHNcw z2ye&7i!JcqMO`fnq6mw^=d21}5@bk7Y`mNFV zZ55-KkB29De}6d?PuTq*y&S?5u>Z3!hq(5MHKas174b18gS7mM8d55}|MnUZ)hDKO z!pVPc$z_5);S5TZ&~erb+SOn2pLd*pb(Gexzp6XS|KOAVc$R9npAn4H5}u>_zj%%s z&P|!{92JAB{T2W7bNn~;q~`6Z2(J2%bk@54j9aA=t|zr`r88W=>jY&u^$;+VRB zeI`82kEs{lyw&~(Bn&feJGcJr=Qc<@xADI?x8d#QHcC9V$-g+a@$Kg}Njx|FRt&y; zi;9nF8r~JWxtj%NMpqmjyvV{sr~Wgx$AZ-Qn*`!;joDIHAVyp!JT6k7@Pi z@$k4a;dtv?$Ai`qKHlcf<4M~j%r;`$-Z~!C%JA`ae;!ZPCV9f=+uu4Kl=ir&;QN;7 z86y3@p6A8%O56yIn;6qOyb+$`-FDqR;eElq!*7D1?(_{d z6D}403M8gq;-|ykg2eQ{_33a891xsz+f@bzd%{;qoiM9v_*ZMW^;`I8LL{2FE|O^C zY%9@!q;b%{zIx)-Ts#;D-xmLFJK>w2n8CqCdjv~@Lx#nK|1cmbyxk*cZn42%3Ji`J z65bBJ_lb=@5Fax%I63%xgSaY{W1_{Vuppkzl{jDjX!^n85TY> zJPsHhCO&3F!ZlL$8yPh|YGiE8$na)35C2?9iWwCgZ*xbxOeKO$>^dWOSBS*l{+VC@ zZ@&lsPiLnY-8U1%qw$0Y|Jgx8WMwvHXAb6MF6L$)=4C$SX8{&uAr@v47Ui8R#^Nl& z@JdiZlwxU?VOf@Ac~)TJtG|-%%B;ewtj6lB!Nj3PE!(wOhjm$x_4$u42a`0izcHJz zDVwo5TQG5rpq1^`Y{Rx}$M)>N#4fy(?au7Nu8d?ic4rUX#h&cN-t5D^?8p8bz=0gZ z!HnV%4rMflaX3eCBu8;H$8an|jA1O}IF92vffG52lR1S`8P92)&KaD^S)9!|oXdHf z&jnn_MO@4!T*_r!&K10y_wZic$NRaG4{#M9csuPjC&NkJTYkrrJjL&MnrC>H zKk!GM<4-)#3;dZE`3ryLC0^zgUgb4j=Wo2h-}wh`GI2-~tm_4ll)-vl5WzxD5W#Xv z5W(_S5Wxai5W#{<5W#|15WyN(5NR1KI0X?bDFqQM9|aLC9tDy3CqBVCO%TCKO%TD# zOc23hOc23hOc22$Oc22`Oc1#ktg8eOtfT}Htfd4Itdj&0ER+NhER_TiERF95lqH| z2&NQ41k>aof{Abt!K62cU`i20FzpN?m|O-COeTW}rW8RW4&j1HT@b+}E{I?X7ep{| z3nG}Y1rbcnf(T}1K{RAA^9rIdgV|LO!IUb9U`iE4FqH}-m`eo_%%Fk@=1xHbGo~Pd zxl$0p3@M0UZWKf?BMKsz`UDY7cY+9}Iza>zn;?S8Oc23LCWv4H6GSj=2_l%R1QEQV${+hX)Z1#DfTi;z0ys@F0SrcMzjFhGQ9G3}YF`aU9PHoXAO>%qg78cuwPV z&frYW;%v_0T+ZWsF5p5g;$kl0QZD0iuHfCghxhV6-p`eMfUEc*AL7G&gpcwuKF-yA zf@}CB*YYW@-j97Jj1j6fj{ybf8u#w;Lp6sU-&C8@iMRQDzEW6f8!1Q&OdmQiGTQhN77sG zr=(2A|I>erp22l9G7~d13$rpCvoi;CG8c0*5A!k~^Roa8vJeZi2#fMg7GrUiU`du@ zX_jGGmScHVU`1A9WmaKTR%3P6U`^IyZPsC3)?V$^He++PU`w`QYqnuq zwqtvCU`KXhXLey%MzR~bvj^{DPxfMO_F-T4V}B0dKn~(yMsWy-GMd9UoFh1rqd1yl zIF=#CFqUx~$MKxNiJZjAoWiM$=QK{|49?^%&gLA>+Q~g}?F= zFY^ko@*1!6H{Rgy{DU`{_z#foNT&a1GA4Y{96pr7c1osVYNlZX(=r{?GXpa+6Eial zvoagAGY4}r7jrWY^D-avvj7XS5DT*ii}FquV{w*XNtR-1mSI_zV|i9!MOI>ER$*0E zV|CVGP1a&<)?r=NV|_MYLpEY#HepjXV{^7(OSWQbwqaYgV|#XBM|NUoc41dWvKzZI z;fus@{k+R|PxfMO_F-T4V}B0dKn~(yMsWy-GMd9UoFh1rqd1ylIF=#CFqUx~$MKxN ziJZjAoWiM$=QK{|49?^%&gLA>+Q~g}?F=FY^ko@*1!6H{Rgy z{DU`{@aIcsuPjC&NkJTYkrrJjL&M znrC>HKk!GM<4-)#3;dZE`3ryLC0^zgUgb4j=Wo2h-}wh`GD(VC{r(+H%4AH=6imrf zOwBZmU|ObQdS+loW@2V$VOC~icIIGC=3;KXFuiRXENKl37g z;jg^J%e=y?yvFPNjW_r^|KLp~NvZ#5QYK?^reI2@Vrr&g1k*Ad(=!7zG7~d13$rpC 
[... GIT binary patch data omitted ...]

diff --git a/pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.0/0.16.0_x86_64_darwin_2.7.9.pickle
deleted file mode 100644
index d45936baa1e00d80abce3c941b6d88150cc4ca8b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 15013
[... GIT binary patch data omitted ...]
z7Q;Gti@XBb-ext5ws*jEW%7e@UhZi{>tY?uKd0=b5I-5-&l>V;O}5U)!%_o)&F20sG7Q=s=o*I#B#|E zs`>}E*Jrp&qw3?0s(%F2sQRCjOp1SzsEPht7Q?Fk4|xSu|HNu^qP+3M2uOWmD9O~P zR2G7%58f5Xf$)J1;#RTOtAZad@mZ!zX-TO`diXt4LZ%}!pO9G&zirYp=jTmP74uW4 zz|8r@Q^G;iQ(4I(EQDDJKY&uJ=O<9nG59$Y$3U=UF9?uICdD8*FA34X zvKY4fT4r#y_O(%k;R`W+2pCFp>lBe*H!QuLln(3&>w|%qz5xiqy&<~;+(aY6y%Fo4 z*jTcIIm0FtsLxP?i0Pa9?Iz}LrrN0)6Vneh=-(VngZ?cjnG{=+s8wtwi(wrk$Sc@? zYgQHd@wc#fZUa6BLAtnYkwLc8gZ#=*%f$AOhq-F_kRXb7(Bqa#6dlJ)#^7&7aVq+5 zrRTX{jkbBMqFZWOcn72S+ttdgWhkd7!8DBXip0C&X08fMBUmk>$dTY9AP>7wl0yd#zq=mz7lsdROd$YG4vteSd?*7K zUgev5WSl9%aq#5imzUMGwje)5N;-H&ZrGZ^%(8Fwut-RiTt=LYjNLvUSt z1jk{#ok|8i#qR7JCiX;G7Ke*EQyd}oGR2Xi-V{fP1{7f>C~t3+w5g}bV%XICkjFr* zV7V`Od!k9QgYmVQ9qKcL7~-q#x10FdqS^@^w9oR}%?*sB z+6f7?xBBho1}CH1X=)vxt!nwlw{Po;d_0?V;JpjKsa_At&=jwr1)pzFw$0;^N zM;4JjDlC1pln!|9F~0QTBGOC3(#J~afY%=9OLrHM zK3+@fo3ayFm0#2bfO#UA@Z6I?=(#7e4?LIHr{|u+x+hMR?4akK#t!uv0t}vay5DX* z_YBoe;Gq3XzukE5S*o3YK>OK#dud%|T`BHv&r$6(wD#P*Re}G4W4DQO!Slp_eZ~?$mmz<(N}aQcoq5q;a798U|hIH%OjbCws|c5#(1ovzg;?~o>x<= z=ul0EX*yif5t@$Fbd;u}HQiCuF`ACmbeyK+HQhDjEJMY)SW_=FG_Ok_ZsFpS?pEd4L?o zJs%_fEm4X58H^ZOqIGL%m986VKC?1c8$B|_2g6UqKI1>p>??+%XXt16hW6Gj)3k^9 z9HqyHWihPBN66#CmDwLfRjRI#s67Ujl7GC2{1ajMCwt021r~zt(;!65GZX+~XkQ}c zS=K%AoMZ=s?(^(WpP^+CG+*%BP0)Q&wNsDgI660@@$?dy22U?jGAUjmQR{eB7Q=dY zjl2S$UT0M*LV;?ccmtdQp58>wIXuIgFyLt!Yr&B77R5Q552vLe3;J)4e?6Skc)wKM zz|x#hgYZlz#oJtn3=!{e1(6W(u344B%6qIz9c9AGc^?gk5g&li;vcdfEKUp1;va!} Y;$z8G23HrEpRm0?L*3~Zkf|*5BF0G6)KSV$M3Im8b}Y1rfw`R}>oKIL_|&(mQPGni-O`m_Qd5bK(q` zbIv*E95JUmJ;UAUdFP2<@c#b_J>9*tfcW`+-{5aw)$8|OSH1Utud1utV@f?Ymar;X z+MDdzE;?litGp?lv)%H>&UAZII$NE~+M>nIR!fn1s@{SFd(Pd! z_+L&r5MUkH4U1f#)SlaUgucF!=Wd*E)^-Lu1t)Bxo#0^JEoR=ECTh#N&s)#5ME+um zl$2^(rfIpR6`EFR+Dp?aO?zvaYjD&YH?t}`f!vNJcXP*|hyk;@5}zyu^lY~9+$|GM zZ!1<8nbKBfN}D1w<$8b>npSGsOVcV%dutlkw2!81Xxdj(f4p1gtzOy2bZc5<-EBM< zs3)kixpZ5nyR8XCF5BAD5T9x;#NF){!2t>@_4Pp5_F1b^C9J!H=kAEqZflusH@Vfu zf(*5ih)3>@ zcQ}+sB$Dn(XF~^o*f|P_U8MIa7C7);LI@B6iHYW}wJ}HfYw=4jEIFaS^w;iIS#w8+yT`;DEoA2!aLYZ_WNM<58EbC7 zT9X-<_T0U!$|2Q5M^q1W$9wMHsJ7|ZcGjKXjI=5n=+~?}G3oB(xswuBRoirt0x#?C z+n7o<T+UxZgX_})6MwD0GLj~ijB2mPP3M~B;CZpz_y_B>n{juCWhmX2+rV{@LH4?A`y96KZE z*ox-Xj&#nQC0#q)bLSY>&V_4_QV5tA9g=5;v|0&xH2gY0TwAde5~WKSPV!X=oD<6YiyiDfmlhbG}mBT72L68-Sd*}`3bA6 zquq8du*!0?I@~2T#%s^ycz>64FHERNTojGO#nNi2TzS?qb;eW)=9+T>H$#^w^`%ku zOQm|5R398IDrP$(%`Bmr%WSK6wnOAPM3XJt%jG`n3gGR^5Z+2fq{5v{g}ayvizaUAnIS zPXIShhH&$gxi6{;o(|M6is_y}y7E0U?)S8E&ocq{JnOm7b#u@2D{{{ZW|+UJdtL-G zULrDHCNdr(GG6iAR}C`aa#QpgknvOy8D))FZC;nqc*AqwG|+ep&{(FR`*sw%?*yUS zlh^%K`(1ePy-?-E9qs8Iq6ORf_+Ejw`#!+;f#-gR!kcRBXhN0yC=~9G&5e=JWu+yk zmsV9uRu=5#Kk2giq>O2(eu~j!`Tl)$^nWl$xO{(x655a!?&l$6T&@?v2=z~D zivNsG@n3R^YszXryQe9xZi2E8&(e+tX6c7Qr@m?Gz|*WsVd zIxP51CFjt4frG_mA8k^To;EXU)hL01nhw@9q3L!txry_aD;X1xlp0Q4J_o8@goxqp zwQB}3EK1FQuySy3FhcDOM49CGa2|lC*^&~v2d)SlD~R%fu?!7#Syem%;A)=3*^8(E zZ{b2Co(hd{9=*nZ5M^{ieEDscq&KvsDQDI)r_sSgUo0NQDA z?sJe!Vr_IhF%W+f&JWfAok%>2(Agvg;hIV)u@1$OVqJ2$xQfA2kL#-ei#c&IM|IH)Q?G8$Ubt`~D>*8{d*VowRN2cOQ~I@Y=#1{{h;kX@hyB%h!Ep`VXKJuWh@F z^xV;<70W?Cd;Z)hq$85sUq-rF|1VGd5w!C%?|ssThY#5PC(wRRUNwg_Ic>}{KlL|R zFE+(sf#A(J(Sc4OwKq4D`^{22QzU=4Fn29oU{Vg*RxCNU>@FEf&aH4=CFj;?DWq&e z9k~0-?u9hiT-xf>?!)&Q%OZX!-#iT|M$V%0pmbo@5pf@v8y>mXF^8i z3UYfV^gS^|a)Y@&l=5{MhFs-#CYsN^8**8l(Ze8>VJ>G>4(F{ZETYW<&rM=D*q#`H zzugv6DG*od2u|tK6Q~duBO!}y+*vB_g1@W)Yv>A~jEGGe zyDtz!@hg=Q9ZG@;3W?5wMCqcKe0meEk|6vcAqo;@DB;pZR!I;{d7Ez}b=KCeJ zmgI2B>YHk^o6|UsOwDL%ZxVU(D&&h0F_Z1J{p*1p?krsKcewe6th2}yvngFJ?CHbA z95fNu(>yU3EuNQ&qhup+WG3d3!+Kw}z`;W{o;zZ(($carnLPf0*Tr{!>RG+kolhMv%7INQk 
z3Ob%R6@L>}9H$H^9K1Gmw2H;xm`%NW+abmAs@~r}oCXQ(A5I6!=+*xWnn{W?Nz{iWS)M=cmjC;Gk?Q0da5wBYua5 zCoaU_gcCY{5f_oCod|T4##mH8>2lOji?~>-geP%HK!tu^DrG(ooU@2a!NO4s&wSF2 zjx8=j-xHVPFW$~KwX}&V(5#K&7$cn&S5jDRn65%A`W$;Tc>$MQgMP?m*MfoV=XD?~ z57$#3<$;l4dAI@86E{k3s5;z4fw~M`qxPQ-R$rn3AA_Xi3mb7WE@gDAZ(&=<`c||N z>)S}wz;Bn$u$%56F96Rw*;hNg!XED~Fjc(oE@I*yV?v)PN8=ql;^#5ExEBhD`+d?@ z@KoV`V+q(RTgW%mgI8dTka++ao_J7N3Pa{0V+*75PwfJjd06U%o$*LOhnRU(%E;s| zIO8!eftrj$%j4)}e4JobPe>gJlqY2~e2b^Z3jpP5^zqWMu#P+fh6>lSMbw{*s6Q{& z18e9DU|<<~5rjBqe?irx_>x2o^()y7`}=G10z>^1`_ZBP z8BFJB`1>zx1pWPuuP*()>YMB3^Hnfbwoz5DamBa1vCN*o@{W_;c@ zcQE1a??LGAe^3Vgrhn+~A3!~^TyjJH{*eN88P3%Bdy(2hXpk5)KNX+O*8aTv*8YF_rQFKc|Ws;ipy7 z2r4kEDL=8o>rJa7=Lp*t{iqz-1+595*#)h|c2e}0T0tldAcvmCe@Ojb_U1MO>b1j` zu-^C|YG8))2MPI$zG}dRGbskju}QQJlFcv#*740N2ds-e5KyS~gTYY#Td#=v`VsXF zqR?WKv0kxAcjHe|6>Leal27fE7UD3BS-470wwJmED-CEPaTOGyUp+>GbLpVT* zs-YZNqy`T2N2^dZoc$us905Mc;7AanV`qv19SjxGu?wgtMoF$JgX&RoS6;8n&?lw} zR-4F=2Gc}-3^kKtHxf12-DNWz`8~*EfQ55xB0yG`9XYp5uDvVVN0yTlgPp+-yI!8#lC3E<}gufn#09@ zra3~?ndV4Qk0!zeC+?4y4*F!-3lUeD%YNs3#-pM@aR+IZ+A>RNW&%m{)1aBd?eU%&P|WJ<%xH zp{mxacQw2{s2Bq}l;vKbC+CwUAkl8{Jp*$-wj+J%G`xT+F5Tf|l_V5?ZB z;IjO6UwG9=IU)}UU~#6j9fbTWWBFGn^z5*uV#CyDpn8X>TZ-$ARVaf(Sx8B_eeOV3n&4YxFxF?(8h#yT}AeHB%A4=t8z)*lWwut(1 z5%uGxdLWfg@YNR-QC}EQKT)a&Qu!oby{m}&BBQR4-cDv;e%>2E=_z0$olgZ}IxnUS z(wUjSbUuxJPn<5

#fj5_K7(4XJphFHlJ5vy=o;1c|c?5`H?Lqa=t9NStdVYD;S? zabSpN!1I&1K}3g7&dMa*Ai%&+XM@oESH z?$^*-uvlDc3~=v9pYwU*Ta(p_{_k!7)cvY!6dj`JP)&zvI$YBcnvT?TXH9p}bd;vM zYC2leF`DkC>F#RVc^!Qt;cz{j64`d%;5$my{u|jZ0*9Nx$F}ok5a!b@6hl4{Y|N)y zK|OJs-@x}yDksHVBx<~O%Vrq9_mD^U>g2zdeXC+p z&Tbnk?n5WTKZVume)OEF8J;)+Cl63M1e6EKk(Dq26#vYrME(p=3@y682ZwKgQp&_|@7QlKtF2kNSk&sYB! D3|Uj( diff --git a/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_AMD64_windows_2.7.14.pickle deleted file mode 100644 index 6341fa26d1f258f02ba0689c8163ba403931e97f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132692 zcmd411#le8vMAWh%spn7nVA`6%Q$9P7TF_N;2uflF*7qWGcz+YGcz;8?;c5}bI-oJ z5%0(DD5A5gDl021xvJF@@>yVj8;y#Ki0ow94LHD!M1&?rT1a^N(71@ugnWq!k>04t zgnWJ!DkLtlV@MSnAu%d8vZgbOve5sG`WF`Fv+w{nl-R{0g4{?OZ=_|Be@~2W(G>Z#&ICU7IoJT}tOkyZy@qu6< z5C{xtU>#2XnCni$m~8Nm_s$U95 zL}KUmk(8yop*kgOjzAm32na;nV40wxpjtodpZ^gE_-wJnCD?4+e~JAQ!v?wQ0l@x; zm$7pa0H|H!-42YPk+}|BGj-x?2SWMsRy=RR`a5=waNx{(jjucK#jVGAF4(YKLXWNv z9EK=|FGz{@CgVb5BP}y1sec2@c%q$1j7H^c*<4*{|`(C0F(mZ6o{liGzDTQ5KnksOAXirBd^e zBcuve=uk&EBad~??+E2)&e5kGp%m!9IEl7l&)d5V)`lm`UD?6`z)v~lcEgS>J{BE>OOnhjfRp6JRelyBy&e%`0E9kQd{mo|KR5oq1 zYm+M6BLB15CGY2D`D6%GNQOiT6jGp)0*w@yHU(x(ftf4(6lYO4*4~bo@ex+BUt3AF zx#SV5o*yB_=8F5Q61L5`!8(80OUcwNIg&?G`~ppZSPH~bAdv#e6iB5&It4N*kWGO< z`7M>?##7?1RoZ8ju@gX($jEk9*;L<8B-UVwMBQ6oY-Ta|-SLoMmd4r!DWnP64%I5evItZIHY`h9kFd$v`>Q#lY!a30QN z$cfW6;*-2q&3|Le8KtaRHqk(zRr?>lS?8B;IzeBooRp9lA6qFYAu291A;C^LR?xpu z{s$^$)wP$Y=dCw6hwz@jCHw8LN4ttVV9a6CV?0OCiB>=S*^A zDaos`Ei5}t`4d$WzjQneJ&imzN|+m~)~sluqJ^!d|He=9IAt}nnP~2_TBI`3GQiCy zB!+quqvG1y3?zE2;6U7#Am%1JgnFawEEpeWwQ?gdp$UmrNRr1HZRH8^%h!*_q~z9+ zn2zzWR_lLTzqUQ!QDogV{#7U|)Mg~iXN9LS65&vDbcdxV-rf~cKgs4S(pxFAZB(2U z`ERs;$D*t@HsQ8D%i|{;90;cbS4V-oQ()BfR}3MZkhlaZ>fcEILZqx{n_xSi72_wE zTtOryM#bd&qe!6sY+14Pq&S}yZkKYE8>@L*lO0YUOuMU9(tD`e1nS#|R$W6yag?r;u+MR>k zsGpwICCJVEHj-dUI3$;8gx}bz#zj~|{*9ef49Xg6 zlN;uWM)pQSEkO>twqox}m4Uju8Zv+Ojxiub#| zHQi$mCRaoD;0(`X&s4ukG0$|*44b)Mac8DoSx0tC3<-1WWR}O-*KD6P=eK<&$EUf! 
zyv~VE$yJaw&*PLq^L^HWf5>>@pE9OWeA-dnqJI;?FC5BRY!hDMvzGb^|LOHuRXg@X zdaY&uMkjfkvXK_$2lEs^I6;f!Nrb0xuAcG z)7H*^W?X3-CB^l>?C^=aFjU_IWKe5?lK{PJ}(+ zvks@aO3iQK9)8#kcDmZ(XP}vbz(AYc*_ST}*6XU9uOv>>>SY00qC!m}{@$ zX!45By84ebuKlrw!|Qbq;h&oE=!@@b--yFvY|dkw7nZY=qrL|EODRt zzo)VilFSR`mu||rCA+7WcPJnswj+84Ke3d4(Qeo|21-ouO7^SAc3iJ)zxce0{S5Ft zcS|W=>y*GLuIp7D)R!LH8NHf=7VOoX9@K;RyoP@s5o8BFyT0(6cGKr|`8(~qli#Wl zZyIOL&r7*K&%)lc&Wsez|8}G1P3Q0buP)VW|38ubP{ns50Hj|TJko)&6<=R>;MUXq z^S-ws9g(J+13&7Q+J3NM^+mDk9XP%egnqQ)y(8lrIr~0 zM`0=VXB%?+=67~raO2VkKZ~jU?@ezr8yf7*;A~X!{B&|}#?;OIU-NU%U(W2!l)9wt z&yFM7w&9Ooe>$r-^IvGc`>QvLJ?M8?Z&v#)S>4&337((o&Tcn--W>kUztkPz%=tHU z=X7Qyt2^3Ju|InMf2_S+HXpXH=e9v~vXAF+xQuqZJK8b$=w!#vYrp!u`TS#!J4gTF zyZN0S$7iD*ttY!}^xv-PRPSMl2`Nrlz`=IhG&w>11m0`?Y9rdGnC6YMSN zC-?KF^|uSWM&w$S>?FUfN|E%PHR`x%uME-QV@-{}YeO7QL{IbuAcGKsr?tCI5Qk> zDL=QV7yRC;{{H_e9NBmL|GNhLbGOxOCL)8q)g4~`h%+@(xyJqVL{nVvXwBV5V zYda18LUD>&|5h6QhpjY9MaKMFDUi%k>hpiz%KwAGxKm00JFAWVLP=JRZ7@xeS+#B5 zargjF+UVr?(#+rbGrlzcMI_pZDR%H^VFQ1ps2KXwcUszgKj{QJbN(tsS~-1==Y#!H z>4c=xsq~9Z>)+{w{s%f?|D+S1N~iKKIuXCqiTn?A+WeDF+f+Js)KB>s=JnWpkv?yf z4N{U>n}ZH9aMk%a)L>c&l~p_CTNQW@}tK8 zM0K3lMs5E$Dl`~`{ixPYRL5y-RPW!Yu$>tEs0lw&9ivQ&bjn_Ta&(i<=`lHhCE7!7 zHY6glO{o1HH>6!uT!c5t=|lY`xVMAzF8>_=t!VCOkNotjq_6}x6dUF3WOw?!ot;UD zH?nOZZx{RZ*T*}A?&3*>Bv6#@X$e0AL&u#^KyE}cq&Vl~q%}-;xp9U5Z9v@-rWQN`P zBk9g`hV9)zLXKqHD3oFwv-|`9`OABwj6qHS2krAWY$q}lzyKT|01}|=3k}A;#NYrP z5C9R702xpK70>`3FaQ&90cn7=Ksq2jkO9aDWCAh+S%9oSHXu8Y0|)?e0=a&%0LyMDo_om4%7f@ z0<~hNCc9A4nRks6VMsx0(1qs0Y0ER&;#fR^a6SV zeSp3|KcGJ_02l}i0tN#^fT6%JU^p-W7zvC5MgwDjvA{TBJTL*62uuPd15<#hz%*bw zFawwg%mQWubAY+PJYYVs09Xht0u}>HfTh4PU^%b?SP85GRs(B*wZJ-HJ+J}T2y6m2 z16zQtz&2nzumji$>;iTJdw{*bK43p^05}L70uBR5fTO@M;5cvsI0>8rP6KCvv%opv zJa7TH2wVa#16P2nz%}4Ha09pr+yZU`cYwRVJ>Wj@0C)&I0v-cTfTzGS;5qODcnQ1$ zUITA{x4=8#J@5he2z&xQ17Cozz&GGK@WWQW9Ued!L_ie8KpZ4M5~M&HWIz_=Kpqr8 z5tKj~R6rHfKpiwd6Lf)Tz_egGFg=(7%m`)zGlN;ctY9`UJD39u0CR%5z}#RSFfW)7 z%nud-3xb8f!e9}wC|C?E4we8*f~COHU>UG1SPm=?Rsbu4mB7ki6|gE;4Xh5<0BeG^ z>`YS|tOEvtb-{XIeXs%85Nu?B18xj90h@x&z~*2Juq7A_wgN-I)?g?Y28M$XU?kWE zYzun8C@>mq2gZQ0U>q0^wg)ZH3nqYxU=r8?>v_6G-m1HnOdP#6LZ1&4vd!4cp{a1=Ni90QI8$ARO)3E)I<5;z&00!{^|fz!bm z;7o89I2)V;&IRXz^T7q+LU0kd7+eA_1($)#!4=?2a22>3Tm!BJ*MaN74d6y_6Sx`N z0&WGjf!o0y;7)KCxEtI9?gjUO`@sX?LGTcG7(4Xg1)qV>!5835@D=zP zd;`7(-+}MJ58y}e6ZjeY0)7R*f#1O&w)5NB0fHd}LLm&oAp#;H3ZfwfVj&LVApsI0 z36dcNQXvh}ApQD`+CR7Uw zgla=|pdhF&R1c~THGmpIjUYGF7-|AFg_=Rlp%zd}C>Ux5g+Q&LP$&!vha#Xzs14K> z@<35gG}I1?fnuRJC?0ALS&$b>fD)l3r~}jy>I8L$xr- z)DP+p4S)thgP_6C5NIeg3>prNfJQ>2pwZA6Xe=}i8V^l?CPI^-$UzkG!L2&Er1q6i=f5O5@;#33|bDYfL21Qpw-YCXf3o3S`Tf2HbR@A&CnKT zE3^&T4()(;Lc5^d&>m-fKEcEpwrM9=qz*&IuBid zE<%@}%g`0*Ds&CH4&8umLbssX&>iS5bPu`@J%AoUkD$lU6X+@Q40;Z|fL=ncpx4kF z=q>aPdJlboK0=?M&(Ig>EA$Qe4*jr0KM32^0gS*XjKMfez$8q;G|a#(%)vY?z#=Tc zGOWNVtid{Lz$Wa1)4*xrbZ~k&1Dp}g1ZRe`z**sJaCSHc902EpbHTacJaAq(ADka9 z02hP{!G+->a8bAzTpTU|mxN2frQtGgS-2cr9!;DxHTLKhr!`+1RM#sf!o3! 
zI0}x2+rcq#EF1^N!|h=U_QDBpBAf(wfIGsS;LdOtxGUTZ_QBoZ9&k^%7u*}}1NVjd z!TsR@@IZJFJQyAV4~2)p!{HI|NO%-H8Xg0Wg~!3;;R*0WcoIAro&ryWr@_+g7rYzZ1Mh|R!TaF@@Im+xd>B3gABB&>$Kez3N%$0e8a@M`h0np~;S2CZ z_!4{>z5-u`uff;h8}Lo|7JM7N1K)-3!S~?@@I&|!{1|=$KZT#c&*2yFOZXN18h!)6 zh2O#N;Scaf_!ImY{sMo6zro+(A9msg5eR`11VIrD!4U!>5elIZ24N8n;Sm855ebnI z1yKCvkm^Vcq$W}e2}Eim zb&w#WE>aJvk2F9UB8?C?(imxiG)0;r&5;&JOC%U+g@hojkx(QI2}dH3NTdzY7V#ia zNHo$8i9uqKI3ylvk64HoNk9^jB%}k<5$S|5lY3dLq4$-bf#$FVYX` zj|@NtB7=~@$Pi>GG7K4xj6g;rqma?a7-TFm4jGS3KqexSkjcmtWGXTZnU2grW+JnY z*~lDZE;0|9k1RkIB8!m4$P#2JvJ6>{tUy*GtB}>m8e}c94q1OR7Mq4 zMKx4M4b((kXc{yvnhs5mW4roWT6WST=f_6o_p+2-b+5_#0_CkB3ebByWKeRtO z03C=9LIDqchI}&J@h{M0DXu)LLZ|~(5L7#^f~$heTlw8U!!l(x9B_c zJ^BIth<-vpqhHXk=r{B``U3?p5Q8unLogJ>FdQQ=5~DC0V=xxuFdh>y5tA?(Q!o|N zFdZ{66LVo{u(Vh@EIpP1%ZO#dGGke=tXMWIJC*|rz;a@_u-sT4EH9Q1%a0Yn3Sxz@ z!dMZkC{_$Bj+MYlVx_RsSQ)G=Rt_tVRlq7@m9WZK6|5>&4XcjTz-nT(ut2OfRtF2h z>SFb<`d9<3A=U_UV~w#USW~PS)*NerwZww4R#*tu8Vkk3uy8B_i^SSsZ7~lPg+*iS zuox^Bi^Jlv_Lzlvu>>p;OTs!}9kEVWXRHg>73+riuy7on`eOaC{@4I) zAT|gaj19qtV#BcE*a&PSHVPY!jlsrZf!KPxIV#~1Q*a~bVwhCK~t-;n}>#+6M25cj?3EPZq!M0-CuD4 z!LDN0u?U>#yN%t!?qc__``82QA@&G+j6K1gV$ZPW*bD3>_6mEAy}{mM@38mS z2kayE3HywF!MiPypd@!EJDJP5Cg z*Td`M4e*9|BixNQ#+%?x@n(2)yanD855`;JA$V&%6c5A0@d!K;Z-ckRJ$Muzjkm*N z@K`(!kH_2N7VgCp@I*Wb?|^s2JK>%2E_heG8}7rq<2~@6crUy+-UsiC_rv?+1Mq?P zAbc=B1RshI!-wM|@R9f^d^A1=AB&H}$Kw<5iTEUZGCl>LiciC*<1_G?_$+)jJ_nzR z&%@{A3-E>bB78Bv1Ye3T!P@!=K|X@R#^2{5Adte~Z7v-{T+f zkN7A2GyVntihsku<3DhK011eI34}lijKB$kAPI_~35H+^j^GJ_5DAHp358GzjnD~$ zFbNlthDb}KBhnKYh>S!gA~TVN$Vy}*vJ*Lo03s)mi^xsnA@UOWi2OtWq99R-C`=R~ ziW0?$;zS9eBvFbeO_U+Z66J{ULma z!Nd?^C^3u}PK+Q%5~GOG#28{MF^(8dOduu_lZeU06k;kdjhIf%AZ8M?h}py(VlFX{ zm`^Mq77~ky#l#X~DY1-LPOKnS603;S#2R8Pv5r_zY#=rgn~2TC7Gf*0jo41?Aa)YF zh~2~}p& zpNP-I7vd}NjrdOdAOI31ArdAL5+yMbCkc`yDUv1`k|jBkCk0X@B~m68QYAG~Ck@gh zU1SN&$ckhovNBnPtV&iRtCKaznq)09kgQGCA%n=eWIeJz z*???FHX_|*W3ma^lx#*eCtHv$$zZY-8A7%uL&-2QoQxnN$u?wL(nCg((PTR^hKwcS z$au0nX^~zsflMTm$PQ#jvJ=^v>_T=WyOBP!JK2NmN%kUplYPj(WIwV$Ie;8U4k8DW zL&%}zFmgCKf*eVXB1e;B$g$)&ay&VKoJdY0CzDgispK?rIyr-!NzNi?lXJ+qRBHiXxJGq10N$w(d zlY7X$r{B2SZN$g|`*@;rHgyhvUmFOyfutK>EE zI(dV@;&*1{78Nx zKa*d`ujDuKJNbhID3F3Em_jI&!YG^~D3YQmnqnxH;wYXHD3OvVnNlc~(kPuWD3fwg zX{fYRIx0Ptfyzi_qB2uisH{{rDm#^f3ZQaQxv1P!9x5-DkIGLKpbAojsKQhcswh>A zDo&N4N>Zh$(o`9$ELDywPgS5QQkAI6R28Z!RgJ1n)u3upwWvULA9iUsa8}7)tU;W!l-a6f{LWtP;Ds>6-7l;?WhV2C@+;jB~nRL2dX30iRw&sp}JDtC?D0G>Ou9SdQrWpK2%?-AJv~4Kn|HJlnjjig3Vqp2~}SZW+Ko|-^Sq$W|5sVUS{Y8o}2nnBH^W>K@LIn-Qg9yOm@ zKrN&eQH!Z1)KY30wVYZ(t)x~_tEn~AT527&p4vcdq&88TsV&r2Y8$nk+ClB4c2T>j zJ=9)mAGM!4KpmtGQHQA`)KTgfb(}guoup1tr>Qg4S?U~hp1MF?q%KjHsVme~>Kb*O zxKpZ)`auCSNJBJCBQ#25G)@yVNmDdUGc-$cG*1h(NK3R#E3`^$v`!neNxSGY zbXqzcou1A>XQVUHndvNaRyrG=cfzM1?fU`VY&!ilrBaW zr%TW!=~8rQx(r>GE=QNAE6^3`N_1tq3SE`1Mpvh6&^75=bRb=uu0sdWb?JI^eYyeN zkZwe~>Be*ux+&d^ZcewLThhUFD>{U3O^4E9bT}PBN78NRwzP+iqNC|{bPOF!$I0$J6dIUX^9z~C)$IxTxarAh40zHwQL{Fxt&{OGY^mKX#J(HeA&!*?lbLn~Xe0l-B zkX}SDrkBu5>1FhCdIi0bUPZ5_*U)R}b@Y0A1HF;nL~o|I&|B$k^mcj&y_4QW@220|V9`UHKFK1H9V&(LS-bM$%o0)3IbL|>+_&{yee^mY0M zeUrXL-=^=-cj1XtF`UU-xenr2g-_URAcl3Mu1O1WyM1Q8g z&|m3q^mqCP4KN@BF))KLD1$LLLog&mF*L(4EWM`}1224Yy z5#weWGfkMLOf#lA(}HQq1T(Fe5T-Q~%7iiDOav3jv|-vZ9wv&3X4)|^Oe_<}#53&~ zi}5lEOd^xSbYMC%otVx{7p5!Ijqx$vnI23}rWezj>BID8`Z4{P0n9*V5Hpw=!VG1G zF~gY=%t&SwGnyI0jAh0#nZwLw<}ve`1zNJAMrISUnc2c@WwtTfnH|hdW*4)Y*~9E* z_A&dJ1I$6@5ObJ0!W?CeF~^w`%t_`HbDBBBoMp~2=a~!4MdlK7nYqGTWv(&TnH$Va z<`#3Cxx?IL?lJe72h2m}5%ZXN!aQZ3G0&M7%uD7K^O||Xyk*`o@0kzGN9Ggrnfbze zWxg@rnI8Y@owgOv`t;AMltFTqsYHW4323wP@#Rjsq**a_xTbHfJ)@K{A4cSJl 
zn{CWCVVkne*yd~twj~?Pwqirr)@&#n#)h*IY$V%;ZOeMtC^njH$HuU+Y#bZUwr4HY z%O}+-pJC~iu&Sw{}3)w~N zVs;6;lwHOyXIHQ**;VXnb`86hUB|9xH?SMoP3&fN3%ixw#%^bKushjZ>~3}syO-U^ z?q?6M2iZgHVfF}nls(2CXHT#v*;DLk_6&QLJ;$DBFR&NcOYCL#3VW5k#$IP{us7LT z>}~cAdzZb(-e(`M57|fTWA+LAlzqlNXJ4={*;njq_6_@%eaF6MKd>L!PwZ#*3;UJ* z#(rmiumA^g5C?MzhjJK)a|B0n6i0Im$8sFUa{?!F5+`#Cr*ayna|UN}E-nq1mP^N_ z=Q3~^xlCMUE(@2H%f@Bra&Q4$PA(Uho6E!H!^LuOTs+sFvp6r8 zz$J1?TnDZr*NN-Qb>X^l-8diDo$JB%N*8^8_Z262PAA>2@I7&n|7 z!Hwibaih60+*ocLH=dioP2?tVlesC}RBjqKotweU}4snOMBivE$7bzUF0rtm$@t4Rqh&hox8!^l%p*L?V?53iJjqi$%`-g9b3D%ryvR$u%qzUgYrM`Iyve)xG<;e<9iN`h zz-Qz$@tOH7d{#aipPkRa2k<%hTzqam51*IM$LHq@@CErod||!_Uz9J#7w1dxCHYc( zX}%0!mM_Pb=PU3P`AU3cz6xKJuf|vBYw$JsT6`d1o3FzM@pbune0{zF-;i&_yZOd^ z6TT_mjBn1j;9K&+d@DYLZ_S7DVSG3r!AJ6K__n-O!hzBAv2@5*=MeSCMm2j7$L#rNj>@O}Aye1CobKad~959WvPL-}F+aDD_o zk{`v7=Ev}3`EmSsegZ#{pTtk*r|?txY5a7420xRZ#n0yF@N@Zj{Cs`^zmQ+VFXor< zOZjE|a()HBl3&HI=GX9R`E~qyegnUe-^6d`xA0r}ZTxnA2fvfw#qZ|#@O$}v{C@rb ze~>@KALftnNBLv?asC8w2&jMwxIhS`Knb+K2&}*fydVgoAPKUd2&$k7x?l*V;1bdZX@zt`dLe_5 zQOG1@7P1Igg=|80A%_qk

rVxrIDJULl{5Unn3H6bcE2g(5;xp_ouyC?S*-N(rTf zGD2CQoKRk2|}Wf zByVsWvASW+w{mKMv1 zWyNx0d9i|6QLH3Z7ORL=#cE=8v4&VvtR)7DwZ%GOkXTo&C)O7mhz-R?qFZb%HW8bO z&BW$n3$djbEVdFu#MWY{7$$~`5n`m+MrkMQjyPAGC(aiahzrF<;$m@$xKvyw zE*DpbE5%jfYH^LYR$M2p7dMC-#ZBU7af`TB+$L@pcZfU1UE*$WkGNOdC+-&yhzG?( z;$iWKcvL(l9v4rDC&g3ZY4MDBRy-%37cYnx#Y^I4@rrm=ye3{3Z-_U=TjFi;j(AtR zC*Btyh!4d_;$!iN_*8r*J{MnzFU42lYw?ZvR(vPE7e9y}#ZTgA@r(FX{3d=Ee~5qt zN{|Fgh=fX*giC}(N|Z!PjKoTu#7lxCN|Gc?ilj=Kq)UcmN-imllvYY7rI#{D8Kq28 zW+{u5Rmvu1mvTq}QcfwClv~Op<(2YD`K1Cl2lo$B2|^DN!6tqQcbCr6e!h}>PSISU8$Z_UuqyVlp0BHsj<{VYAQ98 znoBLDmQt|PN(zx$OQBMj6fQ+bky0C}t>lrSq-d#~6eGn-aZTK zS|P2JR!OU+HPTvXowQ!sAZ?U3Nt>lD(pG7kv|ZXE?UZ&&yQMwSUTL4SUpgQilnzOU zr6bZ&>6mm}Iw75uPD!VwGtyb6!FgdLg}(UP-T|H_}__o%CM%AbpfRNuQ-J(pTx5^j-QP0Wv5< zGAtu9Dq}J(6EZ1NGA%PQD|0e03$iFnvMejIDr>SX8?q_8-ZIggxI&L`)W3&;iKLULibh+I@ICKs1W$R*`ca%s7YTvje8 zmzOKZ73E5DWx0x6Rjwvimutv1CJW?JdkCw;C zW94!3czJ?6QJy4EmZ!*5ILd-;R>QT`-qA04ODY{}Prs7i4C~1{+N_r)Ol2OT|WLB~$ zS(R)`b|r@rpyX6?DY=zAN?s+Ol3yvH6jTZ+g_R;oQKgttTq&WHR7xqOl`=|MrJPb; zsi0I;Dk+tfDoRzQno?b8Pi(p%}H^i}#P{gnaAKxL3JSQ(-WRfZ|Ul@ZEFWt1{n8KaC< z#wp{K3Ccuek}_GDqD)n$Dbtl1%1mXJGFzFW%vI(o^OXh4LS>P%SXrVhRhB8sl@-cL zWtFm8S);5~)+y_i4a!Dkld@UaqHI;RDchAD%1&jMvRm1s>{a$D`;`OALFJHgSUI8` zRgNjgl@rQI<&<(-Iis9a&MD`W3(7_1l5$zOqFhz3Dc6-7%1z~#a$C8h+*R%=_mv0A zL*lqI^}pDc_YJ3ZQ~2q{1qq zqAI51Dxs1prP3;+vMQ(Ys-TLhq{^zIs;Z{ys-c>yOHHGuRnw{I)eLGzHItfI&7x*i zv#HtD9BP1?Q_ZF3R`aNN)qHAxwSZbsEuO7_Vrp@~>2skJ{MygwFtTs`bs?F5qY74cc z8mzWbL)6x4s2Zk*s}X9X+D2`wdekU2T5YGssIh9C8n3ojE!C?gsEKNl+ClB8c2Ya5 zUDU2>H`S+hS9_>E)n00EwU63Y?Wguv2dD$pLF!<2h&ogqrVdv}s3X-;>S%S0I#wO0 zj#nqB6V*xTWOa%Me1U8iMmu>rY=`ks4LY~ z>S}e3x>jAMu2(mx8`VwfW_63YRo$j;S9hp8)m`dtb&tAN-KXwX52y##L+WAmhS^_idR9HBo>woZ7u8GZW%Y`BRlTNOS8u2{)m!Rq^^SU1y{Fz+AE*!2 zN9tqsiTYH1rao6+s4vx5>TC6l`c{3XzE?k}AJtFlXZ4HvRsE)ZSAVF025OK7Ylwzw zn1*YFMrxEsYmCNfoW^T{CTfxd71fGq#kCSzNv)JtS}UWK)yiq*wF+8A zt&&z*tD;ras%h1=8d^=QmKLbh*6L_MT3xN4R$ptNHPjkuZmqG_L~E)w)0%57w3b@1 z)=CS}T5F+Nm=>-@XpveQt*z$KqO@qOoff0TYH?b;)?TwTua=-CYDroLt)tdS>#TLr zx@z4tpVnRLq4m^yX}z^RT3@Z7)?XW-4b%o{gS8>rP;HntTpOW{)JAEewK3XQZJah< zo1jh9CTWwkDcV$Rnl@dVq0Q80X|uIC+FWg(HeXwyEz}lii?t=%Qf-;GTw9^7)K+P$ zwKdvWZJoAW+n{aKHffu+E!tLXo3>rsq3zUmX}h&O+Fos+wqHA-9n=nKhqWWxQSF#^ zTsxti)J|!qwKLjT?VNUAyP#dvE@_vwE811cM&| zJw$JTUG4x<`-FqxE)rj2^4U>G67d-O{~!f}W@+=^gZrdMCZJ-bL@K zchh}(cfE(+Q}3ns*8Avv^?rJPeSkhtAEXb~hv-A~Vft`=gg#OqrH|If=wtP9`gnbU zK2e{fPu8dCQ}t>3bbW?CQ=g^J*5~MR^?CYyeSyAEU!*VAm*`9NW%_b`g}zc>rLWf4 z=xgZKIA6WYjh48TE|@Mnj{K;Wio@O^l{SGo!iD!f0s(8?B5G zqqPxggc;#Rgb``9G1?j)Bg%+2+8HrMtPy9#8|@9t@EQq5qLE~DFghBYjLt?EqpQ)) z@EP5W9!5{2m(knkWArup8U2j`#z13`G1wSl3^j%s!;KNfNMn>S+8ASuHO3j^jS0p? zW0Eo1m|{#drWw|W0SGj*kWuowi(-v9mY;$m$BQ}W9&8d8T*X`#zEtdao9Lw95s#^$Bh%l zN#m4p+BjpJHO?95jSI#_ZW0urpruYrZv-<>CFsgMl+L{+00^QHM5!7%^YTc znbXW=<~H+~dCh!gezSmC&@5yYHj9`=&0=P8vxHgFEM=B9%a~=&a%Oq6f?3h5WL7q- zm{rYcW_7cMS<|d#2AZ|aI%ber*Q{sOHyfA@%|@o%Y-~0$o0`qc=4K1Cr5S9tGDFPP zW~dovhMN&)q}j%7YkJHmGumut#+b2YoEdMnH!ag^CYXt4lG(xRXm&C?n_bMVW;fGk zb~k&NJbPa=4Nw?xz*feZZ~(BJI!6@ZgY>h*W73BHxHNx%|qs4^N4xWJZ2s@Pnajo zQ|4*&jCs~PXP!4Nm>11U=4JDWdDXmTUN>);H_cn-ZS#(K*Su%mHy@Y}%}3^A^NIP? zd}cm3Uzjh=SLSQ;jrrDmXTCQ-m>*P#5OHU4)Bt zQ7+oWxL6nG;$4DEbV)ARrMOg==F(k;%XGP1X0Ieu8C)4%nOvD&SzK9N*<9IO zIa~p*oUUB1+^#&Xysmt%{H_A7g04cY!mc8&qOM}D;;s^|lCDy&(ylVDvaWKj@~#T5 zimpno%C0J|s;+9T>aH5Dnyy-|Kv!*79aoU6uB)D_zN>+&p{tS0?P~05;%e$@=4$S0 z;cDp$cC~VaxLUhHU19%+wR;NEWNE`SJhnZh%&f|)9^1BU+qUf<+qP}nwr$(kYu10S zlYO)g_H%MoR!7v)7xjMK(eb=SL!*(=*l1!jHJTaCjTS~rqm|LxXk)ZB+8OPQ4n{|# zlhN7eVstgS8QqN@Mo*)c(c9=_^fme!{fz;}Kx2?G*cf6AHHI0X#&BbVG13@ij5fv? 
zV~ugfcw>Sw(U@dRHl`R;jcLYoV}>!)m}Sg1<`{F0dB%KWfw9n7WGpt87)y<1#&TnY zvC>#&tTxsdYmIfrdSipJ(b!~cHntdBjcvwuV~4TR*k$ZC_85DOea3#{fN{_`WE?h* z7)Om`#&P3>and+toHouFXN_~ldEHm(>~jcdkr+xR`By+Mk#hhwRGpCy~%$epabGA9hoNLZA=bH=6h2|o2vAM)t zYA!REn=8zf<|=cwxyD>;t~1x08_bR7CUdj7#oTIcGq;;N%$?>gbGNz2+-vSL_nQaI zgXSUguzAEhY92F>nGq0OB%$w#d^R{`% zyldVw@0$vCVnwy0S<$T+R!l3F72Aqq#kJyD@vQ__LMxG# z*h*q0wG7L&EX%eWOIXrUmTP&IwtOqFLaby~aw~t#np;D}$BM%4B7> zvRGNIY*uzFhn3UHW#zW=Sb42{R(`91RnRJA6}F04MXh31ajS$?(kf+@w#ry#t#Vd* ztAbV0s$^BRs#sO6YF2fthE>z5W!1LoSaq#>R(-31)zE5WHMW{qO|52DbE}2b(rRV3 zw%S;2t#(#>tAo|i>ST4cx>#MUZdP}zht<>SW%aiDSbeR2R)1@NHP9Ml4Yr0@L#<&} zs5RUgVU4s#S);8n)>vztHQt(FO|&LildUP%RBM_w-I`&|v}ReetvS|QYo0aVT3{`- z7Fmm}CDu}FnYG+nVXd@QS*xuz)>>I$#~N4q1n-Bi2#tn04GbVV$&2S*NWt)>-SEb>6yQU9>J)m#r(-RqL8{-MV4j zv~F3qtvl9T>z;MrdSE@Y9$AmAC)QKznf2UyVZF3oS+A`()?4eH_1^kmeY8GVpRF&} zSL>Vg-TGnuw0>E?tv^;6JFFef4sS=WBifPd$aWMvsvXUaZpW}=+Oh1|b{spd9nX$$ zC$JORiR{F75<97F*rsjSw(Z!$mbS88+q1Rp+kqWoC$p2=DeRPXDm%5E#!hRev(wud z?2L9MJF}g|&T40~v)eiBoOUidx1Gn%Yv;4`+Xd``b|Jg4UBoVG7qg4oCG3)RDZ8{? z#x84@v&-8R?22|JyRu!yu4-4atJ^i~nszO_wq3`rYuB^u+YRi7b|bs7-NbHcH?y1D zE$o(dE4#Jb#%^o3v)kJp?2dLPyR+TJ?rL|lyW2hNo^~(0x829?YxlGJ+XL)@_8@z( zJ;WYr53@t<;r0l7q&>Sy~*BeZ?U)9+wAT34tuA) z%ieA8vG>~h?EUru`=EWuK5QSckJ`uVWuvBNlFop4TgCxR2viR46fqBv2VXijt|h7;3?<-~U4 zIB}hLPJAbUlh8@zBzBTGNgcy69m}yD#}STnl;b*{qaEJ~oDe6OliW$+q;yg_shu=V zS|^>8-pSx(bTT=aoh(jPC!3Sq$>HR5ayhx3JWgIGpOfDy;1qNUIfb1fPEn_rQ`{-x zlypitrJXWPS*M&+-l^bJbSgQOohnXMrSjx;52j^IgOnr zPE)6u)7)v{v~*fIt(`VbTc@4V-s#|UbUHbmoi0vSr<>E=>EZNrdO5wFK2Be!pVQwN z;0$yIIfI=c&QNEV6Y30iMmQs#QO;;*j5F35=ZtqII1`;o&SYnbGu4^qOm}8DGo4w^ zY-f%$*O}+ccNRDcokh-KXNj}aS>`NvRyZr2RnBT>jkDHS=d5=&I2)Z!&Sqzev(?$= zYQ_gAUjC0mG=bU#gI2WBu z&SmF{bJe-#Tz76bH=SF~ZRd`2*SY81cOEzookz}N=ZW*wdFDKKUN|qESI%qajq}!d z=e&15I3Jx)&S&R~^VRw0e0P30Kb>FBZ|9E_MuZjNM0gQFL==%kWD!L~712a=5ktfj zu|#YUN5mEJM0}AzBov86Vv$576^1Z{C2ZjcA*4{k6`s(-7l8;7$wYFILZlR_L~4;n zq!sBzdXYh76q!V3kws(`*+h1cL*x{>L~fBs-9&fML-Z8AL~qeY^cDR?e=$G|6obTIF+>a% z!$hbUE=GuvVw4yy#)z?EoER@Ah>2p7m@KA?(@VwG4e)`+!Yomek6h>c>C*ete)tzw(lE_R5WVwc!0_K3Y=pV%)B zh=bygI4q8cqvDu2E>4J(;*>Zo&WN+(oH#Eoh>PNqxGb)StKyotE^dgM;+D8A?ufhM zp13a_h=<~lcr2cXr{bA-E?$V2;+1$U-iWv2op>)kh>zlv_$ z{)jL#tPCf^%Lp=}j3gt=C^D*yCZo$3GNz0rW6L-)u8b$+%LFo^Oe7P_Br>Tqq$w?F zOGgSRrIN1nq?W!6WQa^Algkt`rA#GL%QP~rOefRJ3^Jq4Bs0q_GONrcv&$SZr_3dD z%RDl#%qR290?ixn0dk-mBnQhOa;O|8L*;NeLXMQ9l#k?N`9waI&*XFYLcWx*QThR5F!ZrBEqVDwSHL zQE62=m0o2~8C52gS!GdKRW_Af4RbMqw4OJu6ST#{iRWsFG zwNNcpE7e-HQEgQ_)n0W_9aSgQS#?oeRX5dL^-w)kFV$Q1QGHcE)n5%z1JxikSPfA_ z)i4#RhN}^3q#C70t1)V<8mGpq32LI6q$aB=YO0#1rmGohrkbT@t2t_}ny2Qg1!|#M zq!z0sYN=YLma7$NrCOy{t2Jt^TBp{l4Qiv>q&BN9YOC6&wyPa#r`n}Zm%Vj;j;uq&lTet264XI;YO73+ke}q%Nx~>Z-b?uB#jBrn;qWt2^qh zx~J}|2kN1Eq#mm$>Zy9Bo~sw?rFx}at2gScdZ*s259*`(q&}-J>Z|&uzN;VVr~0LS zt3N7?8`cfyhIb>l5#2~`WH*W%)s5yxcVoCQ-B@mHH;x{nA>$%$X-M|fTlex*=6mCj4m7Cg4)1L1IUWk{>OYWucQhKSp)Lt4dt(VSA?`7~ZdYQb; zUKTH_m(9!W@Ctf`yuw})uc%kdEAEx>N_wTd(q0*_tXIw} z?^WD&UKOvZSIw*L)$nS1wY=J19j~re&#Uh>@EUrJyvAM=uc_C}Ywor1T6(R# z)?OR0t=GTrS z3-yM3BfOE`C~vek#vAL6^TvA&^4#dkegU z-Xd?Yx5QiOE%TOpE4-E7DsQ#7##`&H^VWMCyp7%_Z?m_>+v;uewtG9go!%~Qx3|aJ z>+SRQdk4IO-XZU>cf>pD9rKQRC%lv1Dets*#yjhs^Uix0yo=r?@3MEryXsx@u6sAU zo8B$&ws*(7>)rG2dk?&a-Xrg^_r!bZJ@cM>FT9uDEAO@U#(V3%^WJ+OypP@|@3Z&C z`|5r3zI#8spWZL;xA(^jqr>WOI=qgcBkD*xvW}vo>S#K;j-g}fSUR?jqvPs$I=)Vz z6Y4}du}-3sYD1ga(zbTA&{8YyYENtJ>p+L-WIDM{p;PKqI<-!t)9Q3Oz0ROB>P$Mb z&Z4vGY&yHnp>yh7I=9ZF^Xhy$zb>E)>O#7(E~1O-V!F64p-bvgy0k8%%j$Bvysn@t z>Pot@uA-~zYP!0vp=;_|y0)&P>*{*CzHXo!>PEV;Zlas&X1cjQ1_|?xMTuZo0efp?m6Hy0`A5`|5tWzaF3m>Op$29-@cpVLDU~*CX^uJxY(( 
zWAs=(PLJ0U^h7;LPu5fPR6R{k*E94?JxkBlbM#z2PtVs2^g_KzFV;)+QoT$s*DLf& zy-KgvYxG*ZPOsM+^hUi&Z`ND%R=rJc*E{r1y-V-bd-PttPw&?U^g(?{AJ#|oQGHAw z*C+HzeM+C!XY^TpPM_Bo^hJG1U)ERjReeog*EjS{eM{fgcl2F-Pv6%M^h5nfKh{t5 zQ~gXo*Dv%-{Yt;qZ}eOJPQTY5^hfi>tRKz~??><> z`jPy|eiT2dAI*>M$M9qNvHaM696zof&yVjX@Duup{KS3|KdEo{rf>PS@A$%(zVcn) z^R@5$fgj>0^OO52{FHtwKeeC6PwS`i)B73xjD99Rv!BJ!>Syz_`#Joael9<^pU2PZ z=kxRX1^j}3A-}L+#4qX>^Naf>{E~htzqDV*FYA}{%lj4lihd=(vR}oo>R0ot`!)QU zel5SYU&pWO*YoT94g7|FBfqiV#Bb_1^PBrE{FZ(zzqQ}SZ|k@7+xs2-j(#VUZ@V?``pf*~{tADkzsg_j zukqLV>-_cp27jZ!$=~d6@wfWh{O$e@f2Y68-|g@5_xk(%{r&;}pnu3e>>u%u`p5j^ z{t5r2f671YpYhN7=lt{j1^=Rd$-nGh@vr*V{OkS=|E7P-zwO`g@A~)r`~Cy}q5sH# z>_73J`p^95{tN%5|H^;uzwzJt@BH`v2mho0$^Yzs@xS`t{O|q`|EK@U|LyC6hsc91W|)%LG&O-5HpAs#17&Faf5h4{2)P)Fh~?64w3{(10yg4E3gA6 z5P=L-;09ix13w6YkRVx*JV+6w3{nNDgET?fAYG6?$Pi=?0|s76c1}MZw}=Nw73n7Ay}| z1S^A8!Rla5ur^p1tPeH>8-q>3=3q;(HP{wx4|W7QgI&SyU{A0&*ca>%4g?2-L&4$T zNN_Yb790;w1Sf-2!Rg>ka5gv>oDVJp7lTW|<={$iHMkaB4{iiEgImGv;7)KixEI_H z9t01AN5SLZN$@my7CaAL1TTYE!Rz2n@HTiCybnGEAA?W9=ip25HTV{M4}JtcgI~e# z;7<@HBy32yknkZ9LL!Dl3W*#NB_wJ{w2m+%<00Vg5gD za-o%@*Zr@v?w=~U1^?XoUtRmcf3E)Tmc2!P@BP$kH2UfMVIWQ;Y!rYh#^I|^C zj|H$G7Q(_<1dHN-OAv~4E`cSn6qd$+OAi0JF3b798s74pD_}*egq5)hR>f*q9cy4s ztcA6)4%WqbSRWf;Lu`bNu?aTCX4o8CU`uR;t+5TZ#dg>pJ77obgq^VqcExVk9eZF; z?1jCt5B9}=*dGVrKpcdFaR?5@VHk?TaRiRUQ8*gM;8+}o<8cB`#7Q_Ar{GkahSPBd z&csv02a#7(#vx8PRXhTCxm z?!;ZV8~5N|+=u(|03O6cco>i1Q9Opn@dTd4Q+OKB;8{F}=kWqw#7lS?ui#a@hS%{1 z-o#sY8}Hy_!ytyQ+$Tc@ddubSNIy=;9Go$@9_hE#83Dczu;H=hTriA z{={GS8~@-xg|q)AEQZ7I7y%<th3Kh>fr@Ho>OY44Y#MY>BO~ zHMYUF*bduc2keNQurqeSuGkH`V-M_!y|6d-!M@lJ`{Mu{h=Xu24#A-~3`21^j=+&P z3Pcz=gO77vmCKipy|0uE3SJ z3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q591L$ipTIcp1_lM z3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!w~8SeGP{RhvD)6_nzxm zJU2GR!MGR?<6{C$h>0*UCc&g=potdR=%7G}3SIP2qmKcGU@}aODKI6b!qk`s(_%VI zj~Or{X2Q&v1+!u{%#JxQC+5Q3mKFp5=upkz~!dL{0Vlga^C9oux!qQj<%VIe! zj}@>YR>I0y1*>8;td2FXCf35*SO@E3J*D z!}YiUH{vGTj9YLkZo}=k19##s+>Lv1FYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ z!}E9nFXAP8n18?Fjyp4D8F5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U? 
z!}s_BKjJ6+j9>68e#7th1ApQ#{EdI`zd!Vc4bS|?@c8e>xPN{Wk#i)Bj8QNuM#JbB z17l(=jE!+HF2=+7m;e)EB20`)Few^nqJ=g(C{Ut87d_PIV}K!;43lFDOo^#5HKxI| zm=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJCG?u}# zSPsi$1+0jburgM`s`&3xsDD16)j8L|npg{KV;!uE^{_rRz=qfe8)Fk}ip{V&w!oIy z3R`0vY>Vx%J$As3*a|SQBeuZLEWJu^!gP2G|fAVPkB9O|cm^#}?QU zTVZQ#gKe=Lw#N?G5j$aL?1Ejf8+OMY*b{qUZ|sA8u^;xw0XPr`;b0tsLva{};&2>+ zBXJat#xXb+$KiOKfD>^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{Go zD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~H zC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X} zFYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yiaS_}@SFg^kGk$M6^dBVr_sj8QNu zM#JbB17l(=jE!+HF2=+7m;e)EB20`)Few^nqJ=g(C{Ut87d_PIV}K!;43lFDOo^#5 zHKxI|m=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJC zG?u}#SPsi$1+0jburgM`s`&2~?7zSNbFP6ku@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wo zfi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNe191=z#vwQqhhZoV z#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRALVM#mT!6Jud)jDvA89>&K6m=F_TVoZWb(LfU|w9!F<5*51Wp++AA48dfW98+LQ zOogd24W`9(m>x4=M$CknF$-qJY?vK$U{1`1xiJss#eA3_3t&MkgoUvP7R6#%97|wH zEQO`943@=mSRN~2MXZFCu?kkjYFHg>U`?!rwXqJ?#d=sD8(>3hgpIKYHpOPx99v*Z zY=y0{4YtL0*d9AzN9=^1u?u#^ZrB}rU{CCYy|EAW#eUcy2jD;)goAMi4#i;@ioJ(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fmm*7%dhRbmU zuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAP|SQBeuZLEWJu^!gP2G|fAVPkB9O|cm^#}?QUTVZQ# zgKe=Lw#N?G5j$aL?1Ejf8+OMY*b{qUZ|sA8u^;xw0XPr`;b0tsLva{};&2>+BXJat z#xXb+$KiOKfD>^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB z#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X z#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(= z#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yiaS7$yqyAH!jIjDQg_5=O=-7!{*obc}&9 zF&4(gI2ae>VSG%02{92S#w3^&4K&e08yyrVQK5?-YV8}ndZ%!m2002ahTSQv|7Q7neVu>_XHQdkd zaX20);6$8+lW_`8#c4PlXW&eng|l%E&c%5+9~a<4T!f2p2`Lkg}ZSN?!|q$9}nO`JcNhw2p+{_cpOjQNj!z8@eH2D zb9f#v;6=QIm+=Z-#cOySZ{SV5g}3nz-o<-(A0OaDe1wnj2|mSV_#9v0OMHc|@eRJk zclaJZ;79y~pYaQR#c%i>f8bC2g}?C+hKb7j$8Z=PBVa^~gpn}{M#X3t9b;fjjD@i= z4#vfJ7#|a0LQI5-F$pF`15LEhMh68-ROq6I8hs2f1e0NMOo1se6{f~Cm=@Dvddz?s zF%xFSESMFuVRp=cIWZUJ#ypr8^I?80fCaG-7RDl26pLYTEP*Al6qd#^SQg7+d8~jH zu@Y9sDp(b(VRfv5HL(`f#yVIR>tTItfDN$`HpV8{6q{jlY=JGY6}HAU*cRJid+dN6 zu@iR2F4z^jVR!6-J+T+|#y;2=`(b|^fCF(54#puk6o+9b4#yEV5=Y@^9D`$V9FE5c zI1wk|WSoLiaT-p?88{PX;cT3Pb8#Nd#|5|$7vW-Df=h83F2@zP5?A4BT!U+I9j?a> zxDhwuX54~XaT{*O9k>&B;cnc6dvPD`#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Gu zco8q*WxRq{@fu#o8+a3M;cdKwckv$H#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8 z_z^$hXZ(U+@f&`}ANUi0;cxtdVWKhrF&u`+2pAC~VPuSgQ85}u#~2tBV_|HJgK;q) z#>WJh5EEfyOoB<#Koc#r(LsR{6}sr5MjrzV!DN^mQ(#I=g{d(Orp0ua9y4G@%!HXS z3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1?1Y`M z3wFhB*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bFqV!*K+T#8EgJ$KY5ThvRVqPQ*z# z8K>Y>oQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv z8Mok8+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s z8Nc9H{D$B02mZug_#6LVnCQ%Z42R({0!GA07#X8rRE&nvF$TuOSQs1QU|fuc@i74= z#6*}FlVDOb&_oMubWor~g)Vxi(Z>KoFc~Jt6qpiIVQNf+X)zt9#|)SeGht@Tf>|*e zX2%?u6LVp1%!7F`ALhpbSP%p5^ zR>vAx6Ki2@tb=v29@fVO*bp0GV{C#=u^BeU7T6M7VQXxIZLuA;#}3#LJ7H(+f?cs2 zcE=vr6MJEA?1O!=ANI!qI1mTnU>t%&aTtc;a2$anaTJcmF*p{-;dq>Y6LAtw#wj=z zr{Q#*firOy&c-=77w6%8T!0I45iZ6hxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gmAMq1@#xM94 zzu|ZMfj{vV{>DETCI<5#!(n)gfDthgM#d-@6{BHvjDayR7RJUn7#HJVd`y4|F%c%l zB$yNpG|@sE9TX^0p^F}B^fAB?OoquZ1*XJQm>SbyT1i(0EQZCg1eU~7SQ^Vw}aN>~}IU{$P!)v*TF z#9CMz>tJ21hxM@mHpE8Q7@J^IY=+IT1-8Ui*c#hlTWp8zu>*F*PS_c{U{~yh-LVJu 
z#9r7N`(R(}hy8H?4#Yt?7>D3c9EPDd97o_t9EGEC435QdI36e9M4W_^aSBewX*eBc z;7pu_vvCg2#d$a%7vMr%go|+rF2!ZI99Q5rsL98cg$JcXz644%bvcpfj{MZAQU@d{qWYj_=R z;7z=RxA6|%#d~-kAK*iLgpctFKE-GF9ADr|e1)&^4Zg*9_#QvtNBo4J@e6*%Z}=U5 z;7|O8zwr--iOKxOa2OsVU_^|BkueHJ#b_8EV_-~-g|RUX#>IFT9}{3gOoWLs2_{7Y zO|;NP2L(!0=%R-jeGD)JlVNg9fhjQ+rp7dw7SmyR%zzm&6K2LNm=&{OcFch}F&E~> zJeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCku@=_G zI#?I$VSQ|X4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2BckF>Zu^0Bn zKG+xgVSgNe191=z#vwQqhhZoV#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRAQF&ak47#I^{VQh?paWNjo#{`%V6JcUZf=SUp6D_pS zL4gt#y6B-s9|H`*WSAUNU`kAdsWA&yZK`exYu?QB$VptqYU`Z^6rLhc_#d264D_}*egq5)hR>f*q9cy4stcA6)4%Wqb zSRWf;Lu`bNu?aTCX4o8CU`uR;t+5TZ#dg>pJ77obgq^VqcExVk9eZF;?1jCt5B9}= z*dGVrKpcdFaR?5@VHk?TaRiRUQ8*gM;8+}o<8cB`#7Q_Ar{GkahSPBd&csv02a#7(#vx8PRXhTCxm?!;ZV8~5N| z+=u(|03O6cco>i1Q9Opn@dTd4Q+OKB;8{F}=kWqw#7lS?ui#a@hS%{1-o#sY8}Hy< zyodMk0Y1b>_!ytyQ+$Tc@ddubSNIy=;9Go$@9_hE#83Dczu;H=hTriA{={GS8~Rk0dY#~N4@Yhi7ygLSbU*2f0e z5F24*Y=TX(88*ij*b-Y|YixsUu^qO@4%iVpVQ1`uU9lT>#~#=ddtq43-9DyTo6pqF*I2Om@c$|O}aS~3(DL56U;dGpVGjSHq#yL0_=iz)@ zfD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2;aT9LFEw~l8;db1CJ8>88#yz+f_u+m# zfCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxX zfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-#y=P)4)Y(w zVR(#y5it@*#wZvSqhWN6fiW=_#>O}p7vo`kOn?b75hlhYm=p~((Lx&?6ev-liymt9 zF~AT^hRHDnro>d38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{ zn_yFHhRv}Bw!~K08rxu7Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq{c!*e#6dV1 zhu}~ghM_ndN8m^tg`;r{j>T~}9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4 zi*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfu zhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?b zkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y@ehWH%lyZ17#<^F zM2v)yF$zY-Xc!%1U`&jKu`v$D#dsJW6JSD2go!Z;CPf2Hw9rNe1xi%tqK6uN3@`+f zVRB4?DKQnM#x$4~(_wndfEh6pX2vX-6|-S>%z-&E7v{!1m>2V5ek_0mu@Dxmq=6{}%&tbsML7S_f(SQqPIeQbaYu@N@LCfF34 zVRLMOEwL50#x~d%+hKd`fE}?DcE&E)6}w?~?14S87xuUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAht zaTzYh6}S>t;c8riYjGW}#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7 z@fkkH7x)ri;cI+@Z}Ab0(F&@Up1eg#LVPZ^zNzp(PEws@=ff5zE=%Gd*0}R1rm>g4J zN=${RF%720beJA9U`EV@nK27y#cY@zb6`%)g}E^g=EZ!N9}8eXEQE!z2o}X+SR6}W zNi2n>u?&{Qa#$WKU`4Eim9Yv|#cEg`YhX>Rg|)E`*2Q{Q9~)ppY=n)m2{y%M*c@A6 zOKgR$u?@DxcGw;}U`OnPov{mc#ctRgdtguOg}t#4_QihK9|zz-9E5{$2oA+z7>dJj z1dhZ}I2y;`SR9AraRN@nNjMp&;8dK3({TpQ#925S=ipqNhx2g(F2qH+7?_uyXKhx_pW9>ha<7?0plJch^d z1fIlGcpA^(Sv-g5@d94NOL!Tt;8nba*YO74#9Me9@8Dg$hxhRTKEy}(7@y!%e1^~Q z1-`^r_!{5fTYQJ_@dJLuPxu+X;8*;H-|+|j#9#Ou|6rK-%zq4r;V}Y6#7Gz!qhM5w zhS4zw#>7|{8{=SHjEC_t0Vc#mm>82_QZ&#+3vF~zphSf(dZ^LI07EbtCdU+*5>sJn zOoM4L9j3<&m=QB!X3T_y7RM4;5=&ue zEQ4jS9G1rlSP?5>Wvqf#u^Lv#|HHdo$5rvPkN=;U1ypS8ZpH2{>_9QG5h+Df&;!`m zsMy`z-QC^Y-QC^s-8tt40&nm8ci;Et@%WuT_PJ+v&g|}7*X}HH4lS@Hwn7hVjh@&B zz0ezd&=>u%Ew)2{Y>xrh0Xt$R48$Ppj9t)z!5D&}*cH2BckF>Zu^0BnFzkbUu^;xw z0XPr`VK@%PAvhExa2O895jYY@;bUuCPRAKI6KCOUoP%?5 z9?r)FxDXfNVqAhtaTzYh6}S>t;c8riYjGW}#|^j-exUdJ1F6K~;dyn}b~ z9^S_X_z)lAV|;>7@fkkH7x)ri;cI+@Z}A?OpYlqC8omEmpqpie0fAcE=vr6MJEA48uOy7yDs<9DoCH z5QgJm9D+kJ0*B#n9DyTo6pqF*I2Om@c$|O}aS~3(DL56U;dGpVGjSHq#yL0_=iz)@ zfD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2;aT9LFEw~l8;db1CJ8>88#yz+f_u+m# zfCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxX zfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-Mv;Q?9~ByC zhxX`zaWF2%Lr08{2{0ih!o-*alcEzQ!{nF(Q(`JijcG6~ro;4@0W)GI%#2wuD`vy& zm;-ZSE_BA+mKFp5=upkz~!dL{0Vlga^C9oux!qQj<%VIe!j}@>Yx?m-&j8(8I zR>SI818ZU}tc|W%2kW96*2DVP02^W>Y>e*M1e;q9kCALBjY>l4S2EEW5eb5*E 
zur0Pje{7Ed*a16YCk(_O?2KK|g25Pqq1Y9>VR!6-J+T+|#xU%IeX$?*#{oDH2Vpo4 z#vwQqBXAfF#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRF!wSOQC8DJ+d;uq>9t@>l^Yq6=2S%2)-fVl}Le zHLxbu!rJJHb+9hFVLhym4X`0L!p7*1O|U68!{*omTVgBpz}D!AZO{w7(Fc9e58Glp z^vCuXfE}Zr78+OMY*b{qUZw$je*cba@e;j}VaS(>%U>t%& zF#?C-a2$anaTJcmF*p{-;dq>Y6LAtw#wj=zr{Q#*firOy&c-=77w6%8T!0I45iZ6h zxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gmAMq1@#xM94zu|ZMfj{vV{zj3C@gEf$XovRbfN?M` z#zRMpj|ng#Cc?y+1e2l@Cd1^I0#jltOpR$UEvCctm;p0lCd`akFe_%m?3e>{VlH&X z+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiBD!EDtc+E#DptelSOaTf zEv${MSO@E(8`i`6*Z>=1BW#TB*aVwmGi;76uqC!a4{VK|*ap4O8-36h{je>zLw{_K z0oVaMVkZp5Anc4?(1O7jf}z+IyJ2_ifjzMo_Qo*mgMG0d_QwG@5C>s64#puk6eDmL z4#yEV5=Y@^9D`$V9FE5cI1wk|WSoLiaT-p?88{PX;cT3Pb8#Nd#|5|$7vW-Df=h83 zF2@zP5?A4BT!U+I9j?a>xDhwuX54~XaT{*O9k>&B;cnc6dvPD`#{+l}58+`vf=BTf z9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H#|QWjAK_zsf=}@o zKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`}ANUi0;cpbF8UInCfp%z*4j2dHVmx%j z_?Q3_Vj@h8NiZooVKPjPDKI6b!qk`s(_%VIj~Or{X2Q&v1+!u{%#JxQC+0$D%#C?4 zFXqGiSO5!RAuNnVuqYP8;#dMpVks<*Ww0!k!}3@GE20Zl!pc|$t70{*jy13**23E8 zigmCqx?w%6j}5RPHp0f}j!m#BHpAxF0$XA$^uX5WiEYpez0n7K(GS~VJM_o)7=RtH zBX+_-48qRX1uYniAsC8Xu^V>B9@rCmVQ&n>KG+xgVSgNe191?B<6s^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{Go zD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~H zC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X} zFYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yd?(hVdU28fb_1=zwuBF2+MgjE@O0 zAtu7am;{re6DGstm;zH`Dol-OFfFFT^q2uNVkXRtSuiVR!|a#?b7C%Z#@v_(^I|^C zj|H$G7Q(_<1dC!ZERH3xB$mR`SO&{tIV_JAup+u(C9I59uqsx=>R1D7VlAwVu2=`_ zq8rx3`q%&)Vk2yf?$`vIVl!-xEwClFLJw?>p4bMx&>MZw7yYm;wnKkxj{(>LJ7Omc z#31a9UC@HT7=oeL6}w?~?14S87xu<5?1O!=ANI!qI1mS6I1a`kI20pr7!Jn~I1)$U zXdHuMaU71v2{;ia;bfeGQ*jzj#~C;iXW?v|gL82n&c_9~5EtQMT!Kq+87{{axDr?4 zYFvYBaUHJ54Y(0E;bz=|TX7q1#~rv6cj0c_gL`ow?#Bao5D(#DJc38@7#_zHcoI+H zX*`2x@f@DV3wRMP;bpvnSMeHN#~XMPZ{cmcgLm;B-p2>{5Fg=Ve1cE$89v7s_!3{? zYkY%m@g2U$5BL#3;b;7UU-27$#~=6;f8lQwX&L`fp@DX2j}90I<6=B?#Q2y16JjDv zj7cylI$<(QjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem0hXUvUxFfZoA{8#`B zVj(PyMX)Fq!{S&1OJXT3jb*Sbmc#N`0V|>lR>I0y1*>8;td2FXCf35*=!$i)F1leo ztd9+_AvVIs=#EXWDK^9A*aBN(EA+tD=!tF63%$_?ebEowVmtK5_85R2up@TDKn%jp z*aa;Zj3F3`U9lT>#~#=ddtq-3!#>y-`(b|^fCF(5hT~uyffg^Dgj>a)K z7RTXuoPZN?5>Cb`I2EVibew@RaTd=|jjmV+>!KUh!}{0& z8)74DjPBS3n_@F;jxDeywn7hVjh@&Bz0ezd&=>u%Ew)2{Y>xrh0Xt$R48$Ppj9t)z z!5D&}*cH2BckF>Zu^0BnFzkbUu^;xw0XPr`VK@%PAvhExa2O895jYY@;bUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAhtaTzYh6}S>t;c8riYjGW} z#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri;cI+@Z}A? 
zOpYlqC8omEmpqp zie0fAcE=vr6MJEA48uOy7yDs<9DoCH5QgJm9D+kJ0*B#n9DyTo6pqF*I2Om@c$|O} zaS~3(DL56U;dGpVGjSHq#yL0_=iz)@fD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2; zaT9LFEw~l8;db1CJ8>88#yz+f_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ z@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO z@e_W=FZdO|;dlIjKk*m-Mv;N>9~ByChxX`zaWF2%Lr08{2{0ih!o-*alcEzQ!{nF( zQ(`JijcG6~ro;4@0W)GI%#2wuD`vy&m;-ZSE_BA+mKFp5=upkz~!dL{0Vlga^ zC9oux!qQj<%VIe!j}@>Yx?m-&j8(8IR>SI818ZU}tc|W%2kW96*2DVP02^W>Y>e*M z1e;q9kCALBjY>l4S2EEW5eb5*Eur0Pje{7Ed*a16YCk(_O?2KK|g25Pqq1Y9> zVR!6-J+T+|#xU%IeX$?*#{oDH2Vpo4#vwQqBXAfF#}POZN8xB3gJW?Vj>ic&5hvkf zoPtwv8cxRF!wSOQC8 zDJ+d;uq>9t@>l^Yq6=2S%2)-fVl}LeHLxbu!rJJHb+9hFVLhym4X`0L!p7*1O|U68 z!{*omTVgBpz}D!AZO{w7(Fc9e58Glp^vCuXfE}Zr78+OMY z*b{qUZw$je*cba@e;j}VaS(>%U>t%&F#?C-a2$anaTJcmF*p{-;dq>Y6LAtw#wj=z zr{Q#*firOy&c-=77w6%8T!0I45iZ6hxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gmAMq1@#xM94 zzu|ZMfj{vV{zj3B@gEf$XovRbfN?M`#zRMpj|ng#Cc?y+1e2l@Cd1^I0#jltOpR$U zEvCctm;p0lCd`akFe_%m?3e>{VlH&X+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l z2FqeOERPkiBD!EDtc+E#DptelSOaTfEv${MSO@E(8`i`6*Z>=1BW#TB*aVwmGi;76 zuqC!a4{VK|*ap4O8-36h{je>zLw{_K0oVaMVkZp5Anc4?(1O7jf}z+IyJ2_ifjzMo z_Qo*mgMG0d_QwG@5C>s64#puk6eDmL4#yEV5=Y@^9D`$V9FE5cI1wk|WSoLiaT-p? z88{PX;cT3Pb8#Nd#|5|$7vW-Df=h83F2@zP5?A4BT!U+I9j?a>xDhwuX54~XaT{*O z9k>&B;cnc6dvPD`#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o z8+a3M;cdKwckv$H#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`} zANUi0;cpa~8UInCfp%z*4j2dHVmx%j_?Q3_Vj@h8NiZooVKPjPDKI6b!qk`s(_%VI zj~Or{X2Q&v1+!u{%#JxQC+0$D%#C?4FXqGiSO5!RAuNnVuqYP8;#dMpVks<*Ww0!k z!}3@GE20Zl!pc|$t70{*jy13**23E8igmCqx?w%6j}5RPHp0f}j!m#BHpAxF0$XA$ z^uX5WiEYpez0n7K(GS~VJM_o)7=RtHBX+_-48qRX1uYniAsC8Xu^V>B9@rCmVQ&n> zKG+xgVSgNe191?B<6s^NPR1!X6{q2JoPjfO z7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB z7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I z7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD95 z7yd?(MOa@F6&h%V_UM3dFfPVJM~sgNFd-(w#Fzw=q7x>=|jjmV+>!KUh!}{0&8)74DjPBS3n_@F;jxDeywn7hV zjh@&Bz0ezd&=>u%Ew)2{Y>xrh0Xt$R48$Ppj9t)z!5D&}*cH2BckF>Zu^0BnFzkbU zu^;xw0XPr`VK@%PAvhExa2O895jYY@;bUuCPRAKI6KCOU zoP%?59?r)FxDXfNVqAhtaTzYh6}S>t;c8riYjGW}#|^j-exUdJ1F6K~;d zyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri;cI+@Z}ASnQK5l$XpasU2jgNqbj0|W025*&OpHk|DLP>?OpYlqC8omEmpqpie0fAcE=vr6MJEA48uOy7yDs< z9DoCH5QgJm9D+kJ0*B#n9DyTo6pqF*I2Om@c$|O}aS~3(DL56U;dGpVGjSHq#yL0_ z=iz)@fD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2;aT9LFEw~l8;db1CJ8>88#yz+f z_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy z@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-Mv;y2 z9~ByChxX`zaWF2%Lr08{2{0ih!o-*alcEzQ!{nF(Q(`JijcG6~ro;4@0W)GI%#2wu zD`vy&m;-ZSE_BA+mKFp5=upkz~!dL{0Vlga^C9oux!qQj<%VIe!j}@>Yx?m-& zj8(8IR>SI818ZU}tc|W%2kW96*2DVP02^W>Y>e*M1e;q9kCALBjY>l4S2EEW5 zeb5*Eur0Pje{7Ed*a16YCk(_O?2KK|g25Pqq1Y9>VR!6-J+T+|#xU%IeX$?*#{oDH z2Vpo4#vwQqBXAfF#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRF!wSOQC8DJ+d;uq>9t@>l^Yq6=2S%2)-f zVl}LeHLxbu!rJJHb+9hFVLhym4X`0L!p7*1O|U68!{*omTVgBpz}D!AZO{w7(Fc9e z58Glp^vCuXfE}Zr78+OMY*b{qUZw$je*cba@e;j}VaS(>% zU>t%&F#?C-a2$anaTJcmF*p{-;dq>Y6LAtw#wj=zr{Q#*firOy&c-=77w6%8T!0I4 z5iZ6hxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gmAMq1@#xM94zu|ZMfj{vV{zj35@gEf$XovRb zfN?M`#zRMpj|ng#Cc?y+1e2l@Cd1^I0#jltOpR$UEvCctm;p0lCd`akFe_%m?3e>{ zVlH&X+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiBD!EDtc+E#Dptel zSOaTfEv${MSO@E(8`i`6*Z>=1BW#TB*aVwmGi;76uqC!a4{VK|*ap4O8-36h{je>z zLw{_K0oVaMVkZp5Anc4?(1O7jf}z+IyJ2_ifjzMo_Qo*mgMG0d_QwG@5C>s64#puk z6eDmL4#yEV5=Y@^9D`$V9FE5cI1wk|WSoLiaT-p?88{PX;cT3Pb8#Nd#|5|$7vW-D 
zf=h83F2@zP5?A4BT!U+I9j?a>xDhwuX54~XaT{*O9k>&B;cnc6dvPD`#{+l}58+`v zf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H#|QWjAK_zs zf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`}ANUi0;cpZ<8UInCfp%z*4j2dH zVmx%j_?Q3_Vj@h8NiZooVKPjPDKI6b!qk`s(_%VIj~Or{X2Q&v1+!u{%#JxQC+0$D z%#C?4FXqGiSO5!RAuNnVuqYP8;#dMpVks<*Ww0!k!}3@GE20Zl!pc|$t70{*jy13* z*23E8igmCqx?w%6j}5RPHp0f}j!m#BHpAxF0$XA$^uX5WiEYpez0n7K(GS~VJM_o) z7=RtHBX+_-48qRX1uYniAsC8Xu^V>B9@rCmVQ&n>KG+xgVSgNe191?B<6s^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq z;c{GoD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^# z;c+~HC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf z;d6X}FYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yd?(i}4>78fb_1=zwuBF2+Mg zjE@O0Atu7a7~<`p#w~PkL(3v5gkJS_w{IU9Qn-X=u~`r>!(8Odf97G%uSHu&JBECo< z5{g73u}C743MY|FBo`?}N|8#W7HLFUkxrx+8AL{rNn{pTL{^baWEVL^PP3bwMQ)Kt z6V#m>4cbh>>EH7%j$#v0|JUFD8hIVv?9FriiIxnwTzTh?!!Rm@VdrxniD} zFBXV}Vv$%ZmWZWdnOH7Xh?QcMSS{9wwPKxEFE)scVw2b`wur4_o7gUPh@E1W*e&*m zy<(r(FAj)<;*dBjj)FCK`8;*oePo`|R7nRqTi-7LWyHAz4@!kws-OSzMNoC1ojD zT9%PzWjR@1R*)5?i>xFo%PO*}tR}0=8nULWC2LDpSx45DZnB=NFB`~)vXN{o-DMNm zR5p{%WeeF-wvryQwe*y2q?h!TKGIkE$+ohc^q1{rfb1YU%1$y+2FcE{i?qmK86rbv zSJ_Q=mpx=p*-Q48VX}|xEBnd*a)2Bt2gz_bSPqdxWrQ3ihszOiq#Pwj%Q14S94E)i z338&GBqz%$a;lsrr^^{~rko{b%QxpbDx&s<0}eimGC&xGJGas#2=7Dx=D(a;m(lpeiaCRY_G=Ra8|~O;uMl zR83V&)mE;mj;gENR6SK+HBb#zBh^^Bt0t^;7-T05wnz zQsHW_8lr}(2sKO%S0mI&HA;E&O;%IXR5eXaS2NU1HA~G_bJScl zPt8{g)IzmLEmlj^QngGiS1Z&?wMwm4Yt&k`POVoP)JC;QZB|>XFPwiI+)IoJf9acxwQFTlmS0~g-bxNI9XVh7BPMud5)J1hkT~=4rRdr2WS2xs6 zbxYk=chp^VPu*7!)I;@1JyuWDQ}s+eS1;5{^-8@~Z`51$PQ6zj)JOG6eO6!8SM^PO zS3lHG^-KLWZ~f9xhGEzl_J)HI$B1jhGaQZhMgk+Dk;q7FBr%d2PDU~#xsk$1X{0hz z8)=NRMmi(Ck-^AlWHK@vS&XbkHY2-{!^mmmGMtUvMjj)tkR5u>P4 z%qVV@FiIMwjM7FKqpVTRC~s6SDjF_EC8M%Y#i(jjGpZXkjG9I*qqgB{)G_KBZbm($ zzR|#FXf!e!8}3FEqp8u%Xl}GHS{khk52Ll=X|yrC3~$57@HPC5wnjU{-)L_H7#)m` zMkgcC2r@bwT?~s6Y=ju0MpvVo(cS1_^fY=Iy^S!VkI~oYXY@A)7z2$#Mz}H97-9@H zB8*|iaASlq(imloHpUoZjd8|!V}dc!m}E>grWjL=X~uM8hB4EaWz06_7;}wz#(ZOe zvCvp#EH;)HOO0j5a$|+D(pY7zHr5zxjdjL)V}r5L*ko)rwisKDZN_$Ehq2SxW$ZTg z7<-L<#(v{~anLwq95#*^M~!2~apQz>(l}+DHqIDljdR9%~@z8i=JT^kS1Lo<$Jz&1M^iS`fA}2OQf2fo)w;WS|||qXSx>BFy!;ypM;D7#|a0LQI5-F$pF`CrpOPF$Jc? zRG1pmU|LLv=`jOl#K=?6%=;`D>$jigBaZDAYfdQg+0cCSvAuF*9?XmRFh3T+f>;O( zV-YNh#jrS*z>-)BOJf-7)R4Xa}ftckU-Ho9URtcz|~59?zC zY>17pF}hLgWIkv!-*a|(cHF{zj^g?g+L0|O4w%88+u{{Q02keNQFc5>VGj>4> z24e_@Vpr^j-LVJu#9r7N!>|wb#eUcy2jD;)gyA?Chu~0*z+pHXN8m^tg`;r{j>T~} z9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@ z9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%` z9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR= z9zWnm{DhzJ3x36K_#J=XPyB_y(VPjez9cF%&<^d<0pnm?jE9aG9}{3gOoWLs2_{7+ zOoquZ1*XJQm>SbyT1`*Cu>m&3 zM%WnLu?aTCX4o8CU`uR;9@rW^u?>2mH~OG2`e9pahyK_e1F!>j#7-E9LD(6)pap|5 z1VgbacEj%21AAgG?2TdA2m4|_?2iL*AP&NC9E?M7C`RBg9F8M!B#y$-I0nb!I2?}? za3W5^$v6e4;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz% za3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS z@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN1fSwFe2y>hCBDMf_y*tNJA98H z@FRZ0&-ewu;y3(`Kkz61!ry3q4`3VrQK5l$XpasU2jgNqbj0|W025*&OpHk|DLP>? 
zOpYlqC8omEmpqp zie0fAcE=vr6MJEA48uOy7yDs<9DoCH5QgJm9D+kJ0*B#n9DyTo6pqF*I2Om@c$|O} zaS~3(DL56U;dGpVGjSHq#yL0_=iz)@fD3UEF2*Ie6qn(0T!AZb6|TlLxE9ypdfb2; zaT9LFEw~l8;db1CJ8>88#yz+f_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ z@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO z@e_W=FZdO|;dlIjKk*m-M)Owyw(%bo8fb_1=zwuBF2+MgjE@O0Atu7am;{re6DGst zm;zH`Dol-OFfFFT^q2uNVkXRtSuiVR!|a#?b7C%Z#@v_(^I|^Cj|H$G7Q(_<1dC!Z zERH3xB$mR`SO&{tIV_JAup+u(C9I59uqsx=>R1D7VlAwVu2=`_q8rx3`q%&)Vk2yf z?$`vIVl!-xEwClFLJw?>p4bMx&>MZw7yYm;wnKkxj{(>LJ7Omc#31a9UC@HT7=oeL z6}w?~?14S87xu<5?1O!=ANI!qI1mS6I1a`kI20pr7!Jn~I1)$UXdHuMaU71v2{;ia z;bfeGQ*jzj#~C;iXW?v|gL82n&c_9~5EtQMT!Kq+87{{axDr?4YFvYBaUHJ54Y(0E z;bz=|TX7q1#~rv6cj0c_gL`ow?#Bao5D(#DJc38@7#_zHcoI+HX*`2x@f@DV3wRMP z;bpvnSMeHN#~XMPZ{cmcgLm;B-p2>{5Fg=VjQWwDp5U?lNYBpvkzOf_p6H4Ejh>$D z3A5;_p1=G?PtW}5Z{=m2@Blpv7vUa9NM+dBJH+~0KcI}Y89hT3`vxh}bJlTGV zpr@WU+XWt^@qG#aDxabKu|MUU9tlRZ0oO3{!x%9B;nK)+dM3KO3yi9g1HQsXfFH$hFFXB44<>D*YzY{>|WOs ze%5xCHB*w9Z>_J$-qt5?ZB+kKR?iCBn!56rrt}=4wJF2eRBA5asN^zF&HA!7sSi;% zsV5Zu0w$YF>u+)OyrP*YdRj3?-rY0{IDP+O(GuF$;+u*z?P4=7CULRmot~78iiw_? zjEsq%o%}~k^i*Z+p8k7G^kio2nlWRdr#b%;6FuumO!UNO?3n0DP;0yYNlf%isVydY zt~7R2dfwF9)PEBbJ;@p^CVH;bysq?AE7z5tZRNVs`ObBv=U&4sdgk?S*Oi`$wOv;_ zmeH;&|3Z-$m7b&xv*;<>$csu()0!8Ro~5;&<45bpn7_NybGC%xf4kY~$zC(*zUaL~ z|9Q8b^o?xzKTohRG-JpM(4VXdE^UVZBPdO75-CqPn zm=|)GMViaM1x1Z!#79egu2N8^su-hv)7H%sV+MCJDNjg zBC|BilGwT}VT6Ao|HQ7+yx$A+ZlA;~^iLY$?-b@P16^gPxBp$UG^sANwz8(vDZh(O z0lEe27N@8!zkhjeh`uFR)E2ifpVg&kpGh9IWoDKU4We&JVQw)W7tE7Osq0vOS(wUP zi=8d0b=m)=3)7elnAwungltKto3ss$I(%eHdUG|*lEJ#|pF?9tUC}x;>aLHLEdfqC z;YgEA);2QhHb|2!dfmg4)x_W4-yzJB&AKtJn}0lOe6pKw!z?+hg|Xt3Q-3B#d}1b7 zF8v|hGmhc@sUzI$3vuYlHHV8%LZs*!O&+=V1tq|O|mv*FG@?)2sY-7JUPC1)9GUJq6A5Zio%oFVZ%nR8(0Ncy#5bMj_ zY<>N!!O(&2aWF2%Lu+>A)qAz}Tt2-!|EQkJZ@vw)6tEV?>bZjYGykUN3h591|Iu@W z%^jJ!SHy(eE2?*G%RQYF9>$0+SS;e}n zYAgTqz+cgH3fKVM57U15va zlja?{uDKp&aWj{ZA*!d>JuLN2Z1=$i){Qaa(@>ZFU+#mA%m&N=HP(BHeigg_X*jYA zo9Ln%T!q3_TE_!1>ePv8gOa+rnS(@AvqHcq^L%Q!P|J2JZ>Af!-pZTnCSnDvg)aNQ zoI*>p0W(^yOo*0;ZqgPl-F-x>wYeH*@w9IHXSCYriq>f9Q;!}k z@iCW?q4L%19u_|n+j+LNZj2eKcDn5Ua-RNX17@h&>%Bw|RY0_c%{#vNr|o}s$G6@{ zJLuM|vFWIri4~hp(VG6B9?AmE2F%z5nGl=Kx=CAXbbk?>F6L^O#bVv|&)5X(iq_cZ zbBz|8fT?D-MaCt>+D52ugJHI-UiYwcGqIgxck9NOap|GUbX@p@ATq)A9ZtW@&DK}c zz}M3(G2_t7Byzr`x49l>2{V^}nQ!T%i)}mLPg?UC()@!q#=lT-5ALh$#tK5eXrJe) zuD^9pBlO9S)C+yl{kiE5Ft=J~Syn90P!P|Pn zEMh)?2U|v&g)!!dteJ{QuU4PcmGTN+R-crYy1e7#;##|q(Tk^1s(?ks&ly0O13Y$L+YJqv%Djrnx$5z^Dq)4g#ON6S3(1GyVo=G#8L zK!5x%3)^V1(A*joqDAJL*h#xsm&FXx5?%D~LbTMZVCL;I6Y_Ss{)jDabykqKE6mj} z%S!9E7Z!0*x~4U8b?4g>*V{dzk5`DNIdSRPwquZ2h-I~|L=LXe>*U~CbKT6r zb(*LXTyK3BGY2>5GJS$7JTj^qby4&a{F8l~%uTlJ+x(ZdwnVn|pWfZJnjbK;ZJV{N z=;^jSvL$`19IWRRQgdgFn%gvMaI1{GG43#n!Yn)WNB{ZH;=3YSH2+uJHWB@=Z;ZQj z-B@?8J<&eTwYt~3r+;;0+-GhzZ;b8^o<3ea{(hGI=KCMDan zz0DptWUl{-(_!nom~lFy%XFM%RGf~QMbZhlEz5Vk=X7?q%ne4u-+wibl(L|l?RqMN$y>?BP#mw&Oy6ErOeZ$-o zpy%Grh}<;SW8496MKz`GfI21r@z-YT6pj7fY{B|6_h5a+es6t9x~+E{`3J`xz0+8k zcvqLjJg0lQ=<}{76-He{-n$|Gt ze&A+g&K+{=d8#)PoM*ZX56g2+)QP;XzKa>0m%1$GiM-N9e?O7e=B9wpw&1)m*JA|d zZB$d%;F!~eQNgiw^}m{<{?hl~Z%^W4+s?G&JH6w`;Jnv6jTM{^Q3n!rnSRtoe?O;B z<|Z>ZpG^qP7hTyF9NmEg=c~CIX8C5__Lty%*EOxd(fvShY+slVpC7sg@%gEn@v!{T zM4iiT>${lo5$3E(toTS>^!Ia7<|eZ_+gC4!uE$p|c6vSb7b^C;P{-ar@^<{MzIt)^ zOHF0|kN=;qUgB7H7H+0Sz+jypeDxC7y5GoP#k0PPd4i6SP5&AD;#=E_x~E%5z5v~{ z{hB7TL$CtnXs>vHl*) zcC1ldlEy5u<UooM4v|Uj;<+-A= z=+ZDtR_n&7OKMouV3bWaV7;VtXPd)T_E@Gmyhy}RwM8M*vdci+gnYjPJ`&D5A4Ghd>Eh@3h zfzf|b$MX|5ADrT82SE~M{k!q2R@Fl3=W!|Z*uc!yk>SjeVCu*2PK8M#d*TXEe zbPNA{4zI0?_0gI4T-$TFlH%|46Rx^ytjN`g_Hpj8b*($H-m+s(O}LpG%qQ>w+mm-a 
z^IbsHGq;yH*{L^2W=R9xNUVJ|)MYU{vXNO7yCWN$WoBZyn~)ey^apK;p~FRDG&NVl zEX}Ojq7viJ9jCdjY0V7Xf03DyAaXv!IvvqM*C97r>UDCXmAP)_hKDBVlv-Qg#mo&) zT}JQIeQk76^u+&jJMl6%*;2&&FKzimw)MZ>PJGP<%+&GIdx?HKX&c$F`Rw^;uA!YS z{pVbRzgZmcmqm=Zh5+Ug+Uv@HS(vA6CPaX_HLCYISa%n*_d4pbn7!9Y7yZ5W0?kck zUIdwt7oBxwTVCi;kr!ReRdYJhx-BX%{ycdH>zdZQ(ESqma91bV@DieHkO`r>84pWW zP1Lz`v%ZUYF5PvRJ{P;FyF`!w*UOc}Xi-Gr0Rdqm$^at>oV<9OzyJ!UiO9*!#REnz zgcBY*8j=oNkY7P@K?HFDS9IJ!!F^#-Hbp_f9T!9e0YR1^i-Lmd_q}?rmhK)6Gl_GU z*Y)c4_fJ)S_0_AY{)eu(Epn}gErAtN5$0HNeFg3`!)-4sY5)K$ZV>F47B?F5>1I95 z-4lXMPm!v!9+f%2+hFAa(u@?z?T8<2Jvpy$LtWjd=D9m)XRSZKsS$ST*yYn`FWB3% zxv`;ubW?(4+2hUhj^q1b*G!r;B%g_*m74QgqSDa()~GZrzbz^a&u>Qw4-nlt3s->{ zcStEE#%%gh)w>fe$BU4#&4JGoY;Fb4U1gkkh7%=hcgHyQRN&lO#<|aMun%khMsR+N zv!DX!{xZ%3hBGGMJP02W$cJEb)IQ7`-KgcT(oy>ey|sLyRQh2XF%CIGGkw89H zhztp2s}ChH5&C#B6nRw_*&}(vhmv^+eXv+7KPIW><4& zDITn%AD1C6@L{=>Qa-GpFIA3}a21u~4fs5--mJh`RmNFuIFWL!iE-9e;H)d-yk$6% za=abmtgpa%$8f?n!Mk)x!`$>ZI^7m$yAC>e*)Ptwqq&Mn=Lo8$Q5Qy)o2sea(h3~xi=&L;8f+?3dHXWvAyZZb`QU!0{(l0|F31!P6T35?IK#NO}h<%OKCm`?(Q71 z?8DjrZ#C^cujw*p2df>Twpr~^wZqg7S9_t_i`0%#J5uc^wHK?sMD3++75@YK#@1(h z*e`|k*W6SN?!Yx-ukxf5PRT&|eiWf0tqSf_(v|6I_@AIKn(VNnV*?xs8+#K(a3eOhCQrZ4 zSZLN*Xx3P0w#-I{ce?i*j_pXs?8X*BAGXDK)|c%lXi}qZo9EfNbV;9OX;1Y-1+=I? ojMC&h36*cm@bM^32H>)m50pC3lCVKAh>^k)cC3oglqpUB0PP_w!T{KkEK5IwT&8k4!T1$U8)w%eJ=i)DjE&brAw!cYSFNV8@5?5 znf5U6KnLs&3xx(hEW}*QTay zvthe5xz*V39(J!h=zf~7blo9NWy-&9c!b?JgBJXx!o!xg98C>ixS ztc3Q#{FAk+-Kaa04OkdWvyLw90?x25FrRj>JEj+0oN)*y&)(~IOBWmar%fD~*l`{7 z%;~&Wit3x)@$vCn{s9mB$Z5c>u|(Cz^})Y}JL5Lo!tB^iP5aL8U(NhvK+^^f)INl? z*qeGB>6edu_I0F*;cG^J3iP(;zWFfHqdV_Bh4e4K`s13DK;Lg4`Y6%|Q%k>pasa1w zW?->2GYx{R6W)yN*} za)mk88k#7n%t%^mL#*H%PbIi^rUlIi+9hbWpgn?S1??5IPtblr{f#F})rM=`Xx%`w z+pCRD8$zp(S@(7(48A6v6HQF5CB^rYplLxff_4eoEohIR-kKbmo~om%pwT3Y&U8pB zypi^RjTtwyj>1el$q3;!%zD)*gDv%KDc2_N+F?|U-DX`H)zAb82sfT?zvmy6T_Pmb-A@8tCKP2&3qr`>`s5_GAc7YcfbpeqDjCFp8FFBf!3(5nPpGlF|C zVEH#%B%U$I_&T!G+GFii*lBNVVyik(+A)47^lpUPnKF4ruSXzOmE2VvA(X~Fpe6VFQBDAK( zl2huEsXhjpu+=>edN9cRYO$(kp(OdvS1at9h7SKpd^r*13$V=j2*WHgK|$ z@gD31jlG4~bRjltkD=IXCj<7*b-+_|9q^<#z&z+$kD<97u})Z@gH7SzN3G9eJms2f z9k*U^%B2G2es@6bXE%(>H2A*zA^tw}gO?ur z#NzRBOi5@G49VR#B8H>GdPf>f!qd7Z6@ByVhFk=}HO_R%T4?kvLa`pQNo`OJh-_9H z61hw+f!wr?lP%aF&8UCUXyl=W$WN7PV9awhC4W;820M%;7&^gaUnunfmct&Ir=e?Q z$uARgQ+lfBl8{8Sn}9A;^JUnixw}>`O&VpF_Kqpv{6YqDVw%ZM)`}Qxo8BrCttV86V#f2J^ z7G|O}RV|h(P?6ZqTf9eyT!|+~l?J|+&xlY>zG}`hY{S;xC{94Gymw*>?$y+D^nNB7urTMdaDA#si zajujuP>@5>gqbfO7*nM>B(XGOW6fT4P1J6sYfIVc&M!|%_2)bG;=_1vH3x*9S zLI2J5+5xjVzI9Zo?ETUeBuq}g-e$+3xYKoJhSo}QCAgMgjjMJ{XbINQgx8yUimO?IjEqSv$?eeSxf<06eHa^rtFdl9AJOexI2jO* z=VU<50#Dp`DLmO8{*FTY*PYBkm=>iHK;Vx9q-K3OZz+1W%_KrK)UG)>^y zBFQMp12wZC$KY31Ipf;1B7z5suRqZw2CwKM-%`Zj&~AG=FF`DF3SLi_2rp|-8+F?p79iHbUF>TU zlm*P9OY%i?$^b8)pDLBhc?M9p+YX)K?56!*^Iauy@ifhl55QydkfH$A3a}S{mski* z=K%&b+_M8REU4dnznihTxj3y}Ksc>_+PlTEOI6dhgQ&H&1OnwCDPZebIsp?_7*R` z2kDt*uk8Li&~0C=df{c!rU9cgD zAlFGl;j~?b;zMm<5Wd_O2t~?s>AEz_&`p^MIp zJol8jr7t;n2|UdpDZJT}XzbQNY?9$POZ2vsPFc~ZwNx^EOKWjT|7ov`=VZxJT;7t; zPr}nWuy{eh$K5&{;3IoVO7*6Q8c|psxfFCUGNszAHYnH}xm|5gkq8SLcoIfac@_bB z(Btz^;}mQx2k`ikLE}lt6kIx>=Ki9UW_Iyzfufzfj@y`|1G`Gq-DNZ8L$FX(TCT$9%U_%Ll9n(vL3i!* z@ zOYF`&R~Hp>Imav*ZNY*M1X!>DEvf@EBg>&95o)3v2Jaz zBjjOVW+e0DBB@a0$cq8Y(p%+;ag)YL)H4F!h?KZv35t*4c_L0D{_^?+=eD+0JoCROmX z9LZ@|^}?#w9tGj^w7)9G`wfq9x>nG2g02_zT0u7mx>3+gg1%SK>jZtDpw|m}gP@xQ zy>SFN!<|)&dq8nAR(w%TXdieOaaqT&_z3ElfbV%eK6f?qybRNS?rOI9$6G9aCw~U> zjFd2(uu2IZ)dG){u%yztNGc^RX|t2z)6I3foJVR<&}p@mNRS=me&Dd@aTTe2yh$ql z$?c~UsW=2{1iS)JH+Qdxin}RE#j{Ix1ldU`(Cr}>09fjJB9Tm@o<)`oYG0@Z@E>!c 
zNVko_mcwN%fiP7e2D6;FW+4hs@J?A7Ua;Lzwlh&U3VS zSYk|l950VRnrKg- z;6Cd!h<%zizh@AdwsP?w9()VVB&LrYd=2Sy*A86$HqZqx9N&+0;?5hs`?h~pfi@Ba z$@jpvpkV)Cz|6I+M&GkJ2b+_Uxgkrgkvvk4k4gADn#E0gSp0suf!`I@q__n z&pLYGFz#x;rA&SYrap_;p#R+>^K@N6rvteCG?P}CfJ`Z>`S2#VUo(2f6I zwU@|GWWxBL!+&3ghoHO;f{?xrZ-h3H^DrGIm%ADC diff --git a/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.10.pickle deleted file mode 100644 index d279403b3c7653f8e703822c42e0a7b6c86e3e5f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16598 zcmd5@2YejG)z_VhrE|eHHkfYu(S#&-s^x-Z6boZ}>Bn)BR$eV?J9h`RVKIc*pwK%c z1PGyoB=p`pp#%sJdJ9P)v;;x|7!tzw|4+HwU7f(#U-EtD-%oGm&AhiW@BQDK*_rjR zao>w2t;)`xc6*73N_o<%XwT$qucECl)6<^G*5K3b_L)e~I6BX?3=*Zf+Y+_bsX~IivcbqrW_l6~{nx0Q(&bUL54)57k`k2*wr9Q3z7^|fj^$hBj5FT4Z9Vz!zNNKoy~3{T&UfWHGeTe>kZv19 zEtl}ds9p)!Mz(rmo6A;Mu~s~9y(Au;SfM+#=a%-_32%KXK5r@p6hnt`$>s_xwp&X} z%idq(C;v2;`LIe)7NYow4ro=)S+6k`~p|-Q{)gcq45mOnj-aJ!7BWH`AOjkCsu(M}zmp!Aax9#wG znV!sITO_;&2v10+yot`n4j?fp3W>=yVC4!2225%Z0mR zJ=Y2~yQuN)8Xezm-=I+YsVs`e!uWR2_}(5?Rb6fU#M*jqPv4u41e#f5XT3&eqE*#O zOJ%(oDQ~9l%}QF;-HSyUysS67EuC)7cOuL5q|*s_whhT*FC>aenJVU3RYzpH^3c=d zOv`8TS@pzLE8gAd&2{Loy?w7a>C`*4|My`3c}1WH0(u@3nZ|gF@9mScs=6|*c30M$ zpYrxiI%7hfZp~!v#9pFzncdSet5GGm1%(8+P+N+@U9WT6A{pFAM{*j28!;P;#W3zM zPEm!WRV-qAtSlDKwc1uCb5qvaFXio@w90#XZ0`W8Ja=TTcc6{Y+DCA-2c^7&lgfjK zL_L_I(zry}$SAUCH|GFmy+f6BIx2k_rOT!CxTv2^-jP}kq?U|r)nt1`u2;0%!fTcJ zv<>KL4}sJ!TB^gO!Hz-GU?D6NUO(Pb<952D)tZi+E^)jV^MEIvqyrCj!h-?xxr8>ceIHUA8~Sw8sYNj2#?ja zS))9493b%h5Co3b>N)~xPx*5&|E^jt{rdwoz7^5&tz4V&od7$YNIUk^j>piBKlHtm z!gf3vc04|0$M#$r8@5xV9Z&VW(~KQYhaJyQ<2*Au&a>8LoM*$1=Y;Hdu2zSnvyTa- z^GYS1J&|-$-uX#KE{jv@>L-VmMTh&z3w-ZKNoRP!Gr}qE6MyWC>URpe!M47oxn+rT zU&6aEX(gE$oC#Lut>w;>z< zTH@kS-+Rp9;&I^O2?gHYM8W$cXQS7P!EDY|SuVg&!G=$VHZpjiy|`C&Vo#lz87OZ?{5&M8SFqJ-%?R&Rl4ag&R(VHnulG7aT2JZ?Qf^$|R3`d;RNGtM#YBIA zP@GaUC!$Z_rYKJQ)Wxc7YTedD+f;J6PlD$MaO(NX=C zqr!HdbNnc2#Tu>H3@bL%ip@f~IJ=KAvA=~Z|A`h{{RLBRi2kld@{j09J`IfIv%yF5 zIV`h^midB~`H+_R()a#pETi`0|AJ*c30kJIyK@O3@s+gAzkTm(Z5zSNkW7gfi1eF? 
zlMzm2qMSTsr#0DbhziJOQlgR%3A}!bs-j-lUkqV&jtQUU7?+cV$qf_gf-7U4d}9>V z)b;wD>AJUET0{+*e614+s*22FD0yo!i(!z@_+mI8vZpUb$VW7~(sVOYVja|d!5w|b z7h)9qH@Tc^&?E8ooEXg}yc~&jQSikW{55_V%W_Jr#~yM<>&wQ_@r4*i-dfCP1IWWK z8}cDRzid?0i?s{;{nb0);qopvrZPF=O{gW}gH4N?*9b|QK^{IB&xgPVo6AQuy3$PW z!4|0dVoS*lY`Mi&?BC>aUI8EE@ovYbm;o6kTSLbG8ph9Cocae~>kO3L&;d zSGX=I<+h_7&ib?NPOcD{1q99l%rXzhX!BPKlk!k*2C8BVwwZ$%Av(k?(p- zOjv*`e`~vvC$yfSnhw)+xTYgCH3?qS;mPoCJxK92TpLIoE-sYzQSafR0sYLYxR`(n z=Tfu6LB4F#sx=wx1!LrN)nSaL<1|fby8ZOroaL)jALFpJ8c;%>`KYB$Oyo$`2?2bP z9uM9Rc{(sroeqe}l0C@jUp1dZE5RbcV3|_T5+V(=-7jy78>B=A* zt%HmQkS5Za(5xeEE__`DmZ~9o{r=!Zs zHoY!&I%#*~CvTA6_3X->J_8-vzR}U7tJZ&c@#mm(uIj#p^xR$Im8(GCe&XDPq!Uv+ zTuQq6sP|6!0<`Z^|25KkCyd_VOVAMyU$K-lwcoCfemSa;0v2Mh0P-TvbBr5`a7$0` zKNjI`i7?(zKQz9=gdU=`Dy44RMuqGMnF-x&L``XS*g#*>)H_>UXVk(1=5qnyL{OV8nWEd|_Q?^QSvf)swM zZ-3Q)W6E~-FPAf~#=>#oNCFfK}oyFg`&*_`7 zbmD;FP2|v#gR98&;hjD0 z;^*X5%F9u470c@u+5>yrU!bM1x6QX^ow>gFC3{y0dvSyK6^d|Xi!ZK5iML_m8d(Tj zh>2^-L5{Cp;ow9MZyd2$JYHTdu~)d26xT(1;bx5ga5ss|F>yUxt;&v0fsa6LK!uxW zd=kfGwmx$ZIkjtG*^|)pfw}UEu zcgW6ReRq-veR$g<&hL_4c!5W}xEvLA+zqPl(ZoF{XE6u4Q@9ruU)+bkxZ%KgW*S$n zZM|LMesIhtUS1)~?)Vy~uw!@tJ+NbV5X9As{XG2ZMPiSd<`)D!~xbjzslHY)XEb%0WgM(i26bin08h?|{bD`z!x8x}c z0vGWZv(fYY4o-BrU3o@IglF(_&izqh7aH5e(iI><}KI~3>ZEv`bzXQ;Ua zd|*bOOBo4}Rk9d2m4}3w}&=>22`eK~qs!&%8 z`vz=payd&K>drD_>J7m(rrwB>IAkVK1KmUx!*JV_yy8GNV>LR^@nAZ~!PJ|x5H$4` zMw+Hx{VnsV<;uX{w*6G!O^7Y|SZV65_`Hazw>D!?3(z*KMrW`sm@st`gr?q(ePC+Z zhNj*g)E7HQZqU>_vc1XWERCt>8dL8CrZIIbB~xN&5;f2|Sqz)Hp1h)|8(57FbOM;r zfUVC&sR3J`NwN^!`b;JVif|W)yjALJEHNe0C0-Uco1Up`lG_ovpOCwpY4RD;#Jk91 zc;vfkZF*O-o9u$D7v9BurGf~YO%7}UYe)p*1}69$9AQ`;f_nA}>iKT>Eg07vIWB3xCRq&IZ>~|U=Ds(o zaC|YRH-n)p_q`I*^TN_CQaZ3A>;ndJ`g{9Iu(aO9`4HUAwloAYQ5zWMJMviG58xCMXFcy+GXIYzv{QubVaw( zw0J$E_&ZLQOG_JvC&ARtaYgc-ZN{p=)WK>ALoNm%DbE2RDmvK>sGygK3K!HDhfA&k zg?cUQVtbQIgP4uvFnlW}@5J%NSXOkS#h{>vWxbE=MHwjQBT<9*WHF2aK^{<`Gf9?J zHTWU~a$qV5{A z;nX)t`ybq*b}Z;ii*7xdblZ2DpBT$Kso=cM59%S%FCYW|ai_PpUi=7US!@tLHpL0z zLQ|Y5eqxG~#6>8=OEBKWC}~$;B8y>H|CBtqS|1TyiYh!^T>gFrh62iEC8RG8OJ5z(A|~MZet#pw+4G!{`C{Jt9>HG5%|S6P+c zDh9Cm8kh*)*FhM(Z?F%7m+)uszR5by+9W#|yl=5XlS_;t`2J96H^KWy)lTf7{ZEB< z6TEM$c47kU?-bhO%~j2DEdPI2?ew({-n>q14=ID=G$|_gmVcGSu){wh4-VIx^p9CJu^q77-@sQ<{YeQWf7eRZbVdG0 zLHnmAw11|xf7^-i=jaE7ucESGYWPCSW2+Bq^U&gB6SRu{_cr_Ld9~9Ot<$t#(*{i^ zXgX2TNt#a9bc&`^HJzsEE}HJD>28|tu3j>}q+uiw{z;2OUNZh=tfZ3sSFDx*!oR`C zPV#FIh7kWZ9YcsHV+h4i_eETCgD)B7;2?tN3cZtb<(i~%%L{3P+e!+jM3vMbhdV?T z!?>*`54hFgUc;(YIX7o_H;M!*F8>r)rJ<-fQ{3TrJ|HrTy+aroP7d>40V9Po#4`Ca z7%{X=+t$!(T{qNxpJxuf25b^XgrA6|rtn0wv>1v(hkk}{=wR(~r?!rqtF+@tSq$58 zlo^I{`)E|9>56-`b-_}w8B;=jY*>E1f%5Bvg`~^>k4MC8Kmj0z{v~2IWZf4VNp>*l zZp;o%E8k{1Ywn5D~&K(ca15ews7EC!wic?zwPM3r{SopMH&EcfK^i2@S{*wIKBP&g=(|(*yK)9jGtrC099I9dkFZ Oy~(BRbdJd9YySr)U6JGf diff --git a/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_darwin_2.7.9.pickle deleted file mode 100644 index d45936baa1e00d80abce3c941b6d88150cc4ca8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15013 zcmd5@2Y4JsveqhDT4RHa4JKz;l5oa>ARA+2goME!6PINrjXYW_P4BJ~9!#=99x2nM^ZxOcZR!0RpTUxSqE*qbeo*#GPTOaB6_1x7GR&__GXivAbF0kEx z-BzEJ>gM)u;!HV_IR7-;ZMk&ZUA-<=S5jAU?7r?A*1%LgYo~@#Q61a?-B$VR zxy@O3pl8K8>SFmux#G0DW>ZJLy=#6=Q>U*0d0q2;`eOJF+>EZ_35pL8@0A zx{DM2Bx!p3*U9yV~BR#2n(6h@1=CH<&bi3`Y(~w`Xue+{Q*4mMC*E8yu zD-PIK+jadS?Nd8)qetrY4Lo3I+ z(=tuVHLcLJQqw+~R%u$TX|B;xOWee&>;i5(o83(ve;o$OswF;6cF>F2%yTzSIMr6H zKC+@M^ollyUCQbb+vI|bKDyTe1hff-wLq}#Ib>8%~J+U$vKolUdr 
z(;ewqwurkUAUraWbVoVsJ3z+hC^B}W9xE0*&|^Ydt7JuH_w_R-opi^BfHqDK2cYe3 zfHqzNZDH8Ib_LLq?oNp*=Am^l2Z|d$diaQ;LA?!Cird*LYw7HCcbQVF``fujJbYKR zs@mwPCalV;CZ;`iH>+}J&9IR*!`w-pyE~$9dY+wiCp)98%0}up>rP3!dwA~DgjLl( zOQgWdx_dUIQjPgmgs6^GDh^mQA=1<#%2dcWvzJvlC*79EIQ7oBd^(?1dTeyob7&m-ALKyi2Pd4#&OQ#bob=p763(7ZL6Iq^-kBLtpjFnK?{W{7T7-O~(P9&{ z*j$J+fq1H;i|x5B(dp09(;p9bIteQ_!HP|^V!K(fNhkw#I+C?Qk(rPpXKO(~zr{em zt=AaqJDbw!H@j-x)+$=1=^C|C*X__XCfgC6Y-eDyUA<4%g;oT$nx$5osnwk4=EGW@ z1Fg;uYPF)Jtuvi-=SscK^W6DHuM42p!<6+GMrY)$iuDhNQWu4kdW0UXEfAiRhI-T3 z@W96`kMFt1`c1pvT43;}YueFzWGG&pj@z$8PBHh@c)z+gsh^r4mo@+!Kuw zPl6IpE(*?iCOI0Rr+DtE39GU#-DtOE-P4lp=?Q00DEv33vvz#4=v-iTG)$VHlAkmD zVyOHTm$n2ba!n~cSUr2U9es2)`3#5PdMi}=i3fcc!O5qjZr1sMA;Rp@MfjLTcXmp zQhKFTc$@7gz1{L|7Z}&1s~&D;)|Uj|CJnF2mHr@ zm?uJrc~U=?DX5A`a35VTSnds-E6$klxTlnIo(?GI8P9#ThjO0#IpsY6TPWuRz~V*1 z;w8f3LBir?&wa(fA};fvR{@JBgRm%T!uIl-#Kr5L`-Z{Ao4~~~1>3ixuzfoK+un5U z@6qo-hwp}hB_3$c>J+WW>EpWv%IYFknL-G^M9h>#^wN;mkFP z%y!{8WY?HOimaKV%HN^J?>zS(30wzc^Nl!byWd;=l!O^>xz2HafSG?(EBt44h5zCT zSC;{PUT{!`d>_4?I4!*scPz3u*Fwq3zEz$H{K zcCer9p-0rDx6EaYTGe5Yrb9GMXu3^pZpxzNs*f>7O3fxN4}B_U5HUQwZuKC1MX8<- zwhLwgBUL6K$|Sp&TL5{8r*fc(z+HkU_s7amzT2wev7Z4ihwBzm0p5}&fUl@TN!nNR zp-4Driz@Q||5#a6V?Jq5#5w6np6Dy5JlXjX4kK1W#S{G`OC3b8Ghwa$*||PLYb)om z;w`OhZ7F%o6RU&m8_Ze9>eHxvMlj>C%2V(TF@QyVB_IZJoJQwf2W}+RM8y+>@HgRn zZw=Hw#QO*BNMbPBlrM?3*exm6CWjlU7$W7kry8V~TNZPfH!+`W?Xa_1Uh0W;&>hcQ z7sM0mNm*R!RjR&XeK4>WUI9RKu>1B}CqjWJo^cN4C2kW=vL zP4(LTYhIlx;<}sZhn6lj?#2jE#4k7R(J|zgTcBC_<(4QZlx#&2nR#V20utbOVr$9o z*PV%2ej7^mX8CO?BrTtzwY#nVdqZxAdBCT)=d|I|HJqU{IV0T!ZoLEQo){{*LAM^p z{`DD}Te)>6>dMuPxvX~P;pmm28)uaJ=B+Ahoh<_ABQXMOPmILh9^0qv5Lep=uH@5` zDG(Q<&=-z4S_xG(&f{?9Wc}h$9Y=#c&uo&>usFk%Lf06iMSr z@)eLc7z`zyEFyhKSUM%8Cl;g+^`+BAq#MK1O;Q@?JoD^kU)nAr-4d3bC8ZIYjkE&> zVrMG|casdeV>e+~Ws*3Xbx*WOw%Shc>JX}HXNUR>BSqd59ez7*Z)AI?YNtb@y~}Se zt&@i{;i`7x9qqzzFRLq;@>$hR_~dQARgl|}Rn2!rYRkx>$?BJCwp-G;TTIPv?PwNx z@+#!Z3^9l0HU0KL4m%eueh!;&%sR(=Vjg>!3wzdZF&{-3^At}kK#4bC;xJhVT!M*( zX2BLWM~)-v2g> zBT(T*nK+VN6ta$z#V{_9CaTDAy zgJV*5`NBeW$E$ZgMK}dLkRqH4lF=LdX&fdgPA5?_K0_A6Gd`0%%s8fId={!9&paCp zNN?bYbBaixtEKggqlrx5%AfBg=YfMLaXyHHTNUv;6g+VO{wAEzor<`SJf%V4qBO>6 z^l`VNu1~~8QX;&9iwhFe`BK^Cv%uYnxCAT&8s6fhn;ctQin=E*!(Y5=Z*FZDm!nu0 z!`(zWDXw5=nGaowQuKg&6?p}9U5#o;UDtqtdR)p# zfIK0K;U_#vUI8Fap^A5fh5h1bFqEC1DI)!BSo%3B9oQ$I2Lrp!3m}Boi|h_~(fWkf zORRh1WyvPK%x>}uJJe^Wd+Zjk`t3ly+$~;H?bNl}E%5%?pzU=q4cgwIWKz6IqE@j? z7Q;Gti@XBb-ext5ws*jEW%7e@UhZi{>tY?uKd0=b5I-5-&l>V;O}5U)!%_o)&F20sG7Q=s=o*I#B#|E zs`>}E*Jrp&qw3?0s(%F2sQRCjOp1SzsEPht7Q?Fk4|xSu|HNu^qP+3M2uOWmD9O~P zR2G7%58f5Xf$)J1;#RTOtAZad@mZ!zX-TO`diXt4LZ%}!pO9G&zirYp=jTmP74uW4 zz|8r@Q^G;iQ(4I(EQDDJKY&uJ=O<9nG59$Y$3U=UF9?uICdD8*FA34X zvKY4fT4r#y_O(%k;R`W+2pCFp>lBe*H!QuLln(3&>w|%qz5xiqy&<~;+(aY6y%Fo4 z*jTcIIm0FtsLxP?i0Pa9?Iz}LrrN0)6Vneh=-(VngZ?cjnG{=+s8wtwi(wrk$Sc@? 
zYgQHd@wc#fZUa6BLAtnYkwLc8gZ#=*%f$AOhq-F_kRXb7(Bqa#6dlJ)#^7&7aVq+5 zrRTX{jkbBMqFZWOcn72S+ttdgWhkd7!8DBXip0C&X08fMBUmk>$dTY9AP>7wl0yd#zq=mz7lsdROd$YG4vteSd?*7K zUgev5WSl9%aq#5imzUMGwje)5N;-H&ZrGZ^%(8Fwut-RiTt=LYjNLvUSt z1jk{#ok|8i#qR7JCiX;G7Ke*EQyd}oGR2Xi-V{fP1{7f>C~t3+w5g}bV%XICkjFr* zV7V`Od!k9QgYmVQ9qKcL7~-q#x10FdqS^@^w9oR}%?*sB z+6f7?xBBho1}CH1X=)vxt!nwlw{Po;d_0?V;JpjKsa_At&=jwr1)pzFw$0;^N zM;4JjDlC1pln!|9F~0QTBGOC3(#J~afY%=9OLrHM zK3+@fo3ayFm0#2bfO#UA@Z6I?=(#7e4?LIHr{|u+x+hMR?4akK#t!uv0t}vay5DX* z_YBoe;Gq3XzukE5S*o3YK>OK#dud%|T`BHv&r$6(wD#P*Re}G4W4DQO!Slp_eZ~?$mmz<(N}aQcoq5q;a798U|hIH%OjbCws|c5#(1ovzg;?~o>x<= z=ul0EX*yif5t@$Fbd;u}HQiCuF`ACmbeyK+HQhDjEJMY)SW_=FG_Ok_ZsFpS?pEd4L?o zJs%_fEm4X58H^ZOqIGL%m986VKC?1c8$B|_2g6UqKI1>p>??+%XXt16hW6Gj)3k^9 z9HqyHWihPBN66#CmDwLfRjRI#s67Ujl7GC2{1ajMCwt021r~zt(;!65GZX+~XkQ}c zS=K%AoMZ=s?(^(WpP^+CG+*%BP0)Q&wNsDgI660@@$?dy22U?jGAUjmQR{eB7Q=dY zjl2S$UT0M*LV;?ccmtdQp58>wIXuIgFyLt!Yr&B77R5Q552vLe3;J)4e?6Skc)wKM zz|x#hgYZlz#oJtn3=!{e1(6W(u344B%6qIz9c9AGc^?gk5g&li;vcdfEKUp1;va!} Y;$z8G23HrEpRm0?L*3~Zk1pmY^tFP*_T!)cE_}(bshE>`rz`w5PVe z?e|{4__f>aXPwqq*3zp5-LmO1QL}6|Z%ixaHEX~c)2fy}Agfu= z7XG|Nw7$!J_ip>+{H3nZLa~^=Nh{WMd;OlnqH~tc=Czt;v-z^OkxEq`D3oh@wX78} zO5c`3d3#Y`Q>={Kut_Ux+x4m~JMABKCs7l_vTtP8vS17qP;->`aV)AF;1NdAU+8X~n`W-5zJD8*a`MURk_sXX5o7OG%WJ>!q=s z0~{lzda+i}s#Ofhda_l0q*AtO)%r*c{jxM~XtQ?EgEyhEZnp>HIh#i7&%`rxZg-od z^R=C0I=z_KbP3HWE?T8ne5E}!G_>&uE`H0zu&}{I*}}Z&SF`upH9I#xxFZ&O^7Nms z{>R){?DRv!2MCr{zy2J-M}E2Y{C@&==TEtX;Ng>gv;7Ug^qD<;j%F$Rn!L_c@V2{Grwau2m4=G+m()q_96RCrI< zj}LaVqeVitBFyZ?FGo0?XPgL{8w-Yvg+s>4#%W4?jWdk1Tn084j~o5FZQ~r`SL58q zalmiJ%Hx7hAXjY{4lG$du!L)&wj5~^%N`a_Tpd}qv59q*wpX*LgM@eF>IH}!*q)`W zVqt`21#6!p1f{OAl(qw!(3o4d2+0dsXRc-`{nwtg>NyA)l#n2BmUMv9o-CE+b#1aa zq&+=atY|gcXme2AB?XA3Buq=#CLu8yU7RNYJNZ*A)+J%Lgv73N%@QCsTSC9E8yo#X zL4ths_~48Psj)3e3`uxsdPgB*2aQinOqb7c!4XI}b~uN4@1}X~;i6*PYCK?k*x2ip z17qB{TZuq>PI9!t*euETs${=$XFXSEisz}*Bhy9AjT&MVUT{>^i(W(?Y|`7b}9W0zWtr# z+aW06tR@B47ery*OEY$om)|j>zP(Z{jA%tTl!KlpkHg%S9jWBW z&NTP~G%zXeC0>7k#cv6EZ`_MXCSq}#3G2vKs(HPt=k0MA1=gKav!F2!!gi#an+-Yg z`+R1@yck-_??7EaLl46%PUf_#P4`{6k5($SjvDU4)j4NP6M}<8H_yYAB;AuP-ObZb z>?J)Sjh0BC7NqZwB7M*_dRd{7o>9F8dsoj&EGI->RHsqa+nr(qGOI2!(;j5%eq<){ z#|ti@U_hD4DVVn#m-mL_a$f+Kr9w1c!pqx4@t{Xs5usPAH7?%>Vbc`Kl8+1dtCV#} zUhQT`9&0OVIUP|oDiA#NdNoo`v(RqVmDS2Fy*#uwJaz(0nHOQA=1&&F-qj@G??4gd0_I>;?pqC#$3$BbnB;^e`*bgy8vxM5O+F?J%~fPCtKSUiMok` z5n4E8&|!X{&?k#Q!EyvD3XLumdZhBHA;8WdA+{AkvQ^VcV~Djs))8BzJ$va%PxTp- z^fXy&Y$w*p5?Ok`pLJck<$`sZMOj39NBq=bQf@=fWfIe@}Jz-A}fauypec1>dnub!LANg5|a^Plu(Ao<*lQrNl%l z)`=9qb|rs922(~4>xtMv=r;~Fu^sS39BjHu+A$b2B+Yu5oFp>oy`IVVRoIZLzWxpT znSq;Z{Ma4Yg>NKJ7&3LVfp<8#ZmYd%(W#qyXS39xPeCT3+)e+@VUMFjnbsg+Z@PSt z@WdB?P<)UxvtU#6Sv>P7-F!j)hrH-flh7=q^Js*JCRUVKDT^3<#{BrD>VwaY3Y%_J zE(Y18()XhBWE69ypJSaue#o6_8KcnuAwm)5xy+o5AdCq*>~Rv3DY)#yoJ92#=KTa| zt)7XdQT>RB^p*SGJTMP?Su2+Af1cp@_s<-d5BTe?58Xm=@gFmf&*ytwjs5tAAG?&5 z$`YG{r2;GyV7UMn*I|we&?7YDg!wE~u~9|&cWfKy?HPzX6R`wO2Iy|&=_)q!A;L>|i0^8V zs|Yn4aPyc}Paofm=mRmn+Xv+mSh{9Zb=?+0wqjNh;d6#RYK_rL`?-;l9;Pvr_Qx9d zr8dl{pJOG2A)@d+;aKeP_(kC*RpE*PG4_{GER9MRa>-ewnno(c)~$+HVUXB*#C{@A zUuBAv9tIQei?GWRJ3z5{4oX#q-yFb0h=v6#iR@M)&bQAQGVt1nL7pd0hvW+-9D}gT z6?mBv=;meA%aq(l&f}zp9UhlSaF@|TgThF}G`doYz*_+tY9LiNiY2LL5vg!=CoA@y zTlN8bs+P#bvlq_FCN}yiH5a6rk`yoPLXCV9MjoOz;oi$+h)()L5MAw)nEk-jZg331 z)amWhM~+V8w%0*Tedo3eb6ZVN#Qqsl+HGJujo{-Xcav5bLuKlzDU9|B|J;Qvn6y-2gUex z9=9qyKh#7UOa^m}is36RIlUWV_c-J<4zx_mCaS=+!6>0f=T0p->(0)#Kt>eO5AT_{ zzITH$wWc^3$4n7+&XKYQd8CO=2*|-wH7kNo<-0{0dLCG3*0Tf!a* zXG!R|Sv7acMM|ZctvjhiTSK$w{GHYfldk- 
zdW8UiBji7h2!uK)@CqrDLoUi|>P-QoTomPkOrKvU#v52A;Dr({k#MPm%OqSb;YAW& zEa7`3yhOrFC0rrlWfHEG@bVe?XX3Vi^OjBp+!y=w$Nw(9M|--1gws={^n z$bIWUvEYe1>%iw?YR}d<5d2Cxk^w=aaAk}mdz?IF6jtW>U>iX|<1t9hHM zaCNb$xK~xUF}A2Uq$=E~oM&}W%(XW@-|3N`??^p8E1TCxHJmrj-hikaeiZ+L&Sob# Wf|*5BF0?jQ&ViaF~TSji%$1rc0VRs`ETj^n&Zl>JhP%_#Q_=Iz@BgpR)7{Jh%IAIG@cZqndiCDxs+a0jb#r8?=f)CNMQca1 zJ=;a6EMb*5r*pPj-qe-uXijHqa#>rn+SwXO5>Iv5vr-d4kZWzX>&O*%D~gjws46|T zSHddI&2_69tnwCNyKZ%qv9qNmYv;1@3F$d;H@?+TZg020FJ9vHFtwlH;bj{j340eAZ40yDgn-ov8@i{@qsjj9JZD zcYtTbI_hKjM!Dj&yGBz-zP)QsO;e|^YufW|xz@B0m1Uu^p(;4azuhZCM$KJb}02jsBEj&!^2uHBGdIMrRpDr@b?x$7GB%M}Og ztL?g8k@32Y+~|>de0|T|AmOax40H-Q*ih-f!LnP;vNuZ9mo2+&z0eZ*iz!f2s$rRi z2dk{5 zv(p_rsaDIebB&nPj%u|#MOVA?s;qXGwCC<>RSvBgHnL`zTkE;wkVMn7?W{ZA8EREF zQtw%Jx1>A4b9YZzRqfM73b?F0u_={m%(o)bbfi*otgZlI^$BN)Lrd-jOHNj6YlyCG?^Rjb6ew++ zx}T=zw{LWQ)^Y^uI4Ozd1K-p3gYNftpz{L~&Uk0C11%ruxd$bjiB3V02Rn7n)PMr5 zvgUl3dx+Fx(sK_rT1-KUhZWLAAT8=7frm9lr{AQfKN@b^oUmf!tk`%fwwo23fHu5N zCnOt+YzZlHx@JU3w3v`^dQ6pm5GtL1y{p!3t%6k=_N$e;ZiTKf*-Uh@GXj%s>v^*6 z;O(GRJE_&fsMRjdb;DW}(CYM{Rx4WCI@38fEA^W5+`Q52Oz3r%vi|JojOMJ0_2)vV zhli9pPm6N`X-jFSCygzgFE!?=`5h6R-~3gX-;vPcQPktn)Z;Ab@fgoNHmt`5(Br(I z9!uL>-Gx$#$9eAYMv2`};t55ab24ic?V%@n?nw!&vMt?cw`JXvlkO=AXP`--(xDsE zSvx*abk4Oq8YYZWHO#4g4Re}aF8A+Yx@q$C zL9Qjfz#06zl-fGsKLn3?IOH*p=*O~>R5=OZqZ`JTd-Bc|7ruDhqe?lC1(fr+=RUEF za-LkCa-RBcDCcRo#WQq^XXzIA&@Gb9>3+_op3`+M{o(BYe*WQhmb(>q0LQ@Sm_TcGW}1;=~abKgO(O*M5k zBelI7vh{oVxk!R#r6ov_R#i&o6BPCDcU!$vMlv`*z}%sa52JJcqn5$S;v;0u#pM{yW& z|I{_^cQEt!YK1>USGbfbTwNyk**%&0w}w_+3+%1oY8v*@u&;)`8~q4V{8ZR=_4?Iy z!8Eyiz3u+X-*$iI{Hqr@*i81&;xy=~Z0T342nK36Si^*dTi50$&0nf$j3rWPDsg!d z)CI%8>sJps?teHF>=G;hM(P6KmmaH9dH$wCu=v0=fM8zJf_~|?s(4&yhRflaM8q(* za3TCvl%gdKD#}P7&eEb>%lLm{R#AcZq&-o|N!NO!mz?r==X*qms6xjR)e@yn9@vVo z);NjloS#*wSn-zDwziZ!)QR30@-5`7ZS`tY5h7N@fK{GCEQmgA>Wcx<7oB{gbBBYV z5v!x)36??$=UZ!lju>8p>97&~F{UC%3?NxjtU(O7PccyPaR)W7VXj8Z4cz#Awzb2~ zW_c4Q27w&UTob?(Ye`<*!&NF;F&GH!yts4Yt(#bz-5z#cu@0G4sKUX_#h6%EqF3r} z4B`Fa22YRsu763%1~(;7BWxe{%?E_{y|Qq~_kg{d*PTbWbgj3i{{T4Y>h?Pb7mY2g zSPJ;bvx}w?j!bT|gmB~jpP&3AVAm4wJ;Db^4A|x;z`l=PF^4d@-;PiH)Zc`+SPzp; zC&l_)=Rn7gVX=W;+wT^}P9#>vhWd#JS4J6wy+|x>6s8Kr;>H+LvA79Z$~iVAy)3UX z+VGnJrbH-BQ3k~W%hl45%)js=Jq zBb%Z7T(JXjuwVHC2W5yD%Wm+F!>>Zbju=u7w3C#>>|z{fXJ91HWzoQp+aD;w5 zw88GKpAeIwU(VVlvIS2JQ!wo1_B#o9gnVeed;IS6toS{lL>tRm$)_AauH zDVnxYu45>r6@DvT$8!8+pYSXwH8RhsVG3mx?BmnaJmKH_l0u$Qge9Bd6tQ1H08_;N z=psX;@fi6kMjQZy;y$nl_d#LqgC+O)0{0<4cd`igp<(WnBB`HO&Ie;Ps~J%S6E_} zYy@tx#B5@av#S<3IRBz9iN#7w%gUtV`S*lkPJ{~AS^S6FLfmMHxg51BT3Q9(ydI7Y z>tMWxZ5H#;;Z2wDNTNLJ2-yt#<$U4_Ti212iEjK&SaF>GQ@8|e z>TDAy0As3fc|j}5@e17E{!au2w*Qj=GJ5kqnPQUS6aqElQ)M$eN01A{EF2*AO$i1-5-#!a3#r+4rm0XgR1~N zakaz-v%xhSugg#^s&H(y`q2LPydovvn22jJl+jkcj%{t_>(PdlZy-?9zEL*AI=YFt zf?wXut}4+A#n~-DDofv5M8a)ag1+93TH4%A`WC+(^sx9HQdIERf2S6No8pCZn!(Ev zMOV2C^q#m|$_cy5Jz5f`@#EvKfBDBg7RPp-RoKQJ@K4G)4$Aa@;nLZGSoeGix>QHc)i>$UR2}Mwc0K40@--mOF$ZLdzqX` z@d|-j#jCOz*1>DU6};_rcB9_*29VAa=<`iBf)4VQ;jXWc4&s+=z=wW4^^Gr@@o1m# z^2FO1RV#Q0t!S9POI%?E@39+Q!TUfW)IR{AEj}a-Y(d@77Jmfv#77dVLS1d_A9K7e z!&T}~cd|y+e*)5|`p@J{icbjCL_d|yu&Vz;Tw$W0u^XM}=Ri7hq3SQ#2&(!^!%bDM ze098ZzOuuLs!sLVRQ#2Xl>zfNJ}+{G_jfY~wE=y_Zgd4-0|{0C1AwajhBQz$bwgEu z3+Rb|N^DTo-*LPy!&MqpFEpzD9!R6=AIO;$O9|9Of0WIzs(&J`pz43I8=dISKtcja zpMOgUD1H7T8^O}&7h)g?U(O(J6?;D_coP~VDJ>~AMNf=FvK*21gsgH(iBmJjn}lUl z%ukX6Gv}8{2?tp(T-oqLB_)HODn-r2&z2|y$(EUuUoYW(qE(S|gpF^7$Q!A0_(7AY za`;gb9FX5M@f7TT3nuy!1F_JC^uR%tkkbbOp)@zB2=|&{?zJR$U`H4X1akV?0CetkNDk+w zH`2M+W#1F)Np!GgSf31a8EOzYeFJ~oqHv-al|HkA@icJXADmInP 
zunsmOu3-Po*;U?;zlF_n3*aFH<>Hn_ge0_(-}ud#*b4kG*VcSU(E8hGxn?vPK+pJc>Ei^2=M^X4~YvtCmEvF})X$a>P$#*qouF9FVW4DMRhXRk3 zHw=JYF`Q)Z3R;O?F#^yNBPCWjg*q0F;&@$#3Na~fD`WK0KpLZOPtK$mL!jomgKUP4 zK9)EbUEl2O$gY_yzg{ZXeJ2blyYE~?&MsQcZ?*fb;D_C7rRbpH$7#X8GJSAk3OUgD zpqygqLm9a6F5dJba&`;K!KVXALtifb@w9<>LojyuDIXGcKe|cHAi%fwy>lMnmY>!? zJBV)%g6oOxdS#%qR=nqiD)7@dKSn4KM* z(8tJ7+iXy1JI|5Lu$|`;SBTof(G5lIJRp>pctyC62y@Su+<~Y)(&s*^2=~!p?qei( zAZm~Gxfc}SUZ}bCP1$km%1>egj(I$gh}>=fM(zouLFCf+8M!C2?}?KnIvBYplb|j` z2SenY;*XoiJyngft=u3>*A>;$U!HhF`$03G9I8 zE(BhM^hHI4T&#us*6nz)Fa44t(wAuIE4mZB6m;fOmDEV2 zjJ(QdNoDw}*)8G-*8q<)@>&3f&~+q32+_+JLe~R&;s%KgmXSAdye>mi=rS@RHze)1 zHvw<__GU6C#VrJCy0^+^*l%wm4t}e{{dRV(in^TLK2F?$PKJL9Thg8AIa4ycB!Y|F zMe2}`+)WJgp6?_6Em4X586PpQMC;bTD&05Gd@5zGHkPRn?+HH<`;7lYv#%Hk%Fxf? z4eV)LmTC9$IZBWB$!1uO_Y=p3E3-dl)PWdZ`{(HP=oH7PKwvL5Sb$0;0hus;!U$EC#GC$yWU52{TIU=90`5&828NUDk diff --git a/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle b/pandas/tests/io/data/legacy_pickle/0.16.2/0.16.2_x86_64_linux_3.4.3.pickle deleted file mode 100644 index 6d5451f96e20d6e8aea11a1777edb024a7a561a5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14116 zcmeHNdyE}b8NcuDqua-_D^-;63SFVBZ7H-umECS>sn;&WQvO?S=I)*CxzqdD-no}P zXsIN2amfJ*9f=|S(I6pF`Gbf`LNtUJpBPOvicgGxXne%*5?lz0{=V~=nLGC}_inc< zsI%GmX6~GG=A7^Qea|zWN__6UTO;B)>5TALG4Gsoc%ot+-0bkRvqmXr*rU_siaDAu z<;id}Vh z$~LFVrJQ5(RKdK{EQqHpg|U+UVG`8SVFNsoGpdHehfB5M>@hi+BE8e7RE%Q|Uzw@U zcy_f?o31)%9K4bmI(!8tFPfZj(vj27L^SK%>wLl)Nk`L((_XV=u6k_Nq>iy$x1nV^ zDqF>Q^n^1xIr*`F;Si_vO;sm5PfLksjfz9(<2cWh%Qzpd9d%4Rug&2dSj`| zHq6$si)aGu1N!wb+*~OKDwj-jt{y3Sory0-R`X>YU*i`;*09lGU{|Gl+$>G*9QQ5I z8@~tRU(RFsP5$Mu_4SL1&|)TfB9Uk`7K_In++wC<^)scBc2ldU{G(Q;Sfb5)?Agpr zg@gpiBxed`1H!#PP1c?@Dz-T)pYzrG#qCZAa+m0F#`)k$=OJ_);TL3b=8OS>W@hq* zLgr4RP&1v~rxqkC`HGBP%R)jBv>a2tlMlH44gs5(LAFqyJ_PV)MC)Ecje31Dqaftx zWWH22D3t{f+ui4iDLVvLSEJh}t_6ZqfhiINYgH&$)4 zk~eLwMT0_%zz(yTFPa+l>%RgXiH5;GO^u*2UI7_mGvW1CehqJKxt&X-Kv z-U+Sn@wc~Bt`v)$maU9LM~~>zoa;OxS9TJlQ7limv2&6W0LkLx<=6>k_EFTn=OgVA|?Zq6p5r-PFc62MI}_LHq$0}V%mV2|L9y!ve)~^B)vq@%j)FU zd^sc`;w|-T&;35tqRlc=av2+GFz*Ooh973DVlsL%=t- z?{K)=4wY^@*lq_C4lK2Cbkmm6O~NImgewto3EAJC;=xJ=9s)+TCtJ%S^ne-TNxP7r zrnrGZ1{pN)l<;1*!yd1pqqR92iCEX+ga$F$#M*{i_PxO4SUT>1{JX&8csk*K{94WA z3F9cjDweMyqRQ5?HlnWCjC3+$2{CdB+WSbWGO8Z)mD2X@7cR-?D%cbslua|_4D@hp zi1%izw(89c3gF0@Ic%Ec_~8D0>0rTZ;X&Ckb{9CoPENAB**)6lv(xizE*)j}vim$w z!|re6WL5)5h-m7aOoK(no0_?pqf6~#B9Tki{^kBR;c{jlJ^5Rr&t5xt^;@8;zVrAo zqEmO=`2Dv$S7bl~8VqW$ROwqhb*8Ium6q+-jY&G9=v6_Bw7SvSYtuvp;( zE#!U80$9XN2dq74h)W6Oh8)I zJRnoLlJ6jT0psO?cZq}KO$RwwI)w8AO^Q3|mg$nPL5J|wpuOTQ5+P4UN53QjWpp{9 z?pG;F$P+=@f0t&YtjEB~wxY3~KnnqR$%_9R=evXa*D+BI`A6xRAa6Z_D>sa-WotZ% zTnu#ruz;s#;U>yCfR_&kk{0!Z_wN%;{FuI*NB24{kd0VQv2R~OAvLLJO3@XH_9)t` zXrH3}imp_2Kv5TvqTFEC8@W?FUY}IKfgv$fSrnXD)J>NLGd0^pMWSF-^LMIzN;Mo-9Cx(9b!l1{CIFe+@F9g*-u_*OMu|OV*yNk9b?o@y*U28yv zshB@%<}$e&sseRIDwXf|B8#psG_Vbb} zQ%1N8l-X*qQWQ-m=S2SE4k!n16(ua3q5&H{zZj?0+=6q}_w9yy8xo$Mz^wju_RR-} z5b@lw`GMaPo&WY_qr;%Ty#47DMA!c*{rs?qgc@fR6RVSzums9zn^0~5LFtjmwUTU+ zyFDw2n25pM)}Ytdnk?P3z?UHL|XpER}mnG*a3dfi&x8 z2+G<;r`>eAg-&~LdeI`US**Xk20h-`BA`J}G`6VE|E;f53#CE#(Y4SZ z7F~qyc!mT1%%?Z3hoaQJN!Cv;ae%Ed*=oOY#82%uF4$GElOt>A;sp% zT7DUdz+R{7lC7}g`a;oG_u7c+=fQh2rlI0#&WUj0)alrBKH+@D6%Oy4HmVtH+)tMa zi%~T<4Ds)YgqT1qN3nB_=Avc~u|w~7@(Wd7BtF@|G|hwMN`BfXV0(DNcjwz4?#YNx zHOw4#hf|qpW15Nm{6s02FJiCOMk$7VFXyP>-}FlccOGbXmuzczVQ=){sE+|}$eZP3bF zIvT@z1?v^r7de;LaWikdfiry10*I8i^(HPGUqTiSvW~|hBG|I1INV;wo{dDfSSRH! 
zf}yTA0zI_~4Tgf*FIE%?hTOe9ncLn-EmSac30(`Xi{cdY&Mb&cgF(owzA7c3@#H?y zM(JM1*{^HK@Of0#ERH+YalF{o80%XQmi0B9TE@MUoVVEn4?5P!m_%VufO=7meBP$k zL~m?m{c`RjG;P&iV@wBg zI`Zx|vKH*Ho&rBzMG1Ejb-w=@5{!zkpKyg_9U+ExFDK6uE@OBd8uxV=RWt$zF}eoM z%guok6m(Fqdi#3uQM!EZ=$ND%72TxhW<{@6bc>?bDSEx4TNT}==!X@(LD3r(-LB|I zmNh$g)d#|#xge~$Uto|E-vxuo2mNuM+DhiB{S9AuV8=MHv+Uc?!aiP!ueZv(fyJ_RMmaZwxsb!`0 z>&Fi+N*V#xHtMI6o!rN0++J%cQtE(0O^4fGvS>#GEq)fEoJ#sRb(@bi%c-^d`_-A9v}7b_>jkA)WgFVTwmqhin6n+a(jzo zTU!*mb-oRgCl0Vl9BwM+cdu+)vr&AB9l{22z=(?j%Mgb<+EUcP@sWN^gvY*5YXX_# z;`OO+!{C37mE8gM86RLD@&L=@s4zI8p!KlX^b1)Zs1%0W%31?Np~y`#ew1{UA;s^+ zh;+4!REo!WXJDwBn1hm9Cf8HZurNeNRTJ6MQTqYi{?soULwmE1(rL5foG; zn=wWWF~-EiBqkb7jEP$`ZqdZVHAZ6+jZ2KixS&ZS|9@`X>Z+O^KzzRUhVSch@2PW7 z-Fwb|&fSW0D?K-nO4qb^wOY$vRH{egJra;sZl)M5ZOU@p{_%$0ZEbn0kWbFatw_4b1I}{? zc<#Vdx_?)<>dbYtue97jYt#L*TFf2X!k)4tN&d829fe%d-Jv1TP|;9v!6J7^dRVrY zx3bFW$Q9bp)B^6%wdv|J&T7rO!@P8&t07TrmJ#RN9b3AJojoh+Te_81-&yP^wC9vU zN6@~dyDMK%#g;L4u#G%*M>JHdPA8i2zMWDHiFJwOYG0zYu%gFGx;v*U=O2L{ zB_J^})li*IOl@pzJbp9&E+yT#V99f&Z@lp8!B2u7onLh}=_S>}AAE9H0fX$y zbz1JI#^Rbq?k?%7_O61vtFgZfIpkp9vAdOdKDVndX`=spchB7;W$$PYw@XIY(>H>R zakrask4`mI^&YoB(hB)YXi`z>(<+}<`?SWVwLb0V(>kB__i3To)??f&UE2fXcDK4? z>|h`U%=$@kfe84c+1qpXN!fj3a(;JL!JggKnsoOCF!%G^{Zsa!wf11U1h|wvbgdo0 zDrT4ijA2H@zt#RQYkXSk(|$g!^J#ydCVe`;DmM}I6N~9uokQHiJa1kqN;XNBNxFwa_Yt0ZBs!k$vx#Pd2r8E$gsmc67I#F+bOpTV|=Gdld8?oyxe=?A>hOb$T4H zGsG=5t8KU?OnGuKgso+f)%F;~np#blB4+{J;r(+tJtbY*9| zyTqnrPxRcSDSNU##hz-@F(<(>Czql#l!oRr|N4P>iszo1O4oMenyrq!o5{GRrR?Dm z2sG#NR`OWYz0&GxoHaw|<;+&+Ub7HBODG8Y-kx0OR!|jreW}_oV zfkZ{3ve0a$YnW5>ZcD~(O)aGhy1Ojbl2YNUZnw=sudOrL>tz|&PH6|X#~tj5?n;5+ zkT~?{NZVr4K0`O}h&S()=2g;sWE=-3ZHk&Us_C-Q{qxXp8!&yL&b9BI5&Q+wLfpNcnMCC#_l)aP2Zw(ofe!cW@>zR#rW zdG^UR!tH#|U7fNQ*rkYCV>j7KN0PnR_G71}t)n|va4(SXxzKahhQnta!sjC0)r;d@ zy~LPlyYkw40OHaJAT}6%`B0wJ-tEiVn^uXpKdXEAxp)shZ!~UK55EADevu}JPb~*1xIPV8^-Vf=#tLeO(Joo0P^L_;9-4J(PWg+k0 zBA&a|bAKH6+)v=S+q8Xu3i~$tM!Vf}f7Z)rKi`tk?vSA+SKBb!C$JFrje94IdsigF z?l$j*N%3C&7oQIL%I{lW7UO;QX!HCcWS)CH_r6}{`Q?_(bHC~4Cu*JtfRG1?kcWtn z+lY{dJ@;1zAxT+-JOYH=9YcscM$qx7=l(j3j^6+sk7+FbHjd@reJm_L4#PeXapaRm zALh;z%-s1@x%I|0tT!_5(bb9lo%T04?e((DLw{xXch7wzK8`od zIG%!O|B+hC2$~_kGv)Uf`JJWtFo_?JZ^4cKjJWY_qiEY_Jb!)j4or-$y&Lb^dqyL> zwrQKY_C9R;0d4y)+V*wY_TQfSpQvp=gl*rB+qR~&eYw@@Zl-k=cQu|$;5TGrRmplr zRe=aFRJ901Hub88JRWXr=RQf*LP0L0`tdEH(*ji&QVgT0{;bb2$*&<{%f@v5BM)D3 z{!ngART2WSzHaO8Rs&F(Gu@XKbgO}2>2VCAs_1;E29x)3#<2qwg$~)#c zV!inpH4Jr6?I_&P(oPMhe3Qe`#$8m|T2LeSm@e801yAja->{2DvYt_+D8)^z+C>DS zF4~p6k2AF0pa3q~oo@-dXpfL$Sw?K*(G-)7+LPL(&!ed(=8nBWs<$WWs4-9gckIo# zz#aR@JLWlJDY#=_)IGJIaKrA{pYlx(M;3QSRZFg*QhZK-9DstS4#aQR9|y6XQ3q2> z#&(DZMEy}u9{u6(77V7rmOry`Hwj1`iYKtxSP(N8$bJ|dvX*tLb{y&?j|r94I7p(8 z<3T(%0l(NIXIr{kt-P9uTEvNm@v+|%6?iyCbk1|cQ^3P4)IBv@xEc>tTt}#5Dc|JKk}*6Ku~+WN>LV&ON17ie z%`Y%M%J<-?&8j!2SW@%PveaCN!b8VPy9Tzy;dQ>)j8hUd4^_0)MCh)QyP^vgXzE8#*BNMuiGbm?J?g+3f89U zc!JF=P{7@RIvKn*YZwxzprl=XD%-?do+0o5lgm#-hY)gE_8K8~y7YOTBfv$-*ZJ~UVL#OMIODEgfJ36v*rl?xM4%}vsT1rzj`%!4ICC*2R&Gs#hA7O3a zd1@JcQ;r1IFzwS48yTLI;=vF}fS%-j?yCH!r=EJD~ny2k@E z-Wgg^vB!;>4Wykj-g=Jo_Q%#7@-FCr)?Lpb-L&)5%iaT>bA9J6q?b*utl0$m;zO4; zkxtAUbPefVLtnY%ebAn3yk|-8m@w?14?qXsd)*4s%#tbhe=sx%`#d@ei;+JK;Pad}p6$w*u-$8#N*D^Sv4S;;m67RLv|+2DBU9N`aI z>%;)yv1MNX_!Jvy0628|+VuauKUbk2i0O0LZ^ZO@?4mu-k#Ipwe;Rd9eMY$9m_DEK zO%7eIW7>(wF)!(w3wb}1S5sahxtOlQG~1?dny%J>>!}O)EI!GKKvGX3xLwX2%LYkx zA!HGaYo*~j{Q5~?pq~V^-ixTQubZRDosqg&E4+lNGHN}EUI|?)#pptB19@~{^=g|d zq0h3;6kl%FSy}>p4iXwppJ#&zoW1~#r@kos-f+5%E&GDgm)MBC!TD7mV`+R)m$9K zChMzuOED?PM13`$AtA)O|5elE6Yc&+^Ok5{ny*P6GNyh8=Cp?#yb6c?&8tn11T>zk%Ytqf}{ 
zwU++2KCCT+>RU$p7Q20YZz=6JED^q~yN%fR4yAOmxj~B2Wb@sU5=l1SLtW>ZTo7g7 z2TyCbv5bZv#5DZSXqZ{ja8sb+<}w<76w`2v(SY5lf10Ll4K)0?jE0}YG~8x1AgLP- zKLyY7)a@XggMP*)n1h&%aR8=%&bp`W5H=>EbrtIp_1D!NZh9-2&|&T%Wr#ex<5Zo(38zC!A}p$ zwpwjDTx4g@XzyxO_mfv67w+l-mUj%EhnBt%;z_XdEjH)vC7ybS($&gZHeNl9BAk1Q zr+$SJFHqGZQV89lsz=G;Bv7~7#%>>bl0>4ivZ@Nb42%6bE{4lg{=>~S?o-uo_;@KV zklWf7K7V=)^`(Stp5H^Ys^6l{+g9~E3TnhXF2yKjpCB(ZK|aa4GX=lSRQ?RWj~o`b z#pUWLRAiEL_&VHItEWM=@@G(r?}vU*9+hL0#@Y8-k>Ncsk>s$CRPqN<|1wrRhjKn$ zqaF1;DxUfyesK$j(~m4}@LRe&)C=I4t$-ZEh&Vn>3sxO}f&^9_e-;IQp#pnyM!krF zz1X4u?WvBxka&qtGwQD*Bu6SQOEGGvSIFZ~5_%}Fq8^X!*T91@a0d9dG8$et8vF}x zlj6XZzaY5&4i1vd8z9a&{9qh!f_Um5_)R$z@f(>o-y++$G5}SXK(=)zfw|T>oU$^e ze^My6U41*$knVYhb?tk)y|f&97d0MVv-bb+^fs>P5Mty*? ze&N4RiofvR@XN5cqQOSdww_7ufCj+H5YZJr&KPPm0j^bR`0dolIM^WJgU&!~7+5$R zXVcyg8*VhBZ@~#=FN}?#R19M~g<8@)JIf2A`K4r`k$CE8fQ@1?fELEN3tMVz?JC7+ zfbK?K2wS_OF4zi9oqK?%HSAeN!|0fXy^Mw#rHOA0c$oP11|d54VH2Q}7$G|MWgUmw z!X`XR^V#Zr%vz~HBA6Nv&;s;>p0sd=va5mXf#7=TAZgVHTo2~Mj5mE$+0Yi zeKIFdB0kyn{lTVH+7nw_XzLG#)Ny>V3}r6g81u>TK`-<)*}!`Fq0GZ0_+&l^ebPut z_=L8iPfh^!)B@o~e6o-NO%BH!_+*Lk$s%dhB(3`J$zndtsIXQ@63;ARz z>*WV>5+1=KSoNJOim>WCMGE2N+^OVHA1-N;LCbx{s4{Vx%8E*}-aC!YWZ5OlCRy5L zol%|AH!)|I z6r<+s4%BNK^q{T*QJNlH@U#_G84dZEhJw)$nxcx}VS+gmgi&!8n;R_l-CCmmh*b zdMOBTw1M)#5uHaIeHPSHpA)Xmu6l3%c|LD)XdIJV(*yBIrhKY`4>R-X3wUHe^F@?B zbs2v9Ttt6~4>Rg=k&)qjS&C86TtQwa+kS=h@bF4-xe|{wT&^mk=xU>Ad*Sj`C;(hG zif*}m`kEB$Y<%e181u?Cp|^$4bH6V9Z930Ql|J}1AAXjn!S?W4DA#Ax-|%~eZ&a&} znY~zDhx+^>gZXtTtKUSmutm$+r zoDVbV4v~?Vxl@W!Gu=fVJ?LL;+|7D8W~A(YP*;kddmyIc=NDy^-fNU@Z~WW`1#s~% zMRz#g-fuL2)Onuk&Pe8dAgZa{Jg+k=PS#Apr!h?rMm6aVs9xLn{)XYWMx8u<{qv-2 z?-^4+0`#fHx1K|~|4R)Ijo|facytd%HF1sfFj-4^rjM`T#;ISSYHH)vBca*^^=POz zQT;kpJ52osHFys;eGDZ(YKFp}ML`IUqKcyIT{@djF>73wuhxNXaf##P|!n+XG zDdD{`syBsHmy0z#mcQO>YW?x9dLI&i?+2ng95nwjnm=|*_;*xOxo-Ns+*(TbPgK*# zNC_VrO#lWKnDokKvK(dthY7%8+&c_+hfzMqbg?1fB$2rs9>RvdpoJ0nUkUvjkdR^& z29@N6azGX8@f^TARh0Y*b82Mi1}3@W2xu+iXe zTz6nyhM{M(871fr!6T%Cp&(2J!zhVVz-VDA*pYQl4HtGK6^vkuCWjD281EDalv2UY zT7uAm#K@9FkP1d=2|@-EyBLXv%7$88dEoeHS1mv!m`qSi*WkbITODdQJoD7<(&&}Q zWrjRYE}&FW?EwK?0PHE!qea@R8jPmBz%5;8jG;_M?Jc5`FZPjQ6e#@SNdI z6K|US@9p>W`|77_I@YJ-d^+Bz6MQ<+r-%7;l1~r!=@C9X(x;PsI>o0`eR`DMXO5+9 z1Q+9Il-Q1Dd|)WO_MX7{mT)l<0@!CB2EqWGM0o@t;l}_x92DOu3pc#aJd)3w9QwuV zGaZ>2eY{VWc2lIC!MhU$`cyv5sG~$i26VI(qd=cV9)a%XlIg6=xz*f))j31WK;7X_ zDW}Xt&7SFuL(~FG$FM;JOtZ+5BpU)#Z~$H*e+Ep3R`^aZw9c;^YQB^;hq1jDy|a70 z6q8)=QnSuA6kU(}8NQ)?J(qRnu`*(@=o~3VEqYwg6Ah8MsEY+lyR+lL(im$fqhMZ4 z!TjC|8o@$ZKLLcOTEG@S6@5)qEo9wOi-a9X>rIqsa_AzY)kr5-EEXAmiP9ZM@yHi;gD+Sn_sWM=eV0Rj7_IhVFd;{2J zSqrDI)2Tr;hETUL;^p8IpzTMU*7s4{8}ynZu`r5r_{^AN5mPmD5V4fg5)4e^sg-px tl@m3Pg$GDAZ6E~2GRgrI^aMd+gL&O(-DD|M zXQlZ(^WMDUyT9*!`-#}y##JGH9dMIQe57QInlAn9&X&`oxp6a_F7LNpXa1N`$QpLv zaIs|eiC#{}Mx$(8&A&L}e&m4rS^i;1dekf%Sv0vjTqmC9znn$>?{kH+St=NLjN82- zUmV`G)hHM{&5|pJN$$V`s%Fd;vgWw!bfydD-t@Kn;fQN3aN_DWEjd=8#Sc0$Ic1?T zI<`-&c(jr)=ZsRx*oP_8C3Cn~u*;>&a2ZoNao*8&+OR|3<%pXK-R2I2Gw!F{8{K3o zoQmA$b&F@q`^HTAFmnAR=$Q(OUQs7}t2;P2c-=`{{Fb_^>}DotGvW2F6W-vOc&;46 znARdJzVq5bc}>4>2iB57>;2YJYZ*;%ovkL1<00upay64(UNc#UCL2f!YKB5#`6D7> zRKl2qu3C#p3tTAaQ?|`g&b0d^!h1QwL1(j^8#UEZtxZl(?K= zo!Hnu0+I9>u$6Q=ZyH*@$*_s`?sdn$b@F5?>E~%$ z>64#HFF77v5Sj$)l@iIoLW|{U15g(R&zuj|vz8l*&MkhhLVsW$aGk`8zLoubE4gIH zjB(R;tzIXQA)&P)-$|^%X&>ELO5+X0xjj?KLDE1CCvN9+!z3moAwou+#2zDGp)rOl zwDcp^JtSMA(K>34SImly@6hPlg?8n)B~ADW?*=F4c5sop{kM z;faIdxl#K(-H*7(07o?Cn_wgv2VCn8!qQ#VL)PIP#0{KyAs_J`uNa(o*8SE4pjOYY zQBK2Vq~8UqML}s$?F@mLdR160M|Ju(oQi@>lMUc3_M0^9*4$Yv<%W&C>vRqHo_wY; zksdaNEwZn$RoUDqRF>_ur;DYmSu(Tkgwur%VjL2Zri7wBl7K@;WsH(bcc|LvNU`Wy z83z*6kiI 
z@i(nNw0404c~dG7$%7rjfyq|Jrmr=~_;Ngx)^NfDu;Uw{RcAqeB>Qo3p42-`LVg;~DfQtm^7htu!S#2d1cDl8(3Ie&1FJTTS{Pipfz`8Cmf3Suw=>)V>#unkz9w%ETigqV#=bkTl@`?R3a52lv+nwo*IO;m%`df2BTPjYo4FJ zv<9dX;G~IxC!9bI<|*dDviQXn9(n2@wsl;49B)H6{To`Fb)#dwTwye}04);(k#LAJ z?K>0J$yTY7z?Yocv~sCTZm{)B}4O(WE!IrKVNbn5Ws%>_3Mw(JvB+`^V?rKf| z(yu{^sFkp#o&ZsyAU{1Yt_UjNWZu!jb)w%H5`Iv^WfGn(;W-k1Si+=)=Sg_MI->fd zUREPi`6&1II?6@0RCNG3HIze0J$&5n~u13#_PgHyDb4&CHMu3kR z=U~`sInq%JTU$qsIQKiIPmh@2=ithkt9h*!PM<`*S9}hZ0lm2!G!q9xxKrGLU_}hb z#^|=2ZY8?exV@JRzOoIRg|w64v-UShw$?7T-`ejYz7Fuc*=~P>-`el>H$+mN!a6_= zRn|T-nw!3S1dA)A0y(vkezP#RalJ21Se2)f)<2GzGt7e~YbkpHo3Bgg z>5h?5gm56S49iBJJxjnMOgcB~HkJC{omPUG z{+wh_0}irhaEG9A2IqWgvggICK#^(Jo;BtPha-__l>65~;^prfMjjl0)YoLM$q6Qv zE^UB4>Ee#L6Wv`I_wp;+v2Q47o|wLlx!D`IX)7+o%XZ=-*A899&H=+%i}YcMO2#X$ z=n`Dv({7ZuOtJIunF>)@A0F6B+&nt7emqXF)%+z}!yhijo#+{)TsnnMzAc93(iN95 zAz_<@?GlndN>`_ZT@rRn*drlPf$J1@Cf+pnV$>SM$n`lqET3QMI0Gt|UkM4@Bm}$g zt3yJ)27XLry8fg;OJYA1`kQLt}_`(*HHLxA$@gB5<>zN zohcYvCmSSGxo!pL$@ePvt>9F@SL3MYPmLqeW|Ky7o^XE&e8V2XO$)a#cDNt!8^sP^ zMw^c06!j_@_GfZG??s~$>nV~+8~E9Y#hZ&V;c3zwOC7r)lvnKXx?~3lYRO#77K)rR z59Y;$eUjZ)NVrnMRT5q#VZVf{C0rxn#S&g3;iVF;mGCkNFPHF&QY-A1j&UK3AP4GpIwMsLzhw!TxS77{89v0aO(PjRAjwIg7e8|49YtI z5&Xh(t1k6k!A9*gW9x1zBTVTCP5TvrmPAK&Rp0oW)i7*gLuYjk8Uvlxh18&8G|3MY zL{DRBQ4+vC)(H8Xs#d@rgClG9$!4c}h(8Z;FV$y2owl@H$?yZ+Uhzqq+)*{GwZ!Sj zlx;Z*&U1U5-l1G!XWnd38d7huyEatpT*0*MYRl~;q$@1kAvnFl?QP%)1>5bpP+rqM z;6N7g$7eph{+~F=IP;MmhY60Zf8$kxkH2u}!hZpFWj}B$!4r#rzVkTX=7&cgC3w%; zNc;rgZ=Stph+zMY^B*L5*8D%*@h0Hdg9lzA_{2rM=f4Fw=lc)sBe>({%bt3RADD1Y zi;UP(XMqeMB*kPQOqU>7a4_3}eZx{;Q^owDS_zy?^(NPe@6RKZw)x-*@#MujVnHbE z;j~tcZE9}4tdeE{9di9u2bQ(9AP~BV0q*j3W2#`O$4JZuv!f$9i6p?(kpiMa_~Y4I zUnK~;27Q$oW?S$uLv~^}^U41UcmszS;#8k)!jh_el({%dWf4ytV{7q&M%a~jaC!uteInnM(E`slsQyVXYtz#j5=JO%piW%1N*GF%Ul@>I ztp)|1vVWI^#5{EM2oRbhA@PYPR%{saRA+Whr_``KO&2U0ilJ$pGvoaC2>w0R{Q|Jk zh;_e$HqE;1UOcg*xM@XH$yA(nSU6SoXdSCnm9o@vu27XIdaEmVWSqmhC5OxY3({G0 z#6T)C9V8`FC|c5)2y*0ujsi}brlJ6R+J+Af+^IEuTJnVN7$ACW!eCt@MOA{<4Hjn5 zd?7@wv)&59_mZL(BV9wQYTAhm40Y1R1<9>?tuZT0_nAGy4dqFKm+$LHdxX5Sum$24 zxC;#&cjd58<}TD9yH1rVSZsrB`AA3VesZcL#Fkdjklr;pL!Fi-D{^T@YCX1?NeO6+ zkqFh3Of`C`DqQhJFRy{&8b&WsSoFZ?Wr@GRfAqB6-{3#2+C&X1-mxBVdbG;kg@i2f zTD-^=8B&{$h`7%~$4QYa&Ww7cKh3M*l|z&ms*QZCPB=763`){=Pr=KXc1Z+x_5-TN zz0=ax5|1PsBKx%(TRzpLvA)#UeV{?L?T%}=^I@WoDk;xc3_Mss6K$FC%B73_W$Y*4$$Xuj2M-^QAS=A=^Jh6X zyyRtpXN7X#YeKnS1yBxQL2ER7tPvBFc}aSDn0__V#s2*5Y3AU6G{M7(06aKRl)0S} z{-qQzVxcn>N@`;v5@H>z=nA$5g=-j0o=B@|-^t3>bcVU+h)k#Ztam0<`C1biw7v_^ z<6i({Sgkd@tmcQ%R;xAmv097l*EIY&<%(HU7PE7qK?cR)Q|Ap8qlEvEsXo<}87sum zWwatW#!r6TRCm>39{U(JpXx@kEj3Pl0=oszH8 tSE@9>xks3_MspmT};tgYdYf8 zY2UyE>uem=1!_5RqJwhLD#uLjI9=qD(*^y)FQAiO&CWb&!v6)ld1_!!9fx&l=V2m< z!}y_EYaFI31OoYk)PVk%052x@mAnR0mp*IVhKYh1A*xOO&#Q&Cco=)Q4r887J{p8E zI;L+;Ci&q`!)=49wdq!NO7g7!KSlLCYtH9SnqcFnLD;yyWo$IhVxA^5vRPp;6=vhW za{WIp2ySX(jiY)hnq`!Pf zppenDLEbx^ReWZ#0V5)cqM{%uqFjO;3W{=yAS%cqf`|}M$@l-is_yCTolQdW>Gy4Zzj;-!UcIh* z@BdyM-8sL)bIX&d>h`WyYmJLaWinOOn$25oRZCB{t2LXe%jYcBZsqDkB$4T|&dJP& zK)$`xTEt!nx4N`wOsd9n2P9J!`E_n>W2&l6S*|-U-m$x_EobF(iTT;J2{&=r1@0iv z9h^)J?CMsX*^c&gmOG?3H6WwK+@UQTDF>3^Ppj3D&nDd68p<2W8ps|C7*CdYN-TlBdzYJhO!N*@@715k0c(Q*kA-&^J{yogu7>|V#zcNs2l>L zlMPj=@>z|IjVEr!pZqkGd9X@X4qfptzcJt3n_5~{cK^%I)I2_-tZdY}?i6Y3psMRg zZ@%ivw@54Mj+*lX=(0QSzMORP(v{DVzVX7VM?ML9TyFijq?cBWeCWv$dCajZ+iAID z8VehjyL+W7+q?4a-oE)|#v%V2zwT3_eNk6_>SWWtujlTUw0E~h+C>}euWevs+U@?d z$0i#p`%c?Tv`qfWH7Tnww9?QjL#qv~F?4{TwT2EfG~aCNDISoj=>cxLTitPXFb^Nf zx+Sqx1k7X(^xT7zcE5;R(%qG}7j(5I+=BtjLp=A;q&=k99%>iimb8cW+5xCyb~()0 zWh}f~WoB7zXpNx*46QYEprHvv2N^op&>@BflRmtV8jx$rc4Sq~J;HO3MC33qa`|j$ 
zk9(Bw&3vw-y*aVMZ%DXxizZB(GIjd&X_Lnn{}PRjKIKNvaVzq=jzR{*BCV1|| zKBkxyF~yQC1pnkhsz#?0cZ%muP1;fT>83=JB%6eL3{+3^-07G?rqjx25$RcXM%tZe zkIAO0Gg6)956$2=Z>g?qM=sIS-nF{Jn%mLca@LY;S9Z0f67I3kcU&^<&bIflf!q2x zZs*WH)f;U1Cuy)$wmnb=cAuL~yYnJpFn=5B;_Cf+z8Pv3=6KI9l~ao`8p=gWn1fAtp60oyCsQ>Y*=DOF=cd!{8A*F& z#Q)9NoRv6Ub+5C!8t2c^xj7T$<};1`n3pD)bexsEbY3hSGcO_Cl$V#oek1Loie{rC z%l7iJ@``-3m8xc1&ABaUw>7zn4(RT(TuVy%bGqF&3uCs<=9pKfT|240+aC9>L)8_6 zzF~3H(Tldlw0)ND-4XBI$=;RHdvqKFe&Q4@ZPe0br3U7@RlZxbTFUK~1)&GXb0Yv& zMSHdRc|6x|9uGoBWBJY52GjML)*3pjUND}=JmAkl(uN<;h95&lJ|}GC5>8kXZZ}S_ zVg0p9+p^m%8&+OttbA_VPUo@jcCGAbD}N;3dp&#au$9lZY;D>L5PKiB5py3)+KcT| zZA96}J$FOWUTPNuX`|g_D;+@g3fqKCbz4VwHt$|2QFD>!_J*V86Ns9N^++#?kMvSw zqn*lSmjMcwN1(9DsLO>?qxNfme%`WP{QOBhzE8!+_vu|3-)CUY&(fY((4H65o}csF zE2H-OJnXrtk3C!S8SFm4Aol#C=Ux@I=hd+1HF~^XijQ~mu8jA~u;;aQwTFfeDi<$V;a;C#?taZ4XmbzzbFI6`=8$#__M?IR4G9;P`Qv^@)fVpET;Qxj4yR zET1a1xtNX3McREjX^(I8_b&dK#6S8jD`XWCf*duTLG=IDbDyQRQbVwfSJ;THmZOOK zyN1CyY`D+aDCEpRwqXtS+0WbNpCw87_u2$6#7*#rT`|EQ;j2HV+!~Oz?y1R zojLM5SAOTo?|jXNK};aN1sDE3;=;F$o*iH0%x2~vurCJoPJCeRh6c7}hX?i^4EsI} z`%fD7bsF|xp8M~pVLyOj-;Nu$y0d+a)#`4gX%%ybr^@jgGO(&(J*_H11Qe=D1R@7` zRZSjp;ZB}~R1NxP(`o=861o^rwMD(~l^V$ELOQ~ zy9+n8rBfp*-{f%GaaUEe=G7=R(^Y$*;Hf?F8+O%b*3)VXrFdXfdx=2QReO`S3p3jX z{o$&8`H--y_ABa@9mEb^Mls!}{i#ewJeFEwzBr($d1p4KY8?8*7YFho@Wny$h{cYW z3BEWObx$24+^{bWrF@gadBuHE*^PSk-)Q%E?s59!w zgEPzt!ACZ@^5-w+ehN}YqXqUF58^Kca`eS`tkvDB9oIOCd7-kJ07(pSB8aCZ;TOl@ zOiOpGl~a>Zi+FGfoAqFDTCJ=o!&x>}n~@`*)vBhVxi>W^<39^3j-d)>GBu5TV}6@n z)W_hR!Ky@LGcG}AqGl@V)GSmS#%Pdi)v>4*h>hb=&<321Qv4b9VWEI?aZg6yL#c(*{RHWLq3@tv4{p}1 zI{&IlYUaX9ErKZAbE5QXU_V?`=L*fZ4N;3x#riGs{sWePr*A-1Bc;>oByt2_OUa12 zU>SKZ&u((gU(+AH`aAz`Z?l*%3I`cF*w7({4mC8u?{bU>K5gQlgNqnrAbm65B=glb zF=_?m{L2`XGIJTY!Nz)ff>CWqe}~Fp^;%%0p`#5=8hUtre&PBpTE=%+Mh_?@?u9Yi)hQD?x|+s>S)8gIHub| z`6h=B*U^?rw6%A1WaQdVwSpb^%pS9fmTEQuXt5{mLJH0Ht&RzwmzSZ<|yPtNmR6E0>+MJAp9&g^*Ih-l2Cwi1oncjaapq&y? 
zd$xIK#Rfmp0H3lFA$d+8!AMB1L9-6YwJ2$ztYa4eieo@^9K4*e>1LY-%3#nQxu5AjpP5f)U7gJ+@FNGqngVZHj;iXiSR+o|J-OuGxjBf2Vkq7@(ZLqof`6R2%?WInd z#cj{0AfdtZX?BQ!=`-MX>a)V{3#KdBvp+C>j-4-k} zEx%8_reh5q7utK3^2>GR;YGi&e{5b~?IXlZsm$s6(sTNn=}{wd8c&_YzwOUy+gSQ~ zpW#qJXcFW(AuNP77;sye%5@i6(CL45~2t>C5- z3cedv@I7C_+@gY;0|mE~Q1Jbzf?ItBIDh&hxGhldgAxjU7*%k)uK+3BSMVe7EKl75 z!WHMo?12@BsTa3g>L;vw>P}(nB?q6wz>IfMpvhtSD0u4bpdD}Wr2QV<&JacWy+M0L zgFf&5RJRlUXumILuWYE2{`c#4LaAW!4Ys`^HISbd=r!eM;HQRUTCKJ$-bQE6YVT@Q z50Fqlp(WVd7MgSRN>4pZ=_+Ndo~V9~B3yc!r+$GFUxlhiq!4-= zsvac=$-Z`jjdMJX9OdN|6_u43WLWB#F)6$j^ z8ZI?5cnROq-JxCp$KUA7<%)>o>#|^D@drp?WAR5(@Fyy;XQb7ODA+3;`re-9n7zVF zY)z{_i;&!uye!42m0lqaGbz_Ic@@=oFuw*KOo2Q#0rA4_x^R;_0v8 zAh)~$;!MB~X7MJ7r~ZcDq%#@6k)`o1vbBi;rHXQ7GG{7yz0NTdl?nZwGO>f`+r@r# z%|BSxZl}kK+nsk%42#!8I5&39Y0zulOuw2+@M?95eE+ zUGd8-$6$hAXL*X>W&smUNl5U6EkJ_bYni~|CtL9u@$;<^xcL=VWcygOAwTTW$l#}4 zaXmwgo*lhm+zs+T$S_eAzM>dz6ydURBR_Z=ZLk%94DAMNKD0)FgX)+h?au(e0ABeAuosE?+UqH#u})zQEj!(sp_%yBRF)R@{^iqW9lhrA-D_C-}N z6PVW$mMbi)KR*>hp@i7bR+HObe}P%W?u_LvB`$X|bT^p`)sJzkd=43qg-na319 zFNUb8en5Q77=RGLJ=kUd$aQ`eV?rYGAeWJQkw~?pOjscQjHG?x3CMj*~z=wN$ua zcPwLjlf$_N?pW!&W4Ux{l1}}&V+9-2DkU;9jg?Z2y5nT>itbp&YUyd5f+m;)ySh_F z4|a8@Ng=#nJDnV;!z)(gOYF@0d~2xEF_DV03V%O$2AgDGB^x8zx@F`UGX68A7#)As zXfx5>EEJTvy3ZQ z=@^FVS>(VdFpI$TLwr&gfqh3<4;HsTPdux{-;`jxoK~GOE-_@66r+ah_Lb}D_n@kA zP+SLG@U;C@2?e>Rg1oOFv=SA-!$NX42qWPf_CO>MqKt$!tb1y$u*2J%brfiFXh$q0 z=LYTmLUW#Or+tEj&oxHR&LZVY=ntf97S-~u=*v>9wecBd zW6UMj79SSAgZ+xIcjyi_S^ON+|17gK4bFSlLAkz8{;C-mJ|L}}w_t_39`z-|hVs)< zR=q11W{cu1|^2VrR3${q*}LX@F#8|$9> zfw04=^@kK_a_9o2*4u-2Kehfyw{xOFYTfFG${pb1zDqjwlUjem# z4G)jv>(B7K9uDi_7U<_>tm5rEzD}E&w67o(C(i^5ep^Dpvrz@V^A&_H zd!GvwJYPb=@1qJ{@D+rvoBseFBK40TjMP7|2O^ab!bp9Qbx*w{>~N(1nF37?(T7NV zIcWDI^%dPtRH6OVpxwU>eoeO%EolEs(C*(RzpmSf0zcAI1N9P#4_nn=!T00-4U|3g zCVu-}BLBw5w0cWqB<}w%#i+;MCQs*Vxq)M~zqr@@V)VZQVIBSNme9PVsJT>Vp;i9+ z?v%}IiT5A@#J(@8!%^~2qj*=N|6gG}rG{um7~EX>^;p#YJFI6HqW=S;a(dk7+69&ai??-(hRgM22Z*{1B(B`RqrOzvomkrXMSW2mc+7AM_qTgSn z$BMMS8Sph70B-Sh!#K*M)qx@^x#1uwM&WTVd4PvGZXLpEpWG0#@SzaU`5;+B+hIoA z&h7>d540aqLi>?MJ8*zd9_Gh}e=bMJ>X)5IK@NdYM`hupaLmRxpE3G$u9h(04 zj&1sRb@iH#H*|ub6AhhY=ww5u7&_I^V+@^U=yXG87&_C?S%w~~kBj4J7(v7YS|oNh zndnQ$QGVQz?&_BkmY;$AIFSVBv<3i__WOVKK)U!C^Z#4d^#!6z7N~qN34I56-I6rU-NPzVy0%!AeLcT{8?!jwX!N^a-znu(16U+ l20}2brX0XPClCxasHfV6tFfTpusCdQa%ek0vlI$-{{!{^`Skz* diff --git a/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.4.4.pickle deleted file mode 100644 index b4cce985f2f0b18b50bc37c7c0ea347b01352b17..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16026 zcmds83yfS<8J_39x9=BVu@It*z}l7;pyL;w#9@}})2dyPW zDys*m_5waIDTVk-BqX9HhKL3=78D=R@(3bEB?uVpzB744p1vG1*#h2iL%w{)D(eTDdrUnvU&)Q{ zmUGWlvgM3cENZ(=-jyusBZYjaT&#?gF$Rx|jwWx#T1A&bW+HT_*&j}spEI|ZJ&AB4 zey7tdp2i||x-pWu5GEmDnTscpT5(b1A#2ERHsSF|D&XJbAMrddnp z-K8>qh87JL@?~Zwx_CHmCc-?neP^09ne9q+m3Vu;P|RuB%pTpGsH76R2v6+O@=u>> zjIa%az3bSg*nj{8yO9k#LT8)Un36Wz!nQ)d(R5|Jp>!AUxJ%`OkiA_nJiN~2;q~~} zkAE9Xc9@pjP|qOJQIAO3rFE5C>fG^bmG1RP+NS87K1Ub7Qb!lXjM-uK5Ie#iu9_*N zsaW|y5G%jqWhG2ivb?7uGiSs&zx$dQ>h9{$nElZgw-XoAdKQin)-Vj!2nqe%od~l%kon#TaI8|&;kL3` zIdhxp3q`ga#f;f^3vfq+0C%zqaHH9RR{mcB&fmQ~>T;M;DT9=Ba=)XbGoll=1{;eJ z3k2}gxCm`twnozLGoHR8c>0UiUTlHhgcj~8+(IeF!U$pg2P8;C2rPM*Z9%h{6imf7m}9W4~h0p)N)XUeY|V|CbGu3>uw!A$+o?HGGL!0vY-76$S*g+k3CFBIDR zDg;T>^M7N#a8TX_O7ovX8(5E z-kKntW{Dsjvy>NDAg-X0Id1DL`S9(fLW+kCdV!)Shh=|+OAOW#cqV+!TMOCm>JQsChzQb zeRGAtZAqa4@VpbeB{@<^Q@&pVC(J{kFB%C6kn+x?H9p<3hy`U)v;>*=MN?W4UNNQZ 
zF^#?8p*!uXG^mSAi4l&dt^lLFqKev9Zez8mD!i>Uo*XSw?QEZE%*7P>e8texxK=Fb zy|R(d-YUK#J<_fd-CYdrPNwxy4Tdne9r+asE28*f^7;1fqnT_rDM~Bm!2U@oTHcy0 zRZ^l_*Cn5&CW0!;EZ!ARkt3rT#P^uRcPo2!vzyg-TBn@M!O8yS{G;pMni~q8|Jd+B zqPcY^UMBj)?+;w|HfU%1yuCyx7r!v}4(PzQb3Y(@|C&gA67=uS+`pCRs^LqHOwLsR zm!UwBcdoCLGI_mJ@+NZ3m7BGwln%iHOsLG=RBEP^JzH8WQ&ooSlrr_VsY;Z(v7o4P^QA-lywfttHqgMaIcs}`&RUSX$TL6iX2NZ0p(PE5Z zJPy=P^v5Wup-vR29>8CUEj1oNlh^5-&Se|lrQveyZoo`>PTy(DsAH> zQ1k~!Xh(mz30L!yll?!?C+RXpFYLpUZHWdh&BbuK-+x)IP93UKeW|p_B`v2RovZPZ zW@O5ixa=tlB?~r7gk*9D>@eJ-UG=MD$Lt#(n(7Tlrh3D}?i+N+i|hk($&2(LE>hJ} zRIq5p2$sKz#){PtPDJd6pSc?%3CHog=x&H6V)h-!sX-T&cGk(;QIgW6!>OcDGlp++ zHiV>ivI=_Akk~syUDA}*?hAGJxgOTRTT^OeV<)D7BDQdE*5jHiln!mp#aCId6OpK$BwrbU{+Pts$?A&`)oy6AE20_aAq^i`ZxU z$iA0|K6&}POWp&W_49A+COUlU)xUht6B8K^03{Z$Tw##$Ab$4=fe1o=p8o7tjJi@b z?gEO0Z8{4@=IGKLSVfR7eGP3MU8?Z7mCN+7sCV;TISyR1daBN` zHdL15T+}D&hZJ?3sQmp>a;6Q0SBF(D@lDo<&xyTeA#1R-WQ%cvgqh+UsAMEjWE?4E z&1nkLj#jI~+aNg-9uYOokxX!qEg}RJ%|Kp8zom-^HH`s$nJ})$r;l>krJ6`sgb&8e z)UOHcHEu=6n#v4Sg|rbyB@rIWr!zV1iK0|dU(5X2;F8hBkb~EfIT7_*9~p^wcbc*Z zRDEL^?1`^ZM5IzQoj*?QmvU^RH13_v=~yaO!$!waReadUyKn+QY_#DVLTS2S4`k@f z*{z;Bl`E=wpK>&ZV+hvqgZfk{CVP#&$&Sm)>&(sRCtpaO^GU@+MR3dCyX+xvb zV0x~$eob`7oGpru)h7pWgb^W7^E))sU<8db)ia1e<6twlClq?)@Vf^WAlq28@}55v zop|cP-i4sgZ~Nh1q91-W@ytS*g#@fi*85}F|6W0-ZwiH#l3XsyDoIwG6f2$?NH&9E zPx+^r8$1Oj7wG?45A+di^!b9m&E4QG3(ayjxC==OsG&+tc)K+3bZBpy+CuhJNYdIt z@ScC0Nh2SCtN%ZJkpYI_vauNAG?viEIrMQZeVm66*8vpcb~1vrc{ZTJXt9fS_UVul zIvnw$%B4o1#M%SPu6Z>2s!`e9;8vqO?uJOBB@s~-+&ieDN{x)CrtfW>HCCs^i+pz3 z8-{^0JpTk@beUo!z0VYJD%;v)4;sKun#687kC$0d*G+lA0rnRqHN0qm1NdomE~09S z{deGw@e)4t)4#da`C3-mt* zrvpKMf{^MLFilcJ6@+;=rL}lrBq(K8Rfu4{Y2! zQLE8t!v+(y>+fr*p$axA8{*7^o~3t1j4i;Gw{j@kczl!S-Re9%)F)xoRzMkGj$U039Ev|y-4jTRx%E@BN z5*q78u3)_+A__Lf4rm*IQETcER9Uz|O<+wjws>NF^PHSWM)N9}JxxIaoE?0DgqAL}>T+vmEu2%F4ML(wKm5N@a=o&?@ zR`eQ0KYrS3?#F7N=hWQa@_`<_3{J=4ol%NrqIIM?lc%>Wb0XeQ)y+C(JemcHBe41J zlauaj8(U<*#%sNtXuZJLDPLf;4wMm#eO@)SI2((E#1&VxMbTD8+Z1hAv_sKOMY|O3 zR+Ri0y#jh1FI=^W*C4&`NH%efzar8v+Hff9V=HmGbPD;BUE_1|B@y27=#I)RUGa+A4In;k&n#9KHViA1AzHujc|kBNRfDzZ=?V;JBEzqI9p;n=eqAy3{HwePL)dUe*q~ZYwG|2 diff --git a/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle deleted file mode 100644 index 537864af7028b2c282150d54bf58aa3f3d553c01..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 129175 zcmdqq1(a0B-Y9y4ySux)ySuwP3=Hn>Zi5dbSOO%F5Wy1MlRzK@NN@-N5+H#DcjxuM zk8^U)ckXxZUGJ{<-g-5Q{`Kx%dv{gss`}S%hGulcc`>WM6D}-`2+bN2rC*13-P`r; z7aFz^xpz?CF1))m$>De)8aA-*MHa&v|wW;=cV{T}0`jF_g26XS= zDg4{rdME(4l{z`9k9Uy&&z|^$Rpbtjc|R4Grp9yJET5#{`-q z+=S2y@3apMt4P*B+eDoZ8YoHF8wBjwe@O43urr14k}r_6#_M*5a-g)yAmMbVETEAX> z`}gYGF{p2#UqtR6G*CT*GX|O|(%)MsVpt1>24@e67`V3WLWA@Eqoh#+J?(E5`sZ6l z47`7dknnYi{=N9YW&XABWx@&{*6tD7x9b}k_Bb@~*r`{qz+;~QgF}M?`v*)6G*F;O zwE~waxK416;JShOHTc*1g%7-0^EW5z)ico85&946_0NNYUZ0|0NYsA4+jQz1R+B)N ziPWikuXceeoxr8)*E=wdf^xsv9}=g*>&=RR+E#vj#6QMFtTr8kI<*U2{5IVJJuGmZ z_T77R=oYZe>!t}w^!H2OrDy-3zCGIox=%>V@?rUEwd>igbD+InpDt^l=KrWz?f+D- zutt1SuXoA?Mosy^p9+CL75`bckc8gv?RX9pDy-Ybf8Ff^<$paA-(Jh$86mO${k1@| zhD7YtH>gi&aFmc3uz4cz(?4`#STCJYp>DWvM^Yas z_aI)laH-d}niXb`a(B;%x&7$GTn_`r=$K-7n2(d3>HH{Ql~p}9hB?1@_(+cfUO70w zL6`+wp!y>=U6;`Q;a{V{O4(pzQVh6g}-_!s4b$WAD%Cc`v%k=+p{r+=x!mdzQ z+lC$a-?x5rIsTh!M1E6^ykRH%XZoL4U!Uf;Xaba)s zp94&sKpzS$HM;yS%*KP?4vj#oy-wj7Hd%zF*Ml)&q}N+-rkSv(Ve?;DdOi09jP`mf zT)4228J1$aAzVzxVr+(u@7F=6U7)Z3m#H(_o55REX8g|w@0*NC%f2xyv;X_t6X(Cq z>u)E?up`6D{GaEax4FVdK7hS5O@|6%K*~JgX@GuY5%sv 
zNB>VFrfZ|H!vZ%IVYghNuh&ccLc6vMy!hr$D|lGfwr@59Yr^1J!E?e|G&Fd=y@B;t z@WTJN@Csh?KU#Ge6#u&F`Y%hA!2A5CHCNzb{hwWPg$tK1-nHpZ0&B3|%SW6Gb60`* zIi3cLy?5o1Fk3Y)vH$5m*Jl6!8Ycgzu4vaafnM@@Ed9TIdH%;^X?viE|J_(x9@zi4 zu@v7}3j3tsZA1wSsA%o`_cL6&E)E>=kEQVIq11Ir*h$(C2#L@k5TgT~I3!ZwR;xhZ zBY-4-zx@jA4C>sgZbkdU;l0MOI>ER^i)t zU(I%P)?iK6Vr|yp+k2LJw(GM28?q4_vk9BB8Jn{OTe1~fvklv_9ow@5J2Hr!*qOoX z!mjMb?(D&y?8V;f!@lgt{v5!89K=AFd!2@GD2H)4M{p!ZaWuzpEXQ#?CvYMsaWbcH zDyK1o(>a4PIg7J7hjSUqd7RI8`5qT=As2Bmm+*alz@=Qq<@}HzaRpcMV}8O_{FI;Z zbAG|qT*I|o$MyV@UvUFBauYXm3%7C`w{r(~au;`V55ML&{FZyUkKb`W5Ab^)ftGvcP`4_MA25<5f zZ}SfC@*eN=0Uz=aAM**H@)@7=1z++N10SZpPVX>0BQPQ(F*2htDx)zvV=yLTF*f5c zF5@vi6EGnYF)@=cDU&fdQ!ph{F*VaLEz>bQGcY4FF*CC;E3+{>b1)}!F*oxtFY_@! z3$P#yu`rA9?Gmn-?cyxKk}SoryT-6ohGiM{F-}-2&kC%_u#a}aQe{?QRaT?F*$aFO z^d^5z)?#heVO@sZqlTsW47*7UOAQ%z_ZgNNvk9BB8Jn{OTe1~fvkk-US;JC0wr2;1 zeKru5g4l_j8O$#1%5Dt%4&!i+;7E?*XpZ4n zj^lVv;6zU1WKQ8!PGbnCa|UN}7H4w~=Q5P@IG^wGJucuvF5+S?;rsl6OSz28`5`~z z3a;eG{DiCcDL>=q{DP~whHJTw>-itLzw$R;;$>dp@BD*T zd5wSaFJ9*j-sCOb<{jSUJ>KU7KI9`l<`X{UGd|}FzT_*u{!BD5KfM$F^)wYOJR>k7 zBQY|gFe;-lI%6; zFe|e$J9986b1^sbFfa2lKMSxR3$ZYZuqcbMI7_f3OR+S|uq?~5JS(swE3q=Guqvyu zI%}{dYq2)#urBMdJ{zzh8?iB)uqm6dIa{zLTd_6Uur1rMJv*=?gV>3k8O$#1%5Ln= z9_-0p?9D#x%YN+70UXFd9Lymc%3&PN5gf@;9L+Ht%W)jf37p7DoXjbl%4rPYbk5*R z&f;v&;arAt9_RC2zQ+Yz$VFVtC48SBa4DB@IX~n_T)~z6n4fSJKjmlqoL_J?*KjS@ zaXr7}SKPpj+{De?!mZrK?cBkg+{NA8!>{=bzvW);<9FQ81N@!``2!E}Fpuy@{=}m^ z#^XG}lRU-KJj1j6ndkTm&+`H=@>l-GOT5f0{GEUBDzEWR{>AIO!JE9r+q}cOyvO@| zz=wRq$9%%4e8%T|!Iyl+a1q|b{dX9i5%~J|Y64jz*^bO8jLK+?&KQizSd7g$jLUe8 z&jd`!L`=*iOv+?T&J;|^R7}k@Ov`jk&kW4SOw7zI%*t%c&K%6iT+Gcp%*%Yt&jKvS zLM+T8EXram&JrxiQY_6fEX#5%&kC%_O03K(tjcPv&Kj)ATCB}Ftjl_=&jxJBMr_O` zY|3VA&K7LRR&32SY|D0R&kpR!Aa-JB2D1yhvKzaz2Ya#?d$SMwvLE|%00(jq2XhFA zau|nm1V?fdM{^9vavaBV0w;13Cvys?avDQ8oijL-vpAb`IG3TE$N7Ah?{NVaauFAE z3E$@jT*_r!&JXz!S8ydi<|kamPx%=?=NDYfHC)SeT+c806*q7rH*qt!a4WZQJ9ls= zcX2oO@N0g#%p?4fKk+D!@ibQ zGcY4FF*CC;E3+{>b1)}!F*oxtFY_@!3$P#yu`r9UD2uT;ORywMu{6uDEX%Px-~38% zxNsG1S7K#WVO3URb=F`_)?#heVO`c^eKuf2HezEoVN*6^bGBehwqk3xVOzFidv;() z2C)-6Gnif2mEG8#J=l}I*qeRWm;KnE12~X_IG95?l*2fjBRG72otoW8e#^bw$M3kG2lzb? z@&_K`VIJX+{E0_-jK_I`CwYped4^~CGtcoCp63N#bQGcY4FF*CC;E3+{>b1)}!F*oxt zFY_@!3$P#yu`r9UD2uT;ORywMu{6uDEX%PxE3hIfu`;W$Dyy+NYp^D3u{P_lF6*&A z8?Yf8u`!#lDVwo5Td*Ztu{GPUE!(j@JFp{z*omDP%r5N8ZtTt;?8#p2%|7hQe(cWy z9LPZ&%pn}gVI0m89LZ4}%`qIyaU9PHoXAO>%qg78X$;|X&frYW;%v_0T!wNU=kr~@ z#|2!-MO@4!e4ih1DVK3MKjcST!Ik`&pKujF$3qHvJo4z37fJRo3jO5vK3pi4coFE z+p_~ZGKihnnZfMBuI$F{?7^Pw#op}0zU;^T9KeAb#K9cGp&Z8H9Kn$s#nBwYu^h+o zoWO~k#L1k(shq|TPUj5H6w8UnTeU1g;|-6 z*_nemnTxrZhk2Qg`B{JkS%`&Mghg45#aV(SS&F4uhGkifZs!i}!9)8Vl_$~KxAHU;%9^m&p$RBu!hk1lQ@+ThUF&^g$ zp5!T><{6&l&pgLpc%Bz{k-zdcUgBk5;qUx|S9y(p@-JTJ4c_D}-sT{)#nep0v`okJ%)pGy#LUdXtjxyj%)y+@#oWxpyv)b^EWm;+#KJ7XqAbSZ zEWwg2#nLRpvMk5)eDe)ufxrK^U5S-hg;iON)meizS&OwZs!i}! 
z9)8Vl_$~KxAHU;%9^m&p$RBu!hk1lQ@+ThUF&^g$p5!T><{6&l&pgLpc%Bz{k-zdc zUgBk5;qUx|S9y(p@-JTJ4c_D}-sT{)#nep0v`okJ%)pGy z#LUdXtjxyj%)y+@#oWxpyv)b^EWm;+#KJ7XqAbSZEWwg2#nLRpvMk5)tiXz_#LBF~ zs;tK9tihVB#oDaHx~#|gY`}(W#KvsGrfkOMY{8an#nx=Ywrt1t?7)r;VkdTHFuSlT zyRkcauqS)5H~X+J`>{Uap9v5&S7jZF{@O^&3rCi44{E#1U1y}N8e!^A!l%Mf)e!V$^ zHe++PU`w`QYqnuqwqtvCU`GbA6FW1QUD%b~*quGtlfBrReb|@%*q;M9kb^jwLpYSf zIGiImlA}19V>p)MIGz(Yk&`%?Q#h5=7{ckC!I_-J*_^|<4COq|=evB53%HPrxR^`$ zK0n}6F5_~3$d9;!EBP@$;VOR0&-gjN;A*bnTCU@Ie#x)6fg8Dro4JKsxsBVogFCs4 zySay7^BaE4z1+v|xSt33JrD8+9^zph;g9@@M|q6Ld4eZ-il=#oXZbVF@fV)w1zzN@ z{Ee4*nOFEb|KL?###2Cu|6BHAsewVo3JUHu{m3?C0nsI+psO$u{}GmBZJt9of*t7 z?89LixF&Ji5RQ5?-N9LsSW&k3B!Nu10noXTko z;dIX6OwQtL&f#2!avtaNUB1T!T*yUS%q4uEA8;v`aXCNaM_j>`{Ft9`6+h)?{G4BK zHP>)0*Ks|+%ko{F&$Y3(xZcFY;IZ#!I}+EBu{*@G7tIPyWU0yuq8i#oN5Y zyS&Hye87i%#K(NXr+miee8HD|#kc?4%R4c}e@0+LMq*?}VN^zAbjDyz#$s&7VO+*z zd?sK*CSqbHVNxbza;9KPrebQQVOpkRdS+loW@2V$VOC~icIIGC=3;K72otoW8e#^bw$M3kG2lzb?@&_K`VIJX+{E0_-jK_I`CwYped4^~CGtcoCp63N# zbQ zGcY4FF*CC;E3+{>b1)}!F*oxtFY_@!3$P#yu`r9UD2uT;ORywMu{6uDEX%PxE3hIf zu`;W$Dyy+NYp^D3u{P_lF6*&A8?Yf8u`!#lDVwo5Td*Ztu{GPUE!(j@JFp{z*omDP z%r5N8ZtTt;?8#p2%|7hQe(cWy9LPZ&%pn}gVI0m89LZ4}%`qIyaU9PHoXAO>%qg78 zX$;|X&frYW;%v_0T!wNU=kr~@#|2!-MO@4!e4ih1DVK3MKjcST!Ik`&pKujFJXAb6MF6L$)=4C$SX8{&u zAr@v47G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio)@41`X9G55BQ|Ce zHf1w5XA8DuE4F4Ewq-lEX9sp<5IeCmgV}{$*^S-VgFV@cz1fF-*^m7>fCD**gE@pl zIgGtBYiI;hWzw-}XhnIm{0hW&-k1#_>!*}E{^!m z@QlESjKs){!l;bK=#0UbjK$cD!?=vc_)NfrOvJ=Y!lX>ba4+1Y{k}W!?tY4_Uyop3}PpCW-z<3 zE4#5fd$1>au{Zm$FZ;1S2XG(a4PIg7J7hjSUqd7RI8`5qT=As2Bmm+*alz@=Qq<@}HzaRpcMV}8O_{FI;ZbAG|q zT*I|o$MyV@UvUFBauYXm3%7C`w{r(~au;`V55ML&{FZyUkKb`W5Ab^)ftGvcP`4_MA25<5fZ}SfC z@*eN=0Uz=aAM**H@)@7=1z++N!^IW<8J-ask&zggQ5coc7@aW~ld%|^aTu5J7@rB4 zkcpU>Ntl$$n4Bq?lBt-QX_%Jjn4TG!k(rp8S(ugCn4LM8lew6id6<{^n4bk$kcC*7 zMOc)@SezwTlBHOhWmuNwSe_MFk(F4PRalkPSe-RkleJizby%16Sf35pkd4@wP1uyp z*qklclC9X9ZP=FW*q$BOkwNUl&J1Q3c4aqqXAkydFZO01_GLfz=Kv1mAP(jb4&^Wo z=LnAED30bBj^#Lx=LAmVBu?fOPUSR)a5`sjCTDRr=Ws4VIgj)CF5lw$sj@@+)rOMsDI}ZsAsL<96=gPVVAv z?%~(`hTn28_whUK=K+4tgZzPqc$i1{BY)yi9^-MI;7Ok1X`bO({>*dyh39#J7x^oH z<0W3^75>gYc$L@qC;#Gg-r!B%;%(mHUEbq;KHx(>;$uGHQ$FK!zTiu~Vz_wXKf^Nu zBQg>rGYX?J8ly7?V=@+FGY;c29^*3s6EYDKGYOM28Iv;wQ!*7(GY!)+9n&)dGcpr1 zGYhja8?!S9b21lmGY|7JAM>*S3$hRkvj~f_7>lz6OR^M8vkc3!9Luu;E3y(RvkI%S z8mqGgYqAz=vkvRB9_zCK8?q4_vk9BB8Jn{OTe1~fvklv_9ow@5J2Hr!*qOoX!mjMb z?(D&y?8V;f!@lgt{v5!89K^vK!l4|-;T*w{9L3Qb!?7I4@tnYkoW#kT!l|6b5KiX| z&g3l4<{ZvtDCcoL-{pH;z=d4I#azPo`2m-58JF`ze#8}A$&dL7SMgJR#?Sc$S91;5 zavj(6OMb--+{jJb%q`r?ZQRZs+{siR}2?l{AYMZU_?e@WJY0BMq_lwU`)nhY{p?+#$$XYU_vHh zVkTiyCS!7@U`nQ9YNlaYrek_$U`A$QW@celW@C2dU{2;@ZsuWL=3{;qU_lmQVHROg z7GrUiU`du@X_jGGmScHVU`1A9WmaKTR%3P6U`^IyZPsC3)?V$^He++P zU`w`QYqnuqwqtvCU`GbA6FW1QUD%b~*quGtlfBrReb|@%*q;M9kb^jwLpYSfIGiIm zlA}19V>p)MIGz(Yk&`%?Q#h5=7{ckC!I_-J*_^|<4COq|=evB53%HPrxR^`$K0n}6 zF5_~3$d9;!EBP@$;VOR0&-gjN;A*bnTCU@Ie#x)6fg8Dro4JKsxsBVogFCs4ySay7 z^BaE4z1+v|xSt33JrD8+9^zph;g9@@M|q6Ld4eZ-il=#oXZbVF@fV)w1zzN@{Ee4* znOFEb|KL?#vnSjLkTV%Xo~>1Wd?8Ow1%q%4AH=6imrfOwBY*%XCc749v((%*-sz z%52Qe9L&jF%*{N^%Y4kw0xZZvEX*P-%3>_e5-iD5EX^`3%W^Ew3arRVtjsE`%4)36 z8m!4$tj#*C%X+NO25iViY|JKX%4TfN7Hr8@Y|S=o%XVzf4(!Mvc4B7+vkSYj8@say zd$JdMvk&{SANz9v2XYVxa|nlW7>9ENM{*QLa}39F9LIA4Cvp-ea|)+&8bdgpGdPp8 zIGb}gm!X`;`Fxk}aRC={5f^g_-{%Kh%4J;65BU*Sa3w$HCtSr(`58aw7hKIXT+4M_ z&oB8EH*h02aWl7YE4OhwcW@_naX0tyYktFTxtII+9ryD9zvn^zz(YLDBm9v+@hFe+ zI8X2-Pw_O*@GO7kIsU@)yuge6mA~;4FY^k2=O4VvYy6Xc@j7qtCU5aJ@9-|~@jf5$ zAs_KEpYSQ4@i|}cC0{XILh+yB8G#WQiIEwFQ5lWV8G|tyi?JDpaT$;CnScqIh>4km 
zNtukvnSv>qim91~X_=1cnSmLZiJ6&&S(%O5nS(i*i@BMHd6|#-S%3vuh=o~%MOlo+ zS%M{5ilteGWm%5pS%DQh8VP1%gi*@7+E zimlm(ZP||P*?}Dy#7^wYV0K|wc4K$;U{Cg9Z}wqd_G5nz;6M)IU=HC>4&!i+;7E?* zXpZ4nj^lVv;6zU1WKQ8!PGbnCa|UN}7H4w~=Q5P@IG^wGJucuvF5+S?;rsl6OSz28 z`5`~z3a;eG{DiCcDL>=q{DP~whHJTw>-itLzw$R;;$>dp z@BD*Td5wSaFJ9*j-sCOb<{jSUJ>KU7KI9`l<`X{UGd|}FzT_*0OCk7BQY|g zFe;-lI%6;Fe|e$ zJ9986b1^sbFfa2lKMSxR3$ZYZuqcbMI7_f3OR+S|uq?~5JS(swE3q=GuqvyuI%}{d zYq2)#urBMdJ{zzh8?iB)uqm6dIa{zLTd_6Uur1rMJv*=?gV>3k8O$#1%5Ln=9_-0p z?9D#x%YN+70UXFd9Lymc%3&PN5gf@;9L+Ht%W)jf37p7DoXjbl%4rPYbk5*R&f;v& z;arAt9_RC2zQ+Yz$VFVtC48SBa4DB@IX~n_T)~z6n4fSJKjmlqoL_J?*KjS@aXr7} zSKPpj+{De?!mZrK?cBkg+{NA8!>{=bzvW);<9FQ81N@!``2!E}Fpuy@{=}m^#^XG} zlRU-KJj1j6ndkTm&+`H=@>l-GOT5f0{GEUBDzEWR{>AIO!JE9r+q}cOyvO@|z=wRq z$9%%4e8%T|!Iyl+aEZl#hGzsuWF$sr6h>t?WG&Wa9oA(%)@K7YWFt0a6Eh=VzVLphAYIf5fOilaG( zV>yoFIe`;7iIX{nQ#p+xoX#1X$yuDuIh@N-&f|Q(%lEi|3%Q7kxrFcY11{wHKl2=a;dx%*MgGd)c!`&Jg}?I; zUgb6Z$-j7=H+Yk`c$;^4m-l#|5BQLe_?S=ll+XB_FZhzL7%qwU&+v@Eh>XO@jKZjl z#^{W}n2g2PjKjE$$M{UZgiOT5Ov0p0#^g-FluX6cOvAKH$Mnp=jLgK$%)+e9#_Y_& zoXo}C%)`9Q$NVh7f-JNjtpWac4jcUuq(TCi2XQcma43gyI7e_KM{zXAa4g4hJST7>Cvh^Ta4M%Ugwr{LGdYX1Ifru@ z%6Xj6cljO{a3L3QF_-Xte!!(%#^wBwA8`d&@?(C&Rs58n@pFE`)m+21T*vkNl3#HH zH*ym^95h>6~iSJ{~4YU7?F_}nNb*((HNaE7?ZIWn{gPI@fe>8n2?E>m`RwF$(Woe zn3AcOnrWDp>6o4wn30*7nOT^X*_fR?yQ zj^_kU84j-r{ZE;a%S2eLmnrKH_6O;Zr{2bH3n9zGAp!;y=SP0wXdKBQpx4G8&^Z z24gZ7V>1rpG9KeI0TVJ26Eg{uG8vOI1yeE=Q!@?IG9A-112ZxcGcyabG8?lq2XitP zb2AU~G9UA^01L7Z3$qA|vKWiA1WU3MOS25ivK-5^0xPl-E3*o#vKp(i25YhwYqJjP zvL5TR0UNRr8?y#_sIFp6tcm?8Cn7 z$Nn6^fgHra9KxX-#^D^nksQU*9K*33$MKxNiJZjAoWiM`#t=^D49?^%&gLA>Whm!y zKHuegT)>4~#Kl~~_xS;rav7KNLw>{+T*;6530Lt`e#X!F1y^$o*K!@#^Gklk4cy30 z+{`W9%5B`v9o)%X+|51wn&0qS?&Usy$NfCO?|G0v@DLC42!G^HJj!D{&J#SzQ#{Qx zJj9yv`fE$y>b5JG{$#yw3-G$VYt4Cw$6h ze9jkq$yW@QT>NKvMqornVq`{PR7PWT#$ZgwVr<4?T*hO3CSXD)Vqzv?QYK?^reI2@ zVrr&gTBc)qW?)8UVrFJxR%T;%=3q|dVs7SPUgl$d7GOaZVqq3xQ5IuymS9PiVriCP zS(amYR$xU|Vr5ogRaRql)?iK6Vr|x8UDjiLHef?GVq-R8Q#NCBwqQ%PVr#ZxTef3+ zc3?*au@gHpm|fVF-PoNy*pt23n|;`q{n(!aIFN%lm_s;}!#JEHIFh3{nqxSY<2arZ zIFXY$nNv8G(-^|(oWYr##o3(0xeVnz&gZ*)j|;evi@2Ce_&z`2QZD0ie#nowf-Ctk zKjA8V%Fp;Yzu;=F;aaZadVa~TxPcqFiJQ5FTe*$fxq~~oi@UjpU-KJ&%e~yk@3@}_ z_&pEu2Oi>K9^sGtiAQ;i$9aM$d5WibhG+RR&+!+Y=LKHmul$Xdc$ruDJOAKSUgMwq zi`RLBH+hS#%p?4fKk+D!@i6w8UnTeU1g;|-6*_nemnTxrZhk2Qg`B{JkS%`&Mghg45#aV(SS&F4uhGkif zZs!i}!9)8Vl_$~KxAHU;%9^m&p z$RBu!hk1lQ@+ThUF&^g$p5!T><{6&l&pgLpc%Bz{k-zdcUgBk5;qUx|S9y(p@-JTJ z4c_D}-sTGdv?OA|o*}qcAF?F*;)~CSx%+ z<1jAcF+LM8Armn%lQ1chF*#E(B~vjq(=aX5F+DRdBQr5GvoI^OF*|cGCv!13^Dr;- zF+U5iAPccDi?Aq*u{cYxBulY0%djlVu{##2Cu|6BH zAsewVo3JUHu{m3?C0nsI+psO$u{}GmBZJt9of*t7?89LixF&Ji5RQ5?-N9LsSW&k3B!Nu10noXTko;dIX6OwQtL&f#2!avuLLmg+HB zcOFo~aG4@ao9f-RZQEGewr$%s*S2ljwr$(C&Uemv@}t+ZnaO0@Y3`|T8k`Piz?pCs zoDJu|xo{qw4;R3Na1mS#m%ycP8C(umz?E!P#a1-1Nx4^A%8{7_e zz@2ax+zt1@y>K7g4-deD@DMxo44IRj#3q9z=0ERFE zW5Ad&7K{z!z_>6Tj1LpQgfJ0I43og5Fd0k^Q^1rk6-*7&z_c(OOb;`_j4%_-470$j zFdNJcbHJQ17t9Uwz`QUY%nu8|g0K)Q42!^`uox^3OTdz_6f6y+U>R5zmV@PC1y~VQ zf|X$vSQS=-)nN@-6V`&YVI5c()`Rt71K1EYf{kGl*c3K{&0!1J61IY^VH?;Mwu9|q z2iOsIf}LR(*cEnz-C+;d6ZV3=VISBR_JjT505}j1f`j1@I1~dB60cXNla5kI+=fZh#K3o77!bNZ~TmqNEWpFuM0awCR za5Y>5*TQvhJ=_2{!cA~9+yb}4ZE!o>0e8Y(a5vlo_riT}KRf^r!b9*dJOYoxWAHdU z0Z+nH@H9LF&%$%?JiGue!b|WnyaKPnYw$X}0dK-v@HV^y@4|cVK70Tl!bk8id;*`s zXYe_E0bjyb@HKn`-@xi~(c9STHt>1LMMYFg{EG6T(C=F-!uJ!elTx zOaW8ER4_G61JlBEFg?rwGr~+TGt2_B!fY@*%mH)4TrfAx1M|XsFh48+3&KLMFf0O# z!eX#EECEZxQm{0Pf@NS?SPqtl6<|eJ308(xU{zQRR);lUO;`)ohIL?FSP#~R4PZmq z2sVaIU{lx(His=>OV|pwhHYS5*bcUb9biY;33i5EU{}};c85J+PuL6ghJ9dP*bnxH 
z1K>b72o8or;7~XW4u>P)NH_|PhGXDZI1Y}76W~NR2~LL5a0;9Xr@`rP2Am0J!P#&Q zoD1i{`EUVT2p7S{a0y%rm%-(51zZVN!PRgLTnpF1^>72+2sgpaa0}cDx54dj2iysF z!QF5V+za=?{qO)h2oJ%-@CZB#kHO>c1Uv~(!PD>zJPXgk^Y8+^2rt3Q@Cv*Nufgl^ z2D}Mx!Q1c-ybJHa`|tsL2p_@6@CkehpTXzw1$+r#!PoE&d<);f_wWP!2tUEk@C*D3 zzrpYD2mA?t!Qb!?{0sjd=5Dq|ky4+R%Xt1!mtP|3X8$wummg#OTp4G3YLLoVL4bHR)7^@C0H3& zfmLBOSRK}YHDN7S8`gn!VLezMHh>LbBiI-=flXmE*c`TiEnzFz8n%ILVLR9!c7PpW zC)gQwfn8xY*d6wOJz+1{8}@;HVL#X(4uAvUAUGHffkWXiI2?|EBjG4G8jgWu;W#)R zPJk2PBsdvH!zpkooCc@E8E__?1!u!Ka4wt&=feeXAzTC(!zFMjTn3lJ6>ue71y{p0 za4lR1*TW5PBisZx!!2+t+y=M99dIYy1$V@GLwB&%+DwBD@4I!z=JAyauns8}KH)1#iPU@GiUu@52Z1A$$ZM!zb`5dC^x!e+2JYyn%sR;ZeiUa&Xp1N*{$us<9C z2f{&cFdPDh!eMYY905ndQE)UI1INN~a6FsJqaU`bdCmWENV3@i)F!Sb*ItOzT?%CHKo z3ai2Dum-FNYr)#E4y+67!TPWPYzP~{#;^%&3Y)>^umx-hTfx?_4QvbB!S=8N>3>*u`!SQec zoCqhu$uJsDfm7i$I33P_GvO>a8_t1q;XF7WE`ST+BDfeXflJ{sxE!v4E8!}*8m@tB z;X1e;Zh#x%Cb$`Hfm`7=xE=0*JK-+48}5O7;Xb$@9)JhoA$S-bfk)vncpRR9C*di0 z8lHh?;W>C7UVsybdeDaf3}FPufH7e#7#qfcabY|dA0~haVIr6qCV@#|GMF5ufGJ@r zm>Q;mX<<5;9%g_UVJ4UvW`S8@HkcjefH`3cGSd0{@79~OWGVIf!;7J)@!F<2ay zfF)rmSQup+DkE5j;-$nKCmzB2m8YTa3CB6 z2g4z7C>#cd!x3;K90fov#2gkz+a3Y)pC&Oqs1x|(2;B+_x&V;kzY&Zwbh4bKi zxBxDMi{N6o1TKZk;BvSEu7s=LYPbfjh3nvYxB+g2o8V@+1#X4g;C8qJ?u5JGZny{T zh5O)scmN)Rhu~p&1RjOQ;Bj~Yo`k31X?O;nh3DXTcmZC7m*8c11zv^M;B|Nd-h{W{ zZFmRXh42UEtPHEbs<0ZY z4r{=guokQh>%h9O9;^==z=p69Yz&*grmz`o4qL#MuoY|#+rYN49c&Lfz>csJ>I4tv0!uovtN`@p`iAM6hYz=3cO91MrRp>P-+4oAR|a1#7tVw8;R3i2E`p2U61WsDgUjIxxDu{{tKk~B7OsQq z;Rd)7Zi1WP7Pu8|gWKT_xD)PzyWt+V7w&`m;Q@FM9)gGA5qK0HgU8_scoLq1r{NiR z7M_FW;RSdRUV@k56?he1gV*5=coW`&x8WUl7v6*S;RE;(K7xSVp#wQ|p$B~!zz{}Y3>Xu}g0W#77#GHa@nHg(5GI0&VG@`WCWFag3YZe6f~jE| zm=>mk>0t(#5oUs!VHTJbW`o&b4ww_>g1KQHm>1@Q`C$QA5Eg=kVG&pq7K6oM30M-A zf~8>;ECb8Ja~Yy;cEcCbC{06W4?urureyTWd;JL~~_!d|d9>;wD4ey~3r00+WBa4;MK zhr(fSI2-{-!clNE90SL~ad14G04Kspa59XBQ{YrM4NiwM;7m9R&W3Z~TsRNThYR3B zxCkzWOW;zt3@(Q&;7Yg(u7+#iTDT6bha2EVxCw5CTi{l>4Q_`!;7+&;?uL8dUbqkL zhX>$6cnBVbN8nL-3?7Fk;7NE2o`z@OS$GbfhZo>QcnMyHSKw864PJ*g;7xc7-iCMJ zU3d@PhY#RG_y|6RPvBGd3_gc1;7j-lzJ_n$TlfyXhacca_z8Z7U*K2x4St6|;7|Aq z{)T_xU-%CS$C)hXynug%)Jch7RP=g&y=_07DpoF3&w_VU|bjv z#)k=DLYN39hDl&jm<%R|DPT&N3Z{l>U|N_CriU3|Mwkg^hFM@%maYf^ z32VXHunw#X>%sc40c;2x!N#x&Yzmvf=CB2930uL|unlYr+rjp*1MCPp!OpM?>5lZh>3jHn<(`fIHzXxEt<)d*ME~A0B`Q;URb!9)U;UF?bxFfG6Q8cp9F8XW=<` z9$tVK;U#z(UV&HPHFzD~fH&bScpKh2mZjo*HfAcA$#Iiiae|LGt zNG&r*M+s5l|38*nEGbv6T*uCA%l=n$RuPIvItp)Y(jP+hmcdqCFBg;m08VU4g>SSPF(HV7MqO~Phj zi?CJLCTtgW2s?#c!fs)Yuvge8>=zCQ2ZckzVd02yR5&Ia7fuK#g;T<5;f!!rI47JJ zE(jNeOTuO0if~o9CR`V82sedW!foM>a96k|+!r1Q4~0jhH%#F%0%F}4^-j4Q?yMy#Npxyailm(94(F!$BN^`@!|w=qBu#MEJlk{#Hr#m zak@A|oGH!{XNz;hx#B!=zPLbKC@vBgi%Z0%;xci$xI$bht`b*^Ys9tUI&rA z5;u!m#I52sal5!f+$ru7cZ++(z2ZJ`zj#1AC>|0Ii$}zx;xX~KctSiWo)S-sXT-DO zIq|%BLA)ql5-*Ea#H->p@w#|JyeZxiZ;N-tyW&0ZzW6|VC_WM&i%-O-;xqBN_(FUs zz7k)HZ^XCaJMq2vLHsCw55?It5|u28Nw(xjTyiB(@})ourHB+miYdjCVoPzPxKcbRzLY>pC?%2-OG%`pQZgyI zltM}=rIJ!hX{5ALIw`%BLCPp)k}^wKq^wdlDZ7+I$|>cNa!Yxnyiz_Xzf?dfC>4?l zOGTujQZcExR6;5#m6A$JQBoPHtW-`aFIA8#N|mI_QWdGHR86Wb)sSjRwWQio9jUHV zPpU69kQz#jq{dPcsj1XVYA&^qT1u^?)>0d(t<+9xFLjVQN}Z(6QWvSK)J^Ix^^kf> zy`a)hX|uFN+A3|6wo5ytozgC8x3ovvEA5l^O9!Nb(jn=vbVNET9g~hrC!~|oDe1Iy zMmj5*Qq>Iue>9TZ1x+-0hu1hzho6;@mwsc3jE8UasOAn-n(j)1y^hA0pJ(HeG zFQk{!E9tfLMtUo~lio`oq>s`k>9h1j`YL^szDqx(pVBYsxAaH)EB*WL=oe*4mSsg& zWg=^`E*r8bQ`wT4Y|DC@j>s|Om~t#Rwj4)}E60=L%L(L!aw0jgoJ3A4 zCzF%QDdd!LDmk^BMoufIlhex?*9C6|%Q%H`zpas|1fTuH7hSCOmA)#U1O4Y{UVORg=~ zk?YF!sq!>=x;#UkDbJE;%X8$p@;rIIyg*(kFOnC_OXQ{UGI_bYLS8Aal2^-XvDc_QB%Xj3v@;&*!{6KyvKawBIPvoca zGx@pvLVhW~l3&Yj{wRNvKg(a_ukttfyZl4`DgTmx%YWp*^1uJWFDjBE 
zD~h5jM9~yoF%(mwils2cRvd*ZuHq@a5-6b(QDP`Dl~_t_C5{qTiKoO@5-16kL`q^M ziIP-FrX*KVC@GaxN@^vIl2%Elq*pR18I?>*W+jV~RmrAgS8^yhm0U`0C6AI<$*1I3 z3Md7YLP}w!h*DH3rW997C?%CrN@*oZDWjBC$|>cQ3Q9$#l2TczqEuC?DbDf zQd_B`)K%&!^_2!nL#2_@SZSg(RhlWyl@>}%rIpfJX`{4N+9~ao4oXL*lhRq~qI6Zd zDczMGN>8Pi(p%}H^i}#P{gnaAKxL3JSQ(-WRfZ|Ul@ZEFWt1{n8KaC<#wp{K3Ccue zk}_F|R;DOZm1)X!Wri|SnWfBD<|uQOdCGicfwE9pq%2mJC`*-P%5r6evQk;4tX9@2 zYn64%dS!#MQQ4$yR<QZ#nlpONwt((T8&c6sAbi1YI(JST2ZZ}R#vO1Rn=;0b+v|C zQ>~@eR_myB)p}}uwSn4DZKO6Y_h1ybWrM6bvsBP7DYJ0VV+EMMKc2>Kn zUDa-CceRJwQ|+bpR{N-Z)qZM!b$~ih9i$Fchp0oUed6 zI#HdZPFADUDe6>pnmS#bq0Urisk7BN>Rff6I$vF&E>st(i`6CSQgxZSTwS5AR9C61 z)ivr`b)C9i-Jot%H>sP|E$UWvo4Q@yq3%?7sk_xZ>Rxr9x?eq@9#jvhht(tMQT3R5 zTs@(lR8Og=)idf@^_+TMy`WxHFR7Q+E9zDCntENmq25$)skhZT>Rt7odS88@K2#s6 zkJTsYQ}vnpTz#RwR9~sD)i>%}^_}`&{h)qSKdGP9FX~tIoBCb-q5f2VslU}f>R*hLT}qI2l1kl2K$d8AHaBab!H1Kqit&WHN~+Q^-^@jZ7yq z$V@Ve%qDZlTr!W$Ckx0zvWP4uOUP2Pj4UTB$V#$`tR`#7TC$F;CmYB{vWaXaTgX?V82Ub2tuCkMzua)=xzN61lfj2tH?$VqaFoF-?;S#pk?Cl|;?a*13f zSIAXzja(-;$W3yK+$ML(U2>1yClAO&@`yYpPsmg9j65eV$V>8yye4nRTk?*)Cm+a1 z@`-#VU&vSTjeI9R$WQW%{3d_MU-FL#ny5*dtSOqR5lz!{&CpDZYL><{TXQt7xtgc> zTA+nmM2n%t)M9C|wK!T_EuI!%OQ0px5^0IGBwA7}nU-8jp{3MPX{og|T3RigmR`%C zWz;fhnYAohRxO*BUCW{6)N*OLwLDs0EuWTOE1(tB3TcJ4B3eI%plWPFiQJi`G@^rghhPXg#%FT5qk7)>rGN_16Yy1GPcg zU~PytR2!xZ*G6a~wNct=ZHzWn8>fxeCTJ72N!nyBTAQLx)uw6FwHew>ZI(7$o1@Lu z=4tb_1=>Pwk+xV{qAk^yY0I@0+DdJewpv@Gt<~0P>$MHqMs1U}S=*v*)wXHdwH?|{ zZI`xN+oSE(_G$aI1KL6Dkak!*q8-(aX~(q_+DYw{c3L~5oz>21=d}ykMeUMyS-YZL z)vjsRwHw+^?Ur_1yQAIJ?rHb62iimJk@i@7qCM4~Y0tG6+Dq+~_F8+Rz17}n@3jxw zN9~jLS^J`W)xK%pwIA9~?U(jj`=kBU{%L|P>XI(&imvKJ*K}PsbW^9gr8C{u9i8j0 z?&-cB=%F6bW9TvUSbA(djviN!r^nY5=n3^idSX3^o>Wh!C)ZQxDfLu(YCVmfR!^s= z*E8rD^-OwZJ&T@I&!%VBbLctsTzYOjkDgc0r{~uT=mqscdSShYUQ{op7uQSZCG}Ez zX+26WqnFjo>E-nbdPTjGURkfASJkWO)%6;BO}&<0Td$+n)$8f?^#*!Fy^-EnZ=yHV zo9WH<7J5s)mEKx!qqo)D>FxCndPlvJ-dXRWch$S;-Sr-NPraAkTkoUy)%)rF^#S@o zeULs_AEFP{hv~!h5&B4dls;M?qmR|c>ErbY`b2$_K3R{}r|47lY5H`1hCWlDrO(#q z=yUaX`h0zXzEEGJFV>gnOZ8>?a(#uqQeUO7*4OB3^>zAseS^MH-=uHWx9D5-ZTfb7 zhrUzarSI1F=zH~j`hNX@eo#N8AJ&iPNA+X+as7mTQa`1i*3al?^>g}p{epf`zocK* zujp6xYx;HlhJI7OrQg=?=y&yd`hER@{!o9UKh~eQh%kt*5BxF^>_Mv z{e%8d|D=D`zvy4}Z~AxrhyGLlrT^Cd=zsNpx?qTgWXOhMs0J}KLpKb=G^k-2%&-l| z;D&2>hHnH$Xhe({Moc4?5!;Ak#5Lj>@r?vVLL-rp*hpd|HIf<0jTA;oBbAZbNMocm z(i!QE3`RyHlablTVq`V48QF~-MouG_k=w{)KJv6dPaStfzi-tWHdIK z7)_04MsuTu(b8yTv^LrpZH;zDd!vKV(dcA!Ho6#Hjc!JFqleMc=wSw(U@dRHlmFw##CdPG2NJ9%rs^h zvyC~%Tw|Uw-&kNQG!_|)jU~oXW0|qsSYfO*RvD{}HO5+Fow457U~Dut8JmqQ##UpS zvEA5V>@;>6yNx}@USprJ-#B0#G!7YujU&cU6)JDn}Hdc5i^Dv(~M=tHshFa&3I;fGl7}VOk^fDlbA`(WM*J46HZ&WV zjm;)zQ?r@b+-zaCG+UXi%{FFRvz^)A>|k~@JDHu$E@oG=o7vs$VfHk8nZ3#+-L4L510qdL*`-ghLRCtrMs;dXlTvC? 
zMs4a)PF?Czp9VCf5gLQWq_Jph8i&TE@o0RSfF`7gXkwa#CZ)+}a+-psq^W3XnueyO z>1cYIfo7zcXl9y)W~JF^cAA6cq`7Ernuq44`DlJxfEJ{MXkl7}7Nx~#aaw|wq@`$S z8b!;{va}p6Pb<)hv=XgMtI(>n8m&%i(3-RstxfCDy0jjxPaDvNv=MDgo6x4T8EsBm z(3Z3nZB5(IwzM5>Pdm_#v=i-2yU?z*8|_Ye(4Mpx?M?g8zO*0hPY2L}bPyd(htQ#P z7#&VW(2;Z$9Zkp3v2+|APbbicbP}CRqv;enl}@A6=?prP&Z4vF96FcIqx0zkx{xlS zi|G=&lrE#o=?c1%uA-~y8oHLQqwDDgx{+?8o9Pz1m2RWk=?=P+?xMTt9=ezAqxMqwnbl`jLL3pXnF+m42h&=@0so{-VF> zANrU6qk<(`k|kS;rCP+&EZs6J)1sDTG0V0bi(9VcS-urmp%t-WSTU_wR%|Pd71xSq z#kUez39Up{Vk?Q2)JkS0w^CRstyETOD~*-bN@u0FGFTa{Ojc$qiSUIg+ zR&FbgmDkE=<+lo01+79>VXKH$)GB5bw@O$gtx{HLE6OTkm9@%Q<*f=RI)z23A9>k=590Vl}mzSSOh_`dR(00oFikkTuvEVhy#1S;MUn)<|oVHQE|u zjkU&E%PbFF#Sd~1QV&{||Iww72+t!377 zYlXGaT4k-a)>vz;b=G=ogSFAxWNo&#SX-@a)^=-$wbR;V?Y8zx6aEI%S=<&RA!ybJlt5f_2flWL>tdSXZrU)^+QKbxK2wdS$(~-dJy~ch-CBgZ0t+WPP^2SYNGg)_3cN_0#%g z{kHyCf31I(z(gi7nJG+VglSA?1~VCD7Gun24&%&a9`jkiLKa~$SWFg+#b$9>To#YT zX9-wBmWU;0Nmx>rj3s9&SW1?PrDkbZT9%HbXBk*VmWgF%Sy)zljm1a?_3@gjZvGS|}tH>&`%B%{j%Br#ItOl#e zYO&g^4y()RvHGk5Yseb0#;gfz%9^p}tOaYyTCvuw4QtEVvG%M3>&QB>&a4aT%DS=c ztOx7Kda>TD59`bNvHolT8^{K+!E6W{%7(GwYy=z0MzPUs3>(YFvGHsIo5&`y$t;>p zVN=;OHl592GubRQo6TW!**rF%Eno}TBDR<Zxnyq1L**dnKZD1SO zCbpSvVO!ZYww>)@JJ~L_o9$tH**>nw?>1**SKe zU0@g4C3cxzVOQBTcAec|H`y(Ao84h|**$ijJzx*nBleg*VNcmJ_ME+7FWD>hn!RCf z**o^0ePAEiC-#|rVPDxd_MQD;KiMz#oBd&b**_-OqAl67t=OtfY|Ykf!!~VdTQ;+8 z+p)Rr+Mey(fgRcrJBA(8j%CNT*R*Tdwe31~UAvxL-)>+xv>Vxt z?Iw0pyP4hGZeh2yTiLDcHg;RPo!#E0&XV146*bD7N_F{X9z0_W2FSl3NEA3VGYI}{n)?R0?w>Q`u?M?P(dyBo* z-ezyNci21aUG{E!kGYTBy{+)3f2bW%B~ zoit8bC!Let$>3ykGC7%@EKXJ@o0Hwi;pB93Ik}xYPF^RUliw-e6m$wXg`FZ!QKy(w z+$rIdbV@m;ohYY_Q`RZxly@pP6`e{>Wv7Z$)v4xGcWO8_omx(9r;bzCspr&p8aNG| zMowd=iPO|+<}`O&I4zx4PHU%))7EL{w0Al<9i2{2XQzwP)#>JRcX~KIonB6Fr;pRu z>F4x!1~>zqLC#=jh%?j~<_vd6I3t}=&S+In$jP z&P->PGuxTt%ys5D^PL6GLT8b)*jeH%b(T5HofXbXXO*+sS>vpA);a5)4bDbqle5{` z;%s%cIoq8b&Q52Sv)kF@>~;1z`<(;MLFbTj*g4`Hb&fg5ofFPU=ah5WIpdsl&N=6u z3(iI7l5^R);#_sEIoF*V&Q0f*bKAM&+;#3b_nimML+6q6*m>eSb)GrTofpna=auu? 
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_linux_3.4.4.pickle
deleted file mode 100644
index 67a442c2be93bc9c64cdefb7c98de4af29f2c7ed..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16581
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_2.7.11.pickle
deleted file mode 100644
index d244ce48b04c605ba02ae176c1dbdeb74f40e70d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16875
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_AMD64_windows_3.5.1.pickle
deleted file mode 100644
index 3733e16122709862fd9066dc29c52cfba6e7cbed..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14674
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_2.7.11.pickle
deleted file mode 100644
index 3ce45c6949be0ed8e600fba1af34e39d92b6894e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16718
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle b/pandas/tests/io/data/legacy_pickle/0.18.0/0.18.0_x86_64_darwin_3.5.1.pickle
deleted file mode 100644
index 1eccef3903e01d8bb93e254709724ca8344e4433..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14671
[base85-encoded binary patch data omitted]

diff --git a/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle b/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle
deleted file mode 100644
index bb237f53476b5c9b7026fe7813a8f26e5f233bda..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 127584
[base85-encoded binary patch data omitted]
z9FG%lB2L1|I0dKTG@Onza3;>e**FL1;yj#>3veMW!o|1*m*O&9jw^5_uEN#02G`;` zT#p-YBW}XYxCOW3Hr$Roa3}7<-M9z$;y&Du2k;;s!ozq3kK!>rjwkRWp2E|32G8O- zJdYRfB3{DFcm=QGHN1{D@Fw2E+js}>;yt{N5AY#A!pHaopW-uojxX>fzQWh|2H)a4 ze2*XSBYwiq_yxb>H~fx2@F)Jl-}nbZMPUD9Xbgj4F&u`+2pAC~VPuSgQ85}u#~2tB zV_|HJgK;q)#>WJh5EEfyOo9#yl<1;DjUF2GF+htUm=u#?a!i3KF%_o9G?*6CVS3Df z88H)P#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E5EjNFSQLw4aV&u)u@siZGFTSNVR@{8 z6|oXl#wu79t6_DlfiY z6LAtw#wj=zr{Q#*firOy&c-=77w6%8T!0I45iZ6hxD=P+a$JEcaTTt{HMkbn;d@fE(tH~1Fc;d}gm zAMq1@#xM94zu|ZMfj{vV{>DETDkA$ILt_{Wi{UUlM!<*|2_s_^jEd1PI>x}57z<-# z9E^+cFg_;0gqR2uV-j>wphOoHYV^>cj{#Z?!K9cBlVb`@iK#F(roptB4%1@>%!rvV zGiJf8m<_XI4$O(UFgNDGyqFL3V*xCPg|ILd!J=3Ui(?5aiKVbKmcg=E4$ET&tcaDc zGFHK=SPiRV4XlZ^ur}7gx>yhEV*_l6jj%B`!KT;@n_~-XiLJ0Tw!ya84%=e~?1-JP zGj_qQ*bTd55A2D(us8O>zSs}@;{Y6pgK#ho!J#+|hvNtwiKB2dj=`}w4#(pJoQRWf zGETv%I1Q)c44jFxa5m1txi}B!;{sfWi*PY6!KJtim*WatiK}omuEDjq4%g!b+=!cS zGj74HxDB`C4%~^ma5wJ3y|@qe;{iN~hwv~S!J~K#kK+kEiKp;1p24$t4$tESyoi_Z zGG4)}cnz=P4ZMlB@HXDTyLb=p;{$w%kMJ=*!Ke5PpW_RBiLdZAzQMQn4&UPk{D`0M zGk(FZ_zl0~5B!P0@HhU!P?6aG7#hQ1SPX~ZF#<-!NEjKTU{s8T(J=g4JN=${RF%720beJA9U`EV@nK27y z#cY@zb6`%)g}E^g=EZ!N9}8eXEQE!z2o}X+SR6}WNi2n>u?&{Qa#$WKU`4Eim9Yv| z#cEg`YhX>Rg|)E`*2Q{Q9~)ppY=n)m2{y%M*c@A6OKgR$u?@DxcGw;}U`OnPov{mc z#ctRgdtguOg}t#4_QihK9|zz-9E5{$2oA+zI2=ddNF0TuaSV>daX20);6$8+lW_`8 z#c4PlXW&eng|l%E&c%5+9~a<4T!f2p2`Lkg}ZSN?!|q$9}nO`JcNhw2p+{_cpOjQNj!z8@eH2Db9f#v;6=QIm+=Z- z#cOySZ{SV5g}3nz-o<-(A0OaDe1wnj2|mSV_#9v0OMHc|@eRJkclaJZ;79y~pYaQR z#c%i>f8bC2g}?C+hKkJo$IuuC!(uoLj}b5;M#9J#1*2j#jE*rdCdR_p7zg8GJdBSC zFd-(w#Fzvf6e!U}g&I9H=wpBuLog{O!{nF(Q(`JijcG6~ro;4@0W)GI%#2wuD`vy& zm;-ZSF3gR2FfZoA{8#`BVj(PyMX)Fq!{S&1OJXT3jb*Sbmc#N`0V`r9tc+E#Dptel zSOaTfEv$`ourAia`q%&)Vk2yfO|U68!{*omTVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~ z*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQzX&Do(@c zI0I+mES!yVa4ycn`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf%uEX`X0XO0%+>BdrD{jN> zxC3|MF5HcKa4+t|{dfQm;vqbYNAM^f!{c}YPvR*&jc4#Ip2PEa0Wabuyo^`yDqh3u zcmr?ZExe6)@GjoN`}hDK;v;;FPw*)|!{_({U*ao#jc@QRzQgzU0YBm={ET1lD}KZ8 z_yd39FZ_*v@P9?U|3vw}b3kYegJCfohQ|mP5hGz_jDk@y8b-$$7!zY*Y>b0(F&@Up z1eg#LVPZ^z4hod$qC$-x8uT$hiy@d4lVNg9fhjQ+rp7dw7SmyR%zzm&6K2LNm=&{O zcFch}F&E~>JeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>b zb*zCku@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2B zckF>Zu^0BnKG+xgVSgNe191=z#vwQqhv9G>fg^Dgj>a)K7RTXuoPZN?5>Cb`I2EVi zbew@RaTd-P@;ct8Ud)I2u>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG z2G+z{SR3nLU95-ou>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU3 z2lm8X*cY>oQBhJ z2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn6 z2kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B0 z2mZug_#6M=|1ReFCmQ=7!(dnphv6{-M#M-M8KYoSjE2!M2FAo#7#rhYT#SeDF##sT zM3@+ppo0P>x~Nd2hX#EN&|(NC#blTqQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^m>qLq zPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z3RcBxSRHF% zO{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1?1Y`M3wFhB*d2Rd zPwa)gu@Cmee%K!e;6NONgK-EB#bG!cN8m^tg`;r{j>T~}9w*>LoP?8c3QomoI2~u; zOq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP3vR`2xE*)k zPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW3SPx)cpY!x zO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K_#J=X zPyB_y@ehWI&hP&i8pB{%42R({0!GA07#X8rRE&nvF$TuOSQs1QU|fuc@i74=#6*}F zlc0kFCAz3kqlX544A5c-CdFi!98+LQOogd24W`9(m>x4=M$CknF$-qJY?vK$U{1`1 zxiJss#eA3_3t&MkgoUvP7R6#%97|wHEQO`943@=mSRN~2MXZFCu?kkjYFHg>U`?!r zwXqJ?#d=sD8(>3hgpIKYHpOPx99v*ZY=y0{4YtL0*d9AzN9=^1u?u#^ZrB}rU{CCY zy|EAW#eUcy2jD;)goAMi4#irsL98cg$JcXz644%bvcpfj{MZAQU@d{qWYj_=R;7z=R 
zxA6|%#d~-kAK*iLgpctFKE-GF9ADr|e1)&^4Zg*9_#QvtNBo4J@e6*%Z}=U5;7|O8 zzwr--ioyQJ&=>~8VmJ(s5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0Atu7am;@aZ zDA7fQ8a*`VV}KSzFexU(SI818ZU}tc`WB zF4n{P*Z>=1BW#RKuqigf=GX#TVk>NoZLlr2!}iz#J7Op7j9suRcEj%21AAgG?2Ub} zFZRR!H~D!}YiUH{vGTj9YLkZo}=k19##s+>Lv1 zFYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ!}E9nFXAP8n18?Fjyp4D8 zF5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U?!}s_BKjJ6+j9>68e#7th1ApQ#{EdGw zR800ihQ=@$7QVSG%02{92S#w6&VK#49Y z)aao>9|N=)f=MwMCdU+*5>sJnOoM4L9j3<&m=QB!X3T_y7RM4;5=&ueEQ4jS9G1rlSP?5>Wvqf#u^Lv#8dwu+VQs8~b+I1S z#|GFC8)0K?f=#g*Hpdp&5?f(wY=dpF9k#~~*bzHnXY7Jqu^V>B9@rCmVQ=h%eX$?* z#{oDH2jO5GfxDhwuX54~XaT{*O9k>&B;cnc6dvPD` z#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H z#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(U+@f&`}ANUi0;cxtdp<=QB zF*Jt3uow=*V+4$dkuWkw!KfGwqhkz=iLo#?#=*E4594D3Oo)jvF(yF=1xj>Lp+*l4 z`WT?a5KM~6Fgd2cl$Z)rV;W40=`cNJz>Js)Gh-IairFwb=D?ho3v**0%!~OjKNi4( zSO^Pa5iE+uusD{$l2{5$V;L-q<*+*1(!r3u|K?tc&%qJ~qIH z*a#bA6KsmjusOECme>kgV;gLX?XW#|z>e4nJ7X8@irug~_Q0Ol3wvW9?2G-dKMufw zI0y&h5FCoba5#>@kvIxR;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8 zxCj^H5?qSQa5=8PmADF5;~HFx>u^18z>T;GH{%xEira8I?!cY63wPrl+>85gKOVq? zcnA;U5j={=@Hn2plXwbG;~6}Q=kPpUz>9bZFXI)wir4Tu-oTr93vc5cyo>knK0d&Q z_y`~46MTx#@HxJ~m-q@_;~RX7@9;f-z>oL|KjRntir?@%{=lF33xDGu3>BOGkD)OP zhQ)9g9wT5xjD(Rf3P#0f7#(9^OpJxGF%HJXco-iOU_wlUi7^Q}C{Ut{3N?CY(8mBR zhG0@mhRHDnro>d38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{ zn_yFHhRv}Bw!~K08rxu7Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq{c!*e#6dV1 zhu}~ghQo0Lj>J(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fm zm*7%dhRbmUuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAP4oS025*&OpHm;L4guoRH)HIgFXgmF$9xh zGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`IAQr;HSOkk= zF)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs*aVwm zGi;76uqC#_*4PHyVmoY)9k3&I!p_(QyJ9!&jyZzFARfZQcm$8) zF+7eZ@FbqX(|88Y;yFBz7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&#_ynKg zGklIO@Fl*&*Z2nC;yZkgAMhi7!q4~xzv4Iijz91x{=(n*2SdeW|6^zjgJCfohQ|mP z5hGz_jDk@y8b-$$7!zY*Y>b0(F&@Up1eg#LVPZ^z4hod$qC$-x8uT$hiy@d4lVNg9 zfhjQ+rp7dw7SmyR%zzm&6K2LNm=&{OcFch}F&E~>JeU{rVSX%t1+fqo#v)i0i(zpr zfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCku@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wo zfi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNe191=z#vwQqhv9G> zfg^Dgj>a)K7RTXuoPZN?5>Cb`I2EVibew@RaTdv(J(s3z?c{dV`ChQi}5f%CcuQ42oqxxbWor~7Zqys(4db2S`5LYm<*F+3QUQq zFg2#Zw3rUlV+PEKnJ_bE!K|1Kvttg-iMjrt-t7W9YM^cS{v@GDad&rjcb7uZ;ssh- zSQcm@ZSlq3-QC^Y-QC^Y-Q}Cf{+9~9@8@~X`<-_=XMY*VY;uhxn=V)e%VIe!j}@>Y zR>I0y1*>8;td2F%18ZU}^u*d&2kT-ztd9-Q3mal1Y>Z8?DK^9A=#4F~CALBz^u^Zb zhyK_G+oBEIp&i>}2keLe7>J#)GnyEL!5D&Fuq$@M?$`r+VlQ-HZ|sA8u^;xw0XPr` z;b0tsLva`m#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRO}p7vo`kOn?b75hlhYm=u#?a!i3KF%_o9G?*6CVS3Df88H)P z#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E5EjNFSQLw4aV&u)u@siZGFTSNVR@{86|oXl z#wu79t6_DlfgV^BYoRCB#yVIR>tTItfL_=T8)0K?f=#g*Hb-x4fi1BW`k*hiMnCk& zHrN(z*beR39y?%148TC_gq_jEAPmM3?1Ejf8+OMY*b{r91AAj1?2G-dKMufwI0y&h z5FCoba5#>@kvIxR;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8xCj^H z5?qSQa5=8PmADF5;~HFx>u^18z>T;GH{%xEira8I?!cY63wPrl+>85gKOVq?cnA;U z5j={=@Hn2plXwbG;~6}Q=kPpUz>9bZFXI)wir4Tu-oTr93vc5cyo>knK0d&Q_y`~4 z6MTx#@HxJ~m-q@_;~RX7@9;f-z>oL|KjRntir?@%{=lF33xDGu6iK=MqmBkfK{t$w z(J(s3KzEFZu`o8q!MGR?<6{C$h>0*UCc&hb43lFDOo^#5HKxI|m=4op2F!?=Ff(Ss zte6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4EQzJCG?u}#SPsi$1+0jburgM` zs#p!HV-57cnpg`xu{PGhx>yhEV*~WUhS&%jV-swO&9FIoV+(AFtD3c z9EQVj1dhZ}I2y;`SR9AraRN@nNjMp&;8dK3({TpQ#925S=ipqNhx2g(F2qH+7?_uyXKhx_pW9>ha<7?0pl zJch^d1fIlGcpA^(Sv-g5@d94NOL!Tt;8nba*YO74#9Me9@8Dg$hxhRTKEy}(7@y!% 
ze1^~Q1-`^r_!{5fTYQJ_@dJLuPxu+X;8*;H-|+|j#9#Ou|DZ_5^&fRKFbcY1RE&nv zF$TJ0OpJxGF%HJXco-iOU_wlUi7^Q##blTqQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^ zm>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z3RcBx zSRHGi2iC+|=!vzl4%WqbSRWgp7dFI3*ch8&Q*4IK(HmP}OKgQc=!>n<5B;$XwnZDZ zLp!#|4%iU`Fc3RoXEZSggE0iVU{~yh-LVJu#9rvY-q;8GVn6JU18^V?!ofHMhvG0C zjw5g+j>6G62FKz!9FG%lB2L1|I0dKTG@Onza3;>e**FL1;yj#>3veMW!o|1*m*O&9 zjw^5_uEN#02G`;`T#p-YBW}XYxCOW3Hr$Roa3}7<-M9z$;y&Du2k;;s!ozq3kK!>r zjwkRWp2E|32G8O-JdYRfB3{DFcm=QGHN1{D@Fw2E+js}>;yt{N5AY#A!pHaopW-uo zjxX>fzQWh|2H)a4e2*XSBYwiq_yxb>H~fx2@F)Jl-}nbba<2cVqk&P-4WnW-jE*tT z9b;lFjE!+HF2=+7m;e)EB20`)FexU(SI8 z13j=N)wX80$XA$^g&;2jeh8lZLlrcupQd5 zJ$As37=VG;2|J^SK^Tl7*af>{H|&l*uqXCH2lmE3*cba@e;j}VaS#s1AvhF=;cy&* zBXJat#xXb+$KiOKfD>^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{Go zD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~H zC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X} zFYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD957yiaSC{l3!M;#4}f^HZUqhWN6f$kU+ zV_|HJgK;q)#>WJh5EEfyOoB-<879XRm=aTAYD|M^F&(DI444r!VP?#NSuq=C#~hdw zb75}GgLyF@=Enk95DQ^pEP_R`7#7D8SQ1NNX)J?fu^g7i3Rn>Rk0dY#~SE? zHL(_YVr{I0b+I1S#|G$y4Y3h6#wOSln_+YG#unHTTcHp7Vr%q6e{6$o(T45Nj_t7n zcEkV-#7@{5O$@?d48bng6}w?~?14S87do&v_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{ za5Rp=u{aLL;{=?DlW;Ol!KpY6r{fHqiL-Dv&cV4j59i|oT!@QsF)qQSxD1!$3S5b+ za5b*MwYUz~;|AP_n{YF3!L7Irx8n}niMwz&?!mpd5BK8%Jcx(zFdo69cnpu@2|S6X z@HC#mvv>~A;|08km+&%P!K-);uj388iMQ}J-od+g5AWjxe29p$vfU=(!2s2B~SV+?f1m>3IV zV;qc&@i0Cnz=W6x6JrugipelJrofb#3R7bmOpEC-J!Zg+mus$|GFKmd7urW5lrq~Rdqc^s|me>k?&=*^yANpe(Y>PH*hjwg_9k3$? zU?6tF&S+u~24e_z!LHa1yJHXRiM`N)y|EAW#eUcy2jD;)goAMi4#irsL98cg$JcXz6 z44%bvcpfj{MZAQU@d{qWYj_=R;7z=RxA6|%#d~-kAK*iLgpctFKE-GF9ADr|e1)&^ z4Zg*9_#QvtNBo4J@e6*%Z}=U5;7|O8zwr->R9yd2M+2jv8%D)w7#(AvJI2IV7#rhY zT#SeDF##sTM3@+pU{Xwm$uR|{#8j9X(_mUmhv_i`X2eXG8M9zk%!b)92j;|Fm>ct8 zUd)I2u>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG26|vktc9Lf z8|z?QtcUfn0eWFWY=n)m2{y%M*c`pF1-8Ui=!3r48vW28+hAL?VLP;Ad+dN6F#rRx z6Lv-ugD@CFunTs@ZrB}rU{CCY4(yG6urKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^u zj>GXd0Vm=loQzX&Do(@cI0I+mES!yVa4ycn`M3ZV;v!s(OK>SJ!{xXFSK=yMjcaf% zuEX`X0XO0%+>BdrD{jN>xC3|MF5HcKa4+t|{dfQm;vqbYNAM^f!{c}YPvR*&jc4#I zp2PEa0Wabuyo^`yDqh3ucmr?ZExe6)@GjoN`}hDK;v;;FPw*)|!{_({U*ao#jc@QR zzQgzU0YBm={ET1lD}KZ8_yd39FZ_*vP^9Mik2)F{1>G!!pc|$t70{*jy2E&Yho?*#M)Q~ z>ta2uj}6cZ8)74Dj7_j9HpAxVjV-Vxwn88D#n$MD{@4cFq7B=j9ou6E?1%vvh@G%A znizz^7=m4}D|W-~*aLfFFLYpU?1O!=ANI!qI1mTnU>t%&aTpHA5jYY@;bUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAhtaTzYh6}S>t;c8riYjGW} z#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri;cI+@Z}A9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3? 
zi+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LO zkN62c;}`sj-|##Bz@PXFf8!q%X}SKRjs`|SH;jtWFgnIScZ`X#FgC`)xEK%PV**Tw zi7+uH!K9cBlVb`@iK#F(roptB4%1@>%!rvVGiJf8m<_XI4$O(UFgNDGyqFL3V*xCP zg|ILd!J=3Ui(?5aiKVbKmcg=E4$ET&tcaDcGFHK=SPiRV4fMd8SPMO|HrBzqSP$!C z1N6d%*a#bA6KsmjusM2T3v7w4&EF)-LO0Mz@FF(9oQTDU|;Nq{c!*e#6dV1hu}~ghQo0Lj>J(o8pq&R9Eam^0#3w9 zI2otlRGfy>aR$!BSvVW#;9Q)C^Kk(##6`Fmm*7%dhRbmUuEbTi8rR@jT!-s%18&4k zxEZ(LR@{c$aR=_iUAP&yZK`exY zu?QB$VptqYU`Z^6rLhc_#d264D_}*egq5)hR>f*q9c!Qm*2G%qiM6o~*2Q{Q9~+<- zHpE8Q7@J^IY=+Iz8(UyYY=u7Pi>=WQ{jm+UMH{w5JGRFT*bxIT5IbRKG%*N+F$B9{ zSL}w}u?P0VUg*Hy*a!P!KkSbKa3BuC!8inm;xHVJBXA^+!qGSe$Kp5~j}verPQuAJ z1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G z1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00O}p7vo`kOn?b75hlhY zm=u#?a!i3KF%_o9G?*6CVS3Df88H)P#w?f>vtf43fjKc3=Egjj7xQ6$EPw^E5EjNF zSQLw4aV&u)u@siZGFTSNVR@{86|oXl#wu79t6_DlfgV^BYoRCB#yVIR>tTItfL_=T z8)0K?f=#g*Hb-x4fi1BW`k*hiMnCk&HrN(z*beR39y?%148TC_gq_jEAPmM3?1Ejf z8+OMY*b{r91AAj1?2G-dKMufwI0y&h5FCoba5#>@kvIxR;}{%^<8VAqz==2sC*u^H ziqmj9&cK;C3uogToQv~tJ}$t8xCj^H5?qSQa5=8PmADF5;~HFx>u^18z>T;GH{%xE zira8I?!cY63wPrl+>85gKOVq?cnA;U5j={=@Hn2plXwbG;~6}Q=kPpUz>9bZFXI)w zir4Tu-oTr93vc5cyo>knK0d&Q_y`~46MTx#@HxJ~m-q@_;~RX7@9;f-z>oL|KjRnt zir?@%{=lF33xDGu6dAbwqmBkfK{t$w(J(s3KzEFZu`o8q!MGR?<6{C$h>0*UCc&hb z43lFDOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e z42xq4EQzJCG?u}#SPsi$1+0jburgM`s#p!HV-57cnpg`xu{PGhx>yhEV*~WUhS&%j zV-swO&9FIoV+(AFtD3c9EQVj1dhZ}I2y;`SR9AraRN@nNjMp&;8dK3 z({TpQ#925S=ipqNhx2g(F2qH+7?_uyXKhx_pW9>ha<7?0plJch^d1fIlGcpA^(Sv-g5@d94NOL!Tt;8nba z*YO74#9Me9@8Dg$hxhRTKEy}(7@y!%e1^~Q1-`^r_!{5fTYQJ_@dJLuPxu+X;8*;H z-|+|j#9#Ou|Dedo^&fRKFbcY1RE&nvF$TJ0OpJxGF%HJXco-iOU_wlUi7^Q##blTq zQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_- zOJGSXg{83!mc?>d9xGr)tb~=Z3RcBxSRHGi2iC+|=!vzl4%WqbSRWgp7dFI3*ch8& zQ*4IK(HmP}OKgQc=!>n<5B;$XwnZDZLp!#|4%iU`Fc3RoXEZSggE0iVU{~yh-LVJu z#9rvY-q;8GVn6JU18^V?!ofHMhvG0Cjw5g+j>6G62FKz!9FG%lB2L1|I0dKTG@Onz za3;>e**FL1;yj#>3veMW!o|1*m*O&9jw^5_uEN#02G`;`T#p-YBW}XYxCOW3Hr$Ro za3}7<-M9z$;y&Du2k;;s!ozq3kK!>rjwkRWp2E|32G8O-JdYRfB3{DFcm=QGHN1{D z@Fw2E+js}>;yt{N5AY#A!pHaopW-uojxX>fzQWh|2H)a4e2*XSBYwiq_yxb>H~fx2 z@F)Jl-}nbbCa(Xeqk&P-4WnW-jE*tT9b;lFjE!+HF2=+7m;e)EB20`)FexU(SI813j=N)wX80$XA$^g&;2jeh8lZLlrcupQd5J$As37=VG;2|J^SK^Tl7*af>{H|&l*uqXCH z2lmE3*cba@e;j}VaS#s1AvhF=;cy&*BXJat#xXb+$KiOKfD>^NPR1!X6{q2JoPjfO z7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW*Wr5HfE#fWZpJOR6}RDb+<`lB z7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p&*6EzfEV!+UdAhU6|doSyn#3I z7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD-{E`wfFJP_e#S5O6~Ezk{DD95 z7yiaSC^B>XM;#4}f^HZUqhWN6f$kU+V_|HJgK;q)#>WJh5EEfyOoB-<879XRm=aTA zYD|M^F&(DI444r!VP?#NSuq=C#~hdwb75}GgLyF@=Enk95DQ^pEP_R`7#7D8SQ1NN zX)J?fu^g7i3Rn>Rk0dY#~SE?HL(_YVr{I0b+I1S#|G$y4Y3h6#wOSln_+YG z#unHTTcHp7Vr%q6e{6$o(T45Nj_t7ncEkV-#7@{5O$@?d48bng6}w?~?14S87do&v z_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{a5Rp=u{aLL;{=?DlW;Ol!KpY6r{fHqiL-Dv z&cV4j59i|oT!@QsF)qQSxD1!$3S5b+a5b*MwYUz~;|AP_n{YF3!L7Irx8n}niMwz& z?!mpd5BK8%Jcx(zFdo69cnpu@2|S6X@HC#mvv>~A;|08km+&%P!K-);uj388iMQ}J z-od+g5AWjxe29p$vfU=(!2s2B~SV+?f1m>3IVV;qc&@i0Cnz=W6x6JrugipelJrofb#3R7bm zOpEC-J!Zg+mus$|GFKmd7urW5lrq~Rdqc^s| zme>k?&=*^yANpe(Y>PH*hjwg_9k3$?U?6tF&S+u~24e_z!LHa1yJHXRiM`N)y|EAW z#eUcy2jD;)goAMi4#irsL98cg$JcXz644%bvcpfj{MZAQU@d{qWYj_=R;7z=RxA6|% z#d~-kAK*iLgpctFKE-GF9ADr|e1)&^4Zg*9_#QvtNBo4J@e6*%Z}=U5;7|O8zwr-> ztX%(5M+2jv8%D)w7#(AvJI2IV7#rhYTnzTLCHD*&T+fWB2@&G!<<>4BIDZi{zD(Jp z%D3$O%mfmLnNYrGv&WF{wnVnX9%7J}<~yXGnMh@OXtkaigX}To_OJ$Lqp35}gf0vb 
zMYxHmBASRUVhDE;Q^XRnMH~@V#1rvF0+CQ85{X3;kyIoT$wdm0Qlt{8MH-P-q!Z~y z29Z%@5}8F78KP_=yT~DOid-VM$RqNKd?LRnAPR~?qOd3;ii%>QxF{h?ic+GqC?m>> za-zJbAS#MVqOzzWs)}l&x~L&ML`_jkc#7Jhj;Jf@iTa{}@DdG0Bhgqi5luxi(Oh_o z7NVtSC47XhXf6DNzi1=c3Y%yr?4rHsAUcWw5hyx|&cYNyB3OioE~2aGCc29rqNnI3 z9HO`ABl?PdqQ4j*28uyquoxnSieX~77$HWAQDU?hBgTqxV!W6jCW=X7vX~;KifLlH zm?370Sz@-BBj$>EV!l`)7K%k;u~;IOie+NCSRq!5RbsVRBi4#_V!hZPHi}JRv)CfG zifv-M*dca`U1GP`Ble1YV!t>b4vItKus9-)ieuuqI3Z4oQ{uEZBhHF*;=H&ZE{aRy zvbZ9yifiJ!xFK$eTjI93BkqcO;=Xtw9*Rfev3Me$if7`vcp+YjSK_sJBi@R4;=TAF zK8jD`v-l#uif`h(_#u9ZU*fm;BfDSIbj{GBXl`0mEt(cxi=nw|F|}A)Y%Pu!SBs~` z*Ai$6wM1HCEs2&?OQt2)QfMi)R9b2+jh0qRr={02Xc@IkT4pVamQ~B9W!G|OIkj9` zZY__NSIej6*9vF_wL)59t%z1sE2b6KN@yjuQd()Pj8;}Fr~fST=UjiXf3r?nvdqIwbuMJ zf31zyR!5Yi0<=J_lh#=?wID563(>l0UA1mncddukQ|qNUwBA}Dt*_Qk z>#q&a25N(}!P*dQs5VR+u8q(}YNNE#+8AxDHclI_P0%K4leEd&6m6+8S-GwoY5GZO}Gqo3zc^7HzAx zP1~;R&~|FOwB6bsZLhXZ+pita4r+(A!`cz;sCG;{uAR_MYNxc*+8OPvc1}C5UC=IS zm$b{;7452aO}nn$&~9qCwA(0*#awBOnvIr%kR*9|?2?xsi8qv_H07`nS2Q;(&` z*5l}L^>})GJ%OH3PoyW-ljuqHWO{Nvg`QGRrKi@@=xOzIdU`#Bo>9-FXV$amS@mps zc0GrlQ_rR6*7N9j^?Z7My?|a&FQgaNi|9r5VtR4CgkDlFrI*&r=w7IIRy^da2ucz178|YqoL%osSSZ|^?)tl+fb#J|e-coO+ z`{=%UYu!)x*W2iAb(`K!x9jco4thsDKo8VA>78{`57LA65WS1uRqv*E*L&za^cjNm`UribK1v_0kI~2KdW-y`U-uezDi%MuhG})>-6>d27RNx zN#Cq*(YNZ`^zHf%eW$)l->vV__v-uf{rUm@pnga{tRK;j>c{ls`U(A{eo8;BpV80i z=k)XX1^uFaNx!UL(XZ;)^y~T!{ic3PzpdZV@9OvT`}za@q5epJtUuA8>d*A&`V0M~ z{z`wXztP|7@AUWj2mPb|N&l>W(ZA~7^zZr){ips*|E>R#OTT95hG9f8+>EG3G$XnZ z!*DlZ8nKMnMjRuq5zmNkBrp;hiHyWX5+kXR%t&seFj5++jMPRNBdw9nNN;2?G8&nT z%tjU?tC7vfZsag>8o7+zMjj)tkR5u>P4%qVV@FiIMwjM7FKqpVTR zC~s6SDjJoH%0?BVs!`3TZqzV5jG9I*!_%m3)G_KB^^E#P1H;Q`Xf!e!8%>O+Ml++i z;cc`qS{khkAH&yZZTK1fMjNB8VKdqpcB8$~!RTlN7=cD7qqAWeK}N6(aUfcy^TIbU!$MV-xy#FGzJ-ijUmQRW0*1A7-5VwMj4}xF~(S9oH5>*U`#Y7 z8Iz4E##CdPG2NJ9%rs^hvyC~%Tw|Uw-&kNQG!_|)jU~oXW0|qsSYfO*RvD{}HO5+F zow457U~Dut8JmqQ##UpSvEA5V>@;>6yNx}@USprJ-#B0#G!7YujU&cUB#WEmI*-7qRz zHxY6>I_VhbjxjM7#>O}p7vo`kOn?b75hlhYm=u#?a!i3KF%_o9H0V0@bfnW`gr9xN zI~?aL!s8&EF)-LO0Mz@FF(9oQTDU|;Nq z{c!*e#6dV1hu}~ghQo0Lj>J(o8pq&R9Eam^0#3w9I2otlRGfy>aR$!BSvVW#;9Q)C z^Kk(##6`Fmm*7%dhRbmUuEbTi8rR@jT!-s%18&4kxEZ(LR@{c$aR=_iUAPY0OjMZ@T*9z3X*JE~_5>ZNW=)Jxr%s8<|}i|Vn4dZ`B*>ZKlLsF!-2 zp zJeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCNSQBfZ zC)UO~SQqPIeQbbU*bp0GV{C#=u^Bc;Z)|}ru@(BDFSbTM^v5>X7H!xL?bseWU`GtV zK#~#=dd!Yk+V;}5`{jfg{z=1dj2jdVNiocz=gO77vmCKipy|0uE3SJ3RmMA zT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q591L$ipTIcp1_lM3Qyx1 zJd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!wS$mTbi=3^4WnZWbjO$&3u9v( zjEnIwJ|@6~mJs)Gh-IairFwb=D?ho3v**0 z%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$V;L-q<*+)<6%eiM7xZ zYhxX(i}kQRHb5_Ih>fr@Ho>OY44b1jw!oIy3VqNQTcaQPV;gLXHf)D>Y>yqVBL-j~ zcEZkRVh{#n2zJ4)*bTd55A2D((1E?N5B9}=*dGVrKpcdFaR?5@VK^K|;7A;Wqj3z5 z#c?SI813j=N)wX80$XA$^g&;2jeh8lZLlrcupQd5J$As37=VG; z2|J^SK^Tl7*af>{H|&l*uqXCH2lmE3*cba@e;j}VaS#s1AvhF=;cy&*BXJat#xXb+ z$KiOKfD>^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW z*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p z&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C;0dGc}e9LYWe%TDDwAt z#mrxe?-iqENB>|SKc8S9GqcR_ zHnYfgTXG+pm#v|!npIWuHnXYBRJJA#GrP*LHS-X*<{p~O+ry1tHs+A|KHg?dRWa;O z7jvolGB$35h?4BHcLvxK#g$oA~-Z#PS-9A^OJ;Bb?tu4(cE z!&WI3p^7F)P_VDR?UlD#+G_8&?T^3howr%WN?E@QDQj(p|7A!y+R?38$CD}R<+@2! 
z3i_Wre!7wHZFtb!g{*40T;u(HZJ`ku+1EX8zZqZXkCE6KXG+l;gjl zM2-JOsySPL-5Qv3{5O_qhuOr+4Lkmus`?>z)gckbe{I+JZ)WY=T4TaXKG3!(PR4b8us=v&aQE8(p z%BZxpss-4s!Cie1$Yv8!W?5EU;szu=a^J zCqU(Tn}HJNk#WN;e!AQ=}p(;Ae-m0SPi9S}f0J{|!*%N(b+F|yya>MpSe^o!ku0AG0PuP06;y=LJ zcc9w$pYb_J<$9ZgCC;!8Q5m-Ao`i0wOghYAR%V3I4Ojd8m(Yz+JITxzJGS_;!*82anXh)@9G>nch(3)Xm)Q4Kb zajg2Z(7270NryS!%8U@V32LAJ61RzJr~e0WnTXzK0Rs9gVif+Uam%%mSW?1{qRQvuj;%2E_Z*#W9IV0w%j7Vn0T$wNX zb)KpiwqNI~`pzk-ZUYxsyXW(C-OU9%@8t3XwrLaO9~|UfH^j#b_BSgB__>=4WnH-* zERs0P#nz6ohk3;glDBlh?E>1?3JeIgnM-88M=ZG_m`mlB-I)wxdGP2a7mZ~qKaw|j z-zg7xxh!>_*$TN8*_o}B`7-jWR7Dy2)mF6tyA=`{`86`_FxOhSVdwffRX@b8&Obur zy$rvQPUd=9;Oc=5a@%2Ul<)ucz$TUJZElvxS3M3ytzNl~fl=QQms5 zV7*tg-YYrtL+`q`%X%^pJ0#(HZl`L*+2;TLTE=b7F4dTIw7XR+5sr3GXv=C$>{UtU zQCaW*^=S9W%Ca~1OH^+hQ2WYxV1-Ba#zC2On1`&~u)T3u)eo_&bBfp-ZXMfo^Y=54 z$RbyN9F^M+^O$`9w?B@nTyOJ)#5rzGs*Ffx#3`9C<9b?E3_ByvsQRIObyg*-vP=p2 z-6K!+`~`{>D44HMzA!I$Pft0N&#AV)RxL+u=LkNpGOerC1=Um}M{-f-%a~nK6=lpW zTh#*W)+xxCU6E;rdDY4d8?$Sweu!NiZ^W4Ce!=E-nePhN4Y}>KgY#6T9p*DDH*9d8tNI~!b!-uWV>e`;D=;tQw!?fW-~SEFE0ybQzLq!- z@r}v|bBG4FC2wWE49q);>wEiqx$Q7Ns2%?OZ*?D4whWlO`Id2lgWg`Os=w`1o*Yx9$Jvq`6=bLGc2j(R>V(At@1*{l13&Y zh9#}clVc;DM2(H~Y7e=Nt9aYw*vKGtm>I2H=h%=zmMdT;RUTqj@o|m~`BW>m8UXU) zP|G$Q0)2wb%(Bc*H)oOC4l}EKXZl6xF2BlAG4n76t4B)7T$cnn4Xl?ew_dWh zqslxvMhnO+`4~v;`0pqwM0@{r5h$!GMHnVULU&YWThyu&VG$@MGwsd+Qe18R9UAh~ zN?1EsQ>LV~89tb$R9@)VC@quDD27g%GBVHE8)YMDpj`L{%DWmUAKE|#ndk2?D@xSh zs-*UigG+U@8eElC+F@3)veoF4bIBTARaJ!$yXuM%hgt1!xg7RZd39&GihGE^dcG#V zfu?kgwHh+lVS1>YoVU>t##&8P;%(MaIM2@0+6*6_+A2>SkaI<;qmq%2wYsv%oRermfvvWyJ6c#f zwNyJr+8wP_iF|~kaGry&wHdxUTC2R!bMRBi$j`xF7TH@lyQ7WV4%Z!RUCpTOkayF6 zyTcje|MZaXYoGsK9rO+bXMgdc6F|QyW_sAJ4|b*AhlDZ-4U!xyv-1W^BlTZo8h~o ztI7-Qj&3R$`8jl#MY0|F47-Qbj(o7%)7l98V6~UZkoH z)8Lc$IAPlv8Md9s{wN~Ltyjc{I5M8{iI8<%R{Ta=o8jX(#?`p<^vBAiv*SZo{c$qS z8Ncz7G%z820~1{hR0wTglFXBrjmZ)AEndvN7@o`-??AF(R?owvS0*5(U?HB3Xn4?O(&AAHa3C^=N!$)Yo%9EYQ z09l}tVdsW(JM>;`(QhVAI3s=oRpYrqWk>PEl9 ze_G94t5u?M&Wgz}6{f=smb+VXm`sokPI+on^f$53#GrxYje^ZeSjg>00sPsJ}<3i=`(N7cUn9{DjV`+s@l zdq=cP{>4Zr{s;da`El8}jP42b8P2hA(%K9^7EW1*AOqwa=BH)SITS)yw=*)&*}&OI z8aNlef%C2g!Yq#$WS(3eFG|$ZxTN-wQ$vkXH8n1)w8Ok&WrtlJuc`_mcGU%PdA#GF6>mBeHXtFvi@GX)CzK3t%hpPekoXPd4 z#Gii~_!UV5zr#21$JIc^&<2EDZ{=*$Bx<(lQZ?JuC|9%1P-%x5#mWvl+uYO+A$B$T z`@esFkdaHRc6at(8f!Cr@1^DAn5^k!GU8-SFY}xY zWQe4JjNu!|_UkTrY**<1~TnXK9WHjpEd26EB>_gT4A(mDs{)uMdp zaLX-=HoX$HqOt>Q_g8V%#Q`IAQr;HSOkk=F)WTHuq2kk($4Qx9x5=_ zcdD8yA`x!jYq_H2ya0Qu7sb{bDx^nQ#k<>!%6K*KAR5`Ly`8|iB^dBDjMI2Isu5kTQ z(536^v+JSYKR1f4!hfQ?*8KfMd8F3Mr99|xFIT?H|5R?>p7~hkYK;zGYcqT_Te}X! 
zIXe7g(u!W_t&P9Ta`r}>NcL|Vet+A)`?r%>a+$SD)Oc#Ic97#q^|BgI9aLIAzO=H# zF0%ouLWo`Uj9exI|CYO!*-p-K6-U=HDW4}Z+B(ZzhiR&v{?lj+Qia}Tu)=wMA=YO2 z_;gWup`)#JLD>u-DGCyXzMOh)}v*+zjL(pkQw0yTTj)9YV+@4n;Wiwg=>(x zbPY0>u0iJV-@)c=@BegvS39m}K4Ay1kGI)NorpD(9O?`rtTVk`XXPA8eN@tlls$ZW x`pN>izVwr*@z7t@ljA}4gBlM5qz-eSmFpZ1&OaXvQsp6b6+74Z5)zW<{{ufO6ng*w diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_AMD64_windows_2.7.14.pickle deleted file mode 100644 index ddd88f77aa2a49c2a39943e7c044eac972f96aa9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 133468 zcmc$E1#}cmv*@_=xX%Rl;O_1c2)a0gBm_1g!fq0Lad&rjcXxMpcXxMrGn)iH;Q#Ku z@0@qf+nh5sU0q#W)>Yj->52F}aG(p1jBOp!$zwOrKo{0JG$F!+g|`olZ5%6Vk0_+RI&*YB4Z+|JEMRH|Ieg48R$Y1x_GEy7uLob;qlOa4vcHl zCO#q|9#jhL3_KuvFAwAMuz@Z*HqILp8Xegs!ov-6ks(QB9zNWW)DZ+YkJb^<38BCv z1i?X25Hz5kM|8;{iSZF3-iYYXgvbs_1RiOS3v1W0b-YLRx!~9!I5Etxai~WLk4=ne z-#JfsoHrs*Ok#9GWT@9`3u2ECkBf~@@Fs>Q0FRnP1#H{+dp%kZ)XxQn+4JdvLGWN0 z^h>5_ozS^`1n?Lxq*gIoBG{%d1A{OZTq-y?xW*6r=YIr2KHDs@@wV9ZUqb(cutBam z5VZf{W#XI+L0XqMmjhcf*jxv$nL6>c1Cczr%b&Ml-5t9|IB;g2hSweV;@0C_7i?H2 zzDHLF4#U*L7vuz6$=J}C2#*z<*uS19z=eB5W7|e}Ql)sn?_Jx=VWII6Nls37Zt6cN z`~yKfXLvG68lNYv9YgU6q22^fx**7f#V16x_oNSkowvC5o(w_2oiuSq7nT?qo8Zah z{)?||d^^y>LG{>#B88GUXZCrr1iBTs>i%WrtjSh(+lGxyF>JQLAoOp;CYuuS|KKD5 zA;}O;hFCJhlOd4|$z(_+Lpm8IgeB`@$K}tk^JM>}P|AWyXdMX}q{L;iLOFb%oPq9t zjr&@Wv55(G%;d5|Kex}5C(z9ea`WzAA)VJP4RR+1Zra4KKP;2)AC`$pu}uE|dB`Oz zSf{x;$1&E-jrpoLMtZ4Mf8-dcoF{asW1Nx4I_Gtaax?qr(~eQ{^6$K?gxsol0e3;pTO!YNW`^T)l-!2bO|cUBP>-X=OOG{IB!m!l$` z^kv62!6(1F`uXSpR$%nk=2g9HYpY^`QOK0l2R|rCPAoN5~Px$m<-iqXePtd z$uNB~%vkQHO-s3m_V#LtYwaohOP`d$OX>l2`~=B?SH|Zl`(FiKTx^27a%^khDQAaX zd7r1kU!hm=SLoR}^N-N0lwugiiCFRy!;>MA49R3jB||zHf@H`fLpB+5$?&Jvl@ncL za*6O%@p-BSy2vIGp&fHcvel+0kayI<&K=eh?b( zwjU|ghpCVl9~m1FA8%JZPlJD(;U7f6^rR_s^L)5?WL z*nS9g+epD){t#p%!yLK71KsJ|t=ysRutZxFN3x{y+}f_|5uFl3!W^SSxE-C^_&jZY z)5-nYh)7!vyJQnKAS&CgiST-&{!Ndhe&A_mvyJw7V*HNyXDW^TEuLMN<7ZFYzp-Pcv~YfS)&A7U3dN96vs*x6YcUBlVW$ry@~(E|1VPD>0nRR z(dX&pm)y2v=Rg1dCv9=uUEE#W zc9$K|&Zexkf0q^F4vCHTbpJPjeh~ss4_lI+K2NVC8~URlF(EQK&u<$m@O#VC+aA=% z=jm&g$q;vFe27ykJ^h?Lf!hhd{y~I`jCXcc_Ev0wy=fr)i!a_Y(AhwAwwJYMkiAdz zYc<$L;SMU^GsJmy7{q&qI)jopc!mYLn3%|LZ(MSFc(A)xu#5c%D&8}~rl+DKLfiX! zj`85Kbx@?{;$N1fOT(Uoki7_m~596YVWigg=2z{x?Vbq5+;MHsh&2&oqDhasCZg zWNh1{+%!E1wFONP2d!jlGsErdp=bI$v;LA~_V1DqNjneE9JjOenCtV*`)kDf-$$^? 
za#e_E8yV|a@ZU)9rvjdZHuFV3&tgCG-vy?f6&BJUF(Tfx#OBS@~~5IK9BL%4WLS=UMZYW^4b@tbRmH>ZX*HE&n#hKZt;5t3AawpJ%%> zg$r{sfoF%`=boK`Zs)Cj5SHjXQsw}DC6XMp>))pMMFTv$?HTs?JbVAL(7xX-L{w=~ zxIp28p8fyEG^rnW4%iG2`aFkHta3QeJw1rDWh7j5hfr^%eJqQM^&D|w(RMcX98L81 z9&sP_`_)gzCLi`fqC3XLc#gT9x_aE_Ig!Hgq(kMz0+Fn>9S!_X*CgqF>fgltoep?T z+sx1SJZDpwpR+wiJ8q*>OiDZ3i6pM)|BdTU65zRDGrZ{YTuNr>=knX0 zL-(*?w?Fl{?PTY+Q=Hq*Z0^y)?%$KyO@}@{#b9=XUa`#?CD6!@Ay1V?3;+NID0$x)a^{xYE9+90dFNgt7ksX^S@ZV_>EO~`_2hnKuCMw zc^T*e<>Mnm^EQZU*E!Dfs@BNvPPVU6zk%m%M6#y@3sjPFHk zWTK0XZy%C$SK~!(gwKoF&p>z4lu6>-+dhuSlSBgQ`&;$>cRRp~+oaihc?thG(ivx8 zv&1Kc*;g1|%5J*2q|qr8&<+j{vF~3(y}*7?nZti~VxRURys@Ft@u0q4ouVTuM8}1< zs}&j>+BU)qyo^n5PsQ3Gu!WbifqgJc6q;mY=dr+YNn>-AOQ_o8 zW72zyKp{yI0l%(EXD+Yeh$8&+^1`d82!!9xHxR5npM4Pe z$4!V=|HUlIVKB%=IhPjBsVTv}kMbHeu7Ce9+jyq^mSkni8S@{k9AM)^ecn_ya1r_P zcAZZ<(!{w$nTrq*?}z@mOS`bN;#iHi8g;xGSKbcQAQKg#j>AlLtYcUH3HuyaCX8$>1Lf+~)b zQT{C8EQ6?|)nCmu(Gah_jHR zk`{2Hi?VB58?SwV?G3WgK5tF?`B!E06C@WldovYU#Tfv+wQO2@c?SCj*0!I?i=Ith z^htc{*d6}mo|4i3C}qDFvARh->p9$7cDUyP< zsWaeLur{-sK5uhB@*lz4!lD0Xu(q@je+6rZGt3bg{GO~@`SJgwWMzNh_-v(2C4w@n%Hzbd{vWtQKQrgQcGhYFE@Q#y(|9}E0yMEz$q+NBI5{9eds zlmFxQ-+SS|wtjR9_g~g`xc?z~%->W=3ffze*#9hcT*?qT>i<{1Rs7uD{HLI5pECP@ zsg~z&D&SY%R?C~Dmc2IO9V&O~sHCKn;BWn&loJ2J$4NwX#_3>#q=XatbFFo>aXo4rUBKJf!8w`2mO7bFmlQq~e(~x0Pd?rL10Ubte7dLbu~U8WCpmAAfAZ=1ANch8 zn@{f)J{5oQ>0{#}eBQn`NUnVSoPkM|&&l@5m9M{p^?3*QJAanG!H})5e{(y~A@$cj zC-Wrwyo3Irgl*Fxe$v4|Nu5=0lMeZl6tzo)pLFO?QfDREq{IFsMFzvDpLF<7QYU&7 zBb;5ezp}b$=j@hL`bOAYE-s{XM4M3it9M8{yMB2`Iyl_F$@h+O-j$#I|Jaj{wtId? zZDLrw3yF#Jj^^VG+^s z-idarg?EyJ`*r&Bmu7!;t>TUA5)m6*Df#d>*{1t>_?zMka$=A2SEt>PJ&6gpZ3nO+Yz>s5gi;QXZy#@amM^yO>_ON-&f^42bZ)}NLfwu9aLi{ z>xS8f;RSXpxzWYDFs0EZF3!6srQu2JXf&`_?9X@a2Ak5} z=56%z*yQIygoMYnwtceMZv9rFw>aIlHi(!lDK-iu$HrEF$A3PDH%K3Bo7r|a0-<(| z!yy79Aqt`)05SHFf`fQSfJ8`wWJrNjNP~39fK14O0-#h-YA6kq7D@-DhcZAJp-fO_ zC<~Mo$_52O*`XXzPAC_Y8_EOah4Ml9p#o4rs1Q^bDgqUSib2Jp5>QE~6jT~21C@o! zLFJ(eP(`Q`R2ixQRfVcS)u9@8%-4i!LBUXMs18&Yst47F8bB_nA=C(J3^jq8Ld~G& zPz$If6auw^LZL7y9BK_kKy9G5kQ<7GqM&wAG!z5HLUB-g$OCzycqjo%ggQVSp-xa| zs0-8;>IV6s?obb?C)5k-4fTQgLj9or&;V#4Gzc0D4S|M2!=T~N2xufU3K|WKfyP4P zpz+WIXd*NTnhZ^Wrb5%8>Cg;lCNvA04b6e(Li3>c&;n>7v(CA8CUgtB4c&q6LieEi&;#fp z^ay$kJ%OG=&!Fef3+N^E3VIE_f!;#zp!d)R=p*zA`V4)6zCz!i@6Zn?JHQBx!WfLh z1WdvdOv34?J9u{B`mS7oHU=`M29X4PSw%`Cb6`UGQ1E+=4!Rg@)a7H*2oEgpn zXN9xDfpB&>2b>em1?PtIz%$G~ZzB!iMsQ=e3EUKJ1~-RW zz%AhrxD^}+g7rYzZ1Mh|R!TaF@@Im+xd>B3gABB&> z$Kez3N%$0e8a@M`h0np~;S2CZ_!4{>z5-u`uff;h8}Lo|7JM7N1K)-3!S~?@@I&|! 
z{1|=$KZT#c&*2yFOZXN18h!)6h2O#N;Scaf_!ImY{sMo6zro+(ANJz6s{?`}7=j}N zLLwAGBLHC#7U2*c5fBlP5E)Sr710nKF%T26kN_kVk{U^aq(#yp>5&XbMkEuG8OefV zMY17*NOmL#k`u{=X^b>Mnj+1R=12>q zB@%+PLPC)+BphjtL?CUDwul>vM52&(NHh|I#3FG>d&Glyk$5BlNklpz9g$8*XQT_# z73qffknTtiq$kn~>5cS3`Xc?1{>T7iATkIUj0{1BBEyj3$OvR4G71@uj6udCd7vI<#^ ztU=Zy>yY)x24o|$3E7NnLAD~>knPA0WGAu<*^TT$_9FX`{m232AaV#fj2uCZBFB*9 z$O+^matb+(oI%ba=aBQr1>_=f3Av11L9Qa#kn6||5=ngz{@ zW}U=&Cz=b*jpjk~qWRGLXaTe!S_mzS7D0=m#n9qt3A7|y3N4M6LCd1$(DG;n zv?5vwt&CPdtD@D=>Szr#2(5|MLW9xTXdSdJS`V#{Hb7lyL$ndv7;S8|{PkMf;)s(E;c{bPzfi9fA%;hoQsK5$H&C6gnCmgN{YVq2tjB z=tOi9IvJgUPDQ7o)6p5|Omr4H8=ZsBMdzXO(FN#2bP>83U4kw}m!Zqi73fNI6}lQ- zgRVu_q3h8N=tguCx*6SqZbi4D+tD59PIMQ#8{LEMMfaim(F5o~^bmR&J%S!ZkD5^bz_P zeS$tkpP|pu7wAj$75W-|gT6)Iq3_WT=tuMu`WgL#enr2b-_aj-;fFB@~9F)z1}r0%3CoOS!Lnl6 zus|$3mIKR)<-&4fd9b`#J}f_004s|Bq1=bP^ z!CGOVSQr+LwZCIIKP9!Ms>JmVhN<9k7mAC#*Bp1?!4+ z!+cnGtOwQ;>xK2k`e1#repr8O05%XCgbl`qU_-HC*l=tFHWC|!jmE}cW3h4Acx(bT z5u1cf#-?CXv1!5!-}q#;`rdyM^7x?qGMZd)R&K0rn7kggwTd zU{A4U*mLX!_7Z!Ay~f^PZ?SjSd+Y=D5&MLF#=c-*v2WOS><0$nFpl6Tj^Q{?;3Q7r zG!AeEXK@baaRC=`372sNS8)y3aRWDT3lG3k;i>U7cv?Iio*vJDXT&q%nei-mRy-RX zh-b%h;5qSJcy2roo)^!D=f?}+1@S_7VY~=l6fcGs$4lTP@ltqcybN9zFNc@ME8rFJ zN_b_w3SJejhF8aH;6ZpzycQme*T(DMb@6(5eY^qg!W-g^@Wyx(yeZxcZ;rRXTjC*j zD?Ahr!^82`cm&=CZ;QL}NIVK}hezWvcq|@=x5quW7mvpi@I<@=-VyJFcgDNmUGZ+X z5ATlmz_iSCCy|TDP2?f+68VVyL;<28QHUr^6d{Td#faiW38ExXiYQH#A<7cvi1I`Qq9Rd= zs7zELsuIO>79h^R@_B7%w9L>;0oQIDukG$33=L!uGUm}o*YC7KbHnf(T(sC z-H9GVPofvmo9ILICHfKli2=kwVh}Ny7(xssh7rSw5yVJh6fv3@LyRTH5#xyo#6)5e zF`1Y`OeLlf(}@|xOkx%>o0vn)CFT+Hi3P+$ViB>JSVAl%mJ!Q|6~szn6|tIFL#!p% z5$lN!#71Hhv6PtU(5mHOX3JFjF4wHX<97O~|HXGqO3^f^12Kkgdp2 zGK>r-TayuF8?r6wCL_rxvK<*s#*ndO9NC`qkX|yLOdu1<4rE8N6WN*ULUtv)kv_6J z*@Ns!_9A%JUM}!NKPUr zlT*m4KfILVZA`g>C$fM*j@;G^dJV~A+ zPm^cJv*bDQJb8h62=GE$kS%v2UCE0v83q_R^v zsGL+TDmRsf%1h;=@>2za@@Fja&qN)@AuQzfX9R4J-7RfZ}{m7~g26{w0-C8{!2 zg{n$bqpDLis358)Rf`IyYEyNnx>P-?KGlG7Q4Og^RAZ_M)s$*RHK$roEvXQy6%|T_ zQQ=f;DuQZ5wWZutBo#%qqoS!8Dwc|)+EX6NOT|+OR3g=Z>PU5>I#XS!u2eV5M|G!q zP(7($RBx&e)tBl=^`{0<1F1pOU}^|8lp012r$$gCsZrEuY78}&8b^(%CQuWpNz`O& z3N@9QMop(?P&27n)NE=FHJ6%4&8HSn3#moaVrmJslv+kDr&drasa4c!Y7MoPT1Ty? 
zHc%U>P1I&;3$>NnMs25dP&=tz)NX1IwU^pQ?WYb<2dP8UVd@BVlsZNor%q5OsZ-Qx z>I`+3I!B$SE>IV#OVnlR3U!sbMqQ_FP&cVt)NSexb(gwF-KQQ<52;7gW9kX@lzK)z zr(RGmsaMo%>J9aldPlvdK2RU2Pt<4X3-y)yMt!G#P!J8%2#wMhjnf28(iBb8fM#fx z=4hT4Xpxp^nO10()@YqJXp^?+06G<&nodKfrPI;r=?rv6Iuo6l&O&FUv(bTcb~*>0 zlg>rwrt{Ew>3np4x&U2}E<_imi_k^sVsvr31YMFYMVF?_&}Hdzba}c0U6HOtSEj4b zRq1MUb-D%}MAxKi(ZO_Wx(;2Ju1D9W8_+JgA>D{>OgEvM(#`1RbPKvA9YVLFL+LO& zoNi4=&~50pw408kqv&>YG#x|7(s6Wq+CzKkcshYjq&v_Z=}vTKx(nTv?ne9Q?sN~j zC*6zgP4}Vu(*5ZE^ZGTYGCOwOuP0yj{((~x~^a6Szy@*~+FQJ#x%jo6w3VJ2Iie62xq1V#u==Jml zdLzAw-b`+}u!CVh*(P2ZvK()Z~5^aJ`K{fK@{KcSz}&*bRa#*05XD1AT!7UvVv?N5M&2AKu(Yg_ zUXTyu2L(VuPzV$TML2AS=770i9+(dn zfQ4WYSPYhcrC=FY4pxAbU=>&m)_}EO9as-GfQ?`i*bKIStzaA24t9W@U>Dd8_JF-$ zAJ`8LfP>%=I1G+}qu>}g4o-lR;1oCw&VaMv95@dyfQ#S~xD2j)uizW_4t@ZL zff6+VbU_`nDk5r zCL@!H$;@P7vNGA2KqfnrgUQL{VsbNin7m9rCO=bvDaaIJ3NuBRqD(QSI8%Zt$&_MB zGi8{vOgW}JQ-P_-RAMSKRhX(wHKsaKg9&14GPRgsrZ!WDsms)3>N5=(7t@ew#587_ zFin|eOmn6M(~=2cS}~za7!%I4W+Iq2Ok2jyL^4rKJ0_ZmVPcs$raj|fyi7ckz$7vq zn2t;*rZdxp>B@9td`x$y2h)@3#q?(SFnyVROn+toGmsg?3}%KfLz!XBaApKEk{QK} zX2vjMnQ_c`W&$&jnZ!(HrZ7{PY0PwH1~ZeH#mr{rFmsuC%zS16vyfTDEM}H4OPOWN za%Kgyl3B&9X4WuknRU#1W&^X4*~DySwlG_nZOnFN2eXse#q4JGFngJO%zowobC5a2 z9A=I%N10>HapnYbk~zhkX3j8YnRCo}<^pq(xx`#%t}s`bYs_`#26L0S#oT7@Fn5`I z%zfqo^N@MOJZ7FSPnl=TbLIu}l6l3vX5KJwnRm>4<^%JQ`NVu?zA#^zZ_Ib*2LrJ% zi?Aq*u{cYxBulY03s{C_S&rpdffZSam05*VS&h|MgEd);4PaBTso6AaS~eYMzb+&EE~tRXFaT!jb{_sM79Il zk?q8GX1lOm*>0?l?auaKd$PUQ-fSPXFWZmp&kkS*vV+*c>=1S+JB%I9j$lW!qu9~x z7jvdcVU?;MZ*vae^b}BoKozBi+XR@={+3Xy4E<2B%&n{pWvWwWo>=Je=E`TdyGBKo?uV1r`Xf%8TKrDjy=y_U@x+l*vsq{_9}agz0TfXZ?d=8+w2|oE_;u? z&pu!uvX9uu>=X7W`;2|gzF=Rnuh`e@8}=>xj(yL5U_Y{-*w5@2_AC31{m%YiAr9sc z4&^Wo=LnAED30a;$8apiaXcq*A}4V&r*JB#aXM#kCTDR0Tq-U#mxfErrQ_0b8MusG zCN49Vh0Dri;{v(tTn;WLmy65I<>B&j`MCUC0j?ldh%3w$;fiv_xZ+$1t|V8AE6tVR z%5vqn@>~V3B3Fs4%vIs4a@Dx%Tn#RWtI5^kg1OpU9j-1{kE_o$;9Oiot`XOmYr-|< znsLp!7FKR3 zow&|i7p^PUjq`EcxgK0kt{2yv>%;Zs`f>fa0o*`t5I2|`!VTqyal^S0+(>Q|H<}y6 zjpfF18bz5x1CI!Y$>Nam%?C z+)8d0x0+kSt>xBn>$wfwMs5?gncKo`<+gF#xgFe2ZWp(k+r#bU_Hp~U1KdIG5O

xgQ+F!#u*H zJjUZZ!IM12(>&lAp5-~7=LKHmC0^zgUgb4j=MCQEEk1xx#i!=e@M-yUe0n|upOMeR zXXdl;S@~>yAfKJj!RO?2@wxdtd|o~upPw(l7vu}^h4~_UQN9>ooG-zb=cIDdja$)Dm+^Jn<8{5k$Se}TWqU*a$GSNN;^HU2t(gTKk&;&1bJ_`Cc){yzVJ zf5<=LAM;Q6r~EViIsbxx$-m-X^KbaK{5$?V|AGI=f8sy$U-+;5H~u^SgNFoIKm=64 z1Y95lQlJD{00JYh0w?f-Ac%q_$burMf+pyKA((RqR>I;D0C7! z3tfb+LN~!DbQgLEJ%wIEZ=sLSSLi477X}Ceg+an#VTdqP7$yuCMhGK?QNn0pj4)Oh zCyW;+2or@#!en8JFjbf)Oc!PdGlf~gY+;TtSC}Wv7ZwN$g+;<*VTrI*SSBnNRtPJF zRl;gvjj&c&C#)AX2pfe>!e(KMuvOS5Y!`M2JB3}sZefqGSJ)@)7Y+yqg+sz&;fQcl zI3^qyP6#K3Q^INCjBr*sC!7~92p5G*!e!x#a84KQkO+&2h>Dnq zi-bsult_y}WJFfvL|zm`QItekR76$OL|rsQQ?$eYF_oBFOe3Zh(~0TD3}QwxlbBh| zB4!n{iGgBvF^8B_%q8X)^N4xHd}4mFfLKs0Bo-Enh(*O>VsWvASW+w{mKMv1WyNx0 zd9i|6QLH3Z7ORL=#cE=8v4$8V))Z@r!D4N(j#yW$C)O7mh%T|A*hp+FHW8bO&BW$n z3$djbBDNAk#V|2kY%NBJZN#>sTZ|N=#CBq|7$e4tabkPXBYMSnF+ofeJBS^{PGV=V zi`Z4{Ci=wgVh^#W*h}m!_7VGv{lxy_0CAu=NE|E<5r>My#Npxyailm(94(F!$BN^` z@!|w=qBu#MEKU)piqpjD;tX-7I7^%@&JpK|^The$0&$_ZNL(x~5toX~#O2}&aizFQ zTrI8<*NW@J_2LF`qqs@jEN&6Eird8P;tp}AxJ%qE?h*Hj`^5d?0r8-CNIWba5s!+; z#N*-#@uYZ4JT0CP&x+^7^Wp{ZqIgNXEM5_>ir2*J;tlbpcuTx3-VyJL_r&|+1M#8w zNPH|l5ub|B#OLA*@um1md@a5a--_?V_u>cfqxebuEPfHcir>WV;tvs$U36pS% zkVuJ=XbDJ+#7dmROM)ayk|axtq)M8kONL}hmJ}eRl2S`)q_k2xDZP|I$|z-$GD}&c ztWq{9P|7alka9}7q});-DX)}I$}bg=3QC2f!cq~bs8mcUE|rieNR8A@{ zRgfx5m88m26{)IJO{y-{kbPq#b`cebQB{h^9NsXl@Qd6m!)Ld#I zwUk1nR#K=GCWT9_r3k5w)K+p!ky4b@PKuUdq*y6VYA<;tuM{sONQqJhsiV|M>MV7U zx=P(7pVVFIA@!7cNxh{$QeUZ`)L$AP4U`5+gQX$TP-&PnTpA&bltxLTr7_Z2X`D1( znjlS-CP|Z}DbiGFnlxRSA6~<4x*%PYE=iZAE7Dcznsi;dA>EX2Nw=jt(p~AEbYFTPJ(M0v zkEJKlQ|X!XTzVnBlwL`%r8m-B>7Ddm`XGIjK1rXYFVa`(oAh1!Awe=MBQh#uGArq= zaz(k4Tv@InSCy;D)#Vy;kX%!)B?rs3Bjl0tD0#FzMjk7VlgG;w z&oANFBwtPpvE8mmv%Mav-@+0}N z{6u~#Ka-!!FXWf5li$l96G+J1|_4CNy)5aQL-x8 zlt3lBl0(U<812m`Y3&seoB93fHF`SqzqPuC_|ND%5Y_bGEy0(j8?`dW0i5rcx8ez zQJJJnR;DOZm1)X!Wri|SnWfBD<|uQOdCGicfwE9pq%2mJC`*-P%5r6evQk;4tX9@2 zYn64%dS!#MQQ4$yR<QZhn&nbj<6RyCU% zsAgAls5#YKYHl@;npe%I=2r`-1=T`oVYP@_R4t|!S4*fR)lzC{wTxO;EvJ@OE2tIK zN@``bidt2zrdC&Ls6lE?wU!#J)>iANb=7)meYJt=QX8s`)W&KPwW-=nZLYRZTdE;y zD>YONQ^VEPYJ}QGZL7M~NHt1rr$(zWYOETkwpTr>SB+N_)I_y|+EMMKc2>KnUDa-? zPwlSuPKJvbI!+z0PEaSR zlhn!T6m_aPO`WdJP-m*M)YKb*e zx=vlMZcsO>o7Bzf7Imw-P2H~UPKXN{dQLsBUQjQpm(KpZ~`c8eXeo#NEpVZIl7xk<9P5rL^P$3Q05DnEZ4c7>b)F_SC zfW~O7#%a7JXrd-*vZiRNrfIrnXr^Xq0a_|8wU$OptEJP@YZ{<>jrw}3YWcMMS^=$~R!A$X714@n#kAsD39Y17N-M3E(aLJ&wDMX7t)f;* ztE^Sgs%q7=>RJsgNUN#U(t@?xS{<#fR!^(1HPBpIL#>h4SZks+)tYI|wH8`SEktXj zg=%41xYk;W(AsEiHMbV2MQQD{Xe~yI)#9}Fnn&|$@mhkGsCCdfYMr#sS{JRW)=l$i z-L)QCPpy~MTkE6s)%t1uwE@~dZICut8=?)>hH1mK5!y&?lr~x$qm9+ZY2&pC+C*)V zHd&jZP1UAp)3q7eOl_7nTbrZJ)#hpQwFTNjZIQNETcR!1mTAki71~N|m9|=2qpj7} zY3sEO+D2`YwprVvZPm7E+qE6qPHmUATic`U)%I!owFBBg?T~g@JE9%cj%mlW6WU4b zly+J>qn*{xY3H>I+C}Y>c3HckUDd8>*R>nkP3@L;Tf3v()$VEcwFlZm?UD9ad!jwn zo@vju7urkhmG)YDqrKJMY45cU+DGk^_F4O)ebv5c-?bkaq{BL*qdKPJI-!#~rPDgl z8J*QRo!13j)FoZk64o(odQrWYUR*Dsm()w?rS&p;S-qTIUaz26)GO(g z^(uN*y_#NKub~I&HT7D0uwGlQqu15z>Gkynx=U}UH_{vHP4uREGrhUqLT{;u=&kfn zJxmYRTk8>e8@;XW)+6;Oy`3Jd$LO(ooZepd=w3ZuPtX(f4thtulipeHqIcE1={~)? 
z-b3%H_tJaoee}M1KfS*`Kp&_N(g*89^r8ANeYieCAE}ShN9$wsvHCcDygosns87-- z>r?cp`ZRsIK0}|W&(de>bM(3TJbk{tKwqdY(iiJX^riYTeYw6uU#YLsSLs$1#`Zj&LzC+)s@6vbcd-T2fK7GG_KtHG-(huuL^rQMQ{kVQYKdGP6 zPwQv&v-&yxynaEys9(}A>sR!v`ZfK!enY>h-_mdEcl5jZJ^jA^K!2z|(jV(j^r!kW z{ki@^f2qIHU+Zu5xB5H%z5YS}sDIKw>tFP*`ZxW%{zHch*gy=_zzp0V4AP(s+5iS) zum)%FhG2+>WXOhMsD@_fhGCe7Wds^BY=2aQ94n*iJOE;nv_YKz+_C; z9W@nsH`((_?zgcr(FFG&`6b%}!=#vy0i)>}LAR?q(0O zr`gNwZT2zyn*GfF<^Xe`ImjGr4l#$C!_4942y>)4${cNuF~^$Y%<<*~bD}xPoNP`p zr<&8u>E;Y`ra8-;ZO$?0n)A&0<^pq}xyW2>E-{yy%gp8G3Uj5o%3N)(G1r>w%=P95 zbECP*+-z6`?dA@1r@71AZSFDmn)}TC<^l7ddB{9$9x;!a$IRpA3G<|R$~*fvfrg_V}ZQe2On)l56<^%Jg`N({1J~5w~&&=oM z3-hJ<%6x6UG2fc+%=hL8^P~C6{A_+Pznb67@8%B^vS16bPz$qgi?B$GvSFjn&q2Tai|j)y|5xVysvz&T4OYEUy)BC0L182dksi$?9x%vASB_ET7fg>S6V? zdRe`#K2~3=pVi+QU=6eeS%a-1)=+DhHQX9ujkHEtqpdO4SZka$-kM-dv?f`Tttr-2 zYnnCPnqkefW?8eXIo4cjo;BZEU@f#3S&OYD)>3PkwcJ`^t+ZBItF1NGT5FxP-r8Vo zv^H6rtu5A8Yn!#*+F|Xqc3HcvJ=R`ppS9mQU>&p$S%!&A zS?ip2-nw92v@Thftt-}5>zZ}lx?$b4ZdtdjJJwz6o^{`PU_G=RS&ywJ)>G@5_1t=4 zy|i9gudO%MTkDzno6`e8u z(aY#<^fCGx{fz#`0Arvr$QW!4F@_q$j8J2^F~S&Wj50!MJE#GAhnp8QqLw#x!G@vCTMUTr-{--%Ma8G!vPL%_L@0(=bg_n3idq(sWGM^i1DW zrZxjJ#7t%;H&d7?%~WP;GmV+nOlPJyGng68OlD>?i<#BTW@a~Ym^sZ{W^OZ&nb*u` z<~IwN1z zW^J>MS=X#*);AlN4b4VoW3!3b)NE!pH(Qu3%~ob>vyIu-Y-hGNJD45KPG)Dbi`mue zW_CAwm_5y2W^c2P+1Ko6_BRKZ1IidHK&==%^BuQbCx;VoMX;4=b7`(1?EC?k-6AhVlFk8naj-;=1Oywx!PP~ zt~J-0>&*@3Mst(7+1z4oHMg1D%^l`WbC++*%F_nG_61Li^Vka^fVVjeY*na9l& z=1KFEdD=WvWna|A^=1cRH`PzJAzBS*O@68Y9NAr{U+5BRDHNTnP%^&7Z^OyPC{9}d@VMRC* zUPKTPMI;efL=jO%G!b3I5HUq85nIF&aYZ~4UnCF-MIw<{BoRr4Axt5JC2S#uBV6GL zUnrqPAVNekkzAw@DMc!gTBH$aMLLmQWDpreCXrcW5m`kxkzM2vIYlm!TjUXWMLv;V z6c7bPAyHTq5k*BYQCyS|B}FMwT9grGMLAJkR1g(KB~e*a5miMsQC-v!HAO8^ThtMC zMLkhpG!P9%Bhgqi5luxi(Ok3;Ek!HQTC@>uMLW@6bPydyC(&7S5nV+$(OvWqJw-3k zTl5iqML*GB3=jjwATd}B5kti=5h{j@5n`kmB}R)eVyqY^#)}DJqL?Hmiz#BNm?ox+ z8DgfGC1#5`Vy>7c=8FYlp;#mqizQ;ISSFT>6=J1WC02_yVy##w){6~dqu3-ii!EZS z*e14%9b%{0C3cHFVz1aI_KO4Jpg1HBizDKwI3|vZ6XK*eB~FVo;;c9)&Wj7;qPQe3 zi!0))xF)WP8{($8C2os5;;y(S?u!TFp?D-7izni#cqX2U7viOOC0>g+;;ncm-ir_7 zqxd8~i!b7<_$I!KAL6I@C4P%PB8(N*3TK74B3KcvNLFMkiWSw0W<|GRSTU_wR%|Pd z71xSq#kUez39Up{Vk?Q2)G{p75|(Azmb4tpwLHtWl%=h}3bB$|$*mMtN-LF>+Dc=k zwbEJXtqfL1E0dMk%3@`;vRT=!99B*%mzCSfW97B-S^2F3Rza(fRoE(G6}5_4#jO%n zNvo7q+A3p}waQuLtqN8}tCCgOs$x~Os#(>o8dgoKmQ~xTW7W0lS@o?3Rzs_i)!1rc zHMN>q&8-$zORJUD+G=C9wc1(jtqxX4tCQ8)>SA@Zx>?<=9#&7Qm(||&%bZdq+)0$SZl3y)_QA$wb9ySZML>pTdi%@ zc58>V)7oY2w)R+it$o&h>wtC8I%FNTj#x*nW7cu&gmuz7Wu3OpSZA$s)_LoKbw)#qdSpGeo>)(UWxclESZ}R& z)_d!N_0jrdeYU<>U#)M}ck74s)B0uow*FXQ?67t?JG>pij%Y`+Bim8zsCG0vx*fxg zX~(i-+i~o;c04=2oxo0LC$baUN$jMyVVkzFE!(!G?bxpE*}ko8Z3lLUoy<;dr?6An zsqEBt8au6>&Q5Psn8M~}q&Mt3Puq)b??8=h)K58GckJ~5gllCe5w0*`tYoD{v z+ZXJM_9gqWeZ{_NU$d{wh+EPkKy3&)rR8q@8hR9?xxlAEb z%2YD7Oe53EbTYlnAT!ELGPBGgv&w8TyUZbT%3LzH%p>#4d@{c*APdSuval>7i^^iM zxGW({%2KklEF;Uxa?8ZiezLzDAP34p zaUPv1mj~oQc}O0XN90j?OdgjfX~&I#{Ca3VU9oXAcTC#n<8iSER3Vmh&$*iIZLt`pCR?<8;%I*FXb zP7)`nV>qTG9LupC={Sz-c#iKVM>~NN;v{pDJ1Lx$PAVt0lg3Hwq;t|c8JvtxCMUC# z#mVYqbFw=*oSaTBC%2Qw$?N2E@;e2bf=(f)uv5e->J)Q|J0+ZwPAR9fQ^qOllyk~E z6`YDrC8x4e#i{C4bE-QvoSIH8r?ykasq55p>N^dbhE5}=vD3t9>NInjJ1v}+PAjLi z)5dA*v~$`!9h{C%C#SR1#p&vFbGkb{oSsfEr?=C`>Fe}!`a1)hfzBXjurtIN>I`#2 zo#D<1XQVUA8SRX5#yaDi@y-NiqBF^v>`ZZ{I@6r#&J1U!Gs~In%yH&A^PKt40%xJK z$XV}+wiI@_G>&JJg%v&-4->~Z!w z`<(sG0q3A|$T{pBagI92oa4?3=cIGWIqjTr&N}Cu^Uek5qI1c)>|AlKI@g@*&JE|L zbIZBy+;Q$Y_niCA1LvXh$a(BMah^KQoafF9=cV(?dF{M$-a7A`_s$3Bqw~r6?0j*) zI^Ue{&JX9O^UL||{Bgp#Vcl?UcsGI@(T(IrcB8ma-DqxfH-;P2jpfF6%=ncLiL z;kI;JxvkwcZdcR2cSpD*-BIpncZ@sM9p{dBC%6;cN$zBKiaXVv=1zBKxHH{Z?re9CJJ+4( 
z&UY8M3*ANTVt0wV)LrH-cUQP8-Bs>tca6K&UFWWMH@F+!P3~rQi@Vj`=5BX)xI5il z?rwLFyVu?4?spHk2i-&NVfTo8)IH`NcTcz{-Ba#q_l$eiJ?EZxFSr-oOYUX&ihI?) z=3aMixHsKf?rryud)K|^-gh6k58X%ZWA};s)P3eYcVD;i~H66=6-j7xIf)r?r-;x8^#Okh4aFD5xj_ABrmcT#f$1i^P+n(yqI1rFSZxQ zi|fVn;(H0agkB;qv6sY4>KUHt3D5FuPkN5$dY7Pub@}RE9@2Vih9Mo;$8`_ zq*ux-?UnJ$dgZ+GUInkBSIMjFRq?8N)x7Fn4X>tG%d73x@#=c@y!u`Puc6n-YwR`g zntIK==3Wc0rPs=9?X~gRdhNXSUI(wE*U9Vbb@94--MsEz53i@!%j@m+@%noGy#C$* zZ=g5G8|)47hI+%iP;a<5!W-$0@CN(H zdvmKIE%Fw7OT4AtGHY9mytUpsZ@ss{+vsibHhWvVt==|o zySKyJ>Fx4%dwaaS-ac=?cfdR79r6x)N4%rnG4Hr{!aM1m@=kkaytCdp@4R=xyXal= zE_+wJtKK#5x_867>D}^fdw0CM-aYTW_rQDTJ@OuVPrRqzGw-?g!h7kx@?Lvyytm#v z@4ffI`{;f0K6_uhuiiKByZ6KU>HYG4dw;wzepo-8AKs7PNAx54k^LxsR6m*@-H+kN z^kez4{WyMHKb{}oPv9r?6Zwh#Bz{ug@J(O%mT&vgcYN3PeBW2T_5(k}Pv$50Q}`+U zRDNndji1&}=co5G_!<37er7+5pViOiXZLgXIsIIIZaP z{YHLczlq<}Z{|1mTlg*gR(@;0jo;R9=ePGe_#ORDerLan-_`HtclUevJ^fyOZ@-V< z*YD@|_Xqd`{Xza%zv5r@uld*g8~#oImVev7}|C9gO|Kfl3zxm(&AO27Om;c-UT~#;LUG-2sRWH?B^-+CQKh<9iPy^K< zHCPQ%L)9=9s)nl(YNQ&aMyoMutQx1rs|jkNnxrPHDQc>krlzYIYNnc{W~(`Bu9~Oj zs|9MITBH`MC2FZ!rk1M}YNcAGR;x8?ty-tns|{+S+N3tCEo!UUrnajcYNy(zcB?&V zuiB^fs{`twI;0M(BkHI+rjDx<>ZCfQPOCHOtU9O8s|)I)x}+|vE9$Dcrmm|S>ZZD- zZmT=$uDYl0s|V_#dZZq!C+ewsrk<-8>ZN+6UaL3it$L^4s}Jg<`lLRqFY2rMroO8m z>Zkgpeycw!j1H^A>F_#&j;JH)$U2ISs-x-XI);v^W9isBj*hG2>G(QC`%nPOH=D^g4sis59xzI*ZP#v+3+Q zht8>U>D)Sx&a3n3{JMZHs0-=Bx`-~Si|OLJgf6K|>C(E4F00Gw^16bqs4MBpx{9u< ztLf^xhOVh=>Ds!EuB+?m`nrK`s2l0Vx`}S8o9X7dg>I=^>DIc9ZmZkr_PT@as5|M- zx{L0ryXo$_hwiC+>E61J?yLLh{(68Os0Zo6dWasXhv`r~T#wKr^(Z}BkI`fGI6YoZ z&=d6}Jy}oDQ}r}GUC+=n^(;MG&(U-BJUw48&Qt8y;*P3TlF@*UGLC4^)9_z@6mhpKD}Qb&V3eOX`8SM@c0UEk0*^(}o{-_dvVJ$+w4&=2(^{a8QIPxUkXT))sS z^(*~aztL~?JN;gN&>!_D{aJs}U-dWrUH{NO^)LNf|IuNButB&Wd=Mds7(@yp2T_8k zL9`%x5F>~g#0p{uae}x(ydZv%AV?S_3K9oNf~0{Fn1Kkazz$^K1a9C3exL#!1VKoU zEJz-t2vP>Ag498pAZ?H?NFQVfG6tD~%t4kQYmhC-9^?pe2DyUVL7pIQkT1v|6bK3i zg@VFCk)UW$EGQn72ucQ}g3>{mplnbsC?8Y^Dh8E;%0ZQ&YEUhx9@GeG2DO6PL7kv( zP%o$-Gzc07je^EOlb~tPENC9I2wDcMg4RKspl#4DXdiS4ItHDB&Ow);YtSv|9`p!$ z2EBsbL7$*+&@bp83yT2P1-!!Kh$#FeVrqj0?sG6M~7sq+oI| zC72pa3#JD%f|bLs<_8Ocg~6g=aj+y<8Y~Nz2P=Y=!Kz?&uqId=tP9o$ z8-k6&reJfhCDhP@FsX0ybIn3AA*m;r{Ht&CHNYA3%&!;CJvR2on-EBwR@NkO(0W zLn4Jl4v7*HH6&U{^pF@KF+*a7#18ozS{|kDzX^AZ8+F+AH%%_Ia`d|YW!C+TqTBuV z*1tt)9rpaa`oB~5_WpD4zg)VI&`SNX{k_5bcSR}I-we9{%|B_Y|IGhuf89TMp#RK2 zm2LmbKVjJa%sFP`IRj?IOqdz7U{?GuWH>wL9GDYx zVQ$QWc`+a6#{yUo3t?d_f<^Ja(u3lhOJGSXg{AS|ki);%WjX&%YW{c1V+E{;m9R2a z!Kzpdt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88aV+ZVrov<@@ z!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1EE^IF7)PI0{GO7#xe^a6C@Hi8u)- z;}o2V({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn z;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky z;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c z;}`sj-|##Bz@PXFf8!tg8#w##gvD?e9wT5xjD(Rf3P#0f7#(9^OpJxGF%HJXco-iO zU_wlUi7^Q#MFUL~XrYY~9dyw{9~Ej0Fa(ofa!i3KF%_o9G?*6CVS3Df88H)P#w?f> z|9geY&N&C>#9Wvg^I%@ghxxGp7Q{kW7>i(0EQZCg1eU~7SQ^Vw}aN>~}I z;6EwR|4cQkjy13**23CY2kT-ztd9+_AvVIs*aVwmGi;76uqC#_*4PHyVmoY)9k3&I z!p_(QyJ9!&jy)Jra4e3)@i+k| z;v}4mQ*bIy!|6B!XW}fJjdO4=&cpe*02ksST#QR_DK5k1xB^$=DqM|ga4oLG^|%2y z;wIdTTW~9G!|k{Ocj7MGjeBq}?!*0f01x6JJd8*1C?3P(cmhx2DLjp5@GPFg^LPO- z;w8L{SMVxc!|QkhZ{jVyjd$=a-oyL&03YHbe2h=_xJ%n z;wSu!U+^n_!|(V5f8sCvjeqdpKDWQm(XiqEeVm2~hvD)6ciVO>er{}xgK;q)#>WJh z5EEfyOoB<#KobR8Xrn|2UG&gLg&G44!DN^mQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^ zm>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z3RcBx zSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1?1Y`M3wFhB z*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bFqV!*K+T#8EgJ$KY5ThvRVqPQ*z#8K>Y> 
zoQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8 z+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H z{D$B02mZug_#6M=e_!;64bT3^@c3`H+~1EPa*l+NF$zY-Xc!%1U`&jKu`v$D#dsJW z6JSD2go!Z;CPf2H6lkH15*>8WLmw4t3@`+fVRB4?DKQnM#x$4~(_wndfEh6pX2vX- z6|-S>%z-&E7v{!1m>2V5ek_0mu@Dxmq= z75^=T`uqD?opTMWiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88aV+ZVr zov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1EE^IF7)PI0{GO7#xe^a6C@H zi8u)-;}o2V({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9o zjkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3? zi+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LO zkN62c;}`sj-|##Bz@PXFf8!tgw@&r%^D}G&_CJQl2pAC~VPuSgQ85}u#~2tBV_|HJ zgK;q)#>WJh5EEfyOoB<#KobR8Xrn|2UG&gLg&G44!DN^mQ(#I=g{d(Orp0ua9y4G@ z%!HXS3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr) ztb~=Z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1 z?1Y`M3wFhB*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bFqV!*K+T#8EgJ$KY5ThvRVq zPQ*z#8K>Y>oQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_z zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$N ze#B4s8Nc9H{D$B02mZug_#6M=fB)DQHX{2U!(#-Dh>T~} z9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@ z9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%` z9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR= z9zWnm{DhzJ3x36K_#J=XPyB_y@els3ng4gfVmJ(s5ilY~!pIl}qhd6SjxjJM#=_Vb z2jgNqjE@O0Atu7am;{refhG#H&_;<4y6B;g3N;28g2^yBrofb#3R7bmOpEC-J!Zg+ zmVx%J$As3 z*a0*UCc&g=pos!4v{9miE_&#rLX822U@}aODKI6b!qk`s(_%VIj~Or{X2Q&v z1+!u{%#JxQC+5Q3mKFp5=upkz~!dL{0Vlga^C9oux!qQj<%VIe!j}@>YR>I0y z1*>8;td2FXCf35*SO@E3J*D!}YiUH{vGT zj9YLkZo}=k19##s+>Lv1FYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ!}E9nFXAP< zj92g~Uc>8n18?Fjyp4D8F5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U?!}s_BKjJ6+ zj9>68e#7th1ApQ#{EdGwOceG%hQsg}0V850jEqq*Dn`TT7z1NsER2nDFfPW!_?Q3_ zVj@h8NiZoIXre$1ZItMsiyr!@P-B20m<*F+3QUQqFg2#Zw3rUlV+PEKnJ_bE!K|1K zvttg-iMcR0=E1y}5A$OIEQp1$Fc!h0SPY9}2`q`Fur!vzvRDqwV+E{;m9R2a!Kzpd zt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88aV+ZVrov<@@!LHa1 zyJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1EE^IF7)PI0{GO7#xe^a6C@Hi8u)-;}o2V z({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn;}+bC z+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$ z*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c;}`sj z-|##Bz@PXFf8!qv6P5js;V?W#z=#+LBV!bdiqSAS#=w{u3u9v(jEnIwJ|@6~m{ zVlK>$c`z^L!~9qP3t}NGj76|07Q^CL0!v~kERAKbESAIaSOF_yC9I59uqsx=>R1D7 zVlAwVb+9hh!}{0&8)74Dj7_j9HpAxF0$XA$Y>jQOEw;n<*a16YC+v(}uq$@M?$`r+ zVlV8CeXuX~!~Qq`2jU6G62FKz!9FG%lB2L1|I0dKTG@Onz za3;>e**FL1;yj#>3veMW!o|1*m*O&9jw^5_uEN#02G`;`T#p-YBW}XYxCOW3Hr$Ro za3}7<-M9z$;y&Du2k;;s!ozq3kK!>rjwkRWp2E|32G8O-JdYRfB3{DFcm=QGHN1{D z@Fw2E+js}>;yt{N5AY#A!pHaopW-uojxX>fzQWh|2H)a4e2*XSBYwiq_yxb>H~fx2 z@F)Jl-}nc^L}UMBI1G;wFd|06$QT8qVl<47F)${^!q^xG<6=CFj|ng#Cc?y+1e2nH zCJMCBMu`r(=%J4aH3k@h$uK#lz?7H@Q)3!Ti|H^uX26V?2{U6B%!=7CJLbTgm;O(V-YNh#jrS*z>-)BOJf-us$}xhS&%jV-swO&9FJPz?RqwTVoq+i|w#IcEFC<2|HsK?26s6JNCey*b94O zAMA_$us;sKfj9^U;}9H*!!Q(w;|Lsyqi{5i!Lc|F$KwQ?h?8(KPQj@-4X5J_oQbn= zHqODhI1lIJ0$hlTa4{~yrML{2;|g4ft8g{0!L_&!*W(7kM!LxV{&*KHWh?np(Ucsw)4X@)3yotB) zHr~Ozcn|O61AK^&@G(BYr}zw?;|qL=ukba#!MFGh-{S}Th@bE?e!;K!4Zq_L{E5Ht zH~zu@f_lS7|KEE+I1G;wFd|06$QT8qVl<47F)${^!q^xG<6=CFj|ng#Cc?y+1e2nH zCJMCBMu`r(=%J4aH3k@h$uK#lz?7H@Q)3!Ti|H^uX26V?2{U6B%!=7CJLbTgm;O(V-YNh#jrS*z>-)BOJf-us$}xhS&%jV-swO&9FJPz?RqwTVoq+i|w#IcEFC<2|HsK?26s6JNCey*b94O zAMA_$us;sKfj9^U;}9H*!!Q(w;|Lsyqi{5i!Lc|F$KwQ?h?8(KPQj@-4X5J_oQbn= zHqODhI1lIJ0$hlTa4{~yrML{2;|g4ft8g{0!L_&!*W(7kM!LxV{&*KHWh?np(Ucsw)4X@)3yotB) 
zHr~Ozcn|O61AK^&@G(BYr}zw?;|qL=ukba#!MFGh-{S}Th@bE?e!;K!4Zq_L{E5Ht zH~zu@0{X+oVEVSG%02{92S#w3^&4Kz`p zg*Hlb&_xe@RH!k)5KM;2F$Jc?RG1pmU|LLv=`jOl#7vkOvtU-thS@O(=EPi>8}ndZ z%!m2002ahTSQv|7Q7neVu>_XHQdkdaX20);6$8+lW_`8#c4PlXW&eng|l%E z&c%5+9~a<4T!f2p2`Lkg}ZSN z?!|q$9}nO`JcNhw2p+{_cpOjQNj!z8@eH2Db9f#v;6=QIm+=Z-#cOySZ{SV5g}3nz z-o<-(A0OaDe1wnj2|mSV_#9v0OMHc|@eRJkclaJZ;79y~pYaQR#c%i>f8bC2g}?C+ z{#VQsHYWQY!(#-Dh>ta2u zj}5RPHp0f(1e;q9kCAPxW*aq8TJ8X{~up@TD&e#RJVmIuLJ+LSC!rs^i`(i)r zj{|TZ4#L4W1c%}<48`F%0!QK~9F1deERMtRH~}Z(B%F*>a4Js2={N&t;w+qvb8s%s z!}+)X7vdsZj7xASF2m)x0$1WHT#ajREw01$xB)lfCftl$a4T-Z?YIMX;x62cdvGuA z!~J*w58@#_j7RV&9>e2!0#D*8JdJ1YES|&jcmXfsCA^GR@G4%z>v#ii;w`+5cknLW z!~6IEAL1i?j8E_>KEvnu0$<`Qe2s7LExyC|_yIrSC;W_G@GE}9@Aw0M;xGJ-fAGIT zzOb>_{}>)4U_^|BkueHJ#b_8EV_-~-g|RUX#>IFT9}{3gOoWLs2_{7YO%!OMjS?Mn z(L)~JeU{rVSX%t z1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCku@=_GI#?I$VSQ|X z4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNe z191=z#vwQqhhZoV#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxRpo<>* zs8C~oA(#x4V+u@(sW3IB!L*nT(_;qAh?y`mX2GnO4YOko%!#=$H|D{-m=E(~0W64x zurL7)R4Xa}ftckU-HrBzqSP$!C18j(m zurW5lrq~RdV+(AFt*|w=!M4~A+hYgph@G%AcEPUL4ZC9x?1{awH}=84*bn>T033*e za4-(Rp*RdfaX5~^kvIxR;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8 zxCj^H5?qSQa5=8PmADF5;~HFx>u^18z>T;GH{%xEira8I?!cY63wPrl+>85gKOVq? zcnA;U5j={=@Hn2plXwbG;~6}Q=kPpUz>9bZFXI)wir4Tu-oTr93vc5cyo>knK0d&Q z_y`~46MTx#@HxJ~m-q@_;~RX7@9;f-z>oL|KjRntir?@%{=lF33xDGu3=`*ndp|6O z!|)gZBVr_sj8QNuM#JbB17l(=jE!+HF2=+7m;e)EB20`)Few^nqCg96l<1&~9{Q+I zV}K!;43lFDOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5) z7Qv!e42xq4EQzJCG?u}#SPsi$1+0jburgM`s#p!HV-2i{wXinU!Ma!v>th3Kh>fr@ zHo>OY44Y#MY>BO~HMYUF*bduc2keNQurqeSuGkH`V-M_!y|6d-!M@lJ`{Mu{h=Xu2 z4#A-~3`21^j=+&P3Pcz=gO7 z7vmCKipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q z591L$ipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!w< zALA2ziqG&lzQC9G3SZ+Je2ee!J$}HC_z6Gb7yOFf@H_s%pZE)Z;~xwYm;I07Fg!-U zh!_bYV-$>v(J(s3z?c{dV`ChQi}5f%CcuQ42oqxxOo|4YD9}P1B|7M$hdwIQ7+?q{ z!{nF(Q(`JijcG6~ro;4@0W)GI%#2wuD`vy&m;-ZSF3gR2FfZoA{8#`BVj(PyMX)Fq z!{S&1OJXT3jb*Sbmc#N`0V`r9tc+E#DptelSOaTfEv$`ourAia`q%&)Vk2yfO|U68 z!{*omTVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb# z!%!TKBXA^+!qGSe$Kp5~j}verPQuAJ1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb z;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F z;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alcIqp3bfEhi4MBxp^pkR1{i|LFgd2c zl$Z)rV;W40=`cNJz>Js)Gh-IairFwb=D?ho3v**0%!~OjKNi4(SO^Pa5iE+uusD{$ zl2{5$V;L-q<*+*1(!r3u|K?tc&%qJ~qIH*a#bA6KsmjusOEC zme>kgV;gLX?XW#|z>e4nJ7X8@irug~_Q0Ol3wvW9?2G-dKMufwI0y&h5FCobFcgR5 z2pox{a5Rp=u{aLL;{=?DlW;Ol!KpY6r{fHqiL-Dv&cV4j59i|oT!@QsF)qQSxD1!$ z3S5b+a5b*MwYUz~;|AP_n{YF3!L7Irx8n}niMwz&?!mpd5BK8%Jcx(zFdo69cnpu@ z2|S6X@HC#mvv>~A;|08km+&%P!K-);uj388iMQ}J-od+g5AWjxe29*~}o?|(V<2iv7If*AQi<6nn6L}J+a4M&9I!|T}XK*HG@f4oQ*_^|I<=2mj<>49Xb) z88K!a=4C$SX8{&uAr@v47G*IOX9<>MDVAm#mSs7XX9ZSdC01q?R%JC-XARb5E!Jio z)@3~&!uo8$hHS*fY{I5&#zWbhhp`1)vK3pi4coFE+p_}?XGeBoXLey%c4K$;U{Cg9 zZyv!T*@u1EkNr7-M{ytraWIE)D2H)4M{p!Z@n{~yV>z0~aSV^=SdQa(PT)jN;t9;+ zWM=b3p2R7f%4wX=lbORAoXJ@{g{N{h=Ws4hKd@kf7 zUcd{vm`k{n%Xkql<|Vw8mvK2S=L%lImAsOxcokRkYOdimyq4GTdfvbrc@uBuExeVt z@pj(9J9!uH<~>}?bzIMTc^^0Mem=mBe2@?EVLrk~xrv+k7`O0oKEbVgl27qzZsRlD z&S$xU&v7T8=L>w1FY#sW;%>geSGk9;@pZnzH~ALd<~w|s@9}+pzz_KmKjtU=l%Mf) ze!(yK6~E>;{FdMGd;Y*5`4fNUFZ`9i@pt~gKlvAfvc`W#jG2ddnUDEdfCX8Ig;|6} zS&YS5f+bmsrCEk$S&rpdffZSam05*VS&h|MgEd);wONOCS&xUXJ{zzh8?iB)uqm7I zP&VgbY{8an#nx=Ywrt1t?7+j>k)7C?UD%b~*quGtlfBrRNAO7YVPE!Re-7YL9LPZ& 
z%pn}gVI0m89LZ5Un#b^1j^=S3!{a%Y<2arZIFXZh0<$=o**uXaaSEq$8mIGQ=5PjQ zau!eFshrI@oXgXAI&*mj=kZLQ#j|-1&*gkB;CVcs3%Q6F@Io%;5-#O3Uc`%e2`}Yk zT+Yk6f>&@QujDFT#nrr;Yj_Q><#oKCH}FQ@#G82wZ{=;gopzQwos4&UW_e4iiiLw>}M`3XPeXZ)OB@JoKhulWtX<#+s^Kk!HX z#Gm;Kf8}rdoqzC8{>7l2@t+Z6=3!puV}2H3K^9_R7GY5qV{w*XNtR-1mSI_zV|i9! zMOI>ER$*0EV|CVGP1a&<)?r=N;~}ii25iViY|JKX%4R&2&3PDGuq9iuHQTT)+p#@6 z@NjlyCw68Rc4aqqXAkydFZSjUJd%Cbm;KnE19%h%au5e|2#0bQhjRo+auko|F+7%| zc^t>^c#h>bj^_kU6x}I$yRL5Hf+mwY|jonoE_PTo!Nz5*^S-VgFV@c zy?F$WWFPirKlbMU9>swi#K9cGp&Z8H9Kn$s#iMx)kL73{$1yyfV>yoFIe`;7i6=0N zlbOvEc@n2^DyMNePi78ha3*K*6rRf2oWr?1ji)o0XK)_Rv;ojU*#UY#@G1<-{f0-oA2;lzQ_0Z0YBtN{FtBcQ+~$J z`31k^SNxja@LPVz@A(6NF8ohkeCELBoX0bH7SHB6JeTvifamdiF61I!zzeyUOSqKFco8q=CA^fE zaXByN3SPmLyppSU6<70WuHiMjme=um-oP7q6L01%yp^}{cHY4|c^B{IJzUFmT+e%X zA2;xRKERE9kPq=;KEg-2iJSQtxA1X3!L59fPw{DP<1^gOXSsvVaVMYW3w)6;@n!Df zZoa}-xreXub-uwj`4->iJA9Y#@qK>45BU*4<|q7=pYd~k!7uq0zvegmmf!Jv{=gsk z6MyC}{FT4)cmBaY`4@wV#(zePnTL6qkNH`E1zCuNS%gJdjKx`kC0UB4S%zg;e3U*t=CnY*}~ukcmw;cI-IZ}3gN#kctm z-{pIJpC9l;e#DRY2|wj${G4C#OMb<#`3=A2cl@3|@JIf{pZN=aushq~?JefJ1!I_-JQ+O(8a}MY7G@i~}p22xMlV|a4p2KrFp9^>%&*wre z;sv~ri@AhLxr`U_VqU^ac^Q}Ua<1SNT*)iBidS(pujU$F!)tjRujdWCkvH*X-ojgX z8*k?wypwnFZr;PST*vjim-lf4@8<*D$OrilALb)`l$*Gjk8ukh=M&t@C;1eg<~BaV z?R=Iy_#AigdA`6G`4V5|F7D$3qHvJo4z37fJR z4`p*6#ujYJR&32SY|D0R&kj7C9odPU*@a!%josOUJ=u%Bc?6GSANFNG_U8Z|#ep2e z!5qS&9LC`s!I2!rqj?OE{0cO}v@6@K)Z&+j$4?kJTYksy`2&CC zPyCs`@K^rE-}wjsZff zn#XYrkLOs9<9JTsL{8!f%;IEb^F*G+DV)k_oX(S(!x@~(Sv-ZOayI91E>Gj>%;g!J z$1`~r&*nKim-D%R=ka_lR#65Ak6>!biD@ zoB0^G@NquDt$dPC@o8@3Gu+N+xr5JfC!gmFe338lW$xl`zQR|zhp+KeSW|X`4K6?z#sV&f95azmA~N!`YFY*qL3}mEG8#J=l}I z*qcZ2NcLf0_G5nz;87gNK^)8>9LixF&Ji5RQ9PQ*@K}!KaU8?rIhNx%o)b8clXwEN zIGNczktcBqr*ayn^JM0524`{>PvNPY%{iRQ(|9^_c?Re4OrFKFc@EFzd@kU5Jf91> zh!^leF6I(0Oyq-7kM&87mc?)ml zZM>a#@J`;vyLk`Savj(6Uf#zIyq^znBOl~Le3*~$QEuX9KE^G4oKJ8opX5_~n%npc zxAR%<;B(x`=lKF(+ySSUL@Kx^NYkZw=@J+tOxA_j=<$HXeAMitd#E?lW;RdcNu0u|oW|)qnK_)n znViK_cq(Ue4(IYTp3YpJ!FfEBXYp*F!*e;G3wR#S=Rz*x1-y`pxr9r(j2H1@UcyUx z8JF{NuHY41$t$^vS8+A3<{DnZYk3{7=MB7(H}PiP!drP8Z|5DnlXvlM-ov$A$Mw9I z_i+R7=L6ix2l)^m<|BNRo4A>eaSI>k6Wq!t`4pe#Ha^4ce3m=-9Cz}0zQ7mx5?|&n z?&d3em3#OaU*{WqlW*~DzQcF<9^dB&{E#2_&DvjH2j5gW4!o3a@XWpf_J7Hr8@Y|S=o%XVzf z4m_M4*@>Omg?WG&Wa z9oA(%9>V%;z=mwZ#%#i-Y{o;`oQJUmTe1~fvklv_9ow@54`)YqVrOfJbp42XQcma43gyI7e_KNAYML!(%y`$8ijg=U9&8cuwF%PT~p7 z;$&v?M4rScoXTmO&XbwL8Jx*kJcXxnHs^3IPvhy#^u@8&&R%XM7OdwCx>@P0nPjeL*~@nJr~N4bfc`53qGaX!JVe3DP`X>Q{) z+|FmYgU@j%pXUpFkuULO?&5B~!dJP6ukm%h!8iF9-{w1fm+$d?e!vg;5kKZ9{FI;Z zbAG`u`4zwBH~g00@q7NjANdo1<}dt}zwvke!9V#IgPK9+6Eb4VJj}~{%+CTW$U-d4 zA}q>cEY1=v$xIS*qCwqz@|W*fF;JGN&B9?p*J#Ln!(uI$F{?7^Pw#oj!EN3sw5vLE|%0FUB8 z4&q=A;ZP3aaE{S-pzZsmg~5l z_wqh&;Qf4n8~Gp~;=_D|k8%??^D%DW<9vc!`6Qp>)7-{qxSh{(2cP3kKF=5UB46Um z+{N8|g|BiCU*qe1gKzRJzRh>|F5lz({D2?wBYw#`mXVSP4WLpEY#HepjX?v?8Cn7$Nn6^qd1U*IG95?l*2fjBRG8^B%6{I~UK%*XsJz=ABq!Ysm~EXLw2!ICV+(k#QWEXVS!z>2KI%B;ewtj6lB!J4ea z+N{I6tj9xGpAFcMjo6q?*p$t9D4X*zwqQ%PVr#ZxTef3+cHrUc$WH9cF6_!~?9LwS z$zJTuBX}hHurK?uKL_wA4&)#X<`53$Fb?Mkj^rpF&0}~hNAoz2;qe^HaU9PHoXAN$ zfmxi)Y@W!IIE7O=jnjEDb2x)DIg6+8RLa372viFXF|#gqQL%F6ZT3!7I3uS8^4v;%Z*aHN1w`@;YA68+apc;?2B; zxAHdL&O3M~@8aFOhikcx>v=El;|AW(2e^?B@*zIVNBAf=aWfy|7Cz1=xRp=xDL&0@ ze1_ZkEO+oZ?&R})fiLnUzRX?R%~$v;_wY5o&Nuib-{RYRhwt(|zRwT%AwS~B{DhzK zGk(r5_$9yM*ZhXx@;iRdANV7G;?Mkrzw$T!&Oi7k|6)+b_|J$j^Dr;-F+U5iAPccD zi?Aq*u{cYxBulY0%djlVu{##2C@etN$12$wMHf9qx zWiuYi<~)oo*pjW-nr+yY?bx0jcsM(<6FajDyRsX*vj=;!7kl#v9?3rJ%YN+70X&KW zIf#QfghM%u!#RQ@If_T~7#_>fJdR^{JjZez$8!QFauQEq7AG^CC-Njt;Z#oJbe_x{ 
z&frYW;we0pvpI)zc^Xe=F3;dRp2@R#HqYU?oX-V3kLPnC7x4mK$i-a3rCi2~crh>G zrM!&Gc{x|`3a;dpT*a%nnpblTui>@4j@R=B-pHGHGjHLoyp6Z>4&KSTcsK9iTCU@I z-pl*Af%o$PZsdb}h!67-KFUqp%*VKekMjv`<&%7hPjefe;dVaD9ej>E`8;3Xi+qVM za~F5>6~4+ne2uU34Zg{@_%`3+yL^xD^8Y#BGk@W){Eff!5B|x&7}PcXGh)m<%*%Yt&jKvSLM+T8EXram&JrxiQY_6fEX#5% z&kC%_O03K(tjcPv&Kj)ATCB}Ftjl^lg!S2g4cUl|*@R8mjEAy04`U0qWGl928@6RT zwr2+(&W`NF&g{aj?8ffw!Jh2J-aLXwvJd;RANz9vkK#ZM;$RNpP!8j8j^Id+;?X>Y z$8t1};}{;#u^h+ooWO~k#1ojs$;{@7Jc&~{mD4z#Co_jLIFqw@3Qy&1&f#31#?zV0 zGdPcD@+_Xsb9gT2a{NA%u9GFFXM7v&K10ZD|sbX@hYz7 z)m+1CcrCBv^}K;M@+RKQTX-vP$slx@;+|h{d|BM`5+(S!+eB~ zauYZ6F>c}Ge1co~B%k8b+{S0PozHRypW{wG&lmV2U*gN$#oc^`uW}Dx=q{DNQdD}K#y_$|NV_xyoB@+bbxU-&D3*n%zDimlm(ZP||P*@1_%BRjD(yRa*}u{(RP zCws9skKmE)!@lgt{v5!gIFN%lm_s;}!#JEHIFh4yG>_r29L?i6hR1U($8kI-a3Ux1 z1ZHtEvw0#<;uKEhG*0Kq%;5~qtC@JOALH{ENXM#(zePnTL6qkNH`E1zCuN zS%gJdjKw)~Tvo*avla}_*&GJJtZ~EgPM$imdF!0VlCA6w$)dqoIa?BPa~@BY^|Lo7 z%d9e4W%~y6hlS(L8=Uh*n%p;B6nz}e&weP`SU7ZN{`8i@APV9jPmnjr7vv8L1O7YzdHYgXA4=Mx|gGxc=ph{3Rs1{TYY6LZdT0!lgPEa?f zm&B-k&>(0SGzuCAO@gLDv*6I6d2m?JB4`=33R(wkg0?}spncFGI6UYWbP75LU4pJb zx1f8_Bj_3Q3VH`e1V;vag1$k&pnotRI4T$z3*2ObSj2vVzG$c5q^FQZOZ$8cYkO2PX$P!Hi&LFe^AEI5n6Z z%n9ZOrv;}6xxpF1yx`2>tl;e6oZ#GGey|`oFE~F~7%U1d2rdj32TOvb!Ls0@;Nswt z;L_l-V0mzPup+o3SQ%UytO~9QRtHxHYl3TnYlG{8>w_DD8-tsIn}b_|TZ7wz+k-oT zJA=D|yMudzwZXbzeQ4}uSakAjbbPl8W_&w|f`FM=tP$1>YlXGLI$_-3>$@w!zN+VuvvI$*gQNeY!S8$ zTZOH|HeuVaUD!VC5FQ?O3_FFL!!BXhuv^$Y>=E`1dxgEjBf=xYK4IUmU)Vn!5FQl{ z3CEG2!vy*l=7pKAaFv3@3#rgjwO_FgrXk zJSm(KP7SAp)5DX)oNz`sGn^Hk5}q2)4(Eh(!_&gk!`$$Ua9((3cvg6Jcush3I6qtv zo)?}UE({li7lapvi^C=1(r{ULQFw89NqA{^S-3pBJX{f85v~lc3|EC$g{#A>!!_YG z;kDs);q~DS;f>)<;mzSK;jQ6q;qBoa;ho`K;oadq;o5LrxIVl$yf54k-XA^?ZVVp` z9||819|<1~H-($S$HFb)ujJ*pAajA})-qdHODs9tnP zR6lAEHH;cXjiV+})2LZ=Xw*D9ENT(8j9Nvlqc&07s9n@P>JS|sb&NVioue*M*Qi_6 zJ?atljCw`Aqa>qdrmJs9)4S8W0^74U7gwgQFqQ&}djRJQ@*=j7CLAN5@3RMx&$S zqA}6&(b#BQG(MUTO^hZ*Cq!A%1V!Mbo2`qnv0)G&7nNof4fI&5q_o zbEDIu)1%zzjA&kTW^`6`c63g3ZZtny5S*BH@Yv{5Zxa=5N(Vej2?;}jvk2~jW$J_qsO8x(c{q*(bnk6=&9)G zXj}A5v^{z@+7Ue$?Tns}UWi_dUW#6hc163RSE5&=J<)5?>(Lw0o6%d*+tEAGyU}~m z`_Tu{htWsT$I&Ozr_pE8=g}9@m(f?z*U>l8x6yae_t6j0kI_%j&(SZ@uhDPO@6jL8 zpV42*qd$zJIF9qgdE{Mf z6jzR`#8u;JarL-HTr;i}*N*GNb>n*RA#we~V#WTlcKb#Kk*^ea4tSVV`a&xvO zYxT2A|Lbkw$u!N|K-M7(hjyvH_w8oy3&Gy?NK(u^%*%YtyhSAI1#A~&Ar@v47G*IO zX9<>MDVAm#mSs7XX9ZSdC01q?R%JEryY(8jYw~~}ekQLtdp`$E2<>|@OkREVej2bL z8?iB)uqm7IP&VgbY{8an#nx=Ywrt1t?7+j>k)7C?UD%b~*quGtlfBrRNAO7YVPE!R ze-7YL9LPZ&%pn}gVI0m89LZ5Un#b^1j^=S3!{a%Y<2arZIFXZh0<$=o**uXaaSEq$ z8mIGQ=5PjQau!eFshrI@oXgXAI&*mj=kZLQ#j|-1&*gkB;CVcs3%Q6F@Io%;5-#O3 zUc`%e2`}YkT+Yk6f>&@QujDFT#nrr;Yj_Q><#oKCH}FQ@#G82wZ{=;gopzQwos4&UW_e4iiiLw>}M`3XPeXZ)OB@JoKhulWtX z<#+s^Kk!HX#Gm;Kf8}rdoqzC8{>5Y_AoB?sF=ig-Wj^L-0TyH-7G@C^Wib|K36^9j zmS!22WjU5-1y*DwR%R7eWi?i34c25W)@B{nWj!9k`fR|4Y{bTF!lrD-L)n~%u?1VQ z6F8ohkeCELBoX0bH7SHB6JeTvifamdiF61I!zzeyUOSqKFco8q=CA^fEaXByN3SPmL zyppSU6<70WuHiMjme=um-oP7q6L01%yp^}{cHY4|c^B{IJzUFmT+e%XA2;xRKERE9 zkPq=;KEg-2iJSQtxA1X3!L59fPw{DP<1^gOXSsvVaVMYW3w)6;@n!DfZoa}-xreXu zb-uwj`4->iJA9Y#@qK>45BU*4<|q7=pYd~k!7uq0zvegmmf!Jv{=gsk6MyC}{FT4) zcmBaY`4^M#0rrmnj2JTy^D-avvj7XS5DT*ii?SGtvjj`B6ic%V%d#BHvjQu!5-YO` ztFjuavj%Ij7HhK(>#`mXVSP4WLpEY#HepjX?v?8Cn7$Nn6^qd1U*IG95?l*2fjBRG8^B%6{IaU_&-yV>V$^Hshge&coP(E!m2#*@kV|j_uijhqEI)u`|1{E4#5fd$1>au{V$4 zk?h01?8p8bz@s>jgE*K&IF!RUoFh1rqj)rr;jtXe<2Z)Lb1cVkJST7>C-DSkaWb=c zB2VHJPUSRC=gG|B49?^%p2AZ(n{zmqr}1>=@(j-7nLLYU^BkVb`CP#Bcs>_$5ij6{ zT+Ah0%4NKW7xNNc%FDQ%mvaTL;7VS}RlJI;c{SJY8eYrmcs+06jl79B^A_I9+ju+g z;GMjSck>>u&+`So$d~vscX2mg;j7%k*Z4Z$;G2AlZ}T0#%lG&`Kj4S_i2Hw`mrjHvztGE* 
z{6ep7PC6U1?+1G6j7V-yIv;Y7ALymiBk5Q2VgC8q=~UIiVflkFiu2^ncffc0*+VnM zq|+z|eyyKQsq8CbL|Vq^oOE`j{;+Ut@+-h$;rLUvN6}GHcwd zVfkiE%$YoKMou~dlbe&yztqoaF@E~ksT0P|XgYpc&cvpBGZY$<$yX>RotjC0c$m)2 z@ z=H{exHTAQq_YJaY^bNCW_RZ_Zj_KUZ*wH!Z)Xl$szmU%1WNtL?{N(2g!|LySa|wbZ z`Cp#{w`Yz^N=qkx_MSGK`Z?HX)2X1bxjE@fP+Rfo0Co_)z402mz{?0UvW#Pn3A}qGfY`kvX>|1=A_e1^|Mzb%h5UMR8#$|GTAE= z_y6)Uop%x|4J&UFs4z^b|A3t7%+!H7!=w?;95*q0b+T`CPC92*KRcbV%uc5(CuU!p zNlvG*>hJxvNji&lV6#kTv@+#Jnc6CmY-T?DUOdx=m?<>X+d9!~ts&oR)?PqoP zN2A(X;@6!O-DR%@SfWd&QKhqP`|Bi~dfQhg>Ga!w)JZxQcVMIsUMJ~n-GMp(T_@>0 z-a+aloz_z)K@#W#0~w}i>Eq`AQYTT8d2gM>2PrDgzM}qL>LhP+&_C)VU$U9>xcmuv zT!FOIy*(~%?|NLpBsDjuP-fqM_PE07;h7$n*5yHaTvnTX-K|KbsG^yo4&L30rTa$b z6i?XuyeN?-9ON-oGD%NbRH-!MKYw&xI?bO3IeAR|-&@qr^D-@}Oj=sz=~y-`<{+mn zm!u~_E1w`}71Er0gO*;kpjAv(b8{+X_WftjDyJDUK}#?9pg|j!cf!muGw0@1Nwa!d zRZZ4&bE+lF|Ms-1p6(l+QzK#Tebh`74ssv0lJq2ewbP7wv-0KU)X8Klko9%_tY7-9 z+4$FAeS=|fzy3pW>Lxp84$i5U<}8?1C>cr)Nmg@n>Sqopx-hF)R`I@J@&pc&=Wl~F zSILE0rE-UbQ~QRq#vSk@xQ6LLYNtZl-)oG0yR*43Tc0{$*-z1~ z#H(svIA~X?{o`h%q=4kQ8>i>VR8*6+#Dm;m(==mlPO}6Rb!eJ%Z$+i?QBlp4)!dxJ zGW-6sqFSUGGZmGd_@EV)-7xJcYN%zVv{sqY4&K09r~5|dv`N?-__k?6R)GP6+b-G6 z&1s)WJRrCo(qsOI;2xeHlm@rZ{H!Fn1A}1e$9wW6!3@gIoi;3WLg75ur>#O0TyJYk9JAVnrX6K)6@MOxo*j3Zcg`1;sKHC zksk9uM6PFg(ElHi>y?}+X_dVbw8|sWQ}1n+X`9e0k4#o`bNXcV{j*gT%Dm2v%<8^Hk_SZ2h-5ompW#ulD4d-cqdx{eKu7y>6Y9mbUMz zr=%qvaMe@)E`47!n3g8(y}Hcuf4u7HN#>;LPEJsDIqBhht1gY1s+*Cl=H|@I?EBBE zo0VqFR9$+v2d}!kQzp-zI3eehG^@%xHCfNinVl^ETjkA3_l?e(o3OVDo|YyYq???c zq$knOO*8(po1Bs6|97p(3A^Vo3+Ix4A zYi~z5D=j?J5zbDFILK9;lcXonJU1<1Z%3G)=G+_2^d>}eL9&{gb6#fOe@65CG-D>3 z>D3=Rn$d)rISbQ#Lb)he&&|0YS^l?BUYPD1owGP$?@cU86Ap3{OOx~@l*`hL{|x0t zY5u*TOozJUkJ10zP?vd#T%4AciQOe>F$X#ArAc}cyUP;9Zh4w>Z|u^m7Q4%n)!dvF znSK8myDQR+nb@V5d(hZrum8u(!^%u)S7u5(_>+27x^HyORSA1W-22M#6c@#*08*ja>h=Xm~(fgq%D6WzBQvuwI_JJ*eeYbv z(lj2K$(cdPT9SD~nr(E>{VDq&u@7Xn{{4?gHl}InNt_={lm7D&`*)}xN-`u7PX1ZS z!v`txk$olpmua13cjj~8JKO#@@<)>#6G!K4%9NdXFWa11Es$L=F{|K!1DeWX>E3^K z-Tklr`~Cztxcg7IB`r2{qR021DEXUZSlqqm(3~fd?fxZ-rGL4kn_H93HP_Ym_`3Gg;r?pEa$ASrgMa#$@if==iZS$0YN%VZy8SFoOIuO2zivNIlm7WA`1>L7MY3z} zWA@8TS%1Gc?;A3{O45@?^>t?VKgZf{(v>F}^1o%^lenXrHEH_4ItBm>5Vy-&1n zlce07@6tp6bF}$>Uy1*R(dLIV?*WgHAOCT@r{GVSGyS*G=I3PZuzX|2j~$;iG3S?L zBRA*QG%3&236rPf{Fbct4QI?6m)!pE$@<@M|0A>Y@3{Y&re(shKkk1eNqb*!|8B!U z^1$3*e_^t5;7vqn+P}+*lcWRtW}YN1=?i%i^o4xsfqVNx+Di0={Ap@#PJztMzx%@9 zExce_LZ&mM6`Hj0LI-5s-)#!-&zaWZz6PGvVqbGElI+jTDVjMv^PciQH0NSzu4Fzh zWq&wIWVZet){<%3zngQZH0j{YxpcBCJN>32sj@Q3`oCTd%kD2KnFY)=*#DS`IIsa9 zxSia0=98Qu^Eq%k^B!3)J@LLjh{~rYJ)oVYQ{@NTQ^hpt;P+H1*_FKKR8COSRnp9R zYdQ^}ny#9xCbNK;>5z7_a+-v7M6M3=q$&Gj}+PrR@88l@*ap!OQ?zmWYMy-AvM@Ox^S>`H2{S%TU- zG|jxX_R;{Vz2?bkGT)flcaYj^k>>E2BxvObM+rCH!v{*gDOX%r>U% zzuC5#t$$ZwyEN_J71%yaI{3|YNOmQ~El;{fZqDIpKHr9QOxF+mj;m9en3kP)-yawM z?c1=<2gw;F|6ju2--dO`oNP^U;_UPv4ea|itZU}H`)adWX6xTK*nMBoe-D>EGG*-_ zGcto^c3O1$1y;{w=l+QGI!J-N_Z657Vg{ijG6kkz)s0NQ6q}iqGjU?hkxAm-Z|>6f zt;|)Y<@ZUF|M6u^-*j_yPQSE-e|3lcX-XQrG=2;IA7^d;RnmZemvpf2aQ8jtlAM{( zzOVR__$Tv&nQPk@1buZyWl1XdIHH?GNqHgOoGuKg$`uubh9p zZjMOOlA&>Af-W~IJ#cTAOPjndcXXPXn{!NN=l(wT_iN&@Y1T}iOM{(^jH3_8`S&1t z-2SX-H1`dS*<+Ky?R&nCNp|Gs9G@xTf9sxO(_Eu-{=Z(XGdhbRiiS{>hy?*fMMXqF z#0Uw!C}72?2bI`A6)~tQ5FG(gS;dB6K`h5!u=ifDH|*FedXD{wU9pS(-Zy=BLk{3E zKfbqf=k43wnLBr8-z*$J<62~ll;V~>kQiCu$yP@w9G?2vFZss8K`;3Zl8AoEXSezf z29AzBgpT2gFQLC&O>=HW?G!@@V;YiRE3s{dYRC4*b}+W1v7LHsolX0A?Bk|k#o4^!$mDEu*?&*(~QcR9xh>1kF zr=JQaJM2fWrWOyP9qjK(CL7(4Ri)D zFk%*rVZ@m{Q8R+wU>GqQhp9SC>Kr4yoSaQhp#&SxQKFSiBuO6W96A|NoQq>58}ZvX zDbAx;Nu5vQp7{bP#iY29nCTnSxEB!$hlJOPi&3R>M3)rAy41#6)*R7g2moSTF0mzN 
zGDi*@$`M@=Tu^Tl7w3fhwy~~v8I3CuKqqIeG8Zb&0$rWGP$StNn;mi&x&{I8WY@}7 ztVCTWrMN$?w^#D1m>U3PId~&5*RMAf#JM>i=N20$m>X{ehPm-J7{le;d7|br*<-kT z2hmjBDU}6ezKgCxW;QA2#=A3~er~+Sxw28*+&DRy8}CI$s^&_F#^%QR=v7kp)3_FS zKuU4TK1hsAZ|3ChAwnV3<=AYNdDcG+H?4?|NL;Uoc7}WuIJ);SsSc;e$BBrfEakF{ zZy(WQ`$Qb6XjxWf{;;r&{xXr~#gX*B$X9a~&u-hgtZZP_tPf~s&g)d!4tBw)xl?Jo zF07f~u5~8TlTjqQynTx39$XWSt*`H;o<`A@s?;+Iy#JWA*TH0yjE z`M!NYN^#%5NGw+FmjGo)&C9?XM_wt2^J+fMYc@_WYF-cHyipM6&3v4I?@jY;``Uj~F z#o|XI|A|=q6h~SPvG_TTv>ambi;aZdZa&?%?XN@=YD7Y5NT>)2g&^Vh54OCM!YyvJ z<8ScuUAkCGahLv1ELMj<0A{?S^(*iFLNL;pYo@A8O2yDvY zYxufxsx7Kh)lSal`*ySoD=rly=~$uGfDblR*OU|5%L(yf!-iZ7b>y(Y z8#_cN3%amv>`J^V+a>ch*v++-n+P^0w4}c44hOs%ZUSS*SP!0}#~A5i#@MDfOjV`S zxgq9~1)I@PMi~3cz8NMm^%>IklmNXXKup>s$JQ!(l~gs2d*a?wirLzS7}??Vq%R@K zQjVBdH-4DvhjPLnuPgltagQZwgYgK~4d6Ku?FJGN!3${DK)OClzcSNp_M?7lZ4Sw% z^K%R4LAi@!8gv)6Yh-Jo6w#l>t!=1Y7N~>ei7lCiNGWdG&E3s7GHn4!n$oWahXQgv zG^`-Z@O+pp^I^6E1f}1aHTsjTZFrt$A!%S(xGm9CZ6}qEhCV|vg08|tb|5r)`;4bo z`WolTX1Ah@-6X@;9e}24M>)SSzV1XXA!$`d+Tm(ffS#|T zP?BAxq^ss@Er5uwyD^Bw4d^;2qK8w-Wv8o-x3uS5+S}A=NdM6#wL7215|M9RSykK9 zJ)0xqUWBC0lDMVzMg`Pt9~i@J>xz|WwuWap6T-U-Rkv5|FrtAGdp?^FMnMWlL2zaS*Jn}-Sv!lDu*Js$ech4E2Q zNs-A(pn7Om-~oC2c$icV>lZ}F#V5vw#zc3E^l(Fh$dL4D9zNVBsZS8#yF^6BB!>cz z5C8{20nnhv9x+G`NlA(fNsNpMO^)uAKEWdm3Bo#dj!5#zsX=gD0Gtw*rg5l836D#O zP3Y<$9-kQLADa@B937gN=oQ4fK0H1yDLFAEJQ;Y@^jW}bo3ytcEdUx21c!O|)BU_# zg8>;6B9gl%L;{Zygw!wX6$tiDF#Q6sAh>*BU|`)}-e1}!07~^*B`(Rk)ccn_Fga{U zP(uhh`1mSuL53i$TYQj@MljfXA6++n@>?H8{0mmO=%tNz?HT2xvl}+M>7%diJS}j^ zODiVz>h7Z>F!ksqIoYdYTxe{h#|lgt*w|wSq0w>49_P3D(_HIyaad?lWV(klJnQ;% zhSx&gXb1;&;*v`i&zO`c)sxxJt+-Y9-=@ftZVI`=XY~s}|5N)6eWA1;oK7Gl zgQ6J}%b<7$B{C?PL8%N%XVBy@ceeZ$QJe2^q@dNcH6O zbN`QlSwB85*j2E4KcyOWq8-Wc4~3%g_?f zphN~GGboio=?n@oD3d|i49aEDzm(3G5=172hsJ~^CVBFwdJ1?G$d?~U$)T|co`Sz! znw%699R`~IegQoGb&3=#QL;>#(!~q^{R4r4-oP#7&i;E@$iJ)xJcR?@K2sD)^%VUN zQxyA`De8arM8vnOw0U> zX?Q}{N!#&_%*?S@9qAn zV`1-hKCr2j>Z$x6+g173c0OZNbyL5&lqmfVnG1tJw@-+mCWUH*CM8EE#(ApMUA^Vk zFP|7Sykb=Mic!PMpk@#n7nf z8hZ6_lPtR)fDe?S zeuGlj{|HL8Gq5CsysngSQ#qYd8C1-mY6dkks9y%nS2?*>ug_50@3*ia>C%EUOJ%4Y z?$zv{2pZuN7D*E}x_CzYi2NUgO#?vIyNMSB{>ddOgG=O)Aj$`pKB!3c)vERb2Znhq z=yto@UZY3*xxLnQd+qH0+rk|}Q#~Dnh{FCwiu)Jw#H4y+y-6F|B{Inq=WY>1g!$Z@ zfq5n8e8SGUAJuCd^H}?xVYk$@#x}DSu#LS04>-R_^du>91-s#2Y3< zy+QSN9u3RLBX6hje<4ZSe`dn)f5fmi8le9Y<|FKvz`?=%Xz1-d`oRaF9>gP^$|D9;l z+$rv1=?UikS5o=fq_E_qj3k=w?&9_)Qc4izEg(^ezBX}2s%NJA&$^NJDx++8d;8Fu zzNx@7%WKTpsh&B$%wsb$FP+O=_y1HV{#_#Gc{$AY=H&vv|Ev*dEolZsE==_-`m@no z?DGnimRk*eWBQUbnC$WIHoKCy(G2!1^=sklF{`9qtEOG6rCqB>{uS`c(t~CBKZ0dN zs%PcDf@Rgef@Sr81j`zqSm56&zxIEb^4{+M|7^Otd z`;zPL-~Ybg{axU^xeNYPe>VEq!voyj`h)qJ`9I3jrhmNlk8a>ia$}{*ZIqKc@K9@O_V@$-4bdS$Ftm5^44E%C^Th%NvxwkB{Gi{U7_{`nzcV83TL&A>4nFecwM`gEZB= zvhVlG?hV!hUPN$v)BfO}X@4kV(GqV?A5Kr{(7(d`Ncy6qe=a)qk44AR5#aAdC(;+4 z{BzN%e=Itk4i$edI+MQW?4OIy{bSMjv_;kaUUVTc)pOB@-IsjWy}*avm;F*bSAK)H z*U?@JT=jzYhLlv#we(?cgr*H&_YQCI4&O*0Mgy?4;hWy!9p2$v>BC3>nl^mfJG>_) z($`|Sy9Cj`<~<#n?*wrn5s^`$-cyc{j^2LUb2kW2Yv?@pf|S4B{G%zm|CfJL!jgiJ z*l5p#RL{d8EHScuQO~3Q#O<{66HwoGijslek5fHQ{DO#>(6GpuB+t`e&oeI){qBg< zJ|xnxzGh;4x5&7_>KWbp^S`?H7e1R(=?!raZ?EMu(#!vb^xou1%VZcv5EM6GL<0E+Oui}Z>g*~usV z`(V$9jM)76r}IDgw(yoDUu=H%YvIF-u+T)$myEk^p08k{4kFgi1yo?jQl0)gd0g1 zPfU(ajEGE3#JmH(W1vLbdy^D|hKD9665iX?MACarI}A)NRy2|FKK#o)-g%Mj7-rkxv`{ z@8?5Jvj%!?=+zj3P%rf2UW6tg3ZfwZG2U*FgLp`QL`Z^UNP$#HgLKG%Ovr+4$bno? 
zCMYwM1wstMJCYD0CPy55ez9#kI+gc?8%p+-<+s0q{* z3WAzJ&7l@hOQ;pp8fpUtLv5iDs2vmvg+bv^1QZEHLG2+o6b*HNIzll}EEEUDLkW-v zN`#W2WGDsd1a*eGKwY73PaLit%O!VtD!Z}T4)`#9@+qHgf>B&p)Jr>XdAR0+5zo^c0s$LJeF&Kvln1m^qh5^jLEX=_?EWjcx!7{ACDy+deY``XL!8Yu`E;tjM z8O{P{g|org;T&*II2W87&I9|wdEtC;ez*W!5cY=)!G+->a8bAzTpTU|mxN2frQtGg zS-2cr9C*YItDfl#e20ja)gU`bk;EV7j_%eJ2z6xK1ufsRsoA538Hhc%Z z3*Uq9!w=wx@FVy!`~-dqKZBpcFW{H(EBH1127U{_gWtm+;E(Vp_%r+k{tADCzr#P^ zpYShl@_XSNK@kkW5dt9*3ZW5zFbIoq2#*Meh)9TxD2R$^h>jSDiCBn@IEV|$gk(mt zAX$-YNOmL#k`u{=A@z|!qyf?pX@oRJnjlS)Afy@6 z9BF~HL|P%Okv2#$(iRCp+99Dx7!r;|AdyHE(jIXm(MSiRBNBteB5_DOl7M)SL?j7G zMpBSYNN1!A(iQ23bVqt1sYp+x7t$N)gY-rEA^nj7$UtNeG8h?x3`K?^!;ullNMsZ; z8X1F(MaCiHkqO8|WD+tNnSxA3rXkaj8OTgz7BU-|gUm(dA@h+1$UH$B`4rN#qoA8aacUMb07TkqgL06bB~c2cQGhZii*hKB3aE%msEjJ8ifX8i8mNg{sEsz2XfWCq4ME$Xp=cNyjz*x7XcXEWb)(T}2ecy^gT|t9Xgr#LdeB5P2~9>* z&`xM)v8|{PkMf;)s(E;c{bPzfi9fA%;hoQsK5$H&C6gnCm zgN{YVq2tjB=tOi9IvJgUPDQ7o)6p5|Omr4H8=ZsBMdzXO(FN#2bP>83U4kw}m!Zqi z73fNI6}lQ-gRVu_q3h8N=tguCx*6SqZbi4D+tD59PIMQ#8{LEMMfaim(F5o~^bmR& zJ%S!ZkD5^bz_PeS$tkpP|pu7wAj$75W-|gT6)Iq3_WT=tuMu`WgL#enr2b-_alFPxP0! z@WU8_p%{kY7=e)(h0z$m7>va@jK>5_#3W3{6imf5OvenOCl9L$Af!ZKr7u&h`% zEIXD1%ZcT}a$|WgKP)en56h1gzzSmiSRt%1Rs<`G6~l^SC9slMDXcVB1}lq|!^&e7 zu!>kEtTI*wtBO^_s$(^&SQ{)DYm0?o?XXZR3=791ut+QlYmd3HXsiR)5sSfMu{bOqOTau>B9??DV<}iC ztTWaH>xy;5x??@CRIDe~3+s*b!TMtTu>RNpY#=rW8;lLXhGN68;n)alBsK~gjg7&^ zV&ky!*aU1MHVK=IO~IyO)3E8-3~VMg3!9D2!RBJ~u=&^mY$3J?TZ}EimSW4W<=6^r zCAJD%jjh4fV(YN=*amDPwh7yeZNau;+pz7}4s0j33)_wD!S-VNu>IHp>>zdsJB%H{ zj$+5K05p z>>>6DdyGB7o?_3i=hzGECH4w?jlIF%V(+l`*az$*_6hrpeZjtB->~o459}xQ3xjYN zM{pF!a2zLa5~pw)2RMVXIEVANfQz_<%eaE8xQ6Svft$F6+qi?f@Jx7SJPV!`&xU8m zbKp7gTzGCg5AKKO#q;6$@d9{3+#fH57siX=Me$;Ial8ax5-)|9#>?Pk@p5>1yaHYk zuY^~|tKe1fYIt?L23`}dh1bUG;C1l;ydGX355ybb4e>^JW4sC86c56i;mz?DcuTw$ z-WqR%2jgw=5WF28iihFhcmy7aN8#;pHy(|5z&qkGcq|@=$KwgO2T#P4@MJs%?}T^8 zyWm~%Zg_XR2cC-e#Czer@jiH8ydT~lAAk?U2jPS9A^1>y7(N^yfse#T;iK^}_*i@# zJ|3TdPsAtTlkq9|RD2pf9iM^E#Ao5N@j3Whd>%d@Uw|*f7vYQXCHPW&8NM7}fv?0@ z;j8gA_*#4&z8>FzZ^Sp@oAE99R(u=29p8cP#CPGl@jdund>_6aKY$;^58;RLBluDL z7=9c-fuF=r;ivI4_*wiMejdMoU&Jrrm+>q3Rs0%$9lwF!#BbrZ@jLik{2qQEe}F&4 zAK{PjC-_tR8U7r9fxpCG;ji&G_*?uP{vQ8;f5boGpYbpFSNt3P9shy<#DC!s0TT#; z5*UFK1VIuMK@)&r2$tXoo)8F;kO-Mj2$j$XoiGTKun3!Q2p5rw$V_A*vJ%;d>_iSC zCy|TDP2?f`h`dBTB0o`pC`kAdg^0pL5uzwjj3`c&AW9OYh|)wEqAXF4C{I)%DiW25 z%0v~SDp8H7PShZ3619lhL>;0o5kS-<>Jx!P1EL|(h-geSA(|3FL^Gl}(Sm46v?5v) zZHQo^EfGSrBSMKVBAkdIB8e!XJ>e#zi4H_ZB8G@1;)r-6f$$KCL=urqq!68m&O{fY zE76VUPV^vBiJn9+qBqfp=u7k?`V#|)fy5wUFfoJ}N(>{06C;R`#3*7kF@_jRj3dSq z6Nrh#Bw{i#g_ufABc>BGh?&GJVm2{{m`ltf<`WBug~TFaF|mYLN-QIm6Dx?7#42Jn zv4&VntRvPF8;Fg>CSo(Oh1g1LBeoMeh@HePVmGme*h}mq_7ew)gTx`?FmZ%9N*p7O z6DNq1#3|x5afUccoFmQ?7l@0*CE_x1g}6#wBd!xSh?~SM;x=)IxJ%q4?h_A)hr}b| zG4X_WN<1T;6EBFD#4F-8@rHOyyd&NdABc~{C*m{lh4@N*Bfb+qh@ZqS0wQ4&AyE<| zagrcOk|JpmkPOL^9LbXcDUuQ?lM1Pl8mW^8X_6LclMd-3Gm)9eEM!(P8=0NVLFOcL zk-5n{q#v1=%tz)Y3y=j#f3gr+m@GmTC5w^8$r5BqvJ_dGEJKzh%aP^D3S>pH5?Pt7 zLRKZKk=4l>WKFUbS(~gw)+GbTdSrbvkZeFUBpZ>9$tGk|GKg$OHYZz`2Ctv1A+>PbQEaGLcLolgSja6WN*ULUtv) zk=@B2WGdN{>_zq_`;dLfeq?`g06CBxL=Gm0kVDB~?xOkVna5;R7#_C%AicjqHM~cTvR41GnIwPN@b(6Q#q)dR4ytv zm51`9@>2Pz{8Ry|AmvXLq6$+*sG?LcsyJ1GDoK^1N>gR1vQ#;$JXL|JNL8XLQ&p&{ zR5hwPRfDQY)uL)sb*Q>j09B8wPX$s9sD@M{sxj4sYDxuB&8X&73#uj6ifT=@p@ONl zR0!3M3Z=rRa4Ld|q@t+yl$(mCI#3;{7%G;EqvELq%0nemNmMeGLUp1#Q(dU8R5z+S z)q_fT}L+0-0rE;WyuPc5JpQj4g?)DmhbwTxO$t)Ny?tEkn~8fq=I zj#^J`pf*yQsLj+CYAdyk+D`4Dc2c{j-P9gxFSU=_PaU8RQirI+)Dh|^b&NVrouE!q zr>N7^8R{%`jyg|Wpe|CEsLRw9>MC`Ox=!7oZc?|X+teNEE_IK(Pd%U>Qje&|)D!9{ 
z^^AH>y`WxFuc+758|p3fj(SghpgvNcsL#|F>MQk)`cD0zep0_Eh=yr|Mrn-3X@Vwc zil%8mGc-$cG*1h(NK3R#E3`^$v`!neNn5l{JG6_=L}#Y6&{^qhbapxios-T*=ce<} zeso?sADy2rKo_L_=|Xg2x(HpAE=CupOVB0hQgms$3|*EkN0+B7&=u)QbY;2#-x&hsgZbUbxo6t?^Ai5ddoNht4q+8Lg={9sQ-Ifla z+tHzP7#&VW(2;Z$-JW*S(R2s8BOOD>(s6V=oj`l&L^_F1rc>xnbZ5E?-IeY}cc**M zsdP`e7u}ogL-(co(f#QG^gwzLJ(wOs52c6E!|4(9NO}}KnjS-srN`0Z=?U~idJ;XE zo(evpA^g?Dsx6#|_9rR9m7rmR_L+_>c(fjEG^g;R%eV9H%AEl4c$LSOFN%|Ch znm$9HrO(ml=?nBl`VxJazCvH6uhG}(8}v>37JZw(L*J$E(f8>G^h5d){g{42Kc%11 z&*>NROZpZ4ntnsSrQgx-=@0Zr`V;+`{z8AHztP|6AM{W97YzXzAOHmzzySeBKmi&6 zzyKC-fCmB)fdpir02OFJ2L>>K1#I8|7sv!MgDfB`$Of{593Usi1#*Ktzz^gF`9OY9 z02BoNpb#hwih!b^7$^=(fRdmTC=JShvY;F&4=R9)pc1GIs(`AX8mJCxfSRBds153X zx*!151NA{5XaE|5MxZfh0-Ay#&ZunlYnJHSq`3+x7a zz+SKq><0(HL2w8h21meAa10y=C%{Q?3Y-RKz*%q(oCg=cMQ{mR23NpUa1C4sH^5DB z3)}{Gz+G?;+y@W9L+}VZ22a3K@C-Z$FThLi3cLnyz+3PRyayk^NAL-J24BEe@C|$i zKfq7$3qTCaAPmZ249*Y?$xsZ<0ES^$hGTd}U_?e@WJY0BMq_lwU`)nhY{p?+OeQ8X zlZDC3WMi^3IhdSGE+#jVhw)?bGWnSNOaZ1K~XVrnyWn7T{=Q;(_71TqbnhD;--G1G);$^Nw&gGpt2GQF7IOdqB%(~s%T3}6N_gP6h05N0Sdj2X_1U`8^d zn9HZq%-&CC{NE3=K+&g@`zGP{`F%pPVhvya)&9AFMI zhnU065#}g!j5*GnU`{fpnA6M|<}7oLInP{RE;5&x%ghz#Dszpw&fH*bGPju9%pK+~ zbC0>tJYXI&kC?~I6Xq%NjCszyU|uq>nAgl3<}LG%dCz=cJ~E$}&&(I*EAx%{&ir70 zGQSvzg;|6}S&YS5f+bmsrCGo-EX#5%&kC%_O03K(tjcPv&Kj)ATCB}Ftc%UWW@fXn zS=nrCb~Xo_lg-8EX7jLqY+g1Wo1ZPf7G(X|LTq8S2wRjb#ujHwuqD}2Y-zR(Tb3=y zmS-!l71>H`Wwr`im955BXKS!E*;;IEwhmjD4PfiB_1QqS0o#ym#5QJ|uua(@wi(-; zZNau=Td}R#Hf%84mJMOsv7u}j8_q_sk!%#(o^`X)YzMX@8^gx3acn%BzoMXM3=zY)`fq+nepf_GSCA{n-KRKz0y2m>t3nWrwlD*%9nWb`(3B z9m9@g$Fbwt3G7665<8il!cJwUvD4WZ>`ZnRJDZ)u&SmGZ^VtRLLUs|mm|emyWtXwb z*%j`rzUyPMs^?q&C}``H8RLG}=P zm_5QCWskAP*%RzZ_7r=XJ;R=5&#~v(3+zSq5__4w!d_*svDeuf>`nF-dz-z(-evEx z_t^*RL-rB-n0>-NWuLLn*%$0f_7(e@eZ#(G-?8u659~+w6Z@I{!hU7HvESJr>`(R= z3vn=qa43gyI7e_KM{zUjng@UGdYX1Ifrv`nYhec7A`B7 zjmysE;Bs=gxZGSG&X3E><>T^m1-OEoKUat=%oX8^a>cmfTnVluSBfjmmEp>A<+$=( z1+F4jiL1<2;i_`gxawRDt|nKDtIgHn>T&^GJ+3|%$Ti>^a*epgTobM-7sNH=nsY6< zmRu{YHP?m<=Gt;0Tstn53**AM2riO~;@Wd=E}HAWb>w2WST2r>=Mp#%m&he?$y^H8 ziR;XD;kt6&xb9pJE|u%a_2PPSeYn0{KdwJFfE&mS;s$d=xS`xIZa6oB8_A90Mss7h zvD`RrJU4-x$W7uVb5ppf+%#@FH-nqW&EjTrbGW(OJZ?U>fLq8d;udpDxTV}OZaKGt zTgk2BR(wcI*xJ-30|$Zg^_b6dEr+%|4Iw}acs?c#QGd$_&aK5jpEfIG+?;tq31 zxTD-L?l^aXJIS5mPIG6tv)noEJa>V+$X((tb62>l+%@hxcZ0jh-QsR@ceuOUJ?=jD zfP2V2;vREPxToAR?m72@d&#}xUUP4_x7<7KJ@cx+&At!_k;V%{o)`V z<`Ev{F&^g$p5!T><^j*}EYI;gFYqES@iMRQDzEW6Z}28>@iy=9E^S-u=!p0B`H zz8YVhuff;kYw@-DI(%I|fUn2b=L7i$d_%qw-cfy z#kc0$@WFgrK7?<_hw@>3I3K}B@=<(y-pxnz9r%uX3?IwK@$q~D@8J{qBtDr>;XCo2 z`7V4{z8l}2@4=_?J^5aIZ@v%Tm+!~-=Lhfu`9b_(eh5F5AI1;oNAM&0QT%9r3_q41 z$B*YH@Duq-{A7L#Kb4=xPv>XwGx=HkY<>PslG65DE(ZLLs5BP(&yy6cdUIC4`bfDWSAbMkp(k6Uqw}go;8X zp|Vg#s47$wstYxQnnEq1wopf?D+CDjg!)3D&_HM?G!hyMO@yXGkkCwMF0>F@3ay0J zLK`7iXe)#W?SxPvOb8bugh(MuXfL>hXrY79QHT*@g*YKzNDw?iqL3sc3n@Y;p|j9M z=qhv*x(hvoRH3KPOXw~15&8=Kg#N++VW2Qb7%U7Ch6=-k;lc=Eq%cYtEsPPy3gd+F z!USQWFiDs!OcACE(}d~53}L1)OPDRp5#|c>g!#e(VWF@{SS&0NmI}*+<-!VKrLam^ zEvymN3hRXR!UkcZuu0e~Y!S8!+l1}H4q>OTOV};!5%voEg#E$+;h=CxI4m3yjta+w z?C#;yNF%I zZen+_hnOn%6nlxi#Xe$Rv7gvq93T!92Z@8lA>vSRm^fSj5UA!UQ6mN;Q#XI6%@t$~Jd>}p)ABm5} zC*o7_nfP3MA-)t}iLb>s;#={Z_+I=VeiT26pT#fYSMi(pUHl>b6n}}31WSm7N|=O8 zghWb|L`y(oBv#@iUJ@ixk|bGDBvsNRT{0w7vLst_B$t#)$}DA(vP#*c>{1RXr<6;| zE#;B?q`Xo-DZf-eDk%9&g`~n#5vizDOe!vwkV;CWq|#CusjO5^Dlb)#DoT~4%2E}n zs#Hy?F4d4~O0}fgQXQ$T6d=`;>PvxA1F50ZNNOxKk(x?DQZuQ!)Iw@0wUSy(ZKPnS ztrQ})lR~91DO`$>BBdy)z2ugnr4CX@DMpHw;-q*fLGnn6Qj(M`rAVEm&QcettJF>E zF7=R7rJhnRskhWe>MQk=`bz_(fzlvpurx#(Dh-o{OCzL_(kN-PG)5XLjg!Vp6QqgK 
zBx$lVMVcy2lcq~Eq?ytzX|^;+nk&td=1U8th0-Evv9v^5DlLEfWCTX*@McOKDleSAcq@B_(X}7dT+AHmo_Dct(gVG`CuyjN^Djk!KODCk0 z(kbb*bVfQWos-T>7o>~QCF!zsMY<|oldelQq?^($>9%x7x+~q2?n@7(htebIvGhcG zDm{~)OE09C(ktn;^hSCsy_4QcAEb}cC+V~FMfxgzlfFwoq@U6+36fzMkx?0wahZ@w znUZN4$c)U&oXpFDEXtBB%ZjYZnykx)Y|55w%Z}`lGs&6dEOJ&ko19(FA?K8H$+_h` zvY(t+&L`)W3&;g!f4PucSS}(Lm5a&6C3UWocl3ZD?B3G5G z$<^f=a!t9GTwAUq*Ode0dUAa^P;MYMlpD#7CJW?JdkCw;CW94!3czJ?6QJy4E zmZ!*5ILd-;R>QT`-d{v|^StRM=iU<$4f3aL;E ztpJ5lScOw~MNmXVQe;I@R7F#C#ZXMeQf$RhTuLS-vyw&0s$^5LD>;;$N-ib0l1K4V z@+$e1{7M0(pyIC-QVJ_Yl%h&8rMOZ;DXElFN-JfQvPwCnyi!4_s8muaD^--LN;Rdr zQbVb!)KY3Ib(FeFfKpGXuLLR$l!i(prLodPX{rP%&6MU!3#FyfN@=aMQG%7WN{G@< z301(uhLKHuMAKIDua~4$`EC!GE5n+j8H}@qmek#8dNQG5IMO942RYE0IN~Kkx zGAgTbDz6Hvs7k7=Dyph#s;(NUsamS7I;u;}q-IvLs9Du)YIZe;np4fC=2r8lerjGd zpPFASpcYj9)k11vwTN0&Ev6P%OQbZMBYCR}E0>srA)BwSn4DZKO6y zm>RA|sF7-v+Fo_5(P{^^qZ*^es&Q((nxJ~rL^VlGR#VhYYG<{J+EwkQc2|3-scKKP zm)cwHqxMz%sr}Uf>OggnI#?Z|4poP#!_^V$NOhDtS{WD`8R|@RmO5LVqs~?5sq@tZ>Oysqx>#MJE>)MQ%heU?N_CaGT3w^ARoAKO)eY)K zb(6YT-J)()x2fCJ9qLYXm%3ZsqwZDrsr%If>Ou98dRRT89#xO2$JG<+N%fR^T0Ntl zRnMvC)eGuH^^$s7y`o-Kuc_D78|qE&gquy2TsrS_f>O=LB`dEFUK2@Ko&(#;| zOZAodT79FwRo|)a)eq`N^^^Kp{i1$Vzp3BVAL>u_mkMdHhG?jUX}Cscq(*791~f)v zHBRF-K@&AelQl(CHBHksLo+o?vo%L^X_>UlS{5y^E36gKifYBQ;#vuct+m!h3)b3dAzC{vR14F> zwFoUzi_+R_ZY^5tpmo$@v{)@pi`Nn~kCvz1`=K5JjJui7{5yY@r-sr}L*9o7*Y)iE8{37ym_oz{WQ=&a7^ zye{aXF6pwa=&G*ix^C#EZt1q}=q^2zo>|YLXVtUm+4UTHPCb{NThF8W>3Q{hdValt zUQqYf3+aXRB6?B1m|k2jp_kN4>815DdRe`kUS6-DSJW%%mGvrmRlS;CU9X|n)NARr z^*VZ8JwUIg*VhB}26{uik=|HuqBqrp^k#Z6`^+vvf1TRlW?r-$lcdbl2; zN9s{}d)=)^>mBrtdW;^c$LaBUg6`21^&~x6PtiN+o%JqySG}9wUGJf%>OJ*ddT+gt z-dFFZ_tyvL1NA}rV10-_R3D}f*GK3h^-=n0eT+UO+Mv&3WXl}GHS{kj4)-bgS!Mxv2qBpWG4C!@2`#pr5uGrAi+j8vni(aY#<^fCGx z{fz#`0Arvr$QW!4F@_q$jN!%zW27<47;TI(#v0>{@x}yWqA|&sY)mnx8q@oHl`;7g@0pp-?$T(~qF^(F?jN`@$8^Tq|^ zqH)Q%Y+Ny}8rO{L#tq}9am%=E+%fJN_l*0-1LL9b$ari#F`gRFjOWG+zH-T z0JEN1-wZSxm<`QFW@EF7+0+a&o0-kc7G_JcmD$>CV+NaT%@DJl8ES@^;bw#xX-1jt zO}80sb}&1dF=nh8XU3ZerpHV)lgwl@#q4BuHoKTz&2DCQvxk{#_B4B$z0E#mU$dXt z-yC2LGzXc3%^~JcbC@~Y9AS<$N13C|G3HovoH^c{U`{kAnUl>a=2UZSDCBLHRf7#ow?rJU~V)wnVZcm=2ml? 
zx!v4h?lgCqyUji3UUQ$h-#lO*G!L1F%_HVf^O$+uJYk+RPnoC9Gv-Vl39;EZ!0<(UL6L zQY_WdEZs6J)3Pkvax9mX$;xbHv9em(tn5|}E2ovq%5CMb{H(lIJ}bXfz$$3@TZOE` zRuQYHRm>`Gm9R=$rL59c8LO;S&MI$Juqs-WtjbmutEyGas&3V=YFf3d+EyK_t`%U_ zv+7%cRs*Y{)yQgWHL;pnK~^)Xxz)mIX|=LiTWzdhtF0AcwX;I4Fe}`Oup+G}tG(s6 zqOA^AM=QpPwc@OJE5Y(uiB^)8Y^7M8tj<;!tE<(`>TdP0Qmvj=FRQoJ$LeeKv-(>D ztbx`bYp^xM8fp!*hFc@7k=7_{v^B;WYmKwUTNA8_)+B4PHN~20O|zz3Gpw1`ENiwk z$C_)+v*ue1tcBJhYq7P&T52t`mRl>VmDVb2wYA1tYpt`^TN|v6)+TGSwZ+#%jiI%*xWj$0?Jlh!Hgv~|WhYn`*sTNkX0)+Ot* zb;Y`BU9+xRH>{i1E$g;*$GU6Xv+i3DtcTVk>#_C3dTKqho?9=hm)0xmwe`k&YrV7H zTOX{C)+g(;^~L&XeY3t>KdhhDFAK6^8?jLvvvHfSNt?218`zA^+MLbXf-TyTE!&E% z+M2D~hHcuGZQG9RvNPG4?JRayJDZ)|&SB@YbJ@A=Jhq>m*Uo3>w+q+>ZGXFvUDz&S z7qyGo#qAPyNxPI?+Ad?4waeM%?Fx29yOLemu3}fUtJ&4<8g@;)mR;MfW7o93C9(JnT)9z*Ww)@z9?S6KDdw@OA z9%K);huA~yVfJu)ggw$8WskPU*kkQ+_IP`OJ<*sUSuz}m)J|~W%hD=g}u^VWv{l^*lX=|_Ii7Rz0uxeZ??DCTkUQ3c6*1t z)81w8w)fb3?S1xs`+$AWK4c%ZkJv};WA<_TgniOJWuLas*k|o?_Idk)ebK&TU$(E< zSM6)|b^C^W)4pZjw(rR=A;5Dw{34($Mkaaf0Qct>zVM{;CGaa2ch zbjNT^$8v1Paa>L&C$p2q$?9ZtvO77PoK7w$x0A>5bMiX*ocvA!r=a8S6mkkXMVz8e zF{ijw!YS#La!Na8oU%?ir@T|aspwR4Dmzu2s!lbhx>Lic>C|#+J9V78PJmO-sqX|j z4V;EfBd4*`#A)gTInA8rP79}{)5>Y>v~hx+woZuC&IxtGoNyF*4120DYB z!Ojq8s58tN?u>9oI-{J?&KPH`GtL?BOmHSTlbp%U6lbb4&6)1ZaArEQoY~GCXRb5P zneQxc7CMWZ#m*9Esk6*k?yPWDI;))3&KhT}v(8!XY;ZO@o1D$g7H6xo&DrkkaCSPo zoZZeIXRou*+3y^14myXN!_E=ssB_FY?woK=I;Wh|&Kc*dbIv*MTyQQrmz>Ma73ZpR z&AIN}aBe!coZHSF=dN?lx$iu19y*Vl$IcVysq@Tv?!0hbI9Sn5%W=6}nOvD&SzK9N*<9IOIb1nixm>wjd0c+3ysmt%{H_A7f-ZkoAy;8n z5m!-HF;{U{30Fy1DOYJ%8CO|XIahgC1y@B^C0Auv6<1YPHCJ_44OdN9E!Y3U+C2qX zvb=A+pFOrcm8i_B>alIxwr$(CZQHhO+qTWKXaB#yn{#&}&U5prcXdQxtypia?uaLg zI>nsgP6?-^Q_3mrlyS;B<(%?P1*f7@$*JsAajH7ioa#;ur>0ZOsqNHp>N@qD`c4C< zq0`7|>@;zjI?bHsP79}{)5>Y>v~k)x?VR>b2dAUc$?5ELak@I)obFB!r>E1)>FxA! 
z`a1ob{>}hrpfkuB>+I>Vgd&Io6uGs+q5jB&;~@0DXI?J5p&I)Ixv&vcRtZ~*l>zwt@24|zQ$=U2| zake_!obApIXQ#8v+3oCc_B#8V{mudBpmWGM>>P29I>(&j&I#wFbILjGoN>-N=bZD- z1?Qr3$+_%YajrVooa@dF=caSZx$WF>?mG9J`_2RBq4UUj>^yOvI?tTv&I{+I^U8Vc zym8(-@0|C}2j`>n$@%PjalSg=obS#L=cn__`R)91LWzGwXc0z)72!m95kW*0kwjz> zMMM?RM062D#1yeaY!OGq74bxTkw7FAi9}+NL^whSDU{H{6-HR$310*vL?ji-L~@Zr zq!g({YLP~y73oBJkwIh>nM7uhMPwD(M0Sxw=e7iZm~z~ z75l_~aX=gths0rVL>v{z#Bp&#oD`?TX>mrJ73aixaY0-Zm&9dpMO+ov#C35)+!VLO zZE;8375Bt_@jyHjkHll~L_8JG#B=dNycDm*Ywr>!^!Y6f{Z95$;dK_j4Gqa=rV?kDPzglGLDQZi~%qp|V z>@tVUDRar(GLOtF^U3_OfGj8r$-=UTEGmo1;avEcDQn5vvW~1P>&g1Efov!n$;PsYY$}_{=CXxsDO<_bvW;vj+sXE_gX}0f z$?*s-?y`sMDSOG@vXAU5`^o-tfE*|X$-#1n94d#&;c|o=DM!iCa*P}+$I0<> zf}AKP$;onxoGPcu>2ijgDQC&qa*muU=gIkUfm|pT$;EPsTq>8z<#L5wDObtWa*bRo z*U9yAgWM=L$<1<$+$y)p?Q)0QDR;@;a*y0A_sRY8fIKJ<$;0x9JSvaL+*)YDR0T!@{YVK@5%e}fqW<*$;a}Ed@7&G=kkSo zDPPIg@{N2e-^us#gZwBz$)e5yztx~Ji8nsrfQ|r|RwNY(So7EPzRc%w-)ef~&?NYnd z9<^8PQ~T8cbx<8rht&~vR2@^t)d_V{ol>XO8Ff~jQ|HwMbx~bXm(>+@Rb5lp)eUu1 z-BP#J9d%dTQ}@*a^-w)hkJS_PR6SGA)eH4fy;85$8}(MbQ}5LW^-+CNpVb%jRee+6 z)erSk{ZhZx9~DafqeJU3I;;+-!|Mn-qK>2^>nJ*^j;5pQ7&@korDN+jIm=IILQAc*)~+_%YESz*&>=dhPNtLV6gs6&rBmxPI;~Ep)9Va6qt2u=>nu8} z&Ze{L96G1YrE}{%Ims_SE~bm?61t==rAzBFx~wjz%j*idqOPPX z>nggcuBNN&8oH*grEBXtx~{IL>+1%(p>Cub>n6IXZl;^-7P_TwrCaMZx~*=f+v^Ux zqwb_T>n^&h?xwrz9=fOQrF-i>y07l1`|AOEpdO?L>mhom9;S!u5qhK^rAO;AdaNF& z$Lk4tqMoED>nVDwo~Ebk8G5FkrDy9odajm_=rUZ$7p6?&y!rB~}U zdaYik*Xs>>qu!)9>n(b#-ln(f9eStUrFZK+davH6_v-`tpgyDz>m&N8KBkZB6Z)h+ zrBCZK`m8>u&+7~NqQ0ar>nr-IzNWA18~UccrElvy`mVmG@9PKpp?;(v>nHlDex{%6 z7y6}srC;ke`mKJa-|G+hqyD5n>o5AN{-(d{ANr^MrGM)`I+Xj58`=%yhIPZa;oS&s zL^qNf*^S~xb)&h_-5736H$$!g zxFK#*H<_EyUALZF-)-PFbQ`&i-6n2Rx0&1AZQ-_bTe+>>Hf~$Do!j2+;C6I7xt-lEZdbRP z+uiNq_H=u>z1==;U$>vz-yPr%bO*VE-68H!cbGfe9pR32N4cZjG45D*oIBo~;7)WW zxs%-~?o@Z0JKde(&U9zFv)wuFTz8&3-(BD?bQigc-6if)cbU7~UE!{DSGlX*HSSt> zox9%M;BIs`xtrZB?pAl3yWQR4?sRv#yWKtRUU#3n-#y?SbPu_Q-6QT%_n3R!J>i~o zPr0YvGwxaUoO|BA;9hhuxtHB5?p61id)>X^-gIxdx7|DLUH6`Q-+kadbRW5o-6!r- z_nG_Lec`@zU%9W{H||^ao%`PX;C^&Jxu4xH?pODl``!KF{&au2zuiA>DD#gAZNiwa zCY%XxBAAFKl8J1hn5ZV2iEd(;m?oBqZQ_`?CZ36J5}1T0kx6Wl7{>@BjWXJ}#u#fn zC(oFYiZR(i1 zrk<&98kmNrk!ft2n5L$gX>MAWmZp_yZQ7W&rk!bTI+%{8lj&@_n69Rq>27+Mo~D=S zZTgtLrl09=2AF|nkQr=-n4xBv8E!_Hk!F+`ZN`|fW}F#sCYXt4l9_C#n5kx(nQms7 znP!%mZRVJ{W}caE7MO)*ky&h(n5AZ!S#DOCm1dP$ZPu8zW}R7YHkgfOli6&xn5|}; z*=}~2oo1KWZT6VGW}n$_4w!@HkU4CQn4{*HIc`pvljf8;ZO)jp=A1ciE|`nvlDTZI zn5*WRxo&Qlo933eZSI)6=AOB49+-#bk$G&Mn5X8Md2U{qm*$muZQhu-=AC(OKA4Z@ zllg4En6KuW`EGugpXQhOZT^^0_8%MChOuF7I2+zZun}z}8`(y&QEfCE-NvvnZ7dtx z#<6j2JR9F8unBDHyV+_tbSZ7bW_ zwy|w(JKNrNupMnD+u3%pU2Qkp-S)6OZ7Wp# z?I=6ijuoLYhJK0XLQ|&Z6-OjKx?JPUn&ardtJUibmunX-XyVx$VOYJhd z+^(=I?JB$4uCZ(FI=kL(up8|ryV-8BTkSTx-R`hE?Jm39?y-CAKD*x@um|lSd)OYa zN9{3t+@7!}?J0ZOp0Q``IeXq-uovwmd)Z#GSM4=>-QKV_?JaxT-m!P>J$v6iun+Ac z``A9QPwg}N+`h0c?JN7*zOirZJNw>#upjLw``Lc6U+p*h-TtsY?JxV={;{FFf4tCM z7%!|B&I|8F@FIGVyvSY@FRB;Ki|)nnVtTQ>*j^kjt{2aX?7Pub@}RE9@2Vih9Mo;$8`_q*ux-?UnJ$dgZ+GUInkBSIMjFRq?8N z)x7Fn4X>tG%d73x@#=c@y!u`Puc6n-YwR`gntIK==3Wc0rPs=9?X~gRdhNXSUI(wE z*U9Vbb@94--MsEz53i@!%j@m+@%noGy#C$*Z=g5G8|)47hI+%i;ob;uq&LbN?Tzuq zdgHwD-UM%=H_4mqP4T9B)4b{43~#14%bV@Z@#cE-y!qY&Z=tuyTkI|ImU_#)<=zT! 
zrMJpk?XB_Fdh5LP-Ue@@x5?Y=ZSl5x+q~`G4sWNo%iHbk@%DQAy#3w*@1S?cJM10t zj(W$u0^^^I@{S%lYN~3Vubu zl3&@c;#c*n`PKazeoeoYU)!(a*Y)f9_5B8ZL%)&V*l*%D^_%(4{T6;pzm?zGZ{xT1 z+xhMN4t__!li%6z;&=7C`Q7~;!pLb`P2Ow{!D+CKii+<&-Lf|^Zf<>LVuCJ*k9r= z^_Tg}{T2R7f0e)5U*oU!*ZJ%H4gN-dlfT*D;&1i0`P=;+{!V|FzuVvA@Adcj`~3s{ zLI03{*gxVQ^^f_-{S*F4|CE2)KjWYE&-v&53;sp_l7HF1;$QWz`Pcm${!Rauf7`#~ z-}UeL_x%U{L;sQg*ni?b^`H6A{TKdA|CRsRf8)RP-}&$T5B^90lmFTO;(ztO`QQB? z{!jmx|J(oLhYJ1)LI+`jutB&Wd=Mds7(@yp2T_8kL9`%x5F>~g#0p{uae}x(ydZv% zAV?S_3K9oN0w)lG3{;>4H!y(>yuc5FAS6f{Bny%UDT0(isvvccCP*8k3(^M}f{a0? zAajr<$Qon|vIjYWoI$Q2caSH@8{`Y}2L*zHL7|{NtArUx^EnZc}Jb}%QH8_Wyl2MdCQ z!J=Ssuq0R-EDM$gD}t55s$g}nCRiJ+3)Tl4f{nqZU~{k~*cxmLwg)?cox!ePcd#ee z8|(}A2M2QCO8|M3(f}@f{VeW;Bs&!xEfpwt_L@Q zo58K%c5o-S8{7--2M>aW!K2`D@FaK|JPV!&FM^lBtKfC;CU_gX3*HAGf{($c;B)XL z_!@i*z6U>opTV!-ckm|&74lC==#VfWVMD@&gb#@j5-}uFNaT=wkl+(!afWqF4J^|33Y{s`c8({`(rWbb46D zzFGgB$^V{c+5fkK?*H=t^-hAn-Ijk7<^TAv#`y1JH0J0S17qU9jm-b$-^(2SOB{@g z@i0Cnz=W6x6Jru|P@qJG8eKGK(L)~t48f$B43lFDOo^#5HKxJ;)u*OoPLCNdBWA+P zm<9h=XPb>VJLbTgm;O(V-fscHFh!P;#dMpVk!KuT~}9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5 zuElk@9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Sch zp2c%`9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1q zzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y@elrO*!`Qo-MWACw^sLW{`Tqq&EFc`zxmsu z`!|0pbpPgWiSFP0?a%$2zwNkx^S2)NZ~hkK{>|TN+`swTi2FBx8*u;T@7d*V{vKHV z=I?RkZ~h)p{w5B_#dsJW6JSD2go!Z;Iw(-0LX9pOwCJIa0ft~wOoquZ1*XJQm>Sby zT1ZzFARfZQcm$8)F+7eZ z@FbqX(|88Y;yFBz7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&#_ynKgGklIO z@Fl*&*Z2nC;yZkgAMhi7!q4~xzv4Iijz91x{=(n*2mf~o;Xk4OTf3n`V;KDZ`})4< z{N5NC6Jud)jDvA89>&K6m=F_TVoZV#3Y4f&ql*SDdgx<-A(#}CVRB4?DKQnM#x$4~ z(_wndfEh6pX2vX-6|-S>%z-&E7v{!1m>2V5ek_0mu@Dxmq=6{}%&tbsML7S_f(SQqPIeQbaYu@N@LCfF34VRLMOEwL50#x~d% z+hKd`fE}?DcE&E)6}w?~?14S87xu^NPR1!X6{q2JoPjfO7S6^wI2Y&Pd|ZGFaS<-YCAbuq;c{GoD{&RB#x=MW z*Wr5HfE#fWZpJOR6}RDb+<`lB7w*PAxEJ@~emsB&@em%yBX|^#;c+~HC-D@X#xr;p z&*6EzfEV!+UdAhU6|doSyn#3I7T(4?co*;CeSClq@ew}8C-@Yf;d6X}FYy(=#y9vD z-{E`wfFJP_e#S5O6~Ezk{DD957yiaS_`g5&{}YD&k74k?`^^6RQ8?!C7y%<!;vgK1LvSb#!{Imr zN8%_Pjbm^uj>GXd0Vm=loQzX&Do(@cI0I+mES!yVa4ycn`M3ZV;v!s(OK>SJ!{xXF zSK=yMjcaf%uEX`X0XO0%+>BdrD{jN>xC3|MF5HcKa4+t|{dfQm;vqbYNAM^f!{c}Y zPvR*&jc4#Ip2PEa0Wabuyo^`yDqh3ucmr?ZExe6)@GjoN`}hDK;v;;FPw*)|!{_({ zU*ao#jc@QRzQgzU0YBm={ET1lD}KZ8_yd39FZ_*v@V`qw|NZ>@CoKCP!(dnphv6{- zM#M-M8KYoSjE2!M2FAo#7#rhYT#SeDF##sTM3@+ppo0P>D%9wrL5m*x7+?q{#blTq zQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^m>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_- zOJGSXg{83!mc?>d9xGr)tb~=Z3RcBxSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($X zTVP9Ug{`p-w#9bX9y?%1?1Y`M3wFhB*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bG!c zN8m^tg`;r{j>T~}9w*>LoP?8c3QomoI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0 zSKvxqg{yH5uElk@9yj1d+=QEP3vR`2xE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJ zPvA*Bg{Schp2c%`9xvcUyo8tW3SPx)cpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{& zU*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y@elrc&+LEs2SZ~R42$6~JVwBX z7zra|6pV_|FgnJ-m>3IVV;qc&@i0Cnz=W6x6Jru|P@qJG8eKGK(L)~t48f$B43lFD zOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-C!TxiB~8!MvCc^J4)lh=s5)7Qv!e42xq4 zEQzJCG?u}#SPsi$1+0jburgM`|8BAV_Z?h~xjNRsnpg{KV;!uE^{_rRz=qfe8)Fk} zip{V&w!oIy3R`0vY>Vx%J$As3*acz=gO77vmCK zipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q591L$ zipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!wv>U|0-? 
z;V}Y6#7Gz!qhM5whS4zw#>7|{8{=SHjEC_t0Vc#mm>83wg90Tg)aar?iyrzIU&yZK`exYu?QB$ zVptqYU`Z^6rLhc_#d264D_}*egq5)hR>f*q9cy4stcA6)4%WqbSRWf;Lu`bNu?aTC zX4o8CU`uR;t+5TZ#dg>pJ77obgq^VqcExVk9eZF;?1jCt5B9}=*dGVrKpcdFaR?5@ zVK^K|;7A;Wqj3z5#c?RW52E$@F437~o zB1Xc<7zLwZG>nchFeb*r*cb=nVmyqG2{0ih!o-*a9TX^0p+*-CTJ+Gz07Eb-Cd1^I z0#jltOpR$UEvCctm;p0lCd`akFe_%m?3e>{VlK>$c`z^L!~9qP3t}NGj76|07Q^CL z0!v~kERAKbESAIaSOF_yC9I59uqsx=>R1D7VlAwVb+9hh!}{0&8)74Dj7_j9HpAxF z0$XA$Y>jQOEw;n<*a16YC+v(}uq$@M?$`r+VlV8CeXuX~!~Qq`2jUa4Js2={N&t;w+qvb8s%s!}+)X7vdsZj7xASF2m)x z0$1WHT#ajREw01$xB)lfCftl$a4T-Z?YIMX;x62cdvGuA!~J*w58@#_j7RV&9>e2! z0#D*8JdJ1YES|&jcmXfsCA^GR@G4%z>v#ii;w`+5cknLW!~6IEAL1i?j8E_>KEvnu z0$<`Qe2s7LExyC|_yIrSC;W_G@GE}9@Aw0M;xGJ-e=t-;_CJQkFc=oYVR(#y5it@* z#wZvSqhWN6fiW=_#>O}p7vo`kOn?b75hlhY=%7G}3N^ZD(4vPv1{i`#F&QSu6qpiI zVQNf+X)zt9#|)SeGht@Tf>|*eX2%?u6LVp1%!7F`ALhpbSP%p5^R>vAx6Ki2@tb=v29@fVO*bp0GV{C#=u^BeU7T6M7 zVQXxIZLuA;#}3#LJ7H(+f?cs2cE=vr6MJEA?1O!=ANI!qI1mTnU>t%&aTpHA5jYY@ z;bUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAhtaTzYh6}S>t z;c8riYjGW}#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri z;cI+@Z}AVx%J$As3*acz=gO77vmCKipy|0uE3SJ3RmMA zT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q591L$ipTIcp1_lM3Qyx1 zJd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!wIFT9}{3gOoWLs2|6fHqC$->8no!4j{$~YQcQ-)F$Jc?RG1pmU|LLv z=`jOl#7vkOvtU-thS@O(=EPi>8}ndZ%!m2002ahTSQv|7Q7neVu>_XHQdkv02a#7(#vx8PRXhTCxm?!;ZV8~5N|+=u(|03O6cco>i1Q9Opn@dTd4Q+OKB;8{F} z=kWqw#7lS?ui#a@hS%{1-o#sY8}Hy_!ytyQ+$Tc@ddubSNIy=;9Go$ z@9_hE#83Dczu;H=hTriA{={GS8~@<{ihBQv@_*-m&=>~8VmJ(s5ilY~!pIl}qhd6S zjxjJM#=_Vb2jgNqjE@O0Atu7am;@aZC{dwC7Y$nU(8mBnFexU(SI818ZU}tc`WBF4n{P*Z>=1BW#RKuqigf=GX#TVk>NoZLlr2 z!}iz#J7Op7j9suRcEj%21AAgG?2Ub}FZRR!H~D z!}YiUH{vGTj9YLkZo}=k19##s+>Lv1FYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ z!}E9nFXAP8n18?Fjyp4D8F5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U? z!}s_BKjJ6+j9>68e#7th1ApQ#{EdI`e+B*jL}mYD7z~TyFg!-Uh!_bYV-$>v(J(s3 zz?c{dV`ChQi}5f%CcuQ42oqxxbWor~g&JKnXwgF-0}R2Wm<*F+3QUQqFg2#Zw3rUl zV+PEKnJ_bE!K|1Kvttg-iMcR0=E1y}5A$OIEQp1$Fc!h0SPY9}2`q`Fur!vzvRDqw zV+E{;m9R2a!Kzpdt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88a zV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{a5Rp=u{aLL z;{=?DlW;Ol!KpY6r{fHqiL-Dv&cV4j59i|oT!@QsF)qQSxD1!$3S5b+a5b*MwYUz~ z;|AP_n{YF3!L7Irx8n}niMwz&?!mpd5BK8%Jcx(zFdo69cnpu@2|S6X@HC#mvv>~A z;|08km+&%P!K-);uj388iMQ}J-od+g5AWjxe29Js)Gh-IairFwb=D?ho3v**0%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$V;L-q<*+*1(!r3u|K?tc&%qJ~qIH*a#bA6KsmjusOECme>kgV;gLX?XW#| zz>e4nJ7X8@irug~_Q0Ol3wvW9?2G-dKMufwI0y&h5FCoba5#>@kvIxR;}{%^<8VAq zz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8xCj^H5?qSQa5=8PmADF5;~HFx>u^18 zz>T;GH{%xEira8I?!cY63wPrl+>85gKOVq?cnA;U5j={=@Hn2plXwbG;~6}Q=kPpU zz>9bZFXI)wir4Tu-oTr93vc5cyo>knK0d&Q_y`~46MTx#@HxJ~m-q@_;~RX7@9;f- zz>oL|KjRntir?@%{=lF33xDGu3>BUK|6^zjgJCfohQ|mP5hGz_jDk@y8b-$$7!zY* zY>b0(F&@Up1eg#LVPZ^z4hod0P@{_mEqdrJeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4< z5?014SQV>bb*zCku@=_GI#?I$VSQ|X4Y3h6#wOSln_+Wofi1BWw#GKt7TaNa?0_Ay z6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNe191=z#vwQqhv9G>fg^Dgj>a)K7RTXuoPZN? 
z5>Cb`I2EVibew@RaTdv(J(s3z?c{dV`ChQ zi}5f%CcuQ42oqxxbWor~g&JKnXwgF-0}R2Wm<*F+3QUQqFg2#Zw3rUlV+PEKnJ_bE z!K|1Kvttg-iMcR0=E1y}5A$OIEQp1$Fc!h0SPY9}2`q`Fur!vzvRDqwV+E{;m9R2a z!Kzpdt78qUiM6mc*1@`159?zCY>17pF*d=b*bJLv3v7w4ur;>9w%88aV+ZVrov<@@ z!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{a5Rp=u{aLL;{=?DlW;Ol z!KpY6r{fHqiL-Dv&cV4j59i|oT!@QsF)qQSxD1!$3S5b+a5b*MwYUz~;|AP_n{YF3 z!L7Irx8n}niMwz&?!mpd5BK8%Jcx(zFdo69cnpu@2|S6X@HC#mvv>~A;|08km+&%P z!K-);uj388iMQ}J-od+g5AWjxe29D%9wrL5m*x7+?q{#blTqQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^ zm>qLqPRxb5F%Ra&e3%~#U_mT|g|P@0#bQ_-OJGSXg{83!mc?>d9xGr)tb~=Z3RcBx zSRHF%O{|5ru@2V7dRQMDU_)$#jj;(f#b($XTVP9Ug{`p-w#9bX9y?%1?1Y`M3wFhB z*d2RdPwa)gu@Cmee%K!e;6NONgK-EB#bG!cN8m^tg`;r{j>T~}9w*>LoP?8c3Qomo zI2~u;Oq_+YaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP3vR`2 zxE*)kPTYmNaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW3SPx) zcpY!xO}vG-@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K z_#J=XPyB_y@ehWI#s0_87zV>)I1G;wFd|06$QT8qVl<47F)${^!q^xG<6=CFj|ng# zCc?y+1RWG8QK3c`4O;Zj#{fexDJH|@m;zH`Dol-OFfFFT^q2uNVkXRtSuiVR!|a#? zb7C&cjd?IH=EMA001ILvER034C>F!wSOQC8DJ+d;uq>9t@>l^YVkNAMRj?{n!|GTA zYho>|jdidt*2DVP02^W>Y>Z8?DK^9A*aBN(D{PHzur0R3_SgYCVkhj3U9c;5!|vDv zdtxu_jeW2$_QU=-00-hA9E?M7C=SEnI08rFC>)Jra4e3)@i+k|;v}4mQ*bIy!|6B! zXW}fJjdO4=&cpe*02ksST#QR_DK5k1xB^$=DqM|ga4oLG^|%2y;wIdTTW~9G!|k{O zcj7MGjeBq}?!*0f01x6JJd8*1C?3P(cmhx2DLjp5@GPFg^LPO-;w8L{SMVxc!|Qkh zZ{jVyjd$=a-oyL&03YHbe2h=_xJ%n;wSu!U+^n_!|(V5 zf8sCvjejsyZ1z8f#xNKb!(n)gfDthgM#d-@6{BHvjDayR7RJUn7#HJVd`y4|F%c%l zB|SQBeu zZLEWJu^!gP2G|fAVPkB9O|cm^#}?QUTVZQ#gKe=Lw#N?G5j$aL?1Ejf8+OMY*b{qU zZ|sA8u^;xw0XPr`;b0tsLva`m#}POZN8xB3gJW?Vj>ic&5hvkfoPtwv8cxR0*UCP4=U zN>r%PMS~VS^fAB?Op3`cIi|prm85)v!9&z?xVKYhxX( zi}kQRHo%712peM)Y>LgWIkv!-*a}-?8*Gd1uswFbj@Su1V;Ag--LO0Mz@FF(dt)E$ zi~X=a4#0sp2nXX39E!tmIF7)PI0{GO7#xe^a6C@Hi8u)-;}o2V({MV@z?nD;XX6~4 zi}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1 zi~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=j zi}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!qv z6_@>wp)m}G#c&uNBVa^~gpn}{M#X3t9b;fjjD@i=4#vfJ7#|a0LQI5-F$p>-P@+PO zE*iAxp^pKEU{Xwm$uR|{#8j9X(_mUmhv_i`X2eXG8M9zk%!b)92j;|Fm>ct8Ud)I2 zu>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XR>VqJ8LMDbtcKOG2G+z{SR3nLU95-o zu>m&3M%WmeU{h>{&9Mcx#8%iE+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cY>oQBhJ2F}D;I2-5ST%3pV zaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK z@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B02mZug_#6LVsCevu z42@whEQZ7I7y%<>vBd{AMznj3n9V z7E57iEQ4jS9G1rlSP?5>Wvqf#u^Lv#8t8#Fu@=_GI#?I$VSQ|X4bc-DVPkB9O|cm^ z#}?>?EwL50MsM^%U-UzNY=dplhV9Uf?Xd%P!~hJ$PS_bu48mXx!7kVpyJ2_ifjzMo zI*ZsI1b0-1e}PIa57H8sW=U%;|!dM zvv4-f!MQjO=i>rgh>LJBF2SX^442~yT#2i2HLk(6xDMCj2Hc37a5HYft+)-h;||=3 zyKpz|!M(T-_u~OPh==en9>Jq{43FapJc+09G@ik;cn;6w1-yut@G@S(t9T8s;|;ut zx9~RJ!Mk`5@8bh}h>!3wKEbE>44>l*e2K5{HNL^O_zvIW2mFYi@H2kFulNnW;}86a zzwkHyK`jaQe-vn76m-L=7!9Li40Oks7z<-#9E^+cFg_;0gqR2uV-ie?$uK#lz?7H@ zQ)3!Ti|H^uX26V?2{U6B%!=7CJLbTgm;O(V-YNh#jrS*z>-)B zOJf-T~}9w*>LoP?8c3QomoI2~u;Oq_+Y zaSqPKc{m>z;6hx4i*X4q#bvl0SKvxqg{yH5uElk@9yj1d+=QEP3vR`2xE*)kPTYmN zaS!greYhVF;6Xfuhw%s=#bbCJPvA*Bg{Schp2c%`9xvcUyo8tW3SPx)cpY!xO}vG- z@eba_dw3ro;6r?bkMRjU#b@{&U*Jo8g|G1qzQuR=9zWnm{DhzJ3x36K_#J=XPyB_y z@egWAx&Na;1EZiDM#X3t9b=$7#>7|{8{=SHjEC_t0Vc#mm>82_QcQ-)F$Jc?RG1pm zU|LLv=`jOl#7vkOvtU-thS@O(=EPi>8}ndZ%!m2002ahTSQv|7Q7neVu>_XHQdkuVU|Y0dJG5ha?0_9H00XfTc19C}Fc?Fy3wFhB*d2RdPwa&b?2Ub} zFZRR!H~D!}YiUH{vGTj9YLkZo}=k19##s+>Lv1 zFYd$rcmNOLAv}yn@F*U`<9Gs3;we0hXYeeZ!}E9nFXAP8n18?Fjyp4D8 zF5biY_y8Z`BYccc@F_mS=lB9&;wyZOZ}2U?!}s_BKjJ6+j9>68e#7th1ApQ#{EdH5 
zOUC^l1sWIy-7qRf!{`_T-7zM{!q^xG<6=CFj|ng#Cc?y+1e0PiOpYlqC8omEmkzUYVk*aq984cnm|+hYgphyfUgov<^S7=*zXf?cpHcEj%21AAgGbYO4ngMG0d z_QwG@5C`F49D+k}7!Jn~I1)$UXdHuMaU71v2{;ia;bfeGQ*jzj#~C;iXW?v|gL82n z&c_9~5EtQMT!Kq+87{{axDr?4YFvYBaUHJ54Y(0E;bz=|TX7q1#~rv6cj0c_gL`ow z?#Bao5D(#DJc38@7#_zHcoI+HX*`2x@f@DV3wRMP;bpvnSMeHN#~XMPZ{cmcgLm;B z-p2>{5Fg=Ve1cE$89v7s_!3{?YkY%m@g2U$5BL#3;b;7UU-27$#~=6;f8lTZgIaR# z|0vMFDCmY!F&ak480d~MF&4(gI2ae>VSG%02{92S#w3^&lVNg9fhjQ+rp7dw7SmyR z%zzm&6K2LNm=&{OcFch}F&E~>JeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCI ztbi4<5?014SQV>bb*zCNSQBeuZLEWJu^!gP2G|fiu@N@LCfF34VRLMOUf2>_VQch8 zAM`~(^v5>X7H!xL?bseWU`GtVK#~#=dd!Yk+V;}5`{jfg{ zz=1dj2jdVNioc zz=gO77vmCKipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%V zz=L=Q591L$ipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVG zz=!wtbEjES)@HpaoY7!TuP0!)aBFfk^7)R4Xa}f^uU@}3u|K?tc&%qJ~qIH=!uQ6F*d=b*bJLv3-rR4*a};tH~OG2 z`k_Cz!M13_c4)`;*a16Y00v?w?2IM`VK9bZ7wn4Nusim^p4bZ=*cY>oQBhJ2F}D;I2-5ST%3pVaRDyG zMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK@c zNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8Nc9H{D$B02mZug_#6MAmXiBF3N$bZ zx?xm|hS4zwx?@a?g|RUX#>IFT9}{3gOoWLs2`0s4m>g4JN=${RF%720beJA9U`EV@ znK27y#cY@zb6`%)g}E^g=EZ!N9}8eXEQE!z2o}X+SR6}WNi2n>u?&{Qa#$WKU`4Ei zm9Yv|#cEg`YoG_##9CMz>tJ21hxM@mHbhTsgpIKYHpOPx99y6lw!~K08okj6ebEp7 zu?@CG8@59`w#N?G5d$y~J7H%uF$jY(1iN5Y?1tU32lm8X=)m6C2m4|_?2iL*AP&O8 zI0T2{FdU8}a3qex(KrUj;y4_S6L2CZzFARfZQ zcm$8)F+7eZ@FbqX(|88Y;yFBz7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&# z_ynKgGklIO@Fl*&*Z2nC;yZkgAMhi7!q4~xzv4Iijz91x{=(n*2enk(|52cUQP2&e zVl<47G0+`jVl0e}aWF2%!}yp06JjDvj7cylCd1^I0#jltOpR$UEvCctm;p0lCd`ak zFe_%m?3e>{VlK>$c`z^L!~9qP3t}NGj76|07Q^CL0!v~kERAKbESAIaSOF_yC9I59 zuqsx=>R1CkuqM{R+E@qcVm+*n4X`14Vk2yfO|U68!{*omy|5*=!q(`GKIn^n=#Oo% zE!wah+Oa)$z>XMzf!GN88#yz+f_u+m#fCupq9>ybh z6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b z6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-#y_Z~=KhZY4UB?r7!{*o zbc})S7!zY*Y>b0(F&@Up1eg#LVPZ^zNii8F#}t?nQ(|=z%q{7S_f(SQqPIeQbaY(GweCV{C#=u^BeU7U+d7u@$yPZ}dT5^h1AagKg1< z?a+?xu>*F*01U)V*cnX>!e9)+F4z^jVR!6-J+T)$us8O>zSs}@;{Y6pgK#ho!J#+| zhvNtwiKB2dj=`}w4#(pJoQRWfGETv%I1Q)c44jFxa5m1txi}B!;{sfWi*PY6!KJti zm*WatiK}omuEDjq4%g!b+=!cSGj74HxDB`C4%~^ma5wJ3y|@qe;{iN~hwv~S!J~K# zkK+kEiKp;1p24$t4$tESyoi_ZGG4)}cnz=P4ZMlB@HXDTyLb=p;{$w%kMJ=*!Ke5P zpW_RBiLdZAzQMQn4&UPk{D`0MGk(FZ_zl0~5B!P0@HhTJEe-d76lh=+bi=3^4WnZW zbjO$&3u9v(jEnIwJ|@6~mJs)Gh-IairFwb z=D?ho3v**0%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$V;L-q<*+ z)<6%eiM6mc*1@`159?zCY>1xN2peM)Y>LgWIkrGAY>BO~HF~2D`l28DV;gLXHf)D> zY>yqVBL-j~cEZkRVh{#n2zJ4)*bTd55A2D((1E?N5B9}=*dGVrKpcdFaR?5@VK^K| z;7A;Wqj3z5#c?SbyT1i(0EQZCg1eU~7SQ^Vw}aN>~}IU{$P!)v*S8 zU`?!rwXqJ?#d=sD8(>59#75W{n_yFHhRv}BdSOd!g{{#Web5*E&>!1iTeM+2v}1eh zfE_Ua1F;i!MiYZD7(=iNcExVk9eZF;?1c{OjeW2$_QU=-00-hA9E?M7C=SEnI08rF zC>)Jra4e3)@i+k|;v}4mQ*bIy!|6B!XW}fJjdO4=&cpe*02ksST#QR_DK5k1xB^$= zDqM|ga4oLG^|%2y;wIdTTW~9G!|k{Ocj7MGjeBq}?!*0f01x6JJd8*1C?3P(cmhx2 zDLjp5@GPFg^LPO-;w8L{SMVxc!|QkhZ{jVyjd$=a-oyL&03YHbe2h=_xJ%n;wSu!U+^n_!|(V5f8sCvjek%}$Ne7#8W;uLFe*mF=okatF($^s z*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21 z+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`&;x5? 
zEv$`ourAia`q%&)q9-=O#@Ga#Vl!-xEzk>FVk>No-spqA=!gE;2HTxDhwuX54~XaT{*O9k>&B;cnc6dvPD`#{+l}58+`vf=BTf9>)`S5>Mf2 zJcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H#|QWjAK_zsf=}@oKF1gM5?|qK ze1mWC9lpm8_z^$hXZ(U+@f&`}ANUi0;cxtdT6)cT=_t^^DCmY!F&ak480d~MF&4(g zI2ae>VSG%02{92S#w3^&lVNg9fhjQ+rp7dw7SmyR%zzm&6K2LNm=&{OcFch}F&E~> zJeU{rVSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi4<5?014SQV>bb*zCNSQBeu zZLEWJu^!gP2G|fiu@N@LCfF34VRLMOUf2>_VQch8AM`~(^v5>X7H!xL?bseWU`GtV zK#~#=dd!Yk+V;}5`{jfg{z=1dj2jdVNiocz=gO77vmCKipy|0uE3SJ3RmMA zT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q591L$ipTIcp1_lM3Qyx1 zJd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!wtbEjES)@HpaoY z7!TuP0!)aBFfk^7)R4Xa}f^uU@}3u|K? ztc&%qJ~qIH=!uQ6F*d=b*bJLv3-rR4*a};tH~OG2`k_Cz!M13_c4)`;*a16Y00v?w z?2IM`VK9bZ7wn4Nusim^p4bZ=*cY>oQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-Q zhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z( zhwt$Ne#B4s8Nc9H{D$B02mZug_#6MAmXZ5E3N$bZx?xm|hS4zwx?@a?g|RUX#>IFT z9}{3gOoWLs2`0s4m>g4JN=${RF%720beJA9U`EV@nK27y#cY@zb6`%)g}E^g=EZ!N z9}8eXEQE!z2o}X+SR6}WNi2n>u?&{Qa#$WKU`4Eim9Yv|#cEg`YoG_##9CMz>tJ21 zhxM@mHbhTsgpIKYHpOPx99y6lw!~K08okj6ebEp7u?@CG8@59`w#N?G5d$y~J7H%u zF$jY(1iN5Y?1tU32lm8X=)m6C2m4|_?2iL*AP&O8I0T2{FdU8}a3qex(KrUj;y4_S z6L2CZzFARfZQcm$8)F+7eZ@FbqX(|88Y;yFBz z7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&#_ynKgGklIO@Fl*&*Z2nC;yZkg zAMhi7!q4~xzv4Iijz91x{=(n*2enMx|52cUQP2&eVl<47G0+`jVl0e}aWF2%!}yp0 z6JjDvj7cylCd1^I0#jltOpR$UEvCctm;p0lCd`akFe_%m?3e>{VlK>$c`z^L!~9qP z3t}NGj76|07Q^CL0!v~kERAKbESAIaSOF_yC9I59uqsx=>R1CkuqM{R+E@qcVm+*n z4X`14Vk2yfO|U68!{*omy|5*=!q(`GKIn^n=#Oo%E!wah+Oa)$z>XMzf!GN88#yz+f_u+m#fCupq9>ybh6p!I?Jb@?i6rRR2coxs$dAxuZ z@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9e1R|V6~4wd_!i&cd;EYO z@e_W=FZdO|;dlIjKk*m-#y_ZK=KhZY4UB?r7!{*obc})S7!zY*Y>b0(F&@Up1eg#L zVPZ^zNii8F#}t?nQ(|=z%q{7S_f(SQqPIeQbaY z(GweCV{C#=u^BeU7U+d7u@$yPZ}dT5^h1AagKg1*F*01U)V*cnX>!e9)+ zF4z^jVR!6-J+T)$us8O>zSs}@;{Y6pgK#ho!J#+|hvNtwiKB2dj=`}w4#(pJoQRWf zGETv%I1Q)c44jFxa5m1txi}B!;{sfWi*PY6!KJtim*WatiK}omuEDjq4%g!b+=!cS zGj74HxDB`C4%~^ma5wJ3y|@qe;{iN~hwv~S!J~K#kK+kEiKp;1p24$t4$tESyoi_Z zGG4)}cnz=P4ZMlB@HXDTyLb=p;{$w%kMJ=*!Ke5PpW_RBiLdZAzQMQn4&UPk{D`0M zGk(FZ_zl0~5B!P0@HhTJEerR56lh=+bi=3^4WnZWbjO$&3u9v(jEnIwJ|@6~mJs)Gh-IairFwb=D?ho3v**0%!~OjKNi4(SO^Pa z5iE+uusD{$l2{5$V;L-q<*+)<6%eiM6mc*1@`159?zCY>1xN z2peM)Y>LgWIkrGAY>BO~HF~2D`l28DV;gLXHf)D>Y>yqVBL-j~cEZkRVh{#n2zJ4) z*bTd55A2D((1E?N5B9}=*dGVrKpcdFaR?5@VK^K|;7A;Wqj3z5#c?Jlc5@-pvL|S4kiI!AL zrX|->XeqT+T52tgmR3urrPnfO8MRDWW-W_sqHJ1rEr*s<%cbSk@@RRrd|H03fL2f| zq!reRXhpSRT5+v}R#GdamDb8=Wwmlzd98w0QLCg?)~aY#wQ5>*t%l~I)zoTfwY550 zU9Fy0Uu&Q>)I7CDT4Sw=)>LbzHP>2bURq17mDXDG)_gQy%}?{!+GuSxo7PUVYwfiT zT1PEF3)DJkoi$Sn(t@=Rt&7%G>!x+rdT2ehUYbMet@Y9RYW=kS+5l~!Hb@(+4bg^b z!?fYr2yLV`N*k?>(Z*`ywDH;mZK5_wo2*UIrfSo)>Dml!rZ!8Pt0%e3X%3T>sfN?Wb1(bj6~wDsBsZKJkH+pKNTwrbn7?b;4)r?yMmt?kkF zYWuYP+5zpLc1Sy{9np?z$F$?x3GJkIN;|Ec(avh;wDZ~p?V@%`yR2Q&u4>n`>)H+N zrglrat=-Y?YWKAJ+5_#O_DFlIJ<*-3+<)$N_(xn(cWtBwD;Nv?W6Wd`>cJ@ zzG~mJ@7fRTr}j(xt^JYRuj@iL^eDQU9#xN~N7rNM?s`l;mL6M=qsP_b>GAahdO|&s zo>)(!C)Jbb$@LU^NG|~ndO^LAURW=p7uAdD#q|<;NxhU_S}&uQ)ywJS^$L1Ly^>y8ucBAgtLfGC8oGyG zQ?I4h*6ZkX^?G`Jy@B3P_tYEdjrAsaQ@xqqTyLR!=`HnEdTZTV_tAZIKiyw%qqo&< zdOO{&x7R!99rXY`Q17I7)=fP~57tBUE_zqJo8DdTq4(5#=?=ZO-be4N_tX391N4FV zAbqetL?5aT(}(LL^pW}~eY8GCAFGek$LkaHiTWgcvOYzhs!!9W>ofG3`Ye66K1ZLc z&(r7Y3-pEhB7L#GL|>{e)0gWj^p*N5eYL(uU#qXv*XtYfjrt~iv%W>&s&CV`>pS$F z`YwI9zDM7y@6-3|2lRvbA^os^L_ew@(~s*X^ppB2{j`2YKdYb9&+8ZTi~1$~vVKLs zs$bKu>o@e9`Yrvoen-En-_!5w5A=unBmJ@dM1QJ3)1T`v^q2Z8{k8r^f2+UK-|HXr zkNPM5v;IZ@s(;hJ>p%3L`Y-*r{zp!JT?k=_D8fxd712a=5kt6(m?D;lE#io{BA$pZ 
z5{QH%kw`3(h@>K!NG?){lp>W#Ez*dzBArMtGKh>KlgKQxh^!);$S!h-oFbRVE%J!G zBA>`F3W$QDkSHvQh@zsHC@xBflA@F-Ey{?pqMRr%Du{}rlBg`Ih^nHRs4i*<4^dOp z617DgQCHLx^+f~GP|TqO&kXkO&qbqKoJ%x{22p7m@KA?(@VwG4e)`+!Yomek6h>c>C*ete)tzw(lE_R5WVwc!0_K3Y=pV%)Bh=byg zI4q8cqvDu2E>4J(;*>Zo&WN+(oH#Eoh>PNqxGb)StKyotE^dgM;+D8A?ufhMp13a_ zh=<~lcr2cXr{bA-E?$V2;+1$U-iWv2op>)kh>zlv_${>V$e zZV1CLq8M&QR3n-Z-H2hh8!?SoMr)JSF|H&PfWjZ{Wz zBaM;PNN1!sG8h?+Oh#rSi;>mHW@I;V7&(nxMs6dIk=Mv)hz77#>DVqn1(IsAJSM>KXNo21Y}} z(`aNgHkuesjb=u3qlMvRv@}{7tqpI($M7}$41c4I(bljT?F_ro-soU-Gy;r3qm$9u zFpVH1*a$JY7+sBSMt7r!(bMQy#AEU3)&**OqFa{cfjKRhbW2iCA7;cO(MjE4x z(Z(2KtTE0QZ%i;I8k3C4#uQ_!G0m86%rIsevy9os9AmCA&zNs4Fcun%jK#(hW2v#s zSZ=H^RvN2})y5iQt+CEnZ)`9&8k>yG#uj6%vCY_S>@ap3yNun&9%HYu&)9DqFb*1r zjKjteWJh5EEfyOoB-<879XRm=aTA zYD|N!Q%^@aJx2K1r+mV3z9K9KxgHDUlaKS29dlq#%!Roz59Y;um>&yZK`exYu?QB$ zVptqYU`Z^6rLhc_#d264D_}*egq5)hR>f*q9c!Qm*2G#^8|z?QtcUfn0X9TWY=n)m z2{y%M*c@A+7q-M!*c!dj2Yt~G{jm+UMH{w5JGRFT*bxIT5IbRKG%*N+F$B9{SL}w} zu?P0VUg*Hy*a!P!KkSbKa3BuC!8inm;xHVJBXA^+!qGSe$Kp5~j}verPQuAJ1*hUP zoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^Y zypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00{VlK>$c`z^L!~9qP3t}NGj76|07Q^CL0!v~kERAKbESAIaSOF_yC9I59 zuqsx=>R1CkuqM{R+E@qcVm+*n4X`14Vk2yfO|U68!{*omy|5*=!q(`GKIn^n=#Oo% zE!wah+Oa)$z>XMzf!GN88#yz+f_u+m#fCupq9>ybh z6p!I?Jb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b z6rbU9e1R|V6~4wd_!i&cd;EYO@e_W=FZdO|;dlIjKk*m-#y=?k3}C%<6lh=+bi=3^ z4WnZWbjO$&3u9v(jEnIwJ|@6~mJs)Gh-Ia zirFwb=D?ho3v**0%!~OjKNi4(SO^Pa5iE+uusD{$l2{5$V;L-q<*+)<6%eiM6mc*1@`159?zCY>1xN2peM)Y>LgWIkrGAY>BO~HF~2D`l28DV;gLX zHf)D>Y>yqVBL-j~cEZkRVh{#n2zJ4)*bTd55A2D((1E?N5B9}=*dGVrKpcdFaR?5@ zVK^K|;7A;Wqj3z5#c?O}p7vo`kOn?b75hlhYm=u#?a!i3KF%_o9G?*6CVS3Df88H)P#w?f> zvtf43fjKc3=Egjj7xQ6$EPw^E5EjNFSQLw4aV&u)u@siZGFTSNVR@{86|oXl#wu79 zt6_DlfgV^BYhi7ygLSbU*2f0e5IwOGHpV8{6q{jlY=K_b5?f(w^hO`_ML+b%HrN(z z*beR39y?%148TC_gq_jEAPmM3?1Ejf8+OMY*b{r91AAj1?2G-dKMufwI0y&h5FCob za5#>@kvIxR;}{%^<8VAqz==2sC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8xCj^H5?qSQ za5=8PmADF5;~HFx>u^18z>T;GH{%xEira8I?!cY63wPrl+>85gKOVq?cnA;U5j={= z@Hn2plXwbG;~6}Q=kPpUz>9bZFXI)wir4Tu-oTr93vc5cyo>knK0d&Q_y`~46a4%8 zycF^awfucv6#4tSVrEL2aQ!?lmE3lispWelKhH~}!XCOuv_ZDl!#tyDx-g=+MUD99 z{KXD4ttuGd&-v*>cV0Hk%S^A5p1QaE;h(4O>#6&B>i(X38&AEh|KFp`Aj^dKgm^{` z@;BT02bmdV+F@o=d)iX^cJdDJ^A5`68)*9HaTaiIV3l(>Gs`0Kw}x4)@>x}R&ls&b z`UiXac?Wx&*<`@W%r4(;$-Ql!wnnmO4pqp@%&CH@Y)u_zE)}pf_t0!DJan6vha10a z%q`>IUS=LuFzink^Q!W)ZQKUQA1iofb3UhNnv8~fMN9|#$jb7m%B)Y#uWE_pQwzwr zx5F%`3U+Xqg{)!$cI$8hz0ATg?J$d2QPURWp?SD@=<@fJMOFC_yE?WI?~owbu-$Ck z)rt7n9@%N6V%EOJ)xLJsGIpzBoUKwqMZL_D5_x=29cC#Nus!!OOIsU>Y%l-*cC(C% zI2%9?4mWw~x-LI3Y>~1Nis*6#1^f8h-g=qkRF++r?{*>I{B0k-%Ti!C->o0KR+AgDC#y@s4T~BwK&$wK85Yh8-P9KE$p%C*r)Q z^^p?oelUEBL6^LZs`pUQ*7JjN=*sF!VD({>u|MOw-wU0F{ z+NjE`Pi?DeiR4plGA>)Cohm3>#cmY~uv>>KTcy2BJIoGNG;FJMROLhL>ewQ*il>`j zaI4@RW`N9cwNIejc9@;y`@ikeSw+1}Q{p_*AQgz@NP}fuwoiyE=w^%RFuPa)|H0P2L)5SYd-IGc623fQ98rs+n=q{AF(1tT=w zD7DXjX}ZyBC)srFgKVT25Ir`My(rqUAk2=r9?#f-90xnbti_ezl`cX`=_TD?8{ z#wh33ZFF1t=HByMSEOZv`t($eijwH~mCz<0BR@wr@mO^@VQ0>`(9f6mS^4?SmyB93 z*VX?&UBjbNE*eJ17--F~@#;gZ;W$BkT4=jXlu3s<$qGhjx5;Xs|I%(#)K32o+HI=* z9692qsl$;YZo2wdc}28_f*f%(WZGfQw4%-t=WabI%u?k;?5cGmj<^i25jWe~caGZk zpAk1#MZL^<66cJVuL6&jdz^i1g^UPMut;o)7jf~5dU#kkrmS1NT3$R;F zB3pjFOgqdCRy6Ef->Axm*wy(*Xn9Y=FQk*XNoKfuV6)tIm|Nugzdf*3MZL^z5_z2R zA$+?EM7Z$nP_fWf-l>x69;%hM-YZz|6|MJ5&Uon4_%2yaHpFg8xSrdiDsk5NKfjmp zh_hE!W*zN5RZE1U-5*-B8WRUp(s@+Y`+q&!gR-#fjYAUE8;8}tavoUCBYWeBOgqe@ zRy1sH98={(?CP8%_J&)>cHR8_%;Pf4)gLG1w!=Ir-~a88Q!46Zo|ZVr%^4MlWJa8o 
zaoMitRKc(_;=C#!+E*7;vMQG;O@1fIQ$2ryA_WTOE0izH%e{7OIg~G|x_(qGM``B> zzNCWI-RiQcDv~3)BIB~nuBw8v&8}I+0_@f)$Tqt!(+=~76%E^FH&yu%yE@*8Z6^GJ z&08|=YOvdK+hN|3@BcQ~T^02*?@648d0z!0Im`z#E*tEjDj2rG9;x!q2D9$!|HE68 zHN79J%B;qDqH2lcQ=iJXY@BDRplqDyR}-TLlLA`v!I}qsoZu)?zInSjF8d$>o76YDaf7nk*r2 zD$%V9VyFtNd!Co+F1O{^=zDVEV1XAaA6|9z&6Cu7cLif>gA+DwA(O(5g`US>jxt9uj4ZHJjy?eK5+CQ)Jeq4JxW>|UX3 zk@Rj-RVG63CJWt99c*%|$hWY4n?i;>qqg$(_O&dzVu6WZCXIZ;5yWoH+(vcq2G3aN-{ zGY@01T53w>x+TbIV7+9%^^(0EL&oG7Ei6NFc}VT}?Lj%b2q_DnwF2 z#qbqWa#c`1w1Ubq=I=18NYvn}s`ikBOLemvT-8+CVOF=oYIMoDWR0#Gsz8Wcbw!B7 z^!S@EhrN|w)0wZ@J;YzFyUA~$DP3c&mW(>g+G;1~V|0YER!8M{nROM;v#V!qhHsww zDy9y|c|~cUl97+KhBC{ZTaEhmIWM@QE& zI7e21N=AMvfig>WM<<-mbsyj@n!wj;bVY?$(<%ihSx&G~rN3QM&v3BaB zc8at+x~d#6vzx+s4&ANI@ZHfv#X`HIr%FbC4!vZStmmo2bXfJsMb_TdM%YExJ}M%s zlZ*TRX_2*Wgd#%z*D`C{{jB=l+df#U+WoDR)%pXh&G1J)kb0QcgRCO*s%qUkZ12_5 z^I#csp8AkTDi}%y@_Mc=%fnRCTH$S?*0JR`VSj&fxC}bi+12BR?Sraygp7q*Sskg8 zUgjvZd$=JwT18}A$kwoi=-)qe{GW!P{HU<^u`yxmiR@1%GT(YdT*{HHD%XXqKSQPEQi~N1`)D#t6$r7bc9b`d*?;q0H*f#mHcn5vQ`$5SHN`=H3rtJ@^TEQwpU|dgVbSew4%--;QZ#Y zN#%#wRm-@pfS$4B61%m+AQ#$$%*`^(bxGVJw;kqI`5t+PZ&Nv5=5~ei)OJ{#;X8b% zipg%^!m>*x!=74bSMQctvL5+w*?S_XXD{{qhd*@6T$Nu749ndG;BSQ-R_t-|2$|%sOMSOdYsE#FXnz5n<&1fl_)U! zj&j(640#euuQuG$OPTet$BD%8Bh>ReY(45h;e`q`QGfGp6BwA^d?}NzQ~~opRNkzDv;|BeZT%z@H>(U{)DeUlh-`w_zSH-moYipghb6Y zL#k$*8s%!XMNw&o>1Kt)&bFv(hY-6OeR8%%`{9jDplws@nznH$Ms7I)WDm8smtAJh2TDzL8b~Q)sYHHfmj12R41N*;!9FQ%S zM(ytGy|mV5_})v$$1z#c%VfmKnnA{#6=aO0f=uBn$n2^h%w)~-w}PyZRFExv1=(E{ zgqf^4{#KAPk_vKB0nb0VRnj?IBM!GbGE2_cyb?8M^Qn?@&Z+^f=4^hIc9;dMaM(Fp zP!$NVt05=nT%o`Dt~p!SnXiVD%rEjc-!*58I`h>Kk@>~`<_nLg9zxF9;?8{4^{zQ9 z7rN!o;QTw7C1lWHmQ?##UlzhGZd*m%>#Bn8W+_=lt_ha5b|_=*kWD^#+1=$Zv2}CR zS5{`Q!d*_4k-x@LI6Jt4wHbbdR8%oJ(fB@5NhO^PYrQ-FR#!fBPFI!%oO8O0RolOF zy6WE&)gqK|Er?f_aXEHtsDfed*&eFAIu`jeI59-6qr7$9U7h~F`_AF~ygcQc=EMA0 z01ILvER034C>F!wSOQC8DJ<>$u2WMrjP+fomTHj*57xC^t>nBS*HOv;)?;;L0r_30 zo*_jSgRHGkk0MxemiQI{anQYQ4}$n>I4!?2WdO>~9Odf4hJ8x66>c z%(j=P@zg=>AjgyHWi_5UsR?d;sS0$~MvWIV52L5+ujQinOniaLjb Ub8&yL$`7%twsT!yLPGNVfBy>S&;S4c diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_2.7.14.pickle deleted file mode 100644 index 555be58cc33acbb34a2669809cd7528ba83fb329..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132762 zcmc$^1ymeO&?q|Y?z3?x?(QxmL^nzfbcXxMpcXxMpcb7M_K=S32d;jy! 
zd-pEqOifo;S66jOclFQ<@>yVj8;y#Ki0o|H4LHD!M1&?qT1a?8Xk0{SVu7T@NN-eR zVgWx16%rTODWsZBkQ5agS=$*!S?Ir${*4OrS$KdOO6qD6L2jg-H`21mzYmOW*Df(K zDUqrc+J&;H+`TQ@XE6b8GA`a58yXYUHPT`SyYY||GK&j$By|L#oJT}tOj0Oi@qu6< z5C{xxWC?C5BsnoM#2XnCniSPBg}@RAyO9o^A`&gh=LX{f!Q?Q%#-Wz%K|QvRN`Qw5 z@{mCuIXo^oHla&_@OW=zf!O4jq^MA@*A~p?7akv%nB+|kPogX}g^IG*%HL~gfk1yZ z7-r9}2iT%e{Zk1dlDZ^BQkLO{>Xotu1Z@H{AP{kb<${8O>io2S{zo9-vsWoD(H?65 zB@IXl8|-cX00$mj!OlwnpmvRSJ1~Mq<~eZfv`KFq2o=a*`GO4_?%X}nfwLMkz2U%@ zx1Z#@Xu}GLJ-az@IHDZ6C?(lC$A!j5TCSkv0gbG5o)nng4M)W#SsBvI;y1Z%`moT% z$P~+`T0G<5DeRT9y(FBf9hX$1SSnE_pOrblBYRZO?YJZ|jmbbzQRB z#zs=LL{lM_3h`7(q(U+kQmK$mg-j}BQ{gYA^Ci3S)CjZk`>X;1ZoEZgsMi+TDwt-_ zv^7YJ#8d*xDrDy@g;ANmz_qp{8^Sj7SvTU9%(_`0EqNzEc7JJfIIUmvUZ zf6ekYLdq&(&sNfBl}eq>D(%KQ*^DAWyI5rc;Y5%9NDE%NN^)XUTx4RRoiwbn|24y3 zh$yR^&Aq(Os^D<Rr6WZ?TH4t(Zr-sZ<1Bx&#XmRHQiW>fA()=)pEm;wmE8hY$VS)UTY)kIN}8c zcxrfRd1`y=B-;`>0;VLnx^{Mq?3@%5=B!OUkE2?U&#M22Y7PDv(a_e$PJEaRu`H;t5R*aW*))DGrI9aXFswxwD>^4jrn6R(Mqu2kuG+0Tb8}s`nB@fDAYkET8Yl9!ywU0at5Vv zu#$t^bZk_(H$Ju9G00Oa$j$r(m1uRc>4})g&;&ov&UOorjfzW7icGY+*bUf5TU~=} zIs8bgTaazaso$Z_KC0d6`jG%l5YqUEc%OY=Ccm}YFkpD3J+)WhRs(w;u6*FSu& z_n$sTHB7eSD$<|F`uvw6e$!A^Uz>41pVi;*f2=<$qvG19q^JRbuq|ksIB+#vn}HtZ zBr?cn4gQBDL;jQmtCynm(EpMmrJu5f*$jvKtP%fUIPy=1Osd#bBHKsBS)=}o?O&-V zYqZULjL#bDXa1)?q!WN4O_FV6jq^BRINoPX@YDS@04F-8nq=o#&`GqQvlx)=^@u<7 zP5v(-QZn8Yo9R@aHSHgoP5+x_jU!{Dei?hle=+?l6=lt|na}cBv;E9%8Rxhe+uiKr zV2I!At+|0lqQ`!unU8kVP1}6t{ns3SA)>7L_7n?z)|}4PPxkk&^Q`yV)lbHxo~T1&I>pCY z8$8bLyU}NDO5?cMp>liyOV!%;2JV+>Qgq+)Ut<1AM_F5K=G%PM_B7@@Y>SbO*~qjd zC7nY@3fGnRop+KX#TZC4g)ncLduaWe9LpLM`L zlJhvZ_Mp!?6yS*s1Uzv;9y_6W>_qCZf;@Ke^w?)|kDV?($w3}}X7t#(&|_ynkDd5D zU4uM-=DNcUePY^z*;zU~G{(ujM|{>%JDWR++)h2Q3D&XSI>tnWQBBf1DC>Cb!bOXh zC|$Z#u|mIpR8WvzI-KxiPa78U$9a{qP6qi?=P93cI#sdXhtM;B_*lJ9P8D)C*>lWi zowEzSuz36UcHU!8TcmW+QiV#E_&sYO%FpzI&${>zrkDO;8lKQ4sVfzdKv|aq+*IYn zsL+B<;yZMSx31J1)x+5#>NIX*T}^6aU9;!D?jh5r`>T$H?CG4Od&6hl{Ks^+{+P~L zjN2X}jZ5)Tf04Nm736V*a5pZPa*&C$?$lYe`R7kZjOw-+cWp85*$nQx;keKw>p|L* zCM7zS2nn%~A<1zGQQ;k8?9%OFP=D)DBkQpnOQ}b#Cuy_(O?JP9o+i7=#DtKP>e+hc zv!2^^cDtB(d!83*^CTwN9uZj}g@$VEZ#DLp+m!Xv*8i2ydTpQ5LlTq2?CQ~a<7QJv zq;Y-gMnmixIMjNV#?fD_+C^NXH!d_Lk!oyjY%!5lV&cO))C-LZZ69eLO5fXzKLi9@ zAAeUqX@#CQG_HLl)yR3K);*tm)@S?VX)hRJpA-|UFV4}!K940>UjzBnV(MFHu=U-e zriqZMR7yFVqEqFR()h=(()g!S?~whSI^sO<%9S#16OQo4E}zj$CE9eVo=-k6 z?hJ7Z;jvBN`L+07!p4XCyrd1>SfK)iixnvBrR;{!OFJXnNNDHCL@#5%dp^3cFvk-T zy{z2~_HuqQ-cLrvw)ckEorzw-Zik12ge6DWHy&{zA(R~p;gOVAwDGpib}q3cllhkiA0@o9-SGTO4zj$FJq~xSBgkhNU>WS9LfbA+}q4H9xad zylyw_L=7b+dJX&4V~2s)v|oH)m;DUzym3pZG2mp-)Ijj2b4cHLY}fXtcgTXh861pr zq2tZyALrcZIMtCilil=rGy9Rh%Rg@xhyE{xl{c%6un(>&w?f`*e)`mlp+9Vy-RZHt znfLoe+TD&fhadevy58vw04FmZuKZpEfQ+kxM>#OI@|znD+;(O_{tq^!Bhqzu;3xfZ z`;RuPxj1%%11FS$&`&nJe{@1)2Nny?JI8@p#K*%w+c05{@0tU57M1dTu_3pAK^F%G zH!XYUifjZdz|3gGZ|4Z2ANu3w! zpB5^%69NAcQ+fYE_(x3Tv%6DbD!=`f;u!^;A^zJwzh@M*n?7$LKk_f0QP`pXt7jCk z5x+g7sGt7dJfoP?q3*oI~xzd~`~zC%XxI zpKRx~?<2kCZM4r@!G3!FzDfECQuj#v6c<{<89;d}+O&35SMm?6Y(Je8!K9{$6sA=i zcK@|Q{;@ZDtEOgR`xV`13;j zv|X;YA8p@Wb^r6)x1xRF8SJg&^ftF2KxO~4ivL;F|Ew1ID_#R_iWC>F>kRnq!u9N? 
z&l}`N{>6psJM@2b;RZJ1w+lD))Bl?bH*$JxC8$4>TVp@^f0W$pyX*gNliROFXkrVI z>dS8XH8R-S)Di2CtkujfC>V&RWvx<9_W1q#pPwUspBqxL6!pg@;26kRConBbAX?NM#=?~dk{U4IOb?Q8SW?1Jc{crLI|AWv`(z%28>(J(3bO=cu zf~Bp0A)ERyzyJOE|F74*Z7ToY>+UfBo7|!QAWBirKF5Upi_GDv!r`acSJJJ`5iY+{@te}230#yYqZ-%n)|m&T^bZ#MCNWs~qP zY^;B<@usn{(|wvWiGO92^e=3Z|G}nX8k?%W*>tjTkv?x{8>DW2U7T?#o1c^RQ#Ze^ z4%X-G=I{LL?q`R(z1rO!O8@TXB${NO*Y`I?$&t?C+rO*2Nv9%A+4=g~U2ZlcBC=hm{rxhegS~@!`#Ct& zUkG^nJMZ$Z{=XdE2iQHo95p#C(GA5$c?a4^pLdWm2=PX?FXA0+zy7m|^nWX*>N#Ib zQ+LTB{((d7XMh`v2@Q*kN%Ri0TfyGp4(@mD=r3RW`)dtveAmdhplT_lqj!W&msUM` zM>>OSk0Vk_vBIgP*eE~xpIfUPK6Zr#03ZlKFzm!q^dIa-|6&*IuyAO>l)t2~NH^+y zC{6T^v3r8OV;$1w&gaH>H|E~)c01BmF|wnh;spPgiO!f`5j4r) z`ZEwGJ2+b(qxJa!rxKT;XmKO z*w-BK-lTYMM5Ncdz(zQql)Ve>mqa%l9-8D`WWV~ni|wcX9}SX<7V$2z8)@aY-{dx3 zWczsg3vy@-Wryjnckfc0(mviT^Yd8l=YfTU$4A%}Sz)*S*o0R)-GN{|Ay45Y=1|TDl3CIj&0kQ(wfb2jHASaLu2mo>ed4Rk?J|I6(04N9)0ty2~fTBP#pg2$h zC<&ATN&{tpvOqbYJWv6s2vhkKsBH`Py?t5)BO93)BOGfciiKpdru* zXbdy~+(1*H8PFVP0ki~K0j+^xpbZcLv;{(eFd!U=03v~QKzqOgL;=x22OtKB1>%5s zAOWxdFOUc%0m(o|pcBv;=mK;Fx&hq*AJ7Bn3G@Pb1ATzLKtG^AFaQ_`3<3rNLx7>c zFkm<^0vHL50!9O4fU&?hU_3AZmC0|~F0$~sVQ4j-hkN`=L0%?!|S&##HPyj_x0%cGERZs(U&;U))1*QYj zgBie#U?wm#m<7xVW&^W>Il!D?E-(Ph4dwy!g89JwU;(foSO_c(76FTb#lYfV39uws z3M>tl0n38r!17=Pup(FqtPEBGtAf?Q>R=79CRhus4c4)Dn7Uv+FbJ#0CQ?MD>9Bcu$1Y3cv!C{ICug)37!H^gJ;0A;5qO-cmcc!UIH(JSHP>_HSjuk1H1{| z0&jzNz`Ni*@ILqed6 zH}E_71N>Og@|U8o)u1l5NcKnnh2}x?p#{)FXc4p+ zS^_PFmO;y*70^m(6|@>!1FePDLF=In&_-wzv>Dn0ZH2Z$+o2uMPG}dj8`=Zyh4w-F zp##uC=n!-mIszSqjzPzv6VOTM6m%Lo1D%D=LFb_h&_(DHbQ!t=U4^be*P$EGP3RVM z8@dDCh3-N3p$E`I=n?c7dICL#o5a%gejPY8JLARn1=;ege6#p60UV3^#$@ za8tM$+#GHJw}e~4t>Iv}4IBcug+t*mI2?|EBjI*%d)NavLV@#97s+i7ZQNvM)Dwek$gygqySP7DTEY8iXcUiVn}hM1X2GRgr2)b)*JT6RCyNM(QAeNL{2J5`@%88XygkMo43%3F1bY zBF&KINDHJT(h6yf1S4&b5Tq>9wZ8hMmiudNGuYE#3Koag?N!f zBne4IIwGBr&PW%eE7A?=j`)xsNKd2}(i`c6^hNq1{gDC4Kx7ay7#V^LMTQ~6krBv9 zWE3(Q8H0>P#v$X83CKib5;7T?f=orGA=8l=$V_AwG8>tL%thuQ^N|I}LSzxL7+HcW zMV2AUkrl{FWEHX+S%a)a)*$A={B1$WCMzvK!fh>_zq=`;i04 zLF5o}7&(F*MUElIkrT*C

6=3(=QdDJ{+ z9yd>zC(TplY4ePE);wpPH!qkM%}eHG^NM-Zyk=fEZP~j~T`a zYlXAITM?{?RwOI36~&5bMYEz?F|3$YEGxDZ$BJvkv*KF`tb|q~E3uWtN@^LFX<3$S zIhL?o%d@2ATY;r4ZG~9LtmIY-E2WjnN^PaF(pu@P^i~Edqm{|ZY-O>sTG_1ZRt_tt zmCMR)<+1Ww`KT__qE*SNY*n$U zTGg!TRt>ABRm-Yv)v@YY^{o0<1FNCc$ZBjgv6@=Vtmak=tEJV-YHhW#+FI?b_Erb0 zqt(gkY<01^THUPfRu8ME)ywK_^|AU|{jC1h0BfK%$Qo=7v4&d1tWayXHNqNcjj~2t zW2~{(IBUE$!J24IvL;(otf|&CYq~YVnrY3lW?OTtxz;>uzO}$wXf3i9TT85^)-r3k zwZd9yt+G~IYpk`_I%~bP!P;nTvNl^=tgY5IYrD0>+G*{wc3XR_z1BW!zjeSmXdSW+ zTSu&;)-mh2b;3Gnow80_XRNc$-Krx@q0AZd-S(yVgDH zzV*O*Xg#tXTTiT~)-&t5^}>2-y|P|gZ>+c0JL|pm!TM-@vOZg1tgqHL>$~;C`f2^L zep`R6Fm_lwoE_edU`Mng*^%uic2ql>9o>#$$FyVFvF$i^Tsxi}-%emBv=iBh?Idw^P_D?NoMZJB^*zPG_gLGuRpJOm=2Fi=EZZ zW@oo^*g5T7c5XY5o!8E1=eG;k1?@t1VY`T3)GlThw@cV1?NWAWyNq4dE@zjwE7%q7 zN_J(tie1&NW>>ds*fs51c5S*S8zk4edsDW4npn)NW=sw_DgP?N)YcyN%t} zZfCc*JJ=oVPIhOzi`~`kW_P!H*gfrDc5l0n-Pi7C_qPYw1MNZfV0(x?)E;Js+QaP; z_DFk_J=z{)kG03y+KEpMthUJ+1_GrwYS;Z?H%?`dzZc2-ed2z_u2dH z1NK4tkbT%bVjs1S*~je@_DTDcecC=_pS91~=j{vjMf;L{*}h_5wXfON?Hl$@`<8v% zzGL6D@7ee52lhkzk^R_yVn4N?+0X44_DlPf{n~zGzqQ}l@9huvNBfig+5TdGwZGZl z?H~3}`_l;*I?*^&JB}k9*YO`=-GAFr{!b$0*a#A~KoU~3lC%u!w$>?Nq zGCNtEtWGv3yOYDo>Ev>9J9(VEPCh5UQ@|Lic>C|#+J9V78PCci-)4*xyG;$g{O`N7qGpD)J!fENW za#}lWoVHFor@hm`>F9KFIy+sQu1+_nyVJwz>GX1XJAItKPCuu=Gr$?>3~~lLL!6<` zFelU*?u>9oI-{J?&KPH`GtL?BOmHSTlbp%U6lbb4&6)1ZaArEQoY~GCXRb5PneQxc z7CMWZ#m*9Esk6*k?yPWDI;))3&KhT}v(8!XY;ZO@o1D$g7H6xo&DrkkaCSPooZZeI zXRou*+3y^14myXN!_E=ssB_FY?woK=I;Wh|&Kc*dbIv*MTyQQrmz>Ma73ZpR&AIN} zaBe!coZHSF=dN?lx$iu19y*Vl$IcVysq@Tv?!0hbI~h$^Cq=pu%QDPoD(B94eF;)(bo zfk-G4iNqp_NGc3r3QO3+5kk1a6H@ph5K3qfB9e*ZB85mPQi;?ejYuofiS#0a$S5+2 z%p!}(Dzb^}B8SK+a*5m`kH{DyoUJrqJ?NFT8Y-8jc6;{ ziT0v{=qNgg&Z3LxD!Pg8qKD`ydWqhmkLWArBr#b`5mUu9F57KgBQcTl^7W+^}vqH@q9c zjp#;lBfC-DsBSbjx*Nld>Be$nyK&sOZag=>o4`%zCUO(IN!+Ba;hL`H+OFdY*L6Kt zy1pB@%GGX&o6Jq_rf^fbsod0V8aJ(*&Q0%Ta5K7@+{|tkH>;b?&F zuba=!?-p2B7x{ch%ZWFhu+stk5ws2dzt=!gb8@H|7&Ta2@a67u4+|F(n zx2xOD?e6w)d%C^c-fkbauiMYyC5B zyA#}r?j(1zJH?&qPIITbGu)Z(EO)j$$DQlWbLYDY+=cEUcd@&~UFt4#m%A(6mF_Bc zwY$b$>#lRxyBpk%?k0D$yT#q=ZgaQ0JKUY_E_b)P$KC7hbN9Ok+=K2R_pp1!J?b8F zkGm(_lkO?^w0p)q>z;GZyBFMx?j`rKd&Rx#UURRzH{6@>%Mc}yC2+-?kD%N`^Ek0esjOOKir@0FZZ|m#|`6! 
z^}>1Ky$D`JFOnD8i{eG~qIuE17+y>-mKWQLUed%dR~36f!EM$4E2uZP#u>*e+K`gnc4eqMiXfH%+^eb-#g$P^bUE4 zy(8XH@0fSoJK>%5PI;%jGu~P6oOj;4;9c}Cd6&H_-c|3Kcip?;-SlpGx4k>wUGJWE z-+SOa^d5PSy(ivN@0s`9d*QwGUU{#*H{M(Co%i1R;C=Kyd7r&6-dFFN_uc#9{q%l$ zzr8l!3Yk)-lBs1HnO3Hg>176)QD%~vWfqxLW|P@v4w+Nt zlDTCbnOEkM`DFoFP!^JfWf56a7L&zg30YE>lBH!CSyq;lwxm+Pv%2jf;TqD=Yb#lGj zAUDcQaqz@~}K2kIG~6xI7_G%2V>RJR{G_ zbMm~rATP>G^0K@lugYuky1XH8%3Jcbyd&?*d-A?~ARo#{^09m(pUP+QxqKmC%2)EW zd?Vk=ck;dbAV11a^0WLRzshg&yZj-4%3t!g{3FBoVf}D^ct3(4(U0Uu_M`Yw{b+u4 zKZYOEkLAbquea9ER>wCWReLwJ(ul*1|nV;NG z;ivRd`KkRhep)}BpWe^lXY@1qnf)w&RzI7c-Ou6Y^mF;S{XBkNKcAo9FW?vS3;Bip zB7RZ7m|xs4;g|GF`KA3bep$bqU*50aSM)3SmHjGyRlk~F-LK)-^lSOG{W^YKzn)*; zZ{Rod8~Kg>CVo@Dncv)R;kWc#`K|pnep|nt-`?-wcl0~?o&7F;SHGLz-S6S|^n3Ze z{XTwQzn|aVAK(x42l<2jA^uQ*m>=p7_ec06{ZamCe~drYALozvC-@WnN&aMiia*t# z=1=!$_%r=k{%n7aKi8k<&-WMj3;jj@VtkSWL3GxQ{g8V^& zpkPoaC>#_CiU!4k;z5a^WKb$79h3>m2IYeCL4}}VP${S!R0*mE)q?6lji6>wE2tgR z3F-#*g8D&&pkdG`XdE;Nng-2+=0S^~WzZ^U9kdDB2JM3OL5HAY&?)E~bP2i!-Gc5x zkDzDJE9f2c3Hk>8g8spPU|=vP7#s`5h7 zCI?f3sll{hdN3oH8O#c12XlhC!MtF8upn3%ED9C}OM<1rvS4|zB3K!$3RVYeg0;cA zV12M5*cfaIHV0dRt--cnd$1$e8SDyn2YZ6O!M zqKc#3s-!BbDypigrmCwNs-~)?YO6Y`uBxZ% zs|Ko}YNQ&gCaS4wrkbl3s-ZCfWE~=~Qrn;*hs;BCudaFLF zuj;4zs{v}D8l(oRA!?`^rb5+lHA0P4qts|MMvYbD)Oa;PO;nTAWHm)iRnydTHABr* zv(#)gN6l69)O@u-EmVutVzopqRm;?JwL+~_tJG?>My*xr)Oxi+ZB(1oX0=6aRom2d zwL|SxyVP#AN9|Sn)P8k99aM+ZVRb|uRmaqEbwZs~r_^b6Mx9mX)OmG5T~wFUWpzbe zRoB#Ybwk}$x72NQN8MHT)P40pJyeg>WA#KmRnOFO^+LT=uheVxM!i+<)O+;+ zXZ1yWRo~Qi^+WwsztnH_M}^U0bvPYfN6-;JpgJ6dR0ds=E=2U=;ZLv%8oT&K_}bt;`&r_pJ3I-Oo;&>3|m zompqmS#>s@UFXm_buOJ-=h1m}KAm3|&;@lNU04^r#O?5NfT({6Ibt~Okx6y5N zJKbJ)&>eLr-C1|hU3E9zUH8yEbuZmp_tAZIKiyvs&;#`#Jy;LXL-jBns)y?ldZZqu zN9!?qtRAPw>j`?Io}?%1DSE1&rl;!}dZwPGXX`n7uAZmo>jiqDUZfZ6C3>k|rkCp# zdZk{aSL-!;tzM_s>kWFN-lR9{Eqbfornl=IdZ*r{ck4ZRuimHk>jV0rKBN!pBl@U5 zrjP3r`lLRkPwO-KtUjmD>kIm#zN9bfEBdOwrmyQ8`li06Z|ghyuD+-5>j(Owexx7k zC;F*=rl0E<`lWuQU+Xvet$wH9>ks;){-i(aFZ!$groZbS`ltS-f9pRwOi0*}a3SGC zB7{T?i4+n!BuYrskZ2*%Lt=!)42cyIJLGTPd6c^U2HZ7n)M4G{z)=NOaz z-#t`+&#~#_;NOVwf9vm-xqmA@CcuQ42oqxxOo|4YXrYY`3Utv!i9QCXP-6%t!{nF( zQ(`JijcG6~{ufG{o<0L+#7vkOvtU;IFH|@?eGbftxiB~8!MvCc^J4)lh=s5)7Qv$U zUnqNV`Vv?YOJQmJH{D_}*egq5)hR>f*q9cy4stcA6)4%WqbSRWf; zLu`bNu?aTCX4o8CU`uR;t+5TZ#dg>pJ77obgq^VqcExVk9eZF;?1jCt5B9}=*dGVr zKpcdFaR?5@VHk?TaRiRUQ8*gM;8+}o<8cB`#7Q_Ar{GkahSPBd&csv02a#7(#vx8PRXhTCxm?!;ZV8~5N|+=u(| z03O6cco>i1Q9Opn@dTd4Q+OKB;8{F}=kWqw#7lS?ui#a@hS%{1-o#sY8}Hy_!ytyQ+$Tc@ddubSNIy=;9Go$@9_hE#83Dczu;H=hTriA{={GS8~@9t@>l^YVkNAMRq&tC^M9%u zR>vCnPuBTARSRol9juG>us$}xhS&%jV-swO&9FJPz?RqwTVoq+i|w#IcEFC<2|HsK z?26s6JNCey*b94OAMA_$us;sKfj9^U;}9H*!!Q(w;|Lsyqi{5i!Lc|F$KwQ?h?8(K zPQj@-4X5J_oQbn=HqODhI1lIJ0$hlTa4{~yrML{2;|g4ft8g{0!L_&!*W(7kM!LxV{&*KHWh?np( zUcsw)4X@)3yotB)Hr~Ozcn|O61AK^&@G(BYr}zw?;|qL=ukba#!MFGh-{S}Th@bE? ze!;K!4Zq_L{E5HtH~zu@cAfzZZzd;(KFb9E^+cFg_;0gqR2uV-ie? 
zkQySASDZ4lZAP=&4&{iy5+NByh=(B25KKc5u9j^A#?=v_+WL&^C9R%jUcro*{l4B> zoBpwB_skx+KDa3tJ9kIB>$Mi@5d2u&+TQAA+u0zGYf027rLbE^+rPw-XbceP#$=y2 z7LXg9aXC?%GEv57qf7tN`3|cDL1F9Q7EE?gTVeJ4@?6Il2p7kng2&-LR{a!p@vREa2#FM5H$zQ75Yw zfC)!;M@f!+qT!mOdjJS1x+jA$tWVLNfE7*@m!G0K-oI^cscti!KINyE)LwiR%fG37 zj+BV)ZJ*8YZyzGkWl`8s`=SEkH4Vn_r&V0#UpR4a41e~+CB8zHI-frJ0Fdu-nm+7S KD_)(QHGczW7(h(` diff --git a/pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle b/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_3.5.2.pickle similarity index 85% rename from pandas/tests/io/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle rename to pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_3.5.2.pickle index 20af597c57a1babcfe68b501e331e79e0f15534e..9777319465de6a012f3d071e436bc07600ee66ae 100644 GIT binary patch delta 14376 zcmc&*36NaHd7hqYclOX)!oVVs0LO+|;w;iifKWhSEvtiJ2U+MqATiADzEw|aXJ(m0 zS4H&zn`1>=w$Q`Rv2aCBY$wEyF-U-8$(R7aP6Zt76scIqF@>C%loJcPTq;tPn0$Y~ zW8TcXot@QTR;71MzwZC|{-giy_uv1~a_|=|J@)1m%(ht*k#CYLEGGC z$nNoUZX{jo9nKfDeO+u$r@X-P#C-cD_R&~+NE<5}n~eD0^;cXt+w*g8CO4vO*NVNP zh5Sw}XTJ6_JAdyNI#}!8uU@CO1_F!)gQ1X7GKP2;pOpwaVDJw=WUMyg36^NZ&76t; z)&R5qQ7cWxiupn*Ul`E}Mp8z{w5|5YJj%l|vXX`~_JRDk4Cm5W&5)7NOrcl;J0s~* z+TdL>k}GG&cNlW-M3-I}EsW7-%iyhxj6@*8EE+<5ZZes?`F;GUL*fRIFab7YXeeDU z=(rQd(R?1qd&=7l4ZoKk1pT1q zU+F{xsbE0~mS4#SG}U!cwid@zqXl9PT8fO0<U%5St|Z_) zPMam|L9evKl3iz*-#X4NGY1c`wX0>Yv=cfGj~29hjHD&j-AVq4JkO@ARMZL?U@Jd5 zTGUEKL$)t17c)7nSiC8{10d+R&%(i$Z6iezRn)xwDR$}A0Vp^?3YIKqB$$ll@`Y@A zEVENHCd!ujH0T&esgmh4hu9xn;)1iqVqnzEK#1ROzITY-G2e~sMu@X`3$=t zUZ;F!^^jRu1-Sdo$Dd(y&+~2ltoiyg?A!~w_@7t9h-i6YqJNS3iDQu6o6oXYI&5XR z{+_bv0Dm&k=7T-F^ekBy{-XGA+h8q1?N%YQ2-~elbPwVmnQRRuQHzp!E+uW|H=bji zx-SJJ08l{PPBOaF!lAFh$TpxdHTtm3x5KgD0PNHJ@?o}4pF%p$$WznSRx1^SorXq+ z&UIz>JPXUMaTvblLgbL2;JN9(Fz)}tg}wk&L*&gLe4E{=f3*zj6;~lDh($QS9|tYB zDE=6?{FR!YLy55W=VLWL2NNOh&%Y^aGMq+SF!*R0&Ur|7rbe{Uba||l+JcZ_Nb{#p zvzBwpLq!=H1`D7?Ml!k5{HyoQ>AR>sg=gC4JXt?nt-R3Vs`g#G>Z2Da2UN@A;5#9R z@#J$Od)ciQ56Eb0I6p$~`?o_Z$n;g4W_2teEIpT=U`Jb4$kvpKNZJT_5E%xciwu#3 zoGce4K1m{LhzYXc#rqy$J-WDy8rKq)pazJBEddq`g(FciSCK(W8jvKjpwDMSu?8(B z^}>T6x|{?uvGiWju>JR`udjOU>#Nv(i+UdWuZOH>{uHUVg@!ku4qlPe9b(F|r8t}( zOBcxH#$|LIJ~|JDR{KQfX>^tK^FAZV_sG`4e6GX?pnu1C^L_B!Eh)>x8~h7^q2s(~ zrOHJuRnW%LrOZ}MAJN7T-Y59}1mllGD1+~*ATDDnT$R#Ugu+=>ez{bHdq#X*L-9;C z*k|m9KYZn#qtQ2JC6nZBz>O1Kh?_QwcDJft%EPYwR3-k9fQGwN3!GP|D;s zD@Ke&$Qf)%wpH`?PR*+dmtI3ss;=0Lj1HuWC1ls2_Jn#@uP{30%driBq$}5xp0SnA zVVfjPGL1$gbU1n33c|Y%O8(E&lMHDTAzGbW6sf(knteupv}*>rEaL%~u7agtL4dCX z2N_W!Gej6S{n40mZK`2e#HdjRHHzi5?WNQZ*_W$e9VFWtwtbH)?7GKM{`kYd+-xCq6;f#!Ru@g;-Hb3(5IBw%zgyXeAbcLO+@`3acNCIaoNiAV;SWEdSolPm2A4W z1^yL?xnx-(W0b|^B#Ag3F2kWm@u!1p3)wE;vW4_ajvC|qp=TUR%N=Xu}n zMct@_=&}u0^I!UxB6C>Yb-O3I- zz@)~u<+JEjcHM%P$!2dI9n9ZDJiFYXu9>}ZFLbjYLZDqS7T9Pa}zq_XMxFyN33=>GNfO{(o z4Q&!3`v+I#Wtv~BG!QXJm}bWj)-#K5a-Y&vG|h+1JCCrrGosa%i~EnTkLlVtf;Czx zAAo2Zh9TimL`^r-$hhkgu?S^qG-nrIgbd~C(pOO@D04{Vy?Kj&d!YS)MJCQ7qBc{Embgi=O zSSqWP(r}=98dV$Kn}*L?NlgTr*b3Yvvba$K6nX1$ya_5pj^vuke|2|4O}V)eE5DvE z;WxOI6f3XvPeBv(5|ZLjFu*7wE;l7dn?fZ=y^$cjQx>tGgl7x#^K?yu6nk+c9;MqM z85}7Y(-q*rfa z285*P5wQSlczpO2xte3`lXx%K7t;AtcVDwmcyg@nfBcZgSwq3<_ZiA*@ff@$Z{I{^ zz1li6IdnLY2vwt%8)?u4rRX4e8jZ)3yob7rR_53|QCCuJrMe0FKWSd`GMm%mZG1~z zW3*Kjii!Si^Wqm+_tna$Q8`)Tt7xnQzs0HnPa2p|28>(9w|k#wZF=;B;@e_7cxrr8 z_0q1x*77~Q;J4C)o=fSv)@So|e&|_^oMAd4Zwim)=p?TAI%|>sQyymj#E)6YL#Pv- zSx4lX9i23nyn#@3);Rr=httpdavB;bk7L@phuG2T_?i6eO<^{Il zE>5d6o`srI4JCeqz>}bEzkUJ9-Aj=7+V4GwI0V8oK-`q-X!GXYzx1eM;lEV$QBg-3 z%bDY;$_$3DtDd5){}sD@*h$1?%aGl2dhV&qjnQ@MH8F%HxTh8FdQG>v>o@HD8=p*c 
z)lYmVjyWqv9e_A_=}B6dnf<%1Maql{TdE_{oZC-bR?OI!l zK)1j&d41^v(%eEX*WPPC35aEo?L_}}GkJ)uGCy;IeZ0=rn&xzRP`JYbh3h>~=$mC@ zfy#moAjBfHa991X&Wn|EdksC#nfc}qSzPCv_+4sB2Pc!}BLEDNN2GX-f&^KGHE%_b zaKi)MnSREpqbqkl>}I9GKu~3+o|Mz32YUkiHWj!8VIu7%$cjOH!0#nB`(8;^4W5vZ z5-(_)fn?K(c6R>h?m@8D$7{DUVCM0l@+Wo*df+H)>8^u})#{}Ay`yX)64VBf(OvaC zjb&1;4pQ-5m3qlinw}t6RZB%`VN<+BK=L%9IcC+;r}zhIp;;xByytOVlp7;O(UUE);Y;6yLOW8W1gLa@+$j`&+!l7j(sABpNG9BH5)?Iuas0J|Bdz4 z8+T9*vDp048*G984icT3Ld-ODUr}Xa2kd%@+Px7vwqN;X-0mBbY>ud7y5i&Wo+&^L z!s-B~*SePFgrA14Ww5?Q^~MTuTYZaQBAN)Q>N4%j5E~qnqV^)Lp&J{f9VkRuNGQvb3QVU0P%VC*s@Hu@XnNk! zJygHnqxw!q_2ye|vzF73R+aFeX2x!w(42qjH9i>SUj^kKfmEi6oA!UVa1#M;WH_~U zO+T#$QmyoCqoo2C9m6%3E74|9yLi?x^{LN%PBws-ZY4+W*Egx&*(KVsSI>!&Z&eT| zenOMpQ+GKWMNlejtNp1eO|?rh(}bRdygCPe#v`LX$q?4U;YJ8XY@)+mbV$+RZXAf2 zGvj;os-4tsz^nM4CehSvgmg!+;ips4dYbrKLC|R!9+UMgYQp0hyM^vq5v@V!9Ug=( zqH8NWif_QYPXnPgKyA;s9{D&Sv{Eo^&NUJ~>l68qmAE~ONw%}a_X>04FEL}KXsGI} zjpk9+-nv3kx=umW(#LwccV_f85|*<9Y6%WmeR&8)^iwx%TWtQhgwUZAUx`?G<`d?U zW7w+N&@uY=JT>dy-LB#vJ*+?P!OWwCnbQxa{xpg|ZOr}2iT+hoX*U1rFzYaT{*|rx zJxJnRk0g#)Bq7_dBR8evBZlGp*cr$zU4P=81vrW2Bza}9tgE%MO;sCVJK^Ddld z@E8QwCc zAstGp9e2B(!7h3A#P1IMc4pGbwM8p{Z2tNwRAaHe?5x+%|KC6R3h0^Ma2*)rv{A&BYi4^;H=b>L(*si0#SefD$p!se(zjbFJ6MTU9v zF*b)j6Zxb0R-!1#jiKX)o5d4~8(M_fn3mx(tt2|Mw69&6Ynl|)%geqWY1DqBvGE%9 z$rhC7o!)t`{RDnS*L$qZX#v12l<)X@mXp>YSNS+Wz z)+>y#wU_5Iy`rsE8@p*QcHqFb=PiSp0`Go!<$t2~6u9x8w1oc+eD1X!SJFp3Z}mL- zW7^a@@4>$=x8r>8+b?!^;{5uOHxFY|teQ4IB9;S+O>FMyO3YO|7!~Ns<=+1XB%2}-fW7J2zNIWfiuQs%fz7WS6pF5JTeTDn@5 z82kC4w{&Y~TAfZur_$I`-@s5KH?7qLt(9`rD;Plm+zMHQnxll$0VUQ zX+tibf#&p-z7(`7Y@a=OwR|QGU7GEd2DCCCESu_;v;+-yr-~^}ZjxK#zFReU+@RK> z#o{C$fj=fv{}l_hxGzrl+kkvVB9XZ4IQ+0n{F^X*c}M%~#icY2!R+U^YjSw@O*nuY zb@aJ%>@41Crrp{pC5yCj>YP&_eSaf~C0gXEL-I6v2A#m?UcR^StkzixIp)z_^&D4V zXE(Raf3$5rxnto4rSy&k3S+j9&9QBP=N#V)z#NwkE!iJ=^uvwhrN|s)v6z#5IWEk7 zGOS(a&Nmycn7Xc%F41T)(<>E=V-Lk?4+#S-r!reb9rp=$hh*|!PC!eSeeTf<>lFKXvSGiquE@( zH`SBbB7r6ZH}-cIU~C#mmiGkDY#|^=AIg!EtQIHwO^=faV?l1gzRkrg8ucgppd~!V7zI25B=}qBjXbCP>eQSebwCMsX|f8XSK_I{n+v2aUTsVzp4}X)1k3` zgX{CsCf?B3_TPJdeq@~7PXo(Ro6Q-@-x-p(Zq;N}{vN29n2&sGoXFdTO6dgaC7fNr zR#DYaf_*E|hC*L*eI7Lz78Y9HlS8fwEid2ZzhpvlvRhi8D)khT8#BNf8cO%%x;B>I z@?SN5hJ1ES@x_B6^||G10#npeO&LjOM6J*b7!5r+nU(sJ%fPr$u!bec&(dc5J2A-J z30a5rk6BVGWU^AB;Km17ISuF5uP;bN4OjvUiMKNumC`N>A8+L5v(l>XL3xFK{8KW! 
zJTusFN@*L&LRt$-+GKL&%+QQ!cB@`*?I9qqD32qjmIp#Bq9=*PC(C<7lVT31K!p9C zXSHA6d|EI7M#hZAwb`-qMd9(2n&fw!Zswz*r3>|k_K=wV)Az{O@&n;!_0xEx!`|b5 z82#AL(grlPhPDnC*xV9X6Bi<|p2+8)L5$vsjjUBCc?-U8-C z!+qdCa@`t)vhh}zqE9+RrYwaZqRX2_sPy!TwH!gG-f@Ur+ry;7qPV2VyXEJs-shgr z(?0}%S4X4&`!MNf<--Ob=EDL#;v(}|o%+3>kZ-k=Xt=<9D)L9UUElQ#nLU$6lf6B8?xHfObf26N_hs@Uh`2uNFhwHv}?_NxC^2G12 z@Z>XCo#2T|{l&tFc`|wi7Sxnq5dF|$vU+N5o?&q6)bH9))+9!6WOHDZ4?HY88?7of z$HLAt@R_TaOh&9;idtm?ff^Er=42NZ4RlvEzWT5I(E~{`**@>1oQX8$$jkr6)04mpg z&y)G;D0m~M;SHDz^(mVw>!kkPjLq)Ot-~1^u7CSgXSGQ;NIeA&1Z{>M8pvfO%`H@7 zI@qaC-a{tOGU5~s6#H|o#Ha|J`no+3TxhhA%NJQ;4OMcuCzUqFY@%aDJnzcorPg$+ zAZfIDMLxGh$|jawl+Uw*r#j8Wr~?`n^7#niI@YB85_nRBiue{0KdPccLs+M?#VTcJ zp!@_bQf-fc16jQ6!Wqh)&;aAi-EgBF7Ryr|@zJS~PeM0Fc>F#-i}2|0F4{D*4u!N?-u&9`%Pj{1=MTw!>+Ubg z`2J&A>c1>Y4RsfjP~BVnj9OnZU|VkO)mj1-zy|X6%kF!eV+EJ5Pn>Qo1VLq7Un!x z9i^<%X%!1C3vH_`l!kgU1Ar-t<*F?8{1TSSBKXBd{R=5E0#Uv@)fe{u_?B@T2U1pp zMqj*G(2QbA?PN}e!5ql8m3W0aB^u$l4MJ=Hj{l z2neWO>2bECq0Le%A1OeMQ;=2ttlxp^({OJlTPh-e8}3CM5Wf1b3fznj$|mRqprxRk zE9HSC#7N*F!dr1->=oM=q^=x5WeKRnrI=R1XITf42kNWkY4BR$1g`mKF7tpM z0awUmyATVE81SgF4M-EfxF1#Ars@Tw((p>I8!*g+F2gvM%hpLF$R{jzq1hwDG@H(s z&)C$&nwJ2L6}W6nIgGJw)2o1kY@F1 zcxkQ8pM}3qfNPlTG6#xWs}w!90O>4brn8KaBggigAgGUkX`TAM17xad)?&`>9hWkS z!bVoAOJ-<&Fx#E!Wj?IlGETV`+`MufUeMv=a(K)yrAGi>ju?tWw3Fi)l0(R941<16 zxbZlOwweMVNw$5)0m%Ts#3hL%G>0sT&@F)C z-Xg06u>u7lpt6PdEG#n0vYsRd%p7vSEPx?qBa#CaWu{fyurj+qY9N8aG$+^`8sQEN zVYPXuIqD4lmDTujPj^D0>@#N5hua9AaVSf(a;c3-`Z=qCKbqv<)ZPl$vAwu|P6=jqG7#OKF zu5vv~!Zy0icwuUdavpTg&C$w!;61lS+{My=K9k80@5nh59kypyYijK0jGStXxXUD{`~WmWDdR=M%MQH! z5H7fk(Fvbw)wwOxXsY7pXogR71hHq2IhnVaIb-MGuvT`5lfo7|%Ys6+p1M^S}L}}~M z44dp6im{CON2_8e$ABLv&^1$wm5%P@K>)O_?p%ro06Vl3>G#UNeYxu9RynAZQ?Yj6 zcLJ{OQq(@rvf+9g1!gHW$CkCI@VWTR9UGw1g|1Xj3XoYG;dZE6l-J>ss1Kf8d6a34 z+`}>V-R~U;|K;RFg6)%+6il|^)MYR)ZGdCGZX1g{$|#sJx+S${D6gc%tfZuYO{D?L z7QKNB8riWkz?zQMvLk3ui@mbq298(2z5*eBe;~k*ExPQ;GC~#491{M^Hf3ui$8i1& zc7~-M#{3Kfc{<{oQBops5y08GH88@Y@=%mREIN)8BdN0zH1E3M6 zmtj?$@*1Wbm_@lID_C!`Ly(AyPACtC#_E6E11B-{(Ae{jg7)FhQ9KEW=O=BsCuIJt zIpp=`8A_tBJ4_}z|NX^Kzmu9zDp{OkQIE&8t|sJ zs_$feYzfLfLpb+2E!Tfk)muIwQ%qAX?+A}otIe71DSM%D)iYEs1U8tCWq5X0VK~c> zg>wZz7gYEO|0)58`Yi`TCqkStPW$L*zeHxKvuHG3EC>>ALF~lhgY@J=9|SvzZCV+Cpu9 ziD^F*^p9U66X8I2i_@JV!;LfUO}<(O$_*;uq7S&mP= LooseVersion('0.18'): - from pandas import RangeIndex - index['range'] = RangeIndex(10) + index['range'] = RangeIndex(10) if _loose_version >= LooseVersion('0.21'): from pandas import interval_range @@ -191,14 +189,9 @@ def create_data(): nat=NaT, tz=Timestamp('2011-01-01', tz='US/Eastern')) - if _loose_version < LooseVersion('0.19.2'): - timestamp['freq'] = Timestamp('2011-01-01', offset='D') - timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', - offset='M') - else: - timestamp['freq'] = Timestamp('2011-01-01', freq='D') - timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', - freq='M') + timestamp['freq'] = Timestamp('2011-01-01', freq='D') + timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', + freq='M') off = {'DateOffset': DateOffset(years=1), 'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824), @@ -239,14 +232,6 @@ def create_data(): def create_pickle_data(): data = create_data() - # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and - # panels if their columns/items were non-unique. 
- if _loose_version < LooseVersion('0.14.1'): - del data['frame']['mixed_dup'] - del data['panel']['mixed_dup'] - if _loose_version < LooseVersion('0.17.0'): - del data['series']['period'] - del data['scalars']['period'] return data @@ -256,14 +241,6 @@ def _u(x): def create_msgpack_data(): data = create_data() - if _loose_version < LooseVersion('0.17.0'): - del data['frame']['mixed_dup'] - del data['panel']['mixed_dup'] - del data['frame']['dup'] - del data['panel']['dup'] - if _loose_version < LooseVersion('0.18.0'): - del data['series']['dt_tz'] - del data['frame']['dt_mixed_tzs'] # Not supported del data['sp_series'] del data['sp_frame'] @@ -272,7 +249,8 @@ def create_msgpack_data(): del data['frame']['cat_onecol'] del data['frame']['cat_and_float'] del data['scalars']['period'] - if _loose_version < LooseVersion('0.23.0'): + if _loose_version >= LooseVersion('0.21') and ( + _loose_version < LooseVersion('0.23.0')): del data['index']['interval'] del data['offsets'] return _u(data) @@ -285,7 +263,6 @@ def platform_name(): def write_legacy_pickles(output_dir): - # make sure we are < 0.13 compat (in py3) version = pandas.__version__ print("This script generates a storage file for the current arch, system, " diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 9337d5916acc67..59fa9fbd02da1d 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -1,5 +1,4 @@ import datetime -from distutils.version import LooseVersion import glob from io import BytesIO import os @@ -84,7 +83,6 @@ def check_arbitrary(a, b): assert(a == b) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") class TestPackers: def setup_method(self, method): @@ -99,7 +97,6 @@ def encode_decode(self, x, compress=None, **kwargs): return read_msgpack(p, **kwargs) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") class TestAPI(TestPackers): def test_string_io(self): @@ -463,7 +460,6 @@ def test_basic(self): assert_categorical_equal(i, i_rec) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") class TestNDFrame(TestPackers): def setup_method(self, method): @@ -842,7 +838,6 @@ def legacy_packer(request, datapath): return datapath(request.param) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestMsgpack: """ @@ -858,15 +853,11 @@ class TestMsgpack: minimum_structure = {'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'], 'frame': ['float', 'int', 'mixed', 'mi'], - 'panel': ['float'], 'index': ['int', 'date', 'period'], 'mi': ['reg2']} def check_min_structure(self, data, version): for typ, v in self.minimum_structure.items(): - if typ == "panel": - # FIXME: kludge; get this key out of the legacy file - continue assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: @@ -874,15 +865,7 @@ def check_min_structure(self, data, version): assert kind in data[typ], msg def compare(self, current_data, all_data, vf, version): - # GH12277 encoding default used to be latin-1, now utf-8 - if LooseVersion(version) < LooseVersion('0.18.0'): - data = read_msgpack(vf, encoding='latin-1') - else: - data = read_msgpack(vf) - - if "panel" in data: - # FIXME: kludge; get the key out of the stored file - del data["panel"] + data = read_msgpack(vf) self.check_min_structure(data, version) for typ, dv in data.items(): @@ -909,33 +892,16 @@ def compare(self, current_data, all_data, vf, version): return data def compare_series_dt_tz(self, result, expected, typ, 
version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < LooseVersion('0.17.0'): - expected = expected.astype(object) - tm.assert_series_equal(result, expected) - else: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < LooseVersion('0.17.0'): - expected = expected.astype(object) - tm.assert_frame_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, legacy_packer, datapath): version = os.path.basename(os.path.dirname(legacy_packer)) - # GH12142 0.17 files packed in P2 can't be read in P3 - if (version.startswith('0.17.') and - legacy_packer.split('.')[-4][-1] == '2'): - msg = "Files packed in Py2 can't be read in Py3 ({})" - pytest.skip(msg.format(version)) try: with catch_warnings(record=True): self.compare(current_packers_data, all_packers_data, diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index b115a08d3b0d39..eb912908d28f49 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -11,7 +11,6 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ import bz2 -from distutils.version import LooseVersion import glob import gzip import lzma @@ -69,18 +68,8 @@ def compare(data, vf, version): m = globals() for typ, dv in data.items(): - if typ == "panel": - # FIXME: kludge; get this key out of the legacy file - continue - for dt, result in dv.items(): - try: - expected = data[typ][dt] - except (KeyError): - if version in ('0.10.1', '0.11.0') and dt == 'reg': - break - else: - raise + expected = data[typ][dt] # use a specific comparator # if available @@ -92,12 +81,7 @@ def compare(data, vf, version): def compare_sp_series_ts(res, exp, typ, version): - # SparseTimeSeries integrated into SparseSeries in 0.12.0 - # and deprecated in 0.17.0 - if version and LooseVersion(version) <= LooseVersion("0.12.0"): - tm.assert_sp_series_equal(res, exp, check_series_type=False) - else: - tm.assert_sp_series_equal(res, exp) + tm.assert_sp_series_equal(res, exp) def compare_series_ts(result, expected, typ, version): @@ -121,47 +105,19 @@ def compare_series_ts(result, expected, typ, version): def compare_series_dt_tz(result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < LooseVersion('0.17.0'): - expected = expected.astype(object) - tm.assert_series_equal(result, expected) - else: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def compare_series_cat(result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < LooseVersion('0.15.0'): - tm.assert_series_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < LooseVersion('0.16.0'): - tm.assert_series_equal(result, expected, check_categorical=False) - else: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def compare_frame_dt_mixed_tzs(result, expected, typ, version): - # 8260 - # dtype is object < 0.17.0 - if LooseVersion(version) < LooseVersion('0.17.0'): - expected = expected.astype(object) - tm.assert_frame_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def 
compare_frame_cat_onecol(result, expected, typ, version): - # Categorical dtype is added in 0.15.0 - # ordered is changed in 0.16.0 - if LooseVersion(version) < LooseVersion('0.15.0'): - tm.assert_frame_equal(result, expected, check_dtype=False, - check_categorical=False) - elif LooseVersion(version) < LooseVersion('0.16.0'): - tm.assert_frame_equal(result, expected, check_categorical=False) - else: - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def compare_frame_cat_and_float(result, expected, typ, version): @@ -177,11 +133,7 @@ def compare_index_period(result, expected, typ, version): def compare_sp_frame_float(result, expected, typ, version): - if LooseVersion(version) <= LooseVersion('0.18.1'): - tm.assert_sp_frame_equal(result, expected, exact_indices=False, - check_dtype=False) - else: - tm.assert_sp_frame_equal(result, expected) + tm.assert_sp_frame_equal(result, expected) files = glob.glob(os.path.join(os.path.dirname(__file__), "data", From 1c0eb45803f7c901d5cd04480e0b4327912a01c2 Mon Sep 17 00:00:00 2001 From: pmaxey83 Date: Thu, 27 Jun 2019 14:25:02 -0700 Subject: [PATCH 071/238] DOC: Make section title capitalization consistent #26830 (#26950) --- doc/source/development/contributing.rst | 14 +- .../development/contributing_docstring.rst | 2 +- doc/source/development/extending.rst | 14 +- doc/source/development/internals.rst | 2 +- doc/source/ecosystem.rst | 8 +- doc/source/getting_started/10min.rst | 20 +-- doc/source/getting_started/basics.rst | 30 ++-- .../comparison/comparison_with_r.rst | 6 +- .../comparison/comparison_with_sas.rst | 36 ++--- .../comparison/comparison_with_stata.rst | 44 +++--- doc/source/getting_started/dsintro.rst | 10 +- doc/source/getting_started/overview.rst | 10 +- doc/source/getting_started/tutorials.rst | 8 +- doc/source/install.rst | 6 +- doc/source/reference/arrays.rst | 16 +-- doc/source/reference/frame.rst | 10 +- doc/source/reference/groupby.rst | 2 +- doc/source/reference/index.rst | 2 +- doc/source/reference/indexing.rst | 22 +-- doc/source/reference/io.rst | 4 +- doc/source/reference/offset_frequency.rst | 2 +- doc/source/reference/resampling.rst | 2 +- doc/source/reference/series.rst | 24 ++-- doc/source/reference/style.rst | 10 +- doc/source/user_guide/advanced.rst | 10 +- doc/source/user_guide/categorical.rst | 28 ++-- doc/source/user_guide/computation.rst | 24 ++-- doc/source/user_guide/cookbook.rst | 24 ++-- doc/source/user_guide/enhancingperf.rst | 22 +-- doc/source/user_guide/gotchas.rst | 4 +- doc/source/user_guide/groupby.rst | 8 +- doc/source/user_guide/indexing.rst | 34 ++--- doc/source/user_guide/integer_na.rst | 2 +- doc/source/user_guide/io.rst | 132 +++++++++--------- doc/source/user_guide/merging.rst | 6 +- doc/source/user_guide/missing_data.rst | 12 +- doc/source/user_guide/options.rst | 16 +-- doc/source/user_guide/reshaping.rst | 12 +- doc/source/user_guide/sparse.rst | 8 +- doc/source/user_guide/style.ipynb | 12 +- doc/source/user_guide/text.rst | 12 +- doc/source/user_guide/timedeltas.rst | 6 +- doc/source/user_guide/timeseries.rst | 84 +++++------ doc/source/user_guide/visualization.rst | 42 +++--- doc/source/whatsnew/v0.10.0.rst | 6 +- doc/source/whatsnew/v0.11.0.rst | 18 +-- doc/source/whatsnew/v0.12.0.rst | 8 +- doc/source/whatsnew/v0.13.0.rst | 14 +- doc/source/whatsnew/v0.13.1.rst | 6 +- doc/source/whatsnew/v0.14.0.rst | 14 +- doc/source/whatsnew/v0.14.1.rst | 2 +- doc/source/whatsnew/v0.15.0.rst | 8 +- doc/source/whatsnew/v0.15.1.rst | 2 +- doc/source/whatsnew/v0.15.2.rst | 2 
+- doc/source/whatsnew/v0.16.0.rst | 46 +++--- doc/source/whatsnew/v0.16.1.rst | 14 +- doc/source/whatsnew/v0.16.2.rst | 8 +- doc/source/whatsnew/v0.17.0.rst | 50 +++---- doc/source/whatsnew/v0.17.1.rst | 6 +- doc/source/whatsnew/v0.18.0.rst | 34 ++--- doc/source/whatsnew/v0.18.1.rst | 20 +-- doc/source/whatsnew/v0.19.0.rst | 16 +-- doc/source/whatsnew/v0.19.1.rst | 4 +- doc/source/whatsnew/v0.19.2.rst | 4 +- doc/source/whatsnew/v0.20.0.rst | 96 ++++++------- doc/source/whatsnew/v0.20.2.rst | 6 +- doc/source/whatsnew/v0.20.3.rst | 2 +- doc/source/whatsnew/v0.21.0.rst | 64 ++++----- doc/source/whatsnew/v0.21.1.rst | 10 +- doc/source/whatsnew/v0.22.0.rst | 6 +- doc/source/whatsnew/v0.23.0.rst | 52 +++---- doc/source/whatsnew/v0.23.1.rst | 12 +- doc/source/whatsnew/v0.23.2.rst | 10 +- doc/source/whatsnew/v0.23.3.rst | 2 +- doc/source/whatsnew/v0.23.4.rst | 8 +- doc/source/whatsnew/v0.24.0.rst | 120 ++++++++-------- doc/source/whatsnew/v0.24.1.rst | 8 +- doc/source/whatsnew/v0.24.2.rst | 6 +- doc/source/whatsnew/v0.25.0.rst | 61 ++++---- doc/source/whatsnew/v0.4.x.rst | 4 +- doc/source/whatsnew/v0.5.0.rst | 4 +- doc/source/whatsnew/v0.6.0.rst | 4 +- doc/source/whatsnew/v0.7.0.rst | 4 +- doc/source/whatsnew/v0.7.3.rst | 4 +- 84 files changed, 783 insertions(+), 784 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index c9c76f307d93f3..26e9b2fdb07a6c 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -127,7 +127,7 @@ to build the documentation locally before pushing your changes. .. _contributing.dev_c: -Installing a C Compiler +Installing a C compiler ~~~~~~~~~~~~~~~~~~~~~~~ Pandas uses C extensions (mostly written using Cython) to speed up certain @@ -155,7 +155,7 @@ Let us know if you have any difficulties by opening an issue or reaching out on .. _contributing.dev_python: -Creating a Python Environment +Creating a Python environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Now that you have a C compiler, create an isolated pandas development @@ -209,7 +209,7 @@ See the full conda docs `here `__. .. _contributing.pip: -Creating a Python Environment (pip) +Creating a Python environment (pip) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you aren't using conda for your development environment, follow these instructions. @@ -605,7 +605,7 @@ and run ``flake8`` on them, one after the other. .. _contributing.import-formatting: -Import Formatting +Import formatting ~~~~~~~~~~~~~~~~~ *pandas* uses `isort `__ to standardise import formatting across the codebase. @@ -651,7 +651,7 @@ The `--recursive` flag can be passed to sort all files in a directory. You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. -Backwards Compatibility +Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ Please try to maintain backward compatibility. *pandas* has lots of users with lots of @@ -699,7 +699,7 @@ See :ref:`contributing.warnings` for more. .. _contributing.ci: -Testing With Continuous Integration +Testing with continuous integration ----------------------------------- The *pandas* test suite will run automatically on `Travis-CI `__ and @@ -930,7 +930,7 @@ options or subtle interactions to test (or think of!) all of them. .. _contributing.warnings: -Testing Warnings +Testing warnings ~~~~~~~~~~~~~~~~ By default, one of pandas CI workers will fail if any unhandled warnings are emitted. 
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index f7e2b42a1ccbdf..62216f168af3cd 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -929,7 +929,7 @@ plot will be generated automatically when building the documentation. .. _docstring.sharing: -Sharing Docstrings +Sharing docstrings ------------------ Pandas has a system for sharing docstrings, with slight variations, between diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 8bee0452c22071..363ec10d58bb6a 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -3,7 +3,7 @@ {{ header }} **************** -Extending Pandas +Extending pandas **************** While pandas provides a rich set of methods, containers, and data types, your @@ -12,7 +12,7 @@ pandas. .. _extending.register-accessors: -Registering Custom Accessors +Registering custom accessors ---------------------------- Libraries can use the decorators @@ -70,7 +70,7 @@ applies only to certain dtypes. .. _extending.extension-types: -Extension Types +Extension types --------------- .. versionadded:: 0.23.0 @@ -210,7 +210,7 @@ will .. _extending.extension.testing: -Testing Extension Arrays +Testing extension arrays ^^^^^^^^^^^^^^^^^^^^^^^^ We provide a test suite for ensuring that your extension arrays satisfy the expected @@ -238,7 +238,7 @@ for a list of all the tests available. .. _extending.subclassing-pandas: -Subclassing pandas Data Structures +Subclassing pandas data structures ---------------------------------- .. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. @@ -260,7 +260,7 @@ This section describes how to subclass ``pandas`` data structures to meet more s You can find a nice example in `geopandas `_ project. -Override Constructor Properties +Override constructor properties ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Each data structure has several *constructor properties* for returning a new @@ -348,7 +348,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced2) -Define Original Properties +Define original properties ^^^^^^^^^^^^^^^^^^^^^^^^^^ To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 9c434928c214e9..748caae2954609 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -102,7 +102,7 @@ So, for example, ``Series[category]._values`` is a ``Categorical``, while .. _ref-subclassing-pandas: -Subclassing pandas Data Structures +Subclassing pandas data structures ---------------------------------- This section has been moved to :ref:`extending.subclassing-pandas`. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b1a54307525584..b76dd3e0ff8e6e 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -3,7 +3,7 @@ {{ header }} **************** -pandas Ecosystem +Pandas ecosystem **************** Increasingly, packages are being built on top of pandas to address specific needs @@ -26,7 +26,7 @@ substantial projects that you feel should be on this list, please let us know. .. 
_ecosystem.stats: -Statistics and Machine Learning +Statistics and machine learning ------------------------------- `Statsmodels `__ @@ -243,7 +243,7 @@ you can obtain for free on the FRED website. .. _ecosystem.domain: -Domain Specific +Domain specific --------------- `Geopandas `__ @@ -332,7 +332,7 @@ and check that they're *actually* true. .. _ecosystem.extensions: -Extension Data Types +Extension data types -------------------- Pandas provides an interface for defining diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 8bb188419cb595..68ba777ec2c2aa 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -3,7 +3,7 @@ {{ header }} ******************** -10 Minutes to pandas +10 minutes to pandas ******************** This is a short introduction to pandas, geared mainly for new users. @@ -16,7 +16,7 @@ Customarily, we import as follows: import numpy as np import pandas as pd -Object Creation +Object creation --------------- See the :ref:`Data Structure Intro section `. @@ -83,7 +83,7 @@ As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically tab completed. ``E`` is there as well; the rest of the attributes have been truncated for brevity. -Viewing Data +Viewing data ------------ See the :ref:`Basics section `. @@ -183,7 +183,7 @@ Selecting via ``[]``, which slices the rows. df[0:3] df['20130102':'20130104'] -Selection by Label +Selection by label ~~~~~~~~~~~~~~~~~~ See more in :ref:`Selection by Label `. @@ -224,7 +224,7 @@ For getting fast access to a scalar (equivalent to the prior method): df.at[dates[0], 'A'] -Selection by Position +Selection by position ~~~~~~~~~~~~~~~~~~~~~ See more in :ref:`Selection by Position `. @@ -271,7 +271,7 @@ For getting fast access to a scalar (equivalent to the prior method): df.iat[1, 1] -Boolean Indexing +Boolean indexing ~~~~~~~~~~~~~~~~ Using a single column's values to select data. @@ -340,7 +340,7 @@ A ``where`` operation with setting. df2 -Missing Data +Missing data ------------ pandas primarily uses the value ``np.nan`` to represent missing data. It is by @@ -580,7 +580,7 @@ With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the stacked.unstack(1) stacked.unstack(0) -Pivot Tables +Pivot tables ~~~~~~~~~~~~ See the section on :ref:`Pivot Tables `. @@ -600,7 +600,7 @@ We can produce pivot tables from this data very easily: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) -Time Series +Time series ----------- pandas has simple, powerful, and efficient functionality for performing @@ -735,7 +735,7 @@ of the columns with labels: @savefig frame_plot_basic.png plt.legend(loc='best') -Getting Data In/Out +Getting data in/out ------------------- CSV diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 5ec0094de0a914..3ba79210a43ee1 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -3,7 +3,7 @@ {{ header }} ============================== - Essential Basic Functionality + Essential basic functionality ============================== Here we discuss a lot of the essential functionality common to the pandas data @@ -19,7 +19,7 @@ the previous section: .. _basics.head_tail: -Head and Tail +Head and tail ------------- To view a small sample of a Series or DataFrame object, use the @@ -34,7 +34,7 @@ of elements to display is five, but you may pass a custom number. .. 
_basics.attrs: -Attributes and Underlying Data +Attributes and underlying data ------------------------------ pandas objects have a number of attributes enabling you to access the metadata @@ -286,7 +286,7 @@ using ``fillna`` if you wish). .. _basics.compare: -Flexible Comparisons +Flexible comparisons ~~~~~~~~~~~~~~~~~~~~ Series and DataFrame have the binary comparison methods ``eq``, ``ne``, ``lt``, ``gt``, @@ -304,7 +304,7 @@ indexing operations, see the section on :ref:`Boolean indexing .. _basics.reductions: -Boolean Reductions +Boolean reductions ~~~~~~~~~~~~~~~~~~ You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, @@ -468,7 +468,7 @@ which we illustrate: df2 df1.combine_first(df2) -General DataFrame Combine +General DataFrame combine ~~~~~~~~~~~~~~~~~~~~~~~~~ The :meth:`~DataFrame.combine_first` method above calls the more general @@ -643,7 +643,7 @@ there for details about accepted inputs. .. _basics.idxmin: -Index of Min/Max Values +Index of min/max values ~~~~~~~~~~~~~~~~~~~~~~~ The :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` functions on Series @@ -677,7 +677,7 @@ matching index: .. _basics.discretization: -Value counts (histogramming) / Mode +Value counts (histogramming) / mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The :meth:`~Series.value_counts` Series method and top-level function computes a histogram @@ -752,7 +752,7 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise. .. _basics.pipe: -Tablewise Function Application +Tablewise function application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``DataFrames`` and ``Series`` can of course just be passed into functions. @@ -806,7 +806,7 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`. .. _R: https://www.r-project.org -Row or Column-wise Function Application +Row or column-wise function application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Arbitrary functions can be applied along the axes of a DataFrame @@ -987,7 +987,7 @@ not noted for a particular column will be ``NaN``: .. _basics.aggregation.mixed_dtypes: -Mixed Dtypes +Mixed dtypes ++++++++++++ When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid @@ -1106,7 +1106,7 @@ selective transforms. .. _basics.elementwise: -Applying Elementwise Functions +Applying elementwise functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Since not all functions can be vectorized (accept NumPy arrays and return @@ -1726,7 +1726,7 @@ sorting by column values, and sorting by a combination of both. .. _basics.sort_index: -By Index +By index ~~~~~~~~ The :meth:`Series.sort_index` and :meth:`DataFrame.sort_index` methods are @@ -1753,7 +1753,7 @@ used to sort a pandas object by its index levels. .. _basics.sort_values: -By Values +By values ~~~~~~~~~ The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The @@ -1785,7 +1785,7 @@ argument: .. _basics.sort_indexes_and_values: -By Indexes and Values +By indexes and values ~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.23.0 diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 2957430666b8a8..444e886bc951d2 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -26,7 +26,7 @@ use HDF5 files, see :ref:`io.external_compatibility` for an example. 
-Quick Reference +Quick reference --------------- We'll start off with a quick reference guide pairing some common R @@ -35,7 +35,7 @@ operations using `dplyr pandas equivalents. -Querying, Filtering, Sampling +Querying, filtering, sampling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ =========================================== =========================================== @@ -85,7 +85,7 @@ R pandas =========================================== =========================================== -Grouping and Summarizing +Grouping and summarizing ~~~~~~~~~~~~~~~~~~~~~~~~ ============================================== =========================================== diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index cbedeec737ec05..69bb700c97b15c 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -31,10 +31,10 @@ As is customary, we import pandas and NumPy as follows: proc print data=df(obs=5); run; -Data Structures +Data structures --------------- -General Terminology Translation +General terminology translation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. csv-table:: @@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an ``Index`` effectively. -Data Input / Output +Data input / output ------------------- -Constructing a DataFrame from Values +Constructing a DataFrame from values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A SAS data set can be built from specified values by @@ -110,7 +110,7 @@ and the values are the data. df -Reading External Data +Reading external data ~~~~~~~~~~~~~~~~~~~~~ Like SAS, pandas provides utilities for reading in data from @@ -151,7 +151,7 @@ In addition to text/csv, pandas supports a variety of other data formats such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*`` function. See the :ref:`IO documentation` for more details. -Exporting Data +Exporting data ~~~~~~~~~~~~~~ The inverse of ``PROC IMPORT`` in SAS is ``PROC EXPORT`` @@ -169,10 +169,10 @@ and other data formats follow a similar api. tips.to_csv('tips2.csv') -Data Operations +Data operations --------------- -Operations on Columns +Operations on columns ~~~~~~~~~~~~~~~~~~~~~ In the ``DATA`` step, arbitrary math expressions can @@ -228,7 +228,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin tips[tips['total_bill'] > 10].head() -If/Then Logic +If/then logic ~~~~~~~~~~~~~ In SAS, if/then logic can be used to create new columns. @@ -256,7 +256,7 @@ the ``where`` method from ``numpy``. tips = tips.drop('bucket', axis=1) -Date Functionality +Date functionality ~~~~~~~~~~~~~~~~~~ SAS provides a variety of functions to do operations on @@ -301,7 +301,7 @@ see the :ref:`timeseries documentation` for more details. tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', 'months_between'], axis=1) -Selection of Columns +Selection of columns ~~~~~~~~~~~~~~~~~~~~ SAS provides keywords in the ``DATA`` step to select, @@ -338,7 +338,7 @@ The same operations are expressed in pandas below. tips.rename(columns={'total_bill': 'total_bill_2'}).head() -Sorting by Values +Sorting by values ~~~~~~~~~~~~~~~~~ Sorting in SAS is accomplished via ``PROC SORT`` @@ -358,7 +358,7 @@ takes a list of columns to sort by. tips.head() -String Processing +String processing ----------------- Length @@ -466,7 +466,7 @@ approaches, but this just shows a simple approach. 
firstlast -Upcase, Lowcase, and Propcase +Upcase, lowcase, and propcase ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The SAS `UPCASE `__ @@ -555,7 +555,7 @@ types are accomplished via the ``how`` keyword. outer_join -Missing Data +Missing data ------------ Like SAS, pandas has a representation for missing data - which is the @@ -671,7 +671,7 @@ operation. tips.head() -By Group Processing +By group processing ~~~~~~~~~~~~~~~~~~~ In addition to aggregation, pandas ``groupby`` can be used to @@ -701,7 +701,7 @@ In pandas this would be written as: Other Considerations -------------------- -Disk vs Memory +Disk vs memory ~~~~~~~~~~~~~~ pandas operates exclusively in memory, where a SAS data set exists on disk. @@ -713,7 +713,7 @@ If out of core processing is needed, one possibility is the library (currently in development) which provides a subset of pandas functionality for an on-disk ``DataFrame`` -Data Interop +Data interop ~~~~~~~~~~~~ pandas provides a :func:`read_sas` method that can read SAS data saved in diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index c354ed7872cb4b..db687386329bb9 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -31,10 +31,10 @@ libraries as ``pd`` and ``np``, respectively, for the rest of the document. list in 1/5 -Data Structures +Data structures --------------- -General Terminology Translation +General terminology translation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. csv-table:: @@ -78,10 +78,10 @@ see the :ref:`indexing documentation` for much more on how to use an ``Index`` effectively. -Data Input / Output +Data input / output ------------------- -Constructing a DataFrame from Values +Constructing a DataFrame from values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A Stata data set can be built from specified values by @@ -107,7 +107,7 @@ and the values are the data. df -Reading External Data +Reading external data ~~~~~~~~~~~~~~~~~~~~~ Like Stata, pandas provides utilities for reading in data from @@ -155,7 +155,7 @@ such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a function. See the :ref:`IO documentation` for more details. -Exporting Data +Exporting data ~~~~~~~~~~~~~~ The inverse of ``import delimited`` in Stata is ``export delimited`` @@ -177,10 +177,10 @@ Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` tips.to_stata('tips2.dta') -Data Operations +Data operations --------------- -Operations on Columns +Operations on columns ~~~~~~~~~~~~~~~~~~~~~ In Stata, arbitrary math expressions can be used with the ``generate`` and @@ -222,7 +222,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin tips[tips['total_bill'] > 10].head() -If/Then Logic +If/then logic ~~~~~~~~~~~~~ In Stata, an ``if`` clause can also be used to create new columns. @@ -245,7 +245,7 @@ the ``where`` method from ``numpy``. tips = tips.drop('bucket', axis=1) -Date Functionality +Date functionality ~~~~~~~~~~~~~~~~~~ Stata provides a variety of functions to do operations on @@ -290,7 +290,7 @@ see the :ref:`timeseries documentation` for more details. tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', 'months_between'], axis=1) -Selection of Columns +Selection of columns ~~~~~~~~~~~~~~~~~~~~ Stata provides keywords to select, drop, and rename columns. @@ -319,7 +319,7 @@ to a variable. 
tips.rename(columns={'total_bill': 'total_bill_2'}).head() -Sorting by Values +Sorting by values ~~~~~~~~~~~~~~~~~ Sorting in Stata is accomplished via ``sort`` @@ -337,10 +337,10 @@ takes a list of columns to sort by. tips.head() -String Processing +String processing ----------------- -Finding Length of String +Finding length of string ~~~~~~~~~~~~~~~~~~~~~~~~ Stata determines the length of a character string with the :func:`strlen` and @@ -361,7 +361,7 @@ Use ``len`` and ``rstrip`` to exclude trailing blanks. tips['time'].str.rstrip().str.len().head() -Finding Position of Substring +Finding position of substring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Stata determines the position of a character in a string with the :func:`strpos` function. @@ -383,7 +383,7 @@ the function will return -1 if it fails to find the substring. tips['sex'].str.find("ale").head() -Extracting Substring by Position +Extracting substring by position ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Stata extracts a substring from a string based on its position with the :func:`substr` function. @@ -401,7 +401,7 @@ indexes are zero-based. tips['sex'].str[0:1].head() -Extracting nth Word +Extracting nth word ~~~~~~~~~~~~~~~~~~~ The Stata :func:`word` function returns the nth word from a string. @@ -431,7 +431,7 @@ approaches, but this just shows a simple approach. firstlast -Changing Case +Changing case ~~~~~~~~~~~~~ The Stata :func:`strupper`, :func:`strlower`, :func:`strproper`, @@ -547,7 +547,7 @@ types are accomplished via the ``how`` keyword. outer_join -Missing Data +Missing data ------------ Like Stata, pandas has a representation for missing data -- the @@ -645,7 +645,7 @@ operation. tips.head() -By Group Processing +By group processing ~~~~~~~~~~~~~~~~~~~ In addition to aggregation, pandas ``groupby`` can be used to @@ -664,10 +664,10 @@ In pandas this would be written as: tips.groupby(['sex', 'smoker']).first() -Other Considerations +Other considerations -------------------- -Disk vs Memory +Disk vs memory ~~~~~~~~~~~~~~ Pandas and Stata both operate exclusively in memory. This means that the size of diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 1abca7ac393dd7..914c55115567aa 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -3,7 +3,7 @@ {{ header }} ************************ -Intro to Data Structures +Intro to data structures ************************ We'll start with a quick, non-comprehensive overview of the fundamental data @@ -399,7 +399,7 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). -**Missing Data** +**Missing data** Much more will be said on this topic in the :ref:`Missing data ` section. To construct a DataFrame with missing data, we use ``np.nan`` to @@ -407,7 +407,7 @@ represent missing values. Alternatively, you may pass a ``numpy.MaskedArray`` as the data argument to the DataFrame constructor, and its masked entries will be considered missing. -Alternate Constructors +Alternate constructors ~~~~~~~~~~~~~~~~~~~~~~ .. _basics.dataframe.from_dict: @@ -498,7 +498,7 @@ available to insert at a particular location in the columns: .. 
_dsintro.chained_assignment: -Assigning New Columns in Method Chains +Assigning new columns in method chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Inspired by `dplyr's @@ -614,7 +614,7 @@ To write code compatible with all versions of Python, split the assignment in tw -Indexing / Selection +Indexing / selection ~~~~~~~~~~~~~~~~~~~~ The basics of indexing are as follows: diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index b531f686951fc3..ec76c60f24257b 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -81,7 +81,7 @@ Some other notes - pandas has been used extensively in production in financial applications. -Data Structures +Data structures --------------- .. csv-table:: @@ -131,7 +131,7 @@ changed, but, for example, columns can be inserted into a DataFrame. However, the vast majority of methods produce new objects and leave the input data untouched. In general we like to **favor immutability** where sensible. -Getting Support +Getting support --------------- The first stop for pandas issues and ideas is the `Github Issue Tracker @@ -152,7 +152,7 @@ pandas is a `NumFOCUS `__ sponso This will help ensure the success of development of pandas as a world-class open-source project, and makes it possible to `donate `__ to the project. -Project Governance +Project governance ------------------ The governance process that pandas project has used informally since its inception in 2008 is formalized in `Project Governance documents `__. @@ -160,13 +160,13 @@ The documents clarify how decisions are made and how the various elements of our Wes McKinney is the Benevolent Dictator for Life (BDFL). -Development Team +Development team ----------------- The list of the Core Team members and more detailed information can be found on the `people’s page `__ of the governance repo. -Institutional Partners +Institutional partners ---------------------- The information about current institutional partners can be found on `pandas website page `__. diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 8e23c643280c1a..212f3636d0a987 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -8,7 +8,7 @@ Tutorials This is a guide to many pandas tutorials, geared mainly for new users. -Internal Guides +Internal guides =============== pandas' own :ref:`10 Minutes to pandas<10min>`. @@ -17,7 +17,7 @@ More complex recipes are in the :ref:`Cookbook`. A handy pandas `cheat sheet `_. -Community Guides +Community guides ================ pandas Cookbook by Julia Evans @@ -74,7 +74,7 @@ Excel charts with pandas, vincent and xlsxwriter * `Using Pandas and XlsxWriter to create Excel charts `_ -Video Tutorials +Video tutorials --------------- * `Pandas From The Ground Up `_ @@ -96,7 +96,7 @@ Video Tutorials `Jupyter Notebook `__ -Various Tutorials +Various tutorials ----------------- * `Wes McKinney's (pandas BDFL) blog `_ diff --git a/doc/source/install.rst b/doc/source/install.rst index 013a27c980e977..352b56ebd30200 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -236,7 +236,7 @@ Package Minimum support .. _install.recommended_dependencies: -Recommended Dependencies +Recommended dependencies ~~~~~~~~~~~~~~~~~~~~~~~~ * `numexpr `__: for accelerating certain numerical operations. @@ -255,7 +255,7 @@ Recommended Dependencies .. 
_install.optional_dependencies: -Optional Dependencies +Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ Pandas has many optional dependencies that are only used for specific methods. @@ -299,7 +299,7 @@ zlib Compression for msgpack .. _optional_html: -Optional Dependencies for Parsing HTML +Optional dependencies for parsing HTML ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ One of the following combinations of libraries is needed to use the diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 38406bf5b26560..77a87cafb92581 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -3,7 +3,7 @@ .. _api.arrays: ============= -Pandas Arrays +Pandas arrays ============= .. currentmodule:: pandas @@ -37,7 +37,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra .. _api.arrays.datetime: -Datetime Data +Datetime data ------------- NumPy cannot natively represent timezone-aware datetimes. Pandas supports this @@ -156,7 +156,7 @@ If the data are tz-aware, then every value in the array must have the same timez .. _api.arrays.timedelta: -Timedelta Data +Timedelta data -------------- NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta` @@ -211,7 +211,7 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`. .. _api.arrays.period: -Timespan Data +Timespan data ------------- Pandas represents spans of times as :class:`Period` objects. @@ -277,7 +277,7 @@ Every period in a ``PeriodArray`` must have the same ``freq``. .. _api.arrays.interval: -Interval Data +Interval data ------------- Arbitrary intervals can be represented as :class:`Interval` objects. @@ -342,7 +342,7 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`. .. _api.arrays.integer_na: -Nullable Integer +Nullable integer ---------------- :class:`numpy.ndarray` cannot natively represent integer-data with missing values. @@ -369,7 +369,7 @@ Pandas provides this through :class:`arrays.IntegerArray`. .. _api.arrays.categorical: -Categorical Data +Categorical data ---------------- Pandas defines a custom data type for representing data that can take only a @@ -434,7 +434,7 @@ data. See :ref:`api.series.cat` for more. .. _api.arrays.sparse: -Sparse Data +Sparse data ----------- Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 7d5cd5d245631e..6ae2ea6e392e63 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -115,7 +115,7 @@ Binary operator functions DataFrame.combine DataFrame.combine_first -Function application, GroupBy & Window +Function application, GroupBy & window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -133,7 +133,7 @@ Function application, GroupBy & Window .. _api.dataframe.stats: -Computations / Descriptive Stats +Computations / descriptive stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -177,7 +177,7 @@ Computations / Descriptive Stats DataFrame.var DataFrame.nunique -Reindexing / Selection / Label manipulation +Reindexing / selection / label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -312,7 +312,7 @@ specific plotting methods of the form ``DataFrame.plot.``. .. 
_api.frame.sparse: -Sparse Accessor +Sparse accessor ~~~~~~~~~~~~~~~ Sparse-dtype specific methods and attributes are provided under the @@ -332,7 +332,7 @@ Sparse-dtype specific methods and attributes are provided under the DataFrame.sparse.to_dense -Serialization / IO / Conversion +Serialization / IO / conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 5c8a563a47d001..921eb737aef076 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -40,7 +40,7 @@ Function application GroupBy.transform GroupBy.pipe -Computations / Descriptive Stats +Computations / descriptive stats -------------------------------- .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 31b493e472099f..12ca318c815d35 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -3,7 +3,7 @@ .. _api: ============= -API Reference +API reference ============= This page gives an overview of all public pandas objects, functions and diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 8931caf3943880..bbac964e8a201a 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -3,7 +3,7 @@ .. _api.indexing: ============= -Index Objects +Index objects ============= Index @@ -48,7 +48,7 @@ Properties Index.T Index.memory_usage -Modifying and Computations +Modifying and computations ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -96,7 +96,7 @@ Compatibility with MultiIndex Index.is_lexsorted_for_tuple Index.droplevel -Missing Values +Missing values ~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -205,7 +205,7 @@ CategoricalIndex CategoricalIndex -Categorical Components +Categorical components ~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -222,7 +222,7 @@ Categorical Components CategoricalIndex.as_ordered CategoricalIndex.as_unordered -Modifying and Computations +Modifying and computations ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -240,7 +240,7 @@ IntervalIndex IntervalIndex -IntervalIndex Components +IntervalIndex components ~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -278,7 +278,7 @@ MultiIndex IndexSlice -MultiIndex Constructors +MultiIndex constructors ~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -288,7 +288,7 @@ MultiIndex Constructors MultiIndex.from_product MultiIndex.from_frame -MultiIndex Properties +MultiIndex properties ~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -299,7 +299,7 @@ MultiIndex Properties MultiIndex.nlevels MultiIndex.levshape -MultiIndex Components +MultiIndex components ~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -316,7 +316,7 @@ MultiIndex Components MultiIndex.reorder_levels MultiIndex.remove_unused_levels -MultiIndex Selecting +MultiIndex selecting ~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -336,7 +336,7 @@ DatetimeIndex DatetimeIndex -Time/Date Components +Time/Date components ~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 9c776e3ff8a82c..666220d390cdc3 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -3,7 +3,7 @@ .. _api.io: ============ -Input/Output +Input/output ============ .. currentmodule:: pandas @@ -14,7 +14,7 @@ Pickling read_pickle -Flat File +Flat file ~~~~~~~~~ .. 
autosummary:: :toctree: api/ diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index ccc1c7e171d229..4a58055f1c9559 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -3,7 +3,7 @@ .. _api.dateoffsets: ============ -Date Offsets +Date offsets ============ .. currentmodule:: pandas.tseries.offsets diff --git a/doc/source/reference/resampling.rst b/doc/source/reference/resampling.rst index 2a52defa3c68f6..57263139d9c180 100644 --- a/doc/source/reference/resampling.rst +++ b/doc/source/reference/resampling.rst @@ -43,7 +43,7 @@ Upsampling Resampler.asfreq Resampler.interpolate -Computations / Descriptive Stats +Computations / descriptive stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index a061f696f4b309..e8e2f64e22cb51 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -119,7 +119,7 @@ Binary operator functions Series.product Series.dot -Function application, GroupBy & Window +Function application, groupby & window -------------------------------------- .. autosummary:: :toctree: api/ @@ -137,7 +137,7 @@ Function application, GroupBy & Window .. _api.series.stats: -Computations / Descriptive Stats +Computations / descriptive stats -------------------------------- .. autosummary:: :toctree: api/ @@ -188,7 +188,7 @@ Computations / Descriptive Stats Series.value_counts Series.compound -Reindexing / Selection / Label manipulation +Reindexing / selection / label manipulation ------------------------------------------- .. autosummary:: :toctree: api/ @@ -296,14 +296,14 @@ Sparse :ref:`sparse ` .. _api.series.dt: -Datetimelike Properties +Datetimelike properties ~~~~~~~~~~~~~~~~~~~~~~~ ``Series.dt`` can be used to access the values of the series as datetimelike and return several properties. These can be accessed like ``Series.dt.``. -Datetime Properties +Datetime properties ^^^^^^^^^^^^^^^^^^^ .. autosummary:: @@ -339,7 +339,7 @@ Datetime Properties Series.dt.tz Series.dt.freq -Datetime Methods +Datetime methods ^^^^^^^^^^^^^^^^ .. autosummary:: @@ -358,7 +358,7 @@ Datetime Methods Series.dt.month_name Series.dt.day_name -Period Properties +Period properties ^^^^^^^^^^^^^^^^^ .. autosummary:: @@ -369,7 +369,7 @@ Period Properties Series.dt.start_time Series.dt.end_time -Timedelta Properties +Timedelta properties ^^^^^^^^^^^^^^^^^^^^ .. autosummary:: @@ -382,7 +382,7 @@ Timedelta Properties Series.dt.nanoseconds Series.dt.components -Timedelta Methods +Timedelta methods ^^^^^^^^^^^^^^^^^ .. autosummary:: @@ -478,7 +478,7 @@ strings and apply several methods to it. These can be accessed like .. _api.series.cat: -Categorical Accessor +Categorical accessor ~~~~~~~~~~~~~~~~~~~~ Categorical-dtype specific methods and attributes are available under @@ -508,7 +508,7 @@ the ``Series.cat`` accessor. .. _api.series.sparse: -Sparse Accessor +Sparse accessor ~~~~~~~~~~~~~~~ Sparse-dtype specific methods and attributes are provided under the @@ -560,7 +560,7 @@ specific plotting methods of the form ``Series.plot.``. Series.hist -Serialization / IO / Conversion +Serialization / IO / conversion ------------------------------- .. 
autosummary:: :toctree: api/ diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index bd9635b41e343e..3d155535e25857 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -9,7 +9,7 @@ Style ``Styler`` objects are returned by :attr:`pandas.DataFrame.style`. -Styler Constructor +Styler constructor ------------------ .. autosummary:: :toctree: api/ @@ -17,7 +17,7 @@ Styler Constructor Styler Styler.from_custom_template -Styler Properties +Styler properties ----------------- .. autosummary:: :toctree: api/ @@ -26,7 +26,7 @@ Styler Properties Styler.template Styler.loader -Style Application +Style application ----------------- .. autosummary:: :toctree: api/ @@ -44,7 +44,7 @@ Style Application Styler.clear Styler.pipe -Builtin Styles +Builtin styles -------------- .. autosummary:: :toctree: api/ @@ -55,7 +55,7 @@ Builtin Styles Styler.background_gradient Styler.bar -Style Export and Import +Style export and import ----------------------- .. autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index eb1ca97e465f87..280eb05964787b 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -3,7 +3,7 @@ {{ header }} ****************************** -MultiIndex / Advanced Indexing +MultiIndex / advanced indexing ****************************** This section covers :ref:`indexing with a MultiIndex ` @@ -179,7 +179,7 @@ on a deeper level. .. _advanced.shown_levels: -Defined Levels +Defined levels ~~~~~~~~~~~~~~ The :class:`MultiIndex` keeps all the defined levels of an index, even @@ -642,7 +642,7 @@ And now selection works as expected. dfm.loc[(0, 'y'):(1, 'z')] -Take Methods +Take methods ------------ .. _advanced.take: @@ -712,7 +712,7 @@ faster than fancy indexing. .. _indexing.index_types: -Index Types +Index types ----------- We have discussed ``MultiIndex`` in the previous sections pretty extensively. @@ -981,7 +981,7 @@ bins, with ``NaN`` representing a missing value similar to other dtypes. pd.cut([0, 3, 5, 1], bins=c.categories) -Generating Ranges of Intervals +Generating ranges of intervals ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If we need intervals on a regular frequency, we can use the :func:`interval_range` function diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index a6315c548b3827..7dca34385c1eec 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -3,7 +3,7 @@ {{ header }} **************** -Categorical Data +Categorical data **************** This is an introduction to pandas categorical data type, including a short comparison @@ -38,10 +38,10 @@ See also the :ref:`API docs on categoricals`. .. 
_categorical.objectcreation: -Object Creation +Object creation --------------- -Series Creation +Series creation ~~~~~~~~~~~~~~~ Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways: @@ -90,7 +90,7 @@ Categorical data has a specific ``category`` :ref:`dtype `: df.dtypes -DataFrame Creation +DataFrame creation ~~~~~~~~~~~~~~~~~~ Similar to the previous section where a single column was converted to categorical, all columns in a @@ -130,7 +130,7 @@ This conversion is likewise done column by column: df_cat['B'] -Controlling Behavior +Controlling behavior ~~~~~~~~~~~~~~~~~~~~ In the examples above where we passed ``dtype='category'``, we used the default @@ -181,7 +181,7 @@ during normal constructor mode: categories=["train", "test"])) -Regaining Original Data +Regaining original data ~~~~~~~~~~~~~~~~~~~~~~~ To get back to the original ``Series`` or NumPy array, use @@ -243,7 +243,7 @@ expects a `dtype`. For example :func:`pandas.read_csv`, array. In other words, ``dtype='category'`` is equivalent to ``dtype=CategoricalDtype()``. -Equality Semantics +Equality semantics ~~~~~~~~~~~~~~~~~~ Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal @@ -438,7 +438,7 @@ use :meth:`~pandas.Categorical.set_categories`. intentionally or because it is misspelled or (under Python3) due to a type difference (e.g., NumPy S1 dtype and Python strings). This can result in surprising behaviour! -Sorting and Order +Sorting and order ----------------- .. _categorical.sort: @@ -510,7 +510,7 @@ necessarily make the sort order the same as the categories order. (e.g. :meth:`Series.median`, which would need to compute the mean between two values if the length of an array is even) do not work and raise a ``TypeError``. -Multi Column Sorting +Multi column sorting ~~~~~~~~~~~~~~~~~~~~ A categorical dtyped column will participate in a multi-column sort in a similar manner to other columns. @@ -963,7 +963,7 @@ Following table summarizes the results of ``Categoricals`` related concatenation +----------+--------------------------------------------------------+----------------------------+ -Getting Data In/Out +Getting data in/out ------------------- You can write data that contains ``category`` dtypes to a ``HDFStore``. @@ -1000,7 +1000,7 @@ relevant columns back to `category` and assign the right categories and categori The same holds for writing to a SQL database with ``to_sql``. -Missing Data +Missing data ------------ pandas primarily uses the value `np.nan` to represent missing data. It is by @@ -1052,7 +1052,7 @@ Gotchas .. _categorical.rfactor: -Memory Usage +Memory usage ~~~~~~~~~~~~ .. _categorical.memory: @@ -1152,7 +1152,7 @@ You can use ``fillna`` to handle missing values before applying a function. 
df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) -Categorical Index +Categorical index ~~~~~~~~~~~~~~~~~ ``CategoricalIndex`` is a type of index that is useful for supporting @@ -1173,7 +1173,7 @@ Setting the index will create a ``CategoricalIndex``: # This now sorts by the categories order df.sort_index() -Side Effects +Side effects ~~~~~~~~~~~~ Constructing a ``Series`` from a ``Categorical`` will not copy the input diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 71cbf58dff871c..a2f93dcf337d78 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -5,12 +5,12 @@ Computational tools =================== -Statistical Functions +Statistical functions --------------------- .. _computation.pct_change: -Percent Change +Percent change ~~~~~~~~~~~~~~ ``Series`` and ``DataFrame`` have a method @@ -294,7 +294,7 @@ sugar for applying the moving window operator to all of the DataFrame's columns: .. _stats.summary: -Method Summary +Method summary ~~~~~~~~~~~~~~ We provide a number of common statistical functions: @@ -335,7 +335,7 @@ compute the mean absolute deviation on a rolling basis: .. _stats.rolling_window: -Rolling Windows +Rolling windows ~~~~~~~~~~~~~~~ Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation, that is weighted according the ``win_type``. @@ -404,7 +404,7 @@ For some windowing functions, additional parameters must be specified: .. _stats.moments.ts: -Time-aware Rolling +Time-aware rolling ~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.19.0 @@ -469,7 +469,7 @@ default of the index) in a DataFrame. .. _stats.rolling_window.endpoints: -Rolling Window Endpoints +Rolling window endpoints ~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.20.0 @@ -511,7 +511,7 @@ For fixed windows, the closed parameter cannot be set and the rolling window wil .. _stats.moments.ts-versus-resampling: -Time-aware Rolling vs. Resampling +Time-aware rolling vs. resampling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They @@ -529,7 +529,7 @@ will have the shape of a regular frequency between the min and the max of the or To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. -Centering Windows +Centering windows ~~~~~~~~~~~~~~~~~ By default the labels are set to the right edge of the window, but a @@ -542,7 +542,7 @@ By default the labels are set to the right edge of the window, but a .. _stats.moments.binary: -Binary Window Functions +Binary window functions ~~~~~~~~~~~~~~~~~~~~~~~ :meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about @@ -695,7 +695,7 @@ Furthermore you can pass a nested dict to indicate different aggregations on dif .. _stats.moments.expanding: -Expanding Windows +Expanding windows ----------------- A common alternative to rolling statistics is to use an *expanding* window, @@ -716,7 +716,7 @@ they are implemented in pandas such that the following two calls are equivalent: These have a similar set of methods to ``.rolling`` methods. -Method Summary +Method summary ~~~~~~~~~~~~~~ .. currentmodule:: pandas.core.window @@ -798,7 +798,7 @@ relative impact of an individual data point. As an example, here is the .. _stats.moments.exponentially_weighted: -Exponentially Weighted Windows +Exponentially weighted windows ------------------------------ .. 
currentmodule:: pandas.core.window diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 772362cab396c6..15af5208a4f1f3 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -99,7 +99,7 @@ Splitting df[df.AAA <= 5] df[df.AAA > 5] -Building Criteria +Building criteria ***************** `Select with multi-column criteria @@ -245,7 +245,7 @@ Ambiguity arises when an index consists of integers with a non-zero start or non df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] -New Columns +New columns *********** `Efficiently and dynamically creating new columns using applymap @@ -399,7 +399,7 @@ Sorting df.sort_values(by=('Labs', 'II'), ascending=False) -`Partial Selection, the need for sortedness; +`Partial selection, the need for sortedness; `__ Levels @@ -413,7 +413,7 @@ Levels .. _cookbook.missing_data: -Missing Data +Missing data ------------ The :ref:`missing data` docs. @@ -485,7 +485,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to expected_df = gb.apply(GrowUp) expected_df -`Expanding Apply +`Expanding apply `__ .. ipython:: python @@ -595,7 +595,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df.A.groupby((df.A != df.A.shift()).cumsum()).groups df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum() -Expanding Data +Expanding data ************** `Alignment and to-date @@ -690,7 +690,7 @@ To create year and month cross tabulation: Apply ***** -`Rolling Apply to Organize - Turning embedded lists into a MultiIndex frame +`Rolling apply to organize - Turning embedded lists into a MultiIndex frame `__ .. ipython:: python @@ -706,7 +706,7 @@ Apply for ind, row in df.iterrows()}) df_orgz -`Rolling Apply with a DataFrame returning a Series +`Rolling apply with a DataFrame returning a Series `__ Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned @@ -1099,7 +1099,7 @@ HDFStore The :ref:`HDFStores ` docs -`Simple Queries with a Timestamp Index +`Simple queries with a Timestamp Index `__ `Managing heterogeneous data using a linked multiple table hierarchy @@ -1169,7 +1169,7 @@ Storing Attributes to a group node .. _cookbook.binary: -Binary Files +Binary files ************ pandas readily accepts NumPy record arrays, if you need to read in a binary @@ -1334,7 +1334,7 @@ Values can be set to NaT using np.nan, similar to datetime y[1] = np.nan y -Aliasing Axis Names +Aliasing axis names ------------------- To globally provide aliases for axis names, one can define these 2 functions: @@ -1361,7 +1361,7 @@ To globally provide aliases for axis names, one can define these 2 functions: df2.sum(axis='myaxis2') clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2') -Creating Example Data +Creating example data --------------------- To create a dataframe from every combination of some given values, like R's ``expand.grid()`` diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 525f9abb1d1aee..c15991fabfd3be 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -3,7 +3,7 @@ {{ header }} ********************* -Enhancing Performance +Enhancing performance ********************* In this part of the tutorial, we will investigate how to speed up certain @@ -15,7 +15,7 @@ when we use Cython and Numba on a test function operating row-wise on the .. 
_enhancingperf.cython: -Cython (Writing C extensions for pandas) +Cython (writing C extensions for pandas) ---------------------------------------- For many use cases writing pandas in pure Python and NumPy is sufficient. In some @@ -33,7 +33,7 @@ faster than the pure Python solution. .. _enhancingperf.pure: -Pure python +Pure Python ~~~~~~~~~~~ We have a ``DataFrame`` to which we want to apply a function row-wise. @@ -429,7 +429,7 @@ Read more in the `Numba docs `__. .. _enhancingperf.eval: -Expression Evaluation via :func:`~pandas.eval` +Expression evaluation via :func:`~pandas.eval` ----------------------------------------------- The top-level function :func:`pandas.eval` implements expression evaluation of @@ -465,7 +465,7 @@ engine in addition to some extensions available only in pandas. The larger the frame and the larger the expression the more speedup you will see from using :func:`~pandas.eval`. -Supported Syntax +Supported syntax ~~~~~~~~~~~~~~~~ These operations are supported by :func:`pandas.eval`: @@ -505,7 +505,7 @@ This Python syntax is **not** allowed: -:func:`~pandas.eval` Examples +:func:`~pandas.eval` examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`pandas.eval` works well with expressions containing large arrays. @@ -669,7 +669,7 @@ whether the query modifies the original frame. Unlike with ``eval``, the default value for ``inplace`` for ``query`` is ``False``. This is consistent with prior versions of pandas. -Local Variables +Local variables ~~~~~~~~~~~~~~~ You must *explicitly reference* any local variable that you want to use in an @@ -714,7 +714,7 @@ standard Python. pd.eval('a + b') -:func:`pandas.eval` Parsers +:func:`pandas.eval` parsers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two different parsers and two different engines you can use as @@ -754,7 +754,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would in vanilla Python. -:func:`pandas.eval` Backends +:func:`pandas.eval` backends ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There's also the option to make :func:`~pandas.eval` operate identical to plain @@ -779,7 +779,7 @@ is a bit slower (not by much) than evaluating the same expression in Python %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') -:func:`pandas.eval` Performance +:func:`pandas.eval` performance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`~pandas.eval` is intended to speed up certain kinds of operations. In @@ -804,7 +804,7 @@ computation. The two lines are two different engines. This plot was created using a ``DataFrame`` with 3 columns each containing floating point values generated using ``numpy.random.randn()``. -Technical Minutia Regarding Expression Evaluation +Technical minutia regarding expression evaluation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Expressions that would result in an object dtype or involve datetime operations diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 3d89fe171a343e..f9a72b87e58d8f 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -75,7 +75,7 @@ See also :ref:`Categorical Memory Usage `. .. 
_gotchas.truth: -Using If/Truth Statements with pandas +Using if/truth statements with pandas ------------------------------------- pandas follows the NumPy convention of raising an error when you try to convert @@ -317,7 +317,7 @@ See `this link ` and the dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() -Groupby by Indexer to 'resample' data +Groupby by indexer to 'resample' data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 02522e95a2d79e..c09eb87df03689 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -3,7 +3,7 @@ {{ header }} *************************** -Indexing and Selecting Data +Indexing and selecting data *************************** The axis labeling information in pandas objects serves many purposes: @@ -46,7 +46,7 @@ See the :ref:`cookbook` for some advanced strategies. .. _indexing.choice: -Different Choices for Indexing +Different choices for indexing ------------------------------ Object selection has had a number of user-requested additions in order to @@ -181,7 +181,7 @@ columns. df[['A', 'B']] -Attribute Access +Attribute access ---------------- .. _indexing.columns.multiple: @@ -287,7 +287,7 @@ largely as a convenience since it is such a common operation. .. _indexing.label: -Selection By Label +Selection by label ------------------ .. warning:: @@ -420,7 +420,7 @@ above example, ``s.loc[1:6]`` would raise ``KeyError``. .. _indexing.integer: -Selection By Position +Selection by position --------------------- .. warning:: @@ -533,7 +533,7 @@ A list of indexers where any element is out of bounds will raise an .. _indexing.callable: -Selection By Callable +Selection by callable --------------------- .. versionadded:: 0.18.1 @@ -573,7 +573,7 @@ without using a temporary variable. .. _indexing.deprecate_ix: -IX Indexer is Deprecated +IX indexer is deprecated ------------------------ .. warning:: @@ -631,7 +631,7 @@ For getting *multiple* indexers, using ``.get_indexer``: .. _deprecate_loc_reindex_listlike: .. _indexing.deprecate_loc_reindex_listlike: -Indexing with list with missing labels is Deprecated +Indexing with list with missing labels is deprecated ---------------------------------------------------- .. warning:: @@ -655,7 +655,7 @@ Selection with all keys found is unchanged. s.loc[[1, 2]] -Previous Behavior +Previous behavior .. code-block:: ipython @@ -667,7 +667,7 @@ Previous Behavior dtype: float64 -Current Behavior +Current behavior .. code-block:: ipython @@ -732,7 +732,7 @@ However, this would *still* raise if your resulting index is duplicated. .. _indexing.basics.partial_setting: -Selecting Random Samples +Selecting random samples ------------------------ A random selection of rows or columns from a Series or DataFrame with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows. @@ -807,7 +807,7 @@ Finally, one can also set a seed for ``sample``'s random number generator using -Setting With Enlargement +Setting with enlargement ------------------------ The ``.loc/[]`` operations can perform enlargement when setting a non-existent key for that axis. 
@@ -1076,7 +1076,7 @@ without creating a copy: df.where(df < 0, -df) == np.where(df < 0, df, -df) -**alignment** +**Alignment** Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), such that partial selection with setting is possible. This is analogous to @@ -1351,7 +1351,7 @@ to ``in``/``not in``. df[df.c.isin([1, 2])] -Boolean Operators +Boolean operators ~~~~~~~~~~~~~~~~~ You can negate boolean expressions with the word ``not`` or the ``~`` operator. @@ -1407,7 +1407,7 @@ floating point values generated using ``numpy.random.randn()``. df2 = df.copy() -Duplicate Data +Duplicate data -------------- .. _indexing.duplicate: @@ -1474,7 +1474,7 @@ default value. s.get('a') # equivalent to s['a'] s.get('x', default=-1) -The :meth:`~pandas.DataFrame.lookup` Method +The :meth:`~pandas.DataFrame.lookup` method ------------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels @@ -1628,7 +1628,7 @@ Missing values idx2 idx2.fillna(pd.Timestamp('2011-01-02')) -Set / Reset Index +Set / reset index ----------------- Occasionally you will load or create a data set into a DataFrame and want to diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index c5667e9319ca61..97b9c2f95dc508 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -5,7 +5,7 @@ .. _integer_na: ************************** -Nullable Integer Data Type +Nullable integer data type ************************** .. versionadded:: 0.24.0 diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6b3edd92ab5a9e..e32bb0f1102527 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -13,7 +13,7 @@ =============================== -IO Tools (Text, CSV, HDF5, ...) +IO tools (text, CSV, HDF5, ...) =============================== The pandas I/O API is a set of top level ``reader`` functions accessed like @@ -51,7 +51,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like .. _io.read_csv_table: -CSV & Text files +CSV & text files ---------------- The workhorse function for reading text files (a.k.a. flat files) is @@ -88,7 +88,7 @@ delim_whitespace : boolean, default False .. versionadded:: 0.18.1 support for the Python parser. -Column and Index Locations and Names +Column and index locations and names ++++++++++++++++++++++++++++++++++++ header : int or list of ints, default ``'infer'`` @@ -155,7 +155,7 @@ mangle_dupe_cols : boolean, default ``True`` Passing in ``False`` will cause data to be overwritten if there are duplicate names in the columns. -General Parsing Configuration +General parsing configuration +++++++++++++++++++++++++++++ dtype : Type name or dict of column -> type, default ``None`` @@ -211,7 +211,7 @@ memory_map : boolean, default False directly onto memory and access the data directly from there. Using this option can improve performance because there is no longer any I/O overhead. -NA and Missing Data Handling +NA and missing data handling ++++++++++++++++++++++++++++ na_values : scalar, str, list-like, or dict, default ``None`` @@ -243,7 +243,7 @@ verbose : boolean, default ``False`` skip_blank_lines : boolean, default ``True`` If ``True``, skip over blank lines rather than interpreting as NaN values. -Datetime Handling +Datetime handling +++++++++++++++++ parse_dates : boolean or list of ints or names or list of lists or dict, default ``False``. 
@@ -263,7 +263,7 @@ keep_date_col : boolean, default ``False`` date_parser : function, default ``None`` Function to use for converting a sequence of string columns to an array of datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call date_parser in three different ways, + conversion. pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass @@ -288,7 +288,7 @@ chunksize : int, default ``None`` Return `TextFileReader` object for iteration. See :ref:`iterating and chunking ` below. -Quoting, Compression, and File Format +Quoting, compression, and file format +++++++++++++++++++++++++++++++++++++ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'`` @@ -348,7 +348,7 @@ tupleize_cols : boolean, default ``False`` Leave a list of tuples on columns as is (default is to convert to a MultiIndex on the columns). -Error Handling +Error handling ++++++++++++++ error_bad_lines : boolean, default ``True`` @@ -460,7 +460,7 @@ worth trying. .. _io.categorical: -Specifying Categorical dtype +Specifying categorical dtype '''''''''''''''''''''''''''' .. versionadded:: 0.19.0 @@ -529,7 +529,7 @@ This matches the behavior of :meth:`Categorical.set_categories`. df['col3'] -Naming and Using Columns +Naming and using columns '''''''''''''''''''''''' .. _io.headers: @@ -646,7 +646,7 @@ use in the final result: In this case, the callable is specifying that we exclude the "a" and "c" columns from the output. -Comments and Empty Lines +Comments and empty lines '''''''''''''''''''''''' .. _io.skiplines: @@ -759,7 +759,7 @@ We can suppress the comments using the ``comment`` keyword: .. _io.unicode: -Dealing with Unicode Data +Dealing with Unicode data ''''''''''''''''''''''''' The ``encoding`` argument should be used for encoded unicode data, which will @@ -834,7 +834,7 @@ If a subset of data is being parsed using the ``usecols`` option, the Date Handling ''''''''''''' -Specifying Date Columns +Specifying date columns +++++++++++++++++++++++ To better facilitate working with datetime data, :func:`read_csv` @@ -947,7 +947,7 @@ data columns: specify `index_col` as a column label rather then as an index on the resulting frame. -Date Parsing Functions +Date parsing functions ++++++++++++++++++++++ Finally, the parser allows you to specify a custom ``date_parser`` function to @@ -1001,7 +1001,7 @@ a single date rather than the entire array. .. _io.csv.mixed_timezones: -Parsing a CSV with mixed Timezones +Parsing a CSV with mixed timezones ++++++++++++++++++++++++++++++++++ Pandas cannot natively represent a column or index with mixed timezones. If your CSV @@ -1031,7 +1031,7 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie .. _io.dayfirst: -Inferring Datetime Format +Inferring datetime format +++++++++++++++++++++++++ If you have ``parse_dates`` enabled for some or all of your columns, and your @@ -1070,7 +1070,7 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With os.remove('foo.csv') -International Date Formats +International date formats ++++++++++++++++++++++++++ While US date formats tend to be MM/DD/YYYY, many international formats use @@ -1118,7 +1118,7 @@ writing to a file). For example: .. 
_io.thousands: -Thousand Separators +Thousand separators ''''''''''''''''''' For large numbers that have been written with a thousands separator, you can @@ -1163,7 +1163,7 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. _io.na_values: -NA Values +NA values ''''''''' To control which values are parsed as missing values (which are signified by @@ -1385,7 +1385,7 @@ should pass the ``escapechar`` option: .. _io.fwf: -Files with Fixed Width Columns +Files with fixed width columns '''''''''''''''''''''''''''''' While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works @@ -1686,7 +1686,7 @@ documentation on credentials -Writing out Data +Writing out data '''''''''''''''' .. _io.store_in_csv: @@ -1805,7 +1805,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet json = dfj.to_json() json -Orient Options +Orient options ++++++++++++++ There are a number of different options for the format of the resulting JSON @@ -1869,7 +1869,7 @@ preservation of metadata including but not limited to dtypes and index names. index and column labels during round-trip serialization. If you wish to preserve label ordering use the `split` option as it uses ordered containers. -Date Handling +Date handling +++++++++++++ Writing in ISO date format: @@ -1910,7 +1910,7 @@ Writing to a file, with a date index and a date column: with open('test.json') as fh: print(fh.read()) -Fallback Behavior +Fallback behavior +++++++++++++++++ If the JSON serializer cannot handle the container contents directly it will @@ -2003,7 +2003,7 @@ If a non-default ``orient`` was used when encoding to JSON be sure to pass the s option here so that decoding produces sensible results, see `Orient Options`_ for an overview. -Data Conversion +Data conversion +++++++++++++++ The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` @@ -2090,7 +2090,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: dfju = pd.read_json(json, date_unit='ns') dfju -The Numpy Parameter +The Numpy parameter +++++++++++++++++++ .. note:: @@ -2218,7 +2218,7 @@ For line-delimited json files, pandas can also return an iterator which reads in .. _io.table_schema: -Table Schema +Table schema '''''''''''' .. versionadded:: 0.20.0 @@ -2378,7 +2378,7 @@ HTML .. _io.read_html: -Reading HTML Content +Reading HTML content '''''''''''''''''''''' .. warning:: @@ -2797,7 +2797,7 @@ See the :ref:`cookbook` for some advanced strategies. .. _io.excel_reader: -Reading Excel Files +Reading Excel files ''''''''''''''''''' In the most basic use-case, ``read_excel`` takes a path to an Excel @@ -2879,7 +2879,7 @@ with ``on_demand=True``. .. _io.excel.specifying_sheets: -Specifying Sheets +Specifying sheets +++++++++++++++++ .. note :: The second argument is ``sheet_name``, not to be confused with ``ExcelFile.sheet_names``. 
@@ -2980,7 +2980,7 @@ should be passed to ``index_col`` and ``header``: os.remove('path_to_file.xlsx') -Parsing Specific Columns +Parsing specific columns ++++++++++++++++++++++++ It is often the case that users will insert columns to do temporary computations @@ -3035,7 +3035,7 @@ the column names, returning names where the callable function evaluates to ``Tru pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) -Parsing Dates +Parsing dates +++++++++++++ Datetime-like values are normally automatically converted to the appropriate @@ -3048,7 +3048,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) -Cell Converters +Cell converters +++++++++++++++ It is possible to transform the contents of Excel cells via the ``converters`` @@ -3073,7 +3073,7 @@ missing data to recover integer dtype: pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) -dtype Specifications +Dtype specifications ++++++++++++++++++++ .. versionadded:: 0.20 @@ -3089,10 +3089,10 @@ no type inference, use the type ``str`` or ``object``. .. _io.excel_writer: -Writing Excel Files +Writing Excel files ''''''''''''''''''' -Writing Excel Files to Disk +Writing Excel files to disk +++++++++++++++++++++++++++ To write a ``DataFrame`` object to a sheet of an Excel file, you can use the @@ -3138,7 +3138,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. _io.excel_writing_buffer: -Writing Excel Files to Memory +Writing Excel files to memory +++++++++++++++++++++++++++++ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or @@ -3218,7 +3218,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: .. _io.excel.style: -Style and Formatting +Style and formatting '''''''''''''''''''' The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method. @@ -3447,7 +3447,7 @@ pandas objects. os.remove('foo.msg') os.remove('foo2.msg') -Read/Write API +Read/write API '''''''''''''' Msgpacks can also be read from and written to strings. @@ -3546,7 +3546,7 @@ Closing a Store and using a context manager: -Read/Write API +Read/write API '''''''''''''' ``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, @@ -3592,7 +3592,7 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. _io.hdf5-fixed: -Fixed Format +Fixed format '''''''''''' The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called @@ -3616,7 +3616,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. _io.hdf5-table: -Table Format +Table format '''''''''''' ``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` @@ -3658,7 +3658,7 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. .. _io.hdf5-keys: -Hierarchical Keys +Hierarchical keys ''''''''''''''''' Keys to a store can be specified as a string. These can be in a @@ -3725,10 +3725,10 @@ will yield a tuple for each group key along with the relative keys of its conten .. _io.hdf5-types: -Storing Types +Storing types ''''''''''''' -Storing Mixed Types in a Table +Storing mixed types in a table ++++++++++++++++++++++++++++++ Storing mixed-dtype data is supported. 
Strings are stored as a @@ -3792,7 +3792,7 @@ storing/selecting from homogeneous index ``DataFrames``. Querying '''''''' -Querying a Table +Querying a table ++++++++++++++++ ``select`` and ``delete`` operations have an optional criterion that can @@ -3998,7 +3998,7 @@ See `here `__. .. _io.stata: -Stata Format +Stata format ------------ .. _io.stata_writer: -Writing to Stata format +Writing to stata format ''''''''''''''''''''''' The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame @@ -5367,7 +5367,7 @@ values will have ``object`` data type. .. _io.stata-categorical: -Categorical Data +Categorical data ++++++++++++++++ ``Categorical`` data can be exported to *Stata* data files as value labeled data. @@ -5413,7 +5413,7 @@ whether imported ``Categorical`` variables are ordered. .. _io.sas_reader: -SAS Formats +SAS formats ----------- The top-level function :func:`read_sas` can read (but not write) SAS @@ -5475,7 +5475,7 @@ easy conversion to and from pandas. .. _io.perf: -Performance Considerations +Performance considerations -------------------------- This is an informal comparison of various IO methods, using pandas diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 25c486c839b7fb..43d44ff30c64a5 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -814,7 +814,7 @@ The ``indicator`` argument will also accept string arguments, in which case the .. _merging.dtypes: -Merge Dtypes +Merge dtypes ~~~~~~~~~~~~ .. versionadded:: 0.19.0 @@ -1361,7 +1361,7 @@ Timeseries friendly merging .. _merging.merge_ordered: -Merging Ordered Data +Merging ordered data ~~~~~~~~~~~~~~~~~~~~ A :func:`merge_ordered` function allows combining time series and other @@ -1381,7 +1381,7 @@ fill/interpolate missing data: .. _merging.merge_asof: -Merging AsOf +Merging asof ~~~~~~~~~~~~ .. versionadded:: 0.19.0 diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index cd70a109b3c77a..1439296fb82960 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -74,7 +74,7 @@ Series and DataFrame objects: df2['one'] == np.nan -Integer Dtypes and Missing Data +Integer dtypes and missing data ------------------------------- Because ``NaN`` is a float, a column of integers with even one missing values @@ -175,7 +175,7 @@ account for missing data. For example: .. _missing_data.numeric_sum: -Sum/Prod of Empties/Nans +Sum/prod of empties/nans ~~~~~~~~~~~~~~~~~~~~~~~~ .. warning:: @@ -473,7 +473,7 @@ at the new values. .. _missing_data.interp_limits: -Interpolation Limits +Interpolation limits -------------------- Like other pandas fill methods, :meth:`~DataFrame.interpolate` accepts a ``limit`` keyword @@ -523,7 +523,7 @@ the ``limit_area`` parameter restricts filling to either inside or outside value .. _missing_data.replace: -Replacing Generic Values +Replacing generic values ~~~~~~~~~~~~~~~~~~~~~~~~ Often times we want to replace arbitrary values with other values. @@ -568,7 +568,7 @@ missing and interpolate over them: .. _missing_data.replace_expression: -String/Regular Expression Replacement +String/regular expression replacement ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -664,7 +664,7 @@ want to use a regular expression. Anywhere in the above ``replace`` examples that you see a regular expression a compiled regular expression is valid as well. 
-Numeric Replacement +Numeric replacement ~~~~~~~~~~~~~~~~~~~ :meth:`~DataFrame.replace` is similar to :meth:`~DataFrame.fillna`. diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 4d0def435cb1e0..1f296c0d6c0889 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -3,7 +3,7 @@ {{ header }} ******************** -Options and Settings +Options and settings ******************** Overview @@ -68,7 +68,7 @@ with no argument ``describe_option`` will print out the descriptions for all ava pd.reset_option("all") -Getting and Setting Options +Getting and setting options --------------------------- As described above, :func:`~pandas.get_option` and :func:`~pandas.set_option` @@ -120,10 +120,10 @@ are restored automatically when you exit the `with` block: print(pd.get_option("display.max_columns")) -Setting Startup Options in python/ipython Environment +Setting startup options in Python/IPython environment ----------------------------------------------------- -Using startup scripts for the python/ipython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at: +Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at: .. code-block:: none @@ -266,7 +266,7 @@ The options are 'right', and 'left'. .. _options.available: -Available Options +Available options ----------------- ======================================= ============ ================================== @@ -444,7 +444,7 @@ plotting.matplotlib.register_converters True Register custom converters .. _basics.console_output: -Number Formatting +Number formatting ------------------ pandas also allows you to set how numbers are displayed in the console. @@ -475,7 +475,7 @@ To round floats on a case-by-case basis, you can also use :meth:`~pandas.Series. .. _options.east_asian_width: -Unicode Formatting +Unicode formatting ------------------ .. warning:: @@ -538,7 +538,7 @@ However, setting this option incorrectly for your terminal will cause these char .. _options.table_schema: -Table Schema Display +Table schema display -------------------- .. versionadded:: 0.20.0 diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8ad78a68977ad5..b7b6dd0a69c24d 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -3,7 +3,7 @@ {{ header }} ************************** -Reshaping and Pivot Tables +Reshaping and pivot tables ************************** Reshaping by pivoting DataFrame objects @@ -186,7 +186,7 @@ removed. .. _reshaping.stack_multiple: -Multiple Levels +Multiple levels ~~~~~~~~~~~~~~~ You may also stack or unstack more than one level at a time by passing a list @@ -214,7 +214,7 @@ not a mixture of the two). 
# from above is equivalent to: df.stack(level=[1, 2]) -Missing Data +Missing data ~~~~~~~~~~~~ These functions are intelligent about handling missing data and do not expect @@ -509,7 +509,7 @@ each group defined by the first two ``Series``: pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum) -Adding Margins +Adding margins ~~~~~~~~~~~~~~ Finally, one can also add margins or normalize this output. @@ -727,7 +727,7 @@ DataFrame will be pivoted in the answers below. df -Pivoting with Single Aggregations +Pivoting with single aggregations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, @@ -775,7 +775,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') -Pivoting with Multiple Aggregations +Pivoting with multiple aggregations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ We can also perform multiple aggregations. For example, to perform both a diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 6ee11bd78fee95..98fd30f67d05b5 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -116,7 +116,7 @@ in many places .. _sparse.accessor: -Sparse Accessor +Sparse accessor --------------- .. versionadded:: 0.24.0 @@ -142,7 +142,7 @@ See :ref:`api.frame.sparse` for more. .. _sparse.calculation: -Sparse Calculation +Sparse calculation ------------------ You can apply NumPy `ufuncs `_ @@ -239,7 +239,7 @@ Sparse-specific properties, like ``density``, are available on the ``.sparse`` a df.sparse.density -**General Differences** +**General differences** In a ``SparseDataFrame``, *all* columns were sparse. A :class:`DataFrame` can have a mixture of sparse and dense columns. As a consequence, assigning new columns to a ``DataFrame`` with sparse @@ -370,7 +370,7 @@ row and columns coordinates of the matrix. Note that this will consume a signifi .. _sparse.subclasses: -Sparse Subclasses +Sparse subclasses ----------------- The :class:`SparseSeries` and :class:`SparseDataFrame` classes are deprecated. 
Visit their diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 79a9848704eec9..8aa1f63ecf22a8 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -26,7 +26,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Building Styles\n", + "## Building styles\n", "\n", "Pass your style functions into one of the following methods:\n", "\n", @@ -297,7 +297,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Finer Control: Slicing" + "## Finer control: slicing" ] }, { @@ -410,7 +410,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Builtin Styles" + "## Builtin styles" ] }, { @@ -612,7 +612,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Sharing Styles" + "## Sharing styles" ] }, { @@ -754,7 +754,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Table Styles" + "### Table styles" ] }, { @@ -840,7 +840,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### CSS Classes\n", + "### CSS classes\n", "\n", "Certain CSS classes are attached to cells.\n", "\n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 87c75e8bcd91fd..4f1fcdeb62f148 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -3,7 +3,7 @@ {{ header }} ====================== -Working with Text Data +Working with text data ====================== .. _text.string_methods: @@ -81,7 +81,7 @@ and replacing any remaining whitespaces with underscores: exceptions, other uses are not supported, and may be disabled at a later point. -Splitting and Replacing Strings +Splitting and replacing strings ------------------------------- .. _text.split: @@ -356,7 +356,7 @@ of the string, the result will be a ``NaN``. s.str[0] s.str[1] -Extracting Substrings +Extracting substrings --------------------- .. _text.extract: @@ -518,7 +518,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups) -Testing for Strings that Match or Contain a Pattern +Testing for Strings that match or contain a pattern --------------------------------------------------- You can check whether elements contain a pattern: @@ -547,7 +547,7 @@ an extra ``na`` argument so missing values can be considered True or False: .. _text.indicator: -Creating Indicator Variables +Creating indicator variables ---------------------------- You can extract dummy variables from string columns. @@ -569,7 +569,7 @@ String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. See also :func:`~pandas.get_dummies`. -Method Summary +Method summary -------------- .. _text.summary: diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 40a8fd3101409f..3e46140d79b8e6 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -5,7 +5,7 @@ .. _timedeltas.timedeltas: *********** -Time Deltas +Time deltas *********** Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes, @@ -229,7 +229,7 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob .. 
_timedeltas.timedeltas_convert: -Frequency Conversion +Frequency conversion -------------------- Timedelta Series, ``TimedeltaIndex``, and ``Timedelta`` scalars can be converted to other 'frequencies' by dividing by another timedelta, @@ -360,7 +360,7 @@ inferred frequency upon creation: pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer') -Generating Ranges of Time Deltas +Generating ranges of time deltas ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex`` diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 7bdec001a688f5..fcad6db9459817 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -3,7 +3,7 @@ {{ header }} ******************************** -Time Series / Date functionality +Time series / date functionality ******************************** pandas contains extensive capabilities and features for working with time series data for all domains. @@ -183,7 +183,7 @@ future releases. .. _timeseries.converting: -Converting to Timestamps +Converting to timestamps ------------------------ To convert a :class:`Series` or list-like object of date-like objects e.g. strings, @@ -235,7 +235,7 @@ inferred frequency upon creation: pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') -Providing a Format Argument +Providing a format argument ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In addition to the required datetime string, a ``format`` argument can be passed to ensure specific parsing. @@ -252,7 +252,7 @@ option, see the Python `datetime documentation`_. .. _datetime documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior -Assembling Datetime from Multiple DataFrame Columns +Assembling datetime from multiple DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.18.1 @@ -279,7 +279,7 @@ You can pass only the columns that you need to assemble. * required: ``year``, ``month``, ``day`` * optional: ``hour``, ``minute``, ``second``, ``millisecond``, ``microsecond``, ``nanosecond`` -Invalid Data +Invalid data ~~~~~~~~~~~~ The default behavior, ``errors='raise'``, is to raise when unparseable: @@ -304,7 +304,7 @@ Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time): .. _timeseries.converting.epoch: -Epoch Timestamps +Epoch timestamps ~~~~~~~~~~~~~~~~ pandas supports converting integer or float epoch times to ``Timestamp`` and @@ -356,7 +356,7 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. _timeseries.converting.epoch_inverse: -From Timestamps to Epoch +From timestamps to epoch ~~~~~~~~~~~~~~~~~~~~~~~~ To invert the operation from above, namely, to convert from a ``Timestamp`` to a 'unix' epoch: @@ -396,7 +396,7 @@ Commonly called 'unix epoch' or POSIX time. .. _timeseries.daterange: -Generating Ranges of Timestamps +Generating ranges of timestamps ------------------------------- To generate an index with timestamps, you can use either the ``DatetimeIndex`` or @@ -471,7 +471,7 @@ resulting ``DatetimeIndex``: .. _timeseries.custom-freq-ranges: -Custom Frequency Ranges +Custom frequency ranges ~~~~~~~~~~~~~~~~~~~~~~~ .. warning:: @@ -504,7 +504,7 @@ used if a custom frequency string is passed. .. 
_timeseries.timestamp-limits: -Timestamp Limitations +Timestamp limitations --------------------- Since pandas represents timestamps in nanosecond resolution, the time span that @@ -561,7 +561,7 @@ intelligent functionality like selection, slicing, etc. .. _timeseries.partialindexing: -Partial String Indexing +Partial string indexing ~~~~~~~~~~~~~~~~~~~~~~~ Dates and strings that parse to timestamps can be passed as indexing parameters: @@ -648,7 +648,7 @@ Slicing with string indexing also honors UTC offset. .. _timeseries.slice_vs_exact_match: -Slice vs. Exact Match +Slice vs. exact match ~~~~~~~~~~~~~~~~~~~~~ .. versionchanged:: 0.20.0 @@ -719,7 +719,7 @@ Note also that ``DatetimeIndex`` resolution cannot be less precise than day. series_monthly['2011-12'] # returns Series -Exact Indexing +Exact indexing ~~~~~~~~~~~~~~ As discussed in previous section, indexing a ``DatetimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. @@ -738,7 +738,7 @@ With no defaults. datetime.datetime(2013, 2, 28, 10, 12, 0)] -Truncating & Fancy Indexing +Truncating & fancy indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~ A :meth:`~DataFrame.truncate` convenience function is provided that is similar @@ -763,7 +763,7 @@ regularity will result in a ``DatetimeIndex``, although frequency is lost: .. _timeseries.components: -Time/Date Components +Time/date components -------------------- There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DatetimeIndex``. @@ -805,7 +805,7 @@ on :ref:`.dt accessors`. .. _timeseries.offsets: -DateOffset Objects +DateOffset objects ------------------ In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify @@ -922,7 +922,7 @@ in the operation). .. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html -Parametric Offsets +Parametric offsets ~~~~~~~~~~~~~~~~~~ Some of the offsets can be "parameterized" when created to result in different @@ -958,7 +958,7 @@ Another example is parameterizing ``YearEnd`` with the specific ending month: .. _timeseries.offsetseries: -Using Offsets with ``Series`` / ``DatetimeIndex`` +Using offsets with ``Series`` / ``DatetimeIndex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Offsets can be used with either a ``Series`` or ``DatetimeIndex`` to @@ -997,7 +997,7 @@ calculate significantly slower and will show a ``PerformanceWarning`` .. _timeseries.custombusinessdays: -Custom Business Days +Custom business days ~~~~~~~~~~~~~~~~~~~~ The ``CDay`` or ``CustomBusinessDay`` class provides a parametric @@ -1071,7 +1071,7 @@ in the usual way. .. _timeseries.businesshour: -Business Hour +Business hour ~~~~~~~~~~~~~ The ``BusinessHour`` class provides a business hour representation on ``BusinessDay``, @@ -1172,7 +1172,7 @@ following subsection. .. _timeseries.custombusinesshour: -Custom Business Hour +Custom business hour ~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.18.1 @@ -1205,7 +1205,7 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB .. _timeseries.offset_aliases: -Offset Aliases +Offset aliases ~~~~~~~~~~~~~~ A number of string aliases are given to useful common time series @@ -1243,7 +1243,7 @@ frequencies. 
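As a brief illustration of the offsets and aliases discussed above (a sketch only; the timestamps are arbitrary):

.. code-block:: python

    import pandas as pd
    from pandas.tseries.offsets import BDay, BusinessHour

    ts = pd.Timestamp('2014-08-01 09:00')   # a Friday

    ts + BDay()            # next business day: Monday 2014-08-04 09:00
    ts + BusinessHour(2)   # two business hours later: 11:00 the same day

    # Frequency aliases can be used directly in date_range.
    pd.date_range('2014-08-01', periods=5, freq='B')      # business days
    pd.date_range('2014-08-01', periods=4, freq='W-SUN')  # weekly, anchored on Sunday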
We will refer to these aliases as *offset aliases*. "U, us", "microseconds" "N", "nanoseconds" -Combining Aliases +Combining aliases ~~~~~~~~~~~~~~~~~ As we have seen previously, the alias and the offset instance are fungible in @@ -1263,7 +1263,7 @@ You can combine together day and intraday offsets: pd.date_range(start, periods=10, freq='1D10U') -Anchored Offsets +Anchored offsets ~~~~~~~~~~~~~~~~ For some frequencies you can specify an anchoring suffix: @@ -1308,7 +1308,7 @@ These can be used as arguments to ``date_range``, ``bdate_range``, constructors for ``DatetimeIndex``, as well as various other timeseries-related functions in pandas. -Anchored Offset Semantics +Anchored offset semantics ~~~~~~~~~~~~~~~~~~~~~~~~~ For those offsets that are anchored to the start or end of specific @@ -1356,7 +1356,7 @@ it is rolled forward to the next anchor point. .. _timeseries.holiday: -Holidays / Holiday Calendars +Holidays / holiday calendars ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Holidays and calendars provide a simple way to define holiday rules to be used @@ -1456,7 +1456,7 @@ or calendars with additional rules. Time Series-Related Instance Methods ------------------------------------ -Shifting / Lagging +Shifting / lagging ~~~~~~~~~~~~~~~~~~ One may want to *shift* or *lag* the values in a time series back and forward in @@ -1489,7 +1489,7 @@ changes all the dates in the index by a specified number of offsets: Note that with ``tshift``, the leading entry is no longer NaN because the data is not being realigned. -Frequency Conversion +Frequency conversion ~~~~~~~~~~~~~~~~~~~~ The primary function for changing frequencies is the :meth:`~Series.asfreq` @@ -1511,13 +1511,13 @@ method for any gaps that may appear after the frequency conversion. ts.asfreq(pd.offsets.BDay(), method='pad') -Filling Forward / Backward +Filling forward / backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is documented in the :ref:`missing data section `. -Converting to Python Datetimes +Converting to Python datetimes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``DatetimeIndex`` can be converted to an array of Python native @@ -1654,7 +1654,7 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to ts[:2].resample('250L').ffill(limit=2) -Sparse Resampling +Sparse resampling ~~~~~~~~~~~~~~~~~ Sparse timeseries are the ones where you have a lot fewer points relative @@ -1807,7 +1807,7 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. .. _timeseries.periods: -Time Span Representation +Time span representation ------------------------ Regular intervals of time are represented by ``Period`` objects in pandas while @@ -1939,7 +1939,7 @@ objects: .. _timeseries.period_dtype: -Period Dtypes +Period dtypes ~~~~~~~~~~~~~ .. versionadded:: 0.19.0 @@ -1974,7 +1974,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th dti.astype('period[M]') -PeriodIndex Partial String Indexing +PeriodIndex partial string indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. @@ -2007,7 +2007,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. 
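For example, a short sketch of partial string slicing on a ``PeriodIndex`` and of ``Period.asfreq`` (illustrative values only):

.. code-block:: python

    import pandas as pd

    ps = pd.Series(range(6), index=pd.period_range('2013-01', periods=6, freq='M'))
    ps['2013-02':'2013-04']      # partial string slicing; both endpoints are included

    p = pd.Period('2013Q1', freq='Q-DEC')
    p.asfreq('M', how='start')   # Period('2013-01', 'M')
    p.asfreq('M', how='end')     # Period('2013-03', 'M')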
The exa dfp['2013-01-01 10H':'2013-01-01 11H'] -Frequency Conversion and Resampling with PeriodIndex +Frequency conversion and resampling with PeriodIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq`` method. Let's start with the fiscal year 2011, ending in December: @@ -2078,7 +2078,7 @@ frequencies ``Q-JAN`` through ``Q-DEC``. .. _timeseries.interchange: -Converting Between Representations +Converting between representations ---------------------------------- Timestamped data can be converted to PeriodIndex-ed data using ``to_period`` @@ -2122,7 +2122,7 @@ the quarter end: .. _timeseries.oob: -Representing Out-of-Bounds Spans +Representing out-of-bounds spans -------------------------------- If you have data that is outside of the ``Timestamp`` bounds, see :ref:`Timestamp limitations `, @@ -2156,7 +2156,7 @@ These can easily be converted to a ``PeriodIndex``: .. _timeseries.timezone: -Time Zone Handling +Time zone handling ------------------ pandas provides rich support for working with timestamps in different time @@ -2164,7 +2164,7 @@ zones using the ``pytz`` and ``dateutil`` libraries or class:`datetime.timezone` objects from the standard library. -Working with Time Zones +Working with time zones ~~~~~~~~~~~~~~~~~~~~~~~ By default, pandas objects are time zone unaware: @@ -2320,7 +2320,7 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None) .. _timeseries.timezone_ambiguous: -Ambiguous Times when Localizing +Ambiguous times when localizing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``tz_localize`` may not be able to determine the UTC offset of a timestamp @@ -2354,7 +2354,7 @@ Handle these ambiguous times by specifying the following. .. _timeseries.timezone_nonexistent: -Nonexistent Times when Localizing +Nonexistent times when localizing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A DST transition may also shift the local time ahead by 1 hour creating nonexistent @@ -2392,7 +2392,7 @@ Transform nonexistent times to ``NaT`` or shift the times. .. _timeseries.timezone_series: -Time Zone Series Operations +Time zone series operations ~~~~~~~~~~~~~~~~~~~~~~~~~~~ A :class:`Series` with time zone **naive** values is diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 2448d0e5d99303..6589900c8491c8 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -23,7 +23,7 @@ libraries that go beyond the basics documented here. .. _visualization.basic: -Basic Plotting: ``plot`` +Basic plotting: ``plot`` ------------------------ We will demonstrate the basics, see the :ref:`cookbook` for @@ -97,7 +97,7 @@ You can plot one column versus another using the `x` and `y` keywords in .. _visualization.other: -Other Plots +Other plots ----------- Plotting methods allow for a handful of plot styles other than the @@ -311,7 +311,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. _visualization.box: -Box Plots +Box plots ~~~~~~~~~ Boxplot can be drawn calling :meth:`Series.plot.box` and :meth:`DataFrame.plot.box`, @@ -495,7 +495,7 @@ then by the numeric columns. .. _visualization.area_plot: -Area Plot +Area plot ~~~~~~~~~ You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`. @@ -531,7 +531,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 .. 
_visualization.scatter: -Scatter Plot +Scatter plot ~~~~~~~~~~~~ Scatter plot can be drawn by using the :meth:`DataFrame.plot.scatter` method. @@ -599,7 +599,7 @@ See the :meth:`scatter ` method and the .. _visualization.hexbin: -Hexagonal Bin Plot +Hexagonal bin plot ~~~~~~~~~~~~~~~~~~ You can create hexagonal bin plots with :meth:`DataFrame.plot.hexbin`. @@ -762,7 +762,7 @@ See the `matplotlib pie documentation `). These can be used to control additional styling, beyond what pandas provides. -Controlling the Legend +Controlling the legend ~~~~~~~~~~~~~~~~~~~~~~ You may set the ``legend`` argument to ``False`` to hide the legend, which is @@ -1140,7 +1140,7 @@ You may pass ``logy`` to get a log-scale Y axis. See also the ``logx`` and ``loglog`` keyword arguments. -Plotting on a Secondary Y-axis +Plotting on a secondary y-axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: @@ -1194,7 +1194,7 @@ with "(right)" in the legend. To turn off the automatic marking, use the plt.close('all') -Suppressing Tick Resolution Adjustment +Suppressing tick resolution adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pandas includes automatic tick resolution adjustment for regular frequency @@ -1248,7 +1248,7 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: plt.close('all') -Automatic Date Tick Adjustment +Automatic date tick adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.20.0 @@ -1276,7 +1276,7 @@ with the ``subplots`` keyword: plt.close('all') -Using Layout and Targeting Multiple Axes +Using layout and targeting multiple axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The layout of subplots can be specified by the ``layout`` keyword. It can accept @@ -1377,7 +1377,7 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a .. _visualization.errorbars: -Plotting With Error Bars +Plotting with error bars ~~~~~~~~~~~~~~~~~~~~~~~~ Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`. @@ -1423,7 +1423,7 @@ Here is an example of one way to easily plot group means with standard deviation .. _visualization.table: -Plotting Tables +Plotting tables ~~~~~~~~~~~~~~~ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :meth:`Series.plot` with a ``table`` keyword. The ``table`` keyword can accept ``bool``, :class:`DataFrame` or :class:`Series`. The simple way to draw a table is to specify ``table=True``. Data will be transposed to meet matplotlib's default layout. diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 9d497f2fc658db..59ea6b97762327 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -255,7 +255,7 @@ Convenience methods ``ffill`` and ``bfill`` have been added: New features ~~~~~~~~~~~~ -Wide DataFrame Printing +Wide DataFrame printing ~~~~~~~~~~~~~~~~~~~~~~~ Instead of printing the summary information, pandas now splits the string @@ -290,7 +290,7 @@ The width of each line can be changed via 'line_width' (80 by default): wide_frame -Updated PyTables Support +Updated PyTables support ~~~~~~~~~~~~~~~~~~~~~~~~ :ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. @@ -490,7 +490,7 @@ Updated PyTables Support however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire file and write it out using the new format to take advantage of the updates. 
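As a minimal sketch of the plotting calls covered in this section (matplotlib is assumed to be installed; the data is random):

.. code-block:: python

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.DataFrame(np.random.randn(50, 2), columns=['a', 'b']).cumsum()

    df.plot(secondary_y=['b'])                  # draw column 'b' on a secondary y-axis
    df.plot.scatter(x='a', y='b')               # scatter plot of one column versus another
    df.plot.hexbin(x='a', y='b', gridsize=15)   # hexagonal binning of the same data

    plt.close('all')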
-N Dimensional Panels (Experimental) +N dimensional Panels (experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Adding experimental support for Panel4D and factory functions to create n-dimensional named panels. diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 0dfcfca9a7464f..31fab6c9aeb74d 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -20,7 +20,7 @@ of useful recipes in pandas (and that we want contributions!). There are several libraries that are now :ref:`Recommended Dependencies ` -Selection Choices +Selection choices ~~~~~~~~~~~~~~~~~ Starting in 0.11.0, object selection has had a number of user-requested additions in @@ -56,7 +56,7 @@ three types of multi-axis indexing. See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `. -Selection Deprecations +Selection deprecations ~~~~~~~~~~~~~~~~~~~~~~ Starting in version 0.11.0, these methods *may* be deprecated in future versions. @@ -88,7 +88,7 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe df3 df3.dtypes -Dtype Conversion +Dtype conversion ~~~~~~~~~~~~~~~~ This is lower-common-denominator upcasting, meaning you get the dtype which can accommodate all of the types @@ -103,7 +103,7 @@ Conversion df3.astype('float32').dtypes -Mixed Conversion +Mixed conversion .. code-block:: ipython @@ -134,7 +134,7 @@ Mixed Conversion E int32 dtype: object -Forcing Date coercion (and setting ``NaT`` when not datelike) +Forcing date coercion (and setting ``NaT`` when not datelike) .. code-block:: ipython @@ -154,10 +154,10 @@ Forcing Date coercion (and setting ``NaT`` when not datelike) 5 2001-01-05 dtype: datetime64[ns] -Dtype Gotchas +Dtype gotchas ~~~~~~~~~~~~~ -**Platform Gotchas** +**Platform gotchas** Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``, *regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify @@ -185,7 +185,7 @@ The following will all result in ``int64`` dtypes Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! -**Upcasting Gotchas** +**Upcasting gotchas** Performing indexing operations on integer type data can easily upcast the data. The dtype of the input data will be preserved in cases where ``nans`` are not introduced. @@ -280,7 +280,7 @@ While float dtypes are unchanged. E int32 dtype: object -Datetimes Conversion +Datetimes conversion ~~~~~~~~~~~~~~~~~~~~ Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index ff549f10a97c3c..0a74d674867156 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -177,7 +177,7 @@ API changes ``__repr__``). Plus string safety throughout. Now employed in many places throughout the pandas library. 
(:issue:`4090`, :issue:`4092`) -I/O Enhancements +I/O enhancements ~~~~~~~~~~~~~~~~ - ``pd.read_html()`` can now parse HTML strings, files or urls and return @@ -282,7 +282,7 @@ I/O Enhancements - ``read_csv`` will now throw a more informative error message when a file contains no columns, e.g., all newline characters -Other Enhancements +Other enhancements ~~~~~~~~~~~~~~~~~~ - ``DataFrame.replace()`` now allows regular expressions on contained @@ -371,7 +371,7 @@ Other Enhancements is detected (:issue:`4214`) -Experimental Features +Experimental features ~~~~~~~~~~~~~~~~~~~~~ - Added experimental ``CustomBusinessDay`` class to support ``DateOffsets`` @@ -398,7 +398,7 @@ Experimental Features dts = pd.date_range(dt, periods=5, freq=bday_egypt) print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) -Bug Fixes +Bug fixes ~~~~~~~~~ - Plotting functions now raise a ``TypeError`` before trying to plot anything diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 095d1807ca8736..0614de82cbcd05 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -203,7 +203,7 @@ API changes - ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. These return the *index* of the min or max element respectively. Prior to 0.13.0 these would return the position of the min / max element. (:issue:`6214`) -Prior Version Deprecations/Changes +Prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ These were announced changes in 0.12 or prior that are taking effect as of 0.13.0 @@ -234,7 +234,7 @@ Deprecated in 0.13.0 behavior is the default, but the new behavior is available through the keyword argument ``as_indexer=True``. -Indexing API Changes +Indexing API changes ~~~~~~~~~~~~~~~~~~~~ Prior to 0.13, it was impossible to use a label indexer (``.loc/.ix``) to set a value that @@ -305,7 +305,7 @@ A Panel setting operation on an arbitrary axis aligns the input to the Panel 2001-01-14 30.0 32.0 2001-01-15 30.0 32.0 -Float64Index API Change +Float64Index API change ~~~~~~~~~~~~~~~~~~~~~~~ - Added a new index type, ``Float64Index``. This will be automatically created when passing floating values in index creation. @@ -369,7 +369,7 @@ Float64Index API Change In [3]: pd.Series(range(5))[3.0] Out[3]: 3 -HDFStore API Changes +HDFStore API changes ~~~~~~~~~~~~~~~~~~~~ - Query Format Changes. A much more string-like query format is now supported. See :ref:`the docs`. @@ -468,7 +468,7 @@ HDFStore API Changes via the option ``io.hdf.dropna_table`` (:issue:`4625`) - pass through store creation arguments; can be used to support in-memory stores -DataFrame repr Changes +DataFrame repr changes ~~~~~~~~~~~~~~~~~~~~~~ The HTML and plain text representations of :class:`DataFrame` now show @@ -915,7 +915,7 @@ Experimental .. _whatsnew_0130.refactoring: -Internal Refactoring +Internal refactoring ~~~~~~~~~~~~~~~~~~~~ In 0.13.0 there is a major refactor primarily to subclass ``Series`` from @@ -1030,7 +1030,7 @@ to unify methods and behaviors. Series formerly subclassed directly from .. 
_release.bug_fixes-0.13.0: -Bug Fixes +Bug fixes ~~~~~~~~~ - ``HDFStore`` diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 161b0ef395f05b..6242c40d44bf81 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -43,7 +43,7 @@ Highlights include: df.loc[0, 'A'] = np.nan df -Output Formatting Enhancements +Output formatting enhancements ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - df.info() view now display dtype info per column (:issue:`5682`) @@ -179,7 +179,7 @@ API changes [0 rows x 2 columns] -Prior Version Deprecations/Changes +Prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are no announced changes in 0.13 or prior that are taking effect as of 0.13.1 @@ -394,7 +394,7 @@ There are no experimental changes in 0.13.1 .. _release.bug_fixes-0.13.1: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index f049006808c0fa..25a75492d78fb1 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -245,7 +245,7 @@ API changes .. _whatsnew_0140.display: -Display Changes +Display changes ~~~~~~~~~~~~~~~ - The default way of printing large DataFrames has changed. DataFrames @@ -301,7 +301,7 @@ Display Changes .. _whatsnew_0140.parsing: -Text Parsing API Changes +Text parsing API changes ~~~~~~~~~~~~~~~~~~~~~~~~ :func:`read_csv`/:func:`read_table` will now be noisier w.r.t invalid options rather than falling back to the ``PythonParser``. @@ -321,10 +321,10 @@ Text Parsing API Changes .. _whatsnew_0140.groupby: -Groupby API Changes +Groupby API changes ~~~~~~~~~~~~~~~~~~~ -More consistent behaviour for some groupby methods: +More consistent behavior for some groupby methods: - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: @@ -473,7 +473,7 @@ Some other enhancements to the sql functions include: .. _whatsnew_0140.slicers: -MultiIndexing Using Slicers +MultiIndexing using slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In 0.14.0 we added a new way to slice MultiIndexed objects. @@ -625,7 +625,7 @@ Plotting .. _whatsnew_0140.prior_deprecations: -Prior Version Deprecations/Changes +Prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are prior version deprecations that are taking effect as of 0.14.0. @@ -731,7 +731,7 @@ Deprecations .. _whatsnew_0140.knownissues: -Known Issues +Known issues ~~~~~~~~~~~~ - OpenPyXL 2.0.0 breaks backwards compatibility (:issue:`7169`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index fcfb22d0745548..26018c5745a118 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -169,7 +169,7 @@ Experimental .. _whatsnew_0141.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``DataFrame.where`` with a symmetric shaped frame and a passed other of a DataFrame (:issue:`7506`) - Bug in Panel indexing with a MultiIndex axis (:issue:`7516`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index f9e47b45f498d8..bea2ce815d243a 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -220,7 +220,7 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert .. _whatsnew_0150.memory: -Memory Usage +Memory usage ^^^^^^^^^^^^ Implemented methods to find memory usage of a DataFrame. See the :ref:`FAQ ` for more. (:issue:`6852`). 
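For instance, a short sketch of the memory usage methods referred to here (the frame contents are arbitrary; ``deep=True`` is needed to count the payload of object-dtype values):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'ints': np.arange(1000), 'strs': ['x'] * 1000})

    df.memory_usage()               # per-column usage in bytes, index included
    df.memory_usage(deep=True)      # also count the actual size of object values
    df.info(memory_usage='deep')    # the same information in the info() summary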
@@ -339,7 +339,7 @@ Timezone handling improvements .. _whatsnew_0150.roll: -Rolling/Expanding Moments improvements +Rolling/expanding moments improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :func:`rolling_min`, :func:`rolling_max`, :func:`rolling_cov`, and :func:`rolling_corr` @@ -879,7 +879,7 @@ Other notable API changes: .. _whatsnew_0150.refactoring: -Internal Refactoring +Internal refactoring ^^^^^^^^^^^^^^^^^^^^ In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` @@ -1109,7 +1109,7 @@ Performance .. _whatsnew_0150.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in pivot_table, when using margins and a dict aggfunc (:issue:`8349`) diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index 1091944cb056fa..2e036267b58045 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -275,7 +275,7 @@ Enhancements .. _whatsnew_0151.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in unpickling of a ``CustomBusinessDay`` object (:issue:`8591`) diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index 9f0449d6a17547..a41ad5bdf8cd66 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -191,7 +191,7 @@ Performance .. _whatsnew_0152.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in concat of Series with ``category`` dtype which were coercing to ``object``. (:issue:`8641`) diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 2cb09325c94667..b903c4dae4c5a6 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -39,7 +39,7 @@ New features .. _whatsnew_0160.enhancements.assign: -DataFrame Assign +DataFrame assign ^^^^^^^^^^^^^^^^ Inspired by `dplyr's @@ -135,7 +135,7 @@ from a ``scipy.sparse.coo_matrix``: .. _whatsnew_0160.enhancements.string: -String Methods Enhancements +String methods enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Following new methods are accessible via ``.str`` accessor to apply the function to each values. This is intended to make it more consistent with standard methods on strings. (:issue:`9282`, :issue:`9352`, :issue:`9386`, :issue:`9387`, :issue:`9439`) @@ -228,7 +228,7 @@ sub-class of ``datetime.timedelta``. Mentioned :ref:`here `, the following @@ -279,7 +279,7 @@ enhancements make string operations easier and more consistent with standard pyt .. _whatsnew_0161.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`) @@ -351,12 +351,12 @@ Deprecations .. _whatsnew_0161.index_repr: -Index Representation +Index representation ~~~~~~~~~~~~~~~~~~~~ The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`) -Previous Behavior +Previous behavior .. 
code-block:: ipython @@ -378,7 +378,7 @@ Previous Behavior [2013-01-01 00:00:00-05:00, ..., 2013-04-14 00:00:00-04:00] Length: 104, Freq: D, Timezone: US/Eastern -New Behavior +New behavior .. ipython:: python @@ -399,7 +399,7 @@ New Behavior .. _whatsnew_0161.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`) @@ -409,7 +409,7 @@ Performance Improvements .. _whatsnew_0161.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug where labels did not appear properly in the legend of ``DataFrame.plot()``, passing ``label=`` arguments works, and Series indices are no longer mutated. (:issue:`9542`) diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index ca0ad8d3ae7f94..543f9c6bbf300a 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -86,7 +86,7 @@ See the :ref:`documentation ` for more. (:issue:`10129`) .. _whatsnew_0162.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - Added `rsplit` to Index/Series StringMethods (:issue:`10303`) @@ -105,7 +105,7 @@ Other Enhancements .. _whatsnew_0162.api: -API Changes +API changes ~~~~~~~~~~~ - ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in the constructor instead of returning an incorrect result (:issue:`10217`). @@ -113,7 +113,7 @@ API Changes .. _whatsnew_0162.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved ``Series.resample`` performance with ``dtype=datetime64[ns]`` (:issue:`7754`) @@ -121,7 +121,7 @@ Performance Improvements .. _whatsnew_0162.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``Series.hist`` raises an error when a one row ``Series`` was given (:issue:`10214`) diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 8a3f87e8488ca3..67abad659dc8df 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -103,7 +103,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but functionally these are the same. - Previous Behavior: + Previous behavior: .. code-block:: ipython @@ -115,7 +115,7 @@ This uses a new-dtype representation as well, that is very similar in look-and-f In [2]: pd.date_range('20130101', periods=3, tz='US/Eastern').dtype Out[2]: dtype('` for more details. .. _whatsnew_0170.matheval: -Support for Math Functions in .eval() +Support for math functions in .eval() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`~pandas.eval` now supports calling math functions (:issue:`4893`) @@ -329,7 +329,7 @@ has been changed to make this keyword unnecessary - the change is shown below. .. _whatsnew_0170.gbq: -Google BigQuery Enhancements +Google BigQuery enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added ability to automatically create a table/dataset using the :func:`pandas.io.gbq.to_gbq` function if the destination table/dataset does not exist. (:issue:`8325`, :issue:`11121`). - Added ability to replace an existing table and schema when calling the :func:`pandas.io.gbq.to_gbq` function via the ``if_exists`` argument. See the `docs `__ for more details (:issue:`8325`). @@ -339,7 +339,7 @@ Google BigQuery Enhancements .. 
_whatsnew_0170.east_asian_width: -Display Alignment with Unicode East Asian Width +Display alignment with Unicode East Asian width ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. warning:: @@ -411,7 +411,7 @@ Other enhancements bar = pd.Series([1, 2]) baz = pd.Series([4, 5]) - Previous Behavior: + Previous behavior: .. code-block:: ipython @@ -421,7 +421,7 @@ Other enhancements 0 1 1 4 1 2 2 5 - New Behavior: + New behavior: .. ipython:: python @@ -609,14 +609,14 @@ In prior versions it was ``errors='ignore'``. Furthermore, the ``coerce`` argume has been deprecated in favor of ``errors='coerce'``. This means that invalid parsing will raise rather that return the original input as in previous versions. (:issue:`10636`) -Previous Behavior: +Previous behavior: .. code-block:: ipython In [2]: pd.to_datetime(['2009-07-31', 'asd']) Out[2]: array(['2009-07-31', 'asd'], dtype=object) -New Behavior: +New behavior: .. code-block:: ipython @@ -638,7 +638,7 @@ To keep the previous behavior, you can use ``errors='ignore'``: Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword has been deprecated in favor of ``errors='coerce'``. -Consistent Parsing +Consistent parsing """""""""""""""""" The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has @@ -648,7 +648,7 @@ Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime uses the beginning of the year. ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex`` can parse, such as a quarterly string. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -663,7 +663,7 @@ Previous Behavior: v0.17.0 can parse them as below. It works on ``DatetimeIndex`` also. -New Behavior: +New behavior: .. ipython:: python @@ -681,7 +681,7 @@ New Behavior: pd.Timestamp.now() pd.Timestamp.now() + offsets.DateOffset(years=1) -Changes to Index Comparisons +Changes to Index comparisons ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Operator equal on ``Index`` should behavior similarly to ``Series`` (:issue:`9947`, :issue:`10637`) @@ -689,7 +689,7 @@ Operator equal on ``Index`` should behavior similarly to ``Series`` (:issue:`994 Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise a ``ValueError``. This is to be consistent with the behavior of ``Series``. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -702,7 +702,7 @@ Previous Behavior: In [4]: pd.Index([1, 2, 3]) == pd.Index([1, 2]) Out[4]: False -New Behavior: +New behavior: .. code-block:: ipython @@ -729,7 +729,7 @@ or it can return False if broadcasting can not be done: np.array([1, 2, 3]) == np.array([1, 2]) -Changes to Boolean Comparisons vs. None +Changes to boolean comparisons vs. None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to comparing with ``np.nan``, rather than raise ``TypeError``. (:issue:`1079`). @@ -740,14 +740,14 @@ Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to compar s.iloc[1] = None s -Previous Behavior: +Previous behavior: .. code-block:: ipython In [5]: s == None TypeError: Could not compare type with Series -New Behavior: +New behavior: .. ipython:: python @@ -776,7 +776,7 @@ HDFStore dropna behavior The default behavior for HDFStore write functions with ``format='table'`` is now to keep rows that are all missing. 
Previously, the behavior was to drop rows that were all missing save the index. The previous behavior can be replicated using the ``dropna=True`` option. (:issue:`9382`) -Previous Behavior: +Previous behavior: .. ipython:: python @@ -802,7 +802,7 @@ Previous Behavior: 2 2 NaN -New Behavior: +New behavior: .. ipython:: python @@ -882,7 +882,7 @@ Changes to ``Categorical.unique`` cat cat.unique() -Changes to ``bool`` passed as ``header`` in Parsers +Changes to ``bool`` passed as ``header`` in parsers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In earlier versions of pandas, if a bool was passed the ``header`` argument of @@ -901,7 +901,7 @@ A ``bool`` input to ``header`` will now raise a ``TypeError`` .. _whatsnew_0170.api_breaking.other: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`) @@ -1016,7 +1016,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0170.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Development support for benchmarking with the `Air Speed Velocity library `_ (:issue:`8361`) @@ -1039,7 +1039,7 @@ Performance Improvements .. _whatsnew_0170.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`) diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index 9de49699b96523..55080240f2a55e 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -31,7 +31,7 @@ New features .. _whatsnew_0171.style: -Conditional HTML Formatting +Conditional HTML formatting ^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. warning:: @@ -135,7 +135,7 @@ Deprecations .. _whatsnew_0171.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Checking monotonic-ness before sorting on an index (:issue:`11080`) @@ -152,7 +152,7 @@ Performance Improvements .. _whatsnew_0171.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - ``SparseArray.__iter__()`` now does not cause ``PendingDeprecationWarning`` in Python 3.5 (:issue:`11622`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 9ff6ad7188f5aa..a7174c6325f86c 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -62,7 +62,7 @@ Window functions have been refactored to be methods on ``Series/DataFrame`` obje df = pd.DataFrame({'A': range(10), 'B': np.random.randn(10)}) df -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -82,7 +82,7 @@ Previous Behavior: 8 7 0.079587 9 8 -0.954504 -New Behavior: +New behavior: .. ipython:: python @@ -145,14 +145,14 @@ This continues to work as before for function or dict-like values. .. _whatsnew_0180.enhancements.rangeindex: -Range Index +Range index ^^^^^^^^^^^ A ``RangeIndex`` has been added to the ``Int64Index`` sub-classes to support a memory saving alternative for common use cases. This has a similar implementation to the python ``range`` object (``xrange`` in python 2), in that it only stores the start, stop, and step values for the index. It will transparently interact with the user API, converting to ``Int64Index`` if needed. This will now be the default constructed index for ``NDFrame`` objects, rather than previous an ``Int64Index``. (:issue:`939`, :issue:`12070`, :issue:`12071`, :issue:`12109`, :issue:`12888`) -Previous Behavior: +Previous behavior: .. 
code-block:: ipython @@ -168,7 +168,7 @@ Previous Behavior: Out[6]: 8000 -New Behavior: +New behavior: .. ipython:: python @@ -341,13 +341,13 @@ In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available throug s s.dt.round('D') -Formatting of Integers in FloatIndex +Formatting of integers in FloatIndex ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Integers in ``FloatIndex``, e.g. 1., are now formatted with a decimal point and a ``0`` digit, e.g. ``1.0`` (:issue:`11713`) This change not only affects the display to the console, but also the output of IO methods like ``.to_csv`` or ``.to_html``. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -369,7 +369,7 @@ Previous Behavior: 2,3 -New Behavior: +New behavior: .. ipython:: python @@ -383,7 +383,7 @@ Changes to dtype assignment behaviors When a DataFrame's slice is updated with a new slice of the same dtype, the dtype of the DataFrame will now remain the same. (:issue:`10503`) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -406,7 +406,7 @@ Previous Behavior: b int64 dtype: object -New Behavior: +New behavior: .. ipython:: python @@ -419,7 +419,7 @@ New Behavior: When a DataFrame's integer slice is partially updated with a new slice of floats that could potentially be down-casted to integer without losing precision, the dtype of the slice will be set to float instead of integer. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -443,7 +443,7 @@ Previous Behavior: 10 4 5 1 8 12 7 8 9 -New Behavior: +New behavior: .. ipython:: python @@ -484,7 +484,7 @@ See the `xarray full-documentation here `__ * major_axis (major_axis) int64 0 1 2 * minor_axis (minor_axis) int64 0 1 2 3 -Latex Representation +Latex representation ^^^^^^^^^^^^^^^^^^^^ ``DataFrame`` has gained a ``._repr_latex_()`` method in order to allow for conversion to latex in a ipython/jupyter notebook using nbconvert. (:issue:`11778`) @@ -981,7 +981,7 @@ assignments are valid for multi-line expressions. .. _whatsnew_0180.api: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) @@ -1074,7 +1074,7 @@ In 0.18.0, this deprecation warning is removed and these will now raise a ``Type s2 = pd.Series([1, 2, 3], index=list('abc')) s2 -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -1104,7 +1104,7 @@ Previous Behavior: c 3 dtype: int64 -New Behavior: +New behavior: For iloc, getting & setting via a float scalar will always raise. @@ -1180,7 +1180,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0180.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of ``andrews_curves`` (:issue:`11534`) diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 069395c2e0f360..7e06e5050c5f0f 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -31,7 +31,7 @@ New features .. _whatsnew_0181.enhancements.custombusinesshour: -Custom Business Hour +Custom business hour ^^^^^^^^^^^^^^^^^^^^ The ``CustomBusinessHour`` is a mixture of ``BusinessHour`` and ``CustomBusinessDay`` which @@ -199,7 +199,7 @@ On other levels .. 
_whatsnew_0181.enhancements.assembling: -Assembling Datetimes +Assembling datetimes ^^^^^^^^^^^^^^^^^^^^ ``pd.to_datetime()`` has gained the ability to assemble datetimes from a passed in ``DataFrame`` or a dict. (:issue:`8158`). @@ -226,7 +226,7 @@ You can pass only the columns that you need to assemble. .. _whatsnew_0181.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - ``pd.read_csv()`` now supports ``delim_whitespace=True`` for the Python engine (:issue:`12958`) @@ -317,7 +317,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_ 'B': [1, 2, 3]}) df -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -333,7 +333,7 @@ Previous Behavior: 1 2 Name: B, dtype: int64 -New Behavior: +New behavior: .. ipython:: python @@ -348,7 +348,7 @@ Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort= df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b']) df['c'] = np.random.randint(0, 4, 100) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -370,7 +370,7 @@ Previous Behavior: 2 -0.720589 0.887163 3 0.859588 -0.636524 -New Behavior: +New behavior: .. ipython:: python @@ -446,7 +446,7 @@ Previous behavior: 2000-11-30 value 13 dtype: int64 -New Behavior: +New behavior: .. code-block:: ipython @@ -580,7 +580,7 @@ Deprecations .. _whatsnew_0181.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved speed of SAS reader (:issue:`12656`, :issue:`12961`) @@ -601,7 +601,7 @@ Performance Improvements .. _whatsnew_0181.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file are not even (:issue:`12203`) - Bug in ``groupby.transform(..)`` when ``axis=1`` is specified with a non-monotonic ordered index (:issue:`12713`) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index fe9fdd7448923d..52ea9e8839e45e 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -264,7 +264,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification .. _whatsnew_0190.enhancements.union_categoricals: -Categorical Concatenation +Categorical concatenation ^^^^^^^^^^^^^^^^^^^^^^^^^ - A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`13763`, :issue:`13846`, :issue:`14173`) @@ -298,7 +298,7 @@ Categorical Concatenation .. _whatsnew_0190.enhancements.semi_month_offsets: -Semi-Month Offsets +Semi-month offsets ^^^^^^^^^^^^^^^^^^ Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). @@ -596,7 +596,7 @@ Comparison operators Comparison operators raise ``ValueError`` when ``.index`` are different. -**Previous Behavior** (``Series``): +**Previous behavior** (``Series``): ``Series`` compared values ignoring the ``.index`` as long as both had the same length: @@ -631,7 +631,7 @@ Comparison operators raise ``ValueError`` when ``.index`` are different. s1.eq(s2) -**Current Behavior** (``DataFrame``, no change): +**Current behavior** (``DataFrame``, no change): .. code-block:: ipython @@ -675,7 +675,7 @@ Logical operators align both ``.index`` of left and right hand side. s1 & s2.reindex_like(s1) -**Current Behavior** (``DataFrame``, no change): +**Current behavior** (``DataFrame``, no change): .. ipython:: python @@ -1324,7 +1324,7 @@ operations on that platform. .. 
_whatsnew_0190.api.other: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout (:issue:`14101`). @@ -1406,7 +1406,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0190.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) @@ -1426,7 +1426,7 @@ Performance Improvements .. _whatsnew_0190.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst index 12f3e985565e0d..a89d1461073bde 100644 --- a/doc/source/whatsnew/v0.19.1.rst +++ b/doc/source/whatsnew/v0.19.1.rst @@ -22,7 +22,7 @@ We recommend that all users upgrade to this version. .. _whatsnew_0191.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) @@ -34,7 +34,7 @@ Performance Improvements .. _whatsnew_0191.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`) diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst index 14310ceb45b4a5..023bc78081ec93 100644 --- a/doc/source/whatsnew/v0.19.2.rst +++ b/doc/source/whatsnew/v0.19.2.rst @@ -39,7 +39,7 @@ The ``pd.merge_asof()``, added in 0.19.0, gained some improvements: .. _whatsnew_0192.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance regression with ``PeriodIndex`` (:issue:`14822`) @@ -50,7 +50,7 @@ Performance Improvements .. _whatsnew_0192.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Compat with python 3.6 for pickling of some offsets (:issue:`14685`) - Compat with python 3.6 for some indexing exception types (:issue:`14684`, :issue:`14689`) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 51c8c488fb9d90..ef6108ae3ec909 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -151,7 +151,7 @@ commonly called 'unix epoch' or POSIX time. This was the previous default, so th .. _whatsnew_0200.enhancements.groupby_access: -Groupby Enhancements +Groupby enhancements ^^^^^^^^^^^^^^^^^^^^ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names. Previously, only column names could be referenced. This allows to easily group by a column and index level at the same time. (:issue:`5677`) @@ -240,7 +240,7 @@ The default is to infer the compression type from the extension (``compression=' .. _whatsnew_0200.enhancements.uint64_support: -UInt64 Support Improved +UInt64 support improved ^^^^^^^^^^^^^^^^^^^^^^^ Pandas has significantly improved support for operations involving unsigned, @@ -263,7 +263,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 .. 
_whatsnew_0200.enhancements.groupy_categorical: -GroupBy on Categoricals +GroupBy on categoricals ^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) @@ -280,7 +280,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr ordered=True)}) df -**Previous Behavior**: +**Previous behavior**: .. code-block:: ipython @@ -288,7 +288,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr --------------------------------------------------------------------------- ValueError: items in new_categories are not the same as in old categories -**New Behavior**: +**New behavior**: .. ipython:: python @@ -296,7 +296,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr .. _whatsnew_0200.enhancements.table_schema: -Table Schema Output +Table schema output ^^^^^^^^^^^^^^^^^^^ The new orient ``'table'`` for :meth:`DataFrame.to_json` @@ -457,7 +457,7 @@ Selecting via a scalar value that is contained *in* the intervals. .. _whatsnew_0200.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) @@ -580,7 +580,7 @@ Map on Index types now return other Index types mi = pd.MultiIndex.from_tuples([(1, 2), (2, 4)]) mi -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -596,7 +596,7 @@ Previous Behavior: In [8]: mi.map(lambda x: x[0]) Out[8]: array([1, 2]) -New Behavior: +New behavior: .. ipython:: python @@ -616,7 +616,7 @@ New Behavior: .tz_localize('Asia/Tokyo')) s -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -627,7 +627,7 @@ Previous Behavior: 2 2 dtype: int32 -New Behavior: +New behavior: .. ipython:: python @@ -653,7 +653,7 @@ Previous behaviour: In [2]: idx.hour Out[2]: array([ 0, 10, 20, 6, 16], dtype=int32) -New Behavior: +New behavior: .. ipython:: python @@ -697,7 +697,7 @@ data-types would yield different return types. These are now made consistent. (: ...: pd.Timestamp('20160101', tz='US/Eastern')]) Out[8]: array(['2016-01-01T05:00:00.000000000'], dtype='datetime64[ns]') - New Behavior: + New behavior: .. ipython:: python @@ -727,7 +727,7 @@ data-types would yield different return types. These are now made consistent. (: In [2]: pd.unique(pd.Series(list('baabc'), dtype='category')) Out[2]: array(['b', 'a', 'c'], dtype=object) - New Behavior: + New behavior: .. ipython:: python @@ -737,7 +737,7 @@ data-types would yield different return types. These are now made consistent. (: .. _whatsnew_0200.api_breaking.s3: -S3 File Handling +S3 file handling ^^^^^^^^^^^^^^^^ pandas now uses `s3fs `_ for handling S3 connections. This shouldn't break @@ -746,7 +746,7 @@ in prior versions of pandas. (:issue:`11915`). .. _whatsnew_0200.api_breaking.partial_string_indexing: -Partial String Indexing Changes +Partial string indexing changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :ref:`DatetimeIndex Partial String Indexing ` now works as an exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details. 
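A small sketch of the slice-vs-exact-match behavior described above (values are arbitrary):

.. code-block:: python

    import pandas as pd

    s = pd.Series([1, 2, 3],
                  index=pd.DatetimeIndex(['2011-12-31 23:59:59',
                                          '2012-01-01 00:00:00',
                                          '2012-01-01 00:00:01']))

    s['2011-12-31 23:59:59']   # string resolution matches the index -> exact match, a scalar
    s['2011-12-31']            # coarser (day) resolution -> partial string slice, a Series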
@@ -756,7 +756,7 @@ Partial String Indexing Changes df = pd.DataFrame({'a': [1, 2, 3]}, pd.DatetimeIndex(['2011-12-31 23:59:59', '2012-01-01 00:00:00', '2012-01-01 00:00:01'])) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -771,7 +771,7 @@ Previous Behavior: Name: a, dtype: int64 -New Behavior: +New behavior: .. code-block:: ipython @@ -797,7 +797,7 @@ Now the smallest acceptable dtype will be used (:issue:`13247`) df2 = pd.DataFrame(np.array([np.nan], dtype=np.float32, ndmin=2)) df2.dtypes -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -806,7 +806,7 @@ Previous Behavior: 0 float64 dtype: object -New Behavior: +New behavior: .. ipython:: python @@ -823,12 +823,12 @@ currently released version of ``pandas-gbq=0.1.4``. Documentation is now hosted .. _whatsnew_0200.api_breaking.memory_usage: -Memory Usage for Index is more Accurate +Memory usage for Index is more accurate ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, showing ``.memory_usage()`` on a pandas structure that has an index, would only include actual index values and not include structures that facilitated fast indexing. This will generally be different for ``Index`` and ``MultiIndex`` and less-so for other index types. (:issue:`15237`) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -843,7 +843,7 @@ Previous Behavior: In [11]: index.memory_usage(deep=True) Out[11]: 180 -New Behavior: +New behavior: .. code-block:: ipython @@ -900,7 +900,7 @@ doesn't behave as desired. [[0, 0, 1, 1], [0, 1, 0, 1]])) df -Previous Behavior: +Previous behavior: .. code-block:: python @@ -918,7 +918,7 @@ Previous Behavior: In [15]: df.sort_index().index.is_monotonic Out[15]: False -New Behavior: +New behavior: .. ipython:: python @@ -929,13 +929,13 @@ New Behavior: .. _whatsnew_0200.api_breaking.groupby_describe: -Groupby Describe Formatting +Groupby describe formatting ^^^^^^^^^^^^^^^^^^^^^^^^^^^ The output formatting of ``groupby.describe()`` now labels the ``describe()`` metrics in the columns instead of the index. This format is consistent with ``groupby.agg()`` when applying multiple functions at once. (:issue:`4792`) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -970,7 +970,7 @@ Previous Behavior: 1 1.5 0.707107 1 2 2 3.5 0.707107 3 4 -New Behavior: +New behavior: .. ipython:: python @@ -982,7 +982,7 @@ New Behavior: .. _whatsnew_0200.api_breaking.rolling_pairwise: -Window Binary Corr/Cov operations return a MultiIndex DataFrame +Window binary corr/cov operations return a MultiIndex DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object, @@ -1000,7 +1000,7 @@ See the section on :ref:`Windowed Binary Operations ` for periods=100, freq='D', name='foo')) df.tail() -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -1012,7 +1012,7 @@ Previous Behavior: Major_axis axis: A to B Minor_axis axis: A to B -New Behavior: +New behavior: .. ipython:: python @@ -1040,7 +1040,7 @@ usually resulting in an invalid comparison, returning an empty result frame. The df.to_hdf('store.h5', 'key', format='table', data_columns=True) df.dtypes -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -1050,7 +1050,7 @@ Previous Behavior: ^ SyntaxError: invalid token -New Behavior: +New behavior: .. 
code-block:: ipython @@ -1084,14 +1084,14 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method. right = pd.Index([1, 2, 3]) right - Previous Behavior: + Previous behavior: .. code-block:: ipython In [4]: left.intersection(right) Out[4]: Int64Index([1, 2], dtype='int64') - New Behavior: + New behavior: .. ipython:: python @@ -1106,7 +1106,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method. right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3]) right - Previous Behavior: + Previous behavior: .. code-block:: ipython @@ -1116,7 +1116,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method. 1 10 100 2 20 200 - New Behavior: + New behavior: .. ipython:: python @@ -1124,7 +1124,7 @@ joins, :meth:`DataFrame.join` and :func:`merge`, and the ``.align`` method. .. _whatsnew_0200.api_breaking.pivot_table: -Pivot Table always returns a DataFrame +Pivot table always returns a DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The documentation for :meth:`pivot_table` states that a ``DataFrame`` is *always* returned. Here a bug @@ -1137,7 +1137,7 @@ is fixed that allowed this to return a ``Series`` under certain circumstance. (: 'col3': [1, 3, 9]}) df -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -1149,7 +1149,7 @@ Previous Behavior: 9 E 5 Name: col1, dtype: int64 -New Behavior: +New behavior: .. ipython:: python @@ -1157,7 +1157,7 @@ New Behavior: .. _whatsnew_0200.api: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - ``numexpr`` version is now required to be >= 2.4.6 and it will not be used at all if this requisite is not fulfilled (:issue:`15213`). @@ -1192,12 +1192,12 @@ Other API Changes .. _whatsnew_0200.privacy: -Reorganization of the library: Privacy Changes +Reorganization of the library: privacy changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. _whatsnew_0200.privacy.extensions: -Modules Privacy Has Changed +Modules privacy has changed ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. @@ -1327,7 +1327,7 @@ Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some example df -Previous Behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. +Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. .. code-block:: ipython @@ -1532,7 +1532,7 @@ Should be changed to: .. _whatsnew_0200.deprecations.other: -Other Deprecations +Other deprecations ^^^^^^^^^^^^^^^^^^ - ``SparseArray.to_dense()`` has deprecated the ``fill`` parameter, as that parameter was not being respected (:issue:`14647`) @@ -1584,7 +1584,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0200.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of ``pd.wide_to_long()`` (:issue:`14779`) @@ -1606,7 +1606,7 @@ Performance Improvements .. _whatsnew_0200.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ Conversion @@ -1713,7 +1713,7 @@ Plotting - Bug in the date and time converters pandas registers with matplotlib not handling multiple dimensions (:issue:`16026`) - Bug in ``pd.scatter_matrix()`` could accept either ``color`` or ``c``, but not both (:issue:`14855`) -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. 
(:issue:`15021`) diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst index b2592579eb03fd..232d1d283d9bd4 100644 --- a/doc/source/whatsnew/v0.20.2.rst +++ b/doc/source/whatsnew/v0.20.2.rst @@ -35,7 +35,7 @@ Enhancements .. _whatsnew_0202.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance regression fix when indexing with a list-like (:issue:`16285`) @@ -46,7 +46,7 @@ Performance Improvements .. _whatsnew_0202.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when @@ -97,7 +97,7 @@ Plotting -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in creating a time-based rolling window on an empty ``DataFrame`` (:issue:`15819`) diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst index 8dc6acc2074bd0..72faabd95bf1f5 100644 --- a/doc/source/whatsnew/v0.20.3.rst +++ b/doc/source/whatsnew/v0.20.3.rst @@ -20,7 +20,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. _whatsnew_0203.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Fixed a bug in failing to compute rolling computations of a column-MultiIndexed ``DataFrame`` (:issue:`16789`, :issue:`16825`) diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 44b50437a6dfe8..34b610e8af0b38 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -300,7 +300,7 @@ as in :meth:`DataFrame.rename`. .. _whatsnew_0210.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ New functions or methods @@ -412,7 +412,7 @@ Previously WITH ``bottleneck``: In [2]: s.sum() Out[2]: 0.0 -New Behavior, without regard to the bottleneck installation: +New behavior, without regard to the bottleneck installation: .. ipython:: python @@ -434,7 +434,7 @@ but for consistency with the all-NaN case, this was changed to return NaN as wel .. _whatsnew_0210.api_breaking.loc: -Indexing with a list with missing labels is Deprecated +Indexing with a list with missing labels is deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, selecting with a list of labels, where one or more labels were missing would always succeed, returning ``NaN`` for missing labels. @@ -448,7 +448,7 @@ See the :ref:`deprecation docs `. s = pd.Series([1, 2, 3]) s -Previous Behavior +Previous behavior .. code-block:: ipython @@ -460,7 +460,7 @@ Previous Behavior dtype: float64 -Current Behavior +Current behavior .. code-block:: ipython @@ -492,7 +492,7 @@ Selection with all keys found is unchanged. .. _whatsnew_0210.api.na_changes: -NA naming Changes +NA naming changes ^^^^^^^^^^^^^^^^^ In order to promote more consistency among the pandas API, we have added additional top-level @@ -524,7 +524,7 @@ Previously: In [2]: type(list(s)[0]) Out[2]: numpy.int64 -New Behaviour: +New behavior: .. ipython:: python @@ -544,7 +544,7 @@ Previously: In [8]: type(df.to_dict()['a'][0]) Out[8]: numpy.int64 -New Behaviour: +New behavior: .. ipython:: python @@ -561,7 +561,7 @@ you would get a label based selection, potentially duplicating result labels, ra (where ``True`` selects elements), this was inconsistent how a boolean numpy array indexed. The new behavior is to act like a boolean numpy array indexer. (:issue:`17738`) -Previous Behavior: +Previous behavior: .. 
ipython:: python @@ -578,7 +578,7 @@ Previous Behavior: True 2 dtype: int64 -Current Behavior +Current behavior .. ipython:: python @@ -588,7 +588,7 @@ Current Behavior Furthermore, previously if you had an index that was non-numeric (e.g. strings), then a boolean Index would raise a ``KeyError``. This will now be treated as a boolean indexer. -Previously Behavior: +Previously behavior: .. ipython:: python @@ -600,7 +600,7 @@ Previously Behavior: In [39]: s.loc[pd.Index([True, False, True])] KeyError: "None of [Index([True, False, True], dtype='object')] are in the [index]" -Current Behavior +Current behavior .. ipython:: python @@ -614,7 +614,7 @@ Current Behavior In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -634,7 +634,7 @@ Previous Behavior: In [5]: resampled.index Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') -New Behavior: +New behavior: .. ipython:: python @@ -650,7 +650,7 @@ New Behavior: Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -670,7 +670,7 @@ Previous Behavior: open high low close 2000-01 0 9 0 9 -New Behavior: +New behavior: .. ipython:: python @@ -732,7 +732,7 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in .. _whatsnew_0210.api_breaking.dtype_conversions: -Dtype Conversions +Dtype conversions ^^^^^^^^^^^^^^^^^ Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtypes. (:issue:`16821`). @@ -752,7 +752,7 @@ Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignmen 2 3 dtype: int64 -New Behavior +New behavior .. ipython:: python @@ -789,7 +789,7 @@ These now coerce to ``object`` dtype. .. _whatsnew_210.api.multiindex_single: -MultiIndex Constructor with a Single Level +MultiIndex constructor with a single level ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``MultiIndex`` constructors no longer squeezes a MultiIndex with all @@ -818,7 +818,7 @@ UTC Localization with Series Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`). -Previous Behavior +Previous behavior .. ipython:: python @@ -833,7 +833,7 @@ Previous Behavior 2 2013-01-01 dtype: datetime64[ns] -New Behavior +New behavior .. ipython:: python @@ -843,14 +843,14 @@ Additionally, DataFrames with datetime columns that were parsed by :func:`read_s .. 
_whatsnew_0210.api.consistency_of_range_functions: -Consistency of Range Functions +Consistency of range functions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`). One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``period`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``period`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -863,7 +863,7 @@ Previous Behavior: In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q') Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC') -New Behavior: +New behavior: .. code-block:: ipython @@ -877,7 +877,7 @@ New Behavior: Additionally, the endpoint parameter ``end`` was not included in the intervals produced by ``interval_range``. However, all other range functions include ``end`` in their output. To promote consistency among the range functions, ``interval_range`` will now include ``end`` as the right endpoint of the final interval, except if ``freq`` is specified in a way which skips ``end``. -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -888,7 +888,7 @@ Previous Behavior: dtype='interval[int64]') -New Behavior: +New behavior: .. ipython:: python @@ -896,7 +896,7 @@ New Behavior: .. _whatsnew_0210.api.mpl_converters: -No Automatic Matplotlib Converters +No automatic Matplotlib converters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas no longer registers our ``date``, ``time``, ``datetime``, @@ -915,7 +915,7 @@ converters on first-use (:issue:`17710`). .. _whatsnew_0210.api: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) @@ -1024,7 +1024,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0210.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) @@ -1036,7 +1036,7 @@ Performance Improvements .. _whatsnew_0210.docs: -Documentation Changes +Documentation changes ~~~~~~~~~~~~~~~~~~~~~ - Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) @@ -1044,7 +1044,7 @@ Documentation Changes .. _whatsnew_0210.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ Conversion @@ -1114,7 +1114,7 @@ Plotting - Bug causing ``plotting.parallel_coordinates`` to reset the random seed when using random colors (:issue:`17525`) -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index c8897ca86e8cf3..64f3339834b38d 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -31,7 +31,7 @@ Highlights include: .. 
_whatsnew_0211.converters: -Restore Matplotlib datetime Converter Registration +Restore Matplotlib datetime converter registration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pandas implements some matplotlib converters for nicely formatting the axis @@ -77,7 +77,7 @@ Improvements to the Parquet IO functionality .. _whatsnew_0211.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) @@ -93,14 +93,14 @@ Deprecations .. _whatsnew_0211.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of plotting large series/dataframes (:issue:`18236`). .. _whatsnew_0211.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ Conversion @@ -143,7 +143,7 @@ Plotting - Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index b38fcd9d62af45..ea36b35d61740d 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -37,7 +37,7 @@ time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``. Based on feedback, we've partially reverted those changes. -Arithmetic Operations +Arithmetic operations ^^^^^^^^^^^^^^^^^^^^^ The default sum for empty or all-*NA* ``Series`` is now ``0``. @@ -93,7 +93,7 @@ returning ``1`` instead. These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. Finally, a few less obvious places in pandas are affected by this change. -Grouping by a Categorical +Grouping by a categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a ``Categorical`` and summing now returns ``0`` instead of @@ -196,7 +196,7 @@ Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) -Rolling and Expanding +Rolling and expanding ^^^^^^^^^^^^^^^^^^^^^ Rolling and expanding already have a ``min_periods`` keyword that behaves diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 51efa37b55adde..41c946cc9a5593 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -22,7 +22,7 @@ Highlights include: - :ref:`Instantiation from dicts respects order for Python 3.6+ `. - :ref:`Dependent column arguments for assign `. - :ref:`Merging / sorting on a combination of columns and index levels `. -- :ref:`Extending Pandas with custom types `. +- :ref:`Extending pandas with custom types `. - :ref:`Excluding unobserved categories from groupby `. - :ref:`Changes to make output shape of DataFrame.apply consistent `. @@ -105,7 +105,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python using ``.assign()`` to update an existing column. Previously, callables referring to other variables being updated would get the "old" values - Previous Behavior: + Previous behavior: .. code-block:: ipython @@ -118,7 +118,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python 1 3 -2 2 4 -3 - New Behavior: + New behavior: .. ipython:: python @@ -186,7 +186,7 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values .. 
_whatsnew_023.enhancements.extension: -Extending Pandas with Custom Types (Experimental) +Extending pandas with custom types (experimental) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy @@ -398,7 +398,7 @@ In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their s = pd.Series([-np.inf, 0, 1, np.nan, np.inf]) s -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -411,7 +411,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior: +Current behavior: .. ipython:: python @@ -424,7 +424,7 @@ Furthermore, previously if you rank ``inf`` or ``-inf`` values together with ``N s = pd.Series([np.nan, np.nan, -np.inf, -np.inf]) s -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -436,7 +436,7 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior: +Current behavior: .. ipython:: python @@ -502,7 +502,7 @@ Supplying a ``CategoricalDtype`` will make the categories in each column consist .. _whatsnew_0230.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - Unary ``+`` now permitted for ``Series`` and ``DataFrame`` as numeric operator (:issue:`16073`) @@ -597,7 +597,7 @@ Pandas will use the dict's insertion order, when creating a ``Series`` or ``DataFrame`` from a dict and you're using Python version 3.6 or higher. (:issue:`19884`) -Previous Behavior (and current behavior if on Python < 3.6): +Previous behavior (and current behavior if on Python < 3.6): .. code-block:: ipython @@ -614,7 +614,7 @@ Previous Behavior (and current behavior if on Python < 3.6): Note the Series above is ordered alphabetically by the index values. -New Behavior (for Python >= 3.6): +New behavior (for Python >= 3.6): .. ipython:: python @@ -738,7 +738,7 @@ where a list-like (e.g. ``tuple`` or ``list`` is returned) (:issue:`16353`, :iss columns=['A', 'B', 'C']) df -Previous Behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. +Previous behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. If the return shape did not match, a ``Series`` with lists was returned. .. code-block:: python @@ -764,7 +764,7 @@ If the return shape did not match, a ``Series`` with lists was returned. dtype: object -New Behavior: When the applied function returns a list-like, this will now *always* return a ``Series``. +New behavior: When the applied function returns a list-like, this will now *always* return a ``Series``. .. ipython:: python @@ -824,7 +824,7 @@ Note that this change also applies to :meth:`DataFrame.append`, which has also r .. _whatsnew_0230.api_breaking.build_changes: -Build Changes +Build changes ^^^^^^^^^^^^^ - Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`) @@ -833,12 +833,12 @@ Build Changes .. _whatsnew_0230.api_breaking.index_division_by_zero: -Index Division By Zero Fills Correctly +Index division by zero fills correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) -Previous Behavior: +Previous behavior: .. 
code-block:: ipython @@ -859,7 +859,7 @@ Previous Behavior: In [11]: pd.RangeIndex(1, 5) / 0 ZeroDivisionError: integer division or modulo by zero -Current Behavior: +Current behavior: .. ipython:: python @@ -888,7 +888,7 @@ extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame ``expand`` is set to ``False``. Finally, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`) -Previous Behavior: +Previous behavior: .. code-block:: ipython @@ -906,7 +906,7 @@ Previous Behavior: Out [4]: pandas.core.series.Series -New Behavior: +New behavior: .. ipython:: python @@ -933,7 +933,7 @@ The default value of the ``ordered`` parameter for :class:`~pandas.api.types.Cat In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``. -New Behavior: +New behavior: .. ipython:: python @@ -978,7 +978,7 @@ yourself. To revert to the old setting, you can run this line: .. _whatsnew_0230.api.datetimelike: -Datetimelike API Changes +Datetimelike API changes ^^^^^^^^^^^^^^^^^^^^^^^^ - The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) @@ -1007,7 +1007,7 @@ Datetimelike API Changes .. _whatsnew_0230.api.other: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) @@ -1130,7 +1130,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0230.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) @@ -1162,7 +1162,7 @@ Performance Improvements .. _whatsnew_0230.docs: -Documentation Changes +Documentation changes ~~~~~~~~~~~~~~~~~~~~~ Thanks to all of the contributors who participated in the Pandas Documentation @@ -1190,7 +1190,7 @@ read the `NumFOCUS blogpost`_ recapping the sprint. .. _whatsnew_0230.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ Categorical @@ -1393,7 +1393,7 @@ Plotting - :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`) -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index 0218c3b02a4132..d730a57a01a606 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_0231: -What's New in 0.23.1 (June 12, 2018) +What's new in 0.23.1 (June 12, 2018) ------------------------------------ {{ header }} @@ -20,7 +20,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. _whatsnew_0231.fixed_regressions: -Fixed Regressions +Fixed regressions ~~~~~~~~~~~~~~~~~ **Comparing Series with datetime.date** @@ -64,7 +64,7 @@ To summarize, here's the behavior in 0.22.0, 0.23.0, 0.23.1: In addition, ordering comparisons will raise a ``TypeError`` in the future. 
-**Other Fixes** +**Other fixes** - Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue inserts as this caused regression in certain cases (:issue:`21103`). @@ -85,7 +85,7 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future. .. _whatsnew_0231.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) @@ -94,10 +94,10 @@ Performance Improvements .. _whatsnew_0231.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ -**Groupby/Resample/Rolling** +**Groupby/resample/rolling** - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) - Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) diff --git a/doc/source/whatsnew/v0.23.2.rst b/doc/source/whatsnew/v0.23.2.rst index cae2415e3374e5..df8cc12e3385ed 100644 --- a/doc/source/whatsnew/v0.23.2.rst +++ b/doc/source/whatsnew/v0.23.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_0232: -What's New in 0.23.2 (July 5, 2018) +What's new in 0.23.2 (July 5, 2018) ----------------------------------- {{ header }} @@ -25,7 +25,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. _whatsnew_0232.enhancements: -Logical Reductions over Entire DataFrame +Logical reductions over entire DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) @@ -56,7 +56,7 @@ With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.1 .. _whatsnew_0232.fixed_regressions: -Fixed Regressions +Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) @@ -70,14 +70,14 @@ Fixed Regressions - Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) -Build Changes +Build changes ~~~~~~~~~~~~~ - The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) .. _whatsnew_0232.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ **Conversion** diff --git a/doc/source/whatsnew/v0.23.3.rst b/doc/source/whatsnew/v0.23.3.rst index 3b1a0cfa5f2739..bb8862a89b0031 100644 --- a/doc/source/whatsnew/v0.23.3.rst +++ b/doc/source/whatsnew/v0.23.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_0233: -What's New in 0.23.3 (July 7, 2018) +What's new in 0.23.3 (July 7, 2018) ----------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.23.4.rst b/doc/source/whatsnew/v0.23.4.rst index 01f904e129f804..060d1fc8eba341 100644 --- a/doc/source/whatsnew/v0.23.4.rst +++ b/doc/source/whatsnew/v0.23.4.rst @@ -1,6 +1,6 @@ .. _whatsnew_0234: -What's New in 0.23.4 (August 3, 2018) +What's new in 0.23.4 (August 3, 2018) ------------------------------------- {{ header }} @@ -20,17 +20,17 @@ and bug fixes. We recommend that all users upgrade to this version. .. 
_whatsnew_0234.fixed_regressions: -Fixed Regressions +Fixed regressions ~~~~~~~~~~~~~~~~~ - Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) .. _whatsnew_0234.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ -**Groupby/Resample/Rolling** +**Groupby/resample/rolling** - Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) - Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 086519ad751924..403b4908d36e3b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_0240: -What's New in 0.24.0 (January 25, 2019) +What's new in 0.24.0 (January 25, 2019) --------------------------------------- .. warning:: @@ -35,7 +35,7 @@ Enhancements .. _whatsnew_0240.enhancements.intna: -Optional Integer NA Support +Optional integer NA support ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. @@ -194,7 +194,7 @@ is a float. .. _whatsnew_0240.enhancements.interval: -Storing Interval and Period Data in Series and DataFrame +Storing Interval and Period data in Series and DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`Interval` and :class:`Period` data may now be stored in a :class:`Series` or :class:`DataFrame`, in addition to an @@ -300,7 +300,7 @@ value. (:issue:`17054`) """) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -309,7 +309,7 @@ value. (:issue:`17054`) [ A B C 0 1 2 NaN] -*New Behavior*: +*New behavior*: .. ipython:: python @@ -361,7 +361,7 @@ See the :ref:`Advanced documentation on renaming` for more .. _whatsnew_0240.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) @@ -482,7 +482,7 @@ for the default line terminator (:issue:`20353`). This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator even when ``'\n'`` was passed in ``line_terminator``. -*Previous Behavior* on Windows: +*Previous behavior* on Windows: .. code-block:: ipython @@ -508,7 +508,7 @@ even when ``'\n'`` was passed in ``line_terminator``. Out[5]: b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' -*New Behavior* on Windows: +*New behavior* on Windows: Passing ``line_terminator`` explicitly, set thes ``line terminator`` to that character. @@ -569,7 +569,7 @@ missing indicator, ``np.nan``. (:issue:`20377`) from io import StringIO -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -579,7 +579,7 @@ missing indicator, ``np.nan``. (:issue:`20377`) Out[7]: 'nan' -*New Behavior*: +*New behavior*: .. ipython:: python @@ -591,7 +591,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form .. 
_whatsnew_0240.api.timezone_offset_parsing: -Parsing Datetime Strings with Timezone Offsets +Parsing datetime strings with timezone offsets ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` @@ -602,7 +602,7 @@ offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC offset in the ``tz`` attribute when all the datetime strings have the same UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -616,7 +616,7 @@ UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) -*New Behavior*: +*New behavior*: .. ipython:: python @@ -656,7 +656,7 @@ Parsing mixed-timezones with :func:`read_csv` :func:`read_csv` no longer silently converts mixed-timezone columns to UTC (:issue:`24987`). -*Previous Behavior* +*Previous behavior* .. code-block:: python @@ -671,7 +671,7 @@ Parsing mixed-timezones with :func:`read_csv` 1 1999-12-31 18:00:00 Name: a, dtype: datetime64[ns] -*New Behavior* +*New behavior* .. ipython:: python @@ -704,7 +704,7 @@ to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.e :attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -717,7 +717,7 @@ or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) In [5]: p.end_time Out[5]: Timestamp(2017-01-01 23:59:59.999999999) -*New Behavior*: +*New behavior*: Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as is the case with :attr:`Period.end_time`, for example @@ -744,7 +744,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays ser = pd.Series([pd.Timestamp('2000', tz='UTC'), pd.Timestamp('2000', tz='UTC')]) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -752,7 +752,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object) -*New Behavior*: +*New behavior*: .. ipython:: python @@ -761,7 +761,7 @@ from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays .. _whatsnew_0240.api_breaking.sparse_values: -Sparse Data Structure Refactor +Sparse data structure refactor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, @@ -807,7 +807,7 @@ Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return v a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or a just a subset of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned (:issue:`24284`). -*Previous Behavior* +*Previous behavior* The first :func:`get_dummies` returns a :class:`DataFrame` because the column ``A`` is not dummy encoded. When just ``["B", "C"]`` are passed to ``get_dummies``, @@ -828,7 +828,7 @@ then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was retur df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']}) -*New Behavior* +*New behavior* Now, the return type is consistently a :class:`DataFrame`. 
@@ -861,7 +861,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with .. _whatsnew_0240.api.datetimelike.normalize: -Tick DateOffset Normalize Restrictions +Tick DateOffset normalize restrictions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, @@ -869,7 +869,7 @@ Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, ``normalize=True`` is no longer supported. This prevents unexpected behavior where addition could fail to be monotone or associative. (:issue:`21427`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -891,7 +891,7 @@ where addition could fail to be monotone or associative. (:issue:`21427`) In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) Out[7]: False -*New Behavior*: +*New behavior*: .. ipython:: python @@ -905,13 +905,13 @@ where addition could fail to be monotone or associative. (:issue:`21427`) .. _whatsnew_0240.api.period_subtraction: -Period Subtraction +Period subtraction ^^^^^^^^^^^^^^^^^^ Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. instead of an integer (:issue:`21314`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -922,7 +922,7 @@ instead of an integer (:issue:`21314`) In [4]: june - april Out [4]: 2 -*New Behavior*: +*New behavior*: .. ipython:: python @@ -933,7 +933,7 @@ instead of an integer (:issue:`21314`) Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -942,7 +942,7 @@ an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` In [3]: pi - pi[0] Out[3]: Int64Index([0, 1, 2], dtype='int64') -*New Behavior*: +*New behavior*: .. ipython:: python @@ -952,7 +952,7 @@ an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` .. _whatsnew_0240.api.timedelta64_subtract_nan: -Addition/Subtraction of ``NaN`` from :class:`DataFrame` +Addition/subtraction of ``NaN`` from :class:`DataFrame` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Adding or subtracting ``NaN`` from a :class:`DataFrame` column with @@ -965,7 +965,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and df = pd.DataFrame([pd.Timedelta(days=1)]) df -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -976,7 +976,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and 0 0 NaT -*New Behavior*: +*New behavior*: .. code-block:: ipython @@ -986,7 +986,7 @@ all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and .. _whatsnew_0240.api.dataframe_cmp_broadcasting: -DataFrame Comparison Operations Broadcasting Changes +DataFrame comparison operations broadcasting changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, the broadcasting behavior of :class:`DataFrame` comparison operations (``==``, ``!=``, ...) was inconsistent with the behavior of @@ -1006,7 +1006,7 @@ The affected cases are: df = pd.DataFrame(arr) df -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -1045,7 +1045,7 @@ The affected cases are: ... ValueError: Unable to coerce to Series, length must be 2: given 3 -*New Behavior*: +*New behavior*: .. ipython:: python @@ -1072,7 +1072,7 @@ The affected cases are: .. 
_whatsnew_0240.api.dataframe_arithmetic_broadcasting: -DataFrame Arithmetic Operations Broadcasting Changes +DataFrame arithmetic operations broadcasting changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`DataFrame` arithmetic operations when operating with 2-dimensional @@ -1085,7 +1085,7 @@ broadcast. (:issue:`23000`) df = pd.DataFrame(arr) df -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -1096,7 +1096,7 @@ broadcast. (:issue:`23000`) ... ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) -*New Behavior*: +*New behavior*: .. ipython:: python @@ -1105,13 +1105,13 @@ broadcast. (:issue:`23000`) .. _whatsnew_0240.api.incompatibilities: -Series and Index Data-Dtype Incompatibilities +Series and Index data-dtype incompatibilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``Series`` and ``Index`` constructors now raise when the data is incompatible with a passed ``dtype=`` (:issue:`15832`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -1120,7 +1120,7 @@ data is incompatible with a passed ``dtype=`` (:issue:`15832`) 0 18446744073709551615 dtype: uint64 -*New Behavior*: +*New behavior*: .. code-block:: ipython @@ -1143,7 +1143,7 @@ other than another ``Categorical`` of ints (:issue:`19214`) s = pd.Series([0, 1, np.nan]) c = pd.Series([0, 1, np.nan], dtype="category") -*Previous Behavior* +*Previous behavior* .. code-block:: ipython @@ -1157,13 +1157,13 @@ other than another ``Categorical`` of ints (:issue:`19214`) 2 NaN dtype: float64 -*New Behavior* +*New behavior* .. ipython:: python pd.concat([s, c]) -Datetimelike API Changes +Datetimelike API changes ^^^^^^^^^^^^^^^^^^^^^^^^ - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) @@ -1175,7 +1175,7 @@ Datetimelike API Changes .. _whatsnew_0240.api.other: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) @@ -1212,10 +1212,10 @@ Other API Changes .. _whatsnew_0240.api.extension: -Extension Type Changes +Extension type changes ~~~~~~~~~~~~~~~~~~~~~~ -**Equality and Hashability** +**Equality and hashability** Pandas now requires that extension dtypes be hashable (i.e. the respective ``ExtensionDtype`` objects; hashability is not a requirement for the values @@ -1263,7 +1263,7 @@ ways of adding operator support. - :meth:`ExtensionArray._formatting_values` is deprecated. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -**Bug Fixes** +**Bug fixes** - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) - :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) @@ -1329,7 +1329,7 @@ Deprecations .. 
_whatsnew_0240.deprecations.datetimelike_int_ops: -Integer Addition/Subtraction with Datetimes and Timedeltas is Deprecated +Integer addition/subtraction with datetimes and timedeltas is deprecated ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In the past, users could—in some cases—add or subtract integers or integer-dtype @@ -1338,7 +1338,7 @@ arrays from :class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaInde This usage is now deprecated. Instead add or subtract integer multiples of the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -1354,7 +1354,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). In [10]: dti + pd.Index([1, 2]) Out[10]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None) -*New Behavior*: +*New behavior*: .. ipython:: python :okwarning: @@ -1371,7 +1371,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). .. _whatsnew_0240.deprecations.integer_tz: -Passing Integer data and a timezone to DatetimeIndex +Passing integer data and a timezone to datetimeindex ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The behavior of :class:`DatetimeIndex` when passed integer data and @@ -1417,7 +1417,7 @@ The old behavior can be retained with by localizing directly to the final timezo .. _whatsnew_0240.deprecations.tz_aware_array: -Converting Timezone-Aware Series and Index to NumPy Arrays +Converting timezone-aware Series and Index to NumPy arrays ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The conversion from a :class:`Series` or :class:`Index` with timezone-aware @@ -1459,13 +1459,13 @@ The default behavior remains the same, but issues a warning The previous or future behavior can be obtained, without any warnings, by specifying the ``dtype`` -*Previous Behavior* +*Previous behavior* .. ipython:: python np.asarray(ser, dtype='datetime64[ns]') -*Future Behavior* +*Future behavior* .. ipython:: python @@ -1512,7 +1512,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0240.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Slicing Series and DataFrames with an monotonically increasing :class:`CategoricalIndex` @@ -1547,7 +1547,7 @@ Performance Improvements .. _whatsnew_0240.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ Categorical @@ -1827,7 +1827,7 @@ Plotting - Bug in :func:`DataFrame.plot.bar` caused bars to use multiple colors instead of a single one (:issue:`20585`) - Bug in validating color parameter caused extra color to be appended to the given color array. This happened to multiple plotting functions using matplotlib. (:issue:`20726`) -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) @@ -1915,7 +1915,7 @@ Style - :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) - :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. 
-Build Changes +Build changes ^^^^^^^^^^^^^ - Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 8f963f1285e1b8..9dffe1f0764774 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -2,7 +2,7 @@ .. _whatsnew_0241: -Whats New in 0.24.1 (February 3, 2019) +Whats new in 0.24.1 (February 3, 2019) -------------------------------------- .. warning:: @@ -17,7 +17,7 @@ including other versions of pandas. See :ref:`whatsnew_0240` for the 0.24.0 chan .. _whatsnew_0241.api: -API Changes +API changes ~~~~~~~~~~~ Changing the ``sort`` parameter for :class:`Index` set operations @@ -47,7 +47,7 @@ The `sort` option for :meth:`Index.intersection` has changed in three ways. .. _whatsnew_0241.regressions: -Fixed Regressions +Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.to_dict` with ``records`` orient raising an @@ -62,7 +62,7 @@ Fixed Regressions .. _whatsnew_0241.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ **Reshaping** diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index c3b442e2352bb7..21936ba270c776 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -2,7 +2,7 @@ .. _whatsnew_0242: -Whats New in 0.24.2 (March 12, 2019) +Whats new in 0.24.2 (March 12, 2019) ------------------------------------ .. warning:: @@ -17,7 +17,7 @@ including other versions of pandas. .. _whatsnew_0242.regressions: -Fixed Regressions +Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) @@ -39,7 +39,7 @@ Fixed Regressions .. _whatsnew_0242.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ **I/O** diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5a5de357e17780..1980e00f1073dc 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_0250: -What's New in 0.25.0 (April XX, 2019) +What's new in 0.25.0 (April XX, 2019) ------------------------------------- .. warning:: @@ -29,7 +29,7 @@ Enhancements .. _whatsnew_0250.enhancements.agg_relabel: -Groupby Aggregation with Relabeling +Groupby aggregation with relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has added special groupby behavior, known as "named aggregation", for naming the @@ -113,7 +113,7 @@ the output will truncate, if it's wider than :attr:`options.display.width` .. _whatsnew_0250.enhancements.other: -Other Enhancements +Other enhancements ^^^^^^^^^^^^^^^^^^ - :func:`DataFrame.plot` keywords ``logy``, ``logx`` and ``loglog`` can now accept the value ``'sym'`` for symlog scaling. (:issue:`24867`) - Added support for ISO week year format ('%G-%V-%u') when parsing datetimes using :meth:`to_datetime` (:issue:`16607`) @@ -159,7 +159,7 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -168,7 +168,7 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) 0 2019-01-01 00:00:00-08:00 0 -*New Behavior*: +*New behavior*: .. ipython:: python @@ -185,7 +185,7 @@ Constructing a :class:`MultiIndex` with ``NaN`` levels or codes value < -1 was a Now, construction with codes value < -1 is not allowed and ``NaN`` levels' corresponding codes would be reassigned as -1. 
(:issue:`19387`) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython @@ -199,7 +199,7 @@ would be reassigned as -1. (:issue:`19387`) Out[2]: MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) -*New Behavior*: +*New behavior*: .. ipython:: python :okexcept: @@ -211,7 +211,7 @@ would be reassigned as -1. (:issue:`19387`) .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: -``GroupBy.apply`` on ``DataFrame`` evaluates first group only once +``Groupby.apply`` on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` @@ -230,7 +230,7 @@ Now every group is evaluated only a single time. print(group.name) return group -*Previous Behavior*: +*Previous behavior*: .. code-block:: python @@ -243,14 +243,14 @@ Now every group is evaluated only a single time. 0 x 1 1 y 2 -*New Behavior*: +*New behavior*: .. ipython:: python df.groupby("a").apply(func) -Concatenating Sparse Values +Concatenating sparse values ^^^^^^^^^^^^^^^^^^^^^^^^^^^ When passed DataFrames whose values are sparse, :func:`concat` will now return a @@ -260,14 +260,14 @@ When passed DataFrames whose values are sparse, :func:`concat` will now return a df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) -*Previous Behavior*: +*Previous behavior*: .. code-block:: ipython In [2]: type(pd.concat([df, df])) pandas.core.sparse.frame.SparseDataFrame -*New Behavior*: +*New behavior*: .. ipython:: python @@ -293,7 +293,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t ``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, :meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. -*Previous Behavior*: +*Previous behavior*: .. code-block:: python @@ -313,7 +313,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t 2 False dtype: bool -*New Behavior*: +*New behavior*: .. ipython:: python :okexcept: @@ -324,7 +324,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t .. _whatsnew_0250.api_breaking.incompatible_index_unions: -Incompatible Index Type Unions +Incompatible Index type unions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When performing :func:`Index.union` operations between objects of incompatible dtypes, @@ -334,7 +334,7 @@ of empty :class:`Index` objects will now be evaluated before performing union op rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). -*Previous Behavior*: +*Previous behavior*: .. code-block:: python @@ -345,7 +345,7 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) Out[2]: Int64Index([1, 2, 3], dtype='int64') -*New Behavior*: +*New behavior*: .. ipython:: python @@ -371,7 +371,7 @@ are returned. (:issue:`21521`) df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) df -*Previous Behavior*: +*Previous behavior*: .. code-block:: python @@ -381,7 +381,7 @@ are returned. (:issue:`21521`) 0 x 1 1 y 2 -*New Behavior*: +*New behavior*: .. ipython:: python @@ -400,7 +400,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 df = pd.DataFrame({"empty_col": pd.Categorical([])}) df -*Previous Behavior*: +*Previous behavior*: .. 
code-block:: python @@ -410,7 +410,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 count 0 unique 0 -*New Behavior*: +*New behavior*: .. ipython:: python @@ -496,7 +496,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor .. _whatsnew_0250.api.other: -Other API Changes +Other API changes ^^^^^^^^^^^^^^^^^ - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) @@ -515,13 +515,13 @@ Other API Changes Deprecations ~~~~~~~~~~~~ -Sparse Subclasses +Sparse subclasses ^^^^^^^^^^^^^^^^^ The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Their functionality is better-provided by a ``Series`` or ``DataFrame`` with sparse values. -**Previous Way** +**Previous way** .. ipython:: python :okwarning: @@ -529,7 +529,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. df = pd.SparseDataFrame({"A": [0, 0, 1, 2]}) df.dtypes -**New Way** +**New way** .. ipython:: python @@ -538,7 +538,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. The memory usage of the two approaches is identical. See :ref:`sparse.migration` for more (:issue:`19239`). -Other Deprecations +Other deprecations ^^^^^^^^^^^^^^^^^^ - The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`). @@ -552,7 +552,6 @@ Other Deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) -- :meth:`Series.to_sparse`, :meth:`DataFrame.to_sparse`, :meth:`Series.to_dense` and :meth:`DataFrame.to_dense` are deprecated and will be removed in a future version. (:issue:`26557`). .. _whatsnew_0250.prior_deprecations: @@ -568,7 +567,7 @@ Removal of prior version deprecations/changes .. _whatsnew_0250.performance: -Performance Improvements +Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) @@ -593,7 +592,7 @@ Performance Improvements .. 
_whatsnew_0250.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ @@ -748,7 +747,7 @@ Plotting - - -Groupby/Resample/Rolling +Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst index 0c2047ee69b81d..8e41e528f5b75d 100644 --- a/doc/source/whatsnew/v0.4.x.rst +++ b/doc/source/whatsnew/v0.4.x.rst @@ -5,7 +5,7 @@ v.0.4.1 through v0.4.3 (September 25 - October 9, 2011) {{ header }} -New Features +New features ~~~~~~~~~~~~ - Added Python 3 support using 2to3 (:issue:`200`) @@ -32,7 +32,7 @@ New Features - :ref:`Enable ` unstacking by name (:issue:`142`) - :ref:`Enable ` ``sortlevel`` to work by level (:issue:`141`) -Performance Enhancements +Performance enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ - Altered binary operations on differently-indexed SparseSeries objects diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 4e635a5fe68593..37c52ac7bb34e1 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -12,7 +12,7 @@ v.0.5.0 (October 24, 2011) from pandas import * # noqa F401, F403 -New Features +New features ~~~~~~~~~~~~ - :ref:`Added ` ``DataFrame.align`` method with standard join options @@ -36,7 +36,7 @@ New Features - :ref:`Added ` support for different delimiters in ``DataFrame.to_csv`` (:issue:`244`) - TODO: DOCS ABOUT TAKE METHODS -Performance Enhancements +Performance enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ - VBENCH Major performance improvements in file parsing functions ``read_csv`` and ``read_table`` diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index c0aba18d08b279..973ba897b32347 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -11,7 +11,7 @@ v.0.6.0 (November 25, 2011) from pandas import * # noqa F401, F403 -New Features +New features ~~~~~~~~~~~~ - :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` - :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) @@ -49,7 +49,7 @@ New Features - :ref:`Added ` ``raw`` option to ``DataFrame.apply`` for performance if only need ndarray (:issue:`309`) - Added proper, tested weighted least squares to standard and panel OLS (:issue:`303`) -Performance Enhancements +Performance enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ - VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the code base (:issue:`361`) - VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (:issue:`309`) diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst index deff214354e2b2..a63cd37e47dc28 100644 --- a/doc/source/whatsnew/v0.7.0.rst +++ b/doc/source/whatsnew/v0.7.0.rst @@ -109,7 +109,7 @@ New features - :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame. -API Changes to integer indexing +API changes to integer indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ One of the potentially riskiest API changes in 0.7.0, but also one of the most @@ -255,7 +255,7 @@ In the case of integer indexes, the behavior will be exactly as before If you wish to do indexing with sequences and slicing on an integer index with label semantics, use ``ix``. 
-Other API Changes +Other API changes ~~~~~~~~~~~~~~~~~ - The deprecated ``LongPanel`` class has been completely removed diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 24bb756d66d681..a8697f60d7467e 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -50,7 +50,7 @@ New features - Add ``kurt`` methods to Series and DataFrame for computing kurtosis -NA Boolean Comparison API Change +NA Boolean comparison API change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Reverted some changes to how NA values (represented typically as ``NaN`` or @@ -79,7 +79,7 @@ in numerical arrays, would cause a large amount of problems for users. Thus, a "practicality beats purity" approach was taken. This issue may be revisited at some point in the future. -Other API Changes +Other API changes ~~~~~~~~~~~~~~~~~ When calling ``apply`` on a grouped Series, the return value will also be a From f5131913b32b756bc72d6dd33ecd4c52369edd3b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 16:27:17 -0500 Subject: [PATCH 072/238] TST: Fix flaky test (#27010) --- pandas/tests/resample/test_resample_api.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7157ecccace006..ca2fb1acb6afa8 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -14,10 +14,15 @@ end=datetime(2005, 1, 10), freq='Min') test_series = Series(np.random.rand(len(dti)), dti) -test_frame = DataFrame( +_test_frame = DataFrame( {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))}) +@pytest.fixture +def test_frame(): + return _test_frame.copy() + + def test_str(): r = test_series.resample('H') @@ -76,7 +81,7 @@ def test_groupby_resample_on_api(): assert_frame_equal(result, expected) -def test_pipe(): +def test_pipe(test_frame): # GH17905 # series @@ -92,7 +97,7 @@ def test_pipe(): tm.assert_frame_equal(result, expected) -def test_getitem(): +def test_getitem(test_frame): r = test_frame.resample('H') tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) @@ -111,7 +116,7 @@ def test_getitem(): @pytest.mark.parametrize('key', [['D'], ['A', 'D']]) -def test_select_bad_cols(key): +def test_select_bad_cols(key, test_frame): g = test_frame.resample('H') # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! 
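The test refactor above follows a common pytest pattern: a module-level DataFrame shared across tests can be mutated by one test and contaminate the others (the kind of flakiness this commit addresses), whereas a fixture that hands out a fresh copy keeps every test isolated. A minimal sketch of that pattern, assuming plain pytest and with all names invented for illustration (it is not code from this patch):

.. code-block:: python

   import pandas as pd
   import pytest

   _shared = pd.DataFrame({"A": [1, 2, 3]})

   @pytest.fixture
   def frame():
       # Each test receives its own copy, so in-place edits
       # (such as adding a column) cannot leak into other tests.
       return _shared.copy()

   def test_adds_column(frame):
       frame["B"] = "foo"  # mutation stays local to this test
       assert list(frame.columns) == ["A", "B"]

   def test_sees_pristine_frame(frame):
       assert list(frame.columns) == ["A"]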
@@ -120,7 +125,7 @@ def test_select_bad_cols(key): g[key] -def test_attribute_access(): +def test_attribute_access(test_frame): r = test_frame.resample('H') tm.assert_series_equal(r.A.sum(), r['A'].sum()) @@ -143,7 +148,7 @@ def test_api_compat_before_use(): getattr(rs, attr) -def tests_skip_nuisance(): +def tests_skip_nuisance(test_frame): df = test_frame df['D'] = 'foo' From 8b48f5c75a058c239a5d0eb9ee4f1593f1be1810 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 16:33:08 -0500 Subject: [PATCH 073/238] BUG: Restrict DTA to 1D (#27027) --- pandas/core/algorithms.py | 19 +++++++++++++------ pandas/core/arrays/datetimes.py | 2 ++ pandas/io/formats/format.py | 8 ++++++++ pandas/tests/arrays/test_datetimes.py | 12 ++++++++++++ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 77664b3fa73d06..98daae076fbc14 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -17,12 +17,12 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, - is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, - is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, - is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, - is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, - needs_i8_conversion) + is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, + is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, + is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, + is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, + is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, + is_unsigned_integer_dtype, needs_i8_conversion) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -105,6 +105,13 @@ def _ensure_data(values, dtype=None): dtype = values.dtype else: # Datetime + if values.ndim > 1 and is_datetime64_ns_dtype(values): + # Avoid calling the DatetimeIndex constructor as it is 1D only + # Note: this is reached by DataFrame.rank calls GH#27027 + asi8 = values.view('i8') + dtype = values.dtype + return asi8, dtype, 'int64' + from pandas import DatetimeIndex values = DatetimeIndex(values) dtype = values.dtype diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6e7217762a3fb2..eaa0278da6dc32 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -309,6 +309,8 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): "ndarray, or Series or Index containing one of those." 
) raise ValueError(msg.format(type(values).__name__)) + if values.ndim != 1: + raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == 'i8': # for compat with datetime/timedelta/period shared methods, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 152e9a2e9ab3d3..c709ff876b3a0b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1274,6 +1274,8 @@ def format_percentiles(percentiles): def _is_dates_only(values): # return a boolean if we are only dates (and don't have a timezone) + assert values.ndim == 1 + values = DatetimeIndex(values) if values.tz is not None: return False @@ -1325,6 +1327,12 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): def _get_format_datetime64_from_values(values, date_format): """ given values and a date_format, return a string format """ + + if isinstance(values, np.ndarray) and values.ndim > 1: + # We don't actaully care about the order of values, and DatetimeIndex + # only accepts 1D values + values = values.ravel() + is_dates_only = _is_dates_only(values) if is_dates_only: return date_format or "%Y-%m-%d" diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 1218527f6fd9bd..c7c0e1180ce464 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -15,6 +15,18 @@ class TestDatetimeArrayConstructor: + + def test_only_1dim_accepted(self): + arr = np.array([0, 1, 2, 3], dtype='M8[h]').astype('M8[ns]') + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 2-dim + DatetimeArray(arr.reshape(2, 2)) + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + DatetimeArray(arr[[0]].squeeze()) + def test_freq_validation(self): # GH#24623 check that invalid instances cannot be created with the # public constructor From de0867f8bedab9ce7d8d4f46267c4c123ff3166c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Jun 2019 16:59:48 -0500 Subject: [PATCH 074/238] BLD: fix py37 build warnings (#26769) --- pandas/io/msgpack/_unpacker.pyx | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/io/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx index 8734990c44da04..c2e2dfc521a514 100644 --- a/pandas/io/msgpack/_unpacker.pyx +++ b/pandas/io/msgpack/_unpacker.pyx @@ -5,15 +5,13 @@ from cython cimport Py_ssize_t from cpython cimport ( PyCallable_Check, - PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release, + PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release, Py_buffer, PyBytes_Size, PyBytes_FromStringAndSize, PyBytes_AsString) cdef extern from "Python.h": ctypedef struct PyObject - cdef int PyObject_AsReadBuffer(object o, const void** buff, - Py_ssize_t* buf_len) except -1 from libc.stdlib cimport free, malloc from libc.string cimport memcpy, memmove @@ -129,8 +127,14 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, Py_ssize_t buf_len char* cenc = NULL char* cerr = NULL + Py_buffer view + bytes extra_bytes - PyObject_AsReadBuffer(packed, &buf, &buf_len) + # GH#26769 Effectively re-implement deprecated PyObject_AsReadBuffer; + # based on https://xpra.org/trac/ticket/1884 + PyObject_GetBuffer(packed, &view, PyBUF_SIMPLE) + buf = view.buf + buf_len = view.len if encoding is not None: if isinstance(encoding, unicode): @@ -149,10 +153,13 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, if ret == 1: obj = unpack_data(&ctx) if off < buf_len: - raise 
ExtraData(obj, PyBytes_FromStringAndSize( - buf + off, buf_len - off)) + extra_bytes = PyBytes_FromStringAndSize(buf + off, buf_len - off) + PyBuffer_Release(&view) + raise ExtraData(obj, extra_bytes) + PyBuffer_Release(&view) return obj else: + PyBuffer_Release(&view) raise UnpackValueError("Unpack failed: error = {ret}".format(ret=ret)) From ce86c21ce7c64089d488965821ac36ac6eddddc9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 27 Jun 2019 18:13:27 -0500 Subject: [PATCH 075/238] BUG: preserve categorical & sparse types when grouping / pivot (#27071) --- doc/source/whatsnew/v0.25.0.rst | 31 ++++++++++- pandas/core/groupby/generic.py | 11 +++- pandas/core/groupby/groupby.py | 42 +++++++++++---- pandas/core/groupby/ops.py | 6 +-- pandas/core/internals/blocks.py | 24 ++++++++- pandas/core/internals/construction.py | 5 +- pandas/core/nanops.py | 9 ++-- pandas/tests/extension/base/groupby.py | 12 +++++ .../tests/extension/decimal/test_decimal.py | 6 ++- pandas/tests/groupby/test_categorical.py | 21 ++++++++ pandas/tests/groupby/test_function.py | 53 +++++++++---------- pandas/tests/groupby/test_nth.py | 19 ++++--- pandas/tests/resample/test_datetime_index.py | 6 +++ pandas/tests/sparse/test_groupby.py | 10 ++-- pandas/tests/sparse/test_pivot.py | 21 +++++--- 15 files changed, 205 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1980e00f1073dc..8f677b1f7dc76a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -322,6 +322,35 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t s s.str.startswith(b'a') +.. _whatsnew_0250.api_breaking.groupby_categorical: + +Categorical dtypes are preserved during groupby +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) + +.. ipython:: python + + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [5]: df.groupby('payload').first().col.dtype + Out[5]: dtype('O') + +*New Behavior*: + +.. ipython:: python + + df.groupby('payload').first().col.dtype + + .. _whatsnew_0250.api_breaking.incompatible_index_unions: Incompatible Index type unions @@ -809,7 +838,7 @@ ExtensionArray - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). 
- :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) -- Keyword argument ``deep`` has been removed from :method:`ExtensionArray.copy` (:issue:`27083`) +- Keyword argument ``deep`` has been removed from :meth:`ExtensionArray.copy` (:issue:`27083`) Other ^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a10920b7a5afb4..7c8c7956f8cb4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -158,12 +158,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, obj = self.obj[data.items[locs]] s = groupby(obj, self.grouper) - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + try: + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + except TypeError: + # we may have an exception in trying to aggregate + # continue and exclude the block + pass finally: + dtype = block.values.dtype + # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result) + result = block._try_coerce_and_cast_result(result, dtype=dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 64cacd60da30f5..202d4fb15f9717 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False): elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. + + # return the same type (Series) as our caller try: result = obj._values._from_sequence(result, dtype=dtype) except Exception: @@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs): """ nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean', **kwargs) + return self._cython_agg_general( + 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1179,7 +1182,11 @@ def median(self, **kwargs): Median of values within each group. """ try: - return self._cython_agg_general('median', **kwargs) + return self._cython_agg_general( + 'median', + alt=lambda x, + axis: Series(x).median(axis=axis, **kwargs), + **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_groupby_func('var', args, kwargs) if ddof == 1: try: - return self._cython_agg_general('var', **kwargs) + return self._cython_agg_general( + 'var', + alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), + **kwargs) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1263,7 +1273,6 @@ def sem(self, ddof=1): Series or DataFrame Standard error of the mean of values within each group. 
""" - return self.std(ddof=ddof) / np.sqrt(self.count()) @Substitution(name='groupby') @@ -1290,7 +1299,7 @@ def _add_numeric_operations(cls): """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False, + numeric_only=True, min_count=-1): _local_template = """ @@ -1312,17 +1321,30 @@ def f(self, **kwargs): kwargs['min_count'] = min_count self._set_group_selection() + + # try a cython aggregation if we can try: return self._cython_agg_general( alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) - if _convert: - result = result._convert(datetime=True) - return result + pass + + # apply a non-cython aggregation + result = self.aggregate( + lambda x: npfunc(x, axis=self.axis)) + + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + + return result set_function_name(f, name, cls) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 010047a8be4ed8..38478be5a8e07f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import _maybe_fill, isna @@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values): + if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - "categoricals are not support in cython ops ATM") + "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index db0eb44eabbfef..652f70746f6182 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -594,7 +594,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = astype_nansafe(values.ravel(), dtype, copy=True) + values = astype_nansafe( + values.ravel(), dtype, copy=True, **kwargs) # TODO(extension) # should we make this attribute? @@ -1746,6 +1747,27 @@ def _slice(self, slicer): return self.values[slicer] + def _try_cast_result(self, result, dtype=None): + """ + if we have an operation that operates on for example floats + we want to try to cast back to our EA here if possible + + result could be a 2-D numpy array, e.g. the result of + a numeric operation; but it must be shape (1, X) because + we by-definition operate on the ExtensionBlocks one-by-one + + result could also be an EA Array itself, in which case it + is already a 1-D array + """ + try: + + result = self._holder._from_sequence( + result.ravel(), dtype=dtype) + except Exception: + pass + + return result + def formatting_values(self): # Deprecating the ability to override _formatting_values. 
# Do the warning here, it's only user in pandas, since we diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 96b4ab7f3fbc6c..0806e6e927e8de 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -687,7 +687,10 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and dtype != 'object': + if (not (is_extension_array_dtype(subarr.dtype) or + is_extension_array_dtype(dtype)) and + is_object_dtype(subarr.dtype) and + not is_object_dtype(dtype)): inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7923e463c7719b..24a28bf0005cb1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -72,11 +72,12 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, **kwargs): + def __init__(self, name=None, **kwargs): + self.name = name self.kwargs = kwargs def __call__(self, alt): - bn_name = alt.__name__ + bn_name = self.name or alt.__name__ try: bn_func = getattr(bn, bn_name) @@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - @bottleneck_switch() + + @bottleneck_switch(name='nan' + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( @@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None): result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask, values.shape) - reduction.__name__ = 'nan' + meth return reduction diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 1929dad075695a..daeec5923888c3 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -64,6 +64,18 @@ def test_groupby_extension_apply( df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby('A').B.apply(lambda x: x.array) + expected = pd.Series([df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array], + index=pd.Index([1, 2, 3, 4], name='A'), + name='B') + self.assert_series_equal(result, expected) + def test_in_numeric_groupby(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping, diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 94c0b61c6382a2..ecef835a9c7977 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -192,7 +192,11 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): class TestGroupby(BaseDecimal, base.BaseGroupbyTests): - pass + + @pytest.mark.xfail( + reason="needs to correctly define __eq__ to handle nans, xref #27081.") + def test_groupby_apply_identity(self, data_for_grouping): + super().test_groupby_apply_identity(data_for_grouping) class TestSetitem(BaseDecimal, base.BaseSetitemTests): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f24fa0daa5b18f..58a43dc218d333 100644 --- a/pandas/tests/groupby/test_categorical.py +++ 
b/pandas/tests/groupby/test_categorical.py @@ -697,6 +697,27 @@ def test_preserve_categorical_dtype(): tm.assert_frame_equal(result2, expected) +@pytest.mark.parametrize( + 'func, values', + [('first', ['second', 'first']), + ('last', ['fourth', 'third']), + ('min', ['fourth', 'first']), + ('max', ['second', 'third'])]) +def test_preserve_on_ordered_ops(func, values): + # gh-18502 + # preserve the categoricals on ops + c = pd.Categorical(['first', 'second', 'third', 'fourth'], ordered=True) + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': c}) + g = df.groupby('payload') + result = getattr(g, func)() + expected = pd.DataFrame( + {'payload': [-2, -1], + 'col': pd.Series(values, dtype=c.dtype)}).set_index('payload') + tm.assert_frame_equal(result, expected) + + def test_categorical_no_compress(): data = Series(np.random.randn(9)) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 14f27f0c4c7d87..e4303c0a070760 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -12,7 +12,7 @@ from pandas import ( DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) import pandas.core.nanops as nanops -from pandas.util import testing as tm +from pandas.util import _test_decorators as td, testing as tm @pytest.mark.parametrize("agg_func", ['any', 'all']) @@ -144,6 +144,7 @@ def test_arg_passthru(): index=Index([1, 2], name='group'), columns=['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: f = getattr(df.groupby('group'), attr) result = f() @@ -459,35 +460,33 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) -def test_ops_general(): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] - try: - from scipy.stats import sem - except ImportError: - pass - else: - ops.append(('sem', sem)) +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + 'op,targop', + [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + pytest.param( + 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) +def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) def test_max_nan_bug(): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 6a08a8d79b63e8..b174fb0e0b6f96 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -282,18 +282,21 @@ def test_first_last_tz(data, expected_first, expected_last): ]) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 + category_string = pd.Series(list('abc')).astype( + 'category') df = 
pd.DataFrame({'group': [1, 1, 2], - 'category_string': pd.Series(list('abc')).astype( - 'category'), + 'category_string': category_string, 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern')}) result = getattr(df.groupby('group'), method)() - expepcted = pd.DataFrame({'category_string': [alpha, 'c'], - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) - assert_frame_equal(result, expepcted) + expected = pd.DataFrame( + {'category_string': pd.Categorical( + [alpha, 'c'], dtype=category_string.dtype), + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expected) def test_nth_multi_index_as_expected(): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5711174ef0c9fa..830ba6062cc720 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -112,6 +112,12 @@ def test_resample_integerarray(): dtype="Int64") assert_series_equal(result, expected) + result = ts.resample('3T').mean() + expected = Series([1, 4, 7], + index=pd.date_range('1/1/2000', periods=3, freq='3T'), + dtype='Int64') + assert_series_equal(result, expected) + def test_resample_basic_grouper(series): s = series diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 531a4360c78a2f..bf6055bc127259 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -29,11 +29,10 @@ def test_first_last_nth(self): sparse_grouped_last = sparse_grouped.last() sparse_grouped_nth = sparse_grouped.nth(1) - dense_grouped_first = dense_grouped.first().to_sparse() - dense_grouped_last = dense_grouped.last().to_sparse() - dense_grouped_nth = dense_grouped.nth(1).to_sparse() + dense_grouped_first = pd.DataFrame(dense_grouped.first().to_sparse()) + dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) + dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - # TODO: shouldn't these all be spares or not? 
tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) tm.assert_frame_equal(sparse_grouped_last, @@ -69,5 +68,6 @@ def test_groupby_includes_fill_value(fill_value): 'b': [fill_value, 1, fill_value, fill_value]}) sdf = df.to_sparse(fill_value=fill_value) result = sdf.groupby('a').sum() - expected = df.groupby('a').sum().to_sparse(fill_value=fill_value) + expected = pd.DataFrame(df.groupby('a').sum().to_sparse( + fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 114e7b4bacd94f..8f98117f20208f 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +from pandas import _np_version_under1p17 import pandas.util.testing as tm @@ -11,12 +12,13 @@ class TestPivotTable: def setup_method(self, method): + rs = np.random.RandomState(0) self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), + 'C': rs.randn(8), + 'D': rs.randn(8), 'E': [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan]}) self.sparse = self.dense.to_sparse() @@ -40,13 +42,16 @@ def test_pivot_table(self): values='E', aggfunc='mean') tm.assert_frame_equal(res_sparse, res_dense) - # ToDo: sum doesn't handle nan properly - # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - # values='E', aggfunc='sum') - # res_dense = pd.pivot_table(self.dense, index='A', columns='B', - # values='E', aggfunc='sum') - # tm.assert_frame_equal(res_sparse, res_dense) + def test_pivot_table_with_nans(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E', aggfunc='sum') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E', aggfunc='sum') + tm.assert_frame_equal(res_sparse, res_dense) + @pytest.mark.xfail(not _np_version_under1p17, + reason="failing occasionally on numpy > 1.17", + strict=False) def test_pivot_table_multi(self): res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', values=['D', 'E']) From 08a599b3478b5fb7d9e6edf6b9e0278809e6ac7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 27 Jun 2019 19:13:59 -0400 Subject: [PATCH 076/238] CLN: Dead version checking code post minimum version bump (#27063) * Remove dateutil version check tests * Remove more dead version checking code * Flake8 and fix bs4 comparison * Use import_optional_dependency again * Remove unused import * Change bs4 import test to check for ImportError and add whatsnew * Some linting * noqa necessary imports --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/html.py | 7 ++--- pandas/io/pytables.py | 5 ---- .../tests/indexes/datetimes/test_timezones.py | 13 +++------ pandas/tests/indexes/datetimes/test_tools.py | 9 ------ pandas/tests/io/conftest.py | 7 ----- pandas/tests/io/excel/test_style.py | 13 ++------- pandas/tests/io/pytables/test_pytables.py | 3 +- pandas/tests/io/test_html.py | 2 +- pandas/tests/io/test_parquet.py | 13 +++------ .../tests/scalar/timestamp/test_rendering.py | 11 ++------ .../tests/scalar/timestamp/test_timezones.py | 28 ++++++------------- pandas/tests/tseries/offsets/test_offsets.py | 20 ------------- 13 files changed, 26 insertions(+), 106 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8f677b1f7dc76a..73745fe0d59886 100644 --- 
a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -538,6 +538,7 @@ Other API changes - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) +- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) .. _whatsnew_0250.deprecations: diff --git a/pandas/io/html.py b/pandas/io/html.py index 15b9d25f6be6c3..d54489aabf1ed3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -4,7 +4,6 @@ """ from collections import abc -from distutils.version import LooseVersion import numbers import os import re @@ -830,10 +829,8 @@ def _parser_dispatch(flavor): if not _HAS_BS4: raise ImportError( "BeautifulSoup4 (bs4) not found, please install it") - import bs4 - if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'): - raise ValueError("A minimum version of BeautifulSoup 4.2.1 " - "is required") + # Although we call this above, we want to raise here right before use. + bs4 = import_optional_dependency('bs4') # noqa:F841 else: if not _HAS_LXML: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 97d5b1dd2a1e5f..c8c27f62cef343 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5,7 +5,6 @@ import copy from datetime import date, datetime -from distutils.version import LooseVersion import itertools import os import re @@ -227,10 +226,6 @@ def _tables(): import tables _table_mod = tables - # version requirements - if LooseVersion(tables.__version__) < LooseVersion('3.0.0'): - raise ImportError("PyTables version >= 3.0.0 is required") - # set the file open policy # return the file open policy; this changes as of pytables 3.1 # depending on the HDF5 version diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 368dc68e516df9..908d563eca8fa6 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -2,7 +2,6 @@ Tests for DatetimeIndex timezone-related methods """ from datetime import date, datetime, time, timedelta, tzinfo -from distutils.version import LooseVersion import dateutil from dateutil.tz import gettz, tzlocal @@ -554,14 +553,10 @@ def test_dti_construction_ambiguous_endpoint(self, tz): assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") if str(tz).startswith('dateutil'): - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # see GH#14621 - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', - tz=tz, freq="H") + # fixed ambiguous behavior + # see GH#14621 + assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', + tz=tz, freq="H") else: assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', tz=tz, freq="H") diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 2a5ae92cb59f50..a971a1088860a7 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2,10 +2,8 @@ import calendar 
from datetime import datetime, time -from distutils.version import LooseVersion import locale -import dateutil from dateutil.parser import parse from dateutil.tz.tz import tzoffset import numpy as np @@ -1739,8 +1737,6 @@ def test_parsers_dayfirst_yearfirst(self, cache): # 2.5.2 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 - is_lt_253 = LooseVersion(dateutil.__version__) < LooseVersion('2.5.3') - # str : dayfirst, yearfirst, expected cases = {'10-11-12': [(False, False, datetime(2012, 10, 11)), @@ -1762,11 +1758,6 @@ def test_parsers_dayfirst_yearfirst(self, cache): for date_str, values in cases.items(): for dayfirst, yearfirst, expected in values: - # odd comparisons across version - # let's just skip - if dayfirst and yearfirst and is_lt_253: - continue - # compare with dateutil result dateutil_result = parse(date_str, dayfirst=dayfirst, yearfirst=yearfirst) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index a4e778a68c728b..d431e835a07cef 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion import os import pytest @@ -43,12 +42,6 @@ def s3_resource(tips_file, jsonl_file): """ pytest.importorskip('s3fs') boto3 = pytest.importorskip('boto3') - botocore = pytest.importorskip('botocore') - - if LooseVersion(botocore.__version__) < LooseVersion("1.11.0"): - # botocore leaks an uncatchable ResourceWarning before 1.11.0; - # see GH 23731 and https://github.com/boto/botocore/issues/1464 - pytest.skip("botocore is leaking resources before 1.11.0") with tm.ensure_safe_environment_variables(): # temporary workaround as moto fails for botocore >= 1.11 otherwise, diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index d8426f54bb188f..d8971777f6eb47 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -1,5 +1,3 @@ -from distutils.version import LooseVersion - import numpy as np import pytest @@ -107,15 +105,8 @@ def custom_converter(css): assert cell1.font.color.rgb != cell2.font.color.rgb assert cell2.font.color.rgb == alpha + '0000FF' elif ref == 'D4': - # This fails with engine=xlsxwriter due to - # https://bitbucket.org/openpyxl/openpyxl/issues/800 - if engine == 'xlsxwriter' \ - and (LooseVersion(openpyxl.__version__) < - LooseVersion('2.4.6')): - pass - else: - assert cell1.font.underline != cell2.font.underline - assert cell2.font.underline == 'single' + assert cell1.font.underline != cell2.font.underline + assert cell2.font.underline == 'single' elif ref == 'B5': assert not cell1.border.left.style assert (cell2.border.top.style == diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index be318ede2df4a9..40cc05c3174710 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -43,8 +43,7 @@ 'release beyong 3.4.4 to support numpy 1.16x')) -_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= - LooseVersion('2.2') else 'zlib') +_default_compressor = 'blosc' ignore_natural_naming_warning = pytest.mark.filterwarnings( diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bd6fc6f57c4963..9f9fcabbfe42c1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -54,7 +54,7 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): def test_bs4_version_fails(monkeypatch, datapath): 
import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') - with pytest.raises(ValueError, match="minimum version"): + with pytest.raises(ImportError, match="Pandas requires version"): read_html(datapath("io", "data", "spam.html"), flavor='bs4') diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index db5c92fb681a2d..f5f8dac71d095f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,6 +1,5 @@ """ test parquet compat """ import datetime -from distutils.version import LooseVersion import os from warnings import catch_warnings @@ -454,10 +453,8 @@ class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): df = df_full - # additional supported types for fastparquet - if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'): - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='US/Eastern') + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='US/Eastern') df['timedelta'] = pd.timedelta_range('1 day', periods=3) check_round_trip(df, fp) @@ -485,8 +482,6 @@ def test_unsupported(self, fp): self.check_error_on_write(df, fp, ValueError) def test_categorical(self, fp): - if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"): - pytest.skip("CategoricalDtype not supported for older fp") df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) check_round_trip(df, fp) @@ -512,7 +507,7 @@ def test_partition_cols_supported(self, fp, df_full): df.to_parquet(path, engine="fastparquet", partition_cols=partition_cols, compression=None) assert os.path.exists(path) - import fastparquet + import fastparquet # noqa: F811 actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 @@ -524,7 +519,7 @@ def test_partition_on_supported(self, fp, df_full): df.to_parquet(path, engine="fastparquet", compression=None, partition_on=partition_cols) assert os.path.exists(path) - import fastparquet + import fastparquet # noqa: F811 actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 diff --git a/pandas/tests/scalar/timestamp/test_rendering.py b/pandas/tests/scalar/timestamp/test_rendering.py index c16ab39d642d9d..69ea0a810c4ce3 100644 --- a/pandas/tests/scalar/timestamp/test_rendering.py +++ b/pandas/tests/scalar/timestamp/test_rendering.py @@ -1,7 +1,5 @@ -from distutils.version import LooseVersion import pprint -import dateutil import pytest import pytz # noqa # a test below uses pytz but only inside a `eval` call @@ -10,13 +8,8 @@ class TestTimestampRendering: - # dateutil zone change (only matters for repr) - if LooseVersion(dateutil.__version__) >= LooseVersion('2.6.0'): - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific'] - else: - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/America/Los_Angeles'] + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific'] @pytest.mark.parametrize('tz', timezones) @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index f67ecdaf746f6b..914423fcf5ba77 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -2,7 +2,6 @@ Tests for Timestamp timezone-related methods """ from datetime import date, datetime, timedelta -from distutils.version import LooseVersion import dateutil from dateutil.tz import gettz, tzoffset @@ -145,18 +144,11 @@ def 
test_tz_localize_ambiguous_compat(self): assert result_pytz.value == result_dateutil.value assert result_pytz.value == 1382835600000000000 - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # dateutil 2.6 buggy w.r.t. ambiguous=0 - # see gh-14621 - # see https://github.com/dateutil/dateutil/issues/321 - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - assert str(result_pytz) == str(result_dateutil) - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert result_pytz.to_pydatetime().tzname() == 'GMT' - assert result_dateutil.to_pydatetime().tzname() == 'BST' - assert str(result_pytz) != str(result_dateutil) + # fixed ambiguous behavior + # see gh-14621 + assert result_pytz.to_pydatetime().tzname() == 'GMT' + assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert str(result_pytz) != str(result_dateutil) # 1 hour difference result_pytz = naive.tz_localize(pytz_zone, ambiguous=1) @@ -164,12 +156,10 @@ def test_tz_localize_ambiguous_compat(self): assert result_pytz.value == result_dateutil.value assert result_pytz.value == 1382832000000000000 - # dateutil < 2.6 is buggy w.r.t. ambiguous timezones - if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'): - # see gh-14621 - assert str(result_pytz) == str(result_dateutil) - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) + # see gh-14621 + assert str(result_pytz) == str(result_dateutil) + assert (result_pytz.to_pydatetime().tzname() == + result_dateutil.to_pydatetime().tzname()) @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), gettz('US/Eastern'), diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 151cd2a42ecef6..5683924ee1283f 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,5 +1,4 @@ from datetime import date, datetime, timedelta -from distutils.version import LooseVersion import numpy as np import pytest @@ -2998,25 +2997,6 @@ def _make_timestamp(self, string, hrs_offset, tz): offset_string = '-{hrs:02d}00'.format(hrs=-1 * hrs_offset) return Timestamp(string + offset_string).tz_convert(tz) - def test_fallback_plural(self): - # test moving from daylight savings to standard time - import dateutil - for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_daylight'] - hrs_post = utc_offsets['utc_offset_standard'] - - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # buggy ambiguous behavior in 2.6.0 - # GH 14621 - # https://github.com/dateutil/dateutil/issues/321 - self._test_all_offsets( - n=3, tstart=self._make_timestamp(self.ts_pre_fallback, - hrs_pre, tz), - expected_utc_offset=hrs_post) - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed, but skip the test - continue - def test_springforward_plural(self): # test moving from standard to daylight savings for tz, utc_offsets in self.timezone_utc_offsets.items(): From 87d7cdf2600223f67df4b73edac4252d71155c9f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Jun 2019 18:20:10 -0500 Subject: [PATCH 077/238] Allow multiple lambdas in Groupby.aggregate (#26905) --- doc/source/user_guide/groupby.rst | 23 ++++ doc/source/whatsnew/v0.25.0.rst | 20 ++++ pandas/core/groupby/generic.py | 106 +++++++++++++++++- .../tests/groupby/aggregate/test_aggregate.py | 97 ++++++++++++++-- 4 files changed, 231 insertions(+), 15 
deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 54e26c155595ba..147f07e36efb84 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,6 +568,29 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) +.. note:: + + In general, the output column names should be unique. You can't apply + the same function (or two functions with the same name) to the same + column. + + .. ipython:: python + :okexcept: + + grouped['C'].agg(['sum', 'sum']) + + + Pandas *does* allow you to provide multiple lambdas. In this case, pandas + will mangle the name of the (nameless) lambda functions, appending ``_`` + to each subsequent lambda. + + .. ipython:: python + + grouped['C'].agg([lambda x: x.max() - x.min(), + lambda x: x.median() - x.mean()]) + + + .. _groupby.aggregate.named: Named aggregation diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 73745fe0d59886..da939687500b66 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -79,6 +79,26 @@ a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.depreca See :ref:`groupby.aggregate.named` for more. +.. _whatsnew_0250.enhancements.multiple_lambdas: + +Groupby Aggregation with multiple lambdas +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can now provide multiple lambda functions to a list-like aggregation in +:class:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`). + +.. ipython:: python + + animals.groupby('kind').height.agg([ + lambda x: x.iloc[0], lambda x: x.iloc[-1] + ]) + + animals.groupby('kind').agg([ + lambda x: x.iloc[0] - x.iloc[1], + lambda x: x.iloc[0] + x.iloc[1] + ]) + +Previously, these raised a ``SpecificationError``. .. _whatsnew_0250.enhancements.multi_index_repr: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7c8c7956f8cb4a..72c8d330170d4c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,13 +5,13 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ - from collections import OrderedDict, abc, namedtuple import copy +import functools from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Iterator, List, Type, Union +from typing import Any, Callable, FrozenSet, Iterator, Sequence, Type, Union import warnings import numpy as np @@ -24,9 +24,9 @@ from pandas.core.dtypes.cast import ( maybe_convert_objects, maybe_downcast_to_dtype) from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, is_datetimelike, - is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_object_dtype, - is_scalar) + ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, + is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna from pandas._typing import FrameOrSeries @@ -49,6 +49,10 @@ NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
AggScalar = Union[str, Callable[..., Any]] +# TODO: validate types on ScalarResult and move to _typing +# Blocked from using by https://github.com/python/mypy/issues/1484 +# See note at _mangle_lambda_list +ScalarResult = typing.TypeVar("ScalarResult") def whitelist_method_generator(base_class: Type[GroupBy], @@ -217,6 +221,8 @@ def aggregate(self, func, *args, **kwargs): raise TypeError("Must provide 'func' or tuples of " "'(column, aggfunc).") + func = _maybe_mangle_lambdas(func) + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -830,6 +836,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): if isinstance(func_or_funcs, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. + func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) if relabeling: @@ -1698,7 +1705,10 @@ def _normalize_keyword_aggregation(kwargs): # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) - aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] + # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = OrderedDict() order = [] columns, pairs = list(zip(*kwargs.items())) @@ -1712,6 +1722,90 @@ def _normalize_keyword_aggregation(kwargs): return aggspec, columns, order +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = functools.partial(aggfunc) + aggfunc.__name__ = ''.format(i) + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def _maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to NDFrameGroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. 
+ + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec + + def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ If we have date/time like in the original, then coerce dates diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 801b99fed5ce6d..ea59cde54f17bd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError +from pandas.core.groupby.generic import _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -210,15 +211,6 @@ def test_multiple_functions_tuples_and_non_tuples(df): tm.assert_frame_equal(result, expected) -def test_agg_multiple_functions_too_many_lambdas(df): - grouped = df.groupby('A') - funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - - msg = 'Function names must be unique, found multiple named ' - with pytest.raises(SpecificationError, match=msg): - grouped.agg(funcs) - - def test_more_flexible_frame_multi_function(df): grouped = df.groupby('A') @@ -362,6 +354,12 @@ def test_series_named_agg_duplicates_raises(self): with pytest.raises(SpecificationError): gr.agg(a='sum', b='sum') + def test_mangled(self): + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + result = gr.agg(a=lambda x: 0, b=lambda x: 1) + expected = pd.DataFrame({'a': [0, 0], 'b': [1, 1]}) + tm.assert_frame_equal(result, expected) + class TestNamedAggregationDataFrame: def test_agg_relabel(self): @@ -458,3 +456,84 @@ def test_agg_namedtuple(self): expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) tm.assert_frame_equal(result, expected) + + def test_mangled(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) + result = df.groupby("A").agg( + b=("B", lambda x: 0), + c=("C", lambda x: 1) + ) + expected = pd.DataFrame({"b": [0, 0], "c": [1, 1]}, + index=pd.Index([0, 1], name='A')) + tm.assert_frame_equal(result, expected) + + +class TestLambdaMangling: + + def test_maybe_mangle_lambdas_passthrough(self): + assert _maybe_mangle_lambdas('mean') == 'mean' + assert _maybe_mangle_lambdas(lambda x: x).__name__ == '' + # don't mangel single lambda. 
+ assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == '' + + def test_maybe_mangle_lambdas_listlike(self): + aggfuncs = [lambda x: 1, lambda x: 2] + result = _maybe_mangle_lambdas(aggfuncs) + assert result[0].__name__ == '' + assert result[1].__name__ == '' + assert aggfuncs[0](None) == result[0](None) + assert aggfuncs[1](None) == result[1](None) + + def test_maybe_mangle_lambdas(self): + func = { + 'A': [lambda x: 0, lambda x: 1] + } + result = _maybe_mangle_lambdas(func) + assert result['A'][0].__name__ == '' + assert result['A'][1].__name__ == '' + + def test_maybe_mangle_lambdas_args(self): + func = { + 'A': [lambda x, a, b=1: (0, a, b), lambda x: 1] + } + result = _maybe_mangle_lambdas(func) + assert result['A'][0].__name__ == '' + assert result['A'][1].__name__ == '' + + assert func['A'][0](0, 1) == (0, 1, 1) + assert func['A'][0](0, 1, 2) == (0, 1, 2) + assert func['A'][0](0, 2, b=3) == (0, 2, 3) + + def test_maybe_mangle_lambdas_named(self): + func = OrderedDict([('C', np.mean), + ('D', OrderedDict([('foo', np.mean), + ('bar', np.mean)]))]) + result = _maybe_mangle_lambdas(func) + assert result == func + + def test_basic(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) + + expected = pd.DataFrame({("B", ""): [0, 0], + ("B", ""): [1, 1]}, + index=pd.Index([0, 1], name='A')) + tm.assert_frame_equal(result, expected) + + def test_mangle_series_groupby(self): + gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) + result = gr.agg([lambda x: 0, lambda x: 1]) + expected = pd.DataFrame({'': [0, 0], '': [1, 1]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.") + def test_with_kwargs(self): + f1 = lambda x, y, b=1: x.sum() + y + b + f2 = lambda x, y, b=2: x.sum() + y * b + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) + expected = pd.DataFrame({'': [4], '': [6]}) + tm.assert_frame_equal(result, expected) + + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) + expected = pd.DataFrame({'': [13], '': [30]}) + tm.assert_frame_equal(result, expected) From bd72942eac5c175942034cdb8b8dcf96cb562084 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 28 Jun 2019 14:13:22 +0200 Subject: [PATCH 078/238] Fixturize tests/frame/test_axis_select_reindex.py (#25627) --- .../tests/frame/test_axis_select_reindex.py | 280 +++++++++--------- 1 file changed, 148 insertions(+), 132 deletions(-) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 42f98d5c96aa5d..b4fde43ff30556 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -8,12 +8,11 @@ import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna) -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal -class TestDataFrameSelectReindex(TestData): +class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing @@ -204,34 +203,36 @@ def test_merge_join_different_levels(self): result = df1.join(df2, on='a') tm.assert_frame_equal(result, expected) - def test_reindex(self): - newFrame = self.frame.reindex(self.ts1.index) + def test_reindex(self, float_frame): + datetime_series = tm.makeTimeSeries(nper=30) + + newFrame = 
float_frame.reindex(datetime_series.index) for col in newFrame.columns: for idx, val in newFrame[col].items(): - if idx in self.frame.index: + if idx in float_frame.index: if np.isnan(val): - assert np.isnan(self.frame[col][idx]) + assert np.isnan(float_frame[col][idx]) else: - assert val == self.frame[col][idx] + assert val == float_frame[col][idx] else: assert np.isnan(val) for col, series in newFrame.items(): assert tm.equalContents(series.index, newFrame.index) - emptyFrame = self.frame.reindex(Index([])) + emptyFrame = float_frame.reindex(Index([])) assert len(emptyFrame.index) == 0 # Cython code should be unit-tested directly - nonContigFrame = self.frame.reindex(self.ts1.index[::2]) + nonContigFrame = float_frame.reindex(datetime_series.index[::2]) for col in nonContigFrame.columns: for idx, val in nonContigFrame[col].items(): - if idx in self.frame.index: + if idx in float_frame.index: if np.isnan(val): - assert np.isnan(self.frame[col][idx]) + assert np.isnan(float_frame[col][idx]) else: - assert val == self.frame[col][idx] + assert val == float_frame[col][idx] else: assert np.isnan(val) @@ -241,28 +242,28 @@ def test_reindex(self): # corner cases # Same index, copies values but not index if copy=False - newFrame = self.frame.reindex(self.frame.index, copy=False) - assert newFrame.index is self.frame.index + newFrame = float_frame.reindex(float_frame.index, copy=False) + assert newFrame.index is float_frame.index # length zero - newFrame = self.frame.reindex([]) + newFrame = float_frame.reindex([]) assert newFrame.empty - assert len(newFrame.columns) == len(self.frame.columns) + assert len(newFrame.columns) == len(float_frame.columns) # length zero with columns reindexed with non-empty index - newFrame = self.frame.reindex([]) - newFrame = newFrame.reindex(self.frame.index) - assert len(newFrame.index) == len(self.frame.index) - assert len(newFrame.columns) == len(self.frame.columns) + newFrame = float_frame.reindex([]) + newFrame = newFrame.reindex(float_frame.index) + assert len(newFrame.index) == len(float_frame.index) + assert len(newFrame.columns) == len(float_frame.columns) # pass non-Index - newFrame = self.frame.reindex(list(self.ts1.index)) - tm.assert_index_equal(newFrame.index, self.ts1.index) + newFrame = float_frame.reindex(list(datetime_series.index)) + tm.assert_index_equal(newFrame.index, datetime_series.index) # copy with no axes - result = self.frame.reindex() - assert_frame_equal(result, self.frame) - assert result is not self.frame + result = float_frame.reindex() + assert_frame_equal(result, float_frame) + assert result is not float_frame def test_reindex_nan(self): df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]], @@ -305,32 +306,32 @@ def test_reindex_name_remains(self): df = df.reindex(columns=i) assert df.columns.name == 'iname' - def test_reindex_int(self): - smaller = self.intframe.reindex(self.intframe.index[::2]) + def test_reindex_int(self, int_frame): + smaller = int_frame.reindex(int_frame.index[::2]) assert smaller['A'].dtype == np.int64 - bigger = smaller.reindex(self.intframe.index) + bigger = smaller.reindex(int_frame.index) assert bigger['A'].dtype == np.float64 - smaller = self.intframe.reindex(columns=['A', 'B']) + smaller = int_frame.reindex(columns=['A', 'B']) assert smaller['A'].dtype == np.int64 - def test_reindex_like(self): - other = self.frame.reindex(index=self.frame.index[:10], - columns=['C', 'B']) + def test_reindex_like(self, float_frame): + other = float_frame.reindex(index=float_frame.index[:10], + columns=['C', 'B']) - 
assert_frame_equal(other, self.frame.reindex_like(other)) + assert_frame_equal(other, float_frame.reindex_like(other)) - def test_reindex_columns(self): - new_frame = self.frame.reindex(columns=['A', 'B', 'E']) + def test_reindex_columns(self, float_frame): + new_frame = float_frame.reindex(columns=['A', 'B', 'E']) - tm.assert_series_equal(new_frame['B'], self.frame['B']) + tm.assert_series_equal(new_frame['B'], float_frame['B']) assert np.isnan(new_frame['E']).all() assert 'C' not in new_frame # Length zero - new_frame = self.frame.reindex(columns=[]) + new_frame = float_frame.reindex(columns=[]) assert new_frame.empty def test_reindex_columns_method(self): @@ -545,41 +546,41 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_align(self): - af, bf = self.frame.align(self.frame) - assert af._data is not self.frame._data + def test_align_float(self, float_frame): + af, bf = float_frame.align(float_frame) + assert af._data is not float_frame._data - af, bf = self.frame.align(self.frame, copy=False) - assert af._data is self.frame._data + af, bf = float_frame.align(float_frame, copy=False) + assert af._data is float_frame._data # axis = 0 - other = self.frame.iloc[:-5, :3] - af, bf = self.frame.align(other, axis=0, fill_value=-1) + other = float_frame.iloc[:-5, :3] + af, bf = float_frame.align(other, axis=0, fill_value=-1) tm.assert_index_equal(bf.columns, other.columns) # test fill value - join_idx = self.frame.index.join(other.index) - diff_a = self.frame.index.difference(join_idx) + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values diff_b_vals = bf.reindex(diff_b).values assert (diff_a_vals == -1).all() - af, bf = self.frame.align(other, join='right', axis=0) + af, bf = float_frame.align(other, join='right', axis=0) tm.assert_index_equal(bf.columns, other.columns) tm.assert_index_equal(bf.index, other.index) tm.assert_index_equal(af.index, other.index) # axis = 1 - other = self.frame.iloc[:-5, :3].copy() - af, bf = self.frame.align(other, axis=1) - tm.assert_index_equal(bf.columns, self.frame.columns) + other = float_frame.iloc[:-5, :3].copy() + af, bf = float_frame.align(other, axis=1) + tm.assert_index_equal(bf.columns, float_frame.columns) tm.assert_index_equal(bf.index, other.index) # test fill value - join_idx = self.frame.index.join(other.index) - diff_a = self.frame.index.difference(join_idx) + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values @@ -588,55 +589,38 @@ def test_align(self): assert (diff_a_vals == -1).all() - af, bf = self.frame.align(other, join='inner', axis=1) - tm.assert_index_equal(bf.columns, other.columns) - - af, bf = self.frame.align(other, join='inner', axis=1, method='pad') + af, bf = float_frame.align(other, join='inner', axis=1) tm.assert_index_equal(bf.columns, other.columns) - # test other non-float types - af, bf = self.intframe.align(other, join='inner', axis=1, method='pad') + af, bf = float_frame.align(other, join='inner', axis=1, method='pad') tm.assert_index_equal(bf.columns, other.columns) - af, bf = self.mixed_frame.align(self.mixed_frame, - join='inner', axis=1, method='pad') - tm.assert_index_equal(bf.columns, self.mixed_frame.columns) - - af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, 
fill_value=None) - tm.assert_index_equal(bf.index, Index([])) - - af, bf = self.frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) - tm.assert_index_equal(bf.index, Index([])) - - # mixed floats/ints - af, bf = self.mixed_float.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, + method=None, fill_value=None) tm.assert_index_equal(bf.index, Index([])) - af, bf = self.mixed_int.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, + method=None, fill_value=0) tm.assert_index_equal(bf.index, Index([])) # Try to align DataFrame to Series along bad axis with pytest.raises(ValueError): - self.frame.align(af.iloc[0, :3], join='inner', axis=2) + float_frame.align(af.iloc[0, :3], join='inner', axis=2) # align dataframe to series with broadcast or not - idx = self.frame.index + idx = float_frame.index s = Series(range(len(idx)), index=idx) - left, right = self.frame.align(s, axis=0) - tm.assert_index_equal(left.index, self.frame.index) - tm.assert_index_equal(right.index, self.frame.index) + left, right = float_frame.align(s, axis=0) + tm.assert_index_equal(left.index, float_frame.index) + tm.assert_index_equal(right.index, float_frame.index) assert isinstance(right, Series) - left, right = self.frame.align(s, broadcast_axis=1) - tm.assert_index_equal(left.index, self.frame.index) - expected = {c: s for c in self.frame.columns} - expected = DataFrame(expected, index=self.frame.index, - columns=self.frame.columns) + left, right = float_frame.align(s, broadcast_axis=1) + tm.assert_index_equal(left.index, float_frame.index) + expected = {c: s for c in float_frame.columns} + expected = DataFrame(expected, index=float_frame.index, + columns=float_frame.columns) tm.assert_frame_equal(right, expected) # see gh-9558 @@ -649,6 +633,34 @@ def test_align(self): expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]}) tm.assert_frame_equal(result, expected) + def test_align_int(self, int_frame): + # test other non-float types + other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + + af, bf = int_frame.align(other, join='inner', axis=1, method='pad') + tm.assert_index_equal(bf.columns, other.columns) + + def test_align_mixed_type(self, float_string_frame): + + af, bf = float_string_frame.align(float_string_frame, + join='inner', axis=1, method='pad') + tm.assert_index_equal(bf.columns, float_string_frame.columns) + + def test_align_mixed_float(self, mixed_float_frame): + # mixed floats/ints + other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + + af, bf = mixed_float_frame.align(other.iloc[:, 0], join='inner', + axis=1, method=None, fill_value=0) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_mixed_int(self, mixed_int_frame): + other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + + af, bf = mixed_int_frame.align(other.iloc[:, 0], join='inner', axis=1, + method=None, fill_value=0) + tm.assert_index_equal(bf.index, Index([])) + def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis) @@ -676,13 +688,14 @@ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): @pytest.mark.parametrize('ax', [0, 1, None]) @pytest.mark.parametrize('fax', [0, 1]) @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) - def test_align_fill_method(self, 
how, meth, ax, fax): - self._check_align_fill(how, meth, ax, fax) + def test_align_fill_method(self, how, meth, ax, fax, float_frame): + df = float_frame + self._check_align_fill(df, how, meth, ax, fax) - def _check_align_fill(self, kind, meth, ax, fax): - left = self.frame.iloc[0:4, :10] - right = self.frame.iloc[2:, 6:] - empty = self.frame.iloc[:0, :0] + def _check_align_fill(self, frame, kind, meth, ax, fax): + left = frame.iloc[0:4, :10] + right = frame.iloc[2:, 6:] + empty = frame.iloc[:0, :0] self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) @@ -775,24 +788,24 @@ def test_align_series_combinations(self): tm.assert_series_equal(res1, exp2) tm.assert_frame_equal(res2, exp1) - def test_filter(self): + def test_filter(self, float_frame, float_string_frame): # Items - filtered = self.frame.filter(['A', 'B', 'E']) + filtered = float_frame.filter(['A', 'B', 'E']) assert len(filtered.columns) == 2 assert 'E' not in filtered - filtered = self.frame.filter(['A', 'B', 'E'], axis='columns') + filtered = float_frame.filter(['A', 'B', 'E'], axis='columns') assert len(filtered.columns) == 2 assert 'E' not in filtered # Other axis - idx = self.frame.index[0:4] - filtered = self.frame.filter(idx, axis='index') - expected = self.frame.reindex(index=idx) + idx = float_frame.index[0:4] + filtered = float_frame.filter(idx, axis='index') + expected = float_frame.reindex(index=idx) tm.assert_frame_equal(filtered, expected) # like - fcopy = self.frame.copy() + fcopy = float_frame.copy() fcopy['AA'] = 1 filtered = fcopy.filter(like='A') @@ -819,35 +832,35 @@ def test_filter(self): # pass in None with pytest.raises(TypeError, match='Must pass'): - self.frame.filter() + float_frame.filter() with pytest.raises(TypeError, match='Must pass'): - self.frame.filter(items=None) + float_frame.filter(items=None) with pytest.raises(TypeError, match='Must pass'): - self.frame.filter(axis=1) + float_frame.filter(axis=1) # test mutually exclusive arguments with pytest.raises(TypeError, match='mutually exclusive'): - self.frame.filter(items=['one', 'three'], regex='e$', like='bbi') + float_frame.filter(items=['one', 'three'], regex='e$', like='bbi') with pytest.raises(TypeError, match='mutually exclusive'): - self.frame.filter(items=['one', 'three'], regex='e$', axis=1) + float_frame.filter(items=['one', 'three'], regex='e$', axis=1) with pytest.raises(TypeError, match='mutually exclusive'): - self.frame.filter(items=['one', 'three'], regex='e$') + float_frame.filter(items=['one', 'three'], regex='e$') with pytest.raises(TypeError, match='mutually exclusive'): - self.frame.filter(items=['one', 'three'], like='bbi', axis=0) + float_frame.filter(items=['one', 'three'], like='bbi', axis=0) with pytest.raises(TypeError, match='mutually exclusive'): - self.frame.filter(items=['one', 'three'], like='bbi') + float_frame.filter(items=['one', 'three'], like='bbi') # objects - filtered = self.mixed_frame.filter(like='foo') + filtered = float_string_frame.filter(like='foo') assert 'foo' in filtered # unicode columns, won't ascii-encode - df = self.frame.rename(columns={'B': '\u2202'}) + df = float_frame.rename(columns={'B': '\u2202'}) filtered = df.filter(like='C') assert 'C' in filtered - def test_filter_regex_search(self): - fcopy = self.frame.copy() + def test_filter_regex_search(self, float_frame): + fcopy = float_frame.copy() fcopy['AA'] = 1 # regex @@ -895,10 +908,10 @@ def test_filter_corner(self): result = empty.filter(like='foo') assert_frame_equal(result, empty) - def test_take(self): + def 
test_take(self, float_frame): # homogeneous order = [3, 1, 2, 0] - for df in [self.frame]: + for df in [float_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) @@ -911,7 +924,7 @@ def test_take(self): # negative indices order = [2, 1, -1] - for df in [self.frame]: + for df in [float_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) @@ -941,9 +954,11 @@ def test_take(self): with pytest.raises(IndexError, match=msg): df.take([3, 1, 2, -5], axis=1) + def test_take_mixed_type(self, float_string_frame): + # mixed-dtype order = [4, 1, 2, 0, 3] - for df in [self.mixed_frame]: + for df in [float_string_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) @@ -956,7 +971,7 @@ def test_take(self): # negative indices order = [4, 1, -2] - for df in [self.mixed_frame]: + for df in [float_string_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) @@ -967,9 +982,10 @@ def test_take(self): expected = df.loc[:, ['foo', 'B', 'D']] assert_frame_equal(result, expected) + def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): # by dtype order = [1, 2, 0, 3] - for df in [self.mixed_float, self.mixed_int]: + for df in [mixed_float_frame, mixed_int_frame]: result = df.take(order, axis=0) expected = df.reindex(df.index.take(order)) @@ -993,49 +1009,49 @@ def test_reindex_boolean(self): assert reindexed.values.dtype == np.object_ assert isna(reindexed[1]).all() - def test_reindex_objects(self): - reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) + def test_reindex_objects(self, float_string_frame): + reindexed = float_string_frame.reindex(columns=['foo', 'A', 'B']) assert 'foo' in reindexed - reindexed = self.mixed_frame.reindex(columns=['A', 'B']) + reindexed = float_string_frame.reindex(columns=['A', 'B']) assert 'foo' not in reindexed - def test_reindex_corner(self): + def test_reindex_corner(self, int_frame): index = Index(['a', 'b', 'c']) - dm = self.empty.reindex(index=[1, 2, 3]) + dm = DataFrame({}).reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) tm.assert_index_equal(reindexed.columns, index) # ints are weird - smaller = self.intframe.reindex(columns=['A', 'B', 'E']) + smaller = int_frame.reindex(columns=['A', 'B', 'E']) assert smaller['E'].dtype == np.float64 - def test_reindex_axis(self): + def test_reindex_axis(self, float_frame, int_frame): cols = ['A', 'B', 'E'] with tm.assert_produces_warning(FutureWarning) as m: - reindexed1 = self.intframe.reindex_axis(cols, axis=1) + reindexed1 = int_frame.reindex_axis(cols, axis=1) assert 'reindex' in str(m[0].message) - reindexed2 = self.intframe.reindex(columns=cols) + reindexed2 = int_frame.reindex(columns=cols) assert_frame_equal(reindexed1, reindexed2) - rows = self.intframe.index[0:5] + rows = int_frame.index[0:5] with tm.assert_produces_warning(FutureWarning) as m: - reindexed1 = self.intframe.reindex_axis(rows, axis=0) + reindexed1 = int_frame.reindex_axis(rows, axis=0) assert 'reindex' in str(m[0].message) - reindexed2 = self.intframe.reindex(index=rows) + reindexed2 = int_frame.reindex(index=rows) assert_frame_equal(reindexed1, reindexed2) msg = ("No axis named 2 for object type" " ") with pytest.raises(ValueError, match=msg): - self.intframe.reindex_axis(rows, axis=2) + int_frame.reindex_axis(rows, axis=2) # no-op case - cols = self.frame.columns.copy() + cols = float_frame.columns.copy() with tm.assert_produces_warning(FutureWarning) as m: - newFrame = 
self.frame.reindex_axis(cols, axis=1) + newFrame = float_frame.reindex_axis(cols, axis=1) assert 'reindex' in str(m[0].message) - assert_frame_equal(newFrame, self.frame) + assert_frame_equal(newFrame, float_frame) def test_reindex_with_nans(self): df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], From a65b2e31258d69cddc0810a7645a3c4eb9ee3d91 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 28 Jun 2019 14:13:42 +0200 Subject: [PATCH 079/238] Fixturize tests/frame/test_indexing.py (#25633) --- pandas/tests/frame/conftest.py | 11 + pandas/tests/frame/test_indexing.py | 661 ++++++++++++++-------------- 2 files changed, 342 insertions(+), 330 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index d8a590bc492a48..61a8ea0c384ba3 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -253,6 +253,17 @@ def timezone_frame(): return df +@pytest.fixture +def uint64_frame(): + """ + Fixture for DataFrame with uint64 values + + Columns are ['A', 'B'] + """ + return DataFrame({'A': np.arange(3), 'B': [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64) + + @pytest.fixture def simple_frame(): """ diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 3c9558d5cbd108..3b8daa28227f8a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -25,9 +25,9 @@ class TestDataFrameIndexing(TestData): - def test_getitem(self): + def test_getitem(self, float_frame): # Slicing - sl = self.frame[:20] + sl = float_frame[:20] assert len(sl.index) == 20 # Column access @@ -35,14 +35,14 @@ def test_getitem(self): assert len(series.index) == 20 assert tm.equalContents(series.index, sl.index) - for key, _ in self.frame._series.items(): - assert self.frame[key] is not None + for key, _ in float_frame._series.items(): + assert float_frame[key] is not None - assert 'random' not in self.frame + assert 'random' not in float_frame with pytest.raises(KeyError, match='random'): - self.frame['random'] + float_frame['random'] - df = self.frame.copy() + df = float_frame.copy() df['$10'] = np.random.randn(len(df)) ad = np.random.randn(len(df)) @@ -59,13 +59,13 @@ def test_getitem_dupe_cols(self): with pytest.raises(KeyError): df[['baf']] - def test_get(self): - b = self.frame.get('B') - assert_series_equal(b, self.frame['B']) + def test_get(self, float_frame): + b = float_frame.get('B') + assert_series_equal(b, float_frame['B']) - assert self.frame.get('foo') is None - assert_series_equal(self.frame.get('foo', self.frame['B']), - self.frame['B']) + assert float_frame.get('foo') is None + assert_series_equal(float_frame.get('foo', float_frame['B']), + float_frame['B']) @pytest.mark.parametrize("df", [ DataFrame(), @@ -76,10 +76,10 @@ def test_get_none(self, df): # see gh-5652 assert df.get(None) is None - def test_loc_iterable(self): + def test_loc_iterable(self, float_frame): idx = iter(['A', 'B', 'C']) - result = self.frame.loc[:, idx] - expected = self.frame.loc[:, ['A', 'B', 'C']] + result = float_frame.loc[:, idx] + expected = float_frame.loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -89,11 +89,11 @@ def test_loc_iterable(self): lambda l: dict(zip(l, range(len(l)))).keys()], ids=["list", "iter", "Index", "set", "dict", "dict_keys"]) @pytest.mark.parametrize("levels", [1, 2]) - def test_getitem_listlike(self, idx_type, levels): + def test_getitem_listlike(self, idx_type, levels, 
float_frame): # GH 21294 if levels == 1: - frame, missing = self.frame, 'food' + frame, missing = float_frame, 'food' else: # MultiIndex columns frame = DataFrame(np.random.randn(8, 3), @@ -129,30 +129,30 @@ def test_loc_uint64(self, val, expected): expected.name = val tm.assert_series_equal(result, expected) - def test_getitem_callable(self): + def test_getitem_callable(self, float_frame): # GH 12533 - result = self.frame[lambda x: 'A'] - tm.assert_series_equal(result, self.frame.loc[:, 'A']) + result = float_frame[lambda x: 'A'] + tm.assert_series_equal(result, float_frame.loc[:, 'A']) - result = self.frame[lambda x: ['A', 'B']] - tm.assert_frame_equal(result, self.frame.loc[:, ['A', 'B']]) + result = float_frame[lambda x: ['A', 'B']] + tm.assert_frame_equal(result, float_frame.loc[:, ['A', 'B']]) - df = self.frame[:3] + df = float_frame[:3] result = df[lambda x: [True, False, True]] - tm.assert_frame_equal(result, self.frame.iloc[[0, 2], :]) + tm.assert_frame_equal(result, float_frame.iloc[[0, 2], :]) - def test_setitem_list(self): + def test_setitem_list(self, float_frame): - self.frame['E'] = 'foo' - data = self.frame[['A', 'B']] - self.frame[['B', 'A']] = data + float_frame['E'] = 'foo' + data = float_frame[['A', 'B']] + float_frame[['B', 'A']] = data - assert_series_equal(self.frame['B'], data['A'], check_names=False) - assert_series_equal(self.frame['A'], data['B'], check_names=False) + assert_series_equal(float_frame['B'], data['A'], check_names=False) + assert_series_equal(float_frame['A'], data['B'], check_names=False) msg = 'Columns must be same length as key' with pytest.raises(ValueError, match=msg): - data[['A']] = self.frame[['A', 'B']] + data[['A']] = float_frame[['A', 'B']] msg = 'Length of values does not match length of index' with pytest.raises(ValueError, match=msg): @@ -172,17 +172,17 @@ def test_setitem_list(self): expected = Series(['1', '2'], df.columns, name=1) assert_series_equal(result, expected) - def test_setitem_list_not_dataframe(self): - data = np.random.randn(len(self.frame), 2) - self.frame[['A', 'B']] = data - assert_almost_equal(self.frame[['A', 'B']].values, data) + def test_setitem_list_not_dataframe(self, float_frame): + data = np.random.randn(len(float_frame), 2) + float_frame[['A', 'B']] = data + assert_almost_equal(float_frame[['A', 'B']].values, data) - def test_setitem_list_of_tuples(self): - tuples = list(zip(self.frame['A'], self.frame['B'])) - self.frame['tuples'] = tuples + def test_setitem_list_of_tuples(self, float_frame): + tuples = list(zip(float_frame['A'], float_frame['B'])) + float_frame['tuples'] = tuples - result = self.frame['tuples'] - expected = Series(tuples, index=self.frame.index, name='tuples') + result = float_frame['tuples'] + expected = Series(tuples, index=float_frame.index, name='tuples') assert_series_equal(result, expected) def test_setitem_mulit_index(self): @@ -229,29 +229,30 @@ def inc(x): expected = pd.DataFrame([[-1, inc], [inc, -1]]) tm.assert_frame_equal(df, expected) - def test_getitem_boolean(self): + def test_getitem_boolean(self, float_string_frame, mixed_float_frame, + mixed_int_frame, datetime_frame): # boolean indexing - d = self.tsframe.index[10] - indexer = self.tsframe.index > d + d = datetime_frame.index[10] + indexer = datetime_frame.index > d indexer_obj = indexer.astype(object) - subindex = self.tsframe.index[indexer] - subframe = self.tsframe[indexer] + subindex = datetime_frame.index[indexer] + subframe = datetime_frame[indexer] tm.assert_index_equal(subindex, subframe.index) with 
pytest.raises(ValueError, match='Item wrong length'): - self.tsframe[indexer[:-1]] + datetime_frame[indexer[:-1]] - subframe_obj = self.tsframe[indexer_obj] + subframe_obj = datetime_frame[indexer_obj] assert_frame_equal(subframe_obj, subframe) with pytest.raises(ValueError, match='boolean values only'): - self.tsframe[self.tsframe] + datetime_frame[datetime_frame] # test that Series work - indexer_obj = Series(indexer_obj, self.tsframe.index) + indexer_obj = Series(indexer_obj, datetime_frame.index) - subframe_obj = self.tsframe[indexer_obj] + subframe_obj = datetime_frame[indexer_obj] assert_frame_equal(subframe_obj, subframe) # test that Series indexers reindex @@ -259,14 +260,14 @@ def test_getitem_boolean(self): # key is not the same as the given index, we will reindex # not sure this is really necessary with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) - subframe_obj = self.tsframe[indexer_obj] + indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) + subframe_obj = datetime_frame[indexer_obj] assert_frame_equal(subframe_obj, subframe) # test df[df > 0] - for df in [self.tsframe, self.mixed_frame, - self.mixed_float, self.mixed_int]: - if df is self.mixed_frame: + for df in [datetime_frame, float_string_frame, + mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: continue data = df._get_numeric_data() @@ -286,10 +287,10 @@ def test_getitem_boolean(self): if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype - def test_getitem_boolean_casting(self): + def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to - df = self.tsframe.copy() + df = datetime_frame.copy() df['E'] = 1 df['E'] = df['E'].astype('int32') df['E1'] = df['E'].copy() @@ -379,26 +380,26 @@ def test_getitem_ix_mixed_integer(self): expected = df.iloc[:, [1]] assert_frame_equal(result, expected) - def test_getitem_setitem_ix_negative_integers(self): + def test_getitem_setitem_ix_negative_integers(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = self.frame.ix[:, -1] - assert_series_equal(result, self.frame['D']) + result = float_frame.ix[:, -1] + assert_series_equal(result, float_frame['D']) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = self.frame.ix[:, [-1]] - assert_frame_equal(result, self.frame[['D']]) + result = float_frame.ix[:, [-1]] + assert_frame_equal(result, float_frame[['D']]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = self.frame.ix[:, [-1, -2]] - assert_frame_equal(result, self.frame[['D', 'C']]) + result = float_frame.ix[:, [-1, -2]] + assert_frame_equal(result, float_frame[['D', 'C']]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - self.frame.ix[:, [-1]] = 0 - assert (self.frame['D'] == 0).all() + float_frame.ix[:, [-1]] = 0 + assert (float_frame['D'] == 0).all() df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index @@ -425,11 +426,11 @@ def test_getitem_setitem_ix_negative_integers(self): assert a.ix[-1].name == 'T' assert a.ix[-2].name == 'S' - def test_getattr(self): - assert_series_equal(self.frame.A, self.frame['A']) + def test_getattr(self, float_frame): + assert_series_equal(float_frame.A, float_frame['A']) msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" with pytest.raises(AttributeError, match=msg): - self.frame.NONEXISTENT_NAME + 
float_frame.NONEXISTENT_NAME def test_setattr_column(self): df = DataFrame({'foobar': 1}, index=range(10)) @@ -437,43 +438,43 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self): + def test_setitem(self, float_frame): # not sure what else to do here - series = self.frame['A'][::2] - self.frame['col5'] = series - assert 'col5' in self.frame + series = float_frame['A'][::2] + float_frame['col5'] = series + assert 'col5' in float_frame assert len(series) == 15 - assert len(self.frame) == 30 + assert len(float_frame) == 30 exp = np.ravel(np.column_stack((series.values, [np.nan] * 15))) - exp = Series(exp, index=self.frame.index, name='col5') - tm.assert_series_equal(self.frame['col5'], exp) + exp = Series(exp, index=float_frame.index, name='col5') + tm.assert_series_equal(float_frame['col5'], exp) - series = self.frame['A'] - self.frame['col6'] = series - tm.assert_series_equal(series, self.frame['col6'], check_names=False) + series = float_frame['A'] + float_frame['col6'] = series + tm.assert_series_equal(series, float_frame['col6'], check_names=False) with pytest.raises(KeyError): - self.frame[np.random.randn(len(self.frame) + 1)] = 1 + float_frame[np.random.randn(len(float_frame) + 1)] = 1 # set ndarray - arr = np.random.randn(len(self.frame)) - self.frame['col9'] = arr - assert (self.frame['col9'] == arr).all() + arr = np.random.randn(len(float_frame)) + float_frame['col9'] = arr + assert (float_frame['col9'] == arr).all() - self.frame['col7'] = 5 - assert((self.frame['col7'] == 5).all()) + float_frame['col7'] = 5 + assert((float_frame['col7'] == 5).all()) - self.frame['col0'] = 3.14 - assert((self.frame['col0'] == 3.14).all()) + float_frame['col0'] = 3.14 + assert((float_frame['col0'] == 3.14).all()) - self.frame['col8'] = 'foo' - assert((self.frame['col8'] == 'foo').all()) + float_frame['col8'] = 'foo' + assert((float_frame['col8'] == 'foo').all()) # this is partially a view (e.g. 
some blocks are view) # so raise/warn - smaller = self.frame[:2] + smaller = float_frame[:2] with pytest.raises(com.SettingWithCopyError): smaller['col10'] = ['1', '2'] @@ -492,27 +493,27 @@ def test_setitem(self): assert_frame_equal(df, expected) @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) - def test_setitem_dtype(self, dtype): - arr = np.random.randn(len(self.frame)) + def test_setitem_dtype(self, dtype, float_frame): + arr = np.random.randn(len(float_frame)) - self.frame[dtype] = np.array(arr, dtype=dtype) - assert self.frame[dtype].dtype.name == dtype + float_frame[dtype] = np.array(arr, dtype=dtype) + assert float_frame[dtype].dtype.name == dtype - def test_setitem_tuple(self): - self.frame['A', 'B'] = self.frame['A'] - assert_series_equal(self.frame['A', 'B'], self.frame[ + def test_setitem_tuple(self, float_frame): + float_frame['A', 'B'] = float_frame['A'] + assert_series_equal(float_frame['A', 'B'], float_frame[ 'A'], check_names=False) - def test_setitem_always_copy(self): - s = self.frame['A'].copy() - self.frame['E'] = s + def test_setitem_always_copy(self, float_frame): + s = float_frame['A'].copy() + float_frame['E'] = s - self.frame['E'][5:10] = np.nan + float_frame['E'][5:10] = np.nan assert notna(s[5:10]).all() - def test_setitem_boolean(self): - df = self.frame.copy() - values = self.frame.values + def test_setitem_boolean(self, float_frame): + df = float_frame.copy() + values = float_frame.values df[df['A'] > 0] = 4 values[values[:, 0] > 0] = 4 @@ -565,10 +566,10 @@ def test_setitem_boolean(self): [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], ids=['dataframe', 'array']) - def test_setitem_boolean_mask(self, mask_type): + def test_setitem_boolean_mask(self, mask_type, float_frame): # Test for issue #18582 - df = self.frame.copy() + df = float_frame.copy() mask = mask_type(df) # index with boolean mask @@ -579,34 +580,34 @@ def test_setitem_boolean_mask(self, mask_type): expected.values[np.array(mask)] = np.nan assert_frame_equal(result, expected) - def test_setitem_cast(self): - self.frame['D'] = self.frame['D'].astype('i8') - assert self.frame['D'].dtype == np.int64 + def test_setitem_cast(self, float_frame): + float_frame['D'] = float_frame['D'].astype('i8') + assert float_frame['D'].dtype == np.int64 # #669, should not cast? 
# this is now set to int64, which means a replacement of the column to # the value dtype (and nothing to do with the existing dtype) - self.frame['B'] = 0 - assert self.frame['B'].dtype == np.int64 + float_frame['B'] = 0 + assert float_frame['B'].dtype == np.int64 # cast if pass array of course - self.frame['B'] = np.arange(len(self.frame)) - assert issubclass(self.frame['B'].dtype.type, np.integer) + float_frame['B'] = np.arange(len(float_frame)) + assert issubclass(float_frame['B'].dtype.type, np.integer) - self.frame['foo'] = 'bar' - self.frame['foo'] = 0 - assert self.frame['foo'].dtype == np.int64 + float_frame['foo'] = 'bar' + float_frame['foo'] = 0 + assert float_frame['foo'].dtype == np.int64 - self.frame['foo'] = 'bar' - self.frame['foo'] = 2.5 - assert self.frame['foo'].dtype == np.float64 + float_frame['foo'] = 'bar' + float_frame['foo'] = 2.5 + assert float_frame['foo'].dtype == np.float64 - self.frame['something'] = 0 - assert self.frame['something'].dtype == np.int64 - self.frame['something'] = 2 - assert self.frame['something'].dtype == np.int64 - self.frame['something'] = 2.5 - assert self.frame['something'].dtype == np.float64 + float_frame['something'] = 0 + assert float_frame['something'].dtype == np.int64 + float_frame['something'] = 2 + assert float_frame['something'].dtype == np.int64 + float_frame['something'] = 2.5 + assert float_frame['something'].dtype == np.float64 # GH 7704 # dtype conversion on setting @@ -624,14 +625,14 @@ def test_setitem_cast(self): df.one = np.int8(7) assert df.dtypes.one == np.dtype(np.int8) - def test_setitem_boolean_column(self): - expected = self.frame.copy() - mask = self.frame['A'] > 0 + def test_setitem_boolean_column(self, float_frame): + expected = float_frame.copy() + mask = float_frame['A'] > 0 - self.frame.loc[mask, 'B'] = 0 + float_frame.loc[mask, 'B'] = 0 expected.values[mask.values, 1] = 0 - assert_frame_equal(self.frame, expected) + assert_frame_equal(float_frame, expected) def test_frame_setitem_timestamp(self): # GH#2155 @@ -642,7 +643,7 @@ def test_frame_setitem_timestamp(self): data[ts] = np.nan # works, mostly a smoke-test assert np.isnan(data[ts]).all() - def test_setitem_corner(self): + def test_setitem_corner(self, float_frame): # corner case df = DataFrame({'B': [1., 2., 3.], 'C': ['a', 'b', 'c']}, @@ -659,7 +660,7 @@ def test_setitem_corner(self): df[datetime.now()] = 5. 
# what to do when empty frame with index - dm = DataFrame(index=self.frame.index) + dm = DataFrame(index=float_frame.index) dm['A'] = 'foo' dm['B'] = 'bar' assert len(dm.columns) == 2 @@ -735,16 +736,16 @@ def test_setitem_clear_caches(self): assert df['z'] is not foo tm.assert_series_equal(df['z'], expected) - def test_setitem_None(self): + def test_setitem_None(self, float_frame): # GH #766 - self.frame[None] = self.frame['A'] + float_frame[None] = float_frame['A'] assert_series_equal( - self.frame.iloc[:, -1], self.frame['A'], check_names=False) - assert_series_equal(self.frame.loc[:, None], self.frame[ + float_frame.iloc[:, -1], float_frame['A'], check_names=False) + assert_series_equal(float_frame.loc[:, None], float_frame[ 'A'], check_names=False) - assert_series_equal(self.frame[None], self.frame[ + assert_series_equal(float_frame[None], float_frame[ 'A'], check_names=False) - repr(self.frame) + repr(float_frame) def test_setitem_empty(self): # GH 9596 @@ -785,8 +786,8 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] assert_frame_equal(df, df2) - def test_delitem_corner(self): - f = self.frame.copy() + def test_delitem_corner(self, float_frame): + f = float_frame.copy() del f['D'] assert len(f.columns) == 3 with pytest.raises(KeyError, match=r"^'D'$"): @@ -794,15 +795,15 @@ def test_delitem_corner(self): del f['B'] assert len(f.columns) == 2 - def test_getitem_fancy_2d(self): - f = self.frame + def test_getitem_fancy_2d(self, float_frame): + f = float_frame with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_frame_equal(f.ix[:, ['B', 'A']], f.reindex(columns=['B', 'A'])) - subidx = self.frame.index[[5, 4, 1]] + subidx = float_frame.index[[5, 4, 1]] with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_frame_equal(f.ix[subidx, ['B', 'A']], @@ -891,10 +892,10 @@ def test_getitem_setitem_integer_slice_keyerrors(self): with pytest.raises(KeyError, match=r"^3$"): df2.loc[3:11] = 0 - def test_setitem_fancy_2d(self): + def test_setitem_fancy_2d(self, float_frame): # case 1 - frame = self.frame.copy() + frame = float_frame.copy() expected = frame.copy() with catch_warnings(record=True): @@ -905,12 +906,12 @@ def test_setitem_fancy_2d(self): assert_frame_equal(frame, expected) # case 2 - frame = self.frame.copy() - frame2 = self.frame.copy() + frame = float_frame.copy() + frame2 = float_frame.copy() expected = frame.copy() - subidx = self.frame.index[[5, 4, 1]] + subidx = float_frame.index[[5, 4, 1]] values = np.random.randn(3, 2) with catch_warnings(record=True): @@ -925,18 +926,18 @@ def test_setitem_fancy_2d(self): assert_frame_equal(frame2, expected) # case 3: slicing rows, etc. - frame = self.frame.copy() + frame = float_frame.copy() with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - expected1 = self.frame.copy() + expected1 = float_frame.copy() frame.ix[5:10] = 1. expected1.values[5:10] = 1. assert_frame_equal(frame, expected1) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - expected2 = self.frame.copy() + expected2 = float_frame.copy() arr = np.random.randn(5, len(frame.columns)) frame.ix[5:10] = arr expected2.values[5:10] = arr @@ -945,7 +946,7 @@ def test_setitem_fancy_2d(self): # case 4 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame = self.frame.copy() + frame = float_frame.copy() frame.ix[5:10, :] = 1. 
assert_frame_equal(frame, expected1) frame.ix[5:10, :] = arr @@ -954,10 +955,10 @@ def test_setitem_fancy_2d(self): # case 5 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame = self.frame.copy() - frame2 = self.frame.copy() + frame = float_frame.copy() + frame2 = float_frame.copy() - expected = self.frame.copy() + expected = float_frame.copy() values = np.random.randn(5, 2) frame.ix[:5, ['A', 'B']] = values @@ -973,8 +974,8 @@ def test_setitem_fancy_2d(self): # case 6: slice rows with labels, inclusive! with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() frame.ix[frame.index[5]:frame.index[10]] = 5. expected.values[5:11] = 5 @@ -983,9 +984,9 @@ def test_setitem_fancy_2d(self): # case 7: slice columns with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame = self.frame.copy() - frame2 = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + frame2 = float_frame.copy() + expected = float_frame.copy() # slice indices frame.ix[:, 1:3] = 4. @@ -1006,18 +1007,18 @@ def test_setitem_fancy_2d(self): frame[frame['a'] == 2] = 100 assert_frame_equal(frame, expected) - def test_fancy_getitem_slice_mixed(self): - sliced = self.mixed_frame.iloc[:, -3:] + def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): + sliced = float_string_frame.iloc[:, -3:] assert sliced['D'].dtype == np.float64 # get view with single block # setting it triggers setting with copy - sliced = self.frame.iloc[:, -3:] + sliced = float_frame.iloc[:, -3:] with pytest.raises(com.SettingWithCopyError): sliced['C'] = 4. - assert (self.frame['C'] == 4).all() + assert (float_frame['C'] == 4).all() def test_fancy_setitem_int_labels(self): # integer index defers to label-based indexing @@ -1078,7 +1079,7 @@ def test_fancy_getitem_int_labels(self): expected = df[3] assert_series_equal(result, expected) - def test_fancy_index_int_labels_exceptions(self): + def test_fancy_index_int_labels_exceptions(self, float_frame): df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) with catch_warnings(record=True): @@ -1092,17 +1093,17 @@ def test_fancy_index_int_labels_exceptions(self): msg = (r"None of \[Index\(\['foo', 'bar', 'baz'\]," r" dtype='object'\)\] are in the \[index\]") with pytest.raises(KeyError, match=msg): - self.frame.ix[['foo', 'bar', 'baz']] = 1 + float_frame.ix[['foo', 'bar', 'baz']] = 1 msg = (r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" r" \[columns\]") with pytest.raises(KeyError, match=msg): - self.frame.ix[:, ['E']] = 1 + float_frame.ix[:, ['E']] = 1 # partial setting now allows this GH2578 - # pytest.raises(KeyError, self.frame.ix.__setitem__, + # pytest.raises(KeyError, float_frame.ix.__setitem__, # (slice(None, None), 'E'), 1) - def test_setitem_fancy_mixed_2d(self): + def test_setitem_fancy_mixed_2d(self, float_string_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -1110,11 +1111,12 @@ def test_setitem_fancy_mixed_2d(self): result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] assert (result.values == 5).all() - self.mixed_frame.ix[5] = np.nan - assert isna(self.mixed_frame.ix[5]).all() + float_string_frame.ix[5] = np.nan + assert isna(float_string_frame.ix[5]).all() - self.mixed_frame.ix[5] = self.mixed_frame.ix[6] - assert_series_equal(self.mixed_frame.ix[5], self.mixed_frame.ix[6], + float_string_frame.ix[5] = 
float_string_frame.ix[6] + assert_series_equal(float_string_frame.ix[5], + float_string_frame.ix[6], check_names=False) # #1432 @@ -1273,8 +1275,8 @@ def test_ix_dup(self): sub = df.ix['b':'d'] assert_frame_equal(sub, df.ix[2:]) - def test_getitem_fancy_1d(self): - f = self.frame + def test_getitem_fancy_1d(self, float_frame, float_string_frame): + f = float_frame # return self if no slicing...for now with catch_warnings(record=True): @@ -1329,15 +1331,15 @@ def test_getitem_fancy_1d(self): # slice of mixed-frame with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - xs = self.mixed_frame.ix[5] - exp = self.mixed_frame.xs(self.mixed_frame.index[5]) + xs = float_string_frame.ix[5] + exp = float_string_frame.xs(float_string_frame.index[5]) tm.assert_series_equal(xs, exp) - def test_setitem_fancy_1d(self): + def test_setitem_fancy_1d(self, float_frame): # case 1: set cross-section for indices - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -1349,13 +1351,13 @@ def test_setitem_fancy_1d(self): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame2 = self.frame.copy() + frame2 = float_frame.copy() frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] assert_frame_equal(frame, expected) # case 2, set a section of a column - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -1366,13 +1368,13 @@ def test_setitem_fancy_1d(self): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame2 = self.frame.copy() + frame2 = float_frame.copy() frame2.ix[5:10, 'B'] = vals assert_frame_equal(frame, expected) # case 3: full xs - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -1387,8 +1389,8 @@ def test_setitem_fancy_1d(self): assert_frame_equal(frame, expected) # single column - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -1396,8 +1398,8 @@ def test_setitem_fancy_1d(self): expected['A'] = 7. 
assert_frame_equal(frame, expected) - def test_getitem_fancy_scalar(self): - f = self.frame + def test_getitem_fancy_scalar(self, float_frame): + f = float_frame ix = f.loc # individual value @@ -1406,9 +1408,9 @@ def test_getitem_fancy_scalar(self): for idx in f.index[::5]: assert ix[idx, col] == ts[idx] - def test_setitem_fancy_scalar(self): - f = self.frame - expected = self.frame.copy() + def test_setitem_fancy_scalar(self, float_frame): + f = float_frame + expected = float_frame.copy() ix = f.loc # individual value @@ -1422,8 +1424,8 @@ def test_setitem_fancy_scalar(self): ix[idx, col] = val assert_frame_equal(f, expected) - def test_getitem_fancy_boolean(self): - f = self.frame + def test_getitem_fancy_boolean(self, float_frame): + f = float_frame ix = f.loc expected = f.reindex(columns=['B', 'D']) @@ -1446,49 +1448,49 @@ def test_getitem_fancy_boolean(self): columns=['C', 'D']) assert_frame_equal(result, expected) - def test_setitem_fancy_boolean(self): + def test_setitem_fancy_boolean(self, float_frame): # from 2d, set with booleans - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() mask = frame['A'] > 0 frame.loc[mask] = 0. expected.values[mask.values] = 0. assert_frame_equal(frame, expected) - frame = self.frame.copy() - expected = self.frame.copy() + frame = float_frame.copy() + expected = float_frame.copy() frame.loc[mask, ['A', 'B']] = 0. expected.values[mask.values, :2] = 0. assert_frame_equal(frame, expected) - def test_getitem_fancy_ints(self): - result = self.frame.iloc[[1, 4, 7]] - expected = self.frame.loc[self.frame.index[[1, 4, 7]]] + def test_getitem_fancy_ints(self, float_frame): + result = float_frame.iloc[[1, 4, 7]] + expected = float_frame.loc[float_frame.index[[1, 4, 7]]] assert_frame_equal(result, expected) - result = self.frame.iloc[:, [2, 0, 1]] - expected = self.frame.loc[:, self.frame.columns[[2, 0, 1]]] + result = float_frame.iloc[:, [2, 0, 1]] + expected = float_frame.loc[:, float_frame.columns[[2, 0, 1]]] assert_frame_equal(result, expected) - def test_getitem_setitem_fancy_exceptions(self): - ix = self.frame.iloc + def test_getitem_setitem_fancy_exceptions(self, float_frame): + ix = float_frame.iloc with pytest.raises(IndexingError, match='Too many indexers'): ix[:, :, :] with pytest.raises(IndexingError): ix[:, :, :] = 1 - def test_getitem_setitem_boolean_misaligned(self): + def test_getitem_setitem_boolean_misaligned(self, float_frame): # boolean index misaligned labels - mask = self.frame['A'][::-1] > 1 + mask = float_frame['A'][::-1] > 1 - result = self.frame.loc[mask] - expected = self.frame.loc[mask[::-1]] + result = float_frame.loc[mask] + expected = float_frame.loc[mask[::-1]] assert_frame_equal(result, expected) - cp = self.frame.copy() - expected = self.frame.copy() + cp = float_frame.copy() + expected = float_frame.copy() cp.loc[mask] = 0 expected.loc[mask] = 0 assert_frame_equal(cp, expected) @@ -1651,17 +1653,18 @@ def test_setitem_mixed_datetime(self): df.loc[[4, 5], ['a', 'b']] = A assert_frame_equal(df, expected) - def test_setitem_frame(self): - piece = self.frame.loc[self.frame.index[:2], ['A', 'B']] - self.frame.loc[self.frame.index[-2]:, ['A', 'B']] = piece.values - result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values + def test_setitem_frame_float(self, float_frame): + piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] + float_frame.loc[float_frame.index[-2]:, ['A', 'B']] = piece.values + result = float_frame.loc[float_frame.index[-2:], ['A', 
'B']].values expected = piece.values assert_almost_equal(result, expected) + def test_setitem_frame_mixed(self, float_string_frame): # GH 3216 # already aligned - f = self.mixed_frame.copy() + f = float_string_frame.copy() piece = DataFrame([[1., 2.], [3., 4.]], index=f.index[0:2], columns=['A', 'B']) key = (slice(None, 2), ['A', 'B']) @@ -1670,7 +1673,7 @@ def test_setitem_frame(self): piece.values) # rows unaligned - f = self.mixed_frame.copy() + f = float_string_frame.copy() piece = DataFrame([[1., 2.], [3., 4.], [5., 6.], [7., 8.]], index=list(f.index[0:2]) + ['foo', 'bar'], columns=['A', 'B']) @@ -1680,7 +1683,7 @@ def test_setitem_frame(self): piece.values[0:2]) # key is unaligned with values - f = self.mixed_frame.copy() + f = float_string_frame.copy() piece = f.loc[f.index[:2], ['A']] piece.index = f.index[-2:] key = (slice(-2, None), ['A', 'B']) @@ -1690,13 +1693,14 @@ def test_setitem_frame(self): piece.values) # ndarray - f = self.mixed_frame.copy() - piece = self.mixed_frame.loc[f.index[:2], ['A', 'B']] + f = float_string_frame.copy() + piece = float_string_frame.loc[f.index[:2], ['A', 'B']] key = (slice(-2, None), ['A', 'B']) f.loc[key] = piece.values assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, piece.values) + def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['A', 'B', 'C']) df2 = df.copy() @@ -1706,12 +1710,12 @@ def test_setitem_frame(self): expected['C'] = df['C'] assert_frame_equal(df2, expected) - def test_setitem_frame_align(self): - piece = self.frame.loc[self.frame.index[:2], ['A', 'B']] - piece.index = self.frame.index[-2:] + def test_setitem_frame_align(self, float_frame): + piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] + piece.index = float_frame.index[-2:] piece.columns = ['A', 'B'] - self.frame.loc[self.frame.index[-2:], ['A', 'B']] = piece - result = self.frame.loc[self.frame.index[-2:], ['A', 'B']].values + float_frame.loc[float_frame.index[-2:], ['A', 'B']] = piece + result = float_frame.loc[float_frame.index[-2:], ['A', 'B']].values expected = piece.values assert_almost_equal(result, expected) @@ -1775,87 +1779,94 @@ def test_getitem_list_duplicates(self): expected = df.iloc[:, 2:] assert_frame_equal(result, expected) - def test_get_value(self): - for idx in self.frame.index: - for col in self.frame.columns: + def test_get_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = self.frame.get_value(idx, col) - expected = self.frame[col][idx] + result = float_frame.get_value(idx, col) + expected = float_frame[col][idx] assert result == expected - def test_lookup(self): - def alt(df, rows, cols, dtype): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = [df.get_value(r, c) for r, c in zip(rows, cols)] - return np.array(result, dtype=dtype) + def test_lookup_float(self, float_frame): + df = float_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) - def testit(df): - rows = list(df.index) * len(df.columns) - cols = list(df.columns) * len(df.index) - result = df.lookup(rows, cols) - expected = alt(df, rows, cols, dtype=np.object_) - tm.assert_almost_equal(result, expected, check_dtype=False) + expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) + tm.assert_numpy_array_equal(result, expected) - testit(self.mixed_frame) - testit(self.frame) + 
def test_lookup_mixed(self, float_string_frame): + df = float_string_frame + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) + expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)], + dtype=np.object_) + tm.assert_almost_equal(result, expected) + + def test_lookup_bool(self): df = DataFrame({'label': ['a', 'b', 'a', 'c'], 'mask_a': [True, True, False, True], 'mask_b': [True, False, False, False], 'mask_c': [False, True, False, True]}) df['mask'] = df.lookup(df.index, 'mask_' + df['label']) - exp_mask = alt(df, df.index, 'mask_' + df['label'], dtype=np.bool_) + + exp_mask = np.array([ + df.loc[r, c] for r, c in zip(df.index, 'mask_' + df['label'])]) + tm.assert_series_equal(df['mask'], pd.Series(exp_mask, name='mask')) assert df['mask'].dtype == np.bool_ + def test_lookup_raises(self, float_frame): with pytest.raises(KeyError): - self.frame.lookup(['xyz'], ['A']) + float_frame.lookup(['xyz'], ['A']) with pytest.raises(KeyError): - self.frame.lookup([self.frame.index[0]], ['xyz']) + float_frame.lookup([float_frame.index[0]], ['xyz']) with pytest.raises(ValueError, match='same size'): - self.frame.lookup(['a', 'b', 'c'], ['a']) + float_frame.lookup(['a', 'b', 'c'], ['a']) - def test_set_value(self): - for idx in self.frame.index: - for col in self.frame.columns: + def test_set_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.frame.set_value(idx, col, 1) - assert self.frame[col][idx] == 1 + float_frame.set_value(idx, col, 1) + assert float_frame[col][idx] == 1 - def test_set_value_resize(self): + def test_set_value_resize(self, float_frame): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - res = self.frame.set_value('foobar', 'B', 0) - assert res is self.frame + res = float_frame.set_value('foobar', 'B', 0) + assert res is float_frame assert res.index[-1] == 'foobar' with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert res.get_value('foobar', 'B') == 0 - self.frame.loc['foobar', 'qux'] = 0 + float_frame.loc['foobar', 'qux'] = 0 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert self.frame.get_value('foobar', 'qux') == 0 + assert float_frame.get_value('foobar', 'qux') == 0 - res = self.frame.copy() + res = float_frame.copy() with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res3 = res.set_value('foobar', 'baz', 'sam') assert res3['baz'].dtype == np.object_ - res = self.frame.copy() + res = float_frame.copy() with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res3 = res.set_value('foobar', 'baz', True) assert res3['baz'].dtype == np.object_ - res = self.frame.copy() + res = float_frame.copy() with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res3 = res.set_value('foobar', 'baz', 5) @@ -1907,16 +1918,16 @@ def test_get_set_value_no_partial_indexing(self): with pytest.raises(KeyError, match=r"^0$"): df.get_value(0, 1) - def test_single_element_ix_dont_upcast(self): - self.frame['E'] = 1 - assert issubclass(self.frame['E'].dtype.type, (int, np.integer)) + def test_single_element_ix_dont_upcast(self, float_frame): + float_frame['E'] = 1 + assert issubclass(float_frame['E'].dtype.type, (int, np.integer)) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = self.frame.ix[self.frame.index[5], 'E'] + result = 
float_frame.ix[float_frame.index[5], 'E'] assert is_integer(result) - result = self.frame.loc[self.frame.index[5], 'E'] + result = float_frame.loc[float_frame.index[5], 'E'] assert is_integer(result) # GH 11617 @@ -2079,12 +2090,12 @@ def test_iloc_sparse_propegate_fill_value(self): df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) assert len(df['A'].sp_values) == len(df.iloc[:, 0].sp_values) - def test_iat(self): + def test_iat(self, float_frame): - for i, row in enumerate(self.frame.index): - for j, col in enumerate(self.frame.columns): - result = self.frame.iat[i, j] - expected = self.frame.at[row, col] + for i, row in enumerate(float_frame.index): + for j, col in enumerate(float_frame.columns): + result = float_frame.iat[i, j] + expected = float_frame.at[row, col] assert result == expected def test_nested_exception(self): @@ -2433,14 +2444,14 @@ def test_at_time_between_time_datetimeindex(self): result.loc[bkey] = df.iloc[binds] assert_frame_equal(result, df) - def test_xs(self): - idx = self.frame.index[5] - xs = self.frame.xs(idx) + def test_xs(self, float_frame, datetime_frame): + idx = float_frame.index[5] + xs = float_frame.xs(idx) for item, value in xs.items(): if np.isnan(value): - assert np.isnan(self.frame[item][idx]) + assert np.isnan(float_frame[item][idx]) else: - assert value == self.frame[item][idx] + assert value == float_frame[item][idx] # mixed-type xs test_data = { @@ -2454,15 +2465,15 @@ def test_xs(self): assert xs['B'] == '1' with pytest.raises(KeyError): - self.tsframe.xs(self.tsframe.index[0] - BDay()) + datetime_frame.xs(datetime_frame.index[0] - BDay()) # xs get column - series = self.frame.xs('A', axis=1) - expected = self.frame['A'] + series = float_frame.xs('A', axis=1) + expected = float_frame['A'] assert_series_equal(series, expected) # view is returned if possible - series = self.frame.xs('A', axis=1) + series = float_frame.xs('A', axis=1) series[:] = 5 assert (expected == 5).all() @@ -2582,7 +2593,8 @@ def test_boolean_indexing_mixed(self): with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self): + def test_where(self, float_string_frame, mixed_float_frame, + mixed_int_frame): default_frame = DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C']) @@ -2610,9 +2622,9 @@ def _check_get(df, cond, check_dtypes=True): assert (rs.dtypes == df.dtypes).all() # check getting - for df in [default_frame, self.mixed_frame, - self.mixed_float, self.mixed_int]: - if df is self.mixed_frame: + for df in [default_frame, float_string_frame, + mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: with pytest.raises(TypeError): df > 0 continue @@ -2662,8 +2674,8 @@ def _check_align(df, cond, other, check_dtypes=True): if check_dtypes and not isinstance(other, np.ndarray): assert (rs.dtypes == df.dtypes).all() - for df in [self.mixed_frame, self.mixed_float, self.mixed_int]: - if df is self.mixed_frame: + for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: with pytest.raises(TypeError): df > 0 continue @@ -2716,9 +2728,9 @@ def _check_set(df, cond, check_dtypes=True): v = np.dtype('float64') assert dfi[k].dtype == v - for df in [default_frame, self.mixed_frame, self.mixed_float, - self.mixed_int]: - if df is self.mixed_frame: + for df in [default_frame, float_string_frame, mixed_float_frame, + mixed_int_frame]: + if df is float_string_frame: with pytest.raises(TypeError): df > 0 continue @@ -3166,20 +3178,20 @@ def test_mask_callable(self): tm.assert_frame_equal(result, (df + 
2).mask((df + 2) > 8, (df + 2) + 10)) - def test_head_tail(self): - assert_frame_equal(self.frame.head(), self.frame[:5]) - assert_frame_equal(self.frame.tail(), self.frame[-5:]) + def test_head_tail(self, float_frame): + assert_frame_equal(float_frame.head(), float_frame[:5]) + assert_frame_equal(float_frame.tail(), float_frame[-5:]) - assert_frame_equal(self.frame.head(0), self.frame[0:0]) - assert_frame_equal(self.frame.tail(0), self.frame[0:0]) + assert_frame_equal(float_frame.head(0), float_frame[0:0]) + assert_frame_equal(float_frame.tail(0), float_frame[0:0]) - assert_frame_equal(self.frame.head(-1), self.frame[:-1]) - assert_frame_equal(self.frame.tail(-1), self.frame[1:]) - assert_frame_equal(self.frame.head(1), self.frame[:1]) - assert_frame_equal(self.frame.tail(1), self.frame[-1:]) + assert_frame_equal(float_frame.head(-1), float_frame[:-1]) + assert_frame_equal(float_frame.tail(-1), float_frame[1:]) + assert_frame_equal(float_frame.head(1), float_frame[:1]) + assert_frame_equal(float_frame.tail(1), float_frame[-1:]) # with a float index - df = self.frame.copy() - df.index = np.arange(len(self.frame)) + 0.1 + df = float_frame.copy() + df.index = np.arange(len(float_frame)) + 0.1 assert_frame_equal(df.head(), df.iloc[:5]) assert_frame_equal(df.tail(), df.iloc[-5:]) assert_frame_equal(df.head(0), df[0:0]) @@ -3243,16 +3255,10 @@ def test_interval_index(self): class TestDataFrameIndexingDatetimeWithTZ(TestData): - def setup_method(self, method): - self.idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') - self.dr = date_range('20130110', periods=3) - self.df = DataFrame({'A': self.idx, 'B': self.dr}) - - def test_setitem(self): + def test_setitem(self, timezone_frame): - df = self.df - idx = self.idx + df = timezone_frame + idx = df['B'].rename('foo') # setitem df['C'] = idx @@ -3281,7 +3287,8 @@ def test_setitem(self): def test_set_reset(self): - idx = self.idx + idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), + name='foo') # set/reset df = DataFrame({'A': [0, 1, 2]}, index=idx) @@ -3291,11 +3298,11 @@ def test_set_reset(self): df = result.set_index('foo') tm.assert_index_equal(df.index, idx) - def test_transpose(self): + def test_transpose(self, timezone_frame): - result = self.df.T - expected = DataFrame(self.df.values.T) - expected.index = ['A', 'B'] + result = timezone_frame.T + expected = DataFrame(timezone_frame.values.T) + expected.index = ['A', 'B', 'C'] assert_frame_equal(result, expected) def test_scalar_assignment(self): @@ -3309,16 +3316,10 @@ def test_scalar_assignment(self): class TestDataFrameIndexingUInt64(TestData): - def setup_method(self, method): - self.ir = Index(np.arange(3), dtype=np.uint64) - self.idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') - - self.df = DataFrame({'A': self.idx, 'B': self.ir}) - - def test_setitem(self): + def test_setitem(self, uint64_frame): - df = self.df - idx = self.idx + df = uint64_frame + idx = df['A'].rename('foo') # setitem df['C'] = idx @@ -3343,7 +3344,7 @@ def test_setitem(self): def test_set_reset(self): - idx = self.idx + idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') # set/reset df = DataFrame({'A': [0, 1, 2]}, index=idx) @@ -3353,10 +3354,10 @@ def test_set_reset(self): df = result.set_index('foo') tm.assert_index_equal(df.index, idx) - def test_transpose(self): + def test_transpose(self, uint64_frame): - result = self.df.T - expected = DataFrame(self.df.values.T) + result = uint64_frame.T + expected = DataFrame(uint64_frame.values.T) expected.index = ['A', 
'B'] assert_frame_equal(result, expected) From cfb9bbe5a15f8ca3a133b396c5b90494c095e26f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Jun 2019 07:32:41 -0500 Subject: [PATCH 080/238] CLN: clean-up sanitize_array series construction (#26979) --- pandas/core/internals/construction.py | 79 ++++++++++++--------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 0806e6e927e8de..bdfb854679a2c5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -21,8 +21,8 @@ is_extension_array_dtype, is_extension_type, is_float_dtype, is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPandasArray, - ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex) + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, + ABCTimedeltaIndex) from pandas.core.dtypes.missing import isna from pandas.core import algorithms, common as com @@ -570,59 +570,40 @@ def sanitize_array(data, index, dtype=None, copy=False, else: data = data.copy() + # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): - if dtype is not None: - subarr = np.array(data, copy=False) - + if (dtype is not None + and is_float_dtype(data.dtype) and is_integer_dtype(dtype)): # possibility of nan -> garbage - if is_float_dtype(data.dtype) and is_integer_dtype(dtype): - try: - subarr = _try_cast(data, True, dtype, copy, - True) - except ValueError: - if copy: - subarr = data.copy() - else: - subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) - elif isinstance(data, Index): - # don't coerce Index types - # e.g. indexes can have different conversions (so don't fast path - # them) - # GH#6140 - subarr = sanitize_index(data, index, copy=copy) + try: + subarr = _try_cast(data, dtype, copy, True) + except ValueError: + if copy: + subarr = data.copy() + else: + subarr = np.array(data, copy=False) else: - # we will try to copy be-definition here - subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) + subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ExtensionArray): - if isinstance(data, ABCPandasArray): - # We don't want to let people put our PandasArray wrapper - # (the output of Series/Index.array), into a Series. So - # we explicitly unwrap it here. - subarr = data.to_numpy() - else: - subarr = data - - # everything else in this block must also handle ndarray's, - # because we've unwrapped PandasArray into an ndarray. 
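# --- Illustrative aside (not part of the diff above/below) ------------------
# A minimal sketch of why the dropped PandasArray branch is no longer needed:
# extract_array(..., extract_numpy=True) has already unwrapped any PandasArray
# (the wrapper returned by Series.array for NumPy-backed data) before this
# point, so constructing from such a wrapper still yields a plain NumPy-backed
# Series.  Uses only public pandas API; assumes pandas >= 0.24.
import numpy as np
import pandas as pd

wrapper = pd.Series([1, 2, 3]).array      # PandasArray around an int64 ndarray
ser = pd.Series(wrapper)                  # wrapper is unwrapped during construction
assert ser.dtype == np.dtype("int64")     # regular NumPy-backed dtype, not a wrapper dtype
assert isinstance(ser.to_numpy(), np.ndarray)
# -----------------------------------------------------------------------------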
+ # it is already ensured above this is not a PandasArray + subarr = data if dtype is not None: - subarr = data.astype(dtype) - - if copy: - subarr = data.copy() + subarr = subarr.astype(dtype, copy=copy) + elif copy: + subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: try: - subarr = _try_cast(data, False, dtype, copy, - raise_cast_failure) + subarr = _try_cast(data, dtype, copy, raise_cast_failure) except Exception: if raise_cast_failure: # pragma: no cover raise @@ -637,9 +618,9 @@ def sanitize_array(data, index, dtype=None, copy=False, elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype='int64') - subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure) + subarr = _try_cast(arr, dtype, copy, raise_cast_failure) else: - subarr = _try_cast(data, False, dtype, copy, raise_cast_failure) + subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: @@ -701,10 +682,22 @@ def sanitize_array(data, index, dtype=None, copy=False, return subarr -def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): - +def _try_cast(arr, dtype, copy, raise_cast_failure): + """ + Convert input to numpy ndarray and optionally cast to a given dtype. + + Parameters + ---------- + arr : array-like + dtype : np.dtype, ExtensionDtype or None + copy : bool + If False, don't copy the data if not needed. + raise_cast_failure : bool + If True, and if a dtype is specified, raise errors during casting. + Otherwise an object array is returned. + """ # perf shortcut as this is the most common case - if take_fast_path: + if isinstance(arr, np.ndarray): if maybe_castable(arr) and not copy and dtype is None: return arr From de22a483ace1c0cb1ff6ac1d245c4afe5e514ce6 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Fri, 28 Jun 2019 12:33:25 +0000 Subject: [PATCH 081/238] TST: Add missing tests for loc slicing of PeriodIndex, TimedeltaIndex (#27086) --- pandas/tests/indexing/test_datetime.py | 16 ++++++++++++++++ pandas/tests/indexing/test_timedelta.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 4c865d00b3adbd..d4da34cab6f5c9 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -313,3 +313,19 @@ def test_loc_setitem_with_existing_dst(self): columns=['value'], dtype=object) tm.assert_frame_equal(result, expected) + + def test_loc_str_slicing(self): + ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ser = ix.to_series() + result = ser.loc[:"2017-12"] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) + + def test_loc_label_slicing(self): + ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ser = ix.to_series() + result = ser.loc[:ix[-2]] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 8e7a71ad3d71e1..e3f5bcff4a22e2 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -95,3 +95,19 @@ def test_roundtrip_thru_setitem(self): assert expected == result tm.assert_frame_equal(df, df_copy) + + def test_loc_str_slicing(self): + ix = pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ser = ix.to_series() + result = ser.loc[:"1 
days"] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) + + def test_loc_slicing(self): + ix = pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ser = ix.to_series() + result = ser.loc[:ix[-2]] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) From c3133dbebb8442773ab5a6453d78dfbde4742219 Mon Sep 17 00:00:00 2001 From: Ryan Joyce Date: Fri, 28 Jun 2019 08:38:31 -0400 Subject: [PATCH 082/238] BUG: XlsxWriter ignoring formats on numpy types if merged cells (#27006) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/excel/_xlsxwriter.py | 2 +- pandas/tests/io/excel/test_writers.py | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index da939687500b66..f2136c4b86aacf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -785,6 +785,7 @@ I/O - Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). +- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. `PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 2dc736f81f6f8c..2ddfcf3de5a8f1 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -210,7 +210,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, startcol + cell.col, startrow + cell.mergestart, startcol + cell.mergeend, - cell.val, style) + val, style) else: wks.write(startrow + cell.row, startcol + cell.col, diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index ea75e97bace0bc..a4fdcdf70a3ea6 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1162,6 +1162,22 @@ def test_path_local_path(self, engine, ext): path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) + def test_merged_cell_custom_objects(self, engine, merge_cells, ext): + # see GH-27006 + mi = MultiIndex.from_tuples([(pd.Period('2018'), pd.Period('2018Q1')), + (pd.Period('2018'), pd.Period('2018Q2'))]) + expected = DataFrame(np.ones((2, 2)), columns=mi) + expected.to_excel(self.path) + result = pd.read_excel(self.path, header=[0, 1], + index_col=0, convert_float=False) + # need to convert PeriodIndexes to standard Indexes for assert equal + expected.columns.set_levels([[str(i) for i in mi.levels[0]], + [str(i) for i in mi.levels[1]]], + level=[0, 1], + inplace=True) + expected.index = expected.index.astype(np.float64) + tm.assert_frame_equal(expected, result) + class TestExcelWriterEngineTests: From e9f9ca1f18f22215bb32cfd182f69997792b50e4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 28 Jun 2019 09:13:24 -0400 Subject: [PATCH 083/238] BUG: Fix handling of ambiguous or nonexistent of start and end times in date_range (#27088) * BUG: Raise AmbiguousTimeError for date_range with ambiguous start time. 
* Clarify comment * Add nonexistent tests * xfail one case after discovered bug * Add whatsnew issue number * Missing backtick * Misspelling --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/datetimes.py | 22 +++++++++----- .../tests/indexes/datetimes/test_timezones.py | 29 +++++++++++++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f2136c4b86aacf..f4dd94b7d918b2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -691,6 +691,7 @@ Timezones - Bug in :func:`to_datetime` where an uninformative ``RuntimeError`` was raised when passing a naive :class:`Timestamp` with datetime strings with mixed UTC offsets (:issue:`25978`) - Bug in :func:`to_datetime` with ``unit='ns'`` would drop timezone information from the parsed argument (:issue:`26168`) - Bug in :func:`DataFrame.join` where joining a timezone aware index with a timezone aware column would result in a column of ``NaN`` (:issue:`26335`) +- Bug in :func:`date_range` where ambiguous or nonexistent start or end times were not handled by the ``ambiguous`` or ``nonexistent`` keywords respectively (:issue:`27088`) Numeric ^^^^^^^ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index eaa0278da6dc32..6b554ddf25c96d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -433,10 +433,12 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if tz is not None: # Localize the start and end arguments start = _maybe_localize_point( - start, getattr(start, 'tz', None), start, freq, tz + start, getattr(start, 'tz', None), start, freq, tz, + ambiguous, nonexistent ) end = _maybe_localize_point( - end, getattr(end, 'tz', None), end, freq, tz + end, getattr(end, 'tz', None), end, freq, tz, + ambiguous, nonexistent ) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -2121,7 +2123,8 @@ def _maybe_normalize_endpoints(start, end, normalize): return start, end, _normalized -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz): +def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, + nonexistent): """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2133,6 +2136,8 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz): is_not_none : argument that should not be None freq : Tick, DateOffset, or None tz : str, timezone object or None + ambiguous: str, localization behavior for ambiguous times + nonexistent: str, localization behavior for nonexistent times Returns ------- @@ -2141,10 +2146,13 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz): # Make sure start and end are timezone localized if: # 1) freq = a Timedelta-like frequency (Tick) # 2) freq = None i.e. 
generating a linspaced range - if isinstance(freq, Tick) or freq is None: - localize_args = {'tz': tz, 'ambiguous': False} - else: - localize_args = {'tz': None} if is_none is None and is_not_none is not None: + # Note: We can't ambiguous='infer' a singular ambiguous time; however, + # we have historically defaulted ambiguous=False + ambiguous = ambiguous if ambiguous != 'infer' else False + localize_args = {'ambiguous': ambiguous, 'nonexistent': nonexistent, + 'tz': None} + if isinstance(freq, Tick) or freq is None: + localize_args['tz'] = tz ts = ts.tz_localize(**localize_args) return ts diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 908d563eca8fa6..088007ba6af4b6 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -541,12 +541,9 @@ def test_dti_construction_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 - # FIXME: This next block fails to raise; it was taken from an older - # version of this test that had an indention mistake that caused it - # to not get executed. - # with pytest.raises(pytz.AmbiguousTimeError): - # date_range("2013-10-26 23:00", "2013-10-27 01:00", - # tz="Europe/London", freq="H") + with pytest.raises(pytz.AmbiguousTimeError): + date_range("2013-10-26 23:00", "2013-10-27 01:00", + tz="Europe/London", freq="H") times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous='infer') @@ -561,6 +558,26 @@ def test_dti_construction_ambiguous_endpoint(self, tz): assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', tz=tz, freq="H") + @pytest.mark.parametrize('tz, option, expected', [ + ['US/Pacific', 'shift_forward', "2019-03-10 03:00"], + ['dateutil/US/Pacific', 'shift_forward', "2019-03-10 03:00"], + ['US/Pacific', 'shift_backward', "2019-03-10 01:00"], + pytest.param('dateutil/US/Pacific', 'shift_backward', + "2019-03-10 01:00", + marks=pytest.mark.xfail(reason="GH 24329")), + ['US/Pacific', timedelta(hours=1), "2019-03-10 03:00"] + ]) + def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): + # construction with an nonexistent end-point + + with pytest.raises(pytz.NonExistentTimeError): + date_range("2019-03-10 00:00", "2019-03-10 02:00", + tz="US/Pacific", freq="H") + + times = date_range("2019-03-10 00:00", "2019-03-10 02:00", freq="H", + tz=tz, nonexistent=option) + assert times[-1] == Timestamp(expected, tz=tz, freq="H") + def test_dti_tz_localize_bdate_range(self): dr = pd.bdate_range('1/1/2009', '1/1/2010') dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) From 71379386cdbf4db38718e0f344c8b7ac7035d474 Mon Sep 17 00:00:00 2001 From: How Si Wei Date: Fri, 28 Jun 2019 22:10:20 +0800 Subject: [PATCH 084/238] ENH: Support multiple opening hours intervals for BusinessHour (#26628) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/tseries/offsets/test_offsets.py | 355 +++++++++++++++++-- pandas/tseries/offsets.py | 284 ++++++++++----- 3 files changed, 526 insertions(+), 114 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f4dd94b7d918b2..1fd0257d93f452 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -158,6 +158,7 @@ Other enhancements - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) - Added new option 
``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where ``' assert repr(self.offset6) == '' assert repr(self.offset7) == '<-2 * BusinessHours: BH=21:30-06:30>' + assert (repr(self.offset8) == + '') + assert (repr(self.offset9) == + '<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>') + assert (repr(self.offset10) == + '<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>') def test_with_offset(self): expected = Timestamp('2014-07-01 13:00') @@ -791,25 +848,59 @@ def test_with_offset(self): assert self.d + BusinessHour() * 3 == expected assert self.d + BusinessHour(n=3) == expected - def test_eq(self): - for offset in [self.offset1, self.offset2, self.offset3, self.offset4]: - assert offset == offset + @pytest.mark.parametrize("offset_name", [ + "offset1", + "offset2", + "offset3", + "offset4", + "offset8", + "offset9", + "offset10" + ]) + def test_eq_attribute(self, offset_name): + offset = getattr(self, offset_name) + assert offset == offset + + @pytest.mark.parametrize("offset1,offset2", [ + (BusinessHour(start='09:00'), BusinessHour()), + (BusinessHour(start=['23:00', '13:00'], end=['12:00', '17:00']), + BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), + ]) + def test_eq(self, offset1, offset2): + assert offset1 == offset2 - assert BusinessHour() != BusinessHour(-1) - assert BusinessHour(start='09:00') == BusinessHour() - assert BusinessHour(start='09:00') != BusinessHour(start='09:01') - assert (BusinessHour(start='09:00', end='17:00') != - BusinessHour(start='17:00', end='09:01')) + @pytest.mark.parametrize("offset1,offset2", [ + (BusinessHour(), BusinessHour(-1)), + (BusinessHour(start='09:00'), BusinessHour(start='09:01')), + (BusinessHour(start='09:00', end='17:00'), + BusinessHour(start='17:00', end='09:01')), + (BusinessHour(start=['13:00', '23:00'], end=['18:00', '07:00']), + BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), + ]) + def test_neq(self, offset1, offset2): + assert offset1 != offset2 - def test_hash(self): - for offset in [self.offset1, self.offset2, self.offset3, self.offset4]: - assert hash(offset) == hash(offset) + @pytest.mark.parametrize("offset_name", [ + "offset1", + "offset2", + "offset3", + "offset4", + "offset8", + "offset9", + "offset10" + ]) + def test_hash(self, offset_name): + offset = getattr(self, offset_name) + assert offset == offset def test_call(self): assert self.offset1(self.d) == datetime(2014, 7, 1, 11) assert self.offset2(self.d) == datetime(2014, 7, 1, 13) assert self.offset3(self.d) == datetime(2014, 6, 30, 17) assert self.offset4(self.d) == datetime(2014, 6, 30, 14) + assert self.offset8(self.d) == datetime(2014, 7, 1, 11) + assert self.offset9(self.d) == datetime(2014, 7, 1, 22) + assert self.offset10(self.d) == datetime(2014, 7, 1, 1) def test_sub(self): # we have to override test_sub here because self.offset2 is not @@ -830,6 +921,9 @@ def testRollback1(self): assert self.offset5.rollback(self.d) == datetime(2014, 6, 30, 14, 30) assert self.offset6.rollback(self.d) == datetime(2014, 7, 1, 5, 0) assert self.offset7.rollback(self.d) == datetime(2014, 7, 1, 6, 30) + assert self.offset8.rollback(self.d) == self.d + assert self.offset9.rollback(self.d) == self.d + assert self.offset10.rollback(self.d) == datetime(2014, 7, 1, 2) d = datetime(2014, 7, 1, 0) assert self.offset1.rollback(d) == datetime(2014, 6, 30, 17) @@ -839,6 +933,9 @@ def testRollback1(self): assert self.offset5.rollback(d) == datetime(2014, 
6, 30, 14, 30) assert self.offset6.rollback(d) == d assert self.offset7.rollback(d) == d + assert self.offset8.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset9.rollback(d) == d + assert self.offset10.rollback(d) == d assert self._offset(5).rollback(self.d) == self.d @@ -857,6 +954,9 @@ def testRollforward1(self): datetime(2014, 7, 1, 20, 0)) assert (self.offset7.rollforward(self.d) == datetime(2014, 7, 1, 21, 30)) + assert self.offset8.rollforward(self.d) == self.d + assert self.offset9.rollforward(self.d) == self.d + assert self.offset10.rollforward(self.d) == datetime(2014, 7, 1, 13) d = datetime(2014, 7, 1, 0) assert self.offset1.rollforward(d) == datetime(2014, 7, 1, 9) @@ -866,6 +966,9 @@ def testRollforward1(self): assert self.offset5.rollforward(d) == datetime(2014, 7, 1, 11) assert self.offset6.rollforward(d) == d assert self.offset7.rollforward(d) == d + assert self.offset8.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset9.rollforward(d) == d + assert self.offset10.rollforward(d) == d assert self._offset(5).rollforward(self.d) == self.d @@ -960,6 +1063,35 @@ def test_normalize(self, case): datetime(2014, 7, 6, 23, 0): False, datetime(2014, 7, 7, 3, 0): False})) + on_offset_cases.append((BusinessHour(start=['09:00', '13:00'], + end=['12:00', '17:00']), { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + datetime(2014, 7, 1, 12, 30): False})) + + on_offset_cases.append((BusinessHour(start=['19:00', '23:00'], + end=['21:00', '05:00']), { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + datetime(2014, 7, 4, 22): False})) + @pytest.mark.parametrize('case', on_offset_cases) def test_onOffset(self, case): offset, cases = case @@ -1125,6 +1257,76 @@ def test_onOffset(self, case): datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), datetime(2014, 7, 8, 17))})) + opening_time_cases.append(([BusinessHour(start=['11:15', '15:00'], + end=['13:00', '20:00']), + BusinessHour(n=3, start=['11:15', '15:00'], + end=['12:00', '20:00']), + BusinessHour(start=['11:15', '15:00'], + end=['13:00', '17:00']), + BusinessHour(n=2, start=['11:15', '15:00'], + end=['12:00', '03:00']), + BusinessHour(n=3, start=['11:15', '15:00'], + end=['13:00', '16:00'])], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 15)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15)), + datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15)), + datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 2, 15), + 
datetime(2014, 7, 2, 11, 15)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 15)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15)), + datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15)), + datetime(2014, 7, 7, 12): (datetime(2014, 7, 7, 15), + datetime(2014, 7, 7, 11, 15))})) + + opening_time_cases.append(([BusinessHour(n=-1, start=['17:00', '08:00'], + end=['05:00', '10:00']), + BusinessHour(n=-2, start=['08:00', '17:00'], + end=['10:00', '03:00'])], { + datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 8), + datetime(2014, 7, 1, 17)), + datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8)), + datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8)), + datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 8)), + datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17)), + datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8)), + datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 8), + datetime(2014, 7, 4, 17)), + datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8)), + datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8)), + datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8)), + datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 8))})) + @pytest.mark.parametrize('case', opening_time_cases) def test_opening_time(self, case): _offsets, cases = case @@ -1303,6 +1505,81 @@ def test_opening_time(self, case): datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20)})) + # multiple business hours + apply_cases.append((BusinessHour(start=['09:00', '14:00'], + end=['12:00', '18:00']), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), + # out of business hours + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) + + apply_cases.append((BusinessHour(n=4, start=['09:00', '14:00'], + end=['12:00', '18:00']), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 
14), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30)})) + + apply_cases.append((BusinessHour(n=-4, start=['09:00', '14:00'], + end=['12:00', '18:00']), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 12), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30)})) + + apply_cases.append((BusinessHour(n=-1, start=['19:00', '03:00'], + end=['01:00', '05:00']), { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), + datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30)})) + @pytest.mark.parametrize('case', apply_cases) def test_apply(self, case): offset, cases = case @@ -1359,6 +1636,42 @@ def test_apply(self, case): datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + # large n for multiple opening hours (3 days and 1 hour before) + apply_large_n_cases.append((BusinessHour(n=-25, start=['09:00', '14:00'], + end=['12:00', '19:00']), { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + + # 5 
days and 3 hours later + apply_large_n_cases.append((BusinessHour(28, start=['21:00', '03:00'], + end=['01:00', '04:00']), { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), + datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + @pytest.mark.parametrize('case', apply_large_n_cases) def test_apply_large_n(self, case): offset, cases = case diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ac20ad16696386..087c05574090ca 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -17,6 +17,7 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.generic import ABCPeriod +from pandas.core.dtypes.inference import is_list_like from pandas.core.tools.datetimes import to_datetime @@ -581,9 +582,44 @@ class BusinessHourMixin(BusinessMixin): def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): # must be validated here to equality check - start = liboffsets._validate_business_time(start) + if not is_list_like(start): + start = [start] + if not len(start): + raise ValueError('Must include at least 1 start time') + + if not is_list_like(end): + end = [end] + if not len(end): + raise ValueError('Must include at least 1 end time') + + start = np.array([liboffsets._validate_business_time(x) + for x in start]) + end = np.array([liboffsets._validate_business_time(x) for x in end]) + + # Validation of input + if len(start) != len(end): + raise ValueError('number of starting time and ending time ' + 'must be the same') + num_openings = len(start) + + # sort starting and ending time by starting time + index = np.argsort(start) + + # convert to tuple so that start and end are hashable + start = tuple(start[index]) + end = tuple(end[index]) + + total_secs = 0 + for i in range(num_openings): + total_secs += self._get_business_hours_by_sec(start[i], end[i]) + total_secs += self._get_business_hours_by_sec( + end[i], start[(i + 1) % num_openings]) + if total_secs != 24 * 60 * 60: + raise ValueError('invalid starting and ending time(s): ' + 'opening hours should not touch or overlap with ' + 'one another') + object.__setattr__(self, "start", start) - end = liboffsets._validate_business_time(end) object.__setattr__(self, "end", end) object.__setattr__(self, "_offset", offset) @@ -605,62 +641,93 @@ def next_bday(self): else: return BusinessDay(n=nb_offset) - @cache_readonly - def _get_daytime_flag(self): - if self.start == self.end: - raise ValueError('start and end must not be the same') - elif self.start < self.end: - return True - else: - return False - - def _next_opening_time(self, other): + def _next_opening_time(self, other, sign=1): """ - If n is positive, return tomorrow's business day opening time. - Otherwise yesterday's business day's opening time. 
+ If self.n and sign have the same sign, return the earliest opening time + later than or equal to current time. + Otherwise the latest opening time earlier than or equal to current + time. Opening time always locates on BusinessDay. - Otherwise, closing time may not if business hour extends over midnight. + However, closing time may not if business hour extends over midnight. + + Parameters + ---------- + other : datetime + Current time. + sign : int, default 1. + Either 1 or -1. Going forward in time if it has the same sign as + self.n. Going backward in time otherwise. + + Returns + ------- + result : datetime + Next opening time. """ + earliest_start = self.start[0] + latest_start = self.start[-1] + if not self.next_bday.onOffset(other): - other = other + self.next_bday + # today is not business day + other = other + sign * self.next_bday + if self.n * sign >= 0: + hour, minute = earliest_start.hour, earliest_start.minute + else: + hour, minute = latest_start.hour, latest_start.minute else: - if self.n >= 0 and self.start < other.time(): - other = other + self.next_bday - elif self.n < 0 and other.time() < self.start: - other = other + self.next_bday - return datetime(other.year, other.month, other.day, - self.start.hour, self.start.minute) + if self.n * sign >= 0: + if latest_start < other.time(): + # current time is after latest starting time in today + other = other + sign * self.next_bday + hour, minute = earliest_start.hour, earliest_start.minute + else: + # find earliest starting time no earlier than current time + for st in self.start: + if other.time() <= st: + hour, minute = st.hour, st.minute + break + else: + if other.time() < earliest_start: + # current time is before earliest starting time in today + other = other + sign * self.next_bday + hour, minute = latest_start.hour, latest_start.minute + else: + # find latest starting time no later than current time + for st in reversed(self.start): + if other.time() >= st: + hour, minute = st.hour, st.minute + break + + return datetime(other.year, other.month, other.day, hour, minute) def _prev_opening_time(self, other): """ - If n is positive, return yesterday's business day opening time. - Otherwise yesterday business day's opening time. + If n is positive, return the latest opening time earlier than or equal + to current time. + Otherwise the earliest opening time later than or equal to current + time. + + Parameters + ---------- + other : datetime + Current time. + + Returns + ------- + result : datetime + Previous opening time. """ - if not self.next_bday.onOffset(other): - other = other - self.next_bday - else: - if self.n >= 0 and other.time() < self.start: - other = other - self.next_bday - elif self.n < 0 and other.time() > self.start: - other = other - self.next_bday - return datetime(other.year, other.month, other.day, - self.start.hour, self.start.minute) + return self._next_opening_time(other, sign=-1) - @cache_readonly - def _get_business_hours_by_sec(self): + def _get_business_hours_by_sec(self, start, end): """ Return business hours in a day by seconds. 
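# --- Illustrative aside (not part of the diff) -------------------------------
# A minimal sketch of the behaviour this patch adds: BusinessHour now accepts
# lists of start/end times, modelling split opening hours.  The interval pair
# below (09:00-12:00 and 13:30-17:00) is an arbitrary example, not taken from
# the patch itself.  Assumes pandas >= 0.25.
import pandas as pd
from pandas.tseries.offsets import BusinessHour

bh = BusinessHour(start=["09:00", "13:30"], end=["12:00", "17:00"])
# 2019-06-28 is a Friday; adding one business hour from 11:30 consumes the
# 30 minutes left before 12:00, skips the midday break, and lands at 14:00.
assert pd.Timestamp("2019-06-28 11:30") + bh == pd.Timestamp("2019-06-28 14:00")
# -----------------------------------------------------------------------------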
""" - if self._get_daytime_flag: - # create dummy datetime to calculate businesshours in a day - dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) - until = datetime(2014, 4, 1, self.end.hour, self.end.minute) - return (until - dtstart).total_seconds() - else: - dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) - until = datetime(2014, 4, 2, self.end.hour, self.end.minute) - return (until - dtstart).total_seconds() + # create dummy datetime to calculate businesshours in a day + dtstart = datetime(2014, 4, 1, start.hour, start.minute) + day = 1 if start < end else 2 + until = datetime(2014, 4, day, end.hour, end.minute) + return int((until - dtstart).total_seconds()) @apply_wraps def rollback(self, dt): @@ -668,13 +735,11 @@ def rollback(self, dt): Roll provided date backward to next offset only if not on offset. """ if not self.onOffset(dt): - businesshours = self._get_business_hours_by_sec if self.n >= 0: - dt = self._prev_opening_time( - dt) + timedelta(seconds=businesshours) + dt = self._prev_opening_time(dt) else: - dt = self._next_opening_time( - dt) + timedelta(seconds=businesshours) + dt = self._next_opening_time(dt) + return self._get_closing_time(dt) return dt @apply_wraps @@ -689,11 +754,28 @@ def rollforward(self, dt): return self._prev_opening_time(dt) return dt + def _get_closing_time(self, dt): + """ + Get the closing time of a business hour interval by its opening time. + + Parameters + ---------- + dt : datetime + Opening time of a business hour interval. + + Returns + ------- + result : datetime + Corresponding closing time. + """ + for i, st in enumerate(self.start): + if st.hour == dt.hour and st.minute == dt.minute: + return dt + timedelta( + seconds=self._get_business_hours_by_sec(st, self.end[i])) + assert False + @apply_wraps def apply(self, other): - businesshours = self._get_business_hours_by_sec - bhdelta = timedelta(seconds=businesshours) - if isinstance(other, datetime): # used for detecting edge condition nanosecond = getattr(other, 'nanosecond', 0) @@ -703,63 +785,75 @@ def apply(self, other): other.hour, other.minute, other.second, other.microsecond) n = self.n + + # adjust other to reduce number of cases to handle if n >= 0: - if (other.time() == self.end or - not self._onOffset(other, businesshours)): + if (other.time() in self.end or + not self._onOffset(other)): other = self._next_opening_time(other) else: - if other.time() == self.start: + if other.time() in self.start: # adjustment to move to previous business day other = other - timedelta(seconds=1) - if not self._onOffset(other, businesshours): + if not self._onOffset(other): other = self._next_opening_time(other) - other = other + bhdelta + other = self._get_closing_time(other) + + # get total business hours by sec in one business day + businesshours = sum(self._get_business_hours_by_sec(st, en) + for st, en in zip(self.start, self.end)) bd, r = divmod(abs(n * 60), businesshours // 60) if n < 0: bd, r = -bd, -r + # adjust by business days first if bd != 0: skip_bd = BusinessDay(n=bd) # midnight business hour may not on BusinessDay if not self.next_bday.onOffset(other): - remain = other - self._prev_opening_time(other) - other = self._next_opening_time(other + skip_bd) + remain + prev_open = self._prev_opening_time(other) + remain = other - prev_open + other = prev_open + skip_bd + remain else: other = other + skip_bd - hours, minutes = divmod(r, 60) - result = other + timedelta(hours=hours, minutes=minutes) - - # because of previous adjustment, time will be larger 
than start - if n >= 0: - bday_edge = self._prev_opening_time(other) + bhdelta - if bday_edge < result: - bday_remain = result - bday_edge - result = self._next_opening_time(other) - result += bday_remain - else: - bday_edge = self._next_opening_time(other) - if bday_edge > result: - bday_remain = result - bday_edge - result = self._next_opening_time(result) + bhdelta - result += bday_remain + # remaining business hours to adjust + bhour_remain = timedelta(minutes=r) - # edge handling if n >= 0: - if result.time() == self.end: - result = self._next_opening_time(result) + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = self._get_closing_time( + self._prev_opening_time(other)) - other + if bhour_remain < bhour: + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) + else: + # go to next business time interval + bhour_remain -= bhour + other = self._next_opening_time(other + bhour) else: - if result.time() == self.start and nanosecond == 0: - # adjustment to move to previous business day - result = self._next_opening_time( - result - timedelta(seconds=1)) + bhdelta + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = self._next_opening_time(other) - other + if (bhour_remain > bhour or + bhour_remain == bhour and nanosecond != 0): + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) + else: + # go to next business time interval + bhour_remain -= bhour + other = self._get_closing_time( + self._next_opening_time( + other + bhour - timedelta(seconds=1))) - return result + return other else: - # TODO: Figure out the end of this sente raise ApplyTypeError( - 'Only know how to combine business hour with ') + 'Only know how to combine business hour with datetime') def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -770,10 +864,9 @@ def onOffset(self, dt): dt.minute, dt.second, dt.microsecond) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time - businesshours = self._get_business_hours_by_sec - return self._onOffset(dt, businesshours) + return self._onOffset(dt) - def _onOffset(self, dt, businesshours): + def _onOffset(self, dt): """ Slight speedups using calculated values. """ @@ -786,6 +879,11 @@ def _onOffset(self, dt, businesshours): else: op = self._next_opening_time(dt) span = (dt - op).total_seconds() + businesshours = 0 + for i, st in enumerate(self.start): + if op.hour == st.hour and op.minute == st.minute: + businesshours = self._get_business_hours_by_sec( + st, self.end[i]) if span <= businesshours: return True else: @@ -793,17 +891,17 @@ def _onOffset(self, dt, businesshours): def _repr_attrs(self): out = super()._repr_attrs() - start = self.start.strftime('%H:%M') - end = self.end.strftime('%H:%M') - attrs = ['{prefix}={start}-{end}'.format(prefix=self._prefix, - start=start, end=end)] + hours = ','.join('{}-{}'.format( + st.strftime('%H:%M'), en.strftime('%H:%M')) + for st, en in zip(self.start, self.end)) + attrs = ['{prefix}={hours}'.format(prefix=self._prefix, hours=hours)] out += ': ' + ', '.join(attrs) return out class BusinessHour(BusinessHourMixin, SingleConstructorOffset): """ - DateOffset subclass representing possibly n business days. + DateOffset subclass representing possibly n business hours. .. 
versionadded:: 0.16.1 """ From 45ea26763da832189747ac9f86630fece84b4f18 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Fri, 28 Jun 2019 22:12:23 +0800 Subject: [PATCH 085/238] =?UTF-8?q?[CI]=20Add=20pytest-azurepipelines=20in?= =?UTF-8?q?=20=E2=80=8Bpandas-dev=20(#26620)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ci/deps/azure-35-compat.yaml | 1 + ci/deps/azure-36-locale.yaml | 1 + ci/deps/azure-36-locale_slow.yaml | 1 + ci/deps/azure-37-locale.yaml | 1 + ci/deps/azure-37-numpydev.yaml | 1 + ci/deps/azure-macos-35.yaml | 1 + ci/deps/azure-windows-36.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + 8 files changed, 8 insertions(+) diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml index c783670e78d529..fe207d122657bd 100644 --- a/ci/deps/azure-35-compat.yaml +++ b/ci/deps/azure-35-compat.yaml @@ -22,6 +22,7 @@ dependencies: - hypothesis>=3.58.0 - pytest-xdist - pytest-mock + - pytest-azurepipelines - pip - pip: # for python 3.5, pytest>=4.0.2 is not available in conda diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index fbb240734d45df..99fa4d5c9e160f 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -23,6 +23,7 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock + - pytest-azurepipelines - hypothesis>=3.58.0 - pip - pip: diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 9ddc782da930e4..2bf2bd74795d28 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -29,6 +29,7 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock + - pytest-azurepipelines - moto - pip - pip: diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 2ebb7dda86e366..bd8ba912d52980 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -28,6 +28,7 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock + - pytest-azurepipelines - pip - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 831f13fb421f03..c56dc819a90b1e 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -17,3 +17,4 @@ dependencies: - "--pre" - "numpy" - "scipy" + - pytest-azurepipelines diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 24c753e16d98dd..0b96dd9762ef5d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -29,3 +29,4 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index b1795059091b92..b0f3f5389ac854 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -26,4 +26,5 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock + - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bdc29e0eec802..43504dec269533 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -28,6 +28,7 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock + - pytest-azurepipelines - moto - hypothesis>=3.58.0 - pyreadstat From 6af58407cd5ac56ca11f5dffc1fd9b636ad68fb6 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 28 Jun 2019 17:15:02 +0200 Subject: [PATCH 086/238] Fixturize tests/frame/test_dtypes.py (#25636) --- pandas/tests/frame/conftest.py | 12 +++ 
pandas/tests/frame/test_dtypes.py | 126 ++++++++++++++++-------------- 2 files changed, 79 insertions(+), 59 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 61a8ea0c384ba3..3232c400bd8ce5 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -231,6 +231,18 @@ def mixed_int_frame(): return df +@pytest.fixture +def mixed_type_frame(): + """ + Fixture for DataFrame of float/int/string columns with RangeIndex + Columns are ['a', 'b', 'c', 'float32', 'int32']. + """ + return DataFrame({'a': 1., 'b': 2, 'c': 'foo', + 'float32': np.array([1.] * 10, dtype='float32'), + 'int32': np.array([1] * 10, dtype='int32')}, + index=np.arange(10)) + + @pytest.fixture def timezone_frame(): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 7ed601e4f70461..f68770d796292b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -11,13 +11,19 @@ Categorical, DataFrame, Series, Timedelta, Timestamp, _np_version_under1p14, concat, date_range, option_context) from pandas.core.arrays import integer_array -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf) -class TestDataFrameDataTypes(TestData): +def _check_cast(df, v): + """ + Check if all dtypes of df are equal to v + """ + assert all(s.dtype.name == v for _, s in df.items()) + + +class TestDataFrameDataTypes: def test_concat_empty_dataframe_dtypes(self): df = DataFrame(columns=list("abc")) @@ -400,10 +406,10 @@ def test_select_dtypes_typecodes(self): FLOAT_TYPES = list(np.typecodes['AllFloat']) assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected) - def test_dtypes_gh8722(self): - self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 - result = self.mixed_frame.dtypes - expected = Series({k: v.dtype for k, v in self.mixed_frame.items()}, + def test_dtypes_gh8722(self, float_string_frame): + float_string_frame['bool'] = float_string_frame['A'] > 0 + result = float_string_frame.dtypes + expected = Series({k: v.dtype for k, v in float_string_frame.items()}, index=result.index) assert_series_equal(result, expected) @@ -413,8 +419,8 @@ def test_dtypes_gh8722(self): result = df.dtypes assert_series_equal(result, Series({0: np.dtype('int64')})) - def test_ftypes(self): - frame = self.mixed_float + def test_ftypes(self, mixed_float_frame): + frame = mixed_float_frame expected = Series(dict(A='float32:dense', B='float32:dense', C='float16:dense', @@ -425,32 +431,39 @@ def test_ftypes(self): result = frame.ftypes.sort_values() assert_series_equal(result, expected) - def test_astype(self): - casted = self.frame.astype(int) - expected = DataFrame(self.frame.values.astype(int), - index=self.frame.index, - columns=self.frame.columns) + def test_astype_float(self, float_frame): + casted = float_frame.astype(int) + expected = DataFrame(float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns) assert_frame_equal(casted, expected) - casted = self.frame.astype(np.int32) - expected = DataFrame(self.frame.values.astype(np.int32), - index=self.frame.index, - columns=self.frame.columns) + casted = float_frame.astype(np.int32) + expected = DataFrame(float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns) assert_frame_equal(casted, expected) - self.frame['foo'] = '5' - casted = self.frame.astype(int) - expected = 
DataFrame(self.frame.values.astype(int), - index=self.frame.index, - columns=self.frame.columns) + float_frame['foo'] = '5' + casted = float_frame.astype(int) + expected = DataFrame(float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns) assert_frame_equal(casted, expected) + def test_astype_mixed_float(self, mixed_float_frame): # mixed casting - def _check_cast(df, v): - assert (list({s.dtype.name for - _, s in df.items()})[0] == v) + casted = mixed_float_frame.reindex( + columns=['A', 'B']).astype('float32') + _check_cast(casted, 'float32') + + casted = mixed_float_frame.reindex( + columns=['A', 'B']).astype('float16') + _check_cast(casted, 'float16') - mn = self.all_mixed._get_numeric_data().copy() + def test_astype_mixed_type(self, mixed_type_frame): + # mixed casting + mn = mixed_type_frame._get_numeric_data().copy() mn['little_float'] = np.array(12345., dtype='float16') mn['big_float'] = np.array(123456789101112., dtype='float64') @@ -460,15 +473,9 @@ def _check_cast(df, v): casted = mn.astype('int64') _check_cast(casted, 'int64') - casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32') - _check_cast(casted, 'float32') - casted = mn.reindex(columns=['little_float']).astype('float16') _check_cast(casted, 'float16') - casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16') - _check_cast(casted, 'float16') - casted = mn.astype('float32') _check_cast(casted, 'float32') @@ -479,39 +486,40 @@ def _check_cast(df, v): casted = mn.astype('O') _check_cast(casted, 'object') - def test_astype_with_exclude_string(self): - df = self.frame.copy() - expected = self.frame.astype(int) + def test_astype_with_exclude_string(self, float_frame): + df = float_frame.copy() + expected = float_frame.astype(int) df['string'] = 'foo' casted = df.astype(int, errors='ignore') expected['string'] = 'foo' assert_frame_equal(casted, expected) - df = self.frame.copy() - expected = self.frame.astype(np.int32) + df = float_frame.copy() + expected = float_frame.astype(np.int32) df['string'] = 'foo' casted = df.astype(np.int32, errors='ignore') expected['string'] = 'foo' assert_frame_equal(casted, expected) - def test_astype_with_view(self): - - tf = self.mixed_float.reindex(columns=['A', 'B', 'C']) - - casted = tf.astype(np.int64) - - casted = tf.astype(np.float32) + def test_astype_with_view_float(self, float_frame): # this is the only real reason to do it this way - tf = np.round(self.frame).astype(np.int32) + tf = np.round(float_frame).astype(np.int32) casted = tf.astype(np.float32, copy=False) # TODO(wesm): verification? 
- tf = self.frame.astype(np.float64) + tf = float_frame.astype(np.float64) casted = tf.astype(np.int64, copy=False) # noqa + def test_astype_with_view_mixed_float(self, mixed_float_frame): + + tf = mixed_float_frame.reindex(columns=['A', 'B', 'C']) + + casted = tf.astype(np.int64) + casted = tf.astype(np.float32) # noqa + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): @@ -927,12 +935,12 @@ def test_asarray_homogenous(self): tm.assert_numpy_array_equal(result, expected) -class TestDataFrameDatetimeWithTZ(TestData): +class TestDataFrameDatetimeWithTZ: - def test_interleave(self): + def test_interleave(self, timezone_frame): # interleave with object - result = self.tzframe.assign(D='foo').values + result = timezone_frame.assign(D='foo').values expected = np.array([[Timestamp('2013-01-01 00:00:00'), Timestamp('2013-01-02 00:00:00'), Timestamp('2013-01-03 00:00:00')], @@ -948,7 +956,7 @@ def test_interleave(self): tm.assert_numpy_array_equal(result, expected) # interleave with only datetime64[ns] - result = self.tzframe.values + result = timezone_frame.values expected = np.array([[Timestamp('2013-01-01 00:00:00'), Timestamp('2013-01-02 00:00:00'), Timestamp('2013-01-03 00:00:00')], @@ -963,7 +971,7 @@ def test_interleave(self): tz='CET')]], dtype=object).T tm.assert_numpy_array_equal(result, expected) - def test_astype(self): + def test_astype(self, timezone_frame): # astype expected = np.array([[Timestamp('2013-01-01 00:00:00'), Timestamp('2013-01-02 00:00:00'), @@ -979,12 +987,12 @@ def test_astype(self): tz='CET')]], dtype=object).T expected = DataFrame(expected, - index=self.tzframe.index, - columns=self.tzframe.columns, dtype=object) - result = self.tzframe.astype(object) + index=timezone_frame.index, + columns=timezone_frame.columns, dtype=object) + result = timezone_frame.astype(object) assert_frame_equal(result, expected) - result = self.tzframe.astype('datetime64[ns]') + result = timezone_frame.astype('datetime64[ns]') expected = DataFrame({'A': date_range('20130101', periods=3), 'B': (date_range('20130101', periods=3, tz='US/Eastern') @@ -998,19 +1006,19 @@ def test_astype(self): expected.iloc[1, 2] = pd.NaT assert_frame_equal(result, expected) - def test_astype_str(self): + def test_astype_str(self, timezone_frame): # str formatting - result = self.tzframe.astype(str) + result = timezone_frame.astype(str) expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00', '2013-01-01 00:00:00+01:00'], ['2013-01-02', 'NaT', 'NaT'], ['2013-01-03', '2013-01-03 00:00:00-05:00', '2013-01-03 00:00:00+01:00']], - columns=self.tzframe.columns) + columns=timezone_frame.columns) tm.assert_frame_equal(result, expected) with option_context('display.max_columns', 20): - result = str(self.tzframe) + result = str(timezone_frame) assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 ' '2013-01-01 00:00:00+01:00') in result assert ('1 2013-01-02 ' From 4ef793f2658bd7c8752e604548e55c2bcdd82d7b Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Fri, 28 Jun 2019 17:15:58 +0200 Subject: [PATCH 087/238] Fixturize tests/frame/test_constructors.py (#25635) --- pandas/tests/frame/test_constructors.py | 150 +++++++++++++----------- 1 file changed, 81 insertions(+), 69 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 434ee2f8bf0afe..981dc8b32b8cc5 100644 --- a/pandas/tests/frame/test_constructors.py 
+++ b/pandas/tests/frame/test_constructors.py @@ -17,7 +17,6 @@ from pandas import ( Categorical, DataFrame, Index, MultiIndex, RangeIndex, Series, Timedelta, Timestamp, date_range, isna) -from pandas.tests.frame.common import TestData import pandas.util.testing as tm MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] @@ -25,7 +24,7 @@ 'int32', 'int64'] -class TestDataFrameConstructors(TestData): +class TestDataFrameConstructors: @pytest.mark.parametrize('constructor', [ lambda: DataFrame(), @@ -60,14 +59,14 @@ def test_emptylike_constructor( result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self): + def test_constructor_mixed(self, float_string_frame): index, data = tm.getMixedTypeDict() # TODO(wesm), incomplete test? indexed_frame = DataFrame(data, index=index) # noqa unindexed_frame = DataFrame(data) # noqa - assert self.mixed_frame['foo'].dtype == np.object_ + assert float_string_frame['foo'].dtype == np.object_ def test_constructor_cast_failure(self): foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) @@ -181,11 +180,11 @@ def test_constructor_dtype_str_na_values(self, string_dtype): df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) - def test_constructor_rec(self): - rec = self.frame.to_records(index=False) + def test_constructor_rec(self, float_frame): + rec = float_frame.to_records(index=False) rec.dtype.names = list(rec.dtype.names)[::-1] - index = self.frame.index + index = float_frame.index df = DataFrame(rec) tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) @@ -244,24 +243,29 @@ def test_constructor_ordereddict(self): assert expected == list(df.columns) def test_constructor_dict(self): - frame = DataFrame({'col1': self.ts1, - 'col2': self.ts2}) + datetime_series = tm.makeTimeSeries(nper=30) + # test expects index shifted by 5 + datetime_series_short = tm.makeTimeSeries(nper=30)[5:] + + frame = DataFrame({'col1': datetime_series, + 'col2': datetime_series_short}) # col2 is padded with NaN - assert len(self.ts1) == 30 - assert len(self.ts2) == 25 + assert len(datetime_series) == 30 + assert len(datetime_series_short) == 25 - tm.assert_series_equal(self.ts1, frame['col1'], check_names=False) + tm.assert_series_equal(frame['col1'], datetime_series.rename('col1')) - exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]), - index=self.ts1.index, name='col2') + exp = pd.Series(np.concatenate([[np.nan] * 5, + datetime_series_short.values]), + index=datetime_series.index, name='col2') tm.assert_series_equal(exp, frame['col2']) - frame = DataFrame({'col1': self.ts1, - 'col2': self.ts2}, + frame = DataFrame({'col1': datetime_series, + 'col2': datetime_series_short}, columns=['col2', 'col3', 'col4']) - assert len(frame) == len(self.ts2) + assert len(frame) == len(datetime_series_short) assert 'col1' not in frame assert isna(frame['col3']).all() @@ -361,18 +365,24 @@ def test_constructor_dict_nan_tuple_key(self, value): @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') def test_constructor_dict_order_insertion(self): + datetime_series = tm.makeTimeSeries(nper=30) + datetime_series_short = tm.makeTimeSeries(nper=25) + # GH19018 # initialization ordering: by insertion order if python>= 3.6 - d = {'b': self.ts2, 'a': self.ts1} + d = {'b': datetime_series_short, 'a': datetime_series} frame = DataFrame(data=d) expected = DataFrame(data=d, columns=list('ba')) tm.assert_frame_equal(frame, expected) @pytest.mark.skipif(PY36, reason='order by value for 
Python<3.6') def test_constructor_dict_order_by_values(self): + datetime_series = tm.makeTimeSeries(nper=30) + datetime_series_short = tm.makeTimeSeries(nper=25) + # GH19018 # initialization ordering: by value if python<3.6 - d = {'b': self.ts2, 'a': self.ts1} + d = {'b': datetime_series_short, 'a': datetime_series} frame = DataFrame(data=d) expected = DataFrame(data=d, columns=list('ab')) tm.assert_frame_equal(frame, expected) @@ -462,7 +472,7 @@ def test_constructor_with_embedded_frames(self): result = df2.loc[1, 0] tm.assert_frame_equal(result, df1 + 10) - def test_constructor_subclass_dict(self): + def test_constructor_subclass_dict(self, float_frame): # Test for passing dict subclass to constructor data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} @@ -478,13 +488,13 @@ def test_constructor_subclass_dict(self): # try with defaultdict from collections import defaultdict data = {} - self.frame['B'][:10] = np.nan - for k, v in self.frame.items(): + float_frame['B'][:10] = np.nan + for k, v in float_frame.items(): dct = defaultdict(dict) dct.update(v.to_dict()) data[k] = dct frame = DataFrame(data) - tm.assert_frame_equal(self.frame.sort_index(), frame) + tm.assert_frame_equal(float_frame.sort_index(), frame) def test_constructor_dict_block(self): expected = np.array([[4., 3., 2., 1.]]) @@ -923,14 +933,14 @@ def test_constructor_arrays_and_scalars(self): with pytest.raises(ValueError, match='must pass an index'): DataFrame({'a': False, 'b': True}) - def test_constructor_DataFrame(self): - df = DataFrame(self.frame) - tm.assert_frame_equal(df, self.frame) + def test_constructor_DataFrame(self, float_frame): + df = DataFrame(float_frame) + tm.assert_frame_equal(df, float_frame) - df_casted = DataFrame(self.frame, dtype=np.int64) + df_casted = DataFrame(float_frame, dtype=np.int64) assert df_casted.values.dtype == np.int64 - def test_constructor_more(self): + def test_constructor_more(self, float_frame): # used to be in test_matrix.py arr = np.random.randn(10) dm = DataFrame(arr, columns=['A'], index=np.arange(10)) @@ -956,8 +966,8 @@ def test_constructor_more(self): with pytest.raises(ValueError, match='cast'): DataFrame(mat, index=[0, 1], columns=[0], dtype=float) - dm = DataFrame(DataFrame(self.frame._series)) - tm.assert_frame_equal(dm, self.frame) + dm = DataFrame(DataFrame(float_frame._series)) + tm.assert_frame_equal(dm, float_frame) # int cast dm = DataFrame({'A': np.ones(10, dtype=int), @@ -1223,8 +1233,9 @@ def test_constructor_scalar(self): expected = DataFrame({"a": [0, 0, 0]}, index=idx) tm.assert_frame_equal(df, expected, check_dtype=False) - def test_constructor_Series_copy_bug(self): - df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) + def test_constructor_Series_copy_bug(self, float_frame): + df = DataFrame(float_frame['A'], index=float_frame.index, + columns=['A']) df.copy() def test_constructor_mixed_dict_and_Series(self): @@ -1286,10 +1297,10 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples, columns=['y', 'z']) tm.assert_frame_equal(result, expected) - def test_constructor_orient(self): - data_dict = self.mixed_frame.T._series + def test_constructor_orient(self, float_string_frame): + data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient='index') - expected = self.mixed_frame.sort_index() + expected = float_string_frame.sort_index() tm.assert_frame_equal(recons, expected) # dict of sequence @@ -1393,38 +1404,38 @@ def 
test_constructor_Series_differently_indexed(self): tm.assert_index_equal(df2.index, other_index) tm.assert_frame_equal(df2, exp2) - def test_constructor_manager_resize(self): - index = list(self.frame.index[:5]) - columns = list(self.frame.columns[:3]) + def test_constructor_manager_resize(self, float_frame): + index = list(float_frame.index[:5]) + columns = list(float_frame.columns[:3]) - result = DataFrame(self.frame._data, index=index, + result = DataFrame(float_frame._data, index=index, columns=columns) tm.assert_index_equal(result.index, Index(index)) tm.assert_index_equal(result.columns, Index(columns)) - def test_constructor_from_items(self): - items = [(c, self.frame[c]) for c in self.frame.columns] + def test_constructor_from_items(self, float_frame, float_string_frame): + items = [(c, float_frame[c]) for c in float_frame.columns] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): recons = DataFrame.from_items(items) - tm.assert_frame_equal(recons, self.frame) + tm.assert_frame_equal(recons, float_frame) # pass some columns with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) - tm.assert_frame_equal(recons, self.frame.loc[:, ['C', 'B', 'A']]) + tm.assert_frame_equal(recons, float_frame.loc[:, ['C', 'B', 'A']]) # orient='index' - row_items = [(idx, self.mixed_frame.xs(idx)) - for idx in self.mixed_frame.index] + row_items = [(idx, float_string_frame.xs(idx)) + for idx in float_string_frame.index] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): recons = DataFrame.from_items(row_items, - columns=self.mixed_frame.columns, + columns=float_string_frame.columns, orient='index') - tm.assert_frame_equal(recons, self.mixed_frame) + tm.assert_frame_equal(recons, float_string_frame) assert recons['A'].dtype == np.float64 msg = "Must pass columns with orient='index'" @@ -1435,16 +1446,16 @@ def test_constructor_from_items(self): # orient='index', but thar be tuples arr = construct_1d_object_array_from_listlike( - [('bar', 'baz')] * len(self.mixed_frame)) - self.mixed_frame['foo'] = arr - row_items = [(idx, list(self.mixed_frame.xs(idx))) - for idx in self.mixed_frame.index] + [('bar', 'baz')] * len(float_string_frame)) + float_string_frame['foo'] = arr + row_items = [(idx, list(float_string_frame.xs(idx))) + for idx in float_string_frame.index] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): recons = DataFrame.from_items(row_items, - columns=self.mixed_frame.columns, + columns=float_string_frame.columns, orient='index') - tm.assert_frame_equal(recons, self.mixed_frame) + tm.assert_frame_equal(recons, float_string_frame) assert isinstance(recons['foo'][0], tuple) with tm.assert_produces_warning(FutureWarning, @@ -1485,14 +1496,15 @@ def test_from_items_deprecation(self): columns=['col1', 'col2', 'col3'], orient='index') - def test_constructor_mix_series_nonseries(self): - df = DataFrame({'A': self.frame['A'], - 'B': list(self.frame['B'])}, columns=['A', 'B']) - tm.assert_frame_equal(df, self.frame.loc[:, ['A', 'B']]) + def test_constructor_mix_series_nonseries(self, float_frame): + df = DataFrame({'A': float_frame['A'], + 'B': list(float_frame['B'])}, columns=['A', 'B']) + tm.assert_frame_equal(df, float_frame.loc[:, ['A', 'B']]) msg = 'does not match index length' with pytest.raises(ValueError, match=msg): - DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]}) + DataFrame({'A': float_frame['A'], + 'B': list(float_frame['B'])[:-2]}) 
def test_constructor_miscast_na_int_dtype(self): df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) @@ -1752,24 +1764,24 @@ def test_constructor_for_list_with_dtypes(self): expected = expected.sort_index() tm.assert_series_equal(result, expected) - def test_constructor_frame_copy(self): - cop = DataFrame(self.frame, copy=True) + def test_constructor_frame_copy(self, float_frame): + cop = DataFrame(float_frame, copy=True) cop['A'] = 5 assert (cop['A'] == 5).all() - assert not (self.frame['A'] == 5).all() + assert not (float_frame['A'] == 5).all() - def test_constructor_ndarray_copy(self): - df = DataFrame(self.frame.values) + def test_constructor_ndarray_copy(self, float_frame): + df = DataFrame(float_frame.values) - self.frame.values[5] = 5 + float_frame.values[5] = 5 assert (df.values[5] == 5).all() - df = DataFrame(self.frame.values, copy=True) - self.frame.values[6] = 6 + df = DataFrame(float_frame.values, copy=True) + float_frame.values[6] = 6 assert not (df.values[6] == 6).all() - def test_constructor_series_copy(self): - series = self.frame._series + def test_constructor_series_copy(self, float_frame): + series = float_frame._series df = DataFrame({'A': series['A']}) df['A'][:] = 5 @@ -2318,7 +2330,7 @@ class List(list): tm.assert_frame_equal(result, expected) -class TestDataFrameConstructorWithDatetimeTZ(TestData): +class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): From a272b60aabc2a3b4128a7365e9b2b0080bcd5121 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 28 Jun 2019 16:37:39 +0100 Subject: [PATCH 088/238] STYLE: Isort __init__ files (#26749) --- pandas/_libs/__init__.py | 2 +- pandas/_libs/tslibs/__init__.py | 6 +- pandas/api/__init__.py | 2 +- pandas/api/extensions/__init__.py | 20 ++--- pandas/api/types/__init__.py | 13 ++-- pandas/arrays/__init__.py | 8 +- pandas/core/arrays/__init__.py | 24 +++--- pandas/core/frame.py | 88 +++++++--------------- pandas/core/internals/__init__.py | 27 ++++--- pandas/io/clipboard/__init__.py | 7 +- pandas/io/excel/__init__.py | 2 +- pandas/io/json/__init__.py | 2 +- pandas/tests/extension/base/__init__.py | 10 ++- pandas/tests/extension/decimal/__init__.py | 3 +- pandas/util/__init__.py | 6 +- setup.cfg | 23 +----- 16 files changed, 97 insertions(+), 146 deletions(-) diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 1f6042389416ec..fcf5ffbfcad92a 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,4 @@ # flake8: noqa from .tslibs import ( - iNaT, NaT, NaTType, Timestamp, Timedelta, OutOfBoundsDatetime, Period) + NaT, NaTType, OutOfBoundsDatetime, Period, Timedelta, Timestamp, iNaT) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 21ba0ae06a036b..67a323782a836a 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,9 +1,9 @@ # flake8: noqa -from .conversion import normalize_date, localize_pydatetime +from .conversion import localize_pydatetime, normalize_date from .nattype import NaT, NaTType, iNaT, is_null_datetimelike from .np_datetime import OutOfBoundsDatetime -from .period import Period, IncompatibleFrequency +from .period import IncompatibleFrequency, Period +from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp -from .timedeltas import delta_to_nanoseconds, ints_to_pytimedelta, Timedelta from .tzconversion import tz_convert_single diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index afff059e7b6018..58422811990c47 100644 
--- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,2 +1,2 @@ """ public toolkit API """ -from . import types, extensions # noqa +from . import extensions, types # noqa diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index cb6241016d82f2..0bd2733cb494cc 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,10 +1,12 @@ """Public API for extending pandas objects.""" -from pandas.core.accessor import (register_dataframe_accessor, # noqa - register_index_accessor, - register_series_accessor) -from pandas.core.algorithms import take # noqa -from pandas.core.arrays import (ExtensionArray, # noqa - ExtensionScalarOpsMixin) -from pandas.core.dtypes.dtypes import ( # noqa - ExtensionDtype, register_extension_dtype -) +from pandas.core.dtypes.dtypes import ( # noqa: F401 + ExtensionDtype, register_extension_dtype) + +from pandas.core.accessor import ( # noqa: F401 + register_index_accessor, register_series_accessor) +from pandas.core.algorithms import take # noqa: F401 +from pandas.core.arrays import ( # noqa: F401 + ExtensionArray, ExtensionScalarOpsMixin) + +from pandas.core.accessor import ( # noqa: F401; noqa: F401 + register_dataframe_accessor) diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index 438e4afa3f5807..668f79921d8e61 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -1,9 +1,8 @@ """ public toolkit API """ -from pandas.core.dtypes.api import * # noqa -from pandas.core.dtypes.dtypes import (CategoricalDtype, # noqa - DatetimeTZDtype, - PeriodDtype, - IntervalDtype) -from pandas.core.dtypes.concat import union_categoricals # noqa -from pandas._libs.lib import infer_dtype # noqa +from pandas._libs.lib import infer_dtype # noqa: F401 + +from pandas.core.dtypes.api import * # noqa: F403, F401 +from pandas.core.dtypes.concat import union_categoricals # noqa: F401 +from pandas.core.dtypes.dtypes import ( # noqa: F401 + CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 7d9b1b7c7a6597..ab014d49236b3c 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,12 +4,8 @@ See :ref:`extending.extension-types` for more. 
""" from pandas.core.arrays import ( - IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray, - PandasArray, - DatetimeArray, - TimedeltaArray, -) - + Categorical, DatetimeArray, IntegerArray, IntervalArray, PandasArray, + PeriodArray, SparseArray, TimedeltaArray) __all__ = [ 'Categorical', diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1033ce784046e8..2d09a9eac6eab1 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,13 +1,11 @@ -from .array_ import array # noqa -from .base import (ExtensionArray, # noqa - ExtensionOpsMixin, - ExtensionScalarOpsMixin) -from .categorical import Categorical # noqa -from .datetimes import DatetimeArray # noqa -from .interval import IntervalArray # noqa -from .period import PeriodArray, period_array # noqa -from .timedeltas import TimedeltaArray # noqa -from .integer import ( # noqa - IntegerArray, integer_array) -from .sparse import SparseArray # noqa -from .numpy_ import PandasArray, PandasDtype # noqa +from .array_ import array # noqa: F401 +from .base import ( # noqa: F401 + ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin) +from .categorical import Categorical # noqa: F401 +from .datetimes import DatetimeArray # noqa: F401 +from .integer import IntegerArray, integer_array # noqa: F401 +from .interval import IntervalArray # noqa: F401 +from .numpy_ import PandasArray, PandasDtype # noqa: F401 +from .period import PeriodArray, period_array # noqa: F401 +from .sparse import SparseArray # noqa: F401 +from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd2e1e3e41ced3..df7003ecf000e9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,93 +14,61 @@ from io import StringIO import itertools import sys -import warnings from textwrap import dedent from typing import FrozenSet, List, Optional, Set, Type, Union +import warnings import numpy as np import numpy.ma as ma from pandas._config import get_option -from pandas._libs import lib, algos as libalgos - -from pandas.util._decorators import (Appender, Substitution, - rewrite_axis_style_signature, - deprecate_kwarg) -from pandas.util._validators import (validate_bool_kwarg, - validate_axis_style_args) - +from pandas._libs import algos as libalgos, lib from pandas.compat import PY36, raise_with_traceback from pandas.compat.numpy import function as nv -from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.util._decorators import ( + Appender, Substitution, deprecate_kwarg, rewrite_axis_style_signature) +from pandas.util._validators import ( + validate_axis_style_args, validate_bool_kwarg) + from pandas.core.dtypes.cast import ( - maybe_upcast, - cast_scalar_to_array, - infer_dtype_from_scalar, - maybe_cast_to_datetime, - maybe_infer_to_datetimelike, - maybe_convert_platform, - maybe_downcast_to_dtype, - invalidate_string_dtypes, - coerce_to_dtypes, - maybe_upcast_putmask, - find_common_type) + cast_scalar_to_array, coerce_to_dtypes, find_common_type, + infer_dtype_from_scalar, invalidate_string_dtypes, maybe_cast_to_datetime, + maybe_convert_platform, maybe_downcast_to_dtype, + maybe_infer_to_datetimelike, maybe_upcast, maybe_upcast_putmask) from pandas.core.dtypes.common import ( - is_dict_like, - is_datetime64tz_dtype, - is_object_dtype, - is_extension_type, - is_extension_array_dtype, - is_datetime64_any_dtype, - is_bool_dtype, - is_integer_dtype, - is_float_dtype, - is_integer, - is_scalar, - is_dtype_equal, - 
needs_i8_conversion, - infer_dtype_from_object, - ensure_float64, - ensure_int64, - ensure_platform_int, - is_list_like, - is_nested_list_like, - is_iterator, - is_sequence, - is_named_tuple) + ensure_float64, ensure_int64, ensure_platform_int, infer_dtype_from_object, + is_bool_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, + is_dict_like, is_dtype_equal, is_extension_array_dtype, is_extension_type, + is_float_dtype, is_integer, is_integer_dtype, is_iterator, is_list_like, + is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, + is_sequence, needs_i8_conversion) from pandas.core.dtypes.generic import ( - ABCSeries, ABCDataFrame, ABCIndexClass, ABCMultiIndex) + ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms -from pandas.core import common as com -from pandas.core import nanops -from pandas.core import ops +from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin as DatetimeLikeArray -) + DatetimeLikeArrayMixin as DatetimeLikeArray) +from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import (Index, MultiIndex, ensure_index, - ensure_index_from_sequences) +from pandas.core.index import ( + Index, MultiIndex, ensure_index, ensure_index_from_sequences) from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, - check_bool_indexer) +from pandas.core.indexing import ( + check_bool_indexer, convert_to_index_sliceable, maybe_droplevels) from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( - masked_rec_array_to_mgr, get_names_from_index, to_arrays, - reorder_arrays, init_ndarray, init_dict, - arrays_to_mgr, sanitize_index) + arrays_to_mgr, get_names_from_index, init_dict, init_ndarray, + masked_rec_array_to_mgr, reorder_arrays, sanitize_index, to_arrays) from pandas.core.series import Series -from pandas.io.formats import console -from pandas.io.formats import format as fmt +from pandas.io.formats import console, format as fmt from pandas.io.formats.printing import pprint_thing - import pandas.plotting # --------------------------------------------------------------------- diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index d24dd2edd4e1d5..b9530e15f71e22 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,12 +1,15 @@ -from .blocks import ( # noqa:F401 - _block_shape, # io.pytables - _safe_reshape, # io.packers - make_block, # io.pytables, io.packers - FloatBlock, IntBlock, ComplexBlock, BoolBlock, ObjectBlock, - TimeDeltaBlock, DatetimeBlock, DatetimeTZBlock, - CategoricalBlock, ExtensionBlock, Block) -from .managers import ( # noqa:F401 - BlockManager, SingleBlockManager, - create_block_manager_from_arrays, create_block_manager_from_blocks, - items_overlap_with_suffix, # reshape.merge - concatenate_block_managers) # reshape.concat, reshape.merge +from .blocks import ( # noqa: F401 + Block, BoolBlock, CategoricalBlock, ComplexBlock, DatetimeBlock, + DatetimeTZBlock, ExtensionBlock, FloatBlock, IntBlock, ObjectBlock, + 
TimeDeltaBlock) +from .managers import ( # noqa: F401 + BlockManager, SingleBlockManager, create_block_manager_from_arrays, + create_block_manager_from_blocks) + +from .blocks import _safe_reshape # noqa: F401; io.packers +from .blocks import make_block # noqa: F401; io.pytables, io.packers +from .managers import ( # noqa: F401; reshape.concat, reshape.merge + concatenate_block_managers) +from .managers import items_overlap_with_suffix # noqa: F401; reshape.merge + +from .blocks import _block_shape # noqa:F401; io.pytables diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 2063978c76c5ac..e033d882a73f7e 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -25,12 +25,13 @@ """ __version__ = '1.5.27' -import platform import os +import platform import subprocess + from .clipboards import ( - init_osx_clipboard, init_qt_clipboard, init_xclip_clipboard, - init_xsel_clipboard, init_klipper_clipboard, init_no_clipboard) + init_klipper_clipboard, init_no_clipboard, init_osx_clipboard, + init_qt_clipboard, init_xclip_clipboard, init_xsel_clipboard) from .windows import init_windows_clipboard # `import qtpy` sys.exit()s if DISPLAY is not in the environment. diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 704789cb6061e7..455abaa7fb5892 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -1,4 +1,4 @@ -from pandas.io.excel._base import read_excel, ExcelWriter, ExcelFile +from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel from pandas.io.excel._openpyxl import _OpenpyxlWriter from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import _XlsxWriter diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 32d110b3404a92..cbb4e37fae6a10 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -1,4 +1,4 @@ -from .json import to_json, read_json, loads, dumps # noqa +from .json import dumps, loads, read_json, to_json # noqa from .normalize import json_normalize # noqa from .table_schema import build_table_schema # noqa diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 1f7ee2ae17e4a1..0b3f2b860c1270 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -46,11 +46,13 @@ class TestMyDtype(BaseDtypeTests): from .getitem import BaseGetitemTests # noqa from .groupby import BaseGroupbyTests # noqa from .interface import BaseInterfaceTests # noqa +from .io import BaseParsingTests # noqa from .methods import BaseMethodsTests # noqa -from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa -from .printing import BasePrintingTests # noqa -from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa from .missing import BaseMissingTests # noqa +from .ops import ( # noqa + BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil) +from .printing import BasePrintingTests # noqa +from .reduce import ( # noqa + BaseBooleanReduceTests, BaseNoReduceTests, BaseNumericReduceTests) from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa -from .io import BaseParsingTests # noqa diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index c37aad0af84075..7c48e7e71503e7 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -1,4 +1,3 @@ 
-from .array import DecimalArray, DecimalDtype, to_decimal, make_data - +from .array import DecimalArray, DecimalDtype, make_data, to_decimal __all__ = ['DecimalArray', 'DecimalDtype', 'to_decimal', 'make_data'] diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index 202e58c916e473..9600109f015345 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,2 +1,4 @@ -from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa -from pandas.core.util.hashing import hash_pandas_object, hash_array # noqa +from pandas.util._decorators import ( # noqa + Appender, Substitution, cache_readonly) + +from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa diff --git a/setup.cfg b/setup.cfg index eb687c1f546d43..77dc043042f794 100644 --- a/setup.cfg +++ b/setup.cfg @@ -119,8 +119,9 @@ combine_as_imports=True force_sort_within_sections=True skip_glob=env, skip= + pandas/__init__.py pandas/core/api.py, - pandas/core/frame.py, + pandas/io/msgpack/__init__.py asv_bench/benchmarks/attrs_caching.py, asv_bench/benchmarks/binary_ops.py, asv_bench/benchmarks/categoricals.py, @@ -159,23 +160,3 @@ skip= asv_bench/benchmarks/dtypes.py asv_bench/benchmarks/strings.py asv_bench/benchmarks/period.py - pandas/__init__.py - pandas/plotting/__init__.py - pandas/tests/extension/decimal/__init__.py - pandas/tests/extension/base/__init__.py - pandas/io/msgpack/__init__.py - pandas/io/json/__init__.py - pandas/io/clipboard/__init__.py - pandas/io/excel/__init__.py - pandas/compat/__init__.py - pandas/compat/numpy/__init__.py - pandas/core/arrays/__init__.py - pandas/core/groupby/__init__.py - pandas/core/internals/__init__.py - pandas/api/__init__.py - pandas/api/extensions/__init__.py - pandas/api/types/__init__.py - pandas/_libs/__init__.py - pandas/_libs/tslibs/__init__.py - pandas/util/__init__.py - pandas/arrays/__init__.py From be4b48e7a3a2a9ef1e636bafb6c332bc3a50e2de Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 28 Jun 2019 17:25:03 +0100 Subject: [PATCH 089/238] Add type hints to dtypes/dtypes.py (CategoricalDtype) (#26327) --- pandas/core/arrays/categorical.py | 2 +- pandas/core/dtypes/dtypes.py | 72 +++++++++++++++++++------------ 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 155638aca55603..9f5e3e8ee77f02 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -435,7 +435,7 @@ def ordered(self): return self.dtype.ordered @property - def dtype(self): + def dtype(self) -> CategoricalDtype: """ The :class:`~pandas.api.types.CategoricalDtype` for this instance """ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7fe8ce7d716832..9da6fb84ee18b8 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,6 +1,6 @@ """ define extension dtypes """ import re -from typing import Any, Dict, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -18,7 +18,8 @@ str_type = str -def register_extension_dtype(cls): +def register_extension_dtype(cls: Type[ExtensionDtype], + ) -> Type[ExtensionDtype]: """ Register an ExtensionType with pandas as class decorator. @@ -60,9 +61,9 @@ class Registry: These are tried in order. 
""" def __init__(self): - self.dtypes = [] + self.dtypes = [] # type: List[Type[ExtensionDtype]] - def register(self, dtype): + def register(self, dtype: Type[ExtensionDtype]) -> None: """ Parameters ---------- @@ -73,11 +74,13 @@ def register(self, dtype): self.dtypes.append(dtype) - def find(self, dtype): + def find(self, + dtype: Union[Type[ExtensionDtype], str], + ) -> Optional[Type[ExtensionDtype]]: """ Parameters ---------- - dtype : PandasExtensionDtype or string + dtype : Type[ExtensionDtype] or string Returns ------- @@ -126,28 +129,28 @@ class PandasExtensionDtype(ExtensionDtype): isnative = 0 _cache = {} # type: Dict[str_type, 'PandasExtensionDtype'] - def __str__(self): + def __str__(self) -> str_type: """ Return a string representation for a particular Object """ return self.name - def __repr__(self): + def __repr__(self) -> str_type: """ Return a string representation for a particular object. """ return str(self) - def __hash__(self): + def __hash__(self) -> int: raise NotImplementedError("sub-classes should implement an __hash__ " "method") - def __getstate__(self): + def __getstate__(self) -> Dict[str_type, Any]: # pickle support; we don't want to pickle the cache return {k: getattr(self, k, None) for k in self._metadata} @classmethod - def reset_cache(cls): + def reset_cache(cls) -> None: """ clear the cache """ cls._cache = {} @@ -211,17 +214,24 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): _metadata = ('categories', 'ordered') _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, categories=None, ordered=None): + def __init__(self, categories=None, ordered: bool = None): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, categories=None, ordered=None): + def _from_fastpath(cls, + categories=None, + ordered: bool = None + ) -> 'CategoricalDtype': self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @classmethod - def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): + def _from_categorical_dtype(cls, + dtype: 'CategoricalDtype', + categories=None, + ordered: bool = None, + ) -> 'CategoricalDtype': if categories is ordered is None: return dtype if categories is None: @@ -231,8 +241,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): return cls(categories, ordered) @classmethod - def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, - dtype=None): + def _from_values_or_dtype(cls, + values=None, + categories=None, + ordered: bool = None, + dtype: 'CategoricalDtype' = None, + ) -> 'CategoricalDtype': """ Construct dtype from the input parameters used in :class:`Categorical`. @@ -314,7 +328,11 @@ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, return dtype - def _finalize(self, categories, ordered, fastpath=False): + def _finalize(self, + categories, + ordered: Optional[bool], + fastpath: bool = False, + ) -> None: if ordered is not None: self.validate_ordered(ordered) @@ -326,14 +344,14 @@ def _finalize(self, categories, ordered, fastpath=False): self._categories = categories self._ordered = ordered - def __setstate__(self, state): + def __setstate__(self, state: Dict[str_type, Any]) -> None: # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) self._categories = state.pop('categories', None) self._ordered = state.pop('ordered', False) - def __hash__(self): + def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative # space for when we have unknown categories to avoid a conflict if self.categories is None: @@ -344,7 +362,7 @@ def __hash__(self): # We *do* want to include the real self.ordered here return int(self._hash_categories(self.categories, self.ordered)) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: """ Rules for CDT equality: 1) Any CDT is equal to the string 'category' @@ -391,7 +409,7 @@ def __repr__(self): return tpl.format(data, self.ordered) @staticmethod - def _hash_categories(categories, ordered=True): + def _hash_categories(categories, ordered: Optional[bool] = True) -> int: from pandas.core.util.hashing import ( hash_array, _combine_hash_arrays, hash_tuples ) @@ -441,7 +459,7 @@ def construct_array_type(cls): return Categorical @staticmethod - def validate_ordered(ordered): + def validate_ordered(ordered: bool) -> None: """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -461,7 +479,7 @@ def validate_ordered(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @staticmethod - def validate_categories(categories, fastpath=False): + def validate_categories(categories, fastpath: bool = False): """ Validates that we have good categories @@ -475,7 +493,7 @@ def validate_categories(categories, fastpath=False): ------- categories : Index """ - from pandas import Index + from pandas.core.indexes.base import Index if not fastpath and not is_list_like(categories): msg = "Parameter 'categories' must be list-like, was {!r}" @@ -496,7 +514,7 @@ def validate_categories(categories, fastpath=False): return categories - def update_dtype(self, dtype): + def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -538,14 +556,14 @@ def categories(self): return self._categories @property - def ordered(self): + def ordered(self) -> Optional[bool]: """ Whether the categories have an ordered relationship. 
""" return self._ordered @property - def _is_boolean(self): + def _is_boolean(self) -> bool: from pandas.core.dtypes.common import is_bool_dtype return is_bool_dtype(self.categories) From 1be0561df529ef0597465c6dfe023e34f7dd6d4d Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Fri, 28 Jun 2019 18:44:09 +0200 Subject: [PATCH 090/238] Openpyxl engine for reading excel files (#25092) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_typing.py | 1 + pandas/core/config_init.py | 38 ++++++++++++- pandas/io/excel/_base.py | 4 +- pandas/io/excel/_openpyxl.py | 74 +++++++++++++++++++++++++- pandas/tests/io/data/test1.xlsm | Bin 13967 -> 12091 bytes pandas/tests/io/data/test1.xlsx | Bin 13878 -> 12074 bytes pandas/tests/io/excel/test_readers.py | 11 +++- 8 files changed, 125 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fd0257d93f452..1fe808e0988606 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -159,6 +159,7 @@ Other enhancements - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where `` None: + """Reader using openpyxl engine. + + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. + """ + import_optional_dependency("openpyxl") + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from openpyxl import Workbook + return Workbook + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from openpyxl import load_workbook + return load_workbook(filepath_or_buffer, + read_only=True, data_only=True) + + @property + def sheet_names(self) -> List[str]: + return self.book.sheetnames + + def get_sheet_by_name(self, name: str): + return self.book[name] + + def get_sheet_by_index(self, index: int): + return self.book.worksheets[index] + + def _convert_cell(self, cell, convert_float: bool) -> Scalar: + + # TODO: replace with openpyxl constants + if cell.is_date: + return cell.value + elif cell.data_type == 'e': + return np.nan + elif cell.data_type == 'b': + return bool(cell.value) + elif cell.value is None: + return '' # compat with xlrd + elif cell.data_type == 'n': + # GH5394 + if convert_float: + val = int(cell.value) + if val == cell.value: + return val + else: + return float(cell.value) + + return cell.value + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + data = [] # type: List[List[Scalar]] + for row in sheet.rows: + data.append( + [self._convert_cell(cell, convert_float) for cell in row]) + + return data diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm index f93c57ab7f857e1c6b7654542e2e9bde54ae80d9..28f4f27e4e1b18862d5d063d635f895463396849 100644 GIT binary patch delta 8907 zcmZvCWmFy8vM%oK!3pl}E+GVWcXxMNKyZhJTY?8?;qLD4PH-o$&j0?&sLg@{LoQC5 z7Hp_F!if#K{?IE$*|f3h=xi>V_XopptSU@|b1m(k80Ba?@A%U%JnqZelic1o%kNy)Vt z89BLsNL8`Q+!jtL{7HtUc`;*8*teS=sbfXBQ9ON#0UUW_Io|_}O28G`R+A5f4wO9= ztGZRVvI)1!?wC+oB*h(wltaCE+7L72txHwDifxHQ|6r|Wgc2g6eUG*0=YTI0vAf!5 zo%SL5NX7d*Bp5J-0Ht;jk`l5u5J3h4@EbvX`#U5USoBA#P(MUGZJ9kBoNbI99Bi08 z?QF}{^c=Eyu>1_H-XV9gm-cPILR5QMEXnKDl8TOYN5B=TA=2vQ%=3P|w=a%~B+A!G z&&J{b^kXO6h3=i7j25{T1c8xJO!dR?4U7xPc{}TN)_``ptf7XaBE2sJ(;&c{zv0N6 zgTN1B%2MHJ2oQu$#w=FNAKSee z?@O^P$nC*Xo4C!d6~eCm03~_pkdGd}@GBWA=w80gaDRWrqL^T#`lQ{JapnGpm5qHV 
[... remainder of the base85-encoded binary delta for pandas/tests/io/data/test1.xlsm omitted: not human-readable ...]

diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx
index a437d838fe13082f989d7f8ea253f091490c2777..862574e05a114240e403a653b7fe22d2feb458b3 100644
GIT binary patch
[... base85-encoded binary delta for pandas/tests/io/data/test1.xlsx omitted: not human-readable ...]
zz4&;~J7W}2%-FsIO2nt@9^yAs{HCdTG%uPvIf>)V(GQwT;U{ojR*!aHv_1e)gwDgC z&ggwVEHHRD8K*8qMbH=9#^35iLrCK;m!KKl%gIQyP2dUxZhcFGfv3~cwE5Ue>P zc-bH41^W4$Y!aic@jrDaOtcPtOa-~tEOR!xJ#c*FAS8mtv?SNwzL5hh?h46YS6Nl{ zLn0_L(e{@n2N$cFQusiRc*-2(bZs2$r1Ggt7n-hh2$9Gi`pdj^JSh9x+EVR#S{GgO zexkocI?a@`_^3AY(Z8TY{)bhb3BTv05E0)Vk(Ric$-6J+rkH{OK3-a_0(zjlU7N^h z4!4d5e!Z=Mdi2S+V&bd1gqzVlyedrM#2B>WE2G-DKG`PSrbHhq5@tM#Au>@1pbW9z zRZmemu?QH>bxBgLtFX|SV6v7o0g~ZJQ7rSZW5c0uiQFae7kxR*LIn!=v)OhT&1Q^p z*c0WeJhI1up?Kl`Y>FmK*g2qH7+hv4sAe2E8ib;JRx`?)AVyU z^h4x{K)MSPIFRA{sJxvdfsdqctFJ=!mi3JDx!5CRFf|O4WKUXx;Z_4RhDu})hlS9H zRlJ29WC~N!rN^J%u>|P2kzz_?vp!nFHte2^)k!e6rWN&Mp{oZju_ZHpup~2|HZ? zuIG#wP$mhRcV8N&FS}h)?LJ!b!%z)agQegLjO^zC%{S@GjyQsX+w1#`R$a^I6p+8f zACS>o+uO_m(KNY-)#rC**6j#BHc+1c0~J^Nei+7cSk_7#km%b^uMye$agJ78+9nz? zS%#`2ziY5Aq3p4&>AHC#uQ6? zIi_2<4RI32JJ?p=lN=kQal#E*WfA4bcXpLySJ{7Xaa1Xu(O3vtAK zE^gr!=0CoIUMoGw z*P>OZT#}eX}O1)ZkMz*FQtdk2Sks0GhvY!wXSw;q+92akD|x(r6d~6BrCX zR4pY@Yjfi6q{m^9_wWxQc3!bCRGt}WmNIZSr=K{f&Xj|vY~u;eG!Jb*zHQ&x=Qugf z_gMhK6^}MLI3@#Y?wcJCyghjz9Sj!OnwR}=vLYz2uBo2!S09_?FCI!0&lwnjW}2=5 z(6|xdh0PzN%i?eU2}&fS>z?8_SNw*n`3LC+alIX1kuH^=$9bPFwAGg9y7P07jla?X zs;O`hpNsz5)1YOlh|$0h{iyZP+Qf$F!-nb1%y{w$zj~(gDTYwg6VSWUw2WS{$80BPSHB0mJh->>*wp(_7-27GZ>2G4r#gQ0)25+peu0&8 zLcHd?P}*~i5-8=!Pp{_{`Zg<^LWXyni&YXBAQ2YA2l$J@gGF^p#4}$Pte1O_sd$5W zB5@TZivtw{f=dxIh~JrF36dXN8m7AfFMn{JE-f?zC+p9^jN&d|(LaE&HACT#XvHUy zs%;{@$lQQ6xW@B^Orc>qt(?7tp{}0p+S?eP+jk1vM(zJbyLN_^B32tmet@JSCcOeV z+_Zo>qpdI`6HFc(HOBM-q=O@1%3o6FEJfOX+|i;Wj7!;VfoX?0vDlJkOOiC9FdhNt zM%HqU9F`oE9W|XI7kqs9DH1nOUt$zZ8BFEVk3+_wp=pVFE)|F z)kwKm^RoReta|}CjA76!63C#mfa5zS3+%F0;Ta=7arMe41$8kjoiHs7#-_RNS5%H4 zN!eA^#J@S@c==lU{gQw}cxuLxmZs2TZjQ{OmnF3&G7`8Cq;tENNmO|5H)@slmdWZ5 zrRxV#Cy*ya$a)r^j>K?N=f|muc?IkrCSG4qFm;EZ%R6a~Bu6y$W~IFa&!rHBxvD*($PPwX2G@Zk*<*?#_7r zWs)#?52f(avRv*!vPLm6Z=tmcR{DAEhyak0S>%!e$;s+{TOX>_LMnt=Vw!Gh{|$Rx%BU;18h%JT- zcb9HdK0ac6q{zNb4isNa(VL-q+Kv3}q`z8Avdiv2Sn;kIaH;;3AxQJJP8$er^3_d= zI)D0gy@h)U)+5jPe)J`{S%P3CYS;HEYCT<5JlqB&kB_q}+HAX*ioS1w zS3;BTUyu1fK;Ru>S=7|`s6hMCIcIhgUw8D%ZY?ILTlA)0=z{1c$s7Lu+$}&<&I!ZeF%Z zdd|bsTBQ_SD_1NI!>zTO5-%T9`g0##0@lcfvLnR~`j7mJ!$C_cH)KFN&+#>yYYIUGywu9`Tq;@Ip z!Ejlx5LC9~e3w3LLQS$vqsTl+;U4#QZbSeyrBw*Nj+9vN3Njf;JN9*~46M;#@oD*? 
z!HO|KmpR$a_!g;MdAUc)lmQAGoXbl5ZenD_(oE?KJ`2~Jjo3dsuW#=wCtAL%`&^>v zxPFHho>rAd89-38(Qs@yOy;d(SeWjS5C*OEC(8kA{#_T z;45+DhJZ!)BeX#a2iUC9I1KigCGpB@Bykag1SAO69icf9qE4)uJ|DxEh{<#kMn3t~ zCPnj1()poP7<<(My>!)4NRw(3uIq!1)y{6s)agf)sJ4CK$@?codc-zRl#G^6xjxek z*0YsV5yuUFxB%Xpuu(_3=Py5=c0*%K8o$$n8mGBW$O|J%9I7SMW*-ItsO77S2c`i^&#H_v$qx<%wV;< z+lu9VF;Ae~@JKG(Rql@mA!)~-EDOxoK=L^cu+)K)Vi#gc%?~sm^z=onvb4AbOE%Li zG}L{WeGUZ0mDUGK&kzdR*>P_^x-ub$76(}pALoyynrY|G5{m?iq*|ps=zYSAm?9@< zzK0{1YjRHGt;7*3rYECy&i513wu$V{Z7hpoy0MqcDnxGU)qE>*fOMGZU( zPB&S+KH_6-Lzs;n%4)yJ;-_TLb2X{BIdZqDxDoM&m>>~0V3MTj4=!b^U;)!Z@%FbU zZ;8vwt#vO#eWQ|wbksB5Y?^MN>!xlt7&FwFTW$_?dBK@PyDr<;mRj>NbG((LhV0y~&+mAj?xPskgr zNy(myY#vb{?aY12Sw1#yHn zh|a(7{8T|2c8xQdMoyd%Ip-ZcOP@{!f-pTk#qk2RPt;D%41#8<(^%Sk3xX z=6|%qaH?3Y{TgS)mje7$&E;b^=lZxlHV#tn#gwkfnIuBONYqt-h7O)U`#LV(Ofxup zVVq9>z*9YKJ-a3kC|SJGYx9XEP)@skS9=gLPwPf-CBUaZV$bTN zO*q|4Kz0B!eHt0Z*dTf+df(d&ROmi0h&D-wooHXIp;Rd^MoKXe=UGI-5mwXQgtgM$ zwN9^Rtah*G?Y)iKyilRH0$whczodqT$=U*Hn`Cd#m&czU?@cZTrtS3wLC@W>$?A`l zLO$oi<6j;E;zU%#^?mNP8!eRdeXjOT*95M=aX}NSZTO~PhTDOm>UvZsM)CR+Gmv}l zXzmdTQDGGQ8$(?+sZf7ivMN;tQpDIGV5O-wWtKi+y}_ZHd|BzzM6D%A4rPJl7L3vLfmU)_`MNp zKX37kj8?^^#ag*IYYXP7B3cF^lMLRL!#U`4V{x7G=!RKNKyW4qcm)Ec+{r zB=d8{U>|xmfsDded-FiH>B1T#b{|gZE?W`bKJL%9{7xtc+1 z0CPvTyUmg~Js^o_3n;0ESl*h-neK&JohW8p-$(J_QOs+fWIF=7#7|FktC@&5cbMx- zs%Tp^cBP-yt!*xujkYYL;Vns|3q-R}4-75f$f*^XKnqArHaeQG>1hkZ53gG}W)O6Y z)W2tJE}B@Pi(e1Z!sUsjuP7y4QxLVp)FYnH98a4S=Fx+Hg9CQF>Xyu6Ng>ewx>9Xx ztj*D=6@seMW$g^A#?MCIEjNS*GwBxca4Q_>qu1iaM2b?`&Sa%UQZE$}RNm*6poZdd zem2h?lG)Ogt`9de_O;ZbJ3Fv5yLDg zsijvMsSbv~N{oxe!f2Io@rSjgbYp2Eu`Rg@FA9P}0#AVMVHM}=>PP7t8*p69y$a17 zu6fKoKSf#>D_KzETWnCT$>NV>CeDSywRoCw4P_c&XP(*1CR*dYjH^v9Lu8OLNGw5- z5uG4LxpXu@GO2avAnqWbctei0%OCq&2E5VnK*WN2&9Nr11(~|7TLv8 zJBK+&-A$-J;q<|weVa^oI1n+d7~2-6@8lmQ@p|`x7jxxE{9mlmT(lmKvZ;x_wA-y z)Kp+8HPQ9GMg5%S8@cKuS9v$s4eg)rQ{BgSO9zj#J#zDy4B1>gY6cPI0?y2#&^jCt zk$Z}t^kLFt80n}@=U-mysNY=2zi241A{rd11T$qs@|ErIc;%!}{$e|vT|8~god0kg z8q0Re+~~f{6Cl#tiaq1udK4Lnw^G6ZoHTJq#s*g5HR0#v%4&3td!6pmgYlot(@A}+ zxqDK3-g8Y2J}5d|3cL0s$UAyq8;YCd$Wq7BKJNjItDHrmJc zXYjQ1eXvFs{!ZQZ&g7u{l&dcsaZ0y1O?II;0jJl$DArHaMs}#57vV%or0N{T2=fO1 z1VeX!(ff* zk`e6n`;rnZ5zyDxGcG||YD$o>-&+;D2a!vfYsF4Xt9_gMZ?<<1Noulbu-GMu)O|># z)~Z!cbhBKk%Zx?83=^qF-7kf?Eg_FA!YMyppLePyi({5vGU~_jE>;?t`vG3g!Y1yq zBIu)$UEr_=tO~^{5iH zE=uBox71quV#BR3`u>Jx>;(TXk9wCS!UDB}k3|kuHkpM#Ibj*>t)AqdgKt>5cxKuu zIs407xnDTYOlI)=2bpeQ5;L*05*}|#C06>IkpU|Ad=6{rO#2PEOAFvzIH?V7)of3r zyD}ij$aFQbh=>*9&|M}77_WLbL|EnT2E+tEb+?`?w}37kOClUAv((hty%U zMOCC`S8fHt8Oi5}+~g{P6^vpVzYbtuz`>6dFR}7xW)-9twR|$<6XD{4x>}oRFRNWI zyLcf7w}emE)Y_2XnaI6-_6O7ow;`m3uMP;^>h@Yj9LiSn6M7t=d~8gf(z!gHXl*Gs zQ&k_Eo~`$3p-GNC+izSUM{3gPPMHWzm@VhuY|%~tsLG{>N1vKfH?5vhlghBuCl8Qb z{TIpi(7Dl&Bg8!(`3g%7-BC2l&?jW)cFOsX&VN4FWp~Vo%YMyyW_Q{*e7NHG#r_jb z(=(4Z_4DRuhe`tDI*+CwoJ3ie0?>Rx9$fn>{BHyoTf0}H1Shhb*ThTL#fV__JK#m_ zo5^kL+jV;Ni9;v->VK^Cdh%}(cb>B%JhB{pp^_6^C7`CUI9Dl^3;wA?gBb>b?+)3d z+fV}G*BF3>(XgRGd*2`oQT_Pt)!3Q3Yz8dvgaqV5!l1NQ)>up%Jv=kxMk) zFGf-rc~~2pq8kyLu;oYmYD9>8v3snu`SD|NhRcSt31_gp#_6UspaYoZj9cfUCin8GYaa!0&)xjSq zpYVG8dKar>5Q^gje=f>|d$J7PMG{Vwl~@DM4dCjS73)tKwtY1Nc?DFopO0dEN%-yZ zI>)5X$Ae^e;tU0IrFp(f9H09eiyXRNXW96oLKgx@hA*xW{`mZ+m*GwK|BKImc5?s6 zXX_pobn(;R$G`<|O1@C#2r2_vvU%z!fQ0^X?zOCCBh}r*-a?RQOpnPbiylZD_bM{j zgdC~dPM9p1T|AA!CsVMYGFyuu#SbJCgj4Z-qIFd6COk4(XOqWVV^-AG2xri~-yy~E zs!Ua~#8lqwO2s5q{(56s$PM#PTUewWmbz@kdEMkMt<&87{-?B$Wxt&s zq>4A>$(0uPu6{fm^y;+Kb&C5v5kl1A$cnT3Yj*hGvIYPitc(T&vN`kZXMF8*bYuYF z-CzCg;ACazV&f^LexTs41xDaa_V4yaKi0WludTNmeFFPwh#pc%$LPg 
zK$**#rGp)r+vgtznv0gab8FwHXpC))B;bSmtYk{Qkcj#*%(NP|ihn659(=0(yDR%J5gEtFOEWmTm{=9DLVZWN~>*3%vB4$~tQo#@b)b)3fl&N3cltKpNR{u@Jb(%~L3o z4IfE2R9Xb3G~zXXg9fM%_RN_1|0~99f5n(ju}2s^q!XnuH@h$|JK72oCYM!K8UiyZ z9!dO+T#6hGCXm)w`Q#f+-1;~-#sn!e7S2)G_M+!;j78fEZ^1Cn10@4K81z2pn~jF_ zJVb3l&FEkgc-7`POIPh!U}_Oxo_^QCd?-q&vyO6h9F0r}=RmJYFIP!pnc7%RU$L!z z?AZPN-KuqBx1zgdwbh#`1RnN6$aH4DI!ZNw(H!AblSOYdiwC7`7JRIQ=$`IWdDCt{ zaGl&Jo3WM3omZ`wjL8KvrP2KNtcnjHnkU_?QVz`f2;`X&#O(vG9O5qyMIV;Js4R4+ zlgy?E7Q6&dfR85F+#XK61xT5*D(y@>b8FhRyQ`o(g>hsI^9Zy{A^M4e0e_?r-Te^D zWyUxMTD>n6^n>)jql{o0G4)09HOj`dMzo|5AfI?WxgHk0;8U0vu|eN-zjc~|W5Xdk z7gDWM58Qb7k5zZ1;cibig~u9PSj=GO)pxrB`MSgsggyb)x`T*RAEF@R)L5Bru)TnD zMlq0wVW#=&m~=CN62te`h50SnL*HsY5JYVmLMg>JWVeXI?Yab17r>$SO0JT%H>paZ zgLY%hV>xn2OwqWbw_tGA;VEL)A@B}T!xW_@E%Kk)kKqlm#E>BKYq`TQAGcf$gPxoX z)tE9qwdy})DmpSRMk#I{3F-(EwpK*8*hXz!yt@TnT(VUVIPthhK+IC*!Egv?uwBh1 z*XG3P%I?YekRekw^tMEwdJhDY-+RG`%amZg;P5+)2z>~E5&XcqBSgP^^qixe#~Tf% zjKVcal3XAX#lsBf(U}-MqYi>E*wiV7G~H`)-WZPMQUAmGW?}IkWaBxRp$a zpOgOtH=3Fo0<=&CkZG-@o~~BbW)q*2zP`)@&bZE9)esJTxjp-wFmqgdrVWRnha9ZM zRD^j+!#TaL#1wPSfISE!n_v_#=PUZUuO<>_*$jo zz{5&wNjhe!9_TYA!qxttN4P?sI>kl1!OPw9Tsw4?{dpsSMxo#*7Pv{J=UdgBCvng3uH+ z{-e0bmrpDGg&Z3r-B`ZP8weyzVr}oCnQxjIN_>Hs2|iQ%eySoX7gq773JvuLQZxo$ ztB6lFUPPTy`l}4*hZ!V8Wrf>^yAz;Sgnq?vg4fa?0+JE%U(TR`m+4g?8o^8q#6aaxQc-cEYCmyX@&^4WMClr`-}zvfcHOQf9_StU}17>usjns(ck>$KZR;B zUWEw2$xP@(|E2_9iO~PF)P?iMQaB?v(Z6?tVg4zEh5ttg@>S^XT|NNd&42t;7$NvQ zBNfp<*F1mA6q17<8A*tKS8I>2Z&P0vOjv(^eSN~E2DdO$bN{pI`CCTfH5vZv#MpnS zfxzAj_}u?2AO2RBqWQllv;9{7Kl1M1%CfY7l>d=+|7#Zh=P3ARqWRls8{K~zmHO|w z{ZHQcr`3N0>uVG-{x Date: Fri, 28 Jun 2019 12:41:31 -0500 Subject: [PATCH 091/238] CLN: Assorted cleanups (#27094) --- pandas/core/groupby/ops.py | 3 +- pandas/core/internals/__init__.py | 3 +- pandas/core/internals/blocks.py | 32 +++++++++++++----- pandas/core/internals/managers.py | 42 ------------------------ pandas/core/reshape/merge.py | 54 +++++++++++++++++++++++++++---- pandas/tests/io/test_html.py | 1 + 6 files changed, 77 insertions(+), 58 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 38478be5a8e07f..dd44bc6990d598 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -475,7 +475,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, else: if axis > 0: swapped = True - values = values.swapaxes(0, axis) + assert axis == 1, axis + values = values.T if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index b9530e15f71e22..bf46e5d1a74e47 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,3 +1,4 @@ + from .blocks import ( # noqa: F401 Block, BoolBlock, CategoricalBlock, ComplexBlock, DatetimeBlock, DatetimeTZBlock, ExtensionBlock, FloatBlock, IntBlock, ObjectBlock, @@ -9,7 +10,7 @@ from .blocks import _safe_reshape # noqa: F401; io.packers from .blocks import make_block # noqa: F401; io.pytables, io.packers from .managers import ( # noqa: F401; reshape.concat, reshape.merge + _transform_index, concatenate_block_managers) -from .managers import items_overlap_with_suffix # noqa: F401; reshape.merge from .blocks import _block_shape # noqa:F401; io.pytables diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 652f70746f6182..a131509a4ed102 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -722,16 +722,28 @@ def replace(self, to_replace, 
value, inplace=False, filter=None, try: values, to_replace = self._try_coerce_args(self.values, to_replace) - mask = missing.mask_missing(values, to_replace) - if filter is not None: - filtered_out = ~self.mgr_locs.isin(filter) - mask[filtered_out.nonzero()[0]] = False + except (TypeError, ValueError): + # GH 22083, TypeError or ValueError occurred within error handling + # causes infinite loop. Cast and retry only if not objectblock. + if is_object_dtype(self): + raise + + # try again with a compatible block + block = self.astype(object) + return block.replace(to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert) + + mask = missing.mask_missing(values, to_replace) + if filter is not None: + filtered_out = ~self.mgr_locs.isin(filter) + mask[filtered_out.nonzero()[0]] = False + try: blocks = self.putmask(mask, value, inplace=inplace) - if convert: - blocks = [b.convert(by_item=True, numeric=False, - copy=not inplace) for b in blocks] - return blocks except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. @@ -746,6 +758,10 @@ def replace(self, to_replace, value, inplace=False, filter=None, filter=filter, regex=regex, convert=convert) + if convert: + blocks = [b.convert(by_item=True, numeric=False, + copy=not inplace) for b in blocks] + return blocks def _replace_single(self, *args, **kwargs): """ no-op on a non-ObjectBlock """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index aff39d765dc95f..5494b75ff9e4e9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1859,48 +1859,6 @@ def _compare_or_regex_search(a, b, regex=False): return result -# TODO: this is no longer used in this module, could be moved to concat -def items_overlap_with_suffix(left, lsuffix, right, rsuffix): - """ - If two indices overlap, add suffixes to overlapping entries. - - If corresponding suffix is empty, the entry is simply converted to string. - - """ - to_rename = left.intersection(right) - if len(to_rename) == 0: - return left, right - else: - if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{rename}'.format(rename=to_rename)) - - def renamer(x, suffix): - """Rename the left and right indices. - - If there is overlap, and suffix is not None, add - suffix, otherwise, leave it as-is. - - Parameters - ---------- - x : original column name - suffix : str or None - - Returns - ------- - x : renamed column name - """ - if x in to_rename and suffix is not None: - return '{x}{suffix}'.format(x=x, suffix=suffix) - return x - - lrenamer = partial(renamer, suffix=lsuffix) - rrenamer = partial(renamer, suffix=rsuffix) - - return (_transform_index(left, lrenamer), - _transform_index(right, rrenamer)) - - def _transform_index(index, func, level=None): """ Apply function to all values found in index. 
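# --- Illustrative aside, not part of the patch --------------------------------
# A minimal sketch of what the suffix-overlap helper being relocated in this
# commit does at the user level. It assumes only the public pandas merge API;
# the frame contents below are made up for the example.
import pandas as pd

left = pd.DataFrame({"key": [1, 2], "value": [10, 20]})
right = pd.DataFrame({"key": [1, 2], "value": [30, 40]})

# Overlapping non-key columns get the given suffixes appended, which is what
# the renamer logic in the helper produces.
merged = pd.merge(left, right, on="key", suffixes=("_l", "_r"))
print(list(merged.columns))  # ['key', 'value_l', 'value_r']

# With empty suffixes the same overlap raises the
# "columns overlap but no suffix specified" ValueError seen in the helper above.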
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d21ad58e752c29..549c69486ebfa0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3,6 +3,7 @@ """ import copy +from functools import partial import string import warnings @@ -27,8 +28,7 @@ from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com from pandas.core.frame import _merge_doc -from pandas.core.internals import ( - concatenate_block_managers, items_overlap_with_suffix) +from pandas.core.internals import _transform_index, concatenate_block_managers import pandas.core.sorting as sorting from pandas.core.sorting import is_int64_overflow_possible @@ -555,8 +555,8 @@ def get_result(self): ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} @@ -1303,8 +1303,8 @@ def get_result(self): ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) if self.fill_method == 'ffill': left_join_indexer = libjoin.ffill_indexer(left_indexer) @@ -1809,3 +1809,45 @@ def validate_operand(obj): else: raise TypeError('Can only merge Series or DataFrame objects, ' 'a {obj} was passed'.format(obj=type(obj))) + + +def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: ' + '{rename}'.format(rename=to_rename)) + + def renamer(x, suffix): + """ + Rename the left and right indices. + + If there is overlap, and suffix is not None, add + suffix, otherwise, leave it as-is. 
+ + Parameters + ---------- + x : original column name + suffix : str or None + + Returns + ------- + x : renamed column name + """ + if x in to_rename and suffix is not None: + return '{x}{suffix}'.format(x=x, suffix=suffix) + return x + + lrenamer = partial(renamer, suffix=lsuffix) + rrenamer = partial(renamer, suffix=rsuffix) + + return (_transform_index(left, lrenamer), + _transform_index(right, rrenamer)) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9f9fcabbfe42c1..33268b637d44ae 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -262,6 +262,7 @@ def test_bad_url_protocol(self): self.read_html('git://github.com', match='.*Water.*') @network + @pytest.mark.slow def test_invalid_url(self): try: with pytest.raises(URLError): From 3a53954a86df1d321b3c4ea766905f25bb225adc Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 28 Jun 2019 13:32:32 -0500 Subject: [PATCH 092/238] Enabled stricter type checking (#27097) --- mypy.ini | 1 + pandas/core/arrays/period.py | 2 +- pandas/core/dtypes/dtypes.py | 10 +++++----- pandas/core/nanops.py | 3 ++- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mypy.ini b/mypy.ini index d29beeca73f1b8..cba20d2775fbee 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,6 @@ [mypy] ignore_missing_imports=True +no_implicit_optional=True [mypy-pandas.conftest,pandas.tests.*] ignore_errors=True \ No newline at end of file diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 3a9322773fc691..bb144764a26fcb 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -195,7 +195,7 @@ def _simple_new(cls, values, freq=None, **kwargs): def _from_sequence( cls, scalars: Sequence[Optional[Period]], - dtype: PeriodDtype = None, + dtype: Optional[PeriodDtype] = None, copy: bool = False, ) -> ABCPeriodArray: if dtype: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 9da6fb84ee18b8..81e061a0fc7b4d 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -214,13 +214,13 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): _metadata = ('categories', 'ordered') _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, categories=None, ordered: bool = None): + def __init__(self, categories=None, ordered: Optional[bool] = None): self._finalize(categories, ordered, fastpath=False) @classmethod def _from_fastpath(cls, categories=None, - ordered: bool = None + ordered: Optional[bool] = None ) -> 'CategoricalDtype': self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) @@ -230,7 +230,7 @@ def _from_fastpath(cls, def _from_categorical_dtype(cls, dtype: 'CategoricalDtype', categories=None, - ordered: bool = None, + ordered: Optional[bool] = None, ) -> 'CategoricalDtype': if categories is ordered is None: return dtype @@ -244,8 +244,8 @@ def _from_categorical_dtype(cls, def _from_values_or_dtype(cls, values=None, categories=None, - ordered: bool = None, - dtype: 'CategoricalDtype' = None, + ordered: Optional[bool] = None, + dtype: Optional['CategoricalDtype'] = None, ) -> 'CategoricalDtype': """ Construct dtype from the input parameters used in :class:`Categorical`. 
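# --- Illustrative aside, not part of the patch --------------------------------
# A tiny sketch of what mypy's no_implicit_optional setting (enabled in
# mypy.ini above) enforces. The function names are invented for the example;
# Python accepts both forms at runtime, the flag only affects type checking.
from typing import Optional

# Flagged by mypy under no_implicit_optional=True: the default None makes the
# parameter implicitly Optional, while the annotation says plain bool.
def set_ordered(ordered: bool = None):
    return ordered

# Accepted: the Optional is spelled out, as in the signatures changed above.
def set_ordered_explicit(ordered: Optional[bool] = None):
    return ordered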
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 24a28bf0005cb1..834a3bc3d8bbb2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -228,7 +228,8 @@ def _maybe_get_mask(values: np.ndarray, skipna: bool, def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, - fill_value_typ: str = None, mask: Optional[np.ndarray] = None + fill_value_typ: Optional[str] = None, + mask: Optional[np.ndarray] = None ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. From 9693230aa0637c5856587a34485d3e411599d0f4 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 28 Jun 2019 21:54:50 +0200 Subject: [PATCH 093/238] DOC: clarify that Index.equals(non_index) returns False (#27105) closes #14411 --- pandas/core/indexes/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cb5b4a6c8993c0..c96d9e2c5f77ab 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4170,7 +4170,8 @@ def equals(self, other): Returns ------- bool - If two Index objects have equal elements True, otherwise False. + True if "other" is an Index and it has the same elements as calling + index; False otherwise. """ if self.is_(other): return True From ad18ea35ba461a92b1ea2204f4edc55bb42e9d71 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 28 Jun 2019 16:04:12 -0400 Subject: [PATCH 094/238] Remove read_table deprecation (#27102) * Undeprecate read_table * Add whatsnew note * flake8 * Edit 0.24.0 read_table deprecation to mention undeprecation, and move current 0.25 --- doc/source/whatsnew/v0.24.0.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/parsers.py | 26 ++------------------------ pandas/tests/io/parser/test_common.py | 10 ++++------ pandas/tests/io/test_common.py | 27 ++------------------------- 5 files changed, 10 insertions(+), 56 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 403b4908d36e3b..a66056f661de35 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1298,7 +1298,7 @@ Deprecations - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary. This deprecation has been removed in 0.25.0. (:issue:`21948`) - :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. 
(:issue:`21950`) - :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fe808e0988606..27a72014a9f8e4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -604,6 +604,7 @@ Other deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) +- func:`read_table` has been undeprecated. (:issue:`25220`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3b16544e722332..9c914003c3764b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -540,14 +540,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): def _make_parser_function(name, default_sep=','): - # prepare read_table deprecation - if name == "read_table": - sep = False - else: - sep = default_sep - def parser_f(filepath_or_buffer: FilePathOrBuffer, - sep=sep, + sep=default_sep, delimiter=None, # Column and Index Locations and Names @@ -613,19 +607,6 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, memory_map=False, float_precision=None): - # deprecate read_table GH21948 - if name == "read_table": - if sep is False and delimiter is None: - warnings.warn("read_table is deprecated, use read_csv " - "instead, passing sep='\\t'.", - FutureWarning, stacklevel=2) - else: - warnings.warn("read_table is deprecated, use read_csv " - "instead.", - FutureWarning, stacklevel=2) - if sep is False: - sep = default_sep - # gh-23761 # # When a dialect is passed, it overrides any of the overlapping @@ -732,10 +713,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, read_table = _make_parser_function('read_table', default_sep='\t') read_table = Appender(_doc_read_csv_and_table.format( func_name='read_table', - summary="""Read general delimited file into DataFrame. - -.. 
deprecated:: 0.24.0 - Use :func:`pandas.read_csv` instead, passing ``sep='\\t'`` if necessary.""", + summary='Read general delimited file into DataFrame.', _default_sep=r"'\\t' (tab-stop)") )(read_table) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 28ea90f005f3f8..c74e57627d679b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1917,16 +1917,14 @@ def test_read_csv_memory_growth_chunksize(all_parsers): pass -def test_read_table_deprecated(all_parsers): +def test_read_table_equivalency_to_read_csv(all_parsers): # see gh-21948 + # As of 0.25.0, read_table is undeprecated parser = all_parsers data = "a\tb\n1\t2\n3\t4" expected = parser.read_csv(StringIO(data), sep="\t") - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = parser.read_table(StringIO(data)) - tm.assert_frame_equal(result, expected) + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) def test_first_row_bom(all_parsers): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 0ea87d9d961f28..f580dc460fd68f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -160,6 +160,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ (pd.read_csv, 'os', FileNotFoundError, 'csv'), + (pd.read_table, 'os', FileNotFoundError, 'csv'), (pd.read_fwf, 'os', FileNotFoundError, 'txt'), (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), (pd.read_feather, 'feather', Exception, 'feather'), @@ -191,18 +192,9 @@ def test_read_expands_user_home_dir(self, reader, module, msg1, msg2, msg3, msg4, msg5)): reader(path) - def test_read_non_existant_read_table(self): - path = os.path.join(HERE, 'data', 'does_not_exist.' 
+ 'csv') - msg1 = r"File b'.+does_not_exist\.csv' does not exist" - msg2 = (r"\[Errno 2\] File .+does_not_exist\.csv does not exist:" - r" '.+does_not_exist\.csv'") - with pytest.raises(FileNotFoundError, match=r"({}|{})".format( - msg1, msg2)): - with tm.assert_produces_warning(FutureWarning): - pd.read_table(path) - @pytest.mark.parametrize('reader, module, path', [ (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), @@ -228,21 +220,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): else: tm.assert_frame_equal(result, expected) - def test_read_fspath_all_read_table(self, datapath): - path = datapath('io', 'data', 'iris.csv') - - mypath = CustomFSPath(path) - with tm.assert_produces_warning(FutureWarning): - result = pd.read_table(mypath) - with tm.assert_produces_warning(FutureWarning): - expected = pd.read_table(path) - - if path.endswith('.pickle'): - # categorical - tm.assert_categorical_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ ('to_csv', {}, 'os'), ('to_excel', {'engine': 'xlwt'}, 'xlwt'), From 497c4ebe2ff87b3ee8ab6f7ba44ef15bb5d36c0d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sat, 29 Jun 2019 00:13:01 +0200 Subject: [PATCH 095/238] TST: test initializing a Series from Index while passing dtype (#27107) closes #17088 --- pandas/tests/series/test_constructors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f2345a0822f6d9..49417942a35980 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -224,6 +224,13 @@ def test_constructor_list_like(self): result = Series(obj, index=[0, 1, 2]) assert_series_equal(result, expected) + @pytest.mark.parametrize('dtype', ['bool', 'int32', 'int64', 'float64']) + def test_constructor_index_dtype(self, dtype): + # GH 17088 + + s = Series(Index([0, 2, 4]), dtype=dtype) + assert s.dtype == dtype + @pytest.mark.parametrize('input_vals', [ ([1, 2]), (['1', '2']), From 110c02f4e2fa3884057ef3520484ec006fb969ec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 28 Jun 2019 18:53:53 -0400 Subject: [PATCH 096/238] DEPR: Series.put, Series.real, Series.imag, Index.dtype_str (#27106) --- doc/source/whatsnew/v0.25.0.rst | 5 ++++- pandas/core/arrays/categorical.py | 4 +--- pandas/core/indexes/base.py | 5 +++++ pandas/core/nanops.py | 2 +- pandas/core/series.py | 12 ++++++++++++ pandas/core/util/hashing.py | 2 +- pandas/io/packers.py | 8 ++++---- pandas/tests/indexes/multi/test_format.py | 7 ++++--- pandas/tests/indexes/period/test_period.py | 12 +++++++----- pandas/tests/indexes/test_common.py | 7 ++++--- pandas/tests/series/test_dtypes.py | 16 ++++++++++++---- pandas/tests/series/test_internals.py | 7 +++++++ pandas/tests/test_nanops.py | 4 ++-- 13 files changed, 64 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 27a72014a9f8e4..28bf796be404a9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -604,7 +604,10 @@ Other deprecations - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be 
removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) -- func:`read_table` has been undeprecated. (:issue:`25220`) +- :func:`read_table` has been undeprecated. (:issue:`25220`) +- :attr:`Index.dtype_str` is deprecated. (:issue:`18262`) +- :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`) +- :meth:`Series.put` is deprecated. (:issue:`18262`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9f5e3e8ee77f02..a1d591458fba37 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1989,9 +1989,7 @@ def _repr_categories_info(self): """ category_strs = self._repr_categories() - dtype = getattr(self.categories, 'dtype_str', - str(self.categories.dtype)) - + dtype = str(self.categories.dtype) levheader = "Categories ({length}, {dtype}): ".format( length=len(self.categories), dtype=dtype) width, height = get_terminal_size() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c96d9e2c5f77ab..6a708536689c43 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -690,7 +690,12 @@ def dtype(self): def dtype_str(self): """ Return the dtype str of the underlying data. + + .. deprecated:: 0.25.0 """ + warnings.warn('`dtype_str` has been deprecated. Call `str` on the ' + 'dtype attribute instead.', FutureWarning, + stacklevel=2) return str(self.dtype) def ravel(self, order='C'): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 834a3bc3d8bbb2..cc8b241bedba1f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1275,7 +1275,7 @@ def _ensure_numeric(x): except (TypeError, ValueError): x = x.astype(np.float64) else: - if not np.any(x.imag): + if not np.any(np.imag(x)): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): try: diff --git a/pandas/core/series.py b/pandas/core/series.py index 730a96f5435a12..31cb7432b3ae1e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -617,10 +617,14 @@ def put(self, *args, **kwargs): """ Apply the `put` method to its `values` attribute if it has one. + .. deprecated:: 0.25.0 + See Also -------- numpy.ndarray.put """ + warnings.warn('`put` has been deprecated and will be removed in a' + 'future version.', FutureWarning, stacklevel=2) self._values.put(*args, **kwargs) def __len__(self): @@ -793,7 +797,11 @@ def __array_prepare__(self, result, context=None): def real(self): """ Return the real value of vector. + + .. deprecated 0.25.0 """ + warnings.warn("`real` has be deprecated and will be removed in a " + "future verison", FutureWarning, stacklevel=2) return self.values.real @real.setter @@ -804,7 +812,11 @@ def real(self, v): def imag(self): """ Return imag value of vector. + + .. 
deprecated 0.25.0 """ + warnings.warn("`imag` has be deprecated and will be removed in a " + "future verison", FutureWarning, stacklevel=2) return self.values.imag @imag.setter diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 93074f5afa2b3f..a916f2f06df21e 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -269,7 +269,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): - return hash_array(vals.real) + 23 * hash_array(vals.imag) + return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 4a273bfe2decb1..cef0af3edbb20b 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -523,16 +523,16 @@ def encode(obj): return {'typ': 'np_scalar', 'sub_typ': 'np_complex', 'dtype': obj.dtype.name, - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + 'real': np.real(obj).__repr__(), + 'imag': np.imag(obj).__repr__()} else: return {'typ': 'np_scalar', 'dtype': obj.dtype.name, 'data': obj.__repr__()} elif isinstance(obj, complex): return {'typ': 'np_complex', - 'real': obj.real.__repr__(), - 'imag': obj.imag.__repr__()} + 'real': np.real(obj).__repr__(), + 'imag': np.imag(obj).__repr__()} return obj diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 8315478d85125e..85d30b8f6de6b8 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -8,9 +8,10 @@ def test_dtype_str(indices): - dtype = indices.dtype_str - assert isinstance(dtype, str) - assert dtype == str(indices.dtype) + with tm.assert_produces_warning(FutureWarning): + dtype = indices.dtype_str + assert isinstance(dtype, str) + assert dtype == str(indices.dtype) def test_format(idx): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 2f3f15101e7caa..a70f67557bfc22 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -132,12 +132,14 @@ def test_shallow_copy_changing_freq_raises(self): def test_dtype_str(self): pi = pd.PeriodIndex([], freq='M') - assert pi.dtype_str == 'period[M]' - assert pi.dtype_str == str(pi.dtype) + with tm.assert_produces_warning(FutureWarning): + assert pi.dtype_str == 'period[M]' + assert pi.dtype_str == str(pi.dtype) - pi = pd.PeriodIndex([], freq='3M') - assert pi.dtype_str == 'period[3M]' - assert pi.dtype_str == str(pi.dtype) + with tm.assert_produces_warning(FutureWarning): + pi = pd.PeriodIndex([], freq='3M') + assert pi.dtype_str == 'period[3M]' + assert pi.dtype_str == str(pi.dtype) def test_view_asi8(self): idx = pd.PeriodIndex([], freq='M') diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 451fb2ed7906df..3cb907c6f58442 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -155,9 +155,10 @@ def test_set_name_methods(self, indices): assert indices.names == [name] def test_dtype_str(self, indices): - dtype = indices.dtype_str - assert isinstance(dtype, str) - assert dtype == str(indices.dtype) + with tm.assert_produces_warning(FutureWarning): + dtype = indices.dtype_str + assert isinstance(dtype, str) + assert dtype == str(indices.dtype) def 
test_hash_error(self, indices): index = indices diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 392163228398b2..b9146534d10f11 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -428,17 +428,25 @@ def test_astype_empty_constructor_equality(self, dtype): as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) + @pytest.mark.filterwarnings('ignore::FutureWarning') def test_complex(self): # see gh-4819: complex access for ndarray compat a = np.arange(5, dtype=np.float64) b = Series(a + 4j * a) - tm.assert_numpy_array_equal(a, b.real) - tm.assert_numpy_array_equal(4 * a, b.imag) + tm.assert_numpy_array_equal(a, np.real(b)) + tm.assert_numpy_array_equal(4 * a, np.imag(b)) b.real = np.arange(5) + 5 - tm.assert_numpy_array_equal(a + 5, b.real) - tm.assert_numpy_array_equal(4 * a, b.imag) + tm.assert_numpy_array_equal(a + 5, np.real(b)) + tm.assert_numpy_array_equal(4 * a, np.imag(b)) + + def test_real_imag_deprecated(self): + # GH 18262 + s = pd.Series([1]) + with tm.assert_produces_warning(FutureWarning): + s.imag + s.real def test_arg_for_errors_in_astype(self): # see gh-14878 diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 29846f10dae33d..0b62624ad2696b 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -221,3 +221,10 @@ def test_hasnans_unchached_for_series(): ser.iloc[-1] = np.nan assert ser.hasnans is True assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ + + +def test_put_deprecated(): + # GH 18262 + s = pd.Series([1]) + with tm.assert_produces_warning(FutureWarning): + s.put(0, 0) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ad035f9c0158da..6e7b34a0632ad8 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -144,9 +144,9 @@ def _coerce_tds(targ, res): # but nanops doesn't, so make that an exception elif targ.dtype.kind == 'O': raise - tm.assert_almost_equal(targ.real, res.real, + tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype) - tm.assert_almost_equal(targ.imag, res.imag, + tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype) def check_fun_data(self, testfunc, targfunc, testarval, targarval, From d7d26bed4365d33f6f47f54fbe3b4221c52fe098 Mon Sep 17 00:00:00 2001 From: Andrew Wood Date: Fri, 28 Jun 2019 18:56:21 -0400 Subject: [PATCH 097/238] TST: change assertion messages in assert_frame_equal (#27023) (#27068) --- pandas/tests/util/test_assert_frame_equal.py | 78 ++++++++++++-------- pandas/util/testing.py | 6 +- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 7aa8f1d527d39e..735d16f7ad0dbd 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -5,7 +5,12 @@ @pytest.fixture(params=[True, False]) -def by_blocks(request): +def by_blocks_fixture(request): + return request.param + + +@pytest.fixture(params=['DataFrame', 'Series']) +def obj_fixture(request): return request.param @@ -70,29 +75,35 @@ def _assert_not_frame_equal_both(a, b, **kwargs): @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_row_order_mismatch(check_like): +def test_frame_equal_row_order_mismatch(check_like, obj_fixture): df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = 
DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. - msg = "DataFrame.index are different" + msg = "{obj}.index are different".format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2, check_like=check_like) + assert_frame_equal(df1, + df2, + check_like=check_like, + obj=obj_fixture) else: - _assert_frame_equal_both(df1, df2, check_like=check_like) + _assert_frame_equal_both(df1, + df2, + check_like=check_like, + obj=obj_fixture) @pytest.mark.parametrize("df1,df2", [ (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), ]) -def test_frame_equal_shape_mismatch(df1, df2): - msg = "DataFrame are different" +def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): + msg = "{obj} are different".format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2) + assert_frame_equal(df1, df2, obj=obj_fixture) @pytest.mark.parametrize("df1,df2,msg", [ @@ -136,12 +147,13 @@ def test_empty_dtypes(check_dtype): assert_frame_equal(df1, df2, **kwargs) -def test_frame_equal_index_mismatch(): - msg = """DataFrame\\.index are different +def test_frame_equal_index_mismatch(obj_fixture): + msg = """{obj}\\.index are different -DataFrame\\.index values are different \\(33\\.33333 %\\) +{obj}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""" +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( + obj=obj_fixture) df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) @@ -149,15 +161,16 @@ def test_frame_equal_index_mismatch(): index=["a", "b", "d"]) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2) + assert_frame_equal(df1, df2, obj=obj_fixture) -def test_frame_equal_columns_mismatch(): - msg = """DataFrame\\.columns are different +def test_frame_equal_columns_mismatch(obj_fixture): + msg = """{obj}\\.columns are different -DataFrame\\.columns values are different \\(50\\.0 %\\) +{obj}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( + obj=obj_fixture) df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) @@ -165,43 +178,50 @@ def test_frame_equal_columns_mismatch(): index=["a", "b", "c"]) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2) + assert_frame_equal(df1, df2, obj=obj_fixture) -def test_frame_equal_block_mismatch(by_blocks): - msg = """DataFrame\\.iloc\\[:, 1\\] are different +def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): + msg = """{obj}\\.iloc\\[:, 1\\] are different -DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""" +\\[right\\]: \\[4, 5, 7\\]""".format(obj=obj_fixture) df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2, by_blocks=by_blocks) + assert_frame_equal(df1, + df2, + by_blocks=by_blocks_fixture, + obj=obj_fixture) @pytest.mark.parametrize("df1,df2,msg", [ 
(DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), - """DataFrame\\.iloc\\[:, 1\\] are different + """{obj}\\.iloc\\[:, 1\\] are different -DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]"""), (DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """DataFrame\\.iloc\\[:, 0\\] are different + """{obj}\\.iloc\\[:, 0\\] are different -DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +{obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]"""), ]) -def test_frame_equal_unicode(df1, df2, msg, by_blocks): +def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): # see gh-20503 # # Test ensures that `assert_frame_equals` raises the right exception # when comparing DataFrames containing differing unicode objects. + msg = msg.format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, df2, by_blocks=by_blocks) + assert_frame_equal(df1, + df2, + by_blocks=by_blocks_fixture, + obj=obj_fixture) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index f14b202b034d6d..05e0a8df496c57 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1218,7 +1218,7 @@ def assert_frame_equal(left, right, check_dtype=True, # shape comparison if left.shape != right.shape: raise_assert_detail(obj, - 'DataFrame shape mismatch', + '{obj} shape mismatch'.format(obj=obj), '{shape!r}'.format(shape=left.shape), '{shape!r}'.format(shape=right.shape)) @@ -1249,7 +1249,7 @@ def assert_frame_equal(left, right, check_dtype=True, assert dtype in lblocks assert dtype in rblocks assert_frame_equal(lblocks[dtype], rblocks[dtype], - check_dtype=check_dtype, obj='DataFrame.blocks') + check_dtype=check_dtype, obj=obj) # compare by columns else: @@ -1264,7 +1264,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, - obj='DataFrame.iloc[:, {idx}]'.format(idx=i)) + obj='{obj}.iloc[:, {idx}]'.format(obj=obj, idx=i)) def assert_equal(left, right, **kwargs): From dd11fc22a3e76721e2c6634cc7c68840f5f1b636 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 28 Jun 2019 20:56:04 -0500 Subject: [PATCH 098/238] DEPR: Series.item and Index.item (#27112) * Deprecate item * Add whatsnew * Use assert_produces_warning instead --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/base.py | 4 ++++ pandas/core/indexes/base.py | 7 ------- pandas/core/indexes/numeric.py | 9 ++++----- pandas/core/indexes/period.py | 5 +++++ pandas/tests/series/test_api.py | 9 +++++---- pandas/tests/test_base.py | 8 +++++--- 7 files changed, 24 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 28bf796be404a9..e12088a7ad05bf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -608,6 +608,7 @@ Other deprecations - :attr:`Index.dtype_str` is deprecated. (:issue:`18262`) - :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`) - :meth:`Series.put` is deprecated. (:issue:`18262`) +- :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`) .. 
_whatsnew_0250.prior_deprecations: diff --git a/pandas/core/base.py b/pandas/core/base.py index 30e800cb9bd732..93db65deff8202 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -693,11 +693,15 @@ def item(self): """ Return the first element of the underlying data as a python scalar. + .. deprecated 0.25.0 + Returns ------- scalar The first element of %(klass)s. """ + warnings.warn('`item` has been deprecated and will be removed in a ' + 'future version', FutureWarning, stacklevel=2) return self.values.item() @property diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a708536689c43..4b7582fcf7cc03 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -57,13 +57,6 @@ _index_shared_docs = dict() -def _try_get_item(x): - try: - return x.item() - except AttributeError: - return x - - def _make_comparison_op(op, cls): def cmp_method(self, other): if isinstance(other, (np.ndarray, Index, ABCSeries)): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a228895e527aa9..5f9c1f22887cc8 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -16,7 +16,6 @@ from pandas.core import algorithms import pandas.core.common as com -import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, InvalidIndexError, _index_shared_docs) from pandas.core.ops import get_op_result_name @@ -442,7 +441,9 @@ def __contains__(self, other): return np.isnan(other) and self.hasnans except ValueError: try: - return len(other) <= 1 and ibase._try_get_item(other) in self + return len(other) <= 1 and other.item() in self + except AttributeError: + return len(other) <= 1 and other in self except TypeError: pass except TypeError: @@ -457,9 +458,7 @@ def get_loc(self, key, method=None, tolerance=None): nan_idxs = self._nan_idxs try: return nan_idxs.item() - except (ValueError, IndexError): - # should only need to catch ValueError here but on numpy - # 1.7 .item() can raise IndexError when NaNs are present + except ValueError: if not len(nan_idxs): raise KeyError(key) return nan_idxs diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b20b0c6f853d9c..dc11099c3e903d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -874,7 +874,12 @@ def item(self): """ return the first element of the underlying data as a python scalar + + .. 
deprecated 0.25.0 + """ + warnings.warn('`item` has been deprecated and will be removed in a ' + 'future version', FutureWarning, stacklevel=2) # TODO(DatetimeArray): remove if len(self) == 1: return self[0] diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index fac796fbf325a6..1cd5bd09a82e77 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -419,10 +419,11 @@ def f(x): tm.assert_series_equal(result, expected) # .item() - s = Series([1]) - result = s.item() - assert result == 1 - assert s.item() == s.iloc[0] + with tm.assert_produces_warning(FutureWarning): + s = Series([1]) + result = s.item() + assert result == 1 + assert s.item() == s.iloc[0] # using an ndarray like function s = Series(np.random.randn(10)) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d82b205803b098..f9a1bb97cc48cd 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -326,13 +326,15 @@ def test_ndarray_compat_properties(self): pass with pytest.raises(ValueError): - o.item() # len > 1 + with tm.assert_produces_warning(FutureWarning): + o.item() # len > 1 assert o.ndim == 1 assert o.size == len(o) - assert Index([1]).item() == 1 - assert Series([1]).item() == 1 + with tm.assert_produces_warning(FutureWarning): + assert Index([1]).item() == 1 + assert Series([1]).item() == 1 def test_value_counts_unique_nunique(self): for orig in self.objs: From f331c5610afb2bd5f36f983004799ea9e1545969 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 28 Jun 2019 21:11:26 -0500 Subject: [PATCH 099/238] Added keep_links=False (#27115) --- pandas/io/excel/_openpyxl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index ec42acf9877377..7b1e203bd33ad1 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -482,7 +482,7 @@ def _workbook_class(self): def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from openpyxl import load_workbook return load_workbook(filepath_or_buffer, - read_only=True, data_only=True) + read_only=True, data_only=True, keep_links=False) @property def sheet_names(self) -> List[str]: From d050791acd77849246d9da4a70f6c71ccc59a633 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 28 Jun 2019 21:36:20 -0500 Subject: [PATCH 100/238] DEPR: Change current DeprecationWarnings to FutureWarnings (#27113) * Change current DeprecationWarnings to FutureWarnings in prep for removal in 0.25.0 * Change RangeIndex.start/stop/step back to DeprecationWarning * Change one test warning --- pandas/core/internals/blocks.py | 6 +++--- pandas/tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/indexes/test_range.py | 8 ++++---- pandas/tests/internals/test_internals.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a131509a4ed102..f0128b70d74327 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -220,7 +220,7 @@ def make_block_same_class(self, values, placement=None, ndim=None, if dtype is not None: # issue 19431 fastparquet is passing this warnings.warn("dtype argument is deprecated, will be removed " - "in a future release.", DeprecationWarning) + "in a future release.", FutureWarning) if placement is None: placement = self.mgr_locs return make_block(values, placement=placement, ndim=ndim, @@ -1794,7 +1794,7 @@ def formatting_values(self): 
"'ExtensionArray._formatting_values' is deprecated. " "Specify 'ExtensionArray._formatter' instead." ) - warnings.warn(msg, DeprecationWarning, stacklevel=10) + warnings.warn(msg, FutureWarning, stacklevel=10) return self.values._formatting_values() return self.values @@ -3056,7 +3056,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, if fastpath is not None: # GH#19265 pyarrow is passing this warnings.warn("fastpath argument is deprecated, will be removed " - "in a future release.", DeprecationWarning) + "in a future release.", FutureWarning) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index ecef835a9c7977..4625c79e1bc3dc 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -397,6 +397,6 @@ def _formatting_values(self): ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')])) - with tm.assert_produces_warning(DeprecationWarning, + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): repr(ser) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 3f474b0166b159..5f7f10e881cede 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -171,12 +171,12 @@ def test_start_stop_step_attrs(self, index, start, stop, step): assert index.stop == stop assert index.step == step - def test_deprecated_start_stop_step_attrs(self): + @pytest.mark.parametrize('attr_name', ['_start', '_stop', '_step']) + def test_deprecated_start_stop_step_attrs(self, attr_name): # GH 26581 idx = self.create_index() - for attr_name in ['_start', '_stop', '_step']: - with tm.assert_produces_warning(DeprecationWarning): - getattr(idx, attr_name) + with tm.assert_produces_warning(DeprecationWarning): + getattr(idx, attr_name) def test_copy(self): i = RangeIndex(5, name='Foo') diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b997e2b6eec8fc..697c0b52805895 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -287,7 +287,7 @@ def test_delete(self): def test_make_block_same_class(self): # issue 19431 block = create_block('M8[ns, US/Eastern]', [3]) - with tm.assert_produces_warning(DeprecationWarning, + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): block.make_block_same_class(block.values, dtype=block.values.dtype) @@ -1254,7 +1254,7 @@ def test_holder(typestr, holder): def test_deprecated_fastpath(): # GH#19265 values = np.random.rand(3, 3) - with tm.assert_produces_warning(DeprecationWarning, + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): make_block(values, placement=np.arange(3), fastpath=True) From 989f912eec97df2ebd921a1423534e4d49133e12 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 28 Jun 2019 21:37:44 -0500 Subject: [PATCH 101/238] DEPR: deprecate msgpack support (#27103) * DEPR: deprecate msgpack support closes #27084 * warnings in docs * review comments --- doc/source/user_guide/io.rst | 14 +++++++--- doc/source/whatsnew/v0.13.0.rst | 2 ++ doc/source/whatsnew/v0.25.0.rst | 6 ++++ pandas/core/generic.py | 7 +++-- pandas/io/packers.py | 26 ++++++++++++++--- pandas/tests/io/test_common.py | 1 + pandas/tests/io/test_packers.py | 49 +++++++++++++++++++++++---------- 7 files changed, 80 insertions(+), 25 deletions(-) diff --git 
a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e32bb0f1102527..e7070585a4b9c4 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3393,15 +3393,15 @@ both on the writing (serialization), and reading (deserialization). .. warning:: - This is a very new feature of pandas. We intend to provide certain - optimizations in the io of the ``msgpack`` data. Since this is marked - as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. + The msgpack format is deprecated as of 0.25 and will be removed in a future version. + It is recommended to use pyarrow for on-the-wire transmission of pandas objects. .. warning:: :func:`read_msgpack` is only guaranteed backwards compatible back to pandas version 0.20.3 .. ipython:: python + :okwarning: df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) df.to_msgpack('foo.msg') @@ -3411,6 +3411,7 @@ both on the writing (serialization), and reading (deserialization). You can pass a list of objects and you will receive them back on deserialization. .. ipython:: python + :okwarning: pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s) pd.read_msgpack('foo.msg') @@ -3418,6 +3419,7 @@ You can pass a list of objects and you will receive them back on deserialization You can pass ``iterator=True`` to iterate over the unpacked results: .. ipython:: python + :okwarning: for o in pd.read_msgpack('foo.msg', iterator=True): print(o) @@ -3425,6 +3427,7 @@ You can pass ``iterator=True`` to iterate over the unpacked results: You can pass ``append=True`` to the writer to append to an existing pack: .. ipython:: python + :okwarning: df.to_msgpack('foo.msg', append=True) pd.read_msgpack('foo.msg') @@ -3435,6 +3438,7 @@ can pack arbitrary collections of Python lists, dicts, scalars, while intermixin pandas objects. .. ipython:: python + :okwarning: pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, {'scalar': 1.}, {'s': s}]}) @@ -3453,14 +3457,16 @@ Read/write API Msgpacks can also be read from and written to strings. .. ipython:: python + :okwarning: df.to_msgpack() Furthermore you can concatenate the strings to produce a list of the original objects. .. ipython:: python + :okwarning: - pd.read_msgpack(df.to_msgpack() + s.to_msgpack()) + pd.read_msgpack(df.to_msgpack() + s.to_msgpack()) .. _io.hdf5: diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 0614de82cbcd05..ab48594ddadab0 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -829,6 +829,7 @@ Experimental Since this is an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. .. ipython:: python + :okwarning: df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) df.to_msgpack('foo.msg') @@ -841,6 +842,7 @@ Experimental You can pass ``iterator=True`` to iterator over the unpacked results .. ipython:: python + :okwarning: for o in pd.read_msgpack('foo.msg', iterator=True): print(o) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e12088a7ad05bf..008f6f0b8643ee 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -590,6 +590,12 @@ by a ``Series`` or ``DataFrame`` with sparse values. The memory usage of the two approaches is identical. See :ref:`sparse.migration` for more (:issue:`19239`). +msgpack format +^^^^^^^^^^^^^^ + +The msgpack format is deprecated as of 0.25 and will be removed in a future version. 
It is recommended to use pyarrow for on-the-wire transmission of pandas objects. (:issue:`27084`) + + Other deprecations ^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1af3e9449f3dab..3bc7bbb633aed3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2418,8 +2418,11 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): """ Serialize object to input file path using msgpack format. - THIS IS AN EXPERIMENTAL LIBRARY and the storage format - may not be stable until a future release. + .. deprecated:: 0.25.0 + + to_msgpack is deprecated and will be removed in a future version. + It is recommended to use pyarrow for on-the-wire transmission of + pandas objects. Parameters ---------- diff --git a/pandas/io/packers.py b/pandas/io/packers.py index cef0af3edbb20b..30e51e62aa764a 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -78,8 +78,11 @@ def to_msgpack(path_or_buf, *args, **kwargs): """ msgpack (serialize) object to input file path - THIS IS AN EXPERIMENTAL LIBRARY and the storage format - may not be stable until a future release. + .. deprecated:: 0.25.0 + + to_msgpack is deprecated and will be removed in a future version. + It is recommended to use pyarrow for on-the-wire transmission of + pandas objects. Parameters ---------- @@ -92,6 +95,12 @@ def to_msgpack(path_or_buf, *args, **kwargs): compress : type of compressor (zlib or blosc), default to None (no compression) """ + warnings.warn("to_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, stacklevel=3) + global compressor compressor = kwargs.pop('compress', None) append = kwargs.pop('append', None) @@ -121,8 +130,11 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): Load msgpack pandas object from the specified file path - THIS IS AN EXPERIMENTAL LIBRARY and the storage format - may not be stable until a future release. + .. deprecated:: 0.25.0 + + read_msgpack is deprecated and will be removed in a future version. + It is recommended to use pyarrow for on-the-wire transmission of + pandas objects. Parameters ---------- @@ -140,6 +152,12 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): read_msgpack is only guaranteed to be backwards compatible to pandas 0.20.3. 
""" + warnings.warn("The read_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, stacklevel=3) + path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f580dc460fd68f..04faf5aee4b6d4 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -45,6 +45,7 @@ def __fspath__(self): # https://github.com/cython/cython/issues/1720 @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCommonIOCapabilities: data1 = """index,A,B,C,D foo,2,3,4,5 diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 59fa9fbd02da1d..203b550b8936ae 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -2,7 +2,7 @@ import glob from io import BytesIO import os -from warnings import catch_warnings +from warnings import catch_warnings, filterwarnings import numpy as np import pytest @@ -83,6 +83,7 @@ def check_arbitrary(a, b): assert(a == b) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestPackers: def setup_method(self, method): @@ -97,6 +98,7 @@ def encode_decode(self, x, compress=None, **kwargs): return read_msgpack(p, **kwargs) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestAPI(TestPackers): def test_string_io(self): @@ -159,6 +161,7 @@ def __init__(self): read_msgpack(path_or_buf=A()) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestNumpy(TestPackers): def test_numpy_scalar_float(self): @@ -277,6 +280,7 @@ def test_list_mixed(self): tm.assert_almost_equal(tuple(x), x_rec) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestBasic(TestPackers): def test_timestamp(self): @@ -322,6 +326,7 @@ def test_intervals(self): assert i == i_rec +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestIndex(TestPackers): def setup_method(self, method): @@ -387,6 +392,7 @@ def categorical_index(self): tm.assert_frame_equal(result, df) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSeries(TestPackers): def setup_method(self, method): @@ -437,6 +443,7 @@ def test_basic(self): assert_series_equal(i, i_rec) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCategorical(TestPackers): def setup_method(self, method): @@ -460,6 +467,7 @@ def test_basic(self): assert_categorical_equal(i, i_rec) +@pytest.mark.filterwarnings("ignore:msgpack:FutureWarning") class TestNDFrame(TestPackers): def setup_method(self, method): @@ -549,6 +557,7 @@ def test_dataframe_duplicate_column_names(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): @@ -595,6 +604,7 @@ def test_sparse_frame(self): check_frame_type=True) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCompression(TestPackers): """See https://github.com/pandas-dev/pandas/pull/9783 """ @@ -676,18 +686,21 @@ def decompress(ob): with monkeypatch.context() as m, \ 
tm.assert_produces_warning(PerformanceWarning) as ws: m.setattr(compress_module, 'decompress', decompress) - i_rec = self.encode_decode(self.frame, compress=compress) - for k in self.frame.keys(): - - value = i_rec[k] - expected = self.frame[k] - assert_frame_equal(value, expected) - # make sure that we can write to the new frames even though - # we needed to copy the data - for block in value._data.blocks: - assert block.values.flags.writeable - # mutate the data in some way - block.values[0] += rhs[block.dtype] + + with catch_warnings(): + filterwarnings('ignore', category=FutureWarning) + i_rec = self.encode_decode(self.frame, compress=compress) + for k in self.frame.keys(): + + value = i_rec[k] + expected = self.frame[k] + assert_frame_equal(value, expected) + # make sure that we can write to the new frames even though + # we needed to copy the data + for block in value._data.blocks: + assert block.values.flags.writeable + # mutate the data in some way + block.values[0] += rhs[block.dtype] for w in ws: # check the messages from our warnings @@ -715,14 +728,18 @@ def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch): def _test_small_strings_no_warn(self, compress): empty = np.array([], dtype='uint8') with tm.assert_produces_warning(None): - empty_unpacked = self.encode_decode(empty, compress=compress) + with catch_warnings(): + filterwarnings('ignore', category=FutureWarning) + empty_unpacked = self.encode_decode(empty, compress=compress) tm.assert_numpy_array_equal(empty_unpacked, empty) assert empty_unpacked.flags.writeable char = np.array([ord(b'a')], dtype='uint8') with tm.assert_produces_warning(None): - char_unpacked = self.encode_decode(char, compress=compress) + with catch_warnings(): + filterwarnings('ignore', category=FutureWarning) + char_unpacked = self.encode_decode(char, compress=compress) tm.assert_numpy_array_equal(char_unpacked, char) assert char_unpacked.flags.writeable @@ -794,6 +811,7 @@ def test_readonly_axis_zlib_to_sql(self): assert_frame_equal(expected, result) +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestEncoding(TestPackers): def setup_method(self, method): @@ -839,6 +857,7 @@ def legacy_packer(request, datapath): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestMsgpack: """ How to add msgpack tests: From b640530a788206fb445a7295fafdf9ed53b78567 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sat, 29 Jun 2019 17:35:35 +0200 Subject: [PATCH 102/238] TST: test passing index (and other iterables) to .loc (#27120) closes #16712 --- pandas/tests/frame/test_indexing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 3b8daa28227f8a..4c1abfb1a7f6fb 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -76,8 +76,9 @@ def test_get_none(self, df): # see gh-5652 assert df.get(None) is None - def test_loc_iterable(self, float_frame): - idx = iter(['A', 'B', 'C']) + @pytest.mark.parametrize('key_type', [iter, np.array, Series, Index]) + def test_loc_iterable(self, float_frame, key_type): + idx = key_type(['A', 'B', 'C']) result = float_frame.loc[:, idx] expected = float_frame.loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) From 67eec4eefdf8a77082efe149301c0342bb803e5e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 29 Jun 2019 12:02:40 -0500 Subject: [PATCH 103/238] CLN: 
Remove older deprecations (#27121) * Remove Series in rename_categories as list-like * Remove reindex_axis and rename_axis behavior * Add Series.reindex_axis * Fix whatsnew and import error --- doc/source/reference/frame.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 3 + pandas/core/arrays/categorical.py | 14 --- pandas/core/frame.py | 7 -- pandas/core/generic.py | 101 +----------------- pandas/core/panel.py | 9 +- pandas/core/series.py | 21 ---- pandas/tests/arrays/categorical/test_api.py | 7 +- pandas/tests/frame/test_alter_axes.py | 14 +-- .../tests/frame/test_axis_select_reindex.py | 38 ------- pandas/tests/sparse/series/test_series.py | 8 -- 11 files changed, 13 insertions(+), 210 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 6ae2ea6e392e63..1c0e6a3a252246 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -198,7 +198,6 @@ Reindexing / selection / label manipulation DataFrame.idxmin DataFrame.last DataFrame.reindex - DataFrame.reindex_axis DataFrame.reindex_like DataFrame.rename DataFrame.rename_axis diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 008f6f0b8643ee..815e13733f7a24 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -627,6 +627,9 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) - Removed the previously deprecated ``convert_objects`` (:issue:`11221`) - Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`) +- Removed the previously deprecated behavior of :class:`Series` treated as list-like in :meth:`~Series.cat.rename_categories` (:issue:`17982`) +- Removed the previously deprecated ``DataFrame.reindex_axis`` and ``Series.reindex_axis``` (:issue:`17842`) +- Removed the previously deprecated behavior of altering column or index labels with :meth:`Series.rename_axis` or :meth:`DataFrame.rename_axis` (:issue:`17842`) .. _whatsnew_0250.performance: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a1d591458fba37..3ef2f41f253387 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -887,11 +887,6 @@ def rename_categories(self, new_categories, inplace=False): .. versionadded:: 0.23.0 - .. warning:: - - Currently, Series are considered list like. In a future version - of pandas they'll be considered dict-like. - inplace : bool, default False Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. @@ -939,15 +934,6 @@ def rename_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') cat = self if inplace else self.copy() - if isinstance(new_categories, ABCSeries): - msg = ("Treating Series 'new_categories' as a list-like and using " - "the values. 
In a future version, 'rename_categories' will " - "treat Series like a dictionary.\n" - "For dict-like, use 'new_categories.to_dict()'\n" - "For list-like, use 'new_categories.values'.") - warn(msg, FutureWarning, stacklevel=2) - new_categories = list(new_categories) - if is_dict_like(new_categories): cat.categories = [new_categories.get(item, item) for item in cat.categories] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index df7003ecf000e9..f5a5be5d209325 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3768,13 +3768,6 @@ def reindex(self, *args, **kwargs): kwargs.pop('labels', None) return super().reindex(**kwargs) - @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): - return super().reindex_axis(labels=labels, axis=axis, method=method, - level=level, copy=copy, limit=limit, - fill_value=fill_value) - def drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise'): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bc7bbb633aed3..46d990597355f0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1176,11 +1176,6 @@ def rename_axis(self, mapper=sentinel, **kwargs): Notes ----- - Prior to version 0.21.0, ``rename_axis`` could also be used to change - the axis *labels* by passing a mapping or scalar. This behavior is - deprecated and will be removed in a future version. Use ``rename`` - instead. - ``DataFrame.rename_axis`` supports two calling conventions * ``(index=index_mapper, columns=columns_mapper, ...)`` @@ -1280,22 +1275,15 @@ class name inplace = validate_bool_kwarg(inplace, 'inplace') - if (mapper is not sentinel): + if mapper is not sentinel: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not is_dict_like(mapper)) if non_mapper: return self._set_axis_name(mapper, axis=axis, inplace=inplace) else: - # Deprecated (v0.21) behavior is if mapper is specified, - # and not a list or scalar, then call rename - msg = ("Using 'rename_axis' to alter labels is deprecated. " - "Use '.rename' instead") - warnings.warn(msg, FutureWarning, stacklevel=3) - axis = self._get_axis_name(axis) - d = {'copy': copy, 'inplace': inplace} - d[axis] = mapper - return self.rename(**d) + raise ValueError("Use `.rename` to alter labels " + "with a mapper.") else: # Use new behavior. Means that index and/or columns # is specified @@ -4378,89 +4366,6 @@ def _needs_reindex_multi(self, axes, method, level): def _reindex_multi(self, axes, copy, fill_value): return NotImplemented - _shared_docs['reindex_axis'] = (""" - Conform input object to new index. - - .. deprecated:: 0.21.0 - Use `reindex` instead. - - By default, places NaN in locations having no value in the - previous index. A new object is produced unless the new index - is equivalent to the current one and copy=False. - - Parameters - ---------- - labels : array-like - New labels / index to conform to. Preferably an Index object to - avoid duplicating data. - axis : %(axes_single_arg)s - Indicate whether to use rows or columns. - method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional - Method to use for filling holes in reindexed DataFrame: - - * default: don't fill gaps. - * pad / ffill: propagate last valid observation forward to next - valid. - * backfill / bfill: use next valid observation to fill gap. - * nearest: use nearest valid observations to fill gap. 
- - level : int or str - Broadcast across a level, matching Index values on the - passed MultiIndex level. - copy : bool, default True - Return a new object, even if the passed indexes are the same. - limit : int, optional - Maximum number of consecutive elements to forward or backward fill. - fill_value : float, default NaN - Value used to fill in locations having no value in the previous - index. - - .. versionadded:: 0.21.0 (list-like tolerance) - - Returns - ------- - %(klass)s - Returns a new DataFrame object with new indices, unless the new - index is equivalent to the current one and copy=False. - - See Also - -------- - DataFrame.set_index : Set row labels. - DataFrame.reset_index : Remove row labels or move them to new columns. - DataFrame.reindex : Change to new indices or expand indices. - DataFrame.reindex_like : Change to same indices as other DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, - ... index=['dog', 'hawk']) - >>> df - num_legs num_wings - dog 4 0 - hawk 2 2 - >>> df.reindex(['num_wings', 'num_legs', 'num_heads'], - ... axis='columns') - num_wings num_legs num_heads - dog 0 4 NaN - hawk 2 2 NaN - """) - - @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=None): - msg = ("'.reindex_axis' is deprecated and will be removed in a future " - "version. Use '.reindex' instead.") - self._consolidate_inplace() - - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis_name) - method = missing.clean_reindex_fill_method(method) - warnings.warn(msg, FutureWarning, stacklevel=3) - new_index, indexer = axis_values.reindex(labels, method, level, - limit=limit) - return self._reindex_with_indexers({axis: [new_index, indexer]}, - fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, allow_dups=False): """allow_dups indicates an internal call here """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c0340fc975a7e0..350c3083623eb5 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -18,7 +18,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.generic import NDFrame from pandas.core.index import ( Index, MultiIndex, _get_objs_combined_axis, ensure_index) import pandas.core.indexes.base as ibase @@ -1244,13 +1244,6 @@ def rename(self, items=None, major_axis=None, minor_axis=None, **kwargs): return super().rename(items=items, major_axis=major_axis, minor_axis=minor_axis, **kwargs) - @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): - return super().reindex_axis(labels=labels, axis=axis, method=method, - level=level, copy=copy, limit=limit, - fill_value=fill_value) - @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.transpose.__doc__) def transpose(self, *args, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 31cb7432b3ae1e..7df01ffda0b8b8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3998,27 +3998,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) - def reindex_axis(self, labels, axis=0, **kwargs): - """ - Conform Series to new index with optional filling 
logic. - - .. deprecated:: 0.21.0 - Use ``Series.reindex`` instead. - - Returns - ------- - Series - Reindexed Series. - """ - # for compatibility with higher dims - if axis != 0: - raise ValueError("cannot reindex series on non-zero axis!") - msg = ("'.reindex_axis' is deprecated and will be removed in a future " - "version. Use '.reindex' instead.") - warnings.warn(msg, FutureWarning, stacklevel=2) - - return self.reindex(index=labels, **kwargs) - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 15e4bbab8f6490..4be3919f173c45 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -91,12 +91,7 @@ def test_rename_categories(self): def test_rename_categories_series(self): # https://github.com/pandas-dev/pandas/issues/17981 c = Categorical(['a', 'b']) - xpr = "Treating Series 'new_categories' as a list-like " - with tm.assert_produces_warning(FutureWarning) as rec: - result = c.rename_categories(Series([0, 1])) - - assert len(rec) == 1 - assert xpr in str(rec[0].message) + result = c.rename_categories(Series([0, 1], index=['a', 'b'])) expected = Categorical([0, 1]) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 303604ba7d7ea1..e7b4c2c65b842d 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -668,24 +668,20 @@ def test_rename_axis_inplace(self, float_frame): assert no_return is None tm.assert_frame_equal(result, expected) - def test_rename_axis_warns(self): + def test_rename_axis_raises(self): # https://github.com/pandas-dev/pandas/issues/17833 df = DataFrame({"A": [1, 2], "B": [1, 2]}) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(ValueError, match="Use `.rename`"): df.rename_axis(id, axis=0) - assert 'rename' in str(w[0].message) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(ValueError, match="Use `.rename`"): df.rename_axis({0: 10, 1: 20}, axis=0) - assert 'rename' in str(w[0].message) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(ValueError, match="Use `.rename`"): df.rename_axis(id, axis=1) - assert 'rename' in str(w[0].message) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(ValueError, match="Use `.rename`"): df['A'].rename_axis(id) - assert 'rename' in str(w[0].message) def test_rename_axis_mapper(self): # GH 19978 diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index b4fde43ff30556..18c95beb62a13a 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -416,17 +416,6 @@ def test_reindex_fill_value(self): expected[4] = 'foo' assert_frame_equal(result, expected) - # reindex_axis - with tm.assert_produces_warning(FutureWarning): - result = df.reindex_axis(range(15), fill_value=0., axis=0) - expected = df.reindex(range(15)).fillna(0) - assert_frame_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = df.reindex_axis(range(5), fill_value=0., axis=1) - expected = df.reindex(columns=range(5)).fillna(0) - assert_frame_equal(result, expected) - # other dtypes df['foo'] = 'foo' result = df.reindex(range(15), fill_value=0) @@ -1026,33 +1015,6 @@ def test_reindex_corner(self, 
int_frame): smaller = int_frame.reindex(columns=['A', 'B', 'E']) assert smaller['E'].dtype == np.float64 - def test_reindex_axis(self, float_frame, int_frame): - cols = ['A', 'B', 'E'] - with tm.assert_produces_warning(FutureWarning) as m: - reindexed1 = int_frame.reindex_axis(cols, axis=1) - assert 'reindex' in str(m[0].message) - reindexed2 = int_frame.reindex(columns=cols) - assert_frame_equal(reindexed1, reindexed2) - - rows = int_frame.index[0:5] - with tm.assert_produces_warning(FutureWarning) as m: - reindexed1 = int_frame.reindex_axis(rows, axis=0) - assert 'reindex' in str(m[0].message) - reindexed2 = int_frame.reindex(index=rows) - assert_frame_equal(reindexed1, reindexed2) - - msg = ("No axis named 2 for object type" - " ") - with pytest.raises(ValueError, match=msg): - int_frame.reindex_axis(rows, axis=2) - - # no-op case - cols = float_frame.columns.copy() - with tm.assert_produces_warning(FutureWarning) as m: - newFrame = float_frame.reindex_axis(cols, axis=1) - assert 'reindex' in str(m[0].message) - assert_frame_equal(newFrame, float_frame) - def test_reindex_with_nans(self): df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], columns=['a', 'b'], diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 9ce1133cb39ca5..290e0203567db1 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -1516,14 +1516,6 @@ def test_deprecated_numpy_func_call(self): raise_on_extra_warnings=False): getattr(getattr(self, series), func)() - def test_deprecated_reindex_axis(self): - # https://github.com/pandas-dev/pandas/issues/17833 - # Multiple FutureWarnings, can't check stacklevel - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as m: - self.bseries.reindex_axis([0, 1, 2]) - assert 'reindex' in str(m[0].message) - @pytest.mark.parametrize( 'datetime_type', (np.datetime64, From cb5b75b0916e037fdd141ce1648521e2832a4d51 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sat, 29 Jun 2019 20:29:08 +0200 Subject: [PATCH 104/238] DOC: list-like tolerance does not apply to Index.get_loc (#27128) closes #20930 --- pandas/core/indexes/base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4b7582fcf7cc03..23089cb577bf59 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2724,17 +2724,11 @@ def _convert_can_do_setop(self, other): * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied distances are broken by preferring the larger index value. - tolerance : optional + tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of the index at the matching location most satisfy the equation ``abs(index[loc] - key) <= tolerance``. - Tolerance may be a scalar - value, which applies the same tolerance to all values, or - list-like, which applies variable tolerance per element. List-like - includes list, tuple, array, Series, and must be the same size as - the index and its dtype must exactly match the index's type. - .. 
versionadded:: 0.21.0 (list-like tolerance) Returns From 14e1c5a5469db2bfa3ba2682955427caf458b00f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 29 Jun 2019 15:54:02 -0500 Subject: [PATCH 105/238] CLN: Remove tupleize_cols keyword in io methods; from_csv method (#27126) * Remove tuplize_col keyword in read_html, read_csv, to_csv * Remove from_csv * Remove warning check and type --- doc/source/reference/frame.rst | 1 - doc/source/user_guide/io.rst | 9 --- doc/source/whatsnew/v0.25.0.rst | 2 + pandas/_libs/parsers.pyx | 3 - pandas/core/frame.py | 69 +-------------------- pandas/core/generic.py | 18 +----- pandas/core/series.py | 72 +--------------------- pandas/io/formats/csvs.py | 8 +-- pandas/io/html.py | 12 +--- pandas/io/parsers.py | 25 +------- pandas/tests/frame/test_to_csv.py | 34 ---------- pandas/tests/io/parser/test_unsupported.py | 14 ----- pandas/tests/io/test_html.py | 6 +- pandas/tests/series/test_io.py | 16 ----- 14 files changed, 16 insertions(+), 273 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 1c0e6a3a252246..1a316c2f25ec63 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -336,7 +336,6 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - DataFrame.from_csv DataFrame.from_dict DataFrame.from_items DataFrame.from_records diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e7070585a4b9c4..9af6c36cc4e4d3 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -340,13 +340,6 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See :class:`python:csv.Dialect` documentation for more details. -tupleize_cols : boolean, default ``False`` - .. deprecated:: 0.21.0 - - This argument will be removed and will always convert to MultiIndex - - Leave a list of tuples on columns as is (default is to convert to a MultiIndex - on the columns). Error handling ++++++++++++++ @@ -1718,8 +1711,6 @@ function takes a number of arguments. Only the first is required. * ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when appropriate (default None) * ``chunksize``: Number of rows to write at a time -* ``tupleize_cols``: If False (default), write as a list of tuples, otherwise - write in an expanded line format suitable for ``read_csv`` * ``date_format``: Format string for datetime objects Writing a formatted string diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 815e13733f7a24..a6e227f4360db0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -630,6 +630,8 @@ Removal of prior version deprecations/changes - Removed the previously deprecated behavior of :class:`Series` treated as list-like in :meth:`~Series.cat.rename_categories` (:issue:`17982`) - Removed the previously deprecated ``DataFrame.reindex_axis`` and ``Series.reindex_axis``` (:issue:`17842`) - Removed the previously deprecated behavior of altering column or index labels with :meth:`Series.rename_axis` or :meth:`DataFrame.rename_axis` (:issue:`17842`) +- Removed the previously deprecated ``tupleize_cols`` keyword argument in :meth:`read_html`, :meth:`read_csv`, and :meth:`DataFrame.to_csv` (:issue:`17877`, :issue:`17820`) +- Removed the previously deprecated ``DataFrame.from.csv`` and ``Series.from_csv`` (:issue:`17812`) .. 
_whatsnew_0250.performance: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b73b70caf15976..cafc31dad3568e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -297,7 +297,6 @@ cdef class TextReader: object encoding object compression object mangle_dupe_cols - object tupleize_cols object usecols list dtype_cast_order set unnamed_cols @@ -351,7 +350,6 @@ cdef class TextReader: skipfooter=0, verbose=False, mangle_dupe_cols=True, - tupleize_cols=False, float_precision=None, skip_blank_lines=True): @@ -370,7 +368,6 @@ cdef class TextReader: self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols = mangle_dupe_cols - self.tupleize_cols = tupleize_cols # For timekeeping self.clocks = [] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f5a5be5d209325..d7da653618b2fe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -331,7 +331,7 @@ def _constructor(self): _constructor_sliced = Series # type: Type[Series] _deprecations = NDFrame._deprecations | frozenset([ - 'get_value', 'set_value', 'from_csv', 'from_items' + 'get_value', 'set_value', 'from_items' ]) # type: FrozenSet[str] _accessors = set() # type: Set[str] @@ -1786,73 +1786,6 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - @classmethod - def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True, - encoding=None, tupleize_cols=None, - infer_datetime_format=False): - """ - Read CSV file. - - .. deprecated:: 0.21.0 - Use :func:`read_csv` instead. - - It is preferable to use the more powerful :func:`read_csv` - for most general purposes, but ``from_csv`` makes for an easy - roundtrip to and from a file (the exact counterpart of - ``to_csv``), especially with a DataFrame of time series data. - - This method only differs from the preferred :func:`read_csv` - in some defaults: - - - `index_col` is ``0`` instead of ``None`` (take first column as index - by default) - - `parse_dates` is ``True`` instead of ``False`` (try parsing the index - as datetime by default) - - So a ``pd.DataFrame.from_csv(path)`` can be replaced by - ``pd.read_csv(path, index_col=0, parse_dates=True)``. - - Parameters - ---------- - path : string file path or file handle / StringIO - header : int, default 0 - Row to use as header (skip prior rows) - sep : string, default ',' - Field delimiter - index_col : int or sequence, default 0 - Column to use for index. If a sequence is given, a MultiIndex - is used. Different default from read_table - parse_dates : boolean, default True - Parse dates. Different default from read_table - tupleize_cols : boolean, default False - write multi_index columns as a list of tuples (if True) - or new (expanded format) if False) - infer_datetime_format : boolean, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. - - Returns - ------- - DataFrame - - See Also - -------- - read_csv - """ - - warnings.warn("from_csv is deprecated. Please use read_csv(...) " - "instead. 
Note that some of the default arguments are " - "different, so please refer to the documentation " - "for from_csv when changing your function calls", - FutureWarning, stacklevel=2) - - from pandas.io.parsers import read_csv - return read_csv(path, header=header, sep=sep, - parse_dates=parse_dates, index_col=index_col, - encoding=encoding, tupleize_cols=tupleize_cols, - infer_datetime_format=infer_datetime_format) - def to_sparse(self, fill_value=None, kind='block'): """ Convert to SparseDataFrame. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 46d990597355f0..166d8526456fbc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2903,7 +2903,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, + date_format=None, doublequote=True, escapechar=None, decimal='.'): r""" Write object to a comma-separated values (csv) file. @@ -2976,14 +2976,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, .. versionchanged:: 0.24.0 chunksize : int or None Rows to write at a time. - tupleize_cols : bool, default False - Write MultiIndex columns as a list of tuples (if True) or in - the new, expanded format, where each MultiIndex column is a row - in the CSV (if False). - - .. deprecated:: 0.21.0 - This argument will be removed and will always write each row - of the multi-index as a separate row in the CSV file. date_format : str, default None Format string for datetime objects. doublequote : bool, default True @@ -3017,13 +3009,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, df = self if isinstance(self, ABCDataFrame) else self.to_frame() - if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) - else: - tupleize_cols = False - from pandas.io.formats.csvs import CSVFormatter formatter = CSVFormatter(df, path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3033,7 +3018,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=columns, header=header, index=index, index_label=index_label, mode=mode, chunksize=chunksize, quotechar=quotechar, - tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, escapechar=escapechar, decimal=decimal) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7df01ffda0b8b8..3d54fa4485c84f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -137,7 +137,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = generic.NDFrame._deprecations | frozenset( ['asobject', 'reshape', 'get_value', 'set_value', - 'from_csv', 'valid', 'tolist']) + 'valid', 'tolist']) # Override cache_readonly bc Series is mutable hasnans = property(base.IndexOpsMixin.hasnans.func, @@ -4212,81 +4212,13 @@ def between(self, left, right, inclusive=True): return lmask & rmask - @classmethod - def from_csv(cls, path, sep=',', parse_dates=True, header=None, - index_col=0, encoding=None, infer_datetime_format=False): - """ - Read CSV file. - - .. deprecated:: 0.21.0 - Use :func:`pandas.read_csv` instead. 
- - It is preferable to use the more powerful :func:`pandas.read_csv` - for most general purposes, but ``from_csv`` makes for an easy - roundtrip to and from a file (the exact counterpart of - ``to_csv``), especially with a time Series. - - This method only differs from :func:`pandas.read_csv` in some defaults: - - - `index_col` is ``0`` instead of ``None`` (take first column as index - by default) - - `header` is ``None`` instead of ``0`` (the first row is not used as - the column names) - - `parse_dates` is ``True`` instead of ``False`` (try parsing the index - as datetime by default) - - With :func:`pandas.read_csv`, the option ``squeeze=True`` can be used - to return a Series like ``from_csv``. - - Parameters - ---------- - path : str, file path, or file handle / StringIO - sep : str, default ',' - Field delimiter. - parse_dates : bool, default True - Parse dates. Different default from read_table. - header : int, default None - Row to use as header (skip prior rows). - index_col : int or sequence, default 0 - Column to use for index. If a sequence is given, a MultiIndex - is used. Different default from read_table. - encoding : str, optional - A string representing the encoding to use if the contents are - non-ascii, for python versions prior to 3. - infer_datetime_format : bool, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. - - Returns - ------- - Series - - See Also - -------- - read_csv - """ - - # We're calling `DataFrame.from_csv` in the implementation, - # which will propagate a warning regarding `from_csv` deprecation. - from pandas.core.frame import DataFrame - df = DataFrame.from_csv(path, header=header, index_col=index_col, - sep=sep, parse_dates=parse_dates, - encoding=encoding, - infer_datetime_format=infer_datetime_format) - result = df.iloc[:, 0] - if header is None: - result.index.name = result.name = None - - return result - @Appender(generic.NDFrame.to_csv.__doc__) def to_csv(self, *args, **kwargs): names = ["path_or_buf", "sep", "na_rep", "float_format", "columns", "header", "index", "index_label", "mode", "encoding", "compression", "quoting", "quotechar", "line_terminator", - "chunksize", "tupleize_cols", "date_format", "doublequote", + "chunksize", "date_format", "doublequote", "escapechar", "decimal"] old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format", diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 120eb4612fc9ba..e1d95862ec872c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -24,9 +24,9 @@ class CSVFormatter: def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, - index_label=None, mode='w', nanRep=None, encoding=None, + index_label=None, mode='w', encoding=None, compression='infer', quoting=None, line_terminator='\n', - chunksize=None, tupleize_cols=False, quotechar='"', + chunksize=None, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -68,9 +68,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.date_format = date_format - self.tupleize_cols = tupleize_cols - self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and - not self.tupleize_cols) + self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) # validate mi options if self.has_mi_columns: diff --git a/pandas/io/html.py b/pandas/io/html.py 
index d54489aabf1ed3..f080e1d1fc1888 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -912,7 +912,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, - tupleize_cols=None, thousands=',', encoding=None, + thousands=',', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -976,14 +976,6 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates : bool, optional See :func:`~read_csv` for more details. - tupleize_cols : bool, optional - If ``False`` try to parse multiple header rows into a - :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to - ``False``. - - .. deprecated:: 0.21.0 - This argument will be removed and will always convert to MultiIndex - thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. @@ -1073,7 +1065,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, _validate_header_arg(header) return _parse(flavor=flavor, io=io, match=match, header=header, index_col=index_col, skiprows=skiprows, - parse_dates=parse_dates, tupleize_cols=tupleize_cols, + parse_dates=parse_dates, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, keep_default_na=keep_default_na, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9c914003c3764b..73d47af5922f7e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -9,6 +9,7 @@ import re import sys from textwrap import fill +from typing import Any, Dict, Set import warnings import numpy as np @@ -293,13 +294,6 @@ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. -tupleize_cols : bool, default False - Leave a list of tuples on columns as is (default is to convert to - a MultiIndex on the columns). - - .. deprecated:: 0.21.0 - This argument will be removed and will always convert to MultiIndex - error_bad_lines : bool, default True Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. 
@@ -501,7 +495,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 'squeeze': False, 'compression': None, 'mangle_dupe_cols': True, - 'tupleize_cols': False, 'infer_datetime_format': False, 'skip_blank_lines': True } @@ -514,7 +507,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 'memory_map': False, 'error_bad_lines': True, 'warn_bad_lines': True, - 'tupleize_cols': False, 'float_precision': None } @@ -530,12 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 'float_precision', } -_deprecated_defaults = { - 'tupleize_cols': None -} -_deprecated_args = { - 'tupleize_cols', -} +_deprecated_defaults = {} # type: Dict[str, Any] +_deprecated_args = set() # type: Set[str] def _make_parser_function(name, default_sep=','): @@ -595,7 +583,6 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, comment=None, encoding=None, dialect=None, - tupleize_cols=None, # Error Handling error_bad_lines=True, @@ -691,7 +678,6 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, error_bad_lines=error_bad_lines, low_memory=low_memory, mangle_dupe_cols=mangle_dupe_cols, - tupleize_cols=tupleize_cols, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines) @@ -1052,10 +1038,6 @@ def _clean_options(self, options, engine): "and will be removed in a future version." .format(arg=arg)) - if arg == 'tupleize_cols': - msg += (' Column tuples will then ' - 'always be converted to MultiIndex.') - if result.get(arg, depr_default) != depr_default: # raise Exception(result.get(arg, depr_default), depr_default) depr_warning += msg + '\n\n' @@ -1362,7 +1344,6 @@ def __init__(self, kwds): self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') - self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) self.cache_dates = kwds.pop('cache_dates', True) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 354826a4b3e7be..cfe9e00a47db5c 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -33,15 +33,6 @@ def read_csv(self, path, **kwargs): return pd.read_csv(path, **params) - def test_from_csv_deprecation(self): - # see gh-17812 - with ensure_clean('__tmp_from_csv_deprecation__') as path: - self.tsframe.to_csv(path) - - with tm.assert_produces_warning(FutureWarning): - depr_recons = DataFrame.from_csv(path) - assert_frame_equal(self.tsframe, depr_recons) - def test_to_csv_from_csv1(self): with ensure_clean('__tmp_to_csv_from_csv1__') as path: @@ -582,19 +573,6 @@ def _make_frame(names=None): result.columns.names = df.columns.names assert_frame_equal(df, result) - # tupleize_cols=True and index=False - df = _make_frame(True) - with tm.assert_produces_warning(FutureWarning): - df.to_csv(path, tupleize_cols=True, index=False) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = read_csv(path, header=0, - tupleize_cols=True, - index_col=None) - result.columns = df.columns - assert_frame_equal(df, result) - # whatsnew example df = _make_frame() df.to_csv(path) @@ -608,18 +586,6 @@ def _make_frame(names=None): index_col=[0]) assert_frame_equal(df, result) - # column & index are multi-index (compatibility) - df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - with tm.assert_produces_warning(FutureWarning): - df.to_csv(path, tupleize_cols=True) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - 
result = read_csv(path, header=0, index_col=[0, 1], - tupleize_cols=True) - result.columns = df.columns - assert_frame_equal(df, result) - # invalid options df = _make_frame(True) df.to_csv(path) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 4447a5580ae60e..a8748c88e0e558 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -122,17 +122,3 @@ def read(self): with pytest.raises(ValueError, match=msg): read_csv(NoNextBuffer(data), engine=python_engine) - - -class TestDeprecatedFeatures: - - @pytest.mark.parametrize("engine", ["c", "python"]) - @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True}, - {"tupleize_cols": False}]) - def test_deprecated_args(self, engine, kwargs): - data = "1,2,3" - arg, _ = list(kwargs.items())[0] - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False): - read_csv(StringIO(data), engine=engine, **kwargs) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 33268b637d44ae..63184dd1a8f839 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -310,10 +310,8 @@ def test_multiindex_header_index(self): @pytest.mark.slow def test_multiindex_header_skiprows_tuples(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df = self._bank_data(header=[0, 1], skiprows=1, - tupleize_cols=True)[0] - assert isinstance(df.columns, Index) + df = self._bank_data(header=[0, 1], skiprows=1)[0] + assert isinstance(df.columns, MultiIndex) @pytest.mark.slow def test_multiindex_header_skiprows(self): diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 90949f6bfab915..39c217e7d95b13 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -29,17 +29,6 @@ def read_csv(self, path, **kwargs): return out - def test_from_csv_deprecation(self, datetime_series): - # see gh-17812 - with ensure_clean() as path: - datetime_series.to_csv(path, header=False) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - ts = self.read_csv(path) - depr_ts = Series.from_csv(path) - assert_series_equal(depr_ts, ts) - @pytest.mark.parametrize("arg", ["path", "header", "both"]) def test_to_csv_deprecation(self, arg, datetime_series): # see gh-19715 @@ -68,11 +57,6 @@ def test_from_csv(self, datetime_series, string_series): assert ts.name is None assert ts.index.name is None - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - depr_ts = Series.from_csv(path) - assert_series_equal(depr_ts, ts) - # see gh-10483 datetime_series.to_csv(path, header=True) ts_h = self.read_csv(path, header=0) From 2811464a87e6e18f5daef87ee700075ebd8a5e7d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 29 Jun 2019 15:54:27 -0500 Subject: [PATCH 106/238] ERR: Raise error in to_excel when saving datetimes with timezones (#27129) * ERR: Raise error in to_excel when saving datetimes with timezones * Remove unnecessary setting of engine * Typo * Flake8 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/excel.py | 4 ++++ pandas/tests/io/excel/test_writers.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6e227f4360db0..83a90443b24cb1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -561,6 +561,7 @@ Other API changes - The ``.str``-accessor has been disabled for 1-level 
:class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) +- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) .. _whatsnew_0250.deprecations: diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 5792f6e2a5a08a..66a00bf9ab0540 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -402,6 +402,10 @@ def _format_value(self, val): val = '-{inf}'.format(inf=self.inf_rep) elif self.float_format is not None: val = float(self.float_format % val) + if getattr(val, 'tzinfo', None) is not None: + raise ValueError('Excel does not support datetimes with ' + 'timezones. Please ensure that datetimes ' + 'are timezone unaware before writing to Excel.') return val def _format_header_mi(self): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index a4fdcdf70a3ea6..8f20136f1ea4b6 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1178,6 +1178,21 @@ def test_merged_cell_custom_objects(self, engine, merge_cells, ext): expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) + @pytest.mark.parametrize('dtype', [None, object]) + def test_raise_when_saving_timezones(self, engine, ext, dtype, + tz_aware_fixture): + # GH 27008, GH 7056 + tz = tz_aware_fixture + data = pd.Timestamp('2019', tz=tz) + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(self.path) + + data = data.to_pydatetime() + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(self.path) + class TestExcelWriterEngineTests: From 65ec968d22fbb647b86fd0dd9ad1de9e64167119 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 29 Jun 2019 22:46:33 -0500 Subject: [PATCH 107/238] TST: Decoupled more xlrd reading tests from openpyxl (#27114) --- pandas/tests/io/excel/test_readers.py | 103 +++++++++++++------------- pandas/tests/io/excel/test_writers.py | 2 + pandas/tests/io/excel/test_xlrd.py | 8 ++ 3 files changed, 63 insertions(+), 50 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 579f39e21d3c1d..40a6970aa7f049 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -15,7 +15,6 @@ import pandas.util.testing as tm from pandas.io.common import URLError -from pandas.io.excel import ExcelFile @contextlib.contextmanager @@ -736,41 +735,44 @@ class TestExcelFileRead: pytest.param(None, marks=pytest.mark.skipif( not td.safe_import("xlrd"), reason="no xlrd")), ]) - def cd_and_set_engine(self, request, datapath, monkeypatch): + def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. 
""" + if request.param == 'openpyxl' and read_ext == '.xls': + pytest.skip() + func = partial(pd.ExcelFile, engine=request.param) monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'ExcelFile', func) def test_excel_passes_na(self, read_ext): - excel = ExcelFile('test4' + read_ext) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) + with pd.ExcelFile('test4' + read_ext) as excel: + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) + with pd.ExcelFile('test4' + read_ext) as excel: + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) # 13967 - excel = ExcelFile('test5' + read_ext) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) + with pd.ExcelFile('test5' + read_ext) as excel: + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) + with pd.ExcelFile('test5' + read_ext) as excel: + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) @@ -778,79 +780,80 @@ def test_excel_passes_na(self, read_ext): @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 - excel = ExcelFile('test1' + read_ext) - kwarg = {arg: 'Sheet1'} msg = "unexpected keyword argument `{}`".format(arg) - with pytest.raises(TypeError, match=msg): - pd.read_excel(excel, **kwarg) - def test_excel_table_sheet_by_index(self, read_ext, df_ref): + with pd.ExcelFile('test1' + read_ext) as excel: + with pytest.raises(TypeError, match=msg): + pd.read_excel(excel, **kwarg) - excel = ExcelFile('test1' + read_ext) + def test_excel_table_sheet_by_index(self, read_ext, df_ref): - df1 = pd.read_excel(excel, 0, index_col=0) - df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) + with pd.ExcelFile('test1' + read_ext) as excel: + df1 = pd.read_excel(excel, 0, index_col=0) + df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df1 = excel.parse(0, index_col=0) - df2 = excel.parse(1, skiprows=[1], index_col=0) + with pd.ExcelFile('test1' + read_ext) as excel: + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) + with pd.ExcelFile('test1' + read_ext) as excel: + df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) + with pd.ExcelFile('test1' + read_ext) as excel: + df4 = 
pd.read_excel(excel, 0, index_col=0, skip_footer=1) + tm.assert_frame_equal(df3, df4) - df3 = excel.parse(0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) + with pd.ExcelFile('test1' + read_ext) as excel: + df3 = excel.parse(0, index_col=0, skipfooter=1) - import xlrd # will move to engine-specific tests as new ones are added - with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, 'asdf') + tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1" - excel = ExcelFile(filename + read_ext) - df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - df2_parse = excel.parse(index_col=0, - sheet_name=sheet_name) + with pd.ExcelFile(filename + read_ext) as excel: + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + + with pd.ExcelFile(filename + read_ext) as excel: + df2_parse = excel.parse(index_col=0, + sheet_name=sheet_name) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_excel_read_buffer(self, read_ext): - pth = 'test1' + read_ext - expected = pd.read_excel(pth, 'Sheet1', index_col=0) + engine = pd.ExcelFile.keywords['engine'] # TODO: fixturize + expected = pd.read_excel(pth, 'Sheet1', index_col=0, engine=engine) with open(pth, 'rb') as f: - xls = ExcelFile(f) - actual = pd.read_excel(xls, 'Sheet1', index_col=0) + with pd.ExcelFile(f) as xls: + actual = pd.read_excel(xls, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, read_ext): - f = open('test1' + read_ext, 'rb') - with ExcelFile(f) as xlsx: + engine = pd.ExcelFile.keywords['engine'] # TODO: fixturize + with pd.ExcelFile(f) as xlsx: # parses okay - pd.read_excel(xlsx, 'Sheet1', index_col=0) + pd.read_excel(xlsx, 'Sheet1', index_col=0, engine=engine) assert f.closed - @pytest.mark.parametrize('excel_engine', [ - 'xlrd', - None - ]) - def test_read_excel_engine_value(self, read_ext, excel_engine): + def test_conflicting_excel_engines(self, read_ext): # GH 26566 - xl = ExcelFile("test1" + read_ext, engine=excel_engine) msg = "Engine should not be specified when passing an ExcelFile" - with pytest.raises(ValueError, match=msg): - pd.read_excel(xl, engine='openpyxl') + + with pd.ExcelFile("test1" + read_ext) as xl: + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, engine='foo') diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8f20136f1ea4b6..ffa77de930cbda 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -250,6 +250,7 @@ class and any subclasses, on account of the `autouse=True` set_option(option_name, prev_engine) # Roll back option change +@td.skip_if_no('xlrd') @pytest.mark.parametrize("engine,ext", [ pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( not td.safe_import('openpyxl'), reason='No openpyxl')), @@ -1252,6 +1253,7 @@ def check_called(func): 'something.xls', engine='dummy')) +@td.skip_if_no('xlrd') @td.skip_if_no('openpyxl') @pytest.mark.skipif(not PY36, reason='requires fspath') class TestFSPath: diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index b9fc9305a40335..94e1435d4dfaba 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -27,3 +27,11 @@ def test_read_xlrd_book(read_ext, frame): result = pd.read_excel(book, sheet_name=sheet_name, engine=engine, index_col=0) 
tm.assert_frame_equal(df, result) + + +# TODO: test for openpyxl as well +def test_excel_table_sheet_by_index(datapath, read_ext): + path = datapath("io", "data", 'test1{}'.format(read_ext)) + with pd.ExcelFile(path) as excel: + with pytest.raises(xlrd.XLRDError): + pd.read_excel(excel, 'asdf') From a173cacbffbd49c0aecafda76ace7aed9b0af60c Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 30 Jun 2019 14:06:45 +0200 Subject: [PATCH 108/238] PERF: do not instantiate IndexEngine for standard lookup over RangeIndex (#27119) * TST: actually test #16877 on numeric index (not just RangeIndex) * PERF: do not instantiate IndexEngine for standard lookup over RangeIndex closes #16685 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/range.py | 34 ++++++++++++++++++++++++++++-- pandas/tests/indexes/test_base.py | 7 ++++-- pandas/tests/indexes/test_range.py | 22 +++++++++++++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 83a90443b24cb1..82e093bc2bd490 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -645,6 +645,7 @@ Performance improvements int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) - Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`) +- :class:`RangeIndex` now performs standard lookup without instantiating an actual hashtable, hence saving memory (:issue:`16685`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) - Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 47dad1788e0219..70ca0b349e7ed5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -13,8 +13,8 @@ from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( - ensure_python_int, is_int64_dtype, is_integer, is_scalar, - is_timedelta64_dtype) + ensure_platform_int, ensure_python_int, is_int64_dtype, is_integer, + is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ABCTimedeltaIndex) @@ -348,6 +348,36 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) + @Appender(_index_shared_docs['get_indexer']) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + if not (method is None and tolerance is None and is_list_like(target)): + return super().get_indexer(target, method=method, + tolerance=tolerance) + + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # Work on reversed range for simplicity: + start, stop, step = (self.stop - self.step, + self.start + 1, + - self.step) + + target_array = np.asarray(target) + if not (is_integer_dtype(target_array) and target_array.ndim == 1): + # checks/conversions/roundings are delegated to general method + return super().get_indexer(target, 
method=method, + tolerance=tolerance) + + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step + + if step != self.step: + # We reversed this range: transform to original locs + locs[valid] = len(self) - 1 - locs[valid] + return ensure_platform_int(locs) + def tolist(self): return list(self._range) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1de20dc765655f..c618b9b05a9426 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1439,9 +1439,12 @@ def test_get_indexer_strings_raises(self): index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=[2, 2, 2, 2]) - def test_get_indexer_numeric_index_boolean_target(self): + @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, + Float64Index]) + def test_get_indexer_numeric_index_boolean_target(self, idx_class): # GH 16877 - numeric_index = pd.Index(range(4)) + + numeric_index = idx_class(RangeIndex((4))) result = numeric_index.get_indexer([True, False, True]) expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 5f7f10e881cede..e9fe1278d7827c 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import ensure_platform_int + import pandas as pd from pandas import Float64Index, Index, Int64Index, RangeIndex, Series import pandas.util.testing as tm @@ -965,3 +967,23 @@ def test_append(self, appends): # Append single item rather than list result2 = indices[0].append(indices[1]) tm.assert_index_equal(result2, expected, exact=True) + + def test_engineless_lookup(self): + # GH 16685 + # Standard lookup on RangeIndex should not require the engine to be + # created + idx = RangeIndex(2, 10, 3) + + assert idx.get_loc(5) == 1 + tm.assert_numpy_array_equal(idx.get_indexer([2, 8]), + ensure_platform_int(np.array([0, 2]))) + with pytest.raises(KeyError): + idx.get_loc(3) + + assert '_engine' not in idx._cache + + # The engine is still required for lookup of a different dtype scalar: + with pytest.raises(KeyError): + assert idx.get_loc('a') == -1 + + assert '_engine' in idx._cache From ddec4eb9d1863385e133d074fba07d8020141b4f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 30 Jun 2019 10:56:35 -0500 Subject: [PATCH 109/238] Regened files to remove build warnings (#27122) --- pandas/tests/io/data/test4.xls | Bin 25600 -> 25600 bytes pandas/tests/io/data/test4.xlsm | Bin 8022 -> 8360 bytes pandas/tests/io/data/test4.xlsx | Bin 28216 -> 8344 bytes pandas/tests/io/data/test5.xls | Bin 20480 -> 20480 bytes pandas/tests/io/data/test5.xlsm | Bin 8017 -> 8642 bytes pandas/tests/io/data/test5.xlsx | Bin 8002 -> 8626 bytes pandas/tests/io/data/test_multisheet.xls | Bin 24576 -> 24576 bytes pandas/tests/io/data/test_multisheet.xlsm | Bin 11148 -> 11313 bytes pandas/tests/io/data/test_multisheet.xlsx | Bin 11131 -> 11296 bytes pandas/tests/io/data/test_squeeze.xls | Bin 26112 -> 26112 bytes pandas/tests/io/data/test_squeeze.xlsm | Bin 8990 -> 9122 bytes pandas/tests/io/data/test_squeeze.xlsx | Bin 8972 -> 9106 bytes pandas/tests/io/data/test_types.xlsm | Bin 8733 -> 9042 bytes pandas/tests/io/data/test_types.xlsx | Bin 33769 -> 9010 bytes 14 files changed, 0 insertions(+), 0 deletions(-) diff --git a/pandas/tests/io/data/test4.xls 
b/pandas/tests/io/data/test4.xls
Binary files a/pandas/tests/io/data/test4.xls and b/pandas/tests/io/data/test4.xls differ
diff --git a/pandas/tests/io/data/test4.xlsm b/pandas/tests/io/data/test4.xlsm
Binary files a/pandas/tests/io/data/test4.xlsm and b/pandas/tests/io/data/test4.xlsm differ
diff --git a/pandas/tests/io/data/test4.xlsx b/pandas/tests/io/data/test4.xlsx
Binary files a/pandas/tests/io/data/test4.xlsx and b/pandas/tests/io/data/test4.xlsx differ
diff --git a/pandas/tests/io/data/test5.xls b/pandas/tests/io/data/test5.xls
Binary files a/pandas/tests/io/data/test5.xls and b/pandas/tests/io/data/test5.xls differ
diff --git a/pandas/tests/io/data/test5.xlsm b/pandas/tests/io/data/test5.xlsm
Binary files a/pandas/tests/io/data/test5.xlsm and b/pandas/tests/io/data/test5.xlsm differ
diff --git a/pandas/tests/io/data/test5.xlsx b/pandas/tests/io/data/test5.xlsx
Binary files a/pandas/tests/io/data/test5.xlsx and b/pandas/tests/io/data/test5.xlsx differ
zN(+B;6vmHvI+J*mZKo?An12B!I`4U$eFu$pjauR6u(knKRTd&95!u})G`G@#8O)i( zq$3U~cez9Nj!BPFuKG@!VP(v)u%q1FskUPgg0T+H!<~-?_j0n2|J_3RB6$LWef^i> zpl~Lhgrx9E608>mIST&5wnyHmJPueHkUG3)S-d$l>*ZuHw0;QgZ{blr$)NZj_gwkT zczJeR4M_Rm>>iO*=y(uLrhL|2ACNmjy_Zi-)?K~EPCJF&(7)UfImtU=f60IBWUk-N z&y?$JKX&CgtejrSaPG(;kZc+>^3w!+z#{ z^o05tgp#^ls8(xixB4Q6p_%OFQ$Qp{vQ55PUZaD%3>VI#KtL?GhbXYypB0pWh3T+FT@#xTWOq)z#2| ztl?*Y9xs(OkqSGm>S$G=nC+|&DGMIWst!CSI0T9eQQGqo-S9646_>w{u)LRU=*4m( z>ruD1<3Wt7;UdZ3Askqz=sVAG$DOSELMpxIoV;sIlt!&6mnJ#2hCU z)IxQ+Tc=QCLplO{&Jby}g!)}h1~V*!{gWm6CvS7?<%J}!u|DXX?G<-3bAIFw0ge2s z0%0~i1-W@g*}ys4am}{0ln&z4ih%-BtYUWMZz=MlICP^GlLIIl!SAWH6h9$46m8Ei z#9L9-GfoGni)y!KbBa1L(dDeGGuSS|O0zvGt1V@Gg>(_qw5>O9B7wBb4>lSeZ0)RY|CuHs?HT1%9PwWo$8ytrfl+kGtla?gufrFXJbndA#=C zKqLJsRfi#lw%y)PY;0!IK76P2+ClpPb$22on-+T7NU+5BOoVBQM{k!~Pwc;5(-)vv92{T**tqoe9mHS}AH+1YEg7;QXcII=uoGiaBYa}$NB^j=XZNn+^e ze3^RutptcH?zwA$Bc5BrT^>!IDMcT(A`~txPQ3m7eClKrK9XBPy`|o>a*XeIj(SjP z8P%U}CJ8Xxsb6~>BIHfj*UNA|BvWC&Z;DA(qA$&TA?cMfD?xgF?zX?3l>zkC z4_Td~4wPwHa8^9oV0B+ga zU3l7P;f7c~_JaXW+y+FK~7AIuC%!2st6yll~afP61R1wj?G)L~$Wu*~Vtc$r?fpK=V zNoZ-z(R_52(!i?6QskDW+mo!+!Op~OXo~nKM*<*y!Ay5#-?5IaF2}$~k4(tc?cTmCAiHxSKDJO-t#_Hw2a%puvYq z_&O3DNMI@BagK3mf_v2@o(Z(YmoGLhP8QfI)f;RN(>Z&6i;2_a(2T_vbE}vXdwK3% zD%TUzmN0>KUG0<-66y5ZwzG5iDArFwxyBubDO`r@`C!Ry8owrUZN&;_IGE46Ldvl`Hxgc`m=vfpT zz4PLgK2MP|AIOXBB{tl6FRQO;kq}p%%jQ{Bsyth!DPE3NmO$s2>LXv#DTgXk#p5VQ6Xh!=O##-~5oQkD`A`39%Sbmcqi?wXz;?byJgQkx(JcB&oER) zdSdX#Lpfx~C7qB+O8$NdZ@BuX`*E=78^wZet ztOg}ug56fXcOSy^4C;CQ-Ay+xKXfR9{t^RF7YOao9Aj+nX!<{r0U7DPH?PlY`?+M;qm+U_o$qHM4$zG>Xq3_Jx331V=7OO&zWKJoM6N1ueF?F%gxbe@FB z>r{LV_ZeRnBHXtb+sX2YZ-P3`EP1)!#hzzheo+~5V?iLri{Ygl21kuv&v%uFdT)>Y z%!A!esR@91QDv5F|7B_F=ntm&ORx&p&EK)UER+#ACkD1!$Gy?RT(zYqf8Yf+uKE}R zT&AHA9E}gzxjcWcQ$$DSxslq7GnLkeKo>VVU75w%VNG2VZc#Q=C2C=I#czB;E#PL$bbH6 z!s>Uyzs{NbS#TTF0R7KXC%^0YeYoJ4CN5A6{M)F(@4~+oo7r=ih zKnL}2L1+I}4*y;B_d3}xQ7X_B#lJ87Z&kD3wftTq_@%`G)Iz57yV22a{_&pN)UHW%N_e;Kk;^#H| q#qIqr{Z~%@B@3!#QvK=Ne=xM73>3&tf3!W20MZ~+pQQQm?f(Gb0HLb@ delta 8335 zcmZWu1yCH@wjJChFc4e^cL@*(?moD?5AM$379fPdg1b8e4IUr_g1c)75Zr=)?tS&{ z`|tm*uIkgZcb~4>b=H=(>UHCE@WF}*h(G``02KfLPy-m%9yG)1@fqNunyV^(oWL&r zGxF56jOySRuE<`bt>UC;92Z+0hkX0YoY2$j94rt7RrI5jbF-(-x~i&omlIU$lVwbZ zf{pv+m_ChoWfuDp>ucxj(o16yuouEPX3jn*SFqmEb&xPm!DQA{YzXAKLt&C9(1o)a zE8P7)6M;^{(XZqOc29@q|8COvR)0%6ImkOlRxlsz34*^n7(`tP4>y+}6?mGa+8my@Z?+$9=5<@YuM-l{^z5%R& zU6C!x5Vf#S*`IyN3uhElduPN*dohhH8@I?Jk|i>wIWe4H@o)x5RX&ED1oTo$e(-F7 z%=kB1!FD=v003A62p7F5zO%8~Q$F}u%fXX?4 zKQ|s)$1Jl}`s|IowN@=oK1Qwjl>5VTRnO1dpKN%f4^_DKn`InUOo-hCCE30A&#rHM zd1*#`^BoVJr4;fL`FYJ`b_C`sObTi4kCUzrCg+}a6|Eo-@6L&Ut6~>hyz_(=D>}^0 zWx4Suu`4tH;Q1K=@bA+J>johsvApZ2A#MDF%mD}ORrtz@8~JxIoTj}z2^KwYSo(}T z1A`M)G8-J#x%ECfJC`kanU$w)(Q!T}K0l{pE;@*2W^@)7ekXyG2O>xoIv9Ua?^)iw z{Rv0pYv-nC5C$&@TCkqi$$VW^jFTp(!&bjxE*E^=6}Xd?JSd}XX~3JWp*t@}si1SN z4Xr3-{aEVy<)_mD+~OyO*t&F3EFu?783W$**Ynds}et*)!`5ly&d-$PM)Oc zAMe}jc(^E?9ay+fwHQN1Z5nae4Q#}e9AZ)9;pd5;aB<^pwuVxqUIW_)(Y4zZP)#Zm z@wjp9>EBQgr(V;N{q#vT{Eg{spZ_Rl9$`+g)1knJy;oVNx#e1gT3>QI2$@)AiUYcL zF&4kJiqz1}c<-~EZeh7>J*|LLdP!gGPWH1_Yt7)RzH@U)Bu!RF+DZ%OFz3EnC(Z|T z!tXzTs`VbB@%|1z1YbXGZGc5D z(F?+H;YU|&*Yq*sm->cHYNhtN1#@dnrK(eipRkOFq}!qI5i*XqO|uC>(LcGceh%l9 z45vWWVBJhGzOCD=pZVhuiwg6xrbY=e2bmF)&KY_-@zO*+33k4PcN z8=|3N&v%h4^l(gDymEvsyt*JN+sxm#AJ>2#=CCgGgMv$o*tH>18Z)3#1l-8^U4b7% ze&tgycMxHu_a7rN{!#JBD5nJT`x8?OpU#x-zfTIy+yQ48?*jEj=WLK^{HP#so!T3eZ6I=-P-Uoz_iq{t|??Oe$U@lQ#o%!@V8W2tFA zI!ZGlw(!&AbN9`OwKm=8jCBAhXmg??U&Y_|y_sVLdA(!7kZMp0ATbz<=5zdIGpd68 zN2G-<77v>v!%O~Yao}-8x+#rVt*c+V{&2W^l%0oYoVj7#M{rcg6G}FJrr!od%L^c# 
zulDS$NxO#ucu9KYG}tRtTUKqoS1>Dsr*e_kv-L~$=T2jv4+!^CS)Sf}K`egCW(%m@ zlLY>?_VLCz5%?T?V(WvZaJEL+r48}S%7{$M3r@0aj$m&yo;8W*i&7#I=IkS_nXc60 z-m?deMh1Aru+u|Xw>p06M<3U>q5$BEqG8{*M&9P+y&5&ARO60|@sulpd%c<`9G84! zH_f;R!4uXWW~6stYZA?&)qQPWTWj+0=Wc-K@albO>4%(Lk-Oc?v+ZeF(b6v7egvSh zR=>_V^z5c3S05^Te>@AT9sMIU@ccAw)BSi?OpJPH`{NgMcV6|`=jn2nKlk~zA0Och zL18{RWg;%aoM(XfsyTI>GO~v5WbyTYX$e5AC3B=D;Ap5I>OV6$NWSdE13g{+1K9hr_|L?8nrlV{_~! zc_sEofITnNSsm!gJMijfs?U%$5vh!K8ZKLntR=89T)EXVP1jK>j+!_zpYP+G3<@ax z3#&3B^{l=_!m+NSjB?@eh6Onq2D}==EnJB{QEdT|Z2s^rAVL%C5A{|^j3t!#gQg+> zX5v0m$>>3XYf`#{(^pPoy8U@Ekx@I*f)TLnVAd#9Nd)JW1#?5cbx?3$8`?61TAsP6 zf;Fe~M^Bh5{zLRWi_|+w3UCw8syFJ~RA4ji3i7TdQ#C`hPh&Jz+n3l+e9(JEL2=y* zOw^8&dFq%k;7L=|O~Y!F0T&m0P8e4GTEsn$f(Hs)K_-#*fLDxP?ZgSe($hN2WzFY@ zYfoP&+d$rwIZyg=>+X}#O50-&uEKqQ{Fxbz_XZj;&lQIxBKw`+)`I{a&PJ*3@I**t z=Ahh?F4uN0F2yARZ`7*)BB4r&{7=6OUH8p00$iuSquDuz@|8|Yp%jKS0ywjKdQdWS%mYsgA!eyU4N*R)mf98>&&|>~Lb|`F zfPJM_{qq2P*{)s)c@olnMahw%DZGq}g9Pw{bk3A!Z37GJmXD{CC4C8#v9~nd0lA82 z71z)kTQ!b78?yJUNk-z6lOP%~v0i@?Al{WvNczy%_KRH&>0vVDQi`!bmys881+{>> zP8x=-mher;X`z$oF(@xrwsTs^*se7(xcEioZj${}q48>tR@uZ~h#uMx?T&63-46E# z;}yZB6~HRQO$a%q2qEw+v8A(hXSN0}|xnbA7(s@eUz&Dg2aZ&ViCf zFu{zj8uPLw?Su+009FLj1Q$T|;&-LAx#vWTs*CB=c?xn35&~JV@Ri6~l_kEja&LcE zwcC)!d}AtTtND~s5nW-Y%~c(xMZ6HbI}rP&4?`MexWkf7Aa+ zV(7v?+jz+PFn&#PSuoYKRO^bo)_}vuYMn#CSU#?Bjd3PRp?G&j{|dim+Asach6=Gf z!BRX#Kr!_G`CnD)iLU)P3IPD{h6%qWfpW9^I6A~>8+^zV00z}IJ@;5(TgLbo9}T~Tz8+mr5BS}P%;7Jj_jQXi7Z^NQ(naV_NCKMW5x7$ zAP#@6^wq9jkLWqTpXdh`4FQ{ZMocJ2+-vsun_f!ihp@7o-BfpJKz<^IMWyEcCuoVK zL5{{-LOPBfSTu7fdgSb2vekfH73_S}gLUMMdj_FQ{DxL{70y_T%_=OlaXnr$3Y0j4?rjGMffrI zEb-p88L9`4)JT#!$I=?`pnswjk%da0 z=p-!0`S{hw4#s%kTrr2>&0<>DY}>e9ApNQBut1pQM648oD4O<&Gv$*9tN-0tkj?XM zCK`9v5;&YtjRlqbN9PbVRKgxFe9XY**GxjRioq^Y@3gsC;D^UKW?-8ho?JgFe%pFTG5Mg*Kn|0+%@3+xt;XC zyUY$Cfr#m>Kb-=LT+z9qrYN8JS(b-9r!RviisMdCT890bwdGOZF(?uEA>)y3!)O4&z7z;PXl)3JK zkDDe=7o$Gb5+z^KGgQ|3u(ZJxr2}q|r}plHcx6du^{=39q|6KsrHr)RA)Kek_Y7_3 zO2on7Z{=<36os`X);zqL{0^ca4V2*Hu|X5qi{0U;R0K zl8TalyfxaZ0uw}=607x?)J&<**}e_LF;w|F-TqrGCk;?9*QnWP3!eVAzvHLpB2GyB zYfVs6!ro!^ZRofLbL&pb0YaA6E8)2SGb7z#;2&3Xbk3tl)8Uz{w=n4KwOI=jIW19%`=pe&(%!;=+Nhu4 z5x(=JV|U5qm}iUo4QoT6`L7%T$4Qqc;5pJYb(pq3AsZYM9x{yBEl9V03Zi z`B3SnA}UUP!lV`k#`Dy{Cxt(`bDmb)d;_fB_DSYGF1;qd%bM@H%^zRAXWgi0*!D^i zXTigI0Hu!~cL%vS+W7p;ew!P}A|5Ge<1v}R=JDnx_03<%Jchn@`IV3LiY>qxNlIC) z@QGkroI#^$PRRxf`P^O^h`nZtw-4wprS>2e^EAAFs9qdfW4@rpgNG_ z%@T1ml@F{YhbrLM7F5yu`rVyAgjZ=h%n*0LX2I2#*|AscR=(K{6L-y3bC>zv651?$ z2XU*zkTUHCm4SqIE3*i4N;TU+18sZ1vbpEiX8&b2XsCE!EEu8KVA&$YIxRs$ta!R} z62`4&gw;tSCE!puIpkf*g`jQ>Q*Ov z_J4etIL_W4x{~p7=5@<<0WQAIaeS^mIUuX~G&fyzQ80$~c_Eeik&{>l2i7y*0_(^` z*-jlgMGnR|^Y1+tD|HfmDniUf7W}|TD6pji`+PlKxIy_s-2Iy>fpvh;(L6*veIj2F z8UA<0U(-R^5Sh|qgG4F57n|X6Mba{ zl{4;+r6h*p3P}_pp{D3A2r(VDZ9`GN(9+>qXv5Ioz1^lun%Fgvoyp2m!ePc3@?M=_ zdR6TZ#<>R#>pjmhZ?N_;Nm3xeRY3=ZbT3Tm^V-rqk+$NUMFk4?RaL>6OJXqR@|HQK z_yo076Uk=6dr)?w^QP5Gb;D?SnE~`B9h@}DZBk3T8<_3AXp%_q5jGBgZKcuLLk?>kDoZz8& zFPL6wBg^_%bLOJcRp_d_eHfy=-Wkl=4Mju3%n~GfFVQ@>=tKC&Y2$Vpc42b;CuZVE zr7sa}N3qy3N4Pqsr_MN*0m|7^6}-P;pQ5Qn4k9Iwud;4-B~n=d!#PJMwrFlH60Ttk ze~(New5V$;dXvN$^VYv9FPTbzMvkC_KUd7Hj+?y~T@L;(3KX4`V?l+R`z4&uOpVfoKv=gdAaiWRh%>c4RrJ~>{Ck(3FW zqE#IF5;W=Sm-t0q4fyaNLk<%?xV@TD!3hWi`Bmc$?0;ApE@pWsb&IV=WMgu!r7CGg zI&fo?N+CeTt3Rb-&|W|1tTTS|TI7~CQG5(i8?mYe?C2v-t?zv~4f}7bI?t6*!bq*y z^a_7NTrxL!tPTV^Ru;bBH8hl!vVJM4lI0`EbV~kg8Q)OZ>liwoT&~`sSyIKpl6jRZ zJ)eB_tt6~Bz9F?r&Ff98a-Ev>F^N0pyscsvRabFJ9ePsb2-FmFtF?{I!=ORmV77i7 zM0Ta@jDmv#kJ2g>X~Aft#7Dkhyu&DpQT{xa(_eCrr4w~18^W(2$>PncMJXD$q6WNO 
zV^&zrobY_+P4h+LS3No6_JO6L3hhM41htKhj?l%7Y(G3*;2dI%Y_rCyJq`Ce=+u;Vi(00A<@G#>0W`^ls__i&K7x18IyCyXg!NH?O%Gfy7BpxZKb_8 zoQu@_8;(W$RK*5RaB}Hq?Xd#&mhU0jfFr1?t}xT1GQm?C&b6K|q`T0DnF`M=skE4I zi3;UApq{;;%rbEB4MBi?t7>KW&elSd1?8`=Wapr$senWuihgr5{MY7|1pe^2-!#BO zT!rW^6Q-gRqyom`wEpSrCpe2jk{{Hvl=u>ld)}==Gsy9J@E zJ>Hes2q_m$TOOqZ1g1$8mb`gcNoZ#qQHS=A@b_Ta+3ynV%gp_pCC1?}v`x*W!i+B>J@Gcj}}X!_ypdGEZZ}Qn0or1D8~Mq zpvGWVGM{|Mk%(=GrLQ&|XJxr*FX=Be+6>@n@%Llf;dM&_|fS! z?cR<`crgNEOk!LcwK>czlA~XNLANO)K(TiyQ!rKS1V#E*!K;Ird^=zU;<0lqRjCTD zkjTa@$G^J2clmk_`fvb%U85O24IFe*C5#god7>PEGUab8_w|zzQCd1?;aN>{ zR6J@#lhz>ZEm>DOQgh6t&*H`N-pF$z=P8 zGsfNGcqB-^m9VIexi#F$e>=Rut2_}Nh?$nVReH5rQhTsh(UAg3mQv@;4_Q`VXi2mp zSwj6_z50mm^m{2=a75g=W%#G&#rA-*8+Z*b7G5bFnn~7D%he<9$4*2FAxjx@@rBiZ zxv4Lw81QpP)dg*mIq^iiiGzN7i${t3%AJe49%vlEIO*I8jk)}|Xei+1bMN=|oh)6Q@_X`PRpE_ktYau*rCVqD==6T8 zGFUjYHm^NlFEZd)>#tL2rgraTz^$WuG-B0`WZW1@{AJp~KnBpQ>|>{`g>d9b9i=~l zN^HF{-X6&)yWYDpiC9@-V4^vK1>Jg>>2AfT{ zXCy_{PPU4+Kd1+~#s?Z&7>JV>?i~}9C9Mgb6lO&r9-p4;_-eL>9qW7TmNzIR6qC~wX zdu)0V?jBV3nL#ZrOefsU=*cUX7TQk>4Jgr|fmds5p%G$>)P#i1g5!?)m8 zr)1}Bg5S9lqjz0-Y0YcVAoPIUd3UDqdl39$`E;z=ieoBa(UkW=|uqqC|9r)86falahnJXlpCUxCra>Ek$YXkC9yH^zhe`cB>_^^|5)OQPcQM=fr-|J`Wyb?Kecn~x# z5;ret3ATKwg8jD|xiY$q>FpHtuI6NCCacoo--XFqrw>O*h5cVq(~02hsZ6mD!>vbB z*HI)I@%LR3!|^yhwR;Otb%Ag?=^-qLhSa8Nqjh%Xh^(=u(}`#cB5?Utpd%!`SQ^gj|B0PrHk{;x{}*2P3a_K$Ev0s!FuFDn2Lh5>tEq9gl{;n!6o}=aQ=Tz#Dp(S1h9B!0E`m1KI!WC;sza zBMO)>Go`^lx%wpxul{3wS+Pf44RkW)2pTwq|xt|JoYqrRn*r?ZE;lylltH_}AP20Nc_{ ATL1t6 diff --git a/pandas/tests/io/data/test_multisheet.xlsx b/pandas/tests/io/data/test_multisheet.xlsx index dc424a9963253a1b3bfc7432a50b061755aa2dd0..c22771232961fc9c9a91f4aab55990461dab22cc 100644 GIT binary patch literal 11296 zcmeHNgG8SX{5VDK#&-^WvHRMLsD8=knZjl>F!XvB_+Snd(ZV8&$)lW zclR^1_ssK~wRX&U-<2xzFt9iPc)$|?06+ogcYpWR843U(hX(+#0Z*V`iQC&bnc6uS zXn-9|LHew2wl?HBu+a2b0BFeX|F``=egdBp^yRzRae|NG9>oVsYvyS~2b%O1TBrQZ z5PH5kvB<~2jUPT-^+@$1S9{)vd!XenLI6!6H3 z`RJ~YJo}T}z&d!xIw+Gb;}_Eob#HtTHHkm#5c+$pk(eUZc6{b%=T_4;h-=;`&LD_TF2;(jpJ}4N)q2+QqP_mm`6orZ`Fr1_Q#2ZklC2 z6Ai*`Lmj>r3Y5NAq^Ys1s4mXsH9T!j*>Y%6+Gg%1Z+b&&BMus5kn>r><@~E0 zbtchFb_AaU(Yz>h##=1GOPmLhoyNq+_7|*j$W*t{d(!r=y?0o8z?9CqMHlpD6N#e- zZ2;J9xD97#*~2p(S%VNR@C0cjw*jlQ$__Hwn|XP3lc%&KPBGsn1v1_D-(n4Se+S$S zSbbEN7F!dC3mLQWxI>VZ>yCJdqPW7S!m=W`b-RNM06adz095{h)>=(=s&j}5DnP)9 z3_+`bqp1yujrGU=e<=Nb49!2?dP$t3ayL6l@UhHY$iUU~@>>isc~?=HW(o~&U)d$} z+NfM=;^lTa5)2KZ02pbXR`0t{i_3yhdjk}g8{8!kSU3Vyb*`ns?{3~WBQVlBCP}?1 zS?|GgnZB65Op%rY)4R0BFcvoyX3Gz*Qb|poN>pNuvc4oj#Lg!P#uG~M)9q8#SuwsY zgPImoKP(NdsDGZd7eAWjGn-Vfiz*Vzr*JTlf;#{*HlHu^9I&Cd_)es$Zoy|+VVvp6 zPYE_KvFSLINNdNw_heB@A5f#@#=7PlmF}m>y71Ah;XWP80QWpc>@FTS8T5}JD_ns% zuD^ri`~pD$8?vGU3jh!SoA`suJ(@B#`gBsKb+bhn1O(4X3j-^G9tgL$RGq zUOLl?#DoSk+meKldd{A)mo4#o%uHXwsdkNz#ZbKp@ual?pDAL)4{Z@6kJimqCAe+8 z7xOu&_{`V!XXc!t6yb6PMi}e`q3DTGn!nvpj0026~Rv;$zXJg9Kw6$Mg$9NF7e5CkxHN}C*Ac2Ytl|$i- zy*G3D#RN;dg#V410G!4}`!UIDIe8vu7XP`nX2G`Yqsqnxy?0$;LphL6Z)iGrz(|u8 z^qK|ebjIZDF)$8!sJKu~C=Tvo*HrD6s^ZLw z)b|lC_Q&(kkQqw{4L@+}alGYsBHihTH^a=>C*%p5q{pQ4%=IIC{f2^ei@!ZxY^;!-{y~Zmxn%v zG3L@H@LIjl?fB{oJ1<@5*qOdadU|s_`N{G_kG#GQPNcvf_4yz#7#fc;l2evg{UFtb z1Bi2N-_SOBu_E45xc>d@A!g<1%fjh2@YthD;qVYNG%qJI)XAE0L0azEXjNo$c}5Ov zTs-rpAEe$>IJpl-SbX014$efr3nQ{FQhMhw5w5u=SG~506juXrWX-YWDy07Ln=OLKv({6-)|yZn0N`+YQV9Soh&#WGXz`TOol%{;E+Yb0Gr!SKVbJZ0BV9F)WG8i;qYO_KRYzjp)H@F9NRM zI&XcdIt?H|_&fXJykaNufg}@ZL_ZRWpHOkKFts&h`?=@%!Mz7MqcOxhIBis?B9U8} zTeF|3b$DLU#OEO|+ZNmDPUSv-!+GT1H$FGNT=+3?feH&Nhn!VriI+S}jp+@8&#)LK zPSGRuXr^r63U5t9sfLOx8{z2o_UN&Y&*-IIixZam_LC3k6?unRY#Mq9bX3o12Wt*j 
zNYLaIXV;-=fD$_Xi$~C)tBK3oV||HNlwBMySw3C$DvMEp$+n3@p0LBA28LYe9nFaH zKe0d6ib@f%*|iw zZ7)U6S8|#W))Xl*w17JHi4}82sXE3)H$Xk>8&plwEB7rP3AtpXAtXFtNO$Ua9T28Vmr6Ps5G zSCS@xa#oDm^`<$el~Nq)9{3i)cA&#@%tRx3Qf#?ICDaly(nZJ0xdWFkX={C5Fl7Nj zd7J+TUQD{KQaphm`*74eE2GDZGqOVTj9)VBg?hT-`he>J25{)W4%_(JIVMuAZ(f48 zK5SC5XD^O|j~%OMr$*d3bf`-TL(KNt>^N!Cbn4muQ(?~73UZ9vw`9gv4X2!`$T*7% zXsQa%A;*-M^D;+l6S2|zAgI_(<4@@i1R%S85KGW^E75l7Z${|1PxX6fIdQ*^nSt;* z3aC5|*dtX0s~u@&^ME32A#ZaA8b<OwERS>{``YVwRDhjXyd@HH}DF(6#17Br8ZJj

* z57B|C^Nkf2I@;X%-%SgXRP`F63rlmL1*j1&HEn0FswM?V(lB~~gdYwJ+at?RMjEBu zP`(2Dq|9o#Y*td4>3znWtYr-|K>9|hdxA2?@F#Jo(rkvc&D%oO21T=0vL*eb96GKMigZ8o} zHn!;rP^twU&!h^j4$qQkpi-iT*`Dd*PeHKQurH=LP7&IQtHt6nf)C0Y>{j^f|o!@D>j^yDJA zHi8)4Ts%CW>v=17hwK8l=6z{El;859+S9>u3rEZ0tEXM5WJ!#Q?FQfWUguAp3tXu6 zU+qt~8I+zk)q1FCOW?e`KbRZ@*ro4KKKkY?%nIl949V9bQ{lZ!xsg%R>Y0{Ku(Caq z#tHG=Swgi6&}yAVl`C~OSNZ0?Y*vH|zpTdggt%3d?L4itQ)re?&9Gb@y)|o<8E%@L z87wL}VP7YQTCTP47Nt~u?0mGjNRSMsrEO&EXbqpe{AYi~mlh(u3j+WkqC&#EfB7qr z6WGQS^utr-y|j%<x+sRBwVs~CUPck|08tr@Ku6HFRC zOj>WfzaPa&mdE%D333#W`o8r#61HqjyY)5@s!Y0kZq-liKr~&e(OtE8b6*CmIZ`de z^_G#@s85i0l}N6i9`&uMxvg6TCtA5`oT1|Dks&b8S{8o~Db*lde!_l%@!4XN#6J&N znFxM@n1-dCf4W#|6;N278vOF0v^3@Y_nIbhsDp9}xJu=&@zc#5RkbUn{7W_QYbesF zOrQ8YHXTajurFnsIgg9r0%&@_eK!0=tdAgWSYl67`nWoqB~_Kq=cN6%sJ4x~(O)qT z#k~u>SVAl)#-#-cUZRWXg;r})5CH-=p`puk=;XC|C+|)hU@ow%fPsXU_x(q<(lMe0 z28h=IU7NX0h@Kley#f+dfdFMB6F7+%}QB?t-Zydsy zI%}*1{lt4lb;(N6msaq3o4!w$0Me<3 zDMl7Vt2n4jGurEbjpIkfaT{g`G1|&X4ilHrS>PS0-sKn~Ptt+keHK0ZtXX8AM|KdS zW|rg0r)id{WD5R5Q`@fzl7zAA=WSsYG80q&F7-|AT)Tt8C@Cy5W?w#xBusuzejm96 zAFDdbO!0NZg2!(yPpZa$4OoRr^O=(n1+-Pg9F7IVV+TnZO{nA4@ym3O5vLA6OQ_ft zRjlJ@_pmB|T{G|P{n=fu5T^R+*SWES%mwA7VRes^SYR}>uxtHUhs z>Zc_PAc@>UU!&^bOG!o~V!D7_tA{otmQ-Fu6nUpOl&7H9c_mpK)0D;y#4yTgI7}2S z1m)Ze;7ou;9Wy!l{i(sS^uySP&fXrh8z8z6UQ32oKj`ZH?X+!O8=E*AP4QHkB=cpJ zTkYkBPNi=7P-I}<_0>n``=~x=i|TAHM6(l3+q`ldejn>FLtu9-!5D- zE3AQtFwl?uJJM*lFD9tNTRG9XoH<`D_nUf}dfY{na-IgPO)--ByOn{5VBdXM*_cw* zQ_n8)-9%46QBFBwdo)T~{l^MI{Woh%90(`KAp!sdztTOBg{i3%i0$Wr>jym~XzAOh zvEz8XuX*&ntFdG;kd-CEEC3YGgvd@#*^Z(m8{)E8XXM>qH9wx7DXYQ#{x{SCT0t0HNs z%?9hJ6K!O887g4!DL>R&%zyfd)r57{Z|jv&Q3W5)!&2xulQ%#g-+NzROj3?~a{pD~ z&4lCrAflHn_6Y1y9-@;L)%ny}sk1ed`q~E^lENC|bQAd+N4!%M9l!Fkvyiyfoha1A zBOYK3qr-dFSX2~$9BS1*R1_Zptk@jwoH0KxoJZ~=BA9&VuLm)&9t6B3L>epm{3S}R z<<(QfSpbT1oGmOS^}Nrwv1c=RkA^v}BM#fKn+9oY{4_JtByUKtV>=N2)Fw5Fu(Ud0 z)@SL9K61;jdrv&xUV;m@6>XJO%fsHfYH(0e%_9wd(n_eCbg)$CNa8)=TO?Ky_Qm7T z>fK>3D3^gkMQVa(K0%iG!kTJ~iM}e_SjW}+tt@@CEsl4DSO=rYfIO?-<(m`#20VtdFDo6} zx$;W3fzhk`;%J?sr-H)iE!k`n054;{Eo#Z({4m_cr!*FZ9Lm z`jwH8b~cRF<^m?Mql$Ma1uX(|^_+<`p9UvB*97mV(`Q68-o!ITNmbsoT?z5@d!Tu6 z9g&r-_dejd^!O;Za4yp5Mj+XbB$FEEsvdOOm}hZ#pzTMcU91>s8Ld7Oz`%7k|BO~K zPJd}w$2FEL6p|5@raA4lYpwQk74G7}oe=%c%jln41(fV@F=S8|M&-x-+C3`GtRH}IXN5#c6BD`v(3MW?}-D4YOx!q}usqs+zZ%O!hr&P+-*+_aK>WlV#QFkUi%YhEZ9ZGR8l65W( zHNV=AL$3PNc~yNF+pt+?5WRzphs+B8h?2!coZX*k|Cty&8Ct|>4KP`Tm^yM1Bosm0 zM0oA&{lMOo*lMmAJI$6lsXY$+DbG6j0e3s*NlOR~GJz@T3n?t7x@?N!vqrR%)H_$V z;6@SZljiNqj^-vak-$32xM0D@A*{sCqA_Ab#X8wMx)B^3 zziED*>aR38oWXH^8r`DrIY!rCTu&nfmm(0-jYid93IYW$OE}cZ3pHF(2%TBHGL>=%N8^wa1cE^>xs^>vrHu`1P)WK!g0oY$KVY848)#3_uEe+&RgzHx=(f{K)&U; zWyiY!jtHCiH?>M^lVV{;6hMoh=Ja;YDo2}oygW)2x;aQ=K)N~z6T&&3N;=54(^Cq_ zKZBE)^*GGFfycbWD0g#MU5BbH4Uv?N>S`68S+2(k=E-4wB?YT^zQq{C`kF?u>PDA& zdBo^Ldl}fNrhOiUr54d0EJ#mqKHe)7G?y_?m56L#w^S4u&dQ&d96nBt`?MfO*-y;& zzzc)l0XGx&mB2}sKu+~q86^UPFEaQI5{3sQoY4J_tKf+M|F)|kg&?BcJz6RwKl0Ip z_nPY+T6?J1;<3q^tLMmZhnO4I(haGTk`vxi{(T2q-DduK#h$i9SN;PbBd|V% zrA_!Z*F5)5o1cx03@_PrrJN~A&{djrSMlQ;SKu7-HSc92F~BB~(d)#pTIIYp5H||d zhWA`i8_~S=@3wqYX1In_8~=`~)V(acVu)YbhrDtA9kW^(IhvZNJ2_g~ng0x3O-4kK zy4W$`##+}%yAEs@7%AflkVOT3pOV+)qH@PCq=c`mg!*RxQ4q&3rY`O=6H*9$-E=I@LQY9U^8Bi25G+g2uU8z?A%L5oY`mUIZ{m`&}g zVg5kdK=5s~@-1lAD7_z|E52h)MS%g{moT>4eMNJ_<>jRJIWB2E` zYiw=bmFBBdPLKqrds9r<5EB_}F!QXy1Bu_T@B)={jpwXXjJ}n$KX1M$At&I2(q${u zxI!USq|%q|E!J1FFs*-0#7Ge?k-bI?bAvEdMKKq3WO6RY_!dUaJPXbT(OFO0+oF8j zg*c6@Fa3+QCjlr1IB4{FvEOFE?tRWxG>&S7ifXd-;KAu4-RK)>2wB|n)qp;Gd==|- zZ}dnle5Vtsd}eXv<>%{NE3f>H$`bAk-Hw%GLi=O%ooe&2!E6(`zu8vZ>ir;zK;rJ} 
zOy_+{b++5an0YeYnsQO5lH8{iM#I-xhpNREBT4q|T5YUxCDiQo!jN)~T!hJN^@l1vp1*Y3!- zZRHwpX8E{HfAf$0kduacnFfh$l_4;~fONo2?13td_6{I6BL{~c(1G-d|JV0|@Me0v zsQn^4O3*RPB^k;xapSNGieE{N;`8$gW2iV6bIk%^c9ltJN$kOFOtfnMiu*#;R}Z%b z1=+o=vFnn6K-OR}Jyyq9(TYkZBZJTn%U2QQl;WJ(+`{nmJ(wy>JEQNXXnXOn_&Oq~ zpz(5OtzN3-lxA}@81hx9^-d9NJUcWkVKiSCUZ{tM9xN7YPjVoCLX&`hNI)OlqbdDJ ztShy6wtjXr$629OZ@Ztt)8kV_mZ69Vl$_74WL4?mzjditOH5zD0oir7(MU^YF!9+= z&k&(oKLnOVN~?SmR6#eqxv-6tFVvJSMq}p0$C~SxwjGKq-isORKS>Nbo|$+Ma#V&;wRp)XW0v!1PPv2 zUofL!E;<&?trnG$idJJT^4p3j>iGoK?e?5Cd2HG-s84c;ps=iS*Xji2XM6W;46 zhuobSttPh>=}z+1((}Go6&QVoUg|Zu(0M)VR%!Qrn_DR$<=y*9o!cRk;qm&J5%Ka# z>iJzG%q-8b(fDJPzO!g; zI`n*1Ce=YCh7(mORbm+Rq=7P%;iW`}I%T21^SHmSD<(^4vW2KhFcl8V6DwF#UXyYE zEGd;>Ym6UXNm?~`U_11YoOH8dpg)}4EPCuPMvAw7LZB^tEr|pi%%Map?tn#A_48HM z=k2BI!^dT6kPJkNdJ`eT+4czu${^vyk1fsK;JQP4Ajf!76HeF!ueEOHHiY#N()9eh zo6hm^8B~G%C5Di;DCVCz259eS`ahC^80o(^&p3VSfFEO12)F2Q9*Q}(Isv+uqHKLg z0et)}_;Em{NqX65`|SobrPVeEoZwYKu%k>1zefHQ+yYLm90ux>Z!f}h@vPou&DtEU zl8Wg+fErSwmuex8%44b8kkSgRK^hIq$jW2$1p14n#`eNruOHbYj{0bO$z+ zp&kBMq5oyYX(FR4pEnqIYZzWRS7$pW6%c&(;oA1>oKtX`O+$nUiasjePz^YQ1`CkbVs5R8GsT`D{$kA&cmlIgMEKm{lX5qR1c|4~fKj|0ltOf@Xn) zz5hJH_{UTKy z!7oj;kQn&4S%cq&fA6>c5>`d|$L{Oz3Vttu|55-2>EA-m{;M4RyXfzAvR|UOkTHsX zU-;juX1{Csy+-g$ivgsE{0~}wRt$dE@_SD7OG_>3zi#9A4C!|bzb78QG{imq$Lsi+ zhWxJOud&xJ9RR>I1px4SB=)=X?~d-5d;#^(Yxs-X`(664ocv1`0N|tj)4Bg(Xcc)l Uh@Jjud!Pd3Ag10y|Kr>L0c8%FX8-^I delta 8371 zcmZX31yCH@*7e}-gF6iFPLSa4Zo%CNI_TgQAi!Y3-5r7k4-f*u-8F;|+$Hdnd+Xi% zzxQ=@RiCbXde^DlXPv$FUitbDx_IhJaPU9?A^;fx08j(cSA|;L8}JxmAycaToWRHv zm4K*{)<}F#t|87g@h)xLGZ+W|A(D=k114nG69WkgbYk4}AKx--M4z9pJb%kURh%37 z;Ah7PoDy#g5Osx0jtb4{E{50HMmbTd!rv00wjfwR&-MoGy03zXb&Oc`@r!zP$gB}Q z_OANY^J;adzmu;s@9Robf;0eL|GIH)^W+bW?^?90Kl?$U*uc%@zc{&1j*{Bs)M2!s zkFN}NZ3ZTZ`|^ywpGV+a)NE_{Nowq>mZoKmJFMvcX<1lA%P~|p@tbtCS|L4VH`n7@ z#9o%i=qK`v=qwf`Hi=j5l@)NW?g5hK2-uF$GOoIa7O_*ccS&XDlTKXnO+BCh`r*?& zw>Qr?a>dV-V85p)`Q)(}xZXCtVDb}h2EVWExG=Ly=qbP)xPZCWebf<2GqJVG!zzM5 zTJB~mqv(w{TD@6d;0n zfNI}$Qy&}!EF>;@LJ^J&JL)9#1pE8?x0cXo-j#RAT`{vrM9?L%UoP~ZnT|mJu>7|A zc?#%Qz{R)sPChe&<#>a$4FdJ-it4*;o#O(C>zET&Na_lc!eook5FWZkomPPrJrZ?y zEr-rT?b2JTe0-Gmh72py(G{EGuX|?6`A#S~Kvo@FyD??MD`^`(hP`5ffrRNlEtL^S%2^e}1|#|3c?uR~d!EWI=ul znLWORDwATm$M31PPNwHx_myoRPoJ*IfU6QWJKXbxRcku*trhu+C-Ey30O0u<4)C9? z2xUh>L|_papdoGgj>rK6=~Mj7i5>Y@FkEK5z3`Vju~_;|yaIz0RkIqMHMtEwy1JGx zdz)9JZ_{x;Cq6%Cpf5RzWo3316@4XvQ2@e86*-xF(Cl5=y88}8}^W+M&_h!t-F($oKI48nYjM#8J=pLP* zI4r4toMH}N2)zfoj}mv{$t%Gee;<(=N0_{*sKZ9-KEy{fq`xIccfH z!(8$n*dxi<148%-*kPAV1x+hSPdv)^!o)$>)K;c|KYIo;gj%mi1PcI+!T|tSF8~HR zarn5pJ6gE9IW(`M3rzck>^&oki2LO2TofzRh@Z~tu?UD>O?Zx^OwkNX z^lx=OUO>v`1{W{XEXR?Nn@8RD0-MmKhFLUt1bE^n-Q0OwY$(#MfgMArx*dwhrd5f! 
z+*pqEd=$iK*R*8c!6|Qkp}RU3{Fb+fu%OuOROH9puPV~sep`*)P zEFrXx)Y8v<2VTjrv|6#5QA8-aq%ZLx`(CHBZUp&k=-N^mNt4}~zS_z;!g-+4h4o(Z z&FV@-P)r9yg_6gC#t&ca9`qHp#!LK17DdMF_x`5gB03Z#5(?bvQFzCFLbt=5G@`Q~ zYcg~+#i4$tSoxxnl4-c~*Xpue9ZUf=nXxLpQ}D_gVFLYX4W6O#{!U>0&mXoo)x|DR z3&SDUuw$!s>xO9Y%l#v!b<+DiLV0y&(lx2XPZ%b{G9B;WGLLu6atJ`t-?=cpkK~q) zq)mfLAhk>tozE7UN-ps{a$-BBn8V@XjbKJ0_Q-s;q5Qq74x?JOI!xhayp;Fv3urg$&%#T~*!5vC8grm=1kC99eW4#i zLDf?qcTl9ytudLvm_%fhOM=D2iJ2w1E4AmZorAI*xb0q!2mlyC?a=ul^lU$;6td$; zCj>L!dXws;s{2bzN&-J4#E>GvYdg@f@6{PFhSFS_ZZ7eBSd1BP(GvyEt|<*&bnQ=^ zENwImr`N4wRd~higx8nbr*ot_p~=zm(@!?PN#j_=j;mWz`ExlTqy0za=ZeJ0+dADA7KTrKU-8 zmSKc%6`;rC?w=QLYrfGN?*vlN=0->2#6R@&&9j2MMOe_J8kuoKCFKJ)qlgIMOr5&OErGFAkX2|yRx$Pxp}Ye_b$(NX5_@mx_JlSfGRoz zdK-|lo7Ox-i0H%d9JFriR(kOHX~wqa_k9U5@}b?gACSETwP*0t<(@#^^W6X*+$a2^ z0#wRGY=(KS0E;yX>NpidS=)rDvn&I2SZQ7~V6bc0Av+fD<`rEb{TtH5zD~Tb@Qh$R z!W-V&-&7`U7C-I`u^oD8hwQQjhX8Nz0-10qSPLuKFTe%cF4Ue*KUd_E(w2`cFqajS z*?$8Zc_FTvz_+}EIN#I2!!|^uvOek9Y_)P$z@~7OHm`JjXX!X<;=}^}qyfqT@|VauhUJ4Y)g)QbVG;LIk;jkv%|!HpVUWc1VmBg!sMo8-cCF z1E$ikp9yc1Gn`yLbDGc{EQr4zvllBIRhJve9)l>q#=@~=ZXB=)3hr-5Sz*w~w-8gb z;gm`8g1*Ijj6PtI7LlS*Z{}I^L7txuY{6bd+|y>NVTcAdMPsyoiv7S3c~BCP(62;C z?kruPju{7@G)LVut~DERak1xyVKl5qJYXq$BC!=_5$O(k$N1Gvp5QM(ZLnO{etf+4 z@`bPs=1*JjWE{8cJqfS2|IWo$dc%#UhEw5%Jr86y(R+EYlyE42jGd zl3&*6+R4MFxP;@4TJv8bP%TyX?w6_Wk=9$G!A}~9CSn{e=aii*G-~~}UwM#LB4EwL z{lZ(2>lAo2H_uS9+GQo2%CL?PWBx!7N`Z`f;_AS~ELUp4E9BNu+acn*Te(Ka4D=SV zuhwaP9E2_3GYBD1MtGP^^MDgUIr4l0QOgUU)(2dA~H-Fh>Fn_pDk7TI@I8t;~9)h)cm=;4FVp6JH0op2v@ z++vvYLTIIgykG*GVt*-HWL{Z_tfmyG)g1wTD~E0YVBov(MPm92v@-(#zAe+cL`GUD z5I#cfWB;vgrYRXK>k{nr8rBfQd?JK>+X%Td!-(Y_AVC2(5B$}hcNo4~@w;S8E`&6K z32J=Rl%FkSFI;#5u*R3czW}nAh?LRhof9#tEoD&WE6O)Y3TDT`Rw3$CmHN)fzxq|( zVM`kGg{iQ;_Csc6bfvv6S51@-@nZDeVC<)U2Yd_0bcYDF5`QgsQkI&PMFz;O4lHc$ z@PxY8ga$Uy;c%SSA@r;T9qrq=;ni{j$tLmaxE2UFccbefu${JmZ0R!zF8GS&oz+B; zNnaIM64(5xTiumt@suK34yk*Ool(t-dFzQvwx0UusHg%nX}$FEg_C_Rha(QO^mqPl z3t7+heT3%P5kYpP7W3?y?nlU^`9sf9Ft+6!}xWn6`?eV@0*Kv@Y%Al!d(8eOCJS%Sczy5{G;ZF@cEemdf0iE!q` z1R>3;gpRfAB`$gixD1PZ?#k<)lUMZOl8efwfV|QD6XDm3*Bg`kH6`obbSmf&)rM&kO34VqFGjHAAEq6S{dbP zy&|CF=!Hfzm!U?^4W(EQ+E+u*$2?g_`8+ZSWaBq=dcvraLDm6ui|qRml@iRk8VHYj z{A|`?X-yj;qxM&ySmcy<>BOsNSamo^W~p9XPzXen{}6&(lk-LOrnqa=9)osG6nfG}S4DJ=JYi_0sbA+FVI}o~Iz+*mwxyRf*j1*St;f z#y$@1$e+GziWW!PVz)pQcQNR~xgA;A>}BvV9u>)uJ+Y)mlPx%w*MUC=Cfndys1%4! 
z!eU&HaW;3+CIaV6ID~GN(!1x{C+q|1Pwhqo!>lG_rQyU-bVpq&pFCOp@5h5|pZBs* zxU-kl!wEE4kjcMw4O2rT9dX0Qjog0BCPb?m&B168PFD1o2@zYx&PN_J4u}Vd(r@kJ z(Vkuaj=4uiqj%R?KL z$6Y2z1D)NPaXzYNN0*rnyhK&cPl1yfHP&mfUPi0$_`Hy;3$DBjjI(|mNZmwMQ_R_M zUW2NN3`aZgg-O#Q$UV9o|JX$MT~0xYr@G$o?(N-_|NUi7 z00Gx3+GgCN<|)HVh#`;P7)=O0$uX!21-G@Jd@C$kr<8x=bKIQxx7Q0kEno0S35nDF zr`!kzFE`T%favU{%V7;ZD%R*azZ4uq(0o{%mYL$r>kXee6lyxwBecaE6Usb~z~5V@ zE*E298_Cj78JVgZ{203G6lH_%4o@9DKjT%Tm^D9xb`Y{MIg~Tg`-ZWeBHuBzTPPC; ztAD9z*R0IePb+3g!&`#v)3NkI-IDMSj2oJ*n22Jo@WX{Q=QfY49R5nLx|jUcsYKeiN;mD<4c{ zmtSO?SXZ(fnmoxS%L)@ z;}MiGaoiK+?raPGp7Sa%kVPU=%GPr#lg*RwCXMe8GLIpIZa)ezaM%J&5TsSai=Oaj zBp9@s=ap?S5YHV|fSBuMxCekPR2CY2CtUs~DXI#>xAZ>iNq8A4y#n{-1Gc zBM#-djjDqQ9oFU%h%Ebp}KF?4576Ruu&GypUReE0v@bmmB(Mi*}IB-1@bt-h*$S zCXaJ=hOcD3U3uMe+<;4jxz5iuCqKz*Kg`b*Ulfj`d|XW9{>@3OhXw7OXoYrWA?>6M zpCSgMo%#12i}Aqu_cBoN%zgMPf8DB7faDf<7@I)O5yprUxb_5w$~ATs<< z#)Kx4AiSXRC1DC-2i46!2ZJ5v=vQ&S$c8gh8+(=L+IsoP%4uxYX)3AQRKD5Pg`%8=KN?uoPw_be(Ighv8 zITak#T04Fa{Q(@9q2)?&pr&;Cn`+Obj>D3N=z0k$`>zL+Xk5~D!Fz; z5}X)fyn=$%&l!cBl?uPbE@mXS77E9I zEZGmHSKiFF`O%WK0d=HVCyuV8QnwQ)z%k~Fsh$vsH63@w2PZe2F-W}p|Q*S9f~ zN2>jaD7#7}&bgvBF}?LBu?(EO)xiglO~+Ji9dZyUd3?1^%UdGVRdpEG=)_j-ttEnW zw2`lo$pn`5&Bc7lj4`kLoAXns3}@x>O9k@8-Rrs8`%vX!@1sD`IdE10u?Jn_JpW^* z&etcat;hF|&XOYYpbtrhU4{B%?5YRRAnWH8yivcHF9ep;|C zAM;L3B7w^yKz#K~_(I!!M|wXk@=-Ww@B*q0}$29+iv9y_T+` z8STW5O)hf)GG5)5h4uNlW}iU>Eb0(j+r{wENNvTd8!@AgICZ}E=QbYjS$CZ)BZZM# zvl$fqa&XJqAN2~b8sy^q?iIfV>PVLfa4wkH| z6q$vTt1qQteesQH)f(P>Z7TH|Hpe6$oC|hJ-BjHrsr9JIRikF;+imR-Hcz8QL!-Hd z2@u(piYpQp5-d`iaHJ)ptujCPqRB3!7+S^ieC|N$1BPDIpDwW`bYL+4?%R2I z`Z`&A{9(Ao+8ZxjJM5sBu3g``FyCY%9gaGiN)ofzT(S4G2{~ts&a-&SL9)ujh7Y1- zIkWTHb?8aL#r{dRuyB`+>xriR4Z&bMIkYp+G5THs>%7ZU6rE` zjGOcVAIFkono=VuIHl~P?s%bQ>(>xnz!5}EUzF*$3jR|%*0q7JLr;+{GZn6Ra#;z% zG8NKSKm&VWxmDm0AAW#in_5-H?)GAoCFPILWapr$>3~Eq#eju59-)O5zCSGX7cKQ+ zt|C;oNi#7DQb7|5TK^386Rag+srMS$%KVAPy&`K4T78?@SAr4e+V#sY8UsH5JwlMR zUZ1KQxYUd0ZO<}%e6wTDRd0n0RcI4#YF4?mJX@`Z;uCl$v+d3#V;bmpqZkKn zf|`Qgl7S1HM7A7IBb9aZ+Os{ic&vmOyYuw)NOrYl-sa0>+#0^(Z zJQ{Rj(hS%>jJJwqKJYB|tr=t(=xaEFBO$BRVyprJGGwg%S2P*MjUavcLzTp_+J!8Y z2K1fN`~BVPr{T4ub)p}hyha8v0;}kQm~^kREs1jlMSr`ThFqsSaOXx*LK(%;V_)09VnC)R!u{`}M6tY7R0gAmxnue!XLZFQkAJ< z3%}mHq&)&Y*hHyV8E-hCKglqI$@I^|5mBK_|bmYWfd^$6njJWeC(2^ae zX;Rq}BHu|^(!AW>e4XLMIqyY2Yfd<@S9a?g*mF>M|G~kP4nUU#RJPk}nH?c7u85@X zFEa4&$tLaeE>q=66i{ir)4mVzf}*FihKnHKZEesGA30pQCFY_H-IFlvF}+!L(PR98 z?Y9SwQL`};vQ6Q{?QSL>zh24zoi9j#mnV3|bfwTn=Rsh?!%d2#7F;|U`1C5-_3t=$ zFSL&4y~J6s=@&KiiyGpx!igSv0(XxYdK|x6N)#!sJ5|Esr{4fj@L#YJB4Abu{Y*5H z8{`=W@?My60X=W*+A1dBJj^#>YPR&HKicTJA+Cpiy>}il6;NB)5F9NJEHOS+KZmml zs;lk(qJxTqYxpJ3@b=9)A{Tr3m(md3ed3Ya6ro~W+7f#fNr$&h?GmlzGYsJ71p0d)^SXih*gq!9d^9XI?ZoQC;Z0Kj1!c0L}|FmX4~Tx zNm0FV>THgN7Fe;}k^u#)afaYlEl6n`ouEF}JzMAUp|2Ihm9gK;`(rFqr6Y zrk_JJ7!Nvip8aZH_ASq82YN*}SxIxUOH|$fCZ@vP#Xo7^VG82Vds2ea3gVY3ksru@ zH$RE?45@%;k;{rQ2=+32^9yH$57I*eO0}pQ93*dp&`9P+F6t&t3tG2w%CkBRxHx{O z#yM!dF$up%gXV*XV4R@sAX3PiSrvx?PHcqDzycraWdTL#lg2i@Q7MBR78MyE|Dt=n zvc0P*Ue|7n!FAQ84XpAqbTe9Hkt>S8Tov%yT;YB zJ|X4G|I!&IP6Kpp@HsGhwog^ET2S;cdn2bh zbtTuV9A?$MucD)(MN5sR4r54^=tUU^G9Z^r)>n8*vcb4+p=6QJ&f-?`O6Jz%I-ux~ zxO>Y;vK2rS9lzAdm(y*|?4)XTx1_i-S(lakDoW8geLOlU8u*NyK?LJSWrhJCZZn#; zfh5_4ckmWI9GBBex33Uc9|)tD5yArhhT2SHtlr)No;B8NCJ|-vHB3Qus5-193Njtl zCG7WWsn%zt|FB;eST?}_r((nZk+Gq3j3T6esj4p-R{15{qP;x-y{e$BOca01j8HWu zeE8{?Tns%!BwVfz2)gC^p?yuOfs=rA)gtSdQmiktw7 z!h%iqzr5q$#VAuiS(zz~{yEcM!iDp%^<_j#4FFiVTB^Idx_NL|y1HBcp(_8orhksA zmkI>?zdo;~h5ED5i#xrXV{8cjYYd_POS-f)ce0eSGq-p7+cm<=5BWonFaU}#?noK` Gy!}6S+Dy3s diff --git a/pandas/tests/io/data/test_squeeze.xls 
b/pandas/tests/io/data/test_squeeze.xls index 7261f4df13f087f24a4e01db7942d4ba3b370248..2524b975bdce6f7e11e9a88e6641efafe2694c6f 100644 GIT binary patch delta 44 zcmZoT!`N_!af1dEHwOblcxFzHf@5XMWJ9LZ%`=#;Byt33{EO<V!Z diff --git a/pandas/tests/io/data/test_squeeze.xlsm b/pandas/tests/io/data/test_squeeze.xlsm index d7fabe802ff52c14fb92ea43075e2833187b519c..fb19b5a40efb73d5511ca0631dd0c3922cdeeebe 100644 GIT binary patch delta 3608 zcmZWsc{tQ<+a6nk#1J(YyRpaEcSf>rkFxI~rLhkore7v0BTEKJma!XI29-*(WsrT# zo;^#l@9XDzzwdjW&n#4(Iaut0sh`m@|;v4kOUY6Vgi9c zAzo6!z5%G)zP>2Q5O1$MOK;y%1msl1^NeqEZ@_?7^hPA^RSsX!<>eo{Zg*hX5-h4! z8I;9`lfjI1`G4esu3`7*1G*>F_l6lHALy*AA@V=5(rtFD%^|ezd~a6KQmpgvVNxE2 zyi9bUPa&uS{foEvciLUV2R%gR$rPaM%Sr6F$WT;T1$I%^R)!)a*@8xzWPvI%YSu57 zB^$q-t>5BzfJpXG@L8=jB)nxS^b+&5M7T2Y;^x004^?LKeR}B+xj$lyOe{ZJT!AAc zLc2P@UeGbsnmw@7*GW-Ms@Z0Z_@;}VpU(Ms)ZH8@{6PXb72WO*YXEIjH^ zFiw)wS|BCa@7~t7oc}nZY=e>7JfvlcU$z&~yBnZ-_03C2-++ofWc!(gt|_4u=UF>7 zU;!X1F04Q8iVJ8;;*B!NL-6JmWMr|K#HzOm>JNPKs1tVr)4W9PtP9Es8P)O%p~aGg zX(#UysC+{+(Kiz?IHI~c$E)2hTwp4%dmV7oXuka$59dnBXWHSBb}|T(f9pO1=wuKq zo3V1am({l%Wm2$&d6X%gj#S$!Xj{)4RbC<7RQRSLy&w@myT{=H45n0_FuxP6qV z%B`0@meHj$fd*7Qt#o)YSezHt zzh;gea;^cjN;Rd?hbR~WD{{~7y5EtW;xQH8pNka;r;ry&p#*`bFMvRo{#!oiz)-I{ z=zodFV$3HU0l}0H0HkA62KNz3ao%N*C#<5SV({d{Wtu8f^HujZN#29N8g%!1QgLbG zRW;cc-RYl{YnJnoW*s(VmT|c;5PWqq?7`ie_=&Q_+^ZhvGqv_%*_;&t}m8Z5MWzlZ$3#+2|UqTwXP-7{qz7xB5(5xgV zX|Z6MEv!u*E}A{aPUqtMj1?~U0%-XbQ_}n0)?&$2Swxc%*4M6=U-!1}Y?IvkwsnPR z`v#58ZDU+X-*wJ8!f*m=?i$We;i1E=2F0i_H*NUUqh7Uf&O`J%9#%YG`+UJ)Ny1^~ zdb&+>CJ2gSpkGZjhl?!oYOdJRmb)nErZ%kc{Iu8JGfy{`OyF~Q>HUuA3S8;lQA)!Z zlgJe2P1UM^%0y#uZpM=oHrvi6D{--KF@aWt$AMx(-)ggld^=@VzG@ZY^8>4c;#@Z| z%yImJ0T8_}<{SIB-Z39_l8SsYXQ^f)cV+tu*IV~gTe42DbgAi7)T4n8N@b@=DB}02 z1GXo){@@Nf4vnKsF0$1IRHwO3HPPWIniXlmGU}n2X|vHw5M_<_+@X#Zm)e>tgp*%R z*E;O%i!DZfNbwF#qH&Nv)v;1gA3vy~?=kdt=d*E>(w9^Dkx|8!mGwqec8MQ-^uu0g z{oRC_laZ9!=zs;Vw~aXEyTbAQ5bj6DIS3J{CXv+6C4QR@DLg^OI449stJEh^=Cz-h z(}7{6ATF-_1`ph*1%BCqD=cWGXMo;>a!1Nw-y+I1VQb6r_`=_`t%`zhgXu@Gf*ku4 zAdn9=zU(S5(C#yafavVe95PYwG0*Fb>n5-XN@G3D35#eGbAf0CkC?&O#$>R5bJ=v> z;~Igxx(i($`Hnv|P&yx`zP$4=5Vkd?gSv8heukfHoNO!$z%~5({q9_nSmIWzE&W+1 z9{i&e!vvvadEwlr5t;4M5yLIF_nNETrQp5k)?@1-Kw492%H&XAT%mJ|rr&`>g@rzt zuIWQ5E2;8!ZbKyV407Y~$&nM|DYActCHfa-|G?Di)>y}lrr=})+<+$xN zwG%)E(C{OlD44ShIosF7JvD=Vf4O)|OE7?W?g}?jl)Ag8VKSF2K~KtUTsk1^rZODO z=y$5C+3Xl8Wa}A(LodAx&uuh-aE8)7dS7G4TchE!M7hU0WGQ#V780T13XE(kjpM*h z$z7|oc5$?t>-&#|nzwm)tn#1d2iwGm#OL||Z@;+PuQ-31TQx)D-Qu#po6~PxH8R1s z9vi#AFr6f~s5`5)Nlce_ue$~ADm*&n*$-`$>RNZ(UNfEzATJ;u%B(YLpoa`*qp=DH zyVGebiYjmJZm_H*c)j(ewJLOruR|f9Z8B_?zMuAs?*6lHW`1)1_h)BRpudc$s98aH z3>PgP!O=%gP9|BJF zRT7u+zS!G@PtvP95n*S<$v@k6d6P)wY18zyW)gj zu)(`zCuBV32DWKnn0p$JisHzt_-G@FQ=b2} zc;31cvtjhiJi0vgj(v1c)+T&bR#o!nRPxjhrTznI|D~kGv9z`RBUS^GLHZKjR#S;U zdB=tMhX(xxxWnTseu3p)GoAYYo5~sO%^Y7XgB}3YAsM1kE#C=QLQ*mrLZafSjDKdm zT3kIPi!_j-OUkXNFvfk>u-%EoYv}cRb3DC=6wj#c=01)4lYizK=ZxHb(lViH0e^n$ zgA}J9tOnaAD9xbA@;XG=Fmma-`3SL~`C6v3X^&;49qS0cl#l3i*g`CjmfNf*e7)mA zlNw446-N!GG52wIUD;&$6GP`RH}#pkmzt=JVKlMI2TQ!3eVQjLNgH^r>^b3B=H3Ql z4?h;yPMB9kw$s`jgVP?W5fv)Aze)H|Fs@u~B7+1~3wu(#rA_fY->zoIE*eX1D{lzy^Om|h;Nz4Q;qr^0liWb=IKDNRGEJED% z(jG;WTNuOwj>YO_qDO?Vdf7?mDHewLfxpHnCLRjrTu!WFwgrlFzsd)fYh#{69us z<j_mP-IkeUUcB!RWS~A;;t5k)=dMj#`iOEm2 z()mI*Sp3G_MA--cN~(@+gB)EEUE+~ezkLZ=B*;mgfppbw4Pyq{8|oWARrn&XB)_N_eddEr3_i%ak^8%Fx6zesK#XY?=y_!wbFAG}lQozW z{^;OP3a`pLLjTXogy-gyh5g?V@O%pVpJTy*_vYilM?oMA|C<7V*qHFEd~n!5%@+iM z{&RaC$zH)r@!z0yh2k|huHa+%A+UcO`Tt?r&#`RyT0S<|zr}fO-OT^-pbsa$nqPqP X-vm3~_8|WO*v9Yh3tS-c{+;|U{lkL+ delta 3500 
zcmZvfc{J4D|HsENb_Q9-GGiwartF{WA!J0zzHiy4OohbTt~8c0BU>W-zDH%J!N``0 zvJ4^=Su13lpWpd>f8WpfeSg3E$36Gl*Xy41I`{Rs&)4e~7``y5<7Wgoa~70is6ikw z7zBcVK%fwR`CwF#pBoD0Cl?anUu0>D%F$+TzqJNvhe<(M0%KXQCTIzCbP3${gx$=juT3)NeLt&R?Ym<#1n$4=a-ZSWV%D3~{tAz7m9`Lerl~6`G0SFN)X#WvJQ)2dGm@mTGx3V*_$Vt5bbgmsGQ{!T@%0@?niK*+9tJ zP5^q1$RJ~EuK=F(d4q0=)_IyU7ujL6VocqVO)Zb1^Z0F>Lh-!QCFdLSFf{ns`$Bxa z2KEnUC%*U|upTJ=(aF!#AhD?}_}VqwkJhh)(mtmTGx`>uBxp%b;Z*Y${5im64%j~t55_DDO^m{9vt;aK|j8+l(z5@|s8h`FNS z$=mM3^D<>Sj7Ta?zO9Zs^|e%1DZB@4-}Y*5DTpy@eg`hAGpn+2(7~3t$lmYfcwI(LgGf!6XK_{uC;>Tqg#NKF0bFYQl*0 z*zWADV+Lo~R}%+wXDrq^FU=m znFX>7hCp8eg}AKfu$EaPTdDGC&Td?z`Ot=bC0Y@g1ttl}QfNRR%Gq?ogoc(2LhFpE8`%I6qa`=p`= z;~nZQoRtwe&EL~ZkG3>ujDRxr0Ma3N9=5vrA`JcDRR*;N!u^f4TC$ujCbQz+@Pu-| zv(TVR6!yO5gryH5=#dMLGxyfK=Om1_bsn+8u6;gdDHpww#%{lnS+u9ya=kmxK)?uA z=4ImV2d%E|6bC=)y@M2AW`Z!7*q$&yE?TMgi!pxxOugevbgN!U9DuEQ)?#1GN@mD^ zo8r}^?zTU8A%u9LusU!S+{HmwY;&u%EfJT9wDd@_9w{&m9@MD@O&i*1%A7c+pJu4A zIq7==`BE}nobDDL={(cI^yyS?*p~A$Qww)Z!Sm3~B-V5XK{_;0E5GthO08(6{qZUS z&LekM1X<-~24c|%=qBzCm2q+@4nh)mdM@~U1(A+U2w{HTL=tgC@Ho9}Lz1!*%g7tJ z;Y}>#q^hv*!Gl$^Qu~9zCiIPE(MmP4uYR1FCrFJk6-kJFz2E!0Iy9;smAd93O%BL) zhu(CKEo69tS&URU4+j#JT{`)AHgYl!B2{0WME{b$vU>uI?09#wQcHTrYUd;~_fMc0 zXuR6BGK7D@?nE>kEj};Mmhhb%L%vQRjSA*1n^mpkUN>`cKEOTDxSBk=yP)(F@p+DH z3zb(j_YO%0=BhM{)^jeg@qF>+7?QK)EF${C#2eQq9y*q|3OZ4;Wv2xus{pn{79PLb zVA(=G91OVGrZp3E#eDNP|F@rhLF76AErrbEJLx$CDFp#8tc6IALlb zd2W+PxtpTPso!hALX?4yY^>2vLf#o4G^{2i4mH(GoFKr&=1QiH$ zaAt-6OKNdeeB1!;ec+4&yZ%1-5JJ1pA)%FRkjnL1DVokFD&MD}v>bai6OvxEaj^AQ zk&Bp5+Z~?h`0?$bp>~h^lnU;tn4ja>**vyLMyRX(?Za$MO6co9#!n!#O%Ap+N*d9n z`@^ed6WwpCg+SCg(K|{oU8qT48&n&eH^sqn`n`&DWdRufJ(R*0lB0Z^;1zlrcUz7% zm!Y^d-nA9UWMf}Xa_OPHn;+73dLnEH6x_bcibS`NlT>=vI5-5#Jya#nOW5I4+Ut{1 zTq7rTvaeO^NsBNilP^^*jsxudB?j}OkNP*EY1tL&syg^w)o;FUWd345P&yXeim&7Gbv`_9Q?6Wgys|QFrCB8v3Yw1(jbFMy8BY>@ z8`04?tOaU>KrYcUB`)m|tSAKDHQZ&-!Xhm`COd0=QMt2fJM3&V4zT3}hg26Sjc|3Os7^{8cC1L}ToU)ZT~im` zyzx#KQFzPzl90u-;+I+wroGLHY#^k*BY~+XLC~|HD1b7ZSvt8xXDawaPPOT(Nc(g* zbp=y5jpepCulpFCODRL`r+%MLI(6w1YFuvif8xsFR+}c6L|13b{Lm?Lnd$cr>T6~O zpEH}aj85BVrOct%%Y;&p%d&C2w!XtJ@&_K@P4cO|j(bVFObYk4`Pj`aI;I=7#gdZ3}g@rx17}F)x{v~*_yZ`tLWji5CE24L+ z3aDH=BsoSSGYtF}ke`z(VCDG?i?6b$k*;vsy%NUp>ZE;x1kWcVmh+rjg=kldR^WD8 zclW%y&?#}Ug6~8L9t1bDpflg{y|UVFesnHHs!b*9-G@gF3nav9rx__s71h?1%EHFk zgAV$jA!S)kU;r6u^Ao22`YX0{5>!%Z0!C0RiFcp(~6f zRK1MX&s4>(F-|~~3UD^*Rp27bgFE$zVL45n07C#ha!;6zRw@xC@`fB8z@O&Rf_0zhU3E&UPQK6mD5}1}TrE*7n)*f?hs6f^SHn!K371Vj-o>g=*ifp@$7*)Y z!{O!}G~`ke-IDm{E*!qtVfEX>^Af0iL}d2Vz+ZUY_kACSfIp>E?!yQ4Gc-tstR;TI zb3=$(Dm@L(4}8)Gk3Z&>tWia~m*vwf%O)2>DIu`T>C>sP3+KEql8ugXNA8yTchld! zCsZw%6fk2DFx|iX$NjiD{L$tu*5b850u^`G0$#wS20@^~d&Wz}ZH90+2Ci>87_$eL zx`iu@dmXeaxa)*(M$w6G8+}VXDrhTNKO)m{Gd>}u%IaR4A)vG6q_f(C(Y1%I9(pe| z%_w=Q5F^1758hFqhJ3~>cKOJH(eM+2MAVr$|KFx#ZtX@~`q@P*H8R6Rr~p&ci~>vi z7R{az3t8aJ6n(t5HKIu1;paM#QR`AZ@t&;ZGZySl+R5RE2kYwVFSSwP&Wpdr(yVXw z_j$v_?;*GrgRoCoqLYf5k921@JDeVI@})5r?;x~AkdceVo#BPb(#@#@R9^lZb?i#! 
zt!bPAtV!MA_fhFGC4KCrw*h%~D>dd%AK?59;u)c*<~>+p{0*lj+)F6z7OZ`;3DdmV z+e$9?bg9;CsSciG@3a7dmGv>)p3ccUZE0w81excDh?@F$2V!VN^JIukV|@+A@WyU= zeqh%xy%}i6jml(em`$Rz-hM`ZGuIR?LD^t)U!$U)(B=2dwP2#eU(V?AI|Uf|>^p9*i5+k5q_k|vISEAfWAk_?nw1%i=f<1T&LRq#ChkkFPseKD zQ#3~7PPI4tI<+Cr5jJ19g62-DO1_fX*8yCo@D$TOpBY?%$YsI*Y!lDY`mfQ(jN1_r z68z7K{6A(8D1Z%T#>bC?h_U0?Mfs^i*l`F5H;y2}F8GhC|6RrZIrEGcXU4;iOAwWz y7Ujn^ib@Lp`)!_80gith&sM+qaes-43H+O>vzCDUe>nu>LdC@BbcO$W^S=OaN>En- diff --git a/pandas/tests/io/data/test_squeeze.xlsx b/pandas/tests/io/data/test_squeeze.xlsx index 89fc590cebcc76ed594f1bd64585a0c880254c5c..84141fc80ef11015af9f96743261b73c59772c97 100644 GIT binary patch delta 3594 zcmZXXWmFW}8it2%W*mv3b0np^TLutB=^Roxqzp(7LrF6rqkw>fBc-I22&jMzAPpjo z#30>B$Mu{WXWe`EpS{*z@7h1U{l3rszGR*BJJpmFsDT)iAqx=zAWQ}T0096%kf&&% zkDrI5kB^5)keBCcQ!k$>G0>HY`!(18@i!Q`fL2UmqnqxmNr1rkuVd0yz zAAz)yBNBIQlQ_ehW)Ojc$}-VOUY2q|19uVas0&G;yurN}k81RW`m4Teaou6zmbkCm z20rXYNqvBXUrJ2q*4ybeJqD<)5-;OS#Btd_%X!ZX%nTx21WaMn!ef*hdgAG8x#ABP zrY#qfST0y%nCnFd*G@s1?oX&(oIi;mr(NT3bu+lNXU!Cp#EhL-uC zH3Zs*u%%NyU2O)EXRW`Q2h7@Jqw;~g(xzRgY9zQAZvwlW`I8#HC$fN!pXJN-%1Oz;eDHcTKCLU7P(ge7q5XiI0@d|VYq9NO(0GTN0@UPXUMml^N8v_(6ySi$$(?Ljb#>l*%1RU|JsS2mFpBkp;20mN zp-9;lemQ|2r`R%OXJ+%G7B*5MPq#kkKs?upZV29F>6%ML<2DmyV#1{*=aUO5*nH0p zZmuIQmTx3l=%m_1bOQLfBSrn%&TlEr)TxE9qoa)1L)0Nnmp%ULN9T4AB@$kF z%O&L%Qqp&tXyojh-DHqdcUTcE-f?bjJh8Jm*o0nT#=~tCKL&4l4K~~lwVd^97#3Zx z(SI1)rF~15v$A?L+K>HsgOl8$$-O*KNm9Oru<^UY4$IP8*8Avo@*`?X8*9`lw3MpK z!ZnppCX=1mTj!3{S6P-|BNyXLLXip_oZ|2)){*w6^k5hjVv}&u%@(Z`#gke<(UPhI zQ(uUfD2$}N&Cy+Wf5=K8?UGn{s}OFDDC_qH@Wf;IR^4Z|Qd$jFeLKDx%?|B^ZU?bG z@ea&>{9)zFUt*)UA@%4iwNXK)dq4VE3n)96ELHL?@3|}6^uwubcA3oPl#DOtA*od* z4OidK?L~_lX^c@R(UF9!b9tbaYb?tZ0VaZx4i!cq&67b001%S`05t!M55hm#(*^NY zcuc0fbHzZ&s&A<33qxw>Nf9B=b=PP*feJzB^Rso5T8~af=bUF=W2ha-k;bz(73qvh zlF5U)P4dm_MQTQU7L}%GY!Zm4SkM+x-@i8=Ct(41W;2zhx2j>?dC_U!d5#Kv5fIw8 z9hfXn4Ep*rXvLs8|tJ9Wjr5NnqfIZ>|$8oEVst5JdzW(S=7k2 z=*$B^5~(S-vy7qqtDLHv*5p-i8J(=Q9gaC`&12(S1CcZ?_*1XCG6xjH;4iuCM1yJ^ zfpJHpB1&PlJrMgcI)mP_f6YutFieoA8y4X&ck&@l+W5urg%P5MhTq5Ere;c{VD>aDN#+6Of zP~hiN7p6o+BH{B=OHGvG0=uCqA}m9-COc3-|^~V#uRj2$KU??V9+7gawR=ECA+z~+fK(M{C$LS!V{r; zl(u*|nX#1Mw?gJ+Aw*bLGBX;)9&YeTj9;{aKdXO@+oDetQy^jRDo8oM!aH5!g|Cs_ zsa_0kBDzY81FGKzy=lW18n8L^jq)zxFHzWuNvvVo!GY}s>7TZ(eGFnJqnwQ5Wj-MQ z0KADYd5oN>Uhio!koGaj8Ibt+#`2vRoiuu0@hDg0f>p#_W1a*t4nf%T-j6`t&dPj7i@C9=wij`EZby5iHo6GT{psDxR2efy#QjH^^8_D1)QZmNWes|qawR;k zuSu?YbeY~xf1$k5s~gmX)3F~6>+pu8zj^mmPTfZF(U8d6?CbaW56qsstExgbFM3`O z+o6b1B;0DzGREL>d+TQO6C=pir>hUtdHrtS8Q9eXhzEz-eqeFJl-1bwwbO#5Eb8+` z-F{^yi(mROg?C_K5SqF$YZ5=;?Z-)zL$tO8#nJf<5rl{%*nid1nxO&-e4B$C-Z zAJ`_hLX)fRoqTx>;*|QyLoIHf`k%_?T^7q?cUlRfD=@Svt0Wiso z&*NYne%7EBm4Eh*}mzR$E6b=MBj5gYOBhWUtyv?rrwv?GZO9C9Ql zB;iFh+<|#^=hG!W6K$e&+#x(Cy^Nzr)!iJ(2kjv!zo|1WWh9}eh8U%s9X}i&EXkpV z4oJ>QxJ!LIBwW0&hI^GmR^?4rnb;kAH&u0E-Jjwp+4$jpsm`i40oTCuy;v4$ZbzZl zyP`Mr5*C*cRVgmk2?6>0&?QMlk-7Qj^WWu0Pl+F`JzJg5-WffogWVmYEa&Vt6!w?4 zU0Du+jTR@KT`>6iS9vb>pP=X!uF3De@==Ekp*;FTf>au%`$21Jgg`w|Y$}n#Tz=;2 z_7#r54F_41@Ae3#J)sLdnoYlr*mc8m44)Ci5k15{L2ngZJD}~=4x_tf6-}V=4;n;S zd|8{LdU(aDWx+3k`1E4dG>j+9iaT%T$r}!tzO$m6(GGD( zCH599E2{O9TV0T4hbWcFykq|;K25b1|3#dV?My4-r4_`BwfcP$a)LP(nPpBCUK^p{ zO2IJBw6vm~g*~t@AUq}elNuB;Q!%VYzfEIXd(O;FfU@<77Nd*ZAv*`Aho+Qyjg!`I zM%yLRlOp^#MR>bp46wYDR@axW8V26g^x}~R_#Upkbqj_$MmHXz=bYe}ZlX>ht}zky zs>vyP&OwV}Vj)2iia$OVn1&$f53=|5I%+Hb7&436V}&t&-5}nos1U%#?i4!T@}*gB zZyABsqP)uj-@9H!Zvl+jebM^W24cOwREHap@rJr>WX_jmNwl1Dv4nc}zQe(wi_!W5 z{ezWA5hiJU_nfKLwbzQSRC95f=Wh6WO02YPT@^*woow+LC^y(FtHP#=T z-VOA*@XdHCpTZ!O+uCSupX;-Hma@;gtNLgy4uN{t*PWa$@Y3+k7fH^o7Ex8LGn&7O 
zD-#1}&HY_m)!HyNE&^2BZyHm_{!Sci>~_kGCx5J(nfPm#5Q`cBta+)R$6weagZk-S zy7Hx4Xu!)~_X=dwuYv0~1I#W+xjJ4)2aRGvz9n4OuNH#0aK zx}Xcv;}@MOD}*RkDuj59N=)SeIUbi}MAo0pG|X9QhUp_$JQE>8U$>J3BDud$W@1UI z@H53(2&$tr)|dX3cqLP752f-`a$Ba<0z5)}HfpV_QCy_ZjIBbUn(jgDHW275@XnsJ zkW5JHVY*}_3R2ye(gQkY5Ln|7S9YXPE9GUuErJGG4tI~^+4tTaltAQ8pSXHS7YQOen^8+K)qc{g)gM%NZ8xwW6axJo6`G_bjBAb(B zW9QxRE86uHccao9r7D~SORe)B?;|k*UhV8(`3ChF3Q>Zb0UH;4aE%Y@0`5@h{P9Dl?5#(!OA$AT&1=3)7drGC$Mp#Ku! OF-zP$qz#;Z9{vM+DtLAP delta 3497 zcmZvfcU05Mw#P#Wy+(vk0)*bADE&|Z3Mfbr4!sIUFHwRNQGRq3LrDUHLj7L-H`j zm%~lAFp3N^*6JA>bmli{f_&6TAjM(6sErsa`n{Oy1nUD}cN}V+)k_bWBEI@DC+)deqF@x_rdP zE8*WAHYs|-{STa}#XN6D&QMG(+BOIjy`3Q<(})mpv0x-)?N;s*SsVvs51A_)oxbZl zl9DOeWkgcx@NT!=t*xfAO656Z`@UagqbSO#vj$wzWL9V4q!Y9@V;`i+`Q3@}&*+6i zSqN=6YTPZN`CCf61XpIjl1Ed^1EQKgA18GDscb0TE%+moO0#!KDZ*3Pud-)IdSq|L z=7hlowq)XH?t;a-9h#j3?ZtFW3%*vPh zY6SEqQb^0nj_X-fa^-5D=j_L{8;@-1S7MZr&%tCNISLI3L^+>snb6R3f#^UC=ZL~V z+GTP%3Fpw(o$2=(8N z#@@FaxAg7_dU%`L<>L0d#{`VFWnO8810h9N%t3GBI2<;!3itJ!Zg%Dx^51}!c$)b8 zK`SfU#lTOx?;^#Pnb;Xj>`s{<7p~O$#TtKnrrGu_rsZmCJbXQJ(O=8+OJm$nl`f4kvVnBILlOHb2jh< z@+4(CAnq1-GPtLO=`*O@uuW1jQ(y0yg6E-I$*dWU0(5AgT4Ck8q(K*cgcD$fyl?FJrRE2;YoNBc#K}IE?HHDWvE%u=ua%; zgu1ZrVaTf4Ylp+Y2K23Ek#db$UxRov50D098nP$u?Lqgi%J66+DsBCa^lU(mJM>SN zxB`YJn1v`cDM27f^>#Zi_a-6pFiQQ^Y0O{JT6?F!(5_cIE474I9DQb_WJiPirYf8bQ?D&S1XmYe3Er~ufKSh)Rsz;Xq= zco^_!tL~Q|E%U9DykC9>QTyo;e8b(TH@N7QJ)q!v4JX>Czda!7H@K=->&HzkBra@` zDECtJAqL$JDVr+Rf(bVj?VM0dtHjXmc^b?xOuq!N8xQz z@7BBAF$rTk0|Uf6_bFu;r(%DOWoL8SAsL~r4n9ZOI+XCYzm1=;&onsN(Wq$0ygnFQ zH5>1IS1AOd){EIyf$2j{dRw6gbnX-<%h_55S?en>_G=)OEsUV*)8iR_7Vjg^n!`}k zlHl5cWU_UrCExC%y_Xl(ads+f1myeNV@0ByW|P&r);T%(OYf*lNQv90rxI&ZP+UW& z_Hu94Ysm{RXOnLgO->&;`ic$bhadKBL2=n-8R~lJIqKhi-^uik$J7-YAGKarKU7m< z3^@T9oetTIUAHi}5=!rMz`#R)ziiR5SZdA;VF8zjth)9~inA0`D>sbFWb=nFhn73= z@cpIh%&_^=oiBosQ5%U-^mm@m%9|&Y7)}X({tBLs8n2()n>op*pR*hiWU#lty@~Ck zu}Fuy%uQK}tW+sdxe6LhsfG9KPv&&cn|ql%zgYI2lfx!`Ec|VB{pYo8Cy7fs`u1XD zC%R)x9|~taL>%@wDI`lWnWpZ|Su98}TVVR&j|>!4RcXu3%BfJ$y!7yd#rqQpWRZ7~ zZS{k?pn3}1WqPKh#eI_Xxk~YZ@qcg&0}iM!!d!jxlAC`B&t0fN`^oiL;Y;j`te;L) zK9V?8H&p4R2`VjqbO2kFD4$05J44h3C_~G`aH7`tq>T~=Q`QUfv40s$?$QY$yi|Ut z_uk&BUzh$o&k|~cY1`VWnystKTzenb2%%w>g(^c_ZK>)L;zw;O(uB)m9zInyF^!w= z^_2>2%r6UBJS%#o%g(gFHJ%NG)wU%v6($OJgZB%pJ%y2TR zQTN7KE3KqC^k#`rDsovap2yC2@MYeI$M=%Gt8b>iryN~2@%RxDKg56rY-i;crN6lL z?=AbE9m#3tD6ycE;!l3M_rUviOw)(>fB{rQM98Js4k6;V(235zlW&xr#Aw~f?(GVo zeEo>*6obq(^#6*SOfG|!<}ob1$(}~K3exTuGmcdz9~dTjJR!44LADFft{C0G9b9MU zyr$3@X`+nxR7EgI(9D9)eA`!Rm1ur^Ayu+f?fLuBhjm}cN~`T=9;z zG^#CWSH@0S-;bVoE+EO{(on5ZQPW!!*_xRT^Ap42R+9?VhRE3*Q0lecPIPhvMgW(?0=!7mwC zc(_tNV66BNCM|cNX>bwU?kjgcRFS4AwM*Kyh5PFTo!(b;;e1@dc9y` z!Ae{ULD;7(G08>D$NDo{ZO#uNyg0_9T_pq@8MR>C{-{7zx-snom8U;v4Tp+(3l0*% zn%oKg7@Z+g+{^LW2T*Xg(qR5PD#*tmmKlC#-h~xTzvbM3e+7lvz=#tKn8wxamf2E| z+m$*^m7xOp2-Bi9rvc4Uz&{|f0j0yEx}OyW2OxTV)}m1f;p5hY@3m>&3x zgCX|=XN5#MwjutASkAb3^!pr9_QQhdtZNFK>@1?&kFNp?3jC+97`@Ui$j|r3o^7yp z+h(n_Z&$mMv+ugOCbc!+O>Nqgcjk}j!{+jgH>xrk&y6+U&Law&CLKupn2ysZ$ z|HljhIkDk&dHL`Z5e}RPAGJ3J9?pId{|3$>@QteOipWqy`0xcH s5(58znCB&c^B>3a)e;|GR#cS#-$`qWJpcdz diff --git a/pandas/tests/io/data/test_types.xlsm b/pandas/tests/io/data/test_types.xlsm index c66fdc82dfb672286c0e1215c1a3a42c3af3b277..4c8c10e40effe7bf55237d7cdfec369d712417d1 100644 GIT binary patch delta 5629 zcmZ9QRa6{Ml7^As5=gM%uEE`d6TEQ??hS$9!5Vi$Xxyy{?hcJ>LU3H>y+`=`n_ zc)4cgS=#vOZ$C>l$-}m}_8ldx#T^OZp3b4aN^=EWqmScy2h?ZcPWq*IT z$*jkxr9IuUb~2U|F@#-@W5Ge}chYSR-b0sBok83_2+Dxnrjmv3o1Lb|EsyCQO@Xu*c+UJG6 z$uvU=x`>;?6?CIJ6A8sFCYiW`LGR)M8?tnH6jwwiH|UxtBt_woDW)ey2esz6F&<5t 
zl_!lVnRP_r^R1F-TA%M?Q223201aylX&W;NwhF?g%+-I$1e*R}hJwU;`s9_Sxtc_! zNMhz(W_kPc>IC!pWQ@DYgd;i}=$V(>x1PUAf3bT%LEMi?SI*;E$eCsj1vIwCO90U+ zo#d7<=V~KJb($3TNgdlMnUHIN^z>!v=DX<*iw8t^81TAZjL1>~*ReO_t2U^S<&IN- z!T^+Fxx`fY61fZF+8``G09D2v`LBeB3zB3v9_1LJ$W0WSKVP&__rNJq;6d(m26`2} zNxewv$qKL|->4KK5${dmqL(qK4!%VrL?-v>{zND7fa3XNljdjY1i7wiC-N)LZLm;v zzLC3ZTs91&#~baNjAcnP+IgULnBEEyDxUmu^qS){nVAZ652%S(+b(_kiEe<`cuMD; ztepOhRG(sx=w$E_&v5+BL&8TO(5fIx`an6y&_jxdMnb<#`xE_5=`x}G$Fis zB$Yo!A{C=Uw!!jJ#uc__Sp?-gCXxw_iwn?;0;TW1nY_HxZ+;2Q{aRC4PMJ~XbxrkxTOtJc4XA+A%=a7#0YT=4L7aav z=;r>_(c0}F1VWOHoTs@60#_4mU#G0{GTCdvz*ZBU3Ev0#>}57)RkOx2ID5XxV{UZ0 zWkH;N(4ZPo8U9_^5!K5b+^Hnk4NA`dCapwp5bFGs z@kogAdQMH!UDpSC{u|ZpZA0Ceb9Un8V^Jb`9@79Ai_@-dV7U`lk+4HZItBoUD=)J5 zRusG=Ki}41+s)AS{_9vyTRSzD^u=3@VNN;_3jSH2LT&S(FbE6xVNm<|I-hRbZ%8Ge zip)AcO96jhU4oE0OO<=SylxI<>mxY$aQ)&5nbBSaRh~4}Fb?qjd$;IBX(M#-=Qa zS$vSVn(8`1#6ISnm~dk4B}zsinsFk*2i!JxnXzZ=PXp zG@~j|flZ?zIJp7;`r12fZTg;<&6}=ovl94}=M5jGNo@=)wF@_k zz;jlsdlfIx@fR+N)I8M5*_-~%jp@gkc^f2{J=rRlt4hlm6YRp9S;bNCp7f5g!-eCX z9{nEUAS*s4MVYZhWM*F@zm@S9pWrC?${G9jpk7_U?91&2Xu1cbN$ zLc-n7+R2*ppEvhEz&JD*jUf|!Y0dYN*lu1A{G0I%D{JG9iEw(W>?HEWrbB=(FAo?> z_7;gj_)3!*Qj{v~ES@y=0U4*`G9H7zVUKE(X>@_Ea+^NWQOVE~Cr06^qqz7cC&1^z z9dtI;k>;b2hSdvBG40ei7te}aNTZ)l0Re@W`TW!slVb6$F+Sg6y?;<%$_;SEclwDw zw19jk>zJQH^-*|Xp?pfbwfM`|6}4!*(@~DQIBl<~iJ=lpDTNH_UI| z1lZj%Jnj%Ta}rJJGRkhF8pwRkd**3XG4s=(bUWYcahRe;4>9yDGDR*i9=bYsH0)}c zDn_*93r_ygv>i3Le!Jd@&Ir-Ugz2aHR*JItu=zY~P)#@$cH5d97N}Z^ zMZ&l*BHO0e_Fz7bz|^R3UirvGqxll3A^Xv_C7dIy(ZQ<~MrhT$`|a%tc8F95m}MOB z9jE+IQ%gmkR&d)e_LxTkArMKE{^Q2a0c`aN^tzT9X5qE|C5CDAH_qX27|mL=S#Xqe zZmT7No_^3pH*xVml0TV6>62+*dxI?cf2uS+u;tAPQgh~r`#zoST|{(=gD!XP`b_Iu z8(EbNAmJ)enO)od&ia(4QkpYJ6S zulir@sK-;lmdGj6PHCcQPpj4~PZ(p&gK+!V6gnFj5x@u)&DazYrBDd$oc{WsPqAri|szV~D8Dw0| z7&JK=WGo^_dabELVdtYOPN3?(65y=3pB0_?%R{&VB1^XiN9;HNhnm8&-24=sF;-hV zuAm+T#wt#FS)XW@i+juWp~`K6bjyY5c4#*pwPhV-&T}*z^u6 zV)x*h7;BhV(?a8D6tAerMQJjuokYt$c96ukS;9?!!kl+xr*%?~)-0LgP^p4#cueOi z%M8&PTn3zDaZN>Mv%Wb?a;mX){u1m>#2kgc3b`Ydg)4m$+RoDU$^M3`y2g*RO|=y> z%`%U&%LQ)l`gjlrLN~BY^&vX;`{cqF;9NgsX1XGN)cNY2pTj%q>*niGJOD6Q%sd<& z6Pp+w97*_OESp_!=y;rzxsWOF8ks;|x8rT%R@}gNR2|cU*Dp#hL^o8Fc%!qwNg|od z);qF6^%yr)?}0vcIm_KH;0m|7uR3lfv$xNG%WS4iB&w~)K*011!@;ZTD=CL`s9(@8 z#*_2i9Cc{zHZM)m@d(O5yR?`8E9hh_8du2@ppB{30db7>HiJLy{c;i=`{ox$NhTgx z`~h!A6)nQTbQx!aF{0TFth#qAjBo3k7`V&0-uUldAM% z*HHXqrE=x~6tJu9Wsr)>yf{r_u3|$5@E`%0;z|>k*oK-crt43->rZOzr{){Vf7!<5 zav~)3-o9Rw>)4Kvf=PwjkSFa}5*9b4Lu0Ej$I|6}5_wD05j8A0d8y;wwHDjHe0A`! 
zFYAzlu+zo5*T->9aRn2vzZV@%`D-xPmL4&~quy!(Y9({!-H&^X50s;2Y6)Uw)SBoQ z{e0KxfZohX#|ON7yGznH?~U3@H)BLxpw;5}?qkm9TQBpJcf@rOvPmMwiE6)4xf9ZS zlls|B*+I!Te3Rq_PHcLCcoK%286ViFGQZ%tiw=A9^w8Od!$|-+YE(h%T2(GjWdo}e z74(bbpxzFuxcClaksU#Nh`v528``(1z?cx%{Za0q%o@Uk?K`gxvtM@e(C8`Qq;GU0 zF&dr)X;wMC{83fEC)L@gNYocIdh!oA#L~);uRGM?f>g7=+)(ZR*!~z`-Jfw!+=~$1 z!YMzOnW)x&;8A#Y5-O1i>|K+EH&`HN5@v5 z5x8~okQWj^t74KI&i-xa@!|b}QugiruVv`04VqVwn2!rcs91AXL)tl&j`@CyY&c24 zEM_^;AP#*MD$K~ANiSZ%Jr&^uvd1W9C7GYv2w+}Tb{qXuCp2}uCRx*mB zXbLIJP}Yvgn%9Fo;mA%dbd=N(6k^@yEk_SMucRYY`?w;aQ?%Loc}*KqlAxP*?#s7e zqh-ZszSvpNI76G2A}jr5p~trY(Y3xA)(Mq7%9I)`L#3aSGZ>!NB{0wt@L!&bP4vK6 z=Jb~3ISA{^MO$5GQE3k#EM?^xd$puUmPC<#!5^?2iu+Tf2>8@)bvnEby3{B^$|-{0 z6oo#jjflqJtg;ZfTkN;ox|?p{-@oP19pSr742np7GE4ggmJ;$jqgLq?!XZ)yg-&h{sdt51bny=@2|*I_1Wlj9R9~GyD`|4T2-o zB1Ikpayyj~9%*GNkq0jj88RhJ;Iv$epM;$9QutGEB|ko&&2}|yswg@JF;g*F9p|>* z^+OcVQg-MpLbuM2%Ef$WG`1m;W^KneIvVhA+bnS-4Sz)=K3O<}SjVDf(sghW zLRsH(wlJC^zbGsvRbaV!AXT8+S((GBuhYrHp6tr0^jG_6=g2VV#8aqy4pCYijh!Y6CmwVhN_gFhvD8HIuhFLH96uyBW=`1rkm2_wDh$gv6$#I;2~^| zCnVK}8Ku4@D+Yrcyi_BYR&azo&Q3aA0kFy;nVzt32?JL`o=)B=6#!mu>>-L48_l}?%cc-2CIXvsVBhuv z7o$4oK1lk;&GzYaVh|Pq%Ex&^+Ns10EE*6%8JQa!?bNI1MC1P0M^rFj@}T=?x9t^j zQoEy`U7}Iq+sA89bj&Nt3uWmiy!K}mfd7X-9~Snm$|f;uifPtbr%#-R+7&AG#c(Cw zNEdenUai1a^lbpv#I4hX!wtOQfAPUPgsOo>ZUtG;B%WLx^>+{5bb$~%*pIJVKhczD z%8K6=EyEr7En9xO+@jbD3$Y!eY=4*}{W52~tq`B^0G7lb2s_jJIRfK2txpASuKx7M z?wOyMb1{`M?fVg)AMS*6iuS{=#~=A0Kf@T-HAR(r=?~`h08(m1P%PL;y^oqC{6zKP zHO>utQD#I!Q%uC+F9yc3-`)yqOY-lN5MWoU%I%eNzp*&i{l9wON=vMkv)&^eSXE-l zf(hjEzOqABh;}_1Bb@0E1U#19)X}*uVd;GzYJo{*};P69e?` zF9{{9Um>SK1gSbNabbs_Am3AA!^wo50N6nl7K(b8H5Q0qPg}Jgmicw`krfGt^Xg5L zGmDsB#Ifob^*P9g{Phit*emi9_D)J{5r!R}WZ`DJ z74UW;M{-uN1F(BtBz^)%NSZ+sb06Pae-+h{w$jJs%1#Ipd{_P4)0BTkQ% z!@EWe$^>(JCJ60O(R66W3`4AlR0g=SfIGDvlOurdA#UqtTtQod!W(0CGWRQv)Mt{B zscsty<{Q~AO~|W!ZexNH!s_>Zz7%cfPibVuyO~$jm*igSlq9S3Urn@^h~CWIp{X56 z5YAYxShN7wCm+MhbJ4633gI6P9FxRU?t*^o(p$>8VjXJfHGcq^wgqbWJlYC{r zf@^DCdibIyqFF8R#E5p0`?cNK;Y*Dm}P~Km|O~>K%}_6 z#e%f-LQrPhH`A^VXO`x#8Q;}VK0|O6yUIaeg-krwn$#4n4d+~1WYa@$CV`w;Vfs0! z=pvu{l*Z75Y@D6}YpIc5yv!7oHEJ6&w6FNNQ?6QgCrhfTQCKQb!&E)lYsE2Fo(r+0 zd@Rn_i}BYdfw}L`D>BHb3nXjr-o-sx@tM2W1O4+p{3=M|w`Fi|3+mIbGJj1xU${@l@kK|S&kh`A&YgtycR)idSvH;mIZk#`OX_07do?9{`>VpJxSMY; z4Ex=_Py~G0BX<}+SQc93^bS)kSlEI1%Urm)WieiqBcKF%*wsyx8KgJ)f zXXf99j6iFrmcRw1rfRa?$|b9-st50?^d#`4h-6B>Kref$j=AegC!_z`dvED45PnaX zxPuE0-rfmyKhwy*HTV6#hDnFQA~z*G8mcG-&ef3|WoPxNnTc8V2;am-Y_C9atVCPNFVSTyVl)X4xTe0319)!zUZuITU%gCfp9XNfuQN+Ew%B#{E za-YONPKwQkAL(#}0RXJfCUxUeLzG>pK?GgoyAXz}Y`r?xL18A$RBl$MjsDsc5>buz zX!3ndZThR|Aqxyn?<65{Ig@Niwd zA(RrI%#b_2MdgehrE$C18-laWpPJ^UD&Py+Vj949mg^wE3IqgdHr47?@VqQcw?_Hy zitTuU6Y+|rv_>D}fcU2oMsr5uo%FZpNes?t`BW-u#&V}jPUhtk}N4h zcu@#>Rj&2Np5D6|^bQ>Yeuz*+lgympmyl|P$SbnZOeVVI=-B|+3>Tag0yMQJn>+Fr zU-lmw2e?v%6zimNj?23gtz@581|M#7pQu*pJG9(jKNV4C=q??3do;!m2naC*ETs3; zOs7&)N8_yob9ae`x-@{qvaB#&-jIEf2t6^U{fLtgJS0#7}4~wj{#|*aA2)|{2@@mD->KFep_`BQO-%ghCa=SQHrlA2%P>~_B*U%JapCI<_IrcR=m?I3PJj6u;J!1vARTH_dx+> zY-M{C1;@Vrt#zVUH_wPOm{FfFAZ8vcW$jSZe@!E)+lTWhj3<9ml z;$BsThQFC|j3ic3o@DamhK`Z$=vZ@q-YYshL|IB}S4z%FrIT%M3QDY?nNR%+{Yy-c zSZD?9lH;-{9=qd|3~NQwd+1-G z1Ov8kF8aXecFO#d9()n_!Er#c$gxx>!Cys4=E+xYF5n5LH)mq&&Qh@p1jL$FxrO2> zVnqvzR`BEQ{7O8iPZ+HnbzzjPtl$$KGvH_c(+M1OYXNRdqy%(0f7Hhn*&U97 zRV|gzxd3*>3kq0{NkDSz4H9h|_IB9P!J(-%n@;jt*j4Qr!f<~)WQweL;{fx$Z<&}& zs^bXcf)bwLT{n?v?K(S2z=x)! 
znH|p6^>u_&Ml~!SaZ9;H8{hG?mQE~kCay*5k&DK0Raa22f@N%pjOgaF#xrN$h#Ea3 zCQEX=?3K^vOQ+H=Sw=dV>kBpOh2t6Y*n2>bl(_^uRi@99cnyn1MZo*U1oafL(K4)# z)7hEPY>UNIHFpK&coF2n3D)`F6*mnP8l$YtgKUjStIabqhGD;>N5k=&Kn20L)6W%< z4XBJ~&59`Ao<_>yvP9+O^;~Kr$WJIFDe-Y6gzbu+!5EG#-h3_ej^&q8rD4G5NfVGe z4Ar@Y#!;^3CNi%o|6(iGD^Y8|kZAidH5)ccn{~ETCCZVkl-WqkHa{!S_be0AtY0ol zDfSdk<61wKP!J5}@RE0MxNgm7`!=D6xSr?$-!r>vNO% zpoi6;h;6e;Dh%mHSqm0+#}}N0VU7@4eSsf7*!BL%OTE-eQ%k0Drs|QZd0?HIp2=Ld zl~E`#`AsR@ULzSLRGbcS9F!m(s&~!)H2rme?cC4Dr?YhlZ)h?HnPRK;`J)w=-et4k z3lN1S;aDBY@Sb~<`KWEjJfi)4SEp)me9ILt?1Fy&sq|UKgK-{cT51)#_!+Wx9!g3l ztz|N)jmxA!t`Hy;+%+iG&{L?tK>GRT1&_D%c3XTAxt-Ry-C+QgNVI>)hjM7qV?PG~=n4PfGT|A4#hzc#-*NFG@q+mi0$QSrp}eZ|Y0 z(Lh}d6O-(lXS0gHA%x+*I3I){r3-H6HHC$Pg_()a3=~_osHqHPXw%WlmU)yn2BxBv zvFinr4-@BUN^dE;uaPpAl8YHBUV=7_v%18}$U`p`Z;W;QpMn`aV`3FUhq=D_1-zyL zUcPlh*vimf)ycq|Vmo8{coAGG|h(gV>*02oQUa(a0BIa+!A4V>DmE^xlTfwN~Li|U!BB8I23W}%Xic3D<~E~bk4%Pkc8`to(6!x!{Y;Jwk;jZ322KxczE)+#_;p6RtGZJp_cPr@ zHdSSZ9f(BGJzF(+cyBpdBNQMr8&wY~HZU{X3E)%+D(2)|FVr$10DkD~^D5%tu%~7b z!IN-_MK_z=Bh+Dfq9%~OkkxE$B_ic_92OCDir8jIsHyhWu&djS-|xX!{MdgiJidlL z+*qz{HwM6~sIwFe%HAe=$!_1l3@mvu~0y_>R;scKr2avm)dQ<^_dHX^9|>;t5Jc;_QXwE zWXawd#!48|+}Rp^z$tixg(2pJbqQMIrF~|Oh(iN1*y7sZXi%41VNk!SJXyl7PMaWw zNUH2yhg_>B&En@P0*-lAHmfGlJ=e7sSCYRXQZaUIo0gVL3OH|8I4yE5?F{+qG!tDb zj`+>>rWr`Le~)?Zr=4M4|>0e3NA$Ixz92XFGI=ZB;guK)Zfp@NvYBj(X#PA!<@3D7DI zRX|L&U1J~7UYJ%pPzc<}!K|asH{BncTU$p^eb%eUoka=~+GA#^LD(;6;ov5h3)GX1 zoh>WBf%eHx$b`tpmU7V!DSrlW^$zg`BAdF_roIW7RCq&4>&wvAYyLAy5ZteedTglI zXv8J~F88rEBjR9~J({ic-BVioiP`E2y$d|BK|n=)YzwI~hz+h%XPJM5h80a}qvkvs zPMRd;0ZFl*xi{wN1OLo>k`y^TBw5GlKSTZ~W%q>gOVU(9#yGjq-gttu(V)@!$YwAd zjg+rGi|xmsUsXNW*j0mL6G)MB(i(6)W)pYpEq5HK&MTl3yWA~XEvwv{SZN4bHhI0N zv+5PrZx!O9Gl^_3>FtCEJ_sxdJo3&aD`65S@q5o{=*X^;TY{f+OXxs;5V3Gvolj3`FU0y1#zQe)b%ZQY}q!_JL0?(>iR7a)G2F86#|pM)}mFvR;X10^9w$A@ugX z*q%^F3xFIsQ#lqYJs`2XspX^`9H#L-3BhB-bbRnYF|iC2?9tdmLAcO#Zb_VCN-==t z6)|wM=4n-@kOA~Ko8KUpFPLd_=3<-&7UmOtZ)H4`SilV6>Vn=cmWM_>fx#STkuopU zBh$YfhK|{0$1TfY3fz2H3TC7@jUyWy#B@Na&I|ewWp03ZZMY%BM6DPVrcK$udzw~l zn+%lGkNi7%(c}kg(hqmE2o!KcwmunI!D?x8*N;djwe--S1LBZ}<=NVA2V7{3*PIi1 z-al;Z-tFjZ3`N#g0nt%;Be5&opN9jymO0Zk9Zt;nMVJ@_=!svQy$_ZM_IfvK`J%e-_*AFcS3+p^o-`tW59$7qTX z?V81iD&D>?FVza<0}2e*hWrbwt#if8`{>DRZZ_6{BYn0$?GQj62s-4x{`uWGDT@^T z3EJpRMcqkU_2^|_AGd6nMhmX6%E-*z>pu7pLK#f&r(c%M)dx0C&`yzGIt=O7h=chX zy?i(&_O|y~8L6&V&~FqJs)vGV>qLQvYC|0DVE2-80al1zgM&Mnr{gq!Yjkf?2XZ|( z?yoW4wngxW7}C; zu;th9G4u3m=kN@!e~>h`?UTw&PVZm#nl8fgq{AjwH=}R|L%gC|{0qs_Z#>_XcFA0f zl9~%}diU)N>h46bx<+>M{0vacffmqx=_IHeBP$#yb0t;{39Wzp#Uyx(nZ!s=f+b@T zZ?1y^0NB;1P&1%F0uz^&`}jx(R?(j#60e0HaxKD|fV2SwnE98$Rde@tba(@c+wx<; zlye2`=eXAEzvy>p5_zubuDo+$wpy|I3~%Y!XI=AzMg4hoA8Gu)Y7E#dvw!=6f@453 zr%UO8kw;0_Conky8nXz*8$LwONspC8uix5mp9)pCf&R(m@YMGRFabc8x6Jb^XHo|z1bIS!JC@;2Wrso?ET8X+{ z@CFi((XQz<;P|@5*QiwX7J|P&?6k~u>NiWidB6L@jPeDG zs$yc-wJ`5-K*1nXBI7nE{myLvUA=&Nzo?=$@dl+jPS?xV?|$L_UF8*NvW3ros!Ujq zOUH-;k#xvRtv zp^(uK))Mi&H&Bdf`ZTr{vfZsj>|qnR)RBgS*dac_ZF9t)vAi5Uiw*-ym*#aTl)T|* zt@giLoe*1u;?sK%_IDcuDG!M|M7ornAVvu-WCyN8(QCF0wN#qX?~`e1y~?&av_CjV zREp|>zB04l3aB@mh8H=-43}!FRnTz{80Ov&JG7Gw(1-G^Y2||!GTy`p%@H8>Mk72vl7R0Bg%7lB{qT<2dHIFJI2oQ7Ms{)M_;-cT-)V3`z0;lGak%_ zNluSvH}{_2;7{St4OhtBmTmcDjV1@8-wxl=1Cp8;=CS|X=q05y3ex<&T=_#;{eOBz z^5_2dL_AL_V`8KEcS8XH5dRzfW5wbmF)^{y{C{5iH>Z*~X@`-X<{xpx0stufjsCGJ v$dW9WIBEWQk$R!QB(wA=tqV7Tn#P;O>yXNA9b7sdwv} zKfP=Any%@tUbFXFa~KX&FHfkU3=4+`g#d*F1qDR~Wqy`pYYYtq6<lE1V{GA>Gqr$O=M0PQ zz*rmKKxpd}AXnujt}lzaDL3ek2e3f=jr@Hfj9;T~G@e@d@oi1m409!esX)_f z27)SXr-^_*3HB$lnYaq*UFMAVL~n?(b;vZCrj0?eSmlEI-C 

From b4913295ab549652d1714082a60ec730436ed612 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Sun, 30 Jun 2019 17:16:24 +0100
Subject: [PATCH 110/238] TST/CLN: engine fixture for tests/io/excel/test_readers.py (#27139)

---
 pandas/tests/io/excel/test_readers.py | 53 ++++++++++++---------------
 pandas/tests/io/excel/test_writers.py | 24 +++++-------
 2 files changed, 33 insertions(+), 44 deletions(-)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 40a6970aa7f049..be5951fe12b469 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -31,24 +31,29 @@ def ignore_xlrd_time_clock_warning():
         yield
 
 
+@pytest.fixture(params=[
+    # Add any engines to test here
+    pytest.param('xlrd', marks=td.skip_if_no('xlrd')),
+    pytest.param('openpyxl', marks=td.skip_if_no('openpyxl')),
+    pytest.param(None, marks=td.skip_if_no('xlrd')),
+])
+def engine(request):
+    """
+    A fixture for Excel reader engines.
+ """ + return request.param + + class TestReaders: - @pytest.fixture(autouse=True, params=[ - # Add any engines to test here - pytest.param('xlrd', marks=pytest.mark.skipif( - not td.safe_import("xlrd"), reason="no xlrd")), - pytest.param('openpyxl', marks=pytest.mark.skipif( - not td.safe_import("openpyxl"), reason="no openpyxl")), - pytest.param(None, marks=pytest.mark.skipif( - not td.safe_import("xlrd"), reason="no xlrd")), - ]) - def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext): + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for read_excel calls. """ - if request.param == 'openpyxl' and read_ext == '.xls': + if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() - func = partial(pd.read_excel, engine=request.param) + func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'read_excel', func) @@ -726,23 +731,15 @@ def test_read_excel_squeeze(self, read_ext): class TestExcelFileRead: - @pytest.fixture(autouse=True, params=[ - # Add any engines to test here - pytest.param('xlrd', marks=pytest.mark.skipif( - not td.safe_import("xlrd"), reason="no xlrd")), - pytest.param('openpyxl', marks=pytest.mark.skipif( - not td.safe_import("openpyxl"), reason="no openpyxl")), - pytest.param(None, marks=pytest.mark.skipif( - not td.safe_import("xlrd"), reason="no xlrd")), - ]) - def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext): + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. """ - if request.param == 'openpyxl' and read_ext == '.xls': + if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() - func = partial(pd.ExcelFile, engine=request.param) + func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'ExcelFile', func) @@ -830,20 +827,18 @@ def test_sheet_name(self, read_ext, df_ref): tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - def test_excel_read_buffer(self, read_ext): + def test_excel_read_buffer(self, engine, read_ext): pth = 'test1' + read_ext - engine = pd.ExcelFile.keywords['engine'] # TODO: fixturize expected = pd.read_excel(pth, 'Sheet1', index_col=0, engine=engine) with open(pth, 'rb') as f: with pd.ExcelFile(f) as xls: actual = pd.read_excel(xls, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) + tm.assert_frame_equal(expected, actual) - def test_reader_closes_file(self, read_ext): + def test_reader_closes_file(self, engine, read_ext): f = open('test1' + read_ext, 'rb') - engine = pd.ExcelFile.keywords['engine'] # TODO: fixturize with pd.ExcelFile(f) as xlsx: # parses okay pd.read_excel(xlsx, 'Sheet1', index_col=0, engine=engine) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index ffa77de930cbda..d65bebe16804cf 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -224,7 +224,7 @@ def test_read_excel_parse_dates(self, ext): class _WriterBase: @pytest.fixture(autouse=True) - def set_engine_and_path(self, request, engine, ext): + def set_engine_and_path(self, engine, ext): """Fixture to set engine and open file for use in each test case Rather than requiring `engine=...` to be provided explicitly as an @@ -252,14 +252,10 @@ class and any 
subclasses, on account of the `autouse=True` @td.skip_if_no('xlrd') @pytest.mark.parametrize("engine,ext", [ - pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( - not td.safe_import('xlwt'), reason='No xlwt')), - pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) + pytest.param('openpyxl', '.xlsx', marks=td.skip_if_no('openpyxl')), + pytest.param('openpyxl', '.xlsm', marks=td.skip_if_no('openpyxl')), + pytest.param('xlwt', '.xls', marks=td.skip_if_no('xlwt')), + pytest.param('xlsxwriter', '.xlsx', marks=td.skip_if_no('xlsxwriter')) ]) class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. @@ -1198,12 +1194,10 @@ def test_raise_when_saving_timezones(self, engine, ext, dtype, class TestExcelWriterEngineTests: @pytest.mark.parametrize('klass,ext', [ - pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('xlsxwriter'), reason='No xlsxwriter')), - pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif( - not td.safe_import('xlwt'), reason='No xlwt')) + pytest.param(_XlsxWriter, '.xlsx', marks=td.skip_if_no('xlsxwriter')), + pytest.param( + _OpenpyxlWriter, '.xlsx', marks=td.skip_if_no('openpyxl')), + pytest.param(_XlwtWriter, '.xls', marks=td.skip_if_no('xlwt')) ]) def test_ExcelWriter_dispatch(self, klass, ext): with ensure_clean(ext) as path: From 0b90d5baf75b4c60934b45fe58ed999bf44442bc Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 30 Jun 2019 12:15:16 -0500 Subject: [PATCH 111/238] Bump python_requires to 3.5.3 (#27116) --- ci/deps/azure-35-compat.yaml | 2 +- doc/source/whatsnew/v0.25.0.rst | 6 +++++- setup.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml index fe207d122657bd..97c45b2be27d7d 100644 --- a/ci/deps/azure-35-compat.yaml +++ b/ci/deps/azure-35-compat.yaml @@ -11,7 +11,7 @@ dependencies: - openpyxl=2.4.8 - pytables=3.4.2 - python-dateutil=2.6.1 - - python=3.5.* + - python=3.5.3 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 82e093bc2bd490..a91a298f9f7e3c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -5,9 +5,13 @@ What's new in 0.25.0 (April XX, 2019) .. warning:: - Starting with the 0.25.x series of releases, pandas only supports Python 3.5 and higher. + Starting with the 0.25.x series of releases, pandas only supports Python 3.5.3 and higher. See :ref:`install.dropping-27` for more details. +.. warning:: + + The minimum supported Python version will be bumped to 3.6 in a future release. + .. warning:: `Panel` has been fully removed. 
For N-D labeled data structures, please diff --git a/setup.py b/setup.py index 0380c717ecb415..19c22fc25733d8 100755 --- a/setup.py +++ b/setup.py @@ -783,7 +783,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, platforms='any', - python_requires='>=3.5', + python_requires='>=3.5.3', extras_require={ 'test': [ # sync with setup.cfg minversion & install.rst From 734b6d1346bd1ee9a480946c08607dadd4b14cb3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 30 Jun 2019 13:57:01 -0500 Subject: [PATCH 112/238] CLN: Remove cdate_range, raise_on_error keyword, categories and ordered keywords in astype (#27141) * Remove raise_on_error * Removed categories and ordered keywords in astype * Remove cdate_range * Remove unused import --- doc/source/user_guide/timeseries.rst | 10 ---- doc/source/whatsnew/v0.25.0.rst | 3 + pandas/core/generic.py | 33 +---------- pandas/core/indexes/datetimes.py | 62 +------------------- pandas/core/internals/blocks.py | 15 ++--- pandas/tests/api/test_api.py | 10 ---- pandas/tests/series/indexing/test_boolean.py | 11 ---- pandas/tests/series/test_dtypes.py | 32 +--------- 8 files changed, 13 insertions(+), 163 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index fcad6db9459817..ce02059cd421f4 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -474,16 +474,6 @@ resulting ``DatetimeIndex``: Custom frequency ranges ~~~~~~~~~~~~~~~~~~~~~~~ -.. warning:: - - This functionality was originally exclusive to ``cdate_range``, which is - deprecated as of version 0.21.0 in favor of ``bdate_range``. Note that - ``cdate_range`` only utilizes the ``weekmask`` and ``holidays`` parameters - when custom business day, 'C', is passed as the frequency string. Support has - been expanded with ``bdate_range`` to work with any custom frequency string. - -.. versionadded:: 0.21.0 - ``bdate_range`` can also generate a range of custom frequency dates by using the ``weekmask`` and ``holidays`` parameters. These parameters will only be used if a custom frequency string is passed. diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a91a298f9f7e3c..e42752cca90435 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -637,6 +637,9 @@ Removal of prior version deprecations/changes - Removed the previously deprecated behavior of altering column or index labels with :meth:`Series.rename_axis` or :meth:`DataFrame.rename_axis` (:issue:`17842`) - Removed the previously deprecated ``tupleize_cols`` keyword argument in :meth:`read_html`, :meth:`read_csv`, and :meth:`DataFrame.to_csv` (:issue:`17877`, :issue:`17820`) - Removed the previously deprecated ``DataFrame.from.csv`` and ``Series.from_csv`` (:issue:`17812`) +- Removed the previously deprecated ``raise_on_error`` keyword argument in :meth:`DataFrame.where` and :meth:`DataFrame.mask` (:issue:`17744`) +- Removed the previously deprecated ``ordered`` and ``categories`` keyword arguments in ``astype`` (:issue:`17742`) +- Removed the previously deprecated ``cdate_range`` (:issue:`17691`) .. 
_whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 166d8526456fbc..841131b697f9cd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8644,13 +8644,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast : bool, default False Try to cast the result back to the input type (if possible). - raise_on_error : bool, default True - Whether to raise on invalid data types (e.g. trying to where on - strings). - - .. deprecated:: 0.21.0 - - Use `errors`. Returns ------- @@ -8738,18 +8731,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, cond_rev="False", name='where', name_other='mask')) def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): - - if raise_on_error is not None: - warnings.warn( - "raise_on_error is deprecated in " - "favor of errors='raise|ignore'", - FutureWarning, stacklevel=2) - - if raise_on_error: - errors = 'raise' - else: - errors = 'ignore' + errors='raise', try_cast=False): other = com.apply_if_callable(other, self) return self._where(cond, other, inplace, axis, level, @@ -8759,18 +8741,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, cond_rev="True", name='mask', name_other='where')) def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): - - if raise_on_error is not None: - warnings.warn( - "raise_on_error is deprecated in " - "favor of errors='raise|ignore'", - FutureWarning, stacklevel=2) - - if raise_on_error: - errors = 'raise' - else: - errors = 'ignore' + errors='raise', try_cast=False): inplace = validate_bool_kwarg(inplace, 'inplace') cond = com.apply_if_callable(cond, self) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 5ce670d9fe33e1..e2658b66f83ba1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -29,7 +29,7 @@ import pandas.core.tools.datetimes as tools from pandas.tseries.frequencies import Resolution, to_offset -from pandas.tseries.offsets import CDay, Nano, prefix_mapping +from pandas.tseries.offsets import Nano, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -1568,66 +1568,6 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, closed=closed, **kwargs) -def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, - normalize=True, name=None, closed=None, **kwargs): - """ - Return a fixed frequency DatetimeIndex, with CustomBusinessDay as the - default frequency - - .. deprecated:: 0.21.0 - - Parameters - ---------- - start : string or datetime-like, default None - Left bound for generating dates - end : string or datetime-like, default None - Right bound for generating dates - periods : integer, default None - Number of periods to generate - freq : string or DateOffset, default 'C' (CustomBusinessDay) - Frequency strings can have multiples, e.g. 
'5H' - tz : string, default None - Time zone name for returning localized DatetimeIndex, for example - Asia/Beijing - normalize : bool, default False - Normalize start/end dates to midnight before generating date range - name : string, default None - Name of the resulting DatetimeIndex - weekmask : string, Default 'Mon Tue Wed Thu Fri' - weekmask of valid business days, passed to ``numpy.busdaycalendar`` - holidays : list - list/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar`` - closed : string, default None - Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) - - Notes - ----- - Of the three parameters: ``start``, ``end``, and ``periods``, exactly two - must be specified. - - To learn more about the frequency strings, please see `this link - `__. - - Returns - ------- - rng : DatetimeIndex - """ - warnings.warn("cdate_range is deprecated and will be removed in a future " - "version, instead use pd.bdate_range(..., freq='{freq}')" - .format(freq=freq), FutureWarning, stacklevel=2) - - if freq == 'C': - holidays = kwargs.pop('holidays', []) - weekmask = kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri') - freq = CDay(holidays=holidays, weekmask=weekmask) - - return date_range(start=start, end=end, periods=periods, freq=freq, - tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) - - def _time_to_micros(time): seconds = time.hour * 60 * 60 + 60 * time.minute + time.second return 1000000 * seconds + time.microsecond diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0128b70d74327..36390d46728123 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -542,17 +542,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, if self.is_categorical_astype(dtype): # deprecated 17636 - if ('categories' in kwargs or 'ordered' in kwargs): - if isinstance(dtype, CategoricalDtype): - raise TypeError( - "Cannot specify a CategoricalDtype and also " - "`categories` or `ordered`. 
Use " - "`dtype=CategoricalDtype(categories, ordered)`" - " instead.") - warnings.warn("specifying 'categories' or 'ordered' in " - ".astype() is deprecated; pass a " - "CategoricalDtype instead", - FutureWarning, stacklevel=7) + for deprecated_arg in ('categories', 'ordered'): + if deprecated_arg in kwargs: + raise ValueError('Got an unexpected argument: {}'.format( + deprecated_arg)) categories = kwargs.get('categories', None) ordered = kwargs.get('ordered', None) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index b57c7a0cf0625f..6ed1284ff13bae 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -131,13 +131,3 @@ def test_testing(self): from pandas import testing self.check(testing, self.funcs) - - -class TestCDateRange: - - def test_deprecation_cdaterange(self): - # GH17596 - from pandas.core.indexes.datetimes import cdate_range - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - cdate_range('2017-01-01', '2017-12-31') diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 43dc292652519b..ef7312616250db 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -229,17 +229,6 @@ def test_where_unsafe(): assert_series_equal(result, expected) -def test_where_raise_on_error_deprecation(): - # gh-14968 - # deprecation of raise_on_error - s = Series(np.random.randn(5)) - cond = s > 0 - with tm.assert_produces_warning(FutureWarning): - s.where(cond, raise_on_error=True) - with tm.assert_produces_warning(FutureWarning): - s.mask(cond, raise_on_error=True) - - def test_where(): s = Series(np.random.randn(5)) cond = s > 0 diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b9146534d10f11..59566ad3232c7e 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -223,15 +223,12 @@ def test_astype_dict_like(self, dtype_class): with pytest.raises(KeyError, match=msg): s.astype(dt5) - def test_astype_categories_deprecation(self): + def test_astype_categories_deprecation_raises(self): # deprecated 17636 s = Series(['a', 'b', 'a']) - expected = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s.astype('category', categories=['a', 'b'], ordered=True) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match="Got an unexpected"): + s.astype('category', categories=['a', 'b'], ordered=True) def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] @@ -349,21 +346,12 @@ def test_astype_categorical_to_categorical(self, name, dtype_ordered, expected = Series(s_data, name=name, dtype=exp_dtype) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.astype('category', ordered=dtype_ordered) - tm.assert_series_equal(result, expected) - # different categories dtype = CategoricalDtype(list('adc'), dtype_ordered) result = s.astype(dtype) expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.astype( - 'category', categories=list('adc'), ordered=dtype_ordered) - tm.assert_series_equal(result, expected) - if dtype_ordered is False: # not specifying ordered, so only test once expected = s @@ -387,20 +375,6 @@ def 
test_astype_categoricaldtype(self): tm.assert_series_equal(result, expected) tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) - def test_astype_categoricaldtype_with_args(self): - s = Series(['a', 'b']) - type_ = CategoricalDtype(['a', 'b']) - - msg = (r"Cannot specify a CategoricalDtype and also `categories` or" - r" `ordered`\. Use `dtype=CategoricalDtype\(categories," - r" ordered\)` instead\.") - with pytest.raises(TypeError, match=msg): - s.astype(type_, ordered=True) - with pytest.raises(TypeError, match=msg): - s.astype(type_, categories=['a', 'b']) - with pytest.raises(TypeError, match=msg): - s.astype(type_, categories=['a', 'b'], ordered=False) - @pytest.mark.parametrize("dtype", [ np.datetime64, np.timedelta64, From af7f2ef73e449f01acc6de47463c9b1440c6b0fb Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Sun, 30 Jun 2019 19:27:47 +0000 Subject: [PATCH 113/238] DOC: add internal links to "Endpoints are inclusive" section (#27131) --- doc/source/user_guide/advanced.rst | 4 +++- doc/source/user_guide/indexing.rst | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 280eb05964787b..20cde34baf3732 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -1108,6 +1108,8 @@ the :meth:`~Index.is_unique` attribute. weakly_monotonic.is_monotonic_increasing weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique +.. _advanced.endpoints_are_inclusive: + Endpoints are inclusive ~~~~~~~~~~~~~~~~~~~~~~~ @@ -1137,7 +1139,7 @@ index can be somewhat complicated. For example, the following does not work: s.loc['c':'e' + 1] A very common use case is to limit a time series to start and end at two -specific dates. To enable this, we made the design to make label-based +specific dates. To enable this, we made the design choice to make label-based slicing include both endpoints: .. ipython:: python diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index c09eb87df03689..888266c3cfa55c 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -61,8 +61,8 @@ of multi-axis indexing. * A list or array of labels ``['a', 'b', 'c']``. * A slice object with labels ``'a':'f'`` (Note that contrary to usual python slices, **both** the start and the stop are included, when present in the - index! See :ref:`Slicing with labels - `.). + index! See :ref:`Slicing with labels ` + and :ref:`Endpoints are inclusive `.) * A boolean array * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). @@ -335,8 +335,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp * A list or array of labels ``['a', 'b', 'c']``. * A slice object with labels ``'a':'f'`` (Note that contrary to usual python slices, **both** the start and the stop are included, when present in the - index! See :ref:`Slicing with labels - `.). + index! See :ref:`Slicing with labels `. * A boolean array. * A ``callable``, see :ref:`Selection By Callable `. @@ -418,6 +417,9 @@ error will be raised (since doing otherwise would be computationally expensive, as well as potentially ambiguous for mixed type indexes). For instance, in the above example, ``s.loc[1:6]`` would raise ``KeyError``. +For the rationale behind this behavior, see +:ref:`Endpoints are inclusive `. + .. 
_indexing.integer: Selection by position From 3a72b814cfaabc25051babf65f1c32c9f2fb1f06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 30 Jun 2019 15:50:23 -0500 Subject: [PATCH 114/238] DOC: remove okwarning/okexcept for fixed issues (#27150) --- doc/source/getting_started/10min.rst | 1 - doc/source/user_guide/missing_data.rst | 1 - 2 files changed, 2 deletions(-) diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 68ba777ec2c2aa..510c7ef97aa988 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -712,7 +712,6 @@ See the :ref:`Plotting ` docs. plt.close('all') .. ipython:: python - :okwarning: ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 1439296fb82960..ef77826e9a444f 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -458,7 +458,6 @@ You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate at the new values. .. ipython:: python - :okexcept: ser = pd.Series(np.sort(np.random.uniform(size=100))) From f58a1fee78629ad158eca41c8f228e59aae311b0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 30 Jun 2019 15:53:06 -0500 Subject: [PATCH 115/238] DOC: remove the force uninstall from contributing guide (#27149) --- doc/source/development/contributing.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 26e9b2fdb07a6c..dde1db7e693de3 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -178,7 +178,6 @@ We'll now kick off a three-step process: # Create and activate the build environment conda env create -f environment.yml conda activate pandas-dev - conda uninstall --force pandas # or with older versions of Anaconda: source activate pandas-dev From b870dee2818cbfce090f97302b2ab07f858cbd6f Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Sun, 30 Jun 2019 21:41:55 +0000 Subject: [PATCH 116/238] DOC: tweak paragraph regarding cut and IntervalIndex (#27132) --- doc/source/user_guide/advanced.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 20cde34baf3732..6a2620635445d7 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -965,7 +965,8 @@ If you select a label *contained* within an interval, this will also select the df.loc[2.5] df.loc[[2.5, 3.5]] -``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``: +:func:`cut` and :func:`qcut` both return a ``Categorical`` object, and the bins they +create are stored as an ``IntervalIndex`` in its ``.categories`` attribute. .. ipython:: python @@ -973,13 +974,17 @@ If you select a label *contained* within an interval, this will also select the c c.categories -Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same -bins, with ``NaN`` representing a missing value similar to other dtypes. +:func:`cut` also accepts an ``IntervalIndex`` for its ``bins`` argument, which enables +a useful pandas idiom. First, We call :func:`cut` with some data and ``bins`` set to a +fixed number, to generate the bins. 
Then, we pass the values of ``.categories`` as the +``bins`` argument in subsequent calls to :func:`cut`, supplying new data which will be +binned into the same bins. .. ipython:: python pd.cut([0, 3, 5, 1], bins=c.categories) +Any value which falls outside all bins will be assigned a ``NaN`` value. Generating ranges of intervals ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From ad2e98c3c59233dd8cd5234cead93010e0bd4b8a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Jul 2019 06:41:23 -0500 Subject: [PATCH 117/238] ERR/TST: Raise NotImplementedError in to_hdf for extension dtypes in MultiIndex (#27144) --- doc/source/whatsnew/v0.25.0.rst | 4 ++ pandas/io/pytables.py | 6 ++- .../tests/indexes/datetimes/test_timezones.py | 12 ++++++ pandas/tests/indexes/multi/test_format.py | 4 +- pandas/tests/io/pytables/test_pytables.py | 36 +++++++++++++++-- pandas/tests/reshape/test_pivot.py | 40 +++++++++++++++++++ 6 files changed, 96 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e42752cca90435..1390e60179aae6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -566,6 +566,7 @@ Other API changes - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) +- :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`) .. _whatsnew_0250.deprecations: @@ -719,6 +720,7 @@ Timezones - Bug in :func:`to_datetime` with ``unit='ns'`` would drop timezone information from the parsed argument (:issue:`26168`) - Bug in :func:`DataFrame.join` where joining a timezone aware index with a timezone aware column would result in a column of ``NaN`` (:issue:`26335`) - Bug in :func:`date_range` where ambiguous or nonexistent start or end times were not handled by the ``ambiguous`` or ``nonexistent`` keywords respectively (:issue:`27088`) +- Bug in :meth:`DatetimeIndex.union` when combining a timezone aware and timezone unaware :class:`DatetimeIndex` (:issue:`21671`) Numeric ^^^^^^^ @@ -814,6 +816,7 @@ I/O - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). - Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. 
`PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) +- Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) Plotting ^^^^^^^^ @@ -868,6 +871,7 @@ Reshaping - Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) - Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`) - Bug in :meth:`DataFrame.transpose` where transposing a DataFrame with a timezone-aware datetime column would incorrectly raise ``ValueError`` (:issue:`26825`) +- Bug in :func:`pivot_table` when pivoting a timezone aware column as the ``values`` would remove timezone information (:issue:`14948`) Sparse ^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c8c27f62cef343..f439e365fbcf0b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -23,7 +23,8 @@ from pandas.core.dtypes.common import ( ensure_object, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_list_like, is_timedelta64_dtype) + is_datetime64tz_dtype, is_extension_type, is_list_like, + is_timedelta64_dtype) from pandas.core.dtypes.missing import array_equivalent from pandas import ( @@ -2647,6 +2648,9 @@ def write_multi_index(self, key, index): index.codes, index.names)): # write the level + if is_extension_type(lev): + raise NotImplementedError("Saving a MultiIndex with an " + "extension dtype is not supported.") level_key = '{key}_level{idx}'.format(key=key, idx=i) conv_level = _convert_index(lev, self.encoding, self.errors, self.format_type).set_name(level_key) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 088007ba6af4b6..af0183379790a5 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -1094,6 +1094,18 @@ def test_dti_union_aware(self): assert result[0].tz.zone == 'US/Central' assert result[-1].tz.zone == 'US/Eastern' + def test_dti_union_mixed(self): + # GH 21671 + rng = DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT]) + rng2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz='Asia/Tokyo') + result = rng.union(rng2) + expected = Index([pd.Timestamp('2011-01-01'), + pd.NaT, + pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), + pd.Timestamp('2012-01-02', tz='Asia/Tokyo')], + dtype=object) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", dateutil.tz.tzoffset(None, -28800)]) @pytest.mark.usefixtures("datetime_tz_utc") diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 85d30b8f6de6b8..8413fc1318d0b2 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -10,8 +10,8 @@ def test_dtype_str(indices): with tm.assert_produces_warning(FutureWarning): dtype = indices.dtype_str - assert isinstance(dtype, str) - assert dtype == str(indices.dtype) + assert isinstance(dtype, str) + assert dtype == str(indices.dtype) def test_format(idx): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 40cc05c3174710..ec347396727182 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -17,9 +17,9 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, Int64Index, 
MultiIndex, - RangeIndex, Series, Timestamp, bdate_range, concat, date_range, isna, - timedelta_range) + Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Index, Int64Index, + MultiIndex, RangeIndex, Series, Timestamp, bdate_range, concat, date_range, + isna, timedelta_range) import pandas.util.testing as tm from pandas.util.testing import ( assert_frame_equal, assert_series_equal, set_timezone) @@ -4749,6 +4749,19 @@ def test_select_empty_where(self, where): result = pd.read_hdf(store, "df", where=where) assert_frame_equal(result, df) + @pytest.mark.parametrize('idx', [ + date_range('2019', freq='D', periods=3, tz='UTC'), + CategoricalIndex(list('abc')) + ]) + def test_to_hdf_multiindex_extension_dtype(self, idx): + # GH 7775 + mi = MultiIndex.from_arrays([idx, idx]) + df = pd.DataFrame(0, index=mi, columns=['a']) + with ensure_clean_path(self.path) as path: + with pytest.raises(NotImplementedError, + match="Saving a MultiIndex"): + df.to_hdf(path, 'df') + class TestHDFComplexValues(Base): # GH10447 @@ -5170,3 +5183,20 @@ def test_dst_transitions(self): store.append('df', df) result = store.select('df') assert_frame_equal(result, df) + + def test_read_with_where_tz_aware_index(self): + # GH 11926 + periods = 10 + dts = pd.date_range('20151201', periods=periods, + freq='D', tz='UTC') + mi = pd.MultiIndex.from_arrays([dts, range(periods)], + names=['DATE', 'NO']) + expected = pd.DataFrame({'MYCOL': 0}, index=mi) + + key = 'mykey' + with ensure_clean_path(self.path) as path: + with pd.HDFStore(path) as store: + store.append(key, expected, format='table', append=True) + result = pd.read_hdf(path, key, + where="DATE > 20151130") + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8543d2c2df7d6f..7def8e53859c70 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -429,6 +429,46 @@ def test_pivot_with_tz(self, method): pv = pd.pivot(df, index='dt1', columns='dt2', values='data1') tm.assert_frame_equal(pv, expected) + def test_pivot_tz_in_values(self): + # GH 14948 + df = pd.DataFrame([{'uid': u'aa', + 'ts': pd.Timestamp('2016-08-12 13:00:00-0700', + tz='US/Pacific')}, + {'uid': u'aa', + 'ts': pd.Timestamp('2016-08-12 08:00:00-0700', + tz='US/Pacific')}, + {'uid': u'aa', + 'ts': pd.Timestamp('2016-08-12 14:00:00-0700', + tz='US/Pacific')}, + {'uid': u'aa', + 'ts': pd.Timestamp('2016-08-25 11:00:00-0700', + tz='US/Pacific')}, + {'uid': u'aa', + 'ts': pd.Timestamp('2016-08-25 13:00:00-0700', + tz='US/Pacific')}]) + + df = df.set_index('ts').reset_index() + mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, + second=0, microsecond=0)) + + result = pd.pivot_table(df.set_index('ts').reset_index(), + values='ts', index=['uid'], columns=[mins], + aggfunc=np.min) + expected = pd.DataFrame( + [ + [pd.Timestamp('2016-08-12 08:00:00-0700', tz='US/Pacific'), + pd.Timestamp('2016-08-25 11:00:00-0700', tz='US/Pacific')] + ], + index=pd.Index(['aa'], name='uid'), + columns=pd.DatetimeIndex( + [ + pd.Timestamp('2016-08-12 00:00:00', tz='US/Pacific'), + pd.Timestamp('2016-08-25 00:00:00', tz='US/Pacific') + ], + name='ts') + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('method', [True, False]) def test_pivot_periods(self, method): df = DataFrame({'p1': [pd.Period('2013-01-01', 'D'), From d054c037132b6cdb61cdf3ed90499287f163aeb4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 1 Jul 2019 10:36:24 -0500 Subject: [PATCH 118/238] CI: Install moto from 
conda-forge (#27163) * CI: Install moto from conda-forge Closes https://github.com/pandas-dev/pandas/issues/27161 --- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- pandas/core/indexes/base.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index bd8ba912d52980..05adbf0c924dc0 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -10,6 +10,7 @@ dependencies: - jinja2 - lxml - matplotlib + - moto - nomkl - numexpr - numpy @@ -32,4 +33,3 @@ dependencies: - pip - pip: - hypothesis>=3.58.0 - - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 43504dec269533..08208d1e2d59ac 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -10,6 +10,7 @@ dependencies: - jinja2 - lxml - matplotlib=2.2.* + - moto - numexpr - numpy=1.14.* - openpyxl @@ -29,6 +30,5 @@ dependencies: - pytest-xdist - pytest-mock - pytest-azurepipelines - - moto - hypothesis>=3.58.0 - pyreadstat diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index c497495553e8bb..fead806fc8e1c4 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -12,6 +12,7 @@ dependencies: - geopandas - html5lib - matplotlib + - moto - nomkl - numexpr - numpy=1.15.* @@ -46,6 +47,5 @@ dependencies: - pip: - brotlipy - coverage - - moto - pandas-datareader - python-dateutil diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 75e3348adab7c7..0d9a760914dab1 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -14,6 +14,7 @@ dependencies: - jinja2 - lxml=3.8.0 - matplotlib=3.0.* + - moto - nomkl - numexpr - numpy @@ -36,7 +37,6 @@ dependencies: - pytest>=4.0.2 - pytest-xdist - pytest-mock - - moto - pip - pip: - hypothesis>=3.58.0 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 23089cb577bf59..13e672cbc131a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -679,7 +679,7 @@ def dtype(self): """ return self._data.dtype - @cache_readonly + @property def dtype_str(self): """ Return the dtype str of the underlying data. 
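
Both this patch and the one that follows lean on pandas' usual soft-deprecation idiom: the public accessor emits a ``FutureWarning`` while an underscore-prefixed twin keeps serving internal callers, and the test suite asserts the warning with ``tm.assert_produces_warning`` rather than silencing it. Below is a minimal, self-contained sketch of that idiom; the class and attribute names are illustrative only and are not part of any patch in this series.

import warnings

class Container:
    """Toy stand-in for a pandas object that is deprecating an accessor."""

    def __init__(self, values):
        self._values = values

    def _internal_get_values(self):
        # Internal callers use this twin so library code does not trip
        # the deprecation warning.
        return self._values

    def get_values(self):
        # Public accessor: warn, then delegate to the internal twin.
        warnings.warn(
            "The 'get_values' method is deprecated and will be removed "
            "in a future version.",
            FutureWarning,
            stacklevel=2,
        )
        return self._internal_get_values()

A test for the deprecation would then assert the warning instead of suppressing it, roughly:

    with tm.assert_produces_warning(FutureWarning):
        res = obj.get_values()
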
From 46adc5b1c2aacb312d72729af72bc0ad600917c0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Jul 2019 10:57:41 -0500 Subject: [PATCH 119/238] DEPR: deprecate get_values (#26409) --- doc/source/whatsnew/v0.25.0.rst | 3 +++ pandas/_libs/lib.pyx | 5 ++++- pandas/_libs/src/ujson/python/objToJSON.c | 14 ++++++++++++-- pandas/core/algorithms.py | 2 +- pandas/core/arrays/categorical.py | 9 ++++++++- pandas/core/arrays/sparse.py | 19 ++++++++++++++++--- pandas/core/dtypes/concat.py | 2 +- pandas/core/frame.py | 5 +++-- pandas/core/generic.py | 10 ++++++++++ pandas/core/groupby/generic.py | 6 +++--- pandas/core/indexes/base.py | 10 ++++++++++ pandas/core/indexes/category.py | 7 ++++--- pandas/core/indexes/multi.py | 2 +- pandas/core/internals/blocks.py | 10 ++++++++-- pandas/core/ops.py | 2 +- pandas/core/series.py | 10 ++++++++++ pandas/core/sparse/frame.py | 9 +++++---- pandas/core/sparse/series.py | 2 +- pandas/io/formats/format.py | 4 ++-- pandas/tests/arrays/categorical/test_api.py | 18 ++++++++++++------ pandas/tests/arrays/sparse/test_array.py | 11 +++++++---- pandas/tests/frame/test_api.py | 6 ++++++ pandas/tests/frame/test_reshape.py | 2 +- pandas/tests/indexes/multi/test_analytics.py | 2 +- pandas/tests/indexes/period/test_period.py | 8 +++++--- .../tests/indexing/multiindex/test_slice.py | 4 ++-- pandas/tests/series/test_api.py | 6 ++++++ pandas/util/testing.py | 10 ++++++---- 28 files changed, 149 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1390e60179aae6..3b237592122a4f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -615,6 +615,9 @@ Other deprecations Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`). - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). +- The :meth:`Series.get_values`, :meth:`DataFrame.get_values`, :meth:`Index.get_values`, + :meth:`SparseArray.get_values` and :meth:`Categorical.get_values` methods are deprecated. + One of ``np.asarray(..)`` or :meth:`~Series.to_numpy` can be used instead (:issue:`19617`). - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) - :func:`read_table` has been undeprecated. (:issue:`25220`) - :attr:`Index.dtype_str` is deprecated. 
(:issue:`18262`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c09fb96eb9182c..990ac7c96a73ef 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -76,7 +76,10 @@ def values_from_object(obj: object): """ return my values or the object if we are say an ndarray """ func: object - func = getattr(obj, 'get_values', None) + if getattr(obj, '_typ', '') == 'dataframe': + return obj.values + + func = getattr(obj, '_internal_get_values', None) if func is not None: obj = func() diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index cc87d95bf35d8e..926440218b5d93 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -269,9 +269,19 @@ static PyObject *get_values(PyObject *obj) { } } - if (!values && PyObject_HasAttrString(obj, "get_values")) { + if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); - values = PyObject_CallMethod(obj, "get_values", NULL); + values = PyObject_CallMethod(obj, "_internal_get_values", NULL); + if (values && !PyArray_CheckExact(values)) { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } + } + + if (!values && PyObject_HasAttrString(obj, "get_block_values")) { + PRINTMARK(); + values = PyObject_CallMethod(obj, "get_block_values", NULL); if (values && !PyArray_CheckExact(values)) { PRINTMARK(); Py_DECREF(values); diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 98daae076fbc14..4e84d7b26b7075 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1590,7 +1590,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if is_sparse(arr): - arr = arr.get_values() + arr = arr.to_dense() elif isinstance(arr, (ABCIndexClass, ABCSeries)): arr = arr.values diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3ef2f41f253387..68c7b79becb555 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1483,6 +1483,8 @@ def get_values(self): """ Return the values. + .. deprecated:: 0.25.0 + For internal compatibility with pandas formatting. Returns @@ -1491,6 +1493,11 @@ def get_values(self): A numpy array of the same dtype as categorical.categories.dtype or Index if datetime / periods. """ + warn("The 'get_values' method is deprecated and will be removed in a " + "future version", FutureWarning, stacklevel=2) + return self._internal_get_values() + + def _internal_get_values(self): # if we are a datetime and period index, return Index to keep metadata if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) @@ -1923,7 +1930,7 @@ def __iter__(self): """ Returns an Iterator over the values of this Categorical. 
""" - return iter(self.get_values().tolist()) + return iter(self._internal_get_values().tolist()) def __contains__(self, key): """ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3512d4e9e29db2..97ab6ec8235ef2 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -454,7 +454,7 @@ def _sparse_array_op( if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: with np.errstate(all='ignore'): - result = op(left.get_values(), right.get_values()) + result = op(left.to_dense(), right.to_dense()) fill = op(_get_fill(left), _get_fill(right)) if left.sp_index.ngaps == 0: @@ -1468,8 +1468,21 @@ def to_dense(self): """ return np.asarray(self, dtype=self.sp_values.dtype) - # TODO: Look into deprecating this in favor of `to_dense`. - get_values = to_dense + def get_values(self): + """ + Convert SparseArray to a NumPy array. + + .. deprecated:: 0.25.0 + Use `to_dense` instead. + + """ + warnings.warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version. Use the 'to_dense' method instead.", + FutureWarning, stacklevel=2) + return self._internal_get_values() + + _internal_get_values = to_dense # ------------------------------------------------------------------------ # IO diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 242885c7a96793..66f7a6365fe416 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -194,7 +194,7 @@ def _concat_categorical(to_concat, axis=0): return union_categoricals(categoricals) # extract the categoricals & coerce to object if needed - to_concat = [x.get_values() if is_categorical_dtype(x.dtype) + to_concat = [x._internal_get_values() if is_categorical_dtype(x.dtype) else np.asarray(x).ravel() if not is_datetime64tz_dtype(x) else np.asarray(x.astype(object)) for x in to_concat] result = _concat_compat(to_concat) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d7da653618b2fe..3ff3fff22f4f04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1616,7 +1616,8 @@ def to_records(self, index=True, convert_datetime64=None, else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c].get_values() for c in self.columns] + arrays = ix_vals + [self[c]._internal_get_values() + for c in self.columns] count = 0 index_names = list(self.index.names) @@ -1632,7 +1633,7 @@ def to_records(self, index=True, convert_datetime64=None, names = [str(name) for name in itertools.chain(index_names, self.columns)] else: - arrays = [self[c].get_values() for c in self.columns] + arrays = [self[c]._internal_get_values() for c in self.columns] names = [str(c) for c in self.columns] index_names = [] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 841131b697f9cd..957efa402346e8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5220,6 +5220,9 @@ def get_values(self): """ Return an ndarray after converting sparse values to dense. + .. deprecated:: 0.25.0 + Use ``np.asarray(..)`` or :meth:`DataFrame.values` instead. + This is the same as ``.values`` for non-sparse data. For sparse data contained in a `SparseArray`, the data are first converted to a dense representation. @@ -5259,6 +5262,13 @@ def get_values(self): [nan, 2.], [nan, 3.]]) """ + warnings.warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version. 
Use '.values' or 'np.asarray(..)' instead.", + FutureWarning, stacklevel=2) + return self._internal_get_values() + + def _internal_get_values(self): return self.values def get_dtype_counts(self): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 72c8d330170d4c..210e82837118c9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1118,7 +1118,7 @@ def nunique(self, dropna=True): """ ids, _, _ = self.grouper.group_info - val = self.obj.get_values() + val = self.obj._internal_get_values() try: sorter = np.lexsort((val, ids)) @@ -1192,7 +1192,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=bins) ids, _, _ = self.grouper.group_info - val = self.obj.get_values() + val = self.obj._internal_get_values() # groupby removes null keys from groupings mask = ids != -1 @@ -1306,7 +1306,7 @@ def count(self): Count of values within each group. """ ids, _, ngroups = self.grouper.group_info - val = self.obj.get_values() + val = self.obj._internal_get_values() mask = (ids != -1) & ~isna(val) ids = ensure_platform_int(ids) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 13e672cbc131a2..0123e6a5f10659 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3764,6 +3764,9 @@ def get_values(self): """ Return `Index` data as an `numpy.ndarray`. + .. deprecated:: 0.25.0 + Use :meth:`Index.to_numpy` or :attr:`Index.array` instead. + Returns ------- numpy.ndarray @@ -3802,6 +3805,13 @@ def get_values(self): >>> midx.get_values().ndim 1 """ + warnings.warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version. Use '.to_numpy()' or '.array' instead.", + FutureWarning, stacklevel=2) + return self._internal_get_values() + + def _internal_get_values(self): return self.values @Appender(IndexOpsMixin.memory_usage.__doc__) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3d3774ce48e8b6..db4778f5e375f1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -355,9 +355,10 @@ def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name) - def get_values(self): - """ return the underlying data as an ndarray """ - return self._data.get_values() + def _internal_get_values(self): + # override base Index version to get the numpy array representation of + # the underlying Categorical + return self._data._internal_get_values() def tolist(self): return self._data.tolist() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a06d304fb5a229..19ba147fe9a279 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1246,7 +1246,7 @@ def values(self): for i in range(self.nlevels): vals = self._get_level_values(i) if is_categorical_dtype(vals): - vals = vals.get_values() + vals = vals._internal_get_values() if (isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, '_box_values')): vals = vals.astype(object) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 36390d46728123..b79f87461093de 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -177,6 +177,12 @@ def get_values(self, dtype=None): return self.values.astype(object) return self.values + def get_block_values(self, dtype=None): + """ + This is used in the JSON C code + """ + return self.get_values(dtype=dtype) + def to_dense(self): return self.values.view() @@ 
-2921,7 +2927,7 @@ def to_dense(self): # Categorical.get_values returns a DatetimeIndex for datetime # categories, so we can't simply use `np.asarray(self.values)` like # other types. - return self.values.get_values() + return self.values._internal_get_values() def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -3222,7 +3228,7 @@ def _putmask_preserve(nv, n): dtype, _ = maybe_promote(n.dtype) if is_extension_type(v.dtype) and is_object_dtype(dtype): - v = v.get_values(dtype) + v = v._internal_get_values(dtype) else: v = v.astype(dtype) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 0b9e56fd19556a..a4d31cb227f190 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1891,7 +1891,7 @@ def wrapper(self, other, axis=None): name=res_name, dtype='bool') else: - values = self.get_values() + values = self.to_numpy() with np.errstate(all='ignore'): res = na_op(values, other) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d54fa4485c84f..f415bc9fd3561f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -506,11 +506,21 @@ def get_values(self): """ Same as values (but handles sparseness conversions); is a view. + .. deprecated:: 0.25.0 + Use :meth:`Series.to_numpy` or :attr:`Series.array` instead. + Returns ------- numpy.ndarray Data of the Series. """ + warnings.warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version. Use '.to_numpy()' or '.array' instead.", + FutureWarning, stacklevel=2) + return self._internal_get_values() + + def _internal_get_values(self): return self._data.get_values() @property diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 67ecbcbea67f99..6a0ba5f93c5092 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -493,7 +493,7 @@ def xs(self, key, axis=0, copy=False): return data i = self.index.get_loc(key) - data = self.take([i]).get_values()[0] + data = self.take([i])._internal_get_values()[0] return Series(data, index=self.columns) # ---------------------------------------------------------------------- @@ -694,9 +694,10 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, if col not in self: continue if row_indexer is not None: - new_arrays[col] = algos.take_1d(self[col].get_values(), - row_indexer, - fill_value=fill_value) + new_arrays[col] = algos.take_1d( + self[col]._internal_get_values(), + row_indexer, + fill_value=fill_value) else: new_arrays[col] = self[col] diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 2e740c0acc465a..88b6634db92b64 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -224,7 +224,7 @@ def __repr__(self): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): """ perform a reduction operation """ - return op(self.get_values(), skipna=skipna, **kwds) + return op(self.array.to_dense(), skipna=skipna, **kwds) def __getstate__(self): # pickling diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c709ff876b3a0b..3f98fc235b2c58 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -129,7 +129,7 @@ def _get_footer(self): return str(footer) def _get_formatted_values(self): - return format_array(self.categorical.get_values(), None, + return format_array(self.categorical._internal_get_values(), None, float_format=None, na_rep=self.na_rep) def to_string(self): @@ 
-1196,7 +1196,7 @@ def _format_strings(self): if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo - array = values.get_values() + array = values._internal_get_values() else: array = np.asarray(values) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 4be3919f173c45..d2f63268e5a123 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -243,7 +243,7 @@ def test_set_categories(self): tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.get_values(), exp) + tm.assert_numpy_array_equal(c.to_dense(), exp) # all "pointers" to '4' must be changed from 3 to 0,... c = c.set_categories([4, 3, 2, 1]) @@ -257,7 +257,7 @@ def test_set_categories(self): # output is the same exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.get_values(), exp) + tm.assert_numpy_array_equal(c.to_dense(), exp) assert c.min() == 4 assert c.max() == 1 @@ -265,13 +265,13 @@ def test_set_categories(self): c2 = c.set_categories([4, 3, 2, 1], ordered=False) assert not c2.ordered - tm.assert_numpy_array_equal(c.get_values(), c2.get_values()) + tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) # set_categories should pass thru the ordering c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) assert not c2.ordered - tm.assert_numpy_array_equal(c.get_values(), c2.get_values()) + tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) @pytest.mark.parametrize('values, categories, new_categories', [ # No NaNs, same cats, same order @@ -378,7 +378,7 @@ def test_remove_unused_categories(self): tm.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) tm.assert_numpy_array_equal(out.codes, exp_codes) - assert out.get_values().tolist() == val + assert out.tolist() == val alpha = list('abcdefghijklmnopqrstuvwxyz') val = np.random.choice(alpha[::2], 10000).astype('object') @@ -386,7 +386,7 @@ def test_remove_unused_categories(self): cat = Categorical(values=val, categories=alpha) out = cat.remove_unused_categories() - assert out.get_values().tolist() == val.tolist() + assert out.tolist() == val.tolist() class TestCategoricalAPIWithFactor(TestCategorical): @@ -499,3 +499,9 @@ def test_recode_to_categories_large(self): new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected) + + def test_deprecated_get_values(self): + cat = Categorical(["a", "b", "c", "a"]) + with tm.assert_produces_warning(FutureWarning): + res = cat.get_values() + tm.assert_numpy_array_equal(res, np.array(cat)) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index fbf86f66e437fc..8a51704732d7f3 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -615,16 +615,19 @@ def test_shape(self, data, shape, dtype): [1, np.nan, np.nan, 3, np.nan], [1, np.nan, 0, 3, 0], ]) - @pytest.mark.parametrize("method", ["to_dense", "get_values"]) @pytest.mark.parametrize("fill_value", [None, 0]) - def test_dense_repr(self, vals, fill_value, method): + def test_dense_repr(self, vals, fill_value): vals = np.array(vals) arr = SparseArray(vals, fill_value=fill_value) - dense_func = getattr(arr, method) - res = dense_func() + res = arr.to_dense() tm.assert_numpy_array_equal(res, vals) + with 
tm.assert_produces_warning(FutureWarning): + res2 = arr.get_values() + + tm.assert_numpy_array_equal(res2, vals) + def test_getitem(self): def _checkit(i): assert_almost_equal(self.arr[i], self.arr.to_dense()[i]) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index ce841b302a0375..ed224e23fbe20c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -547,3 +547,9 @@ def test_tab_complete_warning(self, ip): with tm.assert_produces_warning(None): with provisionalcompleter('ignore'): list(ip.Completer.completions('df.', 1)) + + def test_get_values_deprecated(self): + df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) + with tm.assert_produces_warning(FutureWarning): + res = df.get_values() + tm.assert_numpy_array_equal(res, df.values) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index a3b9e529431e5d..ac8d1557a4c43c 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -804,7 +804,7 @@ def _test_stack_with_multiindex(multiindex): else: assert_frame_equal(result, expected) - df.columns = MultiIndex.from_tuples(df.columns.get_values(), + df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), names=df.columns.names) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 5ac73a3c5b9406..f886d78da6da24 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -20,7 +20,7 @@ def test_shift(idx): def test_groupby(idx): groups = idx.groupby(np.array([1, 1, 1, 2, 2, 2])) - labels = idx.get_values().tolist() + labels = idx.tolist() exp = {1: labels[:3], 2: labels[3:]} tm.assert_dict_equal(groups, exp) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index a70f67557bfc22..b33982f3d62f35 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -164,7 +164,9 @@ def test_values(self): exp = np.array([], dtype=np.object) tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + with tm.assert_produces_warning(FutureWarning): + tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) @@ -172,7 +174,7 @@ def test_values(self): exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) @@ -181,7 +183,7 @@ def test_values(self): exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) - tm.assert_numpy_array_equal(idx.get_values(), exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 3394c4c06d45a0..2431f27bff78ab 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -21,7 +21,7 @@ def 
test_per_axis_per_level_getitem(self): # example test case ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( 'C', 4), _mklbl('D', 2)]) - df = DataFrame(np.arange(len(ix.get_values())), index=ix) + df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] expected = df.loc[[tuple([a, b, c, d]) @@ -88,7 +88,7 @@ def test_per_axis_per_level_getitem(self): tm.assert_frame_equal(result, expected) # multi-level series - s = Series(np.arange(len(ix.get_values())), index=ix) + s = Series(np.arange(len(ix.to_numpy())), index=ix) result = s.loc['A1':'A3', :, ['C1', 'C3']] expected = s.loc[[tuple([a, b, c, d]) for a, b, c, d in s.index.values diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 1cd5bd09a82e77..71b0a2d9d74eb8 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -502,6 +502,12 @@ def test_integer_series_size(self): s = Series(range(9), dtype="Int64") assert s.size == 9 + def test_get_values_deprecation(self): + s = Series(range(9)) + with tm.assert_produces_warning(FutureWarning): + res = s.get_values() + tm.assert_numpy_array_equal(res, s.values) + class TestCategoricalSeries: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 05e0a8df496c57..cec9416e5d2c5c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1052,7 +1052,8 @@ def assert_series_equal(left, right, check_dtype=True, assert_attr_equal('dtype', left, right) if check_exact: - assert_numpy_array_equal(left.get_values(), right.get_values(), + assert_numpy_array_equal(left._internal_get_values(), + right._internal_get_values(), check_dtype=check_dtype, obj='{obj}'.format(obj=obj),) elif check_datetimelike_compat: @@ -1071,11 +1072,11 @@ def assert_series_equal(left, right, check_dtype=True, '{right}.').format(left=left.values, right=right.values) raise AssertionError(msg) else: - assert_numpy_array_equal(left.get_values(), right.get_values(), + assert_numpy_array_equal(left._internal_get_values(), + right._internal_get_values(), check_dtype=check_dtype) elif is_interval_dtype(left) or is_interval_dtype(right): assert_interval_array_equal(left.array, right.array) - elif (is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype)): # .values is an ndarray, but ._values is the ExtensionArray. 
@@ -1086,7 +1087,8 @@ def assert_series_equal(left, right, check_dtype=True, is_extension_array_dtype(right) and not is_categorical_dtype(right)): assert_extension_array_equal(left.array, right.array) else: - _testing.assert_almost_equal(left.get_values(), right.get_values(), + _testing.assert_almost_equal(left._internal_get_values(), + right._internal_get_values(), check_less_precise=check_less_precise, check_dtype=check_dtype, obj='{obj}'.format(obj=obj)) From 355e322b5ed4a31c9e1cab9652510e85a54f23a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Mon, 1 Jul 2019 19:41:05 +0300 Subject: [PATCH 120/238] ENH: Exclude nuisance columns from result of window functions (#27044) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/window.py | 66 +++++++++++++++++++++++++-------- pandas/tests/test_window.py | 15 ++++---- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3b237592122a4f..8b4becebead29b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -854,6 +854,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) +- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window.py b/pandas/core/window.py index 2b3cc4f0bf00a1..8f888ba510b0eb 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -22,7 +22,7 @@ ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex) -from pandas.core.base import PandasObject, SelectionMixin +from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin @@ -243,7 +243,7 @@ def _wrap_result(self, result, block=None, obj=None): return type(obj)(result, index=index, columns=block.columns) return result - def _wrap_results(self, results, blocks, obj): + def _wrap_results(self, results, blocks, obj, exclude=None): """ Wrap the results. 
@@ -252,6 +252,7 @@ def _wrap_results(self, results, blocks, obj): results : list of ndarrays blocks : list of blocks obj : conformed data (may be resampled) + exclude: list of columns to exclude, default to None """ from pandas import Series, concat @@ -285,6 +286,13 @@ def _wrap_results(self, results, blocks, obj): indexer = columns.get_indexer(selection.tolist() + [name]) columns = columns.take(sorted(indexer)) + # exclude nuisance columns so that they are not reindexed + if exclude is not None and exclude: + columns = [c for c in columns if c not in exclude] + + if not columns: + raise DataError('No numeric types to aggregate') + if not len(final): return obj.astype('float64') return concat(final, axis=1).reindex(columns=columns, copy=False) @@ -672,13 +680,21 @@ def _apply_window(self, mean=True, **kwargs): center = self.center blocks, obj, index = self._create_blocks() + block_list = list(blocks) + results = [] - for b in blocks: + exclude = [] + for i, b in enumerate(blocks): try: values = self._prep_values(b.values) - except TypeError: - results.append(b.values.copy()) - continue + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError('No numeric types to aggregate') if values.size == 0: results.append(values.copy()) @@ -700,7 +716,7 @@ def f(arg, *args, **kwargs): result = self._center_window(result, window) results.append(result) - return self._wrap_results(results, blocks, obj) + return self._wrap_results(results, block_list, obj, exclude) _agg_see_also_doc = dedent(""" See Also @@ -843,10 +859,22 @@ def _apply(self, func, name=None, window=None, center=None, check_minp = _use_window blocks, obj, index = self._create_blocks() + block_list = list(blocks) index, indexi = self._get_index(index=index) + results = [] - for b in blocks: - values = self._prep_values(b.values) + exclude = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError('No numeric types to aggregate') if values.size == 0: results.append(values.copy()) @@ -892,7 +920,7 @@ def calc(x): results.append(result) - return self._wrap_results(results, blocks, obj) + return self._wrap_results(results, block_list, obj, exclude) class _Rolling_and_Expanding(_Rolling): @@ -2291,13 +2319,21 @@ def _apply(self, func, **kwargs): y : same type as input argument """ blocks, obj, index = self._create_blocks() + block_list = list(blocks) + results = [] - for b in blocks: + exclude = [] + for i, b in enumerate(blocks): try: values = self._prep_values(b.values) - except TypeError: - results.append(b.values.copy()) - continue + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError('No numeric types to aggregate') if values.size == 0: results.append(values.copy()) @@ -2316,7 +2352,7 @@ def func(arg): results.append(np.apply_along_axis(func, self.axis, values)) - return self._wrap_results(results, blocks, obj) + return self._wrap_results(results, block_list, obj, exclude) @Substitution(name='ewm') @Appender(_doc_template) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 4dfdd1c96728bd..889754841a078d 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -14,7 +14,7 @@ import pandas 
as pd from pandas import ( DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna) -from pandas.core.base import SpecificationError +from pandas.core.base import DataError, SpecificationError from pandas.core.sorting import safe_sort import pandas.core.window as rwindow import pandas.util.testing as tm @@ -118,9 +118,11 @@ def tests_skip_nuisance(self): def test_skip_sum_object_raises(self): df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'}) r = df.rolling(window=3) - - with pytest.raises(TypeError, match='cannot handle this type'): - r.sum() + result = r.sum() + expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], + 'B': [np.nan, np.nan, 18, 21, 24]}, + columns=list('AB')) + tm.assert_frame_equal(result, expected) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -1069,15 +1071,12 @@ class DatetimeLike(Dtype): def check_dtypes(self, f, f_name, d, d_name, exp): roll = d.rolling(window=self.window) - if f_name == 'count': result = f(roll) tm.assert_almost_equal(result, exp) else: - - # other methods not Implemented ATM - with pytest.raises(NotImplementedError): + with pytest.raises(DataError): f(roll) From b115a6bf5553445fdf29824623ea2b7c3a424660 Mon Sep 17 00:00:00 2001 From: Michael <15952683+soilstack@users.noreply.github.com> Date: Tue, 2 Jul 2019 02:19:16 +0800 Subject: [PATCH 121/238] BUG: Partial slicing an datetime MultiIndex (#27127) Fixes GH26944 AttributeError on partial multiindex timestamp slice --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/indexes/multi.py | 4 +++- pandas/tests/indexes/multi/test_indexing.py | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8b4becebead29b..30ae4ebe21ca4e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -773,7 +773,7 @@ Indexing - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - +- Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 19ba147fe9a279..9cb0a2fac85b0c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2755,7 +2755,9 @@ def convert_indexer(start, stop, step, indexer=indexer, # a partial date slicer on a DatetimeIndex generates a slice # note that the stop ALREADY includes the stopped point (if # it was a string sliced) - return convert_indexer(start.start, stop.stop, step) + start = getattr(start, 'start', start) + stop = getattr(stop, 'stop', stop) + return convert_indexer(start, stop, step) elif level > 0 or self.lexsort_depth == 0 or step is not None: # need to have like semantics here to right diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 929c080042a45c..3acd194b28a050 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -397,3 +397,24 @@ def test_get_indexer_categorical_time(): Categorical(date_range("2012-01-01", periods=3, freq='H'))]) result = midx.get_indexer(midx) tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + + +def test_timestamp_multiindex_indexer(): + # https://github.com/pandas-dev/pandas/issues/26944 + idx = pd.MultiIndex.from_product([ + pd.date_range("2019-01-01T00:15:33", periods=100, freq="H", + name="date"), + ['x'], + [3] + ]) + df = pd.DataFrame({'foo': np.arange(len(idx))}, idx) + result = df.loc[pd.IndexSlice['2019-1-2':, "x", :], 'foo'] + qidx = pd.MultiIndex.from_product([ + pd.date_range(start="2019-01-02T00:15:33", end='2019-01-05T02:15:33', + freq="H", name="date"), + ['x'], + [3] + ]) + should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, + name="foo") + tm.assert_series_equal(result, should_be) From 7ceefb3f2e1c1f6b0e4435ab4d10ffa4aef4ec36 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 1 Jul 2019 23:06:50 +0200 Subject: [PATCH 122/238] DOC: mention that float_format can be a function, not just a string (#27156) --- pandas/core/generic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 957efa402346e8..380af8930f344d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2796,8 +2796,10 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. - float_format : str, optional - Format string for floating point numbers. + float_format : one-parameter function or str, optional, default None + Formatter for floating point numbers. For example + ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will + both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. 
By default, the value will be From 02b552d280f917c1a181fcfd956a2d01f0bd5462 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Jul 2019 16:20:04 -0500 Subject: [PATCH 123/238] Add __array_ufunc__ to Series / Array (#23293) --- doc/source/development/extending.rst | 19 +++ doc/source/getting_started/dsintro.rst | 50 ++++++-- doc/source/user_guide/computation.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 2 + pandas/core/arrays/base.py | 11 ++ pandas/core/arrays/categorical.py | 15 +++ pandas/core/arrays/integer.py | 49 +++++++- pandas/core/arrays/sparse.py | 42 +------ pandas/core/ops.py | 87 +++++++++++++- pandas/core/series.py | 103 +++++++++++++---- pandas/tests/arithmetic/test_datetime64.py | 4 + pandas/tests/arithmetic/test_numeric.py | 19 +++ pandas/tests/arrays/test_integer.py | 68 +++++++++++ pandas/tests/extension/decimal/array.py | 23 ++++ .../tests/extension/decimal/test_decimal.py | 44 +++++++ pandas/tests/series/test_analytics.py | 6 +- pandas/tests/series/test_ufunc.py | 109 ++++++++++++++---- 17 files changed, 553 insertions(+), 99 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 363ec10d58bb6a..12af80f1bce80f 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -208,6 +208,25 @@ will 2. call ``result = op(values, ExtensionArray)`` 3. re-box the result in a ``Series`` +.. _extending.extension.ufunc: + +NumPy Universal Functions +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` implements ``__array_ufunc__``. As part of the implementation, +pandas unboxes the ``ExtensionArray`` from the :class:`Series`, applies the ufunc, +and re-boxes it if necessary. + +If applicable, we highly recommend that you implement ``__array_ufunc__`` in your +extension array to avoid coercion to an ndarray. See +`the numpy documentation `__ +for an example. + +As part of your implementation, we require that you defer to pandas when a pandas +container (:class:`Series`, :class:`DataFrame`, :class:`Index`) is detected in ``inputs``. +If any of those is present, you should return ``NotImplemented``. Pandas will take care of +unboxing the array from the container and re-calling the ufunc with the unwrapped input. + .. _extending.extension.testing: Testing extension arrays diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 914c55115567aa..33e5d390447d77 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -731,28 +731,62 @@ DataFrame interoperability with NumPy functions .. _dsintro.numpy_interop: Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions -can be used with no issues on DataFrame, assuming the data within are numeric: +can be used with no issues on Series and DataFrame, assuming the data within +are numeric: .. ipython:: python np.exp(df) np.asarray(df) -The dot method on DataFrame implements matrix multiplication: +DataFrame is not intended to be a drop-in replacement for ndarray as its +indexing semantics and data model are quite different in places from an n-dimensional +array. + +:class:`Series` implements ``__array_ufunc__``, which allows it to work with NumPy's +`universal functions `_. + +The ufunc is applied to the underlying array in a Series. .. 
ipython:: python - df.T.dot(df) + ser = pd.Series([1, 2, 3, 4]) + np.exp(ser) -Similarly, the dot method on Series implements dot product: +Like other parts of the library, pandas will automatically align labeled inputs +as part of a ufunc with multiple inputs. For example, using :meth:`numpy.remainder` +on two :class:`Series` with differently ordered labels will align before the operation. .. ipython:: python - s1 = pd.Series(np.arange(5, 10)) - s1.dot(s1) + ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c']) + ser1 + ser2 + np.remainder(ser1, ser2) -DataFrame is not intended to be a drop-in replacement for ndarray as its -indexing semantics are quite different in places from a matrix. +As usual, the union of the two indices is taken, and non-overlapping values are filled +with missing values. + +.. ipython:: python + + ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd']) + ser3 + np.remainder(ser1, ser3) + +When a binary ufunc is applied to a :class:`Series` and :class:`Index`, the Series +implementation takes precedence and a Series is returned. + +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + idx = pd.Index([4, 5, 6]) + + np.maximum(ser, idx) + +NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays, +for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible, +the ufunc is applied without converting the underlying data to an ndarray. Console display ~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index a2f93dcf337d78..4f44fcaab63d45 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -5,6 +5,7 @@ Computational tools =================== + Statistical functions --------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 30ae4ebe21ca4e..8850ee79a893b2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -886,6 +886,7 @@ Sparse - Introduce a better error message in :meth:`Series.sparse.from_coo` so it returns a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`) - Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). + Build Changes ^^^^^^^^^^^^^ @@ -896,6 +897,7 @@ ExtensionArray - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - :meth:`Series.count` miscounts NA values in ExtensionArrays (:issue:`26835`) +- Added ``Series.__array_ufunc__`` to better handle NumPy ufuncs applied to Series backed by extension arrays (:issue:`23293`). - Keyword argument ``deep`` has been removed from :meth:`ExtensionArray.copy` (:issue:`27083`) Other diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6340cc732d6c1c..0762a607f20aea 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -107,6 +107,17 @@ class ExtensionArray: attributes called ``.values`` or ``._values`` to ensure full compatibility with pandas internals. But other names as ``.data``, ``._data``, ``._items``, ... can be freely used. + + If implementing NumPy's ``__array_ufunc__`` interface, pandas expects + that + + 1. You defer by raising ``NotImplemented`` when any Series are present + in `inputs`. Pandas will extract the arrays and call the ufunc again. + 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. 
+ Pandas inspect this to determine whether the ufunc is valid for the + types present. + + See :ref:`extending.extension.ufunc` for more. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 68c7b79becb555..b77a4f985067d3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna +from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import factorize, take, take_1d, unique1d @@ -1292,6 +1293,20 @@ def __array__(self, dtype=None): ret = np.asarray(ret) return ret + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # for all other cases, raise for now (similarly as what happens in + # Series.__array_prepare__) + raise TypeError("Object with dtype {dtype} cannot perform " + "the numpy op {op}".format( + dtype=self.dtype, + op=ufunc.__name__)) + def __setstate__(self, state): """Necessary for making this object picklable""" if not isinstance(state, dict): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 88de497a3329fa..644c2f634240f7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,3 +1,4 @@ +import numbers import sys from typing import Type import warnings @@ -17,7 +18,7 @@ from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna -from pandas.core import nanops +from pandas.core import nanops, ops from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.tools.numeric import to_numeric @@ -344,6 +345,52 @@ def __array__(self, dtype=None): """ return self._coerce_to_ndarray() + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For IntegerArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == 'reduce': + # Not clear how to handle missing values in reductions. Raise. + raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get('out', ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, IntegerArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. 
+ + if is_integer_dtype(x.dtype): + m = mask.copy() + return IntegerArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + def __iter__(self): for i in range(len(self)): if self._mask[i]: diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 97ab6ec8235ef2..29cc899fa6a9b3 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -38,6 +38,7 @@ from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.missing import interpolate_2d +import pandas.core.ops as ops import pandas.io.formats.printing as printing @@ -1665,42 +1666,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): return NotImplemented - special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv', - 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'} - aliases = { - 'subtract': 'sub', - 'multiply': 'mul', - 'floor_divide': 'floordiv', - 'true_divide': 'truediv', - 'power': 'pow', - 'remainder': 'mod', - 'divide': 'div', - 'equal': 'eq', - 'not_equal': 'ne', - 'less': 'lt', - 'less_equal': 'le', - 'greater': 'gt', - 'greater_equal': 'ge', - } - - flipped = { - 'lt': '__gt__', - 'le': '__ge__', - 'gt': '__lt__', - 'ge': '__le__', - 'eq': '__eq__', - 'ne': '__ne__', - } - - op_name = ufunc.__name__ - op_name = aliases.get(op_name, op_name) - - if op_name in special and kwargs.get('out') is None: - if isinstance(inputs[0], type(self)): - return getattr(self, '__{}__'.format(op_name))(inputs[1]) - else: - name = flipped.get(op_name, '__r{}__'.format(op_name)) - return getattr(self, name)(inputs[0]) + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result if len(inputs) == 1: # No alignment necessary. diff --git a/pandas/core/ops.py b/pandas/core/ops.py index a4d31cb227f190..5dd84550732121 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -6,7 +6,7 @@ import datetime import operator import textwrap -from typing import Dict, Optional +from typing import Any, Callable, Dict, Optional import warnings import numpy as np @@ -29,6 +29,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd +from pandas._typing import ArrayLike import pandas.core.common as com import pandas.core.missing as missing @@ -1660,7 +1661,14 @@ def na_op(x, y): lambda val: op(val, y)) raise - result = missing.fill_zeros(result, x, y, op_name, fill_zeros) + if isinstance(result, tuple): + # e.g. divmod + result = tuple( + missing.fill_zeros(r, x, y, op_name, fill_zeros) + for r in result + ) + else: + result = missing.fill_zeros(result, x, y, op_name, fill_zeros) return result def wrapper(left, right): @@ -2349,3 +2357,78 @@ def wrapper(self, other): wrapper.__name__ = op_name return wrapper + + +def maybe_dispatch_ufunc_to_dunder_op( + self: ArrayLike, + ufunc: Callable, + method: str, + *inputs: ArrayLike, + **kwargs: Any +): + """ + Dispatch a ufunc to the equivalent dunder method. + + Parameters + ---------- + self : ArrayLike + The array whose dunder method we dispatch to + ufunc : Callable + A NumPy ufunc + method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} + inputs : ArrayLike + The input arrays. 
+ kwargs : Any + The additional keyword arguments, e.g. ``out``. + + Returns + ------- + result : Any + The result of applying the ufunc + """ + # special has the ufuncs we dispatch to the dunder op on + special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv', + 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder', + 'matmul'} + aliases = { + 'subtract': 'sub', + 'multiply': 'mul', + 'floor_divide': 'floordiv', + 'true_divide': 'truediv', + 'power': 'pow', + 'remainder': 'mod', + 'divide': 'div', + 'equal': 'eq', + 'not_equal': 'ne', + 'less': 'lt', + 'less_equal': 'le', + 'greater': 'gt', + 'greater_equal': 'ge', + } + + # For op(., Array) -> Array.__r{op}__ + flipped = { + 'lt': '__gt__', + 'le': '__ge__', + 'gt': '__lt__', + 'ge': '__le__', + 'eq': '__eq__', + 'ne': '__ne__', + } + + op_name = ufunc.__name__ + op_name = aliases.get(op_name, op_name) + + def not_implemented(*args, **kwargs): + return NotImplemented + + if (method == '__call__' and op_name in special + and kwargs.get('out') is None): + if isinstance(inputs[0], type(self)): + name = '__{}__'.format(op_name) + return getattr(self, name, not_implemented)(inputs[1]) + else: + name = flipped.get(op_name, '__r{}__'.format(op_name)) + return getattr(self, name, not_implemented)(inputs[0]) + else: + return NotImplemented diff --git a/pandas/core/series.py b/pandas/core/series.py index f415bc9fd3561f..9179099562832c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,6 +5,7 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent +from typing import Any, Callable import warnings import numpy as np @@ -714,6 +715,84 @@ def view(self, dtype=None): # ---------------------------------------------------------------------- # NDArray Compat + _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) + + def __array_ufunc__( + self, + ufunc: Callable, + method: str, + *inputs: Any, + **kwargs: Any + ): + # TODO: handle DataFrame + from pandas.core.internals.construction import extract_array + cls = type(self) + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, '__array_priority__') and + item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, '__array_ufunc__') and + type(item).__array_ufunc__ not in no_defer and + not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + names = [getattr(x, 'name') for x in inputs if hasattr(x, 'name')] + types = tuple(type(x) for x in inputs) + # TODO: dataframe + alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. 
+ index = alignable[0].index + for s in alignable[1:]: + index |= s.index + inputs = tuple(x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types)) + else: + index = self.index + + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + if len(set(names)) == 1: + # we require names to be hashable, right? + name = names[0] # type: Any + else: + name = None + + def construct_return(result): + if lib.is_scalar(result): + return result + return self._constructor(result, + index=index, + name=name, + copy=False) + + if type(result) is tuple: + # multiple return values + return tuple(construct_return(x) for x in result) + elif method == 'at': + # no return value + return None + else: + return construct_return(result) def __array__(self, dtype=None): """ @@ -776,30 +855,6 @@ def __array__(self, dtype=None): dtype = 'M8[ns]' return np.asarray(self.array, dtype) - def __array_wrap__(self, result, context=None): - """ - Gets called after a ufunc. - """ - return self._constructor(result, index=self.index, - copy=False).__finalize__(self) - - def __array_prepare__(self, result, context=None): - """ - Gets called prior to a ufunc. - """ - - # nice error message for non-ufunc types - if (context is not None and - (not isinstance(self._values, (np.ndarray, ExtensionArray)) - or isinstance(self._values, Categorical))): - obj = context[1][0] - raise TypeError("{obj} with dtype {dtype} cannot perform " - "the numpy op {op}".format( - obj=type(obj).__name__, - dtype=getattr(obj, 'dtype', None), - op=context[0].__name__)) - return result - # ---------------------------------------------------------------------- # Unary Methods diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b1091d38c10d01..908e197ec1d282 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -670,6 +670,10 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): @pytest.mark.parametrize('other', [datetime(2016, 1, 1), Timestamp('2016-01-01'), np.datetime64('2016-01-01')]) + # Bug in NumPy? https://github.com/numpy/numpy/issues/13841 + # Raising in __eq__ will fallback to NumPy, which warns, fails, + # then re-raises the original exception. So we just need to ignore. 
+ @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") def test_scalar_comparison_tzawareness(self, op, other, tz_aware_fixture, box_with_array): tz = tz_aware_fixture diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f58f8981317dfc..31c7f47bcf5bd2 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -891,6 +891,25 @@ def test_ufunc_coercions(self, holder): exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) + @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, + pd.Float64Index, pd.Series]) + def test_ufunc_multiple_return_values(self, holder): + obj = holder([1, 2, 3], name='x') + box = pd.Series if holder is pd.Series else pd.Index + + result = np.modf(obj) + assert isinstance(result, tuple) + exp1 = pd.Float64Index([0., 0., 0.], name='x') + exp2 = pd.Float64Index([1., 2., 3.], name='x') + tm.assert_equal(result[0], tm.box_expected(exp1, box)) + tm.assert_equal(result[1], tm.box_expected(exp2, box)) + + def test_ufunc_at(self): + s = pd.Series([0, 1, 2], index=[1, 2, 3], name='x') + np.add.at(s, [0, 2], 10) + expected = pd.Series([10, 1, 12], index=[1, 2, 3], name='x') + tm.assert_series_equal(s, expected) + class TestObjectDtypeEquivalence: # Tests that arithmetic operations match operations executed elementwise diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 65f7628370ad4a..fb62a90a6007e0 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -717,6 +717,74 @@ def test_astype_nansafe(): arr.astype('uint32') +@pytest.mark.parametrize( + 'ufunc', [np.abs, np.sign]) +def test_ufuncs_single_int(ufunc): + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a) + expected = integer_array(ufunc(a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(integer_array(ufunc(a.astype(float)))) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 'ufunc', [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = integer_array([1, 2, -3, np.nan]) + with np.errstate(invalid='ignore'): + result = ufunc(a) + expected = ufunc(a.astype(float)) + tm.assert_numpy_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid='ignore'): + result = ufunc(s) + expected = ufunc(s.astype(float)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 'ufunc', [np.add, np.subtract]) +def test_ufuncs_binary_int(ufunc): + # two IntegerArrays + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a, a) + expected = integer_array(ufunc(a.astype(float), a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = integer_array(ufunc(a.astype(float), arr)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = integer_array(ufunc(arr, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with scalar + result = ufunc(a, 1) + expected = integer_array(ufunc(a.astype(float), 1)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = integer_array(ufunc(1, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize('values', [ + [0, 1], [0, None] +]) +def 
test_ufunc_reduce_raises(values): + a = integer_array(values) + with pytest.raises(NotImplementedError): + np.add.reduce(a) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 2b1bb53e962bee..d097a599730b80 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -84,6 +84,29 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return cls(values) + _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # + if not all(isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) + for t in inputs): + return NotImplemented + + inputs = tuple(x._data if isinstance(x, DecimalArray) else x + for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + + def reconstruct(x): + if isinstance(x, (decimal.Decimal, numbers.Number)): + return x + else: + return DecimalArray._from_sequence(x) + + if isinstance(result, tuple): + return tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 4625c79e1bc3dc..80885e4045e647 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -390,6 +390,14 @@ def test_divmod_array(reverse, expected_div, expected_mod): tm.assert_extension_array_equal(mod, expected_mod) +def test_ufunc_fallback(data): + a = data[:5] + s = pd.Series(a, index=range(3, 8)) + result = np.abs(s) + expected = pd.Series(np.abs(a), index=range(3, 8)) + tm.assert_series_equal(result, expected) + + def test_formatting_values_deprecated(): class DecimalArray2(DecimalArray): def _formatting_values(self): @@ -400,3 +408,39 @@ def _formatting_values(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): repr(ser) + + +def test_array_ufunc(): + a = to_decimal([1, 2, 3]) + result = np.exp(a) + expected = to_decimal(np.exp(a._data)) + tm.assert_extension_array_equal(result, expected) + + +def test_array_ufunc_series(): + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + result = np.exp(s) + expected = pd.Series(to_decimal(np.exp(a._data))) + tm.assert_series_equal(result, expected) + + +def test_array_ufunc_series_scalar_other(): + # check _HANDLED_TYPES + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + result = np.add(s, decimal.Decimal(1)) + expected = pd.Series(np.add(a, decimal.Decimal(1))) + tm.assert_series_equal(result, expected) + + +def test_array_ufunc_series_defer(): + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + + expected = pd.Series(to_decimal([2, 4, 6])) + r1 = np.add(s, a) + r2 = np.add(a, s) + + tm.assert_series_equal(r1, expected) + tm.assert_series_equal(r2, expected) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index aed08b78fe6406..df69bb35115cfe 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -484,18 +484,18 @@ def test_matmul(self): b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], columns=['p', 'q', 'r', 's']).T - # Series @ DataFrame + # Series @ DataFrame -> Series result = operator.matmul(a, b) expected = Series(np.dot(a.values, b.values), index=['1', '2', '3']) 
assert_series_equal(result, expected) - # DataFrame @ Series + # DataFrame @ Series -> Series result = operator.matmul(b.T, a) expected = Series(np.dot(b.T.values, a.T.values), index=['1', '2', '3']) assert_series_equal(result, expected) - # Series @ Series + # Series @ Series -> scalar result = operator.matmul(a, a) expected = np.dot(a.values, a.values) assert_almost_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 05d19452b1eace..1a0eeb51c4921a 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -1,3 +1,4 @@ +from collections import deque import string import numpy as np @@ -12,14 +13,12 @@ np.logaddexp, ] SPARSE = [ - pytest.param(True, - marks=pytest.mark.xfail(reason="Series.__array_ufunc__")), - False, + True, + False ] SPARSE_IDS = ['sparse', 'dense'] SHUFFLE = [ - pytest.param(True, marks=pytest.mark.xfail(reason="GH-26945", - strict=False)), + True, False ] @@ -43,7 +42,7 @@ def test_unary_ufunc(ufunc, sparse): array = np.random.randint(0, 10, 10, dtype='int64') array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype('int', 0)) + array = pd.SparseArray(array, dtype=pd.SparseDtype('int64', 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -61,8 +60,8 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -82,18 +81,15 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("flip", [ - pytest.param(True, marks=pytest.mark.xfail(reason="Index should defer")), - False -], ids=['flipped', 'straight']) +@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -121,14 +117,10 @@ def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # with alignment between the indices - - if flip and shuffle: - pytest.xfail(reason="Fix with Series.__array_ufunc__") - a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) @@ -138,8 +130,6 @@ def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, if shuffle: other = other.take(idx) - a2 = a2.take(idx) - # alignment, so the expected index is the first index in the op. if flip: index = other.align(series)[0].index else: @@ -198,10 +188,13 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, pytest.skip("sparse divmod not implemented.") a1, a2 = arrays_for_binary_ufunc + # work around https://github.com/pandas-dev/pandas/issues/26987 + a1[a1 == 0] = 1 + a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -241,7 +234,6 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) -@pytest.mark.xfail(reason="Series.__array_ufunc__") def test_binary_ufunc_drops_series_name(ufunc, sparse, arrays_for_binary_ufunc): # Drop the names when they differ. @@ -251,3 +243,70 @@ def test_binary_ufunc_drops_series_name(ufunc, sparse, result = ufunc(s1, s2) assert result.name is None + + +def test_object_series_ok(): + class Dummy: + def __init__(self, value): + self.value = value + + def __add__(self, other): + return self.value + other.value + + arr = np.array([Dummy(0), Dummy(1)]) + ser = pd.Series(arr) + tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr))) + tm.assert_series_equal(np.add(ser, Dummy(1)), + pd.Series(np.add(ser, Dummy(1)))) + + +@pytest.mark.parametrize('values', [ + pd.array([1, 3, 2]), + pytest.param( + pd.array([1, 10, 0], dtype='Sparse[int]'), + marks=pytest.mark.xfail(resason='GH-27080. 
Bug in SparseArray') + ), + pd.to_datetime(['2000', '2010', '2001']), + pd.to_datetime(['2000', '2010', '2001']).tz_localize("CET"), + pd.to_datetime(['2000', '2010', '2001']).to_period(freq="D"), + +]) +def test_reduce(values): + a = pd.Series(values) + assert np.maximum.reduce(a) == values[1] + + +@pytest.mark.parametrize('type_', [ + list, + deque, + tuple, +]) +def test_binary_ufunc_other_types(type_): + a = pd.Series([1, 2, 3], name='name') + b = type_([3, 4, 5]) + + result = np.add(a, b) + expected = pd.Series(np.add(a.to_numpy(), b), name='name') + tm.assert_series_equal(result, expected) + + +def test_object_dtype_ok(): + + class Thing: + def __init__(self, value): + self.value = value + + def __add__(self, other): + other = getattr(other, 'value', other) + return type(self)(self.value + other) + + def __eq__(self, other): + return type(other) is Thing and self.value == other.value + + def __repr__(self): + return 'Thing({})'.format(self.value) + + s = pd.Series([Thing(1), Thing(2)]) + result = np.add(s, Thing(1)) + expected = pd.Series([Thing(2), Thing(3)]) + tm.assert_series_equal(result, expected) From 58b1732749cf165941fc29075dcabd929e5c622f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 1 Jul 2019 16:40:57 -0500 Subject: [PATCH 124/238] BUG: Fix indexing on DatetimeBlock (#27110) --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/core/internals/blocks.py | 4 ++++ pandas/tests/extension/base/getitem.py | 14 ++++++++++++++ pandas/tests/extension/test_numpy.py | 4 ++++ pandas/tests/groupby/aggregate/test_other.py | 8 ++++++-- pandas/tests/indexing/test_datetime.py | 9 +++++---- 6 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8850ee79a893b2..4ff05424a2842b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -774,6 +774,8 @@ Indexing - Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) +- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`) +- Missing ^^^^^^^ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b79f87461093de..cad37bf2b8ae13 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1538,6 +1538,10 @@ def iget(self, col): col, loc = col if not com.is_null_slice(col) and col != 0: raise IndexError("{0} only contains one item".format(self)) + elif isinstance(col, slice): + if col != slice(None): + raise NotImplementedError(col) + return self.values[[loc]] return self.values[loc] else: if col != 0: diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index dfc82c6041eae5..6a5507b51b3bac 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -73,6 +73,20 @@ def test_loc_frame(self, data): result = df.loc[:3, 'A'] self.assert_series_equal(result, expected) + def test_loc_iloc_frame_single_dtype(self, data): + # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly + # return a scalar + df = pd.DataFrame({"A": data}) + 
expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype) + + result = df.loc[2] + self.assert_series_equal(result, expected) + + expected = pd.Series([data[-1]], index=["A"], name=len(data) - 1, + dtype=data.dtype) + result = df.iloc[-1] + self.assert_series_equal(result, expected) + def test_getitem_scalar(self, data): result = data[0] assert isinstance(result, data.dtype.type) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index f31fa5b87cfe58..74ca296d232958 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -180,6 +180,10 @@ def test_take_series(self, data): # ValueError: PandasArray must be 1-dimensional. super().test_take_series(data) + @pytest.mark.xfail(reason="astype doesn't recognize data.dtype") + def test_loc_iloc_frame_single_dtype(self, data): + super().test_loc_iloc_frame_single_dtype(data) + class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): @skip_nested diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index a061eaa1a2c6f1..903ffa23173cbe 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -421,11 +421,15 @@ def test_agg_timezone_round_trip(): assert ts == grouped.nth(0)['B'].iloc[0] assert ts == grouped.head(1)['B'].iloc[0] assert ts == grouped.first()['B'].iloc[0] - assert ts == grouped.apply(lambda x: x.iloc[0])[0] + + # GH#27110 applying iloc should return a DataFrame + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] ts = df['B'].iloc[2] assert ts == grouped.last()['B'].iloc[0] - assert ts == grouped.apply(lambda x: x.iloc[-1])[0] + + # GH#27110 applying iloc should return a DataFrame + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index d4da34cab6f5c9..278fa6bd44f99b 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -39,7 +39,7 @@ def test_setitem_with_datetime_tz(self): def test_indexing_with_datetime_tz(self): - # 8260 + # GH#8260 # support datetime64 with tz idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), @@ -65,11 +65,12 @@ def test_indexing_with_datetime_tz(self): # indexing - fast_xs df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) result = df.iloc[5] - expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') - assert result == expected + expected = Series([Timestamp('2014-01-06 00:00:00+0000', tz='UTC')], + index=['a'], name=5) + tm.assert_series_equal(result, expected) result = df.loc[5] - assert result == expected + tm.assert_series_equal(result, expected) # indexing - boolean result = df[df.a > df.a[3]] From 54f4514e92cbb7d64b77d2194ba873a6541e81ce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Jul 2019 16:54:17 -0500 Subject: [PATCH 125/238] CLN: True option in Series.groupby.nth(dropna=) (#27168) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/groupby.py | 22 +++++----------------- pandas/tests/groupby/test_nth.py | 11 +++++------ 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4ff05424a2842b..42d90a6161a883 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -644,6 +644,7 @@ Removal of prior version deprecations/changes - Removed the previously 
deprecated ``raise_on_error`` keyword argument in :meth:`DataFrame.where` and :meth:`DataFrame.mask` (:issue:`17744`) - Removed the previously deprecated ``ordered`` and ``categories`` keyword arguments in ``astype`` (:issue:`17742`) - Removed the previously deprecated ``cdate_range`` (:issue:`17691`) +- Removed the previously deprecated ``True`` option for the ``dropna`` keyword argument in :func:`SeriesGroupBy.nth` (:issue:`17493`) .. _whatsnew_0250.performance: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 202d4fb15f9717..925f006de92b6a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -13,7 +13,6 @@ class providing the base-class of operations. from functools import partial, wraps import types from typing import FrozenSet, List, Optional, Tuple, Type, Union -import warnings import numpy as np @@ -1741,22 +1740,11 @@ def nth(self, "dropna option with a list of nth values is not supported") if dropna not in ['any', 'all']: - if isinstance(self._selected_obj, Series) and dropna is True: - warnings.warn("the dropna={dropna} keyword is deprecated," - "use dropna='all' instead. " - "For a Series groupby, dropna must be " - "either None, 'any' or 'all'.".format( - dropna=dropna), - FutureWarning, - stacklevel=2) - dropna = 'all' - else: - # Note: when agg-ing picker doesn't raise this, - # just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed {dropna}).".format( - dropna=dropna)) + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed {dropna}).".format( + dropna=dropna)) # old behaviour, but with all and any support for DataFrames. 
# modified in GH 7559 to have better perf diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index b174fb0e0b6f96..deb0f48b9cea2c 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -3,8 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -from pandas.util.testing import ( - assert_frame_equal, assert_produces_warning, assert_series_equal) +from pandas.util.testing import assert_frame_equal, assert_series_equal def test_first_last_nth(df): @@ -168,13 +167,13 @@ def test_nth(): result = s.groupby(g, sort=False).nth(0, dropna='all') assert_series_equal(result, expected) + with pytest.raises(ValueError, match='For a DataFrame groupby'): + s.groupby(g, sort=False).nth(0, dropna=True) + # doc example df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A') - # PR 17493, related to issue 11038 - # test Series.nth with True for dropna produces FutureWarning - with assert_produces_warning(FutureWarning): - result = g.B.nth(0, dropna=True) + result = g.B.nth(0, dropna='all') expected = g.B.first() assert_series_equal(result, expected) From 3b3b7915723e014331210e64dfb7a9574305c3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Tue, 2 Jul 2019 00:56:47 +0300 Subject: [PATCH 126/238] BUG: Fix empty closed window issue with rolling min and max (#27140) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 6 ++++-- pandas/tests/test_window.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 42d90a6161a883..4ef29806a06c63 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -858,6 +858,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) - Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) +- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window`` (:issue:`26005`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index df86f395d60977..6203577e450d95 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1313,9 +1313,11 @@ cdef _roll_min_max_variable(ndarray[numeric] values, # if right is open then the first window is empty close_offset = 0 if endi[0] > starti[0] else 1 + # first window's size + curr_win_size = endi[0] - starti[0] for i in range(endi[0], endi[N-1]): - if not Q.empty(): + if not Q.empty() and curr_win_size > 0: output[i-1+close_offset] = calc_mm( minp, nobs, values[Q.front()]) else: @@ -1344,7 +1346,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, Q.push_back(i) W.push_back(i) - if not Q.empty(): + if not Q.empty() and curr_win_size > 0: output[N-1] = calc_mm(minp, nobs, values[Q.front()]) else: output[N-1] = NaN diff --git a/pandas/tests/test_window.py 
b/pandas/tests/test_window.py index 889754841a078d..8604acb1bd2b2c 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -48,6 +48,12 @@ def win_types_special(request): return request.param +@pytest.fixture(params=["sum", "mean", "median", "max", "min", + "var", "std", "kurt", "skew"]) +def arithmetic_win_operators(request): + return request.param + + class Base: _nan_locs = np.arange(20, 40) @@ -522,6 +528,18 @@ def test_closed(self): with pytest.raises(ValueError): df.rolling(window=3, closed='neither') + @pytest.mark.parametrize("closed", ["neither", "left"]) + def test_closed_empty(self, closed, arithmetic_win_operators): + # GH 26005 + func_name = arithmetic_win_operators + ser = pd.Series(data=np.arange(5), + index=pd.date_range("2000", periods=5, freq="2D")) + roll = ser.rolling("1D", closed=closed) + + result = getattr(roll, func_name)() + expected = pd.Series([np.nan] * 5, index=ser.index) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", ['min', 'max']) def test_closed_one_entry(self, func): # GH24718 From 6c658794f96df42b3b7798039285edd912c2e588 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Jul 2019 18:56:49 -0500 Subject: [PATCH 127/238] API: change IntervalIndex.contains to work elementwise (#17753) --- doc/source/reference/arrays.rst | 1 + doc/source/reference/indexing.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/interval.py | 47 +++++++++++++++++++ pandas/core/indexes/base.py | 22 +++++---- pandas/core/indexes/category.py | 4 -- pandas/core/indexes/datetimelike.py | 2 - pandas/core/indexes/interval.py | 27 ++--------- pandas/core/indexes/multi.py | 2 - pandas/core/indexes/period.py | 2 - pandas/core/indexing.py | 2 +- .../tests/indexes/interval/test_interval.py | 37 ++++++++------- pandas/tests/indexes/period/test_indexing.py | 6 --- pandas/tests/indexes/test_base.py | 5 ++ pandas/tests/indexes/test_range.py | 6 ++- 15 files changed, 99 insertions(+), 67 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 77a87cafb92581..bf9520c54040df 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -335,6 +335,7 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`. arrays.IntervalArray.from_arrays arrays.IntervalArray.from_tuples arrays.IntervalArray.from_breaks + arrays.IntervalArray.contains arrays.IntervalArray.overlaps arrays.IntervalArray.set_closed arrays.IntervalArray.to_tuples diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index bbac964e8a201a..65860eb5c2f51d 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -248,7 +248,6 @@ IntervalIndex components IntervalIndex.from_arrays IntervalIndex.from_tuples IntervalIndex.from_breaks - IntervalIndex.contains IntervalIndex.left IntervalIndex.right IntervalIndex.mid @@ -260,6 +259,7 @@ IntervalIndex components IntervalIndex.get_loc IntervalIndex.get_indexer IntervalIndex.set_closed + IntervalIndex.contains IntervalIndex.overlaps IntervalIndex.to_tuples diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4ef29806a06c63..fa8519c89b67fd 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -624,6 +624,7 @@ Other deprecations - :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`) - :meth:`Series.put` is deprecated. 
(:issue:`18262`) - :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`) +- :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`). .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index aaa41241825982..8ed28065ee7aae 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -79,6 +79,7 @@ from_arrays from_tuples from_breaks +contains overlaps set_closed to_tuples @@ -1017,6 +1018,52 @@ def repeat(self, repeats, axis=None): right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) + _interval_shared_docs['contains'] = """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the %(klass)s. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + %(klass)s.overlaps : Check if an Interval overlaps the values in the + %(klass)s. + + Examples + -------- + >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + %(klass)s([(0, 1], (1, 3], (2, 4]], + closed='right', + dtype='interval[int64]') + >>> intervals.contains(0.5) + array([ True, False, False]) + """ + + @Appender(_interval_shared_docs['contains'] % _shared_docs_kwargs) + def contains(self, other): + if isinstance(other, Interval): + raise NotImplementedError( + 'contains not implemented for two intervals' + ) + + return ( + (self.left < other if self.open_left else self.left <= other) & + (other < self.right if self.open_right else other <= self.right) + ) + _interval_shared_docs['overlaps'] = """ Check elementwise if an Interval overlaps the values in the %(klass)s. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0123e6a5f10659..548791dafea1d2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4019,13 +4019,6 @@ def is_type_compatible(self, kind): >>> idx Int64Index([1, 2, 3, 4], dtype='int64') - >>> idx.contains(2) - True - >>> idx.contains(6) - False - - This is equivalent to: - >>> 2 in idx True >>> 6 in idx @@ -4040,8 +4033,21 @@ def __contains__(self, key): except (OverflowError, TypeError, ValueError): return False - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def contains(self, key): + """ + Return a boolean indicating whether the provided key is in the index. + + .. deprecated:: 0.25.0 + Use ``key in index`` instead of ``index.contains(key)``. + + Returns + ------- + bool + """ + warnings.warn( + "The 'contains' method is deprecated and will be removed in a " + "future version. 
Use 'key in index' instead of " + "'index.contains(key)'", FutureWarning, stacklevel=2) return key in self def __hash__(self): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index db4778f5e375f1..321297335cf236 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -386,10 +386,6 @@ def __contains__(self, key): return contains(self, key, container=self._engine) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) - def contains(self, key): - return key in self - def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7c90fb11aa1bf1..e141f7b5c5b230 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -221,8 +221,6 @@ def __contains__(self, key): except (KeyError, TypeError, ValueError): return False - contains = __contains__ - # Try to run function on index first, and then on elements of index # Especially important for group-by functionality def map(self, mapper, na_action=None): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 777fa2eadd289f..9f9ebcf67cee68 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -139,7 +139,7 @@ def func(intvidx_self, other, sort=False): name=_index_doc_kwargs['name'], versionadded="0.20.0", extra_attributes="is_overlapping\nvalues\n", - extra_methods="contains\n", + extra_methods="", examples=textwrap.dedent("""\ Examples -------- @@ -291,27 +291,6 @@ def __contains__(self, key): except KeyError: return False - def contains(self, key): - """ - Return a boolean indicating if the key is IN the index - - We accept / allow keys to be not *just* actual - objects. 
- - Parameters - ---------- - key : int, float, Interval - - Returns - ------- - boolean - """ - try: - self.get_loc(key) - return True - except KeyError: - return False - @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", examples=""" @@ -1137,6 +1116,10 @@ def equals(self, other): self.right.equals(other.right) and self.closed == other.closed) + @Appender(_interval_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, other): + return self._data.contains(other) + @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs) def overlaps(self, other): return self._data.overlaps(other) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9cb0a2fac85b0c..fd64f18c50b34b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -922,8 +922,6 @@ def __contains__(self, key): except (LookupError, TypeError, ValueError): return False - contains = __contains__ - @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is not None: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index dc11099c3e903d..f61b2e679f0c81 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -433,8 +433,6 @@ def __contains__(self, key): except Exception: return False - contains = __contains__ - @cache_readonly def _int64index(self): return Int64Index._simple_new(self.asi8, name=self.name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1539feb2e0856c..7e199c6c9f66ba 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2393,7 +2393,7 @@ def convert_to_index_sliceable(obj, key): elif isinstance(key, str): # we are an actual column - if obj._data.items.contains(key): + if key in obj._data.items: return None # We might have a datetimelike string that we can translate to a diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index b2f409837344a5..a5e9f5902f565c 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -753,23 +753,28 @@ def test_contains(self): assert Interval(3, 5) not in i assert Interval(-1, 0, closed='left') not in i - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def testcontains(self): + def test_contains_method(self): # can select values that are IN the range of a value i = IntervalIndex.from_arrays([0, 1], [1, 2]) - assert i.contains(0.1) - assert i.contains(0.5) - assert i.contains(1) - assert i.contains(Interval(0, 1)) - assert i.contains(Interval(0, 2)) + expected = np.array([False, False], dtype='bool') + actual = i.contains(0) + tm.assert_numpy_array_equal(actual, expected) + actual = i.contains(3) + tm.assert_numpy_array_equal(actual, expected) - # these overlaps completely - assert i.contains(Interval(0, 3)) - assert i.contains(Interval(1, 3)) + expected = np.array([True, False], dtype='bool') + actual = i.contains(0.5) + tm.assert_numpy_array_equal(actual, expected) + actual = i.contains(1) + tm.assert_numpy_array_equal(actual, expected) - assert not i.contains(20) - assert not i.contains(-20) + # __contains__ not implemented for "interval in interval", follow + # that for the contains method for now + with pytest.raises( + NotImplementedError, + match='contains not implemented for two'): + i.contains(Interval(0, 1)) def test_dropna(self, closed): @@ -939,11 +944,9 @@ def test_datetime(self, tz): assert iv_false not in index # 
.contains does check individual points - assert not index.contains(Timestamp('2000-01-01', tz=tz)) - assert index.contains(Timestamp('2000-01-01T12', tz=tz)) - assert index.contains(Timestamp('2000-01-02', tz=tz)) - assert index.contains(iv_true) - assert not index.contains(iv_false) + assert not index.contains(Timestamp('2000-01-01', tz=tz)).any() + assert index.contains(Timestamp('2000-01-01T12', tz=tz)).any() + assert index.contains(Timestamp('2000-01-02', tz=tz)).any() # test get_indexer start = Timestamp('1999-12-31T12:00', tz=tz) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 0801b364028708..27a690e58b70f1 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -464,19 +464,13 @@ def test_contains(self): idx0 = pd.PeriodIndex(ps0) for p in ps0: - assert idx0.contains(p) assert p in idx0 - - assert idx0.contains(str(p)) assert str(p) in idx0 - assert idx0.contains('2017-09-01 00:00:01') assert '2017-09-01 00:00:01' in idx0 - assert idx0.contains('2017-09') assert '2017-09' in idx0 - assert not idx0.contains(p3) assert p3 not in idx0 def test_get_value(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c618b9b05a9426..b46e5835f4b411 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2159,6 +2159,11 @@ def test_tab_complete_warning(self, ip): with provisionalcompleter('ignore'): list(ip.Completer.completions('idx.', 4)) + def test_deprecated_contains(self): + for index in self.indices.values(): + with tm.assert_produces_warning(FutureWarning): + index.contains(1) + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index e9fe1278d7827c..7cdf5db64b3a9f 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -273,10 +273,12 @@ def test_cached_data(self): 91 in idx assert idx._cached_data is None - idx.contains(90) + with tm.assert_produces_warning(FutureWarning): + idx.contains(90) assert idx._cached_data is None - idx.contains(91) + with tm.assert_produces_warning(FutureWarning): + idx.contains(91) assert idx._cached_data is None idx.all() From 527e714647a16ad31f80ea535af5a63156515861 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Tue, 2 Jul 2019 00:17:48 +0000 Subject: [PATCH 128/238] Fix build warnings (#27157) --- pandas/_libs/groupby.pyx | 2 +- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable.pyx | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/index.pyx | 4 ++-- pandas/_libs/lib.pyx | 8 ++++---- pandas/_libs/src/parser/tokenizer.c | 11 ++++++----- 8 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8f662b57615f3a..e3f18572abca1e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -260,7 +260,7 @@ def group_shift_indexer(int64_t[:] out, const int64_t[:] labels, int ngroups, int periods): cdef: Py_ssize_t N, i, j, ii - int offset, sign + int offset = 0, sign int64_t lab, idxer, idxer_slot int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) int64_t[:, :] label_indexer diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 609420f4297985..51ec4ba43159cc 100644 --- 
a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -41,7 +41,7 @@ cdef class StringHashTable(HashTable): cdef struct Int64VectorData: int64_t *data - size_t n, m + Py_ssize_t n, m cdef class Int64Vector: cdef Int64VectorData *data diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 544fb3d8a15c0b..3e620f5934d5ef 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -44,7 +44,7 @@ cdef int64_t NPY_NAT = util.get_nat() _SIZE_HINT_LIMIT = (1 << 20) + 7 -cdef size_t _INIT_VEC_CAP = 128 +cdef Py_ssize_t _INIT_VEC_CAP = 128 include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 8c2c560c062acc..bf2189a8c1fd72 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -659,7 +659,7 @@ cdef class StringHashTable(HashTable): int64_t[:] locs = np.empty(n, dtype=np.int64) # these by-definition *must* be strings - vecs = malloc(n * sizeof(char *)) + vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 80d864c65d0875..e400ec0e608f05 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -241,7 +241,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): # construct the table n = len(values) - kh_resize_{{ttype}}(table, min(n, len(values))) + kh_resize_{{ttype}}(table, n) {{if dtype == 'object'}} for i in range(n): diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index b7c3e0e4cdd63b..ba2838d59f8149 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -352,10 +352,10 @@ cdef class IndexEngine: cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: cdef: - Py_ssize_t mid, lo = 0, hi = len(values) - 1 + Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval - if hi >= 0 and val > util.get_value_at(values, hi): + if hi == 0 or (hi > 0 and val > util.get_value_at(values, hi)): return len(values) while lo < hi: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 990ac7c96a73ef..1df220029def62 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -480,7 +480,7 @@ def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): def maybe_booleans_to_slice(ndarray[uint8_t] mask): cdef: Py_ssize_t i, n = len(mask) - Py_ssize_t start, end + Py_ssize_t start = 0, end = 0 bint started = 0, finished = 0 for i in range(n): @@ -1634,7 +1634,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. 
""" cdef: - Py_ssize_t i, j, n = len(values) + Py_ssize_t i = 0, j, n = len(values) object base_val, base_tz, val, tz if n == 0: @@ -1916,8 +1916,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ndarray[int64_t] ints ndarray[uint64_t] uints ndarray[uint8_t] bools - ndarray[int64_t] idatetimes - ndarray[int64_t] itimedeltas + int64_t[:] idatetimes + int64_t[:] itimedeltas Seen seen = Seen() object val float64_t fval, fnan diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 3146e49455609f..2752fb64240222 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -424,13 +424,13 @@ static void append_warning(parser_t *self, const char *msg) { if (self->warn_msg == NULL) { self->warn_msg = (char *)malloc(length + 1); - strncpy(self->warn_msg, msg, strlen(msg) + 1); + snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); newptr = safe_realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { self->warn_msg = (char *)newptr; - strncpy(self->warn_msg + ex_length, msg, strlen(msg) + 1); + snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); } } } @@ -1433,13 +1433,14 @@ PANDAS_INLINE void uppercase(char *p) { int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; - int bufsize = sizeof(char) * (strlen(item) + 1); + size_t length0 = (strlen(item) + 1); + int bufsize = length0; static const char *tstrs[1] = {"TRUE"}; static const char *fstrs[1] = {"FALSE"}; tmp = malloc(bufsize); - strncpy(tmp, item, bufsize); + snprintf(tmp, length0, "%s", item); uppercase(tmp); for (i = 0; i < 1; ++i) { @@ -1815,7 +1816,7 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, double r = PyOS_string_to_double(p, q, 0); if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) *error = Py_HUGE_VAL; + else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); return r; } From 7ec7c9eecab5720a88e659fd0993eaab47d0c33b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Jul 2019 20:55:29 -0700 Subject: [PATCH 129/238] CLN: convert argument in .take method (#27171) * CLN: Convert argument in take * Fix one test and whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 16 +--------------- pandas/tests/frame/test_axis_select_reindex.py | 9 ++------- pandas/tests/series/indexing/test_indexing.py | 3 --- pandas/tests/sparse/series/test_series.py | 9 --------- 5 files changed, 4 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fa8519c89b67fd..2030bb4d974c3b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -646,6 +646,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``ordered`` and ``categories`` keyword arguments in ``astype`` (:issue:`17742`) - Removed the previously deprecated ``cdate_range`` (:issue:`17691`) - Removed the previously deprecated ``True`` option for the ``dropna`` keyword argument in :func:`SeriesGroupBy.nth` (:issue:`17493`) +- Removed the previously deprecated ``convert`` keyword argument in :meth:`Series.take` and :meth:`DataFrame.take`(:issue:`17352`) .. 
_whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 380af8930f344d..822428c6787bea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3365,7 +3365,7 @@ def _take(self, indices, axis=0, is_copy=True): return result - def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): + def take(self, indices, axis=0, is_copy=True, **kwargs): """ Return the elements in the given *positional* indices along an axis. @@ -3380,15 +3380,6 @@ def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): axis : {0 or 'index', 1 or 'columns', None}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. - convert : bool, default True - Whether to convert negative indices into positive ones. - For example, ``-1`` would map to the ``len(axis) - 1``. - The conversions are similar to the behavior of indexing a - regular Python list. - - .. deprecated:: 0.21.0 - In the future, negative indices will always be converted. - is_copy : bool, default True Whether to return a copy of the original object or not. **kwargs @@ -3449,11 +3440,6 @@ class max_speed 1 monkey mammal NaN 3 lion mammal 80.5 """ - if convert is not None: - msg = ("The 'convert' parameter is deprecated " - "and will be removed in a future version.") - warnings.warn(msg, FutureWarning, stacklevel=2) - nv.validate_take(tuple(), kwargs) return self._take(indices, axis=axis, is_copy=is_copy) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 18c95beb62a13a..12ac373aa8f607 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -919,13 +919,8 @@ def test_take(self, float_frame): expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = df.take(order, convert=True, axis=0) - assert_frame_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = df.take(order, convert=False, axis=0) - assert_frame_equal(result, expected) + result = df.take(order, axis=0) + assert_frame_equal(result, expected) # axis = 1 result = df.take(order, axis=1) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 702e22b6741e4a..d794b4aca82e67 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -801,9 +801,6 @@ def test_take(): with pytest.raises(IndexError, match=msg.format(5)): s.take([2, 5]) - with tm.assert_produces_warning(FutureWarning): - s.take([-1, 3, 4], convert=False) - def test_take_categorical(): # https://github.com/pandas-dev/pandas/issues/20664 diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 290e0203567db1..2abd63281c4fee 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -542,15 +542,6 @@ def _compare(idx): exp = pd.Series(np.repeat(nan, 5)) tm.assert_series_equal(sp.take([0, 1, 2, 3, 4]), exp.to_sparse()) - # multiple FutureWarnings, can't check stacklevel - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - sp.take([1, 5], convert=True) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - sp.take([1, 5], convert=False) - def test_numpy_take(self): sp = SparseSeries([1.0, 2.0, 3.0]) indices = [1, 
2] From e145443deaf792051d44abb401efa878e514422f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 2 Jul 2019 05:27:11 -0700 Subject: [PATCH 130/238] ERR: Raise on duplicates names in read_csv (#27175) --- doc/source/user_guide/io.rst | 3 +-- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/parsers.py | 9 +++------ pandas/tests/io/parser/test_dtypes.py | 9 ++++----- pandas/tests/io/parser/test_mangle_dupes.py | 15 +++++---------- 6 files changed, 15 insertions(+), 24 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9af6c36cc4e4d3..b9f90bf750482c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -108,8 +108,7 @@ header : int or list of ints, default ``'infer'`` line of data rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. Duplicates in this list will cause - a ``UserWarning`` to be issued. + explicitly pass ``header=None``. Duplicates in this list are not allowed. index_col : int, str, sequence of int / str, or False, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 52ea9e8839e45e..1dad8769a6b39c 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -218,7 +218,7 @@ contained the values ``[0, 3]``. **New behavior**: .. ipython:: python - :okwarning: + :okexcept: pd.read_csv(StringIO(data), names=names) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2030bb4d974c3b..7495d7d7313fea 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -567,6 +567,7 @@ Other API changes - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) - :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`) +- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`) .. _whatsnew_0250.deprecations: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 73d47af5922f7e..8fe0e466e7c0ac 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -99,8 +99,8 @@ data rather than the first line of the file. names : array-like, optional List of column names to use. If file contains no header row, then you - should explicitly pass ``header=None``. Duplicates in this list will cause - a ``UserWarning`` to be issued. + should explicitly pass ``header=None``. Duplicates in this list are not + allowed. index_col : int, str, sequence of int / str, or False, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a @@ -394,10 +394,7 @@ def _validate_names(names): if names is not None: if len(names) != len(set(names)): - msg = ("Duplicate names specified. 
This " - "will raise an error in the future.") - warnings.warn(msg, UserWarning, stacklevel=3) - + raise ValueError('Duplicate names are not allowed.') return names diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1d3c935e9101b4..738b9d96937507 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -424,18 +424,17 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) -def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers): +def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): # see gh-9424 parser = all_parsers expected = concat([Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], axis=1) expected.index = expected.index.astype(object) - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + with pytest.raises(ValueError, match='Duplicate names'): data = "" - result = parser.read_csv(StringIO(data), names=["one", "one"], - dtype={0: "u1", 1: "f"}) - tm.assert_frame_equal(result, expected) + parser.read_csv(StringIO(data), names=["one", "one"], + dtype={0: "u1", 1: "f"}) def test_raise_on_passed_int_dtype_with_nas(all_parsers): diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 3b00acd8598fac..6ab761398631b9 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -37,17 +37,13 @@ def test_basic_names(all_parsers): tm.assert_frame_equal(result, expected) -def test_basic_names_warn(all_parsers): +def test_basic_names_raise(all_parsers): # See gh-7160 parser = all_parsers data = "0,1,2\n3,4,5" - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=["a", "b", "a.1"]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = parser.read_csv(StringIO(data), names=["a", "b", "a"]) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match='Duplicate names'): + parser.read_csv(StringIO(data), names=["a", "b", "a"]) @pytest.mark.parametrize("data,expected", [ @@ -90,9 +86,8 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): # see gh-17095 parser = all_parsers - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = parser.read_csv(StringIO(data), names=names) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match='Duplicate names'): + parser.read_csv(StringIO(data), names=names) def test_mangled_unnamed_placeholders(all_parsers): From a99c4630ce295872996ba43204ebd14a751bece1 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 2 Jul 2019 06:27:37 -0600 Subject: [PATCH 131/238] BUG: Fix Index constructor with mixed closed Intervals (#27173) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/indexes/base.py | 6 +++++- pandas/tests/indexes/interval/test_construction.py | 14 ++++++++++++++ pandas/tests/indexes/test_base.py | 13 ++++++++++++- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7495d7d7313fea..b7614423e11dd4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -764,7 +764,7 @@ Interval - Construction of :class:`Interval` is restricted to numeric, :class:`Timestamp` and :class:`Timedelta` endpoints (:issue:`23013`) - Fixed bug in :class:`Series`/:class:`DataFrame` not displaying ``NaN`` in :class:`IntervalIndex` with 
missing values (:issue:`25984`) -- +- Bug in :class:`Index` constructor where passing mixed closed :class:`Interval` objects would result in a ``ValueError`` instead of an ``object`` dtype ``Index`` (:issue:`27172`) Indexing ^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 548791dafea1d2..a0bd13f1e4f9e9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -421,7 +421,11 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, return Float64Index(subarr, copy=copy, name=name) elif inferred == 'interval': from .interval import IntervalIndex - return IntervalIndex(subarr, name=name, copy=copy) + try: + return IntervalIndex(subarr, name=name, copy=copy) + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass elif inferred == 'boolean': # don't support boolean explicitly ATM pass diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index eb9b573cce91d1..aabaaa0f297f9d 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -364,6 +364,16 @@ def test_index_object_dtype(self, values_constructor): assert type(result) is Index tm.assert_numpy_array_equal(result.values, np.array(values)) + def test_index_mixed_closed(self): + # GH27172 + intervals = [Interval(0, 1, closed='left'), + Interval(1, 2, closed='right'), + Interval(2, 3, closed='neither'), + Interval(3, 4, closed='both')] + result = Index(intervals) + expected = Index(intervals, dtype=object) + tm.assert_index_equal(result, expected) + class TestFromIntervals(TestClassConstructors): """ @@ -388,3 +398,7 @@ def test_deprecated(self): @pytest.mark.skip(reason='parent class test that is not applicable') def test_index_object_dtype(self): pass + + @pytest.mark.skip(reason='parent class test that is not applicable') + def test_index_mixed_closed(self): + pass diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b46e5835f4b411..a3563838e048da 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -22,7 +22,8 @@ CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, Int64Index, PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, date_range, isna, period_range) -from pandas.core.index import _get_combined_index, ensure_index_from_sequences +from pandas.core.index import ( + _get_combined_index, ensure_index, ensure_index_from_sequences) from pandas.core.indexes.api import Index, MultiIndex from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base @@ -2432,6 +2433,16 @@ def test_ensure_index_from_sequences(self, data, names, expected): result = ensure_index_from_sequences(data, names) tm.assert_index_equal(result, expected) + def test_ensure_index_mixed_closed_intervals(self): + # GH27172 + intervals = [pd.Interval(0, 1, closed='left'), + pd.Interval(1, 2, closed='right'), + pd.Interval(2, 3, closed='neither'), + pd.Interval(3, 4, closed='both')] + result = ensure_index(intervals) + expected = Index(intervals, dtype=object) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt', 'add', 'radd', 'sub', 'rsub', From 8507170c8d9b1de1f3f0b49f391d99a5cd83e43c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 2 Jul 2019 08:31:16 -0500 Subject: [PATCH 132/238] CLN: separate raising from non-raising parts of method (#27151) * separate coerce_values from 
coerce_args --- pandas/core/internals/blocks.py | 109 +++++++++++++---------- pandas/tests/internals/test_internals.py | 4 +- 2 files changed, 63 insertions(+), 50 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cad37bf2b8ae13..6cfeb62ef736b8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -208,17 +208,15 @@ def array_dtype(self): """ return self.dtype - def make_block(self, values, placement=None, ndim=None): + def make_block(self, values, placement=None): """ Create a new block, with type inference propagate any values that are not specified """ if placement is None: placement = self.mgr_locs - if ndim is None: - ndim = self.ndim - return make_block(values, placement=placement, ndim=ndim) + return make_block(values, placement=placement, ndim=self.ndim) def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): @@ -369,7 +367,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): # fillna, but if we cannot coerce, then try again as an ObjectBlock try: - values, _ = self._try_coerce_args(self.values, value) + # Note: we only call try_coerce_args to let it raise + self._try_coerce_args(value) + blocks = self.putmask(mask, value, inplace=inplace) blocks = [b.make_block(values=self._try_coerce_result(b.values)) for b in blocks] @@ -659,7 +659,21 @@ def _try_cast_result(self, result, dtype=None): # may need to change the dtype here return maybe_downcast_to_dtype(result, dtype) - def _try_coerce_args(self, values, other): + def _coerce_values(self, values): + """ + Coerce values (usually derived from self.values) for an operation. + + Parameters + ---------- + values : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + return values + + def _try_coerce_args(self, other): """ provide coercion to our input arguments """ if np.any(notna(other)) and not self._can_hold_element(other): @@ -669,7 +683,7 @@ def _try_coerce_args(self, values, other): type(other).__name__, type(self).__name__.lower().replace('Block', ''))) - return values, other + return other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -718,9 +732,9 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try to replace, if we raise an error, convert to ObjectBlock and # retry + values = self._coerce_values(self.values) try: - values, to_replace = self._try_coerce_args(self.values, - to_replace) + to_replace = self._try_coerce_args(to_replace) except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. 
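Between these two hunks, the shape of the refactor is easier to see in isolation than in the diff itself: the old `_try_coerce_args(values, other)` is split so that coercing the block's own values can never raise, while coercing the other operand remains the only part allowed to raise. A minimal standalone sketch of that split, using a hypothetical `SketchDatetimeBlock` (not the real pandas block classes; the i8 view and the bool rejection mirror what the diff does for the datetime-like blocks):

    import numpy as np


    class SketchDatetimeBlock:
        """Hypothetical stand-in for a datetime-like block, for illustration only."""

        def __init__(self, values):
            # assume values is a datetime64[ns] ndarray
            self.values = values

        def _coerce_values(self, values):
            # non-raising half: view the block's own values as int64 for the operation
            return values.view('i8')

        def _try_coerce_args(self, other):
            # raising half: only the "other" operand can be rejected here
            if isinstance(other, bool):
                raise TypeError(other)
            if isinstance(other, np.datetime64):
                return other.astype('datetime64[ns]').view('i8')
            raise TypeError(other)


    block = SketchDatetimeBlock(
        np.array(['2019-01-01', '2019-01-02'], dtype='datetime64[ns]'))
    i8_values = block._coerce_values(block.values)                   # never raises
    i8_other = block._try_coerce_args(np.datetime64('2019-01-03'))   # may raise TypeError

Callers such as `fillna` and `replace` above now invoke the raising half purely for validation of the operand and the non-raising half to obtain operable values.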
@@ -793,7 +807,8 @@ def setitem(self, indexer, value): # coerce if block dtype can store value values = self.values try: - values, value = self._try_coerce_args(values, value) + value = self._try_coerce_args(value) + values = self._coerce_values(values) # can keep its own dtype if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, value.dtype): @@ -925,7 +940,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = self.fill_value if self._can_hold_element(new): - _, new = self._try_coerce_args(new_values, new) + new = self._try_coerce_args(new) if transpose: new_values = new_values.T @@ -1127,7 +1142,8 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, return [self.copy()] values = self.values if inplace else self.values.copy() - values, fill_value = self._try_coerce_args(values, fill_value) + values = self._coerce_values(values) + fill_value = self._try_coerce_args(fill_value) values = missing.interpolate_2d(values, method=method, axis=axis, limit=limit, fill_value=fill_value, dtype=self.dtype) @@ -1298,11 +1314,12 @@ def func(cond, values, other): if cond.ravel().all(): return values - values, other = self._try_coerce_args(values, other) + values = self._coerce_values(values) + other = self._try_coerce_args(other) try: - return self._try_coerce_result(expressions.where( - cond, values, other)) + fastres = expressions.where(cond, values, other) + return self._try_coerce_result(fastres) except Exception as detail: if errors == 'raise': raise TypeError( @@ -1349,10 +1366,10 @@ def func(cond, values, other): result_blocks = [] for m in [mask, ~mask]: if m.any(): - r = self._try_cast_result(result.take(m.nonzero()[0], - axis=axis)) - result_blocks.append( - self.make_block(r.T, placement=self.mgr_locs[m])) + taken = result.take(m.nonzero()[0], axis=axis) + r = self._try_cast_result(taken) + nb = self.make_block(r.T, placement=self.mgr_locs[m]) + result_blocks.append(nb) return result_blocks @@ -1423,7 +1440,7 @@ def quantile(self, qs, interpolation='linear', axis=0): values = values[None, :] else: values = self.get_values() - values, _ = self._try_coerce_args(values, values) + values = self._coerce_values(values) is_empty = values.shape[axis] == 0 orig_scalar = not is_list_like(qs) @@ -1579,7 +1596,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # use block's copy logic. # .values may be an Index which does shallow copy by default new_values = self.values if inplace else self.copy().values - new_values, new = self._try_coerce_args(new_values, new) + new_values = self._coerce_values(new_values) + new = self._try_coerce_args(new) if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -2120,25 +2138,25 @@ def _can_hold_element(self, element): return (is_integer(element) or isinstance(element, datetime) or isna(element)) - def _try_coerce_args(self, values, other): + def _coerce_values(self, values): + return values.view('i8') + + def _try_coerce_args(self, other): """ - Coerce values and other to dtype 'i8'. NaN and NaT convert to + Coerce other to dtype 'i8'. NaN and NaT convert to the smallest i8, and will correctly round-trip to NaT if converted back in _try_coerce_result. 
values is always ndarray-like, other may not be Parameters ---------- - values : ndarray-like other : ndarray-like or scalar Returns ------- - base-type values, base-type other + base-type other """ - values = values.view('i8') - if isinstance(other, bool): raise TypeError elif is_null_datetimelike(other): @@ -2156,7 +2174,7 @@ def _try_coerce_args(self, values, other): # let higher levels handle raise TypeError(other) - return values, other + return other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2249,13 +2267,6 @@ def is_view(self): # check the ndarray values of the DatetimeIndex values return self.values._data.base is not None - def copy(self, deep=True): - """ copy constructor """ - values = self.values - if deep: - values = values.copy() - return self.make_block_same_class(values) - def get_values(self, dtype=None): """ Returns an ndarray of values. @@ -2305,21 +2316,22 @@ def _slice(self, slicer): return self.values[loc] return self.values[slicer] - def _try_coerce_args(self, values, other): + def _coerce_values(self, values): + # asi8 is a view, needs copy + return _block_shape(values.view("i8"), ndim=self.ndim) + + def _try_coerce_args(self, other): """ localize and return i8 for the values Parameters ---------- - values : ndarray-like other : ndarray-like or scalar Returns ------- - base-type values, base-type other + base-type other """ - # asi8 is a view, needs copy - values = _block_shape(values.view("i8"), ndim=self.ndim) if isinstance(other, ABCSeries): other = self._holder(other) @@ -2347,7 +2359,7 @@ def _try_coerce_args(self, values, other): else: raise TypeError(other) - return values, other + return other def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2488,21 +2500,22 @@ def fillna(self, value, **kwargs): value = Timedelta(value, unit='s') return super().fillna(value, **kwargs) - def _try_coerce_args(self, values, other): + def _coerce_values(self, values): + return values.view('i8') + + def _try_coerce_args(self, other): """ Coerce values and other to int64, with null values converted to iNaT. 
values is always ndarray-like, other may not be Parameters ---------- - values : ndarray-like other : ndarray-like or scalar Returns ------- - base-type values, base-type other + base-type other """ - values = values.view('i8') if isinstance(other, bool): raise TypeError @@ -2517,7 +2530,7 @@ def _try_coerce_args(self, values, other): # let higher levels handle raise TypeError(other) - return values, other + return other def _try_coerce_result(self, result): """ reverse of try_coerce_args / try_operate """ @@ -2688,7 +2701,7 @@ def _maybe_downcast(self, blocks, downcast=None): def _can_hold_element(self, element): return True - def _try_coerce_args(self, values, other): + def _try_coerce_args(self, other): """ provide coercion to our input arguments """ if isinstance(other, ABCDatetimeIndex): @@ -2701,7 +2714,7 @@ def _try_coerce_args(self, values, other): # when falling back to ObjectBlock.where other = other.astype(object) - return values, other + return other def should_store(self, value): return not (issubclass(value.dtype.type, diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 697c0b52805895..411146843d60fa 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -299,14 +299,14 @@ def test_try_coerce_arg(self): block = create_block('datetime', [0]) # coerce None - none_coerced = block._try_coerce_args(block.values, None)[1] + none_coerced = block._try_coerce_args(None) assert pd.Timestamp(none_coerced) is pd.NaT # coerce different types of date bojects vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10), date(2010, 10, 10)) for val in vals: - coerced = block._try_coerce_args(block.values, val)[1] + coerced = block._try_coerce_args(val) assert np.int64 == type(coerced) assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced) From c407b7308cefdc8b66e25c61f1ad21f1f70ca121 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 2 Jul 2019 09:41:43 -0600 Subject: [PATCH 133/238] API: Implement new indexing behavior for intervals (#27100) --- ci/code_checks.sh | 4 +- doc/source/user_guide/advanced.rst | 31 +- doc/source/whatsnew/v0.25.0.rst | 140 +++++++- pandas/core/arrays/interval.py | 5 - pandas/core/indexes/base.py | 20 +- pandas/core/indexes/interval.py | 324 +++++++++--------- pandas/core/indexing.py | 2 +- .../arrays/categorical/test_operators.py | 13 + pandas/tests/frame/test_missing.py | 10 + .../tests/indexes/interval/test_interval.py | 260 +++----------- .../indexes/interval/test_interval_new.py | 76 ++-- pandas/tests/indexes/test_category.py | 13 + .../tests/indexing/interval/test_interval.py | 155 +-------- .../indexing/interval/test_interval_new.py | 37 +- pandas/tests/indexing/test_indexing.py | 12 +- pandas/tests/reshape/test_concat.py | 18 +- 16 files changed, 494 insertions(+), 626 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ac86815569a0c2..00c430064e4a55 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -245,10 +245,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests interval classes' ; echo $MSG - pytest --doctest-modules -v \ + pytest -q --doctest-modules \ pandas/core/indexes/interval.py \ pandas/core/arrays/interval.py \ - -k"-from_arrays -from_breaks -from_intervals -from_tuples -get_loc -set_closed -to_tuples -interval_range" + -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" fi 
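Before the documentation and test changes that follow, the core idea of the new exact-match lookup for interval targets can be illustrated with public pandas APIs (a sketch of the strategy the `interval.py` changes below adopt, not the internal code path): resolve left and right endpoints independently and accept only positions where both agree.

    import numpy as np
    import pandas as pd

    index = pd.IntervalIndex.from_tuples([(0, 2), (2, 4), (5, 7)])
    target = pd.IntervalIndex.from_tuples([(2, 4), (3, 4)])

    # exact matches only: left and right endpoints must resolve to the
    # same position in the index
    left = index.left.get_indexer(target.left)
    right = index.right.get_indexer(target.right)
    indexer = np.where(left == right, left, -1)
    # indexer is [1, -1]: (2, 4] matches position 1, (3, 4] has no exact match

The real implementation additionally requires matching `closed` sides and compatible subtypes before comparing endpoints, and refuses plain `get_indexer` entirely when the index is overlapping.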
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 6a2620635445d7..a42ab4f0255bdc 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -938,9 +938,8 @@ for interval notation. The ``IntervalIndex`` allows some unique indexing and is also used as a return type for the categories in :func:`cut` and :func:`qcut`. -.. warning:: - - These indexing behaviors are provisional and may change in a future version of pandas. +Indexing with an ``IntervalIndex`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index. @@ -965,6 +964,32 @@ If you select a label *contained* within an interval, this will also select the df.loc[2.5] df.loc[[2.5, 3.5]] +Selecting using an ``Interval`` will only return exact matches (starting from pandas 0.25.0). + +.. ipython:: python + + df.loc[pd.Interval(1, 2)] + +Trying to select an ``Interval`` that is not exactly contained in the ``IntervalIndex`` will raise a ``KeyError``. + +.. code-block:: python + + In [7]: df.loc[pd.Interval(0.5, 2.5)] + --------------------------------------------------------------------------- + KeyError: Interval(0.5, 2.5, closed='right') + +Selecting all ``Intervals`` that overlap a given ``Interval`` can be performed using the +:meth:`~IntervalIndex.overlaps` method to create a boolean indexer. + +.. ipython:: python + + idxr = df.index.overlaps(pd.Interval(0.5, 2.5)) + idxr + df[idxr] + +Binning data with ``cut`` and ``qcut`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + :func:`cut` and :func:`qcut` both return a ``Categorical`` object, and the bins they create are stored as an ``IntervalIndex`` in its ``.categories`` attribute. diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b7614423e11dd4..e9d23cfd8efc12 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -484,6 +484,142 @@ This change is backward compatible for direct usage of Pandas, but if you subcla Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). +.. _whatsnew_0250.api_breaking.interval_indexing: + + +Indexing an ``IntervalIndex`` with ``Interval`` objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Indexing methods for :class:`IntervalIndex` have been modified to require exact matches only for :class:`Interval` queries. +``IntervalIndex`` methods previously matched on any overlapping ``Interval``. Behavior with scalar points, e.g. querying +with an integer, is unchanged (:issue:`16316`). + +.. ipython:: python + + ii = pd.IntervalIndex.from_tuples([(0, 4), (1, 5), (5, 8)]) + ii + +The ``in`` operator (``__contains__``) now only returns ``True`` for exact matches to ``Intervals`` in the ``IntervalIndex``, whereas +this would previously return ``True`` for any ``Interval`` overlapping an ``Interval`` in the ``IntervalIndex``. + +*Previous behavior*: + +.. code-block:: python + + In [4]: pd.Interval(1, 2, closed='neither') in ii + Out[4]: True + + In [5]: pd.Interval(-10, 10, closed='both') in ii + Out[5]: True + +*New behavior*: + +.. ipython:: python + + pd.Interval(1, 2, closed='neither') in ii + pd.Interval(-10, 10, closed='both') in ii + +The :meth:`~IntervalIndex.get_loc` method now only returns locations for exact matches to ``Interval`` queries, as opposed to the previous behavior of +returning locations for overlapping matches. 
A ``KeyError`` will be raised if an exact match is not found. + +*Previous behavior*: + +.. code-block:: python + + In [6]: ii.get_loc(pd.Interval(1, 5)) + Out[6]: array([0, 1]) + + In [7]: ii.get_loc(pd.Interval(2, 6)) + Out[7]: array([0, 1, 2]) + +*New behavior*: + +.. code-block:: python + + In [6]: ii.get_loc(pd.Interval(1, 5)) + Out[6]: 1 + + In [7]: ii.get_loc(pd.Interval(2, 6)) + --------------------------------------------------------------------------- + KeyError: Interval(2, 6, closed='right') + +Likewise, :meth:`~IntervalIndex.get_indexer` and :meth:`~IntervalIndex.get_indexer_non_unique` will also only return locations for exact matches +to ``Interval`` queries, with ``-1`` denoting that an exact match was not found. + +These indexing changes extend to querying a :class:`Series` or :class:`DataFrame` with an ``IntervalIndex`` index. + +.. ipython:: python + + s = pd.Series(list('abc'), index=ii) + s + +Selecting from a ``Series`` or ``DataFrame`` using ``[]`` (``__getitem__``) or ``loc`` now only returns exact matches for ``Interval`` queries. + +*Previous behavior*: + +.. code-block:: python + + In [8]: s[pd.Interval(1, 5)] + Out[8]: + (0, 4] a + (1, 5] b + dtype: object + + In [9]: s.loc[pd.Interval(1, 5)] + Out[9]: + (0, 4] a + (1, 5] b + dtype: object + +*New behavior*: + +.. ipython:: python + + s[pd.Interval(1, 5)] + s.loc[pd.Interval(1, 5)] + +Similarly, a ``KeyError`` will be raised for non-exact matches instead of returning overlapping matches. + +*Previous behavior*: + +.. code-block:: python + + In [9]: s[pd.Interval(2, 3)] + Out[9]: + (0, 4] a + (1, 5] b + dtype: object + + In [10]: s.loc[pd.Interval(2, 3)] + Out[10]: + (0, 4] a + (1, 5] b + dtype: object + +*New behavior*: + +.. code-block:: python + + In [6]: s[pd.Interval(2, 3)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 3, closed='right') + + In [7]: s.loc[pd.Interval(2, 3)] + --------------------------------------------------------------------------- + KeyError: Interval(2, 3, closed='right') + +The :meth:`~IntervalIndex.overlaps` method can be used to create a boolean indexer that replicates the +previous behavior of returning overlapping matches. + +*New behavior*: + +.. ipython:: python + + idxr = s.index.overlaps(pd.Interval(2, 3)) + idxr + s[idxr] + s.loc[idxr] + .. 
_whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies @@ -686,7 +822,7 @@ Categorical - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) - Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) -- +- Bug in :meth:`DataFrame.dropna` when the :class:`DataFrame` has a :class:`CategoricalIndex` containing :class:`Interval` objects incorrectly raised a ``TypeError`` (:issue:`25087`) Datetimelike ^^^^^^^^^^^^ @@ -764,6 +900,7 @@ Interval - Construction of :class:`Interval` is restricted to numeric, :class:`Timestamp` and :class:`Timedelta` endpoints (:issue:`23013`) - Fixed bug in :class:`Series`/:class:`DataFrame` not displaying ``NaN`` in :class:`IntervalIndex` with missing values (:issue:`25984`) +- Bug in :meth:`IntervalIndex.get_loc` where a ``KeyError`` would be incorrectly raised for a decreasing :class:`IntervalIndex` (:issue:`25860`) - Bug in :class:`Index` constructor where passing mixed closed :class:`Interval` objects would result in a ``ValueError`` instead of an ``object`` dtype ``Index`` (:issue:`27172`) Indexing @@ -778,6 +915,7 @@ Indexing - Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) +- Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`) - diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 8ed28065ee7aae..aa56d99d298f47 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -41,11 +41,6 @@ .. versionadded:: %(versionadded)s -.. warning:: - - The indexing behaviors are provisional and may change in - a future version of pandas. - Parameters ---------- data : array-like (1-dimensional) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a0bd13f1e4f9e9..6e0d26750df00d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3235,8 +3235,9 @@ def reindex(self, target, method=None, level=None, limit=None, if self.equals(target): indexer = None else: - - if self.is_unique: + # check is_overlapping for IntervalIndex compat + if (self.is_unique and + not getattr(self, 'is_overlapping', False)): indexer = self.get_indexer(target, method=method, limit=limit, tolerance=tolerance) @@ -4481,8 +4482,7 @@ def argsort(self, *args, **kwargs): result = np.array(self) return result.argsort(*args, **kwargs) - def get_value(self, series, key): - """ + _index_shared_docs['get_value'] = """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. @@ -4492,6 +4492,9 @@ def get_value(self, series, key): A value in the Series with the index of the key value in self. 
""" + @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + def get_value(self, series, key): + # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. @@ -4915,13 +4918,6 @@ def _searchsorted_monotonic(self, label, side='left'): raise ValueError('index must be monotonic increasing or decreasing') - def _get_loc_only_exact_matches(self, key): - """ - This is overridden on subclasses (namely, IntervalIndex) to control - get_slice_bound. - """ - return self.get_loc(key) - def get_slice_bound(self, label, side, kind): """ Calculate slice bound that corresponds to given label. @@ -4955,7 +4951,7 @@ def get_slice_bound(self, label, side, kind): # we need to look up the label try: - slc = self._get_loc_only_exact_matches(label) + slc = self.get_loc(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9f9ebcf67cee68..83bc5963f4f9ef 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,12 +1,14 @@ """ define the IntervalIndex """ +from operator import le, lt import textwrap +from typing import Any, Optional, Tuple, Union import warnings import numpy as np from pandas._config import get_option -from pandas._libs import Timedelta, Timestamp +from pandas._libs import Timedelta, Timestamp, lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -17,13 +19,15 @@ ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna +from pandas._typing import AnyArrayLike from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, _index_shared_docs, default_pprint, ensure_index) + Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range @@ -601,6 +605,23 @@ def _maybe_cast_indexed(self, key): return key + def _can_reindex(self, indexer: np.ndarray) -> None: + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if self.is_overlapping and len(indexer): + raise ValueError("cannot reindex from an overlapping axis") + def _needs_i8_conversion(self, key): """ Check if a given key needs i8 conversion. 
Conversion is necessary for @@ -694,7 +715,8 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): 'increasing or decreasing') if isinstance(label, IntervalMixin): - raise NotImplementedError + msg = 'Interval objects are not currently supported' + raise NotImplementedError(msg) # GH 20921: "not is_monotonic_increasing" for the second condition # instead of "is_monotonic_decreasing" to account for single element @@ -711,18 +733,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) - def _get_loc_only_exact_matches(self, key): - if isinstance(key, Interval): - - if not self.is_unique: - raise ValueError("cannot index with a slice Interval" - " and a non-unique index") - - # TODO: this expands to a tuple index, see if we can - # do better - return Index(self._multiindex.values).get_loc(key) - raise KeyError - def _find_non_overlapping_monotonic_bounds(self, key): if isinstance(key, IntervalMixin): start = self._searchsorted_monotonic( @@ -749,7 +759,10 @@ def _find_non_overlapping_monotonic_bounds(self, key): stop = self._searchsorted_monotonic(key, 'right') return start, stop - def get_loc(self, key, method=None): + def get_loc(self, + key: Any, + method: Optional[str] = None + ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -770,11 +783,8 @@ def get_loc(self, key, method=None): >>> index.get_loc(1) 0 - You can also supply an interval or an location for a point inside an - interval. + You can also supply a point inside an interval. - >>> index.get_loc(pd.Interval(0, 2)) - array([0, 1], dtype=int64) >>> index.get_loc(1.5) 1 @@ -782,63 +792,42 @@ def get_loc(self, key, method=None): relevant intervals. >>> i3 = pd.Interval(0, 2) - >>> overlapping_index = pd.IntervalIndex([i2, i3]) - >>> overlapping_index.get_loc(1.5) - array([0, 1], dtype=int64) - """ - self._check_method(method) - - original_key = key - key = self._maybe_cast_indexed(key) + >>> overlapping_index = pd.IntervalIndex([i1, i2, i3]) + >>> overlapping_index.get_loc(0.5) + array([ True, False, True]) - if self.is_non_overlapping_monotonic: - if isinstance(key, Interval): - left = self._maybe_cast_slice_bound(key.left, 'left', None) - right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed) - else: - key = self._maybe_cast_slice_bound(key, 'left', None) + Only exact matches will be returned if an interval is provided. 
- start, stop = self._find_non_overlapping_monotonic_bounds(key) + >>> index.get_loc(pd.Interval(0, 1)) + 0 + """ + self._check_method(method) - if start is None or stop is None: - return slice(start, stop) - elif start + 1 == stop: - return start - elif start < stop: - return slice(start, stop) - else: - raise KeyError(original_key) + # list-like are invalid labels for II but in some cases may work, e.g + # single element array of comparable type, so guard against them early + if is_list_like(key): + raise KeyError(key) + if isinstance(key, Interval): + if self.closed != key.closed: + raise KeyError(key) + mask = (self.left == key.left) & (self.right == key.right) else: - # use the interval tree - key = self._maybe_convert_i8(key) - if isinstance(key, Interval): - left, right = _get_interval_closed_bounds(key) - return self._engine.get_loc_interval(left, right) - else: - return self._engine.get_loc(key) - - def get_value(self, series, key): - if com.is_bool_indexer(key): - loc = key - elif is_list_like(key): - loc = self.get_indexer(key) - elif isinstance(key, slice): - - if not (key.step is None or key.step == 1): - raise ValueError("cannot support not-default step in a slice") - + # assume scalar + op_left = le if self.closed_left else lt + op_right = le if self.closed_right else lt try: - loc = self.get_loc(key) + mask = op_left(self.left, key) & op_right(key, self.right) except TypeError: - # we didn't find exact intervals or are non-unique - msg = "unable to slice with this key: {key}".format(key=key) - raise ValueError(msg) + # scalar is not comparable to II subtype --> invalid label + raise KeyError(key) - else: - loc = self.get_loc(key) - return series.iloc[loc] + matches = mask.sum() + if matches == 0: + raise KeyError(key) + elif matches == 1: + return mask.argmax() + return lib.maybe_booleans_to_slice(mask.view('u1')) @Substitution(**dict(_index_doc_kwargs, **{'raises_section': textwrap.dedent(""" @@ -849,112 +838,133 @@ def get_value(self, series, key): None is specified as these are not yet implemented. 
""")})) @Appender(_index_shared_docs['get_indexer']) - def get_indexer(self, target, method=None, limit=None, tolerance=None): + def get_indexer(self, + target: AnyArrayLike, + method: Optional[str] = None, + limit: Optional[int] = None, + tolerance: Optional[Any] = None + ) -> np.ndarray: self._check_method(method) - target = ensure_index(target) - target = self._maybe_cast_indexed(target) - - if self.equals(target): - return np.arange(len(self), dtype='intp') - - if self.is_non_overlapping_monotonic: - start, stop = self._find_non_overlapping_monotonic_bounds(target) - start_plus_one = start + 1 - if not ((start_plus_one < stop).any()): - return np.where(start_plus_one == stop, start, -1) + if self.is_overlapping: + msg = ('cannot handle overlapping indices; use ' + 'IntervalIndex.get_indexer_non_unique') + raise InvalidIndexError(msg) - if not self.is_unique: - raise ValueError("cannot handle non-unique indices") + target = ensure_index(target) - # IntervalIndex if isinstance(target, IntervalIndex): - indexer = self._get_reindexer(target) - - # non IntervalIndex + # equal indexes -> 1:1 positional match + if self.equals(target): + return np.arange(len(self), dtype='intp') + + # different closed or incompatible subtype -> no matches + common_subtype = find_common_type([ + self.dtype.subtype, target.dtype.subtype]) + if self.closed != target.closed or is_object_dtype(common_subtype): + return np.repeat(np.intp(-1), len(target)) + + # non-overlapping -> at most one match per interval in target + # want exact matches -> need both left/right to match, so defer to + # left/right get_indexer, compare elementwise, equality -> match + left_indexer = self.left.get_indexer(target.left) + right_indexer = self.right.get_indexer(target.right) + indexer = np.where(left_indexer == right_indexer, left_indexer, -1) + elif not is_object_dtype(target): + # homogeneous scalar index: use IntervalTree + target = self._maybe_convert_i8(target) + indexer = self._engine.get_indexer(target.values) else: - indexer = np.concatenate([self.get_loc(i) for i in target]) + # heterogeneous scalar index: defer elementwise to get_loc + # (non-overlapping so get_loc guarantees scalar of KeyError) + indexer = [] + for key in target: + try: + loc = self.get_loc(key) + except KeyError: + loc = -1 + indexer.append(loc) return ensure_platform_int(indexer) - def _get_reindexer(self, target): - """ - Return an indexer for a target IntervalIndex with self - """ - - # find the left and right indexers - left = self._maybe_convert_i8(target.left) - right = self._maybe_convert_i8(target.right) - lindexer = self._engine.get_indexer(left.values) - rindexer = self._engine.get_indexer(right.values) - - # we want to return an indexer on the intervals - # however, our keys could provide overlapping of multiple - # intervals, so we iterate thru the indexers and construct - # a set of indexers - - indexer = [] - n = len(self) - - for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)): - - target_value = target[i] - - # matching on the lhs bound - if (lhs != -1 and - self.closed == 'right' and - target_value.left == self[lhs].right): - lhs += 1 - - # matching on the lhs bound - if (rhs != -1 and - self.closed == 'left' and - target_value.right == self[rhs].left): - rhs -= 1 - - # not found - if lhs == -1 and rhs == -1: - indexer.append(np.array([-1])) - - elif rhs == -1: - - indexer.append(np.arange(lhs, n)) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, + target: AnyArrayLike + 
) -> Tuple[np.ndarray, np.ndarray]: + target = ensure_index(target) - elif lhs == -1: + # check that target IntervalIndex is compatible + if isinstance(target, IntervalIndex): + common_subtype = find_common_type([ + self.dtype.subtype, target.dtype.subtype]) + if self.closed != target.closed or is_object_dtype(common_subtype): + # different closed or incompatible subtype -> no matches + return np.repeat(-1, len(target)), np.arange(len(target)) + + if is_object_dtype(target) or isinstance(target, IntervalIndex): + # target might contain intervals: defer elementwise to get_loc + indexer, missing = [], [] + for i, key in enumerate(target): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + locs = np.arange( + locs.start, locs.stop, locs.step, dtype='intp') + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + indexer.append(locs) + indexer = np.concatenate(indexer) + else: + target = self._maybe_convert_i8(target) + indexer, missing = self._engine.get_indexer_non_unique( + target.values) - # care about left/right closed here - value = self[i] + return ensure_platform_int(indexer), ensure_platform_int(missing) - # target.closed same as self.closed - if self.closed == target.closed: - if target_value.left < value.left: - indexer.append(np.array([-1])) - continue + def get_indexer_for(self, + target: AnyArrayLike, + **kwargs + ) -> np.ndarray: + """ + Guaranteed return of an indexer even when overlapping. - # target.closed == 'left' - elif self.closed == 'right': - if target_value.left <= value.left: - indexer.append(np.array([-1])) - continue + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. - # target.closed == 'right' - elif self.closed == 'left': - if target_value.left <= value.left: - indexer.append(np.array([-1])) - continue + Returns + ------- + numpy.ndarray + List of indices. 
+ """ + if self.is_overlapping: + return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer(target, **kwargs) - indexer.append(np.arange(0, rhs + 1)) + @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + def get_value(self, + series: ABCSeries, + key: Any + ) -> Any: + if com.is_bool_indexer(key): + loc = key + elif is_list_like(key): + if self.is_overlapping: + loc, missing = self.get_indexer_non_unique(key) + if len(missing): + raise KeyError else: - indexer.append(np.arange(lhs, rhs + 1)) - - return np.concatenate(indexer) - - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = self._maybe_cast_indexed(ensure_index(target)) - return super().get_indexer_non_unique(target) + loc = self.get_indexer(key) + elif isinstance(key, slice): + if not (key.step is None or key.step == 1): + raise ValueError("cannot support not-default step in a slice") + loc = self._convert_slice_indexer(key, kind='getitem') + else: + loc = self.get_loc(key) + return series.iloc[loc] @Appender(_index_shared_docs['where']) def where(self, cond, other=None): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7e199c6c9f66ba..677aefa15d200d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1074,7 +1074,7 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): raise_missing=raise_missing) return ax[indexer], indexer - if ax.is_unique: + if ax.is_unique and not getattr(ax, 'is_overlapping', False): # If we are trying to get actual keys from empty Series, we # patiently wait for a KeyError later on - otherwise, convert if len(ax) or not len(key): diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index a443408bf9479e..af1d3ca0f9ad47 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -392,3 +392,16 @@ def test_contains(self): c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) assert np.nan in c + + @pytest.mark.parametrize('item, expected', [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ('a', False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False)], ids=str) + def test_contains_interval(self, item, expected): + # GH 23705 + cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) + result = item in cat + assert result is expected diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index c72951ac4cdfa6..807931567847f7 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -201,6 +201,16 @@ def test_dropna_tz_aware_datetime(self): index=[0, 3]) assert_frame_equal(result, expected) + def test_dropna_categorical_interval_index(self): + # GH 25087 + ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) + ci = pd.CategoricalIndex(ii) + df = pd.DataFrame({'A': list('abc')}, index=ci) + + expected = df + result = df.dropna() + tm.assert_frame_equal(result, expected) + def test_fillna_datetime(self, datetime_frame): tf = datetime_frame tf.loc[tf.index[:5], 'A'] = np.nan diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index a5e9f5902f565c..4004d4b666a158 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -393,7 +393,6 @@ def test_repr_missing(self, constructor, expected): result = 
repr(obj) assert result == expected - # TODO: check this behavior is consistent with test_interval_new.py def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) @@ -414,120 +413,31 @@ def test_get_item(self, closed): closed=closed) tm.assert_index_equal(result, expected) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_loc_value(self): - with pytest.raises(KeyError, match="^0$"): - self.index.get_loc(0) - assert self.index.get_loc(0.5) == 0 - assert self.index.get_loc(1) == 0 - assert self.index.get_loc(1.5) == 1 - assert self.index.get_loc(2) == 1 - with pytest.raises(KeyError, match="^-1$"): - self.index.get_loc(-1) - with pytest.raises(KeyError, match="^3$"): - self.index.get_loc(3) - - idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) - assert idx.get_loc(0.5) == 0 - assert idx.get_loc(1) == 0 - tm.assert_numpy_array_equal(idx.get_loc(1.5), - np.array([0, 1], dtype='intp')) - tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)), - np.array([0, 1], dtype='intp')) - assert idx.get_loc(3) == 1 - with pytest.raises(KeyError, match=r"^3\.5$"): - idx.get_loc(3.5) - - idx = IntervalIndex.from_arrays([0, 2], [1, 3]) - with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def slice_locs_cases(self, breaks): - # TODO: same tests for more index types - index = IntervalIndex.from_breaks([0, 1, 2], closed='right') - assert index.slice_locs() == (0, 2) - assert index.slice_locs(0, 1) == (0, 1) - assert index.slice_locs(1, 1) == (0, 1) - assert index.slice_locs(0, 2) == (0, 2) - assert index.slice_locs(0.5, 1.5) == (0, 2) - assert index.slice_locs(0, 0.5) == (0, 1) - assert index.slice_locs(start=1) == (0, 2) - assert index.slice_locs(start=1.2) == (1, 2) - assert index.slice_locs(end=1) == (0, 1) - assert index.slice_locs(end=1.1) == (0, 2) - assert index.slice_locs(end=1.0) == (0, 1) - assert index.slice_locs(-1, -1) == (0, 0) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - assert index.slice_locs(0, 1) == (0, 1) - assert index.slice_locs(0, 2) == (0, 2) - assert index.slice_locs(0.5, 1.5) == (0, 2) - assert index.slice_locs(1, 1) == (1, 1) - assert index.slice_locs(1, 2) == (1, 2) - - index = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], - closed='both') - assert index.slice_locs(1, 1) == (0, 1) - assert index.slice_locs(1, 2) == (0, 2) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_int64(self): - self.slice_locs_cases([0, 1, 2]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_float64(self): - self.slice_locs_cases([0.0, 1.0, 2.0]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def slice_locs_decreasing_cases(self, tuples): - index = IntervalIndex.from_tuples(tuples) - assert index.slice_locs(1.5, 0.5) == (1, 3) - assert index.slice_locs(2, 0) == (1, 3) - assert index.slice_locs(2, 1) == (1, 3) - assert index.slice_locs(3, 1.1) == (0, 3) - assert index.slice_locs(3, 3) == (0, 2) - assert index.slice_locs(3.5, 3.3) == (0, 1) - assert index.slice_locs(1, -3) == (2, 3) - - slice_locs = index.slice_locs(-1, -1) - assert slice_locs[0] == slice_locs[1] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_decreasing_int64(self): - self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) - - # To be removed, replaced by 
test_interval_new.py (see #16316, #16386) - def test_slice_locs_decreasing_float64(self): - self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_slice_locs_fails(self): - index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) - msg = ("'can only get slices from an IntervalIndex if bounds are" - " non-overlapping and all monotonic increasing or decreasing'") - with pytest.raises(KeyError, match=msg): - index.slice_locs(1, 2) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_loc_interval(self): - assert self.index.get_loc(Interval(0, 1)) == 0 - assert self.index.get_loc(Interval(0, 0.5)) == 0 - assert self.index.get_loc(Interval(0, 1, 'left')) == 0 - msg = r"Interval\(2, 3, closed='right'\)" - with pytest.raises(KeyError, match=msg): - self.index.get_loc(Interval(2, 3)) - msg = r"Interval\(-1, 0, closed='left'\)" - with pytest.raises(KeyError, match=msg): - self.index.get_loc(Interval(-1, 0, 'left')) + @pytest.mark.parametrize('scalar', [-1, 0, 0.5, 3, 4.5, 5, 6]) + def test_get_loc_length_one_scalar(self, scalar, closed): + # GH 20921 + index = IntervalIndex.from_tuples([(0, 5)], closed=closed) + if scalar in index[0]: + result = index.get_loc(scalar) + assert result == 0 + else: + with pytest.raises(KeyError): + index.get_loc(scalar) - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('item', [3, Interval(1, 4)]) - def test_get_loc_length_one(self, item, closed): + @pytest.mark.parametrize('other_closed', [ + 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize('left, right', [(0, 5), (-1, 4), (-1, 6), (6, 7)]) + def test_get_loc_length_one_interval( + self, left, right, closed, other_closed): # GH 20921 index = IntervalIndex.from_tuples([(0, 5)], closed=closed) - result = index.get_loc(item) - assert result == 0 + interval = Interval(left, right, closed=other_closed) + if interval == index[0]: + result = index.get_loc(interval) + assert result == 0 + else: + with pytest.raises(KeyError): + index.get_loc(interval) # Make consistent with test_interval_new.py (see #16316, #16386) @pytest.mark.parametrize('breaks', [ @@ -544,12 +454,11 @@ def test_get_loc_datetimelike_nonoverlapping(self, breaks): expected = 0 assert result == expected - interval = Interval(index[0].left, index[1].right) + interval = Interval(index[0].left, index[0].right) result = index.get_loc(interval) - expected = slice(0, 2) + expected = 0 assert result == expected - # Make consistent with test_interval_new.py (see #16316, #16386) @pytest.mark.parametrize('arrays', [ (date_range('20180101', periods=4), date_range('20180103', periods=4)), (date_range('20180101', periods=4, tz='US/Eastern'), @@ -558,69 +467,32 @@ def test_get_loc_datetimelike_nonoverlapping(self, breaks): timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype)) def test_get_loc_datetimelike_overlapping(self, arrays): # GH 20636 - # overlapping = IntervalTree method with i8 conversion index = IntervalIndex.from_arrays(*arrays) value = index[0].mid + Timedelta('12 hours') - result = np.sort(index.get_loc(value)) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(result, expected) - - interval = Interval(index[0].left, index[1].right) - result = np.sort(index.get_loc(interval)) - expected = np.array([0, 1, 2], dtype='intp') - tm.assert_numpy_array_equal(result, expected) - - # To be removed, replaced by test_interval_new.py (see #16316, 
#16386) - def test_get_indexer(self): - actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(self.index) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='left') - actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index[:1]) - expected = np.array([0], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index) - expected = np.array([-1, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_get_indexer_subintervals(self): - - # TODO: is this right? - # return indexers for wholly contained subintervals - target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 1, 1], dtype='p') - tm.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 1, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) + result = index.get_loc(value) + expected = slice(0, 2, None) + assert result == expected - actual = self.index.get_indexer(target[[0, -1]]) - expected = np.array([0, 1], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) + interval = Interval(index[0].left, index[0].right) + result = index.get_loc(interval) + expected = 0 + assert result == expected - target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') - actual = self.index.get_indexer(target) - expected = np.array([0, 0, 0], dtype='intp') - tm.assert_numpy_array_equal(actual, expected) + @pytest.mark.parametrize('values', [ + date_range('2018-01-04', periods=4, freq='-1D'), + date_range('2018-01-04', periods=4, freq='-1D', tz='US/Eastern'), + timedelta_range('3 days', periods=4, freq='-1D'), + np.arange(3.0, -1.0, -1.0), + np.arange(3, -1, -1)], ids=lambda x: str(x.dtype)) + def test_get_loc_decreasing(self, values): + # GH 25860 + index = IntervalIndex.from_arrays(values[1:], values[:-1]) + result = index.get_loc(index[0]) + expected = 0 + assert result == expected - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('item', [ - [3], np.arange(1, 5), [Interval(1, 4)], interval_range(1, 4)]) + @pytest.mark.parametrize('item', [[3], np.arange(0.5, 5, 0.5)]) def test_get_indexer_length_one(self, item, closed): # GH 17284 index = IntervalIndex.from_tuples([(0, 5)], closed=closed) @@ -628,22 +500,12 @@ def test_get_indexer_length_one(self, item, closed): expected = np.array([0] * len(item), dtype='intp') tm.assert_numpy_array_equal(result, expected) - # Make consistent with test_interval_new.py (see #16316, #16386) - @pytest.mark.parametrize('arrays', [ - (date_range('20180101', periods=4), date_range('20180103', periods=4)), - (date_range('20180101', periods=4, tz='US/Eastern'), - date_range('20180103', periods=4, tz='US/Eastern')), - (timedelta_range('0 days', periods=4), - timedelta_range('2 days', periods=4))], ids=lambda x: str(x[0].dtype)) - def test_get_reindexer_datetimelike(self, arrays): - # GH 20636 - index = 
IntervalIndex.from_arrays(*arrays) - tuples = [(index[0].left, index[0].left + pd.Timedelta('12H')), - (index[-1].right - pd.Timedelta('12H'), index[-1].right)] - target = IntervalIndex.from_tuples(tuples) - - result = index._get_reindexer(target) - expected = np.array([0, 3], dtype='intp') + @pytest.mark.parametrize('size', [1, 5]) + def test_get_indexer_length_one_interval(self, size, closed): + # GH 17284 + index = IntervalIndex.from_tuples([(0, 5)], closed=closed) + result = index.get_indexer([Interval(0, 5, closed)] * size) + expected = np.array([0] * size, dtype='intp') tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize('breaks', [ @@ -736,23 +598,6 @@ def test_maybe_convert_i8_errors(self, breaks1, breaks2, make_key): with pytest.raises(ValueError, match=msg): index._maybe_convert_i8(key) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_contains(self): - # Only endpoints are valid. - i = IntervalIndex.from_arrays([0, 1], [1, 2]) - - # Invalid - assert 0 not in i - assert 1 not in i - assert 2 not in i - - # Valid - assert Interval(0, 1) in i - assert Interval(0, 2) in i - assert Interval(0, 0.5) in i - assert Interval(3, 5) not in i - assert Interval(-1, 0, closed='left') not in i - def test_contains_method(self): # can select values that are IN the range of a value i = IntervalIndex.from_arrays([0, 1], [1, 2]) @@ -790,7 +635,6 @@ def test_dropna(self, closed): result = ii.dropna() tm.assert_index_equal(result, expected) - # TODO: check this behavior is consistent with test_interval_new.py def test_non_contiguous(self, closed): index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) target = [0.5, 1.5, 2.5] @@ -936,8 +780,8 @@ def test_datetime(self, tz): assert Timestamp('2000-01-01', tz=tz) not in index assert Timestamp('2000-01-01T12', tz=tz) not in index assert Timestamp('2000-01-02', tz=tz) not in index - iv_true = Interval(Timestamp('2000-01-01T08', tz=tz), - Timestamp('2000-01-01T18', tz=tz)) + iv_true = Interval(Timestamp('2000-01-02', tz=tz), + Timestamp('2000-01-03', tz=tz)) iv_false = Interval(Timestamp('1999-12-31', tz=tz), Timestamp('2000-01-01', tz=tz)) assert iv_true in index diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index 5599009dbc898b..cef230e98a6eec 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -1,11 +1,10 @@ import numpy as np import pytest -from pandas import Int64Index, Interval, IntervalIndex +from pandas import Interval, IntervalIndex +from pandas.core.indexes.base import InvalidIndexError import pandas.util.testing as tm -pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316") - class TestIntervalIndex: @@ -127,41 +126,46 @@ def test_slice_locs_with_ints_and_floats_succeeds(self): # decreasing non-overlapping index = IntervalIndex.from_tuples([(3, 4), (1, 2), (0, 1)]) - assert index.slice_locs(0, 1) == (3, 2) - assert index.slice_locs(0, 2) == (3, 1) + assert index.slice_locs(0, 1) == (3, 3) + assert index.slice_locs(0, 2) == (3, 2) assert index.slice_locs(0, 3) == (3, 1) - assert index.slice_locs(3, 1) == (1, 2) - assert index.slice_locs(3, 4) == (1, 0) - assert index.slice_locs(0, 4) == (3, 0) + assert index.slice_locs(3, 1) == (1, 3) + assert index.slice_locs(3, 4) == (1, 1) + assert index.slice_locs(0, 4) == (3, 1) @pytest.mark.parametrize("query", [ - [0, 1], [0, 2], [0, 3], [3, 1], [3, 4], [0, 4]]) + [0, 1], [0, 2], 
[0, 3], [0, 4]]) @pytest.mark.parametrize("tuples", [ - [(0, 2), (1, 3), (2, 4)], [(2, 4), (1, 3), (0, 2)], - [(0, 2), (0, 2), (2, 4)], [(0, 2), (2, 4), (0, 2)], + [(0, 2), (1, 3), (2, 4)], + [(2, 4), (1, 3), (0, 2)], + [(0, 2), (0, 2), (2, 4)], + [(0, 2), (2, 4), (0, 2)], [(0, 2), (0, 2), (2, 4), (1, 3)]]) def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): + start, stop = query index = IntervalIndex.from_tuples(tuples) with pytest.raises(KeyError): - index.slice_locs(query) + index.slice_locs(start, stop) @pytest.mark.parametrize('query, expected', [ - ([Interval(1, 3, closed='right')], [1]), - ([Interval(1, 3, closed='left')], [-1]), - ([Interval(1, 3, closed='both')], [-1]), - ([Interval(1, 3, closed='neither')], [-1]), + ([Interval(2, 4, closed='right')], [1]), + ([Interval(2, 4, closed='left')], [-1]), + ([Interval(2, 4, closed='both')], [-1]), + ([Interval(2, 4, closed='neither')], [-1]), ([Interval(1, 4, closed='right')], [-1]), ([Interval(0, 4, closed='right')], [-1]), - ([Interval(1, 2, closed='right')], [-1]), - ([Interval(2, 4, closed='right'), Interval(1, 3, closed='right')], - [2, 1]), - ([Interval(1, 3, closed='right'), Interval(0, 2, closed='right')], + ([Interval(0.5, 1.5, closed='right')], [-1]), + ([Interval(2, 4, closed='right'), Interval(0, 1, closed='right')], [1, -1]), - ([Interval(1, 3, closed='right'), Interval(1, 3, closed='left')], + ([Interval(2, 4, closed='right'), Interval(2, 4, closed='right')], + [1, 1]), + ([Interval(5, 7, closed='right'), Interval(2, 4, closed='right')], + [2, 1]), + ([Interval(2, 4, closed='right'), Interval(2, 4, closed='left')], [1, -1])]) def test_get_indexer_with_interval(self, query, expected): - tuples = [(0, 2.5), (1, 3), (2, 4)] + tuples = [(0, 2), (2, 4), (5, 7)] index = IntervalIndex.from_tuples(tuples, closed='right') result = index.get_indexer(query) @@ -204,7 +208,7 @@ def test_get_indexer_errors(self, tuples, closed): msg = ('cannot handle overlapping indices; use ' 'IntervalIndex.get_indexer_non_unique') - with pytest.raises(ValueError, match=msg): + with pytest.raises(InvalidIndexError, match=msg): index.get_indexer([0, 2]) @pytest.mark.parametrize('query, expected', [ @@ -229,16 +233,16 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): index = IntervalIndex.from_tuples(tuples, closed='left') result_indexer, result_missing = index.get_indexer_non_unique(query) - expected_indexer = Int64Index(expected[0]) + expected_indexer = np.array(expected[0], dtype='intp') expected_missing = np.array(expected[1], dtype='intp') - tm.assert_index_equal(result_indexer, expected_indexer) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) tm.assert_numpy_array_equal(result_missing, expected_missing) # TODO we may also want to test get_indexer for the case when # the intervals are duplicated, decreasing, non-monotonic, etc.. 
- def test_contains(self): + def test_contains_dunder(self): index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right') @@ -254,23 +258,3 @@ def test_contains(self): assert Interval(-1, 0, closed='left') not in index assert Interval(0, 1, closed='left') not in index assert Interval(0, 1, closed='both') not in index - - def test_contains_method(self): - - index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right') - - assert not index.contains(0) - assert index.contains(0.1) - assert index.contains(0.5) - assert index.contains(1) - - assert index.contains(Interval(0, 1, closed='right')) - assert not index.contains(Interval(0, 1, closed='left')) - assert not index.contains(Interval(0, 1, closed='both')) - assert not index.contains(Interval(0, 2, closed='right')) - - assert not index.contains(Interval(0, 3, closed='right')) - assert not index.contains(Interval(1, 3, closed='right')) - - assert not index.contains(20) - assert not index.contains(-20) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d89d282fb785b8..26a4463d421a43 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -250,6 +250,19 @@ def test_contains(self): list('aabbca') + [np.nan], categories=list('cabdef')) assert np.nan in ci + @pytest.mark.parametrize('item, expected', [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ('a', False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False)], ids=str) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + def test_map(self): ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), ordered=True) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index d201b9644378fa..76f0b94ea39048 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Interval, IntervalIndex, Series +from pandas import DataFrame, IntervalIndex, Series import pandas.util.testing as tm @@ -11,26 +11,6 @@ class TestIntervalIndex: def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_loc_with_scalar(self): - - s = self.s - - expected = s.iloc[:3] - tm.assert_series_equal(expected, s.loc[:3]) - tm.assert_series_equal(expected, s.loc[:2.5]) - tm.assert_series_equal(expected, s.loc[0.1:2.5]) - tm.assert_series_equal(expected, s.loc[-1:3]) - - expected = s.iloc[1:4] - tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) - tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) - tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) - - expected = s.iloc[2:5] - tm.assert_series_equal(expected, s.loc[s >= 2]) - - # TODO: check this behavior is consistent with test_interval_new.py def test_getitem_with_scalar(self): s = self.s @@ -39,7 +19,6 @@ def test_getitem_with_scalar(self): tm.assert_series_equal(expected, s[:3]) tm.assert_series_equal(expected, s[:2.5]) tm.assert_series_equal(expected, s[0.1:2.5]) - tm.assert_series_equal(expected, s[-1:3]) expected = s.iloc[1:4] tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]]) @@ -49,7 +28,6 @@ def test_getitem_with_scalar(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s 
>= 2]) - # TODO: check this behavior is consistent with test_interval_new.py @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] @@ -83,137 +61,6 @@ def test_nonoverlapping_monotonic(self, direction, closed): assert s[key] == expected assert s.loc[key] == expected - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_interval(self): - - s = self.s - expected = 0 - - result = s.loc[Interval(0, 1)] - assert result == expected - - result = s[Interval(0, 1)] - assert result == expected - - expected = s.iloc[3:5] - result = s.loc[Interval(3, 6)] - tm.assert_series_equal(expected, result) - - expected = s.iloc[3:5] - result = s.loc[[Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - expected = s.iloc[3:5] - result = s.loc[[Interval(3, 5)]] - tm.assert_series_equal(expected, result) - - # missing - with pytest.raises(KeyError): - s.loc[Interval(-2, 0)] - - with pytest.raises(KeyError): - s[Interval(-2, 0)] - - with pytest.raises(KeyError): - s.loc[Interval(5, 6)] - - with pytest.raises(KeyError): - s[Interval(5, 6)] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_slices(self): - - s = self.s - - # slice of interval - with pytest.raises(NotImplementedError): - s.loc[Interval(3, 6):] - - with pytest.raises(NotImplementedError): - s[Interval(3, 6):] - - expected = s.iloc[3:5] - result = s[[Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - # slice of scalar with step != 1 - with pytest.raises(ValueError): - s[0:4:2] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_with_overlaps(self): - - s = self.s - expected = s.iloc[[3, 4, 3, 4]] - result = s.loc[[Interval(3, 6), Interval(3, 6)]] - tm.assert_series_equal(expected, result) - - idx = IntervalIndex.from_tuples([(1, 5), (3, 7)]) - s = Series(range(len(idx)), index=idx) - - result = s[4] - expected = s - tm.assert_series_equal(expected, result) - - result = s[[4]] - expected = s - tm.assert_series_equal(expected, result) - - result = s.loc[[4]] - expected = s - tm.assert_series_equal(expected, result) - - result = s[Interval(3, 5)] - expected = s - tm.assert_series_equal(expected, result) - - result = s.loc[Interval(3, 5)] - expected = s - tm.assert_series_equal(expected, result) - - # doesn't intersect unique set of intervals - with pytest.raises(KeyError): - s[[Interval(3, 5)]] - - with pytest.raises(KeyError): - s.loc[[Interval(3, 5)]] - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_non_unique(self): - - idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) - - s = Series(range(len(idx)), index=idx) - - result = s.loc[Interval(1, 3)] - assert result == 0 - - result = s.loc[[Interval(1, 3)]] - expected = s.iloc[0:1] - tm.assert_series_equal(expected, result) - - # To be removed, replaced by test_interval_new.py (see #16316, #16386) - def test_non_unique_moar(self): - - idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)]) - s = Series(range(len(idx)), index=idx) - - result = s.loc[Interval(1, 3)] - expected = s.iloc[[0, 1]] - tm.assert_series_equal(expected, result) - - # non-unique index and slices not allowed - with pytest.raises(ValueError): - s.loc[Interval(1, 3):] - - with pytest.raises(ValueError): - s[Interval(1, 3):] - - # non-unique - with pytest.raises(ValueError): - s[[Interval(1, 3)]] - - # TODO: check this behavior is consistent with test_interval_new.py 
def test_non_matching(self): s = self.s diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index a6c42dd0ec632c..aa016ac5dd1a74 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -4,8 +4,6 @@ from pandas import Interval, IntervalIndex, Series import pandas.util.testing as tm -pytestmark = pytest.mark.skip(reason="new indexing tests for issue 16316") - class TestIntervalIndex: @@ -72,10 +70,9 @@ def test_loc_with_scalar(self): assert s.loc[1.5] == 1 assert s.loc[2] == 1 - # TODO with __getitem__ same rules as loc, or positional ? - # assert s[1] == 0 - # assert s[1.5] == 1 - # assert s[2] == 1 + assert s[1] == 0 + assert s[1.5] == 1 + assert s[2] == 1 expected = s.iloc[1:4] tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) @@ -107,22 +104,23 @@ def test_loc_with_slices(self): result = s[Interval(0, 1):Interval(2, 3)] tm.assert_series_equal(expected, result) - expected = s.iloc[4:] + expected = s.iloc[3:] result = s.loc[Interval(3, 4):] tm.assert_series_equal(expected, result) result = s[Interval(3, 4):] tm.assert_series_equal(expected, result) - with pytest.raises(KeyError): + msg = 'Interval objects are not currently supported' + with pytest.raises(NotImplementedError, match=msg): s.loc[Interval(3, 6):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 6):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError, match=msg): s.loc[Interval(3, 4, closed='left'):] - with pytest.raises(KeyError): + with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 4, closed='left'):] # TODO with non-existing intervals ? @@ -134,17 +132,14 @@ def test_loc_with_slices(self): tm.assert_series_equal(expected, s.loc[:3]) tm.assert_series_equal(expected, s.loc[:2.5]) tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) - # TODO should this work? (-1 is not contained in any of the Intervals) - # tm.assert_series_equal(expected, s.loc[-1:3]) - - # TODO with __getitem__ same rules as loc, or positional ? - # tm.assert_series_equal(expected, s[:3]) - # tm.assert_series_equal(expected, s[:2.5]) - # tm.assert_series_equal(expected, s[0.1:2.5]) + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + tm.assert_series_equal(expected, s[0.1:2.5]) # slice of scalar with step != 1 - with pytest.raises(NotImplementedError): + with pytest.raises(ValueError): s[0:4:2] def test_loc_with_overlap(self): @@ -169,10 +164,10 @@ def test_loc_with_overlap(self): # interval expected = 0 result = s.loc[Interval(1, 5)] - tm.assert_series_equal(expected, result) + result == expected result = s[Interval(1, 5)] - tm.assert_series_equal(expected, result) + result == expected expected = s result = s.loc[[Interval(1, 5), Interval(3, 7)]] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a0e3df182b129d..92966e721aedc5 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -81,17 +81,20 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): " ambiguous|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" + "No matching signature found|" # TypeError "unhashable type: 'numpy.ndarray'" # TypeError ) - if (isinstance(obj, Series) and idxr_id == 'getitem' - and index.inferred_type in [ + if (isinstance(obj, Series) and idxr_id == 'getitem' and + index.inferred_type in [ 'string', 'datetime64', 'period', 'timedelta64', 'boolean', 'categorical']): idxr[nd3] else: - if (isinstance(obj, DataFrame) and idxr_id == 'getitem' - and index.inferred_type == 'boolean'): + if (isinstance(obj, DataFrame) and idxr_id == 'getitem' and + index.inferred_type == 'boolean'): + error = TypeError + elif idxr_id == 'getitem' and index.inferred_type == 'interval': error = TypeError else: error = ValueError @@ -126,6 +129,7 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): "'pandas._libs.interval.IntervalTree' object has no attribute" " 'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError + "No matching signature found|" # TypeError r"^\[\[\[" # pandas.core.indexing.IndexingError ) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 4f65251ebd9237..74ede682dfb5f1 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -878,13 +878,13 @@ def test_append_preserve_index_name(self): pd.Index(list('abc')), pd.CategoricalIndex('A B C'.split()), pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.IntervalIndex.from_breaks([7, 8, 9, 10]), pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 3, 7, 12)]), ] indexes_cannot_append_with_other = [ - pd.IntervalIndex.from_breaks([0, 1, 2, 3]), pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), ] @@ -946,7 +946,7 @@ def test_append_different_columns_types(self, df_columns, series_index): def test_append_different_columns_types_raises( self, index_can_append, index_cannot_append_with_other): # GH18359 - # Dataframe.append will raise if IntervalIndex/MultiIndex appends + # Dataframe.append will raise if MultiIndex appends # or is appended to a different index type # # See also test 'test_append_different_columns_types' above for @@ -955,16 +955,10 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) - msg = (r"unorderable types: (Interval|int)\(\) (<|>) " - r"(int|long|float|str|Timestamp)\(\)|" - r"Expected tuple, got (int|long|float|str)|" - r"Cannot compare type 'Timestamp' with type '(int|long)'|" - r"'(<|>)' not supported between instances of 'int' " - r"and '(str|Timestamp)'|" - r"the other index needs to be an IntervalIndex too, but was" - r" type {}|" - r"object of type '(int|float|Timestamp)' has no len\(\)|" - "Expected tuple, got str") + msg = (r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|") with pytest.raises(TypeError, match=msg): df.append(ser) From 7ab9ff579daebd6b16c357221850f85c7e218d97 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 2 Jul 2019 22:15:55 -0400 Subject: [PATCH 134/238] TST: handle inconsistent ordering in resample_api compat test (#27196) --- ci/deps/azure-36-locale.yaml | 2 +- ci/deps/travis-36-slow.yaml | 2 +- pandas/tests/resample/test_resample_api.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-36-locale.yaml 
b/ci/deps/azure-36-locale.yaml index 99fa4d5c9e160f..8f8273f57c3fea 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -20,7 +20,7 @@ dependencies: - xlsxwriter=0.9.8 - xlwt=1.2.0 # universal - - pytest>=4.0.2 + - pytest>=4.0.2,<5.0.0 - pytest-xdist - pytest-mock - pytest-azurepipelines diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 87021d5dae04ef..538a82f66e4c87 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -25,7 +25,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest>=4.0.2 + - pytest>=4.0.2,<5.0.0 - pytest-xdist - pytest-mock - moto diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index ca2fb1acb6afa8..6943d30276a21b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -247,7 +247,7 @@ def test_agg_consistency(): check_stacklevel=False): expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) result = r.agg({'r1': 'mean', 'r2': 'sum'}) - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_like=True) # TODO: once GH 14008 is fixed, move these tests into # `Base` test class From 8393e3769b569ba5ea740a1e0c4b6249befa5ad6 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 3 Jul 2019 02:26:37 -0600 Subject: [PATCH 135/238] DEPR: Deprecate ordered=None for CategoricalDtype (#26403) --- doc/source/whatsnew/v0.23.0.rst | 22 +++++-- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/categorical.py | 10 +-- pandas/core/dtypes/dtypes.py | 62 +++++++++++++------ pandas/core/internals/construction.py | 2 +- pandas/core/series.py | 8 ++- pandas/io/packers.py | 9 +-- .../tests/arrays/categorical/test_dtypes.py | 8 +++ pandas/tests/dtypes/test_dtypes.py | 33 +++++++--- pandas/tests/indexes/test_category.py | 12 +++- pandas/tests/series/test_constructors.py | 31 +++++++++- pandas/tests/series/test_dtypes.py | 13 +++- 12 files changed, 163 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 41c946cc9a5593..62cf977d8c8ace 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -935,13 +935,23 @@ In previous versions, the default value for the ``ordered`` parameter was ``Fals New behavior: -.. ipython:: python +.. code-block:: ipython - from pandas.api.types import CategoricalDtype - cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) - cat - cdt = CategoricalDtype(categories=list('cbad')) - cat.astype(cdt) + In [2]: from pandas.api.types import CategoricalDtype + + In [3]: cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba')) + + In [4]: cat + Out[4]: + [a, b, c, a, b, a] + Categories (3, object): [c < b < a] + + In [5]: cdt = CategoricalDtype(categories=list('cbad')) + + In [6]: cat.astype(cdt) + Out[6]: + [a, b, c, a, b, a] + Categories (4, object): [c < b < a < d] Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``. 
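The point generalizes beyond this whatsnew example: once ``ordered=None`` is deprecated, any conversion between categorical dtypes that should stay ordered has to say so on the target dtype instead of relying on the default. A minimal sketch of that pattern (assuming a pandas version with this deprecation in place; the variable names are illustrative):

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    cat = pd.Categorical(list('abcaba'), categories=list('cba'), ordered=True)

    # keep the ordering: pass ordered=True explicitly on the new dtype
    kept = cat.astype(CategoricalDtype(categories=list('cbad'), ordered=True))

    # drop the ordering deliberately rather than via the deprecated default
    dropped = cat.astype(CategoricalDtype(categories=list('cbad'), ordered=False))
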
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e9d23cfd8efc12..bc7916ab39c40f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -761,6 +761,7 @@ Other deprecations - :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`) - :meth:`Series.put` is deprecated. (:issue:`18262`) - :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`) +- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`) - :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`). .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b77a4f985067d3..9a4846c98bd226 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -332,7 +332,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # sanitize input if is_categorical_dtype(values): if dtype.categories is None: - dtype = CategoricalDtype(values.categories, dtype.ordered) + dtype = CategoricalDtype(values.categories, dtype._ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): # sanitize_array coerces np.nan to a string under certain versions # of numpy @@ -355,7 +355,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, codes, categories = factorize(values, sort=True) except TypeError: codes, categories = factorize(values, sort=False) - if dtype.ordered: + if dtype._ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError("'values' is not ordered, please " @@ -368,7 +368,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, "supported at this time") # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + dtype = CategoricalDtype(categories, dtype._ordered) elif is_categorical_dtype(values): old_codes = (values._values.codes if isinstance(values, ABCSeries) @@ -433,7 +433,7 @@ def ordered(self): """ Whether the categories have an ordered relationship. 
""" - return self.dtype.ordered + return self.dtype._ordered @property def dtype(self) -> CategoricalDtype: @@ -847,7 +847,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, """ inplace = validate_bool_kwarg(inplace, 'inplace') if ordered is None: - ordered = self.dtype.ordered + ordered = self.dtype._ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 81e061a0fc7b4d..d8d910a16e32ab 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -17,6 +17,13 @@ str_type = str +# GH26403: sentinel value used for the default value of ordered in the +# CategoricalDtype constructor to detect when ordered=None is explicitly passed +ordered_sentinel = object() # type: object + +# TODO(GH26403): Replace with Optional[bool] or bool +OrderedType = Union[None, bool, object] + def register_extension_dtype(cls: Type[ExtensionDtype], ) -> Type[ExtensionDtype]: @@ -214,7 +221,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): _metadata = ('categories', 'ordered') _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, categories=None, ordered: Optional[bool] = None): + def __init__(self, + categories=None, + ordered: OrderedType = ordered_sentinel): self._finalize(categories, ordered, fastpath=False) @classmethod @@ -230,7 +239,7 @@ def _from_fastpath(cls, def _from_categorical_dtype(cls, dtype: 'CategoricalDtype', categories=None, - ordered: Optional[bool] = None, + ordered: OrderedType = None, ) -> 'CategoricalDtype': if categories is ordered is None: return dtype @@ -330,11 +339,11 @@ def _from_values_or_dtype(cls, def _finalize(self, categories, - ordered: Optional[bool], + ordered: OrderedType, fastpath: bool = False, ) -> None: - if ordered is not None: + if ordered is not None and ordered is not ordered_sentinel: self.validate_ordered(ordered) if categories is not None: @@ -342,7 +351,8 @@ def _finalize(self, fastpath=fastpath) self._categories = categories - self._ordered = ordered + self._ordered = ordered if ordered is not ordered_sentinel else None + self._ordered_from_sentinel = ordered is ordered_sentinel def __setstate__(self, state: Dict[str_type, Any]) -> None: # for pickle compat. __get_state__ is defined in the @@ -355,12 +365,12 @@ def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative # space for when we have unknown categories to avoid a conflict if self.categories is None: - if self.ordered: + if self._ordered: return -1 else: return -2 # We *do* want to include the real self.ordered here - return int(self._hash_categories(self.categories, self.ordered)) + return int(self._hash_categories(self.categories, self._ordered)) def __eq__(self, other: Any) -> bool: """ @@ -379,7 +389,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -388,10 +398,10 @@ def __eq__(self, other: Any) -> bool: # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). 
return True - elif self.ordered or other.ordered: + elif self._ordered or other._ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return ((self.ordered == other.ordered) and + return ((self._ordered == other._ordered) and self.categories.equals(other.categories)) else: # Neither has ordered=True; equal if both have the same categories, @@ -406,10 +416,10 @@ def __repr__(self): data = "None, " else: data = self.categories._format_data(name=self.__class__.__name__) - return tpl.format(data, self.ordered) + return tpl.format(data, self._ordered) @staticmethod - def _hash_categories(categories, ordered: Optional[bool] = True) -> int: + def _hash_categories(categories, ordered: OrderedType = True) -> int: from pandas.core.util.hashing import ( hash_array, _combine_hash_arrays, hash_tuples ) @@ -459,7 +469,7 @@ def construct_array_type(cls): return Categorical @staticmethod - def validate_ordered(ordered: bool) -> None: + def validate_ordered(ordered: OrderedType) -> None: """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -534,17 +544,25 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': msg = ('a CategoricalDtype must be passed to perform an update, ' 'got {dtype!r}').format(dtype=dtype) raise ValueError(msg) - elif dtype.categories is not None and dtype.ordered is self.ordered: - return dtype # dtype is CDT: keep current categories/ordered if None new_categories = dtype.categories if new_categories is None: new_categories = self.categories - new_ordered = dtype.ordered + new_ordered = dtype._ordered + new_ordered_from_sentinel = dtype._ordered_from_sentinel if new_ordered is None: - new_ordered = self.ordered + # maintain existing ordered if new dtype has ordered=None + new_ordered = self._ordered + if self._ordered and new_ordered_from_sentinel: + # only warn if we'd actually change the existing behavior + msg = ("Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version, which will cause the resulting categorical's " + "`ordered` attribute to change to False; `ordered=True`" + " must be explicitly passed in order to be retained") + warnings.warn(msg, FutureWarning, stacklevel=3) return CategoricalDtype(new_categories, new_ordered) @@ -556,10 +574,18 @@ def categories(self): return self._categories @property - def ordered(self) -> Optional[bool]: + def ordered(self) -> OrderedType: """ Whether the categories have an ordered relationship. """ + # TODO: remove if block when ordered=None as default is deprecated + if self._ordered_from_sentinel and self._ordered is None: + # warn when accessing ordered if ordered=None and None was not + # explicitly passed to the constructor + msg = ("Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version; `ordered=None` must be explicitly passed.") + warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @property diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bdfb854679a2c5..1044f25a6bbcd3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -725,7 +725,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. 
subarr = Categorical(arr, dtype.categories, - ordered=dtype.ordered) + ordered=dtype._ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence diff --git a/pandas/core/series.py b/pandas/core/series.py index 9179099562832c..f5f9f1ab4f9ab4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,7 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _is_unorderable_exception, ensure_platform_int, is_bool, + _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type, is_hashable, is_integer, is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype) @@ -170,6 +170,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if data is None: data = {} if dtype is not None: + # GH 26336: explicitly handle 'category' to avoid warning + # TODO: Remove after CategoricalDtype defaults to ordered=False + if (isinstance(dtype, str) and dtype == 'category' and + is_categorical(data)): + dtype = data.dtype + dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 30e51e62aa764a..24995d1e2e4926 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -618,14 +618,9 @@ def decode(obj): return Interval(obj['left'], obj['right'], obj['closed']) elif typ == 'series': dtype = dtype_for(obj['dtype']) - pd_dtype = pandas_dtype(dtype) - index = obj['index'] - result = Series(unconvert(obj['data'], dtype, obj['compress']), - index=index, - dtype=pd_dtype, - name=obj['name']) - return result + data = unconvert(obj['data'], dtype, obj['compress']) + return Series(data, index=index, dtype=dtype, name=obj['name']) elif typ == 'block_manager': axes = obj['axes'] diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index b8c223ab3b04e2..14ad3c4d5e8608 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -160,6 +160,14 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) + def test_astype_category_ordered_none_deprecated(self): + # GH 26336 + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb')) + cat = Categorical(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(FutureWarning): + cat.astype(cdt2) + def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index cf368f9980d72d..ad2195af73d96d 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,7 +10,8 @@ is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype, is_period, is_period_dtype, is_string_dtype) from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry) + CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, + ordered_sentinel, registry) import pandas as pd from pandas import ( @@ -54,7 +55,8 @@ def test_pickle(self): class TestCategoricalDtype(Base): def create(self): - return CategoricalDtype() + # TODO(GH 26403): Remove when default ordered becomes False + return 
CategoricalDtype(ordered=None) def test_pickle(self): # make sure our cache is NOT pickled @@ -675,7 +677,8 @@ def test_unordered_same(self, ordered): def test_categories(self): result = CategoricalDtype(['a', 'b', 'c']) tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) - assert result.ordered is None + with tm.assert_produces_warning(FutureWarning): + assert result.ordered is None def test_equal_but_different(self, ordered_fixture): c1 = CategoricalDtype([1, 2, 3]) @@ -804,7 +807,8 @@ def test_categorical_categories(self): @pytest.mark.parametrize('new_categories', [ list('abc'), list('cba'), list('wxyz'), None]) - @pytest.mark.parametrize('new_ordered', [True, False, None]) + @pytest.mark.parametrize('new_ordered', [ + True, False, None, ordered_sentinel]) def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): dtype = CategoricalDtype(list('abc'), ordered_fixture) new_dtype = CategoricalDtype(new_categories, new_ordered) @@ -813,11 +817,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): if expected_categories is None: expected_categories = dtype.categories - expected_ordered = new_dtype.ordered - if expected_ordered is None: + expected_ordered = new_ordered + if new_ordered is ordered_sentinel or new_ordered is None: expected_ordered = dtype.ordered - result = dtype.update_dtype(new_dtype) + # GH 26336 + if new_ordered is ordered_sentinel and ordered_fixture is True: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = dtype.update_dtype(new_dtype) + else: + result = dtype.update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered @@ -837,6 +848,14 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) + @pytest.mark.parametrize('ordered', [ordered_sentinel, None, True, False]) + def test_ordered_none_default_deprecated(self, ordered): + # GH 26403: CDT.ordered only warns if ordered is not explicitly passed + dtype = CategoricalDtype(list('abc'), ordered=ordered) + warning = FutureWarning if ordered is ordered_sentinel else None + with tm.assert_produces_warning(warning): + dtype.ordered + @pytest.mark.parametrize('dtype', [ CategoricalDtype, diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 26a4463d421a43..3b5092c9010619 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -5,7 +5,7 @@ from pandas._libs import index as libindex -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import Categorical, IntervalIndex @@ -503,6 +503,16 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): expected = index tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_astype_category_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + idx = CategoricalIndex(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(warning): + idx.astype(cdt2) + def test_reindex_base(self): # Determined by cat ordering. 
idx = CategoricalIndex(list("cab"), categories=list("cab")) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 49417942a35980..663d5ae5053030 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -12,12 +12,12 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import ( Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, Timestamp, date_range, isna, period_range, timedelta_range) -from pandas.api.types import CategoricalDtype from pandas.core.arrays import period_array import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -372,6 +372,35 @@ def test_constructor_categorical_dtype(self): dtype=CategoricalDtype(['a', 'b'], ordered=True)) tm.assert_series_equal(result, expected, check_categorical=True) + def test_constructor_categorical_string(self): + # GH 26336: the string 'category' maintains existing CategoricalDtype + cdt = CategoricalDtype(categories=list('dabc'), ordered=True) + expected = Series(list('abcabc'), dtype=cdt) + + # Series(Categorical, dtype='category') keeps existing dtype + cat = Categorical(list('abcabc'), dtype=cdt) + result = Series(cat, dtype='category') + tm.assert_series_equal(result, expected) + + # Series(Series[Categorical], dtype='category') keeps existing dtype + result = Series(result, dtype='category') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_categorical_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + + cat = Categorical(list('abcdaba'), dtype=cdt1) + with tm.assert_produces_warning(warning, check_stacklevel=False): + Series(cat, dtype=cdt2) + + s = Series(cat) + with tm.assert_produces_warning(warning, check_stacklevel=False): + Series(s, dtype=cdt2) + def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 59566ad3232c7e..287fd15ac3f08d 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,10 +8,11 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel + import pandas as pd from pandas import ( Categorical, DataFrame, Index, Series, Timedelta, Timestamp, date_range) -from pandas.api.types import CategoricalDtype import pandas.util.testing as tm @@ -230,6 +231,16 @@ def test_astype_categories_deprecation_raises(self): with pytest.raises(ValueError, match="Got an unexpected"): s.astype('category', categories=['a', 'b'], ordered=True) + @pytest.mark.parametrize('none, warning', [ + (None, None), (ordered_sentinel, FutureWarning)]) + def test_astype_category_ordered_none_deprecated(self, none, warning): + # GH 26336: only warn if None is not explicitly passed + cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) + cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + s = Series(list('abcdaba'), dtype=cdt1) + with 
tm.assert_produces_warning(warning, check_stacklevel=False): + s.astype(cdt2) + def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) From 13f3f5aaec91943436ce840cad7f88bf892a68b2 Mon Sep 17 00:00:00 2001 From: Bhavani Ravi Date: Wed, 3 Jul 2019 16:51:57 +0530 Subject: [PATCH 136/238] ENH: Add max_level param to json_normalize (#26876) * ENH add max_level and ignore_keys configuration to nested_to_records max_level param defines at the level of nesting at which normalizing should stop. ignore_keys defines the keys to ignore without normalizing --- doc/source/user_guide/io.rst | 13 +++ doc/source/whatsnew/v0.25.0.rst | 23 ++++ pandas/io/json/normalize.py | 147 ++++++++++++++++--------- pandas/tests/io/json/test_normalize.py | 139 ++++++++++++++++++++--- 4 files changed, 256 insertions(+), 66 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b9f90bf750482c..104066bcf70bbf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2176,6 +2176,19 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +The max_level parameter provides more control over which level to end normalization. +With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict. + +.. ipython:: python + + data = [{'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + json_normalize(data, max_level=1) + .. _io.jsonl: Line delimited json diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index bc7916ab39c40f..7169595d70093c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -135,6 +135,29 @@ the output will truncate, if it's wider than :attr:`options.display.width` (default: 80 characters). +.. _whatsnew_0250.enhancements.json_normalize_with_max_level: + +Json normalize with max_level param support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`json_normalize` normalizes the provided input dict to all +nested levels. The new max_level parameter provides more control over +which level to end normalization (:issue:`23843`): + +The repr now looks like this: + +.. ipython:: python + + from pandas.io.json import json_normalize + data = [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + json_normalize(data, max_level=1) + + .. _whatsnew_0250.enhancements.other: Other enhancements diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 2d8bc20b1195e5..5c6018d399c824 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -3,6 +3,7 @@ from collections import defaultdict import copy +from typing import DefaultDict, Dict, List, Optional, Union import numpy as np @@ -25,9 +26,11 @@ def _convert_to_line_delimits(s): return convert_json_to_lines(s) -def nested_to_record(ds, prefix="", sep=".", level=0): +def nested_to_record(ds, prefix: str = "", + sep: str = ".", level: int = 0, + max_level: Optional[int] = None): """ - A simplified json_normalize. + A simplified json_normalize Converts a nested dict into a flat dict ("record"), unlike json_normalize, it does not attempt to extract a subset of the data. 
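The new ``max_level`` argument is essentially a depth cutoff on this recursive flattening: once the requested level is reached, any remaining nested dicts are kept intact as values rather than expanded into dotted keys. A standalone sketch of that behaviour (illustrative only, not the pandas implementation; ``flatten`` is a made-up name):

    from typing import Optional

    def flatten(d: dict, prefix: str = "", sep: str = ".",
                level: int = 0, max_level: Optional[int] = None) -> dict:
        # expand nested dicts into dotted keys, stopping once max_level is hit
        out = {}
        for key, value in d.items():
            newkey = key if not prefix else prefix + sep + key
            reached_cutoff = max_level is not None and level >= max_level
            if isinstance(value, dict) and not reached_cutoff:
                out.update(flatten(value, newkey, sep, level + 1, max_level))
            else:
                out[newkey] = value
        return out

    record = {'CreatedBy': {'Name': 'User001'},
              'Lookup': {'TextField': 'Some text',
                         'UserField': {'Id': 'ID001'}}}
    flatten(record, max_level=1)
    # {'CreatedBy.Name': 'User001',
    #  'Lookup.TextField': 'Some text',
    #  'Lookup.UserField': {'Id': 'ID001'}}
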
@@ -36,13 +39,19 @@ def nested_to_record(ds, prefix="", sep=".", level=0): ---------- ds : dict or list of dicts prefix: the prefix, optional, default: "" - sep : string, default '.' + sep : str, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar .. versionadded:: 0.20.0 - level: the number of levels in the jason string, optional, default: 0 + level: int, optional, default: 0 + The number of levels in the json string. + + max_level: int, optional, default: None + The max depth to normalize. + + .. versionadded:: 0.25.0 Returns ------- @@ -65,10 +74,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if isinstance(ds, dict): ds = [ds] singleton = True - new_ds = [] for d in ds: - new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix @@ -79,16 +86,20 @@ def nested_to_record(ds, prefix="", sep=".", level=0): else: newkey = prefix + sep + k + # flatten if type is dict and + # current dict level < maximum level provided and # only dicts gets recurse-flattened # only at level>1 do we rename the rest of the keys - if not isinstance(v, dict): + if (not isinstance(v, dict) or + (max_level is not None and level >= max_level)): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, sep, level + 1)) + new_d.update(nested_to_record(v, newkey, sep, level + 1, + max_level)) new_ds.append(new_d) if singleton: @@ -96,45 +107,58 @@ def nested_to_record(ds, prefix="", sep=".", level=0): return new_ds -def json_normalize(data, record_path=None, meta=None, - meta_prefix=None, - record_prefix=None, - errors='raise', - sep='.'): +def json_normalize(data: List[Dict], + record_path: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List]] = None, + meta_prefix: Optional[str] = None, + record_prefix: Optional[str] = None, + errors: Optional[str] = 'raise', + sep: str = '.', + max_level: Optional[int] = None): """ Normalize semi-structured JSON data into a flat table. Parameters ---------- data : dict or list of dicts - Unserialized JSON objects - record_path : string or list of strings, default None + Unserialized JSON objects. + record_path : str or list of str, default None Path in each object to list of records. If not passed, data will be - assumed to be an array of records - meta : list of paths (string or list of strings), default None - Fields to use as metadata for each record in resulting table - meta_prefix : string, default None - record_prefix : string, default None + assumed to be an array of records. + meta : list of paths (str or list of str), default None + Fields to use as metadata for each record in resulting table. + meta_prefix : str, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if - path to records is ['foo', 'bar'] + meta is ['foo', 'bar']. + record_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + path to records is ['foo', 'bar']. errors : {'raise', 'ignore'}, default 'raise' + Configures error handling. * 'ignore' : will ignore KeyError if keys listed in meta are not - always present + always present. * 'raise' : will raise KeyError if keys listed in meta are not - always present + always present. .. versionadded:: 0.20.0 - sep : string, default '.' 
- Nested records will generate names separated by sep, - e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + sep : str, default '.' + Nested records will generate names separated by sep. + e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar. .. versionadded:: 0.20.0 + max_level : int, default None + Max number of levels(depth of dict) to normalize. + if None, normalizes all levels. + + .. versionadded:: 0.25.0 + Returns ------- frame : DataFrame + Normalize semi-structured JSON data into a flat table. Examples -------- @@ -149,36 +173,62 @@ def json_normalize(data, record_path=None, meta=None, 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN + >>> data = [{'id': 1, + ... 'name': "Cole Volk", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'name': "Mose Reg", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'id': 2, 'name': 'Faye Raker', + ... 'fitness': {'height': 130, 'weight': 60}}] + >>> json_normalize(data, max_level=0) + fitness id name + 0 {'height': 130, 'weight': 60} 1.0 Cole Volk + 1 {'height': 130, 'weight': 60} NaN Mose Reg + 2 {'height': 130, 'weight': 60} 2.0 Faye Raker + + Normalizes nested data upto level 1. + + >>> data = [{'id': 1, + ... 'name': "Cole Volk", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'name': "Mose Reg", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'id': 2, 'name': 'Faye Raker', + ... 'fitness': {'height': 130, 'weight': 60}}] + >>> json_normalize(data, max_level=1) + fitness.height fitness.weight id name + 0 130 60 1.0 Cole Volk + 1 130 60 NaN Mose Reg + 2 130 60 2.0 Faye Raker + >>> data = [{'state': 'Florida', ... 'shortname': 'FL', - ... 'info': { - ... 'governor': 'Rick Scott' - ... }, + ... 'info': {'governor': 'Rick Scott'}, ... 'counties': [{'name': 'Dade', 'population': 12345}, - ... {'name': 'Broward', 'population': 40000}, - ... {'name': 'Palm Beach', 'population': 60000}]}, + ... {'name': 'Broward', 'population': 40000}, + ... {'name': 'Palm Beach', 'population': 60000}]}, ... {'state': 'Ohio', ... 'shortname': 'OH', - ... 'info': { - ... 'governor': 'John Kasich' - ... }, + ... 'info': {'governor': 'John Kasich'}, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] >>> result = json_normalize(data, 'counties', ['state', 'shortname', - ... ['info', 'governor']]) + ... ['info', 'governor']]) >>> result - name population info.governor state shortname - 0 Dade 12345 Rick Scott Florida FL - 1 Broward 40000 Rick Scott Florida FL - 2 Palm Beach 60000 Rick Scott Florida FL - 3 Summit 1234 John Kasich Ohio OH - 4 Cuyahoga 1337 John Kasich Ohio OH + name population state shortname info.governor + 0 Dade 12345 Florida FL Rick Scott + 1 Broward 40000 Florida FL Rick Scott + 2 Palm Beach 60000 Florida FL Rick Scott + 3 Summit 1234 Ohio OH John Kasich + 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} >>> json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 + + Returns normalized data with columns prefixed with the given string. 
""" def _pull_field(js, spec): result = js @@ -206,7 +256,8 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data, sep=sep) + data = nested_to_record(data, sep=sep, + max_level=max_level) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -219,10 +270,10 @@ def _pull_field(js, spec): meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now - records = [] + records = [] # type: List lengths = [] - meta_vals = defaultdict(list) + meta_vals = defaultdict(list) # type: DefaultDict if not isinstance(sep, str): sep = str(sep) meta_keys = [sep.join(val) for val in meta] @@ -241,10 +292,12 @@ def _recursive_extract(data, path, seen_meta, level=0): else: for obj in data: recs = _pull_field(obj, path[0]) + recs = [nested_to_record(r, sep=sep, + max_level=max_level) + if isinstance(r, dict) else r for r in recs] # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] @@ -260,7 +313,6 @@ def _recursive_extract(data, path, seen_meta, level=0): "{err} is not always present" .format(err=e)) meta_vals[key].append(meta_val) - records.extend(recs) _recursive_extract(data, record_path, {}, level=0) @@ -279,8 +331,5 @@ def _recursive_extract(data, path, seen_meta, level=0): if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - - # forcing dtype to object to avoid the metadata being casted to string result[k] = np.array(v, dtype=object).repeat(lengths) - return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c99..3210f7bc83bdd6 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -85,6 +85,19 @@ def missing_metadata(): ] +@pytest.fixture +def max_level_test_input_data(): + """ + input data to test json_normalize with max_level param + """ + return [{ + 'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }] + + class TestJSONNormalize: def test_simple_records(self): @@ -168,8 +181,6 @@ def test_more_deeply_nested(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], meta=['country', ['states', 'name']]) - # meta_prefix={'states': 'state_'}) - ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, 'states.name': ['California', 'California', 'Ohio', 'Ohio', 'Bayern', 'Nordrhein-Westfalen', @@ -294,6 +305,50 @@ def test_missing_field(self, author_missing_data): expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("max_level,expected", [ + (0, [{"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}]), + (1, [{"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}, + {"TextField": "Some text", + "UserField.Id": "ID001", + "UserField.Name": "Name001", + "CreatedBy": {"Name": "User001"}, + 'Image': {'a': 'b'}}])]) + def test_max_level_with_records_path(self, max_level, expected): + # GH23843: Enhanced JSON normalize + 
test_input = [{'CreatedBy': {'Name': 'User001'}, + 'Lookup': [{'TextField': 'Some text', + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}}, + {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', + 'Name': 'Name001'}} + ], + 'Image': {'a': 'b'}, + 'tags': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}] + }] + + result = json_normalize(test_input, + record_path=["Lookup"], + meta=[["CreatedBy"], ["Image"]], + max_level=max_level) + expected_df = DataFrame(data=expected, + columns=result.columns.values) + tm.assert_equal(expected_df, result) + class TestNestedToRecord: @@ -301,7 +356,6 @@ def test_flat_stays_flat(self): recs = [dict(flat1=1, flat2=2), dict(flat1=3, flat2=4), ] - result = nested_to_record(recs) expected = recs assert result == expected @@ -356,20 +410,6 @@ def test_missing_meta(self, missing_metadata): record_path='addresses', meta='name', errors='ignore') - ex_data = [ - {'city': 'Massillon', - 'number': 9562, - 'state': 'OH', - 'street': 'Morris St.', - 'zip': 44646, - 'name': 'Alice'}, - {'city': 'Elizabethton', - 'number': 8449, - 'state': 'TN', - 'street': 'Spring St.', - 'zip': 37643, - 'name': np.nan} - ] ex_data = [ ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] @@ -460,3 +500,68 @@ def test_nonetype_multiple_levels(self): 'location.country.state.town.info.y': -33.148521423339844, 'location.country.state.town.info.z': 27.572303771972656} assert result == expected + + @pytest.mark.parametrize("max_level, expected", [ + (None, + [{'CreatedBy.Name': 'User001', + 'Lookup.TextField': 'Some text', + 'Lookup.UserField.Id': 'ID001', + 'Lookup.UserField.Name': 'Name001', + 'Image.a': 'b' + }]), + (0, + [{'CreatedBy': {'Name': 'User001'}, + 'Lookup': {'TextField': 'Some text', + 'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, + 'Image': {'a': 'b'} + }]), + (1, + [{'CreatedBy.Name': 'User001', + 'Lookup.TextField': 'Some text', + 'Lookup.UserField': {'Id': 'ID001', + 'Name': 'Name001'}, + 'Image.a': 'b' + }]) + ]) + def test_with_max_level(self, max_level, + expected, max_level_test_input_data): + # GH23843: Enhanced JSON normalize + output = nested_to_record(max_level_test_input_data, + max_level=max_level) + assert output == expected + + def test_with_large_max_level(self): + # GH23843: Enhanced JSON normalize + max_level = 100 + input_data = [{'CreatedBy': { + "user": { + "name": {"firstname": "Leo", + "LastName": "Thomson"}, + "family_tree": { + "father": { + "name": "Father001", + "father": { + "Name": "Father002", + "father": { + "name": "Father003", + "father": { + "Name": "Father004", + }, + }, + } + } + } + } + }}] + expected = [ + {'CreatedBy.user.name.firstname': 'Leo', + 'CreatedBy.user.name.LastName': 'Thomson', + 'CreatedBy.user.family_tree.father.name': 'Father001', + 'CreatedBy.user.family_tree.father.father.Name': 'Father002', + 'CreatedBy.user.family_tree.father.father.father.name': + 'Father003', + 'CreatedBy.user.family_tree.father.father.father.father.Name': + 'Father004'} + ] + output = nested_to_record(input_data, max_level=max_level) + assert output == expected From 647c6356e7990b4970e7208b18a088ab4b6160d2 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Wed, 3 Jul 2019 11:26:30 +0000 Subject: [PATCH 137/238] CLN: fix cython warning (#27193) --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 
0eb3e3c79aa47c..4e49f660f5e193 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -721,7 +721,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', return result, tz_out -cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): +cdef ignore_errors_out_of_bounds_fallback(ndarray[object] values): """ Fallback for array_to_datetime if an OutOfBoundsDatetime is raised and errors == "ignore" From 54c0d5a7aad1c09f9c50d8fe218187f5e1e6d1e8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Jul 2019 04:27:34 -0700 Subject: [PATCH 138/238] DEPR: DataFrame.get_dtype_counts (#27145) --- doc/source/getting_started/basics.rst | 4 +- doc/source/user_guide/io.rst | 2 +- doc/source/user_guide/missing_data.rst | 2 +- doc/source/whatsnew/v0.10.1.rst | 2 +- doc/source/whatsnew/v0.11.0.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/computation/expressions.py | 6 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 8 ++ pandas/tests/frame/test_api.py | 10 +- pandas/tests/frame/test_arithmetic.py | 8 +- pandas/tests/frame/test_block_internals.py | 25 ++-- pandas/tests/frame/test_combine_concat.py | 6 +- pandas/tests/frame/test_constructors.py | 129 +++++++++------------ pandas/tests/frame/test_dtypes.py | 25 ++-- pandas/tests/frame/test_indexing.py | 35 ++++-- pandas/tests/frame/test_missing.py | 8 +- pandas/tests/frame/test_mutate_columns.py | 21 +++- pandas/tests/frame/test_reshape.py | 30 +++-- pandas/tests/frame/test_timezones.py | 19 +-- pandas/tests/generic/test_generic.py | 6 + pandas/tests/groupby/test_apply.py | 5 +- pandas/tests/groupby/test_groupby.py | 9 +- pandas/tests/io/pytables/test_pytables.py | 3 +- pandas/tests/reshape/test_pivot.py | 10 +- pandas/tests/reshape/test_reshape.py | 10 +- pandas/tests/series/test_arithmetic.py | 12 +- pandas/tests/series/test_dtypes.py | 2 - pandas/tests/sparse/frame/test_frame.py | 7 +- 29 files changed, 229 insertions(+), 180 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 3ba79210a43ee1..e1508cb7b4e165 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1968,11 +1968,11 @@ dtype of the column will be chosen to accommodate all of the data types pd.Series([1, 2, 3, 6., 'foo']) The number of columns of each type in a ``DataFrame`` can be found by calling -:meth:`~DataFrame.get_dtype_counts`. +``DataFrame.dtypes.value_counts()``. .. ipython:: python - dft.get_dtype_counts() + dft.dtypes.value_counts() Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 104066bcf70bbf..a8bc690efd3cad 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3767,7 +3767,7 @@ defaults to `nan`. store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 - df_mixed1.get_dtype_counts() + df_mixed1.dtypes.value_counts() # we have provided a minimum string column size store.root.df_mixed.table diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index ef77826e9a444f..6c36a6470f841a 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -105,7 +105,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. 
df2 df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan df2 - df2.get_dtype_counts() + df2.dtypes.value_counts() .. _missing.inserting: diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index 7d51ded1cad195..c4251f70d85b65 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -89,7 +89,7 @@ You can now store ``datetime64`` in data columns store.append('df_mixed', df_mixed) df_mixed1 = store.select('df_mixed') df_mixed1 - df_mixed1.get_dtype_counts() + df_mixed1.dtypes.value_counts() You can pass ``columns`` keyword to select to filter a list of the return columns, this is equivalent to passing a diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 31fab6c9aeb74d..03480ebeed78ee 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -296,7 +296,7 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet df # datetime64[ns] out of the box - df.get_dtype_counts() + df.dtypes.value_counts() # use the traditional nan, which is mapped to NaT internally df.loc[df.index[2:4], ['A', 'timestamp']] = np.nan diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7169595d70093c..9630595b6ac1b5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -786,6 +786,7 @@ Other deprecations - :meth:`Index.item` and :meth:`Series.item` is deprecated. (:issue:`18262`) - The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`) - :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`). +- :meth:`DataFrame.get_dtype_counts` is deprecated. (:issue:`18262`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index f293b3b33e8d38..b01000a7aee5bd 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -79,11 +79,11 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # check for dtype compatibility dtypes = set() for o in [a, b]: - if hasattr(o, 'get_dtype_counts'): - s = o.get_dtype_counts() + if hasattr(o, 'dtypes'): + s = o.dtypes.value_counts() if len(s) > 1: return False - dtypes |= set(s.index) + dtypes |= set(s.index.astype(str)) elif isinstance(o, np.ndarray): dtypes |= {o.dtype.name} diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3ff3fff22f4f04..d3ce77c0684f91 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2326,7 +2326,7 @@ def _sizeof_fmt(num, size_qualifier): else: _verbose_repr() - counts = self.get_dtype_counts() + counts = self._data.get_dtype_counts() dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k in sorted(counts.items())] lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 822428c6787bea..0679aa27b1ad34 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5263,6 +5263,10 @@ def get_dtype_counts(self): """ Return counts of unique dtypes in this object. + .. deprecated:: 0.25.0 + + Use `.dtypes.value_counts()` instead. 
+ Returns ------- dtype : Series @@ -5288,6 +5292,10 @@ def get_dtype_counts(self): object 1 dtype: int64 """ + warnings.warn("`get_dtype_counts` has been deprecated and will be " + "removed in a future version. For DataFrames use " + "`.dtypes.value_counts()", FutureWarning, + stacklevel=2) from pandas import Series return Series(self._data.get_dtype_counts()) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index ed224e23fbe20c..6372029f2efe7a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, SparseDataFrame, compat, date_range, - timedelta_range) + Categorical, DataFrame, Series, SparseDataFrame, SparseDtype, compat, + date_range, timedelta_range) import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -433,11 +433,11 @@ def test_with_datetimelikes(self): 'B': timedelta_range('1 day', periods=10)}) t = df.T - result = t.get_dtype_counts() + result = t.dtypes.value_counts() if self.klass is DataFrame: - expected = Series({'object': 10}) + expected = Series({np.dtype('object'): 10}) else: - expected = Series({'Sparse[object, nan]': 10}) + expected = Series({SparseDtype(dtype=object): 10}) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 061e0d32e1f06e..bcbea9d7a22365 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -273,8 +273,8 @@ def test_df_flex_cmp_constant_return_types(self, opname): df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) const = 2 - result = getattr(df, opname)(const).get_dtype_counts() - tm.assert_series_equal(result, pd.Series([2], ['bool'])) + result = getattr(df, opname)(const).dtypes.value_counts() + tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) def test_df_flex_cmp_constant_return_types_empty(self, opname): @@ -283,8 +283,8 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): const = 2 empty = df.iloc[:0] - result = getattr(empty, opname)(const).get_dtype_counts() - tm.assert_series_equal(result, pd.Series([2], ['bool'])) + result = getattr(empty, opname)(const).dtypes.value_counts() + tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) # ------------------------------------------------------------------- diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 6fbc884829784e..f1cbd7763474ea 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -217,7 +217,7 @@ def test_construction_with_mixed(self, float_string_frame): df = DataFrame(data) # check dtypes - result = df.get_dtype_counts().sort_values() + result = df.dtypes expected = Series({'datetime64[ns]': 3}) # mixed-type frames @@ -225,11 +225,13 @@ def test_construction_with_mixed(self, float_string_frame): float_string_frame['timedelta'] = timedelta(days=1, seconds=1) assert float_string_frame['datetime'].dtype == 'M8[ns]' assert float_string_frame['timedelta'].dtype == 'm8[ns]' - result = float_string_frame.get_dtype_counts().sort_values() - expected = Series({'float64': 4, - 'object': 1, - 'datetime64[ns]': 1, - 'timedelta64[ns]': 1}).sort_values() + result = float_string_frame.dtypes + expected = 
Series([np.dtype('float64')] * 4 + + [np.dtype('object'), + np.dtype('datetime64[ns]'), + np.dtype('timedelta64[ns]')], + index=list('ABCD') + ['foo', 'datetime', + 'timedelta']) assert_series_equal(result, expected) def test_construction_with_conversions(self): @@ -409,11 +411,12 @@ def test_get_numeric_data(self): df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f': Timestamp('20010102')}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({'int64': 1, 'float64': 1, - datetime64name: 1, objectname: 1}) - result = result.sort_index() - expected = expected.sort_index() + result = df.dtypes + expected = Series([np.dtype('float64'), + np.dtype('int64'), + np.dtype(objectname), + np.dtype(datetime64name)], + index=['a', 'b', 'c', 'f']) assert_series_equal(result, expected) df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index faa86acb1584f2..c1d057da91b8f7 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -17,8 +17,10 @@ def test_concat_multiple_frames_dtypes(self): A = DataFrame(data=np.ones((10, 2)), columns=[ 'foo', 'bar'], dtype=np.float64) B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) - results = pd.concat((A, B), axis=1).get_dtype_counts() - expected = Series(dict(float64=2, float32=2)) + results = pd.concat((A, B), axis=1).dtypes + expected = Series([np.dtype('float64')] * 2 + + [np.dtype('float32')] * 2, + index=['foo', 'bar', 0, 1]) assert_series_equal(results, expected) @pytest.mark.parametrize('data', [ diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 981dc8b32b8cc5..73a8720adb5ccc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1579,10 +1579,11 @@ def test_constructor_with_datetimes(self): 'D': Timestamp("20010101"), 'E': datetime(2001, 1, 2, 0, 0)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({'int64': 1, datetime64name: 2, objectname: 2}) - result.sort_index() - expected.sort_index() + result = df.dtypes + expected = Series([np.dtype('int64')] + + [np.dtype(objectname)] * 2 + + [np.dtype(datetime64name)] * 2, + index=list("ABCDE")) tm.assert_series_equal(result, expected) # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0 @@ -1591,21 +1592,13 @@ def test_constructor_with_datetimes(self): floatname: np.array(1., dtype=floatname), intname: np.array(1, dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = {objectname: 1} - if intname == 'int64': - expected['int64'] = 2 - else: - expected['int64'] = 1 - expected[intname] = 1 - if floatname == 'float64': - expected['float64'] = 2 - else: - expected['float64'] = 1 - expected[floatname] = 1 - - result = result.sort_index() - expected = Series(expected).sort_index() + result = df.dtypes + expected = Series([np.dtype('float64')] + + [np.dtype('int64')] + + [np.dtype('object')] + + [np.dtype('float64')] + + [np.dtype(intname)], + index=['a', 'b', 'c', floatname, intname]) tm.assert_series_equal(result, expected) # check with ndarray construction ndim>0 @@ -1613,8 +1606,13 @@ def test_constructor_with_datetimes(self): floatname: np.array([1.] 
* 10, dtype=floatname), intname: np.array([1] * 10, dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - result = result.sort_index() + result = df.dtypes + expected = Series([np.dtype('float64')] + + [np.dtype('int64')] + + [np.dtype('object')] + + [np.dtype('float64')] + + [np.dtype(intname)], + index=['a', 'b', 'c', floatname, intname]) tm.assert_series_equal(result, expected) # GH 2809 @@ -1622,22 +1620,16 @@ def test_constructor_with_datetimes(self): datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) assert datetime_s.dtype == 'M8[ns]' - df = DataFrame({'datetime_s': datetime_s}) - result = df.get_dtype_counts() - expected = Series({datetime64name: 1}) - result = result.sort_index() - expected = expected.sort_index() - tm.assert_series_equal(result, expected) # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] - df = DataFrame({'datetimes': datetimes, 'dates': dates}) - result = df.get_dtype_counts() - expected = Series({datetime64name: 1, objectname: 1}) - result = result.sort_index() - expected = expected.sort_index() + df = DataFrame(datetimes, columns=['datetimes']) + df['dates'] = dates + result = df.dtypes + expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], + index=['datetimes', 'dates']) tm.assert_series_equal(result, expected) # GH 7594 @@ -1693,75 +1685,59 @@ def test_constructor_datetimes_with_nulls(self): for arr in [np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None])]: - result = DataFrame(arr).get_dtype_counts() - expected = Series({'datetime64[ns]': 1}) + result = DataFrame(arr).dtypes + expected = Series([np.dtype('datetime64[ns]')]) tm.assert_series_equal(result, expected) def test_constructor_for_list_with_dtypes(self): - # TODO(wesm): unused - intname = np.dtype(np.int_).name # noqa - floatname = np.dtype(np.float_).name # noqa - datetime64name = np.dtype('M8[ns]').name - objectname = np.dtype(np.object_).name - # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) - result = df.get_dtype_counts() - expected = Series({'int64': 5}) + result = df.dtypes + expected = Series([np.dtype('int64')] * 5) + tm.assert_series_equal(result, expected) df = DataFrame([np.array(np.arange(5), dtype='int32') for x in range(5)]) - result = df.get_dtype_counts() - expected = Series({'int32': 5}) + result = df.dtypes + expected = Series([np.dtype('int64')] * 5) + tm.assert_series_equal(result, expected) # overflow issue? 
(we always expecte int64 upcasting here) df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]}) - result = df.get_dtype_counts() - expected = Series({'int64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('int64') # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) - result = df.get_dtype_counts() - expected = Series({'int64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('int64') df = DataFrame([1., 2.]) - result = df.get_dtype_counts() - expected = Series({'float64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('float64') df = DataFrame({'a': [1, 2]}) - result = df.get_dtype_counts() - expected = Series({'int64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('int64') df = DataFrame({'a': [1., 2.]}) - result = df.get_dtype_counts() - expected = Series({'float64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('float64') df = DataFrame({'a': 1}, index=range(3)) - result = df.get_dtype_counts() - expected = Series({'int64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('int64') df = DataFrame({'a': 1.}, index=range(3)) - result = df.get_dtype_counts() - expected = Series({'float64': 1}) - tm.assert_series_equal(result, expected) + assert df.dtypes.iloc[0] == np.dtype('float64') # with object list df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3], 'c': list('abcd'), 'd': [datetime(2000, 1, 1) for i in range(4)], 'e': [1., 2, 4., 7]}) - result = df.get_dtype_counts() - expected = Series( - {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1}) - result = result.sort_index() - expected = expected.sort_index() + result = df.dtypes + expected = Series([np.dtype('int64'), + np.dtype('float64'), + np.dtype('object'), + np.dtype('datetime64[ns]'), + np.dtype('float64')], + index=list('abcde')) tm.assert_series_equal(result, expected) def test_constructor_frame_copy(self, float_frame): @@ -2077,16 +2053,19 @@ def test_from_records_misc_brokenness(self): rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) - results = df2_obj.get_dtype_counts() - expected = Series({'datetime64[ns]': 1, 'object': 1}) + result = df2_obj.dtypes + expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], + index=['date', 'test']) + tm.assert_series_equal(result, expected) rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) - results = df2_obj.get_dtype_counts().sort_index() - expected = Series({'datetime64[ns]': 1, 'int64': 1}) - tm.assert_series_equal(results, expected) + result = df2_obj.dtypes + expected = Series([np.dtype('datetime64[ns]'), np.dtype('int64')], + index=['date', 'test']) + tm.assert_series_equal(result, expected) def test_from_records_empty(self): # 3562 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index f68770d796292b..51578ba20b0471 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -836,23 +836,28 @@ def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), B=Series([timedelta(days=i) for i in range(3)]))) - result = 
df.get_dtype_counts().sort_index() - expected = Series( - {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index() + result = df.dtypes + expected = Series([np.dtype('datetime64[ns]'), + np.dtype('timedelta64[ns]')], + index=list("AB")) assert_series_equal(result, expected) df['C'] = df['A'] + df['B'] - expected = Series( - {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values() - result = df.get_dtype_counts().sort_values() + result = df.dtypes + expected = Series([np.dtype('datetime64[ns]'), + np.dtype('timedelta64[ns]'), + np.dtype('datetime64[ns]')], + index=list("ABC")) assert_series_equal(result, expected) # mixed int types df['D'] = 1 - expected = Series({'datetime64[ns]': 2, - 'timedelta64[ns]': 1, - 'int64': 1}).sort_values() - result = df.get_dtype_counts().sort_values() + result = df.dtypes + expected = Series([np.dtype('datetime64[ns]'), + np.dtype('timedelta64[ns]'), + np.dtype('datetime64[ns]'), + np.dtype('int64')], + index=list("ABCD")) assert_series_equal(result, expected) def test_arg_for_errors_in_astype(self): diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 4c1abfb1a7f6fb..f8af942f676579 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -300,15 +300,23 @@ def test_getitem_boolean_casting(self, datetime_frame): df['F1'] = df['F'].copy() casted = df[df > 0] - result = casted.get_dtype_counts() - expected = Series({'float64': 4, 'int32': 2, 'int64': 2}) + result = casted.dtypes + expected = Series([np.dtype('float64')] * 4 + + [np.dtype('int32')] * 2 + + [np.dtype('int64')] * 2, + index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) assert_series_equal(result, expected) # int block splitting df.loc[df.index[1:3], ['E1', 'F1']] = 0 casted = df[df > 0] - result = casted.get_dtype_counts() - expected = Series({'float64': 6, 'int32': 1, 'int64': 1}) + result = casted.dtypes + expected = Series([np.dtype('float64')] * 4 + + [np.dtype('int32')] + + [np.dtype('float64')] + + [np.dtype('int64')] + + [np.dtype('float64')], + index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) assert_series_equal(result, expected) # where dtype conversions @@ -615,8 +623,9 @@ def test_setitem_cast(self, float_frame): df = DataFrame(np.random.rand(30, 3), columns=tuple('ABC')) df['event'] = np.nan df.loc[10, 'event'] = 'foo' - result = df.get_dtype_counts().sort_values() - expected = Series({'float64': 3, 'object': 1}).sort_values() + result = df.dtypes + expected = Series([np.dtype('float64')] * 3 + [np.dtype('object')], + index=['A', 'B', 'C', 'event']) assert_series_equal(result, expected) # Test that data type is preserved . 
#5782 @@ -1614,8 +1623,10 @@ def test_setitem_single_column_mixed_datetime(self): df['timestamp'] = Timestamp('20010102') # check our dtypes - result = df.get_dtype_counts() - expected = Series({'float64': 3, 'datetime64[ns]': 1}) + result = df.dtypes + expected = Series([np.dtype('float64')] * 3 + + [np.dtype('datetime64[ns]')], + index=['foo', 'bar', 'baz', 'timestamp']) assert_series_equal(result, expected) # set an allowable datetime64 type @@ -2637,13 +2648,17 @@ def _check_get(df, cond, check_dtypes=True): for c in ['float32', 'float64', 'int32', 'int64']}) df.iloc[1, :] = 0 - result = df.where(df >= 0).get_dtype_counts() + result = df.dtypes + expected = Series([np.dtype('float32'), + np.dtype('float64'), + np.dtype('int32'), + np.dtype('int64')], + index=['float32', 'float64', 'int32', 'int64']) # when we don't preserve boolean casts # # expected = Series({ 'float32' : 1, 'float64' : 3 }) - expected = Series({'float32': 1, 'float64': 1, 'int32': 1, 'int64': 1}) assert_series_equal(result, expected) # aligning diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 807931567847f7..e40ae6dd5494dd 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -417,15 +417,13 @@ def test_fillna_downcast(self): def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) - result = df.get_dtype_counts().sort_values() - expected = Series({'object': 5}) + result = df.dtypes + expected = Series([np.dtype('object')] * 5, index=[1, 2, 3, 4, 5]) assert_series_equal(result, expected) result = df.fillna(1) expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) - result = result.get_dtype_counts().sort_values() - expected = Series({'int64': 5}) - assert_series_equal(result, expected) + assert_frame_equal(result, expected) # empty block df = DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index dc2ac5f728ec7e..ffc2a515bc4b72 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -158,17 +158,26 @@ def test_insert(self): # new item df['x'] = df['a'].astype('float32') - result = Series(dict(float32=1, float64=5)) - assert (df.get_dtype_counts().sort_index() == result).all() + result = df.dtypes + expected = Series([np.dtype('float64')] * 5 + [np.dtype('float32')], + index=['foo', 'c', 'bar', 'b', 'a', 'x']) + tm.assert_series_equal(result, expected) # replacing current (in different block) df['a'] = df['a'].astype('float32') - result = Series(dict(float32=2, float64=4)) - assert (df.get_dtype_counts().sort_index() == result).all() + result = df.dtypes + expected = Series([np.dtype('float64')] * 4 + + [np.dtype('float32')] * 2, + index=['foo', 'c', 'bar', 'b', 'a', 'x']) + tm.assert_series_equal(result, expected) df['y'] = df['a'].astype('int32') - result = Series(dict(float32=2, float64=4, int32=1)) - assert (df.get_dtype_counts().sort_index() == result).all() + result = df.dtypes + expected = Series([np.dtype('float64')] * 4 + + [np.dtype('float32')] * 2 + + [np.dtype('int32')], + index=['foo', 'c', 'bar', 'b', 'a', 'x', 'y']) + tm.assert_series_equal(result, expected) with pytest.raises(ValueError, match='already exists'): df.insert(1, 'a', df['b']) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 
ac8d1557a4c43c..04c1375418e674 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -502,29 +502,41 @@ def test_unstack_dtypes(self): [2, 2, 3, 4]] df = DataFrame(rows, columns=list('ABCD')) - result = df.get_dtype_counts() - expected = Series({'int64': 4}) + result = df.dtypes + expected = Series([np.dtype('int64')] * 4, + index=list('ABCD')) assert_series_equal(result, expected) # single dtype df2 = df.set_index(['A', 'B']) df3 = df2.unstack('B') - result = df3.get_dtype_counts() - expected = Series({'int64': 4}) + result = df3.dtypes + expected = Series([np.dtype('int64')] * 4, + index=pd.MultiIndex.from_arrays([ + ['C', 'C', 'D', 'D'], + [1, 2, 1, 2] + ], names=(None, 'B'))) assert_series_equal(result, expected) # mixed df2 = df.set_index(['A', 'B']) df2['C'] = 3. df3 = df2.unstack('B') - result = df3.get_dtype_counts() - expected = Series({'int64': 2, 'float64': 2}) + result = df3.dtypes + expected = Series([np.dtype('float64')] * 2 + [np.dtype('int64')] * 2, + index=pd.MultiIndex.from_arrays([ + ['C', 'C', 'D', 'D'], + [1, 2, 1, 2] + ], names=(None, 'B'))) assert_series_equal(result, expected) - df2['D'] = 'foo' df3 = df2.unstack('B') - result = df3.get_dtype_counts() - expected = Series({'float64': 2, 'object': 2}) + result = df3.dtypes + expected = Series([np.dtype('float64')] * 2 + [np.dtype('object')] * 2, + index=pd.MultiIndex.from_arrays([ + ['C', 'C', 'D', 'D'], + [1, 2, 1, 2] + ], names=(None, 'B'))) assert_series_equal(result, expected) # GH7405 diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index 5b2f846eccdd5c..b7c73daae00029 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -150,13 +150,18 @@ def test_frame_no_datetime64_dtype(self, tz): # GH#2810 (with timezones) datetimes_naive = [ts.to_pydatetime() for ts in dr] datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({'dr': dr, - 'dr_tz': dr_tz, - 'datetimes_naive': datetimes_naive, - 'datetimes_with_tz': datetimes_with_tz}) - result = df.get_dtype_counts().sort_index() - expected = Series({'datetime64[ns]': 2, - str(tz_expected): 2}).sort_index() + df = DataFrame({'dr': dr}) + df['dr_tz'] = dr_tz + df['datetimes_naive'] = datetimes_naive + df['datetimes_with_tz'] = datetimes_with_tz + result = df.dtypes + expected = Series([ + np.dtype('datetime64[ns]'), + DatetimeTZDtype(tz=tz), + np.dtype('datetime64[ns]'), + DatetimeTZDtype(tz=tz) + ], + index=['dr', 'dr_tz', 'datetimes_naive', 'datetimes_with_tz']) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index b1a083213debd6..e8343a1cf318b9 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -932,3 +932,9 @@ def test_deprecated_to_dense(self): with tm.assert_produces_warning(FutureWarning): result = ser.to_dense() tm.assert_series_equal(result, ser) + + def test_deprecated_get_dtype_counts(self): + # GH 18262 + df = DataFrame([1]) + with tm.assert_produces_warning(FutureWarning): + df.get_dtype_counts() diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0fb8673e6274a4..8f57254eae2193 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -179,8 +179,9 @@ def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = 
DataFrame({'foo1': np.random.randn(6), 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) - result = df.apply(lambda x: x, axis=1) - tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts()) + result = df.apply(lambda x: x, axis=1).dtypes + expected = df.dtypes + tm.assert_series_equal(result, expected) # GH 3610 incorrect dtype conversion with as_index=False df = DataFrame({"c1": [1, 2, 6, 6, 8]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dcd0d3938c6a57..d13dddac790420 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -88,10 +88,11 @@ def max_value(group): return group.loc[group['value'].idxmax()] applied = df.groupby('A').apply(max_value) - result = applied.get_dtype_counts().sort_values() - expected = Series({'float64': 2, - 'int64': 1, - 'object': 2}).sort_values() + result = applied.dtypes + expected = Series([np.dtype('object')] * 2 + + [np.dtype('float64')] * 2 + + [np.dtype('int64')], + index=['A', 'B', 'C', 'D', 'value']) assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index ec347396727182..00062b04d07d8a 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -1985,7 +1985,8 @@ def test_table_values_dtypes_roundtrip(self): df1['time2'] = Timestamp('20130102') store.append('df_mixed_dtypes1', df1) - result = store.select('df_mixed_dtypes1').get_dtype_counts() + result = store.select('df_mixed_dtypes1').dtypes.value_counts() + result.index = [str(i) for i in result.index] expected = Series({'float32': 2, 'float64': 1, 'int32': 1, 'bool': 1, 'int16': 1, 'int8': 1, 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7def8e53859c70..7795c356bf43ec 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -245,8 +245,9 @@ def test_pivot_dtypes(self): z = pivot_table(f, values='v', index=['a'], columns=[ 'i'], fill_value=0, aggfunc=np.sum) - result = z.get_dtype_counts() - expected = Series(dict(int64=2)) + result = z.dtypes + expected = Series([np.dtype('int64')] * 2, + index=Index(list('ab'), name='i')) tm.assert_series_equal(result, expected) # cannot convert dtypes @@ -256,8 +257,9 @@ def test_pivot_dtypes(self): z = pivot_table(f, values='v', index=['a'], columns=[ 'i'], fill_value=0, aggfunc=np.mean) - result = z.get_dtype_counts() - expected = Series(dict(float64=2)) + result = z.dtypes + expected = Series([np.dtype('float64')] * 2, + index=Index(list('ab'), name='i')) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('columns,values', diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 283814d2375b1e..d0979fb86d36d5 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -101,7 +101,9 @@ def test_basic_types(self, sparse, dtype): dtype_name = self.effective_dtype(dtype).name expected = Series({dtype_name: 8}) - tm.assert_series_equal(result.get_dtype_counts(), expected) + result = result.dtypes.value_counts() + result.index = [str(i) for i in result.index] + tm.assert_series_equal(result, expected) result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) @@ -109,8 +111,10 @@ def test_basic_types(self, sparse, dtype): expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) expected = 
Series(expected_counts).sort_index() - tm.assert_series_equal(result.get_dtype_counts().sort_index(), - expected) + result = result.dtypes.value_counts() + result.index = [str(i) for i in result.index] + result = result.sort_index() + tm.assert_series_equal(result, expected) def test_just_na(self, sparse): just_na_list = [np.nan] diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 43fcddea3d964f..2cc2ad080eb4ce 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -122,9 +122,9 @@ def test_ser_flex_cmp_return_dtypes(self, opname): # GH#15115 ser = Series([1, 3, 2], index=range(3)) const = 2 - - result = getattr(ser, opname)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([1], ['bool'])) + result = getattr(ser, opname)(const).dtypes + expected = np.dtype('bool') + assert result == expected @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) def test_ser_flex_cmp_return_dtypes_empty(self, opname): @@ -132,9 +132,9 @@ def test_ser_flex_cmp_return_dtypes_empty(self, opname): ser = Series([1, 3, 2], index=range(3)) empty = ser.iloc[:0] const = 2 - - result = getattr(empty, opname)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([1], ['bool'])) + result = getattr(empty, opname)(const).dtypes + expected = np.dtype('bool') + assert result == expected @pytest.mark.parametrize('op', [operator.eq, operator.ne, operator.le, operator.lt, diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 287fd15ac3f08d..b17f24fef825eb 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -57,8 +57,6 @@ def test_dtype(self, datetime_series): # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): assert datetime_series.ftypes == 'float64:dense' - tm.assert_series_equal(datetime_series.get_dtype_counts(), - Series(1, ['float64'])) # GH18243 - Assert .get_ftype_counts is deprecated with tm.assert_produces_warning(FutureWarning): tm.assert_series_equal(datetime_series.get_ftype_counts(), diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 2d0b338ef53c00..d3e2e1357f9d7c 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -292,9 +292,8 @@ def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.loc[:9998] = np.nan sdf = df.to_sparse() - - result = sdf.get_dtype_counts() - expected = Series({'Sparse[float64, nan]': 4}) + result = sdf.dtypes + expected = Series(['Sparse[float64, nan]'] * 4) tm.assert_series_equal(result, expected) def test_shape(self, float_frame, float_frame_int_kind, @@ -902,7 +901,7 @@ def test_corr(self, float_frame): def test_describe(self, float_frame): float_frame['foo'] = np.nan - float_frame.get_dtype_counts() + float_frame.dtypes.value_counts() str(float_frame) desc = float_frame.describe() # noqa From a0bfbf0024b87afcaf66643119a8e665b402c6db Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 06:29:11 -0500 Subject: [PATCH 139/238] DOC: Started 0.25.1 whatsnew (#27191) --- doc/source/whatsnew/v0.24.1.rst | 2 - doc/source/whatsnew/v0.24.2.rst | 2 - doc/source/whatsnew/v0.25.1.rst | 171 ++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 doc/source/whatsnew/v0.25.1.rst diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 
9dffe1f0764774..1b0232cad7476b 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -1,5 +1,3 @@ -:orphan: - .. _whatsnew_0241: Whats new in 0.24.1 (February 3, 2019) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 21936ba270c776..da8064893e8a8d 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -1,5 +1,3 @@ -:orphan: - .. _whatsnew_0242: Whats new in 0.24.2 (March 12, 2019) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst new file mode 100644 index 00000000000000..8690e1974330b8 --- /dev/null +++ b/doc/source/whatsnew/v0.25.1.rst @@ -0,0 +1,171 @@ +:orphan: + +.. TODO. Remove the orphan tag. + +.. _whatsnew_0251: + +What's new in 0.25.1 (July XX, 2019) +------------------------------------ + +Enhancements +~~~~~~~~~~~~ + + +.. _whatsnew_0251.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- +- + +.. _whatsnew_0251.bug_fixes: + +Bug fixes +~~~~~~~~~ + + +Categorical +^^^^^^^^^^^ + +- +- +- + +Datetimelike +^^^^^^^^^^^^ + +- +- +- + +Timedelta +^^^^^^^^^ + +- +- +- + +Timezones +^^^^^^^^^ + +- +- +- + +Numeric +^^^^^^^ + +- +- +- + +Conversion +^^^^^^^^^^ + +- +- +- + +Strings +^^^^^^^ + +- +- +- + + +Interval +^^^^^^^^ + +- +- +- + +Indexing +^^^^^^^^ + +- +- +- + +Missing +^^^^^^^ + +- +- +- + +MultiIndex +^^^^^^^^^^ + +- +- +- + +I/O +^^^ + +- +- +- + +Plotting +^^^^^^^^ + +- +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- +- + +Reshaping +^^^^^^^^^ + +- +- +- + +Sparse +^^^^^^ + +- +- +- + + +Build Changes +^^^^^^^^^^^^^ + +- +- +- + +ExtensionArray +^^^^^^^^^^^^^^ + +- +- +- + +Other +^^^^^ + +- +- +- + +.. _whatsnew_0.251.contributors: + +Contributors +~~~~~~~~~~~~ + +.. TODO. Change to v0.25.0..HEAD + +.. 
contributors:: HEAD..HEAD From 212df86131d03f3fd8b3091065b0f4355dc0a68c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 06:29:52 -0500 Subject: [PATCH 140/238] Object dtype for empty describe (#27184) --- pandas/core/generic.py | 6 ++++-- pandas/tests/frame/test_analytics.py | 21 ++++++++++++++++++--- pandas/tests/series/test_analytics.py | 14 ++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0679aa27b1ad34..a11b6e3ac72832 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9652,6 +9652,7 @@ def describe_categorical_1d(data): objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] + dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] @@ -9676,9 +9677,10 @@ def describe_categorical_1d(data): # to maintain output shape consistency else: names += ['top', 'freq'] - result += [None, None] + result += [np.nan, np.nan] + dtype = 'object' - return pd.Series(result, index=names, name=data.name) + return pd.Series(result, index=names, name=data.name, dtype=dtype) def describe_1d(data): if is_bool_dtype(data): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 01a398584b5e1a..9921d91d6de8c2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -525,6 +525,17 @@ def test_bool_describe_in_mixed_frame(self): index=['count', 'unique', 'top', 'freq']) tm.assert_frame_equal(result, expected) + def test_describe_empty_object(self): + # https://github.com/pandas-dev/pandas/issues/27183 + df = pd.DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = pd.DataFrame({"A": [0, 0, np.nan, np.nan]}, dtype=object, + index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].describe() + tm.assert_frame_equal(result, expected) + def test_describe_bool_frame(self): # GH 13891 df = pd.DataFrame({ @@ -590,13 +601,17 @@ def test_describe_categorical(self): def test_describe_empty_categorical_column(self): # GH 26397 - # Ensure the index of an an empty categoric DataFrame column + # Ensure the index of an an empty categorical DataFrame column # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": Categorical([])}) result = df.describe() - expected = DataFrame({'empty_col': [0, 0, None, None]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame({'empty_col': [0, 0, np.nan, np.nan]}, + index=['count', 'unique', 'top', 'freq'], + dtype='object') tm.assert_frame_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) def test_describe_categorical_columns(self): # GH 11558 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index df69bb35115cfe..e48fd9ce11a7d4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -42,6 +42,20 @@ def test_describe(self): index=['count', 'unique', 'top', 'freq']) tm.assert_series_equal(result, expected) + def test_describe_empty_object(self): + # https://github.com/pandas-dev/pandas/issues/27183 + s = pd.Series([None, None], dtype=object) + result = s.describe() + expected = pd.Series([0, 0, np.nan, np.nan], dtype=object, + index=['count', 'unique', 'top', 'freq']) + tm.assert_series_equal(result, expected) + + result = s[:0].describe() + 
tm.assert_series_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2]) + assert np.isnan(result.iloc[3]) + def test_describe_with_tz(self, tz_naive_fixture): # GH 21332 tz = tz_naive_fixture From 9ac75caa4b531a2b4c7d7368101f8ec7ebc32f0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 06:30:13 -0500 Subject: [PATCH 141/238] DOC: whatsnew for array_ufunc (#27188) --- doc/source/whatsnew/v0.25.0.rst | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9630595b6ac1b5..be6de0a4c6805d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -643,6 +643,48 @@ previous behavior of returning overlapping matches. s[idxr] s.loc[idxr] + +.. _whatsnew_0250.api_breaking.ufunc: + +Binary ufuncs on Series now align +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Applying a binary ufunc like :func:`numpy.power` now aligns the inputs +when both are :class:`Series` (:issue:`23293`). + +.. ipython:: python + + s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + s2 = pd.Series([3, 4, 5], index=['d', 'c', 'b']) + s1 + s2 + +*Previous behavior* + +.. code-block:: python + + In [5]: np.power(s1, s2) + Out[5]: + a 1 + b 16 + c 243 + dtype: int64 + +*New behavior* + +.. ipython:: python + + np.power(s1, s2) + +This matches the behavior of other binary operations in pandas, like :meth:`Series.add`. +To retain the previous behavior, convert the other ``Series`` to an array before +applying the ufunc. + +.. ipython:: python + + np.power(s1, s2.array) + + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies From c0a496456591fc68a01b9107cd6c3e77a310b7c5 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Wed, 3 Jul 2019 14:32:53 +0300 Subject: [PATCH 142/238] DEPR: join_axes-kwarg in pd.concat (#22318) --- doc/source/user_guide/merging.rst | 18 +++++++++------- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/frame.py | 10 ++++----- pandas/core/generic.py | 3 ++- pandas/core/groupby/generic.py | 6 ++++-- pandas/core/reshape/concat.py | 33 ++++++++++++++++++----------- pandas/tests/reshape/test_concat.py | 20 +++++++++++++++++ 7 files changed, 63 insertions(+), 28 deletions(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 43d44ff30c64a5..6e63e672bb9681 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -70,9 +70,8 @@ some configurable handling of "what to do with the other axes": :: - pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, - copy=True) + pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, + levels=None, names=None, verify_integrity=False, copy=True) * ``objs`` : a sequence or mapping of Series or DataFrame objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless @@ -87,8 +86,6 @@ some configurable handling of "what to do with the other axes": n - 1. This is useful if you are concatenating objects where the concatenation axis does not have meaningful indexing information. Note the index values on the other axes are still respected in the join. -* ``join_axes`` : list of Index objects. Specific indexes to use for the other - n - 1 axes instead of performing inner/outer set logic. * ``keys`` : sequence, default None. 
Construct hierarchical index using the passed keys as the outermost level. If multiple levels passed, should contain tuples. @@ -147,12 +144,11 @@ Set logic on the other axes When gluing together multiple DataFrames, you have a choice of how to handle the other axes (other than the one being concatenated). This can be done in -the following three ways: +the following two ways: * Take the union of them all, ``join='outer'``. This is the default option as it results in zero information loss. * Take the intersection, ``join='inner'``. -* Use a specific index, as passed to the ``join_axes`` argument. Here is an example of each of these methods. First, the default ``join='outer'`` behavior: @@ -202,7 +198,13 @@ DataFrame: .. ipython:: python - result = pd.concat([df1, df4], axis=1, join_axes=[df1.index]) + result = pd.concat([df1, df4], axis=1).reindex(df1.index) + +Similarly, we could index before the concatenation: + +.. ipython:: python + + pd.concat([df1, df4.reindex(df1.index)], axis=1) .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index be6de0a4c6805d..70c4b5e8ddbeb5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -809,6 +809,7 @@ Other deprecations - The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`). - Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`) +- :meth:`pandas.concat` has deprecated the ``join_axes``-keyword. Instead, use :meth:`DataFrame.reindex` or :meth:`DataFrame.reindex_like` on the result or on the inputs (:issue:`21951`) - The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). - The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. 
(:issue:`24416`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d3ce77c0684f91..df4be417e8d021 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6806,12 +6806,12 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', # join indexes only using concat if can_concat: if how == 'left': - how = 'outer' - join_axes = [self.index] + res = concat(frames, axis=1, join='outer', + verify_integrity=True) + return res.reindex(self.index, copy=False) else: - join_axes = None - return concat(frames, axis=1, join=how, join_axes=join_axes, - verify_integrity=True) + return concat(frames, axis=1, join=how, + verify_integrity=True) joined = frames[0] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a11b6e3ac72832..106af6e565f8a4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9716,7 +9716,8 @@ def describe_1d(data): if name not in names: names.append(name) - d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) + d = pd.concat([x.reindex(names, copy=False) for x in ldesc], + axis=1, sort=False) d.columns = data.columns.copy() return d diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 210e82837118c9..9e7dcafc0b1a4e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -562,8 +562,10 @@ def _transform_general(self, func, *args, **kwargs): applied.append(res) concat_index = obj.columns if self.axis == 0 else obj.index - concatenated = concat(applied, join_axes=[concat_index], - axis=self.axis, verify_integrity=False) + other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 + concatenated = concat(applied, axis=self.axis, verify_integrity=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, + copy=False) return self._set_result_index_ordered(concatenated) @Substitution(klass='DataFrame', selected='') diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4523a6ad48f19f..d4272cf6e406d3 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,6 +2,8 @@ concat routines """ +import warnings + import numpy as np import pandas.core.dtypes.concat as _concat @@ -44,8 +46,11 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, join : {'inner', 'outer'}, default 'outer' How to handle indexes on other axis (or axes). join_axes : list of Index objects + .. deprecated:: 0.25.0 + Specific indexes to use for the other n - 1 axes instead of performing - inner/outer set logic. + inner/outer set logic. Use .reindex() before or after concatenation + as a replacement. ignore_index : bool, default False If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are @@ -221,11 +226,11 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ... 
ValueError: Indexes have overlapping values: ['a'] """ - op = _Concatenator(objs, axis=axis, join_axes=join_axes, - ignore_index=ignore_index, join=join, - keys=keys, levels=levels, names=names, - verify_integrity=verify_integrity, + op = _Concatenator(objs, axis=axis, ignore_index=ignore_index, join=join, + join_axes=join_axes, keys=keys, levels=levels, + names=names, verify_integrity=verify_integrity, copy=copy, sort=sort) + return op.get_result() @@ -234,10 +239,9 @@ class _Concatenator: Orchestrates a concatenation operation for BlockManagers """ - def __init__(self, objs, axis=0, join='outer', join_axes=None, - keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True, - sort=False): + def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, + levels=None, names=None, ignore_index=False, + verify_integrity=False, copy=True, sort=False): if isinstance(objs, (NDFrame, str)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -310,9 +314,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if sum(obj.shape) > 0 or isinstance(obj, Series)] if (len(non_empties) and (keys is None and names is None and - levels is None and - join_axes is None and - not self.intersect)): + levels is None and not self.intersect)): objs = non_empties sample = objs[0] @@ -446,7 +448,14 @@ def _get_new_axes(self): if i == self.axis: continue new_axes[i] = self._get_comb_axis(i) + else: + # GH 21951 + warnings.warn( + 'The join_axes-keyword is deprecated. Use .reindex or ' + '.reindex_like on the result to achieve the same ' + 'functionality.', FutureWarning, stacklevel=4) + if len(self.join_axes) != ndim - 1: raise AssertionError("length of join_axes must be equal " "to {length}".format(length=ndim - 1)) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 74ede682dfb5f1..031f3abf31b163 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -722,6 +722,26 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + def test_concat_join_axes_deprecated(self, axis): + # GH21951 + one = pd.DataFrame([[0., 1.], [2., 3.]], columns=list('ab')) + two = pd.DataFrame([[10., 11.], [12., 13.]], index=[1, 2], + columns=list('bc')) + + expected = pd.concat([one, two], + axis=1, sort=False).reindex(index=two.index) + with tm.assert_produces_warning(expected_warning=FutureWarning): + result = pd.concat([one, two], + axis=1, sort=False, join_axes=[two.index]) + tm.assert_frame_equal(result, expected) + + expected = pd.concat([one, two], + axis=0, sort=False).reindex(columns=two.columns) + with tm.assert_produces_warning(expected_warning=FutureWarning): + result = pd.concat([one, two], + axis=0, sort=False, join_axes=[two.columns]) + tm.assert_frame_equal(result, expected) + class TestAppend: From 37b226817741bcf496fef4c2a29832d37de88234 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 3 Jul 2019 04:43:23 -0700 Subject: [PATCH 143/238] CLN: more blocks code out from try/excepts (#27200) --- pandas/core/internals/blocks.py | 77 +++++++++++-------------------- pandas/core/internals/managers.py | 4 +- 2 files changed, 30 insertions(+), 51 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6cfeb62ef736b8..a9b2c0491458cf 100644 --- a/pandas/core/internals/blocks.py +++ 
b/pandas/core/internals/blocks.py @@ -593,22 +593,21 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = astype_nansafe( - values.ravel(), dtype, copy=True, **kwargs) + vals1d = values.ravel() + values = astype_nansafe(vals1d, dtype, copy=True, **kwargs) # TODO(extension) # should we make this attribute? - try: + if isinstance(values, np.ndarray): values = values.reshape(self.shape) - except AttributeError: - pass - newb = make_block(values, placement=self.mgr_locs, - ndim=self.ndim) except Exception: # noqa: E722 if errors == 'raise': raise newb = self.copy() if copy else self + else: + newb = make_block(values, placement=self.mgr_locs, + ndim=self.ndim) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: @@ -1311,10 +1310,6 @@ def where(self, other, cond, align=True, errors='raise', # our where function def func(cond, values, other): - if cond.ravel().all(): - return values - - values = self._coerce_values(values) other = self._try_coerce_args(other) try: @@ -1331,20 +1326,24 @@ def func(cond, values, other): result.fill(np.nan) return result - # see if we can operate on the entire block, or need item-by-item - # or if we are a single block (ndim == 1) - try: - result = func(cond, values, other) - except TypeError: - - # we cannot coerce, return a compat dtype - # we are explicitly ignoring errors - block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond, align=align, - errors=errors, - try_cast=try_cast, axis=axis, - transpose=transpose) - return self._maybe_downcast(blocks, 'infer') + if cond.ravel().all(): + result = values + else: + # see if we can operate on the entire block, or need item-by-item + # or if we are a single block (ndim == 1) + values = self._coerce_values(values) + try: + result = func(cond, values, other) + except TypeError: + + # we cannot coerce, return a compat dtype + # we are explicitly ignoring errors + block = self.coerce_to_target_dtype(other) + blocks = block.where(orig_other, cond, align=align, + errors=errors, + try_cast=try_cast, axis=axis, + transpose=transpose) + return self._maybe_downcast(blocks, 'infer') if self._can_hold_na or self.ndim == 1: @@ -1456,7 +1455,8 @@ def quantile(self, qs, interpolation='linear', axis=0): len(qs)) else: # asarray needed for Sparse, see GH#24600 - # TODO: Why self.values and not values? 
+ # Note: we use self.values below instead of values because the + # `asi8` conversion above will behave differently under `isna` mask = np.asarray(isna(self.values)) result = nanpercentile(values, np.array(qs) * 100, axis=axis, na_value=self.fill_value, @@ -2652,10 +2652,9 @@ def convert(self, *args, **kwargs): def f(m, v, i): shape = v.shape values = fn(v.ravel(), **fn_kwargs) - try: + if isinstance(values, np.ndarray): + # TODO: allow EA once reshape is supported values = values.reshape(shape) - except (AttributeError, NotImplementedError): - pass values = _block_shape(values, ndim=self.ndim) return values @@ -2669,26 +2668,6 @@ def f(m, v, i): return blocks - def set(self, locs, values): - """ - Modify Block in-place with new item value - - Returns - ------- - None - """ - try: - self.values[locs] = values - except (ValueError): - - # broadcasting error - # see GH6171 - new_shape = list(values.shape) - new_shape[0] = len(self.items) - self.values = np.empty(tuple(new_shape), dtype=self.dtype) - self.values.fill(np.nan) - self.values[locs] = values - def _maybe_downcast(self, blocks, downcast=None): if downcast is not None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5494b75ff9e4e9..b02d40d2362211 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1214,7 +1214,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: _, fill_value = maybe_promote(blk.dtype) - fill_tuple = (fill_value, ) + fill_tuple = (fill_value,) return [blk.take_nd(slobj, axis=0, new_mgr_locs=slice(0, sllen), @@ -1579,7 +1579,7 @@ def create_block_manager_from_blocks(blocks, axes): mgr._consolidate_inplace() return mgr - except (ValueError) as e: + except ValueError as e: blocks = [getattr(b, 'values', b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) construction_error(tot_items, blocks[0].shape[1:], axes, e) From 002b2c37f37479532e5186fdb9c97f31630ba5d7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 08:16:47 -0500 Subject: [PATCH 144/238] REGR: Group empty Series (#27194) --- pandas/core/groupby/grouper.py | 5 ++++- pandas/tests/groupby/test_grouping.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d0f28bed4399ba..9e1033be26df2c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -606,8 +606,11 @@ def is_in_obj(gpr): groupings.append(ping) - if len(groupings) == 0: + if len(groupings) == 0 and len(obj): raise ValueError('No group keys passed!') + elif len(groupings) == 0: + groupings.append(Grouping(Index([], dtype='int'), + np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4c84c29ff98cb2..5508c290b04298 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -569,6 +569,27 @@ def test_evaluate_with_empty_groups(self, func, expected): result = getattr(g[2], func)(lambda x: x) assert_series_equal(result, expected) + def test_groupby_empty(self): + # https://github.com/pandas-dev/pandas/issues/27190 + s = pd.Series([], name='name') + gr = s.groupby([]) + + result = gr.mean() + tm.assert_series_equal(result, s) + + # check group properties + 
assert len(gr.grouper.groupings) == 1 + tm.assert_numpy_array_equal(gr.grouper.group_info[0], + np.array([], dtype=np.dtype("intp"))) + + tm.assert_numpy_array_equal(gr.grouper.group_info[1], + np.array([], dtype=np.dtype('int'))) + + assert gr.grouper.group_info[2] == 0 + + # check name + assert s.groupby(s).grouper.names == ['name'] + # get_group # -------------------------------- From f5038de41bcf7f921fc481baec7c8e01af89c38e Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Wed, 3 Jul 2019 15:17:22 +0200 Subject: [PATCH 145/238] improve test for timestamp addition/subtraction (#27192) * improve test for timestamp addition/subtraction * fix bug in test * add test cases --- .../tests/scalar/timestamp/test_arithmetic.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 8310b140b50e00..4f20bdbd65ba13 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -96,9 +96,16 @@ def test_addition_subtraction_types(self): assert type(ts + td64) == Timestamp assert type(ts - td64) == Timestamp - def test_addition_subtraction_preserve_frequency(self): - ts = Timestamp('2014-03-05', freq='D') - td = timedelta(days=1) + @pytest.mark.parametrize('freq, td, td64', [ + ('S', timedelta(seconds=1), np.timedelta64(1, 's')), + ('min', timedelta(minutes=1), np.timedelta64(1, 'm')), + ('H', timedelta(hours=1), np.timedelta64(1, 'h')), + ('D', timedelta(days=1), np.timedelta64(1, 'D')), + ('W', timedelta(weeks=1), np.timedelta64(1, 'W')), + ('M', None, np.timedelta64(1, 'M')) + ]) + def test_addition_subtraction_preserve_frequency(self, freq, td, td64): + ts = Timestamp('2014-03-05 00:00:00', freq=freq) original_freq = ts.freq with tm.assert_produces_warning(FutureWarning): @@ -106,10 +113,14 @@ def test_addition_subtraction_preserve_frequency(self): assert (ts + 1).freq == original_freq assert (ts - 1).freq == original_freq - assert (ts + td).freq == original_freq - assert (ts - td).freq == original_freq + assert (ts + 1 * original_freq).freq == original_freq + assert (ts - 1 * original_freq).freq == original_freq + + if td is not None: + # timedelta does not support months as unit + assert (ts + td).freq == original_freq + assert (ts - td).freq == original_freq - td64 = np.timedelta64(1, 'D') assert (ts + td64).freq == original_freq assert (ts - td64).freq == original_freq From 9bee33402a5ada02519057cc7690667e21b860da Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 3 Jul 2019 16:17:59 +0300 Subject: [PATCH 146/238] BUG: _convert_and_box_cache raise ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True (#26097) --- doc/source/whatsnew/v0.25.0.rst | 2 + pandas/core/index.py | 10 ++- pandas/core/tools/datetimes.py | 81 ++++++++++++++------ pandas/tests/indexes/datetimes/test_tools.py | 11 +++ 4 files changed, 79 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 70c4b5e8ddbeb5..f87a702e1c5b58 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -910,6 +910,8 @@ Datetimelike - Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`) - Bug where adding :class:`Timestamp` to a ``np.timedelta64`` object would raise instead of returning a :class:`Timestamp` 
(:issue:`24775`) - Bug where comparing a zero-dimensional numpy array containing a ``np.datetime64`` object to a :class:`Timestamp` would incorrect raise ``TypeError`` (:issue:`26916`) +- Bug in :func:`to_datetime` which would raise ``ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True`` when called with ``cache=True``, with ``arg`` including datetime strings with different offset (:issue:`26097`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/core/index.py b/pandas/core/index.py index 2d1c22f5623a19..f14f32c67d4e15 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,3 +1,7 @@ -# flake8: noqa -from pandas.core.indexes.api import * -from pandas.core.indexes.multi import _sparsify +from pandas.core.indexes.api import ( # noqa:F401 + CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index, + IntervalIndex, InvalidIndexError, MultiIndex, NaT, NumericIndex, + PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, _all_indexes_same, + _get_combined_index, _get_consensus_names, _get_objs_combined_axis, + _new_Index, _union_indexes, ensure_index, ensure_index_from_sequences) +from pandas.core.indexes.multi import _sparsify # noqa:F401 diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 5893ff0e0dd8fe..d543ae91ad344f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,6 +1,7 @@ from collections import abc from datetime import datetime, time from functools import partial +from typing import Optional, TypeVar, Union import numpy as np @@ -14,12 +15,25 @@ from pandas.core.dtypes.common import ( ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_scalar) -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + is_list_like, is_numeric_dtype, is_scalar) +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries) from pandas.core.dtypes.missing import notna +from pandas._typing import ArrayLike from pandas.core import algorithms +# --------------------------------------------------------------------- +# types used in annotations + +Scalar = Union[int, float, str] +DatetimeScalar = TypeVar('DatetimeScalar', Scalar, datetime) +DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, list, tuple, + ArrayLike, ABCSeries] + + +# --------------------------------------------------------------------- + def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element @@ -60,7 +74,43 @@ def _maybe_cache(arg, format, cache, convert_listlike): return cache_array -def _convert_and_box_cache(arg, cache_array, box, errors, name=None): +def _box_as_indexlike( + dt_array: ArrayLike, + utc: Optional[bool] = None, + name: Optional[str] = None +) -> Union[ABCIndex, ABCDatetimeIndex]: + """ + Properly boxes the ndarray of datetimes to DatetimeIndex + if it is possible or to generic Index instead + + Parameters + ---------- + dt_array: 1-d array + array of datetimes to be boxed + tz : object + None or 'utc' + name : string, default None + Name for a resulting index + + Returns + ------- + result : datetime of converted dates + - DatetimeIndex if convertible to sole datetime64 type + - general Index otherwise + """ + from pandas import DatetimeIndex, Index + if is_datetime64_dtype(dt_array): + tz = 'utc' if utc else None + return DatetimeIndex(dt_array, tz=tz, 
name=name) + return Index(dt_array, name=name) + + +def _convert_and_box_cache( + arg: DatetimeScalarOrArrayConvertible, + cache_array: ABCSeries, + box: bool, + name: Optional[str] = None +) -> Union[ABCIndex, np.ndarray]: """ Convert array of dates with a cache and box the result @@ -71,26 +121,19 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None): Cache of converted, unique dates box : boolean True boxes result as an Index-like, False returns an ndarray - errors : string - 'ignore' plus box=True will convert result to Index name : string, default None Name for a DatetimeIndex Returns ------- result : datetime of converted dates - Returns: - - Index-like if box=True - ndarray if box=False """ - from pandas import Series, DatetimeIndex, Index + from pandas import Series result = Series(arg).map(cache_array) if box: - if errors == 'ignore': - return Index(result, name=name) - else: - return DatetimeIndex(result, name=name) + return _box_as_indexlike(result, utc=None, name=name) return result.values @@ -118,7 +161,6 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): - Index-like if box=True - ndarray of Timestamps if box=False - """ if tz is not None: raise ValueError("Cannot pass a tz argument when " @@ -324,13 +366,8 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, return np.array(result, dtype=object) if box: - # Ensure we return an Index in all cases where box=True - if is_datetime64_dtype(result): - return DatetimeIndex(result, tz=tz, name=name) - elif is_object_dtype(result): - # e.g. an Index of datetime objects - from pandas import Index - return Index(result, name=name) + utc = tz == 'utc' + return _box_as_indexlike(result, utc=utc, name=name) return result @@ -611,7 +648,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, errors, + result = _convert_and_box_cache(arg, cache_array, box, name=arg.name) else: convert_listlike = partial(convert_listlike, name=arg.name) @@ -619,7 +656,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, errors) + result = _convert_and_box_cache(arg, cache_array, box) else: result = convert_listlike(arg, box, format) else: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index a971a1088860a7..f401a7f7c9e9b9 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -504,6 +504,17 @@ def test_to_datetime_tz(self, cache): with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_different_offsets(self, cache): + # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark + # see GH-26097 for more + ts_string_1 = 'March 1, 2018 12:00:00+0400' + ts_string_2 = 'March 1, 2018 12:00:00+0500' + arr = [ts_string_1] * 5 + [ts_string_2] * 5 + expected = pd.Index([parse(x) for x in arr]) + result = pd.to_datetime(arr, cache=cache) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_tz_pytz(self, cache): # see gh-8260 From 
3e4f000ea5fd05c85b2b0f45bc16b1943a0df555 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 08:18:53 -0500 Subject: [PATCH 147/238] DEPR: Deprecate outer ufunc in Series.__array_ufunc__ (#27198) * DEPR: Deprecate outer ufunc in Series.__array_ufunc__ Closes https://github.com/pandas-dev/pandas/issues/27186 --- doc/source/getting_started/dsintro.rst | 5 +++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/series.py | 13 +++++++++++++ pandas/tests/series/test_ufunc.py | 15 +++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 33e5d390447d77..2fb0b163642c5a 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -753,6 +753,11 @@ The ufunc is applied to the underlying array in a Series. ser = pd.Series([1, 2, 3, 4]) np.exp(ser) +.. versionchanged:: 0.25.0 + + When multiple ``Series`` are passed to a ufunc, they are aligned before + performing the operation. + Like other parts of the library, pandas will automatically align labeled inputs as part of a ufunc with multiple inputs. For example, using :meth:`numpy.remainder` on two :class:`Series` with differently ordered labels will align before the operation. diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f87a702e1c5b58..ea6a04ac726b76 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -821,6 +821,7 @@ Other deprecations - The :meth:`Series.get_values`, :meth:`DataFrame.get_values`, :meth:`Index.get_values`, :meth:`SparseArray.get_values` and :meth:`Categorical.get_values` methods are deprecated. One of ``np.asarray(..)`` or :meth:`~Series.to_numpy` can be used instead (:issue:`19617`). +- The 'outer' method on NumPy ufuncs, e.g. ``np.subtract.outer`` has been deprecated on :class:`Series` objects. Convert the input to an array with :attr:`Series.array` first (:issue:`27186`) - :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) - :func:`read_table` has been undeprecated. (:issue:`25220`) - :attr:`Index.dtype_str` is deprecated. (:issue:`18262`) diff --git a/pandas/core/series.py b/pandas/core/series.py index f5f9f1ab4f9ab4..acae4b0449f724 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -786,6 +786,19 @@ def __array_ufunc__( def construct_return(result): if lib.is_scalar(result): return result + elif result.ndim > 1: + # e.g. np.subtract.outer + if method == 'outer': + msg = ( + "outer method for ufunc {} is not implemented on " + "pandas objects. Returning an ndarray, but in the " + "future this will raise a 'NotImplementedError'. " + "Consider explicitly converting the Series " + "to an array with '.array' first." 
+ ) + warnings.warn(msg.format(ufunc), FutureWarning, + stacklevel=3) + return result return self._constructor(result, index=index, name=name, diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 1a0eeb51c4921a..183aa6e3933553 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -310,3 +310,18 @@ def __repr__(self): result = np.add(s, Thing(1)) expected = pd.Series([Thing(2), Thing(3)]) tm.assert_series_equal(result, expected) + + +def test_outer(): + # https://github.com/pandas-dev/pandas/issues/27186 + s = pd.Series([1, 2, 3]) + o = np.array([1, 2, 3]) + + with tm.assert_produces_warning(FutureWarning): + result = np.subtract.outer(s, o) + expected = np.array([ + [0, -1, -2], + [1, 0, -1], + [2, 1, 0] + ], dtype=np.dtype('int64')) + tm.assert_numpy_array_equal(result, expected) From 79feb99fbca5e6aacf222442f5df381a0b116151 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 10:50:34 -0500 Subject: [PATCH 148/238] BUG: Fixup bench for deprecation (#27207) --- asv_bench/benchmarks/categoricals.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 790157497ca36b..f1afca5941fe50 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pandas as pd import pandas.util.testing as tm @@ -133,15 +131,10 @@ def setup(self): self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = self.s_str.astype('category') - with warnings.catch_warnings(record=True): - self.s_str_cat_ordered = self.s_str.astype('category', - ordered=True) - + self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered() self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) self.s_int_cat = self.s_int.astype('category') - with warnings.catch_warnings(record=True): - self.s_int_cat_ordered = self.s_int.astype('category', - ordered=True) + self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered() def time_rank_string(self): self.s_str.rank() From 4e185fcaedfe75050a3aa4e9fa175f9579825388 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 3 Jul 2019 13:02:35 -0400 Subject: [PATCH 149/238] CLN/DEPR: Final panel removal (#27101) --- doc/source/getting_started/basics.rst | 1 + doc/source/whatsnew/v0.11.0.rst | 34 +- doc/source/whatsnew/v0.15.0.rst | 19 +- doc/source/whatsnew/v0.15.2.rst | 13 +- pandas/__init__.py | 27 +- pandas/core/api.py | 1 - pandas/core/dtypes/generic.py | 1 - pandas/core/internals/concat.py | 2 - pandas/core/internals/managers.py | 2 +- pandas/core/panel.py | 1563 ------------------------- pandas/io/packers.py | 7 +- pandas/tests/api/test_api.py | 6 +- 12 files changed, 78 insertions(+), 1598 deletions(-) delete mode 100644 pandas/core/panel.py diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index e1508cb7b4e165..682d6c1ef8301f 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -784,6 +784,7 @@ In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. For example, we can fit a regression using statsmodels. Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.ols, 'data')`` to ``pipe``: .. 
ipython:: python + :okwarning: import statsmodels.formula.api as sm diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 03480ebeed78ee..148ee349b049c0 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -377,15 +377,31 @@ Enhancements - ``Squeeze`` to possibly remove length 1 dimensions from an object. - .. ipython:: python - :okwarning: - - p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'], - major_axis=pd.date_range('20010102', periods=4), - minor_axis=['A', 'B', 'C', 'D']) - p - p.reindex(items=['ItemA']).squeeze() - p.reindex(items=['ItemA'], minor=['B']).squeeze() + .. code-block:: python + + >>> p = pd.Panel(np.random.randn(3, 4, 4), items=['ItemA', 'ItemB', 'ItemC'], + ... major_axis=pd.date_range('20010102', periods=4), + ... minor_axis=['A', 'B', 'C', 'D']) + >>> p + + Dimensions: 3 (items) x 4 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemC + Major_axis axis: 2001-01-02 00:00:00 to 2001-01-05 00:00:00 + Minor_axis axis: A to D + + >>> p.reindex(items=['ItemA']).squeeze() + A B C D + 2001-01-02 0.926089 -2.026458 0.501277 -0.204683 + 2001-01-03 -0.076524 1.081161 1.141361 0.479243 + 2001-01-04 0.641817 -0.185352 1.824568 0.809152 + 2001-01-05 0.575237 0.669934 1.398014 -0.399338 + + >>> p.reindex(items=['ItemA'], minor=['B']).squeeze() + 2001-01-02 -2.026458 + 2001-01-03 1.081161 + 2001-01-04 -0.185352 + 2001-01-05 0.669934 + Freq: D, Name: B, dtype: float64 - In ``pd.io.data.Options``, diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index bea2ce815d243a..c27ada6ef3b585 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -701,14 +701,19 @@ Other notable API changes: This can also be seen in multi-axis indexing with a ``Panel``. - .. ipython:: python - :okwarning: + .. code-block:: python + + >>> p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4), + ... items=['ItemA', 'ItemB'], + ... major_axis=[1, 2, 3], + ... minor_axis=['A', 'B', 'C', 'D']) + >>> p + + Dimensions: 2 (items) x 3 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemB + Major_axis axis: 1 to 3 + Minor_axis axis: A to D - p = pd.Panel(np.arange(2 * 3 * 4).reshape(2, 3, 4), - items=['ItemA', 'ItemB'], - major_axis=[1, 2, 3], - minor_axis=['A', 'B', 'C', 'D']) - p The following would raise ``KeyError`` prior to 0.15.0: diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index a41ad5bdf8cd66..b58eabaed6127c 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -160,11 +160,16 @@ Other enhancements: - ``Panel`` now supports the ``all`` and ``any`` aggregation functions. (:issue:`8302`): - .. ipython:: python - :okwarning: + .. code-block:: python - p = pd.Panel(np.random.rand(2, 5, 4) > 0.1) - p.all() + >>> p = pd.Panel(np.random.rand(2, 5, 4) > 0.1) + >>> p.all() + 0 1 2 3 + 0 True True True True + 1 True False True True + 2 True True True True + 3 False True False True + 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. 
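
The ``pandas/__init__.py`` hunk that follows keeps ``pandas.Panel`` reachable on Python 3.7+ by using the module-level ``__getattr__`` hook from PEP 562: the name resolves to an empty placeholder class while emitting a ``FutureWarning``. A minimal sketch of that pattern is shown below; the ``mylib`` / ``OldThing`` names are hypothetical stand-ins, not anything defined in this patch:

.. code-block:: python

    # mylib/__init__.py -- sketch of a PEP 562 deprecation shim (Python >= 3.7)
    import warnings


    def __getattr__(name):
        # Only called when normal module attribute lookup fails,
        # so every name that still exists is served as usual.
        if name == "OldThing":
            warnings.warn(
                "OldThing has been removed; accessing it from the "
                "top-level namespace will stop working in a future version",
                FutureWarning,
                stacklevel=2,
            )

            class OldThing:  # empty placeholder, like the Panel stub in the next hunk
                pass

            return OldThing
        raise AttributeError(
            "module {!r} has no attribute {!r}".format(__name__, name)
        )

With this in place, ``import mylib; mylib.OldThing`` emits the warning but still returns a class, while lookup of any other missing name keeps raising ``AttributeError`` as before.
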
diff --git a/pandas/__init__.py b/pandas/__init__.py index b95c312f12eedc..5b39d954c2bc33 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,8 +65,7 @@ # misc np, Grouper, factorize, unique, value_counts, NamedAgg, - array, Categorical, set_eng_float_format, Series, DataFrame, - Panel) + array, Categorical, set_eng_float_format, Series, DataFrame) from pandas.core.sparse.api import ( SparseArray, SparseDataFrame, SparseSeries, SparseDtype) @@ -118,6 +117,30 @@ __git_version__ = v.get('full-revisionid') del get_versions, v + +# GH 27101 +# TODO: remove Panel compat in 1.0 +if pandas.compat.PY37: + def __getattr__(name): + if name == 'Panel': + import warnings + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in " + "the next version", + FutureWarning, stacklevel=2) + + class Panel: + pass + + return Panel + raise AttributeError( + "module 'pandas' has no attribute '{}'".format(name)) +else: + class Panel: + pass + + # module level doc-string __doc__ = """ pandas - a powerful data analysis and manipulation library for Python diff --git a/pandas/core/api.py b/pandas/core/api.py index 0106feabcce741..e8d21080775da8 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -35,7 +35,6 @@ from pandas.core.series import Series from pandas.core.frame import DataFrame -from pandas.core.panel import Panel # TODO: Remove import when statsmodels updates #18264 from pandas.core.reshape.reshape import get_dummies diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 134ec95729833e..86aff93dfde143 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -45,7 +45,6 @@ def _check(cls, inst): ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp", ("sparse_frame", )) -ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', 'sparse_time_series')) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 8f699ae24230db..6900dfc3c76d87 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -376,8 +376,6 @@ def is_uniform_join_units(join_units): all(not ju.is_na or ju.block.is_extension for ju in join_units) and # no blocks with indexers (as then the dimensions do not fit) all(not ju.indexers for ju in join_units) and - # disregard Panels - all(ju.block.ndim <= 2 for ju in join_units) and # only use this path when there is something to concatenate len(join_units) > 1) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b02d40d2362211..cdf0826bbe21ef 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -42,7 +42,7 @@ class BlockManager(PandasObject): """ - Core internal data structure to implement DataFrame, Series, Panel, etc. + Core internal data structure to implement DataFrame, Series, etc. Manage a bunch of labeled 2D mixed-type ndarrays. 
Essentially it's a lightweight blocked set of labeled data to be manipulated by the DataFrame diff --git a/pandas/core/panel.py b/pandas/core/panel.py deleted file mode 100644 index 350c3083623eb5..00000000000000 --- a/pandas/core/panel.py +++ /dev/null @@ -1,1563 +0,0 @@ -""" -Contains data structures designed for manipulating panel (3-dimensional) data -""" -from collections import OrderedDict -import warnings - -import numpy as np - -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, deprecate_kwarg -from pandas.util._validators import validate_axis_style_args - -from pandas.core.dtypes.cast import ( - cast_scalar_to_array, infer_dtype_from_scalar, maybe_cast_item) -from pandas.core.dtypes.common import ( - is_integer, is_list_like, is_scalar, is_string_like) -from pandas.core.dtypes.missing import notna - -import pandas.core.common as com -from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame -from pandas.core.index import ( - Index, MultiIndex, _get_objs_combined_axis, ensure_index) -import pandas.core.indexes.base as ibase -from pandas.core.indexing import maybe_droplevels -from pandas.core.internals import ( - BlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks) -from pandas.core.reshape.util import cartesian_product -from pandas.core.series import Series - -from pandas.io.formats.printing import pprint_thing - -_shared_doc_kwargs = dict( - axes='items, major_axis, minor_axis', - klass="Panel", - axes_single_arg="{0, 1, 2, 'items', 'major_axis', 'minor_axis'}", - optional_mapper='', optional_axis='', optional_labels='') -_shared_doc_kwargs['args_transpose'] = ( - "{ax_single}\n\tThree positional arguments from given options.".format( - ax_single=_shared_doc_kwargs['axes_single_arg'])) - - -def _ensure_like_indices(time, panels): - """ - Makes sure that time and panels are conformable. - """ - n_time = len(time) - n_panel = len(panels) - u_panels = np.unique(panels) # this sorts! - u_time = np.unique(time) - if len(u_time) == n_time: - time = np.tile(u_time, len(u_panels)) - if len(u_panels) == n_panel: - panels = np.repeat(u_panels, len(u_time)) - return time, panels - - -def panel_index(time, panels, names=None): - """ - Returns a multi-index suitable for a panel-like DataFrame. - - Parameters - ---------- - time : array-like - Time index, does not have to repeat - panels : array-like - Panel index, does not have to repeat - names : list, optional - List containing the names of the indices - - Returns - ------- - multi_index : MultiIndex - Time index is the first level, the panels are the second level. 
- - Examples - -------- - >>> years = range(1960,1963) - >>> panels = ['A', 'B', 'C'] - >>> panel_idx = panel_index(years, panels) - >>> panel_idx - MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'), - (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'), - (1962, 'C')], dtype=object) - - or - - >>> years = np.repeat(range(1960,1963), 3) - >>> panels = np.tile(['A', 'B', 'C'], 3) - >>> panel_idx = panel_index(years, panels) - >>> panel_idx - MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'), - (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'), - (1962, 'C')], dtype=object) - """ - if names is None: - names = ['time', 'panel'] - time, panels = _ensure_like_indices(time, panels) - return MultiIndex.from_arrays([time, panels], sortorder=None, names=names) - - -class Panel(NDFrame): - """ - Represents wide format panel data, stored as 3-dimensional array. - - .. deprecated:: 0.20.0 - The recommended way to represent 3-D data are with a MultiIndex on a - DataFrame via the :attr:`~Panel.to_frame()` method or with the - `xarray package `__. - Pandas provides a :attr:`~Panel.to_xarray()` method to automate this - conversion. - - Parameters - ---------- - data : ndarray (items x major x minor), or dict of DataFrames - items : Index or array-like - axis=0 - major_axis : Index or array-like - axis=1 - minor_axis : Index or array-like - axis=2 - copy : boolean, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input - dtype : dtype, default None - Data type to force, otherwise infer - """ - - @property - def _constructor(self): - return type(self) - - _constructor_sliced = DataFrame - - def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, - copy=False, dtype=None): - # deprecation GH13563 - warnings.warn("\nPanel is deprecated and will be removed in a " - "future version.\nThe recommended way to represent " - "these types of 3-dimensional data are with a " - "MultiIndex on a DataFrame, via the " - "Panel.to_frame() method\n" - "Alternatively, you can use the xarray package " - "http://xarray.pydata.org/en/stable/.\n" - "Pandas provides a `.to_xarray()` method to help " - "automate this conversion.\n", - FutureWarning, stacklevel=3) - - self._init_data(data=data, items=items, major_axis=major_axis, - minor_axis=minor_axis, copy=copy, dtype=dtype) - - def _init_data(self, data, copy, dtype, **kwargs): - """ - Generate ND initialization; axes are passed - as required objects to __init__. 
- """ - if data is None: - data = {} - if dtype is not None: - dtype = self._validate_dtype(dtype) - - passed_axes = [kwargs.pop(a, None) for a in self._AXIS_ORDERS] - - if kwargs: - raise TypeError('_init_data() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) - - axes = None - if isinstance(data, BlockManager): - if com._any_not_none(*passed_axes): - axes = [x if x is not None else y - for x, y in zip(passed_axes, data.axes)] - mgr = data - elif isinstance(data, dict): - mgr = self._init_dict(data, passed_axes, dtype=dtype) - copy = False - dtype = None - elif isinstance(data, (np.ndarray, list)): - mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) - copy = False - dtype = None - elif is_scalar(data) and com._all_not_none(*passed_axes): - values = cast_scalar_to_array([len(x) for x in passed_axes], - data, dtype=dtype) - mgr = self._init_matrix(values, passed_axes, dtype=values.dtype, - copy=False) - copy = False - else: # pragma: no cover - raise ValueError('Panel constructor not properly called!') - - NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype) - - def _init_dict(self, data, axes, dtype=None): - haxis = axes.pop(self._info_axis_number) - - # prefilter if haxis passed - if haxis is not None: - haxis = ensure_index(haxis) - data = OrderedDict((k, v) - for k, v in data.items() - if k in haxis) - else: - keys = com.dict_keys_to_ordered_list(data) - haxis = Index(keys) - - for k, v in data.items(): - if isinstance(v, dict): - data[k] = self._constructor_sliced(v) - - # extract axis for remaining axes & create the slicemap - raxes = [self._extract_axis(self, data, axis=i) if a is None else a - for i, a in enumerate(axes)] - raxes_sm = self._extract_axes_for_slice(self, raxes) - - # shallow copy - arrays = [] - haxis_shape = [len(a) for a in raxes] - for h in haxis: - v = values = data.get(h) - if v is None: - values = np.empty(haxis_shape, dtype=dtype) - values.fill(np.nan) - elif isinstance(v, self._constructor_sliced): - d = raxes_sm.copy() - d['copy'] = False - v = v.reindex(**d) - if dtype is not None: - v = v.astype(dtype) - values = v.values - arrays.append(values) - - return self._init_arrays(arrays, haxis, [haxis] + raxes) - - def _init_arrays(self, arrays, arr_names, axes): - return create_block_manager_from_arrays(arrays, arr_names, axes) - - @classmethod - def from_dict(cls, data, intersect=False, orient='items', dtype=None): - """ - Construct Panel from dict of DataFrame objects. - - Parameters - ---------- - data : dict - {field : DataFrame} - intersect : boolean - Intersect indexes of input DataFrames - orient : {'items', 'minor'}, default 'items' - The "orientation" of the data. If the keys of the passed dict - should be the items of the result panel, pass 'items' - (default). 
Otherwise if the columns of the values of the passed - DataFrame objects should be the items (which in the case of - mixed-dtype data you should do), instead pass 'minor' - dtype : dtype, default None - Data type to force, otherwise infer - - Returns - ------- - Panel - """ - from collections import defaultdict - - orient = orient.lower() - if orient == 'minor': - new_data = defaultdict(OrderedDict) - for col, df in data.items(): - for item, s in df.items(): - new_data[item][col] = s - data = new_data - elif orient != 'items': # pragma: no cover - raise ValueError('Orientation must be one of {items, minor}.') - - d = cls._homogenize_dict(cls, data, intersect=intersect, dtype=dtype) - ks = list(d['data'].keys()) - if not isinstance(d['data'], OrderedDict): - ks = list(sorted(ks)) - d[cls._info_axis_name] = Index(ks) - return cls(**d) - - def __getitem__(self, key): - key = com.apply_if_callable(key, self) - - if isinstance(self._info_axis, MultiIndex): - return self._getitem_multilevel(key) - if not (is_list_like(key) or isinstance(key, slice)): - return super().__getitem__(key) - return self.loc[key] - - def _getitem_multilevel(self, key): - info = self._info_axis - loc = info.get_loc(key) - if isinstance(loc, (slice, np.ndarray)): - new_index = info[loc] - result_index = maybe_droplevels(new_index, key) - slices = [loc] + [slice(None)] * (self._AXIS_LEN - 1) - new_values = self.values[slices] - - d = self._construct_axes_dict(self._AXIS_ORDERS[1:]) - d[self._info_axis_name] = result_index - result = self._constructor(new_values, **d) - return result - else: - return self._get_item_cache(key) - - def _init_matrix(self, data, axes, dtype=None, copy=False): - values = self._prep_ndarray(self, data, copy=copy) - - if dtype is not None: - try: - values = values.astype(dtype) - except Exception: - raise ValueError('failed to cast to ' - '{datatype}'.format(datatype=dtype)) - - shape = values.shape - fixed_axes = [] - for i, ax in enumerate(axes): - if ax is None: - ax = ibase.default_index(shape[i]) - else: - ax = ensure_index(ax) - fixed_axes.append(ax) - - return create_block_manager_from_blocks([values], fixed_axes) - - # ---------------------------------------------------------------------- - # Comparison methods - - def _compare_constructor(self, other, func): - if not self._indexed_same(other): - raise Exception('Can only compare identically-labeled ' - 'same type objects') - - new_data = {col: func(self[col], other[col]) - for col in self._info_axis} - - d = self._construct_axes_dict(copy=False) - return self._constructor(data=new_data, **d) - - # ---------------------------------------------------------------------- - # Magic methods - - def __repr__(self): - """ - Return a string representation for a particular Panel. - """ - - class_name = str(self.__class__) - - dims = 'Dimensions: {dimensions}'.format(dimensions=' x '.join( - ["{shape} ({axis})".format(shape=shape, axis=axis) for axis, shape - in zip(self._AXIS_ORDERS, self.shape)])) - - def axis_pretty(a): - v = getattr(self, a) - if len(v) > 0: - return '{ax} axis: {x} to {y}'.format(ax=a.capitalize(), - x=pprint_thing(v[0]), - y=pprint_thing(v[-1])) - else: - return '{ax} axis: None'.format(ax=a.capitalize()) - - output = '\n'.join( - [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS]) - return output - - def _get_plane_axes_index(self, axis): - """ - Get my plane axes indexes: these are already - (as compared with higher level planes), - as we are returning a DataFrame axes indexes. 
- """ - axis_name = self._get_axis_name(axis) - - if axis_name == 'major_axis': - index = 'minor_axis' - columns = 'items' - if axis_name == 'minor_axis': - index = 'major_axis' - columns = 'items' - elif axis_name == 'items': - index = 'major_axis' - columns = 'minor_axis' - - return index, columns - - def _get_plane_axes(self, axis): - """ - Get my plane axes indexes: these are already - (as compared with higher level planes), - as we are returning a DataFrame axes. - """ - return [self._get_axis(axi) - for axi in self._get_plane_axes_index(axis)] - - fromDict = from_dict - - def to_sparse(self, *args, **kwargs): - """ - NOT IMPLEMENTED: do not call this method, as sparsifying is not - supported for Panel objects and will raise an error. - - Convert to SparsePanel. - """ - raise NotImplementedError("sparsifying is not supported " - "for Panel objects") - - def to_excel(self, path, na_rep='', engine=None, **kwargs): - """ - Write each DataFrame in Panel to a separate excel sheet. - - Parameters - ---------- - path : string or ExcelWriter object - File path or existing ExcelWriter - na_rep : string, default '' - Missing data representation - engine : string, default None - write engine to use - you can also set this via the options - ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and - ``io.excel.xlsm.writer``. - - Other Parameters - ---------------- - float_format : string, default None - Format string for floating point numbers - cols : sequence, optional - Columns to write - header : boolean or list of string, default True - Write out column names. If a list of string is given it is - assumed to be aliases for the column names - index : boolean, default True - Write row names (index) - index_label : string or sequence, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. - startrow : upper left cell row to dump data frame - startcol : upper left cell column to dump data frame - - Notes - ----- - Keyword arguments (and na_rep) are passed to the ``to_excel`` method - for each DataFrame written. - """ - from pandas.io.excel import ExcelWriter - - if isinstance(path, str): - writer = ExcelWriter(path, engine=engine) - else: - writer = path - kwargs['na_rep'] = na_rep - - for item, df in self.iteritems(): - name = str(item) - df.to_excel(writer, name, **kwargs) - writer.save() - - def as_matrix(self): - self._consolidate_inplace() - return self._data.as_array() - - # ---------------------------------------------------------------------- - # Getting and setting elements - - def get_value(self, *args, **kwargs): - """ - Quickly retrieve single value at (item, major, minor) location. - - .. deprecated:: 0.21.0 - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - item : item label (panel item) - major : major axis label (panel item row) - minor : minor axis label (panel item column) - takeable : interpret the passed labels as indexers, default False - - Returns - ------- - value : scalar value - """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) - return self._get_value(*args, **kwargs) - - def _get_value(self, *args, **kwargs): - nargs = len(args) - nreq = self._AXIS_LEN - - # require an arg for each axis - if nargs != nreq: - raise TypeError('There must be an argument for each axis, you gave' - ' {0} args, but {1} are required'.format(nargs, - nreq)) - takeable = kwargs.pop('takeable', None) - - if kwargs: - raise TypeError('get_value() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) - - if takeable is True: - lower = self._iget_item_cache(args[0]) - else: - lower = self._get_item_cache(args[0]) - - return lower._get_value(*args[1:], takeable=takeable) - _get_value.__doc__ = get_value.__doc__ - - def set_value(self, *args, **kwargs): - """ - Quickly set single value at (item, major, minor) location. - - .. deprecated:: 0.21.0 - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - item : item label (panel item) - major : major axis label (panel item row) - minor : minor axis label (panel item column) - value : scalar - takeable : interpret the passed labels as indexers, default False - - Returns - ------- - panel : Panel - If label combo is contained, will be reference to calling Panel, - otherwise a new object. - """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) - return self._set_value(*args, **kwargs) - - def _set_value(self, *args, **kwargs): - # require an arg for each axis and the value - nargs = len(args) - nreq = self._AXIS_LEN + 1 - - if nargs != nreq: - raise TypeError('There must be an argument for each axis plus the ' - 'value provided, you gave {0} args, but {1} are ' - 'required'.format(nargs, nreq)) - takeable = kwargs.pop('takeable', None) - - if kwargs: - raise TypeError('set_value() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) - - try: - if takeable is True: - lower = self._iget_item_cache(args[0]) - else: - lower = self._get_item_cache(args[0]) - - lower._set_value(*args[1:], takeable=takeable) - return self - except KeyError: - axes = self._expand_axes(args) - d = self._construct_axes_dict_from(self, axes, copy=False) - result = self.reindex(**d) - args = list(args) - likely_dtype, args[-1] = infer_dtype_from_scalar(args[-1]) - made_bigger = not np.array_equal(axes[0], self._info_axis) - # how to make this logic simpler? 
- if made_bigger: - maybe_cast_item(result, args[0], likely_dtype) - - return result._set_value(*args) - _set_value.__doc__ = set_value.__doc__ - - def _box_item_values(self, key, values): - if self.ndim == values.ndim: - result = self._constructor(values) - - # a dup selection will yield a full ndim - if result._get_axis(0).is_unique: - result = result[key] - - return result - - d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]) - return self._constructor_sliced(values, **d) - - def __setitem__(self, key, value): - key = com.apply_if_callable(key, self) - shape = tuple(self.shape) - if isinstance(value, self._constructor_sliced): - value = value.reindex( - **self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:])) - mat = value.values - elif isinstance(value, np.ndarray): - if value.shape != shape[1:]: - raise ValueError('shape of value must be {0}, shape of given ' - 'object was {1}'.format( - shape[1:], tuple(map(int, value.shape)))) - mat = np.asarray(value) - elif is_scalar(value): - mat = cast_scalar_to_array(shape[1:], value) - else: - raise TypeError('Cannot set item of ' - 'type: {dtype!s}'.format(dtype=type(value))) - - mat = mat.reshape(tuple([1]) + shape[1:]) - NDFrame._set_item(self, key, mat) - - def _unpickle_panel_compat(self, state): # pragma: no cover - """ - Unpickle the panel. - """ - from pandas.io.pickle import _unpickle_array - - _unpickle = _unpickle_array - vals, items, major, minor = state - - items = _unpickle(items) - major = _unpickle(major) - minor = _unpickle(minor) - values = _unpickle(vals) - wp = Panel(values, items, major, minor) - self._data = wp._data - - def conform(self, frame, axis='items'): - """ - Conform input DataFrame to align with chosen axis pair. - - Parameters - ---------- - frame : DataFrame - axis : {'items', 'major', 'minor'} - - Axis the input corresponds to. E.g., if axis='major', then - the frame's columns would be items, and the index would be - values of the minor axis - - Returns - ------- - DataFrame - """ - axes = self._get_plane_axes(axis) - return frame.reindex(**self._extract_axes_for_slice(self, axes)) - - def head(self, n=5): - raise NotImplementedError - - def tail(self, n=5): - raise NotImplementedError - - def round(self, decimals=0, *args, **kwargs): - """ - Round each value in Panel to a specified number of decimal places. - - .. versionadded:: 0.18.0 - - Parameters - ---------- - decimals : int - Number of decimal places to round to (default: 0). - If decimals is negative, it specifies the number of - positions to the left of the decimal point. - - Returns - ------- - Panel object - - See Also - -------- - numpy.around - """ - nv.validate_round(args, kwargs) - - if is_integer(decimals): - result = np.apply_along_axis(np.round, 0, self.values) - return self._wrap_result(result, axis=0) - raise TypeError("decimals must be an integer") - - def _needs_reindex_multi(self, axes, method, level): - """ - Don't allow a multi reindex on Panel or above ndim. - """ - return False - - def align(self, other, **kwargs): - raise NotImplementedError - - def dropna(self, axis=0, how='any', inplace=False): - """ - Drop 2D from panel, holding passed axis constant. - - Parameters - ---------- - axis : int, default 0 - Axis to hold constant. E.g. axis=1 will drop major_axis entries - having a certain amount of NA data - how : {'all', 'any'}, default 'any' - 'any': one or more values are NA in the DataFrame along the - axis. For 'all' they all must be. 
- inplace : bool, default False - If True, do operation inplace and return None. - - Returns - ------- - dropped : Panel - """ - axis = self._get_axis_number(axis) - - values = self.values - mask = notna(values) - - for ax in reversed(sorted(set(range(self._AXIS_LEN)) - {axis})): - mask = mask.sum(ax) - - per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:]) - - if how == 'all': - cond = mask > 0 - else: - cond = mask == per_slice - - new_ax = self._get_axis(axis)[cond] - result = self.reindex_axis(new_ax, axis=axis) - if inplace: - self._update_inplace(result) - else: - return result - - def _combine(self, other, func, axis=0): - if isinstance(other, Panel): - return self._combine_panel(other, func) - elif isinstance(other, DataFrame): - return self._combine_frame(other, func, axis=axis) - elif is_scalar(other): - return self._combine_const(other, func) - else: - raise NotImplementedError( - "{otype!s} is not supported in combine operation with " - "{selftype!s}".format(otype=type(other), selftype=type(self))) - - def _combine_const(self, other, func): - with np.errstate(all='ignore'): - new_values = func(self.values, other) - d = self._construct_axes_dict() - return self._constructor(new_values, **d) - - def _combine_frame(self, other, func, axis=0): - index, columns = self._get_plane_axes(axis) - axis = self._get_axis_number(axis) - - other = other.reindex(index=index, columns=columns) - - with np.errstate(all='ignore'): - if axis == 0: - new_values = func(self.values, other.values) - elif axis == 1: - new_values = func(self.values.swapaxes(0, 1), other.values.T) - new_values = new_values.swapaxes(0, 1) - elif axis == 2: - new_values = func(self.values.swapaxes(0, 2), other.values) - new_values = new_values.swapaxes(0, 2) - - return self._constructor(new_values, self.items, self.major_axis, - self.minor_axis) - - def _combine_panel(self, other, func): - items = self.items.union(other.items) - major = self.major_axis.union(other.major_axis) - minor = self.minor_axis.union(other.minor_axis) - - # could check that everything's the same size, but forget it - this = self.reindex(items=items, major=major, minor=minor) - other = other.reindex(items=items, major=major, minor=minor) - - with np.errstate(all='ignore'): - result_values = func(this.values, other.values) - - return self._constructor(result_values, items, major, minor) - - def major_xs(self, key): - """ - Return slice of panel along major axis. - - Parameters - ---------- - key : object - Major axis label - - Returns - ------- - y : DataFrame - Index -> minor axis, columns -> items. - - Notes - ----- - major_xs is only for getting, not setting values. - - MultiIndex Slicers is a generic way to get/set values on any level or - levels and is a superset of major_xs functionality, see - :ref:`MultiIndex Slicers ` - """ - return self.xs(key, axis=self._AXIS_LEN - 2) - - def minor_xs(self, key): - """ - Return slice of panel along minor axis. - - Parameters - ---------- - key : object - Minor axis label - - Returns - ------- - y : DataFrame - Index -> major axis, columns -> items. - - Notes - ----- - minor_xs is only for getting, not setting values. - - MultiIndex Slicers is a generic way to get/set values on any level or - levels and is a superset of minor_xs functionality, see - :ref:`MultiIndex Slicers ` - """ - return self.xs(key, axis=self._AXIS_LEN - 1) - - def xs(self, key, axis=1): - """ - Return slice of panel along selected axis. 
- - Parameters - ---------- - key : object - Label - axis : {'items', 'major', 'minor}, default 1/'major' - - Returns - ------- - y : ndim(self)-1 - - Notes - ----- - xs is only for getting, not setting values. - - MultiIndex Slicers is a generic way to get/set values on any level or - levels and is a superset of xs functionality, see - :ref:`MultiIndex Slicers ` - """ - axis = self._get_axis_number(axis) - if axis == 0: - return self[key] - - raise NotImplementedError("Panel is removed in pandas 0.25.0") - - _xs = xs - - def _ixs(self, i, axis=0): - """ - Parameters - ---------- - i : int, slice, or sequence of integers - axis : int - """ - - ax = self._get_axis(axis) - key = ax[i] - - # xs cannot handle a non-scalar key, so just reindex here - # if we have a multi-index and a single tuple, then its a reduction - # (GH 7516) - if not (isinstance(ax, MultiIndex) and isinstance(key, tuple)): - if is_list_like(key): - indexer = {self._get_axis_name(axis): key} - return self.reindex(**indexer) - - # a reduction - if axis == 0: - values = self._data.iget(i) - return self._box_item_values(key, values) - - # xs by position - self._consolidate_inplace() - new_data = self._data.xs(i, axis=axis, copy=True, takeable=True) - return self._construct_return_type(new_data) - - def groupby(self, function, axis='major'): - """ - Group data on given axis, returning GroupBy object. - - Parameters - ---------- - function : callable - Mapping function for chosen access - axis : {'major', 'minor', 'items'}, default 'major' - - Returns - ------- - grouped : PanelGroupBy - """ - raise NotImplementedError("Panel is removed in pandas 0.25.0") - - def to_frame(self, filter_observations=True): - """ - Transform wide format into long (stacked) format as DataFrame whose - columns are the Panel's items and whose index is a MultiIndex formed - of the Panel's major and minor axes. 
- - Parameters - ---------- - filter_observations : boolean, default True - Drop (major, minor) pairs without a complete set of observations - across all the items - - Returns - ------- - y : DataFrame - """ - _, N, K = self.shape - - if filter_observations: - # shaped like the return DataFrame - mask = notna(self.values).all(axis=0) - # size = mask.sum() - selector = mask.ravel() - else: - # size = N * K - selector = slice(None, None) - - data = {item: self[item].values.ravel()[selector] - for item in self.items} - - def construct_multi_parts(idx, n_repeat, n_shuffle=1): - # Replicates and shuffles MultiIndex, returns individual attributes - codes = [np.repeat(x, n_repeat) for x in idx.codes] - # Assumes that each label is divisible by n_shuffle - codes = [x.reshape(n_shuffle, -1).ravel(order='F') - for x in codes] - codes = [x[selector] for x in codes] - levels = idx.levels - names = idx.names - return codes, levels, names - - def construct_index_parts(idx, major=True): - levels = [idx] - if major: - codes = [np.arange(N).repeat(K)[selector]] - names = idx.name or 'major' - else: - codes = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] - codes = [codes.ravel()[selector]] - names = idx.name or 'minor' - names = [names] - return codes, levels, names - - if isinstance(self.major_axis, MultiIndex): - major_codes, major_levels, major_names = construct_multi_parts( - self.major_axis, n_repeat=K) - else: - major_codes, major_levels, major_names = construct_index_parts( - self.major_axis) - - if isinstance(self.minor_axis, MultiIndex): - minor_codes, minor_levels, minor_names = construct_multi_parts( - self.minor_axis, n_repeat=N, n_shuffle=K) - else: - minor_codes, minor_levels, minor_names = construct_index_parts( - self.minor_axis, major=False) - - levels = major_levels + minor_levels - codes = major_codes + minor_codes - names = major_names + minor_names - - index = MultiIndex(levels=levels, codes=codes, names=names, - verify_integrity=False) - - return DataFrame(data, index=index, columns=self.items) - - def apply(self, func, axis='major', **kwargs): - """ - Apply function along axis (or axes) of the Panel. - - Parameters - ---------- - func : function - Function to apply to each combination of 'other' axes - e.g. if axis = 'items', the combination of major_axis/minor_axis - will each be passed as a Series; if axis = ('items', 'major'), - DataFrames of items & major axis will be passed - axis : {'items', 'minor', 'major'}, or {0, 1, 2}, or a tuple with two - axes - **kwargs - Additional keyword arguments will be passed to the function. 
- - Returns - ------- - result : Panel, DataFrame, or Series - - Examples - -------- - - Returns a Panel with the square root of each element - - >>> p = pd.Panel(np.random.rand(4, 3, 2)) # doctest: +SKIP - >>> p.apply(np.sqrt) - - Equivalent to p.sum(1), returning a DataFrame - - >>> p.apply(lambda x: x.sum(), axis=1) # doctest: +SKIP - - Equivalent to previous: - - >>> p.apply(lambda x: x.sum(), axis='major') # doctest: +SKIP - - Return the shapes of each DataFrame over axis 2 (i.e the shapes of - items x major), as a Series - - >>> p.apply(lambda x: x.shape, axis=(0,1)) # doctest: +SKIP - """ - - if kwargs and not isinstance(func, np.ufunc): - f = lambda x: func(x, **kwargs) - else: - f = func - - # 2d-slabs - if isinstance(axis, (tuple, list)) and len(axis) == 2: - return self._apply_2d(f, axis=axis) - - axis = self._get_axis_number(axis) - - # try ufunc like - if isinstance(f, np.ufunc): - try: - with np.errstate(all='ignore'): - result = np.apply_along_axis(func, axis, self.values) - return self._wrap_result(result, axis=axis) - except (AttributeError): - pass - - # 1d - return self._apply_1d(f, axis=axis) - - def _apply_1d(self, func, axis): - - axis_name = self._get_axis_name(axis) - ndim = self.ndim - values = self.values - - # iter thru the axes - slice_axis = self._get_axis(axis) - slice_indexer = [0] * (ndim - 1) - indexer = np.zeros(ndim, 'O') - indlist = list(range(ndim)) - indlist.remove(axis) - indexer[axis] = slice(None, None) - indexer.put(indlist, slice_indexer) - planes = [self._get_axis(axi) for axi in indlist] - shape = np.array(self.shape).take(indlist) - - # all the iteration points - points = cartesian_product(planes) - - results = [] - for i in range(np.prod(shape)): - - # construct the object - pts = tuple(p[i] for p in points) - indexer.put(indlist, slice_indexer) - - obj = Series(values[tuple(indexer)], index=slice_axis, name=pts) - result = func(obj) - - results.append(result) - - # increment the indexer - slice_indexer[-1] += 1 - n = -1 - while (slice_indexer[n] >= shape[n]) and (n > (1 - ndim)): - slice_indexer[n - 1] += 1 - slice_indexer[n] = 0 - n -= 1 - - # empty object - if not len(results): - return self._constructor(**self._construct_axes_dict()) - - # same ndim as current - if isinstance(results[0], Series): - arr = np.vstack([r.values for r in results]) - arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape))) - tranp = np.array([axis] + indlist).argsort() - arr = arr.transpose(tuple(list(tranp))) - return self._constructor(arr, **self._construct_axes_dict()) - - # ndim-1 shape - results = np.array(results).reshape(shape) - if results.ndim == 2 and axis_name != self._info_axis_name: - results = results.T - planes = planes[::-1] - return self._construct_return_type(results, planes) - - def _apply_2d(self, func, axis): - """ - Handle 2-d slices, equiv to iterating over the other axis. 
- """ - ndim = self.ndim - axis = [self._get_axis_number(a) for a in axis] - - # construct slabs, in 2-d this is a DataFrame result - indexer_axis = list(range(ndim)) - for a in axis: - indexer_axis.remove(a) - indexer_axis = indexer_axis[0] - - slicer = [slice(None, None)] * ndim - ax = self._get_axis(indexer_axis) - - results = [] - for i, e in enumerate(ax): - slicer[indexer_axis] = i - sliced = self.iloc[tuple(slicer)] - - obj = func(sliced) - results.append((e, obj)) - - return self._construct_return_type(dict(results)) - - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - if numeric_only: - raise NotImplementedError('Panel.{0} does not implement ' - 'numeric_only.'.format(name)) - - if axis is None and filter_type == 'bool': - # labels = None - # constructor = None - axis_number = None - axis_name = None - else: - # TODO: Make other agg func handle axis=None properly - axis = self._get_axis_number(axis) - # labels = self._get_agg_axis(axis) - # constructor = self._constructor - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) - - f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) - - with np.errstate(all='ignore'): - result = f(self.values) - - if axis is None and filter_type == 'bool': - return np.bool_(result) - axes = self._get_plane_axes(axis_name) - if result.ndim == 2 and axis_name != self._info_axis_name: - result = result.T - - return self._construct_return_type(result, axes) - - def _construct_return_type(self, result, axes=None): - """ - Return the type for the ndim of the result. - """ - ndim = getattr(result, 'ndim', None) - - # need to assume they are the same - if ndim is None: - if isinstance(result, dict): - ndim = getattr(list(result.values())[0], 'ndim', 0) - - # have a dict, so top-level is +1 dim - if ndim != 0: - ndim += 1 - - # scalar - if ndim == 0: - return Series(result) - - # same as self - elif self.ndim == ndim: - # return the construction dictionary for these axes - if axes is None: - return self._constructor(result) - return self._constructor(result, **self._construct_axes_dict()) - - # sliced - elif self.ndim == ndim + 1: - if axes is None: - return self._constructor_sliced(result) - return self._constructor_sliced( - result, **self._extract_axes_for_slice(self, axes)) - - raise ValueError('invalid _construct_return_type [self->{self}] ' - '[result->{result}]'.format(self=self, result=result)) - - def _wrap_result(self, result, axis): - axis = self._get_axis_name(axis) - axes = self._get_plane_axes(axis) - if result.ndim == 2 and axis != self._info_axis_name: - result = result.T - - return self._construct_return_type(result, axes) - - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.reindex.__doc__) - def reindex(self, *args, **kwargs): - major = kwargs.pop("major", None) - minor = kwargs.pop('minor', None) - - if major is not None: - if kwargs.get("major_axis"): - raise TypeError("Cannot specify both 'major' and 'major_axis'") - kwargs['major_axis'] = major - if minor is not None: - if kwargs.get("minor_axis"): - raise TypeError("Cannot specify both 'minor' and 'minor_axis'") - - kwargs['minor_axis'] = minor - axes = validate_axis_style_args(self, args, kwargs, 'labels', - 'reindex') - kwargs.update(axes) - kwargs.pop('axis', None) - kwargs.pop('labels', None) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - # do not warn about constructing Panel when reindexing - result = super().reindex(**kwargs) - return 
result - - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.rename.__doc__) - def rename(self, items=None, major_axis=None, minor_axis=None, **kwargs): - major_axis = (major_axis if major_axis is not None else - kwargs.pop('major', None)) - minor_axis = (minor_axis if minor_axis is not None else - kwargs.pop('minor', None)) - return super().rename(items=items, major_axis=major_axis, - minor_axis=minor_axis, **kwargs) - - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.transpose.__doc__) - def transpose(self, *args, **kwargs): - # check if a list of axes was passed in instead as a - # single *args element - if (len(args) == 1 and hasattr(args[0], '__iter__') and - not is_string_like(args[0])): - axes = args[0] - else: - axes = args - - if 'axes' in kwargs and axes: - raise TypeError("transpose() got multiple values for " - "keyword argument 'axes'") - elif not axes: - axes = kwargs.pop('axes', ()) - - return super().transpose(*axes, **kwargs) - - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.fillna.__doc__) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, downcast=downcast, - **kwargs) - - def count(self, axis='major'): - """ - Return number of observations over requested axis. - - Parameters - ---------- - axis : {'items', 'major', 'minor'} or {0, 1, 2} - - Returns - ------- - count : DataFrame - """ - i = self._get_axis_number(axis) - - values = self.values - mask = np.isfinite(values) - result = mask.sum(axis=i, dtype='int64') - - return self._wrap_result(result, axis) - - def shift(self, periods=1, freq=None, axis='major'): - """ - Shift index by desired number of periods with an optional time freq. - - The shifted data will not include the dropped periods and the - shifted axis will be smaller than the original. This is different - from the behavior of DataFrame.shift() - - Parameters - ---------- - periods : int - Number of periods to move, can be positive or negative - freq : DateOffset, timedelta, or time rule string, optional - axis : {'items', 'major', 'minor'} or {0, 1, 2} - - Returns - ------- - shifted : Panel - """ - if freq: - return self.tshift(periods, freq, axis=axis) - - return super().slice_shift(periods, axis=axis) - - def tshift(self, periods=1, freq=None, axis='major'): - return super().tshift(periods, freq, axis) - - def join(self, other, how='left', lsuffix='', rsuffix=''): - """ - Join items with other Panel either on major and minor axes column. - - Parameters - ---------- - other : Panel or list of Panels - Index should be similar to one of the columns in this one - how : {'left', 'right', 'outer', 'inner'} - How to handle indexes of the two objects. 
Default: 'left' - for joining on index, None otherwise - * left: use calling frame's index - * right: use input frame's index - * outer: form union of indexes - * inner: use intersection of indexes - lsuffix : string - Suffix to use from left frame's overlapping columns - rsuffix : string - Suffix to use from right frame's overlapping columns - - Returns - ------- - joined : Panel - """ - from pandas.core.reshape.concat import concat - - if isinstance(other, Panel): - join_major, join_minor = self._get_join_index(other, how) - this = self.reindex(major=join_major, minor=join_minor) - other = other.reindex(major=join_major, minor=join_minor) - merged_data = this._data.merge(other._data, lsuffix, rsuffix) - return self._constructor(merged_data) - else: - if lsuffix or rsuffix: - raise ValueError('Suffixes not supported when passing ' - 'multiple panels') - - if how == 'left': - how = 'outer' - join_axes = [self.major_axis, self.minor_axis] - elif how == 'right': - raise ValueError('Right join not supported with multiple ' - 'panels') - else: - join_axes = None - - return concat([self] + list(other), axis=0, join=how, - join_axes=join_axes, verify_integrity=True) - - @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', - mapping={False: 'ignore', True: 'raise'}) - def update(self, other, join='left', overwrite=True, filter_func=None, - errors='ignore'): - """ - Modify Panel in place using non-NA values from other Panel. - - May also use object coercible to Panel. Will align on items. - - Parameters - ---------- - other : Panel, or object coercible to Panel - The object from which the caller will be updated. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - How individual DataFrames are joined. - overwrite : bool, default True - If True then overwrite values for common keys in the calling Panel. - filter_func : callable(1d-array) -> 1d-array, default None - Can choose to replace values other than NA. Return True for values - that should be updated. - errors : {'raise', 'ignore'}, default 'ignore' - If 'raise', will raise an error if a DataFrame and other both. - - .. versionchanged :: 0.24.0 - Changed from `raise_conflict=False|True` - to `errors='ignore'|'raise'`. - - See Also - -------- - DataFrame.update : Similar method for DataFrames. - dict.update : Similar method for dictionaries. - """ - - if not isinstance(other, self._constructor): - other = self._constructor(other) - - axis_name = self._info_axis_name - axis_values = self._info_axis - other = other.reindex(**{axis_name: axis_values}) - - for frame in axis_values: - self[frame].update(other[frame], join=join, overwrite=overwrite, - filter_func=filter_func, errors=errors) - - def _get_join_index(self, other, how): - if how == 'left': - join_major, join_minor = self.major_axis, self.minor_axis - elif how == 'right': - join_major, join_minor = other.major_axis, other.minor_axis - elif how == 'inner': - join_major = self.major_axis.intersection(other.major_axis) - join_minor = self.minor_axis.intersection(other.minor_axis) - elif how == 'outer': - join_major = self.major_axis.union(other.major_axis) - join_minor = self.minor_axis.union(other.minor_axis) - return join_major, join_minor - - # miscellaneous data creation - @staticmethod - def _extract_axes(self, data, axes, **kwargs): - """ - Return a list of the axis indices. 
- """ - return [self._extract_axis(self, data, axis=i, **kwargs) - for i, a in enumerate(axes)] - - @staticmethod - def _extract_axes_for_slice(self, axes): - """ - Return the slice dictionary for these axes. - """ - return {self._AXIS_SLICEMAP[i]: a for i, a in - zip(self._AXIS_ORDERS[self._AXIS_LEN - len(axes):], axes)} - - @staticmethod - def _prep_ndarray(self, values, copy=True): - if not isinstance(values, np.ndarray): - values = np.asarray(values) - # NumPy strings are a pain, convert to object - if issubclass(values.dtype.type, str): - values = np.array(values, dtype=object, copy=True) - else: - if copy: - values = values.copy() - if values.ndim != self._AXIS_LEN: - raise ValueError("The number of dimensions required is {0}, " - "but the number of dimensions of the " - "ndarray given was {1}".format(self._AXIS_LEN, - values.ndim)) - return values - - @staticmethod - def _homogenize_dict(self, frames, intersect=True, dtype=None): - """ - Conform set of _constructor_sliced-like objects to either - an intersection of indices / columns or a union. - - Parameters - ---------- - frames : dict - intersect : boolean, default True - - Returns - ------- - dict of aligned results & indices - """ - - result = dict() - # caller differs dict/ODict, preserved type - if isinstance(frames, OrderedDict): - result = OrderedDict() - - adj_frames = OrderedDict() - for k, v in frames.items(): - if isinstance(v, dict): - adj_frames[k] = self._constructor_sliced(v) - else: - adj_frames[k] = v - - axes = self._AXIS_ORDERS[1:] - axes_dict = {a: ax for a, ax in zip(axes, self._extract_axes( - self, adj_frames, axes, intersect=intersect))} - - reindex_dict = {self._AXIS_SLICEMAP[a]: axes_dict[a] for a in axes} - reindex_dict['copy'] = False - for key, frame in adj_frames.items(): - if frame is not None: - result[key] = frame.reindex(**reindex_dict) - else: - result[key] = None - - axes_dict['data'] = result - axes_dict['dtype'] = dtype - return axes_dict - - @staticmethod - def _extract_axis(self, data, axis=0, intersect=False): - - index = None - if len(data) == 0: - index = Index([]) - elif len(data) > 0: - raw_lengths = [] - - have_raw_arrays = False - have_frames = False - - for v in data.values(): - if isinstance(v, self._constructor_sliced): - have_frames = True - elif v is not None: - have_raw_arrays = True - raw_lengths.append(v.shape[axis]) - - if have_frames: - # we want the "old" behavior here, of sorting only - # 1. we're doing a union (intersect=False) - # 2. the indices are not aligned. - index = _get_objs_combined_axis(data.values(), axis=axis, - intersect=intersect, sort=None) - - if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: - raise ValueError('ndarrays must match shape on ' - 'axis {ax}'.format(ax=axis)) - - if have_frames: - if lengths[0] != len(index): - raise AssertionError('Length of data and index must match') - else: - index = Index(np.arange(lengths[0])) - - if index is None: - index = Index([]) - - return ensure_index(index) - - def sort_values(self, *args, **kwargs): - """ - NOT IMPLEMENTED: do not call this method, as sorting values is not - supported for Panel objects and will raise an error. 
- """ - super().sort_values(*args, **kwargs) - - -Panel._setup_axes(axes=['items', 'major_axis', 'minor_axis'], info_axis=0, - stat_axis=1, aliases={'major': 'major_axis', - 'minor': 'minor_axis'}, - slicers={'major_axis': 'index', - 'minor_axis': 'columns'}, - docs={}) - -Panel._add_numeric_operations() diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 24995d1e2e4926..e43f94e28d4af6 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -57,7 +57,7 @@ from pandas import ( # noqa:F401 Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Panel, Period, + Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Period, PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp) from pandas.core import internals from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray @@ -671,11 +671,6 @@ def create_block(b): # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind'] # ) - # elif typ == 'sparse_panel': - # return SparsePanel( - # obj['data'], items=obj['items'], - # default_fill_value=obj['default_fill_value'], - # default_kind=obj['default_kind']) elif typ == 'block_index': return globals()[obj['klass']](obj['length'], obj['blocs'], obj['blengths']) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 6ed1284ff13bae..614e3172d9d48e 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,5 +1,5 @@ import pandas as pd -from pandas import api +from pandas import api, compat from pandas.util import testing as tm @@ -49,9 +49,11 @@ class TestPDApi(Base): 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'NamedAgg', ] + if not compat.PY37: + classes.append("Panel") # these are already deprecated; awaiting removal - deprecated_classes = ['Panel'] + deprecated_classes = [] # these should be deprecated in the future deprecated_classes_in_future = [] From 903a09c6aafd3b1ab5a3efb2055f92ef03035ec9 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 3 Jul 2019 19:21:01 +0100 Subject: [PATCH 150/238] PLT: Cleaner plotting backend API, and unify Series and DataFrame accessors (#27009) --- doc/source/development/extending.rst | 27 + pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/plotting/__init__.py | 71 +- pandas/plotting/_core.py | 1192 ++++++++------------- pandas/plotting/_matplotlib/__init__.py | 39 +- pandas/plotting/_matplotlib/timeseries.py | 8 +- pandas/tests/plotting/test_frame.py | 10 +- pandas/tests/plotting/test_misc.py | 35 +- pandas/tests/plotting/test_series.py | 14 +- 10 files changed, 634 insertions(+), 766 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 12af80f1bce80f..b492a4edd70a40 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -416,3 +416,30 @@ Below is an example to define two original properties, "internal_cache" as a tem # properties defined in _metadata are retained >>> df[['A', 'B']].added_property property + +.. _extending.plotting-backends: + +Plotting backends +----------------- + +Starting in 0.25 pandas can be extended with third-party plotting backends. The +main idea is letting users select a plotting backend different than the provided +one based on Matplotlib. For example: + +.. code-block:: python + + >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.Series([1, 2, 3]).plot() + +This would be more or less equivalent to: + +.. 
code-block:: python + + >>> import backend.module + >>> backend.module.plot(pd.Series([1, 2, 3])) + +The backend module can then use other visualization tools (Bokeh, Altair,...) +to generate the plots. + +More information on how to implement a third-party plotting backend can be found at +https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index df4be417e8d021..1a1d6fa729065c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7946,7 +7946,7 @@ def isin(self, values): # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = CachedAccessor("plot", pandas.plotting.FramePlotMethods) + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame sparse = CachedAccessor("sparse", SparseFrameAccessor) diff --git a/pandas/core/series.py b/pandas/core/series.py index acae4b0449f724..a2086c5f192493 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4537,7 +4537,7 @@ def to_period(self, freq=None, copy=True): str = CachedAccessor("str", StringMethods) dt = CachedAccessor("dt", CombinedDatetimelikeProperties) cat = CachedAccessor("cat", CategoricalAccessor) - plot = CachedAccessor("plot", pandas.plotting.SeriesPlotMethods) + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) # ---------------------------------------------------------------------- diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index ac983e7efd618a..57a45f0f18d902 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -1,18 +1,73 @@ """ -Plotting public API +Plotting public API. + +Authors of third-party plotting backends should implement a module with a +public ``plot(data, kind, **kwargs)``. The parameter `data` will contain +the data structure and can be a `Series` or a `DataFrame`. For example, +for ``df.plot()`` the parameter `data` will contain the DataFrame `df`. +In some cases, the data structure is transformed before being sent to +the backend (see PlotAccessor.__call__ in pandas/plotting/_core.py for +the exact transformations). + +The parameter `kind` will be one of: + +- line +- bar +- barh +- box +- hist +- kde +- area +- pie +- scatter +- hexbin + +See the pandas API reference for documentation on each kind of plot. + +Any other keyword argument is currently assumed to be backend specific, +but some parameters may be unified and added to the signature in the +future (e.g. `title` which should be useful for any backend). + +Currently, all the Matplotlib functions in pandas are accessed through +the selected backend. For example, `pandas.plotting.boxplot` (equivalent +to `DataFrame.boxplot`) is also accessed in the selected backend. This +is expected to change, and the exact API is under discussion. 
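For illustration, a minimal backend module exposing the ``plot(data, kind, **kwargs)`` entry point described above might look like the sketch below; the module name ``my_backend`` and its body are hypothetical and only cover the ``line`` kind.

.. code-block:: python

    # my_backend.py -- hypothetical third-party plotting backend (sketch only)
    import matplotlib.pyplot as plt

    from pandas import Series


    def plot(data, kind, **kwargs):
        # Entry point pandas calls for ``Series.plot`` / ``DataFrame.plot``.
        if kind != 'line':
            # A real backend would dispatch on every kind listed above.
            raise NotImplementedError(
                "kind={!r} is not implemented".format(kind))
        ax = kwargs.get('ax')
        if ax is None:
            _, ax = plt.subplots()
        if isinstance(data, Series):
            ax.plot(data.index, data.values, label=data.name)
        else:
            # DataFrame: draw one line per column.
            for col in data.columns:
                ax.plot(data.index, data[col].values, label=col)
            ax.legend()
        return ax

Once such a module is importable, it would be selected with ``pd.set_option('plotting.backend', 'my_backend')`` exactly as shown in the extending docs above.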
But with +the current version, backends are expected to implement the next functions: + +- plot (describe above, used for `Series.plot` and `DataFrame.plot`) +- hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) +- boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) +- boxplot_frame and boxplot_frame_groupby +- tsplot (deprecated) +- register and deregister (register converters for the tick formats) +- Plots not called as `Series` and `DataFrame` methods: + - table + - andrews_curves + - autocorrelation_plot + - bootstrap_plot + - lag_plot + - parallel_coordinates + - radviz + - scatter_matrix + +Use the code in pandas/plotting/_matplotib.py and +https://github.com/pyviz/hvplot as a reference on how to write a backend. + +For the discussion about the API see +https://github.com/pandas-dev/pandas/issues/26747. """ from pandas.plotting._core import ( - FramePlotMethods, SeriesPlotMethods, boxplot, boxplot_frame, - boxplot_frame_groupby, hist_frame, hist_series) + PlotAccessor, boxplot, boxplot_frame, boxplot_frame_groupby, hist_frame, + hist_series) from pandas.plotting._misc import ( andrews_curves, autocorrelation_plot, bootstrap_plot, deregister as deregister_matplotlib_converters, lag_plot, parallel_coordinates, plot_params, radviz, register as register_matplotlib_converters, scatter_matrix, table) -__all__ = ['boxplot', 'boxplot_frame', 'boxplot_frame_groupby', 'hist_frame', - 'hist_series', 'FramePlotMethods', 'SeriesPlotMethods', - 'scatter_matrix', 'radviz', 'andrews_curves', 'bootstrap_plot', - 'parallel_coordinates', 'lag_plot', 'autocorrelation_plot', - 'table', 'plot_params', 'register_matplotlib_converters', +__all__ = ['PlotAccessor', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', + 'hist_frame', 'hist_series', 'scatter_matrix', 'radviz', + 'andrews_curves', 'bootstrap_plot', 'parallel_coordinates', + 'lag_plot', 'autocorrelation_plot', 'table', 'plot_params', + 'register_matplotlib_converters', 'deregister_matplotlib_converters'] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index b0e928fa8022b0..2f46df29857039 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,5 +1,6 @@ import importlib from typing import List, Type # noqa +import warnings from pandas.util._decorators import Appender @@ -8,7 +9,6 @@ import pandas from pandas.core.base import PandasObject -from pandas.core.generic import _shared_doc_kwargs, _shared_docs # Trigger matplotlib import, which implicitly registers our # converts. Implicit registration is deprecated, and when enforced @@ -18,160 +18,148 @@ except ImportError: pass -df_kind = """- 'scatter' : scatter plot - - 'hexbin' : hexbin plot""" -series_kind = "" -df_coord = """x : label or position, default None - y : label, position or list of label, positions, default None - Allows plotting of one column versus another""" -series_coord = "" - -df_unique = """stacked : bool, default False in line and - bar plots, and True in area plot. If True, create stacked plot. 
- sort_columns : bool, default False - Sort column names to determine plot ordering - secondary_y : bool or sequence, default False - Whether to plot on the secondary y-axis - If a list/tuple, which columns to plot on secondary y-axis""" -series_unique = """label : label argument to provide to plot - secondary_y : bool or sequence of ints, default False - If True then y-axis will be on the right""" - -df_ax = """ax : matplotlib axes object, default None - subplots : bool, default False - Make separate subplots for each column - sharex : bool, default True if ax is None else False - In case subplots=True, share x axis and set some x axis labels to - invisible; defaults to True if ax is None otherwise False if an ax - is passed in; Be aware, that passing in both an ax and sharex=True - will alter all x axis labels for all axis in a figure! - sharey : bool, default False - In case subplots=True, share y axis and set some y axis labels to - invisible - layout : tuple (optional) - (rows, columns) for the layout of subplots""" -series_ax = """ax : matplotlib axes object - If not passed, uses gca()""" - -df_note = """- If `kind` = 'scatter' and the argument `c` is the name of a dataframe - column, the values of that column are used to color each point. - - If `kind` = 'hexbin', you can control the size of the bins with the - `gridsize` argument. By default, a histogram of the counts around each - `(x, y)` point is computed. You can specify alternative aggregations - by passing values to the `C` and `reduce_C_function` arguments. - `C` specifies the value at each `(x, y)` point and `reduce_C_function` - is a function of one argument that reduces all the values in a bin to - a single number (e.g. `mean`, `max`, `sum`, `std`).""" -series_note = "" - -_shared_doc_df_kwargs = dict(klass='DataFrame', klass_obj='df', - klass_kind=df_kind, klass_coord=df_coord, - klass_ax=df_ax, klass_unique=df_unique, - klass_note=df_note) -_shared_doc_series_kwargs = dict(klass='Series', klass_obj='s', - klass_kind=series_kind, - klass_coord=series_coord, klass_ax=series_ax, - klass_unique=series_unique, - klass_note=series_note) - -_shared_docs['plot'] = """ - Make plots of %(klass)s using matplotlib / pylab. - - *New in version 0.17.0:* Each plot kind has a corresponding method on the - ``%(klass)s.plot`` accessor: - ``%(klass_obj)s.plot(kind='line')`` is equivalent to - ``%(klass_obj)s.plot.line()``. +def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, figsize=None, + bins=10, **kwds): + """ + Draw histogram of the input series using matplotlib. Parameters ---------- - data : %(klass)s - %(klass_coord)s - kind : str - - 'line' : line plot (default) - - 'bar' : vertical bar plot - - 'barh' : horizontal bar plot - - 'hist' : histogram - - 'box' : boxplot - - 'kde' : Kernel Density Estimation plot - - 'density' : same as 'kde' - - 'area' : area plot - - 'pie' : pie plot - %(klass_kind)s - %(klass_ax)s - figsize : a tuple (width, height) in inches - use_index : bool, default True - Use index as ticks for x axis - title : string or list - Title to use for the plot. If a string is passed, print the string at - the top of the figure. If a list is passed and `subplots` is True, - print each item in the list above the corresponding subplot. 
- grid : bool, default None (matlab style default) - Axis grid lines - legend : False/True/'reverse' - Place legend on axis subplots - style : list or dict - matplotlib line style per column - logx : bool or 'sym', default False - Use log scaling or symlog scaling on x axis - .. versionchanged:: 0.25.0 + by : object, optional + If passed, then used to form histograms for separate groups + ax : matplotlib axis object + If not passed, uses gca() + grid : bool, default True + Whether to show axis grid lines + xlabelsize : int, default None + If specified changes the x-axis label size + xrot : float, default None + rotation of x axis labels + ylabelsize : int, default None + If specified changes the y-axis label size + yrot : float, default None + rotation of y axis labels + figsize : tuple, default None + figure size in inches by default + bins : integer or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + `**kwds` : keywords + To be passed to the actual plotting function - logy : bool or 'sym' default False - Use log scaling or symlog scaling on y axis - .. versionchanged:: 0.25.0 + Returns + ------- + matplotlib.AxesSubplot + A histogram plot. - loglog : bool or 'sym', default False - Use log scaling or symlog scaling on both x and y axes - .. versionchanged:: 0.25.0 + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. + """ + plot_backend = _get_plot_backend() + return plot_backend.hist_series(self, by=by, ax=ax, grid=grid, + xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot, + figsize=figsize, bins=bins, **kwds) - xticks : sequence - Values to use for the xticks - yticks : sequence - Values to use for the yticks - xlim : 2-tuple/list - ylim : 2-tuple/list - rot : int, default None - Rotation for ticks (xticks for vertical, yticks for horizontal plots) - fontsize : int, default None - Font size for xticks and yticks - colormap : str or matplotlib colormap object, default None - Colormap to select colors from. If string, load colormap with that name - from matplotlib. - colorbar : bool, optional - If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots) - position : float - Specify relative alignments for bar plot layout. - From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) - table : bool, Series or DataFrame, default False - If True, draw a table using the data in the DataFrame and the data will - be transposed to meet matplotlib's default layout. - If a Series or DataFrame is passed, use passed data to draw a table. - yerr : DataFrame, Series, array-like, dict and str - See :ref:`Plotting with Error Bars ` for - detail. - xerr : same types as yerr. - %(klass_unique)s - mark_right : bool, default True - When using a secondary_y axis, automatically mark the column - labels with "(right)" in the legend - `**kwds` : keywords - Options to pass to matplotlib plotting method + +def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, + sharey=False, figsize=None, layout=None, bins=10, **kwds): + """ + Make a histogram of the DataFrame's. + + A `histogram`_ is a representation of the distribution of data. 
+ This function calls :meth:`matplotlib.pyplot.hist`, on each series in + the DataFrame, resulting in one histogram per column. + + .. _histogram: https://en.wikipedia.org/wiki/Histogram + + Parameters + ---------- + data : DataFrame + The pandas object holding the data. + column : string or sequence + If passed, will be used to limit data to a subset of columns. + by : object, optional + If passed, then used to form histograms for separate groups. + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. For example, a value of 90 displays the + x labels rotated 90 degrees clockwise. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. For example, a value of 90 displays the + y labels rotated 90 degrees clockwise. + ax : Matplotlib axes object, default None + The axes to plot the histogram on. + sharex : bool, default True if ax is None else False + In case subplots=True, share x axis and set some x axis labels to + invisible; defaults to True if ax is None otherwise False if an ax + is passed in. + Note that passing in both an ax and sharex=True will alter all x axis + labels for all subplots in a figure. + sharey : bool, default False + In case subplots=True, share y axis and set some y axis labels to + invisible. + figsize : tuple + The size in inches of the figure to create. Uses the value in + `matplotlib.rcParams` by default. + layout : tuple, optional + Tuple of (rows, columns) for the layout of the histograms. + bins : integer or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + **kwds + All other plotting keyword arguments to be passed to + :meth:`matplotlib.pyplot.hist`. Returns ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them + matplotlib.AxesSubplot or numpy.ndarray of them - Notes - ----- + See Also + -------- + matplotlib.pyplot.hist : Plot a histogram using matplotlib. - - See matplotlib documentation online for more on this subject - - If `kind` = 'bar' or 'barh', you can specify relative alignments - for bar plot layout by `position` keyword. - From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) - %(klass_note)s + Examples + -------- + + .. plot:: + :context: close-figs + + This example draws a histogram based on the length and width of + some animals, displayed in three bins + + >>> df = pd.DataFrame({ + ... 'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] + ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> hist = df.hist(bins=3) """ + plot_backend = _get_plot_backend() + return plot_backend.hist_frame(data, column=column, by=by, grid=grid, + xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot, + ax=ax, sharex=sharex, sharey=sharey, + figsize=figsize, layout=layout, bins=bins, + **kwds) + -_shared_docs['boxplot'] = """ +def boxplot(data, column=None, by=None, ax=None, fontsize=None, + rot=0, grid=True, figsize=None, layout=None, return_type=None, + **kwds): + """ Make a box plot from DataFrame columns. 
Make a box-and-whisker plot from DataFrame columns, optionally grouped @@ -333,193 +321,6 @@ >>> type(boxplot) """ - -_shared_docs['kde'] = """ - Generate Kernel Density Estimate plot using Gaussian kernels. - - In statistics, `kernel density estimation`_ (KDE) is a non-parametric - way to estimate the probability density function (PDF) of a random - variable. This function uses Gaussian kernels and includes automatic - bandwidth determination. - - .. _kernel density estimation: - https://en.wikipedia.org/wiki/Kernel_density_estimation - - Parameters - ---------- - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. - If None (default), 'scott' is used. - See :class:`scipy.stats.gaussian_kde` for more information. - ind : NumPy array or integer, optional - Evaluation points for the estimated PDF. If None (default), - 1000 equally spaced points are used. If `ind` is a NumPy array, the - KDE is evaluated at the points passed. If `ind` is an integer, - `ind` number of equally spaced points are used. - **kwds : optional - Additional keyword arguments are documented in - :meth:`pandas.%(this-datatype)s.plot`. - - Returns - ------- - matplotlib.axes.Axes or numpy.ndarray of them - - See Also - -------- - scipy.stats.gaussian_kde : Representation of a kernel-density - estimate using Gaussian kernels. This is the function used - internally to estimate the PDF. - %(sibling-datatype)s.plot.kde : Generate a KDE plot for a - %(sibling-datatype)s. - - Examples - -------- - %(examples)s - """ - - -def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, - bins=10, **kwds): - """ - Draw histogram of the input series using matplotlib. - - Parameters - ---------- - by : object, optional - If passed, then used to form histograms for separate groups - ax : matplotlib axis object - If not passed, uses gca() - grid : bool, default True - Whether to show axis grid lines - xlabelsize : int, default None - If specified changes the x-axis label size - xrot : float, default None - rotation of x axis labels - ylabelsize : int, default None - If specified changes the y-axis label size - yrot : float, default None - rotation of y axis labels - figsize : tuple, default None - figure size in inches by default - bins : integer or sequence, default 10 - Number of histogram bins to be used. If an integer is given, bins + 1 - bin edges are calculated and returned. If bins is a sequence, gives - bin edges, including left edge of first bin and right edge of last - bin. In this case, bins is returned unmodified. - `**kwds` : keywords - To be passed to the actual plotting function - - Returns - ------- - matplotlib.AxesSubplot - A histogram plot. - - See Also - -------- - matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. - """ - plot_backend = _get_plot_backend() - return plot_backend.hist_series(self, by=by, ax=ax, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - figsize=figsize, bins=bins, **kwds) - - -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): - """ - Make a histogram of the DataFrame's. - - A `histogram`_ is a representation of the distribution of data. 
- This function calls :meth:`matplotlib.pyplot.hist`, on each series in - the DataFrame, resulting in one histogram per column. - - .. _histogram: https://en.wikipedia.org/wiki/Histogram - - Parameters - ---------- - data : DataFrame - The pandas object holding the data. - column : string or sequence - If passed, will be used to limit data to a subset of columns. - by : object, optional - If passed, then used to form histograms for separate groups. - grid : bool, default True - Whether to show axis grid lines. - xlabelsize : int, default None - If specified changes the x-axis label size. - xrot : float, default None - Rotation of x axis labels. For example, a value of 90 displays the - x labels rotated 90 degrees clockwise. - ylabelsize : int, default None - If specified changes the y-axis label size. - yrot : float, default None - Rotation of y axis labels. For example, a value of 90 displays the - y labels rotated 90 degrees clockwise. - ax : Matplotlib axes object, default None - The axes to plot the histogram on. - sharex : bool, default True if ax is None else False - In case subplots=True, share x axis and set some x axis labels to - invisible; defaults to True if ax is None otherwise False if an ax - is passed in. - Note that passing in both an ax and sharex=True will alter all x axis - labels for all subplots in a figure. - sharey : bool, default False - In case subplots=True, share y axis and set some y axis labels to - invisible. - figsize : tuple - The size in inches of the figure to create. Uses the value in - `matplotlib.rcParams` by default. - layout : tuple, optional - Tuple of (rows, columns) for the layout of the histograms. - bins : integer or sequence, default 10 - Number of histogram bins to be used. If an integer is given, bins + 1 - bin edges are calculated and returned. If bins is a sequence, gives - bin edges, including left edge of first bin and right edge of last - bin. In this case, bins is returned unmodified. - **kwds - All other plotting keyword arguments to be passed to - :meth:`matplotlib.pyplot.hist`. - - Returns - ------- - matplotlib.AxesSubplot or numpy.ndarray of them - - See Also - -------- - matplotlib.pyplot.hist : Plot a histogram using matplotlib. - - Examples - -------- - - .. plot:: - :context: close-figs - - This example draws a histogram based on the length and width of - some animals, displayed in three bins - - >>> df = pd.DataFrame({ - ... 'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... 
}, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse']) - >>> hist = df.hist(bins=3) - """ - plot_backend = _get_plot_backend() - return plot_backend.hist_frame(data, column=column, by=by, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - ax=ax, sharex=sharex, sharey=sharey, - figsize=figsize, layout=layout, bins=bins, - **kwds) - - -@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) -def boxplot(data, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): plot_backend = _get_plot_backend() return plot_backend.boxplot(data, column=column, by=by, ax=ax, fontsize=fontsize, rot=rot, grid=grid, @@ -527,7 +328,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, return_type=return_type, **kwds) -@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) +@Appender(boxplot.__doc__) def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): @@ -597,81 +398,214 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, sharey=sharey, **kwds) -# kinds supported by both dataframe and series -_common_kinds = ['line', 'bar', 'barh', - 'kde', 'density', 'area', 'hist', 'box'] -# kinds supported by dataframe -_dataframe_kinds = ['scatter', 'hexbin'] -# kinds supported only by series or dataframe single column -_series_kinds = ['pie'] -_all_kinds = _common_kinds + _dataframe_kinds + _series_kinds +class PlotAccessor(PandasObject): + """ + Make plots of Series or DataFrame using the backend specified by the + option ``plotting.backend``. By default, matplotlib is used. + Parameters + ---------- + data : Series or DataFrame + The object for which the method is called + x : label or position, default None + Only used if data is a DataFrame. + y : label, position or list of label, positions, default None + Allows plotting of one column versus another. Only used if data is a + DataFrame. + kind : str + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + - 'scatter' : scatter plot + - 'hexbin' : hexbin plot + figsize : a tuple (width, height) in inches + use_index : bool, default True + Use index as ticks for x axis + title : string or list + Title to use for the plot. If a string is passed, print the string + at the top of the figure. If a list is passed and `subplots` is + True, print each item in the list above the corresponding subplot. + grid : bool, default None (matlab style default) + Axis grid lines + legend : False/True/'reverse' + Place legend on axis subplots + style : list or dict + matplotlib line style per column + logx : bool or 'sym', default False + Use log scaling or symlog scaling on x axis + .. versionchanged:: 0.25.0 -def _get_standard_kind(kind): - return {'density': 'kde'}.get(kind, kind) + logy : bool or 'sym' default False + Use log scaling or symlog scaling on y axis + .. versionchanged:: 0.25.0 + loglog : bool or 'sym', default False + Use log scaling or symlog scaling on both x and y axes + .. versionchanged:: 0.25.0 -def _get_plot_backend(): - """ - Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`). 
+ xticks : sequence + Values to use for the xticks + yticks : sequence + Values to use for the yticks + xlim : 2-tuple/list + ylim : 2-tuple/list + rot : int, default None + Rotation for ticks (xticks for vertical, yticks for horizontal + plots) + fontsize : int, default None + Font size for xticks and yticks + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + colorbar : bool, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' + plots) + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center) + table : bool, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data + will be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a + table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars ` for + detail. + xerr : DataFrame, Series, array-like, dict and str + Equivalent to yerr. + mark_right : bool, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend + `**kwds` : keywords + Options to pass to matplotlib plotting method - The plotting system of pandas has been using matplotlib, but the idea here - is that it can also work with other third-party backends. In the future, - this function will return the backend from a pandas option, and all the - rest of the code in this file will use the backend specified there for the - plotting. + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + If the backend is not the default matplotlib one, the return value + will be the object returned by the backend. - The backend is imported lazily, as matplotlib is a soft dependency, and - pandas can be used without it being installed. + Notes + ----- + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). 
Default is 0.5 + (center) """ - backend_str = pandas.get_option('plotting.backend') - if backend_str == 'matplotlib': - backend_str = 'pandas.plotting._matplotlib' - return importlib.import_module(backend_str) + _common_kinds = ('line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', + 'box') + _series_kinds = ('pie',) + _dataframe_kinds = ('scatter', 'hexbin') + _kind_aliases = {'density': 'kde'} + _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds + def __init__(self, data): + self._parent = data -def _plot_classes(): - plot_backend = _get_plot_backend() - # TODO restore type annotations if we create a base class for plot classes - # (a parent of MPLPlot, and classes of other backends) - classes = [plot_backend.LinePlot, plot_backend.BarPlot, - plot_backend.BarhPlot, plot_backend.AreaPlot, - plot_backend.HistPlot, plot_backend.BoxPlot, - plot_backend.ScatterPlot, plot_backend.HexBinPlot, - plot_backend.KdePlot, plot_backend.PiePlot] - return {class_._kind: class_ for class_ in classes} - - -def _plot(data, x=None, y=None, subplots=False, - ax=None, kind='line', **kwds): - kind = _get_standard_kind(kind.lower().strip()) - if kind in _all_kinds: - klass = _plot_classes()[kind] - else: - raise ValueError("%r is not a valid plot kind" % kind) - - if kind in _dataframe_kinds: - if isinstance(data, ABCDataFrame): - plot_obj = klass(data, x=x, y=y, subplots=subplots, ax=ax, - kind=kind, **kwds) + @staticmethod + def _get_call_args(backend_name, data, args, kwargs): + """ + This function makes calls to this accessor `__call__` method compatible + with the previous `SeriesPlotMethods.__call__` and + `DataFramePlotMethods.__call__`. Those had slightly different + signatures, since `DataFramePlotMethods` accepted `x` and `y` + parameters. + """ + if isinstance(data, ABCSeries): + arg_def = [ + ('kind', 'line'), ('ax', None), ('figsize', None), + ('use_index', True), ('title', None), ('grid', None), + ('legend', False), ('style', None), ('logx', False), + ('logy', False), ('loglog', False), ('xticks', None), + ('yticks', None), ('xlim', None), ('ylim', None), + ('rot', None), ('fontsize', None), ('colormap', None), + ('table', False), ('yerr', None), ('xerr', None), + ('label', None), ('secondary_y', False)] + elif isinstance(data, ABCDataFrame): + arg_def = [ + ('x', None), ('y', None), ('kind', 'line'), ('ax', None), + ('subplots', False), ('sharex', None), ('sharey', False), + ('layout', None), ('figsize', None), ('use_index', True), + ('title', None), ('grid', None), ('legend', True), + ('style', None), ('logx', False), ('logy', False), + ('loglog', False), ('xticks', None), ('yticks', None), + ('xlim', None), ('ylim', None), ('rot', None), + ('fontsize', None), ('colormap', None), ('table', False), + ('yerr', None), ('xerr', None), ('secondary_y', False), + ('sort_columns', False)] else: - raise ValueError("plot kind %r can only be used for data frames" - % kind) - - elif kind in _series_kinds: - if isinstance(data, ABCDataFrame): - if y is None and subplots is False: - msg = "{0} requires either y column or 'subplots=True'" - raise ValueError(msg.format(kind)) - elif y is not None: - if is_integer(y) and not data.columns.holds_integer(): - y = data.columns[y] - # converted to series actually. 
copy to not modify - data = data[y].copy() - data.index.name = y - plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds) - else: - if isinstance(data, ABCDataFrame): + raise TypeError(('Called plot accessor for type {}, expected ' + 'Series or DataFrame').format( + type(data).__name__)) + + if args and isinstance(data, ABCSeries): + msg = ('`Series.plot()` should not be called with positional ' + 'arguments, only keyword arguments. The order of ' + 'positional arguments will change in the future. ' + 'Use `Series.plot({})` instead of `Series.plot({})`.') + positional_args = str(args)[1:-1] + keyword_args = ', '.join('{}={!r}'.format(name, value) + for (name, default), value + in zip(arg_def, args)) + warnings.warn(msg.format(keyword_args, positional_args), + FutureWarning, stacklevel=3) + + pos_args = {name: value for value, (name, _) in zip(args, arg_def)} + if backend_name == 'pandas.plotting._matplotlib': + kwargs = dict(arg_def, **pos_args, **kwargs) + else: + kwargs = dict(pos_args, **kwargs) + + x = kwargs.pop('x', None) + y = kwargs.pop('y', None) + kind = kwargs.pop('kind', 'line') + return x, y, kind, kwargs + + def __call__(self, *args, **kwargs): + plot_backend = _get_plot_backend() + + x, y, kind, kwargs = self._get_call_args(plot_backend.__name__, + self._parent, args, kwargs) + + kind = self._kind_aliases.get(kind, kind) + if kind not in self._all_kinds: + raise ValueError('{} is not a valid plot kind'.format(kind)) + + # The original data structured can be transformed before passed to the + # backend. For example, for DataFrame is common to set the index as the + # `x` parameter, and return a Series with the parameter `y` as values. + data = self._parent.copy() + + if isinstance(data, pandas.core.dtypes.generic.ABCSeries): + kwargs['reuse_plot'] = True + + if kind in self._dataframe_kinds: + if isinstance(data, ABCDataFrame): + return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) + else: + raise ValueError(("plot kind {} can only be used for " + "data frames").format(kind)) + elif kind in self._series_kinds: + if isinstance(data, ABCDataFrame): + if y is None and kwargs.get('subplots') is False: + msg = "{} requires either y column or 'subplots=True'" + raise ValueError(msg.format(kind)) + elif y is not None: + if (is_integer(y) + and not data.columns.holds_integer()): + y = data.columns[y] + # converted to series actually. 
copy to not modify + data = data[y].copy() + data.index.name = y + elif isinstance(data, ABCDataFrame): data_cols = data.columns if x is not None: if is_integer(x) and not data.columns.holds_integer(): @@ -679,7 +613,6 @@ def _plot(data, x=None, y=None, subplots=False, elif not isinstance(data[x], ABCSeries): raise ValueError("x must be a label or position") data = data.set_index(x) - if y is not None: # check if we have y as int or list of ints int_ylist = is_list_like(y) and all(is_integer(c) for c in y) @@ -687,13 +620,13 @@ def _plot(data, x=None, y=None, subplots=False, if int_y_arg and not data.columns.holds_integer(): y = data_cols[y] - label_kw = kwds['label'] if 'label' in kwds else False + label_kw = kwargs['label'] if 'label' in kwargs else False for kw in ['xerr', 'yerr']: - if (kw in kwds) and \ - (isinstance(kwds[kw], str) or - is_integer(kwds[kw])): + if (kw in kwargs and + (isinstance(kwargs[kw], str) + or is_integer(kwargs[kw]))): try: - kwds[kw] = data[kwds[kw]] + kwargs[kw] = data[kwargs[kw]] except (IndexError, KeyError, TypeError): pass @@ -707,312 +640,15 @@ def _plot(data, x=None, y=None, subplots=False, match = is_list_like(label_kw) and len(label_kw) == len(y) if label_kw and not match: raise ValueError( - "label should be list-like and same length as y" - ) + "label should be list-like and same length as y") label_name = label_kw or data.columns data.columns = label_name - plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds) - - plot_obj.generate() - plot_obj.draw() - return plot_obj.result - - -@Appender(_shared_docs['plot'] % _shared_doc_df_kwargs) -def plot_frame(data, x=None, y=None, kind='line', ax=None, - subplots=False, sharex=None, sharey=False, layout=None, - figsize=None, use_index=True, title=None, grid=None, - legend=True, style=None, logx=False, logy=False, loglog=False, - xticks=None, yticks=None, xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - secondary_y=False, sort_columns=False, - **kwds): - return _plot(data, kind=kind, x=x, y=y, ax=ax, - subplots=subplots, sharex=sharex, sharey=sharey, - layout=layout, figsize=figsize, use_index=use_index, - title=title, grid=grid, legend=legend, - style=style, logx=logx, logy=logy, loglog=loglog, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - rot=rot, fontsize=fontsize, colormap=colormap, table=table, - yerr=yerr, xerr=xerr, - secondary_y=secondary_y, sort_columns=sort_columns, - **kwds) - - -@Appender(_shared_docs['plot'] % _shared_doc_series_kwargs) -def plot_series(data, kind='line', ax=None, # Series unique - figsize=None, use_index=True, title=None, grid=None, - legend=False, style=None, logx=False, logy=False, loglog=False, - xticks=None, yticks=None, xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - label=None, secondary_y=False, # Series unique - **kwds): - - # FIXME move this into _matplotlib - import matplotlib.pyplot as plt - if ax is None and len(plt.get_fignums()) > 0: - with plt.rc_context(): - ax = plt.gca() - ax = getattr(ax, 'left_ax', ax) - - return _plot(data, kind=kind, ax=ax, - figsize=figsize, use_index=use_index, title=title, - grid=grid, legend=legend, - style=style, logx=logx, logy=logy, loglog=loglog, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - rot=rot, fontsize=fontsize, colormap=colormap, table=table, - yerr=yerr, xerr=xerr, - label=label, secondary_y=secondary_y, - **kwds) - - -class BasePlotMethods(PandasObject): - - def __init__(self, 
data): - self._parent = data # can be Series or DataFrame - - def __call__(self, *args, **kwargs): - raise NotImplementedError - - -class SeriesPlotMethods(BasePlotMethods): - """ - Series plotting accessor and method. - - Examples - -------- - >>> s.plot.line() - >>> s.plot.bar() - >>> s.plot.hist() - - Plotting methods can also be accessed by calling the accessor as a method - with the ``kind`` argument: - ``s.plot(kind='line')`` is equivalent to ``s.plot.line()`` - """ - - def __call__(self, kind='line', ax=None, - figsize=None, use_index=True, title=None, grid=None, - legend=False, style=None, logx=False, logy=False, - loglog=False, xticks=None, yticks=None, - xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - label=None, secondary_y=False, **kwds): - return plot_series(self._parent, kind=kind, ax=ax, figsize=figsize, - use_index=use_index, title=title, grid=grid, - legend=legend, style=style, logx=logx, logy=logy, - loglog=loglog, xticks=xticks, yticks=yticks, - xlim=xlim, ylim=ylim, rot=rot, fontsize=fontsize, - colormap=colormap, table=table, yerr=yerr, - xerr=xerr, label=label, secondary_y=secondary_y, - **kwds) - __call__.__doc__ = plot_series.__doc__ - - def line(self, **kwds): - """ - Line plot. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - - Examples - -------- - - .. plot:: - :context: close-figs - - >>> s = pd.Series([1, 3, 2]) - >>> s.plot.line() - """ - return self(kind='line', **kwds) - - def bar(self, **kwds): - """ - Vertical bar plot. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='bar', **kwds) - - def barh(self, **kwds): - """ - Horizontal bar plot. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='barh', **kwds) - - def box(self, **kwds): - """ - Boxplot. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='box', **kwds) - - def hist(self, bins=10, **kwds): - """ - Histogram. - - Parameters - ---------- - bins : integer, default 10 - Number of histogram bins to be used - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='hist', bins=bins, **kwds) - - @Appender(_shared_docs['kde'] % { - 'this-datatype': 'Series', - 'sibling-datatype': 'DataFrame', - 'examples': """ - Given a Series of points randomly sampled from an unknown - distribution, estimate its PDF using KDE with automatic - bandwidth determination and plot the results, evaluating them at - 1000 equally spaced points (default): - - .. plot:: - :context: close-figs - - >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) - >>> ax = s.plot.kde() - - A scalar bandwidth can be specified. Using a small bandwidth value can - lead to over-fitting, while using a large bandwidth value may result - in under-fitting: - - .. 
plot:: - :context: close-figs - - >>> ax = s.plot.kde(bw_method=0.3) - - .. plot:: - :context: close-figs - - >>> ax = s.plot.kde(bw_method=3) - - Finally, the `ind` parameter determines the evaluation points for the - plot of the estimated PDF: - - .. plot:: - :context: close-figs - - >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) - """.strip() - }) - def kde(self, bw_method=None, ind=None, **kwds): - return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) - - density = kde - - def area(self, **kwds): - """ - Area plot. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='area', **kwds) - - def pie(self, **kwds): - """ - Pie chart. - - Parameters - ---------- - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - :class:`matplotlib.axes.Axes` or numpy.ndarray of them - """ - return self(kind='pie', **kwds) + return plot_backend.plot(data, kind=kind, **kwargs) -class FramePlotMethods(BasePlotMethods): - """DataFrame plotting accessor and method - - Examples - -------- - >>> df.plot.line() - >>> df.plot.scatter('x', 'y') - >>> df.plot.hexbin() - - These plotting methods can also be accessed by calling the accessor as a - method with the ``kind`` argument: - ``df.plot(kind='line')`` is equivalent to ``df.plot.line()`` - """ - - def __call__(self, x=None, y=None, kind='line', ax=None, - subplots=False, sharex=None, sharey=False, layout=None, - figsize=None, use_index=True, title=None, grid=None, - legend=True, style=None, logx=False, logy=False, loglog=False, - xticks=None, yticks=None, xlim=None, ylim=None, - rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, - secondary_y=False, sort_columns=False, **kwds): - return plot_frame(self._parent, kind=kind, x=x, y=y, ax=ax, - subplots=subplots, sharex=sharex, sharey=sharey, - layout=layout, figsize=figsize, use_index=use_index, - title=title, grid=grid, legend=legend, style=style, - logx=logx, logy=logy, loglog=loglog, xticks=xticks, - yticks=yticks, xlim=xlim, ylim=ylim, rot=rot, - fontsize=fontsize, colormap=colormap, table=table, - yerr=yerr, xerr=xerr, secondary_y=secondary_y, - sort_columns=sort_columns, **kwds) - __call__.__doc__ = plot_frame.__doc__ - - def line(self, x=None, y=None, **kwds): + def line(self, x=None, y=None, **kwargs): """ - Plot DataFrame columns as lines. + Plot Series or DataFrame as lines. This function is useful to plot lines using DataFrame's values as coordinates. @@ -1042,6 +678,12 @@ def line(self, x=None, y=None, **kwds): Examples -------- + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 3, 2]) + >>> s.plot.line() + .. plot:: :context: close-figs @@ -1071,9 +713,9 @@ def line(self, x=None, y=None, **kwds): >>> lines = df.plot.line(x='pig', y='horse') """ - return self(kind='line', x=x, y=y, **kwds) + return self(kind='line', x=x, y=y, **kwargs) - def bar(self, x=None, y=None, **kwds): + def bar(self, x=None, y=None, **kwargs): """ Vertical bar plot. @@ -1156,9 +798,9 @@ def bar(self, x=None, y=None, **kwds): >>> ax = df.plot.bar(x='lifespan', rot=0) """ - return self(kind='bar', x=x, y=y, **kwds) + return self(kind='bar', x=x, y=y, **kwargs) - def barh(self, x=None, y=None, **kwds): + def barh(self, x=None, y=None, **kwargs): """ Make a horizontal bar plot. @@ -1236,9 +878,9 @@ def barh(self, x=None, y=None, **kwds): ... 
'lifespan': lifespan}, index=index) >>> ax = df.plot.barh(x='lifespan') """ - return self(kind='barh', x=x, y=y, **kwds) + return self(kind='barh', x=x, y=y, **kwargs) - def box(self, by=None, **kwds): + def box(self, by=None, **kwargs): r""" Make a box plot of the DataFrame columns. @@ -1286,9 +928,9 @@ def box(self, by=None, **kwds): >>> df = pd.DataFrame(data, columns=list('ABCD')) >>> ax = df.plot.box() """ - return self(kind='box', by=by, **kwds) + return self(kind='box', by=by, **kwargs) - def hist(self, by=None, bins=10, **kwds): + def hist(self, by=None, bins=10, **kwargs): """ Draw one histogram of the DataFrame's columns. @@ -1333,17 +975,83 @@ def hist(self, by=None, bins=10, **kwds): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind='hist', by=by, bins=bins, **kwds) - - @Appender(_shared_docs['kde'] % { - 'this-datatype': 'DataFrame', - 'sibling-datatype': 'Series', - 'examples': """ - Given several Series of points randomly sampled from unknown - distributions, estimate their PDFs using KDE with automatic + return self(kind='hist', by=by, bins=bins, **kwargs) + + def kde(self, bw_method=None, ind=None, **kwargs): + """ + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwidth determination. + + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation + + Parameters + ---------- + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or integer, optional + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + KDE is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwds : optional + Additional keyword arguments are documented in + :meth:`pandas.%(this-datatype)s.plot`. + + Returns + ------- + matplotlib.axes.Axes or numpy.ndarray of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + + Examples + -------- + Given a Series of points randomly sampled from an unknown + distribution, estimate its PDF using KDE with automatic bandwidth determination and plot the results, evaluating them at 1000 equally spaced points (default): + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) + >>> ax = s.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to over-fitting, while using a large bandwidth value may result + in under-fitting: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) + + For DataFrame, it works in the same way: + .. 
plot:: :context: close-figs @@ -1374,14 +1082,12 @@ def hist(self, by=None, bins=10, **kwds): :context: close-figs >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) - """.strip() - }) - def kde(self, bw_method=None, ind=None, **kwds): - return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) + """ + return self(kind='kde', bw_method=bw_method, ind=ind, **kwargs) density = kde - def area(self, x=None, y=None, **kwds): + def area(self, x=None, y=None, **kwargs): """ Draw a stacked area plot. @@ -1452,9 +1158,9 @@ def area(self, x=None, y=None, **kwds): ... }) >>> ax = df.plot.area(x='day') """ - return self(kind='area', x=x, y=y, **kwds) + return self(kind='area', x=x, y=y, **kwargs) - def pie(self, y=None, **kwds): + def pie(self, **kwargs): """ Generate a pie plot. @@ -1501,9 +1207,13 @@ def pie(self, y=None, **kwds): >>> plot = df.plot.pie(subplots=True, figsize=(6, 3)) """ - return self(kind='pie', y=y, **kwds) + if (isinstance(self._parent, ABCDataFrame) + and kwargs.get('y', None) is None + and not kwargs.get('subplots', False)): + raise ValueError("pie requires either y column or 'subplots=True'") + return self(kind='pie', **kwargs) - def scatter(self, x, y, s=None, c=None, **kwds): + def scatter(self, x, y, s=None, c=None, **kwargs): """ Create a scatter plot with varying marker point size and color. @@ -1582,10 +1292,10 @@ def scatter(self, x, y, s=None, c=None, **kwds): ... c='species', ... colormap='viridis') """ - return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds) + return self(kind='scatter', x=x, y=y, s=s, c=c, **kwargs) def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, - **kwds): + **kwargs): """ Generate a hexagonal binning plot. @@ -1668,7 +1378,27 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, ... cmap="viridis") """ if reduce_C_function is not None: - kwds['reduce_C_function'] = reduce_C_function + kwargs['reduce_C_function'] = reduce_C_function if gridsize is not None: - kwds['gridsize'] = gridsize - return self(kind='hexbin', x=x, y=y, C=C, **kwds) + kwargs['gridsize'] = gridsize + + return self(kind='hexbin', x=x, y=y, C=C, **kwargs) + + +def _get_plot_backend(): + """ + Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`). + + The plotting system of pandas has been using matplotlib, but the idea here + is that it can also work with other third-party backends. In the future, + this function will return the backend from a pandas option, and all the + rest of the code in this file will use the backend specified there for the + plotting. + + The backend is imported lazily, as matplotlib is a soft dependency, and + pandas can be used without it being installed. 
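Roughly, the resolution described here behaves like the sketch below (``'my_backend'`` stands for any importable third-party module and is purely hypothetical):

.. code-block:: python

    >>> import pandas as pd
    >>> # 'matplotlib', the default, is mapped to pandas.plotting._matplotlib
    >>> pd.get_option('plotting.backend')
    'matplotlib'
    >>> # any other value is imported as-is, e.g. a hypothetical module
    >>> pd.set_option('plotting.backend', 'my_backend')
    >>> pd.Series([1, 2, 3]).plot()   # now handled by my_backend.plot(...)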
+ """ + backend_str = pandas.get_option('plotting.backend') + if backend_str == 'matplotlib': + backend_str = 'pandas.plotting._matplotlib' + return importlib.import_module(backend_str) diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 1b775d03349d01..8eac6897add0e7 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -13,13 +13,40 @@ from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table +PLOT_CLASSES = {'line': LinePlot, + 'bar': BarPlot, + 'barh': BarhPlot, + 'box': BoxPlot, + 'hist': HistPlot, + 'kde': KdePlot, + 'area': AreaPlot, + 'pie': PiePlot, + 'scatter': ScatterPlot, + 'hexbin': HexBinPlot} + if get_option("plotting.matplotlib.register_converters"): register(explicit=False) -__all__ = ['LinePlot', 'BarPlot', 'BarhPlot', 'HistPlot', 'BoxPlot', 'KdePlot', - 'AreaPlot', 'PiePlot', 'ScatterPlot', 'HexBinPlot', 'hist_series', - 'hist_frame', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', - 'tsplot', 'table', 'andrews_curves', 'autocorrelation_plot', - 'bootstrap_plot', 'lag_plot', 'parallel_coordinates', 'radviz', - 'scatter_matrix', 'register', 'deregister'] +def plot(data, kind, **kwargs): + # Importing pyplot at the top of the file (before the converters are + # registered) causes problems in matplotlib 2 (converters seem to not + # work) + import matplotlib.pyplot as plt + if kwargs.pop('reuse_plot', False): + ax = kwargs.get('ax') + if ax is None and len(plt.get_fignums()) > 0: + with plt.rc_context(): + ax = plt.gca() + kwargs['ax'] = getattr(ax, 'left_ax', ax) + plot_obj = PLOT_CLASSES[kind](data, **kwargs) + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +__all__ = ['plot', 'hist_series', 'hist_frame', 'boxplot', 'boxplot_frame', + 'boxplot_frame_groupby', 'tsplot', 'table', 'andrews_curves', + 'autocorrelation_plot', 'bootstrap_plot', 'lag_plot', + 'parallel_coordinates', 'radviz', 'scatter_matrix', 'register', + 'deregister'] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index e36ffed10d94f9..c3b548a6dfa855 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -143,12 +143,8 @@ def _replot_ax(ax, freq, kwargs): # for tsplot if isinstance(plotf, str): - # XXX _plot_classes is private and shouldn't be imported - # here. 
But as tsplot is deprecated, and we'll remove this - # code soon, it's probably better to not overcomplicate - # things, and just leave this the way it was implemented - from pandas.plotting._core import _plot_classes - plotf = _plot_classes()[plotf]._plot + from pandas.plotting._matplotlib import PLOT_CLASSES + plotf = PLOT_CLASSES[plotf]._plot lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 06c753d1b8e21f..272f01a12156bd 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2225,7 +2225,7 @@ def test_unordered_ts(self): @td.skip_if_no_scipy def test_kind_both_ways(self): df = DataFrame({'x': [1, 2, 3]}) - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: df.plot(kind=kind) getattr(df.plot, kind)() @@ -2235,7 +2235,7 @@ def test_kind_both_ways(self): def test_all_invalid_plot_data(self): df = DataFrame(list('abcd')) - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): @@ -2246,7 +2246,7 @@ def test_partially_invalid_plot_data(self): with tm.RNGContext(42): df = DataFrame(randn(10, 2), dtype=object) df[np.random.rand(df.shape[0]) > 0.5] = 'a' - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): @@ -2738,7 +2738,7 @@ def test_memory_leak(self): import gc results = {} - for kind in plotting._core._plot_classes().keys(): + for kind in plotting.PlotAccessor._all_kinds: args = {} if kind in ['hexbin', 'scatter', 'pie']: @@ -2936,7 +2936,7 @@ def test_df_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}), - plotting._core._dataframe_kinds, kws={'x': 'a', 'y': 'b'}) + plotting.PlotAccessor._dataframe_kinds, kws={'x': 'a', 'y': 'b'}) def test_invalid_colormap(self): df = DataFrame(randn(3, 2), columns=['A', 'B']) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index b58854743a42d4..b27df946aeacfd 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -9,7 +9,7 @@ import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import DataFrame, Series from pandas.tests.plotting.common import TestPlotBase, _check_plot_works import pandas.util.testing as tm @@ -25,6 +25,39 @@ def test_import_error_message(): df.plot() +def test_get_accessor_args(): + func = plotting._core.PlotAccessor._get_call_args + + msg = 'Called plot accessor for type list, expected Series or DataFrame' + with pytest.raises(TypeError, match=msg): + func(backend_name='', data=[], args=[], kwargs={}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + x, y, kind, kwargs = func(backend_name='', data=Series(), + args=['line', None], kwargs={}) + assert x is None + assert y is None + assert kind == 'line' + assert kwargs == {'ax': None} + + x, y, kind, kwargs = func(backend_name='', data=DataFrame(), + args=['x'], kwargs={'y': 'y', + 'kind': 'bar', + 'grid': False}) + assert x == 'x' + assert y == 'y' + assert kind == 'bar' + assert kwargs == {'grid': False} + + x, y, kind, kwargs = func(backend_name='pandas.plotting._matplotlib', + data=Series(), args=[], kwargs={}) + assert x 
is None + assert y is None + assert kind == 'line' + assert len(kwargs) == 22 + + @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9a954b522333dd..d10620b4e75471 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -684,8 +684,8 @@ def test_boxplot_series(self): @pytest.mark.slow def test_kind_both_ways(self): s = Series(range(3)) - kinds = (plotting._core._common_kinds + - plotting._core._series_kinds) + kinds = (plotting.PlotAccessor._common_kinds + + plotting.PlotAccessor._series_kinds) _, ax = self.plt.subplots() for kind in kinds: @@ -696,7 +696,7 @@ def test_kind_both_ways(self): def test_invalid_plot_data(self): s = Series(list('abcd')) _, ax = self.plt.subplots() - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): @@ -705,13 +705,13 @@ def test_invalid_plot_data(self): @pytest.mark.slow def test_valid_object_plot(self): s = Series(range(10), dtype=object) - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: _check_plot_works(s.plot, kind=kind) def test_partially_invalid_plot_data(self): s = Series(['a', 'b', 1.0, 2]) _, ax = self.plt.subplots() - for kind in plotting._core._common_kinds: + for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): @@ -781,8 +781,8 @@ def test_table(self): def test_series_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings(Series([1, 2, 3]), - plotting._core._series_kinds + - plotting._core._common_kinds) + plotting.PlotAccessor._series_kinds + + plotting.PlotAccessor._common_kinds) @pytest.mark.slow def test_standard_colors(self): From 47ffcd67480f373e5531f9322b1f22ac8b381b94 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Thu, 4 Jul 2019 04:40:30 +0800 Subject: [PATCH 151/238] API: ExtensionArray.argsort places the missing value at the end (#27137) --- asv_bench/benchmarks/algorithms.py | 12 ++++++++++ doc/source/whatsnew/v0.25.0.rst | 33 +++++++++++++++++++++++++- pandas/core/arrays/base.py | 11 +++++---- pandas/core/arrays/categorical.py | 18 +++++++++----- pandas/core/sorting.py | 14 ----------- pandas/tests/extension/base/methods.py | 8 +++++++ pandas/tests/frame/test_sorting.py | 4 ++-- 7 files changed, 72 insertions(+), 28 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 45ef47fde0a569..b69efb4689486f 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype): self.idx.quantile(quantile, interpolation=interpolation) +class SortIntegerArray: + params = [10**3, 10**5] + + def setup(self, N): + data = np.arange(N, dtype=float) + data[40] = np.nan + self.array = pd.array(data, dtype='Int64') + + def time_argsort(self, N): + self.array.argsort() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ea6a04ac726b76..5c22a3bcee227f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`). *Previous behavior* -.. code-block:: python +.. 
code-block:: ipython In [5]: np.power(s1, s2) Out[5]: @@ -684,6 +684,36 @@ applying the ufunc. np.power(s1, s2.array) +Categorical.argsort now places missing values at the end +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`Categorical.argsort` now places missing values at the end of the array, making it +consistent with NumPy and the rest of pandas (:issue:`21801`). + +.. ipython:: python + + cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + +*Previous behavior* + +.. code-block:: ipython + + In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + + In [3]: cat.argsort() + Out[3]: array([1, 2, 0]) + + In [4]: cat[cat.argsort()] + Out[4]: + [NaN, a, b] + categories (2, object): [a < b] + +*New behavior* + +.. ipython:: python + + cat.argsort() + cat[cat.argsort()] .. _whatsnew_0250.api_breaking.deps: @@ -767,6 +797,7 @@ Other API changes - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`) - :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`) - Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0762a607f20aea..803a31928ab7a5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,6 +23,7 @@ from pandas._typing import ArrayLike from pandas.core import ops +from pandas.core.sorting import nargsort _not_implemented_message = "{} does not implement {}." @@ -409,7 +410,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): Returns ------- index_array : ndarray - Array of indices that sort ``self``. + Array of indices that sort ``self``. If NaN values are contained, + NaN values are placed at the end. See Also -------- @@ -420,10 +422,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] + + result = nargsort(self, kind=kind, ascending=ascending, + na_position='last') return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a4846c98bd226..e901c11cf30547 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1531,13 +1531,14 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, *args, **kwargs): - # TODO(PY2): use correct signature - # We have to do *args, **kwargs to avoid a a py2-only signature - # issue since np.argsort differs from argsort. + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Return the indices that would sort the Categorical. + .. 
versionchanged:: 0.25.0 + + Changed to sort missing values at the end. + Parameters ---------- ascending : bool, default True @@ -1574,9 +1575,14 @@ def argsort(self, *args, **kwargs): ... ordered=True) >>> cat.argsort() array([3, 0, 1, 2]) + + Missing values are placed at the end + + >>> cat = pd.Categorical([2, None, 1]) + >>> cat.argsort() + array([2, 0, 1]) """ - # Keep the implementation here just for the docstring. - return super().argsort(*args, **kwargs) + return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 750a4c903176f8..b79390581612b2 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): items = extract_array(items) mask = np.asarray(isna(items)) - # specially handle Categorical - if is_categorical_dtype(items): - if na_position not in {'first', 'last'}: - raise ValueError('invalid na_position: {!r}'.format(na_position)) - - cnt_null = mask.sum() - sorted_idx = items.argsort(ascending=ascending, kind=kind) - if ascending and na_position == 'last': - # NaN is coded as -1 and is listed in front after sorting - sorted_idx = np.roll(sorted_idx, -cnt_null) - elif not ascending and na_position == 'first': - # NaN is coded as -1 and is listed in the end after sorting - sorted_idx = np.roll(sorted_idx, cnt_null) - return sorted_idx if is_extension_array_dtype(items): items = items._values_for_argsort() diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index d9e61e6a227e63..9b154a8afeabcc 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -47,6 +47,14 @@ def test_argsort(self, data_for_sorting): expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_missing_array(self, data_missing_for_sorting): + result = data_missing_for_sorting.argsort() + expected = np.array([2, 0, 1], dtype=np.dtype("int")) + # we don't care whether it's int32 or int64 + result = result.astype("int64", casting="safe") + expected = expected.astype("int64", casting="safe") + tm.assert_numpy_array_equal(result, expected) + def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 96aeb608ba3b82..11de77f6779e63 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self): assert_frame_equal(result, expected) result = df.sort_index(ascending=False) - expected = df.iloc[[3, 2, 5, 1, 0, 4]] + expected = df.iloc[[2, 3, 0, 1, 5, 4]] assert_frame_equal(result, expected) def test_sort_index(self): @@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self): reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) - reversed_na_indices = sorted(na_indices, reverse=True) + reversed_na_indices = sorted(na_indices) df = pd.DataFrame({ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], From 1659fffdf72502ad3e31c5458cd26cb6bc982ffc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 3 Jul 2019 13:42:38 -0700 Subject: [PATCH 
152/238] DEPR: make Categorical.ravel() return Categorical (#27199) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/base.py | 15 +++++++++++++++ pandas/core/arrays/categorical.py | 3 +++ pandas/tests/extension/base/base.py | 1 + pandas/tests/extension/base/reshaping.py | 9 +++++++++ pandas/tests/extension/test_categorical.py | 7 ++++++- pandas/tests/extension/test_sparse.py | 5 ++++- 7 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5c22a3bcee227f..646670209cd7b6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -862,6 +862,7 @@ Other deprecations - The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`) - :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`). - :meth:`DataFrame.get_dtype_counts` is deprecated. (:issue:`18262`) +- :meth:`Categorical.ravel` will return a :class:`Categorical` instead of a ``np.ndarray`` (:issue:`27199`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 803a31928ab7a5..21f0f3c08e93bb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -909,6 +909,21 @@ def _formatting_values(self) -> np.ndarray: # Reshaping # ------------------------------------------------------------------------ + def ravel(self, order="C") -> ABCExtensionArray: + """ + Return a flattened view on this array. + + Parameters + ---------- + order : {None, 'C', 'F', 'A', 'K'}, default 'C' + + Notes + ----- + - Because ExtensionArrays are 1D-only, this is a no-op. + - The "order" argument is ignored; it exists only for compatibility with NumPy. 
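# A minimal sketch (assuming pandas with this patch applied) of the deprecation
# noted in the whatsnew entry above: Categorical.ravel() still returns an
# ndarray for now but emits a FutureWarning (the warning itself is added to
# Categorical.ravel further down), while 1D-only extension arrays such as an
# 'Int64' array simply return themselves from ExtensionArray.ravel().
import warnings

import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'b', 'a'])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    result = cat.ravel()
assert isinstance(result, np.ndarray)
assert issubclass(caught[-1].category, FutureWarning)

arr = pd.array([1, 2, None], dtype='Int64')
assert arr.ravel() is arr  # no-op: ExtensionArrays are 1D-only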
+ """ + return self + @classmethod def _concat_same_type( cls, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e901c11cf30547..5ae71ffb165e9a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1713,6 +1713,9 @@ def ravel(self, order='C'): ------- numpy.array """ + warn("Categorical.ravel will return a Categorical object instead " + "of an ndarray in a future version.", + FutureWarning, stacklevel=2) return np.array(self) def view(self): diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index b11603c0e185a1..55cfbea479c472 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -2,6 +2,7 @@ class BaseExtensionTests: + assert_equal = staticmethod(tm.assert_equal) assert_series_equal = staticmethod(tm.assert_series_equal) assert_frame_equal = staticmethod(tm.assert_frame_equal) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index ee22ffb3ccf970..4ea78a4239e6ea 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -269,3 +269,12 @@ def test_unstack(self, data, index, obj): result = result.astype(object) self.assert_frame_equal(result, expected) + + def test_ravel(self, data): + # as long as EA is 1D-only, ravel is a no-op + result = data.ravel() + assert type(result) == type(data) + + # Check that we have a view, not a copy + result[0] = result[1] + assert data[0] == data[1] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d7..046dcc1c74a03d 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -22,6 +22,7 @@ from pandas import Categorical from pandas.api.types import CategoricalDtype from pandas.tests.extension import base +import pandas.util.testing as tm def make_data(): @@ -94,7 +95,11 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + + def test_ravel(self, data): + # GH#27199 Categorical.ravel returns self until after deprecation cycle + with tm.assert_produces_warning(FutureWarning): + data.ravel() class TestGetitem(base.BaseGetitemTests): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 86ca3e230ddd5d..8ce53270b7ba87 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -82,11 +82,14 @@ def data_for_grouping(request): class BaseSparseTests: - def _check_unsupported(self, data): if data.dtype == SparseDtype(int, 0): pytest.skip("Can't store nan in int array.") + @pytest.mark.xfail(reason="SparseArray does not support setitem") + def test_ravel(self, data): + super().test_ravel(data) + class TestDtype(BaseSparseTests, base.BaseDtypeTests): From 23099f7021e8cd8b4343e9b8a87c91d285e98d58 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Wed, 3 Jul 2019 14:45:43 -0700 Subject: [PATCH 153/238] Class to read OpenDocument Tables (#25427) --- ci/deps/travis-36-cov.yaml | 1 + doc/source/user_guide/io.rst | 28 ++- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/compat/_optional.py | 1 + pandas/core/config_init.py | 9 + pandas/io/excel/_base.py | 4 +- pandas/io/excel/_odfreader.py | 176 ++++++++++++++++++ pandas/tests/io/data/blank.ods | Bin 0 -> 2813 bytes pandas/tests/io/data/blank_with_header.ods | Bin 0 -> 2893 bytes pandas/tests/io/data/invalid_value_type.ods | Bin 0 -> 8502 
bytes pandas/tests/io/data/test1.ods | Bin 0 -> 4440 bytes pandas/tests/io/data/test2.ods | Bin 0 -> 2877 bytes pandas/tests/io/data/test3.ods | Bin 0 -> 2889 bytes pandas/tests/io/data/test4.ods | Bin 0 -> 2992 bytes pandas/tests/io/data/test5.ods | Bin 0 -> 2906 bytes pandas/tests/io/data/test_converters.ods | Bin 0 -> 3287 bytes .../tests/io/data/test_index_name_pre17.ods | Bin 0 -> 3699 bytes pandas/tests/io/data/test_multisheet.ods | Bin 0 -> 3797 bytes pandas/tests/io/data/test_squeeze.ods | Bin 0 -> 3218 bytes pandas/tests/io/data/test_types.ods | Bin 0 -> 3489 bytes pandas/tests/io/data/testdateoverflow.ods | Bin 0 -> 3422 bytes pandas/tests/io/data/testdtype.ods | Bin 0 -> 3196 bytes pandas/tests/io/data/testmultiindex.ods | Bin 0 -> 5575 bytes pandas/tests/io/data/testskiprows.ods | Bin 0 -> 3235 bytes pandas/tests/io/data/times_1900.ods | Bin 0 -> 3181 bytes pandas/tests/io/data/times_1904.ods | Bin 0 -> 3215 bytes pandas/tests/io/data/writertable.odt | Bin 0 -> 10313 bytes pandas/tests/io/excel/conftest.py | 2 +- pandas/tests/io/excel/test_odf.py | 39 ++++ pandas/tests/io/excel/test_readers.py | 39 +++- pandas/tests/io/excel/test_xlrd.py | 6 + 31 files changed, 295 insertions(+), 11 deletions(-) create mode 100644 pandas/io/excel/_odfreader.py create mode 100644 pandas/tests/io/data/blank.ods create mode 100644 pandas/tests/io/data/blank_with_header.ods create mode 100644 pandas/tests/io/data/invalid_value_type.ods create mode 100644 pandas/tests/io/data/test1.ods create mode 100644 pandas/tests/io/data/test2.ods create mode 100644 pandas/tests/io/data/test3.ods create mode 100644 pandas/tests/io/data/test4.ods create mode 100644 pandas/tests/io/data/test5.ods create mode 100644 pandas/tests/io/data/test_converters.ods create mode 100644 pandas/tests/io/data/test_index_name_pre17.ods create mode 100644 pandas/tests/io/data/test_multisheet.ods create mode 100644 pandas/tests/io/data/test_squeeze.ods create mode 100644 pandas/tests/io/data/test_types.ods create mode 100644 pandas/tests/io/data/testdateoverflow.ods create mode 100644 pandas/tests/io/data/testdtype.ods create mode 100644 pandas/tests/io/data/testmultiindex.ods create mode 100644 pandas/tests/io/data/testskiprows.ods create mode 100644 pandas/tests/io/data/times_1900.ods create mode 100644 pandas/tests/io/data/times_1904.ods create mode 100644 pandas/tests/io/data/writertable.odt create mode 100644 pandas/tests/io/excel/test_odf.py diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index fead806fc8e1c4..6f85c32b9a915a 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -16,6 +16,7 @@ dependencies: - nomkl - numexpr - numpy=1.15.* + - odfpy - openpyxl - pandas-gbq # https://github.com/pydata/pandas-gbq/issues/271 diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a8bc690efd3cad..3050a630153926 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -32,6 +32,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` @@ -2791,9 +2792,10 @@ parse HTML tables in the top-level pandas io function ``read_html``. 
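As a rough usage sketch (``spreadsheet.ods`` and the sheet name ``'Sheet1'`` are placeholders, and the optional ``odfpy`` dependency must be installed), the new reader plugs into the existing Excel entry points rather than adding a separate function:

.. code-block:: python

    import pandas as pd

    # read_excel dispatches to the ODF reader when engine='odf' is passed
    df = pd.read_excel('spreadsheet.ods', engine='odf', sheet_name='Sheet1')

    # ExcelFile accepts the same engine; sheet_names comes from the ODF reader
    xls = pd.ExcelFile('spreadsheet.ods', engine='odf')
    print(xls.sheet_names)
    df2 = xls.parse('Sheet1')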
Excel files ----------- -The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and -Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python -module. The :meth:`~DataFrame.to_excel` instance method is used for +The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) +files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files +can be read using either ``xlrd`` or ``openpyxl``. +The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. @@ -3229,7 +3231,27 @@ The look and feel of Excel worksheets created from pandas can be modified using * ``float_format`` : Format string for floating point numbers (default ``None``). * ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). +.. _io.ods: +OpenDocument Spreadsheets +------------------------- + +.. versionadded:: 0.25 + +The :func:`~pandas.read_excel` method can also read OpenDocument spreadsheets +using the ``odfpy`` module. The semantics and features for reading +OpenDocument spreadsheets match what can be done for `Excel files`_ using +``engine='odf'``. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel('path_to_file.ods', engine='odf') + +.. note:: + + Currently pandas only supports *reading* OpenDocument spreadsheets. Writing + is not implemented. .. _io.clipboard: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 646670209cd7b6..a9a7b040429095 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -187,6 +187,7 @@ Other enhancements - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where ``` for more details (:issue:`9070`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 31746dc3d6c164..620884d66821c5 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -13,6 +13,7 @@ "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", + "odfpy": "1.3.0", "openpyxl": "2.4.8", "pandas_gbq": "0.8.0", "pyarrow": "0.9.0", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 84ca154d045fe1..7fe9f8438ac744 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -422,6 +422,7 @@ def use_inf_as_na_cb(key): _xls_options = ['xlrd'] _xlsm_options = ['xlrd', 'openpyxl'] _xlsx_options = ['xlrd', 'openpyxl'] +_ods_options = ['odf'] with cf.config_prefix("io.excel.xls"): @@ -447,6 +448,14 @@ def use_inf_as_na_cb(key): validator=str) +with cf.config_prefix("io.excel.ods"): + cf.register_option("reader", "auto", + reader_engine_doc.format( + ext='ods', + others=', '.join(_ods_options)), + validator=str) + + # Set up the io.excel specific writer configuration. writer_engine_doc = """ : string diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8055b6609b1c4f..d10a40541bb6c7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -768,12 +768,14 @@ class ExcelFile: Acceptable values are None or ``xlrd``. 
""" - from pandas.io.excel._xlrd import _XlrdReader + from pandas.io.excel._odfreader import _ODFReader from pandas.io.excel._openpyxl import _OpenpyxlReader + from pandas.io.excel._xlrd import _XlrdReader _engines = { 'xlrd': _XlrdReader, 'openpyxl': _OpenpyxlReader, + 'odf': _ODFReader, } def __init__(self, io, engine=None): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py new file mode 100644 index 00000000000000..c820c1497c3c92 --- /dev/null +++ b/pandas/io/excel/_odfreader.py @@ -0,0 +1,176 @@ +from typing import List + +from pandas.compat._optional import import_optional_dependency + +import pandas as pd +from pandas._typing import FilePathOrBuffer, Scalar + +from pandas.io.excel._base import _BaseExcelReader + + +class _ODFReader(_BaseExcelReader): + """Read tables out of OpenDocument formatted files + + Parameters + ---------- + filepath_or_buffer: string, path to be parsed or + an open readable stream. + """ + def __init__(self, filepath_or_buffer: FilePathOrBuffer): + import_optional_dependency("odf") + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from odf.opendocument import OpenDocument + return OpenDocument + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from odf.opendocument import load + return load(filepath_or_buffer) + + @property + def empty_value(self) -> str: + """Property for compat with other readers.""" + return '' + + @property + def sheet_names(self) -> List[str]: + """Return a list of sheet names present in the document""" + from odf.table import Table + + tables = self.book.getElementsByType(Table) + return [t.getAttribute("name") for t in tables] + + def get_sheet_by_index(self, index: int): + from odf.table import Table + tables = self.book.getElementsByType(Table) + return tables[index] + + def get_sheet_by_name(self, name: str): + from odf.table import Table + + tables = self.book.getElementsByType(Table) + + for table in tables: + if table.getAttribute("name") == name: + return table + + raise ValueError("sheet {name} not found".format(name)) + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + """Parse an ODF Table into a list of lists + """ + from odf.table import CoveredTableCell, TableCell, TableRow + + covered_cell_name = CoveredTableCell().qname + table_cell_name = TableCell().qname + cell_names = {covered_cell_name, table_cell_name} + + sheet_rows = sheet.getElementsByType(TableRow) + empty_rows = 0 + max_row_len = 0 + + table = [] # type: List[List[Scalar]] + + for i, sheet_row in enumerate(sheet_rows): + sheet_cells = [x for x in sheet_row.childNodes + if x.qname in cell_names] + empty_cells = 0 + table_row = [] # type: List[Scalar] + + for j, sheet_cell in enumerate(sheet_cells): + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell, convert_float) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) + + if max_row_len < len(table_row): + max_row_len = len(table_row) + + row_repeat = self._get_row_repeat(sheet_row) + if self._is_empty_row(sheet_row): + empty_rows += row_repeat + else: + # add blank rows to our table + table.extend([[self.empty_value]] * empty_rows) + empty_rows = 0 + for _ in range(row_repeat): + 
table.append(table_row) + + # Make our table square + for row in table: + if len(row) < max_row_len: + row.extend([self.empty_value] * (max_row_len - len(row))) + + return table + + def _get_row_repeat(self, row) -> int: + """Return number of times this row was repeated + Repeating an empty row appeared to be a common way + of representing sparse rows in the table. + """ + from odf.namespaces import TABLENS + + return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1)) + + def _get_column_repeat(self, cell) -> int: + from odf.namespaces import TABLENS + return int(cell.attributes.get( + (TABLENS, 'number-columns-repeated'), 1)) + + def _is_empty_row(self, row) -> bool: + """Helper function to find empty rows + """ + for column in row.childNodes: + if len(column.childNodes) > 0: + return False + + return True + + def _get_cell_value(self, cell, convert_float: bool) -> Scalar: + from odf.namespaces import OFFICENS + cell_type = cell.attributes.get((OFFICENS, 'value-type')) + if cell_type == 'boolean': + if str(cell) == "TRUE": + return True + return False + if cell_type is None: + return self.empty_value + elif cell_type == 'float': + # GH5394 + cell_value = float(cell.attributes.get((OFFICENS, 'value'))) + + if cell_value == 0. and str(cell) != cell_value: # NA handling + return str(cell) + + if convert_float: + val = int(cell_value) + if val == cell_value: + return val + return cell_value + elif cell_type == 'percentage': + cell_value = cell.attributes.get((OFFICENS, 'value')) + return float(cell_value) + elif cell_type == 'string': + return str(cell) + elif cell_type == 'currency': + cell_value = cell.attributes.get((OFFICENS, 'value')) + return float(cell_value) + elif cell_type == 'date': + cell_value = cell.attributes.get((OFFICENS, 'date-value')) + return pd.to_datetime(cell_value) + elif cell_type == 'time': + return pd.to_datetime(str(cell)).time() + else: + raise ValueError('Unrecognized type {}'.format(cell_type)) diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/blank.ods new file mode 100644 index 0000000000000000000000000000000000000000..7ded3c3c1d688242b6af3b2bdd3afe617a23c33c GIT binary patch literal 2813 zcmZ{m2Q*yU8plUmJzCV!LR_6ON*G26!YCo5n-B~pj3^T$dW#x;)Cf09Mv2iv^pa$V z62Vn+wP8j=gdlioZW7+eU2o;RynFXKd!4iQS^saZb=LR$zCYZQ_7`TrUqcKK@2!yx zqApG+BX#VdZg+f8z5yUC#vg@r z$9ker0dQ(eZUFuF2x5T6=%h6iEdZcQJ?8WzY(R)N3JVJM@qTIh%9jM@=;mA1VLTg} zBZ1NtBKor_gl<%Q0~9=RSL2q!S$iL@L+6`*H7|9*-g}D>@gR4Jb%$vOX@!)VsL04# zd;90Wb=^mlButVnPL{tArc;qYT{9S#@gSB%=LEI(=Jw!&@|3Zsh(aH?4<4OrQ<9yFA%V5q zXrIP3y1Wl-X^w^AQ(8XmChpU2z0n&Q!w*z;EKBi95f4z6lW1%Q$>@)peUp<`D_zoO z36gG(I?|sWd1YCcZ^G}`dMC^GCznFu%`H6wf?w0me`6+GuViNYR&r^!rVqzD=AV)! zn)zVZZ(LipEwhA7&s8v$gXyMxCcsn%&*yA%JC@exKaF*L!d34B)YI%&%$6c;HDbt& zoSCr;G(EaSV-rzFbk~Ir#1m!tbd8Uulroi(o~Aa}pXLEsJ5x5sQ!p7osQ&X80)H--H80n&d7AsD!4i8-Y_Q~8_Fv*d zV`?+c;V6N03c~1`m187gh48Wd2|zkKt3H?8>$48(Pfn1R;}q8$waK2c=EHZ)g!M(; zl;Mml7Q>V#oZ~d#lDm{m!aBANc8wfNq62L@o^cupU@xeH$F^5wbA-d(Ts^molS(%c zBUU88j+6}huMNEFn~GG_PEKe6{|h6_u2UU|3ZpR#06@I~ZhpQ2r}R>3+^{vvn4{8Y z_c7{ndm+aG-t?D^Qk#tKrf)R;MbXXa!n165qbLo+PnqN`%bbGPJ6*NT`;ogIpuU9? z-fuAWGuNsil$_mF;8(Gw2ZbA3#l_1Kg$Ni=rKQ}AyT_B+^@H{M&2_TGlI4Qti_)-s zTeg!?aB*Wf9-2h1wJ5K!ov|Dj7#lWOh&IuE|DyeDGhS?^NPsL3d`5KF z0n5Y?{t?#A3fqHcM!Vpy39Hy^wpk|P>v!}YUT#c8n_E6^H4DVl$?SHmcZ3{*r*9VX zRsSK&UXRd`9tB03-E!Z4C(pnXNl;Tif&Ks~dd@3K$OK@^`y?SOP6KJ?-DRhY1vSbN~ zjC$3&-c=uDoT+>mIH!4IX@;MvN>QS>(n5nio}%OY{859sy+Y6Bp}h(Ck~+AJ^SNi? 
z27`4|E-2&(kxGtIP<{#Iih|ovstG6uxiXO90X!yCmEMj7p$W8I^Y@5;mMLsKCn`k2 z^z=LmlWKz=rR=uZgY5dSUqAPIvCE$qHSP077KDM*KGlIAp%}pxhyAqozR_eD?7VTD z@lxPefLK@=Cuom0;TF)o%APQt6}54zrHN=U)&+f?*Bvr-9YDCSA>1D+XlxP3>FQmt z((^V#ta$};27QmT;9po4D)-8%8and(eVYc>^js2cG^x%FZ76X|A{im)O8lH0m{)`| zVi~%SF0S|Zag&itc++K8C`W=Qksxv~+4>uQCMqp9LMj#g!N^9D+oYxHI^|Mu?A@$#P6pC_-^dZjAW1ux#6 z3}AuTfq+ZGSBodq?Hw1ln}fcv{~cC|L+m>8!zFJee4*6Zlk4vS=eT<^a$$rAqIhCT=!3uWgRZ3bZgPg&Om2MTA5_ z?$?+@u!QCP(Js$;Zm+9*Yx@IVNZSM+7I>T^Q@2y*J~BLVj;};jEQQ4Uhw;nSsdbd3 z-CZtdOw7iwZF4wRR#V+`hgiyug(YEVd8)p1pZf2$^bMf*Z>ONJm~D%Csmqw|{>kA~?FI+yh&)?mjf%_T2ljl#an z*Yf{;h50frigTg?01~Oc;=6@F1t3q$OKR4Lc`ukFZp)}ke_17IDX$_ew@i-_#CW?T z`sGMtYi+z6tYxyv?^;RJ{}G-=du-3yHonu_}})ZAjzk8~3JDD8jp_mxuW` zkLzHR>hkI@Y_%z*!eQ^$ELxs~X$jBsd5eg#0yHeqHr=bb3TduMKrp3;nw!yu>aCU+ z9N-g8JocSCCY>5%K6q|L7%0_wRbOxmvd5fd?obex;U&)_7f5kyH@Ej#D+e zO)($_e=xYBVtsWwXfmaSf@Bbo-|gqE&vIBAo_fB|iOjlk8w1i7){y=>$X(B~>+`W# z|41w;8#1z;JL>{!-pi`OIjq9Ewn_>4JV8Vjp)!%GTQ-uAhxZh#thX`e5QY8iko?1} zz)U!py-!{;1#GU@IFHkiP`ZKJ8b7Ijf-Sz+Bz|)(;n%%LW4`LV{0^W9mr!GY)1=hu ziP;Xdi2qgPY0fBow^!8c1N^^{JKg;;{$uOlrl+cZwrW2V*3{P@#_ea_&t2t*?k%;~ z{8x|pS@<(IKZFp5f1v2+5I^VJj}ZFQ^rPC>@7V`8rK3OnLPxzsso6Qebh`RCg-*K5 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/blank_with_header.ods new file mode 100644 index 0000000000000000000000000000000000000000..0a2e696267fda9e6f01daeed31c3ccdcac163555 GIT binary patch literal 2893 zcmZ`*2{_bSA09NwzGdH+5N2%GzFcD;4Wn#>Fc@3LHj@k@dl|}3sLZt{yFu35SRygE z5JDsq5y~}%RPN|L-_!T?-FyDedCvL&&v}36dH&D)p7({@Fq~ik{N1DgDZ#qwAll1u zXQp*@AR2|mg`tpPVZnhuNNiwe2skpt7Zi%b1Y$s;VW<$_P@hX^R0tM?35!4>eK7$j z6c$da$p>KkUO@`*>gx4%6axUDPCMrKBn%c8jKYAT(7}UFk3;5Fxq1cO8Z)y+=gFZ= zL`e}GD$$$wz5#p zCOPQ!UnVObiUBngQFi8IivFZZ{W%c>>Kqk5qP+C96;XorS@ZAKnm*nA3Kv=bArRe? z2`pG!&vY+|oi;%GLVag^2Ci=Ej@{DOIY%g0yK)OvJsF7UnzwrHH#9kUeziw|t@gB! zyRpLdZ$UZswp;K^PQmHQkJ1VH@V54TVUaI4B)+oLx>mC=e=Re7Q8$#u{xkxgBbj|` zEcBVtnFrZri;TR*(|KXNO9!=<8Y2>UTYT<>#-i3F?A~Sb( z(o4F2lRHl*;`f2BqMu~a&Ip)T?@y~`t0Mz!99>%rgxI_Bo6qoJH-+>ghCv+r79s{= zKH3ZIvtCQ-^NV?Ka``aFWKv~ht`Y^hi#U&+Q*sGK2`(@!GG&NZsM;m8^KT0n2);=0 zItdj|IjiDcAmDIrWarJ<++973;YjAKdb(+@(g6T<-Y5 znBHGaEA}qN8=TvMEOoZBAqVAU6pU<*Uujk0cqS8bwutT}Kkl2H*~M}SD^-xIZm?I; zuz5~YDKGTT`Qc#|UXLS^P+EA$m70#I*kd%`bFAt3@D*KToGdc%qQ-;)eMn|$v)pF{R($0hQ{Q14EbSJf=7FFscY2R;EN5{8JgKuPmn+uN$2;J|-Rp!c#Q6F7 z&@TK*Z=@l+76%45bUvfX=>bb{^RDH9#@}p z^by18%M$j(@^0>@rgx(^E4Mn;Kb@%4Ye(se?7!D<_t8R>K`Wu}#YX9_8QwWqEeU

F#(eioN{k% z0KPdnIl8h&|5edBrYk?-CBpIIM)LhrXu3i~J$p@TjkwT^TBNI0-FCcK+Tp#;SFGpVI(FC^3_Ek<~{5}!Ih;Y z>SgV@?a_^cgU*cZA-DqBHnOT!-8I}PF&XaY?4<2@3+*@)LL!@w+^hEg@<1T(xq)NZ zdOj7aF_RXORe)+1lvSTNXqojrsa@kf0cT8R$PHq6t&ZSGsa2}W}#>byI!ei zb%GU&X;6aQX&8rZdy-wAoXTdZO63MYTS=5Y=lc;+{ov&)yS*#P2g6A!@!(mu=)z|9 zM-Q{R6}O$$(;&{J3vq+6XguFj29#wE&}$A%T7<%y+@oWtjvqsIi&vr7V)7j3Zl>PT$##PW%Mu zpVnk_(~^OWEEw4$b0Y1n2>RCi@2}wEbqecjbuhJ*kK~?!6uzLd)b)?Uhw~q5PqD(2 z-B0#^lzlLl<;`wm!n222y6|#EOusV2 z)U(97O=y8gG0@B^pK)3m2KpqLOi%BcT+{_-#oaKp*YuAt1qUMU#_nZ*)#JiAk13K&1F{y$t~6ODL$6%lfz57{*GPV>$hUI$J}*2 z#gtBMlJo+x308dN08MSk20FoL`j2HEAX!7DNL`o(OP}WSW(yXgCGt3dWuSzG$f@)} zIbfVvZhHUM%;!_8@?y{H=q(Mas~8QXsOMmZ&rB;@KZ9Qef)ONE;%;A+;7q1tcC(wZ z4kPeo0Y~mNFU@V7!?~ldSlU7eG#dLAeKk^U}-2^_rekkyc?IM63{dz;?@TZ z$YSRese*(MBzF3ey8N6L zfZ(RW=!yo%&R1CJzKV0C0c#3ccYKCSx&t*bj=1jQBc&SE6yw0QtUA)|Mm!WKr@BEU4xEZ<$GMCu@CV7!SML&kM^IK z2)8*#{d@HLfpDNre+0pwaX-7%58N`%yZ+0weg^)mn;*byO#h;ypG*AAw;v^PY4oGT X;qUB&+W;AlSAeuflEzMf?-}|pl<4e` literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/invalid_value_type.ods new file mode 100644 index 0000000000000000000000000000000000000000..75a7a40b25d79366d8f99fd4945e8e6e7bbffae9 GIT binary patch literal 8502 zcmb`Mby$?!_Q!`9q(wqPKtNg=kyL4rp#=u%p<@^j29Q)*K|(sETT+@sH&TL>G>DWS z-SHca_a4E^ImdJF?|q(ko_GHF?6vn=d#$~{OA&;EiU~M-ka9HSfB*Q`4<3LVAZ7(K zHZ!rafxykIV76TBswx-&lnqLIt+UteDHZ?;bpZtc`29!m57H*H<^>Y@B_xP}ff3XQ z0k*RFPSpsq*J7vo2UN04sMc22cGi|gFuVUpCfv#jY7K!ILH|o8$t8B4nWzu>)tUe7 z8RAQbW|j~WBisMdK{x*#C~5|`gjoLz{EvNf+1$_8^VjG8b1Yp(G=^G1;6}f?nm-dS ziz8b&1a4>h-&=WE#9P}~nb;WF{t}fCfDAwfU?XguOMalG7C_xZ#gxC&io=6_zO9JvgyC2W1rxcsTNZ8i)8cRh61qBs{<9h8s&oNKY2%91#d}Y>|e9Vhi zQh?)H+@zOn`d;u8Z7mSlOa$RIRNR_5U(ja)2m_(ooflUMh(gm(9Ua}~oEe8nSx#@r z8k$ku?uq8CK@WX>B8j8&shpZ~|7hBOKm%uNCr`0Ir4i)3(2br%k6e+#C%J|n>gn|* z@&mMb{Ay4!uz7PUy^|aU%OFU!uv__FYE7TBfTyX%{ev{Dn$U4sJ+nZeh*NAeCPh0f) zZ<}I$Jj;ErE5{Em=D+RRX#}#7r?i$@ujb92h7sw6Us5fuYYM2&uEq_>&wdJt zE4KHUDN9HhF4w{Bt|a*s{ajOhKq7i4W}P~#AhMnh<8HfMDNmhSFsq(sipaOhvf=t0 zOh;(P$Hz0U#}yIJ#7$yiqi2lawj$UH`f4JdukKPgIRB>geqg3vuvV_TR2V*ISWoz} zBt8pr->NllbM-2g3(TFGXcP&9?3r-4`5c>vA2OzuF12OaaMo=9<4r*_-O|)vWL74z zu^m=bff15;y`1JnAeq00`TReC>&T}lHMEt*9BTB4)8`MIETHOyvID|nSsU6D z&j)}m>`D8p3Vo@Q)tnsyp5EK^)v2FU@Pom7&t!WKD_Z#PFGf4&(Xct+sh7-^l zm?sCG*Eb<=qwZm0b|BD(v7^E~p^nE#9mu{BgFjId7^+Ni*%;nLeTV<^_mJ<}4HD;;3W{JIg|ti0IJo_EQN zPh&QP6n>+rAK-f;jonflh>k%SAZoV#*@cY1dj`W8tOQ?940hb5y5~PGkd;`32`N>s z_ARA)?JgXEA=}{v_av{B2Q_m?Y%1~BC#}r1>Q(o9CJX7U-45&-o_K+sKBqHPU9hU< zIq4E^>a=R$WY^!kK6xulFOF+fVW~>c@pS%#t>=_eXY+VsONJ!>C$BEu&Qn6 z5%sp!$92s?4XIQ|^DEho>t6PQ5m?oyEzDeZaT^VlIm-0WQV4(Y7V`+L(snRyN#7OW zmC-SjkfTv{sfqvA7sbm@HFBQ?(?t|&l(^x`EQ zqp!j0GsVY|sB$360_W%Y&I9fwOa59rlmRO?z@;uZn5)kLLCLc>c%>GeZfjf zPE(+{Yt4!}siP=ak^c0ReA4+Q)o<1_y9lNu^UCHHgjG(q-mTqO|JG{t4C&&2NHCf7 z%|&=18L=PG&-Q~69P&pE{*%2B5DE&4yh5d^b48J<`+u>Ob3y!Bis}BbI zNd@feGJr_6Vg?S%ggUTNL5xpO8ijfMbg8*Yqnw?Gk8}MBX(x0V6m!*6axOW1zeFs5 zB7vV;d#G8POcP8HGtW<&&CT2#E?6diHH;AT&K3cgsDX@eslr}ul9N4YyM{Bx$612~ z2stPOhjfTrTaan{6B=@CNN~Xhk=vu*@g&nvCo^@V%=7T&3lB9F0X@%B`j>{cA z*<5%l@s_S4YA-+Vz7;hg_YrN!kD2v?M68g`vEb;&6%41CpgMX>|3Wsvf@8`jKqxE4Grv*$HB0(>Yt;AyrhXmV z2c;v_qvBXo*7l{}q)cs2vs5+o@1~uO=2P(!D-607ZCt(RZf_{R9}D6h#D;i(cefeB z;f~fuf0g=x_3jt|Ds?FB|u@)q< zeV|FN>gV)DgSXmAHkK7ilS1(X?dmEQ$G2YAN7QKN)@ZxlPKeJJ9aQ%SCUhLK3Di(S zDGx%L+sv~Qtbm@F#qXZ*HIBSGm_l1kO#eKgePd zKY}pE2MdOhT9j!p*q=bJeA82r5+YB!1q{ZJ9NbCMkUF57>sX2}4ea?E$zkPD;}gx?0Hb(y6o6)~ggOqZ zb$-TifVbeHw6NbUn8EDCO{)~T0Z_yHs^O9PG_;66Mn_+}SywvGgMuLqKL$ySeXem- zNKVAzC=QPzD1raL#jgyYC=W!&)u&c^rKp^?f9epx;uz?nD;;ti?8~@t%SIxKm^1sN 
z0bt))EBpjdE5f)GD>}LEb|X<>b2IA{ZSAlo8PwL>>eT~kcsKtAcAQ&bYzb*KBAOE( z3t4idS^_eALjvk1uQk4u%nJX&v_JvjB;;Qz-U&d87SqtwmH74*W0jR-pLPK@@`i?s zYHW$=TH~AcdRGcrbov5iUJbQQ68a$yhLg(9!B-rQ^*h~;O^b8|#j?LVv9Amx*WKr| zv5wyaa^0m%R$=9nZ{3$$5W|QkEAK5@gwRqEQ99%0`9*)D;@H$3Ot{B%0#->+>`Ha< z6A<=E4rW{FF1+3tPgXi0tmq?d-ZL8<#7T;b0l#6`B&{Y5Pvd;dsKJMV^3a~{o_sP!;7b)MY?3zj?U)U<_fg)Q(sFryEe&S1 zRs$%{tdJ-CGmx~EA1}GdP3`h{=(dyB)F$_7S}@TB_eyVt_!^QJhGcd(G2UnpCqCVA=b1sI6K2ntKTb|~s@ zC+HLDAf?>AO^)}a^%Z00vy!2(2V;`%Eo`+o{3qVPHl7f@xAm{zl^}`ECshzu61a14 zy#uCu?>f0qnpg^p5t6@NIcZV}h)cU2?Lln1I zOT|~1HIvFGkqc?q+nC$!`^w(X+Py)uNLS$MG0MdZ@ujz+1&Rg$Y|sJ#n27RY3wMMX z{cV02*RZrgTmnyObg47z3XnpX-dmD67kcJu?hc}f_3Pinib*z14r5d-j^7&kCM{EB z_&9rJP@b1Tnr5S7cHGv%)1QOc8QaWP(tO97c&>vKL!IZAY$maC!)9}M{Cc)=>wSw`^Q*Vf`ZJzPR5UZu zGRcDI?m&**R^9x$U!Ruyz6;3fSIk}4k$tpA=EpEuBxFU!m5wgUeuE{t9p|IfC(uJP zU%8SzN|aSwiO``RK&NpdqJa(yH&S4vvGT>#<+-Pb8AKE1SJMEoXMdTrq0(kKG}0 z68W-~7Y_*^#4OZ5bjrzl0F!--iwb#S#LLoLlXfkdX5Pi&>Ssa)3{cn6HBYTJ{d5~f z7w%r9cq6S(C`bA$PpUSN?66Md3n%h-(FIl6RAU|T`K`enQG2#Xu@ww8`%z&Wh3z9! zig-fauSQEoa$PV-F*_Z)G&^R9I+H;4jT(gaMP#Oj4(pyhVAFoC(!*D?ONMUZ684SG zO6u@_jmz_I5{H}?5Z+UbP?g+{U2j=aE|UL{{s^STupA(zO`M zUNL+x>e|v|K_`HxElB~IW+m?*+RtJ>;-zj#ffE~3Jeu{2B9ro@aegv=Xu{fjqf*T# zl4hPJEo#pvm@EEGd|1Mhr*5L?q|)frSHRu=@fiW|#)L*NM{chqB)7NjS}r0)*Hy2=_vVJX6HSzlr1Tz9gg zWEQ*CcbA{nGHJhN0_c-TuGOI@@%8|K1_BRbRiA;}m#&Sl{SUplP;*4wXy)H?!Y3E!} zQkdI=HJ8r{E8e6_ce4uSu63%=2s?Be5-%r_JM@v^%H|e+~nQrtJgndK6udCT$RWfQmfS5Wa)_kkBguU7G z?x+qYd&}I@lF|r_v~0A+YWl?MK?%t_a_dFKz9WgvZ(T#~7&jR2h_#pEn6AekgE*U) z6%UVG{exRx1kgWv=&a_Y9$pi}Td=)-fMzd6)HbSDGdMJR@^oq6omi_JYf|dV=q}dn zJJ#!-kI1lf_D(uoz>+s*Qqo<6A~oq|GuiYes#wzdE1z2mDzPOu4g}$zzVDmSJ*!uj z_YuxiOBGW)OMMu`4Epy9^^aSx@I8y1b;JQY5^?W3lYP%Y;Q!{aS;I{JIFw(~lvu`I zvz?y{nbFtNo7#z5|KcHet&g*!M-=+QkQX@20IS9xVcg}0ZNxVShi0|PM;(ZIk! zNlA&FgM-2wA;<1GeozF=QMvhbyCHy0vb{yWeiM)t#O? 
z#NF0ts;gt1TAU$KezLrxDwSjmL;|-)00Z4S98U5Dnaa&pvjw(^Yb#A+7!{ta1Ri1;zdv|@6_K;A z|LpGozxQ{3(hts4f7Z{*FHmJJpXqkr_z=@(;j;S&1!UeZ73U))_n2$kUH*tytO`l}JXr`G@M zDt*ri2;Txy-9r!~T<9?&fM-3XOB2G6Tzye5J`cn|2mGWS|CNcCCoYT&QHbb4pBGUP zxIYc*9|m5OP|q6}c^-FBQ2i@Qf1odFgXhttSctj*p%nfa?V|K_o`#Ntp#334{Tl6} zv~-?kkDy%?ntqLV@hWNQ595hUn2_^xn-dqPJm;9yJM)AVQ+|E*LdhF#0Hw=mgPY zv}g$letEzDUs+%N?>lGhb-c4=-- zZU`TLPXx@<)78li=HukyE{t-w7xI81osdEvo(Ok)4?ABsgu9Or($gCOvqw525I(w> zF@XS*e3B~v?KP2dR znOxTyoo2oi7XEt7IbZZ-OkjDYh$fsipfd1|{i?^;@tq@7J$|n3S?lmTdi%3;&@_I_ zm0-0_+8{8aQgsEos!YvO&(?1J_#K7N0_&KC^n{AnJ4>3B#}OZ`^jce5fYz}~+2*3N z&t5Xlm+}%bj8TsR$8-1-E7GII#sdXhtGneF?dHRbewq>#X4Mu=%`?Xk+)+zXi79ss zQ7-DTEqv5UMW5P`M1~BYHX4Cv%io<9YgBx1*D;cc)o$_g1+z3Z3Z^a{ns)^up8%x> zf)&((fC{XRX|@*T{exm}yQJ=-FkHH(rgZ5+vfmu*l+}v9$Dq=n6iHmeVnfTLXI$oCeSF@@;oRa5N<5h!nRK(r2wT6Lhy?H|+}1%cux?%A?m3WX}y7dSFgc zL*pQK+w%xf8FKtiItYw~WuHf{Mp7hzpMNBHKlizY386ArO13jKtI;T(odg?50PC@k`gR-TNKkX3q*(s-vlWTDyquKJ|6ni(9jKaFJ4a zSWbB)=t`X-RVb}R17T6(%G&&FMR(YQdE_Hk_2uIJ^0=4fp0{3xq)KaLx>A66tx;s# zuZuBzkZLuTQ>t3gw<$u0ehIls5wBZy4wg?ySa{K~7_-Exv;7}Gd~UqZ%Tt9mMr~IN z>^Ja<8J9seL{vRv*_BVC8tOfC{rSosuQLf@8J-M)<9L1s1UZ1l*Uh{e#}1p9?#6~$ zltSFK)%K>c5A0>>xZ$AqWS8Zz$g*oM2B8?hTF|8G+^FSZTI!YqWlv6Oj@5Me7h)PJ zQ$NqZ@^H)mHUev#kok%jzMw}ouoz{@o>_mK>UYNd0_a9M3(MOi|zanMV6B{Eh6#5pm*xHbgf|f>gR&*L8qBoX~!Iw)V;VM1pGU>$u05M zw8#N~Ze0N2lHUM34|kv6r@!2!cj;Zr{; zg{(5T((=7Bh)NLmp5*k0MBp{2Jb;I!+c$+lx0reF+(pProk+Js75{dVzu0;@VcC=u zgAMeym?>>dOj}W5U!|}6_HFeu!yYY?PU9l~p_-+U?foh6szYm=rBGRHP3}1CSstJH z!6>4v82oi{0Qm}83^se2ez#)6rKQt_+#+2^TK|pr>mIEj-mFvN!GS@3z+un+#=H+r z6KVAnPQ`^gCSw-&(5W-9%F>17R@VcK#K{qzaQu$Ql>BR=hQ;muNy!nE5sCIU@2OhO zqC25@wTTfOe6lqpD*atBdlo3*N*ym_RufOZJar#0k=TXZ;rHrntQGEodL}MG^zCa> z@4K;5GpG~Lbnkjg0xgs&;#+7Q@80d36MDpNv=ql_c3%1xa>Xi4!=+mH$}!=(hT<$G zOo2_)qx^{a&ulety-zZlOc%<(Uf7ezk%Af58BN;>;0Ps6DZ_-sE-SJ*q3>SSFGC8l zvwrAyuYS9(#Tc;SO70kMC`-cu!t_;?44+eOb~kSM%*xhte(1UL#D&x>zpuZHEt(hE zMcGfIMwTp{TI4M?@F@LY!V@AuYFk)2l>|Jk3k!Z)YV!CFWhTC}umX(&2jeN;HbuTF zI!cINJJE=bZb?l?@;!BgjpxIF_$an3f#n){gdD=H_}GrrE+g;SYqrCbN5JMp2PZ<2WBp;@mJHpv7O^pp z|4Pqns$3dTarHCnlV7+`O=r|0ux7(*lU44qsza~)7pHoWA!nBcQroE3Af~&@TguvQ z9c|8xP+)cIm>}J3K|KntvVJ}4nj&HNow1YAyN##z%f(?2wmzn!f3dDc)v=?!xBotN zO0GyGsb#Yd_0(AJiL3l!6+MyM)84iX;cL{qo}x{ElF*#cqI%k)~61-^;_Q(K@{8{4Rx zunE1XL?pM{lupua3Wmmoo_soGG9;70&>eejs0}(mMN^cY7K*Z54Z^9?5v}$mZ{ERNNl|#j zPgds6oZaV}OO~5uHlv@=Q_@#^Ba({Fc9QTp8^n|2CMj_cYsj6JeXbYx5-8h-k=W(4 ztQawB6I~DPKC>Ls80^i}Y$6V`zEHZ}PRBSq7`WnL4uh*>5af_hG7%ypy5XhD2wV$6 zG`HcG$r6ik+M!e!zAbL^^rsj;v@me!?Y4CJ9eb#(5Gg~tQxGMpRhKGSf(F#@2uTlsX2foj#78U+;-av^N-w|a z7A}#a#1v_j<9XsB;@1bjGKz3zI>QLhdJuXFs!K84XZ7`=w8<<-xY&szlI&f4^q1Uh z&La1>yAlCAs5Hx97lrLgv`@-h0s`Rg)gdSi>$TyrxlYTGqREsBQ|V?@Xl zBlHX>FZseJbr{-+QGGk_$)|d zG?#CAf0#pTdrUc5mF>PBwlZFo$Tc>$!ujlAI{QiA2SNPIc2n*>Sh(~zJ)px@6J`3I zI~Z2rJ!Fcn9VhSf?-05)mpH=MgzKN);ZFJLMy!n8wa=QF(L0!d$v5uqoNGe(<=O%t z=!Juov0LRueMgi zoIkRCMZV(vUPp!iYuK^#v<-_)ZG**o`#stL-L#^tF__0}hFct?fX>jvuSdN69(%HUxuym0u)yl;yZ%>R*7& zCgNNJHf_ejA68GU%PT1HepNdctIKYWP!Ie$SHUd&9|h^P5iieuCy; zFP7*vR*|>O05_@L(9k1F1#umbISQ%YMqe=FLYv7!+5#foyK zR}jre?JzS+UaI#{TzpVNP28nJZ=^)(k@bzkL@n&5lc@QbiLAG9UHIe%?k8dQj}KWr*Rfs_&qr5u#A%vVYc)r*RgF~MTNyVdG^I@8o&Ki1PQLC zdMI{fLbB$dGKU))bK}`Cwupx`isQ{v20X;^ESO*Zrc**PLz_t2U_r9ACSxXl3_I&2o~n+ zC+QA1M!$9EcajpUwcI{>_m;z#{7Z!kE89{GD5WLSSevdn5mwX}X%kB)@lv OFDK_^CUVpM_WlQNTI$OH literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/test2.ods new file mode 100644 index 0000000000000000000000000000000000000000..2a90db839026b57eb35a1f5c2120b8a257ea416f GIT binary patch literal 2877 
zcmZ{m2{@G7AIHZQh9p}=WmjQ_OiE*?EW;QVW2eE4#xjGMm>4_NWx^GbqAU?1#Gv8I zUdWQ&jFBax;i`zP;!gda`}F_!zxSN?Jm-Di^L&5jdCvR$yuU93#>EW;{HMtQ5(9LS zKdtVMj$8(fFYn|ZyJw?k!+iM4DhVh~ zlTor4p41p}csoAu$1;~?ZEHcR8_*?RYxM2s<&Pz1lT^xJ&2K)eQjJn=U-)It*#yx3 zN*&2m%M20!Vpwbh(Ce3;*rd01APVf(lmPi{#aez^VoSctlTrE0@xQ9l%0oi~7#h8S{EyJ=L4iJ-1)(I9d%&5 z-*cZclqBe*t3DZaQ2ShQntU?Vi{)QNMNV9dpP4P3iE*GiaTFh`RiRgC`Uq6H6)GZp zj6V8rh)FRlFXEH=t)(p+qQW;EY`pFpXwWqqj6F4d>%6K&@$OPq;avkylq16XjDJ62 zN|9Lr$tr$`$=YoP51yP%l__(RQkr!gFzrdmKjMIGKZk*^KsBX~i~1PW^9Sxq+ZSAQ zL%h#0s1sf@NpxlgqQ9_9XqXqas_ghYbFmroj) zTg9zZq2_Pp0aY>^dRElP?I)F(c(jUhZ4zh9=`@7Cv@W>l*tnDIaIPsb^5b2yZ)g|t zX4~mD#Pe(6G~2s*5)x_sJY%dg5D&3-%B>rIoti7?H)5|!SJTLzYtaEU?+cx(W%~>5 zNM`|DbmdX|#pH?cF<1gI@pL2*}V(QQ5Mf0Bu%pfNoHQ|N@ zgDq3bHLn5R(3%>qOlZx1K9&(IS>G91eeDJ4i++E4+l>NndaGJE=AlcY^O07Q#Fe45 z^4uLr*_RQEF$s~tcc*{ri`NO>dds0 zAO@?xUw9`jZdf9(fE?CBbu5SWeJ~{KVA{r7T~xRJ1d@Uj>Uf1?-A8@bE4hy1hLv!2e=-M7qSun~kN8006+g0N!{U z@f*i%E}M|8Df7pLn;%2=gg?V+xy!egsp)+vvj|3Se-z+qNZdXyP6m88NC0X~7^it`Zy&uoc9B1Wr z)b>UB_`t)N}>YVea25)1~T8R}ie!P=9znLU>}o%uzK9o(Ju?9o(*$nhBW-ZVR# z#~W-Y%r&Ej!`{g z=(SIAYJ;9du}IeAV8Fft*Rt~Tm#k>U~vX|bx96U@q_iN+YPL?DIf zn!Rz+NI(z;cDxl~Hbepb62;}R+_Xb~r#L2Uk*a99D{;eYCGZNl*n zyz^Q4L$KErQ}>49!Al%Fq5a6?aK%7FxVBRVx2Saf*|MpmGfBPai0KfV~~BW+6vas8TxQkdZ?{= zKGZ_ukH{<0XK$6m&l1WOHv5|{C5ri3Z7gqg@6gxF#03xuZv4$2zipz(QB#NRL*$a_ zz)_2i`muLtWye@iXj~jiv0*CBYd_3LWQ#E4K0SW~`ha5MdB?F%ic!0u3rQ;+N6vKf zc*hxyjeRuo*WpQvO3|~0cyjcGJ)YH8Fx8h&qNYkak5K}`S3F1Je5=Qo3=B<_md$px z>h3>_H;>q1R7isV@hE1hUu1i7000#BQG9nGXd>#{KZ%mw3vbsDKEGxDCH_tGd_$UH zxuW=7wTbz|n2!F5VXxV#%+-zeTMN#al}_P{c~C7!(9%wwk?Xn<)D@LgBpI6t$I{By z>2tp_yu=EsEG2#O+j#j%9-bZ!`3xLBMr|OL%XeX6Q+CJ~!07IV*$muayvUNHy#;XJQ*|yhe<6uMz}J4&=?SB` z*U(ltkYYTmG|3xbd8Ux|O=9GkX3)+N?-9Mrp&W%+@BXRzx& zE&e3kJf#z5493>WS(-ldJ<)5Pb1(}$LGkp}eQ98}*V5J>DGy9v^g*gQF%M2!!>p#$ zQ#6HQtsgI|Y}_l^<@vidaB!-Bw`c6H0{DL;`0eeF_O~rWz`m*e-Rk{N*t2hc7{8x& zKi8rkx_9gf^({iha6}(^Rd*eE*I&bOWFzl{FZ@;gYpkt(rooBB}ADH9tQ|uZ>pt-yvF;!|w zp4w7YdHfjDAxlS|B`~?Cz>Z^56~!1a1f7NR(}oxf`jymG+b$otBT)xk2HTbkqn1ui zw89dzX}I-A@&ZoRt`_tDz63as?zi&#I)=%u)wE?P7 z(S+^+%~YQxHozcS@(_((*Q48`dJqIsSiQt{j8WR`@VTq=Yj$jk)g3&6Rf@nOpdIhc z%g98jzOnNQn{)D}(*=w2xT-H@E7B_?Bk}9aOPA|Fe(VinqC`d+)Z%D4)1v~qg!V>y zw3sxv_3lgt^EIuJdf3zZ4Ks_NuOs1;$gGQ86Hq7vnr{{Gu5Q?H`zO@vgtk zLGu-|Ew4XBY$qT2+WPb~tgFN|b=TW8OHn)6UMX_VRUvOE<@9wr@~yDtbzS1BEKhu> zRn>DsN@_jaOh#PmNgPwN_Ozhhl%0*O?RJmFSRom+M_8SO)#GSscx_-f51c}0m#^2P zx%M3>xv8qLxnW6qNA23E0#R8k{lPal$~j3OaPdI13U^zc2y_#DeN z5|(r=7pK%HK3Jhf{oLF-IG@rhf2!I7vP2t4^t|BfL47{V73{0cppg9it+%ejTJ$49 zUJ*}KUT8xg*+zsVs`8HpF-uJ9@~y3n`NN9s#iDykCaeW!qJ^={_4CAo?@iA=xY{uD z=_oN_lT=Hdkh{(%4ZWjTTE0#e`9;j-W;K*H50d3EpRumwL$5sp z6RORzI%2_6{}unhbSO?@pZoXJ<+a`05-N{;U&wM*f8biD1P8e;3r8N^VW-+QvUn9t zAnY9~;6C;36Nx^Y=vXWy^SR=v6{l%tG9SPBc=ARDDy_2E4ShdFdU0&D`naC`<+!v3 z-=4mIqm`?sLh7p~2RU8G9AcA0h7ca(t*6P!$Y9vG1+L^tF+CNIF8tW|ML&PaD$HoN zssf}dy10faznZDqu$KYvAeU}P zw(C0AG=2fmZ{-a}L1L#*5@yH)z;D@`*WK)DH?1R!46agLdzi!#Tjt&ZsB`tLtyVux zyex|o$YS(bZf9y{$Md##1k|@Q1=)q@zt))mIV?q3`S}@-+U2^SPM=;WH@pBQ!gc`; zA+4~=6Xz}@iXlqguEzX@NY@NQjA)w}cx!D5zr0s7C3`;_8^qx{oxY#p3@(ZnDy;Xq zZN>u2Np0UK+flUFEm3-RT!-!Qz7^Hmp9W_&8xMx&2=A8_k=($#snwTsrbSOVfjvS{+x zA{XPVKfSNJzL^v&ZwP)~{Ywn99-2>wj!mi|ngARggQ#it-)jhawAzim ziN#d=DAwLeu?el>n{wnI7$IAaq6Cfsb@(4Qh6HvCXX6yX4ikUE#QUc~`F38Mfs6FY zncNHNc~!u96Hv)Qb0-clXnIRCqKy+r&7MDfLcXG~v=bWAuKl|FTleSi3Q9794$Rmc zChK$Ol*{glTqnbzf(CW7_>TKkrB12w9Pf2<&$y$v37-u<)TciK9Q_%={j6_4&GK}AB7;8qru_&z_?hZrmG#)Rg9AboQ%a2$eTo1lOUROvvS3D2*CltW1IONkY~RdS?-e@Co>aj$tIDzkYesc7 z(GcTc6vhGW_}|(kN~o$jgu!byH6Ju06Bu_b+31hqI8*(z 
z?faoHCtdz9fj{ehE=)gk>!fn^UuEiN;m_Rs5Jph_14TcF_&MKxgh(KzAIS=zXCK6X S5_I;2l63HqvXlATKmP^ORQ{9z literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/test4.ods new file mode 100644 index 0000000000000000000000000000000000000000..c73a20d8b05621dfdd301f16a975576ad6116d21 GIT binary patch literal 2992 zcmZ{m2{c=28^>b_jcB!YHL8{>MWSRZliEgwN)eQ*JtBxogjkBA4IS%PTGTe^)KY6v zTNS0Wwo+>Ah$WOpNXnGd*6J6X?>jxy`R0Akd+vMhd;ZTo=icY{Jbx6Di<=Md*N_3+ z^3_Rzux}35A@;)K@YujmA{I>~`rEAcRB= zz@j}!-dJoPiXBrJ!1FbN48Z5v)BB@b0DvnP062IPDKOL*OM+a(`*zz65-8e7+r{35 z9JSw6LKDodUmhiQP?><_AW` znZY7&RFI+-0+=QxWE9>0c`sx?Mjc>Wlp!h7a|ddj6Nr(7+I zAfEY{>ca#b@mFSJf>%qf-ZbHJw;qU3ShhL+q5;VS>TSAgp-dP2mIoQE#zq`r@`a0+ zS6)~MpX=67j*bR8L2FAcwA^>0P?Bs9NRr2`6ivvBzsuvCl)dtMv&+NS)h@&Gdk^vp z3c#htXq40yM#&X_ukkc;fKP0B;tEP+#O@^qIX=q=_g%N#UcT%$u4i|H4xPI5c~-oR z*FIShEPTN`@;t4V>cSZwe$ji@>KSWivBIU>Aj+)~LyX?<&TB~Mv}ipNq$VK|<@n?C zol7y-k#XWWz0L-viyT#-^`9E#dVEPoExk|#&mj~76!@hc^usmP?xRD}%-haxoyC|y zmWfVseQ3;d1_zWMVV|G z>I?Do^8HcANsj;4Cu={ggi%L#4!V=uV7t-Vaes$D>#VA@5fQGBV*-8P@^g3hlFWl6 zg(XG1L;3kE#X3}`epype_?R`O`?l4vkeLNkbBqa1zePtkf3PU~^)_@kBQq~3f6OJ_ z^@x?NE@Pu?lPk6LFTE6#%PUiOdb{{iIy2q1~c4s55d1o=- z0d%bfcDODAEwa+(UQ@SH<|Ol?|9oWB_2mLHPSYpL>w=I3TIQt)=_jy>9!T>rC@XQa zP2sUlwOF(2{Fn-u7T@611QLMJw#|n;NykcR9lSH&6kUpr5U)K><&*%8st6qUWIJWX zh1^djkDkb1`mnwg}T~B33rHlWT2JR@9Dsd~H z9Pq!_91W)DEqvg<0zGxUeGQz`GHgHOoDCm{;yrYd*1}pF7f#qISlPr(3qMv>oUVuw zSe4w;=;Yi$T>C8d5zHhHBc_H4dF9wfWsR-|GadSfWBxCI5-Ibz$+Nl~4<*Mi)pRr9 z+{l9X@lRD4Z+KZKey{q;_Ck+qqdz5*4fdBWT!6o$U#K=G9mPhUb{GI)(+|M-5dsft z1)Kfp%bf@8w}~z5i??Vb!JuEluSUW1!2*Qyh>-4w9%Hl|y_bRF@-{rYbz;`7J42~s zQBUQB0YgAhbX<&7ZMSyA`>nCbsqCq16eEAR$IlCO+gvD4`A1@TRL%wpV4!fDwmwhm zl{e%Zu$R2CphBEdTPI)2?$19B)&gG(&N)qJ+OTflmb)xDXi2m3=b`k^aCEF`D#7mG zW(oFHiuhBF&NuZt+MDwB%Os1%3|rG{Zs0~a8W;xRWNu|edKeA#BO*U;(nH%Dl(wi- z8hEDO^XP32pT|$8>uO9?GCPjlY$2B38&P3g&96e`nNHm}M{yIf$=VL#&f!@x>6L(M zkp+Zw8c)LlgOAuMU+fdH5j~W9PF_{YT&mXXMi}*GoWQG;KN3pMCf~{$^c%jVGu1Js zEYp?LY8cz^FM9q=_{n{b8u*YS!#Vi}C77o;`CbaqDrd%$7-^F%ur24bNpDF}eAw-f zx^5)@ZcNNmBZLJl-ug@lzgV4-^ZWIstB^pE?9G(6%2`Y`ICNJ}^0C_E!aM8wg#2Q2 zxqVEwgvjwAFNIfIDX$6&)>W0_$1f@PnKC(d!doFxHvSQxO9O&=X{+#=*E>p2UFTVX zuKmFWH2#y**nWXO->~@<<^%vl*jIDoONOc!ED%sUnt=1flD-mKnQG|6b(Y_F>(*@N zHT~d6$_n|6F!D#hvk4fFqo$mB@3fu;`gX9D={@)b(Y1)sDDAtI7TTopMb=QO_bp+c zi<@*-&mLvHTm+1YaRRluq_NP~Bd5d)VKRvnzVQp)O%tnFh08aE;AApGzIiIu{V>u{ zaEmnOJo8oxUYum)ntQoHV!D3u94z(zq}^N(5JNVcnAkA9ssl`oOxCr6xpKS=c{r~r zcV166p)p1B%0!ZH*ox~I*{gPvrf*<$if*#6(NNeCZ5qBeT_q0vza=Yc^=`K{2LN!L z{VBdWAZ#G|pk5_en_G6B6mQSnmOlPo*xuRs0_C~5Zk9MSQhxSO1Uz0?v@m#Y>mw-} z9=aIO?sB5HDHdU(Pi?I$F;FQ{afxM1SarBS z&B(xWDNuRHabX&7ceq6!1pVSTpFu;wsG+IT?>kilBZuB$HenK#`xPgeX;WsR%gQmj zr#t{a!#ta@s>ghANkEG<6sRlke?o`Pi_ zc@cN|!D@CGXenb6%rwGD%}7mro_-tu zbD}8Zf$HDg2{3>7o literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/test5.ods new file mode 100644 index 0000000000000000000000000000000000000000..5872e2624d0330b308defb8fcf5af335e4cc92f7 GIT binary patch literal 2906 zcmZ{m2UJtp7KTHKNCJq`2LUO;f7hwUs8Xec1QMi#mPiS`gdkl8VWdfD3WAD2 zKt&~B5J4FfsiBA<(h(6s>I=?#%lCZq&Ru8S`tP&WzWY1-w?eS7g8+XINdU=DJw<_e zb1)AxC(av(A%q8FP=SGd-X17|cYwcQh`*;o01EGoR|p8i_002woG6yT+3E_Siyh0ex@45X;e+HEIiO?+UFc&cg zg3&`u!g=J0i&eXTlu}Pw{?l2uej5v#qYWw+6lZ+$HNBFLuSH%-SfVA@WARk+ZeSlq}LS%E|hE#)ty`u_{$VbE+b=>~qka z`bJ<*-g52eeB@g#90u+Qb8iPMV*cdw`p~G);gO`=o$+F7IjsJV>h6@?=$q>Dpah>z z&+}<1ns<^sb8Iwiiv%7^rBcz0zSUIwA@_ud$+C%9JBA}mr9{I?My--3_`XY-jFqR} zn(tSBkx{sNKq{xXD7-nE_|?u5T>^tLl$!OCN?wiV`+NbeB4)&nzv^Wph_CdYo;FN7 
zKCN&<=1XkuUb|xNnc>Xi)h;5CN$0-|yAmm4cHZq8Sd~SE2UQL;I@ra|3RlRck+W8? z$N5p}5PBkoF`i@9Q`W)L$AMUqb?Cf4^8~v5Fmq z*&(%Oea+l7A;xc7%OA4)h>%F)_gR;R{q%w9|!K@zOz~XroH~#jtymTMB(mduP|J$mitVrZ?jNgORGRL|TJ!Ljw zY5?pT)kq{7ooPC(+#^>G@#2Qdnm+$Fe|LQFwQFxD3m-PN=|XV~r!p=lk|b`xjtEkO zXVxf@K_4HtG=~qJoBS@36C~V3kE$cTRM^q!$!<$8Rm`THC1P(oJ-j4FGa${sQIleS zZhzu+#7t~r6lmf1Yp)X2abIIK^MfOE%P!$)y+V8q-~vcXYn=EU9%_%tDY=O;M=q?l zJU8!le0-l^acN}eGpa+4=BqEd_-$<40L@8mV+#}(DBZ}x8<$XnJf)-|$x-uOT1%2R zew~fZJ(?)c7+-V!nAHB5Km(zqE}!xE%8i2RXR)k?H@u^(%B{NhPxoa8+D$zwZlwZ+ zuy>!FbZ~b)a?jN;y3~bt#6+%Z-9X7a1dC~|Md%PeSK0{QZ@_b|{#MEHoOk|KQe|>E6a#JWaz;`pUdrC>gF~>V!Fyv-BRGzMY$otlY#WX zF)~v15e}G;?>i7$qJ?q_-*KbBKKfXSULAO1*H~%M?DU2w@J9bgY!`X{YLJ-6Ez)$v z-0IPn(=FsOBvE6?!b3C0DMRZwOG`Y0TyZTyJ~y1$(vLVIQ6y8_Q5~8@jH4GnQioHX z2BaozR4R0%ga_K&ofuJG59(We-qxmi$*XzUf>#l4er}Ztg zpWM!jxZou30n8sav1~#_%?fr0BWsLO%)+9Pi^?QtPX+JOY;AF-O$?cLpD@#~i&DyA z&O1gGmup%nVPk`3rYEQ8aJNQrT}tD_7rtrT?HgaR_QN2|9O7B~@Ex)5gB}U9cIbN^eNJ#Zl0lV&}Hja-7!HLI&uGXoI z41fdg*bk-yw~ADHeNqGU+eE;KnS}Ryy!@Y5m-qG_ZWZbv9fe}A^LeTdLChk+mHq9S zY0b*OKS#-Qw&YzFdg+#DPyf6aFRo?wwv3HvK*TA3nI_?eTZ6s@-F=h2v5=%?)FQ^K zI&|0!F6L~7pG+$^X=7KaIPW-VjWoHXPSth#P|!Vst#-Q@Jnd8VeNO94kGu0jO~B@u zED3A-z)k8#qZChM@~nM07(Az|{(Los*XfLyAQY!jk}G!DAj=&Hab3}(bae8q=5uk# zOJ-|*%CR5~aOnpd$%p1S3w}$cA5)XE5?WSK$tOj1*9)ZZnTlq4iTa^IeES)?eY(2> z!5uy6@7oPCmN>bKiVXTeHo^29V4Lg_Q-raN@mb*#akXQsYwdj=)DPZi9+KEKQ35JM z6}k@Zgi5dUQi2PC*Sf@$VIqqeYl0DZ(#D{4ktFw7X_p-A26y2^&J^3;VWFr;%hFwP zb%HWGyjRuNNyD#Pca7)l7BXX=?reRL#ty;Ft=qr23OS_`wg!{zVd%Tym({nouGWKn zIAKB2xvJ@#H|f}S-nKVym`rHvL~X^dz)c@iqQeaWg_7S-T{&6^Ut~PXtuIhRewe%4 zWBFYktaxM7hq}%2Pq>o1j;szbG393k00fw4^9#0m@}?F>ia3O` zN3zh|2)$z)_zBmsDKX88Gy`OTL!-!O!?c!4R@tEa#0#KDyxzdTnx3yZFe^Gk+gb(5 z@+$P+q^h)`j#LUYQ}oh6njdi<`7Yk8ZtxRK*8nnSymzj#q&2}LVtce!Sn(fsE{*w( zJ_!)NY54Ts6Q}D(QrDJhwF;l^nH~A|3jZ zy7y$wVyO$VQ9P(kaS?Aiv_#`r@*`c!Mzh;%R4i4t-MeEPoM4^Yb_P{Q0`2Rzc5i(t zB`h@NwqF0^wVyK1+(&)-uOr1Ss{Rj4&DHn#|dW6WPTXxe*r(Q@e}|6 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/test_converters.ods new file mode 100644 index 0000000000000000000000000000000000000000..0216fb16311d8817a64e1dcd2c648fa07c6cc0db GIT binary patch literal 3287 zcmZ`+2{_bS8=veT6$Tl)}j2XKOvdeY}MM9YAV&YcfB7+baCTSvM ztqDm+WQoZ(!Z-S!@9F!x_dDk~&pH49d4BIX&pGe=dw<8lmXV1A@b4lGNDtD_0?|K? 
z8XLU@`v)T;V!{xxu&^M1I4r_HG(%&oQU_l;v2@Y=Ls7T7lCQNDUP+#mj$Yw5{N_aE=_5`LWxEFiN8ay*irU_| zyD{44wDWXya>=~#KBdVkAb!~JA;R334C^VMbS?%WKQ>q@z}sXl4lYn>@hwHwHq`yZ zG6N-!8`XW~6PsCQYqKf~oXL>8lrM%i+Hj${5S1JC-7HBiQu|Cb9=#k63DQRHs?YPZ z3lOQU$uj(ijJ2epE5k7dkjM=nD z%T~-5eR3B%9Nj*1DZ2rSP=sW}D3j=!R2rt=NFA)p(OEyo*4ip6%O0<&#@rshsx*w(mma^$EIclv$D1Fdr+!nRIwVPvfSX~weAufu(A2gV}uZ?qynrv!cLAf z?w;5});uOCmEItNp(QpxLgMap$bQ`|FOy64#`imDYc^`6wDU>C&1LeO)=_@*gaIlI z8%rP1&FdFJyERLMpt{#xw)}LMp@G{OYcDO49^M~Jvt1a0!{^zQJ!cknLG4nKp0^wq z6gh{!yJ*`5J~8wC(z1<4KQA4z{J@y)%(Ca!8hHB}#z{7j)!Iiow@gVr*2>O}aCv9z z%awdPflKxMNFor~RB$aGqbVDYQ^&4^+*xZ`Z)J~#X$%Um=Sv|!wz65E4znL5>>;HG zdw>`2enov+oFE?037+NS-_sb<7Wb$Wfs#Q|G-C~ec!xi|wwlDpQHr&}kgi9?t3Hxo z2>h?b^ZaGQKEiz8FZJYY$x_r;Fv}mG%XtsY6&%om{qT?x@V_)rh86jGl?ec-76Jf( zbQ*w%hD03k#t{#=k@N5A@Fdu~91xzj=d1?@k+xhBw`)nZ-a5MgcQ%YivZBrLxo7r3 zX|W*n-#5LYB>j4a-Yl-7+r~C1uWU)@1|fZENKaK^`=`7EY`OW%^_vHK+hQIuryho# zfc0v&ZQ4O%FGoE)Y@WS>$YU6S5Yq;QBJX_EW=i9*g zMLABYaVZB;yOyV}#;n-DJ8-JiYXQMzV$ss*43+ex)qwBGa&}Fy^uWNp`da&rm4*Pi zFSZn3X!;mKep)ZP*RXjLJYTVP2cZu_!a-b#_1<;+OSPBy1#dM%z1dvdt%&z?0$i)4 zau{dc`B}By$JJ%4=d*$eu-qq8qJR|RDPsae;(fi^;S@%uF%*Mu% zeM`>3n?t4(Wxcr#goj`tFR3`;HoQW)Uc;hosCl|ftC!h%+#Ed|dpJmBG#TgxLHgb* zl)pA+jyy1a&^Z4{i2@~tyH<)k82R+fuRS(1{MeN9gEW$3e@{pG4(q#-VxW&#Ac@61 zC#J-5=>dio-s)wfa=uz(L#)!1CxapS4#PB|W7pRJ%KD-5AV&1#pPxy%5A!Y@vo@MZ z@<7_`E%}U7;z9%slL=dCrH`#(r+$Z#bJtU1C0f_+dZtf}dTu$PFpzR_DHL?TU((BJ z#KF8OPYvw*S`9Bfhhwi3xSqn#Qf0`Hu9iG{;L6bKKBZID=D89RVW4YUlDZ@lQwdqo zVJY=F$HB6agFNq4#XH~EnPLepwfQJI|ErCl0O6WPH#dOq>Vz1;|WzHHQkYXo?yCBa3 ztuNZ{{{?5E1S~|_|Ft-r);vyVt;(K+f$=^ATokR^i*kHg_uYos()3C|HIBZt=)w$F z#tBvn^4kg1isH3okY9FFlV}RZ{A{MP5XA-jq06AGr&v(np)*&09{8HineZyIL9K!1 zq{Sd|cXjPT!hF!&gw(8*5v)|dk@OmaOMr_f$xcgQ8@Z3KwezJ8uP!3>JzX@dvW0o9 zF?X!T_yT_eQL*E^0{zfUCT`#hPQy+}!2NI-0Q!{w>DdsYcjJhS==XJn3(NPD6JhFi zJrN2OBe0-Sx4NtCt5XtF!z%49l&SfT%arJ0&_Op*`iapK27hrGlunL8TyT*1r4aMp zM|&TR33qTC#v|uAo({9hb2Dx)Gcum~&ZOU}5qTEOdHTgVEpddc3nk1BvR-(Ni|E5E zK&fR{9>#_36Gh7Jv4}oDjONq}Oj7>?pX8TR@u5=}7_5C}kE`^qWMq!Tji|3MIH8NM zP)jzeg$Nsvw;$~WtbeL}F!%0FPhGV}C~}5!`_*HS43h?$Cw-8t1@SctzL0{w%jYVyni7}bwkZKqEYqD`V2 zmF53fj!oGDbdPJXcQ9@mpQ6$Q0xo=8 z+vbV&1J@9@6035!pBCseK_olWK#vmv&Oe?iD!O6EVClCmuOa}r>iM|6_qD>R|9BLdJtV!zV*mh| zeiT1cAR+>G#M3!WQ2U;f0tve7CR$IOQD3U%f`#DU2t?G!QKBwG%k4fNrP1I+Z(38o z(QcoS)OJ|8*Q%}oEqPoi@N@xDz{)4BkW{1mtvR+!S26q#v6oHS4i$`g_!?DJFj)%> zxa}GtrY0HMMO7>od2#N6IW(|%-cH|dhee#4@>ZqTz1!|>keiWR?4!f#xYdWQSsV#5 z&k$)r?XEAHq5Mj0sL##@`9yXz=hq#sl)dvaD^cqpF(w;F0bf%`tto0HPGF3>vbPec z@9UPDTogwhw+G|L-}aht3*M5=^^XvrUSmduzi9fbwOz5f-4Kr*!k&=R*AyElbo9P* z!wT4wWsNGm9?MneR9~`Hfxj8T{rI?`c*easQ+9I0C&{G(d3UyJnr&dsB9L424)*&Y zU);94qfhA^-c3%wz0J>EQb5yS6(Hh-(LTXiK^Th&HxxUZCxHrt!^c#WX|eKpzsZ$& zBbJkl2c!abvNwNT%`pa*AHtb_SAf4u^rOE&%YOvBgY6OO|BLXS2nc=nQtp0oYKLTndNAPnO;xQownjg HTmb$B5394} literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/test_index_name_pre17.ods new file mode 100644 index 0000000000000000000000000000000000000000..56638c983d94409f5726581d30478cc2d70a6eae GIT binary patch literal 3699 zcmZ`+1yq#V+8*E#Qc}_&-3=1b5`%P!q%Z?R4&e-49zu|k4&jI(A|asE&@mt(3=Knf z!vNBVB6sxM|M&mB=k9Oq^?mRD-uK;W?X{ovJddFc-XFw(zXv z;tuf+@PdH6yr3@jAa55>4^dwa2N6#Y+yyS;=>_p{@U-`Fhj@64z`bA)kOSNq0`WG) z_M`z2{OrIESo$+&8-fP_NMN`5u@l@o01APN_`5?tntk$^6{BdO+fpVX4NT*Ps4^pA z{v=cA`L6t24VQkt75v$<1*_9SGkZ&rlAl6Ap|&KyfGHG~W~k~g zpl+3KtK3#(mF66wkbC0fc(Mu?!rn7~PgtuvyX%0!zT^!kbors&v;e_1U zF?Yra+gb1>`5^`xD1Ikoq5KS!!pY2uVXx5QaJ5B_yZxq9lQf#AX(_?%w(gPkF=D>-ddS& z|Fvt1k^Y{cj~O&ps5>@a-mn(Y!N~OO`Ryy>Qp+M@qN_Z`>9VdQvSCg5+@CWMm?hN}BQ< 
zQ+A87vkPg4g9Sb0V-bbEse)_T#}+2uGlJ%x5c)a11;ThHO)-No1noYZ0{wKD-3@I< z3s$iJmvm-zveXT#HxL@y#vbAtr=}Z(?XiRT?2?g-Yx|;R+ZSi2dhnx{(+pMy#C4QW z_moa%6IMv!09pL!xAmwPoUQ)2_{u5}`S%G;m+hD+M zePPT*^a$(8we})~A{YF+dKZKo2if8qI!m8+Nl^9m$@n1#J_RNb)C6@QUGU!PwFX#s z3!+(hgAA)eXhD^B@bn-%{N@s)7R4+!Rn|}wu=l;^=18DF_el*oFxJid>(jvS@?@%{ zMPGbj7MHS(?;wkf(w+Ao0DP$_@6%~qzbHd4DMehZ(bVJ8V;yg5&sT^`cb8P-P>Crd znsGHr){}Ir4!jSdcHtF3g9ZOte33nC+(!f6H?u1YBO2bOQUn9-!Oovw#pLf<44BM% zHpV5If2*XG*%QS=?fM!I@c+T6JLBnGL;wKDSOWkg*bIA55APr7#i9{ojWk}BrD*Y- z_^#FmmyStA0r*qo9NZFv@_dKV!2L094*lHxT;Hw`T!Ou_D)E(U@v?wQh|jyPK01N> zTN|57yb(btf6cf63Y++F91(D~LsuZ{xATL=5S3F%-`hd^KBX+>b2S{>^$ay={F1FoSkQAXM>+k zmoV~ZDD7NoNnW~IAMFGc5zUJ{~L6}Jeo|((uCM3aIrb%LY%hwGGx9^~nvE1#1 zv@5IJWA|qjqWokg6p>eNJ5@i4Ky?z09H4jxX!J0y_2frvJ*ScG~O?I>OC*XwxPVH3Y#U4f~x+a;P? zt9%_Ps`_>uoxw??)3H_R;kwW&a6A8}6l7iUrMN&C9n$+*W~mC^Z*fI!{87h*vr!@?+q{t%r&QN@b8p4%jh zm$?)&p#tuPMTw764h?U9R{10?3K(z9>1*CuNkvxBYqoXV(&xv^)8b=1wVw~pNH6$?HoYO$l~Z z%zP(Ognm_{xGZ($;s)f~q}3nP-K;e%9%&kA+T&<6*kw30a}kY3m>J#l?2*8Ep^S3D z6=4&idH2>`kIIE7P9Uz#`N&HP+#H$A8yrz-KIc3_!Eu74mIoZlKWBOv$W(Rb@^Tl zkdbxUXb>m5ge+N@yi4uI!diHkUi@=nSh_G+Xj8Fvji0VxUZT+|(IaZY=>k|Wb26hx zw*3t3$P@CezPG%-(&{mF-OAg&k1fwJ{etPyP-XyhSIqh8YF+0#f?Le?_T0ppzgDqU zIq15x8^gx&pfNcp7xq}*5+McC@k2vSnJ#@SuDU~P%-U3KGfxXSALTEyXmdnwC1!~9 ztdn1jjIvx7tWN^l5T?$H*o^{<3fJ(JoeqX(us<_m!+`Z+YM};6G=x8-w zs zzPNu!2C!4rs&r{%thN6PWRT#K=xUHKP10T0Ojiq!De7+Igkijb)=DbR)!VR9!J>(M z?cEdviJ6-d{2lsAc+xkQF+zaNLd?7PwBdBz{MN)l%Wg;*V^btJPHh6mvpuljQOTSJ?#pEZ8E1seTFF}VNr7Ian_-^#&-$lUO)`9stQ_n zwf9N}9#;C;et6F8F{!E8Ja7p`qD3D}5+OpnrjZ+Q8;yjs4Riw`2s=1JVnouaZ3Sa@ ztnP4bC-28`RE?ig?bAU1cWS`UKGJu;dDDpmkfddvdB*H^z4(0J&keL5G*nqSy%hs&YA(Xm+4aHZ-F2!qeM&EP3hTIh zmunR|^UQ+h_YEWT`>8X5xBLmL!n)~lSjj0%+H}uJ5&8n2z?hdq0pLW>s8{re;sF#E_21`ONfhlAmqV+}Xn=+I*!TO@{&);C%nC7*(VLucsnq}bzTi6`!VtV8cYcpipP%ZGp=yOFQ zDO((r-}_~0PIV=YSVSU|1v(xYxMMpW?o>IssjQ;Ly$$>>S)11$p%wIXuAKg#sr?0U zyQ>ur01$~iil5pY;tl%2>+vZD26*fgT?ag&gdH#O5eVN;`_eO81O6r1{+udS#&4c=xZ{8K_7rbRogJMa4$%V;wyxPhiwYYjX zUpZ;XPh6jP*ej-lUf>!$_&{UMFyYQ1rpFhxn#)_Tw;V2`mnv2LNTph+GlVYuaPZwT zV{`pUzp?)2MNm=Mk-}ZuR+nWPui1+QVB|Ij`;u|6+zrU*Y$hWn*4c68=+BhgBq{3` z-B7VfGvELj)uU5RbkKV|{|%+1PQC^W82?n{6BqmTF^dg%n#`4@InF&=t)t?;v|%|8 z299GgJqC7Y>o?Y`>+qLf-#8!QYgd!me24~S3|e-L4;<1jQjDKuhpJq=^x6Kew!*;` z{%LBk3xNN$Pd`?FjsG!IhB`m0{<}f?Rbh(#{AHJZulrrg{i<8WV&(rRyWb0c2hFd- tN~}TqM+p7i;`ht;t3@+bR$@)#&#PysgOAmk002JrVZ&ZcMY12!zW~+rm5TrX literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/test_multisheet.ods new file mode 100644 index 0000000000000000000000000000000000000000..39058e67b4d5bd7b32a66018a2c78dc9aa256328 GIT binary patch literal 3797 zcmZ`+2UHW=77e}kB1MW+ktTv5y(olE=3n*12F zqydA%Ajq%)2sj|X59$d{}c zh!m3(K>bIAa{xA{j_q6u0APi*&5xZB$S^+$LM{~M*J0O}TA;={<0kbz`s|E2lh-9< zKBhu_tW0gw9z}{mo#O8X^$#}igpy%rcgzk}Osv9SBh%0Bw&3wsb~ZR$*MjTo zql1Va`daYz{adb|ucdM*Gq#z(bynpVU{d2(9Mune<;wukHf+)bCRzi77=T zm}j)NQ10&L4mLlETOG-)MzK+x#7v~NGaaU=-r$NY4CN)dU%C|Obh&e-jP@X#RUs(- zb#!_+#cxMKZ*=m#W{pdOJnzSsWF!{QU8O1imdfq1qd3V zn84dQ=TcqLtY5?hr&$u_rnKJjcU~xi$%=Zc119QD<99&~d*j(cT#(@9 zZ12{R=?}Tos)Nl0-$Dy|=Cn(Ua`y>jXBb6umo5&H$q5zRJqymF?L|2@PB{WLq_f{X;wI1H8;pr9^^yVsX0tP6$D@MS?pW zPc^niqvUEkl9Y4Tw^M<5Ymt4pZ2eAz+eVaM&DNuFR!O`kC{&HYdq`QMBzIiVelKN< z8rtKSP@~WO;?~i);M9hK&q5rX;&V$|$5Vc<$@2${nzv~~4w}!ud5$mOw-M|m&dq1g zy?LO^oA9;u5F`MuNt|7E3~DUee(iNJIfJ)c8Eyys@*WeSoJvdfg2`}|DnJMKG@;YY z@N>H}yg!a64-t`BJK@)^WXDjKXhts|;NIaI9qul6-&RmihPt9Ii~7zjU^|pWzUSK) z@Qv#60N2Hs&$}>YCmI^yt4(sUhvXTP9CzAu>u04D)I_gWew1vU-fohyIiI7QK558uxN9F 
z=r@K%^q|2c2wqSQsgag`3o|=dZg(;Vw+-XPJ63axqDPpB3IwD$debM6k?C|up|0?K zbNz|Q&95{3ZxrUAo=^e+-@j7;{#U;;DiFqLY5)KW1OOOF`t|gOBYy~kX94Qr&bP-lJfLI_6u0=ID@+OVr9>J8Ay<~%S(O3ZlO6q+%{ z-nqBGKPP%{dA(wQg--Leq-vUD6S8?L17R>69Uq8lK%O4#o@qkj8{43Fq)d8-!^EZ! znh$q!q`CD2b-2$l)YBPJ(K}rDIICDHzsL|n;57E%LbhK_I}quli|K1eL&R)ZWb1Ir zxR9{o%BkJ2y&UUdx`J`*BQ=%q{A6JGKr-%d1e|)aJ{#glTYDPX;U^UNkyD=$Tjb@? z!RptNN5fa5osBoR<2TpzdS)X+MfLhY;cEX9qM>osMu3Yt_?2J&2-b$DF^+wrVtNO! zX`f}*(37`NFL>MGm{wt3-W+*ju$WYFnCDu^KAvKt98b!4fCE zzLtaKgp1Bb4M)bAsCN$Pkrms_^aRKO-7Oh=V(O8t_h|jgK;m9*N*mubgW3jaJ}FnQ z)@oIvw$Ailvi9Vik&xPUSh41c`6SXC)P4Td9XjpgagRcdV~Q%Hj-Wf}lv-ay5ch0n z@p$*Or2)xFi}cbUCv(=mYX$D~`$n%zQ3eZ6A~QE%K9|b*&7<2?VXatUWGGZ)@A%si zIiKgcH#{i}9}-TMAdRZ)JKvqm&przio`ea(zmREiS1n~eA^7qKf=oo^ z2-86HP`W=YTSdh%G8+2)Njj@T(iOI05pC;+XrQ?jbL4a^1KZPrE2C!S0fK%YfcZU* z-_K73#ROyz0mXPGVLamcSP<+&#eJ9BWGK%^jfDaK#RcYw9=0$5@Rjo!o#NXQB_RfZ zd?HDNxG@NZ!$h%jvv=r{)Qn$%`Ml&NbR#A0;XCA~qV{f!e3f_l^Cz2jKJ*Xo&&ZfHIo7m2$#v(~ zB$H86|MW7T+qIgY{Eez9XIZB3Yj-|&h%&6!|IrY3zk(H=s1P&Pf^>)09Um@EJ$%!H zdnoa+d1ABNltb~ARe5(_rMmB$%P5u>LC8bIGo24sii=B!BhfPTnGFIfik|)oDPiep*7i8ifnL zQbM*2Z3lfGufuZ?d%HXQK_>AB_4BFS9tFn{@$avSZZ0Mgjn3_8g(Xl~4a95Tk}aF6(TlxZ;f(c7*nO4>mpi!w7o!*+tchK*bEd$Bi#G0>3eon)<&3W zcacQ`TYt^cWwu!23Xjqfx0S5z>pAD;Rs3kA;poUF26d=hF8R)?W&JV{fjo59EG;@h zWMpwpS(d6Lo=~b?%Y~nLH+#+f!i*Q)=?RzYH9A#@Ee0XWxPd`sS8&$GJ!R7>E!qU`^OqI1s+Sgt&)JdB zV^CU>$K%Z5_r4Z7SZ?(9-BSb23|15$Dy)0kHNnbe00RS%2 z(@ft+SJuefKpqB$L%ku0KgwZcx~?yU0;B$3{D<~X?T}h&iJ~+2X zoIHb3I+r{}KWfy`Ft!Sju#e$TN2Axo8Yj~|m`rrp_Yku#GxGxKPg1VB<=emHovvHZ zP)dJ9u$%3o@7VOH|RDH5ZAeDxEoPIdT4ME*SPOZEr3n0^x()roc$6K9Y&g}*O5-(yW@j?g_Biu$-5+#NY{!Hzhaci=p=8cW&FT=)$WGQ9SL0;LVM=hy4m418xrINCN{l_ic3(6Lya#ja_20N+7T$^e z>;;DteN<)Bv`=%so%uTp$;f5@uuY_Y0r;mu`?31-`rEF7On$KbzXkirupzzvG-SW> zer^4J@;FJY;(s-QzY>3y%TFRNsfGMk@%$R%*EIVXLX44QO#ezckO>v3H3a~uNVhO4 KDIYTZnEeNEl)i8P literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/test_squeeze.ods new file mode 100644 index 0000000000000000000000000000000000000000..10ccf0da2693e7b612ece6b9ae28b4c2fd811b05 GIT binary patch literal 3218 zcmZ{n2UHVX8iqp&NC`-9B2}bFlSomJrbv+v2BgLi3=jg6&>{pydQ}kVhy+BcH0dBk zdh?eblpbk;rFRht8{Fcq`|q7|&z-q5?>FbneE0d@Yhpk}O%M1-69R-HRib3b7pMIU zxx2f#!!iDzaHyvz(gg;^xOkvseNav^9#FIkTE@c@j&kyVdAq|=7#Xyu7aZz@M!?}1 z6LL&;0L{+`LICTG#L$b>0Dub<0C4&yG{zqZN6YxRBl{tJs0n5EDQ3A7$Fe1}>UmLw zu+{oKn#zokep(au3uvuplzLr8@0*ilBh50jQgRSY5nqpR8R)!o*Bu@<`@hmxkdU-M zqN`p7cZWrmYMqdFAo~%Do$U&Vwau=hhPrl6@M+P1)3q9v1qK_QrJMTEbe(hUFlGAJCyjos0`8-#XjsX!Sew;8 zf=w`?B_KYIrL}~5w>srx@vD%PxmJi;6b0!KGMqqv{8*)kgUYB81D9Lo3CaXF%t5)f zV}#)u^+i~k) zb(iR&Nvp64PMcMm2de{T7F1nVF9(+OU#`1RpM6p}Z>>$wTCzbZ=h;DB&|)UUAP86V*$gd!a0X3Ph{fsrFl4Jn!SiXyRyzEX2jchlmU9 zRG%!m_d_k;Fh>veMs)@7%HI*2J!-c*N?}W>ie@ryGu^UC%yEY$cYe1DX2@hjWA3^p zq|ad=Pp}#d=qfW8Z`Ffp@jP8`w~@|V(4}Y==SV~PD+KL$OIogO-7n6~-t8}Q-+?u@ z`RI*o{4NnEj2UP<>dkXDCn6v~=j_Rbh)189TN3d*0O8H2dqK~+n}jZMC;KP>&DHGU z*g8PAxxz}`9K88=g<`rgZ*0T4fYx49B}4`-H%i>(wU0z)4r5rmi& zSfm}D1j?Y+Y=6F(%W7R3^!nDhTg}i>8(w!DA}n=K5JYiy>V0^Qhh-3q-jO6Czncjs z$)4%>oLi$LdG76bdqI)Fu0Dwp%cY=<~YeR!{?dz)*4kJZ{_Y;qa2CWcTK}(rl?Xy!; zv{K@!x;xvLGtIcebXA~Ko=d8kjJ`58#H5utC-8D#xX+8`fuxn4MgbjLUgr?UI;7BQ zI0Wx4-DhQzR8*S_B*;bI1m-!gAH=|-*JviE(aAMgyQ6NkOLaxD?gBQ;D*;0#4}!cU zT@<*+z6cK-FQle4e9KYpp{}sEupsLC1f??!^=*U`ioH3)p~GO`fkmTkb8a80W z1Y1(YU}@onnmZBfRc+nD=W)yOb$eZq*-;g55dZhM4QH#c>;`Z0}V7!gj8?M4i| ztC6eY?ms~bWWPksu~^85O8VJ7*UhvUtG`F9ccJ`U-k@g5WtNMqOTGyjVxsjh%Kc`! 
z`_uCU*ZDg9C)Y=ri{(fmYT>G#DZVMD_Yb2IPg)w*jV3#Qp1dDn$}xfy!>P)w7cInM(9&Z-!PNzY)2 zP_iIeKkBZFR#B8gBxkM)1JfM|)jd>FLADrC0)N@kBVRP*BbOc_I8G8RuYspnjaRO0 zyiJ!+w>e1THqiRzVAM2Uzn*2r_NYjP67TLVkvhwFN#|GzJNINQ_42SSQ)i}rNr5n{ zWHztaFgQKGq((ne@U9|_K7!q6sbaa1MR$1UZKF15J&v>2P4N3&Xw>r;DH`!ex6;^; zkG`H?>^0?IzK(qVl?(0QRU@`CYxp<;uAzXio4B)fEj9xbNU~&AkP&yyQm+Fwg=ZTO zMGGe2<@+@rl}hw6UUNB_)b_Geq!v;2M!uEy3M*nlqd%8lCF3`$#Kwn8qf@6d0Xv-a zS9(ga@HoZ~KX6C8o66&1Y;CITZ`W#L_D1{!T}dCxA;^GpDCWv?i!{Do$F-rs3Yp@Q zK5Io0rSz@I+NAyCZk5gDJ7P*iVfZew2gvkY!pQdyC<(3K@)sTbzfc6p6Z9i`A8JNd;9`)+ zY}bN=S4C^bpWXu*XtExlXY8it_^;)~Xgj1s-tkOU&8vc+;y>S+=>o!nHAhExG~HBy zPjPW-reFt(e&4rW6-91b7mliq=d&J-L1O>r@MrM-^3Ozd4ebldw@3=_vRj_$K0KN% z;g(}}|LNp<>m^p);38}n z%u`G%I$uj1*XLZ43Q@c0G;@{TrbZ-5G#mQm6C0;O@S+;?-P8@Xrj`0y109OKikKMj zsKtj_mQ`8{UPnSr{ayS%jxCXnc8A5emD8hN;l&YU;Mae+eDi#o-P823VnleiFlreFRyK4aH{(M{`r@}f_(bRP5-X@y-@zr;mPImKg#Cs!r#}*FX0$D oU;c9C{2t=>9Qzewo{qe$_-DqM7yxNb?*Pe%AUPv#PwP40?~_=RNdN!< literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/test_types.ods new file mode 100644 index 0000000000000000000000000000000000000000..c9a82bfff810bca9bf129961f8eb0c8a72ef244c GIT binary patch literal 3489 zcmZ`+2UJtb7QGO9?;r>wy-Jiw7YqtW?=AEiFf<7ULIA1KiEh*g1?l7K;^2&Qb8+!A zAlPI9$bMS5476yf;}l2$pqK^#eyoJ_LwmR&uLO8{ben4@7s)WRY79JZL)RJk=7&Ab z-K4o2NFVEKy^``BaH82tc=5>d+atTDd(=AK-rbVOEXKpis#dYAuFQ>ge|R}0?O*CE ztv40LVmuyyMjT8%ZG%_2mC~|Feli;6TzIB#m!xv$lVa0HQ32B|38PALN8(I&cymTG zI%vspnz34r!+wIhH=#OW@VMO$_#JKPOon!81I;U z95jO?s6|S*bLZECCX_au)tAAJUBD(}C`0;-ZDHC~r;OFiL6-;UMeT{zmAN^q?OG9< za>1GfIniw_EY(_)gkNkju%ql*-P=J zW8^1qqNa#d%Fj5*vAbn#B8W^Y6Hp)B?fINKNAt?DE;pn(QS3A0;>b(LBqIKT^qiBiHsNe?i$W(?!tcT*}?vchs>+PyK;Nk`YAIY)50pcI1&x^Ma#bFZHMb6@7 zy&T8sFshAQS&Red@#zeU(wb^KjBUfi#77{*!AG}+E&##Np#<7H^eO)0ybK`pM=R4z zBleF%&ASU38BDp|N)V)o%G z3o|D5dD1;w{?Ym&;>PSK%XHG0_PnXacMF#l{fY(QlI;|w9q})z5yNHbJ*^_~`uaN9 zbjCCNuhO1RGPIJiz##KtiMV8O+8mVDu0^5iwHI>1ofVPCBahwjVlk%j?qsqW#%UcA zRN3#@UmHO=MRwPUgPLSxSk=R^IZm-}q%JM+;O}9De3aypW3u(P%Q8aAr&vdb4x&t- z(Z??i3`{wOafmmsCcVFu*E|&tj9pO((jSUzc$jX(-VB-S%hN1i2E$7+i7Vp^QFO`s zooL75ofyVr2=1$paLp?UoWTf0+ry~5Lz%|n(5Ejk(wz-Me#Yln6ejo}|3hs=a<=37 z$q4ktS-ME)*xU}d5gHoiYv)I-ycDYMm8-!J7VU9X(*2oRp2&LEfvP01>7sC)Rlpl8 zo8+GgqL=rA?MBCAdwsVD<}J=ZyrDURlPcw+Ec+@GYkiGkzBUGp=9HJ%T7$K#+g}s4 z7VsTOZ-tMl!`^yv45I6)rMv31npMA!Db}F*|f}Ql1g1*lkq9m}V)ydg}}z*9o7rcg`+3IhO4eiXtPvIA>dDn9jz5 zam{-ScU{7&+z|HIyB(E)esynm94Q2$4d5unC+?qq;aQMpF1y3Y)NB0eo}P)q)B3ja z`cH-|VrFCIgnS(EhDii{Mt; z6M9iY=iS~D9gEk@S5ubUcdQ->xRz!LZ=<$g!K;!3xm&@h;(0fxiA_9$%`B}a#z|Qa zXYQwrR|Bc;7KC0iQ|^x$rIF}P5t{8iQRPpSHr4uUF`jphleaIOY19haZA+Xy z88(HZxw>;!G^x;zDb}Ii@@kF%ni0e#~8F+5TN|Z!rV*%e!3`t_c7xA-1=rFou=IIDMBv zwVlMMS=&R~)^_g=6SA-CUDK66X*xLvK*e{PWy(RF;Wto zqhxeo0wDB)f_!rMKtd6pkVt>As=(khrD#$T`G{M`=P}_Lt>{l2HDF%8^vqZtTw7v30k-7oI6*C%jLJk6t;-iPPLfYeqqd(vuacJTnb+o80!700nB$nrOK^k zm$-2fc;b2@9#k;;6wmG;ueP2J4bc?j#cK_dQhFpnoU5br5HGjv%DxTF%+_V@Cg%+e z1)1fa8rLmI>n$3;f0|R{@3#YuJ)V%kAnxyH>YNa#U&K~Myno33%Esg!ol#rfeWxad zwOY*MJ&1OG|9xLja*0~a(%Po=6bw!2wB}p;jk`!iaBT*ONz9`mE>YoJwxUIUKRVrT zyH&}`f5UxlU7BYX5hrGvoNz=TA zr>ofj)lj$c`Uy;WWL2@P^WFE1M>tPm>6>du-43}o%XMKD2d~a?JJJ5QDK03T-JpWw z4R95g!+E$;mYSp~AhO_*NGH^dpM`mrA357^yWHSS1!C=S;ixCrh!EV{@| zyM28Csu{$?9~l)y-Lt6ZaK+G<|Xn=7q;rMh~qvRUn(j2B6~e(-^v zN-_F`^*M^pmZu;)ZEvsqJCuFIHQp|);hgyI=rVeM{>wChCRqrhtc0honh8`=Q%_yW z)4|K#)dl&JII$_vyP&HyYR6HF9RVt+8gZe#&A{*@pmP#JW+lU~`{%r}o~R!QVIO{Ox15#QAkvz$#rDymLr};pdp_iH{2bMU zW_QEnhKrDS7_(w{_@+SPY>FeTHk9cY`QBy$$E{eLaLX>+yzat$?XnUirEtpZeIKb) zICOGyABvDCO$kZ5VFe{*wmh|ic30YDJJ6+iP`7e9v|*G7Vou6_iVqc!{Mr8_&tEElyv 
z61|^0x&t0R2|&zd&REPwpAI+|etV3-$K-xJA@qJX7xd`Y%-le5(6$v3gy75r8gF zBO7M;bo3kEM7cyWk43NvoIR9bXnQYFjohI}8ZxZY5~(FRt%^5*X&h@<(y`_|OmoVo znR~I!vkobkVF2L$LcQ!EJ>D9 zWMtQCi?O`+EqtT*`mVmO_xql6J=ZzUIoJO_*SXJq|L*5D)TN}N2mC$G0m5Arq9nQ@@qW@Qg8qyMBlVTOR*-QQQdk2y7$X8q9;24TnS`w*gXES zzWhk+LvFz*r}FFE?AM8URqO|Q3v2sO^yob!8%gStPianD!G}k_JR@?1PZi0w-wUMc zMnv)KCr&Knm!bEH;O zb4y8CnZu1eTw};~i$v3zh0>?*DReE8L`MI?a=-J_N6)fLI(QUfcm{rHEBizI zb_K}$>|8<(YuLt@BA^ z^K-*Ke3$(fn(??-;Lz^nVO7cqmOgBs83P+1lZKD98f_2MY*kvJiXMsz-KnE0=LrvA zEyp|F^8oVznjD=Tf>$xgt5V3tx@A(JK2`rXwoXpkbk`r@1N96zsFf}t4&{vqx4Kp@ z3D>QNl+D+jY{Q)e&22w0`;P7|4NKRh{lZla?j9q078^K0+n zc4I({y)I{?Ai{mY5PwL&0-7mcRZaUQt*;Qtu2(5Nt^o~bdM7&GVcM;r+}qaYHQ@V- zrTMM!4d&5ttD*M3ORPQSlB+8+!XyJ|_VE*H10Fx+i=0f@2c4rww!5%3_P)V8VxSEG z%ZRY46uU!2Oy-7UsYanw-~(tS4_$u}NN7a;Kmm0Qc-9t}kA%Mll^es=T#BdQI%o40 zk^m`1A;V^1kK0JC)#Dd$zY~_r@l3sZ?n93WyAX8*@48m!J?Gr8z7%(0av;>o@6au_ zU%$)($5)}p+JH1JKU zopyW!67{-mAyh`M@66p{qo$cUutska_gq8p;bD_JzYkexGU5%p46MoxV|GDi=!AqQHFXZrss&U4tB~+%beVSuX8;vJ|@|`UA2%CO`lC;|TsAexX|)t6)dp&A!^nJ9;53g+~2k%b6R`z=%@ie>H zUXq>X2^x>30d2PF(c9;LxbKSBpP_Uu&=O4na(p}Ewdc8id3s=|1mg*5k z%~e99M_5N(wNwB;=aM zD&8eK?h(e%#+}WFU z>bvpW1FVqySBDx4o zm%G^W{j=j$Pj6X+fjcRg*0^5D&emZItYMRO*m>9yH)q;mdoY=a(!FX9F}+ zU1fsIx6+;J5*OVp+r1xgXD%^@n0-`ICwL+~E~XeWGrnWZ3H70$DM2{ol6cd4M25NS zRSI!pHRr(fR8LHdO-Gr{lY=w~g4-IN?&zw#$=4P1McAjZMaD;7I&IK$5g-$-=~yEN zbbCBl**-Zs$-M#CQi~|@zp~m)v81~r<#c=TWhmuQ2bW_LwVL{lwWJo$lDFY z+!mU;OFL02pfr;p8-;7xVX-+lZh`Z z>*YaXQ@mXzgIhVrkKPJQIt_5qJGOi4V8dxjv7rgJ2&5eFSSr1H)5)gRd`ApDHaX5I zGd09?zGAm(vfaeO@78c;O_ajt!)IGd{ozND!b%^~3Aiv&r{UqTaP=$QLs-Go@OokQ z$)tWGY|D3jr|`&|DSR=Q_0BQf|KeF>57YWrGRGi50Dzr**1utjkkB^JkaD$hbF@Qa zf0C^@9^phO1HQ2vKH27{;!`Pp;puV!eh1J_lmp$C7tnt{u54hH=wqbyTv3~2?Y{p5 zcuKJW99uNI{jtd*9O9(6wYc4NIQCZ&JA+}^9lGYb3ENgd!K0kna_6GQ=!dl5))GnR z3n(0$A|AgiST`DP&8UlD-NjB>Ow7O(^9bse8K`RR@v7Nta`8FC$f+)B8$5zY+(bAl zP{#+yDjCUHQuO)0{w6PQUHN=eO&pIIk>C=rWcdYeUp~B`qN*;msCj&;I=eMgEAVi< zgiDJ2q5nj?HCLdkF$Dl{pX`dC`&G1;)hX8r#`-rq__^>|IXQgpcg;-BbC^JI2{p)h!xyy^ms&IvnA*Rn z-KG?}iqjlh+Q~0FY)))-7H6|p=gg9=Bho89 zF`cw}tNk%ScyZy+lN5Nv>!`DXVB!5(bWr8D#fj`&pYa!)KQNZO++KV+Wqd^T?B9N; z080GazmtCi;Gdns>D^!B-+hCj?y2hk_X58ZrsT_Cox$(A-%I!}-4MC3|3^XpUHE&h u{1WDobL)R*&hI9EkFj4S%E;{uxj*_DafZ6opwlPRqvwf9)(2u zno?te0ES-?M1U@_@4I!h0PvO-01j8e`w}onysSSK(`wa)n}UMIcxN@31bih8F1w2* z93$|^)Gik80i?%ZC`dBN60^Awu-$ihiY)?1(q-d@-jFbB z2b(2#J%mp)3e;+iu(YRR%OP%L&D1nBc%5P=vn0}% z+VxqzwDp_l@R|zRH}YfLr1Cy@uT$70tG4R)Qx#nX)vZG27^e&T#A}Ohi-F&n<&C8L5pS7(u z^2HXQOpkDv@381o*<{M)(T*^9y z2KH$}5L)^>oEJ}apAXl92%#@*j@!hFBgQ`8=)UITlB!XtjJO1jA1OQce)QB|41{*N zk~S3~1qVm0vL&x?Dbqu4>FX|dMh_tqD|T;PrcS#nnev3C3KBSy9>fBrcuM!&Q6SUl zMw|frOQRUxq4o>bgCz}KF^IfE-oAvyZIH#Hg_zlhf%!M1jY}6gk2DipS`I>+i##26 zrcE=HvMJWRNlr_C;setMd7|hOZSL3lR8`_T1aCdWOl{sU8I3Qom(>c7<09u^9zLc7 z?TiW*;rsGAxpAM#mdI^`3aC|)b-ui4br-kRcBI9KV!moJhlz=RD7CnT<8iuYD4 z|6rYjl-Aa?3tb|+ipkJB6)3Vldb)Yrsp(8?H1@06?8iU@-O)0KiFDFp)a;aL&zSwE zVi=RFVCnJ%#g-oZAX?Y@tEie3>u5vnvEoSGiv<7S;cJwQSk?2N8mwH(CbB#amhXNW z&|f?Sr#m|TQ}OhW+vARt>XV5tZHKZwQkxYhrQ)4-Pk`;)S4_>{nL{&QVfsfO>}ALT zoy}hYB-R~*fXF@&Qt;)9MA&) zP4V-7dKq?f03gEy092L(u3k9bLrtJ^yxYbIKdE%A!E11zvGXQ^4b1KETEOpGV!)>{ zA>l93YMm;Fdjc)5#vi)zrKdK5zzWLXR?m%&fRYieg_RDn2?btg2uc?w^@cHn48gW# z<8%jmySxNevCI9U*$$vmX5*}|m67pa`&dZNiB_V^m7|ejhM zt`SN%xQxwez)#rpj>Hu$4I{_Yb@p<>#bJs~LdhPmG=xR{%2Ye}-PT+^7w#j1yX3}a zHN|)6C)x=W@>d;SiYx^;c(aN22a5$=b`%(~i=deadh)o>H%sC4(+|+5%%r}UYhH-z z*=Gl8tmjN-{3Xs_{lRC(Tf$U_50nj0=86s*;L8`2SdEx4H9oeemhgl}F7rVI7Yc@l z!Y|i^;kB8V#tiHPYoA+iraNt_^Ic--TV#fv+gfBe$N4e!Sbb+~SM6d;>^B(mI^!q! 
zS+KcV%<9HD4<`?zda7hSxo4(mc5pAybz2SXl)F5K?#(MgRju1Lk|6y1XX&+)Rl4pr z2chk(9_IO^c3n#~FmnbHOlncE%RMc$Zo9ON>^Uhfu+g|6NB;CNy2$gd2T<846=@KM zae@UWSaME~GTBG6cGWWt9T=MA0L~YeB~osKVluU{9F-UH z_;g@*$-}n!mj_TeM}b>)e%WOlwrxIk#4w5XVmYL(GhjP-_PNBHhE0`-^8Ap?e!^_` z>C(*ZS+k^6M~0X~xx4~RR}U_@e#-Vt3x%QQ{alN-&C77WWp zMaat`FQIxltggX|#9HEZVgX+oet~BqN>CH!<(xZ5W8qG>pjw@RbS=D-Zj*#LvS{LQ zx0(+NpJn&Bu)dqNFFjelH}+ikmScUT)uv@=0~8?5t+EDwpB`!~mn;dHwIJ@8+vV)2 z2013Mxy`nar#odFqkF`Ex*W|}R`LuD^2{2uroZZZT%tzqJ$nIv_Fv&Ti42zT%t&42 z-Rp3ls$I=ToNT?j>F0^dMU@MUAEsO;w%4cKQ56KGt#_m*GA^AbwMV_6H;?uwYivPA znYBmnPlP&STg3?xvBPgr^0&=1Oc=L#dHN^6`rD{>&t#=OJlz@r|VdK7@^p$6KChV7;4 zkt~}Ma+L0u%8Ha%9HJ|o`#%RQS8`@n?ku03Jy;v>o1AV5s;t`+`a21V?&SFX7gc&d zM*!eB^=_nX3763~){(;^aAE4K;3N-4$0O8CMr8@=_YNRX%D`rJmx?#Q>oQ?+RaY$dpp%llNci~!u zkJVexYX~o&@aB@zsmg_VdR>u2OKn7jfF(|dzBXFj+_-QVKkswX!`(yIzPPuNMx&E) z3B!!OBy4J>RIv0sLMYw3B0^XTcSf`lHI2}p2w9@Z5Wr9 z{D}CM-QTll%j>N|wVOuGJ`tO7ABvsTd{qh#UceS*tYf&AT~*J=CB`cf*I?50RysFi zX`eE;d9M?o!-CJh_0J`f-PJ83IiIiHl0p089{Yp)>kidR)toEy-e9lb}k zD7q2ohsV6JjpWGl{yp(1wnXfFrsme2dvRar8@c&gD+kT+;Stt*F&mTx;89KD&vWL> zB9S0WSTNf4b-y+b7WDCJ)~Kc(uik2YVe)%=Q3FUWKdIcd{mo>+zya&xfVfVAUQ}P8 z<^^bGIRwksU-8;CP}9^x;r7J?$l&h-pgAJ*%iU5>1o&SceK`BO{o7TW8XThjzf=B= zu%tfz_RW9d{w%-0af;NU{2#^mPvD=V`3sR2F1 R;TwADB}`3CokN!Z`~&E6gN6VA literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/testmultiindex.ods new file mode 100644 index 0000000000000000000000000000000000000000..b7f03900e6617408e0296cef406140b1e408146d GIT binary patch literal 5575 zcmZ`-1z1#Fw+0DmkQSsvazKdzrKN{%hK3nR7-A4rphmep)N|8olK!$D* z#0%fO|NZX${(JX%_C9Ctv)^@|wfEZVJnyRo!onuN_|L+E5#}WR4shGKt9ZB7*})m= z;q3~wa&>ibu(9%RaDfSU!XN+_E4Tw3;Nl8}L0oKJIYVI{0Jy6=)CvN(heAEHZf#Ox z;QnQS1*17ExaTo828J3b2FBf&;2z#iP&mNL*=fgc6gDqL)*(={&s;N8;Kc0!{6N+c zVRka3dydH!8Q+|z{HD+vsoB{h9qFDa$5pdiB;I@Adb61kb28O`{3)J>m5QoO{u1y4 zf2F~mYhU$bEJbq*lS9>J$&yoydm@bM(N#Y=dkPi4P#r*xJEa_YCn04zsZN@{PH`zXW_ zZS0cCjsUB-KYpK@SMJ|2d@h@!`9fiRnuz}6VZ}`;jm$Suy&yC_D$=upbVAN#PivKh zyz82T0V~)9^QW_$`egW5HHIA1X$0*L(t$=iz=TEWgsRR=WJ=Wm4;C~91qt+9lBuxF5gKqWO1yBvLUE(gJ>-jtcS{XwkOQISWzTaEq_p@gR2w&A z9ACmLALh?UfUu*MA1*I7%%<8rQ>wG*nZ;<|>0EB>N+IqW+?{DpEu6D7Ul5H*)}X^1 zF}#ePTXxM==Jz^I4iQX+;$qs1PCD)dCQxgt(XWaOysUkZ%JHq9^L-cv!UmRG#xg^0 z+$&b0@SK68rs^wYIoz%WFMT?-C}@Mko=Fo-^odPG6|Ak-9I#iI>V-u(fbGWbrx&5BLBX*nC>V0SeEvVY*tc!erpmB8flS_NPVzu6qV- z#W1I4UsfI5Jc3Vpu$iKozw{UIcgq=7DSP0sz>|>TiqF1}-gL4+wBrd>Fy&G8L@^TrbP?J{8+)F5u{~HmeISs0fi!k%_+H$6GicL~giWTk)Qm z%@7N8eeeDG5FP|{w~S5$w2$Xng`M6yXl`AKtPt7I`JQ-Ld?xucbM?1~Ui7Da4#y|D z5SAxR(TF^YFD`VylA_&7wrL})hJqenZbuVC`L3=J!+29jIvuizoZ6MTtJ{R}=MNUf z|4t{|o;9%h#26U(qZk;3x9P;j1?F*=f9~>$839Ilt&=&ej7_HK}_lZi~go{An?_u3GQ(VThtJ)^x1g_U@ zVnZv-yv-Jgqwv=c8b$_>l=ftfl~xTGkypOLa5i7eT8tR-Fm>yK6MEwcNlJ_`AhN(UH9l!q(dMGP>84cC3 z`ZA)-A7)VC8c*pYmwule3GO>q$oxeKx#W3URVPMtUD2sNlT!z(c_;V^_VbfY+nb4@ zwZgW<_|WEVYRQ;nSZbVOp`iAj37WGaT+I<{2Vi!6*`blp8uLRzWJNe1g|Cg`4))hr(uI4gGBOL< zw$O`mvqr#DmMyw63G#!NKVrmmaXyKAm*>&SvHNAST|qHE2Ebs422FS@L_=j0Dq zQ?gov#$g15z?|p~8~+M=KCs>8x~%s3S0g%Gdk1+>VEr}JQ5RIZjs+jpebIvIVBe=? 
zlmRYg&-K2}b&b~N6ej_0mk#pQkkY#yuyX7P*vTfl5hOm07E}GReYi3IY0NN@sx!^h zLRw@0BFEQ7y@gy7yQmyieAqalbb_a{GsB;mw(M_nDU2iT|py0Mt2%-iFN4603?E$xEGw@Zp;=t9sL19yB_{V%^{S zXv37utd`}MC*9P-d(MGuRq?g}q;KL&dLkhH3O{*f7K6{sdT&ZBvk=(S17ilE>jIj7 z;XzX;qAUm9ZtD>C5{d+~q!i0`A#$jo*}kA|<_sUewpYD9X|2mNUf4D3loRWjkV?1a zx$fHKz{ z#&k378R93`X0Z^hp|W|)hxW-aMU)KJgcl#l|yroiWP`Zy^f3t9hg0}GJqikU^ zCXWl)8Afqt(~lh$gjJR%*e`e4cOy{0*$`e{ zqnPkhj(vh|C@u2Gvms|eg0am9kPOpO^27vBo6k)G;{sHC^$h@8*?QztWgLq?7fkBi$iW!+m5+Av(;R(1}SfDGSxPAs@q;5RLv^(O_P?l7P=IOq`IQz znWC15VeX{ucKHD}=VO5l+jyNT01E%7W?C}j5AAhh6RV|hAxM>J{qD4IfK0}h?(^;L zQ_NOZe_^xIFbUO=)BgQGH_PV}xOf%Np_uxWRv|zkE*&0Tnq?uekWjFTa6RUn2X%)9 zz3qMbo)Jf^5cN&Vc4DqL(bRh=bLZwQBIT{OE+YS$1S1&_&nR%{N$XWn=*3Zy=V!}a zkXTp(VKFIqBBcXDC^O>K+VZppe?4v(DHc4>q&2#a!_D0D+xQ66j}@)m^D4nAx5y7# z_vr|m8$OPgf#{89RdvF%h&D7fO#%qiJOb(m)V{YNRyecfI%VnTp-J*==}O4-q9g?F zI#c?6qoXh{i1!)O^}$x9a8_FN=c)yh8QD58u@CSigkd*4ni#~WCq(efk31g^NRP~+ zl%PmA9~N3s$W5-_V1Ia@@0jdr?eVbCp;-v{2OPM)7_eOjAP9W#(RHL&eAkru+3Y~( zn5_+=Bi7!Wx^tG^#d!`P!PoK~cQfj58_8ak)HgC)Pf=-mxy_l?45FtcYiYTbVSWRx z9(}?IPcL>58u}X=(%WP=3Kl!aCKHP3iup zag1A^HQ4yKU{Xck{DY_VIBFblIl=F!O5oVM|)GJqVK|YK2UnI>Imz4 z=ZELdvZn19+1u^J%C~e*PR|~t@ko$E`%<)|riK`Hb476Vr4VnAzX1z`)6;n?#N&*) zJX0TZj`t-1X(_c8*yK!zaNP8h81t{q1@1SBi0)iet-4iS8K}OCt4lEGCFY3nt$kX| z$f8g4+HiJ$&-61u2OA%qz2`tchkkqEDC$9fIHg5miR*}zXDpyA>3~Hq`H5KQtZ}MY zFbwPx;?2*jwDN_b_Q(=+Xm|+C9V$33%3H|dR!{B`RQlEuNs4x%rY7UNnKJ~?oW}G zPVt7K4LXFGba^9gZS_Exl9Z?J{5E3Hr{=eY5+zlOdrF47>; zafFumt|?deIOtzj7+!GWM~jSFzV_u++FVj@MsCLwG)hEfgJR)KGHQLx~XhAmfM>o*h%DZrgQU64>&4aCrpoIghvx_4N6w+XXHs$w&PX z8U{wG`Sc6G@7*R?W*5_vYR0$npyI}{m5v_^5V`JSm+N)!uEZ?U&ogkkcic)Z_FX{iB;$P#A1qtIjwh0Z#c(Movm>+JTeiP`cEKOMF^s0_S*k z%JiBG1*sU3=zW+MpY{tz{pABLKJrBoczITVXoeQ~iSm@T74{QW7E(pu+G0G`G|Xy{ zXj1TgiW54n`**xab|?JNy83L7jPpUsL^XX&S1%_ekt60hpGxGm9k1LGx<_Fr(4h13 zj(FfncsxQK><|CJ{J;28#1QG>%q>r%yoZ57b$eG=(39m?(NGj{wt_j>Lg9Zgr;o9+ zj#y8K6wbqzy1b-45xkt)J3hf@7~RvtxF(`(>H`Z>8dmY1+RAy)RjBvdjG;A!?5AuLjAggaBRVMgXMl4n_$`L>0{ZK;yAGZ zA0)MfEigx3SFeb&$xE@mYmA{co__D-gSPk@Y*+hzQ(EQ)$ClEKP($vg5M|%1g-Tk1 zTSxz$I@YO))a>pX=vjbJS*C*RaidHIV6Fy z5LAS^QG-u#F(qFyNefr)H(*y~KKf$GCiQ}Hmdh3^$Aq7*4W)t2&`f2AC!x9rurwqH zls|?@8giEXfojeuRmJoFqFM{1aDUh~m?cflflr&NgWK({I39m&TQ9k|@_k#RGdc1T zx|Opmst)=kf{Lotq$Fnn$q5w;#u3OrUNq!yC#;HDo_vQ_;hw%2l>;Lu@_wL5qS))U zExm!VQRD>!*Ry8$swzK#Y564Cqj1hNja542%p0D}9%nfA9qxZN=9g8p`zRk;mFpC> zMUvt)=4ft^2@3@L3dG9LwugRAHE1q9|G1hF@3I|N>uDFm64X!N<17nxDBi*apP;X| zek^Pa=Tf1|Us#+J%~==}hx-3T$G;JJw~v2IkpIN}Q(gEQS8yv~{tvC; xpTK_}v%i5;5B{G+_fHf549~w!ERo+zvHumUS|A+UyEizu7xQf}$5Gz({sVC~Ab literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/testskiprows.ods new file mode 100644 index 0000000000000000000000000000000000000000..443602a2c3f988d746d8dbdd2dd19a5e5784a243 GIT binary patch literal 3235 zcmZ`+2UJsA77ZP#0-0JaO3StZ@Omt?JGtQs$)?M$Gwa;DaytB{Q4+dpmWCQ%u!~xM5oj5RU@nf*k zhA-L|i6{6Y;r{*@v=e4t1hBQsB&oWuh09M)4%Z-2) z^I;cwpQZB09zzQs4YhmMHZuaqXIYGH&TonGIa6JOo5Q;D$hs{mUbT%{?qPO`Dq(qf z%6I~KzOrk~QF7N|4ZK5QS10u-8_)2m;b`YQa3NUxGR2GHGmavLg)e|ybe*+5>xaZ9 z59w~gA?61>h9VpBygGwE^u~gzr{i(lZL9dw!1B|u( zx?&n`7kzc?1@U#Pclne8J#j3T#jG7T{mod1TrL!-639*2QqfF6HqCh-rO{Z8Tt>4f zAwWo74D)mE!-`Kg2z*uYcp`4bKH=6!r+Qv=#vvK zJLD80U)DSslQ?y)r=II-A}5GzkE!mx((y#!b(@V>*Pv#z^=^wf+1oK2E3nZy*EiLA ztX`sZ?-%Etna~+TGsks_Gm7l9UB$<%qs#(bUZkV^@G8h!z0``lKupc^wWoAEsl3>bR6 z8YbWGc37a^$Eu}+-Mfu(4S+9AQ&!`uPb}u8=o;98*+AnlfqhP+X&0S@pTEscA`$kQ zaueL8nD-kk^~4+{}iB3z_o*?>O@`$sRNUA006O{+r@a zthoMoMgZWeAOJvPIl#*gi~ms*XdK^mvh-WgJl-X^apM%e3`vztWSWV;)EcG`>fQL3 z$=f{wg-st?!6uB(utCMVV#G 
zcD=qQu0nfII2y$pAF;k)*oHjmig~rTE~+~lHZD^8&W0Q0ibUYi-6H)Qmk+_mRhp0zSD@`k6TT_=WH}TQC-#0F-ZgHFZOMi`{W&)5RlyvfEn18X!3%LhmkC}nct1`Bc zl7#`?FrPp*xzqMV^gHFxiPf?0HKipds>-zCm5V~WZ`wERI+~Y~Op7g~3T445^N-eA ztSGH$@?N-Ns+}Fyo=(JaGT*k#S;sp>LNy zb+?qw+yz#RZs7XEGS6hWlC)N2$81R;>wc4bl9a!*%T`2uL~*pT7`OBE9j5%h^>x=KM`0mrp-)t??22^(?zm zy(`nNgF$Mph{in+?xrrmjBrA+Tv|J83V@!1~-C3w^py+luR6U4JcaGeVbWW=QN2KY@&OLvV zy|xgi&*<&Gg|=xTIWvA>EYCO`w1fq1mK}_N-2KTW-`D;Awq_BBxIez8odD?p0Djur z(!}0K+1$!h#TSl6qma0t)vG$$$cI6d!-Nt&-y32O+^8s%za1L!8Sr>Yo!M1W((>u7 zz7;$**ybEb&s<<9oNz-sv)W1}Tw%(p27cHk2|; z_5YC)<4PJKF{^Z#wb9+?9$_u5Nzu@Ta<`0P4erf)@S8LSiTGZ+gr~b^hDi*;o$7MOH56hS z(bM_XTyc*;qJ$TGCIFKglx%V%5RdjMi#XJW_q{^x`axXd=LRE!M$?wZLOZyh*GoX+ zfU<%gmY$TDQY`kY8c#JH>Sk~s1N%FZJWy%vY6^0t;#bv7N16=k7V)G``I@wj#`2gw zmBm#7`Lk+3rKd7|2+ohp>j<;+*@D2@h08|;Pkc_f0HvQWjSPY0Hx$aMFfgSFw$TjB zH6@SXszI?5D)Ya)CLO)^mM(uwoZnsio?^dTd_G3|&G=Ws!JtgENC5y$wC4|6Li+v? H6M%mJp8}3h literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/times_1900.ods new file mode 100644 index 0000000000000000000000000000000000000000..79e031c721ea34fc6097556d7a8d427f72f236aa GIT binary patch literal 3181 zcmZ`+2UHVX8Vx0ibYqaB2m%610I3Es6d_8D^w1On2Bd`&AcjtmAVmb}Rl!2cvrK zpmrAr7qlnd4UKYhb9S&rc{;daB)u_rC0tQh2dspv8ya)h)z-@ejq#Mgx_O{ccd_@- zXiq(AOfcZsuL$P>nw}JnLOKB8A`<{`^d_t)-WiRRxbNcp#-s}~ewBTav*wV$bjctQ zT>VH;eLWkVbeqJd5*OcnCIYN+$@}{*Z{v=}V?|TF-rVC!WkaK9BpdSD zl8SnJPuGz>U588-3r^w-JhH#jSg`ioX|V(*-D0_Fp*RS4ueJ0}IzTKJ=bk%vjewBq)-ZI_2hZm4j&NRLGWm1Gt2iVpt1w2;?~4a-;2dL^f0)+U7#(E zALJ45tHJ>0F%NrMs#0J*t<~Xyj(?kz*_E!{b@dgtQxU^Lk?_buexkv)l#=bMTPVuC z?!AtK!l7H?@(l8?u`?&fi@M7)uYYTQ5%T5+lRbaxPaL52;gcO<71~g#t#grn?3ZOl z{a!=-iltck7fV-GIfv);O>rM4?pmA`UV3bTNM_fZho0q0tfw31SB{;Lvserv7Q7M< zs|#U$Xr~Yi4i@K4H@;>*)S3y3C*>=lJ`Xy`X))4sGRk*4LVAMa3eyWN91D_{%xacN zeJy@q)!9D)`FLeO#^_V-nykWE6m$5#{swEf^++i2!FChz@Igy~WAcv$y$op9lnEl{ z&RPxk%I8DFSlK9>x1F+&5%aUmFzu=>{OBp1+G-+GAOCau9MLqPylU6!>adk~S;wXa z7N#TlkZHJ+Ey;(#BS;vh@OMh4k6}YPZ{u-+fiu#D{zk7*6gIvkni8XuYzX_TvGRHJaBzVk72k)yJ}*xWKC|Px8j=gs6R`SO>=Yq&-&{vIqEhW00B3>oIvg} zVL|)CS&_!GJ!DpU;zi4P4&UrTW5yUOB=ACpS?8yNFt60K^4EPpm6Q}ev6snRhjDgX zvzOYvlww_&B)TnsWLU;}!N~1~iz1!yEH!ieV!nKbav8DtZWL@jL%aSk>Cm(yz zah@|Fy=^H?VaUpUCv|!c*>3tpV6m2h9D0eveZQfb_JzvY!$Ug2|8m+aoIH-A2LO0Z z0|3+q0Jg3e&m$3_l6t~)vUpCOy|H8vr&5apEwb6r`1a+`2^+tJ`2*sh>_w+;R?&r3 zZ$J^yS-^PVSU$)SP}aV^-E+lOaaD3d&MH0GDB+$wtGB;>bE+lS82bSl*myuR>AmhS z29A#;x}-B1_9m^pS1y8^nJ5T`cGRcJ@hoKbZPo+)9|@$cD`?=#&lB>sVu{1dM67`M za8zW6dh$-yTF?*-_ig6g-3Gf2pLIt;MN-iYx^b~#)6?txGQJos$aMKuQj*R39PX|D zMZj*|%R1W99LFjAkl#4;W|74lesOyg7FHZ{oGr&;Ww)X?CRgBfCn@7&gHfz+luVrQ z0*cf3x%0q~ukfOesh;k}roPCtp=%~VOL3d(3dk@~n#3kEW^l^pHMc6kP;G08OAfty z%^t;7POH0KTzhQsOWb7lfGqM@7h}YnCx3{>?X9l%sOsCkAq@o)`rDTgisFtM7STRK z$hd=nN44oF!@#xx&7zFPO1)kvpS}4dUEmj+4)9ppw}gbsd7t>pwKXY+lenf^1$(i^ zO_bNAJi^m*Vwpg(`^YOXQmX?&D#{PdPw>K5BJ$uDJMKt-XIV45DD=$Q?Ht{^qP^Nn zJMQ!J5{Ul$9MMn7^3iycz(W|%DjxmB$UjSpfa4BQ3@~L159_o8#5E#VCbYwKtiyPxy-@W znOv7ER=V9Wwaxf}65>wsgQ>!hOiaYL;CmPbCpmDH|L*yz9q|D+*Mq%GrCh^;_v@A+ zx!0eNq8v>CWpQpC(FV#k9Lb0L^oH1{*`Z4QlL4X|T>VL>%Zn=MC8832wOAjfu&ahC zBvy`B=i52C-Vaj#WQt}(!sOYUdl)|q&Kq&6V+&TE5|6LZ)La9H<5fgX?YDiKXW1Y? 
z_P;|tD#X~DzbEg}Y^jZWJ>XR1{{VMCEh2N7K?58Ib>Hejv9A$iBm23z)P*!OmrZVI1tx*k77d7^CV)C&di^8amZ{U)j4tOgNL`L-+z{8J0a}Aup)gF`4&UQ?9y7>`*<#g6sB#I&(@wV|Z}3O}NC%36$0`%)?E)X0nv<)k{)TvT25uoFmU(YIBxG-T=Op{fKCL$mx4oX7_YHeGb!_p+m;Cbhw<6 z8#f|4R{DinGr?I}$KxS3xv0wQ23BW#cZ7171g(>d0L{_bEhK_vt@@wN-h~ z7Anbp`;eaBD^Ei!{>#o%e+A&L#`);(@Ae-Xt*3p2`v2DWH^PWI{B4x~#Qj-*f8#`{ zMftyq@t?py+437$PR*FV<<6fW{>-x9AsS9lHw^#IIX!LQv7;Fv_28%Gf zNR@np@7?8l-uw1hd!2puKHq=V+UNhi|J!=nc=*JCe>Zjj!u?jf2=?Ks6JooEiwD#< z&l_;VCU-s^Az>>bQFQv!Cl}YFfXX5Bh10i1M2B30{8NP+Bw4SL!rKU*qGD+ zf}auC0h+$MG&Oht06z%;aP=j)Z=gFAF7m*`z29Wmb6$>efwt|Et$x!WgSz!GC-`%T zdgeV8tICtK5yp7xI|BYEN6aq{?=1Wg;MWve9GDlFPrtP{fufz2oQM{jEpjz2Nz^Ri z%e)u)DEKmmRnJK%gPMffJkmN#wJ_j20un^a&d+Vj?`1e&H zT1!f8Bif5|O6H8!8`B*^?StH+sV&beLq{a*T^R*?4kZI`rO`9RZNmKO}?X%EYl!H48M$#zA2icQyX;OP1;=T`Kt@|gFS>m(0i zFHVL~rW?LYHD5c@Q1Hi(!P3Zn4H9RzoQnWJUu^0jj|whAe2*m|m{LrNFSuJM_?bA_ z_(uKq4(;>@eY1e~7)MJcp3P(vNEW5$sw@*jMkn4hn=*1)+HxZ@y1Gm_rX!L(*6CI_ zb+|Bdp0R?(R9^uo4OOLNw?FA3twoGaODr?wDl{4|CQhAEU5OCoiiBiD3_-rejx%#^ z<91)LYH;1ML4Tk49^au`hcVt~Gm)jR)ogSZEZ+;s6FPiu_`F#^_c9lz-O16>Zb{{rCf(kl<8Z{Xo{u36!I z!8F=M?i|f;)k*WPq{f&y#l{rKoo_z0ejekOliS$+4ycly9W3}~*6`&MC%P4ZK|dv= z2Z_jt)oH#J(oeB#&~SryNWECiLjM_Sl}n?f;OY;neQ_yRhkrw+rwCQD4<0Rx3F;-f zKAW((zq?_Hl{@g|HX&^RvN$Zr?woYZ@o`@7O6H-l!1s1gGuhPZGeX zxXC;v9j0h}FtBqxKP~UFfMk;`d~V=3Ds#k+#x}Zh z<&1D(U)e?CdAyu_OS4|DEUUAefG*HN+>zL0=X^+D4z8$`WrZ2JWC>a>Hw=CwV0&z1 z_BeA%T95@O_+Y6yMSNo-EQ}ePAC<)6H#%7YT4SR>k0!4Hf%L>=lf$VYXI7af%|{%p zqD{Wc$%Q52NkB;VIm_eOnEAZe+nZ-Qp7zQ_Px<<^znP!M=I&GF800Cd5OEqZF%vs| z?cZJ{vfzhw3T6hqOxg5Z&`T=D$^Dg%t?;r|T=D$0^NRwdia_2i|6Wxi_SbR*fHcZe3G2b9lgVZXfsX4Dl}i+h zs&BeIp#-o}QRc8`&G^=jGlZXC>(_}b>SOZmSkPXm*A-Lfp@4(o*Q66hnJt@eWl)M! zz;~FclkyeS>hKOa;xUCp$J14ANS|Lz^8WXo^UYBaWqc-kkOo)cxfp371wshJ>rx=% zyiAGY;@%n`yc?(1oZP7J+G#l@g^yEn{Wf-@anQQ}Ea_ziVtT^0zgphUzEEDR0C8Tf z=h%wx=g(^^Xi%;>CK`KsM9NgeCl)vT=p;?H#}m0ub@7l%EG#9J@N}5BA1CHP;YZnQ z95o$u;&Q)@A~)$frN;+Tq;#NET##Vsd72p4>4WqbNn5kj2kPjzZwf%sAC-s?d?Cg) zn$)RMKQ1Yr^m*Wqy}xOS6f-X$URx~w-ixvy&{xj3+Nx=5zgS|qSh5LOz#sMykS^5DifeJ5LuUDEy~v)ugDp z;)#>0e?u(xK2Y{=667h^4T?Ahpr@q>EM+)#UtyFWcB%gQcZ%gTZ+r;}43*8SfylzE z){e%y?jxvObq=?V22ba|R?(2?MOhI&w?-b>1wWdjEs|nSm?xgn>~Edkf%2G!Uzd-F z*yU=QOR*=@Rt0^7FWp;MWtRUlQOzdT6h)6|UQ>`tc{*dVGzfHvP@SGWP<6WnOnH=~ zs4r!MGZIj@BEtz*;)riaW;CBpbPw9LnT&95nAuQPQRCjaeIbb|>WRGb@Dx)^C;E?D zab|=TGsFP^g0Wli)89dT?XE;S(LncZ9~<3^-0!RmpQ$%FgDd>g@M8Z2 z;P2M@YW7$A%V_IqU!nfLdH#hk!VZ5~>ECg`7wBI&K5W_kuX6o6@OQHO0#0EQ<{zo^ kdx+nY>{p0oBJ7dkKPjiD4J5c)0b(yUY)YD6`3S(j0PgsQ=>Px# literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/writertable.odt new file mode 100644 index 0000000000000000000000000000000000000000..113bd651e8cd0c4018c624b75096ab806e793d59 GIT binary patch literal 10313 zcmd6NWmsHGvo7wIpur`$2G>Ba;1FB`!{9o&TW}}1ySuxF5G28aySuv|*x$`b_CCqJ z_x!sx&swWT>Rqd9y5FkqYB@nI{%#F?UZJj{+ z01(K`7znU6wzOchx6os-1lSncFj#{0E%YpbcINsPwhXrVj<#}t1Cziiiw}hc1A97f z9>Eli?96p70LErEjJ7|T3?K``KslMW$OyOyk3T^c7Za9$ynX=#0|$eHd~7XvUs*q1 z+U2AaMJOpLsi~=1Sy{QbxR8*<5D?TbF_m#}6v4oBz`%@gag9kxKte)7Qc_Y13JMw; z8oIi=rlzJgHa4!Vu3%uE2nast=z;k7p`@e{)YQ=|ED1b3DFOm%!onGnk~wm6d5Ve! 
z8XCpAy1u%)LAtsh&CO#$pg@|DJeTUyR@`4C#SZm zs;afM)!)A}DCqN}eEs@$baZrjdU|DLWp{V?uFsJ}=;Wvto z(|glf7Bid}Fhk?1ZgfGUd#nY|I``>7qO*R2;z7~#4>h6^Xs5Dd=mFe;BL^m4W@uR9 z3{Lcn2iF|W4%)#Wk(mB@x_sUnc5`@H9up3mvWfK>&Ub{gnMIJjc?&A9GAd=som?-{ z&XnySwu8!UNoC8e%0wj*w^1Cg%|d$E;$;Zmy6awa-E&d+RceS`g=9 z7dD|$Xi9S|q$j9=u0wBB&DS7aR7jRex(5t3ZDyTx*u^jFcpe?3IUQ~JPTvjdzq+4Z z?_y;9zCRJ&T|ZCcFf(`FxmTXVy|-`QHb+?3G&kwY>BtWBuxXhgs9o6!&1-X}Sh*eC zs^(vqpPO?_=HA=$-?IwnbP5>74>>vC25}OEIcyAWEY+lat>+`K9N1X7;4%m)5rVl4AvBnboM+hYhan`UE+&s5 zTS6w(|81y0d5Spule8kckH>27VOG9#gp|0NDjuB3|YgIGWx!& z2_cm<2aFsAxv)yu284>^1KeGv9W+htg<0tsywq*n{Yf28xX?QX+Wt0{9a!dH=dF5 zqVv6`?fiuKCCY`;oe_mr<#1Q2FAC9a0oz|eZ0CQG#*dK6C4KT0h^5P zPSbfVL{Wam?ymfNJmPd{Vd)*N{R);bZWz!~9ye9$0$H0RYi=`EF-)+{+;iU?JzUd&)Gi|LL} z)aLcMqQ|5y5->TjRk2l*93ikd@jJjSJc{IZsTFb)&B`^UZe5v1FM;qe2q6a_hFd}( za7E{Xh8oeLPI?(vSh^?X?|s(Zt8HbmmN3o9R}I&OB6j2quu;>ojuwDyRDsree~^OCTyRaiTy*fMQ%=HRk4xF>ASo0w+DT97Ux~2SRyamM&1<8IY7t!_n=y!p&30*|HeLQ2hVbi~BORQ62MK)8o(&%89o+%)h& zUmKXCQ6=JVXE@$hYuEE`enNE%BEQz@^EK`Q;3Vf&Ry+!E29|-F$QiE>$Nt>jQg|wJwHU=EuOfMk~f&3 z$TOKyqnY!N3K~Uy9b83!9$cgWx0AV%0f)kkWU+7bXF7S#o#F#K_*HGpM3VDao)+|yj^gH;Fhn)avDSMxobi1 za=vf-y<&VOKx~8&D^0(1ecv8+;)9$X-^+~^ zGOOkU!?fdnSmXsa0P)@jV%L{5UH5#tcx|j4*jiLyHb=zug@X!HXu?asNl^k$%TljY z;UwMXeTYI2w(|3j=p>pR7j%~JX8er-1IY0r^igiU%oMHWG>OA0;Go#_>fDddUbO4B z2__rG5}_sc+IyI#Pmr;C_dm-}!*o=&8W&t%jfZmwMsC@rTrswBbJ%Pk%k*eGXVjyV znWJzc-odO`5INJurzbBLjBFouFC-jlGrgLliR(0JC4E-K9jknu6!(S*$N-P!Yb_&a zm+NT9{z3~)C!H&dR6^Hi80skAtdIAAFu@AR`uk6UNG=lXR3ZtU=p|peB^?6n*BfC@?zKg5A zS_NfuucI>MNT=%<;pN6$!?>iWlnW~?p+FM>OKZ8HpL;L1;RruqWrMMtN4)mp|-!nPJ zFh3{obg+OmbmUuFA)Cx)UD@JbIvd2@Nl(sH-ubZv8&R72TGT%2Mw)R^^X1ev9oi0bnwEW#-u8e(o-zdnF}Offn9FTz zW-EGewuFhHEjPfGyXnJDjOQmQeI}0ZJK>lvip=Y=vVl_KxCpViGjCbLn9i*K5l;A4b$AGqNnNZ0D4G zda5-eJl?#<*B`;9-B^Low1Ael8wkOK1!39{NYRgsXoY>FCp z@1&NZpv^6tVzZr0xszNn+E-xn-e4oS-h=6aF>Lu>7N@w!;PAHyg@4402Z@Wb+eJH1bNl5Z++>y ze0@{qR{;;4jE6@?l$PJ{GI8%MzGHp)oSRHqNN83Mdw-WKG4M%0y_}|*0pYD$Fj@H} zBoDYmiYL8-nyQ{ohBo+6Tkr(KDi$XyAZfhebqk$ zbheO6+Vuij6OZnGv>4=;bI5(*JVUQ~x6VnUxD=&q%2S#2?9A^bGSgJ$t}MDB)>VNo z%UdWr4xN2x zeatM=^BgDfBS%yjbN#AXg_xt%I^yTiXx>nWo}y$A*S3&Q@bcN8or*aMLQw4k=a5-j z?;E`+mK^wBuj?MSN3U1b2QLS!u>wR$72cceXKJ_I3R`3mEj(52FR?Zn*%71p(iz)4lv-tU%e}2_l93^M zT54s@nyY4HdMSIH)-Syauo& zK0{3cH;SZJ%4z__%q(eTuXHwv3C$!Y?g>a)b3VOH4A(KArj=d~M66BCB^1a4TMiXYY49 zoj-_3@R@s+1y4pT>Er$|pj3N(^!ZMd@*pz}19(wAWX0-5x*lWAfD%fVSS>0*GF1+N z1?!Y^p}Aw^>>L}8A5`J%XpL&1HmK*3k@MYSkESt?l0c3*Ped|nmBMS}LIAT4-}Qry zMa_mAWXrQsF0=QG>S$gY&>QKUjHU1d^bfNu2g3oWY5;p0j*HS&}5?>rUV)rvF zU$U=@l-+{pI6!j5pi*7oI94wOf+>lvHt}LM8JpltVsoVSMO9UmmH`z4iBGt^<^2JYqslUOY7700f9NtioPi9z0rb~zo;-YTYf z$aF=lf{;GCk^Wku2c#1(W2hU2QN`3SGOw<$TSR-R0S4N}#0@TqfzAPmb^^`rw)4hI z5Nk$PLbavT5VGa-L59*k^5laom0?m^TsA=IT?OD}2;gBK&GU`Q~NZe85BlHI{ z8o}=O_-r4|Vgg>-Wuds)Ms}hh6nN(Pwg#`4(tzOH?@V2&>Wa%rC_dy;L-lZ*Ao?wG zl})fSqxNZij8DNsLqrh(OBZM#qzt0u6z5V5fuKy#(q!0Bk;bjFDS*mP!gnXT_26F; z>U*WV*vP9$m)A@@8RE=L7*#X7W&U;zdTQ0#H+=*FOTM?|TvgKd%4^n}04Ct2QNhP4 z#*}!fh65=SS#IOon2Jc827p2E0VWgrpftYB=A|2|Q;d9!sQ1`sSXnt_5eS>Y-tv{B z`vD`|)$>KrVY<7%=(QF#lbet) z68lV5@YLR?K23)hu(hQH0?fITdK7OyF^qHKH(jZOrwuf$a=K=jR|qFJMCVOiyOper zC2wQUV}^qV7u?e`6zbzRBykHvTTFSh)@f8kshvDb*WQGih62)(MuOgRh`UiF+&qV> z9grUqiV%#FyN!f$v)@O~ojW#o&Azu=0QedqTo2iQ&r6sa4oY+!xR(7+7x~=TY9aMt4JXjQ_$-QnH%-nOm6cwt9p*Z+ME&dK=LS$H~e^|bC8Do z7fAGr7)Wl^wg*|>`*KCud|wsYkZM&uwdszFV)*la4R9h}FayFnq+-6s9TpM?Av={hbp1XXk zy1bCVOC^9i$70$doDN$ks@y9gu%j25Ud}{!96${;4N~{b>)PV#aj4D5n%`)`KvJ6> z43q#q=kG2;i4kREZq<(B0Rkx|A=zWB42?bHvRLfjYPNIj^2Vu;jV6XrsvFwimb+7+ zLcaedd!l;}`*_$Ocp?yE_R 
z!%j5eG@>kSo*`!oeI!!COn;qmMe&xyCkNm=UP?NN;d`}e}qXoIf|3S4zXf}MR4N+dVdy-}Cuvl)dF8z0f^RGNZwp2< z)p}F&9O-Y)KefFqm)&8cJz88Y90upzl)>!Q$We;VVGgoJC%8!FsN;welHRC5^4gB0 z+R!4i9!PFhTe}pqzE&+mX%l9vL#Bo9m*7wKhPhsvlJ@j&3l$tkS}ccAHm+#idzblQ z{%Ui~SpJw{MEhz`nG_lTqiWDKWv5=!}r1s;cH3>@}x{MaW|Qgl8Qo&yh1@!Ks$l|I1J*7 z)%AH-o81UqGHja9h;xwg+yO-1jDnAJYa$_kgQYXreU6Y z7k)wP0E}F~=_#p_3*X(Ox$Mkm4e-3_&9wHCrV<2xHzR%O4^!(0VOgEWw|F~(?8?@b zL-agxXrMWoozm|_B5D=7&yHSh+Z@=HjnhWbB?5c_XZqCH3OS>q-*l^!gx#%p>vXX2 z-ztbVeBEOh5maE=cir#hJ9?Lzk@Plz(CzXT^{HEMsH8P4frEiDK>b^{Kzd4o^=)m9 zEe!ukgIm-WY^FKU9#cosP8SKmC`~}5EfO_7Jv>rwzvhBZizkZO&c{`>`su8#aV* zD8Eoi3M2xNzYLLG2fUAKj9O?l(UY)`FfQeZBv0p|*w#U?zER(}8YRf1YdfZ@*VJN* z{gRzxVAJxxO|8w%+hmy6ylzI?!QsZp$9e#}Oeu!{P7{;#HFYK9YOqzGIwhrH|EqyKJoE8@XjP3ZzFls1 zQr;pP=bEKg=YA(XO{LosU$wh0{~|sm4zuBtz;t8UD`Xs2g4~ydh-i z*>c>MX(Hf0fuR3%^T4XNqhY}`l|`s|XM@sQOwN*nz4Lr2LoE{B-brvZ$+uDAFLs0Ah41&7ai5}EcZ=7D|R`|O2kJHDOD=>^XE+L5&A zj$cx@1lM!plxk1s43c^m>g`ctnzIa%zy%rGT+PKQuwf|Uo&M$y@e8$Gs z7C~*0FZdvCMWFDU1?bC74(SL=jIy-(67N(%eswKwA16}e`$=kleM^XXi-Zf^w~P-; zKGSM{1~Scg(&3n&CYHfz)%1#?|N7J_jc>jM%J7{}QW;7#hN3p$`G;QnqW+FgZV2H9 zbgEF27_W8>iiGmvM-$|l&G z^smx}`s+D?r&+B-dt=$~#csdK52(kE$qGg!Di6WU;|<)SL=x&f9rN`esDB@uloQnceXsRprlEv=l!VcSei2e`41|J$$zq!bgtAyx6B;v<0tP#@mizBSuI_ zir(+=g6FC?3{sQJw6(cvqy4G2XAm&;U`~qN9ig32j?}`><97~cHtEGhgpJGn#U_2hz>cy&Lq)Sy%)R2t+qL99lVV zGs4t`0hNGe>-tb^m?{AWBKU6APl`I|n0~A(jl_VaQ05YX1hfYdHn$7!g_GEju>%G9 z2~PS|$)>9&ZikO?k=m;TZ2TPR<^EAv;B$bYW;z&7;UZ+A+#oh8QK7c6xH}Ar;&8Z$ZyvKNIplUAak5Qx2 z`M$!RDwtN1TU-PYDhbcy8y81rNIWzTxgKG67)+&6mU;Y@f4sGNxx@$q2IfxvBhdUP zqW>tEJY8`g4%Yq9Hq4A5OOPGtv1V?^`1gAGuX;qk(K9v&80yo zllB1kJ;9av9g3_#YVtlYqna&R?k0wYoEh%v$y!>XM1ms|@0>K6Mh6QTF6~%fdJZJB z!KzFu$HWs}*=Ev?F-|Knq%YNBaPWkog2rd7uDDzS%kTy&?`(I{Bb+XX`GV)T3(_kQ z0JZWh{>$Hq|XY%SC(uNIA&IcgnuDxGu zDc{%hUT}>ua^43z+l_V|Es#a)Bs1^JZa48bJ#5@F3_iSAE~~Z|HWL`gGQ5u1s6HlA zdRz9CqE%lQjX*!9Xx@+eJN~>*iTUV0X%WRY^x`t2jQ{P&$27hqO46j03C-_<%Xhd| z)uf8%lMI$uDlwlYO*nxUEAu$j=ejbp$jKZW-9wAi2UlFtuA_A`dtU~eaPphQ13Jj> zt<00ryEnHn9+fKq6;9t=KY?R_UIA+7D81UQoHM3WL_R*0xcW6#`7t-QK%~dc8t2wl zKiK(h`PW>hnu>_L`_i<+9V7F3n#O1;x)Q1{^?ptLpXPl(r3>qEDE^;W;6Jr~`%`8g z+2Yqp!Vk?q@%&Fskw^OeCmsIE`hR!#PdzZOC*J=lZok3#A5q}XU{Cz|QyPB*_G?u5 zGtN_7_$kM~!TEcX_%qTI|Nay@;{WNUpHbt_D1Wz0_YWvPqR5|d{%%*|Z*cw|RsM|h zce}p-2I*&%`F}-0Ao-u(`5ATogYsjn{Yt5S@5BO+DF4l#k5TAHC;p-ND|`se) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index 935db254bd2e5d..dd96fb2366152c 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -30,7 +30,7 @@ def df_ref(): return df_ref -@pytest.fixture(params=['.xls', '.xlsx', '.xlsm']) +@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods']) def read_ext(request): """ Valid extensions for reading Excel files. 
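For orientation, the new engine is exercised through the ordinary ``read_excel`` entry point, which the test module added below pins via ``monkeypatch``. A minimal usage sketch (illustrative only, not part of the patch; the ``example.ods`` file name is hypothetical, and the third-party ``odf`` module must be installed, as the ``importorskip``/``skip_if_no`` guards assume)::

    import pandas as pd

    # engine="odf" routes read_excel through the new OpenDocument reader.
    df = pd.read_excel("example.ods", engine="odf")
    print(df.head())
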
diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py new file mode 100644 index 00000000000000..76b3fe19a0771c --- /dev/null +++ b/pandas/tests/io/excel/test_odf.py @@ -0,0 +1,39 @@ +import functools + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + +pytest.importorskip("odf") + + +@pytest.fixture(autouse=True) +def cd_and_set_engine(monkeypatch, datapath): + func = functools.partial(pd.read_excel, engine="odf") + monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.chdir(datapath("io", "data")) + + +def test_read_invalid_types_raises(): + # the invalid_value_type.ods required manually editing + # of the included content.xml file + with pytest.raises(ValueError, + match="Unrecognized type awesome_new_type"): + pd.read_excel("invalid_value_type.ods") + + +def test_read_writer_table(): + # Also test reading tables from an text OpenDocument file + # (.odt) + index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") + expected = pd.DataFrame([ + [1, np.nan, 7], + [2, np.nan, 8], + [3, np.nan, 9], + ], index=index, columns=["Column 1", "Unnamed: 2", "Column 3"]) + + result = pd.read_excel("writertable.odt", 'Table1', index_col=0) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index be5951fe12b469..ae69c2302e60a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -33,9 +33,21 @@ def ignore_xlrd_time_clock_warning(): @pytest.fixture(params=[ # Add any engines to test here - pytest.param('xlrd', marks=td.skip_if_no('xlrd')), - pytest.param('openpyxl', marks=td.skip_if_no('openpyxl')), - pytest.param(None, marks=td.skip_if_no('xlrd')), + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param('xlrd', marks=[ + td.skip_if_no('xlrd'), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ]), + pytest.param('openpyxl', marks=[ + td.skip_if_no('openpyxl'), + pytest.mark.filterwarnings("ignore:.*html argument"), + ]), + pytest.param(None, marks=[ + td.skip_if_no('xlrd'), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ]), + pytest.param("odf", marks=td.skip_if_no("odf")), ]) def engine(request): """ @@ -53,6 +65,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() + if engine == 'odf' and read_ext != '.ods': + pytest.skip() + if read_ext == ".ods" and engine != "odf": + pytest.skip() + func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'read_excel', func) @@ -62,14 +79,16 @@ def test_usecols_int(self, read_ext, df_ref): # usecols as int with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): with ignore_xlrd_time_clock_warning(): df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): with ignore_xlrd_time_clock_warning(): df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3) @@ -439,6 +458,9 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): + if read_ext == '.ods': # TODO: remove once on master + 
pytest.skip() + url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/data/test1' + read_ext) url_table = pd.read_excel(url) @@ -736,6 +758,10 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. """ + if engine == 'odf' and read_ext != '.ods': + pytest.skip() + if read_ext == ".ods" and engine != "odf": + pytest.skip() if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() @@ -802,7 +828,8 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, + raise_on_extra_warnings=False): with pd.ExcelFile('test1' + read_ext) as excel: df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 94e1435d4dfaba..d749f0ec3e2525 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -10,6 +10,12 @@ xlwt = pytest.importorskip("xlwt") +@pytest.fixture(autouse=True) +def skip_ods_files(read_ext): + if read_ext == ".ods": + pytest.skip("Not valid for xlrd") + + def test_read_xlrd_book(read_ext, frame): df = frame From 96c7ab55929eadf9386218536a695763ee65392b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Jul 2019 17:59:47 -0400 Subject: [PATCH 154/238] Shorter truncated Series/DataFrame repr: introduce min_rows (#27095) --- doc/source/user_guide/options.rst | 22 +++++++++- doc/source/whatsnew/v0.25.0.rst | 24 +++++++++++ pandas/core/config_init.py | 9 +++++ pandas/core/frame.py | 11 +++-- pandas/core/series.py | 11 ++++- pandas/io/formats/format.py | 22 ++++++++-- pandas/tests/io/formats/test_format.py | 56 ++++++++++++++++++++++++++ 7 files changed, 145 insertions(+), 10 deletions(-) diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 1f296c0d6c0889..f32a8adfd4d335 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -157,6 +157,22 @@ lines are replaced by an ellipsis. df pd.reset_option('max_rows') +Once the ``display.max_rows`` is exceeded, the ``display.min_rows`` options +determines how many rows are shown in the truncated repr. + +.. ipython:: python + + pd.set_option('max_rows', 8) + pd.set_option('max_rows', 4) + # below max_rows -> all rows shown + df = pd.DataFrame(np.random.randn(7, 2)) + df + # above max_rows -> only min_rows (4) rows shown + df = pd.DataFrame(np.random.randn(9, 2)) + df + pd.reset_option('max_rows') + pd.reset_option('min_rows') + ``display.expand_frame_repr`` allows for the representation of dataframes to stretch across pages, wrapped over the full column vs row-wise. @@ -352,8 +368,12 @@ display.max_rows 60 This sets the maximum numbe out various output. For example, this value determines whether the repr() for a dataframe prints out - fully or just a summary repr. + fully or just a truncated or summary repr. 'None' value means unlimited. +display.min_rows 10 The numbers of rows to show in a truncated + repr (when `max_rows` is exceeded). Ignored + when `max_rows` is set to None or 0. When set + to None, follows the value of `max_rows`. display.max_seq_items 100 when pretty-printing a long sequence, no more then `max_seq_items` will be printed. 
If items are omitted, diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a9a7b040429095..77426e950798c2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -134,6 +134,30 @@ than :attr:`options.display.max_seq_items` (default: 100 items). Horizontally, the output will truncate, if it's wider than :attr:`options.display.width` (default: 80 characters). +.. _whatsnew_0250.enhancements.shorter_truncated_repr: + +Shorter truncated repr for Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Currently, the default display options of pandas ensure that when a Series +or DataFrame has more than 60 rows, its repr gets truncated to this maximum +of 60 rows (the ``display.max_rows`` option). However, this still gives +a repr that takes up a large part of the vertical screen estate. Therefore, +a new option ``display.min_rows`` is introduced with a default of 10 which +determines the number of rows showed in the truncated repr: + +- For small Series or DataFrames, up to ``max_rows`` number of rows is shown + (default: 60). +- For larger Series of DataFrame with a length above ``max_rows``, only + ``min_rows`` number of rows is shown (default: 10, i.e. the first and last + 5 rows). + +This dual option allows to still see the full content of relatively small +objects (e.g. ``df.head(20)`` shows all 20 rows), while giving a brief repr +for large objects. + +To restore the previous behaviour of a single threshold, set +``pd.options.display.min_rows = None``. .. _whatsnew_0250.enhancements.json_normalize_with_max_level: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 7fe9f8438ac744..856d5076f37554 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -77,6 +77,13 @@ def use_numexpr_cb(key): correct auto-detection. """ +pc_min_rows_doc = """ +: int + The numbers of rows to show in a truncated view (when `max_rows` is + exceeded). Ignored when `max_rows` is set to None or 0. When set to + None, follows the value of `max_rows`. +""" + pc_max_cols_doc = """ : int If max_cols is exceeded, switch to truncate view. 
Depending on @@ -306,6 +313,8 @@ def is_terminal(): validator=is_instance_factory((int, type(None)))) cf.register_option('max_rows', 60, pc_max_rows_doc, validator=is_instance_factory([type(None), int])) + cf.register_option('min_rows', 10, pc_min_rows_doc, + validator=is_instance_factory([type(None), int])) cf.register_option('max_categories', 8, pc_max_categories_doc, validator=is_int) cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a1d6fa729065c..0dba7c7b5d2888 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -588,14 +588,16 @@ def __repr__(self): return buf.getvalue() max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") if get_option("display.expand_frame_repr"): width, _ = console.get_console_size() else: width = None - self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols, - line_width=width, show_dimensions=show_dimensions) + self.to_string(buf=buf, max_rows=max_rows, min_rows=min_rows, + max_cols=max_cols, line_width=width, + show_dimensions=show_dimensions) return buf.getvalue() @@ -633,8 +635,8 @@ def _repr_html_(self): def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, - max_rows=None, max_cols=None, show_dimensions=False, - decimal='.', line_width=None): + max_rows=None, min_rows=None, max_cols=None, + show_dimensions=False, decimal='.', line_width=None): """ Render a DataFrame to a console-friendly tabular output. %(shared_params)s @@ -663,6 +665,7 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, sparsify=sparsify, justify=justify, index_names=index_names, header=header, index=index, + min_rows=min_rows, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, diff --git a/pandas/core/series.py b/pandas/core/series.py index a2086c5f192493..13966d4551b541 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1505,17 +1505,20 @@ def __repr__(self): width, height = get_terminal_size() max_rows = (height if get_option("display.max_rows") == 0 else get_option("display.max_rows")) + min_rows = (height if get_option("display.max_rows") == 0 else + get_option("display.min_rows")) show_dimensions = get_option("display.show_dimensions") self.to_string(buf=buf, name=self.name, dtype=self.dtype, - max_rows=max_rows, length=show_dimensions) + min_rows=min_rows, max_rows=max_rows, + length=show_dimensions) result = buf.getvalue() return result def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, index=True, length=False, dtype=False, name=False, - max_rows=None): + max_rows=None, min_rows=None): """ Render a string representation of the Series. @@ -1541,6 +1544,9 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, max_rows : int, optional Maximum number of rows to show before truncating. If None, show all. + min_rows : int, optional + The number of rows to display in a truncated repr (when number + of rows is above `max_rows`). 
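To make the interplay of the two thresholds concrete, here is a small sketch of the behaviour described above (illustrative only; it uses the public option names introduced in this patch, and the exact repr output depends on the terminal)::

    import pandas as pd

    df = pd.DataFrame({"a": range(100)})

    # 100 rows exceed max_rows (60), so the repr is truncated down to
    # min_rows (10): the first and last five rows plus an ellipsis.
    with pd.option_context("display.max_rows", 60, "display.min_rows", 10):
        print(df)

    # With min_rows=None the previous single-threshold behaviour is
    # restored and up to max_rows rows appear in the truncated repr.
    with pd.option_context("display.max_rows", 60, "display.min_rows", None):
        print(df)
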
Returns ------- @@ -1552,6 +1558,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, header=header, index=index, dtype=dtype, na_rep=na_rep, float_format=float_format, + min_rows=min_rows, max_rows=max_rows) result = formatter.to_string() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3f98fc235b2c58..98c31fbeb78e68 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -79,6 +79,9 @@ * unset. max_rows : int, optional Maximum number of rows to display in the console. + min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). max_cols : int, optional Maximum number of columns to display in the console. show_dimensions : bool, default False @@ -159,7 +162,7 @@ class SeriesFormatter: def __init__(self, series, buf=None, length=True, header=True, index=True, na_rep='NaN', name=False, float_format=None, dtype=True, - max_rows=None): + max_rows=None, min_rows=None): self.series = series self.buf = buf if buf is not None else StringIO() self.name = name @@ -168,6 +171,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, self.length = length self.index = index self.max_rows = max_rows + self.min_rows = min_rows if float_format is None: float_format = get_option("display.float_format") @@ -179,10 +183,17 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, def _chk_truncate(self): from pandas.core.reshape.concat import concat + min_rows = self.min_rows max_rows = self.max_rows + # truncation determined by max_rows, actual truncated number of rows + # used below by min_rows truncate_v = max_rows and (len(self.series) > max_rows) series = self.series if truncate_v: + if min_rows: + # if min_rows is set (not None or 0), set max_rows to minimum + # of both + max_rows = min(min_rows, max_rows) if max_rows == 1: row_num = max_rows series = series.iloc[:max_rows] @@ -391,8 +402,8 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, justify=None, float_format=None, sparsify=None, index_names=True, line_width=None, max_rows=None, - max_cols=None, show_dimensions=False, decimal='.', - table_id=None, render_links=False, **kwds): + min_rows=None, max_cols=None, show_dimensions=False, + decimal='.', table_id=None, render_links=False, **kwds): self.frame = frame if buf is not None: self.buf = _expand_user(_stringify_path(buf)) @@ -414,6 +425,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.index = index self.line_width = line_width self.max_rows = max_rows + self.min_rows = min_rows self.max_cols = max_cols self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) @@ -471,6 +483,10 @@ def _chk_truncate(self): max_rows = h if not hasattr(self, 'max_rows_adj'): + if max_rows: + if (len(self.frame) > max_rows) and self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) self.max_rows_adj = max_rows if not hasattr(self, 'max_cols_adj'): self.max_cols_adj = max_cols diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0eeb0e6eb2f2df..7098a382cad45b 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -377,6 +377,34 @@ def mkframe(n): printing.pprint_thing(df._repr_fits_horizontal_()) assert has_expanded_repr(df) + def test_repr_min_rows(self): + df = 
pd.DataFrame({'a': range(20)}) + + # default setting no truncation even if above min_rows + assert '..' not in repr(df) + + df = pd.DataFrame({'a': range(61)}) + + # default of max_rows 60 triggers truncation if above + assert '..' in repr(df) + + with option_context('display.max_rows', 10, 'display.min_rows', 4): + # truncated after first two rows + assert '..' in repr(df) + assert '2 ' not in repr(df) + + with option_context('display.max_rows', 12, 'display.min_rows', None): + # when set to None, follow value of max_rows + assert '5 5' in repr(df) + + with option_context('display.max_rows', 10, 'display.min_rows', 12): + # when set value higher as max_rows, use the minimum + assert '5 5' not in repr(df) + + with option_context('display.max_rows', None, 'display.min_rows', 12): + # max_rows of None -> never truncate + assert '..' not in repr(df) + def test_str_max_colwidth(self): # GH 7856 df = pd.DataFrame([{'a': 'foo', @@ -2284,6 +2312,34 @@ def test_show_dimensions(self): "display.show_dimensions", False): assert 'Length' not in repr(s) + def test_repr_min_rows(self): + s = pd.Series(range(20)) + + # default setting no truncation even if above min_rows + assert '..' not in repr(s) + + s = pd.Series(range(61)) + + # default of max_rows 60 triggers truncation if above + assert '..' in repr(s) + + with option_context('display.max_rows', 10, 'display.min_rows', 4): + # truncated after first two rows + assert '..' in repr(s) + assert '2 ' not in repr(s) + + with option_context('display.max_rows', 12, 'display.min_rows', None): + # when set to None, follow value of max_rows + assert '5 5' in repr(s) + + with option_context('display.max_rows', 10, 'display.min_rows', 12): + # when set value higher as max_rows, use the minimum + assert '5 5' not in repr(s) + + with option_context('display.max_rows', None, 'display.min_rows', 12): + # max_rows of None -> never truncate + assert '..' not in repr(s) + def test_to_string_name(self): s = Series(range(100), dtype='int64') s.name = 'myser' From 1be80c9a012767972b24d9428078c8c354066975 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 3 Jul 2019 18:41:48 -0600 Subject: [PATCH 155/238] ENH: Implement is_empty property for Interval structures (#27221) * ENH: Implement is_empty property for Interval structures --- doc/source/reference/arrays.rst | 2 + doc/source/reference/indexing.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/interval.pyx | 53 +++++++++++++++++++ pandas/core/arrays/interval.py | 1 + pandas/tests/arrays/interval/test_interval.py | 21 +++++++- pandas/tests/scalar/interval/test_interval.py | 18 +++++++ 7 files changed, 96 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index bf9520c54040df..7f464bf952bfbf 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -295,6 +295,7 @@ Properties Interval.closed Interval.closed_left Interval.closed_right + Interval.is_empty Interval.left Interval.length Interval.mid @@ -331,6 +332,7 @@ A collection of intervals may be stored in an :class:`arrays.IntervalArray`. 
arrays.IntervalArray.closed arrays.IntervalArray.mid arrays.IntervalArray.length + arrays.IntervalArray.is_empty arrays.IntervalArray.is_non_overlapping_monotonic arrays.IntervalArray.from_arrays arrays.IntervalArray.from_tuples diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 65860eb5c2f51d..576f734d517aa0 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -254,6 +254,7 @@ IntervalIndex components IntervalIndex.closed IntervalIndex.length IntervalIndex.values + IntervalIndex.is_empty IntervalIndex.is_non_overlapping_monotonic IntervalIndex.is_overlapping IntervalIndex.get_loc diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 77426e950798c2..8f6cd586c29b3f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -212,6 +212,7 @@ Other enhancements - :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`) - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`) - :func:`pandas.io.excel.read_excel` supports reading OpenDocument tables. Specify ``engine='odf'`` to enable. Consult the :ref:`IO User Guide ` for more details (:issue:`9070`) +- :class:`Interval`, :class:`IntervalIndex`, and :class:`~arrays.IntervalArray` have gained an :attr:`~Interval.is_empty` attribute denoting if the given interval(s) are empty (:issue:`27219`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 6c1df419865edc..3c7ec70fb1f88f 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -107,6 +107,59 @@ cdef class IntervalMixin: """Return the length of the Interval""" return self.right - self.left + @property + def is_empty(self): + """ + Indicates if an interval is empty, meaning it contains no points. + + .. versionadded:: 0.25.0 + + Returns + ------- + bool or ndarray + A boolean indicating if a scalar :class:`Interval` is empty, or a + boolean ``ndarray`` positionally indicating if an ``Interval`` in + an :class:`~arrays.IntervalArray` or :class:`IntervalIndex` is + empty. + + Examples + -------- + An :class:`Interval` that contains points is not empty: + + >>> pd.Interval(0, 1, closed='right').is_empty + False + + An ``Interval`` that does not contain any points is empty: + + >>> pd.Interval(0, 0, closed='right').is_empty + True + >>> pd.Interval(0, 0, closed='left').is_empty + True + >>> pd.Interval(0, 0, closed='neither').is_empty + True + + An ``Interval`` that contains a single point is not empty: + + >>> pd.Interval(0, 0, closed='both').is_empty + False + + An :class:`~arrays.IntervalArray` or :class:`IntervalIndex` returns a + boolean ``ndarray`` positionally indicating if an ``Interval`` is + empty: + + >>> ivs = [pd.Interval(0, 0, closed='neither'), + ... pd.Interval(1, 2, closed='neither')] + >>> pd.arrays.IntervalArray(ivs).is_empty + array([ True, False]) + + Missing values are not considered empty: + + >>> ivs = [pd.Interval(0, 0, closed='neither'), np.nan] + >>> pd.IntervalIndex(ivs).is_empty + array([ True, False]) + """ + return (self.right == self.left) & (self.closed != 'both') + def _check_closed_matches(self, other, name='other'): """Check if the closed attribute of `other` matches. 
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index aa56d99d298f47..cf8ca25857f4e5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -66,6 +66,7 @@ closed mid length +is_empty is_non_overlapping_monotonic %(extra_attributes)s\ diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 34de36b4f66659..4a7962d88a44e4 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -2,7 +2,9 @@ import pytest import pandas as pd -from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range +from pandas import ( + Index, Interval, IntervalIndex, Timedelta, Timestamp, date_range, + timedelta_range) from pandas.core.arrays import IntervalArray import pandas.util.testing as tm @@ -23,6 +25,23 @@ def left_right_dtypes(request): return request.param +class TestAttributes: + @pytest.mark.parametrize('left, right', [ + (0, 1), + (Timedelta('0 days'), Timedelta('1 day')), + (Timestamp('2018-01-01'), Timestamp('2018-01-02')), + pytest.param(Timestamp('2018-01-01', tz='US/Eastern'), + Timestamp('2018-01-02', tz='US/Eastern'), + marks=pytest.mark.xfail(strict=True, reason='GH 27011'))]) + @pytest.mark.parametrize('constructor', [IntervalArray, IntervalIndex]) + def test_is_empty(self, constructor, left, right, closed): + # GH27219 + tuples = [(left, left), (left, right), np.nan] + expected = np.array([closed != 'both', False, False]) + result = constructor.from_tuples(tuples, closed=closed).is_empty + tm.assert_numpy_array_equal(result, expected) + + class TestMethods: @pytest.mark.parametrize('new_closed', [ diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index e19ff82b9b2672..66452443187765 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -94,6 +94,24 @@ def test_length_timestamp(self, tz, left, right, expected): expected = Timedelta(expected) assert result == expected + @pytest.mark.parametrize('left, right', [ + (0, 1), + (Timedelta('0 days'), Timedelta('1 day')), + (Timestamp('2018-01-01'), Timestamp('2018-01-02')), + (Timestamp('2018-01-01', tz='US/Eastern'), + Timestamp('2018-01-02', tz='US/Eastern'))]) + def test_is_empty(self, left, right, closed): + # GH27219 + # non-empty always return False + iv = Interval(left, right, closed) + assert iv.is_empty is False + + # same endpoint is empty except when closed='both' (contains one point) + iv = Interval(left, left, closed) + result = iv.is_empty + expected = closed != 'both' + assert result is expected + @pytest.mark.parametrize('left, right', [ ('a', 'z'), (('a', 'b'), ('c', 'd')), From c7d7e81d544f3a6d2723037b8ae1e92c50b54d34 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 19:58:15 -0500 Subject: [PATCH 156/238] DOC: Add 0.25.0 to whatsnew index (#27216) --- doc/source/whatsnew/index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 6c529d2e2e5f34..592b4748126c18 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/pandas-dev/pandas. For install and upgrade instructions, see :ref:`install`. +Version 0.25 +------------ + +.. 
toctree:: + :maxdepth: 2 + + v0.25.0 + Version 0.24 ------------ From ce567de90e040e7b76587cc8646a22322888b157 Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 4 Jul 2019 04:38:04 +0300 Subject: [PATCH 157/238] PERF: changed default value of cache parameter to True in to_datetime function (#26043) --- asv_bench/benchmarks/io/csv.py | 25 +++--- asv_bench/benchmarks/timeseries.py | 13 ++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/tools/datetimes.py | 80 ++++++++++++++++++-- pandas/tests/indexes/datetimes/test_tools.py | 20 +++++ pandas/tests/io/parser/test_parse_dates.py | 15 ++++ 6 files changed, 136 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6beb21883b5ab8..fbb96380a58134 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -4,7 +4,6 @@ import numpy as np import pandas.util.testing as tm from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from pandas.io.parsers import _parser_defaults from io import StringIO from ..pandas_vb_common import BaseIO @@ -272,13 +271,12 @@ def setup(self, do_cache): self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): - # kwds setting here is used to avoid breaking tests in - # previous version of pandas, because this is api changes - kwds = {} - if 'cache_dates' in _parser_defaults: - kwds['cache_dates'] = do_cache - read_csv(self.data(self.StringIO_input), header=None, - parse_dates=[0], **kwds) + try: + read_csv(self.data(self.StringIO_input), header=None, + parse_dates=[0], cache_dates=do_cache) + except TypeError: + # cache_dates is a new keyword in 0.25 + pass class ReadCSVMemoryGrowth(BaseIO): @@ -329,9 +327,14 @@ def setup(self, cache_dates): self.StringIO_input = StringIO(data) def time_read_csv_dayfirst(self, cache_dates): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], cache_dates=cache_dates, - dayfirst=True) + try: + read_csv(self.data(self.StringIO_input), sep=',', header=None, + names=['Date'], parse_dates=['Date'], + cache_dates=cache_dates, + dayfirst=True) + except TypeError: + # cache_dates is a new keyword in 0.25 + pass def time_to_datetime_dayfirst(self, cache_dates): df = read_csv(self.data(self.StringIO_input), diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 7de1c42246ad5b..14ee8747cf81d9 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') +class ToDatetimeCacheSmallCount(object): + + params = ([True, False], [50, 500, 5000, 100000]) + param_names = ['cache', 'count'] + + def setup(self, cache, count): + rng = date_range(start='1/1/1971', periods=count) + self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist() + + def time_unique_date_strings(self, cache, count): + to_datetime(self.unique_date_strings, cache=cache) + + class ToDatetimeISO8601: def setup(self): diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8f6cd586c29b3f..ab242ece981817 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -937,6 +937,7 @@ Performance improvements - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`) - Improved performance when building :class:`MultiIndex` with at least one 
:class:`CategoricalIndex` level (:issue:`22044`) - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`) +- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d543ae91ad344f..3e3318ed4c4b6b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -22,6 +22,14 @@ from pandas._typing import ArrayLike from pandas.core import algorithms +from pandas.core.algorithms import unique + +# --------------------------------------------------------------------- +# types used in annotations + +ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries] + +# --------------------------------------------------------------------- # --------------------------------------------------------------------- # types used in annotations @@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) +def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, + check_count: Optional[int] = None) -> bool: + """ + Decides whether to do caching. + + If the percent of unique elements among `check_count` elements less + than `unique_share * 100` then we can do caching. + + Parameters + ---------- + arg: listlike, tuple, 1-d array, Series + unique_share: float, default=0.7, optional + 0 < unique_share < 1 + check_count: int, optional + 0 <= check_count <= len(arg) + + Returns + ------- + do_caching: bool + + Notes + ----- + By default for a sequence of less than 50 items in size, we don't do + caching; for the number of elements less than 5000, we take ten percent of + all elements to check for a uniqueness share; if the sequence size is more + than 5000, then we check only the first 500 elements. + All constants were chosen empirically by. 
+ """ + do_caching = True + + # default realization + if check_count is None: + # in this case, the gain from caching is negligible + if len(arg) <= 50: + return False + + if len(arg) <= 5000: + check_count = int(len(arg) * 0.1) + else: + check_count = 500 + else: + assert 0 <= check_count <= len(arg), \ + 'check_count must be in next bounds: [0; len(arg)]' + if check_count == 0: + return False + + assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + + unique_elements = unique(arg[:check_count]) + if len(unique_elements) > check_count * unique_share: + do_caching = False + return do_caching + + def _maybe_cache(arg, format, cache, convert_listlike): """ Create a cache of unique dates from an array of dates Parameters ---------- - arg : integer, float, string, datetime, list, tuple, 1-d array, Series + arg : listlike, tuple, 1-d array, Series format : string Strftime format to parse time cache : boolean @@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike): cache_array = Series() if cache: # Perform a quicker unique check - from pandas import Index - unique_dates = Index(arg).unique() + if not should_cache(arg): + return cache_array + + unique_dates = unique(arg) if len(unique_dates) < len(arg): - cache_dates = convert_listlike(unique_dates.to_numpy(), - True, format) + cache_dates = convert_listlike(unique_dates, True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array @@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit): def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', - cache=False): + cache=True): """ Convert argument to datetime. @@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, origin. .. versionadded:: 0.20.0 - cache : boolean, default False + cache : boolean, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. .. versionadded:: 0.23.0 + .. versionchanged:: 0.25.0 + - changed default value from False to True + Returns ------- ret : datetime if parsing succeeded. 
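A short sketch of what the new default means in practice (illustrative only, not part of the patch): for low-cardinality input the ``should_cache`` heuristic above turns the cache on automatically, while ``cache=False`` still gives the previous behaviour.

    import pandas as pd

    # 100000 strings but only 30 distinct dates: far below the 0.7 unique-share
    # threshold sampled by should_cache, so the converted dates are cached.
    dates = ["2019-07-{:02d}".format(d % 30 + 1) for d in range(100000)]

    with_cache = pd.to_datetime(dates)                  # cache=True is now the default
    without_cache = pd.to_datetime(dates, cache=False)  # previous behaviour

    assert (with_cache == without_cache).all()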
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index f401a7f7c9e9b9..784633b2512cec 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp): result = to_datetime([arg], unit='ns', utc=utc) expected = to_datetime([exp]) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('listlike,do_caching', [ + ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), + ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True) +]) +def test_should_cache(listlike, do_caching): + assert tools.should_cache(listlike, check_count=len(listlike), + unique_share=0.7) == do_caching + + +@pytest.mark.parametrize('unique_share,check_count, err_message', [ + (0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'), + (10, 2, r'unique_share must be in next bounds: \(0; 1\)') +]) +def test_should_cache_errors(unique_share, check_count, err_message): + arg = [5] * 10 + + with pytest.raises(AssertionError, match=err_message): + tools.should_cache(arg, unique_share, check_count) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index b0c3944e0aff82..25589a1682f7a6 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", [ + 'nan', '0', '']) +def test_bad_date_parse(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly + parser = all_parsers + s = StringIO(('%s,\n' % value) * 50000) + + parser.read_csv(s, + header=None, names=['foo', 'bar'], parse_dates=['foo'], + infer_datetime_format=False, + cache_dates=cache_dates) + + def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers From b3d3ce72b3fbbf9895a5a238ee893d8b8bbde5a3 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 4 Jul 2019 02:38:36 +0100 Subject: [PATCH 158/238] CLN: simplify MultiIndex._shallow_copy (#27187) * CLN: simplify MultiIndex._shallow_copy * Don't collect names in ._shallow_copy, is done in .copy already --- pandas/core/indexes/multi.py | 4 ++-- pandas/tests/indexes/multi/test_equivalence.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd64f18c50b34b..0823a3ed9ad597 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -929,7 +929,7 @@ def _shallow_copy(self, values=None, **kwargs): # discards freq kwargs.pop('freq', None) return MultiIndex.from_tuples(values, names=names, **kwargs) - return self.view() + return self.copy(**kwargs) @cache_readonly def dtype(self): @@ -1810,7 +1810,7 @@ def remove_unused_levels(self): new_levels.append(lev) new_codes.append(level_codes) - result = self._shallow_copy() + result = self.view() if changed: result._reset_identity() diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index bbb821dff53d30..3bdccbb8ab38d7 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -175,7 +175,7 @@ def test_is_(): assert mi2.is_(mi) assert mi.is_(mi2) - assert mi.is_(mi.set_names(["C", "D"])) + assert 
not mi.is_(mi.set_names(["C", "D"])) mi2 = mi.view() mi2.set_names(["E", "F"], inplace=True) assert mi.is_(mi2) From 4199c98a26ed51674b5d21cba039867640225510 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Jun 2019 10:48:05 -0500 Subject: [PATCH 159/238] Prepare black formatting: update flake8 config + add to CI code checks --- ci/code_checks.sh | 7 +++++++ environment.yml | 1 + requirements-dev.txt | 1 + setup.cfg | 9 +++++++-- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 00c430064e4a55..1494452ca136ba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -52,6 +52,13 @@ fi ### LINTING ### if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then + echo "black --version" + black --version + + MSG='Checking black formatting' ; echo $MSG + black . --check + RET=$(($RET + $?)) ; echo $MSG "DONE" + # `setup.cfg` contains the list of error codes that are being ignored in flake8 echo "flake8 --version" diff --git a/environment.yml b/environment.yml index c21a0949fc4039..93e8302b498a0d 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - cython>=0.28.2 # code checks + - black - cpplint - flake8 - flake8-comprehensions # used by flake8, linting of unnecessary comprehensions diff --git a/requirements-dev.txt b/requirements-dev.txt index 169af7da5e037b..e49ad10bfc99da 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,6 +3,7 @@ python-dateutil>=2.6.1 pytz asv cython>=0.28.2 +black cpplint flake8 flake8-comprehensions diff --git a/setup.cfg b/setup.cfg index 77dc043042f794..fee0ab60f25b53 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,8 +12,10 @@ tag_prefix = v parentdir_prefix = pandas- [flake8] -max-line-length = 79 +max-line-length = 88 ignore = + E501, # longer line length + E203, # space before : (needed for how black formats slicing) W503, # line break before binary operator W504, # line break after binary operator E402, # module level import not at top of file @@ -113,9 +115,12 @@ sections=FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE, known_first_party=pandas known_third_party=Cython,numpy,dateutil,matplotlib,python-dateutil,pytz,pyarrow,pytest -multi_line_output=4 + +multi_line_output=3 +include_trailing_comma=True force_grid_wrap=0 combine_as_imports=True +line_length=88 force_sort_within_sections=True skip_glob=env, skip= From bb6135880e5e453d7701764b9f2e4ad3356a68d7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Jul 2019 22:34:46 -0400 Subject: [PATCH 160/238] STYLE: Apply black formatting --- asv_bench/benchmarks/algorithms.py | 123 +- asv_bench/benchmarks/attrs_caching.py | 6 +- asv_bench/benchmarks/binary_ops.py | 51 +- asv_bench/benchmarks/categoricals.py | 114 +- asv_bench/benchmarks/ctors.py | 52 +- asv_bench/benchmarks/dtypes.py | 32 +- asv_bench/benchmarks/eval.py | 31 +- asv_bench/benchmarks/frame_ctor.py | 22 +- asv_bench/benchmarks/frame_methods.py | 223 +- asv_bench/benchmarks/gil.py | 136 +- asv_bench/benchmarks/groupby.py | 515 +- asv_bench/benchmarks/index_object.py | 75 +- asv_bench/benchmarks/indexing.py | 135 +- asv_bench/benchmarks/indexing_engines.py | 59 +- asv_bench/benchmarks/inference.py | 75 +- asv_bench/benchmarks/io/csv.py | 309 +- asv_bench/benchmarks/io/excel.py | 18 +- asv_bench/benchmarks/io/hdf.py | 106 +- asv_bench/benchmarks/io/json.py | 134 +- asv_bench/benchmarks/io/msgpack.py | 13 +- asv_bench/benchmarks/io/parsers.py | 18 +- asv_bench/benchmarks/io/pickle.py | 13 +- 
asv_bench/benchmarks/io/sas.py | 20 +- asv_bench/benchmarks/io/sql.py | 161 +- asv_bench/benchmarks/io/stata.py | 38 +- asv_bench/benchmarks/join_merge.py | 274 +- asv_bench/benchmarks/multiindex_object.py | 84 +- asv_bench/benchmarks/offset.py | 73 +- asv_bench/benchmarks/pandas_vb_common.py | 39 +- asv_bench/benchmarks/period.py | 92 +- asv_bench/benchmarks/plotting.py | 47 +- asv_bench/benchmarks/reindex.py | 70 +- asv_bench/benchmarks/replace.py | 29 +- asv_bench/benchmarks/reshape.py | 144 +- asv_bench/benchmarks/rolling.py | 80 +- asv_bench/benchmarks/series_methods.py | 141 +- asv_bench/benchmarks/sparse.py | 49 +- asv_bench/benchmarks/stat_ops.py | 47 +- asv_bench/benchmarks/strings.py | 84 +- asv_bench/benchmarks/timedelta.py | 59 +- asv_bench/benchmarks/timeseries.py | 206 +- asv_bench/benchmarks/timestamp.py | 35 +- ci/print_skipped.py | 33 +- doc/logo/pandas_logo.py | 16 +- doc/make.py | 266 +- doc/source/conf.py | 347 +- doc/sphinxext/announce.py | 48 +- doc/sphinxext/contributors.py | 24 +- pandas/__init__.py | 189 +- pandas/_config/__init__.py | 21 +- pandas/_config/config.py | 126 +- pandas/_config/dates.py | 12 +- pandas/_config/display.py | 11 +- pandas/_config/localization.py | 22 +- pandas/_libs/__init__.py | 9 +- pandas/_typing.py | 28 +- pandas/_version.py | 123 +- pandas/api/extensions/__init__.py | 14 +- pandas/api/types/__init__.py | 6 +- pandas/arrays/__init__.py | 27 +- pandas/compat/__init__.py | 14 +- pandas/compat/_optional.py | 17 +- pandas/compat/chainmap.py | 1 - pandas/compat/numpy/__init__.py | 47 +- pandas/compat/numpy/function.py | 296 +- pandas/compat/pickle_compat.py | 136 +- pandas/conftest.py | 274 +- pandas/core/accessor.py | 67 +- pandas/core/algorithms.py | 605 ++- pandas/core/api.py | 19 +- pandas/core/apply.py | 162 +- pandas/core/arrays/__init__.py | 5 +- pandas/core/arrays/_ranges.py | 78 +- pandas/core/arrays/array_.py | 31 +- pandas/core/arrays/base.py | 97 +- pandas/core/arrays/categorical.py | 636 +-- pandas/core/arrays/datetimelike.py | 354 +- pandas/core/arrays/datetimes.py | 647 ++- pandas/core/arrays/integer.py | 274 +- pandas/core/arrays/interval.py | 348 +- pandas/core/arrays/numpy_.py | 175 +- pandas/core/arrays/period.py | 267 +- pandas/core/arrays/sparse.py | 462 +- pandas/core/arrays/timedeltas.py | 249 +- pandas/core/base.py | 368 +- pandas/core/common.py | 50 +- pandas/core/computation/align.py | 36 +- pandas/core/computation/check.py | 5 +- pandas/core/computation/common.py | 2 +- pandas/core/computation/engines.py | 23 +- pandas/core/computation/eval.py | 95 +- pandas/core/computation/expr.py | 347 +- pandas/core/computation/expressions.py | 88 +- pandas/core/computation/ops.py | 194 +- pandas/core/computation/pytables.py | 205 +- pandas/core/computation/scope.py | 66 +- pandas/core/config_init.py | 351 +- pandas/core/dtypes/api.py | 55 +- pandas/core/dtypes/base.py | 16 +- pandas/core/dtypes/cast.py | 342 +- pandas/core/dtypes/common.py | 228 +- pandas/core/dtypes/concat.py | 167 +- pandas/core/dtypes/dtypes.py | 243 +- pandas/core/dtypes/generic.py | 125 +- pandas/core/dtypes/inference.py | 47 +- pandas/core/dtypes/missing.py | 109 +- pandas/core/frame.py | 1980 +++++--- pandas/core/generic.py | 2806 +++++++---- pandas/core/groupby/__init__.py | 5 +- pandas/core/groupby/base.py | 67 +- pandas/core/groupby/categorical.py | 15 +- pandas/core/groupby/generic.py | 467 +- pandas/core/groupby/groupby.py | 600 ++- pandas/core/groupby/grouper.py | 230 +- pandas/core/groupby/ops.py | 351 +- pandas/core/index.py | 28 +- 
pandas/core/indexes/accessors.py | 112 +- pandas/core/indexes/api.py | 83 +- pandas/core/indexes/base.py | 1376 +++--- pandas/core/indexes/category.py | 253 +- pandas/core/indexes/datetimelike.py | 182 +- pandas/core/indexes/datetimes.py | 582 ++- pandas/core/indexes/frozen.py | 27 +- pandas/core/indexes/interval.py | 524 +- pandas/core/indexes/multi.py | 944 ++-- pandas/core/indexes/numeric.py | 217 +- pandas/core/indexes/period.py | 406 +- pandas/core/indexes/range.py | 206 +- pandas/core/indexes/timedeltas.py | 262 +- pandas/core/indexing.py | 531 +- pandas/core/internals/__init__.py | 26 +- pandas/core/internals/blocks.py | 1119 +++-- pandas/core/internals/concat.py | 149 +- pandas/core/internals/construction.py | 236 +- pandas/core/internals/managers.py | 641 ++- pandas/core/missing.py | 292 +- pandas/core/nanops.py | 297 +- pandas/core/ops.py | 828 ++-- pandas/core/resample.py | 510 +- pandas/core/reshape/concat.py | 212 +- pandas/core/reshape/melt.py | 94 +- pandas/core/reshape/merge.py | 920 ++-- pandas/core/reshape/pivot.py | 267 +- pandas/core/reshape/reshape.py | 324 +- pandas/core/reshape/tile.py | 182 +- pandas/core/reshape/util.py | 9 +- pandas/core/series.py | 1041 ++-- pandas/core/sorting.py | 106 +- pandas/core/sparse/frame.py | 396 +- pandas/core/sparse/scipy_sparse.py | 49 +- pandas/core/sparse/series.py | 277 +- pandas/core/strings.py | 798 +-- pandas/core/tools/datetimes.py | 391 +- pandas/core/tools/numeric.py | 48 +- pandas/core/tools/timedeltas.py | 57 +- pandas/core/util/hashing.py | 132 +- pandas/core/window.py | 964 ++-- pandas/errors/__init__.py | 11 +- pandas/io/clipboard/__init__.py | 41 +- pandas/io/clipboard/clipboards.py | 64 +- pandas/io/clipboard/exceptions.py | 1 - pandas/io/clipboard/windows.py | 49 +- pandas/io/clipboards.py | 55 +- pandas/io/common.py | 149 +- pandas/io/date_converters.py | 15 +- pandas/io/excel/_base.py | 365 +- pandas/io/excel/_odfreader.py | 44 +- pandas/io/excel/_openpyxl.py | 109 +- pandas/io/excel/_util.py | 43 +- pandas/io/excel/_xlrd.py | 38 +- pandas/io/excel/_xlsxwriter.py | 240 +- pandas/io/excel/_xlwt.py | 59 +- pandas/io/feather_format.py | 36 +- pandas/io/formats/console.py | 17 +- pandas/io/formats/css.py | 156 +- pandas/io/formats/csvs.py | 188 +- pandas/io/formats/excel.py | 456 +- pandas/io/formats/format.py | 699 +-- pandas/io/formats/html.py | 285 +- pandas/io/formats/latex.py | 151 +- pandas/io/formats/printing.py | 179 +- pandas/io/formats/style.py | 445 +- pandas/io/gbq.py | 81 +- pandas/io/gcs.py | 13 +- pandas/io/html.py | 254 +- pandas/io/json/json.py | 581 ++- pandas/io/json/normalize.py | 74 +- pandas/io/json/table_schema.py | 152 +- pandas/io/msgpack/__init__.py | 4 +- pandas/io/msgpack/exceptions.py | 1 - pandas/io/packers.py | 732 +-- pandas/io/parquet.py | 136 +- pandas/io/parsers.py | 1605 +++--- pandas/io/pickle.py | 14 +- pandas/io/pytables.py | 2055 ++++---- pandas/io/s3.py | 13 +- pandas/io/sas/sas7bdat.py | 326 +- pandas/io/sas/sas_constants.py | 140 +- pandas/io/sas/sas_xport.py | 190 +- pandas/io/sas/sasreader.py | 38 +- pandas/io/spss.py | 13 +- pandas/io/sql.py | 672 ++- pandas/io/stata.py | 1243 +++-- pandas/plotting/__init__.py | 50 +- pandas/plotting/_core.py | 380 +- pandas/plotting/_matplotlib/__init__.py | 81 +- pandas/plotting/_matplotlib/boxplot.py | 254 +- pandas/plotting/_matplotlib/compat.py | 12 +- pandas/plotting/_matplotlib/converter.py | 433 +- pandas/plotting/_matplotlib/core.py | 613 ++- pandas/plotting/_matplotlib/hist.py | 316 +- pandas/plotting/_matplotlib/misc.py | 171 +- 
pandas/plotting/_matplotlib/style.py | 24 +- pandas/plotting/_matplotlib/timeseries.py | 143 +- pandas/plotting/_matplotlib/tools.py | 103 +- pandas/plotting/_misc.py | 138 +- pandas/testing.py | 5 +- pandas/tests/api/test_api.py | 215 +- pandas/tests/api/test_types.py | 72 +- pandas/tests/arithmetic/conftest.py | 150 +- pandas/tests/arithmetic/test_datetime64.py | 1484 +++--- pandas/tests/arithmetic/test_numeric.py | 544 ++- pandas/tests/arithmetic/test_object.py | 215 +- pandas/tests/arithmetic/test_period.py | 641 +-- pandas/tests/arithmetic/test_timedelta64.py | 1071 ++-- pandas/tests/arrays/categorical/common.py | 6 +- pandas/tests/arrays/categorical/test_algos.py | 78 +- .../arrays/categorical/test_analytics.py | 144 +- pandas/tests/arrays/categorical/test_api.py | 261 +- .../arrays/categorical/test_constructors.py | 293 +- .../tests/arrays/categorical/test_dtypes.py | 136 +- .../tests/arrays/categorical/test_indexing.py | 173 +- .../tests/arrays/categorical/test_missing.py | 36 +- .../arrays/categorical/test_operators.py | 228 +- pandas/tests/arrays/categorical/test_repr.py | 116 +- .../tests/arrays/categorical/test_sorting.py | 38 +- .../tests/arrays/categorical/test_subclass.py | 13 +- .../tests/arrays/categorical/test_warnings.py | 12 +- pandas/tests/arrays/interval/test_interval.py | 75 +- pandas/tests/arrays/interval/test_ops.py | 56 +- pandas/tests/arrays/sparse/test_accessor.py | 94 +- .../tests/arrays/sparse/test_arithmetics.py | 144 +- pandas/tests/arrays/sparse/test_array.py | 446 +- pandas/tests/arrays/sparse/test_dtype.py | 182 +- pandas/tests/arrays/sparse/test_libsparse.py | 266 +- pandas/tests/arrays/test_array.py | 381 +- pandas/tests/arrays/test_datetimelike.py | 193 +- pandas/tests/arrays/test_datetimes.py | 193 +- pandas/tests/arrays/test_integer.py | 316 +- pandas/tests/arrays/test_numpy.py | 93 +- pandas/tests/arrays/test_period.py | 194 +- pandas/tests/arrays/test_timedeltas.py | 77 +- pandas/tests/computation/test_compat.py | 15 +- pandas/tests/computation/test_eval.py | 1173 ++--- pandas/tests/config/test_config.py | 371 +- pandas/tests/config/test_localization.py | 8 +- .../dtypes/cast/test_construct_from_scalar.py | 6 +- .../dtypes/cast/test_construct_ndarray.py | 17 +- .../dtypes/cast/test_construct_object_arr.py | 6 +- pandas/tests/dtypes/cast/test_downcast.py | 30 +- .../dtypes/cast/test_find_common_type.py | 154 +- .../dtypes/cast/test_infer_datetimelike.py | 13 +- pandas/tests/dtypes/cast/test_infer_dtype.py | 93 +- pandas/tests/dtypes/cast/test_promote.py | 575 ++- pandas/tests/dtypes/cast/test_upcast.py | 90 +- pandas/tests/dtypes/test_common.py | 480 +- pandas/tests/dtypes/test_concat.py | 100 +- pandas/tests/dtypes/test_dtypes.py | 681 +-- pandas/tests/dtypes/test_generic.py | 38 +- pandas/tests/dtypes/test_inference.py | 978 ++-- pandas/tests/dtypes/test_missing.py | 344 +- pandas/tests/extension/arrow/bool.py | 32 +- pandas/tests/extension/arrow/test_bool.py | 4 +- pandas/tests/extension/base/__init__.py | 8 +- pandas/tests/extension/base/base.py | 4 +- pandas/tests/extension/base/constructors.py | 5 +- pandas/tests/extension/base/dtype.py | 27 +- pandas/tests/extension/base/getitem.py | 48 +- pandas/tests/extension/base/groupby.py | 59 +- pandas/tests/extension/base/interface.py | 9 +- pandas/tests/extension/base/io.py | 13 +- pandas/tests/extension/base/methods.py | 136 +- pandas/tests/extension/base/missing.py | 55 +- pandas/tests/extension/base/ops.py | 19 +- pandas/tests/extension/base/printing.py | 6 +- pandas/tests/extension/base/reduce.py 
| 13 +- pandas/tests/extension/base/reshaping.py | 199 +- pandas/tests/extension/base/setitem.py | 46 +- pandas/tests/extension/conftest.py | 25 +- pandas/tests/extension/decimal/__init__.py | 2 +- pandas/tests/extension/decimal/array.py | 35 +- .../tests/extension/decimal/test_decimal.py | 147 +- pandas/tests/extension/json/__init__.py | 2 +- pandas/tests/extension/json/array.py | 44 +- pandas/tests/extension/json/test_json.py | 82 +- pandas/tests/extension/test_categorical.py | 27 +- pandas/tests/extension/test_common.py | 31 +- pandas/tests/extension/test_datetime.py | 94 +- pandas/tests/extension/test_external_block.py | 21 +- pandas/tests/extension/test_integer.py | 59 +- pandas/tests/extension/test_interval.py | 7 +- pandas/tests/extension/test_numpy.py | 62 +- pandas/tests/extension/test_period.py | 28 +- pandas/tests/extension/test_sparse.py | 90 +- pandas/tests/frame/common.py | 114 +- pandas/tests/frame/conftest.py | 72 +- pandas/tests/frame/test_alter_axes.py | 1289 ++--- pandas/tests/frame/test_analytics.py | 2001 ++++---- pandas/tests/frame/test_api.py | 264 +- pandas/tests/frame/test_apply.py | 1013 ++-- pandas/tests/frame/test_arithmetic.py | 303 +- pandas/tests/frame/test_asof.py | 79 +- .../tests/frame/test_axis_select_reindex.py | 735 +-- pandas/tests/frame/test_block_internals.py | 461 +- pandas/tests/frame/test_combine_concat.py | 800 +-- pandas/tests/frame/test_constructors.py | 1827 +++---- pandas/tests/frame/test_convert_to.py | 634 +-- pandas/tests/frame/test_dtypes.py | 1182 +++-- pandas/tests/frame/test_duplicates.py | 300 +- pandas/tests/frame/test_indexing.py | 1975 ++++---- pandas/tests/frame/test_join.py | 154 +- pandas/tests/frame/test_missing.py | 721 +-- pandas/tests/frame/test_mutate_columns.py | 209 +- pandas/tests/frame/test_nonunique_indexes.py | 495 +- pandas/tests/frame/test_operators.py | 514 +- pandas/tests/frame/test_period.py | 101 +- pandas/tests/frame/test_quantile.py | 410 +- pandas/tests/frame/test_query_eval.py | 709 +-- pandas/tests/frame/test_rank.py | 176 +- pandas/tests/frame/test_replace.py | 1143 +++-- pandas/tests/frame/test_repr_info.py | 317 +- pandas/tests/frame/test_reshape.py | 1023 ++-- .../frame/test_sort_values_level_as_str.py | 71 +- pandas/tests/frame/test_sorting.py | 616 +-- pandas/tests/frame/test_subclass.py | 586 +-- pandas/tests/frame/test_timeseries.py | 590 +-- pandas/tests/frame/test_timezones.py | 179 +- pandas/tests/frame/test_to_csv.py | 889 ++-- pandas/tests/frame/test_validate.py | 19 +- pandas/tests/generic/test_frame.py | 220 +- pandas/tests/generic/test_generic.py | 332 +- .../generic/test_label_or_level_utils.py | 99 +- pandas/tests/generic/test_series.py | 151 +- .../tests/groupby/aggregate/test_aggregate.py | 407 +- pandas/tests/groupby/aggregate/test_cython.py | 216 +- pandas/tests/groupby/aggregate/test_other.py | 595 ++- pandas/tests/groupby/conftest.py | 98 +- pandas/tests/groupby/test_apply.py | 444 +- pandas/tests/groupby/test_bin_groupby.py | 59 +- pandas/tests/groupby/test_categorical.py | 1089 +++-- pandas/tests/groupby/test_counting.py | 125 +- pandas/tests/groupby/test_filters.py | 338 +- pandas/tests/groupby/test_function.py | 1259 ++--- pandas/tests/groupby/test_groupby.py | 1187 ++--- pandas/tests/groupby/test_grouping.py | 666 +-- pandas/tests/groupby/test_index_as_string.py | 70 +- pandas/tests/groupby/test_nth.py | 516 +- pandas/tests/groupby/test_rank.py | 566 ++- pandas/tests/groupby/test_timegrouper.py | 845 ++-- pandas/tests/groupby/test_transform.py | 750 +-- 
pandas/tests/groupby/test_value_counts.py | 48 +- pandas/tests/groupby/test_whitelist.py | 318 +- pandas/tests/indexes/common.py | 206 +- pandas/tests/indexes/conftest.py | 44 +- pandas/tests/indexes/datetimelike.py | 13 +- .../indexes/datetimes/test_arithmetic.py | 93 +- pandas/tests/indexes/datetimes/test_astype.py | 283 +- .../indexes/datetimes/test_construction.py | 832 ++-- .../indexes/datetimes/test_date_range.py | 699 +-- .../tests/indexes/datetimes/test_datetime.py | 221 +- .../indexes/datetimes/test_datetimelike.py | 9 +- .../tests/indexes/datetimes/test_formats.py | 262 +- .../tests/indexes/datetimes/test_indexing.py | 593 ++- pandas/tests/indexes/datetimes/test_misc.py | 289 +- .../tests/indexes/datetimes/test_missing.py | 82 +- pandas/tests/indexes/datetimes/test_ops.py | 312 +- .../indexes/datetimes/test_partial_slicing.py | 381 +- .../indexes/datetimes/test_scalar_compat.py | 272 +- pandas/tests/indexes/datetimes/test_setops.py | 263 +- .../tests/indexes/datetimes/test_timezones.py | 956 ++-- pandas/tests/indexes/datetimes/test_tools.py | 2062 ++++---- pandas/tests/indexes/interval/test_astype.py | 139 +- .../indexes/interval/test_construction.py | 240 +- .../tests/indexes/interval/test_interval.py | 581 ++- .../indexes/interval/test_interval_new.py | 213 +- .../indexes/interval/test_interval_range.py | 229 +- .../indexes/interval/test_interval_tree.py | 105 +- pandas/tests/indexes/interval/test_setops.py | 61 +- pandas/tests/indexes/multi/conftest.py | 47 +- pandas/tests/indexes/multi/test_analytics.py | 203 +- pandas/tests/indexes/multi/test_astype.py | 8 +- pandas/tests/indexes/multi/test_compat.py | 10 +- .../tests/indexes/multi/test_constructor.py | 481 +- pandas/tests/indexes/multi/test_contains.py | 66 +- pandas/tests/indexes/multi/test_conversion.py | 164 +- pandas/tests/indexes/multi/test_copy.py | 35 +- pandas/tests/indexes/multi/test_drop.py | 86 +- pandas/tests/indexes/multi/test_duplicates.py | 156 +- .../tests/indexes/multi/test_equivalence.py | 36 +- pandas/tests/indexes/multi/test_format.py | 36 +- pandas/tests/indexes/multi/test_get_set.py | 154 +- pandas/tests/indexes/multi/test_indexing.py | 237 +- pandas/tests/indexes/multi/test_integrity.py | 125 +- pandas/tests/indexes/multi/test_join.py | 50 +- pandas/tests/indexes/multi/test_missing.py | 59 +- pandas/tests/indexes/multi/test_monotonic.py | 131 +- pandas/tests/indexes/multi/test_names.py | 57 +- .../indexes/multi/test_partial_indexing.py | 42 +- pandas/tests/indexes/multi/test_reindex.py | 43 +- pandas/tests/indexes/multi/test_reshape.py | 100 +- pandas/tests/indexes/multi/test_set_ops.py | 77 +- pandas/tests/indexes/multi/test_sorting.py | 138 +- .../tests/indexes/period/test_arithmetic.py | 87 +- pandas/tests/indexes/period/test_asfreq.py | 203 +- pandas/tests/indexes/period/test_astype.py | 78 +- .../tests/indexes/period/test_construction.py | 398 +- pandas/tests/indexes/period/test_formats.py | 185 +- pandas/tests/indexes/period/test_indexing.py | 494 +- pandas/tests/indexes/period/test_ops.py | 220 +- .../indexes/period/test_partial_slicing.py | 121 +- pandas/tests/indexes/period/test_period.py | 344 +- .../tests/indexes/period/test_period_range.py | 70 +- .../indexes/period/test_scalar_compat.py | 10 +- pandas/tests/indexes/period/test_setops.py | 381 +- pandas/tests/indexes/period/test_tools.py | 322 +- pandas/tests/indexes/test_base.py | 1791 ++++--- pandas/tests/indexes/test_category.py | 679 +-- pandas/tests/indexes/test_common.py | 75 +- pandas/tests/indexes/test_frozen.py | 4 +- 
pandas/tests/indexes/test_numeric.py | 527 +- pandas/tests/indexes/test_numpy_compat.py | 58 +- pandas/tests/indexes/test_range.py | 426 +- pandas/tests/indexes/test_setops.py | 74 +- .../indexes/timedeltas/test_arithmetic.py | 187 +- .../tests/indexes/timedeltas/test_astype.py | 79 +- .../indexes/timedeltas/test_construction.py | 145 +- .../tests/indexes/timedeltas/test_formats.py | 108 +- .../tests/indexes/timedeltas/test_indexing.py | 265 +- pandas/tests/indexes/timedeltas/test_ops.py | 152 +- .../timedeltas/test_partial_slicing.py | 59 +- .../indexes/timedeltas/test_scalar_compat.py | 50 +- .../tests/indexes/timedeltas/test_setops.py | 98 +- .../indexes/timedeltas/test_timedelta.py | 189 +- .../timedeltas/test_timedelta_range.py | 51 +- pandas/tests/indexes/timedeltas/test_tools.py | 156 +- pandas/tests/indexing/common.py | 156 +- pandas/tests/indexing/conftest.py | 27 +- .../tests/indexing/interval/test_interval.py | 18 +- .../indexing/interval/test_interval_new.py | 35 +- pandas/tests/indexing/multiindex/conftest.py | 23 +- .../multiindex/test_chaining_and_caching.py | 29 +- .../indexing/multiindex/test_datetime.py | 8 +- .../tests/indexing/multiindex/test_getitem.py | 210 +- pandas/tests/indexing/multiindex/test_iloc.py | 82 +- .../indexing/multiindex/test_indexing_slow.py | 53 +- pandas/tests/indexing/multiindex/test_ix.py | 41 +- pandas/tests/indexing/multiindex/test_loc.py | 247 +- .../indexing/multiindex/test_multiindex.py | 82 +- .../tests/indexing/multiindex/test_partial.py | 127 +- .../tests/indexing/multiindex/test_set_ops.py | 25 +- .../tests/indexing/multiindex/test_setitem.py | 400 +- .../tests/indexing/multiindex/test_slice.py | 554 ++- .../tests/indexing/multiindex/test_sorted.py | 55 +- pandas/tests/indexing/multiindex/test_xs.py | 190 +- pandas/tests/indexing/test_callable.py | 167 +- pandas/tests/indexing/test_categorical.py | 515 +- .../indexing/test_chaining_and_caching.py | 268 +- pandas/tests/indexing/test_coercion.py | 891 ++-- pandas/tests/indexing/test_datetime.py | 238 +- pandas/tests/indexing/test_floats.py | 786 +-- pandas/tests/indexing/test_iloc.py | 435 +- pandas/tests/indexing/test_indexing.py | 929 ++-- .../tests/indexing/test_indexing_engines.py | 33 +- pandas/tests/indexing/test_indexing_slow.py | 5 +- pandas/tests/indexing/test_ix.py | 291 +- pandas/tests/indexing/test_loc.py | 882 ++-- pandas/tests/indexing/test_partial.py | 317 +- pandas/tests/indexing/test_scalar.py | 111 +- pandas/tests/indexing/test_timedelta.py | 101 +- pandas/tests/internals/test_internals.py | 934 ++-- pandas/tests/io/conftest.py | 34 +- pandas/tests/io/excel/conftest.py | 5 +- pandas/tests/io/excel/test_odf.py | 17 +- pandas/tests/io/excel/test_openpyxl.py | 98 +- pandas/tests/io/excel/test_readers.py | 767 +-- pandas/tests/io/excel/test_style.py | 143 +- pandas/tests/io/excel/test_writers.py | 745 +-- pandas/tests/io/excel/test_xlrd.py | 7 +- pandas/tests/io/excel/test_xlsxwriter.py | 21 +- pandas/tests/io/excel/test_xlwt.py | 33 +- pandas/tests/io/formats/test_console.py | 50 +- pandas/tests/io/formats/test_css.py | 297 +- .../tests/io/formats/test_eng_formatting.py | 165 +- pandas/tests/io/formats/test_format.py | 2682 +++++----- pandas/tests/io/formats/test_printing.py | 111 +- pandas/tests/io/formats/test_style.py | 1794 ++++--- pandas/tests/io/formats/test_to_csv.py | 477 +- pandas/tests/io/formats/test_to_excel.py | 471 +- pandas/tests/io/formats/test_to_html.py | 633 +-- pandas/tests/io/formats/test_to_latex.py | 212 +- .../tests/io/generate_legacy_storage_files.py 
| 404 +- pandas/tests/io/json/test_compression.py | 43 +- .../tests/io/json/test_json_table_schema.py | 852 ++-- pandas/tests/io/json/test_normalize.py | 774 +-- pandas/tests/io/json/test_pandas.py | 1314 +++-- pandas/tests/io/json/test_readlines.py | 81 +- pandas/tests/io/json/test_ujson.py | 512 +- pandas/tests/io/msgpack/test_buffer.py | 11 +- pandas/tests/io/msgpack/test_case.py | 96 +- pandas/tests/io/msgpack/test_except.py | 11 +- pandas/tests/io/msgpack/test_extension.py | 50 +- pandas/tests/io/msgpack/test_format.py | 101 +- pandas/tests/io/msgpack/test_limits.py | 15 +- pandas/tests/io/msgpack/test_newspec.py | 58 +- pandas/tests/io/msgpack/test_obj.py | 33 +- pandas/tests/io/msgpack/test_pack.py | 86 +- pandas/tests/io/msgpack/test_read_size.py | 42 +- pandas/tests/io/msgpack/test_seq.py | 2 +- pandas/tests/io/msgpack/test_sequnpack.py | 77 +- pandas/tests/io/msgpack/test_subtype.py | 2 +- pandas/tests/io/msgpack/test_unpack.py | 20 +- pandas/tests/io/msgpack/test_unpack_raw.py | 10 +- pandas/tests/io/parser/conftest.py | 12 +- pandas/tests/io/parser/test_c_parser_only.py | 289 +- pandas/tests/io/parser/test_comment.py | 41 +- pandas/tests/io/parser/test_common.py | 1056 ++-- pandas/tests/io/parser/test_compression.py | 22 +- pandas/tests/io/parser/test_converters.py | 42 +- pandas/tests/io/parser/test_dialect.py | 65 +- pandas/tests/io/parser/test_dtypes.py | 316 +- pandas/tests/io/parser/test_header.py | 375 +- pandas/tests/io/parser/test_index_col.py | 112 +- pandas/tests/io/parser/test_mangle_dupes.py | 89 +- pandas/tests/io/parser/test_multi_thread.py | 37 +- pandas/tests/io/parser/test_na_values.py | 384 +- pandas/tests/io/parser/test_network.py | 126 +- pandas/tests/io/parser/test_parse_dates.py | 1357 ++++-- .../io/parser/test_python_parser_only.py | 96 +- pandas/tests/io/parser/test_quoting.py | 88 +- pandas/tests/io/parser/test_read_fwf.py | 235 +- pandas/tests/io/parser/test_skiprows.py | 182 +- pandas/tests/io/parser/test_textreader.py | 259 +- pandas/tests/io/parser/test_unsupported.py | 47 +- pandas/tests/io/parser/test_usecols.py | 377 +- pandas/tests/io/pytables/test_compat.py | 24 +- pandas/tests/io/pytables/test_pytables.py | 4292 +++++++++-------- pandas/tests/io/sas/test_sas.py | 11 +- pandas/tests/io/sas/test_sas7bdat.py | 88 +- pandas/tests/io/sas/test_xport.py | 17 +- pandas/tests/io/test_clipboard.py | 191 +- pandas/tests/io/test_common.py | 267 +- pandas/tests/io/test_compression.py | 92 +- pandas/tests/io/test_date_converters.py | 13 +- pandas/tests/io/test_feather.py | 106 +- pandas/tests/io/test_gbq.py | 72 +- pandas/tests/io/test_gcs.py | 65 +- pandas/tests/io/test_html.py | 652 +-- pandas/tests/io/test_packers.py | 503 +- pandas/tests/io/test_parquet.py | 386 +- pandas/tests/io/test_pickle.py | 76 +- pandas/tests/io/test_s3.py | 8 +- pandas/tests/io/test_spss.py | 7 +- pandas/tests/io/test_sql.py | 1734 ++++--- pandas/tests/io/test_stata.py | 1482 +++--- pandas/tests/plotting/common.py | 133 +- pandas/tests/plotting/test_backend.py | 27 +- pandas/tests/plotting/test_boxplot_method.py | 290 +- pandas/tests/plotting/test_converter.py | 154 +- pandas/tests/plotting/test_datetimelike.py | 607 +-- pandas/tests/plotting/test_frame.py | 1817 +++---- pandas/tests/plotting/test_groupby.py | 38 +- pandas/tests/plotting/test_hist_method.py | 155 +- pandas/tests/plotting/test_misc.py | 354 +- pandas/tests/plotting/test_series.py | 329 +- pandas/tests/reductions/test_reductions.py | 526 +- .../tests/reductions/test_stat_reductions.py | 102 +- 
pandas/tests/resample/conftest.py | 36 +- pandas/tests/resample/test_base.py | 95 +- pandas/tests/resample/test_datetime_index.py | 1277 ++--- pandas/tests/resample/test_period_index.py | 815 ++-- pandas/tests/resample/test_resample_api.py | 505 +- .../tests/resample/test_resampler_grouper.py | 238 +- pandas/tests/resample/test_time_grouper.py | 241 +- pandas/tests/resample/test_timedelta.py | 110 +- pandas/tests/reshape/merge/test_join.py | 755 +-- pandas/tests/reshape/merge/test_merge.py | 2257 +++++---- pandas/tests/reshape/merge/test_merge_asof.py | 1499 +++--- .../merge/test_merge_index_as_string.py | 99 +- .../tests/reshape/merge/test_merge_ordered.py | 100 +- pandas/tests/reshape/merge/test_multi.py | 914 ++-- pandas/tests/reshape/test_concat.py | 1973 ++++---- pandas/tests/reshape/test_cut.py | 366 +- pandas/tests/reshape/test_melt.py | 1271 +++-- pandas/tests/reshape/test_pivot.py | 2829 ++++++----- pandas/tests/reshape/test_qcut.py | 131 +- pandas/tests/reshape/test_reshape.py | 568 +-- .../tests/reshape/test_union_categoricals.py | 228 +- pandas/tests/reshape/test_util.py | 16 +- pandas/tests/scalar/interval/test_interval.py | 117 +- pandas/tests/scalar/interval/test_ops.py | 26 +- pandas/tests/scalar/period/test_asfreq.py | 1140 ++--- pandas/tests/scalar/period/test_period.py | 1170 ++--- pandas/tests/scalar/test_nat.py | 263 +- .../tests/scalar/timedelta/test_arithmetic.py | 243 +- .../scalar/timedelta/test_construction.py | 286 +- pandas/tests/scalar/timedelta/test_formats.py | 49 +- .../tests/scalar/timedelta/test_timedelta.py | 649 +-- .../tests/scalar/timestamp/test_arithmetic.py | 56 +- .../scalar/timestamp/test_comparisons.py | 41 +- .../tests/scalar/timestamp/test_rendering.py | 38 +- .../tests/scalar/timestamp/test_timestamp.py | 588 ++- .../tests/scalar/timestamp/test_timezones.py | 330 +- .../tests/scalar/timestamp/test_unary_ops.py | 297 +- pandas/tests/series/common.py | 7 +- pandas/tests/series/conftest.py | 6 +- pandas/tests/series/indexing/conftest.py | 2 +- .../tests/series/indexing/test_alter_index.py | 248 +- pandas/tests/series/indexing/test_boolean.py | 185 +- pandas/tests/series/indexing/test_callable.py | 16 +- pandas/tests/series/indexing/test_datetime.py | 309 +- pandas/tests/series/indexing/test_indexing.py | 323 +- pandas/tests/series/indexing/test_loc.py | 32 +- pandas/tests/series/indexing/test_numeric.py | 155 +- pandas/tests/series/test_alter_axes.py | 210 +- pandas/tests/series/test_analytics.py | 938 ++-- pandas/tests/series/test_api.py | 300 +- pandas/tests/series/test_apply.py | 562 ++- pandas/tests/series/test_arithmetic.py | 65 +- pandas/tests/series/test_asof.py | 58 +- pandas/tests/series/test_block_internals.py | 12 +- pandas/tests/series/test_combine_concat.py | 284 +- pandas/tests/series/test_constructors.py | 774 +-- pandas/tests/series/test_datetime_values.py | 539 ++- pandas/tests/series/test_dtypes.py | 313 +- pandas/tests/series/test_duplicates.py | 82 +- pandas/tests/series/test_internals.py | 112 +- pandas/tests/series/test_io.py | 118 +- pandas/tests/series/test_missing.py | 1194 +++-- pandas/tests/series/test_operators.py | 327 +- pandas/tests/series/test_period.py | 126 +- pandas/tests/series/test_quantile.py | 122 +- pandas/tests/series/test_rank.py | 463 +- pandas/tests/series/test_replace.py | 140 +- pandas/tests/series/test_repr.py | 150 +- pandas/tests/series/test_sorting.py | 100 +- pandas/tests/series/test_subclass.py | 59 +- pandas/tests/series/test_timeseries.py | 574 ++- pandas/tests/series/test_timezones.py | 235 +- 
pandas/tests/series/test_ufunc.py | 121 +- pandas/tests/series/test_validate.py | 9 +- pandas/tests/sparse/frame/conftest.py | 33 +- pandas/tests/sparse/frame/test_analytics.py | 4 +- pandas/tests/sparse/frame/test_apply.py | 39 +- pandas/tests/sparse/frame/test_frame.py | 1043 ++-- pandas/tests/sparse/frame/test_indexing.py | 74 +- pandas/tests/sparse/frame/test_to_csv.py | 9 +- .../tests/sparse/frame/test_to_from_scipy.py | 60 +- pandas/tests/sparse/series/test_indexing.py | 82 +- pandas/tests/sparse/series/test_series.py | 783 +-- pandas/tests/sparse/test_combine_concat.py | 230 +- pandas/tests/sparse/test_format.py | 125 +- pandas/tests/sparse/test_groupby.py | 48 +- pandas/tests/sparse/test_indexing.py | 751 ++- pandas/tests/sparse/test_pivot.py | 68 +- pandas/tests/sparse/test_reshape.py | 4 +- pandas/tests/test_algos.py | 1361 ++++-- pandas/tests/test_base.py | 778 +-- pandas/tests/test_common.py | 57 +- pandas/tests/test_downstream.py | 36 +- pandas/tests/test_errors.py | 24 +- pandas/tests/test_expressions.py | 263 +- pandas/tests/test_join.py | 196 +- pandas/tests/test_lib.py | 37 +- pandas/tests/test_multilevel.py | 1638 ++++--- pandas/tests/test_nanops.py | 815 ++-- pandas/tests/test_optional_dependency.py | 10 +- pandas/tests/test_register_accessor.py | 46 +- pandas/tests/test_sorting.py | 265 +- pandas/tests/test_strings.py | 2511 +++++----- pandas/tests/test_take.py | 203 +- pandas/tests/test_window.py | 3378 +++++++------ pandas/tests/tools/test_numeric.py | 371 +- .../tseries/frequencies/test_freq_code.py | 184 +- .../tseries/frequencies/test_inference.py | 324 +- .../tseries/frequencies/test_to_offset.py | 184 +- pandas/tests/tseries/holiday/test_calendar.py | 38 +- pandas/tests/tseries/holiday/test_federal.py | 34 +- pandas/tests/tseries/holiday/test_holiday.py | 289 +- .../tests/tseries/holiday/test_observance.py | 70 +- pandas/tests/tseries/offsets/common.py | 14 +- pandas/tests/tseries/offsets/conftest.py | 10 +- pandas/tests/tseries/offsets/test_fiscal.py | 613 +-- pandas/tests/tseries/offsets/test_offsets.py | 4208 +++++++++------- .../offsets/test_offsets_properties.py | 68 +- pandas/tests/tseries/offsets/test_ticks.py | 174 +- .../tests/tseries/offsets/test_yqm_offsets.py | 1624 ++++--- pandas/tests/tslibs/test_api.py | 66 +- pandas/tests/tslibs/test_array_to_datetime.py | 101 +- pandas/tests/tslibs/test_ccalendar.py | 15 +- pandas/tests/tslibs/test_conversion.py | 41 +- pandas/tests/tslibs/test_libfrequencies.py | 146 +- pandas/tests/tslibs/test_liboffsets.py | 162 +- pandas/tests/tslibs/test_normalize_date.py | 25 +- pandas/tests/tslibs/test_parse_iso8601.py | 76 +- pandas/tests/tslibs/test_parsing.py | 194 +- pandas/tests/tslibs/test_period_asfreq.py | 125 +- pandas/tests/tslibs/test_timedeltas.py | 21 +- pandas/tests/tslibs/test_timezones.py | 30 +- pandas/tests/util/test_assert_almost_equal.py | 176 +- .../util/test_assert_categorical_equal.py | 12 +- .../util/test_assert_extension_array_equal.py | 25 +- pandas/tests/util/test_assert_frame_equal.py | 124 +- pandas/tests/util/test_assert_index_equal.py | 35 +- .../util/test_assert_interval_array_equal.py | 13 +- .../util/test_assert_numpy_array_equal.py | 46 +- .../util/test_assert_produces_warning.py | 9 +- pandas/tests/util/test_assert_series_equal.py | 75 +- pandas/tests/util/test_deprecate.py | 28 +- pandas/tests/util/test_deprecate_kwarg.py | 6 +- pandas/tests/util/test_hashing.py | 176 +- pandas/tests/util/test_move.py | 1 + pandas/tests/util/test_safe_import.py | 13 +- pandas/tests/util/test_util.py | 5 
+- pandas/tests/util/test_validate_args.py | 27 +- .../util/test_validate_args_and_kwargs.py | 56 +- pandas/tests/util/test_validate_kwargs.py | 18 +- pandas/tseries/converter.py | 24 +- pandas/tseries/frequencies.py | 155 +- pandas/tseries/holiday.py | 131 +- pandas/tseries/offsets.py | 936 ++-- pandas/util/__init__.py | 3 +- pandas/util/_decorators.py | 106 +- pandas/util/_depr_module.py | 32 +- pandas/util/_doctools.py | 72 +- pandas/util/_print_versions.py | 85 +- pandas/util/_test_decorators.py | 89 +- pandas/util/_tester.py | 6 +- pandas/util/_validators.py | 96 +- pandas/util/testing.py | 1176 +++-- scripts/download_wheels.py | 20 +- scripts/find_commits_touching_func.py | 131 +- scripts/generate_pip_deps_from_conda.py | 61 +- scripts/merge-pr.py | 146 +- scripts/tests/conftest.py | 7 +- scripts/tests/test_validate_docstrings.py | 687 ++- scripts/validate_docstrings.py | 702 +-- setup.py | 821 ++-- versioneer.py | 213 +- 748 files changed, 126206 insertions(+), 97282 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index b69efb4689486f..436093ef195ef7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.util import testing as tm -for imp in ['pandas.util', 'pandas.tools.hashing']: +for imp in ["pandas.util", "pandas.tools.hashing"]: try: hashing = import_module(imp) break @@ -15,15 +15,17 @@ class Factorize: - params = [[True, False], ['int', 'uint', 'float', 'string']] - param_names = ['sort', 'dtype'] + params = [[True, False], ["int", "uint", "float", "string"]] + param_names = ["sort", "dtype"] def setup(self, sort, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N).repeat(5)), - 'uint': pd.UInt64Index(np.arange(N).repeat(5)), - 'float': pd.Float64Index(np.random.randn(N).repeat(5)), - 'string': tm.makeStringIndex(N).repeat(5)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N).repeat(5)), + "uint": pd.UInt64Index(np.arange(N).repeat(5)), + "float": pd.Float64Index(np.random.randn(N).repeat(5)), + "string": tm.makeStringIndex(N).repeat(5), + } self.idx = data[dtype] def time_factorize(self, sort, dtype): @@ -32,15 +34,17 @@ def time_factorize(self, sort, dtype): class FactorizeUnique: - params = [[True, False], ['int', 'uint', 'float', 'string']] - param_names = ['sort', 'dtype'] + params = [[True, False], ["int", "uint", "float", "string"]] + param_names = ["sort", "dtype"] def setup(self, sort, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N)), - 'uint': pd.UInt64Index(np.arange(N)), - 'float': pd.Float64Index(np.arange(N)), - 'string': tm.makeStringIndex(N)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N)), + "uint": pd.UInt64Index(np.arange(N)), + "float": pd.Float64Index(np.arange(N)), + "string": tm.makeStringIndex(N), + } self.idx = data[dtype] assert self.idx.is_unique @@ -50,15 +54,17 @@ def time_factorize(self, sort, dtype): class Duplicated: - params = [['first', 'last', False], ['int', 'uint', 'float', 'string']] - param_names = ['keep', 'dtype'] + params = [["first", "last", False], ["int", "uint", "float", "string"]] + param_names = ["keep", "dtype"] def setup(self, keep, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N).repeat(5)), - 'uint': pd.UInt64Index(np.arange(N).repeat(5)), - 'float': pd.Float64Index(np.random.randn(N).repeat(5)), - 'string': tm.makeStringIndex(N).repeat(5)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N).repeat(5)), + "uint": 
pd.UInt64Index(np.arange(N).repeat(5)), + "float": pd.Float64Index(np.random.randn(N).repeat(5)), + "string": tm.makeStringIndex(N).repeat(5), + } self.idx = data[dtype] # cache is_unique self.idx.is_unique @@ -69,15 +75,17 @@ def time_duplicated(self, keep, dtype): class DuplicatedUniqueIndex: - params = ['int', 'uint', 'float', 'string'] - param_names = ['dtype'] + params = ["int", "uint", "float", "string"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N)), - 'uint': pd.UInt64Index(np.arange(N)), - 'float': pd.Float64Index(np.random.randn(N)), - 'string': tm.makeStringIndex(N)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N)), + "uint": pd.UInt64Index(np.arange(N)), + "float": pd.Float64Index(np.random.randn(N)), + "string": tm.makeStringIndex(N), + } self.idx = data[dtype] # cache is_unique self.idx.is_unique @@ -87,18 +95,21 @@ def time_duplicated_unique(self, dtype): class Hashing: - def setup_cache(self): - N = 10**5 + N = 10 ** 5 df = pd.DataFrame( - {'strings': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'floats': np.random.randn(N), - 'ints': np.arange(N), - 'dates': pd.date_range('20110101', freq='s', periods=N), - 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) - df['categories'] = df['strings'].astype('category') + { + "strings": pd.Series( + tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N)) + ), + "floats": np.random.randn(N), + "ints": np.arange(N), + "dates": pd.date_range("20110101", freq="s", periods=N), + "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N), + } + ) + df["categories"] = df["strings"].astype("category") df.iloc[10:20] = np.nan return df @@ -106,35 +117,39 @@ def time_frame(self, df): hashing.hash_pandas_object(df) def time_series_int(self, df): - hashing.hash_pandas_object(df['ints']) + hashing.hash_pandas_object(df["ints"]) def time_series_string(self, df): - hashing.hash_pandas_object(df['strings']) + hashing.hash_pandas_object(df["strings"]) def time_series_float(self, df): - hashing.hash_pandas_object(df['floats']) + hashing.hash_pandas_object(df["floats"]) def time_series_categorical(self, df): - hashing.hash_pandas_object(df['categories']) + hashing.hash_pandas_object(df["categories"]) def time_series_timedeltas(self, df): - hashing.hash_pandas_object(df['timedeltas']) + hashing.hash_pandas_object(df["timedeltas"]) def time_series_dates(self, df): - hashing.hash_pandas_object(df['dates']) + hashing.hash_pandas_object(df["dates"]) class Quantile: - params = [[0, 0.5, 1], - ['linear', 'nearest', 'lower', 'higher', 'midpoint'], - ['float', 'int', 'uint']] - param_names = ['quantile', 'interpolation', 'dtype'] + params = [ + [0, 0.5, 1], + ["linear", "nearest", "lower", "higher", "midpoint"], + ["float", "int", "uint"], + ] + param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): - N = 10**5 - data = {'int': np.arange(N), - 'uint': np.arange(N).astype(np.uint64), - 'float': np.random.randn(N)} + N = 10 ** 5 + data = { + "int": np.arange(N), + "uint": np.arange(N).astype(np.uint64), + "float": np.random.randn(N), + } self.idx = pd.Series(data[dtype].repeat(5)) def time_quantile(self, quantile, interpolation, dtype): @@ -142,12 +157,12 @@ def time_quantile(self, quantile, interpolation, dtype): class SortIntegerArray: - params = [10**3, 10**5] + params = [10 ** 3, 10 ** 5] def setup(self, N): data = np.arange(N, dtype=float) data[40] = np.nan - self.array 
= pd.array(data, dtype='Int64') + self.array = pd.array(data, dtype="Int64") def time_argsort(self, N): self.array.argsort() diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index dd316a2bc88d01..c43e5dfd729aad 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,5 +1,6 @@ import numpy as np from pandas import DataFrame + try: from pandas.util import cache_readonly except ImportError: @@ -7,7 +8,6 @@ class DataFrameAttributes: - def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index @@ -20,14 +20,12 @@ def time_set_index(self): class CacheReadonly: - def setup(self): - class Foo: - @cache_readonly def prop(self): return 5 + self.obj = Foo() def time_cache_readonly(self): diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 26cd66284c41ee..fd3324b78f1c3d 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,6 +1,7 @@ import numpy as np from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr + try: import pandas.core.computation.expressions as expr except ImportError: @@ -9,14 +10,14 @@ class Ops: - params = [[True, False], ['default', 1]] - param_names = ['use_numexpr', 'threads'] + params = [[True, False], ["default", 1]] + param_names = ["use_numexpr", "threads"] def setup(self, use_numexpr, threads): self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) - if threads != 'default': + if threads != "default": expr.set_numexpr_threads(threads) if not use_numexpr: expr.set_use_numexpr(False) @@ -39,18 +40,21 @@ def teardown(self, use_numexpr, threads): class Ops2: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N, N)) self.df2 = DataFrame(np.random.randn(N, N)) - self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(N, N))) - self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(N, N))) + self.df_int = DataFrame( + np.random.randint( + np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N) + ) + ) + self.df2_int = DataFrame( + np.random.randint( + np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N) + ) + ) self.s = Series(np.random.randn(N)) @@ -90,16 +94,16 @@ def time_frame_series_dot(self): class Timeseries: - params = [None, 'US/Eastern'] - param_names = ['tz'] + params = [None, "US/Eastern"] + param_names = ["tz"] def setup(self, tz): - N = 10**6 + N = 10 ** 6 halfway = (N // 2) - 1 - self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) self.ts = self.s[halfway] - self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) + self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) def time_series_timestamp_compare(self, tz): self.s <= self.ts @@ -117,10 +121,10 @@ def time_timestamp_ops_diff_with_shift(self, tz): class AddOverflowScalar: params = [1, -1, 0] - param_names = ['scalar'] + param_names = ["scalar"] def setup(self, scalar): - N = 10**6 + N = 10 ** 6 self.arr = np.arange(N) def time_add_overflow_scalar(self, scalar): @@ -128,9 +132,8 @@ def time_add_overflow_scalar(self, scalar): class AddOverflowArray: - def setup(self): - N = 10**6 + N = 10 ** 6 self.arr = np.arange(N) self.arr_rev = np.arange(-N, 0) self.arr_mixed = np.array([1, 
-1]).repeat(N / 2) @@ -144,12 +147,12 @@ def time_add_overflow_arr_mask_nan(self): checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, - b_mask=self.arr_nan_1) + checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) def time_add_overflow_both_arg_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, - b_mask=self.arr_nan_2) + checked_add_with_arr( + self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 + ) from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f1afca5941fe50..933946b1ca1acc 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pandas.util.testing as tm + try: from pandas.api.types import union_categoricals except ImportError: @@ -11,13 +12,12 @@ class Concat: - def setup(self): - N = 10**5 - self.s = pd.Series(list('aabbcd') * N).astype('category') + N = 10 ** 5 + self.s = pd.Series(list("aabbcd") * N).astype("category") - self.a = pd.Categorical(list('aabbcd') * N) - self.b = pd.Categorical(list('bbcdjk') * N) + self.a = pd.Categorical(list("aabbcd") * N) + self.b = pd.Categorical(list("bbcdjk") * N) def time_concat(self): pd.concat([self.s, self.s]) @@ -27,23 +27,22 @@ def time_union(self): class Constructor: - def setup(self): - N = 10**5 - self.categories = list('abcde') + N = 10 ** 5 + self.categories = list("abcde") self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) self.codes = np.tile(range(len(self.categories)), N) - self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', - periods=N / 10, - freq='s')) + self.datetimes = pd.Series( + pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s") + ) self.datetimes_with_nat = self.datetimes.copy() self.datetimes_with_nat.iloc[-1] = pd.NaT self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) - self.values_all_int8 = np.ones(N, 'int8') + self.values_all_int8 = np.ones(N, "int8") self.categorical = pd.Categorical(self.values, self.categories) self.series = pd.Series(self.categorical) @@ -78,62 +77,55 @@ def time_existing_series(self): class ValueCounts: params = [True, False] - param_names = ['dropna'] + param_names = ["dropna"] def setup(self, dropna): - n = 5 * 10**5 - arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_value_counts(self, dropna): self.ts.value_counts(dropna=dropna) class Repr: - def setup(self): - self.sel = pd.Series(['s1234']).astype('category') + self.sel = pd.Series(["s1234"]).astype("category") def time_rendering(self): str(self.sel) class SetCategories: - def setup(self): - n = 5 * 10**5 - arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) class RemoveCategories: - def setup(self): - n = 5 * 10**5 - arr = 
['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_remove_categories(self): self.ts.cat.remove_categories(self.ts.cat.categories[::2]) class Rank: - def setup(self): - N = 10**5 + N = 10 ** 5 ncats = 100 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) - self.s_str_cat = self.s_str.astype('category') + self.s_str_cat = self.s_str.astype("category") self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered() self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) - self.s_int_cat = self.s_int.astype('category') + self.s_int_cat = self.s_int.astype("category") self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered() def time_rank_string(self): @@ -157,28 +149,27 @@ def time_rank_int_cat_ordered(self): class Isin: - params = ['object', 'int64'] - param_names = ['dtype'] + params = ["object", "int64"] + param_names = ["dtype"] def setup(self, dtype): np.random.seed(1234) - n = 5 * 10**5 + n = 5 * 10 ** 5 sample_size = 100 arr = [i for i in np.random.randint(0, n // 10, size=n)] - if dtype == 'object': - arr = ['s{:04d}'.format(i) for i in arr] + if dtype == "object": + arr = ["s{:04d}".format(i) for i in arr] self.sample = np.random.choice(arr, sample_size) - self.series = pd.Series(arr).astype('category') + self.series = pd.Series(arr).astype("category") def time_isin_categorical(self, dtype): self.series.isin(self.sample) class IsMonotonic: - def setup(self): N = 1000 - self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N)) + self.c = pd.CategoricalIndex(list("a" * N + "b" * N + "c" * N)) self.s = pd.Series(self.c) def time_categorical_index_is_monotonic_increasing(self): @@ -195,9 +186,8 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains: - def setup(self): - N = 10**5 + N = 10 ** 5 self.ci = tm.makeCategoricalIndex(N) self.c = self.ci.values self.key = self.ci.categories[0] @@ -211,34 +201,33 @@ def time_categorical_contains(self): class CategoricalSlicing: - params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] - param_names = ['index'] + params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] + param_names = ["index"] def setup(self, index): - N = 10**6 - categories = ['a', 'b', 'c'] + N = 10 ** 6 + categories = ["a", "b", "c"] values = [0] * N + [1] * N + [2] * N - if index == 'monotonic_incr': - self.data = pd.Categorical.from_codes(values, - categories=categories) - elif index == 'monotonic_decr': - self.data = pd.Categorical.from_codes(list(reversed(values)), - categories=categories) - elif index == 'non_monotonic': - self.data = pd.Categorical.from_codes([0, 1, 2] * N, - categories=categories) + if index == "monotonic_incr": + self.data = pd.Categorical.from_codes(values, categories=categories) + elif index == "monotonic_decr": + self.data = pd.Categorical.from_codes( + list(reversed(values)), categories=categories + ) + elif index == "non_monotonic": + self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) else: - raise ValueError('Invalid index param: {}'.format(index)) + raise ValueError("Invalid index param: {}".format(index)) self.scalar = 10000 self.list = list(range(10000)) - self.cat_scalar = 'b' + self.cat_scalar = "b" def time_getitem_scalar(self, index): self.data[self.scalar] def time_getitem_slice(self, index): - self.data[:self.scalar] + self.data[: self.scalar] 
def time_getitem_list_like(self, index): self.data[[self.scalar]] @@ -251,9 +240,8 @@ def time_getitem_bool_array(self, index): class Indexing: - def setup(self): - N = 10**5 + N = 10 ** 5 self.index = pd.CategoricalIndex(range(N), range(N)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] @@ -268,7 +256,7 @@ def time_shallow_copy(self): self.index._shallow_copy() def time_align(self): - pd.DataFrame({'a': self.series, 'b': self.series[:500]}) + pd.DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index[:750].intersection(self.index[250:]) @@ -280,7 +268,7 @@ def time_reindex(self): self.index.reindex(self.index[:500]) def time_reindex_missing(self): - self.index.reindex(['a', 'b', 'c', 'd']) + self.index.reindex(["a", "b", "c", "d"]) def time_sort_values(self): self.index.sort_values(ascending=False) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 42adede631a010..654075292cdf62 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -42,18 +42,22 @@ def list_of_lists_with_none(arr): class SeriesConstructors: param_names = ["data_fmt", "with_index", "dtype"] - params = [[no_change, - list, - list_of_str, - gen_of_str, - arr_dict, - list_of_tuples, - gen_of_tuples, - list_of_lists, - list_of_tuples_with_none, - list_of_lists_with_none], - [False, True], - ['float', 'int']] + params = [ + [ + no_change, + list, + list_of_str, + gen_of_str, + arr_dict, + list_of_tuples, + gen_of_tuples, + list_of_lists, + list_of_tuples_with_none, + list_of_lists_with_none, + ], + [False, True], + ["float", "int"], + ] # Generators get exhausted on use, so run setup before every call number = 1 @@ -61,10 +65,11 @@ class SeriesConstructors: def setup(self, data_fmt, with_index, dtype): if data_fmt in (gen_of_str, gen_of_tuples) and with_index: - raise NotImplementedError('Series constructors do not support ' - 'using generators with indexes') - N = 10**4 - if dtype == 'float': + raise NotImplementedError( + "Series constructors do not support " "using generators with indexes" + ) + N = 10 ** 4 + if dtype == "float": arr = np.random.randn(N) else: arr = np.arange(N) @@ -76,13 +81,15 @@ def time_series_constructor(self, data_fmt, with_index, dtype): class SeriesDtypesConstructors: - def setup(self): - N = 10**4 + N = 10 ** 4 self.arr = np.random.randn(N) - self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - self.s = Series([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')] * N * 10) + self.arr_str = np.array(["foo", "bar", "baz"], dtype=object) + self.s = Series( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + * N + * 10 + ) def time_index_from_array_string(self): Index(self.arr_str) @@ -98,9 +105,8 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor: - def setup(self): - N = 10**4 + N = 10 ** 4 self.iterables = [tm.makeStringIndex(N), range(20)] def time_multiindex_from_iterables(self): diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 9bfaaa8696009d..60800b1f9cae71 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -2,32 +2,36 @@ import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, string_dtypes, extension_dtypes) + numeric_dtypes, + datetime_dtypes, + string_dtypes, + extension_dtypes, +) -_numpy_dtypes = [np.dtype(dtype) - for dtype in (numeric_dtypes + - datetime_dtypes + - 
string_dtypes)] +_numpy_dtypes = [ + np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) +] _dtypes = _numpy_dtypes + extension_dtypes class Dtypes: - params = (_dtypes + - list(map(lambda dt: dt.name, _dtypes))) - param_names = ['dtype'] + params = _dtypes + list(map(lambda dt: dt.name, _dtypes)) + param_names = ["dtype"] def time_pandas_dtype(self, dtype): pandas_dtype(dtype) class DtypesInvalid: - param_names = ['dtype'] - params = ['scalar-string', 'scalar-int', 'list-string', 'array-string'] - data_dict = {'scalar-string': 'foo', - 'scalar-int': 1, - 'list-string': ['foo'] * 1000, - 'array-string': np.array(['foo'] * 1000)} + param_names = ["dtype"] + params = ["scalar-string", "scalar-int", "list-string", "array-string"] + data_dict = { + "scalar-string": "foo", + "scalar-int": 1, + "list-string": ["foo"] * 1000, + "array-string": np.array(["foo"] * 1000), + } def time_pandas_dtype_invalid(self, dtype): try: diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index be47d35f2cad1b..84e94315cc28b0 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd + try: import pandas.core.computation.expressions as expr except ImportError: @@ -8,8 +9,8 @@ class Eval: - params = [['numexpr', 'python'], [1, 'all']] - param_names = ['engine', 'threads'] + params = [["numexpr", "python"], [1, "all"]] + param_names = ["engine", "threads"] def setup(self, engine, threads): self.df = pd.DataFrame(np.random.randn(20000, 100)) @@ -21,44 +22,44 @@ def setup(self, engine, threads): expr.set_numexpr_threads(1) def time_add(self, engine, threads): - pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) + pd.eval("self.df + self.df2 + self.df3 + self.df4", engine=engine) def time_and(self, engine, threads): - pd.eval('(self.df > 0) & (self.df2 > 0) & ' - '(self.df3 > 0) & (self.df4 > 0)', engine=engine) + pd.eval( + "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)", + engine=engine, + ) def time_chained_cmp(self, engine, threads): - pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) + pd.eval("self.df < self.df2 < self.df3 < self.df4", engine=engine) def time_mult(self, engine, threads): - pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine) + pd.eval("self.df * self.df2 * self.df3 * self.df4", engine=engine) def teardown(self, engine, threads): expr.set_numexpr_threads() class Query: - def setup(self): - N = 10**6 + N = 10 ** 6 halfway = (N // 2) - 1 - index = pd.date_range('20010101', periods=N, freq='T') + index = pd.date_range("20010101", periods=N, freq="T") s = pd.Series(index) self.ts = s.iloc[halfway] - self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': index}, - index=index) + self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index) data = np.random.randn(N) self.min_val = data.min() self.max_val = data.max() def time_query_datetime_index(self): - self.df.query('index < @self.ts') + self.df.query("index < @self.ts") def time_query_datetime_column(self): - self.df.query('dates < @self.ts') + self.df.query("dates < @self.ts") def time_query_with_boolean_selection(self): - self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') + self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 9533938b30faca..acfb26bcf5d7ca 100644 --- 
a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,25 +1,23 @@ import numpy as np import pandas.util.testing as tm from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range + try: from pandas.tseries.offsets import Nano, Hour except ImportError: # For compatibility with older versions - from pandas.core.datetools import * # noqa + from pandas.core.datetools import * # noqa class FromDicts: - def setup(self): N, K = 5000, 50 self.index = tm.makeStringIndex(N) self.columns = tm.makeStringIndex(K) - frame = DataFrame(np.random.randn(N, K), index=self.index, - columns=self.columns) + frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) self.data = frame.to_dict() - self.dict_list = frame.to_dict(orient='records') - self.data2 = {i: {j: float(j) for j in range(100)} - for i in range(2000)} + self.dict_list = frame.to_dict(orient="records") + self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} def time_list_of_dict(self): DataFrame(self.dict_list) @@ -42,7 +40,6 @@ def time_nested_dict_int64(self): class FromSeries: - def setup(self): mi = MultiIndex.from_product([range(100), range(100)]) self.s = Series(np.random.randn(10000), index=mi) @@ -54,12 +51,12 @@ def time_mi_series(self): class FromDictwithTimestamp: params = [Nano(1), Hour(1)] - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): - N = 10**3 + N = 10 ** 3 np.random.seed(1234) - idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N) + idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N) df = DataFrame(np.random.randn(N, 10), index=idx) self.d = df.to_dict() @@ -70,7 +67,7 @@ def time_dict_with_timestamp_offsets(self, offset): class FromRecords: params = [None, 1000] - param_names = ['nrows'] + param_names = ["nrows"] # Generators get exhausted on use, so run setup before every call number = 1 @@ -86,7 +83,6 @@ def time_frame_from_records_generator(self, nrows): class FromNDArray: - def setup(self): N = 100000 self.data = np.random.randn(N) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5b76eeba115a42..af4741f94d2943 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -2,17 +2,15 @@ import numpy as np -from pandas import ( - DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range) +from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range import pandas.util.testing as tm class GetNumericData: - def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' + self.df["foo"] = "bar" + self.df["bar"] = "baz" self.df = self.df._consolidate() def time_frame_get_numeric_data(self): @@ -20,17 +18,17 @@ def time_frame_get_numeric_data(self): class Lookup: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), - columns=list('abcdefgh')) - self.df['foo'] = 'bar' + self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh")) + self.df["foo"] = "bar" self.row_labels = list(self.df.index[::10])[:900] self.col_labels = list(self.df.columns) * 100 self.row_labels_all = np.array( - list(self.df.index) * len(self.df.columns), dtype='object') + list(self.df.index) * len(self.df.columns), dtype="object" + ) self.col_labels_all = np.array( - list(self.df.columns) * len(self.df.index), dtype='object') + list(self.df.columns) * len(self.df.index), dtype="object" + ) def time_frame_fancy_lookup(self): 
self.df.lookup(self.row_labels, self.col_labels) @@ -40,17 +38,21 @@ def time_frame_fancy_lookup_all(self): class Reindex: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - {c: {0: np.random.randint(0, 2, N).astype(np.bool_), - 1: np.random.randint(0, N, N).astype(np.int16), - 2: np.random.randint(0, N, N).astype(np.int32), - 3: np.random.randint(0, N, N).astype(np.int64)} - [np.random.randint(0, 4)] for c in range(N)}) + { + c: { + 0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64), + }[np.random.randint(0, 4)] + for c in range(N) + } + ) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -66,18 +68,22 @@ def time_reindex_upcast(self): class Rename: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.dict_idx = {k: k for k in self.idx} self.df2 = DataFrame( - {c: {0: np.random.randint(0, 2, N).astype(np.bool_), - 1: np.random.randint(0, N, N).astype(np.int16), - 2: np.random.randint(0, N, N).astype(np.int32), - 3: np.random.randint(0, N, N).astype(np.int64)} - [np.random.randint(0, 4)] for c in range(N)}) + { + c: { + 0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64), + }[np.random.randint(0, 4)] + for c in range(N) + } + ) def time_rename_single(self): self.df.rename({0: 0}) @@ -103,13 +109,14 @@ def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) - self.df3 = DataFrame(np.random.randn(N, 5 * N), - columns=['C' + str(c) for c in range(N * 5)]) + self.df3 = DataFrame( + np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)] + ) self.df4 = DataFrame(np.random.randn(N * 1000, 10)) def time_iteritems(self): # (monitor no-copying behaviour) - if hasattr(self.df, '_item_cache'): + if hasattr(self.df, "_item_cache"): self.df._item_cache.clear() for name, col in self.df.iteritems(): pass @@ -192,7 +199,6 @@ def time_iterrows(self): class ToString: - def setup(self): self.df = DataFrame(np.random.randn(100, 10)) @@ -201,11 +207,10 @@ def time_to_string_floats(self): class ToHTML: - def setup(self): nrows = 500 self.df2 = DataFrame(np.random.randn(nrows, 10)) - self.df2[0] = period_range('2000', periods=nrows) + self.df2[0] = period_range("2000", periods=nrows) self.df2[1] = range(nrows) def time_to_html_mixed(self): @@ -213,7 +218,6 @@ def time_to_html_mixed(self): class Repr: - def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) @@ -238,7 +242,6 @@ def time_frame_repr_wide(self): class MaskBool: - def setup(self): data = np.random.randn(1000, 500) df = DataFrame(data) @@ -254,9 +257,8 @@ def time_frame_mask_floats(self): class Isnull: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df_no_null = DataFrame(np.random.randn(N, N)) sample = np.array([np.nan, 1.0]) @@ -267,8 +269,20 @@ def setup(self): data = np.random.choice(sample, (N, N)) self.df_strings = DataFrame(data) - sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + sample = np.array( + [ + NaT, + np.nan, + None, + np.datetime64("NaT"), + np.timedelta64("NaT"), + 0, + 1, + 2.0, + "", + "abcd", + ] + ) 
data = np.random.choice(sample, (N, N)) self.df_obj = DataFrame(data) @@ -287,8 +301,8 @@ def time_isnull_obj(self): class Fillna: - params = ([True, False], ['pad', 'bfill']) - param_names = ['inplace', 'method'] + params = ([True, False], ["pad", "bfill"]) + param_names = ["inplace", "method"] def setup(self, inplace, method): values = np.random.randn(10000, 100) @@ -301,8 +315,8 @@ def time_frame_fillna(self, inplace, method): class Dropna: - params = (['all', 'any'], [0, 1]) - param_names = ['how', 'axis'] + params = (["all", "any"], [0, 1]) + param_names = ["how", "axis"] def setup(self, how, axis): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -310,7 +324,7 @@ def setup(self, how, axis): self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() - self.df_mixed['foo'] = 'bar' + self.df_mixed["foo"] = "bar" def time_dropna(self, how, axis): self.df.dropna(how=how, axis=axis) @@ -322,7 +336,7 @@ def time_dropna_axis_mixed_dtypes(self, how, axis): class Count: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -330,15 +344,16 @@ def setup(self, axis): self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() - self.df_mixed['foo'] = 'bar' + self.df_mixed["foo"] = "bar" self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) - self.df.columns = MultiIndex.from_arrays([self.df.columns, - self.df.columns]) - self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, - self.df_mixed.index]) - self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, - self.df_mixed.columns]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays( + [self.df_mixed.index, self.df_mixed.index] + ) + self.df_mixed.columns = MultiIndex.from_arrays( + [self.df_mixed.columns, self.df_mixed.columns] + ) def time_count_level_multi(self, axis): self.df.count(axis=axis, level=1) @@ -348,13 +363,12 @@ def time_count_level_mixed_dtypes_multi(self, axis): class Apply: - def setup(self): self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) - self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_apply_user_func(self): self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) @@ -372,11 +386,10 @@ def time_apply_pass_thru(self): self.df.apply(lambda x: x) def time_apply_ref_by_name(self): - self.df3.apply(lambda x: x['A'] + x['B'], axis=1) + self.df3.apply(lambda x: x["A"] + x["B"], axis=1) class Dtypes: - def setup(self): self.df = DataFrame(np.random.randn(1000, 1000)) @@ -385,19 +398,18 @@ def time_frame_dtypes(self): class Equals: - def setup(self): - N = 10**3 + N = 10 ** 3 self.float_df = DataFrame(np.random.randn(N, N)) self.float_df_nan = self.float_df.copy() self.float_df_nan.iloc[-1, -1] = np.nan - self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df = DataFrame("foo", index=range(N), columns=range(N)) self.object_df_nan = self.object_df.copy() self.object_df_nan.iloc[-1, -1] = np.nan self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols.columns = ["A"] * len(self.nonunique_cols.columns) self.nonunique_cols_nan = self.nonunique_cols.copy() 
self.nonunique_cols_nan.iloc[-1, -1] = np.nan @@ -422,8 +434,8 @@ def time_frame_object_unequal(self): class Interpolate: - params = [None, 'infer'] - param_names = ['downcast'] + params = [None, "infer"] + param_names = ["downcast"] def setup(self, downcast): N = 10000 @@ -431,12 +443,16 @@ def setup(self, downcast): self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame({'A': np.arange(0, N), - 'B': np.random.randint(0, 100, N), - 'C': np.random.randn(N), - 'D': np.random.randn(N)}) - self.df2.loc[1::5, 'A'] = np.nan - self.df2.loc[1::5, 'C'] = np.nan + self.df2 = DataFrame( + { + "A": np.arange(0, N), + "B": np.random.randint(0, 100, N), + "C": np.random.randn(N), + "D": np.random.randn(N), + } + ) + self.df2.loc[1::5, "A"] = np.nan + self.df2.loc[1::5, "C"] = np.nan def time_interpolate(self, downcast): self.df.interpolate(downcast=downcast) @@ -448,7 +464,7 @@ def time_interpolate_some_good(self, downcast): class Shift: # frame shift speedup issue-5609 params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): self.df = DataFrame(np.random.rand(10000, 500)) @@ -458,7 +474,6 @@ def time_shift(self, axis): class Nunique: - def setup(self): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -467,14 +482,17 @@ def time_frame_nunique(self): class Duplicated: - def setup(self): - n = (1 << 20) - t = date_range('2015-01-01', freq='S', periods=(n // 64)) + n = 1 << 20 + t = date_range("2015-01-01", freq="S", periods=(n // 64)) xs = np.random.randn(n // 64).round(2) - self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), - 'b': np.random.choice(t, n), - 'c': np.random.choice(xs, n)}) + self.df = DataFrame( + { + "a": np.random.randint(-1 << 8, 1 << 8, n), + "b": np.random.choice(t, n), + "c": np.random.choice(xs, n), + } + ) self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): @@ -487,10 +505,10 @@ def time_frame_duplicated_wide(self): class XS: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): - self.N = 10**4 + self.N = 10 ** 4 self.df = DataFrame(np.random.randn(self.N, self.N)) def time_frame_xs(self, axis): @@ -500,35 +518,38 @@ def time_frame_xs(self, axis): class SortValues: params = [True, False] - param_names = ['ascending'] + param_names = ["ascending"] def setup(self, ascending): - self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) + self.df = DataFrame(np.random.randn(1000000, 2), columns=list("AB")) def time_frame_sort_values(self, ascending): - self.df.sort_values(by='A', ascending=ascending) + self.df.sort_values(by="A", ascending=ascending) class SortIndexByColumns: - def setup(self): N = 10000 K = 10 - self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), - 'key2': tm.makeStringIndex(N).values.repeat(K), - 'value': np.random.randn(N * K)}) + self.df = DataFrame( + { + "key1": tm.makeStringIndex(N).values.repeat(K), + "key2": tm.makeStringIndex(N).values.repeat(K), + "value": np.random.randn(N * K), + } + ) def time_frame_sort_values_by_columns(self): - self.df.sort_values(by=['key1', 'key2']) + self.df.sort_values(by=["key1", "key2"]) class Quantile: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + self.df = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_frame_quantile(self, axis): self.df.quantile([0.1, 0.5], axis=axis) @@ -548,37 +569,37 @@ def 
time_info(self): class NSort: - params = ['first', 'last', 'all'] - param_names = ['keep'] + params = ["first", "last", "all"] + param_names = ["keep"] def setup(self, keep): - self.df = DataFrame(np.random.randn(100000, 3), - columns=list('ABC')) + self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC")) def time_nlargest_one_column(self, keep): - self.df.nlargest(100, 'A', keep=keep) + self.df.nlargest(100, "A", keep=keep) def time_nlargest_two_columns(self, keep): - self.df.nlargest(100, ['A', 'B'], keep=keep) + self.df.nlargest(100, ["A", "B"], keep=keep) def time_nsmallest_one_column(self, keep): - self.df.nsmallest(100, 'A', keep=keep) + self.df.nsmallest(100, "A", keep=keep) def time_nsmallest_two_columns(self, keep): - self.df.nsmallest(100, ['A', 'B'], keep=keep) + self.df.nsmallest(100, ["A", "B"], keep=keep) class Describe: - def setup(self): - self.df = DataFrame({ - 'a': np.random.randint(0, 100, int(1e6)), - 'b': np.random.randint(0, 100, int(1e6)), - 'c': np.random.randint(0, 100, int(1e6)) - }) + self.df = DataFrame( + { + "a": np.random.randint(0, 100, int(1e6)), + "b": np.random.randint(0, 100, int(1e6)), + "c": np.random.randint(0, 100, int(1e6)), + } + ) def time_series_describe(self): - self.df['a'].describe() + self.df["a"].describe() def time_dataframe_describe(self): self.df.describe() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 65a03bfda48c50..0d0b75561d057a 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -2,9 +2,19 @@ import pandas.util.testing as tm from pandas import DataFrame, Series, read_csv, factorize, date_range from pandas.core.algorithms import take_1d + try: - from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, - rolling_var, rolling_skew, rolling_kurt, rolling_std) + from pandas import ( + rolling_median, + rolling_mean, + rolling_min, + rolling_max, + rolling_var, + rolling_skew, + rolling_kurt, + rolling_std, + ) + have_rolling_methods = True except ImportError: have_rolling_methods = False @@ -14,6 +24,7 @@ from pandas import algos try: from pandas.util.testing import test_parallel + have_real_test_parallel = True except ImportError: have_real_test_parallel = False @@ -21,32 +32,36 @@ def test_parallel(num_threads=1): def wrapper(fname): return fname + return wrapper + from .pandas_vb_common import BaseIO class ParallelGroupbyMethods: - params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod', - 'sum', 'var']) - param_names = ['threads', 'method'] + params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"]) + param_names = ["threads", "method"] def setup(self, threads, method): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - ngroups = 10**3 - df = DataFrame({'key': np.random.randint(0, ngroups, size=N), - 'data': np.random.randn(N)}) + N = 10 ** 6 + ngroups = 10 ** 3 + df = DataFrame( + {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} + ) @test_parallel(num_threads=threads) def parallel(): - getattr(df.groupby('key')['data'], method)() + getattr(df.groupby("key")["data"], method)() + self.parallel = parallel def loop(): - getattr(df.groupby('key')['data'], method)() + getattr(df.groupby("key")["data"], method)() + self.loop = loop def time_parallel(self, threads, method): @@ -60,18 +75,19 @@ def time_loop(self, threads, method): class ParallelGroups: params = [2, 4, 8] - param_names = ['threads'] + param_names = ["threads"] def setup(self, threads): if not 
have_real_test_parallel: raise NotImplementedError - size = 2**22 - ngroups = 10**3 + size = 2 ** 22 + ngroups = 10 ** 3 data = Series(np.random.randint(0, ngroups, size=size)) @test_parallel(num_threads=threads) def get_groups(): data.groupby(data).groups + self.get_groups = get_groups def time_get_groups(self, threads): @@ -80,19 +96,20 @@ def time_get_groups(self, threads): class ParallelTake1D: - params = ['int64', 'float64'] - param_names = ['dtype'] + params = ["int64", "float64"] + param_names = ["dtype"] def setup(self, dtype): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - df = DataFrame({'col': np.arange(N, dtype=dtype)}) + N = 10 ** 6 + df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @test_parallel(num_threads=2) def parallel_take1d(): - take_1d(df['col'].values, indexer) + take_1d(df["col"].values, indexer) + self.parallel_take1d = parallel_take1d def time_take1d(self, dtype): @@ -107,14 +124,14 @@ class ParallelKth: def setup(self): if not have_real_test_parallel: raise NotImplementedError - N = 10**7 - k = 5 * 10**5 - kwargs_list = [{'arr': np.random.randn(N)}, - {'arr': np.random.randn(N)}] + N = 10 ** 7 + k = 5 * 10 ** 5 + kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @test_parallel(num_threads=2, kwargs_list=kwargs_list) def parallel_kth_smallest(arr): algos.kth_smallest(arr, k) + self.parallel_kth_smallest = parallel_kth_smallest def time_kth_smallest(self): @@ -122,81 +139,90 @@ def time_kth_smallest(self): class ParallelDatetimeFields: - def setup(self): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - self.dti = date_range('1900-01-01', periods=N, freq='T') - self.period = self.dti.to_period('D') + N = 10 ** 6 + self.dti = date_range("1900-01-01", periods=N, freq="T") + self.period = self.dti.to_period("D") def time_datetime_field_year(self): @test_parallel(num_threads=2) def run(dti): dti.year + run(self.dti) def time_datetime_field_day(self): @test_parallel(num_threads=2) def run(dti): dti.day + run(self.dti) def time_datetime_field_daysinmonth(self): @test_parallel(num_threads=2) def run(dti): dti.days_in_month + run(self.dti) def time_datetime_field_normalize(self): @test_parallel(num_threads=2) def run(dti): dti.normalize() + run(self.dti) def time_datetime_to_period(self): @test_parallel(num_threads=2) def run(dti): - dti.to_period('S') + dti.to_period("S") + run(self.dti) def time_period_to_datetime(self): @test_parallel(num_threads=2) def run(period): period.to_timestamp() + run(self.period) class ParallelRolling: - params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] - param_names = ['method'] + params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"] + param_names = ["method"] def setup(self, method): if not have_real_test_parallel: raise NotImplementedError win = 100 arr = np.random.rand(100000) - if hasattr(DataFrame, 'rolling'): + if hasattr(DataFrame, "rolling"): df = DataFrame(arr).rolling(win) @test_parallel(num_threads=2) def parallel_rolling(): getattr(df, method)() + self.parallel_rolling = parallel_rolling elif have_rolling_methods: - rolling = {'median': rolling_median, - 'mean': rolling_mean, - 'min': rolling_min, - 'max': rolling_max, - 'var': rolling_var, - 'skew': rolling_skew, - 'kurt': rolling_kurt, - 'std': rolling_std} + rolling = { + "median": rolling_median, + "mean": rolling_mean, + "min": rolling_min, + "max": rolling_max, + "var": rolling_var, + "skew": rolling_skew, + "kurt": 
rolling_kurt, + "std": rolling_std, + } @test_parallel(num_threads=2) def parallel_rolling(): rolling[method](arr, win) + self.parallel_rolling = parallel_rolling else: raise NotImplementedError @@ -209,30 +235,34 @@ class ParallelReadCSV(BaseIO): number = 1 repeat = 5 - params = ['float', 'object', 'datetime'] - param_names = ['dtype'] + params = ["float", "object", "datetime"] + param_names = ["dtype"] def setup(self, dtype): if not have_real_test_parallel: raise NotImplementedError rows = 10000 cols = 50 - data = {'float': DataFrame(np.random.randn(rows, cols)), - 'datetime': DataFrame(np.random.randn(rows, cols), - index=date_range('1/1/2000', - periods=rows)), - 'object': DataFrame('foo', - index=range(rows), - columns=['object%03d'.format(i) - for i in range(5)])} - - self.fname = '__test_{}__.csv'.format(dtype) + data = { + "float": DataFrame(np.random.randn(rows, cols)), + "datetime": DataFrame( + np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) + ), + "object": DataFrame( + "foo", + index=range(rows), + columns=["object%03d".format(i) for i in range(5)], + ), + } + + self.fname = "__test_{}__.csv".format(dtype) df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) def parallel_read_csv(): read_csv(self.fname) + self.parallel_read_csv = parallel_read_csv def time_read_csv(self, dtype): @@ -244,7 +274,7 @@ class ParallelFactorize: number = 1 repeat = 5 params = [2, 4, 8] - param_names = ['threads'] + param_names = ["threads"] def setup(self, threads): if not have_real_test_parallel: @@ -255,10 +285,12 @@ def setup(self, threads): @test_parallel(num_threads=threads) def parallel(): factorize(strings) + self.parallel = parallel def loop(): factorize(strings) + self.loop = loop def time_parallel(self, threads): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 3097ada6d20225..39b07d4734399e 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,18 +5,55 @@ import numpy as np from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, - date_range, period_range) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + period_range, +) import pandas.util.testing as tm method_blacklist = { - 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', - 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', - 'var', 'mad', 'describe', 'std', 'quantile'}, - 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', - 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', - 'std'} + "object": { + "median", + "prod", + "sem", + "cumsum", + "sum", + "cummin", + "mean", + "max", + "skew", + "cumprod", + "cummax", + "rank", + "pct_change", + "min", + "var", + "mad", + "describe", + "std", + "quantile", + }, + "datetime": { + "median", + "prod", + "sem", + "cumsum", + "sum", + "mean", + "skew", + "cumprod", + "cummax", + "pct_change", + "var", + "mad", + "describe", + "std", + }, } @@ -26,28 +63,31 @@ def setup(self): self.data = Series(np.random.randn(len(self.labels))) def time_groupby_apply_dict_return(self): - self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0], - 'last': x.values[-1]}) + self.data.groupby(self.labels).apply( + lambda x: {"first": x.values[0], "last": x.values[-1]} + ) class Apply: - def setup_cache(self): - N = 10**4 + N = 10 ** 4 labels = np.random.randint(0, 2000, size=N) labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 
'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4) - }) + df = DataFrame( + { + "key": labels, + "key2": labels2, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) return df def time_scalar_function_multi_col(self, df): - df.groupby(['key', 'key2']).apply(lambda x: 1) + df.groupby(["key", "key2"]).apply(lambda x: 1) def time_scalar_function_single_col(self, df): - df.groupby('key').apply(lambda x: 1) + df.groupby("key").apply(lambda x: 1) @staticmethod def df_copy_function(g): @@ -56,27 +96,29 @@ def df_copy_function(g): return g.copy() def time_copy_function_multi_col(self, df): - df.groupby(['key', 'key2']).apply(self.df_copy_function) + df.groupby(["key", "key2"]).apply(self.df_copy_function) def time_copy_overhead_single_col(self, df): - df.groupby('key').apply(self.df_copy_function) + df.groupby("key").apply(self.df_copy_function) class Groups: - param_names = ['key'] - params = ['int64_small', 'int64_large', 'object_small', 'object_large'] + param_names = ["key"] + params = ["int64_small", "int64_large", "object_small", "object_large"] def setup_cache(self): - size = 10**6 - data = {'int64_small': Series(np.random.randint(0, 100, size=size)), - 'int64_large': Series(np.random.randint(0, 10000, size=size)), - 'object_small': Series( - tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=size))), - 'object_large': Series( - tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=size)))} + size = 10 ** 6 + data = { + "int64_small": Series(np.random.randint(0, 100, size=size)), + "int64_large": Series(np.random.randint(0, 10000, size=size)), + "object_small": Series( + tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)) + ), + "object_large": Series( + tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)) + ), + } return data def setup(self, data, key): @@ -89,7 +131,7 @@ def time_series_groups(self, data, key): class GroupManyLabels: params = [1, 1000] - param_names = ['ncols'] + param_names = ["ncols"] def setup(self, ncols): N = 1000 @@ -103,46 +145,45 @@ def time_sum(self, ncols): class Nth: - param_names = ['dtype'] - params = ['float32', 'float64', 'datetime', 'object'] + param_names = ["dtype"] + params = ["float32", "float64", "datetime", "object"] def setup(self, dtype): - N = 10**5 + N = 10 ** 5 # with datetimes (GH7555) - if dtype == 'datetime': - values = date_range('1/1/2011', periods=N, freq='s') - elif dtype == 'object': - values = ['foo'] * N + if dtype == "datetime": + values = date_range("1/1/2011", periods=N, freq="s") + elif dtype == "object": + values = ["foo"] * N else: values = np.arange(N).astype(dtype) key = np.arange(N) - self.df = DataFrame({'key': key, 'values': values}) + self.df = DataFrame({"key": key, "values": values}) self.df.iloc[1, 1] = np.nan # insert missing data def time_frame_nth_any(self, dtype): - self.df.groupby('key').nth(0, dropna='any') + self.df.groupby("key").nth(0, dropna="any") def time_groupby_nth_all(self, dtype): - self.df.groupby('key').nth(0, dropna='all') + self.df.groupby("key").nth(0, dropna="all") def time_frame_nth(self, dtype): - self.df.groupby('key').nth(0) + self.df.groupby("key").nth(0) def time_series_nth_any(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0, dropna='any') + self.df["values"].groupby(self.df["key"]).nth(0, dropna="any") def time_series_nth_all(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0, dropna='all') + 
self.df["values"].groupby(self.df["key"]).nth(0, dropna="all") def time_series_nth(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0) + self.df["values"].groupby(self.df["key"]).nth(0) class DateAttributes: - def setup(self): - rng = date_range('1/1/2000', '12/31/2005', freq='H') + rng = date_range("1/1/2000", "12/31/2005", freq="H") self.year, self.month, self.day = rng.year, rng.month, rng.day self.ts = Series(np.random.randn(len(rng)), index=rng) @@ -151,154 +192,167 @@ def time_len_groupby_object(self): class Int64: - def setup(self): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) i = np.random.choice(len(arr), len(arr) * 5) arr = np.vstack((arr, arr[i])) i = np.random.permutation(len(arr)) arr = arr[i] - self.cols = list('abcde') + self.cols = list("abcde") self.df = DataFrame(arr, columns=self.cols) - self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 + self.df["jim"], self.df["joe"] = np.random.randn(2, len(self.df)) * 10 def time_overflow(self): self.df.groupby(self.cols).max() class CountMultiDtype: - def setup_cache(self): n = 10000 - offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') - dates = np.datetime64('now') + offsets - dates[np.random.rand(n) > 0.5] = np.datetime64('nat') - offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") + dates = np.datetime64("now") + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64("nat") + offsets[np.random.rand(n) > 0.5] = np.timedelta64("nat") value2 = np.random.randn(n) value2[np.random.rand(n) > 0.5] = np.nan - obj = np.random.choice(list('ab'), size=n).astype(object) + obj = np.random.choice(list("ab"), size=n).astype(object) obj[np.random.randn(n) > 0.5] = np.nan - df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'dates': dates, - 'value2': value2, - 'value3': np.random.randn(n), - 'ints': np.random.randint(0, 1000, size=n), - 'obj': obj, - 'offsets': offsets}) + df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "dates": dates, + "value2": value2, + "value3": np.random.randn(n), + "ints": np.random.randint(0, 1000, size=n), + "obj": obj, + "offsets": offsets, + } + ) return df def time_multi_count(self, df): - df.groupby(['key1', 'key2']).count() + df.groupby(["key1", "key2"]).count() class CountMultiInt: - def setup_cache(self): n = 10000 - df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'ints': np.random.randint(0, 1000, size=n), - 'ints2': np.random.randint(0, 1000, size=n)}) + df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "ints": np.random.randint(0, 1000, size=n), + "ints2": np.random.randint(0, 1000, size=n), + } + ) return df def time_multi_int_count(self, df): - df.groupby(['key1', 'key2']).count() + df.groupby(["key1", "key2"]).count() def time_multi_int_nunique(self, df): - df.groupby(['key1', 'key2']).nunique() + df.groupby(["key1", "key2"]).nunique() class AggFunctions: - def setup_cache(self): - N = 10**5 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') - df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)), - 'key2': fac2.take(np.random.randint(0, 2, size=N)), - 'value1': np.random.randn(N), - 'value2': np.random.randn(N), - 'value3': np.random.randn(N)}) + N = 10 ** 5 + fac1 = np.array(["A", "B", 
"C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") + df = DataFrame( + { + "key1": fac1.take(np.random.randint(0, 3, size=N)), + "key2": fac2.take(np.random.randint(0, 2, size=N)), + "value1": np.random.randn(N), + "value2": np.random.randn(N), + "value3": np.random.randn(N), + } + ) return df def time_different_str_functions(self, df): - df.groupby(['key1', 'key2']).agg({'value1': 'mean', - 'value2': 'var', - 'value3': 'sum'}) + df.groupby(["key1", "key2"]).agg( + {"value1": "mean", "value2": "var", "value3": "sum"} + ) def time_different_numpy_functions(self, df): - df.groupby(['key1', 'key2']).agg({'value1': np.mean, - 'value2': np.var, - 'value3': np.sum}) + df.groupby(["key1", "key2"]).agg( + {"value1": np.mean, "value2": np.var, "value3": np.sum} + ) def time_different_python_functions_multicol(self, df): - df.groupby(['key1', 'key2']).agg([sum, min, max]) + df.groupby(["key1", "key2"]).agg([sum, min, max]) def time_different_python_functions_singlecol(self, df): - df.groupby('key1').agg([sum, min, max]) + df.groupby("key1").agg([sum, min, max]) class GroupStrings: - def setup(self): - n = 2 * 10**5 - alpha = list(map(''.join, product(ascii_letters, repeat=4))) + n = 2 * 10 ** 5 + alpha = list(map("".join, product(ascii_letters, repeat=4))) data = np.random.choice(alpha, (n // 5, 4), replace=False) data = np.repeat(data, 5, axis=0) - self.df = DataFrame(data, columns=list('abcd')) - self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) + self.df = DataFrame(data, columns=list("abcd")) + self.df["joe"] = (np.random.randn(len(self.df)) * 10).round(3) self.df = self.df.sample(frac=1).reset_index(drop=True) def time_multi_columns(self): - self.df.groupby(list('abcd')).max() + self.df.groupby(list("abcd")).max() class MultiColumn: - def setup_cache(self): - N = 10**5 + N = 10 ** 5 key1 = np.tile(np.arange(100, dtype=object), 1000) key2 = key1.copy() np.random.shuffle(key1) np.random.shuffle(key2) - df = DataFrame({'key1': key1, - 'key2': key2, - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + df = DataFrame( + { + "key1": key1, + "key2": key2, + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) return df def time_lambda_sum(self, df): - df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) + df.groupby(["key1", "key2"]).agg(lambda x: x.values.sum()) def time_cython_sum(self, df): - df.groupby(['key1', 'key2']).sum() + df.groupby(["key1", "key2"]).sum() def time_col_select_lambda_sum(self, df): - df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum()) + df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum()) def time_col_select_numpy_sum(self, df): - df.groupby(['key1', 'key2'])['data1'].agg(np.sum) + df.groupby(["key1", "key2"])["data1"].agg(np.sum) class Size: - def setup(self): - n = 10**5 - offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') - dates = np.datetime64('now') + offsets - self.df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'value1': np.random.randn(n), - 'value2': np.random.randn(n), - 'value3': np.random.randn(n), - 'dates': dates}) + n = 10 ** 5 + offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") + dates = np.datetime64("now") + offsets + self.df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "value1": np.random.randn(n), + "value2": np.random.randn(n), + "value3": np.random.randn(n), + "dates": dates, + } + ) self.draws = 
Series(np.random.randn(n)) - labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4)) - self.cats = labels.astype('category') + labels = Series(["foo", "bar", "baz", "qux"] * (n // 4)) + self.cats = labels.astype("category") def time_multi_size(self): - self.df.groupby(['key1', 'key2']).size() + self.df.groupby(["key1", "key2"]).size() def time_category_size(self): self.draws.groupby(self.cats).size() @@ -306,15 +360,47 @@ def time_category_size(self): class GroupByMethods: - param_names = ['dtype', 'method', 'application'] - params = [['int', 'float', 'object', 'datetime'], - ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', - 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', - 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', - 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift', - 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts', - 'var'], - ['direct', 'transformation']] + param_names = ["dtype", "method", "application"] + params = [ + ["int", "float", "object", "datetime"], + [ + "all", + "any", + "bfill", + "count", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "describe", + "ffill", + "first", + "head", + "last", + "mad", + "max", + "min", + "median", + "mean", + "nunique", + "pct_change", + "prod", + "quantile", + "rank", + "sem", + "shift", + "size", + "skew", + "std", + "sum", + "tail", + "unique", + "value_counts", + "var", + ], + ["direct", "transformation"], + ] def setup(self, dtype, method, application): if method in method_blacklist.get(dtype, {}): @@ -323,29 +409,28 @@ def setup(self, dtype, method, application): size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) - if dtype == 'int': + if dtype == "int": key = np.random.randint(0, size, size=size) - elif dtype == 'float': - key = np.concatenate([np.random.random(ngroups) * 0.1, - np.random.random(ngroups) * 10.0]) - elif dtype == 'object': - key = ['foo'] * size - elif dtype == 'datetime': - key = date_range('1/1/2011', periods=size, freq='s') - - df = DataFrame({'values': values, 'key': key}) - - if application == 'transform': - if method == 'describe': + elif dtype == "float": + key = np.concatenate( + [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0] + ) + elif dtype == "object": + key = ["foo"] * size + elif dtype == "datetime": + key = date_range("1/1/2011", periods=size, freq="s") + + df = DataFrame({"values": values, "key": key}) + + if application == "transform": + if method == "describe": raise NotImplementedError - self.as_group_method = lambda: df.groupby( - 'key')['values'].transform(method) - self.as_field_method = lambda: df.groupby( - 'values')['key'].transform(method) + self.as_group_method = lambda: df.groupby("key")["values"].transform(method) + self.as_field_method = lambda: df.groupby("values")["key"].transform(method) else: - self.as_group_method = getattr(df.groupby('key')['values'], method) - self.as_field_method = getattr(df.groupby('values')['key'], method) + self.as_group_method = getattr(df.groupby("key")["values"], method) + self.as_field_method = getattr(df.groupby("values")["key"], method) def time_dtype_as_group(self, dtype, method, application): self.as_group_method() @@ -356,20 +441,22 @@ def time_dtype_as_field(self, dtype, method, application): class RankWithTies: # GH 21237 - param_names = ['dtype', 'tie_method'] - params = [['float64', 'float32', 'int64', 'datetime64'], - ['first', 'average', 'dense', 'min', 'max']] + param_names = ["dtype", 
"tie_method"] + params = [ + ["float64", "float32", "int64", "datetime64"], + ["first", "average", "dense", "min", "max"], + ] def setup(self, dtype, tie_method): - N = 10**4 - if dtype == 'datetime64': + N = 10 ** 4 + if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: data = np.array([1] * N, dtype=dtype) - self.df = DataFrame({'values': data, 'key': ['foo'] * N}) + self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): - self.df.groupby('key').rank(method=tie_method) + self.df.groupby("key").rank(method=tie_method) class Float32: @@ -382,57 +469,61 @@ def setup(self): self.df = DataFrame(dict(a=arr, b=arr)) def time_sum(self): - self.df.groupby(['a'])['b'].sum() + self.df.groupby(["a"])["b"].sum() class Categories: - def setup(self): - N = 10**5 + N = 10 ** 5 arr = np.random.random(N) - data = {'a': Categorical(np.random.randint(10000, size=N)), - 'b': arr} + data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr} self.df = DataFrame(data) - data = {'a': Categorical(np.random.randint(10000, size=N), - ordered=True), - 'b': arr} + data = { + "a": Categorical(np.random.randint(10000, size=N), ordered=True), + "b": arr, + } self.df_ordered = DataFrame(data) - data = {'a': Categorical(np.random.randint(100, size=N), - categories=np.arange(10000)), - 'b': arr} + data = { + "a": Categorical( + np.random.randint(100, size=N), categories=np.arange(10000) + ), + "b": arr, + } self.df_extra_cat = DataFrame(data) def time_groupby_sort(self): - self.df.groupby('a')['b'].count() + self.df.groupby("a")["b"].count() def time_groupby_nosort(self): - self.df.groupby('a', sort=False)['b'].count() + self.df.groupby("a", sort=False)["b"].count() def time_groupby_ordered_sort(self): - self.df_ordered.groupby('a')['b'].count() + self.df_ordered.groupby("a")["b"].count() def time_groupby_ordered_nosort(self): - self.df_ordered.groupby('a', sort=False)['b'].count() + self.df_ordered.groupby("a", sort=False)["b"].count() def time_groupby_extra_cat_sort(self): - self.df_extra_cat.groupby('a')['b'].count() + self.df_extra_cat.groupby("a")["b"].count() def time_groupby_extra_cat_nosort(self): - self.df_extra_cat.groupby('a', sort=False)['b'].count() + self.df_extra_cat.groupby("a", sort=False)["b"].count() class Datelike: # GH 14338 - params = ['period_range', 'date_range', 'date_range_tz'] - param_names = ['grouper'] + params = ["period_range", "date_range", "date_range_tz"] + param_names = ["grouper"] def setup(self, grouper): - N = 10**4 - rng_map = {'period_range': period_range, - 'date_range': date_range, - 'date_range_tz': partial(date_range, tz='US/Central')} - self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N) - self.df = DataFrame(np.random.randn(10**4, 2)) + N = 10 ** 4 + rng_map = { + "period_range": period_range, + "date_range": date_range, + "date_range_tz": partial(date_range, tz="US/Central"), + } + self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N) + self.df = DataFrame(np.random.randn(10 ** 4, 2)) def time_sum(self, grouper): self.df.groupby(self.grouper).sum() @@ -442,11 +533,10 @@ class SumBools: # GH 2692 def setup(self): N = 500 - self.df = DataFrame({'ii': range(N), - 'bb': [True] * N}) + self.df = DataFrame({"ii": range(N), "bb": [True] * N}) def time_groupby_sum_booleans(self): - self.df.groupby('ii').sum() + self.df.groupby("ii").sum() class SumMultiLevel: @@ -455,84 +545,85 @@ class SumMultiLevel: def setup(self): N = 50 - self.df = DataFrame({'A': 
list(range(N)) * 2, - 'B': range(N * 2), - 'C': 1}).set_index(['A', 'B']) + self.df = DataFrame( + {"A": list(range(N)) * 2, "B": range(N * 2), "C": 1} + ).set_index(["A", "B"]) def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() class Transform: - def setup(self): n1 = 400 n2 = 250 - index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], - codes=[np.repeat(range(n1), n2).tolist(), - list(range(n2)) * n1], - names=['lev1', 'lev2']) + index = MultiIndex( + levels=[np.arange(n1), tm.makeStringIndex(n2)], + codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1], + names=["lev1", "lev2"], + ) arr = np.random.randn(n1 * n2, 3) arr[::10000, 0] = np.nan arr[1::10000, 1] = np.nan arr[2::10000, 2] = np.nan - data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3']) + data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"]) self.df = data n = 20000 - self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), - columns=['jim', 'joe', 'jolie']) + self.df1 = DataFrame( + np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"] + ) self.df2 = self.df1.copy() - self.df2['jim'] = self.df2['joe'] + self.df2["jim"] = self.df2["joe"] - self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)), - columns=['jim', 'joe', 'jolie']) + self.df3 = DataFrame( + np.random.randint(1, (n / 10), (n, 3)), columns=["jim", "joe", "jolie"] + ) self.df4 = self.df3.copy() - self.df4['jim'] = self.df4['joe'] + self.df4["jim"] = self.df4["joe"] def time_transform_lambda_max(self): - self.df.groupby(level='lev1').transform(lambda x: max(x)) + self.df.groupby(level="lev1").transform(lambda x: max(x)) def time_transform_ufunc_max(self): - self.df.groupby(level='lev1').transform(np.max) + self.df.groupby(level="lev1").transform(np.max) def time_transform_multi_key1(self): - self.df1.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df1.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key2(self): - self.df2.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df2.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key3(self): - self.df3.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df3.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key4(self): - self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df4.groupby(["jim", "joe"])["jolie"].transform("max") class TransformBools: - def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) transitions = np.zeros(N, dtype=np.bool) transitions[transition_points] = True self.g = transitions.cumsum() - self.df = DataFrame({'signal': np.random.rand(N)}) + self.df = DataFrame({"signal": np.random.rand(N)}) def time_transform_mean(self): - self.df['signal'].groupby(self.g).transform(np.mean) + self.df["signal"].groupby(self.g).transform(np.mean) class TransformNaN: # GH 12737 def setup(self): - self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df_nans.loc[4::10, 'B':'C'] = 5 + self.df_nans = DataFrame( + {"key": np.repeat(np.arange(1000), 10), "B": np.nan, "C": np.nan} + ) + self.df_nans.loc[4::10, "B":"C"] = 5 def time_first(self): - self.df_nans.groupby('key').transform('first') + self.df_nans.groupby("key").transform("first") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 1eedc1a2b30213..6541ddcb0397dc 100644 --- 
a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,38 +1,47 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, - Float64Index, IntervalIndex) +from pandas import ( + Series, + date_range, + DatetimeIndex, + Index, + RangeIndex, + Float64Index, + IntervalIndex, +) class SetOperations: - params = (['datetime', 'date_string', 'int', 'strings'], - ['intersection', 'union', 'symmetric_difference']) - param_names = ['dtype', 'method'] + params = ( + ["datetime", "date_string", "int", "strings"], + ["intersection", "union", "symmetric_difference"], + ) + param_names = ["dtype", "method"] def setup(self, dtype, method): - N = 10**5 - dates_left = date_range('1/1/2000', periods=N, freq='T') - fmt = '%Y-%m-%d %H:%M:%S' + N = 10 ** 5 + dates_left = date_range("1/1/2000", periods=N, freq="T") + fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) str_left = tm.makeStringIndex(N) - data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]}, - 'date_string': {'left': date_str_left, - 'right': date_str_left[:-1]}, - 'int': {'left': int_left, 'right': int_left[:-1]}, - 'strings': {'left': str_left, 'right': str_left[:-1]}} - self.left = data[dtype]['left'] - self.right = data[dtype]['right'] + data = { + "datetime": {"left": dates_left, "right": dates_left[:-1]}, + "date_string": {"left": date_str_left, "right": date_str_left[:-1]}, + "int": {"left": int_left, "right": int_left[:-1]}, + "strings": {"left": str_left, "right": str_left[:-1]}, + } + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] def time_operation(self, dtype, method): getattr(self.left, method)(self.right) class SetDisjoint: - def setup(self): - N = 10**5 + N = 10 ** 5 B = N + 20000 self.datetime_left = DatetimeIndex(range(N)) self.datetime_right = DatetimeIndex(range(N, B)) @@ -42,9 +51,8 @@ def time_datetime_difference_disjoint(self): class Datetime: - def setup(self): - self.dr = date_range('20000101', freq='D', periods=10000) + self.dr = date_range("20000101", freq="D", periods=10000) def time_is_dates_only(self): self.dr._is_dates_only @@ -52,12 +60,12 @@ def time_is_dates_only(self): class Ops: - params = ['float', 'int'] - param_names = ['dtype'] + params = ["float", "int"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'} + N = 10 ** 6 + indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} self.index = getattr(tm, indexes[dtype])(N) def time_add(self, dtype): @@ -77,10 +85,9 @@ def time_modulo(self, dtype): class Range: - def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) - self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) + self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -102,7 +109,6 @@ def time_get_loc_dec(self): class IndexAppend: - def setup(self): N = 10000 @@ -132,19 +138,20 @@ def time_append_obj_list(self): class Indexing: - params = ['String', 'Float', 'Int'] - param_names = ['dtype'] + params = ["String", "Float", "Int"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) + N = 10 ** 6 + self.idx = getattr(tm, "make{}Index".format(dtype))(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = 
self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half]) - .sort_values()) + self.non_unique_sorted = ( + self.sorted[:half].append(self.sorted[:half]).sort_values() + ) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): @@ -188,7 +195,7 @@ def time_get_loc(self): class IntervalIndexMethod: # GH 24813 - params = [10**3, 10**5] + params = [10 ** 3, 10 ** 5] def setup(self, N): left = np.append(np.arange(N), np.array(0)) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 4e82fa55925292..489e5c4cd63ea3 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,26 +2,37 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, - Int64Index, UInt64Index, Float64Index, - IntervalIndex, CategoricalIndex, - IndexSlice, concat, date_range, option_context) +from pandas import ( + Series, + DataFrame, + MultiIndex, + Int64Index, + UInt64Index, + Float64Index, + IntervalIndex, + CategoricalIndex, + IndexSlice, + concat, + date_range, + option_context, +) class NumericSeriesIndexing: params = [ (Int64Index, UInt64Index, Float64Index), - ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ['index_dtype', 'index_structure'] + param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10**6 + N = 10 ** 6 indices = { - 'unique_monotonic_inc': index(range(N)), - 'nonunique_monotonic_inc': index( - list(range(55)) + [54] + list(range(55, N - 1))), + "unique_monotonic_inc": index(range(N)), + "nonunique_monotonic_inc": index( + list(range(55)) + [54] + list(range(55, N - 1)) + ), } self.data = Series(np.random.rand(N), index=indices[index_structure]) self.array = np.arange(10000) @@ -82,23 +93,25 @@ def time_loc_slice(self, index, index_structure): class NonNumericSeriesIndexing: params = [ - ('string', 'datetime'), - ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ("string", "datetime"), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ['index_dtype', 'index_structure'] + param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10**6 - indexes = {'string': tm.makeStringIndex(N), - 'datetime': date_range('1900', periods=N, freq='s')} + N = 10 ** 6 + indexes = { + "string": tm.makeStringIndex(N), + "datetime": date_range("1900", periods=N, freq="s"), + } index = indexes[index] - if index_structure == 'nonunique_monotonic_inc': + if index_structure == "nonunique_monotonic_inc": index = index.insert(item=index[2], loc=2)[:-1] self.s = Series(np.random.rand(N), index=index) self.lbl = index[80000] def time_getitem_label_slice(self, index, index_structure): - self.s[:self.lbl] + self.s[: self.lbl] def time_getitem_pos_slice(self, index, index_structure): self.s[:80000] @@ -115,12 +128,10 @@ def time_getitem_list_like(self, index, index_structure): class DataFrameStringIndexing: - def setup(self): index = tm.makeStringIndex(1000) columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=index, - columns=columns) + self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) self.idx_scalar = index[100] self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 @@ -147,7 +158,6 @@ def time_boolean_rows_object(self): class 
DataFrameNumericIndexing: - def setup(self): self.idx_dupe = np.array(range(30)) * 99 self.df = DataFrame(np.random.randn(10000, 5)) @@ -172,13 +182,15 @@ def time_bool_indexer(self): class Take: - params = ['int', 'datetime'] - param_names = ['index'] + params = ["int", "datetime"] + param_names = ["index"] def setup(self, index): N = 100000 - indexes = {'int': Int64Index(np.arange(N)), - 'datetime': date_range('2011-01-01', freq='S', periods=N)} + indexes = { + "int": Int64Index(np.arange(N)), + "datetime": date_range("2011-01-01", freq="S", periods=N), + } index = indexes[index] self.s = Series(np.random.rand(N), index=index) self.indexer = [True, False, True, True, False] * 20000 @@ -188,22 +200,24 @@ def time_take(self, index): class MultiIndexing: - def setup(self): mi = MultiIndex.from_product([range(1000), range(1000)]) self.s = Series(np.random.randn(1000000), index=mi) self.df = DataFrame(self.s) n = 100000 - self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), - n), - 'B': np.random.choice(range(10, 400), n), - 'C': np.random.choice(range(1, 150), n), - 'D': np.random.choice(range(10000, 45000), n), - 'x': np.random.choice(range(400), n), - 'y': np.random.choice(range(25), n)}) + self.mdt = DataFrame( + { + "A": np.random.choice(range(10000, 45000, 1000), n), + "B": np.random.choice(range(10, 400), n), + "C": np.random.choice(range(1, 150), n), + "D": np.random.choice(range(10000, 45000), n), + "x": np.random.choice(range(400), n), + "y": np.random.choice(range(25), n), + } + ) self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] - self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index() + self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() def time_series_ix(self): self.s.ix[999] @@ -216,7 +230,6 @@ def time_index_slice(self): class IntervalIndexing: - def setup_cache(self): idx = IntervalIndex.from_breaks(np.arange(1000001)) monotonic = Series(np.arange(1000000), index=idx) @@ -237,29 +250,30 @@ def time_loc_list(self, monotonic): class CategoricalIndexIndexing: - params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] - param_names = ['index'] + params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] + param_names = ["index"] def setup(self, index): - N = 10**5 - values = list('a' * N + 'b' * N + 'c' * N) + N = 10 ** 5 + values = list("a" * N + "b" * N + "c" * N) indices = { - 'monotonic_incr': CategoricalIndex(values), - 'monotonic_decr': CategoricalIndex(reversed(values)), - 'non_monotonic': CategoricalIndex(list('abc' * N))} + "monotonic_incr": CategoricalIndex(values), + "monotonic_decr": CategoricalIndex(reversed(values)), + "non_monotonic": CategoricalIndex(list("abc" * N)), + } self.data = indices[index] self.int_scalar = 10000 self.int_list = list(range(10000)) - self.cat_scalar = 'b' - self.cat_list = ['a', 'c'] + self.cat_scalar = "b" + self.cat_list = ["a", "c"] def time_getitem_scalar(self, index): self.data[self.int_scalar] def time_getitem_slice(self, index): - self.data[:self.int_scalar] + self.data[: self.int_scalar] def time_getitem_list_like(self, index): self.data[[self.int_scalar]] @@ -278,7 +292,6 @@ def time_get_indexer_list(self, index): class MethodLookup: - def setup_cache(self): s = Series() return s @@ -294,40 +307,36 @@ def time_lookup_loc(self, s): class GetItemSingleColumn: - def setup(self): - self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=["A"]) self.df_int_col = 
DataFrame(np.random.randn(3000, 1)) def time_frame_getitem_single_column_label(self): - self.df_string_col['A'] + self.df_string_col["A"] def time_frame_getitem_single_column_int(self): self.df_int_col[0] class AssignTimeseriesIndex: - def setup(self): N = 100000 - idx = date_range('1/1/2000', periods=N, freq='H') - self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) + idx = date_range("1/1/2000", periods=N, freq="H") + self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx) def time_frame_assign_timeseries_index(self): - self.df['date'] = self.df.index + self.df["date"] = self.df.index class InsertColumns: - def setup(self): - self.N = 10**3 + self.N = 10 ** 3 self.df = DataFrame(index=range(self.N)) def time_insert(self): np.random.seed(1234) for i in range(100): - self.df.insert(0, i, np.random.randn(self.N), - allow_duplicates=True) + self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) def time_assign_with_setitem(self): np.random.seed(1234) @@ -337,18 +346,18 @@ def time_assign_with_setitem(self): class ChainIndexing: - params = [None, 'warn'] - param_names = ['mode'] + params = [None, "warn"] + param_names = ["mode"] def setup(self, mode): self.N = 1000000 def time_chained_indexing(self, mode): with warnings.catch_warnings(record=True): - with option_context('mode.chained_assignment', mode): - df = DataFrame({'A': np.arange(self.N), 'B': 'foo'}) + with option_context("mode.chained_assignment", mode): + df = DataFrame({"A": np.arange(self.N), "B": "foo"}) df2 = df[df.A > self.N // 2] - df2['C'] = 1.0 + df2["C"] = 1.0 from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 56557017818460..44a22dfa777914 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -5,33 +5,40 @@ def _get_numeric_engines(): engine_names = [ - ('Int64Engine', np.int64), ('Int32Engine', np.int32), - ('Int16Engine', np.int16), ('Int8Engine', np.int8), - ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32), - ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8), - ('Float64Engine', np.float64), ('Float32Engine', np.float32), + ("Int64Engine", np.int64), + ("Int32Engine", np.int32), + ("Int16Engine", np.int16), + ("Int8Engine", np.int8), + ("UInt64Engine", np.uint64), + ("UInt32Engine", np.uint32), + ("UInt16engine", np.uint16), + ("UInt8Engine", np.uint8), + ("Float64Engine", np.float64), + ("Float32Engine", np.float32), + ] + return [ + (getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name) ] - return [(getattr(libindex, engine_name), dtype) - for engine_name, dtype in engine_names - if hasattr(libindex, engine_name)] class NumericEngineIndexing: - params = [_get_numeric_engines(), - ['monotonic_incr', 'monotonic_decr', 'non_monotonic'], - ] - param_names = ['engine_and_dtype', 'index_type'] + params = [ + _get_numeric_engines(), + ["monotonic_incr", "monotonic_decr", "non_monotonic"], + ] + param_names = ["engine_and_dtype", "index_type"] def setup(self, engine_and_dtype, index_type): engine, dtype = engine_and_dtype - N = 10**5 + N = 10 ** 5 values = list([1] * N + [2] * N + [3] * N) arr = { - 'monotonic_incr': np.array(values, dtype=dtype), - 'monotonic_decr': np.array(list(reversed(values)), - dtype=dtype), - 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype), + "monotonic_incr": np.array(values, dtype=dtype), + "monotonic_decr": 
np.array(list(reversed(values)), dtype=dtype), + "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype), }[index_type] self.data = engine(lambda: arr, len(arr)) @@ -44,21 +51,21 @@ def time_get_loc(self, engine_and_dtype, index_type): class ObjectEngineIndexing: - params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')] - param_names = ['index_type'] + params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] + param_names = ["index_type"] def setup(self, index_type): - N = 10**5 - values = list('a' * N + 'b' * N + 'c' * N) + N = 10 ** 5 + values = list("a" * N + "b" * N + "c" * N) arr = { - 'monotonic_incr': np.array(values, dtype=object), - 'monotonic_decr': np.array(list(reversed(values)), dtype=object), - 'non_monotonic': np.array(list('abc') * N, dtype=object), + "monotonic_incr": np.array(values, dtype=object), + "monotonic_decr": np.array(list(reversed(values)), dtype=object), + "non_monotonic": np.array(list("abc") * N, dtype=object), }[index_type] self.data = libindex.ObjectEngine(lambda: arr, len(arr)) # code belows avoids populating the mapping etc. while timing. - self.data.get_loc('b') + self.data.get_loc("b") def time_get_loc(self, index_type): - self.data.get_loc('b') + self.data.get_loc("b") diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 065c82207d2514..66ef4f2aec380c 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -8,56 +8,57 @@ class NumericInferOps: # from GH 7332 params = numeric_dtypes - param_names = ['dtype'] + param_names = ["dtype"] def setup(self, dtype): - N = 5 * 10**5 - self.df = DataFrame({'A': np.arange(N).astype(dtype), - 'B': np.arange(N).astype(dtype)}) + N = 5 * 10 ** 5 + self.df = DataFrame( + {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)} + ) def time_add(self, dtype): - self.df['A'] + self.df['B'] + self.df["A"] + self.df["B"] def time_subtract(self, dtype): - self.df['A'] - self.df['B'] + self.df["A"] - self.df["B"] def time_multiply(self, dtype): - self.df['A'] * self.df['B'] + self.df["A"] * self.df["B"] def time_divide(self, dtype): - self.df['A'] / self.df['B'] + self.df["A"] / self.df["B"] def time_modulo(self, dtype): - self.df['A'] % self.df['B'] + self.df["A"] % self.df["B"] class DateInferOps: # from GH 7332 def setup_cache(self): - N = 5 * 10**5 - df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) - df['timedelta'] = df['datetime64'] - df['datetime64'] + N = 5 * 10 ** 5 + df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")}) + df["timedelta"] = df["datetime64"] - df["datetime64"] return df def time_subtract_datetimes(self, df): - df['datetime64'] - df['datetime64'] + df["datetime64"] - df["datetime64"] def time_timedelta_plus_datetime(self, df): - df['timedelta'] + df['datetime64'] + df["timedelta"] + df["datetime64"] def time_add_timedeltas(self, df): - df['timedelta'] + df['timedelta'] + df["timedelta"] + df["timedelta"] class ToNumeric: - params = ['ignore', 'coerce'] - param_names = ['errors'] + params = ["ignore", "coerce"] + param_names = ["errors"] def setup(self, errors): N = 10000 self.float = Series(np.random.randn(N)) - self.numstr = self.float.astype('str') + self.numstr = self.float.astype("str") self.str = Series(tm.makeStringIndex(N)) def time_from_float(self, errors): @@ -72,21 +73,32 @@ def time_from_str(self, errors): class ToNumericDowncast: - param_names = ['dtype', 'downcast'] - params = [['string-float', 'string-int', 'string-nint', 'datetime64', - 'int-list', 'int32'], 
- [None, 'integer', 'signed', 'unsigned', 'float']] + param_names = ["dtype", "downcast"] + params = [ + [ + "string-float", + "string-int", + "string-nint", + "datetime64", + "int-list", + "int32", + ], + [None, "integer", "signed", "unsigned", "float"], + ] N = 500000 N2 = int(N / 2) - data_dict = {'string-int': ['1'] * N2 + [2] * N2, - 'string-nint': ['-1'] * N2 + [2] * N2, - 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - 'string-float': ['1.1'] * N2 + [2] * N2, - 'int-list': [1] * N2 + [2] * N2, - 'int32': np.repeat(np.int32(1), N)} + data_dict = { + "string-int": ["1"] * N2 + [2] * N2, + "string-nint": ["-1"] * N2 + [2] * N2, + "datetime64": np.repeat( + np.array(["1970-01-01", "1970-01-02"], dtype="datetime64[D]"), N + ), + "string-float": ["1.1"] * N2 + [2] * N2, + "int-list": [1] * N2 + [2] * N2, + "int32": np.repeat(np.int32(1), N), + } def setup(self, dtype, downcast): self.data = self.data_dict[dtype] @@ -96,10 +108,9 @@ def time_downcast(self, dtype, downcast): class MaybeConvertNumeric: - def setup_cache(self): - N = 10**6 - arr = np.repeat([2**63], N) + np.arange(N).astype('uint64') + N = 10 ** 6 + arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64") data = arr.astype(object) data[1::2] = arr[1::2].astype(str) data[-1] = -1 diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index fbb96380a58134..4525e504fc4dd5 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -11,27 +11,31 @@ class ToCSV(BaseIO): - fname = '__test__.csv' - params = ['wide', 'long', 'mixed'] - param_names = ['kind'] + fname = "__test__.csv" + params = ["wide", "long", "mixed"] + param_names = ["kind"] def setup(self, kind): wide_frame = DataFrame(np.random.randn(3000, 30)) - long_frame = DataFrame({'A': np.arange(50000), - 'B': np.arange(50000) + 1., - 'C': np.arange(50000) + 2., - 'D': np.arange(50000) + 3.}) - mixed_frame = DataFrame({'float': np.random.randn(5000), - 'int': np.random.randn(5000).astype(int), - 'bool': (np.arange(5000) % 2) == 0, - 'datetime': date_range('2001', - freq='s', - periods=5000), - 'object': ['foo'] * 5000}) - mixed_frame.loc[30:500, 'float'] = np.nan - data = {'wide': wide_frame, - 'long': long_frame, - 'mixed': mixed_frame} + long_frame = DataFrame( + { + "A": np.arange(50000), + "B": np.arange(50000) + 1.0, + "C": np.arange(50000) + 2.0, + "D": np.arange(50000) + 3.0, + } + ) + mixed_frame = DataFrame( + { + "float": np.random.randn(5000), + "int": np.random.randn(5000).astype(int), + "bool": (np.arange(5000) % 2) == 0, + "datetime": date_range("2001", freq="s", periods=5000), + "object": ["foo"] * 5000, + } + ) + mixed_frame.loc[30:500, "float"] = np.nan + data = {"wide": wide_frame, "long": long_frame, "mixed": mixed_frame} self.df = data[kind] def time_frame(self, kind): @@ -40,36 +44,39 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" def setup(self): - rng = date_range('1/1/2000', periods=1000) + rng = date_range("1/1/2000", periods=1000) self.data = DataFrame(rng, index=rng) def time_frame_date_formatting(self): - self.data.to_csv(self.fname, date_format='%Y%m%d') + self.data.to_csv(self.fname, date_format="%Y%m%d") class ToCSVDatetimeBig(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] - param_names = ['obs'] + param_names = ["obs"] def setup(self, obs): - d = '2018-11-29' - dt = '2018-11-26 11:18:27.0' - self.data = DataFrame({'dt': 
[np.datetime64(dt)] * obs, - 'd': [np.datetime64(d)] * obs, - 'r': [np.random.uniform()] * obs}) + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) def time_frame(self, obs): self.data.to_csv(self.fname) class StringIORewind: - def data(self, stringio_object): stringio_object.seek(0) return stringio_object @@ -77,68 +84,84 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ['custom', 'iso8601', 'ymd']) - param_names = ['infer_datetime_format', 'format'] + params = ([True, False], ["custom", "iso8601", "ymd"]) + param_names = ["infer_datetime_format", "format"] def setup(self, infer_datetime_format, format): - rng = date_range('1/1/2000', periods=1000) - formats = {'custom': '%m/%d/%Y %H:%M:%S.%f', - 'iso8601': '%Y-%m-%d %H:%M:%S', - 'ymd': '%Y%m%d'} + rng = date_range("1/1/2000", periods=1000) + formats = { + "custom": "%m/%d/%Y %H:%M:%S.%f", + "iso8601": "%Y-%m-%d %H:%M:%S", + "ymd": "%Y%m%d", + } dt_format = formats[format] - self.StringIO_input = StringIO('\n'.join( - rng.strftime(dt_format).tolist())) + self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist())) def time_read_csv(self, infer_datetime_format, format): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo'], parse_dates=['foo'], - infer_datetime_format=infer_datetime_format) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo"], + parse_dates=["foo"], + infer_datetime_format=infer_datetime_format, + ) class ReadCSVConcatDatetime(StringIORewind): - iso8601 = '%Y-%m-%d %H:%M:%S' + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): - rng = date_range('1/1/2000', periods=50000, freq='S') - self.StringIO_input = StringIO('\n'.join( - rng.strftime(self.iso8601).tolist())) + rng = date_range("1/1/2000", periods=50000, freq="S") + self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist())) def time_read_csv(self): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo'], parse_dates=['foo'], - infer_datetime_format=False) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo"], + parse_dates=["foo"], + infer_datetime_format=False, + ) class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (['nan', '0', ''],) - param_names = ['bad_date_value'] + params = (["nan", "0", ""],) + param_names = ["bad_date_value"] def setup(self, bad_date_value): - self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000) + self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000) def time_read_csv(self, bad_date_value): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo', 'bar'], parse_dates=['foo'], - infer_datetime_format=False) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + infer_datetime_format=False, + ) class ReadCSVSkipRows(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" params = [None, 10000] - param_names = ['skiprows'] + param_names = ["skiprows"] def setup(self, skiprows): N = 20000 index = tm.makeStringIndex(N) - df = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N), - 'string1': ['foo'] * N, - 'bool1': [True] * N, - 'int1': np.random.randint(0, N, size=N)}, - index=index) + df = DataFrame( + { + "float1": np.random.randn(N), + "float2": np.random.randn(N), + "string1": ["foo"] * N, + 
"bool1": [True] * N, + "int1": np.random.randint(0, N, size=N), + }, + index=index, + ) df.to_csv(self.fname) def time_skipprows(self, skiprows): @@ -146,31 +169,31 @@ def time_skipprows(self, skiprows): class ReadUint64Integers(StringIORewind): - def setup(self): - self.na_values = [2**63 + 500] - arr = np.arange(10000).astype('uint64') + 2**63 - self.data1 = StringIO('\n'.join(arr.astype(str).tolist())) + self.na_values = [2 ** 63 + 500] + arr = np.arange(10000).astype("uint64") + 2 ** 63 + self.data1 = StringIO("\n".join(arr.astype(str).tolist())) arr = arr.astype(object) arr[500] = -1 - self.data2 = StringIO('\n'.join(arr.astype(str).tolist())) + self.data2 = StringIO("\n".join(arr.astype(str).tolist())) def time_read_uint64(self): - read_csv(self.data(self.data1), header=None, names=['foo']) + read_csv(self.data(self.data1), header=None, names=["foo"]) def time_read_uint64_neg_values(self): - read_csv(self.data(self.data2), header=None, names=['foo']) + read_csv(self.data(self.data2), header=None, names=["foo"]) def time_read_uint64_na_values(self): - read_csv(self.data(self.data1), header=None, names=['foo'], - na_values=self.na_values) + read_csv( + self.data(self.data1), header=None, names=["foo"], na_values=self.na_values + ) class ReadCSVThousands(BaseIO): - fname = '__test__.csv' - params = ([',', '|'], [None, ',']) - param_names = ['sep', 'thousands'] + fname = "__test__.csv" + params = ([",", "|"], [None, ","]) + param_names = ["sep", "thousands"] def setup(self, sep, thousands): N = 10000 @@ -178,8 +201,8 @@ def setup(self, sep, thousands): data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: - fmt = ':{}'.format(thousands) - fmt = '{' + fmt + '}' + fmt = ":{}".format(thousands) + fmt = "{" + fmt + "}" df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) @@ -188,57 +211,68 @@ def time_thousands(self, sep, thousands): class ReadCSVComment(StringIORewind): - def setup(self): - data = ['A,B,C'] + (['1,2,3 # comment'] * 100000) - self.StringIO_input = StringIO('\n'.join(data)) + data = ["A,B,C"] + (["1,2,3 # comment"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) def time_comment(self): - read_csv(self.data(self.StringIO_input), comment='#', - header=None, names=list('abc')) + read_csv( + self.data(self.StringIO_input), comment="#", header=None, names=list("abc") + ) class ReadCSVFloatPrecision(StringIORewind): - params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip']) - param_names = ['sep', 'decimal', 'float_precision'] + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) + param_names = ["sep", "decimal", "float_precision"] def setup(self, sep, decimal, float_precision): - floats = [''.join(random.choice(string.digits) for _ in range(28)) - for _ in range(15)] - rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n' + floats = [ + "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) + ] + rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) def time_read_csv(self, sep, decimal, float_precision): - read_csv(self.data(self.StringIO_input), sep=sep, header=None, - names=list('abc'), float_precision=float_precision) + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + names=list("abc"), + float_precision=float_precision, + ) def time_read_csv_python_engine(self, sep, decimal, float_precision): - 
read_csv(self.data(self.StringIO_input), sep=sep, header=None, - engine='python', float_precision=None, names=list('abc')) + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="python", + float_precision=None, + names=list("abc"), + ) class ReadCSVCategorical(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" def setup(self): N = 100000 - group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] - df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc')) + group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"] + df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc")) df.to_csv(self.fname, index=False) def time_convert_post(self): read_csv(self.fname).apply(Categorical) def time_convert_direct(self): - read_csv(self.fname, dtype='category') + read_csv(self.fname, dtype="category") class ReadCSVParseDates(StringIORewind): - def setup(self): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n @@ -246,34 +280,47 @@ def setup(self): {},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n {},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n """ - two_cols = ['KORD,19990127'] * 5 + two_cols = ["KORD,19990127"] * 5 data = data.format(*two_cols) self.StringIO_input = StringIO(data) def time_multiple_date(self): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]]) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=list(string.digits[:9]), + parse_dates=[[1, 2], [1, 3]], + ) def time_baseline(self): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - parse_dates=[1], - names=list(string.digits[:9])) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + parse_dates=[1], + names=list(string.digits[:9]), + ) class ReadCSVCachedParseDates(StringIORewind): params = ([True, False],) - param_names = ['do_cache'] + param_names = ["do_cache"] def setup(self, do_cache): - data = ('\n'.join('10/{}'.format(year) - for year in range(2000, 2100)) + '\n') * 10 + data = ( + "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n" + ) * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): try: - read_csv(self.data(self.StringIO_input), header=None, - parse_dates=[0], cache_dates=do_cache) + read_csv( + self.data(self.StringIO_input), + header=None, + parse_dates=[0], + cache_dates=do_cache, + ) except TypeError: # cache_dates is a new keyword in 0.25 pass @@ -299,12 +346,12 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (['mY', 'mdY', 'hm'],) - param_names = ['value'] + params = (["mY", "mdY", "hm"],) + param_names = ["value"] objects = { - 'mY': '01-2019\n10-2019\n02/2000\n', - 'mdY': '12/02/2010\n', - 'hm': '21:34\n' + "mY": "01-2019\n10-2019\n02/2000\n", + "mdY": "12/02/2010\n", + "hm": "21:34\n", } def setup(self, value): @@ -313,38 +360,50 @@ def setup(self, value): self.StringIO_input = StringIO(data) def time_read_special_date(self, value): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date']) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=["Date"], + parse_dates=["Date"], + ) class ParseDateComparison(StringIORewind): params = ([False, True],) - param_names = ['cache_dates'] + param_names = ["cache_dates"] def setup(self, 
cache_dates): count_elem = 10000 - data = '12-02-2010\n' * count_elem + data = "12-02-2010\n" * count_elem self.StringIO_input = StringIO(data) def time_read_csv_dayfirst(self, cache_dates): try: - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], - cache_dates=cache_dates, - dayfirst=True) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=["Date"], + parse_dates=["Date"], + cache_dates=cache_dates, + dayfirst=True, + ) except TypeError: # cache_dates is a new keyword in 0.25 pass def time_to_datetime_dayfirst(self, cache_dates): - df = read_csv(self.data(self.StringIO_input), - dtype={'date': str}, names=['date']) - to_datetime(df['date'], cache=cache_dates, dayfirst=True) + df = read_csv( + self.data(self.StringIO_input), dtype={"date": str}, names=["date"] + ) + to_datetime(df["date"], cache=cache_dates, dayfirst=True) def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): - df = read_csv(self.data(self.StringIO_input), - dtype={'date': str}, names=['date']) - to_datetime(df['date'], cache=cache_dates, format='%d-%m-%Y') + df = read_csv( + self.data(self.StringIO_input), dtype={"date": str}, names=["date"] + ) + to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 1decb83f2f7238..12e70f84e52038 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -6,19 +6,21 @@ class Excel: - params = ['openpyxl', 'xlsxwriter', 'xlwt'] - param_names = ['engine'] + params = ["openpyxl", "xlsxwriter", "xlwt"] + param_names = ["engine"] def setup(self, engine): N = 2000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.bio_read = BytesIO() self.writer_read = ExcelWriter(self.bio_read, engine=engine) - self.df.to_excel(self.writer_read, sheet_name='Sheet1') + self.df.to_excel(self.writer_read, sheet_name="Sheet1") self.writer_read.save() self.bio_read.seek(0) @@ -29,7 +31,7 @@ def time_write_excel(self, engine): bio_write = BytesIO() bio_write.seek(0) writer_write = ExcelWriter(bio_write, engine=engine) - self.df.to_excel(writer_write, sheet_name='Sheet1') + self.df.to_excel(writer_write, sheet_name="Sheet1") writer_write.save() diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index a5dc28eb9508c9..2874a7889156bf 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -6,86 +6,92 @@ class HDFStoreDataFrame(BaseIO): - def setup(self): N = 25000 index = tm.makeStringIndex(N) - self.df = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=index) - self.df_mixed = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N), - 'string1': ['foo'] * N, - 'bool1': [True] * N, - 'int1': np.random.randint(0, N, size=N)}, - index=index) + self.df = DataFrame( + {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index + ) + self.df_mixed = DataFrame( + { + "float1": np.random.randn(N), + "float2": np.random.randn(N), + "string1": ["foo"] * N, + "bool1": [True] * N, + "int1": np.random.randint(0, 
N, size=N), + }, + index=index, + ) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] - self.df2 = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=date_range('1/1/2000', periods=N)) + self.df2 = DataFrame( + {"float1": np.random.randn(N), "float2": np.random.randn(N)}, + index=date_range("1/1/2000", periods=N), + ) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] - self.df_wide2 = DataFrame(np.random.randn(N, 100), - index=date_range('1/1/2000', periods=N)) - self.df_dc = DataFrame(np.random.randn(N, 10), - columns=['C%03d' % i for i in range(10)]) + self.df_wide2 = DataFrame( + np.random.randn(N, 100), index=date_range("1/1/2000", periods=N) + ) + self.df_dc = DataFrame( + np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)] + ) - self.fname = '__test__.h5' + self.fname = "__test__.h5" self.store = HDFStore(self.fname) - self.store.put('fixed', self.df) - self.store.put('fixed_mixed', self.df_mixed) - self.store.append('table', self.df2) - self.store.append('table_mixed', self.df_mixed) - self.store.append('table_wide', self.df_wide) - self.store.append('table_wide2', self.df_wide2) + self.store.put("fixed", self.df) + self.store.put("fixed_mixed", self.df_mixed) + self.store.append("table", self.df2) + self.store.append("table_mixed", self.df_mixed) + self.store.append("table_wide", self.df_wide) + self.store.append("table_wide2", self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): - self.store.get('fixed') + self.store.get("fixed") def time_read_store_mixed(self): - self.store.get('fixed_mixed') + self.store.get("fixed_mixed") def time_write_store(self): - self.store.put('fixed_write', self.df) + self.store.put("fixed_write", self.df) def time_write_store_mixed(self): - self.store.put('fixed_mixed_write', self.df_mixed) + self.store.put("fixed_mixed_write", self.df_mixed) def time_read_store_table_mixed(self): - self.store.select('table_mixed') + self.store.select("table_mixed") def time_write_store_table_mixed(self): - self.store.append('table_mixed_write', self.df_mixed) + self.store.append("table_mixed_write", self.df_mixed) def time_read_store_table(self): - self.store.select('table') + self.store.select("table") def time_write_store_table(self): - self.store.append('table_write', self.df) + self.store.append("table_write", self.df) def time_read_store_table_wide(self): - self.store.select('table_wide') + self.store.select("table_wide") def time_write_store_table_wide(self): - self.store.append('table_wide_write', self.df_wide) + self.store.append("table_wide_write", self.df_wide) def time_write_store_table_dc(self): - self.store.append('table_dc_write', self.df_dc, data_columns=True) + self.store.append("table_dc_write", self.df_dc, data_columns=True) def time_query_store_table_wide(self): - self.store.select('table_wide', where="index > self.start_wide and " - "index < self.stop_wide") + self.store.select( + "table_wide", where="index > self.start_wide and " "index < self.stop_wide" + ) def time_query_store_table(self): - self.store.select('table', where="index > self.start and " - "index < self.stop") + self.store.select("table", where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) @@ -99,24 +105,26 @@ def time_store_info(self): class HDF(BaseIO): - params = ['table', 'fixed'] - param_names = ['format'] + params = ["table", "fixed"] + 
param_names = ["format"] def setup(self, format): - self.fname = '__test__.h5' + self.fname = "__test__.h5" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) - self.df.to_hdf(self.fname, 'df', format=format) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) + self.df.to_hdf(self.fname, "df", format=format) def time_read_hdf(self, format): - read_hdf(self.fname, 'df') + read_hdf(self.fname, "df") def time_write_hdf(self, format): - self.df.to_hdf(self.fname, 'df', format=format) + self.df.to_hdf(self.fname, "df", format=format) from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 19d11e66101987..0ce42856fb14ab 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -8,16 +8,20 @@ class ReadJSON(BaseIO): fname = "__test__.json" - params = (['split', 'index', 'records'], ['int', 'datetime']) - param_names = ['orient', 'index'] + params = (["split", "index", "records"], ["int", "datetime"]) + param_names = ["orient", "index"] def setup(self, orient, index): N = 100000 - indexes = {'int': np.arange(N), - 'datetime': date_range('20000101', periods=N, freq='H')} - df = DataFrame(np.random.randn(N, 5), - columns=['float_{}'.format(i) for i in range(5)], - index=indexes[index]) + indexes = { + "int": np.arange(N), + "datetime": date_range("20000101", periods=N, freq="H"), + } + df = DataFrame( + np.random.randn(N, 5), + columns=["float_{}".format(i) for i in range(5)], + index=indexes[index], + ) df.to_json(self.fname, orient=orient) def time_read_json(self, orient, index): @@ -27,71 +31,85 @@ def time_read_json(self, orient, index): class ReadJSONLines(BaseIO): fname = "__test_lines__.json" - params = ['int', 'datetime'] - param_names = ['index'] + params = ["int", "datetime"] + param_names = ["index"] def setup(self, index): N = 100000 - indexes = {'int': np.arange(N), - 'datetime': date_range('20000101', periods=N, freq='H')} - df = DataFrame(np.random.randn(N, 5), - columns=['float_{}'.format(i) for i in range(5)], - index=indexes[index]) - df.to_json(self.fname, orient='records', lines=True) + indexes = { + "int": np.arange(N), + "datetime": date_range("20000101", periods=N, freq="H"), + } + df = DataFrame( + np.random.randn(N, 5), + columns=["float_{}".format(i) for i in range(5)], + index=indexes[index], + ) + df.to_json(self.fname, orient="records", lines=True) def time_read_json_lines(self, index): - read_json(self.fname, orient='records', lines=True) + read_json(self.fname, orient="records", lines=True) def time_read_json_lines_concat(self, index): - concat(read_json(self.fname, orient='records', lines=True, - chunksize=25000)) + concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) def peakmem_read_json_lines(self, index): - read_json(self.fname, orient='records', lines=True) + read_json(self.fname, orient="records", lines=True) def peakmem_read_json_lines_concat(self, index): - concat(read_json(self.fname, orient='records', lines=True, - chunksize=25000)) + concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) class ToJSON(BaseIO): fname = "__test__.json" - params = ['split', 'columns', 'index'] - param_names = ['orient'] + 
params = ["split", "columns", "index"] + param_names = ["orient"] def setup(self, lines_orient): - N = 10**5 + N = 10 ** 5 ncols = 5 - index = date_range('20000101', periods=N, freq='H') - timedeltas = timedelta_range(start=1, periods=N, freq='s') - datetimes = date_range(start=1, periods=N, freq='s') + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) - self.df_td_int_ts = DataFrame({'td_1': timedeltas, - 'td_2': timedeltas, - 'int_1': ints, - 'int_2': ints, - 'ts_1': datetimes, - 'ts_2': datetimes}, - index=index) - self.df_int_floats = DataFrame({'int_1': ints, - 'int_2': ints, - 'int_3': ints, - 'float_1': floats, - 'float_2': floats, - 'float_3': floats}, - index=index) - self.df_int_float_str = DataFrame({'int_1': ints, - 'int_2': ints, - 'float_1': floats, - 'float_2': floats, - 'str_1': strings, - 'str_2': strings}, - index=index) + self.df_td_int_ts = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "int_1": ints, + "int_2": ints, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + self.df_int_floats = DataFrame( + { + "int_1": ints, + "int_2": ints, + "int_3": ints, + "float_1": floats, + "float_2": floats, + "float_3": floats, + }, + index=index, + ) + self.df_int_float_str = DataFrame( + { + "int_1": ints, + "int_2": ints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) def time_floats_with_int_index(self, orient): self.df.to_json(self.fname, orient=orient) @@ -109,39 +127,35 @@ def time_float_int_str(self, orient): self.df_int_float_str.to_json(self.fname, orient=orient) def time_floats_with_int_idex_lines(self, orient): - self.df.to_json(self.fname, orient='records', lines=True) + self.df.to_json(self.fname, orient="records", lines=True) def time_floats_with_dt_index_lines(self, orient): - self.df_date_idx.to_json(self.fname, orient='records', lines=True) + self.df_date_idx.to_json(self.fname, orient="records", lines=True) def time_delta_int_tstamp_lines(self, orient): - self.df_td_int_ts.to_json(self.fname, orient='records', lines=True) + self.df_td_int_ts.to_json(self.fname, orient="records", lines=True) def time_float_int_lines(self, orient): - self.df_int_floats.to_json(self.fname, orient='records', lines=True) + self.df_int_floats.to_json(self.fname, orient="records", lines=True) def time_float_int_str_lines(self, orient): - self.df_int_float_str.to_json(self.fname, orient='records', lines=True) + self.df_int_float_str.to_json(self.fname, orient="records", lines=True) class ToJSONMem: - def setup_cache(self): df = DataFrame([[1]]) - frames = { - 'int': df, - 'float': df.astype(float), - } + frames = {"int": df, "float": df.astype(float)} return frames def peakmem_int(self, frames): - df = frames['int'] + df = frames["int"] for _ in range(100_000): df.to_json() def peakmem_float(self, frames): - df = frames['float'] + df = frames["float"] for _ in range(100_000): df.to_json() diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index dc2642d920fd07..c43df7c2e91eda 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -6,15 +6,16 @@ class MSGPack(BaseIO): - def 
setup(self): - self.fname = '__test__.msg' + self.fname = "__test__.msg" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.df.to_msgpack(self.fname) def time_read_msgpack(self): diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index edba0358c821ae..40256e043a0087 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,7 +2,9 @@ try: from pandas._libs.tslibs.parsing import ( - _concat_date_cols, _does_string_look_like_datetime) + _concat_date_cols, + _does_string_look_like_datetime, + ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass @@ -10,8 +12,8 @@ class DoesStringLookLikeDatetime(object): - params = (['2Q2005', '0.0', '10000'],) - param_names = ['value'] + params = (["2Q2005", "0.0", "10000"],) + param_names = ["value"] def setup(self, value): self.objects = [value] * 1000000 @@ -23,16 +25,18 @@ def time_check_datetimes(self, value): class ConcatDateCols(object): - params = ([1234567890, 'AAAA'], [1, 2]) - param_names = ['value', 'dim'] + params = ([1234567890, "AAAA"], [1, 2]) + param_names = ["value", "dim"] def setup(self, value, dim): count_elem = 10000 if dim == 1: self.object = (np.array([value] * count_elem),) if dim == 2: - self.object = (np.array([value] * count_elem), - np.array([value] * count_elem)) + self.object = ( + np.array([value] * count_elem), + np.array([value] * count_elem), + ) def time_check_concat(self, value, dim): _concat_date_cols(self.object) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 74a58bbb946aaa..286ac767c02e7e 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -6,15 +6,16 @@ class Pickle(BaseIO): - def setup(self): - self.fname = '__test__.pkl' + self.fname = "__test__.pkl" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.df.to_pickle(self.fname) def time_read_pickle(self): diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 8181f1d41ac70b..7ce8ef8c126395 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -5,15 +5,25 @@ class SAS: - params = ['sas7bdat', 'xport'] - param_names = ['format'] + params = ["sas7bdat", "xport"] + param_names = ["format"] def setup(self, format): # Read files that are located in 'pandas/io/tests/sas/data' - files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'} + files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"} file = files[format] - paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas', - 'tests', 'io', 'sas', 'data', file] + paths = [ + os.path.dirname(__file__), + "..", + "..", + "..", + "pandas", + "tests", + "io", + "sas", + "data", + file, + ] self.f = os.path.join(*paths) def 
time_read_msgpack(self, format): diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index ee48f3bd0a3ab5..b80872b17a9e4a 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -8,31 +8,35 @@ class SQL: - params = ['sqlalchemy', 'sqlite'] - param_names = ['connection'] + params = ["sqlalchemy", "sqlite"] + param_names = ["connection"] def setup(self, connection): N = 10000 - con = {'sqlalchemy': create_engine('sqlite:///:memory:'), - 'sqlite': sqlite3.connect(':memory:')} - self.table_name = 'test_type' - self.query_all = 'SELECT * FROM {}'.format(self.table_name) + con = { + "sqlalchemy": create_engine("sqlite:///:memory:"), + "sqlite": sqlite3.connect(":memory:"), + } + self.table_name = "test_type" + self.query_all = "SELECT * FROM {}".format(self.table_name) self.con = con[connection] - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_to_sql_dataframe(self, connection): - self.df.to_sql('test1', self.con, if_exists='replace') + self.df.to_sql("test1", self.con, if_exists="replace") def time_read_sql_query(self, connection): read_sql_query(self.query_all, self.con) @@ -40,85 +44,98 @@ def time_read_sql_query(self, connection): class WriteSQLDtypes: - params = (['sqlalchemy', 'sqlite'], - ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']) - param_names = ['connection', 'dtype'] + params = ( + ["sqlalchemy", "sqlite"], + ["float", "float_with_nan", "string", "bool", "int", "datetime"], + ) + param_names = ["connection", "dtype"] def setup(self, connection, dtype): N = 10000 - con = {'sqlalchemy': create_engine('sqlite:///:memory:'), - 'sqlite': sqlite3.connect(':memory:')} - self.table_name = 'test_type' - self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name) + con = { + "sqlalchemy": create_engine("sqlite:///:memory:"), + "sqlite": sqlite3.connect(":memory:"), + } + self.table_name = "test_type" + self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name) self.con = con[connection] - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": 
date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_to_sql_dataframe_column(self, connection, dtype): - self.df[[dtype]].to_sql('test1', self.con, if_exists='replace') + self.df[[dtype]].to_sql("test1", self.con, if_exists="replace") def time_read_sql_query_select_column(self, connection, dtype): read_sql_query(self.query_col, self.con) class ReadSQLTable: - def setup(self): N = 10000 - self.table_name = 'test' - self.con = create_engine('sqlite:///:memory:') - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.table_name = "test" + self.con = create_engine("sqlite:///:memory:") + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_read_sql_table_all(self): read_sql_table(self.table_name, self.con) def time_read_sql_table_parse_dates(self): - read_sql_table(self.table_name, self.con, columns=['datetime_string'], - parse_dates=['datetime_string']) + read_sql_table( + self.table_name, + self.con, + columns=["datetime_string"], + parse_dates=["datetime_string"], + ) class ReadSQLTableDtypes: - params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] - param_names = ['dtype'] + params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + param_names = ["dtype"] def setup(self, dtype): N = 10000 - self.table_name = 'test' - self.con = create_engine('sqlite:///:memory:') - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.table_name = "test" + self.con = create_engine("sqlite:///:memory:") + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) diff --git a/asv_bench/benchmarks/io/stata.py 
b/asv_bench/benchmarks/io/stata.py index fff10cf10a4d31..b3ed71af47dc8b 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -7,26 +7,30 @@ class Stata(BaseIO): - params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] - param_names = ['convert_dates'] + params = ["tc", "td", "tm", "tw", "th", "tq", "ty"] + param_names = ["convert_dates"] def setup(self, convert_dates): - self.fname = '__test__.dta' + self.fname = "__test__.dta" N = self.N = 100000 C = self.C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(self.N) - self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min, - np.iinfo(np.int8).max - 27, N) - self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max - 27, N) - self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min, - np.iinfo(np.int32).max - 27, N) - self.df['float32_'] = np.array(np.random.randn(N), - dtype=np.float32) - self.convert_dates = {'index': convert_dates} + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(self.N) + self.df["int8_"] = np.random.randint( + np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N + ) + self.df["int16_"] = np.random.randint( + np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27, N + ) + self.df["int32_"] = np.random.randint( + np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27, N + ) + self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32) + self.convert_dates = {"index": convert_dates} self.df.to_stata(self.fname, self.convert_dates) def time_read_stata(self, convert_dates): @@ -42,7 +46,7 @@ def setup(self, convert_dates): for i in range(10): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan - self.df['missing_{0}'.format(i)] = missing_data + self.df["missing_{0}".format(i)] = missing_data self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index bbaba9909966ee..7c899e3dc6ac8a 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -2,8 +2,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, - date_range, concat, merge, merge_asof) +from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -12,16 +11,14 @@ class Append: - def setup(self): - self.df1 = DataFrame(np.random.randn(10000, 4), - columns=['A', 'B', 'C', 'D']) + self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"]) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() - self.mdf1['obj1'] = 'bar' - self.mdf1['obj2'] = 'bar' - self.mdf1['int1'] = 5 + self.mdf1["obj1"] = "bar" + self.mdf1["obj2"] = "bar" + self.mdf1["int1"] = 5 self.mdf1 = self.mdf1._consolidate() self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index @@ -36,15 +33,16 @@ def time_append_mixed(self): class Concat: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): N = 1000 s = Series(N, index=tm.makeStringIndex(N)) - self.series = [s[i:- i] for i in range(1, 10)] * 50 + self.series = [s[i:-i] for i in range(1, 10)] * 50 self.small_frames = [DataFrame(np.random.randn(5, 
4))] * 1000 - df = DataFrame({'A': range(N)}, - index=date_range('20130101', periods=N, freq='s')) + df = DataFrame( + {"A": range(N)}, index=date_range("20130101", periods=N, freq="s") + ) self.empty_left = [DataFrame(), df] self.empty_right = [df, DataFrame()] self.mixed_ndims = [df, df.head(N // 2)] @@ -68,14 +66,12 @@ def time_concat_mixed_ndims(self, axis): class ConcatDataFrames: params = ([0, 1], [True, False]) - param_names = ['axis', 'ignore_index'] + param_names = ["axis", "ignore_index"] def setup(self, axis, ignore_index): - frame_c = DataFrame(np.zeros((10000, 200), - dtype=np.float32, order='C')) + frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C")) self.frame_c = [frame_c] * 20 - frame_f = DataFrame(np.zeros((10000, 200), - dtype=np.float32, order='F')) + frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F")) self.frame_f = [frame_f] * 20 def time_c_ordered(self, axis, ignore_index): @@ -88,74 +84,78 @@ def time_f_ordered(self, axis, ignore_index): class Join: params = [True, False] - param_names = ['sort'] + param_names = ["sort"] def setup(self, sort): level1 = tm.makeStringIndex(10).values level2 = tm.makeStringIndex(1000).values codes1 = np.arange(10).repeat(1000) codes2 = np.tile(np.arange(1000), 10) - index2 = MultiIndex(levels=[level1, level2], - codes=[codes1, codes2]) - self.df_multi = DataFrame(np.random.randn(len(index2), 4), - index=index2, - columns=['A', 'B', 'C', 'D']) + index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) + self.df_multi = DataFrame( + np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"] + ) self.key1 = np.tile(level1.take(codes1), 10) self.key2 = np.tile(level2.take(codes2), 10) - self.df = DataFrame({'data1': np.random.randn(100000), - 'data2': np.random.randn(100000), - 'key1': self.key1, - 'key2': self.key2}) - - self.df_key1 = DataFrame(np.random.randn(len(level1), 4), - index=level1, - columns=['A', 'B', 'C', 'D']) - self.df_key2 = DataFrame(np.random.randn(len(level2), 4), - index=level2, - columns=['A', 'B', 'C', 'D']) + self.df = DataFrame( + { + "data1": np.random.randn(100000), + "data2": np.random.randn(100000), + "key1": self.key1, + "key2": self.key2, + } + ) + + self.df_key1 = DataFrame( + np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"] + ) + self.df_key2 = DataFrame( + np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"] + ) shuf = np.arange(100000) np.random.shuffle(shuf) self.df_shuf = self.df.reindex(self.df.index[shuf]) def time_join_dataframe_index_multi(self, sort): - self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort) + self.df.join(self.df_multi, on=["key1", "key2"], sort=sort) def time_join_dataframe_index_single_key_bigger(self, sort): - self.df.join(self.df_key2, on='key2', sort=sort) + self.df.join(self.df_key2, on="key2", sort=sort) def time_join_dataframe_index_single_key_small(self, sort): - self.df.join(self.df_key1, on='key1', sort=sort) + self.df.join(self.df_key1, on="key1", sort=sort) def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): - self.df_shuf.join(self.df_key2, on='key2', sort=sort) + self.df_shuf.join(self.df_key2, on="key2", sort=sort) class JoinIndex: - def setup(self): N = 50000 - self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)), - columns=['jim', 'joe']) - self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)), - columns=['jolie', 'jolia']).set_index('jolie') + self.left = DataFrame( + np.random.randint(1, N / 500, (N, 2)), 
columns=["jim", "joe"] + ) + self.right = DataFrame( + np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"] + ).set_index("jolie") def time_left_outer_join_index(self): - self.left.join(self.right, on='jim') + self.left.join(self.right, on="jim") class JoinNonUnique: # outer join of non-unique # GH 6329 def setup(self): - date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') - daily_dates = date_index.to_period('D').to_timestamp('S', 'S') + date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T") + daily_dates = date_index.to_period("D").to_timestamp("S", "S") self.fracofday = date_index.values - daily_dates.values - self.fracofday = self.fracofday.astype('timedelta64[ns]') + self.fracofday = self.fracofday.astype("timedelta64[ns]") self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0 self.fracofday = Series(self.fracofday, daily_dates) - index = date_range(date_index.min(), date_index.max(), freq='D') + index = date_range(date_index.min(), date_index.max(), freq="D") self.temp = Series(1.0, index)[self.fracofday.index] def time_join_non_unique_equal(self): @@ -165,7 +165,7 @@ def time_join_non_unique_equal(self): class Merge: params = [True, False] - param_names = ['sort'] + param_names = ["sort"] def setup(self, sort): N = 10000 @@ -173,17 +173,25 @@ def setup(self, sort): indices2 = tm.makeStringIndex(N).values key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) - self.left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) - self.right = DataFrame({'key': indices[2000:], - 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - - self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), - 'key2': np.tile(np.arange(250).repeat(10), 4), - 'value': np.random.randn(10000)}) - self.df2 = DataFrame({'key1': np.arange(500), - 'value2': np.random.randn(500)}) + self.left = DataFrame( + {"key": key, "key2": key2, "value": np.random.randn(80000)} + ) + self.right = DataFrame( + { + "key": indices[2000:], + "key2": indices2[2000:], + "value2": np.random.randn(8000), + } + ) + + self.df = DataFrame( + { + "key1": np.tile(np.arange(500).repeat(10), 2), + "key2": np.tile(np.arange(250).repeat(10), 4), + "value": np.random.randn(10000), + } + ) + self.df2 = DataFrame({"key1": np.arange(500), "value2": np.random.randn(500)}) self.df3 = self.df[:5000] def time_merge_2intkey(self, sort): @@ -193,125 +201,141 @@ def time_merge_dataframe_integer_2key(self, sort): merge(self.df, self.df3, sort=sort) def time_merge_dataframe_integer_key(self, sort): - merge(self.df, self.df2, on='key1', sort=sort) + merge(self.df, self.df2, on="key1", sort=sort) class I8Merge: - params = ['inner', 'outer', 'left', 'right'] - param_names = ['how'] + params = ["inner", "outer", "left", "right"] + param_names = ["how"] def setup(self, how): - low, high, n = -1000, 1000, 10**6 - self.left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - self.left['left'] = self.left.sum(axis=1) - self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1) + low, high, n = -1000, 1000, 10 ** 6 + self.left = DataFrame( + np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG") + ) + self.left["left"] = self.left.sum(axis=1) + self.right = self.left.sample(frac=1).rename({"left": "right"}, axis=1) self.right = self.right.reset_index(drop=True) - self.right['right'] *= -1 + self.right["right"] *= -1 def time_i8merge(self, how): merge(self.left, self.right, how=how) class MergeCategoricals: - def 
setup(self): self.left_object = DataFrame( - {'X': np.random.choice(range(0, 10), size=(10000,)), - 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))}) + { + "X": np.random.choice(range(0, 10), size=(10000,)), + "Y": np.random.choice(["one", "two", "three"], size=(10000,)), + } + ) self.right_object = DataFrame( - {'X': np.random.choice(range(0, 10), size=(10000,)), - 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))}) + { + "X": np.random.choice(range(0, 10), size=(10000,)), + "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), + } + ) self.left_cat = self.left_object.assign( - Y=self.left_object['Y'].astype('category')) + Y=self.left_object["Y"].astype("category") + ) self.right_cat = self.right_object.assign( - Z=self.right_object['Z'].astype('category')) + Z=self.right_object["Z"].astype("category") + ) def time_merge_object(self): - merge(self.left_object, self.right_object, on='X') + merge(self.left_object, self.right_object, on="X") def time_merge_cat(self): - merge(self.left_cat, self.right_cat, on='X') + merge(self.left_cat, self.right_cat, on="X") class MergeOrdered: - def setup(self): groups = tm.makeStringIndex(10).values - self.left = DataFrame({'group': groups.repeat(5000), - 'key': np.tile(np.arange(0, 10000, 2), 10), - 'lvalue': np.random.randn(50000)}) - self.right = DataFrame({'key': np.arange(10000), - 'rvalue': np.random.randn(10000)}) + self.left = DataFrame( + { + "group": groups.repeat(5000), + "key": np.tile(np.arange(0, 10000, 2), 10), + "lvalue": np.random.randn(50000), + } + ) + self.right = DataFrame( + {"key": np.arange(10000), "rvalue": np.random.randn(10000)} + ) def time_merge_ordered(self): - merge_ordered(self.left, self.right, on='key', left_by='group') + merge_ordered(self.left, self.right, on="key", left_by="group") class MergeAsof: - params = [['backward', 'forward', 'nearest']] - param_names = ['direction'] + params = [["backward", "forward", "nearest"]] + param_names = ["direction"] def setup(self, direction): one_count = 200000 two_count = 1000000 df1 = DataFrame( - {'time': np.random.randint(0, one_count / 20, one_count), - 'key': np.random.choice(list(string.ascii_uppercase), one_count), - 'key2': np.random.randint(0, 25, one_count), - 'value1': np.random.randn(one_count)}) + { + "time": np.random.randint(0, one_count / 20, one_count), + "key": np.random.choice(list(string.ascii_uppercase), one_count), + "key2": np.random.randint(0, 25, one_count), + "value1": np.random.randn(one_count), + } + ) df2 = DataFrame( - {'time': np.random.randint(0, two_count / 20, two_count), - 'key': np.random.choice(list(string.ascii_uppercase), two_count), - 'key2': np.random.randint(0, 25, two_count), - 'value2': np.random.randn(two_count)}) - - df1 = df1.sort_values('time') - df2 = df2.sort_values('time') - - df1['time32'] = np.int32(df1.time) - df2['time32'] = np.int32(df2.time) - - self.df1a = df1[['time', 'value1']] - self.df2a = df2[['time', 'value2']] - self.df1b = df1[['time', 'key', 'value1']] - self.df2b = df2[['time', 'key', 'value2']] - self.df1c = df1[['time', 'key2', 'value1']] - self.df2c = df2[['time', 'key2', 'value2']] - self.df1d = df1[['time32', 'value1']] - self.df2d = df2[['time32', 'value2']] - self.df1e = df1[['time', 'key', 'key2', 'value1']] - self.df2e = df2[['time', 'key', 'key2', 'value2']] + { + "time": np.random.randint(0, two_count / 20, two_count), + "key": np.random.choice(list(string.ascii_uppercase), two_count), + "key2": np.random.randint(0, 25, two_count), + "value2": np.random.randn(two_count), + 
} + ) + + df1 = df1.sort_values("time") + df2 = df2.sort_values("time") + + df1["time32"] = np.int32(df1.time) + df2["time32"] = np.int32(df2.time) + + self.df1a = df1[["time", "value1"]] + self.df2a = df2[["time", "value2"]] + self.df1b = df1[["time", "key", "value1"]] + self.df2b = df2[["time", "key", "value2"]] + self.df1c = df1[["time", "key2", "value1"]] + self.df2c = df2[["time", "key2", "value2"]] + self.df1d = df1[["time32", "value1"]] + self.df2d = df2[["time32", "value2"]] + self.df1e = df1[["time", "key", "key2", "value1"]] + self.df2e = df2[["time", "key", "key2", "value2"]] def time_on_int(self, direction): - merge_asof(self.df1a, self.df2a, on='time', direction=direction) + merge_asof(self.df1a, self.df2a, on="time", direction=direction) def time_on_int32(self, direction): - merge_asof(self.df1d, self.df2d, on='time32', direction=direction) + merge_asof(self.df1d, self.df2d, on="time32", direction=direction) def time_by_object(self, direction): - merge_asof(self.df1b, self.df2b, on='time', by='key', - direction=direction) + merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction) def time_by_int(self, direction): - merge_asof(self.df1c, self.df2c, on='time', by='key2', - direction=direction) + merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction) def time_multiby(self, direction): - merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'], - direction=direction) + merge_asof( + self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction + ) class Align: - def setup(self): - size = 5 * 10**5 - rng = np.arange(0, 10**13, 10**7) - stamps = np.datetime64('now').view('i8') + rng + size = 5 * 10 ** 5 + rng = np.arange(0, 10 ** 13, 10 ** 7) + stamps = np.datetime64("now").view("i8") + rng idx1 = np.sort(np.random.choice(stamps, size, replace=False)) idx2 = np.sort(np.random.choice(stamps, size, replace=False)) self.ts1 = Series(np.random.randn(size), idx1) @@ -321,7 +345,7 @@ def time_series_align_int64_index(self): self.ts1 + self.ts2 def time_series_align_left_monotonic(self): - self.ts1.align(self.ts2, join='left') + self.ts1.align(self.ts2, join="left") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index c979ba6d53a08d..eda059a68e8a58 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -6,46 +6,44 @@ class GetLoc: - def setup(self): self.mi_large = MultiIndex.from_product( [np.arange(1000), np.arange(20), list(string.ascii_letters)], - names=['one', 'two', 'three']) + names=["one", "two", "three"], + ) self.mi_med = MultiIndex.from_product( - [np.arange(1000), np.arange(10), list('A')], - names=['one', 'two', 'three']) + [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"] + ) self.mi_small = MultiIndex.from_product( - [np.arange(100), list('A'), list('A')], - names=['one', 'two', 'three']) + [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] + ) def time_large_get_loc(self): - self.mi_large.get_loc((999, 19, 'Z')) + self.mi_large.get_loc((999, 19, "Z")) def time_large_get_loc_warm(self): for _ in range(1000): - self.mi_large.get_loc((999, 19, 'Z')) + self.mi_large.get_loc((999, 19, "Z")) def time_med_get_loc(self): - self.mi_med.get_loc((999, 9, 'A')) + self.mi_med.get_loc((999, 9, "A")) def time_med_get_loc_warm(self): for _ in range(1000): - self.mi_med.get_loc((999, 9, 'A')) + self.mi_med.get_loc((999, 9, "A")) def 
time_string_get_loc(self): - self.mi_small.get_loc((99, 'A', 'A')) + self.mi_small.get_loc((99, "A", "A")) def time_small_get_loc_warm(self): for _ in range(1000): - self.mi_small.get_loc((99, 'A', 'A')) + self.mi_small.get_loc((99, "A", "A")) class Duplicates: - def setup(self): size = 65536 - arrays = [np.random.randint(0, 8192, size), - np.random.randint(0, 1024, size)] + arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)] mask = np.random.rand(size) < 0.1 self.mi_unused_levels = MultiIndex.from_arrays(arrays) self.mi_unused_levels = self.mi_unused_levels[mask] @@ -55,15 +53,25 @@ def time_remove_unused_levels(self): class Integer: - def setup(self): - self.mi_int = MultiIndex.from_product([np.arange(1000), - np.arange(1000)], - names=['one', 'two']) - self.obj_index = np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object) + self.mi_int = MultiIndex.from_product( + [np.arange(1000), np.arange(1000)], names=["one", "two"] + ) + self.obj_index = np.array( + [ + (0, 10), + (0, 11), + (0, 12), + (0, 13), + (0, 14), + (0, 15), + (0, 16), + (0, 17), + (0, 18), + (0, 19), + ], + dtype=object, + ) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) @@ -73,12 +81,9 @@ def time_is_monotonic(self): class Duplicated: - def setup(self): n, k = 200, 5000 - levels = [np.arange(n), - tm.makeStringIndex(n).values, - 1000 + np.arange(n)] + levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] codes = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, codes=codes) @@ -87,12 +92,13 @@ def time_duplicated(self): class Sortlevel: - def setup(self): n = 1182720 low, high = -4096, 4096 - arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) - for k in [11, 7, 5, 3, 1]] + arrs = [ + np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 3, 1] + ] self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] a = np.repeat(np.arange(100), 1000) @@ -111,11 +117,10 @@ def time_sortlevel_one(self): class Values: - def setup_cache(self): level1 = range(1000) - level2 = date_range(start='1/1/2012', periods=100) + level2 = date_range(start="1/1/2012", periods=100) mi = MultiIndex.from_product([level1, level2]) return mi @@ -127,17 +132,18 @@ def time_datetime_level_values_sliced(self, mi): class CategoricalLevel: - def setup(self): - self.df = DataFrame({ - 'a': np.arange(1_000_000, dtype=np.int32), - 'b': np.arange(1_000_000, dtype=np.int64), - 'c': np.arange(1_000_000, dtype=float), - }).astype({'a': 'category', 'b': 'category'}) + self.df = DataFrame( + { + "a": np.arange(1_000_000, dtype=np.int32), + "b": np.arange(1_000_000, dtype=np.int64), + "c": np.arange(1_000_000, dtype=float), + } + ).astype({"a": "category", "b": "category"}) def time_categorical_level(self): - self.df.set_index(['a', 'b']) + self.df.set_index(["a", "b"]) from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 9b738e699a5b3d..31c3b6fb6cb60a 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd + try: import pandas.tseries.holiday # noqa except ImportError: @@ -10,35 +11,43 @@ hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplimentedError with .apply_index() -non_apply = [pd.offsets.Day(), - pd.offsets.BYearEnd(), - pd.offsets.BYearBegin(), - 
pd.offsets.BQuarterEnd(), - pd.offsets.BQuarterBegin(), - pd.offsets.BMonthEnd(), - pd.offsets.BMonthBegin(), - pd.offsets.CustomBusinessDay(), - pd.offsets.CustomBusinessDay(calendar=hcal), - pd.offsets.CustomBusinessMonthBegin(calendar=hcal), - pd.offsets.CustomBusinessMonthEnd(calendar=hcal), - pd.offsets.CustomBusinessMonthEnd(calendar=hcal)] -other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(), - pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(), - pd.offsets.MonthEnd(), pd.offsets.MonthBegin(), - pd.offsets.DateOffset(months=2, days=2), - pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(), - pd.offsets.SemiMonthBegin()] +non_apply = [ + pd.offsets.Day(), + pd.offsets.BYearEnd(), + pd.offsets.BYearBegin(), + pd.offsets.BQuarterEnd(), + pd.offsets.BQuarterBegin(), + pd.offsets.BMonthEnd(), + pd.offsets.BMonthBegin(), + pd.offsets.CustomBusinessDay(), + pd.offsets.CustomBusinessDay(calendar=hcal), + pd.offsets.CustomBusinessMonthBegin(calendar=hcal), + pd.offsets.CustomBusinessMonthEnd(calendar=hcal), + pd.offsets.CustomBusinessMonthEnd(calendar=hcal), +] +other_offsets = [ + pd.offsets.YearEnd(), + pd.offsets.YearBegin(), + pd.offsets.QuarterEnd(), + pd.offsets.QuarterBegin(), + pd.offsets.MonthEnd(), + pd.offsets.MonthBegin(), + pd.offsets.DateOffset(months=2, days=2), + pd.offsets.BusinessDay(), + pd.offsets.SemiMonthEnd(), + pd.offsets.SemiMonthBegin(), +] offsets = non_apply + other_offsets class ApplyIndex: params = other_offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 10000 - self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + self.rng = pd.date_range(start="1/1/2000", periods=N, freq="T") def time_apply_index(self, offset): offset.apply_index(self.rng) @@ -47,13 +56,15 @@ def time_apply_index(self, offset): class OnOffset: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): - self.dates = [datetime(2016, m, d) - for m in [10, 11, 12] - for d in [1, 2, 3, 28, 29, 30, 31] - if not (m == 11 and d == 31)] + self.dates = [ + datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31) + ] def time_on_offset(self, offset): for date in self.dates: @@ -63,11 +74,11 @@ def time_on_offset(self, offset): class OffsetSeriesArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 1000 - rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + rng = pd.date_range(start="1/1/2000", periods=N, freq="T") self.data = pd.Series(rng) def time_add_offset(self, offset): @@ -78,11 +89,11 @@ def time_add_offset(self, offset): class OffsetDatetimeIndexArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 1000 - self.data = pd.date_range(start='1/1/2000', periods=N, freq='T') + self.data = pd.date_range(start="1/1/2000", periods=N, freq="T") def time_add_offset(self, offset): with warnings.catch_warnings(record=True): @@ -92,11 +103,11 @@ def time_add_offset(self, offset): class OffestDatetimeArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): self.date = datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.dt64 = np.datetime64("2011-01-01 09:00Z") def time_apply(self, offset): offset.apply(self.date) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 59b1638920666c..fdc8207021c0f3 100644 --- 
a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -5,26 +5,42 @@ import pandas as pd # Compatibility import for lib -for imp in ['pandas._libs.lib', 'pandas.lib']: +for imp in ["pandas._libs.lib", "pandas.lib"]: try: lib = import_module(imp) break except (ImportError, TypeError, ValueError): pass -numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, - np.float64, np.int16, np.int8, np.uint16, np.uint8] +numeric_dtypes = [ + np.int64, + np.int32, + np.uint32, + np.uint64, + np.float32, + np.float64, + np.int16, + np.int8, + np.uint16, + np.uint8, +] datetime_dtypes = [np.datetime64, np.timedelta64] string_dtypes = [np.object] try: - extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, - pd.Int32Dtype, pd.Int64Dtype, - pd.UInt8Dtype, pd.UInt16Dtype, - pd.UInt32Dtype, pd.UInt64Dtype, - pd.CategoricalDtype, - pd.IntervalDtype, - pd.DatetimeTZDtype('ns', 'UTC'), - pd.PeriodDtype('D')] + extension_dtypes = [ + pd.Int8Dtype, + pd.Int16Dtype, + pd.Int32Dtype, + pd.Int64Dtype, + pd.UInt8Dtype, + pd.UInt16Dtype, + pd.UInt32Dtype, + pd.UInt64Dtype, + pd.CategoricalDtype, + pd.IntervalDtype, + pd.DatetimeTZDtype("ns", "UTC"), + pd.PeriodDtype("D"), + ] except AttributeError: extension_dtypes = [] @@ -40,6 +56,7 @@ class BaseIO: """ Base class for IO benchmarks """ + fname = None def remove(self, f): diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index c8ba6c382cb644..2f8ae0650ab751 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,18 +1,33 @@ -from pandas import ( - DataFrame, Period, PeriodIndex, Series, date_range, period_range) +from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range from pandas.tseries.frequencies import to_offset class PeriodProperties: - params = (['M', 'min'], - ['year', 'month', 'day', 'hour', 'minute', 'second', - 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth', - 'dayofweek', 'dayofyear', 'start_time', 'end_time']) - param_names = ['freq', 'attr'] + params = ( + ["M", "min"], + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "is_leap_year", + "quarter", + "qyear", + "week", + "daysinmonth", + "dayofweek", + "dayofyear", + "start_time", + "end_time", + ], + ) + param_names = ["freq", "attr"] def setup(self, freq, attr): - self.per = Period('2012-06-01', freq=freq) + self.per = Period("2012-06-01", freq=freq) def time_property(self, freq, attr): getattr(self.per, attr) @@ -20,11 +35,11 @@ def time_property(self, freq, attr): class PeriodUnaryMethods: - params = ['M', 'min'] - param_names = ['freq'] + params = ["M", "min"] + param_names = ["freq"] def setup(self, freq): - self.per = Period('2012-06-01', freq=freq) + self.per = Period("2012-06-01", freq=freq) def time_to_timestamp(self, freq): self.per.to_timestamp() @@ -33,12 +48,12 @@ def time_now(self, freq): self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq('A') + self.per.asfreq("A") class PeriodConstructor: - params = [['D'], [True, False]] - param_names = ['freq', 'is_offset'] + params = [["D"], [True, False]] + param_names = ["freq", "is_offset"] def setup(self, freq, is_offset): if is_offset: @@ -47,20 +62,21 @@ def setup(self, freq, is_offset): self.freq = freq def time_period_constructor(self, freq, is_offset): - Period('2012-06-01', freq=freq) + Period("2012-06-01", freq=freq) class PeriodIndexConstructor: - params = [['D'], [True, False]] - param_names = ['freq', 'is_offset'] + params = [["D"], [True, False]] + 
param_names = ["freq", "is_offset"] def setup(self, freq, is_offset): - self.rng = date_range('1985', periods=1000) - self.rng2 = date_range('1985', periods=1000).to_pydatetime() + self.rng = date_range("1985", periods=1000) + self.rng2 = date_range("1985", periods=1000).to_pydatetime() self.ints = list(range(2000, 3000)) - self.daily_ints = date_range('1/1/2000', periods=1000, - freq=freq).strftime('%Y%m%d').map(int) + self.daily_ints = ( + date_range("1/1/2000", periods=1000, freq=freq).strftime("%Y%m%d").map(int) + ) if is_offset: self.freq = to_offset(freq) else: @@ -80,32 +96,35 @@ def time_from_ints_daily(self, freq, is_offset): class DataFramePeriodColumn: - def setup(self): - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.rng = period_range(start="1/1/1990", freq="S", periods=20000) self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): - self.df['col'] = self.rng + self.df["col"] = self.rng def time_set_index(self): # GH#21582 limited by comparisons of Period objects - self.df['col2'] = self.rng - self.df.set_index('col2', append=True) + self.df["col2"] = self.rng + self.df.set_index("col2", append=True) class Algorithms: - params = ['index', 'series'] - param_names = ['typ'] + params = ["index", "series"] + param_names = ["typ"] def setup(self, typ): - data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), - Period('2011-03', freq='M'), Period('2011-04', freq='M')] - - if typ == 'index': - self.vector = PeriodIndex(data * 1000, freq='M') - elif typ == 'series': + data = [ + Period("2011-01", freq="M"), + Period("2011-02", freq="M"), + Period("2011-03", freq="M"), + Period("2011-04", freq="M"), + ] + + if typ == "index": + self.vector = PeriodIndex(data * 1000, freq="M") + elif typ == "series": self.vector = Series(data * 1000) def time_drop_duplicates(self, typ): @@ -116,9 +135,8 @@ def time_value_counts(self, typ): class Indexing: - def setup(self): - self.index = period_range(start='1985', periods=1000, freq='D') + self.index = period_range(start="1985", periods=1000, freq="D") self.series = Series(range(1000), index=self.index) self.period = self.index[500] @@ -135,7 +153,7 @@ def time_series_loc(self): self.series.loc[self.period] def time_align(self): - DataFrame({'a': self.series, 'b': self.series[:500]}) + DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index[:750].intersection(self.index[250:]) diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 9e3bc87c329870..4fb0876f05a0a0 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,27 +1,29 @@ import numpy as np from pandas import DataFrame, Series, DatetimeIndex, date_range + try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") class SeriesPlotting: - params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']] - param_names = ['kind'] + params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]] + param_names = ["kind"] def setup(self, kind): - if kind in ['bar', 'barh', 'pie']: + if kind in ["bar", "barh", "pie"]: n = 100 - elif kind in ['kde']: + elif kind in ["kde"]: n = 10000 else: n = 1000000 self.s = Series(np.random.randn(n)) - if kind in ['area', 'pie']: + if kind in ["area", "pie"]: self.s = self.s.abs() def time_series_plot(self, kind): @@ -29,41 +31,43 @@ def time_series_plot(self, kind): class 
FramePlotting: - params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter', - 'hexbin']] - param_names = ['kind'] + params = [ + ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"] + ] + param_names = ["kind"] def setup(self, kind): - if kind in ['bar', 'barh', 'pie']: + if kind in ["bar", "barh", "pie"]: n = 100 - elif kind in ['kde', 'scatter', 'hexbin']: + elif kind in ["kde", "scatter", "hexbin"]: n = 10000 else: n = 1000000 self.x = Series(np.random.randn(n)) self.y = Series(np.random.randn(n)) - if kind in ['area', 'pie']: + if kind in ["area", "pie"]: self.x = self.x.abs() self.y = self.y.abs() - self.df = DataFrame({'x': self.x, 'y': self.y}) + self.df = DataFrame({"x": self.x, "y": self.y}) def time_frame_plot(self, kind): - self.df.plot(x='x', y='y', kind=kind) + self.df.plot(x="x", y="y", kind=kind) class TimeseriesPlotting: - def setup(self): N = 2000 M = 5 - idx = date_range('1/1/1975', periods=N) + idx = date_range("1/1/1975", periods=N) self.df = DataFrame(np.random.randn(N, M), index=idx) - idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10], - idx.values[12:]))) - self.df2 = DataFrame(np.random.randn(len(idx_irregular), M), - index=idx_irregular) + idx_irregular = DatetimeIndex( + np.concatenate((idx.values[0:10], idx.values[12:])) + ) + self.df2 = DataFrame( + np.random.randn(len(idx_irregular), M), index=idx_irregular + ) def time_plot_regular(self): self.df.plot() @@ -79,12 +83,11 @@ def time_plot_table(self): class Misc: - def setup(self): N = 500 M = 10 self.df = DataFrame(np.random.randn(N, M)) - self.df['Name'] = ["A"] * N + self.df["Name"] = ["A"] * N def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index a6ceb0e93a0898..8d4c9ebaf3e891 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,20 +1,18 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, Index, date_range, - period_range) +from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range from .pandas_vb_common import lib class Reindex: - def setup(self): - rng = date_range(start='1/1/1970', periods=10000, freq='1min') - self.df = DataFrame(np.random.rand(10000, 10), index=rng, - columns=range(10)) - self.df['foo'] = 'bar' + rng = date_range(start="1/1/1970", periods=10000, freq="1min") + self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10)) + self.df["foo"] = "bar" self.rng_subset = Index(rng[::2]) - self.df2 = DataFrame(index=range(10000), - data=np.random.rand(10000, 30), columns=range(30)) + self.df2 = DataFrame( + index=range(10000), data=np.random.rand(10000, 30), columns=range(30) + ) N = 5000 K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) @@ -35,12 +33,12 @@ def time_reindex_multiindex(self): class ReindexMethod: - params = [['pad', 'backfill'], [date_range, period_range]] - param_names = ['method', 'constructor'] + params = [["pad", "backfill"], [date_range, period_range]] + param_names = ["method", "constructor"] def setup(self, method, constructor): N = 100000 - self.idx = constructor('1/1/2000', periods=N, freq='1min') + self.idx = constructor("1/1/2000", periods=N, freq="1min") self.ts = Series(np.random.randn(N), index=self.idx)[::2] def time_reindex_method(self, method, constructor): @@ -49,15 +47,15 @@ def time_reindex_method(self, method, constructor): class Fillna: - params = ['pad', 'backfill'] - 
param_names = ['method'] + params = ["pad", "backfill"] + param_names = ["method"] def setup(self, method): N = 100000 - self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.idx = date_range("1/1/2000", periods=N, freq="1min") ts = Series(np.random.randn(N), index=self.idx)[::2] self.ts_reindexed = ts.reindex(self.idx) - self.ts_float32 = self.ts_reindexed.astype('float32') + self.ts_float32 = self.ts_reindexed.astype("float32") def time_reindexed(self, method): self.ts_reindexed.fillna(method=method) @@ -67,17 +65,17 @@ def time_float_32(self, method): class LevelAlign: - def setup(self): self.index = MultiIndex( levels=[np.arange(10), np.arange(100), np.arange(100)], - codes=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) - self.df = DataFrame(np.random.randn(len(self.index), 4), - index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), - index=self.index.levels[1]) + codes=[ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ], + ) + self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) def time_align_level(self): self.df.align(self.df_level, level=1, copy=False) @@ -89,15 +87,16 @@ def time_reindex_level(self): class DropDuplicates: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): N = 10000 K = 10 key1 = tm.makeStringIndex(N).values.repeat(K) key2 = tm.makeStringIndex(N).values.repeat(K) - self.df = DataFrame({'key1': key1, 'key2': key2, - 'value': np.random.randn(N * K)}) + self.df = DataFrame( + {"key1": key1, "key2": key2, "value": np.random.randn(N * K)} + ) self.df_nan = self.df.copy() self.df_nan.iloc[:10000, :] = np.nan @@ -107,15 +106,14 @@ def setup(self, inplace): N = 1000000 K = 10000 key1 = np.random.randint(0, K, size=N) - self.df_int = DataFrame({'key1': key1}) - self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), - dtype=bool)) + self.df_int = DataFrame({"key1": key1}) + self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool)) def time_frame_drop_dups(self, inplace): - self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) + self.df.drop_duplicates(["key1", "key2"], inplace=inplace) def time_frame_drop_dups_na(self, inplace): - self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) + self.df_nan.drop_duplicates(["key1", "key2"], inplace=inplace) def time_series_drop_dups_int(self, inplace): self.s.drop_duplicates(inplace=inplace) @@ -137,16 +135,16 @@ def setup(self): indices = tm.makeStringIndex(n) subsample_size = 40000 self.x = Series(np.random.randn(n), indices) - self.y = Series(np.random.randn(subsample_size), - index=np.random.choice(indices, subsample_size, - replace=False)) + self.y = Series( + np.random.randn(subsample_size), + index=np.random.choice(indices, subsample_size, replace=False), + ) def time_align_series_irregular_string(self): self.x + self.y class LibFastZip: - def setup(self): N = 10000 K = 10 diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 9dff1778f8e560..6137e944e6b9e3 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -5,11 +5,11 @@ class FillNa: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): - N = 10**6 - rng = pd.date_range('1/1/2000', periods=N, 
freq='min') + N = 10 ** 6 + rng = pd.date_range("1/1/2000", periods=N, freq="min") data = np.random.randn(N) data[::2] = np.nan self.ts = pd.Series(data, index=rng) @@ -24,13 +24,13 @@ def time_replace(self, inplace): class ReplaceDict: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): - N = 10**5 - start_value = 10**5 + N = 10 ** 5 + start_value = 10 ** 5 self.to_rep = dict(enumerate(np.arange(N) + start_value)) - self.s = pd.Series(np.random.randint(N, size=10**3)) + self.s = pd.Series(np.random.randint(N, size=10 ** 3)) def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) @@ -38,14 +38,17 @@ def time_replace_series(self, inplace): class Convert: - params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) - param_names = ['constructor', 'replace_data'] + params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) + param_names = ["constructor", "replace_data"] def setup(self, constructor, replace_data): - N = 10**3 - data = {'Series': pd.Series(np.random.randint(N, size=N)), - 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N), - 'B': np.random.randint(N, size=N)})} + N = 10 ** 3 + data = { + "Series": pd.Series(np.random.randint(N, size=N)), + "DataFrame": pd.DataFrame( + {"A": np.random.randint(N, size=N), "B": np.random.randint(N, size=N)} + ), + } self.to_replace = {i: getattr(pd, replace_data) for i in range(N)} self.data = data[constructor] diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 678403d8378054..f41e13163b3f5f 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -7,35 +7,33 @@ class Melt: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) - self.df['id1'] = np.random.randint(0, 10, 10000) - self.df['id2'] = np.random.randint(100, 1000, 10000) + self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"]) + self.df["id1"] = np.random.randint(0, 10, 10000) + self.df["id2"] = np.random.randint(100, 1000, 10000) def time_melt_dataframe(self): - melt(self.df, id_vars=['id1', 'id2']) + melt(self.df, id_vars=["id1", "id2"]) class Pivot: - def setup(self): N = 10000 - index = date_range('1/1/2000', periods=N, freq='h') - data = {'value': np.random.randn(N * 50), - 'variable': np.arange(50).repeat(N), - 'date': np.tile(index.values, 50)} + index = date_range("1/1/2000", periods=N, freq="h") + data = { + "value": np.random.randn(N * 50), + "variable": np.arange(50).repeat(N), + "date": np.tile(index.values, 50), + } self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.df.pivot('date', 'variable', 'value') + self.df.pivot("date", "variable", "value") class SimpleReshape: - def setup(self): - arrays = [np.arange(100).repeat(100), - np.roll(np.tile(np.arange(100), 100), 25)] + arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)] index = MultiIndex.from_arrays(arrays) self.df = DataFrame(np.random.randn(10000, 4), index=index) self.udf = self.df.unstack(1) @@ -49,7 +47,7 @@ def time_unstack(self): class Unstack: - params = ['int', 'category'] + params = ["int", "category"] def setup(self, dtype): m = 100 @@ -58,7 +56,7 @@ def setup(self, dtype): levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) columns = np.arange(n) - if dtype == 'int': + if dtype == "int": values = np.arange(m * m * n).reshape(m * m, n) else: # the category branch is ~20x slower than int. 
So we @@ -80,84 +78,94 @@ def time_without_last_row(self, dtype): class SparseIndex: - def setup(self): NUM_ROWS = 1000 - self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS), - 'B': np.random.randint(50, size=NUM_ROWS), - 'C': np.random.randint(-10, 10, size=NUM_ROWS), - 'D': np.random.randint(-10, 10, size=NUM_ROWS), - 'E': np.random.randint(10, size=NUM_ROWS), - 'F': np.random.randn(NUM_ROWS)}) - self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E']) + self.df = DataFrame( + { + "A": np.random.randint(50, size=NUM_ROWS), + "B": np.random.randint(50, size=NUM_ROWS), + "C": np.random.randint(-10, 10, size=NUM_ROWS), + "D": np.random.randint(-10, 10, size=NUM_ROWS), + "E": np.random.randint(10, size=NUM_ROWS), + "F": np.random.randn(NUM_ROWS), + } + ) + self.df = self.df.set_index(["A", "B", "C", "D", "E"]) def time_unstack(self): self.df.unstack() class WideToLong: - def setup(self): nyrs = 20 nidvars = 20 N = 5000 - self.letters = list('ABCD') - yrvars = [l + str(num) - for l, num in product(self.letters, range(1, nyrs + 1))] + self.letters = list("ABCD") + yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] columns = [str(i) for i in range(nidvars)] + yrvars - self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), - columns=columns) - self.df['id'] = self.df.index + self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns) + self.df["id"] = self.df.index def time_wide_to_long_big(self): - wide_to_long(self.df, self.letters, i='id', j='year') + wide_to_long(self.df, self.letters, i="id", j="year") class PivotTable: - def setup(self): N = 100000 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') + fac1 = np.array(["A", "B", "C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") ind1 = np.random.randint(0, 3, size=N) ind2 = np.random.randint(0, 2, size=N) - self.df = DataFrame({'key1': fac1.take(ind1), - 'key2': fac2.take(ind2), - 'key3': fac2.take(ind2), - 'value1': np.random.randn(N), - 'value2': np.random.randn(N), - 'value3': np.random.randn(N)}) - self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'), - 'col3': [1, 2, 3, 4, 5]}) - self.df2.col1 = self.df2.col1.astype('category') - self.df2.col2 = self.df2.col2.astype('category') + self.df = DataFrame( + { + "key1": fac1.take(ind1), + "key2": fac2.take(ind2), + "key3": fac2.take(ind2), + "value1": np.random.randn(N), + "value2": np.random.randn(N), + "value3": np.random.randn(N), + } + ) + self.df2 = DataFrame( + {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} + ) + self.df2.col1 = self.df2.col1.astype("category") + self.df2.col2 = self.df2.col2.astype("category") def time_pivot_table(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3']) + self.df.pivot_table(index="key1", columns=["key2", "key3"]) def time_pivot_table_agg(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3'], - aggfunc=['sum', 'mean']) + self.df.pivot_table( + index="key1", columns=["key2", "key3"], aggfunc=["sum", "mean"] + ) def time_pivot_table_margins(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3'], - margins=True) + self.df.pivot_table(index="key1", columns=["key2", "key3"], margins=True) def time_pivot_table_categorical(self): - self.df2.pivot_table(index='col1', values='col3', columns='col2', - aggfunc=np.sum, fill_value=0) + self.df2.pivot_table( + index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + ) def 
time_pivot_table_categorical_observed(self): - self.df2.pivot_table(index='col1', values='col3', columns='col2', - aggfunc=np.sum, fill_value=0, observed=True) + self.df2.pivot_table( + index="col1", + values="col3", + columns="col2", + aggfunc=np.sum, + fill_value=0, + observed=True, + ) class Crosstab: - def setup(self): N = 100000 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') + fac1 = np.array(["A", "B", "C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") self.ind1 = np.random.randint(0, 3, size=N) self.ind2 = np.random.randint(0, 2, size=N) self.vec1 = fac1.take(self.ind1) @@ -167,7 +175,7 @@ def time_crosstab(self): pd.crosstab(self.vec1, self.vec2) def time_crosstab_values(self): - pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum') + pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc="sum") def time_crosstab_normalize(self): pd.crosstab(self.vec1, self.vec2, normalize=True) @@ -179,8 +187,10 @@ def time_crosstab_normalize_margins(self): class GetDummies: def setup(self): categories = list(string.ascii_letters[:12]) - s = pd.Series(np.random.choice(categories, size=1000000), - dtype=pd.api.types.CategoricalDtype(categories)) + s = pd.Series( + np.random.choice(categories, size=1000000), + dtype=pd.api.types.CategoricalDtype(categories), + ) self.s = s def time_get_dummies_1d(self): @@ -192,16 +202,18 @@ def time_get_dummies_1d_sparse(self): class Cut: params = [[4, 10, 1000]] - param_names = ['bins'] + param_names = ["bins"] def setup(self, bins): - N = 10**5 + N = 10 ** 5 self.int_series = pd.Series(np.arange(N).repeat(5)) self.float_series = pd.Series(np.random.randn(N).repeat(5)) - self.timedelta_series = pd.Series(np.random.randint(N, size=N), - dtype='timedelta64[ns]') - self.datetime_series = pd.Series(np.random.randint(N, size=N), - dtype='datetime64[ns]') + self.timedelta_series = pd.Series( + np.random.randint(N, size=N), dtype="timedelta64[ns]" + ) + self.datetime_series = pd.Series( + np.random.randint(N, size=N), dtype="datetime64[ns]" + ) def time_cut_int(self, bins): pd.cut(self.int_series, bins) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 033b466c8b9be6..a70977fcf539f7 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -4,15 +4,16 @@ class Methods: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -22,14 +23,15 @@ def time_rolling(self, constructor, window, dtype, method): class ExpandingMethods: - params = (['DataFrame', 'Series'], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * 
np.random.random(N)).astype(dtype) self.expanding = getattr(pd, constructor)(arr).expanding() @@ -39,14 +41,11 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - ['mean', 'std']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) @@ -55,29 +54,28 @@ def time_ewm(self, constructor, window, dtype, method): class VariableWindowMethods(Methods): - params = (['DataFrame', 'Series'], - ['50s', '1h', '1d'], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + ["50s", "1h", "1d"], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) - index = pd.date_range('2017-01-01', periods=N, freq='5s') + index = pd.date_range("2017-01-01", periods=N, freq="5s") self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) class Pairwise: - params = ([10, 1000, None], - ['corr', 'cov'], - [True, False]) - param_names = ['window', 'method', 'pairwise'] + params = ([10, 1000, None], ["corr", "cov"], [True, False]) + param_names = ["window", "method", "pairwise"] def setup(self, window, method, pairwise): - N = 10**4 + N = 10 ** 4 arr = np.random.random(N) self.df = pd.DataFrame(arr) @@ -90,25 +88,25 @@ def time_pairwise(self, window, method, pairwise): class Quantile: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - [0, 0.5, 1], - ['linear', 'nearest', 'lower', 'higher', 'midpoint']) - param_names = ['constructor', 'window', 'dtype', 'percentile'] + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + [0, 0.5, 1], + ["linear", "nearest", "lower", "higher", "midpoint"], + ) + param_names = ["constructor", "window", "dtype", "percentile"] def setup(self, constructor, window, dtype, percentile, interpolation): N = 10 ** 5 arr = np.random.random(N).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_quantile(self, constructor, window, dtype, percentile, - interpolation): + def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) class PeakMemFixed: - def setup(self): N = 10 arr = 100 * np.random.random(N) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 4b1af2dc8c9327..e2835c5156f559 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -7,13 +7,13 @@ class SeriesConstructor: - params = [None, 'dict'] - param_names = ['data'] + params = [None, "dict"] + param_names = ["data"] def setup(self, data): - self.idx = date_range(start=datetime(2015, 10, 26), - end=datetime(2016, 1, 1), - freq='50s') + self.idx = date_range( + start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" + ) dict_data = dict(zip(self.idx, range(len(self.idx)))) self.data = 
None if data is None else dict_data @@ -23,8 +23,8 @@ def time_constructor(self, data): class IsIn: - params = ['int64', 'uint64', 'object'] - param_names = ['dtype'] + params = ["int64", "uint64", "object"] + param_names = ["dtype"] def setup(self, dtype): self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) @@ -35,12 +35,11 @@ def time_isin(self, dtypes): class IsInFloat64: - def setup(self): self.small = Series([1, 2], dtype=np.float64) - self.many_different_values = np.arange(10**6, dtype=np.float64) - self.few_different_values = np.zeros(10**7, dtype=np.float64) - self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64) + self.many_different_values = np.arange(10 ** 6, dtype=np.float64) + self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) + self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) def time_isin_many_different(self): # runtime is dominated by creation of the lookup-table @@ -56,19 +55,18 @@ def time_isin_nan_values(self): class IsInForObjects: - def setup(self): - self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object) - self.vals_nans = np.full(10**4, np.nan).astype(np.object) + self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object) + self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object) self.s_short = Series(np.arange(2)).astype(np.object) - self.s_long = Series(np.arange(10**5)).astype(np.object) + self.s_long = Series(np.arange(10 ** 5)).astype(np.object) self.vals_short = np.arange(2).astype(np.object) - self.vals_long = np.arange(10**5).astype(np.object) + self.vals_long = np.arange(10 ** 5).astype(np.object) # because of nans floats are special: - self.s_long_floats = Series(np.arange(10**5, - dtype=np.float)).astype(np.object) - self.vals_long_floats = np.arange(10**5, - dtype=np.float).astype(np.object) + self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype( + np.object + ) + self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object) def time_isin_nans(self): # if nan-objects are different objects, @@ -94,8 +92,8 @@ def time_isin_long_series_long_values_floats(self): class NSort: - params = ['first', 'last', 'all'] - param_names = ['keep'] + params = ["first", "last", "all"] + param_names = ["keep"] def setup(self, keep): self.s = Series(np.random.randint(1, 10, 100000)) @@ -109,15 +107,17 @@ def time_nsmallest(self, keep): class Dropna: - params = ['int', 'datetime'] - param_names = ['dtype'] + params = ["int", "datetime"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - data = {'int': np.random.randint(1, 10, N), - 'datetime': date_range('2000-01-01', freq='S', periods=N)} + N = 10 ** 6 + data = { + "int": np.random.randint(1, 10, N), + "datetime": date_range("2000-01-01", freq="S", periods=N), + } self.s = Series(data[dtype]) - if dtype == 'datetime': + if dtype == "datetime": self.s[np.random.randint(1, N, 100)] = NaT def time_dropna(self, dtype): @@ -127,37 +127,47 @@ def time_dropna(self, dtype): class SearchSorted: goal_time = 0.2 - params = ['int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', - 'str'] - param_names = ['dtype'] + params = [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "str", + ] + param_names = ["dtype"] def setup(self, dtype): - N = 10**5 + N = 10 ** 5 data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) self.s = Series(data) def time_searchsorted(self, dtype): - key = '2' 
if dtype == 'str' else 2 + key = "2" if dtype == "str" else 2 self.s.searchsorted(key) class Map: - params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int']) - param_names = 'mapper' + params = (["dict", "Series", "lambda"], ["object", "category", "int"]) + param_names = "mapper" def setup(self, mapper, dtype): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) # construct mapper - if mapper == 'Series': + if mapper == "Series": self.map_data = map_data - elif mapper == 'dict': + elif mapper == "dict": self.map_data = map_data.to_dict() - elif mapper == 'lambda': + elif mapper == "lambda": map_dict = map_data.to_dict() self.map_data = lambda x: map_dict[x] else: @@ -170,8 +180,8 @@ def time_map(self, mapper, *args, **kwargs): class Clip: - params = [50, 1000, 10**5] - param_names = ['n'] + params = [50, 1000, 10 ** 5] + param_names = ["n"] def setup(self, n): self.s = Series(np.random.randn(n)) @@ -182,8 +192,8 @@ def time_clip(self, n): class ValueCounts: - params = ['int', 'uint', 'float', 'object'] - param_names = ['dtype'] + params = ["int", "uint", "float", "object"] + param_names = ["dtype"] def setup(self, dtype): self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) @@ -193,7 +203,6 @@ def time_value_counts(self, dtype): class Dir: - def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) @@ -204,21 +213,19 @@ def time_dir_strings(self): class SeriesGetattr: # https://github.com/pandas-dev/pandas/issues/19764 def setup(self): - self.s = Series(1, - index=date_range("2012-01-01", freq='s', - periods=int(1e6))) + self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6))) def time_series_datetimeindex_repr(self): - getattr(self.s, 'a', None) + getattr(self.s, "a", None) class All(object): - params = [[10**3, 10**6], ['fast', 'slow']] - param_names = ['N', 'case'] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] + param_names = ["N", "case"] def setup(self, N, case): - val = case != 'fast' + val = case != "fast" self.s = Series([val] * N) def time_all(self, N, case): @@ -227,11 +234,11 @@ def time_all(self, N, case): class Any(object): - params = [[10**3, 10**6], ['fast', 'slow']] - param_names = ['N', 'case'] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] + param_names = ["N", "case"] def setup(self, N, case): - val = case == 'fast' + val = case == "fast" self.s = Series([val] * N) def time_any(self, N, case): @@ -240,11 +247,25 @@ def time_any(self, N, case): class NanOps(object): - params = [['var', 'mean', 'median', 'max', 'min', 'sum', 'std', 'sem', - 'argmax', 'skew', 'kurt', 'prod'], - [10**3, 10**6], - ['int8', 'int32', 'int64', 'float64']] - param_names = ['func', 'N', 'dtype'] + params = [ + [ + "var", + "mean", + "median", + "max", + "min", + "sum", + "std", + "sem", + "argmax", + "skew", + "kurt", + "prod", + ], + [10 ** 3, 10 ** 6], + ["int8", "int32", "int64", "float64"], + ] + param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): self.s = Series([1] * N, dtype=dtype) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 281e81f21ba9c6..19d08c086a508a 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -14,11 +14,10 @@ def make_array(size, dense_proportion, fill_value, dtype): class SparseSeriesToFrame: - def setup(self): K = 50 N = 50001 - rng = date_range('1/1/2000', periods=N, freq='T') + rng = date_range("1/1/2000", periods=N, freq="T") self.series = {} for i in range(1, K): data = 
np.random.randn(N)[:-i] @@ -32,12 +31,11 @@ def time_series_to_frame(self): class SparseArrayConstructor: - params = ([0.1, 0.01], [0, np.nan], - [np.int64, np.float64, np.object]) - param_names = ['dense_proportion', 'fill_value', 'dtype'] + params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object]) + param_names = ["dense_proportion", "fill_value", "dtype"] def setup(self, dense_proportion, fill_value, dtype): - N = 10**6 + N = 10 ** 6 self.array = make_array(N, dense_proportion, fill_value, dtype) def time_sparse_array(self, dense_proportion, fill_value, dtype): @@ -45,7 +43,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: - def setup(self): N = 1000 self.arr = np.arange(N) @@ -56,18 +53,16 @@ def time_from_scipy(self): class FromCoo: - def setup(self): - self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], - ([1, 0, 0], [0, 2, 3])), - shape=(100, 100)) + self.matrix = scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100) + ) def time_sparse_series_from_coo(self): pd.Series.sparse.from_coo(self.matrix) class ToCoo: - def setup(self): s = Series([np.nan] * 10000) s[0] = 3.0 @@ -77,18 +72,16 @@ def setup(self): self.ss = s.astype("Sparse") def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], - column_levels=[2, 3], - sort_labels=True) + self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) class Arithmetic: params = ([0.1, 0.01], [0, np.nan]) - param_names = ['dense_proportion', 'fill_value'] + param_names = ["dense_proportion", "fill_value"] def setup(self, dense_proportion, fill_value): - N = 10**6 + N = 10 ** 6 arr1 = make_array(N, dense_proportion, fill_value, np.int64) self.array1 = SparseArray(arr1, fill_value=fill_value) arr2 = make_array(N, dense_proportion, fill_value, np.int64) @@ -110,22 +103,24 @@ def time_divide(self, dense_proportion, fill_value): class ArithmeticBlock: params = [np.nan, 0] - param_names = ['fill_value'] + param_names = ["fill_value"] def setup(self, fill_value): - N = 10**6 - self.arr1 = self.make_block_array(length=N, num_blocks=1000, - block_size=10, fill_value=fill_value) - self.arr2 = self.make_block_array(length=N, num_blocks=1000, - block_size=10, fill_value=fill_value) + N = 10 ** 6 + self.arr1 = self.make_block_array( + length=N, num_blocks=1000, block_size=10, fill_value=fill_value + ) + self.arr2 = self.make_block_array( + length=N, num_blocks=1000, block_size=10, fill_value=fill_value + ) def make_block_array(self, length, num_blocks, block_size, fill_value): arr = np.full(length, fill_value) - indicies = np.random.choice(np.arange(0, length, block_size), - num_blocks, - replace=False) + indicies = np.random.choice( + np.arange(0, length, block_size), num_blocks, replace=False + ) for ind in indicies: - arr[ind:ind + block_size] = np.random.randint(0, 100, block_size) + arr[ind : ind + block_size] = np.random.randint(0, 100, block_size) return SparseArray(arr, fill_value=fill_value) def time_make_union(self, fill_value): diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 3514335f92e773..620a6de0f5f341 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -2,14 +2,13 @@ import pandas as pd -ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', - 'var'] +ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] class FrameOps: - params = [ops, ['float', 'int'], [0, 1], [True, 
False]] - param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] + params = [ops, ["float", "int"], [0, 1], [True, False]] + param_names = ["op", "dtype", "axis", "use_bottleneck"] def setup(self, op, dtype, axis, use_bottleneck): df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) @@ -17,6 +16,7 @@ def setup(self, op, dtype, axis, use_bottleneck): pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df_func = getattr(df, op) @@ -27,13 +27,15 @@ def time_op(self, op, dtype, axis, use_bottleneck): class FrameMultiIndexOps: params = ([0, 1, [0, 1]], ops) - param_names = ['level', 'op'] + param_names = ["level", "op"] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - codes = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] + codes = [ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ] index = pd.MultiIndex(levels=levels, codes=codes) df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) @@ -44,8 +46,8 @@ def time_op(self, level, op): class SeriesOps: - params = [ops, ['float', 'int'], [True, False]] - param_names = ['op', 'dtype', 'use_bottleneck'] + params = [ops, ["float", "int"], [True, False]] + param_names = ["op", "dtype", "use_bottleneck"] def setup(self, op, dtype, use_bottleneck): s = pd.Series(np.random.randn(100000)).astype(dtype) @@ -53,6 +55,7 @@ def setup(self, op, dtype, use_bottleneck): pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.s_func = getattr(s, op) @@ -63,13 +66,15 @@ def time_op(self, op, dtype, use_bottleneck): class SeriesMultiIndexOps: params = ([0, 1, [0, 1]], ops) - param_names = ['level', 'op'] + param_names = ["level", "op"] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - codes = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] + codes = [ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ] index = pd.MultiIndex(levels=levels, codes=codes) s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) @@ -80,11 +85,11 @@ def time_op(self, level, op): class Rank: - params = [['DataFrame', 'Series'], [True, False]] - param_names = ['constructor', 'pct'] + params = [["DataFrame", "Series"], [True, False]] + param_names = ["constructor", "pct"] def setup(self, constructor, pct): - values = np.random.randn(10**5) + values = np.random.randn(10 ** 5) self.data = getattr(pd, constructor)(values) def time_rank(self, constructor, pct): @@ -96,14 +101,15 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [['spearman', 'kendall', 'pearson'], [True, False]] - param_names = ['method', 'use_bottleneck'] + params = [["spearman", "kendall", "pearson"], [True, False]] + param_names = ["method", "use_bottleneck"] def setup(self, method, use_bottleneck): try: pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) @@ -126,13 +132,14 @@ def 
time_corrwith_rows(self, method, use_bottleneck): class Covariance: params = [[True, False]] - param_names = ['use_bottleneck'] + param_names = ["use_bottleneck"] def setup(self, use_bottleneck): try: pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.s = pd.Series(np.random.randn(100000)) self.s2 = pd.Series(np.random.randn(100000)) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 5dbcc71b7455ef..6be2fa92d9eac3 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -6,31 +6,30 @@ class Methods: - def setup(self): - self.s = Series(tm.makeStringIndex(10**5)) + self.s = Series(tm.makeStringIndex(10 ** 5)) def time_center(self): self.s.str.center(100) def time_count(self): - self.s.str.count('A') + self.s.str.count("A") def time_endswith(self): - self.s.str.endswith('A') + self.s.str.endswith("A") def time_extract(self): with warnings.catch_warnings(record=True): - self.s.str.extract('(\\w*)A(\\w*)') + self.s.str.extract("(\\w*)A(\\w*)") def time_findall(self): - self.s.str.findall('[A-Z]+') + self.s.str.findall("[A-Z]+") def time_find(self): - self.s.str.find('[A-Z]+') + self.s.str.find("[A-Z]+") def time_rfind(self): - self.s.str.rfind('[A-Z]+') + self.s.str.rfind("[A-Z]+") def time_get(self): self.s.str.get(0) @@ -39,43 +38,43 @@ def time_len(self): self.s.str.len() def time_join(self): - self.s.str.join(' ') + self.s.str.join(" ") def time_match(self): - self.s.str.match('A') + self.s.str.match("A") def time_normalize(self): - self.s.str.normalize('NFC') + self.s.str.normalize("NFC") def time_pad(self): - self.s.str.pad(100, side='both') + self.s.str.pad(100, side="both") def time_partition(self): - self.s.str.partition('A') + self.s.str.partition("A") def time_rpartition(self): - self.s.str.rpartition('A') + self.s.str.rpartition("A") def time_replace(self): - self.s.str.replace('A', '\x01\x01') + self.s.str.replace("A", "\x01\x01") def time_translate(self): - self.s.str.translate({'A': '\x01\x01'}) + self.s.str.translate({"A": "\x01\x01"}) def time_slice(self): self.s.str.slice(5, 15, 2) def time_startswith(self): - self.s.str.startswith('A') + self.s.str.startswith("A") def time_strip(self): - self.s.str.strip('A') + self.s.str.strip("A") def time_rstrip(self): - self.s.str.rstrip('A') + self.s.str.rstrip("A") def time_lstrip(self): - self.s.str.lstrip('A') + self.s.str.lstrip("A") def time_title(self): self.s.str.title() @@ -95,13 +94,13 @@ def time_zfill(self): class Repeat: - params = ['int', 'array'] - param_names = ['repeats'] + params = ["int", "array"] + param_names = ["repeats"] def setup(self, repeats): - N = 10**5 + N = 10 ** 5 self.s = Series(tm.makeStringIndex(N)) - repeat = {'int': 1, 'array': np.random.randint(1, 3, N)} + repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] def time_repeat(self, repeats): @@ -110,20 +109,20 @@ def time_repeat(self, repeats): class Cat: - params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15]) - param_names = ['other_cols', 'sep', 'na_rep', 'na_frac'] + params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) + param_names = ["other_cols", "sep", "na_rep", "na_frac"] def setup(self, other_cols, sep, na_rep, na_frac): N = 10 ** 5 - mask_gen = lambda: np.random.choice([True, False], N, - p=[1 - na_frac, na_frac]) + mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = 
Series(tm.makeStringIndex(N)).where(mask_gen()) if other_cols == 0: # str.cat self-concatenates only for others=None self.others = None else: - self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) - for i in range(other_cols)}) + self.others = DataFrame( + {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)} + ) def time_cat(self, other_cols, sep, na_rep, na_frac): # before the concatenation (one caller + other_cols columns), the total @@ -136,52 +135,49 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains: params = [True, False] - param_names = ['regex'] + param_names = ["regex"] def setup(self, regex): - self.s = Series(tm.makeStringIndex(10**5)) + self.s = Series(tm.makeStringIndex(10 ** 5)) def time_contains(self, regex): - self.s.str.contains('A', regex=regex) + self.s.str.contains("A", regex=regex) class Split: params = [True, False] - param_names = ['expand'] + param_names = ["expand"] def setup(self, expand): - self.s = Series(tm.makeStringIndex(10**5)).str.join('--') + self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--") def time_split(self, expand): - self.s.str.split('--', expand=expand) + self.s.str.split("--", expand=expand) def time_rsplit(self, expand): - self.s.str.rsplit('--', expand=expand) + self.s.str.rsplit("--", expand=expand) class Dummies: - def setup(self): - self.s = Series(tm.makeStringIndex(10**5)).str.join('|') + self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|") def time_get_dummies(self): - self.s.str.get_dummies('|') + self.s.str.get_dummies("|") class Encode: - def setup(self): self.ser = Series(tm.makeUnicodeIndex()) def time_encode_decode(self): - self.ser.str.encode('utf-8').str.decode('utf-8') + self.ser.str.encode("utf-8").str.decode("utf-8") class Slice: - def setup(self): - self.s = Series(['abcdefg', np.nan] * 500000) + self.s = Series(["abcdefg", np.nan] * 500000) def time_vector_slice(self): # GH 2602 diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index c4fe462944a2aa..36a9db529f98fb 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -3,49 +3,60 @@ import numpy as np from pandas import ( - DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta) + DataFrame, + Series, + Timedelta, + Timestamp, + timedelta_range, + to_timedelta, +) class TimedeltaConstructor: - def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit='d') + Timedelta(1, unit="d") def time_from_components(self): - Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5, - microseconds=6, nanoseconds=7) + Timedelta( + days=1, + hours=2, + minutes=3, + seconds=4, + milliseconds=5, + microseconds=6, + nanoseconds=7, + ) def time_from_datetime_timedelta(self): Timedelta(datetime.timedelta(days=1, seconds=1)) def time_from_np_timedelta(self): - Timedelta(np.timedelta64(1, 'ms')) + Timedelta(np.timedelta64(1, "ms")) def time_from_string(self): - Timedelta('1 days') + Timedelta("1 days") def time_from_iso_format(self): - Timedelta('P4DT12H30M5S') + Timedelta("P4DT12H30M5S") def time_from_missing(self): - Timedelta('nat') + Timedelta("nat") class ToTimedelta: - def setup(self): self.ints = np.random.randint(0, 60, size=10000) self.str_days = [] self.str_seconds = [] for i in self.ints: - self.str_days.append('{0} days'.format(i)) - self.str_seconds.append('00:00:{0:02d}'.format(i)) + self.str_days.append("{0} days".format(i)) + self.str_seconds.append("00:00:{0:02d}".format(i)) def 
time_convert_int(self): - to_timedelta(self.ints, unit='s') + to_timedelta(self.ints, unit="s") def time_convert_string_days(self): to_timedelta(self.str_days) @@ -56,30 +67,28 @@ def time_convert_string_seconds(self): class ToTimedeltaErrors: - params = ['coerce', 'ignore'] - param_names = ['errors'] + params = ["coerce", "ignore"] + param_names = ["errors"] def setup(self, errors): ints = np.random.randint(0, 60, size=10000) - self.arr = ['{0} days'.format(i) for i in ints] - self.arr[-1] = 'apple' + self.arr = ["{0} days".format(i) for i in ints] + self.arr[-1] = "apple" def time_convert(self, errors): to_timedelta(self.arr, errors=errors) class TimedeltaOps: - def setup(self): self.td = to_timedelta(np.arange(1000000)) - self.ts = Timestamp('2000') + self.ts = Timestamp("2000") def time_add_td_ts(self): self.td + self.ts class TimedeltaProperties: - def setup_cache(self): td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) return td @@ -98,10 +107,9 @@ def time_timedelta_nanoseconds(self, td): class DatetimeAccessor: - def setup_cache(self): N = 100000 - series = Series(timedelta_range('1 days', periods=N, freq='h')) + series = Series(timedelta_range("1 days", periods=N, freq="h")) return series def time_dt_accessor(self, series): @@ -121,10 +129,9 @@ def time_timedelta_nanoseconds(self, series): class TimedeltaIndexing: - def setup(self): - self.index = timedelta_range(start='1985', periods=1000, freq='D') - self.index2 = timedelta_range(start='1986', periods=1000, freq='D') + self.index = timedelta_range(start="1985", periods=1000, freq="D") + self.index2 = timedelta_range(start="1986", periods=1000, freq="D") self.series = Series(range(1000), index=self.index) self.timedelta = self.index[500] @@ -141,7 +148,7 @@ def time_series_loc(self): self.series.loc[self.timedelta] def time_align(self): - DataFrame({'a': self.series, 'b': self.series[:500]}) + DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index.intersection(self.index2) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 14ee8747cf81d9..a74527df25f9bd 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -4,6 +4,7 @@ import numpy as np from pandas import to_datetime, date_range, Series, DataFrame, period_range from pandas.tseries.frequencies import infer_freq + try: from pandas.plotting._matplotlib.converter import DatetimeConverter except ImportError: @@ -12,27 +13,22 @@ class DatetimeIndex: - params = ['dst', 'repeated', 'tz_aware', 'tz_local', 'tz_naive'] - param_names = ['index_type'] + params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"] + param_names = ["index_type"] def setup(self, index_type): N = 100000 - dtidxes = {'dst': date_range(start='10/29/2000 1:00:00', - end='10/29/2000 1:59:59', freq='S'), - 'repeated': date_range(start='2000', - periods=N / 10, - freq='s').repeat(10), - 'tz_aware': date_range(start='2000', - periods=N, - freq='s', - tz='US/Eastern'), - 'tz_local': date_range(start='2000', - periods=N, - freq='s', - tz=dateutil.tz.tzlocal()), - 'tz_naive': date_range(start='2000', - periods=N, - freq='s')} + dtidxes = { + "dst": date_range( + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + ), + "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), + "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), + "tz_local": date_range( + start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal() + ), + "tz_naive": 
date_range(start="2000", periods=N, freq="s"), + } self.index = dtidxes[index_type] def time_add_timedelta(self, index_type): @@ -62,31 +58,31 @@ def time_to_pydatetime(self, index_type): class TzLocalize: - params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] - param_names = 'tz' + params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] + param_names = "tz" def setup(self, tz): - dst_rng = date_range(start='10/29/2000 1:00:00', - end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', - end='10/29/2000 00:59:59', freq='S') + dst_rng = date_range( + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + ) + self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S") self.index = self.index.append(dst_rng) self.index = self.index.append(dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', - end='10/29/2000 3:00:00', - freq='S')) + self.index = self.index.append( + date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S") + ) def time_infer_dst(self, tz): - self.index.tz_localize(tz, ambiguous='infer') + self.index.tz_localize(tz, ambiguous="infer") class ResetIndex: - params = [None, 'US/Eastern'] - param_names = 'tz' + params = [None, "US/Eastern"] + param_names = "tz" def setup(self, tz): - idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz) + idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) def time_reest_datetimeindex(self, tz): @@ -95,12 +91,12 @@ def time_reest_datetimeindex(self, tz): class Factorize: - params = [None, 'Asia/Tokyo'] - param_names = 'tz' + params = [None, "Asia/Tokyo"] + param_names = "tz" def setup(self, tz): N = 100000 - self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz) + self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz) self.dti = self.dti.repeat(5) def time_factorize(self, tz): @@ -109,25 +105,24 @@ def time_factorize(self, tz): class InferFreq: - params = [None, 'D', 'B'] - param_names = ['freq'] + params = [None, "D", "B"] + param_names = ["freq"] def setup(self, freq): if freq is None: - self.idx = date_range(start='1/1/1700', freq='D', periods=10000) + self.idx = date_range(start="1/1/1700", freq="D", periods=10000) self.idx.freq = None else: - self.idx = date_range(start='1/1/1700', freq=freq, periods=10000) + self.idx = date_range(start="1/1/1700", freq=freq, periods=10000) def time_infer_freq(self, freq): infer_freq(self.idx) class TimeDatetimeConverter: - def setup(self): N = 100000 - self.rng = date_range(start='1/1/2000', periods=N, freq='T') + self.rng = date_range(start="1/1/2000", periods=N, freq="T") def time_convert(self): DatetimeConverter.convert(self.rng, None, None) @@ -136,11 +131,11 @@ def time_convert(self): class Iteration: params = [date_range, period_range] - param_names = ['time_index'] + param_names = ["time_index"] def setup(self, time_index): - N = 10**6 - self.idx = time_index(start='20140101', freq='T', periods=N) + N = 10 ** 6 + self.idx = time_index(start="20140101", freq="T", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -155,13 +150,13 @@ def time_iter_preexit(self, time_index): class ResampleDataFrame: - params = ['max', 'mean', 'min'] - param_names = ['method'] + params = ["max", "mean", "min"] + param_names = ["method"] def setup(self, method): - rng = date_range(start='20130101', periods=100000, freq='50L') + rng = date_range(start="20130101", periods=100000, freq="50L") df = 
DataFrame(np.random.randn(100000, 2), index=rng) - self.resample = getattr(df.resample('1s'), method) + self.resample = getattr(df.resample("1s"), method) def time_method(self, method): self.resample() @@ -169,16 +164,14 @@ def time_method(self, method): class ResampleSeries: - params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc']) - param_names = ['index', 'freq', 'method'] + params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"]) + param_names = ["index", "freq", "method"] def setup(self, index, freq, method): - indexes = {'period': period_range(start='1/1/2000', - end='1/1/2001', - freq='T'), - 'datetime': date_range(start='1/1/2000', - end='1/1/2001', - freq='T')} + indexes = { + "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"), + "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"), + } idx = indexes[index] ts = Series(np.random.randn(len(idx)), index=idx) self.resample = getattr(ts.resample(freq), method) @@ -190,32 +183,35 @@ def time_resample(self, index, freq, method): class ResampleDatetetime64: # GH 7754 def setup(self): - rng3 = date_range(start='2000-01-01 00:00:00', - end='2000-01-01 10:00:00', freq='555000U') - self.dt_ts = Series(5, rng3, dtype='datetime64[ns]') + rng3 = date_range( + start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U" + ) + self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") def time_resample(self): - self.dt_ts.resample('1S').last() + self.dt_ts.resample("1S").last() class AsOf: - params = ['DataFrame', 'Series'] - param_names = ['constructor'] + params = ["DataFrame", "Series"] + param_names = ["constructor"] def setup(self, constructor): N = 10000 M = 10 - rng = date_range(start='1/1/1990', periods=N, freq='53s') - data = {'DataFrame': DataFrame(np.random.randn(N, M)), - 'Series': Series(np.random.randn(N))} + rng = date_range(start="1/1/1990", periods=N, freq="53s") + data = { + "DataFrame": DataFrame(np.random.randn(N, M)), + "Series": Series(np.random.randn(N)), + } self.ts = data[constructor] self.ts.index = rng self.ts2 = self.ts.copy() self.ts2.iloc[250:5000] = np.nan self.ts3 = self.ts.copy() self.ts3.iloc[-5000:] = np.nan - self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') + self.dates = date_range(start="1/1/1990", periods=N * 10, freq="5s") self.date = self.dates[0] self.date_last = self.dates[-1] self.date_early = self.date - timedelta(10) @@ -248,11 +244,11 @@ def time_asof_nan_single(self, constructor): class SortIndex: params = [True, False] - param_names = ['monotonic'] + param_names = ["monotonic"] def setup(self, monotonic): - N = 10**5 - idx = date_range(start='1/1/2000', periods=N, freq='s') + N = 10 ** 5 + idx = date_range(start="1/1/2000", periods=N, freq="s") self.s = Series(np.random.randn(N), index=idx) if not monotonic: self.s = self.s.sample(frac=1) @@ -265,10 +261,9 @@ def time_get_slice(self, monotonic): class IrregularOps: - def setup(self): - N = 10**5 - idx = date_range(start='1/1/2000', periods=N, freq='s') + N = 10 ** 5 + idx = date_range(start="1/1/2000", periods=N, freq="s") s = Series(np.random.randn(N), index=idx) self.left = s.sample(frac=1) self.right = s.sample(frac=1) @@ -278,10 +273,9 @@ def time_add(self): class Lookup: - def setup(self): N = 1500000 - rng = date_range(start='1/1/2000', periods=N, freq='S') + rng = date_range(start="1/1/2000", periods=N, freq="S") self.ts = Series(1, index=rng) self.lookup_val = rng[N // 2] @@ -291,36 +285,35 @@ def time_lookup_and_cleanup(self): class ToDatetimeYYYYMMDD: - def 
setup(self): - rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.stringsD = Series(rng.strftime('%Y%m%d')) + rng = date_range(start="1/1/2000", periods=10000, freq="D") + self.stringsD = Series(rng.strftime("%Y%m%d")) def time_format_YYYYMMDD(self): - to_datetime(self.stringsD, format='%Y%m%d') + to_datetime(self.stringsD, format="%Y%m%d") class ToDatetimeCacheSmallCount(object): params = ([True, False], [50, 500, 5000, 100000]) - param_names = ['cache', 'count'] + param_names = ["cache", "count"] def setup(self, cache, count): - rng = date_range(start='1/1/1971', periods=count) - self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist() + rng = date_range(start="1/1/1971", periods=count) + self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() def time_unique_date_strings(self, cache, count): to_datetime(self.unique_date_strings, cache=cache) class ToDatetimeISO8601: - def setup(self): - rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist() - self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist() - self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' - for x in rng] + rng = date_range(start="1/1/2000", periods=20000, freq="H") + self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() + self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() + self.strings_tz_space = [ + x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng + ] def time_iso8601(self): to_datetime(self.strings) @@ -329,22 +322,21 @@ def time_iso8601_nosep(self): to_datetime(self.strings_nosep) def time_iso8601_format(self): - to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S") def time_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') + to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") def time_iso8601_tz_spaceformat(self): to_datetime(self.strings_tz_space) class ToDatetimeNONISO8601: - def setup(self): N = 10000 half = int(N / 2) - ts_string_1 = 'March 1, 2018 12:00:00+0400' - ts_string_2 = 'March 1, 2018 12:00:00+0500' + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" self.same_offset = [ts_string_1] * N self.diff_offset = [ts_string_1] * half + [ts_string_2] * half @@ -356,50 +348,48 @@ def time_different_offset(self): class ToDatetimeFormatQuarters: - def setup(self): - self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000) + self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) def time_infer_quarter(self): to_datetime(self.s) class ToDatetimeFormat: - def setup(self): - self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) - self.s2 = self.s.str.replace(':\\S+$', '') + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000) + self.s2 = self.s.str.replace(":\\S+$", "") def time_exact(self): - to_datetime(self.s2, format='%d%b%y') + to_datetime(self.s2, format="%d%b%y") def time_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) + to_datetime(self.s, format="%d%b%y", exact=False) class ToDatetimeCache: params = [True, False] - param_names = ['cache'] + param_names = ["cache"] def setup(self, cache): N = 10000 self.unique_numeric_seconds = list(range(N)) self.dup_numeric_seconds = [1000] * N - self.dup_string_dates = ['2000-02-11'] * N - self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N + self.dup_string_dates = ["2000-02-11"] * N + self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N def 
time_unique_seconds_and_unit(self, cache): - to_datetime(self.unique_numeric_seconds, unit='s', cache=cache) + to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) def time_dup_seconds_and_unit(self, cache): - to_datetime(self.dup_numeric_seconds, unit='s', cache=cache) + to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) def time_dup_string_dates(self, cache): to_datetime(self.dup_string_dates, cache=cache) def time_dup_string_dates_and_format(self, cache): - to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache) + to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) def time_dup_string_tzoffset_dates(self, cache): to_datetime(self.dup_string_with_tz, cache=cache) @@ -407,14 +397,12 @@ def time_dup_string_tzoffset_dates(self, cache): class DatetimeAccessor: - params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] - param_names = 'tz' + params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] + param_names = "tz" def setup(self, tz): N = 100000 - self.series = Series( - date_range(start='1/1/2000', periods=N, freq='T', tz=tz) - ) + self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz)) def time_dt_accessor(self, tz): self.series.dt diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index c6e56804c7b213..8ebb2d8d2f35dd 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -7,21 +7,20 @@ class TimestampConstruction: - def time_parse_iso8601_no_tz(self): - Timestamp('2017-08-25 08:16:14') + Timestamp("2017-08-25 08:16:14") def time_parse_iso8601_tz(self): - Timestamp('2017-08-25 08:16:14-0500') + Timestamp("2017-08-25 08:16:14-0500") def time_parse_dateutil(self): - Timestamp('2017/08/25 08:16:14 AM') + Timestamp("2017/08/25 08:16:14 AM") def time_parse_today(self): - Timestamp('today') + Timestamp("today") def time_parse_now(self): - Timestamp('now') + Timestamp("now") def time_fromordinal(self): Timestamp.fromordinal(730120) @@ -31,14 +30,13 @@ def time_fromtimestamp(self): class TimestampProperties: - _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC, - dateutil.tz.tzutc()] - _freqs = [None, 'B'] + _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()] + _freqs = [None, "B"] params = [_tzs, _freqs] - param_names = ['tz', 'freq'] + param_names = ["tz", "freq"] def setup(self, tz, freq): - self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) + self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq) def time_tz(self, tz, freq): self.ts.tz @@ -93,15 +91,14 @@ def time_month_name(self, tz, freq): class TimestampOps: - params = [None, 'US/Eastern', pytz.UTC, - dateutil.tz.tzutc()] - param_names = ['tz'] + params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()] + param_names = ["tz"] def setup(self, tz): - self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) + self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) + self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -124,16 +121,16 @@ def time_to_julian_date(self, tz): self.ts.to_julian_date() def time_floor(self, tz): - self.ts.floor('5T') + self.ts.floor("5T") def time_ceil(self, tz): - self.ts.ceil('5T') + self.ts.ceil("5T") class TimestampAcrossDst: def setup(self): dt = datetime.datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + self.tzinfo = 
pytz.timezone("CET").localize(dt, is_dst=False).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 859481c5d188d4..a44281044e11d0 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -11,45 +11,42 @@ def parse_results(filename): root = tree.getroot() skipped = [] - current_class = '' + current_class = "" i = 1 assert i - 1 == len(skipped) - for el in root.findall('testcase'): - cn = el.attrib['classname'] - for sk in el.findall('skipped'): + for el in root.findall("testcase"): + cn = el.attrib["classname"] + for sk in el.findall("skipped"): old_class = current_class current_class = cn - name = '{classname}.{name}'.format(classname=current_class, - name=el.attrib['name']) - msg = sk.attrib['message'] - out = '' + name = "{classname}.{name}".format( + classname=current_class, name=el.attrib["name"] + ) + msg = sk.attrib["message"] + out = "" if old_class != current_class: ndigits = int(math.log(i, 10) + 1) # 4 for : + space + # + space - out += ('-' * (len(name + msg) + 4 + ndigits) + '\n') - out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg) + out += "-" * (len(name + msg) + 4 + ndigits) + "\n" + out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg) skipped.append(out) i += 1 assert i - 1 == len(skipped) assert i - 1 == len(skipped) # assert len(skipped) == int(root.attrib['skip']) - return '\n'.join(skipped) + return "\n".join(skipped) def main(): - test_files = [ - 'test-data-single.xml', - 'test-data-multiple.xml', - 'test-data.xml', - ] + test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"] - print('SKIPPED TESTS:') + print("SKIPPED TESTS:") for fn in test_files: if os.path.isfile(fn): print(parse_results(fn)) return 0 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index c3647f0c7d2a82..5a07b094e6ad35 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -4,7 +4,7 @@ from matplotlib import rcParams import numpy as np -rcParams['mathtext.fontset'] = 'cm' +rcParams["mathtext.fontset"] = "cm" def fnx(): @@ -37,8 +37,12 @@ def fnx(): plt.figtext(0.05, 0.5, "pandas", size=40) plt.figtext( - 0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$", - size=16, color="#5a89a4") - -fig.savefig('pandas_logo.svg') -fig.savefig('pandas_logo.png') + 0.05, + 0.2, + r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$", + size=16, + color="#5a89a4", +) + +fig.savefig("pandas_logo.svg") +fig.savefig("pandas_logo.png") diff --git a/doc/make.py b/doc/make.py index 496b3cfd4ee453..48febef20fbe66 100755 --- a/doc/make.py +++ b/doc/make.py @@ -24,9 +24,9 @@ DOC_PATH = os.path.dirname(os.path.abspath(__file__)) -SOURCE_PATH = os.path.join(DOC_PATH, 'source') -BUILD_PATH = os.path.join(DOC_PATH, 'build') -REDIRECTS_FILE = os.path.join(DOC_PATH, 'redirects.csv') +SOURCE_PATH = os.path.join(DOC_PATH, "source") +BUILD_PATH = os.path.join(DOC_PATH, "build") +REDIRECTS_FILE = os.path.join(DOC_PATH, "redirects.csv") class DocBuilder: @@ -36,8 +36,15 @@ class DocBuilder: All public methods of this class can be called as parameters of the script. 
""" - def __init__(self, num_jobs=0, include_api=True, single_doc=None, - verbosity=0, warnings_are_errors=False): + + def __init__( + self, + num_jobs=0, + include_api=True, + single_doc=None, + verbosity=0, + warnings_are_errors=False, + ): self.num_jobs = num_jobs self.verbosity = verbosity self.warnings_are_errors = warnings_are_errors @@ -45,16 +52,15 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None, if single_doc: single_doc = self._process_single_doc(single_doc) include_api = False - os.environ['SPHINX_PATTERN'] = single_doc + os.environ["SPHINX_PATTERN"] = single_doc elif not include_api: - os.environ['SPHINX_PATTERN'] = '-api' + os.environ["SPHINX_PATTERN"] = "-api" self.single_doc_html = None - if single_doc and single_doc.endswith('.rst'): - self.single_doc_html = os.path.splitext(single_doc)[0] + '.html' + if single_doc and single_doc.endswith(".rst"): + self.single_doc_html = os.path.splitext(single_doc)[0] + ".html" elif single_doc: - self.single_doc_html = 'reference/api/pandas.{}.html'.format( - single_doc) + self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc) def _process_single_doc(self, single_doc): """ @@ -66,26 +72,30 @@ def _process_single_doc(self, single_doc): (e.g. reference/api/pandas.DataFrame.head.rst). """ base_name, extension = os.path.splitext(single_doc) - if extension in ('.rst', '.ipynb'): + if extension in (".rst", ".ipynb"): if os.path.exists(os.path.join(SOURCE_PATH, single_doc)): return single_doc else: - raise FileNotFoundError('File {} not found'.format(single_doc)) + raise FileNotFoundError("File {} not found".format(single_doc)) - elif single_doc.startswith('pandas.'): + elif single_doc.startswith("pandas."): try: obj = pandas # noqa: F821 - for name in single_doc.split('.'): + for name in single_doc.split("."): obj = getattr(obj, name) except AttributeError: - raise ImportError('Could not import {}'.format(single_doc)) + raise ImportError("Could not import {}".format(single_doc)) else: - return single_doc[len('pandas.'):] + return single_doc[len("pandas.") :] else: - raise ValueError(('--single={} not understood. Value should be a ' - 'valid path to a .rst or .ipynb file, or a ' - 'valid pandas object (e.g. categorical.rst or ' - 'pandas.DataFrame.head)').format(single_doc)) + raise ValueError( + ( + "--single={} not understood. Value should be a " + "valid path to a .rst or .ipynb file, or a " + "valid pandas object (e.g. 
categorical.rst or " + "pandas.DataFrame.head)" + ).format(single_doc) + ) @staticmethod def _run_os(*args): @@ -117,52 +127,55 @@ def _sphinx_build(self, kind): -------- >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ - if kind not in ('html', 'latex'): - raise ValueError('kind must be html or latex, ' - 'not {}'.format(kind)) + if kind not in ("html", "latex"): + raise ValueError("kind must be html or latex, " "not {}".format(kind)) - cmd = ['sphinx-build', '-b', kind] + cmd = ["sphinx-build", "-b", kind] if self.num_jobs: - cmd += ['-j', str(self.num_jobs)] + cmd += ["-j", str(self.num_jobs)] if self.warnings_are_errors: - cmd += ['-W', '--keep-going'] + cmd += ["-W", "--keep-going"] if self.verbosity: - cmd.append('-{}'.format('v' * self.verbosity)) - cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'), - SOURCE_PATH, os.path.join(BUILD_PATH, kind)] + cmd.append("-{}".format("v" * self.verbosity)) + cmd += [ + "-d", + os.path.join(BUILD_PATH, "doctrees"), + SOURCE_PATH, + os.path.join(BUILD_PATH, kind), + ] return subprocess.call(cmd) def _open_browser(self, single_doc_html): """ Open a browser tab showing single """ - url = os.path.join('file://', DOC_PATH, 'build', 'html', - single_doc_html) + url = os.path.join("file://", DOC_PATH, "build", "html", single_doc_html) webbrowser.open(url, new=2) def _get_page_title(self, page): """ Open the rst file `page` and extract its title. """ - fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page)) + fname = os.path.join(SOURCE_PATH, "{}.rst".format(page)) option_parser = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,)) - doc = docutils.utils.new_document( - '', - option_parser.get_default_values()) + components=(docutils.parsers.rst.Parser,) + ) + doc = docutils.utils.new_document("", option_parser.get_default_values()) with open(fname) as f: data = f.read() parser = docutils.parsers.rst.Parser() # do not generate any warning when parsing the rst - with open(os.devnull, 'a') as f: + with open(os.devnull, "a") as f: doc.reporter.stream = f parser.parse(data, doc) - section = next(node for node in doc.children - if isinstance(node, docutils.nodes.section)) - title = next(node for node in section.children - if isinstance(node, docutils.nodes.title)) + section = next( + node for node in doc.children if isinstance(node, docutils.nodes.section) + ) + title = next( + node for node in section.children if isinstance(node, docutils.nodes.title) + ) return title.astext() @@ -171,7 +184,7 @@ def _add_redirects(self): Create in the build directory an html file with a redirect, for every row in REDIRECTS_FILE. """ - html = ''' + html = """ @@ -182,16 +195,14 @@ def _add_redirects(self):

- ''' + """ with open(REDIRECTS_FILE) as mapping_fd: reader = csv.reader(mapping_fd) for row in reader: - if not row or row[0].strip().startswith('#'): + if not row or row[0].strip().startswith("#"): continue - path = os.path.join(BUILD_PATH, - 'html', - *row[0].split('/')) + '.html' + path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html" try: title = self._get_page_title(row[1]) @@ -199,24 +210,26 @@ def _add_redirects(self): # the file can be an ipynb and not an rst, or docutils # may not be able to read the rst because it has some # sphinx specific stuff - title = 'this page' + title = "this page" if os.path.exists(path): - raise RuntimeError(( - 'Redirection would overwrite an existing file: ' - '{}').format(path)) + raise RuntimeError( + ("Redirection would overwrite an existing file: " "{}").format( + path + ) + ) - with open(path, 'w') as moved_page_fd: + with open(path, "w") as moved_page_fd: moved_page_fd.write( - html.format(url='{}.html'.format(row[1]), - title=title)) + html.format(url="{}.html".format(row[1]), title=title) + ) def html(self): """ Build HTML documentation. """ - ret_code = self._sphinx_build('html') - zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + ret_code = self._sphinx_build("html") + zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): os.remove(zip_fname) @@ -231,20 +244,20 @@ def latex(self, force=False): """ Build PDF documentation. """ - if sys.platform == 'win32': - sys.stderr.write('latex build has not been tested on windows\n') + if sys.platform == "win32": + sys.stderr.write("latex build has not been tested on windows\n") else: - ret_code = self._sphinx_build('latex') - os.chdir(os.path.join(BUILD_PATH, 'latex')) + ret_code = self._sphinx_build("latex") + os.chdir(os.path.join(BUILD_PATH, "latex")) if force: for i in range(3): - self._run_os('pdflatex', - '-interaction=nonstopmode', - 'pandas.tex') - raise SystemExit('You should check the file ' - '"build/latex/pandas.pdf" for problems.') + self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") + raise SystemExit( + "You should check the file " + '"build/latex/pandas.pdf" for problems.' + ) else: - self._run_os('make') + self._run_os("make") return ret_code def latex_forced(self): @@ -259,84 +272,101 @@ def clean(): Clean documentation generated files. """ shutil.rmtree(BUILD_PATH, ignore_errors=True) - shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'), - ignore_errors=True) + shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True) def zip_html(self): """ Compress HTML documentation into a zip file. 
""" - zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): os.remove(zip_fname) - dirname = os.path.join(BUILD_PATH, 'html') + dirname = os.path.join(BUILD_PATH, "html") fnames = os.listdir(dirname) os.chdir(dirname) - self._run_os('zip', - zip_fname, - '-r', - '-q', - *fnames) + self._run_os("zip", zip_fname, "-r", "-q", *fnames) def main(): - cmds = [method for method in dir(DocBuilder) if not method.startswith('_')] + cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] argparser = argparse.ArgumentParser( - description='pandas documentation builder', - epilog='Commands: {}'.format(','.join(cmds))) - argparser.add_argument('command', - nargs='?', - default='html', - help='command to run: {}'.format(', '.join(cmds))) - argparser.add_argument('--num-jobs', - type=int, - default=0, - help='number of jobs used by sphinx-build') - argparser.add_argument('--no-api', - default=False, - help='omit api and autosummary', - action='store_true') - argparser.add_argument('--single', - metavar='FILENAME', - type=str, - default=None, - help=('filename (relative to the "source" folder)' - ' of section or method name to compile, e.g. ' - '"development/contributing.rst",' - ' "ecosystem.rst", "pandas.DataFrame.join"')) - argparser.add_argument('--python-path', - type=str, - default=os.path.dirname(DOC_PATH), - help='path') - argparser.add_argument('-v', action='count', dest='verbosity', default=0, - help=('increase verbosity (can be repeated), ' - 'passed to the sphinx build command')) - argparser.add_argument('--warnings-are-errors', '-W', - action='store_true', - help='fail if warnings are raised') + description="pandas documentation builder", + epilog="Commands: {}".format(",".join(cmds)), + ) + argparser.add_argument( + "command", + nargs="?", + default="html", + help="command to run: {}".format(", ".join(cmds)), + ) + argparser.add_argument( + "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" + ) + argparser.add_argument( + "--no-api", default=False, help="omit api and autosummary", action="store_true" + ) + argparser.add_argument( + "--single", + metavar="FILENAME", + type=str, + default=None, + help=( + 'filename (relative to the "source" folder)' + " of section or method name to compile, e.g. " + '"development/contributing.rst",' + ' "ecosystem.rst", "pandas.DataFrame.join"' + ), + ) + argparser.add_argument( + "--python-path", type=str, default=os.path.dirname(DOC_PATH), help="path" + ) + argparser.add_argument( + "-v", + action="count", + dest="verbosity", + default=0, + help=( + "increase verbosity (can be repeated), " + "passed to the sphinx build command" + ), + ) + argparser.add_argument( + "--warnings-are-errors", + "-W", + action="store_true", + help="fail if warnings are raised", + ) args = argparser.parse_args() if args.command not in cmds: - raise ValueError('Unknown command {}. Available options: {}'.format( - args.command, ', '.join(cmds))) + raise ValueError( + "Unknown command {}. Available options: {}".format( + args.command, ", ".join(cmds) + ) + ) # Below we update both os.environ and sys.path. The former is used by # external libraries (namely Sphinx) to compile this module and resolve # the import of `python_path` correctly. 
The latter is used to resolve # the import within the module, injecting it into the global namespace - os.environ['PYTHONPATH'] = args.python_path + os.environ["PYTHONPATH"] = args.python_path sys.path.insert(0, args.python_path) - globals()['pandas'] = importlib.import_module('pandas') + globals()["pandas"] = importlib.import_module("pandas") # Set the matplotlib backend to the non-interactive Agg backend for all # child processes. - os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg' - - builder = DocBuilder(args.num_jobs, not args.no_api, args.single, - args.verbosity, args.warnings_are_errors) + os.environ["MPLBACKEND"] = "module://matplotlib.backends.backend_agg" + + builder = DocBuilder( + args.num_jobs, + not args.no_api, + args.single, + args.verbosity, + args.warnings_are_errors, + ) return getattr(builder, args.command)() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/doc/source/conf.py b/doc/source/conf.py index 2484a9d592e094..3ebc5d8b6333b2 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -34,15 +34,13 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # sys.path.append(os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../sphinxext')) -sys.path.extend([ - - # numpy standard doc extensions - os.path.join(os.path.dirname(__file__), - '..', '../..', - 'sphinxext') - -]) +sys.path.insert(0, os.path.abspath("../sphinxext")) +sys.path.extend( + [ + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext") + ] +) # -- General configuration ----------------------------------------------- @@ -50,65 +48,66 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. # sphinxext. -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.extlinks', - 'sphinx.ext.todo', - 'numpydoc', # handle NumPy documentation formatted docstrings - 'IPython.sphinxext.ipython_directive', - 'IPython.sphinxext.ipython_console_highlighting', - 'matplotlib.sphinxext.plot_directive', - 'sphinx.ext.intersphinx', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.linkcode', - 'nbsphinx', - 'contributors', # custom pandas extension - ] - -exclude_patterns = ['**.ipynb_checkpoints'] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.extlinks", + "sphinx.ext.todo", + "numpydoc", # handle NumPy documentation formatted docstrings + "IPython.sphinxext.ipython_directive", + "IPython.sphinxext.ipython_console_highlighting", + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.intersphinx", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.linkcode", + "nbsphinx", + "contributors", # custom pandas extension +] + +exclude_patterns = ["**.ipynb_checkpoints"] try: import nbconvert except ImportError: - logger.warn('nbconvert not installed. Skipping notebooks.') - exclude_patterns.append('**/*.ipynb') + logger.warn("nbconvert not installed. Skipping notebooks.") + exclude_patterns.append("**/*.ipynb") else: try: nbconvert.utils.pandoc.get_pandoc_version() except nbconvert.utils.pandoc.PandocMissing: - logger.warn('Pandoc not installed. Skipping notebooks.') - exclude_patterns.append('**/*.ipynb') + logger.warn("Pandoc not installed. 
Skipping notebooks.") + exclude_patterns.append("**/*.ipynb") # sphinx_pattern can be '-api' to exclude the API pages, # the path to a file, or a Python object # (e.g. '10min.rst' or 'pandas.DataFrame.head') source_path = os.path.dirname(os.path.abspath(__file__)) -pattern = os.environ.get('SPHINX_PATTERN') +pattern = os.environ.get("SPHINX_PATTERN") if pattern: for dirname, dirs, fnames in os.walk(source_path): for fname in fnames: - if os.path.splitext(fname)[-1] in ('.rst', '.ipynb'): - fname = os.path.relpath(os.path.join(dirname, fname), - source_path) + if os.path.splitext(fname)[-1] in (".rst", ".ipynb"): + fname = os.path.relpath(os.path.join(dirname, fname), source_path) - if (fname == 'index.rst' - and os.path.abspath(dirname) == source_path): + if fname == "index.rst" and os.path.abspath(dirname) == source_path: continue - elif pattern == '-api' and dirname == 'reference': + elif pattern == "-api" and dirname == "reference": exclude_patterns.append(fname) - elif pattern != '-api' and fname != pattern: + elif pattern != "-api" and fname != pattern: exclude_patterns.append(fname) -with open(os.path.join(source_path, 'index.rst.template')) as f: +with open(os.path.join(source_path, "index.rst.template")) as f: t = jinja2.Template(f.read()) -with open(os.path.join(source_path, 'index.rst'), 'w') as f: - f.write(t.render(include_api=pattern is None, - single_doc=(pattern - if pattern is not None and pattern != '-api' - else None))) -autosummary_generate = True if pattern is None else ['index'] +with open(os.path.join(source_path, "index.rst"), "w") as f: + f.write( + t.render( + include_api=pattern is None, + single_doc=(pattern if pattern is not None and pattern != "-api" else None), + ) + ) +autosummary_generate = True if pattern is None else ["index"] # numpydoc numpydoc_attributes_as_param_list = False @@ -122,22 +121,20 @@ import pandas as pd""" # Add any paths that contain templates here, relative to this directory. -templates_path = ['../_templates'] +templates_path = ["../_templates"] # The suffix of source filenames. -source_suffix = [ - '.rst', -] +source_suffix = [".rst"] # The encoding of source files. -source_encoding = 'utf-8' +source_encoding = "utf-8" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'pandas' -copyright = '2008-2014, the pandas development team' +project = "pandas" +copyright = "2008-2014, the pandas development team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -184,7 +181,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -194,7 +191,7 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'nature_with_gtoc' +html_theme = "nature_with_gtoc" # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths @@ -207,7 +204,7 @@ # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ['themes'] +html_theme_path = ["themes"] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". 
@@ -223,12 +220,12 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = os.path.join(html_static_path[0], 'favicon.ico') +html_favicon = os.path.join(html_static_path[0], "favicon.ico") # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -250,60 +247,62 @@ # https://github.com/pandas-dev/pandas/issues/16186 moved_api_pages = [ - ('pandas.core.common.isnull', 'pandas.isna'), - ('pandas.core.common.notnull', 'pandas.notna'), - ('pandas.core.reshape.get_dummies', 'pandas.get_dummies'), - ('pandas.tools.merge.concat', 'pandas.concat'), - ('pandas.tools.merge.merge', 'pandas.merge'), - ('pandas.tools.pivot.pivot_table', 'pandas.pivot_table'), - ('pandas.tseries.tools.to_datetime', 'pandas.to_datetime'), - ('pandas.io.clipboard.read_clipboard', 'pandas.read_clipboard'), - ('pandas.io.excel.ExcelFile.parse', 'pandas.ExcelFile.parse'), - ('pandas.io.excel.read_excel', 'pandas.read_excel'), - ('pandas.io.gbq.read_gbq', 'pandas.read_gbq'), - ('pandas.io.html.read_html', 'pandas.read_html'), - ('pandas.io.json.read_json', 'pandas.read_json'), - ('pandas.io.parsers.read_csv', 'pandas.read_csv'), - ('pandas.io.parsers.read_fwf', 'pandas.read_fwf'), - ('pandas.io.parsers.read_table', 'pandas.read_table'), - ('pandas.io.pickle.read_pickle', 'pandas.read_pickle'), - ('pandas.io.pytables.HDFStore.append', 'pandas.HDFStore.append'), - ('pandas.io.pytables.HDFStore.get', 'pandas.HDFStore.get'), - ('pandas.io.pytables.HDFStore.put', 'pandas.HDFStore.put'), - ('pandas.io.pytables.HDFStore.select', 'pandas.HDFStore.select'), - ('pandas.io.pytables.read_hdf', 'pandas.read_hdf'), - ('pandas.io.sql.read_sql', 'pandas.read_sql'), - ('pandas.io.sql.read_frame', 'pandas.read_frame'), - ('pandas.io.sql.write_frame', 'pandas.write_frame'), - ('pandas.io.stata.read_stata', 'pandas.read_stata'), + ("pandas.core.common.isnull", "pandas.isna"), + ("pandas.core.common.notnull", "pandas.notna"), + ("pandas.core.reshape.get_dummies", "pandas.get_dummies"), + ("pandas.tools.merge.concat", "pandas.concat"), + ("pandas.tools.merge.merge", "pandas.merge"), + ("pandas.tools.pivot.pivot_table", "pandas.pivot_table"), + ("pandas.tseries.tools.to_datetime", "pandas.to_datetime"), + ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"), + ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"), + ("pandas.io.excel.read_excel", "pandas.read_excel"), + ("pandas.io.gbq.read_gbq", "pandas.read_gbq"), + ("pandas.io.html.read_html", "pandas.read_html"), + ("pandas.io.json.read_json", "pandas.read_json"), + ("pandas.io.parsers.read_csv", "pandas.read_csv"), + ("pandas.io.parsers.read_fwf", "pandas.read_fwf"), + ("pandas.io.parsers.read_table", "pandas.read_table"), + ("pandas.io.pickle.read_pickle", "pandas.read_pickle"), + ("pandas.io.pytables.HDFStore.append", "pandas.HDFStore.append"), + ("pandas.io.pytables.HDFStore.get", "pandas.HDFStore.get"), + ("pandas.io.pytables.HDFStore.put", "pandas.HDFStore.put"), + ("pandas.io.pytables.HDFStore.select", "pandas.HDFStore.select"), + ("pandas.io.pytables.read_hdf", "pandas.read_hdf"), + 
("pandas.io.sql.read_sql", "pandas.read_sql"), + ("pandas.io.sql.read_frame", "pandas.read_frame"), + ("pandas.io.sql.write_frame", "pandas.write_frame"), + ("pandas.io.stata.read_stata", "pandas.read_stata"), ] # Again, tuples of (from_old, to_new) moved_classes = [ - ('pandas.tseries.resample.Resampler', 'pandas.core.resample.Resampler'), - ('pandas.formats.style.Styler', 'pandas.io.formats.style.Styler'), + ("pandas.tseries.resample.Resampler", "pandas.core.resample.Resampler"), + ("pandas.formats.style.Styler", "pandas.io.formats.style.Styler"), ] for old, new in moved_classes: # the class itself... moved_api_pages.append((old, new)) - mod, classname = new.rsplit('.', 1) + mod, classname = new.rsplit(".", 1) klass = getattr(importlib.import_module(mod), classname) - methods = [x for x in dir(klass) - if not x.startswith('_') or x in ('__iter__', '__array__')] + methods = [ + x for x in dir(klass) if not x.startswith("_") or x in ("__iter__", "__array__") + ] for method in methods: # ... and each of its public methods moved_api_pages.append( - ("{old}.{method}".format(old=old, method=method), - "{new}.{method}".format(new=new, method=method)) + ( + "{old}.{method}".format(old=old, method=method), + "{new}.{method}".format(new=new, method=method), + ) ) if pattern is None: html_additional_pages = { - 'generated/' + page[0]: 'api_redirect.html' - for page in moved_api_pages + "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages } @@ -323,12 +322,14 @@ import os os.chdir(r'{}') -""".format(os.path.dirname(os.path.dirname(__file__))) +""".format( + os.path.dirname(os.path.dirname(__file__)) +) html_context = { - 'redirects': {old: new for old, new in moved_api_pages}, - 'header': header + "redirects": {old: new for old, new in moved_api_pages}, + "header": header, } # If false, no module index is generated. @@ -352,7 +353,7 @@ # html_file_suffix = '' # Output file base name for HTML help builder. -htmlhelp_basename = 'pandas' +htmlhelp_basename = "pandas" # -- Options for nbsphinx ------------------------------------------------ @@ -371,9 +372,13 @@ # Grouping the document tree into LaTeX files. List of tuples (source start # file, target name, title, author, documentclass [howto/manual]). 
latex_documents = [ - ('index', 'pandas.tex', - 'pandas: powerful Python data analysis toolkit', - r'Wes McKinney\n\& PyData Development Team', 'manual'), + ( + "index", + "pandas.tex", + "pandas: powerful Python data analysis toolkit", + r"Wes McKinney\n\& PyData Development Team", + "manual", + ) ] # The name of an image file (relative to this directory) to place at the top of @@ -396,32 +401,32 @@ if pattern is None: intersphinx_mapping = { - 'dateutil': ("https://dateutil.readthedocs.io/en/latest/", None), - 'matplotlib': ('https://matplotlib.org/', None), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'pandas-gbq': ('https://pandas-gbq.readthedocs.io/en/latest/', None), - 'py': ('https://pylib.readthedocs.io/en/latest/', None), - 'python': ('https://docs.python.org/3/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), - 'statsmodels': ('http://www.statsmodels.org/devel/', None), + "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), + "matplotlib": ("https://matplotlib.org/", None), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), + "py": ("https://pylib.readthedocs.io/en/latest/", None), + "python": ("https://docs.python.org/3/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "statsmodels": ("http://www.statsmodels.org/devel/", None), } # extlinks alias -extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s', - 'GH'), - 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s', - 'wiki ')} +extlinks = { + "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"), + "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "), +} ipython_warning_is_error = False ipython_exec_lines = [ - 'import numpy as np', - 'import pandas as pd', + "import numpy as np", + "import pandas as pd", # This ensures correct rendering on system with console encoding != utf8 # (windows). It forces pandas to encode its output reprs using utf8 # wherever the docs are built. The docs' target is the browser, not # the console, so this is fine. - 'pd.options.display.encoding="utf8"' + 'pd.options.display.encoding="utf8"', ] @@ -430,8 +435,7 @@ import sphinx from sphinx.util import rpartition -from sphinx.ext.autodoc import ( - Documenter, MethodDocumenter, AttributeDocumenter) +from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter from sphinx.ext.autosummary import Autosummary @@ -439,8 +443,9 @@ class AccessorDocumenter(MethodDocumenter): """ Specialized Documenter subclass for accessors. """ - objtype = 'accessor' - directivetype = 'method' + + objtype = "accessor" + directivetype = "method" # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 @@ -448,7 +453,7 @@ class AccessorDocumenter(MethodDocumenter): def format_signature(self): # this method gives an error/warning for the accessors, therefore # overriding it (accessor has no arguments) - return '' + return "" class AccessorLevelDocumenter(Documenter): @@ -456,6 +461,7 @@ class AccessorLevelDocumenter(Documenter): Specialized Documenter subclass for objects on accessor level (methods, attributes). 
""" + # This is the simple straightforward version # modname is None, base the last elements (eg 'hour') # and path the part before (eg 'Series.dt') @@ -468,41 +474,40 @@ class AccessorLevelDocumenter(Documenter): def resolve_name(self, modname, parents, path, base): if modname is None: if path: - mod_cls = path.rstrip('.') + mod_cls = path.rstrip(".") else: mod_cls = None # if documenting a class-level object without path, # there must be a current class, either from a parent # auto directive ... - mod_cls = self.env.temp_data.get('autodoc:class') + mod_cls = self.env.temp_data.get("autodoc:class") # ... or from a class directive if mod_cls is None: - mod_cls = self.env.temp_data.get('py:class') + mod_cls = self.env.temp_data.get("py:class") # ... if still None, there's no way to know if mod_cls is None: return None, [] # HACK: this is added in comparison to ClassLevelDocumenter # mod_cls still exists of class.accessor, so an extra # rpartition is needed - modname, accessor = rpartition(mod_cls, '.') - modname, cls = rpartition(modname, '.') + modname, accessor = rpartition(mod_cls, ".") + modname, cls = rpartition(modname, ".") parents = [cls, accessor] # if the module name is still missing, get it like above if not modname: - modname = self.env.temp_data.get('autodoc:module') + modname = self.env.temp_data.get("autodoc:module") if not modname: - if sphinx.__version__ > '1.3': - modname = self.env.ref_context.get('py:module') + if sphinx.__version__ > "1.3": + modname = self.env.ref_context.get("py:module") else: - modname = self.env.temp_data.get('py:module') + modname = self.env.temp_data.get("py:module") # ... else, it stays None, which means invalid return modname, parents + [base] -class AccessorAttributeDocumenter(AccessorLevelDocumenter, - AttributeDocumenter): - objtype = 'accessorattribute' - directivetype = 'attribute' +class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter): + objtype = "accessorattribute" + directivetype = "attribute" # lower than AttributeDocumenter so this is not chosen for normal # attributes @@ -510,8 +515,8 @@ class AccessorAttributeDocumenter(AccessorLevelDocumenter, class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter): - objtype = 'accessormethod' - directivetype = 'method' + objtype = "accessormethod" + directivetype = "method" # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 @@ -522,14 +527,15 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter): This documenter lets us removes .__call__ from the method signature for callable accessors like Series.plot """ - objtype = 'accessorcallable' - directivetype = 'method' + + objtype = "accessorcallable" + directivetype = "method" # lower than MethodDocumenter; otherwise the doc build prints warnings priority = 0.5 def format_name(self): - return MethodDocumenter.format_name(self).rstrip('.__call__') + return MethodDocumenter.format_name(self).rstrip(".__call__") class PandasAutosummary(Autosummary): @@ -537,15 +543,16 @@ class PandasAutosummary(Autosummary): This alternative autosummary class lets us override the table summary for Series.plot and DataFrame.plot in the API docs. 
""" + def _replace_pandas_items(self, display_name, sig, summary, real_name): # this a hack: ideally we should extract the signature from the # .__call__ method instead of hard coding this - if display_name == 'DataFrame.plot': - sig = '([x, y, kind, ax, ....])' - summary = 'DataFrame plotting accessor and method' - elif display_name == 'Series.plot': - sig = '([kind, ax, figsize, ....])' - summary = 'Series plotting accessor and method' + if display_name == "DataFrame.plot": + sig = "([x, y, kind, ax, ....])" + summary = "DataFrame plotting accessor and method" + elif display_name == "Series.plot": + sig = "([kind, ax, figsize, ....])" + summary = "Series plotting accessor and method" return (display_name, sig, summary, real_name) @staticmethod @@ -554,15 +561,15 @@ def _is_deprecated(real_name): obj, parent, modname = _import_by_name(real_name) except ImportError: return False - doc = NumpyDocString(obj.__doc__ or '') - summary = ''.join(doc['Summary'] + doc['Extended Summary']) - return '.. deprecated::' in summary + doc = NumpyDocString(obj.__doc__ or "") + summary = "".join(doc["Summary"] + doc["Extended Summary"]) + return ".. deprecated::" in summary def _add_deprecation_prefixes(self, items): for item in items: display_name, sig, summary, real_name = item if self._is_deprecated(real_name): - summary = '(DEPRECATED) %s' % summary + summary = "(DEPRECATED) %s" % summary yield display_name, sig, summary, real_name def get_items(self, names): @@ -577,18 +584,18 @@ def linkcode_resolve(domain, info): """ Determine the URL corresponding to Python object """ - if domain != 'py': + if domain != "py": return None - modname = info['module'] - fullname = info['fullname'] + modname = info["module"] + fullname = info["fullname"] submod = sys.modules.get(modname) if submod is None: return None obj = submod - for part in fullname.split('.'): + for part in fullname.split("."): try: obj = getattr(obj, part) except AttributeError: @@ -617,12 +624,14 @@ def linkcode_resolve(domain, info): fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__)) - if '+' in pandas.__version__: - return ("http://github.com/pandas-dev/pandas/blob/master/pandas/" - "{}{}".format(fn, linespec)) + if "+" in pandas.__version__: + return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format( + fn, linespec + ) else: - return ("http://github.com/pandas-dev/pandas/blob/" - "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec)) + return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format( + pandas.__version__, fn, linespec + ) # remove the docstring of the flags attribute (inherited from numpy ndarray) @@ -646,7 +655,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): """ if what == "class": - joined = '\n'.join(lines) + joined = "\n".join(lines) templates = [ """.. rubric:: Attributes @@ -662,25 +671,25 @@ def process_class_docstrings(app, what, name, obj, options, lines): :toctree: None -""" +""", ] for template in templates: if template in joined: - joined = joined.replace(template, '') - lines[:] = joined.split('\n') + joined = joined.replace(template, "") + lines[:] = joined.split("\n") suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just # suppress this warning. 
- 'app.add_directive' + "app.add_directive" ] if pattern: # When building a single document we don't want to warn because references # to other documents are unknown, as it's expected - suppress_warnings.append('ref.ref') + suppress_warnings.append("ref.ref") def rstjinja(app, docname, source): @@ -689,12 +698,10 @@ def rstjinja(app, docname, source): """ # http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/ # Make sure we're outputting HTML - if app.builder.format != 'html': + if app.builder.format != "html": return src = source[0] - rendered = app.builder.templates.render_string( - src, app.config.html_context - ) + rendered = app.builder.templates.render_string(src, app.config.html_context) source[0] = rendered @@ -706,4 +713,4 @@ def setup(app): app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) app.add_autodocumenter(AccessorCallableDocumenter) - app.add_directive('autosummary', PandasAutosummary) + app.add_directive("autosummary", PandasAutosummary) diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 950e3592abf6ec..1a5ab99b5a94f9 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -40,7 +40,7 @@ from git import Repo -UTF8Writer = codecs.getwriter('utf8') +UTF8Writer = codecs.getwriter("utf8") this_repo = Repo(os.path.join(os.path.dirname(__file__), "..", "..")) author_msg = """\ @@ -54,21 +54,19 @@ def get_authors(revision_range): - pat = '^.*\\t(.*)$' - lst_release, cur_release = [r.strip() for r in revision_range.split('..')] + pat = "^.*\\t(.*)$" + lst_release, cur_release = [r.strip() for r in revision_range.split("..")] # authors, in current release and previous to current release. - cur = set(re.findall(pat, this_repo.git.shortlog('-s', revision_range), - re.M)) - pre = set(re.findall(pat, this_repo.git.shortlog('-s', lst_release), - re.M)) + cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M)) + pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M)) # Homu is the author of auto merges, clean him out. - cur.discard('Homu') - pre.discard('Homu') + cur.discard("Homu") + pre.discard("Homu") # Append '+' to new authors. - authors = [s + ' +' for s in cur - pre] + [s for s in cur & pre] + authors = [s + " +" for s in cur - pre] + [s for s in cur & pre] authors.sort() return authors @@ -77,19 +75,19 @@ def get_pull_requests(repo, revision_range): prnums = [] # From regular merges - merges = this_repo.git.log( - '--oneline', '--merges', revision_range) + merges = this_repo.git.log("--oneline", "--merges", revision_range) issues = re.findall("Merge pull request \\#(\\d*)", merges) prnums.extend(int(s) for s in issues) # From Homu merges (Auto merges) - issues = re. 
findall("Auto merge of \\#(\\d*)", merges) + issues = re.findall("Auto merge of \\#(\\d*)", merges) prnums.extend(int(s) for s in issues) # From fast forward squash-merges commits = this_repo.git.log( - '--oneline', '--no-merges', '--first-parent', revision_range) - issues = re.findall('^.*\\(\\#(\\d+)\\)$', commits, re.M) + "--oneline", "--no-merges", "--first-parent", revision_range + ) + issues = re.findall("^.*\\(\\#(\\d+)\\)$", commits, re.M) prnums.extend(int(s) for s in issues) # get PR data from github repo @@ -99,27 +97,29 @@ def get_pull_requests(repo, revision_range): def build_components(revision_range, heading="Contributors"): - lst_release, cur_release = [r.strip() for r in revision_range.split('..')] + lst_release, cur_release = [r.strip() for r in revision_range.split("..")] authors = get_authors(revision_range) return { - 'heading': heading, - 'author_message': author_msg % len(authors), - 'authors': authors, + "heading": heading, + "author_message": author_msg % len(authors), + "authors": authors, } def build_string(revision_range, heading="Contributors"): components = build_components(revision_range, heading=heading) - components['uline'] = '=' * len(components['heading']) - components['authors'] = "* " + "\n* ".join(components['authors']) + components["uline"] = "=" * len(components["heading"]) + components["authors"] = "* " + "\n* ".join(components["authors"]) - tpl = textwrap.dedent("""\ + tpl = textwrap.dedent( + """\ {heading} {uline} {author_message} - {authors}""").format(**components) + {authors}""" + ).format(**components) return tpl @@ -133,6 +133,6 @@ def main(revision_range): from argparse import ArgumentParser parser = ArgumentParser(description="Generate author lists for release") - parser.add_argument('revision_range', help='..') + parser.add_argument("revision_range", help="..") args = parser.parse_args() main(args.revision_range) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 7794a24dad89b7..4256e4659715d2 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -17,40 +17,36 @@ class ContributorsDirective(Directive): required_arguments = 1 - name = 'contributors' + name = "contributors" def run(self): range_ = self.arguments[0] - if range_.endswith('x..HEAD'): + if range_.endswith("x..HEAD"): return [nodes.paragraph(), nodes.bullet_list()] try: components = build_components(range_) except git.GitCommandError as exc: return [ self.state.document.reporter.warning( - "Cannot find contributors for range '{}': {}".format( - range_, exc), - line=self.lineno) + "Cannot find contributors for range '{}': {}".format(range_, exc), + line=self.lineno, + ) ] else: message = nodes.paragraph() - message += nodes.Text(components['author_message']) + message += nodes.Text(components["author_message"]) listnode = nodes.bullet_list() - for author in components['authors']: + for author in components["authors"]: para = nodes.paragraph() para += nodes.Text(author) - listnode += nodes.list_item('', para) + listnode += nodes.list_item("", para) return [message, listnode] def setup(app): - app.add_directive('contributors', ContributorsDirective) + app.add_directive("contributors", ContributorsDirective) - return { - 'version': '0.1', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } + return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True} diff --git a/pandas/__init__.py b/pandas/__init__.py index 5b39d954c2bc33..6351b508fb0e5c 100644 --- a/pandas/__init__.py +++ 
b/pandas/__init__.py @@ -1,6 +1,6 @@ # flake8: noqa -__docformat__ = 'restructuredtext' +__docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies hard_dependencies = ("numpy", "pytz", "dateutil") @@ -13,62 +13,113 @@ missing_dependencies.append("{0}: {1}".format(dependency, str(e))) if missing_dependencies: - raise ImportError("Unable to import required dependencies:\n" + "\n".join(missing_dependencies)) + raise ImportError( + "Unable to import required dependencies:\n" + "\n".join(missing_dependencies) + ) del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import ( - _np_version_under1p14, _np_version_under1p15, _np_version_under1p16, - _np_version_under1p17) + _np_version_under1p14, + _np_version_under1p15, + _np_version_under1p16, + _np_version_under1p17, +) try: - from pandas._libs import (hashtable as _hashtable, - lib as _lib, - tslib as _tslib) + from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib except ImportError as e: # pragma: no cover # hack but overkill to use re - module = str(e).replace('cannot import name ', '') - raise ImportError("C extension: {0} not built. If you want to import " - "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build " - "the C extensions first.".format(module)) + module = str(e).replace("cannot import name ", "") + raise ImportError( + "C extension: {0} not built. If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace --force' to build " + "the C extensions first.".format(module) + ) from datetime import datetime -from pandas._config import (get_option, set_option, reset_option, - describe_option, option_context, options) +from pandas._config import ( + get_option, + set_option, + reset_option, + describe_option, + option_context, + options, +) # let init-time option registration happen import pandas.core.config_init from pandas.core.api import ( # dtype - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, - UInt16Dtype, UInt32Dtype, UInt64Dtype, CategoricalDtype, - PeriodDtype, IntervalDtype, DatetimeTZDtype, - + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, # missing - isna, isnull, notna, notnull, - + isna, + isnull, + notna, + notnull, # indexes - Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, - Float64Index, MultiIndex, IntervalIndex, TimedeltaIndex, - DatetimeIndex, PeriodIndex, IndexSlice, - + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + IndexSlice, # tseries - NaT, Period, period_range, Timedelta, timedelta_range, - Timestamp, date_range, bdate_range, Interval, interval_range, + NaT, + Period, + period_range, + Timedelta, + timedelta_range, + Timestamp, + date_range, + bdate_range, + Interval, + interval_range, DateOffset, - # conversion - to_numeric, to_datetime, to_timedelta, - + to_numeric, + to_datetime, + to_timedelta, # misc - np, Grouper, factorize, unique, value_counts, NamedAgg, - array, Categorical, set_eng_float_format, Series, DataFrame) + np, + Grouper, + factorize, + unique, + value_counts, + NamedAgg, + array, + Categorical, + set_eng_float_format, + Series, + DataFrame, +) from pandas.core.sparse.api import ( 
- SparseArray, SparseDataFrame, SparseSeries, SparseDtype) + SparseArray, + SparseDataFrame, + SparseSeries, + SparseDtype, +) from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -76,35 +127,56 @@ from pandas.core.computation.api import eval from pandas.core.reshape.api import ( - concat, lreshape, melt, wide_to_long, merge, merge_asof, - merge_ordered, crosstab, pivot, pivot_table, get_dummies, - cut, qcut) + concat, + lreshape, + melt, + wide_to_long, + merge, + merge_asof, + merge_ordered, + crosstab, + pivot, + pivot_table, + get_dummies, + cut, + qcut, +) from pandas.util._print_versions import show_versions from pandas.io.api import ( # excel - ExcelFile, ExcelWriter, read_excel, - + ExcelFile, + ExcelWriter, + read_excel, # packers - read_msgpack, to_msgpack, - + read_msgpack, + to_msgpack, # parsers - read_csv, read_fwf, read_table, - + read_csv, + read_fwf, + read_table, # pickle - read_pickle, to_pickle, - + read_pickle, + to_pickle, # pytables - HDFStore, read_hdf, - + HDFStore, + read_hdf, # sql - read_sql, read_sql_query, + read_sql, + read_sql_query, read_sql_table, - # misc - read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas, read_spss) + read_clipboard, + read_parquet, + read_feather, + read_gbq, + read_html, + read_json, + read_stata, + read_sas, + read_spss, +) from pandas.util._tester import test import pandas.testing @@ -112,31 +184,38 @@ # use the closest tagged version if possible from ._version import get_versions + v = get_versions() -__version__ = v.get('closest-tag', v['version']) -__git_version__ = v.get('full-revisionid') +__version__ = v.get("closest-tag", v["version"]) +__git_version__ = v.get("full-revisionid") del get_versions, v # GH 27101 # TODO: remove Panel compat in 1.0 if pandas.compat.PY37: + def __getattr__(name): - if name == 'Panel': + if name == "Panel": import warnings + warnings.warn( "The Panel class is removed from pandas. Accessing it " "from the top-level namespace will also be removed in " "the next version", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) class Panel: pass return Panel - raise AttributeError( - "module 'pandas' has no attribute '{}'".format(name)) + raise AttributeError("module 'pandas' has no attribute '{}'".format(name)) + + else: + class Panel: pass diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index bf221ea444288c..65936a9fcdbf3e 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -5,11 +5,24 @@ importing `dates` and `display` ensures that keys needed by _libs are initialized. 
""" -__all__ = ["config", "detect_console_encoding", "get_option", "set_option", - "reset_option", "describe_option", "option_context", "options"] +__all__ = [ + "config", + "detect_console_encoding", + "get_option", + "set_option", + "reset_option", + "describe_option", + "option_context", + "options", +] from pandas._config import config from pandas._config import dates # noqa:F401 from pandas._config.config import ( - describe_option, get_option, option_context, options, reset_option, - set_option) + describe_option, + get_option, + option_context, + options, + reset_option, + set_option, +) from pandas._config.display import detect_console_encoding diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 6b685a0ce962a0..61e926035c3f21 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -54,9 +54,8 @@ from typing import Dict, List import warnings -DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver') -RegisteredOption = namedtuple('RegisteredOption', - 'key defval doc validator cb') +DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") +RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") # holds deprecated option metdata _deprecated_options = {} # type: Dict[str, DeprecatedOption] @@ -68,7 +67,7 @@ _global_config = {} # type: Dict[str, str] # keys which have a special meaning -_reserved_keys = ['all'] # type: List[str] +_reserved_keys = ["all"] # type: List[str] class OptionError(AttributeError, KeyError): @@ -76,6 +75,7 @@ class OptionError(AttributeError, KeyError): checks """ + # # User API @@ -85,9 +85,9 @@ def _get_single_key(pat, silent): if len(keys) == 0: if not silent: _warn_if_deprecated(pat) - raise OptionError('No such keys(s): {pat!r}'.format(pat=pat)) + raise OptionError("No such keys(s): {pat!r}".format(pat=pat)) if len(keys) > 1: - raise OptionError('Pattern matched multiple keys') + raise OptionError("Pattern matched multiple keys") key = keys[0] if not silent: @@ -110,11 +110,10 @@ def _set_option(*args, **kwargs): # must at least 1 arg deal with constraints later nargs = len(args) if not nargs or nargs % 2 != 0: - raise ValueError("Must provide an even number of non-keyword " - "arguments") + raise ValueError("Must provide an even number of non-keyword " "arguments") # default to false - silent = kwargs.pop('silent', False) + silent = kwargs.pop("silent", False) if kwargs: msg = '_set_option() got an unexpected keyword argument "{kwarg}"' @@ -139,13 +138,13 @@ def _set_option(*args, **kwargs): o.cb(key) -def _describe_option(pat='', _print_desc=True): +def _describe_option(pat="", _print_desc=True): keys = _select_options(pat) if len(keys) == 0: - raise OptionError('No such keys(s)') + raise OptionError("No such keys(s)") - s = '' + s = "" for k in keys: # filter by pat s += _build_option_description(k) @@ -160,13 +159,15 @@ def _reset_option(pat, silent=False): keys = _select_options(pat) if len(keys) == 0: - raise OptionError('No such keys(s)') + raise OptionError("No such keys(s)") - if len(keys) > 1 and len(pat) < 4 and pat != 'all': - raise ValueError('You must specify at least 4 characters when ' - 'resetting multiple keys, use the special keyword ' - '"all" to reset all the options to their default ' - 'value') + if len(keys) > 1 and len(pat) < 4 and pat != "all": + raise ValueError( + "You must specify at least 4 characters when " + "resetting multiple keys, use the special keyword " + '"all" to reset all the options to their default ' + "value" + ) 
for k in keys: _set_option(k, _registered_options[k].defval, silent=silent) @@ -213,6 +214,7 @@ def __getattr__(self, key): def __dir__(self): return list(self.d.keys()) + # For user convenience, we'd like to have the available options described # in the docstring. For dev convenience we'd like to generate the docstrings # dynamically instead of maintaining them by hand. To this, we use the @@ -223,7 +225,6 @@ def __dir__(self): class CallableDynamicDoc: - def __init__(self, func, doc_tmpl): self.__doc_tmpl__ = doc_tmpl self.__func__ = func @@ -233,10 +234,9 @@ def __call__(self, *args, **kwds): @property def __doc__(self): - opts_desc = _describe_option('all', _print_desc=False) + opts_desc = _describe_option("all", _print_desc=False) opts_list = pp_options_list(list(_registered_options.keys())) - return self.__doc_tmpl__.format(opts_desc=opts_desc, - opts_list=opts_list) + return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list) _get_option_tmpl = """ @@ -394,14 +394,14 @@ class option_context: def __init__(self, *args): if not (len(args) % 2 == 0 and len(args) >= 2): - raise ValueError('Need to invoke as' - ' option_context(pat, val, [(pat, val), ...]).') + raise ValueError( + "Need to invoke as" " option_context(pat, val, [(pat, val), ...])." + ) self.ops = list(zip(args[::2], args[1::2])) def __enter__(self): - self.undo = [(pat, _get_option(pat, silent=True)) - for pat, val in self.ops] + self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops] for pat, val in self.ops: _set_option(pat, val, silent=True) @@ -412,7 +412,7 @@ def __exit__(self, *args): _set_option(pat, val, silent=True) -def register_option(key, defval, doc='', validator=None, cb=None): +def register_option(key, defval, doc="", validator=None, cb=None): """Register an option in the package-wide pandas config object Parameters @@ -437,6 +437,7 @@ def register_option(key, defval, doc='', validator=None, cb=None): """ import tokenize import keyword + key = key.lower() if key in _registered_options: @@ -451,10 +452,10 @@ def register_option(key, defval, doc='', validator=None, cb=None): validator(defval) # walk the nested dict, creating dicts as needed along the path - path = key.split('.') + path = key.split(".") for k in path: - if not bool(re.match('^' + tokenize.Name + '$', k)): + if not bool(re.match("^" + tokenize.Name + "$", k)): raise ValueError("{k} is not a valid identifier".format(k=k)) if keyword.iskeyword(k): raise ValueError("{k} is a python keyword".format(k=k)) @@ -463,20 +464,20 @@ def register_option(key, defval, doc='', validator=None, cb=None): msg = "Path prefix to option '{option}' is already an option" for i, p in enumerate(path[:-1]): if not isinstance(cursor, dict): - raise OptionError(msg.format(option='.'.join(path[:i]))) + raise OptionError(msg.format(option=".".join(path[:i]))) if p not in cursor: cursor[p] = {} cursor = cursor[p] if not isinstance(cursor, dict): - raise OptionError(msg.format(option='.'.join(path[:-1]))) + raise OptionError(msg.format(option=".".join(path[:-1]))) cursor[path[-1]] = defval # initialize # save the option metadata - _registered_options[key] = RegisteredOption(key=key, defval=defval, - doc=doc, validator=validator, - cb=cb) + _registered_options[key] = RegisteredOption( + key=key, defval=defval, doc=doc, validator=validator, cb=cb + ) def deprecate_option(key, msg=None, rkey=None, removal_ver=None): @@ -526,6 +527,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): _deprecated_options[key] = 
DeprecatedOption(key, msg, rkey, removal_ver) + # # functions internal to the module @@ -542,14 +544,14 @@ def _select_options(pat): # else look through all of them keys = sorted(_registered_options.keys()) - if pat == 'all': # reserved key + if pat == "all": # reserved key return keys return [k for k in keys if re.search(pat, k, re.I)] def _get_root(key): - path = key.split('.') + path = key.split(".") cursor = _global_config for p in path[:-1]: cursor = cursor[p] @@ -621,12 +623,11 @@ def _warn_if_deprecated(key): else: msg = "'{key}' is deprecated".format(key=key) if d.removal_ver: - msg += (' and will be removed in {version}' - .format(version=d.removal_ver)) + msg += " and will be removed in {version}".format(version=d.removal_ver) if d.rkey: msg += ", please use '{rkey}' instead.".format(rkey=d.rkey) else: - msg += ', please refrain from using it.' + msg += ", please refrain from using it." warnings.warn(msg, FutureWarning) return True @@ -639,22 +640,22 @@ def _build_option_description(k): o = _get_registered_option(k) d = _get_deprecated_option(k) - s = '{k} '.format(k=k) + s = "{k} ".format(k=k) if o.doc: - s += '\n'.join(o.doc.strip().split('\n')) + s += "\n".join(o.doc.strip().split("\n")) else: - s += 'No description available.' + s += "No description available." if o: - s += ('\n [default: {default}] [currently: {current}]' - .format(default=o.defval, current=_get_option(k, True))) + s += "\n [default: {default}] [currently: {current}]".format( + default=o.defval, current=_get_option(k, True) + ) if d: - s += '\n (Deprecated' - s += (', use `{rkey}` instead.' - .format(rkey=d.rkey if d.rkey else '')) - s += ')' + s += "\n (Deprecated" + s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "") + s += ")" return s @@ -666,28 +667,34 @@ def pp_options_list(keys, width=80, _print=False): from itertools import groupby def pp(name, ks): - pfx = ('- ' + name + '.[' if name else '') - ls = wrap(', '.join(ks), width, initial_indent=pfx, - subsequent_indent=' ', break_long_words=False) + pfx = "- " + name + ".[" if name else "" + ls = wrap( + ", ".join(ks), + width, + initial_indent=pfx, + subsequent_indent=" ", + break_long_words=False, + ) if ls and ls[-1] and name: - ls[-1] = ls[-1] + ']' + ls[-1] = ls[-1] + "]" return ls ls = [] - singles = [x for x in sorted(keys) if x.find('.') < 0] + singles = [x for x in sorted(keys) if x.find(".") < 0] if singles: - ls += pp('', singles) - keys = [x for x in keys if x.find('.') >= 0] + ls += pp("", singles) + keys = [x for x in keys if x.find(".") >= 0] - for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]): - ks = [x[len(k) + 1:] for x in list(g)] + for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]): + ks = [x[len(k) + 1 :] for x in list(g)] ls += pp(k, ks) - s = '\n'.join(ls) + s = "\n".join(ls) if _print: print(s) else: return s + # # helpers @@ -724,7 +731,7 @@ def config_prefix(prefix): def wrap(func): def inner(key, *args, **kwds): - pkey = '{prefix}.{key}'.format(prefix=prefix, key=key) + pkey = "{prefix}.{key}".format(prefix=prefix, key=key) return func(pkey, *args, **kwds) return inner @@ -740,6 +747,7 @@ def inner(key, *args, **kwds): get_option = _get_option register_option = _register_option + # These factories and methods are handy for use as the validator # arg in register_option diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py index 85300a308de620..5bf2b49ce59046 100644 --- a/pandas/_config/dates.py +++ b/pandas/_config/dates.py @@ -13,9 +13,11 @@ When True, prints and parses dates 
with the year first, eg 2005/01/20 """ -with cf.config_prefix('display'): +with cf.config_prefix("display"): # Needed upstream of `_libs` because these are used in tslibs.parsing - cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc, - validator=cf.is_bool) - cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc, - validator=cf.is_bool) + cf.register_option( + "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool + ) + cf.register_option( + "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool + ) diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 7997d12e06aa91..6e5fabe2706e5e 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -25,14 +25,14 @@ def detect_console_encoding(): pass # try again for something better - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): try: encoding = locale.getpreferredencoding() except Exception: pass # when all else fails. this will usually be "ascii" - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): encoding = sys.getdefaultencoding() # GH#3360, save the reported defencoding at import time @@ -50,6 +50,7 @@ def detect_console_encoding(): these are generally strings meant to be displayed on the console. """ -with cf.config_prefix('display'): - cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc, - validator=cf.is_text) +with cf.config_prefix("display"): + cf.register_option( + "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text + ) diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 1ca6d073f18c42..46802c64609594 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -37,7 +37,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): locale.setlocale(lc_var, new_locale) normalized_locale = locale.getlocale() if all(x is not None for x in normalized_locale): - yield '.'.join(normalized_locale) + yield ".".join(normalized_locale) else: yield new_locale finally: @@ -99,15 +99,16 @@ def _valid_locales(locales, normalize): def _default_locale_getter(): try: - raw_locales = subprocess.check_output(['locale -a'], shell=True) + raw_locales = subprocess.check_output(["locale -a"], shell=True) except subprocess.CalledProcessError as e: - raise type(e)("{exception}, the 'locale -a' command cannot be found " - "on your system".format(exception=e)) + raise type(e)( + "{exception}, the 'locale -a' command cannot be found " + "on your system".format(exception=e) + ) return raw_locales -def get_locales(prefix=None, normalize=True, - locale_getter=_default_locale_getter): +def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): """ Get all the locales that are available on the system. @@ -145,11 +146,10 @@ def get_locales(prefix=None, normalize=True, # raw_locales is "\n" separated list of locales # it may contain non-decodable parts, so split # extract what we can and then rejoin. 
- raw_locales = raw_locales.split(b'\n') + raw_locales = raw_locales.split(b"\n") out_locales = [] for x in raw_locales: - out_locales.append(str( - x, encoding=options.display.encoding)) + out_locales.append(str(x, encoding=options.display.encoding)) except TypeError: pass @@ -157,6 +157,6 @@ def get_locales(prefix=None, normalize=True, if prefix is None: return _valid_locales(out_locales, normalize) - pattern = re.compile('{prefix}.*'.format(prefix=prefix)) - found = pattern.findall('\n'.join(out_locales)) + pattern = re.compile("{prefix}.*".format(prefix=prefix)) + found = pattern.findall("\n".join(out_locales)) return _valid_locales(found, normalize) diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index fcf5ffbfcad92a..af67cb3be71022 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,11 @@ # flake8: noqa from .tslibs import ( - NaT, NaTType, OutOfBoundsDatetime, Period, Timedelta, Timestamp, iNaT) + NaT, + NaTType, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + iNaT, +) diff --git a/pandas/_typing.py b/pandas/_typing.py index 8947e98bf52cee..46b1b4685ec9f4 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -9,19 +9,25 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries) + ABCDataFrame, + ABCExtensionArray, + ABCIndexClass, + ABCSeries, + ABCSparseSeries, +) -AnyArrayLike = TypeVar('AnyArrayLike', - ABCExtensionArray, - ABCIndexClass, - ABCSeries, - ABCSparseSeries, - np.ndarray) -ArrayLike = TypeVar('ArrayLike', ABCExtensionArray, np.ndarray) -DatetimeLikeScalar = TypeVar('DatetimeLikeScalar', Period, Timestamp, - Timedelta) +AnyArrayLike = TypeVar( + "AnyArrayLike", + ABCExtensionArray, + ABCIndexClass, + ABCSeries, + ABCSparseSeries, + np.ndarray, +) +ArrayLike = TypeVar("ArrayLike", ABCExtensionArray, np.ndarray) +DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta) Dtype = Union[str, np.dtype, ExtensionDtype] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame) +FrameOrSeries = TypeVar("FrameOrSeries", ABCSeries, ABCDataFrame) Scalar = Union[str, int, float] diff --git a/pandas/_version.py b/pandas/_version.py index 5031f411270a16..4f5bdf59a99d5d 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -56,6 +56,7 @@ def decorate(f: Callable) -> Callable: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -66,9 +67,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -96,14 +100,19 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: - print("guessing rootdir is '{root}', but '{dirname}' " - "doesn't start with prefix '{parentdir_prefix}'".format( - root=root, dirname=dirname, - parentdir_prefix=parentdir_prefix)) + print( + "guessing rootdir is '{root}', but '{dirname}' " + "doesn't start with prefix '{parentdir_prefix}'".format( + root=root, 
dirname=dirname, parentdir_prefix=parentdir_prefix + ) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + } @register_vcs_handler("git", "get_keywords") @@ -143,7 +152,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -152,7 +161,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '{}', no digits".format(",".join(refs - tags))) if verbose: @@ -160,19 +169,24 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking {r}".format(r=r)) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + } @register_vcs_handler("git", "pieces_from_vcs") @@ -192,9 +206,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): GITS = ["git.cmd", "git.exe"] # if there is a tag, this yields TAG-NUM-gHEX[-dirty] # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) + describe_out = run_command( + GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -217,32 +231,32 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: " - "'{describe_out}'".format( - describe_out=describe_out)) + pieces["error"] = ( + "unable to parse git-describe output: " + "'{describe_out}'".format(describe_out=describe_out) + ) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - fmt = ("tag '{full_tag}' doesn't start with prefix " - "'{tag_prefix}'") + fmt = "tag '{full_tag}' doesn't start with prefix " "'{tag_prefix}'" msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: print(msg) pieces["error"] = msg return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -253,8 +267,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces @@ -283,8 +296,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], - pieces["short"]) + rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -391,10 +403,12 @@ def render_git_describe_long(pieces): def render(pieces, style): if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + } if not style or style == "default": style = "pep440" # the default @@ -414,8 +428,12 @@ def render(pieces, style): else: raise ValueError("unknown style '{style}'".format(style=style)) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + } def get_versions(): @@ -428,8 +446,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -438,12 +455,15 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -457,6 +477,9 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + } diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 0bd2733cb494cc..431dd2b1968aee 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,12 +1,14 @@ """Public API for extending pandas objects.""" from pandas.core.dtypes.dtypes import ( # noqa: F401 - ExtensionDtype, register_extension_dtype) + ExtensionDtype, + register_extension_dtype, +) from pandas.core.accessor import ( # noqa: F401 - register_index_accessor, register_series_accessor) + register_index_accessor, + register_series_accessor, +) from pandas.core.algorithms import take # noqa: F401 -from pandas.core.arrays import ( # noqa: F401 - ExtensionArray, ExtensionScalarOpsMixin) +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 -from pandas.core.accessor import ( # noqa: F401; noqa: F401 - register_dataframe_accessor) +from pandas.core.accessor import register_dataframe_accessor # noqa: F401; noqa: F401 diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index 668f79921d8e61..f32e1abe28cc13 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -5,4 +5,8 @@ from pandas.core.dtypes.api import * # noqa: F403, F401 from pandas.core.dtypes.concat import union_categoricals # noqa: F401 from pandas.core.dtypes.dtypes import ( # noqa: F401 - CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype) + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index ab014d49236b3c..db01f2a0c674f6 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,16 +4,23 @@ See :ref:`extending.extension-types` for more. 
""" from pandas.core.arrays import ( - Categorical, DatetimeArray, IntegerArray, IntervalArray, PandasArray, - PeriodArray, SparseArray, TimedeltaArray) + Categorical, + DatetimeArray, + IntegerArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + TimedeltaArray, +) __all__ = [ - 'Categorical', - 'DatetimeArray', - 'IntegerArray', - 'IntervalArray', - 'PandasArray', - 'PeriodArray', - 'SparseArray', - 'TimedeltaArray', + "Categorical", + "DatetimeArray", + "IntegerArray", + "IntervalArray", + "PandasArray", + "PeriodArray", + "SparseArray", + "TimedeltaArray", ] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4459e66540dac4..c9597505fa5962 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -13,7 +13,7 @@ PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) -PYPY = platform.python_implementation() == 'PyPy' +PYPY = platform.python_implementation() == "PyPy" # ---------------------------------------------------------------------------- @@ -29,9 +29,7 @@ def set_function_name(f, name, cls): Bind the name/qualname attributes of the function """ f.__name__ = name - f.__qualname__ = '{klass}.{name}'.format( - klass=cls.__name__, - name=name) + f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name) f.__module__ = cls.__module__ return f @@ -49,19 +47,19 @@ def raise_with_traceback(exc, traceback=Ellipsis): # https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian(): """ am I little endian """ - return sys.byteorder == 'little' + return sys.byteorder == "little" def is_platform_windows(): - return sys.platform == 'win32' or sys.platform == 'cygwin' + return sys.platform == "win32" or sys.platform == "cygwin" def is_platform_linux(): - return sys.platform == 'linux2' + return sys.platform == "linux2" def is_platform_mac(): - return sys.platform == 'darwin' + return sys.platform == "darwin" def is_platform_32bit(): diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 620884d66821c5..cd4e1b7e8aa4dd 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -39,23 +39,18 @@ def _get_version(module: types.ModuleType) -> str: - version = getattr(module, '__version__', None) + version = getattr(module, "__version__", None) if version is None: # xlrd uses a capitalized attribute name - version = getattr(module, '__VERSION__', None) + version = getattr(module, "__VERSION__", None) if version is None: - raise ImportError( - "Can't determine version for {}".format(module.__name__) - ) + raise ImportError("Can't determine version for {}".format(module.__name__)) return version def import_optional_dependency( - name: str, - extra: str = "", - raise_on_missing: bool = True, - on_version: str = "raise", + name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" ): """ Import an optional dependency. 
@@ -105,9 +100,7 @@ def import_optional_dependency( if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = version_message.format( - minimum_version=minimum_version, - name=name, - actual_version=version, + minimum_version=minimum_version, name=name, actual_version=version ) if on_version == "warn": warnings.warn(msg, UserWarning) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index e57a2ba3af0ac8..83f1da597d6a6f 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -2,7 +2,6 @@ class DeepChainMap(ChainMap): - def __setitem__(self, key, value): for mapping in self.maps: if key in mapping: diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 22bfab8b7c6d63..ce56c08d3ec147 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,27 +8,29 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p14 = _nlv < LooseVersion('1.14') -_np_version_under1p15 = _nlv < LooseVersion('1.15') -_np_version_under1p16 = _nlv < LooseVersion('1.16') -_np_version_under1p17 = _nlv < LooseVersion('1.17') -_is_numpy_dev = '.dev' in str(_nlv) +_np_version_under1p14 = _nlv < LooseVersion("1.14") +_np_version_under1p15 = _nlv < LooseVersion("1.15") +_np_version_under1p16 = _nlv < LooseVersion("1.16") +_np_version_under1p17 = _nlv < LooseVersion("1.17") +_is_numpy_dev = ".dev" in str(_nlv) -if _nlv < '1.13.3': - raise ImportError('this version of pandas is incompatible with ' - 'numpy < 1.13.3\n' - 'your numpy version is {0}.\n' - 'Please upgrade numpy to >= 1.13.3 to use ' - 'this pandas version'.format(_np_version)) +if _nlv < "1.13.3": + raise ImportError( + "this version of pandas is incompatible with " + "numpy < 1.13.3\n" + "your numpy version is {0}.\n" + "Please upgrade numpy to >= 1.13.3 to use " + "this pandas version".format(_np_version) + ) -_tz_regex = re.compile('[+-]0000$') +_tz_regex = re.compile("[+-]0000$") def tz_replacer(s): if isinstance(s, str): - if s.endswith('Z'): + if s.endswith("Z"): s = s[:-1] elif _tz_regex.search(s): s = s[:-5] @@ -53,7 +55,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): warning, when need to pass '2015-01-01 09:00:00' """ # is_list_like - if (hasattr(arr, '__iter__') and not isinstance(arr, (str, bytes))): + if hasattr(arr, "__iter__") and not isinstance(arr, (str, bytes)): arr = [tz_replacer(s) for s in arr] else: arr = tz_replacer(arr) @@ -61,11 +63,12 @@ def np_array_datetime64_compat(arr, *args, **kwargs): return np.array(arr, *args, **kwargs) -__all__ = ['np', - '_np_version', - '_np_version_under1p14', - '_np_version_under1p15', - '_np_version_under1p16', - '_np_version_under1p17', - '_is_numpy_dev' - ] +__all__ = [ + "np", + "_np_version", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_is_numpy_dev", +] diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 572dd7272986b7..840dec2489a52a 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -26,45 +26,50 @@ from pandas._libs.lib import is_bool, is_integer from pandas.errors import UnsupportedFunctionCall from pandas.util._validators import ( - validate_args, validate_args_and_kwargs, validate_kwargs) + validate_args, + validate_args_and_kwargs, + validate_kwargs, +) class CompatValidator: - - def __init__(self, defaults, fname=None, method=None, - 
max_fname_arg_count=None): + def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): self.fname = fname self.method = method self.defaults = defaults self.max_fname_arg_count = max_fname_arg_count - def __call__(self, args, kwargs, fname=None, - max_fname_arg_count=None, method=None): + def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): if args or kwargs: fname = self.fname if fname is None else fname - max_fname_arg_count = (self.max_fname_arg_count if - max_fname_arg_count is None - else max_fname_arg_count) + max_fname_arg_count = ( + self.max_fname_arg_count + if max_fname_arg_count is None + else max_fname_arg_count + ) method = self.method if method is None else method - if method == 'args': + if method == "args": validate_args(fname, args, max_fname_arg_count, self.defaults) - elif method == 'kwargs': + elif method == "kwargs": validate_kwargs(fname, kwargs, self.defaults) - elif method == 'both': - validate_args_and_kwargs(fname, args, kwargs, - max_fname_arg_count, - self.defaults) + elif method == "both": + validate_args_and_kwargs( + fname, args, kwargs, max_fname_arg_count, self.defaults + ) else: - raise ValueError("invalid validation method " - "'{method}'".format(method=method)) + raise ValueError( + "invalid validation method " "'{method}'".format(method=method) + ) ARGMINMAX_DEFAULTS = dict(out=None) -validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin', - method='both', max_fname_arg_count=1) -validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax', - method='both', max_fname_arg_count=1) +validate_argmin = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 +) +validate_argmax = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1 +) def process_skipna(skipna, args): @@ -103,28 +108,30 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[Union[int, str]]] -ARGSORT_DEFAULTS['axis'] = -1 -ARGSORT_DEFAULTS['kind'] = 'quicksort' -ARGSORT_DEFAULTS['order'] = None +ARGSORT_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS["axis"] = -1 +ARGSORT_DEFAULTS["kind"] = "quicksort" +ARGSORT_DEFAULTS["order"] = None if LooseVersion(_np_version) >= LooseVersion("1.17.0"): # GH-26361. NumPy added radix sort and changed default to None. 
- ARGSORT_DEFAULTS['kind'] = None + ARGSORT_DEFAULTS["kind"] = None -validate_argsort = CompatValidator(ARGSORT_DEFAULTS, fname='argsort', - max_fname_arg_count=0, method='both') +validate_argsort = CompatValidator( + ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" +) # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() \ - # type: OrderedDict[str, Optional[int]] -ARGSORT_DEFAULTS_KIND['axis'] = -1 -ARGSORT_DEFAULTS_KIND['order'] = None -validate_argsort_kind = CompatValidator(ARGSORT_DEFAULTS_KIND, fname='argsort', - max_fname_arg_count=0, method='both') +ARGSORT_DEFAULTS_KIND = OrderedDict() +# type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND["axis"] = -1 +ARGSORT_DEFAULTS_KIND["order"] = None +validate_argsort_kind = CompatValidator( + ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" +) def validate_argsort_with_ascending(ascending, args, kwargs): @@ -145,8 +152,9 @@ def validate_argsort_with_ascending(ascending, args, kwargs): CLIP_DEFAULTS = dict(out=None) # type Dict[str, Any] -validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip', - method='both', max_fname_arg_count=3) +validate_clip = CompatValidator( + CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 +) def validate_clip_with_axis(axis, args, kwargs): @@ -166,18 +174,21 @@ def validate_clip_with_axis(axis, args, kwargs): COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] -COMPRESS_DEFAULTS['axis'] = None -COMPRESS_DEFAULTS['out'] = None -validate_compress = CompatValidator(COMPRESS_DEFAULTS, fname='compress', - method='both', max_fname_arg_count=1) +COMPRESS_DEFAULTS["axis"] = None +COMPRESS_DEFAULTS["out"] = None +validate_compress = CompatValidator( + COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 +) CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] -CUM_FUNC_DEFAULTS['dtype'] = None -CUM_FUNC_DEFAULTS['out'] = None -validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both', - max_fname_arg_count=1) -validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum', - method='both', max_fname_arg_count=1) +CUM_FUNC_DEFAULTS["dtype"] = None +CUM_FUNC_DEFAULTS["out"] = None +validate_cum_func = CompatValidator( + CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1 +) +validate_cumsum = CompatValidator( + CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1 +) def validate_cum_func_with_skipna(skipna, args, kwargs, name): @@ -196,81 +207,88 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] -ALLANY_DEFAULTS['dtype'] = None -ALLANY_DEFAULTS['out'] = None -ALLANY_DEFAULTS['keepdims'] = False -validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all', - method='both', max_fname_arg_count=1) -validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any', - method='both', max_fname_arg_count=1) +ALLANY_DEFAULTS["dtype"] = None +ALLANY_DEFAULTS["out"] = None +ALLANY_DEFAULTS["keepdims"] = False +validate_all = CompatValidator( + ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1 +) +validate_any = CompatValidator( + ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 +) LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) -validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') +validate_logical_func = 
CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") MINMAX_DEFAULTS = dict(out=None, keepdims=False) -validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min', - method='both', max_fname_arg_count=1) -validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max', - method='both', max_fname_arg_count=1) - -RESHAPE_DEFAULTS = dict(order='C') # type: Dict[str, str] -validate_reshape = CompatValidator(RESHAPE_DEFAULTS, fname='reshape', - method='both', max_fname_arg_count=1) +validate_min = CompatValidator( + MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 +) +validate_max = CompatValidator( + MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 +) + +RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str] +validate_reshape = CompatValidator( + RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 +) REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any] -validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat', - method='both', max_fname_arg_count=1) +validate_repeat = CompatValidator( + REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 +) ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any] -validate_round = CompatValidator(ROUND_DEFAULTS, fname='round', - method='both', max_fname_arg_count=1) +validate_round = CompatValidator( + ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 +) -SORT_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[Union[int, str]]] -SORT_DEFAULTS['axis'] = -1 -SORT_DEFAULTS['kind'] = 'quicksort' -SORT_DEFAULTS['order'] = None -validate_sort = CompatValidator(SORT_DEFAULTS, fname='sort', - method='kwargs') +SORT_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS["axis"] = -1 +SORT_DEFAULTS["kind"] = "quicksort" +SORT_DEFAULTS["order"] = None +validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]] -STAT_FUNC_DEFAULTS['dtype'] = None -STAT_FUNC_DEFAULTS['out'] = None +STAT_FUNC_DEFAULTS["dtype"] = None +STAT_FUNC_DEFAULTS["out"] = None PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() -SUM_DEFAULTS['keepdims'] = False -SUM_DEFAULTS['initial'] = None +SUM_DEFAULTS["keepdims"] = False +SUM_DEFAULTS["initial"] = None MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() -MEDIAN_DEFAULTS['overwrite_input'] = False -MEDIAN_DEFAULTS['keepdims'] = False - -STAT_FUNC_DEFAULTS['keepdims'] = False - -validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, - method='kwargs') -validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum', - method='both', max_fname_arg_count=1) -validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod", - method="both", max_fname_arg_count=1) -validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean', - method='both', max_fname_arg_count=1) -validate_median = CompatValidator(MEDIAN_DEFAULTS, fname='median', - method='both', max_fname_arg_count=1) - -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[bool]] -STAT_DDOF_FUNC_DEFAULTS['dtype'] = None -STAT_DDOF_FUNC_DEFAULTS['out'] = None -STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False -validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, - method='kwargs') +MEDIAN_DEFAULTS["overwrite_input"] = False +MEDIAN_DEFAULTS["keepdims"] = False + +STAT_FUNC_DEFAULTS["keepdims"] = False + +validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs") +validate_sum = CompatValidator( + SUM_DEFAULTS, 
fname="sum", method="both", max_fname_arg_count=1 +) +validate_prod = CompatValidator( + PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1 +) +validate_mean = CompatValidator( + STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1 +) +validate_median = CompatValidator( + MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 +) + +STAT_DDOF_FUNC_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS["dtype"] = None +STAT_DDOF_FUNC_DEFAULTS["out"] = None +STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False +validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]] -TAKE_DEFAULTS['out'] = None -TAKE_DEFAULTS['mode'] = 'raise' -validate_take = CompatValidator(TAKE_DEFAULTS, fname='take', - method='kwargs') +TAKE_DEFAULTS["out"] = None +TAKE_DEFAULTS["mode"] = "raise" +validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") def validate_take_with_convert(convert, args, kwargs): @@ -285,20 +303,23 @@ def validate_take_with_convert(convert, args, kwargs): args = (convert,) + args convert = True - validate_take(args, kwargs, max_fname_arg_count=3, method='both') + validate_take(args, kwargs, max_fname_arg_count=3, method="both") return convert TRANSPOSE_DEFAULTS = dict(axes=None) -validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose', - method='both', max_fname_arg_count=0) +validate_transpose = CompatValidator( + TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 +) def validate_window_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .{func}() directly instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .{func}() directly instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -309,10 +330,12 @@ def validate_window_func(name, args, kwargs): def validate_rolling_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .rolling(...).{func}() instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .rolling(...).{func}() instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -323,10 +346,12 @@ def validate_rolling_func(name, args, kwargs): def validate_expanding_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .expanding(...).{func}() instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .expanding(...).{func}() instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -349,14 +374,16 @@ def validate_groupby_func(name, args, kwargs, allowed=None): kwargs = set(kwargs) - set(allowed) if len(args) + len(kwargs) > 0: - raise UnsupportedFunctionCall(( - "numpy operations are not valid " - "with groupby. Use .groupby(...)." - "{func}() instead".format(func=name))) + raise UnsupportedFunctionCall( + ( + "numpy operations are not valid " + "with groupby. Use .groupby(...)." 
+ "{func}() instead".format(func=name) + ) + ) -RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', - 'mean', 'std', 'var') +RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") def validate_resampler_func(method, args, kwargs): @@ -367,10 +394,13 @@ def validate_resampler_func(method, args, kwargs): """ if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: - raise UnsupportedFunctionCall(( - "numpy operations are not valid " - "with resample. Use .resample(...)." - "{func}() instead".format(func=method))) + raise UnsupportedFunctionCall( + ( + "numpy operations are not valid " + "with resample. Use .resample(...)." + "{func}() instead".format(func=method) + ) + ) else: raise TypeError("too many arguments passed in") @@ -392,5 +422,7 @@ def validate_minmax_axis(axis): if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): - raise ValueError("`axis` must be fewer than the number of " - "dimensions ({ndim})".format(ndim=ndim)) + raise ValueError( + "`axis` must be fewer than the number of " + "dimensions ({ndim})".format(ndim=ndim) + ) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 3b63cbf1cfabba..0934d8529fdf79 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -26,7 +26,7 @@ def load_reduce(self): # If we have a deprecated function, # try to replace and try again. - msg = '_reconstruct: First argument must be a sub-type of ndarray' + msg = "_reconstruct: First argument must be a sub-type of ndarray" if msg in str(e): try: @@ -37,10 +37,11 @@ def load_reduce(self): pass # try to re-encode the arguments - if getattr(self, 'encoding', None) is not None: - args = tuple(arg.encode(self.encoding) - if isinstance(arg, str) - else arg for arg in args) + if getattr(self, "encoding", None) is not None: + args = tuple( + arg.encode(self.encoding) if isinstance(arg, str) else arg + for arg in args + ) try: stack[-1] = func(*args) return @@ -48,7 +49,7 @@ def load_reduce(self): pass # unknown exception, re-raise - if getattr(self, 'is_verbose', None): + if getattr(self, "is_verbose", None): print(sys.exc_info()) print(func, args) raise @@ -56,9 +57,7 @@ def load_reduce(self): # If classes are moved, provide compat here. _class_locations_map = { - ('pandas.core.sparse.array', 'SparseArray'): - ('pandas.core.arrays', 'SparseArray'), - + ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), # 15477 # # TODO: When FrozenNDArray is removed, add @@ -71,75 +70,84 @@ def load_reduce(self): # # Afterwards, remove the current entry # for `pandas.core.base.FrozenNDArray`. 
- ('pandas.core.base', 'FrozenNDArray'): - ('pandas.core.indexes.frozen', 'FrozenNDArray'), - ('pandas.core.base', 'FrozenList'): - ('pandas.core.indexes.frozen', 'FrozenList'), - + ("pandas.core.base", "FrozenNDArray"): ( + "pandas.core.indexes.frozen", + "FrozenNDArray", + ), + ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), # 10890 - ('pandas.core.series', 'TimeSeries'): - ('pandas.core.series', 'Series'), - ('pandas.sparse.series', 'SparseTimeSeries'): - ('pandas.core.sparse.series', 'SparseSeries'), - + ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), + ("pandas.sparse.series", "SparseTimeSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), # 12588, extensions moving - ('pandas._sparse', 'BlockIndex'): - ('pandas._libs.sparse', 'BlockIndex'), - ('pandas.tslib', 'Timestamp'): - ('pandas._libs.tslib', 'Timestamp'), - + ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"), + ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"), # 18543 moving period - ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'), - ('pandas._libs.period', 'Period'): - ('pandas._libs.tslibs.period', 'Period'), - + ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"), + ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"), # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype - ('pandas.tslib', '__nat_unpickle'): - ('pandas._libs.tslibs.nattype', '__nat_unpickle'), - ('pandas._libs.tslib', '__nat_unpickle'): - ('pandas._libs.tslibs.nattype', '__nat_unpickle'), - + ("pandas.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + ("pandas._libs.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), # 15998 top-level dirs moving - ('pandas.sparse.array', 'SparseArray'): - ('pandas.core.arrays.sparse', 'SparseArray'), - ('pandas.sparse.series', 'SparseSeries'): - ('pandas.core.sparse.series', 'SparseSeries'), - ('pandas.sparse.frame', 'SparseDataFrame'): - ('pandas.core.sparse.frame', 'SparseDataFrame'), - ('pandas.indexes.base', '_new_Index'): - ('pandas.core.indexes.base', '_new_Index'), - ('pandas.indexes.base', 'Index'): - ('pandas.core.indexes.base', 'Index'), - ('pandas.indexes.numeric', 'Int64Index'): - ('pandas.core.indexes.numeric', 'Int64Index'), - ('pandas.indexes.range', 'RangeIndex'): - ('pandas.core.indexes.range', 'RangeIndex'), - ('pandas.indexes.multi', 'MultiIndex'): - ('pandas.core.indexes.multi', 'MultiIndex'), - ('pandas.tseries.index', '_new_DatetimeIndex'): - ('pandas.core.indexes.datetimes', '_new_DatetimeIndex'), - ('pandas.tseries.index', 'DatetimeIndex'): - ('pandas.core.indexes.datetimes', 'DatetimeIndex'), - ('pandas.tseries.period', 'PeriodIndex'): - ('pandas.core.indexes.period', 'PeriodIndex'), - + ("pandas.sparse.array", "SparseArray"): ( + "pandas.core.arrays.sparse", + "SparseArray", + ), + ("pandas.sparse.series", "SparseSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), + ("pandas.sparse.frame", "SparseDataFrame"): ( + "pandas.core.sparse.frame", + "SparseDataFrame", + ), + ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), + ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), + ("pandas.indexes.numeric", "Int64Index"): ( + "pandas.core.indexes.numeric", + "Int64Index", + ), + ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"), + 
("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"), + ("pandas.tseries.index", "_new_DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "_new_DatetimeIndex", + ), + ("pandas.tseries.index", "DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "DatetimeIndex", + ), + ("pandas.tseries.period", "PeriodIndex"): ( + "pandas.core.indexes.period", + "PeriodIndex", + ), # 19269, arrays moving - ('pandas.core.categorical', 'Categorical'): - ('pandas.core.arrays', 'Categorical'), - + ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"), # 19939, add timedeltaindex, float64index compat from 15998 move - ('pandas.tseries.tdi', 'TimedeltaIndex'): - ('pandas.core.indexes.timedeltas', 'TimedeltaIndex'), - ('pandas.indexes.numeric', 'Float64Index'): - ('pandas.core.indexes.numeric', 'Float64Index'), + ("pandas.tseries.tdi", "TimedeltaIndex"): ( + "pandas.core.indexes.timedeltas", + "TimedeltaIndex", + ), + ("pandas.indexes.numeric", "Float64Index"): ( + "pandas.core.indexes.numeric", + "Float64Index", + ), } # our Unpickler sub-class to override methods and some dispatcher # functions for compat -class Unpickler(pkl._Unpickler): # type: ignore +class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass key = (module, name) diff --git a/pandas/conftest.py b/pandas/conftest.py index 058361af343b63..29833ab2fc0fa5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -24,68 +24,70 @@ # or `deadline=None` to entirely disable timeouts for that test. deadline=500, timeout=hypothesis.unlimited, - suppress_health_check=(hypothesis.HealthCheck.too_slow,) + suppress_health_check=(hypothesis.HealthCheck.too_slow,), ) hypothesis.settings.load_profile("ci") def pytest_addoption(parser): - parser.addoption("--skip-slow", action="store_true", - help="skip slow tests") - parser.addoption("--skip-network", action="store_true", - help="skip network tests") - parser.addoption("--skip-db", action="store_true", - help="skip db tests") - parser.addoption("--run-high-memory", action="store_true", - help="run high memory tests") - parser.addoption("--only-slow", action="store_true", - help="run only slow tests") - parser.addoption("--strict-data-files", action="store_true", - help="Fail if a test is skipped for missing data file.") + parser.addoption("--skip-slow", action="store_true", help="skip slow tests") + parser.addoption("--skip-network", action="store_true", help="skip network tests") + parser.addoption("--skip-db", action="store_true", help="skip db tests") + parser.addoption( + "--run-high-memory", action="store_true", help="run high memory tests" + ) + parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption( + "--strict-data-files", + action="store_true", + help="Fail if a test is skipped for missing data file.", + ) def pytest_runtest_setup(item): - if 'slow' in item.keywords and item.config.getoption("--skip-slow"): + if "slow" in item.keywords and item.config.getoption("--skip-slow"): pytest.skip("skipping due to --skip-slow") - if 'slow' not in item.keywords and item.config.getoption("--only-slow"): + if "slow" not in item.keywords and item.config.getoption("--only-slow"): pytest.skip("skipping due to --only-slow") - if 'network' in item.keywords and item.config.getoption("--skip-network"): + if "network" in item.keywords and item.config.getoption("--skip-network"): pytest.skip("skipping due to --skip-network") - if 'db' in item.keywords and 
item.config.getoption("--skip-db"): + if "db" in item.keywords and item.config.getoption("--skip-db"): pytest.skip("skipping due to --skip-db") - if 'high_memory' in item.keywords and not item.config.getoption( - "--run-high-memory"): - pytest.skip( - "skipping high memory test since --run-high-memory was not set") + if "high_memory" in item.keywords and not item.config.getoption( + "--run-high-memory" + ): + pytest.skip("skipping high memory test since --run-high-memory was not set") # Configurations for all tests and all test modules + @pytest.fixture(autouse=True) def configure_tests(): - pd.set_option('chained_assignment', 'raise') + pd.set_option("chained_assignment", "raise") # For running doctests: make np and pd names available + @pytest.fixture(autouse=True) def add_imports(doctest_namespace): - doctest_namespace['np'] = np - doctest_namespace['pd'] = pd + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd -@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil']) +@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def spmatrix(request): from scipy import sparse - return getattr(sparse, request.param + '_matrix') + + return getattr(sparse, request.param + "_matrix") -@pytest.fixture(params=[0, 1, 'index', 'columns'], - ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: "axis {!r}".format(x)) def axis(request): """ Fixture for returning the axis numbers of a DataFrame. @@ -96,7 +98,7 @@ def axis(request): axis_frame = axis -@pytest.fixture(params=[0, 'index'], ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, "index"], ids=lambda x: "axis {!r}".format(x)) def axis_series(request): """ Fixture for returning the axis numbers of a Series. @@ -112,8 +114,9 @@ def ip(): Will raise a skip if IPython is not installed. 
""" - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell + return InteractiveShell() @@ -134,13 +137,22 @@ def ordered_fixture(request): return request.param -_all_arithmetic_operators = ['__add__', '__radd__', - '__sub__', '__rsub__', - '__mul__', '__rmul__', - '__floordiv__', '__rfloordiv__', - '__truediv__', '__rtruediv__', - '__pow__', '__rpow__', - '__mod__', '__rmod__'] +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mul__", + "__rmul__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", +] @pytest.fixture(params=_all_arithmetic_operators) @@ -151,9 +163,18 @@ def all_arithmetic_operators(request): return request.param -_all_numeric_reductions = ['sum', 'max', 'min', - 'mean', 'prod', 'std', 'var', 'median', - 'kurt', 'skew'] +_all_numeric_reductions = [ + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", +] @pytest.fixture(params=_all_numeric_reductions) @@ -164,7 +185,7 @@ def all_numeric_reductions(request): return request.param -_all_boolean_reductions = ['all', 'any'] +_all_boolean_reductions = ["all", "any"] @pytest.fixture(params=_all_boolean_reductions) @@ -202,13 +223,15 @@ def _get_cython_table_params(ndframe, func_names_and_expected): results = [] for func_name, expected in func_names_and_expected: results.append((ndframe, func_name, expected)) - results += [(ndframe, func, expected) for func, name in _cython_table - if name == func_name] + results += [ + (ndframe, func, expected) + for func, name in _cython_table + if name == func_name + ] return results -@pytest.fixture(params=['__eq__', '__ne__', '__le__', - '__lt__', '__ge__', '__gt__']) +@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) def all_compare_operators(request): """ Fixture for dunder names for common compare operations @@ -223,7 +246,7 @@ def all_compare_operators(request): return request.param -@pytest.fixture(params=['__le__', '__lt__', '__ge__', '__gt__']) +@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"]) def compare_operators_no_eq_ne(request): """ Fixture for dunder names for compare operations except == and != @@ -236,7 +259,7 @@ def compare_operators_no_eq_ne(request): return request.param -@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', 'xz']) +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) def compression(request): """ Fixture for trying common compression types in compression tests @@ -244,7 +267,7 @@ def compression(request): return request.param -@pytest.fixture(params=['gzip', 'bz2', 'zip', 'xz']) +@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) def compression_only(request): """ Fixture for trying common compression types in compression tests excluding @@ -261,12 +284,12 @@ def writable(request): return request.param -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def datetime_tz_utc(): return timezone.utc -@pytest.fixture(params=['utc', 'dateutil/UTC', utc, tzutc(), timezone.utc]) +@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) def utc_fixture(request): """ Fixture to provide variants of UTC timezone strings and tzinfo objects @@ -274,7 +297,7 @@ def utc_fixture(request): return request.param -@pytest.fixture(params=['inner', 'outer', 'left', 'right']) +@pytest.fixture(params=["inner", "outer", "left", 
"right"]) def join_type(request): """ Fixture for trying all types of join operations @@ -305,7 +328,7 @@ def datapath(strict_data_files): ValueError If the path doesn't exist and the --strict-data-files option is set. """ - BASE_PATH = os.path.join(os.path.dirname(__file__), 'tests') + BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") def deco(*args): path = os.path.join(BASE_PATH, *args) @@ -317,16 +340,17 @@ def deco(*args): msg = "Could not find {}." pytest.skip(msg.format(path)) return path + return deco @pytest.fixture def iris(datapath): """The iris dataset as a DataFrame.""" - return pd.read_csv(datapath('data', 'iris.csv')) + return pd.read_csv(datapath("data", "iris.csv")) -@pytest.fixture(params=['nlargest', 'nsmallest']) +@pytest.fixture(params=["nlargest", "nsmallest"]) def nselect_method(request): """ Fixture for trying all nselect methods @@ -334,7 +358,7 @@ def nselect_method(request): return request.param -@pytest.fixture(params=['left', 'right', 'both', 'neither']) +@pytest.fixture(params=["left", "right", "both", "neither"]) def closed(request): """ Fixture for trying all interval closed parameters @@ -342,7 +366,7 @@ def closed(request): return request.param -@pytest.fixture(params=['left', 'right', 'both', 'neither']) +@pytest.fixture(params=["left", "right", "both", "neither"]) def other_closed(request): """ Secondary closed fixture to allow parametrizing over all pairs of closed @@ -350,7 +374,7 @@ def other_closed(request): return request.param -@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")]) def nulls_fixture(request): """ Fixture for each null type in pandas @@ -373,11 +397,22 @@ def unique_nulls_fixture(request): unique_nulls_fixture2 = unique_nulls_fixture -TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific', - 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300), - FixedOffset(0), FixedOffset(-300), timezone.utc, - timezone(timedelta(hours=1)), - timezone(timedelta(hours=-1), name='foo')] +TIMEZONES = [ + None, + "UTC", + "US/Eastern", + "Asia/Tokyo", + "dateutil/US/Pacific", + "dateutil/Asia/Singapore", + tzutc(), + tzlocal(), + FixedOffset(300), + FixedOffset(0), + FixedOffset(-300), + timezone.utc, + timezone(timedelta(hours=1)), + timezone(timedelta(hours=-1), name="foo"), +] TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -416,19 +451,26 @@ def tz_aware_fixture(request): FLOAT_DTYPES = [float, "float32", "float64"] COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, 'str', 'U'] +STRING_DTYPES = [str, "str", "U"] -DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]'] -TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]'] +DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] +TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] -BOOL_DTYPES = [bool, 'bool'] -BYTES_DTYPES = [bytes, 'bytes'] -OBJECT_DTYPES = [object, 'object'] +BOOL_DTYPES = [bool, "bool"] +BYTES_DTYPES = [bytes, "bytes"] +OBJECT_DTYPES = [object, "object"] ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES + - DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES + - OBJECT_DTYPES + BYTES_DTYPES) +ALL_NUMPY_DTYPES = ( + ALL_REAL_DTYPES + + COMPLEX_DTYPES + + STRING_DTYPES + + DATETIME64_DTYPES + + TIMEDELTA64_DTYPES + + BOOL_DTYPES + + OBJECT_DTYPES + + BYTES_DTYPES +) @pytest.fixture(params=STRING_DTYPES) @@ -618,29 +660,29 @@ def any_numpy_dtype(request): # 
categoricals are handled separately _any_skipna_inferred_dtype = [ - ('string', ['a', np.nan, 'c']), - ('bytes', [b'a', np.nan, b'c']), - ('empty', [np.nan, np.nan, np.nan]), - ('empty', []), - ('mixed-integer', ['a', np.nan, 2]), - ('mixed', ['a', np.nan, 2.0]), - ('floating', [1.0, np.nan, 2.0]), - ('integer', [1, np.nan, 2]), - ('mixed-integer-float', [1, np.nan, 2.0]), - ('decimal', [Decimal(1), np.nan, Decimal(2)]), - ('boolean', [True, np.nan, False]), - ('datetime64', [np.datetime64('2013-01-01'), np.nan, - np.datetime64('2018-01-01')]), - ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), - ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), + ("mixed", ["a", np.nan, 2.0]), + ("floating", [1.0, np.nan, 2.0]), + ("integer", [1, np.nan, 2]), + ("mixed-integer-float", [1, np.nan, 2.0]), + ("decimal", [Decimal(1), np.nan, Decimal(2)]), + ("boolean", [True, np.nan, False]), + ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), + ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), + ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), # The following two dtypes are commented out due to GH 23554 # ('complex', [1 + 1j, np.nan, 2 + 2j]), # ('timedelta64', [np.timedelta64(1, 'D'), # np.nan, np.timedelta64(2, 'D')]), - ('timedelta', [timedelta(1), np.nan, timedelta(2)]), - ('time', [time(1), np.nan, time(2)]), - ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), - ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] + ("timedelta", [timedelta(1), np.nan, timedelta(2)]), + ("time", [time(1), np.nan, time(2)]), + ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), +] ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id @@ -692,45 +734,55 @@ def any_skipna_inferred_dtype(request): return inferred_dtype, values -@pytest.fixture(params=[getattr(pd.offsets, o) for o in pd.offsets.__all__ if - issubclass(getattr(pd.offsets, o), pd.offsets.Tick)]) +@pytest.fixture( + params=[ + getattr(pd.offsets, o) + for o in pd.offsets.__all__ + if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) + ] +) def tick_classes(request): """ Fixture for Tick based datetime offsets available for a time series. 
""" return request.param + # ---------------------------------------------------------------- # Global setup for tests using Hypothesis # Registering these strategies makes them globally available via st.from_type, # which is use for offsets in tests/tseries/offsets/test_offsets_properties.py -for name in 'MonthBegin MonthEnd BMonthBegin BMonthEnd'.split(): +for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( - cls, - n=st.integers(-99, 99), - normalize=st.booleans(), - )) + st.register_type_strategy( + cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) + ) -for name in 'YearBegin YearEnd BYearBegin BYearEnd'.split(): +for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( + st.register_type_strategy( cls, - n=st.integers(-5, 5), - normalize=st.booleans(), - month=st.integers(min_value=1, max_value=12), - )) - -for name in 'QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd'.split(): + st.builds( + cls, + n=st.integers(-5, 5), + normalize=st.booleans(), + month=st.integers(min_value=1, max_value=12), + ), + ) + +for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( + st.register_type_strategy( cls, - n=st.integers(-24, 24), - normalize=st.booleans(), - startingMonth=st.integers(min_value=1, max_value=12) - )) + st.builds( + cls, + n=st.integers(-24, 24), + normalize=st.booleans(), + startingMonth=st.integers(min_value=1, max_value=12), + ), + ) @pytest.fixture diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index b092541da93e64..f84033e9c3c90f 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -13,7 +13,8 @@ class DirNamesMixin: _accessors = set() # type: Set[str] _deprecations = frozenset( - ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) + ["asobject", "base", "data", "flags", "itemsize", "strides"] + ) def _dir_deletions(self): """ @@ -50,8 +51,7 @@ class PandasDelegate: """ def _delegate_property_get(self, name, *args, **kwargs): - raise TypeError("You cannot access the " - "property {name}".format(name=name)) + raise TypeError("You cannot access the " "property {name}".format(name=name)) def _delegate_property_set(self, name, value, *args, **kwargs): raise TypeError("The property {name} cannot be set".format(name=name)) @@ -60,8 +60,7 @@ def _delegate_method(self, name, *args, **kwargs): raise TypeError("You cannot call method {name}".format(name=name)) @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ, - overwrite=False): + def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): """ Add accessors to cls from the delegate class. 
@@ -76,7 +75,6 @@ def _add_delegate_accessors(cls, delegate, accessors, typ, """ def _create_delegator_property(name): - def _getter(self): return self._delegate_property_get(name) @@ -86,11 +84,11 @@ def _setter(self, new_values): _getter.__name__ = name _setter.__name__ = name - return property(fget=_getter, fset=_setter, - doc=getattr(delegate, name).__doc__) + return property( + fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__ + ) def _create_delegator_method(name): - def f(self, *args, **kwargs): return self._delegate_method(name, *args, **kwargs) @@ -101,7 +99,7 @@ def f(self, *args, **kwargs): for name in accessors: - if typ == 'property': + if typ == "property": f = _create_delegator_property(name) else: f = _create_delegator_method(name) @@ -138,9 +136,9 @@ def delegate_names(delegate, accessors, typ, overwrite=False): class CategoricalAccessor(PandasDelegate): [...] """ + def add_delegate_accessors(cls): - cls._add_delegate_accessors(delegate, accessors, typ, - overwrite=overwrite) + cls._add_delegate_accessors(delegate, accessors, typ, overwrite=overwrite) return cls return add_delegate_accessors @@ -151,6 +149,7 @@ def add_delegate_accessors(cls): # 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors # 2. We use a UserWarning instead of a custom Warning + class CachedAccessor: """ Custom property-like object (descriptor) for caching accessors. @@ -164,6 +163,7 @@ class CachedAccessor: should expect one of a ``Series``, ``DataFrame`` or ``Index`` as the single argument ``data`` """ + def __init__(self, name, accessor): self._name = name self._accessor = accessor @@ -185,14 +185,16 @@ def _register_accessor(name, cls): def decorator(accessor): if hasattr(cls, name): warnings.warn( - 'registration of accessor {!r} under name {!r} for type ' - '{!r} is overriding a preexisting attribute with the same ' - 'name.'.format(accessor, name, cls), + "registration of accessor {!r} under name {!r} for type " + "{!r} is overriding a preexisting attribute with the same " + "name.".format(accessor, name, cls), UserWarning, - stacklevel=2) + stacklevel=2, + ) setattr(cls, name, CachedAccessor(name, accessor)) cls._accessors.add(name) return accessor + return decorator @@ -266,25 +268,40 @@ def plot(self): """ -@Appender(_doc % dict(klass="DataFrame", - others=("register_series_accessor, " - "register_index_accessor"))) +@Appender( + _doc + % dict( + klass="DataFrame", + others=("register_series_accessor, " "register_index_accessor"), + ) +) def register_dataframe_accessor(name): from pandas import DataFrame + return _register_accessor(name, DataFrame) -@Appender(_doc % dict(klass="Series", - others=("register_dataframe_accessor, " - "register_index_accessor"))) +@Appender( + _doc + % dict( + klass="Series", + others=("register_dataframe_accessor, " "register_index_accessor"), + ) +) def register_series_accessor(name): from pandas import Series + return _register_accessor(name, Series) -@Appender(_doc % dict(klass="Index", - others=("register_dataframe_accessor, " - "register_series_accessor"))) +@Appender( + _doc + % dict( + klass="Index", + others=("register_dataframe_accessor, " "register_series_accessor"), + ) +) def register_index_accessor(name): from pandas import Index + return _register_accessor(name, Index) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4e84d7b26b7075..79f205de118789 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -13,16 +13,39 @@ from pandas.util._decorators import Appender, 
Substitution, deprecate_kwarg from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, maybe_promote) + construct_1d_object_array_from_listlike, + maybe_promote, +) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_object, ensure_platform_int, - ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, - is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, - is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, - is_unsigned_integer_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_object, + ensure_platform_int, + ensure_uint64, + is_array_like, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -62,20 +85,19 @@ def _ensure_data(values, dtype=None): # we check some simple dtypes first try: if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype('uint64'), 'bool', 'uint64' + return np.asarray(values).astype("uint64"), "bool", "uint64" elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): - return ensure_int64(values), 'int64', 'int64' - elif (is_unsigned_integer_dtype(values) or - is_unsigned_integer_dtype(dtype)): - return ensure_uint64(values), 'uint64', 'uint64' + return ensure_int64(values), "int64", "int64" + elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): + return ensure_uint64(values), "uint64", "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): - return ensure_float64(values), 'float64', 'float64' + return ensure_float64(values), "float64", "float64" elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -83,24 +105,28 @@ def _ensure_data(values, dtype=None): with catch_warnings(): simplefilter("ignore", np.ComplexWarning) values = ensure_float64(values) - return values, 'float64', 'float64' + return values, "float64", "float64" except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" # datetimelike - if (needs_i8_conversion(values) or - is_period_dtype(dtype) or - is_datetime64_any_dtype(dtype) or - is_timedelta64_dtype(dtype)): + if ( + needs_i8_conversion(values) + or is_period_dtype(dtype) + or 
is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): if is_period_dtype(values) or is_period_dtype(dtype): from pandas import PeriodIndex + values = PeriodIndex(values) dtype = values.dtype elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex + values = TimedeltaIndex(values) dtype = values.dtype else: @@ -108,31 +134,33 @@ def _ensure_data(values, dtype=None): if values.ndim > 1 and is_datetime64_ns_dtype(values): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 - asi8 = values.view('i8') + asi8 = values.view("i8") dtype = values.dtype - return asi8, dtype, 'int64' + return asi8, dtype, "int64" from pandas import DatetimeIndex + values = DatetimeIndex(values) dtype = values.dtype - return values.asi8, dtype, 'int64' + return values.asi8, dtype, "int64" - elif (is_categorical_dtype(values) and - (is_categorical_dtype(dtype) or dtype is None)): - values = getattr(values, 'values', values) + elif is_categorical_dtype(values) and ( + is_categorical_dtype(dtype) or dtype is None + ): + values = getattr(values, "values", values) values = values.codes - dtype = 'category' + dtype = "category" # we are actually coercing to int64 # until our algos support int* directly (not all do) values = ensure_int64(values) - return values, dtype, 'int64' + return values, dtype, "int64" # we have failed, return object values = np.asarray(values, dtype=np.object) - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" def _reconstruct_data(values, dtype, original): @@ -150,6 +178,7 @@ def _reconstruct_data(values, dtype, original): Index for extension types, otherwise ndarray casted to dtype """ from pandas import Index + if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): @@ -172,7 +201,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ['mixed', 'string', 'unicode']: + if inferred in ["mixed", "string", "unicode"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -182,11 +211,11 @@ def _ensure_arraylike(values): _hashtables = { - 'float64': (htable.Float64HashTable, htable.Float64Vector), - 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), - 'int64': (htable.Int64HashTable, htable.Int64Vector), - 'string': (htable.StringHashTable, htable.ObjectVector), - 'object': (htable.PyObjectHashTable, htable.ObjectVector) + "float64": (htable.Float64HashTable, htable.Float64Vector), + "uint64": (htable.UInt64HashTable, htable.UInt64Vector), + "int64": (htable.Int64HashTable, htable.Int64Vector), + "string": (htable.StringHashTable, htable.ObjectVector), + "object": (htable.PyObjectHashTable, htable.ObjectVector), } @@ -206,15 +235,15 @@ def _get_hashtable_algo(values): """ values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" else: - ndtype = 'object' + ndtype = "object" htable, table = _hashtables[ndtype] return (htable, table, 
values, dtype, ndtype) @@ -226,15 +255,15 @@ def _get_data_algo(values, func_map): values = values._values_for_rank() values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" - f = func_map.get(ndtype, func_map['object']) + f = func_map.get(ndtype, func_map["object"]) return f, values @@ -243,6 +272,7 @@ def _get_data_algo(values, func_map): # top-level algos # # --------------- # + def match(to_match, values, na_sentinel=-1): """ Compute locations of to_match into values @@ -275,6 +305,7 @@ def match(to_match, values, na_sentinel=-1): # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas import Series + result = Series(result.ravel()).replace(-1, na_sentinel) result = result.values.reshape(result.shape) @@ -393,13 +424,19 @@ def isin(comps, values): """ if not is_list_like(comps): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{comps_type}]" - .format(comps_type=type(comps).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{comps_type}]".format( + comps_type=type(comps).__name__ + ) + ) if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format( + values_type=type(values).__name__ + ) + ) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) @@ -423,8 +460,8 @@ def isin(comps, values): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) f = lambda x, y: htable.ismember_int64(x, y) except (TypeError, ValueError, OverflowError): values = values.astype(object) @@ -432,8 +469,8 @@ def isin(comps, values): elif is_float_dtype(comps): try: - values = values.astype('float64', copy=False) - comps = comps.astype('float64', copy=False) + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) f = lambda x, y: htable.ismember_float64(x, y) except (TypeError, ValueError): values = values.astype(object) @@ -442,8 +479,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, na_sentinel=-1, size_hint=None, - na_value=None): +def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): """Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. 
@@ -467,14 +503,17 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize(values, na_sentinel=na_sentinel, - na_value=na_value) + uniques, labels = table.factorize( + values, na_sentinel=na_sentinel, na_value=na_value + ) labels = ensure_platform_int(labels) return labels, uniques -_shared_docs['factorize'] = """ +_shared_docs[ + "factorize" +] = """ Encode the object as an enumerated type or categorical variable. This method is useful for obtaining a numeric representation of an @@ -568,29 +607,37 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, @Substitution( - values=dedent("""\ + values=dedent( + """\ values : sequence A 1-D sequence. Sequences that aren't pandas objects are coerced to ndarrays before factorization. - """), - order=dedent("""\ + """ + ), + order=dedent( + """\ order : None .. deprecated:: 0.23.0 This parameter has no effect and is deprecated. - """), - sort=dedent("""\ + """ + ), + sort=dedent( + """\ sort : bool, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """), - size_hint=dedent("""\ + """ + ), + size_hint=dedent( + """\ size_hint : int, optional Hint to the hashtable sizer. - """), + """ + ), ) -@Appender(_shared_docs['factorize']) -@deprecate_kwarg(old_arg_name='order', new_arg_name=None) +@Appender(_shared_docs["factorize"]) +@deprecate_kwarg(old_arg_name="order", new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -605,28 +652,31 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): original = values if is_extension_array_dtype(values): - values = getattr(values, '_values', values) + values = getattr(values, "_values", values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - if (is_datetime64_any_dtype(original) or - is_timedelta64_dtype(original) or - is_period_dtype(original)): + if ( + is_datetime64_any_dtype(original) + or is_timedelta64_dtype(original) + or is_period_dtype(original) + ): na_value = na_value_for_dtype(original.dtype) else: na_value = None - labels, uniques = _factorize_array(values, - na_sentinel=na_sentinel, - size_hint=size_hint, - na_value=na_value) + labels, uniques = _factorize_array( + values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value + ) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, - assume_unique=True, verify=False) + + uniques, labels = safe_sort( + uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False + ) uniques = _reconstruct_data(uniques, dtype, original) @@ -635,13 +685,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index + uniques = Index(uniques) return labels, uniques -def value_counts(values, sort=True, ascending=False, normalize=False, - bins=None, dropna=True): +def value_counts( + values, sort=True, ascending=False, normalize=False, bins=None, dropna=True +): """ Compute a histogram of the counts of non-null values. 
@@ -666,11 +718,13 @@ def value_counts(values, sort=True, ascending=False, normalize=False, """ from pandas.core.series import Series, Index - name = getattr(values, 'name', None) + + name = getattr(values, "name", None) if bins is not None: try: from pandas.core.reshape.tile import cut + values = Series(values) ii = cut(values, bins, include_lowest=True) except TypeError: @@ -679,7 +733,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) result = result[result.index.notna()] - result.index = result.index.astype('interval') + result.index = result.index.astype("interval") result = result.sort_index() # if we are dropna and we have NO values @@ -757,7 +811,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep="first"): """ Return boolean ndarray denoting duplicate values. @@ -829,8 +883,7 @@ def mode(values, dropna=True): return Series(result) -def rank(values, axis=0, method='average', na_option='keep', - ascending=True, pct=False): +def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False): """ Rank the values along a given axis. @@ -856,12 +909,23 @@ def rank(values, axis=0, method='average', na_option='keep', """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) - ranks = f(values, ties_method=method, ascending=ascending, - na_option=na_option, pct=pct) + ranks = f( + values, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) - ranks = f(values, axis=axis, ties_method=method, - ascending=ascending, na_option=na_option, pct=pct) + ranks = f( + values, + axis=axis, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) else: raise TypeError("Array with ndim > 2 are not supported.") @@ -932,10 +996,12 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): elif not mask2.any(): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: - to_raise = (((np.iinfo(np.int64).max - - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or - ((np.iinfo(np.int64).min - - b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + to_raise = ( + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) if to_raise: raise OverflowError("Overflow in int64 addition") @@ -943,21 +1009,21 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): _rank1d_functions = { - 'float64': algos.rank_1d_float64, - 'int64': algos.rank_1d_int64, - 'uint64': algos.rank_1d_uint64, - 'object': algos.rank_1d_object + "float64": algos.rank_1d_float64, + "int64": algos.rank_1d_int64, + "uint64": algos.rank_1d_uint64, + "object": algos.rank_1d_object, } _rank2d_functions = { - 'float64': algos.rank_2d_float64, - 'int64': algos.rank_2d_int64, - 'uint64': algos.rank_2d_uint64, - 'object': algos.rank_2d_object + "float64": algos.rank_2d_float64, + "int64": algos.rank_2d_int64, + "uint64": algos.rank_2d_uint64, + "object": algos.rank_2d_object, } -def quantile(x, q, interpolation_method='fraction'): +def quantile(x, q, interpolation_method="fraction"): """ Compute sample quantile or quantiles of the input array. For example, q=0.5 computes the median. 
@@ -1017,16 +1083,17 @@ def _get_score(at): if idx % 1 == 0: score = values[int(idx)] else: - if interpolation_method == 'fraction': - score = _interpolate(values[int(idx)], values[int(idx) + 1], - idx % 1) - elif interpolation_method == 'lower': + if interpolation_method == "fraction": + score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1) + elif interpolation_method == "lower": score = values[np.floor(idx)] - elif interpolation_method == 'higher': + elif interpolation_method == "higher": score = values[np.ceil(idx)] else: - raise ValueError("interpolation_method can only be 'fraction' " - ", 'lower' or 'higher'") + raise ValueError( + "interpolation_method can only be 'fraction' " + ", 'lower' or 'higher'" + ) return score @@ -1041,21 +1108,21 @@ def _get_score(at): # select n # # --------------- # -class SelectN: +class SelectN: def __init__(self, obj, n, keep): self.obj = obj self.n = n self.keep = keep - if self.keep not in ('first', 'last', 'all'): + if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') def nlargest(self): - return self.compute('nlargest') + return self.compute("nlargest") def nsmallest(self): - return self.compute('nsmallest') + return self.compute("nsmallest") @staticmethod def is_valid_dtype_n_method(dtype): @@ -1063,8 +1130,9 @@ def is_valid_dtype_n_method(dtype): Helper function to determine if dtype is valid for nsmallest/nlargest methods """ - return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)) + return ( + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) class SelectNSeries(SelectN): @@ -1087,9 +1155,10 @@ def compute(self, method): n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError("Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, - dtype=dtype)) + raise TypeError( + "Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, dtype=dtype) + ) if n <= 0: return self.obj[[]] @@ -1099,14 +1168,14 @@ def compute(self, method): # slow method if n >= len(self.obj): - reverse_it = (self.keep == 'last' or method == 'nlargest') - ascending = method == 'nsmallest' + reverse_it = self.keep == "last" or method == "nlargest" + ascending = method == "nsmallest" slc = np.s_[::-1] if reverse_it else np.s_[:] return dropped[slc].sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype, _ = _ensure_data(dropped.values) - if method == 'nlargest': + if method == "nlargest": arr = -arr if is_integer_dtype(pandas_dtype): # GH 21426: ensure reverse ordering at boundaries @@ -1116,7 +1185,7 @@ def compute(self, method): # GH 26154: ensure False is smaller than True arr = 1 - (-arr) - if self.keep == 'last': + if self.keep == "last": arr = arr[::-1] narr = len(arr) @@ -1124,12 +1193,12 @@ def compute(self, method): kth_val = algos.kth_smallest(arr.copy(), n - 1) ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')] + inds = ns[arr[ns].argsort(kind="mergesort")] - if self.keep != 'all': + if self.keep != "all": inds = inds[:n] - if self.keep == 'last': + if self.keep == "last": # reverse indices inds = narr - 1 - inds @@ -1162,6 +1231,7 @@ def __init__(self, obj, n, keep, columns): def compute(self, method): from pandas import Int64Index + n = self.n frame = self.obj columns = self.columns @@ -1169,16 +1239,18 @@ def compute(self, method): for column in columns: dtype = frame[column].dtype if not 
self.is_valid_dtype_n_method(dtype): - raise TypeError(( - "Column {column!r} has dtype {dtype}, cannot use method " - "{method!r} with this dtype" - ).format(column=column, dtype=dtype, method=method)) + raise TypeError( + ( + "Column {column!r} has dtype {dtype}, cannot use method " + "{method!r} with this dtype" + ).format(column=column, dtype=dtype, method=method) + ) def get_indexer(current_indexer, other_indexer): """Helper function to concat `current_indexer` and `other_indexer` depending on `method` """ - if method == 'nsmallest': + if method == "nsmallest": return current_indexer.append(other_indexer) else: return other_indexer.append(current_indexer) @@ -1200,8 +1272,8 @@ def get_indexer(current_indexer, other_indexer): series = cur_frame[column] is_last_column = len(columns) - 1 == i values = getattr(series, method)( - cur_n, - keep=self.keep if is_last_column else 'all') + cur_n, keep=self.keep if is_last_column else "all" + ) if is_last_column or len(values) <= cur_n: indexer = get_indexer(indexer, values.index) @@ -1234,12 +1306,9 @@ def get_indexer(current_indexer, other_indexer): if len(columns) == 1: return frame - ascending = method == 'nsmallest' + ascending = method == "nsmallest" - return frame.sort_values( - columns, - ascending=ascending, - kind='mergesort') + return frame.sort_values(columns, ascending=ascending, kind="mergesort") # ------- ## ---- # @@ -1308,110 +1377,103 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): _take_1d_dict = { - ('int8', 'int8'): algos.take_1d_int8_int8, - ('int8', 'int32'): algos.take_1d_int8_int32, - ('int8', 'int64'): algos.take_1d_int8_int64, - ('int8', 'float64'): algos.take_1d_int8_float64, - ('int16', 'int16'): algos.take_1d_int16_int16, - ('int16', 'int32'): algos.take_1d_int16_int32, - ('int16', 'int64'): algos.take_1d_int16_int64, - ('int16', 'float64'): algos.take_1d_int16_float64, - ('int32', 'int32'): algos.take_1d_int32_int32, - ('int32', 'int64'): algos.take_1d_int32_int64, - ('int32', 'float64'): algos.take_1d_int32_float64, - ('int64', 'int64'): algos.take_1d_int64_int64, - ('int64', 'float64'): algos.take_1d_int64_float64, - ('float32', 'float32'): algos.take_1d_float32_float32, - ('float32', 'float64'): algos.take_1d_float32_float64, - ('float64', 'float64'): algos.take_1d_float64_float64, - ('object', 'object'): algos.take_1d_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8, - None), - ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper( - algos.take_1d_int64_int64, np.int64, np.int64, np.int64) + ("int8", "int8"): algos.take_1d_int8_int8, + ("int8", "int32"): algos.take_1d_int8_int32, + ("int8", "int64"): algos.take_1d_int8_int64, + ("int8", "float64"): algos.take_1d_int8_float64, + ("int16", "int16"): algos.take_1d_int16_int16, + ("int16", "int32"): algos.take_1d_int16_int32, + ("int16", "int64"): algos.take_1d_int16_int64, + ("int16", "float64"): algos.take_1d_int16_float64, + ("int32", "int32"): algos.take_1d_int32_int32, + ("int32", "int64"): algos.take_1d_int32_int64, + ("int32", "float64"): algos.take_1d_int32_float64, + ("int64", "int64"): algos.take_1d_int64_int64, + ("int64", "float64"): algos.take_1d_int64_float64, + ("float32", "float32"): algos.take_1d_float32_float32, + ("float32", "float64"): algos.take_1d_float32_float64, + ("float64", "float64"): algos.take_1d_float64_float64, + ("object", "object"): algos.take_1d_object_object, + ("bool", "bool"): 
_view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), } _take_2d_axis0_dict = { - ('int8', 'int8'): algos.take_2d_axis0_int8_int8, - ('int8', 'int32'): algos.take_2d_axis0_int8_int32, - ('int8', 'int64'): algos.take_2d_axis0_int8_int64, - ('int8', 'float64'): algos.take_2d_axis0_int8_float64, - ('int16', 'int16'): algos.take_2d_axis0_int16_int16, - ('int16', 'int32'): algos.take_2d_axis0_int16_int32, - ('int16', 'int64'): algos.take_2d_axis0_int16_int64, - ('int16', 'float64'): algos.take_2d_axis0_int16_float64, - ('int32', 'int32'): algos.take_2d_axis0_int32_int32, - ('int32', 'int64'): algos.take_2d_axis0_int32_int64, - ('int32', 'float64'): algos.take_2d_axis0_int32_float64, - ('int64', 'int64'): algos.take_2d_axis0_int64_int64, - ('int64', 'float64'): algos.take_2d_axis0_int64_float64, - ('float32', 'float32'): algos.take_2d_axis0_float32_float32, - ('float32', 'float64'): algos.take_2d_axis0_float32_float64, - ('float64', 'float64'): algos.take_2d_axis0_float64_float64, - ('object', 'object'): algos.take_2d_axis0_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis0_int8_int8, + ("int8", "int32"): algos.take_2d_axis0_int8_int32, + ("int8", "int64"): algos.take_2d_axis0_int8_int64, + ("int8", "float64"): algos.take_2d_axis0_int8_float64, + ("int16", "int16"): algos.take_2d_axis0_int16_int16, + ("int16", "int32"): algos.take_2d_axis0_int16_int32, + ("int16", "int64"): algos.take_2d_axis0_int16_int64, + ("int16", "float64"): algos.take_2d_axis0_int16_float64, + ("int32", "int32"): algos.take_2d_axis0_int32_int32, + ("int32", "int64"): algos.take_2d_axis0_int32_int64, + ("int32", "float64"): algos.take_2d_axis0_int32_float64, + ("int64", "int64"): algos.take_2d_axis0_int64_int64, + ("int64", "float64"): algos.take_2d_axis0_int64_float64, + ("float32", "float32"): algos.take_2d_axis0_float32_float32, + ("float32", "float64"): algos.take_2d_axis0_float32_float64, + ("float64", "float64"): algos.take_2d_axis0_float64_float64, + ("object", "object"): algos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_axis1_dict = { - ('int8', 'int8'): algos.take_2d_axis1_int8_int8, - ('int8', 'int32'): algos.take_2d_axis1_int8_int32, - ('int8', 'int64'): algos.take_2d_axis1_int8_int64, - ('int8', 'float64'): algos.take_2d_axis1_int8_float64, - ('int16', 'int16'): algos.take_2d_axis1_int16_int16, - ('int16', 'int32'): algos.take_2d_axis1_int16_int32, - ('int16', 'int64'): algos.take_2d_axis1_int16_int64, - ('int16', 'float64'): algos.take_2d_axis1_int16_float64, - ('int32', 'int32'): algos.take_2d_axis1_int32_int32, - ('int32', 'int64'): algos.take_2d_axis1_int32_int64, - ('int32', 'float64'): algos.take_2d_axis1_int32_float64, - ('int64', 'int64'): algos.take_2d_axis1_int64_int64, - ('int64', 'float64'): 
algos.take_2d_axis1_int64_float64, - ('float32', 'float32'): algos.take_2d_axis1_float32_float32, - ('float32', 'float64'): algos.take_2d_axis1_float32_float64, - ('float64', 'float64'): algos.take_2d_axis1_float64_float64, - ('object', 'object'): algos.take_2d_axis1_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis1_int8_int8, + ("int8", "int32"): algos.take_2d_axis1_int8_int32, + ("int8", "int64"): algos.take_2d_axis1_int8_int64, + ("int8", "float64"): algos.take_2d_axis1_int8_float64, + ("int16", "int16"): algos.take_2d_axis1_int16_int16, + ("int16", "int32"): algos.take_2d_axis1_int16_int32, + ("int16", "int64"): algos.take_2d_axis1_int16_int64, + ("int16", "float64"): algos.take_2d_axis1_int16_float64, + ("int32", "int32"): algos.take_2d_axis1_int32_int32, + ("int32", "int64"): algos.take_2d_axis1_int32_int64, + ("int32", "float64"): algos.take_2d_axis1_int32_float64, + ("int64", "int64"): algos.take_2d_axis1_int64_int64, + ("int64", "float64"): algos.take_2d_axis1_int64_float64, + ("float32", "float32"): algos.take_2d_axis1_float32_float32, + ("float32", "float64"): algos.take_2d_axis1_float32_float64, + ("float64", "float64"): algos.take_2d_axis1_float64_float64, + ("object", "object"): algos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_multi_dict = { - ('int8', 'int8'): algos.take_2d_multi_int8_int8, - ('int8', 'int32'): algos.take_2d_multi_int8_int32, - ('int8', 'int64'): algos.take_2d_multi_int8_int64, - ('int8', 'float64'): algos.take_2d_multi_int8_float64, - ('int16', 'int16'): algos.take_2d_multi_int16_int16, - ('int16', 'int32'): algos.take_2d_multi_int16_int32, - ('int16', 'int64'): algos.take_2d_multi_int16_int64, - ('int16', 'float64'): algos.take_2d_multi_int16_float64, - ('int32', 'int32'): algos.take_2d_multi_int32_int32, - ('int32', 'int64'): algos.take_2d_multi_int32_int64, - ('int32', 'float64'): algos.take_2d_multi_int32_float64, - ('int64', 'int64'): algos.take_2d_multi_int64_int64, - ('int64', 'float64'): algos.take_2d_multi_int64_float64, - ('float32', 'float32'): algos.take_2d_multi_float32_float32, - ('float32', 'float64'): algos.take_2d_multi_float32_float64, - ('float64', 'float64'): algos.take_2d_multi_float64_float64, - ('object', 'object'): algos.take_2d_multi_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_multi_int8_int8, + ("int8", "int32"): algos.take_2d_multi_int8_int32, + ("int8", "int64"): algos.take_2d_multi_int8_int64, + ("int8", "float64"): algos.take_2d_multi_int8_float64, + ("int16", "int16"): algos.take_2d_multi_int16_int16, + ("int16", "int32"): algos.take_2d_multi_int16_int32, + ("int16", "int64"): algos.take_2d_multi_int16_int64, + 
("int16", "float64"): algos.take_2d_multi_int16_float64, + ("int32", "int32"): algos.take_2d_multi_int32_int32, + ("int32", "int64"): algos.take_2d_multi_int32_int64, + ("int32", "float64"): algos.take_2d_multi_int32_float64, + ("int64", "int64"): algos.take_2d_multi_int64_int64, + ("int64", "float64"): algos.take_2d_multi_int64_float64, + ("float32", "float32"): algos.take_2d_multi_float32_float32, + ("float32", "float64"): algos.take_2d_multi_float32_float64, + ("float64", "float64"): algos.take_2d_multi_float64_float64, + ("object", "object"): algos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } @@ -1442,8 +1504,9 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = ensure_int64(indexer) - _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value, - mask_info=mask_info) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) return func @@ -1534,16 +1597,18 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) - result = take_1d(arr, indices, axis=axis, allow_fill=True, - fill_value=fill_value) + result = take_1d( + arr, indices, axis=axis, allow_fill=True, fill_value=fill_value + ) else: # NumPy style result = arr.take(indices, axis=axis) return result -def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_nd( + arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass @@ -1618,7 +1683,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1647,12 +1712,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # for dataframes initialized directly from 2-d ndarrays # (s.t. 
df.values is c-contiguous and df._data.blocks[0] is its # f-contiguous transpose) - out = np.empty(out_shape, dtype=dtype, order='F') + out = np.empty(out_shape, dtype=dtype, order="F") else: out = np.empty(out_shape, dtype=dtype) - func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, - mask_info=mask_info) + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) func(arr, indexer, out, fill_value) if flip_order: @@ -1663,8 +1729,9 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, take_1d = take_nd -def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_2d_multi( + arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass """ @@ -1703,7 +1770,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1724,8 +1791,9 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if func is None: def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, - mask_info=mask_info) + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) func(arr, indexer, out=out, fill_value=fill_value) return out @@ -1735,6 +1803,7 @@ def func(arr, indexer, out, fill_value=np.nan): # searchsorted # # ------------ # + def searchsorted(arr, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -1782,9 +1851,13 @@ def searchsorted(arr, value, side="left", sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) - if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( - is_integer(value) or is_integer_dtype(value)): + if ( + isinstance(arr, np.ndarray) + and is_integer_dtype(arr) + and (is_integer(value) or is_integer_dtype(value)) + ): from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be # recast by numpy, causing a slow search. # Before searching below, we therefore try to give `value` the @@ -1802,9 +1875,11 @@ def searchsorted(arr, value, side="left", sorter=None): value = dtype.type(value) else: value = array(value, dtype=dtype) - elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or - is_categorical_dtype(arr)): + elif not ( + is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) + ): from pandas.core.series import Series + # E.g. 
if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value value_ser = Series(value)._values @@ -1819,12 +1894,12 @@ def searchsorted(arr, value, side="left", sorter=None): # ---- # _diff_special = { - 'float64': algos.diff_2d_float64, - 'float32': algos.diff_2d_float32, - 'int64': algos.diff_2d_int64, - 'int32': algos.diff_2d_int32, - 'int16': algos.diff_2d_int16, - 'int8': algos.diff_2d_int8, + "float64": algos.diff_2d_float64, + "float32": algos.diff_2d_float32, + "int64": algos.diff_2d_int64, + "int32": algos.diff_2d_int32, + "int16": algos.diff_2d_int16, + "int8": algos.diff_2d_int8, } @@ -1854,7 +1929,7 @@ def diff(arr, n, axis=0): is_timedelta = False if needs_i8_conversion(arr): dtype = np.float64 - arr = arr.view('i8') + arr = arr.view("i8") na = iNaT is_timedelta = True @@ -1904,7 +1979,11 @@ def diff(arr, n, axis=0): if is_timedelta: from pandas import TimedeltaIndex - out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape( - out_arr.shape).astype('timedelta64[ns]') + + out_arr = ( + TimedeltaIndex(out_arr.ravel().astype("int64")) + .asi8.reshape(out_arr.shape) + .astype("timedelta64[ns]") + ) return out_arr diff --git a/pandas/core/api.py b/pandas/core/api.py index e8d21080775da8..f3ea0976a28694 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,11 +23,20 @@ from pandas.core.arrays import Categorical, array from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format -from pandas.core.index import (Index, CategoricalIndex, Int64Index, - UInt64Index, RangeIndex, Float64Index, - MultiIndex, IntervalIndex, - TimedeltaIndex, DatetimeIndex, - PeriodIndex, NaT) +from pandas.core.index import ( + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + NaT, +) from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7dc054c824fec2..2246bbfde636dd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,16 +7,28 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_dict_like, is_extension_type, is_list_like, is_sequence) + is_dict_like, + is_extension_type, + is_list_like, + is_sequence, +) from pandas.core.dtypes.generic import ABCSeries from pandas.io.formats.printing import pprint_thing -def frame_apply(obj, func, axis=0, broadcast=None, - raw=False, reduce=None, result_type=None, - ignore_failures=False, - args=None, kwds=None): +def frame_apply( + obj, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + ignore_failures=False, + args=None, + kwds=None, +): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -25,48 +37,71 @@ def frame_apply(obj, func, axis=0, broadcast=None, elif axis == 1: klass = FrameColumnApply - return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, result_type=result_type, - ignore_failures=ignore_failures, - args=args, kwds=kwds) + return klass( + obj, + func, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + ignore_failures=ignore_failures, + args=args, + kwds=kwds, + ) class FrameApply: - - def __init__(self, obj, func, 
broadcast, raw, reduce, result_type, - ignore_failures, args, kwds): + def __init__( + self, + obj, + func, + broadcast, + raw, + reduce, + result_type, + ignore_failures, + args, + kwds, + ): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} - if result_type not in [None, 'reduce', 'broadcast', 'expand']: - raise ValueError("invalid value for result_type, must be one " - "of {None, 'reduce', 'broadcast', 'expand'}") + if result_type not in [None, "reduce", "broadcast", "expand"]: + raise ValueError( + "invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}" + ) if broadcast is not None: - warnings.warn("The broadcast argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='broadcast' to broadcast the result " - "to the original dimensions", - FutureWarning, stacklevel=4) + warnings.warn( + "The broadcast argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='broadcast' to broadcast the result " + "to the original dimensions", + FutureWarning, + stacklevel=4, + ) if broadcast: - result_type = 'broadcast' + result_type = "broadcast" if reduce is not None: - warnings.warn("The reduce argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='reduce' to try to reduce the result " - "to the original dimensions", - FutureWarning, stacklevel=4) + warnings.warn( + "The reduce argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, + stacklevel=4, + ) if reduce: if result_type is not None: - raise ValueError( - "cannot pass both reduce=True and result_type") + raise ValueError("cannot pass both reduce=True and result_type") - result_type = 'reduce' + result_type = "reduce" self.result_type = result_type @@ -75,6 +110,7 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, def f(x): return func(x, *args, **kwds) + else: f = func @@ -110,8 +146,7 @@ def get_result(self): # dispatch to agg if is_list_like(self.f) or is_dict_like(self.f): - return self.obj.aggregate(self.f, axis=self.axis, - *self.args, **self.kwds) + return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds) # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -124,19 +159,20 @@ def get_result(self): # don't, so inspect and insert if necessary. 
func = getattr(self.obj, self.f) sig = inspect.getfullargspec(func) - if 'axis' in sig.args: - self.kwds['axis'] = self.axis + if "axis" in sig.args: + self.kwds["axis"] = self.axis return func(*self.args, **self.kwds) # ufunc elif isinstance(self.f, np.ufunc): - with np.errstate(all='ignore'): - results = self.obj._data.apply('apply', func=self.f) - return self.obj._constructor(data=results, index=self.index, - columns=self.columns, copy=False) + with np.errstate(all="ignore"): + results = self.obj._data.apply("apply", func=self.f) + return self.obj._constructor( + data=results, index=self.index, columns=self.columns, copy=False + ) # broadcasting - if self.result_type == 'broadcast': + if self.result_type == "broadcast": return self.apply_broadcast() # one axis empty @@ -159,13 +195,14 @@ def apply_empty_result(self): # we are not asked to reduce or infer reduction # so just return a copy of the existing object - if self.result_type not in ['reduce', None]: + if self.result_type not in ["reduce", None]: return self.obj.copy() # we may need to infer - reduce = self.result_type == 'reduce' + reduce = self.result_type == "reduce" from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) @@ -190,12 +227,9 @@ def apply_raw(self): # TODO: mixed type case if result.ndim == 2: - return self.obj._constructor(result, - index=self.index, - columns=self.columns) + return self.obj._constructor(result, index=self.index, columns=self.columns) else: - return self.obj._constructor_sliced(result, - index=self.agg_axis) + return self.obj._constructor_sliced(result, index=self.agg_axis) def apply_broadcast(self, target): result_values = np.empty_like(target.values) @@ -219,9 +253,9 @@ def apply_broadcast(self, target): result_values[:, i] = res # we *always* preserve the original index / columns - result = self.obj._constructor(result_values, - index=target.index, - columns=target.columns) + result = self.obj._constructor( + result_values, index=target.index, columns=target.columns + ) return result def apply_standard(self): @@ -232,11 +266,14 @@ def apply_standard(self): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - if (self.result_type in ['reduce', None] and - not self.dtypes.apply(is_extension_type).any()): + if ( + self.result_type in ["reduce", None] + and not self.dtypes.apply(is_extension_type).any() + ): # Create a dummy Series from an empty array from pandas import Series + values = self.values index = self.obj._get_axis(self.axis) labels = self.agg_axis @@ -244,10 +281,9 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) + result = reduction.reduce( + values, self.f, axis=self.axis, dummy=dummy, labels=labels + ) return self.obj._constructor_sliced(result, index=labels) except Exception: pass @@ -285,13 +321,12 @@ def apply_series_generator(self): results[i] = self.f(v) keys.append(v.name) except Exception as e: - if hasattr(e, 'args'): + if hasattr(e, "args"): # make sure i is defined if i is not None: k = res_index[i] - e.args = e.args + ('occurred at index %s' % - pprint_thing(k), ) + e.args = e.args + ("occurred at index %s" % pprint_thing(k),) raise self.results = results @@ -321,8 +356,7 @@ def apply_broadcast(self): @property def series_generator(self): - return (self.obj._ixs(i, axis=1) - for i in range(len(self.columns))) + return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property def 
result_index(self): @@ -362,9 +396,10 @@ def apply_broadcast(self): @property def series_generator(self): constructor = self.obj._constructor_sliced - return (constructor(arr, index=self.columns, name=name) - for i, (arr, name) in enumerate(zip(self.values, - self.index))) + return ( + constructor(arr, index=self.columns, name=name) + for i, (arr, name) in enumerate(zip(self.values, self.index)) + ) @property def result_index(self): @@ -379,12 +414,13 @@ def wrap_results_for_axis(self): results = self.results # we have requested to expand - if self.result_type == 'expand': + if self.result_type == "expand": result = self.infer_to_same_shape() # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series + result = Series(results) result.index = self.res_index diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 2d09a9eac6eab1..dab29e9ce71d31 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,6 +1,9 @@ from .array_ import array # noqa: F401 from .base import ( # noqa: F401 - ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin) + ExtensionArray, + ExtensionOpsMixin, + ExtensionScalarOpsMixin, +) from .categorical import Categorical # noqa: F401 from .datetimes import DatetimeArray # noqa: F401 from .integer import IntegerArray, integer_array # noqa: F401 diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 7a83b7960a6e7f..15ff1432f16e23 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -12,10 +12,9 @@ from pandas.tseries.offsets import DateOffset, Tick, generate_range -def generate_regular_range(start: Timestamp, - end: Timestamp, - periods: int, - freq: DateOffset) -> Tuple[np.ndarray, str]: +def generate_regular_range( + start: Timestamp, end: Timestamp, periods: int, freq: DateOffset +) -> Tuple[np.ndarray, str]: """ Generate a range of dates with the spans between dates described by the given `freq` DateOffset. @@ -41,21 +40,22 @@ def generate_regular_range(start: Timestamp, b = Timestamp(start).value # cannot just use e = Timestamp(end) + 1 because arange breaks when # stride is too large, see GH10887 - e = (b + (Timestamp(end).value - b) // stride * stride + - stride // 2 + 1) + e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1 # end.tz == start.tz by this point due to _generate implementation tz = start.tz elif start is not None: b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side='start') + e = _generate_range_overflow_safe(b, periods, stride, side="start") tz = start.tz elif end is not None: e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side='end') + b = _generate_range_overflow_safe(e, periods, stride, side="end") tz = end.tz else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") + raise ValueError( + "at least 'start' or 'end' should be specified " + "if a 'period' is given." 
+ ) with np.errstate(over="raise"): # If the range is sufficiently large, np.arange may overflow @@ -76,18 +76,16 @@ def generate_regular_range(start: Timestamp, elif end is not None: tz = end.tz - xdr = generate_range(start=start, end=end, - periods=periods, offset=freq) + xdr = generate_range(start=start, end=end, periods=periods, offset=freq) values = np.array([x.value for x in xdr], dtype=np.int64) return values, tz -def _generate_range_overflow_safe(endpoint: int, - periods: int, - stride: int, - side: str = 'start') -> int: +def _generate_range_overflow_safe( + endpoint: int, periods: int, stride: int, side: str = "start" +) -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -113,12 +111,13 @@ def _generate_range_overflow_safe(endpoint: int, OutOfBoundsDatetime """ # GH#14187 raise instead of incorrectly wrapping around - assert side in ['start', 'end'] + assert side in ["start", "end"] i64max = np.uint64(np.iinfo(np.int64).max) - msg = ('Cannot generate range with {side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, periods=periods)) + msg = ( + "Cannot generate range with {side}={endpoint} and " + "periods={periods}".format(side=side, endpoint=endpoint, periods=periods) + ) with np.errstate(over="raise"): # if periods * strides cannot be multiplied within the *uint64* bounds, @@ -130,40 +129,39 @@ def _generate_range_overflow_safe(endpoint: int, if np.abs(addend) <= i64max: # relatively easy case without casting concerns - return _generate_range_overflow_safe_signed( - endpoint, periods, stride, side) + return _generate_range_overflow_safe_signed(endpoint, periods, stride, side) - elif ((endpoint > 0 and side == 'start' and stride > 0) or - (endpoint < 0 and side == 'end' and stride > 0)): + elif (endpoint > 0 and side == "start" and stride > 0) or ( + endpoint < 0 and side == "end" and stride > 0 + ): # no chance of not-overflowing raise OutOfBoundsDatetime(msg) - elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + elif side == "end" and endpoint > i64max and endpoint - stride <= i64max: # in _generate_regular_range we added `stride` thereby overflowing # the bounds. Adjust to fix this. - return _generate_range_overflow_safe(endpoint - stride, - periods - 1, stride, side) + return _generate_range_overflow_safe( + endpoint - stride, periods - 1, stride, side + ) # split into smaller pieces mid_periods = periods // 2 remaining = periods - mid_periods assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - midpoint = _generate_range_overflow_safe(endpoint, mid_periods, - stride, side) + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side) return _generate_range_overflow_safe(midpoint, remaining, stride, side) -def _generate_range_overflow_safe_signed(endpoint: int, - periods: int, - stride: int, - side: str) -> int: +def _generate_range_overflow_safe_signed( + endpoint: int, periods: int, stride: int, side: str +) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. 
""" - assert side in ['start', 'end'] - if side == 'end': + assert side in ["start", "end"] + if side == "end": stride *= -1 with np.errstate(over="raise"): @@ -191,8 +189,8 @@ def _generate_range_overflow_safe_signed(endpoint: int, if result <= i64max + np.uint64(stride): return result - raise OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) + raise OutOfBoundsDatetime( + "Cannot generate range with " + "{side}={endpoint} and " + "periods={periods}".format(side=side, endpoint=endpoint, periods=periods) + ) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 1b002ad12d526f..93ee570c1f9716 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -5,15 +5,19 @@ from pandas._libs import lib, tslibs from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, is_extension_array_dtype, is_timedelta64_ns_dtype) + is_datetime64_ns_dtype, + is_extension_array_dtype, + is_timedelta64_ns_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ABCExtensionArray -def array(data: Sequence[object], - dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, - copy: bool = True, - ) -> ABCExtensionArray: +def array( + data: Sequence[object], + dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, + copy: bool = True, +) -> ABCExtensionArray: """ Create an array. @@ -207,16 +211,17 @@ def array(data: Sequence[object], ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, ExtensionArray, IntervalArray, PandasArray, + period_array, + ExtensionArray, + IntervalArray, + PandasArray, DatetimeArray, TimedeltaArray, ) from pandas.core.internals.arrays import extract_array if lib.is_scalar(data): - msg = ( - "Cannot pass scalar '{}' to 'pandas.array'." - ) + msg = "Cannot pass scalar '{}' to 'pandas.array'." raise ValueError(msg.format(data)) data = extract_array(data, extract_numpy=True) @@ -234,14 +239,14 @@ def array(data: Sequence[object], if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=False) - if inferred_dtype == 'period': + if inferred_dtype == "period": try: return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: # We may have a mixture of frequencies. # We choose to return an ndarray, rather than raising. pass - elif inferred_dtype == 'interval': + elif inferred_dtype == "interval": try: return IntervalArray(data, copy=copy) except ValueError: @@ -249,7 +254,7 @@ def array(data: Sequence[object], # We choose to return an ndarray, rather than raising. 
pass - elif inferred_dtype.startswith('datetime'): + elif inferred_dtype.startswith("datetime"): # datetime, datetime64 try: return DatetimeArray._from_sequence(data, copy=copy) @@ -257,7 +262,7 @@ def array(data: Sequence[object], # Mixture of timezones, fall back to PandasArray pass - elif inferred_dtype.startswith('timedelta'): + elif inferred_dtype.startswith("timedelta"): # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 21f0f3c08e93bb..2a5556ff6d357a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -17,8 +17,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ( - ABCExtensionArray, ABCIndexClass, ABCSeries) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas._typing import ArrayLike @@ -120,9 +119,10 @@ class ExtensionArray: See :ref:`extending.extension.ufunc` for more. """ + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. - _typ = 'extension' + _typ = "extension" # ------------------------------------------------------------------------ # Constructors @@ -272,8 +272,8 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # __init__ method coerces that value, then so should __setitem__ # Note, also, that Series/DataFrame.where internally use __setitem__ # on a copy of the data. - raise NotImplementedError(_not_implemented_message.format( - type(self), '__setitem__') + raise NotImplementedError( + _not_implemented_message.format(type(self), "__setitem__") ) def __len__(self) -> int: @@ -393,7 +393,7 @@ def _values_for_argsort(self) -> np.ndarray: # Note: this is used in `ExtensionArray.argsort`. return np.array(self) - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): """ Return the indices that would sort this array. @@ -423,8 +423,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = nargsort(self, kind=kind, ascending=ascending, - na_position='last') + result = nargsort(self, kind=kind, ascending=ascending, na_position="last") return result def fillna(self, value=None, method=None, limit=None): @@ -463,15 +462,16 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. 
Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - func = pad_1d if method == 'pad' else backfill_1d - new_values = func(self.astype(object), limit=limit, - mask=mask) + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value @@ -491,10 +491,7 @@ def dropna(self): """ return self[~self.isna()] - def shift( - self, - periods: int = 1, - fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. @@ -537,14 +534,13 @@ def shift( fill_value = self.dtype.na_value empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), - dtype=self.dtype + [fill_value] * min(abs(periods), len(self)), dtype=self.dtype ) if periods > 0: a = empty b = self[:-periods] else: - a = self[abs(periods):] + a = self[abs(periods) :] b = empty return self._concat_same_type([a, b]) @@ -633,10 +629,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize( - self, - na_sentinel: int = -1, - ) -> Tuple[np.ndarray, ABCExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: """ Encode the extension array as an enumerated type. @@ -679,13 +672,16 @@ def factorize( arr, na_value = self._values_for_factorize() - labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel, - na_value=na_value) + labels, uniques = _factorize_array( + arr, na_sentinel=na_sentinel, na_value=na_value + ) uniques = self._from_factorized(uniques, self) return labels, uniques - _extension_array_shared_docs['repeat'] = """ + _extension_array_shared_docs[ + "repeat" + ] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -727,8 +723,8 @@ def factorize( Categories (3, object): [a, b, c] """ - @Substitution(klass='ExtensionArray') - @Appender(_extension_array_shared_docs['repeat']) + @Substitution(klass="ExtensionArray") + @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) ind = np.arange(len(self)).repeat(repeats) @@ -739,10 +735,7 @@ def repeat(self, repeats, axis=None): # ------------------------------------------------------------------------ def take( - self, - indices: Sequence[int], - allow_fill: bool = False, - fill_value: Any = None + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None ) -> ABCExtensionArray: """ Take elements from an array. @@ -849,25 +842,19 @@ def copy(self) -> ABCExtensionArray: def __repr__(self): from pandas.io.formats.printing import format_object_summary - template = ( - '{class_name}' - '{data}\n' - 'Length: {length}, dtype: {dtype}' - ) + template = "{class_name}" "{data}\n" "Length: {length}, dtype: {dtype}" # the short repr has no trailing newline, while the truncated # repr does. 
So we include a newline in our template, and strip # any trailing newlines from format_object_summary - data = format_object_summary(self, self._formatter(), - indent_for_name=False).rstrip(', \n') - class_name = '<{}>\n'.format(self.__class__.__name__) - return template.format(class_name=class_name, data=data, - length=len(self), - dtype=self.dtype) - - def _formatter( - self, - boxed: bool = False, - ) -> Callable[[Any], Optional[str]]: + data = format_object_summary( + self, self._formatter(), indent_for_name=False + ).rstrip(", \n") + class_name = "<{}>\n".format(self.__class__.__name__) + return template.format( + class_name=class_name, data=data, length=len(self), dtype=self.dtype + ) + + def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: """Formatting function for scalar values. This is used in the default '__repr__'. The returned formatting @@ -926,8 +913,7 @@ def ravel(self, order="C") -> ABCExtensionArray: @classmethod def _concat_same_type( - cls, - to_concat: Sequence[ABCExtensionArray] + cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ Concatenate multiple array @@ -985,8 +971,11 @@ def _reduce(self, name, skipna=True, **kwargs): ------ TypeError : subclass does not define reductions """ - raise TypeError("cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype)) + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) class ExtensionOpsMixin: @@ -1127,7 +1116,7 @@ def _maybe_convert(arr): res = np.asarray(arr) return res - if op.__name__ in {'divmod', 'rdivmod'}: + if op.__name__ in {"divmod", "rdivmod"}: a, b = zip(*res) res = _maybe_convert(a), _maybe_convert(b) else: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5ae71ffb165e9a..c4f7d6dbe32fa6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,20 +9,41 @@ from pandas._libs import algos as libalgos, lib from pandas.compat.numpy import function as nv from pandas.util._decorators import ( - Appender, Substitution, cache_readonly, deprecate_kwarg) + Appender, + Substitution, + cache_readonly, + deprecate_kwarg, +) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.cast import ( - coerce_indexer_dtype, maybe_infer_to_datetimelike) +from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_platform_int, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, - is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence, - is_timedelta64_dtype) + ensure_int64, + ensure_object, + ensure_platform_int, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetimelike, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + is_sequence, + is_timedelta64_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( - ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries) + ABCCategoricalIndex, + ABCDataFrame, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -39,7 +60,8 @@ from .base import 
ExtensionArray, _extension_array_shared_docs -_take_msg = textwrap.dedent("""\ +_take_msg = textwrap.dedent( + """\ Interpreting negative values in 'indexer' as missing values. In the future, this will change to meaning positional indices from the right. @@ -47,7 +69,8 @@ Use 'allow_fill=True' to retain the previous behavior and silence this warning. - Use 'allow_fill=False' to accept the new behavior.""") + Use 'allow_fill=False' to accept the new behavior.""" +) def _cat_compare_op(op): @@ -63,28 +86,27 @@ def f(self, other): other = lib.item_from_zerodim(other) if not self.ordered: - if op in ['__lt__', '__gt__', '__le__', '__ge__']: - raise TypeError("Unordered Categoricals can only compare " - "equality or not") + if op in ["__lt__", "__gt__", "__le__", "__ge__"]: + raise TypeError( + "Unordered Categoricals can only compare " "equality or not" + ) if isinstance(other, Categorical): # Two Categoricals can only be be compared if the categories are # the same (maybe up to ordering, depending on ordered) - msg = ("Categoricals can only be compared if " - "'categories' are the same.") + msg = "Categoricals can only be compared if " "'categories' are the same." if len(self.categories) != len(other.categories): raise TypeError(msg + " Categories are different lengths") - elif (self.ordered and not (self.categories == - other.categories).all()): + elif self.ordered and not (self.categories == other.categories).all(): raise TypeError(msg) elif not set(self.categories) == set(other.categories): raise TypeError(msg) if not (self.ordered == other.ordered): - raise TypeError("Categoricals can only be compared if " - "'ordered' is the same") - if not self.ordered and not self.categories.equals( - other.categories): + raise TypeError( + "Categoricals can only be compared if " "'ordered' is the same" + ) + if not self.ordered and not self.categories.equals(other.categories): # both unordered and different order other_codes = _get_codes_for_values(other, self.categories) else: @@ -104,28 +126,32 @@ def f(self, other): ret = getattr(self._codes, op)(i) # check for NaN in self - mask = (self._codes == -1) + mask = self._codes == -1 ret[mask] = False return ret else: - if op == '__eq__': + if op == "__eq__": return np.repeat(False, len(self)) - elif op == '__ne__': + elif op == "__ne__": return np.repeat(True, len(self)) else: - msg = ("Cannot compare a Categorical for op {op} with a " - "scalar, which is not a category.") + msg = ( + "Cannot compare a Categorical for op {op} with a " + "scalar, which is not a category." + ) raise TypeError(msg.format(op=op)) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons - if op in ['__eq__', '__ne__']: + if op in ["__eq__", "__ne__"]: return getattr(np.array(self), op)(np.array(other)) - msg = ("Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'.") + msg = ( + "Cannot compare a Categorical for op {op} with type {typ}." + "\nIf you want to compare values, use 'np.asarray(cat) " + " other'." 
+ ) raise TypeError(msg.format(op=op, typ=type(other))) f.__name__ = op @@ -308,14 +334,16 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = frozenset(['labels', 'tolist']) - _typ = 'categorical' + _deprecations = frozenset(["labels", "tolist"]) + _typ = "categorical" - def __init__(self, values, categories=None, ordered=None, dtype=None, - fastpath=False): + def __init__( + self, values, categories=None, ordered=None, dtype=None, fastpath=False + ): - dtype = CategoricalDtype._from_values_or_dtype(values, categories, - ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype( + values, categories, ordered, dtype + ) # At this point, dtype is always a CategoricalDtype, but # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below @@ -340,9 +368,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) from pandas.core.internals.construction import sanitize_array + # By convention, empty lists result in object dtype: if len(values) == 0: - sanitize_dtype = 'object' + sanitize_dtype = "object" else: sanitize_dtype = None null_mask = isna(values) @@ -358,30 +387,35 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if dtype._ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories - raise TypeError("'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument.") + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) except ValueError: # FIXME - raise NotImplementedError("> 1 ndim Categorical are not " - "supported at this time") + raise NotImplementedError( + "> 1 ndim Categorical are not " "supported at this time" + ) # we're inferring from values dtype = CategoricalDtype(categories, dtype._ordered) elif is_categorical_dtype(values): - old_codes = (values._values.codes if isinstance(values, ABCSeries) - else values.codes) - codes = _recode_for_categories(old_codes, values.dtype.categories, - dtype.categories) + old_codes = ( + values._values.codes if isinstance(values, ABCSeries) else values.codes + ) + codes = _recode_for_categories( + old_codes, values.dtype.categories, dtype.categories + ) else: codes = _get_codes_for_values(values, dtype.categories) if null_mask.any(): # Reinsert -1 placeholders for previously removed missing values - full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) + full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) full_codes[~null_mask] = codes codes = full_codes @@ -422,10 +456,13 @@ def categories(self): @categories.setter def categories(self, categories): new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (self.dtype.categories is not None and - len(self.dtype.categories) != len(new_dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items as the old categories!") + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" 
+ ) self._dtype = new_dtype @property @@ -462,9 +499,9 @@ def copy(self): """ Copy constructor. """ - return self._constructor(values=self._codes.copy(), - dtype=self.dtype, - fastpath=True) + return self._constructor( + values=self._codes.copy(), dtype=self.dtype, fastpath=True + ) def astype(self, dtype, copy=True): """ @@ -531,8 +568,9 @@ def base(self): return None @classmethod - def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype, true_values=None): + def _from_inferred_categories( + cls, inferred_categories, inferred_codes, dtype, true_values=None + ): """ Construct a Categorical from inferred values. @@ -556,8 +594,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - known_categories = (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None) + known_categories = ( + isinstance(dtype, CategoricalDtype) and dtype.categories is not None + ) if known_categories: # Convert to a specialized type with `dtype` if specified. @@ -582,8 +621,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, unsorted = cats.copy() categories = cats.sort_values() - codes = _recode_for_categories(inferred_codes, unsorted, - categories) + codes = _recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) else: dtype = CategoricalDtype(cats, ordered=False) @@ -636,31 +674,37 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): [a, b, a, b] Categories (2, object): [a < b] """ - dtype = CategoricalDtype._from_values_or_dtype(categories=categories, - ordered=ordered, - dtype=dtype) + dtype = CategoricalDtype._from_values_or_dtype( + categories=categories, ordered=ordered, dtype=dtype + ) if dtype.categories is None: - msg = ("The categories must be provided in 'categories' or " - "'dtype'. Both were None.") + msg = ( + "The categories must be provided in 'categories' or " + "'dtype'. Both were None." 
+ ) raise ValueError(msg) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): msg = "codes need to be array-like integers" if is_float_dtype(codes): - icodes = codes.astype('i8') + icodes = codes.astype("i8") if (icodes == codes).all(): msg = None codes = icodes - warn(("float codes will be disallowed in the future and " - "raise a ValueError"), FutureWarning, stacklevel=2) + warn( + ( + "float codes will be disallowed in the future and " + "raise a ValueError" + ), + FutureWarning, + stacklevel=2, + ) if msg: raise ValueError(msg) - if len(codes) and ( - codes.max() >= len(dtype.categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and " - "len(categories)-1") + if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and " "len(categories)-1") return cls(codes, dtype=dtype, fastpath=True) @@ -710,14 +754,18 @@ def _set_categories(self, categories, fastpath=False): """ if fastpath: - new_dtype = CategoricalDtype._from_fastpath(categories, - self.ordered) + new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) else: new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (not fastpath and self.dtype.categories is not None and - len(new_dtype.categories) != len(self.dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items than the old categories!") + if ( + not fastpath + and self.dtype.categories is not None + and len(new_dtype.categories) != len(self.dtype.categories) + ): + raise ValueError( + "new categories need to have the same number of " + "items than the old categories!" + ) self._dtype = new_dtype @@ -734,8 +782,7 @@ def _set_dtype(self, dtype): We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ - codes = _recode_for_categories(self.codes, self.categories, - dtype.categories) + codes = _recode_for_categories(self.codes, self.categories, dtype.categories) return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): @@ -750,7 +797,7 @@ def set_ordered(self, value, inplace=False): Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to the value. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() cat._dtype = new_dtype @@ -772,7 +819,7 @@ def as_ordered(self, inplace=False): Categorical Ordered Categorical. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) def as_unordered(self, inplace=False): @@ -790,11 +837,10 @@ def as_unordered(self, inplace=False): Categorical Unordered Categorical. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) - def set_categories(self, new_categories, ordered=None, rename=False, - inplace=False): + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Set the categories to the specified new_categories. 
@@ -845,20 +891,22 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_categories remove_unused_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: ordered = self.dtype._ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() if rename: - if (cat.dtype.categories is not None and - len(new_dtype.categories) < len(cat.dtype.categories)): + if cat.dtype.categories is not None and len(new_dtype.categories) < len( + cat.dtype.categories + ): # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 else: - codes = _recode_for_categories(cat.codes, cat.categories, - new_dtype.categories) + codes = _recode_for_categories( + cat.codes, cat.categories, new_dtype.categories + ) cat._codes = codes cat._dtype = new_dtype @@ -932,12 +980,11 @@ def rename_categories(self, new_categories, inplace=False): [A, A, B] Categories (2, object): [A, B] """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() if is_dict_like(new_categories): - cat.categories = [new_categories.get(item, item) - for item in cat.categories] + cat.categories = [new_categories.get(item, item) for item in cat.categories] elif callable(new_categories): cat.categories = [new_categories(item) for item in cat.categories] else: @@ -981,12 +1028,12 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): - raise ValueError("items in new_categories are not the same as in " - "old categories") - return self.set_categories(new_categories, ordered=ordered, - inplace=inplace) + raise ValueError( + "items in new_categories are not the same as in " "old categories" + ) + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) def add_categories(self, new_categories, inplace=False): """ @@ -1021,13 +1068,15 @@ def add_categories(self, new_categories, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ("new categories must not include old categories: " - "{already_included!s}") + msg = ( + "new categories must not include old categories: " + "{already_included!s}" + ) raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -1070,14 +1119,13 @@ def remove_categories(self, removals, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): removals = [removals] removal_set = set(list(removals)) not_included = removal_set - set(self.dtype.categories) - new_categories = [c for c in self.dtype.categories - if c not in removal_set] + new_categories = [c for c in self.dtype.categories if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -1088,8 
+1136,9 @@ def remove_categories(self, removals, inplace=False): msg = "removals must all be in old categories: {not_included!s}" raise ValueError(msg.format(not_included=not_included)) - return self.set_categories(new_categories, ordered=self.ordered, - rename=False, inplace=inplace) + return self.set_categories( + new_categories, ordered=self.ordered, rename=False, inplace=inplace + ) def remove_unused_categories(self, inplace=False): """ @@ -1113,7 +1162,7 @@ def remove_unused_categories(self, inplace=False): remove_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1121,8 +1170,9 @@ def remove_unused_categories(self, inplace=False): idx, inv = idx[1:], inv - 1 new_categories = cat.dtype.categories.take(idx) - new_dtype = CategoricalDtype._from_fastpath(new_categories, - ordered=self.ordered) + new_dtype = CategoricalDtype._from_fastpath( + new_categories, ordered=self.ordered + ) cat._dtype = new_dtype cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) @@ -1200,23 +1250,22 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + return self.from_codes( + self._codes.copy(), categories=new_categories, ordered=self.ordered + ) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories if np.any(self._codes == -1): - new_categories = new_categories.insert(len(new_categories), - np.nan) + new_categories = new_categories.insert(len(new_categories), np.nan) return np.take(new_categories, self._codes) - __eq__ = _cat_compare_op('__eq__') - __ne__ = _cat_compare_op('__ne__') - __lt__ = _cat_compare_op('__lt__') - __gt__ = _cat_compare_op('__gt__') - __le__ = _cat_compare_op('__le__') - __ge__ = _cat_compare_op('__ge__') + __eq__ = _cat_compare_op("__eq__") + __ne__ = _cat_compare_op("__ne__") + __lt__ = _cat_compare_op("__lt__") + __gt__ = _cat_compare_op("__gt__") + __le__ = _cat_compare_op("__le__") + __ge__ = _cat_compare_op("__ge__") # for Series/ndarray like compat @property @@ -1262,9 +1311,11 @@ def shift(self, periods, fill_value=None): elif fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - raise ValueError("'fill_value={}' is not present " - "in this Categorical's " - "categories".format(fill_value)) + raise ValueError( + "'fill_value={}' is not present " + "in this Categorical's " + "categories".format(fill_value) + ) if periods > 0: codes[:periods] = fill_value else: @@ -1296,43 +1347,43 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) - raise TypeError("Object with dtype {dtype} cannot perform " - "the numpy op {op}".format( - dtype=self.dtype, - op=ufunc.__name__)) + raise TypeError( + "Object with dtype {dtype} cannot perform " + "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__) + ) def __setstate__(self, state): """Necessary for making this object picklable""" if not 
isinstance(state, dict): - raise Exception('invalid pickle state') + raise Exception("invalid pickle state") # Provide compatibility with pre-0.15.0 Categoricals. - if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype.validate_categories(state.pop( - '_levels')) - if '_codes' not in state and 'labels' in state: - state['_codes'] = coerce_indexer_dtype( - state.pop('labels'), state['_categories']) + if "_categories" not in state and "_levels" in state: + state["_categories"] = self.dtype.validate_categories(state.pop("_levels")) + if "_codes" not in state and "labels" in state: + state["_codes"] = coerce_indexer_dtype( + state.pop("labels"), state["_categories"] + ) # 0.16.0 ordered change - if '_ordered' not in state: + if "_ordered" not in state: # >=15.0 < 0.16.0 - if 'ordered' in state: - state['_ordered'] = state.pop('ordered') + if "ordered" in state: + state["_ordered"] = state.pop("ordered") else: - state['_ordered'] = False + state["_ordered"] = False # 0.21.0 CategoricalDtype change - if '_dtype' not in state: - state['_dtype'] = CategoricalDtype(state['_categories'], - state['_ordered']) + if "_dtype" not in state: + state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) for k, v in state.items(): setattr(self, k, v) @@ -1371,18 +1422,20 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self.dtype.categories.memory_usage( - deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @Substitution(klass='Categorical') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="Categorical") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if not self.ordered: - raise ValueError("Categorical not ordered\nyou can use " - ".as_ordered() to change the Categorical to an " - "ordered one") + raise ValueError( + "Categorical not ordered\nyou can use " + ".as_ordered() to change the Categorical to an " + "ordered one" + ) from pandas.core.series import Series + codes = _get_codes_for_values(Series(value).values, self.categories) if -1 in codes: raise KeyError("Value(s) to be inserted must be in categories.") @@ -1411,6 +1464,7 @@ def isna(self): ret = self._codes == -1 return ret + isnull = isna def notna(self): @@ -1432,14 +1486,14 @@ def notna(self): """ return ~self.isna() + notnull = notna def put(self, *args, **kwargs): """ Replace specific elements in the Categorical with given values. """ - raise NotImplementedError(("'put' is not yet implemented " - "for Categorical")) + raise NotImplementedError(("'put' is not yet implemented " "for Categorical")) def dropna(self): """ @@ -1489,10 +1543,9 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, - fastpath=True) + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) - return Series(count, index=CategoricalIndex(ix), dtype='int64') + return Series(count, index=CategoricalIndex(ix), dtype="int64") def get_values(self): """ @@ -1508,8 +1561,12 @@ def get_values(self): A numpy array of the same dtype as categorical.categories.dtype or Index if datetime / periods. 
""" - warn("The 'get_values' method is deprecated and will be removed in a " - "future version", FutureWarning, stacklevel=2) + warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version", + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -1517,21 +1574,22 @@ def _internal_get_values(self): if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) elif is_integer_dtype(self.categories) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, - fill_value=np.nan) + return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): """ assert that we are ordered """ if not self.ordered: - raise TypeError("Categorical is not ordered for operation {op}\n" - "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op)) + raise TypeError( + "Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n".format(op=op) + ) def _values_for_argsort(self): return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): """ Return the indices that would sort the Categorical. @@ -1584,7 +1642,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) - def sort_values(self, inplace=False, ascending=True, na_position='last'): + def sort_values(self, inplace=False, ascending=True, na_position="last"): """ Sort the Categorical by category value returning a new Categorical by default. @@ -1658,21 +1716,19 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): [NaN, NaN, 5.0, 2.0, 2.0] Categories (2, int64): [2, 5] """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if na_position not in ['last', 'first']: - msg = 'invalid na_position: {na_position!r}' + inplace = validate_bool_kwarg(inplace, "inplace") + if na_position not in ["last", "first"]: + msg = "invalid na_position: {na_position!r}" raise ValueError(msg.format(na_position=na_position)) - sorted_idx = nargsort(self, - ascending=ascending, - na_position=na_position) + sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: self._codes = self._codes[sorted_idx] else: - return self._constructor(values=self._codes[sorted_idx], - dtype=self.dtype, - fastpath=True) + return self._constructor( + values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True + ) def _values_for_rank(self): """ @@ -1687,11 +1743,12 @@ def _values_for_rank(self): """ from pandas import Series + if self.ordered: values = self.codes mask = values == -1 if mask.any(): - values = values.astype('float64') + values = values.astype("float64") values[mask] = np.nan elif self.categories.is_numeric(): values = np.array(self) @@ -1703,7 +1760,7 @@ def _values_for_rank(self): ) return values - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return a flattened (numpy) array. 
@@ -1713,9 +1770,12 @@ def ravel(self, order='C'): ------- numpy.array """ - warn("Categorical.ravel will return a Categorical object instead " - "of an ndarray in a future version.", - FutureWarning, stacklevel=2) + warn( + "Categorical.ravel will return a Categorical object instead " + "of an ndarray in a future version.", + FutureWarning, + stacklevel=2, + ) return np.array(self) def view(self): @@ -1743,7 +1803,7 @@ def to_dense(self): """ return np.asarray(self) - @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') + @deprecate_kwarg(old_arg_name="fill_value", new_arg_name="value") def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -1780,8 +1840,9 @@ def fillna(self, value=None, method=None, limit=None): if value is None: value = np.nan if limit is not None: - raise NotImplementedError("specifying a limit for fillna has not " - "been implemented yet") + raise NotImplementedError( + "specifying a limit for fillna has not " "been implemented yet" + ) codes = self._codes @@ -1789,8 +1850,9 @@ def fillna(self, value=None, method=None, limit=None): if method is not None: values = self.to_dense().reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None, - value).astype(self.categories.dtype)[0] + values = interpolate_2d(values, method, 0, None, value).astype( + self.categories.dtype + )[0] codes = _get_codes_for_values(values, self.categories) else: @@ -1819,9 +1881,11 @@ def fillna(self, value=None, method=None, limit=None): codes[mask] = self.categories.get_loc(value) else: - raise TypeError('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a ' - '"{0}"'.format(type(value).__name__)) + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + '"{0}"'.format(type(value).__name__) + ) return self._constructor(codes, dtype=self.dtype, fastpath=True) @@ -1913,14 +1977,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - msg = ( - "'fill_value' ('{}') is not in this Categorical's " - "categories." - ) + msg = "'fill_value' ('{}') is not in this Categorical's " "categories." raise TypeError(msg.format(fill_value)) - codes = take(self._codes, indexer, allow_fill=allow_fill, - fill_value=fill_value) + codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) result = type(self).from_codes(codes, dtype=dtype) return result @@ -1937,8 +1997,7 @@ def _slice(self, slicer): # in a 2-d case be passd (slice(None),....) 
if isinstance(slicer, tuple) and len(slicer) == 2: if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") slicer = slicer[1] codes = self._codes[slicer] @@ -1972,12 +2031,13 @@ def _tidy_repr(self, max_vals=10, footer=True): """ num = max_vals // 2 head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - result = '{head}, ..., {tail}'.format(head=head[:-1], tail=tail[1:]) + result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:]) if footer: - result = '{result}\n{footer}'.format( - result=result, footer=self._repr_footer()) + result = "{result}\n{footer}".format( + result=result, footer=self._repr_footer() + ) return str(result) @@ -1985,9 +2045,13 @@ def _repr_categories(self): """ return the base repr for the categories """ - max_categories = (10 if get_option("display.max_categories") == 0 else - get_option("display.max_categories")) + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) from pandas.io.formats import format as fmt + if len(self.categories) > max_categories: num = max_categories // 2 head = fmt.format_array(self.categories[:num], None) @@ -2008,7 +2072,8 @@ def _repr_categories_info(self): category_strs = self._repr_categories() dtype = str(self.categories.dtype) levheader = "Categories ({length}, {dtype}): ".format( - length=len(self.categories), dtype=dtype) + length=len(self.categories), dtype=dtype + ) width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): @@ -2033,13 +2098,16 @@ def _repr_categories_info(self): def _repr_footer(self): - return 'Length: {length}\n{info}'.format( - length=len(self), info=self._repr_categories_info()) + return "Length: {length}\n{info}".format( + length=len(self), info=self._repr_categories_info() + ) - def _get_repr(self, length=True, na_rep='NaN', footer=True): + def _get_repr(self, length=True, na_rep="NaN", footer=True): from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter(self, length=length, - na_rep=na_rep, footer=footer) + + formatter = fmt.CategoricalFormatter( + self, length=length, na_rep=na_rep, footer=footer + ) result = formatter.to_string() return str(result) @@ -2054,7 +2122,7 @@ def __repr__(self): result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = ('[], {repr_msg}'.format(repr_msg=msg)) + result = "[], {repr_msg}".format(repr_msg=msg) return result @@ -2062,7 +2130,7 @@ def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": indexer = indexer.astype(self._codes.dtype) return indexer @@ -2077,8 +2145,9 @@ def __getitem__(self, key): else: return self.categories[i] else: - return self._constructor(values=self._codes[key], - dtype=self.dtype, fastpath=True) + return self._constructor( + values=self._codes[key], dtype=self.dtype, fastpath=True + ) def __setitem__(self, key, value): """ @@ -2098,8 +2167,10 @@ def __setitem__(self, key, value): # require identical categories set if isinstance(value, Categorical): if not 
is_dtype_equal(self, value): - raise ValueError("Cannot set a Categorical with another, " - "without identical categories") + raise ValueError( + "Cannot set a Categorical with another, " + "without identical categories" + ) if not self.categories.equals(value.categories): new_codes = _recode_for_categories( value.codes, value.categories, self.categories @@ -2109,13 +2180,16 @@ def __setitem__(self, key, value): rvalue = value if is_list_like(value) else [value] from pandas import Index + to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set # something to np.nan if len(to_add) and not isna(to_add).all(): - raise ValueError("Cannot setitem on a Categorical with a new " - "category, set the categories first") + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) # set by position if isinstance(key, (int, np.integer)): @@ -2127,14 +2201,12 @@ def __setitem__(self, key, value): # in a 2-d case be passd (slice(None),....) if len(key) == 2: if not com.is_null_slice(key[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") key = key[1] elif len(key) == 1: key = key[0] else: - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") # slicing in Series or Categorical elif isinstance(key, slice): @@ -2172,8 +2244,9 @@ def _reverse_indexer(self): """ categories = self.categories - r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'), - categories.size) + r, counts = libalgos.groupsort_indexer( + self.codes.astype("int64"), categories.size + ) counts = counts.cumsum() result = (r[start:end] for start, end in zip(counts, counts[1:])) result = dict(zip(categories, result)) @@ -2183,7 +2256,7 @@ def _reverse_indexer(self): def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: - msg = 'Categorical cannot perform the operation {op}' + msg = "Categorical cannot perform the operation {op}" raise TypeError(msg.format(op=name)) return func(**kwargs) @@ -2202,7 +2275,7 @@ def min(self, numeric_only=None, **kwargs): ------- min : the minimum of this `Categorical` """ - self.check_for_ordered('min') + self.check_for_ordered("min") if numeric_only: good = self._codes != -1 pointer = self._codes[good].min(**kwargs) @@ -2228,7 +2301,7 @@ def max(self, numeric_only=None, **kwargs): ------- max : the maximum of this `Categorical` """ - self.check_for_ordered('max') + self.check_for_ordered("max") if numeric_only: good = self._codes != -1 pointer = self._codes[good].max(**kwargs) @@ -2258,6 +2331,7 @@ def mode(self, dropna=True): """ import pandas._libs.hashtable as htable + codes = self._codes if dropna: good = self._codes != -1 @@ -2322,13 +2396,14 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - codes = self.codes.astype('int64') + codes = self.codes.astype("int64") return codes, -1 @classmethod def _from_factorized(cls, uniques, original): - return original._constructor(original.categories.take(uniques), - dtype=original.dtype) + return original._constructor( + original.categories.take(uniques), dtype=original.dtype + ) def equals(self, other): """ @@ -2347,9 +2422,9 @@ def equals(self, other): # fastpath to avoid re-coding other_codes = other._codes else: - other_codes = 
_recode_for_categories(other.codes, - other.categories, - self.categories) + other_codes = _recode_for_categories( + other.codes, other.categories, self.categories + ) return np.array_equal(self._codes, other_codes) return False @@ -2385,14 +2460,15 @@ def describe(self): freqs = counts / float(counts.sum()) from pandas.core.reshape.concat import concat + result = concat([counts, freqs], axis=1) - result.columns = ['counts', 'freqs'] - result.index.name = 'categories' + result.columns = ["counts", "freqs"] + result.index.name = "categories" return result - @Substitution(klass='Categorical') - @Appender(_extension_array_shared_docs['repeat']) + @Substitution(klass="Categorical") + @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) codes = self._codes.repeat(repeats) @@ -2452,10 +2528,14 @@ def isin(self, values): array([ True, False, True, False, True, False]) """ from pandas.core.internals.construction import sanitize_array + if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format( + values_type=type(values).__name__ + ) + ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) @@ -2466,15 +2546,23 @@ def isin(self, values): # The Series.cat accessor -@delegate_names(delegate=Categorical, - accessors=["categories", "ordered"], - typ="property") -@delegate_names(delegate=Categorical, - accessors=["rename_categories", "reorder_categories", - "add_categories", "remove_categories", - "remove_unused_categories", "set_categories", - "as_ordered", "as_unordered"], - typ="method") +@delegate_names( + delegate=Categorical, accessors=["categories", "ordered"], typ="property" +) +@delegate_names( + delegate=Categorical, + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + ], + typ="method", +) class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): """ Accessor object for categorical properties of the Series values. @@ -2511,8 +2599,9 @@ def __init__(self, data): @staticmethod def _validate(data): if not is_categorical_dtype(data.dtype): - raise AttributeError("Can only use .cat accessor with a " - "'category' dtype") + raise AttributeError( + "Can only use .cat accessor with a " "'category' dtype" + ) def _delegate_property_get(self, name): return getattr(self._parent, name) @@ -2526,10 +2615,12 @@ def codes(self): Return Series of codes as well as the index. """ from pandas import Series + return Series(self._parent.codes, index=self._index) def _delegate_method(self, name, *args, **kwargs): from pandas import Series + method = getattr(self._parent, name) res = method(*args, **kwargs) if res is not None: @@ -2540,10 +2631,12 @@ def categorical(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `categorical` will need to be removed from # `ok_for_cat`. - warn("`Series.cat.categorical` has been deprecated. Use the " - "attributes on 'Series.cat' directly instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.categorical` has been deprecated. 
Use the " + "attributes on 'Series.cat' directly instead.", + FutureWarning, + stacklevel=2, + ) return self._parent @property @@ -2551,10 +2644,11 @@ def name(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `name` will need to be removed from # `ok_for_cat`. - warn("`Series.cat.name` has been deprecated. Use `Series.name` " - "instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.name` has been deprecated. Use `Series.name` " "instead.", + FutureWarning, + stacklevel=2, + ) return self._name @property @@ -2562,12 +2656,14 @@ def index(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `index` will need to be removed from # ok_for_cat`. - warn("`Series.cat.index` has been deprecated. Use `Series.index` " - "instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.index` has been deprecated. Use `Series.index` " "instead.", + FutureWarning, + stacklevel=2, + ) return self._index + # utility routines @@ -2576,22 +2672,20 @@ def _get_codes_for_values(values, categories): utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables + dtype_equal = is_dtype_equal(values.dtype, categories.dtype) if dtype_equal: # To prevent erroneous dtype coercion in _get_data_algo, retrieve # the underlying numpy array. gh-22702 - values = getattr(values, '_ndarray_values', values) - categories = getattr(categories, '_ndarray_values', categories) - elif (is_extension_array_dtype(categories.dtype) and - is_object_dtype(values)): + values = getattr(values, "_ndarray_values", values) + categories = getattr(categories, "_ndarray_values", categories) + elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) try: - values = ( - categories.dtype.construct_array_type()._from_sequence(values) - ) + values = categories.dtype.construct_array_type()._from_sequence(values) except Exception: # but that may fail for any reason, so fall back to object values = ensure_object(values) @@ -2636,8 +2730,9 @@ def _recode_for_categories(codes, old_categories, new_categories): elif new_categories.equals(old_categories): # Same categories, so no need to actually recode return codes.copy() - indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), - new_categories) + indexer = coerce_indexer_dtype( + new_categories.get_indexer(old_categories), new_categories + ) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) return new_codes @@ -2647,8 +2742,7 @@ def _convert_to_list_like(list_like): return list_like if isinstance(list_like, list): return list_like - if (is_sequence(list_like) or isinstance(list_like, tuple) or - is_iterator(list_like)): + if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like): return list(list_like) elif is_scalar(list_like): return [list_like] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 93166759d8dbdd..540442b7eaed40 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -7,30 +7,40 @@ from pandas._libs import NaT, NaTType, Timestamp, algos, iNaT, lib from pandas._libs.tslibs.c_timestamp import maybe_integer_op_deprecated -from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period) +from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 from pandas.compat.numpy import function as nv -from pandas.errors import ( - AbstractMethodError, NullFrequencyError, PerformanceWarning) +from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, - is_object_dtype, is_offsetlike, is_period_dtype, is_string_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_offsetlike, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas._typing import DatetimeLikeScalar from pandas.core import missing, nanops -from pandas.core.algorithms import ( - checked_add_with_arr, take, unique1d, value_counts) +from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com from pandas.tseries import frequencies @@ -68,8 +78,7 @@ def _scalar_type(self) -> 
Type[DatetimeLikeScalar]: raise AbstractMethodError(self) def _scalar_from_string( - self, - value: str, + self, value: str ) -> Union[Period, Timestamp, Timedelta, NaTType]: """ Construct a scalar type from a string. @@ -90,10 +99,7 @@ def _scalar_from_string( """ raise AbstractMethodError(self) - def _unbox_scalar( - self, - value: Union[Period, Timestamp, Timedelta, NaTType], - ) -> int: + def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> int: """ Unbox the integer value of a scalar `value`. @@ -113,8 +119,7 @@ def _unbox_scalar( raise AbstractMethodError(self) def _check_compatible_with( - self, - other: Union[Period, Timestamp, Timedelta, NaTType], + self, other: Union[Period, Timestamp, Timedelta, NaTType] ) -> None: """ Verify that `self` and `other` are compatible. @@ -141,8 +146,10 @@ class DatelikeOps: Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. """ - @Substitution(URL="https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) def strftime(self, date_format): """ Convert to Index using specified date_format. @@ -179,6 +186,7 @@ def strftime(self, date_format): dtype='object') """ from pandas import Index + return Index(self._format_native_types(date_format=date_format)) @@ -187,8 +195,7 @@ class TimelikeOps: Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. """ - _round_doc = ( - """ + _round_doc = """ Perform {op} operation on the data to the specified `freq`. Parameters @@ -247,10 +254,9 @@ class TimelikeOps: DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', '2018-01-01 12:01:00'], dtype='datetime64[ns]', freq='T') - """) + """ - _round_example = ( - """>>> rng.round('H') + _round_example = """>>> rng.round('H') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) @@ -262,10 +268,9 @@ class TimelikeOps: 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 dtype: datetime64[ns] - """) + """ - _floor_example = ( - """>>> rng.floor('H') + _floor_example = """>>> rng.floor('H') DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) @@ -278,10 +283,8 @@ class TimelikeOps: 2 2018-01-01 12:00:00 dtype: datetime64[ns] """ - ) - _ceil_example = ( - """>>> rng.ceil('H') + _ceil_example = """>>> rng.ceil('H') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 13:00:00'], dtype='datetime64[ns]', freq=None) @@ -294,7 +297,6 @@ class TimelikeOps: 2 2018-01-01 13:00:00 dtype: datetime64[ns] """ - ) def _round(self, freq, mode, ambiguous, nonexistent): # round the local times @@ -310,23 +312,19 @@ def _round(self, freq, mode, ambiguous, nonexistent): ) @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round( - freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent - ) + def round(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous='raise', nonexistent='raise'): + def floor(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, 
ambiguous='raise', nonexistent='raise'): + def ceil(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) -class DatetimeLikeArrayMixin(ExtensionOpsMixin, - AttributesMixin, - ExtensionArray): +class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -365,7 +363,7 @@ def asi8(self) -> np.ndarray: An ndarray with int64 dtype. """ # do not cache or you'll create a memory leak - return self._data.view('i8') + return self._data.view("i8") @property def _ndarray_values(self): @@ -374,7 +372,7 @@ def _ndarray_values(self): # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None): """ Helper method for astype when converting to strings. @@ -417,9 +415,11 @@ def __getitem__(self, key): is_int = lib.is_integer(key) if lib.is_scalar(key) and not is_int: - raise IndexError("only integers, slices (`:`), ellipsis (`...`), " - "numpy.newaxis (`None`) and integer or boolean " - "arrays are valid indices") + raise IndexError( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean " + "arrays are valid indices" + ) getitem = self._data.__getitem__ if is_int: @@ -459,9 +459,9 @@ def __getitem__(self, key): return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( - self, - key: Union[int, Sequence[int], Sequence[bool], slice], - value: Union[NaTType, Any, Sequence[Any]] + self, + key: Union[int, Sequence[int], Sequence[bool], slice], + value: Union[NaTType, Any, Sequence[Any]], ) -> None: # I'm fudging the types a bit here. "Any" above really depends # on type(self). For PeriodArray, it's Period (or stuff coercible @@ -477,12 +477,12 @@ def __setitem__( if not is_slice: key = cast(Sequence, key) - if (len(key) != len(value) - and not com.is_bool_indexer(key)): - msg = ("shape mismatch: value array of length '{}' does " - "not match indexing result of length '{}'.") - raise ValueError(msg.format( - len(key), len(value))) + if len(key) != len(value) and not com.is_bool_indexer(key): + msg = ( + "shape mismatch: value array of length '{}' does " + "not match indexing result of length '{}'." + ) + raise ValueError(msg.format(len(key), len(value))) elif not len(key): return @@ -499,8 +499,9 @@ def __setitem__( "'value' should be a '{scalar}', 'NaT', or array of those. " "Got '{typ}' instead." ) - raise TypeError(msg.format(scalar=self._scalar_type.__name__, - typ=type(value).__name__)) + raise TypeError( + msg.format(scalar=self._scalar_type.__name__, typ=type(value).__name__) + ) self._data[key] = value self._maybe_clear_freq() @@ -515,6 +516,7 @@ def astype(self, dtype, copy=True): # 2. DatetimeArray.astype handles conversion between tz. # 3. 
DatetimeArray.astype handles datetime -> period from pandas import Categorical + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): @@ -533,11 +535,13 @@ def astype(self, dtype, copy=True): if copy: values = values.copy() return values - elif (is_datetime_or_timedelta_dtype(dtype) and - not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): + elif ( + is_datetime_or_timedelta_dtype(dtype) + and not is_dtype_equal(self.dtype, dtype) + ) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) elif is_categorical_dtype(dtype): return Categorical(self, dtype=dtype) @@ -589,10 +593,9 @@ def take(self, indices, allow_fill=False, fill_value=None): if allow_fill: fill_value = self._validate_fill_value(fill_value) - new_values = take(self.asi8, - indices, - allow_fill=allow_fill, - fill_value=fill_value) + new_values = take( + self.asi8, indices, allow_fill=allow_fill, fill_value=fill_value + ) return type(self)(new_values, dtype=self.dtype) @@ -624,7 +627,7 @@ def _values_for_argsort(self): # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, value, side='left', sorter=None): + def searchsorted(self, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -652,10 +655,10 @@ def searchsorted(self, value, side='left', sorter=None): if isinstance(value, str): value = self._scalar_from_string(value) - if not (isinstance(value, (self._scalar_type, type(self))) - or isna(value)): - raise ValueError("Unexpected type for 'value': {valtype}" - .format(valtype=type(value))) + if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): + raise ValueError( + "Unexpected type for 'value': {valtype}".format(valtype=type(value)) + ) self._check_compatible_with(value) if isinstance(value, type(self)): @@ -675,7 +678,7 @@ def repeat(self, repeats, *args, **kwargs): """ nv.validate_repeat(args, kwargs) values = self._data.repeat(repeats) - return type(self)(values.view('i8'), dtype=self.dtype) + return type(self)(values.view("i8"), dtype=self.dtype) def value_counts(self, dropna=False): """ @@ -700,8 +703,9 @@ def value_counts(self, dropna=False): cls = type(self) result = value_counts(values, sort=False, dropna=dropna) - index = Index(cls(result.index.view('i8'), dtype=self.dtype), - name=result.index.name) + index = Index( + cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name + ) return Series(result.values, index=index, name=result.name) def map(self, mapper): @@ -725,7 +729,7 @@ def _isnan(self): """ return if each value is nan """ - return (self.asi8 == iNaT) + return self.asi8 == iNaT @property # NB: override with cache_readonly in immutable subclasses def _hasnans(self): @@ -773,13 +777,15 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. 
Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - if method == 'pad': + if method == "pad": func = missing.pad_1d else: func = missing.backfill_1d @@ -791,8 +797,7 @@ def fillna(self, value=None, method=None, limit=None): # to avoid modifying `self` in-place. values = values.copy() - new_values = func(values, limit=limit, - mask=mask) + new_values = func(values, limit=limit, mask=mask) if is_datetime64tz_dtype(self): # we need to pass int64 values to the constructor to avoid # re-localizing incorrectly @@ -878,9 +883,9 @@ def _validate_frequency(cls, index, freq, **kwargs): return None try: - on_freq = cls._generate_range(start=index[0], end=None, - periods=len(index), freq=freq, - **kwargs) + on_freq = cls._generate_range( + start=index[0], end=None, periods=len(index), freq=freq, **kwargs + ) if not np.array_equal(index.asi8, on_freq.asi8): raise ValueError except ValueError as e: @@ -893,9 +898,12 @@ def _validate_frequency(cls, index, freq, **kwargs): # is `NaT`, in which case the call to `cls._generate_range` will # raise a ValueError, which we re-raise with a more targeted # message. - raise ValueError('Inferred frequency {infer} from passed values ' - 'does not conform to passed frequency {passed}' - .format(infer=inferred, passed=freq.freqstr)) + raise ValueError( + "Inferred frequency {infer} from passed values " + "does not conform to passed frequency {passed}".format( + infer=inferred, passed=freq.freqstr + ) + ) # monotonicity/uniqueness properties are called via frequencies.infer_freq, # see GH#23789 @@ -917,24 +925,28 @@ def _is_unique(self): def _add_datetimelike_scalar(self, other): # Overriden by TimedeltaArray - raise TypeError("cannot add {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot add {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) _add_datetime_arraylike = _add_datetimelike_scalar def _sub_datetimelike_scalar(self, other): # Overridden by DatetimeArray assert other is not NaT - raise TypeError("cannot subtract a datelike from a {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "cannot subtract a datelike from a {cls}".format(cls=type(self).__name__) + ) _sub_datetime_arraylike = _sub_datetimelike_scalar def _sub_period(self, other): # Overriden by PeriodArray - raise TypeError("cannot subtract Period from a {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "cannot subtract Period from a {cls}".format(cls=type(self).__name__) + ) def _add_offset(self, offset): raise AbstractMethodError(self) @@ -973,15 +985,16 @@ def _add_timedeltalike_scalar(self, other): """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds - new_values = np.empty(len(self), dtype='i8') + new_values = np.empty(len(self), dtype="i8") new_values[:] = iNaT return new_values inc = delta_to_nanoseconds(other) - new_values = checked_add_with_arr(self.asi8, inc, - arr_mask=self._isnan).view('i8') + new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( + "i8" + ) new_values = self._maybe_mask_results(new_values) - return new_values.view('i8') + return new_values.view("i8") def _add_delta_tdi(self, other): """ @@ -994,26 +1007,29 @@ def _add_delta_tdi(self, other): if isinstance(other, np.ndarray): # ndarray[timedelta64]; wrap in TimedeltaIndex for op from pandas import TimedeltaIndex + other = TimedeltaIndex(other) self_i8 = self.asi8 other_i8 = 
other.asi8 - new_values = checked_add_with_arr(self_i8, other_i8, - arr_mask=self._isnan, - b_mask=other._isnan) + new_values = checked_add_with_arr( + self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan + ) if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT - return new_values.view('i8') + return new_values.view("i8") def _add_nat(self): """ Add pd.NaT to self """ if is_period_dtype(self): - raise TypeError('Cannot add {cls} and {typ}' - .format(cls=type(self).__name__, - typ=type(NaT).__name__)) + raise TypeError( + "Cannot add {cls} and {typ}".format( + cls=type(self).__name__, typ=type(NaT).__name__ + ) + ) # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes @@ -1033,7 +1049,7 @@ def _sub_nat(self): # For period dtype, timedelta64 is a close-enough return dtype. result = np.zeros(len(self), dtype=np.int64) result.fill(iNaT) - return result.view('timedelta64[ns]') + return result.view("timedelta64[ns]") def _sub_period_array(self, other): """ @@ -1051,22 +1067,23 @@ def _sub_period_array(self, other): Array of DateOffset objects; nulls represented by NaT. """ if not is_period_dtype(self): - raise TypeError("cannot subtract {dtype}-dtype from {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot subtract {dtype}-dtype from {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) if len(self) != len(other): - raise ValueError("cannot subtract arrays/indices of " - "unequal length") + raise ValueError("cannot subtract arrays/indices of " "unequal length") if self.freq != other.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) - new_values = checked_add_with_arr(self.asi8, -other.asi8, - arr_mask=self._isnan, - b_mask=other._isnan) + new_values = checked_add_with_arr( + self.asi8, -other.asi8, arr_mask=self._isnan, b_mask=other._isnan + ) new_values = np.array([self.freq.base * x for x in new_values]) if self._hasnans or other._hasnans: @@ -1125,17 +1142,19 @@ def _addsub_offset_array(self, other, op): if len(other) == 1: return op(self, other[0]) - warnings.warn("Adding/subtracting array of DateOffsets to " - "{cls} not vectorized" - .format(cls=type(self).__name__), PerformanceWarning) + warnings.warn( + "Adding/subtracting array of DateOffsets to " + "{cls} not vectorized".format(cls=type(self).__name__), + PerformanceWarning, + ) # For EA self.astype('O') returns a numpy array, not an Index - left = lib.values_from_object(self.astype('O')) + left = lib.values_from_object(self.astype("O")) res_values = op(left, np.array(other)) kwargs = {} if not is_period_dtype(self): - kwargs['freq'] = 'infer' + kwargs["freq"] = "infer" return self._from_sequence(res_values, **kwargs) def _time_shift(self, periods, freq=None): @@ -1173,8 +1192,7 @@ def _time_shift(self, periods, freq=None): # Note: in the DatetimeTZ case, _generate_range will infer the # appropriate timezone from `start` and `end`, so tz does not need # to be passed explicitly. 
- return self._generate_range(start=start, end=end, periods=None, - freq=self.freq) + return self._generate_range(start=start, end=end, periods=None, freq=self.freq) def __add__(self, other): other = lib.item_from_zerodim(other) @@ -1214,9 +1232,11 @@ def __add__(self, other): result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes - raise TypeError("cannot add {dtype}-dtype to {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot add {dtype}-dtype to {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) elif is_period_dtype(other): # if self is a TimedeltaArray and other is a PeriodArray with # a timedelta-like (i.e. Tick) freq, this operation is valid. @@ -1231,6 +1251,7 @@ def __add__(self, other): if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray + # TODO: infer freq? return TimedeltaArray(result) return result @@ -1282,14 +1303,18 @@ def __sub__(self, other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): - raise TypeError("cannot subtract {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot subtract {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) elif is_float_dtype(other): # Explicitly catch invalid dtypes - raise TypeError("cannot subtract {dtype}-dtype from {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot subtract {dtype}-dtype from {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented @@ -1298,6 +1323,7 @@ def __sub__(self, other): if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray + # TODO: infer freq? return TimedeltaArray(result) return result @@ -1309,20 +1335,28 @@ def __rsub__(self, other): if not isinstance(other, DatetimeLikeArrayMixin): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray + other = DatetimeArray(other) return other - self - elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and - not is_datetime64_any_dtype(other)): + elif ( + is_datetime64_any_dtype(self) + and hasattr(other, "dtype") + and not is_datetime64_any_dtype(other) + ): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. - raise TypeError("cannot subtract {cls} from {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot subtract {cls} from {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) elif is_period_dtype(self) and is_timedelta64_dtype(other): # TODO: Can we simplify/generalize these cases at all? 
- raise TypeError("cannot subtract {cls} from {dtype}" - .format(cls=type(self).__name__, - dtype=other.dtype)) + raise TypeError( + "cannot subtract {cls} from {dtype}".format( + cls=type(self).__name__, dtype=other.dtype + ) + ) return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 @@ -1337,8 +1371,9 @@ def __isub__(self, other): # -------------------------------------------------------------- # Comparison Methods - def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', - from_utc=False): + def _ensure_localized( + self, arg, ambiguous="raise", nonexistent="raise", from_utc=False + ): """ Ensure that we are re-localized. @@ -1360,12 +1395,12 @@ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', """ # reconvert to local tz - tz = getattr(self, 'tz', None) + tz = getattr(self, "tz", None) if tz is not None: if not isinstance(arg, type(self)): arg = self._simple_new(arg) if from_utc: - arg = arg.tz_localize('UTC').tz_convert(self.tz) + arg = arg.tz_localize("UTC").tz_convert(self.tz) else: arg = arg.tz_localize( self.tz, ambiguous=ambiguous, nonexistent=nonexistent @@ -1463,8 +1498,8 @@ def mean(self, skipna=True): raise TypeError( "mean is not implemented for {cls} since the meaning is " "ambiguous. An alternative is " - "obj.to_timestamp(how='start').mean()" - .format(cls=type(self).__name__)) + "obj.to_timestamp(how='start').mean()".format(cls=type(self).__name__) + ) mask = self.isna() if skipna: @@ -1478,7 +1513,7 @@ def mean(self, skipna=True): # short-circut for empty max / min return NaT - result = nanops.nanmean(values.view('i8'), skipna=skipna) + result = nanops.nanmean(values.view("i8"), skipna=skipna) # Don't have to worry about NA `result`, since no NA went in. 
return self._box_func(result) @@ -1486,6 +1521,7 @@ def mean(self, skipna=True): # ------------------------------------------------------------------- # Shared Constructor Helpers + def validate_periods(periods): """ If a `periods` argument is passed to the Datetime/Timedelta Array/Index @@ -1508,8 +1544,9 @@ def validate_periods(periods): if lib.is_float(periods): periods = int(periods) elif not lib.is_integer(periods): - raise TypeError('periods must be a number, got {periods}' - .format(periods=periods)) + raise TypeError( + "periods must be a number, got {periods}".format(periods=periods) + ) return periods @@ -1569,11 +1606,11 @@ def validate_inferred_freq(freq, inferred_freq, freq_infer): """ if inferred_freq is not None: if freq is not None and freq != inferred_freq: - raise ValueError('Inferred frequency {inferred} from passed ' - 'values does not conform to passed frequency ' - '{passed}' - .format(inferred=inferred_freq, - passed=freq.freqstr)) + raise ValueError( + "Inferred frequency {inferred} from passed " + "values does not conform to passed frequency " + "{passed}".format(inferred=inferred_freq, passed=freq.freqstr) + ) elif freq is None: freq = inferred_freq freq_infer = False @@ -1600,7 +1637,7 @@ def maybe_infer_freq(freq): freq_infer = False if not isinstance(freq, DateOffset): # if a passed freq is None, don't infer automatically - if freq != 'infer': + if freq != "infer": freq = frequencies.to_offset(freq) else: freq_infer = True @@ -1628,17 +1665,16 @@ def _ensure_datetimelike_to_i8(other, to_utc=False): if lib.is_scalar(other) and isna(other): return iNaT - elif isinstance(other, (PeriodArray, ABCIndexClass, - DatetimeLikeArrayMixin)): + elif isinstance(other, (PeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)): # convert tz if needed - if getattr(other, 'tz', None) is not None: + if getattr(other, "tz", None) is not None: if to_utc: - other = other.tz_convert('UTC') + other = other.tz_convert("UTC") else: other = other.tz_localize(None) else: try: - return np.array(other, copy=False).view('i8') + return np.array(other, copy=False).view("i8") except TypeError: # period array cannot be coerced to int other = Index(other) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b554ddf25c96d..5b540dcce53c8c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -8,20 +8,44 @@ from pandas._libs import lib, tslib from pandas._libs.tslibs import ( - NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date, - resolution as libresolution, timezones, tzconversion) + NaT, + Timestamp, + ccalendar, + conversion, + fields, + iNaT, + normalize_date, + resolution as libresolution, + timezones, + tzconversion, +) import pandas.compat as compat from pandas.errors import PerformanceWarning from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_type, is_float_dtype, is_object_dtype, is_period_dtype, - is_string_dtype, is_timedelta64_dtype, pandas_dtype) + _INT64_DTYPE, + _NS_DTYPE, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_type, + is_float_dtype, + is_object_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( - 
ABCDataFrame, ABCIndexClass, ABCPandasArray, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCPandasArray, + ABCSeries, +) from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -92,15 +116,16 @@ def f(self): values = self._local_timestamps() if field in self._bool_ops: - if field.endswith(('start', 'end')): + if field.endswith(("start", "end")): freq = self.freq month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get('startingMonth', kwds.get('month', 12)) + month_kw = kwds.get("startingMonth", kwds.get("month", 12)) - result = fields.get_start_end_field(values, field, - self.freqstr, month_kw) + result = fields.get_start_end_field( + values, field, self.freqstr, month_kw + ) else: result = fields.get_date_field(values, field) @@ -113,8 +138,9 @@ def f(self): else: result = fields.get_date_field(values, field) - result = self._maybe_mask_results(result, fill_value=None, - convert='float64') + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return result @@ -127,8 +153,8 @@ def _dt_array_cmp(cls, op): """ Wrap comparison operations to convert datetime-like to datetime64 """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): @@ -147,7 +173,7 @@ def wrapper(self, other): # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) - result = op(self.asi8, other.view('i8')) + result = op(self.asi8, other.view("i8")) if isna(other): result.fill(nat_result) elif lib.is_scalar(other) or np.ndim(other) == 0: @@ -160,8 +186,9 @@ def wrapper(self, other): other = type(self)._from_sequence(other) except ValueError: other = np.array(other, dtype=np.object_) - elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, - DatetimeArray)): + elif not isinstance( + other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray) + ): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) @@ -170,13 +197,12 @@ def wrapper(self, other): # We have to use _comp_method_OBJECT_ARRAY instead of numpy # comparison otherwise it would fail to raise when # comparing tz-aware and tz-naive - with np.errstate(all='ignore'): - result = ops._comp_method_OBJECT_ARRAY(op, - self.astype(object), - other) + with np.errstate(all="ignore"): + result = ops._comp_method_OBJECT_ARRAY( + op, self.astype(object), other + ) o_mask = isna(other) - elif not (is_datetime64_dtype(other) or - is_datetime64tz_dtype(other)): + elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: @@ -184,14 +210,16 @@ def wrapper(self, other): if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array - if (is_datetime64_dtype(other) and - not is_datetime64_ns_dtype(other) or - not hasattr(other, 'asi8')): + if ( + is_datetime64_dtype(other) + and not is_datetime64_ns_dtype(other) + or not hasattr(other, "asi8") + ): # e.g. 
other.dtype == 'datetime64[s]' # or an object-dtype ndarray other = type(self)._from_sequence(other) - result = op(self.view('i8'), other.view('i8')) + result = op(self.view("i8"), other.view("i8")) o_mask = other._isnan result = com.values_from_object(result) @@ -207,9 +235,7 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class DatetimeArray(dtl.DatetimeLikeArrayMixin, - dtl.TimelikeOps, - dtl.DatelikeOps): +class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. @@ -245,25 +271,53 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, ------- None """ + _typ = "datetimearray" _scalar_type = Timestamp # define my properties & methods for delegation - _bool_ops = ['is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'is_leap_year'] - _object_ops = ['weekday_name', 'freq', 'tz'] - _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'week', 'weekday', 'dayofweek', - 'dayofyear', 'quarter', 'days_in_month', - 'daysinmonth', 'microsecond', - 'nanosecond'] - _other_ops = ['date', 'time', 'timetz'] + _bool_ops = [ + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + ] + _object_ops = ["weekday_name", "freq", "tz"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "weekday", + "dayofweek", + "dayofyear", + "quarter", + "days_in_month", + "daysinmonth", + "microsecond", + "nanosecond", + ] + _other_ops = ["date", "time", "timetz"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops - _datetimelike_methods = ['to_period', 'tz_localize', - 'tz_convert', - 'normalize', 'strftime', 'round', 'floor', - 'ceil', 'month_name', 'day_name'] + _datetimelike_methods = [ + "to_period", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "month_name", + "day_name", + ] # ndim is inherited from ExtensionArray, must exist to ensure # Timestamp.__richcmp__(DateTimeArray) operates pointwise @@ -286,7 +340,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if isinstance(values, type(self)): # validation - dtz = getattr(dtype, 'tz', None) + dtz = getattr(dtype, "tz", None) if dtz and values.tz is None: dtype = DatetimeTZDtype(tz=dtype.tz) elif dtz and values.tz: @@ -312,7 +366,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if values.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - if values.dtype == 'i8': + if values.dtype == "i8": # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps @@ -338,7 +392,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): values = values.copy() if freq: freq = to_offset(freq) - if getattr(dtype, 'tz', None): + if getattr(dtype, "tz", None): # https://github.com/pandas-dev/pandas/issues/18595 # Ensure that we have a standard timezone for pytz objects. 
# Without this, things like adding an array of timedeltas and @@ -356,7 +410,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): @classmethod def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): assert isinstance(values, np.ndarray) - if values.dtype == 'i8': + if values.dtype == "i8": values = values.view(_NS_DTYPE) result = object.__new__(cls) @@ -366,20 +420,33 @@ def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): return result @classmethod - def _from_sequence(cls, data, dtype=None, copy=False, - tz=None, freq=None, - dayfirst=False, yearfirst=False, ambiguous='raise', - int_as_wall_time=False): + def _from_sequence( + cls, + data, + dtype=None, + copy=False, + tz=None, + freq=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", + int_as_wall_time=False, + ): freq, freq_infer = dtl.maybe_infer_freq(freq) subarr, tz, inferred_freq = sequence_to_dt64ns( - data, dtype=dtype, copy=copy, tz=tz, - dayfirst=dayfirst, yearfirst=yearfirst, - ambiguous=ambiguous, int_as_wall_time=int_as_wall_time) + data, + dtype=dtype, + copy=copy, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + int_as_wall_time=int_as_wall_time, + ) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, - freq_infer) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) dtype = tz_to_dtype(tz) result = cls._simple_new(subarr, freq=freq, dtype=dtype) @@ -396,18 +463,28 @@ def _from_sequence(cls, data, dtype=None, copy=False, return result @classmethod - def _generate_range(cls, start, end, periods, freq, tz=None, - normalize=False, ambiguous='raise', - nonexistent='raise', closed=None): + def _generate_range( + cls, + start, + end, + periods, + freq, + tz=None, + normalize=False, + ambiguous="raise", + nonexistent="raise", + closed=None, + ): periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') + raise ValueError("Must provide freq argument if no data is " "supplied") if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, ' - 'and freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) freq = to_offset(freq) if start is not None: @@ -418,27 +495,31 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if start is None and end is None: if closed is not None: - raise ValueError("Closed has to be None if not both of start" - "and end are defined") + raise ValueError( + "Closed has to be None if not both of start" "and end are defined" + ) if start is NaT or end is NaT: raise ValueError("Neither `start` nor `end` can be NaT") left_closed, right_closed = dtl.validate_endpoints(closed) - start, end, _normalized = _maybe_normalize_endpoints(start, end, - normalize) + start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize) tz = _infer_tz_from_endpoints(start, end, tz) if tz is not None: # Localize the start and end arguments start = _maybe_localize_point( - start, getattr(start, 'tz', None), start, freq, tz, - ambiguous, nonexistent + start, + getattr(start, "tz", None), + start, + freq, + tz, + ambiguous, + nonexistent, ) end = _maybe_localize_point( - end, getattr(end, 'tz', None), end, freq, tz, - ambiguous, nonexistent + end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent ) if 
freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -455,8 +536,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( - index.asi8, - tz, ambiguous=ambiguous, nonexistent=nonexistent) + index.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent + ) index = cls(arr) @@ -471,12 +552,13 @@ def _generate_range(cls, start, end, periods, freq, tz=None, # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible - arr = np.linspace( - 0, end.value - start.value, - periods, dtype='int64') + start.value + arr = ( + np.linspace(0, end.value - start.value, periods, dtype="int64") + + start.value + ) dtype = tz_to_dtype(tz) index = cls._simple_new( - arr.astype('M8[ns]', copy=False), freq=None, dtype=dtype + arr.astype("M8[ns]", copy=False), freq=None, dtype=dtype ) if not left_closed and len(index) and index[0] == start: @@ -504,8 +586,11 @@ def _check_compatible_with(self, other): if other is NaT: return if not timezones.tz_compare(self.tz, other.tz): - raise ValueError("Timezones don't match. '{own} != {other}'" - .format(own=self.tz, other=other.tz)) + raise ValueError( + "Timezones don't match. '{own} != {other}'".format( + own=self.tz, other=other.tz + ) + ) def _maybe_clear_freq(self): self._freq = None @@ -555,8 +640,10 @@ def tz(self): @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. Use tz_localize() " + "or tz_convert() as appropriate" + ) @property def tzinfo(self): @@ -610,9 +697,9 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, length) - converted = tslib.ints_to_pydatetime(data[start_i:end_i], - tz=self.tz, freq=self.freq, - box="timestamp") + converted = tslib.ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) for v in converted: yield v @@ -623,11 +710,10 @@ def astype(self, dtype, copy=True): # DatetimeLikeArrayMixin Super handles the rest. dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): + if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): # GH#18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: + new_tz = getattr(dtype, "tz", None) + if getattr(self.dtype, "tz", None) is None: return self.tz_localize(new_tz) result = self.tz_convert(new_tz) if new_tz is None: @@ -636,8 +722,7 @@ def astype(self, dtype, copy=True): # ndarray, but we could maybe work around it there. result = result._data return result - elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, - dtype): + elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, dtype): if copy: return self.copy() return self @@ -656,21 +741,23 @@ def _validate_fill_value(self, fill_value): self._assert_tzawareness_compat(fill_value) fill_value = Timestamp(fill_value).value else: - raise ValueError("'fill_value' should be a Timestamp. " - "Got '{got}'.".format(got=fill_value)) + raise ValueError( + "'fill_value' should be a Timestamp. 
" + "Got '{got}'.".format(got=fill_value) + ) return fill_value # ----------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(self, date_format) - return tslib.format_array_from_datetime(self.asi8, - tz=self.tz, - format=fmt, - na_rep=na_rep) + return tslib.format_array_from_datetime( + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + ) # ----------------------------------------------------------------- # Comparison Methods @@ -684,12 +771,12 @@ def _has_same_tz(self, other): if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) - vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__')) + vzone = timezones.get_timezone(getattr(other, "tzinfo", "__no_tz__")) return zzone == vzone def _assert_tzawareness_compat(self, other): # adapted from _Timestamp._assert_tzawareness_compat - other_tz = getattr(other, 'tzinfo', None) + other_tz = getattr(other, "tzinfo", None) if is_datetime64tz_dtype(other): # Get tzinfo from Series dtype other_tz = other.dtype.tz @@ -698,11 +785,13 @@ def _assert_tzawareness_compat(self, other): pass elif self.tz is None: if other_tz is not None: - raise TypeError('Cannot compare tz-naive and tz-aware ' - 'datetime-like objects.') + raise TypeError( + "Cannot compare tz-naive and tz-aware " "datetime-like objects." + ) elif other_tz is None: - raise TypeError('Cannot compare tz-naive and tz-aware ' - 'datetime-like objects') + raise TypeError( + "Cannot compare tz-naive and tz-aware " "datetime-like objects" + ) # ----------------------------------------------------------------- # Arithmetic Methods @@ -718,18 +807,18 @@ def _sub_datetime_arraylike(self, other): if not self._has_same_tz(other): # require tz compat - raise TypeError("{cls} subtraction must have the same " - "timezones or no timezones" - .format(cls=type(self).__name__)) + raise TypeError( + "{cls} subtraction must have the same " + "timezones or no timezones".format(cls=type(self).__name__) + ) self_i8 = self.asi8 other_i8 = other.asi8 arr_mask = self._isnan | other._isnan - new_values = checked_add_with_arr(self_i8, -other_i8, - arr_mask=arr_mask) + new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) if self._hasnans or other._hasnans: new_values[arr_mask] = iNaT - return new_values.view('timedelta64[ns]') + return new_values.view("timedelta64[ns]") def _add_offset(self, offset): assert not isinstance(offset, Tick) @@ -743,11 +832,13 @@ def _add_offset(self, offset): result = result.tz_localize(self.tz) except NotImplementedError: - warnings.warn("Non-vectorized DateOffset being applied to Series " - "or DatetimeIndex", PerformanceWarning) - result = self.astype('O') + offset + warnings.warn( + "Non-vectorized DateOffset being applied to Series " "or DatetimeIndex", + PerformanceWarning, + ) + result = self.astype("O") + offset - return type(self)._from_sequence(result, freq='infer') + return type(self)._from_sequence(result, freq="infer") def _sub_datetimelike_scalar(self, other): # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] @@ -759,14 +850,14 @@ def _sub_datetimelike_scalar(self, other): if not self._has_same_tz(other): # require tz compat - raise TypeError("Timestamp subtraction must 
have the same " - "timezones or no timezones") + raise TypeError( + "Timestamp subtraction must have the same " "timezones or no timezones" + ) i8 = self.asi8 - result = checked_add_with_arr(i8, -other.value, - arr_mask=self._isnan) + result = checked_add_with_arr(i8, -other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) - return result.view('timedelta64[ns]') + return result.view("timedelta64[ns]") def _add_delta(self, delta): """ @@ -783,7 +874,7 @@ def _add_delta(self, delta): result : DatetimeArray """ new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, tz=self.tz, freq='infer') + return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods @@ -865,15 +956,15 @@ def tz_convert(self, tz): if self.tz is None: # tz naive, use tz_localize - raise TypeError('Cannot convert tz-naive timestamps, use ' - 'tz_localize to localize') + raise TypeError( + "Cannot convert tz-naive timestamps, use " "tz_localize to localize" + ) # No conversion since timestamps are all UTC to begin with dtype = tz_to_dtype(tz) return self._simple_new(self.asi8, dtype=dtype, freq=self.freq) - def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', - errors=None): + def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): """ Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. @@ -1021,30 +1112,35 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', dtype: datetime64[ns, 'Europe/Warsaw'] """ if errors is not None: - warnings.warn("The errors argument is deprecated and will be " - "removed in a future release. Use " - "nonexistent='NaT' or nonexistent='raise' " - "instead.", FutureWarning) - if errors == 'coerce': - nonexistent = 'NaT' - elif errors == 'raise': - nonexistent = 'raise' + warnings.warn( + "The errors argument is deprecated and will be " + "removed in a future release. Use " + "nonexistent='NaT' or nonexistent='raise' " + "instead.", + FutureWarning, + ) + if errors == "coerce": + nonexistent = "NaT" + elif errors == "raise": + nonexistent = "raise" else: - raise ValueError("The errors argument must be either 'coerce' " - "or 'raise'.") + raise ValueError( + "The errors argument must be either 'coerce' " "or 'raise'." 
+ ) - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise'," + " 'NaT', 'shift_forward', 'shift_backward' or" + " a timedelta object" + ) if self.tz is not None: if tz is None: - new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, - self.tz) + new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: @@ -1052,7 +1148,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', # Convert to UTC new_dates = conversion.tz_localize_to_utc( - self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent, + self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent ) new_dates = new_dates.view(_NS_DTYPE) dtype = tz_to_dtype(tz) @@ -1114,12 +1210,11 @@ def normalize(self): not_null = ~self.isna() DAY_NS = ccalendar.DAY_SECONDS * 1000000000 new_values = self.asi8.copy() - adjustment = (new_values[not_null] % DAY_NS) + adjustment = new_values[not_null] % DAY_NS new_values[not_null] = new_values[not_null] - adjustment else: new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) - return type(self)._from_sequence(new_values, - freq='infer').tz_localize(self.tz) + return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz) def to_period(self, freq=None): """ @@ -1168,15 +1263,19 @@ def to_period(self, freq=None): from pandas.core.arrays import PeriodArray if self.tz is not None: - warnings.warn("Converting to PeriodArray/Index representation " - "will drop timezone information.", UserWarning) + warnings.warn( + "Converting to PeriodArray/Index representation " + "will drop timezone information.", + UserWarning, + ) if freq is None: freq = self.freqstr or self.inferred_freq if freq is None: - raise ValueError("You must pass a freq argument as " - "current index has none.") + raise ValueError( + "You must pass a freq argument as " "current index has none." 
+ ) freq = get_period_alias(freq) @@ -1198,8 +1297,9 @@ def to_perioddelta(self, freq): """ # TODO: consider privatizing (discussion in GH#23113) from pandas.core.arrays.timedeltas import TimedeltaArray + i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 - m8delta = i8delta.view('m8[ns]') + m8delta = i8delta.view("m8[ns]") return TimedeltaArray(m8delta) # ----------------------------------------------------------------- @@ -1236,8 +1336,7 @@ def month_name(self, locale=None): else: values = self.asi8 - result = fields.get_date_name_field(values, 'month_name', - locale=locale) + result = fields.get_date_name_field(values, "month_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) return result @@ -1272,8 +1371,7 @@ def day_name(self, locale=None): else: values = self.asi8 - result = fields.get_date_name_field(values, 'day_name', - locale=locale) + result = fields.get_date_name_field(values, "day_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) return result @@ -1316,19 +1414,17 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "The year of the datetime.") - month = _field_accessor('month', 'M', - "The month as January=1, December=12. ") - day = _field_accessor('day', 'D', "The days of the datetime.") - hour = _field_accessor('hour', 'h', "The hours of the datetime.") - minute = _field_accessor('minute', 'm', "The minutes of the datetime.") - second = _field_accessor('second', 's', "The seconds of the datetime.") - microsecond = _field_accessor('microsecond', 'us', - "The microseconds of the datetime.") - nanosecond = _field_accessor('nanosecond', 'ns', - "The nanoseconds of the datetime.") - weekofyear = _field_accessor('weekofyear', 'woy', - "The week ordinal of the year.") + year = _field_accessor("year", "Y", "The year of the datetime.") + month = _field_accessor("month", "M", "The month as January=1, December=12. ") + day = _field_accessor("day", "D", "The days of the datetime.") + hour = _field_accessor("hour", "h", "The hours of the datetime.") + minute = _field_accessor("minute", "m", "The minutes of the datetime.") + second = _field_accessor("second", "s", "The seconds of the datetime.") + microsecond = _field_accessor( + "microsecond", "us", "The microseconds of the datetime." + ) + nanosecond = _field_accessor("nanosecond", "ns", "The nanoseconds of the datetime.") + weekofyear = _field_accessor("weekofyear", "woy", "The week ordinal of the year.") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1364,21 +1460,20 @@ def date(self): 2017-01-08 6 Freq: D, dtype: int64 """ - dayofweek = _field_accessor('dayofweek', 'dow', _dayofweek_doc) + dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) weekday = dayofweek weekday_name = _field_accessor( - 'weekday_name', - 'weekday_name', - "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") + "weekday_name", + "weekday_name", + "The name of day in a week (ex: Friday)\n\n.. 
deprecated:: 0.23.0", + ) - dayofyear = _field_accessor('dayofyear', 'doy', - "The ordinal day of the year.") - quarter = _field_accessor('quarter', 'q', "The quarter of the date.") + dayofyear = _field_accessor("dayofyear", "doy", "The ordinal day of the year.") + quarter = _field_accessor("quarter", "q", "The quarter of the date.") days_in_month = _field_accessor( - 'days_in_month', - 'dim', - "The number of days in the month.") + "days_in_month", "dim", "The number of days in the month." + ) daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. @@ -1425,18 +1520,16 @@ def date(self): array([False, True, False]) """ is_month_start = _field_accessor( - 'is_month_start', - 'is_month_start', - _is_month_doc.format(first_or_last='first')) + "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first") + ) is_month_end = _field_accessor( - 'is_month_end', - 'is_month_end', - _is_month_doc.format(first_or_last='last')) + "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last") + ) is_quarter_start = _field_accessor( - 'is_quarter_start', - 'is_quarter_start', + "is_quarter_start", + "is_quarter_start", """ Indicator for whether the date is the first day of a quarter. @@ -1474,10 +1567,11 @@ def date(self): >>> idx.is_quarter_start array([False, False, True, False]) - """) + """, + ) is_quarter_end = _field_accessor( - 'is_quarter_end', - 'is_quarter_end', + "is_quarter_end", + "is_quarter_end", """ Indicator for whether the date is the last day of a quarter. @@ -1515,10 +1609,11 @@ def date(self): >>> idx.is_quarter_end array([False, True, False, False]) - """) + """, + ) is_year_start = _field_accessor( - 'is_year_start', - 'is_year_start', + "is_year_start", + "is_year_start", """ Indicate whether the date is the first day of a year. @@ -1558,10 +1653,11 @@ def date(self): >>> idx.is_year_start array([False, False, True]) - """) + """, + ) is_year_end = _field_accessor( - 'is_year_end', - 'is_year_end', + "is_year_end", + "is_year_end", """ Indicate whether the date is the last day of the year. @@ -1601,10 +1697,11 @@ def date(self): >>> idx.is_year_end array([False, True, False]) - """) + """, + ) is_leap_year = _field_accessor( - 'is_leap_year', - 'is_leap_year', + "is_leap_year", + "is_leap_year", """ Boolean indicator if the date belongs to a leap year. 
@@ -1641,7 +1738,8 @@ def date(self): 1 False 2 False dtype: bool - """) + """, + ) def to_julian_date(self): """ @@ -1657,19 +1755,23 @@ def to_julian_date(self): testarr = month < 3 year[testarr] -= 1 month[testarr] += 12 - return (day + - np.fix((153 * month - 457) / 5) + - 365 * year + - np.floor(year / 4) - - np.floor(year / 100) + - np.floor(year / 400) + - 1721118.5 + - (self.hour + - self.minute / 60.0 + - self.second / 3600.0 + - self.microsecond / 3600.0 / 1e+6 + - self.nanosecond / 3600.0 / 1e+9 - ) / 24.0) + return ( + day + + np.fix((153 * month - 457) / 5) + + 365 * year + + np.floor(year / 4) + - np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + ( + self.hour + + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e6 + + self.nanosecond / 3600.0 / 1e9 + ) + / 24.0 + ) DatetimeArray._add_comparison_ops() @@ -1678,10 +1780,17 @@ def to_julian_date(self): # ------------------------------------------------------------------- # Constructor Helpers -def sequence_to_dt64ns(data, dtype=None, copy=False, - tz=None, - dayfirst=False, yearfirst=False, ambiguous='raise', - int_as_wall_time=False): + +def sequence_to_dt64ns( + data, + dtype=None, + copy=False, + tz=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", + int_as_wall_time=False, +): """ Parameters ---------- @@ -1748,13 +1857,14 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False - if lib.infer_dtype(data, skipna=False) == 'integer': + if lib.infer_dtype(data, skipna=False) == "integer": data = data.astype(np.int64) else: # data comes back here as either i8 to denote UTC timestamps # or M8[ns] to denote wall times data, inferred_tz = objects_to_datetime64ns( - data, dayfirst=dayfirst, yearfirst=yearfirst) + data, dayfirst=dayfirst, yearfirst=yearfirst + ) tz = maybe_infer_tz(tz, inferred_tz) # When a sequence of timestamp objects is passed, we always # want to treat the (now i8-valued) data as UTC timestamps, @@ -1777,8 +1887,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, if tz is not None: # Convert tz-naive to UTC tz = timezones.maybe_get_tz(tz) - data = conversion.tz_localize_to_utc(data.view('i8'), tz, - ambiguous=ambiguous) + data = conversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous + ) data = data.view(_NS_DTYPE) assert data.dtype == _NS_DTYPE, data.dtype @@ -1794,8 +1905,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, data = data.astype(np.int64, copy=False) if int_as_wall_time and tz is not None and not timezones.is_utc(tz): warnings.warn(_i8_message, FutureWarning, stacklevel=4) - data = conversion.tz_localize_to_utc(data.view('i8'), tz, - ambiguous=ambiguous) + data = conversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous + ) data = data.view(_NS_DTYPE) result = data.view(_NS_DTYPE) @@ -1804,7 +1916,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, result = result.copy() assert isinstance(result, np.ndarray), type(result) - assert result.dtype == 'M8[ns]', result.dtype + assert result.dtype == "M8[ns]", result.dtype # We have to call this again after possibly inferring a tz above validate_tz_from_dtype(dtype, tz) @@ -1812,9 +1924,15 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, return result, tz, inferred_freq -def objects_to_datetime64ns(data, dayfirst, yearfirst, - utc=False, errors="raise", - require_iso8601=False, allow_object=False): +def 
objects_to_datetime64ns( + data, + dayfirst, + yearfirst, + utc=False, + errors="raise", + require_iso8601=False, + allow_object=False, +): """ Convert data to array of timestamps. @@ -1854,14 +1972,14 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - require_iso8601=require_iso8601 + require_iso8601=require_iso8601, ) except ValueError as e: try: values, tz_parsed = conversion.datetime_to_datetime64(data) # If tzaware, these values represent unix timestamps, so we # return them as i8 to distinguish from wall times - return values.view('i8'), tz_parsed + return values.view("i8"), tz_parsed except (ValueError, TypeError): raise e @@ -1869,7 +1987,7 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst, # We can take a shortcut since the datetime64 numpy array # is in UTC # Return i8 values to denote unix timestamps - return result.view('i8'), tz_parsed + return result.view("i8"), tz_parsed elif is_datetime64_dtype(result): # returning M8[ns] denotes wall-times; since tz is None # the distinction is a thin one @@ -1917,16 +2035,20 @@ def maybe_convert_dtype(data, copy): # with integer dtypes. See discussion in GH#23675 elif is_timedelta64_dtype(data): - warnings.warn("Passing timedelta64-dtype data is deprecated, will " - "raise a TypeError in a future version", - FutureWarning, stacklevel=5) + warnings.warn( + "Passing timedelta64-dtype data is deprecated, will " + "raise a TypeError in a future version", + FutureWarning, + stacklevel=5, + ) data = data.view(_NS_DTYPE) elif is_period_dtype(data): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails - raise TypeError("Passing PeriodDtype data is invalid. " - "Use `data.to_timestamp()` instead") + raise TypeError( + "Passing PeriodDtype data is invalid. " "Use `data.to_timestamp()` instead" + ) elif is_categorical_dtype(data): # GH#18664 preserve tz in going DTI->Categorical->DTI @@ -1947,6 +2069,7 @@ def maybe_convert_dtype(data, copy): # ------------------------------------------------------------------- # Validation and Inference + def maybe_infer_tz(tz, inferred_tz): """ If a timezone is inferred from data, check that it is compatible with @@ -1970,9 +2093,10 @@ def maybe_infer_tz(tz, inferred_tz): elif inferred_tz is None: pass elif not timezones.tz_compare(tz, inferred_tz): - raise TypeError('data is already tz-aware {inferred_tz}, unable to ' - 'set specified tz: {tz}' - .format(inferred_tz=inferred_tz, tz=tz)) + raise TypeError( + "data is already tz-aware {inferred_tz}, unable to " + "set specified tz: {tz}".format(inferred_tz=inferred_tz, tz=tz) + ) return tz @@ -2003,17 +2127,21 @@ def _validate_dt64_dtype(dtype): if is_dtype_equal(dtype, np.dtype("M8")): # no precision, warn dtype = _NS_DTYPE - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ Passing in 'datetime64' dtype with no precision is deprecated and will raise in a future version. Please pass in - 'datetime64[ns]' instead.""") + 'datetime64[ns]' instead.""" + ) warnings.warn(msg, FutureWarning, stacklevel=5) - if ((isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) - or not isinstance(dtype, (np.dtype, DatetimeTZDtype))): - raise ValueError("Unexpected value for 'dtype': '{dtype}'. " - "Must be 'datetime64[ns]' or DatetimeTZDtype'." - .format(dtype=dtype)) + if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance( + dtype, (np.dtype, DatetimeTZDtype) + ): + raise ValueError( + "Unexpected value for 'dtype': '{dtype}'. 
" + "Must be 'datetime64[ns]' or DatetimeTZDtype'.".format(dtype=dtype) + ) return dtype @@ -2046,19 +2174,20 @@ def validate_tz_from_dtype(dtype, tz): # but not by us. We *do* allow non-existent tz errors to # go through pass - dtz = getattr(dtype, 'tz', None) + dtz = getattr(dtype, "tz", None) if dtz is not None: if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a dtype" - " with a tz") + raise ValueError("cannot supply both a tz and a dtype" " with a tz") tz = dtz if tz is not None and is_datetime64_dtype(dtype): # We also need to check for the case where the user passed a # tz-naive dtype (i.e. datetime64[ns]) if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a " - "timezone-naive dtype (i.e. datetime64[ns])") + raise ValueError( + "cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns])" + ) return tz @@ -2086,16 +2215,16 @@ def _infer_tz_from_endpoints(start, end, tz): try: inferred_tz = timezones.infer_tzinfo(start, end) except Exception: - raise TypeError('Start and end cannot both be tz-aware with ' - 'different timezones') + raise TypeError( + "Start and end cannot both be tz-aware with " "different timezones" + ) inferred_tz = timezones.maybe_get_tz(inferred_tz) tz = timezones.maybe_get_tz(tz) if tz is not None and inferred_tz is not None: if not timezones.tz_compare(inferred_tz, tz): - raise AssertionError("Inferred time zone not equal to passed " - "time zone") + raise AssertionError("Inferred time zone not equal to passed " "time zone") elif inferred_tz is not None: tz = inferred_tz @@ -2123,8 +2252,7 @@ def _maybe_normalize_endpoints(start, end, normalize): return start, end, _normalized -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, - nonexistent): +def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2149,10 +2277,9 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, if is_none is None and is_not_none is not None: # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False - ambiguous = ambiguous if ambiguous != 'infer' else False - localize_args = {'ambiguous': ambiguous, 'nonexistent': nonexistent, - 'tz': None} + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} if isinstance(freq, Tick) or freq is None: - localize_args['tz'] = tz + localize_args["tz"] = tz ts = ts.tz_localize(**localize_args) return ts diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 644c2f634240f7..c999c4db232e6b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -12,8 +12,15 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar) + is_bool_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna @@ -32,23 +39,23 @@ class 
_IntegerDtype(ExtensionDtype): The attributes name & type are set when these subclasses are created. """ + name = None # type: str base = None type = None # type: Type na_value = np.nan def __repr__(self): - sign = 'U' if self.is_unsigned_integer else '' - return "{sign}Int{size}Dtype()".format(sign=sign, - size=8 * self.itemsize) + sign = "U" if self.is_unsigned_integer else "" + return "{sign}Int{size}Dtype()".format(sign=sign, size=8 * self.itemsize) @cache_readonly def is_signed_integer(self): - return self.kind == 'i' + return self.kind == "i" @cache_readonly def is_unsigned_integer(self): - return self.kind == 'u' + return self.kind == "u" @property def _is_numeric(self): @@ -111,15 +118,18 @@ def safe_cast(values, dtype, copy): """ try: - return values.astype(dtype, casting='safe', copy=copy) + return values.astype(dtype, casting="safe", copy=copy) except TypeError: casted = values.astype(dtype, copy=copy) if (casted == values).all(): return casted - raise TypeError("cannot safely cast non-equivalent {} to {}".format( - values.dtype, np.dtype(dtype))) + raise TypeError( + "cannot safely cast non-equivalent {} to {}".format( + values.dtype, np.dtype(dtype) + ) + ) def coerce_to_array(values, dtype, mask=None, copy=False): @@ -139,13 +149,14 @@ def coerce_to_array(values, dtype, mask=None, copy=False): tuple of (values, mask) """ # if values is integer numpy array, preserve it's dtype - if dtype is None and hasattr(values, 'dtype'): + if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype if dtype is not None: - if (isinstance(dtype, str) and - (dtype.startswith("Int") or dtype.startswith("UInt"))): + if isinstance(dtype, str) and ( + dtype.startswith("Int") or dtype.startswith("UInt") + ): # Avoid DeprecationWarning from NumPy about np.dtype("Int64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() @@ -169,20 +180,26 @@ def coerce_to_array(values, dtype, mask=None, copy=False): values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) - if inferred_type == 'empty': + if inferred_type == "empty": values = np.empty(len(values)) values.fill(np.nan) - elif inferred_type not in ['floating', 'integer', - 'mixed-integer', 'mixed-integer-float']: - raise TypeError("{} cannot be converted to an IntegerDtype".format( - values.dtype)) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "mixed-integer-float", + ]: + raise TypeError( + "{} cannot be converted to an IntegerDtype".format(values.dtype) + ) elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): - raise TypeError("{} cannot be converted to an IntegerDtype".format( - values.dtype)) + raise TypeError( + "{} cannot be converted to an IntegerDtype".format(values.dtype) + ) if mask is None: mask = isna(values) @@ -196,7 +213,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): # infer dtype if needed if dtype is None: - dtype = np.dtype('int64') + dtype = np.dtype("int64") else: dtype = dtype.type @@ -284,13 +301,16 @@ def dtype(self): return _dtypes[str(self._data.dtype)] def __init__(self, values, mask, copy=False): - if not (isinstance(values, np.ndarray) - and is_integer_dtype(values.dtype)): - raise TypeError("values should be integer numpy array. 
Use " - "the 'integer_array' function instead") + if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + raise TypeError( + "values should be integer numpy array. Use " + "the 'integer_array' function instead" + ) if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): - raise TypeError("mask should be boolean numpy array. Use " - "the 'integer_array' function instead") + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'integer_array' function instead" + ) if copy: values = values.copy() @@ -315,8 +335,9 @@ def _from_factorized(cls, values, original): def _formatter(self, boxed=False): def fmt(x): if isna(x): - return 'NaN' + return "NaN" return str(x) + return fmt def __getitem__(self, item): @@ -350,10 +371,10 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # For IntegerArray inputs, we apply the ufunc to ._data # and mask the result. - if method == 'reduce': + if method == "reduce": # Not clear how to handle missing values in reductions. Raise. raise NotImplementedError("The 'reduce' method is not supported.") - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)): @@ -361,7 +382,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -404,11 +426,11 @@ def take(self, indexer, allow_fill=False, fill_value=None): # we always fill with 1 internally # to avoid upcasting data_fill_value = 1 if isna(fill_value) else fill_value - result = take(self._data, indexer, fill_value=data_fill_value, - allow_fill=allow_fill) + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) - mask = take(self._mask, indexer, fill_value=True, - allow_fill=allow_fill) + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) # if we are filling # we only fill where the indexer is null @@ -545,9 +567,10 @@ def value_counts(self, dropna=True): # appending to an Index *always* infers # w/o passing the dtype array = np.append(array, [self._mask.sum()]) - index = Index(np.concatenate( - [index.values, - np.array([np.nan], dtype=object)]), dtype=object) + index = Index( + np.concatenate([index.values, np.array([np.nan], dtype=object)]), + dtype=object, + ) return Series(array, index=index) @@ -585,7 +608,7 @@ def cmp_method(self, other): elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") other = lib.item_from_zerodim(other) @@ -593,7 +616,7 @@ def cmp_method(self, other): # comparisons, this will raise in the future with warnings.catch_warnings(): warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self._data, other) # nans propagate @@ -602,10 +625,10 @@ def cmp_method(self, other): else: mask = self._mask | mask - result[mask] = op_name == 'ne' + result[mask] = op_name == "ne" return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): @@ -614,19 +637,19 
@@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): - data = self._data.astype('float64') + data = self._data.astype("float64") data[mask] = self._na_value - op = getattr(nanops, 'nan' + name) + op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask) # if we have a boolean op, don't coerce - if name in ['any', 'all']: + if name in ["any", "all"]: pass # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ['sum', 'min', 'max', 'prod'] and notna(result): + elif name in ["sum", "min", "max", "prod"] and notna(result): int_result = int(result) if int_result == result: result = int_result @@ -651,8 +674,9 @@ def _maybe_mask_result(self, result, mask, other, op_name): # if we have a float operand we are by-definition # a float result # or our op is a divide - if ((is_float_dtype(other) or is_float(other)) or - (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])): + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv", "rdiv", "div"] + ): result[mask] = np.nan return result @@ -669,14 +693,13 @@ def integer_arithmetic_method(self, other): # Rely on pandas to unbox and dispatch to us. return NotImplemented - if getattr(other, 'ndim', 0) > 1: - raise NotImplementedError( - "can only perform ops with 1-d structures") + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") if isinstance(other, IntegerArray): other, mask = other._data, other._mask - elif getattr(other, 'ndim', None) == 0: + elif getattr(other, "ndim", None) == 0: other = other.item() elif is_list_like(other): @@ -685,8 +708,7 @@ def integer_arithmetic_method(self, other): other = other.item() elif other.ndim == 1: if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError( - "can only perform ops with numeric values") + raise TypeError("can only perform ops with numeric values") else: if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with numeric values") @@ -698,24 +720,26 @@ def integer_arithmetic_method(self, other): mask = self._mask | mask # 1 ** np.nan is 1. So we have to unmask those. 
- if op_name == 'pow': + if op_name == "pow": mask = np.where(self == 1, False, mask) - elif op_name == 'rpow': + elif op_name == "rpow": mask = np.where(other == 1, False, mask) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self._data, other) # divmod returns a tuple - if op_name == 'divmod': + if op_name == "divmod": div, mod = result - return (self._maybe_mask_result(div, mask, other, 'floordiv'), - self._maybe_mask_result(mod, mask, other, 'mod')) + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) return self._maybe_mask_result(result, mask, other, op_name) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return set_function_name(integer_arithmetic_method, name, cls) @@ -739,76 +763,108 @@ def integer_arithmetic_method(self, other): # create the Dtype Int8Dtype = register_extension_dtype( - type('Int8Dtype', (_IntegerDtype, ), { - 'type': np.int8, - 'name': 'Int8', - '__doc__': _dtype_docstring.format(dtype='int8') - }) + type( + "Int8Dtype", + (_IntegerDtype,), + { + "type": np.int8, + "name": "Int8", + "__doc__": _dtype_docstring.format(dtype="int8"), + }, + ) ) Int16Dtype = register_extension_dtype( - type('Int16Dtype', (_IntegerDtype, ), { - 'type': np.int16, - 'name': 'Int16', - '__doc__': _dtype_docstring.format(dtype='int16') - }) + type( + "Int16Dtype", + (_IntegerDtype,), + { + "type": np.int16, + "name": "Int16", + "__doc__": _dtype_docstring.format(dtype="int16"), + }, + ) ) Int32Dtype = register_extension_dtype( - type('Int32Dtype', (_IntegerDtype, ), { - 'type': np.int32, - 'name': 'Int32', - '__doc__': _dtype_docstring.format(dtype='int32') - }) + type( + "Int32Dtype", + (_IntegerDtype,), + { + "type": np.int32, + "name": "Int32", + "__doc__": _dtype_docstring.format(dtype="int32"), + }, + ) ) Int64Dtype = register_extension_dtype( - type('Int64Dtype', (_IntegerDtype, ), { - 'type': np.int64, - 'name': 'Int64', - '__doc__': _dtype_docstring.format(dtype='int64') - }) + type( + "Int64Dtype", + (_IntegerDtype,), + { + "type": np.int64, + "name": "Int64", + "__doc__": _dtype_docstring.format(dtype="int64"), + }, + ) ) UInt8Dtype = register_extension_dtype( - type('UInt8Dtype', (_IntegerDtype, ), { - 'type': np.uint8, - 'name': 'UInt8', - '__doc__': _dtype_docstring.format(dtype='uint8') - }) + type( + "UInt8Dtype", + (_IntegerDtype,), + { + "type": np.uint8, + "name": "UInt8", + "__doc__": _dtype_docstring.format(dtype="uint8"), + }, + ) ) UInt16Dtype = register_extension_dtype( - type('UInt16Dtype', (_IntegerDtype, ), { - 'type': np.uint16, - 'name': 'UInt16', - '__doc__': _dtype_docstring.format(dtype='uint16') - }) + type( + "UInt16Dtype", + (_IntegerDtype,), + { + "type": np.uint16, + "name": "UInt16", + "__doc__": _dtype_docstring.format(dtype="uint16"), + }, + ) ) UInt32Dtype = register_extension_dtype( - type('UInt32Dtype', (_IntegerDtype, ), { - 'type': np.uint32, - 'name': 'UInt32', - '__doc__': _dtype_docstring.format(dtype='uint32') - }) + type( + "UInt32Dtype", + (_IntegerDtype,), + { + "type": np.uint32, + "name": "UInt32", + "__doc__": _dtype_docstring.format(dtype="uint32"), + }, + ) ) UInt64Dtype = register_extension_dtype( - type('UInt64Dtype', (_IntegerDtype, ), { - 'type': np.uint64, - 'name': 'UInt64', - '__doc__': _dtype_docstring.format(dtype='uint64') - }) + type( + "UInt64Dtype", + (_IntegerDtype,), + { + "type": np.uint64, + "name": "UInt64", + "__doc__": _dtype_docstring.format(dtype="uint64"), + }, 
+ ) ) _dtypes = { - 'int8': Int8Dtype(), - 'int16': Int16Dtype(), - 'int32': Int32Dtype(), - 'int64': Int64Dtype(), - 'uint8': UInt8Dtype(), - 'uint16': UInt16Dtype(), - 'uint32': UInt32Dtype(), - 'uint64': UInt64Dtype(), + "int8": Int8Dtype(), + "int16": Int16Dtype(), + "int32": Int32Dtype(), + "int64": Int64Dtype(), + "uint8": UInt8Dtype(), + "uint16": UInt16Dtype(), + "uint32": UInt32Dtype(), + "uint64": UInt64Dtype(), } diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cf8ca25857f4e5..f9fbd7ada376e9 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -5,38 +5,49 @@ from pandas._config import get_option -from pandas._libs.interval import ( - Interval, IntervalMixin, intervals_to_interval_bounds) +from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype, - is_integer_dtype, is_interval, is_interval_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64_any_dtype, + is_float_dtype, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries) + ABCDatetimeIndex, + ABCInterval, + ABCIntervalIndex, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna -from pandas.core.arrays.base import ( - ExtensionArray, _extension_array_shared_docs) +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.indexes.base import Index, ensure_index -_VALID_CLOSED = {'left', 'right', 'both', 'neither'} +_VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} _shared_docs_kwargs = dict( - klass='IntervalArray', - qualname='arrays.IntervalArray', - name='' + klass="IntervalArray", qualname="arrays.IntervalArray", name="" ) -_interval_shared_docs['class'] = """ +_interval_shared_docs[ + "class" +] = """ %(summary)s .. versionadded:: %(versionadded)s @@ -99,14 +110,17 @@ """ -@Appender(_interval_shared_docs['class'] % dict( - klass="IntervalArray", - summary="Pandas array for interval data that are closed on the same side.", - versionadded="0.24.0", - name='', - extra_attributes='', - extra_methods='', - examples=textwrap.dedent("""\ +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side.", + versionadded="0.24.0", + name="", + extra_attributes="", + extra_methods="", + examples=textwrap.dedent( + """\ Examples -------- A new ``IntervalArray`` can be constructed directly from an array-like of @@ -120,16 +134,17 @@ It may also be constructed using one of the constructor methods: :meth:`IntervalArray.from_arrays`, :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. 
- """), -)) + """ + ), + ) +) class IntervalArray(IntervalMixin, ExtensionArray): dtype = IntervalDtype() ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan - def __new__(cls, data, closed=None, dtype=None, copy=False, - verify_integrity=True): + def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values @@ -142,25 +157,35 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, # don't allow scalars if is_scalar(data): - msg = ("{}(...) must be called with a collection of some kind," - " {} was passed") + msg = ( + "{}(...) must be called with a collection of some kind," + " {} was passed" + ) raise TypeError(msg.format(cls.__name__, data)) # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds( - data, validate_closed=closed is None) + data, validate_closed=closed is None + ) closed = closed or infer_closed - return cls._simple_new(left, right, closed, copy=copy, dtype=dtype, - verify_integrity=verify_integrity) + return cls._simple_new( + left, + right, + closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) @classmethod - def _simple_new(cls, left, right, closed=None, - copy=False, dtype=None, verify_integrity=True): + def _simple_new( + cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True + ): result = IntervalMixin.__new__(cls) - closed = closed or 'right' + closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) @@ -168,7 +193,7 @@ def _simple_new(cls, left, right, closed=None, # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): - msg = 'dtype must be an IntervalDtype, got {dtype}' + msg = "dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg.format(dtype=dtype)) elif dtype.subtype is not None: left = left.astype(dtype.subtype) @@ -181,22 +206,25 @@ def _simple_new(cls, left, right, closed=None, left = left.astype(right.dtype) if type(left) != type(right): - msg = ('must not have differing left [{ltype}] and right ' - '[{rtype}] types') - raise ValueError(msg.format(ltype=type(left).__name__, - rtype=type(right).__name__)) + msg = "must not have differing left [{ltype}] and right " "[{rtype}] types" + raise ValueError( + msg.format(ltype=type(left).__name__, rtype=type(right).__name__) + ) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalArray') + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): - msg = 'Period dtypes are not supported, use a PeriodIndex instead' + msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) - elif (isinstance(left, ABCDatetimeIndex) and - str(left.tz) != str(right.tz)): - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") + elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ( + "left and right must have the same time zone, got " + "'{left_tz}' and '{right_tz}'" + ) raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) result._left = left @@ -219,7 +247,9 @@ def _from_factorized(cls, values, original): values = 
values.astype(original.dtype.subtype) return cls(values, closed=original.closed) - _interval_shared_docs['from_breaks'] = """ + _interval_shared_docs[ + "from_breaks" + ] = """ Construct an %(klass)s from an array of splits. Parameters @@ -255,14 +285,15 @@ def _from_factorized(cls, values, original): """ @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs) - def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs) + def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) - return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, - dtype=dtype) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) - _interval_shared_docs['from_arrays'] = """ + _interval_shared_docs[ + "from_arrays" + ] = """ Construct from two arrays defining the left and right bounds. Parameters @@ -317,15 +348,18 @@ def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): """ @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs) - def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs) + def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) - return cls._simple_new(left, right, closed, copy=copy, - dtype=dtype, verify_integrity=True) + return cls._simple_new( + left, right, closed, copy=copy, dtype=dtype, verify_integrity=True + ) - _interval_shared_docs['from_intervals'] = """ + _interval_shared_docs[ + "from_intervals" + ] = """ Construct an %(klass)s from a 1d array of Interval objects .. deprecated:: 0.23.0 @@ -367,7 +401,9 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): closed='right', dtype='interval[int64]') """ - _interval_shared_docs['from_tuples'] = """ + _interval_shared_docs[ + "from_tuples" + ] = """ Construct an %(klass)s from an array-like of tuples Parameters @@ -404,8 +440,8 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): """ @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs) - def from_tuples(cls, data, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs) + def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): left, right = [], [] else: @@ -421,18 +457,19 @@ def from_tuples(cls, data, closed='right', copy=False, dtype=None): # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] lhs, rhs = d except ValueError: - msg = ('{name}.from_tuples requires tuples of ' - 'length 2, got {tpl}').format(name=name, tpl=d) + msg = ( + "{name}.from_tuples requires tuples of " "length 2, got {tpl}" + ).format(name=name, tpl=d) raise ValueError(msg) except TypeError: - msg = ('{name}.from_tuples received an invalid ' - 'item, {tpl}').format(name=name, tpl=d) + msg = ( + "{name}.from_tuples received an invalid " "item, {tpl}" + ).format(name=name, tpl=d) raise TypeError(msg) left.append(lhs) right.append(rhs) - return cls.from_arrays(left, right, closed, copy=False, - dtype=dtype) + return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) def _validate(self): """Verify that the IntervalArray is valid. 
@@ -445,17 +482,20 @@ def _validate(self): * left is always below right """ if self.closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': {closed}" - .format(closed=self.closed)) + raise ValueError( + "invalid option for 'closed': {closed}".format(closed=self.closed) + ) if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') + raise ValueError("left and right must have the same length") left_mask = notna(self.left) right_mask = notna(self.right) if not (left_mask == right_mask).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') + raise ValueError( + "missing values must be missing in the same " + "location both left and right sides" + ) if not (self.left[left_mask] <= self.right[left_mask]).all(): - raise ValueError('left side of interval must be <= right side') + raise ValueError("left side of interval must be <= right side") # --------- # Interface @@ -487,10 +527,10 @@ def __setitem__(self, key, value): needs_float_conversion = True elif is_datetime64_any_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array - value = np.datetime64('NaT') + value = np.datetime64("NaT") elif is_timedelta64_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array - value = np.timedelta64('NaT') + value = np.timedelta64("NaT") value_left, value_right = value, value # scalar interval @@ -512,13 +552,13 @@ def __setitem__(self, key, value): # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) if needs_float_conversion: - left = left.astype('float') + left = left.astype("float") left.values[key] = value_left self._left = left right = self.right.copy(deep=True) if needs_float_conversion: - right = right.astype('float') + right = right.astype("float") right.values[key] = value_right self._right = right @@ -550,18 +590,20 @@ def fillna(self, value=None, method=None, limit=None): filled : IntervalArray with NA/NaN filled """ if method is not None: - raise TypeError('Filling by method is not supported for ' - 'IntervalArray.') + raise TypeError("Filling by method is not supported for " "IntervalArray.") if limit is not None: - raise TypeError('limit is not supported for IntervalArray.') + raise TypeError("limit is not supported for IntervalArray.") if not isinstance(value, ABCInterval): - msg = ("'IntervalArray.fillna' only supports filling with a " - "scalar 'pandas.Interval'. Got a '{}' instead." - .format(type(value).__name__)) + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "scalar 'pandas.Interval'. 
Got a '{}' instead.".format( + type(value).__name__ + ) + ) raise TypeError(msg) - value = getattr(value, '_values', value) + value = getattr(value, "_values", value) self._check_closed_matches(value, name="value") left = self.left.fillna(value=value.left) @@ -601,8 +643,10 @@ def astype(self, dtype, copy=True): new_left = self.left.astype(dtype.subtype) new_right = self.right.astype(dtype.subtype) except TypeError: - msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' - 'incompatible') + msg = ( + "Cannot convert {dtype} to {new_dtype}; subtypes are " + "incompatible" + ) raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): @@ -611,7 +655,7 @@ def astype(self, dtype, copy=True): try: return np.asarray(self).astype(dtype, copy=copy) except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) @classmethod @@ -674,8 +718,7 @@ def _shallow_copy(self, left=None, right=None, closed=None): pass closed = closed or self.closed - return self._simple_new( - left, right, closed=closed, verify_integrity=False) + return self._simple_new(left, right, closed=closed, verify_integrity=False) def copy(self): """ @@ -707,8 +750,7 @@ def size(self): def shape(self): return self.left.shape - def take(self, indices, allow_fill=False, fill_value=None, axis=None, - **kwargs): + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. @@ -763,18 +805,23 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, if fill_value is None: fill_left = fill_right = self.left._na_value elif is_interval(fill_value): - self._check_closed_matches(fill_value, name='fill_value') + self._check_closed_matches(fill_value, name="fill_value") fill_left, fill_right = fill_value.left, fill_value.right elif not is_scalar(fill_value) and notna(fill_value): - msg = ("'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. Got a '{}' instead." - .format(type(fill_value).__name__)) + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. Got a '{}' instead.".format( + type(fill_value).__name__ + ) + ) raise ValueError(msg) - left_take = take(self.left, indices, - allow_fill=allow_fill, fill_value=fill_left) - right_take = take(self.right, indices, - allow_fill=allow_fill, fill_value=fill_right) + left_take = take( + self.left, indices, allow_fill=allow_fill, fill_value=fill_left + ) + right_take = take( + self.right, indices, allow_fill=allow_fill, fill_value=fill_right + ) return self._shallow_copy(left_take, right_take) @@ -797,6 +844,7 @@ def value_counts(self, dropna=True): """ # TODO: implement this is a non-naive way! 
from pandas.core.algorithms import value_counts + return value_counts(np.asarray(self), dropna=dropna) # Formatting @@ -806,46 +854,51 @@ def _format_data(self): # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical n = len(self) - max_seq_items = min((get_option( - 'display.max_seq_items') or n) // 10, 10) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) formatter = str if n == 0: - summary = '[]' + summary = "[]" elif n == 1: first = formatter(self[0]) - summary = '[{first}]'.format(first=first) + summary = "[{first}]".format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{first}, {last}]'.format(first=first, last=last) + summary = "[{first}, {last}]".format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{head} ... {tail}]'.format( - head=', '.join(head), tail=', '.join(tail)) + summary = "[{head} ... {tail}]".format( + head=", ".join(head), tail=", ".join(tail) + ) else: tail = [formatter(x) for x in self] - summary = '[{tail}]'.format(tail=', '.join(tail)) + summary = "[{tail}]".format(tail=", ".join(tail)) return summary def __repr__(self): - tpl = textwrap.dedent("""\ + tpl = textwrap.dedent( + """\ {cls}({data}, {lead}closed='{closed}', - {lead}dtype='{dtype}')""") - return tpl.format(cls=self.__class__.__name__, - data=self._format_data(), - lead=' ' * len(self.__class__.__name__) + ' ', - closed=self.closed, dtype=self.dtype) + {lead}dtype='{dtype}')""" + ) + return tpl.format( + cls=self.__class__.__name__, + data=self._format_data(), + lead=" " * len(self.__class__.__name__) + " ", + closed=self.closed, + dtype=self.dtype, + ) def _format_space(self): - space = ' ' * (len(self.__class__.__name__) + 1) + space = " " * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) @property @@ -872,7 +925,9 @@ def closed(self): """ return self._closed - _interval_shared_docs['set_closed'] = """ + _interval_shared_docs[ + "set_closed" + ] = """ Return an %(klass)s identical to the current one, but closed on the specified side @@ -901,7 +956,7 @@ def closed(self): dtype='interval[int64]') """ - @Appender(_interval_shared_docs['set_closed'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -919,8 +974,10 @@ def length(self): return self.right - self.left except TypeError: # length not defined for some types, e.g. string - msg = ('IntervalArray contains Intervals without defined length, ' - 'e.g. Intervals with string endpoints') + msg = ( + "IntervalArray contains Intervals without defined length, " + "e.g. 
Intervals with string endpoints" + ) raise TypeError(msg) @property @@ -934,7 +991,9 @@ def mid(self): # datetime safe version return self.left + 0.5 * self.length - _interval_shared_docs['is_non_overlapping_monotonic'] = """ + _interval_shared_docs[ + "is_non_overlapping_monotonic" + ] = """ Return True if the %(klass)s is non-overlapping (no Intervals share points) and is either monotonic increasing or monotonic decreasing, else False @@ -942,8 +1001,9 @@ def mid(self): # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties @property # type: ignore - @Appender(_interval_shared_docs['is_non_overlapping_monotonic'] - % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs + ) def is_non_overlapping_monotonic(self): # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) @@ -951,14 +1011,18 @@ def is_non_overlapping_monotonic(self): # strict inequality for closed == 'both'; equality implies overlapping # at a point when both sides of intervals are included - if self.closed == 'both': - return bool((self.right[:-1] < self.left[1:]).all() or - (self.left[:-1] > self.right[1:]).all()) + if self.closed == "both": + return bool( + (self.right[:-1] < self.left[1:]).all() + or (self.left[:-1] > self.right[1:]).all() + ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping - return bool((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) + return bool( + (self.right[:-1] <= self.left[1:]).all() + or (self.left[:-1] >= self.right[1:]).all() + ) # Conversion def __array__(self, dtype=None): @@ -979,7 +1043,9 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result - _interval_shared_docs['to_tuples'] = """ + _interval_shared_docs[ + "to_tuples" + ] = """ Return an %(return_type)s of tuples of the form (left, right) Parameters @@ -996,10 +1062,9 @@ def __array__(self, dtype=None): %(examples)s\ """ - @Appender(_interval_shared_docs['to_tuples'] % dict( - return_type='ndarray', - examples='', - )) + @Appender( + _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") + ) def to_tuples(self, na_tuple=True): tuples = com.asarray_tuplesafe(zip(self.left, self.right)) if not na_tuple: @@ -1007,14 +1072,16 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples - @Appender(_extension_array_shared_docs['repeat'] % _shared_docs_kwargs) + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) - _interval_shared_docs['contains'] = """ + _interval_shared_docs[ + "contains" + ] = """ Check elementwise if the Intervals contain the value. 
Return a boolean mask whether the value is contained in the Intervals @@ -1048,19 +1115,18 @@ def repeat(self, repeats, axis=None): array([ True, False, False]) """ - @Appender(_interval_shared_docs['contains'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs) def contains(self, other): if isinstance(other, Interval): - raise NotImplementedError( - 'contains not implemented for two intervals' - ) + raise NotImplementedError("contains not implemented for two intervals") - return ( - (self.left < other if self.open_left else self.left <= other) & - (other < self.right if self.open_right else other <= self.right) + return (self.left < other if self.open_left else self.left <= other) & ( + other < self.right if self.open_right else other <= self.right ) - _interval_shared_docs['overlaps'] = """ + _interval_shared_docs[ + "overlaps" + ] = """ Check elementwise if an Interval overlaps the values in the %(klass)s. Two intervals overlap if they share a common point, including closed @@ -1104,12 +1170,12 @@ def contains(self, other): array([False, True, False]) """ - @Appender(_interval_shared_docs['overlaps'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError elif not isinstance(other, Interval): - msg = '`other` must be Interval-like, got {other}' + msg = "`other` must be Interval-like, got {other}" raise TypeError(msg.format(other=type(other).__name__)) # equality is okay if both endpoints are closed (overlap at a point) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 1c5dc7666c3a15..9f428a4ac10b21 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -33,7 +33,8 @@ class PandasDtype(ExtensionDtype): ---------- dtype : numpy.dtype """ - _metadata = ('_dtype',) + + _metadata = ("_dtype",) def __init__(self, dtype): dtype = np.dtype(dtype) @@ -60,11 +61,11 @@ def type(self): @property def _is_numeric(self): # exclude object, str, unicode, void. - return self.kind in set('biufc') + return self.kind in set("biufc") @property def _is_boolean(self): - return self.kind == 'b' + return self.kind == "b" @classmethod def construct_from_string(cls, string): @@ -107,6 +108,7 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): ------- None """ + # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. We check for # that _typ to ensure that that users don't unnecessarily use EAs inside @@ -171,7 +173,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # numpy.lib.mixins.NDArrayOperatorsMixin.html # The primary modification is not boxing scalar return values # in PandasArray, since pandas' ExtensionArrays are 1-d. - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: # Only support operations with instances of _HANDLED_TYPES. # Use PandasArray instead of type(self) for isinstance to @@ -181,12 +183,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented # Defer to the implementation of the ufunc on unwrapped values. 
- inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x - for x in inputs) + inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs) if out: - kwargs['out'] = tuple( - x._ndarray if isinstance(x, PandasArray) else x - for x in out) + kwargs["out"] = tuple( + x._ndarray if isinstance(x, PandasArray) else x for x in out + ) result = getattr(ufunc, method)(*inputs, **kwargs) if type(result) is tuple and len(result): @@ -197,7 +198,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: # but not scalar reductions return result - elif method == 'at': + elif method == "at": # no return value return None else: @@ -233,7 +234,7 @@ def __setitem__(self, key, value): values = self._ndarray t = np.result_type(value, values) if t != self._ndarray.dtype: - values = values.astype(t, casting='safe') + values = values.astype(t, casting="safe") values[key] = value self._dtype = PandasDtype(t) self._ndarray = values @@ -260,15 +261,16 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - func = pad_1d if method == 'pad' else backfill_1d - new_values = func(self._ndarray, limit=limit, - mask=mask) + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self._ndarray, limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value @@ -281,8 +283,9 @@ def fillna(self, value=None, method=None, limit=None): def take(self, indices, allow_fill=False, fill_value=None): from pandas.core.algorithms import take - result = take(self._ndarray, indices, allow_fill=allow_fill, - fill_value=fill_value) + result = take( + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value + ) return type(self)(result) def copy(self): @@ -307,9 +310,7 @@ def _reduce(self, name, skipna=True, **kwargs): if meth: return meth(skipna=skipna, **kwargs) else: - msg = ( - "'{}' does not implement reduction '{}'" - ) + msg = "'{}' does not implement reduction '{}'" raise TypeError(msg.format(type(self).__name__, name)) def any(self, axis=None, out=None, keepdims=False, skipna=True): @@ -328,67 +329,80 @@ def max(self, axis=None, out=None, keepdims=False, skipna=True): nv.validate_max((), dict(out=out, keepdims=keepdims)) return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) - def sum(self, axis=None, dtype=None, out=None, keepdims=False, - initial=None, skipna=True, min_count=0): - nv.validate_sum((), dict(dtype=dtype, out=out, keepdims=keepdims, - initial=initial)) - return nanops.nansum(self._ndarray, axis=axis, skipna=skipna, - min_count=min_count) - - def prod(self, axis=None, dtype=None, out=None, keepdims=False, - initial=None, skipna=True, min_count=0): - nv.validate_prod((), dict(dtype=dtype, out=out, keepdims=keepdims, - initial=initial)) - return nanops.nanprod(self._ndarray, axis=axis, skipna=skipna, - min_count=min_count) - - def mean(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): + def sum( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_sum( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nansum( + self._ndarray, 
axis=axis, skipna=skipna, min_count=min_count + ) + + def prod( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_prod( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nanprod( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + + def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) - def median(self, axis=None, out=None, overwrite_input=False, - keepdims=False, skipna=True): - nv.validate_median((), dict(out=out, overwrite_input=overwrite_input, - keepdims=keepdims)) + def median( + self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True + ): + nv.validate_median( + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + ) return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='std') - return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='var') - return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='sem') - return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def kurt(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='kurt') + def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + ) + return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" + ) + return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" + ) + return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" + ) return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) - def skew(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='skew') + def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" + ) return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) # ------------------------------------------------------------------------ @@ -417,9 +431,8 @@ 
def to_numpy(self, dtype=None, copy=False): return result @Appender(ExtensionArray.searchsorted.__doc__) - def searchsorted(self, value, side='left', sorter=None): - return searchsorted(self.to_numpy(), value, - side=side, sorter=sorter) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) # ------------------------------------------------------------------------ # Ops @@ -445,9 +458,9 @@ def arithmetic_method(self, other): return cls(result) - return compat.set_function_name(arithmetic_method, - "__{}__".format(op.__name__), - cls) + return compat.set_function_name( + arithmetic_method, "__{}__".format(op.__name__), cls + ) _create_comparison_method = _create_arithmetic_method diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index bb144764a26fcb..8291cb70affcd8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -6,21 +6,41 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, NaTType, frequencies as libfrequencies, iNaT, period as libperiod) + NaT, + NaTType, + frequencies as libfrequencies, + iNaT, + period as libperiod, +) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period, get_period_field_arr, - period_asfreq_arr) + DIFFERENT_FREQ, + IncompatibleFrequency, + Period, + get_period_field_arr, + period_asfreq_arr, +) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds import pandas.compat as compat from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, - is_list_like, is_period_dtype, pandas_dtype) + _TD_DTYPE, + ensure_object, + is_datetime64_dtype, + is_float_dtype, + is_list_like, + is_period_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos @@ -46,8 +66,8 @@ def _period_array_cmp(cls, op): """ Wrap comparison operations to convert Period-like to PeriodDtype """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): op = getattr(self.asi8, opname) @@ -138,6 +158,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): The `freq` indicates the span covered by each element of the array. All elements in the PeriodArray have the same `freq`. 
""" + # array priority higher than numpy scalars __array_priority__ = 1000 _attributes = ["freq"] @@ -146,14 +167,27 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): # Names others delegate to us _other_ops = [] # type: List[str] - _bool_ops = ['is_leap_year'] - _object_ops = ['start_time', 'end_time', 'freq'] - _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'weekday', 'week', 'dayofweek', - 'dayofyear', 'quarter', 'qyear', - 'days_in_month', 'daysinmonth'] + _bool_ops = ["is_leap_year"] + _object_ops = ["start_time", "end_time", "freq"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "weekday", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + "daysinmonth", + ] _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq'] + _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"] # -------------------------------------------------------------------- # Constructors @@ -174,16 +208,18 @@ def __init__(self, values, freq=None, dtype=None, copy=False): if isinstance(values, type(self)): if freq is not None and freq != values.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=values.freq.freqstr, - other_freq=freq.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=values.freq.freqstr, + other_freq=freq.freqstr, + ) raise IncompatibleFrequency(msg) values, freq = values._data, values.freq - values = np.array(values, dtype='int64', copy=copy) + values = np.array(values, dtype="int64", copy=copy) self._data = values if freq is None: - raise ValueError('freq is not specified and cannot be inferred') + raise ValueError("freq is not specified and cannot be inferred") self._dtype = PeriodDtype(freq) @classmethod @@ -193,10 +229,10 @@ def _simple_new(cls, values, freq=None, **kwargs): @classmethod def _from_sequence( - cls, - scalars: Sequence[Optional[Period]], - dtype: Optional[PeriodDtype] = None, - copy: bool = False, + cls, + scalars: Sequence[Optional[Period]], + dtype: Optional[PeriodDtype] = None, + copy: bool = False, ) -> ABCPeriodArray: if dtype: freq = dtype.freq @@ -245,14 +281,14 @@ def _generate_range(cls, start, end, periods, freq, fields): field_count = len(fields) if start is not None or end is not None: if field_count > 0: - raise ValueError('Can either instantiate from fields ' - 'or endpoints, but not both') + raise ValueError( + "Can either instantiate from fields " "or endpoints, but not both" + ) subarr, freq = _get_ordinal_range(start, end, periods, freq) elif field_count > 0: subarr, freq = _range_from_fields(freq=freq, **fields) else: - raise ValueError('Not enough parameters to construct ' - 'Period range') + raise ValueError("Not enough parameters to construct " "Period range") return subarr, freq @@ -267,8 +303,9 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: self._check_compatible_with(value) return value.ordinal else: - raise ValueError("'value' should be a Period. Got '{val}' instead." - .format(val=value)) + raise ValueError( + "'value' should be a Period. 
Got '{val}' instead.".format(val=value) + ) def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) @@ -301,23 +338,26 @@ def __array__(self, dtype=None): # -------------------------------------------------------------------- # Vectorized analogues of Period properties - year = _field_accessor('year', 0, "The year of the period") - month = _field_accessor('month', 3, "The month as January=1, December=12") - day = _field_accessor('day', 4, "The days of the period") - hour = _field_accessor('hour', 5, "The hour of the period") - minute = _field_accessor('minute', 6, "The minute of the period") - second = _field_accessor('second', 7, "The second of the period") - weekofyear = _field_accessor('week', 8, "The week ordinal of the year") + year = _field_accessor("year", 0, "The year of the period") + month = _field_accessor("month", 3, "The month as January=1, December=12") + day = _field_accessor("day", 4, "The days of the period") + hour = _field_accessor("hour", 5, "The hour of the period") + minute = _field_accessor("minute", 6, "The minute of the period") + second = _field_accessor("second", 7, "The second of the period") + weekofyear = _field_accessor("week", 8, "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor('dayofweek', 10, - "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor( + "dayofweek", 10, "The day of the week with Monday=0, Sunday=6" + ) weekday = dayofweek - dayofyear = day_of_year = _field_accessor('dayofyear', 9, - "The ordinal day of the year") - quarter = _field_accessor('quarter', 2, "The quarter of the date") - qyear = _field_accessor('qyear', 1) - days_in_month = _field_accessor('days_in_month', 11, - "The number of days in the month") + dayofyear = day_of_year = _field_accessor( + "dayofyear", 9, "The ordinal day of the year" + ) + quarter = _field_accessor("quarter", 2, "The quarter of the date") + qyear = _field_accessor("qyear", 1) + days_in_month = _field_accessor( + "days_in_month", 11, "The number of days in the month" + ) daysinmonth = days_in_month @property @@ -329,13 +369,13 @@ def is_leap_year(self): @property def start_time(self): - return self.to_timestamp(how='start') + return self.to_timestamp(how="start") @property def end_time(self): - return self.to_timestamp(how='end') + return self.to_timestamp(how="end") - def to_timestamp(self, freq=None, how='start'): + def to_timestamp(self, freq=None, how="start"): """ Cast to DatetimeArray/Index. 
@@ -354,15 +394,15 @@ def to_timestamp(self, freq=None, how='start'): how = libperiod._validate_end_alias(how) - end = how == 'E' + end = how == "E" if end: - if freq == 'B': + if freq == "B": # roll forward to ensure we land on B date - adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') - return self.to_timestamp(how='start') + adjust + adjust = Timedelta(1, "D") - Timedelta(1, "ns") + return self.to_timestamp(how="start") + adjust else: - adjust = Timedelta(1, 'ns') - return (self + self.freq).to_timestamp(how='start') - adjust + adjust = Timedelta(1, "ns") + return (self + self.freq).to_timestamp(how="start") - adjust if freq is None: base, mult = libfrequencies.get_freq_code(self.freq) @@ -374,7 +414,7 @@ def to_timestamp(self, freq=None, how='start'): new_data = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) - return DatetimeArray._from_sequence(new_data, freq='infer') + return DatetimeArray._from_sequence(new_data, freq="infer") # -------------------------------------------------------------------- # Array-like / EA-Interface Methods @@ -392,8 +432,10 @@ def _validate_fill_value(self, fill_value): self._check_compatible_with(fill_value) fill_value = fill_value.ordinal else: - raise ValueError("'fill_value' should be a Period. " - "Got '{got}'.".format(got=fill_value)) + raise ValueError( + "'fill_value' should be a Period. " + "Got '{got}'.".format(got=fill_value) + ) return fill_value # -------------------------------------------------------------------- @@ -414,9 +456,10 @@ def _time_shift(self, periods, freq=None): Frequency increment to shift by. """ if freq is not None: - raise TypeError("`freq` argument is not supported for " - "{cls}._time_shift" - .format(cls=type(self).__name__)) + raise TypeError( + "`freq` argument is not supported for " + "{cls}._time_shift".format(cls=type(self).__name__) + ) values = self.asi8 + periods * self.freq.n if self._hasnans: values[self._isnan] = iNaT @@ -426,7 +469,7 @@ def _time_shift(self, periods, freq=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - def asfreq(self, freq=None, how='E'): + def asfreq(self, freq=None, how="E"): """ Convert the Period Array/Index to the specified frequency `freq`. 
@@ -469,7 +512,7 @@ def asfreq(self, freq=None, how='E'): asi8 = self.asi8 # mult1 can't be negative or 0 - end = how == 'E' + end = how == "E" if end: ordinal = asi8 + mult1 - 1 else: @@ -485,7 +528,7 @@ def asfreq(self, freq=None, how='E'): # ------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types """ @@ -494,14 +537,13 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): if date_format: formatter = lambda dt: dt.strftime(date_format) else: - formatter = lambda dt: '%s' % dt + formatter = lambda dt: "%s" % dt if self._hasnans: mask = self._isnan values[mask] = na_rep imask = ~mask - values[imask] = np.array([formatter(dt) for dt - in values[imask]]) + values[imask] = np.array([formatter(dt) for dt in values[imask]]) else: values = np.array([formatter(dt) for dt in values]) return values @@ -548,17 +590,15 @@ def _sub_period(self, other): @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, - ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any] + self, + other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], + op: Callable[[Any], Any], ) -> ABCPeriodArray: assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, - arr_mask=self._isnan) - res_values = res_values.view('i8') + res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = res_values.view("i8") res_values[self._isnan] = iNaT return type(self)(res_values, freq=self.freq) @@ -663,12 +703,12 @@ def _check_timedeltalike_freq_compat(self, other): elif isinstance(other, np.ndarray): # numpy timedelta64 array; all entries must be compatible - assert other.dtype.kind == 'm' + assert other.dtype.kind == "m" if other.dtype != _TD_DTYPE: # i.e. non-nano unit # TODO: disallow unit-less timedelta64 other = other.astype(_TD_DTYPE) - nanos = other.view('i8') + nanos = other.view("i8") else: # TimedeltaArray/Index nanos = other.asi8 @@ -712,19 +752,18 @@ def _raise_on_incompatible(left, right): else: other_freq = _delta_to_tick(Timedelta(right)).freqstr - msg = DIFFERENT_FREQ.format(cls=type(left).__name__, - own_freq=left.freqstr, - other_freq=other_freq) + msg = DIFFERENT_FREQ.format( + cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + ) raise IncompatibleFrequency(msg) # ------------------------------------------------------------------- # Constructor Helpers + def period_array( - data: Sequence[Optional[Period]], - freq: Optional[Tick] = None, - copy: bool = False, + data: Sequence[Optional[Period]], freq: Optional[Tick] = None, copy: bool = False ) -> PeriodArray: """ Construct a new PeriodArray from a sequence of Period scalars. 
@@ -796,8 +835,7 @@ def period_array( dtype = None if is_float_dtype(data) and len(data) > 0: - raise TypeError("PeriodIndex does not allow " - "floating point in construction") + raise TypeError("PeriodIndex does not allow " "floating point in construction") data = ensure_object(data) @@ -829,12 +867,11 @@ def validate_dtype_freq(dtype, freq): if dtype is not None: dtype = pandas_dtype(dtype) if not is_period_dtype(dtype): - raise ValueError('dtype must be PeriodDtype') + raise ValueError("dtype must be PeriodDtype") if freq is None: freq = dtype.freq elif freq != dtype.freq: - raise IncompatibleFrequency('specified freq and dtype ' - 'are different') + raise IncompatibleFrequency("specified freq and dtype " "are different") return freq @@ -858,8 +895,8 @@ def dt64arr_to_periodarr(data, freq, tz=None): used. """ - if data.dtype != np.dtype('M8[ns]'): - raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype)) + if data.dtype != np.dtype("M8[ns]"): + raise ValueError("Wrong dtype: {dtype}".format(dtype=data.dtype)) if freq is None: if isinstance(data, ABCIndexClass): @@ -873,13 +910,15 @@ def dt64arr_to_periodarr(data, freq, tz=None): data = data._values base, mult = libfrequencies.get_freq_code(freq) - return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq + return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq def _get_ordinal_range(start, end, periods, freq, mult=1): if com.count_not_none(start, end, periods) != 2: - raise ValueError('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) if freq is not None: _, mult = libfrequencies.get_freq_code(freq) @@ -893,9 +932,9 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): is_end_per = isinstance(end, Period) if is_start_per and is_end_per and start.freq != end.freq: - raise ValueError('start and end must have same freq') - if (start is NaT or end is NaT): - raise ValueError('start and end must not be NaT') + raise ValueError("start and end must have same freq") + if start is NaT or end is NaT: + raise ValueError("start and end must not be NaT") if freq is None: if is_start_per: @@ -903,25 +942,34 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): elif is_end_per: freq = end.freq else: # pragma: no cover - raise ValueError('Could not infer freq from start/end') + raise ValueError("Could not infer freq from start/end") if periods is not None: periods = periods * mult if start is None: - data = np.arange(end.ordinal - periods + mult, - end.ordinal + 1, mult, - dtype=np.int64) + data = np.arange( + end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64 + ) else: - data = np.arange(start.ordinal, start.ordinal + periods, mult, - dtype=np.int64) + data = np.arange( + start.ordinal, start.ordinal + periods, mult, dtype=np.int64 + ) else: data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) return data, freq -def _range_from_fields(year=None, month=None, quarter=None, day=None, - hour=None, minute=None, second=None, freq=None): +def _range_from_fields( + year=None, + month=None, + quarter=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, +): if hour is None: hour = 0 if minute is None: @@ -935,7 +983,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None, if quarter is not None: if freq is None: - freq = 'Q' + freq = "Q" base = libfrequencies.FreqGroup.FR_QTR else: 
base, mult = libfrequencies.get_freq_code(freq) @@ -951,8 +999,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None, base, mult = libfrequencies.get_freq_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, d, h, mn, s in zip(*arrays): - ordinals.append(libperiod.period_ordinal( - y, mth, d, h, mn, s, 0, 0, base)) + ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) return np.array(ordinals, dtype=np.int64), freq @@ -962,11 +1009,15 @@ def _make_field_arrays(*fields): for x in fields: if isinstance(x, (list, np.ndarray, ABCSeries)): if length is not None and len(x) != length: - raise ValueError('Mismatched Period array lengths') + raise ValueError("Mismatched Period array lengths") elif length is None: length = len(x) - arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) - else np.repeat(x, length) for x in fields] + arrays = [ + np.asarray(x) + if isinstance(x, (np.ndarray, list, ABCSeries)) + else np.repeat(x, length) + for x in fields + ] return arrays diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 29cc899fa6a9b3..2332da46574c5f 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -21,14 +21,29 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( - astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type, - infer_dtype_from_scalar) + astype_nansafe, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, - is_integer, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) + is_array_like, + is_bool_dtype, + is_datetime64_any_dtype, + is_dtype_equal, + is_integer, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) + ABCIndexClass, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas._typing import Dtype @@ -82,21 +97,16 @@ class SparseDtype(ExtensionDtype): ------- None """ + # We include `_is_na_fill_value` in the metadata to avoid hash collisions # between SparseDtype(float, 0.0) and SparseDtype(float, nan). # Without is_na_fill_value in the comparison, those would be equal since # hash(nan) is (sometimes?) 0. - _metadata = ('_dtype', '_fill_value', '_is_na_fill_value') + _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__( - self, - dtype: Dtype = np.float64, - fill_value: Any = None - ) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: from pandas.core.dtypes.missing import na_value_for_dtype - from pandas.core.dtypes.common import ( - pandas_dtype, is_string_dtype, is_scalar - ) + from pandas.core.dtypes.common import pandas_dtype, is_string_dtype, is_scalar if isinstance(dtype, type(self)): if fill_value is None: @@ -105,14 +115,15 @@ def __init__( dtype = pandas_dtype(dtype) if is_string_dtype(dtype): - dtype = np.dtype('object') + dtype = np.dtype("object") if fill_value is None: fill_value = na_value_for_dtype(dtype) if not is_scalar(fill_value): - raise ValueError("fill_value must be a scalar. Got {} " - "instead".format(fill_value)) + raise ValueError( + "fill_value must be a scalar. 
Got {} " "instead".format(fill_value) + ) self._dtype = dtype self._fill_value = fill_value @@ -139,9 +150,9 @@ def __eq__(self, other): # i.e. we want to treat any floating-point NaN as equal, but # not a floating-point NaN and a datetime NaT. fill_value = ( - other._is_na_fill_value and - isinstance(self.fill_value, type(other.fill_value)) or - isinstance(other.fill_value, type(self.fill_value)) + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) ) else: fill_value = self.fill_value == other.fill_value @@ -168,16 +179,19 @@ def fill_value(self): @property def _is_na_fill_value(self): from pandas.core.dtypes.missing import isna + return isna(self.fill_value) @property def _is_numeric(self): from pandas.core.dtypes.common import is_object_dtype + return not is_object_dtype(self.subtype) @property def _is_boolean(self): from pandas.core.dtypes.common import is_bool_dtype + return is_bool_dtype(self.subtype) @property @@ -197,7 +211,7 @@ def subtype(self): @property def name(self): - return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value) + return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value) def __repr__(self): return self.name @@ -241,11 +255,13 @@ def construct_from_string(cls, string): except Exception: raise TypeError(msg) else: - msg = ("Could not construct SparseDtype from '{}'.\n\nIt " - "looks like the fill_value in the string is not " - "the default for the dtype. Non-default fill_values " - "are not supported. Use the 'SparseDtype()' " - "constructor instead.") + msg = ( + "Could not construct SparseDtype from '{}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead." + ) if has_fill_value and str(result) != string: raise TypeError(msg.format(string)) return result @@ -274,30 +290,27 @@ def _parse_subtype(dtype): ValueError When the subtype cannot be extracted. 
""" - xpr = re.compile( - r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$" - ) + xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") m = xpr.match(dtype) has_fill_value = False if m: - subtype = m.groupdict()['subtype'] - has_fill_value = m.groupdict()['fill_value'] or has_fill_value + subtype = m.groupdict()["subtype"] + has_fill_value = m.groupdict()["fill_value"] or has_fill_value elif dtype == "Sparse": - subtype = 'float64' + subtype = "float64" else: raise ValueError("Cannot parse {}".format(dtype)) return subtype, has_fill_value @classmethod def is_dtype(cls, dtype): - dtype = getattr(dtype, 'dtype', dtype) - if (isinstance(dtype, str) and - dtype.startswith("Sparse")): + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str) and dtype.startswith("Sparse"): sub_type, _ = cls._parse_subtype(dtype) dtype = np.dtype(sub_type) elif isinstance(dtype, cls): return True - return isinstance(dtype, np.dtype) or dtype == 'Sparse' + return isinstance(dtype, np.dtype) or dtype == "Sparse" def update_dtype(self, dtype): """ @@ -341,8 +354,7 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array(self.fill_value), - dtype).item() + fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() dtype = cls(dtype, fill_value=fill_value) return dtype @@ -381,7 +393,7 @@ def _subtype_with_str(self): # Array -_sparray_doc_kwargs = dict(klass='SparseArray') +_sparray_doc_kwargs = dict(klass="SparseArray") def _get_fill(arr: ABCSparseArray) -> np.ndarray: @@ -409,10 +421,7 @@ def _get_fill(arr: ABCSparseArray) -> np.ndarray: def _sparse_array_op( - left: ABCSparseArray, - right: ABCSparseArray, - op: Callable, - name: str + left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str ) -> Any: """ Perform a binary operation between two arrays. 
@@ -430,7 +439,7 @@ def _sparse_array_op( ------- SparseArray """ - if name.startswith('__'): + if name.startswith("__"): # For lookups in _libs.sparse we need non-dunder op name name = name[2:-2] @@ -454,7 +463,7 @@ def _sparse_array_op( result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(left.to_dense(), right.to_dense()) fill = op(_get_fill(left), _get_fill(right)) @@ -463,32 +472,37 @@ def _sparse_array_op( else: index = right.sp_index elif left.sp_index.equals(right.sp_index): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(left.sp_values, right.sp_values) fill = op(_get_fill(left), _get_fill(right)) index = left.sp_index else: - if name[0] == 'r': + if name[0] == "r": left, right = right, left name = name[1:] - if name in ('and', 'or') and dtype == 'bool': - opname = 'sparse_{name}_uint8'.format(name=name) + if name in ("and", "or") and dtype == "bool": + opname = "sparse_{name}_uint8".format(name=name) # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) result_dtype = np.bool else: - opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + opname = "sparse_{name}_{dtype}".format(name=name, dtype=dtype) left_sp_values = left.sp_values right_sp_values = right.sp_values sparse_op = getattr(splib, opname) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result, index, fill = sparse_op( - left_sp_values, left.sp_index, left.fill_value, - right_sp_values, right.sp_index, right.fill_value) + left_sp_values, + left.sp_index, + left.fill_value, + right_sp_values, + right.sp_index, + right.fill_value, + ) if result_dtype is None: result_dtype = result.dtype @@ -500,11 +514,11 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ - if name.startswith('__'): + if name.startswith("__"): # e.g. 
__eq__ --> eq name = name[2:-2] - if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + if name in ("eq", "ne", "lt", "gt", "le", "ge"): dtype = np.bool fill_value = lib.item_from_zerodim(fill_value) @@ -512,10 +526,9 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): if is_bool_dtype(dtype): # fill_value may be np.bool_ fill_value = bool(fill_value) - return SparseArray(data, - sparse_index=sparse_index, - fill_value=fill_value, - dtype=dtype) + return SparseArray( + data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype + ) class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): @@ -583,11 +596,19 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): None """ - _pandas_ftype = 'sparse' - _subtyp = 'sparse_array' # register ABCSparseArray + _pandas_ftype = "sparse" + _subtyp = "sparse_array" # register ABCSparseArray - def __init__(self, data, sparse_index=None, index=None, fill_value=None, - kind='integer', dtype=None, copy=False): + def __init__( + self, + data, + sparse_index=None, + index=None, + fill_value=None, + kind="integer", + dtype=None, + copy=False, + ): from pandas.core.internals import SingleBlockManager if isinstance(data, SingleBlockManager): @@ -637,9 +658,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, npoints = sparse_index.length dtype = infer_dtype_from_scalar(data)[0] - data = construct_1d_arraylike_from_scalar( - data, npoints, dtype - ) + data = construct_1d_arraylike_from_scalar(data, npoints, dtype) if dtype is not None: dtype = pandas_dtype(dtype) @@ -654,6 +673,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, try: # probably shared code in sanitize_series from pandas.core.internals.construction import sanitize_array + data = sanitize_array(data, index=None) except ValueError: # NumPy may raise a ValueError on data like [1, []] @@ -685,19 +705,17 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, else: sparse_values = np.asarray(data, dtype=dtype) if len(sparse_values) != sparse_index.npoints: - raise AssertionError("Non array-like type {type} must " - "have the same length as the index" - .format(type=type(sparse_values))) + raise AssertionError( + "Non array-like type {type} must " + "have the same length as the index".format(type=type(sparse_values)) + ) self._sparse_index = sparse_index self._sparse_values = sparse_values self._dtype = SparseDtype(sparse_values.dtype, fill_value) @classmethod def _simple_new( - cls, - sparse_array: np.ndarray, - sparse_index: SparseIndex, - dtype: SparseDtype + cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> ABCSparseArray: new = cls([]) new._sparse_index = sparse_index @@ -736,9 +754,7 @@ def from_spmatrix(cls, data): length, ncol = data.shape if ncol != 1: - raise ValueError( - "'data' must have a single column, not '{}'".format(ncol) - ) + raise ValueError("'data' must have a single column, not '{}'".format(ncol)) # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. @@ -769,7 +785,7 @@ def __array__(self, dtype=None, copy=True): # a datetime64 with pandas NaT. if fill_value is NaT: # Can't put pd.NaT in a datetime64[ns] - fill_value = np.datetime64('NaT') + fill_value = np.datetime64("NaT") try: dtype = np.result_type(self.sp_values.dtype, type(fill_value)) except TypeError: @@ -840,9 +856,9 @@ def kind(self): The kind of sparse index for this array. 
One of {'integer', 'block'}. """ if isinstance(self.sp_index, IntIndex): - return 'integer' + return "integer" else: - return 'block' + return "block" @property def _valid_sp_values(self): @@ -906,17 +922,18 @@ def values(self): msg = ( "The SparseArray.values attribute is deprecated and will be " "removed in a future version. You can use `np.asarray(...)` or " - "the `.to_dense()` method instead.") + "the `.to_dense()` method instead." + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self.to_dense() def isna(self): from pandas import isna + # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. dtype = SparseDtype(bool, self._null_fill_value) - return type(self)._simple_new(isna(self.sp_values), - self.sp_index, dtype) + return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) def fillna(self, value=None, method=None, limit=None): """ @@ -951,15 +968,15 @@ def fillna(self, value=None, method=None, limit=None): When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if ((method is None and value is None) or - (method is not None and value is not None)): + if (method is None and value is None) or ( + method is not None and value is not None + ): raise ValueError("Must specify one of 'method' or 'value'.") elif method is not None: msg = "fillna with 'method' requires high memory usage." warnings.warn(msg, PerformanceWarning) - filled = interpolate_2d(np.asarray(self), method=method, - limit=limit) + filled = interpolate_2d(np.asarray(self), method=method, limit=limit) return type(self)(filled, fill_value=self.fill_value) else: @@ -990,15 +1007,14 @@ def shift(self, periods=1, fill_value=None): arr = self empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), - dtype=arr.dtype + [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype ) if periods > 0: a = empty b = arr[:-periods] else: - a = arr[abs(periods):] + a = arr[abs(periods) :] b = empty return arr._concat_same_type([a, b]) @@ -1037,8 +1053,7 @@ def factorize(self, na_sentinel=-1): # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of labels, why bother # implementing an efficient factorize? 
- labels, uniques = algos.factorize(np.asarray(self), - na_sentinel=na_sentinel) + labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) uniques = SparseArray(uniques, dtype=self.dtype) return labels, uniques @@ -1057,8 +1072,7 @@ def value_counts(self, dropna=True): """ from pandas import Index, Series - keys, counts = algos._value_counts_arraylike(self.sp_values, - dropna=dropna) + keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: @@ -1115,7 +1129,7 @@ def __getitem__(self, key): if com.is_bool_indexer(key) and len(self) == len(key): return self.take(np.arange(len(key), dtype=np.int32)[key]) - elif hasattr(key, '__len__'): + elif hasattr(key, "__len__"): return self.take(key) else: raise ValueError("Cannot slice with '{}'".format(key)) @@ -1128,7 +1142,7 @@ def _get_val_at(self, loc): loc += n if loc >= n or loc < 0: - raise IndexError('Out of bounds access') + raise IndexError("Out of bounds access") sp_loc = self.sp_index.lookup(loc) if sp_loc == -1: @@ -1138,30 +1152,32 @@ def _get_val_at(self, loc): def take(self, indices, allow_fill=False, fill_value=None): if is_scalar(indices): - raise ValueError("'indices' must be an array, not a " - "scalar '{}'.".format(indices)) + raise ValueError( + "'indices' must be an array, not a " "scalar '{}'.".format(indices) + ) indices = np.asarray(indices, dtype=np.int32) if indices.size == 0: result = [] - kwargs = {'dtype': self.dtype} + kwargs = {"dtype": self.dtype} elif allow_fill: result = self._take_with_fill(indices, fill_value=fill_value) kwargs = {} else: result = self._take_without_fill(indices) - kwargs = {'dtype': self.dtype} + kwargs = {"dtype": self.dtype} - return type(self)(result, fill_value=self.fill_value, kind=self.kind, - **kwargs) + return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs) def _take_with_fill(self, indices, fill_value=None): if fill_value is None: fill_value = self.dtype.na_value if indices.min() < -1: - raise ValueError("Invalid value in 'indices'. Must be between -1 " - "and the length of the array.") + raise ValueError( + "Invalid value in 'indices'. Must be between -1 " + "and the length of the array." 
+ ) if indices.max() >= len(self): raise IndexError("out of bounds value in 'indices'.") @@ -1174,15 +1190,17 @@ def _take_with_fill(self, indices, fill_value=None): taken.fill(fill_value) return taken else: - raise IndexError('cannot do a non-empty take from an empty ' - 'axes.') + raise IndexError("cannot do a non-empty take from an empty " "axes.") sp_indexer = self.sp_index.lookup_array(indices) if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values - taken = np.full(sp_indexer.shape, fill_value=fill_value, - dtype=np.result_type(type(fill_value))) + taken = np.full( + sp_indexer.shape, + fill_value=fill_value, + dtype=np.result_type(type(fill_value)), + ) else: taken = self.sp_values.take(sp_indexer) @@ -1203,8 +1221,7 @@ def _take_with_fill(self, indices, fill_value=None): result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, - type(self.fill_value)) + result_type = np.result_type(result_type, type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value @@ -1223,8 +1240,7 @@ def _take_without_fill(self, indices): if (indices.max() >= n) or (indices.min() < -n): if n == 0: - raise IndexError("cannot do a non-empty take from an " - "empty axes.") + raise IndexError("cannot do a non-empty take from an " "empty axes.") else: raise IndexError("out of bounds value in 'indices'.") @@ -1234,16 +1250,17 @@ def _take_without_fill(self, indices): if self.sp_index.npoints == 0: # edge case in take... # I think just return - out = np.full(indices.shape, self.fill_value, - dtype=np.result_type(type(self.fill_value))) - arr, sp_index, fill_value = make_sparse(out, - fill_value=self.fill_value) - return type(self)(arr, sparse_index=sp_index, - fill_value=fill_value) + out = np.full( + indices.shape, + self.fill_value, + dtype=np.result_type(type(self.fill_value)), + ) + arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) + return type(self)(arr, sparse_index=sp_index, fill_value=fill_value) sp_indexer = self.sp_index.lookup_array(indices) taken = self.sp_values.take(sp_indexer) - fillable = (sp_indexer < 0) + fillable = sp_indexer < 0 if fillable.any(): # TODO: may need to coerce array to fill value @@ -1259,9 +1276,7 @@ def searchsorted(self, v, side="left", sorter=None): if not is_scalar(v): v = np.asarray(v) v = np.asarray(v) - return np.asarray(self, dtype=self.dtype.subtype).searchsorted( - v, side, sorter - ) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) def copy(self): values = self.sp_values.copy() @@ -1276,11 +1291,13 @@ def _concat_same_type(cls, to_concat): # np.nan isn't a singleton, so we may end up with multiple # NaNs here, so we ignore tha all NA case too. if not (len(set(fill_values)) == 1 or isna(fill_values).all()): - warnings.warn("Concatenating sparse arrays with multiple fill " - "values: '{}'. Picking the first and " - "converting the rest.".format(fill_values), - PerformanceWarning, - stacklevel=6) + warnings.warn( + "Concatenating sparse arrays with multiple fill " + "values: '{}'. 
Picking the first and " + "converting the rest.".format(fill_values), + PerformanceWarning, + stacklevel=6, + ) keep = to_concat[0] to_concat2 = [keep] @@ -1295,9 +1312,9 @@ def _concat_same_type(cls, to_concat): if to_concat: sp_kind = to_concat[0].kind else: - sp_kind = 'integer' + sp_kind = "integer" - if sp_kind == 'integer': + if sp_kind == "integer": indices = [] for arr in to_concat: @@ -1396,15 +1413,11 @@ def astype(self, dtype=None, copy=True): """ dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str - sp_values = astype_nansafe(self.sp_values, - subtype, - copy=copy) + sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() - return self._simple_new(sp_values, - self.sp_index, - dtype) + return self._simple_new(sp_values, self.sp_index, dtype) def map(self, mapper): """ @@ -1456,8 +1469,7 @@ def map(self, mapper): fill_value = mapper(self.fill_value) sp_values = [mapper(x) for x in self.sp_values] - return type(self)(sp_values, sparse_index=self.sp_index, - fill_value=fill_value) + return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value) def to_dense(self): """ @@ -1480,7 +1492,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use the 'to_dense' method instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() _internal_get_values = to_dense @@ -1504,9 +1518,9 @@ def __setstate__(self, state): def nonzero(self): if self.fill_value == 0: - return self.sp_index.to_int_index().indices, + return (self.sp_index.to_int_index().indices,) else: - return self.sp_index.to_int_index().indices[self.sp_values != 0], + return (self.sp_index.to_int_index().indices[self.sp_values != 0],) # ------------------------------------------------------------------------ # Reductions @@ -1516,8 +1530,11 @@ def _reduce(self, name, skipna=True, **kwargs): method = getattr(self, name, None) if method is None: - raise TypeError("cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype)) + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) if skipna: arr = self @@ -1528,9 +1545,9 @@ def _reduce(self, name, skipna=True, **kwargs): # They should only be present when called via pandas, so do it here. 
# instead of in `any` / `all` (which will raise if they're present, # thanks to nv.validate - kwargs.pop('filter_type', None) - kwargs.pop('numeric_only', None) - kwargs.pop('op', None) + kwargs.pop("filter_type", None) + kwargs.pop("numeric_only", None) + kwargs.pop("op", None) return getattr(arr, name)(**kwargs) def all(self, axis=None, *args, **kwargs): @@ -1618,8 +1635,11 @@ def cumsum(self, axis=0, *args, **kwargs): if not self._null_fill_value: return SparseArray(self.to_dense()).cumsum() - return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, - fill_value=self.fill_value) + return SparseArray( + self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value, + ) def mean(self, axis=0, *args, **kwargs): """ @@ -1660,7 +1680,7 @@ def T(self): _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): @@ -1668,7 +1688,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -1680,19 +1701,18 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if isinstance(sp_values, tuple): # multiple outputs. e.g. modf arrays = tuple( - self._simple_new(sp_value, - self.sp_index, - SparseDtype(sp_value.dtype, fv)) + self._simple_new( + sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) + ) for sp_value, fv in zip(sp_values, fill_value) ) return arrays - return self._simple_new(sp_values, - self.sp_index, - SparseDtype(sp_values.dtype, fill_value)) + return self._simple_new( + sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) + ) - result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], - **kwargs) + result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs) if out: if len(out) == 1: out = out[0] @@ -1700,7 +1720,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if type(result) is tuple: return tuple(type(self)(x) for x in result) - elif method == 'at': + elif method == "at": # no return value return None else: @@ -1721,7 +1741,7 @@ def sparse_unary_method(self): dtype = SparseDtype(values.dtype, fill_value) return cls._simple_new(values, self.sp_index, dtype) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(sparse_unary_method, name, cls) @classmethod @@ -1737,34 +1757,41 @@ def sparse_arithmetic_method(self, other): return _sparse_array_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill = op(_get_fill(self), np.asarray(other)) result = op(self.sp_values, other) - if op_name == 'divmod': + if op_name == "divmod": left, right = result lfill, rfill = fill - return (_wrap_result(op_name, left, self.sp_index, lfill), - _wrap_result(op_name, right, self.sp_index, rfill)) + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) return _wrap_result(op_name, result, self.sp_index, fill) else: other = np.asarray(other) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # TODO: delete sparse stuff in core/ops.py # TODO: look into _wrap_result 
if len(self) != len(other): raise AssertionError( - ("length mismatch: {self} vs. {other}".format( - self=len(self), other=len(other)))) + ( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) + ) if not isinstance(other, SparseArray): - dtype = getattr(other, 'dtype', None) - other = SparseArray(other, fill_value=self.fill_value, - dtype=dtype) + dtype = getattr(other, "dtype", None) + other = SparseArray( + other, fill_value=self.fill_value, dtype=dtype + ) return _sparse_array_op(self, other, op, op_name) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(sparse_arithmetic_method, name, cls) @classmethod @@ -1772,7 +1799,7 @@ def _create_comparison_method(cls, op): def cmp_method(self, other): op_name = op.__name__ - if op_name in {'and_', 'or_'}: + if op_name in {"and_", "or_"}: op_name = op_name[:-1] if isinstance(other, (ABCSeries, ABCIndexClass)): @@ -1786,24 +1813,28 @@ def cmp_method(self, other): if isinstance(other, np.ndarray): # TODO: make this more flexible than just ndarray... if len(self) != len(other): - raise AssertionError("length mismatch: {self} vs. {other}" - .format(self=len(self), - other=len(other))) + raise AssertionError( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) other = SparseArray(other, fill_value=self.fill_value) if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) else: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill_value = op(self.fill_value, other) result = op(self.sp_values, other) - return type(self)(result, - sparse_index=self.sp_index, - fill_value=fill_value, - dtype=np.bool_) + return type(self)( + result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_, + ) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(cmp_method, name, cls) @classmethod @@ -1822,10 +1853,11 @@ def _add_comparison_ops(cls): # Formatting # ----------- def __repr__(self): - return '{self}\nFill: {fill}\n{index}'.format( + return "{self}\nFill: {fill}\n{index}".format( self=printing.pprint_thing(self), fill=printing.pprint_thing(self.fill_value), - index=printing.pprint_thing(self.sp_index)) + index=printing.pprint_thing(self.sp_index), + ) def _formatter(self, boxed=False): # Defer to the formatter from the GenericArrayFormatter calling us. 
@@ -1842,12 +1874,12 @@ def _maybe_to_dense(obj): """ try to convert to dense """ - if hasattr(obj, 'to_dense'): + if hasattr(obj, "to_dense"): return obj.to_dense() return obj -def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): +def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format @@ -1904,13 +1936,13 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): def _make_index(length, indices, kind): - if kind == 'block' or isinstance(kind, BlockIndex): + if kind == "block" or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) - elif kind == 'integer' or isinstance(kind, IntIndex): + elif kind == "integer" or isinstance(kind, IntIndex): index = IntIndex(length, indices) else: # pragma: no cover - raise ValueError('must be block or integer type') + raise ValueError("must be block or integer type") return index @@ -1929,9 +1961,9 @@ def _validate(self, data): raise NotImplementedError -@delegate_names(SparseArray, ['npoints', 'density', 'fill_value', - 'sp_values'], - typ='property') +@delegate_names( + SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property" +) class SparseAccessor(BaseAccessor, PandasDelegate): """ Accessor for SparseSparse from other sparse matrix data types. @@ -1945,9 +1977,9 @@ def _delegate_property_get(self, name, *args, **kwargs): return getattr(self._parent.array, name) def _delegate_method(self, name, *args, **kwargs): - if name == 'from_coo': + if name == "from_coo": return self.from_coo(*args, **kwargs) - elif name == 'to_coo': + elif name == "to_coo": return self.to_coo(*args, **kwargs) else: raise ValueError @@ -1995,13 +2027,12 @@ def from_coo(cls, A, dense_index=False): from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series - result = _coo_to_sparse_series(A, dense_index=dense_index, - sparse_series=False) + result = _coo_to_sparse_series(A, dense_index=dense_index, sparse_series=False) result = Series(result.array, index=result.index, copy=False) return result - def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): """ Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. 
@@ -2051,10 +2082,9 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): """ from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo - A, rows, columns = _sparse_series_to_coo(self._parent, - row_levels, - column_levels, - sort_labels=sort_labels) + A, rows, columns = _sparse_series_to_coo( + self._parent, row_levels, column_levels, sort_labels=sort_labels + ) return A, rows, columns def to_dense(self): @@ -2084,9 +2114,12 @@ def to_dense(self): dtype: int64 """ from pandas import Series - return Series(self._parent.array.to_dense(), - index=self._parent.index, - name=self._parent.name) + + return Series( + self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name, + ) class SparseFrameAccessor(BaseAccessor, PandasDelegate): @@ -2136,10 +2169,7 @@ def from_spmatrix(cls, data, index=None, columns=None): data = data.tocsc() index, columns = cls._prep_index(data, index, columns) - sparrays = [ - SparseArray.from_spmatrix(data[:, i]) - for i in range(data.shape[1]) - ] + sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] data = dict(enumerate(sparrays)) result = DataFrame(data, index=index) result.columns = columns @@ -2167,11 +2197,8 @@ def to_dense(self): """ from pandas import DataFrame - data = {k: v.array.to_dense() - for k, v in self._parent.items()} - return DataFrame(data, - index=self._parent.index, - columns=self._parent.columns) + data = {k: v.array.to_dense() for k, v in self._parent.items()} + return DataFrame(data, index=self._parent.index, columns=self._parent.columns) def to_coo(self): """ @@ -2221,8 +2248,7 @@ def density(self) -> float: Ratio of non-sparse points to total (dense) data points represented in the DataFrame. """ - return np.mean([column.array.density - for _, column in self._parent.items()]) + return np.mean([column.array.density for _, column in self._parent.items()]) @staticmethod def _prep_index(data, index, columns): @@ -2235,9 +2261,13 @@ def _prep_index(data, index, columns): columns = ibase.default_index(K) if len(columns) != K: - raise ValueError('Column length mismatch: {columns} vs. {K}' - .format(columns=len(columns), K=K)) + raise ValueError( + "Column length mismatch: {columns} vs. {K}".format( + columns=len(columns), K=K + ) + ) if len(index) != N: - raise ValueError('Index length mismatch: {index} vs. {N}' - .format(index=len(index), N=N)) + raise ValueError( + "Index length mismatch: {index} vs. 
{N}".format(index=len(index), N=N) + ) return index, columns diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 50bc8d6d3ae6b7..9d622d92e09790 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -9,18 +9,36 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( - array_to_timedelta64, parse_timedelta_unit, precision_from_unit) + array_to_timedelta64, + parse_timedelta_unit, + precision_from_unit, +) import pandas.compat as compat from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_dtype_equal, - is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, - pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_int64, + is_datetime64_dtype, + is_dtype_equal, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex) + ABCDataFrame, + ABCIndexClass, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -44,8 +62,9 @@ def f(self): values = self.asi8 result = get_timedelta_field(values, alias) if self._hasnans: - result = self._maybe_mask_results(result, fill_value=None, - convert='float64') + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return result @@ -58,8 +77,8 @@ def _td_array_cmp(cls, op): """ Wrap comparison operations to convert timedelta-like to timedelta64 """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): other = lib.item_from_zerodim(other) @@ -73,7 +92,7 @@ def wrapper(self, other): # failed to parse as timedelta return ops.invalid_comparison(self, other, op) - result = op(self.view('i8'), other.value) + result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) @@ -89,7 +108,7 @@ def wrapper(self, other): except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) - result = op(self.view('i8'), other.view('i8')) + result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) o_mask = np.array(isna(other)) @@ -136,24 +155,30 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): ------- None """ + _typ = "timedeltaarray" _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation _other_ops = [] # type: List[str] _bool_ops = [] # type: List[str] - _object_ops = ['freq'] - _field_ops = ['days', 'seconds', 'microseconds', 'nanoseconds'] + _object_ops = ["freq"] + _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ["to_pytimedelta", "total_seconds", - "round", "floor", "ceil"] + _datetimelike_methods = [ + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] # Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise ndim = 1 @property def _box_func(self): - return lambda x: Timedelta(x, 
unit='ns') + return lambda x: Timedelta(x, unit="ns") @property def dtype(self): @@ -199,7 +224,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if values.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - if values.dtype == 'i8': + if values.dtype == "i8": # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps @@ -239,15 +264,13 @@ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): return result @classmethod - def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, - freq=None, unit=None): + def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, freq=None, unit=None): if dtype: _validate_td64_dtype(dtype) freq, freq_infer = dtl.maybe_infer_freq(freq) data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, - freq_infer) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) result = cls._simple_new(data, freq=freq) @@ -267,12 +290,13 @@ def _generate_range(cls, start, end, periods, freq, closed=None): periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') + raise ValueError("Must provide freq argument if no data is " "supplied") if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, ' - 'and freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) if start is not None: start = Timedelta(start) @@ -282,15 +306,16 @@ def _generate_range(cls, start, end, periods, freq, closed=None): if start is None and end is None: if closed is not None: - raise ValueError("Closed has to be None if not both of start" - "and end are defined") + raise ValueError( + "Closed has to be None if not both of start" "and end are defined" + ) left_closed, right_closed = dtl.validate_endpoints(closed) if freq is not None: index = _generate_regular_range(start, end, periods, freq) else: - index = np.linspace(start.value, end.value, periods).astype('i8') + index = np.linspace(start.value, end.value, periods).astype("i8") if not left_closed: index = index[1:] @@ -328,8 +353,10 @@ def _validate_fill_value(self, fill_value): elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): fill_value = Timedelta(fill_value).value else: - raise ValueError("'fill_value' should be a Timedelta. " - "Got '{got}'.".format(got=fill_value)) + raise ValueError( + "'fill_value' should be a Timedelta. 
" + "Got '{got}'.".format(got=fill_value) + ) return fill_value def astype(self, dtype, copy=True): @@ -346,12 +373,12 @@ def astype(self, dtype, copy=True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results(result, - fill_value=None, - convert='float64') + values = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return values result = self._data.astype(dtype, copy=copy) - return result.astype('i8') + return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): if copy: return self.copy() @@ -363,9 +390,10 @@ def astype(self, dtype, copy=True): def _formatter(self, boxed=False): from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep='NaT', date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) @@ -378,9 +406,11 @@ def _format_native_types(self, na_rep='NaT', date_format=None): def _add_offset(self, other): assert not isinstance(other, Tick) - raise TypeError("cannot add the type {typ} to a {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "cannot add the type {typ} to a {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) def _add_delta(self, delta): """ @@ -397,7 +427,7 @@ def _add_delta(self, delta): result : TimedeltaArray """ new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, freq='infer') + return type(self)._from_sequence(new_values, freq="infer") def _add_datetime_arraylike(self, other): """ @@ -406,6 +436,7 @@ def _add_datetime_arraylike(self, other): if isinstance(other, np.ndarray): # At this point we have already checked that dtype is datetime64 from pandas.core.arrays import DatetimeArray + other = DatetimeArray(other) # defer to implementation in DatetimeArray @@ -420,12 +451,11 @@ def _add_datetimelike_scalar(self, other): if other is NaT: # In this case we specifically interpret NaT as a datetime, not # the timedelta interpretation we would get by returning self + NaT - result = self.asi8.view('m8[ms]') + NaT.to_datetime64() + result = self.asi8.view("m8[ms]") + NaT.to_datetime64() return DatetimeArray(result) i8 = self.asi8 - result = checked_add_with_arr(i8, other.value, - arr_mask=self._isnan) + result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) @@ -438,8 +468,11 @@ def _addsub_offset_array(self, other, op): # which we re-raise as TypeError return super()._addsub_offset_array(other, op) except AttributeError: - raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "Cannot add/subtract non-tick DateOffset to {cls}".format( + cls=type(self).__name__ + ) + ) def __mul__(self, other): other = lib.item_from_zerodim(other) @@ -546,9 +579,11 @@ def __rtruediv__(self, other): return other / self._data elif lib.is_scalar(other): - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) if not hasattr(other, "dtype"): # e.g. 
list, tuple @@ -569,9 +604,11 @@ def __rtruediv__(self, other): return np.array(result) else: - raise TypeError("Cannot divide {dtype} data by {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {dtype} data by {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) def __floordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): @@ -599,7 +636,7 @@ def __floordiv__(self, other): if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other - return type(self)(result.view('m8[ns]'), freq=freq) + return type(self)(result.view("m8[ns]"), freq=freq) if not hasattr(other, "dtype"): # list, tuple @@ -622,7 +659,7 @@ def __floordiv__(self, other): elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) - if lib.infer_dtype(result, skipna=False) == 'timedelta': + if lib.infer_dtype(result, skipna=False) == "timedelta": result, _ = sequence_to_td64ns(result) return type(self)(result) return result @@ -633,8 +670,11 @@ def __floordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=dtype, cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=dtype, cls=type(self).__name__ + ) + ) def __rfloordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): @@ -654,9 +694,11 @@ def __rfloordiv__(self, other): result = other.__floordiv__(self._data) return result - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) if not hasattr(other, "dtype"): # list, tuple @@ -683,8 +725,11 @@ def __rfloordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=dtype, cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=dtype, cls=type(self).__name__ + ) + ) def __mod__(self, other): # Note: This is a naive implementation, can likely be optimized @@ -813,17 +858,22 @@ def to_pytimedelta(self): """ return tslibs.ints_to_pytimedelta(self.asi8) - days = _field_accessor("days", "days", - "Number of days for each element.") - seconds = _field_accessor("seconds", "seconds", - "Number of seconds (>= 0 and less than 1 day) " - "for each element.") - microseconds = _field_accessor("microseconds", "microseconds", - "Number of microseconds (>= 0 and less " - "than 1 second) for each element.") - nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "Number of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.") + days = _field_accessor("days", "days", "Number of days for each element.") + seconds = _field_accessor( + "seconds", + "seconds", + "Number of seconds (>= 0 and less than 1 day) " "for each element.", + ) + microseconds = _field_accessor( + "microseconds", + "microseconds", + "Number of microseconds (>= 0 and less " "than 1 second) for each element.", + ) + nanoseconds = _field_accessor( + "nanoseconds", + "nanoseconds", + "Number of nanoseconds (>= 0 and less " "than 1 microsecond) for each element.", + ) @property def components(self): @@ -837,21 +887,31 @@ def components(self): """ from pandas import DataFrame - columns = ['days', 'hours', 'minutes', 'seconds', - 
'milliseconds', 'microseconds', 'nanoseconds'] + columns = [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] hasnans = self._hasnans if hasnans: + def f(x): if isna(x): return [np.nan] * len(columns) return x.components + else: + def f(x): return x.components result = DataFrame([f(x) for x in self], columns=columns) if not hasnans: - result = result.astype('int64') + result = result.astype("int64") return result @@ -861,6 +921,7 @@ def f(x): # --------------------------------------------------------------------- # Constructor Helpers + def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters @@ -894,7 +955,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray - if not hasattr(data, 'dtype'): + if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator @@ -926,7 +987,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): frac = data - base if p: frac = np.round(frac, p) - data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') + data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False @@ -939,21 +1000,27 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): elif is_datetime64_dtype(data): # GH#23539 - warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " - "deprecated, will raise a TypeError in a future " - "version", - FutureWarning, stacklevel=4) + warnings.warn( + "Passing datetime64-dtype data to TimedeltaIndex is " + "deprecated, will raise a TypeError in a future " + "version", + FutureWarning, + stacklevel=4, + ) data = ensure_int64(data).view(_TD_DTYPE) else: - raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" - .format(dtype=data.dtype)) + raise TypeError( + "dtype {dtype} cannot be converted to timedelta64[ns]".format( + dtype=data.dtype + ) + ) data = np.array(data, copy=copy) if data.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - assert data.dtype == 'm8[ns]', data + assert data.dtype == "m8[ns]", data return data, inferred_freq @@ -1028,19 +1095,20 @@ def objects_to_td64ns(data, unit="ns", errors="raise"): # coerce Index to np.ndarray, converting string-dtype if necessary values = np.array(data, dtype=np.object_, copy=False) - result = array_to_timedelta64(values, - unit=unit, errors=errors) - return result.view('timedelta64[ns]') + result = array_to_timedelta64(values, unit=unit, errors=errors) + return result.view("timedelta64[ns]") def _validate_td64_dtype(dtype): dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, np.dtype("timedelta64")): dtype = _TD_DTYPE - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ Passing in 'timedelta' dtype with no precision is deprecated and will raise in a future version. Please pass in - 'timedelta64[ns]' instead.""") + 'timedelta64[ns]' instead.""" + ) warnings.warn(msg, FutureWarning, stacklevel=4) if not is_dtype_equal(dtype, _TD_DTYPE): @@ -1062,8 +1130,9 @@ def _generate_regular_range(start, end, periods, offset): e = Timedelta(end).value + stride b = e - periods * stride else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") + raise ValueError( + "at least 'start' or 'end' should be specified " "if a 'period' is given." 
+ ) data = np.arange(b, e, stride, dtype=np.int64) return data diff --git a/pandas/core/base.py b/pandas/core/base.py index 93db65deff8202..15baf1bed0ecdf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -16,9 +16,17 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like, - is_object_dtype, is_scalar, is_timedelta64_ns_dtype) + is_categorical_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_extension_array_dtype, + is_extension_type, + is_list_like, + is_object_dtype, + is_scalar, + is_timedelta64_ns_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -28,14 +36,19 @@ import pandas.core.nanops as nanops _shared_docs = dict() -_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', - unique='IndexOpsMixin', duplicated='IndexOpsMixin') +_indexops_doc_kwargs = dict( + klass="IndexOpsMixin", + inplace="", + unique="IndexOpsMixin", + duplicated="IndexOpsMixin", +) class StringMixin: """ Implements string methods so long as object defines a `__str__` method. """ + # side note - this could be made into a metaclass if more than one # object needs @@ -75,7 +88,7 @@ def _reset_cache(self, key=None): """ Reset cached properties. If ``key`` is passed, only clears that key. """ - if getattr(self, '_cache', None) is None: + if getattr(self, "_cache", None) is None: return if key is None: self._cache.clear() @@ -87,7 +100,7 @@ def __sizeof__(self): Generates the total memory usage for an object that returns either a value or Series of values """ - if hasattr(self, 'memory_usage'): + if hasattr(self, "memory_usage"): mem = self.memory_usage(deep=True) if not is_scalar(mem): mem = mem.sum() @@ -120,12 +133,14 @@ def __setattr__(self, key, value): # because # 1.) getattr is false for attributes that raise errors # 2.) cls.__dict__ doesn't traverse into base classes - if (getattr(self, "__frozen", False) and not - (key == "_cache" or - key in type(self).__dict__ or - getattr(self, key, None) is not None)): - raise AttributeError("You cannot add any new attribute '{key}'". 
- format(key=key)) + if getattr(self, "__frozen", False) and not ( + key == "_cache" + or key in type(self).__dict__ + or getattr(self, key, None) is not None + ): + raise AttributeError( + "You cannot add any new attribute '{key}'".format(key=key) + ) object.__setattr__(self, key, value) @@ -146,43 +161,44 @@ class SelectionMixin: mixin implementing the selection & aggregation interface on a group-like object sub-classes need to define: obj, exclusions """ + _selection = None - _internal_names = ['_cache', '__setstate__'] + _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) - _builtin_table = OrderedDict(( - (builtins.sum, np.sum), - (builtins.max, np.max), - (builtins.min, np.min), - )) - - _cython_table = OrderedDict(( - (builtins.sum, 'sum'), - (builtins.max, 'max'), - (builtins.min, 'min'), - (np.all, 'all'), - (np.any, 'any'), - (np.sum, 'sum'), - (np.nansum, 'sum'), - (np.mean, 'mean'), - (np.nanmean, 'mean'), - (np.prod, 'prod'), - (np.nanprod, 'prod'), - (np.std, 'std'), - (np.nanstd, 'std'), - (np.var, 'var'), - (np.nanvar, 'var'), - (np.median, 'median'), - (np.nanmedian, 'median'), - (np.max, 'max'), - (np.nanmax, 'max'), - (np.min, 'min'), - (np.nanmin, 'min'), - (np.cumprod, 'cumprod'), - (np.nancumprod, 'cumprod'), - (np.cumsum, 'cumsum'), - (np.nancumsum, 'cumsum'), - )) + _builtin_table = OrderedDict( + ((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min)) + ) + + _cython_table = OrderedDict( + ( + (builtins.sum, "sum"), + (builtins.max, "max"), + (builtins.min, "min"), + (np.all, "all"), + (np.any, "any"), + (np.sum, "sum"), + (np.nansum, "sum"), + (np.mean, "mean"), + (np.nanmean, "mean"), + (np.prod, "prod"), + (np.nanprod, "prod"), + (np.std, "std"), + (np.nanstd, "std"), + (np.var, "var"), + (np.nanvar, "var"), + (np.median, "median"), + (np.nanmedian, "median"), + (np.max, "max"), + (np.nanmax, "max"), + (np.min, "min"), + (np.nanmin, "min"), + (np.cumprod, "cumprod"), + (np.nancumprod, "cumprod"), + (np.cumsum, "cumsum"), + (np.nancumsum, "cumsum"), + ) + ) @property def _selection_name(self): @@ -198,8 +214,9 @@ def _selection_name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, ABCSeries, - ABCIndexClass, np.ndarray)): + if not isinstance( + self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray) + ): return [self._selection] return self._selection @@ -217,8 +234,7 @@ def ndim(self): @cache_readonly def _obj_with_exclusions(self): - if self._selection is not None and isinstance(self.obj, - ABCDataFrame): + if self._selection is not None and isinstance(self.obj, ABCDataFrame): return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: @@ -228,18 +244,21 @@ def _obj_with_exclusions(self): def __getitem__(self, key): if self._selection is not None: - raise IndexError('Column(s) {selection} already selected' - .format(selection=self._selection)) + raise IndexError( + "Column(s) {selection} already selected".format( + selection=self._selection + ) + ) - if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, - np.ndarray)): + if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) - raise KeyError("Columns not found: {missing}" - .format(missing=str(bad_keys)[1:-1])) + raise KeyError( + "Columns not found: {missing}".format(missing=str(bad_keys)[1:-1]) + ) return self._gotitem(list(key), ndim=2) - elif 
not getattr(self, 'as_index', False): + elif not getattr(self, "as_index", False): if key not in self.obj.columns: raise KeyError("Column not found: {key}".format(key=key)) return self._gotitem(key, ndim=2) @@ -288,8 +307,9 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): # people may try to aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs - if kwarg not in ['axis', '_level']]) == 0 + assert ( + len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 + ) return f f = getattr(np, arg, None) @@ -320,34 +340,35 @@ def _aggregate(self, arg, *args, **kwargs): is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False - _axis = kwargs.pop('_axis', None) + _axis = kwargs.pop("_axis", None) if _axis is None: - _axis = getattr(self, 'axis', 0) - _level = kwargs.pop('_level', None) + _axis = getattr(self, "axis", 0) + _level = kwargs.pop("_level", None) if isinstance(arg, str): - return self._try_aggregate_string_function(arg, *args, - **kwargs), None + return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover - raise ValueError('Can only pass dict with axis=0') + raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ using a dict with renaming is deprecated and will be removed in a future version. For column-specific groupby renaming, use named aggregation >>> df.groupby(...).agg(name=('column', aggfunc)) - """) + """ + ) warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars @@ -375,17 +396,17 @@ def nested_renaming_depr(level=4): is_nested_renamer = True if k not in obj.columns: - msg = ('cannot perform renaming for {key} with a ' - 'nested dictionary').format(key=k) + msg = ( + "cannot perform renaming for {key} with a " + "nested dictionary" + ).format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() - elif (isinstance(obj, ABCDataFrame) and - k not in obj.columns): - raise KeyError( - "Column '{col}' does not exist!".format(col=k)) + elif isinstance(obj, ABCDataFrame) and k not in obj.columns: + raise KeyError("Column '{col}' does not exist!".format(col=k)) arg = new_arg @@ -393,8 +414,9 @@ def nested_renaming_depr(level=4): # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) - if (isinstance(obj, ABCDataFrame) and - len(obj.columns.intersection(keys)) != len(keys)): + if isinstance(obj, ABCDataFrame) and len( + obj.columns.intersection(keys) + ) != len(keys): nested_renaming_depr() from pandas.core.reshape.concat import concat @@ -405,16 +427,16 @@ def _agg_1dim(name, how, subset=None): """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: - raise SpecificationError("nested dictionary is ambiguous " - "in aggregation") + raise SpecificationError( + "nested dictionary is ambiguous " "in aggregation" + ) return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ - colg = self._gotitem(self._selection, ndim=2, - subset=obj) + colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): @@ -456,8 +478,9 @@ 
def _agg(arg, func): # but may have multiple aggregations if len(sl) == 1: - result = _agg(arg, lambda fname, - agg_how: _agg_1dim(self._selection, agg_how)) + result = _agg( + arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) + ) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): @@ -488,8 +511,7 @@ def is_any_series(): def is_any_frame(): # return a boolean if we have *any* nested series - return any(isinstance(r, ABCDataFrame) - for r in result.values()) + return any(isinstance(r, ABCDataFrame) for r in result.values()) if isinstance(result, list): return concat(result, keys=keys, axis=1, sort=True), True @@ -498,8 +520,7 @@ def is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame - return concat([result[k] for k in keys], - keys=keys, axis=1), True + return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): @@ -512,28 +533,28 @@ def is_any_frame(): # we have non-same sized objects, so # we don't automatically broadcast - raise ValueError("cannot perform both aggregation " - "and transformation operations " - "simultaneously") + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) return result, True # fall thru from pandas import DataFrame, Series + try: result = DataFrame(result) except ValueError: # we have a dict of scalars - result = Series(result, - name=getattr(self, 'name', None)) + result = Series(result, name=getattr(self, "name", None)) return result, True elif is_list_like(arg): # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, - _level=_level, - _axis=_axis), None + return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None @@ -577,8 +598,7 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): else: for index, col in enumerate(obj): try: - colg = self._gotitem(col, ndim=1, - subset=obj.iloc[:, index]) + colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): @@ -602,10 +622,12 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.core.dtypes.cast import is_nested_object from pandas import Series + result = Series(results, index=keys, name=self.name) if is_nested_object(result): - raise ValueError("cannot combine transform and " - "aggregation operations") + raise ValueError( + "cannot combine transform and " "aggregation operations" + ) return result def _shallow_copy(self, obj=None, obj_type=None, **kwargs): @@ -656,8 +678,11 @@ def transpose(self, *args, **kwargs): nv.validate_transpose(args, kwargs) return self - T = property(transpose, doc="""\nReturn the transpose, which is by - definition self.\n""") + T = property( + transpose, + doc="""\nReturn the transpose, which is by + definition self.\n""", + ) @property def _is_homogeneous_type(self): @@ -700,8 +725,11 @@ def item(self): scalar The first element of %(klass)s. """ - warnings.warn('`item` has been deprecated and will be removed in a ' - 'future version', FutureWarning, stacklevel=2) + warnings.warn( + "`item` has been deprecated and will be removed in a " "future version", + FutureWarning, + stacklevel=2, + ) return self.values.item() @property @@ -711,9 +739,12 @@ def data(self): .. 
deprecated:: 0.23.0 """ - warnings.warn("{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.data is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.data @property @@ -723,9 +754,12 @@ def itemsize(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.itemsize is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.itemsize is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.itemsize @property @@ -742,9 +776,12 @@ def strides(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.strides is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.strides is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.strides @property @@ -761,9 +798,12 @@ def flags(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.flags is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.flags @property @@ -773,9 +813,12 @@ def base(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.base is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.base @property @@ -849,13 +892,16 @@ def array(self) -> ExtensionArray: if is_datetime64_ns_dtype(result.dtype): from pandas.arrays import DatetimeArray + result = DatetimeArray(result) elif is_timedelta64_ns_dtype(result.dtype): from pandas.arrays import TimedeltaArray + result = TimedeltaArray(result) elif not is_extension_array_dtype(result.dtype): from pandas.core.arrays.numpy_ import PandasArray + result = PandasArray(result) return result @@ -1156,13 +1202,17 @@ def hasnans(self): """ return bool(isna(self).any()) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ perform the reduction type operation if we can """ func = getattr(self, name, None) if func is None: - raise TypeError("{klass} cannot perform the operation {op}".format( - klass=self.__class__.__name__, op=name)) + raise TypeError( + "{klass} cannot perform the operation {op}".format( + klass=self.__class__.__name__, op=name + ) + ) return func(skipna=skipna, **kwds) def _map_values(self, mapper, na_action=None): @@ -1191,7 +1241,7 @@ def _map_values(self, mapper, na_action=None): # as we know that we are not going to have to yield # python types if isinstance(mapper, dict): - if hasattr(mapper, '__missing__'): + if hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). 
dict_with_default = mapper @@ -1202,6 +1252,7 @@ def _map_values(self, mapper, na_action=None): # we specify the keys here to handle the # possibility that they are tuples from pandas import Series + mapper = Series(mapper) if isinstance(mapper, ABCSeries): @@ -1229,11 +1280,12 @@ def _map_values(self, mapper, na_action=None): map_f = lambda values, f: values.map(f) else: values = self.astype(object) - values = getattr(values, 'values', values) - if na_action == 'ignore': + values = getattr(values, "values", values) + if na_action == "ignore": + def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) + return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) + else: map_f = lib.map_infer @@ -1242,8 +1294,9 @@ def map_f(values, f): return new_values - def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None, dropna=True): + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): """ Return a Series containing counts of unique values. @@ -1322,18 +1375,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False, dtype: int64 """ from pandas.core.algorithms import value_counts - result = value_counts(self, sort=sort, ascending=ascending, - normalize=normalize, bins=bins, dropna=dropna) + + result = value_counts( + self, + sort=sort, + ascending=ascending, + normalize=normalize, + bins=bins, + dropna=dropna, + ) return result def unique(self): values = self._values - if hasattr(values, 'unique'): + if hasattr(values, "unique"): result = values.unique() else: from pandas.core.algorithms import unique1d + result = unique1d(values) return result @@ -1402,6 +1463,7 @@ def is_monotonic(self): bool """ from pandas import Index + return Index(self).is_monotonic is_monotonic_increasing = is_monotonic @@ -1419,6 +1481,7 @@ def is_monotonic_decreasing(self): bool """ from pandas import Index + return Index(self).is_monotonic_decreasing def memory_usage(self, deep=False): @@ -1444,7 +1507,7 @@ def memory_usage(self, deep=False): Memory usage does not include memory consumed by elements that are not components of the array if deep=False or if used on PyPy """ - if hasattr(self.array, 'memory_usage'): + if hasattr(self.array, "memory_usage"): return self.array.memory_usage(deep=deep) v = self.array.nbytes @@ -1453,18 +1516,24 @@ def memory_usage(self, deep=False): return v @Substitution( - values='', order='', size_hint='', - sort=textwrap.dedent("""\ + values="", + order="", + size_hint="", + sort=textwrap.dedent( + """\ sort : boolean, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """)) - @Appender(algorithms._shared_docs['factorize']) + """ + ), + ) + @Appender(algorithms._shared_docs["factorize"]) def factorize(self, sort=False, na_sentinel=-1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) - _shared_docs['searchsorted'] = ( - """ + _shared_docs[ + "searchsorted" + ] = """ Find indices where elements should be inserted to maintain order. 
Find the indices into a sorted %(klass)s `self` such that, if the @@ -1534,16 +1603,15 @@ def factorize(self, sort=False, na_sentinel=-1): >>> x.searchsorted(['bread'], side='right') array([3]) - """) + """ - @Substitution(klass='Index') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): - return algorithms.searchsorted(self._values, value, - side=side, sorter=sorter) + @Substitution(klass="Index") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) - def drop_duplicates(self, keep='first', inplace=False): - inplace = validate_bool_kwarg(inplace, 'inplace') + def drop_duplicates(self, keep="first", inplace=False): + inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(self, ABCIndexClass): if self.is_unique: return self._shallow_copy() @@ -1555,15 +1623,17 @@ def drop_duplicates(self, keep='first', inplace=False): else: return result - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas.core.algorithms import duplicated + if isinstance(self, ABCIndexClass): if self.is_unique: return np.zeros(len(self), dtype=np.bool) return duplicated(self, keep=keep) else: - return self._constructor(duplicated(self, keep=keep), - index=self.index).__finalize__(self) + return self._constructor( + duplicated(self, keep=keep), index=self.index + ).__finalize__(self) # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/common.py b/pandas/core/common.py index 771ded04f461d7..d2dd0d03d9425f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,7 +18,11 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) + is_array_like, + is_bool_dtype, + is_extension_array_dtype, + is_integer, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -114,9 +118,10 @@ def is_bool_indexer(key: Any) -> bool: When the array is an object-dtype ndarray or ExtensionArray and contains missing values. """ - na_msg = 'cannot index with vector containing NA / NaN values' - if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or - (is_array_like(key) and is_extension_array_dtype(key.dtype))): + na_msg = "cannot index with vector containing NA / NaN values" + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + is_array_like(key) and is_extension_array_dtype(key.dtype) + ): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) @@ -234,7 +239,7 @@ def dict_keys_to_ordered_list(mapping): def asarray_tuplesafe(values, dtype=None): - if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')): + if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): values = list(values) elif isinstance(values, ABCIndexClass): return values.values @@ -302,8 +307,12 @@ def is_null_slice(obj): """ We have a null slice. 
""" - return (isinstance(obj, slice) and obj.start is None and - obj.stop is None and obj.step is None) + return ( + isinstance(obj, slice) + and obj.start is None + and obj.stop is None + and obj.step is None + ) def is_true_slices(l): @@ -318,19 +327,20 @@ def is_full_slice(obj, l): """ We have a full length slice. """ - return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and - obj.step is None) + return ( + isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None + ) def get_callable_name(obj): # typical case has name - if hasattr(obj, '__name__'): - return getattr(obj, '__name__') + if hasattr(obj, "__name__"): + return getattr(obj, "__name__") # some objects don't; could recurse if isinstance(obj, partial): return get_callable_name(obj.func) # fall back to class name - if hasattr(obj, '__call__'): + if hasattr(obj, "__call__"): return obj.__class__.__name__ # everything failed (probably because the argument # wasn't actually callable); we return None @@ -399,14 +409,12 @@ def standardize_mapping(into): """ if not inspect.isclass(into): if isinstance(into, collections.defaultdict): - return partial( - collections.defaultdict, into.default_factory) + return partial(collections.defaultdict, into.default_factory) into = type(into) if not issubclass(into, abc.Mapping): - raise TypeError('unsupported type: {into}'.format(into=into)) + raise TypeError("unsupported type: {into}".format(into=into)) elif into == collections.defaultdict: - raise TypeError( - 'to_dict() only accepts initialized defaultdicts') + raise TypeError("to_dict() only accepts initialized defaultdicts") return into @@ -435,8 +443,9 @@ def random_state(state=None): elif state is None: return np.random else: - raise ValueError("random_state must be an integer, a numpy " - "RandomState, or None") + raise ValueError( + "random_state must be an integer, a numpy " "RandomState, or None" + ) def _pipe(obj, func, *args, **kwargs): @@ -466,7 +475,7 @@ def _pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - msg = '%s is both the pipe target and a keyword argument' % target + msg = "%s is both the pipe target and a keyword argument" % target raise ValueError(msg) kwargs[target] = obj return func(*args, **kwargs) @@ -486,6 +495,7 @@ def f(x): return mapper[x] else: return x + else: f = mapper diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index a7524161dd80e7..10464018509630 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -18,25 +18,23 @@ def _align_core_single_unary_op(term): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = typ, + ret = (typ,) - if not hasattr(term.value, 'axes'): - ret += None, + if not hasattr(term.value, "axes"): + ret += (None,) else: - ret += _zip_axes_from_type(typ, term.value.axes), + ret += (_zip_axes_from_type(typ, term.value.axes),) return ret def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] - for ax_ind, ax_name in typ._AXIS_NAMES.items()} + axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} return axes def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(isinstance(term.value, pd.core.generic.PandasObject) - for term in terms) + return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms) def _filter_special_cases(f): @@ -53,13 +51,13 @@ def wrapper(terms): return 
_result_type_many(*term_values), None return f(terms) + return wrapper @_filter_special_cases def _align_core(terms): - term_index = [i for i, term in enumerate(terms) - if hasattr(term.value, 'axes')] + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) @@ -81,13 +79,13 @@ def _align_core(terms): ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how='outer') + axes[ax] = axes[ax].join(itm, how="outer") for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): ti = terms[i].value - if hasattr(ti, 'reindex'): + if hasattr(ti, "reindex"): transpose = isinstance(ti, pd.Series) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items @@ -96,10 +94,11 @@ def _align_core(terms): ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: - w = ('Alignment difference on axis {axis} is larger ' - 'than an order of magnitude on term {term!r}, by ' - 'more than {ordm:.4g}; performance may suffer' - ).format(axis=axis, term=terms[i].name, ordm=ordm) + w = ( + "Alignment difference on axis {axis} is larger " + "than an order of magnitude on term {term!r}, by " + "more than {ordm:.4g}; performance may suffer" + ).format(axis=axis, term=terms[i].name, ordm=ordm) warnings.warn(w, category=PerformanceWarning, stacklevel=6) f = partial(ti.reindex, reindexer, axis=axis, copy=False) @@ -158,12 +157,11 @@ def _reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) - if (not isinstance(typ, partial) and - issubclass(typ, pd.core.generic.PandasObject)): + if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject): return typ(obj, dtype=res_t, **axes) # special case for pathological things like ~True/~False - if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_: + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: ret_value = typ(obj).astype(res_t) diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index fc6b9a25228242..4d205909b9e2e3 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,11 +1,10 @@ from pandas.compat._optional import import_optional_dependency -ne = import_optional_dependency("numexpr", raise_on_missing=False, - on_version="warn") +ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") _NUMEXPR_INSTALLED = ne is not None if _NUMEXPR_INSTALLED: _NUMEXPR_VERSION = ne.__version__ else: _NUMEXPR_VERSION = None -__all__ = ['_NUMEXPR_INSTALLED', '_NUMEXPR_VERSION'] +__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 6a0e7981ad82bc..ddb1023479cba2 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -11,7 +11,7 @@ def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): - s = s.decode(pd.get_option('display.encoding')) + s = s.decode(pd.get_option("display.encoding")) return s diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index c75552d15441d7..2c94b142a45b38 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -5,8 +5,7 @@ import abc from pandas.core.computation.align import _align, _reconstruct_object 
-from pandas.core.computation.ops import ( - UndefinedVariableError, _mathops, _reductions) +from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions import pandas.io.formats.printing as printing @@ -29,10 +28,11 @@ def _check_ne_builtin_clash(expr): overlap = names & _ne_builtins if overlap: - s = ', '.join(map(repr, overlap)) - raise NumExprClobberingError('Variables in expression "{expr}" ' - 'overlap with builtins: ({s})' - .format(expr=expr, s=s)) + s = ", ".join(map(repr, overlap)) + raise NumExprClobberingError( + 'Variables in expression "{expr}" ' + "overlap with builtins: ({s})".format(expr=expr, s=s) + ) class AbstractEngine(metaclass=abc.ABCMeta): @@ -68,8 +68,9 @@ def evaluate(self): # make sure no names in resolvers and locals/globals clash res = self._evaluate() - return _reconstruct_object(self.result_type, res, self.aligned_axes, - self.expr.terms.return_type) + return _reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) @property def _is_aligned(self): @@ -95,6 +96,7 @@ def _evaluate(self): class NumExprEngine(AbstractEngine): """NumExpr engine class""" + has_neg_frac = True def __init__(self, expr): @@ -112,7 +114,7 @@ def _evaluate(self): try: env = self.expr.env scope = env.full_scope - truediv = scope['truediv'] + truediv = scope["truediv"] _check_ne_builtin_clash(self.expr) return ne.evaluate(s, local_dict=scope, truediv=truediv) except KeyError as e: @@ -130,6 +132,7 @@ class PythonEngine(AbstractEngine): Mostly for testing purposes. """ + has_neg_frac = False def __init__(self, expr): @@ -142,4 +145,4 @@ def _evaluate(self): pass -_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} +_engines = {"numexpr": NumExprEngine, "python": PythonEngine} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index ef4639a3afe4c9..456ecf4b2594f4 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -38,24 +38,28 @@ def _check_engine(engine): if engine is None: if _NUMEXPR_INSTALLED: - engine = 'numexpr' + engine = "numexpr" else: - engine = 'python' + engine = "python" if engine not in _engines: valid = list(_engines.keys()) - raise KeyError('Invalid engine {engine!r} passed, valid engines are' - ' {valid}'.format(engine=engine, valid=valid)) + raise KeyError( + "Invalid engine {engine!r} passed, valid engines are" + " {valid}".format(engine=engine, valid=valid) + ) # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation - if engine == 'numexpr': + if engine == "numexpr": if not _NUMEXPR_INSTALLED: - raise ImportError("'numexpr' is not installed or an " - "unsupported version. Cannot use " - "engine='numexpr' for query/eval " - "if 'numexpr' is not installed") + raise ImportError( + "'numexpr' is not installed or an " + "unsupported version. 
Cannot use " + "engine='numexpr' for query/eval " + "if 'numexpr' is not installed" + ) return engine @@ -76,17 +80,21 @@ def _check_parser(parser): from pandas.core.computation.expr import _parsers if parser not in _parsers: - raise KeyError('Invalid parser {parser!r} passed, valid parsers are' - ' {valid}'.format(parser=parser, valid=_parsers.keys())) + raise KeyError( + "Invalid parser {parser!r} passed, valid parsers are" + " {valid}".format(parser=parser, valid=_parsers.keys()) + ) def _check_resolvers(resolvers): if resolvers is not None: for resolver in resolvers: - if not hasattr(resolver, '__getitem__'): + if not hasattr(resolver, "__getitem__"): name = type(resolver).__name__ - raise TypeError('Resolver of type {name!r} does not implement ' - 'the __getitem__ method'.format(name=name)) + raise TypeError( + "Resolver of type {name!r} does not implement " + "the __getitem__ method".format(name=name) + ) def _check_expression(expr): @@ -140,25 +148,36 @@ def _check_for_locals(expr, stack_level, parser): from pandas.core.computation.expr import tokenize_string at_top_of_stack = stack_level == 0 - not_pandas_parser = parser != 'pandas' + not_pandas_parser = parser != "pandas" if not_pandas_parser: msg = "The '@' prefix is only supported by the pandas parser" elif at_top_of_stack: - msg = ("The '@' prefix is not allowed in " - "top-level eval calls, \nplease refer to " - "your variables by name without the '@' " - "prefix") + msg = ( + "The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix" + ) if at_top_of_stack or not_pandas_parser: for toknum, tokval in tokenize_string(expr): - if toknum == tokenize.OP and tokval == '@': + if toknum == tokenize.OP and tokval == "@": raise SyntaxError(msg) -def eval(expr, parser='pandas', engine=None, truediv=True, - local_dict=None, global_dict=None, resolvers=(), level=0, - target=None, inplace=False): +def eval( + expr, + parser="pandas", + engine=None, + truediv=True, + local_dict=None, + global_dict=None, + resolvers=(), + level=0, + target=None, + inplace=False, +): """ Evaluate a Python expression as a string using various backends. 
@@ -269,14 +288,16 @@ def eval(expr, parser='pandas', engine=None, truediv=True, if isinstance(expr, str): _check_expression(expr) - exprs = [e.strip() for e in expr.splitlines() if e.strip() != ''] + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] else: exprs = [expr] multi_line = len(exprs) > 1 if multi_line and target is None: - raise ValueError("multi-line expressions are only valid in the " - "context of data, use DataFrame.eval") + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) ret = None first_expr = True @@ -290,12 +311,15 @@ def eval(expr, parser='pandas', engine=None, truediv=True, _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope(level + 1, global_dict=global_dict, - local_dict=local_dict, resolvers=resolvers, - target=target) + env = _ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, - truediv=truediv) + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) # construct the engine and evaluate the parsed expression eng = _engines[engine] @@ -304,11 +328,12 @@ def eval(expr, parser='pandas', engine=None, truediv=True, if parsed_expr.assigner is None: if multi_line: - raise ValueError("Multi-line expressions are only valid" - " if all expressions contain an assignment") + raise ValueError( + "Multi-line expressions are only valid" + " if all expressions contain an assignment" + ) elif inplace: - raise ValueError("Cannot operate inplace " - "if there is no assignment") + raise ValueError("Cannot operate inplace " "if there is no assignment") # assign if needed assigner = parsed_expr.assigner diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 32bd34c4db7d7b..772fb547567e35 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -15,11 +15,27 @@ from pandas.core import common as com from pandas.core.base import StringMixin from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, _remove_spaces_column_name) + _BACKTICK_QUOTED_STRING, + _remove_spaces_column_name, +) from pandas.core.computation.ops import ( - _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, - UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, - _mathops, _reductions, _unary_ops_syms, is_term) + _LOCAL_TAG, + BinOp, + Constant, + Div, + FuncNode, + Op, + Term, + UnaryOp, + UndefinedVariableError, + _arith_ops_syms, + _bool_ops_syms, + _cmp_ops_syms, + _mathops, + _reductions, + _unary_ops_syms, + is_term, +) from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing @@ -40,10 +56,13 @@ def tokenize_string(source): # Then, take all tokens till the next backtick to form a backtick quoted # string. 
for toknum, tokval, _, _, _ in token_generator: - if tokval == '`': - tokval = " ".join(it.takewhile( - lambda tokval: tokval != '`', - map(operator.itemgetter(1), token_generator))) + if tokval == "`": + tokval = " ".join( + it.takewhile( + lambda tokval: tokval != "`", + map(operator.itemgetter(1), token_generator), + ) + ) toknum = _BACKTICK_QUOTED_STRING yield toknum, tokval @@ -63,7 +82,7 @@ def _rewrite_assign(tok): Either the input or token or the replacement values """ toknum, tokval = tok - return toknum, '==' if tokval == '=' else tokval + return toknum, "==" if tokval == "=" else tokval def _replace_booleans(tok): @@ -82,10 +101,10 @@ def _replace_booleans(tok): """ toknum, tokval = tok if toknum == tokenize.OP: - if tokval == '&': - return tokenize.NAME, 'and' - elif tokval == '|': - return tokenize.NAME, 'or' + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" return toknum, tokval return toknum, tokval @@ -110,7 +129,7 @@ def _replace_locals(tok): is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. """ toknum, tokval = tok - if toknum == tokenize.OP and tokval == '@': + if toknum == tokenize.OP and tokval == "@": return tokenize.OP, _LOCAL_TAG return toknum, tokval @@ -147,13 +166,19 @@ def _compose2(f, g): def _compose(*funcs): """Compose 2 or more callables""" - assert len(funcs) > 1, 'At least 2 callables must be passed to compose' + assert len(funcs) > 1, "At least 2 callables must be passed to compose" return reduce(_compose2, funcs) -def _preparse(source, f=_compose(_replace_locals, _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names)): +def _preparse( + source, + f=_compose( + _replace_locals, + _replace_booleans, + _rewrite_assign, + _clean_spaces_backtick_quoted_names, + ), +): """Compose a collection of tokenization functions Parameters @@ -177,7 +202,7 @@ def _preparse(source, f=_compose(_replace_locals, _replace_booleans, form ``(toknum, tokval)``, where ``toknum`` is one of the constants from the ``tokenize`` module and ``tokval`` is a string. 
""" - assert callable(f), 'f must be callable' + assert callable(f), "f must be callable" return tokenize.untokenize((f(x) for x in tokenize_string(source))) @@ -191,15 +216,17 @@ def _is_type(t): # partition all AST nodes -_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and - issubclass(x, ast.AST), - (getattr(ast, node) for node in dir(ast)))) +_all_nodes = frozenset( + filter( + lambda x: isinstance(x, type) and issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)), + ) +) def _filter_nodes(superclass, all_nodes=_all_nodes): """Filter out AST nodes that are subclasses of ``superclass``.""" - node_names = (node.__name__ for node in all_nodes - if issubclass(node, superclass)) + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) return frozenset(node_names) @@ -221,25 +248,44 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): # nodes that we don't support directly but are needed for parsing -_hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) - - -_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', - 'DictComp', 'SetComp', 'Repr', 'Lambda', - 'Set', 'AST', 'Is', 'IsNot']) +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) # these nodes are low priority or won't ever be supported (e.g., AST) -_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | - _arguments_nodes | _keyword_nodes | _alias_nodes | - _expr_context_nodes | _unsupported_expr_nodes) - - _hacked_nodes) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes # we're adding a different assignment in some cases to be equality comparison # and we don't want `stmt` and friends in their so get only the class whose # names are capitalized _base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes -_msg = 'cannot both support and not support {intersection}'.format( - intersection=_unsupported_nodes & _base_supported_nodes) +_msg = "cannot both support and not support {intersection}".format( + intersection=_unsupported_nodes & _base_supported_nodes +) assert not _unsupported_nodes & _base_supported_nodes, _msg @@ -249,8 +295,10 @@ def _node_not_implemented(node_name, cls): """ def f(self, *args, **kwargs): - raise NotImplementedError("{name!r} nodes are not " - "implemented".format(name=node_name)) + raise NotImplementedError( + "{name!r} nodes are not " "implemented".format(name=node_name) + ) + return f @@ -262,14 +310,16 @@ def disallow(nodes): ------- disallowed : callable """ + def disallowed(cls): cls.unsupported_nodes = () for node in nodes: new_method = _node_not_implemented(node, cls) - name = 'visit_{node}'.format(node=node) + name = "visit_{node}".format(node=node) cls.unsupported_nodes += (name,) setattr(cls, name, new_method) return cls + return disallowed @@ -290,25 +340,27 @@ def f(self, node, *args, **kwargs): f : callable """ return partial(op_class, op_symbol, *args, **kwargs) + return f -_op_classes = {'binary': BinOp, 'unary': UnaryOp} +_op_classes = {"binary": BinOp, "unary": UnaryOp} def add_ops(op_classes): """Decorator to add default implementation of ops.""" + def f(cls): for op_attr_name, op_class in op_classes.items(): - ops = getattr(cls, 
'{name}_ops'.format(name=op_attr_name)) - ops_map = getattr(cls, '{name}_op_nodes_map'.format( - name=op_attr_name)) + ops = getattr(cls, "{name}_ops".format(name=op_attr_name)) + ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name)) for op in ops: op_node = ops_map[op] if op_node is not None: made_op = _op_maker(op_class, op) - setattr(cls, 'visit_{node}'.format(node=op_node), made_op) + setattr(cls, "visit_{node}".format(node=op_node), made_op) return cls + return f @@ -326,24 +378,43 @@ class BaseExprVisitor(ast.NodeVisitor): parser : str preparser : callable """ + const_type = Constant # type: Type[Term] term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', - 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', - None, 'Pow', 'FloorDiv', 'Mod') + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms - unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' + unary_op_nodes = "UAdd", "USub", "Invert", "Not" unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) rewrite_map = { ast.Eq: ast.In, ast.NotEq: ast.NotIn, ast.In: ast.In, - ast.NotIn: ast.NotIn + ast.NotIn: ast.NotIn, } def __init__(self, env, engine, parser, preparser=_preparse): @@ -360,18 +431,18 @@ def visit(self, node, **kwargs): node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: from keyword import iskeyword + if any(iskeyword(x) for x in clean.split()): - e.msg = ("Python keyword not valid identifier" - " in numexpr query") + e.msg = "Python keyword not valid identifier" " in numexpr query" raise e - method = 'visit_' + node.__class__.__name__ + method = "visit_" + node.__class__.__name__ visitor = getattr(self, method) return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): if len(node.body) != 1: - raise SyntaxError('only a single expression is allowed') + raise SyntaxError("only a single expression is allowed") expr = node.body[0] return self.visit(expr, **kwargs) @@ -408,22 +479,29 @@ def _rewrite_membership_op(self, node, left, right): def _maybe_transform_eq_ne(self, node, left=None, right=None): if left is None: - left = self.visit(node.left, side='left') + left = self.visit(node.left, side="left") if right is None: - right = self.visit(node.right, side='right') - op, op_class, left, right = self._rewrite_membership_op(node, left, - right) + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) return op, op_class, left, right def _maybe_downcast_constants(self, left, right): f32 = np.dtype(np.float32) - if (left.is_scalar and hasattr(left, 'value') and - not right.is_scalar and right.return_type == f32): + if ( + left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): # right is a float32 array, left is a scalar name = self.env.add_tmp(np.float32(left.value)) left = self.term_type(name, self.env) - if (right.is_scalar and hasattr(right, 'value') and - not left.is_scalar and left.return_type == f32): + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): # left is a float32 array, right is a scalar name = self.env.add_tmp(np.float32(right.value)) right 
= self.term_type(name, self.env) @@ -437,25 +515,33 @@ def _maybe_eval(self, binop, eval_in_python): # [1,2] in a + 2 * b # in that case a + 2 * b will be evaluated using numexpr, and the "in" # call will be evaluated using isin (in python space) - return binop.evaluate(self.env, self.engine, self.parser, - self.term_type, eval_in_python) - - def _maybe_evaluate_binop(self, op, op_class, lhs, rhs, - eval_in_python=('in', 'not in'), - maybe_eval_in_python=('==', '!=', '<', '>', - '<=', '>=')): + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): res = op(lhs, rhs) if res.has_invalid_return_type: - raise TypeError("unsupported operand type(s) for {op}:" - " '{lhs}' and '{rhs}'".format(op=res.op, - lhs=lhs.type, - rhs=rhs.type)) - - if self.engine != 'pytables': - if (res.op in _cmp_ops_syms and - getattr(lhs, 'is_datetime', False) or - getattr(rhs, 'is_datetime', False)): + raise TypeError( + "unsupported operand type(s) for {op}:" + " '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type) + ) + + if self.engine != "pytables": + if ( + res.op in _cmp_ops_syms + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): # all date ops must be done in python bc numexpr doesn't work # well with NaT return self._maybe_eval(res, self.binary_ops) @@ -463,13 +549,14 @@ def _maybe_evaluate_binop(self, op, op_class, lhs, rhs, if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python return self._maybe_eval(res, eval_in_python) - elif self.engine != 'pytables': - if (getattr(lhs, 'return_type', None) == object or - getattr(rhs, 'return_type', None) == object): + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): # evaluate "==" and "!=" in python if either of our operands # has an object return type - return self._maybe_eval(res, eval_in_python + - maybe_eval_in_python) + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res def visit_BinOp(self, node, **kwargs): @@ -478,7 +565,7 @@ def visit_BinOp(self, node, **kwargs): return self._maybe_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - truediv = self.env.scope['truediv'] + truediv = self.env.scope["truediv"] return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): @@ -512,15 +599,17 @@ def visit_Index(self, node, **kwargs): def visit_Subscript(self, node, **kwargs): value = self.visit(node.value) slobj = self.visit(node.slice) - result = pd.eval(slobj, local_dict=self.env, engine=self.engine, - parser=self.parser) + result = pd.eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) try: # a Term instance v = value.value[result] except AttributeError: # an Op instance - lhs = pd.eval(value, local_dict=self.env, engine=self.engine, - parser=self.parser) + lhs = pd.eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) v = lhs[result] name = self.env.add_tmp(v) return self.term_type(name, env=self.env) @@ -551,22 +640,24 @@ def visit_Assign(self, node, **kwargs): """ if len(node.targets) != 1: - raise SyntaxError('can only assign a single expression') + raise SyntaxError("can only assign a single expression") if not isinstance(node.targets[0], ast.Name): - raise 
SyntaxError('left hand side of an assignment must be a ' - 'single name') + raise SyntaxError( + "left hand side of an assignment must be a " "single name" + ) if self.env.target is None: - raise ValueError('cannot assign without a target object') + raise ValueError("cannot assign without a target object") try: assigner = self.visit(node.targets[0], **kwargs) except UndefinedVariableError: assigner = node.targets[0].id - self.assigner = getattr(assigner, 'name', assigner) + self.assigner = getattr(assigner, "name", assigner) if self.assigner is None: - raise SyntaxError('left hand side of an assignment must be a ' - 'single resolvable name') + raise SyntaxError( + "left hand side of an assignment must be a " "single resolvable name" + ) return self.visit(node.value, **kwargs) @@ -587,8 +678,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}" - .format(name=ctx.__name__)) + raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) def visit_Call(self, node, side=None, **kwargs): @@ -608,9 +698,8 @@ def visit_Call(self, node, side=None, **kwargs): raise if res is None: - raise ValueError("Invalid function call {func}" - .format(func=node.func.id)) - if hasattr(res, 'value'): + raise ValueError("Invalid function call {func}".format(func=node.func.id)) + if hasattr(res, "value"): res = res.value if isinstance(res, FuncNode): @@ -618,8 +707,10 @@ def visit_Call(self, node, side=None, **kwargs): new_args = [self.visit(arg) for arg in node.args] if node.keywords: - raise TypeError("Function \"{name}\" does not support keyword " - "arguments".format(name=res.name)) + raise TypeError( + 'Function "{name}" does not support keyword ' + "arguments".format(name=res.name) + ) return res(*new_args, **kwargs) @@ -629,8 +720,10 @@ def visit_Call(self, node, side=None, **kwargs): for key in node.keywords: if not isinstance(key, ast.keyword): - raise ValueError("keyword error in function call " - "'{func}'".format(func=node.func.id)) + raise ValueError( + "keyword error in function call " + "'{func}'".format(func=node.func.id) + ) if key.arg: kwargs[key.arg] = self.visit(key.value).value @@ -654,8 +747,9 @@ def visit_Compare(self, node, **kwargs): left = node.left values = [] for op, comp in zip(ops, comps): - new_node = self.visit(ast.Compare(comparators=[comp], left=left, - ops=[self.translate_In(op)])) + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) left = comp values.append(new_node) return self.visit(ast.BoolOp(op=ast.And(), values=values)) @@ -670,33 +764,39 @@ def visitor(x, y): lhs = self._try_visit_binop(x) rhs = self._try_visit_binop(y) - op, op_class, lhs, rhs = self._maybe_transform_eq_ne( - node, lhs, rhs) + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) return self._maybe_evaluate_binop(op, node.op, lhs, rhs) operands = node.values return reduce(visitor, operands) -_python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn']) +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) _numexpr_supported_calls = frozenset(_reductions + _mathops) -@disallow((_unsupported_nodes | _python_not_supported) - - (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', - 'Tuple']))) +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) class PandasExprVisitor(BaseExprVisitor): - 
- def __init__(self, env, engine, parser, - preparser=partial(_preparse, f=_compose( - _replace_locals, _replace_booleans, - _clean_spaces_backtick_quoted_names))): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose( + _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names + ), + ), + ): super().__init__(env, engine, parser, preparser) -@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) class PythonExprVisitor(BaseExprVisitor): - def __init__(self, env, engine, parser, preparser=lambda x: x): super().__init__(env, engine, parser, preparser=preparser) @@ -715,19 +815,20 @@ class Expr(StringMixin): level : int, optional, default 2 """ - def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True, level=0): + def __init__( + self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0 + ): self.expr = expr self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self.env.scope['truediv'] = truediv + self.env.scope["truediv"] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property def assigner(self): - return getattr(self._visitor, 'assigner', None) + return getattr(self._visitor, "assigner", None) def __call__(self): return self.terms(self.env) @@ -750,4 +851,4 @@ def names(self): return frozenset(term.name for term in com.flatten(self.terms)) -_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} +_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index b01000a7aee5bd..dc4e6e85f6e7d5 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -28,8 +28,8 @@ # the set of dtypes that we will allow pass to numexpr _ALLOWED_DTYPES = { - 'evaluate': {'int64', 'int32', 'float64', 'float32', 'bool'}, - 'where': {'int64', 'float64', 'bool'} + "evaluate": {"int64", "int32", "float64", "float32", "bool"}, + "where": {"int64", "float64", "bool"}, } # the minimum prod shape that we will use numexpr @@ -65,7 +65,7 @@ def _evaluate_standard(op, op_str, a, b, **eval_kwargs): """ standard evaluation """ if _TEST_MODE: _store_test_result(False) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return op(a, b) @@ -79,7 +79,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # check for dtype compatibility dtypes = set() for o in [a, b]: - if hasattr(o, 'dtypes'): + if hasattr(o, "dtypes"): s = o.dtypes.value_counts() if len(s) > 1: return False @@ -94,11 +94,10 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, truediv=True, - reversed=False, **eval_kwargs): +def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs): result = None - if _can_use_numexpr(op, op_str, a, b, 'evaluate'): + if _can_use_numexpr(op, op_str, a, b, "evaluate"): try: # we were originally called by a reversed op @@ -108,13 +107,15 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, a_value = getattr(a, "values", a) b_value = getattr(b, "values", b) - result = ne.evaluate('a_value {op} b_value'.format(op=op_str), - local_dict={'a_value': a_value, - 'b_value': b_value}, - casting='safe', truediv=truediv, - **eval_kwargs) + result = ne.evaluate( + "a_value {op} 
b_value".format(op=op_str), + local_dict={"a_value": a_value, "b_value": b_value}, + casting="safe", + truediv=truediv, + **eval_kwargs + ) except ValueError as detail: - if 'unknown type object' in str(detail): + if "unknown type object" in str(detail): pass if _TEST_MODE: @@ -127,26 +128,33 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, def _where_standard(cond, a, b): - return np.where(com.values_from_object(cond), com.values_from_object(a), - com.values_from_object(b)) + return np.where( + com.values_from_object(cond), + com.values_from_object(a), + com.values_from_object(b), + ) def _where_numexpr(cond, a, b): result = None - if _can_use_numexpr(None, 'where', a, b, 'where'): + if _can_use_numexpr(None, "where", a, b, "where"): try: - cond_value = getattr(cond, 'values', cond) - a_value = getattr(a, 'values', a) - b_value = getattr(b, 'values', b) - result = ne.evaluate('where(cond_value, a_value, b_value)', - local_dict={'cond_value': cond_value, - 'a_value': a_value, - 'b_value': b_value}, - casting='safe') + cond_value = getattr(cond, "values", cond) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + result = ne.evaluate( + "where(cond_value, a_value, b_value)", + local_dict={ + "cond_value": cond_value, + "a_value": a_value, + "b_value": b_value, + }, + casting="safe", + ) except ValueError as detail: - if 'unknown type object' in str(detail): + if "unknown type object" in str(detail): pass except Exception as detail: raise TypeError(str(detail)) @@ -158,40 +166,44 @@ def _where_numexpr(cond, a, b): # turn myself on -set_use_numexpr(get_option('compute.use_numexpr')) +set_use_numexpr(get_option("compute.use_numexpr")) def _has_bool_dtype(x): try: if isinstance(x, ABCDataFrame): - return 'bool' in x.dtypes + return "bool" in x.dtypes else: return x.dtype == bool except AttributeError: return isinstance(x, (bool, np.bool_)) -def _bool_arith_check(op_str, a, b, not_allowed=frozenset(('/', '//', '**')), - unsupported=None): +def _bool_arith_check( + op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None +): if unsupported is None: - unsupported = {'+': '|', '*': '&', '-': '^'} + unsupported = {"+": "|", "*": "&", "-": "^"} if _has_bool_dtype(a) and _has_bool_dtype(b): if op_str in unsupported: - warnings.warn("evaluating in Python space because the {op!r} " - "operator is not supported by numexpr for " - "the bool dtype, use {alt_op!r} instead" - .format(op=op_str, alt_op=unsupported[op_str])) + warnings.warn( + "evaluating in Python space because the {op!r} " + "operator is not supported by numexpr for " + "the bool dtype, use {alt_op!r} instead".format( + op=op_str, alt_op=unsupported[op_str] + ) + ) return False if op_str in not_allowed: - raise NotImplementedError("operator {op!r} not implemented for " - "bool dtypes".format(op=op_str)) + raise NotImplementedError( + "operator {op!r} not implemented for " "bool dtypes".format(op=op_str) + ) return True -def evaluate(op, op_str, a, b, use_numexpr=True, - **eval_kwargs): +def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index fd96739f4da768..9e6928372808e8 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -19,19 +19,36 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -_reductions = 'sum', 'prod' - -_unary_math_ops = ('sin', 'cos', 'exp', 'log', 'expm1', 
'log1p', - 'sqrt', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', - 'arctan', 'arccosh', 'arcsinh', 'arctanh', 'abs', 'log10', - 'floor', 'ceil' - ) -_binary_math_ops = ('arctan2',) +_reductions = "sum", "prod" + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) _mathops = _unary_math_ops + _binary_math_ops -_LOCAL_TAG = '__pd_eval_local_' +_LOCAL_TAG = "__pd_eval_local_" class UndefinedVariableError(NameError): @@ -40,14 +57,13 @@ class UndefinedVariableError(NameError): def __init__(self, name, is_local): if is_local: - msg = 'local variable {0!r} is not defined' + msg = "local variable {0!r} is not defined" else: - msg = 'name {0!r} is not defined' + msg = "name {0!r} is not defined" super().__init__(msg.format(name)) class Term(StringMixin): - def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls supr_new = super(Term, klass).__new__ @@ -58,14 +74,13 @@ def __init__(self, name, env, side=None, encoding=None): self.env = env self.side = side tname = str(name) - self.is_local = (tname.startswith(_LOCAL_TAG) or - tname in _DEFAULT_GLOBALS) + self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self): - return self.name.replace(_LOCAL_TAG, '') + return self.name.replace(_LOCAL_TAG, "") def __str__(self): return pprint_thing(self.name) @@ -80,9 +95,10 @@ def _resolve_name(self): res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) - if hasattr(res, 'ndim') and res.ndim > 2: - raise NotImplementedError("N-dimensional objects, where N > 2," - " are not supported with eval") + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2," " are not supported with eval" + ) return res def update(self, value): @@ -124,9 +140,10 @@ def type(self): @property def raw(self): - return pprint_thing('{0}(name={1!r}, type={2})' - ''.format(self.__class__.__name__, self.name, - self.type)) + return pprint_thing( + "{0}(name={1!r}, type={2})" + "".format(self.__class__.__name__, self.name, self.type) + ) @property def is_datetime(self): @@ -155,7 +172,6 @@ def ndim(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): super().__init__(value, env, side=side, encoding=encoding) @@ -172,7 +188,7 @@ def __str__(self): return repr(self.name) -_bool_op_map = {'not': '~', 'and': '&', 'or': '|'} +_bool_op_map = {"not": "~", "and": "&", "or": "|"} class Op(StringMixin): @@ -183,7 +199,7 @@ class Op(StringMixin): def __init__(self, op, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands - self.encoding = kwargs.get('encoding', None) + self.encoding = kwargs.get("encoding", None) def __iter__(self): return iter(self.operands) @@ -192,9 +208,8 @@ def __str__(self): """Print a generic n-ary operator and its operands using infix notation""" # recurse over the operands - parened = ('({0})'.format(pprint_thing(opr)) - for opr in self.operands) - return pprint_thing(' {0} '.format(self.op).join(parened)) + parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands) + return pprint_thing(" {0} ".format(self.op).join(parened)) @property def return_type(self): @@ -206,7 +221,7 @@ def 
return_type(self): @property def has_invalid_return_type(self): types = self.operand_types - obj_dtype_set = frozenset([np.dtype('object')]) + obj_dtype_set = frozenset([np.dtype("object")]) return self.return_type == object and types - obj_dtype_set @property @@ -257,23 +272,23 @@ def _not_in(x, y): return x not in y -_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' +_cmp_ops_syms = ">", "<", ">=", "<=", "==", "!=", "in", "not in" _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) -_bool_ops_syms = '&', '|', 'and', 'or' +_bool_ops_syms = "&", "|", "and", "or" _bool_ops_funcs = op.and_, op.or_, op.and_, op.or_ _bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) -_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' -_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, - op.mod) +_arith_ops_syms = "+", "-", "*", "/", "**", "//", "%" +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, op.mod) _arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) -_special_case_arith_ops_syms = '**', '//', '%' +_special_case_arith_ops_syms = "**", "//", "%" _special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod -_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms, - _special_case_arith_ops_funcs)) +_special_case_arith_ops_dict = dict( + zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) +) _binary_ops_dict = {} @@ -337,8 +352,10 @@ def __init__(self, op, lhs, rhs, **kwargs): except KeyError: # has to be made a list for python3 keys = list(_binary_ops_dict.keys()) - raise ValueError('Invalid binary operator {0!r}, valid' - ' operators are {1}'.format(op, keys)) + raise ValueError( + "Invalid binary operator {0!r}, valid" + " operators are {1}".format(op, keys) + ) def __call__(self, env): """Recursively evaluate an expression in Python space. @@ -353,7 +370,7 @@ def __call__(self, env): The result of an evaluated expression. """ # handle truediv - if self.op == '/' and env.scope['truediv']: + if self.op == "/" and env.scope["truediv"]: self.func = op.truediv # recurse over the left/right nodes @@ -378,24 +395,32 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python): term_type The "pre-evaluated" expression as an instance of ``term_type`` """ - if engine == 'python': + if engine == "python": res = self(env) else: # recurse over the left/right nodes - left = self.lhs.evaluate(env, engine=engine, parser=parser, - term_type=term_type, - eval_in_python=eval_in_python) - right = self.rhs.evaluate(env, engine=engine, parser=parser, - term_type=term_type, - eval_in_python=eval_in_python) + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) # base cases if self.op in eval_in_python: res = self.func(left.value, right.value) else: from pandas.core.computation.eval import eval - res = eval(self, local_dict=env, engine=engine, - parser=parser) + + res = eval(self, local_dict=env, engine=engine, parser=parser) name = env.add_tmp(res) return term_type(name, env=env) @@ -403,10 +428,10 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python): def convert_values(self): """Convert datetimes to a comparable value in an expression. 
""" + def stringify(value): if self.encoding is not None: - encoder = partial(pprint_thing_encoded, - encoding=self.encoding) + encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) @@ -419,7 +444,7 @@ def stringify(value): v = stringify(v) v = Timestamp(_ensure_decoded(v)) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") self.rhs.update(v) if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: @@ -428,14 +453,20 @@ def stringify(value): v = stringify(v) v = Timestamp(_ensure_decoded(v)) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): - if ((self.lhs.is_scalar or self.rhs.is_scalar) and - self.op in _bool_ops_dict and - (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and - issubclass(self.lhs.return_type, (bool, np.bool_))))): + if ( + (self.lhs.is_scalar or self.rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(self.rhs.return_type, (bool, np.bool_)) + and issubclass(self.lhs.return_type, (bool, np.bool_)) + ) + ) + ): raise NotImplementedError("cannot evaluate scalar only bool ops") @@ -457,20 +488,20 @@ class Div(BinOp): """ def __init__(self, lhs, rhs, truediv, *args, **kwargs): - super().__init__('/', lhs, rhs, *args, **kwargs) + super().__init__("/", lhs, rhs, *args, **kwargs) if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError("unsupported operand type(s) for {0}:" - " '{1}' and '{2}'".format(self.op, - lhs.return_type, - rhs.return_type)) + raise TypeError( + "unsupported operand type(s) for {0}:" + " '{1}' and '{2}'".format(self.op, lhs.return_type, rhs.return_type) + ) # do not upcast float32s to float64 un-necessarily acceptable_dtypes = [np.float32, np.float_] _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) -_unary_ops_syms = '+', '-', '~', 'not' +_unary_ops_syms = "+", "-", "~", "not" _unary_ops_funcs = op.pos, op.neg, op.invert, op.invert _unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) @@ -499,54 +530,55 @@ def __init__(self, op, operand): try: self.func = _unary_ops_dict[op] except KeyError: - raise ValueError('Invalid unary operator {0!r}, valid operators ' - 'are {1}'.format(op, _unary_ops_syms)) + raise ValueError( + "Invalid unary operator {0!r}, valid operators " + "are {1}".format(op, _unary_ops_syms) + ) def __call__(self, env): operand = self.operand(env) return self.func(operand) def __str__(self): - return pprint_thing('{0}({1})'.format(self.op, self.operand)) + return pprint_thing("{0}({1})".format(self.op, self.operand)) @property def return_type(self): operand = self.operand - if operand.return_type == np.dtype('bool'): - return np.dtype('bool') - if (isinstance(operand, Op) and - (operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict)): - return np.dtype('bool') - return np.dtype('int') + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") class MathCall(Op): - def __init__(self, func, args): super().__init__(func.name, args) self.func = func def __call__(self, env): operands = [op(env) for op in self.operands] - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return self.func.func(*operands) def __str__(self): operands = map(str, self.operands) - return 
pprint_thing('{0}({1})'.format(self.op, ','.join(operands))) + return pprint_thing("{0}({1})".format(self.op, ",".join(operands))) class FuncNode: def __init__(self, name): - from pandas.core.computation.check import (_NUMEXPR_INSTALLED, - _NUMEXPR_VERSION) + from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + if name not in _mathops or ( - _NUMEXPR_INSTALLED and - _NUMEXPR_VERSION < LooseVersion('2.6.9') and - name in ('floor', 'ceil') + _NUMEXPR_INSTALLED + and _NUMEXPR_VERSION < LooseVersion("2.6.9") + and name in ("floor", "ceil") ): - raise ValueError( - "\"{0}\" is not a supported function".format(name)) + raise ValueError('"{0}" is not a supported function'.format(name)) self.name = name self.func = getattr(np, name) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 25cfa8fe17697b..e4e005c024345a 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -22,18 +22,14 @@ class Scope(expr.Scope): - __slots__ = 'queryables', + __slots__ = ("queryables",) - def __init__(self, level, global_dict=None, local_dict=None, - queryables=None): - super().__init__(level + 1, - global_dict=global_dict, - local_dict=local_dict) + def __init__(self, level, global_dict=None, local_dict=None, queryables=None): + super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() class Term(ops.Term): - def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls supr_new = StringMixin.__new__ @@ -44,10 +40,9 @@ def __init__(self, name, env, side=None, encoding=None): def _resolve_name(self): # must be a queryables - if self.side == 'left': + if self.side == "left": if self.name not in self.env.queryables: - raise NameError('name {name!r} is not defined' - .format(name=self.name)) + raise NameError("name {name!r} is not defined".format(name=self.name)) return self.name # resolve the rhs (and allow it to be None) @@ -63,7 +58,6 @@ def value(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): super().__init__(value, env, side=side, encoding=encoding) @@ -86,7 +80,6 @@ def _disallow_scalar_only_bool_ops(self): pass def prune(self, klass): - def pr(left, right): """ create and return a new specialized BinOp from myself """ @@ -97,8 +90,9 @@ def pr(left, right): k = klass if isinstance(left, ConditionBinOp): - if (isinstance(left, ConditionBinOp) and - isinstance(right, ConditionBinOp)): + if isinstance(left, ConditionBinOp) and isinstance( + right, ConditionBinOp + ): k = JointConditionBinOp elif isinstance(left, k): return left @@ -106,16 +100,16 @@ def pr(left, right): return right elif isinstance(left, FilterBinOp): - if (isinstance(left, FilterBinOp) and - isinstance(right, FilterBinOp)): + if isinstance(left, FilterBinOp) and isinstance(right, FilterBinOp): k = JointFilterBinOp elif isinstance(left, k): return left elif isinstance(right, k): return right - return k(self.op, left, right, queryables=self.queryables, - encoding=self.encoding).evaluate() + return k( + self.op, left, right, queryables=self.queryables, encoding=self.encoding + ).evaluate() left, right = self.lhs, self.rhs @@ -152,17 +146,17 @@ def is_in_table(self): @property def kind(self): """ the kind of my field """ - return getattr(self.queryables.get(self.lhs), 'kind', None) + return getattr(self.queryables.get(self.lhs), "kind", None) @property def meta(self): """ the meta of my field """ - 
return getattr(self.queryables.get(self.lhs), 'meta', None) + return getattr(self.queryables.get(self.lhs), "meta", None) @property def metadata(self): """ the metadata of my field """ - return getattr(self.queryables.get(self.lhs), 'metadata', None) + return getattr(self.queryables.get(self.lhs), "metadata", None) def generate(self, v): """ create and return the op string for this TermValue """ @@ -175,64 +169,74 @@ def convert_value(self, v): def stringify(value): if self.encoding is not None: - encoder = partial(pprint_thing_encoded, - encoding=self.encoding) + encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) kind = _ensure_decoded(self.kind) meta = _ensure_decoded(self.meta) - if kind == 'datetime64' or kind == 'datetime': + if kind == "datetime64" or kind == "datetime": if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) v = Timestamp(v) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") return TermValue(v, v.value, kind) - elif kind == 'timedelta64' or kind == 'timedelta': - v = Timedelta(v, unit='s').value + elif kind == "timedelta64" or kind == "timedelta": + v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) - elif meta == 'category': + elif meta == "category": metadata = com.values_from_object(self.metadata) - result = metadata.searchsorted(v, side='left') + result = metadata.searchsorted(v, side="left") # result returns 0 if v is first element or if v is not in metadata # check that metadata contains v if not result and v not in metadata: result = -1 - return TermValue(result, result, 'integer') - elif kind == 'integer': + return TermValue(result, result, "integer") + elif kind == "integer": v = int(float(v)) return TermValue(v, v, kind) - elif kind == 'float': + elif kind == "float": v = float(v) return TermValue(v, v, kind) - elif kind == 'bool': + elif kind == "bool": if isinstance(v, str): - v = not v.strip().lower() in ['false', 'f', 'no', - 'n', 'none', '0', - '[]', '{}', ''] + v = not v.strip().lower() in [ + "false", + "f", + "no", + "n", + "none", + "0", + "[]", + "{}", + "", + ] else: v = bool(v) return TermValue(v, v, kind) elif isinstance(v, str): # string quoting - return TermValue(v, stringify(v), 'string') + return TermValue(v, stringify(v), "string") else: - raise TypeError("Cannot compare {v} of type {typ} to {kind} column" - .format(v=v, typ=type(v), kind=kind)) + raise TypeError( + "Cannot compare {v} of type {typ} to {kind} column".format( + v=v, typ=type(v), kind=kind + ) + ) def convert_values(self): pass class FilterBinOp(BinOp): - def __str__(self): - return pprint_thing("[Filter : [{lhs}] -> [{op}]" - .format(lhs=self.filter[0], op=self.filter[1])) + return pprint_thing( + "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1]) + ) def invert(self): """ invert the filter """ @@ -249,8 +253,7 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]" - .format(slf=self)) + raise ValueError("query term is not valid [{slf}]".format(slf=self)) rhs = self.conform(self.rhs) values = [TermValue(v, v, self.kind).value for v in rhs] @@ -258,41 +261,36 @@ def evaluate(self): if self.is_in_table: # if too many values to create the expression, use a filter instead - if self.op in ['==', '!='] and len(values) > self._max_selectors: + if self.op in ["==", "!="] and len(values) > self._max_selectors: filter_op = self.generate_filter_op() - self.filter = ( - self.lhs, - 
filter_op, - pd.Index(values)) + self.filter = (self.lhs, filter_op, pd.Index(values)) return self return None # equality conditions - if self.op in ['==', '!=']: + if self.op in ["==", "!="]: filter_op = self.generate_filter_op() - self.filter = ( - self.lhs, - filter_op, - pd.Index(values)) + self.filter = (self.lhs, filter_op, pd.Index(values)) else: - raise TypeError("passing a filterable condition to a non-table " - "indexer [{slf}]".format(slf=self)) + raise TypeError( + "passing a filterable condition to a non-table " + "indexer [{slf}]".format(slf=self) + ) return self def generate_filter_op(self, invert=False): - if (self.op == '!=' and not invert) or (self.op == '==' and invert): + if (self.op == "!=" and not invert) or (self.op == "==" and invert): return lambda axis, vals: ~axis.isin(vals) else: return lambda axis, vals: axis.isin(vals) class JointFilterBinOp(FilterBinOp): - def format(self): raise NotImplementedError("unable to collapse Joint Filters") @@ -301,18 +299,17 @@ def evaluate(self): class ConditionBinOp(BinOp): - def __str__(self): - return pprint_thing("[Condition : [{cond}]]" - .format(cond=self.condition)) + return pprint_thing("[Condition : [{cond}]]".format(cond=self.condition)) def invert(self): """ invert the condition """ # if self.condition is not None: # self.condition = "~(%s)" % self.condition # return self - raise NotImplementedError("cannot use an invert condition when " - "passing to numexpr") + raise NotImplementedError( + "cannot use an invert condition when " "passing to numexpr" + ) def format(self): """ return the actual ne format """ @@ -321,8 +318,7 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]" - .format(slf=self)) + raise ValueError("query term is not valid [{slf}]".format(slf=self)) # convert values if we are in the table if not self.is_in_table: @@ -332,12 +328,12 @@ def evaluate(self): values = [self.convert_value(v) for v in rhs] # equality conditions - if self.op in ['==', '!=']: + if self.op in ["==", "!="]: # too many values to create the expression? 
if len(values) <= self._max_selectors: vs = [self.generate(v) for v in values] - self.condition = "({cond})".format(cond=' | '.join(vs)) + self.condition = "({cond})".format(cond=" | ".join(vs)) # use a filter after reading else: @@ -349,19 +345,17 @@ def evaluate(self): class JointConditionBinOp(ConditionBinOp): - def evaluate(self): - self.condition = "({lhs} {op} {rhs})".format(lhs=self.lhs.condition, - op=self.op, - rhs=self.rhs.condition) + self.condition = "({lhs} {op} {rhs})".format( + lhs=self.lhs.condition, op=self.op, rhs=self.rhs.condition + ) return self class UnaryOp(ops.UnaryOp): - def prune(self, klass): - if self.op != '~': + if self.op != "~": raise NotImplementedError("UnaryOp only support invert type ops") operand = self.operand @@ -378,7 +372,7 @@ def prune(self, klass): return None -_op_classes = {'unary': UnaryOp} +_op_classes = {"unary": UnaryOp} class ExprVisitor(BaseExprVisitor): @@ -389,24 +383,27 @@ def __init__(self, env, engine, parser, **kwargs): super().__init__(env, engine, parser) for bin_op in self.binary_ops: bin_node = self.binary_op_nodes_map[bin_op] - setattr(self, 'visit_{node}'.format(node=bin_node), - lambda node, bin_op=bin_op: partial(BinOp, bin_op, - **kwargs)) + setattr( + self, + "visit_{node}".format(node=bin_node), + lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), + ) def visit_UnaryOp(self, node, **kwargs): if isinstance(node.op, (ast.Not, ast.Invert)): - return UnaryOp('~', self.visit(node.operand)) + return UnaryOp("~", self.visit(node.operand)) elif isinstance(node.op, ast.USub): return self.const_type(-self.visit(node.operand).value, self.env) elif isinstance(node.op, ast.UAdd): - raise NotImplementedError('Unary addition not supported') + raise NotImplementedError("Unary addition not supported") def visit_Index(self, node, **kwargs): return self.visit(node.value).value def visit_Assign(self, node, **kwargs): - cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], - comparators=[node.value]) + cmpr = ast.Compare( + ops=[ast.Eq()], left=node.targets[0], comparators=[node.value] + ) return self.visit(cmpr) def visit_Subscript(self, node, **kwargs): @@ -422,8 +419,10 @@ def visit_Subscript(self, node, **kwargs): try: return self.const_type(value[slobj], self.env) except TypeError: - raise ValueError("cannot subscript {value!r} with " - "{slobj!r}".format(value=value, slobj=slobj)) + raise ValueError( + "cannot subscript {value!r} with " + "{slobj!r}".format(value=value, slobj=slobj) + ) def visit_Attribute(self, node, **kwargs): attr = node.attr @@ -448,8 +447,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}" - .format(name=ctx.__name__)) + raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) def translate_In(self, op): return ast.Eq() if isinstance(op, ast.In) else op @@ -478,8 +476,9 @@ def _validate_where(w): """ if not (isinstance(w, (Expr, str)) or is_list_like(w)): - raise TypeError("where must be passed as a string, Expr, " - "or list-like of Exprs") + raise TypeError( + "where must be passed as a string, Expr, " "or list-like of Exprs" + ) return w @@ -537,16 +536,20 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): else: w = _validate_where(w) where[idx] = w - where = ' & '.join(map('({})'.format, com.flatten(where))) # noqa + where = " & ".join(map("({})".format, com.flatten(where))) # noqa self.expr = where self.env = Scope(scope_level + 1, 
local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): self.env.queryables.update(queryables) - self._visitor = ExprVisitor(self.env, queryables=queryables, - parser='pytables', engine='pytables', - encoding=encoding) + self._visitor = ExprVisitor( + self.env, + queryables=queryables, + parser="pytables", + engine="pytables", + encoding=encoding, + ) self.terms = self.parse() def __str__(self): @@ -560,15 +563,17 @@ def evaluate(self): try: self.condition = self.terms.prune(ConditionBinOp) except AttributeError: - raise ValueError("cannot process expression [{expr}], [{slf}] " - "is not a valid condition".format(expr=self.expr, - slf=self)) + raise ValueError( + "cannot process expression [{expr}], [{slf}] " + "is not a valid condition".format(expr=self.expr, slf=self) + ) try: self.filter = self.terms.prune(FilterBinOp) except AttributeError: - raise ValueError("cannot process expression [{expr}], [{slf}] " - "is not a valid filter".format(expr=self.expr, - slf=self)) + raise ValueError( + "cannot process expression [{expr}], [{slf}] " + "is not a valid filter".format(expr=self.expr, slf=self) + ) return self.condition, self.filter @@ -585,11 +590,11 @@ def __init__(self, value, converted, kind): def tostring(self, encoding): """ quote the string if not encoded else encode and return """ - if self.kind == 'string': + if self.kind == "string": if encoding is not None: return self.converted return '"{converted}"'.format(converted=self.converted) - elif self.kind == 'float': + elif self.kind == "float": # python 2 str(float) is not always # round-trippable so use repr() return repr(self.converted) @@ -600,7 +605,7 @@ def maybe_expression(s): """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False - ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',) + ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",) # make sure we have an op at least return any(op in s for op in ops) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 729acdc52e24ae..4d5a523337f665 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -19,11 +19,17 @@ import pandas.core.computation as compu -def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(), - target=None, **kwargs): +def _ensure_scope( + level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs +): """Ensure that we are grabbing the correct scope.""" - return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, target=target) + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) def _replacer(x): @@ -44,19 +50,19 @@ def _replacer(x): def _raw_hex_id(obj): """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns - packed = struct.pack('@P', id(obj)) - return ''.join(map(_replacer, packed)) + packed = struct.pack("@P", id(obj)) + return "".join(map(_replacer, packed)) _DEFAULT_GLOBALS = { - 'Timestamp': Timestamp, - 'datetime': datetime.datetime, - 'True': True, - 'False': False, - 'list': list, - 'tuple': tuple, - 'inf': np.inf, - 'Inf': np.inf, + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, } @@ -98,10 +104,12 @@ class Scope(StringMixin): target : object temps : dict """ - __slots__ = 'level', 
'scope', 'target', 'temps' - def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), - target=None): + __slots__ = "level", "scope", "target", "temps" + + def __init__( + self, level, global_dict=None, local_dict=None, resolvers=(), target=None + ): self.level = level + 1 # shallow copy because we don't want to keep filling this up with what @@ -121,11 +129,9 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - self.scope = self.scope.new_child((global_dict or - frame.f_globals).copy()) + self.scope = self.scope.new_child((global_dict or frame.f_globals).copy()) if not isinstance(local_dict, Scope): - self.scope = self.scope.new_child((local_dict or - frame.f_locals).copy()) + self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) finally: del frame @@ -138,10 +144,10 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), def __str__(self): scope_keys = _get_pretty_string(list(self.scope.keys())) res_keys = _get_pretty_string(list(self.resolvers.keys())) - unicode_str = '{name}(scope={scope_keys}, resolvers={res_keys})' - return unicode_str.format(name=type(self).__name__, - scope_keys=scope_keys, - res_keys=res_keys) + unicode_str = "{name}(scope={scope_keys}, resolvers={res_keys})" + return unicode_str.format( + name=type(self).__name__, scope_keys=scope_keys, res_keys=res_keys + ) @property def has_resolvers(self): @@ -232,7 +238,7 @@ def _get_vars(self, stack, scopes): variables = itertools.product(scopes, stack) for scope, (frame, _, _, _, _, _) in variables: try: - d = getattr(frame, 'f_' + scope) + d = getattr(frame, "f_" + scope) self.scope = self.scope.new_child(d) finally: # won't remove it, but DECREF it @@ -255,7 +261,7 @@ def update(self, level): stack = inspect.stack() try: - self._get_vars(stack[:sl], scopes=['locals']) + self._get_vars(stack[:sl], scopes=["locals"]) finally: del stack[:], stack @@ -272,9 +278,9 @@ def add_tmp(self, value): name : basestring The name of the temporary variable created. 
""" - name = '{name}_{num}_{hex_id}'.format(name=type(value).__name__, - num=self.ntemps, - hex_id=_raw_hex_id(self)) + name = "{name}_{num}_{hex_id}".format( + name=type(value).__name__, num=self.ntemps, hex_id=_raw_hex_id(self) + ) # add to inner most scope assert name not in self.temps diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 856d5076f37554..be6086dd360f27 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -13,8 +13,13 @@ import pandas._config.config as cf from pandas._config.config import ( - is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory, - is_text) + is_bool, + is_callable, + is_instance_factory, + is_int, + is_one_of_factory, + is_text, +) # compute @@ -28,6 +33,7 @@ def use_bottleneck_cb(key): from pandas.core import nanops + nanops.set_use_bottleneck(cf.get_option(key)) @@ -41,14 +47,21 @@ def use_bottleneck_cb(key): def use_numexpr_cb(key): from pandas.core.computation import expressions + expressions.set_use_numexpr(cf.get_option(key)) -with cf.config_prefix('compute'): - cf.register_option('use_bottleneck', True, use_bottleneck_doc, - validator=is_bool, cb=use_bottleneck_cb) - cf.register_option('use_numexpr', True, use_numexpr_doc, - validator=is_bool, cb=use_numexpr_cb) +with cf.config_prefix("compute"): + cf.register_option( + "use_bottleneck", + True, + use_bottleneck_doc, + validator=is_bool, + cb=use_bottleneck_cb, + ) + cf.register_option( + "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb + ) # # options from the "display" namespace @@ -284,6 +297,7 @@ def use_numexpr_cb(key): def table_schema_cb(key): from pandas.io.formats.printing import _enable_data_resource_formatter + _enable_data_resource_formatter(cf.get_option(key)) @@ -298,84 +312,117 @@ def is_terminal(): except NameError: # assume standard Python interpreter in a terminal return True else: - if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel + if hasattr(ip, "kernel"): # IPython as a Jupyter kernel return False else: # IPython in a terminal return True -with cf.config_prefix('display'): - cf.register_option('precision', 6, pc_precision_doc, validator=is_int) - cf.register_option('float_format', None, float_format_doc, - validator=is_one_of_factory([None, is_callable])) - cf.register_option('column_space', 12, validator=is_int) - cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc, - validator=is_instance_factory((int, type(None)))) - cf.register_option('max_rows', 60, pc_max_rows_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('min_rows', 10, pc_min_rows_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('max_categories', 8, pc_max_categories_doc, - validator=is_int) - cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) +with cf.config_prefix("display"): + cf.register_option("precision", 6, pc_precision_doc, validator=is_int) + cf.register_option( + "float_format", + None, + float_format_doc, + validator=is_one_of_factory([None, is_callable]), + ) + cf.register_option("column_space", 12, validator=is_int) + cf.register_option( + "max_info_rows", + 1690785, + pc_max_info_rows_doc, + validator=is_instance_factory((int, type(None))), + ) + cf.register_option( + "max_rows", + 60, + pc_max_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option( + "min_rows", + 10, + pc_min_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option("max_categories", 8, 
pc_max_categories_doc, validator=is_int) + cf.register_option("max_colwidth", 50, max_colwidth_doc, validator=is_int) if is_terminal(): max_cols = 0 # automatically determine optimal number of columns else: max_cols = 20 # cannot determine optimal number of columns - cf.register_option('max_columns', max_cols, pc_max_cols_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('large_repr', 'truncate', pc_large_repr_doc, - validator=is_one_of_factory(['truncate', 'info'])) - cf.register_option('max_info_columns', 100, pc_max_info_cols_doc, - validator=is_int) - cf.register_option('colheader_justify', 'right', colheader_justify_doc, - validator=is_text) - cf.register_option('notebook_repr_html', True, pc_nb_repr_h_doc, - validator=is_bool) - cf.register_option('pprint_nest_depth', 3, pc_pprint_nest_depth, - validator=is_int) - cf.register_option('multi_sparse', True, pc_multi_sparse_doc, - validator=is_bool) - cf.register_option('expand_frame_repr', True, pc_expand_repr_doc) - cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc, - validator=is_one_of_factory([True, False, 'truncate'])) - cf.register_option('chop_threshold', None, pc_chop_threshold_doc) - cf.register_option('max_seq_items', 100, pc_max_seq_items) - cf.register_option('width', 80, pc_width_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('memory_usage', True, pc_memory_usage_doc, - validator=is_one_of_factory([None, True, - False, 'deep'])) - cf.register_option('unicode.east_asian_width', False, - pc_east_asian_width_doc, validator=is_bool) - cf.register_option('unicode.ambiguous_as_wide', False, - pc_east_asian_width_doc, validator=is_bool) - cf.register_option('latex.repr', False, - pc_latex_repr_doc, validator=is_bool) - cf.register_option('latex.escape', True, pc_latex_escape, - validator=is_bool) - cf.register_option('latex.longtable', False, pc_latex_longtable, - validator=is_bool) - cf.register_option('latex.multicolumn', True, pc_latex_multicolumn, - validator=is_bool) - cf.register_option('latex.multicolumn_format', 'l', pc_latex_multicolumn, - validator=is_text) - cf.register_option('latex.multirow', False, pc_latex_multirow, - validator=is_bool) - cf.register_option('html.table_schema', False, pc_table_schema_doc, - validator=is_bool, cb=table_schema_cb) - cf.register_option('html.border', 1, pc_html_border_doc, - validator=is_int) - cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc, - validator=is_bool) + cf.register_option( + "max_columns", + max_cols, + pc_max_cols_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option( + "large_repr", + "truncate", + pc_large_repr_doc, + validator=is_one_of_factory(["truncate", "info"]), + ) + cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int) + cf.register_option( + "colheader_justify", "right", colheader_justify_doc, validator=is_text + ) + cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool) + cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int) + cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool) + cf.register_option("expand_frame_repr", True, pc_expand_repr_doc) + cf.register_option( + "show_dimensions", + "truncate", + pc_show_dimensions_doc, + validator=is_one_of_factory([True, False, "truncate"]), + ) + cf.register_option("chop_threshold", None, pc_chop_threshold_doc) + cf.register_option("max_seq_items", 100, pc_max_seq_items) + 
cf.register_option( + "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int]) + ) + cf.register_option( + "memory_usage", + True, + pc_memory_usage_doc, + validator=is_one_of_factory([None, True, False, "deep"]), + ) + cf.register_option( + "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option( + "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option("latex.repr", False, pc_latex_repr_doc, validator=is_bool) + cf.register_option("latex.escape", True, pc_latex_escape, validator=is_bool) + cf.register_option("latex.longtable", False, pc_latex_longtable, validator=is_bool) + cf.register_option( + "latex.multicolumn", True, pc_latex_multicolumn, validator=is_bool + ) + cf.register_option( + "latex.multicolumn_format", "l", pc_latex_multicolumn, validator=is_text + ) + cf.register_option("latex.multirow", False, pc_latex_multirow, validator=is_bool) + cf.register_option( + "html.table_schema", + False, + pc_table_schema_doc, + validator=is_bool, + cb=table_schema_cb, + ) + cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int) + cf.register_option( + "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool + ) tc_sim_interactive_doc = """ : boolean Whether to simulate interactive mode for purposes of testing """ -with cf.config_prefix('mode'): - cf.register_option('sim_interactive', False, tc_sim_interactive_doc) +with cf.config_prefix("mode"): + cf.register_option("sim_interactive", False, tc_sim_interactive_doc) use_inf_as_null_doc = """ : boolean @@ -396,17 +443,19 @@ def is_terminal(): def use_inf_as_na_cb(key): from pandas.core.dtypes.missing import _use_inf_as_na + _use_inf_as_na(key) -with cf.config_prefix('mode'): - cf.register_option('use_inf_as_na', False, use_inf_as_na_doc, - cb=use_inf_as_na_cb) - cf.register_option('use_inf_as_null', False, use_inf_as_null_doc, - cb=use_inf_as_na_cb) +with cf.config_prefix("mode"): + cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb) + cf.register_option( + "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb + ) -cf.deprecate_option('mode.use_inf_as_null', msg=use_inf_as_null_doc, - rkey='mode.use_inf_as_na') +cf.deprecate_option( + "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na" +) # user warnings @@ -416,9 +465,13 @@ def use_inf_as_na_cb(key): The default is warn """ -with cf.config_prefix('mode'): - cf.register_option('chained_assignment', 'warn', chained_assignment, - validator=is_one_of_factory([None, 'warn', 'raise'])) +with cf.config_prefix("mode"): + cf.register_option( + "chained_assignment", + "warn", + chained_assignment, + validator=is_one_of_factory([None, "warn", "raise"]), + ) # Set up the io.excel specific reader configuration. @@ -428,41 +481,45 @@ def use_inf_as_na_cb(key): auto, {others}. 
""" -_xls_options = ['xlrd'] -_xlsm_options = ['xlrd', 'openpyxl'] -_xlsx_options = ['xlrd', 'openpyxl'] -_ods_options = ['odf'] +_xls_options = ["xlrd"] +_xlsm_options = ["xlrd", "openpyxl"] +_xlsx_options = ["xlrd", "openpyxl"] +_ods_options = ["odf"] with cf.config_prefix("io.excel.xls"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xls', - others=', '.join(_xls_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsm"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xlsm', - others=', '.join(_xlsm_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsx"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xlsx', - others=', '.join(_xlsx_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) with cf.config_prefix("io.excel.ods"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='ods', - others=', '.join(_ods_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) # Set up the io.excel specific writer configuration. @@ -472,32 +529,35 @@ def use_inf_as_na_cb(key): auto, {others}. """ -_xls_options = ['xlwt'] -_xlsm_options = ['openpyxl'] -_xlsx_options = ['openpyxl', 'xlsxwriter'] +_xls_options = ["xlwt"] +_xlsm_options = ["openpyxl"] +_xlsx_options = ["openpyxl", "xlsxwriter"] with cf.config_prefix("io.excel.xls"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xls', - others=', '.join(_xls_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsm"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xlsm', - others=', '.join(_xlsm_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsx"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xlsx', - others=', '.join(_xlsx_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) # Set up the io.parquet specific configuration. 
@@ -507,10 +567,13 @@ def use_inf_as_na_cb(key): 'auto', 'pyarrow', 'fastparquet', the default is 'auto' """ -with cf.config_prefix('io.parquet'): +with cf.config_prefix("io.parquet"): cf.register_option( - 'engine', 'auto', parquet_engine_doc, - validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet'])) + "engine", + "auto", + parquet_engine_doc, + validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), + ) # -------- # Plotting @@ -526,28 +589,35 @@ def use_inf_as_na_cb(key): def register_plotting_backend_cb(key): backend_str = cf.get_option(key) - if backend_str == 'matplotlib': + if backend_str == "matplotlib": try: import pandas.plotting._matplotlib # noqa except ImportError: - raise ImportError('matplotlib is required for plotting when the ' - 'default backend "matplotlib" is selected.') + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' + ) else: return try: importlib.import_module(backend_str) except ImportError: - raise ValueError('"{}" does not seem to be an installed module. ' - 'A pandas plotting backend must be a module that ' - 'can be imported'.format(backend_str)) + raise ValueError( + '"{}" does not seem to be an installed module. ' + "A pandas plotting backend must be a module that " + "can be imported".format(backend_str) + ) -with cf.config_prefix('plotting'): - cf.register_option('backend', defval='matplotlib', - doc=plotting_backend_doc, - validator=str, - cb=register_plotting_backend_cb) +with cf.config_prefix("plotting"): + cf.register_option( + "backend", + defval="matplotlib", + doc=plotting_backend_doc, + validator=str, + cb=register_plotting_backend_cb, + ) register_converter_doc = """ @@ -569,5 +639,10 @@ def register_converter_cb(key): with cf.config_prefix("plotting.matplotlib"): - cf.register_option("register_converters", True, register_converter_doc, - validator=bool, cb=register_converter_cb) + cf.register_option( + "register_converters", + True, + register_converter_doc, + validator=bool, + cb=register_converter_cb, + ) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index e9d7b9c4281bdf..2b527e1fb58900 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,14 +1,47 @@ # flake8: noqa from .common import ( - is_array_like, is_bool, is_bool_dtype, is_categorical, - is_categorical_dtype, is_complex, is_complex_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimetz, is_dict_like, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_file_like, is_float, - is_float_dtype, is_hashable, is_int64_dtype, is_integer, is_integer_dtype, - is_interval, is_interval_dtype, is_iterator, is_list_like, is_named_tuple, - is_number, is_numeric_dtype, is_object_dtype, is_period, is_period_dtype, - is_re, is_re_compilable, is_scalar, is_signed_integer_dtype, is_sparse, - is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, - is_unsigned_integer_dtype, pandas_dtype) + is_array_like, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimetz, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_file_like, + is_float, + is_float_dtype, + is_hashable, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_iterator, + is_list_like, + 
is_named_tuple, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period, + is_period_dtype, + is_re, + is_re_compilable, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e7191136a7d538..59ef17e3d121f2 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -68,6 +68,7 @@ class property**. ``pandas.errors.AbstractMethodError`` and no ``register`` method is provided for registering virtual subclasses. """ + _metadata = () # type: Tuple[str, ...] def __str__(self): @@ -98,8 +99,7 @@ def __eq__(self, other): return False if isinstance(other, type(self)): return all( - getattr(self, attr) == getattr(other, attr) - for attr in self._metadata + getattr(self, attr) == getattr(other, attr) for attr in self._metadata ) return False @@ -146,7 +146,7 @@ def kind(self) -> str: -------- numpy.dtype.kind """ - return 'O' + return "O" @property def name(self) -> str: @@ -223,8 +223,9 @@ def construct_from_string(cls, string: str): if not isinstance(string, str): raise TypeError("Expects a string, got {}".format(type(string))) if string != cls.name: - raise TypeError("Cannot construct a '{}' from '{}'".format( - cls.__name__, string)) + raise TypeError( + "Cannot construct a '{}' from '{}'".format(cls.__name__, string) + ) return cls() @classmethod @@ -250,10 +251,9 @@ def is_dtype(cls, dtype) -> bool: 3. ``dtype`` has a ``dtype`` attribute, and any of the above conditions is true for ``dtype.dtype``. """ - dtype = getattr(dtype, 'dtype', dtype) + dtype = getattr(dtype, "dtype", dtype) - if isinstance(dtype, (ABCSeries, ABCIndexClass, - ABCDataFrame, np.dtype)): + if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)): # https://github.com/pandas-dev/pandas/issues/22960 # avoid passing data to `construct_from_string`. 
This could # cause a FutureWarning from numpy about failing elementwise diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c68d469d291e7f..f483cf520754bf 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -8,19 +8,49 @@ from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT from .common import ( - _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, ensure_int8, - ensure_int16, ensure_int32, ensure_int64, ensure_object, ensure_str, - is_bool, is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype, - is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, - is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype, - pandas_dtype) + _INT64_DTYPE, + _NS_DTYPE, + _POSSIBLY_CAST_DTYPES, + _TD_DTYPE, + ensure_int8, + ensure_int16, + ensure_int32, + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_datetimelike, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) from .dtypes import DatetimeTZDtype, ExtensionDtype, PeriodDtype from .generic import ( - ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex, - ABCSeries) + ABCDatetimeArray, + ABCDatetimeIndex, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from .inference import is_list_like from .missing import isna, notna @@ -35,8 +65,8 @@ def maybe_convert_platform(values): if isinstance(values, (list, tuple, range)): values = construct_1d_object_array_from_listlike(values) - if getattr(values, 'dtype', None) == np.object_: - if hasattr(values, '_values'): + if getattr(values, "dtype", None) == np.object_: + if hasattr(values, "_values"): values = values._values values = lib.maybe_convert_objects(values) @@ -72,27 +102,27 @@ def trans(x): return x if isinstance(dtype, str): - if dtype == 'infer': - inferred_type = lib.infer_dtype(ensure_object(result.ravel()), - skipna=False) - if inferred_type == 'boolean': - dtype = 'bool' - elif inferred_type == 'integer': - dtype = 'int64' - elif inferred_type == 'datetime64': - dtype = 'datetime64[ns]' - elif inferred_type == 'timedelta64': - dtype = 'timedelta64[ns]' + if dtype == "infer": + inferred_type = lib.infer_dtype(ensure_object(result.ravel()), skipna=False) + if inferred_type == "boolean": + dtype = "bool" + elif inferred_type == "integer": + dtype = "int64" + elif inferred_type == "datetime64": + dtype = "datetime64[ns]" + elif inferred_type == "timedelta64": + dtype = "timedelta64[ns]" # try to upcast here - elif inferred_type == 'floating': - dtype = 'int64' + elif inferred_type == "floating": + dtype = "int64" if issubclass(result.dtype.type, np.number): def trans(x): # noqa return x.round() + else: - dtype = 'object' + dtype = "object" if isinstance(dtype, str): dtype = np.dtype(dtype) @@ -101,8 +131,7 @@ def trans(x): # noqa # don't allow upcasts here (except if empty) if dtype.kind == result.dtype.kind: - if (result.dtype.itemsize <= 
dtype.itemsize and - np.prod(result.shape)): + if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): return result if is_bool_dtype(dtype) or is_integer_dtype(dtype): @@ -116,17 +145,21 @@ def trans(x): # noqa arr = np.array([r[0]]) # if we have any nulls, then we are done - if (isna(arr).any() or - not np.allclose(arr, trans(arr).astype(dtype), rtol=0)): + if isna(arr).any() or not np.allclose( + arr, trans(arr).astype(dtype), rtol=0 + ): return result # a comparable, e.g. a Decimal may slip in here - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, - float, bool)): + elif not isinstance( + r[0], (np.integer, np.floating, np.bool, int, float, bool) + ): return result - if (issubclass(result.dtype.type, (np.object_, np.number)) and - notna(result).all()): + if ( + issubclass(result.dtype.type, (np.object_, np.number)) + and notna(result).all() + ): new_result = trans(result).astype(dtype) try: if np.allclose(new_result, result, rtol=0): @@ -137,20 +170,20 @@ def trans(x): # noqa # hit here if (new_result == result).all(): return new_result - elif (issubclass(dtype.type, np.floating) and - not is_bool_dtype(result.dtype)): + elif issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype): return result.astype(dtype) # a datetimelike # GH12821, iNaT is casted to float - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: + elif dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: try: result = result.astype(dtype) except Exception: if dtype.tz: # convert to datetime and change timezone from pandas import to_datetime - result = to_datetime(result).tz_localize('utc') + + result = to_datetime(result).tz_localize("utc") result = result.tz_convert(dtype.tz) elif dtype.type == Period: @@ -206,7 +239,7 @@ def maybe_upcast_putmask(result, mask, other): if is_datetimelike(result.dtype): if is_scalar(other): if isna(other): - other = result.dtype.type('nat') + other = result.dtype.type("nat") elif is_integer(other): other = np.array(other, dtype=result.dtype) elif is_integer_dtype(other): @@ -244,8 +277,7 @@ def changeit(): # we have a scalar or len 0 ndarray # and its nan and we are changing some values - if (is_scalar(other) or - (isinstance(other, np.ndarray) and other.ndim < 1)): + if is_scalar(other) or (isinstance(other, np.ndarray) and other.ndim < 1): if isna(other): return changeit() @@ -385,10 +417,10 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): elif isinstance(val, (np.datetime64, datetime)): val = tslibs.Timestamp(val) if val is tslibs.NaT or val.tz is None: - dtype = np.dtype('M8[ns]') + dtype = np.dtype("M8[ns]") else: if pandas_dtype: - dtype = DatetimeTZDtype(unit='ns', tz=val.tz) + dtype = DatetimeTZDtype(unit="ns", tz=val.tz) else: # return datetimetz as object return np.object_, val @@ -396,7 +428,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): elif isinstance(val, (np.timedelta64, timedelta)): val = tslibs.Timedelta(val).value - dtype = np.dtype('m8[ns]') + dtype = np.dtype("m8[ns]") elif is_bool(val): dtype = np.bool_ @@ -473,8 +505,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False): # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr, skipna=False) - if inferred in ['string', 'bytes', 'unicode', - 'mixed', 'mixed-integer']: + if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]: return (np.object_, arr) arr = np.asarray(arr) @@ -506,7 +537,7 @@ def maybe_infer_dtype_type(element): numpy.int64 """ tipo = None - if hasattr(element, 'dtype'): + if 
hasattr(element, "dtype"): tipo = element.dtype elif is_list_like(element): element = np.asarray(element) @@ -547,15 +578,16 @@ def maybe_cast_item(obj, item, dtype): if dtype in (np.object_, np.bool_): obj[item] = chunk.astype(np.object_) elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: {dtype}" - .format(dtype=dtype)) + raise ValueError( + "Unexpected dtype encountered: {dtype}".format(dtype=dtype) + ) def invalidate_string_dtypes(dtype_set): """Change string like dtypes to object for ``DataFrame.select_dtypes()``. """ - non_string_dtypes = dtype_set - {np.dtype('S').type, np.dtype(' 1 and coerce: - raise ValueError("Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True.") + raise ValueError( + "Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True." + ) if isinstance(values, (list, tuple)): # List or scalar values = np.array(values, dtype=np.object_) - elif not hasattr(values, 'dtype'): + elif not hasattr(values, "dtype"): values = np.array([values], dtype=np.object_) elif not is_object_dtype(values.dtype): # If not object, do not attempt conversion @@ -798,21 +839,23 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, # Immediate return if coerce if datetime: from pandas import to_datetime - return to_datetime(values, errors='coerce').to_numpy() + + return to_datetime(values, errors="coerce").to_numpy() elif timedelta: from pandas import to_timedelta - return to_timedelta(values, errors='coerce').to_numpy() + + return to_timedelta(values, errors="coerce").to_numpy() elif numeric: from pandas import to_numeric - return to_numeric(values, errors='coerce') + + return to_numeric(values, errors="coerce") # Soft conversions if datetime: # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: - values = lib.maybe_convert_objects(values, - convert_datetime=datetime) + values = lib.maybe_convert_objects(values, convert_datetime=datetime) except OutOfBoundsDatetime: pass @@ -822,8 +865,7 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, if numeric and is_object_dtype(values.dtype): try: - converted = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) + converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values values = values.copy() if copy else values @@ -839,9 +881,9 @@ def maybe_castable(arr): # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind - if kind == 'M': + if kind == "M": return is_datetime64_ns_dtype(arr.dtype) - elif kind == 'm': + elif kind == "m": return is_timedelta64_ns_dtype(arr.dtype) return arr.dtype.name not in _POSSIBLY_CAST_DTYPES @@ -866,8 +908,9 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): """ # TODO: why not timedelta? 
- if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex, - ABCDatetimeArray, ABCPeriodArray)): + if isinstance( + value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) + ): return value elif isinstance(value, ABCSeries): if isinstance(value._values, ABCDatetimeIndex): @@ -894,9 +937,7 @@ def try_datetime(v): # safe coerce to datetime64 try: # GH19671 - v = tslib.array_to_datetime(v, - require_iso8601=True, - errors='raise')[0] + v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0] except ValueError: # we might have a sequence of the same-datetimes with tz's @@ -907,8 +948,7 @@ def try_datetime(v): from pandas import DatetimeIndex values, tz = conversion.datetime_to_datetime64(v) - return DatetimeIndex(values).tz_localize( - 'UTC').tz_convert(tz=tz) + return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) except (ValueError, TypeError): pass @@ -922,6 +962,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta + try: return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: @@ -929,13 +970,13 @@ def try_timedelta(v): inferred_type = lib.infer_datetimelike_array(ensure_object(v)) - if inferred_type == 'date' and convert_dates: + if inferred_type == "date" and convert_dates: value = try_datetime(v) - elif inferred_type == 'datetime': + elif inferred_type == "datetime": value = try_datetime(v) - elif inferred_type == 'timedelta': + elif inferred_type == "timedelta": value = try_timedelta(v) - elif inferred_type == 'nat': + elif inferred_type == "nat": # if all NaT, return as datetime if isna(v).all(): @@ -946,7 +987,7 @@ def try_timedelta(v): # try timedelta first to avoid spurious datetime conversions # e.g. '00:00:01' is a timedelta but technically is also a datetime value = try_timedelta(v) - if lib.infer_dtype(value, skipna=False) in ['mixed']: + if lib.infer_dtype(value, skipna=False) in ["mixed"]: # cannot skip missing values, as NaT implies that the string # is actually a datetime value = try_datetime(v) @@ -954,7 +995,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors='raise'): +def maybe_cast_to_datetime(value, dtype, errors="raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -972,17 +1013,21 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): if is_datetime64 or is_datetime64tz or is_timedelta64: # Force the dtype if needed. - msg = ("The '{dtype}' dtype has no unit. " - "Please pass in '{dtype}[ns]' instead.") + msg = ( + "The '{dtype}' dtype has no unit. " + "Please pass in '{dtype}[ns]' instead." 
+ ) if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): - if dtype.name in ('datetime64', 'datetime64[ns]'): - if dtype.name == 'datetime64': + if dtype.name in ("datetime64", "datetime64[ns]"): + if dtype.name == "datetime64": raise ValueError(msg.format(dtype=dtype.name)) dtype = _NS_DTYPE else: - raise TypeError("cannot convert datetimelike to " - "dtype [{dtype}]".format(dtype=dtype)) + raise TypeError( + "cannot convert datetimelike to " + "dtype [{dtype}]".format(dtype=dtype) + ) elif is_datetime64tz: # our NaT doesn't support tz's @@ -992,13 +1037,15 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): value = [value] elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): - if dtype.name in ('timedelta64', 'timedelta64[ns]'): - if dtype.name == 'timedelta64': + if dtype.name in ("timedelta64", "timedelta64[ns]"): + if dtype.name == "timedelta64": raise ValueError(msg.format(dtype=dtype.name)) dtype = _TD_DTYPE else: - raise TypeError("cannot convert timedeltalike to " - "dtype [{dtype}]".format(dtype=dtype)) + raise TypeError( + "cannot convert timedeltalike to " + "dtype [{dtype}]".format(dtype=dtype) + ) if is_scalar(value): if value == iNaT or isna(value): @@ -1011,8 +1058,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): value = iNaT # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, - dtype): + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: value = to_datetime(value, errors=errors) @@ -1034,8 +1080,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): else: # Numeric values are UTC at this point, # so localize and convert - value = (value.tz_localize('UTC') - .tz_convert(dtype.tz)) + value = value.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values except OutOfBoundsDatetime: @@ -1048,12 +1093,11 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) - ints = np.asarray(value).view('i8') + ints = np.asarray(value).view("i8") return tslib.ints_to_pydatetime(ints) # we have a non-castable dtype that was passed - raise TypeError('Cannot cast datetime64 to {dtype}' - .format(dtype=dtype)) + raise TypeError("Cannot cast datetime64 to {dtype}".format(dtype=dtype)) else: @@ -1061,20 +1105,24 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): # catch a datetime/timedelta that is not of ns variety # and no coercion specified - if is_array and value.dtype.kind in ['M', 'm']: + if is_array and value.dtype.kind in ["M", "m"]: dtype = value.dtype - if dtype.kind == 'M' and dtype != _NS_DTYPE: + if dtype.kind == "M" and dtype != _NS_DTYPE: value = tslibs.conversion.ensure_datetime64ns(value) - elif dtype.kind == 'm' and dtype != _TD_DTYPE: + elif dtype.kind == "m" and dtype != _TD_DTYPE: value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion - elif not (is_array and not (issubclass(value.dtype.type, np.integer) or - value.dtype == np.object_)): + elif not ( + is_array + and not ( + issubclass(value.dtype.type, np.integer) or value.dtype == np.object_ + ) + ): value = maybe_infer_to_datetimelike(value) return value @@ -1099,7 +1147,7 @@ def find_common_type(types): """ if len(types) == 0: - raise ValueError('no types given') + raise ValueError("no types 
given") first = types[0] @@ -1113,9 +1161,9 @@ def find_common_type(types): # take lowest unit if all(is_datetime64_dtype(t) for t in types): - return np.dtype('datetime64[ns]') + return np.dtype("datetime64[ns]") if all(is_timedelta64_dtype(t) for t in types): - return np.dtype('timedelta64[ns]') + return np.dtype("timedelta64[ns]") # don't mix bool / int or float or complex # this is different from numpy, which casts bool with float/int as int @@ -1174,9 +1222,11 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): """ if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex + subarr = DatetimeIndex([value] * length, dtype=dtype) elif is_categorical_dtype(dtype): from pandas import Categorical + subarr = Categorical([value] * length, dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): @@ -1184,7 +1234,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if length and is_integer_dtype(dtype) and isna(value): # coerce if we have nan for an integer dtype - dtype = np.dtype('float64') + dtype = np.dtype("float64") elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to take our string as a scalar value @@ -1218,7 +1268,7 @@ def construct_1d_object_array_from_listlike(values): """ # numpy will try to interpret nested lists as further dimensions, hence # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype='object') + result = np.empty(len(values), dtype="object") result[:] = values return result @@ -1314,8 +1364,10 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): else: casted = arr.astype(dtype, copy=copy) except OverflowError: - raise OverflowError("The elements provided in the data cannot all be " - "casted to the dtype {dtype}".format(dtype=dtype)) + raise OverflowError( + "The elements provided in the data cannot all be " + "casted to the dtype {dtype}".format(dtype=dtype) + ) if np.array_equal(arr, casted): return casted @@ -1328,9 +1380,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): arr = np.asarray(arr) if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): - raise OverflowError("Trying to coerce negative values " - "to unsigned integers") + raise OverflowError("Trying to coerce negative values " "to unsigned integers") - if is_integer_dtype(dtype) and (is_float_dtype(arr) or - is_object_dtype(arr)): + if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b2b74e2a70ca99..d0e4bd9b4482a8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,22 +9,61 @@ from pandas.compat import PY36 from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, - PeriodDtype, registry) + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, + registry, +) from pandas.core.dtypes.generic import ( - ABCCategorical, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, - ABCPeriodArray, ABCPeriodIndex, ABCSeries) + ABCCategorical, + ABCDateOffset, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.inference import ( # noqa:F401 - is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, - is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, - 
is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable, - is_scalar, is_sequence, is_string_like) + is_array_like, + is_bool, + is_complex, + is_decimal, + is_dict_like, + is_file_like, + is_float, + is_hashable, + is_integer, + is_interval, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_number, + is_re, + is_re_compilable, + is_scalar, + is_sequence, + is_string_like, +) from pandas._typing import ArrayLike -_POSSIBLY_CAST_DTYPES = {np.dtype(t).name - for t in ['O', 'int8', 'uint8', 'int16', 'uint16', - 'int32', 'uint32', 'int64', 'uint64']} +_POSSIBLY_CAST_DTYPES = { + np.dtype(t).name + for t in [ + "O", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + ] +} _NS_DTYPE = conversion.NS_DTYPE _TD_DTYPE = conversion.TD_DTYPE @@ -74,7 +113,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: Ensure that bytes and non-strings get converted into ``str`` objects. """ if isinstance(value, bytes): - value = value.decode('utf-8') + value = value.decode("utf-8") elif not isinstance(value, str): value = str(value) return value @@ -97,6 +136,7 @@ def ensure_categorical(arr): if not is_categorical(arr): from pandas import Categorical + arr = Categorical(arr) return arr @@ -128,13 +168,13 @@ def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array: will remain unchanged. """ try: - return arr.astype('int64', copy=copy, casting='safe') + return arr.astype("int64", copy=copy, casting="safe") except TypeError: pass try: - return arr.astype('uint64', copy=copy, casting='safe') + return arr.astype("uint64", copy=copy, casting="safe") except TypeError: - return arr.astype('float64', copy=copy) + return arr.astype("float64", copy=copy) def ensure_python_int(value: Union[int, np.integer]) -> int: @@ -154,12 +194,13 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: TypeError: if the value isn't an int or can't be converted to one. """ if not is_scalar(value): - raise TypeError("Value needs to be a scalar value, was type {}" - .format(type(value))) + raise TypeError( + "Value needs to be a scalar value, was type {}".format(type(value)) + ) msg = "Wrong type {} for value {}" try: new_value = int(value) - assert (new_value == value) + assert new_value == value except (TypeError, ValueError, AssertionError): raise TypeError(msg.format(type(value), value)) return new_value @@ -175,8 +216,10 @@ def classes_and_not_datetimelike(*klasses): evaluate if the tipo is a subclass of the klasses and not a datetimelike """ - return lambda tipo: (issubclass(tipo, klasses) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) + return lambda tipo: ( + issubclass(tipo, klasses) + and not issubclass(tipo, (np.datetime64, np.timedelta64)) + ) def is_object_dtype(arr_or_dtype): @@ -267,7 +310,7 @@ def is_sparse(arr): """ from pandas.core.arrays.sparse import SparseDtype - dtype = getattr(arr, 'dtype', arr) + dtype = getattr(arr, "dtype", arr) return isinstance(dtype, SparseDtype) @@ -385,9 +428,12 @@ def is_datetimetz(arr): True """ - warnings.warn("'is_datetimetz' is deprecated and will be removed in a " - "future version. Use 'is_datetime64tz_dtype' instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "'is_datetimetz' is deprecated and will be removed in a " + "future version. 
Use 'is_datetime64tz_dtype' instead.", + FutureWarning, + stacklevel=2, + ) return is_datetime64tz_dtype(arr) @@ -417,8 +463,7 @@ def is_offsetlike(arr_or_obj): """ if isinstance(arr_or_obj, ABCDateOffset): return True - elif (is_list_like(arr_or_obj) and len(arr_or_obj) and - is_object_dtype(arr_or_obj)): + elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) return False @@ -449,9 +494,13 @@ def is_period(arr): True """ - warnings.warn("'is_period' is deprecated and will be removed in a future " - "version. Use 'is_period_dtype' or is_period_arraylike' " - "instead.", FutureWarning, stacklevel=2) + warnings.warn( + "'is_period' is deprecated and will be removed in a future " + "version. Use 'is_period_dtype' or is_period_arraylike' " + "instead.", + FutureWarning, + stacklevel=2, + ) return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) @@ -690,7 +739,8 @@ def is_string_dtype(arr_or_dtype): # TODO: gh-15585: consider making the checks stricter. def condition(dtype): - return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) + return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + return _is_dtype(arr_or_dtype, condition) @@ -723,7 +773,7 @@ def is_period_arraylike(arr): return True elif isinstance(arr, (np.ndarray, ABCSeries)): return is_period_dtype(arr.dtype) - return getattr(arr, 'inferred_type', None) == 'period' + return getattr(arr, "inferred_type", None) == "period" def is_datetime_arraylike(arr): @@ -754,9 +804,11 @@ def is_datetime_arraylike(arr): if isinstance(arr, ABCDatetimeIndex): return True elif isinstance(arr, (np.ndarray, ABCSeries)): - return (is_object_dtype(arr.dtype) - and lib.infer_dtype(arr, skipna=False) == 'datetime') - return getattr(arr, 'inferred_type', None) == 'datetime' + return ( + is_object_dtype(arr.dtype) + and lib.infer_dtype(arr, skipna=False) == "datetime" + ) + return getattr(arr, "inferred_type", None) == "datetime" def is_datetimelike(arr): @@ -799,9 +851,12 @@ def is_datetimelike(arr): True """ - return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or - is_timedelta64_dtype(arr) or - isinstance(arr, ABCPeriodIndex)) + return ( + is_datetime64_dtype(arr) + or is_datetime64tz_dtype(arr) + or is_timedelta64_dtype(arr) + or isinstance(arr, ABCPeriodIndex) + ) def is_dtype_equal(source, target): @@ -925,8 +980,7 @@ def is_any_int_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes(np.integer, np.timedelta64)) + return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) def is_integer_dtype(arr_or_dtype): @@ -981,8 +1035,7 @@ def is_integer_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.integer)) + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) def is_signed_integer_dtype(arr_or_dtype): @@ -1039,8 +1092,7 @@ def is_signed_integer_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) def is_unsigned_integer_dtype(arr_or_dtype): @@ -1088,7 +1140,8 @@ def is_unsigned_integer_dtype(arr_or_dtype): True """ return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)) + arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger) + ) def is_int64_dtype(arr_or_dtype): @@ -1179,8 +1232,7 @@ def 
is_datetime64_any_dtype(arr_or_dtype): if arr_or_dtype is None: return False - return (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) + return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) def is_datetime64_ns_dtype(arr_or_dtype): @@ -1230,7 +1282,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): tipo = _get_dtype(arr_or_dtype.dtype) else: return False - return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE + return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE def is_timedelta64_ns_dtype(arr_or_dtype): @@ -1300,8 +1352,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): True """ - return _is_dtype_type( - arr_or_dtype, classes(np.datetime64, np.timedelta64)) + return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64)) def _is_unorderable_exception(e): @@ -1325,7 +1376,7 @@ def _is_unorderable_exception(e): if PY36: return "'>' not supported between instances of" in str(e) - return 'unorderable' in str(e) + return "unorderable" in str(e) def is_numeric_v_string_like(a, b): @@ -1380,10 +1431,12 @@ def is_numeric_v_string_like(a, b): is_a_scalar_string_like = not is_a_array and is_string_like(a) is_b_scalar_string_like = not is_b_array and is_string_like(b) - return ((is_a_numeric_array and is_b_scalar_string_like) or - (is_b_numeric_array and is_a_scalar_string_like) or - (is_a_numeric_array and is_b_string_array) or - (is_b_numeric_array and is_a_string_array)) + return ( + (is_a_numeric_array and is_b_scalar_string_like) + or (is_b_numeric_array and is_a_scalar_string_like) + or (is_a_numeric_array and is_b_string_array) + or (is_b_numeric_array and is_a_string_array) + ) def is_datetimelike_v_numeric(a, b): @@ -1428,9 +1481,9 @@ def is_datetimelike_v_numeric(a, b): False """ - if not hasattr(a, 'dtype'): + if not hasattr(a, "dtype"): a = np.asarray(a) - if not hasattr(b, 'dtype'): + if not hasattr(b, "dtype"): b = np.asarray(b) def is_numeric(x): @@ -1440,8 +1493,9 @@ def is_numeric(x): return is_integer_dtype(x) or is_float_dtype(x) is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_numeric(b)) or - (is_datetimelike(b) and is_numeric(a))) + return (is_datetimelike(a) and is_numeric(b)) or ( + is_datetimelike(b) and is_numeric(a) + ) def is_datetimelike_v_object(a, b): @@ -1487,14 +1541,15 @@ def is_datetimelike_v_object(a, b): False """ - if not hasattr(a, 'dtype'): + if not hasattr(a, "dtype"): a = np.asarray(a) - if not hasattr(b, 'dtype'): + if not hasattr(b, "dtype"): b = np.asarray(b) is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_object_dtype(b)) or - (is_datetimelike(b) and is_object_dtype(a))) + return (is_datetimelike(a) and is_object_dtype(b)) or ( + is_datetimelike(b) and is_object_dtype(a) + ) def needs_i8_conversion(arr_or_dtype): @@ -1534,9 +1589,11 @@ def needs_i8_conversion(arr_or_dtype): if arr_or_dtype is None: return False - return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype) or - is_period_dtype(arr_or_dtype)) + return ( + is_datetime_or_timedelta_dtype(arr_or_dtype) + or is_datetime64tz_dtype(arr_or_dtype) + or is_period_dtype(arr_or_dtype) + ) def is_numeric_dtype(arr_or_dtype): @@ -1578,7 +1635,8 @@ def is_numeric_dtype(arr_or_dtype): """ return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)) + arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_) + ) def is_string_like_dtype(arr_or_dtype): @@ -1610,8 +1668,7 @@ def 
is_string_like_dtype(arr_or_dtype): False """ - return _is_dtype( - arr_or_dtype, lambda dtype: dtype.kind in ('S', 'U')) + return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) def is_float_dtype(arr_or_dtype): @@ -1705,10 +1762,9 @@ def is_bool_dtype(arr_or_dtype): # we don't have a boolean Index class # so its object, we need to infer to # guess this - return (arr_or_dtype.is_object and - arr_or_dtype.inferred_type == 'boolean') + return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) return dtype._is_boolean return issubclass(dtype.type, np.bool_) @@ -1818,9 +1874,8 @@ def is_extension_array_dtype(arr_or_dtype): >>> is_extension_array_dtype(arr.dtype) False """ - dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) - return (isinstance(dtype, ExtensionDtype) or - registry.find(dtype) is not None) + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None def is_complex_dtype(arr_or_dtype): @@ -1911,7 +1966,7 @@ def _get_dtype(arr_or_dtype): return np.dtype(arr_or_dtype) # if we have an array-like - elif hasattr(arr_or_dtype, 'dtype'): + elif hasattr(arr_or_dtype, "dtype"): arr_or_dtype = arr_or_dtype.dtype return pandas_dtype(arr_or_dtype) @@ -1944,7 +1999,7 @@ def _is_dtype_type(arr_or_dtype, condition): return condition(np.dtype(arr_or_dtype).type) # if we have an array-like - if hasattr(arr_or_dtype, 'dtype'): + if hasattr(arr_or_dtype, "dtype"): arr_or_dtype = arr_or_dtype.dtype # we are not possibly a dtype @@ -2005,13 +2060,13 @@ def infer_dtype_from_object(dtype): # TODO(jreback) # should deprecate these - if dtype in ['datetimetz', 'datetime64tz']: + if dtype in ["datetimetz", "datetime64tz"]: return DatetimeTZDtype.type - elif dtype in ['period']: + elif dtype in ["period"]: raise NotImplementedError - if dtype == 'datetime' or dtype == 'timedelta': - dtype += '64' + if dtype == "datetime" or dtype == "timedelta": + dtype += "64" try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): @@ -2045,9 +2100,9 @@ def _validate_date_like_dtype(dtype): try: typ = np.datetime_data(dtype)[0] except ValueError as e: - raise TypeError('{error}'.format(error=e)) - if typ != 'generic' and typ != 'ns': - msg = '{name!r} is too specific of a frequency, try passing {type!r}' + raise TypeError("{error}".format(error=e)) + if typ != "generic" and typ != "ns": + msg = "{name!r} is too specific of a frequency, try passing {type!r}" raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__)) @@ -2086,19 +2141,18 @@ def pandas_dtype(dtype): # we don't want to force a repr of the non-string if not isinstance(dtype, str): raise TypeError("data type not understood") - raise TypeError("data type '{}' not understood".format( - dtype)) + raise TypeError("data type '{}' not understood".format(dtype)) # Any invalid dtype (such as pd.Timestamp) should raise an error. # np.dtype(invalid_type).kind = 0 for such objects. However, this will # also catch some valid dtypes such as object, np.object_ and 'object' # which we safeguard against by catching them earlier and returning # np.dtype(valid_dtype) before this condition is evaluated. 
- if is_hashable(dtype) and dtype in [object, np.object_, 'object', 'O']: + if is_hashable(dtype) and dtype in [object, np.object_, "object", "O"]: # check hashability to avoid errors/DeprecationWarning when we get # here and `dtype` is an array return npdtype - elif npdtype.kind == 'O': + elif npdtype.kind == "O": raise TypeError("dtype '{}' not understood".format(dtype)) return npdtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 66f7a6365fe416..ac74ad5726a992 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,12 +7,27 @@ from pandas._libs import tslib, tslibs from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_object_dtype, is_sparse, is_timedelta64_dtype) + _NS_DTYPE, + _TD_DTYPE, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_object_dtype, + is_sparse, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, - ABCRangeIndex, ABCSparseDataFrame, ABCTimedeltaIndex) + ABCDatetimeArray, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodIndex, + ABCRangeIndex, + ABCSparseDataFrame, + ABCTimedeltaIndex, +) def get_dtype_kinds(l): @@ -31,23 +46,23 @@ def get_dtype_kinds(l): dtype = arr.dtype if is_categorical_dtype(dtype): - typ = 'category' + typ = "category" elif is_sparse(arr): - typ = 'sparse' + typ = "sparse" elif isinstance(arr, ABCRangeIndex): - typ = 'range' + typ = "range" elif is_datetime64tz_dtype(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) elif is_datetime64_dtype(dtype): - typ = 'datetime' + typ = "datetime" elif is_timedelta64_dtype(dtype): - typ = 'timedelta' + typ = "timedelta" elif is_object_dtype(dtype): - typ = 'object' + typ = "object" elif is_bool_dtype(dtype): - typ = 'bool' + typ = "bool" elif is_extension_array_dtype(dtype): typ = str(arr.dtype) else: @@ -66,8 +81,7 @@ def _get_series_result_type(result, objs=None): # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 - if all(isinstance(c, (SparseSeries, SparseDataFrame)) - for c in result.values()): + if all(isinstance(c, (SparseSeries, SparseDataFrame)) for c in result.values()): return SparseDataFrame else: return DataFrame @@ -83,13 +97,12 @@ def _get_frame_result_type(result, objs): otherwise, return 1st obj """ - if (result.blocks and ( - any(isinstance(obj, ABCSparseDataFrame) for obj in objs))): + if result.blocks and (any(isinstance(obj, ABCSparseDataFrame) for obj in objs)): from pandas.core.sparse.api import SparseDataFrame + return SparseDataFrame else: - return next(obj for obj in objs if not isinstance(obj, - ABCSparseDataFrame)) + return next(obj for obj in objs if not isinstance(obj, ABCSparseDataFrame)) def _concat_compat(to_concat, axis=0): @@ -125,24 +138,24 @@ def is_nonempty(x): # np.concatenate which has them both implemented is compiled. 
typs = get_dtype_kinds(to_concat) - _contains_datetime = any(typ.startswith('datetime') for typ in typs) - _contains_period = any(typ.startswith('period') for typ in typs) + _contains_datetime = any(typ.startswith("datetime") for typ in typs) + _contains_period = any(typ.startswith("period") for typ in typs) - if 'category' in typs: + if "category" in typs: # this must be prior to _concat_datetime, # to support Categorical + datetime-like return _concat_categorical(to_concat, axis=axis) - elif _contains_datetime or 'timedelta' in typs or _contains_period: + elif _contains_datetime or "timedelta" in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) # these are mandated to handle empties as well - elif 'sparse' in typs: + elif "sparse" in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) all_empty = all(not is_nonempty(x) for x in to_concat) if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1: - to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat] + to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] if all_empty: # we have all empties, but may need to coerce the result dtype to @@ -151,13 +164,12 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) if len(typs) != 1: - if (not len(typs - {'i', 'u', 'f'}) or - not len(typs - {'bool', 'i', 'u'})): + if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): # let numpy coerce pass else: # coerce to object - to_concat = [x.astype('object') for x in to_concat] + to_concat = [x.astype("object") for x in to_concat] return np.concatenate(to_concat, axis=axis) @@ -194,9 +206,14 @@ def _concat_categorical(to_concat, axis=0): return union_categoricals(categoricals) # extract the categoricals & coerce to object if needed - to_concat = [x._internal_get_values() if is_categorical_dtype(x.dtype) - else np.asarray(x).ravel() if not is_datetime64tz_dtype(x) - else np.asarray(x.astype(object)) for x in to_concat] + to_concat = [ + x._internal_get_values() + if is_categorical_dtype(x.dtype) + else np.asarray(x).ravel() + if not is_datetime64tz_dtype(x) + else np.asarray(x.astype(object)) + for x in to_concat + ] result = _concat_compat(to_concat) if axis == 1: result = result.reshape(1, len(result)) @@ -309,7 +326,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): from pandas.core.arrays.categorical import _recode_for_categories if len(to_union) == 0: - raise ValueError('No Categoricals to union') + raise ValueError("No Categoricals to union") def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): @@ -322,8 +339,10 @@ def _maybe_unwrap(x): to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] - if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) - for other in to_union[1:]): + if not all( + is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:] + ): raise TypeError("dtype of categories must be the same") ordered = False @@ -332,25 +351,26 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) - for other in to_union[1:]): + if all(first.categories.equals(other.categories) for other in to_union[1:]): new_codes = np.concatenate([c.codes for c in to_union]) else: - codes = [first.codes] + [_recode_for_categories(other.codes, - other.categories, - first.categories) - for other in to_union[1:]] + codes = [first.codes] + [ + _recode_for_categories(other.codes, 
other.categories, first.categories) + for other in to_union[1:] + ] new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: - raise TypeError("Cannot use sort_categories=True with " - "ordered Categoricals") + raise TypeError( + "Cannot use sort_categories=True with " "ordered Categoricals" + ) if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d + new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode @@ -359,23 +379,22 @@ def _maybe_unwrap(x): if sort_categories: categories = categories.sort_values() - new_codes = [_recode_for_categories(c.codes, c.categories, categories) - for c in to_union] + new_codes = [ + _recode_for_categories(c.codes, c.categories, categories) for c in to_union + ] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): - msg = ("to union ordered Categoricals, " - "all categories must be the same") + msg = "to union ordered Categoricals, " "all categories must be the same" raise TypeError(msg) else: - raise TypeError('Categorical.ordered must be the same') + raise TypeError("Categorical.ordered must be the same") if ignore_order: ordered = False - return Categorical(new_codes, categories=categories, ordered=ordered, - fastpath=True) + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) def _concatenate_2d(to_concat, axis): @@ -406,14 +425,14 @@ def _concat_datetime(to_concat, axis=0, typs=None): # multiple types, need to coerce to object if len(typs) != 1: - return _concatenate_2d([_convert_datetimelike_to_object(x) - for x in to_concat], - axis=axis) + return _concatenate_2d( + [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis + ) # must be single dtype - if any(typ.startswith('datetime') for typ in typs): + if any(typ.startswith("datetime") for typ in typs): - if 'datetime' in typs: + if "datetime" in typs: to_concat = [x.astype(np.int64, copy=False) for x in to_concat] return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE) else: @@ -421,11 +440,12 @@ def _concat_datetime(to_concat, axis=0, typs=None): # thus no need to care return _concat_datetimetz(to_concat) - elif 'timedelta' in typs: - return _concatenate_2d([x.view(np.int64) for x in to_concat], - axis=axis).view(_TD_DTYPE) + elif "timedelta" in typs: + return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view( + _TD_DTYPE + ) - elif any(typ.startswith('period') for typ in typs): + elif any(typ.startswith("period") for typ in typs): assert len(typs) == 1 cls = to_concat[0] new_values = cls._concat_same_type(to_concat) @@ -437,12 +457,11 @@ def _convert_datetimelike_to_object(x): # if dtype is of datetimetz or timezone if x.dtype.kind == _NS_DTYPE.kind: - if getattr(x, 'tz', None) is not None: + if getattr(x, "tz", None) is not None: x = np.asarray(x.astype(object)) else: shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), - box="timestamp") + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp") x = x.reshape(shape) elif x.dtype == _TD_DTYPE: @@ -483,17 +502,14 @@ def _concat_index_asobject(to_concat, name=None): from pandas import Index from pandas.core.arrays import ExtensionArray - klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, - 
ExtensionArray) - to_concat = [x.astype(object) if isinstance(x, klasses) else x - for x in to_concat] + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray) + to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] self = to_concat[0] attribs = self._get_attributes_dict() - attribs['name'] = name + attribs["name"] = name - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] + to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) @@ -516,14 +532,16 @@ def _concat_sparse(to_concat, axis=0, typs=None): from pandas.core.arrays import SparseArray - fill_values = [x.fill_value for x in to_concat - if isinstance(x, SparseArray)] + fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] fill_value = fill_values[0] # TODO: Fix join unit generation so we aren't passed this. - to_concat = [x if isinstance(x, SparseArray) - else SparseArray(x.squeeze(), fill_value=fill_value) - for x in to_concat] + to_concat = [ + x + if isinstance(x, SparseArray) + else SparseArray(x.squeeze(), fill_value=fill_value) + for x in to_concat + ] return SparseArray._concat_same_type(to_concat) @@ -557,8 +575,9 @@ def _concat_rangeindex_same_dtype(indexes): return _concat_index_same_dtype(indexes, klass=Int64Index) step = rng.start - start - non_consecutive = ((step != rng.step and len(rng) > 1) or - (next_ is not None and rng.start != next_)) + non_consecutive = (step != rng.step and len(rng) > 1) or ( + next_ is not None and rng.start != next_ + ) if non_consecutive: return _concat_index_same_dtype(indexes, klass=Int64Index) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8d910a16e32ab..1cf452b4a6c2c6 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,8 +9,7 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import NaT, Period, Timestamp, timezones -from pandas.core.dtypes.generic import ( - ABCCategoricalIndex, ABCDateOffset, ABCIndexClass) +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass from .base import ExtensionDtype from .inference import is_list_like @@ -25,8 +24,7 @@ OrderedType = Union[None, bool, object] -def register_extension_dtype(cls: Type[ExtensionDtype], - ) -> Type[ExtensionDtype]: +def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]: """ Register an ExtensionType with pandas as class decorator. @@ -67,6 +65,7 @@ class Registry: Multiple extension types can be registered. These are tried in order. 
""" + def __init__(self): self.dtypes = [] # type: List[Type[ExtensionDtype]] @@ -81,9 +80,9 @@ def register(self, dtype: Type[ExtensionDtype]) -> None: self.dtypes.append(dtype) - def find(self, - dtype: Union[Type[ExtensionDtype], str], - ) -> Optional[Type[ExtensionDtype]]: + def find( + self, dtype: Union[Type[ExtensionDtype], str] + ) -> Optional[Type[ExtensionDtype]]: """ Parameters ---------- @@ -120,6 +119,7 @@ class PandasExtensionDtype(ExtensionDtype): THIS IS NOT A REAL NUMPY DTYPE """ + type = None # type: Any kind = None # type: Any # The Any type annotations above are here only because mypy seems to have a @@ -149,8 +149,7 @@ def __repr__(self) -> str_type: return str(self) def __hash__(self) -> int: - raise NotImplementedError("sub-classes should implement an __hash__ " - "method") + raise NotImplementedError("sub-classes should implement an __hash__ " "method") def __getstate__(self) -> Dict[str_type, Any]: # pickle support; we don't want to pickle the cache @@ -166,6 +165,7 @@ class CategoricalDtypeType(type): """ the type of CategoricalDtype, this metaclass determines subclass ability """ + pass @@ -212,35 +212,31 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): dtype: category Categories (2, object): [b < a] """ + # TODO: Document public vs. private API - name = 'category' + name = "category" type = CategoricalDtypeType # type: Type[CategoricalDtypeType] - kind = 'O' # type: str_type - str = '|O08' - base = np.dtype('O') - _metadata = ('categories', 'ordered') + kind = "O" # type: str_type + str = "|O08" + base = np.dtype("O") + _metadata = ("categories", "ordered") _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, - categories=None, - ordered: OrderedType = ordered_sentinel): + def __init__(self, categories=None, ordered: OrderedType = ordered_sentinel): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, - categories=None, - ordered: Optional[bool] = None - ) -> 'CategoricalDtype': + def _from_fastpath( + cls, categories=None, ordered: Optional[bool] = None + ) -> "CategoricalDtype": self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @classmethod - def _from_categorical_dtype(cls, - dtype: 'CategoricalDtype', - categories=None, - ordered: OrderedType = None, - ) -> 'CategoricalDtype': + def _from_categorical_dtype( + cls, dtype: "CategoricalDtype", categories=None, ordered: OrderedType = None + ) -> "CategoricalDtype": if categories is ordered is None: return dtype if categories is None: @@ -250,12 +246,13 @@ def _from_categorical_dtype(cls, return cls(categories, ordered) @classmethod - def _from_values_or_dtype(cls, - values=None, - categories=None, - ordered: Optional[bool] = None, - dtype: Optional['CategoricalDtype'] = None, - ) -> 'CategoricalDtype': + def _from_values_or_dtype( + cls, + values=None, + categories=None, + ordered: Optional[bool] = None, + dtype: Optional["CategoricalDtype"] = None, + ) -> "CategoricalDtype": """ Construct dtype from the input parameters used in :class:`Categorical`. 
@@ -316,19 +313,21 @@ def _from_values_or_dtype(cls, if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): - if dtype == 'category': + if dtype == "category": dtype = CategoricalDtype(categories, ordered) else: msg = "Unknown dtype {dtype!r}" raise ValueError(msg.format(dtype=dtype)) elif categories is not None or ordered is not None: - raise ValueError("Cannot specify `categories` or `ordered` " - "together with `dtype`.") + raise ValueError( + "Cannot specify `categories` or `ordered` " "together with `dtype`." + ) elif is_categorical(values): # If no "dtype" was passed, use the one from "values", but honor # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) + dtype = values.dtype._from_categorical_dtype( + values.dtype, categories, ordered + ) else: # If dtype=None and values is not categorical, create a new dtype. # Note: This could potentially have categories=None and @@ -337,18 +336,15 @@ def _from_values_or_dtype(cls, return dtype - def _finalize(self, - categories, - ordered: OrderedType, - fastpath: bool = False, - ) -> None: + def _finalize( + self, categories, ordered: OrderedType, fastpath: bool = False + ) -> None: if ordered is not None and ordered is not ordered_sentinel: self.validate_ordered(ordered) if categories is not None: - categories = self.validate_categories(categories, - fastpath=fastpath) + categories = self.validate_categories(categories, fastpath=fastpath) self._categories = categories self._ordered = ordered if ordered is not ordered_sentinel else None @@ -358,8 +354,8 @@ def __setstate__(self, state: Dict[str_type, Any]) -> None: # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._categories = state.pop('categories', None) - self._ordered = state.pop('ordered', False) + self._categories = state.pop("categories", None) + self._ordered = state.pop("ordered", False) def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative @@ -389,7 +385,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')): + elif not (hasattr(other, "_ordered") and hasattr(other, "categories")): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -401,8 +397,9 @@ def __eq__(self, other: Any) -> bool: elif self._ordered or other._ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return ((self._ordered == other._ordered) and - self.categories.equals(other.categories)) + return (self._ordered == other._ordered) and self.categories.equals( + other.categories + ) else: # Neither has ordered=True; equal if both have the same categories, # but same order is not necessary. 
There is no distinction between @@ -411,7 +408,7 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self): - tpl = 'CategoricalDtype(categories={}ordered={})' + tpl = "CategoricalDtype(categories={}ordered={})" if self.categories is None: data = "None, " else: @@ -421,7 +418,9 @@ def __repr__(self): @staticmethod def _hash_categories(categories, ordered: OrderedType = True) -> int: from pandas.core.util.hashing import ( - hash_array, _combine_hash_arrays, hash_tuples + hash_array, + _combine_hash_arrays, + hash_tuples, ) from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE @@ -432,7 +431,7 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int: categories = list(categories) # breaks if a np.array of categories cat_array = hash_tuples(categories) else: - if categories.dtype == 'O': + if categories.dtype == "O": if len({type(x) for x in categories}) != 1: # TODO: hash_array doesn't handle mixed types. It casts # everything to a str first, which means we treat @@ -447,13 +446,12 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int: cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: - cat_array = np.vstack([ - cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) - ]) + cat_array = np.vstack( + [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] + ) else: cat_array = [cat_array] - hashed = _combine_hash_arrays(iter(cat_array), - num_items=len(cat_array)) + hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod @@ -466,6 +464,7 @@ def construct_array_type(cls): type """ from pandas import Categorical + return Categorical @staticmethod @@ -485,6 +484,7 @@ def validate_ordered(ordered: OrderedType) -> None: If 'ordered' is not a boolean. 
""" from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @@ -514,17 +514,17 @@ def validate_categories(categories, fastpath: bool = False): if not fastpath: if categories.hasnans: - raise ValueError('Categorial categories cannot be null') + raise ValueError("Categorial categories cannot be null") if not categories.is_unique: - raise ValueError('Categorical categories must be unique') + raise ValueError("Categorical categories must be unique") if isinstance(categories, ABCCategoricalIndex): categories = categories.categories return categories - def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': + def update_dtype(self, dtype: "CategoricalDtype") -> "CategoricalDtype": """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -537,12 +537,14 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': ------- new_dtype : CategoricalDtype """ - if isinstance(dtype, str) and dtype == 'category': + if isinstance(dtype, str) and dtype == "category": # dtype='category' should not change anything return self elif not self.is_dtype(dtype): - msg = ('a CategoricalDtype must be passed to perform an update, ' - 'got {dtype!r}').format(dtype=dtype) + msg = ( + "a CategoricalDtype must be passed to perform an update, " + "got {dtype!r}" + ).format(dtype=dtype) raise ValueError(msg) # dtype is CDT: keep current categories/ordered if None @@ -557,11 +559,13 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': new_ordered = self._ordered if self._ordered and new_ordered_from_sentinel: # only warn if we'd actually change the existing behavior - msg = ("Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version, which will cause the resulting categorical's " - "`ordered` attribute to change to False; `ordered=True`" - " must be explicitly passed in order to be retained") + msg = ( + "Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version, which will cause the resulting categorical's " + "`ordered` attribute to change to False; `ordered=True`" + " must be explicitly passed in order to be retained" + ) warnings.warn(msg, FutureWarning, stacklevel=3) return CategoricalDtype(new_categories, new_ordered) @@ -582,9 +586,11 @@ def ordered(self) -> OrderedType: if self._ordered_from_sentinel and self._ordered is None: # warn when accessing ordered if ordered=None and None was not # explicitly passed to the constructor - msg = ("Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version; `ordered=None` must be explicitly passed.") + msg = ( + "Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version; `ordered=None` must be explicitly passed." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @@ -632,13 +638,14 @@ class DatetimeTZDtype(PandasExtensionDtype): >>> pd.DatetimeTZDtype(tz='dateutil/US/Central') datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] """ + type = Timestamp # type: Type[Timestamp] - kind = 'M' # type: str_type - str = '|M8[ns]' + kind = "M" # type: str_type + str = "|M8[ns]" num = 101 - base = np.dtype('M8[ns]') + base = np.dtype("M8[ns]") na_value = NaT - _metadata = ('unit', 'tz') + _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] @@ -646,7 +653,7 @@ def __init__(self, unit="ns", tz=None): if isinstance(unit, DatetimeTZDtype): unit, tz = unit.unit, unit.tz - if unit != 'ns': + if unit != "ns": if isinstance(unit, str) and tz is None: # maybe a string like datetime64[ns, tz], which we support for # now. @@ -697,6 +704,7 @@ def construct_array_type(cls): type """ from pandas.core.arrays import DatetimeArray + return DatetimeArray @classmethod @@ -722,7 +730,7 @@ def construct_from_string(cls, string): match = cls._match.match(string) if match: d = match.groupdict() - return cls(unit=d['unit'], tz=d['tz']) + return cls(unit=d["unit"], tz=d["tz"]) except Exception: # TODO(py3): Change this pass to `raise TypeError(msg) from e` pass @@ -747,16 +755,18 @@ def __eq__(self, other): if isinstance(other, str): return other == self.name - return (isinstance(other, DatetimeTZDtype) and - self.unit == other.unit and - str(self.tz) == str(other.tz)) + return ( + isinstance(other, DatetimeTZDtype) + and self.unit == other.unit + and str(self.tz) == str(other.tz) + ) def __setstate__(self, state): # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._tz = state['tz'] - self._unit = state['unit'] + self._tz = state["tz"] + self._unit = state["unit"] @register_extension_dtype @@ -787,12 +797,13 @@ class PeriodDtype(PandasExtensionDtype): >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd()) period[M] """ + type = Period # type: Type[Period] - kind = 'O' # type: str_type - str = '|O08' - base = np.dtype('O') + kind = "O" # type: str_type + str = "|O08" + base = np.dtype("O") num = 102 - _metadata = ('freq',) + _metadata = ("freq",) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] @@ -833,11 +844,12 @@ def freq(self): @classmethod def _parse_dtype_strict(cls, freq): if isinstance(freq, str): - if freq.startswith('period[') or freq.startswith('Period['): + if freq.startswith("period[") or freq.startswith("Period["): m = cls._match.search(freq) if m is not None: - freq = m.group('freq') + freq = m.group("freq") from pandas.tseries.frequencies import to_offset + freq = to_offset(freq) if freq is not None: return freq @@ -850,10 +862,11 @@ def construct_from_string(cls, string): Strict construction from a string, raise a TypeError if not possible """ - if (isinstance(string, str) and - (string.startswith('period[') or - string.startswith('Period[')) or - isinstance(string, ABCDateOffset)): + if ( + isinstance(string, str) + and (string.startswith("period[") or string.startswith("Period[")) + or isinstance(string, ABCDateOffset) + ): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -887,7 +900,7 @@ def __setstate__(self, state): # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._freq = state['freq'] + self._freq = state["freq"] @classmethod def is_dtype(cls, dtype): @@ -899,7 +912,7 @@ def is_dtype(cls, dtype): if isinstance(dtype, str): # PeriodDtype can be instantiated from freq string like "U", # but doesn't regard freq str like "U" as dtype. - if dtype.startswith('period[') or dtype.startswith('Period['): + if dtype.startswith("period[") or dtype.startswith("Period["): try: if cls._parse_dtype_strict(dtype) is not None: return True @@ -943,18 +956,22 @@ class IntervalDtype(PandasExtensionDtype): >>> pd.IntervalDtype(subtype='int64') interval[int64] """ - name = 'interval' + + name = "interval" kind = None # type: Optional[str_type] - str = '|O08' - base = np.dtype('O') + str = "|O08" + base = np.dtype("O") num = 103 - _metadata = ('subtype',) + _metadata = ("subtype",) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] def __new__(cls, subtype=None): from pandas.core.dtypes.common import ( - is_categorical_dtype, is_string_dtype, pandas_dtype) + is_categorical_dtype, + is_string_dtype, + pandas_dtype, + ) if isinstance(subtype, IntervalDtype): return subtype @@ -964,14 +981,13 @@ def __new__(cls, subtype=None): u = object.__new__(cls) u._subtype = None return u - elif (isinstance(subtype, str) and - subtype.lower() == 'interval'): + elif isinstance(subtype, str) and subtype.lower() == "interval": subtype = None else: if isinstance(subtype, str): m = cls._match.search(subtype) if m is not None: - subtype = m.group('subtype') + subtype = m.group("subtype") try: subtype = pandas_dtype(subtype) @@ -980,8 +996,10 @@ def __new__(cls, subtype=None): if is_categorical_dtype(subtype) or is_string_dtype(subtype): # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalDtype') + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalDtype" + ) raise TypeError(msg) try: @@ -1009,6 +1027,7 @@ def construct_array_type(cls): type """ from pandas.core.arrays import IntervalArray + return IntervalArray @classmethod @@ -1021,13 +1040,14 @@ def construct_from_string(cls, string): msg = "a string needs to be passed, got type {typ}" raise TypeError(msg.format(typ=type(string))) - if (string.lower() == 'interval' or - cls._match.search(string) is not None): + if string.lower() == "interval" or cls._match.search(string) is not None: return cls(string) - msg = ('Incorrectly formatted string passed to constructor. ' - 'Valid formats include Interval or Interval[dtype] ' - 'where dtype is numeric, datetime, or timedelta') + msg = ( + "Incorrectly formatted string passed to constructor. " + "Valid formats include Interval or Interval[dtype] " + "where dtype is numeric, datetime, or timedelta" + ) raise TypeError(msg) @property @@ -1053,13 +1073,14 @@ def __eq__(self, other): return True else: from pandas.core.dtypes.common import is_dtype_equal + return is_dtype_equal(self.subtype, other.subtype) def __setstate__(self, state): # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._subtype = state['subtype'] + self._subtype = state["subtype"] @classmethod def is_dtype(cls, dtype): @@ -1069,7 +1090,7 @@ def is_dtype(cls, dtype): """ if isinstance(dtype, str): - if dtype.lower().startswith('interval'): + if dtype.lower().startswith("interval"): try: if cls.construct_from_string(dtype) is not None: return True diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 86aff93dfde143..de41644f09b66f 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -6,76 +6,79 @@ def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): - return getattr(inst, attr, '_typ') in comp + return getattr(inst, attr, "_typ") in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) - meta = type("ABCBase", (type, ), dct) + meta = type("ABCBase", (type,), dct) return meta(name, tuple(), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) -ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", - ("int64index", )) -ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", - ("uint64index", )) -ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", - ("rangeindex", )) -ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", - ("float64index", )) -ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", - ("multiindex", )) -ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", - ("datetimeindex", )) -ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", - ("timedeltaindex", )) -ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", - ("periodindex", )) -ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", - ("categoricalindex", )) -ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ", - ("intervalindex", )) -ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", - ("index", "int64index", "rangeindex", - "float64index", "uint64index", - "multiindex", "datetimeindex", - "timedeltaindex", "periodindex", - "categoricalindex", "intervalindex")) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type( + "ABCDatetimeIndex", "_typ", ("datetimeindex",) +) +ABCTimedeltaIndex = create_pandas_abc_type( + "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) +) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type( + "ABCCategoricalIndex", "_typ", ("categoricalindex",) +) +ABCIntervalIndex = create_pandas_abc_type( + "ABCIntervalIndex", "_typ", ("intervalindex",) +) +ABCIndexClass = create_pandas_abc_type( + "ABCIndexClass", + "_typ", + ( + "index", + "int64index", + "rangeindex", + "float64index", + "uint64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex", + "intervalindex", + ), +) -ABCSeries = 
create_pandas_abc_type("ABCSeries", "_typ", ("series", )) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) -ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp", - ("sparse_frame", )) -ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", - ('sparse_series', - 'sparse_time_series')) -ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", - ('sparse_array', 'sparse_series')) -ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", - ("categorical")) -ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", - ("datetimearray")) -ABCTimedeltaArray = create_pandas_abc_type("ABCTimedeltaArray", "_typ", - ("timedeltaarray")) -ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", - ("periodarray", )) -ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", )) -ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", - ("dateoffset",)) -ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) -ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", - ("extension", - "categorical", - "periodarray", - "datetimearray", - "timedeltaarray", - )) -ABCPandasArray = create_pandas_abc_type("ABCPandasArray", - "_typ", - ("npy_extension",)) +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +ABCSparseDataFrame = create_pandas_abc_type( + "ABCSparseDataFrame", "_subtyp", ("sparse_frame",) +) +ABCSparseSeries = create_pandas_abc_type( + "ABCSparseSeries", "_subtyp", ("sparse_series", "sparse_time_series") +) +ABCSparseArray = create_pandas_abc_type( + "ABCSparseArray", "_subtyp", ("sparse_array", "sparse_series") +) +ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) +ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) +ABCTimedeltaArray = create_pandas_abc_type( + "ABCTimedeltaArray", "_typ", ("timedeltaarray") +) +ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)) +ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) +ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) +ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",)) +ABCExtensionArray = create_pandas_abc_type( + "ABCExtensionArray", + "_typ", + ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), +) +ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) class _ABCGeneric(type): - def __instancecheck__(cls, inst): return hasattr(inst, "_data") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 02ee777bbe7f3c..9373ea18e8a24b 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -143,10 +143,10 @@ def is_iterator(obj): False """ - if not hasattr(obj, '__iter__'): + if not hasattr(obj, "__iter__"): return False - return hasattr(obj, '__next__') + return hasattr(obj, "__next__") def is_file_like(obj): @@ -180,7 +180,7 @@ def is_file_like(obj): False """ - if not (hasattr(obj, 'read') or hasattr(obj, 'write')): + if not (hasattr(obj, "read") or hasattr(obj, "write")): return False if not hasattr(obj, "__iter__"): @@ -281,15 +281,18 @@ def is_list_like(obj, allow_sets=True): False """ - return (isinstance(obj, abc.Iterable) and - # we do not count strings/unicode/bytes as list-like - not isinstance(obj, 
(str, bytes)) and - - # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0) and - - # exclude sets if allow_sets is False - not (allow_sets is False and isinstance(obj, abc.Set))) + return ( + isinstance(obj, abc.Iterable) + and + # we do not count strings/unicode/bytes as list-like + not isinstance(obj, (str, bytes)) + and + # exclude zero-dimensional numpy arrays, effectively scalars + not (isinstance(obj, np.ndarray) and obj.ndim == 0) + and + # exclude sets if allow_sets is False + not (allow_sets is False and isinstance(obj, abc.Set)) + ) def is_array_like(obj): @@ -365,8 +368,12 @@ def is_nested_list_like(obj): -------- is_list_like """ - return (is_list_like(obj) and hasattr(obj, '__len__') and - len(obj) > 0 and all(is_list_like(item) for item in obj)) + return ( + is_list_like(obj) + and hasattr(obj, "__len__") + and len(obj) > 0 + and all(is_list_like(item) for item in obj) + ) def is_dict_like(obj): @@ -394,9 +401,11 @@ def is_dict_like(obj): True """ dict_like_attrs = ("__getitem__", "keys", "__contains__") - return (all(hasattr(obj, attr) for attr in dict_like_attrs) - # [GH 25196] exclude classes - and not isinstance(obj, type)) + return ( + all(hasattr(obj, attr) for attr in dict_like_attrs) + # [GH 25196] exclude classes + and not isinstance(obj, type) + ) def is_named_tuple(obj): @@ -423,7 +432,7 @@ def is_named_tuple(obj): False """ - return isinstance(obj, tuple) and hasattr(obj, '_fields') + return isinstance(obj, tuple) and hasattr(obj, "_fields") def is_hashable(obj): @@ -489,7 +498,7 @@ def is_sequence(obj): try: iter(obj) # Can iterate over it. - len(obj) # Has a length associated with it. + len(obj) # Has a length associated with it. return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 914a292d3db973..f540e9297738a3 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -8,15 +8,37 @@ from pandas._libs.tslibs import NaT, iNaT from .common import ( - _NS_DTYPE, _TD_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_object_dtype, is_period_dtype, - is_scalar, is_string_dtype, is_string_like_dtype, is_timedelta64_dtype, - needs_i8_conversion, pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_object, + is_bool_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_datetimelike_v_numeric, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_string_dtype, + is_string_like_dtype, + is_timedelta64_dtype, + needs_i8_conversion, + pandas_dtype, +) from .generic import ( - ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass, - ABCMultiIndex, ABCSeries, ABCTimedeltaArray) + ABCDatetimeArray, + ABCExtensionArray, + ABCGeneric, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCTimedeltaArray, +) from .inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar @@ -109,15 +131,23 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, - ABCExtensionArray, - 
ABCDatetimeArray, ABCTimedeltaArray)): + elif isinstance( + obj, + ( + ABCSeries, + np.ndarray, + ABCIndexClass, + ABCExtensionArray, + ABCDatetimeArray, + ABCTimedeltaArray, + ), + ): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) elif isinstance(obj, list): return _isna_ndarraylike(np.asarray(obj, dtype=object)) - elif hasattr(obj, '__array__'): + elif hasattr(obj, "__array__"): return _isna_ndarraylike(np.asarray(obj)) else: return obj is None @@ -145,7 +175,7 @@ def _isna_old(obj): return obj._constructor(obj._data.isna(func=_isna_old)) elif isinstance(obj, list): return _isna_ndarraylike_old(np.asarray(obj, dtype=object)) - elif hasattr(obj, '__array__'): + elif hasattr(obj, "__array__"): return _isna_ndarraylike_old(np.asarray(obj)) else: return obj is None @@ -174,11 +204,12 @@ def _use_inf_as_na(key): programmatically-creating-variables-in-python/4859312#4859312 """ from pandas._config import get_option + flag = get_option(key) if flag: - globals()['_isna'] = _isna_old + globals()["_isna"] = _isna_old else: - globals()['_isna'] = _isna_new + globals()["_isna"] = _isna_new def _isna_ndarraylike(obj): @@ -187,7 +218,7 @@ def _isna_ndarraylike(obj): if not is_extension: # Avoid accessing `.values` on things like # PeriodIndex, which may be expensive. - values = getattr(obj, 'values', obj) + values = getattr(obj, "values", obj) else: values = obj @@ -216,20 +247,19 @@ def _isna_ndarraylike(obj): elif needs_i8_conversion(dtype): # this is the NaT pattern - result = values.view('i8') == iNaT + result = values.view("i8") == iNaT else: result = np.isnan(values) # box if isinstance(obj, ABCSeries): - result = obj._constructor( - result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) return result def _isna_ndarraylike_old(obj): - values = getattr(obj, 'values', obj) + values = getattr(obj, "values", obj) dtype = values.dtype if is_string_dtype(dtype): @@ -245,14 +275,13 @@ def _isna_ndarraylike_old(obj): elif is_datetime64_dtype(dtype): # this is the NaT pattern - result = values.view('i8') == iNaT + result = values.view("i8") == iNaT else: result = ~np.isfinite(values) # box if isinstance(obj, ABCSeries): - result = obj._constructor( - result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) return result @@ -353,8 +382,7 @@ def _isna_compat(arr, fill_value=np.nan): """ dtype = arr.dtype if isna(fill_value): - return not (is_bool_dtype(dtype) or - is_integer_dtype(dtype)) + return not (is_bool_dtype(dtype) or is_integer_dtype(dtype)) return True @@ -402,15 +430,15 @@ def array_equivalent(left, right, strict_nan=False): if not strict_nan: # isna considers NaN and None to be equivalent. 
return lib.array_equivalent_object( - ensure_object(left.ravel()), ensure_object(right.ravel())) + ensure_object(left.ravel()), ensure_object(right.ravel()) + ) for left_value, right_value in zip(left, right): if left_value is NaT and right_value is not NaT: return False elif isinstance(left_value, float) and np.isnan(left_value): - if (not isinstance(right_value, float) or - not np.isnan(right_value)): + if not isinstance(right_value, float) or not np.isnan(right_value): return False else: if left_value != right_value: @@ -434,12 +462,11 @@ def array_equivalent(left, right, strict_nan=False): if not is_dtype_equal(left.dtype, right.dtype): return False - left = left.view('i8') - right = right.view('i8') + left = left.view("i8") + right = right.view("i8") # if we have structured dtypes, compare first - if (left.dtype.type is np.void or - right.dtype.type is np.void): + if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False @@ -457,13 +484,13 @@ def _infer_fill_value(val): val = [val] val = np.array(val, copy=False) if is_datetimelike(val): - return np.array('NaT', dtype=val.dtype) + return np.array("NaT", dtype=val.dtype) elif is_object_dtype(val.dtype): dtype = lib.infer_dtype(ensure_object(val), skipna=False) - if dtype in ['datetime', 'datetime64']: - return np.array('NaT', dtype=_NS_DTYPE) - elif dtype in ['timedelta', 'timedelta64']: - return np.array('NaT', dtype=_TD_DTYPE) + if dtype in ["datetime", "datetime64"]: + return np.array("NaT", dtype=_NS_DTYPE) + elif dtype in ["timedelta", "timedelta64"]: + return np.array("NaT", dtype=_TD_DTYPE) return np.nan @@ -506,8 +533,12 @@ def na_value_for_dtype(dtype, compat=True): if is_extension_array_dtype(dtype): return dtype.na_value - if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_period_dtype(dtype) + ): return NaT elif is_float_dtype(dtype): return np.nan diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0dba7c7b5d2888..a1989fd62b6ee0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -27,44 +27,89 @@ from pandas.compat import PY36, raise_with_traceback from pandas.compat.numpy import function as nv from pandas.util._decorators import ( - Appender, Substitution, deprecate_kwarg, rewrite_axis_style_signature) -from pandas.util._validators import ( - validate_axis_style_args, validate_bool_kwarg) + Appender, + Substitution, + deprecate_kwarg, + rewrite_axis_style_signature, +) +from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg from pandas.core.dtypes.cast import ( - cast_scalar_to_array, coerce_to_dtypes, find_common_type, - infer_dtype_from_scalar, invalidate_string_dtypes, maybe_cast_to_datetime, - maybe_convert_platform, maybe_downcast_to_dtype, - maybe_infer_to_datetimelike, maybe_upcast, maybe_upcast_putmask) + cast_scalar_to_array, + coerce_to_dtypes, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_cast_to_datetime, + maybe_convert_platform, + maybe_downcast_to_dtype, + maybe_infer_to_datetimelike, + maybe_upcast, + maybe_upcast_putmask, +) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_platform_int, infer_dtype_from_object, - is_bool_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_dict_like, is_dtype_equal, is_extension_array_dtype, is_extension_type, - 
is_float_dtype, is_integer, is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, - is_sequence, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_platform_int, + infer_dtype_from_object, + is_bool_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_object_dtype, + is_scalar, + is_sequence, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray -from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin as DatetimeLikeArray) +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import ( - Index, MultiIndex, ensure_index, ensure_index_from_sequences) + Index, + MultiIndex, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import ( - check_bool_indexer, convert_to_index_sliceable, maybe_droplevels) + check_bool_indexer, + convert_to_index_sliceable, + maybe_droplevels, +) from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( - arrays_to_mgr, get_names_from_index, init_dict, init_ndarray, - masked_rec_array_to_mgr, reorder_arrays, sanitize_index, to_arrays) + arrays_to_mgr, + get_names_from_index, + init_dict, + init_ndarray, + masked_rec_array_to_mgr, + reorder_arrays, + sanitize_index, + to_arrays, +) from pandas.core.series import Series from pandas.io.formats import console, format as fmt @@ -75,7 +120,8 @@ # Docstring templates _shared_doc_kwargs = dict( - axes='index, columns', klass='DataFrame', + axes="index, columns", + klass="DataFrame", axes_single_arg="{0 or 'index', 1 or 'columns'}", axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. @@ -91,7 +137,7 @@ .. 
versionchanged:: 0.23.0 Allow specifying index or column level names.""", - versionadded_to_excel='', + versionadded_to_excel="", optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", optional_axis="""axis : int or str, optional @@ -330,9 +376,9 @@ def _constructor(self): return DataFrame _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset([ - 'get_value', 'set_value', 'from_items' - ]) # type: FrozenSet[str] + _deprecations = NDFrame._deprecations | frozenset( + ["get_value", "set_value", "from_items"] + ) # type: FrozenSet[str] _accessors = set() # type: Set[str] @property @@ -342,8 +388,7 @@ def _constructor_expanddim(self): # ---------------------------------------------------------------------- # Constructors - def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False): + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: data = {} if dtype is not None: @@ -353,16 +398,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = data._data if isinstance(data, BlockManager): - mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords + # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = masked_rec_array_to_mgr(data, index, columns, dtype, - copy) + mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) # a masked array else: @@ -373,8 +419,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data[mask] = fill_value else: data = data.copy() - mgr = init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -383,20 +428,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if columns is None: columns = data_columns mgr = init_dict(data, index, columns, dtype=dtype) - elif getattr(data, 'name', None) is not None: - mgr = init_dict({data.name: data}, index, columns, - dtype=dtype) + elif getattr(data, "name", None) is not None: + mgr = init_dict({data.name: data}, index, columns, dtype=dtype) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) # For data is list-like, or Iterable (will consume into list) - elif (isinstance(data, abc.Iterable) and - not isinstance(data, (str, bytes))): + elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): if not isinstance(data, abc.Sequence): data = list(data) if len(data) > 0: - if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: + if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) @@ -411,28 +453,30 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, else: index = ibase.default_index(len(data)) - mgr = arrays_to_mgr(arrays, columns, index, columns, - dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: - mgr = init_ndarray(data, index, columns, 
dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: mgr = init_dict({}, index, columns, dtype=dtype) else: try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as e: - exc = TypeError('DataFrame constructor called with ' - 'incompatible data and dtype: {e}'.format(e=e)) + exc = TypeError( + "DataFrame constructor called with " + "incompatible data and dtype: {e}".format(e=e) + ) raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: - values = cast_scalar_to_array((len(index), len(columns)), - data, dtype=dtype) - mgr = init_ndarray(values, index, columns, - dtype=values.dtype, copy=False) + values = cast_scalar_to_array( + (len(index), len(columns)), data, dtype=dtype + ) + mgr = init_ndarray( + values, index, columns, dtype=values.dtype, copy=False + ) else: - raise ValueError('DataFrame constructor not properly called!') + raise ValueError("DataFrame constructor not properly called!") NDFrame.__init__(self, mgr, fastpath=True) @@ -533,8 +577,9 @@ def _repr_fits_horizontal_(self, ignore_width=False): nb_columns = len(self.columns) # exceed max columns - if ((max_columns and nb_columns > max_columns) or - ((not ignore_width) and width and nb_columns > (width // 2))): + if (max_columns and nb_columns > max_columns) or ( + (not ignore_width) and width and nb_columns > (width // 2) + ): return False # used by repr_html under IPython notebook or scripts ignore terminal @@ -542,8 +587,7 @@ def _repr_fits_horizontal_(self, ignore_width=False): if ignore_width or not console.in_interactive_session(): return True - if (get_option('display.width') is not None or - console.in_ipython_frontend()): + if get_option("display.width") is not None or console.in_ipython_frontend(): # check at least the column row for excessive width max_rows = 1 else: @@ -560,13 +604,13 @@ def _repr_fits_horizontal_(self, ignore_width=False): if not (max_rows is None): # unlimited rows # min of two, where one may be None - d = d.iloc[:min(max_rows, len(d))] + d = d.iloc[: min(max_rows, len(d))] else: return True d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(l) for l in value.split('\n')) + repr_width = max(len(l) for l in value.split("\n")) return repr_width < width @@ -574,9 +618,10 @@ def _info_repr(self): """ True if the repr should show the info view. """ - info_repr_option = (get_option("display.large_repr") == "info") - return info_repr_option and not (self._repr_fits_horizontal_() and - self._repr_fits_vertical_()) + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) def __repr__(self): """ @@ -595,9 +640,14 @@ def __repr__(self): width, _ = console.get_console_size() else: width = None - self.to_string(buf=buf, max_rows=max_rows, min_rows=min_rows, - max_cols=max_cols, line_width=width, - show_dimensions=show_dimensions) + self.to_string( + buf=buf, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + line_width=width, + show_dimensions=show_dimensions, + ) return buf.getvalue() @@ -611,32 +661,52 @@ def _repr_html_(self): buf = StringIO("") self.info(buf=buf) # need to escape the , should be the first line. - val = buf.getvalue().replace('<', r'<', 1) - val = val.replace('>', r'>', 1) - return '
<pre>' + val + '</pre>'
+ val = buf.getvalue().replace("<", r"&lt;", 1)
+ val = val.replace(">", r"&gt;", 1)
+ return "<pre>" + val + "</pre>
" if get_option("display.notebook_repr_html"): max_rows = get_option("display.max_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") - return self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions, notebook=True) + return self.to_html( + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + notebook=True, + ) else: return None - @Substitution(header='Write out the column names. If a list of strings ' - 'is given, it is assumed to be aliases for the ' - 'column names', - col_space_type='int', - col_space='The minimum width of each column') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, - max_rows=None, min_rows=None, max_cols=None, - show_dimensions=False, decimal='.', line_width=None): + @Substitution( + header="Write out the column names. If a list of strings " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int", + col_space="The minimum width of each column", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + min_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + line_width=None, + ): """ Render a DataFrame to a console-friendly tabular output. %(shared_params)s @@ -658,19 +728,26 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, 2 3 6 """ - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - min_rows=min_rows, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, - line_width=line_width) + formatter = fmt.DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + ) formatter.to_string() if buf is None: @@ -690,6 +767,7 @@ def style(self): io.formats.style.Styler """ from pandas.io.formats.style import Styler + return Styler(self) def iteritems(self): @@ -740,7 +818,7 @@ def iteritems(self): koala 80000 Name: population, dtype: int64 """ - if self.columns.is_unique and hasattr(self, '_item_cache'): + if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) else: @@ -966,9 +1044,8 @@ def dot(self, other): """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) - if (len(common) > len(self.columns) or - len(common) > len(other.index)): - raise ValueError('matrices are not aligned') + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") left = self.reindex(columns=common, copy=False) right = 
other.reindex(index=common, copy=False) @@ -979,13 +1056,15 @@ def dot(self, other): lvals = self.values rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: - raise ValueError('Dot product shape mismatch, ' - '{s} vs {r}'.format(s=lvals.shape, - r=rvals.shape)) + raise ValueError( + "Dot product shape mismatch, " + "{s} vs {r}".format(s=lvals.shape, r=rvals.shape) + ) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), index=left.index, - columns=other.columns) + return self._constructor( + np.dot(lvals, rvals), index=left.index, columns=other.columns + ) elif isinstance(other, Series): return Series(np.dot(lvals, rvals), index=left.index) elif isinstance(rvals, (np.ndarray, Index)): @@ -995,7 +1074,7 @@ def dot(self, other): else: return Series(result, index=left.index) else: # pragma: no cover - raise TypeError('unsupported type: {oth}'.format(oth=type(other))) + raise TypeError("unsupported type: {oth}".format(oth=type(other))) def __matmul__(self, other): """ @@ -1013,7 +1092,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient='columns', dtype=None, columns=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None): """ Construct DataFrame from dict of array-like or dicts. @@ -1078,19 +1157,20 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): """ index = None orient = orient.lower() - if orient == 'index': + if orient == "index": if len(data) > 0: # TODO speed up Series case if isinstance(list(data.values())[0], (Series, dict)): data = _from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) - elif orient == 'columns': + elif orient == "columns": if columns is not None: - raise ValueError("cannot use columns parameter with " - "orient='columns'") + raise ValueError( + "cannot use columns parameter with " "orient='columns'" + ) else: # pragma: no cover - raise ValueError('only recognize index or columns for orient') + raise ValueError("only recognize index or columns for orient") return cls(data, index=index, columns=columns, dtype=dtype) @@ -1149,7 +1229,7 @@ def to_numpy(self, dtype=None, copy=False): result = np.array(self.values, dtype=dtype, copy=copy) return result - def to_dict(self, orient='dict', into=dict): + def to_dict(self, orient="dict", into=dict): """ Convert the DataFrame to a dictionary. 
@@ -1239,48 +1319,68 @@ def to_dict(self, orient='dict', into=dict): defaultdict(, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: - warnings.warn("DataFrame columns are not unique, some " - "columns will be omitted.", UserWarning, - stacklevel=2) + warnings.warn( + "DataFrame columns are not unique, some " "columns will be omitted.", + UserWarning, + stacklevel=2, + ) # GH16122 into_c = com.standardize_mapping(into) - if orient.lower().startswith('d'): - return into_c( - (k, v.to_dict(into)) for k, v in self.items()) - elif orient.lower().startswith('l'): + if orient.lower().startswith("d"): + return into_c((k, v.to_dict(into)) for k, v in self.items()) + elif orient.lower().startswith("l"): return into_c((k, v.tolist()) for k, v in self.items()) - elif orient.lower().startswith('sp'): - return into_c((('index', self.index.tolist()), - ('columns', self.columns.tolist()), - ('data', [ - list(map(com.maybe_box_datetimelike, t)) - for t in self.itertuples(index=False, name=None) - ]))) - elif orient.lower().startswith('s'): - return into_c((k, com.maybe_box_datetimelike(v)) - for k, v in self.items()) - elif orient.lower().startswith('r'): + elif orient.lower().startswith("sp"): + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ( + "data", + [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ) + ) + elif orient.lower().startswith("s"): + return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) + elif orient.lower().startswith("r"): columns = self.columns.tolist() - rows = (dict(zip(columns, row)) - for row in self.itertuples(index=False, name=None)) + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) return [ - into_c((k, com.maybe_box_datetimelike(v)) - for k, v in row.items()) - for row in rows] - elif orient.lower().startswith('i'): + into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) + for row in rows + ] + elif orient.lower().startswith("i"): if not self.index.is_unique: - raise ValueError( - "DataFrame index must be unique for orient='index'." - ) - return into_c((t[0], dict(zip(self.columns, t[1:]))) - for t in self.itertuples(name=None)) + raise ValueError("DataFrame index must be unique for orient='index'.") + return into_c( + (t[0], dict(zip(self.columns, t[1:]))) + for t in self.itertuples(name=None) + ) else: raise ValueError("orient '{o}' not understood".format(o=orient)) - def to_gbq(self, destination_table, project_id=None, chunksize=None, - reauth=False, if_exists='fail', auth_local_webserver=False, - table_schema=None, location=None, progress_bar=True, - credentials=None, verbose=None, private_key=None): + def to_gbq( + self, + destination_table, + project_id=None, + chunksize=None, + reauth=False, + if_exists="fail", + auth_local_webserver=False, + table_schema=None, + location=None, + progress_bar=True, + credentials=None, + verbose=None, + private_key=None, + ): """ Write a DataFrame to a Google BigQuery table. @@ -1376,16 +1476,33 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, read_gbq : Read a DataFrame from Google BigQuery. 
""" from pandas.io import gbq - gbq.to_gbq(self, destination_table, project_id=project_id, - chunksize=chunksize, reauth=reauth, if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, location=location, - progress_bar=progress_bar, credentials=credentials, - verbose=verbose, private_key=private_key) + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, + private_key=private_key, + ) @classmethod - def from_records(cls, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + ): """ Convert structured or record ndarray to DataFrame. @@ -1428,7 +1545,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(index=index, columns=columns) dtype = None - if hasattr(first_row, 'dtype') and first_row.dtype.names: + if hasattr(first_row, "dtype") and first_row.dtype.names: dtype = first_row.dtype values = [first_row] @@ -1455,8 +1572,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns, - columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) @@ -1464,8 +1580,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, columns = ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = to_arrays(data, columns, - coerce_float=coerce_float) + arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float) arr_columns = ensure_index(arr_columns) if columns is not None: @@ -1480,8 +1595,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, result_index = None if index is not None: - if (isinstance(index, str) or - not hasattr(index, "__iter__")): + if isinstance(index, str) or not hasattr(index, "__iter__"): i = columns.get_loc(index) exclude.add(index) if len(arrays) > 0: @@ -1490,10 +1604,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, result_index = Index([], name=index) else: try: - index_data = [arrays[arr_columns.get_loc(field)] - for field in index] - result_index = ensure_index_from_sequences(index_data, - names=index) + index_data = [arrays[arr_columns.get_loc(field)] for field in index] + result_index = ensure_index_from_sequences(index_data, names=index) exclude.update(index) except Exception: @@ -1511,8 +1623,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(mgr) - def to_records(self, index=True, convert_datetime64=None, - column_dtypes=None, index_dtypes=None): + def to_records( + self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None + ): """ Convert DataFrame to a NumPy record array. 
@@ -1604,10 +1717,13 @@ def to_records(self, index=True, convert_datetime64=None, """ if convert_datetime64 is not None: - warnings.warn("The 'convert_datetime64' parameter is " - "deprecated and will be removed in a future " - "version", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'convert_datetime64' parameter is " + "deprecated and will be removed in a future " + "version", + FutureWarning, + stacklevel=2, + ) if index: if is_datetime64_any_dtype(self.index) and convert_datetime64: @@ -1619,8 +1735,7 @@ def to_records(self, index=True, convert_datetime64=None, else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c]._internal_get_values() - for c in self.columns] + arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] count = 0 index_names = list(self.index.names) @@ -1628,13 +1743,12 @@ def to_records(self, index=True, convert_datetime64=None, if isinstance(self.index, MultiIndex): for i, n in enumerate(index_names): if n is None: - index_names[i] = 'level_%d' % count + index_names[i] = "level_%d" % count count += 1 elif index_names[0] is None: - index_names = ['index'] + index_names = ["index"] - names = [str(name) for name in itertools.chain(index_names, - self.columns)] + names = [str(name) for name in itertools.chain(index_names, self.columns)] else: arrays = [self[c]._internal_get_values() for c in self.columns] names = [str(c) for c in self.columns] @@ -1687,18 +1801,15 @@ def to_records(self, index=True, convert_datetime64=None, formats.append(dtype_mapping) else: element = "row" if i < index_len else "column" - msg = ("Invalid dtype {dtype} specified for " - "{element} {name}").format(dtype=dtype_mapping, - element=element, name=name) + msg = ( + "Invalid dtype {dtype} specified for " "{element} {name}" + ).format(dtype=dtype_mapping, element=element, name=name) raise ValueError(msg) - return np.rec.fromarrays( - arrays, - dtype={'names': names, 'formats': formats} - ) + return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def from_items(cls, items, columns=None, orient='columns'): + def from_items(cls, items, columns=None, orient="columns"): """ Construct a DataFrame from a list of tuples. @@ -1730,23 +1841,28 @@ def from_items(cls, items, columns=None, orient='columns'): DataFrame """ - warnings.warn("from_items is deprecated. Please use " - "DataFrame.from_dict(dict(items), ...) instead. " - "DataFrame.from_dict(OrderedDict(items)) may be used to " - "preserve the key order.", - FutureWarning, stacklevel=2) + warnings.warn( + "from_items is deprecated. Please use " + "DataFrame.from_dict(dict(items), ...) instead. 
" + "DataFrame.from_dict(OrderedDict(items)) may be used to " + "preserve the key order.", + FutureWarning, + stacklevel=2, + ) keys, values = zip(*items) - if orient == 'columns': + if orient == "columns": if columns is not None: columns = ensure_index(columns) idict = dict(items) if len(idict) < len(items): if not columns.equals(ensure_index(keys)): - raise ValueError('With non-unique item names, passed ' - 'columns must be identical') + raise ValueError( + "With non-unique item names, passed " + "columns must be identical" + ) arrays = values else: arrays = [idict[k] for k in columns if k in idict] @@ -1761,10 +1877,12 @@ def from_items(cls, items, columns=None, orient='columns'): except ValueError: if not is_nested_list_like(values): - raise ValueError('The value in each (key, value) pair ' - 'must be an array, Series, or dict') + raise ValueError( + "The value in each (key, value) pair " + "must be an array, Series, or dict" + ) - elif orient == 'index': + elif orient == "index": if columns is None: raise TypeError("Must pass columns with orient='index'") @@ -1779,8 +1897,10 @@ def from_items(cls, items, columns=None, orient='columns'): except TypeError: if not is_nested_list_like(values): - raise ValueError('The value in each (key, value) pair ' - 'must be an array, Series, or dict') + raise ValueError( + "The value in each (key, value) pair " + "must be an array, Series, or dict" + ) else: # pragma: no cover raise ValueError("'orient' must be either 'columns' or 'index'") @@ -1790,7 +1910,7 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - def to_sparse(self, fill_value=None, kind='block'): + def to_sparse(self, fill_value=None, kind="block"): """ Convert to SparseDataFrame. @@ -1846,21 +1966,39 @@ def to_sparse(self, fill_value=None, kind='block'): >>> type(sdf) # doctest: +SKIP """ - warnings.warn("DataFrame.to_sparse is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame.to_sparse is deprecated and will be removed " + "in a future version", + FutureWarning, + stacklevel=2, + ) from pandas.core.sparse.api import SparseDataFrame + with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="SparseDataFrame") - return SparseDataFrame(self._series, index=self.index, - columns=self.columns, default_kind=kind, - default_fill_value=fill_value) + return SparseDataFrame( + self._series, + index=self.index, + columns=self.columns, + default_kind=kind, + default_fill_value=fill_value, + ) - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def to_stata(self, fname, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None, version=114, - convert_strl=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def to_stata( + self, + fname, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + version=114, + convert_strl=None, + ): """ Export DataFrame object to Stata dta format. 
@@ -1943,20 +2081,29 @@ def to_stata(self, fname, convert_dates=None, write_index=True, """ kwargs = {} if version not in (114, 117): - raise ValueError('Only formats 114 and 117 supported.') + raise ValueError("Only formats 114 and 117 supported.") if version == 114: if convert_strl is not None: - raise ValueError('strl support is only available when using ' - 'format 117') + raise ValueError( + "strl support is only available when using " "format 117" + ) from pandas.io.stata import StataWriter as statawriter else: from pandas.io.stata import StataWriter117 as statawriter - kwargs['convert_strl'] = convert_strl - writer = statawriter(fname, self, convert_dates=convert_dates, - byteorder=byteorder, time_stamp=time_stamp, - data_label=data_label, write_index=write_index, - variable_labels=variable_labels, **kwargs) + kwargs["convert_strl"] = convert_strl + + writer = statawriter( + fname, + self, + convert_dates=convert_dates, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + write_index=write_index, + variable_labels=variable_labels, + **kwargs + ) writer.write_file() def to_feather(self, fname): @@ -1971,10 +2118,18 @@ def to_feather(self, fname): string file path """ from pandas.io.feather_format import to_feather + to_feather(self, fname) - def to_parquet(self, fname, engine='auto', compression='snappy', - index=None, partition_cols=None, **kwargs): + def to_parquet( + self, + fname, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs + ): """ Write a DataFrame to the binary parquet format. @@ -2041,24 +2196,51 @@ def to_parquet(self, fname, engine='auto', compression='snappy', 1 2 4 """ from pandas.io.parquet import to_parquet - to_parquet(self, fname, engine, - compression=compression, index=index, - partition_cols=partition_cols, **kwargs) - - @Substitution(header='Whether to print column labels, default True', - col_space_type='str or int', - col_space='The minimum width of each column in CSS length ' - 'units. An int is assumed to be px units.\n\n' - ' .. versionadded:: 0.25.0\n' - ' Ability to use str') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_html(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, max_rows=None, - max_cols=None, show_dimensions=False, decimal='.', - bold_rows=True, classes=None, escape=True, notebook=False, - border=None, table_id=None, render_links=False): + + to_parquet( + self, + fname, + engine, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs + ) + + @Substitution( + header="Whether to print column labels, default True", + col_space_type="str or int", + col_space="The minimum width of each column in CSS length " + "units. An int is assumed to be px units.\n\n" + " .. versionadded:: 0.25.0\n" + " Ability to use str", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_html( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + bold_rows=True, + classes=None, + escape=True, + notebook=False, + border=None, + table_id=None, + render_links=False, + ): """ Render a DataFrame as an HTML table. 
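A sketch of the to_parquet call wrapped above; the file name is illustrative and either pyarrow or fastparquet must be installed for the write to succeed.

    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    # engine="auto" picks whichever parquet library is available
    df.to_parquet("df.parquet.gzip", engine="auto", compression="gzip")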
%(shared_params)s @@ -2091,23 +2273,31 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, to_string : Convert DataFrame to a string. """ - if (justify is not None and - justify not in fmt._VALID_JUSTIFY_PARAMETERS): + if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - bold_rows=bold_rows, escape=escape, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, table_id=table_id, - render_links=render_links) + formatter = fmt.DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + bold_rows=bold_rows, + escape=escape, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + table_id=table_id, + render_links=render_links, + ) # TODO: a generic formatter wld b in DataFrameFormatter formatter.to_html(classes=classes, notebook=notebook, border=border) @@ -2116,8 +2306,9 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, # ---------------------------------------------------------------------- - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, - null_counts=None): + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ): """ Print a concise summary of a DataFrame. 
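A quick illustration of the info() options handled below, on a made-up frame: verbose per-column output with null counts and deep memory introspection.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
    df.info(verbose=True, null_counts=True, memory_usage="deep")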
@@ -2257,7 +2448,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, lines.append(self.index._summary()) if len(self.columns) == 0: - lines.append('Empty {name}'.format(name=type(self).__name__)) + lines.append("Empty {name}".format(name=type(self).__name__)) fmt.buffer_put_lines(buf, lines) return @@ -2265,21 +2456,18 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, # hack if max_cols is None: - max_cols = get_option('display.max_info_columns', - len(self.columns) + 1) + max_cols = get_option("display.max_info_columns", len(self.columns) + 1) - max_rows = get_option('display.max_info_rows', len(self) + 1) + max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = ((len(self.columns) <= max_cols) and - (len(self) < max_rows)) + show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts exceeds_info_cols = len(self.columns) > max_cols def _verbose_repr(): - lines.append('Data columns (total %d columns):' % - len(self.columns)) + lines.append("Data columns (total %d columns):" % len(self.columns)) space = max(len(pprint_thing(k)) for k in self.columns) + 4 counts = None @@ -2288,9 +2476,11 @@ def _verbose_repr(): counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( - 'Columns must equal counts ' - '({cols:d} != {counts:d})'.format( - cols=len(cols), counts=len(counts))) + "Columns must equal counts " + "({cols:d} != {counts:d})".format( + cols=len(cols), counts=len(counts) + ) + ) tmpl = "{count} non-null {dtype}" dtypes = self.dtypes @@ -2302,22 +2492,24 @@ def _verbose_repr(): if show_counts: count = counts.iloc[i] - lines.append(_put_str(col, space) + tmpl.format(count=count, - dtype=dtype)) + lines.append( + _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + ) def _non_verbose_repr(): - lines.append(self.columns._summary(name='Columns')) + lines.append(self.columns._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): # returns size in human readable format - for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: + for x in ["bytes", "KB", "MB", "GB", "TB"]: if num < 1024.0: - return ("{num:3.1f}{size_q} " - "{x}".format(num=num, size_q=size_qualifier, x=x)) + return "{num:3.1f}{size_q} " "{x}".format( + num=num, size_q=size_qualifier, x=x + ) num /= 1024.0 - return "{num:3.1f}{size_q} {pb}".format(num=num, - size_q=size_qualifier, - pb='PB') + return "{num:3.1f}{size_q} {pb}".format( + num=num, size_q=size_qualifier, pb="PB" + ) if verbose: _verbose_repr() @@ -2330,28 +2522,29 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() counts = self._data.get_dtype_counts() - dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k - in sorted(counts.items())] - lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) + dtypes = ["{k}({kk:d})".format(k=k[0], kk=k[1]) for k in sorted(counts.items())] + lines.append("dtypes: {types}".format(types=", ".join(dtypes))) if memory_usage is None: - memory_usage = get_option('display.memory_usage') + memory_usage = get_option("display.memory_usage") if memory_usage: # append memory usage of df to display - size_qualifier = '' - if memory_usage == 'deep': + size_qualifier = "" + if memory_usage == "deep": deep = True else: # size_qualifier is just a best effort; not guaranteed to catch # all cases (e.g., it misses categorical data even with object # categories) deep = False - if ('object' in counts or - self.index._is_memory_usage_qualified()): - 
size_qualifier = '+' + if "object" in counts or self.index._is_memory_usage_qualified(): + size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append("memory usage: {mem}\n".format( - mem=_sizeof_fmt(mem_usage, size_qualifier))) + lines.append( + "memory usage: {mem}\n".format( + mem=_sizeof_fmt(mem_usage, size_qualifier) + ) + ) fmt.buffer_put_lines(buf, lines) @@ -2439,11 +2632,14 @@ def memory_usage(self, index=True, deep=False): >>> df['object'].astype('category').memory_usage(deep=True) 5216 """ - result = Series([c.memory_usage(index=False, deep=deep) - for col, c in self.iteritems()], index=self.columns) + result = Series( + [c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], + index=self.columns, + ) if index: - result = Series(self.index.memory_usage(deep=deep), - index=['Index']).append(result) + result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append( + result + ) return result def transpose(self, *args, **kwargs): @@ -2566,13 +2762,13 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover (vals, idx, cols), object_state = state index = com._unpickle_array(idx) - dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), - copy=False) + dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), copy=False) if object_state is not None: ovals, _, ocols = object_state - objects = DataFrame(ovals, index=index, - columns=com._unpickle_array(ocols), copy=False) + objects = DataFrame( + ovals, index=index, columns=com._unpickle_array(ocols), copy=False + ) dm = dm.join(objects) @@ -2599,10 +2795,13 @@ def get_value(self, index, col, takeable=False): scalar """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(index, col, takeable=takeable) def _get_value(self, index, col, takeable=False): @@ -2629,6 +2828,7 @@ def _get_value(self, index, col, takeable=False): col = self.columns.get_loc(col) index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) + _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -2651,10 +2851,13 @@ def set_value(self, index, col, value, takeable=False): If label pair is contained, will be reference to calling DataFrame, otherwise a new object. """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. 
Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(index, col, value, takeable=takeable) def _set_value(self, index, col, value, takeable=False): @@ -2677,6 +2880,7 @@ def _set_value(self, index, col, value, takeable=False): self._item_cache.pop(col, None) return self + _set_value.__doc__ = set_value.__doc__ def _ixs(self, i, axis=0): @@ -2706,12 +2910,15 @@ def _ixs(self, i, axis=0): return new_values # if we are a copy, mark as such - copy = (isinstance(new_values, np.ndarray) and - new_values.base is None) - result = self._constructor_sliced(new_values, - index=self.columns, - name=self.index[i], - dtype=new_values.dtype) + copy = ( + isinstance(new_values, np.ndarray) and new_values.base is None + ) + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype, + ) result._set_is_copy(self, copy=copy) return result @@ -2784,8 +2991,7 @@ def __getitem__(self, key): else: if is_iterator(key): key = list(key) - indexer = self.loc._convert_to_indexer(key, axis=1, - raise_missing=True) + indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True) # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: @@ -2810,11 +3016,15 @@ def _getitem_bool_array(self, key): # go with the __setitem__ behavior since that is more consistent # with all other indexing behavior if isinstance(key, Series) and not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) + warnings.warn( + "Boolean Series key will be reindexed to match " "DataFrame index.", + UserWarning, + stacklevel=3, + ) elif len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d.' % - (len(key), len(self.index))) + raise ValueError( + "Item wrong length %d instead of %d." 
% (len(key), len(self.index)) + ) # check_bool_indexer will throw exception if Series key cannot # be reindexed to match DataFrame rows @@ -2832,8 +3042,9 @@ def _getitem_multilevel(self, key): result.columns = result_columns else: new_values = self.values[:, loc] - result = self._constructor(new_values, index=self.index, - columns=result_columns) + result = self._constructor( + new_values, index=self.index, columns=result_columns + ) result = result.__finalize__(self) # If there is only one column being returned, and its name is @@ -2846,12 +3057,12 @@ def _getitem_multilevel(self, key): top = result.columns[0] if isinstance(top, tuple): top = top[0] - if top == '': - result = result[''] + if top == "": + result = result[""] if isinstance(result, Series): - result = self._constructor_sliced(result, - index=self.index, - name=key) + result = self._constructor_sliced( + result, index=self.index, name=key + ) result._set_is_copy(self) return result @@ -2860,7 +3071,7 @@ def _getitem_multilevel(self, key): def _getitem_frame(self, key): if key.values.size and not is_bool_dtype(key.values): - raise ValueError('Must pass DataFrame with boolean values only') + raise ValueError("Must pass DataFrame with boolean values only") return self.where(key) def query(self, expr, inplace=False, **kwargs): @@ -2972,12 +3183,12 @@ def query(self, expr, inplace=False, **kwargs): A B C C 0 1 10 10 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): msg = "expr must be a string to be evaluated, {0} given" raise ValueError(msg.format(type(expr))) - kwargs['level'] = kwargs.pop('level', 0) + 1 - kwargs['target'] = None + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None res = self.eval(expr, **kwargs) try: @@ -3084,17 +3295,16 @@ def eval(self, expr, inplace=False, **kwargs): """ from pandas.core.computation.eval import eval as _eval - inplace = validate_bool_kwarg(inplace, 'inplace') - resolvers = kwargs.pop('resolvers', None) - kwargs['level'] = kwargs.pop('level', 0) + 1 + inplace = validate_bool_kwarg(inplace, "inplace") + resolvers = kwargs.pop("resolvers", None) + kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = \ - self._get_space_character_free_column_resolvers() + column_resolvers = self._get_space_character_free_column_resolvers() resolvers = column_resolvers, index_resolvers - if 'target' not in kwargs: - kwargs['target'] = self - kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) + if "target" not in kwargs: + kwargs["target"] = self + kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): @@ -3176,10 +3386,11 @@ def select_dtypes(self, include=None, exclude=None): 4 True 1.0 5 False 2.0 """ + def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" - if not hasattr(obj, '_info_axis_number'): - msg = 'object of type {typ!r} has no info axis' + if not hasattr(obj, "_info_axis_number"): + msg = "object of type {typ!r} has no info axis" raise TypeError(msg.format(typ=type(obj).__name__)) slices = [slice(None)] * obj.ndim slices[obj._info_axis_number] = indexer @@ -3193,19 +3404,22 @@ def _get_info_slice(obj, indexer): selection = tuple(map(frozenset, (include, exclude))) if not any(selection): - raise ValueError('at least one of include or exclude must be ' - 
'nonempty') + raise ValueError("at least one of include or exclude must be " "nonempty") # convert the myriad valid dtypes object to a single representation include, exclude = map( - lambda x: frozenset(map(infer_dtype_from_object, x)), selection) + lambda x: frozenset(map(infer_dtype_from_object, x)), selection + ) for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): - raise ValueError('include and exclude overlap on {inc_ex}'.format( - inc_ex=(include & exclude))) + raise ValueError( + "include and exclude overlap on {inc_ex}".format( + inc_ex=(include & exclude) + ) + ) # empty include/exclude -> defaults to True # three cases (we've already raised if both are empty) @@ -3224,8 +3438,9 @@ def _get_info_slice(obj, indexer): def is_dtype_instance_mapper(idx, dtype): return idx, functools.partial(issubclass, dtype.type) - for idx, f in itertools.starmap(is_dtype_instance_mapper, - enumerate(self.dtypes)): + for idx, f in itertools.starmap( + is_dtype_instance_mapper, enumerate(self.dtypes) + ): if include: # checks for the case of empty include or exclude include_these.iloc[idx] = any(map(f, include)) if exclude: @@ -3256,7 +3471,7 @@ def __setitem__(self, key, value): if indexer is not None: return self._setitem_slice(indexer, value) - if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2: + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: self._setitem_frame(key, value) elif isinstance(key, (Series, np.ndarray, list, Index)): self._setitem_array(key, value) @@ -3272,8 +3487,9 @@ def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): if len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d!' % - (len(key), len(self.index))) + raise ValueError( + "Item wrong length %d instead of %d!" 
% (len(key), len(self.index)) + ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() @@ -3281,7 +3497,7 @@ def _setitem_array(self, key, value): else: if isinstance(value, DataFrame): if len(value.columns) != len(key): - raise ValueError('Columns must be same length as key') + raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: @@ -3294,14 +3510,12 @@ def _setitem_frame(self, key, value): # df[df > df2] = 0 if isinstance(key, np.ndarray): if key.shape != self.shape: - raise ValueError( - 'Array conditional must be same shape as self' - ) + raise ValueError("Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict()) if key.values.size and not is_bool_dtype(key.values): raise TypeError( - 'Must pass DataFrame or 2-d ndarray with boolean values only' + "Must pass DataFrame or 2-d ndarray with boolean values only" ) self._check_inplace_setting(value) @@ -3318,12 +3532,15 @@ def _ensure_valid_index(self, value): try: value = Series(value) except (ValueError, NotImplementedError, TypeError): - raise ValueError('Cannot set a frame with no defined index ' - 'and a value that cannot be converted to a ' - 'Series') + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a " + "Series" + ) - self._data = self._data.reindex_axis(value.index.copy(), axis=1, - fill_value=np.nan) + self._data = self._data.reindex_axis( + value.index.copy(), axis=1, fill_value=np.nan + ) def _set_item(self, key, value): """ @@ -3364,8 +3581,7 @@ def insert(self, loc, column, value, allow_duplicates=False): """ self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) - self._data.insert(loc, column, value, - allow_duplicates=allow_duplicates) + self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) def assign(self, **kwargs): r""" @@ -3494,8 +3710,9 @@ def reindexer(value): raise e # other - raise TypeError('incompatible index of inserted column ' - 'with frame index') + raise TypeError( + "incompatible index of inserted column " "with frame index" + ) return value if isinstance(value, Series): @@ -3541,8 +3758,7 @@ def reindexer(value): else: # cast ignores pandas dtypes. 
so save the dtype first - infer_dtype, _ = infer_dtype_from_scalar( - value, pandas_dtype=True) + infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) # upcast value = cast_scalar_to_array(len(self.index), value) @@ -3554,8 +3770,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if (not self.columns.is_unique or - isinstance(self.columns, MultiIndex)): + if not self.columns.is_unique or isinstance(self.columns, MultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -3564,8 +3779,10 @@ def reindexer(value): @property def _series(self): - return {item: Series(self._data.iget(idx), index=self.index, name=item) - for idx, item in enumerate(self.columns)} + return { + item: Series(self._data.iget(idx), index=self.index, name=item) + for idx, item in enumerate(self.columns) + } def lookup(self, row_labels, col_labels): """ @@ -3599,7 +3816,7 @@ def lookup(self, row_labels, col_labels): """ n = len(row_labels) if n != len(col_labels): - raise ValueError('Row labels must have same size as column labels') + raise ValueError("Row labels must have same size as column labels") thresh = 1000 if not self._is_mixed_type or n > thresh: @@ -3607,13 +3824,13 @@ def lookup(self, row_labels, col_labels): ridx = self.index.get_indexer(row_labels) cidx = self.columns.get_indexer(col_labels) if (ridx == -1).any(): - raise KeyError('One or more row labels was not found') + raise KeyError("One or more row labels was not found") if (cidx == -1).any(): - raise KeyError('One or more column labels was not found') + raise KeyError("One or more column labels was not found") flat_index = ridx * len(self.columns) + cidx result = values.flat[flat_index] else: - result = np.empty(n, dtype='O') + result = np.empty(n, dtype="O") for i, (r, c) in enumerate(zip(row_labels, col_labels)): result[i] = self._get_value(r, c) @@ -3625,88 +3842,142 @@ def lookup(self, row_labels, col_labels): # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, - copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): frame = self - columns = axes['columns'] + columns = axes["columns"] if columns is not None: - frame = frame._reindex_columns(columns, method, copy, level, - fill_value, limit, tolerance) + frame = frame._reindex_columns( + columns, method, copy, level, fill_value, limit, tolerance + ) - index = axes['index'] + index = axes["index"] if index is not None: - frame = frame._reindex_index(index, method, copy, level, - fill_value, limit, tolerance) + frame = frame._reindex_index( + index, method, copy, level, fill_value, limit, tolerance + ) return frame - def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, - limit=None, tolerance=None): - new_index, indexer = self.index.reindex(new_index, method=method, - level=level, limit=limit, - tolerance=tolerance) - return self._reindex_with_indexers({0: [new_index, indexer]}, - copy=copy, fill_value=fill_value, - allow_dups=False) - - def _reindex_columns(self, new_columns, method, copy, level, - fill_value=None, limit=None, tolerance=None): - new_columns, indexer = self.columns.reindex(new_columns, method=method, - level=level, limit=limit, - tolerance=tolerance) - return self._reindex_with_indexers({1: [new_columns, indexer]}, - 
copy=copy, fill_value=fill_value, - allow_dups=False) + def _reindex_index( + self, + new_index, + method, + copy, + level, + fill_value=np.nan, + limit=None, + tolerance=None, + ): + new_index, indexer = self.index.reindex( + new_index, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {0: [new_index, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_columns( + self, + new_columns, + method, + copy, + level, + fill_value=None, + limit=None, + tolerance=None, + ): + new_columns, indexer = self.columns.reindex( + new_columns, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {1: [new_columns, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) def _reindex_multi(self, axes, copy, fill_value): """ We are guaranteed non-Nones in the axes. """ - new_index, row_indexer = self.index.reindex(axes['index']) - new_columns, col_indexer = self.columns.reindex(axes['columns']) + new_index, row_indexer = self.index.reindex(axes["index"]) + new_columns, col_indexer = self.columns.reindex(axes["columns"]) if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = algorithms.take_2d_multi(self.values, indexer, - fill_value=fill_value) - return self._constructor(new_values, index=new_index, - columns=new_columns) + new_values = algorithms.take_2d_multi( + self.values, indexer, fill_value=fill_value + ) + return self._constructor(new_values, index=new_index, columns=new_columns) else: - return self._reindex_with_indexers({0: [new_index, row_indexer], - 1: [new_columns, col_indexer]}, - copy=copy, - fill_value=fill_value) - - @Appender(_shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - return super().align(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis, - broadcast_axis=broadcast_axis) + return self._reindex_with_indexers( + {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value, + ) + + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.reindex.__doc__) - @rewrite_axis_style_signature('labels', [('method', None), - ('copy', True), - ('level', None), - ('fill_value', np.nan), - ('limit', None), - ('tolerance', None)]) + @rewrite_axis_style_signature( + "labels", + [ + ("method", None), + ("copy", True), + ("level", None), + ("fill_value", np.nan), + ("limit", None), + ("tolerance", None), + ], + ) def reindex(self, *args, **kwargs): - axes = validate_axis_style_args(self, args, kwargs, 'labels', - 'reindex') + axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names - kwargs.pop('axis', None) - kwargs.pop('labels', None) + kwargs.pop("axis", None) + kwargs.pop("labels", None) return 
super().reindex(**kwargs) - def drop(self, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): """ Drop specified labels from rows or columns. @@ -3829,14 +4100,20 @@ def drop(self, labels=None, axis=0, index=None, columns=None, falcon speed 320.0 250.0 weight 1.0 0.8 """ - return super().drop(labels=labels, axis=axis, index=index, - columns=columns, level=level, inplace=inplace, - errors=errors) + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) - @rewrite_axis_style_signature('mapper', [('copy', True), - ('inplace', False), - ('level', None), - ('errors', 'ignore')]) + @rewrite_axis_style_signature( + "mapper", + [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], + ) def rename(self, *args, **kwargs): """ Alter axes labels. @@ -3946,35 +4223,63 @@ def rename(self, *args, **kwargs): 2 2 5 4 3 6 """ - axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename') + axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names - kwargs.pop('axis', None) - kwargs.pop('mapper', None) + kwargs.pop("axis", None) + kwargs.pop("mapper", None) return super().rename(**kwargs) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.fillna.__doc__) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, downcast=downcast, - **kwargs) - - @Appender(_shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - return super().replace(to_replace=to_replace, value=value, - inplace=inplace, limit=limit, regex=regex, - method=method) - - @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + **kwargs + ) + + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift(periods=periods, freq=freq, axis=axis, - fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) - def set_index(self, keys, drop=True, append=False, inplace=False, - verify_integrity=False): + def set_index( + self, keys, drop=True, append=False, inplace=False, verify_integrity=False + ): """ Set the DataFrame index using existing columns. 
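The kinds of keys set_index accepts, per the validation reformatted in the next hunk, shown on an illustrative frame: existing column labels and one-dimensional arrays can be mixed in a single list.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"year": [2012, 2014], "sale": [55, 40]})
    # a column label and a same-length array used together as index levels
    df.set_index(["year", np.array([1, 2])])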
@@ -4064,35 +4369,39 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 3 9 7 2013 84 4 16 10 2014 31 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(keys, list): keys = [keys] - err_msg = ('The parameter "keys" may be a column key, one-dimensional ' - 'array, or a list containing only valid column keys and ' - 'one-dimensional arrays.') + err_msg = ( + 'The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays." + ) missing = [] for col in keys: - if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, - list, abc.Iterator)): + if isinstance( + col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) + ): # arrays are fine as long as they are one-dimensional # iterators get converted to list below - if getattr(col, 'ndim', 1) != 1: + if getattr(col, "ndim", 1) != 1: raise ValueError(err_msg) else: # everything else gets tried as a key; see GH 24969 try: found = col in self.columns except TypeError: - raise TypeError(err_msg + ' Received column of ' - 'type {}'.format(type(col))) + raise TypeError( + err_msg + " Received column of " "type {}".format(type(col)) + ) else: if not found: missing.append(col) if missing: - raise KeyError('None of {} are in the columns'.format(missing)) + raise KeyError("None of {} are in the columns".format(missing)) if inplace: frame = self @@ -4135,18 +4444,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since # ensure_index_from_sequences would not raise for append=False. - raise ValueError('Length mismatch: Expected {len_self} rows, ' - 'received array of length {len_col}'.format( - len_self=len(self), - len_col=len(arrays[-1]) - )) + raise ValueError( + "Length mismatch: Expected {len_self} rows, " + "received array of length {len_col}".format( + len_self=len(self), len_col=len(arrays[-1]) + ) + ) index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index[index.duplicated()].unique() - raise ValueError('Index has duplicate keys: {dup}'.format( - dup=duplicates)) + raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates)) # use set to handle duplicate column names gracefully in case of drop for c in set(to_remove): @@ -4160,8 +4469,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not inplace: return frame - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, - col_fill=''): + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): """ Reset the index, or a level of it. 
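A companion sketch for reset_index, with an illustrative named index: the index level is moved back into the columns, or discarded entirely with drop=True.

    import pandas as pd

    df = pd.DataFrame({"sale": [55, 40]}, index=pd.Index([2012, 2014], name="year"))
    df.reset_index()            # "year" becomes a regular column again
    df.reset_index(drop=True)   # discard the index instead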
@@ -4303,7 +4613,7 @@ class max type lion mammal 80.5 run monkey mammal NaN jump """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: new_obj = self else: @@ -4339,8 +4649,7 @@ def _maybe_casted_values(index, labels=None): values = values._data if mask.any(): - values, changed = maybe_upcast_putmask( - values, mask, np.nan) + values, changed = maybe_upcast_putmask(values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): values = values_type(values, dtype=values_dtype) @@ -4357,13 +4666,14 @@ def _maybe_casted_values(index, labels=None): if not drop: if isinstance(self.index, MultiIndex): - names = [n if n is not None else ('level_%d' % i) - for (i, n) in enumerate(self.index.names)] + names = [ + n if n is not None else ("level_%d" % i) + for (i, n) in enumerate(self.index.names) + ] to_insert = zip(self.index.levels, self.index.codes) else: - default = 'index' if 'index' not in self else 'level_0' - names = ([default] if self.index.name is None - else [self.index.name]) + default = "index" if "index" not in self else "level_0" + names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) @@ -4372,13 +4682,14 @@ def _maybe_casted_values(index, labels=None): continue name = names[i] if multi_col: - col_name = (list(name) if isinstance(name, tuple) - else [name]) + col_name = list(name) if isinstance(name, tuple) else [name] if col_fill is None: if len(col_name) not in (1, self.columns.nlevels): - raise ValueError("col_fill=None is incompatible " - "with incomplete column name " - "{}".format(name)) + raise ValueError( + "col_fill=None is incompatible " + "with incomplete column name " + "{}".format(name) + ) col_fill = col_name[0] lev_num = self.columns._get_level_number(col_level) @@ -4397,24 +4708,23 @@ def _maybe_casted_values(index, labels=None): # ---------------------------------------------------------------------- # Reindex-based selection methods - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return super().isnull() - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return super().notna() - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return super().notnull() - def dropna(self, axis=0, how='any', thresh=None, subset=None, - inplace=False): + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ Remove missing values. @@ -4517,17 +4827,18 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, name toy born 1 Batman Batmobile 1940-04-25 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(axis, (tuple, list)): # GH20987 - msg = ("supplying multiple axes to axis is deprecated and " - "will be removed in a future version.") + msg = ( + "supplying multiple axes to axis is deprecated and " + "will be removed in a future version." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) result = self for ax in axis: - result = result.dropna(how=how, thresh=thresh, subset=subset, - axis=ax) + result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax) else: axis = self._get_axis_number(axis) agg_axis = 1 - axis @@ -4545,15 +4856,15 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, if thresh is not None: mask = count >= thresh - elif how == 'any': + elif how == "any": mask = count == len(agg_obj._get_axis(agg_axis)) - elif how == 'all': + elif how == "all": mask = count > 0 else: if how is not None: - raise ValueError('invalid how option: {h}'.format(h=how)) + raise ValueError("invalid how option: {h}".format(h=how)) else: - raise TypeError('must specify how or thresh') + raise TypeError("must specify how or thresh") result = self.loc(axis=axis)[mask] @@ -4562,7 +4873,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result - def drop_duplicates(self, subset=None, keep='first', inplace=False): + def drop_duplicates(self, subset=None, keep="first", inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns. Indexes, including time indexes @@ -4587,7 +4898,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): if self.empty: return self.copy() - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") duplicated = self.duplicated(subset, keep=keep) if inplace: @@ -4597,7 +4908,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep="first"): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns. 
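The how/thresh branches validated above, exercised on a small frame with missing values, plus the keep= behaviour of drop_duplicates; all values are illustrative.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, np.nan, np.nan], "b": [2, 3, np.nan]})
    df.dropna(how="all")        # drop rows where every value is NA
    df.dropna(thresh=2)         # keep rows with at least two non-NA values
    pd.DataFrame({"x": [1, 1, 2]}).drop_duplicates(keep="last")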
@@ -4626,15 +4937,19 @@ def duplicated(self, subset=None, keep='first'): def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) - return labels.astype('i8', copy=False), len(shape) + vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + ) + return labels.astype("i8", copy=False), len(shape) if subset is None: subset = self.columns - elif (not np.iterable(subset) or - isinstance(subset, str) or - isinstance(subset, tuple) and subset in self.columns): - subset = subset, + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a @@ -4643,8 +4958,7 @@ def f(vals): if not diff.empty: raise KeyError(diff) - vals = (col.values for name, col in self.iteritems() - if name in subset) + vals = (col.values for name, col in self.iteritems() if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) @@ -4655,23 +4969,30 @@ def f(vals): @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values(self, by, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): - inplace = validate_bool_kwarg(inplace, 'inplace') + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): + inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) if not isinstance(by, list): by = [by] if is_sequence(ascending) and len(by) != len(ascending): - raise ValueError('Length of ascending (%d) != length of by (%d)' % - (len(ascending), len(by))) + raise ValueError( + "Length of ascending (%d) != length of by (%d)" + % (len(ascending), len(by)) + ) if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [self._get_label_or_level_values(x, axis=axis) - for x in by] - indexer = lexsort_indexer(keys, orders=ascending, - na_position=na_position) + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = ensure_platform_int(indexer) else: from pandas.core.sorting import nargsort @@ -4682,12 +5003,13 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if isinstance(ascending, (tuple, list)): ascending = ascending[0] - indexer = nargsort(k, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + k, kind=kind, ascending=ascending, na_position=na_position + ) - new_data = self._data.take(indexer, - axis=self._get_block_manager_axis(axis), - verify=False) + new_data = self._data.take( + indexer, axis=self._get_block_manager_axis(axis), verify=False + ) if inplace: return self._update_inplace(new_data) @@ -4696,23 +5018,33 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_index.__doc__) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, - by=None): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + by=None, + ): # TODO: this can be combined with Series.sort_index impl as # almost identical - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = 
validate_bool_kwarg(inplace, "inplace") # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, stacklevel=2) + warnings.warn( + "by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, + stacklevel=2, + ) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, - inplace=inplace) + return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -4722,34 +5054,37 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, labels = labels._sort_levels_monotonic() if level is not None: - new_axis, indexer = labels.sortlevel(level, ascending=ascending, - sort_remaining=sort_remaining) + new_axis, indexer = labels.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position) + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) else: from pandas.core.sorting import nargsort # Check monotonic-ness before sort an index # GH11080 - if ((ascending and labels.is_monotonic_increasing) or - (not ascending and labels.is_monotonic_decreasing)): + if (ascending and labels.is_monotonic_increasing) or ( + not ascending and labels.is_monotonic_decreasing + ): if inplace: return else: return self.copy() - indexer = nargsort(labels, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + labels, kind=kind, ascending=ascending, na_position=na_position + ) baxis = self._get_block_manager_axis(axis) - new_data = self._data.take(indexer, - axis=baxis, - verify=False) + new_data = self._data.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() @@ -4759,7 +5094,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self._constructor(new_data).__finalize__(self) - def nlargest(self, n, columns, keep='first'): + def nlargest(self, n, columns, keep="first"): """ Return the first `n` rows ordered by `columns` in descending order. @@ -4866,12 +5201,9 @@ def nlargest(self, n, columns, keep='first'): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return algorithms.SelectNFrame(self, - n=n, - keep=keep, - columns=columns).nlargest() + return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep='first'): + def nsmallest(self, n, columns, keep="first"): """ Return the first `n` rows ordered by `columns` in ascending order. 
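sort_values with per-key sort directions (their lengths must match, as the check above enforces), together with the nlargest/nsmallest shortcuts, on illustrative data.

    import pandas as pd

    df = pd.DataFrame({"a": [2, 1, 1], "b": [3, 4, 5]})
    df.sort_values(by=["a", "b"], ascending=[True, False])
    df.nlargest(2, "b")
    df.nsmallest(2, "b")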
@@ -4968,10 +5300,9 @@ def nsmallest(self, n, columns, keep='first'): Nauru 11300 182 NR Anguilla 11300 311 AI """ - return algorithms.SelectNFrame(self, - n=n, - keep=keep, - columns=columns).nsmallest() + return algorithms.SelectNFrame( + self, n=n, keep=keep, columns=columns + ).nsmallest() def swaplevel(self, i=-2, j=-1, axis=0): """ @@ -5018,9 +5349,8 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), - MultiIndex): # pragma: no cover - raise TypeError('Can only reorder levels on a hierarchical axis.') + if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -5034,7 +5364,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join='outer', level=level, copy=False) + this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns def _arith_op(left, right): @@ -5049,13 +5379,12 @@ def _arith_op(left, right): return ops.dispatch_to_series(this, other, _arith_op) else: result = _arith_op(this.values, other.values) - return self._constructor(result, - index=new_index, columns=new_columns, - copy=False) + return self._constructor( + result, index=new_index, columns=new_columns, copy=False + ) def _combine_match_index(self, other, func, level=None): - left, right = self.align(other, join='outer', axis=0, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=0, level=level, copy=False) assert left.index.equals(right.index) if left._is_mixed_type or right._is_mixed_type: @@ -5065,14 +5394,13 @@ def _combine_match_index(self, other, func, level=None): # fastpath --> operate directly on values with np.errstate(all="ignore"): new_data = func(left.values.T, right.values).T - return self._constructor(new_data, - index=left.index, columns=self.columns, - copy=False) + return self._constructor( + new_data, index=left.index, columns=self.columns, copy=False + ) def _combine_match_columns(self, other, func, level=None): assert isinstance(other, Series) - left, right = self.align(other, join='outer', axis=1, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=1, level=level, copy=False) assert left.columns.equals(right.index) return ops.dispatch_to_series(left, right, func, axis="columns") @@ -5245,8 +5573,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): result[col] = arr # convert_objects just in case - return self._constructor(result, index=new_index, - columns=new_columns) + return self._constructor(result, index=new_index, columns=new_columns) def combine_first(self, other): """ @@ -5304,7 +5631,7 @@ def extract_values(arr): if is_extension_array_dtype(arr.dtype): arr = arr.asi8 else: - arr = arr.view('i8') + arr = arr.view("i8") return arr def combiner(x, y): @@ -5324,10 +5651,14 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', - mapping={False: 'ignore', True: 'raise'}) - def update(self, other, join='left', overwrite=True, filter_func=None, - errors='ignore'): + @deprecate_kwarg( + old_arg_name="raise_conflict", + new_arg_name="errors", + mapping={False: "ignore", True: "raise"}, + ) + def update( + self, other, join="left", 
overwrite=True, filter_func=None, errors="ignore" + ): """ Modify in place using non-NA values from another DataFrame. @@ -5440,12 +5771,14 @@ def update(self, other, join='left', overwrite=True, filter_func=None, 2 3 6.0 """ import pandas.core.computation.expressions as expressions + # TODO: Support other joins - if join != 'left': # pragma: no cover + if join != "left": # pragma: no cover raise NotImplementedError("Only left join is supported") - if errors not in ['ignore', 'raise']: - raise ValueError("The parameter errors must be either " - "'ignore' or 'raise'") + if errors not in ["ignore", "raise"]: + raise ValueError( + "The parameter errors must be either " "'ignore' or 'raise'" + ) if not isinstance(other, DataFrame): other = DataFrame(other) @@ -5456,10 +5789,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self[col]._values that = other[col]._values if filter_func is not None: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): mask = ~filter_func(this) | isna(that) else: - if errors == 'raise': + if errors == "raise": mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): @@ -5479,7 +5812,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # ---------------------------------------------------------------------- # Data reshaping - _shared_docs['pivot'] = """ + _shared_docs[ + "pivot" + ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -5582,13 +5917,16 @@ def update(self, other, join='left', overwrite=True, filter_func=None, ValueError: Index contains duplicate entries, cannot reshape """ - @Substitution('') - @Appender(_shared_docs['pivot']) + @Substitution("") + @Appender(_shared_docs["pivot"]) def pivot(self, index=None, columns=None, values=None): from pandas.core.reshape.pivot import pivot + return pivot(self, index=index, columns=columns, values=values) - _shared_docs['pivot_table'] = """ + _shared_docs[ + "pivot_table" + ] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame. 
@@ -5713,16 +6051,34 @@ def pivot(self, index=None, columns=None, values=None): small 2.333333 6.0 4.333333 2.0 """ - @Substitution('') - @Appender(_shared_docs['pivot_table']) - def pivot_table(self, values=None, index=None, columns=None, - aggfunc='mean', fill_value=None, margins=False, - dropna=True, margins_name='All', observed=False): + @Substitution("") + @Appender(_shared_docs["pivot_table"]) + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, + ): from pandas.core.reshape.pivot import pivot_table - return pivot_table(self, values=values, index=index, columns=columns, - aggfunc=aggfunc, fill_value=fill_value, - margins=margins, dropna=dropna, - margins_name=margins_name, observed=observed) + + return pivot_table( + self, + values=values, + index=index, + columns=columns, + aggfunc=aggfunc, + fill_value=fill_value, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) def stack(self, level=-1, dropna=True): """ @@ -5955,9 +6311,12 @@ def unstack(self, level=-1, fill_value=None): dtype: float64 """ from pandas.core.reshape.reshape import unstack + return unstack(self, level, fill_value) - _shared_docs['melt'] = (""" + _shared_docs[ + "melt" + ] = """ Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -6050,18 +6409,32 @@ def unstack(self, level=-1, fill_value=None): 0 a B E 1 1 b B E 3 2 c B E 5 - """) - - @Appender(_shared_docs['melt'] % - dict(caller='df.melt(', - versionadded='.. versionadded:: 0.20.0\n', - other='melt')) - def melt(self, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): + """ + + @Appender( + _shared_docs["melt"] + % dict( + caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt" + ) + ) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ): from pandas.core.reshape.melt import melt - return melt(self, id_vars=id_vars, value_vars=value_vars, - var_name=var_name, value_name=value_name, - col_level=col_level) + + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) # ---------------------------------------------------------------------- # Time series-related @@ -6160,11 +6533,12 @@ def diff(self, periods=1, axis=0): # ---------------------------------------------------------------------- # Function application - def _gotitem(self, - key: Union[str, List[str]], - ndim: int, - subset: Optional[Union[Series, ABCDataFrame]] = None, - ) -> Union[Series, ABCDataFrame]: + def _gotitem( + self, + key: Union[str, List[str]], + ndim: int, + subset: Optional[Union[Series, ABCDataFrame]] = None, + ) -> Union[Series, ABCDataFrame]: """ Sub-classes to define. Return a sliced object. @@ -6184,7 +6558,8 @@ def _gotitem(self, # TODO: _shallow_copy(subset)? return subset[key] - _agg_summary_and_see_also_doc = dedent(""" + _agg_summary_and_see_also_doc = dedent( + """ The aggregation operations are always performed over an axis, either the index (default) or the column axis. This behavior is different from `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, @@ -6204,9 +6579,11 @@ def _gotitem(self, core.window.Expanding : Perform operations over expanding window. core.window.EWM : Perform operation over exponential weighted window. 
- """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> df = pd.DataFrame([[1, 2, 3], @@ -6238,13 +6615,16 @@ def _gotitem(self, 2 8.0 3 NaN dtype: float64 - """) - - @Substitution(see_also=_agg_summary_and_see_also_doc, - examples=_agg_examples_doc, - versionadded='\n.. versionadded:: 0.20.0\n', - **_shared_doc_kwargs) - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_summary_and_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. versionadded:: 0.20.0\n", + **_shared_doc_kwargs + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -6268,15 +6648,24 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate - @Appender(_shared_docs['transform'] % _shared_doc_kwargs) + @Appender(_shared_docs["transform"] % _shared_doc_kwargs) def transform(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T return super().transform(func, *args, **kwargs) - def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, - result_type=None, args=(), **kwds): + def apply( + self, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + args=(), + **kwds + ): """ Apply a function along an axis of the DataFrame. @@ -6445,15 +6834,18 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 2 1 2 """ from pandas.core.apply import frame_apply - op = frame_apply(self, - func=func, - axis=axis, - broadcast=broadcast, - raw=raw, - reduce=reduce, - result_type=result_type, - args=args, - kwds=kwds) + + op = frame_apply( + self, + func=func, + axis=axis, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + args=args, + kwds=kwds, + ) return op.get_result() def applymap(self, func): @@ -6525,8 +6917,7 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, - verify_integrity=False, sort=None): + def append(self, other, ignore_index=False, verify_integrity=False, sort=None): """ Append rows of `other` to the end of caller, returning a new object. 
@@ -6624,8 +7015,10 @@ def append(self, other, ignore_index=False, if isinstance(other, dict): other = Series(other) if other.name is None and not ignore_index: - raise TypeError('Can only append a Series if ignore_index=True' - ' or if the Series has a name') + raise TypeError( + "Can only append a Series if ignore_index=True" + " or if the Series has a name" + ) if other.name is None: index = None @@ -6640,9 +7033,11 @@ def append(self, other, ignore_index=False, except TypeError: combined_columns = self.columns.astype(object).append(idx_diff) other = other.reindex(combined_columns, copy=False) - other = DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) + other = DataFrame( + other.values.reshape((1, len(other))), + index=index, + columns=combined_columns, + ) other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) @@ -6652,16 +7047,19 @@ def append(self, other, ignore_index=False, other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat + if isinstance(other, (list, tuple)): to_concat = [self] + other else: to_concat = [self, other] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort) + return concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) - def join(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): """ Join columns of another DataFrame. @@ -6780,27 +7178,37 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', 5 K5 A5 NaN """ # For SparseDataFrame's benefit - return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, - rsuffix=rsuffix, sort=sort) + return self._join_compat( + other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort + ) - def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def _join_compat( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ): from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat if isinstance(other, Series): if other.name is None: - raise ValueError('Other Series must have a name') + raise ValueError("Other Series must have a name") other = DataFrame({other.name: other}) if isinstance(other, DataFrame): - return merge(self, other, left_on=on, how=how, - left_index=on is None, right_index=True, - suffixes=(lsuffix, rsuffix), sort=sort) + return merge( + self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) else: if on is not None: - raise ValueError('Joining multiple DataFrames only supported' - ' for joining on index') + raise ValueError( + "Joining multiple DataFrames only supported" " for joining on index" + ) frames = [self] + list(other) @@ -6808,33 +7216,55 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', # join indexes only using concat if can_concat: - if how == 'left': - res = concat(frames, axis=1, join='outer', - verify_integrity=True) + if how == "left": + res = concat(frames, axis=1, join="outer", verify_integrity=True) return res.reindex(self.index, copy=False) else: - return concat(frames, axis=1, join=how, - verify_integrity=True) + return concat(frames, axis=1, join=how, verify_integrity=True) joined = frames[0] for frame in 
frames[1:]: - joined = merge(joined, frame, how=how, left_index=True, - right_index=True) + joined = merge( + joined, frame, how=how, left_index=True, right_index=True + ) return joined - @Substitution('') + @Substitution("") @Appender(_merge_doc, indents=2) - def merge(self, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): + def merge( + self, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ): from pandas.core.reshape.merge import merge - return merge(self, right, how=how, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator, validate=validate) + + return merge( + self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) def round(self, decimals=0, *args, **kwargs): """ @@ -6936,23 +7366,21 @@ def _series_round(s, decimals): new_cols = [col for col in _dict_round(self, decimals)] elif is_integer(decimals): # Dispatch to Series.round - new_cols = [_series_round(v, decimals) - for _, v in self.iteritems()] + new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] else: - raise TypeError("decimals must be an integer, a dict-like or a " - "Series") + raise TypeError("decimals must be an integer, a dict-like or a " "Series") if len(new_cols) > 0: - return self._constructor(concat(new_cols, axis=1), - index=self.index, - columns=self.columns) + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + ) else: return self # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=1): + def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. @@ -7000,12 +7428,11 @@ def corr(self, method='pearson', min_periods=1): idx = cols.copy() mat = numeric_df.values - if method == 'pearson': + if method == "pearson": correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) - elif method == 'spearman': - correl = libalgos.nancorr_spearman(ensure_float64(mat), - minp=min_periods) - elif method == 'kendall' or callable(method): + elif method == "spearman": + correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) + elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = ensure_float64(mat).T @@ -7022,7 +7449,7 @@ def corr(self, method='pearson', min_periods=1): if valid.sum() < min_periods: c = np.nan elif i == j: - c = 1. 
+ c = 1.0 elif not valid.all(): c = corrf(ac[valid], bc[valid]) else: @@ -7030,9 +7457,11 @@ def corr(self, method='pearson', min_periods=1): correl[i, j] = c correl[j, i] = c else: - raise ValueError("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method)) + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method) + ) return self._constructor(correl, index=idx, columns=cols) @@ -7142,12 +7571,11 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, - minp=min_periods) + baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False, method='pearson'): + def corrwith(self, other, axis=0, drop=False, method="pearson"): """ Compute pairwise correlation between rows or columns of DataFrame with rows or columns of Series or DataFrame. DataFrames are first @@ -7183,17 +7611,16 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): this = self._get_numeric_data() if isinstance(other, Series): - return this.apply(lambda x: other.corr(x, method=method), - axis=axis) + return this.apply(lambda x: other.corr(x, method=method), axis=axis) other = other._get_numeric_data() - left, right = this.align(other, join='inner', copy=False) + left, right = this.align(other, join="inner", copy=False) if axis == 1: left = left.T right = right.T - if method == 'pearson': + if method == "pearson": # mask missing values left = left + right * 0 right = right + left * 0 @@ -7207,31 +7634,31 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): correl = num / dom - elif method in ['kendall', 'spearman'] or callable(method): + elif method in ["kendall", "spearman"] or callable(method): + def c(x): return nanops.nancorr(x[0], x[1], method=method) - correl = Series(map(c, - zip(left.values.T, right.values.T)), - index=left.columns) + correl = Series( + map(c, zip(left.values.T, right.values.T)), index=left.columns + ) else: - raise ValueError("Invalid method {method} was passed, " - "valid methods are: 'pearson', 'kendall', " - "'spearman', or callable". - format(method=method)) + raise ValueError( + "Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable".format(method=method) + ) if not drop: # Find non-matching labels along the given axis # and append missing correlations (GH 22375) raxis = 1 if axis == 0 else 0 - result_index = (this._get_axis(raxis). 
- union(other._get_axis(raxis))) + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) idx_diff = result_index.difference(correl.index) if len(idx_diff) > 0: - correl = correl.append(Series([np.nan] * len(idx_diff), - index=idx_diff)) + correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff)) return correl @@ -7316,8 +7743,7 @@ def count(self, axis=0, level=None, numeric_only=False): """ axis = self._get_axis_number(axis) if level is not None: - return self._count_level(level, axis=axis, - numeric_only=numeric_only) + return self._count_level(level, axis=axis, numeric_only=numeric_only) if numeric_only: frame = self._get_numeric_data() @@ -7338,7 +7764,7 @@ def count(self, axis=0, level=None, numeric_only=False): counts = series_counts.values result = Series(counts, index=frame._get_agg_axis(axis)) - return result.astype('int64') + return result.astype("int64") def _count_level(self, level, axis=0, numeric_only=False): if numeric_only: @@ -7350,8 +7776,10 @@ def _count_level(self, level, axis=0, numeric_only=False): agg_axis = frame._get_agg_axis(axis) if not isinstance(count_axis, MultiIndex): - raise TypeError("Can only count levels on hierarchical " - "{ax}.".format(ax=self._get_axis_name(axis))) + raise TypeError( + "Can only count levels on hierarchical " + "{ax}.".format(ax=self._get_axis_name(axis)) + ) if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might @@ -7371,8 +7799,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level_index = count_axis.levels[level] level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), - axis=0) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -7382,9 +7809,10 @@ def _count_level(self, level, axis=0, numeric_only=False): else: return result - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - if axis is None and filter_type == 'bool': + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): + if axis is None and filter_type == "bool": labels = None constructor = None else: @@ -7397,9 +7825,14 @@ def f(x): return op(x, axis=axis, skipna=skipna, **kwds) # exclude timedelta/datetime unless we are uniform types - if (axis == 1 and self._is_datelike_mixed_type - and (not self._is_homogeneous_type - and not is_datetime64tz_dtype(self.dtypes[0]))): + if ( + axis == 1 + and self._is_datelike_mixed_type + and ( + not self._is_homogeneous_type + and not is_datetime64tz_dtype(self.dtypes[0]) + ) + ): numeric_only = True if numeric_only is None: @@ -7407,8 +7840,7 @@ def f(x): values = self.values result = f(values) - if (filter_type == 'bool' and is_object_dtype(values) and - axis is None): + if filter_type == "bool" and is_object_dtype(values) and axis is None: # work around https://github.com/numpy/numpy/issues/10489 # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. @@ -7428,10 +7860,10 @@ def f(x): # column-by-column reduction, where we have mixed type. 
# So let's just do what we can from pandas.core.apply import frame_apply - opa = frame_apply(self, - func=f, - result_type='expand', - ignore_failures=True) + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] @@ -7439,28 +7871,31 @@ def f(x): except Exception: pass - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() - elif filter_type == 'bool': + elif filter_type == "bool": data = self._get_bool_data() else: # pragma: no cover e = NotImplementedError( "Handling exception with filter_type {f} not" - "implemented.".format(f=filter_type)) + "implemented.".format(f=filter_type) + ) raise_with_traceback(e) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() - elif filter_type == 'bool': + elif filter_type == "bool": # GH 25101, # GH 24434 data = self._get_bool_data() if axis == 0 else self else: # pragma: no cover - msg = ("Generating numeric_only data with filter_type {f}" - "not supported.".format(f=filter_type)) + msg = ( + "Generating numeric_only data with filter_type {f}" + "not supported.".format(f=filter_type) + ) raise NotImplementedError(msg) values = data.values labels = data._get_agg_axis(axis) @@ -7468,11 +7903,11 @@ def f(x): values = self.values result = f(values) - if hasattr(result, 'dtype') and is_object_dtype(result.dtype): + if hasattr(result, "dtype") and is_object_dtype(result.dtype): try: - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": result = result.astype(np.float64) - elif filter_type == 'bool' and notna(result).all(): + elif filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) except (ValueError, TypeError): @@ -7609,7 +8044,7 @@ def _get_agg_axis(self, axis_num): elif axis_num == 1: return self.index else: - raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num) + raise ValueError("Axis must be 0 or 1 (got %r)" % axis_num) def mode(self, axis=0, numeric_only=False, dropna=True): """ @@ -7697,8 +8132,7 @@ def f(s): return data.apply(f, axis=axis) - def quantile(self, q=0.5, axis=0, numeric_only=True, - interpolation='linear'): + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): """ Return values at the given quantile over requested axis. @@ -7775,10 +8209,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, if is_transposed: data = data.T - result = data._data.quantile(qs=q, - axis=1, - interpolation=interpolation, - transposed=is_transposed) + result = data._data.quantile( + qs=q, axis=1, interpolation=interpolation, transposed=is_transposed + ) if result.ndim == 2: result = self._constructor(result) @@ -7790,7 +8223,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, return result - def to_timestamp(self, freq=None, how='start', axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True): """ Cast to DatetimeIndex of timestamps, at *beginning* of period. 
@@ -7820,8 +8253,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( - ax=axis)) + raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) return self._constructor(new_data) @@ -7853,8 +8285,7 @@ def to_period(self, freq=None, axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_period(freq=freq)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( - ax=axis)) + raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) return self._constructor(new_data) @@ -7923,29 +8354,36 @@ def isin(self, values): """ if isinstance(values, dict): from pandas.core.reshape.concat import concat + values = collections.defaultdict(list, values) - return concat((self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns)), axis=1) + return concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) elif isinstance(values, Series): if not values.index.is_unique: - raise ValueError("cannot compute isin with " - "a duplicate axis.") - return self.eq(values.reindex_like(self), axis='index') + raise ValueError("cannot compute isin with " "a duplicate axis.") + return self.eq(values.reindex_like(self), axis="index") elif isinstance(values, DataFrame): if not (values.columns.is_unique and values.index.is_unique): - raise ValueError("cannot compute isin with " - "a duplicate axis.") + raise ValueError("cannot compute isin with " "a duplicate axis.") return self.eq(values.reindex_like(self)) else: if not is_list_like(values): - raise TypeError("only list-like or dict-like objects are " - "allowed to be passed to DataFrame.isin(), " - "you passed a " - "{0!r}".format(type(values).__name__)) + raise TypeError( + "only list-like or dict-like objects are " + "allowed to be passed to DataFrame.isin(), " + "you passed a " + "{0!r}".format(type(values).__name__) + ) return DataFrame( - algorithms.isin(self.values.ravel(), - values).reshape(self.shape), self.index, - self.columns) + algorithms.isin(self.values.ravel(), values).reshape(self.shape), + self.index, + self.columns, + ) # ---------------------------------------------------------------------- # Add plotting methods to DataFrame @@ -7955,11 +8393,17 @@ def isin(self, values): sparse = CachedAccessor("sparse", SparseFrameAccessor) -DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, - axes_are_reversed=True, aliases={'rows': 0}, - docs={ - 'index': 'The index (row labels) of the DataFrame.', - 'columns': 'The column labels of the DataFrame.'}) +DataFrame._setup_axes( + ["index", "columns"], + info_axis=1, + stat_axis=0, + axes_are_reversed=True, + aliases={"rows": 0}, + docs={ + "index": "The index (row labels) of the DataFrame.", + "columns": "The column labels of the DataFrame.", + }, +) DataFrame._add_numeric_operations() DataFrame._add_series_or_dataframe_operations() @@ -7978,4 +8422,4 @@ def _from_nested_dict(data): def _put_str(s, space): - return '{s}'.format(s=s)[:space].ljust(space) + return "{s}".format(s=s)[:space].ljust(space) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 106af6e565f8a4..4e9f74162ae787 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -19,17 +19,32 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy 
import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - Appender, Substitution, rewrite_axis_style_signature) +from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_str, is_bool, is_bool_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_dict_like, is_extension_array_dtype, is_integer, is_list_like, - is_number, is_numeric_dtype, is_object_dtype, is_period_arraylike, - is_re_compilable, is_scalar, is_timedelta64_dtype, pandas_dtype) + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_extension_array_dtype, + is_integer, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period_arraylike, + is_re_compilable, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -41,7 +56,12 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.index import ( - Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) + Index, + InvalidIndexError, + MultiIndex, + RangeIndex, + ensure_index, +) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing @@ -56,12 +76,14 @@ # able to share _shared_docs = dict() _shared_doc_kwargs = dict( - axes='keywords for axes', klass='Series/DataFrame', - axes_single_arg='int or labels for object', - args_transpose='axes to permute (int or label for object)', + axes="keywords for axes", + klass="Series/DataFrame", + axes_single_arg="int or labels for object", + args_transpose="axes to permute (int or label for object)", optional_by=""" by : str or list of str - Name or list of names to sort by""") + Name or list of names to sort by""", +) # sentinel value to use as kwarg in place of None when None has special meaning # and needs to be distinguished from a user explicitly passing None. 
@@ -74,8 +96,11 @@ def _single_replace(self, to_replace, method, inplace, limit): replacement value is given in the replace method """ if self.ndim != 1: - raise TypeError('cannot replace {0} with method {1} on a {2}' - .format(to_replace, method, type(self).__name__)) + raise TypeError( + "cannot replace {0} with method {1} on a {2}".format( + to_replace, method, type(self).__name__ + ) + ) orig_dtype = self.dtype result = self if inplace else self.copy() @@ -87,8 +112,7 @@ def _single_replace(self, to_replace, method, inplace, limit): if values.dtype == orig_dtype and inplace: return - result = pd.Series(values, index=self.index, - dtype=self.dtype).__finalize__(self) + result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) if inplace: self._update_inplace(result._data) @@ -108,15 +132,27 @@ class NDFrame(PandasObject, SelectionMixin): axes : list copy : boolean, default False """ - _internal_names = ['_data', '_cacher', '_item_cache', '_cache', '_is_copy', - '_subtyp', '_name', '_index', '_default_kind', - '_default_fill_value', '_metadata', '__array_struct__', - '__array_interface__'] # type: List[str] + + _internal_names = [ + "_data", + "_cacher", + "_item_cache", + "_cache", + "_is_copy", + "_subtyp", + "_name", + "_index", + "_default_kind", + "_default_fill_value", + "_metadata", + "__array_struct__", + "__array_interface__", + ] # type: List[str] _internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] - _deprecations = frozenset([ - 'as_blocks', 'blocks', 'is_copy' - ]) # type: FrozenSet[str] + _deprecations = frozenset( + ["as_blocks", "blocks", "is_copy"] + ) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None _data = None # type: BlockManager @@ -124,12 +160,14 @@ class NDFrame(PandasObject, SelectionMixin): # ---------------------------------------------------------------------- # Constructors - def __init__(self, - data: BlockManager, - axes: Optional[List[Index]] = None, - copy: bool = False, - dtype: Optional[Dtype] = None, - fastpath: bool = False): + def __init__( + self, + data: BlockManager, + axes: Optional[List[Index]] = None, + copy: bool = False, + dtype: Optional[Dtype] = None, + fastpath: bool = False, + ): if not fastpath: if dtype is not None: @@ -141,17 +179,17 @@ def __init__(self, for i, ax in enumerate(axes): data = data.reindex_axis(ax, axis=i) - object.__setattr__(self, '_is_copy', None) - object.__setattr__(self, '_data', data) - object.__setattr__(self, '_item_cache', {}) + object.__setattr__(self, "_is_copy", None) + object.__setattr__(self, "_data", data) + object.__setattr__(self, "_item_cache", {}) def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: - mgr = mgr.reindex_axis(axe, - axis=self._get_block_manager_axis(a), - copy=False) + mgr = mgr.reindex_axis( + axe, axis=self._get_block_manager_axis(a), copy=False + ) # make a copy if explicitly requested if copy: @@ -169,14 +207,22 @@ def is_copy(self): """ Return the copy. 
""" - warnings.warn("Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", FutureWarning, stacklevel=2) + warnings.warn( + "Attribute 'is_copy' is deprecated and will be removed " + "in a future version.", + FutureWarning, + stacklevel=2, + ) return self._is_copy @is_copy.setter def is_copy(self, msg): - warnings.warn("Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", FutureWarning, stacklevel=2) + warnings.warn( + "Attribute 'is_copy' is deprecated and will be removed " + "in a future version.", + FutureWarning, + stacklevel=2, + ) self._is_copy = msg def _validate_dtype(self, dtype): @@ -186,10 +232,11 @@ def _validate_dtype(self, dtype): dtype = pandas_dtype(dtype) # a compound dtype - if dtype.kind == 'V': - raise NotImplementedError("compound dtypes are not implemented" - " in the {0} constructor" - .format(self.__class__.__name__)) + if dtype.kind == "V": + raise NotImplementedError( + "compound dtypes are not implemented" + " in the {0} constructor".format(self.__class__.__name__) + ) return dtype @@ -221,9 +268,18 @@ def _constructor_expanddim(self): # Axis @classmethod - def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, - slicers=None, axes_are_reversed=False, build_axes=True, - ns=None, docs=None): + def _setup_axes( + cls, + axes, + info_axis=None, + stat_axis=None, + aliases=None, + slicers=None, + axes_are_reversed=False, + build_axes=True, + ns=None, + docs=None, + ): """Provide axes setup for the major PandasObjects. Parameters @@ -248,7 +304,7 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, cls._AXIS_REVERSED = axes_are_reversed # typ - setattr(cls, '_typ', cls.__name__.lower()) + setattr(cls, "_typ", cls.__name__.lower()) # indexing support cls._ix = None @@ -293,13 +349,16 @@ def _construct_axes_dict_from(self, axes, **kwargs): def _construct_axes_dict_for_slice(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = {self._AXIS_SLICEMAP[a]: self._get_axis(a) - for a in (axes or self._AXIS_ORDERS)} + d = { + self._AXIS_SLICEMAP[a]: self._get_axis(a) + for a in (axes or self._AXIS_ORDERS) + } d.update(kwargs) return d def _construct_axes_from_arguments( - self, args, kwargs, require_all=False, sentinel=None): + self, args, kwargs, require_all=False, sentinel=None + ): """Construct and returns axes if supplied in args/kwargs. 
If require_all, raise if all axis arguments are not supplied @@ -319,8 +378,10 @@ def _construct_axes_from_arguments( if alias is not None: if a in kwargs: if alias in kwargs: - raise TypeError("arguments are mutually exclusive " - "for [%s,%s]" % (a, alias)) + raise TypeError( + "arguments are mutually exclusive " + "for [%s,%s]" % (a, alias) + ) continue if alias in kwargs: kwargs[a] = kwargs.pop(alias) @@ -332,8 +393,7 @@ def _construct_axes_from_arguments( kwargs[a] = args.pop(0) except IndexError: if require_all: - raise TypeError("not enough/duplicate arguments " - "specified!") + raise TypeError("not enough/duplicate arguments " "specified!") axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} return axes, kwargs @@ -361,8 +421,7 @@ def _get_axis_number(cls, axis): return cls._AXIS_NUMBERS[axis] except KeyError: pass - raise ValueError('No axis named {0} for object type {1}' - .format(axis, cls)) + raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) @classmethod def _get_axis_name(cls, axis): @@ -375,8 +434,7 @@ def _get_axis_name(cls, axis): return cls._AXIS_NAMES[axis] except KeyError: pass - raise ValueError('No axis named {0} for object type {1}' - .format(axis, cls)) + raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) def _get_axis(self, axis): name = self._get_axis_name(axis) @@ -404,7 +462,7 @@ def _get_axis_resolvers(self, axis): # prefix with 'i' or 'c' depending on the input axis # e.g., you must do ilevel_0 for the 0th level of an unnamed # multiiindex - key = '{prefix}level_{i}'.format(prefix=prefix, i=i) + key = "{prefix}level_{i}".format(prefix=prefix, i=i) level = i level_values = axis_index.get_level_values(level) @@ -436,8 +494,7 @@ def _get_space_character_free_column_resolvers(self): """ from pandas.core.computation.common import _remove_spaces_column_name - return {_remove_spaces_column_name(k): v for k, v - in self.iteritems()} + return {_remove_spaces_column_name(k): v for k, v in self.iteritems()} @property def _info_axis(self): @@ -525,7 +582,7 @@ def _expand_axes(self, key): for k, ax in zip(key, self.axes): if k not in ax: if type(k) != ax.dtype.type: - ax = ax.astype('O') + ax = ax.astype("O") new_axes.append(ax.insert(len(ax), k)) else: new_axes.append(ax) @@ -631,17 +688,21 @@ def set_axis(self, labels, axis=0, inplace=None): warnings.warn( 'set_axis now takes "labels" as first argument, and ' '"axis" as named parameter. 
The old form, with "axis" as ' - 'first parameter and \"labels\" as second, is still supported ' - 'but will be deprecated in a future version of pandas.', - FutureWarning, stacklevel=2) + 'first parameter and "labels" as second, is still supported ' + "but will be deprecated in a future version of pandas.", + FutureWarning, + stacklevel=2, + ) labels, axis = axis, labels if inplace is None: warnings.warn( - 'set_axis currently defaults to operating inplace.\nThis ' - 'will change in a future version of pandas, use ' - 'inplace=True to avoid this warning.', - FutureWarning, stacklevel=2) + "set_axis currently defaults to operating inplace.\nThis " + "will change in a future version of pandas, use " + "inplace=True to avoid this warning.", + FutureWarning, + stacklevel=2, + ) inplace = True if inplace: setattr(self, self._get_axis_name(axis), labels) @@ -678,21 +739,21 @@ def transpose(self, *args, **kwargs): """ # construct the args - axes, kwargs = self._construct_axes_from_arguments(args, kwargs, - require_all=True) - axes_names = tuple(self._get_axis_name(axes[a]) - for a in self._AXIS_ORDERS) - axes_numbers = tuple(self._get_axis_number(axes[a]) - for a in self._AXIS_ORDERS) + axes, kwargs = self._construct_axes_from_arguments( + args, kwargs, require_all=True + ) + axes_names = tuple(self._get_axis_name(axes[a]) for a in self._AXIS_ORDERS) + axes_numbers = tuple(self._get_axis_number(axes[a]) for a in self._AXIS_ORDERS) # we must have unique axes if len(axes) != len(set(axes)): - raise ValueError('Must specify %s unique axes' % self._AXIS_LEN) + raise ValueError("Must specify %s unique axes" % self._AXIS_LEN) - new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) - for x in axes_names]) + new_axes = self._construct_axes_dict_from( + self, [self._get_axis(x) for x in axes_names] + ) new_values = self.values.transpose(axes_numbers) - if kwargs.pop('copy', None) or (len(args) and args[-1]): + if kwargs.pop("copy", None) or (len(args) and args[-1]): new_values = new_values.copy() nv.validate_transpose(tuple(), kwargs) @@ -716,8 +777,7 @@ def swapaxes(self, axis1, axis2, copy=True): mapping = {i: j, j: i} - new_axes = (self._get_axis(mapping.get(k, k)) - for k in range(self._AXIS_LEN)) + new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)) new_values = self.values.swapaxes(i, j) if copy: new_values = new_values.copy() @@ -938,12 +998,14 @@ def squeeze(self, axis=None): >>> df_0a.squeeze() 1 """ - axis = (self._AXIS_NAMES if axis is None else - (self._get_axis_number(axis),)) + axis = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),) try: return self.iloc[ - tuple(0 if i in axis and len(a) == 1 else slice(None) - for i, a in enumerate(self.axes))] + tuple( + 0 if i in axis and len(a) == 1 else slice(None) + for i, a in enumerate(self.axes) + ) + ] except Exception: return self @@ -1088,21 +1150,23 @@ def rename(self, *args, **kwargs): See the :ref:`user guide ` for more. 
""" axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - copy = kwargs.pop('copy', True) - inplace = kwargs.pop('inplace', False) - level = kwargs.pop('level', None) - axis = kwargs.pop('axis', None) - errors = kwargs.pop('errors', 'ignore') + copy = kwargs.pop("copy", True) + inplace = kwargs.pop("inplace", False) + level = kwargs.pop("level", None) + axis = kwargs.pop("axis", None) + errors = kwargs.pop("errors", "ignore") if axis is not None: # Validate the axis self._get_axis_number(axis) if kwargs: - raise TypeError('rename() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "rename() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) if com.count_not_none(*axes.values()) == 0: - raise TypeError('must pass an index to rename') + raise TypeError("must pass an index to rename") self._consolidate_inplace() result = self if inplace else self.copy(deep=copy) @@ -1120,14 +1184,15 @@ def rename(self, *args, **kwargs): # GH 13473 if not callable(v): indexer = self.axes[axis].get_indexer_for(v) - if errors == 'raise' and len(indexer[indexer == -1]): - missing_labels = [label for index, label in enumerate(v) - if indexer[index] == -1] - raise KeyError('{} not found in axis' - .format(missing_labels)) - - result._data = result._data.rename_axis(f, axis=baxis, copy=copy, - level=level) + if errors == "raise" and len(indexer[indexer == -1]): + missing_labels = [ + label for index, label in enumerate(v) if indexer[index] == -1 + ] + raise KeyError("{} not found in axis".format(missing_labels)) + + result._data = result._data.rename_axis( + f, axis=baxis, copy=copy, level=level + ) result._clear_item_cache() if inplace: @@ -1135,8 +1200,7 @@ def rename(self, *args, **kwargs): else: return result.__finalize__(self) - @rewrite_axis_style_signature('mapper', [('copy', True), - ('inplace', False)]) + @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) def rename_axis(self, mapper=sentinel, **kwargs): """ Set the name of the axis for the index or columns. @@ -1262,28 +1326,31 @@ class name monkey 2 2 """ axes, kwargs = self._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel) - copy = kwargs.pop('copy', True) - inplace = kwargs.pop('inplace', False) - axis = kwargs.pop('axis', 0) + (), kwargs, sentinel=sentinel + ) + copy = kwargs.pop("copy", True) + inplace = kwargs.pop("inplace", False) + axis = kwargs.pop("axis", 0) if axis is not None: axis = self._get_axis_number(axis) if kwargs: - raise TypeError('rename_axis() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "rename_axis() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if mapper is not sentinel: # Use v0.23 behavior if a scalar or list - non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not - is_dict_like(mapper)) + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) if non_mapper: return self._set_axis_name(mapper, axis=axis, inplace=inplace) else: - raise ValueError("Use `.rename` to alter labels " - "with a mapper.") + raise ValueError("Use `.rename` to alter labels " "with a mapper.") else: # Use new behavior. 
Means that index and/or columns # is specified @@ -1293,16 +1360,14 @@ class name v = axes.get(self._AXIS_NAMES[axis]) if v is sentinel: continue - non_mapper = is_scalar(v) or (is_list_like(v) and not - is_dict_like(v)) + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) if non_mapper: newnames = v else: f = com._get_rename_function(v) curnames = self._get_axis(axis).names newnames = [f(name) for name in curnames] - result._set_axis_name(newnames, axis=axis, - inplace=True) + result._set_axis_name(newnames, axis=axis, inplace=True) if not inplace: return result @@ -1361,7 +1426,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): axis = self._get_axis_number(axis) idx = self._get_axis(axis).set_names(name) - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") renamed = self if inplace else self.copy() renamed.set_axis(idx, axis=axis, inplace=True) if not inplace: @@ -1371,8 +1436,9 @@ def _set_axis_name(self, name, axis=0, inplace=False): # Comparison Methods def _indexed_same(self, other): - return all(self._get_axis(a).equals(other._get_axis(a)) - for a in self._AXIS_ORDERS) + return all( + self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS + ) def equals(self, other): """ @@ -1467,24 +1533,32 @@ def __neg__(self): values = com.values_from_object(self) if is_bool_dtype(values): arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): arr = operator.neg(values) else: - raise TypeError("Unary negative expects numeric dtype, not {}" - .format(values.dtype)) + raise TypeError( + "Unary negative expects numeric dtype, not {}".format(values.dtype) + ) return self.__array_wrap__(arr) def __pos__(self): values = com.values_from_object(self) - if (is_bool_dtype(values) or is_period_arraylike(values)): + if is_bool_dtype(values) or is_period_arraylike(values): arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): arr = operator.pos(values) else: - raise TypeError("Unary plus expects numeric dtype, not {}" - .format(values.dtype)) + raise TypeError( + "Unary plus expects numeric dtype, not {}".format(values.dtype) + ) return self.__array_wrap__(arr) def __invert__(self): @@ -1500,9 +1574,12 @@ def __invert__(self): raise def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) + raise ValueError( + "The truth value of a {0} is ambiguous. 
" + "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( + self.__class__.__name__ + ) + ) __bool__ = __nonzero__ @@ -1523,8 +1600,10 @@ def bool(self): if isinstance(v, (bool, np.bool_)): return bool(v) elif is_scalar(v): - raise ValueError("bool cannot act on a non-boolean single element " - "{0}".format(self.__class__.__name__)) + raise ValueError( + "bool cannot act on a non-boolean single element " + "{0}".format(self.__class__.__name__) + ) self.__nonzero__() @@ -1565,10 +1644,12 @@ def _is_level_reference(self, key, axis=0): """ axis = self._get_axis_number(axis) - return (key is not None and - is_hashable(key) and - key in self.axes[axis].names and - not self._is_label_reference(key, axis=axis)) + return ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and not self._is_label_reference(key, axis=axis) + ) def _is_label_reference(self, key, axis=0): """ @@ -1593,9 +1674,11 @@ def _is_label_reference(self, key, axis=0): axis = self._get_axis_number(axis) other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) - return (key is not None and - is_hashable(key) and - any(key in self.axes[ax] for ax in other_axes)) + return ( + key is not None + and is_hashable(key) + and any(key in self.axes[ax] for ax in other_axes) + ) def _is_label_or_level_reference(self, key, axis=0): """ @@ -1617,8 +1700,9 @@ def _is_label_or_level_reference(self, key, axis=0): ------- is_label_or_level: bool """ - return (self._is_level_reference(key, axis=axis) or - self._is_label_reference(key, axis=axis)) + return self._is_level_reference(key, axis=axis) or self._is_label_reference( + key, axis=axis + ) def _check_label_or_level_ambiguity(self, key, axis=0): """ @@ -1641,27 +1725,32 @@ def _check_label_or_level_ambiguity(self, key, axis=0): axis = self._get_axis_number(axis) other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) - if (key is not None and - is_hashable(key) and - key in self.axes[axis].names and - any(key in self.axes[ax] for ax in other_axes)): + if ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and any(key in self.axes[ax] for ax in other_axes) + ): # Build an informative and grammatical warning - level_article, level_type = (('an', 'index') - if axis == 0 else - ('a', 'column')) - - label_article, label_type = (('a', 'column') - if axis == 0 else - ('an', 'index')) - - msg = ("'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label, which is ambiguous." - ).format(key=key, - level_article=level_article, - level_type=level_type, - label_article=label_article, - label_type=label_type) + level_article, level_type = ( + ("an", "index") if axis == 0 else ("a", "column") + ) + + label_article, label_type = ( + ("a", "column") if axis == 0 else ("an", "index") + ) + + msg = ( + "'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label, which is ambiguous." 
+ ).format( + key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type, + ) raise ValueError(msg) def _get_label_or_level_values(self, key, axis=0): @@ -1712,21 +1801,27 @@ def _get_label_or_level_values(self, key, axis=0): # Check for duplicates if values.ndim > 1: - if other_axes and isinstance( - self._get_axis(other_axes[0]), MultiIndex): - multi_message = ('\n' - 'For a multi-index, the label must be a ' - 'tuple with elements corresponding to ' - 'each level.') + if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + multi_message = ( + "\n" + "For a multi-index, the label must be a " + "tuple with elements corresponding to " + "each level." + ) else: - multi_message = '' - - label_axis_name = 'column' if axis == 0 else 'index' - raise ValueError(("The {label_axis_name} label '{key}' " - "is not unique.{multi_message}") - .format(key=key, - label_axis_name=label_axis_name, - multi_message=multi_message)) + multi_message = "" + + label_axis_name = "column" if axis == 0 else "index" + raise ValueError( + ( + "The {label_axis_name} label '{key}' " + "is not unique.{multi_message}" + ).format( + key=key, + label_axis_name=label_axis_name, + multi_message=multi_message, + ) + ) return values @@ -1760,21 +1855,22 @@ def _drop_labels_or_levels(self, keys, axis=0): # Validate keys keys = com.maybe_make_list(keys) - invalid_keys = [k for k in keys if not - self._is_label_or_level_reference(k, axis=axis)] + invalid_keys = [ + k for k in keys if not self._is_label_or_level_reference(k, axis=axis) + ] if invalid_keys: - raise ValueError(("The following keys are not valid labels or " - "levels for axis {axis}: {invalid_keys}") - .format(axis=axis, - invalid_keys=invalid_keys)) + raise ValueError( + ( + "The following keys are not valid labels or " + "levels for axis {axis}: {invalid_keys}" + ).format(axis=axis, invalid_keys=invalid_keys) + ) # Compute levels and labels to drop - levels_to_drop = [k for k in keys - if self._is_level_reference(k, axis=axis)] + levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)] - labels_to_drop = [k for k in keys - if not self._is_level_reference(k, axis=axis)] + labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)] # Perform copy upfront and then use inplace operations below. # This ensures that we always perform exactly one copy. @@ -1810,8 +1906,10 @@ def _drop_labels_or_levels(self, keys, axis=0): # Iteration def __hash__(self): - raise TypeError('{0!r} objects are mutable, thus they cannot be' - ' hashed'.format(self.__class__.__name__)) + raise TypeError( + "{0!r} objects are mutable, thus they cannot be" + " hashed".format(self.__class__.__name__) + ) def __iter__(self): """ @@ -1937,9 +2035,12 @@ def to_dense(self): %(klass)s Dense %(klass)s. 
""" - warnings.warn("DataFrame/Series.to_dense is deprecated " - "and will be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame/Series.to_dense is deprecated " + "and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) # compat return self @@ -1948,15 +2049,14 @@ def to_dense(self): def __getstate__(self): meta = {k: getattr(self, k, None) for k in self._metadata} - return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, - **meta) + return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta) def __setstate__(self, state): if isinstance(state, BlockManager): self._data = state elif isinstance(state, dict): - typ = state.get('_typ') + typ = state.get("_typ") if typ is not None: # set in the order of internal names @@ -1996,15 +2096,15 @@ def __setstate__(self, state): def __repr__(self): # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) - prepr = '[%s]' % ','.join(map(pprint_thing, self)) - return '%s(%s)' % (self.__class__.__name__, prepr) + prepr = "[%s]" % ",".join(map(pprint_thing, self)) + return "%s(%s)" % (self.__class__.__name__, prepr) def _repr_latex_(self): """ Returns a LaTeX representation for a particular object. Mainly for use with nbconvert (jupyter notebook conversion to pdf). """ - if config.get_option('display.latex.repr'): + if config.get_option("display.latex.repr"): return self.to_latex() else: return None @@ -2015,15 +2115,18 @@ def _repr_data_resource_(self): naming convention. """ if config.get_option("display.html.table_schema"): - data = self.head(config.get_option('display.max_rows')) - payload = json.loads(data.to_json(orient='table'), - object_pairs_hook=collections.OrderedDict) + data = self.head(config.get_option("display.max_rows")) + payload = json.loads( + data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict + ) return payload # ---------------------------------------------------------------------- # I/O Methods - _shared_docs['to_excel'] = """ + _shared_docs[ + "to_excel" + ] = """ Write %(klass)s to an Excel sheet. 
To write a single %(klass)s to an Excel .xlsx file it is only necessary to @@ -2128,28 +2231,62 @@ def _repr_data_resource_(self): """ @Appender(_shared_docs["to_excel"] % dict(klass="object")) - def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep="inf", verbose=True, - freeze_panes=None): + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) - - def to_json(self, path_or_buf=None, orient=None, date_format=None, - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression='infer', - index=True): + + formatter = ExcelFormatter( + df, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) + + def to_json( + self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression="infer", + index=True, + ): """ Convert the object to a JSON string. @@ -2286,17 +2423,24 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, """ from pandas.io import json - if date_format is None and orient == 'table': - date_format = 'iso' + + if date_format is None and orient == "table": + date_format = "iso" elif date_format is None: - date_format = 'epoch' - return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, - date_format=date_format, - double_precision=double_precision, - force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler, - lines=lines, compression=compression, - index=index) + date_format = "epoch" + return json.to_json( + path_or_buf=path_or_buf, + obj=self, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, + ) def to_hdf(self, path_or_buf, key, **kwargs): """ @@ -2400,9 +2544,10 @@ def to_hdf(self, path_or_buf, key, **kwargs): >>> os.remove('data.h5') """ from pandas.io import pytables + pytables.to_hdf(path_or_buf, key, self, **kwargs) - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): + def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): """ Serialize object to input file path using msgpack format. 
@@ -2429,11 +2574,21 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): """ from pandas.io import packers - return packers.to_msgpack(path_or_buf, self, encoding=encoding, - **kwargs) - def to_sql(self, name, con, schema=None, if_exists='fail', index=True, - index_label=None, chunksize=None, dtype=None, method=None): + return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) + + def to_sql( + self, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. @@ -2561,12 +2716,21 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, [(1,), (None,), (2,)] """ from pandas.io import sql - sql.to_sql(self, name, con, schema=schema, if_exists=if_exists, - index=index, index_label=index_label, chunksize=chunksize, - dtype=dtype, method=method) - def to_pickle(self, path, compression='infer', - protocol=pickle.HIGHEST_PROTOCOL): + sql.to_sql( + self, + name, + con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): """ Pickle (serialize) object to file. @@ -2621,6 +2785,7 @@ def to_pickle(self, path, compression='infer', >>> os.remove("./dummy.pkl") """ from pandas.io.pickle import to_pickle + to_pickle(self, path, compression=compression, protocol=protocol) def to_clipboard(self, excel=True, sep=None, **kwargs): @@ -2678,6 +2843,7 @@ def to_clipboard(self, excel=True, sep=None, **kwargs): ... # 4,5,6 """ from pandas.io import clipboards + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) def to_xarray(self): @@ -2762,12 +2928,28 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' else: return xarray.Dataset.from_dataframe(self) - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, bold_rows=False, - column_format=None, longtable=None, escape=None, - encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): r""" Render an object to a LaTeX tabular environment table. 
@@ -2879,34 +3061,60 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, if multicolumn is None: multicolumn = config.get_option("display.latex.multicolumn") if multicolumn_format is None: - multicolumn_format = config.get_option( - "display.latex.multicolumn_format") + multicolumn_format = config.get_option("display.latex.multicolumn_format") if multirow is None: multirow = config.get_option("display.latex.multirow") - formatter = DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - header=header, index=index, - formatters=formatters, - float_format=float_format, - bold_rows=bold_rows, - sparsify=sparsify, - index_names=index_names, - escape=escape, decimal=decimal) - formatter.to_latex(column_format=column_format, longtable=longtable, - encoding=encoding, multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) + formatter = DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape, + decimal=decimal, + ) + formatter.to_latex( + column_format=column_format, + longtable=longtable, + encoding=encoding, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + ) if buf is None: return formatter.buf.getvalue() - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, - columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression='infer', quoting=None, - quotechar='"', line_terminator=None, chunksize=None, - date_format=None, doublequote=True, - escapechar=None, decimal='.'): + def to_csv( + self, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + quotechar='"', + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + ): r""" Write object to a comma-separated values (csv) file. 
@@ -3012,17 +3220,29 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter(df, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) + + formatter = CSVFormatter( + df, + path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + compression=compression, + quoting=quoting, + na_rep=na_rep, + float_format=float_format, + cols=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + decimal=decimal, + ) formatter.save() if path_or_buf is None: @@ -3082,7 +3302,7 @@ def _set_as_cached(self, item, cacher): def _reset_cacher(self): """Reset the cacher.""" - if hasattr(self, '_cacher'): + if hasattr(self, "_cacher"): del self._cacher def _iget_item_cache(self, item): @@ -3105,11 +3325,11 @@ def _maybe_cache_changed(self, item, value): @property def _is_cached(self): """Return boolean indicating if self is cached or not.""" - return getattr(self, '_cacher', None) is not None + return getattr(self, "_cacher", None) is not None def _get_cacher(self): """return my cacher or None""" - cacher = getattr(self, '_cacher', None) + cacher = getattr(self, "_cacher", None) if cacher is not None: cacher = cacher[1]() return cacher @@ -3133,7 +3353,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): """ - cacher = getattr(self, '_cacher', None) + cacher = getattr(self, "_cacher", None) if cacher is not None: ref = cacher[1]() @@ -3148,7 +3368,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): pass if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t='referant') + self._check_setitem_copy(stacklevel=5, t="referant") if clear: self._clear_item_cache() @@ -3202,14 +3422,13 @@ def _check_is_chained_assignment_possible(self): if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t='referant', - force=True) + self._check_setitem_copy(stacklevel=4, t="referant", force=True) return True elif self._is_copy: - self._check_setitem_copy(stacklevel=4, t='referant') + self._check_setitem_copy(stacklevel=4, t="referant") return False - def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): + def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): """ Parameters @@ -3244,7 +3463,7 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): if not (force or self._is_copy): return - value = config.get_option('mode.chained_assignment') + value = config.get_option("mode.chained_assignment") if value is None: return @@ -3260,30 +3479,31 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): if isinstance(self._is_copy, str): t = self._is_copy - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - 
"indexing.html#returning-a-view-versus-a-copy" - ) + elif t == "referant": + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame\n\n" + "See the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value " - "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) - - if value == 'raise': + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value " + "instead\n\nSee the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + if value == "raise": raise com.SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, com.SettingWithCopyWarning, - stacklevel=stacklevel) + elif value == "warn": + warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) def __delitem__(self, key): """ @@ -3292,7 +3512,7 @@ def __delitem__(self, key): deleted = False maybe_shortcut = False - if hasattr(self, 'columns') and isinstance(self.columns, MultiIndex): + if hasattr(self, "columns") and isinstance(self.columns, MultiIndex): try: maybe_shortcut = key not in self.columns._engine except TypeError: @@ -3302,9 +3522,9 @@ def __delitem__(self, key): # Allow shorthand to delete all columns whose first len(key) # elements match key: if not isinstance(key, tuple): - key = (key, ) + key = (key,) for col in self.columns: - if isinstance(col, tuple) and col[:len(key)] == key: + if isinstance(col, tuple) and col[: len(key)] == key: del self[col] deleted = True if not deleted: @@ -3353,9 +3573,9 @@ def _take(self, indices, axis=0, is_copy=True): """ self._consolidate_inplace() - new_data = self._data.take(indices, - axis=self._get_block_manager_axis(axis), - verify=True) + new_data = self._data.take( + indices, axis=self._get_block_manager_axis(axis), verify=True + ) result = self._constructor(new_data).__finalize__(self) # Maybe set copy if we didn't actually change the index. 
@@ -3545,8 +3765,7 @@ class animal locomotion axis = self._get_axis_number(axis) labels = self._get_axis(axis) if level is not None: - loc, new_ax = labels.get_loc_level(key, level=level, - drop_level=drop_level) + loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) # create the tuple of the indexer indexer = [slice(None)] * self.ndim @@ -3564,8 +3783,7 @@ class animal locomotion index = self.index if isinstance(index, MultiIndex): - loc, new_index = self.index.get_loc_level(key, - drop_level=drop_level) + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) else: loc = self.index.get_loc(key) @@ -3591,8 +3809,11 @@ class animal locomotion return com.maybe_box_datetimelike(new_values) result = self._constructor_sliced( - new_values, index=self.columns, - name=self.index[loc], dtype=new_values.dtype) + new_values, + index=self.columns, + name=self.index[loc], + dtype=new_values.dtype, + ) else: result = self.iloc[loc] @@ -3605,8 +3826,7 @@ class animal locomotion _xs = xs # type: Callable - def reindex_like(self, other, method=None, copy=True, limit=None, - tolerance=None): + def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): """ Return an object with matching indices as other object. @@ -3701,28 +3921,42 @@ def reindex_like(self, other, method=None, copy=True, limit=None, 2014-02-14 NaN NaN NaN 2014-02-15 35.1 NaN medium """ - d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method, - copy=copy, limit=limit, - tolerance=tolerance) + d = other._construct_axes_dict( + axes=self._AXIS_ORDERS, + method=method, + copy=copy, + limit=limit, + tolerance=tolerance, + ) return self.reindex(**d) - def drop(self, labels=None, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if labels is not None: if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and " - "'index'/'columns'") + raise ValueError( + "Cannot specify both 'labels' and " "'index'/'columns'" + ) axis_name = self._get_axis_name(axis) axes = {axis_name: labels} elif index is not None or columns is not None: axes, _ = self._construct_axes_from_arguments((index, columns), {}) else: - raise ValueError("Need to specify at least one of 'labels', " - "'index' or 'columns'") + raise ValueError( + "Need to specify at least one of 'labels', " "'index' or 'columns'" + ) obj = self @@ -3735,7 +3969,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, level=None, else: return obj - def _drop_axis(self, labels, axis, level=None, errors='raise'): + def _drop_axis(self, labels, axis, level=None, errors="raise"): """ Drop labels from specified axis. Used in the ``drop`` method internally. 
@@ -3757,7 +3991,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): if axis.is_unique: if level is not None: if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') + raise AssertionError("axis must be a MultiIndex") new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) @@ -3768,18 +4002,18 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): labels = ensure_object(com.index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') + raise AssertionError("axis must be a MultiIndex") indexer = ~axis.get_level_values(level).isin(labels) # GH 18561 MultiIndex.drop should raise if label is absent - if errors == 'raise' and indexer.all(): - raise KeyError('{} not found in axis'.format(labels)) + if errors == "raise" and indexer.all(): + raise KeyError("{} not found in axis".format(labels)) else: indexer = ~axis.isin(labels) # Check if label doesn't exist along axis labels_missing = (axis.get_indexer_for(labels) == -1).any() - if errors == 'raise' and labels_missing: - raise KeyError('{} not found in axis'.format(labels)) + if errors == "raise" and labels_missing: + raise KeyError("{} not found in axis".format(labels)) slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer @@ -3803,7 +4037,7 @@ def _update_inplace(self, result, verify_is_copy=True): self._reset_cache() self._clear_item_cache() - self._data = getattr(result, '_data', result) + self._data = getattr(result, "_data", result) self._maybe_update_cacher(verify_is_copy=verify_is_copy) def add_prefix(self, prefix): @@ -3860,7 +4094,7 @@ def add_prefix(self, prefix): 2 3 5 3 4 6 """ - f = functools.partial('{prefix}{}'.format, prefix=prefix) + f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} return self.rename(**mapper) @@ -3919,13 +4153,20 @@ def add_suffix(self, suffix): 2 3 5 3 4 6 """ - f = functools.partial('{}{suffix}'.format, suffix=suffix) + f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} return self.rename(**mapper) - def sort_values(self, by=None, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + def sort_values( + self, + by=None, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): """ Sort by the values along either axis. @@ -4015,8 +4256,16 @@ def sort_values(self, by=None, axis=0, ascending=True, inplace=False, """ raise AbstractMethodError(self) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ): """ Sort object by labels (along an axis). @@ -4048,7 +4297,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sorted_obj : DataFrame or None DataFrame with sorted index if inplace=False, None otherwise. 
""" - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) labels = self._get_axis(axis) @@ -4273,12 +4522,12 @@ def reindex(self, *args, **kwargs): # construct the args axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - method = missing.clean_reindex_fill_method(kwargs.pop('method', None)) - level = kwargs.pop('level', None) - copy = kwargs.pop('copy', True) - limit = kwargs.pop('limit', None) - tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', None) + method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -4286,15 +4535,20 @@ def reindex(self, *args, **kwargs): kwargs.pop("axis", None) if kwargs: - raise TypeError('reindex() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "reindex() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) self._consolidate_inplace() # if all axes that are requested to reindex are equal, then only copy # if indicated must have index names equal here as well as values - if all(self._get_axis(axis).identical(ax) - for axis, ax in axes.items() if ax is not None): + if all( + self._get_axis(axis).identical(ax) + for axis, ax in axes.items() + if ax is not None + ): if copy: return self.copy() return self @@ -4307,11 +4561,11 @@ def reindex(self, *args, **kwargs): pass # perform the reindex on the axes - return self._reindex_axes(axes, level, limit, tolerance, method, - fill_value, copy).__finalize__(self) + return self._reindex_axes( + axes, level, limit, tolerance, method, fill_value, copy + ).__finalize__(self) - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, - copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4320,26 +4574,35 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, continue ax = self._get_axis(a) - new_index, indexer = ax.reindex(labels, level=level, limit=limit, - tolerance=tolerance, method=method) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, tolerance=tolerance, method=method + ) axis = self._get_axis_number(a) - obj = obj._reindex_with_indexers({axis: [new_index, indexer]}, - fill_value=fill_value, - copy=copy, allow_dups=False) + obj = obj._reindex_with_indexers( + {axis: [new_index, indexer]}, + fill_value=fill_value, + copy=copy, + allow_dups=False, + ) return obj def _needs_reindex_multi(self, axes, method, level): """Check if we do need a multi reindex.""" - return ((com.count_not_none(*axes.values()) == self._AXIS_LEN) and - method is None and level is None and not self._is_mixed_type) + return ( + (com.count_not_none(*axes.values()) == self._AXIS_LEN) + and method is None + and level is None + and not self._is_mixed_type + ) def _reindex_multi(self, axes, copy, fill_value): return NotImplemented - def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, - allow_dups=False): + def _reindex_with_indexers( + self, reindexers, fill_value=None, 
copy=False, allow_dups=False + ): """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated @@ -4356,10 +4619,14 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, indexer = ensure_int64(indexer) # TODO: speed up on homogeneous DataFrame objects - new_data = new_data.reindex_indexer(index, indexer, axis=baxis, - fill_value=fill_value, - allow_dups=allow_dups, - copy=copy) + new_data = new_data.reindex_indexer( + index, + indexer, + axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy, + ) if copy and new_data is self._data: new_data = new_data.copy() @@ -4429,8 +4696,10 @@ def filter(self, items=None, like=None, regex=None, axis=None): nkw = com.count_not_none(items, like, regex) if nkw > 1: - raise TypeError('Keyword arguments `items`, `like`, or `regex` ' - 'are mutually exclusive') + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` " + "are mutually exclusive" + ) if axis is None: axis = self._info_axis_name @@ -4438,21 +4707,24 @@ def filter(self, items=None, like=None, regex=None, axis=None): if items is not None: name = self._get_axis_name(axis) - return self.reindex( - **{name: [r for r in items if r in labels]}) + return self.reindex(**{name: [r for r in items if r in labels]}) elif like: + def f(x): return like in ensure_str(x) + values = labels.map(f) return self.loc(axis=axis)[values] elif regex: + def f(x): return matcher.search(ensure_str(x)) is not None + matcher = re.compile(regex) values = labels.map(f) return self.loc(axis=axis)[values] else: - raise TypeError('Must pass either `items`, `like`, or `regex`') + raise TypeError("Must pass either `items`, `like`, or `regex`") def head(self, n=5): """ @@ -4574,8 +4846,15 @@ def tail(self, n=5): return self.iloc[0:0] return self.iloc[-n:] - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ): """ Return a random sample of items from an axis of object. @@ -4683,28 +4962,33 @@ def sample(self, n=None, frac=None, replace=False, weights=None, try: weights = self[weights] except KeyError: - raise KeyError("String passed to weights not a " - "valid column") + raise KeyError( + "String passed to weights not a " "valid column" + ) else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) else: - raise ValueError("Strings cannot be passed as weights " - "when sampling from a Series.") + raise ValueError( + "Strings cannot be passed as weights " + "when sampling from a Series." + ) - weights = pd.Series(weights, dtype='float64') + weights = pd.Series(weights, dtype="float64") if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") + raise ValueError( + "Weights and axis to be sampled must be of " "same length" + ) if (weights == np.inf).any() or (weights == -np.inf).any(): raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") + raise ValueError("weight vector many not include negative " "values") # If has nan, set to zero. 
weights = weights.fillna(0) @@ -4726,18 +5010,20 @@ def sample(self, n=None, frac=None, replace=False, weights=None, elif n is None and frac is not None: n = int(round(frac * axis_length)) elif n is not None and frac is not None: - raise ValueError('Please enter a value for `frac` OR `n`, not ' - 'both') + raise ValueError("Please enter a value for `frac` OR `n`, not " "both") # Check for negative sizes if n < 0: - raise ValueError("A negative number of rows requested. Please " - "provide positive value.") + raise ValueError( + "A negative number of rows requested. Please " "provide positive value." + ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis, is_copy=False) - _shared_docs['pipe'] = (r""" + _shared_docs[ + "pipe" + ] = r""" Apply func(self, \*args, \*\*kwargs). Parameters @@ -4786,13 +5072,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None, ... .pipe(g, arg1=a) ... .pipe((f, 'arg2'), arg1=a, arg3=c) ... ) - """) + """ - @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) + @Appender(_shared_docs["pipe"] % _shared_doc_kwargs) def pipe(self, func, *args, **kwargs): return com._pipe(self, func, *args, **kwargs) - _shared_docs['aggregate'] = dedent(""" + _shared_docs["aggregate"] = dedent( + """ Aggregate using one or more operations over the specified axis. %(versionadded)s Parameters @@ -4830,9 +5117,12 @@ def pipe(self, func, *args, **kwargs): `agg` is an alias for `aggregate`. Use the alias. A passed user-defined-function will be passed a Series for evaluation. - %(examples)s""") + %(examples)s""" + ) - _shared_docs['transform'] = (""" + _shared_docs[ + "transform" + ] = """ Call ``func`` on self producing a %(klass)s with transformed values and that has the same axis length as self. @@ -4898,7 +5188,7 @@ def pipe(self, func, *args, **kwargs): 0 0.000000 1.000000 1 1.000000 2.718282 2 1.414214 7.389056 - """) + """ # ---------------------------------------------------------------------- # Attribute access @@ -4928,8 +5218,11 @@ def __getattr__(self, name): # Note: obj.x will always call obj.__getattribute__('x') prior to # calling obj.__getattr__('x'). - if (name in self._internal_names_set or name in self._metadata or - name in self._accessors): + if ( + name in self._internal_names_set + or name in self._metadata + or name in self._accessors + ): return object.__getattribute__(self, name) else: if self._info_axis._can_hold_identifiers_and_holds_name(name): @@ -4968,19 +5261,24 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) except (AttributeError, TypeError): if isinstance(self, ABCDataFrame) and (is_list_like(value)): - warnings.warn("Pandas doesn't allow columns to be " - "created via a new attribute name - see " - "https://pandas.pydata.org/pandas-docs/" - "stable/indexing.html#attribute-access", - stacklevel=2) + warnings.warn( + "Pandas doesn't allow columns to be " + "created via a new attribute name - see " + "https://pandas.pydata.org/pandas-docs/" + "stable/indexing.html#attribute-access", + stacklevel=2, + ) object.__setattr__(self, name, value) def _dir_additions(self): """ add the string-like attributes from the info_axis. If info_axis is a MultiIndex, it's first level values are used. 
""" - additions = {c for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, str) and c.isidentifier()} + additions = { + c + for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } return super()._dir_additions().union(additions) # ---------------------------------------------------------------------- @@ -5021,7 +5319,7 @@ def _consolidate(self, inplace=False): ------- consolidated : same type as caller """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: self._consolidate_inplace() else: @@ -5057,14 +5355,15 @@ def _check_inplace_setting(self, value): except Exception: pass - raise TypeError('Cannot do inplace boolean setting on ' - 'mixed-types with a non np.nan value') + raise TypeError( + "Cannot do inplace boolean setting on " + "mixed-types with a non np.nan value" + ) return True def _get_numeric_data(self): - return self._constructor( - self._data.get_numeric_data()).__finalize__(self) + return self._constructor(self._data.get_numeric_data()).__finalize__(self) def _get_bool_data(self): return self._constructor(self._data.get_bool_data()).__finalize__(self) @@ -5111,11 +5410,14 @@ def as_matrix(self, columns=None): This method is provided for backwards compatibility. Generally, it is recommended to use '.values'. """ - warnings.warn("Method .as_matrix will be removed in a future version. " - "Use .values instead.", FutureWarning, stacklevel=2) + warnings.warn( + "Method .as_matrix will be removed in a future version. " + "Use .values instead.", + FutureWarning, + stacklevel=2, + ) self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED, - items=columns) + return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns) @property def values(self): @@ -5253,7 +5555,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.values' or 'np.asarray(..)' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -5292,11 +5596,15 @@ def get_dtype_counts(self): object 1 dtype: int64 """ - warnings.warn("`get_dtype_counts` has been deprecated and will be " - "removed in a future version. For DataFrames use " - "`.dtypes.value_counts()", FutureWarning, - stacklevel=2) + warnings.warn( + "`get_dtype_counts` has been deprecated and will be " + "removed in a future version. 
For DataFrames use " + "`.dtypes.value_counts()", + FutureWarning, + stacklevel=2, + ) from pandas import Series + return Series(self._data.get_dtype_counts()) def get_ftype_counts(self): @@ -5335,11 +5643,14 @@ def get_ftype_counts(self): object:dense 1 dtype: int64 """ - warnings.warn("get_ftype_counts is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "get_ftype_counts is deprecated and will " "be removed in a future version", + FutureWarning, + stacklevel=2, + ) from pandas import Series + return Series(self._data.get_ftype_counts()) @property @@ -5375,8 +5686,8 @@ def dtypes(self): dtype: object """ from pandas import Series - return Series(self._data.get_dtypes(), index=self._info_axis, - dtype=np.object_) + + return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) @property def ftypes(self): @@ -5423,14 +5734,17 @@ def ftypes(self): 3 float64:sparse dtype: object """ - warnings.warn("DataFrame.ftypes is deprecated and will " - "be removed in a future version. " - "Use DataFrame.dtypes instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame.ftypes is deprecated and will " + "be removed in a future version. " + "Use DataFrame.dtypes instead.", + FutureWarning, + stacklevel=2, + ) from pandas import Series - return Series(self._data.get_ftypes(), index=self._info_axis, - dtype=np.object_) + + return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_) def as_blocks(self, copy=True): """ @@ -5450,9 +5764,11 @@ def as_blocks(self, copy=True): ------- values : a dict of dtype -> Constructor Types """ - warnings.warn("as_blocks is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "as_blocks is deprecated and will " "be removed in a future version", + FutureWarning, + stacklevel=2, + ) return self._to_dict_of_blocks(copy=copy) @property @@ -5471,10 +5787,12 @@ def _to_dict_of_blocks(self, copy=True): Internal ONLY """ - return {k: self._constructor(v).__finalize__(self) - for k, v, in self._data.to_dict(copy=copy).items()} + return { + k: self._constructor(v).__finalize__(self) + for k, v, in self._data.to_dict(copy=copy).items() + } - def astype(self, dtype, copy=True, errors='raise', **kwargs): + def astype(self, dtype, copy=True, errors="raise", **kwargs): """ Cast a pandas object to a specified dtype ``dtype``. @@ -5579,33 +5897,43 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: - raise KeyError('Only the Series name can be used for ' - 'the key in Series dtype mappings.') + raise KeyError( + "Only the Series name can be used for " + "the key in Series dtype mappings." + ) new_type = dtype[self.name] return self.astype(new_type, copy, errors, **kwargs) for col_name in dtype.keys(): if col_name not in self: - raise KeyError('Only a column name can be used for the ' - 'key in a dtype mappings argument.') + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." 
+ ) results = [] for col_name, col in self.iteritems(): if col_name in dtype: - results.append(col.astype(dtype=dtype[col_name], copy=copy, - errors=errors, **kwargs)) + results.append( + col.astype( + dtype=dtype[col_name], copy=copy, errors=errors, **kwargs + ) + ) else: results.append(results.append(col.copy() if copy else col)) elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: use iloc to handle duplicate column names - results = (self.iloc[:, i].astype(dtype, copy=copy) - for i in range(len(self.columns))) + results = ( + self.iloc[:, i].astype(dtype, copy=copy) + for i in range(len(self.columns)) + ) else: # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, - **kwargs) + new_data = self._data.astype( + dtype=dtype, copy=copy, errors=errors, **kwargs + ) return self._constructor(new_data).__finalize__(self) # GH 19920: retain column metadata after concat @@ -5735,8 +6063,9 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _convert(self, datetime=False, numeric=False, timedelta=False, - coerce=False, copy=True): + def _convert( + self, datetime=False, numeric=False, timedelta=False, coerce=False, copy=True + ): """ Attempt to infer better dtype for object columns @@ -5762,9 +6091,14 @@ def _convert(self, datetime=False, numeric=False, timedelta=False, converted : same as input object """ return self._constructor( - self._data.convert(datetime=datetime, numeric=numeric, - timedelta=timedelta, coerce=coerce, - copy=copy)).__finalize__(self) + self._data.convert( + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) + ).__finalize__(self) def infer_objects(self): """ @@ -5809,15 +6143,23 @@ def infer_objects(self): # python objects will still be converted to # native numpy numeric types return self._constructor( - self._data.convert(datetime=True, numeric=False, - timedelta=True, coerce=False, - copy=True)).__finalize__(self) + self._data.convert( + datetime=True, numeric=False, timedelta=True, coerce=False, copy=True + ) + ).__finalize__(self) # ---------------------------------------------------------------------- # Filling NA's - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None): + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ): """ Fill NA/NaN values using the specified method. 
@@ -5914,7 +6256,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, 2 NaN 1.0 NaN 5 3 NaN 3.0 NaN 4 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) self._consolidate_inplace() @@ -5926,6 +6268,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, axis = self._get_axis_number(axis) from pandas import DataFrame + if value is None: if self._is_mixed_type and axis == 1: @@ -5938,10 +6281,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return result - new_data = self._data.interpolate(method=method, axis=axis, - limit=limit, inplace=inplace, - coerce=True, - downcast=downcast) + new_data = self._data.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + coerce=True, + downcast=downcast, + ) else: if len(self._get_axis(axis)) == 0: return self @@ -5949,23 +6296,28 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if self.ndim == 1: if isinstance(value, (dict, ABCSeries)): from pandas import Series + value = Series(value) elif not is_list_like(value): pass else: - raise TypeError('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a ' - '"{0}"'.format(type(value).__name__)) - - new_data = self._data.fillna(value=value, limit=limit, - inplace=inplace, - downcast=downcast) + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + '"{0}"'.format(type(value).__name__) + ) + + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) elif isinstance(value, (dict, ABCSeries)): if axis == 1: - raise NotImplementedError('Currently only can fill ' - 'with dict/Series column ' - 'by column') + raise NotImplementedError( + "Currently only can fill " + "with dict/Series column " + "by column" + ) result = self if inplace else self.copy() for k, v in value.items(): @@ -5976,9 +6328,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return result if not inplace else None elif not is_list_like(value): - new_data = self._data.fillna(value=value, limit=limit, - inplace=inplace, - downcast=downcast) + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) elif isinstance(value, DataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: @@ -5998,8 +6350,9 @@ def ffill(self, axis=None, inplace=False, limit=None, downcast=None): %(klass)s Object with missing values filled. """ - return self.fillna(method='ffill', axis=axis, inplace=inplace, - limit=limit, downcast=downcast) + return self.fillna( + method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) def bfill(self, axis=None, inplace=False, limit=None, downcast=None): """ @@ -6010,10 +6363,13 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): %(klass)s Object with missing values filled. """ - return self.fillna(method='bfill', axis=axis, inplace=inplace, - limit=limit, downcast=downcast) + return self.fillna( + method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) - _shared_docs['replace'] = (""" + _shared_docs[ + "replace" + ] = """ Replace values given in `to_replace` with `value`. Values of the %(klass)s are replaced with other values dynamically. 
@@ -6302,15 +6658,23 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): 3 b 4 b dtype: object - """) + """ - @Appender(_shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - inplace = validate_bool_kwarg(inplace, 'inplace') + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: - raise AssertionError("'to_replace' must be 'None' if 'regex' is " - "not a bool") + raise AssertionError( + "'to_replace' must be 'None' if 'regex' is " "not a bool" + ) self._consolidate_inplace() @@ -6322,17 +6686,18 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if isinstance(to_replace, (tuple, list)): if isinstance(self, pd.DataFrame): - return self.apply(_single_replace, - args=(to_replace, method, inplace, - limit)) - return _single_replace(self, to_replace, method, inplace, - limit) + return self.apply( + _single_replace, args=(to_replace, method, inplace, limit) + ) + return _single_replace(self, to_replace, method, inplace, limit) if not is_dict_like(to_replace): if not is_dict_like(regex): - raise TypeError('If "to_replace" and "value" are both None' - ' and "to_replace" is not a list, then ' - 'regex must be a mapping') + raise TypeError( + 'If "to_replace" and "value" are both None' + ' and "to_replace" is not a list, then ' + "regex must be a mapping" + ) to_replace = regex regex = True @@ -6343,9 +6708,11 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if any(are_mappings): if not all(are_mappings): - raise TypeError("If a nested mapping is passed, all values" - " of the top level mapping must be " - "mappings") + raise TypeError( + "If a nested mapping is passed, all values" + " of the top level mapping must be " + "mappings" + ) # passed a nested dict/Series to_rep_dict = {} value_dict = {} @@ -6353,8 +6720,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for k, v in items: keys, values = list(zip(*v.items())) or ([], []) if set(keys) & set(values): - raise ValueError("Replacement not allowed with " - "overlapping keys and values") + raise ValueError( + "Replacement not allowed with " + "overlapping keys and values" + ) to_rep_dict[k] = list(keys) value_dict[k] = list(values) @@ -6362,8 +6731,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: to_replace, value = keys, values - return self.replace(to_replace, value, inplace=inplace, - limit=limit, regex=regex) + return self.replace( + to_replace, value, inplace=inplace, limit=limit, regex=regex + ) else: # need a non-zero len on all axes @@ -6379,55 +6749,67 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if c in value and c in self: # object conversion is handled in # series.replace which is called recursively - res[c] = res[c].replace(to_replace=src, - value=value[c], - inplace=False, - regex=regex) + res[c] = res[c].replace( + to_replace=src, + value=value[c], + inplace=False, + regex=regex, + ) return None if inplace else res # {'A': NA} -> 0 elif not is_list_like(value): - keys = [(k, src) for k, src in to_replace.items() - if k in self] + keys = [(k, src) for k, src in to_replace.items() if k in self] keys_len = len(keys) - 1 for i, (k, src) in 
enumerate(keys): convert = i == keys_len - new_data = new_data.replace(to_replace=src, - value=value, - filter=[k], - inplace=inplace, - regex=regex, - convert=convert) + new_data = new_data.replace( + to_replace=src, + value=value, + filter=[k], + inplace=inplace, + regex=regex, + convert=convert, + ) else: - raise TypeError('value argument must be scalar, dict, or ' - 'Series') + raise TypeError("value argument must be scalar, dict, or " "Series") elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] if is_list_like(value): if len(to_replace) != len(value): - raise ValueError('Replacement lists must match ' - 'in length. Expecting %d got %d ' % - (len(to_replace), len(value))) - - new_data = self._data.replace_list(src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex) + raise ValueError( + "Replacement lists must match " + "in length. Expecting %d got %d " + % (len(to_replace), len(value)) + ) + + new_data = self._data.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) else: # [NA, ''] -> 0 - new_data = self._data.replace(to_replace=to_replace, - value=value, inplace=inplace, - regex=regex) + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) elif to_replace is None: - if not (is_re_compilable(regex) or - is_list_like(regex) or is_dict_like(regex)): - raise TypeError("'regex' must be a string or a compiled " - "regular expression or a list or dict of " - "strings or regular expressions, you " - "passed a" - " {0!r}".format(type(regex).__name__)) - return self.replace(regex, value, inplace=inplace, limit=limit, - regex=True) + if not ( + is_re_compilable(regex) + or is_list_like(regex) + or is_dict_like(regex) + ): + raise TypeError( + "'regex' must be a string or a compiled " + "regular expression or a list or dict of " + "strings or regular expressions, you " + "passed a" + " {0!r}".format(type(regex).__name__) + ) + return self.replace( + regex, value, inplace=inplace, limit=limit, regex=True + ) else: # dest iterable dict-like @@ -6436,18 +6818,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for k, v in value.items(): if k in self: - new_data = new_data.replace(to_replace=to_replace, - value=v, filter=[k], - inplace=inplace, - regex=regex) + new_data = new_data.replace( + to_replace=to_replace, + value=v, + filter=[k], + inplace=inplace, + regex=regex, + ) elif not is_list_like(value): # NA -> 0 - new_data = self._data.replace(to_replace=to_replace, - value=value, inplace=inplace, - regex=regex) + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) else: - msg = ('Invalid "to_replace" type: ' - '{0!r}').format(type(to_replace).__name__) + msg = ('Invalid "to_replace" type: ' "{0!r}").format( + type(to_replace).__name__ + ) raise TypeError(msg) # pragma: no cover if inplace: @@ -6455,7 +6841,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: return self._constructor(new_data).__finalize__(self) - _shared_docs['interpolate'] = """ + _shared_docs[ + "interpolate" + ] = """ Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. 
@@ -6644,14 +7032,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, Name: d, dtype: float64 """ - @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, - downcast=None, **kwargs): + @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs + ): """ Interpolate values according to different methods. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if axis == 0: ax = self._info_axis_name @@ -6668,47 +7064,59 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, else: alt_ax = ax - if (isinstance(_maybe_transposed_self.index, MultiIndex) and - method != 'linear'): - raise ValueError("Only `method=linear` interpolation is supported " - "on MultiIndexes.") + if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear": + raise ValueError( + "Only `method=linear` interpolation is supported " "on MultiIndexes." + ) - if _maybe_transposed_self._data.get_dtype_counts().get( - 'object') == len(_maybe_transposed_self.T): - raise TypeError("Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype.") + if _maybe_transposed_self._data.get_dtype_counts().get("object") == len( + _maybe_transposed_self.T + ): + raise TypeError( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) # create/use the index - if method == 'linear': + if method == "linear": # prior default index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax))) else: index = _maybe_transposed_self._get_axis(alt_ax) methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( - is_numeric_dtype(index) or - is_datetime64_dtype(index) or - is_timedelta64_dtype(index) + is_numeric_dtype(index) + or is_datetime64_dtype(index) + or is_timedelta64_dtype(index) ) if method not in methods and not is_numeric_or_datetime: raise ValueError( "Index column must be numeric or datetime type when " "using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method)) + "interpolating.".format(method=method) + ) if isna(index).any(): - raise NotImplementedError("Interpolation with NaNs in the index " - "has not been implemented. Try filling " - "those NaNs before interpolating.") + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." 
+ ) data = _maybe_transposed_self._data - new_data = data.interpolate(method=method, axis=ax, index=index, - values=_maybe_transposed_self, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - inplace=inplace, downcast=downcast, - **kwargs) + new_data = data.interpolate( + method=method, + axis=ax, + index=index, + values=_maybe_transposed_self, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + **kwargs + ) if inplace: if axis == 1: @@ -6823,6 +7231,7 @@ def asof(self, where, subset=None): """ if isinstance(where, str): from pandas import to_datetime + where = to_datetime(where) if not self.index.is_monotonic: @@ -6848,6 +7257,7 @@ def asof(self, where, subset=None): if where < start: if not is_series: from pandas import Series + return Series(index=self.columns, name=where) return np.nan @@ -6858,7 +7268,7 @@ def asof(self, where, subset=None): # code path whether *where* is a scalar or list. # See PR: https://github.com/pandas-dev/pandas/pull/14476 if is_series: - loc = self.index.searchsorted(where, side='right') + loc = self.index.searchsorted(where, side="right") if loc > 0: loc -= 1 @@ -6876,9 +7286,11 @@ def asof(self, where, subset=None): return self._constructor(np.nan, index=where, name=self.name) elif is_list: from pandas import DataFrame + return DataFrame(np.nan, index=where, columns=self.columns) else: from pandas import Series + return Series(np.nan, index=self.columns, name=where[0]) locs = self.index.asof_locs(where, ~(nulls.values)) @@ -6893,7 +7305,9 @@ def asof(self, where, subset=None): # ---------------------------------------------------------------------- # Action Methods - _shared_docs['isna'] = """ + _shared_docs[ + "isna" + ] = """ Detect missing values. Return a boolean same-sized object indicating if the values are NA. @@ -6953,15 +7367,17 @@ def asof(self, where, subset=None): dtype: bool """ - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return isna(self).__finalize__(self) - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return isna(self).__finalize__(self) - _shared_docs['notna'] = """ + _shared_docs[ + "notna" + ] = """ Detect existing (non-missing) values. Return a boolean same-sized object indicating if the values are not NA. 
@@ -7021,23 +7437,24 @@ def isnull(self): dtype: bool """ - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return notna(self).__finalize__(self) - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return notna(self).__finalize__(self) def _clip_with_scalar(self, lower, upper, inplace=False): - if ((lower is not None and np.any(isna(lower))) or - (upper is not None and np.any(isna(upper)))): + if (lower is not None and np.any(isna(lower))) or ( + upper is not None and np.any(isna(upper)) + ): raise ValueError("Cannot use an NA value as a clip threshold") result = self mask = isna(self.values) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if upper is not None: subset = self.to_numpy() <= upper result = result.where(subset, upper, axis=None, inplace=False) @@ -7060,7 +7477,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # method is self.le for upper bound and self.ge for lower bound if is_scalar(threshold) and is_number(threshold): - if method.__name__ == 'le': + if method.__name__ == "le": return self._clip_with_scalar(None, threshold, inplace=inplace) return self._clip_with_scalar(threshold, None, inplace=inplace) @@ -7073,12 +7490,10 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): if isinstance(self, ABCSeries): threshold = pd.Series(threshold, index=self.index) else: - threshold = _align_method_FRAME(self, threshold, - axis) + threshold = _align_method_FRAME(self, threshold, axis) return self.where(subset, threshold, axis=axis, inplace=inplace) - def clip(self, lower=None, upper=None, axis=None, inplace=False, - *args, **kwargs): + def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): """ Trim values at input threshold(s). 
@@ -7151,7 +7566,7 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, 3 6 8 4 5 3 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") axis = nv.validate_clip_with_axis(axis, args, kwargs) if axis is not None: @@ -7173,19 +7588,22 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, lower, upper = min(lower, upper), max(lower, upper) # fast-path for scalars - if ((lower is None or (is_scalar(lower) and is_number(lower))) and - (upper is None or (is_scalar(upper) and is_number(upper)))): + if (lower is None or (is_scalar(lower) and is_number(lower))) and ( + upper is None or (is_scalar(upper) and is_number(upper)) + ): return self._clip_with_scalar(lower, upper, inplace=inplace) result = self if lower is not None: - result = result._clip_with_one_bound(lower, method=self.ge, - axis=axis, inplace=inplace) + result = result._clip_with_one_bound( + lower, method=self.ge, axis=axis, inplace=inplace + ) if upper is not None: if inplace: result = self - result = result._clip_with_one_bound(upper, method=self.le, - axis=axis, inplace=inplace) + result = result._clip_with_one_bound( + upper, method=self.le, axis=axis, inplace=inplace + ) return result @@ -7263,11 +7681,14 @@ def clip_upper(self, threshold, axis=None, inplace=False): 4 1 dtype: int64 """ - warnings.warn('clip_upper(threshold) is deprecated, ' - 'use clip(upper=threshold) instead', - FutureWarning, stacklevel=2) - return self._clip_with_one_bound(threshold, method=self.le, - axis=axis, inplace=inplace) + warnings.warn( + "clip_upper(threshold) is deprecated, " "use clip(upper=threshold) instead", + FutureWarning, + stacklevel=2, + ) + return self._clip_with_one_bound( + threshold, method=self.le, axis=axis, inplace=inplace + ) def clip_lower(self, threshold, axis=None, inplace=False): """ @@ -7379,14 +7800,27 @@ def clip_lower(self, threshold, axis=None, inplace=False): 1 4 5 2 5 6 """ - warnings.warn('clip_lower(threshold) is deprecated, ' - 'use clip(lower=threshold) instead', - FutureWarning, stacklevel=2) - return self._clip_with_one_bound(threshold, method=self.ge, - axis=axis, inplace=inplace) - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, observed=False, **kwargs): + warnings.warn( + "clip_lower(threshold) is deprecated, " "use clip(lower=threshold) instead", + FutureWarning, + stacklevel=2, + ) + return self._clip_with_one_bound( + threshold, method=self.ge, axis=axis, inplace=inplace + ) + + def groupby( + self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + observed=False, + **kwargs + ): """ Group DataFrame or Series using a mapper or by a Series of columns. 
@@ -7501,12 +7935,20 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby(self, by=by, axis=axis, level=level, as_index=as_index, - sort=sort, group_keys=group_keys, squeeze=squeeze, - observed=observed, **kwargs) - - def asfreq(self, freq, method=None, how=None, normalize=False, - fill_value=None): + return groupby( + self, + by=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + **kwargs + ) + + def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): """ Convert TimeSeries to specified frequency. @@ -7601,8 +8043,15 @@ def asfreq(self, freq, method=None, how=None, normalize=False, 2000-01-01 00:03:00 3.0 """ from pandas.core.resample import asfreq - return asfreq(self, freq, method=method, how=how, normalize=normalize, - fill_value=fill_value) + + return asfreq( + self, + freq, + method=method, + how=how, + normalize=normalize, + fill_value=fill_value, + ) def at_time(self, time, asof=False, axis=None): """ @@ -7656,12 +8105,13 @@ def at_time(self, time, asof=False, axis=None): try: indexer = index.indexer_at_time(time, asof=asof) except AttributeError: - raise TypeError('Index must be DatetimeIndex') + raise TypeError("Index must be DatetimeIndex") return self._take(indexer, axis=axis) - def between_time(self, start_time, end_time, include_start=True, - include_end=True, axis=None): + def between_time( + self, start_time, end_time, include_start=True, include_end=True, axis=None + ): """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -7726,16 +8176,32 @@ def between_time(self, start_time, end_time, include_start=True, index = self._get_axis(axis) try: indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, - include_end=include_end) + start_time, + end_time, + include_start=include_start, + include_end=include_end, + ) except AttributeError: - raise TypeError('Index must be DatetimeIndex') + raise TypeError("Index must be DatetimeIndex") return self._take(indexer, axis=axis) - def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, - label=None, convention='start', kind=None, loffset=None, - limit=None, base=0, on=None, level=None): + def resample( + self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention="start", + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None, + ): """ Resample time-series data. 
@@ -8020,17 +8486,25 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, 2000-01-04 36 90 """ - from pandas.core.resample import (resample, - _maybe_process_deprecations) + from pandas.core.resample import resample, _maybe_process_deprecations + axis = self._get_axis_number(axis) - r = resample(self, freq=rule, label=label, closed=closed, - axis=axis, kind=kind, loffset=loffset, - convention=convention, - base=base, key=on, level=level) - return _maybe_process_deprecations(r, - how=how, - fill_method=fill_method, - limit=limit) + r = resample( + self, + freq=rule, + label=label, + closed=closed, + axis=axis, + kind=kind, + loffset=loffset, + convention=convention, + base=base, + key=on, + level=level, + ) + return _maybe_process_deprecations( + r, how=how, fill_method=fill_method, limit=limit + ) def first(self, offset): """ @@ -8088,9 +8562,9 @@ def first(self, offset): end_date = end = self.index[0] + offset # Tick-like, e.g. 3 weeks - if not offset.isAnchored() and hasattr(offset, '_inc'): + if not offset.isAnchored() and hasattr(offset, "_inc"): if end_date in self.index: - end = self.index.searchsorted(end_date, side='left') + end = self.index.searchsorted(end_date, side="left") return self.iloc[:end] return self.loc[:end] @@ -8150,11 +8624,18 @@ def last(self, offset): offset = to_offset(offset) start_date = self.index[-1] - offset - start = self.index.searchsorted(start_date, side='right') + start = self.index.searchsorted(start_date, side="right") return self.iloc[start:] - def rank(self, axis=0, method='average', numeric_only=None, - na_option='keep', ascending=True, pct=False): + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): """ Compute numerical data ranks (1 through n) along axis. @@ -8238,14 +8719,19 @@ def rank(self, axis=0, method='average', numeric_only=None, """ axis = self._get_axis_number(axis) - if na_option not in {'keep', 'top', 'bottom'}: + if na_option not in {"keep", "top", "bottom"}: msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) def ranker(data): - ranks = algos.rank(data.values, axis=axis, method=method, - ascending=ascending, na_option=na_option, - pct=pct) + ranks = algos.rank( + data.values, + axis=axis, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) ranks = self._constructor(ranks, **data._construct_axes_dict()) return ranks.__finalize__(self) @@ -8264,7 +8750,9 @@ def ranker(data): return ranker(data) - _shared_docs['align'] = (""" + _shared_docs[ + "align" + ] = """ Align two objects on their axes with the specified join method for each axis Index. @@ -8304,13 +8792,24 @@ def ranker(data): ------- (left, right) : (%(klass)s, type of other) Aligned objects. 
- """) + """ - @Appender(_shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): from pandas import DataFrame, Series + method = missing.clean_fill_method(method) if broadcast_axis == 1 and self.ndim != other.ndim: @@ -8318,41 +8817,80 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means other is a DataFrame, and we need to broadcast # self cons = self._constructor_expanddim - df = cons({c: self for c in other.columns}, - **other._construct_axes_dict()) - return df._align_frame(other, join=join, axis=axis, - level=level, copy=copy, - fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis) + df = cons( + {c: self for c in other.columns}, **other._construct_axes_dict() + ) + return df._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) elif isinstance(other, Series): # this means self is a DataFrame, and we need to broadcast # other cons = other._constructor_expanddim - df = cons({c: other for c in self.columns}, - **self._construct_axes_dict()) - return self._align_frame(df, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + df = cons( + {c: other for c in self.columns}, **self._construct_axes_dict() + ) + return self._align_frame( + df, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) if axis is not None: axis = self._get_axis_number(axis) if isinstance(other, DataFrame): - return self._align_frame(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + return self._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) elif isinstance(other, Series): - return self._align_series(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + return self._align_series( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) else: # pragma: no cover - raise TypeError('unsupported type: %s' % type(other)) - - def _align_frame(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=None, method=None, limit=None, - fill_axis=0): + raise TypeError("unsupported type: %s" % type(other)) + + def _align_frame( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): # defaults join_index, join_columns = None, None ilidx, iridx = None, None @@ -8363,26 +8901,30 @@ def _align_frame(self, other, join='outer', axis=None, level=None, if axis is None or axis == 0: if not self.index.equals(other.index): join_index, ilidx, iridx = self.index.join( - other.index, how=join, level=level, return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if axis is None 
or axis == 1: if not is_series and not self.columns.equals(other.columns): join_columns, clidx, cridx = self.columns.join( - other.columns, how=join, level=level, return_indexers=True) + other.columns, how=join, level=level, return_indexers=True + ) if is_series: reindexers = {0: [join_index, ilidx]} else: reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]} - left = self._reindex_with_indexers(reindexers, copy=copy, - fill_value=fill_value, - allow_dups=True) + left = self._reindex_with_indexers( + reindexers, copy=copy, fill_value=fill_value, allow_dups=True + ) # other must be always DataFrame - right = other._reindex_with_indexers({0: [join_index, iridx], - 1: [join_columns, cridx]}, - copy=copy, fill_value=fill_value, - allow_dups=True) + right = other._reindex_with_indexers( + {0: [join_index, iridx], 1: [join_columns, cridx]}, + copy=copy, + fill_value=fill_value, + allow_dups=True, + ) if method is not None: left = left.fillna(axis=fill_axis, method=method, limit=limit) @@ -8397,25 +8939,33 @@ def _align_frame(self, other, join='outer', axis=None, level=None, return left.__finalize__(self), right.__finalize__(other) - def _align_series(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=None, method=None, limit=None, - fill_axis=0): + def _align_series( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): is_series = isinstance(self, ABCSeries) # series/series compat, other must always be a Series if is_series: if axis: - raise ValueError('cannot align series to a series other than ' - 'axis 0') + raise ValueError("cannot align series to a series other than " "axis 0") # equal if self.index.equals(other.index): join_index, lidx, ridx = None, None, None else: - join_index, lidx, ridx = self.index.join(other.index, how=join, - level=level, - return_indexers=True) + join_index, lidx, ridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) left = self._reindex_indexer(join_index, lidx, copy) right = other._reindex_indexer(join_index, ridx, copy) @@ -8428,8 +8978,8 @@ def _align_series(self, other, join='outer', axis=None, level=None, lidx, ridx = None, None if not self.index.equals(other.index): join_index, lidx, ridx = self.index.join( - other.index, how=join, level=level, - return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if lidx is not None: fdata = fdata.reindex_indexer(join_index, lidx, axis=1) @@ -8439,13 +8989,13 @@ def _align_series(self, other, join='outer', axis=None, level=None, lidx, ridx = None, None if not self.columns.equals(other.index): join_index, lidx, ridx = self.columns.join( - other.index, how=join, level=level, - return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if lidx is not None: fdata = fdata.reindex_indexer(join_index, lidx, axis=0) else: - raise ValueError('Must specify axis=0 or 1') + raise ValueError("Must specify axis=0 or 1") if copy and fdata is self._data: fdata = fdata.copy() @@ -8460,8 +9010,7 @@ def _align_series(self, other, join='outer', axis=None, level=None, # fill fill_na = notna(fill_value) or (method is not None) if fill_na: - left = left.fillna(fill_value, method=method, limit=limit, - axis=fill_axis) + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) right = right.fillna(fill_value, method=method, limit=limit) # if DatetimeIndex have different tz, convert to UTC @@ -8474,24 +9023,31 @@ def 
_align_series(self, other, join='outer', axis=None, level=None, return left.__finalize__(self), right.__finalize__(other) - def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): + def _where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): """ Equivalent to public method `where`, except that `other` is not applied as a function even if callable. Used in __setitem__. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # align the cond to same shape as myself cond = com.apply_if_callable(cond, self) if isinstance(cond, NDFrame): - cond, _ = cond.align(self, join='right', broadcast_axis=1) + cond, _ = cond.align(self, join="right", broadcast_axis=1) else: - if not hasattr(cond, 'shape'): + if not hasattr(cond, "shape"): cond = np.asanyarray(cond) if cond.shape != self.shape: - raise ValueError('Array conditional must be same shape as ' - 'self') + raise ValueError("Array conditional must be same shape as " "self") cond = self._constructor(cond, **self._construct_axes_dict()) # make sure we are boolean @@ -8513,24 +9069,26 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # try to align with other try_quick = True - if hasattr(other, 'align'): + if hasattr(other, "align"): # align with me if other.ndim <= self.ndim: - _, other = self.align(other, join='left', axis=axis, - level=level, fill_value=np.nan) + _, other = self.align( + other, join="left", axis=axis, level=level, fill_value=np.nan + ) # if we are NOT aligned, raise as we cannot where index - if (axis is None and - not all(other._get_axis(i).equals(ax) - for i, ax in enumerate(self.axes))): + if axis is None and not all( + other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes) + ): raise InvalidIndexError # slice me out of the other else: - raise NotImplementedError("cannot align with a higher " - "dimensional NDFrame") + raise NotImplementedError( + "cannot align with a higher " "dimensional NDFrame" + ) if isinstance(other, np.ndarray): @@ -8571,12 +9129,14 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, other = new_other else: - raise ValueError('Length of replacements must equal ' - 'series length') + raise ValueError( + "Length of replacements must equal " "series length" + ) else: - raise ValueError('other must be the same shape as self ' - 'when an ndarray') + raise ValueError( + "other must be the same shape as self " "when an ndarray" + ) # we are the same shape, so create an actual object for alignment else: @@ -8585,10 +9145,10 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if axis is None: axis = 0 - if self.ndim == getattr(other, 'ndim', 0): + if self.ndim == getattr(other, "ndim", 0): align = True else: - align = (self._get_axis_number(axis) == 1) + align = self._get_axis_number(axis) == 1 block_axis = self._get_block_manager_axis(axis) @@ -8597,20 +9157,32 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # reconstruct the block manager self._check_inplace_setting(other) - new_data = self._data.putmask(mask=cond, new=other, align=align, - inplace=True, axis=block_axis, - transpose=self._AXIS_REVERSED) + new_data = self._data.putmask( + mask=cond, + new=other, + align=align, + inplace=True, + axis=block_axis, + transpose=self._AXIS_REVERSED, + ) self._update_inplace(new_data) else: - new_data = 
self._data.where(other=other, cond=cond, align=align, - errors=errors, - try_cast=try_cast, axis=block_axis, - transpose=self._AXIS_REVERSED) + new_data = self._data.where( + other=other, + cond=cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=block_axis, + transpose=self._AXIS_REVERSED, + ) return self._constructor(new_data).__finalize__(self) - _shared_docs['where'] = (""" + _shared_docs[ + "where" + ] = """ Replace values where the condition is %(cond_rev)s. Parameters @@ -8731,36 +9303,75 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, 2 True True 3 True True 4 True True - """) + """ - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True", - cond_rev="False", name='where', - name_other='mask')) - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="True", + cond_rev="False", + name="where", + name_other="mask", + ) + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): other = com.apply_if_callable(other, self) - return self._where(cond, other, inplace, axis, level, - errors=errors, try_cast=try_cast) - - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False", - cond_rev="True", name='mask', - name_other='where')) - def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): - - inplace = validate_bool_kwarg(inplace, 'inplace') + return self._where( + cond, other, inplace, axis, level, errors=errors, try_cast=try_cast + ) + + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="False", + cond_rev="True", + name="mask", + name_other="where", + ) + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") cond = com.apply_if_callable(cond, self) # see gh-21891 if not hasattr(cond, "__invert__"): cond = np.array(cond) - return self.where(~cond, other=other, inplace=inplace, axis=axis, - level=level, try_cast=try_cast, - errors=errors) - - _shared_docs['shift'] = (""" + return self.where( + ~cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + try_cast=try_cast, + errors=errors, + ) + + _shared_docs[ + "shift" + ] = """ Shift index by desired number of periods with an optional time `freq`. When `freq` is not passed, shift the index without realigning the data. 
@@ -8830,17 +9441,18 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, 2 0 0 0 3 10 13 17 4 20 23 27 - """) + """ - @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): if periods == 0: return self.copy() block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis, - fill_value=fill_value) + new_data = self._data.shift( + periods=periods, axis=block_axis, fill_value=fill_value + ) else: return self.tshift(periods, freq) @@ -8908,13 +9520,13 @@ def tshift(self, periods=1, freq=None, axis=0): index = self._get_axis(axis) if freq is None: - freq = getattr(index, 'freq', None) + freq = getattr(index, "freq", None) if freq is None: - freq = getattr(index, 'inferred_freq', None) + freq = getattr(index, "inferred_freq", None) if freq is None: - msg = 'Freq was not given and was not set in the index' + msg = "Freq was not given and was not set in the index" raise ValueError(msg) if periods == 0: @@ -8930,8 +9542,10 @@ def tshift(self, periods=1, freq=None, axis=0): new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) else: - msg = ('Given freq %s does not match PeriodIndex freq %s' % - (freq.rule_code, orig_freq.rule_code)) + msg = "Given freq %s does not match PeriodIndex freq %s" % ( + freq.rule_code, + orig_freq.rule_code, + ) raise ValueError(msg) else: new_data = self._data.copy() @@ -9072,21 +9686,20 @@ def truncate(self, before=None, after=None, axis=None, copy=True): # treat like a slice if ax.is_all_dates: from pandas.core.tools.datetimes import to_datetime + before = to_datetime(before) after = to_datetime(after) if before is not None and after is not None: if before > after: - raise ValueError('Truncate: %s must be after %s' % - (after, before)) + raise ValueError("Truncate: %s must be after %s" % (after, before)) slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) result = self.loc[tuple(slicer)] if isinstance(ax, MultiIndex): - setattr(result, self._get_axis_name(axis), - ax.truncate(before, after)) + setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) if copy: result = result.copy() @@ -9121,11 +9734,12 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): ax = self._get_axis(axis) def _tz_convert(ax, tz): - if not hasattr(ax, 'tz_convert'): + if not hasattr(ax, "tz_convert"): if len(ax) > 0: ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or ' - 'PeriodIndex' % ax_name) + raise TypeError( + "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name + ) else: ax = DatetimeIndex([], tz=tz) else: @@ -9147,8 +9761,9 @@ def _tz_convert(ax, tz): result = result.set_axis(ax, axis=axis, inplace=False) return result.__finalize__(self) - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise', nonexistent='raise'): + def tz_localize( + self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" + ): """ Localize tz-naive index of a Series or DataFrame to target time zone. 
@@ -9269,38 +9884,37 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, 2015-03-29 03:30:00+02:00 1 dtype: int64 """ - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise'," + " 'NaT', 'shift_forward', 'shift_backward' or" + " a timedelta object" + ) axis = self._get_axis_number(axis) ax = self._get_axis(axis) def _tz_localize(ax, tz, ambiguous, nonexistent): - if not hasattr(ax, 'tz_localize'): + if not hasattr(ax, "tz_localize"): if len(ax) > 0: ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or ' - 'PeriodIndex' % ax_name) + raise TypeError( + "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name + ) else: ax = DatetimeIndex([], tz=tz) else: - ax = ax.tz_localize( - tz, ambiguous=ambiguous, nonexistent=nonexistent - ) + ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent) return ax # if a level is given it must be a MultiIndex level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - new_level = _tz_localize( - ax.levels[level], tz, ambiguous, nonexistent - ) + new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent) ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): @@ -9641,14 +10255,18 @@ def describe(self, percentiles=None, include=None, exclude=None): formatted_percentiles = format_percentiles(percentiles) def describe_numeric_1d(series): - stat_index = (['count', 'mean', 'std', 'min'] + - formatted_percentiles + ['max']) - d = ([series.count(), series.mean(), series.std(), series.min()] + - series.quantile(percentiles).tolist() + [series.max()]) + stat_index = ( + ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + ) + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) return pd.Series(d, index=stat_index, name=series.name) def describe_categorical_1d(data): - names = ['count', 'unique'] + names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] @@ -9658,27 +10276,30 @@ def describe_categorical_1d(data): if is_datetime64_any_dtype(data): tz = data.dt.tz - asint = data.dropna().values.view('i8') + asint = data.dropna().values.view("i8") top = Timestamp(top) if top.tzinfo is not None and tz is not None: # Don't tz_localize(None) if key is already tz-aware top = top.tz_convert(tz) else: top = top.tz_localize(tz) - names += ['top', 'freq', 'first', 'last'] - result += [top, freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz)] + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] else: - names += ['top', 'freq'] + names += ["top", "freq"] result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency else: - names += ['top', 'freq'] + names += ["top", "freq"] result += [np.nan, np.nan] - dtype = 'object' + dtype = "object" return pd.Series(result, 
index=names, name=data.name, dtype=dtype) @@ -9699,7 +10320,7 @@ def describe_1d(data): data = self.select_dtypes(include=[np.number]) if len(data.columns) == 0: data = self - elif include == 'all': + elif include == "all": if exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) @@ -9716,8 +10337,7 @@ def describe_1d(data): if name not in names: names.append(name) - d = pd.concat([x.reindex(names, copy=False) for x in ldesc], - axis=1, sort=False) + d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d @@ -9726,8 +10346,7 @@ def _check_percentile(self, q): Validate percentiles (used by describe and quantile). """ - msg = ("percentiles should all be in the interval [0, 1]. " - "Try {0} instead.") + msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." q = np.asarray(q) if q.ndim == 0: if not 0 <= q <= 1: @@ -9737,7 +10356,9 @@ def _check_percentile(self, q): raise ValueError(msg.format(q / 100.0)) return q - _shared_docs['pct_change'] = """ + _shared_docs[ + "pct_change" + ] = """ Percentage change between the current and a prior element. Computes the percentage change from the immediately previous row by @@ -9852,18 +10473,16 @@ def _check_percentile(self, q): APPL NaN 0.337604 0.012002 """ - @Appender(_shared_docs['pct_change'] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - **kwargs): + @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): # TODO: Not sure if above is correct - need someone to confirm. - axis = self._get_axis_number(kwargs.pop('axis', self._stat_axis_name)) + axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self else: data = self.fillna(method=fill_method, limit=limit, axis=axis) - rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, - **kwargs)) - 1) + rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 rs = rs.reindex_like(data) if freq is None: mask = isna(com.values_from_object(data)) @@ -9890,16 +10509,40 @@ def _add_numeric_operations(cls): axis_descr, name, name2 = _doc_parms(cls) cls.any = _make_logical_function( - cls, 'any', name, name2, axis_descr, _any_desc, nanops.nanany, - _any_see_also, _any_examples, empty_value=False) + cls, + "any", + name, + name2, + axis_descr, + _any_desc, + nanops.nanany, + _any_see_also, + _any_examples, + empty_value=False, + ) cls.all = _make_logical_function( - cls, 'all', name, name2, axis_descr, _all_desc, nanops.nanall, - _all_see_also, _all_examples, empty_value=True) - - @Substitution(desc="Return the mean absolute deviation of the values " - "for the requested axis.", - name1=name, name2=name2, axis_descr=axis_descr, - min_count='', see_also='', examples='') + cls, + "all", + name, + name2, + axis_descr, + _all_desc, + nanops.nanall, + _all_see_also, + _all_examples, + empty_value=True, + ) + + @Substitution( + desc="Return the mean absolute deviation of the values " + "for the requested axis.", + name1=name, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) @Appender(_num_doc) def mad(self, axis=None, skipna=None, level=None): if skipna is None: @@ -9907,8 +10550,7 @@ def mad(self, axis=None, skipna=None, level=None): if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level('mad', 
axis=axis, level=level, - skipna=skipna) + return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) data = self._get_numeric_data() if axis == 0: @@ -9920,31 +10562,54 @@ def mad(self, axis=None, skipna=None, level=None): cls.mad = mad cls.sem = _make_stat_function_ddof( - cls, 'sem', name, name2, axis_descr, + cls, + "sem", + name, + name2, + axis_descr, "Return unbiased standard error of the mean over requested " "axis.\n\nNormalized by N-1 by default. This can be changed " "using the ddof argument", - nanops.nansem) + nanops.nansem, + ) cls.var = _make_stat_function_ddof( - cls, 'var', name, name2, axis_descr, + cls, + "var", + name, + name2, + axis_descr, "Return unbiased variance over requested axis.\n\nNormalized by " "N-1 by default. This can be changed using the ddof argument", - nanops.nanvar) + nanops.nanvar, + ) cls.std = _make_stat_function_ddof( - cls, 'std', name, name2, axis_descr, + cls, + "std", + name, + name2, + axis_descr, "Return sample standard deviation over requested axis." "\n\nNormalized by N-1 by default. This can be changed using the " "ddof argument", - nanops.nanstd) - - @Substitution(desc="Return the compound percentage of the values for " - "the requested axis.\n\n.. deprecated:: 0.25.0", - name1=name, name2=name2, axis_descr=axis_descr, - min_count='', see_also='', examples='') + nanops.nanstd, + ) + + @Substitution( + desc="Return the compound percentage of the values for " + "the requested axis.\n\n.. deprecated:: 0.25.0", + name1=name, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) @Appender(_num_doc) def compound(self, axis=None, skipna=None, level=None): - msg = ("The 'compound' method is deprecated and will be" - "removed in a future version.") + msg = ( + "The 'compound' method is deprecated and will be" + "removed in a future version." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) if skipna is None: skipna = True @@ -9953,63 +10618,146 @@ def compound(self, axis=None, skipna=None, level=None): cls.compound = compound cls.cummin = _make_cum_function( - cls, 'cummin', name, name2, axis_descr, "minimum", - lambda y, axis: np.minimum.accumulate(y, axis), "min", - np.inf, np.nan, _cummin_examples) + cls, + "cummin", + name, + name2, + axis_descr, + "minimum", + lambda y, axis: np.minimum.accumulate(y, axis), + "min", + np.inf, + np.nan, + _cummin_examples, + ) cls.cumsum = _make_cum_function( - cls, 'cumsum', name, name2, axis_descr, "sum", - lambda y, axis: y.cumsum(axis), "sum", 0., - np.nan, _cumsum_examples) + cls, + "cumsum", + name, + name2, + axis_descr, + "sum", + lambda y, axis: y.cumsum(axis), + "sum", + 0.0, + np.nan, + _cumsum_examples, + ) cls.cumprod = _make_cum_function( - cls, 'cumprod', name, name2, axis_descr, "product", - lambda y, axis: y.cumprod(axis), "prod", 1., - np.nan, _cumprod_examples) + cls, + "cumprod", + name, + name2, + axis_descr, + "product", + lambda y, axis: y.cumprod(axis), + "prod", + 1.0, + np.nan, + _cumprod_examples, + ) cls.cummax = _make_cum_function( - cls, 'cummax', name, name2, axis_descr, "maximum", - lambda y, axis: np.maximum.accumulate(y, axis), "max", - -np.inf, np.nan, _cummax_examples) + cls, + "cummax", + name, + name2, + axis_descr, + "maximum", + lambda y, axis: np.maximum.accumulate(y, axis), + "max", + -np.inf, + np.nan, + _cummax_examples, + ) cls.sum = _make_min_count_stat_function( - cls, 'sum', name, name2, axis_descr, + cls, + "sum", + name, + name2, + axis_descr, """Return the sum of the values for the requested axis.\n This is equivalent to the method ``numpy.sum``.""", - nanops.nansum, _stat_func_see_also, _sum_examples) + nanops.nansum, + _stat_func_see_also, + _sum_examples, + ) cls.mean = _make_stat_function( - cls, 'mean', name, name2, axis_descr, - 'Return the mean of the values for the requested axis.', - nanops.nanmean) + cls, + "mean", + name, + name2, + axis_descr, + "Return the mean of the values for the requested axis.", + nanops.nanmean, + ) cls.skew = _make_stat_function( - cls, 'skew', name, name2, axis_descr, - 'Return unbiased skew over requested axis\nNormalized by N-1.', - nanops.nanskew) + cls, + "skew", + name, + name2, + axis_descr, + "Return unbiased skew over requested axis\nNormalized by N-1.", + nanops.nanskew, + ) cls.kurt = _make_stat_function( - cls, 'kurt', name, name2, axis_descr, + cls, + "kurt", + name, + name2, + axis_descr, "Return unbiased kurtosis over requested axis using Fisher's " "definition of\nkurtosis (kurtosis of normal == 0.0). 
Normalized " "by N-1.", - nanops.nankurt) + nanops.nankurt, + ) cls.kurtosis = cls.kurt cls.prod = _make_min_count_stat_function( - cls, 'prod', name, name2, axis_descr, - 'Return the product of the values for the requested axis.', - nanops.nanprod, examples=_prod_examples) + cls, + "prod", + name, + name2, + axis_descr, + "Return the product of the values for the requested axis.", + nanops.nanprod, + examples=_prod_examples, + ) cls.product = cls.prod cls.median = _make_stat_function( - cls, 'median', name, name2, axis_descr, - 'Return the median of the values for the requested axis.', - nanops.nanmedian) + cls, + "median", + name, + name2, + axis_descr, + "Return the median of the values for the requested axis.", + nanops.nanmedian, + ) cls.max = _make_stat_function( - cls, 'max', name, name2, axis_descr, + cls, + "max", + name, + name2, + axis_descr, """Return the maximum of the values for the requested axis.\n If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``.""", - nanops.nanmax, _stat_func_see_also, _max_examples) + nanops.nanmax, + _stat_func_see_also, + _max_examples, + ) cls.min = _make_stat_function( - cls, 'min', name, name2, axis_descr, + cls, + "min", + name, + name2, + axis_descr, """Return the minimum of the values for the requested axis.\n If you want the *index* of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``.""", - nanops.nanmin, _stat_func_see_also, _min_examples) + nanops.nanmin, + _stat_func_see_also, + _min_examples, + ) @classmethod def _add_series_only_operations(cls): @@ -10023,18 +10771,26 @@ def _add_series_only_operations(cls): def nanptp(values, axis=0, skipna=True): nmax = nanops.nanmax(values, axis, skipna) nmin = nanops.nanmin(values, axis, skipna) - warnings.warn("Method .ptp is deprecated and will be removed " - "in a future version. Use numpy.ptp instead.", - FutureWarning, stacklevel=4) + warnings.warn( + "Method .ptp is deprecated and will be removed " + "in a future version. Use numpy.ptp instead.", + FutureWarning, + stacklevel=4, + ) return nmax - nmin cls.ptp = _make_stat_function( - cls, 'ptp', name, name2, axis_descr, + cls, + "ptp", + name, + name2, + axis_descr, """Return the difference between the maximum value and the minimum value in the object. This is the equivalent of the ``numpy.ndarray`` method ``ptp``.\n\n.. 
deprecated:: 0.24.0 Use numpy.ptp instead""", - nanptp) + nanptp, + ) @classmethod def _add_series_or_dataframe_operations(cls): @@ -10046,48 +10802,80 @@ def _add_series_or_dataframe_operations(cls): from pandas.core import window as rwindow @Appender(rwindow.rolling.__doc__) - def rolling(self, window, min_periods=None, center=False, - win_type=None, on=None, axis=0, closed=None): + def rolling( + self, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): axis = self._get_axis_number(axis) - return rwindow.rolling(self, window=window, - min_periods=min_periods, - center=center, win_type=win_type, - on=on, axis=axis, closed=closed) + return rwindow.rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) cls.rolling = rolling @Appender(rwindow.expanding.__doc__) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) - return rwindow.expanding(self, min_periods=min_periods, - center=center, axis=axis) + return rwindow.expanding( + self, min_periods=min_periods, center=center, axis=axis + ) cls.expanding = expanding @Appender(rwindow.ewm.__doc__) - def ewm(self, com=None, span=None, halflife=None, alpha=None, - min_periods=0, adjust=True, ignore_na=False, - axis=0): + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): axis = self._get_axis_number(axis) - return rwindow.ewm(self, com=com, span=span, halflife=halflife, - alpha=alpha, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na, axis=axis) + return rwindow.ewm( + self, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) cls.ewm = ewm - @Appender(_shared_docs['transform'] % dict(axis="", **_shared_doc_kwargs)) + @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs)) def transform(self, func, *args, **kwargs): result = self.agg(func, *args, **kwargs) if is_scalar(result) or len(result) != len(self): - raise ValueError("transforms cannot produce " - "aggregated results") + raise ValueError("transforms cannot produce " "aggregated results") return result # ---------------------------------------------------------------------- # Misc methods - _shared_docs['valid_index'] = """ + _shared_docs[ + "valid_index" + ] = """ Return index for %(position)s non-NA/null value. 
Returns @@ -10113,7 +10901,7 @@ def _find_valid_index(self, how): ------- idx_first_valid : type of index """ - assert how in ['first', 'last'] + assert how in ["first", "last"] if len(self) == 0: # early stop return None @@ -10122,10 +10910,10 @@ def _find_valid_index(self, how): if self.ndim == 2: is_valid = is_valid.any(1) # reduce axis 1 - if how == 'first': + if how == "first": idxpos = is_valid.values[::].argmax() - if how == 'last': + if how == "last": idxpos = len(self) - 1 - is_valid.values[::-1].argmax() chk_notna = is_valid.iat[idxpos] @@ -10135,23 +10923,25 @@ def _find_valid_index(self, how): return None return idx - @Appender(_shared_docs['valid_index'] % {'position': 'first', - 'klass': 'Series/DataFrame'}) + @Appender( + _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"} + ) def first_valid_index(self): - return self._find_valid_index('first') + return self._find_valid_index("first") - @Appender(_shared_docs['valid_index'] % {'position': 'last', - 'klass': 'Series/DataFrame'}) + @Appender( + _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"} + ) def last_valid_index(self): - return self._find_valid_index('last') + return self._find_valid_index("last") def _doc_parms(cls): """Return a tuple of the doc parms.""" - axis_descr = "{%s}" % ', '.join("{0} ({1})".format(a, i) - for i, a in enumerate(cls._AXIS_ORDERS)) - name = (cls._constructor_sliced.__name__ - if cls._AXIS_LEN > 1 else 'scalar') + axis_descr = "{%s}" % ", ".join( + "{0} ({1})".format(a, i) for i, a in enumerate(cls._AXIS_ORDERS) + ) + name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar" name2 = cls.__name__ return axis_descr, name, name2 @@ -10684,7 +11474,9 @@ def _doc_parms(cls): Series([], dtype: bool) """ -_shared_docs['stat_func_example'] = """ +_shared_docs[ + "stat_func_example" +] = """ Examples -------- @@ -10718,12 +11510,9 @@ def _doc_parms(cls): cold {level_output_1} Name: legs, dtype: int64""" -_sum_examples = _shared_docs['stat_func_example'].format( - stat_func='sum', - verb='Sum', - default_output=14, - level_output_0=6, - level_output_1=8) +_sum_examples = _shared_docs["stat_func_example"].format( + stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8 +) _sum_examples += """ @@ -10747,19 +11536,13 @@ def _doc_parms(cls): >>> pd.Series([np.nan]).sum(min_count=1) nan""" -_max_examples = _shared_docs['stat_func_example'].format( - stat_func='max', - verb='Max', - default_output=8, - level_output_0=4, - level_output_1=8) +_max_examples = _shared_docs["stat_func_example"].format( + stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8 +) -_min_examples = _shared_docs['stat_func_example'].format( - stat_func='min', - verb='Min', - default_output=0, - level_output_0=2, - level_output_1=0) +_min_examples = _shared_docs["stat_func_example"].format( + stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 +) _stat_func_see_also = """ @@ -10812,18 +11595,31 @@ def _doc_parms(cls): """ -def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, - f, see_also='', examples=''): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, min_count=_min_count_stub, - see_also=see_also, examples=examples) +def _make_min_count_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + 
min_count=_min_count_stub, + see_also=see_also, + examples=examples, + ) @Appender(_num_doc) - def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=0, - **kwargs): - if name == 'sum': + def stat_func( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + if name == "sum": nv.validate_sum(tuple(), kwargs) - elif name == 'prod': + elif name == "prod": nv.validate_prod(tuple(), kwargs) else: nv.validate_stat_func(tuple(), kwargs, fname=name) @@ -10832,23 +11628,38 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, min_count=min_count) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=numeric_only, min_count=min_count) + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, min_count=min_count + ) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) return set_function_name(stat_func, name, cls) -def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f, - see_also='', examples=''): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, min_count='', see_also=see_also, - examples=examples) +def _make_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also=see_also, + examples=examples, + ) @Appender(_num_doc) - def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - if name == 'median': + def stat_func( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + if name == "median": nv.validate_median(tuple(), kwargs) else: nv.validate_stat_func(tuple(), kwargs, fname=name) @@ -10857,39 +11668,57 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=numeric_only) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, name, axis=axis, skipna=skipna, numeric_only=numeric_only + ) return set_function_name(stat_func, name, cls) def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) @Appender(_num_ddof_doc) - def stat_func(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + def stat_func( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, ddof=ddof) - return self._reduce(f, name, axis=axis, numeric_only=numeric_only, - skipna=skipna, ddof=ddof) + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, ddof=ddof + ) + return self._reduce( + f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + ) return 
set_function_name(stat_func, name, cls) -def _make_cum_function(cls, name, name1, name2, axis_descr, desc, - accum_func, accum_func_name, mask_a, mask_b, examples): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, accum_func_name=accum_func_name, - examples=examples) +def _make_cum_function( + cls, + name, + name1, + name2, + axis_descr, + desc, + accum_func, + accum_func_name, + mask_a, + mask_b, + examples, +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name=accum_func_name, + examples=examples, + ) @Appender(_cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) @@ -10900,8 +11729,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): y = com.values_from_object(self).copy() - if (skipna and - issubclass(y.dtype.type, (np.datetime64, np.timedelta64))): + if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): result = accum_func(y, axis) mask = isna(self) np.putmask(result, mask, iNaT) @@ -10914,29 +11742,41 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): result = accum_func(y, axis) d = self._construct_axes_dict() - d['copy'] = False + d["copy"] = False return self._constructor(result, **d).__finalize__(self) return set_function_name(cum_func, name, cls) -def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, - see_also, examples, empty_value): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, see_also=see_also, examples=examples, - empty_value=empty_value) +def _make_logical_function( + cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + see_also=see_also, + examples=examples, + empty_value=empty_value, + ) @Appender(_bool_doc) - def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, - **kwargs): + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) if level is not None: if bool_only is not None: - raise NotImplementedError("Option bool_only is not " - "implemented with option level.") - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool') + raise NotImplementedError( + "Option bool_only is not " "implemented with option level." 
+ ) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", + ) return set_function_name(logical_func, name, cls) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index fe50bd91a4f565..252f20ed400680 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,7 @@ from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, NamedAgg, SeriesGroupBy) + DataFrameGroupBy, + NamedAgg, + SeriesGroupBy, +) from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index cffe0e589c6bc9..5c4f1fa3fbddf0 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -21,7 +21,9 @@ def outer(self, *args, **kwargs): def f(x): x = self._shallow_copy(x, groupby=self._groupby) return getattr(x, name)(*args, **kwargs) + return self._groupby.apply(f) + outer.__name__ = name return outer @@ -51,10 +53,7 @@ def _gotitem(self, key, ndim, subset=None): except IndexError: groupby = self._groupby - self = self.__class__(subset, - groupby=groupby, - parent=self, - **kwargs) + self = self.__class__(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): @@ -64,25 +63,41 @@ def _gotitem(self, key, ndim, subset=None): # special case to prevent duplicate plots when catching exceptions when # forwarding methods from NDFrames -plotting_methods = frozenset(['plot', 'hist']) - -common_apply_whitelist = frozenset([ - 'quantile', 'fillna', 'mad', 'take', - 'idxmax', 'idxmin', 'tshift', - 'skew', 'corr', 'cov', 'diff' -]) | plotting_methods - -series_apply_whitelist = ((common_apply_whitelist | - {'nlargest', 'nsmallest', - 'is_monotonic_increasing', - 'is_monotonic_decreasing'}) - ) | frozenset(['dtype', 'unique']) - -dataframe_apply_whitelist = ((common_apply_whitelist | - frozenset(['dtypes', 'corrwith']))) - -cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', - 'cummin', 'cummax']) - -cython_cast_blacklist = frozenset(['rank', 'count', 'size', 'idxmin', - 'idxmax']) +plotting_methods = frozenset(["plot", "hist"]) + +common_apply_whitelist = ( + frozenset( + [ + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "corr", + "cov", + "diff", + ] + ) + | plotting_methods +) + +series_apply_whitelist = ( + ( + common_apply_whitelist + | { + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", + } + ) +) | frozenset(["dtype", "unique"]) + +dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"]) + +cython_transforms = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) + +cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 85f51323a97b53..fcf52ecfcbbcd1 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -2,7 +2,10 @@ from pandas.core.algorithms import unique1d from pandas.core.arrays.categorical import ( - Categorical, CategoricalDtype, _recode_for_categories) + Categorical, + CategoricalDtype, + _recode_for_categories, +) def recode_for_groupby(c, sort, observed): @@ -49,9 +52,7 @@ def recode_for_groupby(c, 
sort, observed): # we recode according to the uniques categories = c.categories.take(take_codes) - codes = _recode_for_categories(c.codes, - c.categories, - categories) + codes = _recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories @@ -68,8 +69,7 @@ def recode_for_groupby(c, sort, observed): # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() # above dropped - cat = cat.add_categories( - c.categories[~c.categories.isin(cat.categories)]) + cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)]) return c.reorder_categories(cat.categories), None @@ -96,5 +96,4 @@ def recode_from_groupby(c, sort, ci): return ci.set_categories(c.categories) # we are not sorting, so add unobserved to the end - return ci.add_categories( - c.categories[~c.categories.isin(ci.categories)]) + return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9e7dcafc0b1a4e..7fd0ca94e79979 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,12 +21,20 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import ( - maybe_convert_objects, maybe_downcast_to_dtype) +from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, - is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, - is_object_dtype, is_scalar) + ensure_int64, + ensure_platform_int, + is_bool, + is_datetimelike, + is_dict_like, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna, notna from pandas._typing import FrameOrSeries @@ -36,8 +44,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.groupby import base -from pandas.core.groupby.groupby import ( - GroupBy, _apply_docs, _transform_template) +from pandas.core.groupby.groupby import GroupBy, _apply_docs, _transform_template from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -55,10 +62,9 @@ ScalarResult = typing.TypeVar("ScalarResult") -def whitelist_method_generator(base_class: Type[GroupBy], - klass: Type[FrameOrSeries], - whitelist: FrozenSet[str], - ) -> Iterator[str]: +def whitelist_method_generator( + base_class: Type[GroupBy], klass: Type[FrameOrSeries], whitelist: FrozenSet[str] +) -> Iterator[str]: """ Yields all GroupBy member defs for DataFrame/Series names in whitelist. @@ -80,8 +86,7 @@ class where members are defined. Since we don't want to override methods explicitly defined in the base class, any such name is skipped. """ - property_wrapper_template = \ - """@property + property_wrapper_template = """@property def %(name)s(self) : \"""%(doc)s\""" return self.__getattr__('%(name)s')""" @@ -94,14 +99,13 @@ def %(name)s(self) : # ugly, but we need the name string itself in the method. 
f = getattr(klass, name) doc = f.__doc__ - doc = doc if type(doc) == str else '' + doc = doc if type(doc) == str else "" wrapper_template = property_wrapper_template - params = {'name': name, 'doc': doc} + params = {"name": name, "doc": doc} yield wrapper_template % params class NDFrameGroupBy(GroupBy): - def _iterate_slices(self): if self.axis == 0: # kludge @@ -119,16 +123,15 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): new_items, new_blocks = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only, min_count=min_count) + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) return self._wrap_agged_blocks(new_items, new_blocks) _block_agg_axis = 0 - def _cython_agg_blocks(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine @@ -145,7 +148,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis, min_count=min_count) + block.values, how, axis=agg_axis, min_count=min_count + ) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions @@ -181,7 +185,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, new_blocks.append(newb) if len(new_blocks) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering @@ -203,13 +207,13 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, offset = 0 for b in new_blocks: loc = len(b.mgr_locs) - b.mgr_locs = indexer[offset:(offset + loc)] + b.mgr_locs = indexer[offset : (offset + loc)] offset += loc return new_items, new_blocks def aggregate(self, func, *args, **kwargs): - _level = kwargs.pop('_level', None) + _level = kwargs.pop("_level", None) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: @@ -218,8 +222,7 @@ def aggregate(self, func, *args, **kwargs): kwargs = {} elif func is None: # nicer error message - raise TypeError("Must provide 'func' or tuples of " - "'(column, aggfunc).") + raise TypeError("Must provide 'func' or tuples of " "'(column, aggfunc).") func = _maybe_mangle_lambdas(func) @@ -238,11 +241,12 @@ def aggregate(self, func, *args, **kwargs): try: assert not args and not kwargs result = self._aggregate_multiple_funcs( - [func], _level=_level, _axis=self.axis) + [func], _level=_level, _axis=self.axis + ) result.columns = Index( - result.columns.levels[0], - name=self._selected_obj.columns.name) + result.columns.levels[0], name=self._selected_obj.columns.name + ) if isinstance(self.obj, SparseDataFrame): # Backwards compat for groupby.agg() with sparse @@ -266,7 +270,7 @@ def aggregate(self, func, *args, **kwargs): def _aggregate_generic(self, func, *args, **kwargs): if self.grouper.nkeys != 1: - raise AssertionError('Number of keys must be 1') + raise AssertionError("Number of keys must be 1") axis = self.axis obj = self._obj_with_exclusions @@ -275,16 +279,14 @@ def _aggregate_generic(self, func, *args, **kwargs): if axis != obj._info_axis_number: try: for name, data in self: - result[name] = self._try_cast(func(data, *args, 
**kwargs), - data) + result[name] = self._try_cast(func(data, *args, **kwargs), data) except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: for name in self.indices: try: data = self.get_group(name, obj=obj) - result[name] = self._try_cast(func(data, *args, **kwargs), - data) + result[name] = self._try_cast(func(data, *args, **kwargs), data) except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) @@ -304,8 +306,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): for item in obj: try: data = obj[item] - colg = SeriesGroupBy(data, selection=item, - grouper=self.grouper) + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) cast = self._transform_should_cast(func) @@ -342,8 +343,7 @@ def _decide_output_index(self, output, labels): pass if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, - names=labels.names) + output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) return output_keys @@ -369,8 +369,7 @@ def first_not_none(values): # We'd prefer it return an empty dataframe. return DataFrame() elif isinstance(v, DataFrame): - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: key_index = self.grouper.result_index @@ -400,8 +399,7 @@ def first_not_none(values): return DataFrame() elif isinstance(v, NDFrame): values = [ - x if x is not None else - v._constructor(**v._construct_axes_dict()) + x if x is not None else v._constructor(**v._construct_axes_dict()) for x in values ] @@ -410,11 +408,8 @@ def first_not_none(values): if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([ - x.index for x in values - ]) - singular_series = (len(values) == 1 and - applied_index.nlevels == 1) + all_indexed_same = _all_indexes_same([x.index for x in values]) + singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 # provide a reduction (Frame -> Series) if groups are @@ -438,13 +433,12 @@ def first_not_none(values): # path added as of GH 5545 elif all_indexed_same: from pandas.core.reshape.concat import concat + return concat(values) if not all_indexed_same: # GH 8467 - return self._concat_objects( - keys, values, not_indexed_same=True, - ) + return self._concat_objects(keys, values, not_indexed_same=True) try: if self.axis == 0: @@ -462,33 +456,37 @@ def first_not_none(values): # normally use vstack as its faster than concat # and if we have mi-columns - if (isinstance(v.index, MultiIndex) or - key_index is None or - isinstance(key_index, MultiIndex)): - stacked_values = np.vstack([ - np.asarray(v) for v in values - ]) - result = DataFrame(stacked_values, index=key_index, - columns=index) + if ( + isinstance(v.index, MultiIndex) + or key_index is None + or isinstance(key_index, MultiIndex) + ): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values, index=key_index, columns=index + ) else: # GH5788 instead of stacking; concat gets the # dtypes correct from pandas.core.reshape.concat import concat - result = concat(values, keys=key_index, - names=key_index.names, - axis=self.axis).unstack() + + result = concat( + values, + keys=key_index, + names=key_index.names, + axis=self.axis, + ).unstack() 
result.columns = index else: - stacked_values = np.vstack([np.asarray(v) - for v in values]) - result = DataFrame(stacked_values.T, index=v.index, - columns=key_index) + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values.T, index=v.index, columns=key_index + ) except (ValueError, AttributeError): # GH1738: values is list of arrays of unequal lengths fall # through to the outer else caluse - return Series(values, index=key_index, - name=self._selection_name) + return Series(values, index=key_index, name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -507,14 +505,13 @@ def first_not_none(values): # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return (Series(values, index=key_index) - ._convert(datetime=True, - coerce=coerce)) + return Series(values, index=key_index)._convert( + datetime=True, coerce=coerce + ) else: # Handle cases like BinGrouper - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) def _transform_general(self, func, *args, **kwargs): from pandas.core.reshape.concat import concat @@ -526,7 +523,7 @@ def _transform_general(self, func, *args, **kwargs): path = None for name, group in gen: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) if path is None: # Try slow path and fast path. @@ -535,7 +532,7 @@ def _transform_general(self, func, *args, **kwargs): except TypeError: return self._transform_item_by_item(obj, fast_path) except ValueError: - msg = 'transform must return a scalar value for each group' + msg = "transform must return a scalar value for each group" raise ValueError(msg) else: res = path(group) @@ -553,9 +550,12 @@ def _transform_general(self, func, *args, **kwargs): r.index = group.index else: r = DataFrame( - np.concatenate([res.values] * len(group.index) - ).reshape(group.shape), - columns=group.columns, index=group.index) + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) applied.append(r) else: @@ -564,11 +564,10 @@ def _transform_general(self, func, *args, **kwargs): concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) - concatenated = concatenated.reindex(concat_index, axis=other_axis, - copy=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) return self._set_result_index_ordered(concatenated) - @Substitution(klass='DataFrame', selected='') + @Substitution(klass="DataFrame", selected="") @Appender(_transform_template) def transform(self, func, *args, **kwargs): @@ -614,18 +613,19 @@ def _transform_fast(self, result, obj, func_nm): res = self._try_cast(res, obj.iloc[:, i]) output.append(res) - return DataFrame._from_arrays(output, columns=result.columns, - index=obj.index) + return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): fast_path = lambda group: getattr(group, func)(*args, **kwargs) slow_path = lambda group: group.apply( - lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis) + lambda x: getattr(x, func)(*args, **kwargs), 
axis=self.axis + ) else: fast_path = lambda group: func(group, *args, **kwargs) slow_path = lambda group: group.apply( - lambda x: func(x, *args, **kwargs), axis=self.axis) + lambda x: func(x, *args, **kwargs), axis=self.axis + ) return fast_path, slow_path def _choose_path(self, fast_path, slow_path, group): @@ -663,7 +663,7 @@ def _transform_item_by_item(self, obj, wrapper): pass if len(output) == 0: # pragma: no cover - raise TypeError('Transform function invalid for data types') + raise TypeError("Transform function invalid for data types") columns = obj.columns if len(output) < len(obj.columns): @@ -712,7 +712,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa gen = self.grouper.get_iterator(obj, axis=self.axis) for name, group in gen: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) res = func(group, *args, **kwargs) @@ -727,9 +727,10 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa indices.append(self._get_index(name)) else: # non scalars aren't allowed - raise TypeError("filter function returned a %s, " - "but expected a scalar bool" % - type(res).__name__) + raise TypeError( + "filter function returned a %s, " + "but expected a scalar bool" % type(res).__name__ + ) return self._apply_filter(indices, dropna) @@ -739,8 +740,7 @@ class SeriesGroupBy(GroupBy): # Make class defs of attributes on SeriesGroupBy whitelist _apply_whitelist = base.series_apply_whitelist - for _def_str in whitelist_method_generator( - GroupBy, Series, _apply_whitelist): + for _def_str in whitelist_method_generator(GroupBy, Series, _apply_whitelist): exec(_def_str) @property @@ -755,15 +755,18 @@ def _selection_name(self): else: return self._selection - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.Series.groupby.apply pandas.Series.groupby.transform pandas.Series.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -800,27 +803,33 @@ def _selection_name(self): minimum maximum 1 1 2 2 3 4 - """) + """ + ) - @Appender(_apply_docs['template'] - .format(input='series', - examples=_apply_docs['series_examples'])) + @Appender( + _apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"] + ) + ) def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series', - axis='') - @Appender(_shared_docs['aggregate']) + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func_or_funcs=None, *args, **kwargs): - _level = kwargs.pop('_level', None) + _level = kwargs.pop("_level", None) relabeling = func_or_funcs is None columns = None - no_arg_message = ("Must provide 'func_or_funcs' or named " - "aggregation **kwargs.") + no_arg_message = ( + "Must provide 'func_or_funcs' or named " "aggregation **kwargs." + ) if relabeling: columns = list(kwargs) if not PY36: @@ -839,8 +848,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. 
func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, - (_level or 0) + 1) + ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) if relabeling: ret.columns = columns else: @@ -860,11 +868,12 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): ret = Series(result, index=index) if not self.as_index: # pragma: no cover - print('Warning, ignoring as_index=True') + print("Warning, ignoring as_index=True") # _level handled at higher if not _level and isinstance(ret, dict): from pandas import concat + ret = concat(ret, axis=1) return ret @@ -877,20 +886,21 @@ def _aggregate_multiple_funcs(self, arg, _level): # have not shown a higher level one # GH 15931 if isinstance(self._selected_obj, Series) and _level <= 1: - msg = dedent("""\ + msg = dedent( + """\ using a dict on a Series for aggregation is deprecated and will be removed in a future version. Use \ named aggregation instead. >>> grouper.agg(name_1=func_1, name_2=func_2) - """) + """ + ) warnings.warn(msg, FutureWarning, stacklevel=3) columns = list(arg.keys()) arg = arg.items() elif any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x - for x in arg] + arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] # indicated column order columns = next(zip(*arg)) @@ -907,8 +917,9 @@ def _aggregate_multiple_funcs(self, arg, _level): obj = self if name in results: raise SpecificationError( - 'Function names must be unique, found multiple named ' - '{}'.format(name)) + "Function names must be unique, found multiple named " + "{}".format(name) + ) # reset the cache so that we # only include the named selection @@ -938,15 +949,13 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - result = self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output( + output=output, index=self.grouper.result_index, names=names + ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.obj.index, - names=names) + return self._wrap_output(output=output, index=self.obj.index, names=names) def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -971,17 +980,13 @@ def _get_index(): return result if isinstance(values[0], Series): - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): # possible that Series -> DataFrame by applied function - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(data=values, - index=_get_index(), - name=self._selection_name) + result = Series(data=values, index=_get_index(), name=self._selection_name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -991,12 +996,12 @@ def _aggregate_named(self, func, *args, **kwargs): group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise Exception('Must produce aggregated value') + raise Exception("Must produce aggregated value") result[name] = self._try_cast(output, group) 
return result - @Substitution(klass='Series', selected='A.') + @Substitution(klass="Series", selected="A.") @Appender(_transform_template) def transform(self, func, *args, **kwargs): func = self._is_cython_func(func) or func @@ -1009,17 +1014,18 @@ def transform(self, func, *args, **kwargs): else: # cythonized aggregation and merge return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func) + lambda: getattr(self, func)(*args, **kwargs), func + ) # reg transform klass = self._selected_obj.__class__ results = [] wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) res = wrapper(group) - if hasattr(res, 'values'): + if hasattr(res, "values"): res = res.values indexer = self._get_index(name) @@ -1029,6 +1035,7 @@ def transform(self, func, *args, **kwargs): # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat + result = concat(results).sort_index() else: result = Series() @@ -1099,8 +1106,9 @@ def true_and_notna(x, *args, **kwargs): return b and notna(b) try: - indices = [self._get_index(name) for name, group in self - if true_and_notna(group)] + indices = [ + self._get_index(name) for name, group in self if true_and_notna(group) + ] except ValueError: raise TypeError("the filter must return a boolean result") except TypeError: @@ -1125,7 +1133,7 @@ def nunique(self, dropna=True): try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = 'val.dtype must be object, got {}'.format(val.dtype) + msg = "val.dtype must be object, got {}".format(val.dtype) assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) @@ -1149,7 +1157,7 @@ def nunique(self, dropna=True): inc[mask & np.r_[False, mask[:-1]]] = 0 inc[idx] = 1 - out = np.add.reduceat(inc, idx).astype('int64', copy=False) + out = np.add.reduceat(inc, idx).astype("int64", copy=False) if len(ids): # NaN/NaT group exists if the head of ids is -1, # so remove it from res and exclude its index from idx @@ -1167,9 +1175,7 @@ def nunique(self, dropna=True): res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, - index=ri, - name=self._selection_name) + return Series(res, index=ri, name=self._selection_name) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -1178,8 +1184,9 @@ def describe(self, **kwargs): return result.T return result.unstack() - def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None, dropna=True): + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -1187,11 +1194,13 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level # in a backward compatible way - return self.apply(Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins) + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) ids, _, _ = self.grouper.group_info val = self.obj._internal_get_values() @@ -1244,7 +1253,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels = out[mask], [label[mask] for label in labels] if normalize: - out = out.astype('float') + 
out = out.astype("float") d = np.diff(np.r_[idx, len(ids)]) if dropna: m = ids[lab == -1] @@ -1260,8 +1269,9 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels[-1] = out[sorter], labels[-1][sorter] if bins is None: - mi = MultiIndex(levels=levels, codes=labels, names=names, - verify_integrity=False) + mi = MultiIndex( + levels=levels, codes=labels, names=names, verify_integrity=False + ) if is_integer_dtype(out): out = ensure_int64(out) @@ -1269,18 +1279,17 @@ def value_counts(self, normalize=False, sort=True, ascending=False, # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype='bool') + diff = np.zeros(len(out), dtype="bool") for lab in labels[:-1]: diff |= np.r_[True, lab[1:] != lab[:-1]] ncat, nbin = diff.sum(), len(levels[-1]) - left = [np.repeat(np.arange(ncat), nbin), - np.tile(np.arange(nbin), ncat)] + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] right = [diff.cumsum() - 1, labels[-1]] - _, idx = _get_join_indexers(left, right, sort=False, how='left') + _, idx = _get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) if sort: @@ -1291,8 +1300,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) codes.append(left[-1]) - mi = MultiIndex(levels=levels, codes=codes, names=names, - verify_integrity=False) + mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) if is_integer_dtype(out): out = ensure_int64(out) @@ -1315,22 +1323,26 @@ def count(self): minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series(out, - index=self.grouper.result_index, - name=self._selection_name, - dtype='int64') + return Series( + out, + index=self.grouper.result_index, + name=self._selection_name, + dtype="int64", + ) def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): """Calculate pct_change of each value to previous entry in group""" # TODO: Remove this conditional when #23918 is fixed if freq: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) + return self.apply( + lambda x: x.pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + ) filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.labels) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -1344,21 +1356,23 @@ class DataFrameGroupBy(NDFrameGroupBy): # # Make class defs of attributes on DataFrameGroupBy whitelist. - for _def_str in whitelist_method_generator( - GroupBy, DataFrame, _apply_whitelist): + for _def_str in whitelist_method_generator(GroupBy, DataFrame, _apply_whitelist): exec(_def_str) _block_agg_axis = 1 - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.groupby.apply pandas.DataFrame.groupby.transform pandas.DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1426,14 +1440,17 @@ class DataFrameGroupBy(NDFrameGroupBy): As usual, the aggregation can be a callable or a string alias. See :ref:`groupby.aggregate.named` for more. 
- """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg=None, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) @@ -1456,17 +1473,21 @@ def _gotitem(self, key, ndim, subset=None): if ndim == 2: if subset is None: subset = self.obj - return DataFrameGroupBy(subset, self.grouper, selection=key, - grouper=self.grouper, - exclusions=self.exclusions, - as_index=self.as_index, - observed=self.observed) + return DataFrameGroupBy( + subset, + self.grouper, + selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index, + observed=self.observed, + ) elif ndim == 1: if subset is None: subset = self.obj[key] - return SeriesGroupBy(subset, selection=key, - grouper=self.grouper, - observed=self.observed) + return SeriesGroupBy( + subset, selection=key, grouper=self.grouper, observed=self.observed + ) raise AssertionError("invalid ndim for _gotitem") @@ -1474,11 +1495,9 @@ def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] if self.axis == 0: - return DataFrame(result, index=obj.columns, - columns=result_index).T + return DataFrame(result, index=obj.columns, columns=result_index).T else: - return DataFrame(result, index=obj.index, - columns=result_index) + return DataFrame(result, index=obj.index, columns=result_index) def _get_data_to_aggregate(self): obj = self._obj_with_exclusions @@ -1489,10 +1508,16 @@ def _get_data_to_aggregate(self): def _insert_inaxis_grouper_inplace(self, result): # zip in reverse so we can always insert at loc 0 - izip = zip(* map(reversed, ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings]))) + izip = zip( + *map( + reversed, + ( + self.grouper.names, + self.grouper.get_group_levels(), + [grp.in_axis for grp in self.grouper.groupings], + ), + ) + ) for name, lev, in_axis in izip: if in_axis: @@ -1540,17 +1565,21 @@ def _wrap_agged_blocks(self, items, blocks): def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): - yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], - selection=colname, - grouper=self.grouper, - exclusions=self.exclusions) + yield colname, SeriesGroupBy( + self._selected_obj.iloc[:, i], + selection=colname, + grouper=self.grouper, + exclusions=self.exclusions, + ) def _apply_to_column_groupbys(self, func): from pandas.core.reshape.concat import concat + return concat( - (func(col_groupby) for _, col_groupby - in self._iterate_column_groupbys()), - keys=self._selected_obj.columns, axis=1) + (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), + keys=self._selected_obj.columns, + axis=1, + ) def count(self): """ @@ -1567,12 +1596,10 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - val = ((mask & ~_isna(np.atleast_2d(blk.get_values()))) - for blk in data.blocks) + val = ((mask & ~_isna(np.atleast_2d(blk.get_values()))) for blk in data.blocks) loc = (blk.mgr_locs for blk in data.blocks) - counter = partial( - lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1) + counter = partial(lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1) blk = map(make_block, map(counter, val), loc) return 
self._wrap_agged_blocks(data.items, list(blk)) @@ -1628,14 +1655,15 @@ def nunique(self, dropna=True): obj = self._selected_obj def groupby_series(obj, col=None): - return SeriesGroupBy(obj, - selection=col, - grouper=self.grouper).nunique(dropna=dropna) + return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( + dropna=dropna + ) if isinstance(obj, Series): results = groupby_series(obj) else: from pandas.core.reshape.concat import concat + results = [groupby_series(obj[col], col) for col in obj.columns] results = concat(results, axis=1) results.columns.names = obj.columns.names @@ -1669,10 +1697,7 @@ def _is_multi_agg_with_relabel(**kwargs): >>> _is_multi_agg_with_relabel() False """ - return all( - isinstance(v, tuple) and len(v) == 2 - for v in kwargs.values() - ) and kwargs + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs def _normalize_keyword_aggregation(kwargs): @@ -1719,8 +1744,7 @@ def _normalize_keyword_aggregation(kwargs): aggspec[column].append(aggfunc) else: aggspec[column] = [aggfunc] - order.append((column, - com.get_callable_name(aggfunc) or aggfunc)) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) return aggspec, columns, order @@ -1730,6 +1754,7 @@ def _normalize_keyword_aggregation(kwargs): # typing.Sequence[Callable[..., ScalarResult]] # -> typing.Sequence[Callable[..., ScalarResult]]: + def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: """ Possibly mangle a list of aggfuncs. @@ -1756,7 +1781,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: for aggfunc in aggfuncs: if com.get_callable_name(aggfunc) == "": aggfunc = functools.partial(aggfunc) - aggfunc.__name__ = ''.format(i) + aggfunc.__name__ = "".format(i) i += 1 mangled_aggfuncs.append(aggfunc) @@ -1828,13 +1853,15 @@ def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ result = result.copy() - obj_cols = [idx for idx in range(len(result.columns)) - if is_object_dtype(result.dtypes[idx])] + obj_cols = [ + idx for idx in range(len(result.columns)) if is_object_dtype(result.dtypes[idx]) + ] # See GH#26285 for n in obj_cols: - converted = maybe_convert_objects(result.iloc[:, n].values, - convert_numeric=False) + converted = maybe_convert_objects( + result.iloc[:, n].values, convert_numeric=False + ) result.iloc[:, n] = converted return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 925f006de92b6a..aa71fd68086fb6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -28,16 +28,24 @@ class providing the base-class of operations. 
from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, - is_numeric_dtype, is_scalar) + ensure_float, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna, notna -from pandas.api.types import ( - is_datetime64_dtype, is_integer_dtype, is_object_dtype) +from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_object_dtype import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical from pandas.core.base import ( - DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) + DataError, + GroupByError, + PandasObject, + SelectionMixin, + SpecificationError, +) import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -169,7 +177,8 @@ class providing the base-class of operations. Examples -------- {examples} - """) + """, +) _pipe_template = """ Apply a function `func` with arguments to this %(klass)s object and return @@ -303,14 +312,17 @@ def __init__(self, groupby): def __call__(self, *args, **kwargs): def f(self): return self.plot(*args, **kwargs) - f.__name__ = 'plot' + + f.__name__ = "plot" return self._groupby.apply(f) def __getattr__(self, name): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) + return self._groupby.apply(f) + return attr @@ -328,10 +340,22 @@ class _GroupBy(PandasObject, SelectionMixin): _group_selection = None _apply_whitelist = frozenset() # type: FrozenSet[str] - def __init__(self, obj, keys=None, axis=0, level=None, - grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True, squeeze=False, - observed=False, **kwargs): + def __init__( + self, + obj, + keys=None, + axis=0, + level=None, + grouper=None, + exclusions=None, + selection=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + observed=False, + **kwargs + ): self._selection = selection @@ -342,9 +366,9 @@ def __init__(self, obj, keys=None, axis=0, level=None, if not as_index: if not isinstance(obj, DataFrame): - raise TypeError('as_index=False only valid with DataFrame') + raise TypeError("as_index=False only valid with DataFrame") if axis != 0: - raise ValueError('as_index=False only valid for axis=0') + raise ValueError("as_index=False only valid for axis=0") self.as_index = as_index self.keys = keys @@ -352,16 +376,20 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.group_keys = group_keys self.squeeze = squeeze self.observed = observed - self.mutated = kwargs.pop('mutated', False) + self.mutated = kwargs.pop("mutated", False) if grouper is None: from pandas.core.groupby.grouper import _get_grouper - grouper, exclusions, obj = _get_grouper(obj, keys, - axis=axis, - level=level, - sort=sort, - observed=observed, - mutated=self.mutated) + + grouper, exclusions, obj = _get_grouper( + obj, + keys, + axis=axis, + level=level, + sort=sort, + observed=observed, + mutated=self.mutated, + ) self.obj = obj self.axis = obj._get_axis_number(axis) @@ -369,7 +397,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.exclusions = set(exclusions) if exclusions else set() # we accept no other args - validate_kwargs('group', kwargs, {}) + validate_kwargs("group", kwargs, {}) def __len__(self): return len(self.groups) @@ -428,13 +456,12 @@ def get_converter(s): if len(self.indices) > 0: index_sample = 
next(iter(self.indices)) else: - index_sample = None # Dummy sample + index_sample = None # Dummy sample name_sample = names[0] if isinstance(index_sample, tuple): if not isinstance(name_sample, tuple): - msg = ("must supply a tuple to get_group with multiple" - " grouping keys") + msg = "must supply a tuple to get_group with multiple" " grouping keys" raise ValueError(msg) if not len(name_sample) == len(index_sample): try: @@ -442,13 +469,14 @@ def get_converter(s): return [self.indices[name] for name in names] except KeyError: # turns out it wasn't a tuple - msg = ("must supply a same-length tuple to get_group" - " with multiple grouping keys") + msg = ( + "must supply a same-length tuple to get_group" + " with multiple grouping keys" + ) raise ValueError(msg) converters = [get_converter(s) for s in index_sample] - names = (tuple(f(n) for f, n in zip(converters, name)) - for name in names) + names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) else: converter = get_converter(index_sample) @@ -482,7 +510,7 @@ def _reset_group_selection(self): if self._group_selection is not None: # GH12839 clear cached selection too when changing group selection self._group_selection = None - self._reset_cache('_selected_obj') + self._reset_cache("_selected_obj") def _set_group_selection(self): """ @@ -493,21 +521,21 @@ def _set_group_selection(self): NOTE: this should be paired with a call to _reset_group_selection """ grp = self.grouper - if not (self.as_index and - getattr(grp, 'groupings', None) is not None and - self.obj.ndim > 1 and - self._group_selection is None): + if not ( + self.as_index + and getattr(grp, "groupings", None) is not None + and self.obj.ndim > 1 + and self._group_selection is None + ): return ax = self.obj._info_axis - groupers = [g.name for g in grp.groupings - if g.level is None and g.in_axis] + groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] if len(groupers): # GH12839 clear selected obj cache when group selection changes - self._group_selection = ax.difference(Index(groupers), - sort=False).tolist() - self._reset_cache('_selected_obj') + self._group_selection = ax.difference(Index(groupers), sort=False).tolist() + self._reset_cache("_selected_obj") def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -516,13 +544,11 @@ def _set_result_index_ordered(self, result): # the values/counts are repeated according to the group index # shortcut if we have an already ordered grouper if not self.grouper.is_monotonic: - index = Index(np.concatenate( - self._get_indices(self.grouper.result_index))) + index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, - inplace=True) + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self): @@ -536,12 +562,14 @@ def __getattr__(self, attr): if hasattr(self.obj, attr): return self._make_wrapper(attr) - raise AttributeError("%r object has no attribute %r" % - (type(self).__name__, attr)) + raise AttributeError( + "%r object has no attribute %r" % (type(self).__name__, attr) + ) - @Substitution(klass='GroupBy', - versionadded='.. versionadded:: 0.21.0', - examples="""\ + @Substitution( + klass="GroupBy", + versionadded=".. 
versionadded:: 0.21.0", + examples="""\ >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) >>> df A B @@ -557,7 +585,8 @@ def __getattr__(self, attr): B A a 2 -b 2""") +b 2""", + ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return com._pipe(self, func, *args, **kwargs) @@ -567,10 +596,11 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): if name not in self._apply_whitelist: is_callable = callable(getattr(self._selected_obj, name, None)) - kind = ' callable ' if is_callable else ' ' - msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try " - "using the 'apply' method".format(kind, name, - type(self).__name__)) + kind = " callable " if is_callable else " " + msg = ( + "Cannot access{0}attribute {1!r} of {2!r} objects, try " + "using the 'apply' method".format(kind, name, type(self).__name__) + ) raise AttributeError(msg) self._set_group_selection() @@ -587,9 +617,8 @@ def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument kwargs_with_axis = kwargs.copy() - if ('axis' not in kwargs_with_axis or - kwargs_with_axis['axis'] is None): - kwargs_with_axis['axis'] = self.axis + if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None: + kwargs_with_axis["axis"] = self.axis def curried_with_axis(x): return f(x, *args, **kwargs_with_axis) @@ -620,8 +649,7 @@ def curried(x): # if we don't have this method to indicated to aggregate to # mark this column as an error try: - return self._aggregate_item_by_item(name, - *args, **kwargs) + return self._aggregate_item_by_item(name, *args, **kwargs) except (AttributeError): raise ValueError @@ -664,9 +692,11 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) - @Appender(_apply_docs['template'] - .format(input="dataframe", - examples=_apply_docs['dataframe_examples'])) + @Appender( + _apply_docs["template"].format( + input="dataframe", examples=_apply_docs["dataframe_examples"] + ) + ) def apply(self, func, *args, **kwargs): func = self._is_builtin_func(func) @@ -679,16 +709,18 @@ def apply(self, func, *args, **kwargs): @wraps(func) def f(g): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return func(g, *args, **kwargs) + else: - raise ValueError('func must be a callable if args or ' - 'kwargs are supplied') + raise ValueError( + "func must be a callable if args or " "kwargs are supplied" + ) else: f = func # ignore SettingWithCopy here in case the user mutates - with option_context('mode.chained_assignment', None): + with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f) except Exception: @@ -707,13 +739,11 @@ def f(g): return result def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, - self.axis) + keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) return self._wrap_applied_output( - keys, - values, - not_indexed_same=mutated or self.mutated) + keys, values, not_indexed_same=mutated or self.mutated + ) def _iterate_slices(self): yield self._selection_name, self._selected_obj @@ -775,7 +805,7 @@ def _try_cast(self, result, obj, numeric_only=False): # to the target timezone try: result = obj._values._from_sequence( - result, dtype='datetime64[ns, UTC]' + result, dtype="datetime64[ns, UTC]" ) result = result.astype(dtype) except TypeError: @@ -813,7 +843,8 @@ def _transform_should_cast(self, func_nm): Whether transform should attempt to cast the result of 
aggregation """ return (self.size().fillna(0) > 0).any() and ( - func_nm not in base.cython_cast_blacklist) + func_nm not in base.cython_cast_blacklist + ) def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() @@ -823,8 +854,7 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): continue try: - result, names = self.grouper.transform(obj.values, how, - **kwargs) + result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue except AssertionError as e: @@ -835,12 +865,11 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): output[name] = result if len(output) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -848,14 +877,15 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, continue try: - result, names = self.grouper.aggregate(obj.values, how, - min_count=min_count) + result, names = self.grouper.aggregate( + obj.values, how, min_count=min_count + ) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) if len(output) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") return self._wrap_aggregated_output(output, names) @@ -918,7 +948,8 @@ def reset_identity(values): # GH 14776 if isinstance(ax, MultiIndex) and not ax.is_unique: indexer = algorithms.unique1d( - result.index.get_indexer_for(ax.values)) + result.index.get_indexer_for(ax.values) + ) result = result.take(indexer, axis=self.axis) else: result = result.reindex(ax, axis=self.axis) @@ -933,9 +964,14 @@ def reset_identity(values): group_levels = self.grouper.levels group_names = self.grouper.names - result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names, - sort=False) + result = concat( + values, + axis=self.axis, + keys=group_keys, + levels=group_levels, + names=group_names, + sort=False, + ) else: # GH5610, returns a MI, with the first level being a @@ -946,8 +982,10 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - if (isinstance(result, Series) and - getattr(self, '_selection_name', None) is not None): + if ( + isinstance(result, Series) + and getattr(self, "_selection_name", None) is not None + ): result.name = self._selection_name @@ -955,7 +993,7 @@ def reset_identity(values): def _apply_filter(self, indices, dropna): if len(indices) == 0: - indices = np.array([], dtype='int64') + indices = np.array([], dtype="int64") else: indices = np.sort(np.concatenate(indices)) if dropna: @@ -1038,6 +1076,7 @@ class GroupBy(_GroupBy): See the online documentation for full exposition on these topics and much more """ + def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. 
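The hunks around this point reformat ``_bool_agg``, the shared helper that backs ``GroupBy.any`` and ``GroupBy.all`` via the ``group_any_all`` Cython kernel. A minimal doctest-style sketch of the user-facing behaviour being preserved; the frame, column names and values are invented for illustration:

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [True, False, True]})
>>> any_per_group = df.groupby("key")["val"].any()  # group 'a' -> True, group 'b' -> True
>>> all_per_group = df.groupby("key")["val"].all()  # group 'a' -> False, group 'b' -> True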
@@ -1054,16 +1093,20 @@ def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return result.astype(inference, copy=False) - return self._get_cythonized_result('group_any_all', self.grouper, - aggregate=True, - cython_dtype=np.uint8, - needs_values=True, - needs_mask=True, - pre_processing=objs_to_bool, - post_processing=result_to_bool, - val_test=val_test, skipna=skipna) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_any_all", + self.grouper, + aggregate=True, + cython_dtype=np.uint8, + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + val_test=val_test, + skipna=skipna, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) def any(self, skipna=True): """ @@ -1078,9 +1121,9 @@ def any(self, skipna=True): ------- bool """ - return self._bool_agg('any', skipna) + return self._bool_agg("any", skipna) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def all(self, skipna=True): """ @@ -1095,9 +1138,9 @@ def all(self, skipna=True): ------- bool """ - return self._bool_agg('all', skipna) + return self._bool_agg("all", skipna) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def count(self): """ @@ -1112,7 +1155,7 @@ def count(self): # defined here for API doc raise NotImplementedError - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def mean(self, *args, **kwargs): """ @@ -1156,10 +1199,11 @@ def mean(self, *args, **kwargs): 2 4.0 Name: B, dtype: float64 """ - nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) + nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"]) try: return self._cython_agg_general( - 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) + "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs + ) except GroupByError: raise except Exception: # pragma: no cover @@ -1167,7 +1211,7 @@ def mean(self, *args, **kwargs): f = lambda x: x.mean(axis=self.axis, **kwargs) return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def median(self, **kwargs): """ @@ -1182,10 +1226,10 @@ def median(self, **kwargs): """ try: return self._cython_agg_general( - 'median', - alt=lambda x, - axis: Series(x).median(axis=axis, **kwargs), - **kwargs) + "median", + alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), + **kwargs + ) except GroupByError: raise except Exception: # pragma: no cover @@ -1194,10 +1238,11 @@ def f(x): if isinstance(x, np.ndarray): x = Series(x) return x.median(axis=self.axis, **kwargs) + with _group_selection_context(self): return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def std(self, ddof=1, *args, **kwargs): """ @@ -1217,10 +1262,10 @@ def std(self, ddof=1, *args, **kwargs): """ # TODO: implement at Cython level? - nv.validate_groupby_func('std', args, kwargs) + nv.validate_groupby_func("std", args, kwargs) return np.sqrt(self.var(ddof=ddof, **kwargs)) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def var(self, ddof=1, *args, **kwargs): """ @@ -1238,13 +1283,14 @@ def var(self, ddof=1, *args, **kwargs): Series or DataFrame Variance of values within each group. 
""" - nv.validate_groupby_func('var', args, kwargs) + nv.validate_groupby_func("var", args, kwargs) if ddof == 1: try: return self._cython_agg_general( - 'var', + "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), - **kwargs) + **kwargs + ) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1254,7 +1300,7 @@ def var(self, ddof=1, *args, **kwargs): with _group_selection_context(self): return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def sem(self, ddof=1): """ @@ -1274,7 +1320,7 @@ def sem(self, ddof=1): """ return self.std(ddof=ddof) / np.sqrt(self.count()) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def size(self): """ @@ -1288,7 +1334,7 @@ def size(self): result = self.grouper.size() if isinstance(self.obj, Series): - result.name = getattr(self.obj, 'name', None) + result.name = getattr(self.obj, "name", None) return result @classmethod @@ -1297,9 +1343,7 @@ def _add_numeric_operations(cls): Add numeric operations to the GroupBy generically. """ - def groupby_function(name, alias, npfunc, - numeric_only=True, - min_count=-1): + def groupby_function(name, alias, npfunc, numeric_only=True, min_count=-1): _local_template = """ Compute %(f)s of group values. @@ -1310,38 +1354,34 @@ def groupby_function(name, alias, npfunc, Computed %(f)s of values within each group. """ - @Substitution(name='groupby', f=name) + @Substitution(name="groupby", f=name) @Appender(_common_see_also) @Appender(_local_template) def f(self, **kwargs): - if 'numeric_only' not in kwargs: - kwargs['numeric_only'] = numeric_only - if 'min_count' not in kwargs: - kwargs['min_count'] = min_count + if "numeric_only" not in kwargs: + kwargs["numeric_only"] = numeric_only + if "min_count" not in kwargs: + kwargs["min_count"] = min_count self._set_group_selection() # try a cython aggregation if we can try: - return self._cython_agg_general( - alias, alt=npfunc, **kwargs) + return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: pass # apply a non-cython aggregation - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) # coerce the resulting columns if we can if isinstance(result, DataFrame): for col in result.columns: - result[col] = self._try_cast( - result[col], self.obj[col]) + result[col] = self._try_cast(result[col], self.obj[col]) else: - result = self._try_cast( - result, self.obj) + result = self._try_cast(result, self.obj) return result @@ -1350,7 +1390,6 @@ def f(self, **kwargs): return f def first_compat(x, axis=0): - def first(x): x = x.to_numpy() @@ -1365,7 +1404,6 @@ def first(x): return first(x) def last_compat(x, axis=0): - def last(x): x = x.to_numpy() x = x[notna(x)] @@ -1378,16 +1416,14 @@ def last(x): else: return last(x) - cls.sum = groupby_function('sum', 'add', np.sum, min_count=0) - cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0) - cls.min = groupby_function('min', 'min', np.min, numeric_only=False) - cls.max = groupby_function('max', 'max', np.max, numeric_only=False) - cls.first = groupby_function('first', 'first', first_compat, - numeric_only=False) - cls.last = groupby_function('last', 'last', last_compat, - numeric_only=False) + cls.sum = groupby_function("sum", "add", np.sum, min_count=0) + cls.prod = groupby_function("prod", "prod", np.prod, 
min_count=0) + cls.min = groupby_function("min", "min", np.min, numeric_only=False) + cls.max = groupby_function("max", "max", np.max, numeric_only=False) + cls.first = groupby_function("first", "first", first_compat, numeric_only=False) + cls.last = groupby_function("last", "last", last_compat, numeric_only=False) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def ohlc(self): """ @@ -1401,8 +1437,7 @@ def ohlc(self): Open, high, low and close values within each group. """ - return self._apply_to_column_groupbys( - lambda x: x._cython_agg_general('ohlc')) + return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) @Appender(DataFrame.describe.__doc__) def describe(self, **kwargs): @@ -1519,18 +1554,20 @@ def resample(self, rule, *args, **kwargs): 5 2000-01-01 00:00:20 5 1 """ from pandas.core.resample import get_resampler_for_grouping + return get_resampler_for_grouping(self, rule, *args, **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def rolling(self, *args, **kwargs): """ Return a rolling grouper, providing rolling functionality per group. """ from pandas.core.window import RollingGroupby + return RollingGroupby(self, *args, **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def expanding(self, *args, **kwargs): """ @@ -1538,6 +1575,7 @@ def expanding(self, *args, **kwargs): functionality per group. """ from pandas.core.window import ExpandingGroupby + return ExpandingGroupby(self, *args, **kwargs) def _fill(self, direction, limit=None): @@ -1567,13 +1605,17 @@ def _fill(self, direction, limit=None): if limit is None: limit = -1 - return self._get_cythonized_result('group_fillna_indexer', - self.grouper, needs_mask=True, - cython_dtype=np.int64, - result_is_index=True, - direction=direction, limit=limit) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_fillna_indexer", + self.grouper, + needs_mask=True, + cython_dtype=np.int64, + result_is_index=True, + direction=direction, + limit=limit, + ) + + @Substitution(name="groupby") def pad(self, limit=None): """ Forward fill the values. @@ -1595,10 +1637,11 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self._fill('ffill', limit=limit) + return self._fill("ffill", limit=limit) + ffill = pad - @Substitution(name='groupby') + @Substitution(name="groupby") def backfill(self, limit=None): """ Backward fill the values. @@ -1620,14 +1663,13 @@ def backfill(self, limit=None): Series.fillna DataFrame.fillna """ - return self._fill('bfill', limit=limit) + return self._fill("bfill", limit=limit) + bfill = backfill - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def nth(self, - n: Union[int, List[int]], - dropna: Optional[str] = None) -> DataFrame: + def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. 
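The change just above only re-wraps the signature of ``GroupBy.nth``; as a companion to the docstring sentence it introduces, here is a small hedged usage sketch (the frame, column names and values are made up for illustration):

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [10, 20, 30, 40]})
>>> first_rows = df.groupby("A").nth(0)      # one row per group: B == 10 and B == 30
>>> first_two = df.groupby("A").nth([0, 1])  # a list of ints selects several rows per group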
@@ -1717,8 +1759,7 @@ def nth(self, self._set_group_selection() mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, - -nth_array) + mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) mask = mask_left | mask_right ids, _, _ = self.grouper.group_info @@ -1736,19 +1777,19 @@ def nth(self, # dropna is truthy if isinstance(n, valid_containers): - raise ValueError( - "dropna option with a list of nth values is not supported") + raise ValueError("dropna option with a list of nth values is not supported") - if dropna not in ['any', 'all']: + if dropna not in ["any", "all"]: # Note: when agg-ing picker doesn't raise this, just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed {dropna}).".format( - dropna=dropna)) + raise ValueError( + "For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed {dropna}).".format(dropna=dropna) + ) # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf - max_len = n if n >= 0 else - 1 - n + max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) # get a new grouper for our dropped obj @@ -1765,13 +1806,17 @@ def nth(self, # create a grouper with the original parameters, but on dropped # object from pandas.core.groupby.grouper import _get_grouper - grouper, _, _ = _get_grouper(dropped, key=self.keys, - axis=self.axis, level=self.level, - sort=self.sort, - mutated=self.mutated) - grb = dropped.groupby( - grouper, as_index=self.as_index, sort=self.sort) + grouper, _, _ = _get_grouper( + dropped, + key=self.keys, + axis=self.axis, + level=self.level, + sort=self.sort, + mutated=self.mutated, + ) + + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) sizes, result = grb.size(), grb.nth(n) mask = (sizes < max_len).values @@ -1780,15 +1825,16 @@ def nth(self, result.loc[mask] = np.nan # reset/reindex to the original groups - if (len(self.obj) == len(dropped) or - len(result) == len(self.grouper.result_index)): + if len(self.obj) == len(dropped) or len(result) == len( + self.grouper.result_index + ): result.index = self.grouper.result_index else: result = result.reindex(self.grouper.result_index) return result - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -1823,44 +1869,46 @@ def quantile(self, q=0.5, interpolation='linear'): b 3.0 """ - def pre_processor( - vals: np.ndarray - ) -> Tuple[np.ndarray, Optional[Type]]: + def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: if is_object_dtype(vals): - raise TypeError("'quantile' cannot be performed against " - "'object' dtypes!") + raise TypeError( + "'quantile' cannot be performed against " "'object' dtypes!" 
+ ) inference = None if is_integer_dtype(vals): inference = np.int64 elif is_datetime64_dtype(vals): - inference = 'datetime64[ns]' + inference = "datetime64[ns]" vals = vals.astype(np.float) return vals, inference - def post_processor( - vals: np.ndarray, - inference: Optional[Type] - ) -> np.ndarray: + def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: if inference: # Check for edge case - if not (is_integer_dtype(inference) and - interpolation in {'linear', 'midpoint'}): + if not ( + is_integer_dtype(inference) + and interpolation in {"linear", "midpoint"} + ): vals = vals.astype(inference) return vals - return self._get_cythonized_result('group_quantile', self.grouper, - aggregate=True, - needs_values=True, - needs_mask=True, - cython_dtype=np.float64, - pre_processing=pre_processor, - post_processing=post_processor, - q=q, interpolation=interpolation) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + + @Substitution(name="groupby") def ngroup(self, ascending=True): """ Number each group from 0 to the number of groups - 1. @@ -1931,7 +1979,7 @@ def ngroup(self, ascending=True): result = self.ngroups - 1 - result return result - @Substitution(name='groupby') + @Substitution(name="groupby") def cumcount(self, ascending=True): """ Number each item in each group from 0 to the length of that group - 1. @@ -1990,10 +2038,11 @@ def cumcount(self, ascending=True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) - def rank(self, method='average', ascending=True, na_option='keep', - pct=False, axis=0): + def rank( + self, method="average", ascending=True, na_option="keep", pct=False, axis=0 + ): """ Provide the rank of values within each group. 
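The next hunk only reflows how ``GroupBy.rank`` validates ``na_option`` and forwards its keywords to ``_cython_transform``; a short sketch of the behaviour being preserved, on invented data:

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [2, 4, 4, 1]})
>>> avg_ranks = df.groupby("key")["val"].rank(method="average")  # ties within group 'a' share rank 2.5
>>> dense_desc = df.groupby("key")["val"].rank(method="dense", ascending=False)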
@@ -2020,14 +2069,20 @@ def rank(self, method='average', ascending=True, na_option='keep', ------- DataFrame with ranking of values within each group """ - if na_option not in {'keep', 'top', 'bottom'}: + if na_option not in {"keep", "top", "bottom"}: msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) - return self._cython_transform('rank', numeric_only=False, - ties_method=method, ascending=ascending, - na_option=na_option, pct=pct, axis=axis) - - @Substitution(name='groupby') + return self._cython_transform( + "rank", + numeric_only=False, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + axis=axis, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) def cumprod(self, axis=0, *args, **kwargs): """ @@ -2037,14 +2092,13 @@ def cumprod(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - nv.validate_groupby_func('cumprod', args, kwargs, - ['numeric_only', 'skipna']) + nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) - return self._cython_transform('cumprod', **kwargs) + return self._cython_transform("cumprod", **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cumsum(self, axis=0, *args, **kwargs): """ @@ -2054,14 +2108,13 @@ def cumsum(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - nv.validate_groupby_func('cumsum', args, kwargs, - ['numeric_only', 'skipna']) + nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) - return self._cython_transform('cumsum', **kwargs) + return self._cython_transform("cumsum", **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cummin(self, axis=0, **kwargs): """ @@ -2074,9 +2127,9 @@ def cummin(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform('cummin', numeric_only=False) + return self._cython_transform("cummin", numeric_only=False) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cummax(self, axis=0, **kwargs): """ @@ -2089,14 +2142,22 @@ def cummax(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform('cummax', numeric_only=False) - - def _get_cythonized_result(self, how, grouper, aggregate=False, - cython_dtype=None, needs_values=False, - needs_mask=False, needs_ngroups=False, - result_is_index=False, - pre_processing=None, post_processing=None, - **kwargs): + return self._cython_transform("cummax", numeric_only=False) + + def _get_cythonized_result( + self, + how, + grouper, + aggregate=False, + cython_dtype=None, + needs_values=False, + needs_mask=False, + needs_ngroups=False, + result_is_index=False, + pre_processing=None, + post_processing=None, + **kwargs + ): """ Get result for Cythonized functions. @@ -2140,8 +2201,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, `Series` or `DataFrame` with filled values """ if result_is_index and aggregate: - raise ValueError("'result_is_index' and 'aggregate' cannot both " - "be True!") + raise ValueError( + "'result_is_index' and 'aggregate' cannot both " "be True!" 
+ ) if post_processing: if not callable(pre_processing): raise ValueError("'post_processing' must be a callable!") @@ -2149,8 +2211,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, if not callable(pre_processing): raise ValueError("'pre_processing' must be a callable!") if not needs_values: - raise ValueError("Cannot use 'pre_processing' without " - "specifying 'needs_values'!") + raise ValueError( + "Cannot use 'pre_processing' without " "specifying 'needs_values'!" + ) labels, _, ngroups = grouper.group_info output = collections.OrderedDict() @@ -2197,7 +2260,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, else: return self._wrap_transformed_output(output) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ @@ -2220,19 +2283,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ if freq is not None or axis != 0 or not isna(fill_value): - return self.apply(lambda x: x.shift(periods, freq, - axis, fill_value)) - - return self._get_cythonized_result('group_shift_indexer', - self.grouper, cython_dtype=np.int64, - needs_ngroups=True, - result_is_index=True, - periods=periods) - - @Substitution(name='groupby') + return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) + + return self._get_cythonized_result( + "group_shift_indexer", + self.grouper, + cython_dtype=np.int64, + needs_ngroups=True, + result_is_index=True, + periods=periods, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - axis=0): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): """ Calculate pct_change of each value to previous entry in group. @@ -2242,16 +2306,21 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, Percentage changes within each group. 
""" if freq is not None or axis != 0: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis)) + return self.apply( + lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, + ) + ) filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.labels) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def head(self, n=5): """ @@ -2282,7 +2351,7 @@ def head(self, n=5): mask = self._cumcount_array() < n return self._selected_obj[mask] - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def tail(self, n=5): """ @@ -2347,16 +2416,19 @@ def _reindex_output(self, output): return output # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): + elif not any( + isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings + ): return output levels_list = [ping.group_index for ping in groupings] index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() + levels_list, names=self.grouper.names + ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + d = {self.obj._get_axis_name(self.axis): index, "copy": False} return output.reindex(**d) # GH 13204 @@ -2370,15 +2442,15 @@ def _reindex_output(self, output): # reindex `output`, and then reset the in-axis grouper columns. # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) + in_axis_grps = ( + (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis + ) g_nums, g_names = zip(*in_axis_grps) output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index - ).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex(index, copy=False) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) @@ -2394,11 +2466,13 @@ def _reindex_output(self, output): def groupby(obj, by, **kwds): if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy + klass = SeriesGroupBy elif isinstance(obj, DataFrame): from pandas.core.groupby.generic import DataFrameGroupBy + klass = DataFrameGroupBy else: # pragma: no cover - raise TypeError('invalid type: {}'.format(obj)) + raise TypeError("invalid type: {}".format(obj)) return klass(obj, by, **kwds) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9e1033be26df2c..818d844ca79947 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -11,8 +11,14 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, - is_list_like, is_scalar, is_timedelta64_dtype) + ensure_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_hashable, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms @@ -85,12 +91,13 @@ class Grouper: >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ 
- _attributes = ('key', 'level', 'freq', 'axis', - 'sort') # type: Tuple[str, ...] + + _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...] def __new__(cls, *args, **kwargs): - if kwargs.get('freq') is not None: + if kwargs.get("freq") is not None: from pandas.core.resample import TimeGrouper + cls = TimeGrouper return super().__new__(cls) @@ -125,11 +132,14 @@ def _get_grouper(self, obj, validate=True): """ self._set_grouper(obj) - self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key], - axis=self.axis, - level=self.level, - sort=self.sort, - validate=validate) + self.grouper, exclusions, self.obj = _get_grouper( + self.obj, + [self.key], + axis=self.axis, + level=self.level, + sort=self.sort, + validate=validate, + ) return self.binner, self.grouper, self.obj def _set_grouper(self, obj, sort=False): @@ -145,8 +155,7 @@ def _set_grouper(self, obj, sort=False): """ if self.key is not None and self.level is not None: - raise ValueError( - "The Grouper cannot specify both a key and a level!") + raise ValueError("The Grouper cannot specify both a key and a level!") # Keep self.grouper value before overriding if self._grouper is None: @@ -156,13 +165,13 @@ def _set_grouper(self, obj, sort=False): if self.key is not None: key = self.key # The 'on' is already defined - if (getattr(self.grouper, 'name', None) == key and - isinstance(obj, ABCSeries)): + if getattr(self.grouper, "name", None) == key and isinstance( + obj, ABCSeries + ): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: - raise KeyError( - "The grouper name {0} is not found".format(key)) + raise KeyError("The grouper name {0} is not found".format(key)) ax = Index(obj[key], name=key) else: @@ -174,18 +183,16 @@ def _set_grouper(self, obj, sort=False): # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - ax = Index(ax._get_level_values(level), - name=ax.names[level]) + ax = Index(ax._get_level_values(level), name=ax.names[level]) else: if level not in (0, ax.name): - raise ValueError( - "The level {0} is not valid".format(level)) + raise ValueError("The level {0} is not valid".format(level)) # possibly sort if (self.sort or sort) and not ax.is_monotonic: # use stable sort to support first, last, nth - indexer = self.indexer = ax.argsort(kind='mergesort') + indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) obj = obj._take(indexer, axis=self.axis, is_copy=False) @@ -198,9 +205,11 @@ def groups(self): return self.grouper.groups def __repr__(self): - attrs_list = ("{}={!r}".format(attr_name, getattr(self, attr_name)) - for attr_name in self._attributes - if getattr(self, attr_name) is not None) + attrs_list = ( + "{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None + ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ return "{}({})".format(cls_name, attrs) @@ -234,8 +243,17 @@ class Grouping: * groups : dict of {group -> label_list} """ - def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=False, in_axis=False): + def __init__( + self, + index, + grouper=None, + obj=None, + name=None, + level=None, + sort=True, + observed=False, + in_axis=False, + ): self.name = name self.level = level @@ -260,14 +278,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if level is not None: if not isinstance(level, int): if level not in index.names: - raise 
AssertionError('Level {} not in index'.format(level)) + raise AssertionError("Level {} not in index".format(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = \ - index._get_grouper_for_level(self.grouper, level) + self.grouper, self._labels, self._group_index = index._get_grouper_for_level( + self.grouper, level + ) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels @@ -293,8 +312,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, elif is_categorical_dtype(self.grouper): from pandas.core.groupby.categorical import recode_for_groupby + self.grouper, self.all_grouper = recode_for_groupby( - self.grouper, self.sort, observed) + self.grouper, self.sort, observed + ) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper @@ -310,42 +331,47 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self._group_index = CategoricalIndex( Categorical.from_codes( - codes=codes, - categories=categories, - ordered=self.grouper.ordered)) + codes=codes, categories=categories, ordered=self.grouper.ordered + ) + ) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed - elif not isinstance(self.grouper, - (Series, Index, ExtensionArray, np.ndarray)): - if getattr(self.grouper, 'ndim', 1) != 1: + elif not isinstance( + self.grouper, (Series, Index, ExtensionArray, np.ndarray) + ): + if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError( - "Grouper for '{}' not 1-dimensional".format(t)) + raise ValueError("Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) - if not (hasattr(self.grouper, "__len__") and - len(self.grouper) == len(self.index)): - errmsg = ('Grouper result violates len(labels) == ' - 'len(data)\nresult: %s' % - pprint_thing(self.grouper)) + if not ( + hasattr(self.grouper, "__len__") + and len(self.grouper) == len(self.index) + ): + errmsg = ( + "Grouper result violates len(labels) == " + "len(data)\nresult: %s" % pprint_thing(self.grouper) + ) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like - if getattr(self.grouper, 'dtype', None) is not None: + if getattr(self.grouper, "dtype", None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime + self.grouper = to_datetime(self.grouper) elif is_timedelta64_dtype(self.grouper): from pandas import to_timedelta + self.grouper = to_timedelta(self.grouper) def __repr__(self): - return 'Grouping({0})'.format(self.name) + return "Grouping({0})".format(self.name) def __iter__(self): return iter(self.indices) @@ -376,8 +402,8 @@ def labels(self): def result_index(self): if self.all_grouper is not None: from pandas.core.groupby.categorical import recode_from_groupby - return recode_from_groupby(self.all_grouper, - self.sort, self.group_index) + + return recode_from_groupby(self.all_grouper, self.sort, self.group_index) return self.group_index @property @@ -393,20 +419,26 @@ def _make_labels(self): labels = self.grouper.label_info uniques = self.grouper.result_index else: - labels, uniques = algorithms.factorize( - self.grouper, sort=self.sort) + labels, uniques = algorithms.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, 
name=self.name) self._labels = labels self._group_index = uniques @cache_readonly def groups(self): - return self.index.groupby(Categorical.from_codes(self.labels, - self.group_index)) - - -def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=False, mutated=False, validate=True): + return self.index.groupby(Categorical.from_codes(self.labels, self.group_index)) + + +def _get_grouper( + obj, + key=None, + axis=0, + level=None, + sort=True, + observed=False, + mutated=False, + validate=True, +): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -460,18 +492,17 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, if nlevels == 1: level = level[0] elif nlevels == 0: - raise ValueError('No group keys passed!') + raise ValueError("No group keys passed!") else: - raise ValueError('multiple levels only valid with ' - 'MultiIndex') + raise ValueError("multiple levels only valid with " "MultiIndex") if isinstance(level, str): if obj.index.name != level: - raise ValueError('level name {} is not the name of the ' - 'index'.format(level)) + raise ValueError( + "level name {} is not the name of the " "index".format(level) + ) elif level > 0 or level < -1: - raise ValueError( - 'level > 0 or level < -1 only valid with MultiIndex') + raise ValueError("level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. @@ -501,13 +532,16 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, all_hashable = is_tuple and is_hashable(key) if is_tuple: - if ((all_hashable and key not in obj and set(key).issubset(obj)) - or not all_hashable): + if ( + all_hashable and key not in obj and set(key).issubset(obj) + ) or not all_hashable: # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] - msg = ("Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by=[...]' instead of 'by=(...)'. In " - "the future, a tuple will always mean a single key.") + msg = ( + "Interpreting tuple 'by' as a list of keys, rather than " + "a single key. Use 'by=[...]' instead of 'by=(...)'. In " + "the future, a tuple will always mean a single key." + ) warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) @@ -521,15 +555,22 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) - any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray)) - for g in keys) + any_arraylike = any( + isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys + ) # is this an index replacement? 
- if (not any_callable and not any_arraylike and not any_groupers and - match_axis_length and level is None): + if ( + not any_callable + and not any_arraylike + and not any_groupers + and match_axis_length + and level is None + ): if isinstance(obj, DataFrame): - all_in_columns_index = all(g in obj.columns or g in - obj.index.names for g in keys) + all_in_columns_index = all( + g in obj.columns or g in obj.index.names for g in keys + ) elif isinstance(obj, Series): all_in_columns_index = all(g in obj.index.names for g in keys) @@ -588,29 +629,37 @@ def is_in_obj(gpr): if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - ("Length of grouper ({len_gpr}) and axis ({len_axis})" - " must be same length" - .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) + ( + "Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length".format( + len_gpr=len(gpr), len_axis=obj.shape[axis] + ) + ) + ) # create the Grouping # allow us to passing the actual Grouping as the gpr - ping = (Grouping(group_axis, - gpr, - obj=obj, - name=name, - level=level, - sort=sort, - observed=observed, - in_axis=in_axis) - if not isinstance(gpr, Grouping) else gpr) + ping = ( + Grouping( + group_axis, + gpr, + obj=obj, + name=name, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis, + ) + if not isinstance(gpr, Grouping) + else gpr + ) groupings.append(ping) if len(groupings) == 0 and len(obj): - raise ValueError('No group keys passed!') + raise ValueError("No group keys passed!") elif len(groupings) == 0: - groupings.append(Grouping(Index([], dtype='int'), - np.array([], dtype=np.intp))) + groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) @@ -618,8 +667,7 @@ def is_in_obj(gpr): def _is_label_like(val): - return (isinstance(val, (str, tuple)) or - (val is not None and is_scalar(val))) + return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) def _convert_grouper(axis, grouper): @@ -632,7 +680,7 @@ def _convert_grouper(axis, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): - raise ValueError('Grouper and axis must be same length') + raise ValueError("Grouper and axis must be same length") return grouper else: return grouper diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index dd44bc6990d598..33341a489866bb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -17,10 +17,21 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, - ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, - is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_int_or_float, + ensure_object, + ensure_platform_int, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_integer_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms @@ -32,8 +43,13 @@ from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( - compress_group_index, 
decons_obs_group_ids, get_flattened_iterator, - get_group_index, get_group_index_sorter, get_indexer_dict) + compress_group_index, + decons_obs_group_ids, + get_flattened_iterator, + get_group_index, + get_group_index_sorter, + get_indexer_dict, +) def generate_bins_generic(values, binner, closed): @@ -78,8 +94,9 @@ def generate_bins_generic(values, binner, closed): r_bin = binner[i + 1] # count values in current bin, advance to next bin - while j < lenidx and (values[j] < r_bin or - (closed == 'right' and values[j] == r_bin)): + while j < lenidx and ( + values[j] < r_bin or (closed == "right" and values[j] == r_bin) + ): j += 1 bins[bc] = j @@ -111,8 +128,9 @@ class BaseGrouper: """ - def __init__(self, axis, groupings, sort=True, group_keys=True, - mutated=False, indexer=None): + def __init__( + self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None + ): self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -166,10 +184,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, - ngroups, - self.levels, - self.labels) + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels) def apply(self, f, data, axis=0): mutated = self.mutated @@ -179,8 +194,11 @@ def apply(self, f, data, axis=0): # oh boy f_name = com.get_callable_name(f) - if (f_name not in base.plotting_methods and - hasattr(splitter, 'fast_apply') and axis == 0): + if ( + f_name not in base.plotting_methods + and hasattr(splitter, "fast_apply") + and axis == 0 + ): try: result_values, mutated = splitter.fast_apply(f, group_keys) @@ -199,7 +217,7 @@ def apply(self, f, data, axis=0): pass for key, (i, group) in zip(group_keys, splitter): - object.__setattr__(group, 'name', key) + object.__setattr__(group, "name", key) # result_values is None if fast apply path wasn't taken # or fast apply aborted with an unexpected exception. 
@@ -230,8 +248,7 @@ def indices(self): return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] - keys = [com.values_from_object(ping.group_index) - for ping in self.groupings] + keys = [com.values_from_object(ping.group_index) for ping in self.groupings] return get_indexer_dict(label_list, keys) @property @@ -257,9 +274,7 @@ def size(self): out = np.bincount(ids[ids != -1], minlength=ngroup) else: out = [] - return Series(out, - index=self.result_index, - dtype='int64') + return Series(out, index=self.result_index, dtype="int64") @cache_readonly def groups(self): @@ -296,8 +311,7 @@ def label_info(self): def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: - group_index = get_group_index(all_labels, self.shape, - sort=True, xnull=True) + group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] @@ -311,8 +325,7 @@ def ngroups(self): def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids( - comp_ids, obs_ids, self.shape, labels, xnull=True) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): @@ -321,10 +334,9 @@ def result_index(self): codes = self.recons_labels levels = [ping.result_index for ping in self.groupings] - result = MultiIndex(levels=levels, - codes=codes, - verify_integrity=False, - names=self.names) + result = MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + ) return result def get_group_levels(self): @@ -344,49 +356,45 @@ def get_group_levels(self): # Aggregation functions _cython_functions = { - 'aggregate': { - 'add': 'group_add', - 'prod': 'group_prod', - 'min': 'group_min', - 'max': 'group_max', - 'mean': 'group_mean', - 'median': { - 'name': 'group_median' + "aggregate": { + "add": "group_add", + "prod": "group_prod", + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": {"name": "group_median"}, + "var": "group_var", + "first": { + "name": "group_nth", + "f": lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1), }, - 'var': 'group_var', - 'first': { - 'name': 'group_nth', - 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1) + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": { + "name": "group_rank", + "f": lambda func, a, b, c, d, e, **kwargs: func( + a, + b, + c, + e, + kwargs.get("ties_method", "average"), + kwargs.get("ascending", True), + kwargs.get("pct", False), + kwargs.get("na_option", "keep"), + ), }, - 'last': 'group_last', - 'ohlc': 'group_ohlc', }, - - 'transform': { - 'cumprod': 'group_cumprod', - 'cumsum': 'group_cumsum', - 'cummin': 'group_cummin', - 'cummax': 'group_cummax', - 'rank': { - 'name': 'group_rank', - 'f': lambda func, a, b, c, d, e, **kwargs: func( - a, b, c, e, - kwargs.get('ties_method', 'average'), - kwargs.get('ascending', True), - kwargs.get('pct', False), - kwargs.get('na_option', 'keep') - ) - } - } } - _cython_arity = { - 'ohlc': 4, # OHLC - } + _cython_arity = {"ohlc": 4} # OHLC - _name_functions = { - 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] - } + _name_functions = {"ohlc": lambda *args: ["open", "high", "low", "close"]} def _is_builtin_func(self, arg): """ @@ 
-407,19 +415,22 @@ def get_func(fname): return f # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "{fname}_{dtype_str}".format( - fname=fname, dtype_str=dt), None) + for dt in [dtype_str, "object"]: + f = getattr( + libgroupby, + "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), + None, + ) if f is not None: return f ftype = self._cython_functions[kind][how] if isinstance(ftype, dict): - func = afunc = get_func(ftype['name']) + func = afunc = get_func(ftype["name"]) # a sub-function - f = ftype.get('f') + f = ftype.get("f") if f is not None: def wrapper(*args, **kwargs): @@ -434,14 +445,13 @@ def wrapper(*args, **kwargs): if func is None: raise NotImplementedError( "function is not implemented for this dtype: " - "[how->{how},dtype->{dtype_str}]".format(how=how, - dtype_str=dtype_str)) + "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str) + ) return func - def _cython_operation(self, kind, values, how, axis, min_count=-1, - **kwargs): - assert kind in ['transform', 'aggregate'] + def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): + assert kind in ["transform", "aggregate"] # can we do this operation with our cython functions # if not raise NotImplementedError @@ -453,17 +463,18 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - "{} are not support in cython ops".format(values.dtype)) + "{} are not support in cython ops".format(values.dtype) + ) elif is_datetime64_any_dtype(values): - if how in ['add', 'prod', 'cumsum', 'cumprod']: + if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( - "datetime64 type does not support {} " - "operations".format(how)) + "datetime64 type does not support {} " "operations".format(how) + ) elif is_timedelta64_dtype(values): - if how in ['prod', 'cumprod']: + if how in ["prod", "cumprod"]: raise NotImplementedError( - "timedelta64 type does not support {} " - "operations".format(how)) + "timedelta64 type does not support {} " "operations".format(how) + ) arity = self._cython_arity.get(how, 1) @@ -478,15 +489,16 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, assert axis == 1, axis values = values.T if arity > 1: - raise NotImplementedError("arity of more than 1 is not " - "supported for the 'how' argument") + raise NotImplementedError( + "arity of more than 1 is not " "supported for the 'how' argument" + ) out_shape = (self.ngroups,) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: - values = values.view('int64') + values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) @@ -503,59 +515,65 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, values = values.astype(object) try: - func = self._get_cython_function( - kind, how, values, is_numeric) + func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) - func = self._get_cython_function( - kind, how, values, is_numeric) + func = self._get_cython_function(kind, how, values, is_numeric) else: raise - if how == 'rank': - out_dtype = 'float' + if how == "rank": + out_dtype = "float" else: if is_numeric: - out_dtype = '{kind}{itemsize}'.format( - kind=values.dtype.kind, 
itemsize=values.dtype.itemsize) + out_dtype = "{kind}{itemsize}".format( + kind=values.dtype.kind, itemsize=values.dtype.itemsize + ) else: - out_dtype = 'object' + out_dtype = "object" labels, _, _ = self.group_info - if kind == 'aggregate': - result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), - fill_value=np.nan) + if kind == "aggregate": + result = _maybe_fill( + np.empty(out_shape, dtype=out_dtype), fill_value=np.nan + ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_numeric, - is_datetimelike, min_count) - elif kind == 'transform': - result = _maybe_fill(np.empty_like(values, dtype=out_dtype), - fill_value=np.nan) + result, + counts, + values, + labels, + func, + is_numeric, + is_datetimelike, + min_count, + ) + elif kind == "transform": + result = _maybe_fill( + np.empty_like(values, dtype=out_dtype), fill_value=np.nan + ) # TODO: min_count result = self._transform( - result, values, labels, func, is_numeric, is_datetimelike, - **kwargs) + result, values, labels, func, is_numeric, is_datetimelike, **kwargs + ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): - result = result.astype('float64') + result = result.astype("float64") result[mask] = np.nan - if (kind == 'aggregate' and - self._filter_empty_groups and not counts.all()): + if kind == "aggregate" and self._filter_empty_groups and not counts.all(): if result.ndim == 2: try: - result = lib.row_bool_subset( - result, (counts > 0).view(np.uint8)) + result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - ensure_object(result), - (counts > 0).view(np.uint8)) + ensure_object(result), (counts > 0).view(np.uint8) + ) else: result = result[counts > 0] @@ -574,45 +592,69 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, return result, names def aggregate(self, values, how, axis=0, min_count=-1): - return self._cython_operation('aggregate', values, how, axis, - min_count=min_count) + return self._cython_operation( + "aggregate", values, how, axis, min_count=min_count + ) def transform(self, values, how, axis=0, **kwargs): - return self._cython_operation('transform', values, how, axis, **kwargs) - - def _aggregate(self, result, counts, values, comp_ids, agg_func, - is_numeric, is_datetimelike, min_count=-1): + return self._cython_operation("transform", values, how, axis, **kwargs) + + def _aggregate( + self, + result, + counts, + values, + comp_ids, + agg_func, + is_numeric, + is_datetimelike, + min_count=-1, + ): if values.ndim > 3: # punting for now - raise NotImplementedError("number of dimensions is currently " - "limited to 3") + raise NotImplementedError( + "number of dimensions is currently " "limited to 3" + ) elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids, - min_count) + agg_func(result[:, :, i], counts, chunk, comp_ids, min_count) else: agg_func(result, counts, values, comp_ids, min_count) return result - def _transform(self, result, values, comp_ids, transform_func, - is_numeric, is_datetimelike, **kwargs): + def _transform( + self, + result, + values, + comp_ids, + transform_func, + is_numeric, + is_datetimelike, + **kwargs + ): comp_ids, _, ngroups = self.group_info if values.ndim > 3: # punting for now - raise NotImplementedError("number of dimensions is currently " - "limited to 3") + raise NotImplementedError( + 
"number of dimensions is currently " "limited to 3" + ) elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): - transform_func(result[:, :, i], values, - comp_ids, ngroups, is_datetimelike, **kwargs) + transform_func( + result[:, :, i], + values, + comp_ids, + ngroups, + is_datetimelike, + **kwargs + ) else: - transform_func(result, values, comp_ids, ngroups, is_datetimelike, - **kwargs) + transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) return result @@ -626,7 +668,7 @@ def _aggregate_series_fast(self, obj, func): func = self._is_builtin_func(func) if obj.index._has_complex_internals: - raise TypeError('Incompatible index for Cython grouper') + raise TypeError("Incompatible index for Cython grouper") group_index, _, ngroups = self.group_info @@ -634,10 +676,8 @@ def _aggregate_series_fast(self, obj, func): dummy = obj._get_values(slice(None, 0)) indexer = get_group_index_sorter(group_index, ngroups) obj = obj._take(indexer) - group_index = algorithms.take_nd( - group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, - dummy) + group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) + grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -653,9 +693,9 @@ def _aggregate_series_pure_python(self, obj, func): for label, group in splitter: res = func(group) if result is None: - if (isinstance(res, (Series, Index, np.ndarray))): - raise ValueError('Function does not reduce') - result = np.empty(ngroups, dtype='O') + if isinstance(res, (Series, Index, np.ndarray)): + raise ValueError("Function does not reduce") + result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] result[label] = res @@ -695,8 +735,9 @@ class BinGrouper(BaseGrouper): """ - def __init__(self, bins, binlabels, filter_empty=False, mutated=False, - indexer=None): + def __init__( + self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) self._filter_empty_groups = filter_empty @@ -709,8 +750,11 @@ def groups(self): # this is mainly for compat # GH 3881 - result = {key: value for key, value in zip(self.binlabels, self.bins) - if key is not NaT} + result = { + key: value + for key, value in zip(self.binlabels, self.bins) + if key is not NaT + } return result @property @@ -736,8 +780,7 @@ def get_iterator(self, data, axis=0): for each group """ if isinstance(data, NDFrame): - slicer = lambda start, edge: data._slice( - slice(start, edge), axis=axis) + slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis) length = len(data.axes[axis]) else: slicer = lambda start, edge: data[slice(start, edge)] @@ -776,9 +819,11 @@ def group_info(self): else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return (comp_ids.astype('int64', copy=False), - obs_group_ids.astype('int64', copy=False), - ngroups) + return ( + comp_ids.astype("int64", copy=False), + obs_group_ids.astype("int64", copy=False), + ngroups, + ) @cache_readonly def result_index(self): @@ -798,8 +843,11 @@ def names(self): @property def groupings(self): from pandas.core.groupby.grouper import Grouping - return [Grouping(lvl, lvl, in_axis=False, level=None, name=name) - for lvl, name in zip(self.levels, self.names)] + + return [ + Grouping(lvl, lvl, in_axis=False, level=None, name=name) + for lvl, name in zip(self.levels, self.names) + ] def 
agg_series(self, obj, func): dummy = obj[:0] @@ -830,7 +878,6 @@ def _is_indexed_like(obj, axes): class DataSplitter: - def __init__(self, data, labels, ngroups, axis=0): self.data = data self.labels = ensure_int64(labels) @@ -878,13 +925,11 @@ def apply(self, f): class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj): return sdata._get_values(slice_obj) class FrameSplitter(DataSplitter): - def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool try: diff --git a/pandas/core/index.py b/pandas/core/index.py index f14f32c67d4e15..d308ac1a9b1c74 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,7 +1,25 @@ from pandas.core.indexes.api import ( # noqa:F401 - CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index, - IntervalIndex, InvalidIndexError, MultiIndex, NaT, NumericIndex, - PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, _all_indexes_same, - _get_combined_index, _get_consensus_names, _get_objs_combined_axis, - _new_Index, _union_indexes, ensure_index, ensure_index_from_sequences) + CategoricalIndex, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + IntervalIndex, + InvalidIndexError, + MultiIndex, + NaT, + NumericIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, + _all_indexes_same, + _get_combined_index, + _get_consensus_names, + _get_objs_combined_axis, + _new_Index, + _union_indexes, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.multi import _sparsify # noqa:F401 diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 602e11a08b4ed2..5ba23990cbd51e 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -4,9 +4,15 @@ import numpy as np from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike, - is_timedelta64_dtype) + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_arraylike, + is_integer_dtype, + is_list_like, + is_period_arraylike, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.accessor import PandasDelegate, delegate_names @@ -18,15 +24,16 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): - def __init__(self, data, orig): if not isinstance(data, ABCSeries): - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) self._parent = data self.orig = orig - self.name = getattr(data, 'name', None) + self.name = getattr(data, "name", None) self._freeze() def _get_values(self): @@ -47,11 +54,14 @@ def _get_values(self): if is_datetime_arraylike(data): return DatetimeIndex(data, copy=False, name=self.name) - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) def _delegate_property_get(self, name): from pandas import Series + values = self._get_values() result = getattr(values, name) @@ -59,7 +69,7 @@ def _delegate_property_get(self, name): # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): - result = result.astype('int64') + result = result.astype("int64") elif not is_list_like(result): return result @@ 
-75,19 +85,24 @@ def _delegate_property_get(self, name): result = Series(result, index=index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ("modifications to a property of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original.") + result._is_copy = ( + "modifications to a property of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) return result def _delegate_property_set(self, name, value, *args, **kwargs): - raise ValueError("modifications to a property of a datetimelike " - "object are not supported. Change values on the " - "original.") + raise ValueError( + "modifications to a property of a datetimelike " + "object are not supported. Change values on the " + "original." + ) def _delegate_method(self, name, *args, **kwargs): from pandas import Series + values = self._get_values() method = getattr(values, name) @@ -99,19 +114,21 @@ def _delegate_method(self, name, *args, **kwargs): result = Series(result, index=self._parent.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ("modifications to a method of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original.") + result._is_copy = ( + "modifications to a method of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) return result -@delegate_names(delegate=DatetimeArray, - accessors=DatetimeArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=DatetimeArray, - accessors=DatetimeArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_methods, typ="method" +) class DatetimeProperties(Properties): """ Accessor object for datetimelike properties of the Series values. @@ -177,12 +194,14 @@ def freq(self): return self._get_values().inferred_freq -@delegate_names(delegate=TimedeltaArray, - accessors=TimedeltaArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=TimedeltaArray, - accessors=TimedeltaArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=TimedeltaArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", +) class TimedeltaProperties(Properties): """ Accessor object for datetimelike properties of the Series values. @@ -266,12 +285,12 @@ def freq(self): return self._get_values().inferred_freq -@delegate_names(delegate=PeriodArray, - accessors=PeriodArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=PeriodArray, - accessors=PeriodArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method" +) class PeriodProperties(Properties): """ Accessor object for datetimelike properties of the Series values. 
@@ -287,9 +306,9 @@ class PeriodProperties(Properties): """ -class CombinedDatetimelikeProperties(DatetimeProperties, - TimedeltaProperties, PeriodProperties): - +class CombinedDatetimelikeProperties( + DatetimeProperties, TimedeltaProperties, PeriodProperties +): def __new__(cls, data): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is @@ -298,14 +317,14 @@ def __new__(cls, data): from pandas import Series if not isinstance(data, Series): - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) orig = data if is_categorical_dtype(data) else None if orig is not None: - data = Series(orig.values.categories, - name=orig.name, - copy=False) + data = Series(orig.values.categories, name=orig.name, copy=False) try: if is_datetime64_dtype(data.dtype): @@ -321,5 +340,4 @@ def __new__(cls, data): except Exception: pass # we raise an attribute error anyway - raise AttributeError("Can only use .dt accessor with datetimelike " - "values") + raise AttributeError("Can only use .dt accessor with datetimelike " "values") diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 6299fc482d0dfa..a17f74286d59f3 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -5,40 +5,64 @@ import pandas.core.common as com from pandas.core.indexes.base import ( - Index, _new_Index, ensure_index, ensure_index_from_sequences) + Index, + _new_Index, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.base import InvalidIndexError # noqa:F401 from pandas.core.indexes.category import CategoricalIndex # noqa:F401 from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.interval import IntervalIndex # noqa:F401 from pandas.core.indexes.multi import MultiIndex # noqa:F401 from pandas.core.indexes.numeric import ( # noqa:F401 - Float64Index, Int64Index, NumericIndex, UInt64Index) + Float64Index, + Int64Index, + NumericIndex, + UInt64Index, +) from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.range import RangeIndex # noqa:F401 from pandas.core.indexes.timedeltas import TimedeltaIndex -_sort_msg = textwrap.dedent("""\ +_sort_msg = textwrap.dedent( + """\ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. 
-""") +""" +) # TODO: there are many places that rely on these private methods existing in # pandas.core.index -__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index', - 'InvalidIndexError', 'TimedeltaIndex', - 'PeriodIndex', 'DatetimeIndex', - '_new_Index', 'NaT', - 'ensure_index', 'ensure_index_from_sequences', - '_get_combined_index', - '_get_objs_combined_axis', '_union_indexes', - '_get_consensus_names', - '_all_indexes_same'] +__all__ = [ + "Index", + "MultiIndex", + "NumericIndex", + "Float64Index", + "Int64Index", + "CategoricalIndex", + "IntervalIndex", + "RangeIndex", + "UInt64Index", + "InvalidIndexError", + "TimedeltaIndex", + "PeriodIndex", + "DatetimeIndex", + "_new_Index", + "NaT", + "ensure_index", + "ensure_index_from_sequences", + "_get_combined_index", + "_get_objs_combined_axis", + "_union_indexes", + "_get_consensus_names", + "_all_indexes_same", +] def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): @@ -64,8 +88,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): ------- Index """ - obs_idxes = [obj._get_axis(axis) for obj in objs - if hasattr(obj, '_get_axis')] + obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")] if obs_idxes: return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) @@ -142,7 +165,7 @@ def _union_indexes(indexes, sort=True): Index """ if len(indexes) == 0: - raise AssertionError('Must have at least 1 Index to union') + raise AssertionError("Must have at least 1 Index to union") if len(indexes) == 1: result = indexes[0] if isinstance(result, list): @@ -165,24 +188,24 @@ def _unique_indices(inds): ------- Index """ + def conv(i): if isinstance(i, Index): i = i.tolist() return i - return Index( - lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) + return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) - if kind == 'special': + if kind == "special": result = indexes[0] - if hasattr(result, 'union_many'): + if hasattr(result, "union_many"): return result.union_many(indexes[1:]) else: for other in indexes[1:]: result = result.union(other) return result - elif kind == 'array': + elif kind == "array": index = indexes[0] for other in indexes[1:]: if not index.equals(other): @@ -227,17 +250,18 @@ def _sanitize_and_check(indexes): if list in kinds: if len(kinds) > 1: - indexes = [Index(com.try_sort(x)) - if not isinstance(x, Index) else - x for x in indexes] + indexes = [ + Index(com.try_sort(x)) if not isinstance(x, Index) else x + for x in indexes + ] kinds.remove(list) else: - return indexes, 'list' + return indexes, "list" if len(kinds) > 1 or Index not in kinds: - return indexes, 'special' + return indexes, "special" else: - return indexes, 'array' + return indexes, "array" def _get_consensus_names(indexes): @@ -259,8 +283,7 @@ def _get_consensus_names(indexes): # find the non-none names, need to tupleify to make # the set hashable, then reverse on return - consensus_names = {tuple(i.names) for i in indexes - if com._any_not_none(*i.names)} + consensus_names = {tuple(i.names) for i in indexes if com._any_not_none(*i.names)} if len(consensus_names) == 1: return list(list(consensus_names)[0]) return [None] * indexes[0].nlevels diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e0d26750df00d..973a022cfc3f15 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -17,18 +17,47 @@ from 
pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( - ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, - is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, - is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_float, is_float_dtype, is_hashable, - is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, - is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) + ensure_categorical, + ensure_int64, + ensure_object, + ensure_platform_int, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, - ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries, - ABCTimedeltaArray, ABCTimedeltaIndex) + ABCDataFrame, + ABCDateOffset, + ABCDatetimeArray, + ABCIndexClass, + ABCMultiIndex, + ABCPandasArray, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaArray, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core import ops @@ -44,16 +73,24 @@ from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( - default_pprint, format_object_attrs, format_object_summary, pprint_thing) - -__all__ = ['Index'] - -_unsortable_types = frozenset(('mixed', 'mixed-integer')) - -_index_doc_kwargs = dict(klass='Index', inplace='', - target_klass='Index', - raises_section='', - unique='Index', duplicated='np.ndarray') + default_pprint, + format_object_attrs, + format_object_summary, + pprint_thing, +) + +__all__ = ["Index"] + +_unsortable_types = frozenset(("mixed", "mixed-integer")) + +_index_doc_kwargs = dict( + klass="Index", + inplace="", + target_klass="Index", + raises_section="", + unique="Index", + duplicated="np.ndarray", +) _index_shared_docs = dict() @@ -61,15 +98,15 @@ def _make_comparison_op(op, cls): def cmp_method(self, other): if isinstance(other, (np.ndarray, Index, ABCSeries)): if other.ndim > 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") if is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) else: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index @@ -81,7 +118,7 @@ def cmp_method(self, other): except TypeError: return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) # TODO: docstring? 
return set_function_name(cmp_method, name, cls) @@ -93,12 +130,14 @@ def index_arithmetic_method(self, other): elif isinstance(other, ABCTimedeltaIndex): # Defer to subclass implementation return NotImplemented - elif (isinstance(other, (np.ndarray, ABCTimedeltaArray)) and - is_timedelta64_dtype(other)): + elif isinstance( + other, (np.ndarray, ABCTimedeltaArray) + ) and is_timedelta64_dtype(other): # GH#22390; wrap in Series for op, this will in turn wrap in # TimedeltaIndex, but will correctly raise TypeError instead of # NullFrequencyError for add/sub ops from pandas import Series + other = Series(other) out = op(self, other) return Index(out, name=self.name) @@ -112,7 +151,7 @@ def index_arithmetic_method(self, other): return self._evaluate_with_datetime_like(other, op) values = self.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(values, other) result = missing.dispatch_missing(op, values, other, result) @@ -125,7 +164,7 @@ def index_arithmetic_method(self, other): result = Index(result, **attrs) return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) # TODO: docstring? return set_function_name(index_arithmetic_method, name, cls) @@ -147,6 +186,7 @@ def _new_Index(cls, d): # ordinals through __new__ GH #13277 if issubclass(cls, ABCPeriodIndex): from pandas.core.indexes.period import _new_PeriodIndex + return _new_PeriodIndex(cls, **d) return cls.__new__(cls, **d) @@ -191,8 +231,9 @@ class Index(IndexOpsMixin, PandasObject): >>> pd.Index(list('abc')) Index(['a', 'b', 'c'], dtype='object') """ + # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = DirNamesMixin._deprecations | frozenset(['tolist']) + _deprecations = DirNamesMixin._deprecations | frozenset(["tolist"]) # To hand over control to subclasses _join_precedence = 1 @@ -213,12 +254,12 @@ def _inner_indexer(self, left, right): def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) - _typ = 'index' + _typ = "index" _data = None _id = None name = None - _comparables = ['name'] - _attributes = ['name'] + _comparables = ["name"] + _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True @@ -231,27 +272,39 @@ def _outer_indexer(self, left, right): _engine_type = libindex.ObjectEngine - _accessors = {'str'} + _accessors = {"str"} str = CachedAccessor("str", StringMethods) # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=None, tupleize_cols=True, **kwargs): + def __new__( + cls, + data=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + tupleize_cols=True, + **kwargs + ): - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name) from .range import RangeIndex + if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
data = data.to_numpy() @@ -265,20 +318,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, - **kwargs) + + return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - elif ((is_interval_dtype(data) or is_interval_dtype(dtype)) and - not is_object_dtype(dtype)): + elif ( + is_interval_dtype(data) or is_interval_dtype(dtype) + ) and not is_object_dtype(dtype): from .interval import IntervalIndex - closed = kwargs.get('closed', None) - return IntervalIndex(data, dtype=dtype, name=name, copy=copy, - closed=closed) - elif (is_datetime64_any_dtype(data) or - (dtype is not None and is_datetime64_any_dtype(dtype)) or - 'tz' in kwargs): + closed = kwargs.get("closed", None) + return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + + elif ( + is_datetime64_any_dtype(data) + or (dtype is not None and is_datetime64_any_dtype(dtype)) + or "tz" in kwargs + ): from pandas import DatetimeIndex if dtype is not None and is_dtype_equal(_o_dtype, dtype): @@ -291,25 +347,30 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, result = DatetimeIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = DatetimeIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + result = DatetimeIndex( + data, copy=copy, name=name, dtype=dtype, **kwargs + ) return result - elif (is_timedelta64_dtype(data) or - (dtype is not None and is_timedelta64_dtype(dtype))): + elif is_timedelta64_dtype(data) or ( + dtype is not None and is_timedelta64_dtype(dtype) + ): from pandas import TimedeltaIndex + if dtype is not None and is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + result = TimedeltaIndex( + data, copy=copy, name=name, dtype=dtype, **kwargs + ) return result elif is_period_dtype(data) and not is_object_dtype(dtype): from pandas import PeriodIndex + result = PeriodIndex(data, copy=copy, name=name, **kwargs) return result @@ -320,12 +381,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # coerce to the provided dtype data = dtype.construct_array_type()._from_sequence( - data, dtype=dtype, copy=False) + data, dtype=dtype, copy=False + ) # coerce to the object dtype data = data.astype(object) - return Index(data, dtype=object, copy=copy, name=name, - **kwargs) + return Index(data, dtype=object, copy=copy, name=name, **kwargs) # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -339,13 +400,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # GH 11836 if is_integer_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) - if inferred == 'integer': - data = maybe_cast_to_integer_array(data, dtype, - copy=copy) - elif inferred in ['floating', 'mixed-integer-float']: + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: if isna(data).any(): - raise ValueError('cannot convert float ' - 'NaN to integer') + raise ValueError( + "cannot convert float " "NaN to integer" + ) if inferred == "mixed-integer-float": data = maybe_cast_to_integer_array(data, 
dtype) @@ -354,22 +415,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. try: return cls._try_convert_to_int_index( - data, copy, name, dtype) + data, copy, name, dtype + ) except ValueError: pass # Return an actual float index. from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, - name=name) - elif inferred == 'string': + return Float64Index(data, copy=copy, dtype=dtype, name=name) + + elif inferred == "string": pass else: data = data.astype(dtype) elif is_float_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) - if inferred == 'string': + if inferred == "string": pass else: data = data.astype(dtype) @@ -378,25 +440,29 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, except (TypeError, ValueError) as e: msg = str(e) - if ("cannot convert float" in msg or - "Trying to coerce float values to integer" in msg): + if ( + "cannot convert float" in msg + or "Trying to coerce float values to integer" in msg + ): raise # maybe coerce to a sub-class - from pandas.core.indexes.period import ( - PeriodIndex, IncompatibleFrequency) + from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency if is_signed_integer_dtype(data.dtype): from .numeric import Int64Index + return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): from .numeric import UInt64Index + return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): from .numeric import Float64Index + return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): - subarr = data.astype('object') + subarr = data.astype("object") else: subarr = com.asarray_tuplesafe(data, dtype=object) @@ -407,54 +473,57 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if dtype is None: inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == 'integer': + if inferred == "integer": try: - return cls._try_convert_to_int_index( - subarr, copy, name, dtype) + return cls._try_convert_to_int_index(subarr, copy, name, dtype) except ValueError: pass - return Index(subarr, copy=copy, - dtype=object, name=name) - elif inferred in ['floating', 'mixed-integer-float']: + return Index(subarr, copy=copy, dtype=object, name=name) + elif inferred in ["floating", "mixed-integer-float"]: from .numeric import Float64Index + return Float64Index(subarr, copy=copy, name=name) - elif inferred == 'interval': + elif inferred == "interval": from .interval import IntervalIndex + try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: # GH27172: mixed closed Intervals --> object dtype pass - elif inferred == 'boolean': + elif inferred == "boolean": # don't support boolean explicitly ATM pass - elif inferred != 'string': - if inferred.startswith('datetime'): - if (lib.is_datetime_with_singletz_array(subarr) or - 'tz' in kwargs): + elif inferred != "string": + if inferred.startswith("datetime"): + if ( + lib.is_datetime_with_singletz_array(subarr) + or "tz" in kwargs + ): # only when subarr has the same tz from pandas import DatetimeIndex + try: - return DatetimeIndex(subarr, copy=copy, - name=name, **kwargs) + return DatetimeIndex( + subarr, copy=copy, name=name, **kwargs + ) except OutOfBoundsDatetime: pass - elif inferred.startswith('timedelta'): + elif inferred.startswith("timedelta"): from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, - **kwargs) - elif 
inferred == 'period': + + return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) + elif inferred == "period": try: return PeriodIndex(subarr, name=name, **kwargs) except IncompatibleFrequency: pass return cls._simple_new(subarr, name) - elif hasattr(data, '__array__'): - return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, - **kwargs) + elif hasattr(data, "__array__"): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) elif data is None or is_scalar(data): cls._scalar_data_error(data) else: @@ -467,8 +536,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # we must be all tuples, otherwise don't construct # 10697 from .multi import MultiIndex + return MultiIndex.from_tuples( - data, names=name or kwargs.get('names')) + data, names=name or kwargs.get("names") + ) # other iterable of some kind subarr = com.asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) @@ -512,14 +583,15 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): Must be careful not to recurse. """ - if not hasattr(values, 'dtype'): + if not hasattr(values, "dtype"): if (values is None or not len(values)) and dtype is not None: values = np.empty(0, dtype=dtype) else: values = np.array(values, copy=False) if is_object_dtype(values): - values = cls(values, name=name, dtype=dtype, - **kwargs)._ndarray_values + values = cls( + values, name=name, dtype=dtype, **kwargs + )._ndarray_values if isinstance(values, (ABCSeries, ABCIndexClass)): # Index._data must always be an ndarray. @@ -553,7 +625,9 @@ def _get_attributes_dict(self): """ return {k: getattr(self, k, None) for k in self._attributes} - _index_shared_docs['_shallow_copy'] = """ + _index_shared_docs[ + "_shallow_copy" + ] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking precedence. 
@@ -566,17 +640,17 @@ def _get_attributes_dict(self): kwargs : updates the default attributes for this Index """ - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values attributes = self._get_attributes_dict() attributes.update(kwargs) - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype # _simple_new expects an the type of self._data - values = getattr(values, '_values', values) + values = getattr(values, "_values", values) if isinstance(values, ABCDatetimeArray): # `self.values` returns `self` for tz-aware, so we need to unwrap # more specifically @@ -599,9 +673,9 @@ def _shallow_copy_with_infer(self, values, **kwargs): """ attributes = self._get_attributes_dict() attributes.update(kwargs) - attributes['copy'] = False - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + attributes["copy"] = False + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype if self._infer_as_myclass: try: return self._constructor(values, **attributes) @@ -630,8 +704,7 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr( - other, '_id', Ellipsis) and self._id is not None + return self._id is getattr(other, "_id", Ellipsis) and self._id is not None def _reset_identity(self): """ @@ -690,12 +763,15 @@ def dtype_str(self): .. deprecated:: 0.25.0 """ - warnings.warn('`dtype_str` has been deprecated. Call `str` on the ' - 'dtype attribute instead.', FutureWarning, - stacklevel=2) + warnings.warn( + "`dtype_str` has been deprecated. Call `str` on the " + "dtype attribute instead.", + FutureWarning, + stacklevel=2, + ) return str(self.dtype) - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. @@ -714,7 +790,7 @@ def view(self, cls=None): # we need to see if we are subclassing an # index type here - if cls is not None and not hasattr(cls, '_typ'): + if cls is not None and not hasattr(cls, "_typ"): result = self._data.view(cls) else: result = self._shallow_copy() @@ -722,7 +798,9 @@ def view(self, cls=None): result._id = self._id return result - _index_shared_docs['astype'] = """ + _index_shared_docs[ + "astype" + ] = """ Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is impossible, a ValueError exception is raised. @@ -747,22 +825,22 @@ def view(self, cls=None): Index with values cast to specified dtype. """ - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self elif is_categorical_dtype(dtype): from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) + + return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) elif is_datetime64tz_dtype(dtype): # TODO(GH-24559): Remove this block, use the following elif. # avoid FutureWarning from DatetimeIndex constructor. 
from pandas import DatetimeIndex + tz = pandas_dtype(dtype).tz - return (DatetimeIndex(np.asarray(self)) - .tz_localize("UTC").tz_convert(tz)) + return DatetimeIndex(np.asarray(self)).tz_localize("UTC").tz_convert(tz) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), dtype=dtype, copy=copy) @@ -770,15 +848,20 @@ def astype(self, dtype, copy=True): try: if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) + + return DatetimeIndex( + self.values, name=self.name, dtype=dtype, copy=copy + ) + return Index( + self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype + ) except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - _index_shared_docs['take'] = """ + _index_shared_docs[ + "take" + ] = """ Return a new %(klass)s of the values selected by the indices. For internal compatibility with numpy arrays. @@ -804,26 +887,29 @@ def astype(self, dtype, copy=True): numpy.ndarray.take """ - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if kwargs: nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) + taken = self._assert_take_fillable( + self.values, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) else: if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' + msg = "Unable to fill values because {0} cannot contain NA" raise ValueError(msg.format(self.__class__.__name__)) taken = self.values.take(indices) return self._shallow_copy(taken) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan + ): """ Internal method to handle NA filling of take. """ @@ -832,18 +918,21 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=na_value + ) else: taken = values.take(indices) return taken - _index_shared_docs['repeat'] = """ + _index_shared_docs[ + "repeat" + ] = """ Repeat elements of a %(klass)s. 
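# Illustrative usage sketch of the `Index.take` / `Index.repeat` behaviour whose
# implementation is reformatted in the hunks above (assumes the 0.25-era public
# pandas API; the repeat values mirror the shared docstring example).
import pandas as pd

idx = pd.Index(['a', 'b', 'c'])
idx.take([2, 0, 1])     # -> Index(['c', 'a', 'b'], dtype='object')
idx.repeat(2)           # -> Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
idx.repeat([1, 2, 3])   # -> Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')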
Returns a new %(klass)s where each element of the current %(klass)s @@ -880,7 +969,7 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') """ - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) @@ -888,7 +977,9 @@ def repeat(self, repeats, axis=None): # -------------------------------------------------------------------- # Copying Methods - _index_shared_docs['copy'] = """ + _index_shared_docs[ + "copy" + ] = """ Make a copy of this object. Name and dtype sets those attributes on the new object. @@ -908,14 +999,14 @@ def repeat(self, repeats, axis=None): ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ - @Appender(_index_shared_docs['copy']) + @Appender(_index_shared_docs["copy"]) def copy(self, name=None, deep=False, dtype=None, **kwargs): if deep: new_index = self._shallow_copy(self._data.copy()) else: new_index = self._shallow_copy() - names = kwargs.get('names') + names = kwargs.get("names") names = self._validate_names(name=name, names=names, deep=deep) new_index = new_index.set_names(names) @@ -949,12 +1040,11 @@ def __repr__(self): attrs = self._format_attrs() space = self._format_space() - prepr = (",%s" % - space).join("%s=%s" % (k, v) for k, v in attrs) + prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs) # no data provided, just attributes if data is None: - data = '' + data = "" res = "%s(%s%s)" % (klass, data, prepr) @@ -983,12 +1073,16 @@ def _format_data(self, name=None): """ # do we want to justify (only do so for non-objects) - is_justify = not (self.inferred_type in ('string', 'unicode') or - (self.inferred_type == 'categorical' and - is_object_dtype(self.categories))) + is_justify = not ( + self.inferred_type in ("string", "unicode") + or ( + self.inferred_type == "categorical" and is_object_dtype(self.categories) + ) + ) - return format_object_summary(self, self._formatter_func, - is_justify=is_justify, name=name) + return format_object_summary( + self, self._formatter_func, is_justify=is_justify, name=name + ) def _format_attrs(self): """ @@ -1006,16 +1100,18 @@ def format(self, name=False, formatter=None, **kwargs): """ header = [] if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) if formatter is not None: return header + list(self.map(formatter)) return self._format_with_header(header, **kwargs) - def _format_with_header(self, header, na_rep='NaN', **kwargs): + def _format_with_header(self, header, na_rep="NaN", **kwargs): values = self.values from pandas.io.formats.format import format_array @@ -1027,8 +1123,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): values = lib.maybe_convert_objects(values, safe=1) if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] + result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] # could have nans mask = isna(values) @@ -1038,7 +1133,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): result = result.tolist() else: - result = _trim_front(format_array(values, None, justify='left')) + result = _trim_front(format_array(values, 
None, justify="left")) return header + result def to_native_types(self, slicer=None, **kwargs): @@ -1072,7 +1167,7 @@ def to_native_types(self, slicer=None, **kwargs): values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep='', quoting=None, **kwargs): + def _format_native_types(self, na_rep="", quoting=None, **kwargs): """ Actually format specific types of the index. """ @@ -1100,19 +1195,18 @@ def _summary(self, name=None): """ if len(self) > 0: head = self[0] - if hasattr(head, 'format') and not isinstance(head, str): + if hasattr(head, "format") and not isinstance(head, str): head = head.format() tail = self[-1] - if hasattr(tail, 'format') and not isinstance(tail, str): + if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) + index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) else: - index_summary = '' + index_summary = "" if name is None: name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + return "%s: %s entries%s" % (name, len(self), index_summary) def summary(self, name=None): """ @@ -1120,8 +1214,11 @@ def summary(self, name=None): .. deprecated:: 0.23.0 """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) + warnings.warn( + "'summary' is deprecated and will be removed in a " "future version.", + FutureWarning, + stacklevel=2, + ) return self._summary(name) # -------------------------------------------------------------------- @@ -1227,6 +1324,7 @@ def to_frame(self, index=True, name=None): """ from pandas import DataFrame + if name is None: name = self.name or 0 result = DataFrame({name: self._values.copy()}) @@ -1244,6 +1342,7 @@ def _validate_names(self, name=None, names=None, deep=False): Index and plural 'names' parameter for MultiIndex. """ from copy import deepcopy + if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: @@ -1258,7 +1357,7 @@ def _validate_names(self, name=None, names=None, deep=False): return name def _get_names(self): - return FrozenList((self.name, )) + return FrozenList((self.name,)) def _set_names(self, values, level=None): """ @@ -1277,17 +1376,17 @@ def _set_names(self, values, level=None): TypeError if each name is not hashable. 
""" if not is_list_like(values): - raise ValueError('Names must be a list-like') + raise ValueError("Names must be a list-like") if len(values) != 1: - raise ValueError('Length of new names must be 1, got %d' % - len(values)) + raise ValueError("Length of new names must be 1, got %d" % len(values)) # GH 20527 # All items in 'name' need to be hashable: for name in values: if not is_hashable(name): - raise TypeError('{}.name must be a hashable type' - .format(self.__class__.__name__)) + raise TypeError( + "{}.name must be a hashable type".format(self.__class__.__name__) + ) self.name = values[0] names = property(fset=_set_names, fget=_get_names) @@ -1350,10 +1449,9 @@ def set_names(self, names, level=None, inplace=False): """ if level is not None and not isinstance(self, ABCMultiIndex): - raise ValueError('Level must be None for non-MultiIndex') + raise ValueError("Level must be None for non-MultiIndex") - if level is not None and not is_list_like(level) and is_list_like( - names): + if level is not None and not is_list_like(level) and is_list_like(names): msg = "Names must be a string when a single level is provided." raise TypeError(msg) @@ -1450,15 +1548,16 @@ def _validate_index_level(self, level): """ if isinstance(level, int): if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) + raise IndexError( + "Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level,) + ) elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) + raise IndexError( + "Too many levels:" " Index has only 1 level, not %d" % (level + 1) + ) elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) + raise KeyError("Level %s must be same as name (%s)" % (level, self.name)) def _get_level_number(self, level): self._validate_index_level(level) @@ -1552,9 +1651,11 @@ def droplevel(self, level=0): if len(level) == 0: return self if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) + raise ValueError( + "Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels) + ) # The two checks above guarantee that here self is a MultiIndex new_levels = list(self.levels) @@ -1578,10 +1679,17 @@ def droplevel(self, level=0): return result else: from .multi import MultiIndex - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) - _index_shared_docs['_get_grouper_for_level'] = """ + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + + _index_shared_docs[ + "_get_grouper_for_level" + ] = """ Get index grouper corresponding to an index level Parameters @@ -1601,7 +1709,7 @@ def droplevel(self, level=0): Index of unique values for level. 
""" - @Appender(_index_shared_docs['_get_grouper_for_level']) + @Appender(_index_shared_docs["_get_grouper_for_level"]) def _get_grouper_for_level(self, mapper, level=None): assert level is None or level == 0 if mapper is None: @@ -1704,16 +1812,16 @@ def has_duplicates(self): return not self.is_unique def is_boolean(self): - return self.inferred_type in ['boolean'] + return self.inferred_type in ["boolean"] def is_integer(self): - return self.inferred_type in ['integer'] + return self.inferred_type in ["integer"] def is_floating(self): - return self.inferred_type in ['floating', 'mixed-integer-float'] + return self.inferred_type in ["floating", "mixed-integer-float"] def is_numeric(self): - return self.inferred_type in ['integer', 'floating'] + return self.inferred_type in ["integer", "floating"] def is_object(self): return is_object_dtype(self.dtype) @@ -1752,19 +1860,19 @@ def is_categorical(self): >>> s.index.is_categorical() False """ - return self.inferred_type in ['categorical'] + return self.inferred_type in ["categorical"] def is_interval(self): - return self.inferred_type in ['interval'] + return self.inferred_type in ["interval"] def is_mixed(self): - return self.inferred_type in ['mixed'] + return self.inferred_type in ["mixed"] def holds_integer(self): """ Whether the type is an integer type. """ - return self.inferred_type in ['integer', 'mixed-integer'] + return self.inferred_type in ["integer", "mixed-integer"] @cache_readonly def inferred_type(self): @@ -1793,7 +1901,7 @@ def __setstate__(self, state): """ if isinstance(state, dict): - self._data = state.pop('data') + self._data = state.pop("data") for k, v in state.items(): setattr(self, k, v) @@ -1909,6 +2017,7 @@ def isna(self): array([False, True, True, True], dtype=bool) """ return self._isnan + isnull = isna def notna(self): @@ -1956,9 +2065,12 @@ def notna(self): array([ True, True, True, False]) """ return ~self.isna() + notnull = notna - _index_shared_docs['fillna'] = """ + _index_shared_docs[ + "fillna" + ] = """ Fill NA/NaN values with the specified value Parameters @@ -1976,7 +2088,7 @@ def notna(self): filled : Index """ - @Appender(_index_shared_docs['fillna']) + @Appender(_index_shared_docs["fillna"]) def fillna(self, value=None, downcast=None): self._assert_can_do_op(value) if self.hasnans: @@ -1987,7 +2099,9 @@ def fillna(self, value=None, downcast=None): return Index(result, name=self.name) return self._shallow_copy() - _index_shared_docs['dropna'] = """ + _index_shared_docs[ + "dropna" + ] = """ Return Index without NA/NaN values Parameters @@ -2001,9 +2115,9 @@ def fillna(self, value=None, downcast=None): valid : Index """ - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): + if how not in ("any", "all"): raise ValueError("invalid how option: {0}".format(how)) if self.hasnans: @@ -2013,8 +2127,9 @@ def dropna(self, how='any'): # -------------------------------------------------------------------- # Uniqueness Methods - _index_shared_docs['index_unique'] = ( - """ + _index_shared_docs[ + "index_unique" + ] = """ Return unique values in the index. Uniques are returned in order of appearance, this does NOT sort. 
@@ -2033,16 +2148,16 @@ def dropna(self, how='any'): -------- unique Series.unique - """) + """ - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: self._validate_index_level(level) result = super().unique() return self._shallow_copy(result) - def drop_duplicates(self, keep='first'): + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed. @@ -2090,7 +2205,7 @@ def drop_duplicates(self, keep='first'): """ return super().drop_duplicates(keep=keep) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): """ Indicate duplicate index values. @@ -2198,10 +2313,13 @@ def get_duplicates(self): >>> pd.Index(dates).get_duplicates() # doctest: +SKIP DatetimeIndex([], dtype='datetime64[ns]', freq=None) """ - warnings.warn("'get_duplicates' is deprecated and will be removed in " - "a future release. You can use " - "idx[idx.duplicated()].unique() instead", - FutureWarning, stacklevel=2) + warnings.warn( + "'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, + stacklevel=2, + ) return self[self.duplicated()].unique() @@ -2266,9 +2384,12 @@ def __xor__(self, other): return self.symmetric_difference(other) def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) + raise ValueError( + "The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( + self.__class__.__name__ + ) + ) __bool__ = __nonzero__ @@ -2324,13 +2445,14 @@ def _is_compatible_with_other(self, other): ------- bool """ - return (type(self) is type(other) - and is_dtype_equal(self.dtype, other.dtype)) + return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) def _validate_sort_keyword(self, sort): if sort not in [None, False]: - raise ValueError("The 'sort' keyword only takes the values of " - "None or False; {0} was passed.".format(sort)) + raise ValueError( + "The 'sort' keyword only takes the values of " + "None or False; {0} was passed.".format(sort) + ) def union(self, other, sort=None): """ @@ -2443,8 +2565,7 @@ def _union(self, other, sort): indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) result = _concat._concat_compat((lvals, other_diff)) else: @@ -2454,9 +2575,12 @@ def _union(self, other, sort): try: result = sorting.safe_sort(result) except TypeError as e: - warnings.warn("{}, sort order is undefined for " - "incomparable objects".format(e), - RuntimeWarning, stacklevel=3) + warnings.warn( + "{}, sort order is undefined for " + "incomparable objects".format(e), + RuntimeWarning, + stacklevel=3, + ) # for subclasses return self._wrap_setop_result(other, result) @@ -2464,7 +2588,9 @@ def _union(self, other, sort): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) - _index_shared_docs['intersection'] = """ + _index_shared_docs[ + "intersection" + ] = """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. 
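# Sketch of the uniqueness helpers and `union` reformatted above; `unique`
# keeps order of first appearance, while `union(sort=None)` attempts to sort
# the result (assumes the 0.25-era pandas API).
import pandas as pd

idx = pd.Index([3, 1, 2, 3, 1])
idx.unique()                  # -> Int64Index([3, 1, 2], dtype='int64')
idx.duplicated(keep='first')  # -> array([False, False, False, True, True])
pd.Index([1, 2]).union(pd.Index([2, 3]))  # -> Int64Index([1, 2, 3], dtype='int64')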
@@ -2500,7 +2626,7 @@ def _wrap_setop_result(self, other, result): """ # TODO: standardize return type of non-union setops type(self vs other) - @Appender(_index_shared_docs['intersection']) + @Appender(_index_shared_docs["intersection"]) def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) @@ -2510,8 +2636,8 @@ def intersection(self, other, sort=False): return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') + this = self.astype("O") + other = other.astype("O") return this.intersection(other, sort=sort) # TODO(EA): setops-refactor, clean all this up @@ -2536,8 +2662,7 @@ def intersection(self, other, sort=False): indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates - indexer = algos.unique1d( - Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2609,8 +2734,7 @@ def difference(self, other, sort=None): indexer = this.get_indexer(other) indexer = indexer.take((indexer != -1).nonzero()[0]) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) if sort is None: try: @@ -2679,8 +2803,9 @@ def symmetric_difference(self, other, result_name=None, sort=None): # {this} minus {other} common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) + left_indexer = np.setdiff1d( + np.arange(this.size), common_indexer, assume_unique=True + ) left_diff = this.values.take(left_indexer) # {other} minus {this} @@ -2695,14 +2820,14 @@ def symmetric_difference(self, other, result_name=None, sort=None): pass attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None + attribs["name"] = result_name + if "freq" in attribs: + attribs["freq"] = None return self._shallow_copy_with_infer(the_diff, **attribs) def _assert_can_do_setop(self, other): if not is_list_like(other): - raise TypeError('Input must be Index or array-like') + raise TypeError("Input must be Index or array-like") return True def _convert_can_do_setop(self, other): @@ -2716,7 +2841,9 @@ def _convert_can_do_setop(self, other): # -------------------------------------------------------------------- # Indexing Methods - _index_shared_docs['get_loc'] = """ + _index_shared_docs[ + "get_loc" + ] = """ Get integer location, slice or boolean mask for requested label. 
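# Sketch of the remaining set operations whose code is reformatted above
# (intersection / difference / symmetric_difference), using the `sort` keyword
# checked by `_validate_sort_keyword` (assumes the 0.25-era pandas API).
import pandas as pd

a = pd.Index([1, 2, 3, 4])
b = pd.Index([3, 4, 5, 6])
a.intersection(b)             # -> Int64Index([3, 4], dtype='int64')
a.difference(b, sort=None)    # -> Int64Index([1, 2], dtype='int64')
a.symmetric_difference(b)     # -> Int64Index([1, 2, 5, 6], dtype='int64')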
Parameters @@ -2754,25 +2881,29 @@ def _convert_can_do_setop(self, other): array([False, True, False, True], dtype=bool) """ - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): if method is None: if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') + raise ValueError( + "tolerance argument only valid if using pad, " + "backfill or nearest lookups" + ) try: return self._engine.get_loc(key) except KeyError: return self._engine.get_loc(self._maybe_cast_indexer(key)) indexer = self.get_indexer([key], method=method, tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') + raise TypeError("get_loc requires scalar valued input") loc = indexer.item() if loc == -1: raise KeyError(key) return loc - _index_shared_docs['get_indexer'] = """ + _index_shared_docs[ + "get_indexer" + ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. @@ -2819,7 +2950,7 @@ def get_loc(self, key, method=None, tolerance=None): and ``x`` is marked by -1, as it is not in ``index``. """ - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ensure_index(target) @@ -2834,30 +2965,37 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + return pself.get_indexer( + ptarget, method=method, limit=limit, tolerance=tolerance + ) if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + return this.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + raise InvalidIndexError( + "Reindexing only valid with uniquely" " valued Index objects" + ) - if method == 'pad' or method == 'backfill': + if method == "pad" or method == "backfill": indexer = self._get_fill_indexer(target, method, limit, tolerance) - elif method == 'nearest': + elif method == "nearest": indexer = self._get_nearest_indexer(target, limit, tolerance) else: if tolerance is not None: - raise ValueError('tolerance argument only valid if doing pad, ' - 'backfill or nearest reindexing') + raise ValueError( + "tolerance argument only valid if doing pad, " + "backfill or nearest reindexing" + ) if limit is not None: - raise ValueError('limit argument only valid if doing pad, ' - 'backfill or nearest reindexing') + raise ValueError( + "limit argument only valid if doing pad, " + "backfill or nearest reindexing" + ) indexer = self._engine.get_indexer(target._ndarray_values) @@ -2867,22 +3005,23 @@ def _convert_tolerance(self, tolerance, target): # override this method on subclasses tolerance = np.asarray(tolerance) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise 
ValueError("list-like tolerance size must match " "target index size") return tolerance def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = (self._engine.get_pad_indexer if method == 'pad' else - self._engine.get_backfill_indexer) + method = ( + self._engine.get_pad_indexer + if method == "pad" + else self._engine.get_backfill_indexer + ) indexer = method(target._ndarray_values, limit) else: - indexer = self._get_fill_indexer_searchsorted(target, method, - limit) + indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._ndarray_values, - indexer, - tolerance) + indexer = self._filter_indexer_tolerance( + target._ndarray_values, indexer, tolerance + ) return indexer def _get_fill_indexer_searchsorted(self, target, method, limit=None): @@ -2891,17 +3030,18 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): indexes and non-monotonic targets. """ if limit is not None: - raise ValueError('limit argument for %r method only well-defined ' - 'if index and target are monotonic' % method) + raise ValueError( + "limit argument for %r method only well-defined " + "if index and target are monotonic" % method + ) - side = 'left' if method == 'pad' else 'right' + side = "left" if method == "pad" else "right" # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) - nonexact = (indexer == -1) - indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], - side) - if side == 'left': + nonexact = indexer == -1 + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) + if side == "left": # searchsorted returns "indices into a sorted array such that, # if the corresponding elements in v were inserted before the # indices, the order of a would be preserved". @@ -2921,19 +3061,21 @@ def _get_nearest_indexer(self, target, limit, tolerance): values that can be subtracted from each other (e.g., not strings or tuples). """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) + left_indexer = self.get_indexer(target, "pad", limit=limit) + right_indexer = self.get_indexer(target, "backfill", limit=limit) target = np.asarray(target) left_distances = abs(self.values[left_indexer] - target) right_distances = abs(self.values[right_indexer] - target) op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) + indexer = np.where( + op(left_distances, right_distances) | (right_indexer == -1), + left_indexer, + right_indexer, + ) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) return indexer def _filter_indexer_tolerance(self, target, indexer, tolerance): @@ -2944,7 +3086,9 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): # -------------------------------------------------------------------- # Indexer Conversion Methods - _index_shared_docs['_convert_scalar_indexer'] = """ + _index_shared_docs[ + "_convert_scalar_indexer" + ] = """ Convert a scalar indexer. 
Parameters @@ -2953,43 +3097,47 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + if kind == "iloc": + return self._validate_indexer("positional", key, kind) - if len(self) and not isinstance(self, ABCMultiIndex,): + if len(self) and not isinstance(self, ABCMultiIndex): # we can raise here if we are definitive that this # is positional indexing (eg. .ix on with a float) # or label indexing if we are using a type able # to be represented in the index - if kind in ['getitem', 'ix'] and is_float(key): + if kind in ["getitem", "ix"] and is_float(key): if not self.is_floating(): - return self._invalid_indexer('label', key) + return self._invalid_indexer("label", key) - elif kind in ['loc'] and is_float(key): + elif kind in ["loc"] and is_float(key): # we want to raise KeyError on string/mixed here # technically we *could* raise a TypeError # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) - - elif kind in ['loc'] and is_integer(key): + if self.inferred_type not in [ + "floating", + "mixed-integer-float", + "string", + "unicode", + "mixed", + ]: + return self._invalid_indexer("label", key) + + elif kind in ["loc"] and is_integer(key): if not self.holds_integer(): - return self._invalid_indexer('label', key) + return self._invalid_indexer("label", key) return key - _index_shared_docs['_convert_slice_indexer'] = """ + _index_shared_docs[ + "_convert_slice_indexer" + ] = """ Convert a slice indexer. By definition, these are labels unless 'iloc' is passed in. 
@@ -3001,19 +3149,21 @@ def _convert_scalar_indexer(self, key, kind=None): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - @Appender(_index_shared_docs['_convert_slice_indexer']) + @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # if we are not a slice, then we are done if not isinstance(key, slice): return key # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + if kind == "iloc": + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) # potentially cast the bounds to integers start, stop, step = key.start, key.stop, key.step @@ -3026,15 +3176,17 @@ def is_int(v): is_index_slice = is_int(start) and is_int(stop) is_positional = is_index_slice and not self.is_integer() - if kind == 'getitem': + if kind == "getitem": """ called from the getitem slicers, validate that we are in fact integers """ if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) # convert the slice to an indexer here @@ -3048,7 +3200,7 @@ def is_int(v): self.get_loc(stop) is_positional = False except KeyError: - if self.inferred_type == 'mixed-integer-float': + if self.inferred_type == "mixed-integer-float": raise if is_null_slicer: @@ -3091,7 +3243,9 @@ def _convert_listlike_indexer(self, keyarr, kind=None): indexer = self._convert_list_indexer(keyarr, kind=kind) return indexer, keyarr - _index_shared_docs['_convert_arr_indexer'] = """ + _index_shared_docs[ + "_convert_arr_indexer" + ] = """ Convert an array-like indexer to the appropriate dtype. Parameters @@ -3104,12 +3258,14 @@ def _convert_listlike_indexer(self, keyarr, kind=None): converted_keyarr : array-like """ - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): keyarr = com.asarray_tuplesafe(keyarr) return keyarr - _index_shared_docs['_convert_index_indexer'] = """ + _index_shared_docs[ + "_convert_index_indexer" + ] = """ Convert an Index indexer to the appropriate dtype. Parameters @@ -3122,11 +3278,13 @@ def _convert_arr_indexer(self, keyarr): converted_keyarr : Index (or sub-class) """ - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): return keyarr - _index_shared_docs['_convert_list_indexer'] = """ + _index_shared_docs[ + "_convert_list_indexer" + ] = """ Convert a list-like indexer to the appropriate dtype. 
Parameters @@ -3140,13 +3298,16 @@ def _convert_index_indexer(self, keyarr): positional indexer or None """ - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): - - if self.inferred_type == 'mixed-integer': + if ( + kind in [None, "iloc", "ix"] + and is_integer_dtype(keyarr) + and not self.is_floating() + and not isinstance(keyarr, ABCPeriodIndex) + ): + + if self.inferred_type == "mixed-integer": indexer = self.get_indexer(keyarr) if (indexer >= 0).all(): return indexer @@ -3157,9 +3318,10 @@ def _convert_list_indexer(self, keyarr, kind=None): # IndexError in maybe_convert_indices indexer[indexer < 0] = len(self) from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - elif not self.inferred_type == 'integer': + elif not self.inferred_type == "integer": keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) return keyarr @@ -3169,10 +3331,12 @@ def _invalid_indexer(self, form, key): """ Consistent invalid indexer message. """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + raise TypeError( + "cannot do {form} indexing on {klass} with these " + "indexers [{key}] of {kind}".format( + form=form, klass=type(self), key=key, kind=type(key) + ) + ) # -------------------------------------------------------------------- # Reindex Methods @@ -3194,8 +3358,7 @@ def _can_reindex(self, indexer): if not self.is_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary). @@ -3213,14 +3376,14 @@ def reindex(self, target, method=None, level=None, limit=None, """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') + preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. 
target = _ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq + attrs.pop("freq", None) # don't preserve freq values = self._data[:0] # appropriately-dtyped empty array target = self._simple_new(values, dtype=self.dtype, **attrs) else: @@ -3228,23 +3391,25 @@ def reindex(self, target, method=None, level=None, limit=None, if level is not None: if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) + raise TypeError("Fill method not supported if level passed") + _, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True + ) else: if self.equals(target): indexer = None else: # check is_overlapping for IntervalIndex compat - if (self.is_unique and - not getattr(self, 'is_overlapping', False)): - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) + if self.is_unique and not getattr(self, "is_overlapping", False): + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) else: if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") + raise ValueError( + "cannot reindex a non-unique index " + "with a method or limit" + ) indexer, missing = self.get_indexer_non_unique(target) if preserve_names and target.nlevels == 1 and target.name != self.name: @@ -3315,7 +3480,9 @@ def _reindex_non_unique(self, target): # -------------------------------------------------------------------- # Join Methods - _index_shared_docs['join'] = """ + _index_shared_docs[ + "join" + ] = """ Compute join_index and indexers to conform data structures to the new index. 
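# Sketch of `reindex` and `join` as documented above: `reindex` returns the new
# index plus an indexer into the original values, and `join` aligns two indexes
# by the requested `how` (assumes the 0.25-era pandas API).
import pandas as pd

left = pd.Index([1, 2, 3], name='x')
right = pd.Index([2, 3, 4], name='x')

new_idx, indexer = left.reindex([2, 3, 5])
# new_idx -> Int64Index([2, 3, 5], dtype='int64', name='x'); indexer -> [1, 2, -1]

left.join(right, how='inner')   # -> Int64Index([2, 3], dtype='int64', name='x')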
@@ -3336,9 +3503,8 @@ def _reindex_non_unique(self, target): join_index, (left_indexer, right_indexer) """ - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) @@ -3350,17 +3516,17 @@ def join(self, other, how='left', level=None, return_indexers=False, if self.names == other.names: pass else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + return self._join_multi(other, how=how, return_indexers=return_indexers) # join on the level if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + return self._join_level( + other, level, how=how, return_indexers=return_indexers + ) other = ensure_index(other) - if len(other) == 0 and how in ('left', 'outer'): + if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: rindexer = np.repeat(-1, len(join_index)) @@ -3368,7 +3534,7 @@ def join(self, other, how='left', level=None, return_indexers=False, else: return join_index - if len(self) == 0 and how in ('right', 'outer'): + if len(self) == 0 and how in ("right", "outer"): join_index = other._shallow_copy() if return_indexers: lindexer = np.repeat(-1, len(join_index)) @@ -3377,47 +3543,52 @@ def join(self, other, how='left', level=None, return_indexers=False, return join_index if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) + how = {"right": "left", "left": "right"}.get(how, how) + result = other.join( + self, how=how, level=level, return_indexers=return_indexers + ) if return_indexers: x, y, z = result result = x, z, y return result if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') + this = self.astype("O") + other = other.astype("O") return this.join(other, how=how, return_indexers=return_indexers) _validate_join_method(how) if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) elif not self.is_unique or not other.is_unique: if self.is_monotonic and other.is_monotonic: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) else: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) elif self.is_monotonic and other.is_monotonic: try: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) except TypeError: pass - if how == 'left': + if how == "left": join_index = self - elif how == 'right': + elif how == "right": join_index = other - elif how == 'inner': + elif how == "inner": # TODO: sort=False here for backwards compat. 
It may # be better to use the sort parameter passed into join join_index = self.intersection(other, sort=False) - elif how == 'outer': + elif how == "outer": # TODO: sort=True here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.union(other) @@ -3465,23 +3636,23 @@ def _join_multi(self, other, how, return_indexers=True): # Join left and right # Join on same leveled multi-index frames is supported - join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, - return_indexers=True) + join_idx, lidx, ridx = self_jnlevels.join( + other_jnlevels, how, return_indexers=True + ) # Restore the dropped levels # Returned index level order is # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, codes, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) + levels, codes, names = _restore_dropped_levels_multijoin( + self, other, dropped_names, join_idx, lidx, ridx + ) # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, codes=codes, - names=names, verify_integrity=False) + multi_join_idx = MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=False + ) multi_join_idx = multi_join_idx.remove_unused_levels() @@ -3496,24 +3667,24 @@ def _join_multi(self, other, how, return_indexers=True): self, other = other, self flip_order = True # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) + how = {"right": "left", "left": "right"}.get(how, how) level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) + result = self._join_level( + other, level, how=how, return_indexers=return_indexers + ) if flip_order: if isinstance(result, tuple): return result[0], result[2], result[1] return result - def _join_non_unique(self, other, how='left', return_indexers=False): + def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._ndarray_values], - [other._ndarray_values], - how=how, - sort=True) + left_idx, right_idx = _get_join_indexers( + [self._ndarray_values], [other._ndarray_values], how=how, sort=True + ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) @@ -3529,8 +3700,9 @@ def _join_non_unique(self, other, how='left', return_indexers=False): else: return join_index - def _join_level(self, other, level, how='left', return_indexers=False, - keep_order=True): + def _join_level( + self, other, level, how="left", return_indexers=False, keep_order=True + ): """ The join method *only* affects the level of the resulting MultiIndex. Otherwise it just exactly aligns the Index data to the @@ -3548,7 +3720,7 @@ def _get_leaf_sorter(labels): order of higher levels. 
""" if labels[0].size == 0: - return np.empty(0, dtype='int64') + return np.empty(0, dtype="int64") if len(labels) == 1: lab = ensure_int64(labels[0]) @@ -3566,41 +3738,44 @@ def _get_leaf_sorter(labels): return lib.get_level_sorter(lab, ensure_int64(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): - raise TypeError('Join on level between two MultiIndex objects ' - 'is ambiguous') + raise TypeError( + "Join on level between two MultiIndex objects " "is ambiguous" + ) left, right = self, other flip_order = not isinstance(self, MultiIndex) if flip_order: left, right = right, left - how = {'right': 'left', 'left': 'right'}.get(how, how) + how = {"right": "left", "left": "right"}.get(how, how) level = left._get_level_number(level) old_level = left.levels[level] if not right.is_unique: - raise NotImplementedError('Index._join_level on non-unique index ' - 'is not implemented') + raise NotImplementedError( + "Index._join_level on non-unique index " "is not implemented" + ) - new_level, left_lev_indexer, right_lev_indexer = \ - old_level.join(right, how=how, return_indexers=True) + new_level, left_lev_indexer, right_lev_indexer = old_level.join( + right, how=how, return_indexers=True + ) if left_lev_indexer is None: if keep_order or len(left) == 0: left_indexer = None join_index = left else: # sort the leaves - left_indexer = _get_leaf_sorter(left.codes[:level + 1]) + left_indexer = _get_leaf_sorter(left.codes[: level + 1]) join_index = left[left_indexer] else: left_lev_indexer = ensure_int64(left_lev_indexer) - rev_indexer = lib.get_reverse_indexer(left_lev_indexer, - len(old_level)) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) - new_lev_codes = algos.take_nd(rev_indexer, left.codes[level], - allow_fill=False) + new_lev_codes = algos.take_nd( + rev_indexer, left.codes[level], allow_fill=False + ) new_codes = list(left.codes) new_codes[level] = new_lev_codes @@ -3619,10 +3794,11 @@ def _get_leaf_sorter(labels): if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - new_lev_codes, ngroups) + new_lev_codes, ngroups + ) # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0]:] + left_indexer = left_indexer[counts[0] :] new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves @@ -3631,7 +3807,7 @@ def _get_leaf_sorter(labels): if not mask_all: new_codes = [lab[mask] for lab in new_codes] - left_indexer = _get_leaf_sorter(new_codes[:level + 1]) + left_indexer = _get_leaf_sorter(new_codes[: level + 1]) new_codes = [lab[left_indexer] for lab in new_codes] # left_indexers are w.r.t masked frame. 
@@ -3639,13 +3815,17 @@ def _get_leaf_sorter(labels): if not mask_all: left_indexer = mask.nonzero()[0][left_indexer] - join_index = MultiIndex(levels=new_levels, codes=new_codes, - names=left.names, verify_integrity=False) + join_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=left.names, + verify_integrity=False, + ) if right_lev_indexer is not None: - right_indexer = algos.take_nd(right_lev_indexer, - join_index.codes[level], - allow_fill=False) + right_indexer = algos.take_nd( + right_lev_indexer, join_index.codes[level], allow_fill=False + ) else: right_indexer = join_index.codes[level] @@ -3653,17 +3833,19 @@ def _get_leaf_sorter(labels): left_indexer, right_indexer = right_indexer, left_indexer if return_indexers: - left_indexer = (None if left_indexer is None - else ensure_platform_int(left_indexer)) - right_indexer = (None if right_indexer is None - else ensure_platform_int(right_indexer)) + left_indexer = ( + None if left_indexer is None else ensure_platform_int(left_indexer) + ) + right_indexer = ( + None if right_indexer is None else ensure_platform_int(right_indexer) + ) return join_index, left_indexer, right_indexer else: return join_index - def _join_monotonic(self, other, how='left', return_indexers=False): + def _join_monotonic(self, other, how="left", return_indexers=False): if self.equals(other): - ret_index = other if how == 'right' else self + ret_index = other if how == "right" else self if return_indexers: return ret_index, None, None else: @@ -3674,28 +3856,28 @@ def _join_monotonic(self, other, how='left', return_indexers=False): if self.is_unique and other.is_unique: # We can perform much better than the general case - if how == 'left': + if how == "left": join_index = self lidx = None ridx = self._left_indexer_unique(sv, ov) - elif how == 'right': + elif how == "right": join_index = other lidx = self._left_indexer_unique(ov, sv) ridx = None - elif how == 'inner': + elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) - elif how == 'outer': + elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) else: - if how == 'left': + if how == "left": join_index, lidx, ridx = self._left_indexer(sv, ov) - elif how == 'right': + elif how == "right": join_index, ridx, lidx = self._left_indexer(ov, sv) - elif how == 'inner': + elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) - elif how == 'outer': + elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) @@ -3813,7 +3995,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -3827,7 +4011,9 @@ def memory_usage(self, deep=False): result += self._engine.sizeof(deep=deep) return result - _index_shared_docs['where'] = """ + _index_shared_docs[ + "where" + ] = """ Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other. 
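# Sketch of `Index.where` from the shared docstring above: entries where `cond`
# is False are replaced by `other`, or by NaN when `other` is omitted
# (assumes the 0.25-era pandas API with the Float64Index numeric class).
import pandas as pd

idx = pd.Index([1.0, 2.0, 3.0, 4.0])
idx.where(idx > 2)              # -> Float64Index([nan, nan, 3.0, 4.0], dtype='float64')
idx.where(idx > 2, other=0.0)   # -> Float64Index([0.0, 0.0, 3.0, 4.0], dtype='float64')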
@@ -3844,7 +4030,7 @@ def memory_usage(self, deep=False): Index """ - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): if other is None: other = self._na_value @@ -3890,11 +4076,12 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): """ from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): # skip int64 conversion attempt if uint-like dtype is passed, as # this could return Int64Index when UInt64Index is what's desired try: - res = data.astype('i8', copy=False) + res = data.astype("i8", copy=False) if (res == data).all(): return Int64Index(res, copy=copy, name=name) except (OverflowError, TypeError, ValueError): @@ -3903,7 +4090,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): # Conversion to int64 failed (possibly due to overflow) or was skipped, # so let's try now with uint64. try: - res = data.astype('u8', copy=False) + res = data.astype("u8", copy=False) if (res == data).all(): return UInt64Index(res, copy=copy, name=name) except (OverflowError, TypeError, ValueError): @@ -3913,14 +4100,17 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): @classmethod def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) + raise TypeError( + "{0}(...) must be called with a collection of some " + "kind, {1} was passed".format(cls.__name__, repr(data)) + ) @classmethod def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') + raise TypeError( + "String dtype not supported, you may need " + "to explicitly cast to a numeric type" + ) @classmethod def _coerce_to_ndarray(cls, data): @@ -4000,7 +4190,9 @@ def is_type_compatible(self, kind): """ return kind == self.inferred_type - _index_shared_docs['contains'] = """ + _index_shared_docs[ + "contains" + ] = """ Return a boolean indicating whether the provided key is in the index. Parameters @@ -4030,7 +4222,7 @@ def is_type_compatible(self, kind): False """ - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): hash(key) try: @@ -4052,7 +4244,10 @@ def contains(self, key): warnings.warn( "The 'contains' method is deprecated and will be removed in a " "future version. Use 'key in index' instead of " - "'index.contains(key)'", FutureWarning, stacklevel=2) + "'index.contains(key)'", + FutureWarning, + stacklevel=2, + ) return key in self def __hash__(self): @@ -4131,7 +4326,7 @@ def append(self, other): for obj in to_concat: if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + raise TypeError("all inputs must be Index") names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name @@ -4197,8 +4392,9 @@ def equals(self, other): return other.equals(self) try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) + return array_equivalent( + com.values_from_object(self), com.values_from_object(other) + ) except Exception: return False @@ -4213,10 +4409,16 @@ def identical(self, other): If two Index objects have equal elements and same type True, otherwise False. 
""" - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) + return ( + self.equals(other) + and all( + ( + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables + ) + ) + and type(self) == type(other) + ) def asof(self, label): """ @@ -4275,7 +4477,7 @@ def asof(self, label): ValueError: index must be monotonic increasing or decreasing """ try: - loc = self.get_loc(label, method='pad') + loc = self.get_loc(label, method="pad") except KeyError: return self._na_value else: @@ -4312,7 +4514,7 @@ def asof_locs(self, where, mask): which correspond to the return values of the `asof` function for every element in `where`. """ - locs = self.values[mask].searchsorted(where.values, side='right') + locs = self.values[mask].searchsorted(where.values, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) @@ -4380,8 +4582,9 @@ def sort(self, *args, **kwargs): """ Use sort_values instead. """ - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + raise TypeError( + "cannot sort an Index object in-place, use " "sort_values instead" + ) def shift(self, periods=1, freq=None): """ @@ -4439,8 +4642,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError("Not supported for type %s" % - type(self).__name__) + raise NotImplementedError("Not supported for type %s" % type(self).__name__) def argsort(self, *args, **kwargs): """ @@ -4482,7 +4684,9 @@ def argsort(self, *args, **kwargs): result = np.array(self) return result.argsort(*args, **kwargs) - _index_shared_docs['get_value'] = """ + _index_shared_docs[ + "get_value" + ] = """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. @@ -4492,13 +4696,13 @@ def argsort(self, *args, **kwargs): A value in the Series with the index of the key value in self. """ - @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. - s = getattr(series, '_values', series) + s = getattr(series, "_values", series) if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): # GH 20882, 21257 # Unify Index and ExtensionArray treatment @@ -4510,8 +4714,7 @@ def get_value(self, series, key): iloc = self.get_loc(key) return s[iloc] except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise elif is_integer(key): return s[key] @@ -4519,10 +4722,9 @@ def get_value(self, series, key): s = com.values_from_object(series) k = com.values_from_object(key) - k = self._convert_scalar_indexer(k, kind='getitem') + k = self._convert_scalar_indexer(k, kind="getitem") try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) + return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) except KeyError as e1: if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise @@ -4553,10 +4755,13 @@ def set_value(self, arr, key, value): ----- Only use this if you know what you're doing. 
""" - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + self._engine.set_value( + com.values_from_object(arr), com.values_from_object(key), value + ) - _index_shared_docs['get_indexer_non_unique'] = """ + _index_shared_docs[ + "get_indexer_non_unique" + ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. @@ -4576,7 +4781,7 @@ def set_value(self, arr, key, value): These correspond to the -1 in the indexer array. """ - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) if is_categorical(target): @@ -4614,11 +4819,12 @@ def get_indexer_for(self, target, **kwargs): def _maybe_promote(self, other): # A hack, but it works from pandas import DatetimeIndex - if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + + if self.inferred_type == "date" and isinstance(other, DatetimeIndex): return DatetimeIndex(self), other - elif self.inferred_type == 'boolean': + elif self.inferred_type == "boolean": if not is_object_dtype(self.dtype): - return self.astype('object'), other.astype('object') + return self.astype("object"), other.astype("object") return self, other def groupby(self, values): @@ -4669,6 +4875,7 @@ def map(self, mapper, na_action=None): """ from .multi import MultiIndex + new_values = super()._map_values(mapper, na_action=na_action) attributes = self._get_attributes_dict() @@ -4677,17 +4884,16 @@ def map(self, mapper, na_action=None): if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): names = self.names - elif attributes.get('name'): - names = [attributes.get('name')] * len(new_values[0]) + elif attributes.get("name"): + names = [attributes.get("name")] * len(new_values[0]) else: names = None - return MultiIndex.from_tuples(new_values, - names=names) + return MultiIndex.from_tuples(new_values, names=names) - attributes['copy'] = False + attributes["copy"] = False if not new_values.size: # empty - attributes['dtype'] = self.dtype + attributes["dtype"] = self.dtype return Index(new_values, **attributes) @@ -4823,8 +5029,7 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): >>> idx.slice_indexer(start='b', end=('c', 'g')) slice(1, 3) """ - start_slice, end_slice = self.slice_locs(start, end, step=step, - kind=kind) + start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) # return a slice if not is_scalar(start_slice): @@ -4854,17 +5059,19 @@ def _validate_indexer(self, form, key, kind): If we are positional indexer, validate that we have appropriate typed bounds must be an integer. """ - assert kind in ['ix', 'loc', 'getitem', 'iloc'] + assert kind in ["ix", "loc", "getitem", "iloc"] if key is None: pass elif is_integer(key): pass - elif kind in ['iloc', 'getitem']: + elif kind in ["iloc", "getitem"]: self._invalid_indexer(form, key) return key - _index_shared_docs['_maybe_cast_slice_bound'] = """ + _index_shared_docs[ + "_maybe_cast_slice_bound" + ] = """ This function should be overloaded in subclasses that allow non-trivial casting on label-slice bounds, e.g. datetime-like indices allowing strings containing formatted datetimes. 
@@ -4885,38 +5092,38 @@ def _validate_indexer(self, form, key, kind): """ - @Appender(_index_shared_docs['_maybe_cast_slice_bound']) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them if is_float(label): - if not (kind in ['ix'] and (self.holds_integer() or - self.is_floating())): - self._invalid_indexer('slice', label) + if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): + self._invalid_indexer("slice", label) # we are trying to find integer bounds on a non-integer based index # this is rejected (generally .loc gets you here) elif is_integer(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) return label - def _searchsorted_monotonic(self, label, side='left'): + def _searchsorted_monotonic(self, label, side="left"): if self.is_monotonic_increasing: return self.searchsorted(label, side=side) elif self.is_monotonic_decreasing: # np.searchsorted expects ascending sort order, have to reverse # everything for it to work (element ordering, search side and # resulting value). - pos = self[::-1].searchsorted(label, side='right' if side == 'left' - else 'left') + pos = self[::-1].searchsorted( + label, side="right" if side == "left" else "left" + ) return len(self) - pos - raise ValueError('index must be monotonic increasing or decreasing') + raise ValueError("index must be monotonic increasing or decreasing") def get_slice_bound(self, label, side, kind): """ @@ -4936,12 +5143,13 @@ def get_slice_bound(self, label, side, kind): int Index of label. """ - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] - if side not in ('left', 'right'): - raise ValueError("Invalid value for side kwarg," - " must be either 'left' or 'right': %s" % - (side, )) + if side not in ("left", "right"): + raise ValueError( + "Invalid value for side kwarg," + " must be either 'left' or 'right': %s" % (side,) + ) original_label = label @@ -4963,20 +5171,22 @@ def get_slice_bound(self, label, side, kind): # get_loc may return a boolean array or an array of indices, which # is OK as long as they are representable by a slice. if is_bool_dtype(slc): - slc = lib.maybe_booleans_to_slice(slc.view('u1')) + slc = lib.maybe_booleans_to_slice(slc.view("u1")) else: - slc = lib.maybe_indices_to_slice(slc.astype('i8'), len(self)) + slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) if isinstance(slc, np.ndarray): - raise KeyError("Cannot get %s slice bound for non-unique " - "label: %r" % (side, original_label)) + raise KeyError( + "Cannot get %s slice bound for non-unique " + "label: %r" % (side, original_label) + ) if isinstance(slc, slice): - if side == 'left': + if side == "left": return slc.start else: return slc.stop else: - if side == 'right': + if side == "right": return slc + 1 else: return slc @@ -5013,7 +5223,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): >>> idx.slice_locs(start='b', end='c') (1, 3) """ - inc = (step is None or step >= 0) + inc = step is None or step >= 0 if not inc: # If it's a reverse slice, temporarily swap bounds. 
@@ -5021,8 +5231,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # GH 16785: If start and end happen to be date strings with UTC offsets # attempt to parse and check that the offsets are the same - if (isinstance(start, (str, datetime)) - and isinstance(end, (str, datetime))): + if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)): try: ts_start = Timestamp(start) ts_end = Timestamp(end) @@ -5030,18 +5239,17 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): pass else: if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): - raise ValueError("Both dates must have the " - "same UTC offset") + raise ValueError("Both dates must have the " "same UTC offset") start_slice = None if start is not None: - start_slice = self.get_slice_bound(start, 'left', kind) + start_slice = self.get_slice_bound(start, "left", kind) if start_slice is None: start_slice = 0 end_slice = None if end is not None: - end_slice = self.get_slice_bound(end, 'right', kind) + end_slice = self.get_slice_bound(end, "right", kind) if end_slice is None: end_slice = len(self) @@ -5102,7 +5310,7 @@ def insert(self, loc, item): idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) - def drop(self, labels, errors='raise'): + def drop(self, labels, errors="raise"): """ Make new Index with passed list of labels deleted. @@ -5121,14 +5329,13 @@ def drop(self, labels, errors='raise'): KeyError If not all of the labels are found in the selected axis """ - arr_dtype = 'object' if self.dtype == 'object' else None + arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - if errors != 'ignore': - raise KeyError( - '{} not found in axis'.format(labels[mask])) + if errors != "ignore": + raise KeyError("{} not found in axis".format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) @@ -5138,17 +5345,18 @@ def drop(self, labels, errors='raise'): def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results - if self._is_numeric_dtype and op.__name__ in ['add', 'sub', - 'radd', 'rsub']: - raise TypeError("Operation {opname} between {cls} and {other} " - "is invalid".format(opname=op.__name__, - cls=self.dtype, - other=type(other).__name__)) + if self._is_numeric_dtype and op.__name__ in ["add", "sub", "radd", "rsub"]: + raise TypeError( + "Operation {opname} between {cls} and {other} " + "is invalid".format( + opname=op.__name__, cls=self.dtype, other=type(other).__name__ + ) + ) other = Timedelta(other) values = self.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(values, other) attrs = self._get_attributes_dict() @@ -5177,32 +5385,32 @@ def _add_numeric_methods_add_sub_disabled(cls): """ Add in the numeric add/sub methods to disable. 
""" - cls.__add__ = make_invalid_op('__add__') - cls.__radd__ = make_invalid_op('__radd__') - cls.__iadd__ = make_invalid_op('__iadd__') - cls.__sub__ = make_invalid_op('__sub__') - cls.__rsub__ = make_invalid_op('__rsub__') - cls.__isub__ = make_invalid_op('__isub__') + cls.__add__ = make_invalid_op("__add__") + cls.__radd__ = make_invalid_op("__radd__") + cls.__iadd__ = make_invalid_op("__iadd__") + cls.__sub__ = make_invalid_op("__sub__") + cls.__rsub__ = make_invalid_op("__rsub__") + cls.__isub__ = make_invalid_op("__isub__") @classmethod def _add_numeric_methods_disabled(cls): """ Add in numeric methods to disable other than add/sub. """ - cls.__pow__ = make_invalid_op('__pow__') - cls.__rpow__ = make_invalid_op('__rpow__') - cls.__mul__ = make_invalid_op('__mul__') - cls.__rmul__ = make_invalid_op('__rmul__') - cls.__floordiv__ = make_invalid_op('__floordiv__') - cls.__rfloordiv__ = make_invalid_op('__rfloordiv__') - cls.__truediv__ = make_invalid_op('__truediv__') - cls.__rtruediv__ = make_invalid_op('__rtruediv__') - cls.__mod__ = make_invalid_op('__mod__') - cls.__divmod__ = make_invalid_op('__divmod__') - cls.__neg__ = make_invalid_op('__neg__') - cls.__pos__ = make_invalid_op('__pos__') - cls.__abs__ = make_invalid_op('__abs__') - cls.__inv__ = make_invalid_op('__inv__') + cls.__pow__ = make_invalid_op("__pow__") + cls.__rpow__ = make_invalid_op("__rpow__") + cls.__mul__ = make_invalid_op("__mul__") + cls.__rmul__ = make_invalid_op("__rmul__") + cls.__floordiv__ = make_invalid_op("__floordiv__") + cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") + cls.__truediv__ = make_invalid_op("__truediv__") + cls.__rtruediv__ = make_invalid_op("__rtruediv__") + cls.__mod__ = make_invalid_op("__mod__") + cls.__divmod__ = make_invalid_op("__divmod__") + cls.__neg__ = make_invalid_op("__neg__") + cls.__pos__ = make_invalid_op("__pos__") + cls.__abs__ = make_invalid_op("__abs__") + cls.__inv__ = make_invalid_op("__inv__") def _maybe_update_attributes(self, attrs): """ @@ -5215,9 +5423,10 @@ def _validate_for_numeric_unaryop(self, op, opstr): Validate if we can perform a numeric unary operation. """ if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op " - "{opstr} for type: {typ}" - .format(opstr=opstr, typ=type(self).__name__)) + raise TypeError( + "cannot evaluate a numeric op " + "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__) + ) def _validate_for_numeric_binop(self, other, op): """ @@ -5228,30 +5437,32 @@ def _validate_for_numeric_binop(self, other, op): ----- This is an internal method called by ops. """ - opstr = '__{opname}__'.format(opname=op.__name__) + opstr = "__{opname}__".format(opname=op.__name__) # if we are an inheritor of numeric, # but not actually numeric (e.g. 
DatetimeIndex/PeriodIndex) if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op {opstr} " - "for type: {typ}" - .format(opstr=opstr, typ=type(self).__name__)) + raise TypeError( + "cannot evaluate a numeric op {opstr} " + "for type: {typ}".format(opstr=opstr, typ=type(self).__name__) + ) if isinstance(other, Index): if not other._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op " - "{opstr} with type: {typ}" - .format(opstr=opstr, typ=type(other))) + raise TypeError( + "cannot evaluate a numeric op " + "{opstr} with type: {typ}".format(opstr=opstr, typ=type(other)) + ) elif isinstance(other, np.ndarray) and not other.ndim: other = other.item() if isinstance(other, (Index, ABCSeries, np.ndarray)): if len(self) != len(other): - raise ValueError("cannot evaluate a numeric op with " - "unequal lengths") + raise ValueError("cannot evaluate a numeric op with " "unequal lengths") other = com.values_from_object(other) - if other.dtype.kind not in ['f', 'i', 'u']: - raise TypeError("cannot evaluate a numeric op " - "with a non-numeric dtype") + if other.dtype.kind not in ["f", "i", "u"]: + raise TypeError( + "cannot evaluate a numeric op " "with a non-numeric dtype" + ) elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): # higher up to handle pass @@ -5292,8 +5503,8 @@ def _add_numeric_methods_unary(cls): """ Add in numeric unary methods. """ - def _make_evaluate_unary(op, opstr): + def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): self._validate_for_numeric_unaryop(op, opstr) @@ -5304,10 +5515,10 @@ def _evaluate_numeric_unary(self): _evaluate_numeric_unary.__name__ = opstr return _evaluate_numeric_unary - cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__') - cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__') - cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__') - cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__') + cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__") + cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__") + cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__") + cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__") @classmethod def _add_numeric_methods(cls): @@ -5334,7 +5545,8 @@ def _add_logical_methods(cls): %(outname)s : bool or array_like (if axis is specified) A single element array_like may be converted to bool.""" - _index_shared_docs['index_all'] = dedent(""" + _index_shared_docs["index_all"] = dedent( + """ See Also -------- @@ -5372,9 +5584,11 @@ def _add_logical_methods(cls): >>> pd.Index([0, 0, 0]).any() False - """) + """ + ) - _index_shared_docs['index_any'] = dedent(""" + _index_shared_docs["index_any"] = dedent( + """ See Also -------- @@ -5395,16 +5609,19 @@ def _add_logical_methods(cls): >>> index = pd.Index([0, 0, 0]) >>> index.any() False - """) + """ + ) def _make_logical_function(name, desc, f): @Substitution(outname=name, desc=desc) - @Appender(_index_shared_docs['index_' + name]) + @Appender(_index_shared_docs["index_" + name]) @Appender(_doc) def logical_func(self, *args, **kwargs): result = f(self.values) - if (isinstance(result, (np.ndarray, ABCSeries, Index)) and - result.ndim == 0): + if ( + isinstance(result, (np.ndarray, ABCSeries, Index)) + and result.ndim == 0 + ): # return NumPy type return result.dtype.type(result.item()) else: # pragma: no cover @@ -5413,20 +5630,20 @@ def logical_func(self, *args, **kwargs): logical_func.__name__ = name return logical_func - cls.all = _make_logical_function('all', 'Return whether all 
elements ' - 'are True.', - np.all) - cls.any = _make_logical_function('any', - 'Return whether any element is True.', - np.any) + cls.all = _make_logical_function( + "all", "Return whether all elements " "are True.", np.all + ) + cls.any = _make_logical_function( + "any", "Return whether any element is True.", np.any + ) @classmethod def _add_logical_methods_disabled(cls): """ Add in logical methods to disable. """ - cls.all = make_invalid_op('all') - cls.any = make_invalid_op('any') + cls.all = make_invalid_op("all") + cls.any = make_invalid_op("any") Index._add_numeric_methods_disabled() @@ -5511,7 +5728,7 @@ def ensure_index(index_like, copy=False): if copy: index_like = index_like.copy() return index_like - if hasattr(index_like, 'name'): + if hasattr(index_like, "name"): return Index(index_like, name=index_like.name, copy=copy) if is_iterator(index_like): @@ -5527,6 +5744,7 @@ def ensure_index(index_like, copy=False): if len(converted) > 0 and all_arrays: from .multi import MultiIndex + return MultiIndex.from_arrays(converted) else: index_like = converted @@ -5535,6 +5753,7 @@ def ensure_index(index_like, copy=False): # so only need to do this if not list instance if copy: from copy import copy + index_like = copy(index_like) return Index(index_like) @@ -5557,16 +5776,17 @@ def _trim_front(strings): Trims zeros and decimal points. """ trimmed = strings - while len(strings) > 0 and all(x[0] == ' ' for x in trimmed): + while len(strings) > 0 and all(x[0] == " " for x in trimmed): trimmed = [x[1:] for x in trimmed] return trimmed def _validate_join_method(method): - if method not in ['left', 'right', 'inner', 'outer']: - raise ValueError('do not recognize join method %s' % method) + if method not in ["left", "right", "inner", "outer"]: + raise ValueError("do not recognize join method %s" % method) def default_index(n): from pandas.core.index import RangeIndex + return RangeIndex(0, n, name=None) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 321297335cf236..9550d68f1d32bf 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,8 +12,12 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like, - is_scalar) + ensure_platform_int, + is_categorical_dtype, + is_interval_dtype, + is_list_like, + is_scalar, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCCategorical, ABCSeries from pandas.core.dtypes.missing import isna @@ -29,20 +33,26 @@ from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) +_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @accessor.delegate_names( delegate=Categorical, - accessors=["rename_categories", - "reorder_categories", - "add_categories", - "remove_categories", - "remove_unused_categories", - "set_categories", - "as_ordered", "as_unordered", - "min", "max"], - typ='method', overwrite=True) + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + "min", + "max", + ], + typ="method", + overwrite=True, +) class CategoricalIndex(Index, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. 
@@ -134,37 +144,48 @@ class CategoricalIndex(Index, accessor.PandasDelegate): 'c' """ - _typ = 'categoricalindex' + _typ = "categoricalindex" @property def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need # to return the corresponding engine type (libindex.Int8Engine, etc.). - return {np.int8: libindex.Int8Engine, - np.int16: libindex.Int16Engine, - np.int32: libindex.Int32Engine, - np.int64: libindex.Int64Engine, - }[self.codes.dtype.type] + return { + np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] - _attributes = ['name'] + _attributes = ["name"] # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, categories=None, ordered=None, dtype=None, - copy=False, name=None, fastpath=None): + def __new__( + cls, + data=None, + categories=None, + ordered=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + ): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name=name, dtype=dtype) - dtype = CategoricalDtype._from_values_or_dtype(data, categories, - ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if not is_categorical_dtype(data): @@ -221,8 +242,7 @@ def _create_categorical(cls, data, dtype=None): ------- Categorical """ - if (isinstance(data, (cls, ABCSeries)) and - is_categorical_dtype(data)): + if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data): data = data.values if not isinstance(data, ABCCategorical): @@ -248,7 +268,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # -------------------------------------------------------------------- - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, dtype=None, **kwargs): if dtype is None: dtype = self.dtype @@ -269,17 +289,18 @@ def _is_dtype_compat(self, other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): - raise TypeError("categories must match existing categories " - "when appending") + raise TypeError( + "categories must match existing categories " "when appending" + ) else: values = other if not is_list_like(values): values = [values] - other = CategoricalIndex(self._create_categorical( - other, dtype=self.dtype)) + other = CategoricalIndex(self._create_categorical(other, dtype=self.dtype)) if not other.isin(values).all(): - raise TypeError("cannot append a non-category item to a " - "CategoricalIndex") + raise TypeError( + "cannot append a non-category item to a " "CategoricalIndex" + ) return other @@ -320,26 +341,31 @@ def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) """ - max_categories = (10 if get_option("display.max_categories") == 0 else - get_option("display.max_categories")) + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) attrs = [ - ('categories', - ibase.default_pprint(self.categories, - 
max_seq_items=max_categories)), - ('ordered', self.ordered)] + ( + "categories", + ibase.default_pprint(self.categories, max_seq_items=max_categories), + ), + ("ordered", self.ordered), + ] if self.name is not None: - attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype.name)) - max_seq_items = get_option('display.max_seq_items') or len(self) + attrs.append(("name", ibase.default_pprint(self.name))) + attrs.append(("dtype", "'%s'" % self.dtype.name)) + max_seq_items = get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: - attrs.append(('length', len(self))) + attrs.append(("length", len(self))) return attrs # -------------------------------------------------------------------- @property def inferred_type(self): - return 'categorical' + return "categorical" @property def values(self): @@ -378,7 +404,7 @@ def ordered(self): def _reverse_indexer(self): return self._data._reverse_indexer() - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): # if key is a NaN, check if any NaN is in self. if isna(key): @@ -390,10 +416,11 @@ def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex + return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 @@ -408,7 +435,7 @@ def _isnan(self): """ return if each value is nan""" return self._data.codes == -1 - @Appender(ibase._index_shared_docs['fillna']) + @Appender(ibase._index_shared_docs["fillna"]) def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) @@ -435,7 +462,7 @@ def is_monotonic_increasing(self): def is_monotonic_decreasing(self): return self._engine.is_monotonic_decreasing - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: self._validate_index_level(level) @@ -445,14 +472,15 @@ def unique(self, level=None): return self._shallow_copy(result, dtype=result.dtype) @Appender(Index.duplicated.__doc__) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas._libs.hashtable import duplicated_int64 - codes = self.codes.astype('i8') + + codes = self.codes.astype("i8") return duplicated_int64(codes, keep) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ - return self.astype('object') + return self.astype("object") def get_loc(self, key, method=None): """ @@ -493,9 +521,7 @@ def get_loc(self, key, method=None): except KeyError: raise KeyError(key) - def get_value(self, - series: AnyArrayLike, - key: Any): + def get_value(self, series: AnyArrayLike, key: Any): """ Fast lookup of value from 1-dimensional ndarray. 
Only use this if you know what you're doing @@ -515,7 +541,7 @@ def get_value(self, """ try: k = com.values_from_object(key) - k = self._convert_scalar_indexer(k, kind='getitem') + k = self._convert_scalar_indexer(k, kind="getitem") indexer = self.get_loc(k) return series.take([indexer])[0] except (KeyError, TypeError): @@ -528,7 +554,7 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with # 1. copy the underlying Categorical @@ -540,8 +566,7 @@ def where(self, cond, other=None): cat = Categorical(values, dtype=self.dtype) return self._shallow_copy(cat, **self._get_attributes_dict()) - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -555,14 +580,17 @@ def reindex(self, target, method=None, level=None, limit=None, """ if method is not None: - raise NotImplementedError("argument method is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument method is not implemented for " "CategoricalIndex.reindex" + ) if level is not None: - raise NotImplementedError("argument level is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument level is not implemented for " "CategoricalIndex.reindex" + ) if limit is not None: - raise NotImplementedError("argument limit is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument limit is not implemented for " "CategoricalIndex.reindex" + ) target = ibase.ensure_index(target) @@ -587,8 +615,7 @@ def reindex(self, target, method=None, level=None, limit=None, if (cats == -1).any(): # coerce to a regular index here! 
result = Index(np.array(self), name=self.name) - new_target, indexer, _ = result._reindex_non_unique( - np.array(target)) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) else: codes = new_target.codes.copy() @@ -628,7 +655,7 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories @@ -636,24 +663,26 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ibase.ensure_index(target) if self.is_unique and self.equals(target): - return np.arange(len(self), dtype='intp') - - if method == 'pad' or method == 'backfill': - raise NotImplementedError("method='pad' and method='backfill' not " - "implemented yet for CategoricalIndex") - elif method == 'nearest': - raise NotImplementedError("method='nearest' not implemented yet " - 'for CategoricalIndex') - - if (isinstance(target, CategoricalIndex) and - self.values.is_dtype_equal(target)): + return np.arange(len(self), dtype="intp") + + if method == "pad" or method == "backfill": + raise NotImplementedError( + "method='pad' and method='backfill' not " + "implemented yet for CategoricalIndex" + ) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " "for CategoricalIndex" + ) + + if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target): if self.values.equals(target.values): # we have the same codes codes = target.codes else: - codes = _recode_for_categories(target.codes, - target.categories, - self.values.categories) + codes = _recode_for_categories( + target.codes, target.categories, self.values.categories + ) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) @@ -664,7 +693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) @@ -680,14 +709,14 @@ def get_indexer_non_unique(self, target): indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): if self.categories._defer_to_indexing: return self.categories._convert_scalar_indexer(key, kind=kind) return super()._convert_scalar_indexer(key, kind=kind) - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): # Return our indexer or raise if all of the values are not included in # the categories @@ -701,11 +730,12 @@ def _convert_list_indexer(self, keyarr, kind=None): raise KeyError( "a list-indexer must only " "include values that are " - "in the categories") + "in the categories" + ) return self.get_indexer(keyarr) - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): keyarr = 
com.asarray_tuplesafe(keyarr) @@ -714,19 +744,21 @@ def _convert_arr_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.codes, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1) + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) return self._create_from_codes(taken) def is_dtype_equal(self, other): @@ -834,8 +866,10 @@ def insert(self, loc, item): """ code = self.categories.get_indexer([item]) if (code == -1) and not (is_scalar(item) and isna(item)): - raise TypeError("cannot insert an item into a CategoricalIndex " - "that is not already an existing category") + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) codes = self.codes codes = np.concatenate((codes[:loc], code, codes[loc:])) @@ -850,8 +884,7 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class ValueError if other is not in the categories """ - codes = np.concatenate([self._is_dtype_compat(c).codes - for c in to_concat]) + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) result = self._create_from_codes(codes, name=name) # if name is None, _create_from_codes sets self.name result.name = name @@ -866,7 +899,7 @@ def _add_comparison_methods(cls): """ add in comparison methods """ def _make_compare(op): - opname = '__{op}__'.format(op=op.__name__) + opname = "__{op}__".format(op=op.__name__) def _evaluate_compare(self, other): @@ -875,19 +908,19 @@ def _evaluate_compare(self, other): if isinstance(other, CategoricalIndex): other = other._values elif isinstance(other, Index): - other = self._create_categorical( - other._values, dtype=self.dtype) + other = self._create_categorical(other._values, dtype=self.dtype) - if isinstance(other, (ABCCategorical, np.ndarray, - ABCSeries)): + if isinstance(other, (ABCCategorical, np.ndarray, ABCSeries)): if len(self.values) != len(other): raise ValueError("Lengths must match to compare") if isinstance(other, ABCCategorical): if not self.values.is_dtype_equal(other): - raise TypeError("categorical index comparisons must " - "have the same categories and ordered " - "attributes") + raise TypeError( + "categorical index comparisons must " + "have the same categories and ordered " + "attributes" + ) result = op(self.values, other) if isinstance(result, ABCSeries): @@ -908,7 +941,7 @@ def _evaluate_compare(self, other): def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ method = getattr(self._values, name) - if 'inplace' in kwargs: + if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) if is_scalar(res): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e141f7b5c5b230..f2e6f631ae9ee2 100644 --- a/pandas/core/indexes/datetimelike.py +++ 
b/pandas/core/indexes/datetimelike.py @@ -14,15 +14,23 @@ from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_int64, is_dtype_equal, is_float, is_integer, is_list_like, - is_period_dtype, is_scalar) + ensure_int64, + is_dtype_equal, + is_float, + is_integer, + is_list_like, + is_period_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core import algorithms, ops from pandas.core.accessor import PandasDelegate from pandas.core.arrays import ExtensionOpsMixin from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8) + DatetimeLikeArrayMixin, + _ensure_datetimelike_to_i8, +) import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta @@ -58,24 +66,24 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): """ common ops mixin to support a unified interface datetimelike Index """ + _data = None # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index # subclasses bc they are immutable inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget) # type: ignore + DatetimeLikeArrayMixin.inferred_freq.fget + ) # type: ignore _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore - hasnans = cache_readonly( - DatetimeLikeArrayMixin._hasnans.fget) # type: ignore + hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget) # type: ignore - resolution = cache_readonly( - DatetimeLikeArrayMixin.resolution.fget) # type: ignore + DatetimeLikeArrayMixin._resolution.fget + ) # type: ignore + resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - _maybe_mask_results = ea_passthrough( - DatetimeLikeArrayMixin._maybe_mask_results) + _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) mean = ea_passthrough(DatetimeLikeArrayMixin.mean) @@ -114,6 +122,7 @@ def _create_comparison_method(cls, op): """ Create a comparison method that dispatches to ``cls.values``. 
""" + def wrapper(self, other): if isinstance(other, ABCSeries): # the arrays defer to Series for comparison ops but the indexes @@ -124,7 +133,7 @@ def wrapper(self, other): return result wrapper.__doc__ = op.__doc__ - wrapper.__name__ = '__{}__'.format(op.__name__) + wrapper.__name__ = "__{}__".format(op.__name__) return wrapper @property @@ -182,12 +191,14 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, - DatetimeLikeArrayMixin)): - left = left.view('i8') - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, - DatetimeLikeArrayMixin)): - right = right.view('i8') + if isinstance( + left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) + ): + left = left.view("i8") + if isinstance( + right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) + ): + right = right.view("i8") results = joinf(left, right) if with_indexers: join_index, left_indexer, right_indexer = results @@ -197,27 +208,30 @@ def wrapper(left, right): return wrapper - def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', - from_utc=False): + def _ensure_localized( + self, arg, ambiguous="raise", nonexistent="raise", from_utc=False + ): # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, 'tz', None): + if getattr(self, "tz", None): # ensure_localized is only relevant for tz-aware DTI - result = self._data._ensure_localized(arg, - ambiguous=ambiguous, - nonexistent=nonexistent, - from_utc=from_utc) + result = self._data._ensure_localized( + arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc + ) return type(self)._simple_new(result, name=self.name) return arg def _box_values(self, values): return self._data._box_values(values) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): try: res = self.get_loc(key) - return (is_scalar(res) or isinstance(res, slice) or - (is_list_like(res) and len(res))) + return ( + is_scalar(res) + or isinstance(res, slice) + or (is_list_like(res) and len(res)) + ) except (KeyError, TypeError, ValueError): return False @@ -232,7 +246,7 @@ def map(self, mapper, na_action=None): result = Index(result) if not isinstance(result, Index): - raise TypeError('The map function must return an Index object') + raise TypeError("The map function must return an Index object") return result except Exception: return self.astype(object).map(mapper) @@ -250,23 +264,22 @@ def sort_values(self, return_indexer=False, ascending=True): else: sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() - freq = attribs['freq'] + freq = attribs["freq"] if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: freq = freq * -1 - attribs['freq'] = freq + attribs["freq"] = freq if not ascending: sorted_values = sorted_values[::-1] return self._simple_new(sorted_values, **attribs) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_int64(indices) @@ -274,10 +287,13 @@ def take(self, indices, axis=0, allow_fill=True, if isinstance(maybe_slice, slice): return self[maybe_slice] - 
taken = self._assert_take_fillable(self.asi8, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=iNaT) + taken = self._assert_take_fillable( + self.asi8, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=iNaT, + ) # keep freq in PeriodArray/Index, reset otherwise freq = self.freq if is_period_dtype(self) else None @@ -298,16 +314,18 @@ def asobject(self): *this is an internal non-public method* """ - warnings.warn("'asobject' is deprecated. Use 'astype(object)'" - " instead", FutureWarning, stacklevel=2) + warnings.warn( + "'asobject' is deprecated. Use 'astype(object)'" " instead", + FutureWarning, + stacklevel=2, + ) return self.astype(object) def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") return tolerance def tolist(self): @@ -370,7 +388,7 @@ def argmin(self, axis=None, skipna=True, *args, **kwargs): if mask.all() or not skipna: return -1 i8 = i8.copy() - i8[mask] = np.iinfo('int64').max + i8[mask] = np.iinfo("int64").max return i8.argmin() def max(self, axis=None, skipna=True, *args, **kwargs): @@ -433,7 +451,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, na_rep='NaT', **kwargs): + def _format_with_header(self, header, na_rep="NaT", **kwargs): return header + list(self._format_native_types(na_rep, **kwargs)) @property @@ -446,11 +464,11 @@ def _format_attrs(self): """ attrs = super()._format_attrs() for attrib in self._attributes: - if attrib == 'freq': + if attrib == "freq": freq = self.freqstr if freq is not None: freq = "'%s'" % freq - attrs.append(('freq', freq)) + attrs.append(("freq", freq)) return attrs # -------------------------------------------------------------------- @@ -466,17 +484,17 @@ def _convert_scalar_indexer(self, key, kind=None): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem if is_scalar(key): is_int = is_integer(key) is_flt = is_float(key) - if kind in ['loc'] and (is_int or is_flt): - self._invalid_indexer('index', key) - elif kind in ['ix', 'getitem'] and is_flt: - self._invalid_indexer('index', key) + if kind in ["loc"] and (is_int or is_flt): + self._invalid_indexer("index", key) + elif kind in ["ix", "getitem"] and is_flt: + self._invalid_indexer("index", key) return super()._convert_scalar_indexer(key, kind=kind) @@ -497,6 +515,7 @@ def __add__(self, other): def __radd__(self, other): # alias for __add__ return self.__add__(other) + cls.__radd__ = __radd__ def __sub__(self, other): @@ -555,21 +574,24 @@ def intersection(self, other, sort=False): result.freq = to_offset(result.inferred_freq) return result - elif (other.freq is None or self.freq is None or - other.freq != self.freq or - not other.freq.isAnchored() or - (not self.is_monotonic or not other.is_monotonic)): + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.isAnchored() + or (not self.is_monotonic or not other.is_monotonic) + ): result = Index.intersection(self, other, sort=sort) 
# Invalidate the freq of `result`, which may not be correct at # this point, depending on the values. result.freq = None - if hasattr(self, 'tz'): - result = self._shallow_copy(result._values, name=result.name, - tz=result.tz, freq=None) + if hasattr(self, "tz"): + result = self._shallow_copy( + result._values, name=result.name, tz=result.tz, freq=None + ) else: - result = self._shallow_copy(result._values, name=result.name, - freq=None) + result = self._shallow_copy(result._values, name=result.name, freq=None) if result.freq is None: result.freq = to_offset(result.inferred_freq) return result @@ -592,17 +614,17 @@ def intersection(self, other, sort=False): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) freq = self.freq if is_period_dtype(self) else None return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) - @Appender(_index_shared_docs['where'] % _index_doc_kwargs) + @Appender(_index_shared_docs["where"] % _index_doc_kwargs) def where(self, cond, other=None): other = _ensure_datetimelike_to_i8(other, to_utc=True) values = _ensure_datetimelike_to_i8(self, to_utc=True) - result = np.where(cond, values, other).astype('i8') + result = np.where(cond, values, other).astype("i8") result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result) @@ -622,17 +644,19 @@ def _summary(self, name=None): """ formatter = self._formatter_func if len(self) > 0: - index_summary = ', %s to %s' % (formatter(self[0]), - formatter(self[-1])) + index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1])) else: - index_summary = '' + index_summary = "" if name is None: name = type(self).__name__ - result = '%s: %s entries%s' % (printing.pprint_thing(name), - len(self), index_summary) + result = "%s: %s entries%s" % ( + printing.pprint_thing(name), + len(self), + index_summary, + ) if self.freq: - result += '\nFreq: %s' % self.freqstr + result += "\nFreq: %s" % self.freqstr # display as values, not quoted result = result.replace("'", "") @@ -643,10 +667,10 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class. 
""" attribs = self._get_attributes_dict() - attribs['name'] = name + attribs["name"] = name # do not pass tz to set because tzlocal cannot be hashed if len({str(x.dtype) for x in to_concat}) != 1: - raise ValueError('to_concat must have the same tz') + raise ValueError("to_concat must have the same tz") new_data = type(self._values)._concat_same_type(to_concat).asi8 @@ -655,11 +679,11 @@ def _concat_same_dtype(self, to_concat, name): is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 if not is_period_dtype(self) and not is_diff_evenly_spaced: # reset freq - attribs['freq'] = None + attribs["freq"] = None return self._simple_new(new_data, **attribs) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self @@ -669,10 +693,9 @@ def astype(self, dtype, copy=True): # pass copy=False because any copying will be done in the # _data.astype call above - return Index(new_values, - dtype=new_values.dtype, name=self.name, copy=False) + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - @deprecate_kwarg(old_arg_name='n', new_arg_name='periods') + @deprecate_kwarg(old_arg_name="n", new_arg_name="periods") def shift(self, periods, freq=None): """ Shift index by desired number of time frequency increments. @@ -714,8 +737,10 @@ def wrap_arithmetic_op(self, other, result): if isinstance(result, tuple): # divmod, rdivmod assert len(result) == 2 - return (wrap_arithmetic_op(self, other, result[0]), - wrap_arithmetic_op(self, other, result[1])) + return ( + wrap_arithmetic_op(self, other, result[0]), + wrap_arithmetic_op(self, other, result[1]), + ) if not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype @@ -763,6 +788,7 @@ class DatetimelikeDelegateMixin(PandasDelegate): The set of properties whose results should should *not* be boxed in an index, after being returned from the array """ + # raw_methods : dispatch methods that shouldn't be boxed in an Index _raw_methods = set() # type: Set[str] # raw_properties : dispatch properties that shouldn't be boxed in an Index diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e2658b66f83ba1..5024eebe03bb47 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -10,20 +10,33 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - _NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, - is_string_like) + _NS_DTYPE, + ensure_int64, + is_float, + is_integer, + is_list_like, + is_scalar, + is_string_like, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( - DatetimeArray, _to_M8, tz_to_dtype, validate_tz_from_dtype) + DatetimeArray, + _to_M8, + tz_to_dtype, + validate_tz_from_dtype, +) from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ea_passthrough) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, + ea_passthrough, +) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name import 
pandas.core.tools.datetimes as tools @@ -56,42 +69,31 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. - _extra_methods = [ - 'to_period', - 'to_perioddelta', - 'to_julian_date', - ] - _extra_raw_methods = [ - 'to_pydatetime', - '_local_timestamps', - '_has_same_tz', - ] - _extra_raw_properties = [ - '_box_func', - 'tz', 'tzinfo', - ] - _delegated_properties = ( - DatetimeArray._datetimelike_ops + _extra_raw_properties - ) + _extra_methods = ["to_period", "to_perioddelta", "to_julian_date"] + _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] + _extra_raw_properties = ["_box_func", "tz", "tzinfo"] + _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties _delegated_methods = ( - DatetimeArray._datetimelike_methods + _extra_methods + - _extra_raw_methods + DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods + ) + _raw_properties = ( + {"date", "time", "timetz"} + | set(DatetimeArray._bool_ops) + | set(_extra_raw_properties) ) - _raw_properties = { - 'date', - 'time', - 'timetz', - } | set(DatetimeArray._bool_ops) | set(_extra_raw_properties) _raw_methods = set(_extra_raw_methods) _delegate_class = DatetimeArray -@delegate_names(DatetimeArray, - DatetimeDelegateMixin._delegated_properties, - typ="property") -@delegate_names(DatetimeArray, - DatetimeDelegateMixin._delegated_methods, - typ="method", overwrite=False) +@delegate_names( + DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + DatetimeArray, + DatetimeDelegateMixin._delegated_methods, + typ="method", + overwrite=False, +) class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and @@ -221,25 +223,26 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Creating a DatetimeIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`date_range`. 
""" - _typ = 'datetimeindex' + + _typ = "datetimeindex" _join_precedence = 10 def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', - **kwargs) + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False) + libjoin.left_join_indexer_unique_int64, with_indexers=False + ) _engine_type = libindex.DatetimeEngine _tz = None _freq = None - _comparables = ['name', 'freqstr', 'tz'] - _attributes = ['name', 'tz', 'freq'] + _comparables = ["name", "freqstr", "tz"] + _attributes = ["name", "tz", "freq"] _is_numeric_dtype = False _infer_as_myclass = True @@ -256,48 +259,81 @@ def _join_i8_wrapper(joinf, **kwargs): # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, - freq=None, start=None, end=None, periods=None, tz=None, - normalize=False, closed=None, ambiguous='raise', - dayfirst=False, yearfirst=False, dtype=None, - copy=False, name=None, verify_integrity=None): + def __new__( + cls, + data=None, + freq=None, + start=None, + end=None, + periods=None, + tz=None, + normalize=False, + closed=None, + ambiguous="raise", + dayfirst=False, + yearfirst=False, + dtype=None, + copy=False, + name=None, + verify_integrity=None, + ): if verify_integrity is not None: - warnings.warn("The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) else: verify_integrity = True if data is None: dtarr = DatetimeArray._generate_range( - start, end, periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) - warnings.warn("Creating a DatetimeIndex by passing range " - "endpoints is deprecated. Use " - "`pandas.date_range` instead.", - FutureWarning, stacklevel=2) - return cls._simple_new( - dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name) + start, + end, + periods, + freq=freq, + tz=tz, + normalize=normalize, + closed=closed, + ambiguous=ambiguous, + ) + warnings.warn( + "Creating a DatetimeIndex by passing range " + "endpoints is deprecated. 
Use " + "`pandas.date_range` instead.", + FutureWarning, + stacklevel=2, + ) + return cls._simple_new(dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name) if is_scalar(data): - raise TypeError("{cls}() must be called with a " - "collection of some kind, {data} was passed" - .format(cls=cls.__name__, data=repr(data))) + raise TypeError( + "{cls}() must be called with a " + "collection of some kind, {data} was passed".format( + cls=cls.__name__, data=repr(data) + ) + ) # - Cases checked above all return/raise before reaching here - # - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name dtarr = DatetimeArray._from_sequence( - data, dtype=dtype, copy=copy, tz=tz, freq=freq, - dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous, - int_as_wall_time=True) - - subarr = cls._simple_new(dtarr, name=name, - freq=dtarr.freq, tz=dtarr.tz) + data, + dtype=dtype, + copy=copy, + tz=tz, + freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + int_as_wall_time=True, + ) + + subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) return subarr @classmethod @@ -337,8 +373,11 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): # -------------------------------------------------------------------- def __array__(self, dtype=None): - if (dtype is None and isinstance(self._data, DatetimeArray) - and getattr(self.dtype, 'tz', None)): + if ( + dtype is None + and isinstance(self._data, DatetimeArray) + and getattr(self.dtype, "tz", None) + ): msg = ( "Converting timezone-aware DatetimeArray to timezone-naive " "ndarray with 'datetime64[ns]' dtype. In the future, this " @@ -348,7 +387,7 @@ def __array__(self, dtype=None): "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." ) warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = 'M8[ns]' + dtype = "M8[ns]" return np.asarray(self._data, dtype=dtype) @property @@ -363,8 +402,10 @@ def tz(self): @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. Use tz_localize() " + "or tz_convert() as appropriate" + ) tzinfo = tz @@ -372,6 +413,7 @@ def tz(self, value): def _is_dates_only(self): """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only + return _is_dates_only(self.values) and self.tz is None def __reduce__(self): @@ -413,20 +455,21 @@ def __setstate__(self, state): else: raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ if self._has_same_tz(value): return _to_M8(value) - raise ValueError('Passed item and index have different timezone') + raise ValueError("Passed item and index have different timezone") def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get('freq', None) + freq = attrs.get("freq", None) if freq is not None: # no need to infer if freq is None - attrs['freq'] = 'infer' + attrs["freq"] = "infer" return attrs # -------------------------------------------------------------------- @@ -436,18 +479,19 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return libts.ints_to_pydatetime(self.asi8, self.tz) - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(self, date_format) - return libts.format_array_from_datetime(self.asi8, - tz=self.tz, - format=fmt, - na_rep=na_rep) + return libts.format_array_from_datetime( + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + ) @property def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: "'%s'" % formatter(x, tz=self.tz) @@ -477,8 +521,9 @@ def _union(self, other, sort): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds result._data._dtype = this.dtype - if (result.freq is None and - (this.freq is not None or other.freq is not None)): + if result.freq is None and ( + this.freq is not None or other.freq is not None + ): result.freq = to_offset(result.inferred_freq) return result @@ -561,7 +606,7 @@ def _fast_union(self, other, sort=None): elif sort is False: left, right = self, other left_start = left[0] - loc = right.searchsorted(left_start, side='left') + loc = right.searchsorted(left_start, side="left") right_chunk = right.values[:loc] dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) @@ -577,7 +622,7 @@ def _fast_union(self, other, sort=None): # concatenate dates if left_end < right_end: - loc = right.searchsorted(left_end, side='right') + loc = right.searchsorted(left_end, side="right") right_chunk = right.values[loc:] dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) @@ -666,18 +711,26 @@ def to_series(self, keep_tz=None, index=None, name=None): name = self.name if keep_tz is None and self.tz is not None: - warnings.warn("The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", FutureWarning, stacklevel=2) + warnings.warn( + "The default of the 'keep_tz' keyword in " + "DatetimeIndex.to_series will change " + "to True in a future release. You can set " + "'keep_tz=True' to obtain the future behaviour and " + "silence this warning.", + FutureWarning, + stacklevel=2, + ) keep_tz = False elif keep_tz is False: - warnings.warn("Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", FutureWarning, stacklevel=2) + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. 
If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) if keep_tz and self.tz is not None: # preserve the tz & copy @@ -687,7 +740,7 @@ def to_series(self, keep_tz=None, index=None, name=None): return Series(values, index=index, name=name) - def snap(self, freq='S'): + def snap(self, freq="S"): """ Snap time stamps to nearest occurring frequency @@ -712,52 +765,67 @@ def snap(self, freq='S'): snapped[i] = s # we know it conforms; skip check - return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, - freq=freq) + return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ See Index.join """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): + if ( + not isinstance(other, DatetimeIndex) + and len(other) > 0 + and other.inferred_type + not in ( + "floating", + "integer", + "mixed-integer", + "mixed-integer-float", + "mixed", + ) + ): try: other = DatetimeIndex(other) except (TypeError, ValueError): pass this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) def _maybe_utc_convert(self, other): this = self if isinstance(other, DatetimeIndex): if self.tz is not None: if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') + raise TypeError( + "Cannot join tz-naive with tz-aware " "DatetimeIndex" + ) elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') + raise TypeError("Cannot join tz-naive with tz-aware " "DatetimeIndex") if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") return this, other def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): + if ( + isinstance(other, DatetimeIndex) + and self.freq == other.freq + and self._can_fast_union(other) + ): joined = self._shallow_copy(joined) joined.name = name return joined else: - tz = getattr(other, 'tz', None) + tz = getattr(other, "tz", None) return self._simple_new(joined, name, tz=tz) def _parsed_string_to_bounds(self, reso, parsed): @@ -776,41 +844,63 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ - valid_resos = {'year', 'month', 'quarter', 'day', 'hour', 'minute', - 'second', 'minute', 'second', 'microsecond'} + valid_resos = { + "year", + "month", + "quarter", + "day", + "hour", + "minute", + "second", + "minute", + "second", + "microsecond", + } if reso not in valid_resos: raise KeyError - if reso == 'year': + if reso == "year": start = Timestamp(parsed.year, 1, 1) end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) - elif reso == 'month': + elif reso == "month": d = ccalendar.get_days_in_month(parsed.year, parsed.month) start = Timestamp(parsed.year, parsed.month, 1) end = Timestamp(parsed.year, parsed.month, d, 
23, 59, 59, 999999) - elif reso == 'quarter': + elif reso == "quarter": qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month start = Timestamp(parsed.year, parsed.month, 1) end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) - elif reso == 'day': + elif reso == "day": start = Timestamp(parsed.year, parsed.month, parsed.day) end = start + timedelta(days=1) - Nano(1) - elif reso == 'hour': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour) + elif reso == "hour": + start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) end = start + timedelta(hours=1) - Nano(1) - elif reso == 'minute': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute) + elif reso == "minute": + start = Timestamp( + parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute + ) end = start + timedelta(minutes=1) - Nano(1) - elif reso == 'second': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second) + elif reso == "second": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + ) end = start + timedelta(seconds=1) - Nano(1) - elif reso == 'microsecond': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second, - parsed.microsecond) + elif reso == "microsecond": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + parsed.microsecond, + ) end = start + timedelta(microseconds=1) - Nano(1) # GH 24076 # If an incoming date string contained a UTC offset, need to localize @@ -818,9 +908,11 @@ def _parsed_string_to_bounds(self, reso, parsed): # timezone if parsed.tzinfo is not None: if self.tz is None: - raise ValueError("The index must be timezone aware " - "when indexing with a date string with a " - "UTC offset") + raise ValueError( + "The index must be timezone aware " + "when indexing with a date string with a " + "UTC offset" + ) start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) elif self.tz is not None: @@ -830,15 +922,18 @@ def _parsed_string_to_bounds(self, reso, parsed): def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): is_monotonic = self.is_monotonic - if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and - self._resolution >= Resolution.get_reso(reso)): + if ( + is_monotonic + and reso in ["day", "hour", "minute", "second"] + and self._resolution >= Resolution.get_reso(reso) + ): # These resolution/monotonicity validations came from GH3931, # GH3452 and GH2369. # See also GH14826 raise KeyError - if reso == 'microsecond': + if reso == "microsecond": # _partial_date_slice doesn't allow microsecond resolution, but # _parsed_string_to_bounds allows it. 
raise KeyError @@ -849,17 +944,15 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): if is_monotonic: # we are out of range - if (len(stamps) and ((use_lhs and t1.value < stamps[0] and - t2.value < stamps[0]) or - ((use_rhs and t1.value > stamps[-1] and - t2.value > stamps[-1])))): + if len(stamps) and ( + (use_lhs and t1.value < stamps[0] and t2.value < stamps[0]) + or ((use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1])) + ): raise KeyError # a monotonic (sorted) series can be sliced - left = stamps.searchsorted( - t1.value, side='left') if use_lhs else None - right = stamps.searchsorted( - t2.value, side='right') if use_rhs else None + left = stamps.searchsorted(t1.value, side="left") if use_lhs else None + right = stamps.searchsorted(t2.value, side="right") if use_rhs else None return slice(left, right) @@ -870,7 +963,7 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): return (lhs_mask & rhs_mask).nonzero()[0] def _maybe_promote(self, other): - if other.inferred_type == 'date': + if other.inferred_type == "date": other = DatetimeIndex(other) return self, other @@ -896,8 +989,7 @@ def get_value(self, series, key): return series.take(locs) try: - return com.maybe_box(self, Index.get_value(self, series, key), - series, key) + return com.maybe_box(self, Index.get_value(self, series, key), series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -920,8 +1012,7 @@ def get_value_maybe_box(self, series, key): key = key.tz_localize(self.tz) elif not isinstance(key, Timestamp): key = Timestamp(key) - values = self._engine.get_value(com.values_from_object(series), - key, tz=self.tz) + values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) return com.maybe_box(self, values, series, key) def get_loc(self, key, method=None, tolerance=None): @@ -948,14 +1039,17 @@ def get_loc(self, key, method=None, tolerance=None): elif isinstance(key, timedelta): # GH#20464 - raise TypeError("Cannot index {cls} with {other}" - .format(cls=type(self).__name__, - other=type(key).__name__)) + raise TypeError( + "Cannot index {cls} with {other}".format( + cls=type(self).__name__, other=type(key).__name__ + ) + ) if isinstance(key, time): if method is not None: - raise NotImplementedError('cannot yet lookup inexact labels ' - 'when key is a time object') + raise NotImplementedError( + "cannot yet lookup inexact labels " "when key is a time object" + ) return self.indexer_at_time(key) try: @@ -977,7 +1071,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) except ValueError as e: # list-like tolerance size must match target index size - if 'list-like' in str(e): + if "list-like" in str(e): raise e raise KeyError(key) @@ -1000,14 +1094,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. 
""" - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] if is_float(label) or isinstance(label, time) or is_integer(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) if isinstance(label, str): - freq = getattr(self, 'freqstr', - getattr(self, 'inferred_freq', None)) + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(label, freq) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -1017,17 +1110,15 @@ def _maybe_cast_slice_bound(self, label, side, kind): # length > 1 (is_monotonic_decreasing gives True for empty # and length 1 index) if self._is_strictly_monotonic_decreasing and len(self) > 1: - return upper if side == 'left' else lower - return lower if side == 'left' else upper + return upper if side == "left" else lower + return lower if side == "left" else upper else: return label def _get_string_slice(self, key, use_lhs=True, use_rhs=True): - freq = getattr(self, 'freqstr', - getattr(self, 'inferred_freq', None)) + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(key, freq) - loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, - use_rhs=use_rhs) + loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc def slice_indexer(self, start=None, end=None, step=None, kind=None): @@ -1049,11 +1140,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # an array of (self.hour, self.minute, self.seconds, self.microsecond). if isinstance(start, time) and isinstance(end, time): if step is not None and step != 1: - raise ValueError('Must have step size of 1 with time slices') + raise ValueError("Must have step size of 1 with time slices") return self.indexer_between_time(start, end) if isinstance(start, time) or isinstance(end, time): - raise KeyError('Cannot mix time and non-time slice keys') + raise KeyError("Cannot mix time and non-time slice keys") try: return Index.slice_indexer(self, start, end, step, kind=kind) @@ -1061,17 +1152,16 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # For historical reasons DatetimeIndex by default supports # value-based partial (aka string) slices on non-monotonic arrays, # let's try that. 
- if ((start is None or isinstance(start, str)) and - (end is None or isinstance(end, str))): + if (start is None or isinstance(start, str)) and ( + end is None or isinstance(end, str) + ): mask = True if start is not None: - start_casted = self._maybe_cast_slice_bound( - start, 'left', kind) + start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self if end is not None: - end_casted = self._maybe_cast_slice_bound( - end, 'right', kind) + end_casted = self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= end_casted) & mask indexer = mask.nonzero()[0][::step] @@ -1091,10 +1181,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _is_unique = Index.is_unique _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore - is_normalized = cache_readonly( - DatetimeArray.is_normalized.fget) # type: ignore - _resolution = cache_readonly( - DatetimeArray._resolution.fget) # type: ignore + is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore + _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) @@ -1104,9 +1192,12 @@ def offset(self): """ get/set the frequency of the instance """ - msg = ('{cls}.offset has been deprecated and will be removed ' - 'in a future version; use {cls}.freq instead.' - .format(cls=type(self).__name__)) + msg = ( + "{cls}.offset has been deprecated and will be removed " + "in a future version; use {cls}.freq instead.".format( + cls=type(self).__name__ + ) + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self.freq @@ -1115,9 +1206,12 @@ def offset(self, value): """ get/set the frequency of the instance """ - msg = ('{cls}.offset has been deprecated and will be removed ' - 'in a future version; use {cls}.freq instead.' 
- .format(cls=type(self).__name__)) + msg = ( + "{cls}.offset has been deprecated and will be removed " + "in a future version; use {cls}.freq instead.".format( + cls=type(self).__name__ + ) + ) warnings.warn(msg, FutureWarning, stacklevel=2) self.freq = value @@ -1138,9 +1232,9 @@ def _box_func(self): # -------------------------------------------------------------------- - @Substitution(klass='DatetimeIndex') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="DatetimeIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): value = np.array(value, dtype=_NS_DTYPE, copy=False) else: @@ -1149,13 +1243,13 @@ def searchsorted(self, value, side='left', sorter=None): return self.values.searchsorted(value, side=side) def is_type_compatible(self, typ): - return typ == self.inferred_type or typ == 'datetime' + return typ == self.inferred_type or typ == "datetime" @property def inferred_type(self): # b/c datetime is represented as microseconds since the epoch, make # sure we can't have ambiguous indexing - return 'datetime64' + return "datetime64" @property def is_all_dates(self): @@ -1185,28 +1279,26 @@ def insert(self, loc, item): if isinstance(item, (datetime, np.datetime64)): self._assert_can_do_op(item) if not self._has_same_tz(item) and not isna(item): - raise ValueError( - 'Passed item and index have different timezone') + raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: - if ((loc == 0 or loc == -len(self)) and - item + self.freq == self[0]): + if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = _to_M8(item, tz=self.tz) try: - new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], - self[loc:].asi8)) + new_dates = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) return self._shallow_copy(new_dates, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError( - "cannot insert DatetimeIndex with incompatible label") + raise TypeError("cannot insert DatetimeIndex with incompatible label") def delete(self, loc): """ @@ -1229,10 +1321,9 @@ def delete(self, loc): freq = self.freq else: if is_list_like(loc): - loc = lib.maybe_indices_to_slice( - ensure_int64(np.array(loc)), len(self)) + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): - if (loc.start in (0, None) or loc.stop in (len(self), None)): + if loc.start in (0, None) or loc.stop in (len(self), None): freq = self.freq return self._shallow_copy(new_dates, freq=freq) @@ -1262,6 +1353,7 @@ def indexer_at_time(self, time, asof=False): if isinstance(time, str): from dateutil.parser import parse + time = parse(time).time() if time.tzinfo: @@ -1273,8 +1365,9 @@ def indexer_at_time(self, time, asof=False): micros = _time_to_micros(time) return (micros == time_micros).nonzero()[0] - def indexer_between_time(self, start_time, end_time, include_start=True, - include_end=True): + def indexer_between_time( + self, start_time, end_time, include_start=True, include_end=True + ): """ Return index locations of values between particular times of day (e.g., 
9:00-9:30AM). @@ -1318,8 +1411,7 @@ def indexer_between_time(self, start_time, end_time, include_start=True, else: join_op = operator.or_ - mask = join_op(lop(start_micros, time_micros), - rop(time_micros, end_micros)) + mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros)) return mask.nonzero()[0] @@ -1330,8 +1422,17 @@ def indexer_between_time(self, start_time, end_time, include_start=True, DatetimeIndex._add_datetimelike_methods() -def date_range(start=None, end=None, periods=None, freq=None, tz=None, - normalize=False, name=None, closed=None, **kwargs): +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=None, + **kwargs +): """ Return a fixed frequency DatetimeIndex. @@ -1470,19 +1571,34 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, """ if freq is None and com._any_none(periods, start, end): - freq = 'D' + freq = "D" dtarr = DatetimeArray._generate_range( - start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, **kwargs) - return DatetimeIndex._simple_new( - dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) - - -def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, - normalize=True, name=None, weekmask=None, holidays=None, - closed=None, **kwargs): + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + closed=closed, + **kwargs + ) + return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) + + +def bdate_range( + start=None, + end=None, + periods=None, + freq="B", + tz=None, + normalize=True, + name=None, + weekmask=None, + holidays=None, + closed=None, + **kwargs +): """ Return a fixed frequency DatetimeIndex, with business day as the default frequency @@ -1548,24 +1664,34 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, dtype='datetime64[ns]', freq='B') """ if freq is None: - msg = 'freq must be specified for bdate_range; use date_range instead' + msg = "freq must be specified for bdate_range; use date_range instead" raise TypeError(msg) - if is_string_like(freq) and freq.startswith('C'): + if is_string_like(freq) and freq.startswith("C"): try: - weekmask = weekmask or 'Mon Tue Wed Thu Fri' + weekmask = weekmask or "Mon Tue Wed Thu Fri" freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) except (KeyError, TypeError): - msg = 'invalid custom frequency string: {freq}'.format(freq=freq) + msg = "invalid custom frequency string: {freq}".format(freq=freq) raise ValueError(msg) elif holidays or weekmask: - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency {freq}').format(freq=freq) + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency {freq}" + ).format(freq=freq) raise ValueError(msg) - return date_range(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + return date_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + closed=closed, + **kwargs + ) def _time_to_micros(time): diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index aeb0fa119ab337..2e5b3ff8ef502d 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -28,6 +28,7 @@ class FrozenList(PandasObject, list): because it's technically non-hashable, will be used for lookups, 
appropriately, etc. """ + # Side note: This has to be of type list. Otherwise, # it messes up PyTables type checks. @@ -105,16 +106,15 @@ def __hash__(self): def _disabled(self, *args, **kwargs): """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__.__name__) + raise TypeError( + "'%s' does not support mutable operations." % self.__class__.__name__ + ) def __str__(self): - return pprint_thing(self, quote_strings=True, - escape_chars=('\t', '\r', '\n')) + return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, - str(self)) + return "%s(%s)" % (self.__class__.__name__, str(self)) __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled @@ -124,9 +124,12 @@ class FrozenNDArray(PandasObject, np.ndarray): # no __array_finalize__ for now because no metadata def __new__(cls, data, dtype=None, copy=False): - warnings.warn("\nFrozenNDArray is deprecated and will be removed in a " - "future version.\nPlease use `numpy.ndarray` instead.\n", - FutureWarning, stacklevel=2) + warnings.warn( + "\nFrozenNDArray is deprecated and will be removed in a " + "future version.\nPlease use `numpy.ndarray` instead.\n", + FutureWarning, + stacklevel=2, + ) if copy is None: copy = not isinstance(data, FrozenNDArray) @@ -135,8 +138,7 @@ def __new__(cls, data, dtype=None, copy=False): def _disabled(self, *args, **kwargs): """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__) + raise TypeError("'%s' does not support mutable operations." % self.__class__) __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled put = itemset = fill = _disabled @@ -153,8 +155,7 @@ def __repr__(self): """ Return a string representation for this object. 
""" - prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) + prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) @deprecate_kwarg(old_arg_name="v", new_arg_name="value") diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 83bc5963f4f9ef..b14cff8cc6adec 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -14,11 +14,25 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( - find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype) + find_common_type, + infer_dtype_from_scalar, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( - ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) + ensure_platform_int, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna @@ -27,7 +41,12 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index) + Index, + InvalidIndexError, + _index_shared_docs, + default_pprint, + ensure_index, +) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range @@ -36,48 +55,54 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset -_VALID_CLOSED = {'left', 'right', 'both', 'neither'} +_VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict(klass='IntervalIndex', - qualname="IntervalIndex", - target_klass='IntervalIndex or list of Intervals', - name=textwrap.dedent("""\ + dict( + klass="IntervalIndex", + qualname="IntervalIndex", + target_klass="IntervalIndex or list of Intervals", + name=textwrap.dedent( + """\ name : object, optional Name to be stored in the index. 
- """), - )) + """ + ), + ) +) def _get_next_label(label): - dtype = getattr(label, 'dtype', type(label)) + dtype = getattr(label, "dtype", type(label)) if isinstance(label, (Timestamp, Timedelta)): - dtype = 'datetime64' + dtype = "datetime64" if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): - return label + np.timedelta64(1, 'ns') + return label + np.timedelta64(1, "ns") elif is_integer_dtype(dtype): return label + 1 elif is_float_dtype(dtype): return np.nextafter(label, np.infty) else: - raise TypeError('cannot determine next label for type {typ!r}' - .format(typ=type(label))) + raise TypeError( + "cannot determine next label for type {typ!r}".format(typ=type(label)) + ) def _get_prev_label(label): - dtype = getattr(label, 'dtype', type(label)) + dtype = getattr(label, "dtype", type(label)) if isinstance(label, (Timestamp, Timedelta)): - dtype = 'datetime64' + dtype = "datetime64" if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): - return label - np.timedelta64(1, 'ns') + return label - np.timedelta64(1, "ns") elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: - raise TypeError('cannot determine next label for type {typ!r}' - .format(typ=type(label))) + raise TypeError( + "cannot determine next label for type {typ!r}".format(typ=type(label)) + ) def _get_interval_closed_bounds(interval): @@ -106,6 +131,7 @@ class SetopCheck: This is called to decorate the set operations of IntervalIndex to perform the type check in advance. """ + def __init__(self, op_name): self.op_name = op_name @@ -115,36 +141,43 @@ def func(intvidx_self, other, sort=False): other = ensure_index(other) if not isinstance(other, IntervalIndex): - result = getattr(intvidx_self.astype(object), - self.op_name)(other) - if self.op_name in ('difference',): + result = getattr(intvidx_self.astype(object), self.op_name)(other) + if self.op_name in ("difference",): result = result.astype(intvidx_self.dtype) return result elif intvidx_self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') + msg = ( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side" + ) raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] common_subtype = find_common_type(subtypes) if is_object_dtype(common_subtype): - msg = ('can only do {op} between two IntervalIndex ' - 'objects that have compatible dtypes') + msg = ( + "can only do {op} between two IntervalIndex " + "objects that have compatible dtypes" + ) raise TypeError(msg.format(op=self.op_name)) return setop(intvidx_self, other, sort) + return func -@Appender(_interval_shared_docs['class'] % dict( - klass="IntervalIndex", - summary="Immutable index of intervals that are closed on the same side.", - name=_index_doc_kwargs['name'], - versionadded="0.20.0", - extra_attributes="is_overlapping\nvalues\n", - extra_methods="", - examples=textwrap.dedent("""\ +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs["name"], + versionadded="0.20.0", + extra_attributes="is_overlapping\nvalues\n", + extra_methods="", + examples=textwrap.dedent( + """\ Examples -------- A new ``IntervalIndex`` is typically constructed using @@ -161,13 +194,14 @@ def 
func(intvidx_self, other, sort=False): See further examples in the doc strings of ``interval_range`` and the mentioned constructor methods. - """), - -)) + """ + ), + ) +) class IntervalIndex(IntervalMixin, Index): - _typ = 'intervalindex' - _comparables = ['name'] - _attributes = ['name', 'closed'] + _typ = "intervalindex" + _comparables = ["name"] + _attributes = ["name", "closed"] # we would like our indexing holder to defer to us _defer_to_indexing = True @@ -178,15 +212,21 @@ class IntervalIndex(IntervalMixin, Index): # -------------------------------------------------------------------- # Constructors - def __new__(cls, data, closed=None, dtype=None, copy=False, - name=None, verify_integrity=True): + def __new__( + cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True + ): - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, - verify_integrity=verify_integrity) + array = IntervalArray( + data, + closed=closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) return cls._simple_new(array, name) @@ -210,29 +250,32 @@ def _simple_new(cls, array, name, closed=None): return result @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_breaks"] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) + array = IntervalArray.from_breaks( + breaks, closed=closed, copy=copy, dtype=dtype + ) return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_arrays"] % _index_doc_kwargs) + def from_arrays( + cls, left, right, closed="right", name=None, copy=False, dtype=None + ): with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) + array = IntervalArray.from_arrays( + left, right, closed, copy=copy, dtype=dtype + ) return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) instead') + @Appender(_interval_shared_docs["from_intervals"] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): + msg = ( + "IntervalIndex.from_intervals is deprecated and will be " + "removed in a future version; Use IntervalIndex(...) 
instead" + ) warnings.warn(msg, FutureWarning, stacklevel=2) with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) @@ -243,17 +286,15 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs) + def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) # -------------------------------------------------------------------- - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) attributes = self._get_attributes_dict() @@ -295,9 +336,11 @@ def __contains__(self, key): except KeyError: return False - @Appender(_interval_shared_docs['to_tuples'] % dict( - return_type="Index", - examples=""" + @Appender( + _interval_shared_docs["to_tuples"] + % dict( + return_type="Index", + examples=""" Examples -------- >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) @@ -305,15 +348,15 @@ def __contains__(self, key): Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", - )) + ) + ) def to_tuples(self, na_tuple=True): tuples = self._data.to_tuples(na_tuple=na_tuple) return Index(tuples) @cache_readonly def _multiindex(self): - return MultiIndex.from_arrays([self.left, self.right], - names=['left', 'right']) + return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) @property def left(self): @@ -339,7 +382,7 @@ def closed(self): """ return self._data._closed - @Appender(_interval_shared_docs['set_closed'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["set_closed"] % _index_doc_kwargs) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -369,13 +412,15 @@ def shape(self): @property def itemsize(self): - msg = ('IntervalIndex.itemsize is deprecated and will be removed in ' - 'a future version') + msg = ( + "IntervalIndex.itemsize is deprecated and will be removed in " + "a future version" + ) warnings.warn(msg, FutureWarning, stacklevel=2) # suppress the warning from the underlying left/right itemsize with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") return self.left.itemsize + self.right.itemsize def __len__(self): @@ -405,12 +450,11 @@ def __array_wrap__(self, result, context=None): return result def __reduce__(self): - d = dict(left=self.left, - right=self.right) + d = dict(left=self.left, right=self.right) d.update(self._get_attributes_dict()) return _new_IntervalIndex, (self.__class__, d), None - @Appender(_index_shared_docs['copy']) + @Appender(_index_shared_docs["copy"]) def copy(self, deep=False, name=None): array = self._data if deep: @@ -421,9 +465,9 @@ def copy(self, deep=False, name=None): return self._simple_new(array, **attributes) - 
@Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): - with rewrite_exception('IntervalArray', self.__class__.__name__): + with rewrite_exception("IntervalArray", self.__class__.__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) @@ -437,14 +481,13 @@ def dtype(self): @property def inferred_type(self): """Return a string of the type inferred from the values""" - return 'interval' + return "interval" @Appender(Index.memory_usage.__doc__) def memory_usage(self, deep=False): # we don't use an explicit engine # so return the bytes here - return (self.left.memory_usage(deep=deep) + - self.right.memory_usage(deep=deep)) + return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) @cache_readonly def mid(self): @@ -502,8 +545,7 @@ def is_unique(self): return True @cache_readonly - @Appender(_interval_shared_docs['is_non_overlapping_monotonic'] - % _index_doc_kwargs) + @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs) def is_non_overlapping_monotonic(self): return self._data.is_non_overlapping_monotonic @@ -562,16 +604,16 @@ def is_overlapping(self): # GH 23309 return self._engine.is_overlapping - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - if kind == 'iloc': + if kind == "iloc": return super()._convert_scalar_indexer(key, kind=kind) return key def _maybe_cast_slice_bound(self, label, side, kind): return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): """ we are passed a list-like indexer. 
Return the @@ -598,7 +640,7 @@ def _maybe_cast_indexed(self, key): if is_integer(key): key = float(key) elif isinstance(key, (np.ndarray, Index)): - key = key.astype('float64') + key = key.astype("float64") elif is_integer_dtype(subtype): if is_integer(key): key = int(key) @@ -691,8 +733,10 @@ def _maybe_convert_i8(self, key): # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype - msg = ('Cannot index an IntervalIndex of subtype {subtype} with ' - 'values of dtype {other}') + msg = ( + "Cannot index an IntervalIndex of subtype {subtype} with " + "values of dtype {other}" + ) if not is_dtype_equal(subtype, key_dtype): raise ValueError(msg.format(subtype=subtype, other=key_dtype)) @@ -702,27 +746,30 @@ def _check_method(self, method): if method is None: return - if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: - msg = 'method {method} not yet implemented for IntervalIndex' + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + msg = "method {method} not yet implemented for IntervalIndex" raise NotImplementedError(msg.format(method=method)) raise ValueError("Invalid fill method") def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: - raise KeyError('can only get slices from an IntervalIndex if ' - 'bounds are non-overlapping and all monotonic ' - 'increasing or decreasing') + raise KeyError( + "can only get slices from an IntervalIndex if " + "bounds are non-overlapping and all monotonic " + "increasing or decreasing" + ) if isinstance(label, IntervalMixin): - msg = 'Interval objects are not currently supported' + msg = "Interval objects are not currently supported" raise NotImplementedError(msg) # GH 20921: "not is_monotonic_increasing" for the second condition # instead of "is_monotonic_decreasing" to account for single element # indexes being both increasing and decreasing - if ((side == 'left' and self.left.is_monotonic_increasing) or - (side == 'right' and not self.left.is_monotonic_increasing)): + if (side == "left" and self.left.is_monotonic_increasing) or ( + side == "right" and not self.left.is_monotonic_increasing + ): sub_idx = self.right if self.open_right or exclude_label: label = _get_next_label(label) @@ -736,9 +783,11 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): def _find_non_overlapping_monotonic_bounds(self, key): if isinstance(key, IntervalMixin): start = self._searchsorted_monotonic( - key.left, 'left', exclude_label=key.open_left) + key.left, "left", exclude_label=key.open_left + ) stop = self._searchsorted_monotonic( - key.right, 'right', exclude_label=key.open_right) + key.right, "right", exclude_label=key.open_right + ) elif isinstance(key, slice): # slice start, stop = key.start, key.stop @@ -747,22 +796,21 @@ def _find_non_overlapping_monotonic_bounds(self, key): if start is None: start = 0 else: - start = self._searchsorted_monotonic(start, 'left') + start = self._searchsorted_monotonic(start, "left") if stop is None: stop = len(self) else: - stop = self._searchsorted_monotonic(stop, 'right') + stop = self._searchsorted_monotonic(stop, "right") else: # scalar or index-like - start = self._searchsorted_monotonic(key, 'left') - stop = self._searchsorted_monotonic(key, 'right') + start = self._searchsorted_monotonic(key, "left") + stop = self._searchsorted_monotonic(key, "right") return start, stop - def get_loc(self, - key: Any, - method: Optional[str] = None - ) -> Union[int, slice, np.ndarray]: + def get_loc( + self, key: Any, 
method: Optional[str] = None + ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -827,29 +875,40 @@ def get_loc(self, raise KeyError(key) elif matches == 1: return mask.argmax() - return lib.maybe_booleans_to_slice(mask.view('u1')) - - @Substitution(**dict(_index_doc_kwargs, - **{'raises_section': textwrap.dedent(""" + return lib.maybe_booleans_to_slice(mask.view("u1")) + + @Substitution( + **dict( + _index_doc_kwargs, + **{ + "raises_section": textwrap.dedent( + """ Raises ------ NotImplementedError If any method argument other than the default of None is specified as these are not yet implemented. - """)})) - @Appender(_index_shared_docs['get_indexer']) - def get_indexer(self, - target: AnyArrayLike, - method: Optional[str] = None, - limit: Optional[int] = None, - tolerance: Optional[Any] = None - ) -> np.ndarray: + """ + ) + } + ) + ) + @Appender(_index_shared_docs["get_indexer"]) + def get_indexer( + self, + target: AnyArrayLike, + method: Optional[str] = None, + limit: Optional[int] = None, + tolerance: Optional[Any] = None, + ) -> np.ndarray: self._check_method(method) if self.is_overlapping: - msg = ('cannot handle overlapping indices; use ' - 'IntervalIndex.get_indexer_non_unique') + msg = ( + "cannot handle overlapping indices; use " + "IntervalIndex.get_indexer_non_unique" + ) raise InvalidIndexError(msg) target = ensure_index(target) @@ -857,11 +916,12 @@ def get_indexer(self, if isinstance(target, IntervalIndex): # equal indexes -> 1:1 positional match if self.equals(target): - return np.arange(len(self), dtype='intp') + return np.arange(len(self), dtype="intp") # different closed or incompatible subtype -> no matches - common_subtype = find_common_type([ - self.dtype.subtype, target.dtype.subtype]) + common_subtype = find_common_type( + [self.dtype.subtype, target.dtype.subtype] + ) if self.closed != target.closed or is_object_dtype(common_subtype): return np.repeat(np.intp(-1), len(target)) @@ -888,16 +948,17 @@ def get_indexer(self, return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, - target: AnyArrayLike - ) -> Tuple[np.ndarray, np.ndarray]: + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique( + self, target: AnyArrayLike + ) -> Tuple[np.ndarray, np.ndarray]: target = ensure_index(target) # check that target IntervalIndex is compatible if isinstance(target, IntervalIndex): - common_subtype = find_common_type([ - self.dtype.subtype, target.dtype.subtype]) + common_subtype = find_common_type( + [self.dtype.subtype, target.dtype.subtype] + ) if self.closed != target.closed or is_object_dtype(common_subtype): # different closed or incompatible subtype -> no matches return np.repeat(-1, len(target)), np.arange(len(target)) @@ -909,8 +970,7 @@ def get_indexer_non_unique(self, try: locs = self.get_loc(key) if isinstance(locs, slice): - locs = np.arange( - locs.start, locs.stop, locs.step, dtype='intp') + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") locs = np.array(locs, ndmin=1) except KeyError: missing.append(i) @@ -919,15 +979,11 @@ def get_indexer_non_unique(self, indexer = np.concatenate(indexer) else: target = self._maybe_convert_i8(target) - indexer, missing = self._engine.get_indexer_non_unique( - target.values) + indexer, missing = self._engine.get_indexer_non_unique(target.values) return ensure_platform_int(indexer), ensure_platform_int(missing) 
- def get_indexer_for(self, - target: AnyArrayLike, - **kwargs - ) -> np.ndarray: + def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: """ Guaranteed return of an indexer even when overlapping. @@ -943,11 +999,8 @@ def get_indexer_for(self, return self.get_indexer_non_unique(target, **kwargs)[0] return self.get_indexer(target, **kwargs) - @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) - def get_value(self, - series: ABCSeries, - key: Any - ) -> Any: + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) + def get_value(self, series: ABCSeries, key: Any) -> Any: if com.is_bool_indexer(key): loc = key @@ -961,12 +1014,12 @@ def get_value(self, elif isinstance(key, slice): if not (key.step is None or key.step == 1): raise ValueError("cannot support not-default step in a slice") - loc = self._convert_slice_indexer(key, kind='getitem') + loc = self._convert_slice_indexer(key, kind="getitem") else: loc = self.get_loc(key) return series.iloc[loc] - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): if other is None: other = self._na_value @@ -1002,16 +1055,18 @@ def insert(self, loc, item): """ if isinstance(item, Interval): if item.closed != self.closed: - raise ValueError('inserted item must be closed on the same ' - 'side as the index') + raise ValueError( + "inserted item must be closed on the same " "side as the index" + ) left_insert = item.left right_insert = item.right elif is_scalar(item) and isna(item): # GH 18295 left_insert = right_insert = item else: - raise ValueError('can only insert Interval objects and NA into ' - 'an IntervalIndex') + raise ValueError( + "can only insert Interval objects and NA into " "an IntervalIndex" + ) new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) @@ -1023,16 +1078,18 @@ def _concat_same_dtype(self, to_concat, name): we allow a 0-len index here as well """ if not len({i.closed for i in to_concat if len(i)}) == 1: - msg = ('can only append two IntervalIndex objects ' - 'that are closed on the same side') + msg = ( + "can only append two IntervalIndex objects " + "that are closed on the same side" + ) raise ValueError(msg) return super()._concat_same_dtype(to_concat, name) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - result = self._data.take(indices, axis=axis, allow_fill=allow_fill, - fill_value=fill_value, **kwargs) + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + result = self._data.take( + indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs + ) attributes = self._get_attributes_dict() return self._simple_new(result, **attributes) @@ -1051,56 +1108,56 @@ def __getitem__(self, value): def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) - def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs): + def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): """ actually format my specific types """ from pandas.io.formats.format import ExtensionArrayFormatter - return ExtensionArrayFormatter(values=self, - na_rep=na_rep, - justify='all', - leading_space=False).get_result() + + return ExtensionArrayFormatter( + values=self, na_rep=na_rep, justify="all", leading_space=False + ).get_result() def _format_data(self, 
name=None): # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical n = len(self) - max_seq_items = min((get_option( - 'display.max_seq_items') or n) // 10, 10) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) formatter = str if n == 0: - summary = '[]' + summary = "[]" elif n == 1: first = formatter(self[0]) - summary = '[{first}]'.format(first=first) + summary = "[{first}]".format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{first}, {last}]'.format(first=first, last=last) + summary = "[{first}, {last}]".format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{head} ... {tail}]'.format( - head=', '.join(head), tail=', '.join(tail)) + summary = "[{head} ... {tail}]".format( + head=", ".join(head), tail=", ".join(tail) + ) else: tail = [formatter(x) for x in self] - summary = '[{tail}]'.format(tail=', '.join(tail)) + summary = "[{tail}]".format(tail=", ".join(tail)) - return summary + ',' + self._format_space() + return summary + "," + self._format_space() def _format_attrs(self): - attrs = [('closed', repr(self.closed))] + attrs = [("closed", repr(self.closed))] if self.name is not None: - attrs.append(('name', default_pprint(self.name))) - attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype))) + attrs.append(("name", default_pprint(self.name))) + attrs.append(("dtype", "'{dtype}'".format(dtype=self.dtype))) return attrs def _format_space(self): - space = ' ' * (len(self.__class__.__name__) + 1) + space = " " * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) # -------------------------------------------------------------------- @@ -1120,30 +1177,30 @@ def equals(self, other): if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False - other = Index(getattr(other, '.values', other)) + other = Index(getattr(other, ".values", other)) - return (self.left.equals(other.left) and - self.right.equals(other.right) and - self.closed == other.closed) + return ( + self.left.equals(other.left) + and self.right.equals(other.right) + and self.closed == other.closed + ) - @Appender(_interval_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["contains"] % _index_doc_kwargs) def contains(self, other): return self._data.contains(other) - @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["overlaps"] % _index_doc_kwargs) def overlaps(self, other): return self._data.overlaps(other) - @Appender(_index_shared_docs['intersection']) - @SetopCheck(op_name='intersection') - def intersection(self, - other: 'IntervalIndex', - sort: bool = False - ) -> 'IntervalIndex': + @Appender(_index_shared_docs["intersection"]) + @SetopCheck(op_name="intersection") + def intersection( + self, other: "IntervalIndex", sort: bool = False + ) -> "IntervalIndex": if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) - elif (other.left.is_unique and other.right.is_unique and - self.isna().sum() <= 1): + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: # Swap other/self if other is unique and self does not have # multiple NaNs taken = other._intersection_unique(self) @@ -1156,9 +1213,7 @@ def intersection(self, return taken - def _intersection_unique(self, - other: 
'IntervalIndex' - ) -> 'IntervalIndex': + def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does not have any common endpoint, no mater left or right. @@ -1180,9 +1235,7 @@ def _intersection_unique(self, return self.take(indexer) - def _intersection_non_unique(self, - other: 'IntervalIndex' - ) -> 'IntervalIndex': + def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does have some common endpoints, on either sides. @@ -1218,8 +1271,7 @@ def _intersection_non_unique(self, def _setop(op_name, sort=None): @SetopCheck(op_name=op_name) def func(self, other, sort=sort): - result = getattr(self._multiindex, op_name)(other._multiindex, - sort=sort) + result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) # GH 19101: ensure empty results have correct dtype @@ -1228,8 +1280,7 @@ def func(self, other, sort=sort): else: result = result.values - return type(self).from_tuples(result, closed=self.closed, - name=result_name) + return type(self).from_tuples(result, closed=self.closed, name=result_name) return func @@ -1241,9 +1292,9 @@ def is_all_dates(self): """ return False - union = _setop('union') - difference = _setop('difference') - symmetric_difference = _setop('symmetric_difference') + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") # TODO: arithmetic operations @@ -1253,24 +1304,31 @@ def is_all_dates(self): def _is_valid_endpoint(endpoint): """helper for interval_range to check if start/end are valid types""" - return any([is_number(endpoint), - isinstance(endpoint, Timestamp), - isinstance(endpoint, Timedelta), - endpoint is None]) + return any( + [ + is_number(endpoint), + isinstance(endpoint, Timestamp), + isinstance(endpoint, Timedelta), + endpoint is None, + ] + ) def _is_type_compatible(a, b): """helper for interval_range to check type compat of start/end/freq""" is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) - return ((is_number(a) and is_number(b)) or - (is_ts_compat(a) and is_ts_compat(b)) or - (is_td_compat(a) and is_td_compat(b)) or - com._any_none(a, b)) + return ( + (is_number(a) and is_number(b)) + or (is_ts_compat(a) and is_ts_compat(b)) + or (is_td_compat(a) and is_td_compat(b)) + or com._any_none(a, b) + ) -def interval_range(start=None, end=None, periods=None, freq=None, - name=None, closed='right'): +def interval_range( + start=None, end=None, periods=None, freq=None, name=None, closed="right" +): """ Return a fixed frequency IntervalIndex @@ -1363,36 +1421,44 @@ def interval_range(start=None, end=None, periods=None, freq=None, endpoint = start if start is not None else end if freq is None and com._any_none(periods, start, end): - freq = 1 if is_number(endpoint) else 'D' + freq = 1 if is_number(endpoint) else "D" if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) if not _is_valid_endpoint(start): - msg = 'start must be numeric or datetime-like, got {start}' + msg = "start must be numeric or datetime-like, got {start}" raise ValueError(msg.format(start=start)) elif not _is_valid_endpoint(end): - msg = 'end must be 
numeric or datetime-like, got {end}' + msg = "end must be numeric or datetime-like, got {end}" raise ValueError(msg.format(end=end)) if is_float(periods): periods = int(periods) elif not is_integer(periods) and periods is not None: - msg = 'periods must be a number, got {periods}' + msg = "periods must be a number, got {periods}" raise TypeError(msg.format(periods=periods)) if freq is not None and not is_number(freq): try: freq = to_offset(freq) except ValueError: - raise ValueError('freq must be numeric or convertible to ' - 'DateOffset, got {freq}'.format(freq=freq)) + raise ValueError( + "freq must be numeric or convertible to " + "DateOffset, got {freq}".format(freq=freq) + ) # verify type compatibility - if not all([_is_type_compatible(start, end), - _is_type_compatible(start, freq), - _is_type_compatible(end, freq)]): + if not all( + [ + _is_type_compatible(start, end), + _is_type_compatible(start, freq), + _is_type_compatible(end, freq), + ] + ): raise TypeError("start, end, freq need to be type compatible") # +1 to convert interval count to breaks count (n breaks = n-1 intervals) @@ -1415,7 +1481,7 @@ def interval_range(start=None, end=None, periods=None, freq=None, breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com._not_none(start, end, freq)): # np.linspace always produces float output - breaks = maybe_downcast_to_dtype(breaks, 'int64') + breaks = maybe_downcast_to_dtype(breaks, "int64") else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0823a3ed9ad597..71b551adaf3ef1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -7,16 +7,23 @@ from pandas._config import get_option -from pandas._libs import ( - Timestamp, algos as libalgos, index as libindex, lib, tslibs) +from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, - is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, - pandas_dtype) + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna @@ -25,25 +32,32 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs, ensure_index) + Index, + InvalidIndexError, + _index_shared_docs, + ensure_index, +) from pandas.core.indexes.frozen import FrozenList, _ensure_frozen import pandas.core.missing as missing from pandas.io.formats.printing import ( - format_object_attrs, format_object_summary, pprint_thing) + format_object_attrs, + format_object_summary, + pprint_thing, +) _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict(klass='MultiIndex', - target_klass='MultiIndex or list of tuples')) + dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") +) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, - libindex.UInt64Engine): +class 
MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): """ This class manages a MultiIndex by mapping label combinations to positive integers. """ + _base = libindex.UInt64Engine def _codes_to_ints(self, codes): @@ -77,13 +91,13 @@ def _codes_to_ints(self, codes): return np.bitwise_or.reduce(codes, axis=1) -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, - libindex.ObjectEngine): +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): """ This class manages those (extreme) cases in which the number of possible label combinations overflows the 64 bits integers, and uses an ObjectEngine containing Python integers. """ + _base = libindex.ObjectEngine def _codes_to_ints(self, codes): @@ -106,7 +120,7 @@ def _codes_to_ints(self, codes): # Shift the representation of each level by the pre-calculated number # of bits. Since this can overflow uint64, first make sure we are # working with Python integers: - codes = codes.astype('object') << self.offsets + codes = codes.astype("object") << self.offsets # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -205,20 +219,29 @@ class MultiIndex(Index): """ # initialize to zero-length tuples to make everything work - _typ = 'multiindex' + _typ = "multiindex" _names = FrozenList() _levels = FrozenList() _codes = FrozenList() - _comparables = ['names'] + _comparables = ["names"] rename = Index.set_names # -------------------------------------------------------------------- # Constructors - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def __new__(cls, levels=None, codes=None, sortorder=None, names=None, - dtype=None, copy=False, name=None, - verify_integrity=True, _set_identity=True): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def __new__( + cls, + levels=None, + codes=None, + sortorder=None, + names=None, + dtype=None, + copy=False, + name=None, + verify_integrity=True, + _set_identity=True, + ): # compat with Index if name is not None: @@ -226,9 +249,9 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, if levels is None or codes is None: raise TypeError("Must pass both levels and codes") if len(levels) != len(codes): - raise ValueError('Length of levels and codes must be the same.') + raise ValueError("Length of levels and codes must be the same.") if len(levels) == 0: - raise ValueError('Must pass non-zero number of levels/codes') + raise ValueError("Must pass non-zero number of levels/codes") result = object.__new__(MultiIndex) @@ -302,32 +325,39 @@ def _verify_integrity(self, codes=None, levels=None): levels = levels or self.levels if len(levels) != len(codes): - raise ValueError("Length of levels and codes must match. NOTE:" - " this index is in an inconsistent state.") + raise ValueError( + "Length of levels and codes must match. NOTE:" + " this index is in an inconsistent state." + ) codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: - raise ValueError("Unequal code lengths: %s" % - ([len(code_) for code_ in codes])) + raise ValueError( + "Unequal code lengths: %s" % ([len(code_) for code_ in codes]) + ) if len(level_codes) and level_codes.max() >= len(level): - msg = ("On level {level}, code max ({max_code}) >= length of " - "level ({level_len}). 
NOTE: this index is in an " - "inconsistent state".format( - level=i, max_code=level_codes.max(), - level_len=len(level))) + msg = ( + "On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), level_len=len(level) + ) + ) raise ValueError(msg) if len(level_codes) and level_codes.min() < -1: - raise ValueError("On level {level}, code value ({code})" - " < -1".format( - level=i, code=level_codes.min())) + raise ValueError( + "On level {level}, code value ({code})" + " < -1".format(level=i, code=level_codes.min()) + ) if not level.is_unique: - raise ValueError("Level values must be unique: {values} on " - "level {level}".format( - values=[value for value in level], - level=i)) - - codes = [self._validate_codes(level, code) - for level, code in zip(levels, codes)] + raise ValueError( + "Level values must be unique: {values} on " + "level {level}".format(values=[value for value in level], level=i) + ) + + codes = [ + self._validate_codes(level, code) for level, code in zip(levels, codes) + ] new_codes = FrozenList(codes) return new_codes @@ -383,7 +413,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): # raise ValueError, if not for i in range(1, len(arrays)): if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') + raise ValueError("all arrays must be same length") from pandas.core.arrays.categorical import _factorize_from_iterables @@ -391,8 +421,13 @@ def from_arrays(cls, arrays, sortorder=None, names=None): if names is None: names = [getattr(arr, "name", None) for arr in arrays] - return MultiIndex(levels=levels, codes=codes, sortorder=sortorder, - names=names, verify_integrity=False) + return MultiIndex( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) @classmethod def from_tuples(cls, tuples, sortorder=None, names=None): @@ -432,13 +467,13 @@ def from_tuples(cls, tuples, sortorder=None, names=None): names=['number', 'color']) """ if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') + raise TypeError("Input must be a list / sequence of tuple-likes.") elif is_iterator(tuples): tuples = list(tuples) if len(tuples) == 0: if names is None: - msg = 'Cannot infer number of levels from empty list' + msg = "Cannot infer number of levels from empty list" raise TypeError(msg) arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): @@ -591,8 +626,10 @@ def array(self): ------ ValueError """ - msg = ("MultiIndex has no single backing array. Use " - "'MultiIndex.to_numpy()' to get a NumPy array of tuples.") + msg = ( + "MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples." + ) raise ValueError(msg) @property @@ -617,22 +654,23 @@ def _is_homogeneous_type(self): """ return len({x.dtype for x in self.levels}) <= 1 - def _set_levels(self, levels, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_levels( + self, levels, level=None, copy=False, validate=True, verify_integrity=False + ): # This is NOT part of the levels property because it should be # externally not allowed to set levels. 
User beware if you change # _levels directly if validate and len(levels) == 0: - raise ValueError('Must set non-zero number of levels.') + raise ValueError("Must set non-zero number of levels.") if validate and level is None and len(levels) != self.nlevels: - raise ValueError('Length of levels must match number of levels.') + raise ValueError("Length of levels must match number of levels.") if validate and level is not None and len(levels) != len(level): - raise ValueError('Length of levels must match length of level.') + raise ValueError("Length of levels must match length of level.") if level is None: new_levels = FrozenList( - ensure_index(lev, copy=copy)._shallow_copy() - for lev in levels) + ensure_index(lev, copy=copy)._shallow_copy() for lev in levels + ) else: level = [self._get_level_number(l) for l in level] new_levels = list(self._levels) @@ -652,8 +690,7 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, self._tuples = None self._reset_cache() - def set_levels(self, levels, level=None, inplace=False, - verify_integrity=True): + def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. @@ -722,8 +759,9 @@ def set_levels(self, levels, level=None, inplace=False, else: idx = self._shallow_copy() idx._reset_identity() - idx._set_levels(levels, level=level, validate=True, - verify_integrity=verify_integrity) + idx._set_levels( + levels, level=level, validate=True, verify_integrity=verify_integrity + ) if not inplace: return idx @@ -733,29 +771,34 @@ def codes(self): @property def labels(self): - warnings.warn((".labels was deprecated in version 0.24.0. " - "Use .codes instead."), - FutureWarning, stacklevel=2) + warnings.warn( + (".labels was deprecated in version 0.24.0. " "Use .codes instead."), + FutureWarning, + stacklevel=2, + ) return self.codes - def _set_codes(self, codes, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_codes( + self, codes, level=None, copy=False, validate=True, verify_integrity=False + ): if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): - raise ValueError('Length of codes must match length of levels.') + raise ValueError("Length of codes must match length of levels.") if level is None: new_codes = FrozenList( _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() - for lev, level_codes in zip(self.levels, codes)) + for lev, level_codes in zip(self.levels, codes) + ) else: level = [self._get_level_number(l) for l in level] new_codes = list(self._codes) for lev_idx, level_codes in zip(level, codes): lev = self.levels[lev_idx] new_codes[lev_idx] = _ensure_frozen( - level_codes, lev, copy=copy)._shallow_copy() + level_codes, lev, copy=copy + )._shallow_copy() new_codes = FrozenList(new_codes) if verify_integrity: @@ -766,17 +809,24 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, self._tuples = None self._reset_cache() - def set_labels(self, labels, level=None, inplace=False, - verify_integrity=True): - warnings.warn((".set_labels was deprecated in version 0.24.0. 
" - "Use .set_codes instead."), - FutureWarning, stacklevel=2) - return self.set_codes(codes=labels, level=level, inplace=inplace, - verify_integrity=verify_integrity) + def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): + warnings.warn( + ( + ".set_labels was deprecated in version 0.24.0. " + "Use .set_codes instead." + ), + FutureWarning, + stacklevel=2, + ) + return self.set_codes( + codes=labels, + level=level, + inplace=inplace, + verify_integrity=verify_integrity, + ) - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def set_codes(self, codes, level=None, inplace=False, - verify_integrity=True): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -852,9 +902,17 @@ def set_codes(self, codes, level=None, inplace=False, if not inplace: return idx - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def copy(self, names=None, dtype=None, levels=None, codes=None, - deep=False, _set_identity=False, **kwargs): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def copy( + self, + names=None, + dtype=None, + levels=None, + codes=None, + deep=False, + _set_identity=False, + **kwargs + ): """ Make a copy of this object. Names, dtype, levels and codes can be passed and will be set on new copy. @@ -876,11 +934,12 @@ def copy(self, names=None, dtype=None, levels=None, codes=None, ``deep``, but if ``deep`` is passed it will attempt to deepcopy. This could be potentially expensive on large MultiIndex objects. """ - name = kwargs.get('name') + name = kwargs.get("name") names = self._validate_names(name=name, names=names, deep=deep) if deep: from copy import deepcopy + if levels is None: levels = deepcopy(self.levels) if codes is None: @@ -890,9 +949,14 @@ def copy(self, names=None, dtype=None, levels=None, codes=None, levels = self.levels if codes is None: codes = self.codes - return MultiIndex(levels=levels, codes=codes, names=names, - sortorder=self.sortorder, verify_integrity=False, - _set_identity=_set_identity) + return MultiIndex( + levels=levels, + codes=codes, + names=names, + sortorder=self.sortorder, + verify_integrity=False, + _set_identity=_set_identity, + ) def __array__(self, dtype=None): """ the array interface, return my values """ @@ -908,12 +972,14 @@ def _shallow_copy_with_infer(self, values, **kwargs): # On equal MultiIndexes the difference is empty. 
# Therefore, an empty MultiIndex is returned GH13490 if len(values) == 0: - return MultiIndex(levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - **kwargs) + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + **kwargs + ) return self._shallow_copy(values, **kwargs) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): hash(key) try: @@ -922,23 +988,25 @@ def __contains__(self, key): except (LookupError, TypeError, ValueError): return False - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is not None: - names = kwargs.pop('names', kwargs.pop('name', self.names)) + names = kwargs.pop("names", kwargs.pop("name", self.names)) # discards freq - kwargs.pop('freq', None) + kwargs.pop("freq", None) return MultiIndex.from_tuples(values, names=names, **kwargs) return self.copy(**kwargs) @cache_readonly def dtype(self): - return np.dtype('O') + return np.dtype("O") def _is_memory_usage_qualified(self): """ return a boolean if we need a qualified .info display """ + def f(l): - return 'mixed' in l or 'string' in l or 'unicode' in l + return "mixed" in l or "string" in l or "unicode" in l + return any(f(l) for l in self._inferred_type_levels) @Appender(Index.memory_usage.__doc__) @@ -989,8 +1057,9 @@ def _format_data(self, name=None): """ Return the formatted data as a unicode string """ - return format_object_summary(self, self._formatter_func, - name=name, line_break_each_value=True) + return format_object_summary( + self, self._formatter_func, name=name, line_break_each_value=True + ) def _format_attrs(self): """ @@ -998,7 +1067,7 @@ def _format_attrs(self): """ return format_object_attrs(self, include_dtype=False) - def _format_native_types(self, na_rep='nan', **kwargs): + def _format_native_types(self, na_rep="nan", **kwargs): new_levels = [] new_codes = [] @@ -1006,7 +1075,7 @@ def _format_native_types(self, na_rep='nan', **kwargs): for level, level_codes in zip(self.levels, self.codes): level = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any - mask = (level_codes == -1) + mask = level_codes == -1 if mask.any(): nan_index = len(level) level = np.append(level, na_rep) @@ -1017,17 +1086,27 @@ def _format_native_types(self, na_rep='nan', **kwargs): if len(new_levels) == 1: # a single-level multi-index - return Index(new_levels[0].take( - new_codes[0]))._format_native_types() + return Index(new_levels[0].take(new_codes[0]))._format_native_types() else: # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, codes=new_codes, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + mi = MultiIndex( + levels=new_levels, + codes=new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) return mi.values - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): + def format( + self, + space=2, + sparsify=None, + adjoin=True, + names=False, + na_rep=None, + formatter=None, + ): if len(self) == 0: return [] @@ -1048,9 +1127,10 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, else: # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, level_codes)] + formatted = [ + 
pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_1d(lev._values, level_codes) + ] stringified_levels.append(formatted) result_levels = [] @@ -1058,9 +1138,11 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, level = [] if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') + level.append( + pprint_thing(name, escape_chars=("\t", "\r", "\n")) + if name is not None + else "" + ) level.extend(np.array(lev, dtype=object)) result_levels.append(level) @@ -1069,20 +1151,22 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, sparsify = get_option("display.multi_sparse") if sparsify: - sentinel = '' + sentinel = "" # GH3547 # use value of sparsify as sentinel, unless it's an obvious # "Truthy" value if sparsify not in [True, 1]: sentinel = sparsify # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) + result_levels = _sparsify( + result_levels, start=int(names), sentinel=sentinel + ) if adjoin: from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') + return adj.adjoin(space, *result_levels).split("\n") else: return result_levels @@ -1122,14 +1206,15 @@ def _set_names(self, names, level=None, validate=True): # GH 15110 # Don't allow a single string for names in a MultiIndex if names is not None and not is_list_like(names): - raise ValueError('Names should be list-like for a MultiIndex') + raise ValueError("Names should be list-like for a MultiIndex") names = list(names) if validate and level is not None and len(names) != len(level): - raise ValueError('Length of names must match length of level.') + raise ValueError("Length of names must match length of level.") if validate and level is None and len(names) != self.nlevels: - raise ValueError('Length of names must match number of levels in ' - 'MultiIndex.') + raise ValueError( + "Length of names must match number of levels in " "MultiIndex." 
+ ) if level is None: level = range(self.nlevels) @@ -1142,14 +1227,18 @@ def _set_names(self, names, level=None, validate=True): # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): - raise TypeError('{}.name must be a hashable type' - .format(self.__class__.__name__)) + raise TypeError( + "{}.name must be a hashable type".format( + self.__class__.__name__ + ) + ) self.levels[l].rename(name, inplace=True) - names = property(fset=_set_names, fget=_get_names, - doc="""\nNames of levels in MultiIndex\n""") + names = property( + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n""" + ) - @Appender(_index_shared_docs['_get_grouper_for_level']) + @Appender(_index_shared_docs["_get_grouper_for_level"]) def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] level_index = self.levels[level] @@ -1185,29 +1274,34 @@ def _constructor(self): @cache_readonly def inferred_type(self): - return 'mixed' + return "mixed" def _get_level_number(self, level): count = self.names.count(level) if (count > 1) and not is_integer(level): - raise ValueError('The name %s occurs multiple times, use a ' - 'level number' % level) + raise ValueError( + "The name %s occurs multiple times, use a " "level number" % level + ) try: level = self.names.index(level) except ValueError: if not is_integer(level): - raise KeyError('Level %s not found' % str(level)) + raise KeyError("Level %s not found" % str(level)) elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels - raise IndexError('Too many levels: Index has only %d ' - 'levels, %d is not a valid level number' % - (self.nlevels, orig_level)) + raise IndexError( + "Too many levels: Index has only %d " + "levels, %d is not a valid level number" + % (self.nlevels, orig_level) + ) # Note: levels are zero-based elif level >= self.nlevels: - raise IndexError('Too many levels: Index has only %d levels, ' - 'not %d' % (self.nlevels, level + 1)) + raise IndexError( + "Too many levels: Index has only %d levels, " + "not %d" % (self.nlevels, level + 1) + ) return level _tuples = None @@ -1226,7 +1320,7 @@ def _engine(self): # equivalent to sorting lexicographically the codes themselves. Notice # that each level needs to be shifted by the number of bits needed to # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64') + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") # Check the total number of bits needed for our representation: if lev_bits[0] > 64: @@ -1245,8 +1339,7 @@ def values(self): vals = self._get_level_values(i) if is_categorical_dtype(vals): vals = vals._internal_get_values() - if (isinstance(vals.dtype, ExtensionDtype) - or hasattr(vals, '_box_values')): + if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"): vals = vals.astype(object) vals = np.array(vals, copy=False) values.append(vals) @@ -1267,8 +1360,9 @@ def is_monotonic_increasing(self): """ # reversed() because lexsort() wants the most significant key last. 
- values = [self._get_level_values(i).values - for i in reversed(range(len(self.levels)))] + values = [ + self._get_level_values(i).values for i in reversed(range(len(self.levels))) + ] try: sort_order = np.lexsort(values) return Index(sort_order).is_monotonic @@ -1289,7 +1383,7 @@ def is_monotonic_decreasing(self): @cache_readonly def _have_mixed_levels(self): """ return a boolean list indicated if we have mixed levels """ - return ['mixed' in l for l in self._inferred_type_levels] + return ["mixed" in l for l in self._inferred_type_levels] @cache_readonly def _inferred_type_levels(self): @@ -1300,6 +1394,7 @@ def _inferred_type_levels(self): def _hashed_values(self): """ return a uint64 ndarray of my hashed values """ from pandas.core.util.hashing import hash_tuples + return hash_tuples(self) def _hashed_indexing_key(self, key): @@ -1333,12 +1428,14 @@ def f(k, stringify): if stringify and not isinstance(k, str): k = str(k) return k - key = tuple(f(k, stringify) - for k, stringify in zip(key, self._have_mixed_levels)) + + key = tuple( + f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels) + ) return hash_tuple(key) @Appender(Index.duplicated.__doc__) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas.core.sorting import get_group_index from pandas._libs.hashtable import duplicated_int64 @@ -1351,14 +1448,14 @@ def fillna(self, value=None, downcast=None): """ fillna is not implemented for MultiIndex """ - raise NotImplementedError('isna is not defined for MultiIndex') + raise NotImplementedError("isna is not defined for MultiIndex") - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): nans = [level_codes == -1 for level_codes in self.codes] - if how == 'any': + if how == "any": indexer = np.any(nans, axis=0) - elif how == 'all': + elif how == "all": indexer = np.all(nans, axis=0) else: raise ValueError("invalid how option: {0}".format(how)) @@ -1380,8 +1477,9 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return series._constructor(new_values, index=new_index, - name=series.name).__finalize__(self) + return series._constructor( + new_values, index=new_index, name=series.name + ).__finalize__(self) try: return self._engine.get_value(s, k) @@ -1419,8 +1517,13 @@ def _try_mi(k): try: return _try_mi(Timestamp(key)) - except (KeyError, TypeError, - IndexError, ValueError, tslibs.OutOfBoundsDatetime): + except ( + KeyError, + TypeError, + IndexError, + ValueError, + tslibs.OutOfBoundsDatetime, + ): pass raise InvalidIndexError(key) @@ -1447,8 +1550,7 @@ def _get_level_values(self, level, unique=False): level_codes = self.codes[level] if unique: level_codes = algos.unique(level_codes) - filled = algos.take_1d(values._values, level_codes, - fill_value=values._na_value) + filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) values = values._shallow_copy(filled) return values @@ -1488,7 +1590,7 @@ def get_level_values(self, level): values = self._get_level_values(level) return values - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is None: @@ -1528,26 +1630,31 @@ def to_frame(self, index=True, name=None): """ from pandas import DataFrame + if name is not None: if not is_list_like(name): - raise TypeError("'name' must be a list / 
sequence " - "of column names.") + raise TypeError("'name' must be a list / sequence " "of column names.") if len(name) != len(self.levels): - raise ValueError("'name' should have same length as " - "number of levels on index.") + raise ValueError( + "'name' should have same length as " "number of levels on index." + ) idx_names = name else: idx_names = self.names # Guarantee resulting column order result = DataFrame( - OrderedDict([ - ((level if lvlname is None else lvlname), - self._get_level_values(level)) - for lvlname, level in zip(idx_names, range(len(self.levels))) - ]), - copy=False + OrderedDict( + [ + ( + (level if lvlname is None else lvlname), + self._get_level_values(level), + ) + for lvlname, level in zip(idx_names, range(len(self.levels))) + ] + ), + copy=False, ) if index: @@ -1598,14 +1705,16 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): ) """ levels = self.levels - codes = [np.repeat(level_codes, n_repeat) for - level_codes in self.codes] + codes = [np.repeat(level_codes, n_repeat) for level_codes in self.codes] # Assumes that each level_codes is divisible by n_shuffle - codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes] + codes = [x.reshape(n_shuffle, -1).ravel(order="F") for x in codes] names = self.names - warnings.warn("Method .to_hierarchical is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "Method .to_hierarchical is deprecated and will " + "be removed in a future version", + FutureWarning, + stacklevel=2, + ) return MultiIndex(levels=levels, codes=codes, names=names) def to_flat_index(self): @@ -1728,9 +1837,13 @@ def _sort_levels_monotonic(self): new_levels.append(lev) new_codes.append(level_codes) - return MultiIndex(new_levels, new_codes, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + return MultiIndex( + new_levels, + new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) def remove_unused_levels(self): """ @@ -1835,19 +1948,22 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" - d = dict(levels=[lev for lev in self.levels], - codes=[level_codes for level_codes in self.codes], - sortorder=self.sortorder, names=list(self.names)) + d = dict( + levels=[lev for lev in self.levels], + codes=[level_codes for level_codes in self.codes], + sortorder=self.sortorder, + names=list(self.names), + ) return ibase._new_Index, (self.__class__, d), None def __setstate__(self, state): """Necessary for making this object picklable""" if isinstance(state, dict): - levels = state.get('levels') - codes = state.get('codes') - sortorder = state.get('sortorder') - names = state.get('names') + levels = state.get("levels") + codes = state.get("codes") + sortorder = state.get("sortorder") + names = state.get("names") elif isinstance(state, tuple): @@ -1887,30 +2003,40 @@ def __getitem__(self, key): new_codes = [level_codes[key] for level_codes in self.codes] - return MultiIndex(levels=self.levels, codes=new_codes, - names=self.names, sortorder=sortorder, - verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): 
nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.codes, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1) - return MultiIndex(levels=self.levels, codes=taken, - names=self.names, verify_integrity=False) - - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=None): + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=None + ): """ Internal method to handle NA filling of take """ # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) raise ValueError(msg) taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 @@ -1940,8 +2066,9 @@ def append(self, other): if not isinstance(other, (list, tuple)): other = [other] - if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) - for o in other): + if all( + (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other + ): arrays = [] for i in range(self.nlevels): label = self._get_level_values(i) @@ -1949,7 +2076,7 @@ def append(self, other): arrays.append(label.append(appended)) return MultiIndex.from_arrays(arrays, names=self.names) - to_concat = (self.values, ) + tuple(k._values for k in other) + to_concat = (self.values,) + tuple(k._values for k in other) new_tuples = np.concatenate(to_concat) # if all(isinstance(x, MultiIndex) for x in other): @@ -1961,21 +2088,27 @@ def append(self, other): def argsort(self, *args, **kwargs): return self.values.argsort(*args, **kwargs) - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) - return MultiIndex(levels=self.levels, - codes=[level_codes.view(np.ndarray).repeat(repeats) - for level_codes in self.codes], - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[ + level_codes.view(np.ndarray).repeat(repeats) + for level_codes in self.codes + ], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) def where(self, cond, other=None): - raise NotImplementedError(".where is not supported for " - "MultiIndex operations") + raise NotImplementedError( + ".where is not supported for " "MultiIndex operations" + ) - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def drop(self, codes, level=None, errors='raise'): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def drop(self, codes, level=None, errors="raise"): """ Make new MultiIndex with passed list of codes deleted @@ -1998,9 +2131,8 @@ def drop(self, codes, level=None, errors='raise'): indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any(): - if errors != 'ignore': - raise ValueError('codes %s not contained in axis' % - codes[mask]) + if errors != "ignore": + raise ValueError("codes %s not contained in axis" % codes[mask]) except Exception: pass @@ -2016,18 +2148,20 @@ def drop(self, 
codes, level=None, errors='raise'): inds.extend(range(loc.start, loc.stop)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: - warnings.warn('dropping on a non-lexsorted multi-index' - ' without a level parameter may impact ' - 'performance.', - PerformanceWarning, - stacklevel=3) + warnings.warn( + "dropping on a non-lexsorted multi-index" + " without a level parameter may impact " + "performance.", + PerformanceWarning, + stacklevel=3, + ) loc = loc.nonzero()[0] inds.extend(loc) else: - msg = 'unsupported indexer of type {}'.format(type(loc)) + msg = "unsupported indexer of type {}".format(type(loc)) raise AssertionError(msg) except KeyError: - if errors != 'ignore': + if errors != "ignore": raise return self.delete(inds) @@ -2101,8 +2235,9 @@ def swaplevel(self, i=-2, j=-1): new_codes[i], new_codes[j] = new_codes[j], new_codes[i] new_names[i], new_names[j] = new_names[j], new_names[i] - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def reorder_levels(self, order): """ @@ -2117,15 +2252,17 @@ def reorder_levels(self, order): """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: - raise AssertionError('Length of order must be same as ' - 'number of levels (%d), got %d' % - (self.nlevels, len(order))) + raise AssertionError( + "Length of order must be same as " + "number of levels (%d), got %d" % (self.nlevels, len(order)) + ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] new_names = [self.names[i] for i in order] - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def __getslice__(self, i, j): return self.__getitem__(slice(i, j)) @@ -2141,13 +2278,15 @@ def _get_codes_for_sorting(self): from pandas.core.arrays import Categorical def cats(level_codes): - return np.arange(np.array(level_codes).max() + 1 if - len(level_codes) else 0, - dtype=level_codes.dtype) + return np.arange( + np.array(level_codes).max() + 1 if len(level_codes) else 0, + dtype=level_codes.dtype, + ) - return [Categorical.from_codes(level_codes, cats(level_codes), - ordered=True) - for level_codes in self.codes] + return [ + Categorical.from_codes(level_codes, cats(level_codes), ordered=True) + for level_codes in self.codes + ] def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -2184,8 +2323,10 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): raise ValueError("level must have same length as ascending") from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer([self.codes[lev] for lev in level], - orders=ascending) + + indexer = lexsort_indexer( + [self.codes[lev] for lev in level], orders=ascending + ) # level ordering else: @@ -2209,8 +2350,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): else: sortorder = level[0] - indexer = indexer_from_factorized(primary, primshp, - compress=False) + indexer = indexer_from_factorized(primary, primshp, compress=False) if not ascending: indexer = indexer[::-1] @@ -2218,9 +2358,13 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): indexer = ensure_platform_int(indexer) new_codes = [level_codes.take(indexer) for level_codes in self.codes] - new_index = MultiIndex(codes=new_codes, levels=self.levels, - 
names=self.names, sortorder=sortorder, - verify_integrity=False) + new_index = MultiIndex( + codes=new_codes, + levels=self.levels, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) return new_index, indexer @@ -2240,8 +2384,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None): indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind) # are we indexing a specific level - if indexer is None and len(keyarr) and not isinstance(keyarr[0], - tuple): + if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple): level = 0 _, indexer = self.reindex(keyarr, level=level) @@ -2252,11 +2395,11 @@ def _convert_listlike_indexer(self, keyarr, kind=None): check = self.levels[0].get_indexer(keyarr) mask = check == -1 if mask.any(): - raise KeyError('%s not in index' % keyarr[mask]) + raise KeyError("%s not in index" % keyarr[mask]) return indexer, keyarr - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ensure_index(target) @@ -2272,34 +2415,36 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): # let's instead try with a straight Index if method is None: - return Index(self.values).get_indexer(target, - method=method, - limit=limit, - tolerance=tolerance) + return Index(self.values).get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) if not self.is_unique: - raise ValueError('Reindexing only valid with uniquely valued ' - 'Index objects') + raise ValueError( + "Reindexing only valid with uniquely valued " "Index objects" + ) - if method == 'pad' or method == 'backfill': + if method == "pad" or method == "backfill": if tolerance is not None: - raise NotImplementedError("tolerance not implemented yet " - 'for MultiIndex') + raise NotImplementedError( + "tolerance not implemented yet " "for MultiIndex" + ) indexer = self._engine.get_indexer(target, method, limit) - elif method == 'nearest': - raise NotImplementedError("method='nearest' not implemented yet " - 'for MultiIndex; see GitHub issue 9365') + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " + "for MultiIndex; see GitHub issue 9365" + ) else: indexer = self._engine.get_indexer(target) return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): return super().get_indexer_non_unique(target) - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -2313,11 +2458,11 @@ def reindex(self, target, method=None, level=None, limit=None, """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'names') + preserve_names = not hasattr(target, "names") if level is not None: if method is not None: - raise TypeError('Fill method not supported if level passed') + raise TypeError("Fill method not supported if level passed") # GH7774: preserve dtype/tz if target is empty and not an Index. 
# target may be an iterator @@ -2325,23 +2470,22 @@ def reindex(self, target, method=None, level=None, limit=None, if len(target) == 0 and not isinstance(target, Index): idx = self.levels[level] attrs = idx._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), - **attrs) + attrs.pop("freq", None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) else: target = ensure_index(target) - target, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True, - keep_order=False) + target, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True, keep_order=False + ) else: target = ensure_index(target) if self.equals(target): indexer = None else: if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) else: raise ValueError("cannot handle a non-unique multi-index!") @@ -2354,8 +2498,11 @@ def reindex(self, target, method=None, level=None, limit=None, # hopefully? target = MultiIndex.from_tuples(target) - if (preserve_names and target.nlevels == self.nlevels and - target.names != self.names): + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): target = target.copy(deep=False) target.names = self.names @@ -2364,7 +2511,7 @@ def reindex(self, target, method=None, level=None, limit=None, def get_slice_bound(self, label, side, kind): if not isinstance(label, tuple): - label = label, + label = (label,) return self._partial_tup_index(label, side=side) def slice_locs(self, start=None, end=None, step=None, kind=None): @@ -2423,12 +2570,12 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # happens in get_slice_bound method), but it adds meaningful doc. 
return super().slice_locs(start, end, step, kind=kind) - def _partial_tup_index(self, tup, side='left'): + def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - 'Key length (%d) was greater than MultiIndex' - ' lexsort depth (%d)' % - (len(tup), self.lexsort_depth)) + "Key length (%d) was greater than MultiIndex" + " lexsort depth (%d)" % (len(tup), self.lexsort_depth) + ) n = len(tup) start, end = 0, len(self) @@ -2437,20 +2584,19 @@ def _partial_tup_index(self, tup, side='left'): section = labs[start:end] if lab not in lev: - if not lev.is_type_compatible(lib.infer_dtype([lab], - skipna=False)): - raise TypeError('Level type mismatch: %s' % lab) + if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): + raise TypeError("Level type mismatch: %s" % lab) # short circuit loc = lev.searchsorted(lab, side=side) - if side == 'right' and loc >= 0: + if side == "right" and loc >= 0: loc -= 1 return start + section.searchsorted(loc, side=side) idx = lev.get_loc(lab) if k < n - 1: - end = start + section.searchsorted(idx, side='right') - start = start + section.searchsorted(idx, side='left') + end = start + section.searchsorted(idx, side="right") + start = start + section.searchsorted(idx, side="left") else: return start + section.searchsorted(idx, side=side) @@ -2495,19 +2641,21 @@ def get_loc(self, key, method=None): 1 """ if method is not None: - raise NotImplementedError('only the default get_loc method is ' - 'currently supported for MultiIndex') + raise NotImplementedError( + "only the default get_loc method is " + "currently supported for MultiIndex" + ) def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" - if not isinstance(loc, np.ndarray) or loc.dtype != 'int64': + if not isinstance(loc, np.ndarray) or loc.dtype != "int64": return loc loc = lib.maybe_indices_to_slice(loc, len(self)) if isinstance(loc, slice): return loc - mask = np.empty(len(self), dtype='bool') + mask = np.empty(len(self), dtype="bool") mask.fill(False) mask[loc] = True return mask @@ -2518,8 +2666,10 @@ def _maybe_to_slice(loc): keylen = len(key) if self.nlevels < keylen: - raise KeyError('Key length ({0}) exceeds index depth ({1})' - ''.format(keylen, self.nlevels)) + raise KeyError( + "Key length ({0}) exceeds index depth ({1})" + "".format(keylen, self.nlevels) + ) if keylen == self.nlevels and self.is_unique: return self._engine.get_loc(key) @@ -2530,8 +2680,9 @@ def _maybe_to_slice(loc): # needs linear search within the slice i = self.lexsort_depth lead_key, follow_key = key[:i], key[i:] - start, stop = (self.slice_locs(lead_key, lead_key) - if lead_key else (0, len(self))) + start, stop = ( + self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self)) + ) if start == stop: raise KeyError(key) @@ -2539,10 +2690,13 @@ def _maybe_to_slice(loc): if not follow_key: return slice(start, stop) - warnings.warn('indexing past lexsort depth may impact performance.', - PerformanceWarning, stacklevel=10) + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=10, + ) - loc = np.arange(start, stop, dtype='int64') + loc = np.arange(start, stop, dtype="int64") for i, k in enumerate(follow_key, len(lead_key)): mask = self.codes[i][loc] == self.levels[i].get_loc(k) @@ -2551,8 +2705,7 @@ def _maybe_to_slice(loc): if not len(loc): raise KeyError(key) - return (_maybe_to_slice(loc) if len(loc) != stop - start else - slice(start, stop)) + return _maybe_to_slice(loc) 
if len(loc) != stop - start else slice(start, stop) def get_loc_level(self, key, level=0, drop_level=True): """ @@ -2612,8 +2765,9 @@ def maybe_droplevels(indexer, levels, drop_level): if isinstance(level, (tuple, list)): if len(key) != len(level): - raise AssertionError('Key for location must have same ' - 'length as number of levels') + raise AssertionError( + "Key for location must have same " "length as number of levels" + ) result = None for lev, k in zip(level, key): loc, new_index = self.get_loc_level(k, level=lev) @@ -2649,10 +2803,10 @@ def maybe_droplevels(indexer, levels, drop_level): def partial_selection(key, indexer=None): if indexer is None: indexer = self.get_loc(key) - ilevels = [i for i in range(len(key)) - if key[i] != slice(None, None)] - return indexer, maybe_droplevels(indexer, ilevels, - drop_level) + ilevels = [ + i for i in range(len(key)) if key[i] != slice(None, None) + ] + return indexer, maybe_droplevels(indexer, ilevels, drop_level) if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc @@ -2683,8 +2837,7 @@ def partial_selection(key, indexer=None): indexer &= k_index if indexer is None: indexer = slice(None, None) - ilevels = [i for i in range(len(key)) - if key[i] != slice(None, None)] + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] return indexer, maybe_droplevels(indexer, ilevels, drop_level) else: indexer = self._get_level_indexer(key, level=level) @@ -2698,8 +2851,7 @@ def _get_level_indexer(self, key, level=0, indexer=None): level_index = self.levels[level] level_codes = self.codes[level] - def convert_indexer(start, stop, step, indexer=indexer, - codes=level_codes): + def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set @@ -2714,6 +2866,7 @@ def convert_indexer(start, stop, step, indexer=indexer, # that the result are the mappings to the set that we have # selected from pandas import Series + mapper = Series(indexer) indexer = codes.take(ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) @@ -2721,8 +2874,7 @@ def convert_indexer(start, stop, step, indexer=indexer, else: m = np.zeros(len(codes), dtype=bool) - m[np.in1d(codes, r, - assume_unique=Index(codes).is_unique)] = True + m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True return m @@ -2744,8 +2896,9 @@ def convert_indexer(start, stop, step, indexer=indexer, # we have a partial slice (like looking up a partial date # string) - start = stop = level_index.slice_indexer(key.start, key.stop, - key.step, kind='loc') + start = stop = level_index.slice_indexer( + key.start, key.stop, key.step, kind="loc" + ) step = start.step if isinstance(start, slice) or isinstance(stop, slice): @@ -2753,8 +2906,8 @@ def convert_indexer(start, stop, step, indexer=indexer, # a partial date slicer on a DatetimeIndex generates a slice # note that the stop ALREADY includes the stopped point (if # it was a string sliced) - start = getattr(start, 'start', start) - stop = getattr(stop, 'stop', stop) + start = getattr(start, "start", start) + stop = getattr(stop, "stop", stop) return convert_indexer(start, stop, step) elif level > 0 or self.lexsort_depth == 0 or step is not None: @@ -2764,8 +2917,8 @@ def convert_indexer(start, stop, step, indexer=indexer, return convert_indexer(start, stop + 1, step) else: # sorted, so can return slice object 
-> view - i = level_codes.searchsorted(start, side='left') - j = level_codes.searchsorted(stop, side='right') + i = level_codes.searchsorted(start, side="left") + j = level_codes.searchsorted(stop, side="right") return slice(i, j, step) else: @@ -2780,8 +2933,8 @@ def convert_indexer(start, stop, step, indexer=indexer, raise KeyError(key) return locs - i = level_codes.searchsorted(code, side='left') - j = level_codes.searchsorted(code, side='right') + i = level_codes.searchsorted(code, side="left") + j = level_codes.searchsorted(code, side="right") if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) @@ -2826,10 +2979,11 @@ def get_locs(self, seq): # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] if true_slices and true_slices[-1] >= self.lexsort_depth: - raise UnsortedIndexError('MultiIndex slicing requires the index ' - 'to be lexsorted: slicing on levels {0}, ' - 'lexsort depth {1}' - .format(true_slices, self.lexsort_depth)) + raise UnsortedIndexError( + "MultiIndex slicing requires the index " + "to be lexsorted: slicing on levels {0}, " + "lexsort depth {1}".format(true_slices, self.lexsort_depth) + ) # indexer # this is the list of all values that we want to select n = len(self) @@ -2843,9 +2997,11 @@ def _convert_to_indexer(r): r = m.nonzero()[0] elif com.is_bool_indexer(r): if len(r) != n: - raise ValueError("cannot index with a boolean indexer " - "that is not the same length as the " - "index") + raise ValueError( + "cannot index with a boolean indexer " + "that is not the same length as the " + "index" + ) r = r.nonzero()[0] return Int64Index(r) @@ -2861,8 +3017,7 @@ def _update_indexer(idxr, indexer=indexer): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! 
k = np.asarray(k) - indexer = _update_indexer(_convert_to_indexer(k), - indexer=indexer) + indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) elif is_list_like(k): # a collection of labels to include from this level (these @@ -2871,10 +3026,9 @@ def _update_indexer(idxr, indexer=indexer): for x in k: try: idxrs = _convert_to_indexer( - self._get_level_indexer(x, level=i, - indexer=indexer)) - indexers = (idxrs if indexers is None - else indexers | idxrs) + self._get_level_indexer(x, level=i, indexer=indexer) + ) + indexers = idxrs if indexers is None else indexers | idxrs except KeyError: # ignore not founds @@ -2893,14 +3047,20 @@ def _update_indexer(idxr, indexer=indexer): elif isinstance(k, slice): # a slice, include BOTH of the labels - indexer = _update_indexer(_convert_to_indexer( - self._get_level_indexer(k, level=i, indexer=indexer)), - indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer( + self._get_level_indexer(k, level=i, indexer=indexer) + ), + indexer=indexer, + ) else: # a single label - indexer = _update_indexer(_convert_to_indexer( - self.get_loc_level(k, level=i, drop_level=False)[0]), - indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer( + self.get_loc_level(k, level=i, drop_level=False)[0] + ), + indexer=indexer, + ) # empty indexer if indexer is None: @@ -2923,7 +3083,7 @@ def truncate(self, before=None, after=None): truncated : MultiIndex """ if after and before and after < before: - raise ValueError('after < before') + raise ValueError("after < before") i, j = self.levels[0].slice_locs(before, after) left, right = self.slice_locs(before, after) @@ -2934,8 +3094,7 @@ def truncate(self, before=None, after=None): new_codes = [level_codes[left:right] for level_codes in self.codes] new_codes[0] = new_codes[0] - i - return MultiIndex(levels=new_levels, codes=new_codes, - verify_integrity=False) + return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) def equals(self, other): """ @@ -2965,14 +3124,15 @@ def equals(self, other): for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] - self_values = algos.take_nd(np.asarray(self.levels[i]._values), - self_codes, allow_fill=False) + self_values = algos.take_nd( + np.asarray(self.levels[i]._values), self_codes, allow_fill=False + ) other_codes = other.codes[i] other_codes = other_codes[other_codes != -1] other_values = algos.take_nd( - np.asarray(other.levels[i]._values), - other_codes, allow_fill=False) + np.asarray(other.levels[i]._values), other_codes, allow_fill=False + ) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -3041,12 +3201,13 @@ def union(self, other, sort=None): # TODO: Index.union returns other when `len(self)` is 0. 
- uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, - other._ndarray_values], - sort=sort) + uniq_tuples = lib.fast_unique_multiple( + [self._ndarray_values, other._ndarray_values], sort=sort + ) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, - names=result_names) + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) def intersection(self, other, sort=False): """ @@ -3084,12 +3245,16 @@ def intersection(self, other, sort=False): uniq_tuples = sorted(uniq_tuples) if len(uniq_tuples) == 0: - return MultiIndex(levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) else: - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, - names=result_names) + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) def difference(self, other, sort=None): """ @@ -3120,38 +3285,43 @@ def difference(self, other, sort=None): return self if self.equals(other): - return MultiIndex(levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) this = self._get_unique_index() indexer = this.get_indexer(other) indexer = indexer.take((indexer != -1).nonzero()[0]) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) difference = this.values.take(label_diff) if sort is None: difference = sorted(difference) if len(difference) == 0: - return MultiIndex(levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) else: - return MultiIndex.from_tuples(difference, sortorder=0, - names=result_names) + return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): - msg = '> 1 ndim Categorical are not supported at this time' + msg = "> 1 ndim Categorical are not supported at this time" raise NotImplementedError(msg) elif not is_object_dtype(dtype): - msg = ('Setting {cls} dtype to anything other than object ' - 'is not supported').format(cls=self.__class__) + msg = ( + "Setting {cls} dtype to anything other than object " "is not supported" + ).format(cls=self.__class__) raise TypeError(msg) elif copy is True: return self._shallow_copy() @@ -3160,13 +3330,15 @@ def astype(self, dtype, copy=True): def _convert_can_do_setop(self, other): result_names = self.names - if not hasattr(other, 'names'): + if not hasattr(other, "names"): if len(other) == 0: - other = MultiIndex(levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - verify_integrity=False) + other = MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + verify_integrity=False, + ) else: - msg = 'other must be a MultiIndex or a list of tuples' + msg = "other must be a MultiIndex or a list of tuples" try: other = MultiIndex.from_tuples(other) except TypeError: @@ -3192,10 +3364,9 @@ def insert(self, loc, item): # Pad the key with empty strings if lower levels of the 
key # aren't specified: if not isinstance(item, tuple): - item = (item, ) + ('', ) * (self.nlevels - 1) + item = (item,) + ("",) * (self.nlevels - 1) elif len(item) != self.nlevels: - raise ValueError('Item must have length equal to number of ' - 'levels.') + raise ValueError("Item must have length equal to number of " "levels.") new_levels = [] new_codes = [] @@ -3210,11 +3381,11 @@ def insert(self, loc, item): lev_loc = level.get_loc(k) new_levels.append(level) - new_codes.append(np.insert( - ensure_int64(level_codes), loc, lev_loc)) + new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) - return MultiIndex(levels=new_levels, codes=new_codes, - names=self.names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) def delete(self, loc): """ @@ -3225,8 +3396,12 @@ def delete(self, loc): new_index : MultiIndex """ new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] - return MultiIndex(levels=self.levels, codes=new_codes, - names=self.names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) def _wrap_joined_index(self, joined, other): names = self.names if self.names == other.names else None @@ -3235,8 +3410,7 @@ def _wrap_joined_index(self, joined, other): @Appender(Index.isin.__doc__) def isin(self, values, level=None): if level is None: - values = MultiIndex.from_tuples(values, - names=self.names).values + values = MultiIndex.from_tuples(values, names=self.names).values return algos.isin(self.values, values) else: num = self._get_level_number(level) @@ -3255,14 +3429,14 @@ def isin(self, values, level=None): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start=0, sentinel=''): +def _sparsify(label_list, start=0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) - result = pivoted[:start + 1] + result = pivoted[: start + 1] prev = pivoted[start] - for cur in pivoted[start + 1:]: + for cur in pivoted[start + 1 :]: sparse_cur = [] for i, (p, t) in enumerate(zip(prev, cur)): @@ -3284,4 +3458,4 @@ def _sparsify(label_list, start=0, sentinel=''): def _get_na_rep(dtype): - return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN') + return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5f9c1f22887cc8..daf26d53aa6e22 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -6,18 +6,29 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, - is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion, - pandas_dtype) + is_bool, + is_bool_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer_dtype, + is_scalar, + needs_i8_conversion, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( - ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index) + ABCFloat64Index, + ABCInt64Index, + ABCRangeIndex, + ABCUInt64Index, +) from pandas.core.dtypes.missing import isna from pandas.core import algorithms import pandas.core.common as com -from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs) +from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs from 
pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -30,15 +41,18 @@ class NumericIndex(Index): This is an abstract class """ + _is_numeric_dtype = True - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=None): + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name=name) @@ -54,18 +68,18 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: subarr = data - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name return cls._simple_new(subarr, name=name) - @Appender(_index_shared_docs['_maybe_cast_slice_bound']) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is not None and not self._can_hold_na: # Ensure we are not returning an Int64Index with float data: @@ -85,17 +99,24 @@ def _convert_for_op(self, value): def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(tolerance) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") if not np.issubdtype(tolerance.dtype, np.number): if tolerance.ndim > 0: - raise ValueError(('tolerance argument for %s must contain ' - 'numeric elements if it is list type') % - (type(self).__name__,)) + raise ValueError( + ( + "tolerance argument for %s must contain " + "numeric elements if it is list type" + ) + % (type(self).__name__,) + ) else: - raise ValueError(('tolerance argument for %s must be numeric ' - 'if it is a scalar: %r') % - (type(self).__name__, tolerance)) + raise ValueError( + ( + "tolerance argument for %s must be numeric " + "if it is a scalar: %r" + ) + % (type(self).__name__, tolerance) + ) return tolerance @classmethod @@ -131,9 +152,8 @@ def _union(self, other, sort): # float | [u]int -> float (the special case) # | -> T # | -> object - needs_cast = ( - (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or - (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype)) + needs_cast = (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or ( + is_integer_dtype(other.dtype) and is_float_dtype(self.dtype) ) if needs_cast: first = self.astype("float") @@ -143,7 +163,9 @@ def _union(self, other, sort): return super()._union(other, sort) -_num_index_shared_docs['class_descr'] = """ +_num_index_shared_docs[ + "class_descr" +] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects. %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s @@ -174,12 +196,7 @@ def _union(self, other, sort): An Index instance can **only** contain hashable objects. 
""" -_int64_descr_args = dict( - klass='Int64Index', - ltype='integer', - dtype='int64', - extra='' -) +_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") class IntegerIndex(NumericIndex): @@ -201,9 +218,9 @@ def __contains__(self, key): class Int64Index(IntegerIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args - _typ = 'int64index' + _typ = "int64index" _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.int64 @@ -211,19 +228,19 @@ class Int64Index(IntegerIndex): @property def inferred_type(self): """Always 'integer' for ``Int64Index``""" - return 'integer' + return "integer" @property def asi8(self): # do not cache or you'll create a memory leak - return self.values.view('i8') + return self.values.view("i8") - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # don't coerce ilocs to integers - if kind != 'iloc': + if kind != "iloc": key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) @@ -238,16 +255,12 @@ def _assert_safe_casting(cls, data, subarr): """ if not issubclass(data.dtype.type, np.signedinteger): if not np.array_equal(data, subarr): - raise TypeError('Unsafe NumPy casting, you must ' - 'explicitly cast') + raise TypeError("Unsafe NumPy casting, you must " "explicitly cast") def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, - ABCFloat64Index, - ABCRangeIndex)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) + for obj in [self, other] ) @@ -255,17 +268,14 @@ def _is_compatible_with_other(self, other): Int64Index._add_logical_methods() _uint64_descr_args = dict( - klass='UInt64Index', - ltype='unsigned integer', - dtype='uint64', - extra='' + klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" ) class UInt64Index(IntegerIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args - _typ = 'uint64index' + _typ = "uint64index" _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 @@ -273,23 +283,23 @@ class UInt64Index(IntegerIndex): @property def inferred_type(self): """Always 'integer' for ``UInt64Index``""" - return 'integer' + return "integer" @property def asi8(self): # do not cache or you'll create a memory leak - return self.values.view('u8') + return self.values.view("u8") - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # don't coerce ilocs to integers - if kind != 'iloc': + if kind != "iloc": key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the 
values returned from indexing are @@ -299,7 +309,7 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are @@ -319,15 +329,12 @@ def _assert_safe_casting(cls, data, subarr): """ if not issubclass(data.dtype.type, np.unsignedinteger): if not np.array_equal(data, subarr): - raise TypeError('Unsafe NumPy casting, you must ' - 'explicitly cast') + raise TypeError("Unsafe NumPy casting, you must " "explicitly cast") def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCUInt64Index, - ABCFloat64Index)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCUInt64Index, ABCFloat64Index)) + for obj in [self, other] ) @@ -335,67 +342,73 @@ def _is_compatible_with_other(self, other): UInt64Index._add_logical_methods() _float64_descr_args = dict( - klass='Float64Index', - dtype='float64', - ltype='float', - extra='' + klass="Float64Index", dtype="float64", ltype="float", extra="" ) class Float64Index(NumericIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _float64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _float64_descr_args - _typ = 'float64index' + _typ = "float64index" _engine_type = libindex.Float64Engine _default_dtype = np.float64 @property def inferred_type(self): """Always 'floating' for ``Float64Index``""" - return 'floating' + return "floating" - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): - msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' - 'values are required for conversion').format(dtype=dtype) + msg = ( + "Cannot convert Float64Index to dtype {dtype}; integer " + "values are required for conversion" + ).format(dtype=dtype) raise TypeError(msg) - elif (is_integer_dtype(dtype) and - not is_extension_array_dtype(dtype)) and self.hasnans: + elif ( + is_integer_dtype(dtype) and not is_extension_array_dtype(dtype) + ) and self.hasnans: # TODO(jreback); this can change once we have an EA Index type # GH 13149 - raise ValueError('Cannot convert NA to integer') + raise ValueError("Cannot convert NA to integer") return super().astype(dtype, copy=copy) - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + if kind == "iloc": + return self._validate_indexer("positional", key, kind) return key - @Appender(_index_shared_docs['_convert_slice_indexer']) + @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key, kind=None): # if we are not a slice, then we are done if not isinstance(key, slice): return key - if kind == 'iloc': + if kind == "iloc": return super()._convert_slice_indexer(key, kind=kind) # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) - def _format_native_types(self, na_rep='', float_format=None, 
decimal='.', - quoting=None, **kwargs): + def _format_native_types( + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs + ): from pandas.io.formats.format import FloatArrayFormatter - formatter = FloatArrayFormatter(self.values, na_rep=na_rep, - float_format=float_format, - decimal=decimal, quoting=quoting, - fixed_width=False) + + formatter = FloatArrayFormatter( + self.values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) return formatter.get_result_as_array() def get_value(self, series, key): @@ -424,8 +437,7 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if (not is_dtype_equal(self.dtype, other.dtype) or - self.shape != other.shape): + if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: return False left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() @@ -451,7 +463,7 @@ def __contains__(self, other): return False - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): try: if np.all(np.isnan(key)) or is_bool(key): @@ -477,13 +489,12 @@ def isin(self, values, level=None): return algorithms.isin(np.array(self), values) def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, - ABCFloat64Index, - ABCUInt64Index, - ABCRangeIndex)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance( + type(obj), + (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + ) + for obj in [self, other] ) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f61b2e679f0c81..0013df44614e86 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -4,26 +4,31 @@ import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs import ( - NaT, frequencies as libfrequencies, iNaT, resolution) -from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period) +from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution +from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, pandas_dtype) + is_bool_dtype, + is_datetime64_any_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + pandas_dtype, +) from pandas.core import common as com from pandas.core.accessor import delegate_names from pandas.core.algorithms import unique1d -from pandas.core.arrays.period import ( - PeriodArray, period_array, validate_dtype_freq) +from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, +) from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name @@ -33,8 +38,7 @@ from 
pandas.tseries.offsets import DateOffset, Tick _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update( - dict(target_klass='PeriodIndex or list of Periods')) +_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) # --- Period index sketch @@ -42,9 +46,9 @@ def _new_PeriodIndex(cls, **d): # GH13277 for unpickling - values = d.pop('data') - if values.dtype == 'int64': - freq = d.pop('freq', None) + values = d.pop("data") + if values.dtype == "int64": + freq = d.pop("freq", None) values = PeriodArray(values, freq=freq) return cls._simple_new(values, **d) else: @@ -55,21 +59,17 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): """ Delegate from PeriodIndex to PeriodArray. """ + _delegate_class = PeriodArray _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = ( - set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'} - ) - _raw_properties = {'is_leap_year'} - - -@delegate_names(PeriodArray, - PeriodDelegateMixin._delegated_properties, - typ='property') -@delegate_names(PeriodArray, - PeriodDelegateMixin._delegated_methods, - typ="method", - overwrite=True) + _delegated_methods = set(PeriodArray._datetimelike_methods) | {"_addsub_int_array"} + _raw_properties = {"is_leap_year"} + + +@delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") +@delegate_names( + PeriodArray, PeriodDelegateMixin._delegated_methods, typ="method", overwrite=True +) class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ Immutable ndarray holding ordinal values indicating regular periods in @@ -161,8 +161,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): -------- >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr) """ - _typ = 'periodindex' - _attributes = ['name', 'freq'] + + _typ = "periodindex" + _attributes = ["name", "freq"] # define my properties & methods for delegation _is_numeric_dtype = False @@ -175,39 +176,59 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): # ------------------------------------------------------------------------ # Index Constructors - def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, - periods=None, tz=None, dtype=None, copy=False, name=None, - **fields): - - valid_field_set = {'year', 'month', 'day', 'quarter', - 'hour', 'minute', 'second'} + def __new__( + cls, + data=None, + ordinal=None, + freq=None, + start=None, + end=None, + periods=None, + tz=None, + dtype=None, + copy=False, + name=None, + **fields + ): + + valid_field_set = { + "year", + "month", + "day", + "quarter", + "hour", + "minute", + "second", + } if not set(fields).issubset(valid_field_set): - raise TypeError('__new__() got an unexpected keyword argument {}'. - format(list(set(fields) - valid_field_set)[0])) + raise TypeError( + "__new__() got an unexpected keyword argument {}".format( + list(set(fields) - valid_field_set)[0] + ) + ) - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if data is None and ordinal is None: # range-based. - data, freq2 = PeriodArray._generate_range(start, end, periods, - freq, fields) + data, freq2 = PeriodArray._generate_range(start, end, periods, freq, fields) # PeriodArray._generate range does validate that fields is # empty when really using the range-based constructor. if not fields: - msg = ("Creating a PeriodIndex by passing range " - "endpoints is deprecated. 
Use " - "`pandas.period_range` instead.") + msg = ( + "Creating a PeriodIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.period_range` instead." + ) # period_range differs from PeriodIndex for cases like # start="2000", periods=4 # PeriodIndex interprets that as A-DEC freq. # period_range interprets it as 'D' freq. - cond = ( - freq is None and ( - (start and not isinstance(start, Period)) or - (end and not isinstance(end, Period)) - ) + cond = freq is None and ( + (start and not isinstance(start, Period)) + or (end and not isinstance(end, Period)) ) if cond: msg += ( @@ -291,11 +312,12 @@ def freq(self, value): value = Period._maybe_convert_freq(value) # TODO: When this deprecation is enforced, PeriodIndex.freq can # be removed entirely, and we'll just inherit. - msg = ('Setting {cls}.freq has been deprecated and will be ' - 'removed in a future version; use {cls}.asfreq instead. ' - 'The {cls}.freq setter is not guaranteed to work.') - warnings.warn(msg.format(cls=type(self).__name__), - FutureWarning, stacklevel=2) + msg = ( + "Setting {cls}.freq has been deprecated and will be " + "removed in a future version; use {cls}.asfreq instead. " + "The {cls}.freq setter is not guaranteed to work." + ) + warnings.warn(msg.format(cls=type(self).__name__), FutureWarning, stacklevel=2) # PeriodArray._freq isn't actually mutable. We set the private _freq # here, but people shouldn't be doing this anyway. self._data._freq = value @@ -309,8 +331,7 @@ def _shallow_copy(self, values=None, **kwargs): values = values._values if not isinstance(values, PeriodArray): - if (isinstance(values, np.ndarray) and - is_integer_dtype(values.dtype)): + if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype): values = PeriodArray(values, freq=self.freq) else: # in particular, I would like to avoid period_array here. @@ -322,12 +343,12 @@ def _shallow_copy(self, values=None, **kwargs): values = period_array(values, freq=self.freq) # We don't allow changing `freq` in _shallow_copy. - validate_dtype_freq(self.dtype, kwargs.get('freq')) + validate_dtype_freq(self.dtype, kwargs.get("freq")) attributes = self._get_attributes_dict() attributes.update(kwargs) - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype return self._simple_new(values, **attributes) def _shallow_copy_with_infer(self, values=None, **kwargs): @@ -347,6 +368,7 @@ def func(x): return x else: return Period._from_ordinal(ordinal=x, freq=self.freq) + return func def _maybe_convert_timedelta(self, other): @@ -366,8 +388,7 @@ def _maybe_convert_timedelta(self, other): IncompatibleFrequency : if the input cannot be written as a multiple of self.freq. Note IncompatibleFrequency subclasses ValueError. 
""" - if isinstance( - other, (timedelta, np.timedelta64, Tick, np.ndarray)): + if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, Tick): # _check_timedeltalike_freq_compat will raise if incompatible @@ -379,9 +400,9 @@ def _maybe_convert_timedelta(self, other): if base == self.freq.rule_code: return other.n - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) elif is_integer(other): # integer is passed to .shift via @@ -390,19 +411,17 @@ def _maybe_convert_timedelta(self, other): return other # raise when input doesn't have freq - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=None) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=None + ) raise IncompatibleFrequency(msg) # ------------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep='NaT', quoting=None, **kwargs): + def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs): # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, - quoting=quoting, - **kwargs) + return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _mpl_repr(self): # how to represent ourselves to matplotlib @@ -419,7 +438,7 @@ def _formatter_func(self): def _engine(self): return self._engine_type(lambda: self, len(self)) - @Appender(_index_shared_docs['contains']) + @Appender(_index_shared_docs["contains"]) def __contains__(self, key): if isinstance(key, Period): if key.freq != self.freq: @@ -471,14 +490,13 @@ def __array_wrap__(self, result, context=None): name = self.name left = context[1][0] right = context[1][1] - if (isinstance(left, PeriodIndex) and - isinstance(right, PeriodIndex)): + if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex): name = left.name if left.name == right.name else None return Index(result, name=name) elif isinstance(left, Period) or isinstance(right, Period): return Index(result, name=name) elif isinstance(func, np.ufunc): - if 'M->M' not in func.types: + if "M->M" not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" # This should be TypeError, but TypeError cannot be raised # from here because numpy catches. @@ -501,37 +519,41 @@ def asof_locs(self, where, mask): where_idx = PeriodIndex(where_idx.values, freq=self.freq) locs = self._ndarray_values[mask].searchsorted( - where_idx._ndarray_values, side='right') + where_idx._ndarray_values, side="right" + ) locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._ndarray_values < - self._ndarray_values[first])] = -1 + result[ + (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first]) + ] = -1 return result - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True, how='start'): + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True, how="start"): dtype = pandas_dtype(dtype) if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. 
- tz = getattr(dtype, 'tz', None) + tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) # TODO: should probably raise on `how` here, so we don't ignore it. return super().astype(dtype, copy=copy) - @Substitution(klass='PeriodIndex') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="PeriodIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if isinstance(value, Period): if value.freq != self.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=value.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=value.freqstr, + ) raise IncompatibleFrequency(msg) value = value.ordinal elif isinstance(value, str): @@ -540,8 +562,7 @@ def searchsorted(self, value, side='left', sorter=None): except DateParseError: raise KeyError("Cannot interpret '{}' as period".format(value)) - return self._ndarray_values.searchsorted(value, side=side, - sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) @property def is_all_dates(self): @@ -556,7 +577,7 @@ def is_full(self): if len(self) == 0: return True if not self.is_monotonic: - raise ValueError('Index is not monotonic') + raise ValueError("Index is not monotonic") values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() @@ -564,7 +585,7 @@ def is_full(self): def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous # indexing - return 'period' + return "period" def get_value(self, series, key): """ @@ -573,9 +594,7 @@ def get_value(self, series, key): """ s = com.values_from_object(series) try: - return com.maybe_box(self, - super().get_value(s, key), - series, key) + return com.maybe_box(self, super().get_value(s, key), series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) @@ -587,8 +606,8 @@ def get_value(self, series, key): # if our data is higher resolution than requested key, slice if grp < freqn: iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how='S').ordinal - ord2 = iv.asfreq(self.freq, how='E').ordinal + ord1 = iv.asfreq(self.freq, how="S").ordinal + ord2 = iv.asfreq(self.freq, how="E").ordinal if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) @@ -598,8 +617,9 @@ def get_value(self, series, key): return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal - return com.maybe_box(self, self._engine.get_value(s, key), - series, key) + return com.maybe_box( + self, self._engine.get_value(s, key), series, key + ) else: raise KeyError(key) except TypeError: @@ -607,17 +627,18 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._engine.get_value(s, key), - series, key) + return com.maybe_box(self, self._engine.get_value(s, key), series, key) - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if hasattr(target, 'freq') and target.freq != self.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr) + if hasattr(target, "freq") and target.freq != self.freq: + msg = DIFFERENT_FREQ.format( + 
cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=target.freqstr, + ) raise IncompatibleFrequency(msg) if isinstance(target, PeriodIndex): @@ -625,8 +646,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) - return Index.get_indexer(self._int64index, target, method, - limit, tolerance) + return Index.get_indexer(self._int64index, target, method, limit, tolerance) def _get_unique_index(self, dropna=False): """ @@ -682,8 +702,7 @@ def get_loc(self, key, method=None, tolerance=None): try: ordinal = iNaT if key is NaT else key.ordinal if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, - np.asarray(key)) + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) return self._int64index.get_loc(ordinal, method, tolerance) except KeyError: @@ -709,7 +728,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. """ - assert kind in ['ix', 'loc', 'getitem'] + assert kind in ["ix", "loc", "getitem"] if isinstance(label, datetime): return Period(label, freq=self.freq) @@ -717,86 +736,105 @@ def _maybe_cast_slice_bound(self, label, side, kind): try: _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) - return bounds[0 if side == 'left' else 1] + return bounds[0 if side == "left" else 1] except Exception: raise KeyError(label) elif is_integer(label) or is_float(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) return label def _parsed_string_to_bounds(self, reso, parsed): - if reso == 'year': - t1 = Period(year=parsed.year, freq='A') - elif reso == 'month': - t1 = Period(year=parsed.year, month=parsed.month, freq='M') - elif reso == 'quarter': + if reso == "year": + t1 = Period(year=parsed.year, freq="A") + elif reso == "month": + t1 = Period(year=parsed.year, month=parsed.month, freq="M") + elif reso == "quarter": q = (parsed.month - 1) // 3 + 1 - t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC') - elif reso == 'day': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - freq='D') - elif reso == 'hour': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, freq='H') - elif reso == 'minute': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, minute=parsed.minute, freq='T') - elif reso == 'second': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, minute=parsed.minute, - second=parsed.second, freq='S') + t1 = Period(year=parsed.year, quarter=q, freq="Q-DEC") + elif reso == "day": + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq="D") + elif reso == "hour": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + freq="H", + ) + elif reso == "minute": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + freq="T", + ) + elif reso == "second": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + second=parsed.second, + freq="S", + ) else: raise KeyError(reso) - return (t1.asfreq(self.freq, how='start'), - t1.asfreq(self.freq, how='end')) + return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) def _get_string_slice(self, key): if not self.is_monotonic: - raise 
ValueError('Partial indexing only valid for ' - 'ordered time series') + raise ValueError("Partial indexing only valid for " "ordered time series") key, parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn: + if reso in ["day", "hour", "minute", "second"] and not grp < freqn: raise KeyError(key) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - return slice(self.searchsorted(t1.ordinal, side='left'), - self.searchsorted(t2.ordinal, side='right')) + return slice( + self.searchsorted(t1.ordinal, side="left"), + self.searchsorted(t2.ordinal, side="right"), + ) def _convert_tolerance(self, tolerance, target): - tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, - target) + tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") return self._maybe_convert_timedelta(tolerance) def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: return self.astype(object).insert(loc, item) - idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), - self[loc:].asi8)) + idx = np.concatenate( + (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) + ) return self._shallow_copy(idx) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ See Index.join """ self._assert_can_do_setop(other) if not isinstance(other, PeriodIndex): - return self.astype(object).join(other, how=how, level=level, - return_indexers=return_indexers, - sort=sort) - - result = Int64Index.join(self, other, how=how, level=level, - return_indexers=return_indexers, - sort=sort) + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) if return_indexers: result, lidx, ridx = result @@ -813,9 +851,9 @@ def _assert_can_do_setop(self, other): # *Can't* use PeriodIndexes of different freqs # *Can* use PeriodIndex/DatetimeIndex if isinstance(other, PeriodIndex) and self.freq != other.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) def _wrap_setop_result(self, other, result): @@ -826,8 +864,7 @@ def _wrap_setop_result(self, other, result): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, - name=self.name) + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) return rawarr def __setstate__(self, state): @@ -863,9 +900,12 @@ def __setstate__(self, state): @property def flags(self): """ return the ndarray.flags for the underlying data """ - warnings.warn("{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.flags is deprecated and will be removed " + "in a future 
version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.flags def item(self): @@ -876,22 +916,29 @@ def item(self): .. deprecated 0.25.0 """ - warnings.warn('`item` has been deprecated and will be removed in a ' - 'future version', FutureWarning, stacklevel=2) + warnings.warn( + "`item` has been deprecated and will be removed in a " "future version", + FutureWarning, + stacklevel=2, + ) # TODO(DatetimeArray): remove if len(self) == 1: return self[0] else: # copy numpy's message here because Py26 raises an IndexError - raise ValueError('can only convert an array of size 1 to a ' - 'Python scalar') + raise ValueError( + "can only convert an array of size 1 to a " "Python scalar" + ) @property def data(self): """ return the data pointer of the underlying data """ - warnings.warn("{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.data is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return np.asarray(self._data).data @property @@ -899,9 +946,12 @@ def base(self): """ return the base object if the memory of the underlying data is shared """ - warnings.warn("{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.base is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return np.asarray(self._data) @@ -963,13 +1013,13 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None): dtype='period[M]', freq='M') """ if com.count_not_none(start, end, periods) != 2: - raise ValueError('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') - if freq is None and (not isinstance(start, Period) - and not isinstance(end, Period)): - freq = 'D' - - data, freq = PeriodArray._generate_range(start, end, periods, freq, - fields={}) + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): + freq = "D" + + data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={}) data = PeriodArray(data, freq=freq) return PeriodIndex(data, name=name) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 70ca0b349e7ed5..16098c474a4732 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -13,10 +13,16 @@ from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( - ensure_platform_int, ensure_python_int, is_int64_dtype, is_integer, - is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype) -from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCSeries, ABCTimedeltaIndex) + ensure_platform_int, + ensure_python_int, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex from pandas.core import ops import pandas.core.common as com @@ -65,7 +71,7 @@ class RangeIndex(Int64Index): Int64Index : Index of int64 data. 
""" - _typ = 'rangeindex' + _typ = "rangeindex" _engine_type = libindex.Int64Engine _range = None # type: range @@ -74,13 +80,24 @@ class RangeIndex(Int64Index): # -------------------------------------------------------------------- # Constructors - def __new__(cls, start=None, stop=None, step=None, - dtype=None, copy=False, name=None, fastpath=None): + def __new__( + cls, + start=None, + stop=None, + step=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + ): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(range(start, stop, step), name=name) @@ -121,8 +138,9 @@ def from_range(cls, data, name=None, dtype=None): """ if not isinstance(data, range): raise TypeError( - '{0}(...) must be called with object coercible to a ' - 'range, {1} was passed'.format(cls.__name__, repr(data))) + "{0}(...) must be called with object coercible to a " + "range, {1} was passed".format(cls.__name__, repr(data)) + ) cls._validate_dtype(dtype) return cls._simple_new(data, dtype=dtype, name=name) @@ -153,7 +171,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): def _validate_dtype(dtype): """ require dtype to be None or int64 """ if not (dtype is None or is_int64_dtype(dtype)): - raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') + raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex") @cache_readonly def _constructor(self): @@ -170,8 +188,9 @@ def _data(self): triggering the construction. """ if self._cached_data is None: - self._cached_data = np.arange(self.start, self.stop, self.step, - dtype=np.int64) + self._cached_data = np.arange( + self.start, self.stop, self.step, dtype=np.int64 + ) return self._cached_data @cache_readonly @@ -181,9 +200,7 @@ def _int64index(self): def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ rng = self._range - return [('start', rng.start), - ('stop', rng.stop), - ('step', rng.step)] + return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] def __reduce__(self): d = self._get_attributes_dict() @@ -199,20 +216,22 @@ def _format_attrs(self): """ attrs = self._get_data_as_items() if self.name is not None: - attrs.append(('name', ibase.default_pprint(self.name))) + attrs.append(("name", ibase.default_pprint(self.name))) return attrs def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep='NaN', **kwargs): + def _format_with_header(self, header, na_rep="NaN", **kwargs): return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- - _deprecation_message = ("RangeIndex.{} is deprecated and will be " - "removed in a future version. Use RangeIndex.{} " - "instead") + _deprecation_message = ( + "RangeIndex.{} is deprecated and will be " + "removed in a future version. Use RangeIndex.{} " + "instead" + ) @cache_readonly def start(self): @@ -230,8 +249,11 @@ def _start(self): .. deprecated:: 0.25.0 Use ``start`` instead. 
""" - warnings.warn(self._deprecation_message.format("_start", "start"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_start", "start"), + DeprecationWarning, + stacklevel=2, + ) return self.start @cache_readonly @@ -250,8 +272,11 @@ def _stop(self): Use ``stop`` instead. """ # GH 25710 - warnings.warn(self._deprecation_message.format("_stop", "stop"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_stop", "stop"), + DeprecationWarning, + stacklevel=2, + ) return self.stop @cache_readonly @@ -271,8 +296,11 @@ def _step(self): Use ``step`` instead. """ # GH 25710 - warnings.warn(self._deprecation_message.format("_step", "step"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_step", "step"), + DeprecationWarning, + stacklevel=2, + ) return self.step @cache_readonly @@ -281,8 +309,10 @@ def nbytes(self): Return the number of bytes in the underlying data. """ rng = self._range - return getsizeof(rng) + sum(getsizeof(getattr(rng, attr_name)) - for attr_name in ['start', 'stop', 'step']) + return getsizeof(rng) + sum( + getsizeof(getattr(rng, attr_name)) + for attr_name in ["start", "stop", "step"] + ) def memory_usage(self, deep=False): """ @@ -338,7 +368,7 @@ def __contains__(self, key: Union[int, np.integer]) -> bool: return False return key in self._range - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): if is_integer(key) and method is None and tolerance is None: new_key = int(key) @@ -348,25 +378,21 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) - @Appender(_index_shared_docs['get_indexer']) + @Appender(_index_shared_docs["get_indexer"]) def get_indexer(self, target, method=None, limit=None, tolerance=None): if not (method is None and tolerance is None and is_list_like(target)): - return super().get_indexer(target, method=method, - tolerance=tolerance) + return super().get_indexer(target, method=method, tolerance=tolerance) if self.step > 0: start, stop, step = self.start, self.stop, self.step else: # Work on reversed range for simplicity: - start, stop, step = (self.stop - self.step, - self.start + 1, - - self.step) + start, stop, step = (self.stop - self.step, self.start + 1, -self.step) target_array = np.asarray(target) if not (is_integer_dtype(target_array) and target_array.ndim == 1): # checks/conversions/roundings are delegated to general method - return super().get_indexer(target, method=method, - tolerance=tolerance) + return super().get_indexer(target, method=method, tolerance=tolerance) locs = target_array - start valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) @@ -381,16 +407,16 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is None: name = kwargs.get("name", self.name) return self._simple_new(self._range, name=name) else: - kwargs.setdefault('name', self.name) + kwargs.setdefault("name", self.name) return self._int64index._shallow_copy(values, **kwargs) - @Appender(ibase._index_shared_docs['copy']) + @Appender(ibase._index_shared_docs["copy"]) def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if 
name is None: @@ -401,8 +427,7 @@ def _minmax(self, meth): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif ((meth == 'min' and self.step > 0) or - (meth == 'max' and self.step < 0)): + elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): return self.start return self.start + self.step * no_steps @@ -411,13 +436,13 @@ def min(self, axis=None, skipna=True, *args, **kwargs): """The minimum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_min(args, kwargs) - return self._minmax('min') + return self._minmax("min") def max(self, axis=None, skipna=True, *args, **kwargs): """The maximum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_max(args, kwargs) - return self._minmax('max') + return self._minmax("max") def argsort(self, *args, **kwargs): """ @@ -501,8 +526,7 @@ def intersection(self, other, sort=False): # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = first.start + (second.start - first.start) * \ - first.step // gcd * s + tmp_start = first.start + (second.start - first.start) * first.step // gcd * s new_step = first.step * second.step // gcd new_range = range(tmp_start, int_high, new_step) new_index = self._simple_new(new_range) @@ -586,35 +610,39 @@ def _union(self, other, sort): start_r = min(start_s, start_o) end_r = max(end_s, end_o) if step_o == step_s: - if ((start_s - start_o) % step_s == 0 and - (start_s - end_o) <= step_s and - (start_o - end_s) <= step_s): + if ( + (start_s - start_o) % step_s == 0 + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): return self.__class__(start_r, end_r + step_s, step_s) - if ((step_s % 2 == 0) and - (abs(start_s - start_o) <= step_s / 2) and - (abs(end_s - end_o) <= step_s / 2)): - return self.__class__(start_r, - end_r + step_s / 2, - step_s / 2) + if ( + (step_s % 2 == 0) + and (abs(start_s - start_o) <= step_s / 2) + and (abs(end_s - end_o) <= step_s / 2) + ): + return self.__class__(start_r, end_r + step_s / 2, step_s / 2) elif step_o % step_s == 0: - if ((start_o - start_s) % step_s == 0 and - (start_o + step_s >= start_s) and - (end_o - step_s <= end_s)): + if ( + (start_o - start_s) % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): return self.__class__(start_r, end_r + step_s, step_s) elif step_s % step_o == 0: - if ((start_s - start_o) % step_o == 0 and - (start_s + step_o >= start_o) and - (end_s - step_o <= end_o)): + if ( + (start_s - start_o) % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): return self.__class__(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - if how == 'outer' and self is not other: + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + if how == "outer" and self is not other: # note: could return RangeIndex in more circumstances - return self._int64index.join(other, how, level, return_indexers, - sort) + return self._int64index.join(other, how, level, return_indexers, sort) return super().join(other, how, level, return_indexers, sort) @@ -643,14 +671,17 @@ def __getitem__(self, key): try: return self._range[new_key] except IndexError: - raise IndexError("index {key} is out of bounds for axis 0 " - "with size {size}".format(key=key, - 
size=len(self))) + raise IndexError( + "index {key} is out of bounds for axis 0 " + "with size {size}".format(key=key, size=len(self)) + ) elif is_scalar(key): - raise IndexError("only integers, slices (`:`), " - "ellipsis (`...`), numpy.newaxis (`None`) " - "and integer or boolean " - "arrays are valid indices") + raise IndexError( + "only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices" + ) # fall back to Int64Index return super().__getitem__(key) @@ -659,9 +690,7 @@ def __floordiv__(self, other): return NotImplemented if is_integer(other) and other != 0: - if (len(self) == 0 or - self.start % other == 0 and - self.step % other == 0): + if len(self) == 0 or self.start % other == 0 and self.step % other == 0: start = self.start // other step = self.step // other stop = start + len(self) * step @@ -717,7 +746,7 @@ def _evaluate_numeric_binop(self, other): try: # apply if we have an override if step: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): rstep = step(left.step, right) # we don't have a representable op @@ -728,7 +757,7 @@ def _evaluate_numeric_binop(self, other): else: rstep = left.step - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): rstart = op(left.start, right) rstop = op(left.stop, right) @@ -737,9 +766,8 @@ def _evaluate_numeric_binop(self, other): # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors - if not all(is_integer(x) for x in - [rstart, rstop, rstep]): - result = result.astype('float64') + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") return result @@ -748,7 +776,7 @@ def _evaluate_numeric_binop(self, other): return op(self._int64index, other) # TODO: Do attrs get handled reliably? 
- name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(_evaluate_numeric_binop, name, cls) cls.__add__ = _make_evaluate_binop(operator.add) @@ -757,10 +785,8 @@ def _evaluate_numeric_binop(self, other): cls.__rsub__ = _make_evaluate_binop(ops.rsub) cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) - cls.__truediv__ = _make_evaluate_binop(operator.truediv, - step=operator.truediv) - cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, - step=ops.rtruediv) + cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv) RangeIndex._add_numeric_methods() diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ba5507fa71e8c6..29ed3c6b973181 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -4,13 +4,20 @@ import numpy as np -from pandas._libs import ( - NaT, Timedelta, index as libindex, join as libjoin, lib) +from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, - is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype) + _TD_DTYPE, + ensure_int64, + is_float, + is_integer, + is_list_like, + is_scalar, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -21,8 +28,11 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, maybe_unwrap_index, - wrap_arithmetic_op) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, + maybe_unwrap_index, + wrap_arithmetic_op, +) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name @@ -47,28 +57,24 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. 
_delegate_class = TimedeltaArray - _delegated_properties = (TimedeltaArray._datetimelike_ops + [ - 'components', - ]) - _delegated_methods = TimedeltaArray._datetimelike_methods + [ - '_box_values', - ] - _raw_properties = { - 'components', - } - _raw_methods = { - 'to_pytimedelta', - } - - -@delegate_names(TimedeltaArray, - TimedeltaDelegateMixin._delegated_properties, - typ="property") -@delegate_names(TimedeltaArray, - TimedeltaDelegateMixin._delegated_methods, - typ="method", overwrite=False) -class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, - TimedeltaDelegateMixin): + _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] + _delegated_methods = TimedeltaArray._datetimelike_methods + ["_box_values"] + _raw_properties = {"components"} + _raw_methods = {"to_pytimedelta"} + + +@delegate_names( + TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + TimedeltaArray, + TimedeltaDelegateMixin._delegated_methods, + typ="method", + overwrite=False, +) +class TimedeltaIndex( + DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin +): """ Immutable ndarray of timedelta64 data, represented internally as int64, and which can be boxed to timedelta objects @@ -148,23 +154,23 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, been deprecated in favor of :func:`timedelta_range`. """ - _typ = 'timedeltaindex' + _typ = "timedeltaindex" _join_precedence = 10 def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper( - joinf, dtype='m8[ns]', **kwargs) + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False) + libjoin.left_join_indexer_unique_int64, with_indexers=False + ) _engine_type = libindex.TimedeltaEngine - _comparables = ['name', 'freq'] - _attributes = ['name', 'freq'] + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] _is_numeric_dtype = True _infer_as_myclass = True @@ -181,44 +187,67 @@ def _join_i8_wrapper(joinf, **kwargs): # ------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, - periods=None, closed=None, dtype=_TD_DTYPE, copy=False, - name=None, verify_integrity=None): + def __new__( + cls, + data=None, + unit=None, + freq=None, + start=None, + end=None, + periods=None, + closed=None, + dtype=_TD_DTYPE, + copy=False, + name=None, + verify_integrity=None, + ): if verify_integrity is not None: - warnings.warn("The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) else: verify_integrity = True if data is None: freq, freq_infer = dtl.maybe_infer_freq(freq) - warnings.warn("Creating a TimedeltaIndex by passing range " - "endpoints is deprecated. 
Use " - "`pandas.timedelta_range` instead.", - FutureWarning, stacklevel=2) - result = TimedeltaArray._generate_range(start, end, periods, freq, - closed=closed) + warnings.warn( + "Creating a TimedeltaIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.timedelta_range` instead.", + FutureWarning, + stacklevel=2, + ) + result = TimedeltaArray._generate_range( + start, end, periods, freq, closed=closed + ) return cls._simple_new(result._data, freq=freq, name=name) if is_scalar(data): - raise TypeError('{cls}() must be called with a ' - 'collection of some kind, {data} was passed' - .format(cls=cls.__name__, data=repr(data))) - - if unit in {'Y', 'y', 'M'}: - warnings.warn("M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + raise TypeError( + "{cls}() must be called with a " + "collection of some kind, {data} was passed".format( + cls=cls.__name__, data=repr(data) + ) + ) + + if unit in {"Y", "y", "M"}: + warnings.warn( + "M and Y units are deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) if isinstance(data, TimedeltaArray): if copy: data = data.copy() return cls._simple_new(data, name=name, freq=freq) - if (isinstance(data, TimedeltaIndex) and - freq is None and name is None): + if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: return data.copy() else: @@ -226,8 +255,9 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, # - Cases checked above all return/raise before reaching here - # - tdarr = TimedeltaArray._from_sequence(data, freq=freq, unit=unit, - dtype=dtype, copy=copy) + tdarr = TimedeltaArray._from_sequence( + data, freq=freq, unit=unit, dtype=dtype, copy=copy + ) return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name) @classmethod @@ -235,14 +265,13 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if present if not isinstance(values, TimedeltaArray): - values = TimedeltaArray._simple_new(values, dtype=dtype, - freq=freq) + values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) else: if freq is None: freq = values.freq assert isinstance(values, TimedeltaArray), type(values) assert dtype == _TD_DTYPE, dtype - assert values.dtype == 'm8[ns]', values.dtype + assert values.dtype == "m8[ns]", values.dtype tdarr = TimedeltaArray._simple_new(values._data, freq=freq) result = object.__new__(cls) @@ -262,14 +291,15 @@ def __setstate__(self, state): super().__setstate__(state) else: raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get('freq', None) + freq = attrs.get("freq", None) if freq is not None: # no need to infer if freq is None - attrs['freq'] = 'infer' + attrs["freq"] = "infer" return attrs # ------------------------------------------------------------------- @@ -278,13 +308,15 @@ def _maybe_update_attributes(self, attrs): @property def _formatter_func(self): from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import Timedelta64Formatter - return Timedelta64Formatter(values=self, - nat_rep=na_rep, - justify='all').get_result() + + return Timedelta64Formatter( + values=self, nat_rep=na_rep, justify="all" + ).get_result() # ------------------------------------------------------------------- # Wrapping TimedeltaArray @@ -307,7 +339,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): @property def _box_func(self): - return lambda x: Timedelta(x, unit='ns') + return lambda x: Timedelta(x, unit="ns") def __getitem__(self, key): result = self._data.__getitem__(key) @@ -317,7 +349,7 @@ def __getitem__(self, key): # ------------------------------------------------------------------- - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): @@ -327,7 +359,7 @@ def astype(self, dtype, copy=True): result = self._data.astype(dtype, copy=copy) if self.hasnans: return Index(result, name=self.name) - return Index(result.astype('i8'), name=self.name) + return Index(result.astype("i8"), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) def _union(self, other, sort): @@ -350,8 +382,7 @@ def _union(self, other, sort): result.freq = to_offset(result.inferred_freq) return result - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ See Index.join """ @@ -361,9 +392,14 @@ def join(self, other, how='left', level=None, return_indexers=False, except (TypeError, ValueError): pass - return Index.join(self, other, how=how, level=level, - return_indexers=return_indexers, - sort=sort) + return Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) def intersection(self, other, sort=False): """ @@ -395,8 +431,11 @@ def intersection(self, other, sort=False): def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) - if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and - self._can_fast_union(other)): + if ( + isinstance(other, TimedeltaIndex) + and self.freq == other.freq + and self._can_fast_union(other) + ): joined = self._shallow_copy(joined, name=name) return joined else: @@ -447,7 +486,7 @@ def _fast_union(self, other): # concatenate if left_end < right_end: - loc = right.searchsorted(left_end, side='right') + loc = right.searchsorted(left_end, side="right") right_chunk = right.values[loc:] dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) @@ -455,7 +494,7 @@ def _fast_union(self, other): return left def _maybe_promote(self, other): - if other.inferred_type == 'timedelta': + if other.inferred_type 
== "timedelta": other = TimedeltaIndex(other) return self, other @@ -470,8 +509,7 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) try: - return com.maybe_box(self, Index.get_value(self, series, key), - series, key) + return com.maybe_box(self, Index.get_value(self, series, key), series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -547,25 +585,23 @@ def _maybe_cast_slice_bound(self, label, side, kind): label : object """ - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] if isinstance(label, str): parsed = Timedelta(label) lbound = parsed.round(parsed.resolution_string) - if side == 'left': + if side == "left": return lbound else: - return (lbound + to_offset(parsed.resolution_string) - - Timedelta(1, 'ns')) - elif ((is_integer(label) or is_float(label)) and - not is_timedelta64_dtype(label)): - self._invalid_indexer('slice', label) + return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") + elif (is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label): + self._invalid_indexer("slice", label) return label def _get_string_slice(self, key): if is_integer(key) or is_float(key) or key is NaT: - self._invalid_indexer('slice', key) + self._invalid_indexer("slice", key) loc = self._partial_td_slice(key) return loc @@ -577,9 +613,9 @@ def _partial_td_slice(self, key): raise NotImplementedError - @Substitution(klass='TimedeltaIndex') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="TimedeltaIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): value = np.array(value, dtype=_TD_DTYPE, copy=False) else: @@ -588,11 +624,11 @@ def searchsorted(self, value, side='left', sorter=None): return self.values.searchsorted(value, side=side, sorter=sorter) def is_type_compatible(self, typ): - return typ == self.inferred_type or typ == 'timedelta' + return typ == self.inferred_type or typ == "timedelta" @property def inferred_type(self): - return 'timedelta64' + return "timedelta64" @property def is_all_dates(self): @@ -628,16 +664,16 @@ def insert(self, loc, item): # check freq can be preserved on edge cases if self.freq is not None: - if ((loc == 0 or loc == -len(self)) and - item + self.freq == self[0]): + if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = Timedelta(item).asm8.view(_TD_DTYPE) try: - new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)], - self[loc:].asi8)) + new_tds = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) return self._shallow_copy(new_tds, freq=freq) except (AttributeError, TypeError): @@ -645,8 +681,7 @@ def insert(self, loc, item): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError( - "cannot insert TimedeltaIndex with incompatible label") + raise TypeError("cannot insert TimedeltaIndex with incompatible label") def delete(self, loc): """ @@ -663,16 +698,15 @@ def delete(self, loc): """ new_tds = np.delete(self.asi8, loc) - freq = 'infer' + freq = "infer" if is_integer(loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: if is_list_like(loc): - loc = lib.maybe_indices_to_slice( - ensure_int64(np.array(loc)), len(self)) + loc = 
lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): - if (loc.start in (0, None) or loc.stop in (len(self), None)): + if loc.start in (0, None) or loc.stop in (len(self), None): freq = self.freq return TimedeltaIndex(new_tds, name=self.name, freq=freq) @@ -690,15 +724,20 @@ def _is_convertible_to_index(other): """ if isinstance(other, TimedeltaIndex): return True - elif (len(other) > 0 and - other.inferred_type not in ('floating', 'mixed-integer', 'integer', - 'mixed-integer-float', 'mixed')): + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "mixed-integer-float", + "mixed", + ): return True return False -def timedelta_range(start=None, end=None, periods=None, freq=None, - name=None, closed=None): +def timedelta_range( + start=None, end=None, periods=None, freq=None, name=None, closed=None +): """ Return a fixed frequency TimedeltaIndex, with day as the default frequency @@ -765,9 +804,8 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, dtype='timedelta64[ns]', freq=None) """ if freq is None and com._any_none(periods, start, end): - freq = 'D' + freq = "D" freq, freq_infer = dtl.maybe_infer_freq(freq) - tdarr = TimedeltaArray._generate_range(start, end, periods, freq, - closed=closed) + tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed) return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 677aefa15d200d..ccc3a027af70d8 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -9,8 +9,17 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, - is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse) + ensure_platform_int, + is_float, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_numeric_dtype, + is_scalar, + is_sequence, + is_sparse, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna @@ -22,11 +31,11 @@ def get_indexers_list(): return [ - ('ix', _IXIndexer), - ('iloc', _iLocIndexer), - ('loc', _LocIndexer), - ('at', _AtIndexer), - ('iat', _iAtIndexer), + ("ix", _IXIndexer), + ("iloc", _iLocIndexer), + ("loc", _LocIndexer), + ("at", _AtIndexer), + ("iat", _iAtIndexer), ] @@ -102,12 +111,11 @@ def __call__(self, axis=None): return new_self def __iter__(self): - raise NotImplementedError('ix is not iterable') + raise NotImplementedError("ix is not iterable") def __getitem__(self, key): if type(key) is tuple: - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: values = self.obj._get_value(*key) if is_scalar(values): @@ -134,7 +142,7 @@ def _get_label(self, label, axis=None): # see GH5667 return self.obj._xs(label, axis=axis) elif isinstance(label, tuple) and isinstance(label[axis], slice): - raise IndexingError('no slices here, handle elsewhere') + raise IndexingError("no slices here, handle elsewhere") return self.obj._xs(label, axis=axis) @@ -154,7 +162,7 @@ def _get_setitem_indexer(self, key): axis = self.obj._get_axis(0) - if isinstance(axis, MultiIndex) and self.name != 'iloc': + if isinstance(axis, MultiIndex) and self.name != "iloc": try: return axis.get_loc(key) except Exception: @@ -174,14 +182,13 @@ def 
_get_setitem_indexer(self, key): except TypeError as e: # invalid indexer type vs 'other' indexing errors - if 'cannot do' in str(e): + if "cannot do" in str(e): raise raise IndexingError(key) def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: key = com.apply_if_callable(key, self.obj) indexer = self._get_setitem_indexer(key) @@ -216,13 +223,14 @@ def _has_valid_tuple(self, key): """ check the key for valid keys across my indexer """ for i, k in enumerate(key): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") try: self._validate_key(k, i) except ValueError: - raise ValueError("Location based indexing can only have " - "[{types}] types" - .format(types=self._valid_types)) + raise ValueError( + "Location based indexing can only have " + "[{types}] types".format(types=self._valid_types) + ) def _is_nested_tuple_indexer(self, tup): if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): @@ -235,14 +243,15 @@ def _convert_tuple(self, key, is_setter=False): axis = self.obj._get_axis_number(self.axis) for i in range(self.ndim): if i == axis: - keyidx.append(self._convert_to_indexer( - key, axis=axis, is_setter=is_setter)) + keyidx.append( + self._convert_to_indexer(key, axis=axis, is_setter=is_setter) + ) else: keyidx.append(slice(None)) else: for i, k in enumerate(key): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) return tuple(keyidx) @@ -272,8 +281,7 @@ def _has_valid_positional_setitem_indexer(self, indexer): will raise if needed, does not modify the indexer externally """ if isinstance(indexer, dict): - raise IndexError("{0} cannot enlarge its target object" - .format(self.name)) + raise IndexError("{0} cannot enlarge its target object".format(self.name)) else: if not isinstance(indexer, tuple): indexer = self._tuplify(indexer) @@ -286,11 +294,14 @@ def _has_valid_positional_setitem_indexer(self, indexer): pass elif is_integer(i): if i >= len(ax): - raise IndexError("{name} cannot enlarge its target " - "object".format(name=self.name)) + raise IndexError( + "{name} cannot enlarge its target " + "object".format(name=self.name) + ) elif isinstance(i, dict): - raise IndexError("{name} cannot enlarge its target object" - .format(name=self.name)) + raise IndexError( + "{name} cannot enlarge its target object".format(name=self.name) + ) return True @@ -299,6 +310,7 @@ def _setitem_with_indexer(self, indexer, value): # also has the side effect of consolidating in-place from pandas import Series + info_axis = self.obj._info_axis_number # maybe partial set @@ -309,8 +321,7 @@ def _setitem_with_indexer(self, indexer, value): if not take_split_path and self.obj._data.blocks: blk, = self.obj._data.blocks if 1 < blk.ndim: # in case of dict, keys are indices - val = list(value.values()) if isinstance(value, - dict) else value + val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): @@ -320,8 +331,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have any multi-indexes that have non-trivial slices # (not null slices) then we must take the split path, xref # GH 10360 - if (isinstance(ax, MultiIndex) and - not 
(is_integer(i) or com.is_null_slice(i))): + if isinstance(ax, MultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): take_split_path = True break @@ -346,13 +358,14 @@ def _setitem_with_indexer(self, indexer, value): # or a list-like on the non-info axes if we have a # list-like len_non_info_axes = ( - len(_ax) for _i, _ax in enumerate(self.obj.axes) - if _i != i + len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i ) if any(not l for l in len_non_info_axes): if not is_list_like_indexer(value): - raise ValueError("cannot set a frame with no " - "defined index and a scalar") + raise ValueError( + "cannot set a frame with no " + "defined index and a scalar" + ) self.obj[key] = value return self.obj @@ -360,7 +373,8 @@ def _setitem_with_indexer(self, indexer, value): self.obj[key] = _infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( - indexer, self.obj.axes) + indexer, self.obj.axes + ) self._setitem_with_indexer(new_indexer, value) return self.obj @@ -402,21 +416,19 @@ def _setitem_with_indexer(self, indexer, value): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - return self._setitem_with_indexer(new_indexer, - value) + return self._setitem_with_indexer(new_indexer, value) # this preserves dtype of the value new_values = Series([value])._values if len(self.obj._values): try: - new_values = np.concatenate([self.obj._values, - new_values]) + new_values = np.concatenate([self.obj._values, new_values]) except TypeError: as_obj = self.obj.astype(object) - new_values = np.concatenate([as_obj, - new_values]) + new_values = np.concatenate([as_obj, new_values]) self.obj._data = self.obj._constructor( - new_values, index=new_index, name=self.obj.name)._data + new_values, index=new_index, name=self.obj.name + )._data self.obj._maybe_update_cacher(clear=True) return self.obj @@ -424,14 +436,14 @@ def _setitem_with_indexer(self, indexer, value): # no columns and scalar if not len(self.obj.columns): - raise ValueError("cannot set a frame with no defined " - "columns") + raise ValueError( + "cannot set a frame with no defined " "columns" + ) # append a Series if isinstance(value, Series): - value = value.reindex(index=self.obj.columns, - copy=True) + value = value.reindex(index=self.obj.columns, copy=True) value.name = indexer # a list-list @@ -440,11 +452,11 @@ def _setitem_with_indexer(self, indexer, value): # must have conforming columns if is_list_like_indexer(value): if len(value) != len(self.obj.columns): - raise ValueError("cannot set a row with " - "mismatched columns") + raise ValueError( + "cannot set a row with " "mismatched columns" + ) - value = Series(value, index=self.obj.columns, - name=indexer) + value = Series(value, index=self.obj.columns, name=indexer) self.obj._data = self.obj.append(value)._data self.obj._maybe_update_cacher(clear=True) @@ -469,46 +481,48 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if (len(labels) == 1 and - isinstance(self.obj[labels[0]].axes[0], MultiIndex)): + if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): item = labels[0] obj = self.obj[item] index = obj.index idx = indexer[:info_axis][0] - plane_indexer = tuple([idx]) + indexer[info_axis + 1:] + plane_indexer = tuple([idx]) + indexer[info_axis + 1 :] lplane_indexer = length_of_indexer(plane_indexer[0], index) # require that we are setting the right number of values that # we are indexing - if 
is_list_like_indexer(value) and np.iterable( - value) and lplane_indexer != len(value): + if ( + is_list_like_indexer(value) + and np.iterable(value) + and lplane_indexer != len(value) + ): if len(obj[idx]) != len(value): - raise ValueError("cannot set using a multi-index " - "selection indexer with a different " - "length than the value") + raise ValueError( + "cannot set using a multi-index " + "selection indexer with a different " + "length than the value" + ) # make sure we have an ndarray - value = getattr(value, 'values', value).ravel() + value = getattr(value, "values", value).ravel() # we can directly set the series here # as we select a slice indexer on the mi idx = index._convert_slice_indexer(idx) obj._consolidate_inplace() obj = obj.copy() - obj._data = obj._data.setitem(indexer=tuple([idx]), - value=value) + obj._data = obj._data.setitem(indexer=tuple([idx]), value=value) self.obj[item] = obj return # non-mi else: - plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:] + plane_indexer = indexer[:info_axis] + indexer[info_axis + 1 :] if info_axis > 0: plane_axis = self.obj.axes[:info_axis][0] - lplane_indexer = length_of_indexer(plane_indexer[0], - plane_axis) + lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis) else: lplane_indexer = 0 @@ -521,10 +535,10 @@ def setter(item, v): # which means essentially reassign to the columns of a # multi-dim object # GH6149 (null slice), GH10408 (full bounds) - if (isinstance(pi, tuple) and - all(com.is_null_slice(idx) or - com.is_full_slice(idx, len(self.obj)) - for idx in pi)): + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): s = v else: # set the item, possibly having a dtype change @@ -538,8 +552,11 @@ def setter(item, v): def can_do_equal_len(): """ return True if we have an equal len settable """ - if (not len(labels) == 1 or not np.iterable(value) or - is_scalar(plane_indexer[0])): + if ( + not len(labels) == 1 + or not np.iterable(value) + or is_scalar(plane_indexer[0]) + ): return False item = labels[0] @@ -556,7 +573,7 @@ def can_do_equal_len(): # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) - if is_list_like_indexer(value) and getattr(value, 'ndim', 1) > 0: + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: # we have an equal len Frame if isinstance(value, ABCDataFrame) and value.ndim > 1: @@ -567,8 +584,8 @@ def can_do_equal_len(): if item in value: sub_indexer[info_axis] = item v = self._align_series( - tuple(sub_indexer), value[item], - multiindex_indexer) + tuple(sub_indexer), value[item], multiindex_indexer + ) else: v = np.nan @@ -578,16 +595,18 @@ def can_do_equal_len(): # hasattr first, to avoid coercing to ndarray without reason. # But we may be relying on the ndarray coercion to check ndim. # Why not just convert to an ndarray earlier on if needed? 
- elif ((hasattr(value, 'ndim') and value.ndim == 2) - or (not hasattr(value, 'ndim') and - np.array(value).ndim) == 2): + elif (hasattr(value, "ndim") and value.ndim == 2) or ( + not hasattr(value, "ndim") and np.array(value).ndim + ) == 2: # note that this coerces the dtype if we are mixed # GH 7551 value = np.array(value, dtype=object) if len(labels) != value.shape[1]: - raise ValueError('Must have equal len keys and value ' - 'when setting with an ndarray') + raise ValueError( + "Must have equal len keys and value " + "when setting with an ndarray" + ) for i, item in enumerate(labels): @@ -602,8 +621,10 @@ def can_do_equal_len(): else: if len(labels) != len(value): - raise ValueError('Must have equal len keys and value ' - 'when setting with an iterable') + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) for item, v in zip(labels, value): setter(item, v) @@ -620,12 +641,16 @@ def can_do_equal_len(): # if we are setting on the info axis ONLY # set using those methods to avoid block-splitting # logic here - if (len(indexer) > info_axis and - is_integer(indexer[info_axis]) and - all(com.is_null_slice(idx) - for i, idx in enumerate(indexer) - if i != info_axis) and - item_labels.is_unique): + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): self.obj[item_labels[indexer[info_axis]]] = value return @@ -643,8 +668,7 @@ def can_do_equal_len(): # actually do the set self.obj._consolidate_inplace() - self.obj._data = self.obj._data.setitem(indexer=indexer, - value=value) + self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) self.obj._maybe_update_cacher(clear=True) def _align_series(self, indexer, ser, multiindex_indexer=False): @@ -677,6 +701,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): # flatten np.ndarray indexers def ravel(i): return i.ravel() if isinstance(i, np.ndarray) else i + indexer = tuple(map(ravel, indexer)) aligners = [not com.is_null_slice(idx) for idx in indexer] @@ -696,8 +721,7 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) - if (sum_aligners == self.ndim and - all(is_sequence(_) for _ in indexer)): + if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer @@ -741,7 +765,7 @@ def ravel(i): return ser.reindex(ax)._values - raise ValueError('Incompatible indexer with Series') + raise ValueError("Incompatible indexer with Series") def _align_frame(self, indexer, df): is_frame = self.obj.ndim == 2 @@ -772,8 +796,7 @@ def _align_frame(self, indexer, df): val = df.reindex(idx, columns=cols)._values return val - elif ((isinstance(indexer, slice) or is_list_like_indexer(indexer)) and - is_frame): + elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: ax = self.obj.index[indexer] if df.index.equals(ax): val = df.copy()._values @@ -781,16 +804,20 @@ def _align_frame(self, indexer, df): # we have a multi-index and are trying to align # with a particular, level GH3738 - if (isinstance(ax, MultiIndex) and - isinstance(df.index, MultiIndex) and - ax.nlevels != df.index.nlevels): - raise TypeError("cannot align on a multi-index with out " - "specifying the join levels") + if ( + isinstance(ax, MultiIndex) + and isinstance(df.index, MultiIndex) + and ax.nlevels != 
df.index.nlevels + ): + raise TypeError( + "cannot align on a multi-index with out " + "specifying the join levels" + ) val = df.reindex(index=ax)._values return val - raise ValueError('Incompatible indexer with DataFrame') + raise ValueError("Incompatible indexer with DataFrame") def _getitem_tuple(self, tup): try: @@ -809,7 +836,7 @@ def _getitem_tuple(self, tup): retval = self.obj for i, key in enumerate(tup): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") if com.is_null_slice(key): continue @@ -859,8 +886,10 @@ def _multi_take(self, tup): """ # GH 836 o = self.obj - d = {axis: self._get_listlike_indexer(key, axis) - for (key, axis) in zip(tup, o._AXIS_ORDERS)} + d = { + axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, o._AXIS_ORDERS) + } return o._reindex_with_indexers(d, copy=True, allow_dups=True) def _convert_for_reindex(self, key, axis=None): @@ -878,8 +907,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup): except KeyError as ek: # raise KeyError if number of indexers match # else IndexingError will be raised - if (len(tup) <= self.obj.index.nlevels - and len(tup) > self.obj.ndim): + if len(tup) <= self.obj.index.nlevels and len(tup) > self.obj.ndim: raise ek except Exception as e1: if isinstance(tup[0], (slice, Index)): @@ -907,7 +935,7 @@ def _getitem_lowerdim(self, tup): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != 'iloc': + if isinstance(ax0, MultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -929,15 +957,18 @@ def _getitem_lowerdim(self, tup): elif section.ndim == self.ndim: # we're in the middle of slicing through a MultiIndex # revise the key wrt to `section` by inserting an _NS - new_key = tup[:i] + (_NS,) + tup[i + 1:] + new_key = tup[:i] + (_NS,) + tup[i + 1 :] else: - new_key = tup[:i] + tup[i + 1:] + new_key = tup[:i] + tup[i + 1 :] # unfortunately need an odious kludge here because of # DataFrame transposing convention - if (isinstance(section, ABCDataFrame) and i > 0 and - len(new_key) == 2): + if ( + isinstance(section, ABCDataFrame) + and i > 0 + and len(new_key) == 2 + ): a, b = new_key new_key = b, a @@ -951,7 +982,7 @@ def _getitem_lowerdim(self, tup): # This is an elided recursive call to iloc/loc/etc' return getattr(section, self.name)[new_key] - raise IndexingError('not applicable') + raise IndexingError("not applicable") def _getitem_nested_tuple(self, tup): # we have a nested tuple so have at least 1 multi-index level @@ -984,7 +1015,7 @@ def _getitem_nested_tuple(self, tup): axis += 1 # if we have a scalar, we are done - if is_scalar(obj) or not hasattr(obj, 'ndim'): + if is_scalar(obj) or not hasattr(obj, "ndim"): break # has the dim of the obj changed? 
@@ -1006,12 +1037,12 @@ def _getitem_axis(self, key, axis=None): labels = self.obj._get_axis(axis) if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) - elif (is_list_like_indexer(key) and - not (isinstance(key, tuple) and - isinstance(labels, MultiIndex))): + elif is_list_like_indexer(key) and not ( + isinstance(key, tuple) and isinstance(labels, MultiIndex) + ): - if hasattr(key, 'ndim') and key.ndim > 1: - raise ValueError('Cannot index with multidimensional key') + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") return self._getitem_iterable(key, axis=axis) else: @@ -1066,15 +1097,13 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): # Have the index compute an indexer or return None # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, - kind=self.name) + indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis, - raise_missing=raise_missing) + self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) return ax[indexer], indexer - if ax.is_unique and not getattr(ax, 'is_overlapping', False): + if ax.is_unique and not getattr(ax, "is_overlapping", False): # If we are trying to get actual keys from empty Series, we # patiently wait for a KeyError later on - otherwise, convert if len(ax) or not len(key): @@ -1084,9 +1113,9 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): else: keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - self._validate_read_indexer(keyarr, indexer, - o._get_axis_number(axis), - raise_missing=raise_missing) + self._validate_read_indexer( + keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing + ) return keyarr, indexer def _getitem_iterable(self, key, axis=None): @@ -1129,10 +1158,10 @@ def _getitem_iterable(self, key, axis=None): return self.obj._take(inds, axis=axis) else: # A collection of keys - keyarr, indexer = self._get_listlike_indexer(key, axis, - raise_missing=False) - return self.obj._reindex_with_indexers({axis: [keyarr, indexer]}, - copy=True, allow_dups=True) + keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) + return self.obj._reindex_with_indexers( + {axis: [keyarr, indexer]}, copy=True, allow_dups=True + ) def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): """ @@ -1171,11 +1200,13 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): if missing == len(indexer): raise KeyError( "None of [{key}] are in the [{axis}]".format( - key=key, axis=self.obj._get_axis_name(axis))) + key=key, axis=self.obj._get_axis_name(axis) + ) + ) # We (temporarily) allow for some missing keys with .loc, except in # some cases (e.g. setting) in which "raise_missing" will be False - if not(self.name == 'loc' and not raise_missing): + if not (self.name == "loc" and not raise_missing): not_found = list(set(key) - set(ax)) raise KeyError("{} not in index".format(not_found)) @@ -1185,19 +1216,19 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): # code, so we want to avoid warning & then # just raising - _missing_key_warning = textwrap.dedent(""" + _missing_key_warning = textwrap.dedent( + """ Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. 
See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""") # noqa + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""" + ) # noqa if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, - FutureWarning, stacklevel=6) + warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) - def _convert_to_indexer(self, obj, axis=None, is_setter=False, - raise_missing=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1239,7 +1270,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, except LookupError: if isinstance(obj, tuple) and isinstance(labels, MultiIndex): if is_setter and len(obj) == labels.nlevels: - return {'key': obj} + return {"key": obj} raise except TypeError: pass @@ -1255,14 +1286,14 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, if is_setter: # always valid - if self.name == 'loc': - return {'key': obj} + if self.name == "loc": + return {"key": obj} # a positional - if (obj >= self.obj.shape[axis] and - not isinstance(labels, MultiIndex)): - raise ValueError("cannot set by positional indexing with " - "enlargement") + if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): + raise ValueError( + "cannot set by positional indexing with " "enlargement" + ) return obj @@ -1277,8 +1308,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, return inds else: # When setting, missing keys are not allowed, even with .loc: - kwargs = {'raise_missing': True if is_setter else - raise_missing} + kwargs = {"raise_missing": True if is_setter else raise_missing} return self._get_listlike_indexer(obj, axis, **kwargs)[1] else: try: @@ -1286,7 +1316,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, except LookupError: # allow a not found key only if we are a setter if not is_list_like_indexer(obj) and is_setter: - return {'key': obj} + return {"key": obj} raise def _tuplify(self, loc): @@ -1305,7 +1335,7 @@ def _get_slice_axis(self, slice_obj, axis=None): indexer = self._convert_slice_indexer(slice_obj, axis) if isinstance(indexer, slice): - return self._slice(indexer, axis=axis, kind='iloc') + return self._slice(indexer, axis=axis, kind="iloc") else: return self.obj._take(indexer, axis=axis) @@ -1334,17 +1364,18 @@ class _IXIndexer(_NDFrameIndexer): See more at :ref:`Advanced Indexing `. """ - _ix_deprecation_warning = textwrap.dedent(""" + _ix_deprecation_warning = textwrap.dedent( + """ .ix is deprecated. 
Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""") # noqa + http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""" + ) # noqa def __init__(self, name, obj): - warnings.warn(self._ix_deprecation_warning, - FutureWarning, stacklevel=2) + warnings.warn(self._ix_deprecation_warning, FutureWarning, stacklevel=2) super().__init__(name, obj) @Appender(_NDFrameIndexer._validate_key.__doc__) @@ -1413,8 +1444,7 @@ class _LocationIndexer(_NDFrameIndexer): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: if self._is_scalar_access(key): return self._getitem_scalar(key) @@ -1458,11 +1488,12 @@ def _get_slice_axis(self, slice_obj, axis=None): return obj.copy(deep=False) labels = obj._get_axis(axis) - indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, - slice_obj.step, kind=self.name) + indexer = labels.slice_indexer( + slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name + ) if isinstance(indexer, slice): - return self._slice(indexer, axis=axis, kind='iloc') + return self._slice(indexer, axis=axis, kind="iloc") else: return self.obj._take(indexer, axis=axis) @@ -1705,9 +1736,11 @@ class _LocIndexer(_LocationIndexer): viper mark ii 7 1 """ - _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH " - "endpoints included! Can be slices of integers if the " - "index is integers), listlike of labels, boolean") + _valid_types = ( + "labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean" + ) _exception = KeyError @Appender(_NDFrameIndexer._validate_key.__doc__) @@ -1732,7 +1765,7 @@ def _is_scalar_access(self, key): # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, '__len__'): + if not hasattr(key, "__len__"): return False if len(key) != self.ndim: @@ -1761,7 +1794,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" if isinstance(labels, MultiIndex): - if (isinstance(key, str) and labels.levels[0].is_all_dates): + if isinstance(key, str) and labels.levels[0].is_all_dates: # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) key = tuple([key] + [slice(None)] * (len(labels.levels) - 1)) @@ -1771,8 +1804,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels): # (..., slice('2016-01-01', '2016-01-01', None), ...) 
new_key = [] for i, component in enumerate(key): - if (isinstance(component, str) and - labels.levels[i].is_all_dates): + if isinstance(component, str) and labels.levels[i].is_all_dates: new_key.append(slice(component, component, None)) else: new_key.append(component) @@ -1810,23 +1842,30 @@ def _getitem_axis(self, key, axis=None): key = list(key) elif isinstance(key, ABCDataFrame): # GH 15438 - raise NotImplementedError("Indexing a MultiIndex with a " - "DataFrame key is not " - "implemented") - elif hasattr(key, 'ndim') and key.ndim > 1: - raise NotImplementedError("Indexing a MultiIndex with a " - "multidimensional key is not " - "implemented") - - if (not isinstance(key, tuple) and len(key) > 1 and - not isinstance(key[0], tuple)): + raise NotImplementedError( + "Indexing a MultiIndex with a " + "DataFrame key is not " + "implemented" + ) + elif hasattr(key, "ndim") and key.ndim > 1: + raise NotImplementedError( + "Indexing a MultiIndex with a " + "multidimensional key is not " + "implemented" + ) + + if ( + not isinstance(key, tuple) + and len(key) > 1 + and not isinstance(key[0], tuple) + ): key = tuple([key]) # an iterable multi-selection if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): - if hasattr(key, 'ndim') and key.ndim > 1: - raise ValueError('Cannot index with multidimensional key') + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") return self._getitem_iterable(key, axis=axis) @@ -1978,19 +2017,25 @@ class _iLocIndexer(_LocationIndexer): 2 1000 3000 """ - _valid_types = ("integer, integer slice (START point is INCLUDED, END " - "point is EXCLUDED), listlike of integers, boolean array") + _valid_types = ( + "integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array" + ) _exception = IndexError def _validate_key(self, key, axis): if com.is_bool_indexer(key): - if hasattr(key, 'index') and isinstance(key.index, Index): - if key.index.inferred_type == 'integer': - raise NotImplementedError("iLocation based boolean " - "indexing on an integer type " - "is not available") - raise ValueError("iLocation based boolean indexing cannot use " - "an indexable as a mask") + if hasattr(key, "index") and isinstance(key.index, Index): + if key.index.inferred_type == "integer": + raise NotImplementedError( + "iLocation based boolean " + "indexing on an integer type " + "is not available" + ) + raise ValueError( + "iLocation based boolean indexing cannot use " + "an indexable as a mask" + ) return if isinstance(key, slice): @@ -2000,22 +2045,25 @@ def _validate_key(self, key, axis): elif isinstance(key, tuple): # a tuple should already have been caught by this point # so don't treat a tuple as a valid indexer - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") elif is_list_like_indexer(key): arr = np.array(key) len_axis = len(self.obj._get_axis(axis)) # check that the key has a numeric dtype if not is_numeric_dtype(arr.dtype): - raise IndexError(".iloc requires numeric indexers, got " - "{arr}".format(arr=arr)) + raise IndexError( + ".iloc requires numeric indexers, got " "{arr}".format(arr=arr) + ) # check that the key does not exceed the maximum size of the index if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): raise IndexError("positional indexers are out-of-bounds") else: - raise ValueError("Can only index by location with " - "a [{types}]".format(types=self._valid_types)) + raise ValueError( + "Can only index by 
location with " + "a [{types}]".format(types=self._valid_types) + ) def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) @@ -2025,7 +2073,7 @@ def _is_scalar_access(self, key): # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, '__len__'): + if not hasattr(key, "__len__"): return False if len(key) != self.ndim: @@ -2084,7 +2132,7 @@ def _getitem_tuple(self, tup): axis = 0 for i, key in enumerate(tup): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") if com.is_null_slice(key): axis += 1 @@ -2111,7 +2159,7 @@ def _get_slice_axis(self, slice_obj, axis=None): slice_obj = self._convert_slice_indexer(slice_obj, axis) if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, kind='iloc') + return self._slice(slice_obj, axis=axis, kind="iloc") else: return self.obj._take(slice_obj, axis=axis) @@ -2158,8 +2206,9 @@ def _getitem_axis(self, key, axis=None): else: key = item_from_zerodim(key) if not is_integer(key): - raise TypeError("Cannot index by location index with a " - "non-integer key") + raise TypeError( + "Cannot index by location index with a " "non-integer key" + ) # validate the location self._validate_integer(key, axis) @@ -2182,8 +2231,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): self._validate_key(obj, axis) return obj except ValueError: - raise ValueError("Can only index by location with " - "a [{types}]".format(types=self._valid_types)) + raise ValueError( + "Can only index by location with " + "a [{types}]".format(types=self._valid_types) + ) class _ScalarAccessIndexer(_NDFrameIndexer): @@ -2199,15 +2250,14 @@ def __getitem__(self, key): if not is_list_like_indexer(key): key = tuple([key]) else: - raise ValueError('Invalid call for scalar access (getting)!') + raise ValueError("Invalid call for scalar access (getting)!") key = self._convert_key(key) return self.obj._get_value(*key, takeable=self._takeable) def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: # scalar callable may return tuple key = com.apply_if_callable(key, self.obj) @@ -2215,8 +2265,7 @@ def __setitem__(self, key, value): if not isinstance(key, tuple): key = self._tuplify(key) if len(key) != self.obj.ndim: - raise ValueError('Not enough indexers for scalar access ' - '(setting)!') + raise ValueError("Not enough indexers for scalar access " "(setting)!") key = list(self._convert_key(key, is_setter=True)) key.append(value) self.obj._set_value(*key, takeable=self._takeable) @@ -2283,13 +2332,17 @@ def _convert_key(self, key, is_setter=False): for ax, i in zip(self.obj.axes, key): if ax.is_integer(): if not is_integer(i): - raise ValueError("At based indexing on an integer index " - "can only have integer indexers") + raise ValueError( + "At based indexing on an integer index " + "can only have integer indexers" + ) else: if is_integer(i) and not ax.holds_integer(): - raise ValueError("At based indexing on an non-integer " - "index can only have non-integer " - "indexers") + raise ValueError( + "At based indexing on an non-integer " + "index can only have non-integer " + "indexers" + ) return key @@ -2348,8 +2401,7 @@ def _convert_key(self, key, is_setter=False): """ require integer args (and convert to 
label arguments) """ for a, i in zip(self.obj.axes, key): if not is_integer(i): - raise ValueError("iAt based indexing can only have integer " - "indexers") + raise ValueError("iAt based indexing can only have integer " "indexers") return key @@ -2388,7 +2440,7 @@ def convert_to_index_sliceable(obj, key): """ idx = obj.index if isinstance(key, slice): - return idx._convert_slice_indexer(key, kind='getitem') + return idx._convert_slice_indexer(key, kind="getitem") elif isinstance(key, str): @@ -2440,9 +2492,11 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = result.reindex(index) mask = isna(result._values) if mask.any(): - raise IndexingError('Unalignable boolean Series provided as ' - 'indexer (index of the boolean Series and of ' - 'the indexed object do not match).') + raise IndexingError( + "Unalignable boolean Series provided as " + "indexer (index of the boolean Series and of " + "the indexed object do not match)." + ) result = result.astype(bool)._values else: if is_sparse(result): @@ -2452,8 +2506,8 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: # GH26658 if len(result) != len(index): raise IndexError( - 'Item wrong length {} instead of {}.'.format(len(result), - len(index))) + "Item wrong length {} instead of {}.".format(len(result), len(index)) + ) return result @@ -2488,18 +2542,24 @@ def check_setitem_lengths(indexer, value, values): # boolean with truth values == len of the value is ok too if isinstance(indexer, (np.ndarray, list)): if is_list_like(value) and len(indexer) != len(value): - if not (isinstance(indexer, np.ndarray) and - indexer.dtype == np.bool_ and - len(indexer[indexer]) == len(value)): - raise ValueError("cannot set using a list-like indexer " - "with a different length than the value") + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) # slice elif isinstance(indexer, slice): if is_list_like(value) and len(values): if len(value) != length_of_indexer(indexer, values): - raise ValueError("cannot set using a slice indexer with a " - "different length than the value") + raise ValueError( + "cannot set using a slice indexer with a " + "different length than the value" + ) def convert_missing_indexer(indexer): @@ -2511,7 +2571,7 @@ def convert_missing_indexer(indexer): if isinstance(indexer, dict): # a missing key (but not a tuple indexer) - indexer = indexer['key'] + indexer = indexer["key"] if isinstance(indexer, bool): raise KeyError("cannot use a single bool to index into setitem") @@ -2526,8 +2586,7 @@ def convert_from_missing_indexer_tuple(indexer, axes): """ def get_indexer(_i, _idx): - return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else - _idx) + return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)) @@ -2608,8 +2667,9 @@ def validate_indices(indices, n): if len(indices): min_idx = indices.min() if min_idx < -1: - msg = ("'indices' contains values less than allowed ({} < {})" - .format(min_idx, -1)) + msg = "'indices' contains values less than allowed ({} < {})".format( + min_idx, -1 + ) raise ValueError(msg) max_idx = indices.max() @@ -2648,8 +2708,7 @@ def is_nested_tuple(tup, labels): def is_list_like_indexer(key): # allow a list_like, but exclude NamedTuples which can be indexers - return is_list_like(key) and not (isinstance(key, tuple) 
and - type(key) is not tuple) + return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) def is_label_like(key): @@ -2658,8 +2717,11 @@ def is_label_like(key): def need_slice(obj): - return (obj.start is not None or obj.stop is not None or - (obj.step is not None and obj.step != 1)) + return ( + obj.start is not None + or obj.stop is not None + or (obj.step is not None and obj.step != 1) + ) def maybe_droplevels(index, key): @@ -2697,8 +2759,9 @@ def _non_reducing_slice(slice_): def pred(part): # true when slice does *not* reduce, False when part is a tuple, # i.e. MultiIndex slice - return ((isinstance(part, slice) or is_list_like(part)) - and not isinstance(part, tuple)) + return (isinstance(part, slice) or is_list_like(part)) and not isinstance( + part, tuple + ) if not is_list_like(slice_): if not isinstance(slice_, slice): diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index bf46e5d1a74e47..8ac0df2fa4e0a0 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,16 +1,28 @@ - from .blocks import ( # noqa: F401 - Block, BoolBlock, CategoricalBlock, ComplexBlock, DatetimeBlock, - DatetimeTZBlock, ExtensionBlock, FloatBlock, IntBlock, ObjectBlock, - TimeDeltaBlock) + Block, + BoolBlock, + CategoricalBlock, + ComplexBlock, + DatetimeBlock, + DatetimeTZBlock, + ExtensionBlock, + FloatBlock, + IntBlock, + ObjectBlock, + TimeDeltaBlock, +) from .managers import ( # noqa: F401 - BlockManager, SingleBlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks) + BlockManager, + SingleBlockManager, + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) from .blocks import _safe_reshape # noqa: F401; io.packers from .blocks import make_block # noqa: F401; io.pytables, io.packers from .managers import ( # noqa: F401; reshape.concat, reshape.merge _transform_index, - concatenate_block_managers) + concatenate_block_managers, +) from .blocks import _block_shape # noqa:F401; io.pytables diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a9b2c0491458cf..34186b60de27c3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -13,27 +13,63 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( - astype_nansafe, find_common_type, infer_dtype_from, - infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype, - maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects) + astype_nansafe, + find_common_type, + infer_dtype_from, + infer_dtype_from_scalar, + maybe_convert_objects, + maybe_downcast_to_dtype, + maybe_infer_dtype_type, + maybe_promote, + maybe_upcast, + soft_convert_objects, +) from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_dtype_equal, is_extension_array_dtype, is_extension_type, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype, - is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_platform_int, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + 
is_list_like, + is_numeric_v_string_like, + is_object_dtype, + is_period_dtype, + is_re, + is_re_compilable, + is_sparse, + is_timedelta64_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, - ABCPandasArray, ABCSeries) -from pandas.core.dtypes.missing import ( - _isna_compat, array_equivalent, isna, notna) + ABCDataFrame, + ABCDatetimeIndex, + ABCExtensionArray, + ABCIndexClass, + ABCPandasArray, + ABCSeries, +) +from pandas.core.dtypes.missing import _isna_compat, array_equivalent, isna, notna import pandas.core.algorithms as algos from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, PandasDtype, TimedeltaArray) + Categorical, + DatetimeArray, + ExtensionArray, + PandasDtype, + TimedeltaArray, +) from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexing import check_setitem_lengths @@ -51,7 +87,8 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ - __slots__ = ['_mgr_locs', 'values', 'ndim'] + + __slots__ = ["_mgr_locs", "values", "ndim"] is_numeric = False is_float = False is_integer = False @@ -67,7 +104,7 @@ class Block(PandasObject): _can_consolidate = True _verify_integrity = True _validate_ndim = True - _ftype = 'dense' + _ftype = "dense" _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): @@ -75,11 +112,11 @@ def __init__(self, values, placement, ndim=None): self.mgr_locs = placement self.values = values - if (self._validate_ndim and self.ndim and - len(self.mgr_locs) != len(self.values)): + if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( - 'Wrong number of items passed {val}, placement implies ' - '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + "Wrong number of items passed {val}, placement implies " + "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs)) + ) def _check_ndim(self, values, ndim): """ @@ -106,8 +143,7 @@ def _check_ndim(self, values, ndim): ndim = values.ndim if self._validate_ndim and values.ndim != ndim: - msg = ("Wrong number of dimensions. values.ndim != ndim " - "[{} != {}]") + msg = "Wrong number of dimensions. values.ndim != ndim " "[{} != {}]" raise ValueError(msg.format(values.ndim, ndim)) return ndim @@ -218,32 +254,38 @@ def make_block(self, values, placement=None): return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None, - dtype=None): + def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): """ Wrap given values in a block of same type as self. 
""" if dtype is not None: # issue 19431 fastparquet is passing this - warnings.warn("dtype argument is deprecated, will be removed " - "in a future release.", FutureWarning) + warnings.warn( + "dtype argument is deprecated, will be removed " "in a future release.", + FutureWarning, + ) if placement is None: placement = self.mgr_locs - return make_block(values, placement=placement, ndim=ndim, - klass=self.__class__, dtype=dtype) + return make_block( + values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype + ) def __repr__(self): # don't want to print out all of the items here name = pprint_thing(self.__class__.__name__) if self._is_single_block: - result = '{name}: {len} dtype: {dtype}'.format( - name=name, len=len(self), dtype=self.dtype) + result = "{name}: {len} dtype: {dtype}".format( + name=name, len=len(self), dtype=self.dtype + ) else: - shape = ' x '.join(pprint_thing(s) for s in self.shape) - result = '{name}: {index}, {shape}, dtype: {dtype}'.format( - name=name, index=pprint_thing(self.mgr_locs.indexer), - shape=shape, dtype=self.dtype) + shape = " x ".join(pprint_thing(s) for s in self.shape) + result = "{name}: {index}, {shape}, dtype: {dtype}".format( + name=name, + index=pprint_thing(self.mgr_locs.indexer), + shape=shape, + dtype=self.dtype, + ) return result @@ -292,7 +334,7 @@ def dtype(self): @property def ftype(self): - if getattr(self.values, '_pandas_ftype', False): + if getattr(self.values, "_pandas_ftype", False): dtype = self.dtype.subtype else: dtype = self.dtype @@ -305,10 +347,12 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1) + ) def iget(self, i): return self.values[i] @@ -334,11 +378,10 @@ def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(self.values, **kwargs) if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result, - ndim=self.ndim)) + result = self.make_block(values=_block_shape(result, ndim=self.ndim)) return result @@ -346,17 +389,18 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): """ fillna on the block with the value. 
If we fail, then convert to ObjectBlock and try again """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) if limit is not None: if not is_integer(limit): - raise ValueError('Limit must be an integer') + raise ValueError("Limit must be an integer") if limit < 1: - raise ValueError('Limit must be greater than 0') + raise ValueError("Limit must be greater than 0") if self.ndim > 2: - raise NotImplementedError("number of dimensions for 'fillna' " - "is currently limited to 2") + raise NotImplementedError( + "number of dimensions for 'fillna' " "is currently limited to 2" + ) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -371,8 +415,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): self._try_coerce_args(value) blocks = self.putmask(mask, value, inplace=inplace) - blocks = [b.make_block(values=self._try_coerce_result(b.values)) - for b in blocks] + blocks = [ + b.make_block(values=self._try_coerce_result(b.values)) for b in blocks + ] return self._maybe_downcast(blocks, downcast) except (TypeError, ValueError): @@ -387,10 +432,7 @@ def f(m, v, i): # slice out our block if i is not None: block = block.getitem_block(slice(i, i + 1)) - return block.fillna(value, - limit=limit, - inplace=inplace, - downcast=None) + return block.fillna(value, limit=limit, inplace=inplace, downcast=None) return self.split_and_operate(mask, f, inplace) @@ -424,8 +466,7 @@ def make_a_block(nv, ref_loc): # Put back the dimension that was taken from it and make # a block out of the result. nv = _block_shape(nv, ndim=self.ndim) - block = self.make_block(values=nv, - placement=ref_loc) + block = self.make_block(values=nv, placement=ref_loc) return block # ndim == 1 @@ -481,7 +522,7 @@ def downcast(self, dtypes=None): # try to cast all non-floats here if dtypes is None: - dtypes = 'infer' + dtypes = "infer" nv = maybe_downcast_to_dtype(values, dtypes) return self.make_block(nv) @@ -490,16 +531,17 @@ def downcast(self, dtypes=None): if dtypes is None: return self - if not (dtypes == 'infer' or isinstance(dtypes, dict)): - raise ValueError("downcast must have a dictionary or 'infer' as " - "its argument") + if not (dtypes == "infer" or isinstance(dtypes, dict)): + raise ValueError( + "downcast must have a dictionary or 'infer' as " "its argument" + ) # operate column-by-column # this is expensive as it splits the blocks items-by-item def f(m, v, i): - if dtypes == 'infer': - dtype = 'infer' + if dtypes == "infer": + dtype = "infer" else: raise AssertionError("dtypes as dict is not supported yet") @@ -509,12 +551,10 @@ def f(m, v, i): return self.split_and_operate(None, f, False) - def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): - return self._astype(dtype, copy=copy, errors=errors, values=values, - **kwargs) + def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): + return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs) - def _astype(self, dtype, copy=False, errors='raise', values=None, - **kwargs): + def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): """Coerce to the new type Parameters @@ -530,31 +570,34 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, ------- Block """ - errors_legal_values = ('raise', 'ignore') + errors_legal_values = ("raise", "ignore") if errors not in errors_legal_values: - invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. 
" - "Supplied value is '{}'".format( - list(errors_legal_values), errors)) + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of {}. " + "Supplied value is '{}'".format(list(errors_legal_values), errors) + ) raise ValueError(invalid_arg) - if (inspect.isclass(dtype) and - issubclass(dtype, ExtensionDtype)): - msg = ("Expected an instance of {}, but got the class instead. " - "Try instantiating 'dtype'.".format(dtype.__name__)) + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + "Expected an instance of {}, but got the class instead. " + "Try instantiating 'dtype'.".format(dtype.__name__) + ) raise TypeError(msg) # may need to convert to categorical if self.is_categorical_astype(dtype): # deprecated 17636 - for deprecated_arg in ('categories', 'ordered'): + for deprecated_arg in ("categories", "ordered"): if deprecated_arg in kwargs: - raise ValueError('Got an unexpected argument: {}'.format( - deprecated_arg)) + raise ValueError( + "Got an unexpected argument: {}".format(deprecated_arg) + ) - categories = kwargs.get('categories', None) - ordered = kwargs.get('ordered', None) + categories = kwargs.get("categories", None) + ordered = kwargs.get("ordered", None) if com._any_not_none(categories, ordered): dtype = CategoricalDtype(categories, ordered) @@ -602,12 +645,11 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = values.reshape(self.shape) except Exception: # noqa: E722 - if errors == 'raise': + if errors == "raise": raise newb = self.copy() if copy else self else: - newb = make_block(values, placement=self.mgr_locs, - ndim=self.ndim) + newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: @@ -615,9 +657,13 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, "cannot set astype for copy = [{copy}] for dtype " "({dtype} [{shape}]) to different shape " "({newb_dtype} [{newb_shape}])".format( - copy=copy, dtype=self.dtype.name, - shape=self.shape, newb_dtype=newb.dtype.name, - newb_shape=newb.shape)) + copy=copy, + dtype=self.dtype.name, + shape=self.shape, + newb_dtype=newb.dtype.name, + newb_shape=newb.shape, + ) + ) return newb def convert(self, copy=True, **kwargs): @@ -647,7 +693,7 @@ def _try_cast_result(self, result, dtype=None): pass elif self.is_float and result.dtype == self.dtype: # protect against a bool/object showing up here - if isinstance(dtype, str) and dtype == 'infer': + if isinstance(dtype, str) and dtype == "infer": return result # This is only reached via Block.setitem, where dtype is always @@ -678,9 +724,12 @@ def _try_coerce_args(self, other): if np.any(notna(other)) and not self._can_hold_element(other): # coercion issues # let higher levels handle - raise TypeError("cannot convert {} to an {}".format( - type(other).__name__, - type(self).__name__.lower().replace('Block', ''))) + raise TypeError( + "cannot convert {} to an {}".format( + type(other).__name__, + type(self).__name__.lower().replace("Block", ""), + ) + ) return other @@ -693,8 +742,7 @@ def _try_coerce_and_cast_result(self, result, dtype=None): result = self._try_cast_result(result, dtype=dtype) return result - def to_native_types(self, slicer=None, na_rep='nan', quoting=None, - **kwargs): + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.get_values() @@ -706,7 +754,7 @@ def to_native_types(self, slicer=None, na_rep='nan', 
quoting=None, if not self.is_object and not quoting: values = values.astype(str) else: - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep return values @@ -719,14 +767,15 @@ def copy(self, deep=True): values = values.copy() return self.make_block_same_class(values, ndim=self.ndim) - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): """replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. It is used in ObjectBlocks. It is here for API compatibility. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace # try to replace, if we raise an error, convert to ObjectBlock and @@ -742,12 +791,14 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try again with a compatible block block = self.astype(object) - return block.replace(to_replace=original_to_replace, - value=value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert) + return block.replace( + to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) mask = missing.mask_missing(values, to_replace) if filter is not None: @@ -764,20 +815,23 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try again with a compatible block block = self.astype(object) - return block.replace(to_replace=original_to_replace, - value=value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert) + return block.replace( + to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) if convert: - blocks = [b.convert(by_item=True, numeric=False, - copy=not inplace) for b in blocks] + blocks = [ + b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks + ] return blocks def _replace_single(self, *args, **kwargs): """ no-op on a non-ObjectBlock """ - return self if kwargs['inplace'] else self.copy() + return self if kwargs["inplace"] else self.copy() def setitem(self, indexer, value): """Set the value inplace, returning a a maybe different typed block. 
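As an illustrative aside on the `replace` fallback reformatted in the hunk above (this sketch is not part of the patch): when a block cannot hold the replacement value, `Block.replace` casts to object and retries, which is what makes the following user-level call work under default pandas semantics.

import pandas as pd

# Replacing an integer with a string cannot be stored in an integer block,
# so the block is converted to an ObjectBlock and the replace is retried.
s = pd.Series([1, 2, 3])
out = s.replace(2, "two")
print(out.dtype)      # object
print(out.tolist())   # [1, 'two', 3]
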
@@ -809,17 +863,16 @@ def setitem(self, indexer, value): value = self._try_coerce_args(value) values = self._coerce_values(values) # can keep its own dtype - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): + if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): dtype = self.dtype else: - dtype = 'infer' + dtype = "infer" except (TypeError, ValueError): # current dtype cannot store value, coerce to common dtype find_dtype = False - if hasattr(value, 'dtype'): + if hasattr(value, "dtype"): dtype = value.dtype find_dtype = True @@ -828,11 +881,10 @@ def setitem(self, indexer, value): # NaN promotion is handled in latter path dtype = False else: - dtype, _ = infer_dtype_from_scalar(value, - pandas_dtype=True) + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) find_dtype = True else: - dtype = 'infer' + dtype = "infer" if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -860,8 +912,9 @@ def _is_scalar_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) + return any( + isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer + ) return False def _is_empty_indexer(indexer): @@ -872,8 +925,9 @@ def _is_empty_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) + return any( + isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer + ) return False # empty indexers @@ -889,9 +943,11 @@ def _is_empty_indexer(indexer): # if we are an exact match (ex-broadcasting), # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): + elif ( + len(arr_value.shape) + and arr_value.shape[0] == values.shape[0] + and np.prod(arr_value.shape) == np.prod(values.shape) + ): values[indexer] = value try: values = values.astype(arr_value.dtype) @@ -907,8 +963,7 @@ def _is_empty_indexer(indexer): block = self.make_block(transf(values)) return block - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False): + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -931,8 +986,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_values = self.values if inplace else self.values.copy() - new = getattr(new, 'values', new) - mask = getattr(mask, 'values', mask) + new = getattr(new, "values", new) + mask = getattr(mask, "values", mask) # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: @@ -946,10 +1001,9 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # If the default repeat behavior in np.putmask would go in the # wrong direction, then explicitly repeat and reshape new instead - if getattr(new, 'ndim', 0) >= 1: + if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim and axis == 1: - new = np.repeat( - new, new_values.shape[-1]).reshape(self.shape) + new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) # we require exact matches between the len of the @@ -959,15 +1013,18 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # # TODO: this prob needs some better checking # 
for 2D cases - if ((is_list_like(new) and - np.any(mask[mask]) and - getattr(new, 'ndim', 1) == 1)): - - if not (mask.shape[-1] == len(new) or - mask[mask].shape[-1] == len(new) or - len(new) == 1): - raise ValueError("cannot assign mismatch " - "length to masked array") + if ( + is_list_like(new) + and np.any(mask[mask]) + and getattr(new, "ndim", 1) == 1 + ): + + if not ( + mask.shape[-1] == len(new) + or mask[mask].shape[-1] == len(new) + or len(new) == 1 + ): + raise ValueError("cannot assign mismatch " "length to masked array") np.putmask(new_values, mask, new) @@ -980,7 +1037,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, axis = new_values.ndim - axis - 1 # Pseudo-broadcast - if getattr(new, 'ndim', 0) >= 1: + if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim: new_shape = list(new.shape) new_shape.insert(axis, 1) @@ -1038,40 +1095,47 @@ def coerce_to_target_dtype(self, other): # we don't upcast to bool return self.astype(object) - elif ((self.is_float or self.is_complex) and - (is_integer_dtype(dtype) or is_float_dtype(dtype))): + elif (self.is_float or self.is_complex) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + ): # don't coerce float/complex to int return self - elif (self.is_datetime or - is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)): + elif ( + self.is_datetime + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): # not a datetime - if not ((is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)) and self.is_datetime): + if not ( + (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) + and self.is_datetime + ): return self.astype(object) # don't upcast timezone with different timezone or no timezone - mytz = getattr(self.dtype, 'tz', None) - othertz = getattr(dtype, 'tz', None) + mytz = getattr(self.dtype, "tz", None) + othertz = getattr(dtype, "tz", None) if str(mytz) != str(othertz): return self.astype(object) - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) + raise AssertionError( + "possible recursion in " + "coerce_to_target_dtype: {} {}".format(self, other) + ) - elif (self.is_timedelta or is_timedelta64_dtype(dtype)): + elif self.is_timedelta or is_timedelta64_dtype(dtype): # not a timedelta if not (is_timedelta64_dtype(dtype) and self.is_timedelta): return self.astype(object) - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) + raise AssertionError( + "possible recursion in " + "coerce_to_target_dtype: {} {}".format(self, other) + ) try: return self.astype(dtype) @@ -1080,12 +1144,23 @@ def coerce_to_target_dtype(self, other): return self.astype(object) - def interpolate(self, method='pad', axis=0, index=None, values=None, - inplace=False, limit=None, limit_direction='forward', - limit_area=None, fill_value=None, coerce=False, - downcast=None, **kwargs): - - inplace = validate_bool_kwarg(inplace, 'inplace') + def interpolate( + self, + method="pad", + axis=0, + index=None, + values=None, + inplace=False, + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + coerce=False, + downcast=None, + **kwargs + ): + + inplace = validate_bool_kwarg(inplace, "inplace") def check_int_bool(self, inplace): # Only FloatBlocks will contain NaNs. 
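The putmask changes in the hunk above reformat logic that works around the repeat behaviour of `np.putmask`. As an illustrative aside (not from the patch), a standalone sketch of why that workaround exists:

import numpy as np

# np.putmask repeats `values` to the length of the target array and assigns
# by absolute position, which differs from boolean-mask assignment.
arr = np.array([1.0, 2.0, 3.0, 4.0])
mask = np.array([True, False, True, False])
np.putmask(arr, mask, np.array([10.0, 30.0]))
print(arr)    # [10.  2. 10.  4.]  (values repeated as 10, 30, 10, 30)

arr2 = np.array([1.0, 2.0, 3.0, 4.0])
arr2[mask] = [10.0, 30.0]
print(arr2)   # [10.  2. 30.  4.]
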
@@ -1106,30 +1181,48 @@ def check_int_bool(self, inplace): r = check_int_bool(self, inplace) if r is not None: return r - return self._interpolate_with_fill(method=m, axis=axis, - inplace=inplace, limit=limit, - fill_value=fill_value, - coerce=coerce, - downcast=downcast) + return self._interpolate_with_fill( + method=m, + axis=axis, + inplace=inplace, + limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast, + ) # validate the interp method m = missing.clean_interp_method(method, **kwargs) r = check_int_bool(self, inplace) if r is not None: return r - return self._interpolate(method=m, index=index, values=values, - axis=axis, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, inplace=inplace, - downcast=downcast, **kwargs) - - def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, - limit=None, fill_value=None, coerce=False, - downcast=None): + return self._interpolate( + method=m, + index=index, + values=values, + axis=axis, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + inplace=inplace, + downcast=downcast, + **kwargs + ) + + def _interpolate_with_fill( + self, + method="pad", + axis=0, + inplace=False, + limit=None, + fill_value=None, + coerce=False, + downcast=None, + ): """ fillna but using the interpolate machinery """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # if we are coercing, then don't force the conversion # if the block can't hold the type @@ -1143,21 +1236,36 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, values = self.values if inplace else self.values.copy() values = self._coerce_values(values) fill_value = self._try_coerce_args(fill_value) - values = missing.interpolate_2d(values, method=method, axis=axis, - limit=limit, fill_value=fill_value, - dtype=self.dtype) + values = missing.interpolate_2d( + values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype, + ) values = self._try_coerce_result(values) blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) - def _interpolate(self, method=None, index=None, values=None, - fill_value=None, axis=0, limit=None, - limit_direction='forward', limit_area=None, - inplace=False, downcast=None, **kwargs): + def _interpolate( + self, + method=None, + index=None, + values=None, + fill_value=None, + axis=0, + limit=None, + limit_direction="forward", + limit_area=None, + inplace=False, + downcast=None, + **kwargs + ): """ interpolate using scipy wrappers """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") data = self.values if inplace else self.values.copy() # only deal with floats @@ -1169,10 +1277,12 @@ def _interpolate(self, method=None, index=None, values=None, if fill_value is None: fill_value = self.fill_value - if method in ('krogh', 'piecewise_polynomial', 'pchip'): + if method in ("krogh", "piecewise_polynomial", "pchip"): if not index.is_monotonic: - raise ValueError("{0} interpolation requires that the " - "index be monotonic.".format(method)) + raise ValueError( + "{0} interpolation requires that the " + "index be monotonic.".format(method) + ) # process 1-d slices in the axis direction def func(x): @@ -1180,11 +1290,17 @@ def func(x): # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. 
not an arg to missing.interpolate_1d - return missing.interpolate_1d(index, x, method=method, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, - bounds_error=False, **kwargs) + return missing.interpolate_1d( + index, + x, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + bounds_error=False, + **kwargs + ) # interp each column independently interp_values = np.apply_along_axis(func, axis, data) @@ -1206,12 +1322,14 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): if fill_tuple is None: fill_value = self.fill_value - new_values = algos.take_nd(values, indexer, axis=axis, - allow_fill=False, fill_value=fill_value) + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=False, fill_value=fill_value + ) else: fill_value = fill_tuple[0] - new_values = algos.take_nd(values, indexer, axis=axis, - allow_fill=True, fill_value=fill_value) + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=True, fill_value=fill_value + ) if new_mgr_locs is None: if axis == 0: @@ -1247,8 +1365,7 @@ def shift(self, periods, axis=0, fill_value=None): axis = new_values.ndim - axis - 1 if np.prod(new_values.shape): - new_values = np.roll(new_values, ensure_platform_int(periods), - axis=axis) + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) axis_indexer = [slice(None)] * self.ndim if periods > 0: @@ -1263,8 +1380,16 @@ def shift(self, periods, axis=0, fill_value=None): return [self.make_block(new_values)] - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): """ evaluate the block; return result block(s) from the result @@ -1286,27 +1411,27 @@ def where(self, other, cond, align=True, errors='raise', a new block(s), the result of the func """ import pandas.core.computation.expressions as expressions - assert errors in ['raise', 'ignore'] + + assert errors in ["raise", "ignore"] values = self.values orig_other = other if transpose: values = values.T - other = getattr(other, '_values', getattr(other, 'values', other)) - cond = getattr(cond, 'values', cond) + other = getattr(other, "_values", getattr(other, "values", other)) + cond = getattr(cond, "values", cond) # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead - if getattr(other, 'ndim', 0) >= 1: + if getattr(other, "ndim", 0) >= 1: if values.ndim - 1 == other.ndim and axis == 1: - other = other.reshape(tuple(other.shape + (1, ))) + other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: cond = cond.T - if not hasattr(cond, 'shape'): - raise ValueError("where must have a condition that is ndarray " - "like") + if not hasattr(cond, "shape"): + raise ValueError("where must have a condition that is ndarray " "like") # our where function def func(cond, values, other): @@ -1316,13 +1441,14 @@ def func(cond, values, other): fastres = expressions.where(cond, values, other) return self._try_coerce_result(fastres) except Exception as detail: - if errors == 'raise': + if errors == "raise": raise TypeError( - 'Could not operate [{other!r}] with block values ' - '[{detail!s}]'.format(other=other, detail=detail)) + "Could not operate [{other!r}] with block values " + "[{detail!s}]".format(other=other, detail=detail) + ) else: # 
return the values - result = np.empty(values.shape, dtype='float64') + result = np.empty(values.shape, dtype="float64") result.fill(np.nan) return result @@ -1339,11 +1465,16 @@ def func(cond, values, other): # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond, align=align, - errors=errors, - try_cast=try_cast, axis=axis, - transpose=transpose) - return self._maybe_downcast(blocks, 'infer') + blocks = block.where( + orig_other, + cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=axis, + transpose=transpose, + ) + return self._maybe_downcast(blocks, "infer") if self._can_hold_na or self.ndim == 1: @@ -1359,8 +1490,7 @@ def func(cond, values, other): # might need to separate out blocks axis = cond.ndim - 1 cond = cond.swapaxes(axis, 0) - mask = np.array([cond[i].all() for i in range(cond.shape[0])], - dtype=bool) + mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) result_blocks = [] for m in [mask, ~mask]: @@ -1410,7 +1540,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): blocks = [make_block(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation='linear', axis=0): + def quantile(self, qs, interpolation="linear", axis=0): """ compute the quantiles of the @@ -1450,18 +1580,23 @@ def quantile(self, qs, interpolation='linear', axis=0): if is_empty: # create the array of na_values # 2d len(values) * len(qs) - result = np.repeat(np.array([self.fill_value] * len(qs)), - len(values)).reshape(len(values), - len(qs)) + result = np.repeat( + np.array([self.fill_value] * len(qs)), len(values) + ).reshape(len(values), len(qs)) else: # asarray needed for Sparse, see GH#24600 # Note: we use self.values below instead of values because the # `asi8` conversion above will behave differently under `isna` mask = np.asarray(isna(self.values)) - result = nanpercentile(values, np.array(qs) * 100, - axis=axis, na_value=self.fill_value, - mask=mask, ndim=self.ndim, - interpolation=interpolation) + result = nanpercentile( + values, + np.array(qs) * 100, + axis=axis, + na_value=self.fill_value, + mask=mask, + ndim=self.ndim, + interpolation=interpolation, + ) result = np.array(result, copy=False) result = result.T @@ -1472,14 +1607,13 @@ def quantile(self, qs, interpolation='linear', axis=0): result = result[..., 0] result = lib.item_from_zerodim(result) - ndim = getattr(result, 'ndim', None) or 0 + ndim = getattr(result, "ndim", None) or 0 result = self._try_coerce_result(result) - return make_block(result, - placement=np.arange(len(result)), - ndim=ndim) + return make_block(result, placement=np.arange(len(result)), ndim=ndim) - def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mask=None): + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): """ Replace value corresponding to the given boolean array with another value. 
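`Block.quantile`, reformatted above, is what backs the `interpolation` keyword of the public quantile API. An illustrative aside (not part of the patch), assuming the default linear interpolation:

import pandas as pd

s = pd.Series([1, 2, 3, 4])
# A list-like q returns one row per quantile, mirroring the 2-D result
# assembled at block level.
print(s.quantile([0.25, 0.5], interpolation="linear"))
# 0.25    1.75
# 0.50    2.50
# dtype: float64
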
@@ -1509,15 +1643,20 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, self = self.coerce_to_target_dtype(value) return self.putmask(mask, value, inplace=inplace) else: - return self._replace_single(to_replace, value, inplace=inplace, - regex=regex, - convert=convert, - mask=mask) + return self._replace_single( + to_replace, + value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) return self class NonConsolidatableMixIn: """ hold methods for the nonconsolidatable blocks """ + _can_consolidate = False _verify_integrity = False _validate_ndim = False @@ -1546,7 +1685,7 @@ def __init__(self, values, placement, ndim=None): @property def shape(self): if self.ndim == 1: - return (len(self.values)), + return ((len(self.values)),) return (len(self.mgr_locs), len(self.values)) def iget(self, col): @@ -1572,8 +1711,7 @@ def set(self, locs, values, check=False): assert locs.tolist() == [0] self.values = values - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False): + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): """ putmask the data to the block; we must be a single block and not generate other blocks @@ -1591,7 +1729,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, ------- a new block, the result of the putmask """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # use block's copy logic. # .values may be an Index which does shallow copy by default @@ -1654,6 +1792,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ + is_extension = True def __init__(self, values, placement, ndim=None): @@ -1754,8 +1893,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value, - allow_fill=True) + new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) if self.ndim == 1 and new_mgr_locs is None: new_mgr_locs = [0] @@ -1778,8 +1916,7 @@ def _slice(self, slicer): if isinstance(slicer, tuple) and len(slicer) == 2: if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") slicer = slicer[1] return self.values[slicer] @@ -1798,8 +1935,7 @@ def _try_cast_result(self, result, dtype=None): """ try: - result = self._holder._from_sequence( - result.ravel(), dtype=dtype) + result = self._holder._from_sequence(result.ravel(), dtype=dtype) except Exception: pass @@ -1809,7 +1945,7 @@ def formatting_values(self): # Deprecating the ability to override _formatting_values. # Do the warning here, it's only user in pandas, since we # have to check if the subclass overrode it. - fv = getattr(type(self.values), '_formatting_values', None) + fv = getattr(type(self.values), "_formatting_values", None) if fv and fv != ExtensionArray._formatting_values: msg = ( "'ExtensionArray._formatting_values' is deprecated. " @@ -1824,32 +1960,35 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. 
""" - values = self._holder._concat_same_type( - [blk.values for blk in to_concat]) + values = self._holder._concat_same_type([blk.values for blk in to_concat]) placement = placement or slice(0, len(values), 1) - return self.make_block_same_class(values, ndim=self.ndim, - placement=placement) + return self.make_block_same_class(values, ndim=self.ndim, placement=placement) def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=values, - placement=self.mgr_locs, - ndim=self.ndim)] + return [ + self.make_block_same_class( + values=values, placement=self.mgr_locs, ndim=self.ndim + ) + ] - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): + def interpolate( + self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs + ): values = self.values if inplace else self.values.copy() return self.make_block_same_class( - values=values.fillna(value=fill_value, method=method, - limit=limit), - placement=self.mgr_locs) + values=values.fillna(value=fill_value, method=method, limit=limit), + placement=self.mgr_locs, + ) - def shift(self, - periods: int, - axis: libinternals.BlockPlacement = 0, - fill_value: Any = None) -> List['ExtensionBlock']: + def shift( + self, + periods: int, + axis: libinternals.BlockPlacement = 0, + fill_value: Any = None, + ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1859,11 +1998,21 @@ def shift(self, return [ self.make_block_same_class( self.values.shift(periods=periods, fill_value=fill_value), - placement=self.mgr_locs, ndim=self.ndim) + placement=self.mgr_locs, + ndim=self.ndim, + ) ] - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. @@ -1904,15 +2053,14 @@ def where(self, other, cond, align=True, errors='raise', # TypeError for SparseArray, which implements just to raise # a TypeError result = self._holder._from_sequence( - np.where(cond, self.values, other), - dtype=dtype, + np.where(cond, self.values, other), dtype=dtype ) return self.make_block_same_class(result, placement=self.mgr_locs) @property def _ftype(self): - return getattr(self.values, '_pandas_ftype', Block._ftype) + return getattr(self.values, "_pandas_ftype", Block._ftype) def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): # ExtensionArray-safe unstack. 
@@ -1931,9 +2079,9 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): blocks = [ self.make_block_same_class( - self.values.take(indices, allow_fill=True, - fill_value=fill_value), - [place]) + self.values.take(indices, allow_fill=True, fill_value=fill_value), + [place], + ) for indices, place in zip(new_values.T, new_placement) ] return blocks, mask @@ -1974,16 +2122,25 @@ class FloatBlock(FloatOrComplexBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return (issubclass(tipo.type, (np.floating, np.integer)) and - not issubclass(tipo.type, (np.datetime64, np.timedelta64))) - return ( - isinstance( - element, (float, int, np.floating, np.int_)) and - not isinstance(element, (bool, np.bool_, datetime, timedelta, - np.datetime64, np.timedelta64))) + return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( + tipo.type, (np.datetime64, np.timedelta64) + ) + return isinstance( + element, (float, int, np.floating, np.int_) + ) and not isinstance( + element, + (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64), + ) - def to_native_types(self, slicer=None, na_rep='', float_format=None, - decimal='.', quoting=None, **kwargs): + def to_native_types( + self, + slicer=None, + na_rep="", + float_format=None, + decimal=".", + quoting=None, + **kwargs + ): """ convert to our native types format, slicing if desired """ values = self.values @@ -1993,29 +2150,33 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, # see gh-13418: no special formatting is desired at the # output (important for appropriate 'quoting' behaviour), # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == '.': + if float_format is None and decimal == ".": mask = isna(values) if not quoting: values = values.astype(str) else: - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep return values from pandas.io.formats.format import FloatArrayFormatter - formatter = FloatArrayFormatter(values, na_rep=na_rep, - float_format=float_format, - decimal=decimal, quoting=quoting, - fixed_width=False) + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) return formatter.get_result_as_array() def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily - return (issubclass(value.dtype.type, np.floating) and - value.dtype == self.dtype) + return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype class ComplexBlock(FloatOrComplexBlock): @@ -2025,13 +2186,10 @@ class ComplexBlock(FloatOrComplexBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return issubclass(tipo.type, - (np.floating, np.integer, np.complexfloating)) - return ( - isinstance( - element, - (float, int, complex, np.float_, np.int_)) and - not isinstance(element, (bool, np.bool_))) + return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) + return isinstance( + element, (float, int, complex, np.float_, np.int_) + ) and not isinstance(element, (bool, np.bool_)) def should_store(self, value): return issubclass(value.dtype.type, np.complexfloating) @@ -2045,10 +2203,11 @@ class IntBlock(NumericBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return 
(issubclass(tipo.type, np.integer) and - not issubclass(tipo.type, (np.datetime64, - np.timedelta64)) and - self.dtype.itemsize >= tipo.itemsize) + return ( + issubclass(tipo.type, np.integer) + and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) + and self.dtype.itemsize >= tipo.itemsize + ) return is_integer(element) def should_store(self, value): @@ -2123,8 +2282,8 @@ def _astype(self, dtype, **kwargs): # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): values = self.values - if getattr(values, 'tz', None) is None: - values = DatetimeArray(values).tz_localize('UTC') + if getattr(values, "tz", None) is None: + values = DatetimeArray(values).tz_localize("UTC") values = values.tz_convert(dtype.tz) return self.make_block(values) @@ -2135,11 +2294,10 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return tipo == _NS_DTYPE or tipo == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - isna(element)) + return is_integer(element) or isinstance(element, datetime) or isna(element) def _coerce_values(self, values): - return values.view('i8') + return values.view("i8") def _try_coerce_args(self, other): """ @@ -2163,12 +2321,13 @@ def _try_coerce_args(self, other): other = tslibs.iNaT elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) - if getattr(other, 'tz') is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a " - "naive Block") - other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_datetime64_dtype(other): - other = other.astype('i8', copy=False).view('i8') + if getattr(other, "tz") is not None: + raise TypeError( + "cannot coerce a Timestamp with a tz on a " "naive Block" + ) + other = other.asm8.view("i8") + elif hasattr(other, "dtype") and is_datetime64_dtype(other): + other = other.astype("i8", copy=False).view("i8") else: # coercion issues # let higher levels handle @@ -2179,8 +2338,8 @@ def _try_coerce_args(self, other): def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f']: - result = result.astype('M8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("M8[ns]") elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) @@ -2190,29 +2349,36 @@ def _try_coerce_result(self, result): def _box_func(self): return tslibs.Timestamp - def to_native_types(self, slicer=None, na_rep=None, date_format=None, - quoting=None, **kwargs): + def to_native_types( + self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs + ): """ convert to our native types format, slicing if desired """ values = self.values - i8values = self.values.view('i8') + i8values = self.values.view("i8") if slicer is not None: values = values[..., slicer] i8values = i8values[..., slicer] from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(values, date_format) result = tslib.format_array_from_datetime( - i8values.ravel(), tz=getattr(self.values, 'tz', None), - format=fmt, na_rep=na_rep).reshape(i8values.shape) + i8values.ravel(), + tz=getattr(self.values, "tz", None), + format=fmt, + na_rep=na_rep, + ).reshape(i8values.shape) return np.atleast_2d(result) def should_store(self, value): - return (issubclass(value.dtype.type, np.datetime64) and - not is_datetime64tz_dtype(value) and - not is_extension_array_dtype(value)) + return ( + 
issubclass(value.dtype.type, np.datetime64) + and not is_datetime64tz_dtype(value) + and not is_extension_array_dtype(value) + ) def set(self, locs, values): """ @@ -2227,11 +2393,12 @@ def set(self, locs, values): self.values[locs] = values def external_values(self): - return np.asarray(self.values.astype('datetime64[ns]', copy=False)) + return np.asarray(self.values.astype("datetime64[ns]", copy=False)) class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ + __slots__ = () is_datetimetz = True is_extension = True @@ -2350,7 +2517,7 @@ def _try_coerce_args(self, other): other = _block_shape(other.asi8, ndim=self.ndim) elif isinstance(other, (np.datetime64, datetime, date)): other = tslibs.Timestamp(other) - tz = getattr(other, 'tz', None) + tz = getattr(other, "tz", None) # test we can have an equal time zone if tz is None or str(tz) != str(self.values.tz): @@ -2364,8 +2531,8 @@ def _try_coerce_args(self, other): def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f']: - result = result.astype('M8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("M8[ns]") elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) @@ -2376,8 +2543,9 @@ def _try_coerce_result(self, result): if result.ndim > 1: result = result.reshape(np.prod(result.shape)) # GH#24096 new values invalidates a frequency - result = self._holder._simple_new(result, freq=None, - dtype=self.values.dtype) + result = self._holder._simple_new( + result, freq=None, dtype=self.values.dtype + ) return result @@ -2410,7 +2578,7 @@ def diff(self, n, axis=0): # Reshape the new_values like how algos.diff does for timedelta data new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype('timedelta64[ns]') + new_values = new_values.astype("timedelta64[ns]") return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] def concat_same_type(self, to_concat, placement=None): @@ -2445,16 +2613,16 @@ def setitem(self, indexer, value): try: return super().setitem(indexer, value) except (ValueError, TypeError): - newb = make_block(self.values.astype(object), - placement=self.mgr_locs, - klass=ObjectBlock) + newb = make_block( + self.values.astype(object), placement=self.mgr_locs, klass=ObjectBlock + ) return newb.setitem(indexer, value) def equals(self, other): # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False - return (self.values.view('i8') == other.values.view('i8')).all() + return (self.values.view("i8") == other.values.view("i8")).all() class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): @@ -2477,14 +2645,15 @@ def _holder(self): @property def _box_func(self): - return lambda x: Timedelta(x, unit='ns') + return lambda x: Timedelta(x, unit="ns") def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.timedelta64, np.int64)) return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64, np.int64)) + element, (timedelta, np.timedelta64, np.int64) + ) def fillna(self, value, **kwargs): @@ -2492,16 +2661,19 @@ def fillna(self, value, **kwargs): # interpreted as nanoseconds if is_integer(value) and not isinstance(value, np.timedelta64): # Deprecation GH#24694, GH#19233 - warnings.warn("Passing integers to fillna is deprecated, will " - "raise a 
TypeError in a future version. To retain " - "the old behavior, pass pd.Timedelta(seconds=n) " - "instead.", - FutureWarning, stacklevel=6) - value = Timedelta(value, unit='s') + warnings.warn( + "Passing integers to fillna is deprecated, will " + "raise a TypeError in a future version. To retain " + "the old behavior, pass pd.Timedelta(seconds=n) " + "instead.", + FutureWarning, + stacklevel=6, + ) + value = Timedelta(value, unit="s") return super().fillna(value, **kwargs) def _coerce_values(self, values): - return values.view('i8') + return values.view("i8") def _try_coerce_args(self, other): """ @@ -2523,8 +2695,8 @@ def _try_coerce_args(self, other): other = tslibs.iNaT elif isinstance(other, (timedelta, np.timedelta64)): other = Timedelta(other).value - elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): - other = other.astype('i8', copy=False).view('i8') + elif hasattr(other, "dtype") and is_timedelta64_dtype(other): + other = other.astype("i8", copy=False).view("i8") else: # coercion issues # let higher levels handle @@ -2536,8 +2708,8 @@ def _try_coerce_result(self, result): """ reverse of try_coerce_args / try_operate """ if isinstance(result, np.ndarray): mask = isna(result) - if result.dtype.kind in ['i', 'f']: - result = result.astype('m8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("m8[ns]") result[mask] = tslibs.iNaT elif isinstance(result, (np.integer, np.float)): @@ -2546,11 +2718,11 @@ def _try_coerce_result(self, result): return result def should_store(self, value): - return (issubclass(value.dtype.type, np.timedelta64) and - not is_extension_array_dtype(value)) + return issubclass( + value.dtype.type, np.timedelta64 + ) and not is_extension_array_dtype(value) - def to_native_types(self, slicer=None, na_rep=None, quoting=None, - **kwargs): + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -2560,7 +2732,7 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, rvalues = np.empty(values.shape, dtype=object) if na_rep is None: - na_rep = 'NaT' + na_rep = "NaT" rvalues[mask] = na_rep imask = (~mask).ravel() @@ -2568,9 +2740,10 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, # should use the formats.format.Timedelta64Formatter here # to figure what format to pass to the Timedelta # e.g. 
to not show the decimals say - rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') - for val in values.ravel()[imask]], - dtype=object) + rvalues.flat[imask] = np.array( + [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]], + dtype=object, + ) return rvalues def external_values(self, dtype=None): @@ -2589,17 +2762,25 @@ def _can_hold_element(self, element): return isinstance(element, (bool, np.bool_)) def should_store(self, value): - return (issubclass(value.dtype.type, np.bool_) and not - is_extension_array_dtype(value)) + return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( + value + ) - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): - inplace = validate_bool_kwarg(inplace, 'inplace') + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): + inplace = validate_bool_kwarg(inplace, "inplace") to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex, convert=convert) + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) class ObjectBlock(Block): @@ -2630,9 +2811,9 @@ def convert(self, *args, **kwargs): if args: raise NotImplementedError - by_item = kwargs.get('by_item', True) + by_item = kwargs.get("by_item", True) - new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] + new_inputs = ["coerce", "datetime", "numeric", "timedelta"] new_style = False for kw in new_inputs: new_style |= kw in kwargs @@ -2642,9 +2823,8 @@ def convert(self, *args, **kwargs): fn_inputs = new_inputs else: fn = maybe_convert_objects - fn_inputs = ['convert_dates', 'convert_numeric', - 'convert_timedeltas'] - fn_inputs += ['copy'] + fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"] + fn_inputs += ["copy"] fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} @@ -2663,8 +2843,7 @@ def f(m, v, i): blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) - blocks = [make_block(values, ndim=self.ndim, - placement=self.mgr_locs)] + blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)] return blocks @@ -2674,8 +2853,7 @@ def _maybe_downcast(self, blocks, downcast=None): return blocks # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) - for b in blocks]) + return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element): return True @@ -2696,16 +2874,21 @@ def _try_coerce_args(self, other): return other def should_store(self, value): - return not (issubclass(value.dtype.type, - (np.integer, np.floating, np.complexfloating, - np.datetime64, np.bool_)) or - # TODO(ExtensionArray): remove is_extension_type - # when all extension arrays have been ported. - is_extension_type(value) or - is_extension_array_dtype(value)) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): + return not ( + issubclass( + value.dtype.type, + (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), + ) + or + # TODO(ExtensionArray): remove is_extension_type + # when all extension arrays have been ported. 
+ is_extension_type(value) + or is_extension_array_dtype(value) + ) + + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): to_rep_is_list = is_list_like(to_replace) value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list @@ -2715,19 +2898,35 @@ def replace(self, to_replace, value, inplace=False, filter=None, blocks = [self] if not either_list and is_re(to_replace): - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, regex=True, - convert=convert) + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=True, + convert=convert, + ) elif not (either_list or regex): - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex, convert=convert) + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, - filter=filter, regex=regex, - convert=convert) + result = b._replace_single( + to_rep, + v, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks @@ -2736,19 +2935,37 @@ def replace(self, to_replace, value, inplace=False, filter=None, for to_rep in to_replace: result_blocks = [] for b in blocks: - result = b._replace_single(to_rep, value, inplace=inplace, - filter=filter, regex=regex, - convert=convert) + result = b._replace_single( + to_rep, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, convert=convert, - regex=regex) + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + convert=convert, + regex=regex, + ) - def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mask=None): + def _replace_single( + self, + to_replace, + value, + inplace=False, + filter=None, + regex=False, + convert=True, + mask=None, + ): """ Replace elements by the given value. 
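`ObjectBlock.replace`, reformatted above, routes regex patterns to `_replace_single`. A user-level illustration (not from the patch), mirroring the documented `Series.replace(..., regex=True)` behaviour:

import pandas as pd

s = pd.Series(["bat", "foo", "bait"])
# Values matching the pattern are replaced; "bait" has four characters and
# does not match ^ba.$, so it is left alone.
print(s.replace(r"^ba.$", "new", regex=True).tolist())   # ['new', 'foo', 'bait']
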
@@ -2772,7 +2989,7 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, ------- a new block, the result after replacing """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # to_replace is regex compilable to_rep_re = regex and is_re_compilable(to_replace) @@ -2782,8 +2999,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, # only one will survive if to_rep_re and regex_re: - raise AssertionError('only one of to_replace and regex can be ' - 'regex compilable') + raise AssertionError( + "only one of to_replace and regex can be " "regex compilable" + ) # if regex was passed as something that can be a regex (rather than a # boolean) @@ -2805,8 +3023,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, else: # if the thing to replace is not a string or compiled regex call # the superclass method -> to_replace is some kind of object - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex) + return super().replace( + to_replace, value, inplace=inplace, filter=filter, regex=regex + ) new_values = self.values if inplace else self.values.copy() @@ -2819,6 +3038,7 @@ def re_replacer(s): return value if rx.search(s) is not None else s except TypeError: return s + else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned @@ -2846,8 +3066,9 @@ def re_replacer(s): block = block.convert(by_item=True, numeric=False) return block - def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mask=None): + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): """ Replace value corresponding to the given boolean array with another value. @@ -2873,11 +3094,17 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, """ if mask.any(): block = super()._replace_coerce( - to_replace=to_replace, value=value, inplace=inplace, - regex=regex, convert=convert, mask=mask) + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) if convert: - block = [b.convert(by_item=True, numeric=False, copy=True) - for b in block] + block = [ + b.convert(by_item=True, numeric=False, copy=True) for b in block + ] return block return self @@ -2893,9 +3120,7 @@ def __init__(self, values, placement, ndim=None): from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can - super().__init__(_maybe_to_categorical(values), - placement=placement, - ndim=ndim) + super().__init__(_maybe_to_categorical(values), placement=placement, ndim=ndim) @property def _holder(self): @@ -2913,8 +3138,7 @@ def _try_coerce_result(self, result): # GH12564: CategoricalBlock is 1-dim only # while returned results could be any dim - if ((not is_categorical_dtype(result)) and - isinstance(result, np.ndarray)): + if (not is_categorical_dtype(result)) and isinstance(result, np.ndarray): result = _block_shape(result, ndim=self.ndim) return result @@ -2925,7 +3149,7 @@ def to_dense(self): # other types. 
return self.values._internal_get_values() - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): + def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -2933,7 +3157,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # Categorical is always one dimension values = values[slicer] mask = isna(values) - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep # we are expected to return a 2-d ndarray @@ -2952,15 +3176,24 @@ def concat_same_type(self, to_concat, placement=None): 1. Change Categorical._concat_same_type to use union_categoricals 2. Delete this method. """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) # not using self.make_block_same_class as values can be object dtype return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) + values, placement=placement or slice(0, len(values), 1), ndim=self.ndim + ) - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. @@ -2980,16 +3213,22 @@ def where(self, other, cond, align=True, errors='raise', ) except (TypeError, ValueError): warnings.warn(object_msg, FutureWarning, stacklevel=6) - result = self.astype(object).where(other, cond, align=align, - errors=errors, - try_cast=try_cast, - axis=axis, transpose=transpose) + result = self.astype(object).where( + other, + cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=axis, + transpose=transpose, + ) return result # ----------------------------------------------------------------- # Constructor Helpers + def get_block_type(values, dtype=None): """ Find the appropriate Block subclass to use for the given values and dtype. @@ -3036,8 +3275,7 @@ def get_block_type(values, dtype=None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype=None, - fastpath=None): +def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. 
if isinstance(values, ABCPandasArray): @@ -3050,8 +3288,10 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, if fastpath is not None: # GH#19265 pyarrow is passing this - warnings.warn("fastpath argument is deprecated, will be removed " - "in a future release.", FutureWarning) + warnings.warn( + "fastpath argument is deprecated, will be removed " "in a future release.", + FutureWarning, + ) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) @@ -3066,9 +3306,11 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, # ----------------------------------------------------------------- + def _extend_blocks(result, blocks=None): """ return a new extended blocks, givin the result """ from pandas.core.internals import BlockManager + if blocks is None: blocks = [] if isinstance(result, list): @@ -3093,7 +3335,7 @@ def _block_shape(values, ndim=1, shape=None): # TODO: https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. - values = values.reshape(tuple((1, ) + shape)) + values = values.reshape(tuple((1,) + shape)) return values @@ -3193,13 +3435,14 @@ def _putmask_smart(v, m, n): # only compare integers/floats # don't compare integers to datetimelikes - if (not is_numeric_v_string_like(nn, nn_at) and - (is_float_dtype(nn.dtype) or - is_integer_dtype(nn.dtype) and - is_float_dtype(nn_at.dtype) or - is_integer_dtype(nn_at.dtype))): - - comp = (nn == nn_at) + if not is_numeric_v_string_like(nn, nn_at) and ( + is_float_dtype(nn.dtype) + or is_integer_dtype(nn.dtype) + and is_float_dtype(nn_at.dtype) + or is_integer_dtype(nn_at.dtype) + ): + + comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() nv[m] = nn_at diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6900dfc3c76d87..9ccd4b80869a0b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -9,9 +9,16 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - _get_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype, - is_numeric_dtype, is_sparse, is_timedelta64_dtype) + _get_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -54,9 +61,9 @@ def get_mgr_concatenation_plan(mgr, indexers): blklocs = mgr._blklocs plan = [] - for blkno, placements in libinternals.get_blkno_placements(blknos, - mgr.nblocks, - group=False): + for blkno, placements in libinternals.get_blkno_placements( + blknos, mgr.nblocks, group=False + ): assert placements.is_slice_like @@ -72,18 +79,26 @@ def get_mgr_concatenation_plan(mgr, indexers): blk = mgr.blocks[blkno] ax0_blk_indexer = blklocs[placements.indexer] - unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and - # Fastpath detection of join unit not - # needing to reindex its block: no ax0 - # reindexing took place and block - # placement was sequential before. - ((ax0_indexer is None and - blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice.step == 1) or - # Slow-ish detection: all indexer locs - # are sequential (and length match is - # checked above). 
- (np.diff(ax0_blk_indexer) == 1).all())) + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) + and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ( + ( + ax0_indexer is None + and blk.mgr_locs.is_slice_like + and blk.mgr_locs.as_slice.step == 1 + ) + or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all() + ) + ) # Omit indexer if no item reindexing is required. if unit_no_ax0_reindexing: @@ -99,7 +114,6 @@ def get_mgr_concatenation_plan(mgr, indexers): class JoinUnit: - def __init__(self, block, shape, indexers=None): # Passing shape explicitly is required for cases when block is None. if indexers is None: @@ -109,9 +123,9 @@ def __init__(self, block, shape, indexers=None): self.shape = shape def __repr__(self): - return '{name}({block!r}, {indexers})'.format( - name=self.__class__.__name__, block=self.block, - indexers=self.indexers) + return "{name}({block!r}, {indexers})".format( + name=self.__class__.__name__, block=self.block, indexers=self.indexers + ) @cache_readonly def needs_filling(self): @@ -130,8 +144,7 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return _get_dtype(maybe_promote(self.block.dtype, - self.block.fill_value)[0]) + return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) @cache_readonly def is_na(self): @@ -152,11 +165,11 @@ def is_na(self): elif self.block.is_extension: values_flat = values else: - values_flat = values.ravel(order='K') + values_flat = values.ravel(order="K") total_len = values_flat.shape[0] chunk_len = max(total_len // 40, 1000) for i in range(0, total_len, chunk_len): - if not isna(values_flat[i:i + chunk_len]).all(): + if not isna(values_flat[i : i + chunk_len]).all(): return False return True @@ -170,24 +183,26 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): fill_value = upcasted_na if self.is_na: - if getattr(self.block, 'is_object', False): + if getattr(self.block, "is_object", False): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls - values = self.block.values.ravel(order='K') + values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None - if (getattr(self.block, 'is_datetimetz', False) or - is_datetime64tz_dtype(empty_dtype)): + if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype( + empty_dtype + ): if self.block is None: array = empty_dtype.construct_array_type() - return array(np.full(self.shape[1], fill_value.value), - dtype=empty_dtype) + return array( + np.full(self.shape[1], fill_value.value), dtype=empty_dtype + ) pass - elif getattr(self.block, 'is_categorical', False): + elif getattr(self.block, "is_categorical", False): pass - elif getattr(self.block, 'is_extension', False): + elif getattr(self.block, "is_extension", False): pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) @@ -218,8 +233,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): else: for ax, indexer in self.indexers.items(): - values = algos.take_nd(values, indexer, axis=ax, - fill_value=fill_value) + values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value) return values @@ -234,9 +248,10 @@ def concatenate_join_units(join_units, concat_axis, copy): empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) - to_concat = 
[ju.get_reindexed_values(empty_dtype=empty_dtype, - upcasted_na=upcasted_na) - for ju in join_units] + to_concat = [ + ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) + for ju in join_units + ] if len(to_concat) == 1: # Only one block, nothing to concatenate. @@ -292,25 +307,25 @@ def get_empty_dtype_and_na(join_units): continue if is_categorical_dtype(dtype): - upcast_cls = 'category' + upcast_cls = "category" elif is_datetime64tz_dtype(dtype): - upcast_cls = 'datetimetz' + upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): - upcast_cls = 'bool' + upcast_cls = "bool" elif issubclass(dtype.type, np.object_): - upcast_cls = 'object' + upcast_cls = "object" elif is_datetime64_dtype(dtype): - upcast_cls = 'datetime' + upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): - upcast_cls = 'timedelta' + upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): - upcast_cls = 'object' + upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: - upcast_cls = 'float' + upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to @@ -324,24 +339,24 @@ def get_empty_dtype_and_na(join_units): upcast_classes = null_upcast_classes # create the result - if 'object' in upcast_classes: + if "object" in upcast_classes: return np.dtype(np.object_), np.nan - elif 'bool' in upcast_classes: + elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None - elif 'category' in upcast_classes: + elif "category" in upcast_classes: return np.dtype(np.object_), np.nan - elif 'datetimetz' in upcast_classes: + elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. - dtype = upcast_classes['datetimetz'] + dtype = upcast_classes["datetimetz"] return dtype[0], tslibs.NaT - elif 'datetime' in upcast_classes: - return np.dtype('M8[ns]'), tslibs.iNaT - elif 'timedelta' in upcast_classes: - return np.dtype('m8[ns]'), tslibs.iNaT + elif "datetime" in upcast_classes: + return np.dtype("M8[ns]"), tslibs.iNaT + elif "timedelta" in upcast_classes: + return np.dtype("m8[ns]"), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) @@ -370,21 +385,25 @@ def is_uniform_join_units(join_units): """ return ( # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa + all(type(ju.block) is type(join_units[0].block) for ju in join_units) + and # noqa # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. - all(not ju.is_na or ju.block.is_extension for ju in join_units) and + all(not ju.is_na or ju.block.is_extension for ju in join_units) + and # no blocks with indexers (as then the dimensions do not fit) - all(not ju.indexers for ju in join_units) and + all(not ju.indexers for ju in join_units) + and # only use this path when there is something to concatenate - len(join_units) > 1) + len(join_units) > 1 + ) def is_uniform_reindex(join_units): return ( # TODO: should this be ju.block._can_hold_na? 
- all(ju.block and ju.block.is_extension for ju in join_units) and - len({ju.block.dtype.name for ju in join_units}) == 1 + all(ju.block and ju.block.is_extension for ju in join_units) + and len({ju.block.dtype.name for ju in join_units}) == 1 ) @@ -413,8 +432,7 @@ def trim_join_unit(join_unit, length): extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] - return JoinUnit(block=extra_block, indexers=extra_indexers, - shape=extra_shape) + return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) def combine_concat_plans(plans, concat_axis): @@ -471,8 +489,7 @@ def _next_or_none(seq): if len(plc) > min_len: # trim_join_unit updates unit in place, so only # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], - trim_join_unit(unit, min_len)) + next_items[i] = (plc[min_len:], trim_join_unit(unit, min_len)) else: yielded_placement = plc next_items[i] = _next_or_none(plans[i]) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1044f25a6bbcd3..4d64be34e624f8 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -12,26 +12,53 @@ from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( - construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike, infer_dtype_from_scalar, - maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable, - maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast) + construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, + infer_dtype_from_scalar, + maybe_cast_to_datetime, + maybe_cast_to_integer_array, + maybe_castable, + maybe_convert_platform, + maybe_infer_to_datetimelike, + maybe_upcast, +) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_float_dtype, - is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex) + ABCDataFrame, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import isna from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical, ExtensionArray, period_array from pandas.core.index import ( - Index, _get_objs_combined_axis, _union_indexes, ensure_index) + Index, + _get_objs_combined_axis, + _union_indexes, + ensure_index, +) from pandas.core.indexes import base as ibase from pandas.core.internals import ( - create_block_manager_from_arrays, create_block_manager_from_blocks) + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) from pandas.core.internals.arrays import extract_array # --------------------------------------------------------------------- @@ -101,6 +128,7 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # --------------------------------------------------------------------- # DataFrame Constructor Interface + def init_ndarray(values, index, columns, dtype=None, 
copy=False): # input must be a ndarray, list, Series, index @@ -119,24 +147,23 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): # we could have a categorical type passed or coerced to 'category' # recast this to an arrays_to_mgr - if (is_categorical_dtype(getattr(values, 'dtype', None)) or - is_categorical_dtype(dtype)): + if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype( + dtype + ): - if not hasattr(values, 'dtype'): + if not hasattr(values, "dtype"): values = prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) - return arrays_to_mgr([values], columns, index, columns, - dtype=dtype) + return arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif is_extension_array_dtype(values): # GH#19157 if columns is None: columns = [0] - return arrays_to_mgr([values], columns, index, columns, - dtype=dtype) + return arrays_to_mgr([values], columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -147,9 +174,10 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): try: values = values.astype(dtype) except Exception as orig: - e = ValueError("failed to cast to '{dtype}' (Exception " - "was: {orig})".format(dtype=dtype, - orig=orig)) + e = ValueError( + "failed to cast to '{dtype}' (Exception " + "was: {orig})".format(dtype=dtype, orig=orig) + ) raise_with_traceback(e) index, columns = _get_axes(*values.shape, index=index, columns=columns) @@ -171,8 +199,9 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): from pandas.core.internals.blocks import make_block # TODO: What about re-joining object columns? - block_values = [make_block(dvals_list[n], placement=[n]) - for n in range(len(dvals_list))] + block_values = [ + make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) + ] else: datelike_vals = maybe_infer_to_datetimelike(values) @@ -190,6 +219,7 @@ def init_dict(data, index, columns, dtype=None): """ if columns is not None: from pandas.core.series import Series + arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -208,8 +238,7 @@ def init_dict(data, index, columns, dtype=None): nan_dtype = object else: nan_dtype = dtype - val = construct_1d_arraylike_from_scalar(np.nan, len(index), - nan_dtype) + val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() else: @@ -218,15 +247,18 @@ def init_dict(data, index, columns, dtype=None): arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies - arrays = [arr if not isinstance(arr, ABCIndexClass) else arr._data - for arr in arrays] - arrays = [arr if not is_datetime64tz_dtype(arr) else - arr.copy() for arr in arrays] + arrays = [ + arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays + ] + arrays = [ + arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays + ] return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) # --------------------------------------------------------------------- + def prep_ndarray(values, copy=True): if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: @@ -239,7 +271,7 @@ def convert(v): # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: - if 
is_list_like(values[0]) or hasattr(values[0], 'len'): + if is_list_like(values[0]) or hasattr(values[0], "len"): values = np.array([convert(v) for v in values]) elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: # GH#21861 @@ -259,7 +291,7 @@ def convert(v): if values.ndim == 1: values = values.reshape((values.shape[0], 1)) elif values.ndim != 2: - raise ValueError('Must pass 2-d input') + raise ValueError("Must pass 2-d input") return values @@ -279,15 +311,16 @@ def _homogenize(data, index, dtype=None): else: if isinstance(val, dict): if oindex is None: - oindex = index.astype('O') + oindex = index.astype("O") if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): val = com.dict_compat(val) else: val = dict(val) val = lib.fast_multiget(val, oindex.values, default=np.nan) - val = sanitize_array(val, index, dtype=dtype, copy=False, - raise_cast_failure=False) + val = sanitize_array( + val, index, dtype=dtype, copy=False, raise_cast_failure=False + ) homogenized.append(val) @@ -313,13 +346,12 @@ def extract_index(data): elif isinstance(val, dict): have_dicts = True indexes.append(list(val.keys())) - elif is_list_like(val) and getattr(val, 'ndim', 1) == 1: + elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True raw_lengths.append(len(val)) if not indexes and not raw_lengths: - raise ValueError('If using all scalar values, you must pass' - ' an index') + raise ValueError("If using all scalar values, you must pass" " an index") if have_series or have_dicts: index = _union_indexes(indexes) @@ -327,17 +359,19 @@ def extract_index(data): if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: - raise ValueError('arrays must all be same length') + raise ValueError("arrays must all be same length") if have_dicts: - raise ValueError('Mixing dicts with non-Series may lead to ' - 'ambiguous ordering.') + raise ValueError( + "Mixing dicts with non-Series may lead to " "ambiguous ordering." 
+ ) if have_series: if lengths[0] != len(index): - msg = ('array length {length} does not match index ' - 'length {idx_len}' - .format(length=lengths[0], idx_len=len(index))) + msg = ( + "array length {length} does not match index " + "length {idx_len}".format(length=lengths[0], idx_len=len(index)) + ) raise ValueError(msg) else: index = ibase.default_index(lengths[0]) @@ -347,8 +381,12 @@ def extract_index(data): def reorder_arrays(arrays, arr_columns, columns): # reorder according to the columns - if (columns is not None and len(columns) and arr_columns is not None and - len(arr_columns)): + if ( + columns is not None + and len(columns) + and arr_columns is not None + and len(arr_columns) + ): indexer = ensure_index(arr_columns).get_indexer(columns) arr_columns = ensure_index([arr_columns[i] for i in indexer]) arrays = [arrays[i] for i in indexer] @@ -356,18 +394,18 @@ def reorder_arrays(arrays, arr_columns, columns): def get_names_from_index(data): - has_some_name = any(getattr(s, 'name', None) is not None for s in data) + has_some_name = any(getattr(s, "name", None) is not None for s in data) if not has_some_name: return ibase.default_index(len(data)) index = list(range(len(data))) count = 0 for i, s in enumerate(data): - n = getattr(s, 'name', None) + n = getattr(s, "name", None) if n is not None: index[i] = n else: - index[i] = 'Unnamed {count}'.format(count=count) + index[i] = "Unnamed {count}".format(count=count) count += 1 return index @@ -392,14 +430,18 @@ def _get_axes(N, K, index, columns): # --------------------------------------------------------------------- # Conversion of Inputs to Arrays + def to_arrays(data, columns, coerce_float=False, dtype=None): """ Return list of arrays, columns. """ if isinstance(data, ABCDataFrame): if columns is not None: - arrays = [data._ixs(i, axis=1).values - for i, col in enumerate(data.columns) if col in columns] + arrays = [ + data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) + if col in columns + ] else: columns = data.columns arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] @@ -413,21 +455,23 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, dtype=dtype) + return _list_of_dict_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) elif isinstance(data[0], ABCSeries): - return _list_of_series_to_arrays(data, columns, - coerce_float=coerce_float, - dtype=dtype) + return _list_of_series_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns - elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and - data.dtype.names is not None): + elif ( + isinstance(data, (np.ndarray, ABCSeries, Index)) + and data.dtype.names is not None + ): columns = list(data.dtype.names) arrays = [data[k] for k in columns] @@ -435,8 +479,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) + return 
_list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) def _list_to_arrays(data, columns, coerce_float=False, dtype=None): @@ -447,8 +490,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): content = list(lib.to_object_array(data).T) # gh-26429 do not raise user-facing AssertionError try: - result = _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + result = _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) except AssertionError as e: raise ValueError(e) from e return result @@ -462,7 +506,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): aligned_values = [] for s in data: - index = getattr(s, 'index', None) + index = getattr(s, "index", None) if index is None: index = ibase.default_index(len(s)) @@ -478,8 +522,9 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if values.dtype == np.object_: content = list(values.T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) else: return values.T, columns @@ -495,8 +540,9 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): data = [(type(d) is dict) and d or dict(d) for d in data] content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) def _convert_object_array(content, columns, coerce_float=False, dtype=None): @@ -505,9 +551,10 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): else: if len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... - raise AssertionError('{col:d} columns passed, passed data had ' - '{con} columns'.format(col=len(columns), - con=len(content))) + raise AssertionError( + "{col:d} columns passed, passed data had " + "{con} columns".format(col=len(columns), con=len(content)) + ) # provide soft conversion of object dtypes def convert(arr): @@ -524,6 +571,7 @@ def convert(arr): # --------------------------------------------------------------------- # Series-Based + def sanitize_index(data, index, copy=False): """ Sanitize an index type to return an ndarray of the underlying, pass @@ -534,7 +582,7 @@ def sanitize_index(data, index, copy=False): return data if len(data) != len(index): - raise ValueError('Length of values does not match length of index') + raise ValueError("Length of values does not match length of index") if isinstance(data, ABCIndexClass) and not copy: pass @@ -546,14 +594,13 @@ def sanitize_index(data, index, copy=False): elif isinstance(data, np.ndarray): # coerce datetimelike types - if data.dtype.kind in ['M', 'm']: + if data.dtype.kind in ["M", "m"]: data = sanitize_array(data, index, copy=copy) return data -def sanitize_array(data, index, dtype=None, copy=False, - raise_cast_failure=False): +def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): """ Sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified. 
@@ -576,8 +623,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # GH#846 if isinstance(data, np.ndarray): - if (dtype is not None - and is_float_dtype(data.dtype) and is_integer_dtype(dtype)): + if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) @@ -617,13 +663,13 @@ def sanitize_array(data, index, dtype=None, copy=False, elif isinstance(data, range): # GH#16804 - arr = np.arange(data.start, data.stop, data.step, dtype='int64') + arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH - if getattr(subarr, 'ndim', 0) == 0: + if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: @@ -636,8 +682,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) - subarr = construct_1d_arraylike_from_scalar( - value, len(index), dtype) + subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) else: return subarr.item() @@ -649,11 +694,12 @@ def sanitize_array(data, index, dtype=None, copy=False, # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype) + subarr[0], len(index), subarr.dtype + ) elif subarr.ndim > 1: if isinstance(data, np.ndarray): - raise Exception('Data must be 1-dimensional') + raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) @@ -668,12 +714,13 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if (not (is_extension_array_dtype(subarr.dtype) or - is_extension_array_dtype(dtype)) and - is_object_dtype(subarr.dtype) and - not is_object_dtype(dtype)): + if ( + not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)) + and is_object_dtype(subarr.dtype) + and not is_object_dtype(dtype) + ): inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == 'period': + if inferred == "period": try: subarr = period_array(subarr) except IncompatibleFrequency: @@ -710,13 +757,13 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): - if is_object_dtype(dtype) and (is_list_like(subarr) and - not (is_iterator(subarr) or - isinstance(subarr, np.ndarray))): + if is_object_dtype(dtype) and ( + is_list_like(subarr) + and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) + ): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = construct_1d_ndarray_preserving_na(subarr, dtype, - copy=copy) + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise raise @@ -724,8 +771,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. 
- subarr = Categorical(arr, dtype.categories, - ordered=dtype._ordered) + subarr = Categorical(arr, dtype.categories, ordered=dtype._ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cdf0826bbe21ef..c5254aaa4af5fa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -11,12 +11,21 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( - find_common_type, infer_dtype_from_scalar, maybe_convert_objects, - maybe_promote) + find_common_type, + infer_dtype_from_scalar, + maybe_convert_objects, + maybe_promote, +) from pandas.core.dtypes.common import ( - _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, - is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar, - is_sparse) + _NS_DTYPE, + is_datetimelike_v_numeric, + is_extension_array_dtype, + is_extension_type, + is_list_like, + is_numeric_v_string_like, + is_scalar, + is_sparse, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries @@ -30,12 +39,23 @@ from pandas.io.formats.printing import pprint_thing from .blocks import ( - Block, CategoricalBlock, DatetimeTZBlock, ExtensionBlock, - ObjectValuesExtensionBlock, _extend_blocks, _merge_blocks, _safe_reshape, - get_block_type, make_block) + Block, + CategoricalBlock, + DatetimeTZBlock, + ExtensionBlock, + ObjectValuesExtensionBlock, + _extend_blocks, + _merge_blocks, + _safe_reshape, + get_block_type, + make_block, +) from .concat import ( # all for concatenate_block_managers - combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan, - is_uniform_join_units) + combine_concat_plans, + concatenate_join_units, + get_mgr_concatenation_plan, + is_uniform_join_units, +) # TODO: flexible with index=None and/or items=None @@ -91,22 +111,33 @@ class BlockManager(PandasObject): ----- This is *not* a public API class """ - __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_blknos', '_blklocs'] - def __init__(self, - blocks: Sequence[Block], - axes: Sequence[Index], - do_integrity_check: bool = True): + __slots__ = [ + "axes", + "blocks", + "_ndim", + "_shape", + "_known_consolidated", + "_is_consolidated", + "_blknos", + "_blklocs", + ] + + def __init__( + self, + blocks: Sequence[Block], + axes: Sequence[Index], + do_integrity_check: bool = True, + ): self.axes = [ensure_index(ax) for ax in axes] self.blocks = tuple(blocks) # type: Tuple[Block, ...] 
for block in blocks: if self.ndim != block.ndim: raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) + "Number of Block dimensions ({block}) must equal " + "number of axes ({self})".format(block=block.ndim, self=self.ndim) + ) if do_integrity_check: self._verify_integrity() @@ -118,8 +149,7 @@ def __init__(self, def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ if axes is None: - axes = [ensure_index([])] + [ensure_index(a) - for a in self.axes[1:]] + axes = [ensure_index([])] + [ensure_index(a) for a in self.axes[1:]] # preserve dtype if possible if self.ndim == 1: @@ -149,8 +179,9 @@ def set_axis(self, axis, new_labels): if new_len != old_len: raise ValueError( - 'Length mismatch: Expected axis has {old} elements, new ' - 'values have {new} elements'.format(old=old_len, new=new_len)) + "Length mismatch: Expected axis has {old} elements, new " + "values have {new} elements".format(old=old_len, new=new_len) + ) self.axes[axis] = new_labels @@ -178,8 +209,9 @@ def _is_single_block(self): return False blk = self.blocks[0] - return (blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice == slice(0, len(self), 1)) + return blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice == slice( + 0, len(self), 1 + ) def _rebuild_blknos_and_blklocs(self): """ @@ -234,10 +266,12 @@ def __getstate__(self): axes_array = [ax for ax in self.axes] extra_state = { - '0.14.1': { - 'axes': axes_array, - 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) - for b in self.blocks] + "0.14.1": { + "axes": axes_array, + "blocks": [ + dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + for b in self.blocks + ], } } @@ -249,12 +283,12 @@ def __setstate__(self, state): def unpickle_block(values, mgr_locs): return make_block(values, placement=mgr_locs) - if (isinstance(state, tuple) and len(state) >= 4 and - '0.14.1' in state[3]): - state = state[3]['0.14.1'] - self.axes = [ensure_index(ax) for ax in state['axes']] - self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) - for b in state['blocks']) + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] + ) else: # discard anything after 3rd, support beta pickling format for a # little while longer @@ -272,12 +306,14 @@ def unpickle_block(values, mgr_locs): # block items corresponded to manager items 1-to-1. 
all_mgr_locs = [slice(0, len(bitems[0]))] else: - all_mgr_locs = [self.axes[0].get_indexer(blk_items) - for blk_items in bitems] + all_mgr_locs = [ + self.axes[0].get_indexer(blk_items) for blk_items in bitems + ] self.blocks = tuple( unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs)) + for values, mgr_locs in zip(bvalues, all_mgr_locs) + ) self._post_setstate() @@ -293,12 +329,12 @@ def __repr__(self): output = pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: - output += '\nItems: {ax}'.format(ax=ax) + output += "\nItems: {ax}".format(ax=ax) else: - output += '\nAxis {i}: {ax}'.format(i=i, ax=ax) + output += "\nAxis {i}: {ax}".format(i=i, ax=ax) for block in self.blocks: - output += '\n{block}'.format(block=pprint_thing(block)) + output += "\n{block}".format(block=pprint_thing(block)) return output def _verify_integrity(self): @@ -308,13 +344,21 @@ def _verify_integrity(self): if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: - raise AssertionError('Number of manager items must equal union of ' - 'block items\n# manager items: {0}, # ' - 'tot_items: {1}'.format( - len(self.items), tot_items)) + raise AssertionError( + "Number of manager items must equal union of " + "block items\n# manager items: {0}, # " + "tot_items: {1}".format(len(self.items), tot_items) + ) - def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, **kwargs): + def apply( + self, + f, + axes=None, + filter=None, + do_integrity_check=False, + consolidate=True, + **kwargs + ): """ iterate over the blocks, collect and create a new block manager @@ -344,37 +388,39 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, # All items are included, as if there were no filtering filter = None else: - kwargs['filter'] = filter_locs + kwargs["filter"] = filter_locs if consolidate: self._consolidate_inplace() - if f == 'where': + if f == "where": align_copy = True - if kwargs.get('align', True): - align_keys = ['other', 'cond'] + if kwargs.get("align", True): + align_keys = ["other", "cond"] else: - align_keys = ['cond'] - elif f == 'putmask': + align_keys = ["cond"] + elif f == "putmask": align_copy = False - if kwargs.get('align', True): - align_keys = ['new', 'mask'] + if kwargs.get("align", True): + align_keys = ["new", "mask"] else: - align_keys = ['mask'] - elif f == 'fillna': + align_keys = ["mask"] + elif f == "fillna": # fillna internally does putmask, maybe it's better to do this # at mgr, not block level? align_copy = False - align_keys = ['value'] + align_keys = ["value"] else: align_keys = [] # TODO(EA): may interfere with ExtensionBlock.setitem for blocks # with a .values attribute. 
- aligned_args = {k: kwargs[k] - for k in align_keys - if not isinstance(kwargs[k], ABCExtensionArray) and - hasattr(kwargs[k], 'values')} + aligned_args = { + k: kwargs[k] + for k in align_keys + if not isinstance(kwargs[k], ABCExtensionArray) + and hasattr(kwargs[k], "values") + } for b in self.blocks: if filter is not None: @@ -386,22 +432,29 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): - axis = getattr(obj, '_info_axis_number', 0) - kwargs[k] = obj.reindex(b_items, axis=axis, - copy=align_copy) + axis = getattr(obj, "_info_axis_number", 0) + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: return self.make_empty(axes or self.axes) - bm = self.__class__(result_blocks, axes or self.axes, - do_integrity_check=do_integrity_check) + bm = self.__class__( + result_blocks, axes or self.axes, do_integrity_check=do_integrity_check + ) bm._consolidate_inplace() return bm - def quantile(self, axis=0, consolidate=True, transposed=False, - interpolation='linear', qs=None, numeric_only=None): + def quantile( + self, + axis=0, + consolidate=True, + transposed=False, + interpolation="linear", + qs=None, + numeric_only=None, + ): """ Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and @@ -434,6 +487,7 @@ def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 from pandas import Float64Index + if is_list_like(qs): ax = Float64Index(qs) else: @@ -466,14 +520,14 @@ def get_axe(block, qs, axes): b.mgr_locs = sb.mgr_locs else: - new_axes[axis] = Index(np.concatenate( - [ax.values for ax in axes])) + new_axes[axis] = Index(np.concatenate([ax.values for ax in axes])) if transposed: new_axes = new_axes[::-1] - blocks = [b.make_block(b.values.T, - placement=np.arange(b.shape[1]) - ) for b in blocks] + blocks = [ + b.make_block(b.values.T, placement=np.arange(b.shape[1])) + for b in blocks + ] return self.__class__(blocks, new_axes) @@ -493,51 +547,49 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - [make_block(values, - ndim=1, - placement=np.arange(len(values)))], - axes[0]) + [make_block(values, ndim=1, placement=np.arange(len(values)))], axes[0] + ) def isna(self, func, **kwargs): - return self.apply('apply', func=func, **kwargs) + return self.apply("apply", func=func, **kwargs) def where(self, **kwargs): - return self.apply('where', **kwargs) + return self.apply("where", **kwargs) def setitem(self, **kwargs): - return self.apply('setitem', **kwargs) + return self.apply("setitem", **kwargs) def putmask(self, **kwargs): - return self.apply('putmask', **kwargs) + return self.apply("putmask", **kwargs) def diff(self, **kwargs): - return self.apply('diff', **kwargs) + return self.apply("diff", **kwargs) def interpolate(self, **kwargs): - return self.apply('interpolate', **kwargs) + return self.apply("interpolate", **kwargs) def shift(self, **kwargs): - return self.apply('shift', **kwargs) + return self.apply("shift", **kwargs) def fillna(self, **kwargs): - return self.apply('fillna', **kwargs) + return self.apply("fillna", **kwargs) def downcast(self, **kwargs): - return self.apply('downcast', **kwargs) + return self.apply("downcast", **kwargs) def astype(self, dtype, **kwargs): - return self.apply('astype', dtype=dtype, **kwargs) + 
return self.apply("astype", dtype=dtype, **kwargs) def convert(self, **kwargs): - return self.apply('convert', **kwargs) + return self.apply("convert", **kwargs) def replace(self, **kwargs): - return self.apply('replace', **kwargs) + return self.apply("replace", **kwargs) def replace_list(self, src_list, dest_list, inplace=False, regex=False): """ do a list replace """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # figure out our mask a-priori to avoid repeated replacements values = self.as_array() @@ -549,9 +601,10 @@ def comp(s, regex=False): """ if isna(s): return isna(values) - if hasattr(s, 'asm8'): - return _compare_or_regex_search(maybe_convert_objects(values), - getattr(s, 'asm8'), regex) + if hasattr(s, "asm8"): + return _compare_or_regex_search( + maybe_convert_objects(values), getattr(s, "asm8"), regex + ) return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for i, s in enumerate(src_list)] @@ -568,9 +621,14 @@ def comp(s, regex=False): for b in rb: m = masks[i][b.mgr_locs.indexer] convert = i == src_len - result = b._replace_coerce(mask=m, to_replace=s, value=d, - inplace=inplace, - convert=convert, regex=regex) + result = b._replace_coerce( + mask=m, + to_replace=s, + value=d, + inplace=inplace, + convert=convert, + regex=regex, + ) if m.any(): new_rb = _extend_blocks(result, new_rb) else: @@ -659,15 +717,15 @@ def combine(self, blocks, copy=True): return self.make_empty() # FIXME: optimization potential - indexer = np.sort(np.concatenate([b.mgr_locs.as_array - for b in blocks])) + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks = [] for b in blocks: b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, - axis=0, allow_fill=False) + b.mgr_locs = algos.take_1d( + inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False + ) new_blocks.append(b) axes = list(self.axes) @@ -717,15 +775,14 @@ def copy(self, deep=True): """ # this preserves the notion of view copying of axes if deep: - if deep == 'all': + if deep == "all": copy = lambda ax: ax.copy(deep=True) else: copy = lambda ax: ax.view() new_axes = [copy(ax) for ax in self.axes] else: new_axes = list(self.axes) - return self.apply('copy', axes=new_axes, deep=deep, - do_integrity_check=False) + return self.apply("copy", axes=new_axes, deep=deep, do_integrity_check=False) def as_array(self, transpose=False, items=None): """Convert the blockmanager data into an numpy array. 
@@ -775,7 +832,7 @@ def _interleave(self): if is_sparse(dtype): dtype = dtype.subtype elif is_extension_array_dtype(dtype): - dtype = 'object' + dtype = "object" result = np.empty(self.shape, dtype=dtype) @@ -787,7 +844,7 @@ def _interleave(self): itemmask[rl.indexer] = 1 if not itemmask.all(): - raise AssertionError('Some items were not contained in blocks') + raise AssertionError("Some items were not contained in blocks") return result @@ -813,8 +870,7 @@ def to_dict(self, copy=True): for b in self.blocks: bd.setdefault(str(b.dtype), []).append(b) - return {dtype: self.combine(blocks, copy=copy) - for dtype, blocks in bd.items()} + return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} def fast_xs(self, loc): """ @@ -853,9 +909,7 @@ def fast_xs(self, loc): result[rl] = blk._try_coerce_result(blk.iget((i, loc))) if is_extension_array_dtype(dtype): - result = dtype.construct_array_type()._from_sequence( - result, dtype=dtype - ) + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) return result @@ -907,9 +961,9 @@ def get(self, item, fastpath=True): raise TypeError("cannot label index with a null key") indexer = self.items.get_indexer_for([item]) - return self.reindex_indexer(new_axis=self.items[indexer], - indexer=indexer, axis=0, - allow_dups=True) + return self.reindex_indexer( + new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True + ) def iget(self, i, fastpath=True): """ @@ -924,10 +978,13 @@ def iget(self, i, fastpath=True): # fastpath shortcut for select a single-dim from a 2-dim BM return SingleBlockManager( - [block.make_block_same_class(values, - placement=slice(0, len(values)), - ndim=1)], - self.axes[1]) + [ + block.make_block_same_class( + values, placement=slice(0, len(values)), ndim=1 + ) + ], + self.axes[1], + ) def delete(self, item): """ @@ -962,8 +1019,9 @@ def delete(self, item): # FIXME: use Index.delete as soon as it uses fastpath=True self.axes[0] = self.items[~is_deleted] - self.blocks = tuple(b for blkno, b in enumerate(self.blocks) - if not is_blk_deleted[blkno]) + self.blocks = tuple( + b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] + ) self._shape = None self._rebuild_blknos_and_blklocs() @@ -977,28 +1035,32 @@ def set(self, item, value): # TODO(EA): Remove an is_extension_ when all extension types satisfy # the interface - value_is_extension_type = (is_extension_type(value) or - is_extension_array_dtype(value)) + value_is_extension_type = is_extension_type(value) or is_extension_array_dtype( + value + ) # categorical/sparse/datetimetz if value_is_extension_type: def value_getitem(placement): return value + else: if value.ndim == self.ndim - 1: value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value + else: def value_getitem(placement): return value[placement.indexer] if value.shape[1:] != self.shape[1:]: - raise AssertionError('Shape of new values must be compatible ' - 'with manager shape') + raise AssertionError( + "Shape of new values must be compatible " "with manager shape" + ) try: loc = self.items.get_loc(item) @@ -1016,9 +1078,9 @@ def value_getitem(placement): unfit_mgr_locs = [] unfit_val_locs = [] removed_blknos = [] - for blkno, val_locs in libinternals.get_blkno_placements(blknos, - self.nblocks, - group=True): + for blkno, val_locs in libinternals.get_blkno_placements( + blknos, self.nblocks, group=True + ): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): @@ -1042,12 +1104,13 @@ def 
value_getitem(placement): new_blknos = np.empty(self.nblocks, dtype=np.int64) new_blknos.fill(-1) - new_blknos[~is_deleted] = np.arange(self.nblocks - - len(removed_blknos)) - self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0, - allow_fill=False) - self.blocks = tuple(blk for i, blk in enumerate(self.blocks) - if i not in set(removed_blknos)) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = algos.take_1d( + new_blknos, self._blknos, axis=0, allow_fill=False + ) + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) if unfit_val_locs: unfit_mgr_locs = np.concatenate(unfit_mgr_locs) @@ -1058,12 +1121,15 @@ def value_getitem(placement): # This code (ab-)uses the fact that sparse blocks contain only # one item. new_blocks.extend( - make_block(values=value.copy(), ndim=self.ndim, - placement=slice(mgr_loc, mgr_loc + 1)) - for mgr_loc in unfit_mgr_locs) - - self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + - len(self.blocks)) + make_block( + values=value.copy(), + ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1), + ) + for mgr_loc in unfit_mgr_locs + ) + + self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks) self._blklocs[unfit_mgr_locs] = 0 else: @@ -1071,8 +1137,12 @@ def value_getitem(placement): unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) new_blocks.append( - make_block(values=value_getitem(unfit_val_items), - ndim=self.ndim, placement=unfit_mgr_locs)) + make_block( + values=value_getitem(unfit_val_items), + ndim=self.ndim, + placement=unfit_mgr_locs, + ) + ) self._blknos[unfit_mgr_locs] = len(self.blocks) self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) @@ -1097,7 +1167,7 @@ def insert(self, loc, item, value, allow_duplicates=False): """ if not allow_duplicates and item in self.items: # Should this be a different kind of error?? - raise ValueError('cannot insert {}, already exists'.format(item)) + raise ValueError("cannot insert {}, already exists".format(item)) if not isinstance(loc, int): raise TypeError("loc must be int") @@ -1105,8 +1175,7 @@ def insert(self, loc, item, value, allow_duplicates=False): # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) - block = make_block(values=value, ndim=self.ndim, - placement=slice(loc, loc + 1)) + block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self._blknos[loc:]): blk = self.blocks[blkno] @@ -1134,20 +1203,24 @@ def insert(self, loc, item, value, allow_duplicates=False): if len(self.blocks) > 100: self._consolidate_inplace() - def reindex_axis(self, new_index, axis, method=None, limit=None, - fill_value=None, copy=True): + def reindex_axis( + self, new_index, axis, method=None, limit=None, fill_value=None, copy=True + ): """ Conform block manager to new index. 
""" new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex(new_index, method=method, - limit=limit) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit + ) - return self.reindex_indexer(new_index, indexer, axis=axis, - fill_value=fill_value, copy=copy) + return self.reindex_indexer( + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + ) - def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, - allow_dups=False, copy=True): + def reindex_indexer( + self, new_axis, indexer, axis, fill_value=None, allow_dups=False, copy=True + ): """ Parameters ---------- @@ -1178,12 +1251,18 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, - fill_tuple=(fill_value,)) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) else: - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not None else blk.fill_value,)) - for blk in self.blocks] + new_blocks = [ + blk.take_nd( + indexer, + axis=axis, + fill_tuple=( + fill_value if fill_value is not None else blk.fill_value, + ), + ) + for blk in self.blocks + ] new_axes = list(self.axes) new_axes[axis] = new_axis @@ -1204,30 +1283,38 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): allow_fill = fill_tuple is not None sl_type, slobj, sllen = _preprocess_slice_or_indexer( - slice_or_indexer, self.shape[0], allow_fill=allow_fill) + slice_or_indexer, self.shape[0], allow_fill=allow_fill + ) if self._is_single_block: blk = self.blocks[0] - if sl_type in ('slice', 'mask'): + if sl_type in ("slice", "mask"): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: _, fill_value = maybe_promote(blk.dtype) fill_tuple = (fill_value,) - return [blk.take_nd(slobj, axis=0, - new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple)] + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple, + ) + ] - if sl_type in ('slice', 'mask'): + if sl_type in ("slice", "mask"): blknos = self._blknos[slobj] blklocs = self._blklocs[slobj] else: - blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, - allow_fill=allow_fill) - blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, - allow_fill=allow_fill) + blknos = algos.take_1d( + self._blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_1d( + self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). @@ -1235,15 +1322,16 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, # pytables serialization will break otherwise. blocks = [] - for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, - self.nblocks, - group=True): + for blkno, mgr_locs in libinternals.get_blkno_placements( + blknos, self.nblocks, group=True + ): if blkno == -1: # If we've got here, fill_tuple was not None. 
fill_value = fill_tuple[0] - blocks.append(self._make_na_block(placement=mgr_locs, - fill_value=fill_value)) + blocks.append( + self._make_na_block(placement=mgr_locs, fill_value=fill_value) + ) else: blk = self.blocks[blkno] @@ -1258,9 +1346,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blocks.append(newblk) else: - blocks.append(blk.take_nd(blklocs[mgr_locs.indexer], - axis=0, new_mgr_locs=mgr_locs, - fill_tuple=None)) + blocks.append( + blk.take_nd( + blklocs[mgr_locs.indexer], + axis=0, + new_mgr_locs=mgr_locs, + fill_tuple=None, + ) + ) return blocks @@ -1282,10 +1375,11 @@ def take(self, indexer, axis=1, verify=True, convert=True): Take items along any axis. """ self._consolidate_inplace() - indexer = (np.arange(indexer.start, indexer.stop, indexer.step, - dtype='int64') - if isinstance(indexer, slice) - else np.asanyarray(indexer, dtype='int64')) + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) n = self.shape[axis] if convert: @@ -1293,12 +1387,14 @@ def take(self, indexer, axis=1, verify=True, convert=True): if verify: if ((indexer == -1) | (indexer >= n)).any(): - raise Exception('Indices must be nonzero and less than ' - 'the axis length') + raise Exception( + "Indices must be nonzero and less than " "the axis length" + ) new_labels = self.axes[axis].take(indexer) - return self.reindex_indexer(new_axis=new_labels, indexer=indexer, - axis=axis, allow_dups=True) + return self.reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) def equals(self, other): self_axes, other_axes = self.axes, other.axes @@ -1320,8 +1416,9 @@ def canonicalize(block): self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) - return all(block.equals(oblock) - for block, oblock in zip(self_blocks, other_blocks)) + return all( + block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks) + ) def unstack(self, unstacker_func, fill_value): """Return a blockmanager with all blocks unstacked. 
@@ -1346,11 +1443,10 @@ def unstack(self, unstacker_func, fill_value): for blk in self.blocks: blocks, mask = blk._unstack( - partial(unstacker_func, - value_columns=self.items[blk.mgr_locs.indexer]), + partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), new_columns, n_rows, - fill_value + fill_value, ) new_blocks.extend(blocks) @@ -1370,15 +1466,18 @@ class SingleBlockManager(BlockManager): _known_consolidated = True __slots__ = () - def __init__(self, - block: Block, - axis: Union[Index, List[Index]], - do_integrity_check: bool = False, - fastpath: bool = False): + def __init__( + self, + block: Block, + axis: Union[Index, List[Index]], + do_integrity_check: bool = False, + fastpath: bool = False, + ): if isinstance(axis, list): if len(axis) != 1: - raise ValueError("cannot create SingleBlockManager with more " - "than 1 axis") + raise ValueError( + "cannot create SingleBlockManager with more " "than 1 axis" + ) axis = axis[0] # passed from constructor, single block, single axis @@ -1390,8 +1489,9 @@ def __init__(self, if len(block) == 0: block = [np.array([])] elif len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') + raise ValueError( + "Cannot create SingleBlockManager with " "more than 1 block" + ) block = block[0] else: self.axes = [ensure_index(axis)] @@ -1406,8 +1506,9 @@ def __init__(self, block = _consolidate(block) if len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') + raise ValueError( + "Cannot create SingleBlockManager with " "more than 1 block" + ) block = block[0] if not isinstance(block, Block): @@ -1440,8 +1541,9 @@ def get_slice(self, slobj, axis=0): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return self.__class__(self._block._slice(slobj), - self.index[slobj], fastpath=True) + return self.__class__( + self._block._slice(slobj), self.index[slobj], fastpath=True + ) @property def index(self): @@ -1449,8 +1551,8 @@ def index(self): def convert(self, **kwargs): """ convert the whole block as one """ - kwargs['by_item'] = False - return self.apply('convert', **kwargs) + kwargs["by_item"] = False + return self.apply("convert", **kwargs) @property def dtype(self): @@ -1547,13 +1649,11 @@ def concat(self, to_concat, new_axis): else: values = [x.values for x in blocks] values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) + new_block = make_block(values, placement=slice(0, len(values), 1)) else: values = [x._block.values for x in to_concat] values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) + new_block = make_block(values, placement=slice(0, len(values), 1)) mgr = SingleBlockManager(new_block, new_axis) return mgr @@ -1562,6 +1662,7 @@ def concat(self, to_concat, new_axis): # -------------------------------------------------------------------- # Constructor Helpers + def create_block_manager_from_blocks(blocks, axes): try: if len(blocks) == 1 and not isinstance(blocks[0], Block): @@ -1572,15 +1673,16 @@ def create_block_manager_from_blocks(blocks, axes): # It's OK if a single block is passed as values, its placement # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. 
- blocks = [make_block(values=blocks[0], - placement=slice(0, len(axes[0])))] + blocks = [ + make_block(values=blocks[0], placement=slice(0, len(axes[0]))) + ] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr except ValueError as e: - blocks = [getattr(b, 'values', b) for b in blocks] + blocks = [getattr(b, "values", b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) construction_error(tot_items, blocks[0].shape[1:], axes, e) @@ -1612,12 +1714,14 @@ def construction_error(tot_items, block_shape, axes, e=None): raise e if block_shape[0] == 0: raise ValueError("Empty data passed with indices specified.") - raise ValueError("Shape of passed values is {0}, indices imply {1}".format( - passed, implied)) + raise ValueError( + "Shape of passed values is {0}, indices imply {1}".format(passed, implied) + ) # ----------------------------------------------------------------------- + def form_blocks(arrays, names, axes): # put "leftover" items in float bucket, where else? # generalize? @@ -1643,60 +1747,61 @@ def form_blocks(arrays, names, axes): items_dict[block_type.__name__].append((i, k, v)) blocks = [] - if len(items_dict['FloatBlock']): - float_blocks = _multi_blockify(items_dict['FloatBlock']) + if len(items_dict["FloatBlock"]): + float_blocks = _multi_blockify(items_dict["FloatBlock"]) blocks.extend(float_blocks) - if len(items_dict['ComplexBlock']): - complex_blocks = _multi_blockify(items_dict['ComplexBlock']) + if len(items_dict["ComplexBlock"]): + complex_blocks = _multi_blockify(items_dict["ComplexBlock"]) blocks.extend(complex_blocks) - if len(items_dict['TimeDeltaBlock']): - timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) + if len(items_dict["TimeDeltaBlock"]): + timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) blocks.extend(timedelta_blocks) - if len(items_dict['IntBlock']): - int_blocks = _multi_blockify(items_dict['IntBlock']) + if len(items_dict["IntBlock"]): + int_blocks = _multi_blockify(items_dict["IntBlock"]) blocks.extend(int_blocks) - if len(items_dict['DatetimeBlock']): - datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], - _NS_DTYPE) + if len(items_dict["DatetimeBlock"]): + datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], _NS_DTYPE) blocks.extend(datetime_blocks) - if len(items_dict['DatetimeTZBlock']): - dttz_blocks = [make_block(array, - klass=DatetimeTZBlock, - placement=[i]) - for i, _, array in items_dict['DatetimeTZBlock']] + if len(items_dict["DatetimeTZBlock"]): + dttz_blocks = [ + make_block(array, klass=DatetimeTZBlock, placement=[i]) + for i, _, array in items_dict["DatetimeTZBlock"] + ] blocks.extend(dttz_blocks) - if len(items_dict['BoolBlock']): - bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) + if len(items_dict["BoolBlock"]): + bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_) blocks.extend(bool_blocks) - if len(items_dict['ObjectBlock']) > 0: - object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) + if len(items_dict["ObjectBlock"]) > 0: + object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_) blocks.extend(object_blocks) - if len(items_dict['CategoricalBlock']) > 0: - cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) - for i, _, array in items_dict['CategoricalBlock']] + if len(items_dict["CategoricalBlock"]) > 0: + cat_blocks = [ + make_block(array, klass=CategoricalBlock, placement=[i]) + for i, _, array in items_dict["CategoricalBlock"] + ] blocks.extend(cat_blocks) - if 
len(items_dict['ExtensionBlock']): + if len(items_dict["ExtensionBlock"]): external_blocks = [ make_block(array, klass=ExtensionBlock, placement=[i]) - for i, _, array in items_dict['ExtensionBlock'] + for i, _, array in items_dict["ExtensionBlock"] ] blocks.extend(external_blocks) - if len(items_dict['ObjectValuesExtensionBlock']): + if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ make_block(array, klass=ObjectValuesExtensionBlock, placement=[i]) - for i, _, array in items_dict['ObjectValuesExtensionBlock'] + for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] blocks.extend(external_blocks) @@ -1756,7 +1861,7 @@ def _asarray_compat(x): def _shape_compat(x): if isinstance(x, ABCSeries): - return len(x), + return (len(x),) else: return x.shape @@ -1773,7 +1878,8 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]: + blocks: List[Block] +) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. Parameters @@ -1802,8 +1908,9 @@ def _consolidate(blocks): new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, - _can_consolidate=_can_consolidate) + merged_blocks = _merge_blocks( + list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate + ) new_blocks = _extend_blocks(merged_blocks, new_blocks) return new_blocks @@ -1828,8 +1935,9 @@ def _compare_or_regex_search(a, b, regex=False): if not regex: op = lambda x: operator.eq(x, b) else: - op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) - else False) + op = np.vectorize( + lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + ) is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) @@ -1848,14 +1956,16 @@ def _compare_or_regex_search(a, b, regex=False): type_names = [type(a).__name__, type(b).__name__] if is_a_array: - type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) + type_names[0] = "ndarray(dtype={dtype})".format(dtype=a.dtype) if is_b_array: - type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) + type_names[1] = "ndarray(dtype={dtype})".format(dtype=b.dtype) raise TypeError( - "Cannot compare types {a!r} and {b!r}".format(a=type_names[0], - b=type_names[1])) + "Cannot compare types {a!r} and {b!r}".format( + a=type_names[0], b=type_names[1] + ) + ) return result @@ -1869,8 +1979,10 @@ def _transform_index(index, func, level=None): """ if isinstance(index, MultiIndex): if level is not None: - items = [tuple(func(y) if i == level else y - for i, y in enumerate(x)) for x in index] + items = [ + tuple(func(y) if i == level else y for i, y in enumerate(x)) + for x in index + ] else: items = [tuple(func(y) for y in x) for x in index] return MultiIndex.from_tuples(items, names=index.names) @@ -1888,16 +2000,20 @@ def _fast_count_smallints(arr): def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): if isinstance(slice_or_indexer, slice): - return ('slice', slice_or_indexer, - libinternals.slice_len(slice_or_indexer, length)) - elif (isinstance(slice_or_indexer, np.ndarray) and - slice_or_indexer.dtype == np.bool_): - return 'mask', slice_or_indexer, slice_or_indexer.sum() + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + elif ( + isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_ + ): + return "mask", slice_or_indexer, slice_or_indexer.sum() else: indexer = 
np.asanyarray(slice_or_indexer, dtype=np.int64) if not allow_fill: indexer = maybe_convert_indices(indexer, length) - return 'fancy', indexer, len(indexer) + return "fancy", indexer, len(indexer) def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): @@ -1912,8 +2028,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): copy : bool """ - concat_plans = [get_mgr_concatenation_plan(mgr, indexers) - for mgr, indexers in mgrs_indexers] + concat_plans = [ + get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] concat_plan = combine_concat_plans(concat_plans, concat_axis) blocks = [] @@ -1929,11 +2046,13 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): b = b.make_block_same_class(values, placement=placement) elif is_uniform_join_units(join_units): b = join_units[0].block.concat_same_type( - [ju.block for ju in join_units], placement=placement) + [ju.block for ju in join_units], placement=placement + ) else: b = make_block( concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) + placement=placement, + ) blocks.append(b) return BlockManager(blocks, axes) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 4230b212f567a1..ad4b5e45238067 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -10,9 +10,17 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( - ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype, - is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_v_string_like, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import isna @@ -69,47 +77,73 @@ def mask_missing(arr, values_to_mask): def clean_fill_method(method, allow_nearest=False): # asfreq is compat for resampling - if method in [None, 'asfreq']: + if method in [None, "asfreq"]: return None if isinstance(method, str): method = method.lower() - if method == 'ffill': - method = 'pad' - elif method == 'bfill': - method = 'backfill' + if method == "ffill": + method = "pad" + elif method == "bfill": + method = "backfill" - valid_methods = ['pad', 'backfill'] - expecting = 'pad (ffill) or backfill (bfill)' + valid_methods = ["pad", "backfill"] + expecting = "pad (ffill) or backfill (bfill)" if allow_nearest: - valid_methods.append('nearest') - expecting = 'pad (ffill), backfill (bfill) or nearest' + valid_methods.append("nearest") + expecting = "pad (ffill), backfill (bfill) or nearest" if method not in valid_methods: - msg = ('Invalid fill method. Expecting {expecting}. Got {method}' - .format(expecting=expecting, method=method)) + msg = "Invalid fill method. Expecting {expecting}. 
Got {method}".format( + expecting=expecting, method=method + ) raise ValueError(msg) return method def clean_interp_method(method, **kwargs): - order = kwargs.get('order') - valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', - 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh', - 'piecewise_polynomial', 'pchip', 'akima', 'spline', - 'from_derivatives'] - if method in ('spline', 'polynomial') and order is None: - raise ValueError("You must specify the order of the spline or " - "polynomial.") + order = kwargs.get("order") + valid = [ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "polynomial", + "krogh", + "piecewise_polynomial", + "pchip", + "akima", + "spline", + "from_derivatives", + ] + if method in ("spline", "polynomial") and order is None: + raise ValueError("You must specify the order of the spline or " "polynomial.") if method not in valid: - raise ValueError("method must be one of {valid}. Got '{method}' " - "instead.".format(valid=valid, method=method)) + raise ValueError( + "method must be one of {valid}. Got '{method}' " + "instead.".format(valid=valid, method=method) + ) return method -def interpolate_1d(xvalues, yvalues, method='linear', limit=None, - limit_direction='forward', limit_area=None, fill_value=None, - bounds_error=False, order=None, **kwargs): +def interpolate_1d( + xvalues, + yvalues, + method="linear", + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + bounds_error=False, + order=None, + **kwargs +): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. @@ -132,39 +166,44 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, if valid.all(): return yvalues - if method == 'time': - if not getattr(xvalues, 'is_all_dates', None): + if method == "time": + if not getattr(xvalues, "is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): - raise ValueError('time-weighted interpolation only works ' - 'on Series or DataFrames with a ' - 'DatetimeIndex') - method = 'values' - - valid_limit_directions = ['forward', 'backward', 'both'] + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a " + "DatetimeIndex" + ) + method = "values" + + valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: - msg = ('Invalid limit_direction: expecting one of {valid!r}, ' - 'got {invalid!r}.') - raise ValueError(msg.format(valid=valid_limit_directions, - invalid=limit_direction)) + msg = "Invalid limit_direction: expecting one of {valid!r}, " "got {invalid!r}." 
+ raise ValueError( + msg.format(valid=valid_limit_directions, invalid=limit_direction) + ) if limit_area is not None: - valid_limit_areas = ['inside', 'outside'] + valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: - raise ValueError('Invalid limit_area: expecting one of {}, got ' - '{}.'.format(valid_limit_areas, limit_area)) + raise ValueError( + "Invalid limit_area: expecting one of {}, got " + "{}.".format(valid_limit_areas, limit_area) + ) # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): - raise ValueError('Limit must be an integer') + raise ValueError("Limit must be an integer") elif limit < 1: - raise ValueError('Limit must be greater than 0') + raise ValueError("Limit must be greater than 0") from pandas import Series + ys = Series(yvalues) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... @@ -182,9 +221,9 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit - if limit_direction == 'forward': + if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == 'backward': + elif limit_direction == "backward": preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit @@ -192,22 +231,22 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 - if limit_area == 'inside': + if limit_area == "inside": # preserve NaNs on the outside preserve_nans |= start_nans | end_nans - elif limit_area == 'outside': + elif limit_area == "outside": # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) - xvalues = getattr(xvalues, 'values', xvalues) - yvalues = getattr(yvalues, 'values', yvalues) + xvalues = getattr(xvalues, "values", xvalues) + yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() - if method in ['linear', 'time', 'index', 'values']: - if method in ('values', 'index'): + if method in ["linear", "time", "index", "values"]: + if method in ("values", "index"): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): @@ -220,73 +259,99 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, result[preserve_nans] = np.nan return result - sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'barycentric', 'krogh', 'spline', 'polynomial', - 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] + sp_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) - result[invalid] = _interpolate_scipy_wrapper(inds[valid], - yvalues[valid], - inds[invalid], - method=method, - fill_value=fill_value, - bounds_error=bounds_error, - order=order, **kwargs) + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs + 
) result[preserve_nans] = np.nan return result -def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, - bounds_error=False, order=None, **kwargs): +def _interpolate_scipy_wrapper( + x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs +): """ Passed off to scipy.interpolate.interp1d. method is scipy's kind. Returns an array interpolated at new_x. Add any new methods to the list in _clean_interp_method. """ - extra = '{method} interpolation requires SciPy.'.format(method=method) - import_optional_dependency('scipy', extra=extra) + extra = "{method} interpolation requires SciPy.".format(method=method) + import_optional_dependency("scipy", extra=extra) from scipy import interpolate new_x = np.asarray(new_x) # ignores some kwargs that could be passed along. alt_methods = { - 'barycentric': interpolate.barycentric_interpolate, - 'krogh': interpolate.krogh_interpolate, - 'from_derivatives': _from_derivatives, - 'piecewise_polynomial': _from_derivatives, + "barycentric": interpolate.barycentric_interpolate, + "krogh": interpolate.krogh_interpolate, + "from_derivatives": _from_derivatives, + "piecewise_polynomial": _from_derivatives, } - if getattr(x, 'is_all_dates', False): + if getattr(x, "is_all_dates", False): # GH 5975, scipy.interp1d can't hande datetime64s - x, new_x = x._values.astype('i8'), new_x.astype('i8') + x, new_x = x._values.astype("i8"), new_x.astype("i8") - if method == 'pchip': + if method == "pchip": try: - alt_methods['pchip'] = interpolate.pchip_interpolate + alt_methods["pchip"] = interpolate.pchip_interpolate except AttributeError: - raise ImportError("Your version of Scipy does not support " - "PCHIP interpolation.") - elif method == 'akima': - alt_methods['akima'] = _akima_interpolate - - interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial'] + raise ImportError( + "Your version of Scipy does not support " "PCHIP interpolation." 
+ ) + elif method == "akima": + alt_methods["akima"] = _akima_interpolate + + interp1d_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "polynomial", + ] if method in interp1d_methods: - if method == 'polynomial': + if method == "polynomial": method = order - terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value, - bounds_error=bounds_error) + terp = interpolate.interp1d( + x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error + ) new_y = terp(new_x) - elif method == 'spline': + elif method == "spline": # GH #10633, #24014 if isna(order) or (order <= 0): - raise ValueError("order needs to be specified and greater than 0; " - "got order: {}".format(order)) + raise ValueError( + "order needs to be specified and greater than 0; " + "got order: {}".format(order) + ) terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) new_y = terp(new_x) else: @@ -341,8 +406,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): # return the method for compat with scipy version & backwards compat method = interpolate.BPoly.from_derivatives - m = method(xi, yi.reshape(-1, 1), - orders=order, extrapolate=extrapolate) + m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate) return m(x) @@ -384,6 +448,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): """ from scipy import interpolate + P = interpolate.Akima1DInterpolator(xi, yi, axis=axis) if der == 0: @@ -394,8 +459,9 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): return [P(x, nu) for nu in der] -def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, - dtype=None): +def interpolate_2d( + values, method="pad", axis=0, limit=None, fill_value=None, dtype=None +): """ Perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result. @@ -407,8 +473,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, ndim = values.ndim if values.ndim == 1: if axis != 0: # pragma: no cover - raise AssertionError("cannot interpolate on a ndim == 1 with " - "axis != 0") + raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: @@ -417,12 +482,12 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, mask = mask_missing(transf(values), fill_value) method = clean_fill_method(method) - if method == 'pad': - values = transf(pad_2d( - transf(values), limit=limit, mask=mask, dtype=dtype)) + if method == "pad": + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) else: - values = transf(backfill_2d( - transf(values), limit=limit, mask=mask, dtype=dtype)) + values = transf( + backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) + ) # reshape back if ndim == 1: @@ -438,8 +503,11 @@ def _cast_values_for_fillna(values, dtype): # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? 
- if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype)): + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): values = values.view(np.int64) elif is_integer_dtype(values): @@ -498,7 +566,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): return values -_fill_methods = {'pad': pad_1d, 'backfill': backfill_1d} +_fill_methods = {"pad": pad_1d, "backfill": backfill_1d} def get_fill_func(method): @@ -523,10 +591,10 @@ def fill_zeros(result, x, y, name, fill): if fill is None or is_float_dtype(result): return result - if name.startswith(('r', '__r')): + if name.startswith(("r", "__r")): x, y = y, x - is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) + is_variable_type = hasattr(y, "dtype") or hasattr(y, "type") is_scalar_type = is_scalar(y) if not is_variable_type and not is_scalar_type: @@ -544,15 +612,15 @@ def fill_zeros(result, x, y, name, fill): mask = ((y == 0) & ~np.isnan(result)).ravel() shape = result.shape - result = result.astype('float64', copy=False).ravel() + result = result.astype("float64", copy=False).ravel() np.putmask(result, mask, fill) # if we have a fill of inf, then sign it correctly # (GH 6178 and PR 9308) if np.isinf(fill): - signs = y if name.startswith(('r', '__r')) else x - signs = np.sign(signs.astype('float', copy=False)) + signs = y if name.startswith(("r", "__r")) else x + signs = np.sign(signs.astype("float", copy=False)) negative_inf_mask = (signs.ravel() < 0) & mask np.putmask(result, negative_inf_mask, -fill) @@ -606,7 +674,7 @@ def mask_zero_div_zero(x, y, result, copy=False): if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN - result = result.astype('float64', copy=copy).ravel() + result = result.astype("float64", copy=copy).ravel() np.putmask(result, nan_mask, np.nan) np.putmask(result, posinf_mask, np.inf) @@ -633,9 +701,8 @@ def dispatch_missing(op, left, right, result): ------- result : ndarray """ - opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__') - if op in [operator.truediv, operator.floordiv, - getattr(operator, 'div', None)]: + opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__") + if op in [operator.truediv, operator.floordiv, getattr(operator, "div", None)]: result = mask_zero_div_zero(left, right, result) elif op is operator.mod: result = fill_zeros(result, left, right, opstr, np.nan) @@ -684,8 +751,9 @@ def _interp_limit(invalid, fw_limit, bw_limit): def inner(invalid, limit): limit = min(limit, N) windowed = _rolling_window(invalid, limit + 1).all(1) - idx = (set(np.where(windowed)[0] + limit) | - set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0])) + idx = set(np.where(windowed)[0] + limit) | set( + np.where((~invalid[: limit + 1]).cumsum() == 0)[0] + ) return idx if fw_limit is not None: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index cc8b241bedba1f..ce14cb22a88cee 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -12,18 +12,30 @@ from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, - is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype) + _get_dtype, + is_any_int_dtype, + 
is_bool_dtype, + is_complex, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.common as com -bn = import_optional_dependency("bottleneck", - raise_on_missing=False, - on_version="warn") +bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -35,28 +47,26 @@ def set_use_bottleneck(v=True): _USE_BOTTLENECK = v -set_use_bottleneck(get_option('compute.use_bottleneck')) +set_use_bottleneck(get_option("compute.use_bottleneck")) class disallow: - def __init__(self, *dtypes): super().__init__() self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes) def check(self, obj): - return hasattr(obj, 'dtype') and issubclass(obj.dtype.type, - self.dtypes) + return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes) def __call__(self, f): @functools.wraps(f) def _f(*args, **kwargs): obj_iter = itertools.chain(args, kwargs.values()) if any(self.check(obj) for obj in obj_iter): - msg = 'reduction operation {name!r} not allowed for this dtype' - raise TypeError(msg.format(name=f.__name__.replace('nan', ''))) + msg = "reduction operation {name!r} not allowed for this dtype" + raise TypeError(msg.format(name=f.__name__.replace("nan", ""))) try: - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): return f(*args, **kwargs) except ValueError as e: # we want to transform an object array @@ -71,7 +81,6 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, name=None, **kwargs): self.name = name self.kwargs = kwargs @@ -91,7 +100,7 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if values.size == 0 and kwds.get('min_count') is None: + if values.size == 0 and kwds.get("min_count") is None: # We are empty, returning NA for our type # Only applies for the default `min_count` of None # since that affects how empty arrays are handled. 
@@ -100,8 +109,7 @@ def f(values, axis=None, skipna=True, **kwds): # It *may* just be `var` return _na_for_min_count(values, axis) - if (_USE_BOTTLENECK and skipna and - _bn_ok_dtype(values.dtype, bn_name)): + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name): result = bn_func(values, axis=axis, **kwds) # prefer to treat inf/-inf as NA, but must compute the func @@ -130,9 +138,9 @@ def f(values, axis=None, skipna=True, **kwds): def _bn_ok_dtype(dt, name): # Bottleneck chokes on datetime64 - if (not is_object_dtype(dt) and - not (is_datetime_or_timedelta_dtype(dt) or - is_datetime64tz_dtype(dt))): + if not is_object_dtype(dt) and not ( + is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) + ): # GH 15507 # bottleneck does not properly upcast during the sum @@ -142,7 +150,7 @@ def _bn_ok_dtype(dt, name): # further we also want to preserve NaN when all elements # are NaN, unlinke bottleneck/numpy which consider this # to be 0 - if name in ['nansum', 'nanprod']: + if name in ["nansum", "nanprod"]: return False return True @@ -151,9 +159,9 @@ def _bn_ok_dtype(dt, name): def _has_infs(result): if isinstance(result, np.ndarray): - if result.dtype == 'f8': + if result.dtype == "f8": return lib.has_infs_f8(result.ravel()) - elif result.dtype == 'f4': + elif result.dtype == "f4": return lib.has_infs_f4(result.ravel()) try: return np.isinf(result).any() @@ -170,7 +178,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): if fill_value_typ is None: return np.nan else: - if fill_value_typ == '+inf': + if fill_value_typ == "+inf": return np.inf else: return -np.inf @@ -178,15 +186,16 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): if fill_value_typ is None: return tslibs.iNaT else: - if fill_value_typ == '+inf': + if fill_value_typ == "+inf": # need the max int here return _int64_max else: return tslibs.iNaT -def _maybe_get_mask(values: np.ndarray, skipna: bool, - mask: Optional[np.ndarray]) -> Optional[np.ndarray]: +def _maybe_get_mask( + values: np.ndarray, skipna: bool, mask: Optional[np.ndarray] +) -> Optional[np.ndarray]: """ This function will compute a mask iff it is necessary. Otherwise, return the provided mask (potentially None) when a mask does not need to be computed. @@ -227,11 +236,13 @@ def _maybe_get_mask(values: np.ndarray, skipna: bool, return mask -def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, - fill_value_typ: Optional[str] = None, - mask: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, - np.dtype, Any]: +def _get_values( + values: np.ndarray, + skipna: bool, + fill_value: Any = None, + fill_value_typ: Optional[str] = None, + mask: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. 
If both mask and fill_value/fill_value_typ are not None and skipna is True, @@ -288,8 +299,9 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, # get our fill value (in case we need to provide an alternative # dtype for it) - fill_value = _get_fill_value(dtype, fill_value=fill_value, - fill_value_typ=fill_value_typ) + fill_value = _get_fill_value( + dtype, fill_value=fill_value, fill_value_typ=fill_value_typ + ) copy = (mask is not None) and (fill_value is not None) @@ -315,16 +327,19 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isna(values) - if (is_complex_dtype(values) or is_float_dtype(values) or - is_integer_dtype(values) or is_bool_dtype(values)): + if ( + is_complex_dtype(values) + or is_float_dtype(values) + or is_integer_dtype(values) + or is_bool_dtype(values) + ): return ~np.isfinite(values) - return ~np.isfinite(values.astype('float64')) + return ~np.isfinite(values.astype("float64")) def _na_ok_dtype(dtype): # TODO: what about datetime64tz? PeriodDtype? - return not issubclass(dtype.type, - (np.integer, np.timedelta64, np.datetime64)) + return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) def _wrap_results(result, dtype, fill_value=None): @@ -335,7 +350,7 @@ def _wrap_results(result, dtype, fill_value=None): # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): - tz = getattr(dtype, 'tz', None) + tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan @@ -351,9 +366,9 @@ def _wrap_results(result, dtype, fill_value=None): if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") - result = tslibs.Timedelta(result, unit='ns') + result = tslibs.Timedelta(result, unit="ns") else: - result = result.astype('i8').view(dtype) + result = result.astype("i8").view(dtype) return result @@ -375,14 +390,13 @@ def _na_for_min_count(values, axis): """ # we either return np.nan or pd.NaT if is_numeric_dtype(values): - values = values.astype('float64') + values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) if values.ndim == 1: return fill_value else: - result_shape = (values.shape[:axis] + - values.shape[axis + 1:]) + result_shape = values.shape[:axis] + values.shape[axis + 1 :] result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result @@ -416,8 +430,7 @@ def nanany(values, axis=None, skipna=True, mask=None): >>> nanops.nanany(s) False """ - values, _, _, _, _ = _get_values(values, skipna, fill_value=False, - mask=mask) + values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask) return values.any(axis) @@ -449,12 +462,11 @@ def nanall(values, axis=None, skipna=True, mask=None): >>> nanops.nanall(s) False """ - values, _, _, _, _ = _get_values(values, skipna, fill_value=True, - mask=mask) + values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask) return values.all(axis) -@disallow('M8') +@disallow("M8") def nansum(values, axis=None, skipna=True, min_count=0, mask=None): """ Sum the elements along an axis ignoring NaNs @@ -479,21 +491,21 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): >>> nanops.nansum(s) 3.0 """ - values, mask, dtype, dtype_max, _ = _get_values(values, skipna, - fill_value=0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) 
dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, - min_count=min_count) + the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) return _wrap_results(the_sum, dtype) -@disallow('M8', DatetimeTZDtype) +@disallow("M8", DatetimeTZDtype) @bottleneck_switch() def nanmean(values, axis=None, skipna=True, mask=None): """ @@ -520,12 +532,17 @@ def nanmean(values, axis=None, skipna=True, mask=None): >>> nanops.nanmean(s) 1.5 """ - values, mask, dtype, dtype_max, _ = _get_values(values, skipna, - fill_value=0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) dtype_sum = dtype_max dtype_count = np.float64 - if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or - is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)): + if ( + is_integer_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype @@ -533,7 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) - if axis is not None and getattr(the_sum, 'ndim', False): + if axis is not None and getattr(the_sum, "ndim", False): with np.errstate(all="ignore"): # suppress division by zero warnings the_mean = the_sum / count @@ -546,7 +563,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): return _wrap_results(the_mean, dtype) -@disallow('M8') +@disallow("M8") @bottleneck_switch() def nanmedian(values, axis=None, skipna=True, mask=None): """ @@ -571,6 +588,7 @@ def nanmedian(values, axis=None, skipna=True, mask=None): >>> nanops.nanmedian(s) 2.0 """ + def get_median(x): mask = notna(x) if not skipna and not mask.all(): @@ -579,7 +597,7 @@ def get_median(x): values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values): - values = values.astype('f8') + values = values.astype("f8") if mask is not None: values[mask] = np.nan @@ -595,7 +613,8 @@ def get_median(x): if notempty: if not skipna: return _wrap_results( - np.apply_along_axis(get_median, axis, values), dtype) + np.apply_along_axis(get_median, axis, values), dtype + ) # fastpath for the skipna case return _wrap_results(np.nanmedian(values, axis), dtype) @@ -614,10 +633,13 @@ def get_median(x): return _wrap_results(get_median(values) if notempty else np.nan, dtype) -def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray], - axis: Optional[int], ddof: int, - dtype=float) -> Tuple[Union[int, np.ndarray], - Union[int, np.ndarray]]: +def _get_counts_nanvar( + value_counts: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + ddof: int, + dtype=float, +) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]: """ Get the count of non-null values along an axis, accounting for degrees of freedom. 
@@ -656,7 +678,7 @@ def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray], return count, d -@disallow('M8') +@disallow("M8") @bottleneck_switch(ddof=1) def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -686,12 +708,11 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ - result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, - mask=mask)) + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) return _wrap_results(result, values.dtype) -@disallow('M8') +@disallow("M8") @bottleneck_switch(ddof=1) def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -725,13 +746,12 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): dtype = values.dtype mask = _maybe_get_mask(values, skipna, mask) if is_any_int_dtype(values): - values = values.astype('f8') + values = values.astype("f8") if mask is not None: values[mask] = np.nan if is_float_dtype(values): - count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, - values.dtype) + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) @@ -761,7 +781,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): return _wrap_results(result, values.dtype) -@disallow('M8', 'm8') +@disallow("M8", "m8") def nansem(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the standard error in the mean along given axis while ignoring NaNs @@ -797,7 +817,7 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) var = nanvar(values, axis, skipna, ddof=ddof) @@ -806,20 +826,18 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - - @bottleneck_switch(name='nan' + meth) + @bottleneck_switch(name="nan" + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( - values, skipna, fill_value_typ=fill_value_typ, mask=mask) + values, skipna, fill_value_typ=fill_value_typ, mask=mask + ) - if ((axis is not None and values.shape[axis] == 0) or - values.size == 0): + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, - ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): result = np.nan else: result = getattr(values, meth)(axis) @@ -830,11 +848,11 @@ def reduction(values, axis=None, skipna=True, mask=None): return reduction -nanmin = _nanminmax('min', fill_value_typ='+inf') -nanmax = _nanminmax('max', fill_value_typ='-inf') +nanmin = _nanminmax("min", fill_value_typ="+inf") +nanmax = _nanminmax("max", fill_value_typ="-inf") -@disallow('O') +@disallow("O") def nanargmax(values, axis=None, skipna=True, mask=None): """ Parameters @@ -858,13 +876,14 @@ def nanargmax(values, axis=None, skipna=True, mask=None): 4 """ values, mask, dtype, _, _ = _get_values( - values, True, fill_value_typ='-inf', mask=mask) + values, True, fill_value_typ="-inf", mask=mask + ) result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result -@disallow('O') 
+@disallow("O") def nanargmin(values, axis=None, skipna=True, mask=None): """ Parameters @@ -888,13 +907,14 @@ def nanargmin(values, axis=None, skipna=True, mask=None): 0 """ values, mask, dtype, _, _ = _get_values( - values, True, fill_value_typ='+inf', mask=mask) + values, True, fill_value_typ="+inf", mask=mask + ) result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nanskew(values, axis=None, skipna=True, mask=None): """ Compute the sample skewness. @@ -926,7 +946,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): values = com.values_from_object(values) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count = _get_counts(values.shape, mask, axis) else: count = _get_counts(values.shape, mask, axis, dtype=values.dtype) @@ -954,7 +974,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype @@ -972,7 +992,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nankurt(values, axis=None, skipna=True, mask=None): """ Compute the sample excess kurtosis @@ -1005,7 +1025,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): values = com.values_from_object(values) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count = _get_counts(values.shape, mask, axis) else: count = _get_counts(values.shape, mask, axis, dtype=values.dtype) @@ -1026,7 +1046,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2 ** 2 @@ -1046,7 +1066,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): if denom == 0: return 0 - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): result = numer / denom - adj dtype = values.dtype @@ -1060,7 +1080,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): """ Parameters @@ -1093,18 +1113,17 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask, values.shape, - min_count=min_count) + return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count) -def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int], - mask: Optional[np.ndarray], - skipna: bool) -> Union[np.ndarray, int]: +def _maybe_arg_null_out( + result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool +) -> Union[np.ndarray, int]: # helper function for nanargmin/nanargmax if mask is None: return result - if axis is None or not getattr(result, 'ndim', False): + if axis is None or not getattr(result, 
"ndim", False): if skipna: if mask.all(): result = -1 @@ -1121,8 +1140,12 @@ def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int], return result -def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray], - axis: Optional[int], dtype=float) -> Union[int, np.ndarray]: +def _get_counts( + values_shape: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + dtype=float, +) -> Union[int, np.ndarray]: """ Get the count of non-null values along an axis Parameters @@ -1161,18 +1184,21 @@ def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray], return np.array(count, dtype=dtype) -def _maybe_null_out(result: np.ndarray, axis: Optional[int], - mask: Optional[np.ndarray], shape: Tuple, - min_count: int = 1) -> np.ndarray: - if (mask is not None and axis is not None and - getattr(result, 'ndim', False)): +def _maybe_null_out( + result: np.ndarray, + axis: Optional[int], + mask: Optional[np.ndarray], + shape: Tuple, + min_count: int = 1, +) -> np.ndarray: + if mask is not None and axis is not None and getattr(result, "ndim", False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): - result = result.astype('c16') + result = result.astype("c16") else: - result = result.astype('f8') + result = result.astype("f8") result[null_mask] = np.nan else: # GH12941, use None to auto cast null @@ -1191,19 +1217,19 @@ def _maybe_null_out(result: np.ndarray, axis: Optional[int], def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): return np.where(np.abs(arg) < 1e-14, 0, arg) else: return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg -@disallow('M8', 'm8') -def nancorr(a, b, method='pearson', min_periods=None): +@disallow("M8", "m8") +def nancorr(a, b, method="pearson", min_periods=None): """ a, b: ndarrays """ if len(a) != len(b): - raise AssertionError('Operands to nancorr must have same size') + raise AssertionError("Operands to nancorr must have same size") if min_periods is None: min_periods = 1 @@ -1221,7 +1247,7 @@ def nancorr(a, b, method='pearson', min_periods=None): def get_corr_func(method): - if method in ['kendall', 'spearman']: + if method in ["kendall", "spearman"]: from scipy.stats import kendalltau, spearmanr elif callable(method): return method @@ -1238,18 +1264,14 @@ def _kendall(a, b): def _spearman(a, b): return spearmanr(a, b)[0] - _cor_methods = { - 'pearson': _pearson, - 'kendall': _kendall, - 'spearman': _spearman - } + _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman} return _cor_methods[method] -@disallow('M8', 'm8') +@disallow("M8", "m8") def nancov(a, b, min_periods=None): if len(a) != len(b): - raise AssertionError('Operands to nancov must have same size') + raise AssertionError("Operands to nancov must have same size") if min_periods is None: min_periods = 1 @@ -1284,10 +1306,12 @@ def _ensure_numeric(x): try: x = complex(x) except Exception: - raise TypeError('Could not convert {value!s} to numeric' - .format(value=x)) + raise TypeError( + "Could not convert {value!s} to numeric".format(value=x) + ) return x + # NA-friendly array comparisons @@ -1297,12 +1321,12 @@ def f(x, y): ymask = isna(y) mask = xmask | ymask - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(x, y) if mask.any(): if is_bool_dtype(result): - result = result.astype('O') + 
result = result.astype("O") np.putmask(result, mask, np.nan) return result @@ -1344,8 +1368,7 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation): if lib.is_scalar(q): return na_value else: - return np.array([na_value] * len(q), - dtype=values.dtype) + return np.array([na_value] * len(q), dtype=values.dtype) return np.percentile(values, q, interpolation=interpolation) @@ -1372,8 +1395,9 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): """ if not lib.is_scalar(mask) and mask.any(): if ndim == 1: - return _nanpercentile_1d(values, mask, q, na_value, - interpolation=interpolation) + return _nanpercentile_1d( + values, mask, q, na_value, interpolation=interpolation + ) else: # for nonconsolidatable blocks mask is 1D, but values 2D if mask.ndim < values.ndim: @@ -1381,9 +1405,10 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): if axis == 0: values = values.T mask = mask.T - result = [_nanpercentile_1d(val, m, q, na_value, - interpolation=interpolation) - for (val, m) in zip(list(values), list(mask))] + result = [ + _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation) + for (val, m) in zip(list(values), list(mask)) + ] result = np.array(result, dtype=values.dtype, copy=False).T return result else: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 5dd84550732121..5c58a1433ba3cc 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -16,16 +16,34 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, find_common_type, - maybe_upcast_putmask) + construct_1d_object_array_from_listlike, + find_common_type, + maybe_upcast_putmask, +) from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, - is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, - is_scalar, is_timedelta64_dtype, needs_i8_conversion) + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike_v_numeric, + is_extension_array_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_period_dtype, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndex, ABCIndexClass, ABCSeries, ABCSparseArray, - ABCSparseSeries) + ABCDataFrame, + ABCIndex, + ABCIndexClass, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas as pd @@ -80,8 +98,8 @@ def _maybe_match_name(a, b): -------- pandas.core.common.consensus_name_attr """ - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') + a_has = hasattr(a, "name") + b_has = hasattr(b, "name") if a_has and b_has: if a.name == b.name: return a.name @@ -137,6 +155,7 @@ def maybe_upcast_for_op(obj): # Reversed Operations not available in the stdlib operator module. # Defining these instead of using lambdas allows us to reference them by name. 
+ def radd(left, right): return right + left @@ -166,8 +185,9 @@ def rmod(left, right): # formatting operation; this is a TypeError # otherwise perform the op if isinstance(right, str): - raise TypeError("{typ} cannot perform the operation mod".format( - typ=type(left).__name__)) + raise TypeError( + "{typ} cannot perform the operation mod".format(typ=type(left).__name__) + ) return right % left @@ -194,6 +214,7 @@ def rxor(left, right): # ----------------------------------------------------------------------------- + def make_invalid_op(name): """ Return a binary method that always raises a TypeError. @@ -206,9 +227,12 @@ def make_invalid_op(name): ------- invalid_op : function """ + def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__)) + raise TypeError( + "cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self).__name__) + ) invalid_op.__name__ = name return invalid_op @@ -239,18 +263,18 @@ def _gen_eval_kwargs(name): # Series appear to only pass __add__, __radd__, ... # but DataFrame gets both these dunder names _and_ non-dunder names # add, radd, ... - name = name.replace('__', '') + name = name.replace("__", "") - if name.startswith('r'): - if name not in ['radd', 'rand', 'ror', 'rxor']: + if name.startswith("r"): + if name not in ["radd", "rand", "ror", "rxor"]: # Exclude commutative operations - kwargs['reversed'] = True + kwargs["reversed"] = True - if name in ['truediv', 'rtruediv']: - kwargs['truediv'] = True + if name in ["truediv", "rtruediv"]: + kwargs["truediv"] = True - if name in ['ne']: - kwargs['masker'] = True + if name in ["ne"]: + kwargs["masker"] = True return kwargs @@ -269,11 +293,11 @@ def _gen_fill_zeros(name): ------- fill_value : {None, np.nan, np.inf} """ - name = name.strip('__') - if 'div' in name: + name = name.strip("__") + if "div" in name: # truediv, floordiv, div, and reversed variants fill_value = np.inf - elif 'mod' in name: + elif "mod" in name: # mod, rmod fill_value = np.nan else: @@ -295,15 +319,15 @@ def _get_frame_op_default_axis(name): ------- default_axis: str or None """ - if name.replace('__r', '__') in ['__and__', '__or__', '__xor__']: + if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: # bool methods - return 'columns' - elif name.startswith('__'): + return "columns" + elif name.startswith("__"): # __add__, __mul__, ... return None else: # add, mul, ... - return 'columns' + return "columns" def _get_opstr(op, cls): @@ -321,41 +345,43 @@ def _get_opstr(op, cls): op_str : string or None """ # numexpr is available for non-sparse classes - subtyp = getattr(cls, '_subtyp', '') - use_numexpr = 'sparse' not in subtyp + subtyp = getattr(cls, "_subtyp", "") + use_numexpr = "sparse" not in subtyp if not use_numexpr: # if we're not using numexpr, then don't pass a str_rep return None - return {operator.add: '+', - radd: '+', - operator.mul: '*', - rmul: '*', - operator.sub: '-', - rsub: '-', - operator.truediv: '/', - rtruediv: '/', - operator.floordiv: '//', - rfloordiv: '//', - operator.mod: None, # TODO: Why None for mod but '%' for rmod? 
- rmod: '%', - operator.pow: '**', - rpow: '**', - operator.eq: '==', - operator.ne: '!=', - operator.le: '<=', - operator.lt: '<', - operator.ge: '>=', - operator.gt: '>', - operator.and_: '&', - rand_: '&', - operator.or_: '|', - ror_: '|', - operator.xor: '^', - rxor: '^', - divmod: None, - rdivmod: None}[op] + return { + operator.add: "+", + radd: "+", + operator.mul: "*", + rmul: "*", + operator.sub: "-", + rsub: "-", + operator.truediv: "/", + rtruediv: "/", + operator.floordiv: "//", + rfloordiv: "//", + operator.mod: None, # TODO: Why None for mod but '%' for rmod? + rmod: "%", + operator.pow: "**", + rpow: "**", + operator.eq: "==", + operator.ne: "!=", + operator.le: "<=", + operator.lt: "<", + operator.ge: ">=", + operator.gt: ">", + operator.and_: "&", + rand_: "&", + operator.or_: "|", + ror_: "|", + operator.xor: "^", + rxor: "^", + divmod: None, + rdivmod: None, + }[op] def _get_op_name(op, special): @@ -372,9 +398,9 @@ def _get_op_name(op, special): ------- op_name : str """ - opname = op.__name__.strip('_') + opname = op.__name__.strip("_") if special: - opname = '__{opname}__'.format(opname=opname) + opname = "__{opname}__".format(opname=opname) return opname @@ -564,77 +590,89 @@ def _get_op_name(op, special): _op_descriptions = { # Arithmetic Operators - 'add': {'op': '+', - 'desc': 'Addition', - 'reverse': 'radd', - 'series_examples': _add_example_SERIES}, - 'sub': {'op': '-', - 'desc': 'Subtraction', - 'reverse': 'rsub', - 'series_examples': _sub_example_SERIES}, - 'mul': {'op': '*', - 'desc': 'Multiplication', - 'reverse': 'rmul', - 'series_examples': _mul_example_SERIES, - 'df_examples': None}, - 'mod': {'op': '%', - 'desc': 'Modulo', - 'reverse': 'rmod', - 'series_examples': _mod_example_SERIES}, - 'pow': {'op': '**', - 'desc': 'Exponential power', - 'reverse': 'rpow', - 'series_examples': _pow_example_SERIES, - 'df_examples': None}, - 'truediv': {'op': '/', - 'desc': 'Floating division', - 'reverse': 'rtruediv', - 'series_examples': _div_example_SERIES, - 'df_examples': None}, - 'floordiv': {'op': '//', - 'desc': 'Integer division', - 'reverse': 'rfloordiv', - 'series_examples': _floordiv_example_SERIES, - 'df_examples': None}, - 'divmod': {'op': 'divmod', - 'desc': 'Integer division and modulo', - 'reverse': 'rdivmod', - 'series_examples': None, - 'df_examples': None}, - + "add": { + "op": "+", + "desc": "Addition", + "reverse": "radd", + "series_examples": _add_example_SERIES, + }, + "sub": { + "op": "-", + "desc": "Subtraction", + "reverse": "rsub", + "series_examples": _sub_example_SERIES, + }, + "mul": { + "op": "*", + "desc": "Multiplication", + "reverse": "rmul", + "series_examples": _mul_example_SERIES, + "df_examples": None, + }, + "mod": { + "op": "%", + "desc": "Modulo", + "reverse": "rmod", + "series_examples": _mod_example_SERIES, + }, + "pow": { + "op": "**", + "desc": "Exponential power", + "reverse": "rpow", + "series_examples": _pow_example_SERIES, + "df_examples": None, + }, + "truediv": { + "op": "/", + "desc": "Floating division", + "reverse": "rtruediv", + "series_examples": _div_example_SERIES, + "df_examples": None, + }, + "floordiv": { + "op": "//", + "desc": "Integer division", + "reverse": "rfloordiv", + "series_examples": _floordiv_example_SERIES, + "df_examples": None, + }, + "divmod": { + "op": "divmod", + "desc": "Integer division and modulo", + "reverse": "rdivmod", + "series_examples": None, + "df_examples": None, + }, # Comparison Operators - 'eq': {'op': '==', - 'desc': 'Equal to', - 'reverse': None, - 'series_examples': None}, - 
'ne': {'op': '!=', - 'desc': 'Not equal to', - 'reverse': None, - 'series_examples': None}, - 'lt': {'op': '<', - 'desc': 'Less than', - 'reverse': None, - 'series_examples': None}, - 'le': {'op': '<=', - 'desc': 'Less than or equal to', - 'reverse': None, - 'series_examples': None}, - 'gt': {'op': '>', - 'desc': 'Greater than', - 'reverse': None, - 'series_examples': None}, - 'ge': {'op': '>=', - 'desc': 'Greater than or equal to', - 'reverse': None, - 'series_examples': None} + "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None}, + "ne": { + "op": "!=", + "desc": "Not equal to", + "reverse": None, + "series_examples": None, + }, + "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None}, + "le": { + "op": "<=", + "desc": "Less than or equal to", + "reverse": None, + "series_examples": None, + }, + "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None}, + "ge": { + "op": ">=", + "desc": "Greater than or equal to", + "reverse": None, + "series_examples": None, + }, } # type: Dict[str, Dict[str, Optional[str]]] _op_names = list(_op_descriptions.keys()) for key in _op_names: - reverse_op = _op_descriptions[key]['reverse'] + reverse_op = _op_descriptions[key]["reverse"] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() - _op_descriptions[reverse_op]['reverse'] = key + _op_descriptions[reverse_op]["reverse"] = key _flex_doc_SERIES = """ Return {desc} of series and other, element-wise (binary operator `{op_name}`). @@ -1007,42 +1045,43 @@ def _make_flex_doc(op_name, typ): ------- doc : str """ - op_name = op_name.replace('__', '') + op_name = op_name.replace("__", "") op_desc = _op_descriptions[op_name] - if op_name.startswith('r'): - equiv = 'other ' + op_desc['op'] + ' ' + typ + if op_name.startswith("r"): + equiv = "other " + op_desc["op"] + " " + typ else: - equiv = typ + ' ' + op_desc['op'] + ' other' + equiv = typ + " " + op_desc["op"] + " other" - if typ == 'series': + if typ == "series": base_doc = _flex_doc_SERIES doc_no_examples = base_doc.format( - desc=op_desc['desc'], + desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc['reverse'] + reverse=op_desc["reverse"], ) - if op_desc['series_examples']: - doc = doc_no_examples + op_desc['series_examples'] + if op_desc["series_examples"]: + doc = doc_no_examples + op_desc["series_examples"] else: doc = doc_no_examples - elif typ == 'dataframe': + elif typ == "dataframe": base_doc = _flex_doc_FRAME doc = base_doc.format( - desc=op_desc['desc'], + desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc['reverse'] + reverse=op_desc["reverse"], ) else: - raise AssertionError('Invalid typ argument.') + raise AssertionError("Invalid typ argument.") return doc # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support + def fill_binop(left, right, fill_value): """ If a non-None fill_value is given, replace null entries in left and right @@ -1097,8 +1136,7 @@ def mask_cmp_op(x, y, op): if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) + result[mask] = op(np.array(list(xrav[mask])), np.array(list(yrav[mask]))) else: mask = notna(xrav) result[mask] = op(np.array(list(xrav[mask])), y) @@ -1140,12 +1178,11 @@ def masked_arith_op(x, y, op): # Without specifically raising here we get mismatched # 
errors in Py3 (TypeError) vs Py2 (ValueError) # Note: Only = an issue in DataFrame case - raise ValueError('Cannot broadcast operands together.') + raise ValueError("Cannot broadcast operands together.") if mask.any(): - with np.errstate(all='ignore'): - result[mask] = op(xrav[mask], - com.values_from_object(yrav[mask])) + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], com.values_from_object(yrav[mask])) else: assert is_scalar(y), type(y) @@ -1161,7 +1198,7 @@ def masked_arith_op(x, y, op): mask = np.where(y == 1, False, mask) if mask.any(): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result[mask] = op(xrav[mask], y) result, changed = maybe_upcast_putmask(result, ~mask, np.nan) @@ -1193,14 +1230,18 @@ def invalid_comparison(left, right, op): elif op is operator.ne: res_values = np.ones(left.shape, dtype=bool) else: - raise TypeError("Invalid comparison between dtype={dtype} and {typ}" - .format(dtype=left.dtype, typ=type(right).__name__)) + raise TypeError( + "Invalid comparison between dtype={dtype} and {typ}".format( + dtype=left.dtype, typ=type(right).__name__ + ) + ) return res_values # ----------------------------------------------------------------------------- # Dispatch logic + def should_series_dispatch(left, right, op): """ Identify cases where a DataFrame operation should dispatch to its @@ -1226,8 +1267,9 @@ def should_series_dispatch(left, right, op): ldtype = left.dtypes.iloc[0] rdtype = right.dtypes.iloc[0] - if ((is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or - (is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype))): + if (is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or ( + is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype) + ): # numpy integer dtypes as timedelta64 dtypes in this scenario return True @@ -1263,15 +1305,13 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): if lib.is_scalar(right) or np.ndim(right) == 0: def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via left._combine_match_columns, @@ -1279,15 +1319,13 @@ def column_op(a, b): assert right.index.equals(left.columns) def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[i]) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries): assert right.index.equals(left.index) # Handle other cases later def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} else: # Remaining cases have less-obvious dispatch rules @@ -1324,15 +1362,17 @@ def dispatch_to_index_op(op, left, right, index_class): # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes, # left_idx may inherit a freq from a cached DatetimeIndex. # See discussion in GH#19147. 
- if getattr(left_idx, 'freq', None) is not None: + if getattr(left_idx, "freq", None) is not None: left_idx = left_idx._shallow_copy(freq=None) try: result = op(left_idx, right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. - raise TypeError('incompatible type for a datetime/timedelta ' - 'operation [{name}]'.format(name=op.__name__)) + raise TypeError( + "incompatible type for a datetime/timedelta " + "operation [{name}]".format(name=op.__name__) + ) return result @@ -1359,9 +1399,8 @@ def dispatch_to_extension_op(op, left, right): res_values = op(new_left, new_right) res_name = get_op_result_name(left, right) - if op.__name__ in ['divmod', 'rdivmod']: - return _construct_divmod_result( - left, res_values, left.index, res_name) + if op.__name__ in ["divmod", "rdivmod"]: + return _construct_divmod_result(left, res_values, left.index, res_name) return _construct_result(left, res_values, left.index, res_name) @@ -1370,6 +1409,7 @@ def dispatch_to_extension_op(op, left, right): # Functions that add arithmetic methods to objects, given arithmetic factory # methods + def _get_method_wrappers(cls): """ Find the appropriate operation-wrappers to use when defining flex/special @@ -1451,33 +1491,39 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): rpow=arith_method(cls, rpow, special), rmod=arith_method(cls, rmod, special)) # yapf: enable - new_methods['div'] = new_methods['truediv'] - new_methods['rdiv'] = new_methods['rtruediv'] + new_methods["div"] = new_methods["truediv"] + new_methods["rdiv"] = new_methods["rtruediv"] if have_divmod: # divmod doesn't have an op that is supported by numexpr - new_methods['divmod'] = arith_method(cls, divmod, special) - new_methods['rdivmod'] = arith_method(cls, rdivmod, special) + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) - new_methods.update(dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special))) + new_methods.update( + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) + ) if bool_method: new_methods.update( - dict(and_=bool_method(cls, operator.and_, special), - or_=bool_method(cls, operator.or_, special), - # For some reason ``^`` wasn't used in original. - xor=bool_method(cls, operator.xor, special), - rand_=bool_method(cls, rand_, special), - ror_=bool_method(cls, ror_, special), - rxor=bool_method(cls, rxor, special))) + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + # For some reason ``^`` wasn't used in original. 
+ xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) if special: - dunderize = lambda x: '__{name}__'.format(name=x.strip('_')) + dunderize = lambda x: "__{name}__".format(name=x.strip("_")) else: dunderize = lambda x: x new_methods = {dunderize(k): v for k, v in new_methods.items()} @@ -1490,8 +1536,7 @@ def add_methods(cls, new_methods): # of the same name, it is OK to over-write it. The exception is # inplace methods (__iadd__, __isub__, ...) for SparseArray, which # retain the np.ndarray versions. - force = not (issubclass(cls, ABCSparseArray) and - name.startswith('__i')) + force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) if force or name not in cls.__dict__: setattr(cls, name, method) @@ -1509,8 +1554,9 @@ def add_special_arithmetic_methods(cls): special methods will be defined and pinned to this class """ _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) - new_methods = _create_methods(cls, arith_method, comp_method, bool_method, - special=True) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) # inplace operators (I feel like these should get passed an `inplace=True` # or just be removed @@ -1524,8 +1570,9 @@ def f(self, other): # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy - self._update_inplace(result.reindex_like(self, copy=False)._data, - verify_is_copy=False) + self._update_inplace( + result.reindex_like(self, copy=False)._data, verify_is_copy=False + ) return self @@ -1533,18 +1580,24 @@ def f(self, other): return f new_methods.update( - dict(__iadd__=_wrap_inplace_method(new_methods["__add__"]), - __isub__=_wrap_inplace_method(new_methods["__sub__"]), - __imul__=_wrap_inplace_method(new_methods["__mul__"]), - __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), - __imod__=_wrap_inplace_method(new_methods["__mod__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]))) + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) new_methods.update( - dict(__iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) add_methods(cls, new_methods=new_methods) @@ -1560,14 +1613,18 @@ def add_flex_arithmetic_methods(cls): flex methods will be defined and pinned to this class """ flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) - new_methods = _create_methods(cls, flex_arith_method, - flex_comp_method, bool_method=None, - special=False) - new_methods.update(dict(multiply=new_methods['mul'], - subtract=new_methods['sub'], - divide=new_methods['div'])) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, 
special=False + ) + new_methods.update( + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) + ) # opt out of bool flex methods for now - assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_')) + assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) add_methods(cls, new_methods=new_methods) @@ -1575,6 +1632,7 @@ def add_flex_arithmetic_methods(cls): # ----------------------------------------------------------------------------- # Series + def _align_method_SERIES(left, right, align_asobject=False): """ align lhs and rhs Series """ @@ -1612,10 +1670,8 @@ def _construct_divmod_result(left, result, index, name, dtype=None): """divmod returns a tuple of like indexed series instead of a single series. """ return ( - _construct_result(left, result[0], index=index, name=name, - dtype=dtype), - _construct_result(left, result[1], index=index, name=name, - dtype=dtype), + _construct_result(left, result[0], index=index, name=name, dtype=dtype), + _construct_result(left, result[1], index=index, name=name, dtype=dtype), ) @@ -1628,8 +1684,9 @@ def _arith_method_SERIES(cls, op, special): op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) fill_zeros = _gen_fill_zeros(op_name) - construct_result = (_construct_divmod_result - if op in [divmod, rdivmod] else _construct_result) + construct_result = ( + _construct_divmod_result if op in [divmod, rdivmod] else _construct_result + ) def na_op(x, y): """ @@ -1651,21 +1708,20 @@ def na_op(x, y): TypeError : invalid operation """ import pandas.core.computation.expressions as expressions + try: result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) except TypeError: result = masked_arith_op(x, y, op) except Exception: # TODO: more specific? if is_object_dtype(x): - return libalgos.arrmap_object(x, - lambda val: op(val, y)) + return libalgos.arrmap_object(x, lambda val: op(val, y)) raise if isinstance(result, tuple): # e.g. divmod result = tuple( - missing.fill_zeros(r, x, y, op_name, fill_zeros) - for r in result + missing.fill_zeros(r, x, y, op_name, fill_zeros) for r in result ) else: result = missing.fill_zeros(result, x, y, op_name, fill_zeros) @@ -1680,27 +1736,29 @@ def wrapper(left, right): right = maybe_upcast_for_op(right) if is_categorical_dtype(left): - raise TypeError("{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep)) + raise TypeError( + "{typ} cannot perform the operation " + "{op}".format(typ=type(left).__name__, op=str_rep) + ) elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): # Give dispatch_to_index_op a chance for tests like # test_dt64_series_add_intlike, which the index dispatching handles # specifically. result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) - return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + return construct_result( + left, result, index=left.index, name=res_name, dtype=result.dtype + ) - elif (is_extension_array_dtype(left) or - (is_extension_array_dtype(right) and not is_scalar(right))): + elif is_extension_array_dtype(left) or ( + is_extension_array_dtype(right) and not is_scalar(right) + ): # GH#22378 disallow scalar to exclude e.g. 
"category", "Int64" return dispatch_to_extension_op(op, left, right) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) - return construct_result(left, result, - index=left.index, name=res_name) + return construct_result(left, result, index=left.index, name=res_name) elif is_timedelta64_dtype(right): # We should only get here with non-scalar or timedelta64('NaT') @@ -1709,19 +1767,20 @@ def wrapper(left, right): # that may incorrectly raise TypeError when we # should get NullFrequencyError result = op(pd.Index(left), right) - return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + return construct_result( + left, result, index=left.index, name=res_name, dtype=result.dtype + ) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = rvalues.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = na_op(lvalues, rvalues) - return construct_result(left, result, - index=left.index, name=res_name, dtype=None) + return construct_result( + left, result, index=left.index, name=res_name, dtype=None + ) wrapper.__name__ = op_name return wrapper @@ -1749,7 +1808,7 @@ def _comp_method_SERIES(cls, op, special): code duplication. """ op_name = _get_op_name(op, special) - masker = _gen_eval_kwargs(op_name).get('masker', False) + masker = _gen_eval_kwargs(op_name).get("masker", False) def na_op(x, y): # TODO: @@ -1779,12 +1838,12 @@ def na_op(x, y): mask = None if not is_scalar(y) and needs_i8_conversion(y): mask = isna(x) | isna(y) - y = y.view('i8') - x = x.view('i8') + y = y.view("i8") + x = x.view("i8") method = getattr(x, op_name, None) if method is not None: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = method(y) if result is NotImplemented: return invalid_comparison(x, y, op) @@ -1812,21 +1871,20 @@ def wrapper(self, other, axis=None): return NotImplemented elif isinstance(other, ABCSeries) and not self._indexed_same(other): - raise ValueError("Can only compare identically-labeled " - "Series objects") + raise ValueError("Can only compare identically-labeled " "Series objects") elif is_categorical_dtype(self): # Dispatch to Categorical implementation; pd.CategoricalIndex # behavior is non-canonical GH#19513 res_values = dispatch_to_index_op(op, self, other, pd.Categorical) - return self._constructor(res_values, index=self.index, - name=res_name) + return self._constructor(res_values, index=self.index, name=res_name) elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior - if (isinstance(other, datetime.date) and - not isinstance(other, datetime.datetime)): + if isinstance(other, datetime.date) and not isinstance( + other, datetime.datetime + ): # https://github.com/pandas-dev/pandas/issues/21152 # Compatibility for difference between Series comparison w/ # datetime and date @@ -1844,27 +1902,23 @@ def wrapper(self, other, axis=None): future = "a TypeError will be raised" else: future = ( - "'the values will not compare equal to the " - "'datetime.date'" + "'the values will not compare equal to the " "'datetime.date'" ) - msg = '\n'.join(textwrap.wrap(msg.format(future=future))) + msg = "\n".join(textwrap.wrap(msg.format(future=future))) warnings.warn(msg, FutureWarning, stacklevel=2) other = pd.Timestamp(other) - res_values = dispatch_to_index_op(op, self, other, - pd.DatetimeIndex) + res_values = dispatch_to_index_op(op, self, other, 
pd.DatetimeIndex) - return self._constructor(res_values, index=self.index, - name=res_name) + return self._constructor(res_values, index=self.index, name=res_name) elif is_timedelta64_dtype(self): - res_values = dispatch_to_index_op(op, self, other, - pd.TimedeltaIndex) - return self._constructor(res_values, index=self.index, - name=res_name) + res_values = dispatch_to_index_op(op, self, other, pd.TimedeltaIndex) + return self._constructor(res_values, index=self.index, name=res_name) - elif (is_extension_array_dtype(self) or - (is_extension_array_dtype(other) and not is_scalar(other))): + elif is_extension_array_dtype(self) or ( + is_extension_array_dtype(other) and not is_scalar(other) + ): # Note: the `not is_scalar(other)` condition rules out # e.g. other == "category" return dispatch_to_extension_op(op, self, other) @@ -1874,14 +1928,15 @@ def wrapper(self, other, axis=None): res_values = na_op(self.values, other.values) # rename is needed in case res_name is None and res_values.name # is not. - return self._constructor(res_values, index=self.index, - name=res_name).rename(res_name) + return self._constructor( + res_values, index=self.index, name=res_name + ).rename(res_name) elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast if other.ndim != 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") res_values = na_op(self.values, np.asarray(other)) result = self._constructor(res_values, index=self.index) @@ -1895,22 +1950,25 @@ def wrapper(self, other, axis=None): res_values = np.ones(len(self), dtype=bool) else: res_values = np.zeros(len(self), dtype=bool) - return self._constructor(res_values, index=self.index, - name=res_name, dtype='bool') + return self._constructor( + res_values, index=self.index, name=res_name, dtype="bool" + ) else: values = self.to_numpy() - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): res = na_op(values, other) if is_scalar(res): - raise TypeError('Could not compare {typ} type with Series' - .format(typ=type(other))) + raise TypeError( + "Could not compare {typ} type with Series".format(typ=type(other)) + ) # always return a full value series here res_values = com.values_from_object(res) - return self._constructor(res_values, index=self.index, - name=res_name, dtype='bool') + return self._constructor( + res_values, index=self.index, name=res_name, dtype="bool" + ) wrapper.__name__ = op_name return wrapper @@ -1941,12 +1999,19 @@ def na_op(x, y): y = bool(y) try: result = libops.scalar_binop(x, y, op) - except (TypeError, ValueError, AttributeError, - OverflowError, NotImplementedError): - raise TypeError("cannot compare a dtyped [{dtype}] array " - "with a scalar of type [{typ}]" - .format(dtype=x.dtype, - typ=type(y).__name__)) + except ( + TypeError, + ValueError, + AttributeError, + OverflowError, + NotImplementedError, + ): + raise TypeError( + "cannot compare a dtyped [{dtype}] array " + "with a scalar of type [{typ}]".format( + dtype=x.dtype, typ=type(y).__name__ + ) + ) return result @@ -1984,11 +2049,9 @@ def wrapper(self, other): # For int vs int `^`, `|`, `&` are bitwise operators and return # integer dtypes. 
Otherwise these are boolean ops - filler = (fill_int if is_self_int_dtype and is_other_int_dtype - else fill_bool) + filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_op(self.values, ovalues) - unfilled = self._constructor(res_values, - index=self.index, name=res_name) + unfilled = self._constructor(res_values, index=self.index, name=res_name) filled = filler(unfilled) return finalizer(filled) @@ -1998,7 +2061,7 @@ def wrapper(self, other): def _flex_method_SERIES(cls, op, special): name = _get_op_name(op, special) - doc = _make_flex_doc(name, 'series') + doc = _make_flex_doc(name, "series") @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -2009,15 +2072,14 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): return self._binop(other, op, level=level, fill_value=fill_value) elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): - raise ValueError('Lengths must be equal') + raise ValueError("Lengths must be equal") other = self._constructor(other, self.index) return self._binop(other, op, level=level, fill_value=fill_value) else: if fill_value is not None: self = self.fillna(fill_value) - return self._constructor(op(self, other), - self.index).__finalize__(self) + return self._constructor(op(self, other), self.index).__finalize__(self) flex_wrapper.__name__ = name return flex_wrapper @@ -2027,8 +2089,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _combine_series_frame(self, other, func, fill_value=None, axis=None, - level=None): +def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=None): """ Apply binary operator `func` to self, other using alignment and fill conventions determined by the fill_value, axis, and level kwargs. @@ -2047,8 +2108,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, result : DataFrame """ if fill_value is not None: - raise NotImplementedError("fill_value {fill} not supported." 
- .format(fill=fill_value)) + raise NotImplementedError( + "fill_value {fill} not supported.".format(fill=fill_value) + ) if axis is not None: axis = self._get_axis_number(axis) @@ -2062,8 +2124,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, if not len(self): # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, - columns=self.columns) + return self._constructor( + data=self._series, index=self.index, columns=self.columns + ) # default axis is columns return self._combine_match_columns(other, func, level=level) @@ -2073,17 +2136,20 @@ def _align_method_FRAME(left, right, axis): """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ def to_series(right): - msg = ('Unable to coerce to Series, length must be {req_len}: ' - 'given {given_len}') - if axis is not None and left._get_axis_name(axis) == 'index': + msg = ( + "Unable to coerce to Series, length must be {req_len}: " "given {given_len}" + ) + if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): - raise ValueError(msg.format(req_len=len(left.index), - given_len=len(right))) + raise ValueError( + msg.format(req_len=len(left.index), given_len=len(right)) + ) right = left._constructor_sliced(right, index=left.index) else: if len(left.columns) != len(right): - raise ValueError(msg.format(req_len=len(left.columns), - given_len=len(right))) + raise ValueError( + msg.format(req_len=len(left.columns), given_len=len(right)) + ) right = left._constructor_sliced(right, index=left.columns) return right @@ -2094,32 +2160,32 @@ def to_series(right): elif right.ndim == 2: if right.shape == left.shape: - right = left._constructor(right, index=left.index, - columns=left.columns) + right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) - right = left._constructor(right, - index=left.index, - columns=left.columns) + right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows right = to_series(right[0, :]) else: - raise ValueError("Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}" - .format(req_shape=left.shape, - given_shape=right.shape)) + raise ValueError( + "Unable to coerce to DataFrame, shape " + "must be {req_shape}: given {given_shape}".format( + req_shape=left.shape, given_shape=right.shape + ) + ) elif right.ndim > 2: - raise ValueError('Unable to coerce to Series/DataFrame, dim ' - 'must be <= 2: {dim}'.format(dim=right.shape)) + raise ValueError( + "Unable to coerce to Series/DataFrame, dim " + "must be <= 2: {dim}".format(dim=right.shape) + ) - elif (is_list_like(right) and - not isinstance(right, (ABCSeries, ABCDataFrame))): + elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): # GH17901 right = to_series(right) @@ -2146,7 +2212,7 @@ def na_op(x, y): if op_name in _op_descriptions: # i.e. include "add" but not "__add__" - doc = _make_flex_doc(op_name, 'dataframe') + doc = _make_flex_doc(op_name, "dataframe") else: doc = _arith_doc_FRAME % op_name @@ -2163,9 +2229,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. 
pass_op = op if axis in [0, "columns", None] else na_op - return _combine_series_frame(self, other, pass_op, - fill_value=fill_value, axis=axis, - level=level) + return _combine_series_frame( + self, other, pass_op, fill_value=fill_value, axis=axis, level=level + ) else: if fill_value is not None: self = self.fillna(fill_value) @@ -2185,14 +2251,15 @@ def _flex_comp_method_FRAME(cls, op, special): def na_op(x, y): try: - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = op(x, y) except TypeError: result = mask_cmp_op(x, y, op) return result - doc = _flex_comp_doc_FRAME.format(op_name=op_name, - desc=_op_descriptions[op_name]['desc']) + doc = _flex_comp_doc_FRAME.format( + op_name=op_name, desc=_op_descriptions[op_name]["desc"] + ) @Appender(doc) def f(self, other, axis=default_axis, level=None): @@ -2202,14 +2269,13 @@ def f(self, other, axis=default_axis, level=None): if isinstance(other, ABCDataFrame): # Another DataFrame if not self._indexed_same(other): - self, other = self.align(other, 'outer', - level=level, copy=False) + self, other = self.align(other, "outer", level=level, copy=False) return dispatch_to_series(self, other, na_op, str_rep) elif isinstance(other, ABCSeries): - return _combine_series_frame(self, other, na_op, - fill_value=None, axis=axis, - level=level) + return _combine_series_frame( + self, other, na_op, fill_value=None, axis=axis, level=level + ) else: assert np.ndim(other) == 0, other return self._combine_const(other, na_op) @@ -2223,7 +2289,7 @@ def _comp_method_FRAME(cls, func, special): str_rep = _get_opstr(func, cls) op_name = _get_op_name(func, special) - @Appender('Wrapper for comparison method {name}'.format(name=op_name)) + @Appender("Wrapper for comparison method {name}".format(name=op_name)) def f(self, other): other = _align_method_FRAME(self, other, axis=None) @@ -2231,14 +2297,15 @@ def f(self, other): if isinstance(other, ABCDataFrame): # Another DataFrame if not self._indexed_same(other): - raise ValueError('Can only compare identically-labeled ' - 'DataFrame objects') + raise ValueError( + "Can only compare identically-labeled " "DataFrame objects" + ) return dispatch_to_series(self, other, func, str_rep) elif isinstance(other, ABCSeries): - return _combine_series_frame(self, other, func, - fill_value=None, axis=None, - level=None) + return _combine_series_frame( + self, other, func, fill_value=None, axis=None, level=None + ) else: # straight boolean comparisons we want to allow all columns @@ -2254,6 +2321,7 @@ def f(self, other): # ----------------------------------------------------------------------------- # Sparse + def _cast_sparse_series_op(left, right, opname): """ For SparseSeries operation, coerce to float64 if the result is expected @@ -2272,15 +2340,15 @@ def _cast_sparse_series_op(left, right, opname): """ from pandas.core.sparse.api import SparseDtype - opname = opname.strip('_') + opname = opname.strip("_") # TODO: This should be moved to the array? 
if is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf - if opname in ('floordiv', 'mod') and (right.to_dense() == 0).any(): + if opname in ("floordiv", "mod") and (right.to_dense() == 0).any(): left = left.astype(SparseDtype(np.float64, left.fill_value)) right = right.astype(SparseDtype(np.float64, right.fill_value)) - elif opname in ('rfloordiv', 'rmod') and (left.to_dense() == 0).any(): + elif opname in ("rfloordiv", "rmod") and (left.to_dense() == 0).any(): left = left.astype(SparseDtype(np.float64, left.fill_value)) right = right.astype(SparseDtype(np.float64, right.fill_value)) @@ -2302,25 +2370,25 @@ def wrapper(self, other): other = other.to_sparse(fill_value=self.fill_value) return _sparse_series_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values = op(self.values, other) - return self._constructor(new_values, - index=self.index, - name=self.name) + return self._constructor(new_values, index=self.index, name=self.name) else: # pragma: no cover - raise TypeError('operation with {other} not supported' - .format(other=type(other))) + raise TypeError( + "operation with {other} not supported".format(other=type(other)) + ) wrapper.__name__ = op_name return wrapper def _sparse_series_op(left, right, op, name): - left, right = left.align(right, join='outer', copy=False) + left, right = left.align(right, join="outer", copy=False) new_index = left.index new_name = get_op_result_name(left, right) from pandas.core.arrays.sparse import _sparse_array_op + lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name) result = _sparse_array_op(lvalues, rvalues, op, name) return left._constructor(result, index=new_index, name=new_name) @@ -2335,36 +2403,40 @@ def _arith_method_SPARSE_ARRAY(cls, op, special): def wrapper(self, other): from pandas.core.arrays.sparse.array import ( - SparseArray, _sparse_array_op, _wrap_result, _get_fill) + SparseArray, + _sparse_array_op, + _wrap_result, + _get_fill, + ) + if isinstance(other, np.ndarray): if len(self) != len(other): - raise AssertionError("length mismatch: {self} vs. {other}" - .format(self=len(self), other=len(other))) + raise AssertionError( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) if not isinstance(other, SparseArray): - dtype = getattr(other, 'dtype', None) - other = SparseArray(other, fill_value=self.fill_value, - dtype=dtype) + dtype = getattr(other, "dtype", None) + other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) return _sparse_array_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill = op(_get_fill(self), np.asarray(other)) result = op(self.sp_values, other) return _wrap_result(op_name, result, self.sp_index, fill) else: # pragma: no cover - raise TypeError('operation with {other} not supported' - .format(other=type(other))) + raise TypeError( + "operation with {other} not supported".format(other=type(other)) + ) wrapper.__name__ = op_name return wrapper def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, - ufunc: Callable, - method: str, - *inputs: ArrayLike, - **kwargs: Any + self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any ): """ Dispatch a ufunc to the equivalent dunder method. 
@@ -2387,33 +2459,48 @@ def maybe_dispatch_ufunc_to_dunder_op( The result of applying the ufunc """ # special has the ufuncs we dispatch to the dunder op on - special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv', - 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder', - 'matmul'} + special = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + } aliases = { - 'subtract': 'sub', - 'multiply': 'mul', - 'floor_divide': 'floordiv', - 'true_divide': 'truediv', - 'power': 'pow', - 'remainder': 'mod', - 'divide': 'div', - 'equal': 'eq', - 'not_equal': 'ne', - 'less': 'lt', - 'less_equal': 'le', - 'greater': 'gt', - 'greater_equal': 'ge', + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", } # For op(., Array) -> Array.__r{op}__ flipped = { - 'lt': '__gt__', - 'le': '__ge__', - 'gt': '__lt__', - 'ge': '__le__', - 'eq': '__eq__', - 'ne': '__ne__', + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": "__eq__", + "ne": "__ne__", } op_name = ufunc.__name__ @@ -2422,13 +2509,12 @@ def maybe_dispatch_ufunc_to_dunder_op( def not_implemented(*args, **kwargs): return NotImplemented - if (method == '__call__' and op_name in special - and kwargs.get('out') is None): + if method == "__call__" and op_name in special and kwargs.get("out") is None: if isinstance(inputs[0], type(self)): - name = '__{}__'.format(op_name) + name = "__{}__".format(op_name) return getattr(self, name, not_implemented)(inputs[1]) else: - name = flipped.get(op_name, '__r{}__'.format(op_name)) + name = flipped.get(op_name, "__r{}__".format(op_name)) return getattr(self, name, not_implemented)(inputs[0]) else: return NotImplemented diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 632b5a9c5e0024..b4a3e6ed71bf41 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -21,8 +21,7 @@ from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import ( - GroupBy, _GroupBy, _pipe_template, groupby) +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range @@ -60,8 +59,16 @@ class Resampler(_GroupBy): """ # to the groupby descriptor - _attributes = ['freq', 'axis', 'closed', 'label', 'convention', - 'loffset', 'base', 'kind'] + _attributes = [ + "freq", + "axis", + "closed", + "label", + "convention", + "loffset", + "base", + "kind", + ] def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby = groupby @@ -83,11 +90,14 @@ def __str__(self): """ Provide a nice str repr of our rolling object. 
""" - attrs = ("{k}={v}".format(k=k, v=getattr(self.groupby, k)) - for k in self._attributes if - getattr(self.groupby, k, None) is not None) - return "{klass} [{attrs}]".format(klass=self.__class__.__name__, - attrs=', '.join(attrs)) + attrs = ( + "{k}={v}".format(k=k, v=getattr(self.groupby, k)) + for k in self._attributes + if getattr(self.groupby, k, None) is not None + ) + return "{klass} [{attrs}]".format( + klass=self.__class__.__name__, attrs=", ".join(attrs) + ) def __getattr__(self, attr): if attr in self._internal_names_set: @@ -129,8 +139,8 @@ def _typ(self): Masquerade for compat as a Series or a DataFrame. """ if isinstance(self._selected_obj, pd.Series): - return 'series' - return 'dataframe' + return "series" + return "dataframe" @property def _from_selection(self): @@ -139,9 +149,9 @@ def _from_selection(self): """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error - return (self.groupby is not None and - (self.groupby.key is not None or - self.groupby.level is not None)) + return self.groupby is not None and ( + self.groupby.key is not None or self.groupby.level is not None + ) def _convert_obj(self, obj): """ @@ -186,9 +196,10 @@ def _assure_grouper(self): """ self._set_binner() - @Substitution(klass='Resampler', - versionadded='.. versionadded:: 0.23.0', - examples=""" + @Substitution( + klass="Resampler", + versionadded=".. versionadded:: 0.23.0", + examples=""" >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, ... index=pd.date_range('2012-08-02', periods=4)) >>> df @@ -204,20 +215,24 @@ def _assure_grouper(self): >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) A 2012-08-02 1 - 2012-08-04 1""") + 2012-08-04 1""", + ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super().pipe(func, *args, **kwargs) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- DataFrame.groupby.aggregate DataFrame.resample.transform DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1,2,3,4,5], @@ -251,14 +266,17 @@ def pipe(self, func, *args, **kwargs): 2013-01-01 00:00:00 3 2.121320 2013-01-01 00:00:02 7 4.949747 2013-01-01 00:00:04 5 NaN - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): self._set_binner() @@ -266,10 +284,7 @@ def aggregate(self, func, *args, **kwargs): if result is None: how = func grouper = None - result = self._groupby_and_aggregate(how, - grouper, - *args, - **kwargs) + result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) result = self._apply_loffset(result) return result @@ -295,8 +310,7 @@ def transform(self, arg, *args, **kwargs): -------- >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) """ - return self._selected_obj.groupby(self.groupby).transform( - arg, *args, **kwargs) + return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs) def _downsample(self, f): raise AbstractMethodError(self) @@ -370,10 +384,9 @@ def _apply_loffset(self, result): """ needs_offset = ( - isinstance(self.loffset, (DateOffset, timedelta, - np.timedelta64)) and - isinstance(result.index, 
DatetimeIndex) and - len(result.index) > 0 + isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64)) + and isinstance(result.index, DatetimeIndex) + and len(result.index) > 0 ) if needs_offset: @@ -401,7 +414,7 @@ def _wrap_result(self, result): result.index = obj.index.asfreq(self.freq) else: result.index = obj.index._shallow_copy(freq=self.freq) - result.name = getattr(obj, 'name', None) + result.name = getattr(obj, "name", None) return result @@ -423,7 +436,8 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self._upsample('pad', limit=limit) + return self._upsample("pad", limit=limit) + ffill = pad def nearest(self, limit=None): @@ -486,7 +500,7 @@ def nearest(self, limit=None): 2018-01-01 01:00:00 2.0 Freq: 15T, dtype: float64 """ - return self._upsample('nearest', limit=limit) + return self._upsample("nearest", limit=limit) def backfill(self, limit=None): """ @@ -589,7 +603,8 @@ def backfill(self, limit=None): 2018-01-01 01:45:00 6.0 5.0 2018-01-01 02:00:00 6.0 5.0 """ - return self._upsample('backfill', limit=limit) + return self._upsample("backfill", limit=limit) + bfill = backfill def fillna(self, method, limit=None): @@ -752,21 +767,34 @@ def fillna(self, method, limit=None): """ return self._upsample(method, limit=limit) - @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs) - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, - downcast=None, **kwargs): + @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs + ): """ Interpolate values according to different methods. .. versionadded:: 0.18.1 """ result = self._upsample(None) - return result.interpolate(method=method, axis=axis, limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, **kwargs) + return result.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs + ) def asfreq(self, fill_value=None): """ @@ -790,7 +818,7 @@ def asfreq(self, fill_value=None): Series.asfreq DataFrame.asfreq """ - return self._upsample('asfreq', fill_value=fill_value) + return self._upsample("asfreq", fill_value=fill_value) def std(self, ddof=1, *args, **kwargs): """ @@ -806,8 +834,8 @@ def std(self, ddof=1, *args, **kwargs): DataFrame or Series Standard deviation of values within each group. """ - nv.validate_resampler_func('std', args, kwargs) - return self._downsample('std', ddof=ddof) + nv.validate_resampler_func("std", args, kwargs) + return self._downsample("std", ddof=ddof) def var(self, ddof=1, *args, **kwargs): """ @@ -823,16 +851,16 @@ def var(self, ddof=1, *args, **kwargs): DataFrame or Series Variance of values within each group. """ - nv.validate_resampler_func('var', args, kwargs) - return self._downsample('var', ddof=ddof) + nv.validate_resampler_func("var", args, kwargs) + return self._downsample("var", ddof=ddof) @Appender(GroupBy.size.__doc__) def size(self): # It's a special case as higher level does return # a copy of 0-len objects. 
GH14962 - result = self._downsample('size') + result = self._downsample("size") if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame): - result = pd.Series([], index=result.index, dtype='int64') + result = pd.Series([], index=result.index, dtype="int64") return result def quantile(self, q=0.5, **kwargs): @@ -856,40 +884,45 @@ def quantile(self, q=0.5, **kwargs): DataFrame.quantile DataFrameGroupBy.quantile """ - return self._downsample('quantile', q=q, **kwargs) + return self._downsample("quantile", q=q, **kwargs) # downsample methods -for method in ['sum', 'prod']: +for method in ["sum", "prod"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method, min_count=min_count) + f.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, f) # downsample methods -for method in ['min', 'max', 'first', 'last', 'mean', 'sem', - 'median', 'ohlc']: +for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method) + g.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, g) # groupby & aggregate methods -for method in ['count']: +for method in ["count"]: + def h(self, _method=method): return self._downsample(_method) + h.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, h) # series only methods -for method in ['nunique']: +for method in ["nunique"]: + def h(self, _method=method): return self._downsample(_method) + h.__doc__ = getattr(SeriesGroupBy, method).__doc__ setattr(Resampler, method, h) @@ -913,26 +946,30 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): # if we have both a how and fill_method, then show # the following warning if fill_method is None: - warnings.warn("how in .resample() is deprecated\n" - "the new syntax is " - ".resample(...).{method}".format( - method=method), - FutureWarning, stacklevel=3) + warnings.warn( + "how in .resample() is deprecated\n" + "the new syntax is " + ".resample(...).{method}".format(method=method), + FutureWarning, + stacklevel=3, + ) r = r.aggregate(how) if fill_method is not None: # show the prior function call - method = '.' + method if how is not None else '' + method = "." + method if how is not None else "" args = "limit={0}".format(limit) if limit is not None else "" - warnings.warn("fill_method is deprecated to .resample()\n" - "the new syntax is .resample(...){method}" - ".{fill_method}({args})".format( - method=method, - fill_method=fill_method, - args=args), - FutureWarning, stacklevel=3) + warnings.warn( + "fill_method is deprecated to .resample()\n" + "the new syntax is .resample(...){method}" + ".{fill_method}({args})".format( + method=method, fill_method=fill_method, args=args + ), + FutureWarning, + stacklevel=3, + ) if how is not None: r = getattr(r, fill_method)(limit=limit) @@ -946,10 +983,11 @@ class _GroupByMixin(GroupByMixin): """ Provide the groupby facilities. 
""" + def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop('parent', None) - groupby = kwargs.pop('groupby', None) + parent = kwargs.pop("parent", None) + groupby = kwargs.pop("groupby", None) if parent is None: parent = obj @@ -988,7 +1026,6 @@ def func(x): class DatetimeIndexResampler(Resampler): - @property def _resampler_for_grouping(self): return DatetimeIndexResamplerGroupby @@ -996,7 +1033,7 @@ def _resampler_for_grouping(self): def _get_binner_for_time(self): # this is how we are actually creating the bins - if self.kind == 'period': + if self.kind == "period": return self.groupby._get_time_period_bins(self.ax) return self.groupby._get_time_bins(self.ax) @@ -1030,8 +1067,7 @@ def _downsample(self, how, **kwargs): # we are downsampling # we want to call the actual grouper method here - result = obj.groupby( - self.grouper, axis=self.axis).aggregate(how, **kwargs) + result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs) result = self._apply_loffset(result) return self._wrap_result(result) @@ -1042,7 +1078,7 @@ def _adjust_binner_for_upsample(self, binner): The range of a new index should not be outside specified range """ - if self.closed == 'right': + if self.closed == "right": binner = binner[1:] else: binner = binner[:-1] @@ -1066,12 +1102,14 @@ def _upsample(self, method, limit=None, fill_value=None): """ self._set_binner() if self.axis: - raise AssertionError('axis must be 0') + raise AssertionError("axis must be 0") if self._from_selection: - raise ValueError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + raise ValueError( + "Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like" + ) ax = self.ax obj = self._selected_obj @@ -1083,8 +1121,9 @@ def _upsample(self, method, limit=None, fill_value=None): result = obj.copy() result.index = res_index else: - result = obj.reindex(res_index, method=method, - limit=limit, fill_value=fill_value) + result = obj.reindex( + res_index, method=method, limit=limit, fill_value=fill_value + ) result = self._apply_loffset(result) return self._wrap_result(result) @@ -1094,7 +1133,7 @@ def _wrap_result(self, result): # we may have a different kind that we were asked originally # convert if needed - if self.kind == 'period' and not isinstance(result.index, PeriodIndex): + if self.kind == "period" and not isinstance(result.index, PeriodIndex): result.index = result.index.to_period(self.freq) return result @@ -1105,19 +1144,19 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return DatetimeIndexResampler class PeriodIndexResampler(DatetimeIndexResampler): - @property def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby def _get_binner_for_time(self): - if self.kind == 'timestamp': + if self.kind == "timestamp": return super()._get_binner_for_time() return self.groupby._get_period_bins(self.ax) @@ -1126,18 +1165,20 @@ def _convert_obj(self, obj): if self._from_selection: # see GH 14008, GH 12871 - msg = ("Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index") + msg = ( + "Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index" + ) raise NotImplementedError(msg) if self.loffset is not None: # Cannot apply loffset/timedelta to PeriodIndex -> convert to # timestamps - self.kind = 'timestamp' + self.kind = "timestamp" # convert to timestamp - if self.kind == 'timestamp': + if self.kind == "timestamp": obj = obj.to_timestamp(how=self.convention) return obj @@ -1153,7 +1194,7 @@ def _downsample(self, how, **kwargs): """ # we may need to actually resample as if we are timestamps - if self.kind == 'timestamp': + if self.kind == "timestamp": return super()._downsample(how, **kwargs) how = self._is_cython_func(how) or how @@ -1161,10 +1202,9 @@ def _downsample(self, how, **kwargs): if is_subperiod(ax.freq, self.freq): # Downsampling - return self._groupby_and_aggregate(how, grouper=self.grouper, - **kwargs) + return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs) elif is_superperiod(ax.freq, self.freq): - if how == 'ohlc': + if how == "ohlc": # GH #13083 # upsampling to subperiods is handled as an asfreq, which works # for pure aggregating/reducing methods @@ -1176,8 +1216,9 @@ def _downsample(self, how, **kwargs): return self.asfreq() raise IncompatibleFrequency( - 'Frequency {} cannot be resampled to {}, as they are not ' - 'sub or super periods'.format(ax.freq, self.freq)) + "Frequency {} cannot be resampled to {}, as they are not " + "sub or super periods".format(ax.freq, self.freq) + ) def _upsample(self, method, limit=None, fill_value=None): """ @@ -1197,9 +1238,8 @@ def _upsample(self, method, limit=None, fill_value=None): """ # we may need to actually resample as if we are timestamps - if self.kind == 'timestamp': - return super()._upsample(method, limit=limit, - fill_value=fill_value) + if self.kind == "timestamp": + return super()._upsample(method, limit=limit, fill_value=fill_value) self._set_binner() ax = self.ax @@ -1211,8 +1251,9 @@ def _upsample(self, method, limit=None, fill_value=None): # Get the fill indexer indexer = memb.get_indexer(new_index, method=method, limit=limit) - return self._wrap_result(_take_new_index( - obj, indexer, new_index, axis=self.axis)) + return self._wrap_result( + _take_new_index(obj, indexer, new_index, axis=self.axis) + ) class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): @@ -1221,13 +1262,13 @@ class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return PeriodIndexResampler class TimedeltaIndexResampler(DatetimeIndexResampler): - @property def _resampler_for_grouping(self): return TimedeltaIndexResamplerGroupby @@ -1251,6 +1292,7 @@ class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return TimedeltaIndexResampler @@ -1267,22 +1309,20 @@ def resample(obj, kind=None, **kwds): resample.__doc__ = Resampler.__doc__ -def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, - limit=None, kind=None, **kwargs): +def get_resampler_for_grouping( + groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs +): """ Return our appropriate resampler when grouping as well. 
""" # .resample uses 'on' similar to how .groupby uses 'key' - kwargs['key'] = kwargs.pop('on', None) + kwargs["key"] = kwargs.pop("on", None) tg = TimeGrouper(freq=rule, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) r = resampler._get_resampler_for_grouping(groupby=groupby) - return _maybe_process_deprecations(r, - how=how, - fill_method=fill_method, - limit=limit) + return _maybe_process_deprecations(r, how=how, fill_method=fill_method, limit=limit) class TimeGrouper(Grouper): @@ -1297,45 +1337,61 @@ class TimeGrouper(Grouper): convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex """ - _attributes = Grouper._attributes + ('closed', 'label', 'how', - 'loffset', 'kind', 'convention', - 'base') - def __init__(self, freq='Min', closed=None, label=None, how='mean', - axis=0, fill_method=None, limit=None, loffset=None, - kind=None, convention=None, base=0, **kwargs): + _attributes = Grouper._attributes + ( + "closed", + "label", + "how", + "loffset", + "kind", + "convention", + "base", + ) + + def __init__( + self, + freq="Min", + closed=None, + label=None, + how="mean", + axis=0, + fill_method=None, + limit=None, + loffset=None, + kind=None, + convention=None, + base=0, + **kwargs + ): # Check for correctness of the keyword arguments which would # otherwise silently use the default if misspelled - if label not in {None, 'left', 'right'}: - raise ValueError('Unsupported value {} for `label`'.format(label)) - if closed not in {None, 'left', 'right'}: - raise ValueError('Unsupported value {} for `closed`'.format( - closed)) - if convention not in {None, 'start', 'end', 'e', 's'}: - raise ValueError('Unsupported value {} for `convention`' - .format(convention)) + if label not in {None, "left", "right"}: + raise ValueError("Unsupported value {} for `label`".format(label)) + if closed not in {None, "left", "right"}: + raise ValueError("Unsupported value {} for `closed`".format(closed)) + if convention not in {None, "start", "end", "e", "s"}: + raise ValueError("Unsupported value {} for `convention`".format(convention)) freq = to_offset(freq) - end_types = {'M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'} + end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code - if (rule in end_types or - ('-' in rule and rule[:rule.find('-')] in end_types)): + if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: - closed = 'right' + closed = "right" if label is None: - label = 'right' + label = "right" else: if closed is None: - closed = 'left' + closed = "left" if label is None: - label = 'left' + label = "left" self.closed = closed self.label = label self.kind = kind - self.convention = convention or 'E' + self.convention = convention or "E" self.convention = self.convention.lower() if isinstance(loffset, str): @@ -1348,7 +1404,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.base = base # always sort time groupers - kwargs['sort'] = True + kwargs["sort"] = True super().__init__(freq=freq, axis=axis, **kwargs) @@ -1375,23 +1431,17 @@ def _get_resampler(self, obj, kind=None): ax = self.ax if isinstance(ax, DatetimeIndex): - return DatetimeIndexResampler(obj, - groupby=self, - kind=kind, - axis=self.axis) - elif isinstance(ax, PeriodIndex) or kind == 'period': - return PeriodIndexResampler(obj, - groupby=self, - kind=kind, - axis=self.axis) + return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + elif isinstance(ax, PeriodIndex) or kind == "period": + return PeriodIndexResampler(obj, 
groupby=self, kind=kind, axis=self.axis) elif isinstance(ax, TimedeltaIndex): - return TimedeltaIndexResampler(obj, - groupby=self, - axis=self.axis) + return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis) - raise TypeError("Only valid with DatetimeIndex, " - "TimedeltaIndex or PeriodIndex, " - "but got an instance of %r" % type(ax).__name__) + raise TypeError( + "Only valid with DatetimeIndex, " + "TimedeltaIndex or PeriodIndex, " + "but got an instance of %r" % type(ax).__name__ + ) def _get_grouper(self, obj, validate=True): # create the resampler and return our binner @@ -1401,43 +1451,46 @@ def _get_grouper(self, obj, validate=True): def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): - raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a DatetimeIndex, but got " + "an instance of %r" % type(ax).__name__ + ) if len(ax) == 0: - binner = labels = DatetimeIndex( - data=[], freq=self.freq, name=ax.name) + binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_timestamp_range_edges(ax.min(), ax.max(), - self.freq, - closed=self.closed, - base=self.base) + first, last = _get_timestamp_range_edges( + ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base + ) # GH #12037 # use first/last directly instead of call replace() on them # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = date_range(freq=self.freq, - start=first, - end=last, - tz=ax.tz, - name=ax.name, - ambiguous='infer', - nonexistent='shift_forward') + binner = labels = date_range( + freq=self.freq, + start=first, + end=last, + tz=ax.tz, + name=ax.name, + ambiguous="infer", + nonexistent="shift_forward", + ) ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) # general version, knowing nothing about relative frequencies bins = lib.generate_bins_dt64( - ax_values, bin_edges, self.closed, hasnans=ax.hasnans) + ax_values, bin_edges, self.closed, hasnans=ax.hasnans + ) - if self.closed == 'right': + if self.closed == "right": labels = binner - if self.label == 'right': + if self.label == "right": labels = labels[1:] - elif self.label == 'right': + elif self.label == "right": labels = labels[1:] if ax.hasnans: @@ -1448,15 +1501,15 @@ def _get_time_bins(self, ax): # adjust the labels # GH4076 if len(bins) < len(labels): - labels = labels[:len(bins)] + labels = labels[: len(bins)] return binner, bins, labels def _adjust_bin_edges(self, binner, ax_values): # Some hacks for > daily data, see #1471, #1458, #1483 - if self.freq != 'D' and is_superperiod(self.freq, 'D'): - if self.closed == 'right': + if self.freq != "D" and is_superperiod(self.freq, "D"): + if self.closed == "right": # GH 21459, GH 9119: Adjust the bins relative to the wall time bin_edges = binner.tz_localize(None) bin_edges = bin_edges + timedelta(1) - Nano(1) @@ -1474,22 +1527,22 @@ def _adjust_bin_edges(self, binner, ax_values): def _get_time_delta_bins(self, ax): if not isinstance(ax, TimedeltaIndex): - raise TypeError('axis must be a TimedeltaIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a TimedeltaIndex, but got " + "an instance of %r" % type(ax).__name__ + ) if not len(ax): - binner = labels = TimedeltaIndex( - data=[], freq=self.freq, name=ax.name) + binner 
= labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = timedelta_range(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = timedelta_range( + start=start, end=end, freq=self.freq, name=ax.name + ) end_stamps = labels + self.freq - bins = ax.searchsorted(end_stamps, side='left') + bins = ax.searchsorted(end_stamps, side="left") # Addresses GH #10530 if self.base > 0: @@ -1499,8 +1552,10 @@ def _get_time_delta_bins(self, ax): def _get_time_period_bins(self, ax): if not isinstance(ax, DatetimeIndex): - raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a DatetimeIndex, but got " + "an instance of %r" % type(ax).__name__ + ) freq = self.freq @@ -1508,22 +1563,23 @@ def _get_time_period_bins(self, ax): binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name) return binner, [], labels - labels = binner = pd.period_range(start=ax[0], - end=ax[-1], - freq=freq, - name=ax.name) + labels = binner = pd.period_range( + start=ax[0], end=ax[-1], freq=freq, name=ax.name + ) - end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp() + end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() if ax.tzinfo: end_stamps = end_stamps.tz_localize(ax.tzinfo) - bins = ax.searchsorted(end_stamps, side='left') + bins = ax.searchsorted(end_stamps, side="left") return binner, bins, labels def _get_period_bins(self, ax): if not isinstance(ax, PeriodIndex): - raise TypeError('axis must be a PeriodIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a PeriodIndex, but got " + "an instance of %r" % type(ax).__name__ + ) memb = ax.asfreq(self.freq, how=self.convention) @@ -1535,33 +1591,30 @@ def _get_period_bins(self, ax): # if index contains no valid (non-NaT) values, return empty index if not len(memb): - binner = labels = PeriodIndex( - data=[], freq=self.freq, name=ax.name) + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels freq_mult = self.freq.n start = ax.min().asfreq(self.freq, how=self.convention) - end = ax.max().asfreq(self.freq, how='end') + end = ax.max().asfreq(self.freq, how="end") bin_shift = 0 # GH 23882 if self.base: # get base adjusted bin edge labels - p_start, end = _get_period_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base) + p_start, end = _get_period_range_edges( + start, end, self.freq, closed=self.closed, base=self.base + ) # Get offset for bin edge (not label edge) adjustment - start_offset = (pd.Period(start, self.freq) - - pd.Period(p_start, self.freq)) + start_offset = pd.Period(start, self.freq) - pd.Period(p_start, self.freq) bin_shift = start_offset.n % freq_mult start = p_start - labels = binner = pd.period_range(start=start, end=end, - freq=self.freq, name=ax.name) + labels = binner = pd.period_range( + start=start, end=end, freq=self.freq, name=ax.name + ) i8 = memb.asi8 @@ -1572,7 +1625,7 @@ def _get_period_bins(self, ax): rng += freq_mult # adjust bin edge indexes to account for base rng -= bin_shift - bins = memb.searchsorted(rng, side='left') + bins = memb.searchsorted(rng, side="left") if nat_count > 0: # NaT handling as in pandas._lib.lib.generate_bins_dt64() @@ -1594,13 +1647,14 @@ def _take_new_index(obj, indexer, new_index, axis=0): elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") - return 
DataFrame(obj._data.reindex_indexer( - new_axis=new_index, indexer=indexer, axis=1)) + return DataFrame( + obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) + ) else: raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): +def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following @@ -1634,8 +1688,9 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): first = first.tz_localize(None) last = last.tz_localize(None) - first, last = _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) + first, last = _adjust_dates_anchored( + first, last, offset, closed=closed, base=base + ) if isinstance(offset, Day): first = first.tz_localize(tz) last = last.tz_localize(tz) @@ -1645,7 +1700,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): first = first.normalize() last = last.normalize() - if closed == 'left': + if closed == "left": first = Timestamp(offset.rollback(first)) else: first = Timestamp(first - offset) @@ -1655,7 +1710,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): return first, last -def _get_period_range_edges(first, last, offset, closed='left', base=0): +def _get_period_range_edges(first, last, offset, closed="left", base=0): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them. @@ -1686,15 +1741,16 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0): adjust_first = not offset.onOffset(first) adjust_last = offset.onOffset(last) - first, last = _get_timestamp_range_edges(first, last, offset, - closed=closed, base=base) + first, last = _get_timestamp_range_edges( + first, last, offset, closed=closed, base=base + ) first = (first + adjust_first * offset).to_period(offset) last = (last - adjust_last * offset).to_period(offset) return first, last -def _adjust_dates_anchored(first, last, offset, closed='right', base=0): +def _adjust_dates_anchored(first, last, offset, closed="right", base=0): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. 
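The edge helpers touched in this file (_get_timestamp_range_edges, _get_period_range_edges, _adjust_dates_anchored) decide where resample buckets start and stop for a given closed= and base=. A minimal sketch of that behaviour through the public resample API, assuming the 0.25-era base= keyword; the data and bucket values are illustrative only, not taken from the patch:

    import numpy as np
    import pandas as pd

    # Nine observations, one per minute, starting on the hour.
    idx = pd.date_range("2019-01-01 00:00", periods=9, freq="T")
    s = pd.Series(np.arange(9), index=idx)

    # Default closed="left" for minute frequencies: bins [00:00, 00:05), [00:05, 00:10).
    print(s.resample("5T").sum())

    # closed="right": the edges become (23:55, 00:00], (00:00, 00:05], ...
    print(s.resample("5T", closed="right").sum())

    # base=2 shifts every edge by two minutes: [23:57, 00:02), [00:02, 00:07), ...
    print(s.resample("5T", base=2).sum())
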
@@ -1708,9 +1764,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): last_tzinfo = last.tzinfo start_day_nanos = first.normalize().value if first_tzinfo is not None: - first = first.tz_convert('UTC') + first = first.tz_convert("UTC") if last_tzinfo is not None: - last = last.tz_convert('UTC') + last = last.tz_convert("UTC") base_nanos = (base % offset.n) * offset.nanos // offset.n start_day_nanos += base_nanos @@ -1718,7 +1774,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): foffset = (first.value - start_day_nanos) % offset.nanos loffset = (last.value - start_day_nanos) % offset.nanos - if closed == 'right': + if closed == "right": if foffset > 0: # roll back fresult = first.value - foffset @@ -1746,9 +1802,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): fresult = Timestamp(fresult) lresult = Timestamp(lresult) if first_tzinfo is not None: - fresult = fresult.tz_localize('UTC').tz_convert(first_tzinfo) + fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) if last_tzinfo is not None: - lresult = lresult.tz_localize('UTC').tz_convert(last_tzinfo) + lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) return fresult, lresult @@ -1761,7 +1817,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): raise NotImplementedError("'method' argument is not supported") if how is None: - how = 'E' + how = "E" new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d4272cf6e406d3..5a476dceca1f3f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -11,11 +11,16 @@ from pandas import DataFrame, Index, MultiIndex, Series from pandas.core import common as com from pandas.core.arrays.categorical import ( - _factorize_from_iterable, _factorize_from_iterables) + _factorize_from_iterable, + _factorize_from_iterables, +) from pandas.core.generic import NDFrame from pandas.core.index import ( - _all_indexes_same, _get_consensus_names, _get_objs_combined_axis, - ensure_index) + _all_indexes_same, + _get_consensus_names, + _get_objs_combined_axis, + ensure_index, +) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers @@ -23,9 +28,19 @@ # Concatenate DataFrame objects -def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, - sort=None, copy=True): +def concat( + objs, + axis=0, + join="outer", + join_axes=None, + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=None, + copy=True, +): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -226,10 +241,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ... 
ValueError: Indexes have overlapping values: ['a'] """ - op = _Concatenator(objs, axis=axis, ignore_index=ignore_index, join=join, - join_axes=join_axes, keys=keys, levels=levels, - names=names, verify_integrity=verify_integrity, - copy=copy, sort=sort) + op = _Concatenator( + objs, + axis=axis, + ignore_index=ignore_index, + join=join, + join_axes=join_axes, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + copy=copy, + sort=sort, + ) return op.get_result() @@ -239,21 +263,35 @@ class _Concatenator: Orchestrates a concatenation operation for BlockManagers """ - def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, - levels=None, names=None, ignore_index=False, - verify_integrity=False, copy=True, sort=False): + def __init__( + self, + objs, + axis=0, + join="outer", + join_axes=None, + keys=None, + levels=None, + names=None, + ignore_index=False, + verify_integrity=False, + copy=True, + sort=False, + ): if isinstance(objs, (NDFrame, str)): - raise TypeError('first argument must be an iterable of pandas ' - 'objects, you passed an object of type ' - '"{name}"'.format(name=type(objs).__name__)) + raise TypeError( + "first argument must be an iterable of pandas " + "objects, you passed an object of type " + '"{name}"'.format(name=type(objs).__name__) + ) - if join == 'outer': + if join == "outer": self.intersect = False - elif join == 'inner': + elif join == "inner": self.intersect = True else: # pragma: no cover - raise ValueError('Only can inner (intersect) or outer (union) ' - 'join the other axis') + raise ValueError( + "Only can inner (intersect) or outer (union) " "join the other axis" + ) if isinstance(objs, dict): if keys is None: @@ -263,7 +301,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, objs = list(objs) if len(objs) == 0: - raise ValueError('No objects to concatenate') + raise ValueError("No objects to concatenate") if keys is None: objs = list(com._not_none(*objs)) @@ -277,19 +315,20 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, clean_keys.append(k) clean_objs.append(v) objs = clean_objs - name = getattr(keys, 'name', None) + name = getattr(keys, "name", None) keys = Index(clean_keys, name=name) if len(objs) == 0: - raise ValueError('All objects passed were None') + raise ValueError("All objects passed were None") # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: if not isinstance(obj, (Series, DataFrame)): - msg = ("cannot concatenate object of type '{}';" - ' only Series and DataFrame objs are valid' - .format(type(obj))) + msg = ( + "cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid".format(type(obj)) + ) raise TypeError(msg) # consolidate @@ -310,11 +349,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, else: # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name - non_empties = [obj for obj in objs - if sum(obj.shape) > 0 or isinstance(obj, Series)] + non_empties = [ + obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series) + ] - if (len(non_empties) and (keys is None and names is None and - levels is None and not self.intersect)): + if len(non_empties) and ( + keys is None and names is None and levels is None and not self.intersect + ): objs = non_empties sample = objs[0] @@ -335,8 +376,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, 
keys=None, self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: - raise AssertionError("axis must be between 0 and {ndim}, input was" - " {axis}".format(ndim=sample.ndim, axis=axis)) + raise AssertionError( + "axis must be between 0 and {ndim}, input was" + " {axis}".format(ndim=sample.ndim, axis=axis) + ) # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed @@ -351,11 +394,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, pass elif ndim != max_ndim - 1: - raise ValueError("cannot concatenate unaligned mixed " - "dimensional NDFrame objects") + raise ValueError( + "cannot concatenate unaligned mixed " + "dimensional NDFrame objects" + ) else: - name = getattr(obj, 'name', None) + name = getattr(obj, "name", None) if ignore_index or name is None: name = current_column current_column += 1 @@ -372,7 +417,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, self.axis = axis self.join_axes = join_axes self.keys = keys - self.names = names or getattr(keys, 'names', None) + self.names = names or getattr(keys, "names", None) self.levels = levels self.sort = sort @@ -391,10 +436,11 @@ def get_result(self): if self.axis == 0: name = com.consensus_name_attr(self.objs) - mgr = self.objs[0]._data.concat([x._data for x in self.objs], - self.new_axes) + mgr = self.objs[0]._data.concat( + [x._data for x in self.objs], self.new_axes + ) cons = _concat._get_series_result_type(mgr, self.objs) - return cons(mgr, name=name).__finalize__(self, method='concat') + return cons(mgr, name=name).__finalize__(self, method="concat") # combine as columns in a frame else: @@ -404,7 +450,7 @@ def get_result(self): index, columns = self.new_axes df = cons(data, index=index) df.columns = columns - return df.__finalize__(self, method='concat') + return df.__finalize__(self, method="concat") # combine block managers else: @@ -424,14 +470,15 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, - copy=self.copy) + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy + ) if not self.copy: new_data._consolidate_inplace() cons = _concat._get_frame_result_type(new_data, self.objs) - return (cons._from_axes(new_data, self.new_axes) - .__finalize__(self, method='concat')) + return cons._from_axes(new_data, self.new_axes).__finalize__( + self, method="concat" + ) def _get_result_dim(self): if self._is_series and self.axis == 1: @@ -452,13 +499,18 @@ def _get_new_axes(self): else: # GH 21951 warnings.warn( - 'The join_axes-keyword is deprecated. Use .reindex or ' - '.reindex_like on the result to achieve the same ' - 'functionality.', FutureWarning, stacklevel=4) + "The join_axes-keyword is deprecated. Use .reindex or " + ".reindex_like on the result to achieve the same " + "functionality.", + FutureWarning, + stacklevel=4, + ) if len(self.join_axes) != ndim - 1: - raise AssertionError("length of join_axes must be equal " - "to {length}".format(length=ndim - 1)) + raise AssertionError( + "length of join_axes must be equal " + "to {length}".format(length=ndim - 1) + ) # ufff... 
indices = list(range(ndim)) @@ -473,13 +525,12 @@ def _get_new_axes(self): def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: - return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect, - sort=self.sort) + return _get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) except IndexError: types = [type(x).__name__ for x in self.objs] - raise TypeError("Cannot concatenate list of {types}" - .format(types=types)) + raise TypeError("Cannot concatenate list of {types}".format(types=types)) def _get_concat_axis(self): """ @@ -497,9 +548,10 @@ def _get_concat_axis(self): has_names = False for i, x in enumerate(self.objs): if not isinstance(x, Series): - raise TypeError("Cannot concatenate type 'Series' " - "with object of type {type!r}" - .format(type=type(x).__name__)) + raise TypeError( + "Cannot concatenate type 'Series' " + "with object of type {type!r}".format(type=type(x).__name__) + ) if x.name is not None: names[i] = x.name has_names = True @@ -522,8 +574,9 @@ def _get_concat_axis(self): if self.keys is None: concat_axis = _concat_indexes(indexes) else: - concat_axis = _make_concat_multiindex(indexes, self.keys, - self.levels, self.names) + concat_axis = _make_concat_multiindex( + indexes, self.keys, self.levels, self.names + ) self._maybe_check_integrity(concat_axis) @@ -533,8 +586,10 @@ def _maybe_check_integrity(self, concat_index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() - raise ValueError('Indexes have overlapping values: ' - '{overlap!s}'.format(overlap=overlap)) + raise ValueError( + "Indexes have overlapping values: " + "{overlap!s}".format(overlap=overlap) + ) def _concat_indexes(indexes): @@ -543,8 +598,9 @@ def _concat_indexes(indexes): def _make_concat_multiindex(indexes, keys, levels=None, names=None): - if ((levels is None and isinstance(keys[0], tuple)) or - (levels is not None and len(levels) > 1)): + if (levels is None and isinstance(keys[0], tuple)) or ( + levels is not None and len(levels) > 1 + ): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) @@ -575,8 +631,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): try: i = level.get_loc(key) except KeyError: - raise ValueError('Key {key!s} not in level {level!s}' - .format(key=key, level=level)) + raise ValueError( + "Key {key!s} not in level {level!s}".format( + key=key, level=level + ) + ) to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) @@ -597,14 +656,17 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: - raise AssertionError("Cannot concat indices that do" - " not have the same number of levels") + raise AssertionError( + "Cannot concat indices that do" + " not have the same number of levels" + ) # also copies names = names + _get_consensus_names(indexes) - return MultiIndex(levels=levels, codes=codes_list, names=names, - verify_integrity=False) + return MultiIndex( + levels=levels, codes=codes_list, names=names, verify_integrity=False + ) new_index = indexes[0] n = len(new_index) @@ -625,8 +687,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): mask = mapped == -1 if mask.any(): - raise ValueError('Values not found in passed level: {hlevel!s}' - .format(hlevel=hlevel[mask])) + raise 
ValueError( + "Values not found in passed level: {hlevel!s}".format( + hlevel=hlevel[mask] + ) + ) new_codes.append(np.repeat(mapped, n)) @@ -640,5 +705,6 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): if len(new_names) < len(new_levels): new_names.extend(new_index.names) - return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, - verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index d655a8be13de71..9a69942a70e017 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -15,12 +15,18 @@ from pandas.core.tools.numeric import to_numeric -@Appender(_shared_docs['melt'] % - dict(caller='pd.melt(df, ', - versionadded="", - other='DataFrame.melt')) -def melt(frame, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): +@Appender( + _shared_docs["melt"] + % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") +) +def melt( + frame, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, +): # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -31,36 +37,42 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] - elif (isinstance(frame.columns, ABCMultiIndex) and - not isinstance(id_vars, list)): - raise ValueError('id_vars must be a list of tuples when columns' - ' are a MultiIndex') + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list): + raise ValueError( + "id_vars must be a list of tuples when columns" " are a MultiIndex" + ) else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(np.ravel(id_vars)).difference(cols) if not missing.empty: - raise KeyError("The following 'id_vars' are not present" - " in the DataFrame: {missing}" - "".format(missing=list(missing))) + raise KeyError( + "The following 'id_vars' are not present" + " in the DataFrame: {missing}" + "".format(missing=list(missing)) + ) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] - elif (isinstance(frame.columns, ABCMultiIndex) and - not isinstance(value_vars, list)): - raise ValueError('value_vars must be a list of tuples when' - ' columns are a MultiIndex') + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance( + value_vars, list + ): + raise ValueError( + "value_vars must be a list of tuples when" " columns are a MultiIndex" + ) else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(np.ravel(value_vars)).difference(cols) if not missing.empty: - raise KeyError("The following 'value_vars' are not present in" - " the DataFrame: {missing}" - "".format(missing=list(missing))) + raise KeyError( + "The following 'value_vars' are not present in" + " the DataFrame: {missing}" + "".format(missing=list(missing)) + ) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() @@ -74,11 +86,13 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: - var_name = ['variable_{i}'.format(i=i) - for i in range(len(frame.columns.names))] + var_name = [ + "variable_{i}".format(i=i) for i in 
range(len(frame.columns.names)) + ] else: - var_name = [frame.columns.name if frame.columns.name is not None - else 'variable'] + var_name = [ + frame.columns.name if frame.columns.name is not None else "variable" + ] if isinstance(var_name, str): var_name = [var_name] @@ -96,11 +110,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mcolumns = id_vars + var_name + [value_name] - mdata[value_name] = frame.values.ravel('F') + mdata[value_name] = frame.values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index - mdata[col] = np.asanyarray(frame.columns - ._get_level_values(i)).repeat(N) + mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns) @@ -150,7 +163,7 @@ def lreshape(data, groups, dropna=True, label=None): for seq in values: if len(seq) != K: - raise ValueError('All column lists must be same length') + raise ValueError("All column lists must be same length") mdata = {} pivot_cols = [] @@ -159,6 +172,7 @@ def lreshape(data, groups, dropna=True, label=None): to_concat = [data[col].values for col in names] import pandas.core.dtypes.concat as _concat + mdata[target] = _concat._concat_compat(to_concat) pivot_cols.append(target) @@ -175,7 +189,7 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): +def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): r""" Wide panel to long format. Less flexible but more user-friendly than melt. @@ -403,20 +417,27 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): 3 one 2.1 two 2.9 """ + def get_var_names(df, stub, sep, suffix): - regex = r'^{stub}{sep}{suffix}$'.format( - stub=re.escape(stub), sep=re.escape(sep), suffix=suffix) + regex = r"^{stub}{sep}{suffix}$".format( + stub=re.escape(stub), sep=re.escape(sep), suffix=suffix + ) pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] def melt_stub(df, stub, i, j, value_vars, sep): - newdf = melt(df, id_vars=i, value_vars=value_vars, - value_name=stub.rstrip(sep), var_name=j) + newdf = melt( + df, + id_vars=i, + value_vars=value_vars, + value_name=stub.rstrip(sep), + var_name=j, + ) newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") # GH17627 Cast numerics suffixes to int/float - newdf[j] = to_numeric(newdf[j], errors='ignore') + newdf[j] = to_numeric(newdf[j], errors="ignore") return newdf.set_index(i + [j]) @@ -441,9 +462,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [melt_stub(df, s, i, j, v, sep) - for s, v in zip(stubnames, value_vars)] - melted = melted[0].join(melted[1:], how='outer') + melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] + melted = melted[0].join(melted[1:], how="outer") if len(i) == 1: new = df[id_vars].set_index(i).join(melted) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 549c69486ebfa0..4f910f6a278ad8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -15,12 +15,28 @@ from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, - is_bool_dtype, is_categorical_dtype, 
is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, - is_integer_dtype, is_list_like, is_number, is_numeric_dtype, - is_object_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_object, + is_array_like, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -33,26 +49,46 @@ from pandas.core.sorting import is_int64_overflow_possible -@Substitution('\nleft : DataFrame') +@Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) -def merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): - op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator, - validate=validate) +def merge( + left, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, +): + op = _MergeOperation( + left, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) return op.get_result() if __debug__: - merge.__doc__ = _merge_doc % '\nleft : DataFrame' + merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge(by, on, left, right, _merge_pieces, - check_duplicates=True): +def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): """ groupby & merge; we are always performing a left-by type operation @@ -85,7 +121,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, on = [on] if right.duplicated(by + on).any(): - right = right.drop_duplicates(by + on, keep='last') + right = right.drop_duplicates(by + on, keep="last") rby = right.groupby(by, sort=False) except KeyError: rby = None @@ -100,8 +136,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, except KeyError: # key doesn't exist in left lcols = lhs.columns.tolist() - cols = lcols + [r for r in right.columns - if r not in set(lcols)] + cols = lcols + [r for r in right.columns if r not in set(lcols)] merged = lhs.reindex(columns=cols) merged.index = range(len(merged)) pieces.append(merged) @@ -123,16 +158,24 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, # preserve the original order # if we have a missing piece this can be reset from pandas.core.reshape.concat import concat + result = concat(pieces, ignore_index=True) result = result.reindex(columns=pieces[0].columns, copy=False) return result, lby -def merge_ordered(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y'), - how='outer'): +def merge_ordered( + left, + right, + on=None, + left_on=None, + right_on=None, + left_by=None, + right_by=None, + 
fill_method=None, + suffixes=("_x", "_y"), + how="outer", +): """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see @@ -211,36 +254,57 @@ def merge_ordered(left, right, on=None, 8 b d 2 3.0 9 b e 3 3.0 """ + def _merger(x, y): # perform the ordered merge operation - op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, - suffixes=suffixes, fill_method=fill_method, - how=how) + op = _OrderedMerge( + x, + y, + on=on, + left_on=left_on, + right_on=right_on, + suffixes=suffixes, + fill_method=fill_method, + how=how, + ) return op.get_result() if left_by is not None and right_by is not None: - raise ValueError('Can only group either left or right frames') + raise ValueError("Can only group either left or right frames") elif left_by is not None: - result, _ = _groupby_and_merge(left_by, on, left, right, - lambda x, y: _merger(x, y), - check_duplicates=False) + result, _ = _groupby_and_merge( + left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False + ) elif right_by is not None: - result, _ = _groupby_and_merge(right_by, on, right, left, - lambda x, y: _merger(y, x), - check_duplicates=False) + result, _ = _groupby_and_merge( + right_by, + on, + right, + left, + lambda x, y: _merger(y, x), + check_duplicates=False, + ) else: result = _merger(left, right) return result -def merge_asof(left, right, on=None, - left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - suffixes=('_x', '_y'), - tolerance=None, - allow_exact_matches=True, - direction='backward'): +def merge_asof( + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + by=None, + left_by=None, + right_by=None, + suffixes=("_x", "_y"), + tolerance=None, + allow_exact_matches=True, + direction="backward", +): """ Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. 
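merge_asof, whose signature is reformatted above, matches each left row to the nearest right key rather than an equal one. A minimal sketch with toy frames (both inputs must be sorted on the key; values are illustrative only):

    import pandas as pd

    trades = pd.DataFrame(
        {"time": pd.to_datetime(["10:00:01", "10:00:03", "10:00:07"]),
         "qty": [100, 200, 300]}
    )
    quotes = pd.DataFrame(
        {"time": pd.to_datetime(["10:00:00", "10:00:02", "10:00:04"]),
         "price": [1.0, 1.1, 1.2]}
    )

    # direction="backward" (the default): each trade takes the most recent quote
    # at or before its own timestamp.
    print(pd.merge_asof(trades, quotes, on="time"))

    # tolerance bounds how far back a match may reach; the 10:00:07 trade is more
    # than two seconds past the last quote, so its price becomes NaN.
    print(pd.merge_asof(trades, quotes, on="time", tolerance=pd.Timedelta("2s")))
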
@@ -458,14 +522,23 @@ def merge_asof(left, right, on=None, 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN """ - op = _AsOfMerge(left, right, - on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, - by=by, left_by=left_by, right_by=right_by, - suffixes=suffixes, - how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches, - direction=direction) + op = _AsOfMerge( + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + by=by, + left_by=left_by, + right_by=right_by, + suffixes=suffixes, + how="asof", + tolerance=tolerance, + allow_exact_matches=allow_exact_matches, + direction=direction, + ) return op.get_result() @@ -476,13 +549,26 @@ class _MergeOperation: Perform a database (SQL) merge operation between two DataFrame objects using either columns as keys or their row indexes """ - _merge_type = 'merge' - def __init__(self, left, right, how='inner', on=None, - left_on=None, right_on=None, axis=1, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): + _merge_type = "merge" + + def __init__( + self, + left, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + axis=1, + left_index=False, + right_index=False, + sort=True, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ): left = validate_operand(left) right = validate_operand(right) self.left = self.orig_left = left @@ -506,34 +592,39 @@ def __init__(self, left, right, how='inner', on=None, if isinstance(self.indicator, str): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): - self.indicator_name = '_merge' if self.indicator else None + self.indicator_name = "_merge" if self.indicator else None else: raise ValueError( - 'indicator option can only accept boolean or string arguments') + "indicator option can only accept boolean or string arguments" + ) if not is_bool(left_index): raise ValueError( - 'left_index parameter must be of type bool, not ' - '{left_index}'.format(left_index=type(left_index))) + "left_index parameter must be of type bool, not " + "{left_index}".format(left_index=type(left_index)) + ) if not is_bool(right_index): raise ValueError( - 'right_index parameter must be of type bool, not ' - '{right_index}'.format(right_index=type(right_index))) + "right_index parameter must be of type bool, not " + "{right_index}".format(right_index=type(right_index)) + ) # warn user when merging between different levels if left.columns.nlevels != right.columns.nlevels: - msg = ('merging between different levels can give an unintended ' - 'result ({left} levels on the left, {right} on the right)' - ).format(left=left.columns.nlevels, - right=right.columns.nlevels) + msg = ( + "merging between different levels can give an unintended " + "result ({left} levels on the left, {right} on the right)" + ).format(left=left.columns.nlevels, right=right.columns.nlevels) warnings.warn(msg, UserWarning) self._validate_specification() # note this function has side effects - (self.left_join_keys, - self.right_join_keys, - self.join_names) = self._get_merge_keys() + ( + self.left_join_keys, + self.right_join_keys, + self.join_names, + ) = self._get_merge_keys() # validate the merge keys dtypes. 
We may need to coerce # to avoid incompat dtypes @@ -547,16 +638,16 @@ def __init__(self, left, right, how='inner', on=None, def get_result(self): if self.indicator: - self.left, self.right = self._indicator_pre_merge( - self.left, self.right) + self.left, self.right = self._indicator_pre_merge(self.left, self.right) join_index, left_indexer, right_indexer = self._get_join_info() ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} @@ -564,7 +655,9 @@ def get_result(self): result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, + copy=self.copy, + ) typ = self.left._constructor result = typ(result_data).__finalize__(self, method=self._merge_type) @@ -582,40 +675,42 @@ def _indicator_pre_merge(self, left, right): columns = left.columns.union(right.columns) - for i in ['_left_indicator', '_right_indicator']: + for i in ["_left_indicator", "_right_indicator"]: if i in columns: - raise ValueError("Cannot use `indicator=True` option when " - "data contains a column named {name}" - .format(name=i)) + raise ValueError( + "Cannot use `indicator=True` option when " + "data contains a column named {name}".format(name=i) + ) if self.indicator_name in columns: raise ValueError( - "Cannot use name of an existing column for indicator column") + "Cannot use name of an existing column for indicator column" + ) left = left.copy() right = right.copy() - left['_left_indicator'] = 1 - left['_left_indicator'] = left['_left_indicator'].astype('int8') + left["_left_indicator"] = 1 + left["_left_indicator"] = left["_left_indicator"].astype("int8") - right['_right_indicator'] = 2 - right['_right_indicator'] = right['_right_indicator'].astype('int8') + right["_right_indicator"] = 2 + right["_right_indicator"] = right["_right_indicator"].astype("int8") return left, right def _indicator_post_merge(self, result): - result['_left_indicator'] = result['_left_indicator'].fillna(0) - result['_right_indicator'] = result['_right_indicator'].fillna(0) + result["_left_indicator"] = result["_left_indicator"].fillna(0) + result["_right_indicator"] = result["_right_indicator"].fillna(0) - result[self.indicator_name] = Categorical((result['_left_indicator'] + - result['_right_indicator']), - categories=[1, 2, 3]) - result[self.indicator_name] = ( - result[self.indicator_name] - .cat.rename_categories(['left_only', 'right_only', 'both'])) + result[self.indicator_name] = Categorical( + (result["_left_indicator"] + result["_right_indicator"]), + categories=[1, 2, 3], + ) + result[self.indicator_name] = result[self.indicator_name].cat.rename_categories( + ["left_only", "right_only", "both"] + ) - result = result.drop(labels=['_left_indicator', '_right_indicator'], - axis=1) + result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) return result def _maybe_restore_index_levels(self, result): @@ -639,12 +734,14 @@ def _maybe_restore_index_levels(self, result): None """ names_to_restore = [] - for name, left_key, right_key in zip(self.join_names, - self.left_on, - self.right_on): - if (self.orig_left._is_level_reference(left_key) and - 
self.orig_right._is_level_reference(right_key) and - name not in result.index.names): + for name, left_key, right_key in zip( + self.join_names, self.left_on, self.right_on + ): + if ( + self.orig_left._is_level_reference(left_key) + and self.orig_right._is_level_reference(right_key) + and name not in result.index.names + ): names_to_restore.append(name) @@ -674,8 +771,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_has_missing: take_right = self.right_join_keys[i] - if not is_dtype_equal(result[name].dtype, - self.left[name].dtype): + if not is_dtype_equal( + result[name].dtype, self.left[name].dtype + ): take_left = self.left[name]._values elif name in self.right: @@ -686,12 +784,12 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if right_has_missing: take_left = self.left_join_keys[i] - if not is_dtype_equal(result[name].dtype, - self.right[name].dtype): + if not is_dtype_equal( + result[name].dtype, self.right[name].dtype + ): take_right = self.right[name]._values - elif left_indexer is not None \ - and is_array_like(self.left_join_keys[i]): + elif left_indexer is not None and is_array_like(self.left_join_keys[i]): take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] @@ -701,15 +799,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): lvals = result[name]._values else: lfill = na_value_for_dtype(take_left.dtype) - lvals = algos.take_1d(take_left, left_indexer, - fill_value=lfill) + lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) if take_right is None: rvals = result[name]._values else: rfill = na_value_for_dtype(take_right.dtype) - rvals = algos.take_1d(take_right, right_indexer, - fill_value=rfill) + rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer # make sure to just use the right values @@ -724,61 +820,66 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): elif result._is_level_reference(name): if isinstance(result.index, MultiIndex): key_col.name = name - idx_list = [result.index.get_level_values(level_name) - if level_name != name else key_col - for level_name in result.index.names] + idx_list = [ + result.index.get_level_values(level_name) + if level_name != name + else key_col + for level_name in result.index.names + ] result.set_index(idx_list, inplace=True) else: result.index = Index(key_col, name=name) else: - result.insert(i, name or 'key_{i}'.format(i=i), key_col) + result.insert(i, name or "key_{i}".format(i=i), key_col) def _get_join_indexers(self): """ return the join indexers """ - return _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, - how=self.how) + return _get_join_indexers( + self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how + ) def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] - if self.left_index and self.right_index and self.how != 'asof': - join_index, left_indexer, right_indexer = \ - left_ax.join(right_ax, how=self.how, return_indexers=True, - sort=self.sort) - elif self.right_index and self.how == 'left': - join_index, left_indexer, right_indexer = \ - _left_join_on_index(left_ax, right_ax, self.left_join_keys, - sort=self.sort) - - elif self.left_index and self.how == 'right': - join_index, right_indexer, left_indexer = \ - _left_join_on_index(right_ax, left_ax, self.right_join_keys, - sort=self.sort) + if self.left_index and self.right_index and 
self.how != "asof": + join_index, left_indexer, right_indexer = left_ax.join( + right_ax, how=self.how, return_indexers=True, sort=self.sort + ) + elif self.right_index and self.how == "left": + join_index, left_indexer, right_indexer = _left_join_on_index( + left_ax, right_ax, self.left_join_keys, sort=self.sort + ) + + elif self.left_index and self.how == "right": + join_index, right_indexer, left_indexer = _left_join_on_index( + right_ax, left_ax, self.right_join_keys, sort=self.sort + ) else: - (left_indexer, - right_indexer) = self._get_join_indexers() + (left_indexer, right_indexer) = self._get_join_indexers() if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(self.left.index, - self.right.index, - left_indexer, - right_indexer, - how='right') + join_index = self._create_join_index( + self.left.index, + self.right.index, + left_indexer, + right_indexer, + how="right", + ) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(self.right.index, - self.left.index, - right_indexer, - left_indexer, - how='left') + join_index = self._create_join_index( + self.right.index, + self.left.index, + right_indexer, + left_indexer, + how="left", + ) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -789,8 +890,9 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, index, other_index, indexer, - other_indexer, how='left'): + def _create_join_index( + self, index, other_index, indexer, other_indexer, how="left" + ): """ Create a join index by rearranging one index to match another @@ -805,8 +907,7 @@ def _create_join_index(self, index, other_index, indexer, ------- join_index """ - if (self.how in (how, 'outer') and - not isinstance(other_index, MultiIndex)): + if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last @@ -863,8 +964,7 @@ def _get_merge_keys(self): join_names.append(None) # what to do? 
else: if rk is not None: - right_keys.append( - right._get_label_or_level_values(rk)) + right_keys.append(right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -873,8 +973,7 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append( - right._get_label_or_level_values(rk)) + right_keys.append(right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -902,9 +1001,12 @@ def _get_merge_keys(self): left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): - right_keys = [lev._values.take(lev_codes) for lev, lev_codes - in zip(self.right.index.levels, - self.right.index.codes)] + right_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.right.index.levels, self.right.index.codes + ) + ] else: right_keys = [self.right.index._values] elif _any(self.right_on): @@ -916,9 +1018,12 @@ def _get_merge_keys(self): right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): - left_keys = [lev._values.take(lev_codes) for lev, lev_codes - in zip(self.left.index.levels, - self.left.index.codes)] + left_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.left.index.levels, self.left.index.codes + ) + ] else: left_keys = [self.left.index.values] @@ -937,9 +1042,9 @@ def _maybe_coerce_merge_keys(self): # for example if these are categorical, but are not dtype_equal # or if we have object and integer dtypes - for lk, rk, name in zip(self.left_join_keys, - self.right_join_keys, - self.join_names): + for lk, rk, name in zip( + self.left_join_keys, self.right_join_keys, self.join_names + ): if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): continue @@ -960,10 +1065,11 @@ def _maybe_coerce_merge_keys(self): elif is_dtype_equal(lk.dtype, rk.dtype): continue - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype) + ) # if we are numeric, then allow differing # kinds to proceed, eg. 
int64 and int8, int and float @@ -976,51 +1082,60 @@ def _maybe_coerce_merge_keys(self): # check whether ints and floats elif is_integer_dtype(rk) and is_float_dtype(lk): if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): - warnings.warn('You are merging on int and float ' - 'columns where the float values ' - 'are not equal to their int ' - 'representation', UserWarning) + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) continue elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): - warnings.warn('You are merging on int and float ' - 'columns where the float values ' - 'are not equal to their int ' - 'representation', UserWarning) + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) continue # let's infer and see if we are ok - elif (lib.infer_dtype(lk, skipna=False) - == lib.infer_dtype(rk, skipna=False)): + elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype( + rk, skipna=False + ): continue # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif ((lk_is_object and is_bool_dtype(rk)) or - (is_bool_dtype(lk) and rk_is_object)): + elif (lk_is_object and is_bool_dtype(rk)) or ( + is_bool_dtype(lk) and rk_is_object + ): pass # object values are allowed to be merged - elif ((lk_is_object and is_numeric_dtype(rk)) or - (is_numeric_dtype(lk) and rk_is_object)): + elif (lk_is_object and is_numeric_dtype(rk)) or ( + is_numeric_dtype(lk) and rk_is_object + ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) - bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] - string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] + bool_types = ["integer", "mixed-integer", "boolean", "empty"] + string_types = ["string", "unicode", "mixed", "bytes", "empty"] # inferred bool - if (inferred_left in bool_types and - inferred_right in bool_types): + if inferred_left in bool_types and inferred_right in bool_types: pass # unless we are merging non-string-like with string-like - elif ((inferred_left in string_types and - inferred_right not in string_types) or - (inferred_right in string_types and - inferred_left not in string_types)): + elif ( + inferred_left in string_types and inferred_right not in string_types + ) or ( + inferred_right in string_types and inferred_left not in string_types + ): raise ValueError(msg) # datetimelikes must match exactly @@ -1045,12 +1160,10 @@ def _maybe_coerce_merge_keys(self): # incompatible dtypes. See GH 16900. if name in self.left.columns: typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) + self.left = self.left.assign(**{name: self.left[name].astype(typ)}) if name in self.right.columns: typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + self.right = self.right.assign(**{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
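_maybe_coerce_merge_keys, reformatted above, is what turns a lossy int/float key pairing into a UserWarning and a numeric/string pairing into a ValueError. A hedged sketch of how that surfaces through the public pd.merge API (toy frames; the exact messages come from the strings in this hunk):

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 3], "a": ["x", "y", "z"]})

    # int64 vs. float64 keys: the merge proceeds, but a UserWarning fires because
    # 2.5 is not equal to its integer representation.
    right = pd.DataFrame({"key": [1.0, 2.5, 3.0], "b": [10, 20, 30]})
    result = pd.merge(left, right, on="key", how="inner")

    # int64 vs. object (string) keys: merging string-like with non-string-like
    # keys raises a ValueError instead.
    right_str = pd.DataFrame({"key": ["1", "2", "3"], "b": [10, 20, 30]})
    try:
        pd.merge(left, right_str, on="key")
    except ValueError as err:
        print(err)
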
@@ -1060,43 +1173,53 @@ def _validate_specification(self): self.left_on, self.right_on = (), () elif self.left_index: if self.right_on is None: - raise MergeError('Must pass right_on or right_index=True') + raise MergeError("Must pass right_on or right_index=True") elif self.right_index: if self.left_on is None: - raise MergeError('Must pass left_on or left_index=True') + raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - common_cols = self.left.columns.intersection( - self.right.columns) + common_cols = self.left.columns.intersection(self.right.columns) if len(common_cols) == 0: raise MergeError( - 'No common columns to perform merge on. ' - 'Merge options: left_on={lon}, right_on={ron}, ' - 'left_index={lidx}, right_index={ridx}' - .format(lon=self.left_on, ron=self.right_on, - lidx=self.left_index, ridx=self.right_index)) + "No common columns to perform merge on. " + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=self.left_on, + ron=self.right_on, + lidx=self.left_index, + ridx=self.right_index, + ) + ) if not common_cols.is_unique: - raise MergeError("Data columns not unique: {common!r}" - .format(common=common_cols)) + raise MergeError( + "Data columns not unique: {common!r}".format(common=common_cols) + ) self.left_on = self.right_on = common_cols elif self.on is not None: if self.left_on is not None or self.right_on is not None: - raise MergeError('Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.') + raise MergeError( + 'Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.' + ) self.left_on = self.right_on = self.on elif self.left_on is not None: n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: - raise ValueError('len(left_on) must equal the number ' - 'of levels in the index of "right"') + raise ValueError( + "len(left_on) must equal the number " + 'of levels in the index of "right"' + ) self.right_on = [None] * n elif self.right_on is not None: n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: - raise ValueError('len(right_on) must equal the number ' - 'of levels in the index of "left"') + raise ValueError( + "len(right_on) must equal the number " + 'of levels in the index of "left"' + ) self.left_on = [None] * n if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") @@ -1107,46 +1230,53 @@ def _validate(self, validate): if self.left_index: left_unique = self.orig_left.index.is_unique else: - left_unique = MultiIndex.from_arrays(self.left_join_keys - ).is_unique + left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique if self.right_index: right_unique = self.orig_right.index.is_unique else: - right_unique = MultiIndex.from_arrays(self.right_join_keys - ).is_unique + right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique # Check data integrity if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: - raise MergeError("Merge keys are not unique in either left" - " or right dataset; not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in either left" + " or right dataset; not a one-to-one merge" + ) elif not left_unique: - raise MergeError("Merge keys are not unique in left dataset;" - " not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in left dataset;" + " not a one-to-one merge" + ) 
elif not right_unique: - raise MergeError("Merge keys are not unique in right dataset;" - " not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in right dataset;" + " not a one-to-one merge" + ) elif validate in ["one_to_many", "1:m"]: if not left_unique: - raise MergeError("Merge keys are not unique in left dataset;" - " not a one-to-many merge") + raise MergeError( + "Merge keys are not unique in left dataset;" + " not a one-to-many merge" + ) elif validate in ["many_to_one", "m:1"]: if not right_unique: - raise MergeError("Merge keys are not unique in right dataset;" - " not a many-to-one merge") + raise MergeError( + "Merge keys are not unique in right dataset;" + " not a many-to-one merge" + ) - elif validate in ['many_to_many', 'm:m']: + elif validate in ["many_to_many", "m:m"]: pass else: raise ValueError("Not a valid argument for validate") -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', - **kwargs): +def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): """ Parameters @@ -1164,14 +1294,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', """ from functools import partial - assert len(left_keys) == len(right_keys), \ - 'left_key and right_keys must be the same length' + assert len(left_keys) == len( + right_keys + ), "left_key and right_keys must be the same length" # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) # get left & right join labels and num. of levels at each location - llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys))) + llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys))) # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1183,15 +1314,16 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how == 'left': - kwargs['sort'] = sort + if how == "left": + kwargs["sort"] = sort join_func = _join_functions[how] return join_func(lkey, rkey, count, **kwargs) -def _restore_dropped_levels_multijoin(left, right, dropped_level_names, - join_index, lindexer, rindexer): +def _restore_dropped_levels_multijoin( + left, right, dropped_level_names, join_index, lindexer, rindexer +): """ *this is an internal non-public method* @@ -1232,8 +1364,7 @@ def _convert_to_mulitindex(index): if isinstance(index, MultiIndex): return index else: - return MultiIndex.from_arrays([index.values], - names=[index.name]) + return MultiIndex.from_arrays([index.values], names=[index.name]) # For multi-multi joins with one overlapping level, # the returned index if of type Index @@ -1280,21 +1411,39 @@ def _convert_to_mulitindex(index): class _OrderedMerge(_MergeOperation): - _merge_type = 'ordered_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, axis=1, - suffixes=('_x', '_y'), copy=True, - fill_method=None, how='outer'): + _merge_type = "ordered_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + axis=1, + suffixes=("_x", "_y"), + copy=True, + fill_method=None, + how="outer", + ): self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, - left_index=left_index, - right_index=right_index, - right_on=right_on, axis=axis, - how=how, suffixes=suffixes, - sort=True # factorize sorts - ) + 
_MergeOperation.__init__( + self, + left, + right, + on=on, + left_on=left_on, + left_index=left_index, + right_index=right_index, + right_on=right_on, + axis=axis, + how=how, + suffixes=suffixes, + sort=True, # factorize sorts + ) def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() @@ -1303,25 +1452,26 @@ def get_result(self): ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) - if self.fill_method == 'ffill': + if self.fill_method == "ffill": left_join_indexer = libjoin.ffill_indexer(left_indexer) right_join_indexer = libjoin.ffill_indexer(right_indexer) else: left_join_indexer = left_indexer right_join_indexer = right_indexer - lindexers = { - 1: left_join_indexer} if left_join_indexer is not None else {} - rindexers = { - 1: right_join_indexer} if right_join_indexer is not None else {} + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, + copy=self.copy, + ) typ = self.left._constructor result = typ(result_data).__finalize__(self, method=self._merge_type) @@ -1332,43 +1482,56 @@ def get_result(self): def _asof_function(direction): - name = 'asof_join_{dir}'.format(dir=direction) + name = "asof_join_{dir}".format(dir=direction) return getattr(libjoin, name, None) def _asof_by_function(direction): - name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction) + name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) return getattr(libjoin, name, None) _type_casters = { - 'int64_t': ensure_int64, - 'double': ensure_float64, - 'object': ensure_object, + "int64_t": ensure_int64, + "double": ensure_float64, + "object": ensure_object, } def _get_cython_type_upcast(dtype): """ Upcast a dtype to 'int64_t', 'double', or 'object' """ if is_integer_dtype(dtype): - return 'int64_t' + return "int64_t" elif is_float_dtype(dtype): - return 'double' + return "double" else: - return 'object' + return "object" class _AsOfMerge(_OrderedMerge): - _merge_type = 'asof_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None, - how='asof', tolerance=None, - allow_exact_matches=True, - direction='backward'): + _merge_type = "asof_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + by=None, + left_by=None, + right_by=None, + axis=1, + suffixes=("_x", "_y"), + copy=True, + fill_method=None, + how="asof", + tolerance=None, + allow_exact_matches=True, + direction="backward", + ): self.by = by self.left_by = left_by @@ -1377,11 +1540,20 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None, self.allow_exact_matches = allow_exact_matches self.direction = direction - _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, axis=axis, - how=how, suffixes=suffixes, - fill_method=fill_method) + _OrderedMerge.__init__( + self, + left, + right, + on=on, + 
left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + axis=axis, + how=how, + suffixes=suffixes, + fill_method=fill_method, + ) def _validate_specification(self): super()._validate_specification() @@ -1402,13 +1574,12 @@ def _validate_specification(self): # set 'by' columns if self.by is not None: if self.left_by is not None or self.right_by is not None: - raise MergeError('Can only pass by OR left_by ' - 'and right_by') + raise MergeError("Can only pass by OR left_by " "and right_by") self.left_by = self.right_by = self.by if self.left_by is None and self.right_by is not None: - raise MergeError('missing left_by') + raise MergeError("missing left_by") if self.left_by is not None and self.right_by is None: - raise MergeError('missing right_by') + raise MergeError("missing right_by") # add 'by' to our key-list so we can have it in the # output as a key @@ -1419,15 +1590,16 @@ def _validate_specification(self): self.right_by = [self.right_by] if len(self.left_by) != len(self.right_by): - raise MergeError('left_by and right_by must be same length') + raise MergeError("left_by and right_by must be same length") self.left_on = self.left_by + list(self.left_on) self.right_on = self.right_by + list(self.right_on) # check 'direction' is valid - if self.direction not in ['backward', 'forward', 'nearest']: - raise MergeError('direction invalid: {direction}' - .format(direction=self.direction)) + if self.direction not in ["backward", "forward", "nearest"]: + raise MergeError( + "direction invalid: {direction}".format(direction=self.direction) + ) @property def _asof_key(self): @@ -1437,15 +1609,12 @@ def _asof_key(self): def _get_merge_keys(self): # note this function has side effects - (left_join_keys, - right_join_keys, - join_names) = super()._get_merge_keys() + (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - if (is_categorical_dtype(lk.dtype) and - is_categorical_dtype(rk.dtype)): + if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): # The generic error message is confusing for categoricals. # # In this function, the join keys include both the original @@ -1454,15 +1623,19 @@ def _get_merge_keys(self): # are not supported for the former, but will fail # later with a ValueError, so we don't *need* to check # for them here. 
- msg = ("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, both sides category, but not equal ones" - .format(i=i, lkdtype=repr(lk.dtype), - rkdtype=repr(rk.dtype))) + msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, both sides category, but not equal ones".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) else: - msg = ("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type" - .format(i=i, lkdtype=repr(lk.dtype), - rkdtype=repr(rk.dtype))) + msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) raise MergeError(msg) # validate tolerance; must be a Timedelta if we have a DTI @@ -1473,10 +1646,12 @@ def _get_merge_keys(self): else: lt = left_join_keys[-1] - msg = ("incompatible tolerance {tolerance}, must be compat " - "with type {lkdtype}".format( - tolerance=type(self.tolerance), - lkdtype=repr(lt.dtype))) + msg = ( + "incompatible tolerance {tolerance}, must be compat " + "with type {lkdtype}".format( + tolerance=type(self.tolerance), lkdtype=repr(lt.dtype) + ) + ) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): @@ -1511,16 +1686,18 @@ def _get_join_indexers(self): def flip(xs): """ unlike np.transpose, this returns an array of tuples """ - labels = list(string.ascii_lowercase[:len(xs)]) + labels = list(string.ascii_lowercase[: len(xs)]) dtypes = [x.dtype for x in xs] labeled_dtypes = list(zip(labels, dtypes)) return np.array(list(zip(*xs)), labeled_dtypes) # values to compare - left_values = (self.left.index.values if self.left_index else - self.left_join_keys[-1]) - right_values = (self.right.index.values if self.right_index else - self.right_join_keys[-1]) + left_values = ( + self.left.index.values if self.left_index else self.left_join_keys[-1] + ) + right_values = ( + self.right.index.values if self.right_index else self.right_join_keys[-1] + ) tolerance = self.tolerance # we require sortedness and non-null values in the join keys @@ -1529,20 +1706,20 @@ def flip(xs): if not Index(left_values).is_monotonic: if isnull(left_values).any(): - raise ValueError(msg_missings.format(side='left')) + raise ValueError(msg_missings.format(side="left")) else: - raise ValueError(msg_sorted.format(side='left')) + raise ValueError(msg_sorted.format(side="left")) if not Index(right_values).is_monotonic: if isnull(right_values).any(): - raise ValueError(msg_missings.format(side='right')) + raise ValueError(msg_missings.format(side="right")) else: - raise ValueError(msg_sorted.format(side='right')) + raise ValueError(msg_sorted.format(side="right")) # initial type conversion as needed if needs_i8_conversion(left_values): - left_values = left_values.view('i8') - right_values = right_values.view('i8') + left_values = left_values.view("i8") + right_values = right_values.view("i8") if tolerance is not None: tolerance = tolerance.value @@ -1572,19 +1749,18 @@ def flip(xs): # choose appropriate function by type func = _asof_by_function(self.direction) - return func(left_values, - right_values, - left_by_values, - right_by_values, - self.allow_exact_matches, - tolerance) + return func( + left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance, + ) else: # choose appropriate function by type func = _asof_function(self.direction) - return func(left_values, - right_values, - self.allow_exact_matches, - tolerance) + 
return func(left_values, right_values, self.allow_exact_matches, tolerance) def _get_multiindex_indexer(join_keys, index, sort): @@ -1594,13 +1770,11 @@ def _get_multiindex_indexer(join_keys, index, sort): fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. of levels at each location - rcodes, lcodes, shape = map(list, zip(* map(fkeys, - index.levels, - join_keys))) + rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: - i8copy = lambda a: a.astype('i8', subok=False, copy=True) + i8copy = lambda a: a.astype("i8", subok=False, copy=True) rcodes = list(map(i8copy, index.codes)) # fix right labels if there were any nulls @@ -1628,29 +1802,31 @@ def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( - ensure_int64(left_key), - ensure_int64(right_key), - count, sort=sort) + ensure_int64(left_key), ensure_int64(right_key), count, sort=sort + ) return left_indexer, right_indexer def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): if len(join_keys) > 1: - if not ((isinstance(right_ax, MultiIndex) and - len(join_keys) == right_ax.nlevels)): - raise AssertionError("If more than one join key is given then " - "'right_ax' must be a MultiIndex and the " - "number of join keys must be the number of " - "levels in right_ax") - - left_indexer, right_indexer = \ - _get_multiindex_indexer(join_keys, right_ax, sort=sort) + if not ( + (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) + ): + raise AssertionError( + "If more than one join key is given then " + "'right_ax' must be a MultiIndex and the " + "number of join keys must be the number of " + "levels in right_ax" + ) + + left_indexer, right_indexer = _get_multiindex_indexer( + join_keys, right_ax, sort=sort + ) else: jkey = join_keys[0] - left_indexer, right_indexer = \ - _get_single_indexer(jkey, right_ax, sort=sort) + left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) if sort or len(left_ax) != len(left_indexer): # if asked to sort or there are 1-to-many matches @@ -1667,22 +1843,22 @@ def _right_outer_join(x, y, max_groups): _join_functions = { - 'inner': libjoin.inner_join, - 'left': libjoin.left_outer_join, - 'right': _right_outer_join, - 'outer': libjoin.full_outer_join, + "inner": libjoin.inner_join, + "left": libjoin.left_outer_join, + "right": _right_outer_join, + "outer": libjoin.full_outer_join, } def _factorize_keys(lk, rk, sort=True): # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = getattr(lk, '_values', lk)._data - rk = getattr(rk, '_values', rk)._data + lk = getattr(lk, "_values", lk)._data + rk = getattr(rk, "_values", rk)._data - elif (is_categorical_dtype(lk) and - is_categorical_dtype(rk) and - lk.is_dtype_equal(rk)): + elif ( + is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + ): if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes rk = rk.codes @@ -1693,9 +1869,11 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif (is_extension_array_dtype(lk.dtype) and - is_extension_array_dtype(rk.dtype) and - lk.dtype == rk.dtype): + elif ( + is_extension_array_dtype(lk.dtype) + and is_extension_array_dtype(rk.dtype) + and lk.dtype == rk.dtype + 
): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1705,8 +1883,9 @@ def _factorize_keys(lk, rk, sort=True): klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) rk = ensure_int64(com.values_from_object(rk)) - elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and - issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): + elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( + rk.dtype.type, (np.timedelta64, np.datetime64) + ): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) @@ -1765,12 +1944,12 @@ def _get_join_keys(llab, rlab, shape, sort): nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - lkey = stride * llab[0].astype('i8', subok=False, copy=False) - rkey = stride * rlab[0].astype('i8', subok=False, copy=False) + stride = np.prod(shape[1:nlev], dtype="i8") + lkey = stride * llab[0].astype("i8", subok=False, copy=False) + rkey = stride * rlab[0].astype("i8", subok=False, copy=False) for i in range(1, nlev): - with np.errstate(divide='ignore'): + with np.errstate(divide="ignore"): stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride @@ -1803,12 +1982,14 @@ def validate_operand(obj): return obj elif isinstance(obj, Series): if obj.name is None: - raise ValueError('Cannot merge a Series without a name') + raise ValueError("Cannot merge a Series without a name") else: return obj.to_frame() else: - raise TypeError('Can only merge Series or DataFrame objects, ' - 'a {obj} was passed'.format(obj=type(obj))) + raise TypeError( + "Can only merge Series or DataFrame objects, " + "a {obj} was passed".format(obj=type(obj)) + ) def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): @@ -1823,8 +2004,10 @@ def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): return left, right if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{rename}'.format(rename=to_rename)) + raise ValueError( + "columns overlap but no suffix specified: " + "{rename}".format(rename=to_rename) + ) def renamer(x, suffix): """ @@ -1843,11 +2026,10 @@ def renamer(x, suffix): x : renamed column name """ if x in to_rename and suffix is not None: - return '{x}{suffix}'.format(x=x, suffix=suffix) + return "{x}{suffix}".format(x=x, suffix=suffix) return x lrenamer = partial(renamer, suffix=lsuffix) rrenamer = partial(renamer, suffix=rsuffix) - return (_transform_index(left, lrenamer), - _transform_index(right, rrenamer)) + return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 6374dd1b463f3a..188f2edd96590a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -17,11 +17,20 @@ # Note: We need to make sure `frame` is imported before `pivot`, otherwise # _shared_docs['pivot_table'] will not yet exist. 
TODO: Fix this dependency -@Substitution('\ndata : DataFrame') -@Appender(_shared_docs['pivot_table'], indents=1) -def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', - fill_value=None, margins=False, dropna=True, - margins_name='All', observed=False): +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot_table"], indents=1) +def pivot_table( + data, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, +): index = _convert_by(index) columns = _convert_by(columns) @@ -29,14 +38,20 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pieces = [] keys = [] for func in aggfunc: - table = pivot_table(data, values=values, index=index, - columns=columns, - fill_value=fill_value, aggfunc=func, - margins=margins, dropna=dropna, - margins_name=margins_name, - observed=observed) + table = pivot_table( + data, + values=values, + index=index, + columns=columns, + fill_value=fill_value, + aggfunc=func, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) pieces.append(table) - keys.append(getattr(func, '__name__', func)) + keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) @@ -80,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): - agged = agged.dropna(how='all') + agged = agged.dropna(how="all") # gh-21133 # we want to down cast if @@ -88,8 +103,12 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: - if (v in data and is_integer_dtype(data[v]) and - v in agged and not is_integer_dtype(agged[v])): + if ( + v in data + and is_integer_dtype(data[v]) + and v in agged + and not is_integer_dtype(agged[v]) + ): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged @@ -97,7 +116,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. 
- index_names = agged.index.names[:len(index)] + index_names = agged.index.names[: len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] @@ -109,33 +128,47 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if not dropna: from pandas import MultiIndex + if table.index.nlevels > 1: - m = MultiIndex.from_arrays(cartesian_product(table.index.levels), - names=table.index.names) + m = MultiIndex.from_arrays( + cartesian_product(table.index.levels), names=table.index.names + ) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: - m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), - names=table.columns.names) + m = MultiIndex.from_arrays( + cartesian_product(table.columns.levels), names=table.columns.names + ) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast='infer') + table = table.fillna(value=fill_value, downcast="infer") if margins: if dropna: data = data[data.notna().all(axis=1)] - table = _add_margins(table, data, values, rows=index, - cols=columns, aggfunc=aggfunc, - observed=dropna, - margins_name=margins_name, fill_value=fill_value) + table = _add_margins( + table, + data, + values, + rows=index, + cols=columns, + aggfunc=aggfunc, + observed=dropna, + margins_name=margins_name, + fill_value=fill_value, + ) # discard the top level - if (values_passed and not values_multi and not table.empty and - (table.columns.nlevels > 1)): + if ( + values_passed + and not values_multi + and not table.empty + and (table.columns.nlevels > 1) + ): table = table[values[0]] if len(index) == 0 and len(columns) > 0: @@ -143,15 +176,24 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: - table = table.dropna(how='all', axis=1) + table = table.dropna(how="all", axis=1) return table -def _add_margins(table, data, values, rows, cols, aggfunc, - observed=None, margins_name='All', fill_value=None): +def _add_margins( + table, + data, + values, + rows, + cols, + aggfunc, + observed=None, + margins_name="All", + fill_value=None, +): if not isinstance(margins_name, str): - raise ValueError('margins_name argument must be a string') + raise ValueError("margins_name argument must be a string") msg = 'Conflicting name "{name}" in margins'.format(name=margins_name) for level in table.index.names: @@ -161,13 +203,13 @@ def _add_margins(table, data, values, rows, cols, aggfunc, grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) # could be passed a Series object with no 'columns' - if hasattr(table, 'columns'): + if hasattr(table, "columns"): for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) if len(rows) > 1: - key = (margins_name,) + ('',) * (len(rows) - 1) + key = (margins_name,) + ("",) * (len(rows) - 1) else: key = margins_name @@ -177,17 +219,24 @@ def _add_margins(table, data, values, rows, cols, aggfunc, return table.append(Series({key: grand_margin[margins_name]})) if values: - marginal_result_set = _generate_marginal_results(table, data, values, - rows, cols, aggfunc, - observed, - grand_margin, - margins_name) + marginal_result_set = _generate_marginal_results( + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name, + ) if not 
isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name) + table, data, rows, cols, aggfunc, observed, margins_name + ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set @@ -200,6 +249,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, row_margin[k] = grand_margin[k[0]] from pandas import DataFrame + margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names @@ -218,8 +268,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, return result -def _compute_grand_margin(data, values, aggfunc, - margins_name='All'): +def _compute_grand_margin(data, values, aggfunc, margins_name="All"): if values: grand_margin = {} @@ -241,26 +290,22 @@ def _compute_grand_margin(data, values, aggfunc, return {margins_name: aggfunc(data.index)} -def _generate_marginal_results(table, data, values, rows, cols, aggfunc, - observed, - grand_margin, - margins_name='All'): +def _generate_marginal_results( + table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All" +): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): - return (key, margins_name) + ('',) * (len(cols) - 1) + return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby( - rows, observed=observed).agg(aggfunc) + margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) cat_axis = 1 - for key, piece in table.groupby(level=0, - axis=cat_axis, - observed=observed): + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! 
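Editorial aside between hunks: the _add_margins/_generate_marginal_results changes above only re-wrap the groupby/agg calls that build the margins row and column; behaviour is unchanged. A short sketch of the public path that exercises them, with illustrative data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "city": ["NY", "NY", "SF", "SF"],
        "year": [2018, 2019, 2018, 2019],
        "sales": [10, 20, 30, 40],
    })

    # margins=True routes through _add_margins and the marginal-results
    # helpers; the extra row/column is labelled margins_name ("All" by default).
    table = pd.pivot_table(df, values="sales", index="city", columns="year",
                           aggfunc=np.sum, margins=True)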
@@ -270,9 +315,11 @@ def _all_key(key): except TypeError: # we cannot reshape, so coerce the axis - piece.set_axis(piece._get_axis( - cat_axis)._to_safe_for_reshape(), - axis=cat_axis, inplace=True) + piece.set_axis( + piece._get_axis(cat_axis)._to_safe_for_reshape(), + axis=cat_axis, + inplace=True, + ) piece[all_key] = margin[key] table_pieces.append(piece) @@ -280,9 +327,7 @@ def _all_key(key): else: margin = grand_margin cat_axis = 0 - for key, piece in table.groupby(level=0, - axis=cat_axis, - observed=observed): + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) @@ -297,8 +342,7 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby( - cols, observed=observed).agg(aggfunc) + row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -311,8 +355,8 @@ def _all_key(key): def _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, - observed, margins_name='All'): + table, data, rows, cols, aggfunc, observed, margins_name="All" +): if len(cols) > 0: # need to "interleave" the margins margin_keys = [] @@ -320,20 +364,17 @@ def _generate_marginal_results_without_values( def _all_key(): if len(cols) == 1: return margins_name - return (margins_name, ) + ('', ) * (len(cols) - 1) + return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, - observed=observed).apply(aggfunc) + margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, - axis=0, - observed=observed).apply(aggfunc) + margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -354,17 +395,19 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (is_scalar(by) or - isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or - hasattr(by, '__call__')): + elif ( + is_scalar(by) + or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) + or hasattr(by, "__call__") + ): by = [by] else: by = list(by) return by -@Substitution('\ndata : DataFrame') -@Appender(_shared_docs['pivot'], indents=1) +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot"], indents=1) def pivot(data, index=None, columns=None, values=None): if values is None: cols = [columns] if index is None else [index, columns] @@ -379,17 +422,26 @@ def pivot(data, index=None, columns=None, values=None): if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name - indexed = data._constructor(data[values].values, index=index, - columns=values) + indexed = data._constructor( + data[values].values, index=index, columns=values + ) else: - indexed = data._constructor_sliced(data[values].values, - index=index) + indexed = data._constructor_sliced(data[values].values, index=index) return indexed.unstack(columns) -def crosstab(index, columns, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False, margins_name='All', dropna=True, - normalize=False): +def crosstab( + index, + columns, + values=None, + rownames=None, + colnames=None, + aggfunc=None, + margins=False, + margins_name="All", + dropna=True, + normalize=False, +): """ Compute a simple cross 
tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an @@ -490,11 +542,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) - rownames = _get_names(index, rownames, prefix='row') - colnames = _get_names(columns, colnames, prefix='col') + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, - sort=False) + common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) data = {} data.update(zip(rownames, index)) @@ -507,30 +558,38 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, raise ValueError("values cannot be used without an aggfunc.") from pandas import DataFrame + df = DataFrame(data, index=common_idx) if values is None: - df['__dummy__'] = 0 - kwargs = {'aggfunc': len, 'fill_value': 0} + df["__dummy__"] = 0 + kwargs = {"aggfunc": len, "fill_value": 0} else: - df['__dummy__'] = values - kwargs = {'aggfunc': aggfunc} - - table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - margins=margins, margins_name=margins_name, - dropna=dropna, **kwargs) + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + table = df.pivot_table( + "__dummy__", + index=rownames, + columns=colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + **kwargs + ) # Post-process if normalize is not False: - table = _normalize(table, normalize=normalize, margins=margins, - margins_name=margins_name) + table = _normalize( + table, normalize=normalize, margins=margins, margins_name=margins_name + ) return table -def _normalize(table, normalize, margins, margins_name='All'): +def _normalize(table, normalize, margins, margins_name="All"): if not isinstance(normalize, (bool, str)): - axis_subs = {0: 'index', 1: 'columns'} + axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] except KeyError: @@ -540,12 +599,12 @@ def _normalize(table, normalize, margins, margins_name='All'): # Actual Normalizations normalizers = { - 'all': lambda x: x / x.sum(axis=1).sum(axis=0), - 'columns': lambda x: x / x.sum(), - 'index': lambda x: x.div(x.sum(axis=1), axis=0) + "all": lambda x: x / x.sum(axis=1).sum(axis=0), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis=0), } - normalizers[True] = normalizers['all'] + normalizers[True] = normalizers["all"] try: f = normalizers[normalize] @@ -568,12 +627,12 @@ def _normalize(table, normalize, margins, margins_name='All'): table = _normalize(table, normalize=normalize, margins=False) # Fix Margins - if normalize == 'columns': + if normalize == "columns": column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) - elif normalize == 'index': + elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) @@ -599,17 +658,17 @@ def _normalize(table, normalize, margins, margins_name='All'): return table -def _get_names(arrs, names, prefix='row'): +def _get_names(arrs, names, prefix="row"): if names is None: names = [] for i, arr in enumerate(arrs): if isinstance(arr, ABCSeries) and arr.name is not None: names.append(arr.name) else: - names.append('{prefix}_{i}'.format(prefix=prefix, i=i)) + 
names.append("{prefix}_{i}".format(prefix=prefix, i=i)) else: if len(names) != len(arrs): - raise AssertionError('arrays and names must have the same length') + raise AssertionError("arrays and names must have the same length") if not isinstance(names, list): names = list(names) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c59f9ffc480556..5d932d7ded9b8c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -9,8 +9,14 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_extension_array_dtype, - is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion) + ensure_platform_int, + is_bool_dtype, + is_extension_array_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos @@ -21,8 +27,11 @@ from pandas.core.internals.arrays import extract_array from pandas.core.series import Series from pandas.core.sorting import ( - compress_group_index, decons_obs_group_ids, get_compressed_ids, - get_group_index) + compress_group_index, + decons_obs_group_ids, + get_compressed_ids, + get_group_index, +) class _Unstacker: @@ -76,8 +85,15 @@ class _Unstacker: unstacked : DataFrame """ - def __init__(self, values, index, level=-1, value_columns=None, - fill_value=None, constructor=None): + def __init__( + self, + values, + index, + level=-1, + value_columns=None, + fill_value=None, + constructor=None, + ): if values.ndim == 1: values = values[:, np.newaxis] @@ -90,7 +106,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover - raise ValueError('must pass column labels for multi-column data') + raise ValueError("must pass column labels for multi-column data") self.index = index.remove_unused_levels() @@ -110,16 +126,16 @@ def __init__(self, values, index, level=-1, value_columns=None, # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. # We want to check and raise an error before this happens - num_rows = np.max([index_level.size for index_level - in self.new_index_levels]) + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. 
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) if num_rows > 0 and num_columns > 0 and num_cells <= 0: - raise ValueError('Unstacked DataFrame is too big, ' - 'causing int32 overflow') + raise ValueError( + "Unstacked DataFrame is too big, " "causing int32 overflow" + ) self._make_sorted_values_labels() self._make_selectors() @@ -129,8 +145,8 @@ def _make_sorted_values_labels(self): codes = list(self.index.codes) levs = list(self.index.levels) - to_sort = codes[:v] + codes[v + 1:] + [codes[v]] - sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] + sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) @@ -160,8 +176,7 @@ def _make_selectors(self): mask.put(selector, True) if mask.sum() < len(self.index): - raise ValueError('Index contains duplicate entries, ' - 'cannot reshape') + raise ValueError("Index contains duplicate entries, " "cannot reshape") self.group_index = comp_index self.mask = mask @@ -188,11 +203,11 @@ def get_new_values(self): # we can simply reshape if we don't have a mask if mask_all and len(values): - new_values = (self.sorted_values - .reshape(length, width, stride) - .swapaxes(1, 2) - .reshape(result_shape) - ) + new_values = ( + self.sorted_values.reshape(length, width, stride) + .swapaxes(1, 2) + .reshape(result_shape) + ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask @@ -214,25 +229,27 @@ def get_new_values(self): # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): - sorted_values = sorted_values.view('i8') - new_values = new_values.view('i8') - name = 'int64' + sorted_values = sorted_values.view("i8") + new_values = new_values.view("i8") + name = "int64" elif is_bool_dtype(values): - sorted_values = sorted_values.astype('object') - new_values = new_values.astype('object') - name = 'object' + sorted_values = sorted_values.astype("object") + new_values = new_values.astype("object") + name = "object" else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{name}".format(name=name)) - f(sorted_values, - mask.view('u1'), - stride, - length, - width, - new_values, - new_mask.view('u1')) + f( + sorted_values, + mask.view("u1"), + stride, + length, + width, + new_values, + new_mask.view("u1"), + ) # reconstruct dtype if needed if needs_i8_conversion(values): @@ -255,8 +272,7 @@ def get_new_columns(self): new_levels = self.value_columns.levels + (self.removed_level_full,) new_names = self.value_columns.names + (self.removed_name,) - new_codes = [lab.take(propagator) - for lab in self.value_columns.codes] + new_codes = [lab.take(propagator) for lab in self.value_columns.codes] else: new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] @@ -274,12 +290,12 @@ def get_new_columns(self): # The entire level is then just a repetition of the single chunk: new_codes.append(np.tile(repeater, width)) - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def get_new_index(self): - result_codes = [lab.take(self.compressor) - for lab in self.sorted_labels[:-1]] + result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] # construct the new index if 
len(self.new_index_levels) == 1: @@ -288,8 +304,12 @@ def get_new_index(self): lev = lev.insert(len(lev), lev._na_value) return lev.take(lab) - return MultiIndex(levels=self.new_index_levels, codes=result_codes, - names=self.new_index_names, verify_integrity=False) + return MultiIndex( + levels=self.new_index_levels, + codes=result_codes, + names=self.new_index_names, + verify_integrity=False, + ) def _unstack_multiple(data, clocs, fill_value=None): @@ -315,23 +335,24 @@ def _unstack_multiple(data, clocs, fill_value=None): group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) - recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, - xnull=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index - dummy_index = Index(obs_ids, name='__placeholder__') + dummy_index = Index(obs_ids, name="__placeholder__") else: - dummy_index = MultiIndex(levels=rlevels + [obs_ids], - codes=rcodes + [comp_ids], - names=rnames + ['__placeholder__'], - verify_integrity=False) + dummy_index = MultiIndex( + levels=rlevels + [obs_ids], + codes=rcodes + [comp_ids], + names=rnames + ["__placeholder__"], + verify_integrity=False, + ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index - unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes @@ -348,7 +369,7 @@ def _unstack_multiple(data, clocs, fill_value=None): dummy = data.copy() dummy.index = dummy_index - unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: @@ -360,8 +381,9 @@ def _unstack_multiple(data, clocs, fill_value=None): for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) - new_columns = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_columns = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) if isinstance(unstacked, Series): unstacked.index = new_columns @@ -388,24 +410,32 @@ def unstack(obj, level, fill_value=None): else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) - unstacker = _Unstacker(obj.values, obj.index, level=level, - fill_value=fill_value, - constructor=obj._constructor_expanddim) + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + fill_value=fill_value, + constructor=obj._constructor_expanddim, + ) return unstacker.get_result() def _unstack_frame(obj, level, fill_value=None): if obj._is_mixed_type: - unstacker = partial(_Unstacker, index=obj.index, - level=level, fill_value=fill_value) - blocks = obj._data.unstack(unstacker, - fill_value=fill_value) + unstacker = partial( + _Unstacker, index=obj.index, level=level, fill_value=fill_value + ) + blocks = obj._data.unstack(unstacker, fill_value=fill_value) return obj._constructor(blocks) else: - unstacker = _Unstacker(obj.values, obj.index, level=level, - value_columns=obj.columns, - fill_value=fill_value, - constructor=obj._constructor) + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + value_columns=obj.columns, + fill_value=fill_value, + constructor=obj._constructor, + ) 
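Editorial sketch of the public entry point behind the unstack/_unstack_frame re-wrapping in this region; the arguments forwarded to _Unstacker (values, index, level, value_columns, fill_value, constructor) are unchanged. Index and values below are illustrative:

    import pandas as pd

    idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["letter", "num"])
    s = pd.Series([10, 20, 30, 40], index=idx)

    # Series.unstack dispatches to reshape.unstack, which builds an
    # _Unstacker for non-extension dtypes and forwards fill_value as-is.
    wide = s.unstack(level="num", fill_value=0)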
return unstacker.get_result() @@ -441,18 +471,22 @@ def _unstack_extension_series(series, level, fill_value): dummy_arr = np.arange(len(series)) # fill_value=-1, since we will do a series.values.take later - result = _Unstacker(dummy_arr, series.index, - level=level, fill_value=-1).get_result() + result = _Unstacker( + dummy_arr, series.index, level=level, fill_value=-1 + ).get_result() out = [] values = extract_array(series, extract_numpy=False) for col, indices in result.iteritems(): - out.append(Series(values.take(indices.values, - allow_fill=True, - fill_value=fill_value), - name=col, index=result.index)) - return concat(out, axis='columns', copy=False, keys=result.columns) + out.append( + Series( + values.take(indices.values, allow_fill=True, fill_value=fill_value), + name=col, + index=result.index, + ) + ) + return concat(out, axis="columns", copy=False, keys=result.columns) def stack(frame, level=-1, dropna=True): @@ -464,6 +498,7 @@ def stack(frame, level=-1, dropna=True): ------- stacked : Series """ + def factorize(index): if index.is_unique: return index, np.arange(len(index)) @@ -487,15 +522,18 @@ def factorize(index): new_names = list(frame.index.names) new_names.append(frame.columns.name) - new_index = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) else: - levels, (ilab, clab) = zip(*map(factorize, (frame.index, - frame.columns))) + levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex(levels=levels, codes=codes, - names=[frame.index.name, frame.columns.name], - verify_integrity=False) + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[frame.index.name, frame.columns.name], + verify_integrity=False, + ) if frame._is_homogeneous_type: # For homogeneous EAs, frame.values will coerce to object. So @@ -505,9 +543,9 @@ def factorize(index): if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() - new_values = arr._concat_same_type([ - col._values for _, col in frame.iteritems() - ]) + new_values = arr._concat_same_type( + [col._values for _, col in frame.iteritems()] + ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA @@ -558,8 +596,10 @@ def stack_multiple(frame, level, dropna=True): level = updated_level else: - raise ValueError("level should contain all level names or all level " - "numbers, not a mixture of the two.") + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." 
+ ) return result @@ -604,9 +644,16 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list(zip(*[lev.take(level_codes) for lev, level_codes - in zip(this.columns.levels[:-1], - this.columns.codes[:-1])])) + tuples = list( + zip( + *[ + lev.take(level_codes) + for lev, level_codes in zip( + this.columns.levels[:-1], this.columns.codes[:-1] + ) + ] + ) + ) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -641,8 +688,9 @@ def _convert_level_number(level_num, columns): chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: - if (frame._is_homogeneous_type and - is_extension_array_dtype(frame.dtypes.iloc[0])): + if frame._is_homogeneous_type and is_extension_array_dtype( + frame.dtypes.iloc[0] + ): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] @@ -682,21 +730,30 @@ def _convert_level_number(level_num, columns): new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) - new_index = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: - result = result.dropna(axis=0, how='all') + result = result.dropna(axis=0, how="all") return result -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False, drop_first=False, dtype=None): +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +): """ Convert categorical variable into dummy/indicator variables. @@ -800,29 +857,31 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.core.reshape.concat import concat from itertools import cycle - dtypes_to_encode = ['object', 'category'] + dtypes_to_encode = ["object", "category"] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: - data_to_encode = data.select_dtypes( - include=dtypes_to_encode) + data_to_encode = data.select_dtypes(include=dtypes_to_encode) else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): - len_msg = ("Length of '{name}' ({len_item}) did not match the " - "length of the columns being encoded ({len_enc}).") + len_msg = ( + "Length of '{name}' ({len_item}) did not match the " + "length of the columns being encoded ({len_enc})." + ) if is_list_like(item): if not len(item) == data_to_encode.shape[1]: - len_msg = len_msg.format(name=name, len_item=len(item), - len_enc=data_to_encode.shape[1]) + len_msg = len_msg.format( + name=name, len_item=len(item), len_enc=data_to_encode.shape[1] + ) raise ValueError(len_msg) - check_len(prefix, 'prefix') - check_len(prefix_sep, 'prefix_sep') + check_len(prefix, "prefix") + check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): prefix = cycle([prefix]) @@ -850,25 +909,43 @@ def check_len(item, name): # columns to prepend to result. 
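Editorial sketch of the validation that the check_len hunk above preserves: a list-like prefix (or prefix_sep) must match the number of columns being encoded, otherwise the re-wrapped ValueError fires. Column names here are illustrative:

    import pandas as pd

    df = pd.DataFrame({"color": ["red", "blue"], "size": ["S", "M"]})

    # Two object columns get encoded, so two prefixes are required;
    # pd.get_dummies(df, prefix=["c"]) would raise the "Length of 'prefix'" error.
    dummies = pd.get_dummies(df, prefix=["c", "s"], prefix_sep="_")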
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] - for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, - prefix_sep): + for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): # col is (column_name, column), use just column data here - dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, - dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first, dtype=dtype) + dummy = _get_dummies_1d( + col[1], + prefix=pre, + prefix_sep=sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: - result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, - drop_first=drop_first, - dtype=dtype) + result = _get_dummies_1d( + data, + prefix, + prefix_sep, + dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) return result -def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False, dtype=None): +def _get_dummies_1d( + data, + prefix, + prefix_sep="_", + dummy_na=False, + sparse=False, + drop_first=False, + dtype=None, +): from pandas.core.reshape.concat import concat + # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -907,13 +984,10 @@ def get_empty_frame(data): # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): - fstr = '{prefix}{prefix_sep}{level}' - return fstr.format(prefix=prefix, - prefix_sep=prefix_sep, - level=level) + fstr = "{prefix}{prefix_sep}{level}" + return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) - dummy_cols = [_make_col_name(prefix, prefix_sep, level) - for level in levels] + dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] if isinstance(data, Series): index = data.index @@ -945,10 +1019,12 @@ def _make_col_name(prefix, prefix_sep, level): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=dtype), - sparse_index=IntIndex(N, ixs), - fill_value=fill_value, - dtype=dtype) + sarr = SparseArray( + np.ones(len(ixs), dtype=dtype), + sparse_index=IntIndex(N, ixs), + fill_value=fill_value, + dtype=dtype, + ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) @@ -968,7 +1044,7 @@ def _make_col_name(prefix, prefix_sep, level): return DataFrame(dummy_mat, index=index, columns=dummy_cols) -def make_axis_dummies(frame, axis='minor', transform=None): +def make_axis_dummies(frame, axis="minor", transform=None): """ Construct 1-0 dummy variables corresponding to designated axis labels @@ -989,7 +1065,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): dummies : DataFrame Column names taken from chosen axis """ - numbers = {'major': 0, 'minor': 1} + numbers = {"major": 0, "minor": 1} num = numbers.get(axis, axis) items = frame.index.levels[num] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 96124331e43ef8..0446f53345671d 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -8,20 +8,43 @@ from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, - is_scalar, is_timedelta64_dtype) + _NS_DTYPE, + ensure_int64, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, 
+ is_datetime_or_timedelta_dtype, + is_integer, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.missing import isna from pandas import ( - Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp, - to_datetime, to_timedelta) + Categorical, + Index, + Interval, + IntervalIndex, + Series, + Timedelta, + Timestamp, + to_datetime, + to_timedelta, +) import pandas.core.algorithms as algos import pandas.core.nanops as nanops -def cut(x, bins, right=True, labels=None, retbins=False, precision=3, - include_lowest=False, duplicates='raise'): +def cut( + x, + bins, + right=True, + labels=None, + retbins=False, + precision=3, + include_lowest=False, + duplicates="raise", +): """ Bin values into discrete intervals. @@ -199,18 +222,19 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, sz = x.size if sz == 0: - raise ValueError('Cannot cut empty array') + raise ValueError("Cannot cut empty array") rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if np.isinf(mn) or np.isinf(mx): # GH 24314 - raise ValueError('cannot specify integer `bins` when input data ' - 'contains infinity') + raise ValueError( + "cannot specify integer `bins` when input data " "contains infinity" + ) elif mn == mx: # adjust end points before binning - mn -= .001 * abs(mn) if mn != 0 else .001 - mx += .001 * abs(mx) if mx != 0 else .001 + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) @@ -222,7 +246,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, elif isinstance(bins, IntervalIndex): if bins.is_overlapping: - raise ValueError('Overlapping IntervalIndex is not accepted.') + raise ValueError("Overlapping IntervalIndex is not accepted.") else: if is_datetime64tz_dtype(bins): @@ -232,20 +256,26 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, bins = _convert_bin_to_numeric_type(bins, dtype) # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype('float64')) < 0).any(): - raise ValueError('bins must increase monotonically.') - - fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, - precision=precision, - include_lowest=include_lowest, - dtype=dtype, - duplicates=duplicates) - - return _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype) - - -def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): + if (np.diff(bins.astype("float64")) < 0).any(): + raise ValueError("bins must increase monotonically.") + + fac, bins = _bins_to_cuts( + x, + bins, + right=right, + labels=labels, + precision=precision, + include_lowest=include_lowest, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut( + fac, bins, retbins, x_is_series, series_index, name, dtype + ) + + +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. 
For example @@ -309,21 +339,37 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): else: quantiles = q bins = algos.quantile(x, quantiles) - fac, bins = _bins_to_cuts(x, bins, labels=labels, - precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates) - - return _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype) - - -def _bins_to_cuts(x, bins, right=True, labels=None, - precision=3, include_lowest=False, - dtype=None, duplicates='raise'): - - if duplicates not in ['raise', 'drop']: - raise ValueError("invalid value for 'duplicates' parameter, " - "valid options are: raise, drop") + fac, bins = _bins_to_cuts( + x, + bins, + labels=labels, + precision=precision, + include_lowest=True, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut( + fac, bins, retbins, x_is_series, series_index, name, dtype + ) + + +def _bins_to_cuts( + x, + bins, + right=True, + labels=None, + precision=3, + include_lowest=False, + dtype=None, + duplicates="raise", +): + + if duplicates not in ["raise", "drop"]: + raise ValueError( + "invalid value for 'duplicates' parameter, " + "valid options are: raise, drop" + ) if isinstance(bins, IntervalIndex): # we have a fast-path here @@ -334,14 +380,16 @@ def _bins_to_cuts(x, bins, right=True, labels=None, unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: - if duplicates == 'raise': - raise ValueError("Bin edges must be unique: {bins!r}.\nYou " - "can drop duplicate edges by setting " - "the 'duplicates' kwarg".format(bins=bins)) + if duplicates == "raise": + raise ValueError( + "Bin edges must be unique: {bins!r}.\nYou " + "can drop duplicate edges by setting " + "the 'duplicates' kwarg".format(bins=bins) + ) else: bins = unique_bins - side = 'left' if right else 'right' + side = "left" if right else "right" ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: @@ -352,13 +400,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: - labels = _format_labels(bins, precision, right=right, - include_lowest=include_lowest, - dtype=dtype) + labels = _format_labels( + bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + ) else: if len(labels) != len(bins) - 1: - raise ValueError('Bin labels must be one fewer than ' - 'the number of bin edges') + raise ValueError( + "Bin labels must be one fewer than " "the number of bin edges" + ) if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) @@ -386,10 +435,10 @@ def _coerce_to_type(x): dtype = x.dtype elif is_datetime64_dtype(x): x = to_datetime(x) - dtype = np.dtype('datetime64[ns]') + dtype = np.dtype("datetime64[ns]") elif is_timedelta64_dtype(x): x = to_timedelta(x) - dtype = np.dtype('timedelta64[ns]') + dtype = np.dtype("timedelta64[ns]") if dtype is not None: # GH 19768: force NaT to NaN during integer conversion @@ -414,12 +463,12 @@ def _convert_bin_to_numeric_type(bins, dtype): """ bins_dtype = infer_dtype(bins, skipna=False) if is_timedelta64_dtype(dtype): - if bins_dtype in ['timedelta', 'timedelta64']: + if bins_dtype in ["timedelta", "timedelta64"]: bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): - if bins_dtype in ['datetime', 'datetime64']: + if bins_dtype in ["datetime", "datetime64"]: bins = to_datetime(bins).view(np.int64) else: raise 
ValueError("bins must be of datetime64 dtype") @@ -443,28 +492,26 @@ def _convert_bin_to_datelike_type(bins, dtype): datelike """ if is_datetime64tz_dtype(dtype): - bins = to_datetime(bins.astype(np.int64), - utc=True).tz_convert(dtype.tz) + bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) elif is_datetime_or_timedelta_dtype(dtype): bins = Index(bins.astype(np.int64), dtype=dtype) return bins -def _format_labels(bins, precision, right=True, - include_lowest=False, dtype=None): +def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ - closed = 'right' if right else 'left' + closed = "right" if right else "left" if is_datetime64tz_dtype(dtype): formatter = partial(Timestamp, tz=dtype.tz) - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") elif is_datetime64_dtype(dtype): formatter = Timestamp - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") elif is_timedelta64_dtype(dtype): formatter = Timedelta - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) @@ -478,7 +525,7 @@ def _format_labels(bins, precision, right=True, # account that we are all right closed v = adjust(labels[0].left) - i = IntervalIndex([Interval(v, labels[0].right, closed='right')]) + i = IntervalIndex([Interval(v, labels[0].right, closed="right")]) labels = i.append(labels[1:]) return labels @@ -500,7 +547,7 @@ def _preprocess_for_cut(x): # Check that the passed array is a Pandas or Numpy object # We don't want to strip away a Pandas data-type here (e.g. datetimetz) - ndim = getattr(x, 'ndim', None) + ndim = getattr(x, "ndim", None) if ndim is None: x = np.asarray(x) if x.ndim != 1: @@ -509,8 +556,7 @@ def _preprocess_for_cut(x): return x_is_series, series_index, name, x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype): +def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dtype): """ handles post processing for the cut method where we combine the index information if the originally passed diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 9975fe65ac0fe3..044e058904dcec 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -51,6 +51,9 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]), - np.product(a[i])) - for i, x in enumerate(X)] + return [ + np.tile( + np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) + ) + for i, x in enumerate(X) + ] diff --git a/pandas/core/series.py b/pandas/core/series.py index 13966d4551b541..b3a7f38aef8ef5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,15 +19,38 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, - is_extension_array_dtype, is_extension_type, is_hashable, is_integer, - is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype) + _is_unorderable_exception, + ensure_platform_int, + is_bool, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetimelike, + is_dict_like, + 
is_extension_array_dtype, + is_extension_type, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_scalar, + is_string_like, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries) + ABCDataFrame, + ABCDatetimeArray, + ABCDatetimeIndex, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import ( - isna, na_value_for_dtype, notna, remove_na_arraylike) + isna, + na_value_for_dtype, + notna, + remove_na_arraylike, +) import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops @@ -37,7 +60,12 @@ from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.index import ( - Float64Index, Index, InvalidIndexError, MultiIndex, ensure_index) + Float64Index, + Index, + InvalidIndexError, + MultiIndex, + ensure_index, +) from pandas.core.indexes.accessors import CombinedDatetimelikeProperties import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex @@ -52,17 +80,24 @@ import pandas.io.formats.format as fmt import pandas.plotting -__all__ = ['Series'] +__all__ = ["Series"] _shared_doc_kwargs = dict( - axes='index', klass='Series', axes_single_arg="{0 or 'index'}", + axes="index", + klass="Series", + axes_single_arg="{0 or 'index'}", axis="""axis : {0 or 'index'} Parameter needed for compatibility with DataFrame.""", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - unique='np.ndarray', duplicated='Series', - optional_by='', optional_mapper='', optional_labels='', optional_axis='', - versionadded_to_excel='\n .. versionadded:: 0.20.0\n') + unique="np.ndarray", + duplicated="Series", + optional_by="", + optional_mapper="", + optional_labels="", + optional_axis="", + versionadded_to_excel="\n .. versionadded:: 0.20.0\n", +) # see gh-16971 @@ -74,8 +109,11 @@ def remove_na(arr): Use s[s.notnull()] instead. """ - warnings.warn("remove_na is deprecated and is a private " - "function. Do not use.", FutureWarning, stacklevel=2) + warnings.warn( + "remove_na is deprecated and is a private " "function. Do not use.", + FutureWarning, + stacklevel=2, + ) return remove_na_arraylike(arr) @@ -87,12 +125,12 @@ def _coerce_method(converter): def wrapper(self): if len(self) == 1: return converter(self.iloc[0]) - raise TypeError("cannot convert the series to " - "{0}".format(str(converter))) + raise TypeError("cannot convert the series to " "{0}".format(str(converter))) wrapper.__name__ = "__{name}__".format(name=converter.__name__) return wrapper + # ---------------------------------------------------------------------- # Series class @@ -133,23 +171,26 @@ class Series(base.IndexOpsMixin, generic.NDFrame): copy : bool, default False Copy input data. 
""" - _metadata = ['name'] - _accessors = {'dt', 'cat', 'str', 'sparse'} + + _metadata = ["name"] + _accessors = {"dt", "cat", "str", "sparse"} # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = generic.NDFrame._deprecations | frozenset( - ['asobject', 'reshape', 'get_value', 'set_value', - 'valid', 'tolist']) + ["asobject", "reshape", "get_value", "set_value", "valid", "tolist"] + ) # Override cache_readonly bc Series is mutable - hasnans = property(base.IndexOpsMixin.hasnans.func, - doc=base.IndexOpsMixin.hasnans.__doc__) + hasnans = property( + base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ + ) _data = None # type: SingleBlockManager # ---------------------------------------------------------------------- # Constructors - def __init__(self, data=None, index=None, dtype=None, name=None, - copy=False, fastpath=False): + def __init__( + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + ): # we are called internally, so short-circuit if fastpath: @@ -172,15 +213,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if dtype is not None: # GH 26336: explicitly handle 'category' to avoid warning # TODO: Remove after CategoricalDtype defaults to ordered=False - if (isinstance(dtype, str) and dtype == 'category' and - is_categorical(data)): + if ( + isinstance(dtype, str) + and dtype == "category" + and is_categorical(data) + ): dtype = data.dtype dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): - raise NotImplementedError("initializing a Series from a " - "MultiIndex is not supported") + raise NotImplementedError( + "initializing a Series from a " "MultiIndex is not supported" + ) elif isinstance(data, Index): if name is None: name = data.name @@ -191,8 +236,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: # need to copy to avoid aliasing issues data = data._values.copy() - if (isinstance(data, ABCDatetimeIndex) and - data.tz is not None): + if isinstance(data, ABCDatetimeIndex) and data.tz is not None: # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies data = data._values.copy(deep=True) @@ -218,16 +262,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None, elif not data.index.equals(index) or copy: # GH#19275 SingleBlockManager input should only be called # internally - raise AssertionError('Cannot pass both SingleBlockManager ' - '`data` argument and a different ' - '`index` argument. `copy` must ' - 'be False.') + raise AssertionError( + "Cannot pass both SingleBlockManager " + "`data` argument and a different " + "`index` argument. `copy` must " + "be False." 
+ ) elif is_extension_array_dtype(data): pass elif isinstance(data, (set, frozenset)): - raise TypeError("{0!r} type is unordered" - "".format(data.__class__.__name__)) + raise TypeError( + "{0!r} type is unordered" "".format(data.__class__.__name__) + ) elif isinstance(data, ABCSparseArray): # handle sparse passed here (and force conversion) data = data.to_dense() @@ -245,22 +292,20 @@ def __init__(self, data=None, index=None, dtype=None, name=None, try: if len(index) != len(data): raise ValueError( - 'Length of passed values is {val}, ' - 'index implies {ind}' - .format(val=len(data), ind=len(index))) + "Length of passed values is {val}, " + "index implies {ind}".format(val=len(data), ind=len(index)) + ) except TypeError: pass # create/copy the manager if isinstance(data, SingleBlockManager): if dtype is not None: - data = data.astype(dtype=dtype, errors='ignore', - copy=copy) + data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: data = data.copy() else: - data = sanitize_array(data, index, dtype, copy, - raise_cast_failure=True) + data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True) data = SingleBlockManager(data, index, fastpath=True) @@ -317,8 +362,9 @@ def _init_dict(self, data, index=None, dtype=None): return s._data, s.index @classmethod - def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, - fastpath=False): + def from_array( + cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False + ): """ Construct Series from array. @@ -330,14 +376,20 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, Series Constructed Series. """ - warnings.warn("'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.Series(..) " - "constructor instead.", FutureWarning, stacklevel=2) + warnings.warn( + "'from_array' is deprecated and will be removed in a " + "future version. Please use the pd.Series(..) 
" + "constructor instead.", + FutureWarning, + stacklevel=2, + ) if isinstance(arr, ABCSparseArray): from pandas.core.sparse.series import SparseSeries + cls = SparseSeries - return cls(arr, index=index, name=name, dtype=dtype, - copy=copy, fastpath=fastpath) + return cls( + arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath + ) # ---------------------------------------------------------------------- @@ -348,6 +400,7 @@ def _constructor(self): @property def _constructor_expanddim(self): from pandas.core.frame import DataFrame + return DataFrame # types @@ -367,8 +420,7 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - if not isinstance(labels, - (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: labels = DatetimeIndex(labels) # need to set here because we changed the index @@ -381,15 +433,15 @@ def _set_axis(self, axis, labels, fastpath=False): self._set_subtyp(is_all_dates) - object.__setattr__(self, '_index', labels) + object.__setattr__(self, "_index", labels) if not fastpath: self._data.set_axis(axis, labels) def _set_subtyp(self, is_all_dates): if is_all_dates: - object.__setattr__(self, '_subtyp', 'time_series') + object.__setattr__(self, "_subtyp", "time_series") else: - object.__setattr__(self, '_subtyp', 'series') + object.__setattr__(self, "_subtyp", "series") def _update_inplace(self, result, **kwargs): # we want to call the generic version and not the IndexOpsMixin @@ -405,8 +457,8 @@ def name(self): @name.setter def name(self, value): if value is not None and not is_hashable(value): - raise TypeError('Series.name must be a hashable type') - object.__setattr__(self, '_name', value) + raise TypeError("Series.name must be a hashable type") + object.__setattr__(self, "_name", value) # ndarray compatibility @property @@ -431,10 +483,13 @@ def ftype(self): .. deprecated:: 0.25.0 Use :func:`dtype` instead. """ - warnings.warn("Series.ftype is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "Series.ftype is deprecated and will " + "be removed in a future version. " + "Use Series.dtype instead.", + FutureWarning, + stacklevel=2, + ) return self._data.ftype @@ -446,10 +501,13 @@ def ftypes(self): .. deprecated:: 0.25.0 Use :func:`dtypes` instead. """ - warnings.warn("Series.ftypes is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "Series.ftypes is deprecated and will " + "be removed in a future version. " + "Use Series.dtype instead.", + FutureWarning, + stacklevel=2, + ) return self._data.ftype @@ -524,7 +582,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -541,12 +601,15 @@ def asobject(self): *this is an internal non-public method* """ - warnings.warn("'asobject' is deprecated. Use 'astype(object)'" - " instead", FutureWarning, stacklevel=2) + warnings.warn( + "'asobject' is deprecated. 
Use 'astype(object)'" " instead", + FutureWarning, + stacklevel=2, + ) return self.astype(object).values # ops - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return the flattened underlying data as an ndarray. @@ -576,9 +639,11 @@ def compress(self, condition, *args, **kwargs): -------- numpy.ndarray.compress """ - msg = ("Series.compress(condition) is deprecated. " - "Use 'Series[condition]' or " - "'np.asarray(series).compress(condition)' instead.") + msg = ( + "Series.compress(condition) is deprecated. " + "Use 'Series[condition]' or " + "'np.asarray(series).compress(condition)' instead." + ) warnings.warn(msg, FutureWarning, stacklevel=2) nv.validate_compress(args, kwargs) return self[condition] @@ -624,9 +689,11 @@ def nonzero(self): d 4 dtype: int64 """ - msg = ("Series.nonzero() is deprecated " - "and will be removed in a future version." - "Use Series.to_numpy().nonzero() instead") + msg = ( + "Series.nonzero() is deprecated " + "and will be removed in a future version." + "Use Series.to_numpy().nonzero() instead" + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self._values.nonzero() @@ -640,8 +707,11 @@ def put(self, *args, **kwargs): -------- numpy.ndarray.put """ - warnings.warn('`put` has been deprecated and will be removed in a' - 'future version.', FutureWarning, stacklevel=2) + warnings.warn( + "`put` has been deprecated and will be removed in a" "future version.", + FutureWarning, + stacklevel=2, + ) self._values.put(*args, **kwargs) def __len__(self): @@ -716,27 +786,26 @@ def view(self, dtype=None): 4 2 dtype: int8 """ - return self._constructor(self._values.view(dtype), - index=self.index).__finalize__(self) + return self._constructor( + self._values.view(dtype), index=self.index + ).__finalize__(self) # ---------------------------------------------------------------------- # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) def __array_ufunc__( - self, - ufunc: Callable, - method: str, - *inputs: Any, - **kwargs: Any + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any ): # TODO: handle DataFrame from pandas.core.internals.construction import extract_array + cls = type(self) # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -745,19 +814,19 @@ def __array_ufunc__( for item in inputs: higher_priority = ( - hasattr(item, '__array_priority__') and - item.__array_priority__ > self.__array_priority__ + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ ) has_array_ufunc = ( - hasattr(item, '__array_ufunc__') and - type(item).__array_ufunc__ not in no_defer and - not isinstance(item, self._HANDLED_TYPES) + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) ) if higher_priority or has_array_ufunc: return NotImplemented # align all the inputs. 
- names = [getattr(x, 'name') for x in inputs if hasattr(x, 'name')] + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] types = tuple(type(x) for x in inputs) # TODO: dataframe alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] @@ -770,8 +839,10 @@ def __array_ufunc__( index = alignable[0].index for s in alignable[1:]: index |= s.index - inputs = tuple(x.reindex(index) if issubclass(t, Series) else x - for x, t in zip(inputs, types)) + inputs = tuple( + x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types) + ) else: index = self.index @@ -788,7 +859,7 @@ def construct_return(result): return result elif result.ndim > 1: # e.g. np.subtract.outer - if method == 'outer': + if method == "outer": msg = ( "outer method for ufunc {} is not implemented on " "pandas objects. Returning an ndarray, but in the " @@ -796,18 +867,14 @@ def construct_return(result): "Consider explicitly converting the Series " "to an array with '.array' first." ) - warnings.warn(msg.format(ufunc), FutureWarning, - stacklevel=3) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3) return result - return self._constructor(result, - index=index, - name=name, - copy=False) + return self._constructor(result, index=index, name=name, copy=False) if type(result) is tuple: # multiple return values return tuple(construct_return(x) for x in result) - elif method == 'at': + elif method == "at": # no return value return None else: @@ -860,8 +927,11 @@ def __array__(self, dtype=None): array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - if (dtype is None and isinstance(self.array, ABCDatetimeArray) - and getattr(self.dtype, 'tz', None)): + if ( + dtype is None + and isinstance(self.array, ABCDatetimeArray) + and getattr(self.dtype, "tz", None) + ): msg = ( "Converting timezone-aware DatetimeArray to timezone-naive " "ndarray with 'datetime64[ns]' dtype. In the future, this " @@ -871,7 +941,7 @@ def __array__(self, dtype=None): "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." ) warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = 'M8[ns]' + dtype = "M8[ns]" return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- @@ -884,8 +954,11 @@ def real(self): .. deprecated 0.25.0 """ - warnings.warn("`real` has be deprecated and will be removed in a " - "future verison", FutureWarning, stacklevel=2) + warnings.warn( + "`real` has be deprecated and will be removed in a " "future verison", + FutureWarning, + stacklevel=2, + ) return self.values.real @real.setter @@ -899,8 +972,11 @@ def imag(self): .. 
deprecated 0.25.0 """ - warnings.warn("`imag` has be deprecated and will be removed in a " - "future verison", FutureWarning, stacklevel=2) + warnings.warn( + "`imag` has be deprecated and will be removed in a " "future verison", + FutureWarning, + stacklevel=2, + ) return self.values.imag @imag.setter @@ -916,8 +992,8 @@ def imag(self, v): def _unpickle_series_compat(self, state): if isinstance(state, dict): - self._data = state['_data'] - self.name = state['name'] + self._data = state["_data"] + self.name = state["name"] self.index = self._data.index elif isinstance(state, tuple): @@ -975,7 +1051,7 @@ def _ixs(self, i, axis=0): raise except Exception: if isinstance(i, slice): - indexer = self.index._convert_slice_indexer(i, kind='iloc') + indexer = self.index._convert_slice_indexer(i, kind="iloc") return self._get_values(indexer) else: label = self.index[i] @@ -989,8 +1065,7 @@ def _is_mixed_type(self): return False def _slice(self, slobj, axis=0, kind=None): - slobj = self.index._convert_slice_indexer(slobj, - kind=kind or 'getitem') + slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") return self._get_values(slobj) def __getitem__(self, key): @@ -1006,8 +1081,8 @@ def __getitem__(self, key): try: if not is_scalar(self.index.get_loc(key)): result = self._constructor( - result, index=[key] * len(result), - dtype=self.dtype).__finalize__(self) + result, index=[key] * len(result), dtype=self.dtype + ).__finalize__(self) except KeyError: pass return result @@ -1024,8 +1099,7 @@ def __getitem__(self, key): else: # we can try to coerce the indexer (or this will raise) - new_key = self.index._convert_scalar_indexer(key, - kind='getitem') + new_key = self.index._convert_scalar_indexer(key, kind="getitem") if type(new_key) != type(key): return self.__getitem__(new_key) raise @@ -1044,11 +1118,13 @@ def __getitem__(self, key): def _get_with(self, key): # other: fancy integer or otherwise if isinstance(key, slice): - indexer = self.index._convert_slice_indexer(key, kind='getitem') + indexer = self.index._convert_slice_indexer(key, kind="getitem") return self._get_values(indexer) elif isinstance(key, ABCDataFrame): - raise TypeError('Indexing a Series with DataFrame is not ' - 'supported, use the appropriate DataFrame column') + raise TypeError( + "Indexing a Series with DataFrame is not " + "supported, use the appropriate DataFrame column" + ) elif isinstance(key, tuple): try: return self._get_values_tuple(key) @@ -1068,12 +1144,12 @@ def _get_with(self, key): else: key_type = lib.infer_dtype(key, skipna=False) - if key_type == 'integer': + if key_type == "integer": if self.index.is_integer() or self.index.is_floating(): return self.loc[key] else: return self._get_values(key) - elif key_type == 'boolean': + elif key_type == "boolean": return self._get_values(key) try: @@ -1096,17 +1172,19 @@ def _get_values_tuple(self, key): return self._get_values(key) if not isinstance(self.index, MultiIndex): - raise ValueError('Can only tuple-index with a MultiIndex') + raise ValueError("Can only tuple-index with a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) - return self._constructor(self._values[indexer], - index=new_index).__finalize__(self) + return self._constructor(self._values[indexer], index=new_index).__finalize__( + self + ) def _get_values(self, indexer): try: - return self._constructor(self._data.get_slice(indexer), - fastpath=True).__finalize__(self) + return self._constructor( + 
self._data.get_slice(indexer), fastpath=True + ).__finalize__(self) except Exception: return self._values[indexer] @@ -1121,8 +1199,7 @@ def setitem(key, value): raise except (KeyError, ValueError): values = self._values - if (is_integer(key) and - not self.index.inferred_type == 'integer'): + if is_integer(key) and not self.index.inferred_type == "integer": values[key] = value return @@ -1137,8 +1214,7 @@ def setitem(key, value): value = iNaT try: - self.index._engine.set_value(self._values, key, - value) + self.index._engine.set_value(self._values, key, value) return except TypeError: pass @@ -1147,8 +1223,7 @@ def setitem(key, value): return except TypeError as e: - if (isinstance(key, tuple) and - not isinstance(self.index, MultiIndex)): + if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): raise ValueError("Can only tuple-index with a MultiIndex") # python 3 type errors should be raised @@ -1183,7 +1258,7 @@ def _set_with_engine(self, key, value): def _set_with(self, key, value): # other: fancy integer or otherwise if isinstance(key, slice): - indexer = self.index._convert_slice_indexer(key, kind='getitem') + indexer = self.index._convert_slice_indexer(key, kind="getitem") return self._set_values(indexer, value) else: if isinstance(key, tuple): @@ -1205,12 +1280,12 @@ def _set_with(self, key, value): else: key_type = lib.infer_dtype(key, skipna=False) - if key_type == 'integer': - if self.index.inferred_type == 'integer': + if key_type == "integer": + if self.index.inferred_type == "integer": self._set_labels(key, value) else: return self._set_values(key, value) - elif key_type == 'boolean': + elif key_type == "boolean": self._set_values(key.astype(np.bool_), value) else: self._set_labels(key, value) @@ -1223,7 +1298,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError('%s not contained in the index' % str(key[mask])) + raise ValueError("%s not contained in the index" % str(key[mask])) self._set_values(indexer, value) def _set_values(self, key, value): @@ -1287,8 +1362,7 @@ def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) def get_value(self, label, takeable=False): """ @@ -1306,16 +1380,20 @@ def get_value(self, label, takeable=False): ------- scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(label, takeable=takeable) def _get_value(self, label, takeable=False): if takeable is True: return com.maybe_box_datetimelike(self._values[label]) return self.index.get_value(self._values, label) + _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): @@ -1342,10 +1420,13 @@ def set_value(self, label, value, takeable=False): If label is contained, will be reference to calling Series, otherwise a new object. """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): @@ -1360,6 +1441,7 @@ def _set_value(self, label, value, takeable=False): self.loc[label] = value return self + _set_value.__doc__ = set_value.__doc__ def reset_index(self, level=None, drop=False, name=None, inplace=False): @@ -1470,7 +1552,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): 2 baz one 2 3 baz two 3 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if drop: new_index = ibase.default_index(len(self)) if level is not None: @@ -1485,11 +1567,13 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): # set name if it was passed, otherwise, keep the previous name self.name = name or self.name else: - return self._constructor(self._values.copy(), - index=new_index).__finalize__(self) + return self._constructor( + self._values.copy(), index=new_index + ).__finalize__(self) elif inplace: - raise TypeError('Cannot reset_index inplace on a Series ' - 'to create a DataFrame') + raise TypeError( + "Cannot reset_index inplace on a Series " "to create a DataFrame" + ) else: df = self.to_frame(name) return df.reset_index(level=level, drop=drop) @@ -1503,22 +1587,43 @@ def __repr__(self): """ buf = StringIO("") width, height = get_terminal_size() - max_rows = (height if get_option("display.max_rows") == 0 else - get_option("display.max_rows")) - min_rows = (height if get_option("display.max_rows") == 0 else - get_option("display.min_rows")) + max_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.max_rows") + ) + min_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.min_rows") + ) show_dimensions = get_option("display.show_dimensions") - self.to_string(buf=buf, name=self.name, dtype=self.dtype, - min_rows=min_rows, max_rows=max_rows, - length=show_dimensions) + self.to_string( + buf=buf, + name=self.name, + dtype=self.dtype, + min_rows=min_rows, + max_rows=max_rows, + length=show_dimensions, + ) result = buf.getvalue() return result - def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, - index=True, length=False, dtype=False, name=False, - max_rows=None, min_rows=None): + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None, + min_rows=None, + ): """ Render a string representation of the Series. @@ -1554,19 +1659,27 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, String representation of Series if ``buf=None``, otherwise None. 
""" - formatter = fmt.SeriesFormatter(self, name=name, length=length, - header=header, index=index, - dtype=dtype, na_rep=na_rep, - float_format=float_format, - min_rows=min_rows, - max_rows=max_rows) + formatter = fmt.SeriesFormatter( + self, + name=name, + length=length, + header=header, + index=index, + dtype=dtype, + na_rep=na_rep, + float_format=float_format, + min_rows=min_rows, + max_rows=max_rows, + ) result = formatter.to_string() # catch contract violations if not isinstance(result, str): - raise AssertionError("result must be of type unicode, type" - " of result is {0!r}" - "".format(result.__class__.__name__)) + raise AssertionError( + "result must be of type unicode, type" + " of result is {0!r}" + "".format(result.__class__.__name__) + ) if buf is None: return result @@ -1574,7 +1687,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, try: buf.write(result) except AttributeError: - with open(buf, 'w') as f: + with open(buf, "w") as f: f.write(result) # ---------------------------------------------------------------------- @@ -1691,7 +1804,7 @@ def to_frame(self, name=None): return df - def to_sparse(self, kind='block', fill_value=None): + def to_sparse(self, kind="block", fill_value=None): """ Convert Series to SparseSeries. @@ -1709,16 +1822,19 @@ def to_sparse(self, kind='block', fill_value=None): Sparse representation of the Series. """ - warnings.warn("Series.to_sparse is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) + warnings.warn( + "Series.to_sparse is deprecated and will be removed " "in a future version", + FutureWarning, + stacklevel=2, + ) from pandas.core.sparse.series import SparseSeries values = SparseArray(self, kind=kind, fill_value=fill_value) with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="SparseSeries") - return SparseSeries( - values, index=self.index, name=self.name - ).__finalize__(self) + return SparseSeries(values, index=self.index, name=self.name).__finalize__( + self + ) def _set_name(self, name, inplace=False): """ @@ -1730,7 +1846,7 @@ def _set_name(self, name, inplace=False): inplace : bool whether to modify `self` directly or return a copy """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") ser = self if inplace else self.copy() ser.name = name return ser @@ -1777,8 +1893,7 @@ def count(self, level=None): obs = level_codes[notna(self.values)] out = np.bincount(obs, minlength=len(lev) or None) - return self._constructor(out, index=lev, - dtype='int64').__finalize__(self) + return self._constructor(out, index=lev, dtype="int64").__finalize__(self) def mode(self, dropna=True): """ @@ -1865,7 +1980,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep='first', inplace=False): + def drop_duplicates(self, keep="first", inplace=False): """ Return Series with duplicate values removed. @@ -1939,7 +2054,7 @@ def drop_duplicates(self, keep='first', inplace=False): """ return super().drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): """ Indicate duplicate Series values. @@ -2158,24 +2273,32 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): # ndarray compat argmin = deprecate( - 'argmin', idxmin, '0.21.0', - msg=dedent(""" + "argmin", + idxmin, + "0.21.0", + msg=dedent( + """ The current behaviour of 'Series.argmin' is deprecated, use 'idxmin' instead. 
The behavior of 'argmin' will be corrected to return the positional minimum in the future. For now, use 'series.values.argmin' or 'np.argmin(np.array(values))' to get the position of the minimum - row.""") + row.""" + ), ) argmax = deprecate( - 'argmax', idxmax, '0.21.0', - msg=dedent(""" + "argmax", + idxmax, + "0.21.0", + msg=dedent( + """ The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' instead. The behavior of 'argmax' will be corrected to return the positional maximum in the future. For now, use 'series.values.argmax' or 'np.argmax(np.array(values))' to get the position of the maximum - row.""") + row.""" + ), ) def round(self, decimals=0, *args, **kwargs): @@ -2214,7 +2337,7 @@ def round(self, decimals=0, *args, **kwargs): return result - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): """ Return value at the given quantile. @@ -2265,21 +2388,18 @@ def quantile(self, q=0.5, interpolation='linear'): # about 2D cases. df = self.to_frame() - result = df.quantile(q=q, interpolation=interpolation, - numeric_only=False) + result = df.quantile(q=q, interpolation=interpolation, numeric_only=False) if result.ndim == 2: result = result.iloc[:, 0] if is_list_like(q): result.name = self.name - return self._constructor(result, - index=Float64Index(q), - name=self.name) + return self._constructor(result, index=Float64Index(q), name=self.name) else: # scalar return result.iloc[0] - def corr(self, other, method='pearson', min_periods=None): + def corr(self, other, method="pearson", min_periods=None): """ Compute correlation with `other` Series, excluding missing values. @@ -2315,17 +2435,20 @@ def corr(self, other, method='pearson', min_periods=None): >>> s1.corr(s2, method=histogram_intersection) 0.3 """ - this, other = self.align(other, join='inner', copy=False) + this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan - if method in ['pearson', 'spearman', 'kendall'] or callable(method): - return nanops.nancorr(this.values, other.values, method=method, - min_periods=min_periods) + if method in ["pearson", "spearman", "kendall"] or callable(method): + return nanops.nancorr( + this.values, other.values, method=method, min_periods=min_periods + ) - raise ValueError("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method)) + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method) + ) def cov(self, other, min_periods=None): """ @@ -2351,11 +2474,10 @@ def cov(self, other, min_periods=None): >>> s1.cov(s2) -0.01685762652715874 """ - this, other = self.align(other, join='inner', copy=False) + this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan - return nanops.nancov(this.values, other.values, - min_periods=min_periods) + return nanops.nancov(this.values, other.values, min_periods=min_periods) def diff(self, periods=1): """ @@ -2517,11 +2639,11 @@ def dot(self, other): array([24, 14]) """ from pandas.core.frame import DataFrame + if isinstance(other, (Series, DataFrame)): common = self.index.union(other.index) - if (len(common) > len(self.index) or - len(common) > len(other.index)): - raise ValueError('matrices are not aligned') + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") left = self.reindex(index=common, copy=False) right 
= other.reindex(index=common, copy=False) @@ -2531,18 +2653,20 @@ def dot(self, other): lvals = self.values rvals = np.asarray(other) if lvals.shape[0] != rvals.shape[0]: - raise Exception('Dot product shape mismatch, %s vs %s' % - (lvals.shape, rvals.shape)) + raise Exception( + "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape) + ) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), - index=other.columns).__finalize__(self) + return self._constructor( + np.dot(lvals, rvals), index=other.columns + ).__finalize__(self) elif isinstance(other, Series): return np.dot(lvals, rvals) elif isinstance(rvals, np.ndarray): return np.dot(lvals, rvals) else: # pragma: no cover - raise TypeError('unsupported type: %s' % type(other)) + raise TypeError("unsupported type: %s" % type(other)) def __matmul__(self, other): """ @@ -2556,11 +2680,10 @@ def __rmatmul__(self, other): """ return self.dot(np.transpose(other)) - @Substitution(klass='Series') - @Appender(base._shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): - return algorithms.searchsorted(self._values, value, - side=side, sorter=sorter) + @Substitution(klass="Series") + @Appender(base._shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination @@ -2644,8 +2767,9 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat = [self] + to_append else: to_concat = [self, to_append] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + return concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) def _binop(self, other, func, level=None, fill_value=None): """ @@ -2668,24 +2792,22 @@ def _binop(self, other, func, level=None, fill_value=None): """ if not isinstance(other, Series): - raise AssertionError('Other operand must be Series') + raise AssertionError("Other operand must be Series") new_index = self.index this = self if not self.index.equals(other.index): - this, other = self.align(other, level=level, join='outer', - copy=False) + this, other = self.align(other, level=level, join="outer", copy=False) new_index = this.index - this_vals, other_vals = ops.fill_binop(this.values, other.values, - fill_value) + this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - if func.__name__ in ['divmod', 'rdivmod']: + if func.__name__ in ["divmod", "rdivmod"]: ret = ops._construct_divmod_result(self, result, new_index, name) else: ret = ops._construct_result(self, result, new_index, name) @@ -2770,13 +2892,13 @@ def combine(self, other, func, fill_value=None): for idx in new_index: lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values.append(func(lv, rv)) else: # Assume that other is a scalar, so apply the function for # each element in the Series new_index = self.index - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values = [func(lv, other) for lv in self._values] new_name = self.name @@ -2890,8 +3012,14 @@ def update(self, other): # ---------------------------------------------------------------------- # 
Reindexing, sorting - def sort_values(self, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + def sort_values( + self, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): """ Sort by the values. @@ -2996,14 +3124,16 @@ def sort_values(self, axis=0, ascending=True, inplace=False, 0 z dtype: object """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) # GH 5856/5853 if inplace and self._is_cached: - raise ValueError("This Series is a view of some other array, to " - "sort in-place you must create a copy") + raise ValueError( + "This Series is a view of some other array, to " + "sort in-place you must create a copy" + ) def _try_kind_sort(arr): # easier to ask forgiveness than permission @@ -3013,7 +3143,7 @@ def _try_kind_sort(arr): except TypeError: # stable sort not available for object dtype # uses the argsort default quicksort - return arr.argsort(kind='quicksort') + return arr.argsort(kind="quicksort") arr = self._values sortedIdx = np.empty(len(self), dtype=np.int32) @@ -3027,26 +3157,28 @@ def _try_kind_sort(arr): if is_list_like(ascending): if len(ascending) != 1: - raise ValueError('Length of ascending (%d) must be 1 ' - 'for Series' % (len(ascending))) + raise ValueError( + "Length of ascending (%d) must be 1 " + "for Series" % (len(ascending)) + ) ascending = ascending[0] if not is_bool(ascending): - raise ValueError('ascending must be boolean') + raise ValueError("ascending must be boolean") if not ascending: argsorted = argsorted[::-1] - if na_position == 'last': + if na_position == "last": n = good.sum() sortedIdx[:n] = idx[good][argsorted] sortedIdx[n:] = idx[bad] - elif na_position == 'first': + elif na_position == "first": n = bad.sum() sortedIdx[n:] = idx[good][argsorted] sortedIdx[:n] = idx[bad] else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + raise ValueError("invalid na_position: {!r}".format(na_position)) result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx]) @@ -3055,8 +3187,16 @@ def _try_kind_sort(arr): else: return result.__finalize__(self) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ): """ Sort Series by index labels. 
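As a quick illustration (not part of the patch itself) of the ``na_position`` handling in ``sort_values`` reformatted just above, assuming pandas 0.25-era reprs:

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series([3.0, np.nan, 1.0])
>>> s.sort_values(na_position="first")
1    NaN
2    1.0
0    3.0
dtype: float64
>>> s.sort_values(na_position="bad")
Traceback (most recent call last):
    ...
ValueError: invalid na_position: 'bad'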
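Further up in this file's diff, the ``__array_ufunc__`` changes align Series inputs on the union of their indexes before applying the ufunc; roughly (a sketch, not part of the diff):

>>> import numpy as np
>>> import pandas as pd
>>> s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
>>> s2 = pd.Series([10, 20, 30], index=["b", "c", "d"])
>>> np.add(s1, s2)  # non-overlapping labels come back as NaN
a     NaN
b    12.0
c    23.0
d     NaN
dtype: float64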
@@ -3169,34 +3309,40 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, """ # TODO: this can be combined with DataFrame.sort_index impl as # almost identical - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) index = self.index if level is not None: - new_index, indexer = index.sortlevel(level, ascending=ascending, - sort_remaining=sort_remaining) + new_index, indexer = index.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer + labels = index._sort_levels_monotonic() - indexer = lexsort_indexer(labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position) + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) else: from pandas.core.sorting import nargsort # Check monotonic-ness before sort an index # GH11080 - if ((ascending and index.is_monotonic_increasing) or - (not ascending and index.is_monotonic_decreasing)): + if (ascending and index.is_monotonic_increasing) or ( + not ascending and index.is_monotonic_decreasing + ): if inplace: return else: return self.copy() - indexer = nargsort(index, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + index, kind=kind, ascending=ascending, na_position=na_position + ) indexer = ensure_platform_int(indexer) new_index = index.take(indexer) @@ -3210,7 +3356,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return result.__finalize__(self) - def argsort(self, axis=0, kind='quicksort', order=None): + def argsort(self, axis=0, kind="quicksort", order=None): """ Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. @@ -3239,18 +3385,16 @@ def argsort(self, axis=0, kind='quicksort', order=None): mask = isna(values) if mask.any(): - result = Series(-1, index=self.index, name=self.name, - dtype='int64') + result = Series(-1, index=self.index, name=self.name, dtype="int64") notmask = ~mask result[notmask] = np.argsort(values[notmask], kind=kind) - return self._constructor(result, - index=self.index).__finalize__(self) + return self._constructor(result, index=self.index).__finalize__(self) else: return self._constructor( - np.argsort(values, kind=kind), index=self.index, - dtype='int64').__finalize__(self) + np.argsort(values, kind=kind), index=self.index, dtype="int64" + ).__finalize__(self) - def nlargest(self, n=5, keep='first'): + def nlargest(self, n=5, keep="first"): """ Return the largest `n` elements. @@ -3348,7 +3492,7 @@ def nlargest(self, n=5, keep='first'): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest(self, n=5, keep='first'): + def nsmallest(self, n=5, keep="first"): """ Return the smallest `n` elements. @@ -3465,8 +3609,9 @@ def swaplevel(self, i=-2, j=-1, copy=True): the two innermost levels of the index. 
""" new_index = self.index.swaplevel(i, j) - return self._constructor(self._values, index=new_index, - copy=copy).__finalize__(self) + return self._constructor(self._values, index=new_index, copy=copy).__finalize__( + self + ) def reorder_levels(self, order): """ @@ -3484,7 +3629,7 @@ def reorder_levels(self, order): type of caller (new object) """ if not isinstance(self.index, MultiIndex): # pragma: no cover - raise Exception('Can only reorder levels on a hierarchical axis.') + raise Exception("Can only reorder levels on a hierarchical axis.") result = self.copy() result.index = result.index.reorder_levels(order) @@ -3532,6 +3677,7 @@ def unstack(self, level=-1, fill_value=None): b 2 4 """ from pandas.core.reshape.reshape import unstack + return unstack(self, level, fill_value) # ---------------------------------------------------------------------- @@ -3612,10 +3758,8 @@ def map(self, arg, na_action=None): 3 I am a rabbit dtype: object """ - new_values = super()._map_values( - arg, na_action=na_action) - return self._constructor(new_values, - index=self.index).__finalize__(self) + new_values = super()._map_values(arg, na_action=na_action) + return self._constructor(new_values, index=self.index).__finalize__(self) def _gotitem(self, key, ndim, subset=None): """ @@ -3631,14 +3775,17 @@ def _gotitem(self, key, ndim, subset=None): """ return self - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- Series.apply : Invoke function on a Series. Series.transform : Transform function producing a Series with like indexes. - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -3656,13 +3803,16 @@ def _gotitem(self, key, ndim, subset=None): min 1 max 4 dtype: int64 - """) + """ + ) - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='\n.. versionadded:: 0.20.0\n', - **_shared_doc_kwargs) - @Appender(generic._shared_docs['aggregate']) + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. 
versionadded:: 0.20.0\n", + **_shared_doc_kwargs + ) + @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -3671,8 +3821,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): # we can be called from an inner function which # passes this meta-data - kwargs.pop('_axis', None) - kwargs.pop('_level', None) + kwargs.pop("_axis", None) + kwargs.pop("_level", None) # try a regular apply, this evaluates lambdas # row-by-row; however if the lambda is expected a Series @@ -3691,7 +3841,7 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - @Appender(generic._shared_docs['transform'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["transform"] % _shared_doc_kwargs) def transform(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -3795,8 +3945,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): dtype: float64 """ if len(self) == 0: - return self._constructor(dtype=self.dtype, - index=self.index).__finalize__(self) + return self._constructor(dtype=self.dtype, index=self.index).__finalize__( + self + ) # dispatch to agg if isinstance(func, (list, dict)): @@ -3808,12 +3959,14 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # handle ufuncs and lambdas if kwds or args and not isinstance(func, np.ufunc): + def f(x): return func(x, *args, **kwds) + else: f = func - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if isinstance(f, np.ufunc): return f(self) @@ -3827,14 +3980,13 @@ def f(x): if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used - return self._constructor_expanddim(pd.array(mapped), - index=self.index) + return self._constructor_expanddim(pd.array(mapped), index=self.index) else: - return self._constructor(mapped, - index=self.index).__finalize__(self) + return self._constructor(mapped, index=self.index).__finalize__(self) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ Perform a reduction operation. 
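The small ``f(x)`` wrapper in ``Series.apply`` above simply forwards extra ``args``/``kwds`` to the applied function; a minimal sketch, not part of the patch:

>>> import pandas as pd
>>> s = pd.Series([1, 2, 3])
>>> s.apply(lambda x, offset: x + offset, args=(10,))
0    11
1    12
2    13
dtype: int64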
@@ -3864,17 +4016,24 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # dispatch to numpy arrays elif isinstance(delegate, np.ndarray): if numeric_only: - raise NotImplementedError('Series.{0} does not implement ' - 'numeric_only.'.format(name)) - with np.errstate(all='ignore'): + raise NotImplementedError( + "Series.{0} does not implement " "numeric_only.".format(name) + ) + with np.errstate(all="ignore"): return op(delegate, skipna=skipna, **kwds) # TODO(EA) dispatch to Index # remove once all internals extension types are # moved to ExtensionArrays - return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, - numeric_only=numeric_only, - filter_type=filter_type, **kwds) + return delegate._reduce( + op=op, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + filter_type=filter_type, + **kwds + ) def _reindex_indexer(self, new_index, indexer, copy): if indexer is None: @@ -3882,8 +4041,9 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer, - allow_fill=True, fill_value=None) + new_values = algorithms.take_1d( + self._values, indexer, allow_fill=True, fill_value=None + ) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3893,14 +4053,32 @@ def _needs_reindex_multi(self, axes, method, level): """ return False - @Appender(generic._shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - return super().align(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis, - broadcast_axis=broadcast_axis) + @Appender(generic._shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) def rename(self, index=None, **kwargs): """ @@ -3963,13 +4141,13 @@ def rename(self, index=None, **kwargs): 5 3 dtype: int64 """ - kwargs['inplace'] = validate_bool_kwarg(kwargs.get('inplace', False), - 'inplace') + kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - non_mapping = is_scalar(index) or (is_list_like(index) and - not is_dict_like(index)) + non_mapping = is_scalar(index) or ( + is_list_like(index) and not is_dict_like(index) + ) if non_mapping: - return self._set_name(index, inplace=kwargs.get('inplace')) + return self._set_name(index, inplace=kwargs.get("inplace")) return super().rename(index=index, **kwargs) @Substitution(**_shared_doc_kwargs) @@ -3977,8 +4155,16 @@ def rename(self, index=None, **kwargs): def reindex(self, index=None, **kwargs): return super().reindex(index=index, **kwargs) - def drop(self, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): """ Return Series with specified index labels removed. 
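(The rename hunk above branches on whether `index` is a mapping or a scalar/non-dict list-like; a small sketch of the two documented behaviours, with arbitrary values and labels, assuming a plain in-memory Series.)

    import pandas as pd

    s = pd.Series([1, 2, 3])

    # A scalar (or other non-mapping) only sets the Series name ...
    s.rename("my_name").name          # 'my_name'

    # ... whereas a mapping relabels the index.
    list(s.rename({0: "a"}).index)    # ['a', 1, 2]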
@@ -4065,29 +4251,62 @@ def drop(self, labels=None, axis=0, index=None, columns=None, length 0.3 dtype: float64 """ - return super().drop(labels=labels, axis=axis, index=index, - columns=columns, level=level, inplace=inplace, - errors=errors) + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.fillna.__doc__) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, downcast=downcast, - **kwargs) - - @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - return super().replace(to_replace=to_replace, value=value, - inplace=inplace, limit=limit, regex=regex, - method=method) - - @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + **kwargs + ) + + @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift(periods=periods, freq=freq, axis=axis, - fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) def memory_usage(self, index=True, deep=False): """ @@ -4153,13 +4372,14 @@ def _take(self, indices, axis=0, is_copy=False): # https://github.com/pandas-dev/pandas/issues/20664 # TODO: remove when the default Categorical.take behavior changes indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - kwargs = {'allow_fill': False} + kwargs = {"allow_fill": False} else: kwargs = {} new_values = self._values.take(indices, **kwargs) - result = (self._constructor(new_values, index=new_index, - fastpath=True).__finalize__(self)) + result = self._constructor( + new_values, index=new_index, fastpath=True + ).__finalize__(self) # Maybe set copy if we didn't actually change the index. 
if is_copy: @@ -4306,21 +4526,51 @@ def between(self, left, right, inclusive=True): @Appender(generic.NDFrame.to_csv.__doc__) def to_csv(self, *args, **kwargs): - names = ["path_or_buf", "sep", "na_rep", "float_format", "columns", - "header", "index", "index_label", "mode", "encoding", - "compression", "quoting", "quotechar", "line_terminator", - "chunksize", "date_format", "doublequote", - "escapechar", "decimal"] - - old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format", - "header", "index_label", "mode", "encoding", - "compression", "date_format", "decimal"] + names = [ + "path_or_buf", + "sep", + "na_rep", + "float_format", + "columns", + "header", + "index", + "index_label", + "mode", + "encoding", + "compression", + "quoting", + "quotechar", + "line_terminator", + "chunksize", + "date_format", + "doublequote", + "escapechar", + "decimal", + ] + + old_names = [ + "path_or_buf", + "index", + "sep", + "na_rep", + "float_format", + "header", + "index_label", + "mode", + "encoding", + "compression", + "date_format", + "decimal", + ] if "path" in kwargs: - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'path' will be renamed to 'path_or_buf'.", - FutureWarning, stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'path' will be renamed to 'path_or_buf'.", + FutureWarning, + stacklevel=2, + ) kwargs["path_or_buf"] = kwargs.pop("path") if len(args) > 1: @@ -4330,49 +4580,57 @@ def to_csv(self, *args, **kwargs): if not (is_string_like(maybe_sep) and len(maybe_sep) == 1): # old signature - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`. Note that the " - "order of arguments changed, and the new one " - "has 'sep' in first place, for which \"{}\" is " - "not a valid value. The old order will cease to " - "be supported in a future version. Please refer " - "to the documentation for `DataFrame.to_csv` " - "when updating your function " - "calls.".format(maybe_sep), - FutureWarning, stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`. Note that the " + "order of arguments changed, and the new one " + "has 'sep' in first place, for which \"{}\" is " + "not a valid value. The old order will cease to " + "be supported in a future version. 
Please refer " + "to the documentation for `DataFrame.to_csv` " + "when updating your function " + "calls.".format(maybe_sep), + FutureWarning, + stacklevel=2, + ) names = old_names - pos_args = dict(zip(names[:len(args)], args)) + pos_args = dict(zip(names[: len(args)], args)) for key in pos_args: if key in kwargs: - raise ValueError("Argument given by name ('{}') and position " - "({})".format(key, names.index(key))) + raise ValueError( + "Argument given by name ('{}') and position " + "({})".format(key, names.index(key)) + ) kwargs[key] = pos_args[key] if kwargs.get("header", None) is None: - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'header' will change its default value from False " - "to True: please pass an explicit value to suppress " - "this warning.", FutureWarning, - stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'header' will change its default value from False " + "to True: please pass an explicit value to suppress " + "this warning.", + FutureWarning, + stacklevel=2, + ) kwargs["header"] = False # Backwards compatibility. return self.to_frame().to_csv(**kwargs) - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return super().isnull() - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return super().notna() - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return super().notnull() @@ -4447,11 +4705,13 @@ def dropna(self, axis=0, inplace=False, **kwargs): 5 I stay dtype: object """ - inplace = validate_bool_kwarg(inplace, 'inplace') - kwargs.pop('how', None) + inplace = validate_bool_kwarg(inplace, "inplace") + kwargs.pop("how", None) if kwargs: - raise TypeError('dropna() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "dropna() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) # Validate the axis parameter self._get_axis_number(axis or 0) @@ -4480,14 +4740,18 @@ def valid(self, inplace=False, **kwargs): Series Series without null values. """ - warnings.warn("Method .valid will be removed in a future version. " - "Use .dropna instead.", FutureWarning, stacklevel=2) + warnings.warn( + "Method .valid will be removed in a future version. " + "Use .dropna instead.", + FutureWarning, + stacklevel=2, + ) return self.dropna(inplace=inplace, **kwargs) # ---------------------------------------------------------------------- # Time series-oriented methods - def to_timestamp(self, freq=None, how='start', copy=True): + def to_timestamp(self, freq=None, how="start", copy=True): """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. 
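(to_timestamp, whose docstring opens above, converts a PeriodIndex-backed Series to Timestamps anchored at the beginning of each period by default; an illustrative sketch with arbitrary dates.)

    import pandas as pd

    s = pd.Series([1, 2], index=pd.period_range("2019-01", periods=2, freq="M"))

    # how="start" (the default) anchors each monthly period at its first day.
    s.to_timestamp(how="start").index
    # -> DatetimeIndex with ['2019-01-01', '2019-02-01'] (month starts)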
@@ -4510,8 +4774,7 @@ def to_timestamp(self, freq=None, how='start', copy=True): new_values = new_values.copy() new_index = self.index.to_timestamp(freq=freq, how=how) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) def to_period(self, freq=None, copy=True): """ @@ -4535,8 +4798,7 @@ def to_period(self, freq=None, copy=True): new_values = new_values.copy() new_index = self.index.to_period(freq=freq) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) # ---------------------------------------------------------------------- # Accessor Methods @@ -4552,8 +4814,13 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}, - docs={'index': 'The index (axis labels) of the Series.'}) +Series._setup_axes( + ["index"], + info_axis=0, + stat_axis=0, + aliases={"rows": 0}, + docs={"index": "The index (axis labels) of the Series."}, +) Series._add_numeric_operations() Series._add_series_only_operations() Series._add_series_or_dataframe_operations() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b79390581612b2..523c4dc5e867b1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -6,8 +6,12 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_categorical_dtype, - is_extension_array_dtype, is_list_like) + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_extension_array_dtype, + is_list_like, +) from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms @@ -42,6 +46,7 @@ def get_group_index(labels, shape, sort, xnull): An array of type int64 where two elements are equal if their corresponding labels are equal at all location. """ + def _int64_cut_off(shape): acc = 1 for i, mul in enumerate(shape): @@ -69,8 +74,8 @@ def maybe_lift(lab, size): nlev = _int64_cut_off(shape) # compute flat ids for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - out = stride * labels[0].astype('i8', subok=False, copy=False) + stride = np.prod(shape[1:nlev], dtype="i8") + out = stride * labels[0].astype("i8", subok=False, copy=False) for i in range(1, nlev): if shape[i] == 0: @@ -132,7 +137,7 @@ def decons_group_index(comp_labels, shape): if is_int64_overflow_possible(shape): # at some point group indices are factorized, # and may not be deconstructed here! wrong path! - raise ValueError('cannot deconstruct factorized group indices!') + raise ValueError("cannot deconstruct factorized group indices!") label_list = [] factor = 1 @@ -158,17 +163,16 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): """ if not xnull: - lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8') - shape = np.asarray(shape, dtype='i8') + lift + lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") + shape = np.asarray(shape, dtype="i8") + lift if not is_int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! 
out = decons_group_index(obs_ids, shape) - return out if xnull or not lift.any() \ - else [x - y for x, y in zip(out, lift)] + return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] i = unique_label_indices(comp_ids) - i8copy = lambda a: a.astype('i8', subok=False, copy=True) + i8copy = lambda a: a.astype("i8", subok=False, copy=True) return [i8copy(lab[i]) for lab in labels] @@ -184,7 +188,7 @@ def indexer_from_factorized(labels, shape, compress=True): return get_group_index_sorter(ids, ngroups) -def lexsort_indexer(keys, orders=None, na_position='last'): +def lexsort_indexer(keys, orders=None, na_position="last"): from pandas.core.arrays import Categorical labels = [] @@ -204,22 +208,22 @@ def lexsort_indexer(keys, orders=None, na_position='last'): else: c = Categorical(key, ordered=True) - if na_position not in ['last', 'first']: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + if na_position not in ["last", "first"]: + raise ValueError("invalid na_position: {!r}".format(na_position)) n = len(c.categories) codes = c.codes.copy() - mask = (c.codes == -1) + mask = c.codes == -1 if order: # ascending - if na_position == 'last': + if na_position == "last": codes = np.where(mask, n, codes) - elif na_position == 'first': + elif na_position == "first": codes += 1 else: # not order means descending - if na_position == 'last': + if na_position == "last": codes = np.where(mask, n, n - codes - 1) - elif na_position == 'first': + elif na_position == "first": codes = np.where(mask, 0, n - codes) if mask.any(): n += 1 @@ -230,7 +234,7 @@ def lexsort_indexer(keys, orders=None, na_position='last'): return indexer_from_factorized(labels, shape) -def nargsort(items, kind='quicksort', ascending=True, na_position='last'): +def nargsort(items, kind="quicksort", ascending=True, na_position="last"): """ This is intended to be a drop-in replacement for np.argsort which handles NaNs. It adds ascending and na_position parameters. 
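(The nargsort docstring above describes a NaN-aware drop-in for np.argsort; below is a simplified standalone sketch of the na_position="last" case. The helper name and the restriction to 1-D float input are mine, not the pandas implementation.)

    import numpy as np

    def nan_last_argsort(values):
        # Argsort the non-NaN positions, then append the NaN positions,
        # mimicking nargsort(..., ascending=True, na_position="last").
        values = np.asarray(values, dtype="float64")
        nan_idx = np.nonzero(np.isnan(values))[0]
        non_nan_idx = np.nonzero(~np.isnan(values))[0]
        order = non_nan_idx[np.argsort(values[non_nan_idx])]
        return np.concatenate([order, nan_idx])

    nan_last_argsort([3.0, np.nan, 1.0])   # array([2, 0, 1])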
@@ -258,12 +262,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to # na_position - if na_position == 'last': + if na_position == "last": indexer = np.concatenate([indexer, nan_idx]) - elif na_position == 'first': + elif na_position == "first": indexer = np.concatenate([nan_idx, indexer]) else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + raise ValueError("invalid na_position: {!r}".format(na_position)) return indexer @@ -279,8 +283,7 @@ def __init__(self, comp_ids, ngroups, levels, labels): self.comp_ids = comp_ids.astype(np.int64) self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) - for _ in range(self.k)] + self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] self._populate_tables() @@ -289,8 +292,10 @@ def _populate_tables(self): table.map(self.comp_ids, labs.astype(np.int64)) def get_key(self, comp_id): - return tuple(level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels)) + return tuple( + level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels) + ) def get_flattened_iterator(comp_ids, ngroups, levels, labels): @@ -304,9 +309,11 @@ def get_indexer_dict(label_list, keys): shape = list(map(len, keys)) group_index = get_group_index(label_list, shape, sort=True, xnull=True) - ngroups = ((group_index.size and group_index.max()) + 1) \ - if is_int64_overflow_possible(shape) \ - else np.prod(shape, dtype='i8') + ngroups = ( + ((group_index.size and group_index.max()) + 1) + if is_int64_overflow_possible(shape) + else np.prod(shape, dtype="i8") + ) sorter = get_group_index_sorter(group_index, ngroups) @@ -319,6 +326,7 @@ def get_indexer_dict(label_list, keys): # ---------------------------------------------------------------------- # sorting levels...cleverly? + def get_group_index_sorter(group_index, ngroups): """ algos.groupsort_indexer implements `counting sort` and it is at least @@ -336,14 +344,12 @@ def get_group_index_sorter(group_index, ngroups): count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters - do_groupsort = (count > 0 and ((alpha + beta * ngroups) < - (count * np.log(count)))) + do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count))) if do_groupsort: - sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), - ngroups) + sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) return ensure_platform_int(sorter) else: - return group_index.argsort(kind='mergesort') + return group_index.argsort(kind="mergesort") def compress_group_index(group_index, sort=True): @@ -387,8 +393,7 @@ def _reorder_by_uniques(uniques, labels): return uniques, labels -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, - verify=True): +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. @@ -433,26 +438,27 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, * If ``labels`` is not None and ``values`` contain duplicates. 
""" if not is_list_like(values): - raise TypeError("Only list-like objects are allowed to be passed to" - "safe_sort as values") + raise TypeError( + "Only list-like objects are allowed to be passed to" "safe_sort as values" + ) - if (not isinstance(values, np.ndarray) - and not is_extension_array_dtype(values)): + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], - dtype=bool) + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if (not is_extension_array_dtype(values) - and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: @@ -469,18 +475,22 @@ def sort_mixed(values): return ordered if not is_list_like(labels): - raise TypeError("Only list-like objects or None are allowed to be" - "passed to safe_sort as labels") + raise TypeError( + "Only list-like objects or None are allowed to be" + "passed to safe_sort as labels" + ) labels = ensure_platform_int(np.asarray(labels)) from pandas import Index + if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo( - values, algorithms._hashtables) + values, algorithms._hashtables + ) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) @@ -498,7 +508,7 @@ def sort_mixed(values): reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode='wrap') + new_labels = reverse_indexer.take(labels, mode="wrap") mask = labels == na_sentinel if verify: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 6a0ba5f93c5092..f195e4b5f4e373 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -20,14 +20,13 @@ from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.internals import ( - BlockManager, create_block_manager_from_arrays) +from pandas.core.internals import BlockManager, create_block_manager_from_arrays from pandas.core.internals.construction import extract_index, prep_ndarray import pandas.core.ops as ops from pandas.core.series import Series from pandas.core.sparse.series import SparseSeries -_shared_doc_kwargs = dict(klass='SparseDataFrame') +_shared_doc_kwargs = dict(klass="SparseDataFrame") depr_msg = """\ SparseDataFrame is deprecated and will be removed in a future version. Use a regular DataFrame whose columns are SparseArrays instead. @@ -62,10 +61,19 @@ class SparseDataFrame(DataFrame): Default fill_value for converting Series to SparseSeries (default: nan). Will not override SparseSeries passed in. 
""" - _subtyp = 'sparse_frame' - def __init__(self, data=None, index=None, columns=None, default_kind=None, - default_fill_value=None, dtype=None, copy=False): + _subtyp = "sparse_frame" + + def __init__( + self, + data=None, + index=None, + columns=None, + default_kind=None, + default_fill_value=None, + dtype=None, + copy=False, + ): warnings.warn(depr_msg, FutureWarning, stacklevel=2) # pick up the defaults from the Sparse structures @@ -83,7 +91,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, index = data.index if default_fill_value is None: default_fill_value = data.fill_value - if columns is None and hasattr(data, 'name'): + if columns is None and hasattr(data, "name"): columns = [data.name] if columns is None: raise Exception("cannot pass a series w/o a name or columns") @@ -92,30 +100,33 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if default_fill_value is None: default_fill_value = np.nan if default_kind is None: - default_kind = 'block' + default_kind = "block" self._default_kind = default_kind self._default_fill_value = default_fill_value if is_scipy_sparse(data): - mgr = self._init_spmatrix(data, index, columns, dtype=dtype, - fill_value=default_fill_value) + mgr = self._init_spmatrix( + data, index, columns, dtype=dtype, fill_value=default_fill_value + ) elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): - mgr = self._init_mgr(data._data, - dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif isinstance(data, DataFrame): mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, Series): - mgr = self._init_dict(data.to_frame(), data.index, - columns=None, dtype=dtype) + mgr = self._init_dict( + data.to_frame(), data.index, columns=None, dtype=dtype + ) elif isinstance(data, BlockManager): - mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif data is None: data = DataFrame() @@ -128,15 +139,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, columns = Index([]) else: for c in columns: - data[c] = SparseArray(self._default_fill_value, - index=index, kind=self._default_kind, - fill_value=self._default_fill_value) + data[c] = SparseArray( + self._default_fill_value, + index=index, + kind=self._default_kind, + fill_value=self._default_fill_value, + ) mgr = to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) else: - msg = ('SparseDataFrame called with unknown type "{data_type}" ' - 'for data argument') + msg = ( + 'SparseDataFrame called with unknown type "{data_type}" ' + "for data argument" + ) raise TypeError(msg.format(data_type=type(data).__name__)) generic.NDFrame.__init__(self, mgr) @@ -160,9 +176,14 @@ def _init_dict(self, data, index, columns, dtype=None): index = extract_index(list(data.values())) def sp_maker(x): - return SparseArray(x, kind=self._default_kind, - fill_value=self._default_fill_value, - copy=True, dtype=dtype) + return SparseArray( + x, + kind=self._default_kind, + fill_value=self._default_fill_value, + copy=True, + dtype=dtype, + ) + sdict = {} for k, v in data.items(): if 
isinstance(v, Series): @@ -188,11 +209,14 @@ def sp_maker(x): if len(columns.difference(sdict)): # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) - nan_arr = np.empty(len(index), dtype='float64') + nan_arr = np.empty(len(index), dtype="float64") nan_arr.fill(np.nan) - nan_arr = SparseArray(nan_arr, kind=self._default_kind, - fill_value=self._default_fill_value, - copy=False) + nan_arr = SparseArray( + nan_arr, + kind=self._default_kind, + fill_value=self._default_fill_value, + copy=False, + ) sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) @@ -206,8 +230,7 @@ def _init_matrix(self, data, index, columns, dtype=None): data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) - def _init_spmatrix(self, data, index, columns, dtype=None, - fill_value=None): + def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): """ Init self from scipy.sparse matrix. """ @@ -225,16 +248,24 @@ def _init_spmatrix(self, data, index, columns, dtype=None, blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( - rowvals.values, index=index, + rowvals.values, + index=index, fill_value=fill_value, - sparse_index=BlockIndex(N, blocs, blens)) + sparse_index=BlockIndex(N, blocs, blens), + ) # Add any columns that were empty and thus not grouped on above - sdict.update({column: SparseSeries(index=index, - fill_value=fill_value, - sparse_index=BlockIndex(N, [], [])) - for column in columns - if column not in sdict}) + sdict.update( + { + column: SparseSeries( + index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, [], []), + ) + for column in columns + if column not in sdict + } + ) return self._init_dict(sdict, index, columns, dtype) @@ -249,9 +280,13 @@ def __repr__(self): def __getstate__(self): # pickling - return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, - _default_fill_value=self._default_fill_value, - _default_kind=self._default_kind) + return dict( + _typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + _default_fill_value=self._default_fill_value, + _default_kind=self._default_kind, + ) def _unpickle_sparse_frame_compat(self, state): """ @@ -261,20 +296,23 @@ def _unpickle_sparse_frame_compat(self, state): if not isinstance(cols, Index): # pragma: no cover from pandas.io.pickle import _unpickle_array + columns = _unpickle_array(cols) else: columns = cols if not isinstance(idx, Index): # pragma: no cover from pandas.io.pickle import _unpickle_array + index = _unpickle_array(idx) else: index = idx series_dict = DataFrame() for col, (sp_index, sp_values) in series.items(): - series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, - fill_value=fv) + series_dict[col] = SparseSeries( + sp_values, sparse_index=sp_index, fill_value=fv + ) self._data = to_manager(series_dict, columns, index) self._default_fill_value = fv @@ -289,12 +327,14 @@ def _apply_columns(self, func): Get new SparseDataFrame applying func to each columns """ - new_data = {col: func(series) - for col, series in self.items()} + new_data = {col: func(series) for col, series in self.items()} return self._constructor( - data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + data=new_data, + index=self.index, + columns=self.columns, + default_fill_value=self.default_fill_value, + ).__finalize__(self) def astype(self, dtype): return 
self._apply_columns(lambda x: x.astype(dtype)) @@ -322,23 +362,27 @@ def density(self): Ratio of non-sparse points to total (dense) data points represented in the frame """ - tot_nonsparse = sum(ser.sp_index.npoints - for _, ser in self.items()) + tot_nonsparse = sum(ser.sp_index.npoints for _, ser in self.items()) tot = len(self.index) * len(self.columns) return tot_nonsparse / float(tot) - def fillna(self, value=None, method=None, axis=0, inplace=False, - limit=None, downcast=None): - new_self = super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, - downcast=downcast) + def fillna( + self, value=None, method=None, axis=0, inplace=False, limit=None, downcast=None + ): + new_self = super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) if not inplace: self = new_self # set the fill value if we are filling as a scalar with nothing special # going on - if (value is not None and value == value and method is None and - limit is None): + if value is not None and value == value and method is None and limit is None: self._default_fill_value = value if not inplace: @@ -362,29 +406,35 @@ def _sanitize_column(self, key, value, **kwargs): sanitized_column : SparseArray """ + def sp_maker(x, index=None): - return SparseArray(x, index=index, - fill_value=self._default_fill_value, - kind=self._default_kind) + return SparseArray( + x, + index=index, + fill_value=self._default_fill_value, + kind=self._default_kind, + ) + if isinstance(value, SparseSeries): clean = value.reindex(self.index).as_sparse_array( - fill_value=self._default_fill_value, kind=self._default_kind) + fill_value=self._default_fill_value, kind=self._default_kind + ) elif isinstance(value, SparseArray): if len(value) != len(self.index): - raise ValueError('Length of values does not match ' - 'length of index') + raise ValueError("Length of values does not match " "length of index") clean = value - elif hasattr(value, '__iter__'): + elif hasattr(value, "__iter__"): if isinstance(value, Series): clean = value.reindex(self.index) if not isinstance(value, SparseSeries): clean = sp_maker(clean) else: if len(value) != len(self.index): - raise ValueError('Length of values does not match ' - 'length of index') + raise ValueError( + "Length of values does not match " "length of index" + ) clean = sp_maker(value) # Scalar @@ -412,10 +462,13 @@ def get_value(self, index, col, takeable=False): ------- value : scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(index, col, takeable=takeable) def _get_value(self, index, col, takeable=False): @@ -425,6 +478,7 @@ def _get_value(self, index, col, takeable=False): series = self._get_item_cache(col) return series._get_value(index, takeable=takeable) + _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -452,17 +506,21 @@ def set_value(self, index, col, value, takeable=False): ------- frame : DataFrame """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(index, col, value, takeable=takeable) def _set_value(self, index, col, value, takeable=False): - dense = self.to_dense()._set_value( - index, col, value, takeable=takeable) - return dense.to_sparse(kind=self._default_kind, - fill_value=self._default_fill_value) + dense = self.to_dense()._set_value(index, col, value, takeable=takeable) + return dense.to_sparse( + kind=self._default_kind, fill_value=self._default_fill_value + ) + _set_value.__doc__ = set_value.__doc__ def _slice(self, slobj, axis=0, kind=None): @@ -503,7 +561,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - this, other = self.align(other, join='outer', level=level, copy=False) + this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns if self.empty and other.empty: @@ -527,10 +585,12 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_fill_value = self._get_op_result_fill_value(other, func) - return self._constructor(data=new_data, index=new_index, - columns=new_columns, - default_fill_value=new_fill_value - ).__finalize__(self) + return self._constructor( + data=new_data, + index=new_index, + columns=new_columns, + default_fill_value=new_fill_value, + ).__finalize__(self) def _combine_match_index(self, other, func, level=None): new_data = {} @@ -538,8 +598,7 @@ def _combine_match_index(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - this, other = self.align(other, join='outer', axis=0, level=level, - copy=False) + this, other = self.align(other, join="outer", axis=0, level=level, copy=False) for col, series in this.items(): new_data[col] = func(series.values, other.values) @@ -547,8 +606,11 @@ def _combine_match_index(self, other, func, level=None): fill_value = self._get_op_result_fill_value(other, func) return self._constructor( - new_data, index=this.index, columns=self.columns, - default_fill_value=fill_value).__finalize__(self) + new_data, + index=this.index, + columns=self.columns, + default_fill_value=fill_value, + ).__finalize__(self) def _combine_match_columns(self, other, func, level=None): # patched version of DataFrame._combine_match_columns to account for @@ -559,8 +621,7 @@ def _combine_match_columns(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - left, right = self.align(other, join='outer', axis=1, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=1, level=level, copy=False) assert left.columns.equals(right.index) new_data = {} @@ -569,8 +630,11 @@ def _combine_match_columns(self, other, func, level=None): new_data[col] = func(left[col], float(right[col])) return self._constructor( - new_data, index=left.index, columns=left.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + new_data, + index=left.index, + columns=left.columns, + default_fill_value=self.default_fill_value, + ).__finalize__(self) def _combine_const(self, other, func): return self._apply_columns(lambda x: func(x, other)) @@ -581,7 +645,7 @@ def _get_op_result_fill_value(self, 
other, func): if isinstance(other, DataFrame): # i.e. called from _combine_frame - other_default = getattr(other, 'default_fill_value', np.nan) + other_default = getattr(other, "default_fill_value", np.nan) # if the fill values are the same use them? or use a valid one if own_default == other_default: @@ -601,18 +665,18 @@ def _get_op_result_fill_value(self, other, func): if isna(other.fill_value) or isna(own_default): fill_value = np.nan else: - fill_value = func(np.float64(own_default), - np.float64(other.fill_value)) + fill_value = func(np.float64(own_default), np.float64(other.fill_value)) else: raise NotImplementedError(type(other)) return fill_value - def _reindex_index(self, index, method, copy, level, fill_value=np.nan, - limit=None, takeable=False): + def _reindex_index( + self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False + ): if level is not None: - raise TypeError('Reindex by level not supported for sparse') + raise TypeError("Reindex by level not supported for sparse") if self.index.equals(index): if copy: @@ -621,8 +685,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return self if len(self.index) == 0: - return self._constructor( - index=index, columns=self.columns).__finalize__(self) + return self._constructor(index=index, columns=self.columns).__finalize__( + self + ) indexer = self.index.get_indexer(index, method, limit=limit) indexer = ensure_platform_int(indexer) @@ -647,13 +712,17 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new_series[col] = new return self._constructor( - new_series, index=index, columns=self.columns, - default_fill_value=self._default_fill_value).__finalize__(self) + new_series, + index=index, + columns=self.columns, + default_fill_value=self._default_fill_value, + ).__finalize__(self) - def _reindex_columns(self, columns, method, copy, level, fill_value=None, - limit=None, takeable=False): + def _reindex_columns( + self, columns, method, copy, level, fill_value=None, limit=None, takeable=False + ): if level is not None: - raise TypeError('Reindex by level not supported for sparse') + raise TypeError("Reindex by level not supported for sparse") if notna(fill_value): raise NotImplementedError("'fill_value' argument is not supported") @@ -667,21 +736,31 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, # TODO: fill value handling sdict = {k: v for k, v in self.items() if k in columns} return self._constructor( - sdict, index=self.index, columns=columns, - default_fill_value=self._default_fill_value).__finalize__(self) - - def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, - limit=None, copy=False, allow_dups=False): + sdict, + index=self.index, + columns=columns, + default_fill_value=self._default_fill_value, + ).__finalize__(self) + + def _reindex_with_indexers( + self, + reindexers, + method=None, + fill_value=None, + limit=None, + copy=False, + allow_dups=False, + ): if method is not None or limit is not None: - raise NotImplementedError("cannot reindex with a method or limit " - "with sparse") + raise NotImplementedError( + "cannot reindex with a method or limit " "with sparse" + ) if fill_value is None: fill_value = np.nan - reindexers = {self._get_axis_number(a): val - for (a, val) in reindexers.items()} + reindexers = {self._get_axis_number(a): val for (a, val) in reindexers.items()} index, row_indexer = reindexers.get(0, (None, None)) columns, col_indexer = reindexers.get(1, (None, None)) @@ -695,30 +774,32 
@@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, continue if row_indexer is not None: new_arrays[col] = algos.take_1d( - self[col]._internal_get_values(), - row_indexer, - fill_value=fill_value) + self[col]._internal_get_values(), row_indexer, fill_value=fill_value + ) else: new_arrays[col] = self[col] - return self._constructor(new_arrays, index=index, - columns=columns).__finalize__(self) + return self._constructor(new_arrays, index=index, columns=columns).__finalize__( + self + ) - def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def _join_compat( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ): if on is not None: - raise NotImplementedError("'on' keyword parameter is not yet " - "implemented") + raise NotImplementedError( + "'on' keyword parameter is not yet " "implemented" + ) return self._join_index(other, how, lsuffix, rsuffix) def _join_index(self, other, how, lsuffix, rsuffix): if isinstance(other, Series): if other.name is None: - raise ValueError('Other Series must have a name') + raise ValueError("Other Series must have a name") other = SparseDataFrame( - {other.name: other}, - default_fill_value=self._default_fill_value) + {other.name: other}, default_fill_value=self._default_fill_value + ) join_index = self.index.join(other.index, how=how) @@ -728,23 +809,26 @@ def _join_index(self, other, how, lsuffix, rsuffix): this, other = this._maybe_rename_join(other, lsuffix, rsuffix) from pandas import concat + return concat([this, other], axis=1, verify_integrity=True) def _maybe_rename_join(self, other, lsuffix, rsuffix): to_rename = self.columns.intersection(other.columns) if len(to_rename) > 0: if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{to_rename}'.format(to_rename=to_rename)) + raise ValueError( + "columns overlap but no suffix specified: " + "{to_rename}".format(to_rename=to_rename) + ) def lrenamer(x): if x in to_rename: - return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) + return "{x}{lsuffix}".format(x=x, lsuffix=lsuffix) return x def rrenamer(x): if x in to_rename: - return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) + return "{x}{rsuffix}".format(x=x, rsuffix=rsuffix) return x this = self.rename(columns=lrenamer) @@ -760,9 +844,12 @@ def transpose(self, *args, **kwargs): """ nv.validate_transpose(args, kwargs) return self._constructor( - self.values.T, index=self.columns, columns=self.index, + self.values.T, + index=self.columns, + columns=self.index, default_fill_value=self._default_fill_value, - default_kind=self._default_kind).__finalize__(self) + default_kind=self._default_kind, + ).__finalize__(self) T = property(transpose) @@ -793,18 +880,19 @@ def cumsum(self, axis=0, *args, **kwargs): return self.apply(lambda x: x.cumsum(), axis=axis) - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return self._apply_columns(lambda x: x.isna()) + isnull = isna - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return self._apply_columns(lambda x: x.notna()) + notnull = notna - def apply(self, func, axis=0, broadcast=None, reduce=None, - result_type=None): + def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -865,17 +953,23 @@ def apply(self, func, axis=0, 
broadcast=None, reduce=None, applied.fill_value = func(v.fill_value) new_series[k] = applied return self._constructor( - new_series, index=self.index, columns=self.columns, + new_series, + index=self.index, + columns=self.columns, default_fill_value=self._default_fill_value, - default_kind=self._default_kind).__finalize__(self) + default_kind=self._default_kind, + ).__finalize__(self) from pandas.core.apply import frame_apply - op = frame_apply(self, - func=func, - axis=axis, - reduce=reduce, - broadcast=broadcast, - result_type=result_type) + + op = frame_apply( + self, + func=func, + axis=axis, + reduce=reduce, + broadcast=broadcast, + result_type=result_type, + ) return op.get_result() def applymap(self, func): @@ -904,8 +998,7 @@ def to_manager(sdf, columns, index): # from BlockManager perspective axes = [ensure_index(columns), ensure_index(index)] - return create_block_manager_from_arrays( - [sdf[c] for c in columns], columns, axes) + return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes) def stack_sparse_frame(frame): @@ -925,7 +1018,7 @@ def stack_sparse_frame(frame): # SparseDataFrame with a non-np.NaN fill value (fails earlier). for _, series in frame.items(): if not np.isnan(series.fill_value): - raise TypeError('This routine assumes NaN fill value') + raise TypeError("This routine assumes NaN fill value") int_index = series.sp_index.to_int_index() inds_to_concat.append(int_index.indices) @@ -933,12 +1026,13 @@ def stack_sparse_frame(frame): major_codes = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) - index = MultiIndex(levels=[frame.index, frame.columns], - codes=[major_codes, minor_codes], - verify_integrity=False) + index = MultiIndex( + levels=[frame.index, frame.columns], + codes=[major_codes, minor_codes], + verify_integrity=False, + ) - lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, - columns=['foo']) + lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=["foo"]) return lp.sort_index(level=0) @@ -966,7 +1060,7 @@ def homogenize(series_dict): for _, series in series_dict.items(): if not np.isnan(series.fill_value): - raise TypeError('this method is only valid with NaN fill values') + raise TypeError("this method is only valid with NaN fill values") if index is None: index = series.sp_index diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 7ff0f465756613..73638f5965119b 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -13,13 +13,12 @@ def _check_is_partition(parts, whole): whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): - raise ValueError( - 'Is not a partition because intersection is not null.') + raise ValueError("Is not a partition because intersection is not null.") if set.union(*parts) != whole: - raise ValueError('Is not a partition because union is not the whole.') + raise ValueError("Is not a partition because union is not the whole.") -def _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False): +def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ For arbitrary (MultiIndexed) SparseSeries return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor. """ @@ -36,8 +35,7 @@ def get_indexers(levels): # TODO: how to do this better? 
cleanly slice nonnull_labels given the # coord - values_ilabels = [tuple(x[i] for i in levels) - for x in nonnull_labels.index] + values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] if len(levels) == 1: values_ilabels = [x[0] for x in values_ilabels] @@ -55,12 +53,11 @@ def _get_label_to_i_dict(labels, sort_labels=False): if sort_labels: labels = sorted(list(labels)) d = OrderedDict((k, i) for i, k in enumerate(labels)) - return (d) + return d def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) - labels_to_i = _get_label_to_i_dict(ilabels, - sort_labels=sort_labels) + labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) if len(subset) > 1: labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) @@ -69,11 +66,12 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): labels_to_i.index = Index(x[0] for x in labels_to_i.index) labels_to_i.index.name = index.names[subset[0]] - labels_to_i.name = 'value' - return (labels_to_i) + labels_to_i.name = "value" + return labels_to_i - labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels, - sort_labels=sort_labels) + labels_to_i = _get_index_subset_to_coord_dict( + ss.index, levels, sort_labels=sort_labels + ) # ##################################################################### # ##################################################################### @@ -88,8 +86,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): return values, i_coord, j_coord, i_labels, j_labels -def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), - sort_labels=False): +def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ Convert a SparseSeries to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -99,25 +96,26 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), import scipy.sparse if ss.index.nlevels < 2: - raise ValueError('to_coo requires MultiIndex with nlevels > 2') + raise ValueError("to_coo requires MultiIndex with nlevels > 2") if not ss.index.is_unique: - raise ValueError('Duplicate index entries are not allowed in to_coo ' - 'transformation.') + raise ValueError( + "Duplicate index entries are not allowed in to_coo " "transformation." + ) # to keep things simple, only rely on integer indexing (not labels) row_levels = [ss.index._get_level_number(x) for x in row_levels] column_levels = [ss.index._get_level_number(x) for x in column_levels] - v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels, - column_levels=column_levels, - sort_labels=sort_labels) + v, i, j, rows, columns = _to_ijv( + ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels + ) sparse_matrix = scipy.sparse.coo_matrix( - (v, (i, j)), shape=(len(rows), len(columns))) + (v, (i, j)), shape=(len(rows), len(columns)) + ) return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index: bool = False, - sparse_series: bool = True): +def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = True): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. @@ -141,8 +139,7 @@ def _coo_to_sparse_series(A, dense_index: bool = False, try: s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) except AttributeError: - raise TypeError('Expected coo_matrix. Got {} instead.' 
- .format(type(A).__name__)) + raise TypeError("Expected coo_matrix. Got {} instead.".format(type(A).__name__)) s = s.sort_index() if sparse_series: # TODO(SparseSeries): remove this and the sparse_series keyword. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 88b6634db92b64..43f2609f46bd62 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -24,12 +24,15 @@ from pandas.core.internals import SingleBlockManager import pandas.core.ops as ops from pandas.core.series import Series -from pandas.core.sparse.scipy_sparse import ( - _coo_to_sparse_series, _sparse_series_to_coo) +from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series, _sparse_series_to_coo -_shared_doc_kwargs = dict(axes='index', klass='SparseSeries', - axes_single_arg="{0, 'index'}", - optional_labels='', optional_axis='') +_shared_doc_kwargs = dict( + axes="index", + klass="SparseSeries", + axes_single_arg="{0, 'index'}", + optional_labels="", + optional_axis="", +) depr_msg = """\ @@ -70,11 +73,21 @@ class SparseSeries(Series): must change values, convert to dense, make your changes, then convert back to sparse """ - _subtyp = 'sparse_series' - def __init__(self, data=None, index=None, sparse_index=None, kind='block', - fill_value=None, name=None, dtype=None, copy=False, - fastpath=False): + _subtyp = "sparse_series" + + def __init__( + self, + data=None, + index=None, + sparse_index=None, + kind="block", + fill_value=None, + name=None, + dtype=None, + copy=False, + fastpath=False, + ): warnings.warn(depr_msg, FutureWarning, stacklevel=2) # TODO: Most of this should be refactored and shared with Series # 1. BlockManager -> array @@ -102,55 +115,67 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', data = np.full(len(index), fill_value=data) super().__init__( - SparseArray(data, - sparse_index=sparse_index, - kind=kind, - dtype=dtype, - fill_value=fill_value, - copy=copy), - index=index, name=name, - copy=False, fastpath=fastpath + SparseArray( + data, + sparse_index=sparse_index, + kind=kind, + dtype=dtype, + fill_value=fill_value, + copy=copy, + ), + index=index, + name=name, + copy=False, + fastpath=fastpath, ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # avoid infinite recursion for other SparseSeries inputs - inputs = tuple( - x.values if isinstance(x, type(self)) else x - for x in inputs - ) + inputs = tuple(x.values if isinstance(x, type(self)) else x for x in inputs) result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs) - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) # unary ops # TODO: See if this can be shared def __pos__(self): result = self.values.__pos__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) def __neg__(self): result = self.values.__neg__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + 
sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) def __invert__(self): result = self.values.__invert__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) @property def block(self): - warnings.warn("SparseSeries.block is deprecated.", FutureWarning, - stacklevel=2) + warnings.warn("SparseSeries.block is deprecated.", FutureWarning, stacklevel=2) return self._data._block @property @@ -174,18 +199,29 @@ def npoints(self): return self.values.npoints @classmethod - def from_array(cls, arr, index=None, name=None, copy=False, - fill_value=None, fastpath=False): + def from_array( + cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False + ): """Construct SparseSeries from array. .. deprecated:: 0.23.0 Use the pd.SparseSeries(..) constructor instead. """ - warnings.warn("'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.SparseSeries(..) " - "constructor instead.", FutureWarning, stacklevel=2) - return cls(arr, index=index, name=name, copy=copy, - fill_value=fill_value, fastpath=fastpath) + warnings.warn( + "'from_array' is deprecated and will be removed in a " + "future version. Please use the pd.SparseSeries(..) " + "constructor instead.", + FutureWarning, + stacklevel=2, + ) + return cls( + arr, + index=index, + name=name, + copy=copy, + fill_value=fill_value, + fastpath=fastpath, + ) @property def _constructor(self): @@ -194,14 +230,15 @@ def _constructor(self): @property def _constructor_expanddim(self): from pandas.core.sparse.api import SparseDataFrame + return SparseDataFrame @property def kind(self): if isinstance(self.sp_index, BlockIndex): - return 'block' + return "block" elif isinstance(self.sp_index, IntIndex): - return 'integer' + return "integer" def as_sparse_array(self, kind=None, fill_value=None, copy=False): """ return my self as a sparse array, do not copy by default """ @@ -210,26 +247,36 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): fill_value = self.fill_value if kind is None: kind = self.kind - return SparseArray(self.values, sparse_index=self.sp_index, - fill_value=fill_value, kind=kind, copy=copy) + return SparseArray( + self.values, + sparse_index=self.sp_index, + fill_value=fill_value, + kind=kind, + copy=copy, + ) def __repr__(self): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Sparse") series_rep = Series.__repr__(self) - rep = '{series}\n{index!r}'.format(series=series_rep, - index=self.sp_index) + rep = "{series}\n{index!r}".format(series=series_rep, index=self.sp_index) return rep - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ perform a reduction operation """ return op(self.array.to_dense(), skipna=skipna, **kwds) def __getstate__(self): # pickling - return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, - fill_value=self.fill_value, name=self.name) + return dict( + _typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + fill_value=self.fill_value, + name=self.name, + ) def _unpickle_series_compat(self, state): @@ -246,8 +293,9 @@ def _unpickle_series_compat(self, state): # create a 
sparse array if not isinstance(data, SparseArray): - data = SparseArray(data, sparse_index=sp_index, - fill_value=fill_value, copy=False) + data = SparseArray( + data, sparse_index=sp_index, fill_value=fill_value, copy=False + ) # recreate data = SingleBlockManager(data, index, fastpath=True) @@ -258,9 +306,9 @@ def _unpickle_series_compat(self, state): def _set_subtyp(self, is_all_dates): if is_all_dates: - object.__setattr__(self, '_subtyp', 'sparse_time_series') + object.__setattr__(self, "_subtyp", "sparse_time_series") else: - object.__setattr__(self, '_subtyp', 'sparse_series') + object.__setattr__(self, "_subtyp", "sparse_series") def _ixs(self, i, axis=0): """ @@ -294,8 +342,9 @@ def __getitem__(self, key): def _get_values(self, indexer): try: - return self._constructor(self._data.get_slice(indexer), - fastpath=True).__finalize__(self) + return self._constructor( + self._data.get_slice(indexer), fastpath=True + ).__finalize__(self) except Exception: return self[indexer] @@ -311,8 +360,9 @@ def abs(self): ------- abs: same type as caller """ - return self._constructor(np.abs(self.values), - index=self.index).__finalize__(self) + return self._constructor(np.abs(self.values), index=self.index).__finalize__( + self + ) def get(self, label, default=None): """ @@ -353,16 +403,20 @@ def get_value(self, label, takeable=False): ------- value : scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(label, takeable=takeable) def _get_value(self, label, takeable=False): loc = label if takeable is True else self.index.get_loc(label) return self._get_val_at(loc) + _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): @@ -392,10 +446,13 @@ def set_value(self, label, value, takeable=False): ------- series : SparseSeries """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. 
Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): @@ -407,10 +464,10 @@ def _set_value(self, label, value, takeable=False): if new_values is not None: values = new_values new_index = values.index - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) + values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) self._data = SingleBlockManager(values, new_index) self._index = new_index + _set_value.__doc__ = set_value.__doc__ def _set_values(self, key, value): @@ -424,8 +481,7 @@ def _set_values(self, key, value): values = self.values.to_dense() values[key] = libindex.convert_scalar(values, value) - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) + values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) self._data = SingleBlockManager(values, self.index) def to_dense(self): @@ -436,8 +492,7 @@ def to_dense(self): ------- s : Series """ - return Series(self.values.to_dense(), index=self.index, - name=self.name) + return Series(self.values.to_dense(), index=self.index, name=self.name) @property def density(self): @@ -453,18 +508,21 @@ def copy(self, deep=True): new_data = self.values if deep: new_data = new_data.copy() - return self._constructor(new_data, sparse_index=self.sp_index, - fill_value=self.fill_value, - index=self.index.copy(), - name=self.name).__finalize__(self) + return self._constructor( + new_data, + sparse_index=self.sp_index, + fill_value=self.fill_value, + index=self.index.copy(), + name=self.name, + ).__finalize__(self) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) - def reindex(self, index=None, method=None, copy=True, limit=None, - **kwargs): + def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs): # TODO: remove? 
- return super().reindex(index=index, method=method, copy=copy, - limit=limit, **kwargs) + return super().reindex( + index=index, method=method, copy=copy, limit=limit, **kwargs + ) def sparse_reindex(self, new_index): """ @@ -482,10 +540,11 @@ def sparse_reindex(self, new_index): raise TypeError("new index must be a SparseIndex") values = self.values values = values.sp_index.to_int_index().reindex( - values.sp_values.astype('float64'), values.fill_value, new_index) - values = SparseArray(values, - sparse_index=new_index, - fill_value=self.values.fill_value) + values.sp_values.astype("float64"), values.fill_value, new_index + ) + values = SparseArray( + values, sparse_index=new_index, fill_value=self.values.fill_value + ) return self._constructor(values, index=self.index).__finalize__(self) def cumsum(self, axis=0, *args, **kwargs): @@ -512,25 +571,30 @@ def cumsum(self, axis=0, *args, **kwargs): new_array = self.values.cumsum() return self._constructor( - new_array, index=self.index, - sparse_index=new_array.sp_index).__finalize__(self) + new_array, index=self.index, sparse_index=new_array.sp_index + ).__finalize__(self) # TODO: SparseSeries.isna is Sparse, while Series.isna is dense - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): - arr = SparseArray(isna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=isna(self.fill_value)) + arr = SparseArray( + isna(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=isna(self.fill_value), + ) return self._constructor(arr, index=self.index).__finalize__(self) isnull = isna - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): - arr = SparseArray(notna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=notna(self.fill_value)) + arr = SparseArray( + notna(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=notna(self.fill_value), + ) return self._constructor(arr, index=self.index).__finalize__(self) + notnull = notna def dropna(self, axis=0, inplace=False, **kwargs): @@ -542,8 +606,9 @@ def dropna(self, axis=0, inplace=False, **kwargs): self._get_axis_number(axis or 0) dense_valid = self.to_dense().dropna() if inplace: - raise NotImplementedError("Cannot perform inplace dropna" - " operations on a SparseSeries") + raise NotImplementedError( + "Cannot perform inplace dropna" " operations on a SparseSeries" + ) if isna(self.fill_value): return dense_valid else: @@ -570,10 +635,10 @@ def combine_first(self, other): return dense_combined.to_sparse(fill_value=self.fill_value) @Appender(SparseAccessor.to_coo.__doc__) - def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): - A, rows, columns = _sparse_series_to_coo(self, row_levels, - column_levels, - sort_labels=sort_labels) + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + A, rows, columns = _sparse_series_to_coo( + self, row_levels, column_levels, sort_labels=sort_labels + ) return A, rows, columns @classmethod diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 710b29c6a6536c..70700653c47957 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -12,8 +12,15 @@ from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, - is_list_like, is_re, 
is_scalar, is_string_like) + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_integer, + is_list_like, + is_re, + is_scalar, + is_string_like, +) from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import isna @@ -22,11 +29,15 @@ import pandas.core.common as com _cpython_optimized_encoders = ( - "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ( - "utf-16", "utf-32" + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", ) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") _shared_docs = dict() # type: Dict[str, str] @@ -80,11 +91,12 @@ def cat_safe(list_of_columns: List, sep: str): # object dtype), np.sum will fail; catch and return with better message for column in list_of_columns: dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ['string', 'empty']: + if dtype not in ["string", "empty"]: raise TypeError( - 'Concatenation requires list-likes containing only ' - 'strings (or missing values). Offending values found in ' - 'column {}'.format(dtype)) from None + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + "column {}".format(dtype) + ) from None return result @@ -109,8 +121,10 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. # The user may want to be warned by this, instead of getting NaN - p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' - r'(?(3)required )positional arguments?') + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) if len(e.args) >= 1 and re.search(p_err, e.args[0]): raise e @@ -330,9 +344,12 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): regex = re.compile(pat, flags=flags) if regex.groups > 0: - warnings.warn("This pattern has match groups. To actually get the" - " groups, use str.extract.", UserWarning, - stacklevel=3) + warnings.warn( + "This pattern has match groups. 
To actually get the" + " groups, use str.extract.", + UserWarning, + stacklevel=3, + ) f = lambda x: bool(regex.search(x)) else: @@ -585,8 +602,9 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): if regex: if is_compiled_re: if (case is not None) or (flags != 0): - raise ValueError("case and flags cannot be set" - " when pat is a compiled regex") + raise ValueError( + "case and flags cannot be set" " when pat is a compiled regex" + ) else: # not a compiled regex # set default case @@ -604,11 +622,11 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): f = lambda x: x.replace(pat, repl, n) else: if is_compiled_re: - raise ValueError("Cannot use a compiled regex as replacement " - "pattern with regex=False") + raise ValueError( + "Cannot use a compiled regex as replacement " "pattern with regex=False" + ) if callable(repl): - raise ValueError("Cannot use a callable replacement when " - "regex=False") + raise ValueError("Cannot use a callable replacement when " "regex=False") f = lambda x: x.replace(pat, repl, n) return _na_map(f, arr) @@ -655,6 +673,7 @@ def str_repeat(arr, repeats): dtype: object """ if is_scalar(repeats): + def scalar_rep(x): try: return bytes.__mul__(x, repeats) @@ -732,6 +751,7 @@ def f(x): return [np.nan if item is None else item for item in m.groups()] else: return empty_row + return f @@ -764,7 +784,8 @@ def _str_extract_noexpand(arr, pat, flags=0): [groups_or_na(val) for val in arr], columns=columns, index=arr.index, - dtype=object) + dtype=object, + ) return result, name @@ -792,7 +813,8 @@ def _str_extract_frame(arr, pat, flags=0): [groups_or_na(val) for val in arr], columns=columns, index=result_index, - dtype=object) + dtype=object, + ) def str_extract(arr, pat, flags=0, expand=True): @@ -980,27 +1002,25 @@ def str_extractall(arr, pat, flags=0): if isinstance(subject, str): if not is_mi: - subject_key = (subject_key, ) + subject_key = (subject_key,) for match_i, match_tuple in enumerate(regex.findall(subject)): if isinstance(match_tuple, str): match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group - for group in match_tuple] + na_tuple = [np.NaN if group == "" else group for group in match_tuple] match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i, )) + result_key = tuple(subject_key + (match_i,)) index_list.append(result_key) from pandas import MultiIndex - index = MultiIndex.from_tuples( - index_list, names=arr.index.names + ["match"]) - result = arr._constructor_expanddim(match_list, index=index, - columns=columns) + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + + result = arr._constructor_expanddim(match_list, index=index, columns=columns) return result -def str_get_dummies(arr, sep='|'): +def str_get_dummies(arr, sep="|"): """ Split each string in the Series by sep and return a DataFrame of dummy/indicator variables. @@ -1034,7 +1054,7 @@ def str_get_dummies(arr, sep='|'): 1 0 0 0 2 1 0 1 """ - arr = arr.fillna('') + arr = arr.fillna("") try: arr = sep + arr + sep except TypeError: @@ -1212,7 +1232,7 @@ def str_findall(arr, pat, flags=0): return _na_map(regex.findall, arr) -def str_find(arr, sub, start=0, end=None, side='left'): +def str_find(arr, sub, start=0, end=None, side="left"): """ Return indexes in each strings in the Series/Index where the substring is fully contained between [start:end]. Return -1 on failure. 
@@ -1235,15 +1255,15 @@ def str_find(arr, sub, start=0, end=None, side='left'): """ if not isinstance(sub, str): - msg = 'expected a string object, not {0}' + msg = "expected a string object, not {0}" raise TypeError(msg.format(type(sub).__name__)) - if side == 'left': - method = 'find' - elif side == 'right': - method = 'rfind' + if side == "left": + method = "find" + elif side == "right": + method = "rfind" else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") if end is None: f = lambda x: getattr(x, method)(sub, start) @@ -1253,17 +1273,17 @@ def str_find(arr, sub, start=0, end=None, side='left'): return _na_map(f, arr, dtype=int) -def str_index(arr, sub, start=0, end=None, side='left'): +def str_index(arr, sub, start=0, end=None, side="left"): if not isinstance(sub, str): - msg = 'expected a string object, not {0}' + msg = "expected a string object, not {0}" raise TypeError(msg.format(type(sub).__name__)) - if side == 'left': - method = 'index' - elif side == 'right': - method = 'rindex' + if side == "left": + method = "index" + elif side == "right": + method = "rindex" else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") if end is None: f = lambda x: getattr(x, method)(sub, start) @@ -1273,7 +1293,7 @@ def str_index(arr, sub, start=0, end=None, side='left'): return _na_map(f, arr, dtype=int) -def str_pad(arr, width, side='left', fillchar=' '): +def str_pad(arr, width, side="left", fillchar=" "): """ Pad strings in the Series/Index up to width. @@ -1327,24 +1347,24 @@ def str_pad(arr, width, side='left', fillchar=' '): dtype: object """ if not isinstance(fillchar, str): - msg = 'fillchar must be a character, not {0}' + msg = "fillchar must be a character, not {0}" raise TypeError(msg.format(type(fillchar).__name__)) if len(fillchar) != 1: - raise TypeError('fillchar must be a character, not str') + raise TypeError("fillchar must be a character, not str") if not is_integer(width): - msg = 'width must be of integer type, not {0}' + msg = "width must be of integer type, not {0}" raise TypeError(msg.format(type(width).__name__)) - if side == 'left': + if side == "left": f = lambda x: x.rjust(width, fillchar) - elif side == 'right': + elif side == "right": f = lambda x: x.ljust(width, fillchar) - elif side == 'both': + elif side == "both": f = lambda x: x.center(width, fillchar) else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") return _na_map(f, arr) @@ -1522,14 +1542,14 @@ def str_slice_replace(arr, start=None, stop=None, repl=None): dtype: object """ if repl is None: - repl = '' + repl = "" def f(x): - if x[start:stop] == '': + if x[start:stop] == "": local_stop = start else: local_stop = stop - y = '' + y = "" if start is not None: y += x[:start] y += repl @@ -1540,7 +1560,7 @@ def f(x): return _na_map(f, arr) -def str_strip(arr, to_strip=None, side='both'): +def str_strip(arr, to_strip=None, side="both"): """ Strip whitespace (including newlines) from each string in the Series/Index. 
@@ -1554,14 +1574,14 @@ def str_strip(arr, to_strip=None, side='both'): ------- Series or Index """ - if side == 'both': + if side == "both": f = lambda x: x.strip(to_strip) - elif side == 'left': + elif side == "left": f = lambda x: x.lstrip(to_strip) - elif side == 'right': + elif side == "right": f = lambda x: x.rstrip(to_strip) else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") return _na_map(f, arr) @@ -1622,11 +1642,11 @@ def str_wrap(arr, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - kwargs['width'] = width + kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) def str_translate(arr, table): @@ -1700,12 +1720,14 @@ def str_get(arr, i): 5 None dtype: object """ + def f(x): if isinstance(x, dict): return x.get(i) elif len(x) > i >= -len(x): return x[i] return np.nan + return _na_map(f, arr) @@ -1801,8 +1823,9 @@ def forbid_nonstring_types(forbidden, name=None): # deal with None forbidden = [] if forbidden is None else forbidden - allowed_types = {'string', 'empty', 'bytes', - 'mixed', 'mixed-integer'} - set(forbidden) + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) def _forbid_nonstring_types(func): func_name = func.__name__ if name is None else name @@ -1810,18 +1833,22 @@ def _forbid_nonstring_types(func): @wraps(func) def wrapper(self, *args, **kwargs): if self._inferred_dtype not in allowed_types: - msg = ('Cannot use .str.{name} with values of inferred dtype ' - '{inf_type!r}.'.format(name=func_name, - inf_type=self._inferred_dtype)) + msg = ( + "Cannot use .str.{name} with values of inferred dtype " + "{inf_type!r}.".format( + name=func_name, inf_type=self._inferred_dtype + ) + ) raise TypeError(msg) return func(self, *args, **kwargs) + wrapper.__name__ = func_name return wrapper + return _forbid_nonstring_types -def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'], - **kargs): +def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=["bytes"], **kargs): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) @@ -1831,13 +1858,14 @@ def wrapper(self): if docstring is not None: wrapper.__doc__ = docstring else: - raise ValueError('Provide docstring') + raise ValueError("Provide docstring") return wrapper -def _pat_wrapper(f, flags=False, na=False, name=None, - forbidden_types=['bytes'], **kwargs): +def _pat_wrapper( + f, flags=False, na=False, name=None, forbidden_types=["bytes"], **kwargs +): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) @@ -1919,21 +1947,21 @@ def _validate(data): dtype : inferred dtype of data """ if isinstance(data, ABCMultiIndex): - raise AttributeError('Can only use .str accessor with Index, ' - 'not MultiIndex') + raise AttributeError( + "Can only use .str accessor with Index, " "not MultiIndex" + ) # see _libs/lib.pyx for list of inferred types - allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - values = getattr(data, 'values', data) # Series / Index - values = getattr(values, 'categories', values) # categorical / normal + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal # missing values obfuscate type 
inference -> skip inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string " - "values!") + raise AttributeError("Can only use .str accessor with string " "values!") return inferred_dtype def __getitem__(self, key): @@ -1950,8 +1978,9 @@ def __iter__(self): i += 1 g = self.get(i) - def _wrap_result(self, result, use_codes=True, - name=None, expand=None, fill_value=np.nan): + def _wrap_result( + self, result, use_codes=True, name=None, expand=None, fill_value=np.nan + ): from pandas import Index, Series, MultiIndex @@ -1962,10 +1991,11 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d(result, Series(self._orig, copy=False).cat.codes, - fill_value=fill_value) + result = take_1d( + result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value + ) - if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): return result assert result.ndim < 3 @@ -1987,8 +2017,9 @@ def cons_row(x): if result: # propagate nan values to match longest sequence (GH 18450) max_len = max(len(x) for x in result) - result = [x * max_len if len(x) == 0 or x[0] is np.nan - else x for x in result] + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -1997,7 +2028,7 @@ def cons_row(x): # if expand is False, result should have the same name # as the original otherwise specified if name is None: - name = getattr(result, 'name', None) + name = getattr(result, "name", None) if name is None: # do not use logical or, _orig may be a DataFrame # which has "name" column @@ -2058,9 +2089,11 @@ def _get_series_list(self, others, ignore_index=False): # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index - err_msg = ('others must be Series, Index, DataFrame, np.ndarray or ' - 'list-like (either containing only strings or containing ' - 'only objects of type Series/Index/list-like/np.ndarray)') + err_msg = ( + "others must be Series, Index, DataFrame, np.ndarray or " + "list-like (either containing only strings or containing " + "only objects of type Series/Index/list-like/np.ndarray)" + ) # Generally speaking, all objects without an index inherit the index # `idx` of the calling Series/Index - i.e. must have matching length. @@ -2069,13 +2102,13 @@ def _get_series_list(self, others, ignore_index=False): if isinstance(others, Series): warn = not others.index.equals(idx) # only reconstruct Series when absolutely necessary - los = [Series(others.values, index=idx) - if ignore_index and warn else others] + los = [ + Series(others.values, index=idx) if ignore_index and warn else others + ] return (los, warn) elif isinstance(others, Index): warn = not others.equals(idx) - los = [Series(others.values, - index=(idx if ignore_index else others))] + los = [Series(others.values, index=(idx if ignore_index else others))] return (los, warn) elif isinstance(others, DataFrame): warn = not others.index.equals(idx) @@ -2105,45 +2138,51 @@ def _get_series_list(self, others, ignore_index=False): # GH 21950 - DeprecationWarning # only allowing Series/Index/np.ndarray[1-dim] will greatly # simply this function post-deprecation. 
- if not (isinstance(nxt, (Series, Index)) or - (isinstance(nxt, np.ndarray) and nxt.ndim == 1)): + if not ( + isinstance(nxt, (Series, Index)) + or (isinstance(nxt, np.ndarray) and nxt.ndim == 1) + ): depr_warn = True - if not isinstance(nxt, (DataFrame, Series, - Index, np.ndarray)): + if not isinstance(nxt, (DataFrame, Series, Index, np.ndarray)): # safety for non-persistent list-likes (e.g. iterators) # do not map indexed/typed objects; info needed below nxt = list(nxt) # known types for which we can avoid deep inspection - no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1) - or isinstance(nxt, (Series, Index))) + no_deep = ( + isinstance(nxt, np.ndarray) and nxt.ndim == 1 + ) or isinstance(nxt, (Series, Index)) # nested list-likes are forbidden: # -> elements of nxt must not be list-like - is_legal = ((no_deep and nxt.dtype == object) - or all(not is_list_like(x) for x in nxt)) + is_legal = (no_deep and nxt.dtype == object) or all( + not is_list_like(x) for x in nxt + ) # DataFrame is false positive of is_legal # because "x in df" returns column names if not is_legal or isinstance(nxt, DataFrame): raise TypeError(err_msg) - nxt, wnx = self._get_series_list(nxt, - ignore_index=ignore_index) + nxt, wnx = self._get_series_list(nxt, ignore_index=ignore_index) los = los + nxt join_warn = join_warn or wnx if depr_warn: - warnings.warn('list-likes other than Series, Index, or ' - 'np.ndarray WITHIN another list-like are ' - 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=4) + warnings.warn( + "list-likes other than Series, Index, or " + "np.ndarray WITHIN another list-like are " + "deprecated and will be removed in a future " + "version.", + FutureWarning, + stacklevel=4, + ) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) - @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2284,7 +2323,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): if isinstance(others, str): raise ValueError("Did you mean to supply a `sep` keyword?") if sep is None: - sep = '' + sep = "" if isinstance(self._orig, Index): data = Series(self._orig, index=self._orig) @@ -2303,38 +2342,50 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): try: # turn anything in "others" into lists of Series - others, warn = self._get_series_list(others, - ignore_index=(join is None)) + others, warn = self._get_series_list(others, ignore_index=(join is None)) except ValueError: # do not catch TypeError raised by _get_series_list if join is None: - raise ValueError('All arrays must be same length, except ' - 'those having an index if `join` is not None') + raise ValueError( + "All arrays must be same length, except " + "those having an index if `join` is not None" + ) else: - raise ValueError('If `others` contains arrays or lists (or ' - 'other list-likes without an index), these ' - 'must all be of the same length as the ' - 'calling Series/Index.') + raise ValueError( + "If `others` contains arrays or lists (or " + "other list-likes without an index), these " + "must all be of the same length as the " + "calling Series/Index." 
+ ) if join is None and warn: - warnings.warn("A future version of pandas will perform index " - "alignment when `others` is a Series/Index/" - "DataFrame (or a list-like containing one). To " - "disable alignment (the behavior before v.0.23) and " - "silence this warning, use `.values` on any Series/" - "Index/DataFrame in `others`. To enable alignment " - "and silence this warning, pass `join='left'|" - "'outer'|'inner'|'right'`. The future default will " - "be `join='left'`.", FutureWarning, stacklevel=3) + warnings.warn( + "A future version of pandas will perform index " + "alignment when `others` is a Series/Index/" + "DataFrame (or a list-like containing one). To " + "disable alignment (the behavior before v.0.23) and " + "silence this warning, use `.values` on any Series/" + "Index/DataFrame in `others`. To enable alignment " + "and silence this warning, pass `join='left'|" + "'outer'|'inner'|'right'`. The future default will " + "be `join='left'`.", + FutureWarning, + stacklevel=3, + ) # if join is None, _get_series_list already force-aligned indexes - join = 'left' if join is None else join + join = "left" if join is None else join # align if required if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns - others = concat(others, axis=1, - join=(join if join == 'inner' else 'outer'), - keys=range(len(others)), sort=False, copy=False) + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series @@ -2349,12 +2400,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): np.putmask(result, union_mask, np.nan) not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], - sep) + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(nm, na_rep, col) - for nm, col in zip(na_masks, all_cols)] + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] result = cat_safe(all_cols, sep) else: # no NaNs - can just concatenate @@ -2364,11 +2415,14 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # add dtype for case that result is all-NA result = Index(result, dtype=object, name=self._orig.name) else: # Series - result = Series(result, dtype=object, index=data.index, - name=self._orig.name) + result = Series( + result, dtype=object, index=data.index, name=self._orig.name + ) return result - _shared_docs['str_split'] = (r""" + _shared_docs[ + "str_split" + ] = r""" Split strings around given separator/delimiter. 
Splits the string in the Series/Index from the %(side)s, @@ -2496,25 +2550,23 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): >>> s.str.split(r"\+|=", expand=True) 0 1 2 0 1 1 2 - """) + """ - @Appender(_shared_docs['str_split'] % { - 'side': 'beginning', - 'method': 'split'}) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) + @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) - @Appender(_shared_docs['str_split'] % { - 'side': 'end', - 'method': 'rsplit'}) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) + @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) - _shared_docs['str_partition'] = (""" + _shared_docs[ + "str_partition" + ] = """ Split the string at the %(side)s occurrence of `sep`. This method splits the string at the %(side)s occurrence of `sep`, @@ -2595,32 +2647,36 @@ def rsplit(self, pat=None, n=-1, expand=False): >>> idx.str.partition(expand=False) Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """) - - @Appender(_shared_docs['str_partition'] % { - 'side': 'first', - 'return': '3 elements containing the string itself, followed by two ' - 'empty strings', - 'also': 'rpartition : Split the string at the last occurrence of ' - '`sep`.' - }) - @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') - @forbid_nonstring_types(['bytes']) - def partition(self, sep=' ', expand=True): + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of " "`sep`.", + } + ) + @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") + @forbid_nonstring_types(["bytes"]) + def partition(self, sep=" ", expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) - @Appender(_shared_docs['str_partition'] % { - 'side': 'last', - 'return': '3 elements containing two empty strings, followed by the ' - 'string itself', - 'also': 'partition : Split the string at the first occurrence of ' - '`sep`.' 
- }) - @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') - @forbid_nonstring_types(['bytes']) - def rpartition(self, sep=' ', expand=True): + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of " "`sep`.", + } + ) + @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep=" ", expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) @@ -2631,44 +2687,48 @@ def get(self, i): return self._wrap_result(result) @copy(str_join) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains(self._parent, pat, case=case, flags=flags, na=na, - regex=regex) + result = str_contains( + self._parent, pat, case=case, flags=flags, na=na, regex=regex + ) return self._wrap_result(result, fill_value=na) @copy(str_match) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace(self._parent, pat, repl, n=n, case=case, - flags=flags, regex=regex) + result = str_replace( + self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex + ) return self._wrap_result(result) @copy(str_repeat) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) - @forbid_nonstring_types(['bytes']) - def pad(self, width, side='left', fillchar=' '): + @forbid_nonstring_types(["bytes"]) + def pad(self, width, side="left", fillchar=" "): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) - _shared_docs['str_pad'] = (""" + _shared_docs[ + "str_pad" + ] = """ Filling %(side)s side of strings in the Series/Index with an additional character. Equivalent to :meth:`str.%(method)s`. 
@@ -2683,25 +2743,24 @@ def pad(self, width, side='left', fillchar=' '): Returns ------- filled : Series/Index of objects - """) + """ - @Appender(_shared_docs['str_pad'] % dict(side='left and right', - method='center')) - @forbid_nonstring_types(['bytes']) - def center(self, width, fillchar=' '): - return self.pad(width, side='both', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) + @forbid_nonstring_types(["bytes"]) + def center(self, width, fillchar=" "): + return self.pad(width, side="both", fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) - @forbid_nonstring_types(['bytes']) - def ljust(self, width, fillchar=' '): - return self.pad(width, side='right', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width, fillchar=" "): + return self.pad(width, side="right", fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust')) - @forbid_nonstring_types(['bytes']) - def rjust(self, width, fillchar=' '): - return self.pad(width, side='left', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width, fillchar=" "): + return self.pad(width, side="left", fillchar=fillchar) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. @@ -2762,7 +2821,7 @@ def zfill(self, width): 4 NaN dtype: object """ - result = str_pad(self._parent, width, side='left', fillchar='0') + result = str_pad(self._parent, width, side="left", fillchar="0") return self._wrap_result(result) @copy(str_slice) @@ -2771,7 +2830,7 @@ def slice(self, start=None, stop=None, step=None): return self._wrap_result(result) @copy(str_slice_replace) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @@ -2783,12 +2842,14 @@ def decode(self, encoding, errors="strict"): return self._wrap_result(result) @copy(str_encode) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) - _shared_docs['str_strip'] = (r""" + _shared_docs[ + "str_strip" + ] = r""" Remove leading and trailing characters. 
Strip whitespaces (including newlines) or a set of specified characters @@ -2849,67 +2910,69 @@ def encode(self, encoding, errors="strict"): 2 Cat 3 NaN dtype: object - """) + """ - @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', - method='strip')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["str_strip"] % dict(side="left and right sides", method="strip") + ) + @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='both') + result = str_strip(self._parent, to_strip, side="both") return self._wrap_result(result) - @Appender(_shared_docs['str_strip'] % dict(side='left side', - method='lstrip')) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip")) + @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='left') + result = str_strip(self._parent, to_strip, side="left") return self._wrap_result(result) - @Appender(_shared_docs['str_strip'] % dict(side='right side', - method='rstrip')) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip")) + @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='right') + result = str_strip(self._parent, to_strip, side="right") return self._wrap_result(result) @copy(str_wrap) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) @copy(str_get_dummies) - @forbid_nonstring_types(['bytes']) - def get_dummies(self, sep='|'): + @forbid_nonstring_types(["bytes"]) + def get_dummies(self, sep="|"): # we need to cast to Series of strings as only that has all # methods available for making the dummies... data = self._orig.astype(str) if self._is_categorical else self._parent result, name = str_get_dummies(data, sep) - return self._wrap_result(result, use_codes=(not self._is_categorical), - name=name, expand=True) + return self._wrap_result( + result, use_codes=(not self._is_categorical), name=name, expand=True + ) @copy(str_translate) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True, name='count') - startswith = _pat_wrapper(str_startswith, na=True, name='startswith') - endswith = _pat_wrapper(str_endswith, na=True, name='endswith') - findall = _pat_wrapper(str_findall, flags=True, name='findall') + count = _pat_wrapper(str_count, flags=True, name="count") + startswith = _pat_wrapper(str_startswith, na=True, name="startswith") + endswith = _pat_wrapper(str_endswith, na=True, name="endswith") + findall = _pat_wrapper(str_findall, flags=True, name="findall") @copy(str_extract) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) - _shared_docs['find'] = (""" + _shared_docs[ + "find" + ] = """ Return %(side)s indexes in each strings in the Series/Index where the substring is fully contained between [start:end]. 
Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`. @@ -2930,26 +2993,35 @@ def extractall(self, pat, flags=0): See Also -------- %(also)s - """) + """ - @Appender(_shared_docs['find'] % - dict(side='lowest', method='find', - also='rfind : Return highest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["find"] + % dict( + side="lowest", + method="find", + also="rfind : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side='left') + result = str_find(self._parent, sub, start=start, end=end, side="left") return self._wrap_result(result) - @Appender(_shared_docs['find'] % - dict(side='highest', method='rfind', - also='find : Return lowest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["find"] + % dict( + side="highest", + method="rfind", + also="find : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, - start=start, end=end, side='right') + result = str_find(self._parent, sub, start=start, end=end, side="right") return self._wrap_result(result) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. @@ -2966,11 +3038,14 @@ def normalize(self, form): normalized : Series/Index of objects """ import unicodedata + f = lambda x: unicodedata.normalize(form, x) result = _na_map(f, self._parent) return self._wrap_result(result) - _shared_docs['index'] = (""" + _shared_docs[ + "index" + ] = """ Return %(side)s indexes in each strings where the substring is fully contained between [start:end]. This is the same as ``str.%(similar)s`` except instead of returning -1, it raises a ValueError @@ -2992,27 +3067,39 @@ def normalize(self, form): See Also -------- %(also)s - """) + """ - @Appender(_shared_docs['index'] % - dict(side='lowest', similar='find', method='index', - also='rindex : Return highest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["index"] + % dict( + side="lowest", + similar="find", + method="index", + also="rindex : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, - start=start, end=end, side='left') + result = str_index(self._parent, sub, start=start, end=end, side="left") return self._wrap_result(result) - @Appender(_shared_docs['index'] % - dict(side='highest', similar='rfind', method='rindex', - also='index : Return lowest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["index"] + % dict( + side="highest", + similar="rfind", + method="rindex", + also="index : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, - start=start, end=end, side='right') + result = str_index(self._parent, sub, start=start, end=end, side="right") return self._wrap_result(result) - _shared_docs['len'] = (""" + _shared_docs[ + "len" + ] = """ Compute the length of each element in the Series/Index. The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary). 
@@ -3055,11 +3142,14 @@ def rindex(self, sub, start=0, end=None): 4 4.0 5 3.0 dtype: float64 - """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], - forbidden_types=None, dtype=int) + """ + len = _noarg_wrapper( + len, docstring=_shared_docs["len"], forbidden_types=None, dtype=int + ) - _shared_docs['casemethods'] = (""" + _shared_docs[ + "casemethods" + ] = """ Convert strings in the Series/Index to %(type)s. %(version)s Equivalent to :meth:`str.%(method)s`. @@ -3124,45 +3214,56 @@ def rindex(self, sub, start=0, end=None): 2 THIS IS A SENTENCE 3 sWaPcAsE dtype: object - """) + """ # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args = {} # type: Dict[str, Dict[str, str]] - _doc_args['lower'] = dict(type='lowercase', method='lower', version='') - _doc_args['upper'] = dict(type='uppercase', method='upper', version='') - _doc_args['title'] = dict(type='titlecase', method='title', version='') - _doc_args['capitalize'] = dict(type='be capitalized', method='capitalize', - version='') - _doc_args['swapcase'] = dict(type='be swapcased', method='swapcase', - version='') - _doc_args['casefold'] = dict(type='be casefolded', method='casefold', - version='\n .. versionadded:: 0.25.0\n') - lower = _noarg_wrapper(lambda x: x.lower(), - name='lower', - docstring=_shared_docs['casemethods'] % - _doc_args['lower']) - upper = _noarg_wrapper(lambda x: x.upper(), - name='upper', - docstring=_shared_docs['casemethods'] % - _doc_args['upper']) - title = _noarg_wrapper(lambda x: x.title(), - name='title', - docstring=_shared_docs['casemethods'] % - _doc_args['title']) - capitalize = _noarg_wrapper(lambda x: x.capitalize(), - name='capitalize', - docstring=_shared_docs['casemethods'] % - _doc_args['capitalize']) - swapcase = _noarg_wrapper(lambda x: x.swapcase(), - name='swapcase', - docstring=_shared_docs['casemethods'] % - _doc_args['swapcase']) - casefold = _noarg_wrapper(lambda x: x.casefold(), - name='casefold', - docstring=_shared_docs['casemethods'] % - _doc_args['casefold']) - - _shared_docs['ismethods'] = (""" + _doc_args["lower"] = dict(type="lowercase", method="lower", version="") + _doc_args["upper"] = dict(type="uppercase", method="upper", version="") + _doc_args["title"] = dict(type="titlecase", method="title", version="") + _doc_args["capitalize"] = dict( + type="be capitalized", method="capitalize", version="" + ) + _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") + _doc_args["casefold"] = dict( + type="be casefolded", + method="casefold", + version="\n .. versionadded:: 0.25.0\n", + ) + lower = _noarg_wrapper( + lambda x: x.lower(), + name="lower", + docstring=_shared_docs["casemethods"] % _doc_args["lower"], + ) + upper = _noarg_wrapper( + lambda x: x.upper(), + name="upper", + docstring=_shared_docs["casemethods"] % _doc_args["upper"], + ) + title = _noarg_wrapper( + lambda x: x.title(), + name="title", + docstring=_shared_docs["casemethods"] % _doc_args["title"], + ) + capitalize = _noarg_wrapper( + lambda x: x.capitalize(), + name="capitalize", + docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + ) + swapcase = _noarg_wrapper( + lambda x: x.swapcase(), + name="swapcase", + docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + ) + casefold = _noarg_wrapper( + lambda x: x.casefold(), + name="casefold", + docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + ) + + _shared_docs[ + "ismethods" + ] = """ Check whether all characters in each string are %(type)s. 
This is equivalent to running the Python string method @@ -3301,52 +3402,61 @@ def rindex(self, sub, start=0, end=None): 2 False 3 False dtype: bool - """) - _doc_args['isalnum'] = dict(type='alphanumeric', method='isalnum') - _doc_args['isalpha'] = dict(type='alphabetic', method='isalpha') - _doc_args['isdigit'] = dict(type='digits', method='isdigit') - _doc_args['isspace'] = dict(type='whitespace', method='isspace') - _doc_args['islower'] = dict(type='lowercase', method='islower') - _doc_args['isupper'] = dict(type='uppercase', method='isupper') - _doc_args['istitle'] = dict(type='titlecase', method='istitle') - _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric') - _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal') - isalnum = _noarg_wrapper(lambda x: x.isalnum(), - name='isalnum', - docstring=_shared_docs['ismethods'] % - _doc_args['isalnum']) - isalpha = _noarg_wrapper(lambda x: x.isalpha(), - name='isalpha', - docstring=_shared_docs['ismethods'] % - _doc_args['isalpha']) - isdigit = _noarg_wrapper(lambda x: x.isdigit(), - name='isdigit', - docstring=_shared_docs['ismethods'] % - _doc_args['isdigit']) - isspace = _noarg_wrapper(lambda x: x.isspace(), - name='isspace', - docstring=_shared_docs['ismethods'] % - _doc_args['isspace']) - islower = _noarg_wrapper(lambda x: x.islower(), - name='islower', - docstring=_shared_docs['ismethods'] % - _doc_args['islower']) - isupper = _noarg_wrapper(lambda x: x.isupper(), - name='isupper', - docstring=_shared_docs['ismethods'] % - _doc_args['isupper']) - istitle = _noarg_wrapper(lambda x: x.istitle(), - name='istitle', - docstring=_shared_docs['ismethods'] % - _doc_args['istitle']) - isnumeric = _noarg_wrapper(lambda x: x.isnumeric(), - name='isnumeric', - docstring=_shared_docs['ismethods'] % - _doc_args['isnumeric']) - isdecimal = _noarg_wrapper(lambda x: x.isdecimal(), - name='isdecimal', - docstring=_shared_docs['ismethods'] % - _doc_args['isdecimal']) + """ + _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") + _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") + _doc_args["isdigit"] = dict(type="digits", method="isdigit") + _doc_args["isspace"] = dict(type="whitespace", method="isspace") + _doc_args["islower"] = dict(type="lowercase", method="islower") + _doc_args["isupper"] = dict(type="uppercase", method="isupper") + _doc_args["istitle"] = dict(type="titlecase", method="istitle") + _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") + _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + isalnum = _noarg_wrapper( + lambda x: x.isalnum(), + name="isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], + ) + isalpha = _noarg_wrapper( + lambda x: x.isalpha(), + name="isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], + ) + isdigit = _noarg_wrapper( + lambda x: x.isdigit(), + name="isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], + ) + isspace = _noarg_wrapper( + lambda x: x.isspace(), + name="isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"], + ) + islower = _noarg_wrapper( + lambda x: x.islower(), + name="islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"], + ) + isupper = _noarg_wrapper( + lambda x: x.isupper(), + name="isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"], + ) + istitle = _noarg_wrapper( + lambda x: x.istitle(), + name="istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"], + ) + isnumeric = _noarg_wrapper( 
+ lambda x: x.isnumeric(), + name="isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], + ) + isdecimal = _noarg_wrapper( + lambda x: x.isdecimal(), + name="isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], + ) @classmethod def _make_accessor(cls, data): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3e3318ed4c4b6b..e9d2c3f07bfae0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -8,16 +8,33 @@ from pandas._libs import tslib, tslibs from pandas._libs.tslibs import Timestamp, conversion, parsing from pandas._libs.tslibs.parsing import ( # noqa - DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string) + DateParseError, + _format_is_iso, + _guess_datetime_format, + parse_time_string, +) from pandas._libs.tslibs.strptime import array_strptime from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, - is_list_like, is_numeric_dtype, is_scalar) + ensure_object, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries) + ABCDataFrame, + ABCDatetimeIndex, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import notna from pandas._typing import ArrayLike @@ -35,9 +52,10 @@ # types used in annotations Scalar = Union[int, float, str] -DatetimeScalar = TypeVar('DatetimeScalar', Scalar, datetime) -DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, list, tuple, - ArrayLike, ABCSeries] +DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime) +DatetimeScalarOrArrayConvertible = Union[ + DatetimeScalar, list, tuple, ArrayLike, ABCSeries +] # --------------------------------------------------------------------- @@ -50,8 +68,9 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, - check_count: Optional[int] = None) -> bool: +def should_cache( + arg: ArrayConvertible, unique_share: float = 0.7, check_count: Optional[int] = None +) -> bool: """ Decides whether to do caching. @@ -91,12 +110,13 @@ def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, else: check_count = 500 else: - assert 0 <= check_count <= len(arg), \ - 'check_count must be in next bounds: [0; len(arg)]' + assert ( + 0 <= check_count <= len(arg) + ), "check_count must be in next bounds: [0; len(arg)]" if check_count == 0: return False - assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)" unique_elements = unique(arg[:check_count]) if len(unique_elements) > check_count * unique_share: @@ -124,6 +144,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): Cache of converted, unique dates. 
Can be empty """ from pandas import Series + cache_array = Series() if cache: # Perform a quicker unique check @@ -138,9 +159,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): def _box_as_indexlike( - dt_array: ArrayLike, - utc: Optional[bool] = None, - name: Optional[str] = None + dt_array: ArrayLike, utc: Optional[bool] = None, name: Optional[str] = None ) -> Union[ABCIndex, ABCDatetimeIndex]: """ Properly boxes the ndarray of datetimes to DatetimeIndex @@ -162,8 +181,9 @@ def _box_as_indexlike( - general Index otherwise """ from pandas import DatetimeIndex, Index + if is_datetime64_dtype(dt_array): - tz = 'utc' if utc else None + tz = "utc" if utc else None return DatetimeIndex(dt_array, tz=tz, name=name) return Index(dt_array, name=name) @@ -172,7 +192,7 @@ def _convert_and_box_cache( arg: DatetimeScalarOrArrayConvertible, cache_array: ABCSeries, box: bool, - name: Optional[str] = None + name: Optional[str] = None, ) -> Union[ABCIndex, np.ndarray]: """ Convert array of dates with a cache and box the result @@ -194,6 +214,7 @@ def _convert_and_box_cache( - ndarray if box=False """ from pandas import Series + result = Series(arg).map(cache_array) if box: return _box_as_indexlike(result, utc=None, name=name) @@ -226,21 +247,34 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): - ndarray of Timestamps if box=False """ if tz is not None: - raise ValueError("Cannot pass a tz argument when " - "parsing strings with timezone " - "information.") - tz_results = np.array([Timestamp(res).tz_localize(zone) for res, zone - in zip(result, timezones)]) + raise ValueError( + "Cannot pass a tz argument when " + "parsing strings with timezone " + "information." + ) + tz_results = np.array( + [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] + ) if box: from pandas import Index + return Index(tz_results, name=name) return tz_results -def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - unit=None, errors=None, - infer_datetime_format=None, dayfirst=None, - yearfirst=None, exact=None): +def _convert_listlike_datetimes( + arg, + box, + format, + name=None, + tz=None, + unit=None, + errors=None, + infer_datetime_format=None, + dayfirst=None, + yearfirst=None, + exact=None, +): """ Helper function for to_datetime. 
Performs the conversions of 1D listlike of dates @@ -279,16 +313,18 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, from pandas import DatetimeIndex from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import ( - maybe_convert_dtype, objects_to_datetime64ns) + maybe_convert_dtype, + objects_to_datetime64ns, + ) if isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') + arg = np.array(arg, dtype="O") # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) - if tz == 'utc': + if tz == "utc": arg = arg.tz_convert(None).tz_localize(tz) return arg @@ -304,12 +340,12 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, 'values', arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, - errors=errors) + arg = getattr(arg, "values", arg) + result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: - if errors == 'ignore': + if errors == "ignore": from pandas import Index + result = Index(result, name=name) else: result = DatetimeIndex(result, name=name) @@ -317,7 +353,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) # result will be naive but in UTC try: - result = result.tz_localize('UTC').tz_convert(tz_parsed) + result = result.tz_localize("UTC").tz_convert(tz_parsed) except AttributeError: # Regular Index from 'ignore' path return result @@ -327,9 +363,10 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, else: result = result.tz_convert(tz) return result - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, datetime, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, " "1-d array, or Series" + ) # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation @@ -358,30 +395,33 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if format is not None: try: # shortcut formatting here - if format == '%Y%m%d': + if format == "%Y%m%d": try: # pass orig_arg as float-dtype may have been converted to # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): - raise ValueError("cannot convert the input to " - "'%Y%m%d' date format") + raise ValueError( + "cannot convert the input to " "'%Y%m%d' date format" + ) # fallback if result is None: try: result, timezones = array_strptime( - arg, format, exact=exact, errors=errors) - if '%Z' in format or '%z' in format: + arg, format, exact=exact, errors=errors + ) + if "%Z" in format or "%z" in format: return _return_parsed_timezone_results( - result, timezones, box, tz, name) + result, timezones, box, tz, name + ) except tslibs.OutOfBoundsDatetime: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'coerce': - result = np.empty(arg.shape, dtype='M8[ns]') - iresult = result.view('i8') + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult.fill(tslibs.iNaT) else: result = arg @@ -390,11 +430,11 @@ def _convert_listlike_datetimes(arg, box, format, 
name=None, tz=None, # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'coerce': - result = np.empty(arg.shape, dtype='M8[ns]') - iresult = result.view('i8') + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult.fill(tslibs.iNaT) else: result = arg @@ -409,27 +449,30 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if result is None: assert format is None or infer_datetime_format - utc = tz == 'utc' + utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( - arg, dayfirst=dayfirst, yearfirst=yearfirst, - utc=utc, errors=errors, require_iso8601=require_iso8601, - allow_object=True) + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + require_iso8601=require_iso8601, + allow_object=True, + ) if tz_parsed is not None: if box: # We can take a shortcut since the datetime64 numpy array # is in UTC - return DatetimeIndex._simple_new(result, name=name, - tz=tz_parsed) + return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) else: # Convert the datetime64 numpy array to an numpy array # of datetime objects - result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() - for ts in result] + result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() for ts in result] return np.array(result, dtype=object) if box: - utc = tz == 'utc' + utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name) return result @@ -452,16 +495,15 @@ def _adjust_to_origin(arg, origin, unit): ------- ndarray or scalar of adjusted date(s) """ - if origin == 'julian': + if origin == "julian": original = arg j0 = Timestamp(0).to_julian_date() - if unit != 'D': + if unit != "D": raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except TypeError: - raise ValueError("incompatible 'arg' type for given " - "'origin'='julian'") + raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 @@ -469,30 +511,36 @@ def _adjust_to_origin(arg, origin, unit): if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( "{original} is Out of Bounds for " - "origin='julian'".format(original=original)) + "origin='julian'".format(original=original) + ) else: # arg must be numeric - if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or - is_numeric_dtype(np.asarray(arg))): + if not ( + (is_scalar(arg) and (is_integer(arg) or is_float(arg))) + or is_numeric_dtype(np.asarray(arg)) + ): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( - arg=arg, - origin=origin)) + arg=arg, origin=origin + ) + ) # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: raise tslibs.OutOfBoundsDatetime( - "origin {origin} is Out of Bounds".format(origin=origin)) + "origin {origin} is Out of Bounds".format(origin=origin) + ) except ValueError: - raise ValueError("origin {origin} cannot be converted " - "to a Timestamp".format(origin=origin)) + raise ValueError( + "origin {origin} cannot be converted " + "to a Timestamp".format(origin=origin) + ) if offset.tz is not None: - raise ValueError( - "origin offset {} must be tz-naive".format(offset)) + raise ValueError("origin offset {} must be tz-naive".format(offset)) offset -= Timestamp(0) # 
convert the offset to the unit of the arg @@ -501,17 +549,28 @@ def _adjust_to_origin(arg, origin, unit): # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( - arg, (ABCSeries, ABCIndexClass, np.ndarray)): + arg, (ABCSeries, ABCIndexClass, np.ndarray) + ): arg = np.asarray(arg) arg = arg + offset return arg -@deprecate_kwarg(old_arg_name='box', new_arg_name=None) -def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, infer_datetime_format=False, origin='unix', - cache=True): +@deprecate_kwarg(old_arg_name="box", new_arg_name=None) +def to_datetime( + arg, + errors="raise", + dayfirst=False, + yearfirst=False, + utc=None, + box=True, + format=None, + exact=True, + unit=None, + infer_datetime_format=False, + origin="unix", + cache=True, +): """ Convert argument to datetime. @@ -686,14 +745,20 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, if arg is None: return None - if origin != 'unix': + if origin != "unix": arg = _adjust_to_origin(arg, origin, unit) - tz = 'utc' if utc else None - convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit, - dayfirst=dayfirst, yearfirst=yearfirst, - errors=errors, exact=exact, - infer_datetime_format=infer_datetime_format) + tz = "utc" if utc else None + convert_listlike = partial( + _convert_listlike_datetimes, + tz=tz, + unit=unit, + dayfirst=dayfirst, + yearfirst=yearfirst, + errors=errors, + exact=exact, + infer_datetime_format=infer_datetime_format, + ) if isinstance(arg, Timestamp): result = arg @@ -714,8 +779,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, - name=arg.name) + result = _convert_and_box_cache(arg, cache_array, box, name=arg.name) else: convert_listlike = partial(convert_listlike, name=arg.name) result = convert_listlike(arg, box, format) @@ -732,28 +796,29 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, # mappings for assembling units -_unit_map = {'year': 'year', - 'years': 'year', - 'month': 'month', - 'months': 'month', - 'day': 'day', - 'days': 'day', - 'hour': 'h', - 'hours': 'h', - 'minute': 'm', - 'minutes': 'm', - 'second': 's', - 'seconds': 's', - 'ms': 'ms', - 'millisecond': 'ms', - 'milliseconds': 'ms', - 'us': 'us', - 'microsecond': 'us', - 'microseconds': 'us', - 'ns': 'ns', - 'nanosecond': 'ns', - 'nanoseconds': 'ns' - } +_unit_map = { + "year": "year", + "years": "year", + "month": "month", + "months": "month", + "day": "day", + "days": "day", + "hour": "h", + "hours": "h", + "minute": "m", + "minutes": "m", + "second": "s", + "seconds": "s", + "ms": "ms", + "millisecond": "ms", + "milliseconds": "ms", + "us": "us", + "microsecond": "us", + "microseconds": "us", + "ns": "ns", + "nanosecond": "ns", + "nanoseconds": "ns", +} def _assemble_from_unit_mappings(arg, errors, box, tz): @@ -780,6 +845,7 @@ def _assemble_from_unit_mappings(arg, errors, box, tz): Series """ from pandas import to_timedelta, to_numeric, DataFrame + arg = DataFrame(arg) if not arg.columns.is_unique: raise ValueError("cannot assemble with duplicate keys") @@ -799,19 +865,23 @@ def f(value): unit_rev = {v: k for k, v in unit.items()} # we require at least Ymd - required = ['year', 'month', 'day'] + required = ["year", "month", "day"] req = sorted(list(set(required) - 
set(unit_rev.keys()))) if len(req): - raise ValueError("to assemble mappings requires at least that " - "[year, month, day] be specified: [{required}] " - "is missing".format(required=','.join(req))) + raise ValueError( + "to assemble mappings requires at least that " + "[year, month, day] be specified: [{required}] " + "is missing".format(required=",".join(req)) + ) # keys we don't recognize excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) if len(excess): - raise ValueError("extra keys have been passed " - "to the datetime assemblage: " - "[{excess}]".format(excess=','.join(excess))) + raise ValueError( + "extra keys have been passed " + "to the datetime assemblage: " + "[{excess}]".format(excess=",".join(excess)) + ) def coerce(values): # we allow coercion to if errors allows @@ -819,28 +889,29 @@ def coerce(values): # prevent overflow in case of int8 or int16 if is_integer_dtype(values): - values = values.astype('int64', copy=False) + values = values.astype("int64", copy=False) return values - values = (coerce(arg[unit_rev['year']]) * 10000 + - coerce(arg[unit_rev['month']]) * 100 + - coerce(arg[unit_rev['day']])) + values = ( + coerce(arg[unit_rev["year"]]) * 10000 + + coerce(arg[unit_rev["month"]]) * 100 + + coerce(arg[unit_rev["day"]]) + ) try: - values = to_datetime(values, format='%Y%m%d', errors=errors, utc=tz) + values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz) except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the " - "datetimes: {error}".format(error=e)) + raise ValueError("cannot assemble the " "datetimes: {error}".format(error=e)) - for u in ['h', 'm', 's', 'ms', 'us', 'ns']: + for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: try: - values += to_timedelta(coerce(arg[value]), - unit=u, - errors=errors) + values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the datetimes [{value}]: " - "{error}".format(value=value, error=e)) + raise ValueError( + "cannot assemble the datetimes [{value}]: " + "{error}".format(value=value, error=e) + ) if not box: return values.values return values @@ -861,18 +932,18 @@ def _attempt_YYYYMMDD(arg, errors): def calc(carg): # calculate the actual result carg = carg.astype(object) - parsed = parsing.try_parse_year_month_day(carg / 10000, - carg / 100 % 100, - carg % 100) + parsed = parsing.try_parse_year_month_day( + carg / 10000, carg / 100 % 100, carg % 100 + ) return tslib.array_to_datetime(parsed, errors=errors)[0] def calc_with_mask(carg, mask): - result = np.empty(carg.shape, dtype='M8[ns]') - iresult = result.view('i8') + result = np.empty(carg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult[~mask] = tslibs.iNaT masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) - result[mask] = masked_result.astype('M8[ns]') + result[mask] = masked_result.astype("M8[ns]") return result # try intlike / strings that are ints @@ -899,8 +970,16 @@ def calc_with_mask(carg, mask): # Fixed time formats for time parsing -_time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", - "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] +_time_formats = [ + "%H:%M", + "%H%M", + "%I:%M%p", + "%I%M%p", + "%H:%M:%S", + "%H%M%S", + "%I:%M:%S%p", + "%I%M%S%p", +] def _guess_time_format_for_array(arr): @@ -918,7 +997,7 @@ def _guess_time_format_for_array(arr): return None -def to_time(arg, format=None, infer_time_format=False, errors='raise'): +def 
to_time(arg, format=None, infer_time_format=False, errors="raise"): """ Parse time strings to time objects using fixed strptime formats ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", @@ -949,11 +1028,12 @@ def to_time(arg, format=None, infer_time_format=False, errors='raise'): def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') + arg = np.array(arg, dtype="O") - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, datetime, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, " "1-d array, or Series" + ) arg = ensure_object(arg) @@ -966,12 +1046,13 @@ def _convert_listlike(arg, format): try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): - if errors == 'raise': - msg = ("Cannot convert {element} to a time with given " - "format {format}").format(element=element, - format=format) + if errors == "raise": + msg = ( + "Cannot convert {element} to a time with given " + "format {format}" + ).format(element=element, format=format) raise ValueError(msg) - elif errors == 'ignore': + elif errors == "ignore": return arg else: times.append(None) @@ -982,8 +1063,7 @@ def _convert_listlike(arg, format): time_object = None for time_format in formats: try: - time_object = datetime.strptime(element, - time_format).time() + time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) @@ -995,10 +1075,11 @@ def _convert_listlike(arg, format): if time_object is not None: times.append(time_object) - elif errors == 'raise': - raise ValueError("Cannot convert arg {arg} to " - "a time".format(arg=arg)) - elif errors == 'ignore': + elif errors == "raise": + raise ValueError( + "Cannot convert arg {arg} to " "a time".format(arg=arg) + ) + elif errors == "ignore": return arg else: times.append(None) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index d7a1b1119ce4be..e1a976b874c25d 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -4,14 +4,19 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_object, is_datetime_or_timedelta_dtype, is_decimal, is_number, - is_numeric_dtype, is_scalar) + ensure_object, + is_datetime_or_timedelta_dtype, + is_decimal, + is_number, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries import pandas as pd -def to_numeric(arg, errors='raise', downcast=None): +def to_numeric(arg, errors="raise", downcast=None): """ Convert argument to a numeric type. 
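For orientation, a minimal usage sketch of the public pandas.to_numeric entry point that the surrounding hunks re-wrap; the input values are made up and illustrative only:

    import pandas as pd

    # strings are parsed to the best-fitting numeric dtype; errors="coerce"
    # turns unparseable entries into NaN instead of raising
    pd.to_numeric(pd.Series(["1", "2.5", "oops"]), errors="coerce")

    # downcast="integer" requests the smallest integer subtype that can hold the data
    pd.to_numeric(pd.Series([1, 2, 3]), downcast="integer")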
@@ -102,11 +107,11 @@ def to_numeric(arg, errors='raise', downcast=None): 3 -3.0 dtype: float64 """ - if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): - raise ValueError('invalid downcasting method provided') + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") - if errors not in ('ignore', 'raise', 'coerce'): - raise ValueError('invalid error value specified') + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") is_series = False is_index = False @@ -121,16 +126,16 @@ def to_numeric(arg, errors='raise', downcast=None): if values is None: values = arg.values elif isinstance(arg, (list, tuple)): - values = np.array(arg, dtype='O') + values = np.array(arg, dtype="O") elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True - values = np.array([arg], dtype='O') - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a list, tuple, 1-d array, or Series') + values = np.array([arg], dtype="O") + elif getattr(arg, "ndim", 1) > 1: + raise TypeError("arg must be a list, tuple, 1-d array, or Series") else: values = arg @@ -141,12 +146,13 @@ def to_numeric(arg, errors='raise', downcast=None): values = values.astype(np.int64) else: values = ensure_object(values) - coerce_numeric = errors not in ('ignore', 'raise') - values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=coerce_numeric) + coerce_numeric = errors not in ("ignore", "raise") + values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=coerce_numeric + ) except Exception: - if errors == 'raise': + if errors == "raise": raise # attempt downcast only if the data has been successfully converted @@ -154,12 +160,12 @@ def to_numeric(arg, errors='raise', downcast=None): if downcast is not None and is_numeric_dtype(values): typecodes = None - if downcast in ('integer', 'signed'): - typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) >= 0: - typecodes = np.typecodes['UnsignedInteger'] - elif downcast == 'float': - typecodes = np.typecodes['Float'] + if downcast in ("integer", "signed"): + typecodes = np.typecodes["Integer"] + elif downcast == "unsigned" and np.min(values) >= 0: + typecodes = np.typecodes["UnsignedInteger"] + elif downcast == "float": + typecodes = np.typecodes["Float"] # pandas support goes only to np.float32, # as float dtypes smaller than that are diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 5e89b73c8754e7..2c594a3df27ea1 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -16,8 +16,8 @@ from pandas.core.arrays.timedeltas import sequence_to_td64ns -@deprecate_kwarg(old_arg_name='box', new_arg_name=None) -def to_timedelta(arg, unit='ns', box=True, errors='raise'): +@deprecate_kwarg(old_arg_name="box", new_arg_name=None) +def to_timedelta(arg, unit="ns", box=True, errors="raise"): """ Convert argument to timedelta. 
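Likewise, a short illustrative sketch of pandas.to_timedelta, whose argument validation the next hunks reformat; the sample inputs are invented:

    import pandas as pd

    # strings, numbers with a unit, and list-likes are all accepted
    pd.to_timedelta("1 days 06:05:01.00003")
    pd.to_timedelta([1, 2, 3], unit="s")

    # errors="coerce" maps unparseable values to NaT rather than raising
    pd.to_timedelta(["1 day", "not a delta"], errors="coerce")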
@@ -96,50 +96,49 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ unit = parse_timedelta_unit(unit) - if errors not in ('ignore', 'raise', 'coerce'): - raise ValueError("errors must be one of 'ignore', " - "'raise', or 'coerce'}") + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("errors must be one of 'ignore', " "'raise', or 'coerce'}") - if unit in {'Y', 'y', 'M'}: - warnings.warn("M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + if unit in {"Y", "y", "M"}: + warnings.warn( + "M and Y units are deprecated and " "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) if arg is None: return arg elif isinstance(arg, ABCSeries): - values = _convert_listlike(arg._values, unit=unit, - box=False, errors=errors) + values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) return arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, unit=unit, box=box, - errors=errors, name=arg.name) + return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) elif isinstance(arg, np.ndarray) and arg.ndim == 0: # extract array scalar and process below arg = arg.item() - elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: + elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1: return _convert_listlike(arg, unit=unit, box=box, errors=errors) - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, timedelta, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, timedelta, list, tuple, " "1-d array, or Series" + ) # ...so it must be a scalar value. Return scalar. 
- return _coerce_scalar_to_timedelta_type(arg, unit=unit, - box=box, errors=errors) + return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors) -def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): +def _coerce_scalar_to_timedelta_type(r, unit="ns", box=True, errors="raise"): """Convert string 'r' to a timedelta object.""" try: result = Timedelta(r, unit) if not box: # explicitly view as timedelta64 for case when result is pd.NaT - result = result.asm8.view('timedelta64[ns]') + result = result.asm8.view("timedelta64[ns]") except ValueError: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'ignore': + elif errors == "ignore": return r # coerce @@ -148,10 +147,10 @@ def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): return result -def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): +def _convert_listlike(arg, unit="ns", box=True, errors="raise", name=None): """Convert a list of objects to a timedelta index object.""" - if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): + if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"): # This is needed only to ensure that in the case where we end up # returning arg (errors == "ignore"), and where the input is a # generator, we return a useful list-like instead of a @@ -159,10 +158,9 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): arg = np.array(list(arg), dtype=object) try: - value = sequence_to_td64ns(arg, unit=unit, - errors=errors, copy=False)[0] + value = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] except ValueError: - if errors == 'ignore': + if errors == "ignore": return arg else: # This else-block accounts for the cases when errors='raise' @@ -176,5 +174,6 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): if box: from pandas import TimedeltaIndex - value = TimedeltaIndex(value, unit='ns', name=name) + + value = TimedeltaIndex(value, unit="ns", name=name) return value diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index a916f2f06df21e..f07133baed4359 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -10,13 +10,20 @@ from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( - is_categorical_dtype, is_extension_array_dtype, is_list_like) + is_categorical_dtype, + is_extension_array_dtype, + is_list_like, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna # 16 byte long hashing key -_default_hash_key = '0123456789123456' +_default_hash_key = "0123456789123456" def _combine_hash_arrays(arrays, num_items): @@ -42,13 +49,14 @@ def _combine_hash_arrays(arrays, num_items): out ^= a out *= mult mult += np.uint64(82520 + inverse_i + inverse_i) - assert i + 1 == num_items, 'Fed in wrong num_items' + assert i + 1 == num_items, "Fed in wrong num_items" out += np.uint64(97531) return out -def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, - categorize=True): +def hash_pandas_object( + obj, index=True, encoding="utf8", hash_key=None, categorize=True +): """ Return a data hash of the Index/Series/DataFrame @@ -72,53 +80,63 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, Series of uint64, same length as the object """ from pandas import Series + if 
hash_key is None: hash_key = _default_hash_key if isinstance(obj, ABCMultiIndex): - return Series(hash_tuples(obj, encoding, hash_key), - dtype='uint64', copy=False) + return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) - h = Series(h, index=obj, dtype='uint64', copy=False) + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + h = Series(h, index=obj, dtype="uint64", copy=False) elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) if index: - index_iter = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values - for _ in [None]) + index_iter = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values + for _ in [None] + ) arrays = itertools.chain([h], index_iter) h = _combine_hash_arrays(arrays, 2) - h = Series(h, index=obj.index, dtype='uint64', copy=False) + h = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): hashes = (hash_array(series.values) for _, series in obj.iteritems()) num_items = len(obj.columns) if index: - index_hash_generator = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values # noqa - for _ in [None]) + index_hash_generator = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values # noqa + for _ in [None] + ) num_items += 1 hashes = itertools.chain(hashes, index_hash_generator) h = _combine_hash_arrays(hashes, num_items) - h = Series(h, index=obj.index, dtype='uint64', copy=False) + h = Series(h, index=obj.index, dtype="uint64", copy=False) else: raise TypeError("Unexpected type for hashing %s" % type(obj)) return h -def hash_tuples(vals, encoding='utf8', hash_key=None): +def hash_tuples(vals, encoding="utf8", hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently @@ -147,17 +165,15 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals - vals = [Categorical(vals.codes[level], - vals.levels[level], - ordered=False, - fastpath=True) - for level in range(vals.nlevels)] + vals = [ + Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) + for level in range(vals.nlevels) + ] # hash the list-of-ndarrays - hashes = (_hash_categorical(cat, - encoding=encoding, - hash_key=hash_key) - for cat in vals) + hashes = ( + _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals + ) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -165,7 +181,7 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): return h -def hash_tuple(val, encoding='utf8', hash_key=None): +def hash_tuple(val, encoding="utf8", hash_key=None): """ Hash a single tuple efficiently @@ -180,8 +196,7 @@ def hash_tuple(val, encoding='utf8', hash_key=None): hash """ - hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) - for v in val) + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val) h = _combine_hash_arrays(hashes, len(val))[0] @@ -205,8 +220,7 @@ def 
_hash_categorical(c, encoding, hash_key): """ # Convert ExtensionArrays to ndarrays values = np.asarray(c.categories.values) - hashed = hash_array(values, encoding, hash_key, - categorize=False) + hashed = hash_array(values, encoding, hash_key, categorize=False) # we have uint64, as we don't directly support missing values # we don't want to use take_nd which will coerce to float @@ -219,7 +233,7 @@ def _hash_categorical(c, encoding, hash_key): if len(hashed): result = hashed.take(c.codes) else: - result = np.zeros(len(mask), dtype='uint64') + result = np.zeros(len(mask), dtype="uint64") if mask.any(): result[mask] = np.iinfo(np.uint64).max @@ -227,7 +241,7 @@ def _hash_categorical(c, encoding, hash_key): return result -def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): +def hash_array(vals, encoding="utf8", hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. @@ -250,7 +264,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): 1d uint64 numpy array of hash values, same length as the vals """ - if not hasattr(vals, 'dtype'): + if not hasattr(vals, "dtype"): raise TypeError("must pass a ndarray-like") dtype = vals.dtype @@ -274,39 +288,40 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif isinstance(dtype, np.bool): - vals = vals.astype('u8') + vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): - vals = vals.view('i8').astype('u8', copy=False) + vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: - vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') + vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8") else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. 
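The comment above describes the categorize-then-hash shortcut; as a rough illustration of the public helpers layered on this module, pandas.util exposes hash_pandas_object and hash_array (the sample frame below is made up):

    import numpy as np
    import pandas as pd
    from pandas.util import hash_array, hash_pandas_object

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # one uint64 per row; index=True (the default) mixes the index into each hash
    hash_pandas_object(df, index=True)

    # hash_array works on a bare ndarray; categorize=True uses the shortcut above
    hash_array(np.array(["x", "y", "x"], dtype=object), categorize=True)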
if categorize: from pandas import factorize, Categorical, Index + codes, categories = factorize(vals, sort=False) - cat = Categorical(codes, Index(categories), - ordered=False, fastpath=True) + cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types - vals = hashing.hash_object_array(vals.astype(str).astype(object), - hash_key, encoding) + vals = hashing.hash_object_array( + vals.astype(str).astype(object), hash_key, encoding + ) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 - vals *= np.uint64(0xbf58476d1ce4e5b9) + vals *= np.uint64(0xBF58476D1CE4E5B9) vals ^= vals >> 27 - vals *= np.uint64(0x94d049bb133111eb) + vals *= np.uint64(0x94D049BB133111EB) vals ^= vals >> 31 return vals -def _hash_scalar(val, encoding='utf8', hash_key=None): +def _hash_scalar(val, encoding="utf8", hash_key=None): """ Hash scalar value @@ -317,9 +332,9 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): if isna(val): # this is to be consistent with the _hash_categorical implementation - return np.array([np.iinfo(np.uint64).max], dtype='u8') + return np.array([np.iinfo(np.uint64).max], dtype="u8") - if getattr(val, 'tzinfo', None) is not None: + if getattr(val, "tzinfo", None) is not None: # for tz-aware datetimes, we need the underlying naive UTC value and # not the tz aware object or pd extension type (as # infer_dtype_from_scalar would do) @@ -330,5 +345,4 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): dtype, val = infer_dtype_from_scalar(val) vals = np.array([val], dtype=dtype) - return hash_array(vals, hash_key=hash_key, encoding=encoding, - categorize=False) + return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False) diff --git a/pandas/core/window.py b/pandas/core/window.py index 8f888ba510b0eb..27588249b1b3c7 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -16,11 +16,24 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - ensure_float64, is_bool, is_float_dtype, is_integer, is_integer_dtype, - is_list_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + is_bool, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex) + ABCDataFrame, + ABCDateOffset, + ABCDatetimeIndex, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com @@ -42,13 +55,29 @@ class _Window(PandasObject, SelectionMixin): - _attributes = ['window', 'min_periods', 'center', 'win_type', - 'axis', 'on', 'closed'] + _attributes = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + ] exclusions = set() # type: Set[str] - def __init__(self, obj, window=None, min_periods=None, - center=False, win_type=None, axis=0, on=None, closed=None, - **kwargs): + def __init__( + self, + obj, + window=None, + min_periods=None, + center=False, + win_type=None, + axis=0, + on=None, + closed=None, + **kwargs + ): self.__dict__.update(kwargs) self.blocks = [] @@ -77,18 +106,20 @@ def _on(self): @property def is_freq_type(self): - return self.win_type == 'freq' 
+ return self.win_type == "freq" def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if (self.min_periods is not None and - not is_integer(self.min_periods)): + if self.min_periods is not None and not is_integer(self.min_periods): raise ValueError("min_periods must be an integer") - if (self.closed is not None and - self.closed not in ['right', 'both', 'left', 'neither']): - raise ValueError("closed must be 'right', 'left', 'both' or " - "'neither'") + if self.closed is not None and self.closed not in [ + "right", + "both", + "left", + "neither", + ]: + raise ValueError("closed must be 'right', 'left', 'both' or " "'neither'") def _convert_freq(self): """ @@ -110,8 +141,7 @@ def _create_blocks(self): # filter out the on from the object if self.on is not None: if obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on]), - copy=False) + obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) blocks = obj._to_dict_of_blocks(copy=False).values() return blocks, obj, index @@ -145,8 +175,9 @@ def __getattr__(self, attr): if attr in self.obj: return self[attr] - raise AttributeError("%r object has no attribute %r" % - (type(self).__name__, attr)) + raise AttributeError( + "%r object has no attribute %r" % (type(self).__name__, attr) + ) def _dir_additions(self): return self.obj._dir_additions() @@ -163,15 +194,18 @@ def __repr__(self): Provide a nice str repr of our rolling object. """ - attrs = ("{k}={v}".format(k=k, v=getattr(self, k)) - for k in self._attributes - if getattr(self, k, None) is not None) - return "{klass} [{attrs}]".format(klass=self._window_type, - attrs=','.join(attrs)) + attrs = ( + "{k}={v}".format(k=k, v=getattr(self, k)) + for k in self._attributes + if getattr(self, k, None) is not None + ) + return "{klass} [{attrs}]".format( + klass=self._window_type, attrs=",".join(attrs) + ) def __iter__(self): - url = 'https://github.com/pandas-dev/pandas/issues/11704' - raise NotImplementedError('See issue #11704 {url}'.format(url=url)) + url = "https://github.com/pandas-dev/pandas/issues/11704" + raise NotImplementedError("See issue #11704 {url}".format(url=url)) def _get_index(self, index=None): """ @@ -191,7 +225,7 @@ def _get_index(self, index=None): def _prep_values(self, values=None, kill_inf=True): if values is None: - values = getattr(self._selected_obj, 'values', self._selected_obj) + values = getattr(self._selected_obj, "values", self._selected_obj) # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 @@ -200,17 +234,18 @@ def _prep_values(self, values=None, kill_inf=True): elif is_integer_dtype(values.dtype): values = ensure_float64(values) elif needs_i8_conversion(values.dtype): - raise NotImplementedError("ops for {action} for this " - "dtype {dtype} are not " - "implemented".format( - action=self._window_type, - dtype=values.dtype)) + raise NotImplementedError( + "ops for {action} for this " + "dtype {dtype} are not " + "implemented".format(action=self._window_type, dtype=values.dtype) + ) else: try: values = ensure_float64(values) except (ValueError, TypeError): - raise TypeError("cannot handle this type -> {0}" - "".format(values.dtype)) + raise TypeError( + "cannot handle this type -> {0}" "".format(values.dtype) + ) if kill_inf: values = values.copy() @@ -233,11 +268,14 @@ def _wrap_result(self, result, block=None, obj=None): if block is not None: if is_timedelta64_dtype(block.values.dtype): from pandas import 
to_timedelta - result = to_timedelta( - result.ravel(), unit='ns').values.reshape(result.shape) + + result = to_timedelta(result.ravel(), unit="ns").values.reshape( + result.shape + ) if result.ndim == 1: from pandas import Series + return Series(result, index, name=obj.name) return type(obj)(result, index=index, columns=block.columns) @@ -291,10 +329,10 @@ def _wrap_results(self, results, blocks, obj, exclude=None): columns = [c for c in columns if c not in exclude] if not columns: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if not len(final): - return obj.astype('float64') + return obj.astype("float64") return concat(final, axis=1).reindex(columns=columns, copy=False) def _center_window(self, result, window): @@ -302,8 +340,9 @@ def _center_window(self, result, window): Center the result in the window. """ if self.axis > result.ndim - 1: - raise ValueError("Requested axis is larger then no. of argument " - "dimensions") + raise ValueError( + "Requested axis is larger then no. of argument " "dimensions" + ) offset = _offset(window, True) if offset > 0: @@ -323,7 +362,8 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - _shared_docs['sum'] = dedent(""" + _shared_docs["sum"] = dedent( + """ Calculate %(name)s sum of given DataFrame or Series. Parameters @@ -396,9 +436,11 @@ def aggregate(self, func, *args, **kwargs): 2 6.0 14.0 3 9.0 29.0 4 12.0 50.0 - """) + """ + ) - _shared_docs['mean'] = dedent(""" + _shared_docs["mean"] = dedent( + """ Calculate the %(name)s mean of the values. Parameters @@ -440,7 +482,8 @@ def aggregate(self, func, *args, **kwargs): 2 2.0 3 3.0 dtype: float64 - """) + """ + ) class Window(_Window): @@ -605,17 +648,16 @@ def validate(self): if window <= 0: raise ValueError("window must be > 0 ") import_optional_dependency( - "scipy", - extra="Scipy is required to generate window weight." + "scipy", extra="Scipy is required to generate window weight." 
) import scipy.signal as sig if not isinstance(self.win_type, str): - raise ValueError('Invalid win_type {0}'.format(self.win_type)) + raise ValueError("Invalid win_type {0}".format(self.win_type)) if getattr(sig, self.win_type, None) is None: - raise ValueError('Invalid win_type {0}'.format(self.win_type)) + raise ValueError("Invalid win_type {0}".format(self.win_type)) else: - raise ValueError('Invalid window {0}'.format(window)) + raise ValueError("Invalid window {0}".format(window)) def _prep_window(self, **kwargs): """ @@ -631,16 +673,17 @@ def _prep_window(self, **kwargs): # the below may pop from kwargs def _validate_win_type(win_type, kwargs): - arg_map = {'kaiser': ['beta'], - 'gaussian': ['std'], - 'general_gaussian': ['power', 'width'], - 'slepian': ['width'], - 'exponential': ['tau'], - } + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } if win_type in arg_map: win_args = _pop_args(win_type, arg_map[win_type], kwargs) - if win_type == 'exponential': + if win_type == "exponential": # exponential window requires the first arg (center) # to be set to None (necessary for symmetric window) win_args.insert(0, None) @@ -650,7 +693,7 @@ def _validate_win_type(win_type, kwargs): return win_type def _pop_args(win_type, arg_names, kwargs): - msg = '%s window requires %%s' % win_type + msg = "%s window requires %%s" % win_type all_args = [] for n in arg_names: if n not in kwargs: @@ -694,7 +737,7 @@ def _apply_window(self, mean=True, **kwargs): del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -705,10 +748,12 @@ def _apply_window(self, mean=True, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return libwindow.roll_window(np.concatenate((arg, - additional_nans)) - if center else arg, window, minp, - avg=mean) + return libwindow.roll_window( + np.concatenate((arg, additional_nans)) if center else arg, + window, + minp, + avg=mean, + ) result = np.apply_along_axis(f, self.axis, values) @@ -718,14 +763,17 @@ def f(arg, *args, **kwargs): return self._wrap_results(results, block_list, obj, exclude) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.rolling.aggregate pandas.DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -755,14 +803,17 @@ def f(arg, *args, **kwargs): 7 0.906020 1.283573 0.085482 8 -0.096361 0.818139 0.472290 9 0.070889 0.134399 -0.031308 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): result, how = self._aggregate(arg, *args, **kwargs) if result is None: @@ -774,16 +825,16 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate - @Substitution(name='window') - @Appender(_shared_docs['sum']) + @Substitution(name="window") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_window_func('sum', args, kwargs) + nv.validate_window_func("sum", args, kwargs) return 
self._apply_window(mean=False, **kwargs) - @Substitution(name='window') - @Appender(_shared_docs['mean']) + @Substitution(name="window") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_window_func('mean', args, kwargs) + nv.validate_window_func("mean", args, kwargs) return self._apply_window(mean=True, **kwargs) @@ -793,8 +844,8 @@ class _GroupByMixin(GroupByMixin): """ def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop('parent', None) # noqa - groupby = kwargs.pop('groupby', None) + parent = kwargs.pop("parent", None) # noqa + groupby = kwargs.pop("groupby", None) if groupby is None: groupby, obj = obj, obj.obj self._groupby = groupby @@ -802,12 +853,13 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - count = GroupByMixin._dispatch('count') - corr = GroupByMixin._dispatch('corr', other=None, pairwise=None) - cov = GroupByMixin._dispatch('cov', other=None, pairwise=None) + count = GroupByMixin._dispatch("count") + corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) + cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) - def _apply(self, func, name=None, window=None, center=None, - check_minp=None, **kwargs): + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): """ Dispatch to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object. @@ -825,13 +877,13 @@ def f(x, name=name, *args): class _Rolling(_Window): - @property def _constructor(self): return Rolling - def _apply(self, func, name=None, window=None, center=None, - check_minp=None, **kwargs): + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): """ Rolling statistical measure using supplied function. 
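On the user side these weighted-window methods surface through rolling(..., win_type=...); a small illustrative sketch, assuming scipy is installed to supply the window weights:

    import pandas as pd

    s = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])

    # plain rolling mean over a 3-observation window
    s.rolling(window=3).mean()

    # weighted window: win_type picks a scipy.signal window; the extra window
    # parameter (here the gaussian std) is passed through to mean()/sum()
    s.rolling(window=3, win_type="gaussian").mean(std=2)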
@@ -874,7 +926,7 @@ def _apply(self, func, name=None, window=None, center=None, del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -884,15 +936,16 @@ def _apply(self, func, name=None, window=None, center=None, if isinstance(func, str): cfunc = getattr(libwindow, func, None) if cfunc is None: - raise ValueError("we do not support this function " - "in libwindow.{func}".format(func=func)) + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, window) # ensure we are only rolling on floats arg = ensure_float64(arg) - return cfunc(arg, - window, minp, indexi, closed, **kwargs) + return cfunc(arg, window, minp, indexi, closed, **kwargs) # calculation function if center: @@ -900,16 +953,21 @@ def func(arg, window, min_periods=None, closed=None): additional_nans = np.array([np.NaN] * offset) def calc(x): - return func(np.concatenate((x, additional_nans)), - window, min_periods=self.min_periods, - closed=self.closed) + return func( + np.concatenate((x, additional_nans)), + window, + min_periods=self.min_periods, + closed=self.closed, + ) + else: def calc(x): - return func(x, window, min_periods=self.min_periods, - closed=self.closed) + return func( + x, window, min_periods=self.min_periods, closed=self.closed + ) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if values.ndim > 1: result = np.apply_along_axis(calc, self.axis, values) else: @@ -925,7 +983,8 @@ def calc(x): class _Rolling_and_Expanding(_Rolling): - _shared_docs['count'] = dedent(r""" + _shared_docs["count"] = dedent( + r""" The %(name)s count of any non-NaN observations inside the window. Returns @@ -961,7 +1020,8 @@ class _Rolling_and_Expanding(_Rolling): 2 2.0 3 3.0 dtype: float64 - """) + """ + ) def count(self): @@ -975,15 +1035,20 @@ def count(self): results = [] for b in blocks: result = b.notna().astype(int) - result = self._constructor(result, window=window, min_periods=0, - center=self.center, - axis=self.axis, - closed=self.closed).sum() + result = self._constructor( + result, + window=window, + min_periods=0, + center=self.center, + axis=self.axis, + closed=self.closed, + ).sum() results.append(result) return self._wrap_results(results, blocks, obj) - _shared_docs['apply'] = dedent(r""" + _shared_docs["apply"] = dedent( + r""" The %(name)s function's apply function. Parameters @@ -1015,13 +1080,14 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. - """) + """ + ) def apply(self, func, raw=None, args=(), kwargs={}): from pandas import Series # TODO: _level is unused? - _level = kwargs.pop('_level', None) # noqa + _level = kwargs.pop("_level", None) # noqa window = self._get_window() offset = _offset(window, self.center) index, indexi = self._get_index() @@ -1034,7 +1100,10 @@ def apply(self, func, raw=None, args=(), kwargs={}): "applied function. In the future, this will change to passing " "it as Series objects. 
You need to specify 'raw=True' to keep " "the current behaviour, and you can pass 'raw=False' to " - "silence this warning", FutureWarning, stacklevel=3) + "silence this warning", + FutureWarning, + stacklevel=3, + ) raw = True def f(arg, window, min_periods, closed): @@ -1042,30 +1111,32 @@ def f(arg, window, min_periods, closed): if not raw: arg = Series(arg, index=self.obj.index) return libwindow.roll_generic( - arg, window, minp, indexi, - closed, offset, func, raw, args, kwargs) + arg, window, minp, indexi, closed, offset, func, raw, args, kwargs + ) - return self._apply(f, func, args=args, kwargs=kwargs, - center=False, raw=raw) + return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) def sum(self, *args, **kwargs): - nv.validate_window_func('sum', args, kwargs) - return self._apply('roll_sum', 'sum', **kwargs) + nv.validate_window_func("sum", args, kwargs) + return self._apply("roll_sum", "sum", **kwargs) - _shared_docs['max'] = dedent(""" + _shared_docs["max"] = dedent( + """ Calculate the %(name)s maximum. Parameters ---------- *args, **kwargs Arguments and keyword arguments to be passed into func. - """) + """ + ) def max(self, *args, **kwargs): - nv.validate_window_func('max', args, kwargs) - return self._apply('roll_max', 'max', **kwargs) + nv.validate_window_func("max", args, kwargs) + return self._apply("roll_max", "max", **kwargs) - _shared_docs['min'] = dedent(""" + _shared_docs["min"] = dedent( + """ Calculate the %(name)s minimum. Parameters @@ -1098,17 +1169,19 @@ def max(self, *args, **kwargs): 3 2.0 4 2.0 dtype: float64 - """) + """ + ) def min(self, *args, **kwargs): - nv.validate_window_func('min', args, kwargs) - return self._apply('roll_min', 'min', **kwargs) + nv.validate_window_func("min", args, kwargs) + return self._apply("roll_min", "min", **kwargs) def mean(self, *args, **kwargs): - nv.validate_window_func('mean', args, kwargs) - return self._apply('roll_mean', 'mean', **kwargs) + nv.validate_window_func("mean", args, kwargs) + return self._apply("roll_mean", "mean", **kwargs) - _shared_docs['median'] = dedent(""" + _shared_docs["median"] = dedent( + """ Calculate the %(name)s median. Parameters @@ -1141,12 +1214,14 @@ def mean(self, *args, **kwargs): 3 2.0 4 3.0 dtype: float64 - """) + """ + ) def median(self, **kwargs): - return self._apply('roll_median_c', 'median', **kwargs) + return self._apply("roll_median_c", "median", **kwargs) - _shared_docs['std'] = dedent(""" + _shared_docs["std"] = dedent( + """ Calculate %(name)s standard deviation. Normalized by N-1 by default. This can be changed using the `ddof` @@ -1202,22 +1277,26 @@ def median(self, **kwargs): 5 0.836660 6 0.786796 dtype: float64 - """) + """ + ) def std(self, ddof=1, *args, **kwargs): - nv.validate_window_func('std', args, kwargs) + nv.validate_window_func("std", args, kwargs) window = self._get_window() index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(libwindow.roll_var(arg, window, minp, indexi, - self.closed, ddof)) + return _zsqrt( + libwindow.roll_var(arg, window, minp, indexi, self.closed, ddof) + ) - return self._apply(f, 'std', check_minp=_require_min_periods(1), - ddof=ddof, **kwargs) + return self._apply( + f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + ) - _shared_docs['var'] = dedent(""" + _shared_docs["var"] = dedent( + """ Calculate unbiased %(name)s variance. Normalized by N-1 by default. 
This can be changed using the `ddof` @@ -1273,15 +1352,18 @@ def f(arg, *args, **kwargs): 5 0.700000 6 0.619048 dtype: float64 - """) + """ + ) def var(self, ddof=1, *args, **kwargs): - nv.validate_window_func('var', args, kwargs) - return self._apply('roll_var', 'var', - check_minp=_require_min_periods(1), ddof=ddof, - **kwargs) - - _shared_docs['skew'] = """ + nv.validate_window_func("var", args, kwargs) + return self._apply( + "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + ) + + _shared_docs[ + "skew" + ] = """ Unbiased %(name)s skewness. Parameters @@ -1291,10 +1373,12 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): - return self._apply('roll_skew', 'skew', - check_minp=_require_min_periods(3), **kwargs) + return self._apply( + "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs + ) - _shared_docs['kurt'] = dedent(""" + _shared_docs["kurt"] = dedent( + """ Calculate unbiased %(name)s kurtosis. This function uses Fisher's definition of kurtosis without bias. @@ -1322,13 +1406,16 @@ def skew(self, **kwargs): Notes ----- A minimum of 4 periods is required for the %(name)s calculation. - """) + """ + ) def kurt(self, **kwargs): - return self._apply('roll_kurt', 'kurt', - check_minp=_require_min_periods(4), **kwargs) + return self._apply( + "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs + ) - _shared_docs['quantile'] = dedent(""" + _shared_docs["quantile"] = dedent( + """ Calculate the %(name)s quantile. Parameters @@ -1380,29 +1467,29 @@ def kurt(self, **kwargs): 2 2.5 3 3.5 dtype: float64 - """) + """ + ) - def quantile(self, quantile, interpolation='linear', **kwargs): + def quantile(self, quantile, interpolation="linear", **kwargs): window = self._get_window() index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) if quantile == 1.0: - return libwindow.roll_max(arg, window, minp, indexi, - self.closed) + return libwindow.roll_max(arg, window, minp, indexi, self.closed) elif quantile == 0.0: - return libwindow.roll_min(arg, window, minp, indexi, - self.closed) + return libwindow.roll_min(arg, window, minp, indexi, self.closed) else: - return libwindow.roll_quantile(arg, window, minp, indexi, - self.closed, quantile, - interpolation) + return libwindow.roll_quantile( + arg, window, minp, indexi, self.closed, quantile, interpolation + ) - return self._apply(f, 'quantile', quantile=quantile, - **kwargs) + return self._apply(f, "quantile", quantile=quantile, **kwargs) - _shared_docs['cov'] = """ + _shared_docs[ + "cov" + ] = """ Calculate the %(name)s sample covariance. 
Parameters @@ -1440,19 +1527,21 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data # to avoid potential overflow, cast the data to float64 - X = X.astype('float64') - Y = Y.astype('float64') - mean = lambda x: x.rolling(window, self.min_periods, - center=self.center).mean(**kwargs) - count = (X + Y).rolling(window=window, - center=self.center).count(**kwargs) + X = X.astype("float64") + Y = Y.astype("float64") + mean = lambda x: x.rolling( + window, self.min_periods, center=self.center + ).mean(**kwargs) + count = (X + Y).rolling(window=window, center=self.center).count(**kwargs) bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_cov, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) - _shared_docs['corr'] = dedent(""" + _shared_docs["corr"] = dedent( + """ Calculate %(name)s correlation. Parameters @@ -1559,7 +1648,8 @@ def _get_cov(X, Y): Y 0.626300 1.000000 4 X 1.000000 0.555368 Y 0.555368 1.000000 - """) + """ + ) def corr(self, other=None, pairwise=None, **kwargs): if other is None: @@ -1570,60 +1660,66 @@ def corr(self, other=None, pairwise=None, **kwargs): window = self._get_window(other) def _get_corr(a, b): - a = a.rolling(window=window, min_periods=self.min_periods, - center=self.center) - b = b.rolling(window=window, min_periods=self.min_periods, - center=self.center) + a = a.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) + b = b.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_corr, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) class Rolling(_Rolling_and_Expanding): - @cache_readonly def is_datetimelike(self): - return isinstance(self._on, - (ABCDatetimeIndex, - ABCTimedeltaIndex, - ABCPeriodIndex)) + return isinstance( + self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) + ) @cache_readonly def _on(self): if self.on is None: return self.obj.index - elif (isinstance(self.obj, ABCDataFrame) and - self.on in self.obj.columns): + elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: from pandas import Index + return Index(self.obj[self.on]) else: - raise ValueError("invalid on specified as {0}, " - "must be a column (if DataFrame) " - "or None".format(self.on)) + raise ValueError( + "invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on) + ) def validate(self): super().validate() # we allow rolling on a datetimelike index - if ((self.obj.empty or self.is_datetimelike) and - isinstance(self.window, (str, ABCDateOffset, timedelta))): + if (self.obj.empty or self.is_datetimelike) and isinstance( + self.window, (str, ABCDateOffset, timedelta) + ): self._validate_monotonic() freq = self._validate_freq() # we don't allow center if self.center: - raise NotImplementedError("center is not implemented " - "for datetimelike and offset " - "based windows") + raise NotImplementedError( + "center is not implemented " + "for datetimelike and offset " + "based windows" + ) # this will raise ValueError on non-fixed freqs self.win_freq = self.window 
self.window = freq.nanos - self.win_type = 'freq' + self.win_type = "freq" # min_periods must be an integer if self.min_periods is None: @@ -1635,38 +1731,44 @@ def validate(self): raise ValueError("window must be non-negative") if not self.is_datetimelike and self.closed is not None: - raise ValueError("closed only implemented for datetimelike " - "and offset based windows") + raise ValueError( + "closed only implemented for datetimelike " "and offset based windows" + ) def _validate_monotonic(self): """ Validate on is_monotonic. """ if not self._on.is_monotonic: - formatted = self.on or 'index' - raise ValueError("{0} must be " - "monotonic".format(formatted)) + formatted = self.on or "index" + raise ValueError("{0} must be " "monotonic".format(formatted)) def _validate_freq(self): """ Validate & return window frequency. """ from pandas.tseries.frequencies import to_offset + try: return to_offset(self.window) except (TypeError, ValueError): - raise ValueError("passed window {0} is not " - "compatible with a datetimelike " - "index".format(self.window)) + raise ValueError( + "passed window {0} is not " + "compatible with a datetimelike " + "index".format(self.window) + ) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- Series.rolling DataFrame.rolling - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1709,83 +1811,87 @@ def _validate_freq(self): 7 2.718061 -1.647453 8 -0.289082 -1.647453 9 0.212668 -1.647453 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate - @Substitution(name='rolling') - @Appender(_shared_docs['count']) + @Substitution(name="rolling") + @Appender(_shared_docs["count"]) def count(self): # different impl for freq counting if self.is_freq_type: - return self._apply('roll_count', 'count') + return self._apply("roll_count", "count") return super().count() - @Substitution(name='rolling') - @Appender(_shared_docs['apply']) + @Substitution(name="rolling") + @Appender(_shared_docs["apply"]) def apply(self, func, raw=None, args=(), kwargs={}): return super().apply(func, raw=raw, args=args, kwargs=kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['sum']) + @Substitution(name="rolling") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_rolling_func('sum', args, kwargs) + nv.validate_rolling_func("sum", args, kwargs) return super().sum(*args, **kwargs) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['max']) + @Appender(_shared_docs["max"]) def max(self, *args, **kwargs): - nv.validate_rolling_func('max', args, kwargs) + nv.validate_rolling_func("max", args, kwargs) return super().max(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['min']) + @Substitution(name="rolling") + @Appender(_shared_docs["min"]) def min(self, *args, **kwargs): - nv.validate_rolling_func('min', args, kwargs) + nv.validate_rolling_func("min", args, kwargs) return super().min(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['mean']) + 
@Substitution(name="rolling") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_rolling_func('mean', args, kwargs) + nv.validate_rolling_func("mean", args, kwargs) return super().mean(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['median']) + @Substitution(name="rolling") + @Appender(_shared_docs["median"]) def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['std']) + @Substitution(name="rolling") + @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): - nv.validate_rolling_func('std', args, kwargs) + nv.validate_rolling_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['var']) + @Substitution(name="rolling") + @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): - nv.validate_rolling_func('var', args, kwargs) + nv.validate_rolling_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['skew']) + @Appender(_shared_docs["skew"]) def skew(self, **kwargs): return super().skew(**kwargs) - _agg_doc = dedent(""" + _agg_doc = dedent( + """ Examples -------- @@ -1807,28 +1913,30 @@ def skew(self, **kwargs): 3 -1.200000 4 3.999946 dtype: float64 - """) + """ + ) @Appender(_agg_doc) - @Substitution(name='rolling') - @Appender(_shared_docs['kurt']) + @Substitution(name="rolling") + @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['quantile']) - def quantile(self, quantile, interpolation='linear', **kwargs): - return super().quantile(quantile=quantile, interpolation=interpolation, - **kwargs) + @Substitution(name="rolling") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['cov']) + @Appender(_shared_docs["cov"]) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['corr']) + @Substitution(name="rolling") + @Appender(_shared_docs["corr"]) def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) @@ -1840,6 +1948,7 @@ class RollingGroupby(_GroupByMixin, Rolling): .. 
versionadded:: 0.18.1 """ + @property def _constructor(self): return Rolling @@ -1913,12 +2022,10 @@ class Expanding(_Rolling_and_Expanding): 4 7.0 """ - _attributes = ['min_periods', 'center', 'axis'] + _attributes = ["min_periods", "center", "axis"] - def __init__(self, obj, min_periods=1, center=False, axis=0, - **kwargs): - super().__init__(obj=obj, min_periods=min_periods, center=center, - axis=axis) + def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) @property def _constructor(self): @@ -1945,15 +2052,18 @@ def _get_window(self, other=None): other = self.min_periods or -1 return max(length, other) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- DataFrame.expanding.aggregate DataFrame.rolling.aggregate DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1983,79 +2093,82 @@ def _get_window(self, other=None): 7 0.680292 0.132049 0.548693 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate - @Substitution(name='expanding') - @Appender(_shared_docs['count']) + @Substitution(name="expanding") + @Appender(_shared_docs["count"]) def count(self, **kwargs): return super().count(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['apply']) + @Substitution(name="expanding") + @Appender(_shared_docs["apply"]) def apply(self, func, raw=None, args=(), kwargs={}): - return super().apply( - func, raw=raw, args=args, kwargs=kwargs) + return super().apply(func, raw=raw, args=args, kwargs=kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['sum']) + @Substitution(name="expanding") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_expanding_func('sum', args, kwargs) + nv.validate_expanding_func("sum", args, kwargs) return super().sum(*args, **kwargs) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['max']) + @Appender(_shared_docs["max"]) def max(self, *args, **kwargs): - nv.validate_expanding_func('max', args, kwargs) + nv.validate_expanding_func("max", args, kwargs) return super().max(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['min']) + @Substitution(name="expanding") + @Appender(_shared_docs["min"]) def min(self, *args, **kwargs): - nv.validate_expanding_func('min', args, kwargs) + nv.validate_expanding_func("min", args, kwargs) return super().min(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['mean']) + @Substitution(name="expanding") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_expanding_func('mean', args, kwargs) + nv.validate_expanding_func("mean", args, kwargs) return super().mean(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['median']) + @Substitution(name="expanding") + @Appender(_shared_docs["median"]) def median(self, **kwargs): return 
super().median(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['std']) + @Substitution(name="expanding") + @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func('std', args, kwargs) + nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['var']) + @Substitution(name="expanding") + @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func('var', args, kwargs) + nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['skew']) + @Appender(_shared_docs["skew"]) def skew(self, **kwargs): return super().skew(**kwargs) - _agg_doc = dedent(""" + _agg_doc = dedent( + """ Examples -------- @@ -2077,29 +2190,30 @@ def skew(self, **kwargs): 3 -1.200000 4 4.999874 dtype: float64 - """) + """ + ) @Appender(_agg_doc) - @Substitution(name='expanding') - @Appender(_shared_docs['kurt']) + @Substitution(name="expanding") + @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['quantile']) - def quantile(self, quantile, interpolation='linear', **kwargs): - return super().quantile(quantile=quantile, - interpolation=interpolation, - **kwargs) + @Substitution(name="expanding") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['cov']) + @Appender(_shared_docs["cov"]) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['corr']) + @Substitution(name="expanding") + @Appender(_shared_docs["corr"]) def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) @@ -2111,6 +2225,7 @@ class ExpandingGroupby(_GroupByMixin, Expanding): .. 
versionadded:: 0.18.1 """ + @property def _constructor(self): return Expanding @@ -2239,11 +2354,20 @@ class EWM(_Rolling): 3 1.615385 4 3.670213 """ - _attributes = ['com', 'min_periods', 'adjust', 'ignore_na', 'axis'] - - def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, - min_periods=0, adjust=True, ignore_na=False, - axis=0): + _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] + + def __init__( + self, + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): self.obj = obj self.com = _get_center_of_mass(com, span, halflife, alpha) self.min_periods = min_periods @@ -2256,13 +2380,16 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, def _constructor(self): return EWM - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.rolling.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -2292,14 +2419,17 @@ def _constructor(self): 7 0.680292 0.132049 0.548693 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) @@ -2333,7 +2463,7 @@ def _apply(self, func, **kwargs): del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -2343,18 +2473,25 @@ def _apply(self, func, **kwargs): if isinstance(func, str): cfunc = getattr(libwindow, func, None) if cfunc is None: - raise ValueError("we do not support this function " - "in libwindow.{func}".format(func=func)) + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) def func(arg): - return cfunc(arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods)) + return cfunc( + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + ) results.append(np.apply_along_axis(func, self.axis, values)) return self._wrap_results(results, block_list, obj, exclude) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) def mean(self, *args, **kwargs): """ @@ -2365,38 +2502,44 @@ def mean(self, *args, **kwargs): *args, **kwargs Arguments and keyword arguments to be passed into func. """ - nv.validate_window_func('mean', args, kwargs) - return self._apply('ewma', **kwargs) + nv.validate_window_func("mean", args, kwargs) + return self._apply("ewma", **kwargs) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_bias_template) def std(self, bias=False, *args, **kwargs): """ Exponential weighted moving stddev. """ - nv.validate_window_func('std', args, kwargs) + nv.validate_window_func("std", args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) vol = std - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_bias_template) def var(self, bias=False, *args, **kwargs): """ Exponential weighted moving variance. 
""" - nv.validate_window_func('var', args, kwargs) + nv.validate_window_func("var", args, kwargs) def f(arg): - return libwindow.ewmcov(arg, arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + return libwindow.ewmcov( + arg, + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) return self._apply(f, **kwargs) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_pairwise_template) def cov(self, other=None, pairwise=None, bias=False, **kwargs): @@ -2412,16 +2555,22 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = libwindow.ewmcov(X._prep_values(), Y._prep_values(), - self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + cov = libwindow.ewmcov( + X._prep_values(), + Y._prep_values(), + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) return X._wrap_result(cov) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_cov, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_pairwise_template) def corr(self, other=None, pairwise=None, **kwargs): @@ -2439,35 +2588,47 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return libwindow.ewmcov(x, y, self.com, int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1) + return libwindow.ewmcov( + x, + y, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1, + ) x_values = X._prep_values() y_values = Y._prep_values() - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): cov = _cov(x_values, y_values) x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) corr = cov / _zsqrt(x_var * y_var) return X._wrap_result(corr) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_corr, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) + # Helper Funcs def _flex_binary_moment(arg1, arg2, f, pairwise=False): - if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and - isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))): - raise TypeError("arguments to moment function must be of type " - "np.ndarray/Series/DataFrame") - - if (isinstance(arg1, (np.ndarray, ABCSeries)) and - isinstance(arg2, (np.ndarray, ABCSeries))): + if not ( + isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) + and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) + ): + raise TypeError( + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" + ) + + if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( + arg2, (np.ndarray, ABCSeries) + ): X, Y = _prep_binary(arg1, arg2) return f(X, Y) @@ -2495,7 +2656,7 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg2' columns are not unique") with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join='outer') + X, Y = arg1.align(arg2, join="outer") X = X + 0 * Y Y = Y + 0 * X @@ -2505,8 +2666,7 @@ def dataframe_from_int_dict(data, frame_template): for col in res_columns: if col in X and col in Y: results[col] = f(X[col], 
Y[col]) - return DataFrame(results, index=X.index, - columns=res_columns) + return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): @@ -2515,8 +2675,9 @@ def dataframe_from_int_dict(data, frame_template): # Symmetric case results[i][j] = results[j][i] else: - results[i][j] = f(*_prep_binary(arg1.iloc[:, i], - arg2.iloc[:, j])) + results[i][j] = f( + *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) from pandas import MultiIndex, concat @@ -2525,50 +2686,60 @@ def dataframe_from_int_dict(data, frame_template): # construct result frame result = concat( - [concat([results[i][j] - for j, c in enumerate(arg2.columns)], - ignore_index=True) - for i, c in enumerate(arg1.columns)], + [ + concat( + [results[i][j] for j, c in enumerate(arg2.columns)], + ignore_index=True, + ) + for i, c in enumerate(arg1.columns) + ], ignore_index=True, - axis=1) + axis=1, + ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index]) + arg2.columns.levels + [result_index] + ) result = result.reorder_levels([2, 0, 1]).sort_index() else: result.index = MultiIndex.from_product( - [range(len(arg2.columns)), - range(len(result_index))]) + [range(len(arg2.columns)), range(len(result_index))] + ) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product( - [result_index] + [arg2.columns]) + [result_index] + [arg2.columns] + ) else: # empty result result = DataFrame( - index=MultiIndex(levels=[arg1.index, arg2.columns], - codes=[[], []]), + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), columns=arg2.columns, - dtype='float64') + dtype="float64", + ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names - result.columns = result.columns.set_names( - arg1.columns.names) + result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names( - result_index.names + arg2.columns.names) + result_index.names + arg2.columns.names + ) return result else: raise ValueError("'pairwise' is not True/False") else: - results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns)} + results = { + i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns) + } return dataframe_from_int_dict(results, arg1) else: @@ -2578,8 +2749,7 @@ def dataframe_from_int_dict(data, frame_template): def _get_center_of_mass(comass, span, halflife, alpha): valid_count = com.count_not_none(comass, span, halflife, alpha) if valid_count > 1: - raise ValueError("comass, span, halflife, and alpha " - "are mutually exclusive") + raise ValueError("comass, span, halflife, and alpha " "are mutually exclusive") # Convert to center of mass; domain checks ensure 0 < alpha <= 1 if comass is not None: @@ -2588,7 +2758,7 @@ def _get_center_of_mass(comass, span, halflife, alpha): elif span is not None: if span < 1: raise ValueError("span must satisfy: span >= 1") - comass = (span - 1) / 2. + comass = (span - 1) / 2.0 elif halflife is not None: if halflife <= 0: raise ValueError("halflife must satisfy: halflife > 0") @@ -2607,7 +2777,7 @@ def _get_center_of_mass(comass, span, halflife, alpha): def _offset(window, center): if not is_integer(window): window = len(window) - offset = (window - 1) / 2. 
if center else 0 + offset = (window - 1) / 2.0 if center else 0 try: return int(offset) except TypeError: @@ -2632,7 +2802,7 @@ def _use_window(minp, window): def _zsqrt(x): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = np.sqrt(x) mask = x < 0 @@ -2648,7 +2818,7 @@ def _zsqrt(x): def _prep_binary(arg1, arg2): if not isinstance(arg2, type(arg1)): - raise Exception('Input arrays must be of the same type!') + raise Exception("Input arrays must be of the same type!") # mask out values, this also makes a common index... X = arg1 + 0 * arg2 @@ -2662,7 +2832,7 @@ def _prep_binary(arg1, arg2): def rolling(obj, win_type=None, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) if win_type is not None: return Window(obj, win_type=win_type, **kwds) @@ -2675,7 +2845,7 @@ def rolling(obj, win_type=None, **kwds): def expanding(obj, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) return Expanding(obj, **kwds) @@ -2685,7 +2855,7 @@ def expanding(obj, **kwds): def ewm(obj, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) return EWM(obj, **kwds) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 3b8904f4c1ef61..3177937ac4ba19 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -174,17 +174,18 @@ class AbstractMethodError(NotImplementedError): while keeping compatibility with Python 2 and Python 3. """ - def __init__(self, class_instance, methodtype='method'): - types = {'method', 'classmethod', 'staticmethod', 'property'} + def __init__(self, class_instance, methodtype="method"): + types = {"method", "classmethod", "staticmethod", "property"} if methodtype not in types: - msg = 'methodtype must be one of {}, got {} instead.'.format( - methodtype, types) + msg = "methodtype must be one of {}, got {} instead.".format( + methodtype, types + ) raise ValueError(msg) self.methodtype = methodtype self.class_instance = class_instance def __str__(self): - if self.methodtype == 'classmethod': + if self.methodtype == "classmethod": name = self.class_instance.__name__ else: name = self.class_instance.__class__.__name__ diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index e033d882a73f7e..caa928731fb3a9 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -23,15 +23,20 @@ This module does not work with PyGObject yet. """ -__version__ = '1.5.27' +__version__ = "1.5.27" import os import platform import subprocess from .clipboards import ( - init_klipper_clipboard, init_no_clipboard, init_osx_clipboard, - init_qt_clipboard, init_xclip_clipboard, init_xsel_clipboard) + init_klipper_clipboard, + init_no_clipboard, + init_osx_clipboard, + init_qt_clipboard, + init_xclip_clipboard, + init_xsel_clipboard, +) from .windows import init_windows_clipboard # `import qtpy` sys.exit()s if DISPLAY is not in the environment. 
@@ -42,20 +47,24 @@ def _executable_exists(name): - return subprocess.call([CHECK_CMD, name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 + return ( + subprocess.call( + [CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + == 0 + ) def determine_clipboard(): # Determine the OS/platform and set # the copy() and paste() functions accordingly. - if 'cygwin' in platform.system().lower(): + if "cygwin" in platform.system().lower(): # FIXME: pyperclip currently does not support Cygwin, # see https://github.com/asweigart/pyperclip/issues/55 pass - elif os.name == 'nt' or platform.system() == 'Windows': + elif os.name == "nt" or platform.system() == "Windows": return init_windows_clipboard() - if os.name == 'mac' or platform.system() == 'Darwin': + if os.name == "mac" or platform.system() == "Darwin": return init_osx_clipboard() if HAS_DISPLAY: # Determine which command/module is installed, if any. @@ -94,13 +103,15 @@ def determine_clipboard(): def set_clipboard(clipboard): global copy, paste - clipboard_types = {'osx': init_osx_clipboard, - 'qt': init_qt_clipboard, - 'xclip': init_xclip_clipboard, - 'xsel': init_xsel_clipboard, - 'klipper': init_klipper_clipboard, - 'windows': init_windows_clipboard, - 'no': init_no_clipboard} + clipboard_types = { + "osx": init_osx_clipboard, + "qt": init_qt_clipboard, + "xclip": init_xclip_clipboard, + "xsel": init_xsel_clipboard, + "klipper": init_klipper_clipboard, + "windows": init_windows_clipboard, + "no": init_no_clipboard, + } copy, paste = clipboard_types[clipboard]() diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index 52abdeafb5eccb..cb4ed8ed549d02 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -9,15 +9,13 @@ def init_osx_clipboard(): def copy_osx(text): - p = subprocess.Popen(['pbcopy', 'w'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode("utf-8")) def paste_osx(): - p = subprocess.Popen(['pbpaste', 'r'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_osx, paste_osx @@ -51,30 +49,34 @@ def paste_qt(): def init_xclip_clipboard(): def copy_xclip(text): - p = subprocess.Popen(['xclip', '-selection', 'c'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen( + ["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode("utf-8")) def paste_xclip(): - p = subprocess.Popen(['xclip', '-selection', 'c', '-o'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen( + ["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True + ) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_xclip, paste_xclip def init_xsel_clipboard(): def copy_xsel(text): - p = subprocess.Popen(['xsel', '-b', '-i'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen( + ["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode("utf-8")) def paste_xsel(): - p = subprocess.Popen(['xsel', '-b', '-o'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen( + 
["xsel", "-b", "-o"], stdout=subprocess.PIPE, close_fds=True + ) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_xsel, paste_xsel @@ -82,25 +84,34 @@ def paste_xsel(): def init_klipper_clipboard(): def copy_klipper(text): p = subprocess.Popen( - ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents', - text.encode('utf-8')], - stdin=subprocess.PIPE, close_fds=True) + [ + "qdbus", + "org.kde.klipper", + "/klipper", + "setClipboardContents", + text.encode("utf-8"), + ], + stdin=subprocess.PIPE, + close_fds=True, + ) p.communicate(input=None) def paste_klipper(): p = subprocess.Popen( - ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'], - stdout=subprocess.PIPE, close_fds=True) + ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"], + stdout=subprocess.PIPE, + close_fds=True, + ) stdout, stderr = p.communicate() # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 # TODO: https://github.com/asweigart/pyperclip/issues/43 - clipboardContents = stdout.decode('utf-8') + clipboardContents = stdout.decode("utf-8") # even if blank, Klipper will append a newline at the end assert len(clipboardContents) > 0 # make sure that newline is there - assert clipboardContents.endswith('\n') - if clipboardContents.endswith('\n'): + assert clipboardContents.endswith("\n") + if clipboardContents.endswith("\n"): clipboardContents = clipboardContents[:-1] return clipboardContents @@ -109,7 +120,6 @@ def paste_klipper(): def init_no_clipboard(): class ClipboardUnavailable: - def __call__(self, *args, **kwargs): raise PyperclipException(EXCEPT_MSG) diff --git a/pandas/io/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py index 6276b06b9d7fea..eaf5578b5cd1bb 100644 --- a/pandas/io/clipboard/exceptions.py +++ b/pandas/io/clipboard/exceptions.py @@ -6,7 +6,6 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): - def __init__(self, message): message += " ({err})".format(err=ctypes.WinError()) super().__init__(message) diff --git a/pandas/io/clipboard/windows.py b/pandas/io/clipboard/windows.py index 72abc729663420..2935dfdc2ae197 100644 --- a/pandas/io/clipboard/windows.py +++ b/pandas/io/clipboard/windows.py @@ -10,7 +10,6 @@ class CheckedCall: - def __init__(self, f): super().__setattr__("f", f) @@ -25,15 +24,38 @@ def __setattr__(self, key, value): def init_windows_clipboard(): - from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND, - HINSTANCE, HMENU, BOOL, UINT, HANDLE) + from ctypes.wintypes import ( + HGLOBAL, + LPVOID, + DWORD, + LPCSTR, + INT, + HWND, + HINSTANCE, + HMENU, + BOOL, + UINT, + HANDLE, + ) windll = ctypes.windll - msvcrt = ctypes.CDLL('msvcrt') + msvcrt = ctypes.CDLL("msvcrt") safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) - safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT, - INT, INT, HWND, HMENU, HINSTANCE, LPVOID] + safeCreateWindowExA.argtypes = [ + DWORD, + LPCSTR, + LPCSTR, + DWORD, + INT, + INT, + INT, + INT, + HWND, + HMENU, + HINSTANCE, + LPVOID, + ] safeCreateWindowExA.restype = HWND safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) @@ -86,8 +108,9 @@ def window(): """ # we really just need the hwnd, so setting "STATIC" # as predefined lpClass is just fine. 
- hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0, - None, None, None, None) + hwnd = safeCreateWindowExA( + 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None + ) try: yield hwnd finally: @@ -135,12 +158,14 @@ def copy_windows(text): # the object must have been allocated using the # function with the GMEM_MOVEABLE flag. count = wcslen(text) + 1 - handle = safeGlobalAlloc(GMEM_MOVEABLE, - count * sizeof(c_wchar)) + handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar)) locked_handle = safeGlobalLock(handle) - ctypes.memmove(c_wchar_p(locked_handle), c_wchar_p(text), - count * sizeof(c_wchar)) + ctypes.memmove( + c_wchar_p(locked_handle), + c_wchar_p(text), + count * sizeof(c_wchar), + ) safeGlobalUnlock(handle) safeSetClipboardData(CF_UNICODETEXT, handle) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index dc30285895dd50..0006824f09fe7a 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -7,7 +7,7 @@ from pandas import get_option, option_context -def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover +def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" Read text from clipboard and pass to read_csv. See read_csv for the full argument list @@ -22,22 +22,21 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover ------- parsed : DataFrame """ - encoding = kwargs.pop('encoding', 'utf-8') + encoding = kwargs.pop("encoding", "utf-8") # only utf-8 is valid for passed value because that's what clipboard # supports - if encoding is not None and encoding.lower().replace('-', '') != 'utf8': - raise NotImplementedError( - 'reading from clipboard only supports utf-8 encoding') + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise NotImplementedError("reading from clipboard only supports utf-8 encoding") from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv + text = clipboard_get() # Try to decode (if needed, as "text" might already be a string here). try: - text = text.decode(kwargs.get('encoding') - or get_option('display.encoding')) + text = text.decode(kwargs.get("encoding") or get_option("display.encoding")) except AttributeError: pass @@ -45,7 +44,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # inspect no more then the 10 first lines, if they # all contain an equal number (>0) of tabs, infer # that this came from excel and set 'sep' accordingly - lines = text[:10000].split('\n')[:-1][:10] + lines = text[:10000].split("\n")[:-1][:10] # Need to remove leading white space, since read_csv # accepts: @@ -53,21 +52,23 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = {x.lstrip().count('\t') for x in lines} + counts = {x.lstrip().count("\t") for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - sep = '\t' + sep = "\t" # Edge case where sep is specified to be None, return to default - if sep is None and kwargs.get('delim_whitespace') is None: - sep = r'\s+' + if sep is None and kwargs.get("delim_whitespace") is None: + sep = r"\s+" # Regex separator currently only works with python engine. 
# Default to python if separator is multi-character (regex) - if len(sep) > 1 and kwargs.get('engine') is None: - kwargs['engine'] = 'python' - elif len(sep) > 1 and kwargs.get('engine') == 'c': - warnings.warn('read_clipboard with regex separator does not work' - ' properly with c engine') + if len(sep) > 1 and kwargs.get("engine") is None: + kwargs["engine"] = "python" + elif len(sep) > 1 and kwargs.get("engine") == "c": + warnings.warn( + "read_clipboard with regex separator does not work" + " properly with c engine" + ) return read_csv(StringIO(text), sep=sep, **kwargs) @@ -95,37 +96,39 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover - Windows: - OS X: """ - encoding = kwargs.pop('encoding', 'utf-8') + encoding = kwargs.pop("encoding", "utf-8") # testing if an invalid encoding is passed to clipboard - if encoding is not None and encoding.lower().replace('-', '') != 'utf8': - raise ValueError('clipboard only supports utf-8 encoding') + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise ValueError("clipboard only supports utf-8 encoding") from pandas.io.clipboard import clipboard_set + if excel is None: excel = True if excel: try: if sep is None: - sep = '\t' + sep = "\t" buf = StringIO() # clipboard_set (pyperclip) expects unicode - obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) + obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs) text = buf.getvalue() clipboard_set(text) return except TypeError: - warnings.warn('to_clipboard in excel mode requires a single ' - 'character separator.') + warnings.warn( + "to_clipboard in excel mode requires a single " "character separator." + ) elif sep is not None: - warnings.warn('to_clipboard with excel=False ignores the sep argument') + warnings.warn("to_clipboard with excel=False ignores the sep argument") if isinstance(obj, ABCDataFrame): # str(df) has various unhelpful defaults, like truncation - with option_context('display.max_colwidth', 999999): + with option_context("display.max_colwidth", 999999): objstr = obj.to_string(**kwargs) else: objstr = str(obj) diff --git a/pandas/io/common.py b/pandas/io/common.py index 34635ebf64ad6e..9a9620e2d06633 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -12,14 +12,23 @@ import pathlib from urllib.error import URLError # noqa from urllib.parse import ( # noqa - urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, - uses_relative) + urlencode, + urljoin, + urlparse as parse_url, + uses_netloc, + uses_params, + uses_relative, +) from urllib.request import pathname2url, urlopen import zipfile from pandas.errors import ( # noqa - AbstractMethodError, DtypeWarning, EmptyDataError, ParserError, - ParserWarning) + AbstractMethodError, + DtypeWarning, + EmptyDataError, + ParserError, + ParserWarning, +) from pandas.core.dtypes.common import is_file_like @@ -29,13 +38,29 @@ # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', - '-nan', ''} +_NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "n/a", + "NA", + "#NA", + "NULL", + "null", + "NaN", + "-NaN", + "nan", + "-nan", + "", +} _VALID_URLS = set(uses_relative + uses_netloc + uses_params) -_VALID_URLS.discard('') +_VALID_URLS.discard("") class BaseIterator: @@ -88,10 +113,12 @@ def _expand_user(filepath_or_buffer): def 
_validate_header_arg(header): if isinstance(header, bool): - raise TypeError("Passing a bool to header is invalid. " - "Use header=None for no header or " - "header=int or list-like of ints to specify " - "the row(s) making up the column names") + raise TypeError( + "Passing a bool to header is invalid. " + "Use header=None for no header or " + "header=int or list-like of ints to specify " + "the row(s) making up the column names" + ) def _stringify_path(filepath_or_buffer): @@ -116,7 +143,7 @@ def _stringify_path(filepath_or_buffer): Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - if hasattr(filepath_or_buffer, '__fspath__'): + if hasattr(filepath_or_buffer, "__fspath__"): return filepath_or_buffer.__fspath__() elif isinstance(filepath_or_buffer, pathlib.Path): return str(filepath_or_buffer) @@ -126,7 +153,7 @@ def _stringify_path(filepath_or_buffer): def is_s3_url(url): """Check for an s3, s3n, or s3a url""" try: - return parse_url(url).scheme in ['s3', 's3n', 's3a'] + return parse_url(url).scheme in ["s3", "s3n", "s3a"] except Exception: return False @@ -134,13 +161,14 @@ def is_s3_url(url): def is_gcs_url(url): """Check for a gcs url""" try: - return parse_url(url).scheme in ['gcs', 'gs'] + return parse_url(url).scheme in ["gcs", "gs"] except Exception: return False -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -164,27 +192,27 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if _is_url(filepath_or_buffer): req = urlopen(filepath_or_buffer) - content_encoding = req.headers.get('Content-Encoding', None) - if content_encoding == 'gzip': + content_encoding = req.headers.get("Content-Encoding", None) + if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = 'gzip' + compression = "gzip" reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 - return s3.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression, - mode=mode) + + return s3.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) if is_gcs_url(filepath_or_buffer): from pandas.io import gcs - return gcs.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression, - mode=mode) + + return gcs.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False @@ -208,15 +236,10 @@ def file_path_to_url(path): ------- a valid FILE URL """ - return urljoin('file:', pathname2url(path)) + return urljoin("file:", pathname2url(path)) -_compression_to_extension = { - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', -} +_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} def _infer_compression(filepath_or_buffer, compression): @@ -250,7 +273,7 @@ def _infer_compression(filepath_or_buffer, compression): return None # Infer compression - if compression == 'infer': + if compression == "infer": # Convert all path types (e.g. 
pathlib.Path) to strings filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): @@ -267,14 +290,15 @@ def _infer_compression(filepath_or_buffer, compression): if compression in _compression_to_extension: return compression - msg = 'Unrecognized compression type: {}'.format(compression) - valid = ['infer', None] + sorted(_compression_to_extension) - msg += '\nValid compression types are {}'.format(valid) + msg = "Unrecognized compression type: {}".format(compression) + valid = ["infer", None] + sorted(_compression_to_extension) + msg += "\nValid compression types are {}".format(valid) raise ValueError(msg) -def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False, is_text=True): +def _get_handle( + path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True +): """ Get file handle for given path/buffer and mode. @@ -304,6 +328,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, """ try: from s3fs import S3File + need_text_wrapping = (BytesIO, S3File) except ImportError: need_text_wrapping = (BytesIO,) @@ -321,45 +346,47 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, if compression: # GZ Compression - if compression == 'gzip': + if compression == "gzip": if is_path: f = gzip.open(path_or_buf, mode) else: f = gzip.GzipFile(fileobj=path_or_buf) # BZ Compression - elif compression == 'bz2': + elif compression == "bz2": if is_path: f = bz2.BZ2File(path_or_buf, mode) else: f = bz2.BZ2File(path_or_buf) # ZIP Compression - elif compression == 'zip': + elif compression == "zip": zf = BytesZipFile(path_or_buf, mode) # Ensure the container is closed as well. handles.append(zf) - if zf.mode == 'w': + if zf.mode == "w": f = zf - elif zf.mode == 'r': + elif zf.mode == "r": zip_names = zf.namelist() if len(zip_names) == 1: f = zf.open(zip_names.pop()) elif len(zip_names) == 0: - raise ValueError('Zero files found in ZIP file {}' - .format(path_or_buf)) + raise ValueError( + "Zero files found in ZIP file {}".format(path_or_buf) + ) else: - raise ValueError('Multiple files found in ZIP file.' - ' Only one file per ZIP: {}' - .format(zip_names)) + raise ValueError( + "Multiple files found in ZIP file." 
+ " Only one file per ZIP: {}".format(zip_names) + ) # XZ Compression - elif compression == 'xz': + elif compression == "xz": f = lzma.LZMAFile(path_or_buf, mode) # Unrecognized Compression else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) handles.append(f) @@ -370,7 +397,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, f = open(path_or_buf, mode, encoding=encoding, newline="") elif is_text: # No explicit encoding - f = open(path_or_buf, mode, errors='replace', newline="") + f = open(path_or_buf, mode, errors="replace", newline="") else: # Binary mode f = open(path_or_buf, mode) @@ -379,10 +406,11 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # Convert BytesIO or file objects passed with an encoding if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding, newline='') + + f = TextIOWrapper(f, encoding=encoding, newline="") handles.append(f) - if memory_map and hasattr(f, 'fileno'): + if memory_map and hasattr(f, "fileno"): try: g = MMapWrapper(f) f.close() @@ -405,10 +433,11 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore BytesIO provides attributes of file-like object and ZipFile.writestr writes bytes strings into a member of the archive. """ + # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): - if mode in ['wb', 'rb']: - mode = mode.replace('b', '') + if mode in ["wb", "rb"]: + mode = mode.replace("b", "") super().__init__(file, mode, compression, **kwargs) def write(self, data): @@ -446,12 +475,12 @@ def __next__(self): # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newline.decode('utf-8') + newline = newline.decode("utf-8") # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned - if newline == '': + if newline == "": raise StopIteration return newline diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 75f353f28549cd..ab64bc14344f1f 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -17,17 +17,16 @@ def parse_date_fields(year_col, month_col, day_col): return parsing.try_parse_year_month_day(year_col, month_col, day_col) -def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, - second_col): +def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) hour_col = _maybe_cast(hour_col) minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) - return parsing.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, - second_col) + return parsing.try_parse_datetime_components( + year_col, month_col, day_col, hour_col, minute_col, second_col + ) def generic_parser(parse_func, *cols): @@ -57,7 +56,9 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: - raise AssertionError('All columns must have the same length: {0}; ' - 'column {1} has length {2}'.format(N, i, n)) + raise AssertionError( + "All columns must have the same length: {0}; " + "column {1} has length {2}".format(N, i, n) + ) return N diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 
d10a40541bb6c7..fae8f4203e9a0a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -11,21 +11,29 @@ from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg -from pandas.core.dtypes.common import ( - is_bool, is_float, is_integer, is_list_like) +from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame from pandas.io.common import ( - _NA_VALUES, _is_url, _stringify_path, _validate_header_arg, - get_filepath_or_buffer) + _NA_VALUES, + _is_url, + _stringify_path, + _validate_header_arg, + get_filepath_or_buffer, +) from pandas.io.excel._util import ( - _fill_mi_header, _get_default_writer, _maybe_convert_usecols, - _pop_header_name, get_writer) + _fill_mi_header, + _get_default_writer, + _maybe_convert_usecols, + _pop_header_name, + get_writer, +) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser -_read_excel_doc = """ +_read_excel_doc = ( + """ Read an Excel file into a pandas DataFrame. Support both `xls` and `xlsx` file extensions from a local filesystem or URL. @@ -124,8 +132,9 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join( - sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. + as NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. @@ -251,47 +260,53 @@ 1 string2 2.0 2 None NaN """ +) @Appender(_read_excel_doc) @deprecate_kwarg("skip_footer", "skipfooter") -def read_excel(io, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - keep_default_na=True, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skip_footer=0, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): - - for arg in ('sheet', 'sheetname', 'parse_cols'): +def read_excel( + io, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + keep_default_na=True, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skip_footer=0, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds +): + + for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError("read_excel() got an unexpected keyword argument " - "`{}`".format(arg)) + raise TypeError( + "read_excel() got an unexpected keyword argument " "`{}`".format(arg) + ) if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) elif engine and engine != io.engine: - raise ValueError("Engine should not be specified when passing " - "an ExcelFile - ExcelFile already has the engine set") + raise ValueError( + "Engine should not be specified when passing " + "an ExcelFile - ExcelFile already has the engine set" + ) return io.parse( sheet_name=sheet_name, @@ -316,19 +331,17 @@ def read_excel(io, skipfooter=skipfooter, 
convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds) + **kwds + ) class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if _is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) - elif not isinstance(filepath_or_buffer, - (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer) + elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -339,8 +352,9 @@ def __init__(self, filepath_or_buffer): elif isinstance(filepath_or_buffer, str): self.book = self.load_workbook(filepath_or_buffer) else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') + raise ValueError( + "Must explicitly set engine if not passing in" " buffer or path for io." + ) @property @abc.abstractmethod @@ -368,28 +382,30 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): pass - def parse(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds + ): _validate_header_arg(header) @@ -439,8 +455,7 @@ def parse(self, if is_integer(skiprows): row += skiprows - data[row], control_row = _fill_mi_header(data[row], - control_row) + data[row], control_row = _fill_mi_header(data[row], control_row) if index_col is not None: header_name, _ = _pop_header_name(data[row], index_col) @@ -460,7 +475,7 @@ def parse(self, last = data[offset][col] for row in range(offset + 1, len(data)): - if data[row][col] == '' or data[row][col] is None: + if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] @@ -469,33 +484,36 @@ def parse(self, # GH 12292 : error when read one empty column from excel file try: - parser = TextParser(data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - squeeze=squeeze, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - mangle_dupe_cols=mangle_dupe_cols, - **kwds) + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + squeeze=squeeze, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + 
mangle_dupe_cols=mangle_dupe_cols, + **kwds + ) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ - asheetname].columns.set_names(header_names) + asheetname + ].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame @@ -570,6 +588,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: ... df.to_excel(writer, sheet_name='Sheet3') """ + # Defining an ExcelWriter implementation (see abstract methods for more...) # - Mandatory @@ -595,21 +614,18 @@ def __new__(cls, path, engine=None, **kwargs): # only switch class if generic(ExcelWriter) if cls is ExcelWriter: - if engine is None or (isinstance(engine, str) and - engine == 'auto'): + if engine is None or (isinstance(engine, str) and engine == "auto"): if isinstance(path, str): ext = os.path.splitext(path)[-1][1:] else: - ext = 'xlsx' + ext = "xlsx" try: - engine = config.get_option('io.excel.{ext}.writer' - .format(ext=ext)) - if engine == 'auto': + engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'" - .format(ext=ext)) + raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) cls = get_writer(engine) return object.__new__(cls) @@ -632,8 +648,9 @@ def engine(self): pass @abc.abstractmethod - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): """ Write given formatted cells into Excel an excel sheet @@ -657,14 +674,20 @@ def save(self): """ pass - def __init__(self, path, engine=None, - date_format=None, datetime_format=None, mode='w', - **engine_kwargs): + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs + ): # validate that this engine can handle the extension if isinstance(path, str): ext = os.path.splitext(path)[-1] else: - ext = 'xls' if engine == 'xlwt' else 'xlsx' + ext = "xls" if engine == "xlwt" else "xlsx" self.check_extension(ext) @@ -673,11 +696,11 @@ def __init__(self, path, engine=None, self.cur_sheet = None if date_format is None: - self.date_format = 'YYYY-MM-DD' + self.date_format = "YYYY-MM-DD" else: self.date_format = date_format if datetime_format is None: - self.datetime_format = 'YYYY-MM-DD HH:MM:SS' + self.datetime_format = "YYYY-MM-DD HH:MM:SS" else: self.datetime_format = datetime_format @@ -690,8 +713,9 @@ def _get_sheet_name(self, sheet_name): if sheet_name is None: sheet_name = self.cur_sheet if sheet_name is None: # pragma: no cover - raise ValueError('Must pass explicit sheet_name or set ' - 'cur_sheet property') + raise ValueError( + "Must pass explicit sheet_name or set " "cur_sheet property" + ) return sheet_name def _value_with_fmt(self, val): @@ -721,7 +745,7 @@ def _value_with_fmt(self, val): fmt = self.date_format elif isinstance(val, timedelta): val = val.total_seconds() / float(86400) - fmt = '0' + fmt = "0" else: val = str(val) @@ -731,12 +755,12 @@ def _value_with_fmt(self, val): def check_extension(cls, ext): """checks that path's extension against the Writer's supported extensions. 
If it isn't supported, raises UnsupportedFiletypeError.""" - if ext.startswith('.'): + if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = ("Invalid extension for engine '{engine}': '{ext}'" - .format(engine=pprint_thing(cls.engine), - ext=pprint_thing(ext))) + msg = "Invalid extension for engine '{engine}': '{ext}'".format( + engine=pprint_thing(cls.engine), ext=pprint_thing(ext) + ) raise ValueError(msg) else: return True @@ -772,15 +796,11 @@ class ExcelFile: from pandas.io.excel._openpyxl import _OpenpyxlReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { - 'xlrd': _XlrdReader, - 'openpyxl': _OpenpyxlReader, - 'odf': _ODFReader, - } + _engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader} def __init__(self, io, engine=None): if engine is None: - engine = 'xlrd' + engine = "xlrd" if engine not in self._engines: raise ValueError("Unknown engine: {engine}".format(engine=engine)) @@ -795,27 +815,29 @@ def __init__(self, io, engine=None): def __fspath__(self): return self._io - def parse(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds + ): """ Parse specified sheet(s) into a DataFrame @@ -827,30 +849,33 @@ def parse(self, DataFrame or dict of DataFrames DataFrame from the passed in Excel file. 
""" - if 'chunksize' in kwds: - raise NotImplementedError("chunksize keyword of read_excel " - "is not implemented") - - return self._reader.parse(sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - **kwds) + if "chunksize" in kwds: + raise NotImplementedError( + "chunksize keyword of read_excel " "is not implemented" + ) + + return self._reader.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds + ) @property def book(self): @@ -862,7 +887,7 @@ def sheet_names(self): def close(self): """close io if necessary""" - if hasattr(self.io, 'close'): + if hasattr(self.io, "close"): self.io.close() def __enter__(self): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c820c1497c3c92..3be36663bac796 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -16,6 +16,7 @@ class _ODFReader(_BaseExcelReader): filepath_or_buffer: string, path to be parsed or an open readable stream. """ + def __init__(self, filepath_or_buffer: FilePathOrBuffer): import_optional_dependency("odf") super().__init__(filepath_or_buffer) @@ -23,16 +24,18 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer): @property def _workbook_class(self): from odf.opendocument import OpenDocument + return OpenDocument def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from odf.opendocument import load + return load(filepath_or_buffer) @property def empty_value(self) -> str: """Property for compat with other readers.""" - return '' + return "" @property def sheet_names(self) -> List[str]: @@ -44,6 +47,7 @@ def sheet_names(self) -> List[str]: def get_sheet_by_index(self, index: int): from odf.table import Table + tables = self.book.getElementsByType(Table) return tables[index] @@ -74,8 +78,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: table = [] # type: List[List[Scalar]] for i, sheet_row in enumerate(sheet_rows): - sheet_cells = [x for x in sheet_row.childNodes - if x.qname in cell_names] + sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 table_row = [] # type: List[Scalar] @@ -122,12 +125,12 @@ def _get_row_repeat(self, row) -> int: """ from odf.namespaces import TABLENS - return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1)) + return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1)) def _get_column_repeat(self, cell) -> int: from odf.namespaces import TABLENS - return int(cell.attributes.get( - (TABLENS, 'number-columns-repeated'), 1)) + + return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) def _is_empty_row(self, row) -> bool: """Helper function to find empty rows @@ -140,18 +143,19 @@ def _is_empty_row(self, row) -> bool: def _get_cell_value(self, 
cell, convert_float: bool) -> Scalar: from odf.namespaces import OFFICENS - cell_type = cell.attributes.get((OFFICENS, 'value-type')) - if cell_type == 'boolean': + + cell_type = cell.attributes.get((OFFICENS, "value-type")) + if cell_type == "boolean": if str(cell) == "TRUE": return True return False if cell_type is None: return self.empty_value - elif cell_type == 'float': + elif cell_type == "float": # GH5394 - cell_value = float(cell.attributes.get((OFFICENS, 'value'))) + cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if cell_value == 0. and str(cell) != cell_value: # NA handling + if cell_value == 0.0 and str(cell) != cell_value: # NA handling return str(cell) if convert_float: @@ -159,18 +163,18 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: if val == cell_value: return val return cell_value - elif cell_type == 'percentage': - cell_value = cell.attributes.get((OFFICENS, 'value')) + elif cell_type == "percentage": + cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) - elif cell_type == 'string': + elif cell_type == "string": return str(cell) - elif cell_type == 'currency': - cell_value = cell.attributes.get((OFFICENS, 'value')) + elif cell_type == "currency": + cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) - elif cell_type == 'date': - cell_value = cell.attributes.get((OFFICENS, 'date-value')) + elif cell_type == "date": + cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.to_datetime(cell_value) - elif cell_type == 'time': + elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError('Unrecognized type {}'.format(cell_type)) + raise ValueError("Unrecognized type {}".format(cell_type)) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7b1e203bd33ad1..d8f5da5ab5bc68 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -11,17 +11,18 @@ class _OpenpyxlWriter(ExcelWriter): - engine = 'openpyxl' - supported_extensions = ('.xlsx', '.xlsm') + engine = "openpyxl" + supported_extensions = (".xlsx", ".xlsm") - def __init__(self, path, engine=None, mode='w', **engine_kwargs): + def __init__(self, path, engine=None, mode="w", **engine_kwargs): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook super().__init__(path, mode=mode, **engine_kwargs) - if self.mode == 'a': # Load from existing workbook + if self.mode == "a": # Load from existing workbook from openpyxl import load_workbook + book = load_workbook(self.path) self.book = book else: @@ -52,12 +53,16 @@ def _convert_to_style(cls, style_dict): """ from openpyxl.style import Style + xls_style = Style() for key, value in style_dict.items(): for nk, nv in value.items(): if key == "borders": - (xls_style.borders.__getattribute__(nk) - .__setattr__('border_style', nv)) + ( + xls_style.borders.__getattribute__(nk).__setattr__( + "border_style", nv + ) + ) else: xls_style.__getattribute__(key).__setattr__(nk, nv) @@ -86,16 +91,13 @@ def _convert_to_style_kwargs(cls, style_dict): appropriate class. 
""" - _style_key_map = { - 'borders': 'border', - } + _style_key_map = {"borders": "border"} style_kwargs = {} for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k), - lambda x: None) + _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v @@ -160,19 +162,19 @@ def _convert_to_font(cls, font_dict): from openpyxl.styles import Font _font_key_map = { - 'sz': 'size', - 'b': 'bold', - 'i': 'italic', - 'u': 'underline', - 'strike': 'strikethrough', - 'vertalign': 'vertAlign', + "sz": "size", + "b": "bold", + "i": "italic", + "u": "underline", + "strike": "strikethrough", + "vertalign": "vertAlign", } font_kwargs = {} for k, v in font_dict.items(): if k in _font_key_map: k = _font_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) font_kwargs[k] = v @@ -222,17 +224,15 @@ def _convert_to_fill(cls, fill_dict): from openpyxl.styles import PatternFill, GradientFill _pattern_fill_key_map = { - 'patternType': 'fill_type', - 'patterntype': 'fill_type', - 'fgColor': 'start_color', - 'fgcolor': 'start_color', - 'bgColor': 'end_color', - 'bgcolor': 'end_color', + "patternType": "fill_type", + "patterntype": "fill_type", + "fgColor": "start_color", + "fgcolor": "start_color", + "bgColor": "end_color", + "bgcolor": "end_color", } - _gradient_fill_key_map = { - 'fill_type': 'type', - } + _gradient_fill_key_map = {"fill_type": "type"} pfill_kwargs = {} gfill_kwargs = {} @@ -242,9 +242,9 @@ def _convert_to_fill(cls, fill_dict): pk = _pattern_fill_key_map[k] if k in _gradient_fill_key_map: gk = _gradient_fill_key_map[k] - if pk in ['start_color', 'end_color']: + if pk in ["start_color", "end_color"]: v = cls._convert_to_color(v) - if gk == 'stop': + if gk == "stop": v = cls._convert_to_stop(v) if pk: pfill_kwargs[pk] = v @@ -277,9 +277,7 @@ def _convert_to_side(cls, side_spec): from openpyxl.styles import Side - _side_key_map = { - 'border_style': 'style', - } + _side_key_map = {"border_style": "style"} if isinstance(side_spec, str): return Side(style=side_spec) @@ -288,7 +286,7 @@ def _convert_to_side(cls, side_spec): for k, v in side_spec.items(): if k in _side_key_map: k = _side_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) side_kwargs[k] = v @@ -320,18 +318,15 @@ def _convert_to_border(cls, border_dict): from openpyxl.styles import Border - _border_key_map = { - 'diagonalup': 'diagonalUp', - 'diagonaldown': 'diagonalDown', - } + _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"} border_kwargs = {} for k, v in border_dict.items(): if k in _border_key_map: k = _border_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) - if k in ['left', 'right', 'top', 'bottom', 'diagonal']: + if k in ["left", "right", "top", "bottom", "diagonal"]: v = cls._convert_to_side(v) border_kwargs[k] = v @@ -374,7 +369,7 @@ def _convert_to_number_format(cls, number_format_dict): ------- number_format : str """ - return number_format_dict['format_code'] + return number_format_dict["format_code"] @classmethod def _convert_to_protection(cls, protection_dict): @@ -394,8 +389,9 @@ def _convert_to_protection(cls, protection_dict): return Protection(**protection_dict) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame 
cells using openpyxl. sheet_name = self._get_sheet_name(sheet_name) @@ -409,13 +405,13 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, self.sheets[sheet_name] = wks if _validate_freeze_panes(freeze_panes): - wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1, - column=freeze_panes[1] + 1) + wks.freeze_panes = wks.cell( + row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 + ) for cell in cells: xcell = wks.cell( - row=startrow + cell.row + 1, - column=startcol + cell.col + 1 + row=startrow + cell.row + 1, column=startcol + cell.col + 1 ) xcell.value, fmt = self._value_with_fmt(cell.val) if fmt: @@ -439,7 +435,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, start_row=startrow + cell.row + 1, start_column=startcol + cell.col + 1, end_column=startcol + cell.mergeend + 1, - end_row=startrow + cell.mergestart + 1 + end_row=startrow + cell.mergestart + 1, ) # When cells are merged only the top-left cell is preserved @@ -462,7 +458,6 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: """Reader using openpyxl engine. @@ -477,12 +472,15 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: @property def _workbook_class(self): from openpyxl import Workbook + return Workbook def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from openpyxl import load_workbook - return load_workbook(filepath_or_buffer, - read_only=True, data_only=True, keep_links=False) + + return load_workbook( + filepath_or_buffer, read_only=True, data_only=True, keep_links=False + ) @property def sheet_names(self) -> List[str]: @@ -499,13 +497,13 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: # TODO: replace with openpyxl constants if cell.is_date: return cell.value - elif cell.data_type == 'e': + elif cell.data_type == "e": return np.nan - elif cell.data_type == 'b': + elif cell.data_type == "b": return bool(cell.value) elif cell.value is None: - return '' # compat with xlrd - elif cell.data_type == 'n': + return "" # compat with xlrd + elif cell.data_type == "n": # GH5394 if convert_float: val = int(cell.value) @@ -519,7 +517,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: data = [] # type: List[List[Scalar]] for row in sheet.rows: - data.append( - [self._convert_cell(cell, convert_float) for cell in row]) + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 286efea9f120e9..2ba3842d5c0c93 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -37,12 +37,12 @@ def _get_default_writer(ext): str The default engine for the extension. 
""" - _default_writers = {'xlsx': 'openpyxl', 'xlsm': 'openpyxl', 'xls': 'xlwt'} - xlsxwriter = import_optional_dependency("xlsxwriter", - raise_on_missing=False, - on_version="warn") + _default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"} + xlsxwriter = import_optional_dependency( + "xlsxwriter", raise_on_missing=False, on_version="warn" + ) if xlsxwriter: - _default_writers['xlsx'] = 'xlsxwriter' + _default_writers["xlsx"] = "xlsxwriter" return _default_writers[ext] @@ -50,8 +50,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'" - .format(engine=engine_name)) + raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) def _excel2num(x): @@ -137,10 +136,15 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): - warnings.warn(("Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of int from " - "0 to `usecols` inclusive instead."), - FutureWarning, stacklevel=2) + warnings.warn( + ( + "Passing in an integer for `usecols` has been " + "deprecated. Please pass in a list of int from " + "0 to `usecols` inclusive instead." + ), + FutureWarning, + stacklevel=2, + ) return list(range(usecols + 1)) if isinstance(usecols, str): @@ -151,14 +155,15 @@ def _maybe_convert_usecols(usecols): def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: - if ( - len(freeze_panes) == 2 and - all(isinstance(item, int) for item in freeze_panes) + if len(freeze_panes) == 2 and all( + isinstance(item, int) for item in freeze_panes ): return True - raise ValueError("freeze_panes must be of form (row, column)" - " where row and column are integers") + raise ValueError( + "freeze_panes must be of form (row, column)" + " where row and column are integers" + ) # freeze_panes wasn't specified, return False so it won't be applied # to output sheet @@ -168,7 +173,7 @@ def _validate_freeze_panes(freeze_panes): def _trim_excel_header(row): # trim header row so auto-index inference works # xlrd uses '' , openpyxl None - while len(row) > 0 and (row[0] == '' or row[0] is None): + while len(row) > 0 and (row[0] == "" or row[0] is None): row = row[1:] return row @@ -195,7 +200,7 @@ def _fill_mi_header(row, control_row): if not control_row[i]: last = row[i] - if row[i] == '' or row[i] is None: + if row[i] == "" or row[i] is None: row[i] = last else: control_row[i] = False @@ -228,4 +233,4 @@ def _pop_header_name(row, index_col): header_name = row[i] header_name = None if header_name == "" else header_name - return header_name, row[:i] + [''] + row[i + 1:] + return header_name, row[:i] + [""] + row[i + 1 :] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index fcc432dc7a5add..be1b78eeb146ec 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -8,7 +8,6 @@ class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): """Reader using xlrd engine. 
@@ -24,10 +23,12 @@ def __init__(self, filepath_or_buffer): @property def _workbook_class(self): from xlrd import Book + return Book def load_workbook(self, filepath_or_buffer): from xlrd import open_workbook + if hasattr(filepath_or_buffer, "read"): data = filepath_or_buffer.read() return open_workbook(file_contents=data) @@ -45,9 +46,13 @@ def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) def get_sheet_data(self, sheet, convert_float): - from xlrd import (xldate, XL_CELL_DATE, - XL_CELL_ERROR, XL_CELL_BOOLEAN, - XL_CELL_NUMBER) + from xlrd import ( + xldate, + XL_CELL_DATE, + XL_CELL_ERROR, + XL_CELL_BOOLEAN, + XL_CELL_NUMBER, + ) epoch1904 = self.book.datemode @@ -59,8 +64,7 @@ def _parse_cell(cell_contents, cell_typ): # Use the newer xlrd datetime handling. try: - cell_contents = xldate.xldate_as_datetime( - cell_contents, epoch1904) + cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904) except OverflowError: return cell_contents @@ -68,12 +72,15 @@ def _parse_cell(cell_contents, cell_typ): # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) or - (epoch1904 and year == (1904, 1, 1))): - cell_contents = time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) + if (not epoch1904 and year == (1899, 12, 31)) or ( + epoch1904 and year == (1904, 1, 1) + ): + cell_contents = time( + cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond, + ) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan @@ -90,9 +97,10 @@ def _parse_cell(cell_contents, cell_typ): data = [] for i in range(sheet.nrows): - row = [_parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), - sheet.row_types(i))] + row = [ + _parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) + ] data.append(row) return data diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 2ddfcf3de5a8f1..07bf265da4863b 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -9,75 +9,69 @@ class _XlsxStyler: # Ordering necessary for both determinism and because some are keyed by # prefixes of others. 
STYLE_MAPPING = { - 'font': [ - (('name',), 'font_name'), - (('sz',), 'font_size'), - (('size',), 'font_size'), - (('color', 'rgb',), 'font_color'), - (('color',), 'font_color'), - (('b',), 'bold'), - (('bold',), 'bold'), - (('i',), 'italic'), - (('italic',), 'italic'), - (('u',), 'underline'), - (('underline',), 'underline'), - (('strike',), 'font_strikeout'), - (('vertAlign',), 'font_script'), - (('vertalign',), 'font_script'), + "font": [ + (("name",), "font_name"), + (("sz",), "font_size"), + (("size",), "font_size"), + (("color", "rgb"), "font_color"), + (("color",), "font_color"), + (("b",), "bold"), + (("bold",), "bold"), + (("i",), "italic"), + (("italic",), "italic"), + (("u",), "underline"), + (("underline",), "underline"), + (("strike",), "font_strikeout"), + (("vertAlign",), "font_script"), + (("vertalign",), "font_script"), ], - 'number_format': [ - (('format_code',), 'num_format'), - ((), 'num_format',), + "number_format": [(("format_code",), "num_format"), ((), "num_format")], + "protection": [(("locked",), "locked"), (("hidden",), "hidden")], + "alignment": [ + (("horizontal",), "align"), + (("vertical",), "valign"), + (("text_rotation",), "rotation"), + (("wrap_text",), "text_wrap"), + (("indent",), "indent"), + (("shrink_to_fit",), "shrink"), ], - 'protection': [ - (('locked',), 'locked'), - (('hidden',), 'hidden'), + "fill": [ + (("patternType",), "pattern"), + (("patterntype",), "pattern"), + (("fill_type",), "pattern"), + (("start_color", "rgb"), "fg_color"), + (("fgColor", "rgb"), "fg_color"), + (("fgcolor", "rgb"), "fg_color"), + (("start_color",), "fg_color"), + (("fgColor",), "fg_color"), + (("fgcolor",), "fg_color"), + (("end_color", "rgb"), "bg_color"), + (("bgColor", "rgb"), "bg_color"), + (("bgcolor", "rgb"), "bg_color"), + (("end_color",), "bg_color"), + (("bgColor",), "bg_color"), + (("bgcolor",), "bg_color"), ], - 'alignment': [ - (('horizontal',), 'align'), - (('vertical',), 'valign'), - (('text_rotation',), 'rotation'), - (('wrap_text',), 'text_wrap'), - (('indent',), 'indent'), - (('shrink_to_fit',), 'shrink'), - ], - 'fill': [ - (('patternType',), 'pattern'), - (('patterntype',), 'pattern'), - (('fill_type',), 'pattern'), - (('start_color', 'rgb',), 'fg_color'), - (('fgColor', 'rgb',), 'fg_color'), - (('fgcolor', 'rgb',), 'fg_color'), - (('start_color',), 'fg_color'), - (('fgColor',), 'fg_color'), - (('fgcolor',), 'fg_color'), - (('end_color', 'rgb',), 'bg_color'), - (('bgColor', 'rgb',), 'bg_color'), - (('bgcolor', 'rgb',), 'bg_color'), - (('end_color',), 'bg_color'), - (('bgColor',), 'bg_color'), - (('bgcolor',), 'bg_color'), - ], - 'border': [ - (('color', 'rgb',), 'border_color'), - (('color',), 'border_color'), - (('style',), 'border'), - (('top', 'color', 'rgb',), 'top_color'), - (('top', 'color',), 'top_color'), - (('top', 'style',), 'top'), - (('top',), 'top'), - (('right', 'color', 'rgb',), 'right_color'), - (('right', 'color',), 'right_color'), - (('right', 'style',), 'right'), - (('right',), 'right'), - (('bottom', 'color', 'rgb',), 'bottom_color'), - (('bottom', 'color',), 'bottom_color'), - (('bottom', 'style',), 'bottom'), - (('bottom',), 'bottom'), - (('left', 'color', 'rgb',), 'left_color'), - (('left', 'color',), 'left_color'), - (('left', 'style',), 'left'), - (('left',), 'left'), + "border": [ + (("color", "rgb"), "border_color"), + (("color",), "border_color"), + (("style",), "border"), + (("top", "color", "rgb"), "top_color"), + (("top", "color"), "top_color"), + (("top", "style"), "top"), + (("top",), "top"), + (("right", "color", 
"rgb"), "right_color"), + (("right", "color"), "right_color"), + (("right", "style"), "right"), + (("right",), "right"), + (("bottom", "color", "rgb"), "bottom_color"), + (("bottom", "color"), "bottom_color"), + (("bottom", "style"), "bottom"), + (("bottom",), "bottom"), + (("left", "color", "rgb"), "left_color"), + (("left", "color"), "left_color"), + (("left", "style"), "left"), + (("left",), "left"), ], } @@ -96,14 +90,14 @@ def convert(cls, style_dict, num_format_str=None): props = {} if num_format_str is not None: - props['num_format'] = num_format_str + props["num_format"] = num_format_str if style_dict is None: return props - if 'borders' in style_dict: + if "borders" in style_dict: style_dict = style_dict.copy() - style_dict['border'] = style_dict.pop('borders') + style_dict["border"] = style_dict.pop("borders") for style_group_key, style_group in style_dict.items(): for src, dst in cls.STYLE_MAPPING.get(style_group_key, []): @@ -120,51 +114,76 @@ def convert(cls, style_dict, num_format_str=None): else: props[dst] = v - if isinstance(props.get('pattern'), str): + if isinstance(props.get("pattern"), str): # TODO: support other fill patterns - props['pattern'] = 0 if props['pattern'] == 'none' else 1 + props["pattern"] = 0 if props["pattern"] == "none" else 1 - for k in ['border', 'top', 'right', 'bottom', 'left']: + for k in ["border", "top", "right", "bottom", "left"]: if isinstance(props.get(k), str): try: - props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted', - 'thick', 'double', 'hair', 'mediumDashed', - 'dashDot', 'mediumDashDot', 'dashDotDot', - 'mediumDashDotDot', - 'slantDashDot'].index(props[k]) + props[k] = [ + "none", + "thin", + "medium", + "dashed", + "dotted", + "thick", + "double", + "hair", + "mediumDashed", + "dashDot", + "mediumDashDot", + "dashDotDot", + "mediumDashDotDot", + "slantDashDot", + ].index(props[k]) except ValueError: props[k] = 2 - if isinstance(props.get('font_script'), str): - props['font_script'] = ['baseline', 'superscript', - 'subscript'].index(props['font_script']) + if isinstance(props.get("font_script"), str): + props["font_script"] = ["baseline", "superscript", "subscript"].index( + props["font_script"] + ) - if isinstance(props.get('underline'), str): - props['underline'] = {'none': 0, 'single': 1, 'double': 2, - 'singleAccounting': 33, - 'doubleAccounting': 34}[props['underline']] + if isinstance(props.get("underline"), str): + props["underline"] = { + "none": 0, + "single": 1, + "double": 2, + "singleAccounting": 33, + "doubleAccounting": 34, + }[props["underline"]] return props class _XlsxWriter(ExcelWriter): - engine = 'xlsxwriter' - supported_extensions = ('.xlsx',) - - def __init__(self, path, engine=None, - date_format=None, datetime_format=None, mode='w', - **engine_kwargs): + engine = "xlsxwriter" + supported_extensions = (".xlsx",) + + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs + ): # Use the xlsxwriter module as the Excel writer. 
import xlsxwriter - if mode == 'a': - raise ValueError('Append mode is not supported with xlsxwriter!') + if mode == "a": + raise ValueError("Append mode is not supported with xlsxwriter!") - super().__init__(path, engine=engine, - date_format=date_format, - datetime_format=datetime_format, - mode=mode, - **engine_kwargs) + super().__init__( + path, + engine=engine, + date_format=date_format, + datetime_format=datetime_format, + mode=mode, + **engine_kwargs + ) self.book = xlsxwriter.Workbook(path, **engine_kwargs) @@ -175,8 +194,9 @@ def save(self): return self.book.close() - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame cells using xlsxwriter. sheet_name = self._get_sheet_name(sheet_name) @@ -186,7 +206,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, wks = self.book.add_worksheet(sheet_name) self.sheets[sheet_name] = wks - style_dict = {'null': None} + style_dict = {"null": None} if _validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) @@ -201,17 +221,17 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, if stylekey in style_dict: style = style_dict[stylekey] else: - style = self.book.add_format( - _XlsxStyler.convert(cell.style, fmt)) + style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt)) style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: - wks.merge_range(startrow + cell.row, - startcol + cell.col, - startrow + cell.mergestart, - startcol + cell.mergeend, - val, style) + wks.merge_range( + startrow + cell.row, + startcol + cell.col, + startrow + cell.mergestart, + startcol + cell.mergeend, + val, + style, + ) else: - wks.write(startrow + cell.row, - startcol + cell.col, - val, style) + wks.write(startrow + cell.row, startcol + cell.col, val, style) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 62a57b99fe5563..fe3d0a208de6a4 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -5,22 +5,22 @@ class _XlwtWriter(ExcelWriter): - engine = 'xlwt' - supported_extensions = ('.xls',) + engine = "xlwt" + supported_extensions = (".xls",) - def __init__(self, path, engine=None, encoding=None, mode='w', - **engine_kwargs): + def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): # Use the xlwt module as the Excel writer. import xlwt - engine_kwargs['engine'] = engine - if mode == 'a': - raise ValueError('Append mode is not supported with xlwt!') + engine_kwargs["engine"] = engine + + if mode == "a": + raise ValueError("Append mode is not supported with xlwt!") super().__init__(path, mode=mode, **engine_kwargs) if encoding is None: - encoding = 'ascii' + encoding = "ascii" self.book = xlwt.Workbook(encoding=encoding) self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format) self.fm_date = xlwt.easyxf(num_format_str=self.date_format) @@ -31,8 +31,9 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame cells using xlwt. 
sheet_name = self._get_sheet_name(sheet_name) @@ -64,19 +65,19 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: - wks.write_merge(startrow + cell.row, - startrow + cell.mergestart, - startcol + cell.col, - startcol + cell.mergeend, - val, style) + wks.write_merge( + startrow + cell.row, + startrow + cell.mergestart, + startcol + cell.col, + startcol + cell.mergeend, + val, + style, + ) else: - wks.write(startrow + cell.row, - startcol + cell.col, - val, style) + wks.write(startrow + cell.row, startcol + cell.col, val, style) @classmethod - def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', - line_sep=';'): + def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): """helper which recursively generate an xlwt easy style string for example: @@ -91,17 +92,19 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', border: top thin, right thin, bottom thin, left thin; \ align: horiz center; """ - if hasattr(item, 'items'): + if hasattr(item, "items"): if firstlevel: - it = ["{key}: {val}" - .format(key=key, val=cls._style_to_xlwt(value, False)) - for key, value in item.items()] + it = [ + "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + for key, value in item.items() + ] out = "{sep} ".format(sep=(line_sep).join(it)) return out else: - it = ["{key} {val}" - .format(key=key, val=cls._style_to_xlwt(value, False)) - for key, value in item.items()] + it = [ + "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + for key, value in item.items() + ] out = "{sep} ".format(sep=(field_sep).join(it)) return out else: @@ -123,7 +126,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None): if style_dict: xlwt_stylestr = cls._style_to_xlwt(style_dict) - style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';') + style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";") else: style = xlwt.XFStyle() if num_format_str is not None: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 93252f3a09ceb5..05608f69c0d9da 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -28,7 +28,7 @@ def to_feather(df, path): if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") - valid_types = {'string', 'unicode'} + valid_types = {"string", "unicode"} # validate index # -------------- @@ -37,20 +37,24 @@ def to_feather(df, path): # raise on anything else as we don't serialize the index if not isinstance(df.index, Int64Index): - raise ValueError("feather does not support serializing {} " - "for the index; you can .reset_index()" - "to make the index into column(s)".format( - type(df.index))) + raise ValueError( + "feather does not support serializing {} " + "for the index; you can .reset_index()" + "to make the index into column(s)".format(type(df.index)) + ) if not df.index.equals(RangeIndex.from_range(range(len(df)))): - raise ValueError("feather does not support serializing a " - "non-default index for the index; you " - "can .reset_index() to make the index " - "into column(s)") + raise ValueError( + "feather does not support serializing a " + "non-default index for the index; you " + "can .reset_index() to make the index " + "into column(s)" + ) if df.index.name is not None: - raise ValueError("feather does not serialize index meta-data on a " - "default index") + raise ValueError( + "feather does not serialize index meta-data 
on a " "default index" + ) # validate columns # ---------------- @@ -62,7 +66,7 @@ def to_feather(df, path): feather.write_feather(df, path) -@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads') +@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path @@ -95,12 +99,10 @@ def read_feather(path, columns=None, use_threads=True): path = _stringify_path(path) - if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'): + if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"): int_use_threads = int(use_threads) if int_use_threads < 1: int_use_threads = 1 - return feather.read_feather(path, columns=columns, - nthreads=int_use_threads) + return feather.read_feather(path, columns=columns, nthreads=int_use_threads) - return feather.read_feather(path, columns=columns, - use_threads=bool(use_threads)) + return feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index 19c822e5dc270c..7f8f2fbea23528 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -12,9 +12,9 @@ def get_console_size(): """ from pandas import get_option - display_width = get_option('display.width') + display_width = get_option("display.width") # deprecated. - display_height = get_option('display.max_rows') + display_height = get_option("display.max_rows") # Consider # interactive shell terminal, can detect term size @@ -31,8 +31,9 @@ def get_console_size(): # sane defaults for interactive non-shell terminal # match default for width,height in config_init from pandas._config.config import get_default_val - terminal_width = get_default_val('display.width') - terminal_height = get_default_val('display.max_rows') + + terminal_width = get_default_val("display.width") + terminal_height = get_default_val("display.max_rows") else: # pure terminal terminal_width, terminal_height = get_terminal_size() @@ -48,6 +49,7 @@ def get_console_size(): # ---------------------------------------------------------------------- # Detect our environment + def in_interactive_session(): """ check if we're running in an interactive shell @@ -59,9 +61,8 @@ def check_main(): try: import __main__ as main except ModuleNotFoundError: - return get_option('mode.sim_interactive') - return (not hasattr(main, '__file__') or - get_option('mode.sim_interactive')) + return get_option("mode.sim_interactive") + return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: return __IPYTHON__ or check_main() # noqa @@ -75,7 +76,7 @@ def in_ipython_frontend(): """ try: ip = get_ipython() # noqa - return 'zmq' in str(type(ip)).lower() + return "zmq" in str(type(ip)).lower() except NameError: pass diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 2527e45650ea3a..92fe87cddb35b4 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -7,6 +7,7 @@ class CSSWarning(UserWarning): """This CSS syntax cannot currently be parsed""" + pass @@ -63,9 +64,9 @@ def __call__(self, declarations_str, inherited=None): props[prop] = val for prop, val in list(props.items()): - if val == 'inherit': - val = inherited.get(prop, 'initial') - if val == 'initial': + if val == "inherit": + val = inherited.get(prop, "initial") + if val == "initial": val = None if val is None: @@ -75,90 +76,94 @@ def __call__(self, declarations_str, inherited=None): props[prop] = val # 2. 
resolve relative font size - if props.get('font-size'): - if 'font-size' in inherited: - em_pt = inherited['font-size'] - assert em_pt[-2:] == 'pt' + if props.get("font-size"): + if "font-size" in inherited: + em_pt = inherited["font-size"] + assert em_pt[-2:] == "pt" em_pt = float(em_pt[:-2]) else: em_pt = None - props['font-size'] = self.size_to_pt( - props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS) + props["font-size"] = self.size_to_pt( + props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + ) - font_size = float(props['font-size'][:-2]) + font_size = float(props["font-size"][:-2]) else: font_size = None # 3. TODO: resolve other font-relative units for side in self.SIDES: - prop = 'border-{side}-width'.format(side=side) + prop = "border-{side}-width".format(side=side) if prop in props: props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, - conversions=self.BORDER_WIDTH_RATIOS) - for prop in ['margin-{side}'.format(side=side), - 'padding-{side}'.format(side=side)]: + props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + ) + for prop in [ + "margin-{side}".format(side=side), + "padding-{side}".format(side=side), + ]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, - conversions=self.MARGIN_RATIOS) + props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + ) return props UNIT_RATIOS = { - 'rem': ('pt', 12), - 'ex': ('em', .5), + "rem": ("pt", 12), + "ex": ("em", 0.5), # 'ch': - 'px': ('pt', .75), - 'pc': ('pt', 12), - 'in': ('pt', 72), - 'cm': ('in', 1 / 2.54), - 'mm': ('in', 1 / 25.4), - 'q': ('mm', .25), - '!!default': ('em', 0), + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), } FONT_SIZE_RATIOS = UNIT_RATIOS.copy() - FONT_SIZE_RATIOS.update({ - '%': ('em', .01), - 'xx-small': ('rem', .5), - 'x-small': ('rem', .625), - 'small': ('rem', .8), - 'medium': ('rem', 1), - 'large': ('rem', 1.125), - 'x-large': ('rem', 1.5), - 'xx-large': ('rem', 2), - 'smaller': ('em', 1 / 1.2), - 'larger': ('em', 1.2), - '!!default': ('em', 1), - }) + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) MARGIN_RATIOS = UNIT_RATIOS.copy() - MARGIN_RATIOS.update({ - 'none': ('pt', 0), - }) + MARGIN_RATIOS.update({"none": ("pt", 0)}) BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() - BORDER_WIDTH_RATIOS.update({ - 'none': ('pt', 0), - 'thick': ('px', 4), - 'medium': ('px', 2), - 'thin': ('px', 1), - # Default: medium only if solid - }) + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): - warnings.warn('Unhandled size: {val!r}'.format(val=in_val), - CSSWarning) - return self.size_to_pt('1!!default', conversions=conversions) + warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning) + return self.size_to_pt("1!!default", conversions=conversions) try: - val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups() + val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() except AttributeError: return _error() - if val == '': + if 
val == "": # hack for 'large' etc. val = 1 else: @@ -167,13 +172,13 @@ def _error(): except ValueError: return _error() - while unit != 'pt': - if unit == 'em': + while unit != "pt": + if unit == "em": if em_pt is None: - unit = 'rem' + unit = "rem" else: val *= em_pt - unit = 'pt' + unit = "pt" continue try: @@ -184,14 +189,14 @@ def _error(): val = round(val, 5) if int(val) == val: - size_fmt = '{fmt:d}pt'.format(fmt=int(val)) + size_fmt = "{fmt:d}pt".format(fmt=int(val)) else: - size_fmt = '{fmt:f}pt'.format(fmt=val) + size_fmt = "{fmt:f}pt".format(fmt=val) return size_fmt def atomize(self, declarations): for prop, value in declarations: - attr = 'expand_' + prop.replace('-', '_') + attr = "expand_" + prop.replace("-", "_") try: expand = getattr(self, attr) except AttributeError: @@ -206,7 +211,7 @@ def atomize(self, declarations): 3: [0, 1, 2, 1], 4: [0, 1, 2, 3], } - SIDES = ('top', 'right', 'bottom', 'left') + SIDES = ("top", "right", "bottom", "left") def _side_expander(prop_fmt): def expand(self, prop, value): @@ -214,34 +219,39 @@ def expand(self, prop, value): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn('Could not expand "{prop}: {val}"' - .format(prop=prop, val=value), CSSWarning) + warnings.warn( + 'Could not expand "{prop}: {val}"'.format(prop=prop, val=value), + CSSWarning, + ) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] return expand - expand_border_color = _side_expander('border-{:s}-color') - expand_border_style = _side_expander('border-{:s}-style') - expand_border_width = _side_expander('border-{:s}-width') - expand_margin = _side_expander('margin-{:s}') - expand_padding = _side_expander('padding-{:s}') + expand_border_color = _side_expander("border-{:s}-color") + expand_border_style = _side_expander("border-{:s}-style") + expand_border_width = _side_expander("border-{:s}-width") + expand_margin = _side_expander("margin-{:s}") + expand_padding = _side_expander("padding-{:s}") def parse(self, declarations_str): """Generates (prop, value) pairs from declarations In a future version may generate parsed tokens from tinycss/tinycss2 """ - for decl in declarations_str.split(';'): + for decl in declarations_str.split(";"): if not decl.strip(): continue - prop, sep, val = decl.partition(':') + prop, sep, val = decl.partition(":") prop = prop.strip().lower() # TODO: don't lowercase case sensitive parts of values (strings) val = val.strip().lower() if sep: yield prop, val else: - warnings.warn('Ill-formatted attribute: expected a colon ' - 'in {decl!r}'.format(decl=decl), CSSWarning) + warnings.warn( + "Ill-formatted attribute: expected a colon " + "in {decl!r}".format(decl=decl), + CSSWarning, + ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e1d95862ec872c..d86bf432b83c48 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -13,22 +13,45 @@ from pandas._libs import writers as libwriters from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex) + ABCDatetimeIndex, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, +) from pandas.core.dtypes.missing import notna from pandas.io.common import ( - UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer) + UnicodeWriter, + _get_handle, + _infer_compression, + get_filepath_or_buffer, +) class CSVFormatter: - - def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', - float_format=None, cols=None, header=True, index=True, - 
index_label=None, mode='w', encoding=None, - compression='infer', quoting=None, line_terminator='\n', - chunksize=None, quotechar='"', - date_format=None, doublequote=True, escapechar=None, - decimal='.'): + def __init__( + self, + obj, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + cols=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + line_terminator="\n", + chunksize=None, + quotechar='"', + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + ): self.obj = obj @@ -48,7 +71,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.index_label = index_label self.mode = mode if encoding is None: - encoding = 'utf-8' + encoding = "utf-8" self.encoding = encoding self.compression = _infer_compression(self.path_or_buf, compression) @@ -73,15 +96,18 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', # validate mi options if self.has_mi_columns: if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the " - "columns") + raise TypeError( + "cannot specify cols with a MultiIndex on the " "columns" + ) if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting) + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -90,10 +116,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting) + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) else: cols = list(cols) @@ -110,13 +138,17 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.chunksize = int(chunksize) self.data_index = obj.index - if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and - date_format is not None): + if ( + isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and date_format is not None + ): from pandas import Index - self.data_index = Index([x.strftime(date_format) if notna(x) else - '' for x in self.data_index]) - self.nlevels = getattr(self.data_index, 'nlevels', 1) + self.data_index = Index( + [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + ) + + self.nlevels = getattr(self.data_index, "nlevels", 1) if not index: self.nlevels = 0 @@ -125,15 +157,14 @@ def save(self): Create the writer & save """ # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, 'write'): - msg = ("compression has no effect when passing file-like " - "object as input.") + if self.compression and hasattr(self.path_or_buf, "write"): + msg = "compression has no effect when passing file-like " "object as input." warnings.warn(msg, RuntimeWarning, stacklevel=2) # when zip compression is called. is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, 'write') - and self.compression == 'zip') + not hasattr(self.path_or_buf, "write") and self.compression == "zip" + ) if is_zip: # zipfile doesn't support writing string to archive. 
uses string @@ -141,25 +172,31 @@ def save(self): # file handle. GH21241, GH21118 f = StringIO() close = False - elif hasattr(self.path_or_buf, 'write'): + elif hasattr(self.path_or_buf, "write"): f = self.path_or_buf close = False else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=self.compression, + ) close = True try: - writer_kwargs = dict(lineterminator=self.line_terminator, - delimiter=self.sep, quoting=self.quoting, - doublequote=self.doublequote, - escapechar=self.escapechar, - quotechar=self.quotechar) - if self.encoding == 'ascii': + writer_kwargs = dict( + lineterminator=self.line_terminator, + delimiter=self.sep, + quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, + quotechar=self.quotechar, + ) + if self.encoding == "ascii": self.writer = csvlib.writer(f, **writer_kwargs) else: - writer_kwargs['encoding'] = self.encoding + writer_kwargs["encoding"] = self.encoding self.writer = UnicodeWriter(f, **writer_kwargs) self._save() @@ -168,12 +205,15 @@ def save(self): if is_zip: # GH17778 handles zip compression separately. buf = f.getvalue() - if hasattr(self.path_or_buf, 'write'): + if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=self.compression, + ) f.write(buf) close = True if close: @@ -191,15 +231,17 @@ def _save_header(self): header = self.header encoded_labels = [] - has_aliases = isinstance(header, (tuple, list, np.ndarray, - ABCIndexClass)) + has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) if not (has_aliases or self.header): return if has_aliases: if len(header) != len(cols): - raise ValueError(('Writing {ncols} cols but got {nalias} ' - 'aliases'.format(ncols=len(cols), - nalias=len(header)))) + raise ValueError( + ( + "Writing {ncols} cols but got {nalias} " + "aliases".format(ncols=len(cols), nalias=len(header)) + ) + ) else: write_cols = header else: @@ -213,16 +255,17 @@ def _save_header(self): index_label = [] for i, name in enumerate(obj.index.names): if name is None: - name = '' + name = "" index_label.append(name) else: index_label = obj.index.name if index_label is None: - index_label = [''] + index_label = [""] else: index_label = [index_label] - elif not isinstance(index_label, - (list, tuple, np.ndarray, ABCIndexClass)): + elif not isinstance( + index_label, (list, tuple, np.ndarray, ABCIndexClass) + ): # given a string for a DF with Index index_label = [index_label] @@ -249,7 +292,7 @@ def _save_header(self): col_line.append(columns.names[i]) if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([''] * (len(index_label) - 1)) + col_line.extend([""] * (len(index_label) - 1)) col_line.extend(columns._get_level_values(i)) @@ -258,8 +301,8 @@ def _save_header(self): # Write out the index line if it's not empty. # Otherwise, we will print out an extraneous # blank line between the mi and the data rows. 
- if encoded_labels and set(encoded_labels) != {''}: - encoded_labels.extend([''] * len(columns)) + if encoded_labels and set(encoded_labels) != {""}: + encoded_labels.extend([""] * len(columns)) writer.writerow(encoded_labels) def _save(self): @@ -288,21 +331,26 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i, end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) + d = b.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) + ix = data_index.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) - libwriters.write_csv_rows(self.data, ix, self.nlevels, - self.cols, self.writer) + libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 66a00bf9ab0540..012d2d93582412 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -21,11 +21,10 @@ class ExcelCell: - __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend') + __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") __slots__ = __fields__ - def __init__(self, row, col, val, style=None, mergestart=None, - mergeend=None): + def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None): self.row = row self.col = col self.val = val @@ -50,6 +49,7 @@ class CSSToExcelConverter: CSS declarations understood to be the containing scope for the CSS processed by :meth:`__call__`. """ + # NB: Most of the methods here could be classmethods, as only __init__ # and __call__ make use of instance attributes. 
We leave them as # instancemethods so that users can easily experiment with extensions @@ -84,11 +84,11 @@ def __call__(self, declarations_str): def build_xlstyle(self, props): out = { - 'alignment': self.build_alignment(props), - 'border': self.build_border(props), - 'fill': self.build_fill(props), - 'font': self.build_font(props), - 'number_format': self.build_number_format(props), + "alignment": self.build_alignment(props), + "border": self.build_border(props), + "fill": self.build_fill(props), + "font": self.build_font(props), + "number_format": self.build_number_format(props), } # TODO: handle cell width and height: needs support in pandas.io.excel @@ -106,33 +106,40 @@ def remove_none(d): return out VERTICAL_MAP = { - 'top': 'top', - 'text-top': 'top', - 'middle': 'center', - 'baseline': 'bottom', - 'bottom': 'bottom', - 'text-bottom': 'bottom', + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", # OpenXML also has 'justify', 'distributed' } def build_alignment(self, props): # TODO: text-indent, padding-left -> alignment.indent - return {'horizontal': props.get('text-align'), - 'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')), - 'wrap_text': (None if props.get('white-space') is None else - props['white-space'] not in - ('nowrap', 'pre', 'pre-line')) - } + return { + "horizontal": props.get("text-align"), + "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), + "wrap_text": ( + None + if props.get("white-space") is None + else props["white-space"] not in ("nowrap", "pre", "pre-line") + ), + } def build_border(self, props): - return {side: { - 'style': self._border_style(props.get('border-{side}-style' - .format(side=side)), - props.get('border-{side}-width' - .format(side=side))), - 'color': self.color_to_excel( - props.get('border-{side}-color'.format(side=side))), - } for side in ['top', 'right', 'bottom', 'left']} + return { + side: { + "style": self._border_style( + props.get("border-{side}-style".format(side=side)), + props.get("border-{side}-width".format(side=side)), + ), + "color": self.color_to_excel( + props.get("border-{side}-color".format(side=side)) + ), + } + for side in ["top", "right", "bottom", "left"] + } def _border_style(self, style, width): # convert styles and widths to openxml, one of: @@ -151,61 +158,70 @@ def _border_style(self, style, width): # 'thin' if width is None and style is None: return None - if style == 'none' or style == 'hidden': + if style == "none" or style == "hidden": return None if width is None: - width = '2pt' + width = "2pt" width = float(width[:-2]) if width < 1e-5: return None elif width < 1.3: - width_name = 'thin' + width_name = "thin" elif width < 2.8: - width_name = 'medium' + width_name = "medium" else: - width_name = 'thick' + width_name = "thick" - if style in (None, 'groove', 'ridge', 'inset', 'outset'): + if style in (None, "groove", "ridge", "inset", "outset"): # not handled - style = 'solid' + style = "solid" - if style == 'double': - return 'double' - if style == 'solid': + if style == "double": + return "double" + if style == "solid": return width_name - if style == 'dotted': - if width_name in ('hair', 'thin'): - return 'dotted' - return 'mediumDashDotDot' - if style == 'dashed': - if width_name in ('hair', 'thin'): - return 'dashed' - return 'mediumDashed' + if style == "dotted": + if width_name in ("hair", "thin"): + return "dotted" + return "mediumDashDotDot" + if style == "dashed": + if width_name in ("hair", "thin"): 
+ return "dashed" + return "mediumDashed" def build_fill(self, props): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type - fill_color = props.get('background-color') - if fill_color not in (None, 'transparent', 'none'): - return { - 'fgColor': self.color_to_excel(fill_color), - 'patternType': 'solid', - } - - BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True, - '800': True, '900': True, - 'normal': False, 'lighter': False, '100': False, '200': False, - '300': False, '400': False, '500': False} - ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True} + fill_color = props.get("background-color") + if fill_color not in (None, "transparent", "none"): + return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} def build_font(self, props): - size = props.get('font-size') + size = props.get("font-size") if size is not None: - assert size.endswith('pt') + assert size.endswith("pt") size = float(size[:-2]) - font_names_tmp = re.findall(r'''(?x) + font_names_tmp = re.findall( + r"""(?x) ( "(?:[^"]|\\")+" | @@ -213,13 +229,15 @@ def build_font(self, props): | [^'",]+ )(?=,|\s*$) - ''', props.get('font-family', '')) + """, + props.get("font-family", ""), + ) font_names = [] for name in font_names_tmp: if name[:1] == '"': name = name[1:-1].replace('\\"', '"') - elif name[:1] == '\'': - name = name[1:-1].replace('\\\'', '\'') + elif name[:1] == "'": + name = name[1:-1].replace("\\'", "'") else: name = name.strip() if name: @@ -227,40 +245,40 @@ def build_font(self, props): family = None for name in font_names: - if name == 'serif': + if name == "serif": family = 1 # roman break - elif name == 'sans-serif': + elif name == "sans-serif": family = 2 # swiss break - elif name == 'cursive': + elif name == "cursive": family = 4 # script break - elif name == 'fantasy': + elif name == "fantasy": family = 5 # decorative break - decoration = props.get('text-decoration') + decoration = props.get("text-decoration") if decoration is not None: decoration = decoration.split() else: decoration = () return { - 'name': font_names[0] if font_names else None, - 'family': family, - 'size': size, - 'bold': self.BOLD_MAP.get(props.get('font-weight')), - 'italic': self.ITALIC_MAP.get(props.get('font-style')), - 'underline': ('single' if - 'underline' in decoration - else None), - 'strike': ('line-through' in decoration) or None, - 'color': self.color_to_excel(props.get('color')), + "name": font_names[0] if font_names else None, + "family": family, + "size": size, + "bold": self.BOLD_MAP.get(props.get("font-weight")), + "italic": self.ITALIC_MAP.get(props.get("font-style")), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), # shadow if nonzero digit before shadow color - 'shadow': (bool(re.search('^[^#(]*[1-9]', - props['text-shadow'])) - if 'text-shadow' in props else None), + "shadow": ( + bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + if "text-shadow" in props + else None + ), # 'vertAlign':, # 'charset': , # 'scheme': , @@ -269,43 +287,42 @@ def build_font(self, props): } NAMED_COLORS = { - 'maroon': '800000', - 
'brown': 'A52A2A', - 'red': 'FF0000', - 'pink': 'FFC0CB', - 'orange': 'FFA500', - 'yellow': 'FFFF00', - 'olive': '808000', - 'green': '008000', - 'purple': '800080', - 'fuchsia': 'FF00FF', - 'lime': '00FF00', - 'teal': '008080', - 'aqua': '00FFFF', - 'blue': '0000FF', - 'navy': '000080', - 'black': '000000', - 'gray': '808080', - 'grey': '808080', - 'silver': 'C0C0C0', - 'white': 'FFFFFF', + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", } def color_to_excel(self, val): if val is None: return None - if val.startswith('#') and len(val) == 7: + if val.startswith("#") and len(val) == 7: return val[1:].upper() - if val.startswith('#') and len(val) == 4: + if val.startswith("#") and len(val) == 4: return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() try: return self.NAMED_COLORS[val] except KeyError: - warnings.warn('Unhandled color format: {val!r}'.format(val=val), - CSSWarning) + warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning) def build_number_format(self, props): - return {'format_code': props.get('number-format')} + return {"format_code": props.get("number-format")} class ExcelFormatter: @@ -341,15 +358,25 @@ class ExcelFormatter: This is only called for body cells. """ - max_rows = 2**20 - max_cols = 2**14 - - def __init__(self, df, na_rep='', float_format=None, cols=None, - header=True, index=True, index_label=None, merge_cells=False, - inf_rep='inf', style_converter=None): + max_rows = 2 ** 20 + max_cols = 2 ** 14 + + def __init__( + self, + df, + na_rep="", + float_format=None, + cols=None, + header=True, + index=True, + index_label=None, + merge_cells=False, + inf_rep="inf", + style_converter=None, + ): self.rowcounter = 0 self.na_rep = na_rep - if hasattr(df, 'render'): + if hasattr(df, "render"): self.styler = df df = df.data if style_converter is None: @@ -362,8 +389,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, # all missing, raise if not len(Index(cols) & df.columns): - raise KeyError( - "passes columns are not ALL present dataframe") + raise KeyError("passes columns are not ALL present dataframe") # deprecatedin gh-17295 # 1 missing is ok (for now) @@ -371,7 +397,8 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, warnings.warn( "Not all names specified in 'columns' are found; " "this will raise a KeyError in the future", - FutureWarning) + FutureWarning, + ) self.df = df.reindex(columns=cols) self.columns = self.df.columns @@ -384,13 +411,16 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, @property def header_style(self): - return {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", - "vertical": "top"}} + return { + "font": {"bold": True}, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } def _format_value(self, val): if is_scalar(val) and missing.isna(val): @@ -399,30 +429,35 @@ def _format_value(self, val): if missing.isposinf_scalar(val): val = self.inf_rep elif missing.isneginf_scalar(val): - val = 
'-{inf}'.format(inf=self.inf_rep) + val = "-{inf}".format(inf=self.inf_rep) elif self.float_format is not None: val = float(self.float_format % val) - if getattr(val, 'tzinfo', None) is not None: - raise ValueError('Excel does not support datetimes with ' - 'timezones. Please ensure that datetimes ' - 'are timezone unaware before writing to Excel.') + if getattr(val, "tzinfo", None) is not None: + raise ValueError( + "Excel does not support datetimes with " + "timezones. Please ensure that datetimes " + "are timezone unaware before writing to Excel." + ) return val def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: - raise NotImplementedError("Writing to Excel with MultiIndex" - " columns and no index " - "('index'=False) is not yet " - "implemented.") + raise NotImplementedError( + "Writing to Excel with MultiIndex" + " columns and no index " + "('index'=False) is not yet " + "implemented." + ) has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if not (has_aliases or self.header): return columns = self.columns - level_strs = columns.format(sparsify=self.merge_cells, adjoin=False, - names=False) + level_strs = columns.format( + sparsify=self.merge_cells, adjoin=False, names=False + ) level_lengths = get_level_lengths(level_strs) coloffset = 0 lnum = 0 @@ -436,17 +471,24 @@ def _format_header_mi(self): name = columns.names[lnum] yield ExcelCell(lnum, coloffset, name, self.header_style) - for lnum, (spans, levels, level_codes) in enumerate(zip( - level_lengths, columns.levels, columns.codes)): + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): values = levels.take(level_codes) for i in spans: if spans[i] > 1: - yield ExcelCell(lnum, coloffset + i + 1, values[i], - self.header_style, lnum, - coloffset + i + spans[i]) + yield ExcelCell( + lnum, + coloffset + i + 1, + values[i], + self.header_style, + lnum, + coloffset + i + spans[i], + ) else: - yield ExcelCell(lnum, coloffset + i + 1, values[i], - self.header_style) + yield ExcelCell( + lnum, coloffset + i + 1, values[i], self.header_style + ) else: # Format in legacy format with dots to indicate levels. 
for i, values in enumerate(zip(*level_strs)): @@ -468,15 +510,17 @@ def _format_header_regular(self): colnames = self.columns if has_aliases: if len(self.header) != len(self.columns): - raise ValueError('Writing {cols} cols but got {alias} ' - 'aliases'.format(cols=len(self.columns), - alias=len(self.header))) + raise ValueError( + "Writing {cols} cols but got {alias} " + "aliases".format(cols=len(self.columns), alias=len(self.header)) + ) else: colnames = self.header for colindex, colname in enumerate(colnames): - yield ExcelCell(self.rowcounter, colindex + coloffset, colname, - self.header_style) + yield ExcelCell( + self.rowcounter, colindex + coloffset, colname, self.header_style + ) def _format_header(self): if isinstance(self.columns, ABCMultiIndex): @@ -486,12 +530,14 @@ def _format_header(self): gen2 = () if self.df.index.names: - row = [x if x is not None else '' - for x in self.df.index.names] + [''] * len(self.columns) - if reduce(lambda x, y: x and y, map(lambda x: x != '', row)): - gen2 = (ExcelCell(self.rowcounter, colindex, val, - self.header_style) - for colindex, val in enumerate(row)) + row = [x if x is not None else "" for x in self.df.index.names] + [ + "" + ] * len(self.columns) + if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): + gen2 = ( + ExcelCell(self.rowcounter, colindex, val, self.header_style) + for colindex, val in enumerate(row) + ) self.rowcounter += 1 return itertools.chain(gen, gen2) @@ -511,9 +557,9 @@ def _format_regular_rows(self): if self.index: # check aliases # if list only take first as this is not a MultiIndex - if (self.index_label and - isinstance(self.index_label, (list, tuple, np.ndarray, - Index))): + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): index_label = self.index_label[0] # if string good to go elif self.index_label and isinstance(self.index_label, str): @@ -525,8 +571,7 @@ def _format_regular_rows(self): self.rowcounter += 1 if index_label and self.header is not False: - yield ExcelCell(self.rowcounter - 1, 0, index_label, - self.header_style) + yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style) # write index_values index_values = self.df.index @@ -534,8 +579,7 @@ def _format_regular_rows(self): index_values = self.df.index.to_timestamp() for idx, idxval in enumerate(index_values): - yield ExcelCell(self.rowcounter + idx, 0, idxval, - self.header_style) + yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style) coloffset = 1 else: @@ -554,9 +598,9 @@ def _format_hierarchical_rows(self): if self.index: index_labels = self.df.index.names # check for aliases - if (self.index_label and - isinstance(self.index_label, (list, tuple, np.ndarray, - Index))): + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): index_labels = self.index_label # MultiIndex columns require an extra row @@ -570,40 +614,52 @@ def _format_hierarchical_rows(self): if com._any_not_none(*index_labels) and self.header is not False: for cidx, name in enumerate(index_labels): - yield ExcelCell(self.rowcounter - 1, cidx, name, - self.header_style) + yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style) if self.merge_cells: # Format hierarchical rows as merged cells. 
- level_strs = self.df.index.format(sparsify=True, adjoin=False, - names=False) + level_strs = self.df.index.format( + sparsify=True, adjoin=False, names=False + ) level_lengths = get_level_lengths(level_strs) - for spans, levels, level_codes in zip(level_lengths, - self.df.index.levels, - self.df.index.codes): + for spans, levels, level_codes in zip( + level_lengths, self.df.index.levels, self.df.index.codes + ): - values = levels.take(level_codes, - allow_fill=levels._can_hold_na, - fill_value=True) + values = levels.take( + level_codes, allow_fill=levels._can_hold_na, fill_value=True + ) for i in spans: if spans[i] > 1: - yield ExcelCell(self.rowcounter + i, gcolidx, - values[i], self.header_style, - self.rowcounter + i + spans[i] - 1, - gcolidx) + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx, + ) else: - yield ExcelCell(self.rowcounter + i, gcolidx, - values[i], self.header_style) + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + ) gcolidx += 1 else: # Format hierarchical rows with non-merged values. for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): - yield ExcelCell(self.rowcounter + idx, gcolidx, - indexcolval, self.header_style) + yield ExcelCell( + self.rowcounter + idx, + gcolidx, + indexcolval, + self.header_style, + ) gcolidx += 1 for cell in self._generate_body(gcolidx): @@ -623,18 +679,23 @@ def _generate_body(self, coloffset): series = self.df.iloc[:, colidx] for i, val in enumerate(series): if styles is not None: - xlstyle = self.style_converter(';'.join(styles[i, colidx])) - yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, - xlstyle) + xlstyle = self.style_converter(";".join(styles[i, colidx])) + yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) def get_formatted_cells(self): - for cell in itertools.chain(self._format_header(), - self._format_body()): + for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell - def write(self, writer, sheet_name='Sheet1', startrow=0, - startcol=0, freeze_panes=None, engine=None): + def write( + self, + writer, + sheet_name="Sheet1", + startrow=0, + startcol=0, + freeze_panes=None, + engine=None, + ): """ writer : string or ExcelWriter object File path or existing ExcelWriter @@ -657,10 +718,11 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, num_rows, num_cols = self.df.shape if num_rows > self.max_rows or num_cols > self.max_cols: - raise ValueError("This sheet is too large! Your sheet size is: " + - "{}, {} ".format(num_rows, num_cols) + - "Max sheet size is: {}, {}". - format(self.max_rows, self.max_cols)) + raise ValueError( + "This sheet is too large! 
Your sheet size is: " + + "{}, {} ".format(num_rows, num_cols) + + "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols) + ) if isinstance(writer, ExcelWriter): need_save = False @@ -669,8 +731,12 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, need_save = True formatted_cells = self.get_formatted_cells() - writer.write_cells(formatted_cells, sheet_name, - startrow=startrow, startcol=startcol, - freeze_panes=freeze_panes) + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) if need_save: writer.save() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 98c31fbeb78e68..c4e3dd1c755cf2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -17,12 +17,26 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas.core.dtypes.common import ( - is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar, - is_timedelta64_dtype) + is_categorical_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray) + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCSparseArray, +) from pandas.core.dtypes.missing import isna, notna from pandas.core.base import PandasObject @@ -92,9 +106,19 @@ .. versionadded:: 0.18.0 """ -_VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", - "justify-all", "start", "end", "inherit", - "match-parent", "initial", "unset") +_VALID_JUSTIFY_PARAMETERS = ( + "left", + "right", + "center", + "justify", + "justify-all", + "start", + "end", + "inherit", + "match-parent", + "initial", + "unset", +) return_docstring = """ Returns @@ -105,9 +129,7 @@ class CategoricalFormatter: - - def __init__(self, categorical, buf=None, length=True, na_rep='NaN', - footer=True): + def __init__(self, categorical, buf=None, length=True, na_rep="NaN", footer=True): self.categorical = categorical self.buf = buf if buf is not None else StringIO("") self.na_rep = na_rep @@ -115,25 +137,29 @@ def __init__(self, categorical, buf=None, length=True, na_rep='NaN', self.footer = footer def _get_footer(self): - footer = '' + footer = "" if self.length: if footer: - footer += ', ' + footer += ", " footer += "Length: {length}".format(length=len(self.categorical)) level_info = self.categorical._repr_categories_info() # Levels are added in a newline if footer: - footer += '\n' + footer += "\n" footer += level_info return str(footer) def _get_formatted_values(self): - return format_array(self.categorical._internal_get_values(), None, - float_format=None, na_rep=self.na_rep) + return format_array( + self.categorical._internal_get_values(), + None, + float_format=None, + na_rep=self.na_rep, + ) def to_string(self): categorical = self.categorical @@ -142,27 +168,37 @@ def to_string(self): if self.footer: return self._get_footer() else: - return '' + return "" fmt_values = self._get_formatted_values() - result = ['{i}'.format(i=i) for i in fmt_values] + result = ["{i}".format(i=i) for i in fmt_values] result = [i.strip() for i in result] - result = ', '.join(result) - result = ['[' + result + ']'] + result = ", 
".join(result) + result = ["[" + result + "]"] if self.footer: footer = self._get_footer() if footer: result.append(footer) - return str('\n'.join(result)) + return str("\n".join(result)) class SeriesFormatter: - - def __init__(self, series, buf=None, length=True, header=True, index=True, - na_rep='NaN', name=False, float_format=None, dtype=True, - max_rows=None, min_rows=None): + def __init__( + self, + series, + buf=None, + length=True, + header=True, + index=True, + na_rep="NaN", + name=False, + float_format=None, + dtype=True, + max_rows=None, + min_rows=None, + ): self.series = series self.buf = buf if buf is not None else StringIO() self.name = name @@ -183,6 +219,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, def _chk_truncate(self): from pandas.core.reshape.concat import concat + min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows @@ -199,8 +236,7 @@ def _chk_truncate(self): series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], - series.iloc[-row_num:])) + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -209,32 +245,31 @@ def _chk_truncate(self): def _get_footer(self): name = self.series.name - footer = '' + footer = "" - if getattr(self.series.index, 'freq', None) is not None: - footer += 'Freq: {freq}'.format(freq=self.series.index.freqstr) + if getattr(self.series.index, "freq", None) is not None: + footer += "Freq: {freq}".format(freq=self.series.index.freqstr) if self.name is not False and name is not None: if footer: - footer += ', ' + footer += ", " - series_name = pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - footer += (("Name: {sname}".format(sname=series_name)) - if name is not None else "") + series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n")) + footer += ( + ("Name: {sname}".format(sname=series_name)) if name is not None else "" + ) - if (self.length is True or - (self.length == 'truncate' and self.truncate_v)): + if self.length is True or (self.length == "truncate" and self.truncate_v): if footer: - footer += ', ' - footer += 'Length: {length}'.format(length=len(self.series)) + footer += ", " + footer += "Length: {length}".format(length=len(self.series)) if self.dtype is not False and self.dtype is not None: - name = getattr(self.tr_series.dtype, 'name', None) + name = getattr(self.tr_series.dtype, "name", None) if name: if footer: - footer += ', ' - footer += 'dtype: {typ}'.format(typ=pprint_thing(name)) + footer += ", " + footer += "dtype: {typ}".format(typ=pprint_thing(name)) # level infos are added to the end and in a new line, like it is done # for Categoricals @@ -260,8 +295,9 @@ def _get_formatted_index(self): def _get_formatted_values(self): values_to_format = self.tr_series._formatting_values() - return format_array(values_to_format, None, - float_format=self.float_format, na_rep=self.na_rep) + return format_array( + values_to_format, None, float_format=self.float_format, na_rep=self.na_rep + ) def to_string(self): series = self.tr_series @@ -269,7 +305,8 @@ def to_string(self): if len(series) == 0: return "{name}([], {footer})".format( - name=self.series.__class__.__name__, footer=footer) + name=self.series.__class__.__name__, footer=footer + ) fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() @@ -279,14 +316,14 @@ def to_string(self): row_num = self.tr_row_num width = 
self.adj.len(fmt_values[row_num - 1]) if width > 3: - dot_str = '...' + dot_str = "..." else: - dot_str = '..' + dot_str = ".." # Series uses mode=center because it has single value columns # DataFrame uses mode=left - dot_str = self.adj.justify([dot_str], width, mode='center')[0] + dot_str = self.adj.justify([dot_str], width, mode="center")[0] fmt_values.insert(row_num + n_header_rows, dot_str) - fmt_index.insert(row_num + 1, '') + fmt_index.insert(row_num + 1, "") if self.index: result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values]) @@ -294,32 +331,29 @@ def to_string(self): result = self.adj.adjoin(3, fmt_values) if self.header and have_header: - result = fmt_index[0] + '\n' + result + result = fmt_index[0] + "\n" + result if footer: - result += '\n' + footer + result += "\n" + footer - return str(''.join(result)) + return str("".join(result)) class TextAdjustment: - def __init__(self): self.encoding = get_option("display.encoding") def len(self, text): return len(text) - def justify(self, texts, max_len, mode='right'): + def justify(self, texts, max_len, mode="right"): return justify(texts, max_len, mode=mode) def adjoin(self, space, *lists, **kwargs): - return adjoin(space, *lists, strlen=self.len, - justfunc=self.justify, **kwargs) + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) class EastAsianTextAdjustment(TextAdjustment): - def __init__(self): super().__init__() if get_option("display.unicode.ambiguous_as_wide"): @@ -330,7 +364,7 @@ def __init__(self): # Definition of East Asian Width # http://unicode.org/reports/tr11/ # Ambiguous width can be changed by option - self._EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1} + self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} def len(self, text): """ @@ -339,17 +373,18 @@ def len(self, text): if not isinstance(text, str): return len(text) - return sum(self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) - for c in text) + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) - def justify(self, texts, max_len, mode='right'): + def justify(self, texts, max_len, mode="right"): # re-calculate padding space per str considering East Asian Width def _get_pad(t): return max_len - self.len(t) + len(t) - if mode == 'left': + if mode == "left": return [x.ljust(_get_pad(x)) for x in texts] - elif mode == 'center': + elif mode == "center": return [x.center(_get_pad(x)) for x in texts] else: return [x.rjust(_get_pad(x)) for x in texts] @@ -370,8 +405,9 @@ class TableFormatter: @property def should_show_dimensions(self): - return (self.show_dimensions is True or - (self.show_dimensions == 'truncate' and self.is_truncated)) + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): @@ -395,15 +431,33 @@ class DataFrameFormatter(TableFormatter): """ - __doc__ = __doc__ if __doc__ else '' + __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring - def __init__(self, frame, buf=None, columns=None, col_space=None, - header=True, index=True, na_rep='NaN', formatters=None, - justify=None, float_format=None, sparsify=None, - index_names=True, line_width=None, max_rows=None, - min_rows=None, max_cols=None, show_dimensions=False, - decimal='.', table_id=None, render_links=False, **kwds): + def __init__( + self, + frame, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + 
formatters=None, + justify=None, + float_format=None, + sparsify=None, + index_names=True, + line_width=None, + max_rows=None, + min_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + table_id=None, + render_links=False, + **kwds + ): self.frame = frame if buf is not None: self.buf = _expand_user(_stringify_path(buf)) @@ -427,8 +481,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols - self.max_rows_displayed = min(max_rows or len(self.frame), - len(self.frame)) + self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) self.show_dimensions = show_dimensions self.table_id = table_id self.render_links = render_links @@ -469,8 +522,7 @@ def _chk_truncate(self): prompt_row = 1 if self.show_dimensions: show_dimension_rows = 3 - n_add_rows = (self.header + dot_row + show_dimension_rows + - prompt_row) + n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data max_rows_adj = self.h - n_add_rows self.max_rows_adj = max_rows_adj @@ -482,13 +534,13 @@ def _chk_truncate(self): if max_rows == 0 and len(self.frame) > h: max_rows = h - if not hasattr(self, 'max_rows_adj'): + if not hasattr(self, "max_rows_adj"): if max_rows: if (len(self.frame) > max_rows) and self.min_rows: # if truncated, set max_rows showed to min_rows max_rows = min(self.min_rows, max_rows) self.max_rows_adj = max_rows - if not hasattr(self, 'max_cols_adj'): + if not hasattr(self, "max_cols_adj"): self.max_cols_adj = max_cols max_cols_adj = self.max_cols_adj @@ -505,9 +557,10 @@ def _chk_truncate(self): frame = frame.iloc[:, :max_cols] col_num = max_cols else: - col_num = (max_cols_adj // 2) - frame = concat((frame.iloc[:, :col_num], - frame.iloc[:, -col_num:]), axis=1) + col_num = max_cols_adj // 2 + frame = concat( + (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 + ) self.tr_col_num = col_num if truncate_v: if max_rows_adj == 1: @@ -515,8 +568,7 @@ def _chk_truncate(self): frame = frame.iloc[:max_rows, :] else: row_num = max_rows_adj // 2 - frame = concat((frame.iloc[:row_num, :], - frame.iloc[-row_num:, :])) + frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -539,37 +591,44 @@ def _to_str_columns(self): stringified = [] for i, c in enumerate(frame): fmt_values = self._format_col(i) - fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=(self.col_space or 0), - adj=self.adj) + fmt_values = _make_fixed_width( + fmt_values, + self.justify, + minimum=(self.col_space or 0), + adj=self.adj, + ) stringified.append(fmt_values) else: if is_list_like(self.header): if len(self.header) != len(self.columns): - raise ValueError(('Writing {ncols} cols but got {nalias} ' - 'aliases' - .format(ncols=len(self.columns), - nalias=len(self.header)))) + raise ValueError( + ( + "Writing {ncols} cols but got {nalias} " + "aliases".format( + ncols=len(self.columns), nalias=len(self.header) + ) + ) + ) str_columns = [[label] for label in self.header] else: str_columns = self._get_formatted_column_labels(frame) if self.show_row_idx_names: for x in str_columns: - x.append('') + x.append("") stringified = [] for i, c in enumerate(frame): cheader = str_columns[i] - header_colwidth = max(self.col_space or 0, - *(self.adj.len(x) for x in cheader)) + header_colwidth = max( + self.col_space or 0, *(self.adj.len(x) for x in cheader) + ) fmt_values = self._format_col(i) - 
fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=header_colwidth, - adj=self.adj) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) - max_len = max(max(self.adj.len(x) for x in fmt_values), - header_colwidth) + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) cheader = self.adj.justify(cheader, max_len, mode=self.justify) stringified.append(cheader + fmt_values) @@ -583,7 +642,7 @@ def _to_str_columns(self): if truncate_h: col_num = self.tr_col_num - strcols.insert(self.tr_col_num + 1, [' ...'] * (len(str_index))) + strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) if truncate_v: n_header_rows = len(str_index) - len(frame) row_num = self.tr_row_num @@ -594,17 +653,17 @@ def _to_str_columns(self): if truncate_h: is_dot_col = ix == col_num + 1 if cwidth > 3 or is_dot_col: - my_str = '...' + my_str = "..." else: - my_str = '..' + my_str = ".." if ix == 0: - dot_mode = 'left' + dot_mode = "left" elif is_dot_col: cwidth = 4 - dot_mode = 'right' + dot_mode = "right" else: - dot_mode = 'right' + dot_mode = "right" dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] strcols[ix].insert(row_num + n_header_rows, dot_str) return strcols @@ -618,10 +677,11 @@ def to_string(self): frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: - info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}' - .format(name=type(self.frame).__name__, - col=pprint_thing(frame.columns), - idx=pprint_thing(frame.index))) + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=pprint_thing(frame.columns), + idx=pprint_thing(frame.index), + ) text = info_line else: @@ -629,27 +689,27 @@ def to_string(self): if self.line_width is None: # no need to wrap around just print # the whole frame text = self.adj.adjoin(1, *strcols) - elif (not isinstance(self.max_cols, int) or - self.max_cols > 0): # need to wrap around + elif ( + not isinstance(self.max_cols, int) or self.max_cols > 0 + ): # need to wrap around text = self._join_multiline(*strcols) else: # max_cols == 0. 
Try to fit frame to terminal - text = self.adj.adjoin(1, *strcols).split('\n') + text = self.adj.adjoin(1, *strcols).split("\n") max_len = Series(text).str.len().max() # plus truncate dot col dif = max_len - self.w # '+ 1' to avoid too wide repr (GH PR #17023) adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() - for ele in strcols]) + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) n_cols = len(col_lens) counter = 0 while adj_dif > 0 and n_cols > 1: counter += 1 - mid = int(round(n_cols / 2.)) + mid = int(round(n_cols / 2.0)) mid_ix = col_lens.index[mid] col_len = col_lens[mid_ix] # adjoin adds one - adj_dif -= (col_len + 1) + adj_dif -= col_len + 1 col_lens = col_lens.drop(mid_ix) n_cols = len(col_lens) # subtract index column @@ -666,8 +726,11 @@ def to_string(self): self.buf.writelines(text) if self.should_show_dimensions: - self.buf.write("\n\n[{nrows} rows x {ncols} columns]" - .format(nrows=len(frame), ncols=len(frame.columns))) + self.buf.write( + "\n\n[{nrows} rows x {ncols} columns]".format( + nrows=len(frame), ncols=len(frame.columns) + ) + ) def _join_multiline(self, *strcols): lwidth = self.line_width @@ -675,11 +738,12 @@ def _join_multiline(self, *strcols): strcols = list(strcols) if self.index: idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) - for x in idx]).max() + adjoin_width + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - col_widths = [np.array([self.adj.len(x) for x in col]).max() if - len(col) > 0 else 0 for col in strcols] + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) @@ -696,46 +760,62 @@ def _join_multiline(self, *strcols): row.insert(0, idx) if nbins > 1: if ed <= len(strcols) and i < nbins - 1: - row.append([' \\'] + [' '] * (nrows - 1)) + row.append([" \\"] + [" "] * (nrows - 1)) else: - row.append([' '] * nrows) + row.append([" "] * nrows) str_lst.append(self.adj.adjoin(adjoin_width, *row)) st = ed - return '\n\n'.join(str_lst) - - def to_latex(self, column_format=None, longtable=False, encoding=None, - multicolumn=False, multicolumn_format=None, multirow=False): + return "\n\n".join(str_lst) + + def to_latex( + self, + column_format=None, + longtable=False, + encoding=None, + multicolumn=False, + multicolumn_format=None, + multirow=False, + ): """ Render a DataFrame to a LaTeX tabular/longtable environment output. 
""" from pandas.io.formats.latex import LatexFormatter - latex_renderer = LatexFormatter(self, column_format=column_format, - longtable=longtable, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) + + latex_renderer = LatexFormatter( + self, + column_format=column_format, + longtable=longtable, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + ) if encoding is None: - encoding = 'utf-8' + encoding = "utf-8" - if hasattr(self.buf, 'write'): + if hasattr(self.buf, "write"): latex_renderer.write_result(self.buf) elif isinstance(self.buf, str): import codecs - with codecs.open(self.buf, 'w', encoding=encoding) as f: + + with codecs.open(self.buf, "w", encoding=encoding) as f: latex_renderer.write_result(f) else: - raise TypeError('buf is not a file name and it has no write ' - 'method') + raise TypeError("buf is not a file name and it has no write " "method") def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) values_to_format = frame.iloc[:, i]._formatting_values() - return format_array(values_to_format, formatter, - float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, decimal=self.decimal) + return format_array( + values_to_format, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space, + decimal=self.decimal, + ) def to_html(self, classes=None, notebook=False, border=None): """ @@ -755,16 +835,16 @@ def to_html(self, classes=None, notebook=False, border=None): .. versionadded:: 0.19.0 """ from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + Klass = NotebookFormatter if notebook else HTMLFormatter html = Klass(self, classes=classes, border=border).render() - if hasattr(self.buf, 'write'): + if hasattr(self.buf, "write"): buffer_put_lines(self.buf, html) elif isinstance(self.buf, str): - with open(self.buf, 'w') as f: + with open(self.buf, "w") as f: buffer_put_lines(f, html) else: - raise TypeError('buf is not a file name and it has no write ' - ' method') + raise TypeError("buf is not a file name and it has no write " " method") def _get_formatted_column_labels(self, frame): from pandas.core.index import _sparsify @@ -781,13 +861,17 @@ def _get_formatted_column_labels(self, frame): need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) def space_format(x, y): - if (y not in self.formatters and - need_leadsp[x] and not restrict_formatting): - return ' ' + y + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y return y - str_columns = list(zip(*[[space_format(x, y) for y in x] - for x in fmt_columns])) + str_columns = list( + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) + ) if self.sparsify and len(str_columns): str_columns = _sparsify(str_columns) @@ -796,10 +880,10 @@ def space_format(x, y): fmt_columns = columns.format() dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [[' ' + x if not self._get_formatter(i) and - need_leadsp[x] else x] - for i, (col, x) in enumerate(zip(columns, - fmt_columns))] + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, (col, x) in enumerate(zip(columns, fmt_columns)) + ] # self.str_columns = str_columns return str_columns @@ -813,43 +897,45 @@ def has_column_names(self): @property def show_row_idx_names(self): - return all((self.has_index_names, - self.index, - self.show_index_names)) + 
return all((self.has_index_names, self.index, self.show_index_names)) @property def show_col_idx_names(self): - return all((self.has_column_names, - self.show_index_names, - self.header)) + return all((self.has_column_names, self.show_index_names, self.header)) def _get_formatted_index(self, frame): # Note: this is only used by to_string() and to_latex(), not by # to_html(). index = frame.index columns = frame.columns - fmt = self._get_formatter('__index__') + fmt = self._get_formatter("__index__") if isinstance(index, ABCMultiIndex): fmt_index = index.format( - sparsify=self.sparsify, adjoin=False, - names=self.show_row_idx_names, formatter=fmt) + sparsify=self.sparsify, + adjoin=False, + names=self.show_row_idx_names, + formatter=fmt, + ) else: - fmt_index = [index.format( - name=self.show_row_idx_names, formatter=fmt)] + fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] - fmt_index = [tuple(_make_fixed_width(list(x), justify='left', - minimum=(self.col_space or 0), - adj=self.adj)) for x in fmt_index] + fmt_index = [ + tuple( + _make_fixed_width( + list(x), justify="left", minimum=(self.col_space or 0), adj=self.adj + ) + ) + for x in fmt_index + ] - adjoined = self.adj.adjoin(1, *fmt_index).split('\n') + adjoined = self.adj.adjoin(1, *fmt_index).split("\n") # empty space for columns if self.show_col_idx_names: - col_header = ['{x}'.format(x=x) - for x in self._get_column_name_list()] + col_header = ["{x}".format(x=x) for x in self._get_column_name_list()] else: - col_header = [''] * columns.nlevels + col_header = [""] * columns.nlevels if self.header: return col_header + adjoined @@ -860,19 +946,27 @@ def _get_column_name_list(self): names = [] columns = self.frame.columns if isinstance(columns, ABCMultiIndex): - names.extend('' if name is None else name - for name in columns.names) + names.extend("" if name is None else name for name in columns.names) else: - names.append('' if columns.name is None else columns.name) + names.append("" if columns.name is None else columns.name) return names + # ---------------------------------------------------------------------- # Array formatters -def format_array(values, formatter, float_format=None, na_rep='NaN', - digits=None, space=None, justify='right', decimal='.', - leading_space=None): +def format_array( + values, + formatter, + float_format=None, + na_rep="NaN", + digits=None, + space=None, + justify="right", + decimal=".", + leading_space=None, +): """ Format an array for printing. 
@@ -924,19 +1018,36 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', if digits is None: digits = get_option("display.precision") - fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, - float_format=float_format, formatter=formatter, - space=space, justify=justify, decimal=decimal, - leading_space=leading_space) + fmt_obj = fmt_klass( + values, + digits=digits, + na_rep=na_rep, + float_format=float_format, + formatter=formatter, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + ) return fmt_obj.get_result() class GenericArrayFormatter: - - def __init__(self, values, digits=7, formatter=None, na_rep='NaN', - space=12, float_format=None, justify='right', decimal='.', - quoting=None, fixed_width=True, leading_space=None): + def __init__( + self, + values, + digits=7, + formatter=None, + na_rep="NaN", + space=12, + float_format=None, + justify="right", + decimal=".", + quoting=None, + fixed_width=True, + leading_space=None, + ): self.values = values self.digits = digits self.na_rep = na_rep @@ -957,15 +1068,18 @@ def _format_strings(self): if self.float_format is None: float_format = get_option("display.float_format") if float_format is None: - fmt_str = ('{{x: .{prec:d}g}}' - .format(prec=get_option("display.precision"))) + fmt_str = "{{x: .{prec:d}g}}".format( + prec=get_option("display.precision") + ) float_format = lambda x: fmt_str.format(x=x) else: float_format = self.float_format formatter = ( - self.formatter if self.formatter is not None else - (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')))) + self.formatter + if self.formatter is not None + else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n"))) + ) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): @@ -973,18 +1087,18 @@ def _format(x): # try block for np.isnat specifically # determine na_rep if x is None or NaT-like if x is None: - return 'None' + return "None" elif x is NaT or np.isnat(x): - return 'NaT' + return "NaT" except (TypeError, ValueError): # np.isnat only handles datetime or timedelta objects pass return self.na_rep elif isinstance(x, PandasObject): - return '{x}'.format(x=x) + return "{x}".format(x=x) else: # object dtype - return '{x}'.format(x=formatter(x)) + return "{x}".format(x=formatter(x)) vals = self.values if isinstance(vals, Index): @@ -1000,16 +1114,16 @@ def _format(x): fmt_values = [] for i, v in enumerate(vals): if not is_float_type[i] and leading_space: - fmt_values.append(' {v}'.format(v=_format(v))) + fmt_values.append(" {v}".format(v=_format(v))) elif is_float_type[i]: fmt_values.append(float_format(v)) else: if leading_space is False: # False specifically, so that the default is # to include a space if we get here. 
- tpl = '{v}' + tpl = "{v}" else: - tpl = ' {v}' + tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) return fmt_values @@ -1047,15 +1161,20 @@ def _value_formatter(self, float_format=None, threshold=None): # when there is no float_format, we use str instead of '%g' # because str(0.0) = '0.0' while '%g' % 0.0 = '0' if float_format: + def base_formatter(v): return float_format(value=v) if notna(v) else self.na_rep + else: + def base_formatter(v): return str(v) if notna(v) else self.na_rep - if self.decimal != '.': + if self.decimal != ".": + def decimal_formatter(v): - return base_formatter(v).replace('.', self.decimal, 1) + return base_formatter(v).replace(".", self.decimal, 1) + else: decimal_formatter = base_formatter @@ -1093,8 +1212,8 @@ def format_values_with(float_format): # default formatter leaves a space to the left when formatting # floats, must be consistent for left-justifying NaNs (GH #25061) - if self.justify == 'left': - na_rep = ' ' + self.na_rep + if self.justify == "left": + na_rep = " " + self.na_rep else: na_rep = self.na_rep @@ -1102,13 +1221,14 @@ def format_values_with(float_format): values = self.values is_complex = is_complex_dtype(values) mask = isna(values) - if hasattr(values, 'to_dense'): # sparse numpy ndarray + if hasattr(values, "to_dense"): # sparse numpy ndarray values = values.to_dense() - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep imask = (~mask).ravel() - values.flat[imask] = np.array([formatter(val) - for val in values.ravel()[imask]]) + values.flat[imask] = np.array( + [formatter(val) for val in values.ravel()[imask]] + ) if self.fixed_width: if is_complex: @@ -1122,8 +1242,9 @@ def format_values_with(float_format): # The default is otherwise to use str instead of a formatting string if self.float_format is None: if self.fixed_width: - float_format = partial('{value: .{digits:d}f}'.format, - digits=self.digits) + float_format = partial( + "{value: .{digits:d}f}".format, digits=self.digits + ) else: float_format = self.float_format else: @@ -1144,18 +1265,18 @@ def format_values_with(float_format): else: too_long = False - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): abs_vals = np.abs(self.values) # this is pretty arbitrary for now # large values: more that 8 characters including decimal symbol # and first digit, hence > 1e6 has_large_values = (abs_vals > 1e6).any() - has_small_values = ((abs_vals < 10**(-self.digits)) & - (abs_vals > 0)).any() + has_small_values = ( + (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0) + ).any() if has_small_values or (too_long and has_large_values): - float_format = partial('{value: .{digits:d}e}'.format, - digits=self.digits) + float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values @@ -1169,16 +1290,14 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): - def _format_strings(self): - formatter = self.formatter or (lambda x: '{x: d}'.format(x=x)) + formatter = self.formatter or (lambda x: "{x: d}".format(x=x)) fmt_values = [formatter(x) for x in self.values] return fmt_values class Datetime64Formatter(GenericArrayFormatter): - - def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): + def __init__(self, values, nat_rep="NaT", date_format=None, **kwargs): super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format @@ -1196,9 +1315,9 @@ def 
_format_strings(self): fmt_values = format_array_from_datetime( values.asi8.ravel(), - format=_get_format_datetime64_from_values(values, - self.date_format), - na_rep=self.nat_rep).reshape(values.shape) + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep, + ).reshape(values.shape) return fmt_values.tolist() @@ -1216,12 +1335,16 @@ def _format_strings(self): else: array = np.asarray(values) - fmt_values = format_array(array, - formatter, - float_format=self.float_format, - na_rep=self.na_rep, digits=self.digits, - space=self.space, justify=self.justify, - leading_space=self.leading_space) + fmt_values = format_array( + array, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + digits=self.digits, + space=self.space, + justify=self.justify, + leading_space=self.leading_space, + ) return fmt_values @@ -1261,9 +1384,12 @@ def format_percentiles(percentiles): percentiles = np.asarray(percentiles) # It checks for np.NaN as well - with np.errstate(invalid='ignore'): - if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ - or not np.all(percentiles <= 1): + with np.errstate(invalid="ignore"): + if ( + not is_numeric_dtype(percentiles) + or not np.all(percentiles >= 0) + or not np.all(percentiles <= 1) + ): raise ValueError("percentiles should all be in the interval [0,1]") percentiles = 100 * percentiles @@ -1271,21 +1397,21 @@ def format_percentiles(percentiles): if np.all(int_idx): out = percentiles.astype(int).astype(str) - return [i + '%' for i in out] + return [i + "%" for i in out] unique_pcts = np.unique(percentiles) to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None # Least precision that keeps percentiles unique after rounding - prec = -np.floor(np.log10(np.min( - np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end) - ))).astype(int) + prec = -np.floor( + np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end))) + ).astype(int) prec = max(1, prec) out = np.empty_like(percentiles, dtype=object) out[int_idx] = percentiles[int_idx].astype(int).astype(str) out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) - return [i + '%' for i in out] + return [i + "%" for i in out] def _is_dates_only(values): @@ -1298,20 +1424,21 @@ def _is_dates_only(values): values_int = values.asi8 consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, - values_int % int(one_day_nanos) != 0).sum() == 0 + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % int(one_day_nanos) != 0).sum() == 0 + ) if even_days: return True return False -def _format_datetime64(x, tz=None, nat_rep='NaT'): +def _format_datetime64(x, tz=None, nat_rep="NaT"): if x is None or (is_scalar(x) and isna(x)): return nat_rep if tz is not None or not isinstance(x, Timestamp): - if getattr(x, 'tzinfo', None) is not None: + if getattr(x, "tzinfo", None) is not None: x = Timestamp(x).tz_convert(tz) else: x = Timestamp(x).tz_localize(tz) @@ -1319,7 +1446,7 @@ def _format_datetime64(x, tz=None, nat_rep='NaT'): return str(x) -def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): +def _format_datetime64_dateonly(x, nat_rep="NaT", date_format=None): if x is None or (is_scalar(x) and isna(x)): return nat_rep @@ -1332,11 +1459,12 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): return x._date_repr -def _get_format_datetime64(is_dates_only, 
nat_rep='NaT', date_format=None): +def _get_format_datetime64(is_dates_only, nat_rep="NaT", date_format=None): if is_dates_only: return lambda x, tz=None: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format) + x, nat_rep=nat_rep, date_format=date_format + ) else: return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) @@ -1356,36 +1484,34 @@ def _get_format_datetime64_from_values(values, date_format): class Datetime64TZFormatter(Datetime64Formatter): - def _format_strings(self): """ we by definition have a TZ """ values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = (self.formatter or - _get_format_datetime64(is_dates_only, - date_format=self.date_format)) + formatter = self.formatter or _get_format_datetime64( + is_dates_only, date_format=self.date_format + ) fmt_values = [formatter(x) for x in values] return fmt_values class Timedelta64Formatter(GenericArrayFormatter): - - def __init__(self, values, nat_rep='NaT', box=False, **kwargs): + def __init__(self, values, nat_rep="NaT", box=False, **kwargs): super().__init__(values, **kwargs) self.nat_rep = nat_rep self.box = box def _format_strings(self): - formatter = (self.formatter or - _get_format_timedelta64(self.values, nat_rep=self.nat_rep, - box=self.box)) + formatter = self.formatter or _get_format_timedelta64( + self.values, nat_rep=self.nat_rep, box=self.box + ) fmt_values = np.array([formatter(x) for x in self.values]) return fmt_values -def _get_format_timedelta64(values, nat_rep='NaT', box=False): +def _get_format_timedelta64(values, nat_rep="NaT", box=False): """ Return a formatter function for a range of timedeltas. These will all have the same format argument @@ -1397,18 +1523,20 @@ def _get_format_timedelta64(values, nat_rep='NaT', box=False): consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, - values_int % one_day_nanos != 0).sum() == 0 - all_sub_day = np.logical_and( - consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + ) + all_sub_day = ( + np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 + ) if even_days: format = None elif all_sub_day: - format = 'sub_day' + format = "sub_day" else: - format = 'long' + format = "long" def _formatter(x): if x is None or (is_scalar(x) and isna(x)): @@ -1424,9 +1552,9 @@ def _formatter(x): return _formatter -def _make_fixed_width(strings, justify='right', minimum=None, adj=None): +def _make_fixed_width(strings, justify="right", minimum=None, adj=None): - if len(strings) == 0 or justify == 'all': + if len(strings) == 0 or justify == "all": return strings if adj is None: @@ -1444,7 +1572,7 @@ def _make_fixed_width(strings, justify='right', minimum=None, adj=None): def just(x): if conf_max is not None: if (conf_max > 3) & (adj.len(x) > max_len): - x = x[:max_len - 3] + '...' + x = x[: max_len - 3] + "..." return x strings = [just(x) for x in strings] @@ -1452,41 +1580,46 @@ def just(x): return result -def _trim_zeros_complex(str_complexes, na_rep='NaN'): +def _trim_zeros_complex(str_complexes, na_rep="NaN"): """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. 
""" + def separate_and_trim(str_complex, na_rep): - num_arr = str_complex.split('+') - return (_trim_zeros_float([num_arr[0]], na_rep) + - ['+'] + - _trim_zeros_float([num_arr[1][:-1]], na_rep) + - ['j']) + num_arr = str_complex.split("+") + return ( + _trim_zeros_float([num_arr[0]], na_rep) + + ["+"] + + _trim_zeros_float([num_arr[1][:-1]], na_rep) + + ["j"] + ) - return [''.join(separate_and_trim(x, na_rep)) for x in str_complexes] + return ["".join(separate_and_trim(x, na_rep)) for x in str_complexes] -def _trim_zeros_float(str_floats, na_rep='NaN'): +def _trim_zeros_float(str_floats, na_rep="NaN"): """ Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats def _is_number(x): - return (x != na_rep and not x.endswith('inf')) + return x != na_rep and not x.endswith("inf") def _cond(values): finite = [x for x in values if _is_number(x)] - return (len(finite) > 0 and all(x.endswith('0') for x in finite) and - not (any(('e' in x) or ('E' in x) for x in finite))) + return ( + len(finite) > 0 + and all(x.endswith("0") for x in finite) + and not (any(("e" in x) or ("E" in x) for x in finite)) + ) while _cond(trimmed): trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. - return [x + "0" if x.endswith('.') and _is_number(x) else x - for x in trimmed] + return [x + "0" if x.endswith(".") and _is_number(x) else x for x in trimmed] def _has_names(index): @@ -1521,7 +1654,7 @@ class EngFormatter: 15: "P", 18: "E", 21: "Z", - 24: "Y" + 24: "Y", } def __init__(self, accuracy=None, use_eng_prefix=False): @@ -1551,13 +1684,14 @@ def __call__(self, num): """ import decimal import math + dnum = decimal.Decimal(str(num)) if decimal.Decimal.is_nan(dnum): - return 'NaN' + return "NaN" if decimal.Decimal.is_infinite(dnum): - return 'inf' + return "inf" sign = 1 @@ -1578,17 +1712,16 @@ def __call__(self, num): prefix = self.ENG_PREFIXES[int_pow10] else: if int_pow10 < 0: - prefix = 'E-{pow10:02d}'.format(pow10=-int_pow10) + prefix = "E-{pow10:02d}".format(pow10=-int_pow10) else: - prefix = 'E+{pow10:02d}'.format(pow10=int_pow10) + prefix = "E+{pow10:02d}".format(pow10=int_pow10) - mant = sign * dnum / (10**pow10) + mant = sign * dnum / (10 ** pow10) if self.accuracy is None: # pragma: no cover format_str = "{mant: g}{prefix}" else: - format_str = ("{{mant: .{acc:d}f}}{{prefix}}" - .format(acc=self.accuracy)) + format_str = "{{mant: .{acc:d}f}}{{prefix}}".format(acc=self.accuracy) formatted = format_str.format(mant=mant, prefix=prefix) @@ -1628,7 +1761,7 @@ def _binify(cols, line_width): return bins -def get_level_lengths(levels, sentinel=''): +def get_level_lengths(levels, sentinel=""): """For each index in each level the function returns lengths of indexes. 
Parameters @@ -1681,4 +1814,4 @@ def buffer_put_lines(buf, lines): """ if any(isinstance(x, str) for x in lines): lines = [str(x) for x in lines] - buf.write('\n'.join(lines)) + buf.write("\n".join(lines)) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 6fc36324092b5c..e6aae44baa69b9 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -35,17 +35,16 @@ def __init__(self, formatter, classes=None, border=None): self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] - self.bold_rows = self.fmt.kwds.get('bold_rows', False) - self.escape = self.fmt.kwds.get('escape', True) + self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.escape = self.fmt.kwds.get("escape", True) self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option('display.html.border') + border = get_option("display.html.border") self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links if isinstance(self.fmt.col_space, int): - self.fmt.col_space = ('{colspace}px' - .format(colspace=self.fmt.col_space)) + self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space) @property def show_row_idx_names(self): @@ -83,7 +82,7 @@ def ncols(self): def write(self, s, indent=0): rs = pprint_thing(s) - self.elements.append(' ' * indent + rs) + self.elements.append(" " * indent + rs) def write_th(self, s, header=False, indent=0, tags=None): """ @@ -109,25 +108,23 @@ def write_th(self, s, header=False, indent=0, tags=None): A written cell. """ if header and self.fmt.col_space is not None: - tags = (tags or "") - tags += ('style="min-width: {colspace};"' - .format(colspace=self.fmt.col_space)) + tags = tags or "" + tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space) - return self._write_cell(s, kind='th', indent=indent, tags=tags) + return self._write_cell(s, kind="th", indent=indent, tags=tags) def write_td(self, s, indent=0, tags=None): - return self._write_cell(s, kind='td', indent=indent, tags=tags) + return self._write_cell(s, kind="td", indent=indent, tags=tags) - def _write_cell(self, s, kind='td', indent=0, tags=None): + def _write_cell(self, s, kind="td", indent=0, tags=None): if tags is not None: - start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags) + start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags) else: - start_tag = '<{kind}>'.format(kind=kind) + start_tag = "<{kind}>".format(kind=kind) if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict([('&', r'&'), ('<', r'<'), - ('>', r'>')]) + esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) else: esc = {} @@ -135,25 +132,35 @@ def _write_cell(self, s, kind='td', indent=0, tags=None): if self.render_links and _is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() - start_tag += '
'.format( - url=rs_unescaped) - end_a = '' + start_tag += ''.format(url=rs_unescaped) + end_a = "" else: - end_a = '' - - self.write('{start}{rs}{end_a}'.format( - start=start_tag, rs=rs, end_a=end_a, kind=kind), indent) - - def write_tr(self, line, indent=0, indent_delta=0, header=False, - align=None, tags=None, nindex_levels=0): + end_a = "" + + self.write( + "{start}{rs}{end_a}".format( + start=start_tag, rs=rs, end_a=end_a, kind=kind + ), + indent, + ) + + def write_tr( + self, + line, + indent=0, + indent_delta=0, + header=False, + align=None, + tags=None, + nindex_levels=0, + ): if tags is None: tags = {} if align is None: - self.write('', indent) + self.write("", indent) else: - self.write('' - .format(align=align), indent) + self.write(''.format(align=align), indent) indent += indent_delta for i, s in enumerate(line): @@ -164,31 +171,34 @@ def write_tr(self, line, indent=0, indent_delta=0, header=False, self.write_td(s, indent, tags=val_tag) indent -= indent_delta - self.write('', indent) + self.write("", indent) def render(self): self._write_table() if self.should_show_dimensions: by = chr(215) # × - self.write('

<p>{rows} rows {by} {cols} columns</p>'
-                       .format(rows=len(self.frame),
-                               by=by,
-                               cols=len(self.frame.columns)))
+            self.write(
+                "<p>{rows} rows {by} {cols} columns</p>
".format( + rows=len(self.frame), by=by, cols=len(self.frame.columns) + ) + ) return self.elements def _write_table(self, indent=0): - _classes = ['dataframe'] # Default class. + _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: - _classes.append('tex2jax_ignore') + _classes.append("tex2jax_ignore") if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() if not isinstance(self.classes, (list, tuple)): - raise TypeError('classes must be a string, list, or tuple, ' - 'not {typ}'.format(typ=type(self.classes))) + raise TypeError( + "classes must be a string, list, or tuple, " + "not {typ}".format(typ=type(self.classes)) + ) _classes.extend(self.classes) if self.table_id is None: @@ -196,16 +206,19 @@ def _write_table(self, indent=0): else: id_section = ' id="{table_id}"'.format(table_id=self.table_id) - self.write('' - .format(border=self.border, cls=' '.join(_classes), - id_section=id_section), indent) + self.write( + '
<table border="{border}" class="{cls}"{id_section}>'.format(
+                border=self.border, cls=" ".join(_classes), id_section=id_section
+            ),
+            indent,
+        )
         if self.fmt.header or self.show_row_idx_names:
             self._write_header(indent + self.indent_delta)
         self._write_body(indent + self.indent_delta)
-        self.write('</table>
', indent) + self.write("", indent) def _write_col_header(self, indent): truncate_h = self.fmt.truncate_h @@ -217,12 +230,10 @@ def _write_col_header(self, indent): sentinel = object() else: sentinel = False - levels = self.columns.format(sparsify=sentinel, adjoin=False, - names=False) + levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 - for lnum, (records, values) in enumerate(zip(level_lengths, - levels)): + for lnum, (records, values) in enumerate(zip(level_lengths, levels)): if truncate_h: # modify the header lines ins_col = self.fmt.tr_col_num @@ -235,21 +246,23 @@ def _write_col_header(self, indent): elif tag + span > ins_col: recs_new[tag] = span + 1 if lnum == inner_lvl: - values = (values[:ins_col] + ('...',) + - values[ins_col:]) + values = ( + values[:ins_col] + ("...",) + values[ins_col:] + ) else: # sparse col headers do not receive a ... - values = (values[:ins_col] + - (values[ins_col - 1], ) + - values[ins_col:]) + values = ( + values[:ins_col] + + (values[ins_col - 1],) + + values[ins_col:] + ) else: recs_new[tag] = span # if ins_col lies between tags, all col headers # get ... if tag + span == ins_col: recs_new[ins_col] = 1 - values = (values[:ins_col] + ('...',) + - values[ins_col:]) + values = values[:ins_col] + ("...",) + values[ins_col:] records = recs_new inner_lvl = len(level_lengths) - 1 if lnum == inner_lvl: @@ -263,8 +276,7 @@ def _write_col_header(self, indent): recs_new[tag] = span recs_new[ins_col] = 1 records = recs_new - values = (values[:ins_col] + ['...'] + - values[ins_col:]) + values = values[:ins_col] + ["..."] + values[ins_col:] # see gh-22579 # Column Offset Bug with to_html(index=False) with @@ -272,7 +284,7 @@ def _write_col_header(self, indent): # Initially fill row with blank cells before column names. # TODO: Refactor to remove code duplication with code # block below for standard columns index. - row = [''] * (self.row_levels - 1) + row = [""] * (self.row_levels - 1) if self.fmt.index or self.show_col_idx_names: # see gh-22747 # If to_html(index_names=False) do not show columns @@ -283,9 +295,9 @@ def _write_col_header(self, indent): # parity with DataFrameFormatter class. if self.fmt.show_index_names: name = self.columns.names[lnum] - row.append(pprint_thing(name or '')) + row.append(pprint_thing(name or "")) else: - row.append('') + row.append("") tags = {} j = len(row) @@ -297,8 +309,7 @@ def _write_col_header(self, indent): continue j += 1 row.append(v) - self.write_tr(row, indent, self.indent_delta, tags=tags, - header=True) + self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) else: # see gh-22579 # Column misalignment also occurs for @@ -306,7 +317,7 @@ def _write_col_header(self, indent): # Initially fill row with blank cells before column names. # TODO: Refactor to remove code duplication with code block # above for columns MultiIndex. - row = [''] * (self.row_levels - 1) + row = [""] * (self.row_levels - 1) if self.fmt.index or self.show_col_idx_names: # see gh-22747 # If to_html(index_names=False) do not show columns @@ -314,27 +325,27 @@ def _write_col_header(self, indent): # TODO: Refactor to use _get_column_name_list from # DataFrameFormatter class. 
if self.fmt.show_index_names: - row.append(self.columns.name or '') + row.append(self.columns.name or "") else: - row.append('') + row.append("") row.extend(self._get_columns_formatted_values()) align = self.fmt.justify if truncate_h: ins_col = self.row_levels + self.fmt.tr_col_num - row.insert(ins_col, '...') + row.insert(ins_col, "...") - self.write_tr(row, indent, self.indent_delta, header=True, - align=align) + self.write_tr(row, indent, self.indent_delta, header=True, align=align) def _write_row_header(self, indent): truncate_h = self.fmt.truncate_h - row = ([x if x is not None else '' for x in self.frame.index.names] - + [''] * (self.ncols + (1 if truncate_h else 0))) + row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( + self.ncols + (1 if truncate_h else 0) + ) self.write_tr(row, indent, self.indent_delta, header=True) def _write_header(self, indent): - self.write('', indent) + self.write("", indent) if self.fmt.header: self._write_col_header(indent + self.indent_delta) @@ -342,27 +353,24 @@ def _write_header(self, indent): if self.show_row_idx_names: self._write_row_header(indent + self.indent_delta) - self.write('', indent) + self.write("", indent) def _get_formatted_values(self): - with option_context('display.max_colwidth', 999999): - fmt_values = {i: self.fmt._format_col(i) - for i in range(self.ncols)} + with option_context("display.max_colwidth", 999999): + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent): - self.write('', indent) + self.write("", indent) fmt_values = self._get_formatted_values() # write values if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): - self._write_hierarchical_rows( - fmt_values, indent + self.indent_delta) + self._write_hierarchical_rows(fmt_values, indent + self.indent_delta) else: - self._write_regular_rows( - fmt_values, indent + self.indent_delta) + self._write_regular_rows(fmt_values, indent + self.indent_delta) - self.write('', indent) + self.write("", indent) def _write_regular_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h @@ -371,7 +379,7 @@ def _write_regular_rows(self, fmt_values, indent): nrows = len(self.fmt.tr_frame) if self.fmt.index: - fmt = self.fmt._get_formatter('__index__') + fmt = self.fmt._get_formatter("__index__") if fmt is not None: index_values = self.fmt.tr_frame.index.map(fmt) else: @@ -381,9 +389,14 @@ def _write_regular_rows(self, fmt_values, indent): for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = ['...'] * len(row) - self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=self.row_levels) + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) row = [] if self.fmt.index: @@ -393,14 +406,15 @@ def _write_regular_rows(self, fmt_values, indent): # a standard index when the columns index is named. # Add blank cell before data cells. 
elif self.show_col_idx_names: - row.append('') + row.append("") row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: dot_col_ix = self.fmt.tr_col_num + self.row_levels - row.insert(dot_col_ix, '...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=self.row_levels) + row.insert(dot_col_ix, "...") + self.write_tr( + row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels + ) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' @@ -410,15 +424,13 @@ def _write_hierarchical_rows(self, fmt_values, indent): frame = self.fmt.tr_frame nrows = len(frame) - idx_values = frame.index.format(sparsify=False, adjoin=False, - names=False) + idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = list(zip(*idx_values)) if self.fmt.sparsify: # GH3547 sentinel = object() - levels = frame.index.format(sparsify=sentinel, adjoin=False, - names=False) + levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 @@ -438,12 +450,12 @@ def _write_hierarchical_rows(self, fmt_values, indent): # GH 14882 - Make sure insertion done once if not inserted: dot_row = list(idx_values[ins_row - 1]) - dot_row[-1] = '...' + dot_row[-1] = "..." idx_values.insert(ins_row, tuple(dot_row)) inserted = True else: dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = '...' + dot_row[inner_lvl - lnum] = "..." idx_values[ins_row] = tuple(dot_row) else: rec_new[tag] = span @@ -452,19 +464,20 @@ def _write_hierarchical_rows(self, fmt_values, indent): if tag + span == ins_row: rec_new[ins_row] = 1 if lnum == 0: - idx_values.insert(ins_row, tuple( - ['...'] * len(level_lengths))) + idx_values.insert( + ins_row, tuple(["..."] * len(level_lengths)) + ) # GH 14882 - Place ... in correct level elif inserted: dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = '...' + dot_row[inner_lvl - lnum] = "..." idx_values[ins_row] = tuple(dot_row) level_lengths[lnum] = rec_new level_lengths[inner_lvl][ins_row] = 1 for ix_col in range(len(fmt_values)): - fmt_values[ix_col].insert(ins_row, '...') + fmt_values[ix_col].insert(ins_row, "...") nrows += 1 for i in range(nrows): @@ -486,27 +499,44 @@ def _write_hierarchical_rows(self, fmt_values, indent): row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: - row.insert(self.row_levels - sparse_offset + - self.fmt.tr_col_num, '...') - self.write_tr(row, indent, self.indent_delta, tags=tags, - nindex_levels=len(levels) - sparse_offset) + row.insert( + self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." 
+ ) + self.write_tr( + row, + indent, + self.indent_delta, + tags=tags, + nindex_levels=len(levels) - sparse_offset, + ) else: row = [] for i in range(len(frame)): if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = ['...'] * len(row) - self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=self.row_levels) - - idx_values = list(zip(*frame.index.format( - sparsify=False, adjoin=False, names=False))) + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + idx_values = list( + zip(*frame.index.format(sparsify=False, adjoin=False, names=False)) + ) row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: - row.insert(self.row_levels + self.fmt.tr_col_num, '...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=frame.index.nlevels) + row.insert(self.row_levels + self.fmt.tr_col_num, "...") + self.write_tr( + row, + indent, + self.indent_delta, + tags=None, + nindex_levels=frame.index.nlevels, + ) class NotebookFormatter(HTMLFormatter): @@ -534,34 +564,25 @@ def write_style(self): .dataframe %s { %s: %s; }""" - element_props = [('tbody tr th:only-of-type', - 'vertical-align', - 'middle'), - ('tbody tr th', - 'vertical-align', - 'top')] + element_props = [ + ("tbody tr th:only-of-type", "vertical-align", "middle"), + ("tbody tr th", "vertical-align", "top"), + ] if isinstance(self.columns, ABCMultiIndex): - element_props.append(('thead tr th', - 'text-align', - 'left')) + element_props.append(("thead tr th", "text-align", "left")) if self.show_row_idx_names: - element_props.append(('thead tr:last-of-type th', - 'text-align', - 'right')) + element_props.append( + ("thead tr:last-of-type th", "text-align", "right") + ) else: - element_props.append(('thead th', - 'text-align', - 'right')) - template_mid = '\n\n'.join(map(lambda t: template_select % t, - element_props)) - template = dedent('\n'.join((template_first, - template_mid, - template_last))) + element_props.append(("thead th", "text-align", "right")) + template_mid = "\n\n".join(map(lambda t: template_select % t, element_props)) + template = dedent("\n".join((template_first, template_mid, template_last))) self.write(template) def render(self): - self.write('
<div>')
+        self.write("<div>")
         self.write_style()
         super().render()
-        self.write('</div>')
+        self.write("</div>
") return self.elements diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 33bc413e9c3fe5..dad099b747701e 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -26,11 +26,18 @@ class LatexFormatter(TableFormatter): HTMLFormatter """ - def __init__(self, formatter, column_format=None, longtable=False, - multicolumn=False, multicolumn_format=None, multirow=False): + def __init__( + self, + formatter, + column_format=None, + longtable=False, + multicolumn=False, + multicolumn_format=None, + multirow=False, + ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get('bold_rows', False) + self.bold_rows = self.fmt.kwds.get("bold_rows", False) self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn @@ -44,25 +51,28 @@ def write_result(self, buf): # string representation of the columns if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}' - .format(name=type(self.frame).__name__, - col=self.frame.columns, - idx=self.frame.index)) + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=self.frame.columns, + idx=self.frame.index, + ) strcols = [[info_line]] else: strcols = self.fmt._to_str_columns() def get_col_type(dtype): if issubclass(dtype.type, np.number): - return 'r' + return "r" else: - return 'l' + return "l" # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( - adjoin=False, sparsify=self.fmt.sparsify, - names=self.fmt.has_index_names, na_rep=self.fmt.na_rep + adjoin=False, + sparsify=self.fmt.sparsify, + names=self.fmt.has_index_names, + na_rep=self.fmt.na_rep, ) # index.format will sparsify repeated entries with empty strings @@ -71,17 +81,18 @@ def pad_empties(x): for pad in reversed(x): if pad: break - return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]] + return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] + out = (pad_empties(i) for i in out) # Add empty spaces for each column level clevels = self.frame.columns.nlevels - out = [[' ' * len(i[-1])] * clevels + i for i in out] + out = [[" " * len(i[-1])] * clevels + i for i in out] # Add the column names to the last index column cnames = self.frame.columns.names if any(cnames): - new_names = [i if i else '{}' for i in cnames] + new_names = [i if i else "{}" for i in cnames] out[self.frame.index.nlevels - 1][:clevels] = new_names # Get rid of old multiindex column and add new ones @@ -90,22 +101,22 @@ def pad_empties(x): column_format = self.column_format if column_format is None: dtypes = self.frame.dtypes._values - column_format = ''.join(map(get_col_type, dtypes)) + column_format = "".join(map(get_col_type, dtypes)) if self.fmt.index: - index_format = 'l' * self.frame.index.nlevels + index_format = "l" * self.frame.index.nlevels column_format = index_format + column_format elif not isinstance(column_format, str): # pragma: no cover - raise AssertionError('column_format must be str or unicode, ' - 'not {typ}'.format(typ=type(column_format))) + raise AssertionError( + "column_format must be str or unicode, " + "not {typ}".format(typ=type(column_format)) + ) if not self.longtable: - buf.write('\\begin{{tabular}}{{{fmt}}}\n' - .format(fmt=column_format)) - buf.write('\\toprule\n') + buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) + buf.write("\\toprule\n") 
else: - buf.write('\\begin{{longtable}}{{{fmt}}}\n' - .format(fmt=column_format)) - buf.write('\\toprule\n') + buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) + buf.write("\\toprule\n") ilevels = self.frame.index.nlevels clevels = self.frame.columns.nlevels @@ -117,50 +128,63 @@ def pad_empties(x): for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: - buf.write('\\midrule\n') # End of header + buf.write("\\midrule\n") # End of header if self.longtable: - buf.write('\\endhead\n') - buf.write('\\midrule\n') - buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next ' - 'page}}}} \\\\\n'.format(n=len(row))) - buf.write('\\midrule\n') - buf.write('\\endfoot\n\n') - buf.write('\\bottomrule\n') - buf.write('\\endlastfoot\n') - if self.fmt.kwds.get('escape', True): + buf.write("\\endhead\n") + buf.write("\\midrule\n") + buf.write( + "\\multicolumn{{{n}}}{{r}}{{{{Continued on next " + "page}}}} \\\\\n".format(n=len(row)) + ) + buf.write("\\midrule\n") + buf.write("\\endfoot\n\n") + buf.write("\\bottomrule\n") + buf.write("\\endlastfoot\n") + if self.fmt.kwds.get("escape", True): # escape backslashes first - crow = [(x.replace('\\', '\\textbackslash ') - .replace('_', '\\_') - .replace('%', '\\%').replace('$', '\\$') - .replace('#', '\\#').replace('{', '\\{') - .replace('}', '\\}').replace('~', '\\textasciitilde ') - .replace('^', '\\textasciicircum ') - .replace('&', '\\&') - if (x and x != '{}') else '{}') for x in row] + crow = [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] else: - crow = [x if x else '{}' for x in row] + crow = [x if x else "{}" for x in row] if self.bold_rows and self.fmt.index: # bold row labels - crow = ['\\textbf{{{x}}}'.format(x=x) - if j < ilevels and x.strip() not in ['', '{}'] else x - for j, x in enumerate(crow)] + crow = [ + "\\textbf{{{x}}}".format(x=x) + if j < ilevels and x.strip() not in ["", "{}"] + else x + for j, x in enumerate(crow) + ] if i < clevels and self.fmt.header and self.multicolumn: # sum up columns to multicolumns crow = self._format_multicolumn(crow, ilevels) - if (i >= nlevels and self.fmt.index and self.multirow and - ilevels > 1): + if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: # sum up rows to multirows crow = self._format_multirow(crow, ilevels, i, strrows) - buf.write(' & '.join(crow)) - buf.write(' \\\\\n') + buf.write(" & ".join(crow)) + buf.write(" \\\\\n") if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) if not self.longtable: - buf.write('\\bottomrule\n') - buf.write('\\end{tabular}\n') + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") else: - buf.write('\\end{longtable}\n') + buf.write("\\end{longtable}\n") def _format_multicolumn(self, row, ilevels): r""" @@ -174,17 +198,20 @@ def _format_multicolumn(self, row, ilevels): """ row2 = list(row[:ilevels]) ncol = 1 - coltext = '' + coltext = "" def append_col(): # write multicolumn if needed if ncol > 1: - row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}' - .format(ncol=ncol, fmt=self.multicolumn_format, - txt=coltext.strip())) + row2.append( + "\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format( + ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip() + ) + ) # don't 
modify where not needed else: row2.append(coltext) + for c in row[ilevels:]: # if next col has text, write the previous if c.strip(): @@ -213,15 +240,16 @@ def _format_multirow(self, row, ilevels, i, rows): for j in range(ilevels): if row[j].strip(): nrow = 1 - for r in rows[i + 1:]: + for r in rows[i + 1 :]: if not r[j].strip(): nrow += 1 else: break if nrow > 1: # overwrite non-multirow entry - row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format( - nrow=nrow, row=row[j].strip()) + row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format( + nrow=nrow, row=row[j].strip() + ) # save when to end the current block with \cline self.clinebuf.append([i + nrow - 1, j + 1]) return row @@ -232,7 +260,6 @@ def _print_cline(self, buf, i, icol): """ for cl in self.clinebuf: if cl[0] == i: - buf.write('\\cline{{{cl:d}-{icol:d}}}\n' - .format(cl=cl[1], icol=icol)) + buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol)) # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 73d8586a0a8c9a..4958d8246610e9 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -25,8 +25,8 @@ def adjoin(space, *lists, **kwargs): justfunc : callable function used to justify str. Needed for unicode handling. """ - strlen = kwargs.pop('strlen', len) - justfunc = kwargs.pop('justfunc', justify) + strlen = kwargs.pop("strlen", len) + justfunc = kwargs.pop("justfunc", justify) out_lines = [] newLists = [] @@ -35,34 +35,33 @@ def adjoin(space, *lists, **kwargs): lengths.append(max(map(len, lists[-1]))) maxLen = max(map(len, lists)) for i, lst in enumerate(lists): - nl = justfunc(lst, lengths[i], mode='left') - nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) + nl = justfunc(lst, lengths[i], mode="left") + nl.extend([" " * lengths[i]] * (maxLen - len(lst))) newLists.append(nl) toJoin = zip(*newLists) for lines in toJoin: out_lines.append(_join_unicode(lines)) - return _join_unicode(out_lines, sep='\n') + return _join_unicode(out_lines, sep="\n") -def justify(texts, max_len, mode='right'): +def justify(texts, max_len, mode="right"): """ Perform ljust, center, rjust against string or list-like """ - if mode == 'left': + if mode == "left": return [x.ljust(max_len) for x in texts] - elif mode == 'center': + elif mode == "center": return [x.center(max_len) for x in texts] else: return [x.rjust(max_len) for x in texts] -def _join_unicode(lines, sep=''): +def _join_unicode(lines, sep=""): try: return sep.join(lines) except UnicodeDecodeError: sep = str(sep) - return sep.join([x.decode('utf-8') if isinstance(x, str) else x - for x in lines]) + return sep.join([x.decode("utf-8") if isinstance(x, str) else x for x in lines]) # Unicode consolidation @@ -99,7 +98,7 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): if isinstance(seq, set): fmt = "{{{body}}}" else: - fmt = "[{body}]" if hasattr(seq, '__setitem__') else "({body})" + fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" if max_seq_items is False: nitems = len(seq) @@ -108,15 +107,16 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): s = iter(seq) # handle sets, no slicing - r = [pprint_thing(next(s), - _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) - for i in range(min(nitems, len(seq)))] + r = [ + pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq))) + ] body = ", ".join(r) if nitems < len(seq): 
body += ", ..." elif isinstance(seq, tuple) and len(seq) == 1: - body += ',' + body += "," return fmt.format(body=body) @@ -139,10 +139,10 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): for k, v in list(seq.items())[:nitems]: pairs.append( pfmt.format( - key=pprint_thing(k, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds), - val=pprint_thing(v, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds))) + key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + ) + ) if nitems < len(seq): return fmt.format(things=", ".join(pairs) + ", ...") @@ -150,8 +150,14 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): return fmt.format(things=", ".join(pairs)) -def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, - quote_strings=False, max_seq_items=None): +def pprint_thing( + thing, + _nest_lvl=0, + escape_chars=None, + default_escapes=False, + quote_strings=False, + max_seq_items=None, +): """ This function is the sanctioned way of converting objects to a unicode representation. @@ -188,9 +194,9 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): result = str(thing) # we should try this first except UnicodeDecodeError: # either utf-8 or we replace errors - result = str(thing).decode('utf-8', "replace") + result = str(thing).decode("utf-8", "replace") - translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', } + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: translate.update(escape_chars) @@ -204,17 +210,22 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return str(result) - if hasattr(thing, '__next__'): + if hasattr(thing, "__next__"): return str(thing) - elif (isinstance(thing, dict) and - _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_dict(thing, _nest_lvl, quote_strings=True, - max_seq_items=max_seq_items) - elif (is_sequence(thing) and - _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, - quote_strings=quote_strings, - max_seq_items=max_seq_items) + elif isinstance(thing, dict) and _nest_lvl < get_option( + "display.pprint_nest_depth" + ): + result = _pprint_dict( + thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items + ) + elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"): + result = _pprint_seq( + thing, + _nest_lvl, + escape_chars=escape_chars, + quote_strings=quote_strings, + max_seq_items=max_seq_items, + ) elif isinstance(thing, str) and quote_strings: result = "'{thing}'".format(thing=as_escaped_unicode(thing)) else: @@ -223,16 +234,17 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return str(result) # always unicode -def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds): +def pprint_thing_encoded(object, encoding="utf-8", errors="replace", **kwds): value = pprint_thing(object) # get unicode representation of object return value.encode(encoding, errors, **kwds) def _enable_data_resource_formatter(enable): - if 'IPython' not in sys.modules: + if "IPython" not in sys.modules: # definitely not in IPython return from IPython import get_ipython + ip = get_ipython() if ip is None: # still not in IPython @@ -247,8 +259,9 @@ def _enable_data_resource_formatter(enable): from IPython.core.formatters import BaseFormatter class TableSchemaFormatter(BaseFormatter): - print_method = 
'_repr_data_resource_' + print_method = "_repr_data_resource_" _return_type = (dict,) + # register it: formatters[mimetype] = TableSchemaFormatter() # enable it if it's been disabled: @@ -259,13 +272,19 @@ class TableSchemaFormatter(BaseFormatter): formatters[mimetype].enabled = False -default_pprint = lambda x, max_seq_items=None: \ - pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True, - max_seq_items=max_seq_items) +default_pprint = lambda x, max_seq_items=None: pprint_thing( + x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items +) -def format_object_summary(obj, formatter, is_justify=True, name=None, - indent_for_name=True, line_break_each_value=False): +def format_object_summary( + obj, + formatter, + is_justify=True, + name=None, + indent_for_name=True, + line_break_each_value=False, +): """ Return the formatted obj as a unicode string @@ -299,14 +318,14 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, display_width, _ = get_console_size() if display_width is None: - display_width = get_option('display.width') or 80 + display_width = get_option("display.width") or 80 if name is None: name = obj.__class__.__name__ if indent_for_name: name_len = len(name) - space1 = "\n%s" % (' ' * (name_len + 1)) - space2 = "\n%s" % (' ' * (name_len + 2)) + space1 = "\n%s" % (" " * (name_len + 1)) + space2 = "\n%s" % (" " * (name_len + 2)) else: space1 = "\n" space2 = "\n " # space for the opening '[' @@ -315,10 +334,10 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, if line_break_each_value: # If we want to vertically align on each value of obj, we need to # separate values by a line break and indent the values - sep = ',\n ' + ' ' * len(name) + sep = ",\n " + " " * len(name) else: - sep = ',' - max_seq_items = get_option('display.max_seq_items') or n + sep = "," + max_seq_items = get_option("display.max_seq_items") or n # are we a truncated display is_truncated = n > max_seq_items @@ -328,8 +347,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, def _extend_line(s, line, value, display_width, next_line_prefix): - if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >= - display_width): + if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: s += line.rstrip() line = next_line_prefix line += value @@ -341,17 +359,17 @@ def best_len(values): else: return 0 - close = ', ' + close = ", " if n == 0: - summary = '[]{}'.format(close) + summary = "[]{}".format(close) elif n == 1 and not line_break_each_value: first = formatter(obj[0]) - summary = '[{}]{}'.format(first, close) + summary = "[{}]{}".format(first, close) elif n == 2 and not line_break_each_value: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = '[{}, {}]{}'.format(first, last, close) + summary = "[{}, {}]{}".format(first, last, close) else: if n > max_seq_items: @@ -369,8 +387,10 @@ def best_len(values): # strings will right align when head and tail are stacked # vertically. 
head, tail = _justify(head, tail) - elif (is_truncated or not (len(', '.join(head)) < display_width and - len(', '.join(tail)) < display_width)): + elif is_truncated or not ( + len(", ".join(head)) < display_width + and len(", ".join(tail)) < display_width + ): # Each string in head and tail should align with each other max_length = max(best_len(head), best_len(tail)) head = [x.rjust(max_length) for x in head] @@ -396,37 +416,34 @@ def best_len(values): line = space2 for max_items in range(len(head)): - word = head[max_items] + sep + ' ' - summary, line = _extend_line(summary, line, word, - display_width, space2) + word = head[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) if is_truncated: # remove trailing space of last line - summary += line.rstrip() + space2 + '...' + summary += line.rstrip() + space2 + "..." line = space2 for max_items in range(len(tail) - 1): - word = tail[max_items] + sep + ' ' - summary, line = _extend_line(summary, line, word, - display_width, space2) + word = tail[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) # last value: no sep added + 1 space of width used for trailing ',' - summary, line = _extend_line(summary, line, tail[-1], - display_width - 2, space2) + summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) summary += line # right now close is either '' or ', ' # Now we want to include the ']', but not the maybe space. - close = ']' + close.rstrip(' ') + close = "]" + close.rstrip(" ") summary += close if len(summary) > (display_width) or line_break_each_value: summary += space1 else: # one row - summary += ' ' + summary += " " # remove initial space - summary = '[' + summary[len(space2):] + summary = "[" + summary[len(space2) :] return summary @@ -461,10 +478,12 @@ def _justify(head, tail): max_length = [max(x, y) for x, y in zip(max_length, length)] # justify each item in each list-like in head and tail using max_length - head = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) - for seq in head] - tail = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) - for seq in tail] + head = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head + ] + tail = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail + ] return head, tail @@ -486,13 +505,13 @@ def format_object_attrs(obj, include_dtype=True): """ attrs = [] - if hasattr(obj, 'dtype') and include_dtype: - attrs.append(('dtype', "'{}'".format(obj.dtype))) - if getattr(obj, 'name', None) is not None: - attrs.append(('name', default_pprint(obj.name))) - elif getattr(obj, 'names', None) is not None and any(obj.names): - attrs.append(('names', default_pprint(obj.names))) - max_seq_items = get_option('display.max_seq_items') or len(obj) + if hasattr(obj, "dtype") and include_dtype: + attrs.append(("dtype", "'{}'".format(obj.dtype))) + if getattr(obj, "name", None) is not None: + attrs.append(("name", default_pprint(obj.name))) + elif getattr(obj, "names", None) is not None and any(obj.names): + attrs.append(("names", default_pprint(obj.names))) + max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: - attrs.append(('length', len(obj))) + attrs.append(("length", len(obj))) return attrs diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0d9b5fe4314a3f..e7aa5d22995c66 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py 
@@ -26,14 +26,13 @@ from pandas.core.generic import _shared_docs from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice -jinja2 = import_optional_dependency( - "jinja2", extra="DataFrame.style requires jinja2." -) +jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") try: import matplotlib.pyplot as plt from matplotlib import colors + has_mpl = True except ImportError: has_mpl = False @@ -108,15 +107,21 @@ class Styler: * Blank cells include ``blank`` * Data cells include ``data`` """ + loader = jinja2.PackageLoader("pandas", "io/formats/templates") - env = jinja2.Environment( - loader=loader, - trim_blocks=True, - ) + env = jinja2.Environment(loader=loader, trim_blocks=True) template = env.get_template("html.tpl") - def __init__(self, data, precision=None, table_styles=None, uuid=None, - caption=None, table_attributes=None, cell_ids=True): + def __init__( + self, + data, + precision=None, + table_styles=None, + uuid=None, + caption=None, + table_attributes=None, + cell_ids=True, + ): self.ctx = defaultdict(list) self._todo = [] @@ -135,7 +140,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self.table_styles = table_styles self.caption = caption if precision is None: - precision = get_option('display.precision') + precision = get_option("display.precision") self.precision = precision self.table_attributes = table_attributes self.hidden_index = False @@ -146,7 +151,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, def default_display_func(x): if is_float(x): - return '{:>.{precision}g}'.format(x, precision=self.precision) + return "{:>.{precision}g}".format(x, precision=self.precision) else: return x @@ -158,29 +163,59 @@ def _repr_html_(self): """ return self.render() - @Appender(_shared_docs['to_excel'] % dict( - axes='index, columns', klass='Styler', - axes_single_arg="{0 or 'index', 1 or 'columns'}", - optional_by=""" + @Appender( + _shared_docs["to_excel"] + % dict( + axes="index, columns", + klass="Styler", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + optional_by=""" by : str or list of str Name or list of names which refer to the axis items.""", - versionadded_to_excel='\n .. versionadded:: 0.20')) - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True, - freeze_panes=None): + versionadded_to_excel="\n .. 
versionadded:: 0.20", + ) + ) + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) + + formatter = ExcelFormatter( + self, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) def _translate(self): """ @@ -227,29 +262,43 @@ def format_attr(pair): for r in range(n_clvls): # Blank for Index columns... - row_es = [{"type": "th", - "value": BLANK_VALUE, - "display_value": BLANK_VALUE, - "is_visible": not hidden_index, - "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1) + row_es = [ + { + "type": "th", + "value": BLANK_VALUE, + "display_value": BLANK_VALUE, + "is_visible": not hidden_index, + "class": " ".join([BLANK_CLASS]), + } + ] * (n_rlvls - 1) # ... except maybe the last for columns.names name = self.data.columns.names[r] - cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, - "level{lvl}".format(lvl=r)] + cs = [ + BLANK_CLASS if name is None else INDEX_NAME_CLASS, + "level{lvl}".format(lvl=r), + ] name = BLANK_VALUE if name is None else name - row_es.append({"type": "th", - "value": name, - "display_value": name, - "class": " ".join(cs), - "is_visible": not hidden_index}) + row_es.append( + { + "type": "th", + "value": name, + "display_value": name, + "class": " ".join(cs), + "is_visible": not hidden_index, + } + ) if clabels: for c, value in enumerate(clabels[r]): - cs = [COL_HEADING_CLASS, "level{lvl}".format(lvl=r), - "col{col}".format(col=c)] - cs.extend(cell_context.get( - "col_headings", {}).get(r, {}).get(c, [])) + cs = [ + COL_HEADING_CLASS, + "level{lvl}".format(lvl=r), + "col{col}".format(col=c), + ] + cs.extend( + cell_context.get("col_headings", {}).get(r, {}).get(c, []) + ) es = { "type": "th", "value": value, @@ -265,23 +314,24 @@ def format_attr(pair): row_es.append(es) head.append(row_es) - if (self.data.index.names and - com._any_not_none(*self.data.index.names) and - not hidden_index): + if ( + self.data.index.names + and com._any_not_none(*self.data.index.names) + and not hidden_index + ): index_header_row = [] for c, name in enumerate(self.data.index.names): - cs = [INDEX_NAME_CLASS, - "level{lvl}".format(lvl=c)] - name = '' if name is None else name - index_header_row.append({"type": "th", "value": name, - "class": " ".join(cs)}) + cs = [INDEX_NAME_CLASS, "level{lvl}".format(lvl=c)] + name = "" if name is None else name + index_header_row.append( + {"type": "th", "value": name, "class": " ".join(cs)} + ) index_header_row.extend( - [{"type": "th", - "value": BLANK_VALUE, - "class": " ".join([BLANK_CLASS]) - }] * (len(clabels[0]) - len(hidden_columns))) + [{"type": "th", "value": BLANK_VALUE, "class": " ".join([BLANK_CLASS])}] + * (len(clabels[0]) - len(hidden_columns)) + ) 
head.append(index_header_row) @@ -289,16 +339,18 @@ def format_attr(pair): for r, idx in enumerate(self.data.index): row_es = [] for c, value in enumerate(rlabels[r]): - rid = [ROW_HEADING_CLASS, "level{lvl}".format(lvl=c), - "row{row}".format(row=r)] + rid = [ + ROW_HEADING_CLASS, + "level{lvl}".format(lvl=c), + "row{row}".format(row=r), + ] es = { "type": "th", - "is_visible": (_is_visible(r, c, idx_lengths) and - not hidden_index), + "is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index), "value": value, "display_value": value, "id": "_".join(rid[1:]), - "class": " ".join(rid) + "class": " ".join(rid), } rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: @@ -308,19 +360,19 @@ def format_attr(pair): row_es.append(es) for c, col in enumerate(self.data.columns): - cs = [DATA_CLASS, "row{row}".format(row=r), - "col{col}".format(col=c)] + cs = [DATA_CLASS, "row{row}".format(row=r), "col{col}".format(col=c)] cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] - row_dict = {"type": "td", - "value": value, - "class": " ".join(cs), - "display_value": formatter(value), - "is_visible": (c not in hidden_columns)} + row_dict = { + "type": "td", + "value": value, + "class": " ".join(cs), + "display_value": formatter(value), + "is_visible": (c not in hidden_columns), + } # only add an id if the cell has a style - if (self.cell_ids or - not(len(ctx[r, c]) == 1 and ctx[r, c][0] == '')): + if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): row_dict["id"] = "_".join(cs[1:]) row_es.append(row_dict) props = [] @@ -329,25 +381,34 @@ def format_attr(pair): if x.count(":"): props.append(x.split(":")) else: - props.append(['', '']) - cellstyle.append({'props': props, - 'selector': "row{row}_col{col}" - .format(row=r, col=c)}) + props.append(["", ""]) + cellstyle.append( + { + "props": props, + "selector": "row{row}_col{col}".format(row=r, col=c), + } + ) body.append(row_es) table_attr = self.table_attributes use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: - table_attr = table_attr or '' + table_attr = table_attr or "" if 'class="' in table_attr: - table_attr = table_attr.replace('class="', - 'class="tex2jax_ignore ') + table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') else: table_attr += ' class="tex2jax_ignore"' - return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, - precision=precision, table_styles=table_styles, - caption=caption, table_attributes=table_attr) + return dict( + head=head, + cellstyle=cellstyle, + body=body, + uuid=uuid, + precision=precision, + table_styles=table_styles, + caption=caption, + table_attributes=table_attr, + ) def format(self, formatter, subset=None): """ @@ -460,9 +521,8 @@ def render(self, **kwargs): # filter out empty styles, every cell will have a class # but the list of props may just be [['', '']]. 
# so we have the neested anys below - trimmed = [x for x in d['cellstyle'] - if any(any(y) for y in x['props'])] - d['cellstyle'] = trimmed + trimmed = [x for x in d["cellstyle"] if any(any(y) for y in x["props"])] + d["cellstyle"] = trimmed d.update(kwargs) return self.template.render(**d) @@ -485,9 +545,13 @@ def _update_ctx(self, attrs): self.ctx[(i, j)].append(pair) def _copy(self, deepcopy=False): - styler = Styler(self.data, precision=self.precision, - caption=self.caption, uuid=self.uuid, - table_styles=self.table_styles) + styler = Styler( + self.data, + precision=self.precision, + caption=self.caption, + uuid=self.uuid, + table_styles=self.table_styles, + ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) styler._todo = copy.deepcopy(self._todo) @@ -532,30 +596,34 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, - result_type='expand', **kwargs) + result = data.apply(func, axis=axis, result_type="expand", **kwargs) result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): raise TypeError( "Function {func!r} must return a DataFrame when " - "passed to `Styler.apply` with axis=None" - .format(func=func)) - if not (result.index.equals(data.index) and - result.columns.equals(data.columns)): - msg = ('Result of {func!r} must have identical index and ' - 'columns as the input'.format(func=func)) + "passed to `Styler.apply` with axis=None".format(func=func) + ) + if not ( + result.index.equals(data.index) and result.columns.equals(data.columns) + ): + msg = ( + "Result of {func!r} must have identical index and " + "columns as the input".format(func=func) + ) raise ValueError(msg) result_shape = result.shape expected_shape = self.data.loc[subset].shape if result_shape != expected_shape: - msg = ("Function {func!r} returned the wrong shape.\n" - "Result has shape: {res}\n" - "Expected shape: {expect}".format(func=func, - res=result.shape, - expect=expected_shape)) + msg = ( + "Function {func!r} returned the wrong shape.\n" + "Result has shape: {res}\n" + "Expected shape: {expect}".format( + func=func, res=result.shape, expect=expected_shape + ) + ) raise ValueError(msg) self._update_ctx(result) return self @@ -605,8 +673,9 @@ def apply(self, func, axis=0, subset=None, **kwargs): >>> df = pd.DataFrame(np.random.randn(5, 2)) >>> df.style.apply(highlight_max) """ - self._todo.append((lambda instance: getattr(instance, '_apply'), - (func, axis, subset), kwargs)) + self._todo.append( + (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + ) return self def _applymap(self, func, subset=None, **kwargs): @@ -641,8 +710,9 @@ def applymap(self, func, subset=None, **kwargs): -------- Styler.where """ - self._todo.append((lambda instance: getattr(instance, '_applymap'), - (func, subset), kwargs)) + self._todo.append( + (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) + ) return self def where(self, cond, value, other=None, subset=None, **kwargs): @@ -677,10 +747,11 @@ def where(self, cond, value, other=None, subset=None, **kwargs): """ if other is None: - other = '' + other = "" - return self.applymap(lambda val: value if cond(val) else other, - subset=subset, **kwargs) + return self.applymap( + lambda val: value if cond(val) else other, subset=subset, **kwargs + ) def set_precision(self, precision): """ @@ -858,10 +929,11 @@ def hide_columns(self, subset): @staticmethod 
def _highlight_null(v, null_color): - return ('background-color: {color}'.format(color=null_color) - if pd.isna(v) else '') + return ( + "background-color: {color}".format(color=null_color) if pd.isna(v) else "" + ) - def highlight_null(self, null_color='red'): + def highlight_null(self, null_color="red"): """ Shade the background ``null_color`` for missing values. @@ -876,8 +948,15 @@ def highlight_null(self, null_color='red'): self.applymap(self._highlight_null, null_color=null_color) return self - def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, - subset=None, text_color_threshold=0.408): + def background_gradient( + self, + cmap="PuBu", + low=0, + high=0, + axis=0, + subset=None, + text_color_threshold=0.408, + ): """ Color the background in a gradient according to the data in each column (optionally row). @@ -921,19 +1000,26 @@ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, """ subset = _maybe_numeric_slice(self.data, subset) subset = _non_reducing_slice(subset) - self.apply(self._background_gradient, cmap=cmap, subset=subset, - axis=axis, low=low, high=high, - text_color_threshold=text_color_threshold) + self.apply( + self._background_gradient, + cmap=cmap, + subset=subset, + axis=axis, + low=low, + high=high, + text_color_threshold=text_color_threshold, + ) return self @staticmethod - def _background_gradient(s, cmap='PuBu', low=0, high=0, - text_color_threshold=0.408): + def _background_gradient(s, cmap="PuBu", low=0, high=0, text_color_threshold=0.408): """ Color background in a range according to the data. """ - if (not isinstance(text_color_threshold, (float, int)) or - not 0 <= text_color_threshold <= 1): + if ( + not isinstance(text_color_threshold, (float, int)) + or not 0 <= text_color_threshold <= 1 + ): msg = "`text_color_threshold` must be a value from 0 to 1." raise ValueError(msg) @@ -971,8 +1057,8 @@ def relative_luminance(rgba): def css(rgba): dark = relative_luminance(rgba) < text_color_threshold - text_color = '#f1f1f1' if dark else '#000000' - return 'background-color: {b};color: {c};'.format( + text_color = "#f1f1f1" if dark else "#000000" + return "background-color: {b};color: {c};".format( b=colors.rgb2hex(rgba), c=text_color ) @@ -981,7 +1067,8 @@ def css(rgba): else: return pd.DataFrame( [[css(rgba) for rgba in row] for row in rgbas], - index=s.index, columns=s.columns + index=s.index, + columns=s.columns, ) def set_properties(self, subset=None, **kwargs): @@ -1006,8 +1093,7 @@ def set_properties(self, subset=None, **kwargs): >>> df.style.set_properties(color="white", align="right") >>> df.style.set_properties(**{'background-color': 'yellow'}) """ - values = ';'.join('{p}: {v}'.format(p=p, v=v) - for p, v in kwargs.items()) + values = ";".join("{p}: {v}".format(p=p, v=v) for p, v in kwargs.items()) f = lambda x: values return self.applymap(f, subset=subset) @@ -1023,10 +1109,10 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): smax = s.max() if vmax is None else vmax if isinstance(smax, ABCSeries): smax = smax.max() - if align == 'mid': + if align == "mid": smin = min(0, smin) smax = max(0, smax) - elif align == 'zero': + elif align == "zero": # For "zero" mode, we want the range to be symmetrical around zero. smax = max(abs(smin), abs(smax)) smin = -smax @@ -1038,26 +1124,26 @@ def css_bar(start, end, color): """ Generate CSS code to draw a bar from start to end. 
""" - css = 'width: 10em; height: 80%;' + css = "width: 10em; height: 80%;" if end > start: - css += 'background: linear-gradient(90deg,' + css += "background: linear-gradient(90deg," if start > 0: - css += ' transparent {s:.1f}%, {c} {s:.1f}%, '.format( + css += " transparent {s:.1f}%, {c} {s:.1f}%, ".format( s=start, c=color ) - css += '{c} {e:.1f}%, transparent {e:.1f}%)'.format( - e=min(end, width), c=color, + css += "{c} {e:.1f}%, transparent {e:.1f}%)".format( + e=min(end, width), c=color ) return css def css(x): if pd.isna(x): - return '' + return "" # avoid deprecated indexing `colors[x > zero]` color = colors[1] if x > zero else colors[0] - if align == 'left': + if align == "left": return css_bar(0, x, color) else: return css_bar(min(x, zero), max(x, zero), color) @@ -1067,11 +1153,20 @@ def css(x): else: return pd.DataFrame( [[css(x) for x in row] for row in normed], - index=s.index, columns=s.columns + index=s.index, + columns=s.columns, ) - def bar(self, subset=None, axis=0, color='#d65f5f', width=100, - align='left', vmin=None, vmax=None): + def bar( + self, + subset=None, + axis=0, + color="#d65f5f", + width=100, + align="left", + vmin=None, + vmax=None, + ): """ Draw bar chart in the cell backgrounds. @@ -1120,7 +1215,7 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100, ------- self : Styler """ - if align not in ('left', 'zero', 'mid'): + if align not in ("left", "zero", "mid"): raise ValueError("`align` must be one of {'left', 'zero',' mid'}") if not (is_list_like(color)): @@ -1128,19 +1223,28 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100, elif len(color) == 1: color = [color[0], color[0]] elif len(color) > 2: - raise ValueError("`color` must be string or a list-like" - " of length 2: [`color_neg`, `color_pos`]" - " (eg: color=['#d65f5f', '#5fba7d'])") + raise ValueError( + "`color` must be string or a list-like" + " of length 2: [`color_neg`, `color_pos`]" + " (eg: color=['#d65f5f', '#5fba7d'])" + ) subset = _maybe_numeric_slice(self.data, subset) subset = _non_reducing_slice(subset) - self.apply(self._bar, subset=subset, axis=axis, - align=align, colors=color, width=width, - vmin=vmin, vmax=vmax) + self.apply( + self._bar, + subset=subset, + axis=axis, + align=align, + colors=color, + width=width, + vmin=vmin, + vmax=vmax, + ) return self - def highlight_max(self, subset=None, color='yellow', axis=0): + def highlight_max(self, subset=None, color="yellow", axis=0): """ Highlight the maximum by shading the background. @@ -1158,10 +1262,9 @@ def highlight_max(self, subset=None, color='yellow', axis=0): ------- self : Styler """ - return self._highlight_handler(subset=subset, color=color, axis=axis, - max_=True) + return self._highlight_handler(subset=subset, color=color, axis=axis, max_=True) - def highlight_min(self, subset=None, color='yellow', axis=0): + def highlight_min(self, subset=None, color="yellow", axis=0): """ Highlight the minimum by shading the background. 
@@ -1179,35 +1282,37 @@ def highlight_min(self, subset=None, color='yellow', axis=0): ------- self : Styler """ - return self._highlight_handler(subset=subset, color=color, axis=axis, - max_=False) + return self._highlight_handler( + subset=subset, color=color, axis=axis, max_=False + ) - def _highlight_handler(self, subset=None, color='yellow', axis=None, - max_=True): + def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) - self.apply(self._highlight_extrema, color=color, axis=axis, - subset=subset, max_=max_) + self.apply( + self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ + ) return self @staticmethod - def _highlight_extrema(data, color='yellow', max_=True): + def _highlight_extrema(data, color="yellow", max_=True): """ Highlight the min or max in a Series or DataFrame. """ - attr = 'background-color: {0}'.format(color) + attr = "background-color: {0}".format(color) if data.ndim == 1: # Series from .apply if max_: extrema = data == data.max() else: extrema = data == data.min() - return [attr if v else '' for v in extrema] + return [attr if v else "" for v in extrema] else: # DataFrame from .tee if max_: extrema = data == data.max().max() else: extrema = data == data.min().min() - return pd.DataFrame(np.where(extrema, attr, ''), - index=data.index, columns=data.columns) + return pd.DataFrame( + np.where(extrema, attr, ""), index=data.index, columns=data.columns + ) @classmethod def from_custom_template(cls, searchpath, name): @@ -1227,10 +1332,7 @@ def from_custom_template(cls, searchpath, name): MyStyler : subclass of Styler Has the correct ``env`` and ``template`` class attributes set. """ - loader = jinja2.ChoiceLoader([ - jinja2.FileSystemLoader(searchpath), - cls.loader, - ]) + loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) class MyStyler(cls): env = jinja2.Environment(loader=loader) @@ -1333,27 +1435,28 @@ def _get_level_lengths(index, hidden_elements=None): lengths = {} if index.nlevels == 1: for i, value in enumerate(levels): - if(i not in hidden_elements): + if i not in hidden_elements: lengths[(0, i)] = 1 return lengths for i, lvl in enumerate(levels): for j, row in enumerate(lvl): - if not get_option('display.multi_sparse'): + if not get_option("display.multi_sparse"): lengths[(i, j)] = 1 elif (row != sentinel) and (j not in hidden_elements): last_label = j lengths[(i, last_label)] = 1 - elif (row != sentinel): + elif row != sentinel: # even if its hidden, keep track of it in case # length >1 and later elements are visible last_label = j lengths[(i, last_label)] = 0 - elif(j not in hidden_elements): + elif j not in hidden_elements: lengths[(i, last_label)] += 1 non_zero_lengths = { - element: length for element, length in lengths.items() if length >= 1} + element: length for element, length in lengths.items() if length >= 1 + } return non_zero_lengths @@ -1364,6 +1467,8 @@ def _maybe_wrap_formatter(formatter): elif callable(formatter): return formatter else: - msg = ("Expected a template string or callable, got {formatter} " - "instead".format(formatter=formatter)) + msg = ( + "Expected a template string or callable, got {formatter} " + "instead".format(formatter=formatter) + ) raise TypeError(msg) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index a9eff003f22491..d29078cad93188 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -9,17 +9,25 @@ def _try_import(): "pandas-gbq is required to load data from Google 
BigQuery. " "See the docs: https://pandas-gbq.readthedocs.io." ) - pandas_gbq = import_optional_dependency( - "pandas_gbq", - extra=msg, - ) + pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg) return pandas_gbq -def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, auth_local_webserver=False, dialect=None, - location=None, configuration=None, credentials=None, - use_bqstorage_api=None, private_key=None, verbose=None): +def read_gbq( + query, + project_id=None, + index_col=None, + col_order=None, + reauth=False, + auth_local_webserver=False, + dialect=None, + location=None, + configuration=None, + credentials=None, + use_bqstorage_api=None, + private_key=None, + verbose=None, +): """ Load data from Google BigQuery. @@ -155,21 +163,48 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, # END: deprecated kwargs return pandas_gbq.read_gbq( - query, project_id=project_id, index_col=index_col, - col_order=col_order, reauth=reauth, - auth_local_webserver=auth_local_webserver, dialect=dialect, - location=location, configuration=configuration, - credentials=credentials, **kwargs) + query, + project_id=project_id, + index_col=index_col, + col_order=col_order, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + dialect=dialect, + location=location, + configuration=configuration, + credentials=credentials, + **kwargs + ) -def to_gbq(dataframe, destination_table, project_id=None, chunksize=None, - reauth=False, if_exists='fail', auth_local_webserver=False, - table_schema=None, location=None, progress_bar=True, - credentials=None, verbose=None, private_key=None): +def to_gbq( + dataframe, + destination_table, + project_id=None, + chunksize=None, + reauth=False, + if_exists="fail", + auth_local_webserver=False, + table_schema=None, + location=None, + progress_bar=True, + credentials=None, + verbose=None, + private_key=None, +): pandas_gbq = _try_import() - pandas_gbq.to_gbq(dataframe, destination_table, project_id=project_id, - chunksize=chunksize, reauth=reauth, if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, location=location, - progress_bar=progress_bar, credentials=credentials, - verbose=verbose, private_key=private_key) + pandas_gbq.to_gbq( + dataframe, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, + private_key=private_key, + ) diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py index 862ccbb291c013..1f5e0faedc6d2d 100644 --- a/pandas/io/gcs.py +++ b/pandas/io/gcs.py @@ -2,18 +2,17 @@ from pandas.compat._optional import import_optional_dependency gcsfs = import_optional_dependency( - "gcsfs", - extra="The gcsfs library is required to handle GCS files" + "gcsfs", extra="The gcsfs library is required to handle GCS files" ) -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): if mode is None: - mode = 'rb' + mode = "rb" fs = gcsfs.GCSFileSystem() - filepath_or_buffer = fs.open( - filepath_or_buffer, mode) + filepath_or_buffer = fs.open(filepath_or_buffer, mode) return filepath_or_buffer, None, compression, True diff --git a/pandas/io/html.py b/pandas/io/html.py index f080e1d1fc1888..91f5e5a949ac32 100644 --- 
a/pandas/io/html.py +++ b/pandas/io/html.py @@ -35,16 +35,17 @@ def _importers(): return global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB - bs4 = import_optional_dependency("bs4", raise_on_missing=False, - on_version="ignore") + bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore") _HAS_BS4 = bs4 is not None - lxml = import_optional_dependency("lxml.etree", raise_on_missing=False, - on_version="ignore") + lxml = import_optional_dependency( + "lxml.etree", raise_on_missing=False, on_version="ignore" + ) _HAS_LXML = lxml is not None - html5lib = import_optional_dependency("html5lib", raise_on_missing=False, - on_version="ignore") + html5lib = import_optional_dependency( + "html5lib", raise_on_missing=False, on_version="ignore" + ) _HAS_HTML5LIB = html5lib is not None _IMPORTS = True @@ -53,7 +54,7 @@ def _importers(): ############# # READ HTML # ############# -_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}') +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") def _remove_whitespace(s, regex=_RE_WHITESPACE): @@ -72,7 +73,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): subd : str or unicode `s` with all extra whitespace replaced with a single space. """ - return regex.sub(' ', s.strip()) + return regex.sub(" ", s.strip()) def _get_skiprows(skiprows): @@ -100,8 +101,9 @@ def _get_skiprows(skiprows): return skiprows elif skiprows is None: return 0 - raise TypeError('%r is not a valid type for skipping rows' % - type(skiprows).__name__) + raise TypeError( + "%r is not a valid type for skipping rows" % type(skiprows).__name__ + ) def _read(obj): @@ -118,13 +120,13 @@ def _read(obj): if _is_url(obj): with urlopen(obj) as url: text = url.read() - elif hasattr(obj, 'read'): + elif hasattr(obj, "read"): text = obj.read() elif isinstance(obj, (str, bytes)): text = obj try: if os.path.isfile(text): - with open(text, 'rb') as f: + with open(text, "rb") as f: return f.read() except (TypeError, ValueError): pass @@ -397,8 +399,7 @@ def _parse_thead_tbody_tfoot(self, table_html): footer_rows = self._parse_tfoot_tr(table_html) def row_is_all_th(row): - return all(self._equals_tag(t, 'th') for t in - self._parse_td(row)) + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) if not header_rows: # The table has no . 
Move the top all- rows from @@ -449,14 +450,13 @@ def _expand_colspan_rowspan(self, rows): prev_i, prev_text, prev_rowspan = remainder.pop(0) texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) index += 1 # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - rowspan = int(self._attr_getter(td, 'rowspan') or 1) - colspan = int(self._attr_getter(td, 'colspan') or 1) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) for _ in range(colspan): texts.append(text) @@ -468,8 +468,7 @@ def _expand_colspan_rowspan(self, rows): for prev_i, prev_text, prev_rowspan in remainder: texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) all_texts.append(texts) remainder = next_remainder @@ -482,8 +481,7 @@ def _expand_colspan_rowspan(self, rows): for prev_i, prev_text, prev_rowspan in remainder: texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) all_texts.append(texts) remainder = next_remainder @@ -508,8 +506,12 @@ def _handle_hidden_tables(self, tbl_list, attr_name): if not self.displayed_only: return tbl_list - return [x for x in tbl_list if "display:none" not in - getattr(x, attr_name).get('style', '').replace(" ", "")] + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): @@ -529,14 +531,15 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) from bs4 import SoupStrainer - self._strainer = SoupStrainer('table') + + self._strainer = SoupStrainer("table") def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) if not tables: - raise ValueError('No tables found') + raise ValueError("No tables found") result = [] unique_tables = set() @@ -544,18 +547,17 @@ def _parse_tables(self, doc, match, attrs): for table in tables: if self.displayed_only: - for elem in table.find_all( - style=re.compile(r"display:\s*none")): + for elem in table.find_all(style=re.compile(r"display:\s*none")): elem.decompose() - if (table not in unique_tables and - table.find(text=match) is not None): + if table not in unique_tables and table.find(text=match) is not None: result.append(table) unique_tables.add(table) if not result: - raise ValueError("No tables found matching pattern {patt!r}" - .format(patt=match.pattern)) + raise ValueError( + "No tables found matching pattern {patt!r}".format(patt=match.pattern) + ) return result def _text_getter(self, obj): @@ -565,31 +567,32 @@ def _equals_tag(self, obj, tag): return obj.name == tag def _parse_td(self, row): - return row.find_all(('td', 'th'), recursive=False) + return row.find_all(("td", "th"), recursive=False) def _parse_thead_tr(self, table): - return table.select('thead tr') + return table.select("thead tr") def _parse_tbody_tr(self, table): - from_tbody = table.select('tbody tr') - from_root = table.find_all('tr', recursive=False) + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) # HTML spec: at most one of 
these lists has content return from_tbody + from_root def _parse_tfoot_tr(self, table): - return table.select('tfoot tr') + return table.select("tfoot tr") def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise ValueError('No text parsed from document: {doc}' - .format(doc=self.io)) + raise ValueError("No text parsed from document: {doc}".format(doc=self.io)) return raw_text def _build_doc(self): from bs4 import BeautifulSoup - return BeautifulSoup(self._setup_build_doc(), features='html5lib', - from_encoding=self.encoding) + + return BeautifulSoup( + self._setup_build_doc(), features="html5lib", from_encoding=self.encoding + ) def _build_xpath_expr(attrs): @@ -607,15 +610,15 @@ def _build_xpath_expr(attrs): An XPath expression that checks for the given HTML attributes. """ # give class attribute as class_ because class is a python keyword - if 'class_' in attrs: - attrs['class'] = attrs.pop('class_') + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") s = ["@{key}={val!r}".format(key=k, val=v) for k, v in attrs.items()] - return '[{expr}]'.format(expr=' and '.join(s)) + return "[{expr}]".format(expr=" and ".join(s)) -_re_namespace = {'re': 'http://exslt.org/regular-expressions'} -_valid_schemes = 'http', 'file', 'ftp' +_re_namespace = {"re": "http://exslt.org/regular-expressions"} +_valid_schemes = "http", "file", "ftp" class _LxmlFrameParser(_HtmlFrameParser): @@ -645,14 +648,14 @@ def _text_getter(self, obj): def _parse_td(self, row): # Look for direct children only: the "row" element here may be a # or (see _parse_thead_tr). - return row.xpath('./td|./th') + return row.xpath("./td|./th") def _parse_tables(self, doc, match, kwargs): pattern = match.pattern # 1. check all descendants for the given pattern and only search tables # 2. go up the tree until we find a table - query = '//table//*[re:test(text(), {patt!r})]/ancestor::table' + query = "//table//*[re:test(text(), {patt!r})]/ancestor::table" xpath_expr = query.format(patt=pattern) # if any table attributes were given build an xpath expression to @@ -668,14 +671,14 @@ def _parse_tables(self, doc, match, kwargs): # lxml utilizes XPATH 1.0 which does not have regex # support. As a result, we find all elements with a style # attribute and iterate them to check for display:none - for elem in table.xpath('.//*[@style]'): - if "display:none" in elem.attrib.get( - "style", "").replace(" ", ""): + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): elem.getparent().remove(elem) if not tables: - raise ValueError("No tables found matching regex {patt!r}" - .format(patt=pattern)) + raise ValueError( + "No tables found matching regex {patt!r}".format(patt=pattern) + ) return tables def _equals_tag(self, obj, tag): @@ -699,6 +702,7 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + parser = HTMLParser(recover=True, encoding=self.encoding) try: @@ -724,15 +728,15 @@ def _build_doc(self): else: raise e else: - if not hasattr(r, 'text_content'): + if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r def _parse_thead_tr(self, table): rows = [] - for thead in table.xpath('.//thead'): - rows.extend(thead.xpath('./tr')) + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) # HACK: lxml does not clean up the clearly-erroneous # foobar. (Missing ). 
Add @@ -740,20 +744,20 @@ def _parse_thead_tr(self, table): # children as though it's a . # # Better solution would be to use html5lib. - elements_at_root = thead.xpath('./td|./th') + elements_at_root = thead.xpath("./td|./th") if elements_at_root: rows.append(thead) return rows def _parse_tbody_tr(self, table): - from_tbody = table.xpath('.//tbody//tr') - from_root = table.xpath('./tr') + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") # HTML spec: at most one of these lists has content return from_tbody + from_root def _parse_tfoot_tr(self, table): - return table.xpath('.//tfoot//tr') + return table.xpath(".//tfoot//tr") def _expand_elements(body): @@ -761,15 +765,15 @@ def _expand_elements(body): lens_max = lens.max() not_max = lens[lens != lens_max] - empty = [''] + empty = [""] for ind, length in not_max.items(): body[ind] += empty * (lens_max - length) def _data_to_frame(**kwargs): - head, body, foot = kwargs.pop('data') - header = kwargs.pop('header') - kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) if head: body = head + body @@ -779,8 +783,7 @@ def _data_to_frame(**kwargs): header = 0 else: # ignore all-empty-text rows - header = [i for i, row in enumerate(head) - if any(text for text in row)] + header = [i for i, row in enumerate(head) if any(text for text in row)] if foot: body += foot @@ -792,9 +795,12 @@ def _data_to_frame(**kwargs): return df -_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser, - 'html5lib': _BeautifulSoupHtml5LibFrameParser, - 'bs4': _BeautifulSoupHtml5LibFrameParser} +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} def _parser_dispatch(flavor): @@ -819,18 +825,18 @@ def _parser_dispatch(flavor): """ valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: - raise ValueError('{invalid!r} is not a valid flavor, valid flavors ' - 'are {valid}' - .format(invalid=flavor, valid=valid_parsers)) + raise ValueError( + "{invalid!r} is not a valid flavor, valid flavors " + "are {valid}".format(invalid=flavor, valid=valid_parsers) + ) - if flavor in ('bs4', 'html5lib'): + if flavor in ("bs4", "html5lib"): if not _HAS_HTML5LIB: raise ImportError("html5lib not found, please install it") if not _HAS_BS4: - raise ImportError( - "BeautifulSoup4 (bs4) not found, please install it") + raise ImportError("BeautifulSoup4 (bs4) not found, please install it") # Although we call this above, we want to raise here right before use. 
- bs4 = import_optional_dependency('bs4') # noqa:F841 + bs4 = import_optional_dependency("bs4") # noqa:F841 else: if not _HAS_LXML: @@ -839,23 +845,23 @@ def _parser_dispatch(flavor): def _print_as_set(s): - return ('{' + '{arg}'.format(arg=', '.join( - pprint_thing(el) for el in s)) + '}') + return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}" def _validate_flavor(flavor): if flavor is None: - flavor = 'lxml', 'bs4' + flavor = "lxml", "bs4" elif isinstance(flavor, str): - flavor = flavor, + flavor = (flavor,) elif isinstance(flavor, abc.Iterable): if not all(isinstance(flav, str) for flav in flavor): - raise TypeError('Object of type {typ!r} is not an iterable of ' - 'strings' - .format(typ=type(flavor).__name__)) + raise TypeError( + "Object of type {typ!r} is not an iterable of " + "strings".format(typ=type(flavor).__name__) + ) else: - fmt = '{flavor!r}' if isinstance(flavor, str) else '{flavor}' - fmt += ' is not a valid flavor' + fmt = "{flavor!r}" if isinstance(flavor, str) else "{flavor}" + fmt += " is not a valid flavor" raise ValueError(fmt.format(flavor=flavor)) flavor = tuple(flavor) @@ -863,10 +869,12 @@ def _validate_flavor(flavor): flavor_set = set(flavor) if not flavor_set & valid_flavors: - raise ValueError('{invalid} is not a valid set of flavors, valid ' - 'flavors are {valid}' - .format(invalid=_print_as_set(flavor_set), - valid=_print_as_set(valid_flavors))) + raise ValueError( + "{invalid} is not a valid set of flavors, valid " + "flavors are {valid}".format( + invalid=_print_as_set(flavor_set), valid=_print_as_set(valid_flavors) + ) + ) return flavor @@ -885,15 +893,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): except Exception as caught: # if `io` is an io-like object, check if it's seekable # and try to rewind it before trying the next parser - if hasattr(io, 'seekable') and io.seekable(): + if hasattr(io, "seekable") and io.seekable(): io.seek(0) - elif hasattr(io, 'seekable') and not io.seekable(): + elif hasattr(io, "seekable") and not io.seekable(): # if we couldn't rewind it, let the user know - raise ValueError('The flavor {} failed to parse your input. ' - 'Since you passed a non-rewindable file ' - 'object, we can\'t rewind it to try ' - 'another parser. Try read_html() with a ' - 'different flavor.'.format(flav)) + raise ValueError( + "The flavor {} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a " + "different flavor.".format(flav) + ) retained = caught else: @@ -910,11 +920,23 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): return ret -def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, attrs=None, parse_dates=False, - thousands=',', encoding=None, - decimal='.', converters=None, na_values=None, - keep_default_na=True, displayed_only=True): +def read_html( + io, + match=".+", + flavor=None, + header=None, + index_col=None, + skiprows=None, + attrs=None, + parse_dates=False, + thousands=",", + encoding=None, + decimal=".", + converters=None, + na_values=None, + keep_default_na=True, + displayed_only=True, +): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -1060,13 +1082,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, # Type check here. We don't want to parse only to fail because of an # invalid value of an integer skiprows. 
if isinstance(skiprows, numbers.Integral) and skiprows < 0: - raise ValueError('cannot skip rows starting from the end of the ' - 'data (you passed a negative value)') + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) _validate_header_arg(header) - return _parse(flavor=flavor, io=io, match=match, header=header, - index_col=index_col, skiprows=skiprows, - parse_dates=parse_dates, - thousands=thousands, attrs=attrs, encoding=encoding, - decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na, - displayed_only=displayed_only) + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + ) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index f14b615471ccc5..f3f0f417acaabc 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -14,8 +14,12 @@ from pandas.core.reshape.concat import concat from pandas.io.common import ( - BaseIterator, _get_handle, _infer_compression, _stringify_path, - get_filepath_or_buffer) + BaseIterator, + _get_handle, + _infer_compression, + _stringify_path, + get_filepath_or_buffer, +) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import _validate_integer @@ -25,27 +29,36 @@ loads = json.loads dumps = json.dumps -TABLE_SCHEMA_VERSION = '0.20.0' +TABLE_SCHEMA_VERSION = "0.20.0" # interface to/from -def to_json(path_or_buf, obj, orient=None, date_format='epoch', - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression='infer', - index=True): - - if not index and orient not in ['split', 'table']: - raise ValueError("'index=False' is only valid when 'orient' is " - "'split' or 'table'") +def to_json( + path_or_buf, + obj, + orient=None, + date_format="epoch", + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression="infer", + index=True, +): + + if not index and orient not in ["split", "table"]: + raise ValueError( + "'index=False' is only valid when 'orient' is " "'split' or 'table'" + ) path_or_buf = _stringify_path(path_or_buf) - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + if lines and orient != "records": + raise ValueError("'lines' keyword only valid when 'orient' is records") - if orient == 'table' and isinstance(obj, Series): - obj = obj.to_frame(name=obj.name or 'values') - if orient == 'table' and isinstance(obj, DataFrame): + if orient == "table" and isinstance(obj, Series): + obj = obj.to_frame(name=obj.name or "values") + if orient == "table" and isinstance(obj, DataFrame): writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter @@ -55,16 +68,21 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', raise NotImplementedError("'obj' should be a Series or a DataFrame") s = writer( - obj, orient=orient, date_format=date_format, - double_precision=double_precision, ensure_ascii=force_ascii, - date_unit=date_unit, default_handler=default_handler, - index=index).write() + obj, + orient=orient, + date_format=date_format, + double_precision=double_precision, + ensure_ascii=force_ascii, + date_unit=date_unit, + 
default_handler=default_handler, + index=index, + ).write() if lines: s = _convert_to_line_delimits(s) if isinstance(path_or_buf, str): - fh, handles = _get_handle(path_or_buf, 'w', compression=compression) + fh, handles = _get_handle(path_or_buf, "w", compression=compression) try: fh.write(s) finally: @@ -76,8 +94,17 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', class Writer: - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, default_handler=None): + def __init__( + self, + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=None, + ): self.obj = obj if orient is None: @@ -98,12 +125,26 @@ def _format_axes(self): raise AbstractMethodError(self) def write(self): - return self._write(self.obj, self.orient, self.double_precision, - self.ensure_ascii, self.date_unit, - self.date_format == 'iso', self.default_handler) + return self._write( + self.obj, + self.orient, + self.double_precision, + self.ensure_ascii, + self.date_unit, + self.date_format == "iso", + self.default_handler, + ) - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): return dumps( obj, orient=orient, @@ -111,91 +152,147 @@ def _write(self, obj, orient, double_precision, ensure_ascii, ensure_ascii=ensure_ascii, date_unit=date_unit, iso_dates=iso_dates, - default_handler=default_handler + default_handler=default_handler, ) class SeriesWriter(Writer): - _default_orient = 'index' + _default_orient = "index" def _format_axes(self): - if not self.obj.index.is_unique and self.orient == 'index': - raise ValueError("Series index must be unique for orient=" - "'{orient}'".format(orient=self.orient)) + if not self.obj.index.is_unique and self.orient == "index": + raise ValueError( + "Series index must be unique for orient=" + "'{orient}'".format(orient=self.orient) + ) - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - if not self.index and orient == 'split': + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + if not self.index and orient == "split": obj = {"name": obj.name, "data": obj.values} - return super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) class FrameWriter(Writer): - _default_orient = 'columns' + _default_orient = "columns" def _format_axes(self): """ Try to format axes if they are datelike. 
""" - if not self.obj.index.is_unique and self.orient in ( - 'index', 'columns'): - raise ValueError("DataFrame index must be unique for orient=" - "'{orient}'.".format(orient=self.orient)) + if not self.obj.index.is_unique and self.orient in ("index", "columns"): + raise ValueError( + "DataFrame index must be unique for orient=" + "'{orient}'.".format(orient=self.orient) + ) if not self.obj.columns.is_unique and self.orient in ( - 'index', 'columns', 'records'): - raise ValueError("DataFrame columns must be unique for orient=" - "'{orient}'.".format(orient=self.orient)) - - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - if not self.index and orient == 'split': - obj = obj.to_dict(orient='split') + "index", + "columns", + "records", + ): + raise ValueError( + "DataFrame columns must be unique for orient=" + "'{orient}'.".format(orient=self.orient) + ) + + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + if not self.index and orient == "split": + obj = obj.to_dict(orient="split") del obj["index"] - return super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) class JSONTableWriter(FrameWriter): - _default_orient = 'records' - - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, default_handler=None): + _default_orient = "records" + + def __init__( + self, + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=None, + ): """ Adds a `schema` attribute with the Table Schema, resets the index (can't do in caller, because the schema inference needs to know what the index is, forces orient to records, and forces date_format to 'iso'. """ - super().__init__(obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, - default_handler=default_handler) - - if date_format != 'iso': - msg = ("Trying to write with `orient='table'` and " - "`date_format='{fmt}'`. Table Schema requires dates " - "to be formatted with `date_format='iso'`" - .format(fmt=date_format)) + super().__init__( + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=default_handler, + ) + + if date_format != "iso": + msg = ( + "Trying to write with `orient='table'` and " + "`date_format='{fmt}'`. 
Table Schema requires dates " + "to be formatted with `date_format='iso'`".format(fmt=date_format) + ) raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): - raise NotImplementedError( - "orient='table' is not supported for MultiIndex") + raise NotImplementedError("orient='table' is not supported for MultiIndex") # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or - len(obj.columns & obj.index.names)): + if ( + (obj.ndim == 1) + and (obj.name in set(obj.index.names)) + or len(obj.columns & obj.index.names) + ): msg = "Overlapping names between the index and columns" raise ValueError(msg) obj = obj.copy() - timedeltas = obj.select_dtypes(include=['timedelta']).columns + timedeltas = obj.select_dtypes(include=["timedelta"]).columns if len(timedeltas): - obj[timedeltas] = obj[timedeltas].applymap( - lambda x: x.isoformat()) + obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) # Convert PeriodIndex to datetimes before serialzing if is_period_dtype(obj.index): obj.index = obj.index.to_timestamp() @@ -205,23 +302,51 @@ def __init__(self, obj, orient, date_format, double_precision, self.obj = obj.reset_index(drop=True) else: self.obj = obj.reset_index(drop=False) - self.date_format = 'iso' - self.orient = 'records' + self.date_format = "iso" + self.orient = "records" self.index = index - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - data = super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + data = super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) serialized = '{{"schema": {schema}, "data": {data}}}'.format( - schema=dumps(self.schema), data=data) + schema=dumps(self.schema), data=data + ) return serialized -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, - convert_axes=None, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False, chunksize=None, compression='infer'): +def read_json( + path_or_buf=None, + orient=None, + typ="frame", + dtype=None, + convert_axes=None, + convert_dates=True, + keep_default_dates=True, + numpy=False, + precise_float=False, + date_unit=None, + encoding=None, + lines=False, + chunksize=None, + compression="infer", +): """ Convert a JSON string to pandas object. 
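The reformatted to_json/read_json signatures above keep their keyword semantics unchanged; as a rough usage sketch (the JSON payloads are invented for illustration), the two main entry points are exercised like this:

    import pandas as pd

    # column-oriented JSON -> DataFrame
    payload = '{"col 1": {"row 1": "a", "row 2": "b"}, "col 2": {"row 1": "c", "row 2": "d"}}'
    df = pd.read_json(payload, orient="columns")

    # line-delimited records go through the lines=True path handled by JsonReader
    records = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
    df2 = pd.read_json(records, orient="records", lines=True)

    # round-trip back out; orient="table" attaches a Table Schema
    s = df.to_json(orient="table")
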
@@ -414,27 +539,36 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if orient == 'table' and dtype: + if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") - if orient == 'table' and convert_axes: + if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") - if dtype is None and orient != 'table': + if dtype is None and orient != "table": dtype = True - if convert_axes is None and orient != 'table': + if convert_axes is None and orient != "table": convert_axes = True compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, + path_or_buf, encoding=encoding, compression=compression ) json_reader = JsonReader( - filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, - convert_axes=convert_axes, convert_dates=convert_dates, - keep_default_dates=keep_default_dates, numpy=numpy, - precise_float=precise_float, date_unit=date_unit, encoding=encoding, - lines=lines, chunksize=chunksize, compression=compression, + filepath_or_buffer, + orient=orient, + typ=typ, + dtype=dtype, + convert_axes=convert_axes, + convert_dates=convert_dates, + keep_default_dates=keep_default_dates, + numpy=numpy, + precise_float=precise_float, + date_unit=date_unit, + encoding=encoding, + lines=lines, + chunksize=chunksize, + compression=compression, ) if chunksize: @@ -457,9 +591,24 @@ class JsonReader(BaseIterator): ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the whole document. """ - def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, precise_float, - date_unit, encoding, lines, chunksize, compression): + + def __init__( + self, + filepath_or_buffer, + orient, + typ, + dtype, + convert_axes, + convert_dates, + keep_default_dates, + numpy, + precise_float, + date_unit, + encoding, + lines, + chunksize, + compression, + ): self.path_or_buf = filepath_or_buffer self.orient = orient @@ -494,9 +643,9 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, 'read') and not self.chunksize: + if hasattr(data, "read") and not self.chunksize: data = data.read() - if not hasattr(data, 'read') and self.chunksize: + if not hasattr(data, "read") and self.chunksize: data = StringIO(data) return data @@ -522,9 +671,12 @@ def _get_data_from_filepath(self, filepath_or_buffer): pass if exists or self.compression is not None: - data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=self.encoding, - compression=self.compression) + data, _ = _get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + ) self.should_close = True self.open_stream = data @@ -535,7 +687,7 @@ def _combine_lines(self, lines): Combines a list of JSON objects into one JSON object. 
""" lines = filter(None, map(lambda x: x.strip(), lines)) - return '[' + ','.join(lines) + ']' + return "[" + ",".join(lines) + "]" def read(self): """ @@ -545,9 +697,7 @@ def read(self): obj = concat(self) elif self.lines: data = ensure_str(self.data) - obj = self._get_object_parser( - self._combine_lines(data.split('\n')) - ) + obj = self._get_object_parser(self._combine_lines(data.split("\n"))) else: obj = self._get_object_parser(self.data) self.close() @@ -560,19 +710,22 @@ def _get_object_parser(self, json): typ = self.typ dtype = self.dtype kwargs = { - "orient": self.orient, "dtype": self.dtype, + "orient": self.orient, + "dtype": self.dtype, "convert_axes": self.convert_axes, "convert_dates": self.convert_dates, - "keep_default_dates": self.keep_default_dates, "numpy": self.numpy, - "precise_float": self.precise_float, "date_unit": self.date_unit + "keep_default_dates": self.keep_default_dates, + "numpy": self.numpy, + "precise_float": self.precise_float, + "date_unit": self.date_unit, } obj = None - if typ == 'frame': + if typ == "frame": obj = FrameParser(json, **kwargs).parse() - if typ == 'series' or obj is None: + if typ == "series" or obj is None: if not isinstance(dtype, bool): - kwargs['dtype'] = dtype + kwargs["dtype"] = dtype obj = SeriesParser(json, **kwargs).parse() return obj @@ -608,16 +761,26 @@ def __next__(self): class Parser: - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') + _STAMP_UNITS = ("s", "ms", "us", "ns") _MIN_STAMPS = { - 's': 31536000, - 'ms': 31536000000, - 'us': 31536000000000, - 'ns': 31536000000000000} - - def __init__(self, json, orient, dtype=None, convert_axes=True, - convert_dates=True, keep_default_dates=False, numpy=False, - precise_float=False, date_unit=None): + "s": 31536000, + "ms": 31536000000, + "us": 31536000000000, + "ns": 31536000000000000, + } + + def __init__( + self, + json, + orient, + dtype=None, + convert_axes=True, + convert_dates=True, + keep_default_dates=False, + numpy=False, + precise_float=False, + date_unit=None, + ): self.json = json if orient is None: @@ -632,11 +795,12 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, if date_unit is not None: date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: - raise ValueError('date_unit must be one of {units}' - .format(units=self._STAMP_UNITS)) + raise ValueError( + "date_unit must be one of {units}".format(units=self._STAMP_UNITS) + ) self.min_stamp = self._MIN_STAMPS[date_unit] else: - self.min_stamp = self._MIN_STAMPS['s'] + self.min_stamp = self._MIN_STAMPS["s"] self.numpy = numpy self.precise_float = precise_float @@ -653,8 +817,11 @@ def check_keys_split(self, decoded): bad_keys = set(decoded.keys()).difference(set(self._split_keys)) if bad_keys: bad_keys = ", ".join(bad_keys) - raise ValueError("JSON data had unexpected key(s): {bad_keys}" - .format(bad_keys=pprint_thing(bad_keys))) + raise ValueError( + "JSON data had unexpected key(s): {bad_keys}".format( + bad_keys=pprint_thing(bad_keys) + ) + ) def parse(self): @@ -679,16 +846,15 @@ def _convert_axes(self): """ for axis in self.obj._AXIS_NUMBERS.keys(): new_axis, result = self._try_convert_data( - axis, self.obj._get_axis(axis), use_dtypes=False, - convert_dates=True) + axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True + ) if result: setattr(self.obj, axis, new_axis) def _try_convert_types(self): raise AbstractMethodError(self) - def _try_convert_data(self, name, data, use_dtypes=True, - convert_dates=True): + def _try_convert_data(self, name, data, use_dtypes=True, 
convert_dates=True): """ Try to parse a ndarray like into a column by inferring dtype. """ @@ -701,8 +867,9 @@ def _try_convert_data(self, name, data, use_dtypes=True, pass else: # dtype to force - dtype = (self.dtype.get(name) - if isinstance(self.dtype, dict) else self.dtype) + dtype = ( + self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype + ) if dtype is not None: try: dtype = np.dtype(dtype) @@ -717,32 +884,32 @@ def _try_convert_data(self, name, data, use_dtypes=True, result = False - if data.dtype == 'object': + if data.dtype == "object": # try float try: - data = data.astype('float64') + data = data.astype("float64") result = True except (TypeError, ValueError): pass - if data.dtype.kind == 'f': + if data.dtype.kind == "f": - if data.dtype != 'float64': + if data.dtype != "float64": # coerce floats to 64 try: - data = data.astype('float64') + data = data.astype("float64") result = True except (TypeError, ValueError): pass # don't coerce 0-len data - if len(data) and (data.dtype == 'float' or data.dtype == 'object'): + if len(data) and (data.dtype == "float" or data.dtype == "object"): # coerce ints if we can try: - new_data = data.astype('int64') + new_data = data.astype("int64") if (new_data == data).all(): data = new_data result = True @@ -750,11 +917,11 @@ def _try_convert_data(self, name, data, use_dtypes=True, pass # coerce ints to 64 - if data.dtype == 'int': + if data.dtype == "int": # coerce floats to 64 try: - data = data.astype('int64') + data = data.astype("int64") result = True except (TypeError, ValueError): pass @@ -774,24 +941,26 @@ def _try_convert_to_date(self, data): return data, False new_data = data - if new_data.dtype == 'object': + if new_data.dtype == "object": try: - new_data = data.astype('int64') + new_data = data.astype("int64") except (TypeError, ValueError, OverflowError): pass # ignore numbers that are out of range if issubclass(new_data.dtype.type, np.number): - in_range = (isna(new_data.values) | (new_data > self.min_stamp) | - (new_data.values == iNaT)) + in_range = ( + isna(new_data.values) + | (new_data > self.min_stamp) + | (new_data.values == iNaT) + ) if not in_range.all(): return data, False date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - new_data = to_datetime(new_data, errors='raise', - unit=date_unit) + new_data = to_datetime(new_data, errors="raise", unit=date_unit) except ValueError: continue except Exception: @@ -804,52 +973,62 @@ def _try_convert_dates(self): class SeriesParser(Parser): - _default_orient = 'index' - _split_keys = ('name', 'index', 'data') + _default_orient = "index" + _split_keys = ("name", "index", "data") def _parse_no_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = {str(k): v for k, v in loads( - json, precise_float=self.precise_float).items()} + decoded = { + str(k): v + for k, v in loads(json, precise_float=self.precise_float).items() + } self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: - self.obj = Series( - loads(json, precise_float=self.precise_float), dtype=None) + self.obj = Series(loads(json, precise_float=self.precise_float), dtype=None) def _parse_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) + decoded = loads( + json, dtype=None, numpy=True, precise_float=self.precise_float + ) decoded = {str(k): v for k, v in decoded.items()} self.check_keys_split(decoded) 
self.obj = Series(**decoded) elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) + self.obj = Series( + *loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + ) else: - self.obj = Series(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) + self.obj = Series( + loads(json, dtype=None, numpy=True, precise_float=self.precise_float) + ) def _try_convert_types(self): if self.obj is None: return obj, result = self._try_convert_data( - 'data', self.obj, convert_dates=self.convert_dates) + "data", self.obj, convert_dates=self.convert_dates + ) if result: self.obj = obj class FrameParser(Parser): - _default_orient = 'columns' - _split_keys = ('columns', 'index', 'data') + _default_orient = "columns" + _split_keys = ("columns", "index", "data") def _parse_numpy(self): @@ -857,24 +1036,37 @@ def _parse_numpy(self): orient = self.orient if orient == "columns": - args = loads(json, dtype=None, numpy=True, labelled=True, - precise_float=self.precise_float) + args = loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) if len(args): args = (args[0].T, args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) + decoded = loads( + json, dtype=None, numpy=True, precise_float=self.precise_float + ) decoded = {str(k): v for k, v in decoded.items()} self.check_keys_split(decoded) self.obj = DataFrame(**decoded) elif orient == "values": - self.obj = DataFrame(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) + self.obj = DataFrame( + loads(json, dtype=None, numpy=True, precise_float=self.precise_float) + ) else: - self.obj = DataFrame(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) + self.obj = DataFrame( + *loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + ) def _parse_no_numpy(self): @@ -883,21 +1075,25 @@ def _parse_no_numpy(self): if orient == "columns": self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) + loads(json, precise_float=self.precise_float), dtype=None + ) elif orient == "split": - decoded = {str(k): v for k, v in loads( - json, precise_float=self.precise_float).items()} + decoded = { + str(k): v + for k, v in loads(json, precise_float=self.precise_float).items() + } self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None).T - elif orient == 'table': - self.obj = parse_table_schema(json, - precise_float=self.precise_float) + loads(json, precise_float=self.precise_float), dtype=None + ).T + elif orient == "table": + self.obj = parse_table_schema(json, precise_float=self.precise_float) else: self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) + loads(json, precise_float=self.precise_float), dtype=None + ) def _process_converter(self, f, filt=None): """ @@ -931,7 +1127,8 @@ def _try_convert_types(self): self._try_convert_dates() self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False)) + lambda col, c: self._try_convert_data(col, c, convert_dates=False) + ) def _try_convert_dates(self): if self.obj is None: @@ -951,16 +1148,20 @@ def is_ok(col): 
return False col_lower = col.lower() - if (col_lower.endswith('_at') or - col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or - col_lower.startswith('timestamp')): + if ( + col_lower.endswith("_at") + or col_lower.endswith("_time") + or col_lower == "modified" + or col_lower == "date" + or col_lower == "datetime" + or col_lower.startswith("timestamp") + ): return True return False self._process_converter( lambda col, c: self._try_convert_to_date(c), - lambda col, c: ((self.keep_default_dates and is_ok(col)) or - col in convert_dates)) + lambda col, c: ( + (self.keep_default_dates and is_ok(col)) or col in convert_dates + ), + ) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 5c6018d399c824..c09dc177ccbd1c 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -19,16 +19,20 @@ def _convert_to_line_delimits(s): # Determine we have a JSON list to turn to lines otherwise just return the # json object, only lists can - if not s[0] == '[' and s[-1] == ']': + if not s[0] == "[" and s[-1] == "]": return s s = s[1:-1] return convert_json_to_lines(s) -def nested_to_record(ds, prefix: str = "", - sep: str = ".", level: int = 0, - max_level: Optional[int] = None): +def nested_to_record( + ds, + prefix: str = "", + sep: str = ".", + level: int = 0, + max_level: Optional[int] = None, +): """ A simplified json_normalize @@ -90,16 +94,16 @@ def nested_to_record(ds, prefix: str = "", # current dict level < maximum level provided and # only dicts gets recurse-flattened # only at level>1 do we rename the rest of the keys - if (not isinstance(v, dict) or - (max_level is not None and level >= max_level)): + if not isinstance(v, dict) or ( + max_level is not None and level >= max_level + ): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, sep, level + 1, - max_level)) + new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) new_ds.append(new_d) if singleton: @@ -107,14 +111,16 @@ def nested_to_record(ds, prefix: str = "", return new_ds -def json_normalize(data: List[Dict], - record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, - meta_prefix: Optional[str] = None, - record_prefix: Optional[str] = None, - errors: Optional[str] = 'raise', - sep: str = '.', - max_level: Optional[int] = None): +def json_normalize( + data: List[Dict], + record_path: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List]] = None, + meta_prefix: Optional[str] = None, + record_prefix: Optional[str] = None, + errors: Optional[str] = "raise", + sep: str = ".", + max_level: Optional[int] = None, +): """ Normalize semi-structured JSON data into a flat table. @@ -230,6 +236,7 @@ def json_normalize(data: List[Dict], Returns normalized data with columns prefixed with the given string. 
""" + def _pull_field(js, spec): result = js if isinstance(spec, list): @@ -256,8 +263,7 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data, sep=sep, - max_level=max_level) + data = nested_to_record(data, sep=sep, max_level=max_level) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -287,14 +293,16 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) - _recursive_extract(obj[path[0]], path[1:], - seen_meta, level=level + 1) + _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) - recs = [nested_to_record(r, sep=sep, - max_level=max_level) - if isinstance(r, dict) else r for r in recs] + recs = [ + nested_to_record(r, sep=sep, max_level=max_level) + if isinstance(r, dict) + else r + for r in recs + ] # For repeating the metadata later lengths.append(len(recs)) @@ -305,13 +313,14 @@ def _recursive_extract(data, path, seen_meta, level=0): try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: - if errors == 'ignore': + if errors == "ignore": meta_val = np.nan else: - raise KeyError("Try running with " - "errors='ignore' as key " - "{err} is not always present" - .format(err=e)) + raise KeyError( + "Try running with " + "errors='ignore' as key " + "{err} is not always present".format(err=e) + ) meta_vals[key].append(meta_val) records.extend(recs) @@ -320,8 +329,7 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result = result.rename( - columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) + result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in meta_vals.items(): @@ -329,7 +337,9 @@ def _recursive_extract(data, path, seen_meta, level=0): k = meta_prefix + k if k in result: - raise ValueError('Conflicting metadata name {name}, ' - 'need distinguishing prefix '.format(name=k)) + raise ValueError( + "Conflicting metadata name {name}, " + "need distinguishing prefix ".format(name=k) + ) result[k] = np.array(v, dtype=object).repeat(lengths) return result diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index a54f5cdf723a39..045127c63af5c2 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -8,9 +8,16 @@ import pandas._libs.json as json from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, - is_string_dtype, is_timedelta64_dtype) + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_numeric_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, +) from pandas import DataFrame from pandas.api.types import CategoricalDtype @@ -50,70 +57,71 @@ def as_json_table_type(x): =============== ================= """ if is_integer_dtype(x): - return 'integer' + return "integer" elif is_bool_dtype(x): - return 'boolean' + return "boolean" elif is_numeric_dtype(x): - return 'number' - elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or - is_period_dtype(x)): - return 'datetime' + return "number" + elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x): + return "datetime" elif is_timedelta64_dtype(x): - return 
'duration' + return "duration" elif is_categorical_dtype(x): - return 'any' + return "any" elif is_string_dtype(x): - return 'string' + return "string" else: - return 'any' + return "any" def set_default_names(data): """Sets index names to 'index' for regular, or 'level_x' for Multi""" if com._all_not_none(*data.index.names): nms = data.index.names - if len(nms) == 1 and data.index.name == 'index': + if len(nms) == 1 and data.index.name == "index": warnings.warn("Index name of 'index' is not round-trippable") - elif len(nms) > 1 and any(x.startswith('level_') for x in nms): - warnings.warn("Index names beginning with 'level_' are not " - "round-trippable") + elif len(nms) > 1 and any(x.startswith("level_") for x in nms): + warnings.warn( + "Index names beginning with 'level_' are not " "round-trippable" + ) return data data = data.copy() if data.index.nlevels > 1: - names = [name if name is not None else 'level_{}'.format(i) - for i, name in enumerate(data.index.names)] + names = [ + name if name is not None else "level_{}".format(i) + for i, name in enumerate(data.index.names) + ] data.index.names = names else: - data.index.name = data.index.name or 'index' + data.index.name = data.index.name or "index" return data def convert_pandas_type_to_json_field(arr, dtype=None): dtype = dtype or arr.dtype if arr.name is None: - name = 'values' + name = "values" else: name = arr.name - field = {'name': name, - 'type': as_json_table_type(dtype)} + field = {"name": name, "type": as_json_table_type(dtype)} if is_categorical_dtype(arr): - if hasattr(arr, 'categories'): + if hasattr(arr, "categories"): cats = arr.categories ordered = arr.ordered else: cats = arr.cat.categories ordered = arr.cat.ordered - field['constraints'] = {"enum": list(cats)} - field['ordered'] = ordered + field["constraints"] = {"enum": list(cats)} + field["ordered"] = ordered elif is_period_dtype(arr): - field['freq'] = arr.freqstr + field["freq"] = arr.freqstr elif is_datetime64tz_dtype(arr): - if hasattr(arr, 'dt'): - field['tz'] = arr.dt.tz.zone + if hasattr(arr, "dt"): + field["tz"] = arr.dt.tz.zone else: - field['tz'] = arr.tz.zone + field["tz"] = arr.tz.zone return field @@ -154,28 +162,29 @@ def convert_json_field_to_pandas_type(field): 'tz': 'US/Central'}) 'datetime64[ns, US/Central]' """ - typ = field['type'] - if typ == 'string': - return 'object' - elif typ == 'integer': - return 'int64' - elif typ == 'number': - return 'float64' - elif typ == 'boolean': - return 'bool' - elif typ == 'duration': - return 'timedelta64' - elif typ == 'datetime': - if field.get('tz'): - return 'datetime64[ns, {tz}]'.format(tz=field['tz']) + typ = field["type"] + if typ == "string": + return "object" + elif typ == "integer": + return "int64" + elif typ == "number": + return "float64" + elif typ == "boolean": + return "bool" + elif typ == "duration": + return "timedelta64" + elif typ == "datetime": + if field.get("tz"): + return "datetime64[ns, {tz}]".format(tz=field["tz"]) else: - return 'datetime64[ns]' - elif typ == 'any': - if 'constraints' in field and 'ordered' in field: - return CategoricalDtype(categories=field['constraints']['enum'], - ordered=field['ordered']) + return "datetime64[ns]" + elif typ == "any": + if "constraints" in field and "ordered" in field: + return CategoricalDtype( + categories=field["constraints"]["enum"], ordered=field["ordered"] + ) else: - return 'object' + return "object" raise ValueError("Unsupported or invalid field type: {}".format(typ)) @@ -245,17 +254,17 @@ def build_table_schema(data, index=True, 
primary_key=None, version=True): else: fields.append(convert_pandas_type_to_json_field(data)) - schema['fields'] = fields + schema["fields"] = fields if index and data.index.is_unique and primary_key is None: if data.index.nlevels == 1: - schema['primaryKey'] = [data.index.name] + schema["primaryKey"] = [data.index.name] else: - schema['primaryKey'] = data.index.names + schema["primaryKey"] = data.index.names elif primary_key is not None: - schema['primaryKey'] = primary_key + schema["primaryKey"] = primary_key if version: - schema['pandas_version'] = '0.20.0' + schema["pandas_version"] = "0.20.0" return schema @@ -296,31 +305,34 @@ def parse_table_schema(json, precise_float): pandas.read_json """ table = loads(json, precise_float=precise_float) - col_order = [field['name'] for field in table['schema']['fields']] - df = DataFrame(table['data'], columns=col_order)[col_order] + col_order = [field["name"] for field in table["schema"]["fields"]] + df = DataFrame(table["data"], columns=col_order)[col_order] - dtypes = {field['name']: convert_json_field_to_pandas_type(field) - for field in table['schema']['fields']} + dtypes = { + field["name"]: convert_json_field_to_pandas_type(field) + for field in table["schema"]["fields"] + } # Cannot directly use as_type with timezone data on object; raise for now - if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone ' - 'data') + if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): + raise NotImplementedError('table="orient" can not yet read timezone ' "data") # No ISO constructor for Timedelta as of yet, so need to raise - if 'timedelta64' in dtypes.values(): - raise NotImplementedError('table="orient" can not yet read ' - 'ISO-formatted Timedelta data') + if "timedelta64" in dtypes.values(): + raise NotImplementedError( + 'table="orient" can not yet read ' "ISO-formatted Timedelta data" + ) df = df.astype(dtypes) - if 'primaryKey' in table['schema']: - df = df.set_index(table['schema']['primaryKey']) + if "primaryKey" in table["schema"]: + df = df.set_index(table["schema"]["primaryKey"]) if len(df.index.names) == 1: - if df.index.name == 'index': + if df.index.name == "index": df.index.name = None else: - df.index.names = [None if x.startswith('level_') else x for x in - df.index.names] + df.index.names = [ + None if x.startswith("level_") else x for x in df.index.names + ] return df diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index f8feffcf492403..9b09cffd83f755 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -6,8 +6,9 @@ from pandas.io.msgpack._version import version # noqa -class ExtType(namedtuple('ExtType', 'code data')): +class ExtType(namedtuple("ExtType", "code data")): """ExtType represents ext type in msgpack.""" + def __new__(cls, code, data): if not isinstance(code, int): raise TypeError("code must be int") @@ -17,6 +18,7 @@ def __new__(cls, code, data): raise ValueError("code must be 0~127") return super().__new__(cls, code, data) + import os # noqa from pandas.io.msgpack._packer import Packer # noqa diff --git a/pandas/io/msgpack/exceptions.py b/pandas/io/msgpack/exceptions.py index ae0f74a6700bda..40f5a8af8f5831 100644 --- a/pandas/io/msgpack/exceptions.py +++ b/pandas/io/msgpack/exceptions.py @@ -15,7 +15,6 @@ class UnpackValueError(UnpackException, ValueError): class ExtraData(ValueError): - def __init__(self, unpacked, extra): self.unpacked = unpacked self.extra = 
extra diff --git a/pandas/io/packers.py b/pandas/io/packers.py index e43f94e28d4af6..b0ce7a4ccb12af 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,16 +49,37 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.util._move import ( - BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer) + BadMove as _BadMove, + move_into_mutable_buffer as _move_into_mutable_buffer, +) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype, - needs_i8_conversion, pandas_dtype) + is_categorical_dtype, + is_datetime64tz_dtype, + is_object_dtype, + needs_i8_conversion, + pandas_dtype, +) from pandas import ( # noqa:F401 - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Period, - PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + Timestamp, +) from pandas.core import internals from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray from pandas.core.arrays.sparse import BlockIndex, IntIndex @@ -95,19 +116,22 @@ def to_msgpack(path_or_buf, *args, **kwargs): compress : type of compressor (zlib or blosc), default to None (no compression) """ - warnings.warn("to_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.", - FutureWarning, stacklevel=3) + warnings.warn( + "to_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, + stacklevel=3, + ) global compressor - compressor = kwargs.pop('compress', None) - append = kwargs.pop('append', None) + compressor = kwargs.pop("compress", None) + append = kwargs.pop("append", None) if append: - mode = 'a+b' + mode = "a+b" else: - mode = 'wb' + mode = "wb" def writer(fh): for a in args: @@ -125,7 +149,7 @@ def writer(fh): writer(path_or_buf) -def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): +def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): """ Load msgpack pandas object from the specified file path @@ -152,11 +176,14 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): read_msgpack is only guaranteed to be backwards compatible to pandas 0.20.3. 
""" - warnings.warn("The read_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.", - FutureWarning, stacklevel=3) + warnings.warn( + "The read_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, + stacklevel=3, + ) path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: @@ -182,7 +209,7 @@ def read(fh): exists = False if exists: - with open(path_or_buf, 'rb') as fh: + with open(path_or_buf, "rb") as fh: return read(fh) if isinstance(path_or_buf, bytes): @@ -194,25 +221,25 @@ def read(fh): finally: if fh is not None: fh.close() - elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read): + elif hasattr(path_or_buf, "read") and callable(path_or_buf.read): # treat as a buffer like return read(path_or_buf) - raise ValueError('path_or_buf needs to be a string file path or file-like') + raise ValueError("path_or_buf needs to be a string file path or file-like") -dtype_dict = {21: np.dtype('M8[ns]'), - 'datetime64[ns]': np.dtype('M8[ns]'), - 'datetime64[us]': np.dtype('M8[us]'), - 22: np.dtype('m8[ns]'), - 'timedelta64[ns]': np.dtype('m8[ns]'), - 'timedelta64[us]': np.dtype('m8[us]'), - - # this is platform int, which we need to remap to np.int64 - # for compat on windows platforms - 7: np.dtype('int64'), - 'category': 'category' - } +dtype_dict = { + 21: np.dtype("M8[ns]"), + "datetime64[ns]": np.dtype("M8[ns]"), + "datetime64[us]": np.dtype("M8[us]"), + 22: np.dtype("m8[ns]"), + "timedelta64[ns]": np.dtype("m8[ns]"), + "timedelta64[us]": np.dtype("m8[us]"), + # this is platform int, which we need to remap to np.int64 + # for compat on windows platforms + 7: np.dtype("int64"), + "category": "category", +} def dtype_for(t): @@ -222,13 +249,11 @@ def dtype_for(t): return np.typeDict.get(t, t) -c2f_dict = {'complex': np.float64, - 'complex128': np.float64, - 'complex64': np.float32} +c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32} # windows (32 bit) compat -if hasattr(np, 'float128'): - c2f_dict['complex256'] = np.float128 +if hasattr(np, "float128"): + c2f_dict["complex256"] = np.float128 def c2f(r, i, ctype_name): @@ -252,13 +277,12 @@ def convert(values): return values.ravel().tolist() if needs_i8_conversion(dtype): - values = values.view('i8') + values = values.view("i8") v = values.ravel() - if compressor == 'zlib': + if compressor == "zlib": zlib = import_optional_dependency( - "zlib", - extra="zlib is required when `compress='zlib'`." + "zlib", extra="zlib is required when `compress='zlib'`." ) # return string arrays like they are @@ -269,10 +293,9 @@ def convert(values): v = v.tostring() return ExtType(0, zlib.compress(v)) - elif compressor == 'blosc': + elif compressor == "blosc": blosc = import_optional_dependency( - "blosc", - extra="zlib is required when `compress='blosc'`." + "blosc", extra="zlib is required when `compress='blosc'`." ) # return string arrays like they are @@ -303,19 +326,17 @@ def unconvert(values, dtype, compress=None): dtype = pandas_dtype(dtype).base if not as_is_ext: - values = values.encode('latin1') + values = values.encode("latin1") if compress: - if compress == 'zlib': + if compress == "zlib": zlib = import_optional_dependency( - "zlib", - extra="zlib is required when `compress='zlib'`." + "zlib", extra="zlib is required when `compress='zlib'`." 
) decompress = zlib.decompress - elif compress == 'blosc': + elif compress == "blosc": blosc = import_optional_dependency( - "blosc", - extra="zlib is required when `compress='blosc'`." + "blosc", extra="zlib is required when `compress='blosc'`." ) decompress = blosc.decompress else: @@ -323,8 +344,7 @@ def unconvert(values, dtype, compress=None): try: return np.frombuffer( - _move_into_mutable_buffer(decompress(values)), - dtype=dtype, + _move_into_mutable_buffer(decompress(values)), dtype=dtype ) except _BadMove as e: # Pull the decompressed data off of the `_BadMove` exception. @@ -338,8 +358,8 @@ def unconvert(values, dtype, compress=None): # warn even though we need to make a copy because we are only # copying at most 1 byte. warnings.warn( - 'copying data after decompressing; this may mean that' - ' decompress is caching its result', + "copying data after decompressing; this may mean that" + " decompress is caching its result", PerformanceWarning, ) # fall through to copying `np.fromstring` @@ -358,76 +378,87 @@ def encode(obj): tobj = type(obj) if isinstance(obj, Index): if isinstance(obj, RangeIndex): - return {'typ': 'range_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'start': obj._range.start, - 'stop': obj._range.stop, - 'step': obj._range.step, - } + return { + "typ": "range_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "start": obj._range.start, + "stop": obj._range.stop, + "step": obj._range.step, + } elif isinstance(obj, PeriodIndex): - return {'typ': 'period_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'freq': getattr(obj, 'freqstr', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'compress': compressor} + return { + "typ": "period_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "freq": getattr(obj, "freqstr", None), + "dtype": obj.dtype.name, + "data": convert(obj.asi8), + "compress": compressor, + } elif isinstance(obj, DatetimeIndex): - tz = getattr(obj, 'tz', None) + tz = getattr(obj, "tz", None) # store tz info and data as UTC if tz is not None: tz = tz.zone - obj = obj.tz_convert('UTC') - return {'typ': 'datetime_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'freq': getattr(obj, 'freqstr', None), - 'tz': tz, - 'compress': compressor} + obj = obj.tz_convert("UTC") + return { + "typ": "datetime_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "dtype": obj.dtype.name, + "data": convert(obj.asi8), + "freq": getattr(obj, "freqstr", None), + "tz": tz, + "compress": compressor, + } elif isinstance(obj, (IntervalIndex, IntervalArray)): if isinstance(obj, IntervalIndex): - typ = 'interval_index' + typ = "interval_index" else: - typ = 'interval_array' - return {'typ': typ, - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'left': getattr(obj, 'left', None), - 'right': getattr(obj, 'right', None), - 'closed': getattr(obj, 'closed', None)} + typ = "interval_array" + return { + "typ": typ, + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "left": getattr(obj, "left", None), + "right": getattr(obj, "right", None), + "closed": getattr(obj, "closed", None), + } elif isinstance(obj, MultiIndex): - return {'typ': 'multi_index', - 'klass': obj.__class__.__name__, - 'names': getattr(obj, 'names', None), - 'dtype': obj.dtype.name, - 'data': 
convert(obj.values), - 'compress': compressor} + return { + "typ": "multi_index", + "klass": obj.__class__.__name__, + "names": getattr(obj, "names", None), + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } else: - return {'typ': 'index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return { + "typ": "index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } elif isinstance(obj, Categorical): - return {'typ': 'category', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'codes': obj.codes, - 'categories': obj.categories, - 'ordered': obj.ordered, - 'compress': compressor} + return { + "typ": "category", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "codes": obj.codes, + "categories": obj.categories, + "ordered": obj.ordered, + "compress": compressor, + } elif isinstance(obj, Series): if isinstance(obj, SparseSeries): - raise NotImplementedError( - 'msgpack sparse series is not implemented' - ) + raise NotImplementedError("msgpack sparse series is not implemented") # d = {'typ': 'sparse_series', # 'klass': obj.__class__.__name__, # 'dtype': obj.dtype.name, @@ -439,18 +470,18 @@ def encode(obj): # d[f] = getattr(obj, f, None) # return d else: - return {'typ': 'series', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'index': obj.index, - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return { + "typ": "series", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "index": obj.index, + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): - raise NotImplementedError( - 'msgpack sparse frame is not implemented' - ) + raise NotImplementedError("msgpack sparse frame is not implemented") # d = {'typ': 'sparse_dataframe', # 'klass': obj.__class__.__name__, # 'columns': obj.columns} @@ -466,19 +497,27 @@ def encode(obj): data = data.consolidate() # the block manager - return {'typ': 'block_manager', - 'klass': obj.__class__.__name__, - 'axes': data.axes, - 'blocks': [{'locs': b.mgr_locs.as_array, - 'values': convert(b.values), - 'shape': b.values.shape, - 'dtype': b.dtype.name, - 'klass': b.__class__.__name__, - 'compress': compressor} for b in data.blocks] + return { + "typ": "block_manager", + "klass": obj.__class__.__name__, + "axes": data.axes, + "blocks": [ + { + "locs": b.mgr_locs.as_array, + "values": convert(b.values), + "shape": b.values.shape, + "dtype": b.dtype.name, + "klass": b.__class__.__name__, + "compress": compressor, } - - elif isinstance(obj, (datetime, date, np.datetime64, timedelta, - np.timedelta64)) or obj is NaT: + for b in data.blocks + ], + } + + elif ( + isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64)) + or obj is NaT + ): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: @@ -486,71 +525,73 @@ def encode(obj): freq = obj.freq if freq is not None: freq = freq.freqstr - return {'typ': 'timestamp', - 'value': obj.value, - 'freq': freq, - 'tz': tz} + return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz} if obj is NaT: - return {'typ': 'nat'} + return {"typ": "nat"} elif isinstance(obj, np.timedelta64): - return {'typ': 
'timedelta64', - 'data': obj.view('i8')} + return {"typ": "timedelta64", "data": obj.view("i8")} elif isinstance(obj, timedelta): - return {'typ': 'timedelta', - 'data': (obj.days, obj.seconds, obj.microseconds)} + return { + "typ": "timedelta", + "data": (obj.days, obj.seconds, obj.microseconds), + } elif isinstance(obj, np.datetime64): - return {'typ': 'datetime64', - 'data': str(obj)} + return {"typ": "datetime64", "data": str(obj)} elif isinstance(obj, datetime): - return {'typ': 'datetime', - 'data': obj.isoformat()} + return {"typ": "datetime", "data": obj.isoformat()} elif isinstance(obj, date): - return {'typ': 'date', - 'data': obj.isoformat()} - raise Exception( - "cannot encode this datetimelike object: {obj}".format(obj=obj)) + return {"typ": "date", "data": obj.isoformat()} + raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj)) elif isinstance(obj, Period): - return {'typ': 'period', - 'ordinal': obj.ordinal, - 'freq': obj.freqstr} + return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr} elif isinstance(obj, Interval): - return {'typ': 'interval', - 'left': obj.left, - 'right': obj.right, - 'closed': obj.closed} + return { + "typ": "interval", + "left": obj.left, + "right": obj.right, + "closed": obj.closed, + } elif isinstance(obj, BlockIndex): - return {'typ': 'block_index', - 'klass': obj.__class__.__name__, - 'blocs': obj.blocs, - 'blengths': obj.blengths, - 'length': obj.length} + return { + "typ": "block_index", + "klass": obj.__class__.__name__, + "blocs": obj.blocs, + "blengths": obj.blengths, + "length": obj.length, + } elif isinstance(obj, IntIndex): - return {'typ': 'int_index', - 'klass': obj.__class__.__name__, - 'indices': obj.indices, - 'length': obj.length} + return { + "typ": "int_index", + "klass": obj.__class__.__name__, + "indices": obj.indices, + "length": obj.length, + } elif isinstance(obj, np.ndarray): - return {'typ': 'ndarray', - 'shape': obj.shape, - 'ndim': obj.ndim, - 'dtype': obj.dtype.name, - 'data': convert(obj), - 'compress': compressor} + return { + "typ": "ndarray", + "shape": obj.shape, + "ndim": obj.ndim, + "dtype": obj.dtype.name, + "data": convert(obj), + "compress": compressor, + } elif isinstance(obj, np.number): if np.iscomplexobj(obj): - return {'typ': 'np_scalar', - 'sub_typ': 'np_complex', - 'dtype': obj.dtype.name, - 'real': np.real(obj).__repr__(), - 'imag': np.imag(obj).__repr__()} + return { + "typ": "np_scalar", + "sub_typ": "np_complex", + "dtype": obj.dtype.name, + "real": np.real(obj).__repr__(), + "imag": np.imag(obj).__repr__(), + } else: - return {'typ': 'np_scalar', - 'dtype': obj.dtype.name, - 'data': obj.__repr__()} + return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()} elif isinstance(obj, complex): - return {'typ': 'np_complex', - 'real': np.real(obj).__repr__(), - 'imag': np.imag(obj).__repr__()} + return { + "typ": "np_complex", + "real": np.real(obj).__repr__(), + "imag": np.imag(obj).__repr__(), + } return obj @@ -560,105 +601,101 @@ def decode(obj): Decoder for deserializing numpy data types. 
""" - typ = obj.get('typ') + typ = obj.get("typ") if typ is None: return obj - elif typ == 'timestamp': - freq = obj['freq'] if 'freq' in obj else obj['offset'] - return Timestamp(obj['value'], tz=obj['tz'], freq=freq) - elif typ == 'nat': + elif typ == "timestamp": + freq = obj["freq"] if "freq" in obj else obj["offset"] + return Timestamp(obj["value"], tz=obj["tz"], freq=freq) + elif typ == "nat": return NaT - elif typ == 'period': - return Period(ordinal=obj['ordinal'], freq=obj['freq']) - elif typ == 'index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) - return Index(data, dtype=dtype, name=obj['name']) - elif typ == 'range_index': - return RangeIndex(obj['start'], - obj['stop'], - obj['step'], - name=obj['name']) - elif typ == 'multi_index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) + elif typ == "period": + return Period(ordinal=obj["ordinal"], freq=obj["freq"]) + elif typ == "index": + dtype = dtype_for(obj["dtype"]) + data = unconvert(obj["data"], dtype, obj.get("compress")) + return Index(data, dtype=dtype, name=obj["name"]) + elif typ == "range_index": + return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"]) + elif typ == "multi_index": + dtype = dtype_for(obj["dtype"]) + data = unconvert(obj["data"], dtype, obj.get("compress")) data = [tuple(x) for x in data] - return MultiIndex.from_tuples(data, names=obj['names']) - elif typ == 'period_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) - freq = d.pop('freq', None) + return MultiIndex.from_tuples(data, names=obj["names"]) + elif typ == "period_index": + data = unconvert(obj["data"], np.int64, obj.get("compress")) + d = dict(name=obj["name"], freq=obj["freq"]) + freq = d.pop("freq", None) return PeriodIndex(PeriodArray(data, freq), **d) - elif typ == 'datetime_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) + elif typ == "datetime_index": + data = unconvert(obj["data"], np.int64, obj.get("compress")) + d = dict(name=obj["name"], freq=obj["freq"]) result = DatetimeIndex(data, **d) - tz = obj['tz'] + tz = obj["tz"] # reverse tz conversion if tz is not None: - result = result.tz_localize('UTC').tz_convert(tz) + result = result.tz_localize("UTC").tz_convert(tz) return result - elif typ in ('interval_index', 'interval_array'): - return globals()[obj['klass']].from_arrays(obj['left'], - obj['right'], - obj['closed'], - name=obj['name']) - elif typ == 'category': - from_codes = globals()[obj['klass']].from_codes - return from_codes(codes=obj['codes'], - categories=obj['categories'], - ordered=obj['ordered']) - - elif typ == 'interval': - return Interval(obj['left'], obj['right'], obj['closed']) - elif typ == 'series': - dtype = dtype_for(obj['dtype']) - index = obj['index'] - data = unconvert(obj['data'], dtype, obj['compress']) - return Series(data, index=index, dtype=dtype, name=obj['name']) - - elif typ == 'block_manager': - axes = obj['axes'] + elif typ in ("interval_index", "interval_array"): + return globals()[obj["klass"]].from_arrays( + obj["left"], obj["right"], obj["closed"], name=obj["name"] + ) + elif typ == "category": + from_codes = globals()[obj["klass"]].from_codes + return from_codes( + codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"] + ) + + elif typ == "interval": + return Interval(obj["left"], obj["right"], obj["closed"]) + elif typ 
== "series": + dtype = dtype_for(obj["dtype"]) + index = obj["index"] + data = unconvert(obj["data"], dtype, obj["compress"]) + return Series(data, index=index, dtype=dtype, name=obj["name"]) + + elif typ == "block_manager": + axes = obj["axes"] def create_block(b): - values = _safe_reshape(unconvert( - b['values'], dtype_for(b['dtype']), - b['compress']), b['shape']) + values = _safe_reshape( + unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"] + ) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 - if 'locs' in b: - placement = b['locs'] + if "locs" in b: + placement = b["locs"] else: - placement = axes[0].get_indexer(b['items']) + placement = axes[0].get_indexer(b["items"]) - if is_datetime64tz_dtype(b['dtype']): + if is_datetime64tz_dtype(b["dtype"]): assert isinstance(values, np.ndarray), type(values) - assert values.dtype == 'M8[ns]', values.dtype - values = DatetimeArray(values, dtype=b['dtype']) - - return make_block(values=values, - klass=getattr(internals, b['klass']), - placement=placement, - dtype=b['dtype']) - - blocks = [create_block(b) for b in obj['blocks']] - return globals()[obj['klass']](BlockManager(blocks, axes)) - elif typ == 'datetime': - return parse(obj['data']) - elif typ == 'datetime64': - return np.datetime64(parse(obj['data'])) - elif typ == 'date': - return parse(obj['data']).date() - elif typ == 'timedelta': - return timedelta(*obj['data']) - elif typ == 'timedelta64': - return np.timedelta64(int(obj['data'])) + assert values.dtype == "M8[ns]", values.dtype + values = DatetimeArray(values, dtype=b["dtype"]) + + return make_block( + values=values, + klass=getattr(internals, b["klass"]), + placement=placement, + dtype=b["dtype"], + ) + + blocks = [create_block(b) for b in obj["blocks"]] + return globals()[obj["klass"]](BlockManager(blocks, axes)) + elif typ == "datetime": + return parse(obj["data"]) + elif typ == "datetime64": + return np.datetime64(parse(obj["data"])) + elif typ == "date": + return parse(obj["data"]).date() + elif typ == "timedelta": + return timedelta(*obj["data"]) + elif typ == "timedelta64": + return np.timedelta64(int(obj["data"])) # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return SparseSeries( @@ -671,94 +708,129 @@ def create_block(b): # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind'] # ) - elif typ == 'block_index': - return globals()[obj['klass']](obj['length'], obj['blocs'], - obj['blengths']) - elif typ == 'int_index': - return globals()[obj['klass']](obj['length'], obj['indices']) - elif typ == 'ndarray': - return unconvert(obj['data'], np.typeDict[obj['dtype']], - obj.get('compress')).reshape(obj['shape']) - elif typ == 'np_scalar': - if obj.get('sub_typ') == 'np_complex': - return c2f(obj['real'], obj['imag'], obj['dtype']) + elif typ == "block_index": + return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"]) + elif typ == "int_index": + return globals()[obj["klass"]](obj["length"], obj["indices"]) + elif typ == "ndarray": + return unconvert( + obj["data"], np.typeDict[obj["dtype"]], obj.get("compress") + ).reshape(obj["shape"]) + elif typ == "np_scalar": + if obj.get("sub_typ") == "np_complex": + return c2f(obj["real"], obj["imag"], obj["dtype"]) else: - dtype = dtype_for(obj['dtype']) + dtype = dtype_for(obj["dtype"]) try: - return dtype(obj['data']) + return dtype(obj["data"]) except (ValueError, TypeError): - return dtype.type(obj['data']) - elif typ == 'np_complex': - return 
complex(obj['real'] + '+' + obj['imag'] + 'j') + return dtype.type(obj["data"]) + elif typ == "np_complex": + return complex(obj["real"] + "+" + obj["imag"] + "j") elif isinstance(obj, (dict, list, set)): return obj else: return obj -def pack(o, default=encode, - encoding='utf-8', unicode_errors='strict', use_single_float=False, - autoreset=1, use_bin_type=1): +def pack( + o, + default=encode, + encoding="utf-8", + unicode_errors="strict", + use_single_float=False, + autoreset=1, + use_bin_type=1, +): """ Pack an object and return the packed bytes. """ - return Packer(default=default, encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type).pack(o) - - -def unpack(packed, object_hook=decode, - list_hook=None, use_list=False, encoding='utf-8', - unicode_errors='strict', object_pairs_hook=None, - max_buffer_size=0, ext_hook=ExtType): + return Packer( + default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float, + autoreset=autoreset, + use_bin_type=use_bin_type, + ).pack(o) + + +def unpack( + packed, + object_hook=decode, + list_hook=None, + use_list=False, + encoding="utf-8", + unicode_errors="strict", + object_pairs_hook=None, + max_buffer_size=0, + ext_hook=ExtType, +): """ Unpack a packed object, return an iterator Note: packed lists will be returned as tuples """ - return Unpacker(packed, object_hook=object_hook, - list_hook=list_hook, - use_list=use_list, encoding=encoding, - unicode_errors=unicode_errors, - object_pairs_hook=object_pairs_hook, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook) + return Unpacker( + packed, + object_hook=object_hook, + list_hook=list_hook, + use_list=use_list, + encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook, + max_buffer_size=max_buffer_size, + ext_hook=ext_hook, + ) class Packer(_Packer): - - def __init__(self, default=encode, - encoding='utf-8', - unicode_errors='strict', - use_single_float=False, - autoreset=1, - use_bin_type=1): - super().__init__(default=default, encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type) + def __init__( + self, + default=encode, + encoding="utf-8", + unicode_errors="strict", + use_single_float=False, + autoreset=1, + use_bin_type=1, + ): + super().__init__( + default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float, + autoreset=autoreset, + use_bin_type=use_bin_type, + ) class Unpacker(_Unpacker): - - def __init__(self, file_like=None, read_size=0, use_list=False, - object_hook=decode, - object_pairs_hook=None, list_hook=None, encoding='utf-8', - unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType): - super().__init__(file_like=file_like, - read_size=read_size, - use_list=use_list, - object_hook=object_hook, - object_pairs_hook=object_pairs_hook, - list_hook=list_hook, - encoding=encoding, - unicode_errors=unicode_errors, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook) + def __init__( + self, + file_like=None, + read_size=0, + use_list=False, + object_hook=decode, + object_pairs_hook=None, + list_hook=None, + encoding="utf-8", + unicode_errors="strict", + max_buffer_size=0, + ext_hook=ExtType, + ): + super().__init__( + file_like=file_like, + read_size=read_size, + use_list=use_list, + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, + list_hook=list_hook, + encoding=encoding, 
+ unicode_errors=unicode_errors, + max_buffer_size=max_buffer_size, + ext_hook=ext_hook, + ) class Iterator: @@ -784,13 +856,13 @@ def __iter__(self): path_exists = False if path_exists: - fh = open(self.path, 'rb') + fh = open(self.path, "rb") else: fh = BytesIO(self.path) else: - if not hasattr(self.path, 'read'): + if not hasattr(self.path, "read"): fh = BytesIO(self.path) else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9a846d1c7845cd..3db05b94e5dce4 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -13,10 +13,10 @@ def get_engine(engine): """ return our implementation """ - if engine == 'auto': - engine = get_option('io.parquet.engine') + if engine == "auto": + engine = get_option("io.parquet.engine") - if engine == 'auto': + if engine == "auto": # try engines in this order try: return PyArrowImpl() @@ -28,17 +28,19 @@ def get_engine(engine): except ImportError: pass - raise ImportError("Unable to find a usable engine; " - "tried using: 'pyarrow', 'fastparquet'.\n" - "pyarrow or fastparquet is required for parquet " - "support") + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'pyarrow', 'fastparquet'.\n" + "pyarrow or fastparquet is required for parquet " + "support" + ) - if engine not in ['pyarrow', 'fastparquet']: + if engine not in ["pyarrow", "fastparquet"]: raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") - if engine == 'pyarrow': + if engine == "pyarrow": return PyArrowImpl() - elif engine == 'fastparquet': + elif engine == "fastparquet": return FastParquetImpl() @@ -53,14 +55,12 @@ def validate_dataframe(df): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names (strings only) - if df.columns.inferred_type not in {'string', 'unicode'}: + if df.columns.inferred_type not in {"string", "unicode"}: raise ValueError("parquet must have string column names") # index level names must be strings valid_names = all( - isinstance(name, str) - for name in df.index.names - if name is not None + isinstance(name, str) for name in df.index.names if name is not None ) if not valid_names: raise ValueError("Index level names must be strings") @@ -73,42 +73,57 @@ def read(self, path, columns=None, **kwargs): class PyArrowImpl(BaseImpl): - def __init__(self): pyarrow = import_optional_dependency( - "pyarrow", - extra="pyarrow is required for parquet support." + "pyarrow", extra="pyarrow is required for parquet support." 
) import pyarrow.parquet + self.api = pyarrow - def write(self, df, path, compression='snappy', - coerce_timestamps='ms', index=None, partition_cols=None, - **kwargs): + def write( + self, + df, + path, + compression="snappy", + coerce_timestamps="ms", + index=None, + partition_cols=None, + **kwargs + ): self.validate_dataframe(df) - path, _, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") if index is None: from_pandas_kwargs = {} else: - from_pandas_kwargs = {'preserve_index': index} + from_pandas_kwargs = {"preserve_index": index} table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: self.api.parquet.write_to_dataset( - table, path, compression=compression, + table, + path, + compression=compression, coerce_timestamps=coerce_timestamps, - partition_cols=partition_cols, **kwargs) + partition_cols=partition_cols, + **kwargs + ) else: self.api.parquet.write_table( - table, path, compression=compression, - coerce_timestamps=coerce_timestamps, **kwargs) + table, + path, + compression=compression, + coerce_timestamps=coerce_timestamps, + **kwargs + ) def read(self, path, columns=None, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) - kwargs['use_pandas_metadata'] = True - result = self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + kwargs["use_pandas_metadata"] = True + result = self.api.parquet.read_table( + path, columns=columns, **kwargs + ).to_pandas() if should_close: try: path.close() @@ -119,47 +134,53 @@ def read(self, path, columns=None, **kwargs): class FastParquetImpl(BaseImpl): - def __init__(self): # since pandas is a dependency of fastparquet # we need to import on first use fastparquet = import_optional_dependency( - "fastparquet", - extra="fastparquet is required for parquet support." + "fastparquet", extra="fastparquet is required for parquet support." ) self.api = fastparquet - def write(self, df, path, compression='snappy', index=None, - partition_cols=None, **kwargs): + def write( + self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs + ): self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: # DeprecationWarning: tostring() is deprecated. # Use tobytes() instead. - if 'partition_on' in kwargs and partition_cols is not None: - raise ValueError("Cannot use both partition_on and " - "partition_cols. Use partition_cols for " - "partitioning data") - elif 'partition_on' in kwargs: - partition_cols = kwargs.pop('partition_on') + if "partition_on" in kwargs and partition_cols is not None: + raise ValueError( + "Cannot use both partition_on and " + "partition_cols. Use partition_cols for " + "partitioning data" + ) + elif "partition_on" in kwargs: + partition_cols = kwargs.pop("partition_on") if partition_cols is not None: - kwargs['file_scheme'] = 'hive' + kwargs["file_scheme"] = "hive" if is_s3_url(path): # path is s3:// so we need to open the s3file in 'wb' mode. # TODO: Support 'ab' - path, _, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") # And pass the opened s3file to the fastparquet internal impl. 
- kwargs['open_with'] = lambda path, _: path + kwargs["open_with"] = lambda path, _: path else: path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): - self.api.write(path, df, compression=compression, - write_index=index, partition_on=partition_cols, - **kwargs) + self.api.write( + path, + df, + compression=compression, + write_index=index, + partition_on=partition_cols, + **kwargs + ) def read(self, path, columns=None, **kwargs): if is_s3_url(path): @@ -178,8 +199,15 @@ def read(self, path, columns=None, **kwargs): return parquet_file.to_pandas(columns=columns, **kwargs) -def to_parquet(df, path, engine='auto', compression='snappy', index=None, - partition_cols=None, **kwargs): +def to_parquet( + df, + path, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs +): """ Write a DataFrame to the parquet format. @@ -215,11 +243,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None, Additional keyword arguments passed to the engine """ impl = get_engine(engine) - return impl.write(df, path, compression=compression, index=index, - partition_cols=partition_cols, **kwargs) + return impl.write( + df, + path, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs + ) -def read_parquet(path, engine='auto', columns=None, **kwargs): +def read_parquet(path, engine="auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8fe0e466e7c0ac..78440939ebc01f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -19,15 +19,30 @@ import pandas._libs.parsers as parsers from pandas._libs.tslibs import parsing from pandas.errors import ( - AbstractMethodError, EmptyDataError, ParserError, ParserWarning) + AbstractMethodError, + EmptyDataError, + ParserError, + ParserWarning, +) from pandas.util._decorators import Appender from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - ensure_object, ensure_str, is_bool_dtype, is_categorical_dtype, - is_dtype_equal, is_extension_array_dtype, is_float, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype) + ensure_object, + ensure_str, + is_bool_dtype, + is_categorical_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -35,24 +50,31 @@ from pandas.core import algorithms from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame -from pandas.core.index import ( - Index, MultiIndex, RangeIndex, ensure_index_from_sequences) +from pandas.core.index import Index, MultiIndex, RangeIndex, ensure_index_from_sequences from pandas.core.series import Series from pandas.core.tools import datetimes as tools from pandas.io.common import ( - _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle, - _infer_compression, _validate_header_arg, get_filepath_or_buffer, - is_file_like) + _NA_VALUES, + BaseIterator, + UnicodeReader, + UTF8Recoder, + _get_handle, + _infer_compression, + _validate_header_arg, + get_filepath_or_buffer, + is_file_like, +) from pandas.io.date_converters import generic_parser # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness # of a file 
(stream). Unfortunately, this marker screws up parsing, # so we need to remove it if we see it. -_BOM = '\ufeff' +_BOM = "\ufeff" -_doc_read_csv_and_table = r""" +_doc_read_csv_and_table = ( + r""" {summary} Also supports optionally iterating or breaking of the file @@ -168,8 +190,9 @@ na_values : scalar, str, list-like, or dict, optional Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as - NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), - 70, subsequent_indent=" ") + """'. + NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. Depending on whether `na_values` is passed in, the behavior is as follows: @@ -343,6 +366,7 @@ -------- >>> pd.{func_name}('data.csv') # doctest: +SKIP """ +) def _validate_integer(name, val, min_val=0): @@ -361,8 +385,9 @@ def _validate_integer(name, val, min_val=0): min_val : int Minimum allowed value (val < min_val will result in a ValueError) """ - msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name, - min_val=min_val) + msg = "'{name:s}' must be an integer >={min_val:d}".format( + name=name, min_val=min_val + ) if val is not None: if is_float(val): @@ -394,18 +419,18 @@ def _validate_names(names): if names is not None: if len(names) != len(set(names)): - raise ValueError('Duplicate names are not allowed.') + raise ValueError("Duplicate names are not allowed.") return names def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - encoding = kwds.get('encoding', None) + encoding = kwds.get("encoding", None) if encoding is not None: - encoding = re.sub('_', '-', encoding).lower() - kwds['encoding'] = encoding + encoding = re.sub("_", "-", encoding).lower() + kwds["encoding"] = encoding - compression = kwds.get('compression', 'infer') + compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) # TODO: get_filepath_or_buffer could return @@ -413,17 +438,18 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. # See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) - kwds['compression'] = compression + filepath_or_buffer, encoding, compression + ) + kwds["compression"] = compression - if kwds.get('date_parser', None) is not None: - if isinstance(kwds['parse_dates'], bool): - kwds['parse_dates'] = True + if kwds.get("date_parser", None) is not None: + if isinstance(kwds["parse_dates"], bool): + kwds["parse_dates"] = True # Extract some of the arguments (pass chunksize on). - iterator = kwds.get('iterator', False) - chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) - nrows = kwds.get('nrows', None) + iterator = kwds.get("iterator", False) + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. 
_validate_names(kwds.get("names", None)) @@ -449,147 +475,127 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _parser_defaults = { - 'delimiter': None, - - 'escapechar': None, - 'quotechar': '"', - 'quoting': csv.QUOTE_MINIMAL, - 'doublequote': True, - 'skipinitialspace': False, - 'lineterminator': None, - - 'header': 'infer', - 'index_col': None, - 'names': None, - 'prefix': None, - 'skiprows': None, - 'skipfooter': 0, - 'nrows': None, - 'na_values': None, - 'keep_default_na': True, - - 'true_values': None, - 'false_values': None, - 'converters': None, - 'dtype': None, - 'cache_dates': True, - - 'thousands': None, - 'comment': None, - 'decimal': b'.', - + "delimiter": None, + "escapechar": None, + "quotechar": '"', + "quoting": csv.QUOTE_MINIMAL, + "doublequote": True, + "skipinitialspace": False, + "lineterminator": None, + "header": "infer", + "index_col": None, + "names": None, + "prefix": None, + "skiprows": None, + "skipfooter": 0, + "nrows": None, + "na_values": None, + "keep_default_na": True, + "true_values": None, + "false_values": None, + "converters": None, + "dtype": None, + "cache_dates": True, + "thousands": None, + "comment": None, + "decimal": b".", # 'engine': 'c', - 'parse_dates': False, - 'keep_date_col': False, - 'dayfirst': False, - 'date_parser': None, - 'usecols': None, - + "parse_dates": False, + "keep_date_col": False, + "dayfirst": False, + "date_parser": None, + "usecols": None, # 'iterator': False, - 'chunksize': None, - 'verbose': False, - 'encoding': None, - 'squeeze': False, - 'compression': None, - 'mangle_dupe_cols': True, - 'infer_datetime_format': False, - 'skip_blank_lines': True + "chunksize": None, + "verbose": False, + "encoding": None, + "squeeze": False, + "compression": None, + "mangle_dupe_cols": True, + "infer_datetime_format": False, + "skip_blank_lines": True, } _c_parser_defaults = { - 'delim_whitespace': False, - 'na_filter': True, - 'low_memory': True, - 'memory_map': False, - 'error_bad_lines': True, - 'warn_bad_lines': True, - 'float_precision': None + "delim_whitespace": False, + "na_filter": True, + "low_memory": True, + "memory_map": False, + "error_bad_lines": True, + "warn_bad_lines": True, + "float_precision": None, } -_fwf_defaults = { - 'colspecs': 'infer', - 'infer_nrows': 100, - 'widths': None, -} +_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} -_c_unsupported = {'skipfooter'} -_python_unsupported = { - 'low_memory', - 'float_precision', -} +_c_unsupported = {"skipfooter"} +_python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults = {} # type: Dict[str, Any] _deprecated_args = set() # type: Set[str] -def _make_parser_function(name, default_sep=','): - - def parser_f(filepath_or_buffer: FilePathOrBuffer, - sep=default_sep, - delimiter=None, - - # Column and Index Locations and Names - header='infer', - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - - # General Parsing Configuration - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - skipfooter=0, - nrows=None, - - # NA and Missing Data Handling - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - - # Datetime Handling - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - cache_dates=True, - - # Iteration - iterator=False, - chunksize=None, - - # Quoting, Compression, and File 
Format - compression='infer', - thousands=None, - decimal=b'.', - lineterminator=None, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - - # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - - # Internal - delim_whitespace=False, - low_memory=_c_parser_defaults['low_memory'], - memory_map=False, - float_precision=None): +def _make_parser_function(name, default_sep=","): + def parser_f( + filepath_or_buffer: FilePathOrBuffer, + sep=default_sep, + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal=b".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, + ): # gh-23761 # @@ -614,69 +620,68 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, delimiter = sep if delim_whitespace and delimiter != default_sep: - raise ValueError("Specified a delimiter with both sep and" - " delim_whitespace=True; you can only" - " specify one.") + raise ValueError( + "Specified a delimiter with both sep and" + " delim_whitespace=True; you can only" + " specify one." 
+ ) if engine is not None: engine_specified = True else: - engine = 'c' + engine = "c" engine_specified = False - kwds.update(delimiter=delimiter, - engine=engine, - dialect=dialect, - compression=compression, - engine_specified=engine_specified, - - doublequote=doublequote, - escapechar=escapechar, - quotechar=quotechar, - quoting=quoting, - skipinitialspace=skipinitialspace, - lineterminator=lineterminator, - - header=header, - index_col=index_col, - names=names, - prefix=prefix, - skiprows=skiprows, - skipfooter=skipfooter, - na_values=na_values, - true_values=true_values, - false_values=false_values, - keep_default_na=keep_default_na, - thousands=thousands, - comment=comment, - decimal=decimal, - - parse_dates=parse_dates, - keep_date_col=keep_date_col, - dayfirst=dayfirst, - date_parser=date_parser, - cache_dates=cache_dates, - - nrows=nrows, - iterator=iterator, - chunksize=chunksize, - converters=converters, - dtype=dtype, - usecols=usecols, - verbose=verbose, - encoding=encoding, - squeeze=squeeze, - memory_map=memory_map, - float_precision=float_precision, - - na_filter=na_filter, - delim_whitespace=delim_whitespace, - warn_bad_lines=warn_bad_lines, - error_bad_lines=error_bad_lines, - low_memory=low_memory, - mangle_dupe_cols=mangle_dupe_cols, - infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines) + kwds.update( + delimiter=delimiter, + engine=engine, + dialect=dialect, + compression=compression, + engine_specified=engine_specified, + doublequote=doublequote, + escapechar=escapechar, + quotechar=quotechar, + quoting=quoting, + skipinitialspace=skipinitialspace, + lineterminator=lineterminator, + header=header, + index_col=index_col, + names=names, + prefix=prefix, + skiprows=skiprows, + skipfooter=skipfooter, + na_values=na_values, + true_values=true_values, + false_values=false_values, + keep_default_na=keep_default_na, + thousands=thousands, + comment=comment, + decimal=decimal, + parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, + date_parser=date_parser, + cache_dates=cache_dates, + nrows=nrows, + iterator=iterator, + chunksize=chunksize, + converters=converters, + dtype=dtype, + usecols=usecols, + verbose=verbose, + encoding=encoding, + squeeze=squeeze, + memory_map=memory_map, + float_precision=float_precision, + na_filter=na_filter, + delim_whitespace=delim_whitespace, + warn_bad_lines=warn_bad_lines, + error_bad_lines=error_bad_lines, + low_memory=low_memory, + mangle_dupe_cols=mangle_dupe_cols, + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines, + ) return _read(filepath_or_buffer, kwds) @@ -685,27 +690,32 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, return parser_f -read_csv = _make_parser_function('read_csv', default_sep=',') -read_csv = Appender(_doc_read_csv_and_table.format( - func_name='read_csv', - summary=('Read a comma-separated values (csv) file ' - 'into DataFrame.'), - _default_sep="','") - )(read_csv) - -read_table = _make_parser_function('read_table', default_sep='\t') -read_table = Appender(_doc_read_csv_and_table.format( - func_name='read_table', - summary='Read general delimited file into DataFrame.', - _default_sep=r"'\\t' (tab-stop)") - )(read_table) - - -def read_fwf(filepath_or_buffer: FilePathOrBuffer, - colspecs='infer', - widths=None, - infer_nrows=100, - **kwds): +read_csv = _make_parser_function("read_csv", default_sep=",") +read_csv = Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary=("Read a comma-separated values (csv) 
file " "into DataFrame."), + _default_sep="','", + ) +)(read_csv) + +read_table = _make_parser_function("read_table", default_sep="\t") +read_table = Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + _default_sep=r"'\\t' (tab-stop)", + ) +)(read_table) + + +def read_fwf( + filepath_or_buffer: FilePathOrBuffer, + colspecs="infer", + widths=None, + infer_nrows=100, + **kwds +): r""" Read a table of fixed-width formatted lines into DataFrame. @@ -765,9 +775,8 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, # Check input arguments. if colspecs is None and widths is None: raise ValueError("Must specify either colspecs or widths") - elif colspecs not in (None, 'infer') and widths is not None: - raise ValueError("You must specify only one of 'widths' and " - "'colspecs'") + elif colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and " "'colspecs'") # Compute 'colspecs' from 'widths', if specified. if widths is not None: @@ -776,9 +785,9 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, colspecs.append((col, col + w)) col += w - kwds['colspecs'] = colspecs - kwds['infer_nrows'] = infer_nrows - kwds['engine'] = 'python-fwf' + kwds["colspecs"] = colspecs + kwds["infer_nrows"] = infer_nrows + kwds["engine"] = "python-fwf" return _read(filepath_or_buffer, kwds) @@ -796,25 +805,34 @@ def __init__(self, f, engine=None, **kwds): if engine is not None: engine_specified = True else: - engine = 'python' + engine = "python" engine_specified = False - self._engine_specified = kwds.get('engine_specified', engine_specified) + self._engine_specified = kwds.get("engine_specified", engine_specified) - if kwds.get('dialect') is not None: - dialect = kwds['dialect'] + if kwds.get("dialect") is not None: + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) # Any valid dialect should have these attributes. # If any are missing, we will raise automatically. - for param in ('delimiter', 'doublequote', 'escapechar', - 'skipinitialspace', 'quotechar', 'quoting'): + for param in ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", + ): try: dialect_val = getattr(dialect, param) except AttributeError: - raise ValueError("Invalid dialect '{dialect}' provided" - .format(dialect=kwds['dialect'])) + raise ValueError( + "Invalid dialect '{dialect}' provided".format( + dialect=kwds["dialect"] + ) + ) parser_default = _parser_defaults[param] provided = kwds.get(param, parser_default) @@ -825,21 +843,24 @@ def __init__(self, f, engine=None, **kwds): # Don't warn if the default parameter was passed in, # even if it conflicts with the dialect (gh-23761). if provided != parser_default and provided != dialect_val: - msg = ("Conflicting values for '{param}': '{val}' was " - "provided, but the dialect specifies '{diaval}'. " - "Using the dialect-specified value.".format( - param=param, val=provided, diaval=dialect_val)) + msg = ( + "Conflicting values for '{param}': '{val}' was " + "provided, but the dialect specifies '{diaval}'. " + "Using the dialect-specified value.".format( + param=param, val=provided, diaval=dialect_val + ) + ) # Annoying corner case for not warning about # conflicts between dialect and delimiter parameter. # Refer to the outer "_read_" function for more info. 
- if not (param == "delimiter" and - kwds.pop("sep_override", False)): + if not (param == "delimiter" and kwds.pop("sep_override", False)): conflict_msgs.append(msg) if conflict_msgs: - warnings.warn('\n\n'.join(conflict_msgs), ParserWarning, - stacklevel=2) + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2 + ) kwds[param] = dialect_val if kwds.get("skipfooter"): @@ -848,8 +869,8 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("nrows"): raise ValueError("'skipfooter' not supported with 'nrows'") - if kwds.get('header', 'infer') == 'infer': - kwds['header'] = 0 if kwds.get('names') is None else None + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None self.orig_options = kwds @@ -860,16 +881,16 @@ def __init__(self, f, engine=None, **kwds): options = self._get_options_with_defaults(engine) - self.chunksize = options.pop('chunksize', None) - self.nrows = options.pop('nrows', None) - self.squeeze = options.pop('squeeze', False) + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) + self.squeeze = options.pop("squeeze", False) # might mutate self.engine self.engine = self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) - if 'has_index_names' in kwds: - self.options['has_index_names'] = kwds['has_index_names'] + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] self._make_engine(self.engine) @@ -885,9 +906,10 @@ def _get_options_with_defaults(self, engine): value = kwds.get(argname, default) # see gh-12935 - if argname == 'mangle_dupe_cols' and not value: - raise ValueError('Setting mangle_dupe_cols=False is ' - 'not supported yet') + if argname == "mangle_dupe_cols" and not value: + raise ValueError( + "Setting mangle_dupe_cols=False is " "not supported yet" + ) else: options[argname] = value @@ -895,21 +917,21 @@ def _get_options_with_defaults(self, engine): if argname in kwds: value = kwds[argname] - if engine != 'c' and value != default: - if ('python' in engine and - argname not in _python_unsupported): + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: pass elif value == _deprecated_defaults.get(argname, default): pass else: raise ValueError( - 'The %r option is not supported with the' - ' %r engine' % (argname, engine)) + "The %r option is not supported with the" + " %r engine" % (argname, engine) + ) else: value = _deprecated_defaults.get(argname, default) options[argname] = value - if engine == 'python-fwf': + if engine == "python-fwf": for argname, default in _fwf_defaults.items(): options[argname] = kwds.get(argname, default) @@ -926,8 +948,7 @@ def _check_file_or_buffer(self, f, engine): # needs to have that attribute ("next" for Python 2.x, "__next__" # for Python 3.x) if engine != "c" and not hasattr(f, next_attr): - msg = ("The 'python' engine cannot iterate " - "through this file buffer.") + msg = "The 'python' engine cannot iterate " "through this file buffer." 
raise ValueError(msg) return engine @@ -938,36 +959,39 @@ def _clean_options(self, options, engine): engine_specified = self._engine_specified fallback_reason = None - sep = options['delimiter'] - delim_whitespace = options['delim_whitespace'] + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == 'c': - if options['skipfooter'] > 0: - fallback_reason = ("the 'c' engine does not support" - " skipfooter") - engine = 'python' + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support" " skipfooter" + engine = "python" - encoding = sys.getfilesystemencoding() or 'utf-8' + encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == 'c': - fallback_reason = ("the 'c' engine does not support" - " sep=None with delim_whitespace=False") - engine = 'python' + if engine == "c": + fallback_reason = ( + "the 'c' engine does not support" + " sep=None with delim_whitespace=False" + ) + engine = "python" elif sep is not None and len(sep) > 1: - if engine == 'c' and sep == r'\s+': - result['delim_whitespace'] = True - del result['delimiter'] - elif engine not in ('python', 'python-fwf'): + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): # wait until regex engine integrated - fallback_reason = ("the 'c' engine does not support" - " regex separators (separators > 1 char and" - r" different from '\s+' are" - " interpreted as regex)") - engine = 'python' + fallback_reason = ( + "the 'c' engine does not support" + " regex separators (separators > 1 char and" + r" different from '\s+' are" + " interpreted as regex)" + ) + engine = "python" elif delim_whitespace: - if 'python' in engine: - result['delimiter'] = r'\s+' + if "python" in engine: + result["delimiter"] = r"\s+" elif sep is not None: encodeable = True try: @@ -975,73 +999,85 @@ def _clean_options(self, options, engine): encodeable = False except UnicodeDecodeError: encodeable = False - if not encodeable and engine not in ('python', 'python-fwf'): - fallback_reason = ("the separator encoded in {encoding}" - " is > 1 char long, and the 'c' engine" - " does not support such separators" - .format(encoding=encoding)) - engine = 'python' - - quotechar = options['quotechar'] - if (quotechar is not None and - isinstance(quotechar, (str, bytes))): - if (len(quotechar) == 1 and ord(quotechar) > 127 and - engine not in ('python', 'python-fwf')): - fallback_reason = ("ord(quotechar) > 127, meaning the " - "quotechar is larger than one byte, " - "and the 'c' engine does not support " - "such quotechars") - engine = 'python' + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + "the separator encoded in {encoding}" + " is > 1 char long, and the 'c' engine" + " does not support such separators".format(encoding=encoding) + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + "and the 'c' engine does not support " + "such quotechars" + ) + engine = "python" if fallback_reason and engine_specified: raise ValueError(fallback_reason) - if engine == 'c': + if engine == "c": for arg in _c_unsupported: del 
result[arg] - if 'python' in engine: + if "python" in engine: for arg in _python_unsupported: if fallback_reason and result[arg] != _c_parser_defaults[arg]: - msg = ("Falling back to the 'python' engine because" - " {reason}, but this causes {option!r} to be" - " ignored as it is not supported by the 'python'" - " engine.").format(reason=fallback_reason, - option=arg) + msg = ( + "Falling back to the 'python' engine because" + " {reason}, but this causes {option!r} to be" + " ignored as it is not supported by the 'python'" + " engine." + ).format(reason=fallback_reason, option=arg) raise ValueError(msg) del result[arg] if fallback_reason: - warnings.warn(("Falling back to the 'python' engine because" - " {0}; you can avoid this warning by specifying" - " engine='python'.").format(fallback_reason), - ParserWarning, stacklevel=5) + warnings.warn( + ( + "Falling back to the 'python' engine because" + " {0}; you can avoid this warning by specifying" + " engine='python'." + ).format(fallback_reason), + ParserWarning, + stacklevel=5, + ) - index_col = options['index_col'] - names = options['names'] - converters = options['converters'] - na_values = options['na_values'] - skiprows = options['skiprows'] + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] - _validate_header_arg(options['header']) + _validate_header_arg(options["header"]) - depr_warning = '' + depr_warning = "" for arg in _deprecated_args: parser_default = _c_parser_defaults[arg] depr_default = _deprecated_defaults[arg] - msg = ("The '{arg}' argument has been deprecated " - "and will be removed in a future version." - .format(arg=arg)) + msg = ( + "The '{arg}' argument has been deprecated " + "and will be removed in a future version.".format(arg=arg) + ) if result.get(arg, depr_default) != depr_default: # raise Exception(result.get(arg, depr_default), depr_default) - depr_warning += msg + '\n\n' + depr_warning += msg + "\n\n" else: result[arg] = parser_default - if depr_warning != '': + if depr_warning != "": warnings.warn(depr_warning, FutureWarning, stacklevel=2) if index_col is True: @@ -1049,26 +1085,28 @@ def _clean_options(self, options, engine): if _is_index_col(index_col): if not isinstance(index_col, (list, tuple, np.ndarray)): index_col = [index_col] - result['index_col'] = index_col + result["index_col"] = index_col names = list(names) if names is not None else names # type conversion-related if converters is not None: if not isinstance(converters, dict): - raise TypeError('Type converters must be a dict or' - ' subclass, input was ' - 'a {0!r}'.format(type(converters).__name__)) + raise TypeError( + "Type converters must be a dict or" + " subclass, input was " + "a {0!r}".format(type(converters).__name__) + ) else: converters = {} # Converting values to NA - keep_default_na = options['keep_default_na'] + keep_default_na = options["keep_default_na"] na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the # c-engine, so only need for python parsers - if engine != 'c': + if engine != "c": if is_integer(skiprows): skiprows = list(range(skiprows)) if skiprows is None: @@ -1077,11 +1115,11 @@ def _clean_options(self, options, engine): skiprows = set(skiprows) # put stuff back - result['names'] = names - result['converters'] = converters - result['na_values'] = na_values - result['na_fvalues'] = na_fvalues - result['skiprows'] = skiprows + 
result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows return result, engine @@ -1092,25 +1130,27 @@ def __next__(self): self.close() raise - def _make_engine(self, engine='c'): - if engine == 'c': + def _make_engine(self, engine="c"): + if engine == "c": self._engine = CParserWrapper(self.f, **self.options) else: - if engine == 'python': + if engine == "python": klass = PythonParser - elif engine == 'python-fwf': + elif engine == "python-fwf": klass = FixedWidthFieldParser else: - raise ValueError('Unknown engine: {engine} (valid options are' - ' "c", "python", or' ' "python-fwf")'.format( - engine=engine)) + raise ValueError( + "Unknown engine: {engine} (valid options are" + ' "c", "python", or' + ' "python-fwf")'.format(engine=engine) + ) self._engine = klass(self.f, **self.options) def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer('nrows', nrows) + nrows = _validate_integer("nrows", nrows) ret = self._engine.read(nrows) # May alter columns / col_dict @@ -1166,8 +1206,11 @@ def _is_potential_multi_index(columns): ------- boolean : Whether or not columns could become a MultiIndex """ - return (len(columns) and not isinstance(columns, MultiIndex) and - all(isinstance(c, tuple) for c in columns)) + return ( + len(columns) + and not isinstance(columns, MultiIndex) + and all(isinstance(c, tuple) for c in columns) + ) def _evaluate_usecols(usecols, names): @@ -1271,8 +1314,10 @@ def _validate_usecols_arg(usecols): 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like is passed in or None if a callable or None is passed in. """ - msg = ("'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable.") + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) if usecols is not None: if callable(usecols): return usecols, None @@ -1285,8 +1330,7 @@ def _validate_usecols_arg(usecols): usecols_dtype = lib.infer_dtype(usecols, skipna=False) - if usecols_dtype not in ("empty", "integer", - "string", "unicode"): + if usecols_dtype not in ("empty", "integer", "string", "unicode"): raise ValueError(msg) usecols = set(usecols) @@ -1301,9 +1345,11 @@ def _validate_parse_dates_arg(parse_dates): is a non-boolean scalar. Raises a ValueError if that is the case. 
""" - msg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and " + "dictionaries are accepted " + "for the 'parse_dates' parameter" + ) if parse_dates is not None: if is_scalar(parse_dates): @@ -1317,62 +1363,65 @@ def _validate_parse_dates_arg(parse_dates): class ParserBase: - def __init__(self, kwds): - self.names = kwds.get('names') + self.names = kwds.get("names") self.orig_names = None - self.prefix = kwds.pop('prefix', None) + self.prefix = kwds.pop("prefix", None) - self.index_col = kwds.get('index_col', None) + self.index_col = kwds.get("index_col", None) self.unnamed_cols = set() self.index_names = None self.col_names = None - self.parse_dates = _validate_parse_dates_arg( - kwds.pop('parse_dates', False)) - self.date_parser = kwds.pop('date_parser', None) - self.dayfirst = kwds.pop('dayfirst', False) - self.keep_date_col = kwds.pop('keep_date_col', False) + self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self.date_parser = kwds.pop("date_parser", None) + self.dayfirst = kwds.pop("dayfirst", False) + self.keep_date_col = kwds.pop("keep_date_col", False) - self.na_values = kwds.get('na_values') - self.na_fvalues = kwds.get('na_fvalues') - self.na_filter = kwds.get('na_filter', False) - self.keep_default_na = kwds.get('keep_default_na', True) + self.na_values = kwds.get("na_values") + self.na_fvalues = kwds.get("na_fvalues") + self.na_filter = kwds.get("na_filter", False) + self.keep_default_na = kwds.get("keep_default_na", True) - self.true_values = kwds.get('true_values') - self.false_values = kwds.get('false_values') - self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) - self.infer_datetime_format = kwds.pop('infer_datetime_format', False) - self.cache_dates = kwds.pop('cache_dates', True) + self.true_values = kwds.get("true_values") + self.false_values = kwds.get("false_values") + self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) + self.infer_datetime_format = kwds.pop("infer_datetime_format", False) + self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, infer_datetime_format=self.infer_datetime_format, - cache_dates=self.cache_dates + cache_dates=self.cache_dates, ) # validate header options for mi - self.header = kwds.get('header') + self.header = kwds.get("header") if isinstance(self.header, (list, tuple, np.ndarray)): if not all(map(is_integer, self.header)): raise ValueError("header must be integer or list of integers") - if kwds.get('usecols'): - raise ValueError("cannot specify usecols when " - "specifying a multi-index header") - if kwds.get('names'): - raise ValueError("cannot specify names when " - "specifying a multi-index header") + if kwds.get("usecols"): + raise ValueError( + "cannot specify usecols when " "specifying a multi-index header" + ) + if kwds.get("names"): + raise ValueError( + "cannot specify names when " "specifying a multi-index header" + ) # validate index_col that only contains integers if self.index_col is not None: - is_sequence = isinstance(self.index_col, (list, tuple, - np.ndarray)) - if not (is_sequence and - all(map(is_integer, self.index_col)) or - is_integer(self.index_col)): - raise ValueError("index_col must only contain row numbers " - "when specifying a multi-index header") + is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) + if not ( + is_sequence + and all(map(is_integer, self.index_col)) + or 
is_integer(self.index_col) + ): + raise ValueError( + "index_col must only contain row numbers " + "when specifying a multi-index header" + ) # GH 16338 elif self.header is not None and not is_integer(self.header): @@ -1392,10 +1441,11 @@ def close(self): @property def _has_complex_date_col(self): - return (isinstance(self.parse_dates, dict) or - (isinstance(self.parse_dates, list) and - len(self.parse_dates) > 0 and - isinstance(self.parse_dates[0], list))) + return isinstance(self.parse_dates, dict) or ( + isinstance(self.parse_dates, list) + and len(self.parse_dates) > 0 + and isinstance(self.parse_dates[0], list) + ) def _should_parse_dates(self, i): if isinstance(self.parse_dates, bool): @@ -1408,14 +1458,17 @@ def _should_parse_dates(self, i): j = self.index_col[i] if is_scalar(self.parse_dates): - return ((j == self.parse_dates) or - (name is not None and name == self.parse_dates)) + return (j == self.parse_dates) or ( + name is not None and name == self.parse_dates + ) else: - return ((j in self.parse_dates) or - (name is not None and name in self.parse_dates)) + return (j in self.parse_dates) or ( + name is not None and name in self.parse_dates + ) - def _extract_multi_indexer_columns(self, header, index_names, col_names, - passed_names=False): + def _extract_multi_indexer_columns( + self, header, index_names, col_names, passed_names=False + ): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ if len(header) < 2: @@ -1434,9 +1487,9 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, # clean the index_names index_names = header.pop(-1) - index_names, names, index_col = _clean_index_names(index_names, - self.index_col, - self.unnamed_cols) + index_names, names, index_col = _clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) # extract the columns field_count = len(header[0]) @@ -1453,15 +1506,17 @@ def extract(r): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): raise ParserError( "Passed header=[{header}] are too many rows for this " - "multi_index of columns" - .format(header=','.join(str(x) for x in self.header)) + "multi_index of columns".format( + header=",".join(str(x) for x in self.header) + ) ) # Clean the column names (if we have an index_col). 
if len(ic): - col_names = [r[0] if (len(r[0]) and - r[0] not in self.unnamed_cols) else None - for r in header] + col_names = [ + r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None + for r in header + ] else: col_names = [None] * len(header) @@ -1487,11 +1542,11 @@ def _maybe_dedup_names(self, names): counts[col] = cur_count + 1 if is_potential_mi: - col = col[:-1] + ('{column}.{count}'.format( - column=col[-1], count=cur_count),) + col = col[:-1] + ( + "{column}.{count}".format(column=col[-1], count=cur_count), + ) else: - col = '{column}.{count}'.format( - column=col, count=cur_count) + col = "{column}.{count}".format(column=col, count=cur_count) cur_count = counts[col] names[i] = col @@ -1514,10 +1569,9 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): index = self._agg_index(index) elif self._has_complex_date_col: if not self._name_processed: - (self.index_names, _, - self.index_col) = _clean_index_names(list(columns), - self.index_col, - self.unnamed_cols) + (self.index_names, _, self.index_col) = _clean_index_names( + list(columns), self.index_col, self.unnamed_cols + ) self._name_processed = True index = self._get_complex_date_index(data, columns) index = self._agg_index(index, try_parse_dates=False) @@ -1538,7 +1592,7 @@ def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): return col - raise ValueError('Index {col} invalid'.format(col=col)) + raise ValueError("Index {col} invalid".format(col=col)) to_remove = [] index = [] @@ -1562,8 +1616,11 @@ def _get_name(icol): return icol if col_names is None: - raise ValueError(('Must supply column order to use {icol!s} ' - 'as index').format(icol=icol)) + raise ValueError( + ("Must supply column order to use {icol!s} " "as index").format( + icol=icol + ) + ) for i, c in enumerate(col_names): if i == icol: @@ -1603,8 +1660,8 @@ def _agg_index(self, index, try_parse_dates=True): col_name = self.index_names[i] if col_name is not None: col_na_values, col_na_fvalues = _get_na_values( - col_name, self.na_values, self.na_fvalues, - self.keep_default_na) + col_name, self.na_values, self.na_fvalues, self.keep_default_na + ) arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) @@ -1614,8 +1671,9 @@ def _agg_index(self, index, try_parse_dates=True): return index - def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None, dtypes=None): + def _convert_to_ndarrays( + self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None + ): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) @@ -1627,50 +1685,61 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na) + c, na_values, na_fvalues, self.keep_default_na + ) else: col_na_values, col_na_fvalues = set(), set() if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: - warnings.warn(("Both a converter and dtype were specified " - "for column {0} - only the converter will " - "be used").format(c), ParserWarning, - stacklevel=7) + warnings.warn( + ( + "Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used" + ).format(c), + ParserWarning, + stacklevel=7, + ) try: values = lib.map_infer(values, conv_f) except ValueError: - mask = algorithms.isin( - values, list(na_values)).view(np.uint8) + mask 
= algorithms.isin(values, list(na_values)).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool=False) + values, set(col_na_values) | col_na_fvalues, try_num_bool=False + ) else: - is_str_or_ea_dtype = (is_string_dtype(cast_type) - or is_extension_array_dtype(cast_type)) + is_str_or_ea_dtype = is_string_dtype( + cast_type + ) or is_extension_array_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool) + values, set(col_na_values) | col_na_fvalues, try_num_bool + ) # type specified in dtype param or cast_type is an EA - if cast_type and (not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type)): + if cast_type and ( + not is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type) + ): try: - if (is_bool_dtype(cast_type) and - not is_categorical_dtype(cast_type) - and na_count > 0): - raise ValueError("Bool column has NA values in " - "column {column}" - .format(column=c)) + if ( + is_bool_dtype(cast_type) + and not is_categorical_dtype(cast_type) + and na_count > 0 + ): + raise ValueError( + "Bool column has NA values in " + "column {column}".format(column=c) + ) except (AttributeError, TypeError): # invalid input to is_bool_dtype pass @@ -1678,8 +1747,11 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, result[c] = cvals if verbose and na_count: - print('Filled {count} NA values in column {c!s}'.format( - count=na_count, c=c)) + print( + "Filled {count} NA values in column {c!s}".format( + count=na_count, c=c + ) + ) return result def _infer_types(self, values, na_values, try_num_bool=True): @@ -1715,17 +1787,18 @@ def _infer_types(self, values, na_values, try_num_bool=True): except Exception: result = values if values.dtype == np.object_: - na_count = parsers.sanitize_objects(result, - na_values, False) + na_count = parsers.sanitize_objects(result, na_values, False) else: result = values if values.dtype == np.object_: na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool(np.asarray(values), - true_values=self.true_values, - false_values=self.false_values) + result = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + ) return result, na_count @@ -1747,8 +1820,10 @@ def _cast_types(self, values, cast_type, column): """ if is_categorical_dtype(cast_type): - known_cats = (isinstance(cast_type, CategoricalDtype) and - cast_type.categories is not None) + known_cats = ( + isinstance(cast_type, CategoricalDtype) + and cast_type.categories is not None + ) if not is_object_dtype(values) and not known_cats: # XXX this is for consistency with @@ -1758,8 +1833,8 @@ def _cast_types(self, values, cast_type, column): cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, - true_values=self.true_values) + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): @@ -1767,23 +1842,22 @@ def _cast_types(self, values, cast_type, column): cast_type = pandas_dtype(cast_type) array_type = 
cast_type.construct_array_type() try: - return array_type._from_sequence_of_strings(values, - dtype=cast_type) + return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError: raise NotImplementedError( "Extension Array: {ea} must implement " "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type)) + "to be used in parser methods".format(ea=array_type) + ) else: try: - values = astype_nansafe(values, cast_type, - copy=True, skipna=True) + values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError: raise ValueError( "Unable to convert column {column} to type " - "{cast_type}".format( - column=column, cast_type=cast_type)) + "{cast_type}".format(column=column, cast_type=cast_type) + ) return values def _do_date_conversions(self, names, data): @@ -1791,8 +1865,14 @@ def _do_date_conversions(self, names, data): if self.parse_dates is not None: data, names = _process_date_conversion( - data, self._date_conv, self.parse_dates, self.index_col, - self.index_names, names, keep_date_col=self.keep_date_col) + data, + self._date_conv, + self.parse_dates, + self.index_col, + self.index_names, + names, + keep_date_col=self.keep_date_col, + ) return names, data @@ -1808,22 +1888,20 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if (kwds.get('compression') is None - and 'utf-16' in (kwds.get('encoding') or '')): + if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): # if source is utf-16 plain text, convert source to utf-8 if isinstance(src, str): - src = open(src, 'rb') + src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds['encoding']) - kwds['encoding'] = 'utf-8' + src = UTF8Recoder(src, kwds["encoding"]) + kwds["encoding"] = "utf-8" # #2442 - kwds['allow_leading_cols'] = self.index_col is not False + kwds["allow_leading_cols"] = self.index_col is not False # GH20529, validate usecol arg before TextReader - self.usecols, self.usecols_dtype = _validate_usecols_arg( - kwds['usecols']) - kwds['usecols'] = self.usecols + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols self._reader = parsers.TextReader(src, **kwds) self.unnamed_cols = self._reader.unnamed_cols @@ -1835,19 +1913,18 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = ( - self._extract_multi_indexer_columns( - self._reader.header, self.index_names, self.col_names, - passed_names - ) + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names ) else: self.names = list(self._reader.header[0]) if self.names is None: if self.prefix: - self.names = ['{prefix}{i}'.format(prefix=self.prefix, i=i) - for i in range(self._reader.table_width)] + self.names = [ + "{prefix}{i}".format(prefix=self.prefix, i=i) + for i in range(self._reader.table_width) + ] else: self.names = list(range(self._reader.table_width)) @@ -1865,19 +1942,23 @@ def __init__(self, src, **kwds): usecols = _evaluate_usecols(self.usecols, self.orig_names) # GH 14671 - if (self.usecols_dtype == 'string' and - not set(usecols).issubset(self.orig_names)): + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): _validate_usecols_names(usecols, self.orig_names) # GH 25623 # validate that column 
indices in usecols are not out of bounds - elif self.usecols_dtype == 'integer': + elif self.usecols_dtype == "integer": indices = range(self._reader.table_width) _validate_usecols_names(usecols, indices) if len(self.names) > len(usecols): - self.names = [n for i, n in enumerate(self.names) - if (i in usecols or n in usecols)] + self.names = [ + n + for i, n in enumerate(self.names) + if (i in usecols or n in usecols) + ] if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) @@ -1887,14 +1968,12 @@ def __init__(self, src, **kwds): self.orig_names = self.names if not self._has_complex_date_col: - if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): + if self._reader.leading_cols == 0 and _is_index_col(self.index_col): self._name_processed = True - (index_names, self.names, - self.index_col) = _clean_index_names(self.names, - self.index_col, - self.unnamed_cols) + (index_names, self.names, self.index_col) = _clean_index_names( + self.names, self.index_col, self.unnamed_cols + ) if self.index_names is None: self.index_names = index_names @@ -1922,13 +2001,12 @@ def _set_noconvert_columns(self): undergo such conversions. """ names = self.orig_names - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": # A set of integers will be converted to a list in # the correct order every single time. usecols = list(self.usecols) usecols.sort() - elif (callable(self.usecols) or - self.usecols_dtype not in ('empty', None)): + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): # The names attribute should have the correct columns # in the proper order for indexing with parse_dates. usecols = self.names[:] @@ -1979,16 +2057,19 @@ def read(self, nrows=None): self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names, - dtype=self.kwds.get('dtype')) - columns = self._maybe_make_multi_index_columns( - columns, self.col_names) + names, + self.index_col, + self.index_names, + dtype=self.kwds.get("dtype"), + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) - col_dict = dict(filter(lambda item: item[0] in columns, - col_dict.items())) + col_dict = dict( + filter(lambda item: item[0] in columns, col_dict.items()) + ) return index, columns, col_dict @@ -2002,7 +2083,7 @@ def read(self, nrows=None): if self._reader.leading_cols: if self._has_complex_date_col: - raise NotImplementedError('file structure not yet supported') + raise NotImplementedError("file structure not yet supported") # implicit index, no index names arrays = [] @@ -2013,8 +2094,7 @@ def read(self, nrows=None): else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, - try_parse_dates=True) + values = self._maybe_parse_dates(values, i, try_parse_dates=True) arrays.append(values) index = ensure_index_from_sequences(arrays) @@ -2058,8 +2138,9 @@ def _filter_usecols(self, names): # hackish usecols = _evaluate_usecols(self.usecols, names) if usecols is not None and len(names) != len(usecols): - names = [name for i, name in enumerate(names) - if i in usecols or name in usecols] + names = [ + name for i, name in enumerate(names) if i in usecols or name in usecols + ] return names def _get_index_names(self): @@ -2067,9 +2148,9 @@ def _get_index_names(self): idx_names = None if self._reader.leading_cols == 0 and self.index_col is not None: - 
(idx_names, names, - self.index_col) = _clean_index_names(names, self.index_col, - self.unnamed_cols) + (idx_names, names, self.index_col) = _clean_index_names( + names, self.index_col, self.unnamed_cols + ) return names, idx_names @@ -2133,16 +2214,15 @@ def TextParser(*args, **kwds): 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. """ - kwds['engine'] = 'python' + kwds["engine"] = "python" return TextFileReader(*args, **kwds) def count_empty_vals(vals): - return sum(1 for v in vals if v == '' or v is None) + return sum(1 for v in vals if v == "" or v is None) class PythonParser(ParserBase): - def __init__(self, f, **kwds): """ Workhorse function for processing nested list into DataFrame @@ -2156,58 +2236,61 @@ def __init__(self, f, **kwds): self.pos = 0 self.line_pos = 0 - self.encoding = kwds['encoding'] - self.compression = kwds['compression'] - self.memory_map = kwds['memory_map'] - self.skiprows = kwds['skiprows'] + self.encoding = kwds["encoding"] + self.compression = kwds["compression"] + self.memory_map = kwds["memory_map"] + self.skiprows = kwds["skiprows"] if callable(self.skiprows): self.skipfunc = self.skiprows else: self.skipfunc = lambda x: x in self.skiprows - self.skipfooter = _validate_skipfooter_arg(kwds['skipfooter']) - self.delimiter = kwds['delimiter'] + self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) + self.delimiter = kwds["delimiter"] - self.quotechar = kwds['quotechar'] + self.quotechar = kwds["quotechar"] if isinstance(self.quotechar, str): self.quotechar = str(self.quotechar) - self.escapechar = kwds['escapechar'] - self.doublequote = kwds['doublequote'] - self.skipinitialspace = kwds['skipinitialspace'] - self.lineterminator = kwds['lineterminator'] - self.quoting = kwds['quoting'] - self.usecols, self.usecols_dtype = _validate_usecols_arg( - kwds['usecols']) - self.skip_blank_lines = kwds['skip_blank_lines'] + self.escapechar = kwds["escapechar"] + self.doublequote = kwds["doublequote"] + self.skipinitialspace = kwds["skipinitialspace"] + self.lineterminator = kwds["lineterminator"] + self.quoting = kwds["quoting"] + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + self.skip_blank_lines = kwds["skip_blank_lines"] - self.warn_bad_lines = kwds['warn_bad_lines'] - self.error_bad_lines = kwds['error_bad_lines'] + self.warn_bad_lines = kwds["warn_bad_lines"] + self.error_bad_lines = kwds["error_bad_lines"] - self.names_passed = kwds['names'] or None + self.names_passed = kwds["names"] or None self.has_index_names = False - if 'has_index_names' in kwds: - self.has_index_names = kwds['has_index_names'] + if "has_index_names" in kwds: + self.has_index_names = kwds["has_index_names"] - self.verbose = kwds['verbose'] - self.converters = kwds['converters'] + self.verbose = kwds["verbose"] + self.converters = kwds["converters"] - self.dtype = kwds['dtype'] - self.thousands = kwds['thousands'] - self.decimal = kwds['decimal'] + self.dtype = kwds["dtype"] + self.thousands = kwds["thousands"] + self.decimal = kwds["decimal"] - self.comment = kwds['comment'] + self.comment = kwds["comment"] self._comment_lines = [] - f, handles = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map) + f, handles = _get_handle( + f, + "r", + encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map, + ) self.handles.extend(handles) # Set self.data to something that can read lines. 
- if hasattr(f, 'readline'): + if hasattr(f, "readline"): self._make_reader(f) else: self.data = f @@ -2215,17 +2298,18 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - (self.columns, self.num_original_columns, - self.unnamed_cols) = self._infer_columns() + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = ( - self._extract_multi_indexer_columns( - self.columns, self.index_names, self.col_names - ) + self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. self.num_original_columns = len(self.columns) @@ -2239,8 +2323,9 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = ( - self._get_index_name(self.columns)) + (index_names, self.orig_names, self.columns) = self._get_index_name( + self.columns + ) self._name_processed = True if self.index_names is None: self.index_names = index_names @@ -2251,14 +2336,18 @@ def __init__(self, f, **kwds): self._no_thousands_columns = None if len(self.decimal) != 1: - raise ValueError('Only length-1 decimal markers supported') + raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: self.nonnum = re.compile( - r'[^-^0-9^{decimal}]+'.format(decimal=self.decimal)) + r"[^-^0-9^{decimal}]+".format(decimal=self.decimal) + ) else: - self.nonnum = re.compile(r'[^-^0-9^{thousands}^{decimal}]+'.format( - thousands=self.thousands, decimal=self.decimal)) + self.nonnum = re.compile( + r"[^-^0-9^{thousands}^{decimal}]+".format( + thousands=self.thousands, decimal=self.decimal + ) + ) def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -2301,8 +2390,9 @@ def _make_reader(self, f): if sep is None or len(sep) == 1: if self.lineterminator: - raise ValueError('Custom line terminators not supported in ' - 'python parser (yet)') + raise ValueError( + "Custom line terminators not supported in " "python parser (yet)" + ) class MyDialect(csv.Dialect): delimiter = self.delimiter @@ -2311,7 +2401,7 @@ class MyDialect(csv.Dialect): doublequote = self.doublequote skipinitialspace = self.skipinitialspace quoting = self.quoting - lineterminator = '\n' + lineterminator = "\n" dia = MyDialect @@ -2334,23 +2424,25 @@ class MyDialect(csv.Dialect): sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter if self.encoding is not None: - self.buf.extend(list( - UnicodeReader(StringIO(line), - dialect=dia, - encoding=self.encoding))) + self.buf.extend( + list( + UnicodeReader( + StringIO(line), dialect=dia, encoding=self.encoding + ) + ) + ) else: - self.buf.extend(list(csv.reader(StringIO(line), - dialect=dia))) + self.buf.extend(list(csv.reader(StringIO(line), dialect=dia))) if self.encoding is not None: - reader = UnicodeReader(f, dialect=dia, - encoding=self.encoding, - strict=True) + reader = UnicodeReader( + f, dialect=dia, encoding=self.encoding, strict=True + ) else: - reader = csv.reader(f, dialect=dia, - 
strict=True) + reader = csv.reader(f, dialect=dia, strict=True) else: + def _read(): line = f.readline() pat = re.compile(sep) @@ -2359,6 +2451,7 @@ def _read(): for line in f: yield pat.split(line.strip()) + reader = _read() self.data = reader @@ -2380,9 +2473,9 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names, self.dtype) - columns = self._maybe_make_multi_index_columns( - columns, self.col_names) + names, self.index_col, self.index_names, self.dtype + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns, col_dict # handle new style for names in index @@ -2462,9 +2555,14 @@ def _clean_mapping(mapping): clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues - return self._convert_to_ndarrays(data, clean_na_values, - clean_na_fvalues, self.verbose, - clean_conv, clean_dtypes) + return self._convert_to_ndarrays( + data, + clean_na_values, + clean_na_fvalues, + self.verbose, + clean_conv, + clean_dtypes, + ) def _infer_columns(self): names = self.names @@ -2495,8 +2593,9 @@ def _infer_columns(self): except StopIteration: if self.line_pos < hr: raise ValueError( - 'Passed header={hr} but only {pos} lines in ' - 'file'.format(hr=hr, pos=(self.line_pos + 1))) + "Passed header={hr} but only {pos} lines in " + "file".format(hr=hr, pos=(self.line_pos + 1)) + ) # We have an empty file, so check # if columns are provided. That will @@ -2508,8 +2607,7 @@ def _infer_columns(self): return columns, num_original_columns, unnamed_cols if not self.names: - raise EmptyDataError( - "No columns to parse from file") + raise EmptyDataError("No columns to parse from file") line = self.names[:] @@ -2517,10 +2615,11 @@ def _infer_columns(self): this_unnamed_cols = [] for i, c in enumerate(line): - if c == '': + if c == "": if have_mi_columns: - col_name = ("Unnamed: {i}_level_{level}" - .format(i=i, level=level)) + col_name = "Unnamed: {i}_level_{level}".format( + i=i, level=level + ) else: col_name = "Unnamed: {i}".format(i=i) @@ -2537,8 +2636,7 @@ def _infer_columns(self): while cur_count > 0: counts[col] = cur_count + 1 - col = '{column}.{count}'.format( - column=col, count=cur_count) + col = "{column}.{count}".format(column=col, count=cur_count) cur_count = counts[col] this_columns[i] = col @@ -2550,8 +2648,7 @@ def _infer_columns(self): # line for the rest of the parsing code if hr == header[-1]: lc = len(this_columns) - ic = (len(self.index_col) - if self.index_col is not None else 0) + ic = len(self.index_col) if self.index_col is not None else 0 unnamed_count = len(this_unnamed_cols) if lc != unnamed_count and lc - ic > unnamed_count: @@ -2560,8 +2657,7 @@ def _infer_columns(self): self.buf = [self.buf[-1]] columns.append(this_columns) - unnamed_cols.update({this_columns[i] - for i in this_unnamed_cols}) + unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) if len(columns) == 1: num_original_columns = len(this_columns) @@ -2571,21 +2667,21 @@ def _infer_columns(self): # GH 25623 # validate that column indices in usecols are not out of bounds - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": for col in columns: indices = range(len(col)) _validate_usecols_names(self.usecols, indices) if names is not None: - if ((self.usecols is not None and - len(names) != len(self.usecols)) or - (self.usecols is None and - len(names) != 
len(columns[0]))): - raise ValueError('Number of passed names did not match ' - 'number of header fields in the file') + if (self.usecols is not None and len(names) != len(self.usecols)) or ( + self.usecols is None and len(names) != len(columns[0]) + ): + raise ValueError( + "Number of passed names did not match " + "number of header fields in the file" + ) if len(columns) > 1: - raise TypeError('Cannot pass names with multi-index ' - 'columns') + raise TypeError("Cannot pass names with multi-index " "columns") if self.usecols is not None: # Set _use_cols. We don't store columns because they are @@ -2603,8 +2699,7 @@ def _infer_columns(self): except StopIteration: if not names: - raise EmptyDataError( - "No columns to parse from file") + raise EmptyDataError("No columns to parse from file") line = names[:] @@ -2613,13 +2708,17 @@ def _infer_columns(self): # GH 25623 # validate that column indices in usecols are not out of bounds - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": _validate_usecols_names(self.usecols, range(ncols)) if not names: if self.prefix: - columns = [['{prefix}{idx}'.format( - prefix=self.prefix, idx=i) for i in range(ncols)]] + columns = [ + [ + "{prefix}{idx}".format(prefix=self.prefix, idx=i) + for i in range(ncols) + ] + ] else: columns = [list(range(ncols))] columns = self._handle_usecols(columns, columns[0]) @@ -2628,11 +2727,10 @@ def _infer_columns(self): columns = self._handle_usecols([names], names) num_original_columns = len(names) else: - if (not callable(self.usecols) and - len(names) != len(self.usecols)): + if not callable(self.usecols) and len(names) != len(self.usecols): raise ValueError( - 'Number of passed names did not match number of ' - 'header fields in the file' + "Number of passed names did not match number of " + "header fields in the file" ) # Ignore output but set used columns. self._handle_usecols([names], names) @@ -2652,8 +2750,9 @@ def _handle_usecols(self, columns, usecols_key): col_indices = _evaluate_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: - raise ValueError("If using multiple headers, usecols must " - "be integers.") + raise ValueError( + "If using multiple headers, usecols must " "be integers." + ) col_indices = [] for col in self.usecols: @@ -2667,8 +2766,10 @@ def _handle_usecols(self, columns, usecols_key): else: col_indices = self.usecols - columns = [[n for i, n in enumerate(column) if i in col_indices] - for column in columns] + columns = [ + [n for i, n in enumerate(column) if i in col_indices] + for column in columns + ] self._col_indices = col_indices return columns @@ -2724,7 +2825,7 @@ def _check_for_bom(self, first_row): # Extract any remaining data after the second # quotation mark. 
if len(first_row_bom) > end + 1: - new_row += first_row_bom[end + 1:] + new_row += first_row_bom[end + 1 :] return [new_row] + first_row[1:] elif len(first_row_bom) > 1: @@ -2759,9 +2860,9 @@ def _next_line(self): line = self._check_comments([self.data[self.pos]])[0] self.pos += 1 # either uncommented or blank to begin with - if (not self.skip_blank_lines and - (self._is_line_empty( - self.data[self.pos - 1]) or line)): + if not self.skip_blank_lines and ( + self._is_line_empty(self.data[self.pos - 1]) or line + ): break elif self.skip_blank_lines: ret = self._remove_empty_lines([line]) @@ -2819,8 +2920,8 @@ def _alert_malformed(self, msg, row_num): if self.error_bad_lines: raise ParserError(msg) elif self.warn_bad_lines: - base = 'Skipping line {row_num}: '.format(row_num=row_num) - sys.stderr.write(base + msg + '\n') + base = "Skipping line {row_num}: ".format(row_num=row_num) + sys.stderr.write(base + msg + "\n") def _next_iter_line(self, row_num): """ @@ -2841,19 +2942,23 @@ def _next_iter_line(self, row_num): if self.warn_bad_lines or self.error_bad_lines: msg = str(e) - if 'NULL byte' in msg: - msg = ('NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead') + if "NULL byte" in msg: + msg = ( + "NULL byte detected. This byte " + "cannot be processed in Python's " + "native csv library at the moment, " + "so please pass in engine='c' instead" + ) if self.skipfooter > 0: - reason = ('Error could possibly be due to ' - 'parsing errors in the skipped footer rows ' - '(the skipfooter keyword is only applied ' - 'after Python\'s csv library has parsed ' - 'all rows).') - msg += '. ' + reason + reason = ( + "Error could possibly be due to " + "parsing errors in the skipped footer rows " + "(the skipfooter keyword is only applied " + "after Python's csv library has parsed " + "all rows)." + ) + msg += ". 
" + reason self._alert_malformed(msg, row_num) return None @@ -2865,11 +2970,10 @@ def _check_comments(self, lines): for l in lines: rl = [] for x in l: - if (not isinstance(x, str) or - self.comment not in x): + if not isinstance(x, str) or self.comment not in x: rl.append(x) else: - x = x[:x.find(self.comment)] + x = x[: x.find(self.comment)] if len(x) > 0: rl.append(x) break @@ -2895,8 +2999,11 @@ def _remove_empty_lines(self, lines): ret = [] for l in lines: # Remove empty lines and lines with only one whitespace value - if (len(l) > 1 or len(l) == 1 and - (not isinstance(l[0], str) or l[0].strip())): + if ( + len(l) > 1 + or len(l) == 1 + and (not isinstance(l[0], str) or l[0].strip()) + ): ret.append(l) return ret @@ -2904,20 +3011,21 @@ def _check_thousands(self, lines): if self.thousands is None: return lines - return self._search_replace_num_columns(lines=lines, - search=self.thousands, - replace='') + return self._search_replace_num_columns( + lines=lines, search=self.thousands, replace="" + ) def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] for i, x in enumerate(l): - if (not isinstance(x, str) or - search not in x or - (self._no_thousands_columns and - i in self._no_thousands_columns) or - self.nonnum.search(x.strip())): + if ( + not isinstance(x, str) + or search not in x + or (self._no_thousands_columns and i in self._no_thousands_columns) + or self.nonnum.search(x.strip()) + ): rl.append(x) else: rl.append(x.replace(search, replace)) @@ -2925,12 +3033,12 @@ def _search_replace_num_columns(self, lines, search, replace): return ret def _check_decimal(self, lines): - if self.decimal == _parser_defaults['decimal']: + if self.decimal == _parser_defaults["decimal"]: return lines - return self._search_replace_num_columns(lines=lines, - search=self.decimal, - replace='.') + return self._search_replace_num_columns( + lines=lines, search=self.decimal, replace="." + ) def _clear_buffer(self): self.buf = [] @@ -2995,9 +3103,9 @@ def _get_index_name(self, columns): else: # Case 2 - (index_name, columns_, - self.index_col) = _clean_index_names(columns, self.index_col, - self.unnamed_cols) + (index_name, columns_, self.index_col) = _clean_index_names( + columns, self.index_col, self.unnamed_cols + ) return index_name, orig_names, columns @@ -3012,9 +3120,7 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - if (max_len > col_len and - self.index_col is not False and - self.usecols is None): + if max_len > col_len and self.index_col is not False and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = [] @@ -3037,32 +3143,43 @@ def _rows_to_cols(self, content): content.append(l) for row_num, actual_len in bad_lines: - msg = ('Expected {col_len} fields in line {line}, saw ' - '{length}'.format(col_len=col_len, line=(row_num + 1), - length=actual_len)) - if (self.delimiter and - len(self.delimiter) > 1 and - self.quoting != csv.QUOTE_NONE): + msg = ( + "Expected {col_len} fields in line {line}, saw " + "{length}".format( + col_len=col_len, line=(row_num + 1), length=actual_len + ) + ) + if ( + self.delimiter + and len(self.delimiter) > 1 + and self.quoting != csv.QUOTE_NONE + ): # see gh-13374 - reason = ('Error could possibly be due to quotes being ' - 'ignored when a multi-char delimiter is used.') - msg += '. 
' + reason + reason = ( + "Error could possibly be due to quotes being " + "ignored when a multi-char delimiter is used." + ) + msg += ". " + reason self._alert_malformed(msg, row_num + 1) # see gh-13320 - zipped_content = list(lib.to_object_array( - content, min_width=col_len).T) + zipped_content = list(lib.to_object_array(content, min_width=col_len).T) if self.usecols: if self._implicit_index: zipped_content = [ - a for i, a in enumerate(zipped_content) - if (i < len(self.index_col) or - i - len(self.index_col) in self._col_indices)] + a + for i, a in enumerate(zipped_content) + if ( + i < len(self.index_col) + or i - len(self.index_col) in self._col_indices + ) + ] else: - zipped_content = [a for i, a in enumerate(zipped_content) - if i in self._col_indices] + zipped_content = [ + a for i, a in enumerate(zipped_content) if i in self._col_indices + ] return zipped_content def _get_lines(self, rows=None): @@ -3084,16 +3201,19 @@ def _get_lines(self, rows=None): if self.pos > len(self.data): raise StopIteration if rows is None: - new_rows = self.data[self.pos:] + new_rows = self.data[self.pos :] new_pos = len(self.data) else: - new_rows = self.data[self.pos:self.pos + rows] + new_rows = self.data[self.pos : self.pos + rows] new_pos = self.pos + rows # Check for stop rows. n.b.: self.skiprows is a set. if self.skiprows: - new_rows = [row for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos)] + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] lines.extend(new_rows) self.pos = new_pos @@ -3109,8 +3229,7 @@ def _get_lines(self, rows=None): rows = 0 while True: - new_row = self._next_iter_line( - row_num=self.pos + rows + 1) + new_row = self._next_iter_line(row_num=self.pos + rows + 1) rows += 1 if new_row is not None: @@ -3118,8 +3237,11 @@ def _get_lines(self, rows=None): except StopIteration: if self.skiprows: - new_rows = [row for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos)] + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] lines.extend(new_rows) if len(lines) == 0: raise @@ -3130,7 +3252,7 @@ def _get_lines(self, rows=None): lines = new_rows if self.skipfooter: - lines = lines[:-self.skipfooter] + lines = lines[: -self.skipfooter] lines = self._check_comments(lines) if self.skip_blank_lines: @@ -3139,8 +3261,9 @@ def _get_lines(self, rows=None): return self._check_decimal(lines) -def _make_date_converter(date_parser=None, dayfirst=False, - infer_datetime_format=False, cache_dates=True): +def _make_date_converter( + date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True +): def converter(*date_cols): if date_parser is None: strs = parsing._concat_date_cols(date_cols) @@ -3150,25 +3273,22 @@ def converter(*date_cols): ensure_object(strs), utc=None, dayfirst=dayfirst, - errors='ignore', + errors="ignore", infer_datetime_format=infer_datetime_format, - cache=cache_dates + cache=cache_dates, ).to_numpy() except ValueError: return tools.to_datetime( - parsing.try_parse_dates(strs, dayfirst=dayfirst), - cache=cache_dates + parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates ) else: try: result = tools.to_datetime( - date_parser(*date_cols), - errors='ignore', - cache=cache_dates + date_parser(*date_cols), errors="ignore", cache=cache_dates ) if isinstance(result, datetime.datetime): - raise Exception('scalar parser') + raise Exception("scalar parser") return result except Exception: try: @@ -3176,22 +3296,29 @@ def 
converter(*date_cols): parsing.try_parse_dates( parsing._concat_date_cols(date_cols), parser=date_parser, - dayfirst=dayfirst), - errors='ignore') + dayfirst=dayfirst, + ), + errors="ignore", + ) except Exception: return generic_parser(date_parser, *date_cols) return converter -def _process_date_conversion(data_dict, converter, parse_spec, - index_col, index_names, columns, - keep_date_col=False): +def _process_date_conversion( + data_dict, + converter, + parse_spec, + index_col, + index_names, + columns, + keep_date_col=False, +): def _isindex(colspec): - return ((isinstance(index_col, list) and - colspec in index_col) or - (isinstance(index_names, list) and - colspec in index_names)) + return (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ) new_cols = [] new_data = {} @@ -3215,11 +3342,12 @@ def _isindex(colspec): data_dict[colspec] = converter(data_dict[colspec]) else: new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names) + converter, colspec, data_dict, orig_names + ) if new_name in data_dict: raise ValueError( - 'New date column already in dict {name}'.format( - name=new_name)) + "New date column already in dict {name}".format(name=new_name) + ) new_data[new_name] = col new_cols.append(new_name) date_cols.update(old_names) @@ -3229,10 +3357,12 @@ def _isindex(colspec): for new_name, colspec in parse_spec.items(): if new_name in data_dict: raise ValueError( - 'Date column {name} already in dict'.format(name=new_name)) + "Date column {name} already in dict".format(name=new_name) + ) - _, col, old_names = _try_convert_dates(converter, colspec, - data_dict, orig_names) + _, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) new_data[new_name] = col new_cols.append(new_name) @@ -3261,7 +3391,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): else: colnames.append(c) - new_name = '_'.join(str(x) for x in colnames) + new_name = "_".join(str(x) for x in colnames) to_parse = [data_dict[c] for c in colnames if c in data_dict] new_col = parser(*to_parse) @@ -3377,8 +3507,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): for i, n in enumerate(index_col): columns.pop(n - i) - col_dict = {col_name: Series([], dtype=dtype[col_name]) - for col_name in columns} + col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} return index, columns, col_dict @@ -3473,29 +3602,35 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. 
""" - def __init__(self, f, colspecs, delimiter, comment, skiprows=None, - infer_nrows=100): + def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): self.f = f self.buffer = None - self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' + self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " self.comment = comment - if colspecs == 'infer': - self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows, - skiprows=skiprows) + if colspecs == "infer": + self.colspecs = self.detect_colspecs( + infer_nrows=infer_nrows, skiprows=skiprows + ) else: self.colspecs = colspecs if not isinstance(self.colspecs, (tuple, list)): - raise TypeError("column specifications must be a list or tuple, " - "input was a %r" % type(colspecs).__name__) + raise TypeError( + "column specifications must be a list or tuple, " + "input was a %r" % type(colspecs).__name__ + ) for colspec in self.colspecs: - if not (isinstance(colspec, (tuple, list)) and - len(colspec) == 2 and - isinstance(colspec[0], (int, np.integer, type(None))) and - isinstance(colspec[1], (int, np.integer, type(None)))): - raise TypeError('Each column specification must be ' - '2 element tuple or list of integers') + if not ( + isinstance(colspec, (tuple, list)) + and len(colspec) == 2 + and isinstance(colspec[0], (int, np.integer, type(None))) + and isinstance(colspec[1], (int, np.integer, type(None))) + ): + raise TypeError( + "Each column specification must be " + "2 element tuple or list of integers" + ) def get_rows(self, infer_nrows, skiprows=None): """ @@ -3537,8 +3672,8 @@ def get_rows(self, infer_nrows, skiprows=None): def detect_colspecs(self, infer_nrows=100, skiprows=None): # Regex escape the delimiters - delimiters = ''.join(r'\{}'.format(x) for x in self.delimiter) - pattern = re.compile('([^{}]+)'.format(delimiters)) + delimiters = "".join(r"\{}".format(x) for x in self.delimiter) + pattern = re.compile("([^{}]+)".format(delimiters)) rows = self.get_rows(infer_nrows, skiprows) if not rows: raise EmptyDataError("No rows from which to infer column width") @@ -3548,7 +3683,7 @@ def detect_colspecs(self, infer_nrows=100, skiprows=None): rows = [row.partition(self.comment)[0] for row in rows] for row in rows: for m in pattern.finditer(row): - mask[m.start():m.end()] = 1 + mask[m.start() : m.end()] = 1 shifted = np.roll(mask, 1) shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] @@ -3565,8 +3700,7 @@ def __next__(self): else: line = next(self.f) # Note: 'colspecs' is a sequence of half-open intervals. - return [line[fromm:to].strip(self.delimiter) - for (fromm, to) in self.colspecs] + return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] class FixedWidthFieldParser(PythonParser): @@ -3577,11 +3711,16 @@ class FixedWidthFieldParser(PythonParser): def __init__(self, f, **kwds): # Support iterators, convert to a list. 
- self.colspecs = kwds.pop('colspecs') - self.infer_nrows = kwds.pop('infer_nrows') + self.colspecs = kwds.pop("colspecs") + self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): - self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment, self.skiprows, - self.infer_nrows) + self.data = FixedWidthReader( + f, + self.colspecs, + self.delimiter, + self.comment, + self.skiprows, + self.infer_nrows, + ) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index afe1622d99eac3..4e390de87fc607 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -10,8 +10,7 @@ from pandas.io.common import _get_handle, _stringify_path -def to_pickle(obj, path, compression='infer', - protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): """ Pickle (serialize) object to file. @@ -70,9 +69,7 @@ def to_pickle(obj, path, compression='infer', >>> os.remove("./dummy.pkl") """ path = _stringify_path(path) - f, fh = _get_handle(path, 'wb', - compression=compression, - is_text=False) + f, fh = _get_handle(path, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -83,7 +80,7 @@ def to_pickle(obj, path, compression='infer', _f.close() -def read_pickle(path, compression='infer'): +def read_pickle(path, compression="infer"): """ Load pickled pandas object (or any object) from file. @@ -145,7 +142,7 @@ def read_pickle(path, compression='infer'): >>> os.remove("./dummy.pkl") """ path = _stringify_path(path) - f, fh = _get_handle(path, 'rb', compression=compression, is_text=False) + f, fh = _get_handle(path, "rb", compression=compression, is_text=False) # 1) try standard libary Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -160,12 +157,13 @@ def read_pickle(path, compression='infer'): try: return pc.load(f, encoding=None) except Exception: # noqa: E722 - return pc.load(f, encoding='latin1') + return pc.load(f, encoding="latin1") finally: f.close() for _f in fh: _f.close() + # compat with sparse pickle / unpickle diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f439e365fbcf0b..9206463e18fb31 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -22,15 +22,31 @@ from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_type, is_list_like, - is_timedelta64_dtype) + ensure_object, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_type, + is_list_like, + is_timedelta64_dtype, +) from pandas.core.dtypes.missing import array_equivalent from pandas import ( - DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, PeriodIndex, - Series, SparseDataFrame, SparseSeries, TimedeltaIndex, concat, isna, - to_datetime) + DataFrame, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + PeriodIndex, + Series, + SparseDataFrame, + SparseSeries, + TimedeltaIndex, + concat, + isna, + to_datetime, +) from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex import pandas.core.common as com @@ -42,16 +58,16 @@ from pandas.io.formats.printing import adjoin, pprint_thing # versioning attribute -_version = '0.15.2' +_version = "0.15.2" # encoding -_default_encoding = 'UTF-8' +_default_encoding = "UTF-8" def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ 
if isinstance(s, np.bytes_): - s = s.decode('UTF-8') + s = s.decode("UTF-8") return s @@ -145,12 +161,7 @@ class DuplicateWarning(Warning): """ # formats -_FORMAT_MAP = { - 'f': 'fixed', - 'fixed': 'fixed', - 't': 'table', - 'table': 'table', -} +_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} format_deprecate_doc = """ the table keyword has been deprecated @@ -163,38 +174,35 @@ class DuplicateWarning(Warning): # map object types _TYPE_MAP = { - - Series: 'series', - SparseSeries: 'sparse_series', - DataFrame: 'frame', - SparseDataFrame: 'sparse_frame', + Series: "series", + SparseSeries: "sparse_series", + DataFrame: "frame", + SparseDataFrame: "sparse_frame", } # storer class map _STORER_MAP = { - 'Series': 'LegacySeriesFixed', - 'DataFrame': 'LegacyFrameFixed', - 'DataMatrix': 'LegacyFrameFixed', - 'series': 'SeriesFixed', - 'sparse_series': 'SparseSeriesFixed', - 'frame': 'FrameFixed', - 'sparse_frame': 'SparseFrameFixed', + "Series": "LegacySeriesFixed", + "DataFrame": "LegacyFrameFixed", + "DataMatrix": "LegacyFrameFixed", + "series": "SeriesFixed", + "sparse_series": "SparseSeriesFixed", + "frame": "FrameFixed", + "sparse_frame": "SparseFrameFixed", } # table class map _TABLE_MAP = { - 'generic_table': 'GenericTable', - 'appendable_series': 'AppendableSeriesTable', - 'appendable_multiseries': 'AppendableMultiSeriesTable', - 'appendable_frame': 'AppendableFrameTable', - 'appendable_multiframe': 'AppendableMultiFrameTable', - 'worm': 'WORMTable', + "generic_table": "GenericTable", + "appendable_series": "AppendableSeriesTable", + "appendable_multiseries": "AppendableMultiSeriesTable", + "appendable_frame": "AppendableFrameTable", + "appendable_multiframe": "AppendableMultiFrameTable", + "worm": "WORMTable", } # axes map -_AXES_MAP = { - DataFrame: [0], -} +_AXES_MAP = {DataFrame: [0]} # register our configuration options dropna_doc = """ @@ -207,12 +215,13 @@ class DuplicateWarning(Warning): put will default to 'fixed' and append will default to 'table' """ -with config.config_prefix('io.hdf'): - config.register_option('dropna_table', False, dropna_doc, - validator=config.is_bool) +with config.config_prefix("io.hdf"): + config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) config.register_option( - 'default_format', None, format_doc, - validator=config.is_one_of_factory(['fixed', 'table', None]) + "default_format", + None, + format_doc, + validator=config.is_one_of_factory(["fixed", "table", None]), ) # oh the troubles to reduce import time @@ -225,6 +234,7 @@ def _tables(): global _table_file_open_policy_is_strict if _table_mod is None: import tables + _table_mod = tables # set the file open policy @@ -232,17 +242,27 @@ def _tables(): # depending on the HDF5 version try: _table_file_open_policy_is_strict = ( - tables.file._FILE_OPEN_POLICY == 'strict') + tables.file._FILE_OPEN_POLICY == "strict" + ) except AttributeError: pass return _table_mod + # interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, - append=None, **kwargs): +def to_hdf( + path_or_buf, + key, + value, + mode=None, + complevel=None, + complib=None, + append=None, + **kwargs +): """ store this object, close it if we opened it """ if append: @@ -252,14 +272,15 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - with HDFStore(path_or_buf, mode=mode, complevel=complevel, - complib=complib) as store: + with 
HDFStore( + path_or_buf, mode=mode, complevel=complevel, complib=complib + ) as store: f(store) else: f(path_or_buf) -def read_hdf(path_or_buf, key=None, mode='r', **kwargs): +def read_hdf(path_or_buf, key=None, mode="r", **kwargs): """ Read from the store, close it if we opened it. @@ -319,24 +340,27 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): >>> reread = pd.read_hdf('./store.h5') """ - if mode not in ['r', 'r+', 'a']: - raise ValueError('mode {0} is not allowed while performing a read. ' - 'Allowed modes are r, r+ and a.'.format(mode)) + if mode not in ["r", "r+", "a"]: + raise ValueError( + "mode {0} is not allowed while performing a read. " + "Allowed modes are r, r+ and a.".format(mode) + ) # grab the scope - if 'where' in kwargs: - kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) + if "where" in kwargs: + kwargs["where"] = _ensure_term(kwargs["where"], scope_level=1) if isinstance(path_or_buf, HDFStore): if not path_or_buf.is_open: - raise IOError('The HDFStore must be open for reading.') + raise IOError("The HDFStore must be open for reading.") store = path_or_buf auto_close = False else: path_or_buf = _stringify_path(path_or_buf) if not isinstance(path_or_buf, str): - raise NotImplementedError('Support for generic buffers has not ' - 'been implemented.') + raise NotImplementedError( + "Support for generic buffers has not " "been implemented." + ) try: exists = os.path.exists(path_or_buf) @@ -346,7 +370,8 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): if not exists: raise FileNotFoundError( - 'File {path} does not exist'.format(path=path_or_buf)) + "File {path} does not exist".format(path=path_or_buf) + ) store = HDFStore(path_or_buf, mode=mode, **kwargs) # can't auto open/close if we are using an iterator @@ -357,7 +382,7 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): if key is None: groups = store.groups() if len(groups) == 0: - raise ValueError('No dataset in HDF5 file.') + raise ValueError("No dataset in HDF5 file.") candidate_only_group = groups[0] # For the HDF file to have only one dataset, all other groups @@ -366,8 +391,10 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): # before their children.) for group_to_check in groups[1:]: if not _is_metadata_of(group_to_check, candidate_only_group): - raise ValueError('key must be provided when HDF5 file ' - 'contains multiple datasets.') + raise ValueError( + "key must be provided when HDF5 file " + "contains multiple datasets." 
+ ) key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except (ValueError, TypeError, KeyError): @@ -388,7 +415,7 @@ def _is_metadata_of(group, parent_group): current = group while current._v_depth > 1: parent = current._v_parent - if parent == parent_group and current._v_name == 'meta': + if parent == parent_group and current._v_name == "meta": return True current = current._v_parent return False @@ -439,25 +466,28 @@ class HDFStore: >>> store.close() """ - def __init__(self, path, mode=None, complevel=None, complib=None, - fletcher32=False, **kwargs): + def __init__( + self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs + ): - if 'format' in kwargs: - raise ValueError('format is not a defined argument for HDFStore') + if "format" in kwargs: + raise ValueError("format is not a defined argument for HDFStore") tables = import_optional_dependency("tables") if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( "complib only supports {libs} compression.".format( - libs=tables.filters.all_complibs)) + libs=tables.filters.all_complibs + ) + ) if complib is None and complevel is not None: complib = tables.filters.default_complib self._path = _stringify_path(path) if mode is None: - mode = 'a' + mode = "a" self._mode = mode self._handle = None self._complevel = complevel if complevel else 0 @@ -496,7 +526,9 @@ def __getattr__(self, name): pass raise AttributeError( "'{object}' object has no attribute '{name}'".format( - object=type(self).__name__, name=name)) + object=type(self).__name__, name=name + ) + ) def __contains__(self, key): """ check for existence of this key @@ -513,8 +545,9 @@ def __len__(self): return len(self.groups()) def __repr__(self): - return '{type}\nFile path: {path}\n'.format( - type=type(self), path=pprint_thing(self._path)) + return "{type}\nFile path: {path}\n".format( + type=type(self), path=pprint_thing(self._path) + ) def __enter__(self): return self @@ -546,7 +579,7 @@ def items(self): iteritems = items - def open(self, mode='a', **kwargs): + def open(self, mode="a", **kwargs): """ Open the file in the specified mode @@ -560,16 +593,15 @@ def open(self, mode='a', **kwargs): if self._mode != mode: # if we are changing a write mode to read, ok - if self._mode in ['a', 'w'] and mode in ['r', 'r+']: + if self._mode in ["a", "w"] and mode in ["r", "r+"]: pass - elif mode in ['w']: + elif mode in ["w"]: # this would truncate, raise here if self.is_open: raise PossibleDataLossError( "Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!" 
- .format(self._path, self._mode) + "will delete the current file!".format(self._path, self._mode) ) self._mode = mode @@ -579,16 +611,16 @@ def open(self, mode='a', **kwargs): self.close() if self._complevel and self._complevel > 0: - self._filters = _tables().Filters(self._complevel, self._complib, - fletcher32=self._fletcher32) + self._filters = _tables().Filters( + self._complevel, self._complib, fletcher32=self._fletcher32 + ) try: self._handle = tables.open_file(self._path, self._mode, **kwargs) except (IOError) as e: # pragma: no cover - if 'can not be written' in str(e): - print( - 'Opening {path} in read-only mode'.format(path=self._path)) - self._handle = tables.open_file(self._path, 'r', **kwargs) + if "can not be written" in str(e): + print("Opening {path} in read-only mode".format(path=self._path)) + self._handle = tables.open_file(self._path, "r", **kwargs) else: raise @@ -596,7 +628,7 @@ def open(self, mode='a', **kwargs): # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message - if 'FILE_OPEN_POLICY' in str(e): + if "FILE_OPEN_POLICY" in str(e): e = ValueError( "PyTables [{version}] no longer supports opening multiple " "files\n" @@ -605,9 +637,11 @@ def open(self, mode='a', **kwargs): "and not open the same file multiple times at once,\n" "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " "which allows\n" - "files to be opened multiple times at once\n" - .format(version=tables.__version__, - hdf_version=tables.get_hdf5_version())) + "files to be opened multiple times at once\n".format( + version=tables.__version__, + hdf_version=tables.get_hdf5_version(), + ) + ) raise e @@ -615,7 +649,7 @@ def open(self, mode='a', **kwargs): # trying to read from a non-existent file causes an error which # is not part of IOError, make it one - if self._mode == 'r' and 'Unable to open/create file' in str(e): + if self._mode == "r" and "Unable to open/create file" in str(e): raise IOError(str(e)) raise @@ -674,11 +708,21 @@ def get(self, key): """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, - iterator=False, chunksize=None, auto_close=False, **kwargs): + def select( + self, + key, + where=None, + start=None, + stop=None, + columns=None, + iterator=False, + chunksize=None, + auto_close=False, + **kwargs + ): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -702,7 +746,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) # create the storer and axes where = _ensure_term(where, scope_level=1) @@ -711,19 +755,25 @@ def select(self, key, where=None, start=None, stop=None, columns=None, # function to call on iteration def func(_start, _stop, _where): - return s.read(start=_start, stop=_stop, - where=_where, - columns=columns) + return s.read(start=_start, stop=_stop, where=_where, columns=columns) # create the iterator - it = TableIterator(self, s, func, where=where, nrows=s.nrows, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, auto_close=auto_close) + it = TableIterator( + self, + s, + func, + where=where, + nrows=s.nrows, + start=start, + stop=stop, + 
iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) return it.get_result() - def select_as_coordinates( - self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as an Index @@ -735,8 +785,9 @@ def select_as_coordinates( stop : integer (defaults to None), row number to stop selection """ where = _ensure_term(where, scope_level=1) - return self.get_storer(key).read_coordinates(where=where, start=start, - stop=stop, **kwargs) + return self.get_storer(key).read_coordinates( + where=where, start=start, stop=stop, **kwargs + ) def select_column(self, key, column, **kwargs): """ @@ -758,9 +809,19 @@ def select_column(self, key, column, **kwargs): """ return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, - start=None, stop=None, iterator=False, - chunksize=None, auto_close=False, **kwargs): + def select_as_multiple( + self, + keys, + where=None, + selector=None, + columns=None, + start=None, + stop=None, + iterator=False, + chunksize=None, + auto_close=False, + **kwargs + ): """ Retrieve pandas objects from multiple tables Parameters @@ -786,9 +847,16 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, str): - return self.select(key=keys, where=where, columns=columns, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, **kwargs) + return self.select( + key=keys, + where=where, + columns=columns, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + **kwargs + ) if not isinstance(keys, (list, tuple)): raise TypeError("keys must be a list/tuple") @@ -817,8 +885,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if nrows is None: nrows = t.nrows elif t.nrows != nrows: - raise ValueError( - "all tables must have exactly the same nrows!") + raise ValueError("all tables must have exactly the same nrows!") # axis is the concentration axes axis = list({t.non_index_axes[0][0] for t in tbls})[0] @@ -827,17 +894,29 @@ def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here - objs = [t.read(where=_where, columns=columns, start=_start, - stop=_stop, **kwargs) for t in tbls] + objs = [ + t.read( + where=_where, columns=columns, start=_start, stop=_stop, **kwargs + ) + for t in tbls + ] # concat and return - return concat(objs, axis=axis, - verify_integrity=False)._consolidate() + return concat(objs, axis=axis, verify_integrity=False)._consolidate() # create the iterator - it = TableIterator(self, s, func, where=where, nrows=nrows, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, auto_close=auto_close) + it = TableIterator( + self, + s, + func, + where=where, + nrows=nrows, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) return it.get_result(coordinates=True) @@ -867,7 +946,7 @@ def put(self, key, value, format=None, append=False, **kwargs): the store settable by the option 'io.hdf.dropna_table' """ if format is None: - format = get_option("io.hdf.default_format") or 'fixed' + format = get_option("io.hdf.default_format") or "fixed" kwargs = self._validate_format(format, kwargs) self._write_to_group(key, value, append=append, **kwargs) @@ -902,7 +981,8 @@ def remove(self, key, where=None, 
start=None, stop=None): if where is not None: raise ValueError( - "trying to remove a node with a non-None where clause!") + "trying to remove a node with a non-None where clause!" + ) # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -918,11 +998,13 @@ def remove(self, key, where=None, start=None, stop=None): else: if not s.is_table: raise ValueError( - 'can only remove with where on objects written as tables') + "can only remove with where on objects written as tables" + ) return s.delete(where=where, start=start, stop=stop) - def append(self, key, value, format=None, append=True, columns=None, - dropna=None, **kwargs): + def append( + self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs + ): """ Append to Table in file. Node must already exist and be Table format. @@ -957,19 +1039,20 @@ def append(self, key, value, format=None, append=True, columns=None, data in the table, so be careful """ if columns is not None: - raise TypeError("columns is not a supported keyword in append, " - "try data_columns") + raise TypeError( + "columns is not a supported keyword in append, " "try data_columns" + ) if dropna is None: dropna = get_option("io.hdf.dropna_table") if format is None: - format = get_option("io.hdf.default_format") or 'table' + format = get_option("io.hdf.default_format") or "table" kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, dropna=dropna, - **kwargs) + self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, - axes=None, dropna=False, **kwargs): + def append_to_multiple( + self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs + ): """ Append to multiple tables @@ -992,9 +1075,11 @@ def append_to_multiple(self, d, value, selector, data_columns=None, """ if axes is not None: - raise TypeError("axes is currently not accepted as a parameter to" - " append_to_multiple; you can create the " - "tables independently instead") + raise TypeError( + "axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently instead" + ) if not isinstance(d, dict): raise ValueError( @@ -1035,7 +1120,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, # ensure rows are synchronized across the tables if dropna: - idxs = (value[cols].dropna(how='all').index for cols in d.values()) + idxs = (value[cols].dropna(how="all").index for cols in d.values()) valid_index = next(idxs) for index in idxs: valid_index = valid_index.intersection(index) @@ -1069,8 +1154,7 @@ def create_table_index(self, key, **kwargs): return if not s.is_table: - raise TypeError( - "cannot create table index on a Fixed format store") + raise TypeError("cannot create table index on a Fixed format store") s.create_index(**kwargs) def groups(self): @@ -1084,12 +1168,16 @@ def groups(self): _tables() self._check_if_open() return [ - g for g in self._handle.walk_groups() - if (not isinstance(g, _table_mod.link.Link) and - (getattr(g._v_attrs, 'pandas_type', None) or - getattr(g, 'table', None) or - (isinstance(g, _table_mod.table.Table) and - g._v_name != 'table'))) + g + for g in self._handle.walk_groups() + if ( + not isinstance(g, _table_mod.link.Link) + and ( + getattr(g._v_attrs, "pandas_type", None) + or getattr(g, "table", None) + or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") + ) + ) ] def walk(self, where="/"): @@ 
-1123,27 +1211,27 @@ def walk(self, where="/"): _tables() self._check_if_open() for g in self._handle.walk_groups(where): - if getattr(g._v_attrs, 'pandas_type', None) is not None: + if getattr(g._v_attrs, "pandas_type", None) is not None: continue groups = [] leaves = [] for child in g._v_children.values(): - pandas_type = getattr(child._v_attrs, 'pandas_type', None) + pandas_type = getattr(child._v_attrs, "pandas_type", None) if pandas_type is None: if isinstance(child, _table_mod.group.Group): groups.append(child._v_name) else: leaves.append(child._v_name) - yield (g._v_pathname.rstrip('/'), groups, leaves) + yield (g._v_pathname.rstrip("/"), groups, leaves) def get_node(self, key): """ return the node with the key or None if it does not exist """ self._check_if_open() try: - if not key.startswith('/'): - key = '/' + key + if not key.startswith("/"): + key = "/" + key return self._handle.get_node(self.root, key) except _table_mod.exceptions.NoSuchNodeError: return None @@ -1152,14 +1240,23 @@ def get_storer(self, key): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) s = self._create_storer(group) s.infer_axes() return s - def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, - complevel=None, fletcher32=False, overwrite=True): + def copy( + self, + file, + mode="w", + propindexes=True, + keys=None, + complib=None, + complevel=None, + fletcher32=False, + overwrite=True, + ): """ copy the existing store to a new file, upgrading in place Parameters @@ -1176,11 +1273,8 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, """ new_store = HDFStore( - file, - mode=mode, - complib=complib, - complevel=complevel, - fletcher32=fletcher32) + file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 + ) if keys is None: keys = list(self.keys()) if not isinstance(keys, (tuple, list)): @@ -1200,9 +1294,11 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( - k, data, index=index, - data_columns=getattr(s, 'data_columns', None), - encoding=s.encoding + k, + data, + index=index, + data_columns=getattr(s, "data_columns", None), + encoding=s.encoding, ) else: new_store.put(k, data, encoding=s.encoding) @@ -1219,8 +1315,9 @@ def info(self): ------- str """ - output = '{type}\nFile path: {path}\n'.format( - type=type(self), path=pprint_thing(self._path)) + output = "{type}\nFile path: {path}\n".format( + type=type(self), path=pprint_thing(self._path) + ) if self.is_open: lkeys = sorted(list(self.keys())) if len(lkeys): @@ -1232,17 +1329,18 @@ def info(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append( - pprint_thing(s or 'invalid_HDFStore node')) + values.append(pprint_thing(s or "invalid_HDFStore node")) except Exception as detail: keys.append(k) values.append( "[invalid_HDFStore node: {detail}]".format( - detail=pprint_thing(detail))) + detail=pprint_thing(detail) + ) + ) output += adjoin(12, keys, values) else: - output += 'Empty' + output += "Empty" else: output += "File is CLOSED" @@ -1259,58 +1357,64 @@ def _validate_format(self, format, kwargs): # validate try: - kwargs['format'] = _FORMAT_MAP[format.lower()] + kwargs["format"] = _FORMAT_MAP[format.lower()] except KeyError: 
- raise TypeError("invalid HDFStore format specified [{0}]" - .format(format)) + raise TypeError("invalid HDFStore format specified [{0}]".format(format)) return kwargs - def _create_storer(self, group, format=None, value=None, append=False, - **kwargs): + def _create_storer(self, group, format=None, value=None, append=False, **kwargs): """ return a suitable class to operate """ def error(t): raise TypeError( "cannot properly create the storer for: [{t}] [group->" "{group},value->{value},format->{format},append->{append}," - "kwargs->{kwargs}]".format(t=t, group=group, - value=type(value), format=format, - append=append, kwargs=kwargs)) + "kwargs->{kwargs}]".format( + t=t, + group=group, + value=type(value), + format=format, + append=append, + kwargs=kwargs, + ) + ) - pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) - tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) + pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) + tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) # infer the pt from the passed value if pt is None: if value is None: _tables() - if (getattr(group, 'table', None) or - isinstance(group, _table_mod.table.Table)): - pt = 'frame_table' - tt = 'generic_table' + if getattr(group, "table", None) or isinstance( + group, _table_mod.table.Table + ): + pt = "frame_table" + tt = "generic_table" else: raise TypeError( "cannot create a storer if the object is not existing " - "nor a value are passed") + "nor a value are passed" + ) else: try: pt = _TYPE_MAP[type(value)] except KeyError: - error('_TYPE_MAP') + error("_TYPE_MAP") # we are actually a table - if format == 'table': - pt += '_table' + if format == "table": + pt += "_table" # a storer node - if 'table' not in pt: + if "table" not in pt: try: return globals()[_STORER_MAP[pt]](self, group, **kwargs) except KeyError: - error('_STORER_MAP') + error("_STORER_MAP") # existing node (and must be a table) if tt is None: @@ -1318,43 +1422,52 @@ def error(t): # if we are a writer, determine the tt if value is not None: - if pt == 'series_table': - index = getattr(value, 'index', None) + if pt == "series_table": + index = getattr(value, "index", None) if index is not None: if index.nlevels == 1: - tt = 'appendable_series' + tt = "appendable_series" elif index.nlevels > 1: - tt = 'appendable_multiseries' - elif pt == 'frame_table': - index = getattr(value, 'index', None) + tt = "appendable_multiseries" + elif pt == "frame_table": + index = getattr(value, "index", None) if index is not None: if index.nlevels == 1: - tt = 'appendable_frame' + tt = "appendable_frame" elif index.nlevels > 1: - tt = 'appendable_multiframe' - elif pt == 'wide_table': - tt = 'appendable_panel' - elif pt == 'ndim_table': - tt = 'appendable_ndim' + tt = "appendable_multiframe" + elif pt == "wide_table": + tt = "appendable_panel" + elif pt == "ndim_table": + tt = "appendable_ndim" else: # distinguish between a frame/table - tt = 'legacy_panel' + tt = "legacy_panel" try: fields = group.table._v_attrs.fields - if len(fields) == 1 and fields[0] == 'value': - tt = 'legacy_frame' + if len(fields) == 1 and fields[0] == "value": + tt = "legacy_frame" except IndexError: pass try: return globals()[_TABLE_MAP[tt]](self, group, **kwargs) except KeyError: - error('_TABLE_MAP') - - def _write_to_group(self, key, value, format, index=True, append=False, - complib=None, encoding=None, **kwargs): + error("_TABLE_MAP") + + def _write_to_group( + self, + key, + value, + format, + index=True, + append=False, + 
complib=None, + encoding=None, + **kwargs + ): group = self.get_node(key) # remove the node if we are not appending @@ -1364,43 +1477,41 @@ def _write_to_group(self, key, value, format, index=True, append=False, # we don't want to store a table node at all if are object is 0-len # as there are not dtypes - if getattr(value, 'empty', None) and (format == 'table' or append): + if getattr(value, "empty", None) and (format == "table" or append): return if group is None: - paths = key.split('/') + paths = key.split("/") # recursively create the groups - path = '/' + path = "/" for p in paths: if not len(p): continue new_path = path - if not path.endswith('/'): - new_path += '/' + if not path.endswith("/"): + new_path += "/" new_path += p group = self.get_node(new_path) if group is None: group = self._handle.create_group(path, p) path = new_path - s = self._create_storer(group, format, value, append=append, - encoding=encoding, **kwargs) + s = self._create_storer( + group, format, value, append=append, encoding=encoding, **kwargs + ) if append: # raise if we are trying to append to a Fixed format, # or a table that exists (and we are putting) - if (not s.is_table or - (s.is_table and format == 'fixed' and s.is_exists)): - raise ValueError('Can only append to Tables') + if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): + raise ValueError("Can only append to Tables") if not s.is_exists: s.set_object_info() else: s.set_object_info() if not s.is_table and complib: - raise ValueError( - 'Compression not supported on Fixed format stores' - ) + raise ValueError("Compression not supported on Fixed format stores") # write the object s.write(obj=value, append=append, complib=complib, **kwargs) @@ -1435,8 +1546,19 @@ class TableIterator: kwargs : the passed kwargs """ - def __init__(self, store, s, func, where, nrows, start=None, stop=None, - iterator=False, chunksize=None, auto_close=False): + def __init__( + self, + store, + s, + func, + where, + nrows, + start=None, + stop=None, + iterator=False, + chunksize=None, + auto_close=False, + ): self.store = store self.s = s self.func = func @@ -1491,8 +1613,7 @@ def get_result(self, coordinates=False): # return the actual iterator if self.chunksize is not None: if not self.s.is_table: - raise TypeError( - "can only use an iterator or chunksize on a table") + raise TypeError("can only use an iterator or chunksize on a table") self.coordinates = self.s.read_coordinates(where=self.where) @@ -1500,8 +1621,9 @@ def get_result(self, coordinates=False): # if specified read via coordinates (necessary for multiple selections if coordinates: - where = self.s.read_coordinates(where=self.where, start=self.start, - stop=self.stop) + where = self.s.read_coordinates( + where=self.where, start=self.start, stop=self.stop + ) else: where = self.where @@ -1525,13 +1647,27 @@ class IndexCol: pos : the position in the pytables """ + is_an_indexable = True is_data_indexable = True - _info_fields = ['freq', 'tz', 'index_name'] - - def __init__(self, values=None, kind=None, typ=None, cname=None, - itemsize=None, name=None, axis=None, kind_attr=None, - pos=None, freq=None, tz=None, index_name=None, **kwargs): + _info_fields = ["freq", "tz", "index_name"] + + def __init__( + self, + values=None, + kind=None, + typ=None, + cname=None, + itemsize=None, + name=None, + axis=None, + kind_attr=None, + pos=None, + freq=None, + tz=None, + index_name=None, + **kwargs + ): self.values = values self.kind = kind self.typ = typ @@ -1581,20 +1717,21 @@ def set_table(self, 
table): def __repr__(self): temp = tuple( - map(pprint_thing, - (self.name, - self.cname, - self.axis, - self.pos, - self.kind))) - return ','.join(("{key}->{value}".format(key=key, value=value) - for key, value in zip( - ['name', 'cname', 'axis', 'pos', 'kind'], temp))) + map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) + ) + return ",".join( + ( + "{key}->{value}".format(key=key, value=value) + for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) + ) + ) def __eq__(self, other): """ compare 2 col items """ - return all(getattr(self, a, None) == getattr(other, a, None) - for a in ['name', 'cname', 'axis', 'pos']) + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "axis", "pos"] + ) def __ne__(self, other): return not self.__eq__(other) @@ -1620,8 +1757,7 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1632,9 +1768,9 @@ def convert(self, values, nan_rep, encoding, errors, start=None, kwargs = dict() if self.freq is not None: - kwargs['freq'] = _ensure_decoded(self.freq) + kwargs["freq"] = _ensure_decoded(self.freq) if self.index_name is not None: - kwargs['name'] = _ensure_decoded(self.index_name) + kwargs["name"] = _ensure_decoded(self.index_name) # making an Index instance could throw a number of different errors try: self.values = Index(values, **kwargs) @@ -1642,8 +1778,8 @@ def convert(self, values, nan_rep, encoding, errors, start=None, # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') - if 'freq' in kwargs: - kwargs['freq'] = None + if "freq" in kwargs: + kwargs["freq"] = None self.values = Index(values, **kwargs) self.values = _set_tz(self.values, self.tz) @@ -1680,14 +1816,13 @@ def maybe_set_size(self, min_itemsize=None): """ maybe set a string col itemsize: min_itemsize can be an integer or a dict with this columns name with an integer size """ - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) if min_itemsize is not None and self.typ.itemsize < min_itemsize: - self.typ = _tables( - ).StringCol(itemsize=min_itemsize, pos=self.pos) + self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) def validate(self, handler, append): self.validate_names() @@ -1707,7 +1842,7 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": c = self.col if c is not None: if itemsize is None: @@ -1718,8 +1853,9 @@ def validate_col(self, itemsize=None): "[{cname}] column but\nthis column has a limit of " "[{c_itemsize}]!\nConsider using min_itemsize to " "preset the sizes on these columns".format( - itemsize=itemsize, cname=self.cname, - c_itemsize=c.itemsize)) + itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize + ) + ) return c.itemsize return None @@ -1731,8 +1867,8 @@ def validate_attr(self, append): if existing_kind is not None and existing_kind != self.kind: raise TypeError( "incompatible kind in col [{existing} - " - 
"{self_kind}]".format( - existing=existing_kind, self_kind=self.kind)) + "{self_kind}]".format(existing=existing_kind, self_kind=self.kind) + ) def update_info(self, info): """ set/update the info for this indexable with the key/value @@ -1747,7 +1883,7 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq', 'index_name']: + if key in ["freq", "index_name"]: ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning, stacklevel=6) @@ -1760,8 +1896,12 @@ def update_info(self, info): "invalid info for [{name}] for [{key}], " "existing_value [{existing_value}] conflicts with " "new value [{value}]".format( - name=self.name, key=key, - existing_value=existing_value, value=value)) + name=self.name, + key=key, + existing_value=existing_value, + value=value, + ) + ) else: if value is not None or existing_value is not None: idx[key] = value @@ -1788,13 +1928,18 @@ def read_metadata(self, handler): def validate_metadata(self, handler): """ validate that kind=category does not change the categories """ - if self.meta == 'category': + if self.meta == "category": new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) - if (new_metadata is not None and cur_metadata is not None and - not array_equivalent(new_metadata, cur_metadata)): - raise ValueError("cannot append a categorical with " - "different categories to the existing") + if ( + new_metadata is not None + and cur_metadata is not None + and not array_equivalent(new_metadata, cur_metadata) + ): + raise ValueError( + "cannot append a categorical with " + "different categories to the existing" + ) def write_metadata(self, handler): """ set the meta data """ @@ -1810,8 +1955,7 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """ set the values from this selection: take = take ownership Parameters @@ -1829,8 +1973,7 @@ def convert(self, values, nan_rep, encoding, errors, start=None, """ start = start if start is not None else 0 - stop = (min(stop, self.table.nrows) - if stop is not None else self.table.nrows) + stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows self.values = Int64Index(np.arange(stop - start)) return self @@ -1855,17 +1998,17 @@ class DataCol(IndexCol): meta : a string description of the metadata metadata : the actual metadata """ + is_an_indexable = False is_data_indexable = False - _info_fields = ['tz', 'ordered'] + _info_fields = ["tz", "ordered"] @classmethod - def create_for_block( - cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: - cname = name or 'values_block_{idx}'.format(idx=i) + cname = name or "values_block_{idx}".format(idx=i) if name is None: name = cname @@ -1881,34 +2024,45 @@ def create_for_block( return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, - cname=None, data=None, meta=None, metadata=None, - block=None, **kwargs): - super().__init__(values=values, kind=kind, typ=typ, cname=cname, - **kwargs) + def __init__( + self, + values=None, + kind=None, + typ=None, + cname=None, + data=None, + meta=None, + metadata=None, + block=None, + **kwargs + ): + 
super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = '{name}_dtype'.format(name=self.name) + self.dtype_attr = "{name}_dtype".format(name=self.name) self.meta = meta - self.meta_attr = '{name}_meta'.format(name=self.name) + self.meta_attr = "{name}_meta".format(name=self.name) self.set_data(data) self.set_metadata(metadata) def __repr__(self): temp = tuple( - map(pprint_thing, - (self.name, - self.cname, - self.dtype, - self.kind, - self.shape))) - return ','.join(("{key}->{value}".format(key=key, value=value) - for key, value in zip( - ['name', 'cname', 'dtype', 'kind', 'shape'], temp))) + map( + pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) + ) + ) + return ",".join( + ( + "{key}->{value}".format(key=key, value=value) + for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) + ) + ) def __eq__(self, other): """ compare 2 col items """ - return all(getattr(self, a, None) == getattr(other, a, None) - for a in ['name', 'cname', 'dtype', 'pos']) + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "dtype", "pos"] + ) def set_data(self, data, dtype=None): self.data = data @@ -1937,39 +2091,49 @@ def set_kind(self): if self.dtype is not None: dtype = _ensure_decoded(self.dtype) - if dtype.startswith('string') or dtype.startswith('bytes'): - self.kind = 'string' - elif dtype.startswith('float'): - self.kind = 'float' - elif dtype.startswith('complex'): - self.kind = 'complex' - elif dtype.startswith('int') or dtype.startswith('uint'): - self.kind = 'integer' - elif dtype.startswith('date'): - self.kind = 'datetime' - elif dtype.startswith('timedelta'): - self.kind = 'timedelta' - elif dtype.startswith('bool'): - self.kind = 'bool' + if dtype.startswith("string") or dtype.startswith("bytes"): + self.kind = "string" + elif dtype.startswith("float"): + self.kind = "float" + elif dtype.startswith("complex"): + self.kind = "complex" + elif dtype.startswith("int") or dtype.startswith("uint"): + self.kind = "integer" + elif dtype.startswith("date"): + self.kind = "datetime" + elif dtype.startswith("timedelta"): + self.kind = "timedelta" + elif dtype.startswith("bool"): + self.kind = "bool" else: raise AssertionError( "cannot interpret dtype of [{dtype}] in [{obj}]".format( - dtype=dtype, obj=self)) + dtype=dtype, obj=self + ) + ) # set my typ if we need if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, block_items, existing_col, min_itemsize, - nan_rep, info, encoding=None, errors='strict'): + def set_atom( + self, + block, + block_items, + existing_col, + min_itemsize, + nan_rep, + info, + encoding=None, + errors="strict", + ): """ create and setup my atom from the block b """ self.values = list(block_items) # short-cut certain block types if block.is_categorical: - return self.set_atom_categorical(block, items=block_items, - info=info) + return self.set_atom_categorical(block, items=block_items, info=info) elif block.is_datetimetz: return self.set_atom_datetime64tz(block, info=info) elif block.is_datetime: @@ -1982,32 +2146,31 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, dtype = block.dtype.name inferred_type = lib.infer_dtype(block.values, skipna=False) - if inferred_type == 'date': - raise TypeError( - "[date] is not implemented as a table column") - elif inferred_type == 'datetime': + if inferred_type == "date": + raise TypeError("[date] is not implemented as a table column") 
+ elif inferred_type == "datetime": # after 8260 # this only would be hit for a mutli-timezone dtype # which is an error raise TypeError( - "too many timezones in this block, create separate " - "data columns" + "too many timezones in this block, create separate " "data columns" ) - elif inferred_type == 'unicode': - raise TypeError( - "[unicode] is not implemented as a table column") + elif inferred_type == "unicode": + raise TypeError("[unicode] is not implemented as a table column") # this is basically a catchall; if say a datetime64 has nans then will # end up here ### - elif inferred_type == 'string' or dtype == 'object': + elif inferred_type == "string" or dtype == "object": self.set_atom_string( - block, block_items, + block, + block_items, existing_col, min_itemsize, nan_rep, encoding, - errors) + errors, + ) # set as a data block else: @@ -2016,8 +2179,9 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, block_items, existing_col, min_itemsize, - nan_rep, encoding, errors): + def set_atom_string( + self, block, block_items, existing_col, min_itemsize, nan_rep, encoding, errors + ): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False) @@ -2027,7 +2191,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel(), skipna=False) - if inferred_type != 'string': + if inferred_type != "string": # we cannot serialize this data, so report an exception on a column # by column basis @@ -2035,11 +2199,12 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, col = block.iget(i) inferred_type = lib.infer_dtype(col.ravel(), skipna=False) - if inferred_type != 'string': + if inferred_type != "string": raise TypeError( "Cannot serialize the column [{item}] because\n" "its data contents are [{type}] object dtype".format( - item=item, type=inferred_type) + item=item, type=inferred_type + ) ) # itemsize is the maximum length of a string (along any dimension) @@ -2048,8 +2213,9 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, # specified min_itemsize? 
if isinstance(min_itemsize, dict): - min_itemsize = int(min_itemsize.get( - self.name) or min_itemsize.get('values') or 0) + min_itemsize = int( + min_itemsize.get(self.name) or min_itemsize.get("values") or 0 + ) itemsize = max(min_itemsize or 0, itemsize) # check for column in the values conflicts @@ -2059,16 +2225,17 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, itemsize = eci self.itemsize = itemsize - self.kind = 'string' + self.kind = "string" self.typ = self.get_atom_string(block, itemsize) - self.set_data(data_converted.astype( - '|S{size}'.format(size=itemsize), copy=False)) + self.set_data( + data_converted.astype("|S{size}".format(size=itemsize), copy=False) + ) def get_atom_coltype(self, kind=None): """ return the PyTables column class for this column """ if kind is None: kind = self.kind - if self.kind.startswith('uint'): + if self.kind.startswith("uint"): col_name = "UInt{name}Col".format(name=kind[4:]) else: col_name = "{name}Col".format(name=kind.capitalize()) @@ -2080,9 +2247,8 @@ def get_atom_data(self, block, kind=None): def set_atom_complex(self, block): self.kind = block.dtype.name - itemsize = int(self.kind.split('complex')[-1]) // 8 - self.typ = _tables().ComplexCol( - itemsize=itemsize, shape=block.shape[0]) + itemsize = int(self.kind.split("complex")[-1]) // 8 + self.typ = _tables().ComplexCol(itemsize=itemsize, shape=block.shape[0]) self.set_data(block.values.astype(self.typ.type, copy=False)) def set_atom_data(self, block): @@ -2096,7 +2262,7 @@ def set_atom_categorical(self, block, items, info=None, values=None): values = block.values codes = values.codes - self.kind = 'integer' + self.kind = "integer" self.dtype = codes.dtype.name if values.ndim > 1: raise NotImplementedError("only support 1-d categoricals") @@ -2109,7 +2275,7 @@ def set_atom_categorical(self, block, items, info=None, values=None): self.set_data(_block_shape(codes)) # write the categories - self.meta = 'category' + self.meta = "category" self.set_metadata(block.values.categories) # update the info @@ -2119,11 +2285,11 @@ def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) def set_atom_datetime64(self, block, values=None): - self.kind = 'datetime64' + self.kind = "datetime64" self.typ = self.get_atom_datetime64(block) if values is None: - values = block.values.view('i8') - self.set_data(values, 'datetime64') + values = block.values.view("i8") + self.set_data(values, "datetime64") def set_atom_datetime64tz(self, block, info, values=None): @@ -2137,23 +2303,23 @@ def set_atom_datetime64tz(self, block, info, values=None): self.tz = _get_tz(block.values.tz) self.update_info(info) - self.kind = 'datetime64' + self.kind = "datetime64" self.typ = self.get_atom_datetime64(block) - self.set_data(values, 'datetime64') + self.set_data(values, "datetime64") def get_atom_timedelta64(self, block): return _tables().Int64Col(shape=block.shape[0]) def set_atom_timedelta64(self, block, values=None): - self.kind = 'timedelta64' + self.kind = "timedelta64" self.typ = self.get_atom_timedelta64(block) if values is None: - values = block.values.view('i8') - self.set_data(values, 'timedelta64') + values = block.values.view("i8") + self.set_data(values, "timedelta64") @property def shape(self): - return getattr(self.data, 'shape', None) + return getattr(self.data, "shape", None) @property def cvalues(self): @@ -2164,19 +2330,19 @@ def validate_attr(self, append): """validate that we have the same order as the existing & same dtype""" if append: 
existing_fields = getattr(self.attrs, self.kind_attr, None) - if (existing_fields is not None and - existing_fields != list(self.values)): - raise ValueError("appended items do not match existing items" - " in table!") + if existing_fields is not None and existing_fields != list(self.values): + raise ValueError( + "appended items do not match existing items" " in table!" + ) existing_dtype = getattr(self.attrs, self.dtype_attr, None) - if (existing_dtype is not None and - existing_dtype != self.dtype): - raise ValueError("appended items dtype do not match existing " - "items dtype in table!") + if existing_dtype is not None and existing_dtype != self.dtype: + raise ValueError( + "appended items dtype do not match existing " + "items dtype in table!" + ) - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -2195,27 +2361,28 @@ def convert(self, values, nan_rep, encoding, errors, start=None, dtype = _ensure_decoded(self.dtype) # reverse converts - if dtype == 'datetime64': + if dtype == "datetime64": # recreate with tz if indicated self.data = _set_tz(self.data, self.tz, coerce=True) - elif dtype == 'timedelta64': - self.data = np.asarray(self.data, dtype='m8[ns]') - elif dtype == 'date': + elif dtype == "timedelta64": + self.data = np.asarray(self.data, dtype="m8[ns]") + elif dtype == "date": try: self.data = np.asarray( - [date.fromordinal(v) for v in self.data], dtype=object) + [date.fromordinal(v) for v in self.data], dtype=object + ) except ValueError: self.data = np.asarray( - [date.fromtimestamp(v) for v in self.data], - dtype=object) - elif dtype == 'datetime': + [date.fromtimestamp(v) for v in self.data], dtype=object + ) + elif dtype == "datetime": self.data = np.asarray( - [datetime.fromtimestamp(v) for v in self.data], - dtype=object) + [datetime.fromtimestamp(v) for v in self.data], dtype=object + ) - elif meta == 'category': + elif meta == "category": # we have a categorical categories = self.metadata @@ -2236,21 +2403,22 @@ def convert(self, values, nan_rep, encoding, errors, start=None, categories = categories[~mask] codes[codes != -1] -= mask.astype(int).cumsum().values - self.data = Categorical.from_codes(codes, - categories=categories, - ordered=self.ordered) + self.data = Categorical.from_codes( + codes, categories=categories, ordered=self.ordered + ) else: try: self.data = self.data.astype(dtype, copy=False) except TypeError: - self.data = self.data.astype('O', copy=False) + self.data = self.data.astype("O", copy=False) # convert nans / decode - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding, errors=errors) + self.data, nan_rep=nan_rep, encoding=encoding, errors=errors + ) return self @@ -2272,6 +2440,7 @@ def set_attr(self): class DataIndexableCol(DataCol): """ represent a data column that can be indexed """ + is_data_indexable = True def validate_names(self): @@ -2311,13 +2480,13 @@ class Fixed: parent : my parent HDFStore group : the group node where the table resides """ + pandas_kind = None # type: str obj_type = None # type: Type[Union[DataFrame, Series]] ndim = None # type: int is_table = False - def __init__(self, parent, group, encoding=None, errors='strict', - **kwargs): + def __init__(self, parent, group, encoding=None, errors="strict", 
**kwargs): self.parent = parent self.group = group self.encoding = _ensure_encoding(encoding) @@ -2326,15 +2495,13 @@ def __init__(self, parent, group, encoding=None, errors='strict', @property def is_old_version(self): - return (self.version[0] <= 0 and self.version[1] <= 10 and - self.version[2] < 1) + return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 def set_version(self): """ compute and set our version """ - version = _ensure_decoded( - getattr(self.group._v_attrs, 'pandas_version', None)) + version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: - self.version = tuple(int(x) for x in version.split('.')) + self.version = tuple(int(x) for x in version.split(".")) if len(self.version) == 2: self.version = self.version + (0,) except AttributeError: @@ -2342,12 +2509,11 @@ def set_version(self): @property def pandas_type(self): - return _ensure_decoded(getattr(self.group._v_attrs, - 'pandas_type', None)) + return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) @property def format_type(self): - return 'fixed' + return "fixed" def __repr__(self): """ return a pretty representation of myself """ @@ -2355,10 +2521,10 @@ def __repr__(self): s = self.shape if s is not None: if isinstance(s, (list, tuple)): - s = "[{shape}]".format( - shape=','.join(pprint_thing(x) for x in s)) + s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s)) return "{type:12.12} (shape->{shape})".format( - type=self.pandas_type, shape=s) + type=self.pandas_type, shape=s + ) return self.pandas_type def set_object_info(self): @@ -2426,7 +2592,7 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable, 'nrows', None) + return getattr(self.storable, "nrows", None) def validate(self, other): """ validate against an existing storable """ @@ -2450,11 +2616,13 @@ def infer_axes(self): def read(self, **kwargs): raise NotImplementedError( - "cannot read on an abstract storer: subclasses should implement") + "cannot read on an abstract storer: subclasses should implement" + ) def write(self, **kwargs): raise NotImplementedError( - "cannot write on an abstract storer: sublcasses should implement") + "cannot write on an abstract storer: sublcasses should implement" + ) def delete(self, where=None, start=None, stop=None, **kwargs): """ @@ -2471,13 +2639,14 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): """ a generified fixed version """ - _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} + + _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} attributes = [] # type: List[str] # indexer helpders def _class_to_alias(self, cls): - return self._index_type_map.get(cls, '') + return self._index_type_map.get(cls, "") def _alias_to_class(self, alias): if isinstance(alias, type): # pragma: no cover @@ -2487,17 +2656,20 @@ def _alias_to_class(self, alias): def _get_index_factory(self, klass): if klass == DatetimeIndex: + def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - result = DatetimeIndex._simple_new(values.values, name=None, - freq=freq) + result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) if tz is not None: - result = result.tz_localize('UTC').tz_convert(tz) + result = result.tz_localize("UTC").tz_convert(tz) return result + return f elif klass == PeriodIndex: + def f(values, freq=None, tz=None): return 
PeriodIndex._simple_new(values, name=None, freq=freq) + return f return klass @@ -2509,16 +2681,20 @@ def validate_read(self, kwargs): """ kwargs = copy.copy(kwargs) - columns = kwargs.pop('columns', None) + columns = kwargs.pop("columns", None) if columns is not None: - raise TypeError("cannot pass a column specification when reading " - "a Fixed format store. this store must be " - "selected in its entirety") - where = kwargs.pop('where', None) + raise TypeError( + "cannot pass a column specification when reading " + "a Fixed format store. this store must be " + "selected in its entirety" + ) + where = kwargs.pop("where", None) if where is not None: - raise TypeError("cannot pass a where specification when reading " - "from a Fixed format store. this store must be " - "selected in its entirety") + raise TypeError( + "cannot pass a where specification when reading " + "from a Fixed format store. this store must be " + "selected in its entirety" + ) return kwargs @property @@ -2532,8 +2708,8 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) - self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict')) + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) for n in self.attributes: setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) @@ -2543,16 +2719,17 @@ def write(self, obj, **kwargs): def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ import tables + node = getattr(self.group, key) attrs = node._v_attrs - transposed = getattr(attrs, 'transposed', False) + transposed = getattr(attrs, "transposed", False) if isinstance(node, tables.VLArray): ret = node[0][start:stop] else: - dtype = getattr(attrs, 'value_type', None) - shape = getattr(attrs, 'shape', None) + dtype = getattr(attrs, "value_type", None) + shape = getattr(attrs, "shape", None) if shape is not None: # length 0 axis @@ -2560,13 +2737,13 @@ def read_array(self, key, start=None, stop=None): else: ret = node[start:stop] - if dtype == 'datetime64': + if dtype == "datetime64": # reconstruct a timezone if indicated - ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True) + ret = _set_tz(ret, getattr(attrs, "tz", None), coerce=True) - elif dtype == 'timedelta64': - ret = np.asarray(ret, dtype='m8[ns]') + elif dtype == "timedelta64": + ret = np.asarray(ret, dtype="m8[ns]") if transposed: return ret.T @@ -2574,37 +2751,37 @@ def read_array(self, key, start=None, stop=None): return ret def read_index(self, key, **kwargs): - variety = _ensure_decoded( - getattr(self.attrs, '{key}_variety'.format(key=key))) + variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key))) - if variety == 'multi': + if variety == "multi": return self.read_multi_index(key, **kwargs) - elif variety == 'block': + elif variety == "block": return self.read_block_index(key, **kwargs) - elif variety == 'sparseint': + elif variety == "sparseint": return self.read_sparse_intindex(key, **kwargs) - elif variety == 'regular': + elif variety == "regular": _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover raise TypeError( - 'unrecognized index variety: {variety}'.format( - variety=variety)) + "unrecognized index variety: {variety}".format(variety=variety) + ) def write_index(self, key, index): if isinstance(index, MultiIndex): 
- setattr(self.attrs, '{key}_variety'.format(key=key), 'multi') + setattr(self.attrs, "{key}_variety".format(key=key), "multi") self.write_multi_index(key, index) elif isinstance(index, BlockIndex): - setattr(self.attrs, '{key}_variety'.format(key=key), 'block') + setattr(self.attrs, "{key}_variety".format(key=key), "block") self.write_block_index(key, index) elif isinstance(index, IntIndex): - setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint') + setattr(self.attrs, "{key}_variety".format(key=key), "sparseint") self.write_sparse_intindex(key, index) else: - setattr(self.attrs, '{key}_variety'.format(key=key), 'regular') - converted = _convert_index(index, self.encoding, self.errors, - self.format_type).set_name('index') + setattr(self.attrs, "{key}_variety".format(key=key), "regular") + converted = _convert_index( + index, self.encoding, self.errors, self.format_type + ).set_name("index") self.write_array(key, converted.values) @@ -2615,113 +2792,124 @@ def write_index(self, key, index): if isinstance(index, (DatetimeIndex, PeriodIndex)): node._v_attrs.index_class = self._class_to_alias(type(index)) - if hasattr(index, 'freq'): + if hasattr(index, "freq"): node._v_attrs.freq = index.freq - if hasattr(index, 'tz') and index.tz is not None: + if hasattr(index, "tz") and index.tz is not None: node._v_attrs.tz = _get_tz(index.tz) def write_block_index(self, key, index): - self.write_array('{key}_blocs'.format(key=key), index.blocs) - self.write_array('{key}_blengths'.format(key=key), index.blengths) - setattr(self.attrs, '{key}_length'.format(key=key), index.length) + self.write_array("{key}_blocs".format(key=key), index.blocs) + self.write_array("{key}_blengths".format(key=key), index.blengths) + setattr(self.attrs, "{key}_length".format(key=key), index.length) def read_block_index(self, key, **kwargs): - length = getattr(self.attrs, '{key}_length'.format(key=key)) - blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs) - blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs) + length = getattr(self.attrs, "{key}_length".format(key=key)) + blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) + blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): - self.write_array('{key}_indices'.format(key=key), index.indices) - setattr(self.attrs, '{key}_length'.format(key=key), index.length) + self.write_array("{key}_indices".format(key=key), index.indices) + setattr(self.attrs, "{key}_length".format(key=key), index.length) def read_sparse_intindex(self, key, **kwargs): - length = getattr(self.attrs, '{key}_length'.format(key=key)) - indices = self.read_array('{key}_indices'.format(key=key), **kwargs) + length = getattr(self.attrs, "{key}_length".format(key=key)) + indices = self.read_array("{key}_indices".format(key=key), **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): - setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels) + setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels) - for i, (lev, level_codes, name) in enumerate(zip(index.levels, - index.codes, - index.names)): + for i, (lev, level_codes, name) in enumerate( + zip(index.levels, index.codes, index.names) + ): # write the level if is_extension_type(lev): - raise NotImplementedError("Saving a MultiIndex with an " - "extension dtype is not supported.") - level_key = '{key}_level{idx}'.format(key=key, idx=i) - conv_level = 
_convert_index(lev, self.encoding, self.errors, - self.format_type).set_name(level_key) + raise NotImplementedError( + "Saving a MultiIndex with an " "extension dtype is not supported." + ) + level_key = "{key}_level{idx}".format(key=key, idx=i) + conv_level = _convert_index( + lev, self.encoding, self.errors, self.format_type + ).set_name(level_key) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name - setattr(node._v_attrs, '{key}_name{name}'.format( - key=key, name=name), name) + setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name) # write the labels - label_key = '{key}_label{idx}'.format(key=key, idx=i) + label_key = "{key}_label{idx}".format(key=key, idx=i) self.write_array(label_key, level_codes) def read_multi_index(self, key, **kwargs): - nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key)) + nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) levels = [] codes = [] names = [] for i in range(nlevels): - level_key = '{key}_level{idx}'.format(key=key, idx=i) - name, lev = self.read_index_node(getattr(self.group, level_key), - **kwargs) + level_key = "{key}_level{idx}".format(key=key, idx=i) + name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs) levels.append(lev) names.append(name) - label_key = '{key}_label{idx}'.format(key=key, idx=i) + label_key = "{key}_label{idx}".format(key=key, idx=i) level_codes = self.read_array(label_key, **kwargs) codes.append(level_codes) - return MultiIndex(levels=levels, codes=codes, names=names, - verify_integrity=True) + return MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=True + ) def read_index_node(self, node, start=None, stop=None): data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
- if ('shape' in node._v_attrs and - self._is_empty_array(getattr(node._v_attrs, 'shape'))): - data = np.empty(getattr(node._v_attrs, 'shape'), - dtype=getattr(node._v_attrs, 'value_type')) + if "shape" in node._v_attrs and self._is_empty_array( + getattr(node._v_attrs, "shape") + ): + data = np.empty( + getattr(node._v_attrs, "shape"), + dtype=getattr(node._v_attrs, "value_type"), + ) kind = _ensure_decoded(node._v_attrs.kind) name = None - if 'name' in node._v_attrs: + if "name" in node._v_attrs: name = _ensure_str(node._v_attrs.name) name = _ensure_decoded(name) - index_class = self._alias_to_class(_ensure_decoded( - getattr(node._v_attrs, 'index_class', ''))) + index_class = self._alias_to_class( + _ensure_decoded(getattr(node._v_attrs, "index_class", "")) + ) factory = self._get_index_factory(index_class) kwargs = {} - if 'freq' in node._v_attrs: - kwargs['freq'] = node._v_attrs['freq'] - - if 'tz' in node._v_attrs: - kwargs['tz'] = node._v_attrs['tz'] - - if kind in ('date', 'datetime'): - index = factory(_unconvert_index(data, kind, - encoding=self.encoding, - errors=self.errors), - dtype=object, **kwargs) + if "freq" in node._v_attrs: + kwargs["freq"] = node._v_attrs["freq"] + + if "tz" in node._v_attrs: + kwargs["tz"] = node._v_attrs["tz"] + + if kind in ("date", "datetime"): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=object, + **kwargs + ) else: - index = factory(_unconvert_index(data, kind, - encoding=self.encoding, - errors=self.errors), **kwargs) + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs + ) index.name = name @@ -2749,11 +2937,13 @@ def write_array(self, key, value, items=None): transposed = False if is_categorical_dtype(value): - raise NotImplementedError('Cannot store a category dtype in ' - 'a HDF5 dataset that uses format=' - '"fixed". Use format="table".') + raise NotImplementedError( + "Cannot store a category dtype in " + "a HDF5 dataset that uses format=" + '"fixed". Use format="table".' + ) if not empty_array: - if hasattr(value, 'T'): + if hasattr(value, "T"): # ExtensionArrays (1d) may not have transpose. 
value = value.T transposed = True @@ -2769,9 +2959,9 @@ def write_array(self, key, value, items=None): if atom is not None: # create an empty chunked array and fill it from value if not empty_array: - ca = self._handle.create_carray(self.group, key, atom, - value.shape, - filters=self._filters) + ca = self._handle.create_carray( + self.group, key, atom, value.shape, filters=self._filters + ) ca[:] = value getattr(self.group, key)._v_attrs.transposed = transposed @@ -2787,7 +2977,7 @@ def write_array(self, key, value, items=None): inferred_type = lib.infer_dtype(value.ravel(), skipna=False) if empty_array: pass - elif inferred_type == 'string': + elif inferred_type == "string": pass else: try: @@ -2797,32 +2987,26 @@ def write_array(self, key, value, items=None): ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning, stacklevel=7) - vlarr = self._handle.create_vlarray(self.group, key, - _tables().ObjectAtom()) + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) else: if empty_array: self.write_array_empty(key, value) else: if is_datetime64_dtype(value.dtype): - self._handle.create_array( - self.group, key, value.view('i8')) - getattr( - self.group, key)._v_attrs.value_type = 'datetime64' + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "datetime64" elif is_datetime64tz_dtype(value.dtype): # store as UTC # with a zone - self._handle.create_array(self.group, key, - value.asi8) + self._handle.create_array(self.group, key, value.asi8) node = getattr(self.group, key) node._v_attrs.tz = _get_tz(value.tz) - node._v_attrs.value_type = 'datetime64' + node._v_attrs.value_type = "datetime64" elif is_timedelta64_dtype(value.dtype): - self._handle.create_array( - self.group, key, value.view('i8')) - getattr( - self.group, key)._v_attrs.value_type = 'timedelta64' + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "timedelta64" else: self._handle.create_array(self.group, key, value) @@ -2830,117 +3014,122 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) data = node[start:stop] kind = node._v_attrs.kind - return _unconvert_index_legacy(data, kind, encoding=self.encoding, - errors=self.errors) + return _unconvert_index_legacy( + data, kind, encoding=self.encoding, errors=self.errors + ) class LegacySeriesFixed(LegacyFixed): - def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index_legacy('index') - values = self.read_array('values') + index = self.read_index_legacy("index") + values = self.read_array("values") return Series(values, index=index) class LegacyFrameFixed(LegacyFixed): - def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index_legacy('index') - columns = self.read_index_legacy('columns') - values = self.read_array('values') + index = self.read_index_legacy("index") + columns = self.read_index_legacy("columns") + values = self.read_array("values") return DataFrame(values, index=index, columns=columns) class SeriesFixed(GenericFixed): - pandas_kind = 'series' - attributes = ['name'] + pandas_kind = "series" + attributes = ["name"] @property def shape(self): try: - return len(getattr(self.group, 'values')), + return (len(getattr(self.group, "values")),) except (TypeError, AttributeError): return None def 
read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index('index', **kwargs) - values = self.read_array('values', **kwargs) + index = self.read_index("index", **kwargs) + values = self.read_array("values", **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): super().write(obj, **kwargs) - self.write_index('index', obj.index) - self.write_array('values', obj.values) + self.write_index("index", obj.index) + self.write_array("values", obj.values) self.attrs.name = obj.name class SparseFixed(GenericFixed): - def validate_read(self, kwargs): """ we don't support start, stop kwds in Sparse """ kwargs = super().validate_read(kwargs) - if 'start' in kwargs or 'stop' in kwargs: - raise NotImplementedError("start and/or stop are not supported " - "in fixed Sparse reading") + if "start" in kwargs or "stop" in kwargs: + raise NotImplementedError( + "start and/or stop are not supported " "in fixed Sparse reading" + ) return kwargs class SparseSeriesFixed(SparseFixed): - pandas_kind = 'sparse_series' - attributes = ['name', 'fill_value', 'kind'] + pandas_kind = "sparse_series" + attributes = ["name", "fill_value", "kind"] def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index('index') - sp_values = self.read_array('sp_values') - sp_index = self.read_index('sp_index') - return SparseSeries(sp_values, index=index, sparse_index=sp_index, - kind=self.kind or 'block', - fill_value=self.fill_value, - name=self.name) + index = self.read_index("index") + sp_values = self.read_array("sp_values") + sp_index = self.read_index("sp_index") + return SparseSeries( + sp_values, + index=index, + sparse_index=sp_index, + kind=self.kind or "block", + fill_value=self.fill_value, + name=self.name, + ) def write(self, obj, **kwargs): super().write(obj, **kwargs) - self.write_index('index', obj.index) - self.write_index('sp_index', obj.sp_index) - self.write_array('sp_values', obj.sp_values) + self.write_index("index", obj.index) + self.write_index("sp_index", obj.sp_index) + self.write_array("sp_values", obj.sp_values) self.attrs.name = obj.name self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind class SparseFrameFixed(SparseFixed): - pandas_kind = 'sparse_frame' - attributes = ['default_kind', 'default_fill_value'] + pandas_kind = "sparse_frame" + attributes = ["default_kind", "default_fill_value"] def read(self, **kwargs): kwargs = self.validate_read(kwargs) - columns = self.read_index('columns') + columns = self.read_index("columns") sdict = {} for c in columns: - key = 'sparse_series_{columns}'.format(columns=c) + key = "sparse_series_{columns}".format(columns=c) s = SparseSeriesFixed(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() - return SparseDataFrame(sdict, columns=columns, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value) + return SparseDataFrame( + sdict, + columns=columns, + default_kind=self.default_kind, + default_fill_value=self.default_fill_value, + ) def write(self, obj, **kwargs): """ write it as a collection of individual sparse series """ super().write(obj, **kwargs) for name, ss in obj.items(): - key = 'sparse_series_{name}'.format(name=name) + key = "sparse_series_{name}".format(name=name) if key not in self.group._v_children: node = self._handle.create_group(self.group, key) else: @@ -2949,11 +3138,11 @@ def write(self, obj, **kwargs): s.write(ss) self.attrs.default_fill_value = obj.default_fill_value self.attrs.default_kind = 
obj.default_kind - self.write_index('columns', obj.columns) + self.write_index("columns", obj.columns) class BlockManagerFixed(GenericFixed): - attributes = ['ndim', 'nblocks'] + attributes = ["ndim", "nblocks"] is_shape_reversed = False @property @@ -2964,16 +3153,16 @@ def shape(self): # items items = 0 for i in range(self.nblocks): - node = getattr(self.group, 'block{idx}_items'.format(idx=i)) - shape = getattr(node, 'shape', None) + node = getattr(self.group, "block{idx}_items".format(idx=i)) + shape = getattr(node, "shape", None) if shape is not None: items += shape[0] # data shape - node = getattr(self.group, 'block0_values') - shape = getattr(node, 'shape', None) + node = getattr(self.group, "block0_values") + shape = getattr(node, "shape", None) if shape is not None: - shape = list(shape[0:(ndim - 1)]) + shape = list(shape[0 : (ndim - 1)]) else: shape = [] @@ -2997,19 +3186,18 @@ def read(self, start=None, stop=None, **kwargs): for i in range(self.ndim): _start, _stop = (start, stop) if i == select_axis else (None, None) - ax = self.read_index('axis{idx}'.format( - idx=i), start=_start, stop=_stop) + ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in range(self.nblocks): - blk_items = self.read_index('block{idx}_items'.format(idx=i)) - values = self.read_array('block{idx}_values'.format(idx=i), - start=_start, stop=_stop) - blk = make_block(values, - placement=items.get_indexer(blk_items)) + blk_items = self.read_index("block{idx}_items".format(idx=i)) + values = self.read_array( + "block{idx}_values".format(idx=i), start=_start, stop=_stop + ) + blk = make_block(values, placement=items.get_indexer(blk_items)) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -3024,22 +3212,22 @@ def write(self, obj, **kwargs): for i, ax in enumerate(data.axes): if i == 0: if not ax.is_unique: - raise ValueError( - "Columns index has to be unique for fixed format") - self.write_index('axis{idx}'.format(idx=i), ax) + raise ValueError("Columns index has to be unique for fixed format") + self.write_index("axis{idx}".format(idx=i), ax) # Supporting mixed-type DataFrame objects...nontrivial self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 blk_items = data.items.take(blk.mgr_locs) - self.write_array('block{idx}_values'.format(idx=i), - blk.values, items=blk_items) - self.write_index('block{idx}_items'.format(idx=i), blk_items) + self.write_array( + "block{idx}_values".format(idx=i), blk.values, items=blk_items + ) + self.write_index("block{idx}_items".format(idx=i), blk_items) class FrameFixed(BlockManagerFixed): - pandas_kind = 'frame' + pandas_kind = "frame" obj_type = DataFrame @@ -3068,7 +3256,8 @@ class Table(Fixed): metadata : the names of the metadata columns """ - pandas_kind = 'wide_table' + + pandas_kind = "wide_table" table_type = None # type: str levels = 1 is_table = True @@ -3087,31 +3276,35 @@ def __init__(self, *args, **kwargs): @property def table_type_short(self): - return self.table_type.split('_')[0] + return self.table_type.split("_")[0] @property def format_type(self): - return 'table' + return "table" def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[{columns}]".format(columns=(','.join( - self.data_columns) if len(self.data_columns) else '')) + dc = ",dc->[{columns}]".format( + columns=(",".join(self.data_columns) if len(self.data_columns) else 
"") + ) - ver = '' + ver = "" if self.is_old_version: - ver = "[{version}]".format( - version='.'.join(str(x) for x in self.version)) + ver = "[{version}]".format(version=".".join(str(x) for x in self.version)) return ( "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows}," "ncols->{ncols},indexers->[{index_axes}]{dc})".format( - pandas_type=self.pandas_type, ver=ver, - table_type=self.table_type_short, nrows=self.nrows, + pandas_type=self.pandas_type, + ver=ver, + table_type=self.table_type_short, + nrows=self.nrows, ncols=self.ncols, - index_axes=(','.join(a.name for a in self.index_axes)), dc=dc - )) + index_axes=(",".join(a.name for a in self.index_axes)), + dc=dc, + ) + ) def __getitem__(self, c): """ return the axis for c """ @@ -3129,9 +3322,11 @@ def validate(self, other): raise TypeError( "incompatible table_type with existing " "[{other} - {self}]".format( - other=other.table_type, self=self.table_type)) + other=other.table_type, self=self.table_type + ) + ) - for c in ['index_axes', 'non_index_axes', 'values_axes']: + for c in ["index_axes", "non_index_axes", "values_axes"]: sv = getattr(self, c, None) ov = getattr(other, c, None) if sv != ov: @@ -3143,12 +3338,15 @@ def validate(self, other): raise ValueError( "invalid combinate of [{c}] on appending data " "[{sax}] vs current table [{oax}]".format( - c=c, sax=sax, oax=oax)) + c=c, sax=sax, oax=oax + ) + ) # should never get here raise Exception( "invalid combinate of [{c}] on appending data [{sv}] vs " - "current table [{ov}]".format(c=c, sv=sv, ov=ov)) + "current table [{ov}]".format(c=c, sv=sv, ov=ov) + ) @property def is_multi_index(self): @@ -3157,20 +3355,22 @@ def is_multi_index(self): def validate_metadata(self, existing): """ create / validate metadata """ - self.metadata = [ - c.name for c in self.values_axes if c.metadata is not None] + self.metadata = [c.name for c in self.values_axes if c.metadata is not None] def validate_multiindex(self, obj): """validate that we can store the multi-index; reset and return the new object """ - levels = [l if l is not None else "level_{0}".format(i) - for i, l in enumerate(obj.index.names)] + levels = [ + l if l is not None else "level_{0}".format(i) + for i, l in enumerate(obj.index.names) + ] try: return obj.reset_index(), levels except ValueError: - raise ValueError("duplicate names/columns in the multi-index when " - "storing as a table") + raise ValueError( + "duplicate names/columns in the multi-index when " "storing as a table" + ) @property def nrows_expected(self): @@ -3180,11 +3380,11 @@ def nrows_expected(self): @property def is_exists(self): """ has this table been created """ - return 'table' in self.group + return "table" in self.group @property def storable(self): - return getattr(self.group, 'table', None) + return getattr(self.group, "table", None) @property def table(self): @@ -3215,19 +3415,28 @@ def is_transposed(self): @property def data_orientation(self): """return a tuple of my permutated axes, non_indexable at the front""" - return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], - [int(a.axis) for a in self.index_axes])) + return tuple( + itertools.chain( + [int(a[0]) for a in self.non_index_axes], + [int(a.axis) for a in self.index_axes], + ) + ) def queryables(self): """ return a dict of the kinds allowable columns for this object """ # compute the values_axes queryables return dict( - [(a.cname, a) for a in self.index_axes] + - [(self.storage_obj_type._AXIS_NAMES[axis], None) - for axis, values in self.non_index_axes] + - 
[(v.cname, v) for v in self.values_axes - if v.name in set(self.data_columns)] + [(a.cname, a) for a in self.index_axes] + + [ + (self.storage_obj_type._AXIS_NAMES[axis], None) + for axis, values in self.non_index_axes + ] + + [ + (v.cname, v) + for v in self.values_axes + if v.name in set(self.data_columns) + ] ) def index_cols(self): @@ -3240,8 +3449,7 @@ def values_cols(self): def _get_metadata_path(self, key): """ return the metadata pathname for this key """ - return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, - key=key) + return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) def write_metadata(self, key, values): """ @@ -3254,13 +3462,18 @@ def write_metadata(self, key, values): """ values = Series(values) - self.parent.put(self._get_metadata_path(key), values, format='table', - encoding=self.encoding, errors=self.errors, - nan_rep=self.nan_rep) + self.parent.put( + self._get_metadata_path(key), + values, + format="table", + encoding=self.encoding, + errors=self.errors, + nan_rep=self.nan_rep, + ) def read_metadata(self, key): """ return the meta data array for this key """ - if getattr(getattr(self.group, 'meta', None), key, None) is not None: + if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None @@ -3284,34 +3497,24 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr( - self.attrs, 'non_index_axes', None) or [] - self.data_columns = getattr( - self.attrs, 'data_columns', None) or [] - self.info = getattr( - self.attrs, 'info', None) or dict() - self.nan_rep = getattr(self.attrs, 'nan_rep', None) - self.encoding = _ensure_encoding( - getattr(self.attrs, 'encoding', None)) - self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict')) - self.levels = getattr( - self.attrs, 'levels', None) or [] - self.index_axes = [ - a.infer(self) for a in self.indexables if a.is_an_indexable - ] + self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] + self.data_columns = getattr(self.attrs, "data_columns", None) or [] + self.info = getattr(self.attrs, "info", None) or dict() + self.nan_rep = getattr(self.attrs, "nan_rep", None) + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + self.levels = getattr(self.attrs, "levels", None) or [] + self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] self.values_axes = [ a.infer(self) for a in self.indexables if not a.is_an_indexable ] - self.metadata = getattr( - self.attrs, 'metadata', None) or [] + self.metadata = getattr(self.attrs, "metadata", None) or [] def validate_version(self, where=None): """ are we trying to operate on an old version? 
""" if where is not None: - if (self.version[0] <= 0 and self.version[1] <= 10 and - self.version[2] < 1): - ws = incompatibility_doc % '.'.join( - [str(x) for x in self.version]) + if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + ws = incompatibility_doc % ".".join([str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): @@ -3327,12 +3530,13 @@ def validate_min_itemsize(self, min_itemsize): for k, v in min_itemsize.items(): # ok, apply generally - if k == 'values': + if k == "values": continue if k not in q: raise ValueError( "min_itemsize has the key [{key}] which is not an axis or " - "data_column".format(key=k)) + "data_column".format(key=k) + ) @property def indexables(self): @@ -3342,10 +3546,12 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([ - IndexCol(name=name, axis=axis, pos=i) - for i, (axis, name) in enumerate(self.attrs.index_cols) - ]) + self._indexables.extend( + [ + IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols) + ] + ) # values columns dc = set(self.data_columns) @@ -3355,11 +3561,13 @@ def f(i, c): klass = DataCol if c in dc: klass = DataIndexableCol - return klass.create_for_block(i=i, name=c, pos=base_pos + i, - version=self.version) + return klass.create_for_block( + i=i, name=c, pos=base_pos + i, version=self.version + ) self._indexables.extend( - [f(i, c) for i, c in enumerate(self.attrs.values_cols)]) + [f(i, c) for i, c in enumerate(self.attrs.values_cols)] + ) return self._indexables @@ -3395,9 +3603,9 @@ def create_index(self, columns=None, optlevel=None, kind=None): kw = dict() if optlevel is not None: - kw['optlevel'] = optlevel + kw["optlevel"] = optlevel if kind is not None: - kw['kind'] = kind + kw["kind"] = kind table = self.table for c in columns: @@ -3413,23 +3621,24 @@ def create_index(self, columns=None, optlevel=None, kind=None): if kind is not None and cur_kind != kind: v.remove_index() else: - kw['kind'] = cur_kind + kw["kind"] = cur_kind if optlevel is not None and cur_optlevel != optlevel: v.remove_index() else: - kw['optlevel'] = cur_optlevel + kw["optlevel"] = cur_optlevel # create the index if not v.is_indexed: - if v.type.startswith('complex'): + if v.type.startswith("complex"): raise TypeError( - 'Columns containing complex values can be stored ' - 'but cannot' - ' be indexed when using table format. Either use ' - 'fixed format, set index=False, or do not include ' - 'the columns containing complex values to ' - 'data_columns when initializing the table.') + "Columns containing complex values can be stored " + "but cannot" + " be indexed when using table format. Either use " + "fixed format, set index=False, or do not include " + "the columns containing complex values to " + "data_columns when initializing the table." + ) v.create_index(**kw) def read_axes(self, where, **kwargs): @@ -3453,9 +3662,14 @@ def read_axes(self, where, **kwargs): a.set_info(self.info) # `kwargs` may contain `start` and `stop` arguments if passed to # `store.select()`. If set they determine the index size. 
- a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=self.errors, start=kwargs.get('start'), - stop=kwargs.get('stop')) + a.convert( + values, + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + start=kwargs.get("start"), + stop=kwargs.get("stop"), + ) return True @@ -3473,9 +3687,11 @@ def validate_data_columns(self, data_columns, min_itemsize): axis, axis_labels = self.non_index_axes[0] info = self.info.get(axis, dict()) - if info.get('type') == 'MultiIndex' and data_columns: - raise ValueError("cannot use a multi-index on axis [{0}] with " - "data_columns {1}".format(axis, data_columns)) + if info.get("type") == "MultiIndex" and data_columns: + raise ValueError( + "cannot use a multi-index on axis [{0}] with " + "data_columns {1}".format(axis, data_columns) + ) # evaluate the passed data_columns, True == use all columns # take only valide axis labels @@ -3488,16 +3704,27 @@ def validate_data_columns(self, data_columns, min_itemsize): if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ - k for k in min_itemsize.keys() - if k != 'values' and k not in existing_data_columns - ]) + data_columns.extend( + [ + k + for k in min_itemsize.keys() + if k != "values" and k not in existing_data_columns + ] + ) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, - data_columns=None, min_itemsize=None, **kwargs): + def create_axes( + self, + axes, + obj, + validate=True, + nan_rep=None, + data_columns=None, + min_itemsize=None, + **kwargs + ): """ create and return the axes legacy tables create an indexable column, indexable index, non-indexable fields @@ -3524,8 +3751,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, except KeyError: raise TypeError( "cannot properly create the storer for: [group->{group}," - "value->{value}]".format( - group=self.group._v_name, value=type(obj))) + "value->{value}]".format(group=self.group._v_name, value=type(obj)) + ) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -3546,7 +3773,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, # currently support on ndim-1 axes if len(axes) != self.ndim - 1: raise ValueError( - "currently only support ndim-1 indexers in an AppendableTable") + "currently only support ndim-1 indexers in an AppendableTable" + ) # create according to the new data self.non_index_axes = [] @@ -3554,7 +3782,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, # nan_representation if nan_rep is None: - nan_rep = 'nan' + nan_rep = "nan" self.nan_rep = nan_rep @@ -3564,9 +3792,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if i in axes: name = obj._AXIS_NAMES[i] - index_axes_map[i] = _convert_index( - a, self.encoding, self.errors, self.format_type - ).set_name(name).set_axis(i) + index_axes_map[i] = ( + _convert_index(a, self.encoding, self.errors, self.format_type) + .set_name(name) + .set_axis(i) + ) else: # we might be able to change the axes on the appending data if @@ -3575,18 +3805,20 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if existing_table is not None: indexer = len(self.non_index_axes) exist_axis = existing_table.non_index_axes[indexer][1] - if not array_equivalent(np.array(append_axis), - np.array(exist_axis)): + if not array_equivalent( + np.array(append_axis), np.array(exist_axis) + ): # ahah! 
-> reindex - if array_equivalent(np.array(sorted(append_axis)), - np.array(sorted(exist_axis))): + if array_equivalent( + np.array(sorted(append_axis)), np.array(sorted(exist_axis)) + ): append_axis = exist_axis # the non_index_axes info info = _get_info(self.info, i) - info['names'] = list(a.names) - info['type'] = a.__class__.__name__ + info["names"] = list(a.names) + info["type"] = a.__class__.__name__ self.non_index_axes.append((i, append_axis)) @@ -3614,12 +3846,10 @@ def get_blk_items(mgr, blocks): blk_items = get_blk_items(block_obj._data, blocks) if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns( - data_columns, min_itemsize) + data_columns = self.validate_data_columns(data_columns, min_itemsize) if len(data_columns): mgr = block_obj.reindex( - Index(axis_labels).difference(Index(data_columns)), - axis=axis + Index(axis_labels).difference(Index(data_columns)), axis=axis )._data blocks = list(mgr.blocks) @@ -3631,8 +3861,10 @@ def get_blk_items(mgr, blocks): # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = {tuple(b_items.tolist()): (b, b_items) - for b, b_items in zip(blocks, blk_items)} + by_items = { + tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items) + } new_blocks = [] new_blk_items = [] for ea in existing_table.values_axes: @@ -3645,8 +3877,9 @@ def get_blk_items(mgr, blocks): raise ValueError( "cannot match existing table structure for [{items}] " "on appending data".format( - items=(','.join(pprint_thing(item) for - item in items)))) + items=(",".join(pprint_thing(item) for item in items)) + ) + ) blocks = new_blocks blk_items = new_blk_items @@ -3659,8 +3892,7 @@ def get_blk_items(mgr, blocks): name = None # we have a data_column - if (data_columns and len(b_items) == 1 and - b_items[0] in data_columns): + if data_columns and len(b_items) == 1 and b_items[0] in data_columns: klass = DataIndexableCol name = b_items[0] self.data_columns.append(name) @@ -3674,21 +3906,24 @@ def get_blk_items(mgr, blocks): raise ValueError( "Incompatible appended table [{blocks}]" "with existing table [{table}]".format( - blocks=blocks, - table=existing_table.values_axes)) + blocks=blocks, table=existing_table.values_axes + ) + ) else: existing_col = None try: - col = klass.create_for_block( - i=i, name=name, version=self.version) - col.set_atom(block=b, block_items=b_items, - existing_col=existing_col, - min_itemsize=min_itemsize, - nan_rep=nan_rep, - encoding=self.encoding, - errors=self.errors, - info=self.info) + col = klass.create_for_block(i=i, name=name, version=self.version) + col.set_atom( + block=b, + block_items=b_items, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + info=self.info, + ) col.set_pos(j) self.values_axes.append(col) @@ -3698,7 +3933,9 @@ def get_blk_items(mgr, blocks): raise Exception( "cannot find the correct atom type -> " "[dtype->{name},items->{items}] {detail!s}".format( - name=b.dtype.name, items=b_items, detail=detail)) + name=b.dtype.name, items=b_items, detail=detail + ) + ) j += 1 # validate our min_itemsize @@ -3747,8 +3984,7 @@ def process_filter(field, filt): filt = filt.union(Index(self.levels)) takers = op(axis_values, filt) - return obj.loc._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: @@ 
-3761,38 +3997,42 @@ def process_filter(field, filt): if isinstance(obj, DataFrame): axis_number = 1 - axis_number takers = op(values, filt) - return obj.loc._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [{field}] for " - "filtering!".format(field=field)) + raise ValueError( + "cannot find the field [{field}] for " + "filtering!".format(field=field) + ) obj = process_filter(field, filt) return obj - def create_description(self, complib=None, complevel=None, - fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None + ): """ create the description of the table from the axes & values """ # provided expected rows if its passed if expectedrows is None: expectedrows = max(self.nrows_expected, 10000) - d = dict(name='table', expectedrows=expectedrows) + d = dict(name="table", expectedrows=expectedrows) # description from the axes & values - d['description'] = {a.cname: a.typ for a in self.axes} + d["description"] = {a.cname: a.typ for a in self.axes} if complib: if complevel is None: complevel = self._complevel or 9 filters = _tables().Filters( - complevel=complevel, complib=complib, - fletcher32=fletcher32 or self._fletcher32) - d['filters'] = filters + complevel=complevel, + complib=complib, + fletcher32=fletcher32 or self._fletcher32, + ) + d["filters"] = filters elif self._filters is not None: - d['filters'] = self._filters + d["filters"] = self._filters return d @@ -3809,15 +4049,14 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection( - self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) coords = self.selection.select_coords() if self.selection.filter is not None: for field, op, filt in self.selection.filter.format(): data = self.read_column( - field, start=coords.min(), stop=coords.max() + 1) - coords = coords[ - op(data.iloc[coords - coords.min()], filt).values] + field, start=coords.min(), stop=coords.max() + 1 + ) + coords = coords[op(data.iloc[coords - coords.min()], filt).values] return Index(coords) @@ -3834,8 +4073,7 @@ def read_column(self, column, where=None, start=None, stop=None): return False if where is not None: - raise TypeError("read_column does not currently accept a where " - "clause") + raise TypeError("read_column does not currently accept a where " "clause") # find the axes for a in self.axes: @@ -3844,20 +4082,27 @@ def read_column(self, column, where=None, start=None, stop=None): if not a.is_data_indexable: raise ValueError( "column [{column}] can not be extracted individually; " - "it is not data indexable".format(column=column)) + "it is not data indexable".format(column=column) + ) # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(_set_tz(a.convert(c[start:stop], - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors - ).take_data(), - a.tz, True), name=column) + return Series( + _set_tz( + a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + ).take_data(), + a.tz, + True, + ), + name=column, + ) - raise KeyError( - "column [{column}] not found in the table".format(column=column)) + raise KeyError("column [{column}] not found in the table".format(column=column)) class WORMTable(Table): @@ 
-3866,7 +4111,8 @@ class WORMTable(Table): table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk """ - table_type = 'worm' + + table_type = "worm" def read(self, **kwargs): """ read the indices and the indexing array, calculate offset rows and @@ -3889,12 +4135,13 @@ class LegacyTable(Table): that can be easily searched """ + _indexables = [ - IndexCol(name='index', axis=1, pos=0), - IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), - DataCol(name='fields', cname='values', kind_attr='fields', pos=2) + IndexCol(name="index", axis=1, pos=0), + IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), + DataCol(name="fields", cname="values", kind_attr="fields", pos=2), ] # type: Optional[List[IndexCol]] - table_type = 'legacy' + table_type = "legacy" ndim = 3 def write(self, **kwargs): @@ -3911,20 +4158,32 @@ def read(self, where=None, columns=None, **kwargs): class AppendableTable(LegacyTable): """ support the new appendable table formats """ - _indexables = None - table_type = 'appendable' - def write(self, obj, axes=None, append=False, complib=None, - complevel=None, fletcher32=None, min_itemsize=None, - chunksize=None, expectedrows=None, dropna=False, **kwargs): + _indexables = None + table_type = "appendable" + + def write( + self, + obj, + axes=None, + append=False, + complib=None, + complevel=None, + fletcher32=None, + min_itemsize=None, + chunksize=None, + expectedrows=None, + dropna=False, + **kwargs + ): if not append and self.is_exists: - self._handle.remove_node(self.group, 'table') + self._handle.remove_node(self.group, "table") # create the axes - self.create_axes(axes=axes, obj=obj, validate=append, - min_itemsize=min_itemsize, - **kwargs) + self.create_axes( + axes=axes, obj=obj, validate=append, min_itemsize=min_itemsize, **kwargs + ) for a in self.axes: a.validate(self, append) @@ -3932,10 +4191,12 @@ def write(self, obj, axes=None, append=False, complib=None, if not self.is_exists: # create the table - options = self.create_description(complib=complib, - complevel=complevel, - fletcher32=fletcher32, - expectedrows=expectedrows) + options = self.create_description( + complib=complib, + complevel=complevel, + fletcher32=fletcher32, + expectedrows=expectedrows, + ) # set the table attributes self.set_attrs() @@ -3973,7 +4234,7 @@ def write_data(self, chunksize, dropna=False): # column, otherwise ignore the mask mask = isna(a.data).all(axis=0) if isinstance(mask, np.ndarray): - masks.append(mask.astype('u1', copy=False)) + masks.append(mask.astype("u1", copy=False)) # consolidate masks if len(masks): @@ -3992,13 +4253,13 @@ def write_data(self, chunksize, dropna=False): # broadcast to all other indexes except myself if i > 0 and i < nindexes: - repeater = np.prod( - [indexes[bi].shape[0] for bi in range(0, i)]) + repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)]) idx = np.tile(idx, repeater) if i < nindexes - 1: - repeater = np.prod([indexes[bi].shape[0] - for bi in range(i + 1, nindexes)]) + repeater = np.prod( + [indexes[bi].shape[0] for bi in range(i + 1, nindexes)] + ) idx = np.repeat(idx, repeater) bindexes.append(idx) @@ -4006,8 +4267,7 @@ def write_data(self, chunksize, dropna=False): # transpose the values so first dimension is last # reshape the values if needed values = [a.take_data() for a in self.values_axes] - values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) - for v in values] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in 
values] bvalues = [] for i, v in enumerate(values): new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape @@ -4029,7 +4289,8 @@ def write_data(self, chunksize, dropna=False): rows, indexes=[a[start_i:end_i] for a in bindexes], mask=mask[start_i:end_i] if mask is not None else None, - values=[v[start_i:end_i] for v in bvalues]) + values=[v[start_i:end_i] for v in bvalues], + ) def write_data_chunk(self, rows, indexes, mask, values): """ @@ -4068,8 +4329,7 @@ def write_data_chunk(self, rows, indexes, mask, values): rows = rows[m] except Exception as detail: - raise Exception( - "cannot create row-data -> {detail}".format(detail=detail)) + raise Exception("cannot create row-data -> {detail}".format(detail=detail)) try: if len(rows): @@ -4077,8 +4337,8 @@ def write_data_chunk(self, rows, indexes, mask, values): self.table.flush() except Exception as detail: raise TypeError( - "tables cannot write this data -> {detail}".format( - detail=detail)) + "tables cannot write this data -> {detail}".format(detail=detail) + ) def delete(self, where=None, start=None, stop=None, **kwargs): @@ -4101,8 +4361,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection( - self, where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where, start=start, stop=stop, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -4131,8 +4390,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs): pg = groups.pop() for g in reversed(groups): rows = sorted_series.take(range(g, pg)) - table.remove_rows(start=rows[rows.index[0] - ], stop=rows[rows.index[-1]] + 1) + table.remove_rows( + start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 + ) pg = g self.table.flush() @@ -4143,8 +4403,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class AppendableFrameTable(AppendableTable): """ support the new appendable table formats """ - pandas_kind = 'frame_table' - table_type = 'appendable_frame' + + pandas_kind = "frame_table" + table_type = "appendable_frame" ndim = 2 obj_type = DataFrame # type: Type[Union[DataFrame, Series]] @@ -4163,29 +4424,32 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - info = (self.info.get(self.non_index_axes[0][0], dict()) - if len(self.non_index_axes) else dict()) + info = ( + self.info.get(self.non_index_axes[0][0], dict()) + if len(self.non_index_axes) + else dict() + ) index = self.index_axes[0].values frames = [] for a in self.values_axes: # we could have a multi-index constructor here # ensure_index doesn't recognized our list-of-tuples here - if info.get('type') == 'MultiIndex': + if info.get("type") == "MultiIndex": cols = MultiIndex.from_tuples(a.values) else: cols = Index(a.values) - names = info.get('names') + names = info.get("names") if names is not None: cols.set_names(names, inplace=True) if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index, name=getattr(index, 'name', None)) + cols_ = Index(index, name=getattr(index, "name", None)) else: values = a.cvalues.T - index_ = Index(index, name=getattr(index, 'name', None)) + index_ = Index(index, name=getattr(index, "name", None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -4209,8 +4473,9 @@ def read(self, where=None, columns=None, **kwargs): class AppendableSeriesTable(AppendableFrameTable): """ support the new appendable table formats """ - 
pandas_kind = 'series_table' - table_type = 'appendable_series' + + pandas_kind = "series_table" + table_type = "appendable_series" ndim = 2 obj_type = Series storage_obj_type = DataFrame @@ -4225,11 +4490,10 @@ def get_object(self, obj): def write(self, obj, data_columns=None, **kwargs): """ we are going to write this as a frame table """ if not isinstance(obj, DataFrame): - name = obj.name or 'values' + name = obj.name or "values" obj = DataFrame({name: obj}, index=obj.index) obj.columns = [name] - return super().write(obj=obj, data_columns=obj.columns.tolist(), - **kwargs) + return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) def read(self, columns=None, **kwargs): @@ -4245,19 +4509,20 @@ def read(self, columns=None, **kwargs): s = s.iloc[:, 0] # remove the default name - if s.name == 'values': + if s.name == "values": s.name = None return s class AppendableMultiSeriesTable(AppendableSeriesTable): """ support the new appendable table formats """ - pandas_kind = 'series_table' - table_type = 'appendable_multiseries' + + pandas_kind = "series_table" + table_type = "appendable_multiseries" def write(self, obj, **kwargs): """ we are going to write this as a frame table """ - name = obj.name or 'values' + name = obj.name or "values" obj, self.levels = self.validate_multiindex(obj) cols = list(self.levels) cols.append(name) @@ -4267,8 +4532,9 @@ def write(self, obj, **kwargs): class GenericTable(AppendableFrameTable): """ a table that read/writes the generic pytables table format """ - pandas_kind = 'frame_table' - table_type = 'generic_table' + + pandas_kind = "frame_table" + table_type = "generic_table" ndim = 2 obj_type = DataFrame @@ -4278,7 +4544,7 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group, 'table', None) or self.group + return getattr(self.group, "table", None) or self.group def get_attrs(self): """ retrieve our attributes """ @@ -4286,10 +4552,10 @@ def get_attrs(self): self.nan_rep = None self.levels = [] - self.index_axes = [a.infer(self) - for a in self.indexables if a.is_an_indexable] - self.values_axes = [a.infer(self) - for a in self.indexables if not a.is_an_indexable] + self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] + self.values_axes = [ + a.infer(self) for a in self.indexables if not a.is_an_indexable + ] self.data_columns = [a.name for a in self.values_axes] @property @@ -4300,12 +4566,13 @@ def indexables(self): d = self.description # the index columns is just a simple index - self._indexables = [GenericIndexCol(name='index', axis=0)] + self._indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): dc = GenericDataIndexableCol( - name=n, pos=i, values=[n], version=self.version) + name=n, pos=i, values=[n], version=self.version + ) self._indexables.append(dc) return self._indexables @@ -4317,14 +4584,15 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): """ a frame with a multi-index """ - table_type = 'appendable_multiframe' + + table_type = "appendable_multiframe" obj_type = DataFrame ndim = 2 _re_levels = re.compile(r"^level_\d+$") @property def table_type_short(self): - return 'appendable_multi' + return "appendable_multi" def write(self, obj, data_columns=None, **kwargs): if data_columns is None: @@ -4343,9 +4611,9 @@ def read(self, **kwargs): df = df.set_index(self.levels) # remove names for 'level_%d' - df.index = df.index.set_names([ - None if self._re_levels.search(l) else l for l in df.index.names - ]) + 
df.index = df.index.set_names( + [None if self._re_levels.search(l) else l for l in df.index.names] + ) return df @@ -4379,6 +4647,7 @@ def _get_info(info, name): idx = info[name] = dict() return idx + # tz to/from coercion @@ -4404,146 +4673,172 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ if tz is not None: - name = getattr(values, 'name', None) + name = getattr(values, "name", None) values = values.ravel() tz = timezones.get_timezone(_ensure_decoded(tz)) values = DatetimeIndex(values, name=name) if values.tz is None: - values = values.tz_localize('UTC').tz_convert(tz) + values = values.tz_localize("UTC").tz_convert(tz) if preserve_UTC: - if tz == 'UTC': + if tz == "UTC": values = list(values) elif coerce: - values = np.asarray(values, dtype='M8[ns]') + values = np.asarray(values, dtype="M8[ns]") return values -def _convert_index(index, encoding=None, errors='strict', format_type=None): - index_name = getattr(index, 'name', None) +def _convert_index(index, encoding=None, errors="strict", format_type=None): + index_name = getattr(index, "name", None) if isinstance(index, DatetimeIndex): converted = index.asi8 - return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - tz=getattr(index, 'tz', None), - index_name=index_name) + return IndexCol( + converted, + "datetime64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + tz=getattr(index, "tz", None), + index_name=index_name, + ) elif isinstance(index, TimedeltaIndex): converted = index.asi8 - return IndexCol(converted, 'timedelta64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol( + converted, + "timedelta64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + index_name=index_name, + ) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._ndarray_values, 'integer', atom, - freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol( + index._ndarray_values, + "integer", + atom, + freq=getattr(index, "freq", None), + index_name=index_name, + ) if isinstance(index, MultiIndex): - raise TypeError('MultiIndex not supported here!') + raise TypeError("MultiIndex not supported here!") inferred_type = lib.infer_dtype(index, skipna=False) values = np.asarray(index) - if inferred_type == 'datetime64': - converted = values.view('i8') - return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - tz=getattr(index, 'tz', None), - index_name=index_name) - elif inferred_type == 'timedelta64': - converted = values.view('i8') - return IndexCol(converted, 'timedelta64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - index_name=index_name) - elif inferred_type == 'datetime': - converted = np.asarray([(time.mktime(v.timetuple()) + - v.microsecond / 1E6) for v in values], - dtype=np.float64) - return IndexCol(converted, 'datetime', _tables().Time64Col(), - index_name=index_name) - elif inferred_type == 'date': - converted = np.asarray([v.toordinal() for v in values], - dtype=np.int32) - return IndexCol(converted, 'date', _tables().Time32Col(), - index_name=index_name) - elif inferred_type == 'string': + if inferred_type == "datetime64": + converted = values.view("i8") + return IndexCol( + converted, + "datetime64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + 
tz=getattr(index, "tz", None), + index_name=index_name, + ) + elif inferred_type == "timedelta64": + converted = values.view("i8") + return IndexCol( + converted, + "timedelta64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + index_name=index_name, + ) + elif inferred_type == "datetime": + converted = np.asarray( + [(time.mktime(v.timetuple()) + v.microsecond / 1e6) for v in values], + dtype=np.float64, + ) + return IndexCol( + converted, "datetime", _tables().Time64Col(), index_name=index_name + ) + elif inferred_type == "date": + converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) + return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name) + elif inferred_type == "string": # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( - converted, 'string', _tables().StringCol(itemsize), - itemsize=itemsize, index_name=index_name + converted, + "string", + _tables().StringCol(itemsize), + itemsize=itemsize, + index_name=index_name, ) - elif inferred_type == 'unicode': - if format_type == 'fixed': + elif inferred_type == "unicode": + if format_type == "fixed": atom = _tables().ObjectAtom() - return IndexCol(np.asarray(values, dtype='O'), 'object', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype="O"), "object", atom, index_name=index_name + ) raise TypeError( - "[unicode] is not supported as a in index type for [{0}] formats" - .format(format_type) + "[unicode] is not supported as a in index type for [{0}] formats".format( + format_type + ) ) - elif inferred_type == 'integer': + elif inferred_type == "integer": # take a guess for now, hope the values fit atom = _tables().Int64Col() - return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom, - index_name=index_name) - elif inferred_type == 'floating': + return IndexCol( + np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name + ) + elif inferred_type == "floating": atom = _tables().Float64Col() - return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name + ) else: # pragma: no cover atom = _tables().ObjectAtom() - return IndexCol(np.asarray(values, dtype='O'), 'object', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype="O"), "object", atom, index_name=index_name + ) -def _unconvert_index(data, kind, encoding=None, errors='strict'): +def _unconvert_index(data, kind, encoding=None, errors="strict"): kind = _ensure_decoded(kind) - if kind == 'datetime64': + if kind == "datetime64": index = DatetimeIndex(data) - elif kind == 'timedelta64': + elif kind == "timedelta64": index = TimedeltaIndex(data) - elif kind == 'datetime': - index = np.asarray([datetime.fromtimestamp(v) for v in data], - dtype=object) - elif kind == 'date': + elif kind == "datetime": + index = np.asarray([datetime.fromtimestamp(v) for v in data], dtype=object) + elif kind == "date": try: - index = np.asarray( - [date.fromordinal(v) for v in data], dtype=object) + index = np.asarray([date.fromordinal(v) for v in data], dtype=object) except (ValueError): - index = np.asarray( - [date.fromtimestamp(v) for v in data], dtype=object) - elif kind in ('integer', 'float'): + index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) + elif kind in ("integer", 
"float"): index = np.asarray(data) - elif kind in ('string'): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, - errors=errors) - elif kind == 'object': + elif kind in ("string"): + index = _unconvert_string_array( + data, nan_rep=None, encoding=encoding, errors=errors + ) + elif kind == "object": index = np.asarray(data[0]) else: # pragma: no cover - raise ValueError('unrecognized index type {kind}'.format(kind=kind)) + raise ValueError("unrecognized index type {kind}".format(kind=kind)) return index -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, - errors='strict'): +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"): kind = _ensure_decoded(kind) - if kind == 'datetime': + if kind == "datetime": index = to_datetime(data) - elif kind in ('integer'): + elif kind in ("integer"): index = np.asarray(data, dtype=object) - elif kind in ('string'): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, - errors=errors) + elif kind in ("string"): + index = _unconvert_string_array( + data, nan_rep=None, encoding=encoding, errors=errors + ) else: # pragma: no cover - raise ValueError('unrecognized index type {kind}'.format(kind=kind)) + raise ValueError("unrecognized index type {kind}".format(kind=kind)) return index @@ -4566,8 +4861,9 @@ def _convert_string_array(data, encoding, errors, itemsize=None): # encode if needed if encoding is not None and len(data): - data = Series(data.ravel()).str.encode( - encoding, errors).values.reshape(data.shape) + data = ( + Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) + ) # create the sized dtype if itemsize is None: @@ -4578,8 +4874,7 @@ def _convert_string_array(data, encoding, errors, itemsize=None): return data -def _unconvert_string_array(data, nan_rep=None, encoding=None, - errors='strict'): +def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): """ inverse of _convert_string_array @@ -4612,7 +4907,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, data = data.astype(dtype, copy=False).astype(object, copy=False) if nan_rep is None: - nan_rep = 'nan' + nan_rep = "nan" data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) @@ -4628,20 +4923,19 @@ def _maybe_convert(values, val_kind, encoding, errors): def _get_converter(kind, encoding, errors): kind = _ensure_decoded(kind) - if kind == 'datetime64': - return lambda x: np.asarray(x, dtype='M8[ns]') - elif kind == 'datetime': + if kind == "datetime64": + return lambda x: np.asarray(x, dtype="M8[ns]") + elif kind == "datetime": return lambda x: to_datetime(x, cache=True).to_pydatetime() - elif kind == 'string': - return lambda x: _unconvert_string_array(x, encoding=encoding, - errors=errors) + elif kind == "string": + return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors) else: # pragma: no cover - raise ValueError('invalid kind {kind}'.format(kind=kind)) + raise ValueError("invalid kind {kind}".format(kind=kind)) def _need_convert(kind): kind = _ensure_decoded(kind) - if kind in ('datetime', 'datetime64', 'string'): + if kind in ("datetime", "datetime64", "string"): return True return False @@ -4674,7 +4968,7 @@ def __init__(self, table, where=None, start=None, stop=None): # see if we have a passed coordinate like try: inferred = lib.infer_dtype(where, skipna=False) - if inferred == 'integer' or inferred == 'boolean': + if inferred == "integer" or inferred == "boolean": where = 
np.asarray(where) if where.dtype == np.bool_: start, stop = self.start, self.stop @@ -4684,13 +4978,11 @@ def __init__(self, table, where=None, start=None, stop=None): stop = self.table.nrows self.coordinates = np.arange(start, stop)[where] elif issubclass(where.dtype.type, np.integer): - if ((self.start is not None and - (where < self.start).any()) or - (self.stop is not None and - (where >= self.stop).any())): + if (self.start is not None and (where < self.start).any()) or ( + self.stop is not None and (where >= self.stop).any() + ): raise ValueError( - "where must have index locations >= start and " - "< stop" + "where must have index locations >= start and " "< stop" ) self.coordinates = where @@ -4723,8 +5015,9 @@ def generate(self, where): "reference to\n" " an axis (e.g. 'index' or 'columns'), or a " "data_column\n" - " The currently defined references are: {1}\n" - .format(where, ','.join(q.keys())) + " The currently defined references are: {1}\n".format( + where, ",".join(q.keys()) + ) ) def select(self): @@ -4732,9 +5025,9 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.read_where(self.condition.format(), - start=self.start, - stop=self.stop) + return self.table.table.read_where( + self.condition.format(), start=self.start, stop=self.stop + ) elif self.coordinates is not None: return self.table.table.read_coordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4755,9 +5048,9 @@ def select_coords(self): stop += nrows if self.condition is not None: - return self.table.table.get_where_list(self.condition.format(), - start=start, stop=stop, - sort=True) + return self.table.table.get_where_list( + self.condition.format(), start=start, stop=stop, sort=True + ) elif self.coordinates is not None: return self.coordinates diff --git a/pandas/io/s3.py b/pandas/io/s3.py index d784e8d473aacf..0a7c082fec51c2 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -4,8 +4,7 @@ from pandas.compat._optional import import_optional_dependency s3fs = import_optional_dependency( - "s3fs", - extra="The s3fs package is required to handle s3 files." + "s3fs", extra="The s3fs package is required to handle s3 files." ) @@ -15,12 +14,13 @@ def _strip_schema(url): return result.netloc + result.path -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): from botocore.exceptions import NoCredentialsError if mode is None: - mode = 'rb' + mode = "rb" fs = s3fs.S3FileSystem(anon=False) try: @@ -33,6 +33,5 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # A NoCredentialsError is raised if you don't have creds # for that bucket. fs = s3fs.S3FileSystem(anon=True) - filepath_or_buffer = fs.open( - _strip_schema(filepath_or_buffer), mode) + filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) return filepath_or_buffer, None, compression, True diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 634bdfa93ba2e6..7cc9dc11a8cccb 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -65,9 +65,17 @@ class SAS7BDATReader(BaseIterator): bytes. 
""" - def __init__(self, path_or_buf, index=None, convert_dates=True, - blank_missing=True, chunksize=None, encoding=None, - convert_text=True, convert_header_text=True): + def __init__( + self, + path_or_buf, + index=None, + convert_dates=True, + blank_missing=True, + chunksize=None, + encoding=None, + convert_text=True, + convert_header_text=True, + ): self.index = index self.convert_dates = convert_dates @@ -96,7 +104,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, str): - self._path_or_buf = open(self._path_or_buf, 'rb') + self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf self._get_properties() @@ -113,7 +121,7 @@ def column_data_offsets(self): def column_types(self): """Returns a numpy character array of the column types: s (string) or d (double)""" - return np.asarray(self._column_types, dtype=np.dtype('S1')) + return np.asarray(self._column_types, dtype=np.dtype("S1")) def close(self): try: @@ -126,7 +134,7 @@ def _get_properties(self): # Check magic number self._path_or_buf.seek(0) self._cached_page = self._path_or_buf.read(288) - if self._cached_page[0:len(const.magic)] != const.magic: + if self._cached_page[0 : len(const.magic)] != const.magic: self.close() raise ValueError("magic number mismatch (not a SAS file?)") @@ -150,9 +158,8 @@ def _get_properties(self): total_align = align1 + align2 # Get endianness information - buf = self._read_bytes(const.endianness_offset, - const.endianness_length) - if buf == b'\x01': + buf = self._read_bytes(const.endianness_offset, const.endianness_length) + if buf == b"\x01": self.byte_order = "<" else: self.byte_order = ">" @@ -166,36 +173,39 @@ def _get_properties(self): # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) - if buf == b'1': + if buf == b"1": self.platform = "unix" - elif buf == b'2': + elif buf == b"2": self.platform = "windows" else: self.platform = "unknown" buf = self._read_bytes(const.dataset_offset, const.dataset_length) - self.name = buf.rstrip(b'\x00 ') + self.name = buf.rstrip(b"\x00 ") if self.convert_header_text: - self.name = self.name.decode( - self.encoding or self.default_encoding) + self.name = self.name.decode(self.encoding or self.default_encoding) buf = self._read_bytes(const.file_type_offset, const.file_type_length) - self.file_type = buf.rstrip(b'\x00 ') + self.file_type = buf.rstrip(b"\x00 ") if self.convert_header_text: self.file_type = self.file_type.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) # Timestamp is epoch 01/01/1960 epoch = datetime(1960, 1, 1) - x = self._read_float(const.date_created_offset + align1, - const.date_created_length) - self.date_created = epoch + pd.to_timedelta(x, unit='s') - x = self._read_float(const.date_modified_offset + align1, - const.date_modified_length) - self.date_modified = epoch + pd.to_timedelta(x, unit='s') - - self.header_length = self._read_int(const.header_size_offset + align1, - const.header_size_length) + x = self._read_float( + const.date_created_offset + align1, const.date_created_length + ) + self.date_created = epoch + pd.to_timedelta(x, unit="s") + x = self._read_float( + const.date_modified_offset + align1, const.date_modified_length + ) + self.date_modified = epoch + pd.to_timedelta(x, unit="s") + + self.header_length = self._read_int( + const.header_size_offset + align1, const.header_size_length + ) # Read the 
rest of the header into cached_page. buf = self._path_or_buf.read(self.header_length - 288) @@ -204,44 +214,53 @@ def _get_properties(self): self.close() raise ValueError("The SAS7BDAT file appears to be truncated.") - self._page_length = self._read_int(const.page_size_offset + align1, - const.page_size_length) - self._page_count = self._read_int(const.page_count_offset + align1, - const.page_count_length) - - buf = self._read_bytes(const.sas_release_offset + total_align, - const.sas_release_length) - self.sas_release = buf.rstrip(b'\x00 ') + self._page_length = self._read_int( + const.page_size_offset + align1, const.page_size_length + ) + self._page_count = self._read_int( + const.page_count_offset + align1, const.page_count_length + ) + + buf = self._read_bytes( + const.sas_release_offset + total_align, const.sas_release_length + ) + self.sas_release = buf.rstrip(b"\x00 ") if self.convert_header_text: self.sas_release = self.sas_release.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.sas_server_type_offset + total_align, - const.sas_server_type_length) - self.server_type = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.sas_server_type_offset + total_align, const.sas_server_type_length + ) + self.server_type = buf.rstrip(b"\x00 ") if self.convert_header_text: self.server_type = self.server_type.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.os_version_number_offset + total_align, - const.os_version_number_length) - self.os_version = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.os_version_number_offset + total_align, const.os_version_number_length + ) + self.os_version = buf.rstrip(b"\x00 ") if self.convert_header_text: self.os_version = self.os_version.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.os_name_offset + total_align, - const.os_name_length) - buf = buf.rstrip(b'\x00 ') + buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length) + buf = buf.rstrip(b"\x00 ") if len(buf) > 0: self.os_name = buf.decode(self.encoding or self.default_encoding) else: - buf = self._read_bytes(const.os_maker_offset + total_align, - const.os_maker_length) - self.os_name = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.os_maker_offset + total_align, const.os_maker_length + ) + self.os_name = buf.rstrip(b"\x00 ") if self.convert_header_text: self.os_name = self.os_name.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) def __next__(self): da = self.read(nrows=self.chunksize or 1) @@ -281,7 +300,7 @@ def _read_bytes(self, offset, length): if offset + length > len(self._cached_page): self.close() raise ValueError("The cached page is too small.") - return self._cached_page[offset:offset + length] + return self._cached_page[offset : offset + length] def _parse_metadata(self): done = False @@ -291,8 +310,7 @@ def _parse_metadata(self): break if len(self._cached_page) != self._page_length: self.close() - raise ValueError( - "Failed to read a meta data page from the SAS file.") + raise ValueError("Failed to read a meta data page from the SAS file.") done = self._process_page_meta() def _process_page_meta(self): @@ -302,43 +320,45 @@ def _process_page_meta(self): self._process_page_metadata() is_data_page = self._current_page_type & const.page_data_type is_mix_page = self._current_page_type 
in const.page_mix_types - return (is_data_page or is_mix_page - or self._current_page_data_subheader_pointers != []) + return ( + is_data_page + or is_mix_page + or self._current_page_data_subheader_pointers != [] + ) def _read_page_header(self): bit_offset = self._page_bit_offset tx = const.page_type_offset + bit_offset self._current_page_type = self._read_int(tx, const.page_type_length) tx = const.block_count_offset + bit_offset - self._current_page_block_count = self._read_int( - tx, const.block_count_length) + self._current_page_block_count = self._read_int(tx, const.block_count_length) tx = const.subheader_count_offset + bit_offset - self._current_page_subheaders_count = ( - self._read_int(tx, const.subheader_count_length)) + self._current_page_subheaders_count = self._read_int( + tx, const.subheader_count_length + ) def _process_page_metadata(self): bit_offset = self._page_bit_offset for i in range(self._current_page_subheaders_count): pointer = self._process_subheader_pointers( - const.subheader_pointers_offset + bit_offset, i) + const.subheader_pointers_offset + bit_offset, i + ) if pointer.length == 0: continue if pointer.compression == const.truncated_subheader_id: continue - subheader_signature = self._read_subheader_signature( - pointer.offset) - subheader_index = ( - self._get_subheader_index(subheader_signature, - pointer.compression, pointer.ptype)) + subheader_signature = self._read_subheader_signature(pointer.offset) + subheader_index = self._get_subheader_index( + subheader_signature, pointer.compression, pointer.ptype + ) self._process_subheader(subheader_index, pointer) def _get_subheader_index(self, signature, compression, ptype): index = const.subheader_signature_to_index.get(signature) if index is None: - f1 = ((compression == const.compressed_subheader_id) or - (compression == 0)) - f2 = (ptype == const.compressed_subheader_type) + f1 = (compression == const.compressed_subheader_id) or (compression == 0) + f2 = ptype == const.compressed_subheader_type if (self.compression != "") and f1 and f2: index = const.SASIndex.data_subheader_index else: @@ -349,8 +369,7 @@ def _get_subheader_index(self, signature, compression, ptype): def _process_subheader_pointers(self, offset, subheader_pointer_index): subheader_pointer_length = self._subheader_pointer_length - total_offset = (offset + - subheader_pointer_length * subheader_pointer_index) + total_offset = offset + subheader_pointer_length * subheader_pointer_index subheader_offset = self._read_int(total_offset, self._int_length) total_offset += self._int_length @@ -416,13 +435,17 @@ def _process_rowsize_subheader(self, offset, length): lcp_offset += 378 self.row_length = self._read_int( - offset + const.row_length_offset_multiplier * int_len, int_len) + offset + const.row_length_offset_multiplier * int_len, int_len + ) self.row_count = self._read_int( - offset + const.row_count_offset_multiplier * int_len, int_len) + offset + const.row_count_offset_multiplier * int_len, int_len + ) self.col_count_p1 = self._read_int( - offset + const.col_count_p1_multiplier * int_len, int_len) + offset + const.col_count_p1_multiplier * int_len, int_len + ) self.col_count_p2 = self._read_int( - offset + const.col_count_p2_multiplier * int_len, int_len) + offset + const.col_count_p2_multiplier * int_len, int_len + ) mx = const.row_count_on_mix_page_offset_multiplier * int_len self._mix_page_row_count = self._read_int(offset + mx, int_len) self._lcs = self._read_int(lcs_offset, 2) @@ -432,13 +455,15 @@ def _process_columnsize_subheader(self, 
offset, length): int_len = self._int_length offset += int_len self.column_count = self._read_int(offset, int_len) - if (self.col_count_p1 + self.col_count_p2 != - self.column_count): + if self.col_count_p1 + self.col_count_p2 != self.column_count: print( "Warning: column count mismatch ({p1} + {p2} != " "{column_count})\n".format( - p1=self.col_count_p1, p2=self.col_count_p2, - column_count=self.column_count)) + p1=self.col_count_p1, + p2=self.col_count_p2, + column_count=self.column_count, + ) + ) # Unknown purpose def _process_subheader_counts(self, offset, length): @@ -476,60 +501,74 @@ def _process_columntext_subheader(self, offset, length): if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcp) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] elif compression_literal == const.rle_compression: offset1 = offset + 40 if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcp) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] elif self._lcs > 0: self._lcp = 0 offset1 = offset + 16 if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcs) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] if self.convert_header_text: if hasattr(self, "creator_proc"): self.creator_proc = self.creator_proc.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) def _process_columnname_subheader(self, offset, length): int_len = self._int_length offset += int_len column_name_pointers_count = (length - 2 * int_len - 12) // 8 for i in range(column_name_pointers_count): - text_subheader = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_text_subheader_offset - col_name_offset = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_offset_offset - col_name_length = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_length_offset + text_subheader = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_text_subheader_offset + ) + col_name_offset = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_offset_offset + ) + col_name_length = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_length_offset + ) idx = self._read_int( - text_subheader, const.column_name_text_subheader_length) + text_subheader, const.column_name_text_subheader_length + ) col_offset = self._read_int( - col_name_offset, const.column_name_offset_length) - col_len = self._read_int( - col_name_length, const.column_name_length_length) + col_name_offset, const.column_name_offset_length + ) + col_len = self._read_int(col_name_length, const.column_name_length_length) name_str = self.column_names_strings[idx] - self.column_names.append(name_str[col_offset:col_offset + col_len]) + self.column_names.append(name_str[col_offset : col_offset + col_len]) def _process_columnattributes_subheader(self, offset, length): int_len = self._int_length - column_attributes_vectors_count = ( - length - 2 * int_len - 12) // (int_len + 8) + column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8) for i in range(column_attributes_vectors_count): - col_data_offset = (offset + int_len + - const.column_data_offset_offset + - i * (int_len + 8)) - col_data_len = (offset + 2 * int_len + - const.column_data_length_offset + - i * (int_len + 8)) - col_types = (offset + 2 * int_len + - const.column_type_offset + i * (int_len + 8)) + col_data_offset 
= ( + offset + int_len + const.column_data_offset_offset + i * (int_len + 8) + ) + col_data_len = ( + offset + + 2 * int_len + + const.column_data_length_offset + + i * (int_len + 8) + ) + col_types = ( + offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) + ) x = self._read_int(col_data_offset, int_len) self._column_data_offsets.append(x) @@ -538,7 +577,7 @@ def _process_columnattributes_subheader(self, offset, length): self._column_data_lengths.append(x) x = self._read_int(col_types, const.column_type_length) - self._column_types.append(b'd' if x == 1 else b's') + self._column_types.append(b"d" if x == 1 else b"s") def _process_columnlist_subheader(self, offset, length): # unknown purpose @@ -547,47 +586,38 @@ def _process_columnlist_subheader(self, offset, length): def _process_format_subheader(self, offset, length): int_len = self._int_length text_subheader_format = ( - offset + - const.column_format_text_subheader_index_offset + - 3 * int_len) - col_format_offset = (offset + - const.column_format_offset_offset + - 3 * int_len) - col_format_len = (offset + - const.column_format_length_offset + - 3 * int_len) + offset + const.column_format_text_subheader_index_offset + 3 * int_len + ) + col_format_offset = offset + const.column_format_offset_offset + 3 * int_len + col_format_len = offset + const.column_format_length_offset + 3 * int_len text_subheader_label = ( - offset + - const.column_label_text_subheader_index_offset + - 3 * int_len) - col_label_offset = (offset + - const.column_label_offset_offset + - 3 * int_len) + offset + const.column_label_text_subheader_index_offset + 3 * int_len + ) + col_label_offset = offset + const.column_label_offset_offset + 3 * int_len col_label_len = offset + const.column_label_length_offset + 3 * int_len - x = self._read_int(text_subheader_format, - const.column_format_text_subheader_index_length) + x = self._read_int( + text_subheader_format, const.column_format_text_subheader_index_length + ) format_idx = min(x, len(self.column_names_strings) - 1) format_start = self._read_int( - col_format_offset, const.column_format_offset_length) - format_len = self._read_int( - col_format_len, const.column_format_length_length) + col_format_offset, const.column_format_offset_length + ) + format_len = self._read_int(col_format_len, const.column_format_length_length) label_idx = self._read_int( - text_subheader_label, - const.column_label_text_subheader_index_length) + text_subheader_label, const.column_label_text_subheader_index_length + ) label_idx = min(label_idx, len(self.column_names_strings) - 1) - label_start = self._read_int( - col_label_offset, const.column_label_offset_length) - label_len = self._read_int(col_label_len, - const.column_label_length_length) + label_start = self._read_int(col_label_offset, const.column_label_offset_length) + label_len = self._read_int(col_label_len, const.column_label_length_length) label_names = self.column_names_strings[label_idx] - column_label = label_names[label_start: label_start + label_len] + column_label = label_names[label_start : label_start + label_len] format_names = self.column_names_strings[format_idx] - column_format = format_names[format_start: format_start + format_len] + column_format = format_names[format_start : format_start + format_len] current_column_number = len(self.columns) col = _column() @@ -619,8 +649,8 @@ def read(self, nrows=None): if nrows > m: nrows = m - nd = self._column_types.count(b'd') - ns = self._column_types.count(b's') + nd = self._column_types.count(b"d") + ns = 
self._column_types.count(b"s") self._string_chunk = np.empty((ns, nrows), dtype=np.object) self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) @@ -642,10 +672,8 @@ def _read_next_page(self): return True elif len(self._cached_page) != self._page_length: self.close() - msg = ("failed to read complete page from file " - "(read {:d} of {:d} bytes)") - raise ValueError(msg.format(len(self._cached_page), - self._page_length)) + msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)" + raise ValueError(msg.format(len(self._cached_page), self._page_length)) self._read_page_header() page_type = self._current_page_type @@ -671,32 +699,34 @@ def _chunk_to_dataframe(self): name = self.column_names[j] - if self._column_types[j] == b'd': - rslt[name] = self._byte_chunk[jb, :].view( - dtype=self.byte_order + 'd') + if self._column_types[j] == b"d": + rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: unit = None if self.column_formats[j] in const.sas_date_formats: - unit = 'd' + unit = "d" elif self.column_formats[j] in const.sas_datetime_formats: - unit = 's' + unit = "s" if unit: - rslt[name] = pd.to_datetime(rslt[name], unit=unit, - origin="1960-01-01") + rslt[name] = pd.to_datetime( + rslt[name], unit=unit, origin="1960-01-01" + ) jb += 1 - elif self._column_types[j] == b's': + elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] if self.convert_text and (self.encoding is not None): rslt[name] = rslt[name].str.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) if self.blank_missing: ii = rslt[name].str.len() == 0 rslt.loc[ii, name] = np.nan js += 1 else: self.close() - raise ValueError("unknown column type {type}".format( - type=self._column_types[j])) + raise ValueError( + "unknown column type {type}".format(type=self._column_types[j]) + ) return rslt diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index c37a26cd62ad25..23b23a1bf09c0d 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -1,13 +1,15 @@ -magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" + - b"\x00\x00\x00\x00\xc2\xea\x81\x60" + - b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + - b"\x09\xc7\x31\x8c\x18\x1f\x10\x11") +magic = ( + b"\x00\x00\x00\x00\x00\x00\x00\x00" + + b"\x00\x00\x00\x00\xc2\xea\x81\x60" + + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11" +) -align_1_checker_value = b'3' +align_1_checker_value = b"3" align_1_offset = 32 align_1_length = 1 align_1_value = 4 -u64_byte_checker_value = b'3' +u64_byte_checker_value = b"3" align_2_offset = 35 align_2_length = 1 align_2_value = 4 @@ -91,15 +93,22 @@ column_label_offset_length = 2 column_label_length_offset = 32 column_label_length_length = 2 -rle_compression = b'SASYZCRL' -rdc_compression = b'SASYZCR2' +rle_compression = b"SASYZCRL" +rdc_compression = b"SASYZCR2" compression_literals = [rle_compression, rdc_compression] # Incomplete list of encodings, using SAS nomenclature: # http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm -encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2", - 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"} +encoding_names = { + 29: "latin1", + 20: "utf-8", + 33: "cyrillic", + 60: "wlatin2", + 61: "wcyrillic", + 62: "wlatin1", + 90: "ebcdic870", +} class SASIndex: @@ -144,28 +153,101 @@ class SASIndex: b"\xFE\xFF\xFF\xFF": 
SASIndex.column_list_index, b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index, b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index} + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index, +} # List of frequently used SAS date and datetime formats # http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm # https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java -sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN", - "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS", - "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR", - "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV", - "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD", - "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ", - "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC", - "YYQRD", "YYQRP", "YYQRS", "YYQRN", - "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC", - "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN", - "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB", - "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS", - "MINGUO") +sas_date_formats = ( + "DATE", + "DAY", + "DDMMYY", + "DOWNAME", + "JULDAY", + "JULIAN", + "MMDDYY", + "MMYY", + "MMYYC", + "MMYYD", + "MMYYP", + "MMYYS", + "MMYYN", + "MONNAME", + "MONTH", + "MONYY", + "QTR", + "QTRR", + "NENGO", + "WEEKDATE", + "WEEKDATX", + "WEEKDAY", + "WEEKV", + "WORDDATE", + "WORDDATX", + "YEAR", + "YYMM", + "YYMMC", + "YYMMD", + "YYMMP", + "YYMMS", + "YYMMN", + "YYMON", + "YYMMDD", + "YYQ", + "YYQC", + "YYQD", + "YYQP", + "YYQS", + "YYQN", + "YYQR", + "YYQRC", + "YYQRD", + "YYQRP", + "YYQRS", + "YYQRN", + "YYMMDDP", + "YYMMDDC", + "E8601DA", + "YYMMDDN", + "MMDDYYC", + "MMDDYYS", + "MMDDYYD", + "YYMMDDS", + "B8601DA", + "DDMMYYN", + "YYMMDDD", + "DDMMYYB", + "DDMMYYP", + "MMDDYYP", + "YYMMDDB", + "MMDDYYN", + "DDMMYYC", + "DDMMYYD", + "DDMMYYS", + "MINGUO", +) -sas_datetime_formats = ("DATETIME", "DTWKDATX", - "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX", - "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX", - "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX", - "DTYEAR", "TOD", "MDYAMPM") +sas_datetime_formats = ( + "DATETIME", + "DTWKDATX", + "B8601DN", + "B8601DT", + "B8601DX", + "B8601DZ", + "B8601LX", + "E8601DN", + "E8601DT", + "E8601DX", + "E8601DZ", + "E8601LX", + "DATEAMPM", + "DTDATE", + "DTMONYY", + "DTMONYY", + "DTWKDATX", + "DTYEAR", + "TOD", + "MDYAMPM", +) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 0dbea452230d60..34b93d72d0e296 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -21,17 +21,39 @@ from pandas.io.common import BaseIterator, get_filepath_or_buffer -_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" - "000000000000000001600000000") -_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label', - 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform', - 'nifl', 'nifd', 'npos', '_'] +_correct_line1 = ( + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" 
+ "000000000000000000000000000000 " +) +_correct_header1 = ( + "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" "000000000000000001600000000" +) +_correct_header2 = ( + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_obs_header = ( + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_fieldkeys = [ + "ntype", + "nhfun", + "field_length", + "nvar0", + "name", + "label", + "nform", + "nfl", + "num_decimals", + "nfj", + "nfill", + "niform", + "nifl", + "nifd", + "npos", + "_", +] _base_params_doc = """\ @@ -80,10 +102,12 @@ >>> for chunk in itr: >>> do_something(chunk) -""" % {"_base_params_doc": _base_params_doc, - "_format_params_doc": _format_params_doc, - "_params2_doc": _params2_doc, - "_iterator_doc": _iterator_doc} +""" % { + "_base_params_doc": _base_params_doc, + "_format_params_doc": _format_params_doc, + "_params2_doc": _params2_doc, + "_iterator_doc": _iterator_doc, +} _xport_reader_doc = """\ @@ -98,8 +122,10 @@ Contains information about the file fields : list Contains information about the variables in the file -""" % {"_base_params_doc": _base_params_doc, - "_params2_doc": _params2_doc} +""" % { + "_base_params_doc": _base_params_doc, + "_params2_doc": _params2_doc, +} _read_method_doc = """\ @@ -142,9 +168,9 @@ def _split_line(s, parts): out = {} start = 0 for name, length in parts: - out[name] = s[start:start + length].strip() + out[name] = s[start : start + length].strip() start += length - del out['_'] + del out["_"] return out @@ -158,10 +184,10 @@ def _handle_truncated_float_vec(vec, nbytes): # The R "foreign" library if nbytes != 8: - vec1 = np.zeros(len(vec), np.dtype('S8')) - dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes)) + vec1 = np.zeros(len(vec), np.dtype("S8")) + dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes)) vec2 = vec1.view(dtype=dtype) - vec2['f0'] = vec + vec2["f0"] = vec return vec2 return vec @@ -173,14 +199,14 @@ def _parse_float_vec(vec): native 8 byte floats. """ - dtype = np.dtype('>u4,>u4') + dtype = np.dtype(">u4,>u4") vec1 = vec.view(dtype=dtype) - xport1 = vec1['f0'] - xport2 = vec1['f1'] + xport1 = vec1["f0"] + xport2 = vec1["f1"] # Start by setting first half of ieee number to first half of IBM # number sans exponent - ieee1 = xport1 & 0x00ffffff + ieee1 = xport1 & 0x00FFFFFF # The fraction bit to the left of the binary point in the ieee # format was set and the number was shifted 0, 1, 2, or 3 @@ -203,7 +229,7 @@ def _parse_float_vec(vec): ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift))) # clear the 1 bit to the left of the binary point - ieee1 &= 0xffefffff + ieee1 &= 0xFFEFFFFF # set the exponent of the ieee number to be the actual exponent # plus the shift count + 1023. Or this into the first half of the @@ -212,14 +238,15 @@ def _parse_float_vec(vec): # incremented by 1 and the fraction bits left 4 positions to the # right of the radix point. 
(had to add >> 24 because C treats & # 0x7f as 0x7f000000 and Python doesn't) - ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) + - shift + 1023) << 20) | (xport1 & 0x80000000) + ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | ( + xport1 & 0x80000000 + ) - ieee = np.empty((len(ieee1),), dtype='>u4,>u4') - ieee['f0'] = ieee1 - ieee['f1'] = ieee2 - ieee = ieee.view(dtype='>f8') - ieee = ieee.astype('f8') + ieee = np.empty((len(ieee1),), dtype=">u4,>u4") + ieee["f0"] = ieee1 + ieee["f1"] = ieee2 + ieee = ieee.view(dtype=">f8") + ieee = ieee.astype("f8") return ieee @@ -227,8 +254,9 @@ def _parse_float_vec(vec): class XportReader(BaseIterator): __doc__ = _xport_reader_doc - def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', - chunksize=None): + def __init__( + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + ): self._encoding = encoding self._lines_read = 0 @@ -236,12 +264,15 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - (filepath_or_buffer, encoding, - compression, should_close) = get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding) + ( + filepath_or_buffer, + encoding, + compression, + should_close, + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, bytes)): - self.filepath_or_buffer = open(filepath_or_buffer, 'rb') + self.filepath_or_buffer = open(filepath_or_buffer, "rb") else: # Copy to BytesIO, and ensure no encoding contents = filepath_or_buffer.read() @@ -269,23 +300,22 @@ def _read_header(self): raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() - fif = [['prefix', 24], ['version', 8], ['OS', 8], - ['_', 24], ['created', 16]] + fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]] file_info = _split_line(line2, fif) - if file_info['prefix'] != "SAS SAS SASLIB": + if file_info["prefix"] != "SAS SAS SASLIB": self.close() raise ValueError("Header record has invalid prefix.") - file_info['created'] = _parse_date(file_info['created']) + file_info["created"] = _parse_date(file_info["created"]) self.file_info = file_info line3 = self._get_row() - file_info['modified'] = _parse_date(line3[:16]) + file_info["modified"] = _parse_date(line3[:16]) # read member header header1 = self._get_row() header2 = self._get_row() headflag1 = header1.startswith(_correct_header1) - headflag2 = (header2 == _correct_header2) + headflag2 = header2 == _correct_header2 if not (headflag1 and headflag2): self.close() raise ValueError("Member header not found") @@ -293,17 +323,24 @@ def _read_header(self): fieldnamelength = int(header1[-5:-2]) # member info - mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8], - ['version', 8], ['OS', 8], ['_', 24], ['created', 16]] + mem = [ + ["prefix", 8], + ["set_name", 8], + ["sasdata", 8], + ["version", 8], + ["OS", 8], + ["_", 24], + ["created", 16], + ] member_info = _split_line(self._get_row(), mem) - mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]] + mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]] member_info.update(_split_line(self._get_row(), mem)) - member_info['modified'] = _parse_date(member_info['modified']) - member_info['created'] = _parse_date(member_info['created']) + member_info["modified"] = _parse_date(member_info["modified"]) + member_info["created"] = _parse_date(member_info["created"]) self.member_info = member_info # read field 
names - types = {1: 'numeric', 2: 'char'} + types = {1: "numeric", 2: "char"} fieldcount = int(self._get_row()[54:58]) datalength = fieldnamelength * fieldcount # round up to nearest 80 @@ -314,19 +351,21 @@ def _read_header(self): obs_length = 0 while len(fielddata) >= fieldnamelength: # pull data for one field - field, fielddata = (fielddata[:fieldnamelength], - fielddata[fieldnamelength:]) + field, fielddata = ( + fielddata[:fieldnamelength], + fielddata[fieldnamelength:], + ) # rest at end gets ignored, so if field is short, pad out # to match struct pattern below field = field.ljust(140) - fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field) + fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field) field = dict(zip(_fieldkeys, fieldstruct)) - del field['_'] - field['ntype'] = types[field['ntype']] - fl = field['field_length'] - if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): + del field["_"] + field["ntype"] = types[field["ntype"]] + fl = field["field_length"] + if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): self.close() msg = "Floating field width {0} is not between 2 and 8." raise TypeError(msg.format(fl)) @@ -337,7 +376,7 @@ def _read_header(self): except AttributeError: pass - obs_length += field['field_length'] + obs_length += field["field_length"] fields += [field] header = self._get_row() @@ -350,11 +389,13 @@ def _read_header(self): self.record_start = self.filepath_or_buffer.tell() self.nobs = self._record_count() - self.columns = [x['name'].decode() for x in self.fields] + self.columns = [x["name"].decode() for x in self.fields] # Setup the dtype. - dtypel = [('s' + str(i), "S" + str(field['field_length'])) - for i, field in enumerate(self.fields)] + dtypel = [ + ("s" + str(i), "S" + str(field["field_length"])) + for i, field in enumerate(self.fields) + ] dtype = np.dtype(dtypel) self._dtype = dtype @@ -372,8 +413,7 @@ def _record_count(self): """ self.filepath_or_buffer.seek(0, 2) - total_records_length = (self.filepath_or_buffer.tell() - - self.record_start) + total_records_length = self.filepath_or_buffer.tell() - self.record_start if total_records_length % 80 != 0: warnings.warn("xport file may be corrupted") @@ -416,10 +456,13 @@ def get_chunk(self, size=None): return self.read(nrows=size) def _missing_double(self, vec): - v = vec.view(dtype='u1,u1,u2,u4') - miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0) - miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) | - (v['f0'] == 0x5f) | (v['f0'] == 0x2e)) + v = vec.view(dtype="u1,u1,u2,u4") + miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0) + miss1 = ( + ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A)) + | (v["f0"] == 0x5F) + | (v["f0"] == 0x2E) + ) miss &= miss1 return miss @@ -439,15 +482,14 @@ def read(self, nrows=None): df = pd.DataFrame(index=range(read_lines)) for j, x in enumerate(self.columns): - vec = data['s%d' % j] - ntype = self.fields[j]['ntype'] + vec = data["s%d" % j] + ntype = self.fields[j]["ntype"] if ntype == "numeric": - vec = _handle_truncated_float_vec( - vec, self.fields[j]['field_length']) + vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) miss = self._missing_double(vec) v = _parse_float_vec(vec) v[miss] = np.nan - elif self.fields[j]['ntype'] == 'char': + elif self.fields[j]["ntype"] == "char": v = [y.rstrip() for y in vec] if self._encoding is not None: diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 0726e17e3bbabd..680425f421eec5 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ 
-4,8 +4,14 @@ from pandas.io.common import _stringify_path -def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, - chunksize=None, iterator=False): +def read_sas( + filepath_or_buffer, + format=None, + index=None, + encoding=None, + chunksize=None, + iterator=False, +): """ Read SAS files stored as either XPORT or SAS7BDAT format files. @@ -31,9 +37,11 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, or XportReader """ if format is None: - buffer_error_msg = ("If this is a buffer object rather " - "than a string name, you must specify " - "a format string") + buffer_error_msg = ( + "If this is a buffer object rather " + "than a string name, you must specify " + "a format string" + ) filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) @@ -45,18 +53,20 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, else: raise ValueError("unable to infer format of SAS file") - if format.lower() == 'xport': + if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader - reader = XportReader(filepath_or_buffer, index=index, - encoding=encoding, - chunksize=chunksize) - elif format.lower() == 'sas7bdat': + + reader = XportReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) + elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader - reader = SAS7BDATReader(filepath_or_buffer, index=index, - encoding=encoding, - chunksize=chunksize) + + reader = SAS7BDATReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) else: - raise ValueError('unknown SAS format') + raise ValueError("unknown SAS format") if iterator or chunksize: return reader diff --git a/pandas/io/spss.py b/pandas/io/spss.py index b1b92fc2b84399..983ac1c818c424 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,9 +7,11 @@ from pandas.core.api import DataFrame -def read_spss(path: Union[str, Path], - usecols: Optional[Sequence[str]] = None, - convert_categoricals: bool = True) -> DataFrame: +def read_spss( + path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True, +) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. 
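For context on the `read_spss` signature being wrapped onto one-argument-per-line above, here is a rough usage sketch. It is not part of this patch; the file name and column names are hypothetical, and it assumes the optional `pyreadstat` dependency used by `pandas/io/spss.py` is installed.

```python
import pandas as pd

# Hypothetical .sav file and column names; requires pyreadstat.
df = pd.read_spss(
    "survey.sav",
    usecols=["age", "region"],       # read only these columns
    convert_categoricals=True,       # apply SPSS value labels as Categoricals
)
print(df.dtypes)
```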
@@ -36,6 +38,7 @@ def read_spss(path: Union[str, Path], else: usecols = list(usecols) # pyreadstat requires a list - df, _ = pyreadstat.read_sav(path, usecols=usecols, - apply_value_formats=convert_categoricals) + df, _ = pyreadstat.read_sav( + path, usecols=usecols, apply_value_formats=convert_categoricals + ) return df diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 6cb57077be76a4..211571c7dbaa13 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -14,8 +14,7 @@ import pandas._libs.lib as lib from pandas.compat import raise_with_traceback -from pandas.core.dtypes.common import ( - is_datetime64tz_dtype, is_dict_like, is_list_like) +from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -43,12 +42,14 @@ def _is_sqlalchemy_connectable(con): if _SQLALCHEMY_INSTALLED is None: try: import sqlalchemy + _SQLALCHEMY_INSTALLED = True except ImportError: _SQLALCHEMY_INSTALLED = False if _SQLALCHEMY_INSTALLED: import sqlalchemy # noqa: F811 + return isinstance(con, sqlalchemy.engine.Connectable) else: return False @@ -58,7 +59,7 @@ def _convert_params(sql, params): """Convert SQL and params args to DBAPI2.0 compliant format.""" args = [sql] if params is not None: - if hasattr(params, 'keys'): # test if params is a mapping + if hasattr(params, "keys"): # test if params is a mapping args += [params] else: args += [list(params)] @@ -71,28 +72,30 @@ def _process_parse_dates_argument(parse_dates): if parse_dates is True or parse_dates is None or parse_dates is False: parse_dates = [] - elif not hasattr(parse_dates, '__iter__'): + elif not hasattr(parse_dates, "__iter__"): parse_dates = [parse_dates] return parse_dates def _handle_date_column(col, utc=None, format=None): if isinstance(format, dict): - return to_datetime(col, errors='ignore', **format) + return to_datetime(col, errors="ignore", **format) else: # Allow passing of formatting string for integers # GH17855 - if format is None and (issubclass(col.dtype.type, np.floating) or - issubclass(col.dtype.type, np.integer)): - format = 's' - if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']: - return to_datetime(col, errors='coerce', unit=format, utc=utc) + if format is None and ( + issubclass(col.dtype.type, np.floating) + or issubclass(col.dtype.type, np.integer) + ): + format = "s" + if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]: + return to_datetime(col, errors="coerce", unit=format, utc=utc) elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return to_datetime(col, utc=True) else: - return to_datetime(col, errors='coerce', format=format, utc=utc) + return to_datetime(col, errors="coerce", format=format, utc=utc) def _parse_date_columns(data_frame, parse_dates): @@ -116,12 +119,10 @@ def _parse_date_columns(data_frame, parse_dates): return data_frame -def _wrap_result(data, columns, index_col=None, coerce_float=True, - parse_dates=None): +def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None): """Wrap result set of query in a DataFrame.""" - frame = DataFrame.from_records(data, columns=columns, - coerce_float=coerce_float) + frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) frame = _parse_date_columns(frame, parse_dates) @@ -162,9 +163,17 @@ def execute(sql, con, cur=None, params=None): # ----------------------------------------------------------------------------- # -- Read and write to DataFrames -def 
read_sql_table(table_name, con, schema=None, index_col=None, - coerce_float=True, parse_dates=None, columns=None, - chunksize=None): + +def read_sql_table( + table_name, + con, + schema=None, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + chunksize=None, +): """ Read SQL database table into a DataFrame. @@ -223,10 +232,12 @@ def read_sql_table(table_name, con, schema=None, index_col=None, con = _engine_builder(con) if not _is_sqlalchemy_connectable(con): - raise NotImplementedError("read_sql_table only supported for " - "SQLAlchemy connectable.") + raise NotImplementedError( + "read_sql_table only supported for " "SQLAlchemy connectable." + ) import sqlalchemy from sqlalchemy.schema import MetaData + meta = MetaData(con, schema=schema) try: meta.reflect(only=[table_name], views=True) @@ -235,8 +246,13 @@ def read_sql_table(table_name, con, schema=None, index_col=None, pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( - table_name, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, chunksize=chunksize) + table_name, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) if table is not None: return table @@ -244,8 +260,15 @@ def read_sql_table(table_name, con, schema=None, index_col=None, raise ValueError("Table {name} not found".format(name=table_name), con) -def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None, chunksize=None): +def read_sql_query( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, +): """Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query @@ -301,12 +324,25 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, """ pandas_sql = pandasSQL_builder(con) return pandas_sql.read_query( - sql, index_col=index_col, params=params, coerce_float=coerce_float, - parse_dates=parse_dates, chunksize=chunksize) - - -def read_sql(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None, columns=None, chunksize=None): + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) + + +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, +): """ Read SQL query or database table into a DataFrame. 
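For reference, the parameters being reflowed in `read_sql_query`/`read_sql` above can be exercised with the standard-library sqlite3 driver. This is only an illustrative sketch, not code from the patch; the table and column names are made up, and `params` uses sqlite3's qmark placeholder style.

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE events (id INTEGER, created TEXT)")
con.execute("INSERT INTO events VALUES (1, '2019-06-14'), (2, '2019-06-15')")
con.commit()

# index_col sets the DataFrame index, parse_dates converts the text column
# to datetime64, and params fills the ? placeholder.
df = pd.read_sql(
    "SELECT * FROM events WHERE id >= ?",
    con,
    params=(1,),
    index_col="id",
    parse_dates=["created"],
)
```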
@@ -366,9 +402,13 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, if isinstance(pandas_sql, SQLiteDatabase): return pandas_sql.read_query( - sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates, - chunksize=chunksize) + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) try: _is_table_name = pandas_sql.has_table(sql) @@ -379,17 +419,36 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, if _is_table_name: pandas_sql.meta.reflect(only=[sql]) return pandas_sql.read_table( - sql, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, chunksize=chunksize) + sql, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) else: return pandas_sql.read_query( - sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates, - chunksize=chunksize) + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) -def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, - index_label=None, chunksize=None, dtype=None, method=None): +def to_sql( + frame, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, +): """ Write records stored in a DataFrame to a SQL database. @@ -435,7 +494,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, .. versionadded:: 0.24.0 """ - if if_exists not in ('fail', 'replace', 'append'): + if if_exists not in ("fail", "replace", "append"): raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) pandas_sql = pandasSQL_builder(con, schema=schema) @@ -443,12 +502,21 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, if isinstance(frame, Series): frame = frame.to_frame() elif not isinstance(frame, DataFrame): - raise NotImplementedError("'frame' argument should be either a " - "Series or a DataFrame") + raise NotImplementedError( + "'frame' argument should be either a " "Series or a DataFrame" + ) - pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, - index_label=index_label, schema=schema, - chunksize=chunksize, dtype=dtype, method=method) + pandas_sql.to_sql( + frame, + name, + if_exists=if_exists, + index=index, + index_label=index_label, + schema=schema, + chunksize=chunksize, + dtype=dtype, + method=method, + ) def has_table(table_name, con, schema=None): @@ -496,8 +564,7 @@ def _engine_builder(con): return con -def pandasSQL_builder(con, schema=None, meta=None, - is_cursor=False): +def pandasSQL_builder(con, schema=None, meta=None, is_cursor=False): """ Convenience function to return the correct PandasSQL subclass based on the provided parameters. @@ -521,11 +588,22 @@ class SQLTable(PandasObject): Also holds various flags needed to avoid having to pass them between functions all the time. 
""" + # TODO: support for multiIndex - def __init__(self, name, pandas_sql_engine, frame=None, index=True, - if_exists='fail', prefix='pandas', index_label=None, - schema=None, keys=None, dtype=None): + def __init__( + self, + name, + pandas_sql_engine, + frame=None, + index=True, + if_exists="fail", + prefix="pandas", + index_label=None, + schema=None, + keys=None, + dtype=None, + ): self.name = name self.pd_sql = pandas_sql_engine self.prefix = prefix @@ -544,14 +622,14 @@ def __init__(self, name, pandas_sql_engine, frame=None, index=True, self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError( - "Could not init table '{name}'".format(name=name)) + raise ValueError("Could not init table '{name}'".format(name=name)) def exists(self): return self.pd_sql.has_table(self.name, self.schema) def sql_schema(self): from sqlalchemy.schema import CreateTable + return str(CreateTable(self.table).compile(self.pd_sql.connectable)) def _execute_create(self): @@ -561,17 +639,19 @@ def _execute_create(self): def create(self): if self.exists(): - if self.if_exists == 'fail': + if self.if_exists == "fail": raise ValueError( - "Table '{name}' already exists.".format(name=self.name)) - elif self.if_exists == 'replace': + "Table '{name}' already exists.".format(name=self.name) + ) + elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() - elif self.if_exists == 'append': + elif self.if_exists == "append": pass else: raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists)) + "'{0}' is not valid for if_exists".format(self.if_exists) + ) else: self._execute_create() @@ -606,8 +686,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError( - "duplicate name in index/columns: {0}".format(err)) + raise ValueError("duplicate name in index/columns: {0}".format(err)) else: temp = self.frame @@ -626,7 +705,7 @@ def insert_data(self): d = np.atleast_2d(d) else: # convert to microsecond resolution for datetime.datetime - d = b.values.astype('M8[us]').astype(object) + d = b.values.astype("M8[us]").astype(object) else: d = np.array(b.get_values(), dtype=object) @@ -645,12 +724,12 @@ def insert(self, chunksize=None, method=None): # set insert method if method is None: exec_insert = self._execute_insert - elif method == 'multi': + elif method == "multi": exec_insert = self._execute_insert_multi elif callable(method): exec_insert = partial(method, self) else: - raise ValueError('Invalid parameter `method`: {}'.format(method)) + raise ValueError("Invalid parameter `method`: {}".format(method)) keys, data_list = self.insert_data() @@ -662,7 +741,7 @@ def insert(self, chunksize=None, method=None): if chunksize is None: chunksize = nrows elif chunksize == 0: - raise ValueError('chunksize argument should be non-zero') + raise ValueError("chunksize argument should be non-zero") chunks = int(nrows / chunksize) + 1 @@ -676,8 +755,9 @@ def insert(self, chunksize=None, method=None): chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) exec_insert(conn, keys, chunk_iter) - def _query_iterator(self, result, chunksize, columns, coerce_float=True, - parse_dates=None): + def _query_iterator( + self, result, chunksize, columns, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set.""" while True: @@ -686,7 +766,8 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True, break else: self.frame = DataFrame.from_records( 
- data, columns=columns, coerce_float=coerce_float) + data, columns=columns, coerce_float=coerce_float + ) self._harmonize_columns(parse_dates=parse_dates) @@ -695,11 +776,11 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True, yield self.frame - def read(self, coerce_float=True, parse_dates=None, columns=None, - chunksize=None): + def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None): if columns is not None and len(columns) > 0: from sqlalchemy import select + cols = [self.table.c[n] for n in columns] if self.index is not None: [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] @@ -711,13 +792,18 @@ def read(self, coerce_float=True, parse_dates=None, columns=None, column_names = result.keys() if chunksize is not None: - return self._query_iterator(result, chunksize, column_names, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + result, + chunksize, + column_names, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = result.fetchall() self.frame = DataFrame.from_records( - data, columns=column_names, coerce_float=coerce_float) + data, columns=column_names, coerce_float=coerce_float + ) self._harmonize_columns(parse_dates=parse_dates) @@ -737,16 +823,22 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels)) + "levels, which is {0}".format(nlevels) + ) else: return index_label # return the used column labels for the index columns - if (nlevels == 1 and 'index' not in self.frame.columns and - self.frame.index.name is None): - return ['index'] + if ( + nlevels == 1 + and "index" not in self.frame.columns + and self.frame.index.name is None + ): + return ["index"] else: - return [l if l is not None else "level_{0}".format(i) - for i, l in enumerate(self.frame.index.names)] + return [ + l if l is not None else "level_{0}".format(i) + for i, l in enumerate(self.frame.index.names) + ] # for reading: index=(list of) string to specify column to set as index elif isinstance(index, str): @@ -760,14 +852,11 @@ def _get_column_names_and_types(self, dtype_mapper): column_names_and_types = [] if self.index is not None: for i, idx_label in enumerate(self.index): - idx_type = dtype_mapper( - self.frame.index._get_level_values(i)) + idx_type = dtype_mapper(self.frame.index._get_level_values(i)) column_names_and_types.append((str(idx_label), idx_type, True)) column_names_and_types += [ - (str(self.frame.columns[i]), - dtype_mapper(self.frame.iloc[:, i]), - False) + (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False) for i in range(len(self.frame.columns)) ] @@ -776,19 +865,19 @@ def _get_column_names_and_types(self, dtype_mapper): def _create_table_setup(self): from sqlalchemy import Table, Column, PrimaryKeyConstraint - column_names_and_types = self._get_column_names_and_types( - self._sqlalchemy_type - ) + column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) - columns = [Column(name, typ, index=is_index) - for name, typ, is_index in column_names_and_types] + columns = [ + Column(name, typ, index=is_index) + for name, typ, is_index in column_names_and_types + ] if self.keys is not None: if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys - pkc = PrimaryKeyConstraint(*keys, name=self.name + '_pk') + pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk") columns.append(pkc) schema = 
self.schema or self.pd_sql.meta.schema @@ -796,6 +885,7 @@ def _create_table_setup(self): # At this point, attach to new metadata, only attach to self.meta # once table is created. from sqlalchemy.schema import MetaData + meta = MetaData(self.pd_sql, schema=schema) return Table(self.name, meta, *columns, schema=schema) @@ -826,15 +916,17 @@ def _harmonize_columns(self, parse_dates=None): fmt = parse_dates[col_name] except TypeError: fmt = None - self.frame[col_name] = _handle_date_column( - df_col, format=fmt) + self.frame[col_name] = _handle_date_column(df_col, format=fmt) continue # the type the dataframe column should have col_type = self._get_dtype(sql_col.type) - if (col_type is datetime or col_type is date or - col_type is DatetimeTZDtype): + if ( + col_type is datetime + or col_type is date + or col_type is DatetimeTZDtype + ): # Convert tz-aware Datetime SQL columns to UTC utc = col_type is DatetimeTZDtype self.frame[col_name] = _handle_date_column(df_col, utc=utc) @@ -844,9 +936,8 @@ def _harmonize_columns(self, parse_dates=None): elif len(df_col) == df_col.count(): # No NA values, can convert ints and bools - if col_type is np.dtype('int64') or col_type is bool: - self.frame[col_name] = df_col.astype( - col_type, copy=False) + if col_type is np.dtype("int64") or col_type is bool: + self.frame[col_name] = df_col.astype(col_type, copy=False) except KeyError: pass # this column not in results @@ -860,11 +951,19 @@ def _sqlalchemy_type(self, col): # Needed for inserting typed data containing NULLs, GH 8778. col_type = lib.infer_dtype(col, skipna=True) - from sqlalchemy.types import (BigInteger, Integer, Float, - Text, Boolean, - DateTime, Date, Time, TIMESTAMP) + from sqlalchemy.types import ( + BigInteger, + Integer, + Float, + Text, + Boolean, + DateTime, + Date, + Time, + TIMESTAMP, + ) - if col_type == 'datetime64' or col_type == 'datetime': + if col_type == "datetime64" or col_type == "datetime": # GH 9086: TIMESTAMP is the suggested type if the column contains # timezone information try: @@ -875,41 +974,44 @@ def _sqlalchemy_type(self, col): if col.tz is not None: return TIMESTAMP(timezone=True) return DateTime - if col_type == 'timedelta64': - warnings.warn("the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", UserWarning, stacklevel=8) + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) return BigInteger - elif col_type == 'floating': - if col.dtype == 'float32': + elif col_type == "floating": + if col.dtype == "float32": return Float(precision=23) else: return Float(precision=53) - elif col_type == 'integer': - if col.dtype == 'int32': + elif col_type == "integer": + if col.dtype == "int32": return Integer else: return BigInteger - elif col_type == 'boolean': + elif col_type == "boolean": return Boolean - elif col_type == 'date': + elif col_type == "date": return Date - elif col_type == 'time': + elif col_type == "time": return Time - elif col_type == 'complex': - raise ValueError('Complex datatypes not supported') + elif col_type == "complex": + raise ValueError("Complex datatypes not supported") return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import (Integer, Float, Boolean, DateTime, - Date, TIMESTAMP) + from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP if isinstance(sqltype, Float): return float elif 
isinstance(sqltype, Integer): # TODO: Refine integer size. - return np.dtype('int64') + return np.dtype("int64") elif isinstance(sqltype, TIMESTAMP): # we have a timezone capable type if not sqltype.timezone: @@ -931,12 +1033,16 @@ class PandasSQL(PandasObject): """ def read_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or sqlite connection") + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) def to_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or sqlite connection") + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) class SQLDatabase(PandasSQL): @@ -963,6 +1069,7 @@ def __init__(self, engine, schema=None, meta=None): self.connectable = engine if not meta: from sqlalchemy.schema import MetaData + meta = MetaData(self.connectable, schema=schema) self.meta = meta @@ -970,7 +1077,7 @@ def __init__(self, engine, schema=None, meta=None): @contextmanager def run_transaction(self): with self.connectable.begin() as tx: - if hasattr(tx, 'execute'): + if hasattr(tx, "execute"): yield tx else: yield self.connectable @@ -979,9 +1086,16 @@ def execute(self, *args, **kwargs): """Simple passthrough to SQLAlchemy connectable""" return self.connectable.execute(*args, **kwargs) - def read_table(self, table_name, index_col=None, coerce_float=True, - parse_dates=None, columns=None, schema=None, - chunksize=None): + def read_table( + self, + table_name, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + schema=None, + chunksize=None, + ): """Read SQL database table into a DataFrame. Parameters @@ -1024,13 +1138,17 @@ def read_table(self, table_name, index_col=None, coerce_float=True, """ table = SQLTable(table_name, self, index=index_col, schema=schema) - return table.read(coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, - chunksize=chunksize) + return table.read( + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) @staticmethod - def _query_iterator(result, chunksize, columns, index_col=None, - coerce_float=True, parse_dates=None): + def _query_iterator( + result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set""" while True: @@ -1038,12 +1156,23 @@ def _query_iterator(result, chunksize, columns, index_col=None, if not data: break else: - yield _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) - def read_query(self, sql, index_col=None, coerce_float=True, - parse_dates=None, params=None, chunksize=None): + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + parse_dates=None, + params=None, + chunksize=None, + ): """Read SQL query into a DataFrame. 
Parameters @@ -1090,22 +1219,39 @@ def read_query(self, sql, index_col=None, coerce_float=True, columns = result.keys() if chunksize is not None: - return self._query_iterator(result, chunksize, columns, - index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + result, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = result.fetchall() - frame = _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) return frame read_sql = read_query - def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None, schema=None, chunksize=None, dtype=None, - method=None): + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. @@ -1152,24 +1298,33 @@ def to_sql(self, frame, name, if_exists='fail', index=True, if dtype is not None: from sqlalchemy.types import to_instance, TypeEngine + for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError('The type of {column} is not a ' - 'SQLAlchemy type '.format(column=col)) + raise ValueError( + "The type of {column} is not a " + "SQLAlchemy type ".format(column=col) + ) - table = SQLTable(name, self, frame=frame, index=index, - if_exists=if_exists, index_label=index_label, - schema=schema, dtype=dtype) + table = SQLTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + schema=schema, + dtype=dtype, + ) table.create() table.insert(chunksize, method=method) - if (not name.isdigit() and not name.islower()): + if not name.isdigit() and not name.islower(): # check for potentially case sensitivity issues (GH7815) # Only check when name is not a number and name is not lower case engine = self.connectable.engine with self.connectable.connect() as conn: table_names = engine.table_names( - schema=schema or self.meta.schema, - connection=conn, + schema=schema or self.meta.schema, connection=conn ) if name not in table_names: msg = ( @@ -1186,20 +1341,19 @@ def tables(self): def has_table(self, name, schema=None): return self.connectable.run_callable( - self.connectable.dialect.has_table, - name, - schema or self.meta.schema, + self.connectable.dialect.has_table, name, schema or self.meta.schema ) def get_table(self, table_name, schema=None): schema = schema or self.meta.schema if schema: - tbl = self.meta.tables.get('.'.join([schema, table_name])) + tbl = self.meta.tables.get(".".join([schema, table_name])) else: tbl = self.meta.tables.get(table_name) # Avoid casting double-precision floats into decimals from sqlalchemy import Numeric + for column in tbl.columns: if isinstance(column.type, Numeric): column.type.asdecimal = False @@ -1214,8 +1368,9 @@ def drop_table(self, table_name, schema=None): self.meta.clear() def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): - table = SQLTable(table_name, self, frame=frame, index=False, keys=keys, - dtype=dtype) + table = SQLTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) return str(table.sql_schema()) @@ -1223,13 +1378,13 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): # sqlite-specific sql 
strings and handler class # dictionary used for readability purposes _SQL_TYPES = { - 'string': 'TEXT', - 'floating': 'REAL', - 'integer': 'INTEGER', - 'datetime': 'TIMESTAMP', - 'date': 'DATE', - 'time': 'TIME', - 'boolean': 'INTEGER', + "string": "TEXT", + "floating": "REAL", + "integer": "INTEGER", + "datetime": "TIMESTAMP", + "date": "DATE", + "time": "TIME", + "boolean": "INTEGER", } @@ -1238,7 +1393,8 @@ def _get_unicode_name(name): uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name)) + "Cannot convert identifier to UTF-8: '{name}'".format(name=name) + ) return uname @@ -1256,13 +1412,15 @@ def _get_valid_sqlite_name(name): nul_index = uname.find("\x00") if nul_index >= 0: - raise ValueError('SQLite identifier cannot contain NULs') + raise ValueError("SQLite identifier cannot contain NULs") return '"' + uname.replace('"', '""') + '"' -_SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. " - "In pandas versions < 0.14, spaces were converted to " - "underscores.") +_SAFE_NAMES_WARNING = ( + "The spaces in these column names will not be changed. " + "In pandas versions < 0.14, spaces were converted to " + "underscores." +) class SQLiteTable(SQLTable): @@ -1275,6 +1433,7 @@ def __init__(self, *args, **kwargs): # GH 8341 # register an adapter callable for datetime.time object import sqlite3 + # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) @@ -1290,18 +1449,18 @@ def _execute_create(self): def insert_statement(self): names = list(map(str, self.frame.columns)) - wld = '?' # wildcard char + wld = "?" # wildcard char escape = _get_valid_sqlite_name if self.index is not None: [names.insert(0, idx) for idx in self.index[::-1]] bracketed_names = [escape(column) for column in names] - col_names = ','.join(bracketed_names) - wildcards = ','.join([wld] * len(names)) - insert_statement = \ - 'INSERT INTO {table} ({columns}) VALUES ({wld})'.format( - table=escape(self.name), columns=col_names, wld=wildcards) + col_names = ",".join(bracketed_names) + wildcards = ",".join([wld] * len(names)) + insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( + table=escape(self.name), columns=col_names, wld=wildcards + ) return insert_statement def _execute_insert(self, conn, keys, data_iter): @@ -1314,19 +1473,18 @@ def _create_table_setup(self): structure of a DataFrame. The first entry will be a CREATE TABLE statement while the rest will be CREATE INDEX statements. 
""" - column_names_and_types = self._get_column_names_and_types( - self._sql_type_name - ) + column_names_and_types = self._get_column_names_and_types(self._sql_type_name) - pat = re.compile(r'\s+') + pat = re.compile(r"\s+") column_names = [col_name for col_name, _, _ in column_names_and_types] if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) escape = _get_valid_sqlite_name - create_tbl_stmts = [escape(cname) + ' ' + ctype - for cname, ctype, _ in column_names_and_types] + create_tbl_stmts = [ + escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types + ] if self.keys is not None and len(self.keys): if not is_list_like(self.keys): @@ -1336,19 +1494,31 @@ def _create_table_setup(self): cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br)) - - create_stmts = ["CREATE TABLE " + escape(self.name) + " (\n" + - ',\n '.join(create_tbl_stmts) + "\n)"] + tbl=self.name, cnames_br=cnames_br + ) + ) + + create_stmts = [ + "CREATE TABLE " + + escape(self.name) + + " (\n" + + ",\n ".join(create_tbl_stmts) + + "\n)" + ] - ix_cols = [cname for cname, _, is_index in column_names_and_types - if is_index] + ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] if len(ix_cols): cnames = "_".join(ix_cols) cnames_br = ",".join(escape(c) for c in ix_cols) create_stmts.append( - "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) + - "ON " + escape(self.name) + " (" + cnames_br + ")") + "CREATE INDEX " + + escape("ix_" + self.name + "_" + cnames) + + "ON " + + escape(self.name) + + " (" + + cnames_br + + ")" + ) return create_stmts @@ -1361,10 +1531,14 @@ def _sql_type_name(self, col): # Needed for inserting typed data containing NULLs, GH 8778. 
col_type = lib.infer_dtype(col, skipna=True) - if col_type == 'timedelta64': - warnings.warn("the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", UserWarning, stacklevel=8) + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) col_type = "integer" elif col_type == "datetime64": @@ -1374,7 +1548,7 @@ def _sql_type_name(self, col): col_type = "string" elif col_type == "complex": - raise ValueError('Complex datatypes not supported') + raise ValueError("Complex datatypes not supported") if col_type not in _SQL_TYPES: col_type = "string" @@ -1426,17 +1600,19 @@ def execute(self, *args, **kwargs): except Exception: # pragma: no cover ex = DatabaseError( "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc)) + "to rollback".format(sql=args[0], exc=exc) + ) raise_with_traceback(ex) ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format( - sql=args[0], exc=exc)) + "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) + ) raise_with_traceback(ex) @staticmethod - def _query_iterator(cursor, chunksize, columns, index_col=None, - coerce_float=True, parse_dates=None): + def _query_iterator( + cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set""" while True: @@ -1447,29 +1623,48 @@ def _query_iterator(cursor, chunksize, columns, index_col=None, cursor.close() break else: - yield _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) - def read_query(self, sql, index_col=None, coerce_float=True, params=None, - parse_dates=None, chunksize=None): + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, + ): args = _convert_params(sql, params) cursor = self.execute(*args) columns = [col_desc[0] for col_desc in cursor.description] if chunksize is not None: - return self._query_iterator(cursor, chunksize, columns, - index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + cursor, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = self._fetchall_as_list(cursor) cursor.close() - frame = _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) return frame def _fetchall_as_list(self, cur): @@ -1478,9 +1673,18 @@ def _fetchall_as_list(self, cur): result = list(result) return result - def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None, schema=None, chunksize=None, dtype=None, - method=None): + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. 
@@ -1527,12 +1731,21 @@ def to_sql(self, frame, name, if_exists='fail', index=True, if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError('{column} ({type!s}) not a string'.format( - column=col, type=my_type)) + raise ValueError( + "{column} ({type!s}) not a string".format( + column=col, type=my_type + ) + ) - table = SQLiteTable(name, self, frame=frame, index=index, - if_exists=if_exists, index_label=index_label, - dtype=dtype) + table = SQLiteTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + dtype=dtype, + ) table.create() table.insert(chunksize, method) @@ -1541,23 +1754,24 @@ def has_table(self, name, schema=None): # escape = _get_valid_sqlite_name # esc_name = escape(name) - wld = '?' - query = ("SELECT name FROM sqlite_master " - "WHERE type='table' AND name={wld};").format(wld=wld) + wld = "?" + query = ( + "SELECT name FROM sqlite_master " "WHERE type='table' AND name={wld};" + ).format(wld=wld) - return len(self.execute(query, [name, ]).fetchall()) > 0 + return len(self.execute(query, [name]).fetchall()) > 0 def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format( - name=_get_valid_sqlite_name(name)) + drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): - table = SQLiteTable(table_name, self, frame=frame, index=False, - keys=keys, dtype=dtype) + table = SQLiteTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) return str(table.sql_schema()) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 00b7a29b27b63f..7087d2ee963cbd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -26,20 +26,31 @@ from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_datetime64_dtype) + ensure_object, + is_categorical_dtype, + is_datetime64_dtype, +) from pandas import ( - Categorical, DatetimeIndex, NaT, Timestamp, concat, isna, to_datetime, - to_timedelta) + Categorical, + DatetimeIndex, + NaT, + Timestamp, + concat, + isna, + to_datetime, + to_timedelta, +) from pandas.core.frame import DataFrame from pandas.core.series import Series -from pandas.io.common import ( - BaseIterator, _stringify_path, get_filepath_or_buffer) +from pandas.io.common import BaseIterator, _stringify_path, get_filepath_or_buffer -_version_error = ("Version of given Stata file is not 104, 105, 108, " - "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " - "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)") +_version_error = ( + "Version of given Stata file is not 104, 105, 108, " + "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " + "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" +) _statafile_processing_params1 = """\ convert_dates : boolean, defaults to True @@ -111,9 +122,13 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... 
do_something(chunk) -""" % (_statafile_processing_params1, _encoding_params, - _statafile_processing_params2, _chunksize_params, - _iterator_params) +""" % ( + _statafile_processing_params1, + _encoding_params, + _statafile_processing_params2, + _chunksize_params, + _iterator_params, +) _data_method_doc = """\ Read observations from Stata file, converting them into a dataframe @@ -129,7 +144,10 @@ Returns ------- DataFrame -""" % (_statafile_processing_params1, _statafile_processing_params2) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, +) _read_method_doc = """\ Reads observations from Stata file, converting them into a dataframe @@ -144,7 +162,10 @@ Returns ------- DataFrame -""" % (_statafile_processing_params1, _statafile_processing_params2) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, +) _stata_reader_doc = """\ @@ -161,26 +182,42 @@ %s %s %s -""" % (_statafile_processing_params1, _statafile_processing_params2, - _encoding_params, _chunksize_params) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, + _encoding_params, + _chunksize_params, +) @Appender(_read_stata_doc) -@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) -@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') -def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index_col=None, - convert_missing=False, preserve_dtypes=True, columns=None, - order_categoricals=True, chunksize=None, iterator=False): - - reader = StataReader(filepath_or_buffer, - convert_dates=convert_dates, - convert_categoricals=convert_categoricals, - index_col=index_col, convert_missing=convert_missing, - preserve_dtypes=preserve_dtypes, - columns=columns, - order_categoricals=order_categoricals, - chunksize=chunksize) +@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) +@deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") +def read_stata( + filepath_or_buffer, + convert_dates=True, + convert_categoricals=True, + encoding=None, + index_col=None, + convert_missing=False, + preserve_dtypes=True, + columns=None, + order_categoricals=True, + chunksize=None, + iterator=False, +): + + reader = StataReader( + filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index_col=index_col, + convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, + ) if iterator or chunksize: data = reader @@ -261,12 +298,12 @@ def convert_year_month_safe(year, month): using datetime. 
""" if year.max() < MAX_YEAR and year.min() > MIN_YEAR: - return to_datetime(100 * year + month, format='%Y%m') + return to_datetime(100 * year + month, format="%Y%m") else: - index = getattr(year, 'index', None) + index = getattr(year, "index", None) return Series( - [datetime.datetime(y, m, 1) for y, m in zip(year, month)], - index=index) + [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index + ) def convert_year_days_safe(year, days): """ @@ -274,12 +311,13 @@ def convert_year_days_safe(year, days): datetime or datetime64 Series """ if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: - return (to_datetime(year, format='%Y') + - to_timedelta(days, unit='d')) + return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") else: - index = getattr(year, 'index', None) - value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) - for y, d in zip(year, days)] + index = getattr(year, "index", None) + value = [ + datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) + for y, d in zip(year, days) + ] return Series(value, index=index) def convert_delta_safe(base, deltas, unit): @@ -288,18 +326,19 @@ def convert_delta_safe(base, deltas, unit): versions if the deltas satisfy restrictions required to be expressed as dates in pandas. """ - index = getattr(deltas, 'index', None) - if unit == 'd': + index = getattr(deltas, "index", None) + if unit == "d": if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: values = [base + relativedelta(days=int(d)) for d in deltas] return Series(values, index=index) - elif unit == 'ms': + elif unit == "ms": if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [base + relativedelta(microseconds=(int(d) * 1000)) - for d in deltas] + values = [ + base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas + ] return Series(values, index=index) else: - raise ValueError('format not understood') + raise ValueError("format not understood") base = to_datetime(base) deltas = to_timedelta(deltas, unit=unit) return base + deltas @@ -317,11 +356,10 @@ def convert_delta_safe(base, deltas, unit): if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates - conv_dates = convert_delta_safe(base, ms, 'ms') + conv_dates = convert_delta_safe(base, ms, "ms") elif fmt.startswith(("%tC", "tC")): - warnings.warn("Encountered %tC format. Leaving in Stata " - "Internal Format.") + warnings.warn("Encountered %tC format. Leaving in Stata " "Internal Format.") conv_dates = Series(dates, dtype=np.object) if has_bad_values: conv_dates[bad_locs] = NaT @@ -330,7 +368,7 @@ def convert_delta_safe(base, deltas, unit): elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates - conv_dates = convert_delta_safe(base, days, 'd') + conv_dates = convert_delta_safe(base, days, "d") # does not count leap days - 7 days is a week. 
# 52nd week may have more than 7 days elif fmt.startswith(("%tw", "tw")): @@ -383,34 +421,35 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): if is_datetime64_dtype(dates.values): if delta: delta = dates - stata_epoch - d['delta'] = delta.values.astype( - np.int64) // 1000 # microseconds + d["delta"] = delta.values.astype(np.int64) // 1000 # microseconds if days or year: dates = DatetimeIndex(dates) - d['year'], d['month'] = dates.year, dates.month + d["year"], d["month"] = dates.year, dates.month if days: - days = (dates.astype(np.int64) - - to_datetime(d['year'], format='%Y').astype(np.int64)) - d['days'] = days // NS_PER_DAY + days = dates.astype(np.int64) - to_datetime( + d["year"], format="%Y" + ).astype(np.int64) + d["days"] = days // NS_PER_DAY - elif infer_dtype(dates, skipna=False) == 'datetime': + elif infer_dtype(dates, skipna=False) == "datetime": if delta: delta = dates.values - stata_epoch - f = lambda x: \ - US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds + f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds v = np.vectorize(f) - d['delta'] = v(delta) + d["delta"] = v(delta) if year: year_month = dates.apply(lambda x: 100 * x.year + x.month) - d['year'] = year_month.values // 100 - d['month'] = (year_month.values - d['year'] * 100) + d["year"] = year_month.values // 100 + d["month"] = year_month.values - d["year"] * 100 if days: f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days v = np.vectorize(f) - d['days'] = v(dates) + d["days"] = v(dates) else: - raise ValueError('Columns containing dates must contain either ' - 'datetime64, datetime.datetime or null values.') + raise ValueError( + "Columns containing dates must contain either " + "datetime64, datetime.datetime or null values." 
+ ) return DataFrame(d, index=index) @@ -434,26 +473,26 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): conv_dates = d.delta // US_PER_DAY elif fmt in ["%tw", "tw"]: d = parse_dates_safe(dates, year=True, days=True) - conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7) + conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 elif fmt in ["%tm", "tm"]: d = parse_dates_safe(dates, year=True) - conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1) + conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1 elif fmt in ["%tq", "tq"]: d = parse_dates_safe(dates, year=True) conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 elif fmt in ["%th", "th"]: d = parse_dates_safe(dates, year=True) - conv_dates = (2 * (d.year - stata_epoch.year) + - (d.month > 6).astype(np.int)) + conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int) elif fmt in ["%ty", "ty"]: d = parse_dates_safe(dates, year=True) conv_dates = d.year else: raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt)) + "Format {fmt} is not a known Stata date format".format(fmt=fmt) + ) conv_dates = Series(conv_dates, dtype=np.float64) - missing_value = struct.unpack('= 2 ** 53: - ws = precision_loss_doc % ('uint64', 'float64') + ws = precision_loss_doc % ("uint64", "float64") data[col] = data[col].astype(dtype) @@ -561,28 +602,31 @@ def _cast_to_stata_types(data): if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) elif dtype == np.int64: - if (data[col].max() <= 2147483620 and - data[col].min() >= -2147483647): + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: - ws = precision_loss_doc % ('int64', 'float64') + ws = precision_loss_doc % ("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): - raise ValueError('Column {col} has a maximum value of ' - 'infinity which is outside the range ' - 'supported by Stata.'.format(col=col)) + raise ValueError( + "Column {col} has a maximum value of " + "infinity which is outside the range " + "supported by Stata.".format(col=col) + ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: - raise ValueError('Column {col} has a maximum value ' - '({val}) outside the range supported by ' - 'Stata ({float64_max})' - .format(col=col, val=value, - float64_max=float64_max)) + raise ValueError( + "Column {col} has a maximum value " + "({val}) outside the range supported by " + "Stata ({float64_max})".format( + col=col, val=value, float64_max=float64_max + ) + ) if ws: warnings.warn(ws, PossiblePrecisionLoss) @@ -630,8 +674,10 @@ def __init__(self, catarray): category = vl[1] if not isinstance(category, str): category = str(category) - warnings.warn(value_label_mismatch_doc.format(catarray.name), - ValueLabelTypeMismatch) + warnings.warn( + value_label_mismatch_doc.format(catarray.name), + ValueLabelTypeMismatch, + ) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding @@ -640,9 +686,11 @@ def __init__(self, catarray): self.n += 1 if self.text_len > 32000: - raise ValueError('Stata value labels for a single variable must ' - 'have a combined length less than 32,000 ' - 'characters.') + raise ValueError( + "Stata value labels for a 
single variable must " + "have a combined length less than 32,000 " + "characters." + ) # Ensure int32 self.off = np.array(self.off, dtype=np.int32) @@ -674,11 +722,11 @@ def generate_value_label(self, byteorder, encoding): self._encoding = encoding bio = BytesIO() - null_string = '\x00' - null_byte = b'\x00' + null_string = "\x00" + null_byte = b"\x00" # len - bio.write(struct.pack(byteorder + 'i', self.len)) + bio.write(struct.pack(byteorder + "i", self.len)) # labname labname = self._encode(_pad_bytes(self.labname[:32], 33)) @@ -686,22 +734,22 @@ def generate_value_label(self, byteorder, encoding): # padding - 3 bytes for i in range(3): - bio.write(struct.pack('c', null_byte)) + bio.write(struct.pack("c", null_byte)) # value_label_table # n - int32 - bio.write(struct.pack(byteorder + 'i', self.n)) + bio.write(struct.pack(byteorder + "i", self.n)) # textlen - int32 - bio.write(struct.pack(byteorder + 'i', self.text_len)) + bio.write(struct.pack(byteorder + "i", self.text_len)) # off - int32 array (n elements) for offset in self.off: - bio.write(struct.pack(byteorder + 'i', offset)) + bio.write(struct.pack(byteorder + "i", offset)) # val - int32 array (n elements) for value in self.val: - bio.write(struct.pack(byteorder + 'i', value)) + bio.write(struct.pack(byteorder + "i", value)) # txt - Text labels, null terminated for text in self.txt: @@ -760,36 +808,37 @@ class StataMissingValue: bases = (101, 32741, 2147483621) for b in bases: # Conversion to long to avoid hash issues on 32 bit platforms #8968 - MISSING_VALUES[b] = '.' + MISSING_VALUES[b] = "." for i in range(1, 27): - MISSING_VALUES[i + b] = '.' + chr(96 + i) + MISSING_VALUES[i + b] = "." + chr(96 + i) - float32_base = b'\x00\x00\x00\x7f' - increment = struct.unpack(' 0: MISSING_VALUES[value] += chr(96 + i) - int_value = struct.unpack(' 0: MISSING_VALUES[value] += chr(96 + i) - int_value = struct.unpack('q', struct.pack(' 0 + self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 # calculate size of a data record self.col_sizes = [self._calcsize(typ) for typ in self.typlist] @@ -1038,10 +1135,9 @@ def _read_new_header(self, first_char): raise ValueError(_version_error) self._set_encoding() self.path_or_buf.read(21) # - self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<' + self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" self.path_or_buf.read(15) # - self.nvar = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] self.path_or_buf.read(7) # self.nobs = self._get_nobs() @@ -1053,27 +1149,35 @@ def _read_new_header(self, first_char): self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of - self._seek_vartypes = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16 - self._seek_varnames = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - self._seek_sortlist = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - self._seek_formats = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 - self._seek_value_label_names = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 + self._seek_vartypes = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16 + ) + self._seek_varnames = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 + ) + self._seek_sortlist = ( + struct.unpack(self.byteorder + "q", 
self.path_or_buf.read(8))[0] + 10 + ) + self._seek_formats = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9 + ) + self._seek_value_label_names = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19 + ) # Requires version-specific treatment self._seek_variable_labels = self._get_seek_variable_labels() self.path_or_buf.read(8) # - self.data_location = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 - self.seek_strls = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7 - self.seek_value_labels = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14 + self.data_location = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6 + ) + self.seek_strls = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7 + ) + self.seek_value_labels = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14 + ) self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes) @@ -1082,8 +1186,8 @@ def _read_new_header(self, first_char): self.path_or_buf.seek(self._seek_sortlist) self.srtlist = struct.unpack( - self.byteorder + ('h' * (self.nvar + 1)), - self.path_or_buf.read(2 * (self.nvar + 1)) + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), )[:-1] self.path_or_buf.seek(self._seek_formats) @@ -1099,9 +1203,10 @@ def _read_new_header(self, first_char): def _get_dtypes(self, seek_vartypes): self.path_or_buf.seek(seek_vartypes) - raw_typlist = [struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] - for i in range(self.nvar)] + raw_typlist = [ + struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + for i in range(self.nvar) + ] def f(typ): if typ <= 2045: @@ -1109,8 +1214,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]". 
- format(typ)) + raise ValueError("cannot convert stata types [{0}]".format(typ)) typlist = [f(x) for x in raw_typlist] @@ -1120,8 +1224,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]" - .format(typ)) + raise ValueError("cannot convert stata dtype [{0}]".format(typ)) dtyplist = [f(x) for x in raw_typlist] @@ -1133,8 +1236,7 @@ def _get_varlist(self): elif self.format_version == 118: b = 129 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the format list def _get_fmtlist(self): @@ -1147,8 +1249,7 @@ def _get_fmtlist(self): else: b = 7 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the label list def _get_lbllist(self): @@ -1158,36 +1259,35 @@ def _get_lbllist(self): b = 33 else: b = 9 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] def _get_variable_labels(self): if self.format_version == 118: - vlblist = [self._decode(self.path_or_buf.read(321)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(321)) for i in range(self.nvar) + ] elif self.format_version > 105: - vlblist = [self._decode(self.path_or_buf.read(81)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(81)) for i in range(self.nvar) + ] else: - vlblist = [self._decode(self.path_or_buf.read(32)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(32)) for i in range(self.nvar) + ] return vlblist def _get_nobs(self): if self.format_version == 118: - return struct.unpack(self.byteorder + 'Q', - self.path_or_buf.read(8))[0] + return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] else: - return struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] + return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] def _get_data_label(self): if self.format_version == 118: - strlen = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version == 117: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 105: return self._decode(self.path_or_buf.read(81)) @@ -1196,10 +1296,10 @@ def _get_data_label(self): def _get_time_stamp(self): if self.format_version == 118: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self.path_or_buf.read(strlen).decode("utf-8") elif self.format_version == 117: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 104: return self._decode(self.path_or_buf.read(18)) @@ -1214,23 +1314,22 @@ def _get_seek_variable_labels(self): # variable, 20 for the closing tag and 17 for the opening tag return self._seek_value_label_names + (33 * self.nvar) + 20 + 17 elif self.format_version == 118: - return struct.unpack(self.byteorder + 'q', - self.path_or_buf.read(8))[0] + 17 + return 
struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17 else: raise ValueError() def _read_old_header(self, first_char): - self.format_version = struct.unpack('b', first_char)[0] + self.format_version = struct.unpack("b", first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: raise ValueError(_version_error) self._set_encoding() - self.byteorder = struct.unpack( - 'b', self.path_or_buf.read(1))[0] == 0x1 and '>' or '<' - self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] + self.byteorder = ( + struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" + ) + self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0] self.path_or_buf.read(1) # unused - self.nvar = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] self.nobs = self._get_nobs() self._data_label = self._get_data_label() @@ -1239,8 +1338,7 @@ def _read_old_header(self, first_char): # descriptors if self.format_version > 108: - typlist = [ord(self.path_or_buf.read(1)) - for i in range(self.nvar)] + typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)] else: buf = self.path_or_buf.read(self.nvar) typlistb = np.frombuffer(buf, dtype=np.uint8) @@ -1254,23 +1352,31 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError("cannot convert stata types [{0}]" - .format(','.join(str(x) for x in typlist))) + raise ValueError( + "cannot convert stata types [{0}]".format( + ",".join(str(x) for x in typlist) + ) + ) try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError("cannot convert stata dtypes [{0}]" - .format(','.join(str(x) for x in typlist))) + raise ValueError( + "cannot convert stata dtypes [{0}]".format( + ",".join(str(x) for x in typlist) + ) + ) if self.format_version > 108: - self.varlist = [self._decode(self.path_or_buf.read(33)) - for i in range(self.nvar)] + self.varlist = [ + self._decode(self.path_or_buf.read(33)) for i in range(self.nvar) + ] else: - self.varlist = [self._decode(self.path_or_buf.read(9)) - for i in range(self.nvar)] + self.varlist = [ + self._decode(self.path_or_buf.read(9)) for i in range(self.nvar) + ] self.srtlist = struct.unpack( - self.byteorder + ('h' * (self.nvar + 1)), - self.path_or_buf.read(2 * (self.nvar + 1)) + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), )[:-1] self.fmtlist = self._get_fmtlist() @@ -1286,14 +1392,17 @@ def _read_old_header(self, first_char): if self.format_version > 104: while True: - data_type = struct.unpack(self.byteorder + 'b', - self.path_or_buf.read(1))[0] + data_type = struct.unpack( + self.byteorder + "b", self.path_or_buf.read(1) + )[0] if self.format_version > 108: - data_len = struct.unpack(self.byteorder + 'i', - self.path_or_buf.read(4))[0] + data_len = struct.unpack( + self.byteorder + "i", self.path_or_buf.read(4) + )[0] else: - data_len = struct.unpack(self.byteorder + 'h', - self.path_or_buf.read(2))[0] + data_len = struct.unpack( + self.byteorder + "h", self.path_or_buf.read(2) + )[0] if data_type == 0: break self.path_or_buf.read(data_len) @@ -1309,18 +1418,16 @@ def _setup_dtype(self): dtype = [] # Convert struct data types to numpy data type for i, typ in enumerate(self.typlist): if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + - self.NUMPY_TYPE_MAP[typ])) + dtype.append(("s" + 
str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) else: - dtype.append(('s' + str(i), 'S' + str(typ))) + dtype.append(("s" + str(i), "S" + str(typ))) dtype = np.dtype(dtype) self._dtype = dtype return self._dtype def _calcsize(self, fmt): - return (type(fmt) is int and fmt or - struct.calcsize(self.byteorder + fmt)) + return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt) def _decode(self, s): # have bytes not strings, so must decode @@ -1336,7 +1443,7 @@ def _decode(self, s): has been incorrectly encoded by Stata or some other software. You should verify the string values returned are correct.""" warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) - return s.decode('latin-1') + return s.decode("latin-1") def _read_value_labels(self): if self._value_labels_read: @@ -1359,7 +1466,7 @@ def _read_value_labels(self): while True: if self.format_version >= 117: - if self.path_or_buf.read(5) == b' + if self.path_or_buf.read(5) == b" break # end of value label table slength = self.path_or_buf.read(4) @@ -1371,16 +1478,14 @@ def _read_value_labels(self): labname = self._decode(self.path_or_buf.read(129)) self.path_or_buf.read(3) # padding - n = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] - txtlen = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] - off = np.frombuffer(self.path_or_buf.read(4 * n), - dtype=self.byteorder + "i4", - count=n) - val = np.frombuffer(self.path_or_buf.read(4 * n), - dtype=self.byteorder + "i4", - count=n) + n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + off = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) + val = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) ii = np.argsort(off) off = off[ii] val = val[ii] @@ -1388,8 +1493,7 @@ def _read_value_labels(self): self.value_label_dict[labname] = dict() for i in range(n): end = off[i + 1] if i < n - 1 else txtlen - self.value_label_dict[labname][val[i]] = \ - self._decode(txt[off[i]:end]) + self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end]) if self.format_version >= 117: self.path_or_buf.read(6) # self._value_labels_read = True @@ -1397,25 +1501,23 @@ def _read_value_labels(self): def _read_strls(self): self.path_or_buf.seek(self.seek_strls) # Wrap v_o in a string to allow uint64 values as keys on 32bit OS - self.GSO = {'0': ''} + self.GSO = {"0": ""} while True: - if self.path_or_buf.read(3) != b'GSO': + if self.path_or_buf.read(3) != b"GSO": break if self.format_version == 117: - v_o = struct.unpack(self.byteorder + 'Q', - self.path_or_buf.read(8))[0] + v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] else: buf = self.path_or_buf.read(12) # Only tested on little endian file on little endian machine. 
- if self.byteorder == '<': + if self.byteorder == "<": buf = buf[0:2] + buf[4:10] else: buf = buf[0:2] + buf[6:] - v_o = struct.unpack('Q', buf)[0] - typ = struct.unpack('B', self.path_or_buf.read(1))[0] - length = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] + v_o = struct.unpack("Q", buf)[0] + typ = struct.unpack("B", self.path_or_buf.read(1))[0] + length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] va = self.path_or_buf.read(length) if typ == 130: va = va[0:-1].decode(self._encoding) @@ -1455,11 +1557,18 @@ def get_chunk(self, size=None): return self.read(nrows=size) @Appender(_read_method_doc) - @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') - def read(self, nrows=None, convert_dates=None, - convert_categoricals=None, index_col=None, - convert_missing=None, preserve_dtypes=None, - columns=None, order_categoricals=None): + @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") + def read( + self, + nrows=None, + convert_dates=None, + convert_categoricals=None, + index_col=None, + convert_missing=None, + preserve_dtypes=None, + columns=None, + order_categoricals=None, + ): # Handle empty file or chunk. If reading incrementally raise # StopIteration. If reading the whole thing return an empty # data frame. @@ -1507,8 +1616,9 @@ def read(self, nrows=None, convert_dates=None, offset = self._lines_read * dtype.itemsize self.path_or_buf.seek(self.data_location + offset) read_lines = min(nrows, self.nobs - self._lines_read) - data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype, - count=read_lines) + data = np.frombuffer( + self.path_or_buf.read(read_len), dtype=dtype, count=read_lines + ) self._lines_read += read_lines if self._lines_read == self.nobs: @@ -1543,8 +1653,7 @@ def read(self, nrows=None, convert_dates=None, # Decode strings for col, typ in zip(data, self.typlist): if type(typ) is int: - data[col] = data[col].apply( - self._decode, convert_dtype=True) + data[col] = data[col].apply(self._decode, convert_dtype=True) data = self._insert_strls(data) @@ -1561,7 +1670,8 @@ def read(self, nrows=None, convert_dates=None, if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( - (col, Series(data[col], ix, self.dtyplist[i]))) + (col, Series(data[col], ix, self.dtyplist[i])) + ) else: data_formatted.append((col, data[col])) if requires_type_conversion: @@ -1571,24 +1681,25 @@ def read(self, nrows=None, convert_dates=None, data = self._do_convert_missing(data, convert_missing) if convert_dates: + def any_startswith(x: str) -> bool: return any(x.startswith(fmt) for fmt in _date_formats) + cols = np.where([any_startswith(x) for x in self.fmtlist])[0] for i in cols: col = data.columns[i] try: data[col] = _stata_elapsed_date_to_datetime_vec( - data[col], - self.fmtlist[i]) + data[col], self.fmtlist[i] + ) except ValueError: self.close() raise if convert_categoricals and self.format_version > 108: - data = self._do_convert_categoricals(data, - self.value_label_dict, - self.lbllist, - order_categoricals) + data = self._do_convert_categoricals( + data, self.value_label_dict, self.lbllist, order_categoricals + ) if not preserve_dtypes: retyped_data = [] @@ -1628,8 +1739,7 @@ def _do_convert_missing(self, data, convert_missing): if convert_missing: # Replacement follows Stata notation missing_loc = np.argwhere(missing._ndarray_values) - umissing, umissing_loc = np.unique(series[missing], - return_inverse=True) + umissing, umissing_loc = 
np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=np.object) for j, um in enumerate(umissing): missing_value = StataMissingValue(um) @@ -1646,16 +1756,15 @@ def _do_convert_missing(self, data, convert_missing): if replacements: columns = data.columns replacements = DataFrame(replacements) - data = concat([data.drop(replacements.columns, 1), - replacements], 1) + data = concat([data.drop(replacements.columns, 1), replacements], 1) data = data[columns] return data def _insert_strls(self, data): - if not hasattr(self, 'GSO') or len(self.GSO) == 0: + if not hasattr(self, "GSO") or len(self.GSO) == 0: return data for i, typ in enumerate(self.typlist): - if typ != 'Q': + if typ != "Q": continue # Wrap v_o in a string to allow uint64 values as keys on 32bit OS data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] @@ -1666,12 +1775,13 @@ def _do_select_columns(self, data, columns): if not self._column_selector_set: column_set = set(columns) if len(column_set) != len(columns): - raise ValueError('columns contains duplicate entries') + raise ValueError("columns contains duplicate entries") unmatched = column_set.difference(data.columns) if unmatched: - raise ValueError('The following columns were not found in the ' - 'Stata data set: ' + - ', '.join(list(unmatched))) + raise ValueError( + "The following columns were not found in the " + "Stata data set: " + ", ".join(list(unmatched)) + ) # Copy information for retained columns for later processing dtyplist = [] typlist = [] @@ -1692,8 +1802,9 @@ def _do_select_columns(self, data, columns): return data[columns] - def _do_convert_categoricals(self, data, value_label_dict, lbllist, - order_categoricals): + def _do_convert_categoricals( + self, data, value_label_dict, lbllist, order_categoricals + ): """ Converts categorical columns to Categorical type. """ @@ -1714,7 +1825,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, except ValueError: vc = Series(categories).value_counts() repeats = list(vc.index[vc > 1]) - repeats = '-' * 80 + '\n' + '\n'.join(repeats) + repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 msg = """ Value labels for column {col} are not unique. These cannot be converted to @@ -1784,7 +1895,7 @@ def _open_file_binary_write(fname): own : bool True if the file was created, otherwise False """ - if hasattr(fname, 'write'): + if hasattr(fname, "write"): # if 'b' not in fname.mode: return fname, False return open(fname, "wb"), True @@ -1796,8 +1907,7 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError( - "Endianness {endian} not understood".format(endian=endianness)) + raise ValueError("Endianness {endian} not understood".format(endian=endianness)) def _pad_bytes(name, length): @@ -1811,12 +1921,25 @@ def _convert_datetime_to_stata_type(fmt): """ Convert from one of the stata date formats to a type in TYPE_MAP. 
""" - if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", - "%tq", "th", "%th", "ty", "%ty"]: + if fmt in [ + "tc", + "%tc", + "td", + "%td", + "tw", + "%tw", + "tm", + "%tm", + "tq", + "%tq", + "th", + "%th", + "ty", + "%ty", + ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError( - "Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1828,8 +1951,7 @@ def _maybe_convert_to_int_keys(convert_dates, varlist): new_dict.update({varlist.index(key): convert_dates[key]}) else: if not isinstance(key, int): - raise ValueError("convert_dates key must be a " - "column or an integer") + raise ValueError("convert_dates key must be a " "column or an integer") new_dict.update({key: convert_dates[key]}) return new_dict @@ -1868,11 +1990,11 @@ def _dtype_to_stata_type(dtype, column): return 251 else: # pragma : no cover raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype)) + "Data type {dtype} not supported.".format(dtype=dtype) + ) -def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, - force_strl=False): +def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): """ Map numpy dtype to stata's default format for this type. Not terribly important since users can change this in Stata. Semantics are @@ -1894,23 +2016,24 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, else: max_str_len = 2045 if force_strl: - return '%9s' + return "%9s" if dtype.type == np.object_: inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ('string', 'unicode') or - len(column) == 0): - raise ValueError('Column `{col}` cannot be exported.\n\nOnly ' - 'string-like object arrays containing all ' - 'strings or a mix of strings and None can be ' - 'exported. Object arrays containing only null ' - 'values are prohibited. Other object types' - 'cannot be exported and must first be converted ' - 'to one of the supported ' - 'types.'.format(col=column.name)) + if not (inferred_dtype in ("string", "unicode") or len(column) == 0): + raise ValueError( + "Column `{col}` cannot be exported.\n\nOnly " + "string-like object arrays containing all " + "strings or a mix of strings and None can be " + "exported. Object arrays containing only null " + "values are prohibited. 
Other object types" + "cannot be exported and must first be converted " + "to one of the supported " + "types.".format(col=column.name) + ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: - return '%9s' + return "%9s" else: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1924,7 +2047,8 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, return "%8.0g" else: # pragma : no cover raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype)) + "Data type {dtype} not supported.".format(dtype=dtype) + ) class StataWriter(StataParser): @@ -1998,14 +2122,23 @@ class StataWriter(StataParser): _max_string_length = 244 - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = 'latin-1' + self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2024,8 +2157,7 @@ def _write(self, to_write): """ Helper to call encode before writing to file for Python 3 compat. """ - self._file.write(to_write.encode(self._encoding or - self._default_encoding)) + self._file.write(to_write.encode(self._encoding or self._default_encoding)) def _prepare_categoricals(self, data): """Check for categorical columns, retain categorical information for @@ -2044,8 +2176,10 @@ def _prepare_categoricals(self, data): self._value_labels.append(StataValueLabel(data[col])) dtype = data[col].cat.codes.dtype if dtype == np.int64: - raise ValueError('It is not possible to export ' - 'int64-based categorical data to Stata.') + raise ValueError( + "It is not possible to export " + "int64-based categorical data to Stata." 
+ ) values = data[col].cat.codes.values.copy() # Upcast if needed so that correct missing values can be set @@ -2073,9 +2207,9 @@ def _replace_nans(self, data): dtype = data[c].dtype if dtype in (np.float32, np.float64): if dtype == np.float32: - replacement = self.MISSING_VALUES['f'] + replacement = self.MISSING_VALUES["f"] else: - replacement = self.MISSING_VALUES['d'] + replacement = self.MISSING_VALUES["d"] data[c] = data[c].fillna(replacement) return data @@ -2108,26 +2242,30 @@ def _check_column_names(self, data): name = str(name) for c in name: - if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and - (c < '0' or c > '9') and c != '_'): - name = name.replace(c, '_') + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") # Variable name must not be a reserved word if name in self.RESERVED_WORDS: - name = '_' + name + name = "_" + name # Variable name may not start with a number - if name[0] >= '0' and name[0] <= '9': - name = '_' + name + if name[0] >= "0" and name[0] <= "9": + name = "_" + name - name = name[:min(len(name), 32)] + name = name[: min(len(name), 32)] if not name == orig_name: # check for duplicates while columns.count(name) > 0: # prepend ascending number to avoid duplicates - name = '_' + str(duplicate_var_id) + name - name = name[:min(len(name), 32)] + name = "_" + str(duplicate_var_id) + name + name = name[: min(len(name), 32)] duplicate_var_id += 1 converted_names[orig_name] = name @@ -2147,13 +2285,13 @@ def _check_column_names(self, data): for orig_name, name in converted_names.items(): # need to possibly encode the orig name if its unicode try: - orig_name = orig_name.encode('utf-8') + orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = '{0} -> {1}'.format(orig_name, name) + msg = "{0} -> {1}".format(orig_name, name) conversion_warning.append(msg) - ws = invalid_name_doc.format('\n '.join(conversion_warning)) + ws = invalid_name_doc.format("\n ".join(conversion_warning)) warnings.warn(ws, InvalidColumnName) self._converted_names = converted_names @@ -2201,14 +2339,13 @@ def _prepare_pandas(self, data): if col in self._convert_dates: continue if is_datetime64_dtype(data[col]): - self._convert_dates[col] = 'tc' + self._convert_dates[col] = "tc" - self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, - self.varlist) + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type( - self._convert_dates[key] - ) + new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) self._set_formats_and_types(data, dtypes) @@ -2221,8 +2358,7 @@ def _prepare_pandas(self, data): def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, - data_label=self._data_label) + self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) self._write_map() self._write_variable_types() self._write_varnames() @@ -2244,9 +2380,12 @@ def write_file(self): if self._own_file: os.unlink(self._fname) except Exception: - warnings.warn('This save was not successful but {0} could not ' - 'be deleted. This file is not ' - 'valid.'.format(self._fname), ResourceWarning) + warnings.warn( + "This save was not successful but {0} could not " + "be deleted. 
This file is not " + "valid.".format(self._fname), + ResourceWarning, + ) raise exc else: self._close() @@ -2290,8 +2429,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, - self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2311,9 +2449,7 @@ def _write_header(self, data_label=None, time_stamp=None): if data_label is None: self._file.write(self._null_terminate(_pad_bytes("", 80))) else: - self._file.write( - self._null_terminate(_pad_bytes(data_label[:80], 80)) - ) + self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm if time_stamp is None: @@ -2322,17 +2458,31 @@ def _write_header(self, data_label=None, time_stamp=None): raise ValueError("time_stamp should be datetime type") # GH #13856 # Avoid locale-specific month conversion - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', - 'Sep', 'Oct', 'Nov', 'Dec'] + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] month_lookup = {i + 1: month for i, month in enumerate(months)} - ts = (time_stamp.strftime("%d ") + - month_lookup[time_stamp.month] + - time_stamp.strftime(" %Y %H:%M")) + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) self._file.write(self._null_terminate(ts)) def _write_variable_types(self): for typ in self.typlist: - self._file.write(struct.pack('B', typ)) + self._file.write(struct.pack("B", typ)) def _write_varnames(self): # varlist names are checked by _check_column_names @@ -2366,7 +2516,7 @@ def _write_value_label_names(self): def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination - blank = _pad_bytes('', 81) + blank = _pad_bytes("", 81) if self._variable_labels is None: for i in range(self.nvar): @@ -2377,13 +2527,16 @@ def _write_variable_labels(self): if col in self._variable_labels: label = self._variable_labels[col] if len(label) > 80: - raise ValueError('Variable labels must be 80 characters ' - 'or fewer') + raise ValueError( + "Variable labels must be 80 characters " "or fewer" + ) is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: - raise ValueError('Variable labels must contain only ' - 'characters that can be encoded in ' - 'Latin-1') + raise ValueError( + "Variable labels must contain only " + "characters that can be encoded in " + "Latin-1" + ) self._write(_pad_bytes(label, 81)) else: self._write(blank) @@ -2400,8 +2553,9 @@ def _prepare_data(self): if self._convert_dates is not None: for i, col in enumerate(data): if i in convert_dates: - data[col] = _datetime_to_stata_elapsed_vec(data[col], - self.fmtlist[i]) + data[col] = _datetime_to_stata_elapsed_vec( + data[col], self.fmtlist[i] + ) # 2. 
Convert strls data = self._convert_strls(data) @@ -2411,8 +2565,8 @@ def _prepare_data(self): for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,)) - stype = 'S{type}'.format(type=typ) + data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + stype = "S{type}".format(type=typ) dtypes[col] = stype data[col] = data[col].str.encode(self._encoding).astype(stype) else: @@ -2428,7 +2582,7 @@ def _write_data(self): self._file.write(data.tobytes()) def _null_terminate(self, s, as_string=False): - null_byte = '\x00' + null_byte = "\x00" s += null_byte if not as_string: @@ -2484,8 +2638,8 @@ def _pad_bytes_new(name, length): Takes a bytes instance and pads it with null bytes until it's length chars. """ if isinstance(name, str): - name = bytes(name, 'utf-8') - return name + b'\x00' * (length - len(name)) + name = bytes(name, "utf-8") + return name + b"\x00" * (length - len(name)) class StataStrLWriter: @@ -2519,23 +2673,23 @@ class StataStrLWriter: def __init__(self, df, columns, version=117, byteorder=None): if version not in (117, 118, 119): - raise ValueError('Only dta versions 117, 118 and 119 supported') + raise ValueError("Only dta versions 117, 118 and 119 supported") self._dta_ver = version self.df = df self.columns = columns - self._gso_table = OrderedDict((('', (0, 0)),)) + self._gso_table = OrderedDict((("", (0, 0)),)) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - gso_v_type = 'I' # uint32 - gso_o_type = 'Q' # uint64 - self._encoding = 'utf-8' + gso_v_type = "I" # uint32 + gso_o_type = "Q" # uint64 + self._encoding = "utf-8" if version == 117: o_size = 4 - gso_o_type = 'I' # 117 used uint32 - self._encoding = 'latin-1' + gso_o_type = "I" # 117 used uint32 + self._encoding = "latin-1" elif version == 118: o_size = 6 else: # version == 119 @@ -2588,7 +2742,7 @@ def generate_table(self): for j, (col, v) in enumerate(col_index): val = row[col] # Allow columns with mixed str and None (GH 23633) - val = '' if val is None else val + val = "" if val is None else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers @@ -2636,12 +2790,12 @@ def generate_blob(self, gso_table): # 3 u4 u8 u1 u4 string + null term bio = BytesIO() - gso = bytes('GSO', 'ascii') - gso_type = struct.pack(self._byteorder + 'B', 130) - null = struct.pack(self._byteorder + 'B', 0) + gso = bytes("GSO", "ascii") + gso_type = struct.pack(self._byteorder + "B", 130) + null = struct.pack(self._byteorder + "B", 0) v_type = self._byteorder + self._gso_v_type o_type = self._byteorder + self._gso_o_type - len_type = self._byteorder + 'I' + len_type = self._byteorder + "I" for strl, vo in gso_table.items(): if vo == (0, 0): continue @@ -2660,7 +2814,7 @@ def generate_blob(self, gso_table): bio.write(gso_type) # llll - utf8_string = bytes(strl, 'utf-8') + utf8_string = bytes(strl, "utf-8") bio.write(struct.pack(len_type, len(utf8_string) + 1)) # xxx...xxx @@ -2748,17 +2902,33 @@ class StataWriter117(StataWriter): _max_string_length = 2045 - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None, convert_strl=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + encoding="latin-1", + 
byteorder=None,
+        time_stamp=None,
+        data_label=None,
+        variable_labels=None,
+        convert_strl=None,
+    ):
         # Shallow copy since convert_strl might be modified later
         self._convert_strl = [] if convert_strl is None else convert_strl[:]
-        super().__init__(fname, data, convert_dates, write_index,
-                         byteorder=byteorder, time_stamp=time_stamp,
-                         data_label=data_label,
-                         variable_labels=variable_labels)
+        super().__init__(
+            fname,
+            data,
+            convert_dates,
+            write_index,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            variable_labels=variable_labels,
+        )
         self._map = None
         self._strl_blob = None

@@ -2766,9 +2936,8 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
     def _tag(val, tag):
         """Surround val with <tag></tag>"""
         if isinstance(val, str):
-            val = bytes(val, 'utf-8')
-        return (bytes('<' + tag + '>', 'utf-8') + val +
-                bytes('</' + tag + '>', 'utf-8'))
+            val = bytes(val, "utf-8")
+        return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")

     def _update_map(self, tag):
         """Update map location for tag with file position"""
@@ -2777,22 +2946,22 @@ def _update_map(self, tag):
     def _write_header(self, data_label=None, time_stamp=None):
         """Write the file header"""
         byteorder = self._byteorder
-        self._file.write(bytes('<stata_dta>', 'utf-8'))
+        self._file.write(bytes("<stata_dta>", "utf-8"))
         bio = BytesIO()
         # ds_format - 117
-        bio.write(self._tag(bytes('117', 'utf-8'), 'release'))
+        bio.write(self._tag(bytes("117", "utf-8"), "release"))
         # byteorder
-        bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", 'byteorder'))
+        bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
         # number of vars, 2 bytes
         assert self.nvar < 2 ** 16
-        bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), 'K'))
+        bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K"))
         # number of obs, 4 bytes
-        bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), 'N'))
+        bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N"))
         # data label 81 bytes, char, null terminated
-        label = data_label[:80] if data_label is not None else ''
+        label = data_label[:80] if data_label is not None else ""
         label_len = struct.pack(byteorder + "B", len(label))
-        label = label_len + bytes(label, 'utf-8')
-        bio.write(self._tag(label, 'label'))
+        label = label_len + bytes(label, "utf-8")
+        bio.write(self._tag(label, "label"))
         # time stamp, 18 bytes, char, null terminated
         # format dd Mon yyyy hh:mm
         if time_stamp is None:
@@ -2800,155 +2969,176 @@ def _write_header(self, data_label=None, time_stamp=None):
         elif not isinstance(time_stamp, datetime.datetime):
             raise ValueError("time_stamp should be datetime type")
         # Avoid locale-specific month conversion
-        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
-                  'Sep', 'Oct', 'Nov', 'Dec']
+        months = [
+            "Jan",
+            "Feb",
+            "Mar",
+            "Apr",
+            "May",
+            "Jun",
+            "Jul",
+            "Aug",
+            "Sep",
+            "Oct",
+            "Nov",
+            "Dec",
+        ]
         month_lookup = {i + 1: month for i, month in enumerate(months)}
-        ts = (time_stamp.strftime("%d ") +
-              month_lookup[time_stamp.month] +
-              time_stamp.strftime(" %Y %H:%M"))
+        ts = (
+            time_stamp.strftime("%d ")
+            + month_lookup[time_stamp.month]
+            + time_stamp.strftime(" %Y %H:%M")
+        )
         # '\x11' added due to inspection of Stata file
-        ts = b'\x11' + bytes(ts, 'utf8')
-        bio.write(self._tag(ts, 'timestamp'))
+        ts = b"\x11" + bytes(ts, "utf8")
+        bio.write(self._tag(ts, "timestamp"))
         bio.seek(0)
-        self._file.write(self._tag(bio.read(), 'header'))
+        self._file.write(self._tag(bio.read(), "header"))

     def _write_map(self):
         """Called twice during file write.
The first populates the values in the map with 0s. The second call writes the final map locations when all blocks have been written.""" if self._map is None: - self._map = OrderedDict((('stata_data', 0), - ('map', self._file.tell()), - ('variable_types', 0), - ('varnames', 0), - ('sortlist', 0), - ('formats', 0), - ('value_label_names', 0), - ('variable_labels', 0), - ('characteristics', 0), - ('data', 0), - ('strls', 0), - ('value_labels', 0), - ('stata_data_close', 0), - ('end-of-file', 0))) + self._map = OrderedDict( + ( + ("stata_data", 0), + ("map", self._file.tell()), + ("variable_types", 0), + ("varnames", 0), + ("sortlist", 0), + ("formats", 0), + ("value_label_names", 0), + ("variable_labels", 0), + ("characteristics", 0), + ("data", 0), + ("strls", 0), + ("value_labels", 0), + ("stata_data_close", 0), + ("end-of-file", 0), + ) + ) # Move to start of map - self._file.seek(self._map['map']) + self._file.seek(self._map["map"]) bio = BytesIO() for val in self._map.values(): - bio.write(struct.pack(self._byteorder + 'Q', val)) + bio.write(struct.pack(self._byteorder + "Q", val)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'map')) + self._file.write(self._tag(bio.read(), "map")) def _write_variable_types(self): - self._update_map('variable_types') + self._update_map("variable_types") bio = BytesIO() for typ in self.typlist: - bio.write(struct.pack(self._byteorder + 'H', typ)) + bio.write(struct.pack(self._byteorder + "H", typ)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'variable_types')) + self._file.write(self._tag(bio.read(), "variable_types")) def _write_varnames(self): - self._update_map('varnames') + self._update_map("varnames") bio = BytesIO() for name in self.varlist: name = self._null_terminate(name, True) name = _pad_bytes_new(name[:32], 33) bio.write(name) bio.seek(0) - self._file.write(self._tag(bio.read(), 'varnames')) + self._file.write(self._tag(bio.read(), "varnames")) def _write_sortlist(self): - self._update_map('sortlist') - self._file.write(self._tag(b'\x00\00' * (self.nvar + 1), 'sortlist')) + self._update_map("sortlist") + self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist")) def _write_formats(self): - self._update_map('formats') + self._update_map("formats") bio = BytesIO() for fmt in self.fmtlist: bio.write(_pad_bytes_new(fmt, 49)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'formats')) + self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): - self._update_map('value_label_names') + self._update_map("value_label_names") bio = BytesIO() for i in range(self.nvar): # Use variable name when categorical - name = '' # default name + name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) name = _pad_bytes_new(name[:32], 33) bio.write(name) bio.seek(0) - self._file.write(self._tag(bio.read(), 'value_label_names')) + self._file.write(self._tag(bio.read(), "value_label_names")) def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination - self._update_map('variable_labels') + self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new('', 81) + blank = _pad_bytes_new("", 81) if self._variable_labels is None: for _ in range(self.nvar): bio.write(blank) bio.seek(0) - self._file.write(self._tag(bio.read(), 'variable_labels')) + self._file.write(self._tag(bio.read(), "variable_labels")) return for col in self.data: if col in self._variable_labels: label = self._variable_labels[col] if 
len(label) > 80:
-                    raise ValueError('Variable labels must be 80 characters '
-                                     'or fewer')
+                    raise ValueError(
+                        "Variable labels must be 80 characters " "or fewer"
+                    )
                 is_latin1 = all(ord(c) < 256 for c in label)
                 if not is_latin1:
-                    raise ValueError('Variable labels must contain only '
-                                     'characters that can be encoded in '
-                                     'Latin-1')
+                    raise ValueError(
+                        "Variable labels must contain only "
+                        "characters that can be encoded in "
+                        "Latin-1"
+                    )
                 bio.write(_pad_bytes_new(label, 81))
             else:
                 bio.write(blank)
         bio.seek(0)
-        self._file.write(self._tag(bio.read(), 'variable_labels'))
+        self._file.write(self._tag(bio.read(), "variable_labels"))

     def _write_characteristics(self):
-        self._update_map('characteristics')
-        self._file.write(self._tag(b'', 'characteristics'))
+        self._update_map("characteristics")
+        self._file.write(self._tag(b"", "characteristics"))

     def _write_data(self):
-        self._update_map('data')
+        self._update_map("data")
         data = self.data
-        self._file.write(b'<data>')
+        self._file.write(b"<data>")
         self._file.write(data.tobytes())
-        self._file.write(b'</data>')
+        self._file.write(b"</data>")

     def _write_strls(self):
-        self._update_map('strls')
-        strls = b''
+        self._update_map("strls")
+        strls = b""
         if self._strl_blob is not None:
             strls = self._strl_blob
-        self._file.write(self._tag(strls, 'strls'))
+        self._file.write(self._tag(strls, "strls"))

     def _write_expansion_fields(self):
         """No-op in dta 117+"""
         pass

     def _write_value_labels(self):
-        self._update_map('value_labels')
+        self._update_map("value_labels")
         bio = BytesIO()
         for vl in self._value_labels:
             lab = vl.generate_value_label(self._byteorder, self._encoding)
-            lab = self._tag(lab, 'lbl')
+            lab = self._tag(lab, "lbl")
             bio.write(lab)
         bio.seek(0)
-        self._file.write(self._tag(bio.read(), 'value_labels'))
+        self._file.write(self._tag(bio.read(), "value_labels"))

     def _write_file_close_tag(self):
-        self._update_map('stata_data_close')
-        self._file.write(bytes('</stata_dta>', 'utf-8'))
-        self._update_map('end-of-file')
+        self._update_map("stata_data_close")
+        self._file.write(bytes("</stata_dta>", "utf-8"))
+        self._update_map("end-of-file")

     def _update_strl_names(self):
         """Update column names for conversion to strl if they might have been
@@ -2963,8 +3153,10 @@ def _convert_strls(self, data):
         """Convert columns to StrLs if either very large or in the
         convert_strl variable"""
         convert_cols = [
-            col for i, col in enumerate(data)
-            if self.typlist[i] == 32768 or col in self._convert_strl]
+            col
+            for i, col in enumerate(data)
+            if self.typlist[i] == 32768 or col in self._convert_strl
+        ]

         if convert_cols:
             ssw = StataStrLWriter(data, convert_cols)
@@ -2978,9 +3170,8 @@ def _set_formats_and_types(self, data, dtypes):
         self.fmtlist = []
         for col, dtype in dtypes.iteritems():
             force_strl = col in self._convert_strl
-            fmt = _dtype_to_default_stata_fmt(dtype, data[col],
-                                              dta_version=117,
-                                              force_strl=force_strl)
+            fmt = _dtype_to_default_stata_fmt(
+                dtype, data[col], dta_version=117, force_strl=force_strl
+            )
             self.fmtlist.append(fmt)
-            self.typlist.append(_dtype_to_stata_type_117(dtype, data[col],
-                                                         force_strl))
+            self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl))
diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py
index 57a45f0f18d902..ebe047c58b889e 100644
--- a/pandas/plotting/__init__.py
+++ b/pandas/plotting/__init__.py
@@ -57,17 +57,43 @@ https://github.com/pandas-dev/pandas/issues/26747.
""" from pandas.plotting._core import ( - PlotAccessor, boxplot, boxplot_frame, boxplot_frame_groupby, hist_frame, - hist_series) + PlotAccessor, + boxplot, + boxplot_frame, + boxplot_frame_groupby, + hist_frame, + hist_series, +) from pandas.plotting._misc import ( - andrews_curves, autocorrelation_plot, bootstrap_plot, - deregister as deregister_matplotlib_converters, lag_plot, - parallel_coordinates, plot_params, radviz, - register as register_matplotlib_converters, scatter_matrix, table) + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + deregister as deregister_matplotlib_converters, + lag_plot, + parallel_coordinates, + plot_params, + radviz, + register as register_matplotlib_converters, + scatter_matrix, + table, +) -__all__ = ['PlotAccessor', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', - 'hist_frame', 'hist_series', 'scatter_matrix', 'radviz', - 'andrews_curves', 'bootstrap_plot', 'parallel_coordinates', - 'lag_plot', 'autocorrelation_plot', 'table', 'plot_params', - 'register_matplotlib_converters', - 'deregister_matplotlib_converters'] +__all__ = [ + "PlotAccessor", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "hist_frame", + "hist_series", + "scatter_matrix", + "radviz", + "andrews_curves", + "bootstrap_plot", + "parallel_coordinates", + "lag_plot", + "autocorrelation_plot", + "table", + "plot_params", + "register_matplotlib_converters", + "deregister_matplotlib_converters", +] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 2f46df29857039..5e67d9a5879145 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -19,9 +19,19 @@ pass -def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, - bins=10, **kwds): +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds +): """ Draw histogram of the input series using matplotlib. @@ -61,15 +71,38 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. """ plot_backend = _get_plot_backend() - return plot_backend.hist_series(self, by=by, ax=ax, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - figsize=figsize, bins=bins, **kwds) - - -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): + return plot_backend.hist_series( + self, + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + **kwds + ) + + +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds +): """ Make a histogram of the DataFrame's. 
@@ -148,17 +181,38 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend() - return plot_backend.hist_frame(data, column=column, by=by, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - ax=ax, sharex=sharex, sharey=sharey, - figsize=figsize, layout=layout, bins=bins, - **kwds) - - -def boxplot(data, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): + return plot_backend.hist_frame( + data, + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + bins=bins, + **kwds + ) + + +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): """ Make a box plot from DataFrame columns. @@ -322,26 +376,65 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, """ plot_backend = _get_plot_backend() - return plot_backend.boxplot(data, column=column, by=by, ax=ax, - fontsize=fontsize, rot=rot, grid=grid, - figsize=figsize, layout=layout, - return_type=return_type, **kwds) + return plot_backend.boxplot( + data, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) @Appender(boxplot.__doc__) -def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, - return_type=None, **kwds): +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): plot_backend = _get_plot_backend() - return plot_backend.boxplot_frame(self, column=column, by=by, ax=ax, - fontsize=fontsize, rot=rot, grid=grid, - figsize=figsize, layout=layout, - return_type=return_type, **kwds) - - -def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, - rot=0, grid=True, ax=None, figsize=None, - layout=None, sharex=False, sharey=True, **kwds): + return plot_backend.boxplot_frame( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) + + +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + **kwds +): """ Make box plots from DataFrameGroupBy data. @@ -393,9 +486,19 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, """ plot_backend = _get_plot_backend() return plot_backend.boxplot_frame_groupby( - grouped, subplots=subplots, column=column, fontsize=fontsize, rot=rot, - grid=grid, ax=ax, figsize=figsize, layout=layout, sharex=sharex, - sharey=sharey, **kwds) + grouped, + subplots=subplots, + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + sharex=sharex, + sharey=sharey, + **kwds + ) class PlotAccessor(PandasObject): @@ -500,11 +603,11 @@ class PlotAccessor(PandasObject): From 0 (left/bottom-end) to 1 (right/top-end). 
Default is 0.5 (center) """ - _common_kinds = ('line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', - 'box') - _series_kinds = ('pie',) - _dataframe_kinds = ('scatter', 'hexbin') - _kind_aliases = {'density': 'kde'} + + _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") + _series_kinds = ("pie",) + _dataframe_kinds = ("scatter", "hexbin") + _kind_aliases = {"density": "kde"} _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds def __init__(self, data): @@ -521,63 +624,106 @@ def _get_call_args(backend_name, data, args, kwargs): """ if isinstance(data, ABCSeries): arg_def = [ - ('kind', 'line'), ('ax', None), ('figsize', None), - ('use_index', True), ('title', None), ('grid', None), - ('legend', False), ('style', None), ('logx', False), - ('logy', False), ('loglog', False), ('xticks', None), - ('yticks', None), ('xlim', None), ('ylim', None), - ('rot', None), ('fontsize', None), ('colormap', None), - ('table', False), ('yerr', None), ('xerr', None), - ('label', None), ('secondary_y', False)] + ("kind", "line"), + ("ax", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", False), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("label", None), + ("secondary_y", False), + ] elif isinstance(data, ABCDataFrame): arg_def = [ - ('x', None), ('y', None), ('kind', 'line'), ('ax', None), - ('subplots', False), ('sharex', None), ('sharey', False), - ('layout', None), ('figsize', None), ('use_index', True), - ('title', None), ('grid', None), ('legend', True), - ('style', None), ('logx', False), ('logy', False), - ('loglog', False), ('xticks', None), ('yticks', None), - ('xlim', None), ('ylim', None), ('rot', None), - ('fontsize', None), ('colormap', None), ('table', False), - ('yerr', None), ('xerr', None), ('secondary_y', False), - ('sort_columns', False)] + ("x", None), + ("y", None), + ("kind", "line"), + ("ax", None), + ("subplots", False), + ("sharex", None), + ("sharey", False), + ("layout", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", True), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("secondary_y", False), + ("sort_columns", False), + ] else: - raise TypeError(('Called plot accessor for type {}, expected ' - 'Series or DataFrame').format( - type(data).__name__)) + raise TypeError( + ( + "Called plot accessor for type {}, expected " "Series or DataFrame" + ).format(type(data).__name__) + ) if args and isinstance(data, ABCSeries): - msg = ('`Series.plot()` should not be called with positional ' - 'arguments, only keyword arguments. The order of ' - 'positional arguments will change in the future. ' - 'Use `Series.plot({})` instead of `Series.plot({})`.') + msg = ( + "`Series.plot()` should not be called with positional " + "arguments, only keyword arguments. The order of " + "positional arguments will change in the future. " + "Use `Series.plot({})` instead of `Series.plot({})`." 
+ ) positional_args = str(args)[1:-1] - keyword_args = ', '.join('{}={!r}'.format(name, value) - for (name, default), value - in zip(arg_def, args)) - warnings.warn(msg.format(keyword_args, positional_args), - FutureWarning, stacklevel=3) + keyword_args = ", ".join( + "{}={!r}".format(name, value) + for (name, default), value in zip(arg_def, args) + ) + warnings.warn( + msg.format(keyword_args, positional_args), FutureWarning, stacklevel=3 + ) pos_args = {name: value for value, (name, _) in zip(args, arg_def)} - if backend_name == 'pandas.plotting._matplotlib': + if backend_name == "pandas.plotting._matplotlib": kwargs = dict(arg_def, **pos_args, **kwargs) else: kwargs = dict(pos_args, **kwargs) - x = kwargs.pop('x', None) - y = kwargs.pop('y', None) - kind = kwargs.pop('kind', 'line') + x = kwargs.pop("x", None) + y = kwargs.pop("y", None) + kind = kwargs.pop("kind", "line") return x, y, kind, kwargs def __call__(self, *args, **kwargs): plot_backend = _get_plot_backend() - x, y, kind, kwargs = self._get_call_args(plot_backend.__name__, - self._parent, args, kwargs) + x, y, kind, kwargs = self._get_call_args( + plot_backend.__name__, self._parent, args, kwargs + ) kind = self._kind_aliases.get(kind, kind) if kind not in self._all_kinds: - raise ValueError('{} is not a valid plot kind'.format(kind)) + raise ValueError("{} is not a valid plot kind".format(kind)) # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -585,22 +731,22 @@ def __call__(self, *args, **kwargs): data = self._parent.copy() if isinstance(data, pandas.core.dtypes.generic.ABCSeries): - kwargs['reuse_plot'] = True + kwargs["reuse_plot"] = True if kind in self._dataframe_kinds: if isinstance(data, ABCDataFrame): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: - raise ValueError(("plot kind {} can only be used for " - "data frames").format(kind)) + raise ValueError( + ("plot kind {} can only be used for " "data frames").format(kind) + ) elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): - if y is None and kwargs.get('subplots') is False: + if y is None and kwargs.get("subplots") is False: msg = "{} requires either y column or 'subplots=True'" raise ValueError(msg.format(kind)) elif y is not None: - if (is_integer(y) - and not data.columns.holds_integer()): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] # converted to series actually. 
copy to not modify data = data[y].copy() @@ -620,11 +766,11 @@ def __call__(self, *args, **kwargs): if int_y_arg and not data.columns.holds_integer(): y = data_cols[y] - label_kw = kwargs['label'] if 'label' in kwargs else False - for kw in ['xerr', 'yerr']: - if (kw in kwargs and - (isinstance(kwargs[kw], str) - or is_integer(kwargs[kw]))): + label_kw = kwargs["label"] if "label" in kwargs else False + for kw in ["xerr", "yerr"]: + if kw in kwargs and ( + isinstance(kwargs[kw], str) or is_integer(kwargs[kw]) + ): try: kwargs[kw] = data[kwargs[kw]] except (IndexError, KeyError, TypeError): @@ -640,7 +786,8 @@ def __call__(self, *args, **kwargs): match = is_list_like(label_kw) and len(label_kw) == len(y) if label_kw and not match: raise ValueError( - "label should be list-like and same length as y") + "label should be list-like and same length as y" + ) label_name = label_kw or data.columns data.columns = label_name @@ -713,7 +860,7 @@ def line(self, x=None, y=None, **kwargs): >>> lines = df.plot.line(x='pig', y='horse') """ - return self(kind='line', x=x, y=y, **kwargs) + return self(kind="line", x=x, y=y, **kwargs) def bar(self, x=None, y=None, **kwargs): """ @@ -798,7 +945,7 @@ def bar(self, x=None, y=None, **kwargs): >>> ax = df.plot.bar(x='lifespan', rot=0) """ - return self(kind='bar', x=x, y=y, **kwargs) + return self(kind="bar", x=x, y=y, **kwargs) def barh(self, x=None, y=None, **kwargs): """ @@ -878,7 +1025,7 @@ def barh(self, x=None, y=None, **kwargs): ... 'lifespan': lifespan}, index=index) >>> ax = df.plot.barh(x='lifespan') """ - return self(kind='barh', x=x, y=y, **kwargs) + return self(kind="barh", x=x, y=y, **kwargs) def box(self, by=None, **kwargs): r""" @@ -928,7 +1075,7 @@ def box(self, by=None, **kwargs): >>> df = pd.DataFrame(data, columns=list('ABCD')) >>> ax = df.plot.box() """ - return self(kind='box', by=by, **kwargs) + return self(kind="box", by=by, **kwargs) def hist(self, by=None, bins=10, **kwargs): """ @@ -975,7 +1122,7 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind='hist', by=by, bins=bins, **kwargs) + return self(kind="hist", by=by, bins=bins, **kwargs) def kde(self, bw_method=None, ind=None, **kwargs): """ @@ -1083,7 +1230,7 @@ def kde(self, bw_method=None, ind=None, **kwargs): >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ - return self(kind='kde', bw_method=bw_method, ind=ind, **kwargs) + return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) density = kde @@ -1158,7 +1305,7 @@ def area(self, x=None, y=None, **kwargs): ... }) >>> ax = df.plot.area(x='day') """ - return self(kind='area', x=x, y=y, **kwargs) + return self(kind="area", x=x, y=y, **kwargs) def pie(self, **kwargs): """ @@ -1207,11 +1354,13 @@ def pie(self, **kwargs): >>> plot = df.plot.pie(subplots=True, figsize=(6, 3)) """ - if (isinstance(self._parent, ABCDataFrame) - and kwargs.get('y', None) is None - and not kwargs.get('subplots', False)): + if ( + isinstance(self._parent, ABCDataFrame) + and kwargs.get("y", None) is None + and not kwargs.get("subplots", False) + ): raise ValueError("pie requires either y column or 'subplots=True'") - return self(kind='pie', **kwargs) + return self(kind="pie", **kwargs) def scatter(self, x, y, s=None, c=None, **kwargs): """ @@ -1292,10 +1441,9 @@ def scatter(self, x, y, s=None, c=None, **kwargs): ... c='species', ... 
colormap='viridis') """ - return self(kind='scatter', x=x, y=y, s=s, c=c, **kwargs) + return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) - def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, - **kwargs): + def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs): """ Generate a hexagonal binning plot. @@ -1378,11 +1526,11 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, ... cmap="viridis") """ if reduce_C_function is not None: - kwargs['reduce_C_function'] = reduce_C_function + kwargs["reduce_C_function"] = reduce_C_function if gridsize is not None: - kwargs['gridsize'] = gridsize + kwargs["gridsize"] = gridsize - return self(kind='hexbin', x=x, y=y, C=C, **kwargs) + return self(kind="hexbin", x=x, y=y, C=C, **kwargs) def _get_plot_backend(): @@ -1398,7 +1546,7 @@ def _get_plot_backend(): The backend is imported lazily, as matplotlib is a soft dependency, and pandas can be used without it being installed. """ - backend_str = pandas.get_option('plotting.backend') - if backend_str == 'matplotlib': - backend_str = 'pandas.plotting._matplotlib' + backend_str = pandas.get_option("plotting.backend") + if backend_str == "matplotlib": + backend_str = "pandas.plotting._matplotlib" return importlib.import_module(backend_str) diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 8eac6897add0e7..d3b7a34b6c9230 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,28 +1,46 @@ from pandas._config import get_option from pandas.plotting._matplotlib.boxplot import ( - BoxPlot, boxplot, boxplot_frame, boxplot_frame_groupby) + BoxPlot, + boxplot, + boxplot_frame, + boxplot_frame_groupby, +) from pandas.plotting._matplotlib.converter import deregister, register from pandas.plotting._matplotlib.core import ( - AreaPlot, BarhPlot, BarPlot, HexBinPlot, LinePlot, PiePlot, ScatterPlot) -from pandas.plotting._matplotlib.hist import ( - HistPlot, KdePlot, hist_frame, hist_series) + AreaPlot, + BarhPlot, + BarPlot, + HexBinPlot, + LinePlot, + PiePlot, + ScatterPlot, +) +from pandas.plotting._matplotlib.hist import HistPlot, KdePlot, hist_frame, hist_series from pandas.plotting._matplotlib.misc import ( - andrews_curves, autocorrelation_plot, bootstrap_plot, lag_plot, - parallel_coordinates, radviz, scatter_matrix) + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + lag_plot, + parallel_coordinates, + radviz, + scatter_matrix, +) from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table -PLOT_CLASSES = {'line': LinePlot, - 'bar': BarPlot, - 'barh': BarhPlot, - 'box': BoxPlot, - 'hist': HistPlot, - 'kde': KdePlot, - 'area': AreaPlot, - 'pie': PiePlot, - 'scatter': ScatterPlot, - 'hexbin': HexBinPlot} +PLOT_CLASSES = { + "line": LinePlot, + "bar": BarPlot, + "barh": BarhPlot, + "box": BoxPlot, + "hist": HistPlot, + "kde": KdePlot, + "area": AreaPlot, + "pie": PiePlot, + "scatter": ScatterPlot, + "hexbin": HexBinPlot, +} if get_option("plotting.matplotlib.register_converters"): register(explicit=False) @@ -33,20 +51,35 @@ def plot(data, kind, **kwargs): # registered) causes problems in matplotlib 2 (converters seem to not # work) import matplotlib.pyplot as plt - if kwargs.pop('reuse_plot', False): - ax = kwargs.get('ax') + + if kwargs.pop("reuse_plot", False): + ax = kwargs.get("ax") if ax is None and len(plt.get_fignums()) > 0: with plt.rc_context(): ax = plt.gca() - kwargs['ax'] = 
getattr(ax, 'left_ax', ax) + kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() plot_obj.draw() return plot_obj.result -__all__ = ['plot', 'hist_series', 'hist_frame', 'boxplot', 'boxplot_frame', - 'boxplot_frame_groupby', 'tsplot', 'table', 'andrews_curves', - 'autocorrelation_plot', 'bootstrap_plot', 'lag_plot', - 'parallel_coordinates', 'radviz', 'scatter_matrix', 'register', - 'deregister'] +__all__ = [ + "plot", + "hist_series", + "hist_frame", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "tsplot", + "table", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", + "lag_plot", + "parallel_coordinates", + "radviz", + "scatter_matrix", + "register", + "deregister", +] diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index f8bc531e3c344d..8ff7441df53548 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -17,18 +17,17 @@ class BoxPlot(LinePlot): - _kind = 'box' - _layout_type = 'horizontal' + _kind = "box" + _layout_type = "horizontal" - _valid_return_types = (None, 'axes', 'dict', 'both') + _valid_return_types = (None, "axes", "dict", "both") # namedtuple to hold results - BP = namedtuple("Boxplot", ['ax', 'lines']) + BP = namedtuple("Boxplot", ["ax", "lines"]) - def __init__(self, data, return_type='axes', **kwargs): + def __init__(self, data, return_type="axes", **kwargs): # Do not call LinePlot.__init__ which may fill nan if return_type not in self._valid_return_types: - raise ValueError( - "return_type must be {None, 'axes', 'dict', 'both'}") + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") self.return_type = return_type MPLPlot.__init__(self, data, **kwargs) @@ -37,13 +36,13 @@ def _args_adjust(self): if self.subplots: # Disable label ax sharing. Otherwise, all subplots shows last # column label - if self.orientation == 'vertical': + if self.orientation == "vertical": self.sharex = False else: self.sharey = False @classmethod - def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): + def _plot(cls, ax, y, column_num=None, return_type="axes", **kwds): if y.ndim == 2: y = [remove_na_arraylike(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN @@ -54,52 +53,53 @@ def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): y = remove_na_arraylike(y) bp = ax.boxplot(y, **kwds) - if return_type == 'dict': + if return_type == "dict": return bp, bp - elif return_type == 'both': + elif return_type == "both": return cls.BP(ax=ax, lines=bp), bp else: return ax, bp def _validate_color_args(self): - if 'color' in self.kwds: + if "color" in self.kwds: if self.colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") - self.color = self.kwds.pop('color') + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. 
Using 'color'" + ) + self.color = self.kwds.pop("color") if isinstance(self.color, dict): - valid_keys = ['boxes', 'whiskers', 'medians', 'caps'] + valid_keys = ["boxes", "whiskers", "medians", "caps"] for key, values in self.color.items(): if key not in valid_keys: - raise ValueError("color dict contains invalid " - "key '{0}' " - "The key must be either {1}" - .format(key, valid_keys)) + raise ValueError( + "color dict contains invalid " + "key '{0}' " + "The key must be either {1}".format(key, valid_keys) + ) else: self.color = None # get standard colors for default - colors = _get_standard_colors(num_colors=3, - colormap=self.colormap, - color=None) + colors = _get_standard_colors(num_colors=3, colormap=self.colormap, color=None) # use 2 colors by default, for box/whisker and median # flier colors isn't needed here # because it can be specified by ``sym`` kw self._boxes_c = colors[0] self._whiskers_c = colors[0] self._medians_c = colors[2] - self._caps_c = 'k' # mpl default + self._caps_c = "k" # mpl default - def _get_colors(self, num_colors=None, color_kwds='color'): + def _get_colors(self, num_colors=None, color_kwds="color"): pass def maybe_color_bp(self, bp): if isinstance(self.color, dict): - boxes = self.color.get('boxes', self._boxes_c) - whiskers = self.color.get('whiskers', self._whiskers_c) - medians = self.color.get('medians', self._medians_c) - caps = self.color.get('caps', self._caps_c) + boxes = self.color.get("boxes", self._boxes_c) + whiskers = self.color.get("whiskers", self._whiskers_c) + medians = self.color.get("medians", self._medians_c) + caps = self.color.get("caps", self._caps_c) else: # Other types are forwarded to matplotlib # If None, use default colors @@ -108,10 +108,10 @@ def maybe_color_bp(self, bp): medians = self.color or self._medians_c caps = self.color or self._caps_c - setp(bp['boxes'], color=boxes, alpha=1) - setp(bp['whiskers'], color=whiskers, alpha=1) - setp(bp['medians'], color=medians, alpha=1) - setp(bp['caps'], color=caps, alpha=1) + setp(bp["boxes"], color=boxes, alpha=1) + setp(bp["whiskers"], color=whiskers, alpha=1) + setp(bp["medians"], color=medians, alpha=1) + setp(bp["caps"], color=caps, alpha=1) def _make_plot(self): if self.subplots: @@ -121,8 +121,9 @@ def _make_plot(self): ax = self._get_ax(i) kwds = self.kwds.copy() - ret, bp = self._plot(ax, y, column_num=i, - return_type=self.return_type, **kwds) + ret, bp = self._plot( + ax, y, column_num=i, return_type=self.return_type, **kwds + ) self.maybe_color_bp(bp) self._return_obj[label] = ret @@ -133,8 +134,9 @@ def _make_plot(self): ax = self._get_ax(0) kwds = self.kwds.copy() - ret, bp = self._plot(ax, y, column_num=0, - return_type=self.return_type, **kwds) + ret, bp = self._plot( + ax, y, column_num=0, return_type=self.return_type, **kwds + ) self.maybe_color_bp(bp) self._return_obj = ret @@ -145,7 +147,7 @@ def _make_plot(self): self._set_ticklabels(ax, labels) def _set_ticklabels(self, ax, labels): - if self.orientation == 'vertical': + if self.orientation == "vertical": ax.set_xticklabels(labels) else: ax.set_yticklabels(labels) @@ -158,10 +160,10 @@ def _post_plot_logic(self, ax, data): @property def orientation(self): - if self.kwds.get('vert', True): - return 'vertical' + if self.kwds.get("vert", True): + return "vertical" else: - return 'horizontal' + return "horizontal" @property def result(self): @@ -171,18 +173,28 @@ def result(self): return self._return_obj -def _grouped_plot_by_column(plotf, data, columns=None, by=None, - numeric_only=True, grid=False, - 
figsize=None, ax=None, layout=None, - return_type=None, **kwargs): +def _grouped_plot_by_column( + plotf, + data, + columns=None, + by=None, + numeric_only=True, + grid=False, + figsize=None, + ax=None, + layout=None, + return_type=None, + **kwargs +): grouped = data.groupby(by) if columns is None: if not isinstance(by, (list, tuple)): by = [by] columns = data._get_numeric_data().columns.difference(by) naxes = len(columns) - fig, axes = _subplots(naxes=naxes, sharex=True, sharey=True, - figsize=figsize, ax=ax, layout=layout) + fig, axes = _subplots( + naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout + ) _axes = _flatten(axes) @@ -205,52 +217,63 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, result = axes byline = by[0] if len(by) == 1 else by - fig.suptitle('Boxplot grouped by {byline}'.format(byline=byline)) + fig.suptitle("Boxplot grouped by {byline}".format(byline=byline)) fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return result -def boxplot(data, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): import matplotlib.pyplot as plt + # validate return_type: if return_type not in BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") if isinstance(data, ABCSeries): - data = data.to_frame('x') - column = 'x' + data = data.to_frame("x") + column = "x" def _get_colors(): # num_colors=3 is required as method maybe_color_bp takes the colors # in positions 0 and 2. - return _get_standard_colors(color=kwds.get('color'), num_colors=3) + return _get_standard_colors(color=kwds.get("color"), num_colors=3) def maybe_color_bp(bp): - if 'color' not in kwds: - setp(bp['boxes'], color=colors[0], alpha=1) - setp(bp['whiskers'], color=colors[0], alpha=1) - setp(bp['medians'], color=colors[2], alpha=1) + if "color" not in kwds: + setp(bp["boxes"], color=colors[0], alpha=1) + setp(bp["whiskers"], color=colors[0], alpha=1) + setp(bp["medians"], color=colors[2], alpha=1) def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] values = [np.asarray(remove_na_arraylike(v)) for v in values] bp = ax.boxplot(values, **kwds) if fontsize is not None: - ax.tick_params(axis='both', labelsize=fontsize) - if kwds.get('vert', 1): + ax.tick_params(axis="both", labelsize=fontsize) + if kwds.get("vert", 1): ax.set_xticklabels(keys, rotation=rot) else: ax.set_yticklabels(keys, rotation=rot) maybe_color_bp(bp) # Return axes in multiplot case, maybe revisit later # 985 - if return_type == 'dict': + if return_type == "dict": return bp - elif return_type == 'both': + elif return_type == "both": return BoxPlot.BP(ax=ax, lines=bp) else: return ax @@ -267,19 +290,27 @@ def plot_group(keys, values, ax): if by is not None: # Prefer array return type for 2-D plots to match the subplot layout # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580 - result = _grouped_plot_by_column(plot_group, data, columns=columns, - by=by, grid=grid, figsize=figsize, - ax=ax, layout=layout, - return_type=return_type) + result = _grouped_plot_by_column( + plot_group, + data, + columns=columns, + by=by, + grid=grid, + figsize=figsize, + ax=ax, + layout=layout, + return_type=return_type, + ) else: if return_type is None: - return_type = 'axes' + return_type = "axes" if layout is 
not None: - raise ValueError("The 'layout' keyword is not supported when " - "'by' is None") + raise ValueError( + "The 'layout' keyword is not supported when " "'by' is None" + ) if ax is None: - rc = {'figure.figsize': figsize} if figsize is not None else {} + rc = {"figure.figsize": figsize} if figsize is not None else {} with plt.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() @@ -294,37 +325,75 @@ def plot_group(keys, values, ax): return result -def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, - return_type=None, **kwds): +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots - ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, - grid=grid, rot=rot, figsize=figsize, layout=layout, - return_type=return_type, **kwds) + ax = boxplot( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + grid=grid, + rot=rot, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) plt.draw_if_interactive() return ax -def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, - rot=0, grid=True, ax=None, figsize=None, - layout=None, sharex=False, sharey=True, **kwds): +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + **kwds +): converter._WARN = False # no warning for pandas plots if subplots is True: naxes = len(grouped) - fig, axes = _subplots(naxes=naxes, squeeze=False, - ax=ax, sharex=sharex, sharey=sharey, - figsize=figsize, layout=layout) + fig, axes = _subplots( + naxes=naxes, + squeeze=False, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) axes = _flatten(axes) ret = pd.Series() for (key, group), ax in zip(grouped, axes): - d = group.boxplot(ax=ax, column=column, fontsize=fontsize, - rot=rot, grid=grid, **kwds) + d = group.boxplot( + ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds + ) ax.set_title(pprint_thing(key)) ret.loc[key] = d - fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, - right=0.9, wspace=0.2) + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: keys, frames = zip(*grouped) if grouped.axis == 0: @@ -334,7 +403,14 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, df = frames[0].join(frames[1::]) else: df = frames[0] - ret = df.boxplot(column=column, fontsize=fontsize, rot=rot, - grid=grid, ax=ax, figsize=figsize, - layout=layout, **kwds) + ret = df.boxplot( + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + **kwds + ) return ret diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index 36bbe0f4ec1746..e7855068334f7a 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -9,12 +9,14 @@ def inner(): import matplotlib as mpl except ImportError: return False - return (op(LooseVersion(mpl.__version__), LooseVersion(version)) and - str(mpl.__version__)[0] != '0') + return ( + op(LooseVersion(mpl.__version__), LooseVersion(version)) + and str(mpl.__version__)[0] != "0" + ) return inner -_mpl_ge_2_2_3 = _mpl_version('2.2.3', operator.ge) -_mpl_ge_3_0_0 = 
_mpl_version('3.0.0', operator.ge) -_mpl_ge_3_1_0 = _mpl_version('3.1.0', operator.ge) +_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) +_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) +_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 30ef7a64dec4ae..b20dd3212c7cb0 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,8 +14,13 @@ from pandas._libs.tslibs.frequencies import FreqGroup, get_freq from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, is_float, is_float_dtype, is_integer, - is_integer_dtype, is_nested_list_like) + is_datetime64_ns_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_nested_list_like, +) from pandas.core.dtypes.generic import ABCSeries import pandas.core.common as com @@ -25,9 +30,9 @@ import pandas.core.tools.datetimes as tools # constants -HOURS_PER_DAY = 24. -MIN_PER_HOUR = 60. -SEC_PER_MIN = 60. +HOURS_PER_DAY = 24.0 +MIN_PER_HOUR = 60.0 +SEC_PER_MIN = 60.0 SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY @@ -75,8 +80,7 @@ def deregister(): # restore the old keys for unit, formatter in _mpl_units.items(): - if type(formatter) not in {DatetimeConverter, PeriodConverter, - TimeConverter}: + if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}: # make it idempotent by excluding ours. units.registry[unit] = formatter @@ -85,21 +89,22 @@ def _check_implicitly_registered(): global _WARN if _WARN: - msg = ("Using an implicitly registered datetime converter for a " - "matplotlib plotting method. The converter was registered " - "by pandas on import. Future versions of pandas will require " - "you to explicitly register matplotlib converters.\n\n" - "To register the converters:\n\t" - ">>> from pandas.plotting import register_matplotlib_converters" - "\n\t" - ">>> register_matplotlib_converters()") + msg = ( + "Using an implicitly registered datetime converter for a " + "matplotlib plotting method. The converter was registered " + "by pandas on import. 
Future versions of pandas will require " + "you to explicitly register matplotlib converters.\n\n" + "To register the converters:\n\t" + ">>> from pandas.plotting import register_matplotlib_converters" + "\n\t" + ">>> register_matplotlib_converters()" + ) warnings.warn(msg, FutureWarning) _WARN = False def _to_ordinalf(tm): - tot_sec = (tm.hour * 3600 + tm.minute * 60 + tm.second + - float(tm.microsecond / 1e6)) + tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + float(tm.microsecond / 1e6) return tot_sec @@ -107,7 +112,7 @@ def time2num(d): if isinstance(d, str): parsed = tools.to_datetime(d) if not isinstance(parsed, datetime): - raise ValueError('Could not parse time {d}'.format(d=d)) + raise ValueError("Could not parse time {d}".format(d=d)) return _to_ordinalf(parsed.time()) if isinstance(d, pydt.time): return _to_ordinalf(d) @@ -115,12 +120,10 @@ def time2num(d): class TimeConverter(units.ConversionInterface): - @staticmethod def convert(value, unit, axis): valid_types = (str, pydt.time) - if (isinstance(value, valid_types) or is_integer(value) or - is_float(value)): + if isinstance(value, valid_types) or is_integer(value) or is_float(value): return time2num(value) if isinstance(value, Index): return value.map(time2num) @@ -130,21 +133,20 @@ def convert(value, unit, axis): @staticmethod def axisinfo(unit, axis): - if unit != 'time': + if unit != "time": return None majloc = AutoLocator() majfmt = TimeFormatter(majloc) - return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='time') + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @staticmethod def default_units(x, axis): - return 'time' + return "time" # time formatter class TimeFormatter(Formatter): - def __init__(self, locs): self.locs = locs @@ -166,7 +168,7 @@ def __call__(self, x, pos=0): A string in HH:MM:SS.mmmuuu format. Microseconds, milliseconds and seconds are only displayed if non-zero. 
""" - fmt = '%H:%M:%S.%f' + fmt = "%H:%M:%S.%f" s = int(x) msus = int(round((x - s) * 1e6)) ms = msus // 1000 @@ -179,39 +181,35 @@ def __call__(self, x, pos=0): elif ms != 0: return pydt.time(h, m, s, msus).strftime(fmt)[:-3] elif s != 0: - return pydt.time(h, m, s).strftime('%H:%M:%S') + return pydt.time(h, m, s).strftime("%H:%M:%S") - return pydt.time(h, m).strftime('%H:%M') + return pydt.time(h, m).strftime("%H:%M") # Period Conversion class PeriodConverter(dates.DateConverter): - @staticmethod def convert(values, units, axis): if is_nested_list_like(values): - values = [PeriodConverter._convert_1d(v, units, axis) - for v in values] + values = [PeriodConverter._convert_1d(v, units, axis) for v in values] else: values = PeriodConverter._convert_1d(values, units, axis) return values @staticmethod def _convert_1d(values, units, axis): - if not hasattr(axis, 'freq'): - raise TypeError('Axis must have `freq` set to convert to Periods') - valid_types = (str, datetime, Period, pydt.date, pydt.time, - np.datetime64) - if (isinstance(values, valid_types) or is_integer(values) or - is_float(values)): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) + if isinstance(values, valid_types) or is_integer(values) or is_float(values): return get_datevalue(values, axis.freq) elif isinstance(values, PeriodIndex): return values.asfreq(axis.freq)._ndarray_values elif isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == 'period': + elif lib.infer_dtype(values, skipna=False) == "period": # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex return PeriodIndex(values, freq=axis.freq)._ndarray_values @@ -223,11 +221,13 @@ def _convert_1d(values, units, axis): def get_datevalue(date, freq): if isinstance(date, Period): return date.asfreq(freq).ordinal - elif isinstance(date, (str, datetime, pydt.date, pydt.time, - np.datetime64)): + elif isinstance(date, (str, datetime, pydt.date, pydt.time, np.datetime64)): return Period(date, freq).ordinal - elif (is_integer(date) or is_float(date) or - (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): + elif ( + is_integer(date) + or is_float(date) + or (isinstance(date, (np.ndarray, Index)) and (date.size == 1)) + ): return date elif date is None: return None @@ -240,9 +240,8 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if (isinstance(dt, (np.ndarray, Index, ABCSeries) - ) and is_datetime64_ns_dtype(dt)): - base = dates.epoch2num(dt.asi8 / 1.0E9) + if isinstance(dt, (np.ndarray, Index, ABCSeries)) and is_datetime64_ns_dtype(dt): + base = dates.epoch2num(dt.asi8 / 1.0e9) else: base = dates.date2num(dt) return base @@ -250,14 +249,12 @@ def _dt_to_float_ordinal(dt): # Datetime Conversion class DatetimeConverter(dates.DateConverter): - @staticmethod def convert(values, unit, axis): # values might be a 1-d array, or a list-like of arrays. 
_check_implicitly_registered() if is_nested_list_like(values): - values = [DatetimeConverter._convert_1d(v, unit, axis) - for v in values] + values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] else: values = DatetimeConverter._convert_1d(values, unit, axis) return values @@ -276,7 +273,7 @@ def try_parse(values): return _dt_to_float_ordinal(tslibs.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) - elif (is_integer(values) or is_float(values)): + elif is_integer(values) or is_float(values): return values elif isinstance(values, str): return try_parse(values) @@ -319,13 +316,13 @@ def axisinfo(unit, axis): datemin = pydt.date(2000, 1, 1) datemax = pydt.date(2010, 1, 1) - return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='', - default_limits=(datemin, datemax)) + return units.AxisInfo( + majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax) + ) class PandasAutoDateFormatter(dates.AutoDateFormatter): - - def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): + def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): dates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt) # matplotlib.dates._UTC has no _utcoffset called by pandas if self._tz is dates.UTC: @@ -333,15 +330,14 @@ def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): class PandasAutoDateLocator(dates.AutoDateLocator): - def get_locator(self, dmin, dmax): - 'Pick the best locator based on a distance.' + "Pick the best locator based on a distance." _check_implicitly_registered() delta = relativedelta(dmax, dmin) num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds - tot_sec = num_days * 86400. + num_sec + tot_sec = num_days * 86400.0 + num_sec if abs(tot_sec) < self.minticks: self._freq = -1 @@ -360,11 +356,11 @@ def _get_unit(self): class MilliSecondLocator(dates.DateLocator): - UNIT = 1. / (24 * 3600 * 1000) + UNIT = 1.0 / (24 * 3600 * 1000) def __init__(self, tz): dates.DateLocator.__init__(self, tz) - self._interval = 1. + self._interval = 1.0 def _get_unit(self): return self.get_unit_generic(-1) @@ -411,24 +407,25 @@ def __call__(self): break else: # We went through the whole loop without breaking, default to 1 - self._interval = 1000. 
+ self._interval = 1000.0 estimate = (nmax - nmin) / (self._get_unit() * self._get_interval()) if estimate > self.MAXTICKS * 2: - raise RuntimeError(('MillisecondLocator estimated to generate ' - '{estimate:d} ticks from {dmin} to {dmax}: ' - 'exceeds Locator.MAXTICKS' - '* 2 ({arg:d}) ').format( - estimate=estimate, dmin=dmin, dmax=dmax, - arg=self.MAXTICKS * 2)) - - freq = '%dL' % self._get_interval() + raise RuntimeError( + ( + "MillisecondLocator estimated to generate " + "{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + "* 2 ({arg:d}) " + ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) + ) + + freq = "%dL" % self._get_interval() tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) - all_dates = date_range(start=st, end=ed, - freq=freq, tz=tz).astype(object) + all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: if len(all_dates) > 0: @@ -485,8 +482,9 @@ def _from_ordinal(x, tz=None): microsecond = int(1e6 * remainder) if microsecond < 10: microsecond = 0 # compensate for rounding errors - dt = datetime(dt.year, dt.month, dt.day, int(hour), int(minute), - int(second), microsecond) + dt = datetime( + dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond + ) if tz is not None: dt = dt.astimezone(tz) @@ -495,6 +493,7 @@ def _from_ordinal(x, tz=None): return dt + # Fixed frequency dynamic tick locators and formatters # ------------------------------------------------------------------------- @@ -548,9 +547,9 @@ def has_level_label(label_flags, vmin): if the minimum view limit is not an exact integer, then the first tick label won't be shown, so we must adjust for that. """ - if label_flags.size == 0 or (label_flags.size == 1 and - label_flags[0] == 0 and - vmin % 1 > 0.0): + if label_flags.size == 0 or ( + label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0 + ): return False else: return True @@ -592,33 +591,34 @@ def _daily_finder(vmin, vmax, freq): # save this for later usage vmin_orig = vmin - (vmin, vmax) = (Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq)) + (vmin, vmax) = ( + Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq), + ) span = vmax.ordinal - vmin.ordinal + 1 dates_ = period_range(start=vmin, end=vmax, freq=freq) # Initialize the output - info = np.zeros(span, - dtype=[('val', np.int64), ('maj', bool), - ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._ndarray_values - info['fmt'][:] = '' - info['maj'][[0, -1]] = True + info = np.zeros( + span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] + ) + info["val"][:] = dates_._ndarray_values + info["fmt"][:] = "" + info["maj"][[0, -1]] = True # .. and set some shortcuts - info_maj = info['maj'] - info_min = info['min'] - info_fmt = info['fmt'] + info_maj = info["maj"] + info_min = info["min"] + info_fmt = info["fmt"] def first_label(label_flags): - if (label_flags[0] == 0) and (label_flags.size > 1) and \ - ((vmin_orig % 1) > 0.0): + if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0): return label_flags[1] else: return label_flags[0] # Case 1. 
Less than a month if span <= periodspermonth: - day_start = period_break(dates_, 'day') - month_start = period_break(dates_, 'month') + day_start = period_break(dates_, "day") + month_start = period_break(dates_, "month") def _hour_finder(label_interval, force_year_start): _hour = dates_.hour @@ -626,39 +626,38 @@ def _hour_finder(label_interval, force_year_start): hour_start = (_hour - _prev_hour) != 0 info_maj[day_start] = True info_min[hour_start & (_hour % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt[hour_start & (_hour % label_interval == 0)] = '%H:%M' - info_fmt[day_start] = '%H:%M\n%d-%b' - info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" if force_year_start and not has_level_label(year_start, vmin_orig): - info_fmt[first_label(day_start)] = '%H:%M\n%d-%b\n%Y' + info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" def _minute_finder(label_interval): - hour_start = period_break(dates_, 'hour') + hour_start = period_break(dates_, "hour") _minute = dates_.minute _prev_minute = (dates_ - 1 * dates_.freq).minute minute_start = (_minute - _prev_minute) != 0 info_maj[hour_start] = True info_min[minute_start & (_minute % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[minute_start & (_minute % label_interval == 0)] = '%H:%M' - info_fmt[day_start] = '%H:%M\n%d-%b' - info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" def _second_finder(label_interval): - minute_start = period_break(dates_, 'minute') + minute_start = period_break(dates_, "minute") _second = dates_.second _prev_second = (dates_ - 1 * dates_.freq).second second_start = (_second - _prev_second) != 0 - info['maj'][minute_start] = True - info['min'][second_start & (_second % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[second_start & (_second % - label_interval == 0)] = '%H:%M:%S' - info_fmt[day_start] = '%H:%M:%S\n%d-%b' - info_fmt[year_start] = '%H:%M:%S\n%d-%b\n%Y' + info["maj"][minute_start] = True + info["min"][second_start & (_second % label_interval == 0)] = True + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + info_fmt[day_start] = "%H:%M:%S\n%d-%b" + info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" if span < periodsperday / 12000.0: _second_finder(1) @@ -695,81 +694,81 @@ def _second_finder(label_interval): else: info_maj[month_start] = True info_min[day_start] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[day_start] = '%d' - info_fmt[month_start] = '%d\n%b' - info_fmt[year_start] = '%d\n%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[day_start] = "%d" + info_fmt[month_start] = "%d\n%b" + info_fmt[year_start] = "%d\n%b\n%Y" if not has_level_label(year_start, vmin_orig): if not has_level_label(month_start, vmin_orig): - info_fmt[first_label(day_start)] = '%d\n%b\n%Y' + info_fmt[first_label(day_start)] = "%d\n%b\n%Y" else: - info_fmt[first_label(month_start)] = '%d\n%b\n%Y' + 
info_fmt[first_label(month_start)] = "%d\n%b\n%Y" # Case 2. Less than three months elif span <= periodsperyear // 4: - month_start = period_break(dates_, 'month') + month_start = period_break(dates_, "month") info_maj[month_start] = True if freq < FreqGroup.FR_HR: - info['min'] = True + info["min"] = True else: - day_start = period_break(dates_, 'day') - info['min'][day_start] = True - week_start = period_break(dates_, 'week') - year_start = period_break(dates_, 'year') - info_fmt[week_start] = '%d' - info_fmt[month_start] = '\n\n%b' - info_fmt[year_start] = '\n\n%b\n%Y' + day_start = period_break(dates_, "day") + info["min"][day_start] = True + week_start = period_break(dates_, "week") + year_start = period_break(dates_, "year") + info_fmt[week_start] = "%d" + info_fmt[month_start] = "\n\n%b" + info_fmt[year_start] = "\n\n%b\n%Y" if not has_level_label(year_start, vmin_orig): if not has_level_label(month_start, vmin_orig): - info_fmt[first_label(week_start)] = '\n\n%b\n%Y' + info_fmt[first_label(week_start)] = "\n\n%b\n%Y" else: - info_fmt[first_label(month_start)] = '\n\n%b\n%Y' + info_fmt[first_label(month_start)] = "\n\n%b\n%Y" # Case 3. Less than 14 months ............... elif span <= 1.15 * periodsperyear: - year_start = period_break(dates_, 'year') - month_start = period_break(dates_, 'month') - week_start = period_break(dates_, 'week') + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") + week_start = period_break(dates_, "week") info_maj[month_start] = True info_min[week_start] = True info_min[year_start] = False info_min[month_start] = False - info_fmt[month_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[month_start] = "%b" + info_fmt[year_start] = "%b\n%Y" if not has_level_label(year_start, vmin_orig): - info_fmt[first_label(month_start)] = '%b\n%Y' + info_fmt[first_label(month_start)] = "%b\n%Y" # Case 4. Less than 2.5 years ............... elif span <= 2.5 * periodsperyear: - year_start = period_break(dates_, 'year') - quarter_start = period_break(dates_, 'quarter') - month_start = period_break(dates_, 'month') + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") + month_start = period_break(dates_, "month") info_maj[quarter_start] = True info_min[month_start] = True - info_fmt[quarter_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" # Case 4. Less than 4 years ................. elif span <= 4 * periodsperyear: - year_start = period_break(dates_, 'year') - month_start = period_break(dates_, 'month') + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") info_maj[year_start] = True info_min[month_start] = True info_min[year_start] = False month_break = dates_[month_start].month jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] - info_fmt[jan_or_jul] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" # Case 5. Less than 11 years ................ elif span <= 11 * periodsperyear: - year_start = period_break(dates_, 'year') - quarter_start = period_break(dates_, 'quarter') + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") info_maj[year_start] = True info_min[quarter_start] = True info_min[year_start] = False - info_fmt[year_start] = '%Y' + info_fmt[year_start] = "%Y" # Case 6. More than 12 years ................ 
else: - year_start = period_break(dates_, 'year') + year_start = period_break(dates_, "year") year_break = dates_[year_start].year nyears = span / periodsperyear (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) @@ -777,7 +776,7 @@ def _second_finder(label_interval): info_maj[major_idx] = True minor_idx = year_start[(year_break % min_anndef == 0)] info_min[minor_idx] = True - info_fmt[major_idx] = '%Y' + info_fmt[major_idx] = "%Y" return info @@ -790,54 +789,54 @@ def _monthly_finder(vmin, vmax, freq): span = vmax - vmin + 1 # Initialize the output - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - dates_ = info['val'] - info['fmt'] = '' + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + dates_ = info["val"] + info["fmt"] = "" year_start = (dates_ % 12 == 0).nonzero()[0] - info_maj = info['maj'] - info_fmt = info['fmt'] + info_maj = info["maj"] + info_fmt = info["fmt"] if span <= 1.15 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True - info_fmt[:] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[:] = "%b" + info_fmt[year_start] = "%b\n%Y" if not has_level_label(year_start, vmin_orig): if dates_.size > 1: idx = 1 else: idx = 0 - info_fmt[idx] = '%b\n%Y' + info_fmt[idx] = "%b\n%Y" elif span <= 2.5 * periodsperyear: quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True # TODO: Check the following : is it really info['fmt'] ? - info['fmt'][quarter_start] = True - info['min'] = True + info["fmt"][quarter_start] = True + info["min"] = True - info_fmt[quarter_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" elif span <= 4 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) - info_fmt[jan_or_jul] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" elif span <= 11 * periodsperyear: quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True - info['min'][quarter_start] = True + info["min"][quarter_start] = True - info_fmt[year_start] = '%Y' + info_fmt[year_start] = "%Y" else: nyears = span / periodsperyear @@ -845,9 +844,9 @@ def _monthly_finder(vmin, vmax, freq): years = dates_[year_start] // 12 + 1 major_idx = year_start[(years % maj_anndef == 0)] info_maj[major_idx] = True - info['min'][year_start[(years % min_anndef == 0)]] = True + info["min"][year_start[(years % min_anndef == 0)]] = True - info_fmt[major_idx] = '%Y' + info_fmt[major_idx] = "%Y" return info @@ -858,33 +857,33 @@ def _quarterly_finder(vmin, vmax, freq): (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - info['fmt'] = '' - dates_ = info['val'] - info_maj = info['maj'] - info_fmt = info['fmt'] + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + info_maj = info["maj"] + info_fmt = info["fmt"] year_start = (dates_ % 4 == 0).nonzero()[0] if span <= 3.5 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True - info_fmt[:] = 'Q%q' - info_fmt[year_start] = 
'Q%q\n%F' + info_fmt[:] = "Q%q" + info_fmt[year_start] = "Q%q\n%F" if not has_level_label(year_start, vmin_orig): if dates_.size > 1: idx = 1 else: idx = 0 - info_fmt[idx] = 'Q%q\n%F' + info_fmt[idx] = "Q%q\n%F" elif span <= 11 * periodsperyear: info_maj[year_start] = True - info['min'] = True - info_fmt[year_start] = '%F' + info["min"] = True + info_fmt[year_start] = "%F" else: years = dates_[year_start] // 4 + 1 @@ -892,8 +891,8 @@ def _quarterly_finder(vmin, vmax, freq): (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) major_idx = year_start[(years % maj_anndef == 0)] info_maj[major_idx] = True - info['min'][year_start[(years % min_anndef == 0)]] = True - info_fmt[major_idx] = '%F' + info["min"][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = "%F" return info @@ -902,18 +901,18 @@ def _annual_finder(vmin, vmax, freq): (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - info['fmt'] = '' - dates_ = info['val'] + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] (min_anndef, maj_anndef) = _get_default_annual_spacing(span) major_idx = dates_ % maj_anndef == 0 - info['maj'][major_idx] = True - info['min'][(dates_ % min_anndef == 0)] = True - info['fmt'][major_idx] = '%Y' + info["maj"][major_idx] = True + info["min"][(dates_ % min_anndef == 0)] = True + info["fmt"][major_idx] = "%Y" return info @@ -929,7 +928,7 @@ def get_finder(freq): return _quarterly_finder elif freq == FreqGroup.FR_MTH: return _monthly_finder - elif ((freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK): + elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover errmsg = "Unsupported frequency: {freq}".format(freq=freq) @@ -954,8 +953,17 @@ class TimeSeries_DateLocator(Locator): day : {int}, optional """ - def __init__(self, freq, minor_locator=False, dynamic_mode=True, - base=1, quarter=1, month=1, day=1, plot_obj=None): + def __init__( + self, + freq, + minor_locator=False, + dynamic_mode=True, + base=1, + quarter=1, + month=1, + day=1, + plot_obj=None, + ): if isinstance(freq, str): freq = get_freq(freq) self.freq = freq @@ -976,11 +984,11 @@ def _get_default_locs(self, vmin, vmax): locator = self.plot_obj.date_axis_info if self.isminor: - return np.compress(locator['min'], locator['val']) - return np.compress(locator['maj'], locator['val']) + return np.compress(locator["min"], locator["val"]) + return np.compress(locator["maj"], locator["val"]) def __call__(self): - 'Return the locations of the ticks.' + "Return the locations of the ticks." # axis calls Locator.set_axis inside set_m_formatter _check_implicitly_registered() @@ -1015,6 +1023,7 @@ def autoscale(self): vmax += 1 return nonsingular(vmin, vmax) + # ------------------------------------------------------------------------- # --- Formatter --- # ------------------------------------------------------------------------- @@ -1035,8 +1044,7 @@ class TimeSeries_DateFormatter(Formatter): Whether the formatter works in dynamic mode or not. 
""" - def __init__(self, freq, minor_locator=False, dynamic_mode=True, - plot_obj=None): + def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): if isinstance(freq, str): freq = get_freq(freq) self.format = None @@ -1057,15 +1065,14 @@ def _set_default_format(self, vmin, vmax): info = self.plot_obj.date_axis_info if self.isminor: - format = np.compress(info['min'] & np.logical_not(info['maj']), - info) + format = np.compress(info["min"] & np.logical_not(info["maj"]), info) else: - format = np.compress(info['maj'], info) + format = np.compress(info["maj"], info) self.formatdict = {x: f for (x, _, _, f) in format} return self.formatdict def set_locs(self, locs): - 'Sets the locations of the ticks' + "Sets the locations of the ticks" # don't actually use the locs. This is just needed to work with # matplotlib. Force to use vmin, vmax _check_implicitly_registered() @@ -1084,9 +1091,9 @@ def __call__(self, x, pos=0): _check_implicitly_registered() if self.formatdict is None: - return '' + return "" else: - fmt = self.formatdict.pop(x, '') + fmt = self.formatdict.pop(x, "") return Period(ordinal=int(x), freq=self.freq).strftime(fmt) @@ -1104,12 +1111,12 @@ def format_timedelta_ticks(x, pos, n_decimals): m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) - decimals = int(ns * 10**(n_decimals - 9)) - s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) if n_decimals > 0: - s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) if d != 0: - s = '{:d} days '.format(int(d)) + s + s = "{:d} days ".format(int(d)) + s return s def __call__(self, x, pos=0): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5fb4d201223bd1..d25715e6d167b3 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -10,9 +10,19 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_hashable, is_integer, is_iterator, is_list_like, is_number) + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_number, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas.core.common as com @@ -22,10 +32,16 @@ from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( - _flatten, _get_all_lines, _get_xlim, _handle_shared_axes, _subplots, - format_date_labels, table) - -if get_option('plotting.matplotlib.register_converters'): + _flatten, + _get_all_lines, + _get_xlim, + _handle_shared_axes, + _subplots, + format_date_labels, + table, +) + +if get_option("plotting.matplotlib.register_converters"): converter.register(explicit=False) @@ -38,29 +54,63 @@ class MPLPlot: data : """ + @property def _kind(self): """Specify kind str. 
Must be overridden in child class""" raise NotImplementedError - _layout_type = 'vertical' + _layout_type = "vertical" _default_rot = 0 orientation = None # type: Optional[str] - _pop_attributes = ['label', 'style', 'logy', 'logx', 'loglog', - 'mark_right', 'stacked'] - _attr_defaults = {'logy': False, 'logx': False, 'loglog': False, - 'mark_right': True, 'stacked': False} - - def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, - sharey=False, use_index=True, - figsize=None, grid=None, legend=True, rot=None, - ax=None, fig=None, title=None, xlim=None, ylim=None, - xticks=None, yticks=None, - sort_columns=False, fontsize=None, - secondary_y=False, colormap=None, - table=False, layout=None, **kwds): + _pop_attributes = [ + "label", + "style", + "logy", + "logx", + "loglog", + "mark_right", + "stacked", + ] + _attr_defaults = { + "logy": False, + "logx": False, + "loglog": False, + "mark_right": True, + "stacked": False, + } + + def __init__( + self, + data, + kind=None, + by=None, + subplots=False, + sharex=None, + sharey=False, + use_index=True, + figsize=None, + grid=None, + legend=True, + rot=None, + ax=None, + fig=None, + title=None, + xlim=None, + ylim=None, + xticks=None, + yticks=None, + sort_columns=False, + fontsize=None, + secondary_y=False, + colormap=None, + table=False, + layout=None, + **kwds + ): import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots self.data = data self.by = by @@ -104,7 +154,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams['axes.grid'] + grid = False if secondary_y else plt.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -120,22 +170,23 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.axes = None # parse errorbar input if given - xerr = kwds.pop('xerr', None) - yerr = kwds.pop('yerr', None) - self.errors = {kw: self._parse_errorbars(kw, err) - for kw, err in zip(['xerr', 'yerr'], [xerr, yerr])} - - if not isinstance(secondary_y, (bool, tuple, list, - np.ndarray, ABCIndexClass)): + xerr = kwds.pop("xerr", None) + yerr = kwds.pop("yerr", None) + self.errors = { + kw: self._parse_errorbars(kw, err) + for kw, err in zip(["xerr", "yerr"], [xerr, yerr]) + } + + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndexClass)): secondary_y = [secondary_y] self.secondary_y = secondary_y # ugly TypeError if user passes matplotlib's `cmap` name. # Probably better to accept either. - if 'cmap' in kwds and colormap: + if "cmap" in kwds and colormap: raise TypeError("Only specify one of `cmap` and `colormap`.") - elif 'cmap' in kwds: - self.colormap = kwds.pop('cmap') + elif "cmap" in kwds: + self.colormap = kwds.pop("cmap") else: self.colormap = colormap @@ -146,40 +197,54 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self._validate_color_args() def _validate_color_args(self): - if 'color' not in self.kwds and 'colors' in self.kwds: - warnings.warn(("'colors' is being deprecated. Please use 'color'" - "instead of 'colors'")) - colors = self.kwds.pop('colors') - self.kwds['color'] = colors - - if ('color' in self.kwds and self.nseries == 1 and - not is_list_like(self.kwds['color'])): + if "color" not in self.kwds and "colors" in self.kwds: + warnings.warn( + ( + "'colors' is being deprecated. 
Please use 'color'" + "instead of 'colors'" + ) + ) + colors = self.kwds.pop("colors") + self.kwds["color"] = colors + + if ( + "color" in self.kwds + and self.nseries == 1 + and not is_list_like(self.kwds["color"]) + ): # support series.plot(color='green') - self.kwds['color'] = [self.kwds['color']] - - if ('color' in self.kwds and isinstance(self.kwds['color'], tuple) and - self.nseries == 1 and len(self.kwds['color']) in (3, 4)): + self.kwds["color"] = [self.kwds["color"]] + + if ( + "color" in self.kwds + and isinstance(self.kwds["color"], tuple) + and self.nseries == 1 + and len(self.kwds["color"]) in (3, 4) + ): # support RGB and RGBA tuples in series plot - self.kwds['color'] = [self.kwds['color']] + self.kwds["color"] = [self.kwds["color"]] - if ('color' in self.kwds or 'colors' in self.kwds) and \ - self.colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") + if ( + "color" in self.kwds or "colors" in self.kwds + ) and self.colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " "simultaneously. Using 'color'" + ) - if 'color' in self.kwds and self.style is not None: + if "color" in self.kwds and self.style is not None: if is_list_like(self.style): styles = self.style else: styles = [self.style] # need only a single match for s in styles: - if re.match('^[a-z]+?', s) is not None: + if re.match("^[a-z]+?", s) is not None: raise ValueError( "Cannot pass 'style' string with a color " "symbol and 'color' keyword argument. Please" " use one or the other or pass 'style' " - "without a color symbol") + "without a color symbol" + ) def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -227,19 +292,17 @@ def _args_adjust(self): def _has_plotted_object(self, ax): """check whether ax has data""" - return (len(ax.lines) != 0 or - len(ax.artists) != 0 or - len(ax.containers) != 0) + return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 def _maybe_right_yaxis(self, ax, axes_num): if not self.on_right(axes_num): # secondary axes may be passed via ax kw return self._get_ax_layer(ax) - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): # if it has right_ax proparty, ``ax`` must be left axes return ax.right_ax - elif hasattr(ax, 'left_ax'): + elif hasattr(ax, "left_ax"): # if it has left_ax proparty, ``ax`` must be right axes return ax else: @@ -254,18 +317,22 @@ def _maybe_right_yaxis(self, ax, axes_num): orig_ax.get_yaxis().set_visible(False) if self.logy is True or self.loglog is True: - new_ax.set_yscale('log') - elif self.logy == 'sym' or self.loglog == 'sym': - new_ax.set_yscale('symlog') + new_ax.set_yscale("log") + elif self.logy == "sym" or self.loglog == "sym": + new_ax.set_yscale("symlog") return new_ax def _setup_subplots(self): if self.subplots: - fig, axes = _subplots(naxes=self.nseries, - sharex=self.sharex, sharey=self.sharey, - figsize=self.figsize, ax=self.ax, - layout=self.layout, - layout_type=self._layout_type) + fig, axes = _subplots( + naxes=self.nseries, + sharex=self.sharex, + sharey=self.sharey, + figsize=self.figsize, + ax=self.ax, + layout=self.layout, + layout_type=self._layout_type, + ) else: if self.ax is None: fig = self.plt.figure(figsize=self.figsize) @@ -278,7 +345,7 @@ def _setup_subplots(self): axes = _flatten(axes) - valid_log = {False, True, 'sym', None} + valid_log = {False, True, "sym", None} input_log = {self.logx, self.logy, self.loglog} if input_log - valid_log: invalid_log = next(iter((input_log - valid_log))) 
@@ -288,14 +355,14 @@ def _setup_subplots(self): ) if self.logx is True or self.loglog is True: - [a.set_xscale('log') for a in axes] - elif self.logx == 'sym' or self.loglog == 'sym': - [a.set_xscale('symlog') for a in axes] + [a.set_xscale("log") for a in axes] + elif self.logx == "sym" or self.loglog == "sym": + [a.set_xscale("symlog") for a in axes] if self.logy is True or self.loglog is True: - [a.set_yscale('log') for a in axes] - elif self.logy == 'sym' or self.loglog == 'sym': - [a.set_yscale('symlog') for a in axes] + [a.set_yscale("log") for a in axes] + elif self.logy == "sym" or self.loglog == "sym": + [a.set_yscale("symlog") for a in axes] self.fig = fig self.axes = axes @@ -312,9 +379,10 @@ def result(self): return self.axes else: sec_true = isinstance(self.secondary_y, bool) and self.secondary_y - all_sec = (is_list_like(self.secondary_y) and - len(self.secondary_y) == self.nseries) - if (sec_true or all_sec): + all_sec = ( + is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries + ) + if sec_true or all_sec: # if all data is plotted on secondary, return right axes return self._get_ax_layer(self.axes[0], primary=False) else: @@ -326,16 +394,15 @@ def _compute_plot_data(self): if isinstance(data, ABCSeries): label = self.label if label is None and data.name is None: - label = 'None' + label = "None" data = data.to_frame(name=label) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) - numeric_data = data.select_dtypes(include=[np.number, - "datetime", - "datetimetz", - "timedelta"]) + numeric_data = data.select_dtypes( + include=[np.number, "datetime", "datetimetz", "timedelta"] + ) try: is_empty = numeric_data.empty @@ -344,7 +411,7 @@ def _compute_plot_data(self): # no non-numeric frames or series allowed if is_empty: - raise TypeError('no numeric data to plot') + raise TypeError("no numeric data to plot") # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to # np.ndarray before plot. 
@@ -370,23 +437,19 @@ def _add_table(self): def _post_plot_logic_common(self, ax, data): """Common post process for each axes""" - if self.orientation == 'vertical' or self.orientation is None: - self._apply_axis_properties(ax.xaxis, rot=self.rot, - fontsize=self.fontsize) + if self.orientation == "vertical" or self.orientation is None: + self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) - if hasattr(ax, 'right_ax'): - self._apply_axis_properties(ax.right_ax.yaxis, - fontsize=self.fontsize) + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) - elif self.orientation == 'horizontal': - self._apply_axis_properties(ax.yaxis, rot=self.rot, - fontsize=self.fontsize) + elif self.orientation == "horizontal": + self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) - if hasattr(ax, 'right_ax'): - self._apply_axis_properties(ax.right_ax.yaxis, - fontsize=self.fontsize) + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) else: # pragma no cover raise ValueError @@ -399,10 +462,15 @@ def _adorn_subplots(self): if len(self.axes) > 0: all_axes = self._get_subplots() nrows, ncols = self._get_axes_layout() - _handle_shared_axes(axarr=all_axes, nplots=len(all_axes), - naxes=nrows * ncols, nrows=nrows, - ncols=ncols, sharex=self.sharex, - sharey=self.sharey) + _handle_shared_axes( + axarr=all_axes, + nplots=len(all_axes), + naxes=nrows * ncols, + nrows=nrows, + ncols=ncols, + sharex=self.sharex, + sharey=self.sharey, + ) for ax in self.axes: if self.yticks is not None: @@ -423,12 +491,13 @@ def _adorn_subplots(self): if self.subplots: if is_list_like(self.title): if len(self.title) != self.nseries: - msg = ('The length of `title` must equal the number ' - 'of columns if using `title` of type `list` ' - 'and `subplots=True`.\n' - 'length of title = {}\n' - 'number of columns = {}').format( - len(self.title), self.nseries) + msg = ( + "The length of `title` must equal the number " + "of columns if using `title` of type `list` " + "and `subplots=True`.\n" + "length of title = {}\n" + "number of columns = {}" + ).format(len(self.title), self.nseries) raise ValueError(msg) for (ax, title) in zip(self.axes, self.title): @@ -437,8 +506,10 @@ def _adorn_subplots(self): self.fig.suptitle(self.title) else: if is_list_like(self.title): - msg = ('Using `title` of type `list` is not supported ' - 'unless `subplots=True` is passed') + msg = ( + "Using `title` of type `list` is not supported " + "unless `subplots=True` is passed" + ) raise ValueError(msg) self.axes[0].set_title(self.title) @@ -466,15 +537,14 @@ def legend_title(self): name = pprint_thing(name) return name else: - stringified = map(pprint_thing, - self.data.columns.names) - return ','.join(stringified) + stringified = map(pprint_thing, self.data.columns.names) + return ",".join(stringified) def _add_legend_handle(self, handle, label, index=None): if label is not None: if self.mark_right and index is not None: if self.on_right(index): - label = label + ' (right)' + label = label + " (right)" self.legend_handles.append(handle) self.legend_labels.append(label) @@ -483,7 +553,7 @@ def _make_legend(self): handles = [] labels = [] - title = '' + title = "" if not self.subplots: if leg is not None: @@ -492,7 +562,7 @@ def _make_legend(self): labels = [x.get_text() for x in leg.get_texts()] if 
self.legend: - if self.legend == 'reverse': + if self.legend == "reverse": self.legend_handles = reversed(self.legend_handles) self.legend_labels = reversed(self.legend_labels) @@ -502,17 +572,16 @@ def _make_legend(self): title = self.legend_title if len(handles) > 0: - ax.legend(handles, labels, loc='best', title=title) + ax.legend(handles, labels, loc="best", title=title) elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc='best') + ax.legend(loc="best") def _get_ax_legend(self, ax): leg = ax.get_legend() - other_ax = (getattr(ax, 'left_ax', None) or - getattr(ax, 'right_ax', None)) + other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None) other_leg = None if other_ax is not None: other_leg = other_ax.get_legend() @@ -524,14 +593,14 @@ def _get_ax_legend(self, ax): @cache_readonly def plt(self): import matplotlib.pyplot as plt + return plt _need_to_set_index = False def _get_xticks(self, convert_period=False): index = self.data.index - is_datetype = index.inferred_type in ('datetime', 'date', - 'datetime64', 'time') + is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") if self.use_index: if convert_period and isinstance(index, ABCPeriodIndex): @@ -568,10 +637,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): x = x._mpl_repr() if is_errorbar: - if 'xerr' in kwds: - kwds['xerr'] = np.array(kwds.get('xerr')) - if 'yerr' in kwds: - kwds['yerr'] = np.array(kwds.get('yerr')) + if "xerr" in kwds: + kwds["xerr"] = np.array(kwds.get("xerr")) + if "yerr" in kwds: + kwds["yerr"] = np.array(kwds.get("yerr")) return ax.errorbar(x, y, **kwds) else: # prevent style kwarg from going to errorbar, where it is @@ -586,7 +655,7 @@ def _get_index_name(self): if isinstance(self.data.index, ABCMultiIndex): name = self.data.index.names if com._any_not_none(*name): - name = ','.join(pprint_thing(x) for x in name) + name = ",".join(pprint_thing(x) for x in name) else: name = None else: @@ -600,9 +669,9 @@ def _get_index_name(self): def _get_ax_layer(cls, ax, primary=True): """get left (primary) or right (secondary) axes""" if primary: - return getattr(ax, 'left_ax', ax) + return getattr(ax, "left_ax", ax) else: - return getattr(ax, 'right_ax', ax) + return getattr(ax, "right_ax", ax) def _get_ax(self, i): # get the twinx ax if appropriate @@ -630,8 +699,7 @@ def on_right(self, i): if isinstance(self.secondary_y, bool): return self.secondary_y - if isinstance(self.secondary_y, (tuple, list, - np.ndarray, ABCIndexClass)): + if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndexClass)): return self.data.columns[i] in self.secondary_y def _apply_style_colors(self, colors, kwds, col_num, label): @@ -651,19 +719,21 @@ def _apply_style_colors(self, colors, kwds, col_num, label): else: style = self.style - has_color = 'color' in kwds or self.colormap is not None - nocolor_style = style is None or re.match('[a-z]+', style) is None + has_color = "color" in kwds or self.colormap is not None + nocolor_style = style is None or re.match("[a-z]+", style) is None if (has_color or self.subplots) and nocolor_style: - kwds['color'] = colors[col_num % len(colors)] + kwds["color"] = colors[col_num % len(colors)] return style, kwds - def _get_colors(self, num_colors=None, color_kwds='color'): + def _get_colors(self, num_colors=None, color_kwds="color"): if num_colors is None: num_colors = self.nseries - return _get_standard_colors(num_colors=num_colors, - colormap=self.colormap, - color=self.kwds.get(color_kwds)) + 
return _get_standard_colors( + num_colors=num_colors, + colormap=self.colormap, + color=self.kwds.get(color_kwds), + ) def _parse_errorbars(self, label, err): """ @@ -719,12 +789,15 @@ def match_labels(data, e): # asymmetrical error bars if err.ndim == 3: - if (err_shape[0] != self.nseries) or \ - (err_shape[1] != 2) or \ - (err_shape[2] != len(self.data)): - msg = "Asymmetrical error bars should be provided " + \ - "with the shape (%u, 2, %u)" % \ - (self.nseries, len(self.data)) + if ( + (err_shape[0] != self.nseries) + or (err_shape[1] != 2) + or (err_shape[2] != len(self.data)) + ): + msg = ( + "Asymmetrical error bars should be provided " + + "with the shape (%u, 2, %u)" % (self.nseries, len(self.data)) + ) raise ValueError(msg) # broadcast errors to each data series @@ -743,7 +816,7 @@ def match_labels(data, e): def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): errors = {} - for kw, flag in zip(['xerr', 'yerr'], [xerr, yerr]): + for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): if flag: err = self.errors[kw] # user provided label-matched dataframe of errors @@ -761,8 +834,10 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): def _get_subplots(self): from matplotlib.axes import Subplot - return [ax for ax in self.axes[0].get_figure().get_axes() - if isinstance(ax, Subplot)] + + return [ + ax for ax in self.axes[0].get_figure().get_axes() if isinstance(ax, Subplot) + ] def _get_axes_layout(self): axes = self._get_subplots() @@ -781,20 +856,20 @@ class PlanePlot(MPLPlot): Abstract class for plotting on plane, currently scatter and hexbin. """ - _layout_type = 'single' + _layout_type = "single" def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: - raise ValueError(self._kind + ' requires an x and y column') + raise ValueError(self._kind + " requires an x and y column") if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + ' requires x column to be numeric') + raise ValueError(self._kind + " requires x column to be numeric") if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + ' requires y column to be numeric') + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y @@ -830,10 +905,14 @@ def _plot_colorbar(self, ax, **kwds): points = ax.get_position().get_points() cbar_points = cbar.ax.get_position().get_points() - cbar.ax.set_position([cbar_points[0, 0], - points[0, 1], - cbar_points[1, 0] - cbar_points[0, 0], - points[1, 1] - points[0, 1]]) + cbar.ax.set_position( + [ + cbar_points[0, 0], + points[0, 1], + cbar_points[1, 0] - cbar_points[0, 0], + points[1, 1] - points[0, 1], + ] + ) # To see the discrepancy in axis heights uncomment # the following two lines: # print(points[1, 1] - points[0, 1]) @@ -841,7 +920,7 @@ def _plot_colorbar(self, ax, **kwds): class ScatterPlot(PlanePlot): - _kind = 'scatter' + _kind = "scatter" def __init__(self, data, x, y, s=None, c=None, **kwargs): if s is None: @@ -860,16 +939,16 @@ def _make_plot(self): c_is_column = is_hashable(c) and c in self.data.columns # plot a colorbar only if a colormap is provided or necessary - cb = self.kwds.pop('colorbar', self.colormap or c_is_column) + cb = self.kwds.pop("colorbar", self.colormap or c_is_column) # pandas uses colormap, matplotlib uses cmap. 
- cmap = self.colormap or 'Greys' + cmap = self.colormap or "Greys" cmap = self.plt.cm.get_cmap(cmap) color = self.kwds.pop("color", None) if c is not None and color is not None: - raise TypeError('Specify exactly one of `c` and `color`') + raise TypeError("Specify exactly one of `c` and `color`") elif c is None and color is None: - c_values = self.plt.rcParams['patch.facecolor'] + c_values = self.plt.rcParams["patch.facecolor"] elif color is not None: c_values = color elif c_is_column: @@ -877,14 +956,20 @@ def _make_plot(self): else: c_values = c - if self.legend and hasattr(self, 'label'): + if self.legend and hasattr(self, "label"): label = self.label else: label = None - scatter = ax.scatter(data[x].values, data[y].values, c=c_values, - label=label, cmap=cmap, **self.kwds) + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + **self.kwds + ) if cb: - cbar_label = c if c_is_column else '' + cbar_label = c if c_is_column else "" self._plot_colorbar(ax, label=cbar_label) if label is not None: @@ -896,13 +981,12 @@ def _make_plot(self): errors_y = self._get_errorbars(label=y, index=0, xerr=False) if len(errors_x) > 0 or len(errors_y) > 0: err_kwds = dict(errors_x, **errors_y) - err_kwds['ecolor'] = scatter.get_facecolor()[0] - ax.errorbar(data[x].values, data[y].values, - linestyle='none', **err_kwds) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) class HexBinPlot(PlanePlot): - _kind = 'hexbin' + _kind = "hexbin" def __init__(self, data, x, y, C=None, **kwargs): super().__init__(data, x, y, **kwargs) @@ -914,17 +998,16 @@ def _make_plot(self): x, y, data, C = self.x, self.y, self.data, self.C ax = self.axes[0] # pandas uses colormap, matplotlib uses cmap. 
- cmap = self.colormap or 'BuGn' + cmap = self.colormap or "BuGn" cmap = self.plt.cm.get_cmap(cmap) - cb = self.kwds.pop('colorbar', True) + cb = self.kwds.pop("colorbar", True) if C is None: c_values = None else: c_values = data[C].values - ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, - **self.kwds) + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds) if cb: self._plot_colorbar(ax) @@ -933,18 +1016,19 @@ def _make_legend(self): class LinePlot(MPLPlot): - _kind = 'line' + _kind = "line" _default_rot = 0 - orientation = 'vertical' + orientation = "vertical" def __init__(self, data, **kwargs): from pandas.plotting import plot_params + MPLPlot.__init__(self, data, **kwargs) if self.stacked: self.data = self.data.fillna(value=0) - self.x_compat = plot_params['x_compat'] - if 'x_compat' in self.kwds: - self.x_compat = bool(self.kwds.pop('x_compat')) + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) def _is_ts_plot(self): # this is slightly deceptive @@ -952,15 +1036,16 @@ def _is_ts_plot(self): def _use_dynamic_x(self): from pandas.plotting._matplotlib.timeseries import _use_dynamic_x + return _use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self): if self._is_ts_plot(): - from pandas.plotting._matplotlib.timeseries import ( - _maybe_convert_index) + from pandas.plotting._matplotlib.timeseries import _maybe_convert_index + data = _maybe_convert_index(self._get_ax(0), self.data) - x = data.index # dummy, not used + x = data.index # dummy, not used plotf = self._ts_plot it = self._iter_data(data=data, keep_index=True) else: @@ -981,12 +1066,18 @@ def _make_plot(self): kwds = dict(kwds, **errors) label = pprint_thing(label) # .encode('utf-8') - kwds['label'] = label - - newlines = plotf(ax, x, y, style=style, column_num=i, - stacking_id=stacking_id, - is_errorbar=is_errorbar, - **kwds) + kwds["label"] = label + + newlines = plotf( + ax, + x, + y, + style=style, + column_num=i, + stacking_id=stacking_id, + is_errorbar=is_errorbar, + **kwds + ) self._add_legend_handle(newlines[0], label, index=i) lines = _get_all_lines(ax) @@ -994,22 +1085,24 @@ def _make_plot(self): ax.set_xlim(left, right) @classmethod - def _plot(cls, ax, x, y, style=None, column_num=None, - stacking_id=None, **kwds): + def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): # column_num is used to get the target column from protf in line and # area plots if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) - y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) cls._update_stacker(ax, stacking_id, y) return lines @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): - from pandas.plotting._matplotlib.timeseries import (_maybe_resample, - _decorate_axes, - format_dateaxis) + from pandas.plotting._matplotlib.timeseries import ( + _maybe_resample, + _decorate_axes, + format_dateaxis, + ) + # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose @@ -1018,9 +1111,9 @@ def _ts_plot(cls, ax, x, data, style=None, **kwds): # Set ax with freq info _decorate_axes(ax, freq, kwds) # digging deeper - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): _decorate_axes(ax.left_ax, freq, kwds) - if hasattr(ax, 
'right_ax'): + if hasattr(ax, "right_ax"): _decorate_axes(ax.right_ax, freq, kwds) ax._plot_data.append((data, cls._kind, kwds)) @@ -1039,9 +1132,9 @@ def _get_stacking_id(self): def _initialize_stacker(cls, ax, stacking_id, n): if stacking_id is None: return - if not hasattr(ax, '_stacker_pos_prior'): + if not hasattr(ax, "_stacker_pos_prior"): ax._stacker_pos_prior = {} - if not hasattr(ax, '_stacker_neg_prior'): + if not hasattr(ax, "_stacker_neg_prior"): ax._stacker_neg_prior = {} ax._stacker_pos_prior[stacking_id] = np.zeros(n) ax._stacker_neg_prior[stacking_id] = np.zeros(n) @@ -1050,7 +1143,7 @@ def _initialize_stacker(cls, ax, stacking_id, n): def _get_stacked_values(cls, ax, stacking_id, values, label): if stacking_id is None: return values - if not hasattr(ax, '_stacker_pos_prior'): + if not hasattr(ax, "_stacker_pos_prior"): # stacker may not be initialized for subplots cls._initialize_stacker(ax, stacking_id, len(values)) @@ -1059,10 +1152,11 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): elif (values <= 0).all(): return ax._stacker_neg_prior[stacking_id] + values - raise ValueError('When stacked is True, each column must be either ' - 'all positive or negative.' - '{0} contains both positive and negative values' - .format(label)) + raise ValueError( + "When stacked is True, each column must be either " + "all positive or negative." + "{0} contains both positive and negative values".format(label) + ) @classmethod def _update_stacker(cls, ax, stacking_id, values): @@ -1080,7 +1174,7 @@ def get_label(i): try: return pprint_thing(data.index[i]) except Exception: - return '' + return "" if self._need_to_set_index: xticks = ax.get_xticks() @@ -1088,10 +1182,12 @@ def get_label(i): ax.set_xticklabels(xticklabels) ax.xaxis.set_major_locator(FixedLocator(xticks)) - condition = (not self._use_dynamic_x() and - data.index.is_all_dates and - not self.subplots or - (self.subplots and self.sharex)) + condition = ( + not self._use_dynamic_x() + and data.index.is_all_dates + and not self.subplots + or (self.subplots and self.sharex) + ) index_name = self._get_index_name() @@ -1107,31 +1203,40 @@ def get_label(i): class AreaPlot(LinePlot): - _kind = 'area' + _kind = "area" def __init__(self, data, **kwargs): - kwargs.setdefault('stacked', True) + kwargs.setdefault("stacked", True) data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: # use smaller alpha to distinguish overlap - self.kwds.setdefault('alpha', 0.5) + self.kwds.setdefault("alpha", 0.5) if self.logy or self.loglog: raise ValueError("Log-y scales are not supported in area plot") @classmethod - def _plot(cls, ax, x, y, style=None, column_num=None, - stacking_id=None, is_errorbar=False, **kwds): + def _plot( + cls, + ax, + x, + y, + style=None, + column_num=None, + stacking_id=None, + is_errorbar=False, + **kwds + ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) - y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) # need to remove label, because subplots uses mpl legend as it is line_kwds = kwds.copy() - line_kwds.pop('label') + line_kwds.pop("label") lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds) # get data from the line to get coordinates for fill_between @@ -1147,8 +1252,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, else: start = np.zeros(len(y)) - if 'color' not in kwds: - kwds['color'] = lines[0].get_color() + if "color" not in kwds: + 
kwds["color"] = lines[0].get_color() rect = ax.fill_between(xdata, start, y_values, **kwds) cls._update_stacker(ax, stacking_id, y) @@ -1168,33 +1273,33 @@ def _post_plot_logic(self, ax, data): class BarPlot(MPLPlot): - _kind = 'bar' + _kind = "bar" _default_rot = 90 - orientation = 'vertical' + orientation = "vertical" def __init__(self, data, **kwargs): # we have to treat a series differently than a # 1-column DataFrame w.r.t. color handling self._is_series = isinstance(data, ABCSeries) - self.bar_width = kwargs.pop('width', 0.5) - pos = kwargs.pop('position', 0.5) - kwargs.setdefault('align', 'center') + self.bar_width = kwargs.pop("width", 0.5) + pos = kwargs.pop("position", 0.5) + kwargs.setdefault("align", "center") self.tick_pos = np.arange(len(data)) - self.bottom = kwargs.pop('bottom', 0) - self.left = kwargs.pop('left', 0) + self.bottom = kwargs.pop("bottom", 0) + self.left = kwargs.pop("left", 0) - self.log = kwargs.pop('log', False) + self.log = kwargs.pop("log", False) MPLPlot.__init__(self, data, **kwargs) if self.stacked or self.subplots: self.tickoffset = self.bar_width * pos - if kwargs['align'] == 'edge': + if kwargs["align"] == "edge": self.lim_offset = self.bar_width / 2 else: self.lim_offset = 0 else: - if kwargs['align'] == 'edge': + if kwargs["align"] == "edge": w = self.bar_width / self.nseries self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5 self.lim_offset = w * 0.5 @@ -1231,18 +1336,17 @@ def _make_plot(self): ax = self._get_ax(i) kwds = self.kwds.copy() if self._is_series: - kwds['color'] = colors + kwds["color"] = colors else: - kwds['color'] = colors[i % ncolors] + kwds["color"] = colors[i % ncolors] errors = self._get_errorbars(label=label, index=i) kwds = dict(kwds, **errors) label = pprint_thing(label) - if (('yerr' in kwds) or ('xerr' in kwds)) \ - and (kwds.get('ecolor') is None): - kwds['ecolor'] = mpl.rcParams['xtick.color'] + if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None): + kwds["ecolor"] = mpl.rcParams["xtick.color"] start = 0 if self.log and (y >= 1).all(): @@ -1251,24 +1355,45 @@ def _make_plot(self): if self.subplots: w = self.bar_width / 2 - rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds + ) ax.set_title(label) elif self.stacked: mask = y > 0 start = np.where(mask, pos_prior, neg_prior) + self._start_base w = self.bar_width / 2 - rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds + ) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) else: w = self.bar_width / K - rect = self._plot(ax, self.ax_pos + (i + 0.5) * w, y, w, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + (i + 0.5) * w, + y, + w, + start=start, + label=label, + log=self.log, + **kwds + ) self._add_legend_handle(rect, label, index=i) def _post_plot_logic(self, ax, data): @@ -1292,9 +1417,9 @@ def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): class BarhPlot(BarPlot): - _kind = 'barh' + _kind = "barh" _default_rot = 0 - orientation = 'horizontal' + orientation = "horizontal" @property def _start_base(self): @@ -1314,8 +1439,8 @@ def _decorate_ticks(self, ax, name, ticklabels, 
start_edge, end_edge): class PiePlot(MPLPlot): - _kind = 'pie' - _layout_type = 'horizontal' + _kind = "pie" + _layout_type = "horizontal" def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) @@ -1333,9 +1458,8 @@ def _validate_color_args(self): pass def _make_plot(self): - colors = self._get_colors( - num_colors=len(self.data), color_kwds='colors') - self.kwds.setdefault('colors', colors) + colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") + self.kwds.setdefault("colors", colors) for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) @@ -1347,23 +1471,22 @@ def _make_plot(self): def blank_labeler(label, value): if value == 0: - return '' + return "" else: return label idx = [pprint_thing(v) for v in self.data.index] - labels = kwds.pop('labels', idx) + labels = kwds.pop("labels", idx) # labels is used for each wedge's labels # Blank out labels for values of 0 so they don't overlap # with nonzero wedges if labels is not None: - blabels = [blank_labeler(l, value) for - l, value in zip(labels, y)] + blabels = [blank_labeler(l, value) for l, value in zip(labels, y)] else: blabels = None results = ax.pie(y, labels=blabels, **kwds) - if kwds.get('autopct', None) is not None: + if kwds.get("autopct", None) is not None: patches, texts, autotexts = results else: patches, texts = results diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d34c0cb6a3889f..5213e09f14067d 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -11,15 +11,14 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.tools import ( - _flatten, _set_ticks_props, _subplots) +from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots class HistPlot(LinePlot): - _kind = 'hist' + _kind = "hist" def __init__(self, data, bins=10, bottom=0, **kwargs): - self.bins = bins # use mpl default + self.bins = bins # use mpl default self.bottom = bottom # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) @@ -27,28 +26,38 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): def _args_adjust(self): if is_integer(self.bins): # create common bin edge - values = (self.data._convert(datetime=True)._get_numeric_data()) + values = self.data._convert(datetime=True)._get_numeric_data() values = np.ravel(values) values = values[~isna(values)] hist, self.bins = np.histogram( - values, bins=self.bins, - range=self.kwds.get('range', None), - weights=self.kwds.get('weights', None)) + values, + bins=self.bins, + range=self.kwds.get("range", None), + weights=self.kwds.get("weights", None), + ) if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @classmethod - def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, - stacking_id=None, **kwds): + def _plot( + cls, + ax, + y, + style=None, + bins=None, + bottom=0, + column_num=0, + stacking_id=None, + **kwds + ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) y = y[~isna(y)] base = np.zeros(len(bins) - 1) - bottom = bottom + \ - cls._get_stacked_values(ax, stacking_id, base, kwds['label']) + bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) # ignore style n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds) cls._update_stacker(ax, stacking_id, n) @@ -64,41 +73,40 
@@ def _make_plot(self): kwds = self.kwds.copy() label = pprint_thing(label) - kwds['label'] = label + kwds["label"] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) if style is not None: - kwds['style'] = style + kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, - stacking_id=stacking_id, **kwds) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" # y is required for KdePlot - kwds['bottom'] = self.bottom - kwds['bins'] = self.bins + kwds["bottom"] = self.bottom + kwds["bins"] = self.bins return kwds def _post_plot_logic(self, ax, data): - if self.orientation == 'horizontal': - ax.set_xlabel('Frequency') + if self.orientation == "horizontal": + ax.set_xlabel("Frequency") else: - ax.set_ylabel('Frequency') + ax.set_ylabel("Frequency") @property def orientation(self): - if self.kwds.get('orientation', None) == 'horizontal': - return 'horizontal' + if self.kwds.get("orientation", None) == "horizontal": + return "horizontal" else: - return 'vertical' + return "vertical" class KdePlot(HistPlot): - _kind = 'kde' - orientation = 'vertical' + _kind = "kde" + orientation = "vertical" def __init__(self, data, bw_method=None, ind=None, **kwargs): MPLPlot.__init__(self, data, **kwargs) @@ -112,19 +120,34 @@ def _get_ind(self, y): if self.ind is None: # np.nanmax() and np.nanmin() ignores the missing values sample_range = np.nanmax(y) - np.nanmin(y) - ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, - np.nanmax(y) + 0.5 * sample_range, 1000) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + 1000, + ) elif is_integer(self.ind): sample_range = np.nanmax(y) - np.nanmin(y) - ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, - np.nanmax(y) + 0.5 * sample_range, self.ind) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + self.ind, + ) else: ind = self.ind return ind @classmethod - def _plot(cls, ax, y, style=None, bw_method=None, ind=None, - column_num=None, stacking_id=None, **kwds): + def _plot( + cls, + ax, + y, + style=None, + bw_method=None, + ind=None, + column_num=None, + stacking_id=None, + **kwds + ): from scipy.stats import gaussian_kde y = remove_na_arraylike(y) @@ -135,22 +158,36 @@ def _plot(cls, ax, y, style=None, bw_method=None, ind=None, return lines def _make_plot_keywords(self, kwds, y): - kwds['bw_method'] = self.bw_method - kwds['ind'] = self._get_ind(y) + kwds["bw_method"] = self.bw_method + kwds["ind"] = self._get_ind(y) return kwds def _post_plot_logic(self, ax, data): - ax.set_ylabel('Density') - - -def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, - figsize=None, sharex=True, sharey=True, layout=None, - rot=0, ax=None, **kwargs): - - if figsize == 'default': + ax.set_ylabel("Density") + + +def _grouped_plot( + plotf, + data, + column=None, + by=None, + numeric_only=True, + figsize=None, + sharex=True, + sharey=True, + layout=None, + rot=0, + ax=None, + **kwargs +): + + if figsize == "default": # allowed to specify mpl default with 'default' - warnings.warn("figsize='default' is deprecated. Specify figure " - "size by tuple instead", FutureWarning, stacklevel=5) + warnings.warn( + "figsize='default' is deprecated. 
Specify figure " "size by tuple instead", + FutureWarning, + stacklevel=5, + ) figsize = None grouped = data.groupby(by) @@ -158,9 +195,9 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, grouped = grouped[column] naxes = len(grouped) - fig, axes = _subplots(naxes=naxes, figsize=figsize, - sharex=sharex, sharey=sharey, ax=ax, - layout=layout) + fig, axes = _subplots( + naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout + ) _axes = _flatten(axes) @@ -174,10 +211,24 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, return fig, axes -def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, - layout=None, sharex=False, sharey=False, rot=90, grid=True, - xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, - **kwargs): +def _grouped_hist( + data, + column=None, + by=None, + ax=None, + bins=50, + figsize=None, + layout=None, + sharex=False, + sharey=False, + rot=90, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + **kwargs +): """ Grouped histogram @@ -200,76 +251,139 @@ def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, ------- collection of Matplotlib Axes """ + def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) converter._WARN = False # no warning for pandas plots xrot = xrot or rot - fig, axes = _grouped_plot(plot_group, data, column=column, - by=by, sharex=sharex, sharey=sharey, ax=ax, - figsize=figsize, layout=layout, rot=rot) - - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) - - fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, - hspace=0.5, wspace=0.3) + fig, axes = _grouped_plot( + plot_group, + data, + column=column, + by=by, + sharex=sharex, + sharey=sharey, + ax=ax, + figsize=figsize, + layout=layout, + rot=rot, + ) + + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + + fig.subplots_adjust( + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) return axes -def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, - bins=10, **kwds): +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds +): import matplotlib.pyplot as plt + if by is None: - if kwds.get('layout', None) is not None: - raise ValueError("The 'layout' keyword is not supported when " - "'by' is None") + if kwds.get("layout", None) is not None: + raise ValueError( + "The 'layout' keyword is not supported when " "'by' is None" + ) # hack until the plotting interface is a bit more unified - fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else - plt.figure(figsize=figsize)) - if (figsize is not None and tuple(figsize) != - tuple(fig.get_size_inches())): + fig = kwds.pop( + "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize) + ) + if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()): fig.set_size_inches(*figsize, forward=True) if ax is None: ax = fig.gca() elif ax.get_figure() != fig: - raise AssertionError('passed axis not bound to passed figure') + raise AssertionError("passed axis not bound to passed figure") values = self.dropna().values ax.hist(values, bins=bins, **kwds) ax.grid(grid) axes = np.array([ax]) - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, 
yrot=yrot) + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) else: - if 'figure' in kwds: - raise ValueError("Cannot pass 'figure' when using the " - "'by' argument, since a new 'Figure' instance " - "will be created") - axes = _grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, - bins=bins, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, **kwds) - - if hasattr(axes, 'ndim'): + if "figure" in kwds: + raise ValueError( + "Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance " + "will be created" + ) + axes = _grouped_hist( + self, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds + ) + + if hasattr(axes, "ndim"): if axes.ndim == 1 and len(axes) == 1: return axes[0] return axes -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds +): converter._WARN = False # no warning for pandas plots if by is not None: - axes = _grouped_hist(data, column=column, by=by, ax=ax, grid=grid, - figsize=figsize, sharex=sharex, sharey=sharey, - layout=layout, bins=bins, xlabelsize=xlabelsize, - xrot=xrot, ylabelsize=ylabelsize, - yrot=yrot, **kwds) + axes = _grouped_hist( + data, + column=column, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + sharex=sharex, + sharey=sharey, + layout=layout, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds + ) return axes if column is not None: @@ -280,12 +394,17 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, naxes = len(data.columns) if naxes == 0: - raise ValueError("hist method requires numerical columns, " - "nothing to plot.") - - fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, - sharex=sharex, sharey=sharey, figsize=figsize, - layout=layout) + raise ValueError("hist method requires numerical columns, " "nothing to plot.") + + fig, axes = _subplots( + naxes=naxes, + ax=ax, + squeeze=False, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) _axes = _flatten(axes) for i, col in enumerate(com.try_sort(data.columns)): @@ -294,8 +413,9 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, ax.set_title(col) ax.grid(grid) - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) fig.subplots_adjust(wspace=0.3, hspace=0.3) return axes diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 663a3c5153fac9..6d5a94c4d5ff8a 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -11,14 +11,23 @@ from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots -def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, - diagonal='hist', marker='.', density_kwds=None, - hist_kwds=None, range_padding=0.05, **kwds): +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + 
hist_kwds=None, + range_padding=0.05, + **kwds +): df = frame._get_numeric_data() n = df.columns.size naxes = n * n - fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, - squeeze=False) + fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) @@ -31,13 +40,13 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, density_kwds = density_kwds or {} # GH 14855 - kwds.setdefault('edgecolors', 'none') + kwds.setdefault("edgecolors", "none") boundaries_list = [] for a in df.columns: values = df[a].values[mask[a].values] rmin_, rmax_ = np.min(values), np.max(values) - rdelta_ext = (rmax_ - rmin_) * range_padding / 2. + rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0 boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) for i, a in enumerate(df.columns): @@ -48,11 +57,12 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, values = df[a].values[mask[a].values] # Deal with the diagonal by drawing a histogram there. - if diagonal == 'hist': + if diagonal == "hist": ax.hist(values, **hist_kwds) - elif diagonal in ('kde', 'density'): + elif diagonal in ("kde", "density"): from scipy.stats import gaussian_kde + y = values gkde = gaussian_kde(y) ind = np.linspace(y.min(), y.max(), 1000) @@ -63,8 +73,9 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, else: common = (mask[a] & mask[b]).values - ax.scatter(df[b][common], df[a][common], - marker=marker, alpha=alpha, **kwds) + ax.scatter( + df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds + ) ax.set_xlim(boundaries_list[j]) ax.set_ylim(boundaries_list[i]) @@ -99,7 +110,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, def _get_marker_compat(marker): if marker not in mlines.lineMarkers: - return 'o' + return "o" return marker @@ -120,16 +131,20 @@ def normalize(series): ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) to_plot = {} - colors = _get_standard_colors(num_colors=len(classes), colormap=colormap, - color_type='random', color=color) + colors = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) for kls in classes: to_plot[kls] = [[], []] m = len(frame.columns) - 1 - s = np.array([(np.cos(t), np.sin(t)) - for t in [2.0 * np.pi * (i / float(m)) - for i in range(m)]]) + s = np.array( + [ + (np.cos(t), np.sin(t)) + for t in [2.0 * np.pi * (i / float(m)) for i in range(m)] + ] + ) for i in range(n): row = df.iloc[i].values @@ -140,35 +155,50 @@ def normalize(series): to_plot[kls][1].append(y[1]) for i, kls in enumerate(classes): - ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i], - label=pprint_thing(kls), **kwds) + ax.scatter( + to_plot[kls][0], + to_plot[kls][1], + color=colors[i], + label=pprint_thing(kls), + **kwds + ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) + ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray')) + ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: - ax.text(xy[0] - 0.025, xy[1] - 0.025, name, - ha='right', va='top', size='small') + ax.text( + xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" + ) elif xy[0] < 0.0 and xy[1] >= 0.0: - ax.text(xy[0] - 0.025, xy[1] + 0.025, name, - ha='right', va='bottom', size='small') + ax.text( + xy[0] - 
0.025, + xy[1] + 0.025, + name, + ha="right", + va="bottom", + size="small", + ) elif xy[0] >= 0.0 and xy[1] < 0.0: - ax.text(xy[0] + 0.025, xy[1] - 0.025, name, - ha='left', va='top', size='small') + ax.text( + xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" + ) elif xy[0] >= 0.0 and xy[1] >= 0.0: - ax.text(xy[0] + 0.025, xy[1] + 0.025, name, - ha='left', va='bottom', size='small') + ax.text( + xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small" + ) - ax.axis('equal') + ax.axis("equal") return ax -def andrews_curves(frame, class_column, ax=None, samples=200, color=None, - colormap=None, **kwds): +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds +): import matplotlib.pyplot as plt def function(amplitudes): @@ -187,10 +217,13 @@ def f(t): harmonics = np.arange(0, coeffs.shape[0]) + 1 trig_args = np.outer(harmonics, t) - result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) + - coeffs[:, 1, np.newaxis] * np.cos(trig_args), - axis=0) + result += np.sum( + coeffs[:, 0, np.newaxis] * np.sin(trig_args) + + coeffs[:, 1, np.newaxis] * np.cos(trig_args), + axis=0, + ) return result + return f n = len(frame) @@ -200,9 +233,9 @@ def f(t): t = np.linspace(-np.pi, np.pi, samples) used_legends = set() - color_values = _get_standard_colors(num_colors=len(classes), - colormap=colormap, color_type='random', - color=color) + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) colors = dict(zip(classes, color_values)) if ax is None: ax = plt.gca(xlim=(-np.pi, np.pi)) @@ -218,7 +251,7 @@ def f(t): else: ax.plot(t, y, color=colors[kls], **kwds) - ax.legend(loc='upper right') + ax.legend(loc="upper right") ax.grid() return ax @@ -226,14 +259,16 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): import matplotlib.pyplot as plt + # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] means = np.array([np.mean(sampling) for sampling in samplings]) medians = np.array([np.median(sampling) for sampling in samplings]) - midranges = np.array([(min(sampling) + max(sampling)) * 0.5 - for sampling in samplings]) + midranges = np.array( + [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings] + ) if fig is None: fig = plt.figure() x = list(range(samples)) @@ -268,13 +303,24 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): return fig -def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, - use_columns=False, xticks=None, colormap=None, - axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds): +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwds +): import matplotlib.pyplot as plt + if axvlines_kwds is None: - axvlines_kwds = {'linewidth': 1, 'color': 'black'} + axvlines_kwds = {"linewidth": 1, "color": "black"} n = len(frame) classes = frame[class_column].drop_duplicates() @@ -292,13 +338,13 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, # determine values to use for xticks if use_columns is True: if not np.all(np.isreal(list(df.columns))): - raise ValueError('Columns must be numeric to be used as xticks') + raise ValueError("Columns must be numeric to be used as xticks") x = 
df.columns elif xticks is not None: if not np.all(np.isreal(xticks)): - raise ValueError('xticks specified must be numeric') + raise ValueError("xticks specified must be numeric") elif len(xticks) != ncols: - raise ValueError('Length of xticks must match number of columns') + raise ValueError("Length of xticks must match number of columns") x = xticks else: x = list(range(ncols)) @@ -306,9 +352,9 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, if ax is None: ax = plt.gca() - color_values = _get_standard_colors(num_colors=len(classes), - colormap=colormap, color_type='random', - color=color) + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) if sort_labels: classes = sorted(classes) @@ -332,7 +378,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, ax.set_xticks(x) ax.set_xticklabels(df.columns) ax.set_xlim(x[0], x[-1]) - ax.legend(loc='upper right') + ax.legend(loc="upper right") ax.grid() return ax @@ -340,7 +386,8 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, def lag_plot(series, lag=1, ax=None, **kwds): # workaround because `c='b'` is hardcoded in matplotlibs scatter method import matplotlib.pyplot as plt - kwds.setdefault('c', plt.rcParams['patch.facecolor']) + + kwds.setdefault("c", plt.rcParams["patch.facecolor"]) data = series.values y1 = data[:-lag] @@ -364,21 +411,21 @@ def autocorrelation_plot(series, ax=None, **kwds): c0 = np.sum((data - mean) ** 2) / float(n) def r(h): - return ((data[:n - h] - mean) * - (data[h:] - mean)).sum() / float(n) / c0 + return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + x = np.arange(n) + 1 y = [r(loc) for loc in x] z95 = 1.959963984540054 z99 = 2.5758293035489004 - ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey') - ax.axhline(y=z95 / np.sqrt(n), color='grey') - ax.axhline(y=0.0, color='black') - ax.axhline(y=-z95 / np.sqrt(n), color='grey') - ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey') + ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey") + ax.axhline(y=z95 / np.sqrt(n), color="grey") + ax.axhline(y=0.0, color="black") + ax.axhline(y=-z95 / np.sqrt(n), color="grey") + ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey") ax.set_xlabel("Lag") ax.set_ylabel("Autocorrelation") ax.plot(x, y, **kwds) - if 'label' in kwds: + if "label" in kwds: ax.legend() ax.grid() return ax diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 8c9e3ea330dd30..e1bba5856e271b 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -10,9 +10,11 @@ import pandas.core.common as com -def _get_standard_colors(num_colors=None, colormap=None, color_type='default', - color=None): +def _get_standard_colors( + num_colors=None, colormap=None, color_type="default", color=None +): import matplotlib.pyplot as plt + if color is None and colormap is not None: if isinstance(colormap, str): cmap = colormap @@ -22,24 +24,23 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] elif color is not None: if colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") + warnings.warn( + "'color' and 'colormap' cannot be used " "simultaneously. 
Using 'color'" + ) colors = list(color) if is_list_like(color) else color else: - if color_type == 'default': + if color_type == "default": # need to call list() on the result to copy so we don't # modify the global rcParams below try: - colors = [c['color'] - for c in list(plt.rcParams['axes.prop_cycle'])] + colors = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] except KeyError: - colors = list(plt.rcParams.get('axes.color_cycle', - list('bgrcmyk'))) + colors = list(plt.rcParams.get("axes.color_cycle", list("bgrcmyk"))) if isinstance(colors, str): colors = list(colors) colors = colors[0:num_colors] - elif color_type == 'random': + elif color_type == "random": def random_color(column): """ Returns a random color represented as a list of length 3""" @@ -66,8 +67,7 @@ def _maybe_valid_colors(colors): # check whether each character can be convertible to colors maybe_color_cycle = _maybe_valid_colors(list(colors)) if maybe_single_color and maybe_color_cycle and len(colors) > 1: - hex_color = [c['color'] - for c in list(plt.rcParams['axes.prop_cycle'])] + hex_color = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] colors = [hex_color[int(colors[1])]] elif maybe_single_color: colors = [colors] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index c3b548a6dfa855..f3fcb090e98837 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -6,16 +6,26 @@ import numpy as np from pandas._libs.tslibs.frequencies import ( - FreqGroup, get_base_alias, get_freq, is_subperiod, is_superperiod) + FreqGroup, + get_base_alias, + get_freq, + is_subperiod, + is_superperiod, +) from pandas._libs.tslibs.period import Period from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCPeriodIndex, ABCTimedeltaIndex) + ABCDatetimeIndex, + ABCPeriodIndex, + ABCTimedeltaIndex, +) from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.converter import ( - TimeSeries_DateFormatter, TimeSeries_DateLocator, - TimeSeries_TimedeltaFormatter) + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + TimeSeries_TimedeltaFormatter, +) import pandas.tseries.frequencies as frequencies from pandas.tseries.offsets import DateOffset @@ -41,9 +51,13 @@ def tsplot(series, plotf, ax=None, **kwargs): Use Series.plot() instead """ import matplotlib.pyplot as plt - warnings.warn("'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, stacklevel=2) + + warnings.warn( + "'tsplot' is deprecated and will be removed in a " + "future version. 
Please use Series.plot() instead.", + FutureWarning, + stacklevel=2, + ) # Used inferred freq is possible, need a test case for inferred if ax is None: @@ -66,7 +80,7 @@ def _maybe_resample(series, ax, kwargs): freq, ax_freq = _get_freq(ax, series) if freq is None: # pragma: no cover - raise ValueError('Cannot use dynamic axis without frequency info') + raise ValueError("Cannot use dynamic axis without frequency info") # Convert DatetimeIndex to PeriodIndex if isinstance(series.index, ABCDatetimeIndex): @@ -75,28 +89,30 @@ def _maybe_resample(series, ax, kwargs): if ax_freq is not None and freq != ax_freq: if is_superperiod(freq, ax_freq): # upsample input series = series.copy() - series.index = series.index.asfreq(ax_freq, how='s') + series.index = series.index.asfreq(ax_freq, how="s") freq = ax_freq elif _is_sup(freq, ax_freq): # one is weekly - how = kwargs.pop('how', 'last') - series = getattr(series.resample('D'), how)().dropna() + how = kwargs.pop("how", "last") + series = getattr(series.resample("D"), how)().dropna() series = getattr(series.resample(ax_freq), how)().dropna() freq = ax_freq elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): _upsample_others(ax, freq, kwargs) else: # pragma: no cover - raise ValueError('Incompatible frequency conversion') + raise ValueError("Incompatible frequency conversion") return freq, series def _is_sub(f1, f2): - return ((f1.startswith('W') and is_subperiod('D', f2)) or - (f2.startswith('W') and is_subperiod(f1, 'D'))) + return (f1.startswith("W") and is_subperiod("D", f2)) or ( + f2.startswith("W") and is_subperiod(f1, "D") + ) def _is_sup(f1, f2): - return ((f1.startswith('W') and is_superperiod('D', f2)) or - (f2.startswith('W') and is_superperiod(f1, 'D'))) + return (f1.startswith("W") and is_superperiod("D", f2)) or ( + f2.startswith("W") and is_superperiod(f1, "D") + ) def _upsample_others(ax, freq, kwargs): @@ -105,9 +121,9 @@ def _upsample_others(ax, freq, kwargs): _replot_ax(ax, freq, kwargs) other_ax = None - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): other_ax = ax.left_ax - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): other_ax = ax.right_ax if other_ax is not None: @@ -115,16 +131,15 @@ def _upsample_others(ax, freq, kwargs): lines.extend(rlines) labels.extend(rlabels) - if (legend is not None and kwargs.get('legend', True) and - len(lines) > 0): + if legend is not None and kwargs.get("legend", True) and len(lines) > 0: title = legend.get_title().get_text() - if title == 'None': + if title == "None": title = None - ax.legend(lines, labels, loc='best', title=title) + ax.legend(lines, labels, loc="best", title=title) def _replot_ax(ax, freq, kwargs): - data = getattr(ax, '_plot_data', None) + data = getattr(ax, "_plot_data", None) # clear current axes and data ax._plot_data = [] @@ -137,17 +152,17 @@ def _replot_ax(ax, freq, kwargs): if data is not None: for series, plotf, kwds in data: series = series.copy() - idx = series.index.asfreq(freq, how='S') + idx = series.index.asfreq(freq, how="S") series.index = idx ax._plot_data.append((series, plotf, kwds)) # for tsplot if isinstance(plotf, str): from pandas.plotting._matplotlib import PLOT_CLASSES + plotf = PLOT_CLASSES[plotf]._plot - lines.append(plotf(ax, series.index._mpl_repr(), - series.values, **kwds)[0]) + lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) labels.append(pprint_thing(series.name)) return lines, labels @@ -155,16 +170,16 @@ def _replot_ax(ax, freq, kwargs): def _decorate_axes(ax, freq, kwargs): 
"""Initialize axes for time-series plotting""" - if not hasattr(ax, '_plot_data'): + if not hasattr(ax, "_plot_data"): ax._plot_data = [] ax.freq = freq xaxis = ax.get_xaxis() xaxis.freq = freq - if not hasattr(ax, 'legendlabels'): - ax.legendlabels = [kwargs.get('label', None)] + if not hasattr(ax, "legendlabels"): + ax.legendlabels = [kwargs.get("label", None)] else: - ax.legendlabels.append(kwargs.get('label', None)) + ax.legendlabels.append(kwargs.get("label", None)) ax.view_interval = None ax.date_axis_info = None @@ -175,19 +190,19 @@ def _get_ax_freq(ax): Also checks shared axes (eg when using secondary yaxis, sharex=True or twinx) """ - ax_freq = getattr(ax, 'freq', None) + ax_freq = getattr(ax, "freq", None) if ax_freq is None: # check for left/right ax in case of secondary yaxis - if hasattr(ax, 'left_ax'): - ax_freq = getattr(ax.left_ax, 'freq', None) - elif hasattr(ax, 'right_ax'): - ax_freq = getattr(ax.right_ax, 'freq', None) + if hasattr(ax, "left_ax"): + ax_freq = getattr(ax.left_ax, "freq", None) + elif hasattr(ax, "right_ax"): + ax_freq = getattr(ax.right_ax, "freq", None) if ax_freq is None: # check if a shared ax (sharex/twinx) has already freq set shared_axes = ax.get_shared_x_axes().get_siblings(ax) if len(shared_axes) > 1: for shared_ax in shared_axes: - ax_freq = getattr(shared_ax, 'freq', None) + ax_freq = getattr(shared_ax, "freq", None) if ax_freq is not None: break return ax_freq @@ -195,9 +210,9 @@ def _get_ax_freq(ax): def _get_freq(ax, series): # get frequency from data - freq = getattr(series.index, 'freq', None) + freq = getattr(series.index, "freq", None) if freq is None: - freq = getattr(series.index, 'inferred_freq', None) + freq = getattr(series.index, "inferred_freq", None) ax_freq = _get_ax_freq(ax) @@ -241,17 +256,17 @@ def _use_dynamic_x(ax, data): if isinstance(data.index, ABCDatetimeIndex): base = get_freq(freq) x = data.index - if (base <= FreqGroup.FR_DAY): + if base <= FreqGroup.FR_DAY: return x[:1].is_normalized return Period(x[0], freq).to_timestamp(tz=x.tz) == x[0] return True def _get_index_freq(data): - freq = getattr(data.index, 'freq', None) + freq = getattr(data.index, "freq", None) if freq is None: - freq = getattr(data.index, 'inferred_freq', None) - if freq == 'B': + freq = getattr(data.index, "inferred_freq", None) + if freq == "B": weekdays = np.unique(data.index.dayofweek) if (5 in weekdays) or (6 in weekdays): freq = None @@ -262,10 +277,10 @@ def _maybe_convert_index(ax, data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): - freq = getattr(data.index, 'freq', None) + freq = getattr(data.index, "freq", None) if freq is None: - freq = getattr(data.index, 'inferred_freq', None) + freq = getattr(data.index, "inferred_freq", None) if isinstance(freq, DateOffset): freq = freq.rule_code @@ -273,7 +288,7 @@ def _maybe_convert_index(ax, data): freq = _get_ax_freq(ax) if freq is None: - raise ValueError('Could not get frequency alias for plotting') + raise ValueError("Could not get frequency alias for plotting") freq = get_base_alias(freq) freq = frequencies.get_period_alias(freq) @@ -288,6 +303,7 @@ def _maybe_convert_index(ax, data): # Patch methods for subplot. Only format_dateaxis is currently used. # Do we need the rest for convenience? 
+ def format_timedelta_ticks(x, pos, n_decimals): """ Convert seconds to 'D days HH:MM:SS.F' @@ -296,12 +312,12 @@ def format_timedelta_ticks(x, pos, n_decimals): m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) - decimals = int(ns * 10**(n_decimals - 9)) - s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) if n_decimals > 0: - s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) if d != 0: - s = '{:d} days '.format(int(d)) + s + s = "{:d} days ".format(int(d)) + s return s @@ -325,21 +341,21 @@ def format_dateaxis(subplot, freq, index): # interface. DatetimeIndex uses matplotlib.date directly if isinstance(index, ABCPeriodIndex): - majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=False, - plot_obj=subplot) - minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=True, - plot_obj=subplot) + majlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) subplot.xaxis.set_major_locator(majlocator) subplot.xaxis.set_minor_locator(minlocator) - majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, - minor_locator=False, - plot_obj=subplot) - minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, - minor_locator=True, - plot_obj=subplot) + majformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) subplot.xaxis.set_major_formatter(majformatter) subplot.xaxis.set_minor_formatter(minformatter) @@ -347,9 +363,8 @@ def format_dateaxis(subplot, freq, index): subplot.format_coord = functools.partial(_format_coord, freq) elif isinstance(index, ABCTimedeltaIndex): - subplot.xaxis.set_major_formatter( - TimeSeries_TimedeltaFormatter()) + subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter()) else: - raise TypeError('index type not supported') + raise TypeError("index type not supported") pylab.draw_if_interactive() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index acb5ab7b8e04be..8472eb3a3d887d 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -14,7 +14,7 @@ def format_date_labels(ax, rot): # mini version of autofmt_xdate try: for label in ax.get_xticklabels(): - label.set_ha('right') + label.set_ha("right") label.set_rotation(rot) fig = ax.get_figure() fig.subplots_adjust(bottom=0.2) @@ -28,7 +28,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): elif isinstance(data, ABCDataFrame): pass else: - raise ValueError('Input data must be DataFrame or Series') + raise ValueError("Input data must be DataFrame or Series") if rowLabels is None: rowLabels = data.index @@ -38,16 +38,16 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): cellText = data.values - table = matplotlib.table.table(ax, cellText=cellText, - rowLabels=rowLabels, - colLabels=colLabels, **kwargs) + table = matplotlib.table.table( + ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs + ) return table -def _get_layout(nplots, layout=None, layout_type='box'): +def _get_layout(nplots, layout=None, layout_type="box"): if layout 
is not None: if not isinstance(layout, (tuple, list)) or len(layout) != 2: - raise ValueError('Layout must be a tuple of (rows, columns)') + raise ValueError("Layout must be a tuple of (rows, columns)") nrows, ncols = layout @@ -62,17 +62,20 @@ def _get_layout(nplots, layout=None, layout_type='box'): raise ValueError(msg) if nrows * ncols < nplots: - raise ValueError('Layout of {nrows}x{ncols} must be larger ' - 'than required size {nplots}'.format( - nrows=nrows, ncols=ncols, nplots=nplots)) + raise ValueError( + "Layout of {nrows}x{ncols} must be larger " + "than required size {nplots}".format( + nrows=nrows, ncols=ncols, nplots=nplots + ) + ) return layout - if layout_type == 'single': + if layout_type == "single": return (1, 1) - elif layout_type == 'horizontal': + elif layout_type == "horizontal": return (1, nplots) - elif layout_type == 'vertical': + elif layout_type == "vertical": return (nplots, 1) layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)} @@ -88,12 +91,21 @@ def _get_layout(nplots, layout=None, layout_type='box'): else: return k, k + # copied from matplotlib/pyplot.py and modified for pandas.plotting -def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, - subplot_kw=None, ax=None, layout=None, layout_type='box', - **fig_kw): +def _subplots( + naxes=None, + sharex=False, + sharey=False, + squeeze=True, + subplot_kw=None, + ax=None, + layout=None, + layout_type="box", + **fig_kw +): """Create a figure with a set of subplots already made. This utility wrapper makes it convenient to create common layouts of @@ -168,6 +180,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ import matplotlib.pyplot as plt + if subplot_kw is None: subplot_kw = {} @@ -177,19 +190,26 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if is_list_like(ax): ax = _flatten(ax) if layout is not None: - warnings.warn("When passing multiple axes, layout keyword is " - "ignored", UserWarning) + warnings.warn( + "When passing multiple axes, layout keyword is " "ignored", + UserWarning, + ) if sharex or sharey: - warnings.warn("When passing multiple axes, sharex and sharey " - "are ignored. These settings must be specified " - "when creating axes", UserWarning, - stacklevel=4) + warnings.warn( + "When passing multiple axes, sharex and sharey " + "are ignored. 
These settings must be specified " + "when creating axes", + UserWarning, + stacklevel=4, + ) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax else: - raise ValueError("The number of passed axes must be {0}, the " - "same as the output plot".format(naxes)) + raise ValueError( + "The number of passed axes must be {0}, the " + "same as the output plot".format(naxes) + ) fig = ax.get_figure() # if ax is passed and a number of subplots is 1, return ax as it is @@ -199,9 +219,12 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, else: return fig, _flatten(ax) else: - warnings.warn("To output multiple subplots, the figure containing " - "the passed axes is being cleared", UserWarning, - stacklevel=4) + warnings.warn( + "To output multiple subplots, the figure containing " + "the passed axes is being cleared", + UserWarning, + stacklevel=4, + ) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) @@ -215,9 +238,9 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) if sharex: - subplot_kw['sharex'] = ax0 + subplot_kw["sharex"] = ax0 if sharey: - subplot_kw['sharey'] = ax0 + subplot_kw["sharey"] = ax0 axarr[0] = ax0 # Note off-by-one counting because add_subplot uses the MATLAB 1-based @@ -228,8 +251,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, # interfere with proper axis limits on the visible axes if # they share axes e.g. issue #7528 if i >= naxes: - kwds['sharex'] = None - kwds['sharey'] = None + kwds["sharex"] = None + kwds["sharey"] = None ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) axarr[i] = ax @@ -264,10 +287,10 @@ def _remove_labels_from_axis(axis): if isinstance(axis.get_minor_locator(), ticker.NullLocator): axis.set_minor_locator(ticker.AutoLocator()) if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter('')) + axis.set_minor_formatter(ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) - except Exception: # pragma no cover + except Exception: # pragma no cover raise axis.get_label().set_visible(False) @@ -289,8 +312,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): # the last in the column, because below is no subplot/gap. 
if not layout[ax.rowNum + 1, ax.colNum]: continue - if sharex or len(ax.get_shared_x_axes() - .get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) except IndexError: @@ -299,8 +321,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): for ax in axarr: if ax.is_last_row(): continue - if sharex or len(ax.get_shared_x_axes() - .get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) if ncols > 1: @@ -325,10 +346,10 @@ def _flatten(axes): def _get_all_lines(ax): lines = ax.get_lines() - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): lines += ax.right_ax.get_lines() - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): lines += ax.left_ax.get_lines() return lines @@ -343,9 +364,9 @@ def _get_xlim(lines): return left, right -def _set_ticks_props(axes, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None): +def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): import matplotlib.pyplot as plt + for ax in _flatten(axes): if xlabelsize is not None: plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index f240faf45dfce5..435562f7d12626 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -25,8 +25,9 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): matplotlib table object """ plot_backend = _get_plot_backend() - return plot_backend.table(ax=ax, data=data, rowLabels=None, colLabels=None, - **kwargs) + return plot_backend.table( + ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs + ) def register(explicit=True): @@ -70,9 +71,19 @@ def deregister(): plot_backend.deregister() -def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, - diagonal='hist', marker='.', density_kwds=None, - hist_kwds=None, range_padding=0.05, **kwds): +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + hist_kwds=None, + range_padding=0.05, + **kwds +): """ Draw a matrix of scatter plots. 
@@ -115,9 +126,18 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, """ plot_backend = _get_plot_backend() return plot_backend.scatter_matrix( - frame=frame, alpha=alpha, figsize=figsize, ax=ax, grid=grid, - diagonal=diagonal, marker=marker, density_kwds=density_kwds, - hist_kwds=hist_kwds, range_padding=range_padding, **kwds) + frame=frame, + alpha=alpha, + figsize=figsize, + ax=ax, + grid=grid, + diagonal=diagonal, + marker=marker, + density_kwds=density_kwds, + hist_kwds=hist_kwds, + range_padding=range_padding, + **kwds + ) def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): @@ -183,13 +203,20 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): >>> rad_viz = pd.plotting.radviz(df, 'Category') # doctest: +SKIP """ plot_backend = _get_plot_backend() - return plot_backend.radviz(frame=frame, class_column=class_column, ax=ax, - color=color, colormap=colormap, **kwds) - - -@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') -def andrews_curves(frame, class_column, ax=None, samples=200, color=None, - colormap=None, **kwds): + return plot_backend.radviz( + frame=frame, + class_column=class_column, + ax=ax, + color=color, + colormap=colormap, + **kwds + ) + + +@deprecate_kwarg(old_arg_name="data", new_arg_name="frame") +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds +): """ Generate a matplotlib plot of Andrews curves, for visualising clusters of multivariate data. @@ -223,9 +250,15 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, class:`matplotlip.axis.Axes` """ plot_backend = _get_plot_backend() - return plot_backend.andrews_curves(frame=frame, class_column=class_column, - ax=ax, samples=samples, color=color, - colormap=colormap, **kwds) + return plot_backend.andrews_curves( + frame=frame, + class_column=class_column, + ax=ax, + samples=samples, + color=color, + colormap=colormap, + **kwds + ) def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): @@ -275,16 +308,27 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP """ plot_backend = _get_plot_backend() - return plot_backend.bootstrap_plot(series=series, fig=fig, size=size, - samples=samples, **kwds) - - -@deprecate_kwarg(old_arg_name='colors', new_arg_name='color') -@deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3) -def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, - use_columns=False, xticks=None, colormap=None, - axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds): + return plot_backend.bootstrap_plot( + series=series, fig=fig, size=size, samples=samples, **kwds + ) + + +@deprecate_kwarg(old_arg_name="colors", new_arg_name="color") +@deprecate_kwarg(old_arg_name="data", new_arg_name="frame", stacklevel=3) +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwds +): """Parallel coordinates plotting. 
Parameters @@ -332,10 +376,19 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, """ plot_backend = _get_plot_backend() return plot_backend.parallel_coordinates( - frame=frame, class_column=class_column, cols=cols, ax=ax, color=color, - use_columns=use_columns, xticks=xticks, colormap=colormap, - axvlines=axvlines, axvlines_kwds=axvlines_kwds, - sort_labels=sort_labels, **kwds) + frame=frame, + class_column=class_column, + cols=cols, + ax=ax, + color=color, + use_columns=use_columns, + xticks=xticks, + colormap=colormap, + axvlines=axvlines, + axvlines_kwds=axvlines_kwds, + sort_labels=sort_labels, + **kwds + ) def lag_plot(series, lag=1, ax=None, **kwds): @@ -392,9 +445,12 @@ def tsplot(series, plotf, ax=None, **kwargs): .. deprecated:: 0.23.0 Use Series.plot() instead """ - warnings.warn("'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "'tsplot' is deprecated and will be removed in a " + "future version. Please use Series.plot() instead.", + FutureWarning, + stacklevel=2, + ) plot_backend = _get_plot_backend() return plot_backend.tsplot(series=series, plotf=plotf, ax=ax, **kwargs) @@ -408,19 +464,20 @@ class _Options(dict): """ # alias so the names are same as plotting method parameter names - _ALIASES = {'x_compat': 'xaxis.compat'} - _DEFAULT_KEYS = ['xaxis.compat'] + _ALIASES = {"x_compat": "xaxis.compat"} + _DEFAULT_KEYS = ["xaxis.compat"] def __init__(self, deprecated=False): self._deprecated = deprecated # self['xaxis.compat'] = False - super().__setitem__('xaxis.compat', False) + super().__setitem__("xaxis.compat", False) def __getitem__(self, key): key = self._get_canonical_key(key) if key not in self: raise ValueError( - '{key} is not a valid pandas plotting option'.format(key=key)) + "{key} is not a valid pandas plotting option".format(key=key) + ) return super().__getitem__(key) def __setitem__(self, key, value): @@ -430,8 +487,7 @@ def __setitem__(self, key, value): def __delitem__(self, key): key = self._get_canonical_key(key) if key in self._DEFAULT_KEYS: - raise ValueError( - 'Cannot remove default parameter {key}'.format(key=key)) + raise ValueError("Cannot remove default parameter {key}".format(key=key)) return super().__delitem__(key) def __contains__(self, key): diff --git a/pandas/testing.py b/pandas/testing.py index dbea1ecc7362a7..acae47367d9977 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -5,4 +5,7 @@ """ from pandas.util.testing import ( - assert_frame_equal, assert_index_equal, assert_series_equal) + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 614e3172d9d48e..326bef7f4b480b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -4,13 +4,12 @@ class Base: - def check(self, namespace, expected, ignored=None): # see which names are in the namespace, minus optional # ignored ones # compare vs the expected - result = sorted(f for f in dir(namespace) if not f.startswith('__')) + result = sorted(f for f in dir(namespace) if not f.startswith("__")) if ignored is not None: result = sorted(list(set(result) - set(ignored))) @@ -22,33 +21,74 @@ class TestPDApi(Base): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'locale', 'conftest'] + ignored = ["tests", "locale", "conftest"] # top-level sub-packages - lib = ['api', 'arrays', 'compat', 'core', 'errors', 
'pandas', - 'plotting', 'test', 'testing', 'tseries', - 'util', 'options', 'io'] + lib = [ + "api", + "arrays", + "compat", + "core", + "errors", + "pandas", + "plotting", + "test", + "testing", + "tseries", + "util", + "options", + "io", + ] # these are already deprecated; awaiting removal deprecated_modules = [] # misc - misc = ['IndexSlice', 'NaT'] + misc = ["IndexSlice", "NaT"] # top-level classes - classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', - 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', - 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', - 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', - 'Series', 'SparseArray', 'SparseDataFrame', 'SparseDtype', - 'SparseSeries', 'Timedelta', - 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex', - 'CategoricalDtype', 'PeriodDtype', 'IntervalDtype', - 'DatetimeTZDtype', - 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', - 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', - 'NamedAgg', - ] + classes = [ + "Categorical", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "ExcelFile", + "ExcelWriter", + "Float64Index", + "Grouper", + "HDFStore", + "Index", + "Int64Index", + "MultiIndex", + "Period", + "PeriodIndex", + "RangeIndex", + "UInt64Index", + "Series", + "SparseArray", + "SparseDataFrame", + "SparseDtype", + "SparseSeries", + "Timedelta", + "TimedeltaIndex", + "Timestamp", + "Interval", + "IntervalIndex", + "CategoricalDtype", + "PeriodDtype", + "IntervalDtype", + "DatetimeTZDtype", + "Int8Dtype", + "Int16Dtype", + "Int32Dtype", + "Int64Dtype", + "UInt8Dtype", + "UInt16Dtype", + "UInt32Dtype", + "UInt64Dtype", + "NamedAgg", + ] if not compat.PY37: classes.append("Panel") @@ -59,35 +99,77 @@ class TestPDApi(Base): deprecated_classes_in_future = [] # external modules exposed in pandas namespace - modules = ['np', 'datetime'] + modules = ["np", "datetime"] # top-level functions - funcs = ['array', 'bdate_range', 'concat', 'crosstab', 'cut', - 'date_range', 'interval_range', 'eval', - 'factorize', 'get_dummies', - 'infer_freq', 'isna', 'isnull', 'lreshape', - 'melt', 'notna', 'notnull', 'offsets', - 'merge', 'merge_ordered', 'merge_asof', - 'period_range', - 'pivot', 'pivot_table', 'qcut', - 'show_versions', 'timedelta_range', 'unique', - 'value_counts', 'wide_to_long'] + funcs = [ + "array", + "bdate_range", + "concat", + "crosstab", + "cut", + "date_range", + "interval_range", + "eval", + "factorize", + "get_dummies", + "infer_freq", + "isna", + "isnull", + "lreshape", + "melt", + "notna", + "notnull", + "offsets", + "merge", + "merge_ordered", + "merge_asof", + "period_range", + "pivot", + "pivot_table", + "qcut", + "show_versions", + "timedelta_range", + "unique", + "value_counts", + "wide_to_long", + ] # top-level option funcs - funcs_option = ['reset_option', 'describe_option', 'get_option', - 'option_context', 'set_option', - 'set_eng_float_format'] + funcs_option = [ + "reset_option", + "describe_option", + "get_option", + "option_context", + "set_option", + "set_eng_float_format", + ] # top-level read_* funcs - funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf', - 'read_gbq', 'read_hdf', 'read_html', 'read_json', - 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', - 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet', 'read_spss'] + funcs_read = [ + "read_clipboard", + "read_csv", + "read_excel", + "read_fwf", + "read_gbq", + "read_hdf", + "read_html", + "read_json", + 
"read_msgpack", + "read_pickle", + "read_sas", + "read_sql", + "read_sql_query", + "read_sql_table", + "read_stata", + "read_table", + "read_feather", + "read_parquet", + "read_spss", + ] # top-level to_* funcs - funcs_to = ['to_datetime', 'to_msgpack', - 'to_numeric', 'to_pickle', 'to_timedelta'] + funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future deprecated_funcs_in_future = [] @@ -96,28 +178,45 @@ class TestPDApi(Base): deprecated_funcs = [] # private modules in pandas namespace - private_modules = ['_config', '_hashtable', '_lib', '_libs', - '_np_version_under1p14', '_np_version_under1p15', - '_np_version_under1p16', '_np_version_under1p17', - '_tslib', '_typing', '_version'] + private_modules = [ + "_config", + "_hashtable", + "_lib", + "_libs", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_tslib", + "_typing", + "_version", + ] def test_api(self): - self.check(pd, - self.lib + self.misc + - self.modules + self.deprecated_modules + - self.classes + self.deprecated_classes + - self.deprecated_classes_in_future + - self.funcs + self.funcs_option + - self.funcs_read + self.funcs_to + - self.deprecated_funcs_in_future + - self.deprecated_funcs + self.private_modules, - self.ignored) + self.check( + pd, + self.lib + + self.misc + + self.modules + + self.deprecated_modules + + self.classes + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.funcs + + self.funcs_option + + self.funcs_read + + self.funcs_to + + self.deprecated_funcs_in_future + + self.deprecated_funcs + + self.private_modules, + self.ignored, + ) class TestApi(Base): - allowed = ['types', 'extensions'] + allowed = ["types", "extensions"] def test_api(self): @@ -126,10 +225,10 @@ def test_api(self): class TestTesting(Base): - funcs = ['assert_frame_equal', 'assert_series_equal', - 'assert_index_equal'] + funcs = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] def test_testing(self): from pandas import testing + self.check(testing, self.funcs) diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index d6090225c0a7f7..e2ff77715e3012 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -6,28 +6,53 @@ class TestTypes(Base): - allowed = ['is_bool', 'is_bool_dtype', - 'is_categorical', 'is_categorical_dtype', 'is_complex', - 'is_complex_dtype', 'is_datetime64_any_dtype', - 'is_datetime64_dtype', 'is_datetime64_ns_dtype', - 'is_datetime64tz_dtype', 'is_dtype_equal', - 'is_extension_type', 'is_float', 'is_float_dtype', - 'is_int64_dtype', 'is_integer', - 'is_integer_dtype', 'is_number', 'is_numeric_dtype', - 'is_object_dtype', 'is_scalar', 'is_sparse', - 'is_string_dtype', 'is_signed_integer_dtype', - 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', - 'is_unsigned_integer_dtype', - 'is_period_dtype', 'is_interval', 'is_interval_dtype', - 'is_re', 'is_re_compilable', - 'is_dict_like', 'is_iterator', 'is_file_like', - 'is_list_like', 'is_hashable', 'is_array_like', - 'is_named_tuple', - 'pandas_dtype', 'union_categoricals', 'infer_dtype', - 'is_extension_array_dtype'] - deprecated = ['is_period', 'is_datetimetz'] - dtypes = ['CategoricalDtype', 'DatetimeTZDtype', - 'PeriodDtype', 'IntervalDtype'] + allowed = [ + "is_bool", + "is_bool_dtype", + "is_categorical", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + 
"is_datetime64_ns_dtype", + "is_datetime64tz_dtype", + "is_dtype_equal", + "is_extension_type", + "is_float", + "is_float_dtype", + "is_int64_dtype", + "is_integer", + "is_integer_dtype", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_scalar", + "is_sparse", + "is_string_dtype", + "is_signed_integer_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "is_period_dtype", + "is_interval", + "is_interval_dtype", + "is_re", + "is_re_compilable", + "is_dict_like", + "is_iterator", + "is_file_like", + "is_list_like", + "is_hashable", + "is_array_like", + "is_named_tuple", + "pandas_dtype", + "union_categoricals", + "infer_dtype", + "is_extension_array_dtype", + ] + deprecated = ["is_period", "is_datetimetz"] + dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] def test_types(self): @@ -36,6 +61,5 @@ def test_types(self): def test_deprecated_from_api_types(self): for t in self.deprecated: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): getattr(types, t)(1) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index c6547c32f3ce72..c67a67bb31d625 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -11,24 +11,26 @@ def id_func(x): if isinstance(x, tuple): assert len(x) == 2 - return x[0].__name__ + '-' + str(x[1]) + return x[0].__name__ + "-" + str(x[1]) else: return x.__name__ # ------------------------------------------------------------------ + @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): # zero-dim integer array behaves like an integer return request.param -zeros = [box_cls([0] * 5, dtype=dtype) - for box_cls in [pd.Index, np.array] - for dtype in [np.int64, np.uint64, np.float64]] -zeros.extend([np.array(0, dtype=dtype) - for dtype in [np.int64, np.uint64, np.float64]]) +zeros = [ + box_cls([0] * 5, dtype=dtype) + for box_cls in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) zeros.extend([0, 0.0]) @@ -42,11 +44,16 @@ def zero(request): # ------------------------------------------------------------------ # Vector Fixtures -@pytest.fixture(params=[pd.Float64Index(np.arange(5, dtype='float64')), - pd.Int64Index(np.arange(5, dtype='int64')), - pd.UInt64Index(np.arange(5, dtype='uint64')), - pd.RangeIndex(5)], - ids=lambda x: type(x).__name__) + +@pytest.fixture( + params=[ + pd.Float64Index(np.arange(5, dtype="float64")), + pd.Int64Index(np.arange(5, dtype="int64")), + pd.UInt64Index(np.arange(5, dtype="uint64")), + pd.RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) def numeric_idx(request): """ Several types of numeric-dtypes Index objects @@ -57,10 +64,15 @@ def numeric_idx(request): # ------------------------------------------------------------------ # Scalar Fixtures -@pytest.fixture(params=[pd.Timedelta('5m4s').to_pytimedelta(), - pd.Timedelta('5m4s'), - pd.Timedelta('5m4s').to_timedelta64()], - ids=lambda x: type(x).__name__) + +@pytest.fixture( + params=[ + pd.Timedelta("5m4s").to_pytimedelta(), + pd.Timedelta("5m4s"), + pd.Timedelta("5m4s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, +) def scalar_td(request): """ Several variants of Timedelta scalars representing 5 minutes and 4 seconds @@ -68,13 +80,17 @@ def scalar_td(request): return request.param 
-@pytest.fixture(params=[pd.offsets.Day(3), - pd.offsets.Hour(72), - pd.Timedelta(days=3).to_pytimedelta(), - pd.Timedelta('72:00:00'), - np.timedelta64(3, 'D'), - np.timedelta64(72, 'h')], - ids=lambda x: type(x).__name__) +@pytest.fixture( + params=[ + pd.offsets.Day(3), + pd.offsets.Hour(72), + pd.Timedelta(days=3).to_pytimedelta(), + pd.Timedelta("72:00:00"), + np.timedelta64(3, "D"), + np.timedelta64(72, "h"), + ], + ids=lambda x: type(x).__name__, +) def three_days(request): """ Several timedelta-like and DateOffset objects that each represent @@ -83,13 +99,17 @@ def three_days(request): return request.param -@pytest.fixture(params=[pd.offsets.Hour(2), - pd.offsets.Minute(120), - pd.Timedelta(hours=2).to_pytimedelta(), - pd.Timedelta(seconds=2 * 3600), - np.timedelta64(2, 'h'), - np.timedelta64(120, 'm')], - ids=lambda x: type(x).__name__) +@pytest.fixture( + params=[ + pd.offsets.Hour(2), + pd.offsets.Minute(120), + pd.Timedelta(hours=2).to_pytimedelta(), + pd.Timedelta(seconds=2 * 3600), + np.timedelta64(2, "h"), + np.timedelta64(120, "m"), + ], + ids=lambda x: type(x).__name__, +) def two_hours(request): """ Several timedelta-like and DateOffset objects that each represent @@ -98,14 +118,21 @@ def two_hours(request): return request.param -_common_mismatch = [pd.offsets.YearBegin(2), - pd.offsets.MonthBegin(1), - pd.offsets.Minute()] +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] -@pytest.fixture(params=[pd.Timedelta(minutes=30).to_pytimedelta(), - np.timedelta64(30, 's'), - pd.Timedelta(seconds=30)] + _common_mismatch) +@pytest.fixture( + params=[ + pd.Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + pd.Timedelta(seconds=30), + ] + + _common_mismatch +) def not_hourly(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -114,9 +141,14 @@ def not_hourly(request): return request.param -@pytest.fixture(params=[np.timedelta64(4, 'h'), - pd.Timedelta(hours=23).to_pytimedelta(), - pd.Timedelta('23:00:00')] + _common_mismatch) +@pytest.fixture( + params=[ + np.timedelta64(4, "h"), + pd.Timedelta(hours=23).to_pytimedelta(), + pd.Timedelta("23:00:00"), + ] + + _common_mismatch +) def not_daily(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -125,9 +157,14 @@ def not_daily(request): return request.param -@pytest.fixture(params=[np.timedelta64(365, 'D'), - pd.Timedelta(days=365).to_pytimedelta(), - pd.Timedelta(days=365)] + _common_mismatch) +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + pd.Timedelta(days=365).to_pytimedelta(), + pd.Timedelta(days=365), + ] + + _common_mismatch +) def mismatched_freq(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -138,8 +175,8 @@ def mismatched_freq(request): # ------------------------------------------------------------------ -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], - ids=id_func) + +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) def box(request): """ Several array-like containers that should have effectively identical @@ -148,11 +185,10 @@ def box(request): return request.param -@pytest.fixture(params=[pd.Index, - pd.Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail)], - ids=id_func) +@pytest.fixture( + params=[pd.Index, pd.Series, pytest.param(pd.DataFrame, marks=pytest.mark.xfail)], + ids=id_func, +) def box_df_fail(request): """ Fixture equivalent to `box` fixture but xfailing the DataFrame case. 
@@ -160,12 +196,15 @@ def box_df_fail(request): return request.param -@pytest.fixture(params=[(pd.Index, False), - (pd.Series, False), - (pd.DataFrame, False), - pytest.param((pd.DataFrame, True), - marks=pytest.mark.xfail)], - ids=id_func) +@pytest.fixture( + params=[ + (pd.Index, False), + (pd.Series, False), + (pd.DataFrame, False), + pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), + ], + ids=id_func, +) def box_transpose_fail(request): """ Fixture similar to `box` but testing both transpose cases for DataFrame, @@ -175,8 +214,7 @@ def box_transpose_fail(request): return request.param -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], - ids=id_func) +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) def box_with_array(request): """ Fixture to test behavior for Index, Series, DataFrame, and pandas Array diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 908e197ec1d282..6037273450a1c4 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -17,8 +17,15 @@ import pandas as pd from pandas import ( - DatetimeIndex, NaT, Period, Series, Timedelta, TimedeltaIndex, Timestamp, - date_range) + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, +) from pandas.core.indexes.datetimes import _to_M8 import pandas.util.testing as tm @@ -37,6 +44,7 @@ def assert_all(obj): # ------------------------------------------------------------------ # Comparisons + class TestDatetime64ArrayLikeComparisons: # Comparison tests for datetime64 vectors fully parametrized over # DataFrame/Series/DatetimeIndex/DateteimeArray. Ideally all comparison @@ -47,7 +55,7 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture box = box_with_array xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - dti = date_range('20130101', periods=3, tz=tz) + dti = date_range("20130101", periods=3, tz=tz) other = np.array(dti.to_numpy()[0]) @@ -60,13 +68,17 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): class TestDatetime64DataFrameComparison: - @pytest.mark.parametrize('timestamps', [ - [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2, - [pd.Timestamp('2012-01-01 13:00:00')] * 2]) + @pytest.mark.parametrize( + "timestamps", + [ + [pd.Timestamp("2012-01-01 13:00:00+00:00")] * 2, + [pd.Timestamp("2012-01-01 13:00:00")] * 2, + ], + ) def test_tz_aware_scalar_comparison(self, timestamps): # GH#15966 - df = pd.DataFrame({'test': timestamps}) - expected = pd.DataFrame({'test': [False, False]}) + df = pd.DataFrame({"test": timestamps}) + expected = pd.DataFrame({"test": [False, False]}) tm.assert_frame_equal(df == -1, expected) def test_dt64_nat_comparison(self): @@ -82,21 +94,26 @@ def test_dt64_nat_comparison(self): class TestDatetime64SeriesComparison: # TODO: moved from tests.series.test_operators; needs cleanup - @pytest.mark.parametrize('pair', [ - ([pd.Timestamp('2011-01-01'), NaT, pd.Timestamp('2011-01-03')], - [NaT, NaT, pd.Timestamp('2011-01-03')]), - - ([pd.Timedelta('1 days'), NaT, pd.Timedelta('3 days')], - [NaT, NaT, pd.Timedelta('3 days')]), - - ([pd.Period('2011-01', freq='M'), NaT, - pd.Period('2011-03', freq='M')], - [NaT, NaT, pd.Period('2011-03', freq='M')]), - - ]) - @pytest.mark.parametrize('reverse', [True, False]) - @pytest.mark.parametrize('box', [Series, pd.Index]) - @pytest.mark.parametrize('dtype', [None, object]) + 
@pytest.mark.parametrize( + "pair", + [ + ( + [pd.Timestamp("2011-01-01"), NaT, pd.Timestamp("2011-01-03")], + [NaT, NaT, pd.Timestamp("2011-01-03")], + ), + ( + [pd.Timedelta("1 days"), NaT, pd.Timedelta("3 days")], + [NaT, NaT, pd.Timedelta("3 days")], + ), + ( + [pd.Period("2011-01", freq="M"), NaT, pd.Period("2011-03", freq="M")], + [NaT, NaT, pd.Period("2011-03", freq="M")], + ), + ], + ) + @pytest.mark.parametrize("reverse", [True, False]) + @pytest.mark.parametrize("box", [Series, pd.Index]) + @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons(self, dtype, box, reverse, pair): l, r = pair if reverse: @@ -131,7 +148,7 @@ def test_comparison_invalid(self, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray ser = Series(range(5)) - ser2 = Series(pd.date_range('20010101', periods=5)) + ser2 = Series(pd.date_range("20010101", periods=5)) ser = tm.box_expected(ser, box_with_array) ser2 = tm.box_expected(ser2, box_with_array) @@ -145,7 +162,7 @@ def test_comparison_invalid(self, box_with_array): result = x != y expected = tm.box_expected([True] * 5, xbox) tm.assert_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): @@ -155,12 +172,15 @@ def test_comparison_invalid(self, box_with_array): with pytest.raises(TypeError, match=msg): x <= y - @pytest.mark.parametrize('data', [ - [Timestamp('2011-01-01'), NaT, Timestamp('2011-01-03')], - [Timedelta('1 days'), NaT, Timedelta('3 days')], - [Period('2011-01', freq='M'), NaT, Period('2011-03', freq='M')] - ]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize( + "data", + [ + [Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")], + [Timedelta("1 days"), NaT, Timedelta("3 days")], + [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")], + ], + ) + @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons_scalar(self, dtype, data, box_with_array): if box_with_array is tm.to_array and dtype is object: # dont bother testing ndarray comparison methods as this fails @@ -195,7 +215,7 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): tm.assert_equal(NaT <= left, expected) def test_series_comparison_scalars(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) val = datetime(2000, 1, 4) result = series > val @@ -210,12 +230,12 @@ def test_series_comparison_scalars(self): def test_dt64_ser_cmp_date_warning(self): # https://github.com/pandas-dev/pandas/issues/21359 # Remove this test and enble invalid test below - ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + ser = pd.Series(pd.date_range("20010101", periods=10), name="dates") date = ser.iloc[0].to_pydatetime().date() with tm.assert_produces_warning(FutureWarning) as m: result = ser == date - expected = pd.Series([True] + [False] * 9, name='dates') + expected = pd.Series([True] + [False] * 9, name="dates") tm.assert_series_equal(result, expected) assert "Comparing Series of datetimes " in str(m[0].message) assert "will not compare equal" in str(m[0].message) @@ -232,18 +252,17 @@ def test_dt64_ser_cmp_date_warning(self): with tm.assert_produces_warning(FutureWarning) as m: result = ser < date - tm.assert_series_equal(result, pd.Series([False] * 10, name='dates')) + tm.assert_series_equal(result, pd.Series([False] * 10, name="dates")) assert "a TypeError 
will be raised" in str(m[0].message) with tm.assert_produces_warning(FutureWarning) as m: result = ser >= date - tm.assert_series_equal(result, pd.Series([True] * 10, name='dates')) + tm.assert_series_equal(result, pd.Series([True] * 10, name="dates")) assert "a TypeError will be raised" in str(m[0].message) with tm.assert_produces_warning(FutureWarning) as m: result = ser > date - tm.assert_series_equal(result, pd.Series([False] + [True] * 9, - name='dates')) + tm.assert_series_equal(result, pd.Series([False] + [True] * 9, name="dates")) assert "a TypeError will be raised" in str(m[0].message) @pytest.mark.skip(reason="GH#21359") @@ -252,7 +271,7 @@ def test_dt64ser_cmp_date_invalid(self, box_with_array): # match DatetimeIndex/Timestamp. This also matches the behavior # of stdlib datetime.datetime - ser = pd.date_range('20010101', periods=10) + ser = pd.date_range("20010101", periods=10) date = ser.iloc[0].to_pydatetime().date() ser = tm.box_expected(ser, box_with_array) @@ -267,12 +286,9 @@ def test_dt64ser_cmp_date_invalid(self, box_with_array): with pytest.raises(TypeError): ser <= date - @pytest.mark.parametrize("left,right", [ - ("lt", "gt"), - ("le", "ge"), - ("eq", "eq"), - ("ne", "ne"), - ]) + @pytest.mark.parametrize( + "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")] + ) def test_timestamp_compare_series(self, left, right): # see gh-4982 # Make sure we can compare Timestamps on the right AND left hand side. @@ -309,7 +325,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array): # GH#11034 xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - ser = pd.Series([pd.Timestamp('2000-01-29 01:59:00'), 'NaT']) + ser = pd.Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) ser = tm.box_expected(ser, box_with_array) result = ser != ser @@ -336,19 +352,20 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, False], xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat(self, op): # GH#18162 - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") # Check that there isn't a problem aware-aware and naive-naive do not # raise naive_series = Series(dr) aware_series = Series(dz) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dz, naive_series) with pytest.raises(TypeError, match=msg): @@ -361,10 +378,10 @@ def test_comparison_tzawareness_compat(self, op): class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate - @pytest.mark.parametrize("op", [ - operator.eq, operator.ne, operator.gt, operator.lt, - operator.ge, operator.le - ]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.lt, operator.ge, operator.le], + ) def test_comparators(self, op): index = tm.makeDateIndex(100) element = index[len(index) // 2] @@ -377,12 +394,13 @@ def test_comparators(self, op): assert isinstance(index_result, np.ndarray) tm.assert_numpy_array_equal(arr_result, index_result) - @pytest.mark.parametrize('other', [datetime(2016, 1, 1), - Timestamp('2016-01-01'), - 
np.datetime64('2016-01-01')]) + @pytest.mark.parametrize( + "other", + [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) if tz is not None: if isinstance(other, np.datetime64): # no tzaware version available @@ -413,7 +431,7 @@ def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): # GH#19301 by convention datetime.date is not considered comparable # to Timestamp or DatetimeIndex. This may change in the future. tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) other = datetime(2016, 1, 1).date() @@ -428,28 +446,29 @@ def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): with pytest.raises(TypeError): dtarr >= other - @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) def test_dti_eq_null_scalar(self, other, tz_naive_fixture): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) assert not (dti == other).any() - @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) def test_dti_ne_null_scalar(self, other, tz_naive_fixture): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) assert (dti != other).all() - @pytest.mark.parametrize('other', [None, np.nan]) - def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other, - box_with_array): + @pytest.mark.parametrize("other", [None, np.nan]) + def test_dti_cmp_null_scalar_inequality( + self, tz_naive_fixture, other, box_with_array + ): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): dtarr < other with pytest.raises(TypeError, match=msg): @@ -459,7 +478,7 @@ def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other, with pytest.raises(TypeError, match=msg): dtarr >= other - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_dti_cmp_nat(self, dtype, box_with_array): if box_with_array is tm.to_array and dtype is object: # dont bother testing ndarray comparison methods as this fails @@ -468,9 +487,10 @@ def test_dti_cmp_nat(self, dtype, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, - pd.Timestamp('2011-01-03')]) - right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + left = pd.DatetimeIndex( + [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] + ) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp("2011-01-03")]) left = tm.box_expected(left, box_with_array) right = tm.box_expected(right, box_with_array) @@ -508,15 +528,22 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) - didx1 = 
pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) - didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT, - '2014-06-01', '2014-07-01']) - darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'), - np_datetime64_compat('2014-03-01 00:00Z'), - np_datetime64_compat('nat'), np.datetime64('nat'), - np_datetime64_compat('2014-06-01 00:00Z'), - np_datetime64_compat('2014-07-01 00:00Z')]) + didx1 = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) + didx2 = pd.DatetimeIndex( + ["2014-02-01", "2014-03-01", pd.NaT, pd.NaT, "2014-06-01", "2014-07-01"] + ) + darr = np.array( + [ + np_datetime64_compat("2014-02-01 00:00Z"), + np_datetime64_compat("2014-03-01 00:00Z"), + np_datetime64_compat("nat"), + np.datetime64("nat"), + np_datetime64_compat("2014-06-01 00:00Z"), + np_datetime64_compat("2014-07-01 00:00Z"), + ] + ) cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] @@ -593,20 +620,21 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): expected = np.array([True, True, False, True, True, True]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat(self, op, box_df_fail): # GH#18162 box = box_df_fail - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") dr = tm.box_expected(dr, box) dz = tm.box_expected(dz, box) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dr, dz) @@ -636,23 +664,24 @@ def test_comparison_tzawareness_compat(self, op, box_df_fail): assert (dr == list(dr)).all() assert (dz == list(dz)).all() - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): # GH#18162 - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") dr = tm.box_expected(dr, box_with_array) dz = tm.box_expected(dz, box_with_array) # Check comparisons against scalar Timestamps - ts = pd.Timestamp('2000-03-14 01:59') - ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam') + ts = pd.Timestamp("2000-03-14 01:59") + ts_tz = pd.Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam") assert_all(dr > ts) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dr, ts_tz) @@ -664,49 +693,54 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): with pytest.raises(TypeError, match=msg): op(ts, dz) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) - @pytest.mark.parametrize('other', [datetime(2016, 1, 1), - Timestamp('2016-01-01'), - np.datetime64('2016-01-01')]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + @pytest.mark.parametrize( + "other", 
+ [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) # Bug in NumPy? https://github.com/numpy/numpy/issues/13841 # Raising in __eq__ will fallback to NumPy, which warns, fails, # then re-raises the original exception. So we just need to ignore. @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") - def test_scalar_comparison_tzawareness(self, op, other, tz_aware_fixture, - box_with_array): + def test_scalar_comparison_tzawareness( + self, op, other, tz_aware_fixture, box_with_array + ): tz = tz_aware_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dtarr, other) with pytest.raises(TypeError, match=msg): op(other, dtarr) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_nat_comparison_tzawareness(self, op): # GH#19276 # tzaware DatetimeIndex should not raise when compared to NaT - dti = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) + dti = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) expected = np.array([op == operator.ne] * len(dti)) result = op(dti, pd.NaT) tm.assert_numpy_array_equal(result, expected) - result = op(dti.tz_localize('US/Pacific'), pd.NaT) + result = op(dti.tz_localize("US/Pacific"), pd.NaT) tm.assert_numpy_array_equal(result, expected) def test_dti_cmp_str(self, tz_naive_fixture): # GH#22074 # regardless of tz, we expect these comparisons are valid tz = tz_naive_fixture - rng = date_range('1/1/2000', periods=10, tz=tz) - other = '1/1/2000' + rng = date_range("1/1/2000", periods=10, tz=tz) + other = "1/1/2000" result = rng == other expected = np.array([True] + [False] * 9) @@ -732,15 +766,13 @@ def test_dti_cmp_str(self, tz_naive_fixture): expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other', ['foo', 99, 4.0, - object(), timedelta(days=2)]) - def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, - box_with_array): + @pytest.mark.parametrize("other", ["foo", 99, 4.0, object(), timedelta(days=2)]) + def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): # GH#22074 tz = tz_naive_fixture xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - rng = date_range('1/1/2000', periods=10, tz=tz) + rng = date_range("1/1/2000", periods=10, tz=tz) rng = tm.box_expected(rng, box_with_array) result = rng == other @@ -752,7 +784,7 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, expected = np.array([True] * 10) expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): rng < other with pytest.raises(TypeError, match=msg): @@ -763,22 +795,26 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, rng >= other def test_dti_cmp_list(self): - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) result = rng == list(rng) expected = rng == rng tm.assert_numpy_array_equal(result, expected) - 
@pytest.mark.parametrize('other', [ - pd.timedelta_range('1D', periods=10), - pd.timedelta_range('1D', periods=10).to_series(), - pd.timedelta_range('1D', periods=10).asi8.view('m8[ns]') - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [ + pd.timedelta_range("1D", periods=10), + pd.timedelta_range("1D", periods=10).to_series(), + pd.timedelta_range("1D", periods=10).asi8.view("m8[ns]"), + ], + ids=lambda x: type(x).__name__, + ) def test_dti_cmp_tdi_tzawareness(self, other): # GH#22074 # reversion test that we _don't_ call _assert_tzawareness_compat # when comparing against TimedeltaIndex - dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo') + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") result = dti == other expected = np.array([False] * 10) @@ -787,7 +823,7 @@ def test_dti_cmp_tdi_tzawareness(self, other): result = dti != other expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): dti < other with pytest.raises(TypeError, match=msg): @@ -799,16 +835,16 @@ def test_dti_cmp_tdi_tzawareness(self, other): def test_dti_cmp_object_dtype(self): # GH#22074 - dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo') + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") - other = dti.astype('O') + other = dti.astype("O") result = dti == other expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) other = dti.tz_localize(None) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): # tzawareness failure dti != other @@ -825,6 +861,7 @@ def test_dti_cmp_object_dtype(self): # ------------------------------------------------------------------ # Arithmetic + class TestDatetime64Arithmetic: # This class is intended for "finished" tests that are fully parametrized # over DataFrame/Series/Index/DatetimeArray @@ -832,14 +869,14 @@ class TestDatetime64Arithmetic: # ------------------------------------------------------------- # Addition/Subtraction of timedelta-like - def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_add_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): # GH#22005, GH#22163 check DataFrame doesn't raise TypeError tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -847,13 +884,13 @@ def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture, result = rng + two_hours tm.assert_equal(result, expected) - def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_iadd_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, 
box_with_array) @@ -861,13 +898,13 @@ def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture, rng += two_hours tm.assert_equal(rng, expected) - def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_sub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -875,13 +912,13 @@ def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture, result = rng - two_hours tm.assert_equal(result, expected) - def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_isub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -892,26 +929,28 @@ def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture, def test_dt64arr_add_td64_scalar(self, box_with_array): # scalar timedeltas/np.timedelta64 objects # operate with np.timedelta64 correctly - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) - expected = Series([Timestamp('20130101 9:01:01'), - Timestamp('20130101 9:02:01')]) + expected = Series( + [Timestamp("20130101 9:01:01"), Timestamp("20130101 9:02:01")] + ) dtarr = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = dtarr + np.timedelta64(1, 's') + result = dtarr + np.timedelta64(1, "s") tm.assert_equal(result, expected) - result = np.timedelta64(1, 's') + dtarr + result = np.timedelta64(1, "s") + dtarr tm.assert_equal(result, expected) - expected = Series([Timestamp('20130101 9:01:00.005'), - Timestamp('20130101 9:02:00.005')]) + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) expected = tm.box_expected(expected, box_with_array) - result = dtarr + np.timedelta64(5, 'ms') + result = dtarr + np.timedelta64(5, "ms") tm.assert_equal(result, expected) - result = np.timedelta64(5, 'ms') + dtarr + result = np.timedelta64(5, "ms") + dtarr tm.assert_equal(result, expected) def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): @@ -933,19 +972,18 @@ def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): tm.assert_equal(result, expected) result = obj - other tm.assert_equal(result, expected) - msg = 'cannot subtract' + msg = "cannot subtract" with pytest.raises(TypeError, match=msg): other - obj - def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + 
tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values - expected = pd.date_range('2015-12-31', periods=3, tz=tz) + expected = pd.date_range("2015-12-31", periods=3, tz=tz) dtarr = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -955,28 +993,32 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, result = tdarr + dtarr tm.assert_equal(result, expected) - expected = pd.date_range('2016-01-02', periods=3, tz=tz) + expected = pd.date_range("2016-01-02", periods=3, tz=tz) expected = tm.box_expected(expected, box_with_array) result = dtarr - tdarr tm.assert_equal(result, expected) - msg = 'cannot subtract|bad operand type for unary -' + msg = "cannot subtract|bad operand type for unary -" with pytest.raises(TypeError, match=msg): tdarr - dtarr # ----------------------------------------------------------------- # Subtraction of datetime-like scalars - @pytest.mark.parametrize('ts', [ - pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-01').to_pydatetime(), - pd.Timestamp('2013-01-01').to_datetime64()]) + @pytest.mark.parametrize( + "ts", + [ + pd.Timestamp("2013-01-01"), + pd.Timestamp("2013-01-01").to_pydatetime(), + pd.Timestamp("2013-01-01").to_datetime64(), + ], + ) def test_dt64arr_sub_dtscalar(self, box_with_array, ts): # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype - idx = pd.date_range('2013-01-01', periods=3) + idx = pd.date_range("2013-01-01", periods=3) idx = tm.box_expected(idx, box_with_array) - expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days']) + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) expected = tm.box_expected(expected, box_with_array) result = idx - ts @@ -985,13 +1027,13 @@ def test_dt64arr_sub_dtscalar(self, box_with_array, ts): def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano # for DataFrame operation - dt64 = np.datetime64('2013-01-01') - assert dt64.dtype == 'datetime64[D]' + dt64 = np.datetime64("2013-01-01") + assert dt64.dtype == "datetime64[D]" - dti = pd.date_range('20130101', periods=3) + dti = pd.date_range("20130101", periods=3) dtarr = tm.box_expected(dti, box_with_array) - expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days']) + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) expected = tm.box_expected(expected, box_with_array) result = dtarr - dt64 @@ -1001,14 +1043,12 @@ def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): tm.assert_equal(result, -expected) def test_dt64arr_sub_timestamp(self, box_with_array): - ser = pd.date_range('2014-03-17', periods=2, freq='D', - tz='US/Eastern') + ser = pd.date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern") ts = ser[0] ser = tm.box_expected(ser, box_with_array) - delta_series = pd.Series([np.timedelta64(0, 'D'), - np.timedelta64(1, 'D')]) + delta_series = pd.Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")]) expected = tm.box_expected(delta_series, box_with_array) tm.assert_equal(ser - ts, expected) @@ -1016,19 +1056,19 @@ def test_dt64arr_sub_timestamp(self, box_with_array): def test_dt64arr_sub_NaT(self, box_with_array): # GH#18808 - dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp('19900315')]) + dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp("19900315")]) ser = tm.box_expected(dti, box_with_array) result = ser - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = 
tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) - dti_tz = dti.tz_localize('Asia/Tokyo') + dti_tz = dti.tz_localize("Asia/Tokyo") ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1036,7 +1076,7 @@ def test_dt64arr_sub_NaT(self, box_with_array): # Subtraction of datetime-like array-like def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3, tz=None) + dti = pd.date_range("2016-01-01", periods=3, tz=None) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) @@ -1047,15 +1087,16 @@ def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): result = dt64vals - dtarr tm.assert_equal(result, expected) - def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, - box_with_array): + def test_dt64arr_aware_sub_dt64ndarray_raises( + self, tz_aware_fixture, box_with_array + ): tz = tz_aware_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = 'subtraction must have the same timezones or' + msg = "subtraction must have the same timezones or" with pytest.raises(TypeError, match=msg): dtarr - dt64vals with pytest.raises(TypeError, match=msg): @@ -1064,15 +1105,14 @@ def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, # ------------------------------------------------------------- # Addition of datetime-like others (invalid) - def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = 'cannot add' + msg = "cannot add" with pytest.raises(TypeError, match=msg): dtarr + dt64vals with pytest.raises(TypeError, match=msg): @@ -1080,22 +1120,22 @@ def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, def test_dt64arr_add_timestamp_raises(self, box_with_array): # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 - idx = DatetimeIndex(['2011-01-01', '2011-01-02']) + idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) idx = tm.box_expected(idx, box_with_array) - msg = 'cannot add' + msg = "cannot add" with pytest.raises(TypeError, match=msg): - idx + Timestamp('2011-01-01') + idx + Timestamp("2011-01-01") with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01') + idx + Timestamp("2011-01-01") + idx # ------------------------------------------------------------- # Other Invalid Addition/Subtraction - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) def test_dt64arr_add_sub_float(self, other, box_with_array): - dti = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") dtarr = tm.box_expected(dti, box_with_array) - msg = '|'.join(['unsupported operand type', 'cannot (add|subtract)']) + msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) with pytest.raises(TypeError, match=msg): dtarr + other with pytest.raises(TypeError, 
match=msg): @@ -1105,18 +1145,25 @@ def test_dt64arr_add_sub_float(self, other, box_with_array): with pytest.raises(TypeError, match=msg): other - dtarr - @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) - @pytest.mark.parametrize('dti_freq', [None, 'D']) - def test_dt64arr_add_sub_parr(self, dti_freq, pi_freq, - box_with_array, box_with_array2): + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_parr( + self, dti_freq, pi_freq, box_with_array, box_with_array2 + ): # GH#20049 subtracting PeriodIndex should raise TypeError - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq) + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) pi = dti.to_period(pi_freq) dtarr = tm.box_expected(dti, box_with_array) parr = tm.box_expected(pi, box_with_array2) - msg = '|'.join(['cannot (add|subtract)', 'unsupported operand', - 'descriptor.*requires', 'ufunc.*cannot use operands']) + msg = "|".join( + [ + "cannot (add|subtract)", + "unsupported operand", + "descriptor.*requires", + "ufunc.*cannot use operands", + ] + ) with pytest.raises(TypeError, match=msg): dtarr + parr with pytest.raises(TypeError, match=msg): @@ -1126,15 +1173,15 @@ def test_dt64arr_add_sub_parr(self, dti_freq, pi_freq, with pytest.raises(TypeError, match=msg): parr - dtarr - @pytest.mark.parametrize('dti_freq', [None, 'D']) + @pytest.mark.parametrize("dti_freq", [None, "D"]) def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array): # GH#13078 # not supported, check TypeError - per = pd.Period('2011-01-01', freq='D') + per = pd.Period("2011-01-01", freq="D") - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq) + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) dtarr = tm.box_expected(idx, box_with_array) - msg = '|'.join(['unsupported operand type', 'cannot (add|subtract)']) + msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) with pytest.raises(TypeError, match=msg): dtarr + per with pytest.raises(TypeError, match=msg): @@ -1154,9 +1201,10 @@ class TestDatetime64DateOffsetArithmetic: def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): # GH#4532 # operate with pd.offsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) - expected = Series([Timestamp('20130101 9:01:05'), - Timestamp('20130101 9:02:05')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:01:05"), Timestamp("20130101 9:02:05")] + ) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1170,9 +1218,10 @@ def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): # GH#4532 # operate with pd.offsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) - expected = Series([Timestamp('20130101 9:00:55'), - Timestamp('20130101 9:01:55')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:00:55"), Timestamp("20130101 9:01:55")] + ) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1186,13 +1235,13 @@ def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): with pytest.raises(TypeError, match=msg): pd.offsets.Second(5) - ser - @pytest.mark.parametrize('cls_name', ['Day', 'Hour', 'Minute', 'Second', - 'Milli', 
'Micro', 'Nano']) - def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, - box_with_array): + @pytest.mark.parametrize( + "cls_name", ["Day", "Hour", "Minute", "Second", "Milli", "Micro", "Nano"] + ) + def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, box_with_array): # GH#4532 # smoke tests for valid DateOffsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) ser = tm.box_expected(ser, box_with_array) offset_cls = getattr(pd.offsets, cls_name) @@ -1203,15 +1252,17 @@ def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype tz = tz_aware_fixture - if tz == 'US/Pacific': - dates = date_range('2012-11-01', periods=3, tz=tz) + if tz == "US/Pacific": + dates = date_range("2012-11-01", periods=3, tz=tz) offset = dates + pd.offsets.Hour(5) assert dates[0] + pd.offsets.Hour(5) == offset[0] - dates = date_range('2010-11-01 00:00', - periods=3, tz=tz, freq='H') - expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', - '2010-11-01 07:00'], freq='H', tz=tz) + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + expected = DatetimeIndex( + ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], + freq="H", + tz=tz, + ) dates = tm.box_expected(dates, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1219,7 +1270,7 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): # TODO: parametrize over the scalar being added? radd? sub? offset = dates + pd.offsets.Hour(5) tm.assert_equal(offset, expected) - offset = dates + np.timedelta64(5, 'h') + offset = dates + np.timedelta64(5, "h") tm.assert_equal(offset, expected) offset = dates + timedelta(hours=5) tm.assert_equal(offset, expected) @@ -1229,21 +1280,31 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): # GH#10699 - vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-03-31'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) vec = tm.box_expected(vec, box_with_array) vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec # DateOffset relativedelta fastpath - relative_kwargs = [('years', 2), ('months', 5), ('days', 3), - ('hours', 5), ('minutes', 10), ('seconds', 2), - ('microseconds', 5)] + relative_kwargs = [ + ("years", 2), + ("months", 5), + ("days", 3), + ("hours", 5), + ("minutes", 10), + ("seconds", 2), + ("microseconds", 5), + ] for i, kwd in enumerate(relative_kwargs): off = pd.DateOffset(**dict([kwd])) @@ -1255,7 +1316,7 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) - off = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + off = pd.DateOffset(**dict(relative_kwargs[: i + 1])) expected = DatetimeIndex([x + off for x in vec_items]) expected = tm.box_expected(expected, box_with_array) @@ 
-1273,30 +1334,57 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): # TODO: redundant with test_dt64arr_add_sub_DateOffset? that includes # tz-aware cases which this does not - @pytest.mark.parametrize('cls_and_kwargs', [ - 'YearBegin', ('YearBegin', {'month': 5}), - 'YearEnd', ('YearEnd', {'month': 5}), - 'MonthBegin', 'MonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'Week', ('Week', {'weekday': 3}), - 'Week', ('Week', {'weekday': 6}), - 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', - 'CustomBusinessDay', 'CDay', 'CBMonthEnd', - 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', - 'BusinessHour', 'BYearBegin', 'BYearEnd', - 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), - ('FY5253Quarter', {'qtr_with_extra_week': 1, - 'startingMonth': 1, - 'weekday': 2, - 'variation': 'nearest'}), - ('FY5253', {'weekday': 0, 'startingMonth': 2, 'variation': 'nearest'}), - ('WeekOfMonth', {'weekday': 2, 'week': 2}), - 'Easter', ('DateOffset', {'day': 4}), - ('DateOffset', {'month': 5})]) - @pytest.mark.parametrize('normalize', [True, False]) - @pytest.mark.parametrize('n', [0, 5]) - def test_dt64arr_add_sub_DateOffsets(self, box_with_array, - n, normalize, cls_and_kwargs): + @pytest.mark.parametrize( + "cls_and_kwargs", + [ + "YearBegin", + ("YearBegin", {"month": 5}), + "YearEnd", + ("YearEnd", {"month": 5}), + "MonthBegin", + "MonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + "Week", + ("Week", {"weekday": 3}), + "Week", + ("Week", {"weekday": 6}), + "BusinessDay", + "BDay", + "QuarterEnd", + "QuarterBegin", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "BMonthBegin", + "BMonthEnd", + "BusinessHour", + "BYearBegin", + "BYearEnd", + "BQuarterBegin", + ("LastWeekOfMonth", {"weekday": 2}), + ( + "FY5253Quarter", + { + "qtr_with_extra_week": 1, + "startingMonth": 1, + "weekday": 2, + "variation": "nearest", + }, + ), + ("FY5253", {"weekday": 0, "startingMonth": 2, "variation": "nearest"}), + ("WeekOfMonth", {"weekday": 2, "week": 2}), + "Easter", + ("DateOffset", {"day": 4}), + ("DateOffset", {"month": 5}), + ], + ) + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [0, 5]) + def test_dt64arr_add_sub_DateOffsets( + self, box_with_array, n, normalize, cls_and_kwargs + ): # GH#10699 # assert vectorized operation matches pointwise operations @@ -1308,19 +1396,27 @@ def test_dt64arr_add_sub_DateOffsets(self, box_with_array, cls_name = cls_and_kwargs kwargs = {} - if n == 0 and cls_name in ['WeekOfMonth', 'LastWeekOfMonth', - 'FY5253Quarter', 'FY5253']: + if n == 0 and cls_name in [ + "WeekOfMonth", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + ]: # passing n = 0 is invalid for these offset classes return - vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-03-31'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) vec = tm.box_expected(vec, box_with_array) vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec @@ -1351,40 +1447,58 @@ def test_dt64arr_add_sub_DateOffsets(self, box_with_array, def test_dt64arr_add_sub_DateOffset(self, box_with_array): # GH#10699 - s = date_range('2000-01-01', 
'2000-01-31', name='a') + s = date_range("2000-01-01", "2000-01-31", name="a") s = tm.box_expected(s, box_with_array) result = s + pd.DateOffset(years=1) result2 = pd.DateOffset(years=1) + s - exp = date_range('2001-01-01', '2001-01-31', name='a') + exp = date_range("2001-01-01", "2001-01-31", name="a") exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) result = s - pd.DateOffset(years=1) - exp = date_range('1999-01-01', '1999-01-31', name='a') + exp = date_range("1999-01-01", "1999-01-31", name="a") exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) - s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) s = tm.box_expected(s, box_with_array) result = s + pd.offsets.Day() result2 = pd.offsets.Day() + s - exp = DatetimeIndex([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')], - name='a') + exp = DatetimeIndex( + [ + Timestamp("2000-01-16 00:15:00", tz="US/Central"), + Timestamp("2000-02-16", tz="US/Central"), + ], + name="a", + ) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) s = tm.box_expected(s, box_with_array) result = s + pd.offsets.MonthEnd() result2 = pd.offsets.MonthEnd() + s - exp = DatetimeIndex([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], - name='a') + exp = DatetimeIndex( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -1393,90 +1507,114 @@ def test_dt64arr_add_sub_DateOffset(self, box_with_array): def test_dt64arr_add_mixed_offset_array(self, box_with_array): # GH#10699 # array of offsets - s = DatetimeIndex([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): - other = pd.Index([pd.offsets.DateOffset(years=1), - pd.offsets.MonthEnd()]) + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other - exp = DatetimeIndex([Timestamp('2001-1-1'), - Timestamp('2000-2-29')]) + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2000-2-29")]) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) # same offset - other = pd.Index([pd.offsets.DateOffset(years=1), - pd.offsets.DateOffset(years=1)]) + other = pd.Index( + [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)] + ) other = tm.box_expected(other, box_with_array) result = s + other - exp = DatetimeIndex([Timestamp('2001-1-1'), - Timestamp('2001-2-1')]) + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2001-2-1")]) exp 
= tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) # TODO: overlap with test_dt64arr_add_mixed_offset_array? - def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): # GH#18849 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz) + dti = pd.date_range("2017-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res = dtarr + other - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))], - name=dti.name, freq='infer') + expected = DatetimeIndex( + [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res = dtarr - other - expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))], - name=dti.name, freq='infer') + expected = DatetimeIndex( + [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - @pytest.mark.parametrize("op, offset, exp, exp_freq", [ - ('__add__', pd.DateOffset(months=3, days=10), - [Timestamp('2014-04-11'), Timestamp('2015-04-11'), - Timestamp('2016-04-11'), Timestamp('2017-04-11')], - None), - ('__add__', pd.DateOffset(months=3), - [Timestamp('2014-04-01'), Timestamp('2015-04-01'), - Timestamp('2016-04-01'), Timestamp('2017-04-01')], - "AS-APR"), - ('__sub__', pd.DateOffset(months=3, days=10), - [Timestamp('2013-09-21'), Timestamp('2014-09-21'), - Timestamp('2015-09-21'), Timestamp('2016-09-21')], - None), - ('__sub__', pd.DateOffset(months=3), - [Timestamp('2013-10-01'), Timestamp('2014-10-01'), - Timestamp('2015-10-01'), Timestamp('2016-10-01')], - "AS-OCT") - ]) - def test_dti_add_sub_nonzero_mth_offset(self, op, offset, - exp, exp_freq, - tz_aware_fixture, - box_with_array): + @pytest.mark.parametrize( + "op, offset, exp, exp_freq", + [ + ( + "__add__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2014-04-11"), + Timestamp("2015-04-11"), + Timestamp("2016-04-11"), + Timestamp("2017-04-11"), + ], + None, + ), + ( + "__add__", + pd.DateOffset(months=3), + [ + Timestamp("2014-04-01"), + Timestamp("2015-04-01"), + Timestamp("2016-04-01"), + Timestamp("2017-04-01"), + ], + "AS-APR", + ), + ( + "__sub__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2013-09-21"), + Timestamp("2014-09-21"), + Timestamp("2015-09-21"), + Timestamp("2016-09-21"), + ], + None, + ), + ( + "__sub__", + pd.DateOffset(months=3), + [ + Timestamp("2013-10-01"), + Timestamp("2014-10-01"), + Timestamp("2015-10-01"), + Timestamp("2016-10-01"), + ], + "AS-OCT", + ), + ], + ) + def test_dti_add_sub_nonzero_mth_offset( + self, op, offset, exp, exp_freq, tz_aware_fixture, box_with_array + ): # GH 26258 tz = tz_aware_fixture - date = 
date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS', - tz=tz) + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) @@ -1491,7 +1629,7 @@ class TestDatetime64OverflowHandling: def test_dt64_overflow_masking(self, box_with_array): # GH#25317 - left = Series([Timestamp('1969-12-31')]) + left = Series([Timestamp("1969-12-31")]) right = Series([NaT]) left = tm.box_expected(left, box_with_array) @@ -1505,11 +1643,11 @@ def test_dt64_overflow_masking(self, box_with_array): def test_dt64_series_arith_overflow(self): # GH#12534, fixed by GH#19024 - dt = pd.Timestamp('1700-01-31') - td = pd.Timedelta('20000 Days') - dti = pd.date_range('1949-09-30', freq='100Y', periods=4) + dt = pd.Timestamp("1700-01-31") + td = pd.Timedelta("20000 Days") + dti = pd.date_range("1949-09-30", freq="100Y", periods=4) ser = pd.Series(dti) - msg = 'Overflow in int64 addition' + msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): ser - dt with pytest.raises(OverflowError, match=msg): @@ -1520,37 +1658,43 @@ def test_dt64_series_arith_overflow(self): td + ser ser.iloc[-1] = pd.NaT - expected = pd.Series(['2004-10-03', '2104-10-04', '2204-10-04', 'NaT'], - dtype='datetime64[ns]') + expected = pd.Series( + ["2004-10-03", "2104-10-04", "2204-10-04", "NaT"], dtype="datetime64[ns]" + ) res = ser + td tm.assert_series_equal(res, expected) res = td + ser tm.assert_series_equal(res, expected) ser.iloc[1:] = pd.NaT - expected = pd.Series(['91279 Days', 'NaT', 'NaT', 'NaT'], - dtype='timedelta64[ns]') + expected = pd.Series( + ["91279 Days", "NaT", "NaT", "NaT"], dtype="timedelta64[ns]" + ) res = ser - dt tm.assert_series_equal(res, expected) res = dt - ser tm.assert_series_equal(res, -expected) def test_datetimeindex_sub_timestamp_overflow(self): - dtimax = pd.to_datetime(['now', pd.Timestamp.max]) - dtimin = pd.to_datetime(['now', pd.Timestamp.min]) - - tsneg = Timestamp('1950-01-01') - ts_neg_variants = [tsneg, - tsneg.to_pydatetime(), - tsneg.to_datetime64().astype('datetime64[ns]'), - tsneg.to_datetime64().astype('datetime64[D]')] - - tspos = Timestamp('1980-01-01') - ts_pos_variants = [tspos, - tspos.to_pydatetime(), - tspos.to_datetime64().astype('datetime64[ns]'), - tspos.to_datetime64().astype('datetime64[D]')] - msg = 'Overflow in int64 addition' + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] + msg = "Overflow in int64 addition" for variant in ts_neg_variants: with pytest.raises(OverflowError, match=msg): dtimax - variant @@ -1571,11 +1715,11 @@ def test_datetimeindex_sub_timestamp_overflow(self): def test_datetimeindex_sub_datetimeindex_overflow(self): # GH#22492, GH#22508 - dtimax = pd.to_datetime(['now', pd.Timestamp.max]) - dtimin = pd.to_datetime(['now', pd.Timestamp.min]) + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) - ts_neg = pd.to_datetime(['1950-01-01', '1950-01-01']) - ts_pos = pd.to_datetime(['1980-01-01', '1980-01-01']) + ts_neg = 
pd.to_datetime(["1950-01-01", "1950-01-01"]) + ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]) # General tests expected = pd.Timestamp.max.value - ts_pos[1].value @@ -1585,7 +1729,7 @@ def test_datetimeindex_sub_datetimeindex_overflow(self): expected = pd.Timestamp.min.value - ts_neg[1].value result = dtimin - ts_neg assert result[1].value == expected - msg = 'Overflow in int64 addition' + msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): dtimax - ts_neg @@ -1594,26 +1738,25 @@ def test_datetimeindex_sub_datetimeindex_overflow(self): # Edge cases tmin = pd.to_datetime([pd.Timestamp.min]) - t1 = tmin + pd.Timedelta.max + pd.Timedelta('1us') + t1 = tmin + pd.Timedelta.max + pd.Timedelta("1us") with pytest.raises(OverflowError, match=msg): t1 - tmin tmax = pd.to_datetime([pd.Timestamp.max]) - t2 = tmax + pd.Timedelta.min - pd.Timedelta('1us') + t2 = tmax + pd.Timedelta.min - pd.Timedelta("1us") with pytest.raises(OverflowError, match=msg): tmax - t2 class TestTimestampSeriesArithmetic: - def test_empty_series_add_sub(self): # GH#13844 - a = Series(dtype='M8[ns]') - b = Series(dtype='m8[ns]') + a = Series(dtype="M8[ns]") + b = Series(dtype="m8[ns]") tm.assert_series_equal(a, a + b) tm.assert_series_equal(a, a - b) tm.assert_series_equal(a, b + a) - msg = 'cannot subtract' + msg = "cannot subtract" with pytest.raises(TypeError, match=msg): b - a @@ -1624,11 +1767,21 @@ def test_operators_datetimelike(self): td1.iloc[2] = np.nan # ## datetime64 ### - dt1 = Series([pd.Timestamp('20111230'), pd.Timestamp('20120101'), - pd.Timestamp('20120103')]) + dt1 = Series( + [ + pd.Timestamp("20111230"), + pd.Timestamp("20120101"), + pd.Timestamp("20120103"), + ] + ) dt1.iloc[2] = np.nan - dt2 = Series([pd.Timestamp('20111231'), pd.Timestamp('20120102'), - pd.Timestamp('20120104')]) + dt2 = Series( + [ + pd.Timestamp("20111231"), + pd.Timestamp("20120102"), + pd.Timestamp("20120104"), + ] + ) dt1 - dt2 dt2 - dt1 @@ -1648,7 +1801,7 @@ def test_dt64ser_sub_datetime_dtype(self): dt = datetime(1993, 6, 22, 13, 30) ser = Series([ts]) result = pd.to_timedelta(np.abs(ser - dt)) - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" # ------------------------------------------------------------- # TODO: This next block of tests came from tests.series.test_operators, @@ -1666,8 +1819,9 @@ def check(get_ser, test_ser): op = getattr(get_ser, op_str, None) # Previously, _validate_for_numeric_binop in core/indexes/base.py # did this for us. - with pytest.raises(TypeError, - match='operate|[cC]annot|unsupported operand'): + with pytest.raises( + TypeError, match="operate|[cC]annot|unsupported operand" + ): op(test_ser) # ## timedelta64 ### @@ -1675,50 +1829,51 @@ def check(get_ser, test_ser): td1.iloc[2] = np.nan # ## datetime64 ### - dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), - Timestamp('20120103')]) + dt1 = Series( + [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + ) dt1.iloc[2] = np.nan - dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), - Timestamp('20120104')]) - if op_str not in ['__sub__', '__rsub__']: + dt2 = Series( + [Timestamp("20111231"), Timestamp("20120102"), Timestamp("20120104")] + ) + if op_str not in ["__sub__", "__rsub__"]: check(dt1, dt2) # ## datetime64 with timetimedelta ### # TODO(jreback) __rsub__ should raise? 
- if op_str not in ['__add__', '__radd__', '__sub__']: + if op_str not in ["__add__", "__radd__", "__sub__"]: check(dt1, td1) # 8260, 10763 # datetime64 with tz - tz = 'US/Eastern' - dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz=tz), name='foo') + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H')) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) td2 = td1.copy() td2.iloc[1] = np.nan - if op_str not in ['__add__', '__radd__', '__sub__', '__rsub__']: + if op_str not in ["__add__", "__radd__", "__sub__", "__rsub__"]: check(dt2, td2) def test_sub_single_tz(self): # GH#12290 - s1 = Series([pd.Timestamp('2016-02-10', tz='America/Sao_Paulo')]) - s2 = Series([pd.Timestamp('2016-02-08', tz='America/Sao_Paulo')]) + s1 = Series([pd.Timestamp("2016-02-10", tz="America/Sao_Paulo")]) + s2 = Series([pd.Timestamp("2016-02-08", tz="America/Sao_Paulo")]) result = s1 - s2 - expected = Series([Timedelta('2days')]) + expected = Series([Timedelta("2days")]) tm.assert_series_equal(result, expected) result = s2 - s1 - expected = Series([Timedelta('-2days')]) + expected = Series([Timedelta("-2days")]) tm.assert_series_equal(result, expected) def test_dt64tz_series_sub_dtitz(self): # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series # (with same tz) raises, fixed by #19024 - dti = pd.date_range('1999-09-30', periods=10, tz='US/Pacific') + dti = pd.date_range("1999-09-30", periods=10, tz="US/Pacific") ser = pd.Series(dti) - expected = pd.Series(pd.TimedeltaIndex(['0days'] * 10)) + expected = pd.Series(pd.TimedeltaIndex(["0days"] * 10)) res = dti - ser tm.assert_series_equal(res, expected) @@ -1729,68 +1884,78 @@ def test_sub_datetime_compat(self): # see GH#14088 s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT]) dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) - exp = Series([Timedelta('1 days'), pd.NaT]) + exp = Series([Timedelta("1 days"), pd.NaT]) tm.assert_series_equal(s - dt, exp) tm.assert_series_equal(s - Timestamp(dt), exp) def test_dt64_series_add_mixed_tick_DateOffset(self): # GH#4532 # operate with pd.offsets - s = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + s = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) result = s + pd.offsets.Milli(5) result2 = pd.offsets.Milli(5) + s - expected = Series([Timestamp('20130101 9:01:00.005'), - Timestamp('20130101 9:02:00.005')]) + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5) - expected = Series([Timestamp('20130101 9:06:00.005'), - Timestamp('20130101 9:07:00.005')]) + expected = Series( + [Timestamp("20130101 9:06:00.005"), Timestamp("20130101 9:07:00.005")] + ) tm.assert_series_equal(result, expected) def test_datetime64_ops_nat(self): # GH#11349 - datetime_series = Series([NaT, Timestamp('19900315')]) - nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]') - single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]') + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") # subtraction - tm.assert_series_equal(-NaT + datetime_series, - 
nat_series_dtype_timestamp) - msg = 'Unary negative expects' + tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) + msg = "Unary negative expects" with pytest.raises(TypeError, match=msg): -single_nat_dtype_datetime + datetime_series - tm.assert_series_equal(-NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) + tm.assert_series_equal( + -NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) with pytest.raises(TypeError, match=msg): -single_nat_dtype_datetime + nat_series_dtype_timestamp # addition - tm.assert_series_equal(nat_series_dtype_timestamp + NaT, - nat_series_dtype_timestamp) - tm.assert_series_equal(NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timestamp + NaT, - nat_series_dtype_timestamp) - tm.assert_series_equal(NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) # ------------------------------------------------------------- # Invalid Operations # TODO: this block also needs to be de-duplicated and parametrized - @pytest.mark.parametrize('dt64_series', [ - Series([Timestamp('19900315'), Timestamp('19900315')]), - Series([pd.NaT, Timestamp('19900315')]), - Series([pd.NaT, pd.NaT], dtype='datetime64[ns]')]) - @pytest.mark.parametrize('one', [1, 1.0, np.array(1)]) + @pytest.mark.parametrize( + "dt64_series", + [ + Series([Timestamp("19900315"), Timestamp("19900315")]), + Series([pd.NaT, Timestamp("19900315")]), + Series([pd.NaT, pd.NaT], dtype="datetime64[ns]"), + ], + ) + @pytest.mark.parametrize("one", [1, 1.0, np.array(1)]) def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): # multiplication - msg = 'cannot perform .* with this index type' + msg = "cannot perform .* with this index type" with pytest.raises(TypeError, match=msg): dt64_series * one with pytest.raises(TypeError, match=msg): @@ -1802,21 +1967,24 @@ def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): with pytest.raises(TypeError, match=msg): one / dt64_series - @pytest.mark.parametrize('op', ['__add__', '__radd__', - '__sub__', '__rsub__']) - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + @pytest.mark.parametrize("op", ["__add__", "__radd__", "__sub__", "__rsub__"]) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_dt64_series_add_intlike(self, tz, op): # GH#19123 - dti = pd.DatetimeIndex(['2016-01-02', '2016-02-03', 'NaT'], tz=tz) + dti = pd.DatetimeIndex(["2016-01-02", "2016-02-03", "NaT"], tz=tz) ser = Series(dti) - other = Series([20, 30, 40], dtype='uint8') + other = Series([20, 30, 40], dtype="uint8") method = getattr(ser, op) - msg = '|'.join(['incompatible type for a .* operation', - 'cannot evaluate a numeric op', - 'ufunc .* cannot use operands', - 'cannot (add|subtract)']) + msg = "|".join( + [ + "incompatible type for a .* operation", + "cannot evaluate a numeric op", + "ufunc .* cannot use operands", + "cannot (add|subtract)", + ] + ) with pytest.raises(TypeError, match=msg): method(1) with pytest.raises(TypeError, match=msg): @@ -1830,13 +1998,12 @@ def test_dt64_series_add_intlike(self, tz, op): # Timezone-Centric Tests def 
test_operators_datetimelike_with_timezones(self): - tz = 'US/Eastern' - dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz=tz), name='foo') + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H')) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) td2 = td1.copy() td2.iloc[1] = np.nan @@ -1885,7 +2052,7 @@ def test_operators_datetimelike_with_timezones(self): result = dt2 - td2 exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz) tm.assert_series_equal(result, exp) - msg = 'cannot (add|subtract)' + msg = "cannot (add|subtract)" with pytest.raises(TypeError, match=msg): td1 - dt1 with pytest.raises(TypeError, match=msg): @@ -1900,40 +2067,32 @@ class TestDatetimeIndexArithmetic: def test_dti_add_int(self, tz_naive_fixture, one): # Variants of `one` for #19012 tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = rng + one - expected = pd.date_range('2000-01-01 10:00', freq='H', - periods=10, tz=tz) + expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(result, expected) def test_dti_iadd_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) - expected = pd.date_range('2000-01-01 10:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) + expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rng += one tm.assert_index_equal(rng, expected) def test_dti_sub_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = rng - one - expected = pd.date_range('2000-01-01 08:00', freq='H', - periods=10, tz=tz) + expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(result, expected) def test_dti_isub_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) - expected = pd.date_range('2000-01-01 08:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) + expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rng -= one tm.assert_index_equal(rng, expected) @@ -1941,16 +2100,15 @@ def test_dti_isub_int(self, tz_naive_fixture, one): # ------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('freq', ['H', 'D']) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("freq", ["H", "D"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_tick(self, int_holder, freq): # GH#19959 - dti = pd.date_range('2016-01-01', periods=2, freq=freq) + dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) with tm.assert_produces_warning(FutureWarning, 
check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] - for n in range(len(dti))]) + expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) result = dti + other tm.assert_index_equal(result, expected) @@ -1958,16 +2116,15 @@ def test_dti_add_intarray_tick(self, int_holder, freq): result = other + dti tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', ['W', 'M', 'MS', 'Q']) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("freq", ["W", "M", "MS", "Q"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_non_tick(self, int_holder, freq): # GH#19959 - dti = pd.date_range('2016-01-01', periods=2, freq=freq) + dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] - for n in range(len(dti))]) + expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) # tm.assert_produces_warning does not handle cases where we expect # two warnings, in this case PerformanceWarning and FutureWarning. @@ -1982,13 +2139,13 @@ def test_dti_add_intarray_non_tick(self, int_holder, freq): result = other + dti tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_no_freq(self, int_holder): # GH#19959 - dti = pd.DatetimeIndex(['2016-01-01', 'NaT', '2017-04-05 06:07:08']) + dti = pd.DatetimeIndex(["2016-01-01", "NaT", "2017-04-05 06:07:08"]) other = int_holder([9, 4, -1]) - nfmsg = 'Cannot shift with no freq' - tmsg = 'cannot subtract DatetimeArray from' + nfmsg = "Cannot shift with no freq" + tmsg = "cannot subtract DatetimeArray from" with pytest.raises(NullFrequencyError, match=nfmsg): dti + other with pytest.raises(NullFrequencyError, match=nfmsg): @@ -2004,9 +2161,9 @@ def test_dti_add_intarray_no_freq(self, int_holder): def test_dti_add_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz) + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) # add with TimdeltaIndex result = dti + tdi @@ -2025,40 +2182,40 @@ def test_dti_add_tdi(self, tz_naive_fixture): def test_dti_iadd_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz) + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) # iadd with TimdeltaIndex - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result += tdi tm.assert_index_equal(result, expected) - result = pd.timedelta_range('0 days', periods=10) + result = pd.timedelta_range("0 days", periods=10) result += dti tm.assert_index_equal(result, expected) # iadd with timedelta64 array - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result += tdi.values 
tm.assert_index_equal(result, expected) - result = pd.timedelta_range('0 days', periods=10) + result = pd.timedelta_range("0 days", periods=10) result += dti tm.assert_index_equal(result, expected) def test_dti_sub_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D') + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") # sub with TimedeltaIndex result = dti - tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract .*TimedeltaArray' + msg = "cannot subtract .*TimedeltaArray" with pytest.raises(TypeError, match=msg): tdi - dti @@ -2066,34 +2223,38 @@ def test_dti_sub_tdi(self, tz_naive_fixture): result = dti - tdi.values tm.assert_index_equal(result, expected) - msg = 'cannot subtract DatetimeArray from' + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): tdi.values - dti def test_dti_isub_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D') + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") # isub with TimedeltaIndex - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result -= tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract .* from a TimedeltaArray' + msg = "cannot subtract .* from a TimedeltaArray" with pytest.raises(TypeError, match=msg): tdi -= dti # isub with timedelta64 array - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result -= tdi.values tm.assert_index_equal(result, expected) - msg = '|'.join(['cannot perform __neg__ with this index type:', - 'ufunc subtract cannot use operands with types', - 'cannot subtract DatetimeArray from']) + msg = "|".join( + [ + "cannot perform __neg__ with this index type:", + "ufunc subtract cannot use operands with types", + "cannot subtract DatetimeArray from", + ] + ) with pytest.raises(TypeError, match=msg): tdi.values -= dti @@ -2102,20 +2263,24 @@ def test_dti_isub_tdi(self, tz_naive_fixture): # TODO: A couple other tests belong in this section. Move them in # A PR where there isn't already a giant diff. 
- @pytest.mark.parametrize('addend', [ - datetime(2011, 1, 1), - DatetimeIndex(['2011-01-01', '2011-01-02']), - DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize('US/Eastern'), - np.datetime64('2011-01-01'), - Timestamp('2011-01-01') - ], ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize( + "addend", + [ + datetime(2011, 1, 1), + DatetimeIndex(["2011-01-01", "2011-01-02"]), + DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize("US/Eastern"), + np.datetime64("2011-01-01"), + Timestamp("2011-01-01"), + ], + ids=lambda x: type(x).__name__, + ) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_add_datetimelike_and_dti(self, addend, tz): # GH#9631 - dti = DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize(tz) - msg = ('cannot add DatetimeArray and {0}' - .format(type(addend).__name__)).replace('DatetimeIndex', - 'DatetimeArray') + dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) + msg = ( + "cannot add DatetimeArray and {0}".format(type(addend).__name__) + ).replace("DatetimeIndex", "DatetimeArray") with pytest.raises(TypeError, match=msg): dti + addend with pytest.raises(TypeError, match=msg): @@ -2127,9 +2292,9 @@ def test_sub_dti_dti(self): # previously performed setop (deprecated in 0.16.0), now changed to # return subtraction -> TimeDeltaIndex (GH ...) - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + dti = date_range("20130101", periods=3) + dti_tz = date_range("20130101", periods=3).tz_localize("US/Eastern") + dti_tz2 = date_range("20130101", periods=3).tz_localize("UTC") expected = TimedeltaIndex([0, 0, 0]) result = dti - dti @@ -2137,7 +2302,7 @@ def test_sub_dti_dti(self): result = dti_tz - dti_tz tm.assert_index_equal(result, expected) - msg = 'DatetimeArray subtraction must have the same timezones or' + msg = "DatetimeArray subtraction must have the same timezones or" with pytest.raises(TypeError, match=msg): dti_tz - dti @@ -2152,16 +2317,16 @@ def test_sub_dti_dti(self): tm.assert_index_equal(dti, expected) # different length raises ValueError - dti1 = date_range('20130101', periods=3) - dti2 = date_range('20130101', periods=4) - msg = 'cannot add indices of unequal length' + dti1 = date_range("20130101", periods=3) + dti2 = date_range("20130101", periods=4) + msg = "cannot add indices of unequal length" with pytest.raises(ValueError, match=msg): dti1 - dti2 # NaN propagation - dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) - dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) - expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]) + dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]) + expected = TimedeltaIndex(["1 days", np.nan, np.nan]) result = dti2 - dti1 tm.assert_index_equal(result, expected) @@ -2169,14 +2334,18 @@ def test_sub_dti_dti(self): # TODO: Most of this block is moved from series or frame tests, needs # cleanup, box-parametrization, and de-duplication - @pytest.mark.parametrize('op', [operator.add, operator.sub]) + @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_timedelta64_equal_timedelta_supported_ops(self, op): - ser = Series([Timestamp('20130301'), - Timestamp('20130228 23:00:00'), - Timestamp('20130228 22:00:00'), - Timestamp('20130228 21:00:00')]) + ser = Series( + [ + Timestamp("20130301"), + Timestamp("20130228 
23:00:00"), + Timestamp("20130228 22:00:00"), + Timestamp("20130228 21:00:00"), + ] + ) - intervals = ['D', 'h', 'm', 's', 'us'] + intervals = ["D", "h", "m", "s", "us"] # TODO: unused # npy16_mappings = {'D': 24 * 60 * 60 * 1000000, @@ -2191,8 +2360,7 @@ def timedelta64(*args): for d, h, m, s, us in product(*([range(2)] * 5)): nptd = timedelta64(d, h, m, s, us) - pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, - microseconds=us) + pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, microseconds=us) lhs = op(ser, nptd) rhs = op(ser, pytd) @@ -2200,106 +2368,119 @@ def timedelta64(*args): def test_ops_nat_mixed_datetime64_timedelta64(self): # GH#11349 - timedelta_series = Series([NaT, Timedelta('1s')]) - datetime_series = Series([NaT, Timestamp('19900315')]) - nat_series_dtype_timedelta = Series([NaT, NaT], - dtype='timedelta64[ns]') - nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]') - single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]') - single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]') + timedelta_series = Series([NaT, Timedelta("1s")]) + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") # subtraction - tm.assert_series_equal(datetime_series - single_nat_dtype_datetime, - nat_series_dtype_timedelta) + tm.assert_series_equal( + datetime_series - single_nat_dtype_datetime, nat_series_dtype_timedelta + ) - tm.assert_series_equal(datetime_series - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(-single_nat_dtype_timedelta + datetime_series, - nat_series_dtype_timestamp) + tm.assert_series_equal( + datetime_series - single_nat_dtype_timedelta, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + datetime_series, nat_series_dtype_timestamp + ) # without a Series wrapping the NaT, it is ambiguous # whether it is a datetime64 or timedelta64 # defaults to interpreting it as timedelta64 - tm.assert_series_equal(nat_series_dtype_timestamp - - single_nat_dtype_datetime, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timestamp - - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(-single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - msg = 'cannot subtract a datelike' + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_datetime, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + msg = "cannot subtract a datelike" with pytest.raises(TypeError, match=msg): timedelta_series - single_nat_dtype_datetime # addition - tm.assert_series_equal(nat_series_dtype_timestamp + - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timestamp + - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - 
nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_datetime, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_datetime + - nat_series_dtype_timedelta, - nat_series_dtype_timestamp) + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_datetime, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_datetime + nat_series_dtype_timedelta, + nat_series_dtype_timestamp, + ) def test_ufunc_coercions(self): - idx = date_range('2011-01-01', periods=3, freq='2D', name='x') + idx = date_range("2011-01-01", periods=3, freq="2D", name="x") - delta = np.timedelta64(1, 'D') + delta = np.timedelta64(1, "D") for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range('2011-01-02', periods=3, freq='2D', name='x') + exp = date_range("2011-01-02", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '2D' + assert result.freq == "2D" for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range('2010-12-31', periods=3, freq='2D', name='x') + exp = date_range("2010-12-31", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '2D' + assert result.freq == "2D" - delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), - np.timedelta64(3, 'D')]) + delta = np.array( + [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] + ) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], - freq='3D', name='x') + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == '3D' + assert result.freq == "3D" for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], - freq='D', name='x') + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == 'D' + assert result.freq == "D" - @pytest.mark.parametrize('names', [('foo', None, None), - ('baz', 'bar', None), - ('bar', 'bar', 'bar')]) - @pytest.mark.parametrize('tz', [None, 'America/Chicago']) + @pytest.mark.parametrize( + "names", [("foo", None, None), ("baz", "bar", None), ("bar", "bar", "bar")] + ) + @pytest.mark.parametrize("tz", [None, "America/Chicago"]) def test_dti_add_series(self, tz, names): # GH#13905 - index = DatetimeIndex(['2016-06-28 05:30', '2016-06-28 05:31'], - tz=tz, name=names[0]) - ser = Series([Timedelta(seconds=5)] * 2, - index=index, name=names[1]) - expected = Series(index + Timedelta(seconds=5), - index=index, name=names[2]) + index = DatetimeIndex( + ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0] + ) + ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1]) 
+ expected = Series(index + Timedelta(seconds=5), index=index, name=names[2]) # passing name arg isn't enough when names[2] is None expected.name = names[2] @@ -2315,89 +2496,99 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_add_offset_index(self, tz_naive_fixture, names): # GH#18849, GH#19744 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti + other - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))], - name=names[2], freq='infer') + expected = DatetimeIndex( + [dti[n] + other[n] for n in range(len(dti))], name=names[2], freq="infer" + ) tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res2 = other + dti tm.assert_index_equal(res2, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_sub_offset_index(self, tz_naive_fixture, names): # GH#18824, GH#19744 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti - other - expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))], - name=names[2], freq='infer') + expected = DatetimeIndex( + [dti[n] - other[n] for n in range(len(dti))], name=names[2], freq="infer" + ) tm.assert_index_equal(res, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_with_offset_series(self, tz_naive_fixture, names): # GH#18849 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - expected_add = Series([dti[n] + other[n] for n in range(len(dti))], - name=names[2]) + expected_add = Series( + [dti[n] + other[n] for n in range(len(dti))], name=names[2] + ) - with tm.assert_produces_warning(PerformanceWarning, - 
clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti + other tm.assert_series_equal(res, expected_add) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res2 = other + dti tm.assert_series_equal(res2, expected_add) - expected_sub = Series([dti[n] - other[n] for n in range(len(dti))], - name=names[2]) + expected_sub = Series( + [dti[n] - other[n] for n in range(len(dti))], name=names[2] + ) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res3 = dti - other tm.assert_series_equal(res3, expected_sub) -@pytest.mark.parametrize('years', [-1, 0, 1]) -@pytest.mark.parametrize('months', [-2, 0, 2]) +@pytest.mark.parametrize("years", [-1, 0, 1]) +@pytest.mark.parametrize("months", [-2, 0, 2]) def test_shift_months(years, months): - dti = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31')]) + dti = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + ] + ) actual = DatetimeIndex(shift_months(dti.asi8, years * 12 + months)) - raw = [x + pd.offsets.DateOffset(years=years, months=months) - for x in dti] + raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] expected = DatetimeIndex(raw) tm.assert_index_equal(actual, expected) @@ -2406,12 +2597,13 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("lh,rh", [ - (SubDatetime(2000, 1, 1), - Timedelta(hours=1)), - (Timedelta(hours=1), - SubDatetime(2000, 1, 1)) -]) +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) def test_dt_subclass_add_timedelta(lh, rh): # GH 25851 # ensure that subclassed datetime works for diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 31c7f47bcf5bd2..7dcd0cc820061b 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -34,7 +34,7 @@ def test_operator_series_comparison_zerorank(self): def test_df_numeric_cmp_dt64_raises(self): # GH#8932, GH#22163 ts = pd.Timestamp.now() - df = pd.DataFrame({'x': range(5)}) + df = pd.DataFrame({"x": range(5)}) with pytest.raises(TypeError): df > ts with pytest.raises(TypeError): @@ -52,30 +52,46 @@ def test_compare_invalid(self): # ops testing a = pd.Series(np.random.randn(5), name=0) b = pd.Series(np.random.randn(5)) - b.name = pd.Timestamp('2000-01-01') + b.name = pd.Timestamp("2000-01-01") tm.assert_series_equal(a / b, 1 / (b / a)) # ------------------------------------------------------------------ # Numeric dtypes Arithmetic with Timedelta Scalar + class TestNumericArraylikeArithmeticWithTimedeltaLike: # TODO: also check name retentention - @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) - @pytest.mark.parametrize('left', [ - pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) - for dtype in ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8', - 'f2', 'f4', 'f8'] - for cls in [pd.Series, pd.Index]], - ids=lambda x: type(x).__name__ + 
str(x.dtype)) + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", + [pd.RangeIndex(10, 40, 10)] + + [ + cls([10, 20, 30], dtype=dtype) + for dtype in [ + "i1", + "i2", + "i4", + "i8", + "u1", + "u2", + "u4", + "u8", + "f2", + "f4", + "f8", + ] + for cls in [pd.Series, pd.Index] + ], + ids=lambda x: type(x).__name__ + str(x.dtype), + ) def test_mul_td64arr(self, left, box_cls): # GH#22390 - right = np.array([1, 2, 3], dtype='m8[s]') + right = np.array([1, 2, 3], dtype="m8[s]") right = box_cls(right) - expected = pd.TimedeltaIndex(['10s', '40s', '90s']) + expected = pd.TimedeltaIndex(["10s", "40s", "90s"]) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) @@ -86,20 +102,35 @@ def test_mul_td64arr(self, left, box_cls): tm.assert_equal(result, expected) # TODO: also check name retentention - @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) - @pytest.mark.parametrize('left', [ - pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) - for dtype in ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8', - 'f2', 'f4', 'f8'] - for cls in [pd.Series, pd.Index]], - ids=lambda x: type(x).__name__ + str(x.dtype)) + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", + [pd.RangeIndex(10, 40, 10)] + + [ + cls([10, 20, 30], dtype=dtype) + for dtype in [ + "i1", + "i2", + "i4", + "i8", + "u1", + "u2", + "u4", + "u8", + "f2", + "f4", + "f8", + ] + for cls in [pd.Series, pd.Index] + ], + ids=lambda x: type(x).__name__ + str(x.dtype), + ) def test_div_td64arr(self, left, box_cls): # GH#22390 - right = np.array([10, 40, 90], dtype='m8[s]') + right = np.array([10, 40, 90], dtype="m8[s]") right = box_cls(right) - expected = pd.TimedeltaIndex(['1s', '2s', '3s']) + expected = pd.TimedeltaIndex(["1s", "2s", "3s"]) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) @@ -118,24 +149,28 @@ def test_div_td64arr(self, left, box_cls): # TODO: de-duplicate with test_numeric_arr_mul_tdscalar def test_ops_series(self): # regression test for G#H8813 - td = Timedelta('1 day') + td = Timedelta("1 day") other = pd.Series([1, 2]) - expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) + expected = pd.Series(pd.to_timedelta(["1 day", "2 days"])) tm.assert_series_equal(expected, td * other) tm.assert_series_equal(expected, other * td) # TODO: also test non-nanosecond timedelta64 and Tick objects; # see test_numeric_arr_rdiv_tdscalar for note on these failing - @pytest.mark.parametrize('scalar_td', [ - Timedelta(days=1), - Timedelta(days=1).to_timedelta64(), - Timedelta(days=1).to_pytimedelta()], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "scalar_td", + [ + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta(), + ], + ids=lambda x: type(x).__name__, + ) def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): # GH#19333 index = numeric_idx - expected = pd.timedelta_range('0 days', '4 days') + expected = pd.timedelta_range("0 days", "4 days") index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) @@ -149,7 +184,7 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): index = numeric_idx[1:3] - expected = TimedeltaIndex(['3 Days', '36 Hours']) + expected = TimedeltaIndex(["3 Days", "36 Hours"]) index = tm.box_expected(index, box) expected = 
tm.box_expected(expected, box) @@ -160,15 +195,19 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): with pytest.raises(TypeError): index / three_days - @pytest.mark.parametrize('other', [ - pd.Timedelta(hours=31), - pd.Timedelta(hours=31).to_pytimedelta(), - pd.Timedelta(hours=31).to_timedelta64(), - pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'), - np.timedelta64('NaT'), - np.timedelta64('NaT', 'D'), - pd.offsets.Minute(3), - pd.offsets.Second(0)]) + @pytest.mark.parametrize( + "other", + [ + pd.Timedelta(hours=31), + pd.Timedelta(hours=31).to_pytimedelta(), + pd.Timedelta(hours=31).to_timedelta64(), + pd.Timedelta(hours=31).to_timedelta64().astype("m8[h]"), + np.timedelta64("NaT"), + np.timedelta64("NaT", "D"), + pd.offsets.Minute(3), + pd.offsets.Second(0), + ], + ) def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): left = tm.box_expected(numeric_idx, box) with pytest.raises(TypeError): @@ -184,46 +223,41 @@ def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): # ------------------------------------------------------------------ # Arithmetic -class TestDivisionByZero: +class TestDivisionByZero: def test_div_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) result = idx / zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_floordiv_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) result = idx // zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_mod_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], - dtype=np.float64) + expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) result = idx % zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_divmod_zero(self, zero, numeric_idx): idx = numeric_idx - exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) - exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], - dtype=np.float64) + exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) result = divmod(idx, zero) tm.assert_index_equal(result[0], exleft) @@ -231,19 +265,34 @@ def test_divmod_zero(self, zero, numeric_idx): # ------------------------------------------------------------------ - @pytest.mark.parametrize('dtype2', [ - np.int64, np.int32, np.int16, np.int8, - np.float64, np.float32, np.float16, - np.uint64, np.uint32, np.uint16, np.uint8]) - @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) + @pytest.mark.parametrize( + "dtype2", + [ + np.int64, + np.int32, + np.int16, + np.int8, 
+ np.float64, + np.float32, + np.float16, + np.uint64, + np.uint32, + np.uint16, + np.uint8, + ], + ) + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) def test_ser_div_ser(self, dtype1, dtype2): # no longer do integer div for any ops, but deal with the 0's - first = Series([3, 4, 5, 8], name='first').astype(dtype1) - second = Series([0, 0, 0, 3], name='second').astype(dtype2) - - with np.errstate(all='ignore'): - expected = Series(first.values.astype(np.float64) / second.values, - dtype='float64', name=None) + first = Series([3, 4, 5, 8], name="first").astype(dtype1) + second = Series([0, 0, 0, 3], name="second").astype(dtype2) + + with np.errstate(all="ignore"): + expected = Series( + first.values.astype(np.float64) / second.values, + dtype="float64", + name=None, + ) expected.iloc[0:3] = np.inf result = first / second @@ -254,7 +303,7 @@ def test_rdiv_zero_compat(self): # GH#8674 zero_array = np.array([0] * 5) data = np.random.randn(5) - expected = Series([0.] * 5) + expected = Series([0.0] * 5) result = zero_array / Series(data) tm.assert_series_equal(result, expected) @@ -267,55 +316,54 @@ def test_rdiv_zero_compat(self): def test_div_zero_inf_signs(self): # GH#9144, inf signing - ser = Series([-1, 0, 1], name='first') - expected = Series([-np.inf, np.nan, np.inf], name='first') + ser = Series([-1, 0, 1], name="first") + expected = Series([-np.inf, np.nan, np.inf], name="first") result = ser / 0 tm.assert_series_equal(result, expected) def test_rdiv_zero(self): # GH#9144 - ser = Series([-1, 0, 1], name='first') - expected = Series([0.0, np.nan, 0.0], name='first') + ser = Series([-1, 0, 1], name="first") + expected = Series([0.0, np.nan, 0.0], name="first") result = 0 / ser tm.assert_series_equal(result, expected) def test_floordiv_div(self): # GH#9144 - ser = Series([-1, 0, 1], name='first') + ser = Series([-1, 0, 1], name="first") result = ser // 0 - expected = Series([-np.inf, np.nan, np.inf], name='first') + expected = Series([-np.inf, np.nan, np.inf], name="first") tm.assert_series_equal(result, expected) def test_df_div_zero_df(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df / df first = pd.Series([1.0, 1.0, 1.0, 1.0]) second = pd.Series([np.nan, np.nan, np.nan, 1]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) tm.assert_frame_equal(result, expected) def test_df_div_zero_array(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) first = pd.Series([1.0, 1.0, 1.0, 1.0]) second = pd.Series([np.nan, np.nan, np.nan, 1]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) - with np.errstate(all='ignore'): - arr = df.values.astype('float') / df.values - result = pd.DataFrame(arr, index=df.index, - columns=df.columns) + with np.errstate(all="ignore"): + arr = df.values.astype("float") / df.values + result = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) def test_df_div_zero_int(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df / 0 
expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) @@ -323,10 +371,9 @@ def test_df_div_zero_int(self): tm.assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): - arr = df.values.astype('float64') / 0 - result2 = pd.DataFrame(arr, index=df.index, - columns=df.columns) + with np.errstate(all="ignore"): + arr = df.values.astype("float64") / 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result2, expected) def test_df_div_zero_series_does_not_commute(self): @@ -342,45 +389,44 @@ def test_df_div_zero_series_does_not_commute(self): def test_df_mod_zero_df(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) # this is technically wrong, as the integer portion is coerced to float # ### - first = pd.Series([0, 0, 0, 0], dtype='float64') + first = pd.Series([0, 0, 0, 0], dtype="float64") second = pd.Series([np.nan, np.nan, np.nan, 0]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) result = df % df tm.assert_frame_equal(result, expected) def test_df_mod_zero_array(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) # this is technically wrong, as the integer portion is coerced to float # ### - first = pd.Series([0, 0, 0, 0], dtype='float64') + first = pd.Series([0, 0, 0, 0], dtype="float64") second = pd.Series([np.nan, np.nan, np.nan, 0]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): arr = df.values % df.values - result2 = pd.DataFrame(arr, index=df.index, - columns=df.columns, dtype='float64') + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns, dtype="float64") result2.iloc[0:3, 1] = np.nan tm.assert_frame_equal(result2, expected) def test_df_mod_zero_int(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df % 0 expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): - arr = df.values.astype('float64') % 0 + with np.errstate(all="ignore"): + arr = df.values.astype("float64") % 0 result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result2, expected) @@ -398,14 +444,20 @@ class TestMultiplicationDivision: # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ # for non-timestamp/timedelta/period dtypes - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Index.__div__ always " - "raises", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail( + reason="Index.__div__ always " "raises", raises=TypeError + ), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_divide_decimal(self, box): # resolves issue GH#9787 ser = Series([Decimal(10)]) @@ -425,8 +477,8 @@ def 
test_div_equiv_binop(self): # Test Series.div as well as Series.__div__ # float/integer issue # GH#7785 - first = Series([1, 0], name='first') - second = Series([-0.01, -0.02], name='second') + first = Series([1, 0], name="first") + second = Series([-0.01, -0.02], name="second") expected = Series([-0.01, -np.inf]) result = second.div(first) @@ -438,14 +490,14 @@ def test_div_equiv_binop(self): def test_div_int(self, numeric_idx): idx = numeric_idx result = idx / 1 - expected = idx.astype('float64') + expected = idx.astype("float64") tm.assert_index_equal(result, expected) result = idx / 2 expected = Index(idx.values / 2) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul, operator.floordiv]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) def test_mul_int_identity(self, op, numeric_idx, box): idx = numeric_idx idx = tm.box_expected(idx, box) @@ -457,10 +509,10 @@ def test_mul_int_array(self, numeric_idx): idx = numeric_idx didx = idx * idx - result = idx * np.array(5, dtype='int64') + result = idx * np.array(5, dtype="int64") tm.assert_index_equal(result, idx * 5) - arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64' + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" result = idx * np.arange(5, dtype=arr_dtype) tm.assert_index_equal(result, didx) @@ -468,13 +520,13 @@ def test_mul_int_series(self, numeric_idx): idx = numeric_idx didx = idx * idx - arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64' + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" result = idx * Series(np.arange(5, dtype=arr_dtype)) tm.assert_series_equal(result, Series(didx)) def test_mul_float_series(self, numeric_idx): idx = numeric_idx - rng5 = np.arange(5, dtype='float64') + rng5 = np.arange(5, dtype="float64") result = idx * Series(rng5 + 0.1) expected = Series(rng5 * (rng5 + 0.1)) @@ -490,7 +542,7 @@ def test_mul_index(self, numeric_idx): def test_mul_datelike_raises(self, numeric_idx): idx = numeric_idx with pytest.raises(TypeError): - idx * pd.date_range('20130101', periods=5) + idx * pd.date_range("20130101", periods=5) def test_mul_size_mismatch_raises(self, numeric_idx): idx = numeric_idx @@ -499,7 +551,7 @@ def test_mul_size_mismatch_raises(self, numeric_idx): with pytest.raises(ValueError): idx * np.array([1, 2]) - @pytest.mark.parametrize('op', [operator.pow, ops.rpow]) + @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) def test_pow_float(self, op, numeric_idx, box): # test power calculations both ways, GH#14973 idx = numeric_idx @@ -526,7 +578,7 @@ def test_divmod_scalar(self, numeric_idx): idx = numeric_idx result = divmod(idx, 2) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, 2) expected = Index(div), Index(mod) @@ -538,7 +590,7 @@ def test_divmod_ndarray(self, numeric_idx): other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 result = divmod(idx, other) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, other) expected = Index(div), Index(mod) @@ -550,18 +602,19 @@ def test_divmod_series(self, numeric_idx): other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 result = divmod(idx, Series(other)) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, other) expected = Series(div), Series(mod) for r, e in zip(result, expected): tm.assert_series_equal(r, e) - @pytest.mark.parametrize('other', 
[np.nan, 7, -23, 2.718, -3.14, np.inf]) + @pytest.mark.parametrize("other", [np.nan, 7, -23, 2.718, -3.14, np.inf]) def test_ops_np_scalar(self, other): vals = np.random.randn(5, 3) - f = lambda x: pd.DataFrame(x, index=list('ABCDE'), - columns=['jim', 'joe', 'jolie']) + f = lambda x: pd.DataFrame( + x, index=list("ABCDE"), columns=["jim", "joe", "jolie"] + ) df = f(vals) @@ -574,44 +627,39 @@ def test_ops_np_scalar(self, other): def test_operators_frame(self): # rpow does not work with DataFrame ts = tm.makeTimeSeries() - ts.name = 'ts' + ts.name = "ts" - df = pd.DataFrame({'A': ts}) + df = pd.DataFrame({"A": ts}) - tm.assert_series_equal(ts + ts, ts + df['A'], - check_names=False) - tm.assert_series_equal(ts ** ts, ts ** df['A'], - check_names=False) - tm.assert_series_equal(ts < ts, ts < df['A'], - check_names=False) - tm.assert_series_equal(ts / ts, ts / df['A'], - check_names=False) + tm.assert_series_equal(ts + ts, ts + df["A"], check_names=False) + tm.assert_series_equal(ts ** ts, ts ** df["A"], check_names=False) + tm.assert_series_equal(ts < ts, ts < df["A"], check_names=False) + tm.assert_series_equal(ts / ts, ts / df["A"], check_names=False) # TODO: this came from tests.series.test_analytics, needs cleanup and # de-duplication with test_modulo above def test_modulo2(self): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # GH#3590, modulo as ints - p = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values, - dtype='float64') + p = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values, dtype="float64") expected.iloc[0:3] = np.nan tm.assert_series_equal(result, expected) - result = p['first'] % 0 - expected = Series(np.nan, index=p.index, name='first') + result = p["first"] % 0 + expected = Series(np.nan, index=p.index, name="first") tm.assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values) + p = p.astype("float64") + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values) tm.assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - result2 = p['second'] % p['first'] + p = p.astype("float64") + result = p["first"] % p["second"] + result2 = p["second"] % p["first"] assert not result.equals(result2) # GH#9144 @@ -633,30 +681,26 @@ class TestAdditionSubtraction: # TODO: This came from series.test.test_operators, needs cleanup def test_arith_ops_df_compat(self): # GH#1134 - s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - exp = pd.Series([3.0, 4.0, np.nan, np.nan], - index=list('ABCD'), name='x') + exp = pd.Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x") tm.assert_series_equal(s1 + s2, exp) tm.assert_series_equal(s2 + s1, exp) - exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")) tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) # different length - s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s4 = pd.Series([2, 
2, 2, 2], index=list('ABCD'), name='x') + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - exp = pd.Series([3, 4, 5, np.nan], - index=list('ABCD'), name='x') + exp = pd.Series([3, 4, 5, np.nan], index=list("ABCD"), name="x") tm.assert_series_equal(s3 + s4, exp) tm.assert_series_equal(s4 + s3, exp) - exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")) tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) @@ -664,17 +708,17 @@ def test_arith_ops_df_compat(self): def test_series_frame_radd_bug(self): # GH#353 vals = pd.Series(tm.rands_array(5, 10)) - result = 'foo_' + vals - expected = vals.map(lambda x: 'foo_' + x) + result = "foo_" + vals + expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) - frame = pd.DataFrame({'vals': vals}) - result = 'foo_' + frame - expected = pd.DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) + frame = pd.DataFrame({"vals": vals}) + result = "foo_" + frame + expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)}) tm.assert_frame_equal(result, expected) ts = tm.makeTimeSeries() - ts.name = 'ts' + ts.name = "ts" # really raise this time now = pd.Timestamp.now().to_pydatetime() @@ -694,8 +738,10 @@ def test_datetime64_with_index(self): # GH#4629 # arithmetic datetime64 ops with an index - ser = pd.Series(pd.date_range('20130101', periods=5), - index=pd.date_range('20130101', periods=5)) + ser = pd.Series( + pd.date_range("20130101", periods=5), + index=pd.date_range("20130101", periods=5), + ) expected = ser - ser.index.to_series() result = ser - ser.index tm.assert_series_equal(result, expected) @@ -704,17 +750,18 @@ def test_datetime64_with_index(self): # GH#18850 result = ser - ser.index.to_period() - df = pd.DataFrame(np.random.randn(5, 2), - index=pd.date_range('20130101', periods=5)) - df['date'] = pd.Timestamp('20130102') - df['expected'] = df['date'] - df.index.to_series() - df['result'] = df['date'] - df.index - tm.assert_series_equal(df['result'], df['expected'], check_names=False) + df = pd.DataFrame( + np.random.randn(5, 2), index=pd.date_range("20130101", periods=5) + ) + df["date"] = pd.Timestamp("20130102") + df["expected"] = df["date"] - df.index.to_series() + df["result"] = df["date"] - df.index + tm.assert_series_equal(df["result"], df["expected"], check_names=False) # TODO: taken from tests.frame.test_operators, needs cleanup def test_frame_operators(self, float_frame): frame = float_frame - frame2 = pd.DataFrame(float_frame, columns=['D', 'C', 'B', 'A']) + frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"]) garbage = np.random.random(4) colSeries = pd.Series(garbage, index=np.array(frame.columns)) @@ -742,15 +789,14 @@ def test_frame_operators(self, float_frame): expected = frame2 * 2 tm.assert_frame_equal(added, expected) - df = pd.DataFrame({'a': ['a', None, 'b']}) - tm.assert_frame_equal(df + df, - pd.DataFrame({'a': ['aa', np.nan, 'bb']})) + df = pd.DataFrame({"a": ["a", None, "b"]}) + tm.assert_frame_equal(df + df, pd.DataFrame({"a": ["aa", np.nan, "bb"]})) # Test for issue #10181 - for dtype in ('float', 'int64'): + for dtype in ("float", "int64"): frames = [ pd.DataFrame(dtype=dtype), - pd.DataFrame(columns=['A'], dtype=dtype), + pd.DataFrame(columns=["A"], dtype=dtype), pd.DataFrame(index=[0], dtype=dtype), ] for df in frames: @@ -765,11 +811,10 @@ def _check_op(series, 
other, op, pos_only=False, check_dtype=True): cython_or_numpy = op(left, right) python = left.combine(right, op) - tm.assert_series_equal(cython_or_numpy, python, - check_dtype=check_dtype) + tm.assert_series_equal(cython_or_numpy, python, check_dtype=check_dtype) def check(series, other): - simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod'] + simple_ops = ["add", "sub", "mul", "truediv", "floordiv", "mod"] for opname in simple_ops: _check_op(series, other, getattr(operator, opname)) @@ -781,11 +826,10 @@ def check(series, other): _check_op(series, other, lambda x, y: operator.truediv(y, x)) _check_op(series, other, lambda x, y: operator.floordiv(y, x)) _check_op(series, other, lambda x, y: operator.mul(y, x)) - _check_op(series, other, lambda x, y: operator.pow(y, x), - pos_only=True) + _check_op(series, other, lambda x, y: operator.pow(y, x), pos_only=True) _check_op(series, other, lambda x, y: operator.mod(y, x)) - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) check(tser, tser * 0) check(tser, tser[::2]) @@ -815,7 +859,7 @@ def check(series, other): else: other_np = other other_np = np.asarray(other_np) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expecteds = divmod(series.values, np.asarray(other_np)) for result, expected in zip(results, expecteds): @@ -825,7 +869,7 @@ def check(series, other): assert result.name == series.name tm.assert_index_equal(result.index, series.index) - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) check(tser, tser * 0) check(tser, tser[::2]) @@ -833,88 +877,90 @@ def check(series, other): class TestUFuncCompat: - - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.RangeIndex, - pd.Series]) + @pytest.mark.parametrize( + "holder", + [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.RangeIndex, pd.Series], + ) def test_ufunc_compat(self, holder): box = pd.Series if holder is pd.Series else pd.Index if holder is pd.RangeIndex: idx = pd.RangeIndex(0, 5) else: - idx = holder(np.arange(5, dtype='int64')) + idx = holder(np.arange(5, dtype="int64")) result = np.sin(idx) - expected = box(np.sin(np.arange(5, dtype='int64'))) + expected = box(np.sin(np.arange(5, dtype="int64"))) tm.assert_equal(result, expected) - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.Series]) + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) def test_ufunc_coercions(self, holder): - idx = holder([1, 2, 3, 4, 5], name='x') + idx = holder([1, 2, 3, 4, 5], name="x") box = pd.Series if holder is pd.Series else pd.Index result = np.sqrt(idx) - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = np.divide(idx, 2.) - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + result = np.divide(idx, 2.0) + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) # _evaluate_numeric_binop - result = idx + 2. 
- assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([3., 4., 5., 6., 7.], name='x') + result = idx + 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([3.0, 4.0, 5.0, 6.0, 7.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx - 2. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([-1., 0., 1., 2., 3.], name='x') + result = idx - 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([-1.0, 0.0, 1.0, 2.0, 3.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx * 1. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([1., 2., 3., 4., 5.], name='x') + result = idx * 1.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx / 2. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + result = idx / 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.Series]) + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) def test_ufunc_multiple_return_values(self, holder): - obj = holder([1, 2, 3], name='x') + obj = holder([1, 2, 3], name="x") box = pd.Series if holder is pd.Series else pd.Index result = np.modf(obj) assert isinstance(result, tuple) - exp1 = pd.Float64Index([0., 0., 0.], name='x') - exp2 = pd.Float64Index([1., 2., 3.], name='x') + exp1 = pd.Float64Index([0.0, 0.0, 0.0], name="x") + exp2 = pd.Float64Index([1.0, 2.0, 3.0], name="x") tm.assert_equal(result[0], tm.box_expected(exp1, box)) tm.assert_equal(result[1], tm.box_expected(exp2, box)) def test_ufunc_at(self): - s = pd.Series([0, 1, 2], index=[1, 2, 3], name='x') + s = pd.Series([0, 1, 2], index=[1, 2, 3], name="x") np.add.at(s, [0, 2], 10) - expected = pd.Series([10, 1, 12], index=[1, 2, 3], name='x') + expected = pd.Series([10, 1, 12], index=[1, 2, 3], name="x") tm.assert_series_equal(s, expected) class TestObjectDtypeEquivalence: # Tests that arithmetic operations match operations executed elementwise - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_nan(self, dtype, box): ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) @@ -928,7 +974,7 @@ def test_numarr_with_dtype_add_nan(self, dtype, box): result = ser + np.nan tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_int(self, dtype, box): ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([2, 3, 4], dtype=dtype) @@ -943,14 +989,16 @@ def test_numarr_with_dtype_add_int(self, dtype, box): tm.assert_equal(result, expected) # TODO: moved from tests.series.test_operators; needs cleanup - @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul, - operator.truediv, operator.floordiv]) + @pytest.mark.parametrize( + "op", + [operator.add, operator.sub, operator.mul, operator.truediv, operator.floordiv], + ) def 
test_operators_reverse_object(self, op): # GH#56 arr = pd.Series(np.random.randn(10), index=np.arange(10), dtype=object) - result = op(1., arr) - expected = op(1., arr.astype(float)) + result = op(1.0, arr) + expected = op(1.0, arr.astype(float)) tm.assert_series_equal(result.astype(float), expected) @@ -971,11 +1019,20 @@ def check_binop(self, ops, scalars, idxs): tm.assert_index_equal(result, expected) def test_binops(self): - ops = [operator.add, operator.sub, operator.mul, operator.floordiv, - operator.truediv] + ops = [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + ] scalars = [-1, 1, 2] - idxs = [pd.RangeIndex(0, 10, 1), pd.RangeIndex(0, 20, 2), - pd.RangeIndex(-10, 10, 2), pd.RangeIndex(5, -5, -1)] + idxs = [ + pd.RangeIndex(0, 10, 1), + pd.RangeIndex(0, 20, 2), + pd.RangeIndex(-10, 10, 2), + pd.RangeIndex(5, -5, -1), + ] self.check_binop(ops, scalars, idxs) def test_binops_pow(self): @@ -987,9 +1044,17 @@ def test_binops_pow(self): self.check_binop(ops, scalars, idxs) # TODO: mod, divmod? - @pytest.mark.parametrize('op', [operator.add, operator.sub, - operator.mul, operator.floordiv, - operator.truediv, operator.pow]) + @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + operator.pow, + ], + ) def test_arithmetic_with_frame_or_series(self, op): # check that we return NotImplemented when operating with Series # or DataFrame @@ -1024,7 +1089,7 @@ def test_numeric_compat2(self): tm.assert_index_equal(result, expected, exact=True) result = idx / 2 - expected = pd.RangeIndex(0, 5, 1).astype('float64') + expected = pd.RangeIndex(0, 5, 1).astype("float64") tm.assert_index_equal(result, expected, exact=True) result = idx / 4 @@ -1050,19 +1115,22 @@ def test_numeric_compat2(self): cases_exact = [ (pd.RangeIndex(0, 1000, 2), 2, pd.RangeIndex(0, 500, 1)), (pd.RangeIndex(-99, -201, -3), -3, pd.RangeIndex(33, 67, 1)), - (pd.RangeIndex(0, 1000, 1), 2, - pd.RangeIndex(0, 1000, 1)._int64index // 2), - (pd.RangeIndex(0, 100, 1), 2.0, - pd.RangeIndex(0, 100, 1)._int64index // 2.0), + (pd.RangeIndex(0, 1000, 1), 2, pd.RangeIndex(0, 1000, 1)._int64index // 2), + ( + pd.RangeIndex(0, 100, 1), + 2.0, + pd.RangeIndex(0, 100, 1)._int64index // 2.0, + ), (pd.RangeIndex(0), 50, pd.RangeIndex(0)), (pd.RangeIndex(2, 4, 2), 3, pd.RangeIndex(0, 1, 1)), (pd.RangeIndex(-5, -10, -6), 4, pd.RangeIndex(-2, -1, 1)), - (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0))] + (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0)), + ] for idx, div, expected in cases_exact: tm.assert_index_equal(idx // div, expected, exact=True) - @pytest.mark.parametrize('dtype', [np.int64, np.float64]) - @pytest.mark.parametrize('delta', [1, 0, -1]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("delta", [1, 0, -1]) def test_addsub_arithmetic(self, dtype, delta): # GH#8142 delta = dtype(delta) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index dd931939ddf51e..f7f6ba8b114e75 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -17,12 +17,11 @@ class TestObjectComparisons: - def test_comparison_object_numeric_nas(self): ser = Series(np.random.randn(10), dtype=object) shifted = ser.shift(2) - ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + ops = ["lt", "le", "gt", "ge", "eq", "ne"] for op in ops: func = getattr(operator, op) @@ -31,24 +30,24 @@ def test_comparison_object_numeric_nas(self): 
tm.assert_series_equal(result, expected) def test_object_comparisons(self): - ser = Series(['a', 'b', np.nan, 'c', 'a']) + ser = Series(["a", "b", np.nan, "c", "a"]) - result = ser == 'a' + result = ser == "a" expected = Series([True, False, False, False, True]) tm.assert_series_equal(result, expected) - result = ser < 'a' + result = ser < "a" expected = Series([False, False, False, False, False]) tm.assert_series_equal(result, expected) - result = ser != 'a' - expected = -(ser == 'a') + result = ser != "a" + expected = -(ser == "a") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_more_na_comparisons(self, dtype): - left = Series(['a', np.nan, 'c'], dtype=dtype) - right = Series(['a', np.nan, 'd'], dtype=dtype) + left = Series(["a", np.nan, "c"], dtype=dtype) + right = Series(["a", np.nan, "d"], dtype=dtype) result = left == right expected = Series([True, False, False]) @@ -70,6 +69,7 @@ def test_more_na_comparisons(self, dtype): # ------------------------------------------------------------------ # Arithmetic + class TestArithmetic: # TODO: parametrize @@ -94,7 +94,7 @@ def test_add_extension_scalar(self, other, box, op): # Check that scalars satisfying is_extension_array_dtype(obj) # do not incorrectly try to dispatch to an ExtensionArray operation - arr = pd.Series(['a', 'b', 'c']) + arr = pd.Series(["a", "b", "c"]) expected = pd.Series([op(x, other) for x in arr]) arr = tm.box_expected(arr, box) @@ -103,60 +103,73 @@ def test_add_extension_scalar(self, other, box, op): result = op(arr, other) tm.assert_equal(result, expected) - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_objarr_add_str(self, box): - ser = pd.Series(['x', np.nan, 'x']) - expected = pd.Series(['xa', np.nan, 'xa']) + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["xa", np.nan, "xa"]) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - result = ser + 'a' + result = ser + "a" tm.assert_equal(result, expected) - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_objarr_radd_str(self, box): - ser = pd.Series(['x', np.nan, 'x']) - expected = pd.Series(['ax', np.nan, 'ax']) + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["ax", np.nan, "ax"]) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - result = 'a' + ser + result = "a" + ser tm.assert_equal(result, expected) - @pytest.mark.parametrize('data', [ - [1, 2, 3], - [1.1, 2.2, 3.3], - [Timestamp('2011-01-01'), Timestamp('2011-01-02'), pd.NaT], - ['x', 'y', 1]]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT], + ["x", "y", 1], + ], + 
) + @pytest.mark.parametrize("dtype", [None, object]) def test_objarr_radd_str_invalid(self, dtype, data, box): ser = Series(data, dtype=dtype) ser = tm.box_expected(ser, box) with pytest.raises(TypeError): - 'foo_' + ser + "foo_" + ser - @pytest.mark.parametrize('op', [operator.add, ops.radd, - operator.sub, ops.rsub]) + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) def test_objarr_add_invalid(self, op, box): # invalid ops obj_ser = tm.makeObjectSeries() - obj_ser.name = 'objects' + obj_ser.name = "objects" obj_ser = tm.box_expected(obj_ser, box) with pytest.raises(Exception): @@ -166,70 +179,86 @@ def test_objarr_add_invalid(self, op, box): # TODO: Moved from tests.series.test_operators; needs cleanup def test_operators_na_handling(self): - ser = Series(['foo', 'bar', 'baz', np.nan]) - result = 'prefix_' + ser - expected = pd.Series(['prefix_foo', 'prefix_bar', - 'prefix_baz', np.nan]) + ser = Series(["foo", "bar", "baz", np.nan]) + result = "prefix_" + ser + expected = pd.Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) tm.assert_series_equal(result, expected) - result = ser + '_suffix' - expected = pd.Series(['foo_suffix', 'bar_suffix', - 'baz_suffix', np.nan]) + result = ser + "_suffix" + expected = pd.Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) tm.assert_series_equal(result, expected) # TODO: parametrize over box - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_series_with_dtype_radd_timedelta(self, dtype): # note this test is _not_ aimed at timedelta64-dtyped Series - ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days')], dtype=dtype) - expected = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), - pd.Timedelta('6 days')]) - - result = pd.Timedelta('3 days') + ser + ser = pd.Series( + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + dtype=dtype, + ) + expected = pd.Series( + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + ) + + result = pd.Timedelta("3 days") + ser tm.assert_series_equal(result, expected) - result = ser + pd.Timedelta('3 days') + result = ser + pd.Timedelta("3 days") tm.assert_series_equal(result, expected) # TODO: cleanup & parametrize over box def test_mixed_timezone_series_ops_object(self): # GH#13043 - ser = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], - name='xxx') + ser = pd.Series( + [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-01", tz="Asia/Tokyo"), + ], + name="xxx", + ) assert ser.dtype == object - exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], - name='xxx') - tm.assert_series_equal(ser + pd.Timedelta('1 days'), exp) - tm.assert_series_equal(pd.Timedelta('1 days') + ser, exp) + exp = pd.Series( + [ + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="Asia/Tokyo"), + ], + name="xxx", + ) + tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp) + tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) # object series & object series - ser2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), - pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], - name='xxx') + ser2 = pd.Series( + [ + pd.Timestamp("2015-01-03", tz="US/Eastern"), + pd.Timestamp("2015-01-05", tz="Asia/Tokyo"), + ], + name="xxx", + ) assert ser2.dtype == object - exp = pd.Series([pd.Timedelta('2 
days'), pd.Timedelta('4 days')], - name='xxx') + exp = pd.Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) - ser = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], - name='xxx', dtype=object) + ser = pd.Series( + [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")], + name="xxx", + dtype=object, + ) assert ser.dtype == object - exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], - name='xxx') - tm.assert_series_equal(ser + pd.Timedelta('00:30:00'), exp) - tm.assert_series_equal(pd.Timedelta('00:30:00') + ser, exp) + exp = pd.Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx" + ) + tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) + tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) # TODO: cleanup & parametrize over box def test_iadd_preserves_name(self): # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name ser = pd.Series([1, 2, 3]) - ser.index.name = 'foo' + ser.index.name = "foo" ser.index += 1 assert ser.index.name == "foo" @@ -239,19 +268,19 @@ def test_iadd_preserves_name(self): def test_add_string(self): # from bug report - index = pd.Index(['a', 'b', 'c']) - index2 = index + 'foo' + index = pd.Index(["a", "b", "c"]) + index2 = index + "foo" - assert 'a' not in index2 - assert 'afoo' in index2 + assert "a" not in index2 + assert "afoo" in index2 def test_iadd_string(self): - index = pd.Index(['a', 'b', 'c']) + index = pd.Index(["a", "b", "c"]) # doesn't fail test unless there is a check before `+=` - assert 'a' in index + assert "a" in index - index += '_x' - assert 'a_x' in index + index += "_x" + assert "a_x" in index def test_add(self): index = tm.makeStringIndex(100) @@ -261,16 +290,16 @@ def test_add(self): tm.assert_index_equal(index.tolist() + index, expected) # test add and radd - index = pd.Index(list('abc')) - expected = pd.Index(['a1', 'b1', 'c1']) - tm.assert_index_equal(index + '1', expected) - expected = pd.Index(['1a', '1b', '1c']) - tm.assert_index_equal('1' + index, expected) + index = pd.Index(list("abc")) + expected = pd.Index(["a1", "b1", "c1"]) + tm.assert_index_equal(index + "1", expected) + expected = pd.Index(["1a", "1b", "1c"]) + tm.assert_index_equal("1" + index, expected) def test_sub_fail(self): index = tm.makeStringIndex(100) with pytest.raises(TypeError): - index - 'a' + index - "a" with pytest.raises(TypeError): index - index with pytest.raises(TypeError): @@ -290,10 +319,10 @@ def test_sub_object(self): tm.assert_index_equal(result, expected) with pytest.raises(TypeError): - index - 'foo' + index - "foo" with pytest.raises(TypeError): - index - np.array([2, 'foo']) + index - np.array([2, "foo"]) def test_rsub_object(self): # GH#19369 @@ -307,7 +336,7 @@ def test_rsub_object(self): tm.assert_index_equal(result, expected) with pytest.raises(TypeError): - 'foo' - index + "foo" - index with pytest.raises(TypeError): np.array([True, pd.Timestamp.now()]) - index diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 413d58d9429e7e..bd21335a7f9c76 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -29,7 +29,7 @@ def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - pi = pd.period_range('2000', periods=4) + pi = pd.period_range("2000", periods=4) other = 
np.array(pi.to_numpy()[0]) pi = tm.box_expected(pi, box_with_array) @@ -44,14 +44,14 @@ class TestPeriodIndexComparisons: @pytest.mark.parametrize("other", ["2017", 2017]) def test_eq(self, other): - idx = PeriodIndex(['2017', '2017', '2018'], freq="D") + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") expected = np.array([True, True, False]) result = idx == other tm.assert_numpy_array_equal(result, expected) def test_pi_cmp_period(self): - idx = period_range('2007-01', periods=20, freq='M') + idx = period_range("2007-01", periods=20, freq="M") result = idx < idx[10] exp = idx.values < idx.values[10] @@ -61,9 +61,9 @@ def test_pi_cmp_period(self): def test_parr_cmp_period_scalar2(self, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - pi = pd.period_range('2000-01-01', periods=10, freq='D') + pi = pd.period_range("2000-01-01", periods=10, freq="D") - val = Period('2000-01-04', freq='D') + val = Period("2000-01-04", freq="D") expected = [x > val for x in pi] ser = tm.box_expected(pi, box_with_array) @@ -77,15 +77,14 @@ def test_parr_cmp_period_scalar2(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_period_scalar(self, freq, box_with_array): # GH#13200 xbox = np.ndarray if box_with_array is pd.Index else box_with_array - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) - per = Period('2011-02', freq=freq) + per = Period("2011-02", freq=freq) exp = np.array([False, True, False, False]) exp = tm.box_expected(exp, xbox) @@ -117,18 +116,16 @@ def test_parr_cmp_period_scalar(self, freq, box_with_array): tm.assert_equal(base <= per, exp) tm.assert_equal(per >= base, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi(self, freq, box_with_array): # GH#13200 xbox = np.ndarray if box_with_array is pd.Index else box_with_array - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) # TODO: could also box idx? 
- idx = PeriodIndex(['2011-02', '2011-01', '2011-03', '2011-05'], - freq=freq) + idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq) exp = np.array([False, False, True, False]) exp = tm.box_expected(exp, xbox) @@ -154,25 +151,25 @@ def test_parr_cmp_pi(self, freq, box_with_array): exp = tm.box_expected(exp, xbox) tm.assert_equal(base <= idx, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # GH#13200 # different base freq - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) msg = "Input has different freq=A-DEC from " with pytest.raises(IncompatibleFrequency, match=msg): - base <= Period('2011', freq='A') + base <= Period("2011", freq="A") with pytest.raises(IncompatibleFrequency, match=msg): - Period('2011', freq='A') >= base + Period("2011", freq="A") >= base # TODO: Could parametrize over boxes for idx? - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') - rev_msg = (r'Input has different freq=(M|2M|3M) from ' - r'PeriodArray\(freq=A-DEC\)') + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") + rev_msg = ( + r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=A-DEC\)" + ) idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -180,41 +177,40 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # Different frequency msg = "Input has different freq=4M from " with pytest.raises(IncompatibleFrequency, match=msg): - base <= Period('2011', freq='4M') + base <= Period("2011", freq="4M") with pytest.raises(IncompatibleFrequency, match=msg): - Period('2011', freq='4M') >= base + Period("2011", freq="4M") >= base - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') - rev_msg = (r'Input has different freq=(M|2M|3M) from ' - r'PeriodArray\(freq=4M\)') + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") + rev_msg = r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=4M\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_pi_cmp_nat(self, freq): - idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) - result = idx1 > Period('2011-02', freq=freq) + result = idx1 > Period("2011-02", freq=freq) exp = np.array([False, False, False, True]) tm.assert_numpy_array_equal(result, exp) - result = Period('2011-02', freq=freq) < idx1 + result = Period("2011-02", freq=freq) < idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 == Period('NaT', freq=freq) + result = idx1 == Period("NaT", freq=freq) exp = np.array([False, False, False, False]) tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) == idx1 + result = Period("NaT", freq=freq) == idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 != Period('NaT', freq=freq) + result = idx1 != Period("NaT", freq=freq) exp = np.array([True, True, True, True]) tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) != idx1 + result = Period("NaT", 
freq=freq) != idx1 tm.assert_numpy_array_equal(result, exp) - idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq) + idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq) result = idx1 < idx2 exp = np.array([True, False, False, False]) tm.assert_numpy_array_equal(result, exp) @@ -235,11 +231,11 @@ def test_pi_cmp_nat(self, freq): exp = np.array([False, False, True, False]) tm.assert_numpy_array_equal(result, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_pi_cmp_nat_mismatched_freq_raises(self, freq): - idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) - diff = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M') + diff = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq="4M") msg = "Input has different freq=4M from Period(Array|Index)" with pytest.raises(IncompatibleFrequency, match=msg): idx1 > diff @@ -248,11 +244,12 @@ def test_pi_cmp_nat_mismatched_freq_raises(self, freq): idx1 == diff # TODO: De-duplicate with test_pi_cmp_nat - @pytest.mark.parametrize('dtype', [object, None]) + @pytest.mark.parametrize("dtype", [object, None]) def test_comp_nat(self, dtype): - left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, - pd.Period('2011-01-03')]) - right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + left = pd.PeriodIndex( + [pd.Period("2011-01-01"), pd.NaT, pd.Period("2011-01-03")] + ) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period("2011-01-03")]) if dtype is not None: left = left.astype(dtype) @@ -282,15 +279,23 @@ def test_comp_nat(self, dtype): class TestPeriodSeriesComparisons: def test_cmp_series_period_series_mixed_freq(self): # GH#13200 - base = Series([Period('2011', freq='A'), - Period('2011-02', freq='M'), - Period('2013', freq='A'), - Period('2011-04', freq='M')]) - - ser = Series([Period('2012', freq='A'), - Period('2011-01', freq='M'), - Period('2013', freq='A'), - Period('2011-05', freq='M')]) + base = Series( + [ + Period("2011", freq="A"), + Period("2011-02", freq="M"), + Period("2013", freq="A"), + Period("2011-04", freq="M"), + ] + ) + + ser = Series( + [ + Period("2012", freq="A"), + Period("2011-01", freq="M"), + Period("2013", freq="A"), + Period("2011-05", freq="M"), + ] + ) exp = Series([False, False, True, False]) tm.assert_series_equal(base == ser, exp) @@ -313,6 +318,7 @@ def test_cmp_series_period_series_mixed_freq(self): class TestPeriodIndexSeriesComparisonConsistency: """ Test PeriodIndex and Period Series Ops consistency """ + # TODO: needs parametrization+de-duplication def _check(self, values, func, expected): @@ -332,41 +338,43 @@ def _check(self, values, func, expected): tm.assert_series_equal(result, exp) def test_pi_comp_period(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - f = lambda x: x == pd.Period('2011-03', freq='M') + f = lambda x: x == pd.Period("2011-03", freq="M") exp = np.array([False, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') == x + f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) - f = lambda x: x != pd.Period('2011-03', freq='M') + f = lambda x: x != pd.Period("2011-03", freq="M") exp = np.array([True, True, False, True], dtype=np.bool) self._check(idx, f, 
exp) - f = lambda x: pd.Period('2011-03', freq='M') != x + f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, True, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: x > pd.Period('2011-03', freq='M') + f = lambda x: x > pd.Period("2011-03", freq="M") exp = np.array([False, False, False, True], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, True, True, False], dtype=np.bool) self._check(idx, f, exp) def test_pi_comp_period_nat(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) - f = lambda x: x == pd.Period('2011-03', freq='M') + f = lambda x: x == pd.Period("2011-03", freq="M") exp = np.array([False, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') == x + f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) f = lambda x: x == pd.NaT @@ -375,10 +383,10 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT == x self._check(idx, f, exp) - f = lambda x: x != pd.Period('2011-03', freq='M') + f = lambda x: x != pd.Period("2011-03", freq="M") exp = np.array([True, True, False, True], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') != x + f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) f = lambda x: x != pd.NaT @@ -387,11 +395,11 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT != x self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: x < pd.Period('2011-03', freq='M') + f = lambda x: x < pd.Period("2011-03", freq="M") exp = np.array([True, False, False, False], dtype=np.bool) self._check(idx, f, exp) @@ -407,34 +415,46 @@ def test_pi_comp_period_nat(self): # ------------------------------------------------------------------ # Arithmetic -class TestPeriodFrameArithmetic: +class TestPeriodFrameArithmetic: def test_ops_frame_period(self): # GH#13043 - df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), - pd.Period('2015-02', freq='M')], - 'B': [pd.Period('2014-01', freq='M'), - pd.Period('2014-02', freq='M')]}) - assert df['A'].dtype == 'Period[M]' - assert df['B'].dtype == 'Period[M]' - - p = pd.Period('2015-03', freq='M') + df = pd.DataFrame( + { + "A": [pd.Period("2015-01", freq="M"), pd.Period("2015-02", freq="M")], + "B": [pd.Period("2014-01", freq="M"), pd.Period("2014-02", freq="M")], + } + ) + assert df["A"].dtype == "Period[M]" + assert df["B"].dtype == "Period[M]" + + p = pd.Period("2015-03", freq="M") off = p.freq # dtype will be object because of original dtype - exp = pd.DataFrame({'A': np.array([2 * off, 1 * off], dtype=object), - 'B': np.array([14 * off, 13 * off], dtype=object)}) + exp = pd.DataFrame( + { + "A": np.array([2 * off, 1 * off], dtype=object), + "B": np.array([14 * off, 13 * off], dtype=object), + } + ) tm.assert_frame_equal(p - df, exp) tm.assert_frame_equal(df - p, -1 * exp) - df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), - pd.Period('2015-06', freq='M')], - 'B': [pd.Period('2015-05', freq='M'), - 
pd.Period('2015-06', freq='M')]}) - assert df2['A'].dtype == 'Period[M]' - assert df2['B'].dtype == 'Period[M]' - - exp = pd.DataFrame({'A': np.array([4 * off, 4 * off], dtype=object), - 'B': np.array([16 * off, 16 * off], dtype=object)}) + df2 = pd.DataFrame( + { + "A": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + "B": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + } + ) + assert df2["A"].dtype == "Period[M]" + assert df2["B"].dtype == "Period[M]" + + exp = pd.DataFrame( + { + "A": np.array([4 * off, 4 * off], dtype=object), + "B": np.array([16 * off, 16 * off], dtype=object), + } + ) tm.assert_frame_equal(df2 - df, exp) tm.assert_frame_equal(df - df2, -1 * exp) @@ -447,8 +467,8 @@ class TestPeriodIndexArithmetic: # and PeriodIndex (with matching freq) def test_parr_add_iadd_parr_raises(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) # TODO: parametrize over boxes for other? rng = tm.box_expected(rng, box_with_array) @@ -467,8 +487,8 @@ def test_pi_sub_isub_pi(self): # For historical reference see GH#14164, GH#13077. # PeriodIndex subtraction originally performed set difference, # then changed to raise TypeError before being implemented in GH#20049 - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) off = rng.freq expected = pd.Index([-5 * off] * 5) @@ -479,7 +499,7 @@ def test_pi_sub_isub_pi(self): tm.assert_index_equal(rng, expected) def test_pi_sub_pi_with_nat(self): - rng = pd.period_range('1/1/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) other = rng[1:].insert(0, pd.NaT) assert other[1:].equals(rng[1:]) @@ -489,68 +509,77 @@ def test_pi_sub_pi_with_nat(self): tm.assert_index_equal(result, expected) def test_parr_sub_pi_mismatched_freq(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='H', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="H", periods=5) # TODO: parametrize over boxes for other? 
rng = tm.box_expected(rng, box_with_array) with pytest.raises(IncompatibleFrequency): rng - other - @pytest.mark.parametrize('n', [1, 2, 3, 4]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) def test_sub_n_gt_1_ticks(self, tick_classes, n): # GH 23878 - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" p1 = pd.PeriodIndex([p1_d], freq=tick_classes(n)) p2 = pd.PeriodIndex([p2_d], freq=tick_classes(n)) - expected = (pd.PeriodIndex([p2_d], freq=p2.freq.base) - - pd.PeriodIndex([p1_d], freq=p1.freq.base)) + expected = pd.PeriodIndex([p2_d], freq=p2.freq.base) - pd.PeriodIndex( + [p1_d], freq=p1.freq.base + ) tm.assert_index_equal((p2 - p1), expected) - @pytest.mark.parametrize('n', [1, 2, 3, 4]) - @pytest.mark.parametrize('offset, kwd_name', [ - (pd.offsets.YearEnd, 'month'), - (pd.offsets.QuarterEnd, 'startingMonth'), - (pd.offsets.MonthEnd, None), - (pd.offsets.Week, 'weekday') - ]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): # GH 23878 kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" freq = offset(n, normalize=False, **kwds) p1 = pd.PeriodIndex([p1_d], freq=freq) p2 = pd.PeriodIndex([p2_d], freq=freq) result = p2 - p1 - expected = (pd.PeriodIndex([p2_d], freq=freq.base) - - pd.PeriodIndex([p1_d], freq=freq.base)) + expected = pd.PeriodIndex([p2_d], freq=freq.base) - pd.PeriodIndex( + [p1_d], freq=freq.base + ) tm.assert_index_equal(result, expected) # ------------------------------------------------------------- # Invalid Operations - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) - @pytest.mark.parametrize('op', [operator.add, ops.radd, - operator.sub, ops.rsub]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) def test_parr_add_sub_float_raises(self, op, other, box_with_array): - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - pi = dti.to_period('D') + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + pi = dti.to_period("D") pi = tm.box_expected(pi, box_with_array) with pytest.raises(TypeError): op(pi, other) - @pytest.mark.parametrize('other', [pd.Timestamp.now(), - pd.Timestamp.now().to_pydatetime(), - pd.Timestamp.now().to_datetime64()]) + @pytest.mark.parametrize( + "other", + [ + pd.Timestamp.now(), + pd.Timestamp.now().to_pydatetime(), + pd.Timestamp.now().to_datetime64(), + ], + ) def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # GH#23215 - rng = pd.period_range('1/1/2000', freq='D', periods=3) + rng = pd.period_range("1/1/2000", freq="D", periods=3) rng = tm.box_expected(rng, box_with_array) with pytest.raises(TypeError): @@ -566,8 +595,8 @@ def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] def test_parr_add_sub_dt64_array_raises(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=3) - dti = pd.date_range('2016-01-01', periods=3) + rng = pd.period_range("1/1/2000", freq="D", periods=3) + dti = pd.date_range("2016-01-01", periods=3) dtarr = dti.values rng = tm.box_expected(rng, box_with_array) @@ -583,8 +612,8 @@ def 
test_parr_add_sub_dt64_array_raises(self, box_with_array): dtarr - rng def test_pi_add_sub_td64_array_non_tick_raises(self): - rng = pd.period_range('1/1/2000', freq='Q', periods=3) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + rng = pd.period_range("1/1/2000", freq="Q", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values with pytest.raises(IncompatibleFrequency): @@ -600,11 +629,11 @@ def test_pi_add_sub_td64_array_non_tick_raises(self): def test_pi_add_sub_td64_array_tick(self): # PeriodIndex + Timedelta-like is allowed only with # tick-like frequencies - rng = pd.period_range('1/1/2000', freq='90D', periods=3) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + rng = pd.period_range("1/1/2000", freq="90D", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values - expected = pd.period_range('12/31/1999', freq='90D', periods=3) + expected = pd.period_range("12/31/1999", freq="90D", periods=3) result = rng + tdi tm.assert_index_equal(result, expected) result = rng + tdarr @@ -614,7 +643,7 @@ def test_pi_add_sub_td64_array_tick(self): result = tdarr + rng tm.assert_index_equal(result, expected) - expected = pd.period_range('1/2/2000', freq='90D', periods=3) + expected = pd.period_range("1/2/2000", freq="90D", periods=3) result = rng - tdi tm.assert_index_equal(result, expected) @@ -630,13 +659,17 @@ def test_pi_add_sub_td64_array_tick(self): # ----------------------------------------------------------------- # operations with array/Index of DateOffset objects - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_pi_add_offset_array(self, box): # GH#18849 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - offs = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) - expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + offs = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + expected = pd.PeriodIndex([pd.Period("2015Q2"), pd.Period("2015Q4")]) with tm.assert_produces_warning(PerformanceWarning): res = pi + offs @@ -646,8 +679,7 @@ def test_pi_add_offset_array(self, box): res2 = offs + pi tm.assert_index_equal(res2, expected) - unanchored = np.array([pd.offsets.Hour(n=1), - pd.offsets.Minute(n=-2)]) + unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) # addition/subtraction ops with incompatible offsets should issue # a PerformanceWarning and _then_ raise a TypeError. 
with pytest.raises(IncompatibleFrequency): @@ -657,12 +689,16 @@ def test_pi_add_offset_array(self, box): with tm.assert_produces_warning(PerformanceWarning): unanchored + pi - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_pi_sub_offset_array(self, box): # GH#18824 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - other = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + other = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) @@ -683,9 +719,9 @@ def test_pi_sub_offset_array(self, box): def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) result = rng + one - expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) + expected = pd.period_range("2000-01-01 10:00", freq="H", periods=10) tm.assert_index_equal(result, expected) rng += one tm.assert_index_equal(rng, expected) @@ -695,16 +731,16 @@ def test_pi_sub_isub_int(self, one): PeriodIndex.__sub__ and __isub__ with several representations of the integer 1, e.g. int, np.int64, np.uint8, ... """ - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) result = rng - one - expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + expected = pd.period_range("2000-01-01 08:00", freq="H", periods=10) tm.assert_index_equal(result, expected) rng -= one tm.assert_index_equal(rng, expected) - @pytest.mark.parametrize('five', [5, np.array(5, dtype=np.int64)]) + @pytest.mark.parametrize("five", [5, np.array(5, dtype=np.int64)]) def test_pi_sub_intlike(self, five): - rng = period_range('2007-01', periods=50) + rng = period_range("2007-01", periods=50) result = rng - five exp = rng + (-five) @@ -713,16 +749,16 @@ def test_pi_sub_intlike(self, five): def test_pi_sub_isub_offset(self): # offset # DateOffset - rng = pd.period_range('2014', '2024', freq='A') + rng = pd.period_range("2014", "2024", freq="A") result = rng - pd.offsets.YearEnd(5) - expected = pd.period_range('2009', '2019', freq='A') + expected = pd.period_range("2009", "2019", freq="A") tm.assert_index_equal(result, expected) rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) - rng = pd.period_range('2014-01', '2016-12', freq='M') + rng = pd.period_range("2014-01", "2016-12", freq="M") result = rng - pd.offsets.MonthEnd(5) - expected = pd.period_range('2013-08', '2016-07', freq='M') + expected = pd.period_range("2013-08", "2016-07", freq="M") tm.assert_index_equal(result, expected) rng -= pd.offsets.MonthEnd(5) @@ -733,10 +769,10 @@ def test_pi_add_offset_n_gt1(self, box_transpose_fail): # add offset to PeriodIndex with freq.n > 1 box, transpose = box_transpose_fail - per = pd.Period('2016-01', freq='2M') + per = pd.Period("2016-01", freq="2M") pi = pd.PeriodIndex([per]) - expected = pd.PeriodIndex(['2016-03'], freq='2M') + expected = pd.PeriodIndex(["2016-03"], freq="2M") pi = tm.box_expected(pi, box, transpose=transpose) expected = tm.box_expected(expected, box, transpose=transpose) @@ -750,41 +786,41 @@ def test_pi_add_offset_n_gt1(self, box_transpose_fail): def 
test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): # GH#23215 # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0 - pi = pd.PeriodIndex(['2016-01'], freq='2M') - expected = pd.PeriodIndex(['2016-04'], freq='2M') + pi = pd.PeriodIndex(["2016-01"], freq="2M") + expected = pd.PeriodIndex(["2016-04"], freq="2M") # FIXME: with transposing these tests fail pi = tm.box_expected(pi, box_with_array, transpose=False) expected = tm.box_expected(expected, box_with_array, transpose=False) - result = pi + to_offset('3M') + result = pi + to_offset("3M") tm.assert_equal(result, expected) - result = to_offset('3M') + pi + result = to_offset("3M") + pi tm.assert_equal(result, expected) # --------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_pi_add_intarray(self, int_holder, op): # GH#19959 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) other = int_holder([4, -1]) result = op(pi, other) - expected = pd.PeriodIndex([pd.Period('2016Q1'), pd.Period('NaT')]) + expected = pd.PeriodIndex([pd.Period("2016Q1"), pd.Period("NaT")]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_pi_sub_intarray(self, int_holder): # GH#19959 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) other = int_holder([4, -1]) result = pi - other - expected = pd.PeriodIndex([pd.Period('2014Q1'), pd.Period('NaT')]) + expected = pd.PeriodIndex([pd.Period("2014Q1"), pd.Period("NaT")]) tm.assert_index_equal(result, expected) with pytest.raises(TypeError): @@ -800,10 +836,9 @@ def test_pi_add_timedeltalike_minute_gt1(self, three_days): # in test_pi_add_timedeltalike_tick_gt1, but here we write out the # expected result more explicitly. 
other = three_days - rng = pd.period_range('2014-05-01', periods=3, freq='2D') + rng = pd.period_range("2014-05-01", periods=3, freq="2D") - expected = pd.PeriodIndex(['2014-05-04', '2014-05-06', '2014-05-08'], - freq='2D') + expected = pd.PeriodIndex(["2014-05-04", "2014-05-06", "2014-05-08"], freq="2D") result = rng + other tm.assert_index_equal(result, expected) @@ -812,21 +847,19 @@ def test_pi_add_timedeltalike_minute_gt1(self, three_days): tm.assert_index_equal(result, expected) # subtraction - expected = pd.PeriodIndex(['2014-04-28', '2014-04-30', '2014-05-02'], - freq='2D') + expected = pd.PeriodIndex(["2014-04-28", "2014-04-30", "2014-05-02"], freq="2D") result = rng - other tm.assert_index_equal(result, expected) with pytest.raises(TypeError): other - rng - @pytest.mark.parametrize('freqstr', ['5ns', '5us', '5ms', - '5s', '5T', '5h', '5d']) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5T", "5h", "5d"]) def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 other = three_days - rng = pd.period_range('2014-05-01', periods=6, freq=freqstr) + rng = pd.period_range("2014-05-01", periods=6, freq=freqstr) expected = pd.period_range(rng[0] + other, periods=6, freq=freqstr) @@ -847,8 +880,8 @@ def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr): def test_pi_add_iadd_timedeltalike_daily(self, three_days): # Tick other = three_days - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = pd.period_range("2014-05-04", "2014-05-18", freq="D") result = rng + other tm.assert_index_equal(result, expected) @@ -859,8 +892,8 @@ def test_pi_add_iadd_timedeltalike_daily(self, three_days): def test_pi_sub_isub_timedeltalike_daily(self, three_days): # Tick-like 3 Days other = three_days - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = pd.period_range("2014-04-28", "2014-05-12", freq="D") result = rng - other tm.assert_index_equal(result, expected) @@ -870,8 +903,8 @@ def test_pi_sub_isub_timedeltalike_daily(self, three_days): def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily): other = not_daily - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=D\\)' + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + msg = "Input has different freq(=.+)? 
from Period.*?\\(freq=D\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -883,9 +916,8 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily): def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): other = two_hours - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', - freq='H') + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="H") result = rng + other tm.assert_index_equal(result, expected) @@ -895,8 +927,8 @@ def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): other = not_hourly - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=H\\)' + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=H\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other @@ -906,9 +938,8 @@ def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): other = two_hours - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', - freq='H') + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") result = rng - other tm.assert_index_equal(result, expected) @@ -919,19 +950,17 @@ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): def test_add_iadd_timedeltalike_annual(self): # offset # DateOffset - rng = pd.period_range('2014', '2024', freq='A') + rng = pd.period_range("2014", "2024", freq="A") result = rng + pd.offsets.YearEnd(5) - expected = pd.period_range('2019', '2029', freq='A') + expected = pd.period_range("2019", "2029", freq="A") tm.assert_index_equal(result, expected) rng += pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, - mismatched_freq): + def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq - rng = pd.period_range('2014', '2024', freq='A') - msg = ('Input has different freq(=.+)? ' - 'from Period.*?\\(freq=A-DEC\\)') + rng = pd.period_range("2014", "2024", freq="A") + msg = "Input has different freq(=.+)? 
" "from Period.*?\\(freq=A-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -942,8 +971,8 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, rng -= other def test_pi_add_iadd_timedeltalike_M(self): - rng = pd.period_range('2014-01', '2016-12', freq='M') - expected = pd.period_range('2014-06', '2017-05', freq='M') + rng = pd.period_range("2014-01", "2016-12", freq="M") + expected = pd.period_range("2014-06", "2017-05", freq="M") result = rng + pd.offsets.MonthEnd(5) tm.assert_index_equal(result, expected) @@ -951,11 +980,10 @@ def test_pi_add_iadd_timedeltalike_M(self): rng += pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, - mismatched_freq): + def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): other = mismatched_freq - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=M\\)' + rng = pd.period_range("2014-01", "2016-12", freq="M") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=M\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -989,17 +1017,21 @@ def test_parr_add_sub_td64_nat(self, box_transpose_fail): class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): # GH#13043 - ser = pd.Series([pd.Period('2015-01-01', freq='D'), - pd.Period('2015-01-02', freq='D')], name='xxx') - assert ser.dtype == 'Period[D]' + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" - expected = pd.Series([pd.Period('2015-01-02', freq='D'), - pd.Period('2015-01-03', freq='D')], name='xxx') + expected = pd.Series( + [pd.Period("2015-01-02", freq="D"), pd.Period("2015-01-03", freq="D")], + name="xxx", + ) - result = ser + pd.Timedelta('1 days') + result = ser + pd.Timedelta("1 days") tm.assert_series_equal(result, expected) - result = pd.Timedelta('1 days') + ser + result = pd.Timedelta("1 days") + ser tm.assert_series_equal(result, expected) result = ser + pd.tseries.offsets.Day() @@ -1010,22 +1042,26 @@ def test_ops_series_timedelta(self): def test_ops_series_period(self): # GH#13043 - ser = pd.Series([pd.Period('2015-01-01', freq='D'), - pd.Period('2015-01-02', freq='D')], name='xxx') + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) assert ser.dtype == "Period[D]" - per = pd.Period('2015-01-10', freq='D') + per = pd.Period("2015-01-10", freq="D") off = per.freq # dtype will be object because of original dtype - expected = pd.Series([9 * off, 8 * off], name='xxx', dtype=object) + expected = pd.Series([9 * off, 8 * off], name="xxx", dtype=object) tm.assert_series_equal(per - ser, expected) tm.assert_series_equal(ser - per, -1 * expected) - s2 = pd.Series([pd.Period('2015-01-05', freq='D'), - pd.Period('2015-01-04', freq='D')], name='xxx') + s2 = pd.Series( + [pd.Period("2015-01-05", freq="D"), pd.Period("2015-01-04", freq="D")], + name="xxx", + ) assert s2.dtype == "Period[D]" - expected = pd.Series([4 * off, 2 * off], name='xxx', dtype=object) + expected = pd.Series([4 * off, 2 * off], name="xxx", dtype=object) tm.assert_series_equal(s2 - ser, expected) tm.assert_series_equal(ser - s2, -1 * expected) @@ -1045,52 +1081,62 @@ def _check(self, values, func, expected): tm.assert_series_equal(result, 
exp) def test_pi_ops(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - expected = PeriodIndex(['2011-03', '2011-04', '2011-05', '2011-06'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-03", "2011-04", "2011-05", "2011-06"], freq="M", name="idx" + ) self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) self._check(idx + 2, lambda x: x - 2, idx) - result = idx - Period('2011-01', freq='M') + result = idx - Period("2011-01", freq="M") off = idx.freq - exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name='idx') + exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name="idx") tm.assert_index_equal(result, exp) - result = Period('2011-01', freq='M') - idx - exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name='idx') + result = Period("2011-01", freq="M") - idx + exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name="idx") tm.assert_index_equal(result, exp) @pytest.mark.parametrize("ng", ["str", 1.5]) - @pytest.mark.parametrize("func", [ - lambda obj, ng: obj + ng, - lambda obj, ng: ng + obj, - lambda obj, ng: obj - ng, - lambda obj, ng: ng - obj, - lambda obj, ng: np.add(obj, ng), - lambda obj, ng: np.add(ng, obj), - lambda obj, ng: np.subtract(obj, ng), - lambda obj, ng: np.subtract(ng, obj), - ]) + @pytest.mark.parametrize( + "func", + [ + lambda obj, ng: obj + ng, + lambda obj, ng: ng + obj, + lambda obj, ng: obj - ng, + lambda obj, ng: ng - obj, + lambda obj, ng: np.add(obj, ng), + lambda obj, ng: np.add(ng, obj), + lambda obj, ng: np.subtract(obj, ng), + lambda obj, ng: np.subtract(ng, obj), + ], + ) def test_parr_ops_errors(self, ng, func, box_with_array): - idx = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], - freq="M", name="idx") + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) obj = tm.box_expected(idx, box_with_array) - msg = (r"unsupported operand type\(s\)|can only concatenate|" - r"must be str|object to str implicitly") + msg = ( + r"unsupported operand type\(s\)|can only concatenate|" + r"must be str|object to str implicitly" + ) with pytest.raises(TypeError, match=msg): func(obj, ng) def test_pi_ops_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') - expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + expected = PeriodIndex( + ["2011-03", "2011-04", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) @@ -1100,10 +1146,12 @@ def test_pi_ops_nat(self): self._check(idx + 2, lambda x: np.subtract(x, 2), idx) # freq with mult - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='2M', name='idx') - expected = PeriodIndex(['2011-07', '2011-08', 'NaT', '2011-10'], - freq='2M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="2M", name="idx" + ) + expected = PeriodIndex( + ["2011-07", "2011-08", "NaT", "2011-10"], freq="2M", name="idx" + ) self._check(idx, lambda x: x + 3, expected) self._check(idx, lambda x: 3 + x, expected) @@ -1114,49 +1162,69 @@ def test_pi_ops_nat(self): def test_pi_ops_array_int(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + 
["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) f = lambda x: x + np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], - freq='M', name='idx') + exp = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: np.add(x, np.array([4, -1, 1, 2])) - exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], - freq='M', name='idx') + exp = PeriodIndex( + ["2011-05", "2011-01", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: x - np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], - freq='M', name='idx') + exp = PeriodIndex( + ["2010-12", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) - exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], - freq='M', name='idx') + exp = PeriodIndex( + ["2010-10", "2010-12", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, f, exp) def test_pi_ops_offset(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) f = lambda x: x + pd.offsets.Day() - exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', - '2011-04-02'], freq='D', name='idx') + exp = PeriodIndex( + ["2011-01-02", "2011-02-02", "2011-03-02", "2011-04-02"], + freq="D", + name="idx", + ) self._check(idx, f, exp) f = lambda x: x + pd.offsets.Day(2) - exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', - '2011-04-03'], freq='D', name='idx') + exp = PeriodIndex( + ["2011-01-03", "2011-02-03", "2011-03-03", "2011-04-03"], + freq="D", + name="idx", + ) self._check(idx, f, exp) f = lambda x: x - pd.offsets.Day(2) - exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', - '2011-03-30'], freq='D', name='idx') + exp = PeriodIndex( + ["2010-12-30", "2011-01-30", "2011-02-27", "2011-03-30"], + freq="D", + name="idx", + ) self._check(idx, f, exp) def test_pi_offset_errors(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) ser = pd.Series(idx) # Series op is applied per Period instance, thus error is raised @@ -1175,50 +1243,53 @@ def test_pi_offset_errors(self): def test_pi_sub_period(self): # GH#13071 - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - result = idx - pd.Period('2012-01', freq='M') + result = idx - pd.Period("2012-01", freq="M") off = idx.freq - exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name='idx') + exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name="idx") tm.assert_index_equal(result, exp) - result = np.subtract(idx, pd.Period('2012-01', freq='M')) + result = np.subtract(idx, pd.Period("2012-01", freq="M")) tm.assert_index_equal(result, exp) - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name='idx') + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name="idx") tm.assert_index_equal(result, exp) - result = np.subtract(pd.Period('2012-01', freq='M'), idx) + result = 
np.subtract(pd.Period("2012-01", freq="M"), idx) tm.assert_index_equal(result, exp) - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) def test_pi_sub_pdnat(self): # GH#13071 - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') - exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + exp = pd.TimedeltaIndex([pd.NaT] * 4, name="idx") tm.assert_index_equal(pd.NaT - idx, exp) tm.assert_index_equal(idx - pd.NaT, exp) def test_pi_sub_period_nat(self): # GH#13071 - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) - result = idx - pd.Period('2012-01', freq='M') + result = idx - pd.Period("2012-01", freq="M") off = idx.freq - exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name='idx') + exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name="idx") tm.assert_index_equal(result, exp) - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name='idx') + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name="idx") tm.assert_index_equal(result, exp) - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 0ae325cfce7877..06c4a6ece4bcce 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -5,13 +5,19 @@ import numpy as np import pytest -from pandas.errors import ( - NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning) +from pandas.errors import NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, NaT, Series, Timedelta, TimedeltaIndex, - Timestamp, timedelta_range) + DataFrame, + DatetimeIndex, + NaT, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + timedelta_range, +) import pandas.util.testing as tm @@ -31,6 +37,7 @@ def get_upcast_box(box, vector): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons + class TestTimedelta64ArrayLikeComparisons: # Comparison tests for timedelta64[ns] vectors fully parametrized over # DataFrame/Series/TimedeltaIndex/TimedeltaArray. 
Ideally all comparison @@ -41,7 +48,7 @@ def test_compare_timedelta64_zerodim(self, box_with_array): box = box_with_array xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = pd.timedelta_range('2H', periods=4) + tdi = pd.timedelta_range("2H", periods=4) other = np.array(tdi.to_numpy()[0]) tdi = tm.box_expected(tdi, box) @@ -68,10 +75,10 @@ def test_compare_timedelta_series(self): def test_tdi_cmp_str_invalid(self, box_with_array): # GH#13624 xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = TimedeltaIndex(['1 day', '2 days']) + tdi = TimedeltaIndex(["1 day", "2 days"]) tdarr = tm.box_expected(tdi, box_with_array) - for left, right in [(tdarr, 'a'), ('a', tdarr)]: + for left, right in [(tdarr, "a"), ("a", tdarr)]: with pytest.raises(TypeError): left > right with pytest.raises(TypeError): @@ -91,11 +98,12 @@ def test_tdi_cmp_str_invalid(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_comp_nat(self, dtype): - left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, - pd.Timedelta('3 days')]) - right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + left = pd.TimedeltaIndex( + [pd.Timedelta("1 days"), pd.NaT, pd.Timedelta("3 days")] + ) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta("3 days")]) lhs, rhs = left, right if dtype is object: @@ -122,15 +130,29 @@ def test_comp_nat(self, dtype): tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_comparisons_nat(self): - tdidx1 = pd.TimedeltaIndex(['1 day', pd.NaT, '1 day 00:00:01', pd.NaT, - '1 day 00:00:01', '5 day 00:00:03']) - tdidx2 = pd.TimedeltaIndex(['2 day', '2 day', pd.NaT, pd.NaT, - '1 day 00:00:02', '5 days 00:00:03']) - tdarr = np.array([np.timedelta64(2, 'D'), - np.timedelta64(2, 'D'), np.timedelta64('nat'), - np.timedelta64('nat'), - np.timedelta64(1, 'D') + np.timedelta64(2, 's'), - np.timedelta64(5, 'D') + np.timedelta64(3, 's')]) + tdidx1 = pd.TimedeltaIndex( + [ + "1 day", + pd.NaT, + "1 day 00:00:01", + pd.NaT, + "1 day 00:00:01", + "5 day 00:00:03", + ] + ) + tdidx2 = pd.TimedeltaIndex( + ["2 day", "2 day", pd.NaT, pd.NaT, "1 day 00:00:02", "5 days 00:00:03"] + ) + tdarr = np.array( + [ + np.timedelta64(2, "D"), + np.timedelta64(2, "D"), + np.timedelta64("nat"), + np.timedelta64("nat"), + np.timedelta64(1, "D") + np.timedelta64(2, "s"), + np.timedelta64(5, "D") + np.timedelta64(3, "s"), + ] + ) cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] @@ -163,7 +185,7 @@ def test_comparisons_nat(self): # TODO: better name def test_comparisons_coverage(self): - rng = timedelta_range('1 days', periods=10) + rng = timedelta_range("1 days", periods=10) result = rng < rng[3] expected = np.array([True, True, True] + [False] * 7) @@ -181,53 +203,49 @@ def test_comparisons_coverage(self): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Arithmetic Operations + class TestTimedelta64ArithmeticUnsorted: # Tests moved from type-specific test files but not # yet sorted/parametrized/de-duplicated def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") for result in [idx * 2, np.multiply(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['4H', '8H', 
'12H', '16H', '20H'], - freq='4H', name='x') + exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '4H' + assert result.freq == "4H" for result in [idx / 2, np.divide(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'], - freq='H', name='x') + exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") tm.assert_index_equal(result, exp) - assert result.freq == 'H' + assert result.freq == "H" - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") for result in [-idx, np.negative(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'], - freq='-2H', name='x') + exp = TimedeltaIndex( + ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == '-2H' + assert result.freq == "-2H" - idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'], - freq='H', name='x') + idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") for result in [abs(idx), np.absolute(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'], - freq=None, name='x') + exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") tm.assert_index_equal(result, exp) assert result.freq is None def test_subtraction_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") msg = "cannot subtract a datelike from a TimedeltaArray" with pytest.raises(TypeError, match=msg): @@ -235,8 +253,10 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): tdi - dti - msg = (r"descriptor '__sub__' requires a 'datetime\.datetime' object" - " but received a 'Timedelta'") + msg = ( + r"descriptor '__sub__' requires a 'datetime\.datetime' object" + " but received a 'Timedelta'" + ) with pytest.raises(TypeError, match=msg): td - dt @@ -245,41 +265,40 @@ def test_subtraction_ops(self): td - dti result = dt - dti - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"], name="bar") tm.assert_index_equal(result, expected) result = dti - dt - expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') + expected = TimedeltaIndex(["0 days", "1 days", "2 days"], name="bar") tm.assert_index_equal(result, expected) result = tdi - td - expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "1 days"], name="foo") tm.assert_index_equal(result, expected, check_names=False) result = td - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "-1 days"], name="foo") tm.assert_index_equal(result, expected, check_names=False) result = dti - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], name='bar') + expected = DatetimeIndex(["20121231", "20130101", "20130102"], name="bar") tm.assert_index_equal(result, expected, check_names=False) 
result = dt - tdi - expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') + expected = DatetimeIndex(["20121231", pd.NaT, "20121230"], name="foo") tm.assert_index_equal(result, expected) def test_subtraction_ops_with_tz(self): # check that dt/dti subtraction ops with tz are validated - dti = pd.date_range('20130101', periods=3) - ts = Timestamp('20130101') + dti = pd.date_range("20130101", periods=3) + ts = Timestamp("20130101") dt = ts.to_pydatetime() - dti_tz = pd.date_range('20130101', periods=3).tz_localize('US/Eastern') - ts_tz = Timestamp('20130101').tz_localize('US/Eastern') - ts_tz2 = Timestamp('20130101').tz_localize('CET') + dti_tz = pd.date_range("20130101", periods=3).tz_localize("US/Eastern") + ts_tz = Timestamp("20130101").tz_localize("US/Eastern") + ts_tz2 = Timestamp("20130101").tz_localize("CET") dt_tz = ts_tz.to_pydatetime() - td = Timedelta('1 days') + td = Timedelta("1 days") def _check(result, expected): assert result == expected @@ -287,34 +306,31 @@ def _check(result, expected): # scalars result = ts - ts - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = dt_tz - ts_tz - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = ts_tz - dt_tz - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) # tz mismatches - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt_tz - dt - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts_tz2 msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt - dt_tz - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): ts - dt_tz with pytest.raises(TypeError, match=msg): @@ -333,72 +349,71 @@ def _check(result, expected): dti_tz - ts_tz2 result = dti_tz - dt_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) tm.assert_index_equal(result, expected) result = dt_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) tm.assert_index_equal(result, expected) result = dti_tz - ts_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) tm.assert_index_equal(result, expected) result = ts_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) tm.assert_index_equal(result, expected) result = td - td - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = dti_tz - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], tz='US/Eastern') + expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern") tm.assert_index_equal(result, expected) def test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops 
- tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") # TODO(wesm): unused? # td = Timedelta('1 days') # dt = Timestamp('20130101') result = tdi - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "0 days"], name="foo") tm.assert_index_equal(result, expected) result = tdi + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "4 days"], name="foo") tm.assert_index_equal(result, expected) result = dti - tdi # name will be reset - expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) + expected = DatetimeIndex(["20121231", pd.NaT, "20130101"]) tm.assert_index_equal(result, expected) def test_addition_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") result = tdi + dt - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") tm.assert_index_equal(result, expected) result = dt + tdi - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") tm.assert_index_equal(result, expected) result = td + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") tm.assert_index_equal(result, expected) result = tdi + td - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") tm.assert_index_equal(result, expected) # unequal length @@ -416,32 +431,32 @@ def test_addition_ops(self): # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) result = tdi + dti # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) tm.assert_index_equal(result, expected) result = dti + tdi # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) tm.assert_index_equal(result, expected) result = dt + td - expected = Timestamp('20130102') + expected = Timestamp("20130102") assert result == expected result = td + dt - expected = Timestamp('20130102') + expected = Timestamp("20130102") assert result == expected # TODO: Needs more informative name, probably split up into # more targeted tests - @pytest.mark.parametrize('freq', ['D', 'B']) + @pytest.mark.parametrize("freq", ["D", "B"]) def test_timedelta(self, freq): - index = pd.date_range('1/1/2000', periods=50, freq=freq) + index = pd.date_range("1/1/2000", periods=50, freq=freq) shifted = index + timedelta(1) back = shifted + timedelta(-1) tm.assert_index_equal(index, back) - if freq == 'D': + if freq == "D": expected = pd.tseries.offsets.Day(1) assert index.freq == expected assert shifted.freq == expected @@ -456,7 +471,7 @@ def test_timedelta(self, freq): tm.assert_index_equal(result, 
expected) # GH#4134, buggy with timedeltas - rng = pd.date_range('2013', '2014') + rng = pd.date_range("2013", "2014") s = Series(rng) result1 = rng - pd.offsets.Hour(1) result2 = DatetimeIndex(s - np.timedelta64(100000000)) @@ -471,19 +486,23 @@ class TestAddSubNaTMasking: def test_tdi_add_timestamp_nat_masking(self): # GH#17991 checking for overflow-masking with NaT - tdinat = pd.to_timedelta(['24658 days 11:15:00', 'NaT']) - - tsneg = Timestamp('1950-01-01') - ts_neg_variants = [tsneg, - tsneg.to_pydatetime(), - tsneg.to_datetime64().astype('datetime64[ns]'), - tsneg.to_datetime64().astype('datetime64[D]')] - - tspos = Timestamp('1980-01-01') - ts_pos_variants = [tspos, - tspos.to_pydatetime(), - tspos.to_datetime64().astype('datetime64[ns]'), - tspos.to_datetime64().astype('datetime64[D]')] + tdinat = pd.to_timedelta(["24658 days 11:15:00", "NaT"]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] for variant in ts_neg_variants + ts_pos_variants: res = tdinat + variant @@ -493,36 +512,39 @@ def test_tdi_add_overflow(self): # See GH#14068 # preliminary test scalar analogue of vectorized tests below with pytest.raises(OutOfBoundsDatetime): - pd.to_timedelta(106580, 'D') + Timestamp('2000') + pd.to_timedelta(106580, "D") + Timestamp("2000") with pytest.raises(OutOfBoundsDatetime): - Timestamp('2000') + pd.to_timedelta(106580, 'D') + Timestamp("2000") + pd.to_timedelta(106580, "D") _NaT = int(pd.NaT) + 1 msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): - pd.to_timedelta([106580], 'D') + Timestamp('2000') + pd.to_timedelta([106580], "D") + Timestamp("2000") with pytest.raises(OverflowError, match=msg): - Timestamp('2000') + pd.to_timedelta([106580], 'D') + Timestamp("2000") + pd.to_timedelta([106580], "D") with pytest.raises(OverflowError, match=msg): - pd.to_timedelta([_NaT]) - Timedelta('1 days') + pd.to_timedelta([_NaT]) - Timedelta("1 days") with pytest.raises(OverflowError, match=msg): - pd.to_timedelta(['5 days', _NaT]) - Timedelta('1 days') + pd.to_timedelta(["5 days", _NaT]) - Timedelta("1 days") with pytest.raises(OverflowError, match=msg): - (pd.to_timedelta([_NaT, '5 days', '1 hours']) - - pd.to_timedelta(['7 seconds', _NaT, '4 hours'])) + ( + pd.to_timedelta([_NaT, "5 days", "1 hours"]) + - pd.to_timedelta(["7 seconds", _NaT, "4 hours"]) + ) # These should not overflow! 
exp = TimedeltaIndex([pd.NaT]) - result = pd.to_timedelta([pd.NaT]) - Timedelta('1 days') + result = pd.to_timedelta([pd.NaT]) - Timedelta("1 days") tm.assert_index_equal(result, exp) - exp = TimedeltaIndex(['4 days', pd.NaT]) - result = pd.to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + exp = TimedeltaIndex(["4 days", pd.NaT]) + result = pd.to_timedelta(["5 days", pd.NaT]) - Timedelta("1 days") tm.assert_index_equal(result, exp) - exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) - result = (pd.to_timedelta([pd.NaT, '5 days', '1 hours']) + - pd.to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + exp = TimedeltaIndex([pd.NaT, pd.NaT, "5 hours"]) + result = pd.to_timedelta([pd.NaT, "5 days", "1 hours"]) + pd.to_timedelta( + ["7 seconds", pd.NaT, "4 hours"] + ) tm.assert_index_equal(result, exp) @@ -533,7 +555,7 @@ class TestTimedeltaArraylikeAddSubOps: def test_td64_df_add_int_frame(self): # GH#22696 Check that we don't dispatch to numpy implementation, # which treats int64 as m8[ns] - tdi = pd.timedelta_range('1', periods=3) + tdi = pd.timedelta_range("1", periods=3) df = tdi.to_frame() other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df` with pytest.raises(TypeError): @@ -549,21 +571,21 @@ def test_td64_df_add_int_frame(self): # parametrization+de-duplication def test_timedelta_ops_with_missing_values(self): # setup - s1 = pd.to_timedelta(Series(['00:00:01'])) - s2 = pd.to_timedelta(Series(['00:00:02'])) + s1 = pd.to_timedelta(Series(["00:00:01"])) + s2 = pd.to_timedelta(Series(["00:00:02"])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # Passing datetime64-dtype data to TimedeltaIndex is deprecated sn = pd.to_timedelta(Series([pd.NaT])) - df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta) - df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta) + df1 = pd.DataFrame(["00:00:01"]).apply(pd.to_timedelta) + df2 = pd.DataFrame(["00:00:02"]).apply(pd.to_timedelta) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # Passing datetime64-dtype data to TimedeltaIndex is deprecated dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) - scalar1 = pd.to_timedelta('00:00:01') - scalar2 = pd.to_timedelta('00:00:02') - timedelta_NaT = pd.to_timedelta('NaT') + scalar1 = pd.to_timedelta("00:00:01") + scalar2 = pd.to_timedelta("00:00:02") + timedelta_NaT = pd.to_timedelta("NaT") actual = scalar1 + scalar1 assert actual == scalar2 @@ -645,67 +667,64 @@ def test_timedelta_ops_with_missing_values(self): # de-duplication, box-parametrization... 
def test_operators_timedelta64(self): # series ops - v1 = pd.date_range('2012-1-1', periods=3, freq='D') - v2 = pd.date_range('2012-1-2', periods=3, freq='D') + v1 = pd.date_range("2012-1-1", periods=3, freq="D") + v2 = pd.date_range("2012-1-2", periods=3, freq="D") rs = Series(v2) - Series(v1) - xp = Series(1e9 * 3600 * 24, - rs.index).astype('int64').astype('timedelta64[ns]') + xp = Series(1e9 * 3600 * 24, rs.index).astype("int64").astype("timedelta64[ns]") tm.assert_series_equal(rs, xp) - assert rs.dtype == 'timedelta64[ns]' + assert rs.dtype == "timedelta64[ns]" df = DataFrame(dict(A=v1)) td = Series([timedelta(days=i) for i in range(3)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # series on the rhs - result = df['A'] - df['A'].shift() - assert result.dtype == 'timedelta64[ns]' + result = df["A"] - df["A"].shift() + assert result.dtype == "timedelta64[ns]" - result = df['A'] + td - assert result.dtype == 'M8[ns]' + result = df["A"] + td + assert result.dtype == "M8[ns]" # scalar Timestamp on rhs - maxa = df['A'].max() + maxa = df["A"].max() assert isinstance(maxa, Timestamp) - resultb = df['A'] - df['A'].max() - assert resultb.dtype == 'timedelta64[ns]' + resultb = df["A"] - df["A"].max() + assert resultb.dtype == "timedelta64[ns]" # timestamp on lhs - result = resultb + df['A'] - values = [Timestamp('20111230'), Timestamp('20120101'), - Timestamp('20120103')] - expected = Series(values, name='A') + result = resultb + df["A"] + values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + expected = Series(values, name="A") tm.assert_series_equal(result, expected) # datetimes on rhs - result = df['A'] - datetime(2001, 1, 1) - expected = Series( - [timedelta(days=4017 + i) for i in range(3)], name='A') + result = df["A"] - datetime(2001, 1, 1) + expected = Series([timedelta(days=4017 + i) for i in range(3)], name="A") tm.assert_series_equal(result, expected) - assert result.dtype == 'm8[ns]' + assert result.dtype == "m8[ns]" d = datetime(2001, 1, 1, 3, 4) - resulta = df['A'] - d - assert resulta.dtype == 'm8[ns]' + resulta = df["A"] - d + assert resulta.dtype == "m8[ns]" # roundtrip resultb = resulta + d - tm.assert_series_equal(df['A'], resultb) + tm.assert_series_equal(df["A"], resultb) # timedeltas on rhs td = timedelta(days=1) - resulta = df['A'] + td + resulta = df["A"] + td resultb = resulta - td - tm.assert_series_equal(resultb, df['A']) - assert resultb.dtype == 'M8[ns]' + tm.assert_series_equal(resultb, df["A"]) + assert resultb.dtype == "M8[ns]" # roundtrip td = timedelta(minutes=5, seconds=3) - resulta = df['A'] + td + resulta = df["A"] + td resultb = resulta - td - tm.assert_series_equal(df['A'], resultb) - assert resultb.dtype == 'M8[ns]' + tm.assert_series_equal(df["A"], resultb) + assert resultb.dtype == "M8[ns]" # inplace value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1)) @@ -714,100 +733,102 @@ def test_operators_timedelta64(self): def test_timedelta64_ops_nat(self): # GH 11349 - timedelta_series = Series([NaT, Timedelta('1s')]) - nat_series_dtype_timedelta = Series([NaT, NaT], - dtype='timedelta64[ns]') - single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]') + timedelta_series = Series([NaT, Timedelta("1s")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") # subtraction - tm.assert_series_equal(timedelta_series - NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(-NaT + 
timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series - NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(-NaT + timedelta_series, nat_series_dtype_timedelta) - tm.assert_series_equal(timedelta_series - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(-single_nat_dtype_timedelta + timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal( + timedelta_series - single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) # addition - tm.assert_series_equal(nat_series_dtype_timedelta + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(timedelta_series + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + timedelta_series, - nat_series_dtype_timedelta) - - tm.assert_series_equal(timedelta_series + single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + timedelta_series, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timedelta, - nat_series_dtype_timedelta) + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal(timedelta_series + NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(NaT + timedelta_series, nat_series_dtype_timedelta) + + tm.assert_series_equal( + timedelta_series + single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) # multiplication - tm.assert_series_equal(nat_series_dtype_timedelta * 1.0, - nat_series_dtype_timedelta) - tm.assert_series_equal(1.0 * nat_series_dtype_timedelta, - nat_series_dtype_timedelta) + tm.assert_series_equal( + nat_series_dtype_timedelta * 1.0, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + 1.0 * nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) tm.assert_series_equal(timedelta_series * 1, timedelta_series) tm.assert_series_equal(1 * timedelta_series, timedelta_series) - 
tm.assert_series_equal(timedelta_series * 1.5, - Series([NaT, Timedelta('1.5s')])) - tm.assert_series_equal(1.5 * timedelta_series, - Series([NaT, Timedelta('1.5s')])) + tm.assert_series_equal(timedelta_series * 1.5, Series([NaT, Timedelta("1.5s")])) + tm.assert_series_equal(1.5 * timedelta_series, Series([NaT, Timedelta("1.5s")])) - tm.assert_series_equal(timedelta_series * np.nan, - nat_series_dtype_timedelta) - tm.assert_series_equal(np.nan * timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series * np.nan, nat_series_dtype_timedelta) + tm.assert_series_equal(np.nan * timedelta_series, nat_series_dtype_timedelta) # division - tm.assert_series_equal(timedelta_series / 2, - Series([NaT, Timedelta('0.5s')])) - tm.assert_series_equal(timedelta_series / 2.0, - Series([NaT, Timedelta('0.5s')])) - tm.assert_series_equal(timedelta_series / np.nan, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series / 2, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta) # ------------------------------------------------------------- # Invalid Operations def test_td64arr_add_str_invalid(self, box_with_array): # GH#13624 - tdi = TimedeltaIndex(['1 day', '2 days']) + tdi = TimedeltaIndex(["1 day", "2 days"]) tdi = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): - tdi + 'a' + tdi + "a" with pytest.raises(TypeError): - 'a' + tdi + "a" + tdi - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) def test_td64arr_add_sub_float(self, box_with_array, other): - tdi = TimedeltaIndex(['-1 days', '-1 days']) + tdi = TimedeltaIndex(["-1 days", "-1 days"]) tdarr = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): @@ -819,12 +840,12 @@ def test_td64arr_add_sub_float(self, box_with_array, other): with pytest.raises(TypeError): other - tdarr - @pytest.mark.parametrize('freq', [None, 'H']) + @pytest.mark.parametrize("freq", [None, "H"]) def test_td64arr_sub_period(self, box_with_array, freq): # GH#13078 # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - idx = TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + p = pd.Period("2011-01-01", freq="D") + idx = TimedeltaIndex(["1 hours", "2 hours"], freq=freq) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): @@ -833,12 +854,12 @@ def test_td64arr_sub_period(self, box_with_array, freq): with pytest.raises(TypeError): p - idx - @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) - @pytest.mark.parametrize('tdi_freq', [None, 'H']) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("tdi_freq", [None, "H"]) def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq): # GH#20049 subtracting PeriodIndex should raise TypeError - tdi = TimedeltaIndex(['1 hours', '2 hours'], freq=tdi_freq) - dti = Timestamp('2018-03-07 17:16:40') + tdi + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi pi = dti.to_period(pi_freq) # TODO: parametrize over box for pi? 
@@ -850,24 +871,26 @@ def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq): # Binary operations td64 arraylike and datetime-like def test_td64arr_sub_timestamp_raises(self, box_with_array): - idx = TimedeltaIndex(['1 day', '2 day']) + idx = TimedeltaIndex(["1 day", "2 day"]) idx = tm.box_expected(idx, box_with_array) - msg = ("cannot subtract a datelike from|" - "Could not operate|" - "cannot perform operation") + msg = ( + "cannot subtract a datelike from|" + "Could not operate|" + "cannot perform operation" + ) with pytest.raises(TypeError, match=msg): - idx - Timestamp('2011-01-01') + idx - Timestamp("2011-01-01") def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): # GH#23215 # TODO: parametrize over scalar datetime types? tz = tz_naive_fixture - other = Timestamp('2011-01-01', tz=tz) + other = Timestamp("2011-01-01", tz=tz) - idx = TimedeltaIndex(['1 day', '2 day']) - expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz) + idx = TimedeltaIndex(["1 day", "2 day"]) + expected = DatetimeIndex(["2011-01-02", "2011-01-03"], tz=tz) idx = tm.box_expected(idx, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -880,11 +903,11 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): def test_td64arr_add_sub_timestamp(self, box_with_array): # GH#11925 - ts = Timestamp('2012-01-01') + ts = Timestamp("2012-01-01") # TODO: parametrize over types of datetime scalar? - tdi = timedelta_range('1 day', periods=3) - expected = pd.date_range('2012-01-02', periods=3) + tdi = timedelta_range("1 day", periods=3) + expected = pd.date_range("2012-01-02", periods=3) tdarr = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -892,7 +915,7 @@ def test_td64arr_add_sub_timestamp(self, box_with_array): tm.assert_equal(ts + tdarr, expected) tm.assert_equal(tdarr + ts, expected) - expected2 = pd.date_range('2011-12-31', periods=3, freq='-1D') + expected2 = pd.date_range("2011-12-31", periods=3, freq="-1D") expected2 = tm.box_expected(expected2, box_with_array) tm.assert_equal(ts - tdarr, expected2) @@ -902,7 +925,7 @@ def test_td64arr_add_sub_timestamp(self, box_with_array): tdarr - ts def test_tdi_sub_dt64_array(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) dtarr = dti.values expected = pd.DatetimeIndex(dtarr) - tdi @@ -918,7 +941,7 @@ def test_tdi_sub_dt64_array(self, box_with_array): tm.assert_equal(result, expected) def test_tdi_add_dt64_array(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) dtarr = dti.values expected = pd.DatetimeIndex(dtarr) + tdi @@ -933,9 +956,9 @@ def test_tdi_add_dt64_array(self, box_with_array): def test_td64arr_add_datetime64_nat(self, box_with_array): # GH#23215 - other = np.datetime64('NaT') + other = np.datetime64("NaT") - tdi = timedelta_range('1 day', periods=3) + tdi = timedelta_range("1 day", periods=3) expected = pd.DatetimeIndex(["NaT", "NaT", "NaT"]) tdser = tm.box_expected(tdi, box_with_array) @@ -948,7 +971,7 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): # Operations with int-like others def test_td64arr_add_int_series_invalid(self, box): - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else 
NullFrequencyError int_ser = Series([2, 3, 4]) @@ -964,14 +987,14 @@ def test_td64arr_add_int_series_invalid(self, box): def test_td64arr_add_intlike(self, box_with_array): # GH#19123 - tdi = TimedeltaIndex(['59 days', '59 days', 'NaT']) + tdi = TimedeltaIndex(["59 days", "59 days", "NaT"]) ser = tm.box_expected(tdi, box_with_array) err = TypeError if box_with_array in [pd.Index, tm.to_array]: err = NullFrequencyError - other = Series([20, 30, 40], dtype='uint8') + other = Series([20, 30, 40], dtype="uint8") # TODO: separate/parametrize with pytest.raises(err): @@ -994,12 +1017,11 @@ def test_td64arr_add_intlike(self, box_with_array): with pytest.raises(err): ser - pd.Index(other) - @pytest.mark.parametrize('scalar', [1, 1.5, np.array(2)]) - def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, - scalar): + @pytest.mark.parametrize("scalar", [1, 1.5, np.array(2)]) + def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): box = box_with_array - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError if box in [pd.Index, tm.to_array] and not isinstance(scalar, float): @@ -1014,20 +1036,36 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, with pytest.raises(err): scalar - tdser - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vec', [ - np.array([1, 2, 3]), - pd.Index([1, 2, 3]), - Series([1, 2, 3]) - # TODO: Add DataFrame in here? - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + @pytest.mark.parametrize( + "vec", + [ + np.array([1, 2, 3]), + pd.Index([1, 2, 3]), + Series([1, 2, 3]) + # TODO: Add DataFrame in here? 
+ ], + ids=lambda x: type(x).__name__, + ) def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError - if box is pd.Index and not dtype.startswith('float'): + if box is pd.Index and not dtype.startswith("float"): err = NullFrequencyError vector = vec.astype(dtype) @@ -1044,9 +1082,14 @@ def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): # Operations with timedelta-like others # TODO: this was taken from tests.series.test_ops; de-duplicate - @pytest.mark.parametrize('scalar_td', [timedelta(minutes=5, seconds=4), - Timedelta(minutes=5, seconds=4), - Timedelta('5m4s').to_timedelta64()]) + @pytest.mark.parametrize( + "scalar_td", + [ + timedelta(minutes=5, seconds=4), + Timedelta(minutes=5, seconds=4), + Timedelta("5m4s").to_timedelta64(), + ], + ) def test_operators_timedelta64_with_timedelta(self, scalar_td): # smoke tests td1 = Series([timedelta(minutes=5, seconds=3)] * 3) @@ -1065,14 +1108,16 @@ def test_timedelta64_operations_with_timedeltas(self): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td2 = timedelta(minutes=5, seconds=4) result = td1 - td2 - expected = (Series([timedelta(seconds=0)] * 3) - - Series([timedelta(seconds=1)] * 3)) - assert result.dtype == 'm8[ns]' + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" tm.assert_series_equal(result, expected) result2 = td2 - td1 - expected = (Series([timedelta(seconds=1)] * 3) - - Series([timedelta(seconds=0)] * 3)) + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) tm.assert_series_equal(result2, expected) # roundtrip @@ -1080,24 +1125,26 @@ def test_timedelta64_operations_with_timedeltas(self): # Now again, using pd.to_timedelta, which should build # a Series or a scalar, depending on input. - td1 = Series(pd.to_timedelta(['00:05:03'] * 3)) - td2 = pd.to_timedelta('00:05:04') + td1 = Series(pd.to_timedelta(["00:05:03"] * 3)) + td2 = pd.to_timedelta("00:05:04") result = td1 - td2 - expected = (Series([timedelta(seconds=0)] * 3) - - Series([timedelta(seconds=1)] * 3)) - assert result.dtype == 'm8[ns]' + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" tm.assert_series_equal(result, expected) result2 = td2 - td1 - expected = (Series([timedelta(seconds=1)] * 3) - - Series([timedelta(seconds=0)] * 3)) + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) tm.assert_series_equal(result2, expected) # roundtrip tm.assert_series_equal(result + td2, td1) def test_td64arr_add_td64_array(self, box): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1111,7 +1158,7 @@ def test_td64arr_add_td64_array(self, box): tm.assert_equal(result, expected) def test_td64arr_sub_td64_array(self, box): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1125,20 +1172,28 @@ def test_td64arr_sub_td64_array(self, box): tm.assert_equal(result, expected) # TODO: parametrize over [add, sub, radd, rsub]? 
- @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def test_td64arr_add_sub_tdi(self, box, names): # GH#17250 make sure result dtype is correct # GH#19043 make sure names are propagated correctly - if box is pd.DataFrame and names[1] == 'Venkman': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") + if box is pd.DataFrame and names[1] == "Venkman": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) - tdi = TimedeltaIndex(['0 days', '1 day'], name=names[0]) + tdi = TimedeltaIndex(["0 days", "1 day"], name=names[0]) ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) - expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], - name=names[2]) + expected = Series( + [Timedelta(hours=3), Timedelta(days=1, hours=4)], name=names[2] + ) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1146,38 +1201,39 @@ def test_td64arr_add_sub_tdi(self, box, names): result = tdi + ser tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" result = ser + tdi tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" - expected = Series([Timedelta(hours=-3), Timedelta(days=1, hours=-4)], - name=names[2]) + expected = Series( + [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=names[2] + ) expected = tm.box_expected(expected, box) result = tdi - ser tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" result = ser - tdi tm.assert_equal(result, -expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" def test_td64arr_add_sub_td64_nat(self, box): # GH#23320 special handling for timedelta64("NaT") - tdi = pd.TimedeltaIndex([NaT, Timedelta('1s')]) + tdi = pd.TimedeltaIndex([NaT, Timedelta("1s")]) other = np.timedelta64("NaT") expected = pd.TimedeltaIndex(["NaT"] * 2) @@ -1195,8 +1251,8 @@ def test_td64arr_add_sub_td64_nat(self, box): def test_td64arr_sub_NaT(self, box): # GH#18808 - ser = Series([NaT, Timedelta('1s')]) - expected = Series([NaT, NaT], dtype='timedelta64[ns]') + ser = Series([NaT, Timedelta("1s")]) + expected = Series([NaT, NaT], dtype="timedelta64[ns]") ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1206,9 +1262,8 @@ def test_td64arr_sub_NaT(self, box): def test_td64arr_add_timedeltalike(self, two_hours, box): # only test adding/sub offsets as + is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 
02:00:00", "10 days 02:00:00", freq="D") rng = tm.box_expected(rng, box) expected = tm.box_expected(expected, box) @@ -1217,8 +1272,8 @@ def test_td64arr_add_timedeltalike(self, two_hours, box): def test_td64arr_sub_timedeltalike(self, two_hours, box): # only test adding/sub offsets as - is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") rng = tm.box_expected(rng, box) expected = tm.box_expected(expected, box) @@ -1242,11 +1297,16 @@ def test_timedelta64_operations_with_DateOffset(self): tm.assert_series_equal(result, expected) with tm.assert_produces_warning(PerformanceWarning): - result = td + Series([pd.offsets.Minute(1), pd.offsets.Second(3), - pd.offsets.Hour(2)]) - expected = Series([timedelta(minutes=6, seconds=3), - timedelta(minutes=5, seconds=6), - timedelta(hours=2, minutes=5, seconds=3)]) + result = td + Series( + [pd.offsets.Minute(1), pd.offsets.Second(3), pd.offsets.Hour(2)] + ) + expected = Series( + [ + timedelta(minutes=6, seconds=3), + timedelta(minutes=5, seconds=6), + timedelta(hours=2, minutes=5, seconds=3), + ] + ) tm.assert_series_equal(result, expected) result = td + pd.offsets.Minute(1) + pd.offsets.Second(12) @@ -1254,30 +1314,30 @@ def test_timedelta64_operations_with_DateOffset(self): tm.assert_series_equal(result, expected) # valid DateOffsets - for do in ['Hour', 'Minute', 'Second', 'Day', 'Micro', 'Milli', - 'Nano']: + for do in ["Hour", "Minute", "Second", "Day", "Micro", "Milli", "Nano"]: op = getattr(pd.offsets, do) td + op(5) op(5) + td td - op(5) op(5) - td - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_add_offset_index(self, names, box): # GH#18849, GH#19744 - if box is pd.DataFrame and names[1] == 'bar': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") - - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) - - expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], - freq='infer', name=names[2]) + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=names[2] + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1296,11 +1356,12 @@ def test_td64arr_add_offset_index(self, names, box): # over second box? 
def test_td64arr_add_offset_array(self, box): # GH#18849 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) - expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], - freq='infer') + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer" + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1316,22 +1377,23 @@ def test_td64arr_add_offset_array(self, box): res2 = other + tdi tm.assert_equal(res2, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_sub_offset_index(self, names, box): # GH#18824, GH#19744 - if box is pd.DataFrame and names[1] == 'bar': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], - freq='infer', name=names[2]) + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=names[2] + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1345,11 +1407,12 @@ def test_td64arr_sub_offset_index(self, names, box): def test_td64arr_sub_offset_array(self, box_with_array): # GH#18824 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) - expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], - freq='infer') + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer" + ) tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1361,21 +1424,20 @@ def test_td64arr_sub_offset_array(self, box_with_array): res = tdi - other tm.assert_equal(res, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_with_offset_series(self, names, box_df_fail): # GH#18849 box = box_df_fail box2 = Series if box in [pd.Index, tm.to_array] else box - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], - name=names[2]) + expected_add = Series( + [tdi[n] + other[n] for n in range(len(tdi))], name=names[2] + ) tdi = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2) @@ -1388,19 +1450,19 @@ def 
test_td64arr_with_offset_series(self, names, box_df_fail): tm.assert_equal(res2, expected_add) # TODO: separate/parametrize add/sub test? - expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], - name=names[2]) + expected_sub = Series( + [tdi[n] - other[n] for n in range(len(tdi))], name=names[2] + ) expected_sub = tm.box_expected(expected_sub, box2) with tm.assert_produces_warning(PerformanceWarning): res3 = tdi - other tm.assert_equal(res3, expected_sub) - @pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series]) - def test_td64arr_addsub_anchored_offset_arraylike(self, obox, - box_with_array): + @pytest.mark.parametrize("obox", [np.array, pd.Index, pd.Series]) + def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # GH#18824 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) tdi = tm.box_expected(tdi, box_with_array) anchored = obox([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) @@ -1427,10 +1489,10 @@ class TestTimedeltaArraylikeMulDivOps: # TODO: Moved from tests.series.test_operators; needs cleanup @pytest.mark.parametrize("m", [1, 3, 10]) - @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_timedelta64_conversions(self, m, unit): - startdate = Series(pd.date_range('2013-01-01', '2013-01-03')) - enddate = Series(pd.date_range('2013-03-01', '2013-03-03')) + startdate = Series(pd.date_range("2013-01-01", "2013-01-03")) + enddate = Series(pd.date_range("2013-03-01", "2013-03-03")) ser = enddate - startdate ser[2] = np.nan @@ -1441,8 +1503,7 @@ def test_timedelta64_conversions(self, m, unit): tm.assert_series_equal(result, expected) # reverse op - expected = Series([Timedelta(np.timedelta64(m, unit)) / x - for x in ser]) + expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in ser]) result = np.timedelta64(m, unit) / ser tm.assert_series_equal(result, expected) @@ -1451,7 +1512,7 @@ def test_timedelta64_conversions(self, m, unit): # organized with scalar others first, then array-like def test_td64arr_mul_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx * 1 @@ -1461,24 +1522,24 @@ def test_td64arr_mul_int(self, box_with_array): tm.assert_equal(result, idx) def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): - rng = timedelta_range('1 days', '10 days', name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) with pytest.raises(TypeError): rng * two_hours def test_tdi_mul_int_array_zerodim(self, box_with_array): - rng5 = np.arange(5, dtype='int64') + rng5 = np.arange(5, dtype="int64") idx = TimedeltaIndex(rng5) expected = TimedeltaIndex(rng5 * 5) idx = tm.box_expected(idx, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = idx * np.array(5, dtype='int64') + result = idx * np.array(5, dtype="int64") tm.assert_equal(result, expected) def test_tdi_mul_int_array(self, box_with_array): - rng5 = np.arange(5, dtype='int64') + rng5 = np.arange(5, dtype="int64") idx = TimedeltaIndex(rng5) expected = TimedeltaIndex(rng5 ** 2) @@ -1492,23 +1553,23 @@ def test_tdi_mul_int_series(self, box_with_array): box = box_with_array xbox = pd.Series if box in [pd.Index, tm.to_array] else box - idx = TimedeltaIndex(np.arange(5, 
dtype='int64')) - expected = TimedeltaIndex(np.arange(5, dtype='int64') ** 2) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2) idx = tm.box_expected(idx, box) expected = tm.box_expected(expected, xbox) - result = idx * pd.Series(np.arange(5, dtype='int64')) + result = idx * pd.Series(np.arange(5, dtype="int64")) tm.assert_equal(result, expected) def test_tdi_mul_float_series(self, box_with_array): box = box_with_array xbox = pd.Series if box in [pd.Index, tm.to_array] else box - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box) - rng5f = np.arange(5, dtype='float64') + rng5f = np.arange(5, dtype="float64") expected = TimedeltaIndex(rng5f * (rng5f + 1.0)) expected = tm.box_expected(expected, xbox) @@ -1516,19 +1577,23 @@ def test_tdi_mul_float_series(self, box_with_array): tm.assert_equal(result, expected) # TODO: Put Series/DataFrame in others? - @pytest.mark.parametrize('other', [ - np.arange(1, 11), - pd.Int64Index(range(1, 11)), - pd.UInt64Index(range(1, 11)), - pd.Float64Index(range(1, 11)), - pd.RangeIndex(1, 11) - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [ + np.arange(1, 11), + pd.Int64Index(range(1, 11)), + pd.UInt64Index(range(1, 11)), + pd.Float64Index(range(1, 11)), + pd.RangeIndex(1, 11), + ], + ids=lambda x: type(x).__name__, + ) def test_tdi_rmul_arraylike(self, other, box_with_array): box = box_with_array xbox = get_upcast_box(box, other) - tdi = TimedeltaIndex(['1 Day'] * 10) - expected = timedelta_range('1 days', '10 days') + tdi = TimedeltaIndex(["1 Day"] * 10) + expected = timedelta_range("1 days", "10 days") expected._data.freq = None tdi = tm.box_expected(tdi, box) @@ -1544,21 +1609,20 @@ def test_tdi_rmul_arraylike(self, other, box_with_array): def test_td64arr_div_nat_invalid(self, box_with_array): # don't allow division by NaT (maybe could in the future) - rng = timedelta_range('1 days', '10 days', name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - with pytest.raises(TypeError, - match="'?true_divide'? cannot use operands"): + with pytest.raises(TypeError, match="'?true_divide'? 
cannot use operands"): rng / pd.NaT - with pytest.raises(TypeError, match='Cannot divide NaTType by'): + with pytest.raises(TypeError, match="Cannot divide NaTType by"): pd.NaT / rng def test_td64arr_div_td64nat(self, box_with_array): # GH#23829 - rng = timedelta_range('1 days', '10 days',) + rng = timedelta_range("1 days", "10 days") rng = tm.box_expected(rng, box_with_array) - other = np.timedelta64('NaT') + other = np.timedelta64("NaT") expected = np.array([np.nan] * 10) expected = tm.box_expected(expected, box_with_array) @@ -1570,20 +1634,20 @@ def test_td64arr_div_td64nat(self, box_with_array): tm.assert_equal(result, expected) def test_td64arr_div_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx / 1 tm.assert_equal(result, idx) - with pytest.raises(TypeError, match='Cannot divide'): + with pytest.raises(TypeError, match="Cannot divide"): # GH#23829 1 / idx def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): # GH#20088, GH#22163 ensure DataFrame returns correct dtype - rng = timedelta_range('1 days', '10 days', name='foo') - expected = pd.Float64Index((np.arange(10) + 1) * 12, name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Float64Index((np.arange(10) + 1) * 12, name="foo") rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1595,10 +1659,9 @@ def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): expected = 1 / expected tm.assert_equal(result, expected) - def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, - box_with_array): - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - expected = pd.Float64Index([12, np.nan, 24], name='foo') + def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + expected = pd.Float64Index([12, np.nan, 24], name="foo") rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1612,13 +1675,13 @@ def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, def test_td64arr_div_td64_ndarray(self, box_with_array): # GH#22631 - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days']) + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) expected = pd.Float64Index([12, np.nan, 24]) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) - other = np.array([2, 4, 2], dtype='m8[h]') + other = np.array([2, 4, 2], dtype="m8[h]") result = rng / other tm.assert_equal(result, expected) @@ -1646,7 +1709,7 @@ def test_td64arr_div_td64_ndarray(self, box_with_array): tm.assert_equal(result, expected) def test_tdarr_div_length_mismatch(self, box_with_array): - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days']) + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) mismatched = [1, 2, 3, 4] rng = tm.box_expected(rng, box_with_array) @@ -1687,8 +1750,7 @@ def test_td64arr_rfloordiv_tdscalar(self, box_with_array, scalar_td): result = scalar_td // td1 tm.assert_equal(result, expected) - def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, - scalar_td): + def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, scalar_td): # GH#18831 td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan @@ -1704,19 +1766,18 @@ def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, tm.assert_equal(result, 
expected) def test_td64arr_floordiv_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx // 1 tm.assert_equal(result, idx) - pattern = ('floor_divide cannot use operands|' - 'Cannot divide int by Timedelta*') + pattern = "floor_divide cannot use operands|" "Cannot divide int by Timedelta*" with pytest.raises(TypeError, match=pattern): 1 // idx def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): - tdi = timedelta_range('1 days', '10 days', name='foo') - expected = pd.Int64Index((np.arange(10) + 1) * 12, name='foo') + tdi = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Int64Index((np.arange(10) + 1) * 12, name="foo") tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1725,14 +1786,18 @@ def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): tm.assert_equal(result, expected) # TODO: Is this redundant with test_td64arr_floordiv_tdlike_scalar? - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=10, seconds=7), - Timedelta('10m7s'), - Timedelta('10m7s').to_timedelta64() - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "scalar_td", + [ + timedelta(minutes=10, seconds=7), + Timedelta("10m7s"), + Timedelta("10m7s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, + ) def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): # GH#19125 - tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None) + tdi = TimedeltaIndex(["00:05:03", "00:05:03", pd.NaT], freq=None) expected = pd.Index([2.0, 2.0, np.nan]) tdi = tm.box_expected(tdi, box_with_array, transpose=False) @@ -1753,10 +1818,10 @@ def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): # reversed ops def test_td64arr_mod_tdscalar(self, box_with_array, three_days): - tdi = timedelta_range('1 Day', '9 days') + tdi = timedelta_range("1 Day", "9 days") tdarr = tm.box_expected(tdi, box_with_array) - expected = TimedeltaIndex(['1 Day', '2 Days', '0 Days'] * 3) + expected = TimedeltaIndex(["1 Day", "2 Days", "0 Days"] * 3) expected = tm.box_expected(expected, box_with_array) result = tdarr % three_days @@ -1770,10 +1835,10 @@ def test_td64arr_mod_tdscalar(self, box_with_array, three_days): tm.assert_equal(result[0], tdarr // three_days) def test_td64arr_mod_int(self, box_with_array): - tdi = timedelta_range('1 ns', '10 ns', periods=10) + tdi = timedelta_range("1 ns", "10 ns", periods=10) tdarr = tm.box_expected(tdi, box_with_array) - expected = TimedeltaIndex(['1 ns', '0 ns'] * 5) + expected = TimedeltaIndex(["1 ns", "0 ns"] * 5) expected = tm.box_expected(expected, box_with_array) result = tdarr % 2 @@ -1790,10 +1855,10 @@ def test_td64arr_mod_int(self, box_with_array): tm.assert_equal(result[0], tdarr // 2) def test_td64arr_rmod_tdscalar(self, box_with_array, three_days): - tdi = timedelta_range('1 Day', '9 days') + tdi = timedelta_range("1 Day", "9 days") tdarr = tm.box_expected(tdi, box_with_array) - expected = ['0 Days', '1 Day', '0 Days'] + ['3 Days'] * 6 + expected = ["0 Days", "1 Day", "0 Days"] + ["3 Days"] * 6 expected = TimedeltaIndex(expected) expected = tm.box_expected(expected, box_with_array) @@ -1819,14 +1884,14 @@ def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td): # check that we are getting a TypeError # with 'operate' (from core/ops.py) for the ops that are not # defined - pattern = 
'operate|unsupported|cannot|not supported' + pattern = "operate|unsupported|cannot|not supported" with pytest.raises(TypeError, match=pattern): td1 * scalar_td with pytest.raises(TypeError, match=pattern): scalar_td * td1 def test_td64arr_mul_too_short_raises(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): idx * idx[:3] @@ -1834,7 +1899,7 @@ def test_td64arr_mul_too_short_raises(self, box_with_array): idx * np.array([1, 2]) def test_td64arr_mul_td64arr_raises(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): idx * idx @@ -1842,13 +1907,12 @@ def test_td64arr_mul_td64arr_raises(self, box_with_array): # ------------------------------------------------------------------ # Operations with numeric others - @pytest.mark.parametrize('one', [1, np.array(1), 1.0, np.array(1.0)]) + @pytest.mark.parametrize("one", [1, np.array(1), 1.0, np.array(1.0)]) def test_td64arr_mul_numeric_scalar(self, box_with_array, one): # GH#4521 # divide/multiply by integers - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') - expected = Series(['-59 Days', '-59 Days', 'NaT'], - dtype='timedelta64[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1858,8 +1922,7 @@ def test_td64arr_mul_numeric_scalar(self, box_with_array, one): result = (-one) * tdser tm.assert_equal(result, expected) - expected = Series(['118 Days', '118 Days', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["118 Days", "118 Days", "NaT"], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) result = tdser * (2 * one) @@ -1867,12 +1930,12 @@ def test_td64arr_mul_numeric_scalar(self, box_with_array, one): result = (2 * one) * tdser tm.assert_equal(result, expected) - @pytest.mark.parametrize('two', [2, 2.0, np.array(2), np.array(2.0)]) + @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)]) def test_td64arr_div_numeric_scalar(self, box_with_array, two): # GH#4521 # divide/multiply by integers - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') - expected = Series(['29.5D', '29.5D', 'NaT'], dtype='timedelta64[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1880,26 +1943,38 @@ def test_td64arr_div_numeric_scalar(self, box_with_array, two): result = tdser / two tm.assert_equal(result, expected) - with pytest.raises(TypeError, match='Cannot divide'): + with pytest.raises(TypeError, match="Cannot divide"): two / tdser - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + 
@pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(dtype) - expected = Series(['1180 Days', '1770 Days', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, xbox) @@ -1910,21 +1985,33 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): result = vector * tdser tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(dtype) - expected = Series(['2.95D', '1D 23H 12m', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, xbox) @@ -1932,11 +2019,13 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): result = tdser / vector tm.assert_equal(result, expected) - pattern = ('true_divide cannot use operands|' - 'cannot perform __div__|' - 'cannot perform __truediv__|' - 'unsupported operand|' - 'Cannot divide') + pattern = ( + "true_divide cannot use operands|" + "cannot perform __div__|" + "cannot perform __truediv__|" + "unsupported operand|" + "Cannot divide" + ) with pytest.raises(TypeError, match=pattern): vector / tdser @@ -1944,8 +2033,7 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): # Index.__rdiv__ won't try to operate elementwise, just raises result = tdser / vector.astype(object) if box_with_array is pd.DataFrame: - expected = [tdser.iloc[0, n] / vector[n] - for n in range(len(vector))] + expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] expected = tm.box_expected(expected, xbox) @@ -1954,20 +2042,28 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): with pytest.raises(TypeError, match=pattern): vector.astype(object) / tdser - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def 
test_td64arr_mul_int_series(self, box_df_fail, names): # GH#19042 test for correct name attachment box = box_df_fail # broadcasts along wrong axis, but doesn't raise - tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) # TODO: Should we be parametrizing over types for `ser` too? ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) - expected = Series(['0days', '1day', '4days', '9days', '16days'], - dtype='timedelta64[ns]', - name=names[2]) + expected = Series( + ["0days", "1day", "4days", "9days", "16days"], + dtype="timedelta64[ns]", + name=names[2], + ) tdi = tm.box_expected(tdi, box) box = Series if (box is pd.Index and type(ser) is Series) else box @@ -1981,22 +2077,30 @@ def test_td64arr_mul_int_series(self, box_df_fail, names): tm.assert_equal(result, expected) # TODO: Should we be parametrizing over types for `ser` too? - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def test_float_series_rdiv_td64arr(self, box_with_array, names): # GH#19042 test for correct name attachment # TODO: the direct operation TimedeltaIndex / Series still # needs to be fixed. box = box_with_array - tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) xname = names[2] if box is not tm.to_array else names[1] - expected = Series([tdi[n] / ser[n] for n in range(len(ser))], - dtype='timedelta64[ns]', - name=xname) + expected = Series( + [tdi[n] / ser[n] for n in range(len(ser))], + dtype="timedelta64[ns]", + name=xname, + ) xbox = box if box in [pd.Index, tm.to_array] and type(ser) is Series: @@ -2014,7 +2118,6 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): class TestTimedeltaArraylikeInvalidArithmeticOps: - def test_td64arr_pow_invalid(self, scalar_td, box_with_array): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan @@ -2024,7 +2127,7 @@ def test_td64arr_pow_invalid(self, scalar_td, box_with_array): # check that we are getting a TypeError # with 'operate' (from core/ops.py) for the ops that are not # defined - pattern = 'operate|unsupported|cannot|not supported' + pattern = "operate|unsupported|cannot|not supported" with pytest.raises(TypeError, match=pattern): scalar_td ** td1 diff --git a/pandas/tests/arrays/categorical/common.py b/pandas/tests/arrays/categorical/common.py index cc4b53407107c1..4ef93906569796 100644 --- a/pandas/tests/arrays/categorical/common.py +++ b/pandas/tests/arrays/categorical/common.py @@ -2,7 +2,7 @@ class TestCategorical: - def setup_method(self, method): - self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], - ordered=True) + self.factor = Categorical( + ["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True + ) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 6b75d06438889c..1508fef86ae62e 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -5,45 +5,42 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('ordered', [True, False]) 
-@pytest.mark.parametrize('categories', [ - ['b', 'a', 'c'], - ['a', 'b', 'c', 'd'], -]) +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]]) def test_factorize(categories, ordered): - cat = pd.Categorical(['b', 'b', 'a', 'c', None], - categories=categories, - ordered=ordered) + cat = pd.Categorical( + ["b", "b", "a", "c", None], categories=categories, ordered=ordered + ) labels, uniques = pd.factorize(cat) expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp) - expected_uniques = pd.Categorical(['b', 'a', 'c'], - categories=categories, - ordered=ordered) + expected_uniques = pd.Categorical( + ["b", "a", "c"], categories=categories, ordered=ordered + ) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort(): - cat = pd.Categorical(['b', 'b', None, 'a']) + cat = pd.Categorical(["b", "b", None, "a"]) labels, uniques = pd.factorize(cat, sort=True) expected_labels = np.array([1, 1, -1, 0], dtype=np.intp) - expected_uniques = pd.Categorical(['a', 'b']) + expected_uniques = pd.Categorical(["a", "b"]) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort_ordered(): - cat = pd.Categorical(['b', 'b', None, 'a'], - categories=['c', 'b', 'a'], - ordered=True) + cat = pd.Categorical( + ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True + ) labels, uniques = pd.factorize(cat, sort=True) expected_labels = np.array([0, 0, -1, 1], dtype=np.intp) - expected_uniques = pd.Categorical(['b', 'a'], - categories=['c', 'b', 'a'], - ordered=True) + expected_uniques = pd.Categorical( + ["b", "a"], categories=["c", "b", "a"], ordered=True + ) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) @@ -75,49 +72,52 @@ class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 def test_take_warns(self): - cat = pd.Categorical(['a', 'b']) + cat = pd.Categorical(["a", "b"]) with tm.assert_produces_warning(FutureWarning): cat.take([0, -1]) def test_take_positive_no_warning(self): - cat = pd.Categorical(['a', 'b']) + cat = pd.Categorical(["a", "b"]) with tm.assert_produces_warning(None): cat.take([0, 0]) def test_take_bounds(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 - cat = pd.Categorical(['a', 'b', 'a']) + cat = pd.Categorical(["a", "b", "a"]) with pytest.raises(IndexError): cat.take([4, 5], allow_fill=allow_fill) def test_take_empty(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 - cat = pd.Categorical([], categories=['a', 'b']) + cat = pd.Categorical([], categories=["a", "b"]) with pytest.raises(IndexError): cat.take([0], allow_fill=allow_fill) def test_positional_take(self, ordered_fixture): - cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'], - ordered=ordered_fixture) + cat = pd.Categorical( + ["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered_fixture + ) result = cat.take([0, 1, 2], allow_fill=False) - expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories, - ordered=ordered_fixture) + expected = pd.Categorical( + ["a", "a", "b"], categories=cat.categories, ordered=ordered_fixture + ) tm.assert_categorical_equal(result, expected) def test_positional_take_unobserved(self, ordered_fixture): - cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'], - ordered=ordered_fixture) + cat = pd.Categorical( + ["a", "b"], 
categories=["a", "b", "c"], ordered=ordered_fixture + ) result = cat.take([1, 0], allow_fill=False) - expected = pd.Categorical(['b', 'a'], categories=cat.categories, - ordered=ordered_fixture) + expected = pd.Categorical( + ["b", "a"], categories=cat.categories, ordered=ordered_fixture + ) tm.assert_categorical_equal(result, expected) def test_take_allow_fill(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'a', 'b']) + cat = pd.Categorical(["a", "a", "b"]) result = cat.take([0, -1, -1], allow_fill=True) - expected = pd.Categorical(['a', np.nan, np.nan], - categories=['a', 'b']) + expected = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) def test_take_fill_with_negative_one(self): @@ -129,14 +129,14 @@ def test_take_fill_with_negative_one(self): def test_take_fill_value(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'b', 'c']) - result = cat.take([0, 1, -1], fill_value='a', allow_fill=True) - expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) + cat = pd.Categorical(["a", "b", "c"]) + result = cat.take([0, 1, -1], fill_value="a", allow_fill=True) + expected = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) def test_take_fill_value_new_raises(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'b', 'c']) + cat = pd.Categorical(["a", "b", "c"]) xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." with pytest.raises(TypeError, match=xpr): - cat.take([0, 1, -1], fill_value='d', allow_fill=True) + cat.take([0, 1, -1], fill_value="d", allow_fill=True) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 134b16d828746d..d8831d7e6bf369 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -11,15 +11,14 @@ class TestCategoricalAnalytics: - def test_min_max(self): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) msg = "Categorical is not ordered for operation {}" - with pytest.raises(TypeError, match=msg.format('min')): + with pytest.raises(TypeError, match=msg.format("min")): cat.min() - with pytest.raises(TypeError, match=msg.format('max')): + with pytest.raises(TypeError, match=msg.format("max")): cat.max() cat = Categorical(["a", "b", "c", "d"], ordered=True) @@ -28,15 +27,17 @@ def test_min_max(self): assert _min == "a" assert _max == "d" - cat = Categorical(["a", "b", "c", "d"], - categories=['d', 'c', 'b', 'a'], ordered=True) + cat = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) _min = cat.min() _max = cat.max() assert _min == "d" assert _max == "a" - cat = Categorical([np.nan, "b", "c", np.nan], - categories=['d', 'c', 'b', 'a'], ordered=True) + cat = Categorical( + [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -47,8 +48,9 @@ def test_min_max(self): _max = cat.max(numeric_only=True) assert _max == "b" - cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], - ordered=True) + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -59,13 +61,17 @@ def test_min_max(self): _max = cat.max(numeric_only=True) assert _max == 
1 - @pytest.mark.parametrize("values,categories,exp_mode", [ - ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), - ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), - ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), - ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), - ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4])]) + @pytest.mark.parametrize( + "values,categories,exp_mode", + [ + ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), + ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), + ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), + ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ], + ) def test_mode(self, values, categories, exp_mode): s = Categorical(values, categories=categories, ordered=True) res = s.mode() @@ -76,34 +82,38 @@ def test_searchsorted(self): # https://github.com/pandas-dev/pandas/issues/8420 # https://github.com/pandas-dev/pandas/issues/14522 - c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=True) + c1 = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=True, + ) s1 = Series(c1) - c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=False) + c2 = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=False, + ) s2 = Series(c2) # Searching for single item argument, side='left' (default) - res_cat = c1.searchsorted('apple') + res_cat = c1.searchsorted("apple") assert res_cat == 2 assert is_scalar(res_cat) - res_ser = s1.searchsorted('apple') + res_ser = s1.searchsorted("apple") assert res_ser == 2 assert is_scalar(res_ser) # Searching for single item array, side='left' (default) - res_cat = c1.searchsorted(['bread']) - res_ser = s1.searchsorted(['bread']) + res_cat = c1.searchsorted(["bread"]) + res_ser = s1.searchsorted(["bread"]) exp = np.array([3], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) # Searching for several items array, side='right' - res_cat = c1.searchsorted(['apple', 'bread'], side='right') - res_ser = s1.searchsorted(['apple', 'bread'], side='right') + res_cat = c1.searchsorted(["apple", "bread"], side="right") + res_ser = s1.searchsorted(["apple", "bread"], side="right") exp = np.array([3, 5], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) @@ -111,22 +121,22 @@ def test_searchsorted(self): # Searching for a single value that is not from the Categorical msg = r"Value\(s\) to be inserted must be in categories" with pytest.raises(KeyError, match=msg): - c1.searchsorted('cucumber') + c1.searchsorted("cucumber") with pytest.raises(KeyError, match=msg): - s1.searchsorted('cucumber') + s1.searchsorted("cucumber") # Searching for multiple values one of each is not from the Categorical with pytest.raises(KeyError, match=msg): - c1.searchsorted(['bread', 'cucumber']) + c1.searchsorted(["bread", "cucumber"]) with pytest.raises(KeyError, match=msg): - s1.searchsorted(['bread', 'cucumber']) + s1.searchsorted(["bread", "cucumber"]) # searchsorted call for unordered Categorical msg = "Categorical not ordered" with pytest.raises(ValueError, match=msg): - c2.searchsorted('apple') + 
c2.searchsorted("apple") with pytest.raises(ValueError, match=msg): - s2.searchsorted('apple') + s2.searchsorted("apple") def test_unique(self): # categories are reordered based on value when ordered=False @@ -141,17 +151,15 @@ def test_unique(self): tm.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, Categorical(exp)) - cat = Categorical(["c", "a", "b", "a", "a"], - categories=["a", "b", "c"]) + cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) exp = Index(["c", "a", "b"]) res = cat.unique() tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(exp, categories=['c', 'a', 'b']) + exp_cat = Categorical(exp, categories=["c", "a", "b"]) tm.assert_categorical_equal(res, exp_cat) # nan must be removed - cat = Categorical(["b", np.nan, "b", np.nan, "a"], - categories=["a", "b", "c"]) + cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) res = cat.unique() exp = Index(["b", "a"]) tm.assert_index_equal(res.categories, exp) @@ -160,29 +168,28 @@ def test_unique(self): def test_unique_ordered(self): # keep categories order when ordered=True - cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True) + cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) res = cat.unique() - exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True + ) res = cat.unique() - exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'], - ordered=True) + exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) res = cat.unique() - exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True + ) res = cat.unique() - exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'], - ordered=True) + exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) def test_unique_index_series(self): @@ -211,18 +218,19 @@ def test_unique_index_series(self): def test_shift(self): # GH 9416 - cat = Categorical(['a', 'b', 'c', 'd', 'a']) + cat = Categorical(["a", "b", "c", "d", "a"]) # shift forward sp1 = cat.shift(1) - xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd']) + xp1 = Categorical([np.nan, "a", "b", "c", "d"]) tm.assert_categorical_equal(sp1, xp1) tm.assert_categorical_equal(cat[:-1], sp1[1:]) # shift back sn2 = cat.shift(-2) - xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan], - categories=['a', 'b', 'c', 'd']) + xp2 = Categorical( + ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"] + ) tm.assert_categorical_equal(sn2, xp2) tm.assert_categorical_equal(cat[2:], sn2[:-2]) @@ -241,7 +249,7 @@ def test_memory_usage(self): assert 0 < cat.nbytes <= cat.memory_usage() assert 0 < cat.nbytes <= cat.memory_usage(deep=True) - 
cat = Categorical(['foo', 'foo', 'bar']) + cat = Categorical(["foo", "foo", "bar"]) assert cat.memory_usage(deep=True) > cat.nbytes if not PYPY: @@ -251,14 +259,14 @@ def test_memory_usage(self): assert abs(diff) < 100 def test_map(self): - c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True) + c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) result = c.map(lambda x: x.lower()) - exp = Categorical(list('ababc'), categories=list('cba'), ordered=True) + exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) tm.assert_categorical_equal(result, exp) - c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False) + c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False) result = c.map(lambda x: x.lower()) - exp = Categorical(list('ababc'), categories=list('abc'), ordered=False) + exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) tm.assert_categorical_equal(result, exp) result = c.map(lambda x: 1) @@ -266,7 +274,7 @@ def test_map(self): tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) def test_validate_inplace(self): - cat = Categorical(['A', 'B', 'B', 'C', 'A']) + cat = Categorical(["A", "B", "B", "C", "A"]) invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: @@ -280,21 +288,19 @@ def test_validate_inplace(self): cat.as_unordered(inplace=value) with pytest.raises(ValueError): - cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value) + cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) with pytest.raises(ValueError): - cat.rename_categories(['X', 'Y', 'Z'], inplace=value) + cat.rename_categories(["X", "Y", "Z"], inplace=value) with pytest.raises(ValueError): - cat.reorder_categories( - ['X', 'Y', 'Z'], ordered=True, inplace=value) + cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) with pytest.raises(ValueError): - cat.add_categories( - new_categories=['D', 'E', 'F'], inplace=value) + cat.add_categories(new_categories=["D", "E", "F"], inplace=value) with pytest.raises(ValueError): - cat.remove_categories(removals=['D', 'E', 'F'], inplace=value) + cat.remove_categories(removals=["D", "E", "F"], inplace=value) with pytest.raises(ValueError): cat.remove_unused_categories(inplace=value) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index d2f63268e5a123..ab07b3c96a1db5 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -8,23 +8,22 @@ class TestCategoricalAPI: - def test_ordered_api(self): # GH 9347 - cat1 = Categorical(list('acb'), ordered=False) - tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c'])) + cat1 = Categorical(list("acb"), ordered=False) + tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"])) assert not cat1.ordered - cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False) - tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a'])) + cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False) + tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"])) assert not cat2.ordered - cat3 = Categorical(list('acb'), ordered=True) - tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c'])) + cat3 = Categorical(list("acb"), ordered=True) + tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"])) assert cat3.ordered - cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True) - tm.assert_index_equal(cat4.categories, Index(['b', 'c', 
'a'])) + cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True) + tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"])) assert cat4.ordered def test_set_ordered(self): @@ -47,7 +46,7 @@ def test_set_ordered(self): assert not cat2.ordered # removed in 0.19.0 - msg = "can\'t set attribute" + msg = "can't set attribute" with pytest.raises(AttributeError, match=msg): cat.ordered = True with pytest.raises(AttributeError, match=msg): @@ -58,8 +57,9 @@ def test_rename_categories(self): # inplace=False: the old one must not be changed res = cat.rename_categories([1, 2, 3]) - tm.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1], - dtype=np.int64)) + tm.assert_numpy_array_equal( + res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) tm.assert_index_equal(res.categories, Index([1, 2, 3])) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) @@ -76,8 +76,9 @@ def test_rename_categories(self): # and now inplace res = cat.rename_categories([1, 2, 3], inplace=True) assert res is None - tm.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1], - dtype=np.int64)) + tm.assert_numpy_array_equal( + cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) tm.assert_index_equal(cat.categories, Index([1, 2, 3])) # Lengthen @@ -90,50 +91,49 @@ def test_rename_categories(self): def test_rename_categories_series(self): # https://github.com/pandas-dev/pandas/issues/17981 - c = Categorical(['a', 'b']) - result = c.rename_categories(Series([0, 1], index=['a', 'b'])) + c = Categorical(["a", "b"]) + result = c.rename_categories(Series([0, 1], index=["a", "b"])) expected = Categorical([0, 1]) tm.assert_categorical_equal(result, expected) def test_rename_categories_dict(self): # GH 17336 - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}) expected = Index([4, 3, 2, 1]) tm.assert_index_equal(res.categories, expected) # Test for inplace - res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1}, - inplace=True) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) assert res is None tm.assert_index_equal(cat.categories, expected) # Test for dicts of smaller length - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 1, 'c': 3}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "c": 3}) - expected = Index([1, 'b', 3, 'd']) + expected = Index([1, "b", 3, "d"]) tm.assert_index_equal(res.categories, expected) # Test for dicts with bigger length - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3, - 'd': 4, 'e': 5, 'f': 6}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6}) expected = Index([1, 2, 3, 4]) tm.assert_index_equal(res.categories, expected) # Test for dicts with no items from old categories - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'f': 1, 'g': 3}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"f": 1, "g": 3}) - expected = Index(['a', 'b', 'c', 'd']) + expected = Index(["a", "b", "c", "d"]) tm.assert_index_equal(res.categories, expected) def test_reorder_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"], - ordered=True) + new = Categorical( 
+ ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True + ) # first inplace == False res = cat.reorder_categories(["c", "b", "a"]) @@ -164,8 +164,9 @@ def test_reorder_categories(self): def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", "c", "a"], - categories=["a", "b", "c", "d"], ordered=True) + new = Categorical( + ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True + ) # first inplace == False res = cat.add_categories("d") @@ -187,8 +188,7 @@ def test_add_categories(self): # GH 9927 cat = Categorical(list("abc"), ordered=True) - expected = Categorical( - list("abc"), categories=list("abcde"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) # test with Series, np.array, index, list res = cat.add_categories(Series(["d", "e"])) tm.assert_categorical_equal(res, expected) @@ -222,13 +222,11 @@ def test_set_categories(self): # np.nan cat = Categorical(["a", "b", "c", "a"], ordered=True) res = cat.set_categories(["a"]) - tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8)) # still not all "old" in "new" res = cat.set_categories(["a", "b", "d"]) - tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8)) tm.assert_index_equal(res.categories, Index(["a", "b", "d"])) # all "old" included in "new" @@ -238,8 +236,7 @@ def test_set_categories(self): # internals... c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8)) tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) @@ -249,8 +246,7 @@ def test_set_categories(self): c = c.set_categories([4, 3, 2, 1]) # positions are changed - tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8)) # categories are now in new order tm.assert_index_equal(c.categories, Index([4, 3, 2, 1])) @@ -273,31 +269,33 @@ def test_set_categories(self): tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) - @pytest.mark.parametrize('values, categories, new_categories', [ - # No NaNs, same cats, same order - (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],), - # Same, unsorted - (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],), - # NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']), - (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - # Introduce NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a']), - (['a', 'b', 'c'], ['a', 'b'], ['b']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - # No overlap - (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']), - ]) - @pytest.mark.parametrize('ordered', [True, False]) - def test_set_categories_many(self, values, categories, new_categories, - ordered): + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", 
"b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_categories_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) result = c.set_categories(new_categories, ordered=ordered) @@ -305,28 +303,27 @@ def test_set_categories_many(self, values, categories, new_categories, def test_set_categories_rename_less(self): # GH 24675 - cat = Categorical(['A', 'B']) - result = cat.set_categories(['A'], rename=True) - expected = Categorical(['A', np.nan]) + cat = Categorical(["A", "B"]) + result = cat.set_categories(["A"], rename=True) + expected = Categorical(["A", np.nan]) tm.assert_categorical_equal(result, expected) def test_set_categories_private(self): - cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) - cat._set_categories(['a', 'c', 'd', 'e']) - expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"]) + expected = Categorical(["a", "c", "d"], categories=list("acde")) tm.assert_categorical_equal(cat, expected) # fastpath - cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) - cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True) - expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"], fastpath=True) + expected = Categorical(["a", "c", "d"], categories=list("acde")) tm.assert_categorical_equal(cat, expected) def test_remove_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], - ordered=True) + new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True) # first inplace == False res = cat.remove_categories("c") @@ -347,8 +344,7 @@ def test_remove_categories(self): cat.remove_categories(["c"]) def test_remove_unused_categories(self): - c = Categorical(["a", "b", "c", "d", "a"], - categories=["a", "b", "c", "d", "e"]) + c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) exp_categories_all = Index(["a", "b", "c", "d", "e"]) exp_categories_dropped = Index(["a", "b", "c", "d"]) @@ -363,25 +359,23 @@ def test_remove_unused_categories(self): assert res is None # with NaN values (GH11599) - c = Categorical(["a", "b", "c", np.nan], - categories=["a", "b", "c", "d", "e"]) + c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - tm.assert_index_equal(res.categories, - Index(np.array(["a", "b", "c"]))) + tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"]))) exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) 
tm.assert_numpy_array_equal(res.codes, exp_codes) tm.assert_index_equal(c.categories, exp_categories_all) - val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] - cat = Categorical(values=val, categories=list('ABCDEFG')) + val = ["F", np.nan, "D", "B", "D", "F", np.nan] + cat = Categorical(values=val, categories=list("ABCDEFG")) out = cat.remove_unused_categories() - tm.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + tm.assert_index_equal(out.categories, Index(["B", "D", "F"])) exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) tm.assert_numpy_array_equal(out.codes, exp_codes) assert out.tolist() == val - alpha = list('abcdefghijklmnopqrstuvwxyz') - val = np.random.choice(alpha[::2], 10000).astype('object') + alpha = list("abcdefghijklmnopqrstuvwxyz") + val = np.random.choice(alpha[::2], 10000).astype("object") val[np.random.choice(len(val), 100)] = np.nan cat = Categorical(values=val, categories=alpha) @@ -390,16 +384,16 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor(TestCategorical): - def test_describe(self): # string type desc = self.factor.describe() assert self.factor.ordered - exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories', - ordered=self.factor.ordered) - expected = DataFrame({'counts': [3, 2, 3], - 'freqs': [3 / 8., 2 / 8., 3 / 8.]}, - index=exp_index) + exp_index = CategoricalIndex( + ["a", "b", "c"], name="categories", ordered=self.factor.ordered + ) + expected = DataFrame( + {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index + ) tm.assert_frame_equal(desc, expected) # check unused categories @@ -408,51 +402,53 @@ def test_describe(self): desc = cat.describe() exp_index = CategoricalIndex( - list('abcd'), ordered=self.factor.ordered, name='categories') - expected = DataFrame({'counts': [3, 2, 3, 0], - 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]}, - index=exp_index) + list("abcd"), ordered=self.factor.ordered, name="categories" + ) + expected = DataFrame( + {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]}, + index=exp_index, + ) tm.assert_frame_equal(desc, expected) # check an integer one cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) desc = cat.describe() - exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, - name='categories') - expected = DataFrame({'counts': [5, 3, 3], - 'freqs': [5 / 11., 3 / 11., 3 / 11.]}, - index=exp_index) + exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") + expected = DataFrame( + {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]}, + index=exp_index, + ) tm.assert_frame_equal(desc, expected) # https://github.com/pandas-dev/pandas/issues/3678 # describe should work with NaN cat = Categorical([np.nan, 1, 2, 2]) desc = cat.describe() - expected = DataFrame({'counts': [1, 2, 1], - 'freqs': [1 / 4., 2 / 4., 1 / 4.]}, - index=CategoricalIndex([1, 2, np.nan], - categories=[1, 2], - name='categories')) + expected = DataFrame( + {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]}, + index=CategoricalIndex( + [1, 2, np.nan], categories=[1, 2], name="categories" + ), + ) tm.assert_frame_equal(desc, expected) def test_set_categories_inplace(self): cat = self.factor.copy() - cat.set_categories(['a', 'b', 'c', 'd'], inplace=True) - tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd'])) + cat.set_categories(["a", "b", "c", "d"], inplace=True) + tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"])) class TestPrivateCategoricalAPI: - def test_codes_immutable(self): # Codes 
should be read only c = Categorical(["a", "b", "c", "a", np.nan]) - exp = np.array([0, 1, 2, 0, -1], dtype='int8') + exp = np.array([0, 1, 2, 0, -1], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) # Assignments to codes should raise with pytest.raises(ValueError): - c.codes = np.array([0, 1, 2, 0, 1], dtype='int8') + c.codes = np.array([0, 1, 2, 0, 1], dtype="int8") # changes in the codes array should raise codes = c.codes @@ -463,26 +459,29 @@ def test_codes_immutable(self): # But even after getting the codes, the original array should still be # writeable! c[4] = "a" - exp = np.array([0, 1, 2, 0, 0], dtype='int8') + exp = np.array([0, 1, 2, 0, 0], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) c._codes[4] = 2 - exp = np.array([0, 1, 2, 0, 2], dtype='int8') + exp = np.array([0, 1, 2, 0, 2], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) - @pytest.mark.parametrize('codes, old, new, expected', [ - ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]), - ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]), - ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]), - ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]), - ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]), - ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]), - ([-1, -1], [], ['a', 'b'], [-1, -1]), - ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]), - ]) + @pytest.mark.parametrize( + "codes, old, new, expected", + [ + ([0, 1], ["a", "b"], ["a", "b"], [0, 1]), + ([0, 1], ["b", "a"], ["b", "a"], [0, 1]), + ([0, 1], ["a", "b"], ["b", "a"], [1, 0]), + ([0, 1], ["b", "a"], ["a", "b"], [1, 0]), + ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]), + ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]), + ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]), + ([-1, -1], [], ["a", "b"], [-1, -1]), + ([1, 0], ["b", "a"], ["a", "b"], [0, 1]), + ], + ) def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index a28977a87de8d5..4bf31a52dcda8f 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -8,14 +8,23 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DatetimeIndex, Index, Interval, - IntervalIndex, NaT, Series, Timestamp, date_range, period_range, - timedelta_range) + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + timedelta_range, +) import pandas.util.testing as tm class TestCategoricalConstructors: - def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" @@ -28,8 +37,9 @@ def test_validate_ordered(self): Categorical([1, 2, 3], ordered=ordered) with pytest.raises(exp_err, match=exp_msg): - Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], - ordered=ordered) + Categorical.from_codes( + [0, 0, 1], categories=["a", "b", "c"], ordered=ordered + ) def test_constructor_empty(self): 
# GH 17248 @@ -57,32 +67,42 @@ def test_constructor_tuples(self): def test_constructor_tuples_datetimes(self): # numpy will auto reshape when all of the tuples are the # same len, so add an extra one with 2 items and slice it off - values = np.array([(Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),), - (Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),), - ('a', 'b')], dtype=object)[:-1] + values = np.array( + [ + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + ("a", "b"), + ], + dtype=object, + )[:-1] result = Categorical(values) - expected = Index([(Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),)], tupleize_cols=False) + expected = Index( + [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)], + tupleize_cols=False, + ) tm.assert_index_equal(result.categories, expected) def test_constructor_unsortable(self): # it works! - arr = np.array([1, 2, 3, datetime.now()], dtype='O') + arr = np.array([1, 2, 3, datetime.now()], dtype="O") factor = Categorical(arr, ordered=False) assert not factor.ordered # this however will raise as cannot be sorted - msg = ("'values' is not ordered, please explicitly specify the " - "categories order by passing in a categories argument.") + msg = ( + "'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument." + ) with pytest.raises(TypeError, match=msg): Categorical(arr, ordered=True) def test_constructor_interval(self): - result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], - ordered=True) + result = Categorical( + [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True + ) ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)]) exp = Categorical(ii, ordered=True) tm.assert_categorical_equal(result, exp) @@ -143,8 +163,7 @@ def test_constructor(self): tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - c2 = Categorical(Series(["a", "b", "c", "a"]), - categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) tm.assert_categorical_equal(c1, c2) # This should result in integer categories, not float! 
@@ -156,10 +175,10 @@ def test_constructor(self): assert is_integer_dtype(cat.categories) # this should result in floats - cat = Categorical([np.nan, 1, 2., 3]) + cat = Categorical([np.nan, 1, 2.0, 3]) assert is_float_dtype(cat.categories) - cat = Categorical([np.nan, 1., 2., 3.]) + cat = Categorical([np.nan, 1.0, 2.0, 3.0]) assert is_float_dtype(cat.categories) # This doesn't work -> this would probably need some kind of "remember @@ -193,12 +212,10 @@ def test_constructor(self): # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN with tm.assert_produces_warning(None): - c_old = Categorical([0, 1, 2, 0, 1, 2], - categories=["a", "b", "c"]) # noqa + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa with tm.assert_produces_warning(None): - c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa - categories=[3, 4, 5]) + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa # the next one are from the old docs with tm.assert_produces_warning(None): @@ -207,8 +224,9 @@ def test_constructor(self): # this is a legitimate constructor with tm.assert_produces_warning(None): - c = Categorical(np.array([], dtype='int64'), # noqa - categories=[3, 2, 1], ordered=True) + c = Categorical( + np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True # noqa + ) def test_constructor_with_existing_categories(self): # GH25318: constructing with pd.Series used to bogusly skip recoding @@ -226,32 +244,32 @@ def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 msg = r"^Parameter 'categories' must be list-like, was" with pytest.raises(TypeError, match=msg): - Categorical(['a', 'b'], categories='a') + Categorical(["a", "b"], categories="a") def test_constructor_with_null(self): # Cannot have NaN in categories msg = "Categorial categories cannot be null" with pytest.raises(ValueError, match=msg): - Categorical([np.nan, "a", "b", "c"], - categories=[np.nan, "a", "b", "c"]) + Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) with pytest.raises(ValueError, match=msg): - Categorical([None, "a", "b", "c"], - categories=[None, "a", "b", "c"]) + Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) with pytest.raises(ValueError, match=msg): - Categorical(DatetimeIndex(['nat', '20160101']), - categories=[NaT, Timestamp('20160101')]) + Categorical( + DatetimeIndex(["nat", "20160101"]), + categories=[NaT, Timestamp("20160101")], + ) def test_constructor_with_index(self): - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) tm.assert_categorical_equal(ci.values, Categorical(ci)) - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - tm.assert_categorical_equal(ci.values, - Categorical(ci.astype(object), - categories=ci.categories)) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal( + ci.values, Categorical(ci.astype(object), categories=ci.categories) + ) def test_constructor_with_generator(self): # This was raising an Error in isna(single_val).any() because isna @@ -266,7 +284,8 @@ def test_constructor_with_generator(self): # This uses xrange internally from pandas.core.index import MultiIndex - MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + MultiIndex.from_product([range(5), ["a", "b", "c"]]) # check that categories accept generators and sequences cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) @@ -274,12 
+293,14 @@ def test_constructor_with_generator(self): cat = Categorical([0, 1, 2], categories=xrange(3)) tm.assert_categorical_equal(cat, exp) - @pytest.mark.parametrize("dtl", [ - date_range("1995-01-01 00:00:00", periods=5, freq="s"), - date_range("1995-01-01 00:00:00", periods=5, - freq="s", tz="US/Eastern"), - timedelta_range("1 day", periods=5, freq="s") - ]) + @pytest.mark.parametrize( + "dtl", + [ + date_range("1995-01-01 00:00:00", periods=5, freq="s"), + date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"), + timedelta_range("1 day", periods=5, freq="s"), + ], + ) def test_constructor_with_datetimelike(self, dtl): # see gh-12077 # constructor with a datetimelike and NaT @@ -310,8 +331,7 @@ def test_constructor_with_datetimelike(self, dtl): assert "NaT" in result def test_constructor_from_index_series_datetimetz(self): - idx = date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') + idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -319,7 +339,7 @@ def test_constructor_from_index_series_datetimetz(self): tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_timedelta(self): - idx = timedelta_range('1 days', freq='D', periods=3) + idx = timedelta_range("1 days", freq="D", periods=3) result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -327,7 +347,7 @@ def test_constructor_from_index_series_timedelta(self): tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_period(self): - idx = period_range('2015-01-01', freq='D', periods=3) + idx = period_range("2015-01-01", freq="D", periods=3) result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -337,50 +357,52 @@ def test_constructor_from_index_series_period(self): def test_constructor_invariant(self): # GH 14190 vals = [ - np.array([1., 1.2, 1.8, np.nan]), - np.array([1, 2, 3], dtype='int64'), - ['a', 'b', 'c', np.nan], - [pd.Period('2014-01'), pd.Period('2014-02'), NaT], - [Timestamp('2014-01-01'), Timestamp('2014-01-02'), NaT], - [Timestamp('2014-01-01', tz='US/Eastern'), - Timestamp('2014-01-02', tz='US/Eastern'), NaT], + np.array([1.0, 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype="int64"), + ["a", "b", "c", np.nan], + [pd.Period("2014-01"), pd.Period("2014-02"), NaT], + [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT], + [ + Timestamp("2014-01-01", tz="US/Eastern"), + Timestamp("2014-01-02", tz="US/Eastern"), + NaT, + ], ] for val in vals: c = Categorical(val) c2 = Categorical(c) tm.assert_categorical_equal(c, c2) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("ordered", [True, False]) def test_constructor_with_dtype(self, ordered): - categories = ['b', 'a', 'c'] + categories = ["b", "a", "c"] dtype = CategoricalDtype(categories, ordered=ordered) - result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype) - expected = Categorical(['a', 'b', 'a', 'c'], categories=categories, - ordered=ordered) + result = Categorical(["a", "b", "a", "c"], dtype=dtype) + expected = Categorical( + ["a", "b", "a", "c"], categories=categories, ordered=ordered + ) tm.assert_categorical_equal(result, expected) assert result.ordered is ordered def test_constructor_dtype_and_others_raises(self): - dtype = CategoricalDtype(['a', 'b'], ordered=True) + dtype = CategoricalDtype(["a", "b"], ordered=True) msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype) + Categorical(["a", "b"], categories=["a", "b"], dtype=dtype) with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], ordered=True, dtype=dtype) + Categorical(["a", "b"], ordered=True, dtype=dtype) with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], ordered=False, dtype=dtype) + Categorical(["a", "b"], ordered=False, dtype=dtype) - @pytest.mark.parametrize('categories', [ - None, ['a', 'b'], ['a', 'c'], - ]) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) + @pytest.mark.parametrize("ordered", [True, False]) def test_constructor_str_category(self, categories, ordered): - result = Categorical(['a', 'b'], categories=categories, - ordered=ordered, dtype='category') - expected = Categorical(['a', 'b'], categories=categories, - ordered=ordered) + result = Categorical( + ["a", "b"], categories=categories, ordered=ordered, dtype="category" + ) + expected = Categorical(["a", "b"], categories=categories, ordered=ordered) tm.assert_categorical_equal(result, expected) def test_constructor_str_unknown(self): @@ -388,46 +410,48 @@ def test_constructor_str_unknown(self): Categorical([1, 2], dtype="foo") def test_constructor_from_categorical_with_dtype(self): - dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True) - values = Categorical(['a', 'b', 'd']) + dtype = CategoricalDtype(["a", "b", "c"], ordered=True) + values = Categorical(["a", "b", "d"]) result = Categorical(values, dtype=dtype) # We use dtype.categories, not values.categories - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], - ordered=True) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_constructor_from_categorical_with_unknown_dtype(self): dtype = CategoricalDtype(None, ordered=True) - values = Categorical(['a', 'b', 'd']) + values = Categorical(["a", "b", "d"]) result = Categorical(values, dtype=dtype) # We use values.categories, not dtype.categories - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'], - ordered=True) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "d"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_constructor_from_categorical_string(self): - values = Categorical(['a', 'b', 'd']) + values = Categorical(["a", "b", "d"]) # use categories, ordered - result = Categorical(values, categories=['a', 'b', 'c'], ordered=True, - dtype='category') - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], - ordered=True) + result = Categorical( + values, categories=["a", "b", "c"], ordered=True, dtype="category" + ) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) # No string - result = Categorical(values, categories=['a', 'b', 'c'], ordered=True) + result = Categorical(values, categories=["a", "b", "c"], ordered=True) tm.assert_categorical_equal(result, expected) def test_constructor_with_categorical_categories(self): # GH17884 - expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) - result = Categorical( - ['a', 'b'], categories=Categorical(['a', 'b', 'c'])) + result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"])) tm.assert_categorical_equal(result, 
expected) - result = Categorical( - ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c'])) + result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"])) tm.assert_categorical_equal(result, expected) def test_from_codes(self): @@ -448,13 +472,11 @@ def test_from_codes(self): Categorical.from_codes(["a"], dtype=dtype) # no unique categories - with pytest.raises(ValueError, - match="Categorical categories must be unique"): + with pytest.raises(ValueError, match="Categorical categories must be unique"): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) # NaN categories included - with pytest.raises(ValueError, - match="Categorial categories cannot be null"): + with pytest.raises(ValueError, match="Categorial categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) # too negative @@ -474,103 +496,100 @@ def test_from_codes(self): def test_from_codes_with_categorical_categories(self): # GH17884 - expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) - result = Categorical.from_codes( - [0, 1], categories=Categorical(['a', 'b', 'c'])) + result = Categorical.from_codes([0, 1], categories=Categorical(["a", "b", "c"])) tm.assert_categorical_equal(result, expected) result = Categorical.from_codes( - [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) + [0, 1], categories=CategoricalIndex(["a", "b", "c"]) + ) tm.assert_categorical_equal(result, expected) # non-unique Categorical still raises - with pytest.raises(ValueError, - match="Categorical categories must be unique"): - Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1], Categorical(["a", "b", "a"])) def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] - dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + dtype = CategoricalDtype(categories=["a", "b", "c"]) + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, categories=dtype.categories) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_float(self): # GH21767 codes = [1.0, 2.0, 0] # integer, but in float dtype - dtype = CategoricalDtype(categories=['a', 'b', 'c']) + dtype = CategoricalDtype(categories=["a", "b", "c"]) with tm.assert_produces_warning(FutureWarning): cat = Categorical.from_codes(codes, dtype.categories) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) with tm.assert_produces_warning(FutureWarning): cat = Categorical.from_codes(codes, dtype=dtype) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) codes = [1.1, 2.0, 0] # non-integer - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype.categories) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like 
integers"): Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_dtype_raises(self): - msg = 'Cannot specify' + msg = "Cannot specify" with pytest.raises(ValueError, match=msg): - Categorical.from_codes([0, 1], categories=['a', 'b'], - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes( + [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"]) + ) with pytest.raises(ValueError, match=msg): - Categorical.from_codes([0, 1], ordered=True, - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes( + [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"]) + ) def test_from_codes_neither(self): msg = "Both were None" with pytest.raises(ValueError, match=msg): Categorical.from_codes([0, 1]) - @pytest.mark.parametrize('dtype', [None, 'category']) + @pytest.mark.parametrize("dtype", [None, "category"]) def test_from_inferred_categories(self, dtype): - cats = ['a', 'b'] - codes = np.array([0, 0, 1, 1], dtype='i8') + cats = ["a", "b"] + codes = np.array([0, 0, 1, 1], dtype="i8") result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical.from_codes(codes, cats) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, 'category']) + @pytest.mark.parametrize("dtype", [None, "category"]) def test_from_inferred_categories_sorts(self, dtype): - cats = ['b', 'a'] - codes = np.array([0, 1, 1, 1], dtype='i8') + cats = ["b", "a"] + codes = np.array([0, 1, 1, 1], dtype="i8") result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) + expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"]) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_dtype(self): - cats = ['a', 'b', 'd'] - codes = np.array([0, 1, 0, 2], dtype='i8') - dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) + cats = ["a", "b", "d"] + codes = np.array([0, 1, 0, 2], dtype="i8") + dtype = CategoricalDtype(["c", "b", "a"], ordered=True) result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical(['a', 'b', 'a', 'd'], - categories=['c', 'b', 'a'], - ordered=True) + expected = Categorical( + ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_coerces(self): - cats = ['1', '2', 'bad'] - codes = np.array([0, 0, 1, 2], dtype='i8') + cats = ["1", "2", "bad"] + codes = np.array([0, 0, 1, 2], dtype="i8") dtype = CategoricalDtype([1, 2]) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('ordered', [None, True, False]) + @pytest.mark.parametrize("ordered", [None, True, False]) def test_construction_with_ordered(self, ordered): # GH 9347, 9190 cat = Categorical([0, 1, 2], ordered=ordered) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 14ad3c4d5e8608..be64b1f28c733d 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -8,24 +8,22 @@ class TestCategoricalDtypes: - def test_is_equal_dtype(self): # test dtype comparisons between cats - c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False) - c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False) - c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True) + c1 = 
Categorical(list("aabca"), categories=list("abc"), ordered=False) + c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) + c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) assert c1.is_dtype_equal(c1) assert c2.is_dtype_equal(c2) assert c3.is_dtype_equal(c3) assert c1.is_dtype_equal(c2) assert not c1.is_dtype_equal(c3) - assert not c1.is_dtype_equal(Index(list('aabca'))) + assert not c1.is_dtype_equal(Index(list("aabca"))) assert not c1.is_dtype_equal(c1.astype(object)) assert c1.is_dtype_equal(CategoricalIndex(c1)) - assert (c1.is_dtype_equal( - CategoricalIndex(c1, categories=list('cab')))) + assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) # GH 16659 @@ -40,83 +38,85 @@ def test_is_equal_dtype(self): assert not c1.is_dtype_equal(s1.astype(object)) def test_set_dtype_same(self): - c = Categorical(['a', 'b', 'c']) - result = c._set_dtype(CategoricalDtype(['a', 'b', 'c'])) + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) tm.assert_categorical_equal(result, c) def test_set_dtype_new_categories(self): - c = Categorical(['a', 'b', 'c']) - result = c._set_dtype(CategoricalDtype(list('abcd'))) + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(list("abcd"))) tm.assert_numpy_array_equal(result.codes, c.codes) - tm.assert_index_equal(result.dtype.categories, Index(list('abcd'))) - - @pytest.mark.parametrize('values, categories, new_categories', [ - # No NaNs, same cats, same order - (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],), - # Same, unsorted - (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],), - # NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']), - (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - # Introduce NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a']), - (['a', 'b', 'c'], ['a', 'b'], ['b']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - # No overlap - (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']), - ]) - @pytest.mark.parametrize('ordered', [True, False]) - def test_set_dtype_many(self, values, categories, new_categories, - ordered): + tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) + + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, 
ordered) result = c._set_dtype(expected.dtype) tm.assert_categorical_equal(result, expected) def test_set_dtype_no_overlap(self): - c = Categorical(['a', 'b', 'c'], ['d', 'e']) - result = c._set_dtype(CategoricalDtype(['a', 'b'])) - expected = Categorical([None, None, None], categories=['a', 'b']) + c = Categorical(["a", "b", "c"], ["d", "e"]) + result = c._set_dtype(CategoricalDtype(["a", "b"])) + expected = Categorical([None, None, None], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) def test_codes_dtypes(self): # GH 8453 - result = Categorical(['foo', 'bar', 'baz']) - assert result.codes.dtype == 'int8' + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" - result = Categorical(['foo%05d' % i for i in range(400)]) - assert result.codes.dtype == 'int16' + result = Categorical(["foo%05d" % i for i in range(400)]) + assert result.codes.dtype == "int16" - result = Categorical(['foo%05d' % i for i in range(40000)]) - assert result.codes.dtype == 'int32' + result = Categorical(["foo%05d" % i for i in range(40000)]) + assert result.codes.dtype == "int32" # adding cats - result = Categorical(['foo', 'bar', 'baz']) - assert result.codes.dtype == 'int8' - result = result.add_categories(['foo%05d' % i for i in range(400)]) - assert result.codes.dtype == 'int16' + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + result = result.add_categories(["foo%05d" % i for i in range(400)]) + assert result.codes.dtype == "int16" # removing cats - result = result.remove_categories(['foo%05d' % i for i in range(300)]) - assert result.codes.dtype == 'int8' + result = result.remove_categories(["foo%05d" % i for i in range(300)]) + assert result.codes.dtype == "int8" - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("ordered", [True, False]) def test_astype(self, ordered): # string - cat = Categorical(list('abbaaccc'), ordered=ordered) + cat = Categorical(list("abbaaccc"), ordered=ordered) result = cat.astype(object) expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = 'could not convert string to float' + msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): cat.astype(float) @@ -134,37 +134,36 @@ def test_astype(self, ordered): expected = np.array(cat, dtype=np.float) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('cat_ordered', [True, False]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("cat_ordered", [True, False]) def test_astype_category(self, dtype_ordered, cat_ordered): # GH 10696/18593 - data = list('abcaacbab') - cat = Categorical(data, categories=list('bac'), ordered=cat_ordered) + data = list("abcaacbab") + cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = cat.astype(dtype) - expected = Categorical( - data, categories=cat.categories, ordered=dtype_ordered) + expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) tm.assert_categorical_equal(result, expected) # non-standard categories - dtype = CategoricalDtype(list('adc'), dtype_ordered) + dtype = CategoricalDtype(list("adc"), dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test 
once - result = cat.astype('category') + result = cat.astype("category") expected = cat tm.assert_categorical_equal(result, expected) def test_astype_category_ordered_none_deprecated(self): # GH 26336 - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb')) - cat = Categorical(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb")) + cat = Categorical(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(FutureWarning): cat.astype(cdt2) @@ -175,7 +174,6 @@ def test_iter_python_types(self): assert isinstance(cat.tolist()[0], int) def test_iter_python_types_datetime(self): - cat = Categorical([Timestamp('2017-01-01'), - Timestamp('2017-01-02')]) + cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")]) assert isinstance(list(cat)[0], Timestamp) assert isinstance(cat.tolist()[0], Timestamp) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 98f8ccb48c44be..6edd7fd00b7073 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -9,73 +9,74 @@ class TestCategoricalIndexingWithFactor(TestCategorical): - def test_getitem(self): - assert self.factor[0] == 'a' - assert self.factor[-1] == 'c' + assert self.factor[0] == "a" + assert self.factor[-1] == "c" subf = self.factor[[0, 1, 2]] - tm.assert_numpy_array_equal(subf._codes, - np.array([0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8)) - subf = self.factor[np.asarray(self.factor) == 'c'] - tm.assert_numpy_array_equal(subf._codes, - np.array([2, 2, 2], dtype=np.int8)) + subf = self.factor[np.asarray(self.factor) == "c"] + tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) def test_setitem(self): # int/positional c = self.factor.copy() - c[0] = 'b' - assert c[0] == 'b' - c[-1] = 'a' - assert c[-1] == 'a' + c[0] = "b" + assert c[0] == "b" + c[-1] = "a" + assert c[-1] == "a" # boolean c = self.factor.copy() - indexer = np.zeros(len(c), dtype='bool') + indexer = np.zeros(len(c), dtype="bool") indexer[0] = True indexer[-1] = True - c[indexer] = 'c' - expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], - ordered=True) + c[indexer] = "c" + expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(c, expected) - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a']), - pd.Categorical(['b', 'a'], categories=['b', 'a']), - ]) + @pytest.mark.parametrize( + "other", + [pd.Categorical(["b", "a"]), pd.Categorical(["b", "a"], categories=["b", "a"])], + ) def test_setitem_same_but_unordered(self, other): # GH-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + target = pd.Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) target[mask] = other[mask] - expected = pd.Categorical(['b', 'b'], categories=['a', 'b']) + expected = pd.Categorical(["b", "b"], categories=["a", "b"]) tm.assert_categorical_equal(target, expected) - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']), - pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']), - pd.Categorical(['a', 'a'], categories=['a']), - pd.Categorical(['b', 'b'], categories=['b']), - ]) + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"], categories=["b", "a", "c"]), + 
pd.Categorical(["b", "a"], categories=["a", "b", "c"]), + pd.Categorical(["a", "a"], categories=["a"]), + pd.Categorical(["b", "b"], categories=["b"]), + ], + ) def test_setitem_different_unordered_raises(self, other): # GH-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + target = pd.Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) with pytest.raises(ValueError): target[mask] = other[mask] - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a']), - pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True), - pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True), - ]) + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"]), + pd.Categorical(["b", "a"], categories=["b", "a"], ordered=True), + pd.Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True), + ], + ) def test_setitem_same_ordered_rasies(self, other): # Gh-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b'], - ordered=True) + target = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True) mask = np.array([True, False]) with pytest.raises(ValueError): @@ -83,7 +84,6 @@ def test_setitem_same_ordered_rasies(self, other): class TestCategoricalIndexing: - def test_getitem_listlike(self): # GH 9469 @@ -95,31 +95,53 @@ def test_getitem_listlike(self): tm.assert_numpy_array_equal(result, expected) def test_periodindex(self): - idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', - '2014-03', '2014-03'], freq='M') + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) cat1 = Categorical(idx1) str(cat1) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) - exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") tm.assert_numpy_array_equal(cat1._codes, exp_arr) tm.assert_index_equal(cat1.categories, exp_idx) - idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', - '2014-03', '2014-01'], freq='M') + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) cat2 = Categorical(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) - exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") tm.assert_numpy_array_equal(cat2._codes, exp_arr) tm.assert_index_equal(cat2.categories, exp_idx2) - idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', - '2013-08', '2013-07', '2013-05'], freq='M') + idx3 = PeriodIndex( + [ + "2013-12", + "2013-11", + "2013-10", + "2013-09", + "2013-08", + "2013-07", + "2013-05", + ], + freq="M", + ) cat3 = Categorical(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) - exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', - '2013-10', '2013-11', '2013-12'], freq='M') + exp_idx = PeriodIndex( + [ + "2013-05", + "2013-07", + "2013-08", + "2013-09", + "2013-10", + "2013-11", + "2013-12", + ], + freq="M", + ) tm.assert_numpy_array_equal(cat3._codes, exp_arr) tm.assert_index_equal(cat3.categories, exp_idx) @@ -139,8 +161,9 @@ def test_categories_assigments(self): s.categories = [1, 2] # Combinations of sorted/unique: - @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], - [1, 3, 3, 4], [1, 2, 2, 4]]) + @pytest.mark.parametrize( + "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]] + ) # Combinations of missing/unique 
@pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) @@ -148,7 +171,7 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): # GH 21448 key = key_class(key_values, categories=range(1, 5)) # Test for flat index and CategoricalIndex with same/different cats: - for dtype in None, 'category', key.dtype: + for dtype in None, "category", key.dtype: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) @@ -157,57 +180,51 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(exp_miss, res_miss) def test_where_unobserved_nan(self): - ser = pd.Series(pd.Categorical(['a', 'b'])) + ser = pd.Series(pd.Categorical(["a", "b"])) result = ser.where([True, False]) - expected = pd.Series(pd.Categorical(['a', None], - categories=['a', 'b'])) + expected = pd.Series(pd.Categorical(["a", None], categories=["a", "b"])) tm.assert_series_equal(result, expected) # all NA - ser = pd.Series(pd.Categorical(['a', 'b'])) + ser = pd.Series(pd.Categorical(["a", "b"])) result = ser.where([False, False]) - expected = pd.Series(pd.Categorical([None, None], - categories=['a', 'b'])) + expected = pd.Series(pd.Categorical([None, None], categories=["a", "b"])) tm.assert_series_equal(result, expected) def test_where_unobserved_categories(self): - ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) - ) - result = ser.where([True, True, False], other='b') + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + result = ser.where([True, True, False], other="b") expected = pd.Series( - Categorical(['a', 'b', 'b'], categories=ser.cat.categories) + Categorical(["a", "b", "b"], categories=ser.cat.categories) ) tm.assert_series_equal(result, expected) def test_where_other_categorical(self): - ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) - ) - other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd']) + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"]) result = ser.where([True, False, True], other) - expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype)) + expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) tm.assert_series_equal(result, expected) def test_where_warns(self): - ser = pd.Series(Categorical(['a', 'b', 'c'])) + ser = pd.Series(Categorical(["a", "b", "c"])) with tm.assert_produces_warning(FutureWarning): - result = ser.where([True, False, True], 'd') + result = ser.where([True, False, True], "d") - expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object')) + expected = pd.Series(np.array(["a", "d", "c"], dtype="object")) tm.assert_series_equal(result, expected) def test_where_ordered_differs_rasies(self): ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], - ordered=True) + Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True) + ) + other = Categorical( + ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True ) - other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'], - ordered=True) with tm.assert_produces_warning(FutureWarning): result = ser.where([True, False, True], other) - expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object)) + expected = 
pd.Series(np.array(["a", "c", "c"], dtype=object)) tm.assert_series_equal(result, expected) @@ -220,7 +237,7 @@ def test_mask_with_boolean(index): assert com.is_bool_indexer(idx) result = s[idx] - expected = s[idx.astype('object')] + expected = s[idx.astype("object")] tm.assert_series_equal(result, expected) @@ -231,7 +248,7 @@ def test_mask_with_boolean_raises(index): if index: idx = CategoricalIndex(idx) - with pytest.raises(ValueError, match='NA / NaN'): + with pytest.raises(ValueError, match="NA / NaN"): s[idx] @@ -256,7 +273,7 @@ def array(self, dtype=None): def test_series_at(non_coercible_categorical): - arr = Categorical(['a', 'b', 'c']) + arr = Categorical(["a", "b", "c"]) ser = Series(arr) result = ser.at[0] - assert result == 'a' + assert result == "a" diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index eaf6606b5d63f9..1b62479530d24c 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -10,7 +10,6 @@ class TestCategoricalMissing: - def test_na_flags_int_categories(self): # #1457 @@ -28,25 +27,21 @@ def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a", "b", np.nan, "a"]) tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) # Adding nan to categories should make assigned nan point to the # category! c = Categorical(["a", "b", np.nan, "a"]) tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) def test_set_dtype_nans(self): - c = Categorical(['a', 'b', np.nan]) - result = c._set_dtype(CategoricalDtype(['a', 'c'])) - tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], - dtype='int8')) + c = Categorical(["a", "b", np.nan]) + result = c._set_dtype(CategoricalDtype(["a", "c"])) + tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8")) def test_set_item_nan(self): cat = Categorical([1, 2, 3]) @@ -55,14 +50,17 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) - @pytest.mark.parametrize('fillna_kwargs, msg', [ - (dict(value=1, method='ffill'), - "Cannot specify both 'value' and 'method'."), - (dict(), - "Must specify a fill 'value' or 'method'."), - (dict(method='bad'), - "Invalid fill method. Expecting .* bad"), - ]) + @pytest.mark.parametrize( + "fillna_kwargs, msg", + [ + ( + dict(value=1, method="ffill"), + "Cannot specify both 'value' and 'method'.", + ), + (dict(), "Must specify a fill 'value' or 'method'."), + (dict(method="bad"), "Invalid fill method. 
Expecting .* bad"), + ], + ) def test_fillna_raises(self, fillna_kwargs, msg): # https://github.com/pandas-dev/pandas/issues/19682 cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index af1d3ca0f9ad47..cd8ec7fcb787d8 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -11,35 +11,33 @@ class TestCategoricalOpsWithFactor(TestCategorical): - def test_categories_none_comparisons(self): - factor = Categorical(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], ordered=True) + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] - expected = self.factor[np.asarray(self.factor) == 'a'] + result = self.factor[self.factor == "a"] + expected = self.factor[np.asarray(self.factor) == "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor != 'a'] - expected = self.factor[np.asarray(self.factor) != 'a'] + result = self.factor[self.factor != "a"] + expected = self.factor[np.asarray(self.factor) != "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor < 'c'] - expected = self.factor[np.asarray(self.factor) < 'c'] + result = self.factor[self.factor < "c"] + expected = self.factor[np.asarray(self.factor) < "c"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor > 'a'] - expected = self.factor[np.asarray(self.factor) > 'a'] + result = self.factor[self.factor > "a"] + expected = self.factor[np.asarray(self.factor) > "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor >= 'b'] - expected = self.factor[np.asarray(self.factor) >= 'b'] + result = self.factor[self.factor >= "b"] + expected = self.factor[np.asarray(self.factor) >= "b"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor <= 'b'] - expected = self.factor[np.asarray(self.factor) <= 'b'] + result = self.factor[self.factor <= "b"] + expected = self.factor[np.asarray(self.factor) <= "b"] tm.assert_categorical_equal(result, expected) n = len(self.factor) @@ -49,18 +47,17 @@ def test_comparisons(self): expected = np.asarray(self.factor) == np.asarray(other) tm.assert_numpy_array_equal(result, expected) - result = self.factor == 'd' + result = self.factor == "d" expected = np.repeat(False, len(self.factor)) tm.assert_numpy_array_equal(result, expected) # comparisons with categoricals - cat_rev = Categorical( - ["a", "b", "c"], categories=["c", "b", "a"], ordered=True) + cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True) cat_rev_base = Categorical( - ["b", "b", "b"], categories=["c", "b", "a"], ordered=True) + ["b", "b", "b"], categories=["c", "b", "a"], ordered=True + ) cat = Categorical(["a", "b", "c"], ordered=True) - cat_base = Categorical( - ["b", "b", "b"], categories=cat.categories, ordered=True) + cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True) # comparisons need to take categories ordering into account res_rev = cat_rev > cat_rev_base @@ -79,8 +76,7 @@ def test_comparisons(self): with pytest.raises(TypeError): cat > cat_rev - cat_rev_base2 = Categorical( - ["b", "b", "b"], categories=["c", "b", "a", "d"]) + cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"]) with pytest.raises(TypeError): cat_rev > cat_rev_base2 @@ 
-94,8 +90,10 @@ def test_comparisons(self): # comparison (in both directions) with Series will raise s = Series(["b", "b", "b"]) - msg = ("Cannot compare a Categorical for op __gt__ with type" - r" ") + msg = ( + "Cannot compare a Categorical for op __gt__ with type" + r" " + ) with pytest.raises(TypeError, match=msg): cat > s with pytest.raises(TypeError, match=msg): @@ -115,8 +113,7 @@ def test_comparisons(self): # Make sure that unequal comparison take the categories order in # account - cat_rev = Categorical( - list("abc"), categories=list("cba"), ordered=True) + cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True) exp = np.array([True, False, False]) res = cat_rev > "b" tm.assert_numpy_array_equal(res, exp) @@ -127,7 +124,6 @@ def test_comparisons(self): class TestCategoricalOps: - def test_compare_frame(self): # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame data = ["a", "b", 2, "a"] @@ -135,8 +131,14 @@ def test_compare_frame(self): df = DataFrame(cat) - for op in [operator.eq, operator.ne, operator.ge, - operator.gt, operator.le, operator.lt]: + for op in [ + operator.eq, + operator.ne, + operator.ge, + operator.gt, + operator.le, + operator.lt, + ]: with pytest.raises(ValueError): # alignment raises unless we transpose op(cat, df) @@ -150,19 +152,15 @@ def test_compare_frame(self): tm.assert_frame_equal(result, expected) def test_datetime_categorical_comparison(self): - dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True) - tm.assert_numpy_array_equal(dt_cat > dt_cat[0], - np.array([False, True, True])) - tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, - np.array([False, True, True])) + dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True) + tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True])) def test_reflected_comparison_with_scalars(self): # GH8658 cat = Categorical([1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat > cat[0], - np.array([False, True, True])) - tm.assert_numpy_array_equal(cat[0] < cat, - np.array([False, True, True])) + tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True])) def test_comparison_with_unknown_scalars(self): # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 @@ -170,24 +168,25 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - msg = ("Cannot compare a Categorical for op __{}__ with a scalar," - " which is not a category") - with pytest.raises(TypeError, match=msg.format('lt')): + msg = ( + "Cannot compare a Categorical for op __{}__ with a scalar," + " which is not a category" + ) + with pytest.raises(TypeError, match=msg.format("lt")): cat < 4 - with pytest.raises(TypeError, match=msg.format('gt')): + with pytest.raises(TypeError, match=msg.format("gt")): cat > 4 - with pytest.raises(TypeError, match=msg.format('gt')): + with pytest.raises(TypeError, match=msg.format("gt")): 4 < cat - with pytest.raises(TypeError, match=msg.format('lt')): + with pytest.raises(TypeError, match=msg.format("lt")): 4 > cat - tm.assert_numpy_array_equal(cat == 4, - np.array([False, False, False])) - tm.assert_numpy_array_equal(cat != 4, - np.array([True, True, True])) + tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) + tm.assert_numpy_array_equal(cat != 
4, np.array([True, True, True])) def test_comparison_of_ordered_categorical_with_nan_to_scalar( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne + ): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -197,13 +196,13 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( scalar = 2 with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - expected = getattr(np.array(cat), - compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) def test_comparison_of_ordered_categorical_with_nan_to_listlike( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne + ): # https://github.com/pandas-dev/pandas/issues/26504 # and following comparisons of missing values in ordered Categorical # with listlike should be evaluated as False @@ -216,18 +215,17 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( actual = getattr(cat, compare_operators_no_eq_ne)(other) tm.assert_numpy_array_equal(actual, expected) - @pytest.mark.parametrize('data,reverse,base', [ - (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])] + @pytest.mark.parametrize( + "data,reverse,base", + [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])], ) def test_comparisons(self, data, reverse, base): - cat_rev = Series( - Categorical(data, categories=reverse, ordered=True)) - cat_rev_base = Series( - Categorical(base, categories=reverse, ordered=True)) + cat_rev = Series(Categorical(data, categories=reverse, ordered=True)) + cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True)) cat = Series(Categorical(data, ordered=True)) cat_base = Series( - Categorical(base, categories=cat.cat.categories, ordered=True)) + Categorical(base, categories=cat.cat.categories, ordered=True) + ) s = Series(base) a = np.array(base) @@ -262,8 +260,10 @@ def test_comparisons(self, data, reverse, base): # categorical cannot be compared to Series or numpy array, and also # not the other way around - msg = ("Cannot compare a Categorical for op __gt__ with type" - r" ") + msg = ( + "Cannot compare a Categorical for op __gt__ with type" + r" " + ) with pytest.raises(TypeError, match=msg): cat > s with pytest.raises(TypeError, match=msg): @@ -283,40 +283,42 @@ def test_comparisons(self, data, reverse, base): with pytest.raises(TypeError, match=msg): a < cat_rev - @pytest.mark.parametrize('ctor', [ - lambda *args, **kwargs: Categorical(*args, **kwargs), - lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), - ]) + @pytest.mark.parametrize( + "ctor", + [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ], + ) def test_unordered_different_order_equal(self, ctor): # https://github.com/pandas-dev/pandas/issues/16014 - c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) assert (c1 == c2).all() - c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "b"], categories=["a", "b"], 
ordered=False) + c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False) assert (c1 != c2).all() - c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) - c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False) assert (c1 != c2).all() - c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) - c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) result = c1 == c2 tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) def test_unordered_different_categories_raises(self): - c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False) + c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False) - with pytest.raises(TypeError, match=("Categoricals can " - "only be compared")): + with pytest.raises(TypeError, match=("Categoricals can " "only be compared")): c1 == c2 def test_compare_different_lengths(self): - c1 = Categorical([], categories=['a', 'b']) - c2 = Categorical([], categories=['a']) + c1 = Categorical([], categories=["a", "b"]) + c2 = Categorical([], categories=["a"]) msg = "Categories are different lengths" with pytest.raises(TypeError, match=msg): @@ -325,33 +327,36 @@ def test_compare_different_lengths(self): def test_compare_unordered_different_order(self): # https://github.com/pandas-dev/pandas/issues/16603#issuecomment- # 349290078 - a = pd.Categorical(['a'], categories=['a', 'b']) - b = pd.Categorical(['b'], categories=['b', 'a']) + a = pd.Categorical(["a"], categories=["a", "b"]) + b = pd.Categorical(["b"], categories=["b", "a"]) assert not a.equals(b) def test_numeric_like_ops(self): - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) # numeric ops should not succeed - for op, str_rep in [('__add__', r'\+'), - ('__sub__', '-'), - ('__mul__', r'\*'), - ('__truediv__', '/')]: + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: msg = r"Series cannot perform the operation {}".format(str_rep) with pytest.raises(TypeError, match=msg): getattr(df, op)(df) # reduction ops should not succeed (unless specifically defined, e.g. 
# min/max) - s = df['value_group'] - for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']: + s = df["value_group"] + for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: msg = "Categorical cannot perform the operation {}".format(op) with pytest.raises(TypeError, match=msg): getattr(s, op)(numeric_only=False) @@ -364,10 +369,12 @@ def test_numeric_like_ops(self): np.sum(s) # numeric ops on a Series - for op, str_rep in [('__add__', r'\+'), - ('__sub__', '-'), - ('__mul__', r'\*'), - ('__truediv__', '/')]: + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: msg = r"Series cannot perform the operation {}".format(str_rep) with pytest.raises(TypeError, match=msg): getattr(s, op)(2) @@ -378,10 +385,10 @@ def test_numeric_like_ops(self): def test_contains(self): # GH21508 - c = pd.Categorical(list('aabbca'), categories=list('cab')) + c = pd.Categorical(list("aabbca"), categories=list("cab")) - assert 'b' in c - assert 'z' not in c + assert "b" in c + assert "z" not in c assert np.nan not in c with pytest.raises(TypeError): assert [1] in c @@ -390,16 +397,21 @@ def test_contains(self): assert 0 not in c assert 1 not in c - c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) + c = pd.Categorical(list("aabbca") + [np.nan], categories=list("cab")) assert np.nan in c - @pytest.mark.parametrize('item, expected', [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ('a', False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False)], ids=str) + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) def test_contains_interval(self, item, expected): # GH 23705 cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d4ae50dcdaa5d4..9321813b42b33e 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,28 +1,33 @@ import numpy as np from pandas import ( - Categorical, CategoricalIndex, Series, date_range, option_context, - period_range, timedelta_range) + Categorical, + CategoricalIndex, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) from pandas.tests.arrays.categorical.common import TestCategorical class TestCategoricalReprWithFactor(TestCategorical): - def test_print(self): - expected = ["[a, b, b, a, a, c, c, c]", - "Categories (3, object): [a < b < c]"] + expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"] expected = "\n".join(expected) actual = repr(self.factor) assert actual == expected class TestCategoricalRepr: - def test_big_print(self): - factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'], - fastpath=True) - expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600", - "Categories (3, object): [a, b, c]"] + factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True) + expected = [ + "[a, b, c, a, b, ..., b, c, a, b, c]", + "Length: 600", + "Categories (3, object): [a, b, c]", + ] expected = "\n".join(expected) actual = repr(factor) @@ -31,31 +36,33 @@ def test_big_print(self): def test_empty_print(self): factor = Categorical([], ["a", "b", "c"]) - expected = ("[], Categories (3, object): [a, b, c]") + expected = "[], Categories (3, 
object): [a, b, c]" actual = repr(factor) assert actual == expected assert expected == actual factor = Categorical([], ["a", "b", "c"], ordered=True) - expected = ("[], Categories (3, object): [a < b < c]") + expected = "[], Categories (3, object): [a < b < c]" actual = repr(factor) assert expected == actual factor = Categorical([], []) - expected = ("[], Categories (0, object): []") + expected = "[], Categories (0, object): []" assert expected == repr(factor) def test_print_none_width(self): # GH10087 a = Series(Categorical([1, 2, 3, 4])) - exp = ("0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) with option_context("display.width", None): assert exp == repr(a) def test_unicode_print(self): - c = Categorical(['aaaaa', 'bb', 'cccc'] * 20) + c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ [aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc] Length: 60 @@ -63,7 +70,7 @@ def test_unicode_print(self): assert repr(c) == expected - c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20) + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """\ [ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] Length: 60 @@ -73,9 +80,9 @@ def test_unicode_print(self): # unicode option should not affect to Categorical, as it doesn't care # the repr width - with option_context('display.unicode.east_asian_width', True): + with option_context("display.unicode.east_asian_width", True): - c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20) + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] Length: 60 Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa @@ -137,7 +144,7 @@ def test_categorical_repr_ordered(self): assert repr(c) == exp def test_categorical_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) # TODO(wesm): exceeding 80 characters in the console is not good @@ -148,7 +155,9 @@ def test_categorical_repr_datetime(self): "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" " 2011-01-01 12:00:00, " - "2011-01-01 13:00:00]""") + "2011-01-01 13:00:00]" + "" + ) assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx) @@ -160,12 +169,12 @@ def test_categorical_repr_datetime(self): "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" " 2011-01-01 12:00:00, " - "2011-01-01 13:00:00]") + "2011-01-01 13:00:00]" + ) assert repr(c) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") c = Categorical(idx) exp = ( "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " @@ -176,7 +185,8 @@ def test_categorical_repr_datetime(self): " " "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" " " - "2011-01-01 13:00:00-05:00]") + "2011-01-01 13:00:00-05:00]" + ) assert repr(c) == exp @@ -192,12 +202,13 @@ def test_categorical_repr_datetime(self): " " "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" " " - "2011-01-01 13:00:00-05:00]") + "2011-01-01 13:00:00-05:00]" + ) assert repr(c) == exp def test_categorical_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = 
date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < @@ -212,8 +223,7 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < @@ -242,7 +252,7 @@ def test_categorical_repr_int_with_nan(self): assert repr(s) == s_exp def test_categorical_repr_period(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, @@ -257,7 +267,7 @@ def test_categorical_repr_period(self): assert repr(c) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) c = Categorical(idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" @@ -271,7 +281,7 @@ def test_categorical_repr_period(self): assert repr(c) == exp def test_categorical_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < @@ -286,7 +296,7 @@ def test_categorical_repr_period_ordered(self): assert repr(c) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" @@ -300,7 +310,7 @@ def test_categorical_repr_period_ordered(self): assert repr(c) == exp def test_categorical_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) c = Categorical(idx) exp = """[1 days, 2 days, 3 days, 4 days, 5 days] Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" @@ -313,7 +323,7 @@ def test_categorical_repr_timedelta(self): assert repr(c) == exp - idx = timedelta_range('1 hours', periods=20) + idx = timedelta_range("1 hours", periods=20) c = Categorical(idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] Length: 20 @@ -333,7 +343,7 @@ def test_categorical_repr_timedelta(self): assert repr(c) == exp def test_categorical_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) c = Categorical(idx, ordered=True) exp = """[1 days, 
2 days, 3 days, 4 days, 5 days] Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa @@ -346,7 +356,7 @@ def test_categorical_repr_timedelta_ordered(self): assert repr(c) == exp - idx = timedelta_range('1 hours', periods=20) + idx = timedelta_range("1 hours", periods=20) c = Categorical(idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] Length: 20 @@ -384,7 +394,7 @@ def test_categorical_index_repr_ordered(self): assert repr(i) == exp def test_categorical_index_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -393,8 +403,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -404,7 +413,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp def test_categorical_index_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -413,8 +422,7 @@ def test_categorical_index_repr_datetime_ordered(self): assert repr(i) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -435,22 +443,22 @@ def test_categorical_index_repr_datetime_ordered(self): def test_categorical_index_repr_period(self): # test all length - idx = period_range('2011-01-01 09:00', freq='H', periods=1) + idx = period_range("2011-01-01 09:00", freq="H", periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=2) + idx = period_range("2011-01-01 09:00", freq="H", periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=3) + idx = period_range("2011-01-01 09:00", freq="H", periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", 
freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -467,13 +475,13 @@ def test_categorical_index_repr_period(self): assert repr(i) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -481,18 +489,18 @@ def test_categorical_index_repr_period_ordered(self): assert repr(i) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa assert repr(i) == exp def test_categorical_index_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', @@ -503,12 +511,12 @@ def test_categorical_index_repr_timedelta(self): assert repr(i) == exp def test_categorical_index_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa assert repr(i) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index 26d8da3bf33f13..a0b09e19ece6ef 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -6,27 +6,28 @@ class TestCategoricalSort: - def test_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) expected = np.array([2, 4, 1, 3, 0]) - tm.assert_numpy_array_equal(c.argsort(ascending=True), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + c.argsort(ascending=True), expected, 
check_dtype=False + ) expected = expected[::-1] - tm.assert_numpy_array_equal(c.argsort(ascending=False), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + c.argsort(ascending=False), expected, check_dtype=False + ) def test_numpy_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) expected = np.array([2, 4, 1, 3, 0]) - tm.assert_numpy_array_equal(np.argsort(c), expected, - check_dtype=False) + tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False) - tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + np.argsort(c, kind="mergesort"), expected, check_dtype=False + ) msg = "the 'axis' parameter is not supported" with pytest.raises(ValueError, match=msg): @@ -34,7 +35,7 @@ def test_numpy_argsort(self): msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(c, order='C') + np.argsort(c, order="C") def test_sort_values(self): @@ -50,8 +51,9 @@ def test_sort_values(self): tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, cat.categories) - cat = Categorical(["a", "c", "b", "d"], - categories=["a", "b", "c", "d"], ordered=True) + cat = Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=object) tm.assert_numpy_array_equal(res.__array__(), exp) @@ -88,34 +90,34 @@ def test_sort_values_na_position(self): tm.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) - res = cat.sort_values(ascending=True, na_position='first') + res = cat.sort_values(ascending=True, na_position="first") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) - res = cat.sort_values(ascending=False, na_position='first') + res = cat.sort_values(ascending=False, na_position="first") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) - res = cat.sort_values(ascending=True, na_position='last') + res = cat.sort_values(ascending=True, na_position="last") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) - res = cat.sort_values(ascending=False, na_position='last') + res = cat.sort_values(ascending=False, na_position="last") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') + res = cat.sort_values(ascending=False, na_position="last") exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) exp_categories = Index(["a", "b", "c", "d"]) tm.assert_numpy_array_equal(res.__array__(), exp_val) tm.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') + res = cat.sort_values(ascending=False, na_position="first") exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) exp_categories = Index(["a", "b", "c", "d"]) tm.assert_numpy_array_equal(res.__array__(), exp_val) diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index 
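For context, a minimal illustrative sketch (not part of the patch) of the behaviour test_sort_values_na_position exercises in the hunks above: na_position controls where missing values land, while the ordered categories themselves are unchanged.

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(["a", "c", "b", "d", np.nan], ordered=True)

    # Missing values move to the requested end; categories stay a < b < c < d.
    res = cat.sort_values(ascending=False, na_position="first")
    # np.asarray(res) -> array([nan, 'd', 'c', 'b', 'a'], dtype=object)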
a8f0a348b3eccf..cfc7b8541302f1 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -3,21 +3,20 @@ class TestCategoricalSubclassing: - def test_constructor(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) + sc = tm.SubclassedCategorical(["a", "b", "c"]) assert isinstance(sc, tm.SubclassedCategorical) - tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"])) def test_from_codes(self): - sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) assert isinstance(sc, tm.SubclassedCategorical) - exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"]) tm.assert_categorical_equal(sc, exp) def test_map(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) + sc = tm.SubclassedCategorical(["a", "b", "c"]) res = sc.map(lambda x: x.upper()) assert isinstance(res, tm.SubclassedCategorical) - exp = Categorical(['A', 'B', 'C']) + exp = Categorical(["A", "B", "C"]) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 79634f581f366c..53733770ed9547 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -7,23 +7,23 @@ class TestCategoricalWarnings: def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = Categorical([])" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('c.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("c.", 1)) def test_CategoricalAccessor_categorical_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.categorical + pd.Series(["a", "b"], dtype="category").cat.categorical def test_CategoricalAccessor_name_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.name + pd.Series(["a", "b"], dtype="category").cat.name def test_CategoricalAccessor_index_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.index + pd.Series(["a", "b"], dtype="category").cat.index diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 4a7962d88a44e4..82409df5b46f7f 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -3,21 +3,31 @@ import pandas as pd from pandas import ( - Index, Interval, IntervalIndex, Timedelta, Timestamp, date_range, - timedelta_range) + Index, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + timedelta_range, +) from pandas.core.arrays import IntervalArray import pandas.util.testing as tm -@pytest.fixture(params=[ - (Index([0, 2, 4]), Index([1, 3, 5])), - (Index([0., 1., 2.]), Index([1., 2., 3.])), - (timedelta_range('0 days', periods=3), - timedelta_range('1 day', periods=3)), - (date_range('20170101', periods=3), date_range('20170102', periods=3)), - (date_range('20170101', periods=3, 
tz='US/Eastern'), - date_range('20170102', periods=3, tz='US/Eastern'))], - ids=lambda x: str(x[0].dtype)) +@pytest.fixture( + params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])), + (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)), + (date_range("20170101", periods=3), date_range("20170102", periods=3)), + ( + date_range("20170101", periods=3, tz="US/Eastern"), + date_range("20170102", periods=3, tz="US/Eastern"), + ), + ], + ids=lambda x: str(x[0].dtype), +) def left_right_dtypes(request): """ Fixture for building an IntervalArray from various dtypes @@ -26,26 +36,30 @@ def left_right_dtypes(request): class TestAttributes: - @pytest.mark.parametrize('left, right', [ - (0, 1), - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timestamp('2018-01-02')), - pytest.param(Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-02', tz='US/Eastern'), - marks=pytest.mark.xfail(strict=True, reason='GH 27011'))]) - @pytest.mark.parametrize('constructor', [IntervalArray, IntervalIndex]) + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + pytest.param( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + marks=pytest.mark.xfail(strict=True, reason="GH 27011"), + ), + ], + ) + @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex]) def test_is_empty(self, constructor, left, right, closed): # GH27219 tuples = [(left, left), (left, right), np.nan] - expected = np.array([closed != 'both', False, False]) + expected = np.array([closed != "both", False, False]) result = constructor.from_tuples(tuples, closed=closed).is_empty tm.assert_numpy_array_equal(result, expected) class TestMethods: - - @pytest.mark.parametrize('new_closed', [ - 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) def test_set_closed(self, closed, new_closed): # GH 21670 array = IntervalArray.from_breaks(range(10), closed=closed) @@ -53,20 +67,21 @@ def test_set_closed(self, closed, new_closed): expected = IntervalArray.from_breaks(range(10), closed=new_closed) tm.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('other', [ - Interval(0, 1, closed='right'), - IntervalArray.from_breaks([1, 2, 3, 4], closed='right'), - ]) + @pytest.mark.parametrize( + "other", + [ + Interval(0, 1, closed="right"), + IntervalArray.from_breaks([1, 2, 3, 4], closed="right"), + ], + ) def test_where_raises(self, other): - ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], - closed='left')) + ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], closed="left")) match = "'value.closed' is 'right', expected 'left'." 
with pytest.raises(ValueError, match=match): ser.where([True, False, True], other=other) class TestSetitem: - def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index 7f53c40d7bf4e8..43601ea301568f 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -15,10 +15,14 @@ def constructor(request): return request.param -@pytest.fixture(params=[ - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timedelta('1 day')), - (0, 1)], ids=lambda x: type(x[0]).__name__) +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) def start_shift(request): """ Fixture for generating intervals of different types from a start value @@ -28,28 +32,27 @@ def start_shift(request): class TestOverlaps: - - def test_overlaps_interval( - self, constructor, start_shift, closed, other_closed): + def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): start, shift = start_shift interval = Interval(start, start + 3 * shift, other_closed) # intervals: identical, nested, spanning, partial, adjacent, disjoint - tuples = [(start, start + 3 * shift), - (start + shift, start + 2 * shift), - (start - shift, start + 4 * shift), - (start + 2 * shift, start + 4 * shift), - (start + 3 * shift, start + 4 * shift), - (start + 4 * shift, start + 5 * shift)] + tuples = [ + (start, start + 3 * shift), + (start + shift, start + 2 * shift), + (start - shift, start + 4 * shift), + (start + 2 * shift, start + 4 * shift), + (start + 3 * shift, start + 4 * shift), + (start + 4 * shift, start + 5 * shift), + ] interval_container = constructor.from_tuples(tuples, closed) - adjacent = (interval.closed_right and interval_container.closed_left) + adjacent = interval.closed_right and interval_container.closed_left expected = np.array([True, True, True, True, adjacent, False]) result = interval_container.overlaps(interval) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other_constructor', [ - IntervalArray, IntervalIndex]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) def test_overlaps_interval_container(self, constructor, other_constructor): # TODO: modify this test when implemented interval_container = constructor.from_breaks(range(5)) @@ -62,21 +65,26 @@ def test_overlaps_na(self, constructor, start_shift): start, shift = start_shift interval = Interval(start, start + shift) - tuples = [(start, start + shift), - np.nan, - (start + 2 * shift, start + 3 * shift)] + tuples = [ + (start, start + shift), + np.nan, + (start + 2 * shift, start + 3 * shift), + ] interval_container = constructor.from_tuples(tuples) expected = np.array([True, False, False]) result = interval_container.overlaps(interval) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other', [ - 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) def test_overlaps_invalid_type(self, constructor, other): interval_container = constructor.from_breaks(range(5)) - msg = '`other` must be Interval-like, got {other}'.format( - 
other=type(other).__name__) + msg = "`other` must be Interval-like, got {other}".format( + other=type(other).__name__ + ) with pytest.raises(TypeError, match=msg): interval_container.overlaps(other) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index d0a188a8aff3c7..eab174862818c3 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -12,25 +12,21 @@ class TestSeriesAccessor: # TODO: collect other Series accessor tests def test_to_dense(self): - s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') + s = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") result = s.sparse.to_dense() expected = pd.Series([0, 1, 0, 10]) tm.assert_series_equal(result, expected) class TestFrameAccessor: - def test_accessor_raises(self): df = pd.DataFrame({"A": [0, 1]}) - with pytest.raises(AttributeError, match='sparse'): + with pytest.raises(AttributeError, match="sparse"): df.sparse - @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) - @pytest.mark.parametrize("labels", [ - None, - list(string.ascii_letters[:10]), - ]) - @pytest.mark.parametrize('dtype', ['float64', 'int64']) + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) @td.skip_if_no_scipy def test_from_spmatrix(self, format, labels, dtype): import scipy.sparse @@ -38,85 +34,76 @@ def test_from_spmatrix(self, format, labels, dtype): sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix( - mat, index=labels, columns=labels - ) + result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) expected = pd.DataFrame( - np.eye(10, dtype=dtype), - index=labels, - columns=labels, + np.eye(10, dtype=dtype), index=labels, columns=labels ).astype(sp_dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("columns", [ - ['a', 'b'], - pd.MultiIndex.from_product([['A'], ['a', 'b']]), - ['a', 'a'], - ]) + @pytest.mark.parametrize( + "columns", + [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], + ) @td.skip_if_no_scipy def test_from_spmatrix_columns(self, columns): import scipy.sparse - dtype = pd.SparseDtype('float64', 0.0) + dtype = pd.SparseDtype("float64", 0.0) mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame( - mat.toarray(), columns=columns - ).astype(dtype) + expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy def test_to_coo(self): import scipy.sparse - df = pd.DataFrame({ - "A": [0, 1, 0], - "B": [1, 0, 0], - }, dtype='Sparse[int64, 0]') + df = pd.DataFrame({"A": [0, 1, 0], "B": [1, 0, 0]}, dtype="Sparse[int64, 0]") result = df.sparse.to_coo() expected = scipy.sparse.coo_matrix(np.asarray(df)) assert (result != expected).nnz == 0 def test_to_dense(self): - df = pd.DataFrame({ - "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)), - "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)), - "C": pd.SparseArray([1., 0.], - dtype=pd.SparseDtype('float64', 0.0)), - }, index=['b', 'a']) + df = pd.DataFrame( + { + "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)), + "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)), + "C": 
pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=["b", "a"], + ) result = df.sparse.to_dense() - expected = pd.DataFrame({ - 'A': [1, 0], - 'B': [1, 0], - 'C': [1.0, 0.0], - }, index=['b', 'a']) + expected = pd.DataFrame( + {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"] + ) tm.assert_frame_equal(result, expected) def test_density(self): - df = pd.DataFrame({ - 'A': pd.SparseArray([1, 0, 2, 1], fill_value=0), - 'B': pd.SparseArray([0, 1, 1, 1], fill_value=0), - }) + df = pd.DataFrame( + { + "A": pd.SparseArray([1, 0, 2, 1], fill_value=0), + "B": pd.SparseArray([0, 1, 1, 1], fill_value=0), + } + ) res = df.sparse.density expected = 0.75 assert res == expected - @pytest.mark.parametrize("dtype", ['int64', 'float64']) + @pytest.mark.parametrize("dtype", ["int64", "float64"]) @pytest.mark.parametrize("dense_index", [True, False]) @td.skip_if_no_scipy def test_series_from_coo(self, dtype, dense_index): import scipy.sparse - A = scipy.sparse.eye(3, format='coo', dtype=dtype) + A = scipy.sparse.eye(3, format="coo", dtype=dtype) result = pd.Series.sparse.from_coo(A, dense_index=dense_index) index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - expected = pd.Series(pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), - index=index) + expected = pd.Series( + pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index + ) if dense_index: - expected = expected.reindex( - pd.MultiIndex.from_product(index.levels) - ) + expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) tm.assert_series_equal(result, expected) @@ -124,8 +111,9 @@ def test_series_from_coo(self, dtype, dense_index): def test_series_from_coo_incorrect_format_raises(self): # gh-26554 import scipy.sparse + m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]])) - with pytest.raises(TypeError, - match='Expected coo_matrix. Got csr_matrix instead.' - ): + with pytest.raises( + TypeError, match="Expected coo_matrix. Got csr_matrix instead." + ): pd.Series.sparse.from_coo(m) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 31a8f13571d16a..7bfedff2177197 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -19,7 +19,7 @@ def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) def _check_numeric_ops(self, a, b, a_dense, b_dense): - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): # Unfortunately, trying to wrap the computation of each expected # value is with np.errstate() is too tedious. 
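For orientation, a minimal sketch (illustrative only, using the 0.25-era pd.SparseArray API) of the invariant the _check_numeric_ops helper verifies throughout these hunks: arithmetic on sparse inputs, once densified, must match the same arithmetic on their dense equivalents.

    import numpy as np
    import pandas as pd

    a = pd.SparseArray([0, 1, 2, 0], fill_value=0)
    b = pd.SparseArray([0, 1, 0, 3], fill_value=0)

    # The op is computed sparsely; densifying the result must agree with
    # plain NumPy arithmetic on the dense equivalents.
    result = (a + b).to_dense()
    expected = np.asarray(a) + np.asarray(b)
    np.testing.assert_array_equal(result, expected)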
@@ -38,8 +38,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense): self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and - a.dtype.subtype == np.dtype('int64')): + if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): self._assert((a // b).to_dense(), a_dense // b_dense) self._assert((b // a).to_dense(), b_dense // a_dense) @@ -64,8 +63,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense): self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and - a.dtype.subtype == np.dtype('int64')): + if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): self._assert((a // b_dense).to_dense(), a_dense // b_dense) self._assert((b_dense // a).to_dense(), b_dense // a_dense) @@ -82,7 +80,7 @@ def _check_bool_result(self, res): assert isinstance(res.fill_value, bool) def _check_comparison_ops(self, a, b, a_dense, b_dense): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): # Unfortunately, trying to wrap the computation of each expected # value is with np.errstate() is too tedious. # @@ -141,7 +139,7 @@ def _check_logical_ops(self, a, b, a_dense, b_dense): def test_float_scalar(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) self._check_numeric_ops(a, 1, values, 1) self._check_numeric_ops(a, 0, values, 0) @@ -160,7 +158,7 @@ def test_float_scalar(self): def test_float_scalar_comparison(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) self._check_comparison_ops(a, 1, values, 1) self._check_comparison_ops(a, 0, values, 0) @@ -178,7 +176,7 @@ def test_float_scalar_comparison(self): def test_float_same_index(self): # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) @@ -186,8 +184,8 @@ def test_float_same_index(self): b = self._klass(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues) - values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) a = self._klass(values, kind=kind, fill_value=0) b = self._klass(rvalues, kind=kind, fill_value=0) @@ -195,7 +193,7 @@ def test_float_same_index(self): def test_float_same_index_comparison(self): # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) @@ -203,8 +201,8 @@ def test_float_same_index_comparison(self): b = self._klass(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) - values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) a = self._klass(values, kind=kind, 
fill_value=0) b = self._klass(rvalues, kind=kind, fill_value=0) @@ -214,7 +212,7 @@ def test_float_array(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues) @@ -236,28 +234,28 @@ def test_float_array_different_kind(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - a = self._klass(values, kind='integer') - b = self._klass(rvalues, kind='block') + a = self._klass(values, kind="integer") + b = self._klass(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues) self._check_numeric_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind='integer', fill_value=0) - b = self._klass(rvalues, kind='block') + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues) - a = self._klass(values, kind='integer', fill_value=0) - b = self._klass(rvalues, kind='block', fill_value=0) + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block", fill_value=0) self._check_numeric_ops(a, b, values, rvalues) - a = self._klass(values, kind='integer', fill_value=1) - b = self._klass(rvalues, kind='block', fill_value=2) + a = self._klass(values, kind="integer", fill_value=1) + b = self._klass(rvalues, kind="block", fill_value=2) self._check_numeric_ops(a, b, values, rvalues) def test_float_array_comparison(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) @@ -282,7 +280,7 @@ def test_int_array(self): values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype) b = self._klass(rvalues, dtype=dtype, kind=kind) @@ -313,11 +311,11 @@ def test_int_array(self): def test_int_array_comparison(self): # int32 NI ATM - for dtype in ['int64']: + for dtype in ["int64"]: values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, dtype=dtype, kind=kind) b = self._klass(rvalues, dtype=dtype, kind=kind) self._check_comparison_ops(a, b, values, rvalues) @@ -338,40 +336,38 @@ def test_int_array_comparison(self): def test_bool_same_index(self): # GH 14000 # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([True, False, True, True], dtype=np.bool) rvalues = self._base([True, False, True, True], dtype=np.bool) for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, - fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, - fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass( 
+ rvalues, kind=kind, dtype=np.bool, fill_value=fill_value + ) self._check_logical_ops(a, b, values, rvalues) def test_bool_array_logical(self): # GH 14000 # when sp_index are the same - for kind in ['integer', 'block']: - values = self._base([True, False, True, False, True, True], - dtype=np.bool) - rvalues = self._base([True, False, False, True, False, True], - dtype=np.bool) + for kind in ["integer", "block"]: + values = self._base([True, False, True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, - fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, - fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass( + rvalues, kind=kind, dtype=np.bool, fill_value=fill_value + ) self._check_logical_ops(a, b, values, rvalues) def test_mixed_array_float_int(self): - for rdtype in ['int64']: + for rdtype in ["int64"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) @@ -397,11 +393,11 @@ def test_mixed_array_float_int(self): def test_mixed_array_comparison(self): # int32 NI ATM - for rdtype in ['int64']: + for rdtype in ["int64"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) @@ -438,33 +434,34 @@ def test_alignment(self): db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], - dtype=np.int64, fill_value=0) + sb = pd.SparseSeries( + np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=0 + ) self._check_numeric_ops(sa, sb, da, db) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], - dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries( + np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=np.nan + ) self._check_numeric_ops(sa, sb, da, db) da = pd.Series(np.arange(4)) db = pd.Series(np.arange(4), index=[10, 11, 12, 13]) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], - dtype=np.int64, fill_value=0) + sb = pd.SparseSeries( + np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=0 + ) self._check_numeric_ops(sa, sb, da, db) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], - dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries( + np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=np.nan + ) self._check_numeric_ops(sa, sb, da, db) -@pytest.mark.parametrize("op", [ - operator.eq, - operator.add, -]) +@pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): arr = pd.SparseArray([0, 1], fill_value=0) result = op(arr, [0, 1]) @@ -472,13 +469,10 @@ def test_with_list(op): tm.assert_sp_array_equal(result, expected) 
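Similarly, a short illustrative sketch (not part of the patch) of what test_ufuncs below asserts: a NumPy ufunc applied to a SparseArray acts on both the stored values and the fill_value, and returns another SparseArray.

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm  # 0.25-era testing helpers

    arr = pd.SparseArray([0, 0, -1, 1])

    # np.abs dispatches to the sparse values and to the fill_value separately.
    result = np.abs(arr)
    expected = pd.SparseArray(np.abs(np.asarray(arr)), fill_value=np.abs(arr.fill_value))
    tm.assert_sp_array_equal(result, expected)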
-@pytest.mark.parametrize('ufunc', [ - np.abs, np.exp, -]) -@pytest.mark.parametrize('arr', [ - pd.SparseArray([0, 0, -1, 1]), - pd.SparseArray([None, None, -1, 1]), -]) +@pytest.mark.parametrize("ufunc", [np.abs, np.exp]) +@pytest.mark.parametrize( + "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] +) def test_ufuncs(ufunc, arr): result = ufunc(arr) fill_value = ufunc(arr.fill_value) @@ -486,17 +480,17 @@ def test_ufuncs(ufunc, arr): tm.assert_sp_array_equal(result, expected) -@pytest.mark.parametrize("a, b", [ - (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), -]) -@pytest.mark.parametrize("ufunc", [ - np.add, - np.greater, -]) +@pytest.mark.parametrize( + "a, b", + [ + (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + ], +) +@pytest.mark.parametrize("ufunc", [np.add, np.greater]) def test_binary_ufuncs(ufunc, a, b): # can't say anything about fill value here. result = ufunc(a, b) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8a51704732d7f3..c76b4d96005269 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -21,10 +21,8 @@ def kind(request): class TestSparseArray: - def setup_method(self, method): - self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, - np.nan, 4, 5, np.nan, 6]) + self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) @@ -59,32 +57,31 @@ def test_constructor_dtype(self): assert arr.fill_value == 0 def test_constructor_dtype_str(self): - result = SparseArray([1, 2, 3], dtype='int') + result = SparseArray([1, 2, 3], dtype="int") expected = SparseArray([1, 2, 3], dtype=int) tm.assert_sp_array_equal(result, expected) def test_constructor_sparse_dtype(self): - result = SparseArray([1, 0, 0, 1], dtype=SparseDtype('int64', -1)) + result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype('int64') + assert result.sp_values.dtype == np.dtype("int64") def test_constructor_sparse_dtype_str(self): - result = SparseArray([1, 0, 0, 1], dtype='Sparse[int32]') + result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") expected = SparseArray([1, 0, 0, 1], dtype=np.int32) tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype('int32') + assert result.sp_values.dtype == np.dtype("int32") def test_constructor_object_dtype(self): # GH 11856 - arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) + arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object) assert arr.dtype == SparseDtype(np.object) assert np.isnan(arr.fill_value) - arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, - fill_value='A') - assert arr.dtype == SparseDtype(np.object, 'A') - assert arr.fill_value == 'A' + arr = SparseArray(["A", 
"A", np.nan, "B"], dtype=np.object, fill_value="A") + assert arr.dtype == SparseDtype(np.object, "A") + assert arr.fill_value == "A" # GH 17574 data = [False, 0, 100.0, 0.0] @@ -104,37 +101,42 @@ def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value - expected = SparseArray([0, 1, 2, 0], kind='integer') + expected = SparseArray([0, 1, 2, 0], kind="integer") tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=np.int64, fill_value=0) + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, + fill_value=0, + ) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), - fill_value=0, dtype=np.int64) + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 + ) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=None, fill_value=0) + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, + fill_value=0, + ) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - @pytest.mark.parametrize("sparse_index", [ - None, IntIndex(1, [0]), - ]) + @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) @@ -150,19 +152,23 @@ def test_constructor_spindex_dtype_scalar(self, sparse_index): assert arr.fill_value == 0 def test_constructor_spindex_dtype_scalar_broadcasts(self): - arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), - fill_value=0, dtype=None) + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None + ) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - @pytest.mark.parametrize('data, fill_value', [ - (np.array([1, 2]), 0), - (np.array([1.0, 2.0]), np.nan), - ([True, False], False), - ([pd.Timestamp('2017-01-01')], pd.NaT), - ]) + @pytest.mark.parametrize( + "data, fill_value", + [ + (np.array([1, 2]), 0), + (np.array([1.0, 2.0]), np.nan), + ([True, False], False), + ([pd.Timestamp("2017-01-01")], pd.NaT), + ], + ) def test_constructor_inferred_fill_value(self, data, fill_value): result = SparseArray(data).fill_value @@ -171,13 +177,11 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value - @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) - @pytest.mark.parametrize('size', [ - pytest.param(0, - marks=td.skip_if_np_lt("1.16", - reason='NumPy-11383')), - 10 - ]) + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize( + "size", + [pytest.param(0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10], + ) @td.skip_if_no_scipy def test_from_spmatrix(self, 
size, format): import scipy.sparse @@ -193,16 +197,20 @@ def test_from_spmatrix(self, size, format): def test_from_spmatrix_raises(self): import scipy.sparse - mat = scipy.sparse.eye(5, 4, format='csc') + mat = scipy.sparse.eye(5, 4, format="csc") with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) - @pytest.mark.parametrize('scalar,dtype', [ - (False, SparseDtype(bool, False)), - (0.0, SparseDtype('float64', 0)), - (1, SparseDtype('int64', 1)), - ('z', SparseDtype('object', 'z'))]) + @pytest.mark.parametrize( + "scalar,dtype", + [ + (False, SparseDtype(bool, False)), + (0.0, SparseDtype("float64", 0)), + (1, SparseDtype("int64", 1)), + ("z", SparseDtype("object", "z")), + ], + ) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH 19163 arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) @@ -217,13 +225,13 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip(self, kind, fill): # see gh-13999 - arr = SparseArray([np.nan, 1, np.nan, 2, 3], - kind=kind, fill_value=fill) + arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, fill_value=fill) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) - arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64, - kind=kind, fill_value=fill) + arr = SparseArray( + [0, 0, 0, 1, 1, 2], dtype=np.int64, kind=kind, fill_value=fill + ) res = SparseArray(SparseSeries(arr), dtype=np.int64) tm.assert_sp_array_equal(arr, res) @@ -234,8 +242,9 @@ def test_sparse_series_round_trip(self, kind, fill): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip2(self, kind, fill): # see gh-13999 - arr = SparseArray([True, False, True, True], dtype=np.bool, - kind=kind, fill_value=fill) + arr = SparseArray( + [True, False, True, True], dtype=np.bool, kind=kind, fill_value=fill + ) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) @@ -291,16 +300,14 @@ def test_take_negative(self): exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) - @pytest.mark.parametrize('fill_value', [0, None, np.nan]) + @pytest.mark.parametrize("fill_value", [0, None, np.nan]) def test_shift_fill_value(self, fill_value): # GH #24128 - sparse = SparseArray(np.array([1, 0, 0, 3, 0]), - fill_value=8.0) + sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0) res = sparse.shift(1, fill_value=fill_value) if isna(fill_value): fill_value = res.dtype.na_value - exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), - fill_value=8.0) + exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) tm.assert_sp_array_equal(res, exp) def test_bad_take(self): @@ -320,8 +327,7 @@ def test_take_filling(self): tm.assert_sp_array_equal(result, expected) # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), - allow_fill=False, fill_value=True) + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) @@ -355,12 +361,11 @@ def test_take_filling_fill_value(self): tm.assert_sp_array_equal(result, expected) # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), - allow_fill=False, fill_value=True) + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) - msg = ("Invalid 
value in 'indices'.") + msg = "Invalid value in 'indices'." with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): @@ -377,11 +382,11 @@ def test_take_filling_all_nan(self): sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) # XXX: did the default kind from take change? result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([np.nan, np.nan, np.nan], kind='block') + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) result = sparse.take(np.array([1, 0, -1]), fill_value=True) - expected = SparseArray([np.nan, np.nan, np.nan], kind='block') + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) with pytest.raises(IndexError): @@ -431,8 +436,7 @@ def test_constructor_bool(self): tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal(arr.sp_index.indices, - np.array([2, 3], np.int32)) + tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) dense = arr.to_dense() assert dense.dtype == bool @@ -453,16 +457,16 @@ def test_constructor_bool_fill_value(self): def test_constructor_float32(self): # GH 10648 - data = np.array([1., np.nan, 3], dtype=np.float32) + data = np.array([1.0, np.nan, 3], dtype=np.float32) arr = SparseArray(data, dtype=np.float32) assert arr.dtype == SparseDtype(np.float32) - tm.assert_numpy_array_equal(arr.sp_values, - np.array([1, 3], dtype=np.float32)) + tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) # Behavior change: np.asarray densifies. 
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal(arr.sp_index.indices, - np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal( + arr.sp_index.indices, np.array([0, 2], dtype=np.int32) + ) dense = arr.to_dense() assert dense.dtype == np.float32 @@ -472,39 +476,38 @@ def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") - expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) + expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("float64", fill_value=0) result = arr.astype(dtype) - expected = SparseArray._simple_new(np.array([0., 2.], - dtype=dtype.subtype), - IntIndex(4, [2, 3]), - dtype) + expected = SparseArray._simple_new( + np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype + ) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("int64", 0) result = arr.astype(dtype) - expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64), - IntIndex(4, [2, 3]), - dtype) + expected = SparseArray._simple_new( + np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype + ) tm.assert_sp_array_equal(result, expected) arr = SparseArray([0, np.nan, 0, 1], fill_value=0) - with pytest.raises(ValueError, match='NA'): - arr.astype('Sparse[i8]') + with pytest.raises(ValueError, match="NA"): + arr.astype("Sparse[i8]") def test_astype_bool(self): a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) - expected = SparseArray([True, 0, 0, True], - dtype=SparseDtype(bool, 0)) + expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) # update fill value result = a.astype(SparseDtype(bool, False)) - expected = SparseArray([True, False, False, True], - dtype=SparseDtype(bool, False)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) tm.assert_sp_array_equal(result, expected) def test_astype_all(self, any_real_dtype): @@ -515,39 +518,55 @@ def test_astype_all(self, any_real_dtype): assert res.dtype == SparseDtype(typ, 1) assert res.sp_values.dtype == typ - tm.assert_numpy_array_equal(np.asarray(res.to_dense()), - vals.astype(typ)) - - @pytest.mark.parametrize('array, dtype, expected', [ - (SparseArray([0, 1]), 'float', - SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))), - (SparseArray([0, 1]), bool, SparseArray([False, True])), - (SparseArray([0, 1], fill_value=1), bool, - SparseArray([False, True], dtype=SparseDtype(bool, True))), - pytest.param( - SparseArray([0, 1]), 'datetime64[ns]', - SparseArray(np.array([0, 1], dtype='datetime64[ns]'), - dtype=SparseDtype('datetime64[ns]', - pd.Timestamp('1970'))), - marks=[pytest.mark.xfail(reason="NumPy-7619")], - ), - (SparseArray([0, 1, 10]), str, - SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))), - (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])), - (SparseArray([0, 1, 0]), object, - SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))), - ]) + tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) + + @pytest.mark.parametrize( + "array, dtype, expected", + [ + ( + SparseArray([0, 1]), + "float", + SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), + ), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + ( + SparseArray([0, 1], fill_value=1), + bool, + SparseArray([False, True], dtype=SparseDtype(bool, True)), + ), + pytest.param( + 
SparseArray([0, 1]), + "datetime64[ns]", + SparseArray( + np.array([0, 1], dtype="datetime64[ns]"), + dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), + ), + marks=[pytest.mark.xfail(reason="NumPy-7619")], + ), + ( + SparseArray([0, 1, 10]), + str, + SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + ), + (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), + ( + SparseArray([0, 1, 0]), + object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), + ), + ], + ) def test_astype_more(self, array, dtype, expected): result = array.astype(dtype) tm.assert_sp_array_equal(result, expected) def test_astype_nan_raises(self): arr = SparseArray([1.0, np.nan]) - with pytest.raises(ValueError, match='Cannot convert non-finite'): + with pytest.raises(ValueError, match="Cannot convert non-finite"): arr.astype(int) def test_set_fill_value(self): - arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) + arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) arr.fill_value = 2 assert arr.fill_value == 2 @@ -599,22 +618,28 @@ def test_copy(self): def test_values_asarray(self): assert_almost_equal(self.arr.to_dense(), self.arr_data) - @pytest.mark.parametrize('data,shape,dtype', [ - ([0, 0, 0, 0, 0], (5,), None), - ([], (0,), None), - ([0], (1,), None), - (['A', 'A', np.nan, 'B'], (4,), np.object) - ]) + @pytest.mark.parametrize( + "data,shape,dtype", + [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (["A", "A", np.nan, "B"], (4,), np.object), + ], + ) def test_shape(self, data, shape, dtype): # GH 21126 out = SparseArray(data, dtype=dtype) assert out.shape == shape - @pytest.mark.parametrize("vals", [ - [np.nan, np.nan, np.nan, np.nan, np.nan], - [1, np.nan, np.nan, 3, np.nan], - [1, np.nan, 0, 3, 0], - ]) + @pytest.mark.parametrize( + "vals", + [ + [np.nan, np.nan, np.nan, np.nan, np.nan], + [1, np.nan, np.nan, 3, np.nan], + [1, np.nan, 0, 3, 0], + ], + ) @pytest.mark.parametrize("fill_value", [None, 0]) def test_dense_repr(self, vals, fill_value): vals = np.array(vals) @@ -664,13 +689,13 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:, ] - exp = SparseArray(dense[4:, ]) + res = sparse[4:,] + exp = SparseArray(dense[4:,]) tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:, ] - exp = SparseArray(dense[4:, ], fill_value=0) + res = sparse[4:,] + exp = SparseArray(dense[4:,], fill_value=0) tm.assert_sp_array_equal(res, exp) with pytest.raises(IndexError): @@ -685,8 +710,7 @@ def test_boolean_slice_empty(self): res = arr[[False, False, False]] assert res.dtype == arr.dtype - @pytest.mark.parametrize("op", ["add", "sub", "mul", - "truediv", "floordiv", "pow"]) + @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) def test_binary_operators(self, op): op = getattr(operator, op) data1 = np.random.randn(20) @@ -705,8 +729,9 @@ def test_binary_operators(self, op): def _check_op(op, first, second): res = op(first, second) - exp = SparseArray(op(first.to_dense(), second.to_dense()), - fill_value=first.fill_value) + exp = SparseArray( + op(first.to_dense(), second.to_dense()), fill_value=first.fill_value + ) assert isinstance(res, SparseArray) assert_almost_equal(res.to_dense(), exp.to_dense()) @@ -746,10 +771,8 @@ def _check_roundtrip(obj): def test_generator_warnings(self): sp_arr = SparseArray([1, 2, 3]) with warnings.catch_warnings(record=True) as w: - 
warnings.filterwarnings(action='always', - category=DeprecationWarning) - warnings.filterwarnings(action='always', - category=PendingDeprecationWarning) + warnings.filterwarnings(action="always", category=DeprecationWarning) + warnings.filterwarnings(action="always", category=PendingDeprecationWarning) for _ in sp_arr: pass assert len(w) == 0 @@ -786,9 +809,9 @@ def test_fillna(self): tm.assert_sp_array_equal(res, exp) # float dtype's fill_value is np.nan, replaced by -1 - s = SparseArray([0., 0., 0., 0.]) + s = SparseArray([0.0, 0.0, 0.0, 0.0]) res = s.fillna(-1) - exp = SparseArray([0., 0., 0., 0.], fill_value=-1) + exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1) tm.assert_sp_array_equal(res, exp) # int dtype shouldn't have missing. No changes. @@ -829,13 +852,7 @@ def test_fillna_overlap(self): def test_nonzero(self): # Tests regression #21172. - sa = pd.SparseArray([ - float('nan'), - float('nan'), - 1, 0, 0, - 2, 0, 0, 0, - 3, 0, 0 - ]) + sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) result, = sa.nonzero() tm.assert_numpy_array_equal(expected, result) @@ -846,12 +863,14 @@ def test_nonzero(self): class TestSparseArrayAnalytics: - - @pytest.mark.parametrize('data,pos,neg', [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) def test_all(self, data, pos, neg): # GH 17570 out = SparseArray(data).all() @@ -867,11 +886,14 @@ def test_all(self, data, pos, neg): out = SparseArray(data, fill_value=pos).all() assert not out - @pytest.mark.parametrize('data,pos,neg', [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_all(self, data, pos, neg): # GH 17570 @@ -889,15 +911,18 @@ def test_numpy_all(self, data, pos, neg): assert not out # raises with a different message on py2. 
- msg = "the \'out\' parameter is not supported" + msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.all(SparseArray(data), out=np.array([])) - @pytest.mark.parametrize('data,pos,neg', [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) def test_any(self, data, pos, neg): # GH 17570 out = SparseArray(data).any() @@ -913,11 +938,14 @@ def test_any(self, data, pos, neg): out = SparseArray(data, fill_value=pos).any() assert not out - @pytest.mark.parametrize('data,pos,neg', [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_any(self, data, pos, neg): # GH 17570 @@ -934,7 +962,7 @@ def test_numpy_any(self, data, pos, neg): out = np.any(SparseArray(data, fill_value=pos)) assert not out - msg = "the \'out\' parameter is not supported" + msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.any(SparseArray(data), out=out) @@ -970,12 +998,19 @@ def test_numpy_sum(self): with pytest.raises(ValueError, match=msg): np.sum(SparseArray(data), out=out) - @pytest.mark.parametrize("data,expected", [ - (np.array([1, 2, 3, 4, 5], dtype=float), # non-null data - SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0]))), - (np.array([1, 2, np.nan, 4, 5], dtype=float), # null data - SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))) - ]) + @pytest.mark.parametrize( + "data,expected", + [ + ( + np.array([1, 2, 3, 4, 5], dtype=float), # non-null data + SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])), + ), + ( + np.array([1, 2, np.nan, 4, 5], dtype=float), # null data + SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])), + ), + ], + ) @pytest.mark.parametrize("numpy", [True, False]) def test_cumsum(self, data, expected, numpy): cumsum = np.cumsum if numpy else lambda s: s.cumsum() @@ -1037,14 +1072,12 @@ def test_ufunc(self): tm.assert_sp_array_equal(np.abs(sparse), result) sparse = SparseArray([1, -1, 2, -2], fill_value=1) - result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, - fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) tm.assert_sp_array_equal(abs(sparse), result) tm.assert_sp_array_equal(np.abs(sparse), result) sparse = SparseArray([1, -1, 2, -2], fill_value=-1) - result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, - fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) tm.assert_sp_array_equal(abs(sparse), result) tm.assert_sp_array_equal(np.abs(sparse), result) @@ -1074,33 +1107,30 @@ def test_ufunc_args(self): result = SparseArray([2, 0, 1, -1], fill_value=1) tm.assert_sp_array_equal(np.add(sparse, 1), result) - @pytest.mark.parametrize('fill_value', [0.0, np.nan]) + @pytest.mark.parametrize("fill_value", [0.0, np.nan]) def test_modf(self, fill_value): # https://github.com/pandas-dev/pandas/issues/26946 - sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], - fill_value=fill_value) + sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) r1, r2 = np.modf(sparse) e1, e2 = np.modf(np.asarray(sparse)) tm.assert_sp_array_equal(r1, pd.SparseArray(e1, 
fill_value=fill_value)) tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) def test_nbytes_integer(self): - arr = SparseArray([1, 0, 0, 0, 2], kind='integer') + arr = SparseArray([1, 0, 0, 0, 2], kind="integer") result = arr.nbytes # (2 * 8) + 2 * 4 assert result == 24 def test_nbytes_block(self): - arr = SparseArray([1, 2, 0, 0, 0], kind='block') + arr = SparseArray([1, 2, 0, 0, 0], kind="block") result = arr.nbytes # (2 * 8) + 4 + 4 # sp_values, blocs, blenghts assert result == 24 def test_asarray_datetime64(self): - s = pd.SparseArray( - pd.to_datetime(['2012', None, None, '2013']) - ) + s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"])) np.asarray(s) def test_density(self): @@ -1114,10 +1144,7 @@ def test_npoints(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestAccessor: - - @pytest.mark.parametrize('attr', [ - 'npoints', 'density', 'fill_value', 'sp_values', - ]) + @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) def test_get_attributes(self, attr): arr = SparseArray([0, 1]) ser = pd.Series(arr) @@ -1137,29 +1164,31 @@ def test_from_coo(self): result = pd.Series.sparse.from_coo(sp_array) index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) - expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]') + expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_to_coo(self): import scipy.sparse - ser = pd.Series([1, 2, 3], - index=pd.MultiIndex.from_product([[0], [1, 2, 3]], - names=['a', 'b']), - dtype='Sparse[int]') + + ser = pd.Series( + [1, 2, 3], + index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=["a", "b"]), + dtype="Sparse[int]", + ) A, _, _ = ser.sparse.to_coo() assert isinstance(A, scipy.sparse.coo.coo_matrix) def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3]) - with pytest.raises(AttributeError, match='.sparse'): + with pytest.raises(AttributeError, match=".sparse"): ser.sparse.density def test_setting_fill_value_fillna_still_works(): # This is why letting users update fill_value / dtype is bad # astype has the same problem. 
- arr = SparseArray([1., np.nan, 1.0], fill_value=0.0) + arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0) arr.fill_value = np.nan result = arr.isna() # Can't do direct comparison, since the sp_index will be different @@ -1183,27 +1212,26 @@ def test_setting_fill_value_updates(): tm.assert_sp_array_equal(arr, expected) -@pytest.mark.parametrize("arr, loc", [ - ([None, 1, 2], 0), - ([0, None, 2], 1), - ([0, 1, None], 2), - ([0, 1, 1, None, None], 3), - ([1, 1, 1, 2], -1), - ([], -1), -]) +@pytest.mark.parametrize( + "arr, loc", + [ + ([None, 1, 2], 0), + ([0, None, 2], 1), + ([0, 1, None], 2), + ([0, 1, 1, None, None], 3), + ([1, 1, 1, 2], -1), + ([], -1), + ], +) def test_first_fill_value_loc(arr, loc): result = SparseArray(arr)._first_fill_value_loc() assert result == loc -@pytest.mark.parametrize('arr', [ - [1, 2, np.nan, np.nan], - [1, np.nan, 2, np.nan], - [1, 2, np.nan], -]) -@pytest.mark.parametrize("fill_value", [ - np.nan, 0, 1 -]) +@pytest.mark.parametrize( + "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]] +) +@pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): a = pd.SparseArray(arr, fill_value=fill_value).unique() b = pd.Series(arr).unique() diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 2d386de0d31a36..db8f62962f0b0a 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -5,14 +5,17 @@ from pandas.core.sparse.api import SparseDtype -@pytest.mark.parametrize("dtype, fill_value", [ - ('int', 0), - ('float', np.nan), - ('bool', False), - ('object', np.nan), - ('datetime64[ns]', pd.NaT), - ('timedelta64[ns]', pd.NaT), -]) +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", 0), + ("float", np.nan), + ("bool", False), + ("object", np.nan), + ("datetime64[ns]", pd.NaT), + ("timedelta64[ns]", pd.NaT), + ], +) def test_inferred_dtype(dtype, fill_value): sparse_dtype = SparseDtype(dtype) result = sparse_dtype.fill_value @@ -23,28 +26,31 @@ def test_inferred_dtype(dtype, fill_value): def test_from_sparse_dtype(): - dtype = SparseDtype('float', 0) + dtype = SparseDtype("float", 0) result = SparseDtype(dtype) assert result.fill_value == 0 def test_from_sparse_dtype_fill_value(): - dtype = SparseDtype('int', 1) + dtype = SparseDtype("int", 1) result = SparseDtype(dtype, fill_value=2) - expected = SparseDtype('int', 2) + expected = SparseDtype("int", 2) assert result == expected -@pytest.mark.parametrize('dtype, fill_value', [ - ('int', None), - ('float', None), - ('bool', None), - ('object', None), - ('datetime64[ns]', None), - ('timedelta64[ns]', None), - ('int', np.nan), - ('float', 0), -]) +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", None), + ("float", None), + ("bool", None), + ("object", None), + ("datetime64[ns]", None), + ("timedelta64[ns]", None), + ("int", np.nan), + ("float", 0), + ], +) def test_equal(dtype, fill_value): a = SparseDtype(dtype, fill_value) b = SparseDtype(dtype, fill_value) @@ -53,64 +59,76 @@ def test_equal(dtype, fill_value): def test_nans_equal(): - a = SparseDtype(float, float('nan')) + a = SparseDtype(float, float("nan")) b = SparseDtype(float, np.nan) assert a == b assert b == a -@pytest.mark.parametrize('a, b', [ - (SparseDtype('float64'), SparseDtype('float32')), - (SparseDtype('float64'), SparseDtype('float64', 0)), - (SparseDtype('float64'), SparseDtype('datetime64[ns]', np.nan)), - (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), - 
(SparseDtype('float64'), np.dtype('float64')), -]) +@pytest.mark.parametrize( + "a, b", + [ + (SparseDtype("float64"), SparseDtype("float32")), + (SparseDtype("float64"), SparseDtype("float64", 0)), + (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), + (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), + (SparseDtype("float64"), np.dtype("float64")), + ], +) def test_not_equal(a, b): assert a != b def test_construct_from_string_raises(): with pytest.raises(TypeError): - SparseDtype.construct_from_string('not a dtype') - - -@pytest.mark.parametrize("dtype, expected", [ - (SparseDtype(int), True), - (SparseDtype(float), True), - (SparseDtype(bool), True), - (SparseDtype(object), False), - (SparseDtype(str), False), -]) + SparseDtype.construct_from_string("not a dtype") + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (SparseDtype(int), True), + (SparseDtype(float), True), + (SparseDtype(bool), True), + (SparseDtype(object), False), + (SparseDtype(str), False), + ], +) def test_is_numeric(dtype, expected): assert dtype._is_numeric is expected def test_str_uses_object(): result = SparseDtype(str).subtype - assert result == np.dtype('object') - - -@pytest.mark.parametrize("string, expected", [ - ('Sparse[float64]', SparseDtype(np.dtype('float64'))), - ('Sparse[float32]', SparseDtype(np.dtype('float32'))), - ('Sparse[int]', SparseDtype(np.dtype('int'))), - ('Sparse[str]', SparseDtype(np.dtype('str'))), - ('Sparse[datetime64[ns]]', SparseDtype(np.dtype('datetime64[ns]'))), - ("Sparse", SparseDtype(np.dtype("float"), np.nan)) -]) + assert result == np.dtype("object") + + +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[float64]", SparseDtype(np.dtype("float64"))), + ("Sparse[float32]", SparseDtype(np.dtype("float32"))), + ("Sparse[int]", SparseDtype(np.dtype("int"))), + ("Sparse[str]", SparseDtype(np.dtype("str"))), + ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))), + ("Sparse", SparseDtype(np.dtype("float"), np.nan)), + ], +) def test_construct_from_string(string, expected): result = SparseDtype.construct_from_string(string) assert result == expected -@pytest.mark.parametrize("a, b, expected", [ - (SparseDtype(float, 0.0), SparseDtype(np.dtype('float'), 0.0), True), - (SparseDtype(int, 0), SparseDtype(int, 0), True), - (SparseDtype(float, float('nan')), SparseDtype(float, np.nan), True), - (SparseDtype(float, 0), SparseDtype(float, np.nan), False), - (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False), -]) +@pytest.mark.parametrize( + "a, b, expected", + [ + (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True), + (SparseDtype(int, 0), SparseDtype(int, 0), True), + (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True), + (SparseDtype(float, 0), SparseDtype(float, np.nan), False), + (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False), + ], +) def test_hash_equal(a, b, expected): result = a == b assert result is expected @@ -119,43 +137,47 @@ def test_hash_equal(a, b, expected): assert result is expected -@pytest.mark.parametrize('string, expected', [ - ('Sparse[int]', 'int'), - ('Sparse[int, 0]', 'int'), - ('Sparse[int64]', 'int64'), - ('Sparse[int64, 0]', 'int64'), - ('Sparse[datetime64[ns], 0]', 'datetime64[ns]'), -]) +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[int]", "int"), + ("Sparse[int, 0]", "int"), + ("Sparse[int64]", "int64"), + ("Sparse[int64, 0]", "int64"), + ("Sparse[datetime64[ns], 0]", "datetime64[ns]"), + ], +) def test_parse_subtype(string, expected): subtype, _ 
= SparseDtype._parse_subtype(string) assert subtype == expected -@pytest.mark.parametrize("string", [ - "Sparse[int, 1]", - "Sparse[float, 0.0]", - "Sparse[bool, True]", -]) +@pytest.mark.parametrize( + "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"] +) def test_construct_from_string_fill_value_raises(string): - with pytest.raises(TypeError, match='fill_value in the string is not'): + with pytest.raises(TypeError, match="fill_value in the string is not"): SparseDtype.construct_from_string(string) -@pytest.mark.parametrize('original, dtype, expected', [ - (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), - (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, '1')), - (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), -]) +@pytest.mark.parametrize( + "original, dtype, expected", + [ + (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), + (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), + (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), + ], +) def test_update_dtype(original, dtype, expected): result = original.update_dtype(dtype) assert result == expected -@pytest.mark.parametrize("original, dtype", [ - (SparseDtype(float, np.nan), int), - (SparseDtype(str, 'abc'), int), -]) +@pytest.mark.parametrize( + "original, dtype", + [(SparseDtype(float, np.nan), int), (SparseDtype(str, "abc"), int)], +) def test_update_dtype_raises(original, dtype): with pytest.raises(ValueError): original.update_dtype(dtype) diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 44bda995f9a780..183eaada16452e 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -12,24 +12,54 @@ TEST_LENGTH = 20 -plain_case = dict(xloc=[0, 7, 15], xlen=[3, 5, 5], yloc=[2, 9, 14], - ylen=[2, 3, 5], intersect_loc=[2, 9, 15], - intersect_len=[1, 3, 4]) -delete_blocks = dict(xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], - intersect_loc=[1], intersect_len=[3]) -split_blocks = dict(xloc=[0], xlen=[10], yloc=[0, 5], ylen=[3, 7], - intersect_loc=[0, 5], intersect_len=[3, 5]) -skip_block = dict(xloc=[10], xlen=[5], yloc=[0, 12], ylen=[5, 3], - intersect_loc=[12], intersect_len=[3]) - -no_intersect = dict(xloc=[0, 10], xlen=[4, 6], yloc=[5, 17], ylen=[4, 2], - intersect_loc=[], intersect_len=[]) +plain_case = dict( + xloc=[0, 7, 15], + xlen=[3, 5, 5], + yloc=[2, 9, 14], + ylen=[2, 3, 5], + intersect_loc=[2, 9, 15], + intersect_len=[1, 3, 4], +) +delete_blocks = dict( + xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], intersect_loc=[1], intersect_len=[3] +) +split_blocks = dict( + xloc=[0], + xlen=[10], + yloc=[0, 5], + ylen=[3, 7], + intersect_loc=[0, 5], + intersect_len=[3, 5], +) +skip_block = dict( + xloc=[10], + xlen=[5], + yloc=[0, 12], + ylen=[5, 3], + intersect_loc=[12], + intersect_len=[3], +) + +no_intersect = dict( + xloc=[0, 10], + xlen=[4, 6], + yloc=[5, 17], + ylen=[4, 2], + intersect_loc=[], + intersect_len=[], +) def check_cases(_check_case): def _check_case_dict(case): - _check_case(case['xloc'], case['xlen'], case['yloc'], case['ylen'], - case['intersect_loc'], case['intersect_len']) + _check_case( + case["xloc"], + case["xlen"], + case["yloc"], + case["ylen"], + case["intersect_loc"], + case["intersect_len"], + ) _check_case_dict(plain_case) _check_case_dict(delete_blocks) @@ -43,24 +73,22 @@ def _check_case_dict(case): class TestSparseIndexUnion: - def 
test_index_make_union(self): def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) yindex = BlockIndex(TEST_LENGTH, yloc, ylen) bresult = xindex.make_union(yindex) - assert (isinstance(bresult, BlockIndex)) - tm.assert_numpy_array_equal(bresult.blocs, - np.array(eloc, dtype=np.int32)) - tm.assert_numpy_array_equal(bresult.blengths, - np.array(elen, dtype=np.int32)) + assert isinstance(bresult, BlockIndex) + tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32)) + tm.assert_numpy_array_equal( + bresult.blengths, np.array(elen, dtype=np.int32) + ) ixindex = xindex.to_int_index() iyindex = yindex.to_int_index() iresult = ixindex.make_union(iyindex) - assert (isinstance(iresult, IntIndex)) - tm.assert_numpy_array_equal(iresult.indices, - bresult.to_int_index().indices) + assert isinstance(iresult, IntIndex) + tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices) """ x: ---- @@ -191,12 +219,11 @@ def test_int_index_make_union(self): class TestSparseIndexIntersect: - @td.skip_if_windows def test_intersect(self): def _check_correct(a, b, expected): result = a.intersect(b) - assert (result.equals(expected)) + assert result.equals(expected) def _check_length_exc(a, longer): msg = "Indices must reference same underlying length" @@ -210,12 +237,12 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen) _check_correct(xindex, yindex, expected) - _check_correct(xindex.to_int_index(), yindex.to_int_index(), - expected.to_int_index()) + _check_correct( + xindex.to_int_index(), yindex.to_int_index(), expected.to_int_index() + ) _check_length_exc(xindex, longer_index) - _check_length_exc(xindex.to_int_index(), - longer_index.to_int_index()) + _check_length_exc(xindex.to_int_index(), longer_index.to_int_index()) check_cases(_check_case) @@ -231,10 +258,12 @@ def test_intersect_empty(self): assert yindex.intersect(xindex).equals(xindex) def test_intersect_identical(self): - cases = [IntIndex(5, np.array([1, 2], dtype=np.int32)), - IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), - IntIndex(0, np.array([], dtype=np.int32)), - IntIndex(5, np.array([], dtype=np.int32))] + cases = [ + IntIndex(5, np.array([1, 2], dtype=np.int32)), + IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), + IntIndex(0, np.array([], dtype=np.int32)), + IntIndex(5, np.array([], dtype=np.int32)), + ] for case in cases: assert case.intersect(case).equals(case) @@ -243,64 +272,49 @@ def test_intersect_identical(self): class TestSparseIndexCommon: - def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.indices, - np.array([2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.indices, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='integer') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 4 - 
tm.assert_numpy_array_equal(idx.indices, - np.array([0, 1, 2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.blocs, - np.array([2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.blocs, - np.array([], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([4], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0, 2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([1, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) def test_lookup(self): - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == -1 @@ -314,8 +328,7 @@ def test_lookup(self): for i in range(-1, 5): assert idx.lookup(i) == -1 - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == 1 @@ -323,8 +336,7 @@ def test_lookup(self): assert idx.lookup(3) == 3 assert idx.lookup(4) == -1 - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == -1 @@ -333,7 +345,7 @@ def test_lookup(self): assert idx.lookup(4) == -1 def test_lookup_array(self): - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) @@ -348,8 +360,7 @@ def test_lookup_array(self): res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) exp = np.array([-1, -1, -1, -1], dtype=np.int32) - idx = 
_make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) exp = np.array([-1, 0, 2], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -358,8 +369,7 @@ def test_lookup_array(self): exp = np.array([-1, 2, 1, 3], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) exp = np.array([1, -1, 2, 0], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -370,16 +380,16 @@ def test_lookup_array(self): def test_lookup_basics(self): def _check(index): - assert (index.lookup(0) == -1) - assert (index.lookup(5) == 0) - assert (index.lookup(7) == 2) - assert (index.lookup(8) == -1) - assert (index.lookup(9) == -1) - assert (index.lookup(10) == -1) - assert (index.lookup(11) == -1) - assert (index.lookup(12) == 3) - assert (index.lookup(17) == 8) - assert (index.lookup(18) == -1) + assert index.lookup(0) == -1 + assert index.lookup(5) == 0 + assert index.lookup(7) == 2 + assert index.lookup(8) == -1 + assert index.lookup(9) == -1 + assert index.lookup(10) == -1 + assert index.lookup(11) == -1 + assert index.lookup(12) == 3 + assert index.lookup(17) == 8 + assert index.lookup(18) == -1 bindex = BlockIndex(20, [5, 12], [3, 6]) iindex = bindex.to_int_index() @@ -391,50 +401,38 @@ def _check(index): class TestBlockIndex: - def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.blocs, - np.array([2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.blocs, - np.array([], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([4], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0, 2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([1, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], 
dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) def test_make_block_boundary(self): for i in [5, 10, 100, 101]: - idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), - kind='block') + idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") exp = np.arange(0, i, 2, dtype=np.int32) tm.assert_numpy_array_equal(idx.blocs, exp) - tm.assert_numpy_array_equal(idx.blengths, - np.ones(len(exp), dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32)) def test_equals(self): index = BlockIndex(10, [0, 4], [2, 5]) @@ -469,8 +467,7 @@ def test_to_int_index(self): block = BlockIndex(20, locs, lengths) dense = block.to_int_index() - tm.assert_numpy_array_equal(dense.indices, - np.array(exp_inds, dtype=np.int32)) + tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32)) def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) @@ -478,7 +475,6 @@ def test_to_block_index(self): class TestIntIndex: - def test_check_integrity(self): # Too many indices than specified in self.length @@ -518,24 +514,20 @@ def test_check_integrity(self): IntIndex(length=5, indices=[1, 3, 3]) def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.indices, - np.array([2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.indices, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='integer') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.indices, - np.array([0, 1, 2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) def test_equals(self): index = IntIndex(10, [0, 1, 2, 3, 4]) @@ -543,7 +535,6 @@ def test_equals(self): assert not index.equals(IntIndex(10, [0, 1, 2, 3])) def test_to_block_index(self): - def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) yindex = BlockIndex(TEST_LENGTH, yloc, ylen) @@ -563,7 +554,6 @@ def test_to_int_index(self): class TestSparseOperators: - def _op_tests(self, sparse_op, python_op): def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) @@ -572,16 +562,18 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xdindex = xindex.to_int_index() ydindex = yindex.to_int_index() - x = np.arange(xindex.npoints) * 10. + 1 - y = np.arange(yindex.npoints) * 100. 
+ 1 + x = np.arange(xindex.npoints) * 10.0 + 1 + y = np.arange(yindex.npoints) * 100.0 + 1 xfill = 0 yfill = 2 - result_block_vals, rb_index, bfill = sparse_op(x, xindex, xfill, y, - yindex, yfill) - result_int_vals, ri_index, ifill = sparse_op(x, xdindex, xfill, y, - ydindex, yfill) + result_block_vals, rb_index, bfill = sparse_op( + x, xindex, xfill, y, yindex, yfill + ) + result_int_vals, ri_index, ifill = sparse_op( + x, xdindex, xfill, y, ydindex, yfill + ) assert rb_index.to_int_index().equals(ri_index) tm.assert_numpy_array_equal(result_block_vals, result_int_vals) @@ -597,15 +589,13 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): series_result = python_op(xseries, yseries) series_result = series_result.reindex(ri_index.indices) - tm.assert_numpy_array_equal(result_block_vals, - series_result.values) + tm.assert_numpy_array_equal(result_block_vals, series_result.values) tm.assert_numpy_array_equal(result_int_vals, series_result.values) check_cases(_check_case) - @pytest.mark.parametrize('opname', - ['add', 'sub', 'mul', 'truediv', 'floordiv']) + @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"]) def test_op(self, opname): - sparse_op = getattr(splib, 'sparse_%s_float64' % opname) + sparse_op = getattr(splib, "sparse_%s_float64" % opname) python_op = getattr(operator, opname) self._op_tests(sparse_op, python_op) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index d097141cd8c739..e8d9ecfac61e4f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -11,95 +11,126 @@ from pandas.api.extensions import register_extension_dtype from pandas.api.types import is_scalar from pandas.core.arrays import PandasArray, integer_array, period_array -from pandas.tests.extension.decimal import ( - DecimalArray, DecimalDtype, to_decimal) +from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal import pandas.util.testing as tm -@pytest.mark.parametrize("data, dtype, expected", [ - # Basic NumPy defaults. 
- ([1, 2], None, PandasArray(np.array([1, 2]))), - ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), - ([1, 2], np.dtype('float32'), - PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), - - # String alias passes through to NumPy - ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))), - - # Period alias - ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', - period_array(['2000', '2001'], freq='D')), - - # Period dtype - ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), - period_array(['2000'], freq='D')), - - # Datetime (naive) - ([1, 2], np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype='datetime64[ns]'))), - - (np.array([1, 2], dtype='datetime64[ns]'), None, - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype='datetime64[ns]'))), - - (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (pd.DatetimeIndex(['2000', '2001']), None, - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (['2000', '2001'], np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - # Datetime (tz-aware) - (['2000', '2001'], pd.DatetimeTZDtype(tz="CET"), - pd.arrays.DatetimeArray._from_sequence( - ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz="CET"))), - - # Timedelta - (['1H', '2H'], np.dtype('timedelta64[ns]'), - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (pd.TimedeltaIndex(['1H', '2H']), np.dtype('timedelta64[ns]'), - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (pd.TimedeltaIndex(['1H', '2H']), None, - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - # Category - (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), - (['a', 'b'], pd.CategoricalDtype(None, ordered=True), - pd.Categorical(['a', 'b'], ordered=True)), - - # Interval - ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', - pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)])), - - # Sparse - ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), - - # IntegerNA - ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), - (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), - - # Index - (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), - - # Series[EA] returns the EA - (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), - None, - pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), - - # "3rd party" EAs work - ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), - - # pass an ExtensionArray, but a different dtype - (period_array(['2000', '2001'], freq='D'), - 'category', - pd.Categorical([pd.Period('2000', 'D'), pd.Period('2001', 'D')])), -]) +@pytest.mark.parametrize( + "data, dtype, expected", + [ + # Basic NumPy defaults. 
+ ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), + ( + [1, 2], + np.dtype("float32"), + PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), + ), + (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + # String alias passes through to NumPy + ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), + # Period alias + ( + [pd.Period("2000", "D"), pd.Period("2001", "D")], + "Period[D]", + period_array(["2000", "2001"], freq="D"), + ), + # Period dtype + ( + [pd.Period("2000", "D")], + pd.PeriodDtype("D"), + period_array(["2000"], freq="D"), + ), + # Datetime (naive) + ( + [1, 2], + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence( + np.array([1, 2], dtype="datetime64[ns]") + ), + ), + ( + np.array([1, 2], dtype="datetime64[ns]"), + None, + pd.arrays.DatetimeArray._from_sequence( + np.array([1, 2], dtype="datetime64[ns]") + ), + ), + ( + pd.DatetimeIndex(["2000", "2001"]), + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + pd.DatetimeIndex(["2000", "2001"]), + None, + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + ["2000", "2001"], + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + # Datetime (tz-aware) + ( + ["2000", "2001"], + pd.DatetimeTZDtype(tz="CET"), + pd.arrays.DatetimeArray._from_sequence( + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") + ), + ), + # Timedelta + ( + ["1H", "2H"], + np.dtype("timedelta64[ns]"), + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + pd.TimedeltaIndex(["1H", "2H"]), + np.dtype("timedelta64[ns]"), + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + pd.TimedeltaIndex(["1H", "2H"]), + None, + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + # Category + (["a", "b"], "category", pd.Categorical(["a", "b"])), + ( + ["a", "b"], + pd.CategoricalDtype(None, ordered=True), + pd.Categorical(["a", "b"], ordered=True), + ), + # Interval + ( + [pd.Interval(1, 2), pd.Interval(3, 4)], + "interval", + pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]), + ), + # Sparse + ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")), + # IntegerNA + ([1, None], "Int16", integer_array([1, None], dtype="Int16")), + (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # Index + (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # Series[EA] returns the EA + ( + pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])), + None, + pd.Categorical(["a", "b"], categories=["a", "b", "c"]), + ), + # "3rd party" EAs work + ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])), + # pass an ExtensionArray, but a different dtype + ( + period_array(["2000", "2001"], freq="D"), + "category", + pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]), + ), + ], +) def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) tm.assert_equal(result, expected) @@ -123,85 +154,101 @@ def test_array_copy(): cet = pytz.timezone("CET") -@pytest.mark.parametrize('data, expected', [ - # period - ([pd.Period("2000", "D"), pd.Period("2001", "D")], - period_array(["2000", "2001"], freq="D")), - - # interval - ([pd.Interval(0, 1), pd.Interval(1, 2)], - pd.arrays.IntervalArray.from_breaks([0, 1, 2])), - - # datetime - ([pd.Timestamp('2000',), pd.Timestamp('2001')], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - 
- ([datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (np.array([1, 2], dtype='M8[ns]'), - pd.arrays.DatetimeArray(np.array([1, 2], dtype='M8[ns]'))), - - (np.array([1, 2], dtype='M8[us]'), - pd.arrays.DatetimeArray(np.array([1000, 2000], dtype='M8[ns]'))), - - # datetimetz - ([pd.Timestamp('2000', tz='CET'), pd.Timestamp('2001', tz='CET')], - pd.arrays.DatetimeArray._from_sequence( - ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz='CET'))), - - ([datetime.datetime(2000, 1, 1, tzinfo=cet), - datetime.datetime(2001, 1, 1, tzinfo=cet)], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'], - tz=cet)), - - # timedelta - ([pd.Timedelta('1H'), pd.Timedelta('2H')], - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (np.array([1, 2], dtype='m8[ns]'), - pd.arrays.TimedeltaArray(np.array([1, 2], dtype='m8[ns]'))), - - (np.array([1, 2], dtype='m8[us]'), - pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype='m8[ns]'))), - -]) +@pytest.mark.parametrize( + "data, expected", + [ + # period + ( + [pd.Period("2000", "D"), pd.Period("2001", "D")], + period_array(["2000", "2001"], freq="D"), + ), + # interval + ( + [pd.Interval(0, 1), pd.Interval(1, 2)], + pd.arrays.IntervalArray.from_breaks([0, 1, 2]), + ), + # datetime + ( + [pd.Timestamp("2000"), pd.Timestamp("2001")], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + np.array([1, 2], dtype="M8[ns]"), + pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + ), + ( + np.array([1, 2], dtype="M8[us]"), + pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + ), + # datetimetz + ( + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], + pd.arrays.DatetimeArray._from_sequence( + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") + ), + ), + ( + [ + datetime.datetime(2000, 1, 1, tzinfo=cet), + datetime.datetime(2001, 1, 1, tzinfo=cet), + ], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + ), + # timedelta + ( + [pd.Timedelta("1H"), pd.Timedelta("2H")], + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + np.array([1, 2], dtype="m8[ns]"), + pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + ), + ( + np.array([1, 2], dtype="m8[us]"), + pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), + ), + ], +) def test_array_inference(data, expected): result = pd.array(data) tm.assert_equal(result, expected) -@pytest.mark.parametrize('data', [ - # mix of frequencies - [pd.Period("2000", "D"), pd.Period("2001", "A")], - # mix of closed - [pd.Interval(0, 1, closed='left'), pd.Interval(1, 2, closed='right')], - # Mix of timezones - [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")], - # Mix of tz-aware and tz-naive - [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")], - np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')]), -]) +@pytest.mark.parametrize( + "data", + [ + # mix of frequencies + [pd.Period("2000", "D"), pd.Period("2001", "A")], + # mix of closed + [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")], + # Mix of timezones + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")], + # Mix of tz-aware and tz-naive + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")], + np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]), + ], +) def 
test_array_inference_fails(data): result = pd.array(data) expected = PandasArray(np.array(data, dtype=object)) tm.assert_extension_array_equal(result, expected) -@pytest.mark.parametrize("data", [ - np.array([[1, 2], [3, 4]]), - [[1, 2], [3, 4]], -]) +@pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): - with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'): + with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): pd.array(data) def test_scalar_raises(): - with pytest.raises(ValueError, - match="Cannot pass scalar '1'"): + with pytest.raises(ValueError, match="Cannot pass scalar '1'"): pd.array(1) + # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. @@ -209,7 +256,7 @@ def test_scalar_raises(): @register_extension_dtype class DecimalDtype2(DecimalDtype): - name = 'decimal2' + name = "decimal2" @classmethod def construct_array_type(cls): @@ -227,12 +274,12 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_array_unboxes(box): - data = box([decimal.Decimal('1'), decimal.Decimal('2')]) + data = box([decimal.Decimal("1"), decimal.Decimal("2")]) # make sure it works with pytest.raises(TypeError): DecimalArray2._from_sequence(data) - result = pd.array(data, dtype='decimal2') + result = pd.array(data, dtype="decimal2") expected = DecimalArray2._from_sequence(data.values) tm.assert_equal(result, expected) @@ -247,8 +294,8 @@ def registry_without_decimal(): def test_array_not_registered(registry_without_decimal): # check we aren't on it - assert registry.find('decimal') is None - data = [decimal.Decimal('1'), decimal.Decimal('2')] + assert registry.find("decimal") is None + data = [decimal.Decimal("1"), decimal.Decimal("2")] result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) @@ -257,13 +304,13 @@ def test_array_not_registered(registry_without_decimal): class TestArrayAnalytics: def test_searchsorted(self, string_dtype): - arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + arr = pd.array(["a", "b", "c"], dtype=string_dtype) - result = arr.searchsorted('a', side='left') + result = arr.searchsorted("a", side="left") assert is_scalar(result) assert result == 0 - result = arr.searchsorted('a', side='right') + result = arr.searchsorted("a", side="right") assert is_scalar(result) assert result == 1 @@ -283,13 +330,23 @@ def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('arr, val', [ - [pd.date_range('20120101', periods=10, freq='2D'), - pd.Timestamp('20120102')], - [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), - pd.Timestamp('20120102', tz='Asia/Hong_Kong')], - [pd.timedelta_range(start='1 day', end='10 days', periods=10), - pd.Timedelta('2 days')]]) + @pytest.mark.parametrize( + "arr, val", + [ + [ + pd.date_range("20120101", periods=10, freq="2D"), + pd.Timestamp("20120102"), + ], + [ + pd.date_range("20120101", periods=10, freq="2D", tz="Asia/Hong_Kong"), + pd.Timestamp("20120102", tz="Asia/Hong_Kong"), + ], + [ + pd.timedelta_range(start="1 day", end="10 days", periods=10), + pd.Timedelta("2 days"), + ], + ], + ) def test_search_sorted_datetime64_scalar(self, arr, val): arr = pd.array(arr) result = 
arr.searchsorted(val) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 2337d8363155cf..34fae1f4b1ab4d 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -7,7 +7,7 @@ # TODO: more freq variants -@pytest.fixture(params=['D', 'B', 'W', 'M', 'Q', 'Y']) +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) def period_index(request): """ A fixture to provide PeriodIndex objects with different frequencies. @@ -18,13 +18,11 @@ def period_index(request): """ freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates - pi = pd.period_range(start=pd.Timestamp('2000-01-01'), - periods=100, - freq=freqstr) + pi = pd.period_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi -@pytest.fixture(params=['D', 'B', 'W', 'M', 'Q', 'Y']) +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) def datetime_index(request): """ A fixture to provide DatetimeIndex objects with different frequencies. @@ -35,9 +33,7 @@ def datetime_index(request): """ freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates, timezones - pi = pd.date_range(start=pd.Timestamp('2000-01-01'), - periods=100, - freq=freqstr) + pi = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -50,7 +46,7 @@ def timedelta_index(request): the TimedeltaIndex behavior. """ # TODO: flesh this out - return pd.TimedeltaIndex(['1 Day', '3 Hours', 'NaT']) + return pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"]) class SharedTests: @@ -59,9 +55,9 @@ class SharedTests: def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) with pytest.raises(ValueError, match="Lengths must match"): @@ -72,10 +68,10 @@ def test_compare_len1_raises(self): idx <= idx[[0]] def test_take(self): - data = np.arange(100, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) takers = [1, 4, 94] @@ -91,9 +87,9 @@ def test_take(self): tm.assert_index_equal(self.index_cls(result), expected) def test_take_fill(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) result = arr.take([-1, 1], allow_fill=True, fill_value=None) @@ -112,13 +108,12 @@ def test_take_fill(self): arr.take([0, 1], allow_fill=True, fill_value=2.0) with pytest.raises(ValueError): - arr.take([0, 1], allow_fill=True, - fill_value=pd.Timestamp.now().time) + arr.take([0, 1], allow_fill=True, fill_value=pd.Timestamp.now().time) def test_concat_same_type(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D').insert(0, pd.NaT) + idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) arr = self.array_cls(idx) result = arr._concat_same_type([arr[:-1], arr[1:], arr]) @@ 
-127,8 +122,8 @@ def test_concat_same_type(self): tm.assert_index_equal(self.index_cls(result), expected) def test_unbox_scalar(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") result = arr._unbox_scalar(arr[0]) assert isinstance(result, int) @@ -136,36 +131,36 @@ def test_unbox_scalar(self): assert isinstance(result, int) with pytest.raises(ValueError): - arr._unbox_scalar('foo') + arr._unbox_scalar("foo") def test_check_compatible_with(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr._check_compatible_with(arr[0]) arr._check_compatible_with(arr[:1]) arr._check_compatible_with(pd.NaT) def test_scalar_from_string(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") result = arr._scalar_from_string(str(arr[0])) assert result == arr[0] def test_reduce_invalid(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") - with pytest.raises(TypeError, match='cannot perform'): + with pytest.raises(TypeError, match="cannot perform"): arr._reduce("not a method") - @pytest.mark.parametrize('method', ['pad', 'backfill']) + @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_method_doesnt_change_orig(self, method): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr[4] = pd.NaT - fill_value = arr[3] if method == 'pad' else arr[5] + fill_value = arr[3] if method == "pad" else arr[5] result = arr.fillna(method=method) assert result[4] == fill_value @@ -174,8 +169,8 @@ def test_fillna_method_doesnt_change_orig(self, method): assert arr[4] is pd.NaT def test_searchsorted(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") # scalar result = arr.searchsorted(arr[1]) @@ -199,11 +194,11 @@ def test_searchsorted(self): assert result == 0 def test_setitem(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr[0] = arr[1] - expected = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + expected = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 expected[0] = expected[1] tm.assert_numpy_array_equal(arr.asi8, expected) @@ -213,8 +208,8 @@ def test_setitem(self): tm.assert_numpy_array_equal(arr.asi8, expected) def test_setitem_raises(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") val = arr[0] with pytest.raises(IndexError, match="index 12 is out of bounds"): @@ -231,9 +226,9 @@ class TestDatetimeArray(SharedTests): def test_round(self, tz_naive_fixture): # GH#24064 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01 01:01:00', periods=3, freq='H', 
tz=tz) + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) - result = dti.round(freq='2T') + result = dti.round(freq="2T") expected = dti - pd.Timedelta(minutes=1) tm.assert_index_equal(result, expected) @@ -250,14 +245,14 @@ def test_array_interface(self, datetime_index): tm.assert_numpy_array_equal(result, expected) # specifying M8[ns] gives the same result as default - result = np.asarray(arr, dtype='datetime64[ns]') + result = np.asarray(arr, dtype="datetime64[ns]") expected = arr._data assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]', copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=False) assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]') + result = np.array(arr, dtype="datetime64[ns]") assert result is not expected tm.assert_numpy_array_equal(result, expected) @@ -267,14 +262,14 @@ def test_array_interface(self, datetime_index): tm.assert_numpy_array_equal(result, expected) # to other dtype always copies - result = np.asarray(arr, dtype='int64') + result = np.asarray(arr, dtype="int64") assert result is not arr.asi8 assert not np.may_share_memory(arr, result) expected = arr.asi8.copy() tm.assert_numpy_array_equal(result, expected) # other dtypes handled by numpy - for dtype in ['float64', str]: + for dtype in ["float64", str]: result = np.asarray(arr, dtype=dtype) expected = np.asarray(arr).astype(dtype) tm.assert_numpy_array_equal(result, expected) @@ -282,7 +277,7 @@ def test_array_interface(self, datetime_index): def test_array_object_dtype(self, tz_naive_fixture): # GH#23524 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) expected = np.array(list(dti)) @@ -297,44 +292,44 @@ def test_array_object_dtype(self, tz_naive_fixture): def test_array_tz(self, tz_naive_fixture): # GH#23524 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) - expected = dti.asi8.view('M8[ns]') - result = np.array(arr, dtype='M8[ns]') + expected = dti.asi8.view("M8[ns]") + result = np.array(arr, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]') + result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # check that we are not making copies when setting copy=False - result = np.array(arr, dtype='M8[ns]', copy=False) + result = np.array(arr, dtype="M8[ns]", copy=False) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype='datetime64[ns]', copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=False) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) expected = dti.asi8 - result = np.array(arr, dtype='i8') + result = np.array(arr, dtype="i8") tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # check that we are still making copies when setting copy=False - result = np.array(arr, dtype='i8', copy=False) + result = np.array(arr, dtype="i8", copy=False) assert result.base is not 
expected.base assert result.base is None def test_from_array_keeps_base(self): # Ensure that DatetimeArray._data.base isn't lost. - arr = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]') + arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") dta = DatetimeArray(arr) assert dta._data is arr @@ -343,7 +338,7 @@ def test_from_array_keeps_base(self): def test_from_dti(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) assert list(dti) == list(arr) @@ -354,14 +349,14 @@ def test_from_dti(self, tz_naive_fixture): def test_astype_object(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(dti) - @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y']) + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_perioddelta(self, datetime_index, freqstr): # GH#23113 dti = datetime_index @@ -375,7 +370,7 @@ def test_to_perioddelta(self, datetime_index, freqstr): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y']) + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_period(self, datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) @@ -388,7 +383,7 @@ def test_to_period(self, datetime_index, freqstr): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('propname', pd.DatetimeIndex._bool_ops) + @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops) def test_bool_properties(self, datetime_index, propname): # in this case _bool_ops is just `is_leap_year` dti = datetime_index @@ -400,7 +395,7 @@ def test_bool_properties(self, datetime_index, propname): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('propname', pd.DatetimeIndex._field_ops) + @pytest.mark.parametrize("propname", pd.DatetimeIndex._field_ops) def test_int_properties(self, datetime_index, propname): dti = datetime_index arr = DatetimeArray(dti) @@ -424,9 +419,9 @@ def test_take_fill_valid(self, datetime_index, tz_naive_fixture): with pytest.raises(ValueError): # fill_value Period invalid - arr.take([-1, 1], allow_fill=True, fill_value=pd.Period('2014Q1')) + arr.take([-1, 1], allow_fill=True, fill_value=pd.Period("2014Q1")) - tz = None if dti.tz is not None else 'US/Eastern' + tz = None if dti.tz is not None else "US/Eastern" now = pd.Timestamp.now().tz_localize(tz) with pytest.raises(TypeError): # Timestamp with mismatched tz-awareness @@ -442,7 +437,7 @@ def test_concat_same_type_invalid(self, datetime_index): arr = DatetimeArray(dti) if arr.tz is None: - other = arr.tz_localize('UTC') + other = arr.tz_localize("UTC") else: other = arr.tz_localize(None) @@ -451,15 +446,19 @@ def test_concat_same_type_invalid(self, datetime_index): def test_concat_same_type_different_freq(self): # we *can* concatenate DTI with different freqs. 
- a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', - tz='US/Central')) - b = DatetimeArray(pd.date_range('2000', periods=2, freq='H', - tz='US/Central')) + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) result = DatetimeArray._concat_same_type([a, b]) - expected = DatetimeArray(pd.to_datetime([ - '2000-01-01 00:00:00', '2000-01-02 00:00:00', - '2000-01-01 00:00:00', '2000-01-01 01:00:00', - ]).tz_localize("US/Central")) + expected = DatetimeArray( + pd.to_datetime( + [ + "2000-01-01 00:00:00", + "2000-01-02 00:00:00", + "2000-01-01 00:00:00", + "2000-01-01 01:00:00", + ] + ).tz_localize("US/Central") + ) tm.assert_datetime_array_equal(result, expected) @@ -469,7 +468,7 @@ class TestTimedeltaArray(SharedTests): array_cls = TimedeltaArray def test_from_tdi(self): - tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) arr = TimedeltaArray(tdi) assert list(arr) == list(tdi) @@ -479,11 +478,11 @@ def test_from_tdi(self): assert list(tdi2) == list(arr) def test_astype_object(self): - tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) arr = TimedeltaArray(tdi) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(tdi) def test_to_pytimedelta(self, timedelta_index): @@ -504,7 +503,7 @@ def test_total_seconds(self, timedelta_index): tm.assert_numpy_array_equal(result, expected.values) - @pytest.mark.parametrize('propname', pd.TimedeltaIndex._field_ops) + @pytest.mark.parametrize("propname", pd.TimedeltaIndex._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index arr = TimedeltaArray(tdi) @@ -527,14 +526,14 @@ def test_array_interface(self, timedelta_index): tm.assert_numpy_array_equal(result, expected) # specifying m8[ns] gives the same result as default - result = np.asarray(arr, dtype='timedelta64[ns]') + result = np.asarray(arr, dtype="timedelta64[ns]") expected = arr._data assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='timedelta64[ns]', copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=False) assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='timedelta64[ns]') + result = np.array(arr, dtype="timedelta64[ns]") assert result is not expected tm.assert_numpy_array_equal(result, expected) @@ -544,14 +543,14 @@ def test_array_interface(self, timedelta_index): tm.assert_numpy_array_equal(result, expected) # to other dtype always copies - result = np.asarray(arr, dtype='int64') + result = np.asarray(arr, dtype="int64") assert result is not arr.asi8 assert not np.may_share_memory(arr, result) expected = arr.asi8.copy() tm.assert_numpy_array_equal(result, expected) # other dtypes handled by numpy - for dtype in ['float64', str]: + for dtype in ["float64", str]: result = np.asarray(arr, dtype=dtype) expected = np.asarray(arr).astype(dtype) tm.assert_numpy_array_equal(result, expected) @@ -571,7 +570,7 @@ def test_take_fill_valid(self, timedelta_index): with pytest.raises(ValueError): # fill_value Period invalid - arr.take([0, 1], allow_fill=True, fill_value=now.to_period('D')) + arr.take([0, 1], allow_fill=True, fill_value=now.to_period("D")) class TestPeriodArray(SharedTests): @@ -591,12 +590,12 @@ def test_from_pi(self, 
period_index): def test_astype_object(self, period_index): pi = period_index arr = PeriodArray(pi) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(pi) - @pytest.mark.parametrize('how', ['S', 'E']) + @pytest.mark.parametrize("how", ["S", "E"]) def test_to_timestamp(self, how, period_index): pi = period_index arr = PeriodArray(pi) @@ -609,7 +608,7 @@ def test_to_timestamp(self, how, period_index): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('propname', PeriodArray._bool_ops) + @pytest.mark.parametrize("propname", PeriodArray._bool_ops) def test_bool_properties(self, period_index, propname): # in this case _bool_ops is just `is_leap_year` pi = period_index @@ -620,7 +619,7 @@ def test_bool_properties(self, period_index, propname): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('propname', PeriodArray._field_ops) + @pytest.mark.parametrize("propname", PeriodArray._field_ops) def test_int_properties(self, period_index, propname): pi = period_index arr = PeriodArray(pi) @@ -644,11 +643,11 @@ def test_array_interface(self, period_index): # to other dtypes with pytest.raises(TypeError): - np.asarray(arr, dtype='int64') + np.asarray(arr, dtype="int64") with pytest.raises(TypeError): - np.asarray(arr, dtype='float64') + np.asarray(arr, dtype="float64") - result = np.asarray(arr, dtype='S20') - expected = np.asarray(arr).astype('S20') + result = np.asarray(arr, dtype="S20") + expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index c7c0e1180ce464..58c2f3fc65bb29 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -15,9 +15,8 @@ class TestDatetimeArrayConstructor: - def test_only_1dim_accepted(self): - arr = np.array([0, 1, 2, 3], dtype='M8[h]').astype('M8[ns]') + arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): # 2-dim @@ -30,24 +29,33 @@ def test_only_1dim_accepted(self): def test_freq_validation(self): # GH#24623 check that invalid instances cannot be created with the # public constructor - arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 + arr = np.arange(5, dtype=np.int64) * 3600 * 10 ** 9 - msg = ("Inferred frequency H from passed values does not " - "conform to passed frequency W-SUN") + msg = ( + "Inferred frequency H from passed values does not " + "conform to passed frequency W-SUN" + ) with pytest.raises(ValueError, match=msg): DatetimeArray(arr, freq="W") - @pytest.mark.parametrize('meth', [DatetimeArray._from_sequence, - sequence_to_dt64ns, - pd.to_datetime, - pd.DatetimeIndex]) + @pytest.mark.parametrize( + "meth", + [ + DatetimeArray._from_sequence, + sequence_to_dt64ns, + pd.to_datetime, + pd.DatetimeIndex, + ], + ) def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 - arr = np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')]) + arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ('Cannot mix tz-aware with tz-naive values|' - 'Tz-aware datetime.datetime cannot be converted ' - 'to datetime64 unless utc=True') + msg = ( + "Cannot mix tz-aware with tz-naive values|" + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True" + ) for obj in [arr, 
arr[::-1]]: # check that we raise regardless of whether naive is found @@ -56,39 +64,42 @@ def test_mixing_naive_tzaware_raises(self, meth): meth(obj) def test_from_pandas_array(self): - arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 + arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10 ** 9 - result = DatetimeArray._from_sequence(arr, freq='infer') + result = DatetimeArray._from_sequence(arr, freq="infer") - expected = pd.date_range('1970-01-01', periods=5, freq='H')._data + expected = pd.date_range("1970-01-01", periods=5, freq="H")._data tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): - arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz='US/Central')) - dtype = DatetimeTZDtype(tz='US/Eastern') - with pytest.raises(TypeError, match='Timezone of the array'): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) + dtype = DatetimeTZDtype(tz="US/Eastern") + with pytest.raises(TypeError, match="Timezone of the array"): DatetimeArray(arr, dtype=dtype) def test_non_array_raises(self): - with pytest.raises(ValueError, match='list'): + with pytest.raises(ValueError, match="list"): DatetimeArray([1, 2, 3]) def test_other_type_raises(self): - with pytest.raises(ValueError, - match="The dtype of 'values' is incorrect.*bool"): - DatetimeArray(np.array([1, 2, 3], dtype='bool')) + with pytest.raises( + ValueError, match="The dtype of 'values' is incorrect.*bool" + ): + DatetimeArray(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype='i8'), dtype='category') + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") def test_freq_infer_raises(self): - with pytest.raises(ValueError, match='Frequency inference'): - DatetimeArray(np.array([1, 2, 3], dtype='i8'), freq="infer") + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") def test_copy(self): - data = np.array([1, 2, 3], dtype='M8[ns]') + data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False) assert arr._data is data @@ -102,10 +113,10 @@ class TestDatetimeArrayComparisons: def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): # arbitrary tz-naive DatetimeIndex - opname = all_compare_operators.strip('_') + opname = all_compare_operators.strip("_") op = getattr(operator, opname) - dti = pd.date_range('2016-01-1', freq='MS', periods=9, tz=None) + dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None) arr = DatetimeArray(dti) assert arr.freq == dti.freq assert arr.tz == dti.tz @@ -113,7 +124,7 @@ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): right = dti expected = np.ones(len(arr), dtype=bool) - if opname in ['ne', 'gt', 'lt']: + if opname in ["ne", "gt", "lt"]: # for these the comparisons should be all-False expected = ~expected @@ -131,50 +142,45 @@ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): class TestDatetimeArray: def test_astype_to_same(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr - @pytest.mark.parametrize("dtype", [ - int, np.int32, np.int64, 'uint32', 'uint64', - ]) + 
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = DatetimeArray._from_sequence([pd.Timestamp('2000'), - pd.Timestamp('2001')]) + arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype tm.assert_numpy_array_equal(result, expected) def test_tz_setter_raises(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - with pytest.raises(AttributeError, match='tz_localize'): - arr.tz = 'UTC' + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(AttributeError, match="tz_localize"): + arr.tz = "UTC" def test_setitem_different_tz_raises(self): - data = np.array([1, 2, 3], dtype='M8[ns]') - arr = DatetimeArray(data, copy=False, - dtype=DatetimeTZDtype(tz="US/Central")) + data = np.array([1, 2, 3], dtype="M8[ns]") + arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) with pytest.raises(ValueError, match="None"): - arr[0] = pd.Timestamp('2000') + arr[0] = pd.Timestamp("2000") with pytest.raises(ValueError, match="US/Central"): - arr[0] = pd.Timestamp('2000', tz="US/Eastern") + arr[0] = pd.Timestamp("2000", tz="US/Eastern") def test_setitem_clears_freq(self): - a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', - tz='US/Central')) + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) a[0] = pd.Timestamp("2000", tz="US/Central") assert a.freq is None def test_repeat_preserves_tz(self): - dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central') + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti) repeated = arr.repeat([1, 1]) @@ -184,7 +190,7 @@ def test_repeat_preserves_tz(self): tm.assert_equal(repeated, expected) def test_value_counts_preserves_tz(self): - dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central') + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti).repeat([4, 3]) result = arr.value_counts() @@ -194,20 +200,18 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], - index=[pd.NaT, dti[0], dti[1]]) + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('method', ['pad', 'backfill']) + @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): - dti = pd.date_range('2000-01-01', periods=5, freq='D', tz='US/Central') + dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") arr = DatetimeArray(dti, copy=True) arr[2] = pd.NaT - fill_val = dti[1] if method == 'pad' else dti[3] + fill_val = dti[1] if method == "pad" else dti[3] expected = DatetimeArray._from_sequence( - [dti[0], dti[1], fill_val, dti[3], dti[4]], - freq=None, tz='US/Central' + [dti[0], dti[1], fill_val, dti[3], dti[4]], freq=None, tz="US/Central" ) result = arr.fillna(method=method) @@ -215,75 +219,82 @@ def test_fillna_preserves_tz(self, method): # assert that arr and dti were not modified in-place assert arr[2] is pd.NaT - assert dti[2] == pd.Timestamp('2000-01-03', tz='US/Central') + assert dti[2] == 
pd.Timestamp("2000-01-03", tz="US/Central") def test_array_interface_tz(self): tz = "US/Central" - data = DatetimeArray(pd.date_range('2017', periods=2, tz=tz)) + data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) result = np.asarray(data) - expected = np.array([pd.Timestamp('2017-01-01T00:00:00', tz=tz), - pd.Timestamp('2017-01-02T00:00:00', tz=tz)], - dtype=object) + expected = np.array( + [ + pd.Timestamp("2017-01-01T00:00:00", tz=tz), + pd.Timestamp("2017-01-02T00:00:00", tz=tz), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) result = np.asarray(data, dtype=object) tm.assert_numpy_array_equal(result, expected) - result = np.asarray(data, dtype='M8[ns]') + result = np.asarray(data, dtype="M8[ns]") - expected = np.array(['2017-01-01T06:00:00', - '2017-01-02T06:00:00'], dtype="M8[ns]") + expected = np.array( + ["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]" + ) tm.assert_numpy_array_equal(result, expected) def test_array_interface(self): - data = DatetimeArray(pd.date_range('2017', periods=2)) - expected = np.array(['2017-01-01T00:00:00', '2017-01-02T00:00:00'], - dtype='datetime64[ns]') + data = DatetimeArray(pd.date_range("2017", periods=2)) + expected = np.array( + ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]" + ) result = np.asarray(data) tm.assert_numpy_array_equal(result, expected) result = np.asarray(data, dtype=object) - expected = np.array([pd.Timestamp('2017-01-01T00:00:00'), - pd.Timestamp('2017-01-02T00:00:00')], - dtype=object) + expected = np.array( + [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) class TestSequenceToDT64NS: - def test_tz_dtype_mismatch_raises(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - with pytest.raises(TypeError, match='data is already tz-aware'): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(TypeError, match="data is already tz-aware"): sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC")) def test_tz_dtype_matches(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - result, _, _ = sequence_to_dt64ns( - arr, dtype=DatetimeTZDtype(tz="US/Central")) + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central")) tm.assert_numpy_array_equal(arr._data, result) class TestReductions: - @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_min_max(self, tz): - arr = DatetimeArray._from_sequence([ - '2000-01-03', - '2000-01-03', - 'NaT', - '2000-01-02', - '2000-01-05', - '2000-01-04', - ], tz=tz) + arr = DatetimeArray._from_sequence( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + tz=tz, + ) result = arr.min() - expected = pd.Timestamp('2000-01-02', tz=tz) + expected = pd.Timestamp("2000-01-02", tz=tz) assert result == expected result = arr.max() - expected = pd.Timestamp('2000-01-05', tz=tz) + expected = pd.Timestamp("2000-01-05", tz=tz) assert result == expected result = arr.min(skipna=False) @@ -293,7 +304,7 @@ def test_min_max(self, tz): assert result is pd.NaT @pytest.mark.parametrize("tz", [None, "US/Central"]) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna, tz): arr = DatetimeArray._from_sequence([], tz=tz) result = arr.min(skipna=skipna) diff --git 
a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index fb62a90a6007e0..c01b52456ff877 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -7,22 +7,35 @@
 from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
 from pandas.core.arrays import IntegerArray, integer_array
 from pandas.core.arrays.integer import (
-    Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype,
-    UInt32Dtype, UInt64Dtype)
+    Int8Dtype,
+    Int16Dtype,
+    Int32Dtype,
+    Int64Dtype,
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+)
 from pandas.tests.extension.base import BaseOpsUtil
 import pandas.util.testing as tm
 
 
 def make_data():
-    return (list(range(8)) +
-            [np.nan] +
-            list(range(10, 98)) +
-            [np.nan] +
-            [99, 100])
-
-
-@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
-                        UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype])
+    return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100]
+
+
+@pytest.fixture(
+    params=[
+        Int8Dtype,
+        Int16Dtype,
+        Int32Dtype,
+        Int64Dtype,
+        UInt8Dtype,
+        UInt16Dtype,
+        UInt32Dtype,
+        UInt64Dtype,
+    ]
+)
 def dtype(request):
     return request.param()
 
@@ -37,12 +50,12 @@ def data_missing(dtype):
     return integer_array([np.nan, 1], dtype=dtype)
 
 
-@pytest.fixture(params=['data', 'data_missing'])
+@pytest.fixture(params=["data", "data_missing"])
 def all_data(request, data, data_missing):
     """Parametrized fixture giving 'data' and 'data_missing'"""
-    if request.param == 'data':
+    if request.param == "data":
         return data
-    elif request.param == 'data_missing':
+    elif request.param == "data_missing":
         return data_missing
 
 
@@ -50,33 +63,32 @@ def test_dtypes(dtype):
     # smoke tests on auto dtype construction
 
     if dtype.is_signed_integer:
-        assert np.dtype(dtype.type).kind == 'i'
+        assert np.dtype(dtype.type).kind == "i"
     else:
-        assert np.dtype(dtype.type).kind == 'u'
+        assert np.dtype(dtype.type).kind == "u"
     assert dtype.name is not None
 
 
-@pytest.mark.parametrize('dtype, expected', [
-    (Int8Dtype(), 'Int8Dtype()'),
-    (Int16Dtype(), 'Int16Dtype()'),
-    (Int32Dtype(), 'Int32Dtype()'),
-    (Int64Dtype(), 'Int64Dtype()'),
-    (UInt8Dtype(), 'UInt8Dtype()'),
-    (UInt16Dtype(), 'UInt16Dtype()'),
-    (UInt32Dtype(), 'UInt32Dtype()'),
-    (UInt64Dtype(), 'UInt64Dtype()'),
-])
+@pytest.mark.parametrize(
+    "dtype, expected",
+    [
+        (Int8Dtype(), "Int8Dtype()"),
+        (Int16Dtype(), "Int16Dtype()"),
+        (Int32Dtype(), "Int32Dtype()"),
+        (Int64Dtype(), "Int64Dtype()"),
+        (UInt8Dtype(), "UInt8Dtype()"),
+        (UInt16Dtype(), "UInt16Dtype()"),
+        (UInt32Dtype(), "UInt32Dtype()"),
+        (UInt64Dtype(), "UInt64Dtype()"),
+    ],
+)
 def test_repr_dtype(dtype, expected):
     assert repr(dtype) == expected
 
 
 def test_repr_array():
     result = repr(integer_array([1, None, 3]))
-    expected = (
-        '<IntegerArray>\n'
-        '[1, NaN, 3]\n'
-        'Length: 3, dtype: Int64'
-    )
+    expected = "<IntegerArray>\n" "[1, NaN, 3]\n" "Length: 3, dtype: Int64"
    assert result == expected
 
 
@@ -94,14 +106,13 @@ def test_repr_array_long():
 
 
 class TestConstructors:
-
     def test_from_dtype_from_float(self, data):
         # construct from our dtype & string dtype
         dtype = data.dtype
 
         # from float
         expected = pd.Series(data)
-        result = pd.Series(np.array(data).astype('float'), dtype=str(dtype))
+        result = pd.Series(np.array(data).astype("float"), dtype=str(dtype))
         tm.assert_series_equal(result, expected)
 
         # from int / list
@@ -117,7 +128,6 @@ def test_from_dtype_from_float(self, data):
 
 
 class TestArithmeticOps(BaseOpsUtil):
-
     def _check_divmod_op(self, s, op, other, exc=None):
super()._check_divmod_op(s, op, other, None) @@ -137,23 +147,25 @@ def _check_op(self, s, op_name, other, exc=None): # other array is an Integer if isinstance(other, IntegerArray): - omask = getattr(other, 'mask', None) - mask = getattr(other, 'data', other) + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) if omask is not None: mask |= omask # 1 ** na is na, so need to unmask those - if op_name == '__pow__': + if op_name == "__pow__": mask = np.where(s == 1, False, mask) - elif op_name == '__rpow__': + elif op_name == "__rpow__": mask = np.where(other == 1, False, mask) # float result type or float op - if ((is_float_dtype(other) or is_float(other) or - op_name in ['__rtruediv__', '__truediv__', - '__rdiv__', '__div__'])): - rs = s.astype('float') + if ( + is_float_dtype(other) + or is_float(other) + or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] + ): + rs = s.astype("float") expected = op(rs, other) self._check_op_float(result, expected, mask, s, op_name, other) @@ -183,15 +195,14 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # mod/rmod turn floating 0 into NaN while # integer works as expected (no nan) - if op_name in ['__mod__', '__rmod__']: + if op_name in ["__mod__", "__rmod__"]: if is_scalar(other): if other == 0: expected[s.values == 0] = 0 else: expected = expected.fillna(0) else: - expected[(s.values == 0) & - ((expected == 0) | expected.isna())] = 0 + expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 try: expected[(expected == np.inf) | (expected == -np.inf)] = fill_value original = expected @@ -213,13 +224,13 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # we need to fill with 0's to emulate what an astype('int') does # (truncation) for certain ops - if op_name in ['__rtruediv__', '__rdiv__']: + if op_name in ["__rtruediv__", "__rdiv__"]: mask |= original.isna() - original = original.fillna(0).astype('int') + original = original.fillna(0).astype("int") - original = original.astype('float') + original = original.astype("float") original[mask] = np.nan - tm.assert_series_equal(original, expected.astype('float')) + tm.assert_series_equal(original, expected.astype("float")) # assert our expected result tm.assert_series_equal(result, expected) @@ -246,7 +257,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({'A': data}) + df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): @@ -265,15 +276,15 @@ def test_arith_coerce_scalar(self, data, all_arithmetic_operators): other = 0.01 self._check_op(s, op, other) - @pytest.mark.parametrize("other", [1., 1.0, np.array(1.), np.array([1.])]) + @pytest.mark.parametrize("other", [1.0, 1.0, np.array(1.0), np.array([1.0])]) def test_arithmetic_conversion(self, all_arithmetic_operators, other): # if we have a float operand we should have a float result # if that is equal to an integer op = self.get_op_from_name(all_arithmetic_operators) - s = pd.Series([1, 2, 3], dtype='Int64') + s = pd.Series([1, 2, 3], dtype="Int64") result = op(s, other) - assert result.dtype is np.dtype('float') + assert result.dtype is np.dtype("float") @pytest.mark.parametrize("other", [0, 0.5]) def test_arith_zero_dim_ndarray(self, other): @@ -292,23 +303,23 @@ def test_error(self, data, all_arithmetic_operators): # invalid scalars with pytest.raises(TypeError): - 
ops('foo') + ops("foo") with pytest.raises(TypeError): - ops(pd.Timestamp('20180101')) + ops(pd.Timestamp("20180101")) # invalid array-likes with pytest.raises(TypeError): - ops(pd.Series('foo', index=s.index)) + ops(pd.Series("foo", index=s.index)) - if op != '__rpow__': + if op != "__rpow__": # TODO(extension) # rpow with a datetimelike coerces the integer array incorrectly with pytest.raises(TypeError): - ops(pd.Series(pd.date_range('20180101', periods=len(s)))) + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) # 2d with pytest.raises(NotImplementedError): - opa(pd.DataFrame({'A': s})) + opa(pd.DataFrame({"A": s})) with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) @@ -329,7 +340,6 @@ def test_rpow_one_to_na(self): class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): op = self.get_op_from_name(op_name) @@ -338,7 +348,7 @@ def _compare_other(self, data, op_name, other): expected = pd.Series(op(data._data, other)) # fill the nan locations - expected[data._mask] = op_name == '__ne__' + expected[data._mask] = op_name == "__ne__" tm.assert_series_equal(result, expected) @@ -350,7 +360,7 @@ def _compare_other(self, data, op_name, other): expected = op(expected, other) # fill the nan locations - expected[data._mask] = op_name == '__ne__' + expected[data._mask] = op_name == "__ne__" tm.assert_series_equal(result, expected) @@ -367,7 +377,7 @@ def test_compare_array(self, data, all_compare_operators): class TestCasting: pass - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_construct_index(self, all_data, dropna): # ensure that we do not coerce to Float64Index, rather # keep as Index @@ -383,7 +393,7 @@ def test_construct_index(self, all_data, dropna): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_astype_index(self, all_data, dropna): # as an int/uint index to Index @@ -423,8 +433,7 @@ def test_astype(self, all_data): # coerce to same numpy_dtype - ints s = pd.Series(ints) result = s.astype(all_data.dtype.numpy_dtype) - expected = pd.Series(ints._data.astype( - all_data.dtype.numpy_dtype)) + expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) tm.assert_series_equal(result, expected) # coerce to same type - mixed @@ -446,19 +455,18 @@ def test_astype(self, all_data): # coerce to object s = pd.Series(mixed) - result = s.astype('object') + result = s.astype("object") expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [Int8Dtype(), 'Int8', - UInt32Dtype(), 'UInt32']) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): - s = pd.Series([1, 2, 3], dtype='Int64') + s = pd.Series([1, 2, 3], dtype="Int64") result = s.astype(dtype) expected = pd.Series([1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) - s = pd.Series([1, 2, 3, None], dtype='Int64') + s = pd.Series([1, 2, 3, None], dtype="Int64") result = s.astype(dtype) expected = pd.Series([1, 2, 3, None], dtype=dtype) tm.assert_series_equal(result, expected) @@ -483,24 +491,24 @@ def test_construct_cast_invalid(self, dtype): def test_frame_repr(data_missing): - df = pd.DataFrame({'A': data_missing}) + df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = ' A\n0 NaN\n1 1' + expected = " A\n0 NaN\n1 1" assert result == 
expected def test_conversions(data_missing): # astype to object series - df = pd.DataFrame({'A': data_missing}) - result = df['A'].astype('object') - expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A') + df = pd.DataFrame({"A": data_missing}) + result = df["A"].astype("object") + expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") tm.assert_series_equal(result, expected) # convert to object ndarray # we assert that we are exactly equal # including type conversions of scalars - result = df['A'].astype('object').values + result = df["A"].astype("object").values expected = np.array([np.nan, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -516,11 +524,11 @@ def test_conversions(data_missing): def test_integer_array_constructor(): - values = np.array([1, 2, 3, 4], dtype='int64') - mask = np.array([False, False, False, True], dtype='bool') + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") result = IntegerArray(values, mask) - expected = integer_array([1, 2, 3, np.nan], dtype='int64') + expected = integer_array([1, 2, 3, np.nan], dtype="int64") tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError): @@ -536,12 +544,15 @@ def test_integer_array_constructor(): IntegerArray(values) -@pytest.mark.parametrize('a, b', [ - ([1, None], [1, np.nan]), - ([None], [np.nan]), - ([None, np.nan], [np.nan, np.nan]), - ([np.nan, np.nan], [np.nan, np.nan]), -]) +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) def test_integer_array_constructor_none_is_nan(a, b): result = integer_array(a) expected = integer_array(b) @@ -549,8 +560,8 @@ def test_integer_array_constructor_none_is_nan(a, b): def test_integer_array_constructor_copy(): - values = np.array([1, 2, 3, 4], dtype='int64') - mask = np.array([False, False, False, True], dtype='bool') + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") result = IntegerArray(values, mask) assert result._data is values @@ -562,17 +573,19 @@ def test_integer_array_constructor_copy(): @pytest.mark.parametrize( - 'values', + "values", [ - ['foo', 'bar'], - ['1', '2'], - 'foo', + ["foo", "bar"], + ["1", "2"], + "foo", 1, 1.0, - pd.date_range('20130101', periods=2), - np.array(['foo']), + pd.date_range("20130101", periods=2), + np.array(["foo"]), [[1, 2], [3, 4]], - [np.nan, {'a': 1}]]) + [np.nan, {"a": 1}], + ], +) def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays with pytest.raises(TypeError): @@ -581,9 +594,9 @@ def test_to_integer_array_error(values): def test_to_integer_array_inferred_dtype(): # if values has dtype -> respect it - result = integer_array(np.array([1, 2], dtype='int8')) + result = integer_array(np.array([1, 2], dtype="int8")) assert result.dtype == Int8Dtype() - result = integer_array(np.array([1, 2], dtype='int32')) + result = integer_array(np.array([1, 2], dtype="int32")) assert result.dtype == Int32Dtype() # if values have no dtype -> always int64 @@ -592,34 +605,36 @@ def test_to_integer_array_inferred_dtype(): def test_to_integer_array_dtype_keyword(): - result = integer_array([1, 2], dtype='int8') + result = integer_array([1, 2], dtype="int8") assert result.dtype == Int8Dtype() # if values has dtype -> override it - result = integer_array(np.array([1, 2], dtype='int8'), dtype='int32') + result = 
integer_array(np.array([1, 2], dtype="int8"), dtype="int32") assert result.dtype == Int32Dtype() def test_to_integer_array_float(): - result = integer_array([1., 2.]) + result = integer_array([1.0, 2.0]) expected = integer_array([1, 2]) tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): - integer_array([1.5, 2.]) + integer_array([1.5, 2.0]) # for float dtypes, the itemsize is not preserved - result = integer_array(np.array([1., 2.], dtype='float32')) + result = integer_array(np.array([1.0, 2.0], dtype="float32")) assert result.dtype == Int64Dtype() @pytest.mark.parametrize( - 'bool_values, int_values, target_dtype, expected_dtype', - [([False, True], [0, 1], Int64Dtype(), Int64Dtype()), - ([False, True], [0, 1], 'Int64', Int64Dtype()), - ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype())]) -def test_to_integer_array_bool(bool_values, int_values, target_dtype, - expected_dtype): + "bool_values, int_values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), + ([False, True], [0, 1], "Int64", Int64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), + ], +) +def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): result = integer_array(bool_values, dtype=target_dtype) assert result.dtype == expected_dtype expected = integer_array(int_values, dtype=target_dtype) @@ -627,11 +642,13 @@ def test_to_integer_array_bool(bool_values, int_values, target_dtype, @pytest.mark.parametrize( - 'values, to_dtype, result_dtype', + "values, to_dtype, result_dtype", [ - (np.array([1], dtype='int64'), None, Int64Dtype), + (np.array([1], dtype="int64"), None, Int64Dtype), (np.array([1, np.nan]), None, Int64Dtype), - (np.array([1, np.nan]), 'int8', Int8Dtype)]) + (np.array([1, np.nan]), "int8", Int8Dtype), + ], +) def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays result = integer_array(values, dtype=to_dtype) @@ -642,12 +659,16 @@ def test_to_integer_array(values, to_dtype, result_dtype): def test_cross_type_arithmetic(): - df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': pd.Series([1, np.nan, 3], dtype='UInt8'), - 'C': [1, 2, 3]}) + df = pd.DataFrame( + { + "A": pd.Series([1, 2, np.nan], dtype="Int64"), + "B": pd.Series([1, np.nan, 3], dtype="UInt8"), + "C": [1, 2, 3], + } + ) result = df.A + df.C - expected = pd.Series([2, 4, np.nan], dtype='Int64') + expected = pd.Series([2, 4, np.nan], dtype="Int64") tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 @@ -655,20 +676,22 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = df.A + df.B - expected = pd.Series([2, np.nan, np.nan], dtype='Int64') + expected = pd.Series([2, np.nan, np.nan], dtype="Int64") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod']) +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) def test_preserve_dtypes(op): # TODO(#22346): preserve Int64 dtype # for ops that enable (mean would actually work here # but generally it is a float return value) - df = pd.DataFrame({ - "A": ['a', 'b', 'b'], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype='Int64'), - }) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) # op result = getattr(df.C, op)() @@ -677,22 +700,24 @@ def 
test_preserve_dtypes(op): # groupby result = getattr(df.groupby("A"), op)() - expected = pd.DataFrame({ - "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64") - }, index=pd.Index(['a', 'b'], name='A')) + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op', ['mean']) +@pytest.mark.parametrize("op", ["mean"]) def test_reduce_to_float(op): # some reduce ops always return float, even if the result # is a rounded number - df = pd.DataFrame({ - "A": ['a', 'b', 'b'], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype='Int64'), - }) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) # op result = getattr(df.C, op)() @@ -701,10 +726,10 @@ def test_reduce_to_float(op): # groupby result = getattr(df.groupby("A"), op)() - expected = pd.DataFrame({ - "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64") - }, index=pd.Index(['a', 'b'], name='A')) + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) tm.assert_frame_equal(result, expected) @@ -714,11 +739,10 @@ def test_astype_nansafe(): msg = "cannot convert float NaN to integer" with pytest.raises(ValueError, match=msg): - arr.astype('uint32') + arr.astype("uint32") -@pytest.mark.parametrize( - 'ufunc', [np.abs, np.sign]) +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) def test_ufuncs_single_int(ufunc): a = integer_array([1, 2, -3, np.nan]) result = ufunc(a) @@ -731,24 +755,22 @@ def test_ufuncs_single_int(ufunc): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - 'ufunc', [np.log, np.exp, np.sin, np.cos, np.sqrt]) +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) def test_ufuncs_single_float(ufunc): a = integer_array([1, 2, -3, np.nan]) - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = ufunc(a) expected = ufunc(a.astype(float)) tm.assert_numpy_array_equal(result, expected) s = pd.Series(a) - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = ufunc(s) expected = ufunc(s.astype(float)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - 'ufunc', [np.add, np.subtract]) +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) def test_ufuncs_binary_int(ufunc): # two IntegerArrays a = integer_array([1, 2, -3, np.nan]) @@ -776,9 +798,7 @@ def test_ufuncs_binary_int(ufunc): tm.assert_extension_array_equal(result, expected) -@pytest.mark.parametrize('values', [ - [0, 1], [0, None] -]) +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) def test_ufunc_reduce_raises(values): a = integer_array(values) with pytest.raises(NotImplementedError): diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 5e4f6e376c1d35..c4c1696ede6e6f 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -11,15 +11,17 @@ import pandas.util.testing as tm -@pytest.fixture(params=[ - np.array(['a', 'b'], dtype=object), - np.array([0, 1], dtype=float), - np.array([0, 1], dtype=int), - np.array([0, 1 + 2j], dtype=complex), - np.array([True, False], dtype=bool), - np.array([0, 1], dtype='datetime64[ns]'), - np.array([0, 1], dtype='timedelta64[ns]'), -]) +@pytest.fixture( + params=[ + np.array(["a", "b"], 
dtype=object), + np.array([0, 1], dtype=float), + np.array([0, 1], dtype=int), + np.array([0, 1 + 2j], dtype=complex), + np.array([True, False], dtype=bool), + np.array([0, 1], dtype="datetime64[ns]"), + np.array([0, 1], dtype="timedelta64[ns]"), + ] +) def any_numpy_array(request): """ Parametrized fixture for NumPy arrays with different dtypes. @@ -32,35 +34,42 @@ def any_numpy_array(request): # ---------------------------------------------------------------------------- # PandasDtype -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', True), - ('uint', True), - ('float', True), - ('complex', True), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False), -]) + +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", True), + ("uint", True), + ("float", True), + ("complex", True), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) def test_is_numeric(dtype, expected): dtype = PandasDtype(dtype) assert dtype._is_numeric is expected -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', False), - ('uint', False), - ('float', False), - ('complex', False), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False) -]) +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", False), + ("uint", False), + ("float", False), + ("complex", False), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) def test_is_boolean(dtype, expected): dtype = PandasDtype(dtype) assert dtype._is_boolean is expected @@ -80,8 +89,9 @@ def test_constructor_from_string(): # ---------------------------------------------------------------------------- # Construction + def test_constructor_no_coercion(): - with pytest.raises(ValueError, match='NumPy array'): + with pytest.raises(ValueError, match="NumPy array"): PandasArray([1, 2, 3]) @@ -100,9 +110,9 @@ def test_series_constructor_with_astype(): def test_from_sequence_dtype(): - arr = np.array([1, 2, 3], dtype='int64') - result = PandasArray._from_sequence(arr, dtype='uint64') - expected = PandasArray(np.array([1, 2, 3], dtype='uint64')) + arr = np.array([1, 2, 3], dtype="int64") + result = PandasArray._from_sequence(arr, dtype="uint64") + expected = PandasArray(np.array([1, 2, 3], dtype="uint64")) tm.assert_extension_array_equal(result, expected) @@ -122,6 +132,7 @@ def test_constructor_with_data(any_numpy_array): # ---------------------------------------------------------------------------- # Conversion + def test_to_numpy(): arr = PandasArray(np.array([1, 2, 3])) result = arr.to_numpy() @@ -130,14 +141,15 @@ def test_to_numpy(): result = arr.to_numpy(copy=True) assert result is not arr._ndarray - result = arr.to_numpy(dtype='f8') - expected = np.array([1, 2, 3], dtype='f8') + result = arr.to_numpy(dtype="f8") + expected = np.array([1, 2, 3], dtype="f8") tm.assert_numpy_array_equal(result, expected) # ---------------------------------------------------------------------------- # Setitem + def test_setitem_series(): ser = pd.Series([1, 2, 3]) ser.array[0] = 10 @@ -158,8 +170,9 @@ def test_setitem(any_numpy_array): # ---------------------------------------------------------------------------- # Reductions + def test_bad_reduce_raises(): - arr = np.array([1, 2, 3], dtype='int64') + arr = np.array([1, 2, 3], dtype="int64") arr = PandasArray(arr) msg = "cannot 
perform not_a_method with type int" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index c27200e3273ee6..fab59d312fb9d2 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -20,21 +20,24 @@ def test_registered(): expected = PeriodDtype("D") assert result == expected + # ---------------------------------------------------------------------------- # period_array -@pytest.mark.parametrize("data, freq, expected", [ - ([pd.Period("2017", "D")], None, [17167]), - ([pd.Period("2017", "D")], "D", [17167]), - ([2017], "D", [17167]), - (["2017"], "D", [17167]), - ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]), - ([pd.Period("2017", "D"), None], None, [17167, iNaT]), - (pd.Series(pd.date_range("2017", periods=3)), None, - [17167, 17168, 17169]), - (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), -]) +@pytest.mark.parametrize( + "data, freq, expected", + [ + ([pd.Period("2017", "D")], None, [17167]), + ([pd.Period("2017", "D")], "D", [17167]), + ([2017], "D", [17167]), + (["2017"], "D", [17167]), + ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]), + ([pd.Period("2017", "D"), None], None, [17167, iNaT]), + (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]), + (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), + ], +) def test_period_array_ok(data, freq, expected): result = period_array(data, freq=freq).asi8 expected = np.asarray(expected, dtype=np.int64) @@ -43,8 +46,8 @@ def test_period_array_ok(data, freq, expected): def test_period_array_readonly_object(): # https://github.com/pandas-dev/pandas/issues/25403 - pa = period_array([pd.Period('2019-01-01')]) - arr = np.asarray(pa, dtype='object') + pa = period_array([pd.Period("2019-01-01")]) + arr = np.asarray(pa, dtype="object") arr.setflags(write=False) result = period_array(arr) @@ -61,20 +64,21 @@ def test_from_datetime64_freq_changes(): # https://github.com/pandas-dev/pandas/issues/23438 arr = pd.date_range("2017", periods=3, freq="D") result = PeriodArray._from_datetime64(arr, freq="M") - expected = period_array(['2017-01-01', '2017-01-01', '2017-01-01'], - freq="M") + expected = period_array(["2017-01-01", "2017-01-01", "2017-01-01"], freq="M") tm.assert_period_array_equal(result, expected) -@pytest.mark.parametrize("data, freq, msg", [ - ([pd.Period('2017', 'D'), - pd.Period('2017', 'A')], - None, - "Input has different freq"), - ([pd.Period('2017', 'D')], - "A", - "Input has different freq"), -]) +@pytest.mark.parametrize( + "data, freq, msg", + [ + ( + [pd.Period("2017", "D"), pd.Period("2017", "A")], + None, + "Input has different freq", + ), + ([pd.Period("2017", "D")], "A", "Input has different freq"), + ], +) def test_period_array_raises(data, freq, msg): with pytest.raises(IncompatibleFrequency, match=msg): period_array(data, freq) @@ -82,48 +86,45 @@ def test_period_array_raises(data, freq, msg): def test_period_array_non_period_series_raies(): ser = pd.Series([1, 2, 3]) - with pytest.raises(TypeError, match='dtype'): - PeriodArray(ser, freq='D') + with pytest.raises(TypeError, match="dtype"): + PeriodArray(ser, freq="D") def test_period_array_freq_mismatch(): - arr = period_array(['2000', '2001'], freq='D') - with pytest.raises(IncompatibleFrequency, match='freq'): - PeriodArray(arr, freq='M') + arr = period_array(["2000", "2001"], freq="D") + with pytest.raises(IncompatibleFrequency, match="freq"): + PeriodArray(arr, freq="M") - with 
pytest.raises(IncompatibleFrequency, match='freq'): + with pytest.raises(IncompatibleFrequency, match="freq"): PeriodArray(arr, freq=pd.tseries.offsets.MonthEnd()) def test_asi8(): - result = period_array(['2000', '2001', None], freq='D').asi8 + result = period_array(["2000", "2001", None], freq="D").asi8 expected = np.array([10957, 11323, iNaT]) tm.assert_numpy_array_equal(result, expected) def test_take_raises(): - arr = period_array(['2000', '2001'], freq='D') - with pytest.raises(IncompatibleFrequency, match='freq'): - arr.take([0, -1], allow_fill=True, - fill_value=pd.Period('2000', freq='W')) + arr = period_array(["2000", "2001"], freq="D") + with pytest.raises(IncompatibleFrequency, match="freq"): + arr.take([0, -1], allow_fill=True, fill_value=pd.Period("2000", freq="W")) - with pytest.raises(ValueError, match='foo'): - arr.take([0, -1], allow_fill=True, fill_value='foo') + with pytest.raises(ValueError, match="foo"): + arr.take([0, -1], allow_fill=True, fill_value="foo") -@pytest.mark.parametrize('dtype', [ - int, np.int32, np.int64, 'uint32', 'uint64', -]) +@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype(dtype): # We choose to ignore the sign and size of integers for # Period/Datetime/Timedelta astype - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype @@ -131,7 +132,7 @@ def test_astype(dtype): def test_astype_copies(): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(np.int64, copy=False) # Add the `.base`, since we now use `.asi8` which returns a view. # We could maybe override it in PeriodArray to return ._data directly. @@ -139,42 +140,40 @@ def test_astype_copies(): result = arr.astype(np.int64, copy=True) assert result is not arr._data - tm.assert_numpy_array_equal(result, arr._data.view('i8')) + tm.assert_numpy_array_equal(result, arr._data.view("i8")) def test_astype_categorical(): - arr = period_array(['2000', '2001', '2001', None], freq='D') - result = arr.astype('category') - categories = pd.PeriodIndex(['2000', '2001'], freq='D') + arr = period_array(["2000", "2001", "2001", None], freq="D") + result = arr.astype("category") + categories = pd.PeriodIndex(["2000", "2001"], freq="D") expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories) tm.assert_categorical_equal(result, expected) def test_astype_period(): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(PeriodDtype("M")) - expected = period_array(['2000', '2001', None], freq='M') + expected = period_array(["2000", "2001", None], freq="M") tm.assert_period_array_equal(result, expected) -@pytest.mark.parametrize('other', [ - 'datetime64[ns]', 'timedelta64[ns]', -]) +@pytest.mark.parametrize("other", ["datetime64[ns]", "timedelta64[ns]"]) def test_astype_datetime(other): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") # slice off the [ns] so that the regex matches. 
with pytest.raises(TypeError, match=other[:-4]):
         arr.astype(other)
 
 
 def test_fillna_raises():
-    arr = period_array(['2000', '2001', '2002'], freq='D')
-    with pytest.raises(ValueError, match='Length'):
+    arr = period_array(["2000", "2001", "2002"], freq="D")
+    with pytest.raises(ValueError, match="Length"):
         arr.fillna(arr[:2])
 
 
 def test_fillna_copies():
-    arr = period_array(['2000', '2001', '2002'], freq='D')
+    arr = period_array(["2000", "2001", "2002"], freq="D")
     result = arr.fillna(pd.Period("2000", "D"))
     assert result is not arr
 
@@ -182,16 +181,21 @@ def test_fillna_copies():
 # ----------------------------------------------------------------------------
 # setitem
 
-@pytest.mark.parametrize('key, value, expected', [
-    ([0], pd.Period("2000", "D"), [10957, 1, 2]),
-    ([0], None, [iNaT, 1, 2]),
-    ([0], np.nan, [iNaT, 1, 2]),
-    ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
-    ([0, 1, 2], [pd.Period("2000", "D"),
-                 pd.Period("2001", "D"),
-                 pd.Period("2002", "D")],
-     [10957, 11323, 11688]),
-])
+
+@pytest.mark.parametrize(
+    "key, value, expected",
+    [
+        ([0], pd.Period("2000", "D"), [10957, 1, 2]),
+        ([0], None, [iNaT, 1, 2]),
+        ([0], np.nan, [iNaT, 1, 2]),
+        ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
+        (
+            [0, 1, 2],
+            [pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")],
+            [10957, 11323, 11688],
+        ),
+    ],
+)
 def test_setitem(key, value, expected):
     arr = PeriodArray(np.arange(3), freq="D")
     expected = PeriodArray(expected, freq="D")
@@ -204,7 +208,7 @@ def test_setitem_raises_incompatible_freq():
     with pytest.raises(IncompatibleFrequency, match="freq"):
         arr[0] = pd.Period("2000", freq="A")
 
-    other = period_array(['2000', '2001'], freq='A')
+    other = period_array(["2000", "2001"], freq="A")
     with pytest.raises(IncompatibleFrequency, match="freq"):
         arr[[0, 1]] = other
 
@@ -224,8 +228,9 @@ def test_setitem_raises_type():
 # ----------------------------------------------------------------------------
 # Ops
 
+
 def test_sub_period():
-    arr = period_array(['2000', '2001'], freq='D')
+    arr = period_array(["2000", "2001"], freq="D")
     other = pd.Period("2000", freq="M")
     with pytest.raises(IncompatibleFrequency, match="freq"):
         arr - other
@@ -234,12 +239,13 @@ def test_sub_period():
 # ----------------------------------------------------------------------------
 # Methods
 
-@pytest.mark.parametrize('other', [
-    pd.Period('2000', freq='H'),
-    period_array(['2000', '2001', '2000'], freq='H')
-])
+
+@pytest.mark.parametrize(
+    "other",
+    [pd.Period("2000", freq="H"), period_array(["2000", "2001", "2000"], freq="H")],
+)
 def test_where_different_freq_raises(other):
-    ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D'))
+    ser = pd.Series(period_array(["2000", "2001", "2002"], freq="D"))
     cond = np.array([True, False, True])
     with pytest.raises(IncompatibleFrequency, match="freq"):
         ser.where(cond, other)
@@ -248,19 +254,18 @@ def test_where_different_freq_raises(other):
 # ----------------------------------------------------------------------------
 # Printing
 
+
 def test_repr_small():
-    arr = period_array(['2000', '2001'], freq='D')
+    arr = period_array(["2000", "2001"], freq="D")
     result = str(arr)
     expected = (
-        "<PeriodArray>\n"
-        "['2000-01-01', '2001-01-01']\n"
-        "Length: 2, dtype: period[D]"
+        "<PeriodArray>\n" "['2000-01-01', '2001-01-01']\n" "Length: 2, dtype: period[D]"
     )
     assert result == expected
 
 
 def test_repr_large():
-    arr = period_array(['2000', '2001'] * 500, freq='D')
+    arr = period_array(["2000", "2001"] * 500, freq="D")
     result = str(arr)
     expected = (
         "<PeriodArray>\n"
@@ -281,24 +286,27 @@ def 
test_repr_large(): # ---------------------------------------------------------------------------- # Reductions -class TestReductions: +class TestReductions: def test_min_max(self): - arr = period_array([ - '2000-01-03', - '2000-01-03', - 'NaT', - '2000-01-02', - '2000-01-05', - '2000-01-04', - ], freq='D') + arr = period_array( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + freq="D", + ) result = arr.min() - expected = pd.Period('2000-01-02', freq='D') + expected = pd.Period("2000-01-02", freq="D") assert result == expected result = arr.max() - expected = pd.Period('2000-01-05', freq='D') + expected = pd.Period("2000-01-05", freq="D") assert result == expected result = arr.min(skipna=False) @@ -307,9 +315,9 @@ def test_min_max(self): result = arr.max(skipna=False) assert result is pd.NaT - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna): - arr = period_array([], freq='D') + arr = period_array([], freq="D") result = arr.min(skipna=skipna) assert result is pd.NaT diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 87f32ef101fa95..5825f9f150eb88 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -9,7 +9,7 @@ class TestTimedeltaArrayConstructor: def test_only_1dim_accepted(self): # GH#25282 - arr = np.array([0, 1, 2, 3], dtype='m8[h]').astype('m8[ns]') + arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): # 2-dim @@ -21,37 +21,38 @@ def test_only_1dim_accepted(self): def test_freq_validation(self): # ensure that the public constructor cannot create an invalid instance - arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9 + arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10 ** 9 - msg = ("Inferred frequency None from passed values does not " - "conform to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not " + "conform to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr.view('timedelta64[ns]'), freq="D") + TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") def test_non_array_raises(self): - with pytest.raises(ValueError, match='list'): + with pytest.raises(ValueError, match="list"): TimedeltaArray([1, 2, 3]) def test_other_type_raises(self): - with pytest.raises(ValueError, - match="dtype bool cannot be converted"): - TimedeltaArray(np.array([1, 2, 3], dtype='bool')) + with pytest.raises(ValueError, match="dtype bool cannot be converted"): + TimedeltaArray(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? 
- with pytest.raises(ValueError, - match=r'category cannot be converted ' - r'to timedelta64\[ns\]'): - TimedeltaArray(np.array([1, 2, 3], dtype='i8'), dtype='category') + with pytest.raises( + ValueError, match=r"category cannot be converted " r"to timedelta64\[ns\]" + ): + TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") - with pytest.raises(ValueError, - match=r"dtype int64 cannot be converted " - r"to timedelta64\[ns\]"): - TimedeltaArray(np.array([1, 2, 3], dtype='i8'), - dtype=np.dtype("int64")) + with pytest.raises( + ValueError, + match=r"dtype int64 cannot be converted " r"to timedelta64\[ns\]", + ): + TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) def test_copy(self): - data = np.array([1, 2, 3], dtype='m8[ns]') + data = np.array([1, 2, 3], dtype="m8[ns]") arr = TimedeltaArray(data, copy=False) assert arr._data is data @@ -63,7 +64,7 @@ def test_copy(self): class TestTimedeltaArray: def test_np_sum(self): # GH#25282 - vals = np.arange(5, dtype=np.int64).view('m8[h]').astype('m8[ns]') + vals = np.arange(5, dtype=np.int64).view("m8[h]").astype("m8[ns]") arr = TimedeltaArray(vals) result = np.sum(arr) assert result == vals.sum() @@ -77,27 +78,27 @@ def test_from_sequence_dtype(self): TimedeltaArray._from_sequence([], dtype=object) def test_abs(self): - vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") arr = TimedeltaArray(vals) - evals = np.array([3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + evals = np.array([3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") expected = TimedeltaArray(evals) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) def test_neg(self): - vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") arr = TimedeltaArray(vals) - evals = np.array([3600 * 10**9, 'NaT', -7200 * 10**9], dtype='m8[ns]') + evals = np.array([3600 * 10 ** 9, "NaT", -7200 * 10 ** 9], dtype="m8[ns]") expected = TimedeltaArray(evals) result = -arr tm.assert_timedelta_array_equal(result, expected) def test_neg_freq(self): - tdi = pd.timedelta_range('2 Days', periods=4, freq='H') + tdi = pd.timedelta_range("2 Days", periods=4, freq="H") arr = TimedeltaArray(tdi, freq=tdi.freq) expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) @@ -105,42 +106,36 @@ def test_neg_freq(self): result = -arr tm.assert_timedelta_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [ - int, np.int32, np.int64, 'uint32', 'uint64', - ]) + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = TimedeltaArray._from_sequence([pd.Timedelta('1H'), - pd.Timedelta('2H')]) + arr = TimedeltaArray._from_sequence([pd.Timedelta("1H"), pd.Timedelta("2H")]) result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range('1H', periods=2, freq='H')) + a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H")) a[0] = pd.Timedelta("1H") assert a.freq is None class TestReductions: - def test_min_max(self): - arr = 
TimedeltaArray._from_sequence([ - '3H', '3H', 'NaT', '2H', '5H', '4H', - ]) + arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) result = arr.min() - expected = pd.Timedelta('2H') + expected = pd.Timedelta("2H") assert result == expected result = arr.max() - expected = pd.Timedelta('5H') + expected = pd.Timedelta("5H") assert result == expected result = arr.min(skipna=False) @@ -149,7 +144,7 @@ def test_min_max(self): result = arr.max(skipna=False) assert result is pd.NaT - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna): arr = TimedeltaArray._from_sequence([]) result = arr.min(skipna=skipna) diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index 3b01851bd39caf..b3fbd8c17d8bfa 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -13,10 +13,12 @@ def test_compat(): # test we have compat with our version of nu from pandas.core.computation.check import _NUMEXPR_INSTALLED + try: import numexpr as ne + ver = ne.__version__ - if LooseVersion(ver) < LooseVersion(VERSIONS['numexpr']): + if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]): assert not _NUMEXPR_INSTALLED else: assert _NUMEXPR_INSTALLED @@ -24,22 +26,21 @@ def test_compat(): pytest.skip("not testing numexpr version compat") -@pytest.mark.parametrize('engine', _engines) -@pytest.mark.parametrize('parser', expr._parsers) +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", expr._parsers) def test_invalid_numexpr_version(engine, parser): def testit(): a, b = 1, 2 # noqa - res = pd.eval('a + b', engine=engine, parser=parser) + res = pd.eval("a + b", engine=engine, parser=parser) assert res == 3 - if engine == 'numexpr': + if engine == "numexpr": try: import numexpr as ne except ImportError: pytest.skip("no numexpr") else: - if (LooseVersion(ne.__version__) < - LooseVersion(VERSIONS['numexpr'])): + if LooseVersion(ne.__version__) < LooseVersion(VERSIONS["numexpr"]): with pytest.raises(ImportError): testit() else: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index ca78e2e40ec745..2fd7c8f04c8bec 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -20,26 +20,40 @@ from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr from pandas.core.computation.expr import PandasExprVisitor, PythonExprVisitor -from pandas.core.computation.expressions import ( - _NUMEXPR_INSTALLED, _USE_NUMEXPR) +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR from pandas.core.computation.ops import ( - _arith_ops_syms, _binary_math_ops, _binary_ops_dict, - _special_case_arith_ops_syms, _unary_math_ops) + _arith_ops_syms, + _binary_math_ops, + _binary_ops_dict, + _special_case_arith_ops_syms, + _unary_math_ops, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, assert_numpy_array_equal, assert_produces_warning, - assert_series_equal, makeCustomDataframe as mkdf, randbool) - - -@pytest.fixture(params=( - pytest.param(engine, - marks=pytest.mark.skipif( - engine == 'numexpr' and not _USE_NUMEXPR, - reason='numexpr enabled->{enabled}, ' - 'installed->{installed}'.format( - enabled=_USE_NUMEXPR, - installed=_NUMEXPR_INSTALLED))) - for engine in _engines)) # noqa + assert_frame_equal, + assert_numpy_array_equal, 
+ assert_produces_warning, + assert_series_equal, + makeCustomDataframe as mkdf, + randbool, +) + + +@pytest.fixture( + params=( + pytest.param( + engine, + marks=pytest.mark.skipif( + engine == "numexpr" and not _USE_NUMEXPR, + reason="numexpr enabled->{enabled}, " + "installed->{installed}".format( + enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED + ), + ), + ) + for engine in _engines + ) +) # noqa def engine(request): return request.param @@ -51,19 +65,18 @@ def parser(request): @pytest.fixture def ne_lt_2_6_9(): - if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion('2.6.9'): + if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"): pytest.skip("numexpr is >= 2.6.9") - return 'numexpr' + return "numexpr" @pytest.fixture def unary_fns_for_ne(): if _NUMEXPR_INSTALLED: - if _NUMEXPR_VERSION >= LooseVersion('2.6.9'): + if _NUMEXPR_VERSION >= LooseVersion("2.6.9"): return _unary_math_ops else: - return tuple(x for x in _unary_math_ops - if x not in ("floor", "ceil")) + return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) else: pytest.skip("numexpr is not present") @@ -78,23 +91,24 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): try: return c(lhs, rhs) except ValueError as e: - if str(e).startswith('negative number cannot be ' - 'raised to a fractional power'): + if str(e).startswith( + "negative number cannot be " "raised to a fractional power" + ): return np.nan raise return c(lhs, rhs) def _series_and_2d_ndarray(lhs, rhs): - return ((isinstance(lhs, Series) and - isinstance(rhs, np.ndarray) and rhs.ndim > 1) or - (isinstance(rhs, Series) and - isinstance(lhs, np.ndarray) and lhs.ndim > 1)) + return ( + isinstance(lhs, Series) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + ) or (isinstance(rhs, Series) and isinstance(lhs, np.ndarray) and lhs.ndim > 1) def _series_and_frame(lhs, rhs): - return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or - (isinstance(rhs, Series) and isinstance(lhs, DataFrame))) + return (isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or ( + isinstance(rhs, Series) and isinstance(lhs, DataFrame) + ) def _bool_and_frame(lhs, rhs): @@ -102,8 +116,7 @@ def _bool_and_frame(lhs, rhs): def _is_py3_complex_incompat(result, expected): - return (isinstance(expected, (complex, np.complexfloating)) and - np.isnan(result)) + return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result) _good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms) @@ -111,18 +124,18 @@ def _is_py3_complex_incompat(result, expected): @td.skip_if_no_ne class TestEvalNumexprPandas: - @classmethod def setup_class(cls): import numexpr as ne + cls.ne = ne - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" @classmethod def teardown_class(cls): del cls.engine, cls.parser - if hasattr(cls, 'ne'): + if hasattr(cls, "ne"): del cls.ne def setup_data(self): @@ -131,12 +144,20 @@ def setup_data(self): nan_df2 = DataFrame(rand(10, 5)) nan_df2[nan_df2 > 0.5] = np.nan - self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), nan_df1) - self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), nan_df2) - self.scalar_lhses = randn(), - self.scalar_rhses = randn(), + self.pandas_lhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), + nan_df1, + ) + self.pandas_rhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, 
np.nan, 5]), + nan_df2, + ) + self.scalar_lhses = (randn(),) + self.scalar_rhses = (randn(),) self.lhses = self.pandas_lhses + self.scalar_lhses self.rhses = self.pandas_rhses + self.scalar_rhses @@ -147,7 +168,7 @@ def setup_ops(self): self.bin_ops = expr._bool_ops_syms self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops - self.unary_ops = '-', '~', 'not ' + self.unary_ops = "-", "~", "not " def setup_method(self, method): self.setup_ops() @@ -159,27 +180,35 @@ def teardown_method(self, method): del self.pandas_rhses, self.pandas_lhses, self.current_engines @pytest.mark.slow - @pytest.mark.parametrize('cmp1', ['!=', '==', '<=', '>=', '<', '>'], - ids=['ne', 'eq', 'le', 'ge', 'lt', 'gt']) - @pytest.mark.parametrize('cmp2', ['>', '<'], ids=['gt', 'lt']) + @pytest.mark.parametrize( + "cmp1", + ["!=", "==", "<=", ">=", "<", ">"], + ids=["ne", "eq", "le", "ge", "lt", "gt"], + ) + @pytest.mark.parametrize("cmp2", [">", "<"], ids=["gt", "lt"]) def test_complex_cmp_ops(self, cmp1, cmp2): - for lhs, rhs, binop in product( - self.lhses, self.rhses, self.bin_ops): + for lhs, rhs, binop in product(self.lhses, self.rhses, self.bin_ops): lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) - expected = _eval_single_bin( - lhs_new, binop, rhs_new, self.engine) + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) - ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format( - cmp1=cmp1, binop=binop, cmp2=cmp2) + ex = "(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)".format( + cmp1=cmp1, binop=binop, cmp2=cmp2 + ) result = pd.eval(ex, engine=self.engine, parser=self.parser) self.check_equal(result, expected) def test_simple_cmp_ops(self): - bool_lhses = (DataFrame(randbool(size=(10, 5))), - Series(randbool((5,))), randbool()) - bool_rhses = (DataFrame(randbool(size=(10, 5))), - Series(randbool((5,))), randbool()) + bool_lhses = ( + DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), + randbool(), + ) + bool_rhses = ( + DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), + randbool(), + ) for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): self.check_simple_cmp_op(lhs, cmp_op, rhs) @@ -190,17 +219,17 @@ def test_binary_arith_ops(self): def test_modulus(self): for lhs, rhs in product(self.lhses, self.rhses): - self.check_modulus(lhs, '%', rhs) + self.check_modulus(lhs, "%", rhs) def test_floor_division(self): for lhs, rhs in product(self.lhses, self.rhses): - self.check_floor_division(lhs, '//', rhs) + self.check_floor_division(lhs, "//", rhs) @td.skip_if_windows def test_pow(self): # odd failure on win32 platform, so skip for lhs, rhs in product(self.lhses, self.rhses): - self.check_pow(lhs, '**', rhs) + self.check_pow(lhs, "**", rhs) @pytest.mark.slow def test_single_invert_op(self): @@ -215,9 +244,10 @@ def test_compound_invert_op(self): @pytest.mark.slow def test_chained_cmp_op(self): mids = self.lhses - cmp_ops = '<', '>' - for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, - mids, cmp_ops, self.rhses): + cmp_ops = "<", ">" + for lhs, cmp1, mid, cmp2, rhs in product( + self.lhses, cmp_ops, mids, cmp_ops, self.rhses + ): self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) def check_equal(self, result, expected): @@ -231,7 +261,6 @@ def check_equal(self, result, expected): assert result == expected def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - def check_operands(left, right, cmp_op): return _eval_single_bin(left, cmp_op, right, self.engine) @@ 
-239,42 +268,46 @@ def check_operands(left, right, cmp_op): rhs_new = check_operands(mid, rhs, cmp2) if lhs_new is not None and rhs_new is not None: - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) + ex2 = "lhs {0} mid and mid {1} rhs".format(cmp1, cmp2) + ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp1, cmp2) + expected = _eval_single_bin(lhs_new, "&", rhs_new, self.engine) for ex in (ex1, ex2, ex3): - result = pd.eval(ex, engine=self.engine, - parser=self.parser) + result = pd.eval(ex, engine=self.engine, parser=self.parser) tm.assert_almost_equal(result, expected) def check_simple_cmp_op(self, lhs, cmp1, rhs): - ex = 'lhs {0} rhs'.format(cmp1) - msg = (r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')bool(\]|')|" - "argument of type 'bool' is not iterable") - if cmp1 in ('in', 'not in') and not is_list_like(rhs): + ex = "lhs {0} rhs".format(cmp1) + msg = ( + r"only list-like( or dict-like)? objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')bool(\]|')|" + "argument of type 'bool' is not iterable" + ) + if cmp1 in ("in", "not in") and not is_list_like(rhs): with pytest.raises(TypeError, match=msg): - pd.eval(ex, engine=self.engine, parser=self.parser, - local_dict={'lhs': lhs, 'rhs': rhs}) + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) self.check_equal(result, expected) def check_binary_arith_op(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) tm.assert_almost_equal(result, expected) - ex = 'lhs {0} rhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) - nlhs = _eval_single_bin(lhs, arith1, rhs, - self.engine) + nlhs = _eval_single_bin(lhs, arith1, rhs, self.engine) self.check_alignment(result, nlhs, rhs, arith1) def check_alignment(self, result, nlhs, ghs, op): @@ -287,44 +320,51 @@ def check_alignment(self, result, nlhs, ghs, op): else: # direct numpy comparison - expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) + expected = self.ne.evaluate("nlhs {0} ghs".format(op)) tm.assert_numpy_array_equal(result.values, expected) # modulus, pow, and floor division require special casing def check_modulus(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs tm.assert_almost_equal(result, expected) - expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) + expected = self.ne.evaluate("expected {0} rhs".format(arith1)) if isinstance(result, (DataFrame, Series)): tm.assert_almost_equal(result.values, expected) else: tm.assert_almost_equal(result, expected.item()) def check_floor_division(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) - if self.engine == 'python': + if self.engine == "python": res = pd.eval(ex, engine=self.engine, parser=self.parser) 
expected = lhs // rhs self.check_equal(res, expected) else: - msg = (r"unsupported operand type\(s\) for //: 'VariableNode' and" - " 'VariableNode'") + msg = ( + r"unsupported operand type\(s\) for //: 'VariableNode' and" + " 'VariableNode'" + ) with pytest.raises(TypeError, match=msg): - pd.eval(ex, local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) + pd.eval( + ex, + local_dict={"lhs": lhs, "rhs": rhs}, + engine=self.engine, + parser=self.parser, + ) def get_expected_pow_result(self, lhs, rhs): try: - expected = _eval_single_bin(lhs, '**', rhs, self.engine) + expected = _eval_single_bin(lhs, "**", rhs, self.engine) except ValueError as e: - if str(e).startswith('negative number cannot be ' - 'raised to a fractional power'): - if self.engine == 'python': + if str(e).startswith( + "negative number cannot be " "raised to a fractional power" + ): + if self.engine == "python": pytest.skip(str(e)) else: expected = np.nan @@ -333,21 +373,25 @@ def get_expected_pow_result(self, lhs, rhs): return expected def check_pow(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) - if (is_scalar(lhs) and is_scalar(rhs) and - _is_py3_complex_incompat(result, expected)): + if ( + is_scalar(lhs) + and is_scalar(rhs) + and _is_py3_complex_incompat(result, expected) + ): with pytest.raises(AssertionError): tm.assert_numpy_array_equal(result, expected) else: tm.assert_almost_equal(result, expected) - ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + ex = "(lhs {0} rhs) {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = self.get_expected_pow_result( - self.get_expected_pow_result(lhs, rhs), rhs) + self.get_expected_pow_result(lhs, rhs), rhs + ) tm.assert_almost_equal(result, expected) def check_single_invert_op(self, lhs, cmp1, rhs): @@ -358,25 +402,32 @@ def check_single_invert_op(self, lhs, cmp1, rhs): except AttributeError: elb = np.array([bool(el)]) expected = ~elb - result = pd.eval('~elb', engine=self.engine, parser=self.parser) + result = pd.eval("~elb", engine=self.engine, parser=self.parser) tm.assert_almost_equal(expected, result) for engine in self.current_engines: - tm.assert_almost_equal(result, pd.eval('~elb', engine=engine, - parser=self.parser)) + tm.assert_almost_equal( + result, pd.eval("~elb", engine=engine, parser=self.parser) + ) def check_compound_invert_op(self, lhs, cmp1, rhs): - skip_these = 'in', 'not in' - ex = '~(lhs {0} rhs)'.format(cmp1) - - msg = (r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')float(\]|')|" - "argument of type 'float' is not iterable") + skip_these = "in", "not in" + ex = "~(lhs {0} rhs)".format(cmp1) + + msg = ( + r"only list-like( or dict-like)? 
objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')float(\]|')|" + "argument of type 'float' is not iterable" + ) if is_scalar(rhs) and cmp1 in skip_these: with pytest.raises(TypeError, match=msg): - pd.eval(ex, engine=self.engine, parser=self.parser, - local_dict={'lhs': lhs, 'rhs': rhs}) + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) else: # compound if is_scalar(lhs) and is_scalar(rhs): @@ -394,17 +445,17 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): ev = pd.eval(ex, engine=self.engine, parser=self.parser) tm.assert_almost_equal(ev, result) - def ex(self, op, var_name='lhs'): - return '{0}{1}'.format(op, var_name) + def ex(self, op, var_name="lhs"): + return "{0}{1}".format(op, var_name) def test_frame_invert(self): - expr = self.ex('~') + expr = self.ex("~") # ~ ## # frame # float always raises lhs = DataFrame(randn(5, 2)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -413,7 +464,7 @@ def test_frame_invert(self): # int raises on numexpr lhs = DataFrame(randint(5, size=(5, 2))) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -428,8 +479,8 @@ def test_frame_invert(self): assert_frame_equal(expect, result) # object raises - lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) - if self.engine == 'numexpr': + lhs = DataFrame({"b": ["a", 1, 2.0], "c": rand(3) > 0.5}) + if self.engine == "numexpr": with pytest.raises(ValueError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -438,12 +489,12 @@ def test_frame_invert(self): def test_series_invert(self): # ~ #### - expr = self.ex('~') + expr = self.ex("~") # series # float raises lhs = Series(randn(5)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -452,7 +503,7 @@ def test_series_invert(self): # int raises on numexpr lhs = Series(randint(5, size=5)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -471,8 +522,8 @@ def test_series_invert(self): # bool # object - lhs = Series(['a', 1, 2.0]) - if self.engine == 'numexpr': + lhs = Series(["a", 1, 2.0]) + if self.engine == "numexpr": with pytest.raises(ValueError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -480,7 +531,7 @@ def test_series_invert(self): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_frame_negate(self): - expr = self.ex('-') + expr = self.ex("-") # float lhs = DataFrame(randn(5, 2)) @@ -496,7 +547,7 @@ def test_frame_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -505,7 +556,7 @@ def test_frame_negate(self): assert_frame_equal(expect, result) def test_series_negate(self): - expr = self.ex('-') + expr = self.ex("-") # float lhs = Series(randn(5)) @@ -521,7 +572,7 @@ def test_series_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 
0.5) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -530,7 +581,7 @@ def test_series_negate(self): assert_series_equal(expect, result) def test_frame_pos(self): - expr = self.ex('+') + expr = self.ex("+") # float lhs = DataFrame(randn(5, 2)) @@ -551,7 +602,7 @@ def test_frame_pos(self): assert_frame_equal(expect, result) def test_series_pos(self): - expr = self.ex('+') + expr = self.ex("+") # float lhs = Series(randn(5)) @@ -573,57 +624,63 @@ def test_series_pos(self): def test_scalar_unary(self): with pytest.raises(TypeError): - pd.eval('~1.0', engine=self.engine, parser=self.parser) - - assert pd.eval('-1.0', parser=self.parser, - engine=self.engine) == -1.0 - assert pd.eval('+1.0', parser=self.parser, - engine=self.engine) == +1.0 - assert pd.eval('~1', parser=self.parser, - engine=self.engine) == ~1 - assert pd.eval('-1', parser=self.parser, - engine=self.engine) == -1 - assert pd.eval('+1', parser=self.parser, - engine=self.engine) == +1 - assert pd.eval('~True', parser=self.parser, - engine=self.engine) == ~True - assert pd.eval('~False', parser=self.parser, - engine=self.engine) == ~False - assert pd.eval('-True', parser=self.parser, - engine=self.engine) == -True - assert pd.eval('-False', parser=self.parser, - engine=self.engine) == -False - assert pd.eval('+True', parser=self.parser, - engine=self.engine) == +True - assert pd.eval('+False', parser=self.parser, - engine=self.engine) == +False + pd.eval("~1.0", engine=self.engine, parser=self.parser) + + assert pd.eval("-1.0", parser=self.parser, engine=self.engine) == -1.0 + assert pd.eval("+1.0", parser=self.parser, engine=self.engine) == +1.0 + assert pd.eval("~1", parser=self.parser, engine=self.engine) == ~1 + assert pd.eval("-1", parser=self.parser, engine=self.engine) == -1 + assert pd.eval("+1", parser=self.parser, engine=self.engine) == +1 + assert pd.eval("~True", parser=self.parser, engine=self.engine) == ~True + assert pd.eval("~False", parser=self.parser, engine=self.engine) == ~False + assert pd.eval("-True", parser=self.parser, engine=self.engine) == -True + assert pd.eval("-False", parser=self.parser, engine=self.engine) == -False + assert pd.eval("+True", parser=self.parser, engine=self.engine) == +True + assert pd.eval("+False", parser=self.parser, engine=self.engine) == +False def test_unary_in_array(self): # GH 11235 assert_numpy_array_equal( - pd.eval('[-True, True, ~True, +True,' - '-False, False, ~False, +False,' - '-37, 37, ~37, +37]'), - np.array([-True, True, ~True, +True, - -False, False, ~False, +False, - -37, 37, ~37, +37], dtype=np.object_)) - - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + pd.eval( + "[-True, True, ~True, +True," + "-False, False, ~False, +False," + "-37, 37, ~37, +37]" + ), + np.array( + [ + -True, + True, + ~True, + +True, + -False, + False, + ~False, + +False, + -37, + 37, + ~37, + +37, + ], + dtype=np.object_, + ), + ) + + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_float_comparison_bin_op(self, dtype): # GH 16363 - df = pd.DataFrame({'x': np.array([0], dtype=dtype)}) - res = df.eval('x < -0.1') + df = pd.DataFrame({"x": np.array([0], dtype=dtype)}) + res = df.eval("x < -0.1") assert res.values == np.array([False]) - res = df.eval('-5 > x') + res = df.eval("-5 > x") assert res.values == np.array([False]) def test_disallow_scalar_bool_ops(self): - exprs = '1 or 2', '1 and 2' - exprs += 'a and b', 'a or b' - exprs += 
'1 or 2 and (3 + 2) > 3', - exprs += '2 * x > 2 or 1 and 2', - exprs += '2 * df > 3 and 1 or a', + exprs = "1 or 2", "1 and 2" + exprs += "a and b", "a or b" + exprs += ("1 or 2 and (3 + 2) > 3",) + exprs += ("2 * x > 2 or 1 and 2",) + exprs += ("2 * df > 3 and 1 or a",) x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa for ex in exprs: @@ -633,35 +690,35 @@ def test_disallow_scalar_bool_ops(self): def test_identical(self): # see gh-10546 x = 1 - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert result == 1 assert is_scalar(result) x = 1.5 - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert result == 1.5 assert is_scalar(result) x = False - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert not result assert is_bool(result) assert is_scalar(result) x = np.array([1]) - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([1])) - assert result.shape == (1, ) + assert result.shape == (1,) x = np.array([1.5]) - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([1.5])) - assert result.shape == (1, ) + assert result.shape == (1,) x = np.array([False]) # noqa - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([False])) - assert result.shape == (1, ) + assert result.shape == (1,) def test_line_continuation(self): # GH 11149 @@ -672,14 +729,12 @@ def test_line_continuation(self): def test_float_truncation(self): # GH 14241 - exp = '1000000000.006' + exp = "1000000000.006" result = pd.eval(exp, engine=self.engine, parser=self.parser) expected = np.float64(exp) assert result == expected - df = pd.DataFrame({'A': [1000000000.0009, - 1000000000.0011, - 1000000000.0015]}) + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) cutoff = 1000000000.0006 result = df.query("A < %.4f" % cutoff) assert result.empty @@ -690,60 +745,59 @@ def test_float_truncation(self): tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = df.query('A == %.4f' % exact) + result = df.query("A == %.4f" % exact) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) def test_disallow_python_keywords(self): # GH 18221 - df = pd.DataFrame([[0, 0, 0]], columns=['foo', 'bar', 'class']) + df = pd.DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"]) msg = "Python keyword not valid identifier in numexpr query" with pytest.raises(SyntaxError, match=msg): - df.query('class == 0') + df.query("class == 0") df = pd.DataFrame() - df.index.name = 'lambda' + df.index.name = "lambda" with pytest.raises(SyntaxError, match=msg): - df.query('lambda == 0') + df.query("lambda == 0") @td.skip_if_no_ne class TestEvalNumexprPython(TestEvalNumexprPandas): - @classmethod def setup_class(cls): super().setup_class() import numexpr as ne + cls.ne = ne - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" def setup_ops(self): - self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'), - expr._cmp_ops_syms)) + self.cmp_ops = list( + 
filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms) + ) self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [s for s in expr._bool_ops_syms - if s not in ('and', 'or')] + self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")] self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops - self.unary_ops = '+', '-', '~' + self.unary_ops = "+", "-", "~" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) with pytest.raises(NotImplementedError): pd.eval(ex1, engine=self.engine, parser=self.parser) class TestEvalPythonPython(TestEvalNumexprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'python' + cls.engine = "python" + cls.parser = "python" def check_modulus(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs @@ -760,21 +814,19 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: - expected = eval('nlhs {0} ghs'.format(op)) + expected = eval("nlhs {0} ghs".format(op)) tm.assert_almost_equal(result, expected) class TestEvalPythonPandas(TestEvalPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, - rhs) + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs) f = lambda *args, **kwargs: np.random.randn() @@ -785,19 +837,19 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): class TestTypeCasting: - @pytest.mark.parametrize('op', ['+', '-', '*', '**', '/']) + @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... 
numexpr has too many upcasting rules now # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) - @pytest.mark.parametrize('dt', [np.float32, np.float64]) + @pytest.mark.parametrize("dt", [np.float32, np.float64]) def test_binop_typecasting(self, engine, parser, op, dt): df = mkdf(5, 3, data_gen_f=f, dtype=dt) - s = 'df {} 3'.format(op) + s = "df {} 3".format(op) res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt assert_frame_equal(res, eval(s)) - s = '3 {} df'.format(op) + s = "3 {} df".format(op) res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt @@ -807,65 +859,68 @@ def test_binop_typecasting(self, engine, parser, op, dt): # ------------------------------------- # Basic and complex alignment + def _is_datetime(x): return issubclass(x.dtype.type, np.datetime64) def should_warn(*args): - not_mono = not any(map(operator.attrgetter('is_monotonic'), args)) + not_mono = not any(map(operator.attrgetter("is_monotonic"), args)) only_one_dt = reduce(operator.xor, map(_is_datetime, args)) return not_mono and only_one_dt class TestAlignment: - index_types = 'i', 'u', 'dt' - lhs_index_types = index_types + ('s',) # 'p' + index_types = "i", "u", "dt" + lhs_index_types = index_types + ("s",) # 'p' def test_align_nested_unary_op(self, engine, parser): - s = 'df * ~2' + s = "df * ~2" df = mkdf(5, 3, data_gen_f=f) res = pd.eval(s, engine=engine, parser=parser) assert_frame_equal(res, df * ~2) def test_basic_frame_alignment(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, - self.index_types) + args = product(self.lhs_index_types, self.index_types, self.index_types) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for lr_idx_type, rr_idx_type, c_idx_type in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, - c_idx_type=c_idx_type) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type + ) + df2 = mkdf( + 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type + ) # only warns if not monotonic and not sortable if should_warn(df.index, df2.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + df2', engine=engine, parser=parser) + res = pd.eval("df + df2", engine=engine, parser=parser) else: - res = pd.eval('df + df2', engine=engine, parser=parser) + res = pd.eval("df + df2", engine=engine, parser=parser) assert_frame_equal(res, df + df2) def test_frame_comparison(self, engine, parser): args = product(self.lhs_index_types, repeat=2) for r_idx_type, c_idx_type in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - res = pd.eval('df < 2', engine=engine, parser=parser) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + res = pd.eval("df < 2", engine=engine, parser=parser) assert_frame_equal(res, df < 2) - df3 = DataFrame(randn(*df.shape), index=df.index, - columns=df.columns) - res = pd.eval('df < df3', engine=engine, parser=parser) + df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) + res = pd.eval("df < df3", engine=engine, parser=parser) assert_frame_equal(res, df < df3) @pytest.mark.slow def test_medium_complex_frame_alignment(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, - 
self.index_types, self.index_types) + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r1, c1, r2, c2 in args: df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) @@ -873,59 +928,56 @@ def test_medium_complex_frame_alignment(self, engine, parser): df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) if should_warn(df.index, df2.index, df3.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + df2 + df3', engine=engine, - parser=parser) + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) else: - res = pd.eval('df + df2 + df3', - engine=engine, parser=parser) + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) assert_frame_equal(res, df + df2 + df3) def test_basic_frame_series_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) if should_warn(df.index, s.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + s', engine=engine, parser=parser) + res = pd.eval("df + s", engine=engine, parser=parser) else: - res = pd.eval('df + s', engine=engine, parser=parser) + res = pd.eval("df + s", engine=engine, parser=parser) - if r_idx_type == 'dt' or c_idx_type == 'dt': - expected = df.add(s) if engine == 'numexpr' else df + s + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else df + s else: expected = df + s assert_frame_equal(res, expected) - args = product(self.lhs_index_types, self.index_types, - ('index', 'columns')) + args = product(self.lhs_index_types, self.index_types, ("index", "columns")) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) def test_basic_series_frame_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): - df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) if should_warn(s.index, df.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('s + df', engine=engine, parser=parser) + res = pd.eval("s + df", engine=engine, parser=parser) else: - res = pd.eval('s + df', engine=engine, parser=parser) + res = pd.eval("s + df", engine=engine, parser=parser) - if r_idx_type == 'dt' or c_idx_type == 'dt': - expected = df.add(s) if engine == 'numexpr' else s + df + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else s + df else: expected = s + df assert_frame_equal(res, expected) # only test dt with dt, otherwise weird joins result - args = product(['i', 'u', 's'], ['i', 'u', 's'], ('index', 'columns')) + args = product(["i", "u", "s"], ["i", "u", "s"], ("index", "columns")) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) @@ -934,7 +986,7 @@ def testit(r_idx_type, 
c_idx_type, index_name): testit(r_idx_type, c_idx_type, index_name) # dt with dt - args = product(['dt'], ['dt'], ('index', 'columns')) + args = product(["dt"], ["dt"], ("index", "columns")) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) @@ -943,19 +995,21 @@ def testit(r_idx_type, c_idx_type, index_name): testit(r_idx_type, c_idx_type, index_name) def test_series_frame_commutativity(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, ('+', '*'), - ('index', 'columns')) + args = product( + self.lhs_index_types, self.index_types, ("+", "*"), ("index", "columns") + ) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r_idx_type, c_idx_type, op, index_name in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - lhs = 's {0} df'.format(op) - rhs = 'df {0} s'.format(op) + lhs = "s {0} df".format(op) + rhs = "df {0} s".format(op) if should_warn(df.index, s.index): with tm.assert_produces_warning(RuntimeWarning): a = pd.eval(lhs, engine=engine, parser=parser) @@ -965,40 +1019,42 @@ def test_series_frame_commutativity(self, engine, parser): a = pd.eval(lhs, engine=engine, parser=parser) b = pd.eval(rhs, engine=engine, parser=parser) - if r_idx_type != 'dt' and c_idx_type != 'dt': - if engine == 'numexpr': + if r_idx_type != "dt" and c_idx_type != "dt": + if engine == "numexpr": assert_frame_equal(a, b) @pytest.mark.slow def test_complex_series_frame_alignment(self, engine, parser): import random - args = product(self.lhs_index_types, self.index_types, - self.index_types, self.index_types) + + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) n = 3 m1 = 5 m2 = 2 * m1 with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r1, r2, c1, c2 in args: - index_name = random.choice(['index', 'columns']) - obj_name = random.choice(['df', 'df2']) + index_name = random.choice(["index", "columns"]) + obj_name = random.choice(["df", "df2"]) df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) index = getattr(locals().get(obj_name), index_name) s = Series(np.random.randn(n), index[:n]) - if r2 == 'dt' or c2 == 'dt': - if engine == 'numexpr': + if r2 == "dt" or c2 == "dt": + if engine == "numexpr": expected2 = df2.add(s) else: expected2 = df2 + s else: expected2 = df2 + s - if r1 == 'dt' or c1 == 'dt': - if engine == 'numexpr': + if r1 == "dt" or c1 == "dt": + if engine == "numexpr": expected = expected2.add(df) else: expected = expected2 + df @@ -1007,37 +1063,36 @@ def test_complex_series_frame_alignment(self, engine, parser): if should_warn(df2.index, s.index, df.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df2 + s + df', engine=engine, - parser=parser) + res = pd.eval("df2 + s + df", engine=engine, parser=parser) else: - res = pd.eval('df2 + s + df', engine=engine, parser=parser) + res = pd.eval("df2 + s + df", engine=engine, parser=parser) assert res.shape == expected.shape assert_frame_equal(res, expected) def test_performance_warning_for_poor_alignment(self, engine, 
parser): df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) - if engine == 'numexpr': + if engine == "numexpr": seen = PerformanceWarning else: seen = False with assert_produces_warning(seen): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) s = Series(randn(1000)) with assert_produces_warning(False): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) df = DataFrame(randn(10, 10000)) s = Series(randn(10000)) with assert_produces_warning(False): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) df = DataFrame(randn(10, 10)) s = Series(randn(10000)) - is_python_engine = engine == 'python' + is_python_engine = engine == "python" if not is_python_engine: wrn = PerformanceWarning @@ -1045,28 +1100,30 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): wrn = False with assert_produces_warning(wrn) as w: - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) if not is_python_engine: assert len(w) == 1 msg = str(w[0].message) - expected = ("Alignment difference on axis {0} is larger" - " than an order of magnitude on term {1!r}, " - "by more than {2:.4g}; performance may suffer" - "".format(1, 'df', np.log10(s.size - df.shape[1]))) + expected = ( + "Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(1, "df", np.log10(s.size - df.shape[1])) + ) assert msg == expected # ------------------------------------ # Slightly more complex ops + @td.skip_if_no_ne class TestOperationsNumExprPandas: - @classmethod def setup_class(cls): - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms @classmethod @@ -1074,20 +1131,20 @@ def teardown_class(cls): del cls.engine, cls.parser def eval(self, *args, **kwargs): - kwargs['engine'] = self.engine - kwargs['parser'] = self.parser - kwargs['level'] = kwargs.pop('level', 0) + 1 + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): ops = self.arith_ops - for op in filter(lambda x: x != '//', ops): - ex = '1 {0} 1'.format(op) - ex2 = 'x {0} 1'.format(op) - ex3 = '1 {0} (x + 1)'.format(op) + for op in filter(lambda x: x != "//", ops): + ex = "1 {0} 1".format(op) + ex2 = "x {0} 1".format(op) + ex3 = "1 {0} (x + 1)".format(op) - if op in ('in', 'not in'): + if op in ("in", "not in"): msg = "argument of type 'int' is not iterable" with pytest.raises(TypeError, match=msg): pd.eval(ex, engine=self.engine, parser=self.parser) @@ -1097,27 +1154,29 @@ def test_simple_arith_ops(self): assert x == expec expec = _eval_single_bin(x, op, 1, self.engine) - y = self.eval(ex2, local_dict={'x': x}, engine=self.engine, - parser=self.parser) + y = self.eval( + ex2, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) assert y == expec expec = _eval_single_bin(1, op, x + 1, self.engine) - y = self.eval(ex3, local_dict={'x': x}, - engine=self.engine, parser=self.parser) + y = self.eval( + ex3, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) assert y == expec def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), - (True, False)): - ex = '{0} {1} {2}'.format(lhs, op, rhs) + for op, 
lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = "{0} {1} {2}".format(lhs, op, rhs) res = self.eval(ex) exp = eval(ex) assert res == exp def test_bool_ops_with_constants(self): - for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), - ('True', 'False')): - ex = '{0} {1} {2}'.format(lhs, op, rhs) + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = "{0} {1} {2}".format(lhs, op, rhs) res = self.eval(ex) exp = eval(ex) assert res == exp @@ -1126,22 +1185,21 @@ def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) with pytest.raises(NotImplementedError): - self.eval('x + y', - local_dict={'x': x, 'y': y}) + self.eval("x + y", local_dict={"x": x, "y": y}) def test_constant(self): - x = self.eval('1') + x = self.eval("1") assert x == 1 def test_single_variable(self): df = DataFrame(randn(10, 2)) - df2 = self.eval('df', local_dict={'df': df}) + df2 = self.eval("df", local_dict={"df": df}) assert_frame_equal(df, df2) def test_truediv(self): s = np.array([1]) - ex = 's / 1' - d = {'s': s} # noqa + ex = "s / 1" + d = {"s": s} # noqa res = self.eval(ex, truediv=False) tm.assert_numpy_array_equal(res, np.array([1.0])) @@ -1149,94 +1207,94 @@ def test_truediv(self): res = self.eval(ex, truediv=True) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval('1 / 2', truediv=True) + res = self.eval("1 / 2", truediv=True) expec = 0.5 assert res == expec - res = self.eval('1 / 2', truediv=False) + res = self.eval("1 / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval('s / 2', truediv=False) + res = self.eval("s / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval('s / 2', truediv=True) + res = self.eval("s / 2", truediv=True) expec = 0.5 assert res == expec def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) # noqa with pytest.raises(NameError): - self.eval('df[x > 2] > 2') + self.eval("df[x > 2] > 2") def test_lhs_expression_subscript(self): df = DataFrame(np.random.randn(5, 3)) - result = self.eval('(df + 1)[df > 2]', local_dict={'df': df}) + result = self.eval("(df + 1)[df > 2]", local_dict={"df": df}) expected = (df + 1)[df > 2] assert_frame_equal(result, expected) def test_attr_expression(self): - df = DataFrame(np.random.randn(5, 3), columns=list('abc')) - expr1 = 'df.a < df.b' + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) + expr1 = "df.a < df.b" expec1 = df.a < df.b - expr2 = 'df.a + df.b + df.c' + expr2 = "df.a + df.b + df.c" expec2 = df.a + df.b + df.c - expr3 = 'df.a + df.b + df.c[df.b < 0]' + expr3 = "df.a + df.b + df.c[df.b < 0]" expec3 = df.a + df.b + df.c[df.b < 0] exprs = expr1, expr2, expr3 expecs = expec1, expec2, expec3 for e, expec in zip(exprs, expecs): - assert_series_equal(expec, self.eval(e, local_dict={'df': df})) + assert_series_equal(expec, self.eval(e, local_dict={"df": df})) def test_assignment_fails(self): - df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) df2 = DataFrame(np.random.randn(5, 3)) - expr1 = 'df = df2' + expr1 = "df = df2" msg = "cannot assign without a target object" with pytest.raises(ValueError, match=msg): - self.eval(expr1, local_dict={'df': df, 'df2': df2}) + self.eval(expr1, local_dict={"df": df, "df2": df2}) def test_assignment_column(self): - df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) orig_df = 
df.copy() # multiple assignees with pytest.raises(SyntaxError, match="invalid syntax"): - df.eval('d c = a + b') + df.eval("d c = a + b") # invalid assignees msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): - df.eval('d,c = a + b') + df.eval("d,c = a + b") msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') # single assignment - existing variable expected = orig_df.copy() - expected['a'] = expected['a'] + expected['b'] + expected["a"] = expected["a"] + expected["b"] df = orig_df.copy() - df.eval('a = a + b', inplace=True) + df.eval("a = a + b", inplace=True) assert_frame_equal(df, expected) # single assignment - new variable expected = orig_df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] df = orig_df.copy() - df.eval('c = a + b', inplace=True) + df.eval("c = a + b", inplace=True) assert_frame_equal(df, expected) # with a local name overlap def f(): df = orig_df.copy() a = 1 # noqa - df.eval('a = 1 + b', inplace=True) + df.eval("a = 1 + b", inplace=True) return df df = f() expected = orig_df.copy() - expected['a'] = 1 + expected['b'] + expected["a"] = 1 + expected["b"] assert_frame_equal(df, expected) df = orig_df.copy() @@ -1244,7 +1302,7 @@ def f(): def f(): a = 1 # noqa old_a = df.a.copy() - df.eval('a = a + b', inplace=True) + df.eval("a = a + b", inplace=True) result = old_a + df.b assert_series_equal(result, df.a, check_names=False) assert result.name is None @@ -1253,146 +1311,169 @@ def f(): # multiple assignment df = orig_df.copy() - df.eval('c = a + b', inplace=True) + df.eval("c = a + b", inplace=True) msg = "can only assign a single expression" with pytest.raises(SyntaxError, match=msg): - df.eval('c = a = b') + df.eval("c = a = b") # explicit targets df = orig_df.copy() - self.eval('c = df.a + df.b', local_dict={'df': df}, - target=df, inplace=True) + self.eval("c = df.a + df.b", local_dict={"df": df}, target=df, inplace=True) expected = orig_df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] assert_frame_equal(df, expected) def test_column_in(self): # GH 11235 - df = DataFrame({'a': [11], 'b': [-32]}) - result = df.eval('a in [11, -32]') + df = DataFrame({"a": [11], "b": [-32]}) + result = df.eval("a in [11, -32]") expected = Series([True]) assert_series_equal(result, expected) def assignment_not_inplace(self): # see gh-9297 - df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) - actual = df.eval('c = a + b', inplace=False) + actual = df.eval("c = a + b", inplace=False) assert actual is not None expected = df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) def test_multi_line_expression(self): # GH 11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected['c'] = expected['a'] + expected['b'] - expected['d'] = expected['c'] + expected['b'] - ans = df.eval(""" + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + ans = df.eval( + """ c = a + b - d = c + b""", inplace=True) + d = c + b""", + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None - expected['a'] = expected['a'] - 1 - expected['e'] = expected['a'] + 2 - ans = df.eval(""" + 
expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + ans = df.eval( + """ a = a - 1 - e = a + 2""", inplace=True) + e = a + 2""", + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None # multi-line not valid if not all assignments with pytest.raises(ValueError): - df.eval(""" + df.eval( + """ a = b + 2 - b - 2""", inplace=False) + b - 2""", + inplace=False, + ) def test_multi_line_expression_not_inplace(self): # GH 11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected['c'] = expected['a'] + expected['b'] - expected['d'] = expected['c'] + expected['b'] - df = df.eval(""" + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + df = df.eval( + """ c = a + b - d = c + b""", inplace=False) + d = c + b""", + inplace=False, + ) assert_frame_equal(expected, df) - expected['a'] = expected['a'] - 1 - expected['e'] = expected['a'] + 2 - df = df.eval(""" + expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + df = df.eval( + """ a = a - 1 - e = a + 2""", inplace=False) + e = a + 2""", + inplace=False, + ) assert_frame_equal(expected, df) def test_multi_line_expression_local_variable(self): # GH 15342 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() local_var = 7 - expected['c'] = expected['a'] * local_var - expected['d'] = expected['c'] + local_var - ans = df.eval(""" + expected["c"] = expected["a"] * local_var + expected["d"] = expected["c"] + local_var + ans = df.eval( + """ c = a * @local_var d = c + @local_var - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_multi_line_expression_callable_local_variable(self): # 26426 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b expected = df.copy() - expected['c'] = expected['a'] * local_func(1, 7) - expected['d'] = expected['c'] + local_func(1, 7) - ans = df.eval(""" + expected["c"] = expected["a"] * local_func(1, 7) + expected["d"] = expected["c"] + local_func(1, 7) + ans = df.eval( + """ c = a * @local_func(1, 7) d = c + @local_func(1, 7) - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_multi_line_expression_callable_local_variable_with_kwargs(self): # 26426 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b expected = df.copy() - expected['c'] = expected['a'] * local_func(b=7, a=1) - expected['d'] = expected['c'] + local_func(b=7, a=1) - ans = df.eval(""" + expected["c"] = expected["a"] * local_func(b=7, a=1) + expected["d"] = expected["c"] + local_func(b=7, a=1) + ans = df.eval( + """ c = a * @local_func(b=7, a=1) d = c + @local_func(b=7, a=1) - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_assignment_in_query(self): # GH 8664 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() with pytest.raises(ValueError): - df.query('a = 1') + df.query("a = 1") assert_frame_equal(df, df_orig) def test_query_inplace(self): # see gh-11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected = 
expected[expected['a'] == 2] - df.query('a == 2', inplace=True) + expected = expected[expected["a"] == 2] + df.query("a == 2", inplace=True) assert_frame_equal(expected, df) df = {} @@ -1401,8 +1482,7 @@ def test_query_inplace(self): self.eval("a = 1 + 2", target=df, inplace=True) tm.assert_dict_equal(df, expected) - @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], - np.array([]), (1, 3)]) + @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], np.array([]), (1, 3)]) @pytest.mark.filterwarnings("ignore::FutureWarning") def test_cannot_item_assign(self, invalid_target): msg = "Cannot assign expression output to target" @@ -1423,8 +1503,7 @@ def test_cannot_copy_item(self, invalid_target): with pytest.raises(ValueError, match=msg): self.eval(expression, target=invalid_target, inplace=False) - @pytest.mark.parametrize("target", [1, "cat", [1, 2], - np.array([]), (1, 3), {1: 2}]) + @pytest.mark.parametrize("target", [1, "cat", [1, 2], np.array([]), (1, 3), {1: 2}]) def test_inplace_no_assignment(self, target): expression = "1 + 2" @@ -1435,109 +1514,106 @@ def test_inplace_no_assignment(self, target): self.eval(expression, target=target, inplace=True) def test_basic_period_index_boolean_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") e = df < 2 - r = self.eval('df < 2', local_dict={'df': df}) + r = self.eval("df < 2", local_dict={"df": df}) x = df < 2 assert_frame_equal(r, e) assert_frame_equal(x, e) def test_basic_period_index_subscript_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = self.eval('df[df < 2 + 3]', local_dict={'df': df}) + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df < 2 + 3]", local_dict={"df": df}) e = df[df < 2 + 3] assert_frame_equal(r, e) def test_nested_period_index_subscript_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df}) + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df[df < 2] < 2] + df * 2", local_dict={"df": df}) e = df[df[df < 2] < 2] + df * 2 assert_frame_equal(r, e) def test_date_boolean(self): df = DataFrame(randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - res = self.eval('df.dates1 < 20130101', local_dict={'df': df}, - engine=self.engine, parser=self.parser) - expec = df.dates1 < '20130101' + df["dates1"] = date_range("1/1/2012", periods=5) + res = self.eval( + "df.dates1 < 20130101", + local_dict={"df": df}, + engine=self.engine, + parser=self.parser, + ) + expec = df.dates1 < "20130101" assert_series_equal(res, expec, check_names=False) def test_simple_in_ops(self): - if self.parser != 'python': - res = pd.eval('1 in [1, 2]', engine=self.engine, - parser=self.parser) + if self.parser != "python": + res = pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('2 in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('3 in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) assert not res - res = pd.eval('3 not in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[3] not in (1, 2)', engine=self.engine, - 
parser=self.parser) + res = pd.eval("[3] not in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[3] in ([3], 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("[3] in ([3], 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine, - parser=self.parser) + res = pd.eval("[[3]] in [[[3]], 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('(3,) in [(3,), 2]', engine=self.engine, - parser=self.parser) + res = pd.eval("(3,) in [(3,), 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine, - parser=self.parser) + res = pd.eval( + "(3,) not in [(3,), 2]", engine=self.engine, parser=self.parser + ) assert not res - res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine, - parser=self.parser) + res = pd.eval( + "[(3,)] in [[(3,)], 2]", engine=self.engine, parser=self.parser + ) assert res else: with pytest.raises(NotImplementedError): - pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser) + pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser) + pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser) + pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('3 not in (1, 2)', engine=self.engine, - parser=self.parser) + pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine, - parser=self.parser) + pd.eval( + "[(3,)] in (1, 2, [(3,)])", engine=self.engine, parser=self.parser + ) with pytest.raises(NotImplementedError): - pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine, - parser=self.parser) + pd.eval( + "[3] not in (1, 2, [[3]])", engine=self.engine, parser=self.parser + ) @td.skip_if_no_ne class TestOperationsNumExprPython(TestOperationsNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), - cls.arith_ops) + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) def test_check_many_exprs(self): a = 1 # noqa - expr = ' * '.join('a' * 33) + expr = " * ".join("a" * 33) expected = 1 res = pd.eval(expr, engine=self.engine, parser=self.parser) assert res == expected @@ -1546,40 +1622,53 @@ def test_fails_and(self): df = DataFrame(np.random.randn(5, 3)) msg = "'BoolOp' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('df > 2 and df > 3', local_dict={'df': df}, - parser=self.parser, engine=self.engine) + pd.eval( + "df > 2 and df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_or(self): df = DataFrame(np.random.randn(5, 3)) msg = "'BoolOp' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('df > 2 or df > 3', local_dict={'df': df}, - parser=self.parser, engine=self.engine) + pd.eval( + "df > 2 or df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_not(self): df = DataFrame(np.random.randn(5, 3)) msg = 
"'Not' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('not df > 2', local_dict={'df': df}, parser=self.parser, - engine=self.engine) + pd.eval( + "not df > 2", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) # noqa - ex = '(df + 2)[df > 1] > 0 & (df > 0)' + ex = "(df + 2)[df > 1] > 0 & (df > 0)" with pytest.raises(NotImplementedError): pd.eval(ex, parser=self.parser, engine=self.engine) def test_fails_pipe(self): df = DataFrame(np.random.randn(5, 3)) # noqa - ex = '(df + 2)[df > 1] > 0 | (df > 0)' + ex = "(df + 2)[df > 1] > 0 | (df > 0)" with pytest.raises(NotImplementedError): pd.eval(ex, parser=self.parser, engine=self.engine) def test_bool_ops_with_constants(self): - for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), - ('True', 'False')): - ex = '{0} {1} {2}'.format(lhs, op, rhs) - if op in ('and', 'or'): + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = "{0} {1} {2}".format(lhs, op, rhs) + if op in ("and", "or"): with pytest.raises(NotImplementedError): self.eval(ex) else: @@ -1588,10 +1677,9 @@ def test_bool_ops_with_constants(self): assert res == exp def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), - (True, False)): - ex = 'lhs {0} rhs'.format(op) - if op in ('and', 'or'): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = "lhs {0} rhs".format(op) + if op in ("and", "or"): with pytest.raises(NotImplementedError): pd.eval(ex, engine=self.engine, parser=self.parser) else: @@ -1601,33 +1689,29 @@ def test_simple_bool_ops(self): class TestOperationsPythonPython(TestOperationsNumExprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = cls.parser = 'python' + cls.engine = cls.parser = "python" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), - cls.arith_ops) + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) class TestOperationsPythonPandas(TestOperationsNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms @td.skip_if_no_ne class TestMathPythonPython: - @classmethod def setup_class(cls): - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.unary_fns = _unary_math_ops cls.binary_fns = _binary_math_ops @@ -1636,69 +1720,63 @@ def teardown_class(cls): del cls.engine, cls.parser def eval(self, *args, **kwargs): - kwargs['engine'] = self.engine - kwargs['parser'] = self.parser - kwargs['level'] = kwargs.pop('level', 0) + 1 + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) def test_unary_functions(self, unary_fns_for_ne): - df = DataFrame({'a': np.random.randn(10)}) + df = DataFrame({"a": np.random.randn(10)}) a = df.a for fn in unary_fns_for_ne: expr = "{0}(a)".format(fn) got = self.eval(expr) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = getattr(np, fn)(a) tm.assert_series_equal(got, expect, check_names=False) - def test_floor_and_ceil_functions_raise_error(self, - ne_lt_2_6_9, - unary_fns_for_ne): - for fn in ('floor', 'ceil'): - msg = 
"\"{0}\" is not a supported function".format(fn) + def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, unary_fns_for_ne): + for fn in ("floor", "ceil"): + msg = '"{0}" is not a supported function'.format(fn) with pytest.raises(ValueError, match=msg): expr = "{0}(100)".format(fn) self.eval(expr) def test_binary_functions(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) a = df.a b = df.b for fn in self.binary_fns: expr = "{0}(a, b)".format(fn) got = self.eval(expr) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - df.eval("e = arctan2(sin(a), b)", - engine=self.engine, - parser=self.parser, inplace=True) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval( + "e = arctan2(sin(a), b)", + engine=self.engine, + parser=self.parser, + inplace=True, + ) got = df.e expect = np.arctan2(np.sin(df.a), df.b) tm.assert_series_equal(got, expect, check_names=False) def test_df_arithmetic_subexpression(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - df.eval("e = sin(a + b)", - engine=self.engine, - parser=self.parser, inplace=True) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("e = sin(a + b)", engine=self.engine, parser=self.parser, inplace=True) got = df.e expect = np.sin(df.a + df.b) tm.assert_series_equal(got, expect, check_names=False) def check_result_type(self, dtype, expect_dtype): - df = DataFrame({'a': np.random.randn(10).astype(dtype)}) + df = DataFrame({"a": np.random.randn(10).astype(dtype)}) assert df.a.dtype == dtype - df.eval("b = sin(a)", - engine=self.engine, - parser=self.parser, inplace=True) + df.eval("b = sin(a)", engine=self.engine, parser=self.parser, inplace=True) got = df.b expect = np.sin(df.a) assert expect.dtype == got.dtype @@ -1720,101 +1798,97 @@ def test_result_types2(self): self.check_result_type(np.complex128, np.complex128) def test_undefined_func(self): - df = DataFrame({'a': np.random.randn(10)}) - msg = "\"mysin\" is not a supported function" + df = DataFrame({"a": np.random.randn(10)}) + msg = '"mysin" is not a supported function' with pytest.raises(ValueError, match=msg): - df.eval("mysin(a)", - engine=self.engine, - parser=self.parser) + df.eval("mysin(a)", engine=self.engine, parser=self.parser) def test_keyword_arg(self): - df = DataFrame({'a': np.random.randn(10)}) - msg = "Function \"sin\" does not support keyword arguments" + df = DataFrame({"a": np.random.randn(10)}) + msg = 'Function "sin" does not support keyword arguments' with pytest.raises(TypeError, match=msg): - df.eval("sin(x=a)", - engine=self.engine, - parser=self.parser) + df.eval("sin(x=a)", engine=self.engine, parser=self.parser) class TestMathPythonPandas(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" class TestMathNumExprPandas(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" class TestMathNumExprPython(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 
'python' + cls.engine = "numexpr" + cls.parser = "python" _var_s = randn(10) class TestScope: - def test_global_scope(self, engine, parser): - e = '_var_s * 2' - tm.assert_numpy_array_equal(_var_s * 2, pd.eval(e, engine=engine, - parser=parser)) + e = "_var_s * 2" + tm.assert_numpy_array_equal( + _var_s * 2, pd.eval(e, engine=engine, parser=parser) + ) def test_no_new_locals(self, engine, parser): x = 1 # noqa lcls = locals().copy() - pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) + pd.eval("x + 1", local_dict=lcls, engine=engine, parser=parser) lcls2 = locals().copy() - lcls2.pop('lcls') + lcls2.pop("lcls") assert lcls == lcls2 def test_no_new_globals(self, engine, parser): x = 1 # noqa gbls = globals().copy() - pd.eval('x + 1', engine=engine, parser=parser) + pd.eval("x + 1", engine=engine, parser=parser) gbls2 = globals().copy() assert gbls == gbls2 @td.skip_if_no_ne def test_invalid_engine(): - msg = 'Invalid engine \'asdf\' passed' + msg = "Invalid engine 'asdf' passed" with pytest.raises(KeyError, match=msg): - pd.eval('x + y', local_dict={'x': 1, 'y': 2}, engine='asdf') + pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf") @td.skip_if_no_ne def test_invalid_parser(): - msg = 'Invalid parser \'asdf\' passed' + msg = "Invalid parser 'asdf' passed" with pytest.raises(KeyError, match=msg): - pd.eval('x + y', local_dict={'x': 1, 'y': 2}, parser='asdf') + pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, - 'pandas': PandasExprVisitor} +_parsers = { + "python": PythonExprVisitor, + "pytables": pytables.ExprVisitor, + "pandas": PandasExprVisitor, +} -@pytest.mark.parametrize('engine', _engines) -@pytest.mark.parametrize('parser', _parsers) +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", _parsers) def test_disallowed_nodes(engine, parser): VisitorClass = _parsers[parser] uns_ops = VisitorClass.unsupported_nodes - inst = VisitorClass('x + 1', engine, parser) + inst = VisitorClass("x + 1", engine, parser) for ops in uns_ops: with pytest.raises(NotImplementedError): @@ -1822,23 +1896,23 @@ def test_disallowed_nodes(engine, parser): def test_syntax_error_exprs(engine, parser): - e = 's +' + e = "s +" with pytest.raises(SyntaxError): pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(engine, parser): - e = 's + t' + e = "s + t" with pytest.raises(NameError): pd.eval(e, engine=engine, parser=parser) def test_invalid_local_variable_reference(engine, parser): a, b = 1, 2 # noqa - exprs = 'a + @b', '@a + b', '@a + @b' + exprs = "a + @b", "@a + b", "@a + @b" for _expr in exprs: - if parser != 'pandas': + if parser != "pandas": with pytest.raises(SyntaxError, match="The '@' prefix is only"): pd.eval(_expr, engine=engine, parser=parser) else: @@ -1848,37 +1922,35 @@ def test_invalid_local_variable_reference(engine, parser): def test_numexpr_builtin_raises(engine, parser): sin, dotted_line = 1, 2 - if engine == 'numexpr': - msg = 'Variables in expression .+' + if engine == "numexpr": + msg = "Variables in expression .+" with pytest.raises(NumExprClobberingError, match=msg): - pd.eval('sin + dotted_line', engine=engine, parser=parser) + pd.eval("sin + dotted_line", engine=engine, parser=parser) else: - res = pd.eval('sin + dotted_line', engine=engine, parser=parser) + res = pd.eval("sin + dotted_line", engine=engine, parser=parser) assert res == sin + dotted_line def test_bad_resolver_raises(engine, parser): cannot_resolve = 42, 3.0 - with 
pytest.raises(TypeError, match='Resolver of type .+'): - pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, - parser=parser) + with pytest.raises(TypeError, match="Resolver of type .+"): + pd.eval("1 + 2", resolvers=cannot_resolve, engine=engine, parser=parser) def test_empty_string_raises(engine, parser): # GH 13139 with pytest.raises(ValueError, match="expr cannot be an empty string"): - pd.eval('', engine=engine, parser=parser) + pd.eval("", engine=engine, parser=parser) def test_more_than_one_expression_raises(engine, parser): - with pytest.raises(SyntaxError, match=("only a single expression " - "is allowed")): - pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) + with pytest.raises(SyntaxError, match=("only a single expression " "is allowed")): + pd.eval("1 + 1; 2 + 2", engine=engine, parser=parser) -@pytest.mark.parametrize('cmp', ('and', 'or')) -@pytest.mark.parametrize('lhs', (int, float)) -@pytest.mark.parametrize('rhs', (int, float)) +@pytest.mark.parametrize("cmp", ("and", "or")) +@pytest.mark.parametrize("lhs", (int, float)) +@pytest.mark.parametrize("rhs", (int, float)) def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): gen = {int: lambda: np.random.randint(10), float: np.random.randn} @@ -1886,38 +1958,37 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): lhs = gen[lhs]() # noqa rhs = gen[rhs]() # noqa - ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) + ex1 = "lhs {0} mid {1} rhs".format(cmp, cmp) + ex2 = "lhs {0} mid and mid {1} rhs".format(cmp, cmp) + ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp, cmp) for ex in (ex1, ex2, ex3): with pytest.raises(NotImplementedError): pd.eval(ex, engine=engine, parser=parser) def test_inf(engine, parser): - s = 'inf + 1' + s = "inf + 1" expected = np.inf result = pd.eval(s, engine=engine, parser=parser) assert result == expected def test_negate_lt_eq_le(engine, parser): - df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count']) + df = pd.DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) expected = df[~(df.cat > 0)] - result = df.query('~(cat > 0)', engine=engine, parser=parser) + result = df.query("~(cat > 0)", engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - if parser == 'python': + if parser == "python": with pytest.raises(NotImplementedError): - df.query('not (cat > 0)', engine=engine, parser=parser) + df.query("not (cat > 0)", engine=engine, parser=parser) else: - result = df.query('not (cat > 0)', engine=engine, parser=parser) + result = df.query("not (cat > 0)", engine=engine, parser=parser) tm.assert_frame_equal(result, expected) class TestValidate: - def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index 7e0b22cb3b4146..3f12d1d7a292dc 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -9,41 +9,39 @@ class TestConfig: - @classmethod def setup_class(cls): from copy import deepcopy cls.cf = cf - cls.gc = deepcopy(getattr(cls.cf, '_global_config')) - cls.do = deepcopy(getattr(cls.cf, '_deprecated_options')) - cls.ro = deepcopy(getattr(cls.cf, '_registered_options')) + cls.gc = deepcopy(getattr(cls.cf, "_global_config")) + cls.do = deepcopy(getattr(cls.cf, "_deprecated_options")) + cls.ro = deepcopy(getattr(cls.cf, "_registered_options")) def setup_method(self, method): - setattr(self.cf, 
'_global_config', {}) - setattr(self.cf, 'options', self.cf.DictWrapper( - self.cf._global_config)) - setattr(self.cf, '_deprecated_options', {}) - setattr(self.cf, '_registered_options', {}) + setattr(self.cf, "_global_config", {}) + setattr(self.cf, "options", self.cf.DictWrapper(self.cf._global_config)) + setattr(self.cf, "_deprecated_options", {}) + setattr(self.cf, "_registered_options", {}) # Our test fixture in conftest.py sets "chained_assignment" # to "raise" only after all test methods have been setup. # However, after this setup, there is no longer any # "chained_assignment" option, so re-register it. - self.cf.register_option('chained_assignment', 'raise') + self.cf.register_option("chained_assignment", "raise") def teardown_method(self, method): - setattr(self.cf, '_global_config', self.gc) - setattr(self.cf, '_deprecated_options', self.do) - setattr(self.cf, '_registered_options', self.ro) + setattr(self.cf, "_global_config", self.gc) + setattr(self.cf, "_deprecated_options", self.do) + setattr(self.cf, "_registered_options", self.ro) def test_api(self): # the pandas object exposes the user API - assert hasattr(pd, 'get_option') - assert hasattr(pd, 'set_option') - assert hasattr(pd, 'reset_option') - assert hasattr(pd, 'describe_option') + assert hasattr(pd, "get_option") + assert hasattr(pd, "set_option") + assert hasattr(pd, "reset_option") + assert hasattr(pd, "describe_option") def test_is_one_of_factory(self): v = self.cf.is_one_of_factory([None, 12]) @@ -55,128 +53,128 @@ def test_is_one_of_factory(self): v(1.1) def test_register_option(self): - self.cf.register_option('a', 1, 'doc') + self.cf.register_option("a", 1, "doc") # can't register an already registered option msg = "Option 'a' has already been registered" with pytest.raises(OptionError, match=msg): - self.cf.register_option('a', 1, 'doc') + self.cf.register_option("a", 1, "doc") # can't register an already registered option msg = "Path prefix to option 'a' is already an option" with pytest.raises(OptionError, match=msg): - self.cf.register_option('a.b.c.d1', 1, 'doc') + self.cf.register_option("a.b.c.d1", 1, "doc") with pytest.raises(OptionError, match=msg): - self.cf.register_option('a.b.c.d2', 1, 'doc') + self.cf.register_option("a.b.c.d2", 1, "doc") # no python keywords msg = "for is a python keyword" with pytest.raises(ValueError, match=msg): - self.cf.register_option('for', 0) + self.cf.register_option("for", 0) with pytest.raises(ValueError, match=msg): - self.cf.register_option('a.for.b', 0) + self.cf.register_option("a.for.b", 0) # must be valid identifier (ensure attribute access works) msg = "oh my goddess! 
is not a valid identifier" with pytest.raises(ValueError, match=msg): - self.cf.register_option('Oh my Goddess!', 0) + self.cf.register_option("Oh my Goddess!", 0) # we can register options several levels deep # without predefining the intermediate steps # and we can define differently named options # in the same namespace - self.cf.register_option('k.b.c.d1', 1, 'doc') - self.cf.register_option('k.b.c.d2', 1, 'doc') + self.cf.register_option("k.b.c.d1", 1, "doc") + self.cf.register_option("k.b.c.d2", 1, "doc") def test_describe_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b', 1, 'doc2') - self.cf.deprecate_option('b') - - self.cf.register_option('c.d.e1', 1, 'doc3') - self.cf.register_option('c.d.e2', 1, 'doc4') - self.cf.register_option('f', 1) - self.cf.register_option('g.h', 1) - self.cf.register_option('k', 2) - self.cf.deprecate_option('g.h', rkey="k") - self.cf.register_option('l', "foo") + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b", 1, "doc2") + self.cf.deprecate_option("b") + + self.cf.register_option("c.d.e1", 1, "doc3") + self.cf.register_option("c.d.e2", 1, "doc4") + self.cf.register_option("f", 1) + self.cf.register_option("g.h", 1) + self.cf.register_option("k", 2) + self.cf.deprecate_option("g.h", rkey="k") + self.cf.register_option("l", "foo") # non-existent keys raise KeyError msg = r"No such keys\(s\)" with pytest.raises(OptionError, match=msg): - self.cf.describe_option('no.such.key') + self.cf.describe_option("no.such.key") # we can get the description for any key we registered - assert 'doc' in self.cf.describe_option('a', _print_desc=False) - assert 'doc2' in self.cf.describe_option('b', _print_desc=False) - assert 'precated' in self.cf.describe_option('b', _print_desc=False) - assert 'doc3' in self.cf.describe_option('c.d.e1', _print_desc=False) - assert 'doc4' in self.cf.describe_option('c.d.e2', _print_desc=False) + assert "doc" in self.cf.describe_option("a", _print_desc=False) + assert "doc2" in self.cf.describe_option("b", _print_desc=False) + assert "precated" in self.cf.describe_option("b", _print_desc=False) + assert "doc3" in self.cf.describe_option("c.d.e1", _print_desc=False) + assert "doc4" in self.cf.describe_option("c.d.e2", _print_desc=False) # if no doc is specified we get a default message # saying "description not available" - assert 'vailable' in self.cf.describe_option('f', _print_desc=False) - assert 'vailable' in self.cf.describe_option('g.h', _print_desc=False) - assert 'precated' in self.cf.describe_option('g.h', _print_desc=False) - assert 'k' in self.cf.describe_option('g.h', _print_desc=False) + assert "vailable" in self.cf.describe_option("f", _print_desc=False) + assert "vailable" in self.cf.describe_option("g.h", _print_desc=False) + assert "precated" in self.cf.describe_option("g.h", _print_desc=False) + assert "k" in self.cf.describe_option("g.h", _print_desc=False) # default is reported - assert 'foo' in self.cf.describe_option('l', _print_desc=False) + assert "foo" in self.cf.describe_option("l", _print_desc=False) # current value is reported - assert 'bar' not in self.cf.describe_option('l', _print_desc=False) + assert "bar" not in self.cf.describe_option("l", _print_desc=False) self.cf.set_option("l", "bar") - assert 'bar' in self.cf.describe_option('l', _print_desc=False) + assert "bar" in self.cf.describe_option("l", _print_desc=False) def test_case_insensitive(self): - self.cf.register_option('KanBAN', 1, 'doc') + self.cf.register_option("KanBAN", 1, "doc") - assert 
'doc' in self.cf.describe_option('kanbaN', _print_desc=False) - assert self.cf.get_option('kanBaN') == 1 - self.cf.set_option('KanBan', 2) - assert self.cf.get_option('kAnBaN') == 2 + assert "doc" in self.cf.describe_option("kanbaN", _print_desc=False) + assert self.cf.get_option("kanBaN") == 1 + self.cf.set_option("KanBan", 2) + assert self.cf.get_option("kAnBaN") == 2 # gets of non-existent keys fail msg = r"No such keys\(s\): 'no_such_option'" with pytest.raises(OptionError, match=msg): - self.cf.get_option('no_such_option') - self.cf.deprecate_option('KanBan') + self.cf.get_option("no_such_option") + self.cf.deprecate_option("KanBan") - assert self.cf._is_deprecated('kAnBaN') + assert self.cf._is_deprecated("kAnBaN") def test_get_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") # gets of existing keys succeed - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None # gets of non-existent keys fail msg = r"No such keys\(s\): 'no_such_option'" with pytest.raises(OptionError, match=msg): - self.cf.get_option('no_such_option') + self.cf.get_option("no_such_option") def test_set_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - self.cf.set_option('b.b', 1.1) + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + self.cf.set_option("b.b", 1.1) - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' - assert self.cf.get_option('b.b') == 1.1 + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + assert self.cf.get_option("b.b") == 1.1 msg = r"No such keys\(s\): 'no.such.key'" with pytest.raises(OptionError, match=msg): - self.cf.set_option('no.such.key', None) + self.cf.set_option("no.such.key", None) def test_set_option_empty_args(self): msg = "Must provide an even number of non-keyword arguments" @@ -186,7 +184,7 @@ def test_set_option_empty_args(self): def test_set_option_uneven_args(self): msg = "Must provide an even number of non-keyword arguments" with pytest.raises(ValueError, match=msg): - self.cf.set_option('a.b', 2, 'b.c') + self.cf.set_option("a.b", 2, "b.c") def test_set_option_invalid_single_argument_type(self): msg = "Must provide an even number of non-keyword arguments" @@ -194,175 +192,168 @@ def test_set_option_invalid_single_argument_type(self): self.cf.set_option(2) def test_set_option_multiple(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", 
"doc2") + self.cf.register_option("b.b", None, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None - self.cf.set_option('a', '2', 'b.c', None, 'b.b', 10.0) + self.cf.set_option("a", "2", "b.c", None, "b.b", 10.0) - assert self.cf.get_option('a') == '2' - assert self.cf.get_option('b.c') is None - assert self.cf.get_option('b.b') == 10.0 + assert self.cf.get_option("a") == "2" + assert self.cf.get_option("b.c") is None + assert self.cf.get_option("b.b") == 10.0 def test_validation(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_text) + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_text) msg = "Value must have type ''" with pytest.raises(ValueError, match=msg): - self.cf.register_option( - 'a.b.c.d2', 'NO', 'doc', validator=self.cf.is_int) + self.cf.register_option("a.b.c.d2", "NO", "doc", validator=self.cf.is_int) - self.cf.set_option('a', 2) # int is_int - self.cf.set_option('b.c', 'wurld') # str is_str + self.cf.set_option("a", 2) # int is_int + self.cf.set_option("b.c", "wurld") # str is_str # None not is_int with pytest.raises(ValueError, match=msg): - self.cf.set_option('a', None) + self.cf.set_option("a", None) with pytest.raises(ValueError, match=msg): - self.cf.set_option('a', 'ab') + self.cf.set_option("a", "ab") msg = r"Value must be an instance of \|" with pytest.raises(ValueError, match=msg): - self.cf.set_option('b.c', 1) + self.cf.set_option("b.c", 1) validator = self.cf.is_one_of_factory([None, self.cf.is_callable]) - self.cf.register_option('b', lambda: None, 'doc', - validator=validator) - self.cf.set_option('b', '%.1f'.format) # Formatter is callable - self.cf.set_option('b', None) # Formatter is none (default) + self.cf.register_option("b", lambda: None, "doc", validator=validator) + self.cf.set_option("b", "%.1f".format) # Formatter is callable + self.cf.set_option("b", None) # Formatter is none (default) with pytest.raises(ValueError, match="Value must be a callable"): - self.cf.set_option('b', '%.1f') + self.cf.set_option("b", "%.1f") def test_reset_option(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_str) - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' - - self.cf.reset_option('a') - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'wurld' - self.cf.reset_option('b.c') - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + + self.cf.reset_option("a") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "wurld" + self.cf.reset_option("b.c") + 
assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" def test_reset_option_all(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_str) - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" self.cf.reset_option("all") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" def test_deprecate_option(self): # we can deprecate non-existent options - self.cf.deprecate_option('foo') + self.cf.deprecate_option("foo") - assert self.cf._is_deprecated('foo') + assert self.cf._is_deprecated("foo") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - with pytest.raises( - KeyError, - match="No such keys.s.: 'foo'"): - self.cf.get_option('foo') + warnings.simplefilter("always") + with pytest.raises(KeyError, match="No such keys.s.: 'foo'"): + self.cf.get_option("foo") assert len(w) == 1 # should have raised one warning - assert 'deprecated' in str(w[-1]) # we get the default message + assert "deprecated" in str(w[-1]) # we get the default message - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('foo', 'hullo', 'doc2') + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("foo", "hullo", "doc2") - self.cf.deprecate_option('a', removal_ver='nifty_ver') + self.cf.deprecate_option("a", removal_ver="nifty_ver") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.get_option('a') + warnings.simplefilter("always") + self.cf.get_option("a") assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the default message - assert 'nifty_ver' in str(w[-1]) # with the removal_ver quoted + assert "eprecated" in str(w[-1]) # we get the default message + assert "nifty_ver" in str(w[-1]) # with the removal_ver quoted msg = "Option 'a' has already been defined as deprecated" with pytest.raises(OptionError, match=msg): - self.cf.deprecate_option('a') + self.cf.deprecate_option("a") - self.cf.deprecate_option('b.c', 'zounds!') + self.cf.deprecate_option("b.c", "zounds!") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.get_option('b.c') + warnings.simplefilter("always") + self.cf.get_option("b.c") assert len(w) == 1 # should have raised one warning - assert 'zounds!' in str(w[-1]) # we get the custom message + assert "zounds!" 
in str(w[-1]) # we get the custom message # test rerouting keys - self.cf.register_option('d.a', 'foo', 'doc2') - self.cf.register_option('d.dep', 'bar', 'doc2') - assert self.cf.get_option('d.a') == 'foo' - assert self.cf.get_option('d.dep') == 'bar' + self.cf.register_option("d.a", "foo", "doc2") + self.cf.register_option("d.dep", "bar", "doc2") + assert self.cf.get_option("d.a") == "foo" + assert self.cf.get_option("d.dep") == "bar" - self.cf.deprecate_option('d.dep', rkey='d.a') # reroute d.dep to d.a + self.cf.deprecate_option("d.dep", rkey="d.a") # reroute d.dep to d.a with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - assert self.cf.get_option('d.dep') == 'foo' + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "foo" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.set_option('d.dep', 'baz') # should overwrite "d.a" + warnings.simplefilter("always") + self.cf.set_option("d.dep", "baz") # should overwrite "d.a" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - assert self.cf.get_option('d.dep') == 'baz' + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "baz" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message def test_config_prefix(self): with self.cf.config_prefix("base"): - self.cf.register_option('a', 1, "doc1") - self.cf.register_option('b', 2, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b') == 2 + self.cf.register_option("a", 1, "doc1") + self.cf.register_option("b", 2, "doc2") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 - self.cf.set_option('a', 3) - self.cf.set_option('b', 4) - assert self.cf.get_option('a') == 3 - assert self.cf.get_option('b') == 4 + self.cf.set_option("a", 3) + self.cf.set_option("b", 4) + assert self.cf.get_option("a") == 3 + assert self.cf.get_option("b") == 4 - assert self.cf.get_option('base.a') == 3 - assert self.cf.get_option('base.b') == 4 - assert 'doc1' in self.cf.describe_option('base.a', _print_desc=False) - assert 'doc2' in self.cf.describe_option('base.b', _print_desc=False) + assert self.cf.get_option("base.a") == 3 + assert self.cf.get_option("base.b") == 4 + assert "doc1" in self.cf.describe_option("base.a", _print_desc=False) + assert "doc2" in self.cf.describe_option("base.b", _print_desc=False) - self.cf.reset_option('base.a') - self.cf.reset_option('base.b') + self.cf.reset_option("base.a") + self.cf.reset_option("base.b") with self.cf.config_prefix("base"): - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b') == 2 + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 def test_callback(self): k = [None] @@ -372,8 +363,8 @@ def callback(key): k.append(key) v.append(self.cf.get_option(key)) - self.cf.register_option('d.a', 'foo', cb=callback) - self.cf.register_option('d.b', 'foo', cb=callback) + self.cf.register_option("d.a", "foo", cb=callback) + self.cf.register_option("d.b", "foo", cb=callback) del k[-1], 
v[-1] self.cf.set_option("d.a", "fooz") @@ -393,7 +384,7 @@ def test_set_ContextManager(self): def eq(val): assert self.cf.get_option("a") == val - self.cf.register_option('a', 0) + self.cf.register_option("a", 0) eq(0) with self.cf.option_context("a", 15): eq(15) @@ -411,8 +402,8 @@ def test_attribute_access(self): def f3(key): holder.append(True) - self.cf.register_option('a', 0) - self.cf.register_option('c', 0, cb=f3) + self.cf.register_option("a", 0) + self.cf.register_option("c", 0, cb=f3) options = self.cf.options assert options.a == 0 @@ -442,7 +433,7 @@ def test_option_context_scope(self): original_value = 60 context_value = 10 - option_name = 'a' + option_name = "a" self.cf.register_option(option_name, original_value) @@ -462,4 +453,4 @@ def test_dictwrapper_getattr(self): # GH 19789 with pytest.raises(OptionError, match="No such option"): options.bananas - assert not hasattr(options, 'bananas') + assert not hasattr(options, "bananas") diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index c63465ff0c4641..20a5be0c8a2897 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -12,11 +12,13 @@ _current_locale = locale.getlocale() # Don't run any of these tests if we are on Windows or have no locales. -pytestmark = pytest.mark.skipif(is_platform_windows() or not _all_locales, - reason="Need non-Windows and locales") +pytestmark = pytest.mark.skipif( + is_platform_windows() or not _all_locales, reason="Need non-Windows and locales" +) _skip_if_only_one_locale = pytest.mark.skipif( - len(_all_locales) <= 1, reason="Need multiple locales for meaningful test") + len(_all_locales) <= 1, reason="Need multiple locales for meaningful test" +) def test_can_set_locale_valid_set(): diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index c8a35e692e2b7f..4ff3be5dfaa724 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -15,6 +15,6 @@ def test_cast_1d_array_like_from_scalar_categorical(): expected = Categorical(["a", "a"], categories=cats) result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) - tm.assert_categorical_equal(result, expected, - check_category_order=True, - check_dtype=True) + tm.assert_categorical_equal( + result, expected, check_category_order=True, check_dtype=True + ) diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index 8653206be9156b..da3789a87aa074 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -6,13 +6,16 @@ from pandas.util import testing as tm -@pytest.mark.parametrize('values, dtype, expected', [ - ([1, 2, 3], None, np.array([1, 2, 3])), - (np.array([1, 2, 3]), None, np.array([1, 2, 3])), - (['1', '2', None], None, np.array(['1', '2', None])), - (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), - ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), -]) +@pytest.mark.parametrize( + "values, dtype, expected", + [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (["1", "2", None], None, np.array(["1", "2", None])), + (["1", "2", None], np.dtype("str"), np.array(["1", "2", None])), + ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])), + ], +) def 
test_construct_1d_ndarray_preserving_na(values, dtype, expected): result = construct_1d_ndarray_preserving_na(values, dtype=dtype) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_construct_object_arr.py b/pandas/tests/dtypes/cast/test_construct_object_arr.py index 15277b198f675f..cb44f91f34dec8 100644 --- a/pandas/tests/dtypes/cast/test_construct_object_arr.py +++ b/pandas/tests/dtypes/cast/test_construct_object_arr.py @@ -3,8 +3,8 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -@pytest.mark.parametrize("datum1", [1, 2., "3", (4, 5), [6, 7], None]) -@pytest.mark.parametrize("datum2", [8, 9., "10", (11, 12), [13, 14], None]) +@pytest.mark.parametrize("datum1", [1, 2.0, "3", (4, 5), [6, 7], None]) +@pytest.mark.parametrize("datum2", [8, 9.0, "10", (11, 12), [13, 14], None]) def test_cast_1d_array(datum1, datum2): data = [datum1, datum2] result = construct_1d_object_array_from_listlike(data) @@ -14,7 +14,7 @@ def test_cast_1d_array(datum1, datum2): assert list(result) == data -@pytest.mark.parametrize("val", [1, 2., None]) +@pytest.mark.parametrize("val", [1, 2.0, None]) def test_cast_1d_array_invalid_scalar(val): with pytest.raises(TypeError, match="has no len()"): construct_1d_object_array_from_listlike(val) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 61aba9ed41f593..d574b03a8c7249 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -7,16 +7,26 @@ from pandas.util import testing as tm -@pytest.mark.parametrize("arr,dtype,expected", [ - (np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), "infer", - np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995])), - - (np.array([8., 8., 8., 8., 8.9999999999995]), "infer", - np.array([8, 8, 8, 8, 9], dtype=np.int64)), - - (np.array([8., 8., 8., 8., 9.0000000000005]), "infer", - np.array([8, 8, 8, 8, 9], dtype=np.int64)), -]) +@pytest.mark.parametrize( + "arr,dtype,expected", + [ + ( + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + "infer", + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 8.9999999999995]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 9.0000000000005]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ], +) def test_downcast(arr, expected, dtype): result = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index c48657bb272ccd..ac7a5221d3469e 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -2,61 +2,63 @@ import pytest from pandas.core.dtypes.cast import find_common_type -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, PeriodDtype) - - -@pytest.mark.parametrize("source_dtypes,expected_common_dtype", [ - ((np.int64,), np.int64), - ((np.uint64,), np.uint64), - ((np.float32,), np.float32), - ((np.object,), np.object), - - # Into ints. - ((np.int16, np.int64), np.int64), - ((np.int32, np.uint32), np.int64), - ((np.uint16, np.uint64), np.uint64), - - # Into floats. 
- ((np.float16, np.float32), np.float32), - ((np.float16, np.int16), np.float32), - ((np.float32, np.int16), np.float32), - ((np.uint64, np.int64), np.float64), - ((np.int16, np.float64), np.float64), - ((np.float16, np.int64), np.float64), - - # Into others. - ((np.complex128, np.int32), np.complex128), - ((np.object, np.float32), np.object), - ((np.object, np.int16), np.object), - - # Bool with int. - ((np.dtype("bool"), np.int64), np.object), - ((np.dtype("bool"), np.int32), np.object), - ((np.dtype("bool"), np.int16), np.object), - ((np.dtype("bool"), np.int8), np.object), - ((np.dtype("bool"), np.uint64), np.object), - ((np.dtype("bool"), np.uint32), np.object), - ((np.dtype("bool"), np.uint16), np.object), - ((np.dtype("bool"), np.uint8), np.object), - - # Bool with float. - ((np.dtype("bool"), np.float64), np.object), - ((np.dtype("bool"), np.float32), np.object), - - ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), - np.dtype("datetime64[ns]")), - ((np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), - np.dtype("timedelta64[ns]")), - - ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), - np.dtype("datetime64[ns]")), - ((np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), - np.dtype("timedelta64[ns]")), - - ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), - ((np.dtype("datetime64[ns]"), np.int64), np.object) -]) +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype + + +@pytest.mark.parametrize( + "source_dtypes,expected_common_dtype", + [ + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + # Into ints. + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + # Into floats. + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + # Into others. + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + # Bool with int. + ((np.dtype("bool"), np.int64), np.object), + ((np.dtype("bool"), np.int32), np.object), + ((np.dtype("bool"), np.int16), np.object), + ((np.dtype("bool"), np.int8), np.object), + ((np.dtype("bool"), np.uint64), np.object), + ((np.dtype("bool"), np.uint32), np.object), + ((np.dtype("bool"), np.uint16), np.object), + ((np.dtype("bool"), np.uint8), np.object), + # Bool with float. 
+ ((np.dtype("bool"), np.float64), np.object), + ((np.dtype("bool"), np.float32), np.object), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), + ((np.dtype("datetime64[ns]"), np.int64), np.object), + ], +) def test_numpy_dtypes(source_dtypes, expected_common_dtype): assert find_common_type(source_dtypes) == expected_common_dtype @@ -66,11 +68,14 @@ def test_raises_empty_input(): find_common_type([]) -@pytest.mark.parametrize("dtypes,exp_type", [ - ([CategoricalDtype()], "category"), - ([np.object, CategoricalDtype()], np.object), - ([CategoricalDtype(), CategoricalDtype()], "category"), -]) +@pytest.mark.parametrize( + "dtypes,exp_type", + [ + ([CategoricalDtype()], "category"), + ([np.object, CategoricalDtype()], np.object), + ([CategoricalDtype(), CategoricalDtype()], "category"), + ], +) def test_categorical_dtype(dtypes, exp_type): assert find_common_type(dtypes) == exp_type @@ -80,10 +85,15 @@ def test_datetimetz_dtype_match(): assert find_common_type([dtype, dtype]) == "datetime64[ns, US/Eastern]" -@pytest.mark.parametrize("dtype2", [ - DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), - np.dtype("datetime64[ns]"), np.object, np.int64 -]) +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) def test_datetimetz_dtype_mismatch(dtype2): dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") assert find_common_type([dtype, dtype2]) == np.object @@ -95,11 +105,17 @@ def test_period_dtype_match(): assert find_common_type([dtype, dtype]) == "period[D]" -@pytest.mark.parametrize("dtype2", [ - DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), - PeriodDtype(freq="2D"), PeriodDtype(freq="H"), - np.dtype("datetime64[ns]"), np.object, np.int64 -]) +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + PeriodDtype(freq="2D"), + PeriodDtype(freq="H"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) def test_period_dtype_mismatch(dtype2): dtype = PeriodDtype(freq="D") assert find_common_type([dtype, dtype2]) == np.object diff --git a/pandas/tests/dtypes/cast/test_infer_datetimelike.py b/pandas/tests/dtypes/cast/test_infer_datetimelike.py index 3ff7b02b81342e..f4253e9d9e37b9 100644 --- a/pandas/tests/dtypes/cast/test_infer_datetimelike.py +++ b/pandas/tests/dtypes/cast/test_infer_datetimelike.py @@ -4,11 +4,14 @@ from pandas import DataFrame, NaT, Series, Timestamp -@pytest.mark.parametrize("data,exp_size", [ - # see gh-16362. - ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), - ([[NaT, "a", 0], [NaT, "b", 1]], 6) -]) +@pytest.mark.parametrize( + "data,exp_size", + [ + # see gh-16362. 
+ ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), + ([[NaT, "a", 0], [NaT, "b", 1]], 6), + ], +) def test_maybe_infer_to_datetimelike_df_construct(data, exp_size): result = DataFrame(np.array(data)) assert result.size == exp_size diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 88c91243fcd741..602b2f26eaa4ae 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -4,11 +4,13 @@ import pytest from pandas.core.dtypes.cast import ( - cast_scalar_to_array, infer_dtype_from_array, infer_dtype_from_scalar) + cast_scalar_to_array, + infer_dtype_from_array, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import is_dtype_equal -from pandas import ( - Categorical, Period, Series, Timedelta, Timestamp, date_range) +from pandas import Categorical, Period, Series, Timedelta, Timestamp, date_range from pandas.util import testing as tm @@ -33,9 +35,7 @@ def test_infer_dtype_from_float_scalar(float_dtype): assert dtype == float_dtype -@pytest.mark.parametrize("data,exp_dtype", [ - (12, np.int64), (np.float(12), np.float64) -]) +@pytest.mark.parametrize("data,exp_dtype", [(12, np.int64), (np.float(12), np.float64)]) def test_infer_dtype_from_python_scalar(data, exp_dtype): dtype, val = infer_dtype_from_scalar(data) assert dtype == exp_dtype @@ -53,15 +53,15 @@ def test_infer_dtype_from_complex(complex_dtype): assert dtype == np.complex_ -@pytest.mark.parametrize("data", [np.datetime64(1, "ns"), Timestamp(1), - datetime(2000, 1, 1, 0, 0)]) +@pytest.mark.parametrize( + "data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)] +) def test_infer_dtype_from_datetime(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == "M8[ns]" -@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), - timedelta(1)]) +@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)]) def test_infer_dtype_from_timedelta(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == "m8[ns]" @@ -83,8 +83,9 @@ def test_infer_dtype_from_period(freq, pandas_dtype): assert val == exp_val -@pytest.mark.parametrize("data", [date(2000, 1, 1), "foo", - Timestamp(1, tz="US/Eastern")]) +@pytest.mark.parametrize( + "data", [date(2000, 1, 1), "foo", Timestamp(1, tz="US/Eastern")] +) def test_infer_dtype_misc(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == np.object_ @@ -115,39 +116,51 @@ def test_infer_dtype_from_scalar_errors(): @pytest.mark.parametrize( "arr, expected, pandas_dtype", - [("foo", np.object_, False), - (b"foo", np.object_, False), - (1, np.int_, False), - (1.5, np.float_, False), - ([1], np.int_, False), - (np.array([1], dtype=np.int64), np.int64, False), - ([np.nan, 1, ""], np.object_, False), - (np.array([[1.0, 2.0]]), np.float_, False), - (Categorical(list("aabc")), np.object_, False), - (Categorical([1, 2, 3]), np.int64, False), - (Categorical(list("aabc")), "category", True), - (Categorical([1, 2, 3]), "category", True), - (Timestamp("20160101"), np.object_, False), - (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False), - (date_range("20160101", periods=3), - np.dtype("=M8[ns]"), False), - (date_range("20160101", periods=3, tz="US/Eastern"), - "datetime64[ns, US/Eastern]", True), - (Series([1., 2, 3]), np.float64, False), - (Series(list("abc")), np.object_, False), - (Series(date_range("20160101", periods=3, tz="US/Eastern")), - "datetime64[ns, US/Eastern]", True)]) + [ + ("foo", np.object_, False), + 
(b"foo", np.object_, False), + (1, np.int_, False), + (1.5, np.float_, False), + ([1], np.int_, False), + (np.array([1], dtype=np.int64), np.int64, False), + ([np.nan, 1, ""], np.object_, False), + (np.array([[1.0, 2.0]]), np.float_, False), + (Categorical(list("aabc")), np.object_, False), + (Categorical([1, 2, 3]), np.int64, False), + (Categorical(list("aabc")), "category", True), + (Categorical([1, 2, 3]), "category", True), + (Timestamp("20160101"), np.object_, False), + (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False), + (date_range("20160101", periods=3), np.dtype("=M8[ns]"), False), + ( + date_range("20160101", periods=3, tz="US/Eastern"), + "datetime64[ns, US/Eastern]", + True, + ), + (Series([1.0, 2, 3]), np.float64, False), + (Series(list("abc")), np.object_, False), + ( + Series(date_range("20160101", periods=3, tz="US/Eastern")), + "datetime64[ns, US/Eastern]", + True, + ), + ], +) def test_infer_dtype_from_array(arr, expected, pandas_dtype): dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype) assert is_dtype_equal(dtype, expected) -@pytest.mark.parametrize("obj,dtype", [ - (1, np.int64), (1.1, np.float64), - (Timestamp("2011-01-01"), "datetime64[ns]"), - (Timestamp("2011-01-01", tz="US/Eastern"), np.object), - (Period("2011-01-01", freq="D"), np.object) -]) +@pytest.mark.parametrize( + "obj,dtype", + [ + (1, np.int64), + (1.1, np.float64), + (Timestamp("2011-01-01"), "datetime64[ns]"), + (Timestamp("2011-01-01", tz="US/Eastern"), np.object), + (Period("2011-01-01", freq="D"), np.object), + ], +) def test_cast_scalar_to_array(obj, dtype): shape = (3, 2) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 5a5b5d47b3ccca..44aebd4d277f24 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -12,17 +12,38 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, - is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype) + is_complex_dtype, + is_datetime64_dtype, + is_datetime_or_timedelta_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype import pandas as pd -@pytest.fixture(params=[bool, 'uint8', 'int32', 'uint64', 'float32', 'float64', - 'complex64', 'complex128', 'M8[ns]', 'm8[ns]', str, - bytes, object]) +@pytest.fixture( + params=[ + bool, + "uint8", + "int32", + "uint64", + "float32", + "float64", + "complex64", + "complex128", + "M8[ns]", + "m8[ns]", + str, + bytes, + object, + ] +) def any_numpy_dtype_reduced(request): """ Parameterized fixture for numpy dtypes, reduced from any_numpy_dtype. @@ -43,8 +64,10 @@ def any_numpy_dtype_reduced(request): return request.param -@pytest.fixture(params=[(True, None), (True, object), (False, None)], - ids=['True-None', 'True-object', 'False-None']) +@pytest.fixture( + params=[(True, None), (True, object), (False, None)], + ids=["True-None", "True-object", "False-None"], +) def box(request): """ Parametrized fixture determining whether/how to transform fill_value. 
@@ -81,8 +104,15 @@ def _safe_dtype_assert(left_dtype, right_dtype): assert left_dtype == right_dtype -def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar=None, exp_val_for_array=None): +def _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar=None, + exp_val_for_array=None, +): """ Auxiliary function to unify testing of scalar/array promotion. @@ -129,17 +159,18 @@ def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, # for equal values, also check type (relevant e.g. for int vs float, resp. # for different datetimes and timedeltas) - match_value = (result_fill_value == expected_fill_value - # disabled type check due to too many xfails; GH 23982/25425 - # and type(result_fill_value) == type(expected_fill_value) - ) + match_value = ( + result_fill_value + == expected_fill_value + # disabled type check due to too many xfails; GH 23982/25425 + # and type(result_fill_value) == type(expected_fill_value) + ) # for missing values, None == None and iNaT == iNaT (which is checked # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT - match_missing = ((result_fill_value is np.nan - and expected_fill_value is np.nan) - or (result_fill_value is NaT - and expected_fill_value is NaT)) + match_missing = (result_fill_value is np.nan and expected_fill_value is np.nan) or ( + result_fill_value is NaT and expected_fill_value is NaT + ) assert match_value or match_missing @@ -150,14 +181,14 @@ def test_maybe_promote_int_with_int(): # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (False, None)]) +@pytest.mark.parametrize("box", [(True, None), (False, None)]) def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): dtype = np.dtype(any_int_dtype) fill_dtype = np.dtype(float_dtype) boxed, box_dtype = box # read from parametrized fixture - if float_dtype == 'float32' and not boxed: - pytest.xfail('falsely upcasts to float64') + if float_dtype == "float32" and not boxed: + pytest.xfail("falsely upcasts to float64") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -168,12 +199,19 @@ def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): exp_val_for_scalar = np.float64(fill_value) exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (False, None)]) +@pytest.mark.parametrize("box", [(True, None), (False, None)]) def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): dtype = np.dtype(float_dtype) @@ -190,8 +228,15 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_float_with_float(): @@ -205,10 +250,9 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): boxed, box_dtype = box # read from parametrized 
fixture if boxed and fill_dtype == bool: - pytest.xfail('falsely upcasts to object') - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrongly casts fill_value') + pytest.xfail("falsely upcasts to object") + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrongly casts fill_value") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -218,8 +262,15 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): exp_val_for_scalar = fill_value exp_val_for_array = np.nan if fill_dtype != bool else None - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): @@ -228,11 +279,11 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): boxed, box_dtype = box # read from parametrized fixture if boxed and dtype == bool: - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") if boxed and dtype not in (str, object) and box_dtype is None: - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # filling anything but bool with bool casts to object expected_dtype = np.dtype(object) if dtype != bool else dtype @@ -240,8 +291,15 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan if dtype != bool else None - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_bytes_with_any(): @@ -260,38 +318,49 @@ def test_maybe_promote_datetime64_with_any(): # override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype - # (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) -@pytest.mark.parametrize('fill_value', [ - pd.Timestamp('now'), np.datetime64('now'), - datetime.datetime.now(), datetime.date.today() -], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) -def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, - datetime64_dtype, fill_value, box): +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype + # (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def 
test_maybe_promote_any_with_datetime64( + any_numpy_dtype_reduced, datetime64_dtype, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if is_datetime64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype is None - and not is_datetime64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and not is_datetime64_dtype(type(fill_value))) + ): + pytest.xfail("falsely upcasts to object") else: - if (boxed and (box_dtype == 'dt_dtype' - or (box_dtype is None - and is_datetime64_dtype(type(fill_value))))): - pytest.xfail('mix of lack of upcasting, resp. wrong missing value') + if boxed and ( + box_dtype == "dt_dtype" + or (box_dtype is None and is_datetime64_dtype(type(fill_value))) + ): + pytest.xfail("mix of lack of upcasting, resp. wrong missing value") if not boxed and is_timedelta64_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # special case for box_dtype - box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' - else box_dtype) + box_dtype = np.dtype(datetime64_dtype) if box_dtype == "dt_dtype" else box_dtype # filling datetime with anything but datetime casts to object if is_datetime64_dtype(dtype): @@ -304,20 +373,28 @@ def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, object)]) +@pytest.mark.parametrize("box", [(True, object)]) def test_maybe_promote_datetimetz_with_any_numpy_dtype( - tz_aware_fixture, any_numpy_dtype_reduced, box): + tz_aware_fixture, any_numpy_dtype_reduced, box +): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if box_dtype != object: - pytest.xfail('does not upcast correctly') + pytest.xfail("does not upcast correctly") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -327,25 +404,34 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (True, object)]) -def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, - tz_aware_fixture2, box): +@pytest.mark.parametrize("box", [(True, None), (True, object)]) +def test_maybe_promote_datetimetz_with_datetimetz( + tz_aware_fixture, tz_aware_fixture2, box +): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) boxed, box_dtype = box # read from parametrized fixture from dateutil.tz import tzlocal + if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + pytest.xfail("Cannot process fill_value with 
this dtype, see GH 24310") if dtype.tz == fill_dtype.tz and boxed: - pytest.xfail('falsely upcasts') + pytest.xfail("falsely upcasts") if dtype.tz != fill_dtype.tz and not boxed: - pytest.xfail('falsely upcasts') + pytest.xfail("falsely upcasts") # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -359,49 +445,72 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, expected_dtype = np.dtype(object) exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], - ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize( + "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] +) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(False, None)]) +@pytest.mark.parametrize("box", [(False, None)]) def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture - if (boxed and (box_dtype == object - or (box_dtype is None - and (fill_value is None or fill_value is NaT)))): - pytest.xfail('false upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and (fill_value is None or fill_value is NaT)) + ): + pytest.xfail("false upcasts to object") # takes the opinion that DatetimeTZ should have single na-marker # using iNaT would lead to errors elsewhere -> NaT if not boxed and fill_value == iNaT: - pytest.xfail('wrong missing value marker') + pytest.xfail("wrong missing value marker") expected_dtype = dtype # DatetimeTZDtype does not use iNaT as missing value marker exp_val_for_scalar = NaT exp_val_for_array = NaT - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) - - -@pytest.mark.parametrize('fill_value', [ - pd.Timestamp('now'), np.datetime64('now'), - datetime.datetime.now(), datetime.date.today() -], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) def test_maybe_promote_any_numpy_dtype_with_datetimetz( - any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box): + any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture if is_datetime_or_timedelta_dtype(dtype) and not boxed: - pytest.xfail('raises error') + pytest.xfail("raises error") fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] @@ -410,8 +519,15 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + 
exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_timedelta64_with_any(): @@ -419,44 +535,55 @@ def test_maybe_promote_timedelta64_with_any(): pass -@pytest.mark.parametrize('fill_value', [ - pd.Timedelta(days=1), np.timedelta64(24, 'h'), datetime.timedelta(1) -], ids=['pd.Timedelta', 'np.timedelta64', 'datetime.timedelta']) +@pytest.mark.parametrize( + "fill_value", + [pd.Timedelta(days=1), np.timedelta64(24, "h"), datetime.timedelta(1)], + ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], +) # override parametrization of box to add special case for td_dtype -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) def test_maybe_promote_any_with_timedelta64( - any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box): + any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if is_timedelta64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype is None - and not is_timedelta64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and not is_timedelta64_dtype(type(fill_value))) + ): + pytest.xfail("falsely upcasts to object") else: - if (boxed and box_dtype is None - and is_timedelta64_dtype(type(fill_value))): - pytest.xfail('does not upcast correctly') - if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) + if boxed and box_dtype is None and is_timedelta64_dtype(type(fill_value)): + pytest.xfail("does not upcast correctly") + if ( + not boxed + and is_timedelta64_dtype(type(fill_value)) + and ( + is_integer_dtype(dtype) + or is_float_dtype(dtype) or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') - if box_dtype == 'td_dtype': - pytest.xfail('falsely upcasts') + or issubclass(dtype.type, np.bytes_) + ) + ): + pytest.xfail("does not upcast correctly") + if box_dtype == "td_dtype": + pytest.xfail("falsely upcasts") if not boxed and is_datetime64_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # special case for box_dtype - box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' - else box_dtype) + box_dtype = np.dtype(timedelta64_dtype) if box_dtype == "td_dtype" else box_dtype # filling anything but timedelta with timedelta casts to object if is_timedelta64_dtype(dtype): @@ -469,19 +596,24 @@ def test_maybe_promote_any_with_timedelta64( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def 
test_maybe_promote_string_with_any(string_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, box): dtype = np.dtype(string_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrong missing value marker") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -491,56 +623,76 @@ def test_maybe_promote_string_with_any(string_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization of box to add special case for str -@pytest.mark.parametrize('box', [ - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, None), # fill_value wrapped in array with default dtype - # (True, 'str'), # fill_value wrapped in array with generic string-dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) -def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, - string_dtype, box): +@pytest.mark.parametrize( + "box", + [ + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, None), # fill_value wrapped in array with default dtype + # (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(string_dtype) boxed, box_dtype = box # read from parametrized fixture if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: - pytest.xfail('does not upcast or raises') - if (boxed and box_dtype in (None, 'str') and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) + pytest.xfail("does not upcast or raises") + if ( + boxed + and box_dtype in (None, "str") + and ( + is_integer_dtype(dtype) + or is_float_dtype(dtype) or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') + or issubclass(dtype.type, np.bytes_) + ) + ): + pytest.xfail("does not upcast correctly") # create array of given dtype - fill_value = 'abc' + fill_value = "abc" # special case for box_dtype (cannot use fixture in parametrization) - box_dtype = fill_dtype if box_dtype == 'str' else box_dtype + box_dtype = fill_dtype if box_dtype == "str" else box_dtype # filling anything with a string casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def test_maybe_promote_object_with_any(object_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced, box): dtype = np.dtype(object_dtype) fill_dtype = 
np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrong missing value marker") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -550,17 +702,23 @@ def test_maybe_promote_object_with_any(object_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, - object_dtype, box): +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # create array of object dtype from a scalar value (i.e. passing # dtypes.common.is_scalar), which can however not be cast to int/float etc. @@ -571,42 +729,61 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], - ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize( + "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] +) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(False, None)]) -def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, - fill_value, box): +@pytest.mark.parametrize("box", [(False, None)]) +def test_maybe_promote_any_numpy_dtype_with_na( + any_numpy_dtype_reduced, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (dtype == bytes and not boxed - and fill_value is not None and fill_value is not NaT): - pytest.xfail('does not upcast to object') - elif dtype == 'uint64' and not boxed and fill_value == iNaT: - pytest.xfail('does not upcast correctly') + if ( + dtype == bytes + and not boxed + and fill_value is not None + and fill_value is not NaT + ): + pytest.xfail("does not upcast to object") + elif dtype == "uint64" and not boxed and fill_value == iNaT: + pytest.xfail("does not upcast correctly") elif is_datetime_or_timedelta_dtype(dtype) and boxed: - pytest.xfail('falsely upcasts to object') - elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) - or is_complex_dtype(dtype)) - and fill_value is not NaT and dtype != 'uint64'): - pytest.xfail('falsely upcasts to object') - elif (boxed and dtype == 'uint64' - and (fill_value is np.nan or fill_value is None)): - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") + elif ( + boxed + and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) or is_complex_dtype(dtype) + ) + and fill_value is not NaT + and dtype != "uint64" + ): + 
pytest.xfail("falsely upcasts to object") + elif boxed and dtype == "uint64" and (fill_value is np.nan or fill_value is None): + pytest.xfail("falsely upcasts to object") # below: opinionated that iNaT should be interpreted as missing value - elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) - and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - elif ((is_string_dtype(dtype) or dtype == bool) - and not boxed and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - - if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + elif ( + not boxed + and (is_float_dtype(dtype) or is_complex_dtype(dtype)) + and fill_value == iNaT + ): + pytest.xfail("does not cast to missing value marker correctly") + elif (is_string_dtype(dtype) or dtype == bool) and not boxed and fill_value == iNaT: + pytest.xfail("does not cast to missing value marker correctly") + + if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT: # uint64 + negative int casts to object; iNaT is considered as missing expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan @@ -649,11 +826,18 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, else: # expected_dtype = float / complex / object exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('dim', [0, 2, 3]) +@pytest.mark.parametrize("dim", [0, 2, 3]) def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): dtype = np.dtype(any_numpy_dtype_reduced) @@ -666,12 +850,13 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): # test against 1-dimensional case expected_dtype, expected_missing_value = maybe_promote( - dtype, np.array([1], dtype=dtype)) + dtype, np.array([1], dtype=dtype) + ) result_dtype, result_missing_value = maybe_promote(dtype, fill_array) assert result_dtype == expected_dtype # None == None, iNaT == iNaT, but np.nan != np.nan - assert ((result_missing_value == expected_missing_value) - or (result_missing_value is np.nan - and expected_missing_value is np.nan)) + assert (result_missing_value == expected_missing_value) or ( + result_missing_value is np.nan and expected_missing_value is np.nan + ) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index 8d5f21806cf46c..f076bcd5780018 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -7,11 +7,7 @@ from pandas.util import testing as tm -@pytest.mark.parametrize("result", [ - Series([10, 11, 12]), - [10, 11, 12], - (10, 11, 12) -]) +@pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) def test_upcast_error(result): # GH23823 mask = np.array([False, True, False]) @@ -20,18 +16,26 @@ def test_upcast_error(result): result, _ = maybe_upcast_putmask(result, mask, other) -@pytest.mark.parametrize("arr, other, exp_changed, expected", [ - (np.arange(1, 6), np.array([61, 62, 63]), - False, np.array([1, 61, 3, 62, 63])), - (np.arange(1, 6), np.array([61.1, 62.2, 63.3]), - True, np.array([1, 61.1, 3, 62.2, 63.3])), - (np.arange(1, 6), np.nan, - True, np.array([1, np.nan, 3, np.nan, np.nan])), - (np.arange(10, 15), np.array([61, 62]), - False, np.array([10, 61, 12, 62, 61])), - (np.arange(10, 15), 
np.array([61, np.nan]), - True, np.array([10, 61, 12, np.nan, 61])) -]) +@pytest.mark.parametrize( + "arr, other, exp_changed, expected", + [ + (np.arange(1, 6), np.array([61, 62, 63]), False, np.array([1, 61, 3, 62, 63])), + ( + np.arange(1, 6), + np.array([61.1, 62.2, 63.3]), + True, + np.array([1, 61.1, 3, 62.2, 63.3]), + ), + (np.arange(1, 6), np.nan, True, np.array([1, np.nan, 3, np.nan, np.nan])), + (np.arange(10, 15), np.array([61, 62]), False, np.array([10, 61, 12, 62, 61])), + ( + np.arange(10, 15), + np.array([61, np.nan]), + True, + np.array([10, 61, 12, np.nan, 61]), + ), + ], +) def test_upcast(arr, other, exp_changed, expected): # GH23823 mask = np.array([False, True, False, True, True]) @@ -41,20 +45,44 @@ def test_upcast(arr, other, exp_changed, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("arr, other, exp_changed, expected", [ - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), - np.arange('2018-01-01', '2018-01-04', dtype='datetime64[D]'), - False, np.array(['2019-01-01', '2018-01-01', '2019-01-03', - '2018-01-02', '2018-01-03'], dtype='datetime64[D]')), - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), np.nan, - False, np.array(['2019-01-01', np.datetime64('NaT'), - '2019-01-03', np.datetime64('NaT'), - np.datetime64('NaT')], dtype='datetime64[D]')), - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), - np.arange('2018-01-01', '2018-01-03', dtype='datetime64[D]'), - False, np.array(['2019-01-01', '2018-01-01', '2019-01-03', - '2018-01-02', '2018-01-01'], dtype='datetime64[D]')) -]) +@pytest.mark.parametrize( + "arr, other, exp_changed, expected", + [ + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), + False, + np.array( + ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-03"], + dtype="datetime64[D]", + ), + ), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.nan, + False, + np.array( + [ + "2019-01-01", + np.datetime64("NaT"), + "2019-01-03", + np.datetime64("NaT"), + np.datetime64("NaT"), + ], + dtype="datetime64[D]", + ), + ), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), + False, + np.array( + ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-01"], + dtype="datetime64[D]", + ), + ), + ], +) def test_upcast_datetime(arr, other, exp_changed, expected): # GH23823 mask = np.array([False, True, False, True, True]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 675abec661b5ad..27ae918b015fe8 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -5,25 +5,32 @@ import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( - CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, IntervalDtype, - PeriodDtype) + CategoricalDtype, + CategoricalDtypeType, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) import pandas as pd from pandas.conftest import ( - ALL_EA_INT_DTYPES, ALL_INT_DTYPES, SIGNED_EA_INT_DTYPES, SIGNED_INT_DTYPES, - UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES) + ALL_EA_INT_DTYPES, + ALL_INT_DTYPES, + SIGNED_EA_INT_DTYPES, + SIGNED_INT_DTYPES, + UNSIGNED_EA_INT_DTYPES, + UNSIGNED_INT_DTYPES, +) from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm -ignore_sparse_warning = pytest.mark.filterwarnings( - "ignore:Sparse:FutureWarning" -) 
+ignore_sparse_warning = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") # EA & Actual Dtypes def to_ea_dtypes(dtypes): """ convert list of string dtypes to EA dtype """ - return [getattr(pd, dt + 'Dtype') for dt in dtypes] + return [getattr(pd, dt + "Dtype") for dt in dtypes] def to_numpy_dtypes(dtypes): @@ -35,64 +42,84 @@ class TestPandasDtype: # Passing invalid dtype, both as a string or object, must raise TypeError # Per issue GH15520 - @pytest.mark.parametrize('box', [pd.Timestamp, 'pd.Timestamp', list]) + @pytest.mark.parametrize("box", [pd.Timestamp, "pd.Timestamp", list]) def test_invalid_dtype_error(self, box): - with pytest.raises(TypeError, match='not understood'): + with pytest.raises(TypeError, match="not understood"): com.pandas_dtype(box) - @pytest.mark.parametrize('dtype', [ - object, 'float64', np.object_, np.dtype('object'), 'O', - np.float64, float, np.dtype('float64')]) + @pytest.mark.parametrize( + "dtype", + [ + object, + "float64", + np.object_, + np.dtype("object"), + "O", + np.float64, + float, + np.dtype("float64"), + ], + ) def test_pandas_dtype_valid(self, dtype): assert com.pandas_dtype(dtype) == dtype - @pytest.mark.parametrize('dtype', [ - 'M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']) + @pytest.mark.parametrize( + "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"] + ) def test_numpy_dtype(self, dtype): assert com.pandas_dtype(dtype) == np.dtype(dtype) def test_numpy_string_dtype(self): # do not parse freq-like string as period dtype - assert com.pandas_dtype('U') == np.dtype('U') - assert com.pandas_dtype('S') == np.dtype('S') - - @pytest.mark.parametrize('dtype', [ - 'datetime64[ns, US/Eastern]', - 'datetime64[ns, Asia/Tokyo]', - 'datetime64[ns, UTC]']) + assert com.pandas_dtype("U") == np.dtype("U") + assert com.pandas_dtype("S") == np.dtype("S") + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns, US/Eastern]", + "datetime64[ns, Asia/Tokyo]", + "datetime64[ns, UTC]", + ], + ) def test_datetimetz_dtype(self, dtype): - assert (com.pandas_dtype(dtype) == - DatetimeTZDtype.construct_from_string(dtype)) + assert com.pandas_dtype(dtype) == DatetimeTZDtype.construct_from_string(dtype) assert com.pandas_dtype(dtype) == dtype def test_categorical_dtype(self): - assert com.pandas_dtype('category') == CategoricalDtype() - - @pytest.mark.parametrize('dtype', [ - 'period[D]', 'period[3M]', 'period[U]', - 'Period[D]', 'Period[3M]', 'Period[U]']) + assert com.pandas_dtype("category") == CategoricalDtype() + + @pytest.mark.parametrize( + "dtype", + [ + "period[D]", + "period[3M]", + "period[U]", + "Period[D]", + "Period[3M]", + "Period[U]", + ], + ) def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) is PeriodDtype(dtype) assert com.pandas_dtype(dtype) == PeriodDtype(dtype) assert com.pandas_dtype(dtype) == dtype -dtypes = dict(datetime_tz=com.pandas_dtype('datetime64[ns, US/Eastern]'), - datetime=com.pandas_dtype('datetime64[ns]'), - timedelta=com.pandas_dtype('timedelta64[ns]'), - period=PeriodDtype('D'), - integer=np.dtype(np.int64), - float=np.dtype(np.float64), - object=np.dtype(np.object), - category=com.pandas_dtype('category')) +dtypes = dict( + datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"), + datetime=com.pandas_dtype("datetime64[ns]"), + timedelta=com.pandas_dtype("timedelta64[ns]"), + period=PeriodDtype("D"), + integer=np.dtype(np.int64), + float=np.dtype(np.float64), + object=np.dtype(np.object), + category=com.pandas_dtype("category"), +) -@pytest.mark.parametrize('name1,dtype1', - 
list(dtypes.items()), - ids=lambda x: str(x)) -@pytest.mark.parametrize('name2,dtype2', - list(dtypes.items()), - ids=lambda x: str(x)) +@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) +@pytest.mark.parametrize("name2,dtype2", list(dtypes.items()), ids=lambda x: str(x)) def test_dtype_equal(name1, dtype1, name2, dtype2): # match equal to self, but not equal to other @@ -101,16 +128,21 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) -@pytest.mark.parametrize("dtype1,dtype2", [ - (np.int8, np.int64), - (np.int16, np.int64), - (np.int32, np.int64), - (np.float32, np.float64), - (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType - (com.pandas_dtype("datetime64[ns, US/Eastern]"), - com.pandas_dtype("datetime64[ns, CET]")), # Datetime - (None, None) # gh-15941: no exception should be raised. -]) +@pytest.mark.parametrize( + "dtype1,dtype2", + [ + (np.int8, np.int64), + (np.int16, np.int64), + (np.int32, np.int64), + (np.float32, np.float64), + (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType + ( + com.pandas_dtype("datetime64[ns, US/Eastern]"), + com.pandas_dtype("datetime64[ns, CET]"), + ), # Datetime + (None, None), # gh-15941: no exception should be raised. + ], +) def test_dtype_equal_strict(dtype1, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) @@ -122,14 +154,11 @@ def get_is_dtype_funcs(): """ - fnames = [f for f in dir(com) if (f.startswith('is_') and - f.endswith('dtype'))] + fnames = [f for f in dir(com) if (f.startswith("is_") and f.endswith("dtype"))] return [getattr(com, fname) for fname in fnames] -@pytest.mark.parametrize('func', - get_is_dtype_funcs(), - ids=lambda x: x.__name__) +@pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__) def test_get_dtype_error_catch(func): # see gh-15941 # @@ -147,9 +176,9 @@ def test_is_object(): assert not com.is_object_dtype([1, 2, 3]) -@pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) -]) +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) @ignore_sparse_warning def test_is_sparse(check_scipy): assert com.is_sparse(pd.SparseArray([1, 2, 3])) @@ -159,6 +188,7 @@ def test_is_sparse(check_scipy): if check_scipy: import scipy.sparse + assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) @@ -166,6 +196,7 @@ def test_is_sparse(check_scipy): @ignore_sparse_warning def test_is_scipy_sparse(): from scipy.sparse import bsr_matrix + assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) @@ -213,8 +244,7 @@ def test_is_datetime64tz_dtype(): assert not com.is_datetime64tz_dtype(object) assert not com.is_datetime64tz_dtype([1, 2, 3]) assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) - assert com.is_datetime64tz_dtype(pd.DatetimeIndex(['2000'], - tz="US/Eastern")) + assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) def test_is_timedelta64_dtype(): @@ -222,14 +252,14 @@ def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(None) assert not com.is_timedelta64_dtype([1, 2, 3]) assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) - assert not com.is_timedelta64_dtype('0 days') + assert not com.is_timedelta64_dtype("0 days") assert not com.is_timedelta64_dtype("0 days 00:00:00") assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) assert not com.is_timedelta64_dtype("NO DATE") assert 
com.is_timedelta64_dtype(np.timedelta64) assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) - assert com.is_timedelta64_dtype(pd.to_timedelta(['0 days', '1 days'])) + assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"])) def test_is_period_dtype(): @@ -267,7 +297,7 @@ def test_is_string_dtype(): assert com.is_string_dtype(str) assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(['a', 'b'])) + assert com.is_string_dtype(np.array(["a", "b"])) def test_is_period_arraylike(): @@ -298,77 +328,119 @@ def test_is_datetimelike(): @pytest.mark.parametrize( - 'dtype', [ - pd.Series([1, 2])] + - ALL_INT_DTYPES + to_numpy_dtypes(ALL_INT_DTYPES) + - ALL_EA_INT_DTYPES + to_ea_dtypes(ALL_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2])] + + ALL_INT_DTYPES + + to_numpy_dtypes(ALL_INT_DTYPES) + + ALL_EA_INT_DTYPES + + to_ea_dtypes(ALL_EA_INT_DTYPES), +) def test_is_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', [str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)]) + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ], +) def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', [ - pd.Series([1, 2])] + - SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + - SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2])] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) def test_is_signed_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)] + - UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + - UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES)) + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', - [pd.Series([1, 2], dtype=np.uint32)] + - UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + - UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2], dtype=np.uint32)] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) def test_is_unsigned_integer_dtype(dtype): assert com.is_unsigned_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)] + - SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + - SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES)) + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) def 
test_is_not_unsigned_integer_dtype(dtype): assert not com.is_unsigned_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', - [np.int64, np.array([1, 2], dtype=np.int64), 'Int64', pd.Int64Dtype]) + "dtype", [np.int64, np.array([1, 2], dtype=np.int64), "Int64", pd.Int64Dtype] +) def test_is_int64_dtype(dtype): assert com.is_int64_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.int32, np.uint64, pd.Index([1, 2.]), - np.array(['a', 'b']), np.array([1, 2], dtype=np.uint32), - 'int8', 'Int8', pd.Int8Dtype]) + str, + float, + np.int32, + np.uint64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([1, 2], dtype=np.uint32), + "int8", + "Int8", + pd.Int8Dtype, + ], +) def test_is_not_int64_dtype(dtype): assert not com.is_int64_dtype(dtype) @@ -377,13 +449,14 @@ def test_is_datetime64_any_dtype(): assert not com.is_datetime64_any_dtype(int) assert not com.is_datetime64_any_dtype(str) assert not com.is_datetime64_any_dtype(np.array([1, 2])) - assert not com.is_datetime64_any_dtype(np.array(['a', 'b'])) + assert not com.is_datetime64_any_dtype(np.array(["a", "b"])) assert com.is_datetime64_any_dtype(np.datetime64) assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64)) assert com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime64_any_dtype( - pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ) def test_is_datetime64_ns_dtype(): @@ -391,7 +464,7 @@ def test_is_datetime64_ns_dtype(): assert not com.is_datetime64_ns_dtype(str) assert not com.is_datetime64_ns_dtype(np.datetime64) assert not com.is_datetime64_ns_dtype(np.array([1, 2])) - assert not com.is_datetime64_ns_dtype(np.array(['a', 'b'])) + assert not com.is_datetime64_ns_dtype(np.array(["a", "b"])) assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) # This datetime array has the wrong unit (ps instead of ns) @@ -399,34 +472,31 @@ def test_is_datetime64_ns_dtype(): assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime64_ns_dtype( - pd.DatetimeIndex([1, 2, 3], dtype=np.dtype('datetime64[ns]'))) + pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]")) + ) def test_is_timedelta64_ns_dtype(): - assert not com.is_timedelta64_ns_dtype(np.dtype('m8[ps]')) - assert not com.is_timedelta64_ns_dtype( - np.array([1, 2], dtype=np.timedelta64)) + assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]")) + assert not com.is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) - assert com.is_timedelta64_ns_dtype(np.dtype('m8[ns]')) - assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + assert com.is_timedelta64_ns_dtype(np.dtype("m8[ns]")) + assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(int) assert not com.is_datetime_or_timedelta_dtype(str) assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) - assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) + assert not com.is_datetime_or_timedelta_dtype(np.array(["a", "b"])) # TODO(jreback), this is slightly suspect - assert not com.is_datetime_or_timedelta_dtype( - DatetimeTZDtype("ns", "US/Eastern")) + assert not com.is_datetime_or_timedelta_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime_or_timedelta_dtype(np.datetime64) assert com.is_datetime_or_timedelta_dtype(np.timedelta64) - assert com.is_datetime_or_timedelta_dtype( 
- np.array([], dtype=np.timedelta64)) - assert com.is_datetime_or_timedelta_dtype( - np.array([], dtype=np.datetime64)) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64)) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64)) def test_is_numeric_v_string_like(): @@ -434,8 +504,7 @@ def test_is_numeric_v_string_like(): assert not com.is_numeric_v_string_like(1, "foo") assert not com.is_numeric_v_string_like("foo", "foo") assert not com.is_numeric_v_string_like(np.array([1]), np.array([2])) - assert not com.is_numeric_v_string_like( - np.array(["foo"]), np.array(["foo"])) + assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) assert com.is_numeric_v_string_like(np.array([1]), "foo") assert com.is_numeric_v_string_like("foo", np.array([1])) @@ -479,26 +548,25 @@ def test_needs_i8_conversion(): assert not com.needs_i8_conversion(str) assert not com.needs_i8_conversion(np.int64) assert not com.needs_i8_conversion(pd.Series([1, 2])) - assert not com.needs_i8_conversion(np.array(['a', 'b'])) + assert not com.needs_i8_conversion(np.array(["a", "b"])) assert com.needs_i8_conversion(np.datetime64) assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) - assert com.needs_i8_conversion(pd.DatetimeIndex( - ["2000"], tz="US/Eastern")) + assert com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern")) def test_is_numeric_dtype(): assert not com.is_numeric_dtype(str) assert not com.is_numeric_dtype(np.datetime64) assert not com.is_numeric_dtype(np.timedelta64) - assert not com.is_numeric_dtype(np.array(['a', 'b'])) + assert not com.is_numeric_dtype(np.array(["a", "b"])) assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64)) assert com.is_numeric_dtype(int) assert com.is_numeric_dtype(float) assert com.is_numeric_dtype(np.uint64) assert com.is_numeric_dtype(pd.Series([1, 2])) - assert com.is_numeric_dtype(pd.Index([1, 2.])) + assert com.is_numeric_dtype(pd.Index([1, 2.0])) def test_is_string_like_dtype(): @@ -506,25 +574,25 @@ def test_is_string_like_dtype(): assert not com.is_string_like_dtype(pd.Series([1, 2])) assert com.is_string_like_dtype(str) - assert com.is_string_like_dtype(np.array(['a', 'b'])) + assert com.is_string_like_dtype(np.array(["a", "b"])) def test_is_float_dtype(): assert not com.is_float_dtype(str) assert not com.is_float_dtype(int) assert not com.is_float_dtype(pd.Series([1, 2])) - assert not com.is_float_dtype(np.array(['a', 'b'])) + assert not com.is_float_dtype(np.array(["a", "b"])) assert com.is_float_dtype(float) - assert com.is_float_dtype(pd.Index([1, 2.])) + assert com.is_float_dtype(pd.Index([1, 2.0])) def test_is_bool_dtype(): assert not com.is_bool_dtype(int) assert not com.is_bool_dtype(str) assert not com.is_bool_dtype(pd.Series([1, 2])) - assert not com.is_bool_dtype(np.array(['a', 'b'])) - assert not com.is_bool_dtype(pd.Index(['a', 'b'])) + assert not com.is_bool_dtype(np.array(["a", "b"])) + assert not com.is_bool_dtype(pd.Index(["a", "b"])) assert com.is_bool_dtype(bool) assert com.is_bool_dtype(np.bool) @@ -532,9 +600,9 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(pd.Index([True, False])) -@pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) -]) +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) @ignore_sparse_warning def test_is_extension_type(check_scipy): assert not com.is_extension_type([1, 2, 3]) @@ -546,7 +614,7 @@ def 
test_is_extension_type(check_scipy): assert com.is_extension_type(pd.Series(cat)) assert com.is_extension_type(pd.SparseArray([1, 2, 3])) assert com.is_extension_type(pd.SparseSeries([1, 2, 3])) - assert com.is_extension_type(pd.DatetimeIndex(['2000'], tz="US/Eastern")) + assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") s = pd.Series([], dtype=dtype) @@ -554,6 +622,7 @@ def test_is_extension_type(check_scipy): if check_scipy: import scipy.sparse + assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) @@ -561,15 +630,14 @@ def test_is_complex_dtype(): assert not com.is_complex_dtype(int) assert not com.is_complex_dtype(str) assert not com.is_complex_dtype(pd.Series([1, 2])) - assert not com.is_complex_dtype(np.array(['a', 'b'])) + assert not com.is_complex_dtype(np.array(["a", "b"])) assert com.is_complex_dtype(np.complex) assert com.is_complex_dtype(np.array([1 + 1j, 5])) def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), - pd.offsets.Nano()])) + assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) assert com.is_offsetlike(pd.offsets.MonthEnd()) assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) @@ -580,95 +648,103 @@ def test_is_offsetlike(): assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) -@pytest.mark.parametrize('input_param,result', [ - (int, np.dtype(int)), - ('int32', np.dtype('int32')), - (float, np.dtype(float)), - ('float64', np.dtype('float64')), - (np.dtype('float64'), np.dtype('float64')), - (str, np.dtype(str)), - (pd.Series([1, 2], dtype=np.dtype('int16')), np.dtype('int16')), - (pd.Series(['a', 'b']), np.dtype(object)), - (pd.Index([1, 2]), np.dtype('int64')), - (pd.Index(['a', 'b']), np.dtype(object)), - ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), - (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), - (CategoricalDtype(), CategoricalDtype()), - (CategoricalDtype(['a', 'b']), CategoricalDtype()), - (pd.DatetimeIndex([1, 2]), np.dtype('=M8[ns]')), - (pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')), - (' df.two.sum() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7acfc5ff7f0c57..6824266c9282ba 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -20,15 +20,37 @@ from pandas.core.dtypes import inference from pandas.core.dtypes.common import ( - ensure_categorical, ensure_int32, is_bool, is_datetime64_any_dtype, - is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_float, is_integer, is_number, is_scalar, is_scipy_sparse, - is_timedelta64_dtype, is_timedelta64_ns_dtype) + ensure_categorical, + ensure_int32, + is_bool, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_number, + is_scalar, + is_scipy_sparse, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, +) import pandas as pd from pandas import ( - Categorical, DataFrame, DateOffset, DatetimeIndex, Index, Interval, Period, - Series, Timedelta, TimedeltaIndex, Timestamp, isna) + Categorical, + DataFrame, + DateOffset, + DatetimeIndex, + Index, + Interval, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) from pandas.util 
import testing as tm @@ -40,46 +62,46 @@ def coerce(request): # collect all objects to be tested for list-like-ness; use tuples of objects, # whether they are list-like or not (special casing for sets), and their ID ll_params = [ - ([1], True, 'list'), # noqa: E241 - ([], True, 'list-empty'), # noqa: E241 - ((1, ), True, 'tuple'), # noqa: E241 - (tuple(), True, 'tuple-empty'), # noqa: E241 - ({'a': 1}, True, 'dict'), # noqa: E241 - (dict(), True, 'dict-empty'), # noqa: E241 - ({'a', 1}, 'set', 'set'), # noqa: E241 - (set(), 'set', 'set-empty'), # noqa: E241 - (frozenset({'a', 1}), 'set', 'frozenset'), # noqa: E241 - (frozenset(), 'set', 'frozenset-empty'), # noqa: E241 - (iter([1, 2]), True, 'iterator'), # noqa: E241 - (iter([]), True, 'iterator-empty'), # noqa: E241 - ((x for x in [1, 2]), True, 'generator'), # noqa: E241 - ((x for x in []), True, 'generator-empty'), # noqa: E241 - (Series([1]), True, 'Series'), # noqa: E241 - (Series([]), True, 'Series-empty'), # noqa: E241 - (Series(['a']).str, True, 'StringMethods'), # noqa: E241 - (Series([], dtype='O').str, True, 'StringMethods-empty'), # noqa: E241 - (Index([1]), True, 'Index'), # noqa: E241 - (Index([]), True, 'Index-empty'), # noqa: E241 - (DataFrame([[1]]), True, 'DataFrame'), # noqa: E241 - (DataFrame(), True, 'DataFrame-empty'), # noqa: E241 - (np.ndarray((2,) * 1), True, 'ndarray-1d'), # noqa: E241 - (np.array([]), True, 'ndarray-1d-empty'), # noqa: E241 - (np.ndarray((2,) * 2), True, 'ndarray-2d'), # noqa: E241 - (np.array([[]]), True, 'ndarray-2d-empty'), # noqa: E241 - (np.ndarray((2,) * 3), True, 'ndarray-3d'), # noqa: E241 - (np.array([[[]]]), True, 'ndarray-3d-empty'), # noqa: E241 - (np.ndarray((2,) * 4), True, 'ndarray-4d'), # noqa: E241 - (np.array([[[[]]]]), True, 'ndarray-4d-empty'), # noqa: E241 - (np.array(2), False, 'ndarray-0d'), # noqa: E241 - (1, False, 'int'), # noqa: E241 - (b'123', False, 'bytes'), # noqa: E241 - (b'', False, 'bytes-empty'), # noqa: E241 - ('123', False, 'string'), # noqa: E241 - ('', False, 'string-empty'), # noqa: E241 - (str, False, 'string-type'), # noqa: E241 - (object(), False, 'object'), # noqa: E241 - (np.nan, False, 'NaN'), # noqa: E241 - (None, False, 'None') # noqa: E241 + ([1], True, "list"), # noqa: E241 + ([], True, "list-empty"), # noqa: E241 + ((1,), True, "tuple"), # noqa: E241 + (tuple(), True, "tuple-empty"), # noqa: E241 + ({"a": 1}, True, "dict"), # noqa: E241 + (dict(), True, "dict-empty"), # noqa: E241 + ({"a", 1}, "set", "set"), # noqa: E241 + (set(), "set", "set-empty"), # noqa: E241 + (frozenset({"a", 1}), "set", "frozenset"), # noqa: E241 + (frozenset(), "set", "frozenset-empty"), # noqa: E241 + (iter([1, 2]), True, "iterator"), # noqa: E241 + (iter([]), True, "iterator-empty"), # noqa: E241 + ((x for x in [1, 2]), True, "generator"), # noqa: E241 + ((x for x in []), True, "generator-empty"), # noqa: E241 + (Series([1]), True, "Series"), # noqa: E241 + (Series([]), True, "Series-empty"), # noqa: E241 + (Series(["a"]).str, True, "StringMethods"), # noqa: E241 + (Series([], dtype="O").str, True, "StringMethods-empty"), # noqa: E241 + (Index([1]), True, "Index"), # noqa: E241 + (Index([]), True, "Index-empty"), # noqa: E241 + (DataFrame([[1]]), True, "DataFrame"), # noqa: E241 + (DataFrame(), True, "DataFrame-empty"), # noqa: E241 + (np.ndarray((2,) * 1), True, "ndarray-1d"), # noqa: E241 + (np.array([]), True, "ndarray-1d-empty"), # noqa: E241 + (np.ndarray((2,) * 2), True, "ndarray-2d"), # noqa: E241 + (np.array([[]]), True, "ndarray-2d-empty"), # noqa: E241 + 
(np.ndarray((2,) * 3), True, "ndarray-3d"), # noqa: E241 + (np.array([[[]]]), True, "ndarray-3d-empty"), # noqa: E241 + (np.ndarray((2,) * 4), True, "ndarray-4d"), # noqa: E241 + (np.array([[[[]]]]), True, "ndarray-4d-empty"), # noqa: E241 + (np.array(2), False, "ndarray-0d"), # noqa: E241 + (1, False, "int"), # noqa: E241 + (b"123", False, "bytes"), # noqa: E241 + (b"", False, "bytes-empty"), # noqa: E241 + ("123", False, "string"), # noqa: E241 + ("", False, "string-empty"), # noqa: E241 + (str, False, "string-type"), # noqa: E241 + (object(), False, "object"), # noqa: E241 + (np.nan, False, "NaN"), # noqa: E241 + (None, False, "None"), # noqa: E241 ] objs, expected, ids = zip(*ll_params) @@ -91,29 +113,28 @@ def maybe_list_like(request): def test_is_list_like(maybe_list_like): obj, expected = maybe_list_like - expected = True if expected == 'set' else expected + expected = True if expected == "set" else expected assert inference.is_list_like(obj) == expected def test_is_list_like_disallow_sets(maybe_list_like): obj, expected = maybe_list_like - expected = False if expected == 'set' else expected + expected = False if expected == "set" else expected assert inference.is_list_like(obj, allow_sets=False) == expected def test_is_sequence(): is_seq = inference.is_sequence - assert (is_seq((1, 2))) - assert (is_seq([1, 2])) - assert (not is_seq("abcd")) - assert (not is_seq(np.int64)) + assert is_seq((1, 2)) + assert is_seq([1, 2]) + assert not is_seq("abcd") + assert not is_seq(np.int64) class A: - def __getitem__(self): return 1 - assert (not is_seq(A())) + assert not is_seq(A()) def test_is_array_like(): @@ -133,37 +154,66 @@ class DtypeList(list): assert not inference.is_array_like(123) -@pytest.mark.parametrize('inner', [ - [], [1], (1, ), (1, 2), {'a': 1}, {1, 'a'}, Series([1]), - Series([]), Series(['a']).str, (x for x in range(5)) -]) -@pytest.mark.parametrize('outer', [ - list, Series, np.array, tuple -]) +@pytest.mark.parametrize( + "inner", + [ + [], + [1], + (1,), + (1, 2), + {"a": 1}, + {1, "a"}, + Series([1]), + Series([]), + Series(["a"]).str, + (x for x in range(5)), + ], +) +@pytest.mark.parametrize("outer", [list, Series, np.array, tuple]) def test_is_nested_list_like_passes(inner, outer): result = outer([inner for _ in range(5)]) assert inference.is_list_like(result) -@pytest.mark.parametrize('obj', [ - 'abc', [], [1], (1,), ['a'], 'a', {'a'}, - [1, 2, 3], Series([1]), DataFrame({"A": [1]}), - ([1, 2] for _ in range(5)), -]) +@pytest.mark.parametrize( + "obj", + [ + "abc", + [], + [1], + (1,), + ["a"], + "a", + {"a"}, + [1, 2, 3], + Series([1]), + DataFrame({"A": [1]}), + ([1, 2] for _ in range(5)), + ], +) def test_is_nested_list_like_fails(obj): assert not inference.is_nested_list_like(obj) -@pytest.mark.parametrize( - "ll", [{}, {'A': 1}, Series([1]), collections.defaultdict()]) +@pytest.mark.parametrize("ll", [{}, {"A": 1}, Series([1]), collections.defaultdict()]) def test_is_dict_like_passes(ll): assert inference.is_dict_like(ll) -@pytest.mark.parametrize("ll", [ - '1', 1, [1, 2], (1, 2), range(2), Index([1]), - dict, collections.defaultdict, Series -]) +@pytest.mark.parametrize( + "ll", + [ + "1", + 1, + [1, 2], + (1, 2), + range(2), + Index([1]), + dict, + collections.defaultdict, + Series, + ], +) def test_is_dict_like_fails(ll): assert not inference.is_dict_like(ll) @@ -177,14 +227,17 @@ def __init__(self, d): self.d = d if has_keys: + def keys(self): return self.d.keys() if has_getitem: + def __getitem__(self, key): return self.d.__getitem__(key) if has_contains: + 
def __contains__(self, key): return self.d.__contains__(key) @@ -235,14 +288,12 @@ class MockFile: assert not is_file(data) -@pytest.mark.parametrize( - "ll", [collections.namedtuple('Test', list('abc'))(1, 2, 3)]) +@pytest.mark.parametrize("ll", [collections.namedtuple("Test", list("abc"))(1, 2, 3)]) def test_is_names_tuple_passes(ll): assert inference.is_named_tuple(ll) -@pytest.mark.parametrize( - "ll", [(1, 2, 3), 'a', Series({'pi': 3.14})]) +@pytest.mark.parametrize("ll", [(1, 2, 3), "a", Series({"pi": 3.14})]) def test_is_names_tuple_fails(ll): assert not inference.is_named_tuple(ll) @@ -257,19 +308,12 @@ class UnhashableClass1: __hash__ = None class UnhashableClass2: - def __hash__(self): raise TypeError("Not hashable") - hashable = (1, - 3.14, - np.float64(3.14), - 'a', - tuple(), - (1, ), - HashableClass(), ) - not_hashable = ([], UnhashableClass1(), ) - abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) + hashable = (1, 3.14, np.float64(3.14), "a", tuple(), (1,), HashableClass()) + not_hashable = ([], UnhashableClass1()) + abc_hashable_not_really_hashable = (([],), UnhashableClass2()) for i in hashable: assert inference.is_hashable(i) @@ -284,41 +328,34 @@ def __hash__(self): assert not inference.is_hashable(np.array([])) -@pytest.mark.parametrize( - "ll", [re.compile('ad')]) +@pytest.mark.parametrize("ll", [re.compile("ad")]) def test_is_re_passes(ll): assert inference.is_re(ll) -@pytest.mark.parametrize( - "ll", ['x', 2, 3, object()]) +@pytest.mark.parametrize("ll", ["x", 2, 3, object()]) def test_is_re_fails(ll): assert not inference.is_re(ll) @pytest.mark.parametrize( - "ll", [r'a', 'x', - r'asdf', - re.compile('adsf'), - r'\u2233\s*', - re.compile(r'')]) + "ll", [r"a", "x", r"asdf", re.compile("adsf"), r"\u2233\s*", re.compile(r"")] +) def test_is_recompilable_passes(ll): assert inference.is_re_compilable(ll) -@pytest.mark.parametrize( - "ll", [1, [], object()]) +@pytest.mark.parametrize("ll", [1, [], object()]) def test_is_recompilable_fails(ll): assert not inference.is_re_compilable(ll) class TestInference: - def test_infer_dtype_bytes(self): - compare = 'bytes' + compare = "bytes" # string array of bytes - arr = np.array(list('abc'), dtype='S1') + arr = np.array(list("abc"), dtype="S1") assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes @@ -326,64 +363,65 @@ def test_infer_dtype_bytes(self): assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes with missing values - assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + assert lib.infer_dtype([b"a", np.nan, b"c"], skipna=True) == compare def test_isinf_scalar(self): # GH 11352 - assert libmissing.isposinf_scalar(float('inf')) + assert libmissing.isposinf_scalar(float("inf")) assert libmissing.isposinf_scalar(np.inf) assert not libmissing.isposinf_scalar(-np.inf) assert not libmissing.isposinf_scalar(1) - assert not libmissing.isposinf_scalar('a') + assert not libmissing.isposinf_scalar("a") - assert libmissing.isneginf_scalar(float('-inf')) + assert libmissing.isneginf_scalar(float("-inf")) assert libmissing.isneginf_scalar(-np.inf) assert not libmissing.isneginf_scalar(np.inf) assert not libmissing.isneginf_scalar(1) - assert not libmissing.isneginf_scalar('a') + assert not libmissing.isneginf_scalar("a") def test_maybe_convert_numeric_infinities(self): # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = {'', 'NULL', 'nan'} + infinities = ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] 
+ na_values = {"", "NULL", "nan"} - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) + pos = np.array(["inf"], dtype=np.float64) + neg = np.array(["-inf"], dtype=np.float64) msg = "Unable to parse string" for infinity in infinities: for maybe_int in (True, False): out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) + np.array([infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) + np.array(["-" + infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, neg) out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) + np.array([infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) + np.array(["+" + infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) # too many characters with pytest.raises(ValueError, match=msg): lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) + np.array(["foo_" + infinity], dtype=object), + na_values, + maybe_int, + ) def test_maybe_convert_numeric_post_floatify_nan(self, coerce): # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + data = np.array(["1.200", "-999.000", "4.500"], dtype=object) expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) nan_values = {-999, -999.0} @@ -391,74 +429,81 @@ def test_maybe_convert_numeric_post_floatify_nan(self, coerce): tm.assert_numpy_array_equal(out, expected) def test_convert_infs(self): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') + arr = np.array(["inf", "inf", "inf"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + arr = np.array(["-inf", "-inf", "-inf"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 def test_scientific_no_exponent(self): # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + arr = np.array(["42E", "2E", "99e", "6e"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False, True) assert np.all(np.isnan(result)) def test_convert_non_hashable(self): # GH13324 # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, 'apple']) + arr = np.array([[10.0, 2], 1.0, "apple"]) result = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) def test_convert_numeric_uint64(self): - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - arr = np.array([str(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([str(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - 
@pytest.mark.parametrize("arr", [ - np.array([2**63, np.nan], dtype=object), - np.array([str(2**63), np.nan], dtype=object), - np.array([np.nan, 2**63], dtype=object), - np.array([np.nan, str(2**63)], dtype=object)]) + @pytest.mark.parametrize( + "arr", + [ + np.array([2 ** 63, np.nan], dtype=object), + np.array([str(2 ** 63), np.nan], dtype=object), + np.array([np.nan, 2 ** 63], dtype=object), + np.array([np.nan, str(2 ** 63)], dtype=object), + ], + ) def test_convert_numeric_uint64_nan(self, coerce, arr): expected = arr.astype(float) if coerce else arr.copy() - result = lib.maybe_convert_numeric(arr, set(), - coerce_numeric=coerce) + result = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) def test_convert_numeric_uint64_nan_values(self, coerce): - arr = np.array([2**63, 2**63 + 1], dtype=object) - na_values = {2**63} + arr = np.array([2 ** 63, 2 ** 63 + 1], dtype=object) + na_values = {2 ** 63} - expected = (np.array([np.nan, 2**63 + 1], dtype=float) - if coerce else arr.copy()) - result = lib.maybe_convert_numeric(arr, na_values, - coerce_numeric=coerce) + expected = ( + np.array([np.nan, 2 ** 63 + 1], dtype=float) if coerce else arr.copy() + ) + result = lib.maybe_convert_numeric(arr, na_values, coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("case", [ - np.array([2**63, -1], dtype=object), - np.array([str(2**63), -1], dtype=object), - np.array([str(2**63), str(-1)], dtype=object), - np.array([-1, 2**63], dtype=object), - np.array([-1, str(2**63)], dtype=object), - np.array([str(-1), str(2**63)], dtype=object)]) + @pytest.mark.parametrize( + "case", + [ + np.array([2 ** 63, -1], dtype=object), + np.array([str(2 ** 63), -1], dtype=object), + np.array([str(2 ** 63), str(-1)], dtype=object), + np.array([-1, 2 ** 63], dtype=object), + np.array([-1, str(2 ** 63)], dtype=object), + np.array([str(-1), str(2 ** 63)], dtype=object), + ], + ) def test_convert_numeric_int64_uint64(self, case, coerce): expected = case.astype(float) if coerce else case.copy() result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("value", [-2**63 - 1, 2**64]) + @pytest.mark.parametrize("value", [-2 ** 63 - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 arr = np.array([value], dtype=object) @@ -467,29 +512,28 @@ def test_convert_int_overflow(self, value): def test_maybe_convert_objects_uint64(self): # see gh-4471 - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) # NumPy bug: can't compare uint64 to int64, as that # results in both casting to float64, so we should # make sure that this function is robust against it - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) arr = np.array([2, -1], dtype=object) exp = np.array([2, -1], dtype=np.int64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - arr = np.array([2**63, -1], dtype=object) - exp = np.array([2**63, -1], dtype=object) + arr = np.array([2 ** 63, -1], dtype=object) + exp = np.array([2 ** 63, -1], dtype=object) 
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) def test_mixed_dtypes_remain_object_array(self): # GH14956 - array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], - dtype=object) + array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) result = lib.maybe_convert_objects(array, convert_datetime=1) tm.assert_numpy_array_equal(result, array) @@ -507,32 +551,31 @@ def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): # make sure the inferred dtype of the fixture is as requested assert inferred_dtype == lib.infer_dtype(values, skipna=True) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_length_zero(self, skipna): - result = lib.infer_dtype(np.array([], dtype='i4'), skipna=skipna) - assert result == 'integer' + result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna) + assert result == "integer" result = lib.infer_dtype([], skipna=skipna) - assert result == 'empty' + assert result == "empty" # GH 18004 - arr = np.array([np.array([], dtype=object), - np.array([], dtype=object)]) + arr = np.array([np.array([], dtype=object), np.array([], dtype=object)]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'empty' + assert result == "empty" def test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'integer' + assert result == "integer" - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed-integer' + assert result == "mixed-integer" - arr = np.array([1, 2, 3, 4, 5], dtype='i4') + arr = np.array([1, 2, 3, 4, 5], dtype="i4") result = lib.infer_dtype(arr, skipna=True) - assert result == 'integer' + assert result == "integer" def test_deprecation(self): # GH 24050 @@ -540,121 +583,123 @@ def test_deprecation(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = lib.infer_dtype(arr) # default: skipna=None -> warn - assert result == 'integer' + assert result == "integer" def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') + arr = np.array([True, False, True, True, True], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + arr = np.array([np.bool_(True), np.bool_(False)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([True, False, True, 'foo'], dtype='O') + arr = np.array([True, False, True, "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed' + assert result == "mixed" arr = np.array([True, False, True], dtype=bool) result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([True, np.nan, False], dtype='O') + arr = np.array([True, np.nan, False], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + arr = np.array([1.0, 2.0, 3.0, np.float64(4), 
np.float32(5)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed-integer' + assert result == "mixed-integer" - arr = np.array([1, 2, 3, 4, 5], dtype='f4') + arr = np.array([1, 2, 3, 4, 5], dtype="f4") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" - arr = np.array([1, 2, 3, 4, 5], dtype='f8') + arr = np.array([1, 2, 3, 4, 5], dtype="f8") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" def test_decimals(self): # GH15690 arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" arr = np.array([1.0, 2.0, Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed' + assert result == "mixed" - arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" - arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" # complex is compatible with nan, so skipna has no effect - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_complex(self, skipna): # gets cast to complex on array construction arr = np.array([1.0, 2.0, 1 + 1j]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" - arr = np.array([1.0, 2.0, 1 + 1j], dtype='O') + arr = np.array([1.0, 2.0, 1 + 1j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'mixed' + assert result == "mixed" # gets cast to complex on array construction arr = np.array([1, np.nan, 1 + 1j]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" - arr = np.array([1.0, np.nan, 1 + 1j], dtype='O') + arr = np.array([1.0, np.nan, 1 + 1j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'mixed' + assert result == "mixed" # complex with nans stays complex - arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype='O') + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" # test smaller complex dtype; will pass through _try_infer_map fastpath arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" def test_string(self): pass def test_unicode(self): - arr = ['a', np.nan, 'c'] + arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" - arr = ['a', np.nan, 'c'] + arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = 'string' + expected = "string" assert result == expected - @pytest.mark.parametrize('dtype, missing, skipna, expected', [ - (float, np.nan, False, 'floating'), - (float, np.nan, True, 'floating'), - (object, np.nan, False, 'floating'), - (object, 
np.nan, True, 'empty'), - (object, None, False, 'mixed'), - (object, None, True, 'empty') - ]) - @pytest.mark.parametrize('box', [pd.Series, np.array]) + @pytest.mark.parametrize( + "dtype, missing, skipna, expected", + [ + (float, np.nan, False, "floating"), + (float, np.nan, True, "floating"), + (object, np.nan, False, "floating"), + (object, np.nan, True, "empty"), + (object, None, False, "mixed"), + (object, None, True, "empty"), + ], + ) + @pytest.mark.parametrize("box", [pd.Series, np.array]) def test_object_empty(self, box, missing, dtype, skipna, expected): # GH 23421 arr = box([missing, missing], dtype=dtype) @@ -666,154 +711,150 @@ def test_datetime(self): dates = [datetime(2012, 1, x) for x in range(1, 20)] index = Index(dates) - assert index.inferred_type == 'datetime64' + assert index.inferred_type == "datetime64" def test_infer_dtype_datetime(self): - arr = np.array([Timestamp('2011-01-01'), - Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([np.datetime64('2011-01-01'), - np.datetime64('2011-01-01')], dtype=object) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array( + [np.datetime64("2011-01-01"), np.datetime64("2011-01-01")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([n, pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, np.datetime64('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array([n, np.datetime64("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([n, datetime(2011, 1, 1)]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([n, pd.Timestamp("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, np.datetime64('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array([n, np.datetime64("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([n, datetime(2011, 1, 1), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" # different type of nat - arr = np.array([np.timedelta64('nat'), - np.datetime64('2011-01-02')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.timedelta64("nat"), np.datetime64("2011-01-02")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.datetime64('2011-01-02'), - np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.datetime64("2011-01-02"), np.timedelta64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" # mixed datetime - arr = np.array([datetime(2011, 1, 1), - pd.Timestamp('2011-01-02')]) - assert 
lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([datetime(2011, 1, 1), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" # should be datetime? - arr = np.array([np.datetime64('2011-01-01'), - pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.datetime64("2011-01-01"), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([pd.Timestamp('2011-01-02'), - np.datetime64('2011-01-01')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([pd.Timestamp("2011-01-02"), np.datetime64("2011-01-01")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer' + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed-integer" - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1.1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.nan, "2011-01-01", pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" def test_infer_dtype_timedelta(self): - arr = np.array([pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([pd.Timedelta("1 days"), pd.Timedelta("2 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([np.timedelta64(1, 'D'), - np.timedelta64(2, 'D')], dtype=object) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([np.timedelta64(1, "D"), np.timedelta64(2, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([timedelta(1), timedelta(2)]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, Timedelta('1 days')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, Timedelta("1 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, np.timedelta64(1, 'D')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, np.timedelta64(1, "D")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([n, timedelta(1)]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, pd.Timedelta('1 days'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, pd.Timedelta("1 days"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, np.timedelta64(1, 'D'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, np.timedelta64(1, "D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([n, timedelta(1), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" # different type of nat - arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.datetime64("nat"), 
np.timedelta64(1, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.timedelta64(1, "D"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" def test_infer_dtype_period(self): # GH 13664 - arr = np.array([pd.Period('2011-01', freq='D'), - pd.Period('2011-02', freq='D')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" - arr = np.array([pd.Period('2011-01', freq='D'), - pd.Period('2011-02', freq='M')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="M")]) + assert lib.infer_dtype(arr, skipna=True) == "period" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Period('2011-01', freq='D')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([n, pd.Period("2011-01", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" - arr = np.array([n, pd.Period('2011-01', freq='D'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([n, pd.Period("2011-01", freq="D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "period" # different type of nat - arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.datetime64("nat"), pd.Period("2011-01", freq="M")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [pd.Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" @pytest.mark.parametrize( "data", [ [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)], [Timestamp("20170612"), Timestamp("20170311")], - [Timestamp("20170612", tz='US/Eastern'), - Timestamp("20170311", tz='US/Eastern')], - [date(2017, 6, 12), - Timestamp("20170311", tz='US/Eastern')], + [ + Timestamp("20170612", tz="US/Eastern"), + Timestamp("20170311", tz="US/Eastern"), + ], + [date(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], - [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)] - ] + [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)], + ], ) def test_infer_datetimelike_array_datetime(self, data): assert lib.infer_datetimelike_array(data) == "datetime" @@ -824,8 +865,8 @@ def test_infer_datetimelike_array_datetime(self, data): [timedelta(2017, 6, 12), timedelta(2017, 3, 11)], [timedelta(2017, 6, 12), date(2017, 3, 11)], [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], - [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)] - ] + [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)], + ], ) def test_infer_datetimelike_array_timedelta(self, data): assert lib.infer_datetimelike_array(data) == "timedelta" @@ -841,10 +882,10 @@ def test_infer_datetimelike_array_date(self): [20170612, 20170311], [20170612.5, 20170311.8], [Dummy(), Dummy()], - [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')], + [Timestamp("20170612"), Timestamp("20170311", tz="US/Eastern")], 
[Timestamp("20170612"), 20170311], - [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')] - ] + [timedelta(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], + ], ) def test_infer_datetimelike_array_mixed(self, data): assert lib.infer_datetimelike_array(data) == "mixed" @@ -859,87 +900,83 @@ def test_infer_datetimelike_array_mixed(self, data): [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], [[date(2017, 6, 12), pd.NaT], "date"], [[timedelta(2017, 6, 12), pd.NaT], "timedelta"], - [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"] - ] + [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"], + ], ) @pytest.mark.parametrize("second", [None, np.nan]) - def test_infer_datetimelike_array_nan_nat_like(self, first, second, - expected): + def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): first.append(second) assert lib.infer_datetimelike_array(first) == expected def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) - assert lib.infer_dtype(arr, skipna=True) == 'floating' + assert lib.infer_dtype(arr, skipna=True) == "floating" # nan and None mix are result in mixed arr = np.array([np.nan, np.nan, None]) - assert lib.infer_dtype(arr, skipna=True) == 'empty' - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" arr = np.array([None, np.nan, np.nan]) - assert lib.infer_dtype(arr, skipna=True) == 'empty' - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" # pd.NaT arr = np.array([pd.NaT]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([pd.NaT, np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([np.nan, pd.NaT]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([np.nan, pd.NaT, np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([None, pd.NaT, None]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" # np.datetime64(nat) - arr = np.array([np.datetime64('nat')]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([np.datetime64("nat")]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" - arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([pd.NaT, n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" - arr = np.array([np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([np.timedelta64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([n, np.timedelta64("nat"), n]) + assert 
lib.infer_dtype(arr, skipna=False) == "timedelta" - arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([pd.NaT, n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" # datetime / timedelta mixed - arr = np.array([pd.NaT, np.datetime64('nat'), - np.timedelta64('nat'), np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([pd.NaT, np.datetime64("nat"), np.timedelta64("nat"), np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.timedelta64("nat"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" def test_is_datetimelike_array_all_nan_nat_like(self): - arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + arr = np.array([np.nan, pd.NaT, np.datetime64("nat")]) assert lib.is_datetime_array(arr) assert lib.is_datetime64_array(arr) assert not lib.is_timedelta_or_timedelta64_array(arr) - arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + arr = np.array([np.nan, pd.NaT, np.timedelta64("nat")]) assert not lib.is_datetime_array(arr) assert not lib.is_datetime64_array(arr) assert lib.is_timedelta_or_timedelta64_array(arr) - arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), - np.timedelta64('nat')]) + arr = np.array([np.nan, pd.NaT, np.datetime64("nat"), np.timedelta64("nat")]) assert not lib.is_datetime_array(arr) assert not lib.is_datetime64_array(arr) assert not lib.is_timedelta_or_timedelta64_array(arr) @@ -955,28 +992,40 @@ def test_is_datetimelike_array_all_nan_nat_like(self): assert not lib.is_timedelta_or_timedelta64_array(arr) assert lib.is_datetime_with_singletz_array( - np.array([pd.Timestamp('20130101', tz='US/Eastern'), - pd.Timestamp('20130102', tz='US/Eastern')], - dtype=object)) + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="US/Eastern"), + ], + dtype=object, + ) + ) assert not lib.is_datetime_with_singletz_array( - np.array([pd.Timestamp('20130101', tz='US/Eastern'), - pd.Timestamp('20130102', tz='CET')], - dtype=object)) + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="CET"), + ], + dtype=object, + ) + ) @pytest.mark.parametrize( "func", [ - 'is_datetime_array', - 'is_datetime64_array', - 'is_bool_array', - 'is_timedelta_or_timedelta64_array', - 'is_date_array', - 'is_time_array', - 'is_interval_array', - 'is_period_array']) + "is_datetime_array", + "is_datetime64_array", + "is_bool_array", + "is_timedelta_or_timedelta64_array", + "is_date_array", + "is_time_array", + "is_interval_array", + "is_period_array", + ], + ) def test_other_dtypes_for_array(self, func): func = getattr(lib, func) - arr = np.array(['foo', 'bar']) + arr = np.array(["foo", "bar"]) assert not func(arr) arr = np.array([1, 2]) @@ -986,14 +1035,14 @@ def test_date(self): dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) - assert index.inferred_type == 'date' + assert index.inferred_type == "date" dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] result = lib.infer_dtype(dates, skipna=False) - assert result == 'mixed' + assert result == "mixed" result = lib.infer_dtype(dates, skipna=True) - assert result == 'date' + assert result == "date" def test_is_numeric_array(self): @@ -1006,11 +1055,13 @@ def test_is_numeric_array(self): def 
test_is_string_array(self): - assert lib.is_string_array(np.array(['foo', 'bar'])) + assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(['foo', 'bar', np.nan], dtype=object), skipna=False) + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert lib.is_string_array( - np.array(['foo', 'bar', np.nan], dtype=object), skipna=True) + np.array(["foo", "bar", np.nan], dtype=object), skipna=True + ) assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): @@ -1021,7 +1072,8 @@ def test_to_object_array_tuples(self): try: # make sure record array works from collections import namedtuple - record = namedtuple('record', 'x y') + + record = namedtuple("record", "x y") r = record(5, 6) values = [r] result = lib.to_object_array_tuples(values) # noqa @@ -1032,11 +1084,11 @@ def test_object(self): # GH 7431 # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') + arr = np.array([None], dtype="O") result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" result = lib.infer_dtype(arr, skipna=True) - assert result == 'empty' + assert result == "empty" def test_to_object_array_width(self): # see gh-13320 @@ -1050,15 +1102,16 @@ def test_to_object_array_width(self): out = lib.to_object_array(rows, min_width=1) tm.assert_numpy_array_equal(out, expected) - expected = np.array([[1, 2, 3, None, None], - [4, 5, 6, None, None]], dtype=object) + expected = np.array( + [[1, 2, 3, None, None], [4, 5, 6, None, None]], dtype=object + ) out = lib.to_object_array(rows, min_width=5) tm.assert_numpy_array_equal(out, expected) def test_is_period(self): - assert lib.is_period(pd.Period('2011-01', freq='M')) - assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M')) - assert not lib.is_period(pd.Timestamp('2011-01')) + assert lib.is_period(pd.Period("2011-01", freq="M")) + assert not lib.is_period(pd.PeriodIndex(["2011-01"], freq="M")) + assert not lib.is_period(pd.Timestamp("2011-01")) assert not lib.is_period(1) assert not lib.is_period(np.nan) @@ -1066,23 +1119,23 @@ def test_categorical(self): # GH 8974 from pandas import Categorical, Series - arr = Categorical(list('abc')) + + arr = Categorical(list("abc")) result = lib.infer_dtype(arr, skipna=True) - assert result == 'categorical' + assert result == "categorical" result = lib.infer_dtype(Series(arr), skipna=True) - assert result == 'categorical' + assert result == "categorical" - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + arr = Categorical(list("abc"), categories=["cegfab"], ordered=True) result = lib.infer_dtype(arr, skipna=True) - assert result == 'categorical' + assert result == "categorical" result = lib.infer_dtype(Series(arr), skipna=True) - assert result == 'categorical' + assert result == "categorical" class TestNumberScalar: - def test_is_number(self): assert is_number(True) @@ -1096,17 +1149,17 @@ def test_is_number(self): assert is_number(np.nan) assert not is_number(None) - assert not is_number('x') + assert not is_number("x") assert not is_number(datetime(2011, 1, 1)) - assert not is_number(np.datetime64('2011-01-01')) - assert not is_number(Timestamp('2011-01-01')) - assert not is_number(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_number(np.datetime64("2011-01-01")) + assert not is_number(Timestamp("2011-01-01")) + assert not is_number(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_number(timedelta(1000)) - assert not 
is_number(Timedelta('1 days')) + assert not is_number(Timedelta("1 days")) # questionable assert not is_number(np.bool_(False)) - assert is_number(np.timedelta64(1, 'D')) + assert is_number(np.timedelta64(1, "D")) def test_is_bool(self): assert is_bool(True) @@ -1121,14 +1174,14 @@ def test_is_bool(self): assert not is_bool(np.complex128(1 + 3j)) assert not is_bool(np.nan) assert not is_bool(None) - assert not is_bool('x') + assert not is_bool("x") assert not is_bool(datetime(2011, 1, 1)) - assert not is_bool(np.datetime64('2011-01-01')) - assert not is_bool(Timestamp('2011-01-01')) - assert not is_bool(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_bool(np.datetime64("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_bool(timedelta(1000)) - assert not is_bool(np.timedelta64(1, 'D')) - assert not is_bool(Timedelta('1 days')) + assert not is_bool(np.timedelta64(1, "D")) + assert not is_bool(Timedelta("1 days")) def test_is_integer(self): assert is_integer(1) @@ -1143,16 +1196,16 @@ def test_is_integer(self): assert not is_integer(np.complex128(1 + 3j)) assert not is_integer(np.nan) assert not is_integer(None) - assert not is_integer('x') + assert not is_integer("x") assert not is_integer(datetime(2011, 1, 1)) - assert not is_integer(np.datetime64('2011-01-01')) - assert not is_integer(Timestamp('2011-01-01')) - assert not is_integer(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_integer(np.datetime64("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_integer(timedelta(1000)) - assert not is_integer(Timedelta('1 days')) + assert not is_integer(Timedelta("1 days")) # questionable - assert is_integer(np.timedelta64(1, 'D')) + assert is_integer(np.timedelta64(1, "D")) def test_is_float(self): assert is_float(1.1) @@ -1167,75 +1220,74 @@ def test_is_float(self): assert not is_float(np.int64(1)) assert not is_float(np.complex128(1 + 3j)) assert not is_float(None) - assert not is_float('x') + assert not is_float("x") assert not is_float(datetime(2011, 1, 1)) - assert not is_float(np.datetime64('2011-01-01')) - assert not is_float(Timestamp('2011-01-01')) - assert not is_float(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_float(np.datetime64("2011-01-01")) + assert not is_float(Timestamp("2011-01-01")) + assert not is_float(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_float(timedelta(1000)) - assert not is_float(np.timedelta64(1, 'D')) - assert not is_float(Timedelta('1 days')) + assert not is_float(np.timedelta64(1, "D")) + assert not is_float(Timedelta("1 days")) def test_is_datetime_dtypes(self): - ts = pd.date_range('20130101', periods=3) - tsa = pd.date_range('20130101', periods=3, tz='US/Eastern') + ts = pd.date_range("20130101", periods=3) + tsa = pd.date_range("20130101", periods=3, tz="US/Eastern") - assert is_datetime64_dtype('datetime64') - assert is_datetime64_dtype('datetime64[ns]') + assert is_datetime64_dtype("datetime64") + assert is_datetime64_dtype("datetime64[ns]") assert is_datetime64_dtype(ts) assert not is_datetime64_dtype(tsa) - assert not is_datetime64_ns_dtype('datetime64') - assert is_datetime64_ns_dtype('datetime64[ns]') + assert not is_datetime64_ns_dtype("datetime64") + assert is_datetime64_ns_dtype("datetime64[ns]") assert is_datetime64_ns_dtype(ts) assert is_datetime64_ns_dtype(tsa) - assert is_datetime64_any_dtype('datetime64') - assert 
is_datetime64_any_dtype('datetime64[ns]') + assert is_datetime64_any_dtype("datetime64") + assert is_datetime64_any_dtype("datetime64[ns]") assert is_datetime64_any_dtype(ts) assert is_datetime64_any_dtype(tsa) - assert not is_datetime64tz_dtype('datetime64') - assert not is_datetime64tz_dtype('datetime64[ns]') + assert not is_datetime64tz_dtype("datetime64") + assert not is_datetime64tz_dtype("datetime64[ns]") assert not is_datetime64tz_dtype(ts) assert is_datetime64tz_dtype(tsa) - for tz in ['US/Eastern', 'UTC']: - dtype = 'datetime64[ns, {}]'.format(tz) + for tz in ["US/Eastern", "UTC"]: + dtype = "datetime64[ns, {}]".format(tz) assert not is_datetime64_dtype(dtype) assert is_datetime64tz_dtype(dtype) assert is_datetime64_ns_dtype(dtype) assert is_datetime64_any_dtype(dtype) def test_is_timedelta(self): - assert is_timedelta64_dtype('timedelta64') - assert is_timedelta64_dtype('timedelta64[ns]') - assert not is_timedelta64_ns_dtype('timedelta64') - assert is_timedelta64_ns_dtype('timedelta64[ns]') + assert is_timedelta64_dtype("timedelta64") + assert is_timedelta64_dtype("timedelta64[ns]") + assert not is_timedelta64_ns_dtype("timedelta64") + assert is_timedelta64_ns_dtype("timedelta64[ns]") - tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64[ns]') + tdi = TimedeltaIndex([1e14, 2e14], dtype="timedelta64[ns]") assert is_timedelta64_dtype(tdi) assert is_timedelta64_ns_dtype(tdi) - assert is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]')) + assert is_timedelta64_ns_dtype(tdi.astype("timedelta64[ns]")) # Conversion to Int64Index: - assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64')) - assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')) + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64")) + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64[h]")) class TestIsScalar: - def test_is_scalar_builtin_scalars(self): assert is_scalar(None) assert is_scalar(True) assert is_scalar(False) assert is_scalar(Number()) assert is_scalar(Fraction()) - assert is_scalar(0.) 
+ assert is_scalar(0.0) assert is_scalar(np.nan) - assert is_scalar('foobar') - assert is_scalar(b'foobar') + assert is_scalar("foobar") + assert is_scalar(b"foobar") assert is_scalar(datetime(2014, 1, 1)) assert is_scalar(date(2014, 1, 1)) assert is_scalar(time(12, 0)) @@ -1247,26 +1299,29 @@ def test_is_scalar_builtin_nonscalars(self): assert not is_scalar([]) assert not is_scalar([1]) assert not is_scalar(()) - assert not is_scalar((1, )) + assert not is_scalar((1,)) assert not is_scalar(slice(None)) assert not is_scalar(Ellipsis) def test_is_scalar_numpy_array_scalars(self): assert is_scalar(np.int64(1)) - assert is_scalar(np.float64(1.)) + assert is_scalar(np.float64(1.0)) assert is_scalar(np.int32(1)) - assert is_scalar(np.object_('foobar')) - assert is_scalar(np.str_('foobar')) - assert is_scalar(np.unicode_('foobar')) - assert is_scalar(np.bytes_(b'foobar')) - assert is_scalar(np.datetime64('2014-01-01')) - assert is_scalar(np.timedelta64(1, 'h')) + assert is_scalar(np.object_("foobar")) + assert is_scalar(np.str_("foobar")) + assert is_scalar(np.unicode_("foobar")) + assert is_scalar(np.bytes_(b"foobar")) + assert is_scalar(np.datetime64("2014-01-01")) + assert is_scalar(np.timedelta64(1, "h")) def test_is_scalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: + for zerodim in [ + np.array(1), + np.array("foobar"), + np.array(np.datetime64("2014-01-01")), + np.array(np.timedelta64(1, "h")), + np.array(np.datetime64("NaT")), + ]: assert not is_scalar(zerodim) assert is_scalar(lib.item_from_zerodim(zerodim)) @@ -1274,12 +1329,12 @@ def test_is_scalar_numpy_zerodim_arrays(self): def test_is_scalar_numpy_arrays(self): assert not is_scalar(np.array([])) assert not is_scalar(np.array([[]])) - assert not is_scalar(np.matrix('1; 2')) + assert not is_scalar(np.matrix("1; 2")) def test_is_scalar_pandas_scalars(self): - assert is_scalar(Timestamp('2014-01-01')) + assert is_scalar(Timestamp("2014-01-01")) assert is_scalar(Timedelta(hours=1)) - assert is_scalar(Period('2014-01-01')) + assert is_scalar(Period("2014-01-01")) assert is_scalar(Interval(left=0, right=1)) assert is_scalar(DateOffset(days=1)) @@ -1293,26 +1348,25 @@ def test_is_scalar_pandas_containers(self): def test_datetimeindex_from_empty_datetime64_array(): - for unit in ['ms', 'us', 'ns']: - idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) - assert (len(idx) == 0) + for unit in ["ms", "us", "ns"]: + idx = DatetimeIndex(np.array([], dtype="datetime64[%s]" % unit)) + assert len(idx) == 0 def test_nan_to_nat_conversions(): - df = DataFrame(dict({ - 'A': np.asarray(range(10), dtype='float64'), - 'B': Timestamp('20010101') - })) + df = DataFrame( + dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) + ) df.iloc[3:6, :] = np.nan - result = df.loc[4, 'B'].value - assert (result == iNaT) + result = df.loc[4, "B"].value + assert result == iNaT - s = df['B'].copy() + s = df["B"].copy() s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert (isna(s[8])) + assert isna(s[8]) - assert (s[8].value == np.datetime64('NaT').astype(np.int64)) + assert s[8].value == np.datetime64("NaT").astype(np.int64) @td.skip_if_no_scipy @@ -1325,17 +1379,17 @@ def test_is_scipy_sparse(spmatrix): # noqa: F811 def test_ensure_int32(): values = np.arange(10, dtype=np.int32) result = ensure_int32(values) - assert (result.dtype == np.int32) + assert 
result.dtype == np.int32 values = np.arange(10, dtype=np.int64) result = ensure_int32(values) - assert (result.dtype == np.int32) + assert result.dtype == np.int32 def test_ensure_categorical(): values = np.arange(10, dtype=np.int32) result = ensure_categorical(values) - assert (result.dtype == 'category') + assert result.dtype == "category" values = Categorical(values) result = ensure_categorical(values) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 64bfc050da02f5..a688dec50bc953 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -11,20 +11,24 @@ from pandas._libs.tslibs import iNaT, is_null_datetimelike from pandas.core.dtypes.common import is_scalar -from pandas.core.dtypes.dtypes import ( - DatetimeTZDtype, IntervalDtype, PeriodDtype) +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype from pandas.core.dtypes.missing import ( - array_equivalent, isna, isnull, na_value_for_dtype, notna, notnull) + array_equivalent, + isna, + isnull, + na_value_for_dtype, + notna, + notnull, +) import pandas as pd -from pandas import ( - DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range) +from pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range from pandas.util import testing as tm -@pytest.mark.parametrize('notna_f', [notna, notnull]) +@pytest.mark.parametrize("notna_f", [notna, notnull]) def test_notna_notnull(notna_f): - assert notna_f(1.) + assert notna_f(1.0) assert not notna_f(None) assert not notna_f(np.NaN) @@ -45,14 +49,17 @@ def test_notna_notnull(notna_f): assert result.sum() == 2 with cf.option_context("mode.use_inf_as_na", False): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(notna_f(s), Series)) + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: + assert isinstance(notna_f(s), Series) class TestIsNA: - def test_0d_array(self): assert isna(np.array(np.nan)) assert not isna(np.array(0.0)) @@ -70,24 +77,31 @@ def test_empty_object(self): expected = np.ones(shape=shape, dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('isna_f', [isna, isnull]) + @pytest.mark.parametrize("isna_f", [isna, isnull]) def test_isna_isnull(self, isna_f): - assert not isna_f(1.) 
+ assert not isna_f(1.0) assert isna_f(None) assert isna_f(np.NaN) - assert float('nan') + assert float("nan") assert not isna_f(np.inf) assert not isna_f(-np.inf) # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: assert isinstance(isna_f(s), Series) # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: + for df in [ + tm.makeTimeDataFrame(), + tm.makePeriodFrame(), + tm.makeMixedDataFrame(), + ]: result = isna_f(df) expected = df.apply(isna_f) tm.assert_frame_equal(result, expected) @@ -102,16 +116,16 @@ def test_isna_lists(self): tm.assert_numpy_array_equal(result, exp) # list of strings / unicode - result = isna(['foo', 'bar']) + result = isna(["foo", "bar"]) exp = np.array([False, False]) tm.assert_numpy_array_equal(result, exp) - result = isna(['foo', 'bar']) + result = isna(["foo", "bar"]) exp = np.array([False, False]) tm.assert_numpy_array_equal(result, exp) # GH20675 - result = isna([np.NaN, 'world']) + result = isna([np.NaN, "world"]) exp = np.array([True, False]) tm.assert_numpy_array_equal(result, exp) @@ -125,8 +139,14 @@ def test_isna_nat(self): tm.assert_numpy_array_equal(result, exp) def test_isna_numpy_nat(self): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) + arr = np.array( + [ + NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + np.datetime64("NaT", "s"), + ] + ) result = isna(arr) expected = np.array([True] * 4) tm.assert_numpy_array_equal(result, expected) @@ -135,7 +155,7 @@ def test_isna_datetime(self): assert not isna(datetime.now()) assert notna(datetime.now()) - idx = date_range('1/1/1990', periods=20) + idx = date_range("1/1/1990", periods=20) exp = np.ones(len(idx), dtype=bool) tm.assert_numpy_array_equal(notna(idx), exp) @@ -148,7 +168,7 @@ def test_isna_datetime(self): tm.assert_numpy_array_equal(mask, exp) # GH 9129 - pidx = idx.to_period(freq='M') + pidx = idx.to_period(freq="M") mask = isna(pidx) assert mask[0] exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) @@ -160,14 +180,20 @@ def test_isna_datetime(self): @pytest.mark.parametrize( "value, expected", - [(np.complex128(np.nan), True), - (np.float64(1), False), - (np.array([1, 1 + 0j, np.nan, 3]), - np.array([False, False, True, False])), - (np.array([1, 1 + 0j, np.nan, 3], dtype=object), - np.array([False, False, True, False])), - (np.array([1, 1 + 0j, np.nan, 3]).astype(object), - np.array([False, False, True, False]))]) + [ + (np.complex128(np.nan), True), + (np.float64(1), False), + (np.array([1, 1 + 0j, np.nan, 3]), np.array([False, False, True, False])), + ( + np.array([1, 1 + 0j, np.nan, 3], dtype=object), + np.array([False, False, True, False]), + ), + ( + np.array([1, 1 + 0j, np.nan, 3]).astype(object), + np.array([False, False, True, False]), + ), + ], + ) def test_complex(self, value, expected): result = isna(value) if is_scalar(result): @@ -176,16 +202,22 @@ def test_complex(self, value, expected): tm.assert_numpy_array_equal(result, expected) def test_datetime_other_units(self): - idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02']) + idx = pd.DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"]) exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) 
tm.assert_numpy_array_equal(isna(idx.values), exp) tm.assert_numpy_array_equal(notna(idx.values), ~exp) - for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', - 'datetime64[ns]']: + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: values = idx.values.astype(dtype) exp = np.array([False, True, False]) @@ -201,16 +233,22 @@ def test_datetime_other_units(self): tm.assert_series_equal(notna(s), ~exp) def test_timedelta_other_units(self): - idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days']) + idx = pd.TimedeltaIndex(["1 days", "NaT", "2 days"]) exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) tm.assert_numpy_array_equal(isna(idx.values), exp) tm.assert_numpy_array_equal(notna(idx.values), ~exp) - for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', - 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', - 'timedelta64[ns]']: + for dtype in [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ]: values = idx.values.astype(dtype) exp = np.array([False, True, False]) @@ -226,7 +264,7 @@ def test_timedelta_other_units(self): tm.assert_series_equal(notna(s), ~exp) def test_period(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2012-01"], freq="M") exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) @@ -241,95 +279,112 @@ def test_period(self): def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), - np.array([np.nan, np.nan])) - assert array_equivalent(np.array([np.nan, 1, np.nan]), - np.array([np.nan, 1, np.nan])) - assert array_equivalent(np.array([np.nan, None], dtype='object'), - np.array([np.nan, None], dtype='object')) - assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), - np.array([np.nan, 1 + 1j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1 + 1j], dtype='complex'), np.array( - [np.nan, 1 + 2j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) - assert not array_equivalent( - np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - assert array_equivalent(Float64Index([0, np.nan]), - Float64Index([0, np.nan])) + assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan])) + assert array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan]) + ) + assert array_equivalent( + np.array([np.nan, None], dtype="object"), + np.array([np.nan, None], dtype="object"), + ) + assert array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 1j], dtype="complex"), + ) assert not array_equivalent( - Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), - DatetimeIndex([0, np.nan])) + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 2j], dtype="complex"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), - TimedeltaIndex([0, np.nan])) + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]) + ) + assert not array_equivalent(np.array(["a", 
"b", "c", "d"]), np.array(["e", "e"])) + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) + ) with catch_warnings(): filterwarnings("ignore", "Converting timezone", FutureWarning) - assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), - DatetimeIndex([0, np.nan], tz='US/Eastern')) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( - [1, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) def test_array_equivalent_compat(): # see gh-13388 - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - assert (array_equivalent(m, n, strict_nan=True)) - assert (array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + assert array_equivalent(m, n, strict_nan=True) + assert array_equivalent(m, n, strict_nan=False) - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) - assert (not array_equivalent(m, n, strict_nan=True)) - assert (not array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (4, 3)], dtype=[("a", int), ("b", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) - assert (not array_equivalent(m, n, strict_nan=True)) - assert (not array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("b", int), ("a", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) def test_array_equivalent_str(): - for dtype in ['O', 'S', 'U']: - assert array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'B'], dtype=dtype)) - assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'X'], dtype=dtype)) - - -@pytest.mark.parametrize('dtype, na_value', [ - # Datetime-like 
- (np.dtype("M8[ns]"), NaT), - (np.dtype("m8[ns]"), NaT), - (DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]'), NaT), - (PeriodDtype("M"), NaT), - # Integer - ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0), - ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0), - # Bool - ('bool', False), - # Float - ('f2', np.nan), ('f4', np.nan), ('f8', np.nan), - # Object - ('O', np.nan), - # Interval - (IntervalDtype(), np.nan), -]) + for dtype in ["O", "S", "U"]: + assert array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "B"], dtype=dtype) + ) + assert not array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "X"], dtype=dtype) + ) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + # Datetime-like + (np.dtype("M8[ns]"), NaT), + (np.dtype("m8[ns]"), NaT), + (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT), + (PeriodDtype("M"), NaT), + # Integer + ("u1", 0), + ("u2", 0), + ("u4", 0), + ("u8", 0), + ("i1", 0), + ("i2", 0), + ("i4", 0), + ("i8", 0), + # Bool + ("bool", False), + # Float + ("f2", np.nan), + ("f4", np.nan), + ("f8", np.nan), + # Object + ("O", np.nan), + # Interval + (IntervalDtype(), np.nan), + ], +) def test_na_value_for_dtype(dtype, na_value): result = na_value_for_dtype(dtype) assert result is na_value @@ -337,8 +392,8 @@ def test_na_value_for_dtype(dtype, na_value): class TestNAObj: - _1d_methods = ['isnaobj', 'isnaobj_old'] - _2d_methods = ['isnaobj2d', 'isnaobj2d_old'] + _1d_methods = ["isnaobj", "isnaobj_old"] + _2d_methods = ["isnaobj2d", "isnaobj2d_old"] def _check_behavior(self, arr, expected): for method in TestNAObj._1d_methods: @@ -353,7 +408,7 @@ def _check_behavior(self, arr, expected): tm.assert_numpy_array_equal(result, expected) def test_basic(self): - arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan]) + arr = np.array([1, None, "foo", -5.1, pd.NaT, np.nan]) expected = np.array([False, True, False, False, True, True]) self._check_behavior(arr, expected) @@ -384,32 +439,31 @@ def test_empty_like(self): self._check_behavior(arr, expected) -m8_units = ['as', 'ps', 'ns', 'us', 'ms', 's', - 'm', 'h', 'D', 'W', 'M', 'Y'] - -na_vals = [ - None, - NaT, - float('NaN'), - complex('NaN'), - np.nan, - np.float64('NaN'), - np.float32('NaN'), - np.complex64(np.nan), - np.complex128(np.nan), - np.datetime64('NaT'), - np.timedelta64('NaT'), -] + [ - np.datetime64('NaT', unit) for unit in m8_units -] + [ - np.timedelta64('NaT', unit) for unit in m8_units -] +m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"] + +na_vals = ( + [ + None, + NaT, + float("NaN"), + complex("NaN"), + np.nan, + np.float64("NaN"), + np.float32("NaN"), + np.complex64(np.nan), + np.complex128(np.nan), + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + + [np.datetime64("NaT", unit) for unit in m8_units] + + [np.timedelta64("NaT", unit) for unit in m8_units] +) inf_vals = [ - float('inf'), - float('-inf'), - complex('inf'), - complex('-inf'), + float("inf"), + float("-inf"), + complex("inf"), + complex("-inf"), np.inf, np.NINF, ] @@ -420,14 +474,12 @@ def test_empty_like(self): int(NaT.value), ] -sometimes_na_vals = [ - Decimal('NaN'), -] +sometimes_na_vals = [Decimal("NaN")] never_na_vals = [ # float/complex values that when viewed as int64 match iNaT -0.0, - np.float64('-0.0'), + np.float64("-0.0"), -0j, np.complex64(-0j), ] diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index 0d6396033fac7c..ee043a6bb837c5 100644 --- a/pandas/tests/extension/arrow/bool.py +++ 
b/pandas/tests/extension/arrow/bool.py @@ -13,15 +13,19 @@ import pandas as pd from pandas.api.extensions import ( - ExtensionArray, ExtensionDtype, register_extension_dtype, take) + ExtensionArray, + ExtensionDtype, + register_extension_dtype, + take, +) @register_extension_dtype class ArrowBoolDtype(ExtensionDtype): type = np.bool_ - kind = 'b' - name = 'arrow_bool' + kind = "b" + name = "arrow_bool" na_value = pa.NULL @classmethod @@ -29,8 +33,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) @classmethod def construct_array_type(cls): @@ -90,9 +93,12 @@ def dtype(self): @property def nbytes(self): - return sum(x.size for chunk in self._data.chunks - for x in chunk.buffers() - if x is not None) + return sum( + x.size + for chunk in self._data.chunks + for x in chunk.buffers() + if x is not None + ) def isna(self): nas = pd.isna(self._data.to_pandas()) @@ -104,8 +110,7 @@ def take(self, indices, allow_fill=False, fill_value=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indices, fill_value=fill_value, - allow_fill=allow_fill) + result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result, dtype=self.dtype) def copy(self): @@ -113,15 +118,12 @@ def copy(self): @classmethod def _concat_same_type(cls, to_concat): - chunks = list(itertools.chain.from_iterable(x._data.chunks - for x in to_concat)) + chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) arr = pa.chunked_array(chunks) return cls(arr) def __invert__(self): - return type(self).from_scalars( - ~self._data.to_pandas() - ) + return type(self).from_scalars(~self._data.to_pandas()) def _reduce(self, method, skipna=True, **kwargs): if skipna: diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 21ce5e999334eb..205edf5da5b74b 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -5,7 +5,7 @@ from pandas.tests.extension import base import pandas.util.testing as tm -pytest.importorskip('pyarrow', minversion="0.10.0") +pytest.importorskip("pyarrow", minversion="0.10.0") from .bool import ArrowBoolArray, ArrowBoolDtype # isort:skip @@ -47,7 +47,7 @@ def test_from_dtype(self, data): pytest.skip("GH-22666") # seems like some bug in isna on empty BoolArray returning floats. 
- @pytest.mark.xfail(reason='bad is-na for empty data') + @pytest.mark.xfail(reason="bad is-na for empty data") def test_from_sequence_from_cls(self, data): super().test_from_sequence_from_cls(data) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 0b3f2b860c1270..090df35bd94c99 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -49,10 +49,12 @@ class TestMyDtype(BaseDtypeTests): from .io import BaseParsingTests # noqa from .methods import BaseMethodsTests # noqa from .missing import BaseMissingTests # noqa -from .ops import ( # noqa - BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil) +from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa from .printing import BasePrintingTests # noqa from .reduce import ( # noqa - BaseBooleanReduceTests, BaseNoReduceTests, BaseNumericReduceTests) + BaseBooleanReduceTests, + BaseNoReduceTests, + BaseNumericReduceTests, +) from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index 55cfbea479c472..2f808d20acd31b 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -6,6 +6,4 @@ class BaseExtensionTests: assert_equal = staticmethod(tm.assert_equal) assert_series_equal = staticmethod(tm.assert_series_equal) assert_frame_equal = staticmethod(tm.assert_frame_equal) - assert_extension_array_equal = staticmethod( - tm.assert_extension_array_equal - ) + assert_extension_array_equal = staticmethod(tm.assert_extension_array_equal) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 231a1f648f8e8a..7262a85b1fe003 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -8,7 +8,6 @@ class BaseConstructorsTests(BaseExtensionTests): - def test_from_sequence_from_cls(self, data): result = type(data)._from_sequence(data, dtype=data.dtype) self.assert_extension_array_equal(result, data) @@ -39,7 +38,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): if from_series: data = pd.Series(data) result = pd.DataFrame({"A": data}) - assert result.dtypes['A'] == data.dtype + assert result.dtypes["A"] == data.dtype assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) @@ -50,7 +49,7 @@ def test_dataframe_from_series(self, data): assert isinstance(result._data.blocks[0], ExtensionBlock) def test_series_given_mismatched_index_raises(self, data): - msg = 'Length of passed values is 3, index implies 5' + msg = "Length of passed values is 3, index implies 5" with pytest.raises(ValueError, match=msg): pd.Series(data[:3], index=[0, 1, 2, 3, 4]) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 7b9dedceb00d43..a5040c8cfc2fc8 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -15,7 +15,7 @@ def test_name(self, dtype): assert isinstance(dtype.name, str) def test_kind(self, dtype): - valid = set('biufcmMOSUV') + valid = set("biufcmMOSUV") if dtype.kind is not None: assert dtype.kind in valid @@ -46,10 +46,10 @@ def test_is_not_object_type(self, dtype): def test_eq_with_str(self, dtype): assert dtype == dtype.name - assert dtype != dtype.name + '-suffix' + assert dtype != dtype.name + "-suffix" def test_eq_with_numpy_object(self, 
dtype): - assert dtype != np.dtype('object') + assert dtype != np.dtype("object") def test_eq_with_self(self, dtype): assert dtype == dtype @@ -62,18 +62,16 @@ def test_check_dtype(self, data): dtype = data.dtype # check equivalency for using .dtypes - df = pd.DataFrame({'A': pd.Series(data, dtype=dtype), - 'B': data, - 'C': 'foo', 'D': 1}) + df = pd.DataFrame( + {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + ) # np.dtype('int64') == 'Int64' == 'int64' # so can't distinguish - if dtype.name == 'Int64': - expected = pd.Series([True, True, False, True], - index=list('ABCD')) + if dtype.name == "Int64": + expected = pd.Series([True, True, False, True], index=list("ABCD")) else: - expected = pd.Series([True, True, False, False], - index=list('ABCD')) + expected = pd.Series([True, True, False, False], index=list("ABCD")) # XXX: This should probably be *fixed* not ignored. # See libops.scalar_compare @@ -83,8 +81,7 @@ def test_check_dtype(self, data): self.assert_series_equal(result, expected) - expected = pd.Series([True, True, False, False], - index=list('ABCD')) + expected = pd.Series([True, True, False, False], index=list("ABCD")) result = df.dtypes.apply(str) == str(dtype) self.assert_series_equal(result, expected) @@ -96,10 +93,10 @@ def test_str(self, dtype): def test_eq(self, dtype): assert dtype == dtype.name - assert dtype != 'anonther_type' + assert dtype != "anonther_type" def test_construct_from_string(self, dtype): dtype_instance = dtype.__class__.construct_from_string(dtype.name) assert isinstance(dtype_instance, dtype.__class__) with pytest.raises(TypeError): - dtype.__class__.construct_from_string('another_type') + dtype.__class__.construct_from_string("another_type") diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 6a5507b51b3bac..e02586eacfea70 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -19,8 +19,7 @@ def test_iloc_series(self, data): self.assert_series_equal(result, expected) def test_iloc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': - np.arange(len(data), dtype='int64')}) + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame @@ -31,7 +30,7 @@ def test_iloc_frame(self, data): result = df.iloc[[0, 1, 2, 3], [0]] self.assert_frame_equal(result, expected) - expected = pd.Series(data[:4], name='A') + expected = pd.Series(data[:4], name="A") # slice -> series result = df.iloc[:4, 0] @@ -51,26 +50,25 @@ def test_loc_series(self, data): self.assert_series_equal(result, expected) def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, - 'B': np.arange(len(data), dtype='int64')}) + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame - result = df.loc[:3, ['A']] + result = df.loc[:3, ["A"]] self.assert_frame_equal(result, expected) # sequence -> frame - result = df.loc[[0, 1, 2, 3], ['A']] + result = df.loc[[0, 1, 2, 3], ["A"]] self.assert_frame_equal(result, expected) - expected = pd.Series(data[:4], name='A') + expected = pd.Series(data[:4], name="A") # slice -> series - result = df.loc[:3, 'A'] + result = df.loc[:3, "A"] self.assert_series_equal(result, expected) # sequence -> series - result = df.loc[:3, 'A'] + result = df.loc[:3, "A"] self.assert_series_equal(result, expected) def test_loc_iloc_frame_single_dtype(self, data): @@ -82,8 +80,9 @@ def 
test_loc_iloc_frame_single_dtype(self, data): result = df.loc[2] self.assert_series_equal(result, expected) - expected = pd.Series([data[-1]], index=["A"], name=len(data) - 1, - dtype=data.dtype) + expected = pd.Series( + [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype + ) result = df.iloc[-1] self.assert_series_equal(result, expected) @@ -146,14 +145,14 @@ def test_get(self, data): assert s.get(-1) is None assert s.get(s.index.max() + 1) is None - s = pd.Series(data[:6], index=list('abcdef')) - assert s.get('c') == s.iloc[2] + s = pd.Series(data[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] - result = s.get(slice('b', 'd')) + result = s.get(slice("b", "d")) expected = s.iloc[[1, 2, 3]] self.assert_series_equal(result, expected) - result = s.get('Z') + result = s.get("Z") assert result is None assert s.get(4) == s.iloc[4] @@ -216,7 +215,7 @@ def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError): data.take([0, -2], fill_value=na_value, allow_fill=True) - @pytest.mark.parametrize('allow_fill', [True, False]) + @pytest.mark.parametrize("allow_fill", [True, False]) def test_take_out_of_bounds_raises(self, data, allow_fill): arr = data[:3] with pytest.raises(IndexError): @@ -227,7 +226,8 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=[0, len(data) - 1]) + index=[0, len(data) - 1], + ) self.assert_series_equal(result, expected) def test_reindex(self, data, na_value): @@ -239,15 +239,15 @@ def test_reindex(self, data, na_value): n = len(data) result = s.reindex([-1, 0, n]) expected = pd.Series( - data._from_sequence([na_value, data[0], na_value], - dtype=s.dtype), - index=[-1, 0, n]) + data._from_sequence([na_value, data[0], na_value], dtype=s.dtype), + index=[-1, 0, n], + ) self.assert_series_equal(result, expected) result = s.reindex([n, n + 1]) - expected = pd.Series(data._from_sequence([na_value, na_value], - dtype=s.dtype), - index=[n, n + 1]) + expected = pd.Series( + data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + ) self.assert_series_equal(result, expected) def test_reindex_non_na_fill_value(self, data_missing): diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index daeec5923888c3..dc926d2ff6ab43 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -10,20 +10,18 @@ class BaseGroupbyTests(BaseExtensionTests): """Groupby-specific tests.""" def test_grouping_grouper(self, data_for_grouping): - df = pd.DataFrame({ - "A": ["B", "B", None, None, "A", "A", "B", "C"], - "B": data_for_grouping - }) + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + ) gr1 = df.groupby("A").grouper.groupings[0] gr2 = df.groupby("B").grouper.groupings[0] tm.assert_numpy_array_equal(gr1.grouper, df.A.values) tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) - @pytest.mark.parametrize('as_index', [True, False]) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() _, index = pd.factorize(data_for_grouping, sort=True) @@ -36,8 +34,7 @@ def test_groupby_extension_agg(self, as_index, 
data_for_grouping): self.assert_frame_equal(result, expected) def test_groupby_extension_no_sort(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) @@ -47,44 +44,48 @@ def test_groupby_extension_no_sort(self, data_for_grouping): def test_groupby_extension_transform(self, data_for_grouping): valid = data_for_grouping[~data_for_grouping.isna()] - df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], - "B": valid}) + df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) result = df.groupby("B").A.transform(len) expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") self.assert_series_equal(result, expected) - def test_groupby_extension_apply( - self, data_for_grouping, groupby_apply_op): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) df.groupby("B").apply(groupby_apply_op) df.groupby("B").A.apply(groupby_apply_op) df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) - result = df.groupby('A').B.apply(lambda x: x.array) - expected = pd.Series([df.B.iloc[[0, 1, 6]].array, - df.B.iloc[[2, 3]].array, - df.B.iloc[[4, 5]].array, - df.B.iloc[[7]].array], - index=pd.Index([1, 2, 3, 4], name='A'), - name='B') + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array, + ], + index=pd.Index([1, 2, 3, 4], name="A"), + name="B", + ) self.assert_series_equal(result, expected) def test_in_numeric_groupby(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1]}) + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1], + } + ) result = df.groupby("A").sum().columns if data_for_grouping.dtype._is_numeric: - expected = pd.Index(['B', 'C']) + expected = pd.Index(["B", "C"]) else: - expected = pd.Index(['C']) + expected = pd.Index(["C"]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index fd47ae6f312907..dee8021f5375f7 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -11,6 +11,7 @@ class BaseInterfaceTests(BaseExtensionTests): """Tests that the basic interface is satisfied.""" + # ------------------------------------------------------------------------ # Interface # ------------------------------------------------------------------------ @@ -47,8 +48,8 @@ def test_is_extension_array_dtype(self, data): def test_no_values_attribute(self, data): # GH-20735: EA's with .values attribute give problems with internal # code, disallowing this for now until solved - assert not hasattr(data, 'values') - assert not hasattr(data, '_values') + assert not hasattr(data, "values") + assert not hasattr(data, "_values") def test_is_numeric_honored(self, data): result = 
pd.Series(data) @@ -59,10 +60,10 @@ def test_isna_extension_array(self, data_missing): # _reduce. At the *very* least, you must implement any and all na = data_missing.isna() if is_extension_array_dtype(na): - assert na._reduce('any') + assert na._reduce("any") assert na.any() - assert not na._reduce('all') + assert not na._reduce("all") assert not na.all() assert na.dtype._is_boolean diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 22787c38b66fbc..3de752a8c682a3 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -9,15 +9,12 @@ class BaseParsingTests(BaseExtensionTests): - - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - df = pd.DataFrame({ - 'with_dtype': pd.Series(data, dtype=str(data.dtype)) - }) + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) csv_output = df.to_csv(index=False, na_rep=np.nan) - result = pd.read_csv(StringIO(csv_output), dtype={ - 'with_dtype': str(data.dtype) - }, engine=engine) + result = pd.read_csv( + StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine + ) expected = df self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 9b154a8afeabcc..6d47b0c1d1f778 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -11,7 +11,7 @@ class BaseMethodsTests(BaseExtensionTests): """Various Series and DataFrame methods.""" - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: @@ -20,14 +20,13 @@ def test_value_counts(self, all_data, dropna): other = all_data result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts( - dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() self.assert_series_equal(result, expected) def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) - result = df.count(axis='columns') + result = df.count(axis="columns") expected = pd.Series([0, 1]) self.assert_series_equal(result, expected) @@ -60,16 +59,19 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) - @pytest.mark.parametrize('na_position, expected', [ - ('last', np.array([2, 0, 1], dtype=np.dtype('intp'))), - ('first', np.array([1, 2, 0], dtype=np.dtype('intp'))) - ]) + @pytest.mark.parametrize( + "na_position, expected", + [ + ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))), + ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))), + ], + ) def test_nargsort(self, data_missing_for_sorting, na_position, expected): # GH 25439 result = nargsort(data_missing_for_sorting, na_position=na_position) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) result = ser.sort_values(ascending=ascending) @@ -79,7 +81,7 @@ def test_sort_values(self, data_for_sorting, ascending): self.assert_series_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + 
@pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): ser = pd.Series(data_missing_for_sorting) result = ser.sort_values(ascending=ascending) @@ -89,18 +91,17 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending): expected = ser.iloc[[0, 2, 1]] self.assert_series_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_frame(self, data_for_sorting, ascending): - df = pd.DataFrame({"A": [1, 2, 1], - "B": data_for_sorting}) - result = df.sort_values(['A', 'B']) - expected = pd.DataFrame({"A": [1, 1, 2], - 'B': data_for_sorting.take([2, 0, 1])}, - index=[2, 0, 1]) + df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting}) + result = df.sort_values(["A", "B"]) + expected = pd.DataFrame( + {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1] + ) self.assert_frame_equal(result, expected) - @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): duplicated = box(data._from_sequence([data[0], data[0]])) @@ -110,19 +111,18 @@ def test_unique(self, data, box, method): assert isinstance(result, type(data)) assert result[0] == duplicated[0] - @pytest.mark.parametrize('na_sentinel', [-1, -2]) + @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - labels, uniques = pd.factorize(data_for_grouping, - na_sentinel=na_sentinel) - expected_labels = np.array([0, 0, na_sentinel, - na_sentinel, 1, 1, 0, 2], - dtype=np.intp) + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp + ) expected_uniques = data_for_grouping.take([0, 4, 7]) tm.assert_numpy_array_equal(labels, expected_labels) self.assert_extension_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize('na_sentinel', [-1, -2]) + @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) @@ -169,8 +169,9 @@ def test_combine_le(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) - expected = pd.Series([a <= b for (a, b) in - zip(list(orig_data1), list(orig_data2))]) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) self.assert_series_equal(result, expected) val = s1.iloc[0] @@ -184,17 +185,19 @@ def test_combine_add(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 + x2) - with np.errstate(over='ignore'): + with np.errstate(over="ignore"): expected = pd.Series( - orig_data1._from_sequence([a + b for (a, b) in - zip(list(orig_data1), - list(orig_data2))])) + orig_data1._from_sequence( + [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 + x2) expected = pd.Series( - orig_data1._from_sequence([a + val for a in list(orig_data1)])) + 
orig_data1._from_sequence([a + val for a in list(orig_data1)]) + ) self.assert_series_equal(result, expected) def test_combine_first(self, data): @@ -205,24 +208,22 @@ def test_combine_first(self, data): expected = pd.Series(data[:5]) self.assert_series_equal(result, expected) - @pytest.mark.parametrize('frame', [True, False]) - @pytest.mark.parametrize('periods, indices', [ - (-2, [2, 3, 4, -1, -1]), - (0, [0, 1, 2, 3, 4]), - (2, [-1, -1, 0, 1, 2]), - ]) + @pytest.mark.parametrize("frame", [True, False]) + @pytest.mark.parametrize( + "periods, indices", + [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], + ) def test_container_shift(self, data, frame, periods, indices): # https://github.com/pandas-dev/pandas/issues/22386 subset = data[:5] - data = pd.Series(subset, name='A') - expected = pd.Series(subset.take(indices, allow_fill=True), name='A') + data = pd.Series(subset, name="A") + expected = pd.Series(subset.take(indices, allow_fill=True), name="A") if frame: - result = data.to_frame(name='A').assign(B=1).shift(periods) - expected = pd.concat([ - expected, - pd.Series([1] * 5, name='B').shift(periods) - ], axis=1) + result = data.to_frame(name="A").assign(B=1).shift(periods) + expected = pd.concat( + [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1 + ) compare = self.assert_frame_equal else: result = data.shift(periods) @@ -230,13 +231,10 @@ def test_container_shift(self, data, frame, periods, indices): compare(result, expected) - @pytest.mark.parametrize('periods, indices', [ - [-4, [-1, -1]], - [-1, [1, -1]], - [0, [0, 1]], - [1, [-1, 0]], - [4, [-1, -1]] - ]) + @pytest.mark.parametrize( + "periods, indices", + [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], + ) def test_shift_non_empty_array(self, data, periods, indices): # https://github.com/pandas-dev/pandas/issues/23911 subset = data[:2] @@ -244,9 +242,7 @@ def test_shift_non_empty_array(self, data, periods, indices): expected = subset.take(indices, allow_fill=True) self.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('periods', [ - -4, -1, 0, 1, 4 - ]) + @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4]) def test_shift_empty_array(self, data, periods): # https://github.com/pandas-dev/pandas/issues/23911 empty = data[:0] @@ -307,15 +303,16 @@ def test_where_series(self, data, na_value, as_frame): cond = np.array([True, True, False, False]) if as_frame: - ser = ser.to_frame(name='a') + ser = ser.to_frame(name="a") cond = cond.reshape(-1, 1) result = ser.where(cond) - expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], - dtype=data.dtype)) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype) + ) if as_frame: - expected = expected.to_frame(name='a') + expected = expected.to_frame(name="a") self.assert_equal(result, expected) # array other @@ -325,10 +322,9 @@ def test_where_series(self, data, na_value, as_frame): other = pd.DataFrame({"a": other}) cond = pd.DataFrame({"a": cond}) result = ser.where(cond, other) - expected = pd.Series(cls._from_sequence([a, b, b, b], - dtype=data.dtype)) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) if as_frame: - expected = expected.to_frame(name='a') + expected = expected.to_frame(name="a") self.assert_equal(result, expected) @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) @@ -347,11 +343,15 @@ def test_repeat(self, data, repeats, as_series, use_numpy): self.assert_equal(result, expected) - 
@pytest.mark.parametrize('repeats, kwargs, error, msg', [ - (2, dict(axis=1), ValueError, "'axis"), - (-1, dict(), ValueError, "negative"), - ([1, 2], dict(), ValueError, "shape"), - (2, dict(foo='bar'), TypeError, "'foo'")]) + @pytest.mark.parametrize( + "repeats, kwargs, error, msg", + [ + (2, dict(axis=1), ValueError, "'axis"), + (-1, dict(), ValueError, "negative"), + ([1, 2], dict(), ValueError, "shape"), + (2, dict(foo="bar"), TypeError, "'foo'"), + ], + ) def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): with pytest.raises(error, match=msg): if use_numpy: diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 834f49f0461f09..21bbb365ab0f33 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -42,13 +42,12 @@ def test_dropna_frame(self, data_missing): self.assert_frame_equal(result, expected) # axis = 1 - result = df.dropna(axis='columns') + result = df.dropna(axis="columns") expected = pd.DataFrame(index=[0, 1]) self.assert_frame_equal(result, expected) # multiple - df = pd.DataFrame({"A": data_missing, - "B": [1, np.nan]}) + df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]}) result = df.dropna() expected = df.iloc[:0] self.assert_frame_equal(result, expected) @@ -61,13 +60,13 @@ def test_fillna_scalar(self, data_missing): def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) - result = pd.Series(arr).fillna(method='ffill', limit=2) + result = pd.Series(arr).fillna(method="ffill", limit=2) expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) self.assert_series_equal(result, expected) def test_fillna_limit_backfill(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) - result = pd.Series(arr).fillna(method='backfill', limit=2) + result = pd.Series(arr).fillna(method="backfill", limit=2) expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) self.assert_series_equal(result, expected) @@ -76,8 +75,11 @@ def test_fillna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.fillna(fill_value) - expected = pd.Series(data_missing._from_sequence( - [fill_value, fill_value], dtype=data_missing.dtype)) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) self.assert_series_equal(result, expected) # Fill with a series @@ -91,40 +93,37 @@ def test_fillna_series(self, data_missing): def test_fillna_series_method(self, data_missing, fillna_method): fill_value = data_missing[1] - if fillna_method == 'ffill': + if fillna_method == "ffill": data_missing = data_missing[::-1] result = pd.Series(data_missing).fillna(method=fillna_method) - expected = pd.Series(data_missing._from_sequence( - [fill_value, fill_value], dtype=data_missing.dtype)) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) self.assert_series_equal(result, expected) def test_fillna_frame(self, data_missing): fill_value = data_missing[1] - result = pd.DataFrame({ - "A": data_missing, - "B": [1, 2] - }).fillna(fill_value) + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) - expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value], - dtype=data_missing.dtype), - "B": [1, 2], - }) + expected = pd.DataFrame( + { + "A": data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ), + "B": [1, 2], + } + ) self.assert_frame_equal(result, expected) def 
test_fillna_fill_other(self, data): - result = pd.DataFrame({ - "A": data, - "B": [np.nan] * len(data) - }).fillna({"B": 0.0}) - - expected = pd.DataFrame({ - "A": data, - "B": [0.0] * len(result), - }) + result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0}) + + expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 708eb9c7c8c439..e35464964f4325 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -9,9 +9,8 @@ class BaseOpsUtil(BaseExtensionTests): - def get_op_from_name(self, op_name): - short_opname = op_name.strip('_') + short_opname = op_name.strip("_") try: op = getattr(operator, short_opname) except AttributeError: @@ -61,6 +60,7 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * series_array_exc = TypeError * divmod_exc = TypeError """ + series_scalar_exc = TypeError frame_scalar_exc = TypeError series_array_exc = TypeError @@ -76,15 +76,16 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op_name = all_arithmetic_operators - df = pd.DataFrame({'A': data}) + df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators s = pd.Series(data) - self.check_opname(s, op_name, pd.Series([s.iloc[0]] * len(s)), - exc=self.series_array_exc) + self.check_opname( + s, op_name, pd.Series([s.iloc[0]] * len(s)), exc=self.series_array_exc + ) def test_divmod(self, data): s = pd.Series(data) @@ -117,7 +118,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # EAs should return NotImplemented for ops with Series. # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) - if hasattr(data, '__add__'): + if hasattr(data, "__add__"): result = data.__add__(other) assert result is NotImplemented else: @@ -131,10 +132,10 @@ class BaseComparisonOpsTests(BaseOpsUtil): def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) - if op_name == '__eq__': + if op_name == "__eq__": assert getattr(data, op_name)(other) is NotImplemented assert not op(s, other).all() - elif op_name == '__ne__': + elif op_name == "__ne__": assert getattr(data, op_name)(other) is NotImplemented assert op(s, other).all() @@ -163,7 +164,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # EAs should return NotImplemented for ops with Series. # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) - if hasattr(data, '__eq__'): + if hasattr(data, "__eq__"): result = data.__eq__(other) assert result is NotImplemented else: diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 8b33ce173c7860..0f10efbf32a494 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -19,10 +19,10 @@ def test_array_repr(self, data, size): result = repr(data) assert data.__class__.__name__ in result - assert 'Length: {}'.format(len(data)) in result + assert "Length: {}".format(len(data)) in result assert str(data.dtype) in result - if size == 'big': - assert '...' in result + if size == "big": + assert "..." 
in result def test_array_repr_unicode(self, data): result = str(data) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c4b70f20132656..8766bb771f8a23 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -13,16 +13,17 @@ class BaseReduceTests(BaseExtensionTests): Reduction specific tests. Generally these only make sense for numeric/boolean operations. """ + def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype('float64'), op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) class BaseNoReduceTests(BaseReduceTests): """ we don't define any reductions """ - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) @@ -30,7 +31,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): with pytest.raises(TypeError): getattr(s, op_name)(skipna=skipna) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions s = pd.Series(data) @@ -40,8 +41,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): class BaseNumericReduceTests(BaseReduceTests): - - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) @@ -53,8 +53,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): class BaseBooleanReduceTests(BaseReduceTests): - - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions s = pd.Series(data) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 4ea78a4239e6ea..90e607343297d2 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -11,7 +11,8 @@ class BaseReshapingTests(BaseExtensionTests): """Tests for reshaping and concatenation.""" - @pytest.mark.parametrize('in_frame', [True, False]) + + @pytest.mark.parametrize("in_frame", [True, False]) def test_concat(self, data, in_frame): wrapped = pd.Series(data) if in_frame: @@ -28,7 +29,7 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype assert isinstance(result._data.blocks[0], ExtensionBlock) - @pytest.mark.parametrize('in_frame', [True, False]) + @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) @@ -45,9 +46,9 @@ def test_concat_all_na_block(self, data_missing, in_frame): def test_concat_mixed_dtypes(self, data): # https://github.com/pandas-dev/pandas/issues/20762 - df1 = pd.DataFrame({'A': data[:3]}) + df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"A": [1, 2, 3]}) - df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category') + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") dfs = [df1, df2, df3] # 
dataframes @@ -56,40 +57,41 @@ def test_concat_mixed_dtypes(self, data): self.assert_frame_equal(result, expected) # series - result = pd.concat([x['A'] for x in dfs]) - expected = pd.concat([x['A'].astype(object) for x in dfs]) + result = pd.concat([x["A"] for x in dfs]) + expected = pd.concat([x["A"].astype(object) for x in dfs]) self.assert_series_equal(result, expected) # simple test for just EA and one other result = pd.concat([df1, df2]) - expected = pd.concat([df1.astype('object'), df2.astype('object')]) + expected = pd.concat([df1.astype("object"), df2.astype("object")]) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['A']]) - expected = pd.concat([df1['A'].astype('object'), - df2['A'].astype('object')]) + result = pd.concat([df1["A"], df2["A"]]) + expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) self.assert_series_equal(result, expected) def test_concat_columns(self, data, na_value): - df1 = pd.DataFrame({'A': data[:3]}) - df2 = pd.DataFrame({'B': [1, 2, 3]}) + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}) - expected = pd.DataFrame({'A': data[:3], 'B': [1, 2, 3]}) + expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]}) result = pd.concat([df1, df2], axis=1) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['B']], axis=1) + result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) # non-aligned - df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3]) - expected = pd.DataFrame({ - 'A': data._from_sequence(list(data[:3]) + [na_value], - dtype=data.dtype), - 'B': [np.nan, 1, 2, 3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3]) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": [np.nan, 1, 2, 3], + } + ) result = pd.concat([df1, df2], axis=1) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['B']], axis=1) + result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) def test_align(self, data, na_value): @@ -98,80 +100,88 @@ def test_align(self, data, na_value): r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(data._from_sequence(list(a) + [na_value], - dtype=data.dtype)) - e2 = pd.Series(data._from_sequence([na_value] + list(b), - dtype=data.dtype)) + e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype)) + e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype)) self.assert_series_equal(r1, e1) self.assert_series_equal(r2, e2) def test_align_frame(self, data, na_value): a = data[:3] b = data[2:5] - r1, r2 = pd.DataFrame({'A': a}).align( - pd.DataFrame({'A': b}, index=[1, 2, 3]) - ) + r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value], - dtype=data.dtype)}) - e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b), - dtype=data.dtype)}) + e1 = pd.DataFrame( + {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)} + ) + e2 = pd.DataFrame( + {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)} + ) self.assert_frame_equal(r1, e1) self.assert_frame_equal(r2, e2) def test_align_series_frame(self, data, na_value): # https://github.com/pandas-dev/pandas/issues/20576 - ser = pd.Series(data, 
name='a') + ser = pd.Series(data, name="a") df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) - e1 = pd.Series(data._from_sequence(list(data) + [na_value], - dtype=data.dtype), - name=ser.name) + e1 = pd.Series( + data._from_sequence(list(data) + [na_value], dtype=data.dtype), + name=ser.name, + ) self.assert_series_equal(r1, e1) self.assert_frame_equal(r2, df) def test_set_frame_expand_regular_with_extension(self, data): df = pd.DataFrame({"A": [1] * len(data)}) - df['B'] = data + df["B"] = data expected = pd.DataFrame({"A": [1] * len(data), "B": data}) self.assert_frame_equal(df, expected) def test_set_frame_expand_extension_with_regular(self, data): - df = pd.DataFrame({'A': data}) - df['B'] = [1] * len(data) + df = pd.DataFrame({"A": data}) + df["B"] = [1] * len(data) expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) self.assert_frame_equal(df, expected) def test_set_frame_overwrite_object(self, data): # https://github.com/pandas-dev/pandas/issues/20555 df = pd.DataFrame({"A": [1] * len(data)}, dtype=object) - df['A'] = data - assert df.dtypes['A'] == data.dtype + df["A"] = data + assert df.dtypes["A"] == data.dtype def test_merge(self, data, na_value): # GH-20743 - df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3], - 'key': [0, 1, 2]}) - df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) + df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) + df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) res = pd.merge(df1, df2) exp = pd.DataFrame( - {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], - 'ext': data._from_sequence([data[0], data[0], data[1]], - dtype=data.dtype)}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + { + "int1": [1, 1, 2], + "int2": [1, 2, 3], + "key": [0, 0, 1], + "ext": data._from_sequence( + [data[0], data[0], data[1]], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) - res = pd.merge(df1, df2, how='outer') + res = pd.merge(df1, df2, how="outer") exp = pd.DataFrame( - {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4], - 'key': [0, 0, 1, 2, 3], - 'ext': data._from_sequence( - [data[0], data[0], data[1], data[2], na_value], - dtype=data.dtype)}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + { + "int1": [1, 1, 2, 3, np.nan], + "int2": [1, 2, 3, np.nan, 4], + "key": [0, 0, 1, 2, 3], + "ext": data._from_sequence( + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) def test_merge_on_extension_array(self, data): # GH 23020 @@ -179,14 +189,12 @@ def test_merge_on_extension_array(self, data): key = type(data)._from_sequence([a, b], dtype=data.dtype) df = pd.DataFrame({"key": key, "val": [1, 2]}) - result = pd.merge(df, df, on='key') - expected = pd.DataFrame({"key": key, - "val_x": [1, 2], - "val_y": [1, 2]}) + result = pd.merge(df, df, on="key") + expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]}) self.assert_frame_equal(result, expected) # order - result = pd.merge(df.iloc[[1, 0]], df, on='key') + result = pd.merge(df.iloc[[1, 0]], df, on="key") expected = expected.iloc[[1, 0]].reset_index(drop=True) self.assert_frame_equal(result, expected) @@ -197,19 +205,25 @@ def test_merge_on_extension_array_duplicates(self, data): df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) - result = pd.merge(df1, df2, on='key') - 
expected = pd.DataFrame({ - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], - }) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame( + { + "key": key.take([0, 0, 0, 0, 1]), + "val_x": [1, 1, 3, 3, 2], + "val_y": [1, 3, 1, 3, 2], + } + ) self.assert_frame_equal(result, expected) - @pytest.mark.parametrize("columns", [ - ["A", "B"], - pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')], - names=['outer', 'inner']), - ]) + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) def test_stack(self, data, columns): df = pd.DataFrame({"A": data[:5], "B": data[:5]}) df.columns = columns @@ -227,27 +241,29 @@ def test_stack(self, data, columns): result = result.astype(object) self.assert_equal(result, expected) - @pytest.mark.parametrize("index", [ - # Two levels, uniform. - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), - names=['a', 'b']), - - # non-uniform - pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]), - - # three levels, non-uniform - pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]), - pd.MultiIndex.from_tuples([ - ('A', 'a', 1), - ('A', 'b', 0), - ('A', 'a', 0), - ('B', 'a', 0), - ('B', 'c', 1), - ]), - ]) + @pytest.mark.parametrize( + "index", + [ + # Two levels, uniform. + pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]), + # non-uniform + pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]), + # three levels, non-uniform + pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]), + pd.MultiIndex.from_tuples( + [ + ("A", "a", 1), + ("A", "b", 0), + ("A", "a", 0), + ("B", "a", 0), + ("B", "c", 1), + ] + ), + ], + ) @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, data, index, obj): - data = data[:len(index)] + data = data[: len(index)] if obj == "series": ser = pd.Series(data, index=index) else: @@ -263,8 +279,9 @@ def test_unstack(self, data, index, obj): for level in combinations: result = ser.unstack(level=level) - assert all(isinstance(result[col].array, type(data)) - for col in result.columns) + assert all( + isinstance(result[col].array, type(data)) for col in result.columns + ) expected = ser.astype(object).unstack(level=level) result = result.astype(object) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index db6328e39e6cc0..bb6bb02b462e28 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -31,13 +31,13 @@ def test_setitem_sequence_mismatched_length_raises(self, data, as_array): if as_array: value = data._from_sequence(value) - xpr = 'cannot set using a {} indexer with a different length' - with pytest.raises(ValueError, match=xpr.format('list-like')): + xpr = "cannot set using a {} indexer with a different length" + with pytest.raises(ValueError, match=xpr.format("list-like")): ser[[0, 1]] = value # Ensure no modifications made before the exception self.assert_series_equal(ser, original) - with pytest.raises(ValueError, match=xpr.format('slice')): + with pytest.raises(ValueError, match=xpr.format("slice")): ser[slice(3)] = value self.assert_series_equal(ser, original) @@ -55,7 +55,7 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): assert data[0] == data[2] assert data[1] == data[2] - @pytest.mark.parametrize('setter', ['loc', 'iloc']) + @pytest.mark.parametrize("setter", ["loc", "iloc"]) def 
test_setitem_scalar(self, data, setter): arr = pd.Series(data) setter = getattr(arr, setter) @@ -64,36 +64,36 @@ def test_setitem_scalar(self, data, setter): def test_setitem_loc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - df.loc[0, 'B'] = data[1] - assert df.loc[0, 'B'] == data[1] + df.loc[0, "B"] = data[1] + assert df.loc[0, "B"] == data[1] def test_setitem_loc_scalar_single(self, data): df = pd.DataFrame({"B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_loc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_iloc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) df.iloc[0, 1] = data[1] - assert df.loc[0, 'B'] == data[1] + assert df.loc[0, "B"] == data[1] def test_setitem_iloc_scalar_single(self, data): df = pd.DataFrame({"B": data}) df.iloc[10, 0] = data[1] - assert df.loc[10, 'B'] == data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) df.iloc[10, 1] = data[1] - assert df.loc[10, 'B'] == data[1] + assert df.loc[10, "B"] == data[1] - @pytest.mark.parametrize('as_callable', [True, False]) - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("as_callable", [True, False]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_aligned(self, data, as_callable, setter): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) @@ -117,13 +117,13 @@ def test_setitem_mask_aligned(self, data, as_callable, setter): assert ser[0] == data[5] assert ser[1] == data[6] - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_broadcast(self, data, setter): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) mask[:2] = True - if setter: # loc + if setter: # loc target = getattr(ser, setter) else: # __setitem__ target = ser @@ -135,35 +135,35 @@ def test_setitem_mask_broadcast(self, data, setter): def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) result = df.copy() - result['B'] = 1 + result["B"] = 1 expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) self.assert_frame_equal(result, expected) result = df.copy() - result.loc[:, 'B'] = 1 + result.loc[:, "B"] = 1 self.assert_frame_equal(result, expected) # overwrite with new type - result['B'] = data + result["B"] = data expected = pd.DataFrame({"A": data, "B": data}) self.assert_frame_equal(result, expected) def test_setitem_expand_with_extension(self, data): df = pd.DataFrame({"A": [1] * len(data)}) result = df.copy() - result['B'] = data + result["B"] = data expected = pd.DataFrame({"A": [1] * len(data), "B": data}) self.assert_frame_equal(result, expected) result = df.copy() - result.loc[:, 'B'] = data + result.loc[:, "B"] = data self.assert_frame_equal(result, expected) def test_setitem_frame_invalid_length(self, data): df = pd.DataFrame({"A": [1] * len(data)}) xpr = "Length of values does not match length of index" with pytest.raises(ValueError, match=xpr): - df['B'] = data[:5] + df["B"] = data[:5] @pytest.mark.xfail(reason="GH#20441: setitem on extension types.") def test_setitem_tuple_index(self, data): diff --git 
a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 6fbd43e46495f2..d37638d37e4d6c 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -33,12 +33,12 @@ def data_missing(): raise NotImplementedError -@pytest.fixture(params=['data', 'data_missing']) +@pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture giving 'data' and 'data_missing'""" - if request.param == 'data': + if request.param == "data": return data - elif request.param == 'data_missing': + elif request.param == "data_missing": return data_missing @@ -57,9 +57,11 @@ def data_repeated(data): A callable that takes a `count` argument and returns a generator yielding `count` datasets. """ + def gen(count): for _ in range(count): yield data + return gen @@ -118,12 +120,15 @@ def box_in_series(request): return request.param -@pytest.fixture(params=[ - lambda x: 1, - lambda x: [1] * len(x), - lambda x: Series([1] * len(x)), - lambda x: x, -], ids=['scalar', 'list', 'series', 'object']) +@pytest.fixture( + params=[ + lambda x: 1, + lambda x: [1] * len(x), + lambda x: Series([1] * len(x)), + lambda x: x, + ], + ids=["scalar", "list", "series", "object"], +) def groupby_apply_op(request): """ Functions to test groupby.apply(). @@ -156,7 +161,7 @@ def use_numpy(request): return request.param -@pytest.fixture(params=['ffill', 'bfill']) +@pytest.fixture(params=["ffill", "bfill"]) def fillna_method(request): """ Parametrized fixture giving method parameters 'ffill' and 'bfill' for diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 7c48e7e71503e7..8194327f8812ea 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -1,3 +1,3 @@ from .array import DecimalArray, DecimalDtype, make_data, to_decimal -__all__ = ['DecimalArray', 'DecimalDtype', 'to_decimal', 'make_data'] +__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d097a599730b80..90e6a91fbb91af 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -15,15 +15,15 @@ @register_extension_dtype class DecimalDtype(ExtensionDtype): type = decimal.Decimal - name = 'decimal' - na_value = decimal.Decimal('NaN') - _metadata = ('context',) + name = "decimal" + na_value = decimal.Decimal("NaN") + _metadata = ("context",) def __init__(self, context=None): self.context = context or decimal.getcontext() def __repr__(self): - return 'DecimalDtype(context={})'.format(self.context) + return "DecimalDtype(context={})".format(self.context) @classmethod def construct_array_type(cls): @@ -40,8 +40,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) @property def _is_numeric(self): @@ -54,8 +53,7 @@ class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin): def __init__(self, values, dtype=None, copy=False, context=None): for val in values: if not isinstance(val, decimal.Decimal): - raise TypeError("All values must be of type " + - str(decimal.Decimal)) + raise TypeError("All values must be of type " + str(decimal.Decimal)) values = np.asarray(values, dtype=object) self._data = values @@ -77,8 +75,7 @@ def _from_sequence(cls, 
scalars, dtype=None, copy=False): @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence([decimal.Decimal(x) for x in strings], - dtype, copy) + return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy) @classmethod def _from_factorized(cls, values, original): @@ -88,12 +85,12 @@ def _from_factorized(cls, values, original): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # - if not all(isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) - for t in inputs): + if not all( + isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs + ): return NotImplemented - inputs = tuple(x._data if isinstance(x, DecimalArray) else x - for x in inputs) + inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) def reconstruct(x): @@ -120,8 +117,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indexer, fill_value=fill_value, - allow_fill=allow_fill) + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) def copy(self): @@ -156,7 +152,7 @@ def isna(self): @property def _na_value(self): - return decimal.Decimal('NaN') + return decimal.Decimal("NaN") def _formatter(self, boxed=False): if boxed: @@ -175,8 +171,9 @@ def _reduce(self, name, skipna=True, **kwargs): try: op = getattr(self.data, name) except AttributeError: - raise NotImplementedError("decimal does not support " - "the {} operation".format(name)) + raise NotImplementedError( + "decimal does not support " "the {} operation".format(name) + ) return op(axis=0) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 80885e4045e647..272936f6ec9f01 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -29,21 +29,21 @@ def data_for_twos(): @pytest.fixture def data_missing(): - return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) + return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)]) @pytest.fixture def data_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('2'), - decimal.Decimal('0')]) + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("2"), decimal.Decimal("0")] + ) @pytest.fixture def data_missing_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('NaN'), - decimal.Decimal('0')]) + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("NaN"), decimal.Decimal("0")] + ) @pytest.fixture @@ -58,15 +58,14 @@ def na_value(): @pytest.fixture def data_for_grouping(): - b = decimal.Decimal('1.0') - a = decimal.Decimal('0.0') - c = decimal.Decimal('2.0') - na = decimal.Decimal('NaN') + b = decimal.Decimal("1.0") + a = decimal.Decimal("0.0") + c = decimal.Decimal("2.0") + na = decimal.Decimal("NaN") return DecimalArray([b, b, na, na, a, a, b, c]) class BaseDecimal: - def assert_series_equal(self, left, right, *args, **kwargs): def convert(x): # need to convert array([Decimal(NaN)], dtype='object') to np.NaN @@ -77,35 +76,34 @@ def convert(x): except TypeError: return False - if left.dtype == 'object': + if left.dtype == "object": left_na = left.apply(convert) else: left_na = left.isna() - if right.dtype == 'object': + if right.dtype == "object": right_na = right.apply(convert) else: right_na = right.isna() 
tm.assert_series_equal(left_na, right_na) - return tm.assert_series_equal(left[~left_na], - right[~right_na], - *args, **kwargs) + return tm.assert_series_equal(left[~left_na], right[~right_na], *args, **kwargs) def assert_frame_equal(self, left, right, *args, **kwargs): # TODO(EA): select_dtypes tm.assert_index_equal( - left.columns, right.columns, - exact=kwargs.get('check_column_type', 'equiv'), - check_names=kwargs.get('check_names', True), - check_exact=kwargs.get('check_exact', False), - check_categorical=kwargs.get('check_categorical', True), - obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + ) - decimals = (left.dtypes == 'decimal').index + decimals = (left.dtypes == "decimal").index for col in decimals: - self.assert_series_equal(left[col], right[col], - *args, **kwargs) + self.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=decimals) right = right.drop(columns=decimals) @@ -122,7 +120,6 @@ class TestInterface(BaseDecimal, base.BaseInterfaceTests): class TestConstructors(BaseDecimal, base.BaseConstructorsTests): - @pytest.mark.skip(reason="not implemented constructor from dtype") def test_from_dtype(self, data): # construct from our dtype & string dtype @@ -134,14 +131,10 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - def test_take_na_value_other_decimal(self): - arr = DecimalArray([decimal.Decimal('1.0'), - decimal.Decimal('2.0')]) - result = arr.take([0, -1], allow_fill=True, - fill_value=decimal.Decimal('-1.0')) - expected = DecimalArray([decimal.Decimal('1.0'), - decimal.Decimal('-1.0')]) + arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) + result = arr.take([0, -1], allow_fill=True, fill_value=decimal.Decimal("-1.0")) + expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")]) self.assert_extension_array_equal(result, expected) @@ -150,10 +143,9 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): class Reduce: - def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ['median', 'skew', 'kurt']: + if skipna or op_name in ["median", "skew", "kurt"]: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) @@ -172,7 +164,7 @@ class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests): class TestMethods(BaseDecimal, base.BaseMethodsTests): - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] @@ -192,9 +184,9 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): class TestGroupby(BaseDecimal, base.BaseGroupbyTests): - @pytest.mark.xfail( - reason="needs to correctly define __eq__ to handle nans, xref #27081.") + reason="needs to correctly define __eq__ to handle nans, xref #27081." 
+ ) def test_groupby_apply_identity(self, data_for_grouping): super().test_groupby_apply_identity(data_for_grouping) @@ -204,7 +196,6 @@ class TestSetitem(BaseDecimal, base.BaseSetitemTests): class TestPrinting(BaseDecimal, base.BasePrintingTests): - def test_series_repr(self, data): # Overriding this base test to explicitly test that # the custom _formatter is used @@ -214,36 +205,40 @@ def test_series_repr(self, data): # TODO(extension) -@pytest.mark.xfail(reason=( - "raising AssertionError as this is not implemented, " - "though easy enough to do")) +@pytest.mark.xfail( + reason=( + "raising AssertionError as this is not implemented, " "though easy enough to do" + ) +) def test_series_constructor_coerce_data_to_extension_dtype_raises(): - xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " - "extension array directly.") + xpr = ( + "Cannot cast data to extension dtype 'decimal'. Pass the " + "extension array directly." + ) with pytest.raises(ValueError, match=xpr): pd.Series([0, 1, 2], dtype=DecimalDtype()) def test_series_constructor_with_dtype(): - arr = DecimalArray([decimal.Decimal('10.0')]) + arr = DecimalArray([decimal.Decimal("10.0")]) result = pd.Series(arr, dtype=DecimalDtype()) expected = pd.Series(arr) tm.assert_series_equal(result, expected) - result = pd.Series(arr, dtype='int64') + result = pd.Series(arr, dtype="int64") expected = pd.Series([10]) tm.assert_series_equal(result, expected) def test_dataframe_constructor_with_dtype(): - arr = DecimalArray([decimal.Decimal('10.0')]) + arr = DecimalArray([decimal.Decimal("10.0")]) result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) expected = pd.DataFrame({"A": arr}) tm.assert_frame_equal(result, expected) - arr = DecimalArray([decimal.Decimal('10.0')]) - result = pd.DataFrame({"A": arr}, dtype='int64') + arr = DecimalArray([decimal.Decimal("10.0")]) + result = pd.DataFrame({"A": arr}, dtype="int64") expected = pd.DataFrame({"A": [10]}) tm.assert_frame_equal(result, expected) @@ -254,7 +249,7 @@ def test_astype_dispatches(frame): # gets all the way through to ExtensionArray.astype # Designing a reliable smoke test that works for arbitrary data types # is difficult. 
- data = pd.Series(DecimalArray([decimal.Decimal(2)]), name='a') + data = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a") ctx = decimal.Context() ctx.prec = 5 @@ -264,13 +259,12 @@ def test_astype_dispatches(frame): result = data.astype(DecimalDtype(ctx)) if frame: - result = result['a'] + result = result["a"] assert result.dtype.context.prec == ctx.prec class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests): - def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -305,7 +299,6 @@ def test_error(self): class TestComparisonOps(BaseDecimal, base.BaseComparisonOpsTests): - def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -323,13 +316,13 @@ def test_compare_array(self, data, all_compare_operators): alter = np.random.choice([-1, 0, 1], len(data)) # Randomly double, halve or keep same value - other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) - for i in alter] + other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) for i in alter] self._compare_other(s, data, op_name, other) class DecimalArrayWithoutFromSequence(DecimalArray): """Helper class for testing error handling in _from_sequence.""" + def _from_sequence(cls, scalars, dtype=None, copy=False): raise KeyError("For the test") @@ -345,37 +338,38 @@ def _create_arithmetic_method(cls, op): def test_combine_from_sequence_raises(): # https://github.com/pandas-dev/pandas/issues/22850 - ser = pd.Series(DecimalArrayWithoutFromSequence([ - decimal.Decimal("1.0"), - decimal.Decimal("2.0") - ])) + ser = pd.Series( + DecimalArrayWithoutFromSequence( + [decimal.Decimal("1.0"), decimal.Decimal("2.0")] + ) + ) result = ser.combine(ser, operator.add) # note: object dtype - expected = pd.Series([decimal.Decimal("2.0"), - decimal.Decimal("4.0")], dtype="object") + expected = pd.Series( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("class_", [DecimalArrayWithoutFromSequence, - DecimalArrayWithoutCoercion]) +@pytest.mark.parametrize( + "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion] +) def test_scalar_ops_from_sequence_raises(class_): # op(EA, EA) should return an EA, or an ndarray if it's not possible # to return an EA with the return values. 
- arr = class_([ - decimal.Decimal("1.0"), - decimal.Decimal("2.0") - ]) + arr = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) result = arr + arr - expected = np.array([decimal.Decimal("2.0"), decimal.Decimal("4.0")], - dtype="object") + expected = np.array( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("reverse, expected_div, expected_mod", [ - (False, [0, 1, 1, 2], [1, 0, 1, 0]), - (True, [2, 1, 0, 0], [0, 0, 2, 2]), -]) +@pytest.mark.parametrize( + "reverse, expected_div, expected_mod", + [(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])], +) def test_divmod_array(reverse, expected_div, expected_mod): # https://github.com/pandas-dev/pandas/issues/22930 arr = to_decimal([1, 2, 3, 4]) @@ -403,10 +397,9 @@ class DecimalArray2(DecimalArray): def _formatting_values(self): return np.array(self) - ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')])) + ser = pd.Series(DecimalArray2([decimal.Decimal("1.0")])) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): repr(ser) diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py index f2679d087c8410..e205c7ee509745 100644 --- a/pandas/tests/extension/json/__init__.py +++ b/pandas/tests/extension/json/__init__.py @@ -1,3 +1,3 @@ from .array import JSONArray, JSONDtype, make_data -__all__ = ['JSONArray', 'JSONDtype', 'make_data'] +__all__ = ["JSONArray", "JSONDtype", "make_data"] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 1b5009830303bc..ece1924b1b2281 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -26,7 +26,7 @@ class JSONDtype(ExtensionDtype): type = abc.Mapping - name = 'json' + name = "json" na_value = UserDict() @classmethod @@ -44,8 +44,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) class JSONArray(ExtensionArray): @@ -55,8 +54,7 @@ class JSONArray(ExtensionArray): def __init__(self, values, dtype=None, copy=False): for val in values: if not isinstance(val, self.dtype.type): - raise TypeError("All values must be of type " + - str(self.dtype.type)) + raise TypeError("All values must be of type " + str(self.dtype.type)) self.data = values # Some aliases for common attribute names to ensure pandas supports @@ -77,7 +75,7 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == 'bool': + elif isinstance(item, np.ndarray) and item.dtype == "bool": return self._from_sequence([x for x, m in zip(self, item) if m]) elif isinstance(item, abc.Iterable): # fancy indexing @@ -94,7 +92,7 @@ def __setitem__(self, key, value): # broadcast value value = itertools.cycle([value]) - if isinstance(key, np.ndarray) and key.dtype == 'bool': + if isinstance(key, np.ndarray) and key.dtype == "bool": # masking for i, (k, v) in enumerate(zip(key, value)): if k: @@ -113,16 +111,17 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self.dtype.na_value for x in self.data], - dtype=bool) + return np.array([x == self.dtype.na_value for x in 
self.data], dtype=bool) def take(self, indexer, allow_fill=False, fill_value=None): # re-implement here, since NumPy has trouble setting # sized objects like UserDicts into scalar slots of # an ndarary. indexer = np.asarray(indexer) - msg = ("Index is out of bounds or cannot do a " - "non-empty take from an empty array.") + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) if allow_fill: if fill_value is None: @@ -131,8 +130,9 @@ def take(self, indexer, allow_fill=False, fill_value=None): if (indexer < -1).any(): raise ValueError try: - output = [self.data[loc] if loc != -1 else fill_value - for loc in indexer] + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] except IndexError: raise IndexError(msg) else: @@ -161,9 +161,9 @@ def astype(self, dtype, copy=True): def unique(self): # Parent method doesn't work since np.array will try to infer # a 2-dim object. - return type(self)([ - dict(x) for x in list({tuple(d.items()) for d in self.data}) - ]) + return type(self)( + [dict(x) for x in list({tuple(d.items()) for d in self.data})] + ) @classmethod def _concat_same_type(cls, to_concat): @@ -187,6 +187,12 @@ def _values_for_argsort(self): def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer - return [UserDict([ - (random.choice(string.ascii_letters), random.randint(0, 100)) - for _ in range(random.randint(0, 10))]) for _ in range(100)] + return [ + UserDict( + [ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10)) + ] + ) + for _ in range(100) + ] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 89d30b0a3cc06d..bc75ec6aeb2df0 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -37,17 +37,17 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {'a': 10}]) + return JSONArray([{}, {"a": 10}]) @pytest.fixture def data_for_sorting(): - return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]) + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{'b': 1}, {}, {'a': 4}]) + return JSONArray([{"b": 1}, {}, {"a": 4}]) @pytest.fixture @@ -62,13 +62,18 @@ def na_cmp(): @pytest.fixture def data_for_grouping(): - return JSONArray([ - {'b': 1}, {'b': 1}, - {}, {}, - {'a': 0, 'c': 2}, {'a': 0, 'c': 2}, - {'b': 1}, - {'c': 2}, - ]) + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) class BaseJSON: @@ -77,28 +82,33 @@ class BaseJSON: # Series.values, which raises. We work around it by # converting the UserDicts to dicts. 
def assert_series_equal(self, left, right, **kwargs): - if left.dtype.name == 'json': + if left.dtype.name == "json": assert left.dtype == right.dtype - left = pd.Series(JSONArray(left.values.astype(object)), - index=left.index, name=left.name) - right = pd.Series(JSONArray(right.values.astype(object)), - index=right.index, name=right.name) + left = pd.Series( + JSONArray(left.values.astype(object)), index=left.index, name=left.name + ) + right = pd.Series( + JSONArray(right.values.astype(object)), + index=right.index, + name=right.name, + ) tm.assert_series_equal(left, right, **kwargs) def assert_frame_equal(self, left, right, *args, **kwargs): tm.assert_index_equal( - left.columns, right.columns, - exact=kwargs.get('check_column_type', 'equiv'), - check_names=kwargs.get('check_names', True), - check_exact=kwargs.get('check_exact', False), - check_categorical=kwargs.get('check_categorical', True), - obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + ) - jsons = (left.dtypes == 'json').index + jsons = (left.dtypes == "json").index for col in jsons: - self.assert_series_equal(left[col], right[col], - *args, **kwargs) + self.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=jsons) right = right.drop(columns=jsons) @@ -113,9 +123,13 @@ class TestInterface(BaseJSON, base.BaseInterfaceTests): def test_custom_asserts(self): # This would always trigger the KeyError from trying to put # an array of equal-length UserDicts inside an ndarray. 
- data = JSONArray([collections.UserDict({'a': 1}), - collections.UserDict({'b': 2}), - collections.UserDict({'c': 3})]) + data = JSONArray( + [ + collections.UserDict({"a": 1}), + collections.UserDict({"b": 2}), + collections.UserDict({"c": 3}), + ] + ) a = pd.Series(data) self.assert_series_equal(a, a) self.assert_frame_equal(a.to_frame(), a.to_frame()) @@ -129,7 +143,6 @@ def test_custom_asserts(self): class TestConstructors(BaseJSON, base.BaseConstructorsTests): - @pytest.mark.skip(reason="not implemented constructor from dtype") def test_from_dtype(self, data): # construct from our dtype & string dtype @@ -137,7 +150,6 @@ def test_from_dtype(self, data): class TestReshaping(BaseJSON, base.BaseReshapingTests): - @pytest.mark.skip(reason="Different definitions of NA") def test_stack(self): """ @@ -168,8 +180,9 @@ def test_fillna_frame(self): unhashable = pytest.mark.skip(reason="Unhashable") -unstable = pytest.mark.skipif(not PY36, # 3.6 or higher - reason="Dictionary order unstable") +unstable = pytest.mark.skipif( + not PY36, reason="Dictionary order unstable" # 3.6 or higher +) class TestReduce(base.BaseNoReduceTests): @@ -195,12 +208,12 @@ def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) @unstable - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): super().test_sort_values(data_for_sorting, ascending) @unstable - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super().test_sort_values_missing(data_missing_for_sorting, ascending) @@ -246,7 +259,6 @@ def test_astype_str(self): class TestGroupby(BaseJSON, base.BaseGroupbyTests): - @unhashable def test_groupby_extension_transform(self): """ @@ -269,7 +281,7 @@ def test_groupby_extension_apply(self): """ @unstable - @pytest.mark.parametrize('as_index', [True, False]) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 046dcc1c74a03d..f7456d24ad6d37 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -54,19 +54,17 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return Categorical([np.nan, 'A']) + return Categorical([np.nan, "A"]) @pytest.fixture def data_for_sorting(): - return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], - ordered=True) + return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True) @pytest.fixture def data_missing_for_sorting(): - return Categorical(['A', None, 'B'], categories=['B', 'A'], - ordered=True) + return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True) @pytest.fixture @@ -76,7 +74,7 @@ def na_value(): @pytest.fixture def data_for_grouping(): - return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c']) + return Categorical(["a", "a", None, None, "b", "b", "a", "c"]) class TestDtype(base.BaseDtypeTests): @@ -95,7 +93,6 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_ravel(self, data): # GH#27199 Categorical.ravel returns self until after deprecation cycle with 
tm.assert_produces_warning(FutureWarning): @@ -155,7 +152,6 @@ class TestSetitem(base.BaseSetitemTests): class TestMissing(base.BaseMissingTests): - @pytest.mark.skip(reason="Not implemented") def test_fillna_limit_pad(self, data_missing): super().test_fillna_limit_pad(data_missing) @@ -181,8 +177,9 @@ def test_combine_add(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 + x2) - expected = pd.Series(([a + b for (a, b) in - zip(list(orig_data1), list(orig_data2))])) + expected = pd.Series( + ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] @@ -204,14 +201,13 @@ class TestCasting(base.BaseCastingTests): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators - if op_name != '__rmod__': + if op_name != "__rmod__": super().test_arith_series_with_scalar(data, op_name) else: - pytest.skip('rmod never called when string is first argument') + pytest.skip("rmod never called when string is first argument") def test_add_series_with_extension_array(self, data): ser = pd.Series(data) @@ -228,15 +224,14 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) - if op_name == '__eq__': + if op_name == "__eq__": result = op(s, other) expected = s.combine(other, lambda x, y: x == y) assert (result == expected).all() - elif op_name == '__ne__': + elif op_name == "__ne__": result = op(s, other) expected = s.combine(other, lambda x, y: x != y) assert (result == expected).all() diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 14db04e1bcd61f..9b5f9d64f6b671 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -14,7 +14,6 @@ class DummyDtype(dtypes.ExtensionDtype): class DummyArray(ExtensionArray): - def __init__(self, data): self.data = data @@ -36,21 +35,20 @@ def astype(self, dtype, copy=True): class TestExtensionArrayDtype: - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(np.array([1, 2])), - ]) + @pytest.mark.parametrize( + "values", + [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ], + ) def test_is_extension_array_dtype(self, values): assert is_extension_array_dtype(values) - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) + @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))]) def test_is_not_extension_array_dtype(self, values): assert not is_extension_array_dtype(values) @@ -63,7 +61,7 @@ def test_astype(): result = arr.astype(object) tm.assert_numpy_array_equal(result, expected) - result = arr.astype('object') + result = arr.astype("object") tm.assert_numpy_array_equal(result, expected) @@ -77,10 +75,7 @@ def test_astype_no_copy(): assert arr is not result -@pytest.mark.parametrize('dtype', [ - dtypes.CategoricalDtype(), - dtypes.IntervalDtype(), -]) +@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()]) def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert 
is_extension_array_dtype(dtype) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index baee04c3b79eb0..9a7a43cff0c27b 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -15,34 +15,30 @@ def dtype(request): @pytest.fixture def data(dtype): - data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), - dtype=dtype) + data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype) return data @pytest.fixture def data_missing(dtype): return DatetimeArray( - np.array(['NaT', '2000-01-01'], dtype='datetime64[ns]'), - dtype=dtype + np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype ) @pytest.fixture def data_for_sorting(dtype): - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - c = pd.Timestamp('2000-01-03') - return DatetimeArray(np.array([b, c, a], dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + return DatetimeArray(np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype) @pytest.fixture def data_missing_for_sorting(dtype): - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - return DatetimeArray(np.array([b, 'NaT', a], dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + return DatetimeArray(np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype) @pytest.fixture @@ -52,19 +48,20 @@ def data_for_grouping(dtype): Where A < B < C and NA is missing """ - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - c = pd.Timestamp('2000-01-03') - na = 'NaT' - return DatetimeArray(np.array([b, b, na, na, a, a, b, c], - dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + na = "NaT" + return DatetimeArray( + np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture def na_cmp(): def cmp(a, b): return a is pd.NaT and a is b + return cmp @@ -103,7 +100,6 @@ def test_combine_add(self, data_repeated): class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): - def test_array_interface(self, data): if data.tz: # np.asarray(DTA) is currently always tz-naive. @@ -113,34 +109,30 @@ def test_array_interface(self, data): class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): - implements = {'__sub__', '__rsub__'} + implements = {"__sub__", "__rsub__"} def test_arith_series_with_scalar(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. 
- super().test_arith_series_with_scalar(data, - all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_add_series_with_extension_array(self, data): # Datetime + Datetime not implemented s = pd.Series(data) - msg = 'cannot add DatetimeArray and DatetimeArray' + msg = "cannot add DatetimeArray and DatetimeArray" with pytest.raises(TypeError, match=msg): s + data def test_arith_series_with_array(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar(data, - all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_error(self, data, all_arithmetic_operators): pass @@ -157,8 +149,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # tests/arithmetic/test_datetime64::TestTimestampSeriesArithmetic:: # test_dt64_seris_add_intlike return super( - TestArithmeticOps, - self + TestArithmeticOps, self ).test_direct_arith_with_series_returns_not_implemented(data) @@ -167,7 +158,6 @@ class TestCasting(BaseDatetimeTests, base.BaseCastingTests): class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): # the base test is not appropriate for us. We raise on comparison # with (some) integers, depending on the value. @@ -176,8 +166,7 @@ def _compare_other(self, s, data, op_name, other): @pytest.mark.xfail(reason="different implementation", strict=False) def test_direct_arith_with_series_returns_not_implemented(self, data): return super( - TestComparisonOps, - self + TestComparisonOps, self ).test_direct_arith_with_series_returns_not_implemented(data) @@ -186,7 +175,6 @@ class TestMissing(BaseDatetimeTests, base.BaseMissingTests): class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): - @pytest.mark.skip(reason="We have DatetimeTZBlock") def test_concat(self, data, in_frame): pass @@ -200,29 +188,31 @@ def test_concat_mixed_dtypes(self, data): @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, obj): # GH-13287: can't use base test, since building the expected fails. 
- data = DatetimeArray._from_sequence(['2000', '2001', '2002', '2003'], - tz='US/Central') - index = pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), - names=['a', 'b']) + data = DatetimeArray._from_sequence( + ["2000", "2001", "2002", "2003"], tz="US/Central" + ) + index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) if obj == "series": ser = pd.Series(data, index=index) - expected = pd.DataFrame({ - "A": data.take([0, 1]), - "B": data.take([2, 3]) - }, index=pd.Index(['a', 'b'], name='b')) - expected.columns.name = 'a' + expected = pd.DataFrame( + {"A": data.take([0, 1]), "B": data.take([2, 3])}, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.name = "a" else: ser = pd.DataFrame({"A": data, "B": data}, index=index) expected = pd.DataFrame( - {("A", "A"): data.take([0, 1]), - ("A", "B"): data.take([2, 3]), - ("B", "A"): data.take([0, 1]), - ("B", "B"): data.take([2, 3])}, - index=pd.Index(['a', 'b'], name='b') + { + ("A", "A"): data.take([0, 1]), + ("A", "B"): data.take([2, 3]), + ("B", "A"): data.take([0, 1]), + ("B", "B"): data.take([2, 3]), + }, + index=pd.Index(["a", "b"], name="b"), ) - expected.columns.names = [None, 'a'] + expected.columns.names = [None, "a"] result = ser.unstack(0) self.assert_equal(result, expected) diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 7f68babdb8aa56..1a4f84e2c0fd2f 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -20,39 +20,40 @@ def concat_same_type(self, to_concat, placement=None): """ values = np.concatenate([blk.values for blk in to_concat]) return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1) + ) @pytest.fixture def df(): - df1 = pd.DataFrame({'a': [1, 2, 3]}) + df1 = pd.DataFrame({"a": [1, 2, 3]}) blocks = df1._data.blocks - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") custom_block = CustomBlock(values, placement=slice(1, 2)) blocks = blocks + (custom_block,) - block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df1.index]) + block_manager = BlockManager(blocks, [pd.Index(["a", "b"]), df1.index]) return pd.DataFrame(block_manager) def test_custom_repr(): - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") # series block = CustomBlock(values, placement=slice(0, 3)) s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3))) - assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64' + assert repr(s) == "0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64" # dataframe block = CustomBlock(values, placement=slice(0, 1)) - blk_mgr = BlockManager([block], [['col'], range(3)]) + blk_mgr = BlockManager([block], [["col"], range(3)]) df = pd.DataFrame(blk_mgr) - assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' + assert repr(df) == " col\n0 Val: 0\n1 Val: 1\n2 Val: 2" def test_concat_series(): # GH17728 - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") block = CustomBlock(values, placement=slice(0, 3)) s = pd.Series(block, pd.RangeIndex(3), fastpath=True) @@ -68,6 +69,6 @@ def test_concat_dataframe(df): def test_concat_axis1(df): # GH17954 - df2 = pd.DataFrame({'c': [.1, .2, .3]}) + df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]}) res = pd.concat([df, df2], axis=1) assert isinstance(res._data.blocks[1], CustomBlock) diff --git a/pandas/tests/extension/test_integer.py 
b/pandas/tests/extension/test_integer.py index 22bb086a919cab..d051345fdd12d2 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -21,18 +21,34 @@ import pandas as pd from pandas.core.arrays import integer_array from pandas.core.arrays.integer import ( - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) from pandas.tests.extension import base def make_data(): - return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) - + [np.nan] + [99, 100]) - - -@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) + return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) def dtype(request): return request.param() @@ -83,7 +99,6 @@ def data_for_grouping(dtype): class TestDtype(base.BaseDtypeTests): - @pytest.mark.skip(reason="using multiple dtypes") def test_is_dtype_unboxes_dtype(self): # we have multiple dtypes, so skip @@ -91,20 +106,21 @@ def test_is_dtype_unboxes_dtype(self): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def check_opname(self, s, op_name, other, exc=None): # overwriting to indicate ops don't raise an error super().check_opname(s, op_name, other, exc=None) def _check_op(self, s, op, other, op_name, exc=NotImplementedError): if exc is None: - if s.dtype.is_unsigned_integer and (op_name == '__rsub__'): + if s.dtype.is_unsigned_integer and (op_name == "__rsub__"): # TODO see https://github.com/pandas-dev/pandas/issues/22023 pytest.skip("unsigned subtraction gives negative values") - if (hasattr(other, 'dtype') - and not is_extension_array_dtype(other.dtype) - and pd.api.types.is_integer_dtype(other.dtype)): + if ( + hasattr(other, "dtype") + and not is_extension_array_dtype(other.dtype) + and pd.api.types.is_integer_dtype(other.dtype) + ): # other is np.int64 and would therefore always result in # upcasting, so keeping other as same numpy_dtype other = other.astype(s.dtype.numpy_dtype) @@ -112,12 +128,12 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): result = op(s, other) expected = s.combine(other, op) - if op_name in ('__rtruediv__', '__truediv__', '__div__'): + if op_name in ("__rtruediv__", "__truediv__", "__div__"): expected = expected.astype(float) - if op_name == '__rtruediv__': + if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) - elif op_name.startswith('__r'): + elif op_name.startswith("__r"): # TODO reverse operators result in object dtype # see https://github.com/pandas-dev/pandas/issues/22024 expected = expected.astype(s.dtype) @@ -126,7 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass - if (op_name == '__rpow__') and isinstance(other, pd.Series): + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 result = result.fillna(1) @@ -146,7 +162,6 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): - def check_opname(self, s, 
op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -183,8 +198,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: @@ -193,8 +207,7 @@ def test_value_counts(self, all_data, dropna): other = all_data result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts( - dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() expected.index = expected.index.astype(all_data.dtype) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index f1f90b298ffe2e..1aab71286b4a65 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -103,8 +103,7 @@ class TestReduce(base.BaseNoReduceTests): class TestMethods(BaseInterval, base.BaseMethodsTests): - - @pytest.mark.skip(reason='addition is not defined for intervals') + @pytest.mark.skip(reason="addition is not defined for intervals") def test_combine_add(self, data_repeated): pass @@ -155,8 +154,8 @@ def test_array_repr(self, data, size): class TestParsing(BaseInterval, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 74ca296d232958..221cf0787d8397 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -10,7 +10,7 @@ from . import base -@pytest.fixture(params=['float', 'object']) +@pytest.fixture(params=["float", "object"]) def dtype(request): return PandasDtype(np.dtype(request.param)) @@ -33,13 +33,13 @@ def allow_in_pandas(monkeypatch): check. """ with monkeypatch.context() as m: - m.setattr(PandasArray, '_typ', 'extension') + m.setattr(PandasArray, "_typ", "extension") yield @pytest.fixture def data(allow_in_pandas, dtype): - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": return pd.Series([(i,) for i in range(100)]).array return PandasArray(np.arange(1, 101, dtype=dtype._dtype)) @@ -48,7 +48,7 @@ def data(allow_in_pandas, dtype): def data_missing(allow_in_pandas, dtype): # For NumPy <1.16, np.array([np.nan, (1,)]) raises # ValueError: setting an array element with a sequence. - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": if _np_version_under1p16: raise pytest.skip("Skipping for NumPy <1.16") return PandasArray(np.array([np.nan, (1,)])) @@ -64,6 +64,7 @@ def na_value(): def na_cmp(): def cmp(a, b): return np.isnan(a) and np.isnan(b) + return cmp @@ -74,15 +75,11 @@ def data_for_sorting(allow_in_pandas, dtype): This should be three items [B, C, A] with A < B < C """ - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": # Use an empty tuple for first element, then remove, # to disable np.array's shape inference. 
- return PandasArray( - np.array([(), (2,), (3,), (1,)])[1:] - ) - return PandasArray( - np.array([1, 2, 0]) - ) + return PandasArray(np.array([(), (2,), (3,), (1,)])[1:]) + return PandasArray(np.array([1, 2, 0])) @pytest.fixture @@ -92,13 +89,9 @@ def data_missing_for_sorting(allow_in_pandas, dtype): This should be three items [B, NA, A] with A < B and NA missing. """ - if dtype.numpy_dtype == 'object': - return PandasArray( - np.array([(1,), np.nan, (0,)]) - ) - return PandasArray( - np.array([1, np.nan, 0]) - ) + if dtype.numpy_dtype == "object": + return PandasArray(np.array([(1,), np.nan, (0,)])) + return PandasArray(np.array([1, np.nan, 0])) @pytest.fixture @@ -109,13 +102,11 @@ def data_for_grouping(allow_in_pandas, dtype): Where A < B < C and NA is missing """ - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": a, b, c = (1,), (2,), (3,) else: a, b, c = np.arange(3) - return PandasArray(np.array( - [b, b, np.nan, np.nan, a, a, b, c] - )) + return PandasArray(np.array([b, b, np.nan, np.nan, a, a, b, c])) @pytest.fixture @@ -129,11 +120,11 @@ def skip_numpy_object(dtype): This fixture allows these tests to be skipped when used as a usefixtures marker to either an individual test or a test class. """ - if dtype == 'object': + if dtype == "object": raise pytest.skip("Skipping for object dtype.") -skip_nested = pytest.mark.usefixtures('skip_numpy_object') +skip_nested = pytest.mark.usefixtures("skip_numpy_object") class BaseNumPyTests: @@ -141,7 +132,6 @@ class BaseNumPyTests: class TestCasting(BaseNumPyTests, base.BaseCastingTests): - @skip_nested def test_astype_str(self, data): # ValueError: setting an array element with a sequence @@ -161,7 +151,6 @@ def test_array_from_scalars(self, data): class TestDtype(BaseNumPyTests, base.BaseDtypeTests): - @pytest.mark.skip(reason="Incorrect expected.") # we unsurprisingly clash with a NumPy name. 
def test_check_dtype(self, data): @@ -169,7 +158,6 @@ def test_check_dtype(self, data): class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): - @skip_nested def test_getitem_scalar(self, data): # AssertionError @@ -187,11 +175,9 @@ def test_loc_iloc_frame_single_dtype(self, data): class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): @skip_nested - def test_groupby_extension_apply( - self, data_for_grouping, groupby_apply_op): + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): # ValueError: Names should be list-like for a MultiIndex - super().test_groupby_extension_apply(data_for_grouping, - groupby_apply_op) + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): @@ -202,7 +188,6 @@ def test_array_interface(self, data): class TestMethods(BaseNumPyTests, base.BaseMethodsTests): - @pytest.mark.skip(reason="TODO: remove?") def test_value_counts(self, all_data, dropna): pass @@ -224,8 +209,8 @@ def test_shift_fill_value(self, data): super().test_shift_fill_value(data) @skip_nested - @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): # Fails creating expected super().test_unique(data, box, method) @@ -290,7 +275,6 @@ class TestPrinting(BaseNumPyTests, base.BasePrintingTests): @skip_nested class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests): - def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) # avoid coercing int -> float. Just cast to the actual numpy type. @@ -304,7 +288,6 @@ class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests): class TestMissing(BaseNumPyTests, base.BaseMissingTests): - @skip_nested def test_fillna_scalar(self, data_missing): # Non-scalar "scalar" values. @@ -313,8 +296,7 @@ def test_fillna_scalar(self, data_missing): @skip_nested def test_fillna_series_method(self, data_missing, fillna_method): # Non-scalar "scalar" values. - super().test_fillna_series_method( - data_missing, fillna_method) + super().test_fillna_series_method(data_missing, fillna_method) @skip_nested def test_fillna_series(self, data_missing): @@ -328,7 +310,6 @@ def test_fillna_frame(self, data_missing): class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): - @pytest.mark.skip("Incorrect parent test") # not actually a mixed concat, since we concat int and int. 
def test_concat_mixed_dtypes(self, data): @@ -351,7 +332,6 @@ def test_merge_on_extension_array_duplicates(self, data): class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): - @skip_nested def test_setitem_scalar_series(self, data, box_in_series): # AssertionError @@ -395,7 +375,7 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): super().test_setitem_iloc_scalar_multiple_homogoneous(data) @skip_nested - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_broadcast(self, data, setter): # ValueError: cannot set using a list-like indexer with a different # length than the value diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b988dcb211dd05..8a500e1be766e6 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -12,7 +12,7 @@ @pytest.fixture def dtype(): - return PeriodDtype(freq='D') + return PeriodDtype(freq="D") @pytest.fixture @@ -71,7 +71,6 @@ class TestGetitem(BasePeriodTests, base.BaseGetitemTests): class TestMethods(BasePeriodTests, base.BaseMethodsTests): - def test_combine_add(self, data_repeated): # Period + Period is not defined. pass @@ -83,28 +82,24 @@ class TestInterface(BasePeriodTests, base.BaseInterfaceTests): class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests): - implements = {'__sub__', '__rsub__'} + implements = {"__sub__", "__rsub__"} def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # we implement substitution... if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar( - data, all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_arith_series_with_array(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar( - data, all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def _check_divmod_op(self, s, op, other, exc=NotImplementedError): super()._check_divmod_op(s, op, other, exc=TypeError) @@ -112,8 +107,10 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): def test_add_series_with_extension_array(self, data): # we don't implement + for Period s = pd.Series(data) - msg = (r"unsupported operand type\(s\) for \+: " - r"\'PeriodArray\' and \'PeriodArray\'") + msg = ( + r"unsupported operand type\(s\) for \+: " + r"\'PeriodArray\' and \'PeriodArray\'" + ) with pytest.raises(TypeError, match=msg): s + data @@ -132,7 +129,6 @@ class TestCasting(BasePeriodTests, base.BaseCastingTests): class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): # the base test is not appropriate for us. We raise on comparison # with (some) integers, depending on the value. 
@@ -160,8 +156,8 @@ class TestPrinting(BasePeriodTests, base.BasePrintingTests): class TestParsing(BasePeriodTests, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 8ce53270b7ba87..84d59902d2aa70 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -29,8 +29,7 @@ def dtype(): @pytest.fixture(params=[0, np.nan]) def data(request): """Length-100 PeriodArray for semantics test.""" - res = SparseArray(make_data(request.param), - fill_value=request.param) + res = SparseArray(make_data(request.param), fill_value=request.param) return res @@ -48,10 +47,11 @@ def data_missing(request): @pytest.fixture(params=[0, np.nan]) def data_repeated(request): """Return different versions of data for count times""" + def gen(count): for _ in range(count): - yield SparseArray(make_data(request.param), - fill_value=request.param) + yield SparseArray(make_data(request.param), fill_value=request.param) + yield gen @@ -77,8 +77,7 @@ def na_cmp(): @pytest.fixture(params=[0, np.nan]) def data_for_grouping(request): - return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], - fill_value=request.param) + return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], fill_value=request.param) class BaseSparseTests: @@ -92,7 +91,6 @@ def test_ravel(self, data): class TestDtype(BaseSparseTests, base.BaseDtypeTests): - def test_array_type_with_arg(self, data, dtype): assert dtype.construct_array_type() is SparseArray @@ -111,19 +109,19 @@ class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): class TestReshaping(BaseSparseTests, base.BaseReshapingTests): - def test_concat_mixed_dtypes(self, data): # https://github.com/pandas-dev/pandas/issues/20762 # This should be the same, aside from concat([sparse, float]) - df1 = pd.DataFrame({'A': data[:3]}) + df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"A": [1, 2, 3]}) - df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category') + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") dfs = [df1, df2, df3] # dataframes result = pd.concat(dfs) - expected = pd.concat([x.apply(lambda s: np.asarray(s).astype(object)) - for x in dfs]) + expected = pd.concat( + [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs] + ) self.assert_frame_equal(result, expected) def test_concat_columns(self, data, na_value): @@ -148,7 +146,6 @@ def test_merge(self, data, na_value): class TestGetitem(BaseSparseTests, base.BaseGetitemTests): - def test_get(self, data): s = pd.Series(data, index=[2 * i for i in range(len(data))]) if np.isnan(s.values.fill_value): @@ -164,11 +161,10 @@ def test_reindex(self, data, na_value): # Skipping TestSetitem, since we don't implement it. 
-class TestMissing(BaseSparseTests, base.BaseMissingTests): +class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): - expected_dtype = SparseDtype(bool, - pd.isna(data_missing.dtype.fill_value)) + expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) expected = SparseArray([True, False], dtype=expected_dtype) result = pd.isna(data_missing) @@ -204,27 +200,24 @@ def test_fillna_frame(self, data_missing): # Have to override to specify that fill_value will change. fill_value = data_missing[1] - result = pd.DataFrame({ - "A": data_missing, - "B": [1, 2] - }).fillna(fill_value) + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) if pd.isna(data_missing.fill_value): dtype = SparseDtype(data_missing.dtype, fill_value) else: dtype = data_missing.dtype - expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value], - dtype=dtype), - "B": [1, 2], - }) + expected = pd.DataFrame( + { + "A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype), + "B": [1, 2], + } + ) self.assert_frame_equal(result, expected) class TestMethods(BaseSparseTests, base.BaseMethodsTests): - def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] @@ -233,17 +226,19 @@ def test_combine_le(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) - expected = pd.Series(pd.SparseArray([ - a <= b for (a, b) in - zip(list(orig_data1), list(orig_data2)) - ], fill_value=False)) + expected = pd.Series( + pd.SparseArray( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + fill_value=False, + ) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) - expected = pd.Series(pd.SparseArray([ - a <= val for a in list(orig_data1) - ], fill_value=False)) + expected = pd.Series( + pd.SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + ) self.assert_series_equal(result, expected) def test_fillna_copy_frame(self, data_missing): @@ -280,20 +275,20 @@ def test_where_series(self, data, na_value): cond = np.array([True, True, False, False]) result = ser.where(cond) - new_dtype = SparseDtype('float', 0.0) - expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], - dtype=new_dtype)) + new_dtype = SparseDtype("float", 0.0) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) + ) self.assert_series_equal(result, expected) other = cls._from_sequence([a, b, a, b], dtype=data.dtype) cond = np.array([True, False, True, True]) result = ser.where(cond, other) - expected = pd.Series(cls._from_sequence([a, b, b, b], - dtype=data.dtype)) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) self.assert_series_equal(result, expected) def test_combine_first(self, data): - if data.dtype.subtype == 'int': + if data.dtype.subtype == "int": # Right now this is upcasted to float, just like combine_first # for Series[int] pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") @@ -334,7 +329,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) @@ -344,12 +338,14 @@ def _compare_other(self, s, data, op_name, other): # is in general. 
# Rely on tests in `tests/sparse` to validate that. assert isinstance(result.dtype, SparseDtype) - assert result.dtype.subtype == np.dtype('bool') + assert result.dtype.subtype == np.dtype("bool") - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expected = pd.Series( - pd.SparseArray(op(np.asarray(data), np.asarray(other)), - fill_value=result.values.fill_value) + pd.SparseArray( + op(np.asarray(data), np.asarray(other)), + fill_value=result.values.fill_value, + ) ) tm.assert_series_equal(result, expected) @@ -361,14 +357,14 @@ def _compare_other(self, s, data, op_name, other): class TestPrinting(BaseSparseTests, base.BasePrintingTests): - @pytest.mark.xfail(reason='Different repr', strict=True) + @pytest.mark.xfail(reason="Different repr", strict=True) def test_array_repr(self, data, size): super().test_array_repr(data, size) class TestParsing(BaseSparseTests, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 4b71405e20d328..281028b971d1e0 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -9,17 +9,16 @@ _tsd = tm.getTimeSeriesData() _frame = pd.DataFrame(_seriesd) -_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_frame2 = pd.DataFrame(_seriesd, columns=["D", "C", "B", "A"]) _intframe = pd.DataFrame({k: v.astype(int) for k, v in _seriesd.items()}) _tsframe = pd.DataFrame(_tsd) _mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' +_mixed_frame["foo"] = "bar" class TestData: - @cache_readonly def frame(self): return _frame.copy() @@ -31,8 +30,7 @@ def frame2(self): @cache_readonly def intframe(self): # force these all to int64 to avoid platform testing issues - return pd.DataFrame({c: s for c, s in _intframe.items()}, - dtype=np.int64) + return pd.DataFrame({c: s for c, s in _intframe.items()}, dtype=np.int64) @cache_readonly def tsframe(self): @@ -44,39 +42,59 @@ def mixed_frame(self): @cache_readonly def mixed_float(self): - return pd.DataFrame({'A': _frame['A'].copy().astype('float32'), - 'B': _frame['B'].copy().astype('float32'), - 'C': _frame['C'].copy().astype('float16'), - 'D': _frame['D'].copy().astype('float64')}) + return pd.DataFrame( + { + "A": _frame["A"].copy().astype("float32"), + "B": _frame["B"].copy().astype("float32"), + "C": _frame["C"].copy().astype("float16"), + "D": _frame["D"].copy().astype("float64"), + } + ) @cache_readonly def mixed_float2(self): - return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'), - 'B': _frame2['B'].copy().astype('float32'), - 'C': _frame2['C'].copy().astype('float16'), - 'D': _frame2['D'].copy().astype('float64')}) + return pd.DataFrame( + { + "A": _frame2["A"].copy().astype("float32"), + "B": _frame2["B"].copy().astype("float32"), + "C": _frame2["C"].copy().astype("float16"), + "D": _frame2["D"].copy().astype("float64"), + } + ) @cache_readonly def mixed_int(self): - return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'), - 'B': np.ones(len(_intframe['B']), dtype='uint64'), - 'C': _intframe['C'].copy().astype('uint8'), - 'D': _intframe['D'].copy().astype('int64')}) + return pd.DataFrame( + { + "A": 
_intframe["A"].copy().astype("int32"), + "B": np.ones(len(_intframe["B"]), dtype="uint64"), + "C": _intframe["C"].copy().astype("uint8"), + "D": _intframe["D"].copy().astype("int64"), + } + ) @cache_readonly def all_mixed(self): - return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'float32': np.array([1.] * 10, dtype='float32'), - 'int32': np.array([1] * 10, dtype='int32')}, - index=np.arange(10)) + return pd.DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) @cache_readonly def tzframe(self): - result = pd.DataFrame({'A': pd.date_range('20130101', periods=3), - 'B': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'C': pd.date_range('20130101', periods=3, - tz='CET')}) + result = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=3), + "B": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "C": pd.date_range("20130101", periods=3, tz="CET"), + } + ) result.iloc[1, 1] = pd.NaT result.iloc[1, 2] = pd.NaT return result @@ -95,12 +113,10 @@ def ts2(self): @cache_readonly def simple(self): - arr = np.array([[1., 2., 3.], - [4., 5., 6.], - [7., 8., 9.]]) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + return pd.DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) - return pd.DataFrame(arr, columns=['one', 'two', 'three'], - index=['a', 'b', 'c']) # self.ts3 = tm.makeTimeSeries()[-5:] # self.ts4 = tm.makeTimeSeries()[1:-1] @@ -108,32 +124,32 @@ def simple(self): def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 - dtypes = dict(A='float32', B='float32', C='float16', D='float64') + dtypes = dict(A="float32", B="float32", C="float16", D="float64") if isinstance(dtype, str): dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) - if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) - if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) - if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) - if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + if dtypes.get("A"): + assert df.dtypes["A"] == dtypes["A"] + if dtypes.get("B"): + assert df.dtypes["B"] == dtypes["B"] + if dtypes.get("C"): + assert df.dtypes["C"] == dtypes["C"] + if dtypes.get("D"): + assert df.dtypes["D"] == dtypes["D"] def _check_mixed_int(df, dtype=None): - dtypes = dict(A='int32', B='uint64', C='uint8', D='int64') + dtypes = dict(A="int32", B="uint64", C="uint8", D="int64") if isinstance(dtype, str): dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): dtypes.update(dtype) - if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) - if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) - if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) - if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + if dtypes.get("A"): + assert df.dtypes["A"] == dtypes["A"] + if dtypes.get("B"): + assert df.dtypes["B"] == dtypes["B"] + if dtypes.get("C"): + assert df.dtypes["C"] == dtypes["C"] + if dtypes.get("D"): + assert df.dtypes["D"] == dtypes["D"] diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 3232c400bd8ce5..915d6edcd83676 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -159,7 +159,7 @@ def float_string_frame(): [30 rows x 5 columns] """ df = DataFrame(tm.getSeriesData()) - df['foo'] = 'bar' + df["foo"] = "bar" return df 
@@ -190,10 +190,10 @@ def mixed_float_frame(): [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) - df.A = df.A.astype('float32') - df.B = df.B.astype('float32') - df.C = df.C.astype('float16') - df.D = df.D.astype('float64') + df.A = df.A.astype("float32") + df.B = df.B.astype("float32") + df.C = df.C.astype("float16") + df.D = df.D.astype("float64") return df @@ -224,10 +224,10 @@ def mixed_int_frame(): [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) - df.A = df.A.astype('int32') - df.B = np.ones(len(df.B), dtype='uint64') - df.C = df.C.astype('uint8') - df.D = df.C.astype('int64') + df.A = df.A.astype("int32") + df.B = np.ones(len(df.B), dtype="uint64") + df.C = df.C.astype("uint8") + df.D = df.C.astype("int64") return df @@ -237,10 +237,16 @@ def mixed_type_frame(): Fixture for DataFrame of float/int/string columns with RangeIndex Columns are ['a', 'b', 'c', 'float32', 'int32']. """ - return DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'float32': np.array([1.] * 10, dtype='float32'), - 'int32': np.array([1] * 10, dtype='int32')}, - index=np.arange(10)) + return DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) @pytest.fixture @@ -255,11 +261,13 @@ def timezone_frame(): 1 2013-01-02 NaT NaT 2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00 """ - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': date_range('20130101', periods=3, - tz='US/Eastern'), - 'C': date_range('20130101', periods=3, - tz='CET')}) + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) df.iloc[1, 1] = NaT df.iloc[1, 2] = NaT return df @@ -272,8 +280,9 @@ def uint64_frame(): Columns are ['A', 'B'] """ - return DataFrame({'A': np.arange(3), 'B': [2**63, 2**63 + 5, 2**63 + 10]}, - dtype=np.uint64) + return DataFrame( + {"A": np.arange(3), "B": [2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10]}, dtype=np.uint64 + ) @pytest.fixture @@ -288,12 +297,9 @@ def simple_frame(): b 4.0 5.0 6.0 c 7.0 8.0 9.0 """ - arr = np.array([[1., 2., 3.], - [4., 5., 6.], - [7., 8., 9.]]) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) - return DataFrame(arr, columns=['one', 'two', 'three'], - index=['a', 'b', 'c']) + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) @pytest.fixture @@ -311,10 +317,14 @@ def frame_of_index_cols(): 3 bar one d 0.234246 1.085675 0.718445 4 bar two e 0.533841 -0.005702 -3.533912 """ - df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'three', 'one', 'two'], - 'C': ['a', 'b', 'c', 'd', 'e'], - 'D': np.random.randn(5), - 'E': np.random.randn(5), - ('tuple', 'as', 'label'): np.random.randn(5)}) + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.randn(5), + "E": np.random.randn(5), + ("tuple", "as", "label"): np.random.randn(5), + } + ) return df diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e7b4c2c65b842d..229713a5af11a5 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -5,24 +5,37 @@ import pytest from pandas.core.dtypes.common import ( - is_categorical_dtype, is_interval_dtype, is_object_dtype) + 
is_categorical_dtype, + is_interval_dtype, + is_object_dtype, +) from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, IntervalIndex, MultiIndex, - RangeIndex, Series, Timestamp, cut, date_range, to_datetime) + Categorical, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + Timestamp, + cut, + date_range, + to_datetime, +) import pandas.util.testing as tm @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestDataFrameAlterAxes: - def test_set_index_directly(self, float_string_frame): df = float_string_frame idx = Index(np.arange(len(df))[::-1]) df.index = idx tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): df.index = idx[::2] def test_set_index(self, float_string_frame): @@ -31,23 +44,22 @@ def test_set_index(self, float_string_frame): df = df.set_index(idx) tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): df.set_index(idx[::2]) def test_set_index_cast(self): # issue casting an index then set_index - df = DataFrame({'A': [1.1, 2.2, 3.3], 'B': [5.0, 6.1, 7.2]}, - index=[2010, 2011, 2012]) + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2]}, index=[2010, 2011, 2012] + ) df2 = df.set_index(df.index.astype(np.int32)) tm.assert_frame_equal(df, df2) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('inplace', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_drop_inplace(self, frame_of_index_cols, - drop, inplace, keys): + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys): df = frame_of_index_cols if isinstance(keys, list): @@ -66,15 +78,15 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) def test_set_index_append(self, frame_of_index_cols, drop, keys): df = frame_of_index_cols keys = keys if isinstance(keys, list) else [keys] - idx = MultiIndex.from_arrays([df.index] + [df[x] for x in keys], - names=[None] + keys) + idx = MultiIndex.from_arrays( + [df.index] + [df[x] for x in keys], names=[None] + keys + ) expected = df.drop(keys, axis=1) if drop else df.copy() expected.index = idx @@ -83,17 +95,14 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_append_to_multiindex(self, frame_of_index_cols, - drop, keys): + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): # append to existing 
multiindex - df = frame_of_index_cols.set_index(['D'], drop=drop, append=True) + df = frame_of_index_cols.set_index(["D"], drop=drop, append=True) keys = keys if isinstance(keys, list) else [keys] - expected = frame_of_index_cols.set_index(['D'] + keys, - drop=drop, append=True) + expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True) result = df.set_index(keys, drop=drop, append=True) @@ -101,29 +110,38 @@ def test_set_index_append_to_multiindex(self, frame_of_index_cols, def test_set_index_after_mutation(self): # GH1590 - df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']}) - expected = DataFrame({'val': [1, 2]}, - Index(['b', 'c'], name='key')) + df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]}) + expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key")) df2 = df.loc[df.index.map(lambda indx: indx >= 1)] - result = df2.set_index('key') + result = df2.set_index("key") tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) - @pytest.mark.parametrize('box', [Series, Index, np.array, - list, lambda x: [list(x)], - lambda x: MultiIndex.from_arrays([x])]) - @pytest.mark.parametrize('append, index_name', [(True, None), - (True, 'B'), (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_single_array(self, frame_of_index_cols, - drop, append, index_name, box): + @pytest.mark.parametrize( + "box", + [ + Series, + Index, + np.array, + list, + lambda x: [list(x)], + lambda x: MultiIndex.from_arrays([x]), + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_single_array( + self, frame_of_index_cols, drop, append, index_name, box + ): df = frame_of_index_cols df.index.name = index_name - key = box(df['B']) + key = box(df["B"]) if box == list: # list of strings gets interpreted as list of keys msg = "['one', 'two', 'three', 'one', 'two']" @@ -131,41 +149,44 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, df.set_index(key, drop=drop, append=append) else: # np.array/list-of-list "forget" the name of B - name_mi = getattr(key, 'names', None) - name = [getattr(key, 'name', None)] if name_mi is None else name_mi + name_mi = getattr(key, "names", None) + name = [getattr(key, "name", None)] if name_mi is None else name_mi result = df.set_index(key, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, nothing is dropped - expected = df.set_index(['B'], drop=False, append=append) + expected = df.set_index(["B"], drop=False, append=append) expected.index.names = [index_name] + name if append else name tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, list, - lambda x: MultiIndex.from_arrays([x])]) - @pytest.mark.parametrize('append, index_name', - [(True, None), (True, 'A'), (True, 'B'), - (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_arrays(self, frame_of_index_cols, - drop, append, index_name, box): + @pytest.mark.parametrize( + "box", [Series, Index, 
np.array, list, lambda x: MultiIndex.from_arrays([x])] + ) + @pytest.mark.parametrize( + "append, index_name", + [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)], + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays( + self, frame_of_index_cols, drop, append, index_name, box + ): df = frame_of_index_cols df.index.name = index_name - keys = ['A', box(df['B'])] + keys = ["A", box(df["B"])] # np.array/list "forget" the name of B - names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] + names = ["A", None if box in [np.array, list, tuple, iter] else "B"] result = df.set_index(keys, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, only A is dropped, if at all - expected = df.set_index(['A', 'B'], drop=False, append=append) - expected = expected.drop('A', axis=1) if drop else expected + expected = df.set_index(["A", "B"], drop=False, append=append) + expected = expected.drop("A", axis=1) if drop else expected expected.index.names = [index_name] + names if append else names tm.assert_frame_equal(result, expected) @@ -173,31 +194,52 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, iter, - lambda x: MultiIndex.from_arrays([x]), - lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, iter, - lambda x: MultiIndex.from_arrays([x]), - lambda x: x.name]) - @pytest.mark.parametrize('append, index_name', [(True, None), - (True, 'A'), (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, - append, index_name, box1, box2): + @pytest.mark.parametrize( + "box2", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "box1", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays_duplicate( + self, frame_of_index_cols, drop, append, index_name, box1, box2 + ): df = frame_of_index_cols df.index.name = index_name - keys = [box1(df['A']), box2(df['A'])] + keys = [box1(df["A"]), box2(df["A"])] result = df.set_index(keys, drop=drop, append=append) # if either box is iter, it has been consumed; re-read - keys = [box1(df['A']), box2(df['A'])] + keys = [box1(df["A"]), box2(df["A"])] # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers - first_drop = False if ( - keys[0] is 'A' and keys[1] is 'A') else drop # noqa: F632 + first_drop = ( + False if (keys[0] is "A" and keys[1] is "A") else drop + ) # noqa: F632 # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise @@ -206,85 +248,83 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, expected = expected.set_index([keys[1]], drop=drop, append=True) 
tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_multiindex(self, frame_of_index_cols, - drop, append): + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - keys = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) + keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"]) result = df.set_index(keys, drop=drop, append=append) # setting with a MultiIndex will never drop columns - expected = df.set_index(['A', 'B'], drop=False, append=append) + expected = df.set_index(["A", "B"], drop=False, append=append) tm.assert_frame_equal(result, expected) def test_set_index_verify_integrity(self, frame_of_index_cols): df = frame_of_index_cols - with pytest.raises(ValueError, match='Index has duplicate keys'): - df.set_index('A', verify_integrity=True) + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index("A", verify_integrity=True) # with MultiIndex - with pytest.raises(ValueError, match='Index has duplicate keys'): - df.set_index([df['A'], df['A']], verify_integrity=True) + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index([df["A"], df["A"]], verify_integrity=True) - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df = frame_of_index_cols with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): # column names are A-E, as well as one tuple - df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) + df.set_index(["foo", "bar", "baz"], drop=drop, append=append) # non-existent key in list with arrays - with pytest.raises(KeyError, match='X'): - df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) + with pytest.raises(KeyError, match="X"): + df.set_index([df["A"], df["B"], "X"], drop=drop, append=append) msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" # tuples always raise KeyError with pytest.raises(KeyError, match=msg): - df.set_index(tuple(df['A']), drop=drop, append=append) + df.set_index(tuple(df["A"]), drop=drop, append=append) # also within a list with pytest.raises(KeyError, match=msg): - df.set_index(['A', df['A'], tuple(df['A'])], - drop=drop, append=append) - - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - @pytest.mark.parametrize('box', [set], ids=['set']) - def test_set_index_raise_on_type(self, frame_of_index_cols, box, - drop, append): + df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + @pytest.mark.parametrize("box", [set], ids=["set"]) + def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' # forbidden type, e.g. set with pytest.raises(TypeError, match=msg): - df.set_index(box(df['A']), drop=drop, append=append) + df.set_index(box(df["A"]), drop=drop, append=append) # forbidden type in list, e.g. 
set with pytest.raises(TypeError, match=msg): - df.set_index(['A', df['A'], box(df['A'])], - drop=drop, append=append) + df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append) # MultiIndex constructor does not work directly on Series -> lambda - @pytest.mark.parametrize('box', [Series, Index, np.array, iter, - lambda x: MultiIndex.from_arrays([x])], - ids=['Series', 'Index', 'np.array', - 'iter', 'MultiIndex']) - @pytest.mark.parametrize('length', [4, 6], ids=['too_short', 'too_long']) - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, - drop, append): + @pytest.mark.parametrize( + "box", + [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])], + ids=["Series", "Index", "np.array", "iter", "MultiIndex"], + ) + @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"]) + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_raise_on_len( + self, frame_of_index_cols, box, length, drop, append + ): # GH 24984 df = frame_of_index_cols # has length 5 values = np.random.randint(0, 10, (length,)) - msg = 'Length mismatch: Expected 5 rows, received array of length.*' + msg = "Length mismatch: Expected 5 rows, received array of length.*" # wrong length directly with pytest.raises(ValueError, match=msg): @@ -292,7 +332,7 @@ def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, # wrong length in list with pytest.raises(ValueError, match=msg): - df.set_index(['A', df.A, box(values)], drop=drop, append=append) + df.set_index(["A", df.A, box(values)], drop=drop, append=append) def test_set_index_custom_label_type(self): # GH 24969 @@ -308,11 +348,10 @@ def __str__(self): # necessary for pretty KeyError __repr__ = __str__ - thing1 = Thing('One', 'red') - thing2 = Thing('Two', 'blue') + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) - expected = DataFrame({thing1: [0, 1]}, - index=Index([2, 3], name=thing2)) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) # use custom label directly result = df.set_index(thing2) @@ -323,7 +362,7 @@ def __str__(self): tm.assert_frame_equal(result, expected) # missing key - thing3 = Thing('Three', 'pink') + thing3 = Thing("Three", "pink") msg = "" with pytest.raises(KeyError, match=msg): # missing label directly @@ -345,13 +384,12 @@ class Thing(frozenset): def __repr__(self): tmp = sorted(list(self)) # double curly brace prints one brace in format string - return "frozenset({{{}}})".format(', '.join(map(repr, tmp))) + return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) - thing1 = Thing(['One', 'red']) - thing2 = Thing(['Two', 'blue']) + thing1 = Thing(["One", "red"]) + thing2 = Thing(["Two", "blue"]) df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) - expected = DataFrame({thing1: [0, 1]}, - index=Index([2, 3], name=thing2)) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) # use custom label directly result = df.set_index(thing2) @@ -362,7 +400,7 @@ def __repr__(self): tm.assert_frame_equal(result, expected) # missing key - thing3 = Thing(['Three', 'pink']) + thing3 = Thing(["Three", "pink"]) msg = r"frozenset\(\{'Three', 'pink'\}\)" with pytest.raises(KeyError, match=msg): # missing label directly @@ -384,8 +422,8 @@ def __init__(self, name, color): def __str__(self): return "" % (self.name,) - 
thing1 = Thing('One', 'red') - thing2 = Thing('Two', 'blue') + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) msg = 'The parameter "keys" may be a column key, .*' @@ -400,53 +438,59 @@ def __str__(self): def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) - ci.name = 'B' + ci.name = "B" # with Categorical - df = DataFrame({'A': np.random.randn(10), - 'B': ci.values}) - idf = df.set_index('B') + df = DataFrame({"A": np.random.randn(10), "B": ci.values}) + idf = df.set_index("B") tm.assert_index_equal(idf.index, ci) # from a CategoricalIndex - df = DataFrame({'A': np.random.randn(10), - 'B': ci}) - idf = df.set_index('B') + df = DataFrame({"A": np.random.randn(10), "B": ci}) + idf = df.set_index("B") tm.assert_index_equal(idf.index, ci) # round-trip - idf = idf.reset_index().set_index('B') + idf = idf.reset_index().set_index("B") tm.assert_index_equal(idf.index, ci) def test_set_index_cast_datetimeindex(self): - df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) - for i in range(1000)], - 'B': np.random.randn(1000)}) + df = DataFrame( + { + "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], + "B": np.random.randn(1000), + } + ) - idf = df.set_index('A') + idf = df.set_index("A") assert isinstance(idf.index, DatetimeIndex) def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 - idx = DatetimeIndex(to_datetime(['2013-1-1 13:00', - '2013-1-2 14:00']), - name='B').tz_localize('US/Pacific') - df = DataFrame(np.random.randn(2, 1), columns=['A']) - - expected = Series(np.array([Timestamp('2013-01-01 13:00:00-0800', - tz='US/Pacific'), - Timestamp('2013-01-02 14:00:00-0800', - tz='US/Pacific')], - dtype="object"), name='B') + idx = DatetimeIndex( + to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]), name="B" + ).tz_localize("US/Pacific") + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + + expected = Series( + np.array( + [ + Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ], + dtype="object", + ), + name="B", + ) # convert index to series result = Series(idx) tm.assert_series_equal(result, expected) # assign to frame - df['B'] = idx - result = df['B'] + df["B"] = idx + result = df["B"] tm.assert_series_equal(result, expected) # convert to series while keeping the timezone @@ -455,18 +499,19 @@ def test_convert_dti_to_series(self): # convert to utc with tm.assert_produces_warning(FutureWarning): - df['B'] = idx.to_series(keep_tz=False, index=[0, 1]) - result = df['B'] - comp = Series(DatetimeIndex(expected.values).tz_localize(None), - name='B') + df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) + result = df["B"] + comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = ("The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change to True in a future " - "release.") + msg = ( + "The default of the 'keep_tz' keyword in " + "DatetimeIndex.to_series will change to True in a future " + "release." 
+ ) assert msg in str(m[0].message) with tm.assert_produces_warning(FutureWarning): @@ -474,82 +519,83 @@ def test_convert_dti_to_series(self): tm.assert_series_equal(result, expected.dt.tz_convert(None)) # list of datetimes with a tz - df['B'] = idx.to_pydatetime() - result = df['B'] + df["B"] = idx.to_pydatetime() + result = df["B"] tm.assert_series_equal(result, expected) # GH 6785 # set the index manually import pytz - df = DataFrame( - [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}]) - expected = df.set_index('ts') - df.index = df['ts'] - df.pop('ts') + + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + expected = df.set_index("ts") + df.index = df["ts"] + df.pop("ts") tm.assert_frame_equal(df, expected) def test_reset_index_tz(self, tz_aware_fixture): # GH 3950 # reset_index with single level tz = tz_aware_fixture - idx = date_range('1/1/2011', periods=5, - freq='D', tz=tz, name='idx') - df = DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, - index=idx) - - expected = DataFrame({'idx': [datetime(2011, 1, 1), - datetime(2011, 1, 2), - datetime(2011, 1, 3), - datetime(2011, 1, 4), - datetime(2011, 1, 5)], - 'a': range(5), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx', 'a', 'b']) - expected['idx'] = expected['idx'].apply(lambda d: Timestamp(d, tz=tz)) + idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx") + df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx) + + expected = DataFrame( + { + "idx": [ + datetime(2011, 1, 1), + datetime(2011, 1, 2), + datetime(2011, 1, 3), + datetime(2011, 1, 4), + datetime(2011, 1, 5), + ], + "a": range(5), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx", "a", "b"], + ) + expected["idx"] = expected["idx"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) def test_set_index_timezone(self): # GH 12358 # tz-aware Series should retain the tz - idx = to_datetime(["2014-01-01 10:10:10"], - utc=True).tz_convert('Europe/Rome') - df = DataFrame({'A': idx}) + idx = to_datetime(["2014-01-01 10:10:10"], utc=True).tz_convert("Europe/Rome") + df = DataFrame({"A": idx}) assert df.set_index(idx).index[0].hour == 11 assert DatetimeIndex(Series(df.A))[0].hour == 11 assert df.set_index(df.A).index[0].hour == 11 def test_set_index_dst(self): - di = date_range('2006-10-29 00:00:00', periods=3, - freq='H', tz='US/Pacific') + di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") - df = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, - index=di).reset_index() + df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level - res = df.set_index('index') - exp = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, - index=Index(di, name='index')) + res = df.set_index("index") + exp = DataFrame( + data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index") + ) tm.assert_frame_equal(res, exp) # GH 12920 - res = df.set_index(['index', 'a']) - exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], - names=['index', 'a']) - exp = DataFrame({'b': [3, 4, 5]}, index=exp_index) + res = df.set_index(["index", "a"]) + exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"]) + exp = DataFrame({"b": [3, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp) def test_reset_index_with_intervals(self): - idx = IntervalIndex.from_breaks(np.arange(11), name='x') - original = DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']] + idx = IntervalIndex.from_breaks(np.arange(11), name="x") + 
original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]] - result = original.set_index('x') - expected = DataFrame({'y': np.arange(10)}, index=idx) + result = original.set_index("x") + expected = DataFrame({"y": np.arange(10)}, index=idx) tm.assert_frame_equal(result, expected) result2 = result.reset_index() tm.assert_frame_equal(result2, original) def test_set_index_multiindexcolumns(self): - columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) + columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) result = df.set_index(df.columns[0]) expected = df.iloc[:, 1:] @@ -559,30 +605,32 @@ def test_set_index_multiindexcolumns(self): def test_set_index_empty_column(self): # GH 1971 - df = DataFrame([ - {'a': 1, 'p': 0}, - {'a': 2, 'm': 10}, - {'a': 3, 'm': 11, 'p': 20}, - {'a': 4, 'm': 12, 'p': 21} - ], columns=('a', 'm', 'p', 'x')) - - result = df.set_index(['a', 'x']) - expected = df[['m', 'p']] - expected.index = MultiIndex.from_arrays([df['a'], df['x']], - names=['a', 'x']) + df = DataFrame( + [ + {"a": 1, "p": 0}, + {"a": 2, "m": 10}, + {"a": 3, "m": 11, "p": 20}, + {"a": 4, "m": 12, "p": 21}, + ], + columns=("a", "m", "p", "x"), + ) + + result = df.set_index(["a", "x"]) + expected = df[["m", "p"]] + expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"]) tm.assert_frame_equal(result, expected) def test_set_columns(self, float_string_frame): cols = Index(np.arange(len(float_string_frame.columns))) float_string_frame.columns = cols - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): float_string_frame.columns = cols[::2] def test_dti_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') - idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -591,13 +639,13 @@ def test_dti_set_index_reindex(self): # GH 11314 # with tz - index = date_range(datetime(2015, 10, 1), - datetime(2015, 10, 1, 23), - freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) - new_index = date_range(datetime(2015, 10, 2), - datetime(2015, 10, 2, 23), - freq='H', tz='US/Eastern') + index = date_range( + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + ) + df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index) + new_index = date_range( + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + ) result = df.set_index(new_index) assert result.index.freq == index.freq @@ -605,65 +653,58 @@ def test_dti_set_index_reindex(self): # Renaming def test_rename(self, float_frame): - mapping = { - 'A': 'a', - 'B': 'b', - 'C': 'c', - 'D': 'd' - } + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} renamed = float_frame.rename(columns=mapping) renamed2 = float_frame.rename(columns=str.lower) tm.assert_frame_equal(renamed, renamed2) - tm.assert_frame_equal(renamed2.rename(columns=str.upper), - float_frame, check_names=False) + tm.assert_frame_equal( + renamed2.rename(columns=str.upper), float_frame, check_names=False + ) # index - data = { - 'A': {'foo': 0, 'bar': 1} - } + data = {"A": {"foo": 0, "bar": 1}} # gets sorted alphabetical df = DataFrame(data) - 
renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal(renamed.index, Index(['foo', 'bar'])) + renamed = df.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["foo", "bar"])) renamed = df.rename(index=str.upper) - tm.assert_index_equal(renamed.index, Index(['BAR', 'FOO'])) + tm.assert_index_equal(renamed.index, Index(["BAR", "FOO"])) # have to pass something with pytest.raises(TypeError, match="must pass an index to rename"): float_frame.rename() # partial columns - renamed = float_frame.rename(columns={'C': 'foo', 'D': 'bar'}) - tm.assert_index_equal(renamed.columns, Index(['A', 'B', 'foo', 'bar'])) + renamed = float_frame.rename(columns={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"])) # other axis - renamed = float_frame.T.rename(index={'C': 'foo', 'D': 'bar'}) - tm.assert_index_equal(renamed.index, Index(['A', 'B', 'foo', 'bar'])) + renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"])) # index with name - index = Index(['foo', 'bar'], name='name') + index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) - renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal(renamed.index, - Index(['bar', 'foo'], name='name')) + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) assert renamed.index.name == renamer.index.name def test_rename_axis_inplace(self, float_frame): # GH 15704 - expected = float_frame.rename_axis('foo') + expected = float_frame.rename_axis("foo") result = float_frame.copy() - no_return = result.rename_axis('foo', inplace=True) + no_return = result.rename_axis("foo", inplace=True) assert no_return is None tm.assert_frame_equal(result, expected) - expected = float_frame.rename_axis('bar', axis=1) + expected = float_frame.rename_axis("bar", axis=1) result = float_frame.copy() - no_return = result.rename_axis('bar', axis=1, inplace=True) + no_return = result.rename_axis("bar", axis=1, inplace=True) assert no_return is None tm.assert_frame_equal(result, expected) @@ -681,65 +722,67 @@ def test_rename_axis_raises(self): df.rename_axis(id, axis=1) with pytest.raises(ValueError, match="Use `.rename`"): - df['A'].rename_axis(id) + df["A"].rename_axis(id) def test_rename_axis_mapper(self): # GH 19978 - mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['ll', 'nn']) - df = DataFrame({'x': [i for i in range(len(mi))], - 'y': [i * 10 for i in range(len(mi))]}, - index=mi) + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + df = DataFrame( + {"x": [i for i in range(len(mi))], "y": [i * 10 for i in range(len(mi))]}, + index=mi, + ) # Test for rename of the Index object of columns - result = df.rename_axis('cols', axis=1) - tm.assert_index_equal(result.columns, - Index(['x', 'y'], name='cols')) + result = df.rename_axis("cols", axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) # Test for rename of the Index object of columns using dict - result = result.rename_axis(columns={'cols': 'new'}, axis=1) - tm.assert_index_equal(result.columns, - Index(['x', 'y'], name='new')) + result = result.rename_axis(columns={"cols": "new"}, axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) # Test for renaming index using dict - result = df.rename_axis(index={'ll': 'foo'}) - assert 
result.index.names == ['foo', 'nn'] + result = df.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] # Test for renaming index using a function result = df.rename_axis(index=str.upper, axis=0) - assert result.index.names == ['LL', 'NN'] + assert result.index.names == ["LL", "NN"] # Test for renaming index providing complete list - result = df.rename_axis(index=['foo', 'goo']) - assert result.index.names == ['foo', 'goo'] + result = df.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] # Test for changing index and columns at same time - sdf = df.reset_index().set_index('nn').drop(columns=['ll', 'y']) - result = sdf.rename_axis(index='foo', columns='meh') - assert result.index.name == 'foo' - assert result.columns.name == 'meh' + sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) + result = sdf.rename_axis(index="foo", columns="meh") + assert result.index.name == "foo" + assert result.columns.name == "meh" # Test different error cases - with pytest.raises(TypeError, match='Must pass'): - df.rename_axis(index='wrong') + with pytest.raises(TypeError, match="Must pass"): + df.rename_axis(index="wrong") - with pytest.raises(ValueError, match='Length of names'): - df.rename_axis(index=['wrong']) + with pytest.raises(ValueError, match="Length of names"): + df.rename_axis(index=["wrong"]) - with pytest.raises(TypeError, match='bogus'): + with pytest.raises(TypeError, match="bogus"): df.rename_axis(bogus=None) - @pytest.mark.parametrize('kwargs, rename_index, rename_columns', [ - ({'mapper': None, 'axis': 0}, True, False), - ({'mapper': None, 'axis': 1}, False, True), - ({'index': None}, True, False), - ({'columns': None}, False, True), - ({'index': None, 'columns': None}, True, True), - ({}, False, False)]) + @pytest.mark.parametrize( + "kwargs, rename_index, rename_columns", + [ + ({"mapper": None, "axis": 0}, True, False), + ({"mapper": None, "axis": 1}, False, True), + ({"index": None}, True, False), + ({"columns": None}, False, True), + ({"index": None, "columns": None}, True, True), + ({}, False, False), + ], + ) def test_rename_axis_none(self, kwargs, rename_index, rename_columns): # GH 25034 - index = Index(list('abc'), name='foo') - columns = Index(['col1', 'col2'], name='bar') + index = Index(list("abc"), name="foo") + columns = Index(["col1", "col2"], name="bar") data = np.arange(6).reshape(3, 2) df = DataFrame(data, index, columns) @@ -751,24 +794,25 @@ def test_rename_axis_none(self, kwargs, rename_index, rename_columns): def test_rename_multiindex(self): - tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] - tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] - index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) - columns = MultiIndex.from_tuples( - tuples_columns, names=['fizz', 'buzz']) + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) # # without specifying level -> across all levels - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - new_index = MultiIndex.from_tuples([('foo3', 'bar1'), - ('foo2', 'bar3')], - names=['foo', 'bar']) - new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), - ('fizz2', 'buzz3')], - names=['fizz', 'buzz']) + renamed = df.rename( + 
index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] + ) + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) tm.assert_index_equal(renamed.index, new_index) tm.assert_index_equal(renamed.columns, new_columns) assert renamed.index.names == df.index.names @@ -778,212 +822,213 @@ def test_rename_multiindex(self): # with specifying a level (GH13766) # dict - new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), - ('fizz2', 'buzz2')], - names=['fizz', 'buzz']) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=0) + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='fizz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") tm.assert_index_equal(renamed.columns, new_columns) - new_columns = MultiIndex.from_tuples([('fizz1', 'buzz1'), - ('fizz2', 'buzz3')], - names=['fizz', 'buzz']) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=1) + new_columns = MultiIndex.from_tuples( + [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='buzz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # function func = str.upper - new_columns = MultiIndex.from_tuples([('FIZZ1', 'buzz1'), - ('FIZZ2', 'buzz2')], - names=['fizz', 'buzz']) + new_columns = MultiIndex.from_tuples( + [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"] + ) renamed = df.rename(columns=func, level=0) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns=func, level='fizz') + renamed = df.rename(columns=func, level="fizz") tm.assert_index_equal(renamed.columns, new_columns) - new_columns = MultiIndex.from_tuples([('fizz1', 'BUZZ1'), - ('fizz2', 'BUZZ2')], - names=['fizz', 'buzz']) + new_columns = MultiIndex.from_tuples( + [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"] + ) renamed = df.rename(columns=func, level=1) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns=func, level='buzz') + renamed = df.rename(columns=func, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # index - new_index = MultiIndex.from_tuples([('foo3', 'bar1'), - ('foo2', 'bar2')], - names=['foo', 'bar']) - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - level=0) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] + ) + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) def test_rename_nocopy(self, float_frame): - renamed = float_frame.rename(columns={'C': 'foo'}, copy=False) - renamed['foo'] = 1. 
- assert (float_frame['C'] == 1.).all() + renamed = float_frame.rename(columns={"C": "foo"}, copy=False) + renamed["foo"] = 1.0 + assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): - float_frame.rename(columns={'C': 'foo'}) - assert 'C' in float_frame - assert 'foo' not in float_frame + float_frame.rename(columns={"C": "foo"}) + assert "C" in float_frame + assert "foo" not in float_frame - c_id = id(float_frame['C']) + c_id = id(float_frame["C"]) float_frame = float_frame.copy() - float_frame.rename(columns={'C': 'foo'}, inplace=True) + float_frame.rename(columns={"C": "foo"}, inplace=True) - assert 'C' not in float_frame - assert 'foo' in float_frame - assert id(float_frame['foo']) != c_id + assert "C" not in float_frame + assert "foo" in float_frame + assert id(float_frame["foo"]) != c_id def test_rename_bug(self): # GH 5344 # rename set ref_locs, and set_index was not resetting - df = DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}) - df = df.rename(columns={0: 'a'}) - df = df.rename(columns={1: 'b'}) - df = df.set_index(['a', 'b']) - df.columns = ['2001-01-01'] - expected = DataFrame([[1], [2]], - index=MultiIndex.from_tuples( - [('foo', 'bah'), ('bar', 'bas')], - names=['a', 'b']), - columns=['2001-01-01']) + df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) + df = df.set_index(["a", "b"]) + df.columns = ["2001-01-01"] + expected = DataFrame( + [[1], [2]], + index=MultiIndex.from_tuples( + [("foo", "bah"), ("bar", "bas")], names=["a", "b"] + ), + columns=["2001-01-01"], + ) tm.assert_frame_equal(df, expected) def test_rename_bug2(self): # GH 19497 # rename was changing Index to MultiIndex if Index contained tuples - df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], - columns=["a"]) + df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"]) df = df.rename({(1, 1): (5, 4)}, axis="index") - expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], - columns=["a"]) + expected = DataFrame( + data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"] + ) tm.assert_frame_equal(df, expected) def test_rename_errors_raises(self): - df = DataFrame(columns=['A', 'B', 'C', 'D']) - with pytest.raises(KeyError, match='\'E\'] not found in axis'): - df.rename(columns={'A': 'a', 'E': 'e'}, errors='raise') - - @pytest.mark.parametrize('mapper, errors, expected_columns', [ - ({'A': 'a', 'E': 'e'}, 'ignore', ['a', 'B', 'C', 'D']), - ({'A': 'a'}, 'raise', ['a', 'B', 'C', 'D']), - (str.lower, 'raise', ['a', 'b', 'c', 'd'])]) + df = DataFrame(columns=["A", "B", "C", "D"]) + with pytest.raises(KeyError, match="'E'] not found in axis"): + df.rename(columns={"A": "a", "E": "e"}, errors="raise") + + @pytest.mark.parametrize( + "mapper, errors, expected_columns", + [ + ({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]), + ({"A": "a"}, "raise", ["a", "B", "C", "D"]), + (str.lower, "raise", ["a", "b", "c", "d"]), + ], + ) def test_rename_errors(self, mapper, errors, expected_columns): # GH 13473 # rename now works with errors parameter - df = DataFrame(columns=['A', 'B', 'C', 'D']) + df = DataFrame(columns=["A", "B", "C", "D"]) result = df.rename(columns=mapper, errors=errors) expected = DataFrame(columns=expected_columns) tm.assert_frame_equal(result, expected) def test_reorder_levels(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 
1, 0, 1]], - names=['L0', 'L1', 'L2']) - df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) # no change, position result = df.reorder_levels([0, 1, 2]) tm.assert_frame_equal(df, result) # no change, labels - result = df.reorder_levels(['L0', 'L1', 'L2']) + result = df.reorder_levels(["L0", "L1", "L2"]) tm.assert_frame_equal(df, result) # rotate, position result = df.reorder_levels([1, 2, 0]) - e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - codes=[[0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], - names=['L1', 'L2', 'L0']) - expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, - index=e_idx) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) tm.assert_frame_equal(result, expected) result = df.reorder_levels([0, 0, 0]) - e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], - codes=[[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]], - names=['L0', 'L0', 'L0']) - expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, - index=e_idx) + e_idx = MultiIndex( + levels=[["bar"], ["bar"], ["bar"]], + codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], + names=["L0", "L0", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) tm.assert_frame_equal(result, expected) - result = df.reorder_levels(['L0', 'L0', 'L0']) + result = df.reorder_levels(["L0", "L0", "L0"]) tm.assert_frame_equal(result, expected) def test_reset_index(self, float_frame): stacked = float_frame.stack()[::2] - stacked = DataFrame({'foo': stacked, 'bar': stacked}) + stacked = DataFrame({"foo": stacked, "bar": stacked}) - names = ['first', 'second'] + names = ["first", "second"] stacked.index.names = names deleveled = stacked.reset_index() - for i, (lev, level_codes) in enumerate(zip(stacked.index.levels, - stacked.index.codes)): + for i, (lev, level_codes) in enumerate( + zip(stacked.index.levels, stacked.index.codes) + ): values = lev.take(level_codes) name = names[i] tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() - tm.assert_series_equal(deleveled['first'], deleveled2['level_0'], - check_names=False) - tm.assert_series_equal(deleveled['second'], deleveled2['level_1'], - check_names=False) + tm.assert_series_equal( + deleveled["first"], deleveled2["level_0"], check_names=False + ) + tm.assert_series_equal( + deleveled["second"], deleveled2["level_1"], check_names=False + ) # default name assigned rdf = float_frame.reset_index() - exp = Series(float_frame.index.values, name='index') - tm.assert_series_equal(rdf['index'], exp) + exp = Series(float_frame.index.values, name="index") + tm.assert_series_equal(rdf["index"], exp) # default name assigned, corner case df = float_frame.copy() - df['index'] = 'foo' + df["index"] = "foo" rdf = df.reset_index() - exp = Series(float_frame.index.values, name='level_0') - tm.assert_series_equal(rdf['level_0'], exp) + exp = Series(float_frame.index.values, name="level_0") + tm.assert_series_equal(rdf["level_0"], exp) # but this is ok - 
float_frame.index.name = 'index' + float_frame.index.name = "index" deleveled = float_frame.reset_index() - tm.assert_series_equal(deleveled['index'], Series(float_frame.index)) - tm.assert_index_equal(deleveled.index, - Index(np.arange(len(deleveled)))) + tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) + tm.assert_index_equal(deleveled.index, Index(np.arange(len(deleveled)))) # preserve column names - float_frame.columns.name = 'columns' + float_frame.columns.name = "columns" resetted = float_frame.reset_index() - assert resetted.columns.name == 'columns' + assert resetted.columns.name == "columns" # only remove certain columns - df = float_frame.reset_index().set_index(['index', 'A', 'B']) - rs = df.reset_index(['A', 'B']) + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index(["A", "B"]) # TODO should reset_index check_names ? tm.assert_frame_equal(rs, float_frame, check_names=False) - rs = df.reset_index(['index', 'A', 'B']) + rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) - rs = df.reset_index(['index', 'A', 'B']) + rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) - rs = df.reset_index('A') - xp = float_frame.reset_index().set_index(['index', 'B']) + rs = df.reset_index("A") + xp = float_frame.reset_index().set_index(["index", "B"]) tm.assert_frame_equal(rs, xp, check_names=False) # test resetting in place @@ -992,179 +1037,198 @@ def test_reset_index(self, float_frame): df.reset_index(inplace=True) tm.assert_frame_equal(df, resetted, check_names=False) - df = float_frame.reset_index().set_index(['index', 'A', 'B']) - rs = df.reset_index('A', drop=True) + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index("A", drop=True) xp = float_frame.copy() - del xp['A'] - xp = xp.set_index(['B'], append=True) + del xp["A"] + xp = xp.set_index(["B"], append=True) tm.assert_frame_equal(rs, xp, check_names=False) def test_reset_index_name(self): - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'C', 'D'], - index=Index(range(2), name='x')) + df = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + index=Index(range(2), name="x"), + ) assert df.reset_index().index.name is None assert df.reset_index(drop=True).index.name is None df.reset_index(inplace=True) assert df.index.name is None def test_reset_index_level(self): - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'C', 'D']) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) - for levels in ['A', 'B'], [0, 1]: + for levels in ["A", "B"], [0, 1]: # With MultiIndex - result = df.set_index(['A', 'B']).reset_index(level=levels[0]) - tm.assert_frame_equal(result, df.set_index('B')) + result = df.set_index(["A", "B"]).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) - result = df.set_index(['A', 'B']).reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df.set_index('B')) + result = df.set_index(["A", "B"]).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) - result = df.set_index(['A', 'B']).reset_index(level=levels) + result = df.set_index(["A", "B"]).reset_index(level=levels) tm.assert_frame_equal(result, df) - result = df.set_index(['A', 'B']).reset_index(level=levels, - drop=True) - tm.assert_frame_equal(result, df[['C', 'D']]) + result = df.set_index(["A", 
"B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C", "D"]]) # With single-level Index (GH 16263) - result = df.set_index('A').reset_index(level=levels[0]) + result = df.set_index("A").reset_index(level=levels[0]) tm.assert_frame_equal(result, df) - result = df.set_index('A').reset_index(level=levels[:1]) + result = df.set_index("A").reset_index(level=levels[:1]) tm.assert_frame_equal(result, df) - result = df.set_index(['A']).reset_index(level=levels[0], - drop=True) - tm.assert_frame_equal(result, df[['B', 'C', 'D']]) + result = df.set_index(["A"]).reset_index(level=levels[0], drop=True) + tm.assert_frame_equal(result, df[["B", "C", "D"]]) # Missing levels - for both MultiIndex and single-level Index: - for idx_lev in ['A', 'B'], ['A']: - with pytest.raises(KeyError, match='Level E '): - df.set_index(idx_lev).reset_index(level=['A', 'E']) - with pytest.raises(IndexError, match='Too many levels'): + for idx_lev in ["A", "B"], ["A"]: + with pytest.raises(KeyError, match="Level E "): + df.set_index(idx_lev).reset_index(level=["A", "E"]) + with pytest.raises(IndexError, match="Too many levels"): df.set_index(idx_lev).reset_index(level=[0, 1, 2]) def test_reset_index_right_dtype(self): time = np.arange(0.0, 10, np.sqrt(2) / 2) - s1 = Series((9.81 * time ** 2) / 2, - index=Index(time, name='time'), - name='speed') + s1 = Series( + (9.81 * time ** 2) / 2, index=Index(time, name="time"), name="speed" + ) df = DataFrame(s1) resetted = s1.reset_index() - assert resetted['time'].dtype == np.float64 + assert resetted["time"].dtype == np.float64 resetted = df.reset_index() - assert resetted['time'].dtype == np.float64 + assert resetted["time"].dtype == np.float64 def test_reset_index_multiindex_col(self): vals = np.random.randn(3, 3).astype(object) - idx = ['x', 'y', 'z'] + idx = ["x", "y", "z"] full = np.hstack(([[x] for x in idx], vals)) - df = DataFrame(vals, Index(idx, name='a'), - columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) + df = DataFrame( + vals, + Index(idx, name="a"), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) rs = df.reset_index() - xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], - ['', 'mean', 'median', 'mean']]) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) rs = df.reset_index(col_fill=None) - xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) - rs = df.reset_index(col_level=1, col_fill='blah') - xp = DataFrame(full, columns=[['blah', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index(col_level=1, col_fill="blah") + xp = DataFrame( + full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) - df = DataFrame(vals, - MultiIndex.from_arrays([[0, 1, 2], ['x', 'y', 'z']], - names=['d', 'a']), - columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) - rs = df.reset_index('a', ) - xp = DataFrame(full, Index([0, 1, 2], name='d'), - columns=[['a', 'b', 'b', 'c'], - ['', 'mean', 'median', 'mean']]) + df = DataFrame( + vals, + MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index("a") + xp = DataFrame( + full, + Index([0, 1, 2], name="d"), + columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, 
xp) - rs = df.reset_index('a', col_fill=None) - xp = DataFrame(full, Index(range(3), name='d'), - columns=[['a', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index("a", col_fill=None) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, xp) - rs = df.reset_index('a', col_fill='blah', col_level=1) - xp = DataFrame(full, Index(range(3), name='d'), - columns=[['blah', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index("a", col_fill="blah", col_level=1) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, xp) def test_reset_index_multiindex_nan(self): # GH6322, testing reset_index on MultiIndexes # when we have a nan or all nan - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [0, 1, np.nan], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 1, np.nan], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': [np.nan, 'b', 'c'], - 'B': [0, 1, 2], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + {"A": [np.nan, "b", "c"], "B": [0, 1, 2], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [0, 1, 2], - 'C': [np.nan, 1.1, 2.2]}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]}) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [np.nan, np.nan, np.nan], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [np.nan, np.nan, np.nan], + "C": np.random.rand(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) def test_reset_index_with_datetimeindex_cols(self): # GH5818 # - df = DataFrame([[1, 2], [3, 4]], - columns=date_range('1/1/2013', '1/2/2013'), - index=['A', 'B']) + df = DataFrame( + [[1, 2], [3, 4]], + columns=date_range("1/1/2013", "1/2/2013"), + index=["A", "B"], + ) result = df.reset_index() - expected = DataFrame([['A', 1, 2], ['B', 3, 4]], - columns=['index', datetime(2013, 1, 1), - datetime(2013, 1, 2)]) + expected = DataFrame( + [["A", 1, 2], ["B", 3, 4]], + columns=["index", datetime(2013, 1, 1), datetime(2013, 1, 2)], + ) tm.assert_frame_equal(result, expected) def test_reset_index_range(self): # GH 12071 - df = DataFrame([[0, 0], [1, 1]], columns=['A', 'B'], - index=RangeIndex(stop=2)) + df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2)) result = df.reset_index() assert isinstance(result.index, RangeIndex) - expected = DataFrame([[0, 0, 0], [1, 1, 1]], - columns=['index', 'A', 'B'], - index=RangeIndex(stop=2)) + expected = DataFrame( + [[0, 0, 0], [1, 1, 1]], + columns=["index", "A", "B"], + index=RangeIndex(stop=2), + ) tm.assert_frame_equal(result, expected) def test_set_index_names(self): df = tm.makeDataFrame() - df.index.name = 'name' + df.index.name = "name" - assert df.set_index(df.index).index.names == ['name'] + assert df.set_index(df.index).index.names == ["name"] - mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) - mi2 = 
MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, - names=['A', 'B', 'C', 'D']) + mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"]) + mi2 = MultiIndex.from_arrays( + df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"] + ) - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) - assert df.set_index(df.index).index.names == ['A', 'B'] + assert df.set_index(df.index).index.names == ["A", "B"] # Check that set_index isn't converting a MultiIndex into an Index assert isinstance(df.set_index(df.index).index, MultiIndex) @@ -1172,7 +1236,7 @@ def test_set_index_names(self): # Check actual equality tm.assert_index_equal(df.set_index(df.index).index, mi) - idx2 = df.index.rename(['C', 'D']) + idx2 = df.index.rename(["C", "D"]) # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather # than a pair of tuples @@ -1184,55 +1248,56 @@ def test_set_index_names(self): def test_rename_objects(self, float_string_frame): renamed = float_string_frame.rename(columns=str.upper) - assert 'FOO' in renamed - assert 'foo' not in renamed + assert "FOO" in renamed + assert "foo" not in renamed def test_rename_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y']) - expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y']) + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) result = df.rename(str.lower, axis=1) tm.assert_frame_equal(result, expected) - result = df.rename(str.lower, axis='columns') + result = df.rename(str.lower, axis="columns") tm.assert_frame_equal(result, expected) - result = df.rename({"A": 'a', 'B': 'b'}, axis=1) + result = df.rename({"A": "a", "B": "b"}, axis=1) tm.assert_frame_equal(result, expected) - result = df.rename({"A": 'a', 'B': 'b'}, axis='columns') + result = df.rename({"A": "a", "B": "b"}, axis="columns") tm.assert_frame_equal(result, expected) # Index - expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y']) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) result = df.rename(str.lower, axis=0) tm.assert_frame_equal(result, expected) - result = df.rename(str.lower, axis='index') + result = df.rename(str.lower, axis="index") tm.assert_frame_equal(result, expected) - result = df.rename({'X': 'x', 'Y': 'y'}, axis=0) + result = df.rename({"X": "x", "Y": "y"}, axis=0) tm.assert_frame_equal(result, expected) - result = df.rename({'X': 'x', 'Y': 'y'}, axis='index') + result = df.rename({"X": "x", "Y": "y"}, axis="index") tm.assert_frame_equal(result, expected) - result = df.rename(mapper=str.lower, axis='index') + result = df.rename(mapper=str.lower, axis="index") tm.assert_frame_equal(result, expected) def test_rename_mapper_multi(self): - df = DataFrame({"A": ['a', 'b'], "B": ['c', 'd'], - 'C': [1, 2]}).set_index(["A", "B"]) + df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index( + ["A", "B"] + ) result = df.rename(str.upper) expected = df.rename(index=str.upper) tm.assert_frame_equal(result, expected) def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y']) + df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) result = df.rename(str.lower, columns=str.upper) - expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y']) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) 
tm.assert_frame_equal(result, expected) def test_rename_axis_style_raises(self): @@ -1240,8 +1305,7 @@ def test_rename_axis_style_raises(self): df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"]) # Named target and axis - over_spec_msg = ("Cannot specify both 'axis' and " - "any of 'index' or 'columns'") + over_spec_msg = "Cannot specify both 'axis' and " "any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): df.rename(index=str.lower, axis=1) @@ -1269,62 +1333,62 @@ def test_rename_axis_style_raises(self): def test_reindex_api_equivalence(self): # equivalence of the labels/axis and index/columns API's - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.reindex(['b', 'a']) - res2 = df.reindex(index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a']) - res4 = df.reindex(labels=['b', 'a'], axis=0) - res5 = df.reindex(['b', 'a'], axis=0) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(columns=['e', 'd']) - res2 = df.reindex(['e', 'd'], axis=1) - res3 = df.reindex(labels=['e', 'd'], axis=1) + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(index=['b', 'a'], columns=['e', 'd']) - res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], - axis=1) + res1 = df.reindex(index=["b", "a"], columns=["e", "d"]) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) def test_rename_positional(self): - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) with tm.assert_produces_warning(FutureWarning) as rec: result = df.rename(None, str.lower) - expected = DataFrame(columns=['a', 'b']) + expected = DataFrame(columns=["a", "b"]) tm.assert_frame_equal(result, expected) assert len(rec) == 1 message = str(rec[0].message) - assert 'rename' in message - assert 'Use named arguments' in message + assert "rename" in message + assert "Use named arguments" in message def test_assign_columns(self, float_frame): - float_frame['hi'] = 'there' + float_frame["hi"] = "there" df = float_frame.copy() - df.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] - tm.assert_series_equal(float_frame['C'], df['baz'], check_names=False) - tm.assert_series_equal(float_frame['hi'], df['foo2'], - check_names=False) + df.columns = ["foo", "bar", "baz", "quux", "foo2"] + tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) + tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) def test_set_index_preserve_categorical_dtype(self): # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) - for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]: + df = DataFrame( + { + "A": [1, 2, 1, 1, 
2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: result = df.set_index(cols).reset_index() result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) @@ -1340,63 +1404,78 @@ def test_ambiguous_warns(self): def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) - assert parameters == {"self", "mapper", "index", "columns", "axis", - "inplace", "copy", "level", "errors"} + assert parameters == { + "self", + "mapper", + "index", + "columns", + "axis", + "inplace", + "copy", + "level", + "errors", + } def test_reindex_signature(self): sig = inspect.signature(DataFrame.reindex) parameters = set(sig.parameters) - assert parameters == {"self", "labels", "index", "columns", "axis", - "limit", "copy", "level", "method", - "fill_value", "tolerance"} + assert parameters == { + "self", + "labels", + "index", + "columns", + "axis", + "limit", + "copy", + "level", + "method", + "fill_value", + "tolerance", + } def test_droplevel(self): # GH20342 - df = DataFrame([ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12] - ]) - df = df.set_index([0, 1]).rename_axis(['a', 'b']) - df.columns = MultiIndex.from_tuples([('c', 'e'), ('d', 'f')], - names=['level_1', 'level_2']) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + df = df.set_index([0, 1]).rename_axis(["a", "b"]) + df.columns = MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) # test that dropping of a level in index works - expected = df.reset_index('a', drop=True) - result = df.droplevel('a', axis='index') + expected = df.reset_index("a", drop=True) + result = df.droplevel("a", axis="index") tm.assert_frame_equal(result, expected) # test that dropping of a level in columns works expected = df.copy() - expected.columns = Index(['c', 'd'], name='level_1') - result = df.droplevel('level_2', axis='columns') + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestIntervalIndex: - def test_setitem(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = cut(df.A, 5) assert isinstance(s.cat.categories, IntervalIndex) # B & D end up as Categoricals # the remainer are converted to in-line objects # contining an IntervalIndex.values - df['B'] = s - df['C'] = np.array(s) - df['D'] = s.values - df['E'] = np.array(s.values) + df["B"] = s + df["C"] = np.array(s) + df["D"] = s.values + df["E"] = np.array(s.values) - assert is_categorical_dtype(df['B']) - assert is_interval_dtype(df['B'].cat.categories) - assert is_categorical_dtype(df['D']) - assert is_interval_dtype(df['D'].cat.categories) + assert is_categorical_dtype(df["B"]) + assert is_interval_dtype(df["B"].cat.categories) + assert is_categorical_dtype(df["D"]) + assert is_interval_dtype(df["D"].cat.categories) - assert is_object_dtype(df['C']) - assert is_object_dtype(df['E']) + assert is_object_dtype(df["C"]) + assert is_object_dtype(df["E"]) # they compare equal as Index # when converted to numpy objects @@ -1407,77 +1486,75 @@ def test_setitem(self): tm.assert_index_equal(c(df.B), c(df.D), check_names=False) # B & D are the same Series - tm.assert_series_equal(df['B'], df['B'], check_names=False) - 
tm.assert_series_equal(df['B'], df['D'], check_names=False) + tm.assert_series_equal(df["B"], df["B"], check_names=False) + tm.assert_series_equal(df["B"], df["D"], check_names=False) # C & E are the same Series - tm.assert_series_equal(df['C'], df['C'], check_names=False) - tm.assert_series_equal(df['C'], df['E'], check_names=False) + tm.assert_series_equal(df["C"], df["C"], check_names=False) + tm.assert_series_equal(df["C"], df["E"], check_names=False) def test_set_reset_index(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = cut(df.A, 5) - df['B'] = s - df = df.set_index('B') + df["B"] = s + df = df.set_index("B") df = df.reset_index() def test_set_axis_inplace(self): # GH14636 - df = DataFrame({'A': [1.1, 2.2, 3.3], - 'B': [5.0, 6.1, 7.2], - 'C': [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012]) - - expected = {0: df.copy(), - 1: df.copy()} - expected[0].index = list('abc') - expected[1].columns = list('abc') - expected['index'] = expected[0] - expected['columns'] = expected[1] + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + + expected = {0: df.copy(), 1: df.copy()} + expected[0].index = list("abc") + expected[1].columns = list("abc") + expected["index"] = expected[0] + expected["columns"] = expected[1] for axis in expected: # inplace=True # The FutureWarning comes from the fact that we would like to have # inplace default to False some day for inplace, warn in (None, FutureWarning), (True, None): - kwargs = {'inplace': inplace} + kwargs = {"inplace": inplace} result = df.copy() with tm.assert_produces_warning(warn): - result.set_axis(list('abc'), axis=axis, **kwargs) + result.set_axis(list("abc"), axis=axis, **kwargs) tm.assert_frame_equal(result, expected[axis]) # inplace=False - result = df.set_axis(list('abc'), axis=axis, inplace=False) + result = df.set_axis(list("abc"), axis=axis, inplace=False) tm.assert_frame_equal(expected[axis], result) # omitting the "axis" parameter with tm.assert_produces_warning(None): - result = df.set_axis(list('abc'), inplace=False) + result = df.set_axis(list("abc"), inplace=False) tm.assert_frame_equal(result, expected[0]) # wrong values for the "axis" parameter - for axis in 3, 'foo': - with pytest.raises(ValueError, match='No axis named'): - df.set_axis(list('abc'), axis=axis, inplace=False) + for axis in 3, "foo": + with pytest.raises(ValueError, match="No axis named"): + df.set_axis(list("abc"), axis=axis, inplace=False) def test_set_axis_prior_to_deprecation_signature(self): - df = DataFrame({'A': [1.1, 2.2, 3.3], - 'B': [5.0, 6.1, 7.2], - 'C': [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012]) - - expected = {0: df.copy(), - 1: df.copy()} - expected[0].index = list('abc') - expected[1].columns = list('abc') - expected['index'] = expected[0] - expected['columns'] = expected[1] + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + + expected = {0: df.copy(), 1: df.copy()} + expected[0].index = list("abc") + expected[1].columns = list("abc") + expected["index"] = expected[0] + expected["columns"] = expected[1] # old signature for axis in expected: with tm.assert_produces_warning(FutureWarning): - result = df.set_axis(axis, list('abc'), inplace=False) + result = df.set_axis(axis, list("abc"), inplace=False) tm.assert_frame_equal(result, expected[axis]) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9921d91d6de8c2..8c1534aa515e8c 100644 --- 
a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -10,16 +10,32 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, date_range, isna, - notna, to_datetime, to_timedelta) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, + to_datetime, + to_timedelta, +) import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops import pandas.util.testing as tm -def assert_stat_op_calc(opname, alternative, frame, has_skipna=True, - check_dtype=True, check_dates=False, - check_less_precise=False, skipna_alternative=None): +def assert_stat_op_calc( + opname, + alternative, + frame, + has_skipna=True, + check_dtype=True, + check_dates=False, + check_less_precise=False, + skipna_alternative=None, +): """ Check that operator opname works as advertised on frame @@ -49,43 +65,53 @@ def assert_stat_op_calc(opname, alternative, frame, has_skipna=True, f = getattr(frame, opname) if check_dates: - df = DataFrame({'b': date_range('1/1/2001', periods=2)}) + df = DataFrame({"b": date_range("1/1/2001", periods=2)}) result = getattr(df, opname)() assert isinstance(result, Series) - df['a'] = range(len(df)) + df["a"] = range(len(df)) result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) if has_skipna: + def wrapper(x): return alternative(x.values) - skipna_wrapper = tm._make_skipna_wrapper(alternative, - skipna_alternative) + skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) - tm.assert_series_equal(result0, frame.apply(wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result0, + frame.apply(wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) # HACK: win32 - tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result1, + frame.apply(wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise, + ) else: skipna_wrapper = alternative result0 = f(axis=0) result1 = f(axis=1) - tm.assert_series_equal(result0, frame.apply(skipna_wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) - - if opname in ['sum', 'prod']: + tm.assert_series_equal( + result0, + frame.apply(skipna_wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) + + if opname in ["sum", "prod"]: expected = frame.apply(skipna_wrapper, axis=1) - tm.assert_series_equal(result1, expected, check_dtype=False, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result1, expected, check_dtype=False, check_less_precise=check_less_precise + ) # check dtypes if check_dtype: @@ -94,7 +120,7 @@ def wrapper(x): assert lcd_dtype == result1.dtype # bad axis - with pytest.raises(ValueError, match='No axis named 2'): + with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case @@ -102,16 +128,15 @@ def wrapper(x): all_na = frame * np.NaN r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) - if opname in ['sum', 'prod']: - unit = 1 if opname == 'prod' else 0 # result for empty sum/prod + if opname in ["sum", "prod"]: + unit = 1 if opname == "prod" else 0 # result for empty sum/prod expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) tm.assert_series_equal(r0, expected) expected = pd.Series(unit, 
index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) -def assert_stat_op_api(opname, float_frame, float_string_frame, - has_numeric_only=False): +def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False): """ Check that API for operator opname works as advertised on frame @@ -158,6 +183,7 @@ def assert_bool_op_calc(opname, alternative, frame, has_skipna=True): f = getattr(frame, opname) if has_skipna: + def skipna_wrapper(x): nona = x.dropna().values return alternative(nona) @@ -169,8 +195,9 @@ def wrapper(x): result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper)) - tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False) # HACK: win32 + tm.assert_series_equal( + result1, frame.apply(wrapper, axis=1), check_dtype=False + ) # HACK: win32 else: skipna_wrapper = alternative wrapper = alternative @@ -179,11 +206,12 @@ def wrapper(x): result1 = f(axis=1) tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) - tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), - check_dtype=False) + tm.assert_series_equal( + result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False + ) # bad axis - with pytest.raises(ValueError, match='No axis named 2'): + with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case @@ -191,7 +219,7 @@ def wrapper(x): all_na = frame * np.NaN r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) - if opname == 'any': + if opname == "any": assert not r0.any() assert not r1.any() else: @@ -199,8 +227,9 @@ def wrapper(x): assert r1.all() -def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, - has_bool_only=False): +def assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=False +): """ Check that API for boolean operator opname works as advertised on frame @@ -217,7 +246,7 @@ def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, """ # make sure op works on mixed-type frame mixed = float_string_frame - mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5 + mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5 getattr(mixed, opname)(axis=0) getattr(mixed, opname)(axis=1) @@ -235,62 +264,69 @@ class TestDataFrameAnalytics: @td.skip_if_no_scipy def test_corr_pearson(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'pearson') + self._check_method(float_frame, "pearson") @td.skip_if_no_scipy def test_corr_kendall(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'kendall') + self._check_method(float_frame, "kendall") @td.skip_if_no_scipy def test_corr_spearman(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'spearman') + self._check_method(float_frame, "spearman") - def _check_method(self, frame, method='pearson'): + def _check_method(self, frame, method="pearson"): correls = frame.corr(method=method) - expected = frame['A'].corr(frame['C'], method=method) - tm.assert_almost_equal(correls['A']['C'], expected) + expected = frame["A"].corr(frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) @td.skip_if_no_scipy def 
test_corr_non_numeric(self, float_frame, float_string_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan # exclude non-numeric types result = float_string_frame.corr() - expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].corr() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy - @pytest.mark.parametrize('meth', ['pearson', 'kendall', 'spearman']) + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) def test_corr_nooverlap(self, meth): # nothing in common - df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 1, 1.5, 1], - 'C': [np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]}) + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) rs = df.corr(meth) - assert isna(rs.loc['A', 'B']) - assert isna(rs.loc['B', 'A']) - assert rs.loc['A', 'A'] == 1 - assert rs.loc['B', 'B'] == 1 - assert isna(rs.loc['C', 'C']) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) @td.skip_if_no_scipy - @pytest.mark.parametrize('meth', ['pearson', 'spearman']) + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) def test_corr_constant(self, meth): # constant --> all NA - df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 1, 1, 1]}) + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) rs = df.corr(meth) assert isna(rs.values).all() @@ -308,9 +344,8 @@ def test_corr_int_and_boolean(self): # so it need to be properly handled df = DataFrame({"a": [True, False], "b": [1, 0]}) - expected = DataFrame(np.ones((2, 2)), index=[ - 'a', 'b'], columns=['a', 'b']) - for meth in ['pearson', 'kendall', 'spearman']: + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + for meth in ["pearson", "kendall", "spearman"]: with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) @@ -319,9 +354,8 @@ def test_corr_int_and_boolean(self): def test_corr_cov_independent_index_column(self): # GH 14617 - df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), - columns=list("abcd")) - for method in ['cov', 'corr']: + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + for method in ["cov", "corr"]: result = getattr(df, method)() assert result.index is not result.columns assert result.index.equals(result.columns) @@ -329,8 +363,9 @@ def test_corr_cov_independent_index_column(self): def test_corr_invalid_method(self): # GH 22298 df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = ("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, ") + msg = ( + "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " + ) with pytest.raises(ValueError, match=msg): df.corr(method="____") @@ -346,36 +381,39 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - frame['A'][:5] = np.nan - frame['B'][5:10] = np.nan + frame["A"][:5] = np.nan + frame["B"][5:10] = np.nan result = float_frame.cov(min_periods=len(float_frame) - 8) expected = float_frame.cov() - expected.loc['A', 'B'] = np.nan - expected.loc['B', 'A'] = np.nan + 
expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan # regular - float_frame['A'][:5] = np.nan - float_frame['B'][:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][:10] = np.nan cov = float_frame.cov() - tm.assert_almost_equal(cov['A']['C'], - float_frame['A'].cov(float_frame['C'])) + tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) # exclude non-numeric types result = float_string_frame.cov() - expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].cov() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() tm.assert_frame_equal(result, expected) # Single column frame df = DataFrame(np.linspace(0.0, 1.0, 10)) result = df.cov() - expected = DataFrame(np.cov(df.values.T).reshape((1, 1)), - index=df.columns, columns=df.columns) + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) tm.assert_frame_equal(result, expected) df.loc[0] = np.nan result = df.cov() - expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)), - index=df.columns, columns=df.columns) + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) tm.assert_frame_equal(result, expected) def test_corrwith(self, datetime_frame): @@ -386,39 +424,37 @@ def test_corrwith(self, datetime_frame): # make sure order does not matter b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) - del b['B'] + del b["B"] colcorr = a.corrwith(b, axis=0) - tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) rowcorr = a.corrwith(b, axis=1) tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) dropped = a.corrwith(b, axis=0, drop=True) - tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) - assert 'B' not in dropped + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index # non time-series data - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) - df2 = DataFrame(np.random.randn(4, 4), - index=index[:4], columns=columns) + df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) correls = df1.corrwith(df2, axis=1) for row in index[:4]: - tm.assert_almost_equal(correls[row], - df1.loc[row].corr(df2.loc[row])) + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) def test_corrwith_with_objects(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() - cols = ['A', 'B', 'C', 'D'] + cols = ["A", "B", "C", "D"] - df1['obj'] = 'foo' - df2['obj'] = 'bar' + df1["obj"] = "foo" + df2["obj"] = "bar" result = df1.corrwith(df2) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) @@ -429,45 +465,42 @@ def test_corrwith_with_objects(self): tm.assert_series_equal(result, expected) def test_corrwith_series(self, datetime_frame): - result = datetime_frame.corrwith(datetime_frame['A']) - expected = datetime_frame.apply(datetime_frame['A'].corr) + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) tm.assert_series_equal(result, expected) def test_corrwith_matches_corrcoef(self): - df1 = DataFrame(np.arange(10000), columns=['a']) - df2 = DataFrame(np.arange(10000) ** 2, columns=['a']) - c1 = df1.corrwith(df2)['a'] - 
c2 = np.corrcoef(df1['a'], df2['a'])[0][1] + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] tm.assert_almost_equal(c1, c2) assert c1 < 1 def test_corrwith_mixed_dtypes(self): # GH 18570 - df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3], - 'c': ['a', 'b', 'c', 'd']}) + df = pd.DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) s = pd.Series([0, 6, 7, 3]) result = df.corrwith(s) - corrs = [df['a'].corr(s), df['b'].corr(s)] - expected = pd.Series(data=corrs, index=['a', 'b']) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = pd.Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) def test_corrwith_index_intersection(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), - columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), - columns=["a", "b", "c"]) + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=True).index.sort_values() expected = df1.columns.intersection(df2.columns).sort_values() tm.assert_index_equal(result, expected) def test_corrwith_index_union(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), - columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), - columns=["a", "b", "c"]) + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=False).index.sort_values() expected = df1.columns.union(df2.columns).sort_values() @@ -487,7 +520,7 @@ def test_corrwith_dup_cols(self): def test_corrwith_spearman(self): # GH 21925 df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df**2, method="spearman") + result = df.corrwith(df ** 2, method="spearman") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @@ -495,7 +528,7 @@ def test_corrwith_spearman(self): def test_corrwith_kendall(self): # GH 21925 df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df**2, method="kendall") + result = df.corrwith(df ** 2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @@ -503,34 +536,40 @@ def test_corrwith_kendall(self): # Describe def test_bool_describe_in_mixed_frame(self): - df = DataFrame({ - 'string_data': ['a', 'b', 'c', 'd', 'e'], - 'bool_data': [True, True, False, False, False], - 'int_data': [10, 20, 30, 40, 50], - }) + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) # Integer data are included in .describe() output, # Boolean and string data are not. 
result = df.describe() - expected = DataFrame({'int_data': [5, 30, df.int_data.std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) # Top value is a boolean value that is False - result = df.describe(include=['bool']) + result = df.describe(include=["bool"]) - expected = DataFrame({'bool_data': [5, 2, False, 3]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) tm.assert_frame_equal(result, expected) def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 df = pd.DataFrame({"A": [None, None]}, dtype=object) result = df.describe() - expected = pd.DataFrame({"A": [0, 0, np.nan, np.nan]}, dtype=object, - index=['count', 'unique', 'top', 'freq']) + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) result = df.iloc[:0].describe() @@ -538,45 +577,51 @@ def test_describe_empty_object(self): def test_describe_bool_frame(self): # GH 13891 - df = pd.DataFrame({ - 'bool_data_1': [False, False, True, True], - 'bool_data_2': [False, True, True, True] - }) + df = pd.DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) result = df.describe() - expected = DataFrame({'bool_data_1': [4, 2, True, 2], - 'bool_data_2': [4, 2, True, 3]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({ - 'bool_data': [False, False, True, True, False], - 'int_data': [0, 1, 2, 3, 4] - }) + df = pd.DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) result = df.describe() - expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1, - 2, 3, 4]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({ - 'bool_data': [False, False, True, True], - 'str_data': ['a', 'b', 'c', 'a'] - }) + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) result = df.describe() - expected = DataFrame({'bool_data': [4, 2, True, 2], - 'str_data': [4, 3, 'a', 2]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) def test_describe_categorical(self): - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, 
range(0, 10500, 500), right=False, labels=cat_labels + ) cat = df # Categoricals should not show up together with numerical columns @@ -586,12 +631,12 @@ def test_describe_categorical(self): # In a frame, describe() for the cat should be the same as for string # arrays (count, unique, top, freq) - cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) s = Series(cat) result = s.describe() - expected = Series([4, 2, "b", 3], - index=['count', 'unique', 'top', 'freq']) + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) cat = Series(Categorical(["a", "b", "c", "c"])) @@ -605,9 +650,11 @@ def test_describe_empty_categorical_column(self): # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": Categorical([])}) result = df.describe() - expected = DataFrame({'empty_col': [0, 0, np.nan, np.nan]}, - index=['count', 'unique', 'top', 'freq'], - dtype='object') + expected = DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) tm.assert_frame_equal(result, expected) # ensure NaN, not None assert np.isnan(result.iloc[2, 0]) @@ -615,86 +662,113 @@ def test_describe_empty_categorical_column(self): def test_describe_categorical_columns(self): # GH 11558 - columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], - ordered=True, name='XXX') - df = DataFrame({'int1': [10, 20, 30, 40, 50], - 'int2': [10, 20, 30, 40, 50], - 'obj': ['A', 0, None, 'X', 1]}, - columns=columns) + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) result = df.describe() - exp_columns = pd.CategoricalIndex(['int1', 'int2'], - categories=['int1', 'int2', 'obj'], - ordered=True, name='XXX') - expected = DataFrame({'int1': [5, 30, df.int1.std(), - 10, 20, 30, 40, 50], - 'int2': [5, 30, df.int2.std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max'], - columns=exp_columns) + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, - expected.columns.values) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - freq='MS', tz='US/Eastern', name='XXX') - df = DataFrame({0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ['A', 0, None, 'X', 1]}) + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) df.columns = columns result = df.describe() - exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'], - freq='MS', tz='US/Eastern', name='XXX') - expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(), - 10, 20, 30, 40, 50], - 1: [5, 30, 
df.iloc[:, 1].std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) expected.columns = exp_columns tm.assert_frame_equal(result, expected) - assert result.columns.freq == 'MS' + assert result.columns.freq == "MS" assert result.columns.tz == expected.columns.tz def test_describe_timedelta_values(self): # GH 6145 - t1 = pd.timedelta_range('1 days', freq='D', periods=5) - t2 = pd.timedelta_range('1 hours', freq='H', periods=5) - df = pd.DataFrame({'t1': t1, 't2': t2}) - - expected = DataFrame({'t1': [5, pd.Timedelta('3 days'), - df.iloc[:, 0].std(), - pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days'), - pd.Timedelta('4 days'), - pd.Timedelta('5 days')], - 't2': [5, pd.Timedelta('3 hours'), - df.iloc[:, 1].std(), - pd.Timedelta('1 hours'), - pd.Timedelta('2 hours'), - pd.Timedelta('3 hours'), - pd.Timedelta('4 hours'), - pd.Timedelta('5 hours')]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) result = df.describe() tm.assert_frame_equal(result, expected) - exp_repr = (" t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00") + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) assert repr(result) == exp_repr def test_describe_tz_values(self, tz_naive_fixture): @@ -704,62 +778,121 @@ def test_describe_tz_values(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({'s1': s1, 's2': s2}) - - expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan, - 2, 1.581139, 0, 1, 2, 3, 4], - 's2': [5, 5, s2.value_counts().index[0], 1, - start.tz_localize(tz), - end.tz_localize(tz), np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan, np.nan]}, - index=['count', 'unique', 'top', 'freq', 'first', - 'last', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] - ) - result = df.describe(include='all') + df = pd.DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { 
+ "s1": [ + 5, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2, + 1.581139, + 0, + 1, + 2, + 3, + 4, + ], + "s2": [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + }, + index=[ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ], + ) + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): # Issue 26660 - df = pd.DataFrame({'x': [1]}) + df = pd.DataFrame({"x": [1]}) pct = np.linspace(0, 1, 10 + 1) result = df.describe(percentiles=pct) expected = DataFrame( - {'x': [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, - index=['count', 'mean', 'std', 'min', '0%', '10%', '20%', '30%', - '40%', '50%', '60%', '70%', '80%', '90%', '100%', 'max']) + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) tm.assert_frame_equal(result, expected) + # --------------------------------------------------------------------- # Reductions def test_stat_op_api(self, float_frame, float_string_frame): - assert_stat_op_api('count', float_frame, float_string_frame, - has_numeric_only=True) - assert_stat_op_api('sum', float_frame, float_string_frame, - has_numeric_only=True) - - assert_stat_op_api('nunique', float_frame, float_string_frame) - assert_stat_op_api('mean', float_frame, float_string_frame) - assert_stat_op_api('product', float_frame, float_string_frame) - assert_stat_op_api('median', float_frame, float_string_frame) - assert_stat_op_api('min', float_frame, float_string_frame) - assert_stat_op_api('max', float_frame, float_string_frame) - assert_stat_op_api('mad', float_frame, float_string_frame) - assert_stat_op_api('var', float_frame, float_string_frame) - assert_stat_op_api('std', float_frame, float_string_frame) - assert_stat_op_api('sem', float_frame, float_string_frame) - assert_stat_op_api('median', float_frame, float_string_frame) + assert_stat_op_api( + "count", float_frame, float_string_frame, has_numeric_only=True + ) + assert_stat_op_api( + "sum", float_frame, float_string_frame, has_numeric_only=True + ) + + assert_stat_op_api("nunique", float_frame, float_string_frame) + assert_stat_op_api("mean", float_frame, float_string_frame) + assert_stat_op_api("product", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) + assert_stat_op_api("min", float_frame, float_string_frame) + assert_stat_op_api("max", float_frame, float_string_frame) + assert_stat_op_api("mad", float_frame, float_string_frame) + assert_stat_op_api("var", float_frame, float_string_frame) + assert_stat_op_api("std", float_frame, float_string_frame) + assert_stat_op_api("sem", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) try: from scipy.stats import skew, kurtosis # noqa:F401 - assert_stat_op_api('skew', float_frame, float_string_frame) - assert_stat_op_api('kurt', float_frame, float_string_frame) + + assert_stat_op_api("skew", float_frame, float_string_frame) + assert_stat_op_api("kurt", float_frame, float_string_frame) except ImportError: pass def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame): - def count(s): return notna(s).sum() @@ -780,43 +913,61 @@ def sem(x): def 
skewness(x): from scipy.stats import skew # noqa:F811 + if len(x) < 3: return np.nan return skew(x, bias=False) def kurt(x): from scipy.stats import kurtosis # noqa:F811 + if len(x) < 4: return np.nan return kurtosis(x, bias=False) - assert_stat_op_calc('nunique', nunique, float_frame_with_na, - has_skipna=False, check_dtype=False, - check_dates=True) + assert_stat_op_calc( + "nunique", + nunique, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) # mixed types (with upcasting happening) - assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'), - check_dtype=False, check_less_precise=True) - - assert_stat_op_calc('sum', np.sum, float_frame_with_na, - skipna_alternative=np.nansum) - assert_stat_op_calc('mean', np.mean, float_frame_with_na, - check_dates=True) - assert_stat_op_calc('product', np.prod, float_frame_with_na) - - assert_stat_op_calc('mad', mad, float_frame_with_na) - assert_stat_op_calc('var', var, float_frame_with_na) - assert_stat_op_calc('std', std, float_frame_with_na) - assert_stat_op_calc('sem', sem, float_frame_with_na) + assert_stat_op_calc( + "sum", + np.sum, + mixed_float_frame.astype("float32"), + check_dtype=False, + check_less_precise=True, + ) - assert_stat_op_calc('count', count, float_frame_with_na, - has_skipna=False, check_dtype=False, - check_dates=True) + assert_stat_op_calc( + "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum + ) + assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) + assert_stat_op_calc("product", np.prod, float_frame_with_na) + + assert_stat_op_calc("mad", mad, float_frame_with_na) + assert_stat_op_calc("var", var, float_frame_with_na) + assert_stat_op_calc("std", std, float_frame_with_na) + assert_stat_op_calc("sem", sem, float_frame_with_na) + + assert_stat_op_calc( + "count", + count, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) try: from scipy import skew, kurtosis # noqa:F401 - assert_stat_op_calc('skew', skewness, float_frame_with_na) - assert_stat_op_calc('kurt', kurt, float_frame_with_na) + + assert_stat_op_calc("skew", skewness, float_frame_with_na) + assert_stat_op_calc("kurt", kurt, float_frame_with_na) except ImportError: pass @@ -828,89 +979,100 @@ def wrapper(x): return np.nan return np.median(x) - assert_stat_op_calc('median', wrapper, float_frame_with_na, - check_dates=True) - assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False, - check_dates=True) + assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True) + assert_stat_op_calc( + "median", wrapper, int_frame, check_dtype=False, check_dates=True + ) - @pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var', - 'std', 'skew', 'min', 'max']) + @pytest.mark.parametrize( + "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] + ) def test_stat_operators_attempt_obj_array(self, method): # GH#676 data = { - 'a': [-0.00049987540199591344, -0.0016467257772919831, - 0.00067695870775883013], - 'b': [-0, -0, 0.0], - 'c': [0.00031111847529610595, 0.0014902627951905339, - -0.00094099200035979691] + "a": [ + -0.00049987540199591344, + -0.0016467257772919831, + 0.00067695870775883013, + ], + "b": [-0, -0, 0.0], + "c": [ + 0.00031111847529610595, + 0.0014902627951905339, + -0.00094099200035979691, + ], } - df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O') + df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O") - df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], - 2: [np.nan, 
4]}, dtype=object) + df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) for df in [df1, df2]: assert df.values.dtype == np.object_ result = getattr(df, method)(1) - expected = getattr(df.astype('f8'), method)(1) + expected = getattr(df.astype("f8"), method)(1) - if method in ['sum', 'prod']: + if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('op', ['mean', 'std', 'var', - 'skew', 'kurt', 'sem']) + @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 - df = DataFrame({'int': [1, 2, 3, 4], - 'float': [1., 2., 3., 4.], - 'str': ['a', 'b', 'c', 'd']}) + df = DataFrame( + { + "int": [1, 2, 3, 4], + "float": [1.0, 2.0, 3.0, 4.0], + "str": ["a", "b", "c", "d"], + } + ) result = getattr(df, op)() assert len(result) == 2 - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = getattr(df, op)() assert len(result) == 2 def test_reduce_mixed_frame(self): # GH 6806 - df = DataFrame({ - 'bool_data': [True, True, False, False, False], - 'int_data': [10, 20, 30, 40, 50], - 'string_data': ['a', 'b', 'c', 'd', 'e'], - }) - df.reindex(columns=['bool_data', 'int_data', 'string_data']) + df = DataFrame( + { + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + "string_data": ["a", "b", "c", "d", "e"], + } + ) + df.reindex(columns=["bool_data", "int_data", "string_data"]) test = df.sum(axis=0) - tm.assert_numpy_array_equal(test.values, - np.array([2, 150, 'abcde'], dtype=object)) + tm.assert_numpy_array_equal( + test.values, np.array([2, 150, "abcde"], dtype=object) + ) tm.assert_series_equal(test, df.T.sum(axis=1)) def test_nunique(self): - df = DataFrame({'A': [1, 1, 1], - 'B': [1, 2, 3], - 'C': [1, np.nan, 3]}) - tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2})) - tm.assert_series_equal(df.nunique(dropna=False), - Series({'A': 1, 'B': 3, 'C': 3})) + df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) + tm.assert_series_equal( + df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) + ) tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) - tm.assert_series_equal(df.nunique(axis=1, dropna=False), - Series({0: 1, 1: 3, 2: 2})) + tm.assert_series_equal( + df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) + ) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 - df = pd.DataFrame({"A": [1, 1], - "B": [pd.Timestamp('2000', tz=tz)] * 2}) + df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() - expected = pd.Series([1.0], index=['A']) + expected = pd.Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_excludeds_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. 
- df = pd.DataFrame({"A": [pd.Timestamp('2000', tz=tz)] * 2}) + df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() expected = pd.Series() tm.assert_series_equal(result, expected) @@ -928,28 +1090,27 @@ def test_var_std(self, datetime_frame): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() - @pytest.mark.parametrize( - "meth", ['sem', 'var', 'std']) + @pytest.mark.parametrize("meth", ["sem", "var", "std"]) def test_numeric_only_flag(self, meth): # GH 9201 - df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz']) + df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a number in str format - df1.loc[0, 'foo'] = '100' + df1.loc[0, "foo"] = "100" - df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz']) + df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a non-number str - df2.loc[0, 'foo'] = 'a' + df2.loc[0, "foo"] = "a" result = getattr(df1, meth)(axis=1, numeric_only=True) - expected = getattr(df1[['bar', 'baz']], meth)(axis=1) + expected = getattr(df1[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) result = getattr(df2, meth)(axis=1, numeric_only=True) - expected = getattr(df2[['bar', 'baz']], meth)(axis=1) + expected = getattr(df2[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside @@ -962,82 +1123,104 @@ def test_numeric_only_flag(self, meth): def test_sem(self, datetime_frame): result = datetime_frame.sem(ddof=4) - expected = datetime_frame.apply( - lambda x: x.std(ddof=4) / np.sqrt(len(x))) + expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() @td.skip_if_no_scipy def test_kurt(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) df = DataFrame(np.random.randn(6, 3), index=index) kurt = df.kurt() - kurt2 = df.kurt(level=0).xs('bar') + kurt2 = df.kurt(level=0).xs("bar") tm.assert_series_equal(kurt, kurt2, check_names=False) assert kurt.name is None - assert kurt2.name == 'bar' - - @pytest.mark.parametrize("dropna, expected", [ - (True, {'A': [12], - 'B': [10.0], - 'C': [1.0], - 'D': ['a'], - 'E': Categorical(['a'], categories=['a']), - 'F': to_datetime(['2000-1-2']), - 'G': to_timedelta(['1 days'])}), - (False, {'A': [12], - 'B': [10.0], - 'C': [np.nan], - 'D': np.array([np.nan], dtype=object), - 'E': Categorical([np.nan], categories=['a']), - 'F': [pd.NaT], - 'G': to_timedelta([pd.NaT])}), - (True, {'H': [8, 9, np.nan, np.nan], - 'I': [8, 9, np.nan, np.nan], - 'J': [1, np.nan, np.nan, np.nan], - 'K': Categorical(['a', np.nan, np.nan, np.nan], - categories=['a']), - 'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']), - 'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']), - 'N': [0, 1, 2, 3]}), - 
(False, {'H': [8, 9, np.nan, np.nan], - 'I': [8, 9, np.nan, np.nan], - 'J': [1, np.nan, np.nan, np.nan], - 'K': Categorical([np.nan, 'a', np.nan, np.nan], - categories=['a']), - 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']), - 'N': [0, 1, 2, 3]}) - ]) + assert kurt2.name == "bar" + + @pytest.mark.parametrize( + "dropna, expected", + [ + ( + True, + { + "A": [12], + "B": [10.0], + "C": [1.0], + "D": ["a"], + "E": Categorical(["a"], categories=["a"]), + "F": to_datetime(["2000-1-2"]), + "G": to_timedelta(["1 days"]), + }, + ), + ( + False, + { + "A": [12], + "B": [10.0], + "C": [np.nan], + "D": np.array([np.nan], dtype=object), + "E": Categorical([np.nan], categories=["a"]), + "F": [pd.NaT], + "G": to_timedelta([pd.NaT]), + }, + ), + ( + True, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), + "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ( + False, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), + "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["nan", "1 days", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ], + ) def test_mode_dropna(self, dropna, expected): - df = DataFrame({"A": [12, 12, 19, 11], - "B": [10, 10, np.nan, 3], - "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, 'a', np.nan], - "E": Categorical([np.nan, np.nan, 'a', np.nan]), - "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']), - "H": [8, 8, 9, 9], - "I": [9, 9, 8, 8], - "J": [1, 1, np.nan, np.nan], - "K": Categorical(['a', np.nan, 'a', np.nan]), - "L": to_datetime(['2000-1-2', '2000-1-2', - 'NaT', 'NaT']), - "M": to_timedelta(['1 days', 'nan', - '1 days', 'nan']), - "N": np.arange(4, dtype='int64')}) + df = DataFrame( + { + "A": [12, 12, 19, 11], + "B": [10, 10, np.nan, 3], + "C": [1, np.nan, np.nan, np.nan], + "D": [np.nan, np.nan, "a", np.nan], + "E": Categorical([np.nan, np.nan, "a", np.nan]), + "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "G": to_timedelta(["1 days", "nan", "nan", "nan"]), + "H": [8, 8, 9, 9], + "I": [9, 9, 8, 8], + "J": [1, 1, np.nan, np.nan], + "K": Categorical(["a", np.nan, "a", np.nan]), + "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), + "N": np.arange(4, dtype="int64"), + } + ) result = df[sorted(list(expected.keys()))].mode(dropna=dropna) expected = DataFrame(expected) @@ -1047,89 +1230,101 @@ def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']}) - expected = DataFrame({'A': ['a', np.nan]}) + df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) + expected = DataFrame({"A": ["a", np.nan]}) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = df.mode(dropna=False) - result = result.sort_values(by='A').reset_index(drop=True) + result = result.sort_values(by="A").reset_index(drop=True) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): - df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'), - B=date_range('2012-1-2', periods=3, freq='D'), - C=Timestamp('20120101') - - 
timedelta(minutes=5, seconds=5))) + df = DataFrame( + dict( + A=date_range("2012-1-1", periods=3, freq="D"), + B=date_range("2012-1-2", periods=3, freq="D"), + C=Timestamp("20120101") - timedelta(minutes=5, seconds=5), + ) + ) - diffs = DataFrame(dict(A=df['A'] - df['C'], - B=df['A'] - df['B'])) + diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"])) # min result = diffs.min() - assert result[0] == diffs.loc[0, 'A'] - assert result[1] == diffs.loc[0, 'B'] + assert result[0] == diffs.loc[0, "A"] + assert result[1] == diffs.loc[0, "B"] result = diffs.min(axis=1) - assert (result == diffs.loc[0, 'B']).all() + assert (result == diffs.loc[0, "B"]).all() # max result = diffs.max() - assert result[0] == diffs.loc[2, 'A'] - assert result[1] == diffs.loc[2, 'B'] + assert result[0] == diffs.loc[2, "A"] + assert result[1] == diffs.loc[2, "B"] result = diffs.max(axis=1) - assert (result == diffs['A']).all() + assert (result == diffs["A"]).all() # abs result = diffs.abs() result2 = abs(diffs) - expected = DataFrame(dict(A=df['A'] - df['C'], - B=df['B'] - df['A'])) + expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"])) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) # mixed frame mixed = diffs.copy() - mixed['C'] = 'foo' - mixed['D'] = 1 - mixed['E'] = 1. - mixed['F'] = Timestamp('20130101') + mixed["C"] = "foo" + mixed["D"] = 1 + mixed["E"] = 1.0 + mixed["F"] = Timestamp("20130101") # results in an object array result = mixed.min() - expected = Series([pd.Timedelta(timedelta(seconds=5 * 60 + 5)), - pd.Timedelta(timedelta(days=-1)), - 'foo', 1, 1.0, - Timestamp('20130101')], - index=mixed.columns) + expected = Series( + [ + pd.Timedelta(timedelta(seconds=5 * 60 + 5)), + pd.Timedelta(timedelta(days=-1)), + "foo", + 1, + 1.0, + Timestamp("20130101"), + ], + index=mixed.columns, + ) tm.assert_series_equal(result, expected) # excludes numeric result = mixed.min(axis=1) - expected = Series([1, 1, 1.], index=[0, 1, 2]) + expected = Series([1, 1, 1.0], index=[0, 1, 2]) tm.assert_series_equal(result, expected) # works when only those columns are selected - result = mixed[['A', 'B']].min(1) + result = mixed[["A", "B"]].min(1) expected = Series([timedelta(days=-1)] * 3) tm.assert_series_equal(result, expected) - result = mixed[['A', 'B']].min() - expected = Series([timedelta(seconds=5 * 60 + 5), - timedelta(days=-1)], index=['A', 'B']) + result = mixed[["A", "B"]].min() + expected = Series( + [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"] + ) tm.assert_series_equal(result, expected) # GH 3106 - df = DataFrame({'time': date_range('20130102', periods=5), - 'time2': date_range('20130105', periods=5)}) - df['off1'] = df['time2'] - df['time'] - assert df['off1'].dtype == 'timedelta64[ns]' + df = DataFrame( + { + "time": date_range("20130102", periods=5), + "time2": date_range("20130105", periods=5), + } + ) + df["off1"] = df["time2"] - df["time"] + assert df["off1"].dtype == "timedelta64[ns]" - df['off2'] = df['time'] - df['time2'] + df["off2"] = df["time"] - df["time2"] df._consolidate_inplace() - assert df['off1'].dtype == 'timedelta64[ns]' - assert df['off2'].dtype == 'timedelta64[ns]' + assert df["off1"].dtype == "timedelta64[ns]" + assert df["off2"].dtype == "timedelta64[ns]" def test_sum_corner(self): empty_frame = DataFrame() @@ -1141,18 +1336,15 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 - @pytest.mark.parametrize('method, unit', [ - ('sum', 0), - ('prod', 1), - ]) + 
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) def test_sum_prod_nanops(self, method, unit): - idx = ['a', 'b', 'c'] - df = pd.DataFrame({"a": [unit, unit], - "b": [unit, np.nan], - "c": [np.nan, np.nan]}) + idx = ["a", "b", "c"] + df = pd.DataFrame( + {"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]} + ) # The default result = getattr(df, method) - expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") # min_count=1 result = getattr(df, method)(min_count=1) @@ -1161,7 +1353,7 @@ def test_sum_prod_nanops(self, method, unit): # min_count=0 result = getattr(df, method)(min_count=0) - expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) result = getattr(df.iloc[1:], method)(min_count=1) @@ -1171,25 +1363,23 @@ def test_sum_prod_nanops(self, method, unit): # min_count > 1 df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) result = getattr(df, method)(min_count=5) - expected = pd.Series(result, index=['A', 'B']) + expected = pd.Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) result = getattr(df, method)(min_count=6) - expected = pd.Series(result, index=['A', 'B']) + expected = pd.Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) def test_sum_nanops_timedelta(self): # prod isn't defined on timedeltas - idx = ['a', 'b', 'c'] - df = pd.DataFrame({"a": [0, 0], - "b": [0, np.nan], - "c": [np.nan, np.nan]}) + idx = ["a", "b", "c"] + df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) df2 = df.apply(pd.to_timedelta) # 0 by default result = df2.sum() - expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) + expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) # min_count=0 @@ -1198,13 +1388,12 @@ def test_sum_nanops_timedelta(self): # min_count=1 result = df2.sum(min_count=1) - expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx) + expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) def test_sum_object(self, float_frame): values = float_frame.values.astype(int) - frame = DataFrame(values, index=float_frame.index, - columns=float_frame.columns) + frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns) deltas = frame * timedelta(1) deltas.sum() @@ -1227,49 +1416,49 @@ def test_mean_corner(self, float_frame, float_string_frame): tm.assert_index_equal(the_sum.index, the_mean.index) # take mean of boolean column - float_frame['bool'] = float_frame['A'] > 0 + float_frame["bool"] = float_frame["A"] > 0 means = float_frame.mean(0) - assert means['bool'] == float_frame['bool'].values.mean() + assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True - df = pd.DataFrame({ - 'A': np.arange(3), - 'B': pd.date_range('2016-01-01', periods=3), - 'C': pd.timedelta_range('1D', periods=3), - 'D': pd.period_range('2016', periods=3, freq='A') - }) + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) result = df.mean(numeric_only=True) - expected = pd.Series({'A': 1.}) + 
expected = pd.Series({"A": 1.0}) tm.assert_series_equal(result, expected) result = df.mean() - expected = pd.Series({ - 'A': 1., - 'C': df.loc[1, 'C'] - }) + expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="casts to object-dtype and then tries to " - "add timestamps", - raises=TypeError, strict=True) + @pytest.mark.xfail( + reason="casts to object-dtype and then tries to " "add timestamps", + raises=TypeError, + strict=True, + ) def test_mean_datetimelike_numeric_only_false(self): - df = pd.DataFrame({ - 'A': np.arange(3), - 'B': pd.date_range('2016-01-01', periods=3), - 'C': pd.timedelta_range('1D', periods=3), - 'D': pd.period_range('2016', periods=3, freq='A') - }) + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) result = df.mean(numeric_only=False) - expected = pd.Series({ - 'A': 1, - 'B': df.loc[1, 'B'], - 'C': df.loc[1, 'C'], - 'D': df.loc[1, 'D'] - }) + expected = pd.Series( + {"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"], "D": df.loc[1, "D"]} + ) tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1288,8 +1477,7 @@ def test_sum_bools(self): # Cumulative Reductions - cumsum, cummax, ... def test_cumsum_corner(self): - dm = DataFrame(np.arange(20).reshape(4, 5), - index=range(4), columns=range(5)) + dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) # ?(wesm) result = dm.cumsum() # noqa @@ -1309,7 +1497,7 @@ def test_cumsum(self, datetime_frame): tm.assert_frame_equal(cumsum, expected) # works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cumsum() # noqa # fix issue @@ -1361,7 +1549,7 @@ def test_cummin(self, datetime_frame): tm.assert_frame_equal(cummin, expected) # it works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummin() # noqa # fix issue @@ -1384,7 +1572,7 @@ def test_cummax(self, datetime_frame): tm.assert_frame_equal(cummax, expected) # it works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummax() # noqa # fix issue @@ -1428,17 +1616,16 @@ def test_count_objects(self, float_string_frame): def test_pct_change(self): # GH#11150 - pnl = DataFrame([np.arange(0, 40, 10), - np.arange(0, 40, 10), - np.arange(0, 40, 10)]).astype(np.float64) + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) pnl.iat[1, 0] = np.nan pnl.iat[1, 1] = np.nan pnl.iat[2, 3] = 60 for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift( - axis=axis) - 1 - result = pnl.pct_change(axis=axis, fill_method='pad') + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") tm.assert_frame_equal(result, expected) @@ -1453,12 +1640,10 @@ def test_idxmin(self, float_frame, int_frame): for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmin(axis=axis, skipna=skipna) - expected = df.apply(Series.idxmin, axis=axis, - skipna=skipna) + expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No 
axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) @@ -1470,40 +1655,43 @@ def test_idxmax(self, float_frame, int_frame): for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmax(axis=axis, skipna=skipna) - expected = df.apply(Series.idxmax, axis=axis, - skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) # ---------------------------------------------------------------------- # Logical reductions - @pytest.mark.parametrize('opname', ['any', 'all']) + @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all(self, opname, bool_frame_with_na, float_string_frame): - assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na, - has_skipna=True) - assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, - has_bool_only=True) + assert_bool_op_calc( + opname, getattr(np, opname), bool_frame_with_na, has_skipna=True + ) + assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=True + ) def test_any_all_extra(self): - df = DataFrame({ - 'A': [True, False, False], - 'B': [True, True, False], - 'C': [True, True, True], - }, index=['a', 'b', 'c']) - result = df[['A', 'B']].any(1) - expected = Series([True, True, False], index=['a', 'b', 'c']) + df = DataFrame( + { + "A": [True, False, False], + "B": [True, True, False], + "C": [True, True, True], + }, + index=["a", "b", "c"], + ) + result = df[["A", "B"]].any(1) + expected = Series([True, True, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) - result = df[['A', 'B']].any(1, bool_only=True) + result = df[["A", "B"]].any(1, bool_only=True) tm.assert_series_equal(result, expected) result = df.all(1) - expected = Series([True, False, False], index=['a', 'b', 'c']) + expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df.all(1, bool_only=True) @@ -1516,21 +1704,20 @@ def test_any_all_extra(self): result = df.any(axis=None).item() assert result is True - result = df[['C']].all(axis=None).item() + result = df[["C"]].all(axis=None).item() assert result is True def test_any_datetime(self): # GH 23070 float_data = [1, np.nan, 3, np.nan] - datetime_data = [pd.Timestamp('1960-02-15'), - pd.Timestamp('1960-02-16'), - pd.NaT, - pd.NaT] - df = DataFrame({ - "A": float_data, - "B": datetime_data - }) + datetime_data = [ + pd.Timestamp("1960-02-15"), + pd.Timestamp("1960-02-16"), + pd.NaT, + pd.NaT, + ] + df = DataFrame({"A": float_data, "B": datetime_data}) result = df.any(1) expected = Series([True, True, True, False]) @@ -1539,72 +1726,107 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 - df = DataFrame({"col1": [1, 2, 3], - "col2": [4, 5, 6], - "col3": [None, None, None]}) + df = DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + ) result = df.all(bool_only=True) expected = Series(dtype=np.bool) tm.assert_series_equal(result, expected) - df = DataFrame({"col1": [1, 2, 3], - "col2": [4, 5, 6], - "col3": [None, None, None], - "col4": [False, False, True]}) + df = DataFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + "col3": [None, None, None], + "col4": [False, False, True], + } + ) result = df.all(bool_only=True) expected = Series({"col4": False}) tm.assert_series_equal(result, 
expected) - @pytest.mark.parametrize('func, data, expected', [ - (np.any, {}, False), - (np.all, {}, True), - (np.any, {'A': []}, False), - (np.all, {'A': []}, True), - (np.any, {'A': [False, False]}, False), - (np.all, {'A': [False, False]}, False), - (np.any, {'A': [True, False]}, True), - (np.all, {'A': [True, False]}, False), - (np.any, {'A': [True, True]}, True), - (np.all, {'A': [True, True]}, True), - - (np.any, {'A': [False], 'B': [False]}, False), - (np.all, {'A': [False], 'B': [False]}, False), - - (np.any, {'A': [False, False], 'B': [False, True]}, True), - (np.all, {'A': [False, False], 'B': [False, True]}, False), - - # other types - (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), - (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), - (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), - (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), - pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), - (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), - (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), - (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), - - # # Mix - # GH 21484 - # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), - # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), - ]) + @pytest.mark.parametrize( + "func, data, expected", + [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {"A": []}, False), + (np.all, {"A": []}, True), + (np.any, {"A": [False, False]}, False), + (np.all, {"A": [False, False]}, False), + (np.any, {"A": [True, False]}, True), + (np.all, {"A": [True, False]}, False), + (np.any, {"A": [True, True]}, True), + (np.all, {"A": [True, True]}, True), + (np.any, {"A": [False], "B": [False]}, False), + (np.all, {"A": [False], "B": [False]}, False), + (np.any, {"A": [False, False], "B": [False, True]}, True), + (np.all, {"A": [False, False], "B": [False, True]}, False), + # other types + (np.all, {"A": pd.Series([0.0, 1.0], dtype="float")}, False), + (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), + (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), + (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), + pytest.param( + np.all, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([0, 1], 
dtype="m8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), + (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), + (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), + (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), + # # Mix + # GH 21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ], + ) def test_any_all_np_func(self, func, data, expected): # GH 19976 data = DataFrame(data) @@ -1619,32 +1841,38 @@ def test_any_all_np_func(self, func, data, expected): def test_any_all_object(self): # GH 19976 - result = np.all(DataFrame(columns=['a', 'b'])).item() + result = np.all(DataFrame(columns=["a", "b"])).item() assert result is True - result = np.any(DataFrame(columns=['a', 'b'])).item() + result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False - @pytest.mark.parametrize('method', ['any', 'all']) + @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): df = DataFrame( {"A": 1}, - index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], - names=['out', 'in']) + index=MultiIndex.from_product( + [["A", "B"], ["a", "b"]], names=["out", "in"] + ), ) xpr = "Must specify 'axis' when aggregating by level." with pytest.raises(ValueError, match=xpr): - getattr(df, method)(axis=None, level='out') + getattr(df, method)(axis=None, level="out") # ---------------------------------------------------------------------- # Isin def test_isin(self): # GH 4211 - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}, - index=['foo', 'bar', 'baz', 'qux']) - other = ['a', 'b', 'c'] + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] result = df.isin(other) expected = DataFrame([df.loc[s].isin(other) for s in df.index]) @@ -1653,86 +1881,92 @@ def test_isin(self): @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) def test_isin_empty(self, empty): # GH 16991 - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) expected = DataFrame(False, df.index, df.columns) result = df.isin(empty) tm.assert_frame_equal(result, expected) def test_isin_dict(self): - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - d = {'A': ['a']} + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} expected = DataFrame(False, df.index, df.columns) - expected.loc[0, 'A'] = True + expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) # non unique columns - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - df.columns = ['A', 'A'] + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] expected = DataFrame(False, df.index, df.columns) - expected.loc[0, 'A'] = True + expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) def test_isin_with_string_scalar(self): # GH 
4763 - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}, - index=['foo', 'bar', 'baz', 'qux']) + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) with pytest.raises(TypeError): - df.isin('a') + df.isin("a") with pytest.raises(TypeError): - df.isin('aaa') + df.isin("aaa") def test_isin_df(self): - df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) - df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]}) + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) expected = DataFrame(False, df1.index, df1.columns) result = df1.isin(df2) - expected['A'].loc[[1, 3]] = True - expected['B'].loc[[0, 2]] = True + expected["A"].loc[[1, 3]] = True + expected["B"].loc[[0, 2]] = True tm.assert_frame_equal(result, expected) # partial overlapping columns - df2.columns = ['A', 'C'] + df2.columns = ["A", "C"] result = df1.isin(df2) - expected['B'] = False + expected["B"] = False tm.assert_frame_equal(result, expected) def test_isin_tuples(self): # GH 16394 - df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) - df['C'] = list(zip(df['A'], df['B'])) - result = df['C'].isin([(1, 'a')]) - tm.assert_series_equal(result, - Series([True, False, False], name="C")) + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) def test_isin_df_dupe_values(self): - df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) # just cols duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=['B', 'B']) + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) with pytest.raises(ValueError): df1.isin(df2) # just index duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=['A', 'B'], index=[0, 0, 1, 1]) + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) with pytest.raises(ValueError): df1.isin(df2) # cols and index: - df2.columns = ['B', 'B'] + df2.columns = ["B", "B"] with pytest.raises(ValueError): df1.isin(df2) def test_isin_dupe_self(self): - other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]}) - df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A']) + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) result = df.isin(other) expected = DataFrame(False, index=df.index, columns=df.columns) expected.loc[0] = True @@ -1740,26 +1974,40 @@ def test_isin_dupe_self(self): tm.assert_frame_equal(result, expected) def test_isin_against_series(self): - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, - index=['a', 'b', 'c', 'd']) - s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) + df = pd.DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) - expected['A'].loc['a'] = True - expected.loc['d'] = True + expected["A"].loc["a"] = True + expected.loc["d"] = True result = df.isin(s) tm.assert_frame_equal(result, expected) def test_isin_multiIndex(self): - idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 
'bar'), - (0, 'b', 'bar'), (0, 'b', 'baz'), - (2, 'a', 'foo'), (2, 'a', 'bar'), - (2, 'c', 'bar'), (2, 'c', 'baz'), - (1, 'b', 'foo'), (1, 'b', 'bar'), - (1, 'c', 'bar'), (1, 'c', 'baz')]) - df1 = DataFrame({'A': np.ones(12), - 'B': np.zeros(12)}, index=idx) - df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], - 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]}) + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) # against regular index expected = DataFrame(False, index=df1.index, columns=df1.columns) result = df1.isin(df2) @@ -1768,21 +2016,19 @@ def test_isin_multiIndex(self): df2.index = idx expected = df2.values.astype(np.bool) expected[:, 1] = ~expected[:, 1] - expected = DataFrame(expected, columns=['A', 'B'], index=idx) + expected = DataFrame(expected, columns=["A", "B"], index=idx) result = df1.isin(df2) tm.assert_frame_equal(result, expected) def test_isin_empty_datetimelike(self): # GH 15473 - df1_ts = DataFrame({'date': - pd.to_datetime(['2014-01-01', '2014-01-02'])}) - df1_td = DataFrame({'date': - [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]}) - df2 = DataFrame({'date': []}) + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) df3 = DataFrame() - expected = DataFrame({'date': [False, False]}) + expected = DataFrame({"date": [False, False]}) result = df1_ts.isin(df2) tm.assert_frame_equal(result, expected) @@ -1805,18 +2051,17 @@ def test_round(self): tm.assert_frame_equal(df, df.round()) # Here's the test frame we'll be working with - df = DataFrame({'col1': [1.123, 2.123, 3.123], - 'col2': [1.234, 2.234, 3.234]}) + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. 
decimals=0) - expected_rounded = DataFrame( - {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) tm.assert_frame_equal(df.round(), expected_rounded) # Round with an integer decimals = 2 - expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12], - 'col2': [1.23, 2.23, 3.23]}) + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) tm.assert_frame_equal(df.round(decimals), expected_rounded) # This should also work with np.round (since np.round dispatches to @@ -1830,29 +2075,29 @@ def test_round(self): # Round with a dictionary expected_rounded = DataFrame( - {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]}) - round_dict = {'col1': 1, 'col2': 2} + {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} + ) + round_dict = {"col1": 1, "col2": 2} tm.assert_frame_equal(df.round(round_dict), expected_rounded) # Incomplete dict expected_partially_rounded = DataFrame( - {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]}) - partial_round_dict = {'col2': 1} - tm.assert_frame_equal(df.round(partial_round_dict), - expected_partially_rounded) + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + partial_round_dict = {"col2": 1} + tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) # Dict with unknown elements - wrong_round_dict = {'col3': 2, 'col2': 1} - tm.assert_frame_equal(df.round(wrong_round_dict), - expected_partially_rounded) + wrong_round_dict = {"col3": 2, "col2": 1} + tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) # float input to `decimals` - non_int_round_dict = {'col1': 1, 'col2': 0.5} + non_int_round_dict = {"col1": 1, "col2": 0.5} with pytest.raises(TypeError): df.round(non_int_round_dict) # String input - non_int_round_dict = {'col1': 1, 'col2': 'foo'} + non_int_round_dict = {"col1": 1, "col2": "foo"} with pytest.raises(TypeError): df.round(non_int_round_dict) @@ -1861,7 +2106,7 @@ def test_round(self): df.round(non_int_round_Series) # List input - non_int_round_dict = {'col1': 1, 'col2': [1, 2]} + non_int_round_dict = {"col1": 1, "col2": [1, 2]} with pytest.raises(TypeError): df.round(non_int_round_dict) @@ -1879,47 +2124,46 @@ def test_round(self): df.round(non_int_round_Series) # Negative numbers - negative_round_dict = {'col1': -1, 'col2': -2} + negative_round_dict = {"col1": -1, "col2": -2} big_df = df * 100 expected_neg_rounded = DataFrame( - {'col1': [110., 210, 310], 'col2': [100., 200, 300]}) - tm.assert_frame_equal(big_df.round(negative_round_dict), - expected_neg_rounded) + {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} + ) + tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) # nan in Series round - nan_round_Series = Series({'col1': np.nan, 'col2': 1}) + nan_round_Series = Series({"col1": np.nan, "col2": 1}) # TODO(wesm): unused? 
- expected_nan_round = DataFrame({ # noqa - 'col1': [1.123, 2.123, 3.123], - 'col2': [1.2, 2.2, 3.2]}) + expected_nan_round = DataFrame( + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} # noqa + ) with pytest.raises(TypeError): df.round(nan_round_Series) # Make sure this doesn't break existing Series.round - tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1']) + tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) # named columns # GH 11986 decimals = 2 expected_rounded = DataFrame( - {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]}) + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) df.columns.name = "cols" expected_rounded.columns.name = "cols" tm.assert_frame_equal(df.round(decimals), expected_rounded) # interaction of named columns & series - tm.assert_series_equal(df['col1'].round(decimals), - expected_rounded['col1']) - tm.assert_series_equal(df.round(decimals)['col1'], - expected_rounded['col1']) + tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) + tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) def test_numpy_round(self): # GH 12600 df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) out = np.round(df, decimals=0) - expected = DataFrame([[2., 1.], [0., 7.]]) + expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) tm.assert_frame_equal(out, expected) msg = "the 'out' parameter is not supported" @@ -1931,35 +2175,46 @@ def test_numpy_round_nan(self): df = Series([1.53, np.nan, 0.06]).to_frame() with tm.assert_produces_warning(None): result = df.round() - expected = Series([2., np.nan, 0.]).to_frame() + expected = Series([2.0, np.nan, 0.0]).to_frame() tm.assert_frame_equal(result, expected) def test_round_mixed_type(self): # GH 11885 - df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4], - 'col2': ['1', 'a', 'c', 'f'], - 'col3': date_range('20111111', periods=4)}) - round_0 = DataFrame({'col1': [1., 2., 3., 4.], - 'col2': ['1', 'a', 'c', 'f'], - 'col3': date_range('20111111', periods=4)}) + df = DataFrame( + { + "col1": [1.1, 2.2, 3.3, 4.4], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + round_0 = DataFrame( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) tm.assert_frame_equal(df.round(), round_0) tm.assert_frame_equal(df.round(1), df) - tm.assert_frame_equal(df.round({'col1': 1}), df) - tm.assert_frame_equal(df.round({'col1': 0}), round_0) - tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0) - tm.assert_frame_equal(df.round({'col3': 1}), df) + tm.assert_frame_equal(df.round({"col1": 1}), df) + tm.assert_frame_equal(df.round({"col1": 0}), round_0) + tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) + tm.assert_frame_equal(df.round({"col3": 1}), df) def test_round_issue(self): # GH 11611 - df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'], - index=['first', 'second', 'third']) + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) dfs = pd.concat((df, df), axis=1) rounded = dfs.round() tm.assert_index_equal(rounded.index, dfs.index) - decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A']) + decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) msg = "Index of decimals must be unique" with pytest.raises(ValueError, match=msg): df.round(decimals) @@ -1967,18 +2222,16 @@ def test_round_issue(self): def test_built_in_round(self): # GH 11763 # Here's the test frame we'll 
be working with - df = DataFrame( - {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]}) + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. decimals=0) - expected_rounded = DataFrame( - {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) tm.assert_frame_equal(round(df), expected_rounded) def test_round_nonunique_categorical(self): # See GH21809 - idx = pd.CategoricalIndex(['low'] * 3 + ['hi'] * 3) - df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc')) + idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) + df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) expected = df.round(3) expected.index = idx @@ -2048,16 +2301,13 @@ def test_clip_mixed_numeric(self): # TODO(jreback) # clip on mixed integer or floats # with integer clippers coerces to float - df = DataFrame({'A': [1, 2, 3], - 'B': [1., np.nan, 3.]}) + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) result = df.clip(1, 2) - expected = DataFrame({'A': [1, 2, 2], - 'B': [1., np.nan, 2.]}) + expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) tm.assert_frame_equal(result, expected, check_like=True) # GH 24162, clipping now preserves numeric types per column - df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], - columns=['foo', 'bar', 'baz']) + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) expected = df.dtypes result = df.clip(upper=3).dtypes tm.assert_series_equal(result, expected) @@ -2093,21 +2343,20 @@ def test_clip_against_series(self, inplace): @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) - @pytest.mark.parametrize("axis,res", [ - (0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]), - (1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]]) - ]) - def test_clip_against_list_like(self, simple_frame, - inplace, lower, axis, res): + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): # GH 15390 original = simple_frame.copy(deep=True) - result = original.clip(lower=lower, upper=[5, 6, 7], - axis=axis, inplace=inplace) + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - expected = pd.DataFrame(res, - columns=original.columns, - index=original.index) + expected = pd.DataFrame(res, columns=original.columns, index=original.index) if inplace: result = original tm.assert_frame_equal(result, expected, check_exact=True) @@ -2130,16 +2379,15 @@ def test_clip_against_frame(self, axis): def test_clip_against_unordered_columns(self): # GH 20911 - df1 = DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D']) - df2 = DataFrame(np.random.randn(1000, 4), columns=['D', 'A', 'B', 'C']) - df3 = DataFrame(df2.values - 1, columns=['B', 'D', 'C', 'A']) + df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) + df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) result_upper = df1.clip(lower=0, upper=df2) expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) result_lower = df1.clip(lower=df3, upper=3) expected_lower = df1.clip(lower=df3[df1.columns], upper=3) result_lower_upper = df1.clip(lower=df3, upper=df2) - expected_lower_upper = 
df1.clip(lower=df3[df1.columns], - upper=df2[df1.columns]) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) tm.assert_frame_equal(result_upper, expected_upper) tm.assert_frame_equal(result_lower, expected_lower) tm.assert_frame_equal(result_lower_upper, expected_lower_upper) @@ -2148,48 +2396,50 @@ def test_clip_with_na_args(self, float_frame): """Should process np.nan argument as None """ # GH 17276 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) - tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), - float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) # GH 19992 - df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6], - 'col_2': [7, 8, 9]}) + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) result = df.clip(lower=[4, 5, np.nan], axis=0) - expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan], - 'col_2': [7, 8, np.nan]}) + expected = DataFrame( + {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + ) tm.assert_frame_equal(result, expected) result = df.clip(lower=[4, 5, np.nan], axis=1) - expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6], - 'col_2': [np.nan, np.nan, np.nan]}) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + ) tm.assert_frame_equal(result, expected) # --------------------------------------------------------------------- # Matrix-like def test_dot(self): - a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], - columns=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], - columns=['one', 'two']) + a = DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] + ) result = a.dot(b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) # Check alignment b1 = b.reindex(index=reversed(b.index)) result = a.dot(b) tm.assert_frame_equal(result, expected) # Check series argument - result = a.dot(b['one']) - tm.assert_series_equal(result, expected['one'], check_names=False) + result = a.dot(b["one"]) + tm.assert_series_equal(result, expected["one"], check_names=False) assert result.name is None - result = a.dot(b1['one']) - tm.assert_series_equal(result, expected['one'], check_names=False) + result = a.dot(b1["one"]) + tm.assert_series_equal(result, expected["one"], check_names=False) assert result.name is None # can pass correct-length arrays @@ -2199,7 +2449,7 @@ def test_dot(self): expected = a.dot(a.iloc[0]) tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match='Dot product shape mismatch'): + with pytest.raises(ValueError, match="Dot product shape mismatch"): a.dot(row[:-1]) a = np.random.rand(1, 5) @@ -2213,32 +2463,31 @@ def test_dot(self): result = A.dot(b) # unaligned - df = DataFrame(np.random.randn(3, 4), - index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), - index=range(5), columns=[1, 2, 3]) + df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) + df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - with pytest.raises(ValueError, match='aligned'): + with pytest.raises(ValueError, match="aligned"): df.dot(df2) def 
test_matmul(self): # matmul test is for GH 10259 - a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], - columns=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], - columns=['one', 'two']) + a = DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] + ) # DataFrame @ DataFrame result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # DataFrame @ Series result = operator.matmul(a, b.one) - expected = Series(np.dot(a.values, b.one.values), - index=['a', 'b', 'c']) + expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"]) tm.assert_series_equal(result, expected) # np.array @ DataFrame @@ -2251,168 +2500,192 @@ def test_matmul(self): # nested list @ DataFrame (__rmatmul__) result = operator.matmul(a.values.tolist(), b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_almost_equal(result.values, expected.values) # mixed dtype DataFrame @ DataFrame - a['q'] = a.q.round().astype(int) + a["q"] = a.q.round().astype(int) result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # different dtypes DataFrame @ DataFrame a = a.astype(int) result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # unaligned - df = DataFrame(np.random.randn(3, 4), - index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), - index=range(5), columns=[1, 2, 3]) + df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) + df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - with pytest.raises(ValueError, match='aligned'): + with pytest.raises(ValueError, match="aligned"): operator.matmul(df, df2) @pytest.fixture def df_duplicates(): - return pd.DataFrame({'a': [1, 2, 3, 4, 4], - 'b': [1, 1, 1, 1, 1], - 'c': [0, 1, 2, 5, 4]}, - index=[0, 0, 1, 1, 1]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) @pytest.fixture def df_strings(): - return pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10]), - 'c': np.random.permutation(10).astype('float64')}) + return pd.DataFrame( + { + "a": np.random.permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.permutation(10).astype("float64"), + } + ) @pytest.fixture def df_main_dtypes(): return pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 
'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) class TestNLargestNSmallest: - dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot " - "use method {method!r} with this dtype") + dtype_error_msg_template = ( + "Column {column!r} has dtype {dtype}, cannot " + "use method {method!r} with this dtype" + ) # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize('order', [ - ['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c']]) - @pytest.mark.parametrize('n', range(1, 11)) + @pytest.mark.parametrize( + "order", + [ + ["a"], + ["c"], + ["a", "b"], + ["a", "c"], + ["b", "a"], + ["b", "c"], + ["a", "b", "c"], + ["c", "a", "b"], + ["c", "b", "a"], + ["b", "c", "a"], + ["b", "a", "c"], + # dups! + ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) def test_n(self, df_strings, nselect_method, n, order): # GH 10393 df = df_strings - if 'b' in order: + if "b" in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=nselect_method, dtype='object') + column="b", method=nselect_method, dtype="object" + ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(n, order) else: - ascending = nselect_method == 'nsmallest' + ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('columns', [ - ['group', 'category_string'], ['group', 'string']]) + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=col, method=nselect_method, dtype=df[col].dtype) + column=col, method=nselect_method, dtype=df[col].dtype + ) # escape some characters that may be in the repr - error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") - .replace("[", "\\[").replace("]", "\\]")) + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes - df.nsmallest(2, list(set(df) - {'category_string', 'string'})) - df.nlargest(2, list(set(df) - {'category_string', 'string'})) - - @pytest.mark.parametrize('method,expected', [ - ('nlargest', - pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]}, - index=[2, 1, 0, 3])), - ('nsmallest', - pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]}, - index=[5, 4, 3, 0]))]) + df.nsmallest(2, 
list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + @pytest.mark.parametrize( + "method,expected", + [ + ( + "nlargest", + pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ), + ), + ( + "nsmallest", + pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ), + ), + ], + ) def test_duplicates_on_starter_columns(self, method, expected): # regression test for #22752 - df = pd.DataFrame({ - 'a': [2, 2, 2, 1, 1, 1], - 'b': [1, 2, 3, 3, 2, 1] - }) + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) - result = getattr(df, method)(4, columns=['a', 'b']) + result = getattr(df, method)(4, columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_n_identical_values(self): # GH 15297 - df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]}) + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) - result = df.nlargest(3, 'a') - expected = pd.DataFrame( - {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2] - ) + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) - result = df.nsmallest(3, 'a') - expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('order', [ - ['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']]) - @pytest.mark.parametrize('n', range(1, 6)) + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 @@ -2427,38 +2700,45 @@ def test_n_duplicate_index(self, df_duplicates, n, order): def test_duplicate_keep_all_ties(self): # GH 16818 - df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3], - 'b': [10, 9, 8, 7, 5, 50, 10, 20]}) - result = df.nlargest(4, 'a', keep='all') - expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3, - 5: 3, 6: 3, 7: 3}, - 'b': {0: 10, 1: 9, 2: 8, 4: 5, - 5: 50, 6: 10, 7: 20}}) + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) tm.assert_frame_equal(result, expected) - result = df.nsmallest(2, 'a', keep='all') - expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}}) + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) tm.assert_frame_equal(result, expected) def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 df = DataFrame([1.0, 1.0, 1.0]) - df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]}) + df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]}) s = Series([1, 1, 1]) s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): with tm.assert_produces_warning(FutureWarning): df_nan.clip_lower(s, axis=0) - for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: + for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) def test_series_nat_conversion(self): # GH 18521 # Check rank does not mutate DataFrame - df = DataFrame(np.random.randn(10, 
3), dtype='float64') + df = DataFrame(np.random.randn(10, 3), dtype="float64") expected = df.copy() df.rank() result = df @@ -2468,15 +2748,16 @@ def test_multiindex_column_lookup(self): # Check whether tuples are correctly treated as multi-level lookups. # GH 23033 df = pd.DataFrame( - columns=pd.MultiIndex.from_product([['x'], ['a', 'b']]), - data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]]) + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) # nsmallest - result = df.nsmallest(3, ('x', 'a')) + result = df.nsmallest(3, ("x", "a")) expected = df.iloc[[2, 0, 3]] tm.assert_frame_equal(result, expected) # nlargest - result = df.nlargest(3, ('x', 'b')) + result = df.nlargest(3, ("x", "b")) expected = df.iloc[[3, 2, 1]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 6372029f2efe7a..93508d7ddc50bd 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,11 +7,21 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, SparseDataFrame, SparseDtype, compat, - date_range, timedelta_range) + Categorical, + DataFrame, + Series, + SparseDataFrame, + SparseDtype, + compat, + date_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) class SharedWithSparse: @@ -22,6 +32,7 @@ class SharedWithSparse: ``self._assert_series_equal()`` which are implemented in sub-classes and dispatch correctly. """ + def _assert_frame_equal(self, left, right): """Dispatch to frame class dependent assertion""" raise NotImplementedError @@ -33,79 +44,78 @@ def _assert_series_equal(self, left, right): def test_copy_index_name_checking(self, float_frame): # don't want to be able to modify the index stored elsewhere after # making a copy - for attr in ('index', 'columns'): + for attr in ("index", "columns"): ind = getattr(float_frame, attr) ind.name = None cp = float_frame.copy() - getattr(cp, attr).name = 'foo' + getattr(cp, attr).name = "foo" assert getattr(float_frame, attr).name is None def test_getitem_pop_assign_name(self, float_frame): - s = float_frame['A'] - assert s.name == 'A' + s = float_frame["A"] + assert s.name == "A" - s = float_frame.pop('A') - assert s.name == 'A' + s = float_frame.pop("A") + assert s.name == "A" - s = float_frame.loc[:, 'B'] - assert s.name == 'B' + s = float_frame.loc[:, "B"] + assert s.name == "B" s2 = s.loc[:] - assert s2.name == 'B' + assert s2.name == "B" def test_get_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = float_frame.get_value(idx, col) expected = float_frame[col][idx] tm.assert_almost_equal(result, expected) def test_add_prefix_suffix(self, float_frame): - with_prefix = float_frame.add_prefix('foo#') - expected = pd.Index(['foo#%s' % c for c in float_frame.columns]) + with_prefix = float_frame.add_prefix("foo#") + expected = pd.Index(["foo#%s" % c for c in float_frame.columns]) tm.assert_index_equal(with_prefix.columns, expected) - with_suffix = float_frame.add_suffix('#foo') - expected = pd.Index(['%s#foo' % c for c in float_frame.columns]) + with_suffix = float_frame.add_suffix("#foo") + 
expected = pd.Index(["%s#foo" % c for c in float_frame.columns]) tm.assert_index_equal(with_suffix.columns, expected) - with_pct_prefix = float_frame.add_prefix('%') - expected = pd.Index(['%{}'.format(c) for c in float_frame.columns]) + with_pct_prefix = float_frame.add_prefix("%") + expected = pd.Index(["%{}".format(c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_prefix.columns, expected) - with_pct_suffix = float_frame.add_suffix('%') - expected = pd.Index(['{}%'.format(c) for c in float_frame.columns]) + with_pct_suffix = float_frame.add_suffix("%") + expected = pd.Index(["{}%".format(c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_suffix.columns, expected) def test_get_axis(self, float_frame): f = float_frame assert f._get_axis_number(0) == 0 assert f._get_axis_number(1) == 1 - assert f._get_axis_number('index') == 0 - assert f._get_axis_number('rows') == 0 - assert f._get_axis_number('columns') == 1 + assert f._get_axis_number("index") == 0 + assert f._get_axis_number("rows") == 0 + assert f._get_axis_number("columns") == 1 - assert f._get_axis_name(0) == 'index' - assert f._get_axis_name(1) == 'columns' - assert f._get_axis_name('index') == 'index' - assert f._get_axis_name('rows') == 'index' - assert f._get_axis_name('columns') == 'columns' + assert f._get_axis_name(0) == "index" + assert f._get_axis_name(1) == "columns" + assert f._get_axis_name("index") == "index" + assert f._get_axis_name("rows") == "index" + assert f._get_axis_name("columns") == "columns" assert f._get_axis(0) is f.index assert f._get_axis(1) is f.columns - with pytest.raises(ValueError, match='No axis named'): + with pytest.raises(ValueError, match="No axis named"): f._get_axis_number(2) - with pytest.raises(ValueError, match='No axis.*foo'): - f._get_axis_name('foo') + with pytest.raises(ValueError, match="No axis.*foo"): + f._get_axis_name("foo") - with pytest.raises(ValueError, match='No axis.*None'): + with pytest.raises(ValueError, match="No axis.*None"): f._get_axis_name(None) - with pytest.raises(ValueError, match='No axis named'): + with pytest.raises(ValueError, match="No axis named"): f._get_axis_number(None) def test_keys(self, float_frame): @@ -118,28 +128,28 @@ def test_column_contains_raises(self, float_frame): def test_tab_completion(self): # DataFrame whose columns are identifiers shall have them in __dir__. - df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD')) - for key in list('ABCD'): + df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD")) + for key in list("ABCD"): assert key in dir(df) - assert isinstance(df.__getitem__('A'), pd.Series) + assert isinstance(df.__getitem__("A"), pd.Series) # DataFrame whose first-level columns are identifiers shall have # them in __dir__. 
df = pd.DataFrame( - [list('abcd'), list('efgh')], - columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH')))) - for key in list('ABCD'): + [list("abcd"), list("efgh")], + columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))), + ) + for key in list("ABCD"): assert key in dir(df) - for key in list('EFGH'): + for key in list("EFGH"): assert key not in dir(df) - assert isinstance(df.__getitem__('A'), pd.DataFrame) + assert isinstance(df.__getitem__("A"), pd.DataFrame) def test_not_hashable(self): empty_frame = DataFrame() df = self.klass([1]) - msg = ("'(Sparse)?DataFrame' objects are mutable, thus they cannot be" - " hashed") + msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be" " hashed" with pytest.raises(TypeError, match=msg): hash(df) with pytest.raises(TypeError, match=msg): @@ -148,11 +158,11 @@ def test_not_hashable(self): def test_new_empty_index(self): df1 = self.klass(np.random.randn(0, 3)) df2 = self.klass(np.random.randn(0, 3)) - df1.index.name = 'foo' + df1.index.name = "foo" assert df2.index.name is None def test_array_interface(self, float_frame): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = np.sqrt(float_frame) assert isinstance(result, type(float_frame)) assert result.index is float_frame.index @@ -179,20 +189,18 @@ def test_nonzero(self, float_frame, float_string_frame): assert not float_string_frame.empty # corner case - df = DataFrame({'A': [1., 2., 3.], - 'B': ['a', 'b', 'c']}, - index=np.arange(3)) - del df['A'] + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3)) + del df["A"] assert not df.empty def test_iteritems(self): - df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + df = self.klass([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) for k, v in df.items(): assert isinstance(v, self.klass._constructor_sliced) def test_items(self): # GH 17213, GH 13918 - cols = ['a', 'b', 'c'] + cols = ["a", "b", "c"] df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) for c, (k, v) in zip(cols, df.items()): assert c == k @@ -214,11 +222,14 @@ def test_iterrows(self, float_frame, float_string_frame): def test_iterrows_iso8601(self): # GH 19671 if self.klass == SparseDataFrame: - pytest.xfail(reason='SparseBlock datetime type not implemented.') + pytest.xfail(reason="SparseBlock datetime type not implemented.") s = self.klass( - {'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'], - 'iso8601': date_range('2000-01-01', periods=4, freq='M')}) + { + "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], + "iso8601": date_range("2000-01-01", periods=4, freq="M"), + } + ) for k, v in s.iterrows(): exp = s.loc[k] self._assert_series_equal(v, exp) @@ -226,11 +237,22 @@ def test_iterrows_iso8601(self): def test_iterrows_corner(self): # gh-12222 df = DataFrame( - {'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'c': [None], - 'd': [''], 'e': [[]], 'f': [set()], 'g': [{}]}) + { + "a": [datetime.datetime(2015, 1, 1)], + "b": [None], + "c": [None], + "d": [""], + "e": [[]], + "f": [set()], + "g": [{}], + } + ) expected = Series( - [datetime.datetime(2015, 1, 1), None, None, '', [], set(), {}], - index=list('abcdefg'), name=0, dtype='object') + [datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}], + index=list("abcdefg"), + name=0, + dtype="object", + ) _, result = next(df.iterrows()) tm.assert_series_equal(result, expected) @@ -241,46 +263,49 @@ def test_itertuples(self, float_frame): expected = float_frame.iloc[i, :].reset_index(drop=True) self._assert_series_equal(s, 
expected) - df = self.klass({'floats': np.random.randn(5), - 'ints': range(5)}, columns=['floats', 'ints']) + df = self.klass( + {"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"] + ) for tup in df.itertuples(index=False): assert isinstance(tup[1], int) df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]}) - dfaa = df[['a', 'a']] + dfaa = df[["a", "a"]] - assert (list(dfaa.itertuples()) == - [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) + assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)] # repr with int on 32-bit/windows if not (compat.is_platform_windows() or compat.is_platform_32bit()): - assert (repr(list(df.itertuples(name=None))) == - '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') + assert ( + repr(list(df.itertuples(name=None))) + == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" + ) - tup = next(df.itertuples(name='TestName')) - assert tup._fields == ('Index', 'a', 'b') + tup = next(df.itertuples(name="TestName")) + assert tup._fields == ("Index", "a", "b") assert (tup.Index, tup.a, tup.b) == tup - assert type(tup).__name__ == 'TestName' + assert type(tup).__name__ == "TestName" - df.columns = ['def', 'return'] - tup2 = next(df.itertuples(name='TestName')) + df.columns = ["def", "return"] + tup2 = next(df.itertuples(name="TestName")) assert tup2 == (0, 1, 4) - assert tup2._fields == ('Index', '_1', '_2') + assert tup2._fields == ("Index", "_1", "_2") - df3 = DataFrame({'f' + str(i): [i] for i in range(1024)}) + df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert not hasattr(tup3, '_fields') + assert not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) def test_sequence_like_with_categorical(self): # GH 7839 # make sure can iterate - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) - df['grade'] = Categorical(df['raw_grade']) + df = DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + df["grade"] = Categorical(df["raw_grade"]) # basic sequencing testing result = list(df.grade.values) @@ -314,16 +339,16 @@ def test_values(self, float_frame, float_string_frame): assert value == frame[col][i] # mixed type - arr = float_string_frame[['foo', 'A']].values - assert arr[0, 0] == 'bar' + arr = float_string_frame[["foo", "A"]].values + assert arr[0, 0] == "bar" - df = self.klass({'complex': [1j, 2j, 3j], 'real': [1, 2, 3]}) + df = self.klass({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) arr = df.values assert arr[0, 0] == 1j # single block corner case - arr = float_frame[['A', 'B']].values - expected = float_frame.reindex(columns=['A', 'B']).values + arr = float_frame[["A", "B"]].values + expected = float_frame.reindex(columns=["A", "B"]).values assert_almost_equal(arr, expected) def test_to_numpy(self): @@ -368,8 +393,10 @@ def test_swapaxes(self): self._assert_frame_equal(df.T, df.swapaxes(0, 1)) self._assert_frame_equal(df.T, df.swapaxes(1, 0)) self._assert_frame_equal(df, df.swapaxes(0, 0)) - msg = ("No axis named 2 for object type" - r" ") + msg = ( + "No axis named 2 for object type" + r" " + ) with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) @@ -378,11 +405,11 @@ def test_axis_aliases(self, float_frame): # reg name expected = f.sum(axis=0) - result = f.sum(axis='index') + result = f.sum(axis="index") assert_series_equal(result, expected) expected = f.sum(axis=1) - result = f.sum(axis='columns') + result = f.sum(axis="columns") assert_series_equal(result, expected) def 
test_class_axis(self): @@ -396,10 +423,11 @@ def test_more_values(self, float_string_frame): assert values.shape[1] == len(float_string_frame.columns) def test_repr_with_mi_nat(self, float_string_frame): - df = self.klass({'X': [1, 2]}, - index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']]) + df = self.klass( + {"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]] + ) result = repr(df) - expected = ' X\nNaT a 1\n2013-01-01 b 2' + expected = " X\nNaT a 1\n2013-01-01 b 2" assert result == expected def test_iteritems_names(self, float_string_frame): @@ -416,26 +444,32 @@ def test_empty_nonzero(self): assert not df.empty df = self.klass(index=[1], columns=[1]) assert not df.empty - df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna() + df = self.klass(index=["a", "b"], columns=["c", "d"]).dropna() assert df.empty assert df.T.empty - empty_frames = [self.klass(), - self.klass(index=[1]), - self.klass(columns=[1]), - self.klass({1: []})] + empty_frames = [ + self.klass(), + self.klass(index=[1]), + self.klass(columns=[1]), + self.klass({1: []}), + ] for df in empty_frames: assert df.empty assert df.T.empty def test_with_datetimelikes(self): - df = self.klass({'A': date_range('20130101', periods=10), - 'B': timedelta_range('1 day', periods=10)}) + df = self.klass( + { + "A": date_range("20130101", periods=10), + "B": timedelta_range("1 day", periods=10), + } + ) t = df.T result = t.dtypes.value_counts() if self.klass is DataFrame: - expected = Series({np.dtype('object'): 10}) + expected = Series({np.dtype("object"): 10}) else: expected = Series({SparseDtype(dtype=object): 10}) tm.assert_series_equal(result, expected) @@ -449,7 +483,7 @@ class TestDataFrameMisc(SharedWithSparse): _assert_series_equal = staticmethod(assert_series_equal) def test_values(self, float_frame): - float_frame.values[:, 0] = 5. 
+ float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() def test_as_matrix_deprecated(self, float_frame): @@ -462,10 +496,10 @@ def test_as_matrix_deprecated(self, float_frame): def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) - series = cp['A'] + series = cp["A"] series[:] = 10 for idx, value in series.items(): - assert float_frame['A'][idx] != value + assert float_frame["A"][idx] != value def test_transpose_get_view(self, float_frame): dft = float_frame.T @@ -476,9 +510,9 @@ def test_transpose_get_view(self, float_frame): def test_inplace_return_self(self): # GH 1893 - data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'], - 'b': [0, 0, 1, 1], - 'c': [1, 2, 3, 4]}) + data = DataFrame( + {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]} + ) def _check_f(base, f): result = f(base) @@ -487,19 +521,19 @@ def _check_f(base, f): # -----DataFrame----- # set_index - f = lambda x: x.set_index('a', inplace=True) + f = lambda x: x.set_index("a", inplace=True) _check_f(data.copy(), f) # reset_index f = lambda x: x.reset_index(inplace=True) - _check_f(data.set_index('a'), f) + _check_f(data.set_index("a"), f) # drop_duplicates f = lambda x: x.drop_duplicates(inplace=True) _check_f(data.copy(), f) # sort - f = lambda x: x.sort_values('b', inplace=True) + f = lambda x: x.sort_values("b", inplace=True) _check_f(data.copy(), f) # sort_index @@ -515,15 +549,15 @@ def _check_f(base, f): _check_f(data.copy(), f) # rename - f = lambda x: x.rename({1: 'foo'}, inplace=True) + f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(data.copy(), f) # -----Series----- - d = data.copy()['c'] + d = data.copy()["c"] # reset_index f = lambda x: x.reset_index(inplace=True, drop=True) - _check_f(data.set_index('a')['c'], f) + _check_f(data.set_index("a")["c"], f) # fillna f = lambda x: x.fillna(0, inplace=True) @@ -534,22 +568,22 @@ def _check_f(base, f): _check_f(d.copy(), f) # rename - f = lambda x: x.rename({1: 'foo'}, inplace=True) + f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) def test_tab_complete_warning(self, ip): # GH 16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; df = pd.DataFrame()" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('df.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("df.", 1)) def test_get_values_deprecated(self): - df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) with tm.assert_produces_warning(FutureWarning): res = df.get_values() tm.assert_numpy_array_equal(res, df.values) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 0c09956b3f2fb7..92912ff9ec0932 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -24,22 +24,23 @@ def int_frame_const_col(): Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] """ - df = DataFrame(np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, - columns=['A', 'B', 'C']) + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) return df class TestDataFrameApply: - def test_apply(self, float_frame): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # ufunc applied = float_frame.apply(np.sqrt) - 
tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A']) + tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) # aggregator applied = float_frame.apply(np.mean) - assert applied['A'] == np.mean(float_frame['A']) + assert applied["A"] == np.mean(float_frame["A"]) d = float_frame.index[0] applied = float_frame.apply(np.mean, axis=1) @@ -47,25 +48,27 @@ def test_apply(self, float_frame): assert applied.index is float_frame.index # want this # invalid axis - df = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) with pytest.raises(ValueError): df.apply(lambda x: x, 2) # GH 9573 - df = DataFrame({'c0': ['A', 'A', 'B', 'B'], - 'c1': ['C', 'C', 'D', 'D']}) - df = df.apply(lambda ts: ts.astype('category')) + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + df = df.apply(lambda ts: ts.astype("category")) assert df.shape == (4, 2) - assert isinstance(df['c0'].dtype, CategoricalDtype) - assert isinstance(df['c1'].dtype, CategoricalDtype) + assert isinstance(df["c0"].dtype, CategoricalDtype) + assert isinstance(df["c1"].dtype, CategoricalDtype) def test_apply_mixed_datetimelike(self): # mixed datetimelike # GH 7778 - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': pd.to_timedelta(np.arange(3), unit='s')}) + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": pd.to_timedelta(np.arange(3), unit="s"), + } + ) result = df.apply(lambda x: x, axis=1) assert_frame_equal(result, df) @@ -90,8 +93,8 @@ def test_apply_empty(self, float_frame): assert_series_equal(result, expected) # GH 2476 - expected = DataFrame(index=['a']) - result = expected.apply(lambda x: x['a'], axis=1) + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1) assert_frame_equal(expected, result) def test_apply_with_reduce_empty(self): @@ -99,18 +102,16 @@ def test_apply_with_reduce_empty(self): empty_frame = DataFrame() x = [] - result = empty_frame.apply(x.append, axis=1, result_type='expand') + result = empty_frame.apply(x.append, axis=1, result_type="expand") assert_frame_equal(result, empty_frame) - result = empty_frame.apply(x.append, axis=1, result_type='reduce') - assert_series_equal(result, Series( - [], index=pd.Index([], dtype=object))) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) - empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, result_type='expand') + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') - assert_series_equal(result, Series( - [], index=pd.Index([], dtype=object))) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] @@ -123,24 +124,26 @@ def test_apply_deprecate_reduce(self): empty_frame.apply(x.append, axis=1, reduce=True) def test_apply_standard_nonunique(self): - df = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) result = df.apply(lambda s: s[0], axis=1) - expected = Series([1, 4, 7], ['a', 'a', 'c']) + expected = 
Series([1, 4, 7], ["a", "a", "c"]) assert_series_equal(result, expected) result = df.T.apply(lambda s: s[0], axis=0) assert_series_equal(result, expected) - @pytest.mark.parametrize('func', ['sum', 'mean', 'min', 'max', 'std']) - @pytest.mark.parametrize('args,kwds', [ - pytest.param([], {}, id='no_args_or_kwds'), - pytest.param([1], {}, id='axis_from_args'), - pytest.param([], {'axis': 1}, id='axis_from_kwds'), - pytest.param([], {'numeric_only': True}, id='optional_kwds'), - pytest.param([1, None], {'numeric_only': True}, id='args_and_kwds') - ]) + @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) + @pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], + ) def test_apply_with_string_funcs(self, float_frame, func, args, kwds): result = float_frame.apply(func, *args, **kwds) expected = getattr(float_frame, func)(*args, **kwds) @@ -153,11 +156,11 @@ def test_apply_broadcast_deprecated(self, float_frame): def test_apply_broadcast(self, float_frame, int_frame_const_col): # scalars - result = float_frame.apply(np.mean, result_type='broadcast') + result = float_frame.apply(np.mean, result_type="broadcast") expected = DataFrame([float_frame.mean()], index=float_frame.index) tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.mean, axis=1, result_type='broadcast') + result = float_frame.apply(np.mean, axis=1, result_type="broadcast") m = float_frame.mean(axis=1) expected = DataFrame({c: m for c in float_frame.columns}) tm.assert_frame_equal(result, expected) @@ -166,31 +169,39 @@ def test_apply_broadcast(self, float_frame, int_frame_const_col): result = float_frame.apply( lambda x: list(range(len(float_frame.columns))), axis=1, - result_type='broadcast') + result_type="broadcast", + ) m = list(range(len(float_frame.columns))) - expected = DataFrame([m] * len(float_frame.index), - dtype='float64', - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) tm.assert_frame_equal(result, expected) - result = float_frame.apply(lambda x: - list(range(len(float_frame.index))), - result_type='broadcast') + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) m = list(range(len(float_frame.index))) - expected = DataFrame({c: m for c in float_frame.columns}, - dtype='float64', - index=float_frame.index) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) tm.assert_frame_equal(result, expected) # preserve columns df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") tm.assert_frame_equal(result, df) df = int_frame_const_col - result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), - axis=1, result_type='broadcast') + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) expected = df.copy() tm.assert_frame_equal(result, expected) @@ -199,15 +210,18 @@ def test_apply_broadcast_error(self, int_frame_const_col): # > 1 ndim with pytest.raises(ValueError): - df.apply(lambda 
x: np.array([1, 2]).reshape(-1, 2), - axis=1, result_type='broadcast') + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type="broadcast", + ) # cannot broadcast with pytest.raises(ValueError): - df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") with pytest.raises(ValueError): - df.apply(lambda x: Series([1, 2]), axis=1, result_type='broadcast') + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") def test_apply_raw(self, float_frame): result0 = float_frame.apply(np.mean, raw=True) @@ -230,38 +244,37 @@ def test_apply_axis1(self, float_frame): assert tapplied[d] == np.mean(float_frame.xs(d)) def test_apply_ignore_failures(self, float_string_frame): - result = frame_apply(float_string_frame, np.mean, 0, - ignore_failures=True).apply_standard() + result = frame_apply( + float_string_frame, np.mean, 0, ignore_failures=True + ).apply_standard() expected = float_string_frame._get_numeric_data().apply(np.mean) assert_series_equal(result, expected) def test_apply_mixed_dtype_corner(self): - df = DataFrame({'A': ['foo'], - 'B': [1.]}) + df = DataFrame({"A": ["foo"], "B": [1.0]}) result = df[:0].apply(np.mean, axis=1) # the result here is actually kind of ambiguous, should it be a Series # or a DataFrame? - expected = Series(np.nan, index=pd.Index([], dtype='int64')) + expected = Series(np.nan, index=pd.Index([], dtype="int64")) assert_series_equal(result, expected) - df = DataFrame({'A': ['foo'], - 'B': [1.]}) - result = df.apply(lambda x: x['A'], axis=1) - expected = Series(['foo'], index=[0]) + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) assert_series_equal(result, expected) - result = df.apply(lambda x: x['B'], axis=1) - expected = Series([1.], index=[0]) + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) assert_series_equal(result, expected) def test_apply_empty_infer_type(self): - no_cols = DataFrame(index=['a', 'b', 'c']) - no_index = DataFrame(columns=['a', 'b', 'c']) + no_cols = DataFrame(index=["a", "b", "c"]) + no_index = DataFrame(columns=["a", "b", "c"]) def _check(df, f): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - test_res = f(np.array([], dtype='f8')) + test_res = f(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) def _checkit(axis=0, raw=False): @@ -278,13 +291,13 @@ def _checkit(axis=0, raw=False): _checkit(raw=True) _checkit(axis=0, raw=True) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): _check(no_cols, lambda x: x) _check(no_cols, lambda x: x.mean()) _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self, float_frame): @@ -306,7 +319,7 @@ def subtract_and_divide(x, sub, divide=1): assert_series_equal(result, expected) result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) - expected = float_frame.apply(lambda x: (x - 2.) / 2.) 
+ expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) assert_frame_equal(result, expected) def test_apply_yield_list(self, float_frame): @@ -314,7 +327,7 @@ def test_apply_yield_list(self, float_frame): assert_frame_equal(result, float_frame) def test_apply_reduce_Series(self, float_frame): - float_frame.loc[::2, 'A'] = np.nan + float_frame.loc[::2, "A"] = np.nan expected = float_frame.mean(1) result = float_frame.apply(np.mean, axis=1) assert_series_equal(result, expected) @@ -330,86 +343,165 @@ def test_apply_differently_indexed(self): df = DataFrame(np.random.randn(20, 10)) result0 = df.apply(Series.describe, axis=0) - expected0 = DataFrame({i: v.describe() - for i, v in df.items()}, - columns=df.columns) + expected0 = DataFrame( + {i: v.describe() for i, v in df.items()}, columns=df.columns + ) assert_frame_equal(result0, expected0) result1 = df.apply(Series.describe, axis=1) - expected1 = DataFrame({i: v.describe() - for i, v in df.T.items()}, - columns=df.index).T + expected1 = DataFrame( + {i: v.describe() for i, v in df.T.items()}, columns=df.index + ).T assert_frame_equal(result1, expected1) def test_apply_modify_traceback(self): - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - data.loc[4, 'C'] = np.nan + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + data.loc[4, "C"] = np.nan def transform(row): - if row['C'].startswith('shin') and row['A'] == 'foo': - row['D'] = 7 + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 return row def transform2(row): - if (notna(row['C']) and row['C'].startswith('shin') and - row['A'] == 'foo'): - row['D'] = 7 + if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 return row try: data.apply(transform, axis=1) except AttributeError as e: assert len(e.args) == 2 - assert e.args[1] == 'occurred at index 4' + assert e.args[1] == "occurred at index 4" assert e.args[0] == "'float' object has no attribute 'startswith'" def test_apply_bug(self): # GH 6125 - positions = pd.DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20], - [1, 'DEF0', 20], [2, 'ABC1', 50], - [2, 'YUM1', 20], [2, 'DEF1', 20]], - columns=['a', 'market', 'position']) + positions = pd.DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) def f(r): - return r['market'] + return r["market"] + expected = positions.apply(f, axis=1) - positions = DataFrame([[datetime(2013, 1, 1), 'ABC0', 50], - [datetime(2013, 1, 2), 'YUM0', 20], - [datetime(2013, 1, 3), 'DEF0', 20], - [datetime(2013, 1, 4), 'ABC1', 50], - [datetime(2013, 1, 5), 'YUM1', 20], - [datetime(2013, 1, 6), 'DEF1', 20]], - columns=['a', 'market', 'position']) + positions = DataFrame( + [ + [datetime(2013, 1, 
1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) result = positions.apply(f, axis=1) assert_series_equal(result, expected) def test_apply_convert_objects(self): - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) result = data.apply(lambda x: x, axis=1) assert_frame_equal(result._convert(datetime=True), data) @@ -425,44 +517,43 @@ def test_apply_attach_name(self, float_frame): # non-reductions result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) - expected = DataFrame(np.tile(float_frame.columns, - (len(float_frame.index), 1)), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(result, expected) - result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), - axis=1) - expected = Series(np.repeat(t[0], len(float_frame.columns)) - for t in float_frame.itertuples()) + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) expected.index = float_frame.index assert_series_equal(result, expected) def test_apply_multi_index(self, float_frame): - index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s = DataFrame([[1, 2], [3, 4], [5, 6]], - index=index, - columns=['col1', 'col2']) - result = s.apply( - lambda x: Series({'min': min(x), 'max': max(x)}), 1) - expected = DataFrame([[1, 2], [3, 4], [5, 6]], - index=index, - columns=['min', 'max']) + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"] + ) assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 - A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) + A_dicts = Series( + [dict([(0, "foo"), (1, "spam")]), dict([(0, "bar"), (1, "eggs")])] + ) B = DataFrame([[0, 1], [2, 3]]) B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, result_type='reduce') - reduce_false = df.apply(fn, 
result_type='expand') + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) @@ -476,21 +567,21 @@ def test_applymap(self, float_frame): # GH 465: function returning tuples result = float_frame.applymap(lambda x: (x, x)) - assert isinstance(result['A'][0], tuple) + assert isinstance(result["A"][0], tuple) # GH 2909: object conversion to float in constructor? - df = DataFrame(data=[1, 'a']) + df = DataFrame(data=[1, "a"]) result = df.applymap(lambda x: x) assert result.dtypes[0] == object - df = DataFrame(data=[1., 'a']) + df = DataFrame(data=[1.0, "a"]) result = df.applymap(lambda x: x) assert result.dtypes[0] == object # GH 2786 df = DataFrame(np.random.random((3, 4))) df2 = df.copy() - cols = ['a', 'a', 'a', 'a'] + cols = ["a", "a", "a", "a"] df.columns = cols expected = df2.applymap(str) @@ -499,17 +590,19 @@ def test_applymap(self, float_frame): tm.assert_frame_equal(result, expected) # datetime/timedelta - df['datetime'] = Timestamp('20130101') - df['timedelta'] = pd.Timedelta('1 min') + df["datetime"] = Timestamp("20130101") + df["timedelta"] = pd.Timedelta("1 min") result = df.applymap(str) - for f in ['datetime', 'timedelta']: + for f in ["datetime", "timedelta"]: assert result.loc[0, f] == str(df.loc[0, f]) # GH 8222 - empty_frames = [pd.DataFrame(), - pd.DataFrame(columns=list('ABC')), - pd.DataFrame(index=list('ABC')), - pd.DataFrame({'A': [], 'B': [], 'C': []})] + empty_frames = [ + pd.DataFrame(), + pd.DataFrame(columns=list("ABC")), + pd.DataFrame(index=list("ABC")), + pd.DataFrame({"A": [], "B": [], "C": []}), + ] for frame in empty_frames: for func in [round, lambda x: x]: result = frame.applymap(func) @@ -517,7 +610,7 @@ def test_applymap(self, float_frame): def test_applymap_box_timestamps(self): # GH 2689, GH 2627 - ser = pd.Series(date_range('1/1/2000', periods=10)) + ser = pd.Series(date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) @@ -527,52 +620,65 @@ def func(x): def test_applymap_box(self): # ufunc will not be boxed. 
Same test cases as the test_map_box - df = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02')], - 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')], - 'c': [pd.Timedelta('1 days'), - pd.Timedelta('2 days')], - 'd': [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')]}) - - result = df.applymap(lambda x: '{0}'.format(x.__class__.__name__)) - expected = pd.DataFrame({'a': ['Timestamp', 'Timestamp'], - 'b': ['Timestamp', 'Timestamp'], - 'c': ['Timedelta', 'Timedelta'], - 'd': ['Period', 'Period']}) + df = pd.DataFrame( + { + "a": [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) + + result = df.applymap(lambda x: "{0}".format(x.__class__.__name__)) + expected = pd.DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) tm.assert_frame_equal(result, expected) def test_frame_apply_dont_convert_datetime64(self): from pandas.tseries.offsets import BDay - df = DataFrame({'x1': [datetime(1996, 1, 1)]}) + + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) df = df.applymap(lambda x: x + BDay()) df = df.applymap(lambda x: x + BDay()) - assert df.x1.dtype == 'M8[ns]' + assert df.x1.dtype == "M8[ns]" def test_apply_non_numpy_dtype(self): # GH 12244 - df = DataFrame({'dt': pd.date_range( - "2015-01-01", periods=3, tz='Europe/Brussels')}) + df = DataFrame( + {"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")} + ) result = df.apply(lambda x: x) assert_frame_equal(result, df) - result = df.apply(lambda x: x + pd.Timedelta('1day')) - expected = DataFrame({'dt': pd.date_range( - "2015-01-02", periods=3, tz='Europe/Brussels')}) + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) assert_frame_equal(result, expected) - df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category') + df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") result = df.apply(lambda x: x) assert_frame_equal(result, df) def test_apply_dup_names_multi_agg(self): # GH 21063 - df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a']) - expected = pd.DataFrame([[0, 1]], columns=['a', 'a'], index=['min']) - result = df.agg(['min']) + df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = pd.DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -594,80 +700,93 @@ def test_infer_row_shape(self): def test_with_dictlike_columns(self): # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1) - expected = Series([{'s': 3} for t in df.itertuples()]) + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) assert_series_equal(result, expected) - df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), - pd.Timestamp('2017-05-02 00:00:00')] - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1) + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = 
df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) assert_series_equal(result, expected) # compose a series - result = (df['a'] + df['b']).apply(lambda x: {'s': x}) - expected = Series([{'s': 3}, {'s': 3}]) + result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) assert_series_equal(result, expected) # GH 18775 df = DataFrame() df["author"] = ["X", "Y", "Z"] df["publisher"] = ["BBC", "NBC", "N24"] - df["date"] = pd.to_datetime(['17-10-2010 07:15:30', - '13-05-2011 08:20:35', - '15-01-2013 09:09:09']) + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ) result = df.apply(lambda x: {}, axis=1) expected = Series([{}, {}, {}]) assert_series_equal(result, expected) def test_with_dictlike_columns_with_infer(self): # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='expand') - expected = DataFrame({'s': [3, 3]}) + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) + expected = DataFrame({"s": [3, 3]}) assert_frame_equal(result, expected) - df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), - pd.Timestamp('2017-05-02 00:00:00')] - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='expand') + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) assert_frame_equal(result, expected) def test_with_listlike_columns(self): # GH 17348 - df = DataFrame({'a': Series(np.random.randn(4)), - 'b': ['a', 'list', 'of', 'words'], - 'ts': date_range('2016-10-01', periods=4, freq='H')}) - - result = df[['a', 'b']].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + df = DataFrame( + { + "a": Series(np.random.randn(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="H"), + } + ) + + result = df[["a", "b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) assert_series_equal(result, expected) - result = df[['a', 'ts']].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) assert_series_equal(result, expected) # GH 18919 - df = DataFrame({'x': Series([['a', 'b'], ['q']]), - 'y': Series([['z'], ['q', 't']])}) - df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + df = DataFrame( + {"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])} + ) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) - result = df.apply( - lambda row: [el for el in row['x'] if el in row['y']], - axis=1) - expected = Series([[], ['q']], index=df.index) + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) assert_series_equal(result, expected) def test_infer_output_shape_columns(self): # GH 18573 - df = DataFrame({'number': [1., 2.], - 'string': ['foo', 'bar'], - 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), - pd.Timestamp('2017-11-29 03:45:00')]}) + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + pd.Timestamp("2017-11-29 03:30:00"), + pd.Timestamp("2017-11-29 03:45:00"), + ], + } + ) result = 
df.apply(lambda row: (row.number, row.string), axis=1) expected = Series([(t.number, t.string) for t in df.itertuples()]) assert_series_equal(result, expected) @@ -675,7 +794,7 @@ def test_infer_output_shape_columns(self): def test_infer_output_shape_listlike_columns(self): # GH 16353 - df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) result = df.apply(lambda x: [1, 2, 3], axis=1) expected = Series([[1, 2, 3] for t in df.itertuples()]) @@ -686,26 +805,30 @@ def test_infer_output_shape_listlike_columns(self): assert_series_equal(result, expected) # GH 17970 - df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) result = df.apply(lambda row: np.ones(1), axis=1) - expected = Series([np.ones(1) for t in df.itertuples()], - index=df.index) + expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) assert_series_equal(result, expected) result = df.apply(lambda row: np.ones(2), axis=1) - expected = Series([np.ones(2) for t in df.itertuples()], - index=df.index) + expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) assert_series_equal(result, expected) # GH 17892 - df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), - pd.Timestamp('2010-02-04'), - pd.Timestamp('2010-02-05'), - pd.Timestamp('2010-02-06')], - 'b': [9, 5, 4, 3], - 'c': [5, 3, 4, 2], - 'd': [1, 2, 3, 4]}) + df = pd.DataFrame( + { + "a": [ + pd.Timestamp("2010-02-01"), + pd.Timestamp("2010-02-04"), + pd.Timestamp("2010-02-05"), + pd.Timestamp("2010-02-06"), + ], + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) def fun(x): return (1, 2) @@ -717,7 +840,7 @@ def fun(x): def test_consistent_coerce_for_shapes(self): # we want column names to NOT be propagated # just because the shape matches the input shape - df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) result = df.apply(lambda x: [1, 2, 3], axis=1) expected = Series([[1, 2, 3] for t in df.itertuples()]) @@ -731,17 +854,16 @@ def test_consistent_names(self, int_frame_const_col): # if a Series is returned, we should use the resulting index names df = int_frame_const_col - result = df.apply(lambda x: Series([1, 2, 3], - index=['test', 'other', 'cols']), - axis=1) - expected = int_frame_const_col.rename(columns={'A': 'test', - 'B': 'other', - 'C': 'cols'}) + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) assert_frame_equal(result, expected) - result = df.apply(lambda x: Series([1, 2], index=['test', 'other']), - axis=1) - expected = expected[['test', 'other']] + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] assert_frame_equal(result, expected) def test_result_type(self, int_frame_const_col): @@ -749,24 +871,25 @@ def test_result_type(self, int_frame_const_col): # path we take in the code df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") expected = df.copy() expected.columns = [0, 1, 2] assert_frame_equal(result, expected) - result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') - expected = df[['A', 'B']].copy() + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + 
expected = df[["A", "B"]].copy() expected.columns = [0, 1] assert_frame_equal(result, expected) # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") expected = df.copy() assert_frame_equal(result, expected) - columns = ['other', 'col', 'names'] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), - axis=1, result_type='broadcast') + columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + ) expected = df.copy() assert_frame_equal(result, expected) @@ -776,13 +899,13 @@ def test_result_type(self, int_frame_const_col): assert_frame_equal(result, expected) # series result with other index - columns = ['other', 'col', 'names'] + columns = ["other", "col", "names"] result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) expected = df.copy() expected.columns = columns assert_frame_equal(result, expected) - @pytest.mark.parametrize("result_type", ['foo', 1]) + @pytest.mark.parametrize("result_type", ["foo", 1]) def test_result_type_error(self, result_type, int_frame_const_col): # allowed result_type df = int_frame_const_col @@ -792,10 +915,9 @@ def test_result_type_error(self, result_type, int_frame_const_col): @pytest.mark.parametrize( "box", - [lambda x: list(x), - lambda x: tuple(x), - lambda x: np.array(x, dtype='int64')], - ids=['list', 'tuple', 'array']) + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], + ) def test_consistency_for_boxed(self, box, int_frame_const_col): # passing an array or list should not affect the output shape df = int_frame_const_col @@ -804,9 +926,8 @@ def test_consistency_for_boxed(self, box, int_frame_const_col): expected = Series([box([1, 2]) for t in df.itertuples()]) assert_series_equal(result, expected) - result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') - expected = int_frame_const_col[['A', 'B']].rename(columns={'A': 0, - 'B': 1}) + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) assert_frame_equal(result, expected) @@ -830,11 +951,10 @@ def zip_frames(frames, axis=1): class TestDataFrameAggregate: - def test_agg_transform(self, axis, float_frame): - other_axis = 1 if axis in {0, 'index'} else 0 + other_axis = 1 if axis in {0, "index"} else 0 - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): f_abs = np.abs(float_frame) f_sqrt = np.sqrt(float_frame) @@ -853,12 +973,14 @@ def test_agg_transform(self, axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, 'index'}: + if axis in {0, "index"}: expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ['sqrt']]) + [float_frame.columns, ["sqrt"]] + ) else: expected.index = pd.MultiIndex.from_product( - [float_frame.index, ['sqrt']]) + [float_frame.index, ["sqrt"]] + ) assert_frame_equal(result, expected) result = float_frame.transform([np.sqrt], axis=axis) @@ -869,39 +991,39 @@ def test_agg_transform(self, axis, float_frame): # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ['absolute', 
'sqrt']]) + [float_frame.columns, ["absolute", "sqrt"]] + ) else: expected.index = pd.MultiIndex.from_product( - [float_frame.index, ['absolute', 'sqrt']]) + [float_frame.index, ["absolute", "sqrt"]] + ) assert_frame_equal(result, expected) - result = float_frame.transform([np.abs, 'sqrt'], axis=axis) + result = float_frame.transform([np.abs, "sqrt"], axis=axis) assert_frame_equal(result, expected) def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg with pytest.raises(ValueError): - float_frame.transform(['max', 'min'], axis=axis) + float_frame.transform(["max", "min"], axis=axis) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - float_frame.agg(['max', 'sqrt'], axis=axis) + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - float_frame.transform(['max', 'sqrt'], axis=axis) + with np.errstate(all="ignore"): + float_frame.transform(["max", "sqrt"], axis=axis) - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) def f(): - with np.errstate(all='ignore'): - df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']}, axis=axis) + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - @pytest.mark.parametrize('method', [ - 'abs', 'shift', 'pct_change', 'cumsum', 'rank', - ]) + @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) def test_transform_method_name(self, method): # GH 19760 df = pd.DataFrame({"A": [-1, 2]}) @@ -911,129 +1033,175 @@ def test_transform_method_name(self, method): def test_demo(self): # demonstration tests - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) - result = df.agg(['min', 'max']) - expected = DataFrame({'A': [0, 4], 'B': [5, 5]}, - columns=['A', 'B'], - index=['min', 'max']) + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) tm.assert_frame_equal(result, expected) - result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']}) - expected = DataFrame({'A': [4.0, 0.0, np.nan], - 'B': [5.0, np.nan, 25.0]}, - columns=['A', 'B'], - index=['max', 'min', 'sum']) + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) tm.assert_frame_equal(result.reindex_like(expected), expected) def test_agg_multiple_mixed_no_warning(self): # GH 20909 - mdf = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) - expected = pd.DataFrame({"A": [1, 6], 'B': [1.0, 6.0], - "C": ['bar', 'foobarbaz'], - "D": [pd.Timestamp('2013-01-01'), pd.NaT]}, - index=['min', 'sum']) + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) + expected = pd.DataFrame( + { + "A": [1, 6], + "B": [1.0, 6.0], + "C": ["bar", "foobarbaz"], + "D": [pd.Timestamp("2013-01-01"), pd.NaT], + }, + index=["min", "sum"], + ) # sorted index with tm.assert_produces_warning(None): - result = mdf.agg(['min', 'sum']) + result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(None): - result = mdf[['D', 'C', 'B', 'A']].agg(['sum', 'min']) + result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # For 
backwards compatibility, the result's index is # still sorted by function name, so it's ['min', 'sum'] # not ['sum', 'min']. - expected = expected[['D', 'C', 'B', 'A']] + expected = expected[["D", "C", "B", "A"]] tm.assert_frame_equal(result, expected) def test_agg_dict_nested_renaming_depr(self): - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) # nested renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.agg({'A': {'foo': 'min'}, - 'B': {'bar': 'max'}}) + df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) def test_agg_reduce(self, axis, float_frame): - other_axis = 1 if axis in {0, 'index'} else 0 + other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() # all reducers - expected = pd.concat([float_frame.mean(axis=axis), - float_frame.max(axis=axis), - float_frame.sum(axis=axis), - ], axis=1) - expected.columns = ['mean', 'max', 'sum'] - expected = expected.T if axis in {0, 'index'} else expected - - result = float_frame.agg(['mean', 'max', 'sum'], axis=axis) + expected = pd.concat( + [ + float_frame.mean(axis=axis), + float_frame.max(axis=axis), + float_frame.sum(axis=axis), + ], + axis=1, + ) + expected.columns = ["mean", "max", "sum"] + expected = expected.T if axis in {0, "index"} else expected + + result = float_frame.agg(["mean", "max", "sum"], axis=axis) assert_frame_equal(result, expected) # dict input with scalars - func = OrderedDict([(name1, 'mean'), (name2, 'sum')]) + func = OrderedDict([(name1, "mean"), (name2, "sum")]) result = float_frame.agg(func, axis=axis) - expected = Series([float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name2].sum()], - index=[name1, name2]) + expected = Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name2].sum(), + ], + index=[name1, name2], + ) assert_series_equal(result, expected) # dict input with lists - func = OrderedDict([(name1, ['mean']), (name2, ['sum'])]) + func = OrderedDict([(name1, ["mean"]), (name2, ["sum"])]) result = float_frame.agg(func, axis=axis) - expected = DataFrame({ - name1: Series([float_frame.loc(other_axis)[name1].mean()], - index=['mean']), - name2: Series([float_frame.loc(other_axis)[name2].sum()], - index=['sum'])}) - expected = expected.T if axis in {1, 'columns'} else expected + expected = DataFrame( + { + name1: Series( + [float_frame.loc(other_axis)[name1].mean()], index=["mean"] + ), + name2: Series( + [float_frame.loc(other_axis)[name2].sum()], index=["sum"] + ), + } + ) + expected = expected.T if axis in {1, "columns"} else expected assert_frame_equal(result, expected) # dict input with lists with multiple - func = OrderedDict([(name1, ['mean', 'sum']), (name2, ['sum', 'max'])]) + func = OrderedDict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])]) result = float_frame.agg(func, axis=axis) - expected = DataFrame(OrderedDict([ - (name1, Series([float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name1].sum()], - index=['mean', 'sum'])), - (name2, Series([float_frame.loc(other_axis)[name2].sum(), - float_frame.loc(other_axis)[name2].max()], - index=['sum', 'max'])), - ])) - expected = expected.T if axis in {1, 'columns'} else expected + expected = DataFrame( + OrderedDict( + [ + ( + name1, + Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name1].sum(), + ], + index=["mean", "sum"], + ), + ), + ( + name2, + Series( + [ + 
float_frame.loc(other_axis)[name2].sum(), + float_frame.loc(other_axis)[name2].max(), + ], + index=["sum", "max"], + ), + ), + ] + ) + ) + expected = expected.T if axis in {1, "columns"} else expected assert_frame_equal(result, expected) def test_nuiscance_columns(self): # GH 15015 - df = DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) - - result = df.agg('min') - expected = Series([1, 1., 'bar', pd.Timestamp('20130101')], - index=df.columns) + df = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) + + result = df.agg("min") + expected = Series([1, 1.0, "bar", pd.Timestamp("20130101")], index=df.columns) assert_series_equal(result, expected) - result = df.agg(['min']) - expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]], - index=['min'], columns=df.columns) + result = df.agg(["min"]) + expected = DataFrame( + [[1, 1.0, "bar", pd.Timestamp("20130101")]], + index=["min"], + columns=df.columns, + ) assert_frame_equal(result, expected) - result = df.agg('sum') - expected = Series([6, 6., 'foobarbaz'], - index=['A', 'B', 'C']) + result = df.agg("sum") + expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) assert_series_equal(result, expected) - result = df.agg(['sum']) - expected = DataFrame([[6, 6., 'foobarbaz']], - index=['sum'], columns=['A', 'B', 'C']) + result = df.agg(["sum"]) + expected = DataFrame( + [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] + ) assert_frame_equal(result, expected) def test_non_callable_aggregates(self): @@ -1041,74 +1209,85 @@ def test_non_callable_aggregates(self): # GH 16405 # 'size' is a property of frame/series # validate that this is working - df = DataFrame({'A': [None, 2, 3], - 'B': [1.0, np.nan, 3.0], - 'C': ['foo', None, 'bar']}) + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) # Function aggregate - result = df.agg({'A': 'count'}) - expected = Series({'A': 2}) + result = df.agg({"A": "count"}) + expected = Series({"A": 2}) assert_series_equal(result, expected) # Non-function aggregate - result = df.agg({'A': 'size'}) - expected = Series({'A': 3}) + result = df.agg({"A": "size"}) + expected = Series({"A": 3}) assert_series_equal(result, expected) # Mix function and non-function aggs - result1 = df.agg(['count', 'size']) - result2 = df.agg({'A': ['count', 'size'], - 'B': ['count', 'size'], - 'C': ['count', 'size']}) - expected = pd.DataFrame({'A': {'count': 2, 'size': 3}, - 'B': {'count': 2, 'size': 3}, - 'C': {'count': 2, 'size': 3}}) + result1 = df.agg(["count", "size"]) + result2 = df.agg( + {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} + ) + expected = pd.DataFrame( + { + "A": {"count": 2, "size": 3}, + "B": {"count": 2, "size": 3}, + "C": {"count": 2, "size": 3}, + } + ) assert_frame_equal(result1, result2, check_like=True) assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() - result = df.agg('count') + result = df.agg("count") expected = df.count() assert_series_equal(result, expected) # Just a string attribute arg same as calling df.arg - result = df.agg('size') + result = df.agg("size") expected = df.size assert result == expected - @pytest.mark.parametrize("df, func, expected", chain( - _get_cython_table_params( - DataFrame(), [ - ('sum', Series()), - ('max', Series()), - ('min', Series()), - ('all', 
Series(dtype=bool)), - ('any', Series(dtype=bool)), - ('mean', Series()), - ('prod', Series()), - ('std', Series()), - ('var', Series()), - ('median', Series()), - ]), - _get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), [ - ('sum', Series([1., 3])), - ('max', Series([1., 2])), - ('min', Series([1., 1])), - ('all', Series([True, True])), - ('any', Series([True, True])), - ('mean', Series([1, 1.5])), - ('prod', Series([1., 2])), - ('std', Series([np.nan, 0.707107])), - ('var', Series([np.nan, 0.5])), - ('median', Series([1, 1.5])), - ]), - )) + @pytest.mark.parametrize( + "df, func, expected", + chain( + _get_cython_table_params( + DataFrame(), + [ + ("sum", Series()), + ("max", Series()), + ("min", Series()), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series()), + ("prod", Series()), + ("std", Series()), + ("var", Series()), + ("median", Series()), + ], + ), + _get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), + ) def test_agg_cython_table(self, df, func, expected, axis): # GH 21224 # test reducing functions in @@ -1116,18 +1295,21 @@ def test_agg_cython_table(self, df, func, expected, axis): result = df.agg(func, axis=axis) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("df, func, expected", chain( - _get_cython_table_params( - DataFrame(), [ - ('cumprod', DataFrame()), - ('cumsum', DataFrame()), - ]), - _get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), [ - ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])), - ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])), - ]), - )) + @pytest.mark.parametrize( + "df, func, expected", + chain( + _get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + _get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), + ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ], + ), + ), + ) def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in @@ -1135,10 +1317,11 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("df, func, expected", _get_cython_table_params( - DataFrame([['a', 'b'], ['b', 'a']]), [ - ['cumprod', TypeError], - ]), + @pytest.mark.parametrize( + "df, func, expected", + _get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), ) def test_agg_cython_table_raises(self, df, func, expected, axis): # GH 21224 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bcbea9d7a22365..7c022106c9104e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -17,22 +17,25 @@ class TestFrameComparisons: # Specifically _not_ flex-comparisons def test_comparison_invalid(self): - def check(df, df2): for (x, y) in [(df, df2), (df2, df)]: # we expect the result to match Series comparisons for # == and !=, inequalities should raise result = x == y - expected = pd.DataFrame({col: x[col] == y[col] - for col in x.columns}, - index=x.index, columns=x.columns) + 
expected = pd.DataFrame( + {col: x[col] == y[col] for col in x.columns}, + index=x.index, + columns=x.columns, + ) tm.assert_frame_equal(result, expected) result = x != y - expected = pd.DataFrame({col: x[col] != y[col] - for col in x.columns}, - index=x.index, columns=x.columns) + expected = pd.DataFrame( + {col: x[col] != y[col] for col in x.columns}, + index=x.index, + columns=x.columns, + ) tm.assert_frame_equal(result, expected) with pytest.raises(TypeError): @@ -46,56 +49,62 @@ def check(df, df2): # GH4968 # invalid date/int comparisons - df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a']) - df['dates'] = pd.date_range('20010101', periods=len(df)) + df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"]) + df["dates"] = pd.date_range("20010101", periods=len(df)) df2 = df.copy() - df2['dates'] = df['a'] + df2["dates"] = df["a"] check(df, df2) - df = pd.DataFrame(np.random.randint(10, size=(10, 2)), - columns=['a', 'b']) - df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)), - 'b': pd.date_range('20100101', periods=len(df))}) + df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame( + { + "a": pd.date_range("20010101", periods=len(df)), + "b": pd.date_range("20100101", periods=len(df)), + } + ) check(df, df2) def test_timestamp_compare(self): # make sure we can compare Timestamps on the right AND left hand side # GH#4982 - df = pd. DataFrame({'dates1': pd.date_range('20010101', periods=10), - 'dates2': pd.date_range('20010102', periods=10), - 'intcol': np.random.randint(1000000000, size=10), - 'floatcol': np.random.randn(10), - 'stringcol': list(tm.rands(10))}) - df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT - ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', - 'ne': 'ne'} + df = pd.DataFrame( + { + "dates1": pd.date_range("20010101", periods=10), + "dates2": pd.date_range("20010102", periods=10), + "intcol": np.random.randint(1000000000, size=10), + "floatcol": np.random.randn(10), + "stringcol": list(tm.rands(10)), + } + ) + df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT + ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"} for left, right in ops.items(): left_f = getattr(operator, left) right_f = getattr(operator, right) # no nats - if left in ['eq', 'ne']: - expected = left_f(df, pd.Timestamp('20010109')) - result = right_f(pd.Timestamp('20010109'), df) + if left in ["eq", "ne"]: + expected = left_f(df, pd.Timestamp("20010109")) + result = right_f(pd.Timestamp("20010109"), df) tm.assert_frame_equal(result, expected) else: with pytest.raises(TypeError): - left_f(df, pd.Timestamp('20010109')) + left_f(df, pd.Timestamp("20010109")) with pytest.raises(TypeError): - right_f(pd.Timestamp('20010109'), df) + right_f(pd.Timestamp("20010109"), df) # nats - expected = left_f(df, pd.Timestamp('nat')) - result = right_f(pd.Timestamp('nat'), df) + expected = left_f(df, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), df) tm.assert_frame_equal(result, expected) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError # (this appears to be fixed before GH#22163, not sure when) - df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]]) - other = pd.DataFrame([['a', 'b'], ['c', 'd']]) + df = pd.DataFrame([["1989-08-01", 1], ["1989-08-01", 2]]) + other = pd.DataFrame([["a", "b"], ["c", "d"]]) result = df == other assert not result.any().any() @@ -109,9 +118,7 @@ def 
test_df_boolean_comparison_error(self): # len(df.columns) is supported as of GH#22800 df = pd.DataFrame(np.arange(6).reshape((3, 2))) - expected = pd.DataFrame([[False, False], - [True, False], - [False, False]]) + expected = pd.DataFrame([[False, False], [True, False], [False, False]]) result = df == (2, 2) tm.assert_frame_equal(result, expected) @@ -120,8 +127,9 @@ def test_df_boolean_comparison_error(self): tm.assert_frame_equal(result, expected) def test_df_float_none_comparison(self): - df = pd.DataFrame(np.random.randn(8, 3), index=range(8), - columns=['A', 'B', 'C']) + df = pd.DataFrame( + np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"] + ) result = df.__eq__(None) assert not result.any().any() @@ -156,7 +164,7 @@ def _check_unaligned_frame(meth, op, df, other): # DataFrame assert df.eq(df).values.all() assert not df.ne(df).values.any() - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + for op in ["eq", "ne", "gt", "lt", "ge", "le"]: f = getattr(df, op) o = getattr(operator, op) # No NAs @@ -230,23 +238,23 @@ def _test_seq(df, idx_ser, col_ser): # complex arr = np.array([np.nan, 1, 6, np.nan]) arr2 = np.array([2j, np.nan, 7, None]) - df = pd.DataFrame({'a': arr}) - df2 = pd.DataFrame({'a': arr2}) + df = pd.DataFrame({"a": arr}) + df2 = pd.DataFrame({"a": arr2}) rs = df.gt(df2) assert not rs.values.any() rs = df.ne(df2) assert rs.values.all() arr3 = np.array([2j, np.nan, None]) - df3 = pd.DataFrame({'a': arr3}) + df3 = pd.DataFrame({"a": arr3}) rs = df3.gt(2j) assert not rs.values.any() # corner, dtype=object - df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']}) - df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']}) + df1 = pd.DataFrame({"col": ["foo", np.nan, "bar"]}) + df2 = pd.DataFrame({"col": ["foo", datetime.now(), "bar"]}) result = df1.ne(df2) - exp = pd.DataFrame({'col': [False, True, False]}) + exp = pd.DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) def test_flex_comparison_nat(self): @@ -267,19 +275,19 @@ def test_flex_comparison_nat(self): result = df.ne(pd.NaT) assert result.iloc[0, 0].item() is True - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types(self, opname): # GH 15077, non-empty DataFrame - df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 result = getattr(df, opname)(const).dtypes.value_counts() tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): # GH 15077 empty DataFrame - df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 empty = df.iloc[:0] @@ -290,44 +298,48 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): # ------------------------------------------------------------------- # Arithmetic -class TestFrameFlexArithmetic: +class TestFrameFlexArithmetic: def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly - dti = pd.date_range('2016-01-01', periods=10) - tdi = pd.timedelta_range('1', periods=10) + dti = pd.date_range("2016-01-01", periods=10) + tdi = pd.timedelta_range("1", periods=10) tser = pd.Series(tdi) df = pd.DataFrame({0: 
dti, 1: tdi}) result = df.add(tser, axis=0) - expected = pd.DataFrame({0: dti + tdi, - 1: tdi + tdi}) + expected = pd.DataFrame({0: dti + tdi, 1: tdi + tdi}) tm.assert_frame_equal(result, expected) def test_df_add_flex_filled_mixed_dtypes(self): # GH 19611 - dti = pd.date_range('2016-01-01', periods=3) - ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]') - df = pd.DataFrame({'A': dti, 'B': ser}) - other = pd.DataFrame({'A': ser, 'B': ser}) + dti = pd.date_range("2016-01-01", periods=3) + ser = pd.Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]") + df = pd.DataFrame({"A": dti, "B": ser}) + other = pd.DataFrame({"A": ser, "B": ser}) fill = pd.Timedelta(days=1).to_timedelta64() result = df.add(other, fill_value=fill) expected = pd.DataFrame( - {'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'], - dtype='datetime64[ns]'), - 'B': ser * 2}) + { + "A": pd.Series( + ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]" + ), + "B": ser * 2, + } + ) tm.assert_frame_equal(result, expected) - def test_arith_flex_frame(self, all_arithmetic_operators, float_frame, - mixed_float_frame): + def test_arith_flex_frame( + self, all_arithmetic_operators, float_frame, mixed_float_frame + ): # one instance of parametrized fixture op = all_arithmetic_operators def f(x, y): # r-versions not in operator-stdlib; get op without "r" and invert - if op.startswith('__r'): - return getattr(operator, op.replace('__r', '__'))(y, x) + if op.startswith("__r"): + return getattr(operator, op.replace("__r", "__"))(y, x) return getattr(operator, op)(x, y) result = getattr(float_frame, op)(2 * float_frame) @@ -340,9 +352,10 @@ def f(x, y): tm.assert_frame_equal(result, expected) _check_mixed_float(result, dtype=dict(C=None)) - @pytest.mark.parametrize('op', ['__add__', '__sub__', '__mul__']) - def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, - mixed_float_frame): + @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"]) + def test_arith_flex_frame_mixed( + self, op, int_frame, mixed_int_frame, mixed_float_frame + ): f = getattr(operator, op) # vs mix int @@ -351,9 +364,9 @@ def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, # no overflow in the uint dtype = None - if op in ['__sub__']: - dtype = dict(B='uint64', C=None) - elif op in ['__add__', '__mul__']: + if op in ["__sub__"]: + dtype = dict(B="uint64", C=None) + elif op in ["__add__", "__mul__"]: dtype = dict(C=None) tm.assert_frame_equal(result, expected) _check_mixed_int(result, dtype=dtype) @@ -369,8 +382,7 @@ def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, expected = f(int_frame, 2 * int_frame) tm.assert_frame_equal(result, expected) - def test_arith_flex_frame_raise(self, all_arithmetic_operators, - float_frame): + def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame): # one instance of parametrized fixture op = all_arithmetic_operators @@ -393,19 +405,19 @@ def test_arith_flex_frame_corner(self, float_frame): result = float_frame[:0].add(float_frame) tm.assert_frame_equal(result, float_frame * np.nan) - with pytest.raises(NotImplementedError, match='fill_value'): + with pytest.raises(NotImplementedError, match="fill_value"): float_frame.add(float_frame.iloc[0], fill_value=3) - with pytest.raises(NotImplementedError, match='fill_value'): - float_frame.add(float_frame.iloc[0], axis='index', fill_value=3) + with pytest.raises(NotImplementedError, match="fill_value"): + float_frame.add(float_frame.iloc[0], axis="index", 
fill_value=3) def test_arith_flex_series(self, simple_frame): df = simple_frame - row = df.xs('a') - col = df['two'] + row = df.xs("a") + col = df["two"] # after arithmetic refactor, add truediv here - ops = ['add', 'sub', 'mul', 'mod'] + ops = ["add", "sub", "mul", "mod"] for op in ops: f = getattr(df, op) op = getattr(operator, op) @@ -420,46 +432,47 @@ def test_arith_flex_series(self, simple_frame): tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T) # broadcasting issue in GH 7325 - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64') + df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64") expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) - result = df.div(df[0], axis='index') + result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64') + df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64") expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) - result = df.div(df[0], axis='index') + result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) def test_arith_flex_zero_len_raises(self): # GH 19522 passing fill_value to frame flex arith methods should # raise even in the zero-length special cases ser_len0 = pd.Series([]) - df_len0 = pd.DataFrame(columns=['A', 'B']) - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df_len0 = pd.DataFrame(columns=["A", "B"]) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - with pytest.raises(NotImplementedError, match='fill_value'): - df.add(ser_len0, fill_value='E') + with pytest.raises(NotImplementedError, match="fill_value"): + df.add(ser_len0, fill_value="E") - with pytest.raises(NotImplementedError, match='fill_value'): - df_len0.sub(df['A'], axis=None, fill_value=3) + with pytest.raises(NotImplementedError, match="fill_value"): + df_len0.sub(df["A"], axis=None, fill_value=3) class TestFrameArithmetic: def test_df_add_2d_array_rowlike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) - expected = pd.DataFrame([[2, 4], - [4, 6], - [6, 8]], - columns=df.columns, index=df.index, - # specify dtype explicitly to avoid failing - # on 32bit builds - dtype=arr.dtype) + expected = pd.DataFrame( + [[2, 4], [4, 6], [6, 8]], + columns=df.columns, + index=df.index, + # specify dtype explicitly to avoid failing + # on 32bit builds + dtype=arr.dtype, + ) result = df + rowlike tm.assert_frame_equal(result, expected) result = rowlike + df @@ -468,41 +481,43 @@ def test_df_add_2d_array_rowlike_broadcasts(self): def test_df_add_2d_array_collike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) - expected = pd.DataFrame([[1, 2], - [5, 6], - [9, 10]], - columns=df.columns, index=df.index, - # specify dtype explicitly to avoid failing - # on 32bit builds - dtype=arr.dtype) + expected = pd.DataFrame( + [[1, 2], [5, 6], [9, 10]], + columns=df.columns, + index=df.index, + # specify dtype explicitly to avoid failing + # on 32bit builds + dtype=arr.dtype, + ) result = df + collike 
tm.assert_frame_equal(result, expected) result = collike + df tm.assert_frame_equal(result, expected) - def test_df_arith_2d_array_rowlike_broadcasts(self, - all_arithmetic_operators): + def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) - exvals = [getattr(df.loc['A'], opname)(rowlike.squeeze()), - getattr(df.loc['B'], opname)(rowlike.squeeze()), - getattr(df.loc['C'], opname)(rowlike.squeeze())] + exvals = [ + getattr(df.loc["A"], opname)(rowlike.squeeze()), + getattr(df.loc["B"], opname)(rowlike.squeeze()), + getattr(df.loc["C"], opname)(rowlike.squeeze()), + ] expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) - if opname in ['__rmod__', '__rfloordiv__']: + if opname in ["__rmod__", "__rfloordiv__"]: # exvals will have dtypes [f8, i8, i8] so expected will be # all-f8, but the DataFrame operation will return mixed dtypes # use exvals[-1].dtype instead of "i8" for compat with 32-bit @@ -512,28 +527,28 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) - def test_df_arith_2d_array_collike_broadcasts(self, - all_arithmetic_operators): + def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) - exvals = {True: getattr(df[True], opname)(collike.squeeze()), - False: getattr(df[False], opname)(collike.squeeze())} + exvals = { + True: getattr(df[True], opname)(collike.squeeze()), + False: getattr(df[False], opname)(collike.squeeze()), + } dtype = None - if opname in ['__rmod__', '__rfloordiv__']: + if opname in ["__rmod__", "__rfloordiv__"]: # Series ops may return mixed int/float dtypes in cases where # DataFrame op will return all-float. 
So we upcast `expected` dtype = np.common_type(*[x.values for x in exvals.values()]) - expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, - dtype=dtype) + expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype) result = getattr(df, opname)(collike) tm.assert_frame_equal(result, expected) @@ -547,24 +562,22 @@ def test_df_bool_mul_int(self): # On appveyor this comes back as np.int32 instead of np.int64, # so we check dtype.kind instead of just dtype kinds = result.dtypes.apply(lambda x: x.kind) - assert (kinds == 'i').all() + assert (kinds == "i").all() result = 1 * df kinds = result.dtypes.apply(lambda x: x.kind) - assert (kinds == 'i').all() + assert (kinds == "i").all() def test_arith_mixed(self): - left = pd.DataFrame({'A': ['a', 'b', 'c'], - 'B': [1, 2, 3]}) + left = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) result = left + left - expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'], - 'B': [2, 4, 6]}) + expected = pd.DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]}) tm.assert_frame_equal(result, expected) def test_arith_getitem_commute(self): - df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + df = pd.DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) def _test_op(df, op): result = op(df, 1) @@ -594,40 +607,38 @@ def _test_op(df, op): _test_op(df, lambda x, y: x / y) _test_op(df, lambda x, y: x ** y) - @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]), - range(1, 3), deque([1, 2])]) + @pytest.mark.parametrize( + "values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])] + ) def test_arith_alignment_non_pandas_object(self, values): # GH#17901 - df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]}) - expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]}) + df = pd.DataFrame({"A": [1, 1], "B": [1, 1]}) + expected = pd.DataFrame({"A": [2, 2], "B": [3, 3]}) result = df + values tm.assert_frame_equal(result, expected) def test_arith_non_pandas_object(self): - df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3), - columns=['one', 'two', 'three'], - index=['a', 'b', 'c']) - - val1 = df.xs('a').values - added = pd.DataFrame(df.values + val1, - index=df.index, columns=df.columns) + df = pd.DataFrame( + np.arange(1, 10, dtype="f8").reshape(3, 3), + columns=["one", "two", "three"], + index=["a", "b", "c"], + ) + + val1 = df.xs("a").values + added = pd.DataFrame(df.values + val1, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val1, added) - added = pd.DataFrame((df.values.T + val1).T, - index=df.index, columns=df.columns) + added = pd.DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val1, axis=0), added) - val2 = list(df['two']) + val2 = list(df["two"]) - added = pd.DataFrame(df.values + val2, - index=df.index, columns=df.columns) + added = pd.DataFrame(df.values + val2, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val2, added) - added = pd.DataFrame((df.values.T + val2).T, index=df.index, - columns=df.columns) - tm.assert_frame_equal(df.add(val2, axis='index'), added) + added = pd.DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) + tm.assert_frame_equal(df.add(val2, axis="index"), added) val3 = np.random.rand(*df.shape) - added = pd.DataFrame(df.values + val3, - index=df.index, columns=df.columns) + added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val3), added) diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py index 
e7b9ff348bd570..9a7d806c79dc3d 100644 --- a/pandas/tests/frame/test_asof.py +++ b/pandas/tests/frame/test_asof.py @@ -13,17 +13,16 @@ def date_range_frame(): Columns are ['A', 'B']. """ N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') - return DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng) + rng = date_range("1/1/1990", periods=N, freq="53s") + return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng) class TestFrameAsof: - def test_basic(self, date_range_frame): df = date_range_frame N = 50 - df.loc[15:30, 'A'] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) assert result.notna().all(1).all() @@ -41,23 +40,22 @@ def test_basic(self, date_range_frame): def test_subset(self, date_range_frame): N = 10 df = date_range_frame.iloc[:N].copy() - df.loc[4:8, 'A'] = np.nan - dates = date_range('1/1/1990', periods=N * 3, - freq='25s') + df.loc[4:8, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") # with a subset of A should be the same - result = df.asof(dates, subset='A') + result = df.asof(dates, subset="A") expected = df.asof(dates) tm.assert_frame_equal(result, expected) # same with A/B - result = df.asof(dates, subset=['A', 'B']) + result = df.asof(dates, subset=["A", "B"]) expected = df.asof(dates) tm.assert_frame_equal(result, expected) # B gives df.asof - result = df.asof(dates, subset='B') - expected = df.resample('25s', closed='right').ffill().reindex(dates) + result = df.asof(dates, subset="B") + expected = df.resample("25s", closed="right").ffill().reindex(dates) expected.iloc[20:] = 9 tm.assert_frame_equal(result, expected) @@ -67,14 +65,15 @@ def test_missing(self, date_range_frame): # no match found - `where` value before earliest date in index N = 10 df = date_range_frame.iloc[:N].copy() - result = df.asof('1989-12-31') + result = df.asof("1989-12-31") - expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31')) + expected = Series(index=["A", "B"], name=Timestamp("1989-12-31")) tm.assert_series_equal(result, expected) - result = df.asof(to_datetime(['1989-12-31'])) - expected = DataFrame(index=to_datetime(['1989-12-31']), - columns=['A', 'B'], dtype='float64') + result = df.asof(to_datetime(["1989-12-31"])) + expected = DataFrame( + index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64" + ) tm.assert_frame_equal(result, expected) def test_all_nans(self, date_range_frame): @@ -87,41 +86,49 @@ def test_all_nans(self, date_range_frame): # testing non-default indexes, multiple inputs N = 150 rng = date_range_frame.index - dates = date_range('1/1/1990', periods=N, freq='25s') - result = DataFrame(np.nan, index=rng, columns=['A']).asof(dates) - expected = DataFrame(np.nan, index=dates, columns=['A']) + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A"]) tm.assert_frame_equal(result, expected) # testing multiple columns - dates = date_range('1/1/1990', periods=N, freq='25s') - result = DataFrame(np.nan, index=rng, - columns=['A', 'B', 'C']).asof(dates) - expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C']) + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"]) tm.assert_frame_equal(result, 
expected) # testing scalar input - result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3]) - expected = DataFrame(np.nan, index=[3], columns=['A', 'B']) + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3]) + expected = DataFrame(np.nan, index=[3], columns=["A", "B"]) tm.assert_frame_equal(result, expected) - result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3) - expected = Series(np.nan, index=['A', 'B'], name=3) + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3) + expected = Series(np.nan, index=["A", "B"], name=3) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "stamp,expected", - [(Timestamp('2018-01-01 23:22:43.325+00:00'), - Series(2.0, name=Timestamp('2018-01-01 23:22:43.325+00:00'))), - (Timestamp('2018-01-01 22:33:20.682+01:00'), - Series(1.0, name=Timestamp('2018-01-01 22:33:20.682+01:00'))), - ] + [ + ( + Timestamp("2018-01-01 23:22:43.325+00:00"), + Series(2.0, name=Timestamp("2018-01-01 23:22:43.325+00:00")), + ), + ( + Timestamp("2018-01-01 22:33:20.682+01:00"), + Series(1.0, name=Timestamp("2018-01-01 22:33:20.682+01:00")), + ), + ], ) def test_time_zone_aware_index(self, stamp, expected): # GH21194 # Testing awareness of DataFrame index considering different # UTC and timezone - df = DataFrame(data=[1, 2], - index=[Timestamp('2018-01-01 21:00:05.001+00:00'), - Timestamp('2018-01-01 22:35:10.550+00:00')]) + df = DataFrame( + data=[1, 2], + index=[ + Timestamp("2018-01-01 21:00:05.001+00:00"), + Timestamp("2018-01-01 22:35:10.550+00:00"), + ], + ) result = df.asof(stamp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 12ac373aa8f607..77be952506964c 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -6,8 +6,7 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna) +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal @@ -17,101 +16,102 @@ class TestDataFrameSelectReindex: # test_indexing def test_drop_names(self): - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - df.index.name, df.columns.name = 'first', 'second' - df_dropped_b = df.drop('b') - df_dropped_e = df.drop('e', axis=1) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) df_inplace_b, df_inplace_e = df.copy(), df.copy() - df_inplace_b.drop('b', inplace=True) - df_inplace_e.drop('e', axis=1, inplace=True) + df_inplace_b.drop("b", inplace=True) + df_inplace_e.drop("e", axis=1, inplace=True) for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): - assert obj.index.name == 'first' - assert obj.columns.name == 'second' - assert list(df.columns) == ['d', 'e', 'f'] + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] msg = r"\['g'\] not found in axis" with pytest.raises(KeyError, match=msg): - df.drop(['g']) + df.drop(["g"]) with pytest.raises(KeyError, match=msg): - df.drop(['g'], 1) + df.drop(["g"], 1) # errors = 'ignore' - dropped = 
df.drop(['g'], errors='ignore') - expected = Index(['a', 'b', 'c'], name='first') + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") tm.assert_index_equal(dropped.index, expected) - dropped = df.drop(['b', 'g'], errors='ignore') - expected = Index(['a', 'c'], name='first') + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") tm.assert_index_equal(dropped.index, expected) - dropped = df.drop(['g'], axis=1, errors='ignore') - expected = Index(['d', 'e', 'f'], name='second') + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") tm.assert_index_equal(dropped.columns, expected) - dropped = df.drop(['d', 'g'], axis=1, errors='ignore') - expected = Index(['e', 'f'], name='second') + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") tm.assert_index_equal(dropped.columns, expected) # GH 16398 - dropped = df.drop([], errors='ignore') - expected = Index(['a', 'b', 'c'], name='first') + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") tm.assert_index_equal(dropped.index, expected) def test_drop_col_still_multiindex(self): - arrays = [['a', 'b', 'c', 'top'], - ['', '', '', 'OD'], - ['', '', '', 'wx']] + arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(3, 4), columns=index) - del df[('a', '', '')] - assert(isinstance(df.columns, MultiIndex)) + del df[("a", "", "")] + assert isinstance(df.columns, MultiIndex) def test_drop(self): simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) - assert_frame_equal(simple.drop("A", axis=1), simple[['B']]) - assert_frame_equal(simple.drop(["A", "B"], axis='columns'), - simple[[]]) + assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - assert_frame_equal(simple.drop( - [0, 3], axis='index'), simple.loc[[1, 2], :]) + assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) with pytest.raises(KeyError, match=r"\[5\] not found in axis"): simple.drop(5) with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop('C', 1) + simple.drop("C", 1) with pytest.raises(KeyError, match=r"\[5\] not found in axis"): simple.drop([1, 5]) with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop(['A', 'C'], 1) + simple.drop(["A", "C"], 1) # errors = 'ignore' - assert_frame_equal(simple.drop(5, errors='ignore'), simple) - assert_frame_equal(simple.drop([0, 5], errors='ignore'), - simple.loc[[1, 2, 3], :]) - assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple) - assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'), - simple[['B']]) + assert_frame_equal(simple.drop(5, errors="ignore"), simple) + assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) # non-unique - wheee! 
- nu_df = DataFrame(list(zip(range(3), range(-3, 1), list('abc'))), - columns=['a', 'a', 'b']) - assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) - assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) + nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 - nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) - nu_df.columns = list('abc') - assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :]) - assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :]) + nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) # inplace cache issue # GH 5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) expected = df[~(df.b > 0)] df.drop(labels=df[df.b > 0].index, inplace=True) assert_frame_equal(df, expected) @@ -121,59 +121,63 @@ def test_drop_multiindex_not_lexsorted(self): # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') + index="a", columns=["b", "c"], values="d" + ) not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.drop('a', axis=1) + expected = lexsorted_df.drop("a", axis=1) with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.drop('a', axis=1) + result = not_lexsorted_df.drop("a", axis=1) tm.assert_frame_equal(result, expected) def test_drop_api_equivalence(self): # equivalence of the labels/axis and index/columns API's (GH12392) - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.drop('a') - res2 = df.drop(index='a') + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") tm.assert_frame_equal(res1, res2) - res1 = df.drop('d', 1) - res2 = df.drop(columns='d') + res1 = df.drop("d", 1) + res2 = df.drop(columns="d") tm.assert_frame_equal(res1, res2) - res1 = df.drop(labels='e', axis=1) - res2 = df.drop(columns='e') + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") tm.assert_frame_equal(res1, res2) - res1 = df.drop(['a'], axis=0) - res2 = df.drop(index=['a']) + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) tm.assert_frame_equal(res1, res2) - res1 = df.drop(['a'], axis=0).drop(['d'], axis=1) - res2 = df.drop(index=['a'], columns=['d']) + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], 
columns=["d"]) tm.assert_frame_equal(res1, res2) with pytest.raises(ValueError): - df.drop(labels='a', index='b') + df.drop(labels="a", index="b") with pytest.raises(ValueError): - df.drop(labels='a', columns='b') + df.drop(labels="a", columns="b") with pytest.raises(ValueError): df.drop(axis=1) @@ -182,25 +186,24 @@ def test_merge_join_different_levels(self): # GH 9455 # first dataframe - df1 = DataFrame(columns=['a', 'b'], data=[[1, 11], [0, 22]]) + df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]]) # second dataframe - columns = MultiIndex.from_tuples([('a', ''), ('c', 'c1')]) + columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")]) df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) # merge - columns = ['a', 'b', ('c', 'c1')] + columns = ["a", "b", ("c", "c1")] expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]]) with tm.assert_produces_warning(UserWarning): - result = pd.merge(df1, df2, on='a') + result = pd.merge(df1, df2, on="a") tm.assert_frame_equal(result, expected) # join, see discussion in GH 12219 - columns = ['a', 'b', ('a', ''), ('c', 'c1')] - expected = DataFrame(columns=columns, - data=[[1, 11, 0, 44], [0, 22, 1, 33]]) + columns = ["a", "b", ("a", ""), ("c", "c1")] + expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]]) with tm.assert_produces_warning(UserWarning): - result = df1.join(df2, on='a') + result = df1.join(df2, on="a") tm.assert_frame_equal(result, expected) def test_reindex(self, float_frame): @@ -266,69 +269,73 @@ def test_reindex(self, float_frame): assert result is not float_frame def test_reindex_nan(self): - df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]], - index=[2, np.nan, 1, 5], - columns=['joe', 'jim']) + df = pd.DataFrame( + [[1, 2], [3, 5], [7, 11], [9, 23]], + index=[2, np.nan, 1, 5], + columns=["joe", "jim"], + ) i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] assert_frame_equal(df.reindex(i), df.iloc[j]) - df.index = df.index.astype('object') + df.index = df.index.astype("object") assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) # GH10388 - df = pd.DataFrame({'other': ['a', 'b', np.nan, 'c'], - 'date': ['2015-03-22', np.nan, - '2012-01-08', np.nan], - 'amount': [2, 3, 4, 5]}) - - df['date'] = pd.to_datetime(df.date) - df['delta'] = (pd.to_datetime('2015-06-18') - df['date']).shift(1) - - left = df.set_index(['delta', 'other', 'date']).reset_index() - right = df.reindex(columns=['delta', 'other', 'date', 'amount']) + df = pd.DataFrame( + { + "other": ["a", "b", np.nan, "c"], + "date": ["2015-03-22", np.nan, "2012-01-08", np.nan], + "amount": [2, 3, 4, 5], + } + ) + + df["date"] = pd.to_datetime(df.date) + df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1) + + left = df.set_index(["delta", "other", "date"]).reset_index() + right = df.reindex(columns=["delta", "other", "date", "amount"]) assert_frame_equal(left, right) def test_reindex_name_remains(self): s = Series(np.random.rand(10)) df = DataFrame(s, index=np.arange(len(s))) - i = Series(np.arange(10), name='iname') + i = Series(np.arange(10), name="iname") df = df.reindex(i) - assert df.index.name == 'iname' + assert df.index.name == "iname" - df = df.reindex(Index(np.arange(10), name='tmpname')) - assert df.index.name == 'tmpname' + df = df.reindex(Index(np.arange(10), name="tmpname")) + assert df.index.name == "tmpname" s = Series(np.random.rand(10)) df = DataFrame(s.T, index=np.arange(len(s))) - i = Series(np.arange(10), name='iname') + i = Series(np.arange(10), 
name="iname") df = df.reindex(columns=i) - assert df.columns.name == 'iname' + assert df.columns.name == "iname" def test_reindex_int(self, int_frame): smaller = int_frame.reindex(int_frame.index[::2]) - assert smaller['A'].dtype == np.int64 + assert smaller["A"].dtype == np.int64 bigger = smaller.reindex(int_frame.index) - assert bigger['A'].dtype == np.float64 + assert bigger["A"].dtype == np.float64 - smaller = int_frame.reindex(columns=['A', 'B']) - assert smaller['A'].dtype == np.int64 + smaller = int_frame.reindex(columns=["A", "B"]) + assert smaller["A"].dtype == np.int64 def test_reindex_like(self, float_frame): - other = float_frame.reindex(index=float_frame.index[:10], - columns=['C', 'B']) + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) assert_frame_equal(other, float_frame.reindex_like(other)) def test_reindex_columns(self, float_frame): - new_frame = float_frame.reindex(columns=['A', 'B', 'E']) + new_frame = float_frame.reindex(columns=["A", "B", "E"]) - tm.assert_series_equal(new_frame['B'], float_frame['B']) - assert np.isnan(new_frame['E']).all() - assert 'C' not in new_frame + tm.assert_series_equal(new_frame["B"], float_frame["B"]) + assert np.isnan(new_frame["E"]).all() + assert "C" not in new_frame # Length zero new_frame = float_frame.reindex(columns=[]) @@ -337,55 +344,68 @@ def test_reindex_columns(self, float_frame): def test_reindex_columns_method(self): # GH 14992, reindexing over columns ignored method - df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], - index=[1, 2, 4], - columns=[1, 2, 4], - dtype=float) + df = DataFrame( + data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) # default method result = df.reindex(columns=range(6)) - expected = DataFrame(data=[[np.nan, 11, 12, np.nan, 13, np.nan], - [np.nan, 21, 22, np.nan, 23, np.nan], - [np.nan, 31, 32, np.nan, 33, np.nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + expected = DataFrame( + data=[ + [np.nan, 11, 12, np.nan, 13, np.nan], + [np.nan, 21, 22, np.nan, 23, np.nan], + [np.nan, 31, 32, np.nan, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) # method='ffill' - result = df.reindex(columns=range(6), method='ffill') - expected = DataFrame(data=[[np.nan, 11, 12, 12, 13, 13], - [np.nan, 21, 22, 22, 23, 23], - [np.nan, 31, 32, 32, 33, 33]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + result = df.reindex(columns=range(6), method="ffill") + expected = DataFrame( + data=[ + [np.nan, 11, 12, 12, 13, 13], + [np.nan, 21, 22, 22, 23, 23], + [np.nan, 31, 32, 32, 33, 33], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) # method='bfill' - result = df.reindex(columns=range(6), method='bfill') - expected = DataFrame(data=[[11, 11, 12, 13, 13, np.nan], - [21, 21, 22, 23, 23, np.nan], - [31, 31, 32, 33, 33, np.nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + result = df.reindex(columns=range(6), method="bfill") + expected = DataFrame( + data=[ + [11, 11, 12, 13, 13, np.nan], + [21, 21, 22, 23, 23, np.nan], + [31, 31, 32, 33, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) def test_reindex_axes(self): # GH 3317, reindexing by both axes loses freq of the index - df = DataFrame(np.ones((3, 3)), - index=[datetime(2012, 1, 1), - datetime(2012, 1, 2), - datetime(2012, 1, 3)], - columns=['a', 'b', 'c']) - 
time_freq = date_range('2012-01-01', '2012-01-03', freq='d') - some_cols = ['a', 'b'] + df = DataFrame( + np.ones((3, 3)), + index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], + columns=["a", "b", "c"], + ) + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq - seq_freq = df.reindex(index=time_freq).reindex( - columns=some_cols).index.freq + seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq assert index_freq == both_freq assert index_freq == seq_freq @@ -401,9 +421,9 @@ def test_reindex_fill_value(self): assert_frame_equal(result, expected) # axis=1 - result = df.reindex(columns=range(5), fill_value=0.) + result = df.reindex(columns=range(5), fill_value=0.0) expected = df.copy() - expected[4] = 0. + expected[4] = 0.0 assert_frame_equal(result, expected) result = df.reindex(columns=range(5), fill_value=0) @@ -411,13 +431,13 @@ def test_reindex_fill_value(self): expected[4] = 0 assert_frame_equal(result, expected) - result = df.reindex(columns=range(5), fill_value='foo') + result = df.reindex(columns=range(5), fill_value="foo") expected = df.copy() - expected[4] = 'foo' + expected[4] = "foo" assert_frame_equal(result, expected) # other dtypes - df['foo'] = 'foo' + df["foo"] = "foo" result = df.reindex(range(15), fill_value=0) expected = df.reindex(range(15)).fillna(0) assert_frame_equal(result, expected) @@ -442,57 +462,57 @@ def test_reindex_dups(self): def test_reindex_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - expected = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, - index=[0, 1, 3]) + expected = pd.DataFrame( + {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3] + ) result = df.reindex([0, 1, 3]) assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis=0) assert_frame_equal(result, expected) - result = df.reindex([0, 1, 3], axis='index') + result = df.reindex([0, 1, 3], axis="index") assert_frame_equal(result, expected) def test_reindex_positional_warns(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - expected = pd.DataFrame({"A": [1., 2], 'B': [4., 5], - "C": [np.nan, np.nan]}) + expected = pd.DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]}) with tm.assert_produces_warning(FutureWarning): - result = df.reindex([0, 1], ['A', 'B', 'C']) + result = df.reindex([0, 1], ["A", "B", "C"]) assert_frame_equal(result, expected) def test_reindex_axis_style_raises(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = pd.DataFrame({"A": [1, 2, 3], 'B': [4, 5, 6]}) + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex([0, 1], ['A'], axis=1) + df.reindex([0, 1], ["A"], axis=1) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex([0, 1], ['A'], axis='index') + df.reindex([0, 1], ["A"], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='index') + df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='columns') + df.reindex(index=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(columns=[0, 1], 
axis='columns') + df.reindex(columns=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], columns=[0, 1], axis='columns') + df.reindex(index=[0, 1], columns=[0, 1], axis="columns") - with pytest.raises(TypeError, match='Cannot specify all'): - df.reindex([0, 1], [0], ['A']) + with pytest.raises(TypeError, match="Cannot specify all"): + df.reindex([0, 1], [0], ["A"]) # Mixing styles with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='index') + df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='columns') + df.reindex(index=[0, 1], axis="columns") # Duplicates with pytest.raises(TypeError, match="multiple values"): @@ -501,37 +521,38 @@ def test_reindex_axis_style_raises(self): def test_reindex_single_named_indexer(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) - result = df.reindex([0, 1], columns=['A']) + result = df.reindex([0, 1], columns=["A"]) expected = pd.DataFrame({"A": [1, 2]}) assert_frame_equal(result, expected) def test_reindex_api_equivalence(self): # https://github.com/pandas-dev/pandas/issues/12392 # equivalence of the labels/axis and index/columns API's - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.reindex(['b', 'a']) - res2 = df.reindex(index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a']) - res4 = df.reindex(labels=['b', 'a'], axis=0) - res5 = df.reindex(['b', 'a'], axis=0) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(columns=['e', 'd']) - res2 = df.reindex(['e', 'd'], axis=1) - res3 = df.reindex(labels=['e', 'd'], axis=1) + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) with tm.assert_produces_warning(FutureWarning) as m: - res1 = df.reindex(['b', 'a'], ['e', 'd']) - assert 'reindex' in str(m[0].message) - res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], - axis=1) + res1 = df.reindex(["b", "a"], ["e", "d"]) + assert "reindex" in str(m[0].message) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) @@ -556,7 +577,7 @@ def test_align_float(self, float_frame): diff_b_vals = bf.reindex(diff_b).values assert (diff_a_vals == -1).all() - af, bf = float_frame.align(other, join='right', axis=0) + af, bf = float_frame.align(other, join="right", axis=0) tm.assert_index_equal(bf.columns, other.columns) tm.assert_index_equal(bf.index, other.index) tm.assert_index_equal(af.index, other.index) @@ -578,23 +599,25 @@ def test_align_float(self, float_frame): assert (diff_a_vals == -1).all() - af, bf = float_frame.align(other, join='inner', axis=1) + af, bf = float_frame.align(other, join="inner", axis=1) tm.assert_index_equal(bf.columns, other.columns) - 
af, bf = float_frame.align(other, join='inner', axis=1, method='pad') + af, bf = float_frame.align(other, join="inner", axis=1, method="pad") tm.assert_index_equal(bf.columns, other.columns) - af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=None) + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None + ) tm.assert_index_equal(bf.index, Index([])) - af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) # Try to align DataFrame to Series along bad axis with pytest.raises(ValueError): - float_frame.align(af.iloc[0, :3], join='inner', axis=2) + float_frame.align(af.iloc[0, :3], join="inner", axis=2) # align dataframe to series with broadcast or not idx = float_frame.index @@ -608,51 +631,56 @@ def test_align_float(self, float_frame): left, right = float_frame.align(s, broadcast_axis=1) tm.assert_index_equal(left.index, float_frame.index) expected = {c: s for c in float_frame.columns} - expected = DataFrame(expected, index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + expected, index=float_frame.index, columns=float_frame.columns + ) tm.assert_frame_equal(right, expected) # see gh-9558 - df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) - result = df[df['a'] == 2] - expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b']) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df[df["a"] == 2] + expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - result = df.where(df['a'] == 2, 0) - expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]}) + result = df.where(df["a"] == 2, 0) + expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) def test_align_int(self, int_frame): # test other non-float types - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = int_frame.align(other, join='inner', axis=1, method='pad') + af, bf = int_frame.align(other, join="inner", axis=1, method="pad") tm.assert_index_equal(bf.columns, other.columns) def test_align_mixed_type(self, float_string_frame): - af, bf = float_string_frame.align(float_string_frame, - join='inner', axis=1, method='pad') + af, bf = float_string_frame.align( + float_string_frame, join="inner", axis=1, method="pad" + ) tm.assert_index_equal(bf.columns, float_string_frame.columns) def test_align_mixed_float(self, mixed_float_frame): # mixed floats/ints - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = mixed_float_frame.align(other.iloc[:, 0], join='inner', - axis=1, method=None, fill_value=0) + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) def test_align_mixed_int(self, mixed_int_frame): - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = mixed_int_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) def _check_align(self, a, b, axis, 
fill_axis, how, method, limit=None): - aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, - fill_axis=fill_axis) + aa, ab = a.align( + b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis + ) join_index, join_columns = None, None @@ -673,10 +701,10 @@ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): assert_frame_equal(aa, ea) assert_frame_equal(ab, eb) - @pytest.mark.parametrize('meth', ['pad', 'bfill']) - @pytest.mark.parametrize('ax', [0, 1, None]) - @pytest.mark.parametrize('fax', [0, 1]) - @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) + @pytest.mark.parametrize("meth", ["pad", "bfill"]) + @pytest.mark.parametrize("ax", [0, 1, None]) + @pytest.mark.parametrize("fax", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) def test_align_fill_method(self, how, meth, ax, fax, float_frame): df = float_frame self._check_align_fill(df, how, meth, ax, fax) @@ -686,36 +714,36 @@ def _check_align_fill(self, frame, kind, meth, ax, fax): right = frame.iloc[2:, 6:] empty = frame.iloc[:0, :0] - self._check_align(left, right, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(left, right, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # empty left - self._check_align(empty, right, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(empty, right, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # empty right - self._check_align(left, empty, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(left, empty, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # both empty - self._check_align(empty, empty, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(empty, empty, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) def test_align_int_fill_bug(self): # GH #910 - X = np.arange(10 * 10, dtype='float64').reshape(10, 10) + X = np.arange(10 * 10, dtype="float64").reshape(10, 10) Y = np.ones((10, 1), dtype=int) df1 = DataFrame(X) - df1['0.X'] = Y.squeeze() + df1["0.X"] = Y.squeeze() df2 = df1.astype(float) @@ -727,15 +755,16 @@ def test_align_multiindex(self): # GH 10665 # same test cases as test_align_multiindex in test_series.py - midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], - names=('a', 'b', 'c')) - idx = pd.Index(range(2), name='b') - df1 = pd.DataFrame(np.arange(12, dtype='int64'), index=midx) - df2 = pd.DataFrame(np.arange(2, dtype='int64'), index=idx) + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) + df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) - res1l, res1r = df1.align(df2, 
join='left') - res2l, res2r = df2.align(df1, join='right') + res1l, res1r = df1.align(df2, join="left") + res2l, res2r = df2.align(df1, join="right") expl = df1 assert_frame_equal(expl, res1l) @@ -744,11 +773,12 @@ def test_align_multiindex(self): assert_frame_equal(expr, res1r) assert_frame_equal(expr, res2l) - res1l, res1r = df1.align(df2, join='right') - res2l, res2r = df2.align(df1, join='left') + res1l, res1r = df1.align(df2, join="right") + res2l, res2r = df2.align(df1, join="left") - exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], - names=('a', 'b', 'c')) + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) assert_frame_equal(expl, res1l) assert_frame_equal(expl, res2r) @@ -757,17 +787,16 @@ def test_align_multiindex(self): assert_frame_equal(expr, res2l) def test_align_series_combinations(self): - df = pd.DataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - s = pd.Series([1, 2, 4], index=list('ABD'), name='x') + df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = pd.Series([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], - index=list('ABCDE'), name='x') + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) @@ -779,111 +808,114 @@ def test_align_series_combinations(self): def test_filter(self, float_frame, float_string_frame): # Items - filtered = float_frame.filter(['A', 'B', 'E']) + filtered = float_frame.filter(["A", "B", "E"]) assert len(filtered.columns) == 2 - assert 'E' not in filtered + assert "E" not in filtered - filtered = float_frame.filter(['A', 'B', 'E'], axis='columns') + filtered = float_frame.filter(["A", "B", "E"], axis="columns") assert len(filtered.columns) == 2 - assert 'E' not in filtered + assert "E" not in filtered # Other axis idx = float_frame.index[0:4] - filtered = float_frame.filter(idx, axis='index') + filtered = float_frame.filter(idx, axis="index") expected = float_frame.reindex(index=idx) tm.assert_frame_equal(filtered, expected) # like fcopy = float_frame.copy() - fcopy['AA'] = 1 + fcopy["AA"] = 1 - filtered = fcopy.filter(like='A') + filtered = fcopy.filter(like="A") assert len(filtered.columns) == 2 - assert 'AA' in filtered + assert "AA" in filtered # like with ints in column names - df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) - filtered = df.filter(like='_') + df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) + filtered = df.filter(like="_") assert len(filtered.columns) == 2 # regex with ints in column names # from PR #10384 - df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C']) + df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"]) expected = DataFrame( - 0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object)) - filtered = df.filter(regex='^[0-9]+$') + 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) + ) + filtered = df.filter(regex="^[0-9]+$") tm.assert_frame_equal(filtered, expected) - expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1']) + expected = DataFrame(0.0, index=[0, 1, 
2], columns=[0, "0", 1, "1"]) # shouldn't remove anything - filtered = expected.filter(regex='^[0-9]+$') + filtered = expected.filter(regex="^[0-9]+$") tm.assert_frame_equal(filtered, expected) # pass in None - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter() - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter(items=None) - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter(axis=1) # test mutually exclusive arguments - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$', like='bbi') - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$', axis=1) - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$') - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], like='bbi', axis=0) - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], like='bbi') + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", like="bbi") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", axis=1) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi", axis=0) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi") # objects - filtered = float_string_frame.filter(like='foo') - assert 'foo' in filtered + filtered = float_string_frame.filter(like="foo") + assert "foo" in filtered # unicode columns, won't ascii-encode - df = float_frame.rename(columns={'B': '\u2202'}) - filtered = df.filter(like='C') - assert 'C' in filtered + df = float_frame.rename(columns={"B": "\u2202"}) + filtered = df.filter(like="C") + assert "C" in filtered def test_filter_regex_search(self, float_frame): fcopy = float_frame.copy() - fcopy['AA'] = 1 + fcopy["AA"] = 1 # regex - filtered = fcopy.filter(regex='[A]+') + filtered = fcopy.filter(regex="[A]+") assert len(filtered.columns) == 2 - assert 'AA' in filtered + assert "AA" in filtered # doesn't have to be at beginning - df = DataFrame({'aBBa': [1, 2], - 'BBaBB': [1, 2], - 'aCCa': [1, 2], - 'aCCaBB': [1, 2]}) + df = DataFrame( + {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} + ) - result = df.filter(regex='BB') - exp = df[[x for x in df.columns if 'BB' in x]] + result = df.filter(regex="BB") + exp = df[[x for x in df.columns if "BB" in x]] assert_frame_equal(result, exp) - @pytest.mark.parametrize('name,expected', [ - ('a', DataFrame({'a': [1, 2]})), - ('a', DataFrame({'a': [1, 2]})), - ('あ', DataFrame({'あ': [3, 4]})) - ]) + @pytest.mark.parametrize( + "name,expected", + [ + ("a", DataFrame({"a": [1, 2]})), + ("a", DataFrame({"a": [1, 2]})), + ("あ", DataFrame({"あ": [3, 4]})), + ], + ) def test_filter_unicode(self, name, expected): # GH13101 - df = DataFrame({'a': [1, 2], 'あ': [3, 4]}) + df = DataFrame({"a": [1, 2], "あ": [3, 4]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), 
expected) - @pytest.mark.parametrize('name', ['a', 'a']) + @pytest.mark.parametrize("name", ["a", "a"]) def test_filter_bytestring(self, name): # GH13101 - df = DataFrame({b'a': [1, 2], b'b': [3, 4]}) - expected = DataFrame({b'a': [1, 2]}) + df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) + expected = DataFrame({b"a": [1, 2]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), expected) @@ -894,7 +926,7 @@ def test_filter_corner(self): result = empty.filter([]) assert_frame_equal(result, empty) - result = empty.filter(like='foo') + result = empty.filter(like="foo") assert_frame_equal(result, empty) def test_take(self, float_frame): @@ -908,7 +940,7 @@ def test_take(self, float_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['D', 'B', 'C', 'A']] + expected = df.loc[:, ["D", "B", "C", "A"]] assert_frame_equal(result, expected, check_names=False) # negative indices @@ -924,7 +956,7 @@ def test_take(self, float_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['C', 'B', 'D']] + expected = df.loc[:, ["C", "B", "D"]] assert_frame_equal(result, expected, check_names=False) # illegal indices @@ -950,7 +982,7 @@ def test_take_mixed_type(self, float_string_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']] + expected = df.loc[:, ["foo", "B", "C", "A", "D"]] assert_frame_equal(result, expected) # negative indices @@ -963,7 +995,7 @@ def test_take_mixed_type(self, float_string_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['foo', 'B', 'D']] + expected = df.loc[:, ["foo", "B", "D"]] assert_frame_equal(result, expected) def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): @@ -977,13 +1009,13 @@ def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['B', 'C', 'A', 'D']] + expected = df.loc[:, ["B", "C", "A", "D"]] assert_frame_equal(result, expected) def test_reindex_boolean(self): - frame = DataFrame(np.ones((10, 2), dtype=bool), - index=np.arange(0, 20, 2), - columns=[0, 2]) + frame = DataFrame( + np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] + ) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ @@ -994,26 +1026,28 @@ def test_reindex_boolean(self): assert isna(reindexed[1]).all() def test_reindex_objects(self, float_string_frame): - reindexed = float_string_frame.reindex(columns=['foo', 'A', 'B']) - assert 'foo' in reindexed + reindexed = float_string_frame.reindex(columns=["foo", "A", "B"]) + assert "foo" in reindexed - reindexed = float_string_frame.reindex(columns=['A', 'B']) - assert 'foo' not in reindexed + reindexed = float_string_frame.reindex(columns=["A", "B"]) + assert "foo" not in reindexed def test_reindex_corner(self, int_frame): - index = Index(['a', 'b', 'c']) + index = Index(["a", "b", "c"]) dm = DataFrame({}).reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) tm.assert_index_equal(reindexed.columns, index) # ints are weird - smaller = int_frame.reindex(columns=['A', 'B', 'E']) - assert smaller['E'].dtype == np.float64 + smaller = int_frame.reindex(columns=["A", "B", "E"]) + assert smaller["E"].dtype == np.float64 def test_reindex_with_nans(self): - df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], - columns=['a', 'b'], - index=[100.0, 101.0, np.nan, 102.0, 103.0]) + df = DataFrame( + [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], 
[9, 10]], + columns=["a", "b"], + index=[100.0, 101.0, np.nan, 102.0, 103.0], + ) result = df.reindex(index=[101.0, 102.0, 103.0]) expected = df.iloc[[1, 3, 4]] @@ -1049,60 +1083,63 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) - df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c']) + df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"]) - result = df.reindex(index=[0, 1], columns=['a', 'b']) - expected = df.reindex([0, 1]).reindex(columns=['a', 'b']) + result = df.reindex(index=[0, 1], columns=["a", "b"]) + expected = df.reindex([0, 1]).reindex(columns=["a", "b"]) assert_frame_equal(result, expected) def test_reindex_multi_categorical_time(self): # https://github.com/pandas-dev/pandas/issues/21390 midx = pd.MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(date_range("2012-01-01", periods=3, freq='H'))]) - df = pd.DataFrame({'a': range(len(midx))}, index=midx) + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) + df = pd.DataFrame({"a": range(len(midx))}, index=midx) df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] result = df2.reindex(midx) - expected = pd.DataFrame( - {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + expected = pd.DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) assert_frame_equal(result, expected) data = [[1, 2, 3], [1, 2, 3]] - @pytest.mark.parametrize('actual', [ - DataFrame(data=data, index=['a', 'a']), - DataFrame(data=data, index=['a', 'b']), - DataFrame(data=data, index=['a', 'b']).set_index([0, 1]), - DataFrame(data=data, index=['a', 'a']).set_index([0, 1]) - ]) + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) def test_raise_on_drop_duplicate_index(self, actual): # issue 19186 level = 0 if isinstance(actual.index, MultiIndex) else None with pytest.raises(KeyError): - actual.drop('c', level=level, axis=0) + actual.drop("c", level=level, axis=0) with pytest.raises(KeyError): - actual.T.drop('c', level=level, axis=1) - expected_no_err = actual.drop('c', axis=0, level=level, - errors='ignore') + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") assert_frame_equal(expected_no_err, actual) - expected_no_err = actual.T.drop('c', axis=1, level=level, - errors='ignore') + expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") assert_frame_equal(expected_no_err.T, actual) - @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) - @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) def test_drop_empty_list(self, index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] frame = pd.DataFrame(index=index).drop(drop_labels) tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) - @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) - @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) def test_drop_non_empty_list(self, index, drop_labels): # GH 21494 - with pytest.raises(KeyError, match='not found in axis'): + with pytest.raises(KeyError, match="not found in 
axis"): pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index f1cbd7763474ea..37b0d61ee31d9b 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,14 +7,23 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, Timestamp, compat, date_range, - option_context) + Categorical, + DataFrame, + Series, + Timestamp, + compat, + date_range, + option_context, +) from pandas.core.arrays import IntervalArray, integer_array from pandas.core.internals import ObjectBlock from pandas.core.internals.blocks import IntBlock import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) # Segregated collection of methods that require the BlockManager internal data # structure @@ -25,17 +34,17 @@ def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz column inplace invalidates the # `freq` attribute on the underlying DatetimeIndex - dti = date_range('20130101', periods=3, tz='US/Eastern') + dti = date_range("20130101", periods=3, tz="US/Eastern") ts = dti[1] - df = DataFrame({'B': dti}) - assert df['B']._values.freq == 'D' + df = DataFrame({"B": dti}) + assert df["B"]._values.freq == "D" df.iloc[1, 0] = pd.NaT - assert df['B']._values.freq is None + assert df["B"]._values.freq is None # check that the DatetimeIndex was not altered in place - assert dti.freq == 'D' + assert dti.freq == "D" assert dti[1] == ts def test_cast_internals(self, float_frame): @@ -48,7 +57,7 @@ def test_cast_internals(self, float_frame): assert_frame_equal(casted, expected) def test_consolidate(self, float_frame): - float_frame['E'] = 7. + float_frame["E"] = 7.0 consolidated = float_frame._consolidate() assert len(consolidated._data.blocks) == 1 @@ -57,7 +66,7 @@ def test_consolidate(self, float_frame): assert recons is not consolidated tm.assert_frame_equal(recons, consolidated) - float_frame['F'] = 8. + float_frame["F"] = 8.0 assert len(float_frame._data.blocks) == 3 float_frame._consolidate(inplace=True) @@ -67,11 +76,11 @@ def test_consolidate_inplace(self, float_frame): frame = float_frame.copy() # noqa # triggers in-place consolidation - for letter in range(ord('A'), ord('Z')): + for letter in range(ord("A"), ord("Z")): float_frame[chr(letter)] = chr(letter) def test_values_consolidate(self, float_frame): - float_frame['E'] = 7. + float_frame["E"] = 7.0 assert not float_frame._data.is_consolidated() _ = float_frame.values # noqa assert float_frame._data.is_consolidated() @@ -81,12 +90,12 @@ def test_modify_values(self, float_frame): assert (float_frame.values[5] == 5).all() # unconsolidated - float_frame['E'] = 7. + float_frame["E"] = 7.0 float_frame.values[6] = 6 assert (float_frame.values[6] == 6).all() def test_boolean_set_uncons(self, float_frame): - float_frame['E'] = 7. 
+ float_frame["E"] = 7.0 expected = float_frame.values.copy() expected[expected > 1] = 2 @@ -95,166 +104,174 @@ def test_boolean_set_uncons(self, float_frame): assert_almost_equal(expected, float_frame.values) def test_values_numeric_cols(self, float_frame): - float_frame['foo'] = 'bar' + float_frame["foo"] = "bar" - values = float_frame[['A', 'B', 'C', 'D']].values + values = float_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 def test_values_lcd(self, mixed_float_frame, mixed_int_frame): # mixed lcd - values = mixed_float_frame[['A', 'B', 'C', 'D']].values + values = mixed_float_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 - values = mixed_float_frame[['A', 'B', 'C']].values + values = mixed_float_frame[["A", "B", "C"]].values assert values.dtype == np.float32 - values = mixed_float_frame[['C']].values + values = mixed_float_frame[["C"]].values assert values.dtype == np.float16 # GH 10364 # B uint64 forces float because there are other signed int types - values = mixed_int_frame[['A', 'B', 'C', 'D']].values + values = mixed_int_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 - values = mixed_int_frame[['A', 'D']].values + values = mixed_int_frame[["A", "D"]].values assert values.dtype == np.int64 # B uint64 forces float because there are other signed int types - values = mixed_int_frame[['A', 'B', 'C']].values + values = mixed_int_frame[["A", "B", "C"]].values assert values.dtype == np.float64 # as B and C are both unsigned, no forcing to float is needed - values = mixed_int_frame[['B', 'C']].values + values = mixed_int_frame[["B", "C"]].values assert values.dtype == np.uint64 - values = mixed_int_frame[['A', 'C']].values + values = mixed_int_frame[["A", "C"]].values assert values.dtype == np.int32 - values = mixed_int_frame[['C', 'D']].values + values = mixed_int_frame[["C", "D"]].values assert values.dtype == np.int64 - values = mixed_int_frame[['A']].values + values = mixed_int_frame[["A"]].values assert values.dtype == np.int32 - values = mixed_int_frame[['C']].values + values = mixed_int_frame[["C"]].values assert values.dtype == np.uint8 def test_constructor_with_convert(self): # this is actually mostly a test of lib.maybe_convert_objects # #2845 - df = DataFrame({'A': [2 ** 63 - 1]}) - result = df['A'] - expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A') + df = DataFrame({"A": [2 ** 63 - 1]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63 - 1], np.int64), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2 ** 63]}) - result = df['A'] - expected = Series(np.asarray([2 ** 63], np.uint64), name='A') + df = DataFrame({"A": [2 ** 63]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63], np.uint64), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [datetime(2005, 1, 1), True]}) - result = df['A'] - expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_), - name='A') + df = DataFrame({"A": [datetime(2005, 1, 1), True]}) + result = df["A"] + expected = Series( + np.asarray([datetime(2005, 1, 1), True], np.object_), name="A" + ) assert_series_equal(result, expected) - df = DataFrame({'A': [None, 1]}) - result = df['A'] - expected = Series(np.asarray([np.nan, 1], np.float_), name='A') + df = DataFrame({"A": [None, 1]}) + result = df["A"] + expected = Series(np.asarray([np.nan, 1], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0, 2]}) - result = df['A'] - expected = Series(np.asarray([1.0, 2], 
np.float_), name='A') + df = DataFrame({"A": [1.0, 2]}) + result = df["A"] + expected = Series(np.asarray([1.0, 2], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, 3]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, 3]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, 3.0]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, 3.0]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, True]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, True]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0, None]}) - result = df['A'] - expected = Series(np.asarray([1.0, np.nan], np.float_), name='A') + df = DataFrame({"A": [1.0, None]}) + result = df["A"] + expected = Series(np.asarray([1.0, np.nan], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, None]}) - result = df['A'] - expected = Series(np.asarray( - [1.0 + 2.0j, np.nan], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, None]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2.0, 1, True, None]}) - result = df['A'] - expected = Series(np.asarray( - [2.0, 1, True, None], np.object_), name='A') + df = DataFrame({"A": [2.0, 1, True, None]}) + result = df["A"] + expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]}) - result = df['A'] - expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1), - None], np.object_), name='A') + df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]}) + result = df["A"] + expected = Series( + np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A" + ) assert_series_equal(result, expected) def test_construction_with_mixed(self, float_string_frame): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround - data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 1)]] + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] df = DataFrame(data) # check dtypes result = df.dtypes - expected = Series({'datetime64[ns]': 3}) + expected = Series({"datetime64[ns]": 3}) # mixed-type frames - float_string_frame['datetime'] = datetime.now() - float_string_frame['timedelta'] = timedelta(days=1, seconds=1) - assert float_string_frame['datetime'].dtype == 'M8[ns]' - assert float_string_frame['timedelta'].dtype == 'm8[ns]' + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + assert float_string_frame["datetime"].dtype == "M8[ns]" + assert float_string_frame["timedelta"].dtype == "m8[ns]" result = float_string_frame.dtypes - expected = 
Series([np.dtype('float64')] * 4 + - [np.dtype('object'), - np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]')], - index=list('ABCD') + ['foo', 'datetime', - 'timedelta']) + expected = Series( + [np.dtype("float64")] * 4 + + [ + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + ], + index=list("ABCD") + ["foo", "datetime", "timedelta"], + ) assert_series_equal(result, expected) def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64 - arr = np.array([1, 2, 3], dtype='timedelta64[s]') + arr = np.array([1, 2, 3], dtype="timedelta64[s]") df = DataFrame(index=range(3)) - df['A'] = arr - expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3, - freq='s')}, - index=range(3)) + df["A"] = arr + expected = DataFrame( + {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) + ) assert_frame_equal(df, expected) - expected = DataFrame({ - 'dt1': Timestamp('20130101'), - 'dt2': date_range('20130101', periods=3), - # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), - }, index=range(3)) + expected = DataFrame( + { + "dt1": Timestamp("20130101"), + "dt2": date_range("20130101", periods=3), + # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), + }, + index=range(3), + ) df = DataFrame(index=range(3)) - df['dt1'] = np.datetime64('2013-01-01') - df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'], - dtype='datetime64[D]') + df["dt1"] = np.datetime64("2013-01-01") + df["dt2"] = np.array( + ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" + ) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -266,36 +283,29 @@ def test_constructor_compound_dtypes(self): # compound dtypes should raise not-implementederror def f(dtype): - data = list(itertools.repeat((datetime(2001, 1, 1), - "aa", 20), 9)) - return DataFrame(data=data, - columns=["A", "B", "C"], - dtype=dtype) - - msg = ("compound dtypes are not implemented in the DataFrame" - " constructor") + data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9)) + return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype) + + msg = "compound dtypes are not implemented in the DataFrame" " constructor" with pytest.raises(NotImplementedError, match=msg): - f([("A", "datetime64[h]"), - ("B", "str"), - ("C", "int32")]) + f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) # these work (though results may be unexpected) - f('int64') - f('float64') + f("int64") + f("float64") # 10822 # invalid error message on dt inference if not compat.is_platform_windows(): - f('M8[ns]') + f("M8[ns]") def test_equals_different_blocks(self): # GH 9330 - df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], - "C": ["w", "z"]}) + df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] # this assert verifies that the above operations have # induced a block rearrangement - assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype) + assert df0._data.blocks[0].dtype != df1._data.blocks[0].dtype # do the real tests assert_frame_equal(df0, df1) @@ -310,8 +320,7 @@ def test_copy_blocks(self, float_frame): # use the default copy=True, change a column # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): blocks = df.as_blocks() for dtype, _df in blocks.items(): if column in _df: @@ -328,8 
+337,7 @@ def test_no_copy_blocks(self, float_frame): # use the copy=False, change a column # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): blocks = df.as_blocks(copy=False) for dtype, _df in blocks.items(): if column in _df: @@ -340,8 +348,8 @@ def test_no_copy_blocks(self, float_frame): def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() - cop['E'] = cop['A'] - assert 'E' not in float_frame + cop["E"] = cop["A"] + assert "E" not in float_frame # copy objects copy = float_string_frame.copy() @@ -379,21 +387,20 @@ def test_consolidate_datetime64(self): ser_starting = df.starting ser_starting.index = ser_starting.values - ser_starting = ser_starting.tz_localize('US/Eastern') - ser_starting = ser_starting.tz_convert('UTC') - ser_starting.index.name = 'starting' + ser_starting = ser_starting.tz_localize("US/Eastern") + ser_starting = ser_starting.tz_convert("UTC") + ser_starting.index.name = "starting" ser_ending = df.ending ser_ending.index = ser_ending.values - ser_ending = ser_ending.tz_localize('US/Eastern') - ser_ending = ser_ending.tz_convert('UTC') - ser_ending.index.name = 'ending' + ser_ending = ser_ending.tz_localize("US/Eastern") + ser_ending = ser_ending.tz_convert("UTC") + ser_ending.index.name = "ending" df.starting = ser_starting.index df.ending = ser_ending.index - tm.assert_index_equal(pd.DatetimeIndex( - df.starting), ser_starting.index) + tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index) tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index) def test_is_mixed_type(self, float_frame, float_string_frame): @@ -405,40 +412,50 @@ def test_get_numeric_data(self): intname = np.dtype(np.int_).name # noqa floatname = np.dtype(np.float_).name # noqa - datetime64name = np.dtype('M8[ns]').name + datetime64name = np.dtype("M8[ns]").name objectname = np.dtype(np.object_).name - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'f': Timestamp('20010102')}, - index=np.arange(10)) + df = DataFrame( + {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, + index=np.arange(10), + ) result = df.dtypes - expected = Series([np.dtype('float64'), - np.dtype('int64'), - np.dtype(objectname), - np.dtype(datetime64name)], - index=['a', 'b', 'c', 'f']) + expected = Series( + [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype(objectname), + np.dtype(datetime64name), + ], + index=["a", "b", "c", "f"], + ) assert_series_equal(result, expected) - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'd': np.array([1.] 
* 10, dtype='float32'), - 'e': np.array([1] * 10, dtype='int32'), - 'f': np.array([1] * 10, dtype='int16'), - 'g': Timestamp('20010102')}, - index=np.arange(10)) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "d": np.array([1.0] * 10, dtype="float32"), + "e": np.array([1] * 10, dtype="int32"), + "f": np.array([1] * 10, dtype="int16"), + "g": Timestamp("20010102"), + }, + index=np.arange(10), + ) result = df._get_numeric_data() - expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']] + expected = df.loc[:, ["a", "b", "d", "e", "f"]] assert_frame_equal(result, expected) - only_obj = df.loc[:, ['c', 'g']] + only_obj = df.loc[:, ["c", "g"]] result = only_obj._get_numeric_data() expected = df.loc[:, []] assert_frame_equal(result, expected) - df = DataFrame.from_dict( - {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]}) + df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) result = df._get_numeric_data() - expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]}) + expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) assert_frame_equal(result, expected) df = result.copy() @@ -448,13 +465,16 @@ def test_get_numeric_data(self): def test_get_numeric_data_extension_dtype(self): # GH 22290 - df = DataFrame({ - 'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'), - 'B': Categorical(list('abcabc')), - 'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'), - 'D': IntervalArray.from_breaks(range(7))}) + df = DataFrame( + { + "A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"), + "B": Categorical(list("abcabc")), + "C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"), + "D": IntervalArray.from_breaks(range(7)), + } + ) result = df._get_numeric_data() - expected = df.loc[:, ['A', 'C']] + expected = df.loc[:, ["A", "C"]] assert_frame_equal(result, expected) def test_convert_objects(self, float_string_frame): @@ -462,100 +482,114 @@ def test_convert_objects(self, float_string_frame): oops = float_string_frame.T.T converted = oops._convert(datetime=True) assert_frame_equal(converted, float_string_frame) - assert converted['A'].dtype == np.float64 + assert converted["A"].dtype == np.float64 # force numeric conversion - float_string_frame['H'] = '1.' - float_string_frame['I'] = '1' + float_string_frame["H"] = "1." + float_string_frame["I"] = "1" # add in some items that will be nan length = len(float_string_frame) - float_string_frame['J'] = '1.' - float_string_frame['K'] = '1' - float_string_frame.loc[0:5, ['J', 'K']] = 'garbled' + float_string_frame["J"] = "1." 
+ float_string_frame["K"] = "1" + float_string_frame.loc[0:5, ["J", "K"]] = "garbled" converted = float_string_frame._convert(datetime=True, numeric=True) - assert converted['H'].dtype == 'float64' - assert converted['I'].dtype == 'int64' - assert converted['J'].dtype == 'float64' - assert converted['K'].dtype == 'float64' - assert len(converted['J'].dropna()) == length - 5 - assert len(converted['K'].dropna()) == length - 5 + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + assert converted["J"].dtype == "float64" + assert converted["K"].dtype == "float64" + assert len(converted["J"].dropna()) == length - 5 + assert len(converted["K"].dropna()) == length - 5 # via astype converted = float_string_frame.copy() - converted['H'] = converted['H'].astype('float64') - converted['I'] = converted['I'].astype('int64') - assert converted['H'].dtype == 'float64' - assert converted['I'].dtype == 'int64' + converted["H"] = converted["H"].astype("float64") + converted["I"] = converted["I"].astype("int64") + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" # via astype, but errors converted = float_string_frame.copy() - with pytest.raises(ValueError, match='invalid literal'): - converted['H'].astype('int32') + with pytest.raises(ValueError, match="invalid literal"): + converted["H"].astype("int32") # mixed in a single column - df = DataFrame(dict(s=Series([1, 'na', 3, 4]))) + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) result = df._convert(datetime=True, numeric=True) expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) assert_frame_equal(result, expected) def test_convert_objects_no_conversion(self): - mixed1 = DataFrame( - {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']}) + mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) mixed2 = mixed1._convert(datetime=True) assert_frame_equal(mixed1, mixed2) def test_infer_objects(self): # GH 11221 - df = DataFrame({'a': ['a', 1, 2, 3], - 'b': ['b', 2.0, 3.0, 4.1], - 'c': ['c', datetime(2016, 1, 1), - datetime(2016, 1, 2), - datetime(2016, 1, 3)], - 'd': [1, 2, 3, 'd']}, - columns=['a', 'b', 'c', 'd']) + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) df = df.iloc[1:].infer_objects() - assert df['a'].dtype == 'int64' - assert df['b'].dtype == 'float64' - assert df['c'].dtype == 'M8[ns]' - assert df['d'].dtype == 'object' - - expected = DataFrame({'a': [1, 2, 3], - 'b': [2.0, 3.0, 4.1], - 'c': [datetime(2016, 1, 1), - datetime(2016, 1, 2), - datetime(2016, 1, 3)], - 'd': [2, 3, 'd']}, - columns=['a', 'b', 'c', 'd']) + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) # reconstruct frame to verify inference is same tm.assert_frame_equal(df.reset_index(drop=True), expected) def test_stale_cached_series_bug_473(self): # this is chained, but ok - with option_context('chained_assignment', None): - Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'), - columns=('e', 'f', 'g', 'h')) + with option_context("chained_assignment", None): + Y = DataFrame( + np.random.random((4, 4)), + 
index=("a", "b", "c", "d"), + columns=("e", "f", "g", "h"), + ) repr(Y) - Y['e'] = Y['e'].astype('object') - Y['g']['c'] = np.NaN + Y["e"] = Y["e"].astype("object") + Y["g"]["c"] = np.NaN repr(Y) result = Y.sum() # noqa - exp = Y['g'].sum() # noqa - assert pd.isna(Y['g']['c']) + exp = Y["g"].sum() # noqa + assert pd.isna(Y["g"]["c"]) def test_get_X_columns(self): # numeric and object columns - df = DataFrame({'a': [1, 2, 3], - 'b': [True, False, True], - 'c': ['foo', 'bar', 'baz'], - 'd': [None, None, None], - 'e': [3.14, 0.577, 2.773]}) + df = DataFrame( + { + "a": [1, 2, 3], + "b": [True, False, True], + "c": ["foo", "bar", "baz"], + "d": [None, None, None], + "e": [3.14, 0.577, 2.773], + } + ) - tm.assert_index_equal(df._get_numeric_data().columns, - pd.Index(['a', 'b', 'e'])) + tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"])) def test_strange_column_corruption_issue(self): # (wesm) Unclear how exactly this is related to internal matters @@ -591,10 +625,15 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 - df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd']}) - df['c'] = pd.array([1, 2, None, 3]) - df2 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd'], - 'c': pd.array([1, 2, None, 3])}) - assert type(df['c']._data.blocks[0]) == ObjectBlock - assert type(df2['c']._data.blocks[0]) == ObjectBlock + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = pd.array([1, 2, None, 3]) + df2 = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "c": pd.array([1, 2, None, 3]), + } + ) + assert type(df["c"]._data.blocks[0]) == ObjectBlock + assert type(df2["c"]._data.blocks[0]) == ObjectBlock assert_frame_equal(df, df2) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index c1d057da91b8f7..e38d214eadeb6f 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -10,25 +10,27 @@ class TestDataFrameConcatCommon: - def test_concat_multiple_frames_dtypes(self): # GH 2759 - A = DataFrame(data=np.ones((10, 2)), columns=[ - 'foo', 'bar'], dtype=np.float64) + A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) results = pd.concat((A, B), axis=1).dtypes - expected = Series([np.dtype('float64')] * 2 + - [np.dtype('float32')] * 2, - index=['foo', 'bar', 0, 1]) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2, + index=["foo", "bar", 0, 1], + ) assert_series_equal(results, expected) - @pytest.mark.parametrize('data', [ - pd.date_range('2000', periods=4), - pd.date_range('2000', periods=4, tz="US/Central"), - pd.period_range('2000', periods=4), - pd.timedelta_range(0, periods=4), - ]) + @pytest.mark.parametrize( + "data", + [ + pd.date_range("2000", periods=4), + pd.date_range("2000", periods=4, tz="US/Central"), + pd.period_range("2000", periods=4), + pd.timedelta_range(0, periods=4), + ], + ) def test_combine_datetlike_udf(self, data): # https://github.com/pandas-dev/pandas/issues/23079 df = pd.DataFrame({"A": data}) @@ -44,9 +46,9 @@ def combiner(a, b): def test_concat_multiple_tzs(self): # GH 12467 # combining datetime tz-aware and naive DataFrames - ts1 = Timestamp('2015-01-01', tz=None) - ts2 = Timestamp('2015-01-01', tz='UTC') - ts3 = Timestamp('2015-01-01', tz='EST') + ts1 = Timestamp("2015-01-01", tz=None) + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 
= Timestamp("2015-01-01", tz="EST") df1 = DataFrame(dict(time=[ts1])) df2 = DataFrame(dict(time=[ts2])) @@ -65,17 +67,23 @@ def test_concat_multiple_tzs(self): assert_frame_equal(results, expected) @pytest.mark.parametrize( - 't1', + "t1", [ - '2015-01-01', - pytest.param(pd.NaT, marks=pytest.mark.xfail( - reason='GH23037 incorrect dtype when concatenating'))]) + "2015-01-01", + pytest.param( + pd.NaT, + marks=pytest.mark.xfail( + reason="GH23037 incorrect dtype when concatenating" + ), + ), + ], + ) def test_concat_tz_NaT(self, t1): # GH 22796 # Concating tz-aware multicolumn DataFrames - ts1 = Timestamp(t1, tz='UTC') - ts2 = Timestamp('2015-01-01', tz='UTC') - ts3 = Timestamp('2015-01-01', tz='UTC') + ts1 = Timestamp(t1, tz="UTC") + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="UTC") df1 = DataFrame([[ts1, ts2]]) df2 = DataFrame([[ts3]]) @@ -91,45 +99,53 @@ def test_concat_tz_not_aligned(self): a = pd.DataFrame({"A": ts}) b = pd.DataFrame({"A": ts, "B": ts}) result = pd.concat([a, b], sort=True, ignore_index=True) - expected = pd.DataFrame({"A": list(ts) + list(ts), - "B": [pd.NaT, pd.NaT] + list(ts)}) + expected = pd.DataFrame( + {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} + ) assert_frame_equal(result, expected) def test_concat_tuple_keys(self): # GH 14438 - df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB')) - df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB')) - results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')]) + df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB")) + df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) + results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) expected = pd.DataFrame( - {'A': {('bee', 'bah', 0): 1.0, - ('bee', 'bah', 1): 1.0, - ('bee', 'boo', 0): 2.0, - ('bee', 'boo', 1): 2.0, - ('bee', 'boo', 2): 2.0}, - 'B': {('bee', 'bah', 0): 1.0, - ('bee', 'bah', 1): 1.0, - ('bee', 'boo', 0): 2.0, - ('bee', 'boo', 1): 2.0, - ('bee', 'boo', 2): 2.0}}) + { + "A": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + "B": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + } + ) assert_frame_equal(results, expected) def test_append_series_dict(self): - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) series = df.loc[4] - msg = 'Indexes have overlapping values' + msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): df.append(series, verify_integrity=True) series.name = None - msg = 'Can only append a Series if ignore_index=True' + msg = "Can only append a Series if ignore_index=True" with pytest.raises(TypeError, match=msg): df.append(series, verify_integrity=True) result = df.append(series[::-1], ignore_index=True) - expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T, - ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True + ) assert_frame_equal(result, expected) # dict @@ -137,8 +153,9 @@ def test_append_series_dict(self): assert_frame_equal(result, expected) result = df.append(series[::-1][:3], ignore_index=True) - expected = df.append(DataFrame({0: series[::-1][:3]}).T, - ignore_index=True, sort=True) + expected = df.append( + DataFrame({0: series[::-1][:3]}).T, ignore_index=True, 
sort=True + ) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set @@ -149,8 +166,7 @@ def test_append_series_dict(self): assert_frame_equal(result, expected) def test_append_list_of_series_dicts(self): - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [x.to_dict() for idx, x in df.iterrows()] @@ -159,8 +175,10 @@ def test_append_list_of_series_dicts(self): assert_frame_equal(result, expected) # different columns - dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, - {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + dicts = [ + {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, + {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, + ] result = df.append(dicts, ignore_index=True, sort=True) expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected) @@ -170,10 +188,9 @@ def test_append_missing_cols(self): # exercise the conditional branch in append method where the data # to be appended is a list and does not contain all columns that are in # the target DataFrame - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - dicts = [{'foo': 9}, {'bar': 10}] + dicts = [{"foo": 9}, {"bar": 10}] with tm.assert_produces_warning(None): result = df.append(dicts, ignore_index=True, sort=True) @@ -197,14 +214,14 @@ def test_append_empty_dataframe(self): assert_frame_equal(result, expected) # Empty df with columns append empty df - df1 = DataFrame(columns=['bar', 'foo']) + df1 = DataFrame(columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) # Non-Empty df with columns append empty df - df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo']) + df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() @@ -216,152 +233,151 @@ def test_append_dtypes(self): # row appends of different dtypes (so need to do by-item) # can sometimes infer the correct type - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(5)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': 'foo'}, index=range(1, 2)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) result = df1.append(df2) - expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']}) + expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': np.nan}, index=range(1, 2)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')}) + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': np.nan}, index=range(1, 2), dtype=object) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = 
DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame( - {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')}) + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': np.nan}, index=range(1)) - df2 = DataFrame({'bar': Timestamp('20130101')}, index=range(1, 2)) + df1 = DataFrame({"bar": np.nan}, index=range(1)) + df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')}) + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': 1}, index=range(1, 2), dtype=object) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) result = df1.append(df2) - expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])}) + expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) assert_frame_equal(result, expected) def test_update(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) df.update(other) - expected = DataFrame([[1.5, np.nan, 3], - [3.6, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) assert_frame_equal(df, expected) def test_update_dtypes(self): # gh 3016 - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) df.update(other) - expected = DataFrame([[45., 45., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + expected = DataFrame( + [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) assert_frame_equal(df, expected) def test_update_nooverwrite(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) df.update(other, overwrite=False) - expected = DataFrame([[1.5, np.nan, 3], - [1.5, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 3.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) assert_frame_equal(df, expected) def test_update_filtered(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) 
df.update(other, filter_func=lambda x: x > 2) - expected = DataFrame([[1.5, np.nan, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) assert_frame_equal(df, expected) - @pytest.mark.parametrize('bad_kwarg, exception, msg', [ - # errors must be 'ignore' or 'raise' - ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), - ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') - ]) + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): - df = DataFrame([[1.5, 1, 3.]]) + df = DataFrame([[1.5, 1, 3.0]]) with pytest.raises(exception, match=msg): df.update(df, **bad_kwarg) def test_update_raise_on_overlap(self): - df = DataFrame([[1.5, 1, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[2., np.nan], - [np.nan, 7]], index=[1, 3], columns=[1, 2]) + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) with pytest.raises(ValueError, match="Data overlaps"): - df.update(other, errors='raise') + df.update(other, errors="raise") - @pytest.mark.parametrize('raise_conflict', [True, False]) + @pytest.mark.parametrize("raise_conflict", [True, False]) def test_update_deprecation(self, raise_conflict): - df = DataFrame([[1.5, 1, 3.]]) + df = DataFrame([[1.5, 1, 3.0]]) other = DataFrame() with tm.assert_produces_warning(FutureWarning): df.update(other, raise_conflict=raise_conflict) def test_update_from_non_df(self): - d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])} + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} df = DataFrame(d) - d['a'] = Series([5, 6, 7, 8]) + d["a"] = Series([5, 6, 7, 8]) df.update(d) expected = DataFrame(d) assert_frame_equal(df, expected) - d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]} + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} df = DataFrame(d) - d['a'] = [5, 6, 7, 8] + d["a"] = [5, 6, 7, 8] df.update(d) expected = DataFrame(d) @@ -370,93 +386,118 @@ def test_update_from_non_df(self): def test_update_datetime_tz(self): # GH 25807 - result = DataFrame([pd.Timestamp('2019', tz='UTC')]) + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) result.update(result) - expected = DataFrame([pd.Timestamp('2019', tz='UTC')]) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) assert_frame_equal(result, expected) def test_join_str_datetime(self): - str_dates = ['20120209', '20120222'] + str_dates = ["20120209", "20120222"] dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] - A = DataFrame(str_dates, index=range(2), columns=['aa']) + A = DataFrame(str_dates, index=range(2), columns=["aa"]) C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) - tst = A.join(C, on='aa') + tst = A.join(C, on="aa") assert len(tst.columns) == 3 def test_join_multiindex_leftright(self): # GH 10741 - df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908], - ['a', 'z', 0.563634], ['b', 'x', -0.353756], - ['b', 'y', 0.368062], ['b', 'z', -1.721840], - ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]], - columns=['first', 'second', 'value1']) - .set_index(['first', 'second'])) - - df2 = 
(pd.DataFrame([['a', 10], ['b', 20]], - columns=['first', 'value2']) - .set_index(['first'])) - - exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], - [-0.353756, 20], [0.368062, 20], - [-1.721840, 20], - [1.000000, np.nan], [2.000000, np.nan], - [3.000000, np.nan]], - index=df1.index, columns=['value1', 'value2']) + df1 = pd.DataFrame( + [ + ["a", "x", 0.471780], + ["a", "y", 0.774908], + ["a", "z", 0.563634], + ["b", "x", -0.353756], + ["b", "y", 0.368062], + ["b", "z", -1.721840], + ["c", "x", 1], + ["c", "y", 2], + ["c", "z", 3], + ], + columns=["first", "second", "value1"], + ).set_index(["first", "second"]) + + df2 = pd.DataFrame( + [["a", 10], ["b", 20]], columns=["first", "value2"] + ).set_index(["first"]) + + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + [1.000000, np.nan], + [2.000000, np.nan], + [3.000000, np.nan], + ], + index=df1.index, + columns=["value1", "value2"], + ) # these must be the same results (but columns are flipped) - assert_frame_equal(df1.join(df2, how='left'), exp) - assert_frame_equal(df2.join(df1, how='right'), - exp[['value2', 'value1']]) - - exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']], - names=['first', 'second']) - exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], - [-0.353756, 20], [0.368062, 20], [-1.721840, 20]], - index=exp_idx, columns=['value1', 'value2']) - - assert_frame_equal(df1.join(df2, how='right'), exp) - assert_frame_equal(df2.join(df1, how='left'), - exp[['value2', 'value1']]) + assert_frame_equal(df1.join(df2, how="left"), exp) + assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) + + exp_idx = pd.MultiIndex.from_product( + [["a", "b"], ["x", "y", "z"]], names=["first", "second"] + ) + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + ], + index=exp_idx, + columns=["value1", "value2"], + ) + + assert_frame_equal(df1.join(df2, how="right"), exp) + assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) def test_concat_named_keys(self): # GH 14252 - df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]}) - index = Index(['a', 'b'], name='baz') + df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) + index = Index(["a", "b"], name="baz") concatted_named_from_keys = pd.concat([df, df], keys=index) expected_named = pd.DataFrame( - {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]), - names=['baz', None])) + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), + ) assert_frame_equal(concatted_named_from_keys, expected_named) - index_no_name = Index(['a', 'b'], name=None) + index_no_name = Index(["a", "b"], name=None) concatted_named_from_names = pd.concat( - [df, df], keys=index_no_name, names=['baz']) + [df, df], keys=index_no_name, names=["baz"] + ) assert_frame_equal(concatted_named_from_names, expected_named) concatted_unnamed = pd.concat([df, df], keys=index_no_name) expected_unnamed = pd.DataFrame( - {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]), - names=[None, None])) + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), + ) assert_frame_equal(concatted_unnamed, expected_unnamed) def 
test_concat_axis_parameter(self): # GH 14369 - df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2)) - df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2)) + df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2)) + df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2)) # Index/row/0 DataFrame - expected_index = pd.DataFrame( - {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) + expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) - concatted_index = pd.concat([df1, df2], axis='index') + concatted_index = pd.concat([df1, df2], axis="index") assert_frame_equal(concatted_index, expected_index) - concatted_row = pd.concat([df1, df2], axis='rows') + concatted_row = pd.concat([df1, df2], axis="rows") assert_frame_equal(concatted_row, expected_index) concatted_0 = pd.concat([df1, df2], axis=0) @@ -464,9 +505,10 @@ def test_concat_axis_parameter(self): # Columns/1 DataFrame expected_columns = pd.DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A']) + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] + ) - concatted_columns = pd.concat([df1, df2], axis='columns') + concatted_columns = pd.concat([df1, df2], axis="columns") assert_frame_equal(concatted_columns, expected_columns) concatted_1 = pd.concat([df1, df2], axis=1) @@ -476,13 +518,12 @@ def test_concat_axis_parameter(self): series2 = pd.Series([0.3, 0.4]) # Index/row/0 Series - expected_index_series = pd.Series( - [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) + expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) - concatted_index_series = pd.concat([series1, series2], axis='index') + concatted_index_series = pd.concat([series1, series2], axis="index") assert_series_equal(concatted_index_series, expected_index_series) - concatted_row_series = pd.concat([series1, series2], axis='rows') + concatted_row_series = pd.concat([series1, series2], axis="rows") assert_series_equal(concatted_row_series, expected_index_series) concatted_0_series = pd.concat([series1, series2], axis=0) @@ -490,61 +531,65 @@ def test_concat_axis_parameter(self): # Columns/1 Series expected_columns_series = pd.DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]) + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] + ) - concatted_columns_series = pd.concat( - [series1, series2], axis='columns') + concatted_columns_series = pd.concat([series1, series2], axis="columns") assert_frame_equal(concatted_columns_series, expected_columns_series) concatted_1_series = pd.concat([series1, series2], axis=1) assert_frame_equal(concatted_1_series, expected_columns_series) # Testing ValueError - with pytest.raises(ValueError, match='No axis named'): - pd.concat([series1, series2], axis='something') + with pytest.raises(ValueError, match="No axis named"): + pd.concat([series1, series2], axis="something") def test_concat_numerical_names(self): # #15262 # #12223 - df = pd.DataFrame({'col': range(9)}, - dtype='int32', - index=(pd.MultiIndex - .from_product([['A0', 'A1', 'A2'], - ['B0', 'B1', 'B2']], - names=[1, 2]))) + df = pd.DataFrame( + {"col": range(9)}, + dtype="int32", + index=( + pd.MultiIndex.from_product( + [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2] + ) + ), + ) result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) - expected = pd.DataFrame({'col': [0, 1, 7, 8]}, - dtype='int32', - index=pd.MultiIndex.from_tuples([('A0', 'B0'), - ('A0', 'B1'), - ('A2', 'B1'), - ('A2', 'B2')], - names=[1, 2])) + expected = pd.DataFrame( + {"col": [0, 1, 7, 8]}, + dtype="int32", + 
index=pd.MultiIndex.from_tuples( + [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2] + ), + ) tm.assert_frame_equal(result, expected) def test_concat_astype_dup_col(self): # gh 23049 - df = pd.DataFrame([{'a': 'b'}]) + df = pd.DataFrame([{"a": "b"}]) df = pd.concat([df, df], axis=1) - result = df.astype('category') - expected = pd.DataFrame(np.array(["b", "b"]).reshape(1, 2), - columns=["a", "a"]).astype("category") + result = df.astype("category") + expected = pd.DataFrame( + np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] + ).astype("category") tm.assert_frame_equal(result, expected) class TestDataFrameCombineFirst: - def test_combine_first_mixed(self): - a = Series(['a', 'b'], index=range(2)) + a = Series(["a", "b"], index=range(2)) b = Series(range(2), index=range(2)) - f = DataFrame({'A': a, 'B': b}) + f = DataFrame({"A": a, "B": b}) - a = Series(['a', 'b'], index=range(5, 7)) + a = Series(["a", "b"], index=range(5, 7)) b = Series(range(2), index=range(5, 7)) - g = DataFrame({'A': a, 'B': b}) + g = DataFrame({"A": a, "B": b}) - exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]}, - index=[0, 1, 5, 6]) + exp = pd.DataFrame( + {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] + ) combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) @@ -556,42 +601,42 @@ def test_combine_first(self, float_frame): reordered_frame = float_frame.reindex(combined.index) assert_frame_equal(combined, reordered_frame) assert tm.equalContents(combined.columns, float_frame.columns) - assert_series_equal(combined['A'], reordered_frame['A']) + assert_series_equal(combined["A"], reordered_frame["A"]) # same index fcopy = float_frame.copy() - fcopy['A'] = 1 - del fcopy['C'] + fcopy["A"] = 1 + del fcopy["C"] fcopy2 = float_frame.copy() - fcopy2['B'] = 0 - del fcopy2['D'] + fcopy2["B"] = 0 + del fcopy2["D"] combined = fcopy.combine_first(fcopy2) - assert (combined['A'] == 1).all() - assert_series_equal(combined['B'], fcopy['B']) - assert_series_equal(combined['C'], fcopy2['C']) - assert_series_equal(combined['D'], fcopy['D']) + assert (combined["A"] == 1).all() + assert_series_equal(combined["B"], fcopy["B"]) + assert_series_equal(combined["C"], fcopy2["C"]) + assert_series_equal(combined["D"], fcopy["D"]) # overlap head, tail = reordered_frame[:10].copy(), reordered_frame - head['A'] = 1 + head["A"] = 1 combined = head.combine_first(tail) - assert (combined['A'][:10] == 1).all() + assert (combined["A"][:10] == 1).all() # reverse overlap - tail['A'][:10] = 0 + tail["A"][:10] = 0 combined = tail.combine_first(head) - assert (combined['A'][:10] == 0).all() + assert (combined["A"][:10] == 0).all() # no overlap f = float_frame[:10] g = float_frame[10:] combined = f.combine_first(g) - assert_series_equal(combined['A'].reindex(f.index), f['A']) - assert_series_equal(combined['A'].reindex(g.index), g['A']) + assert_series_equal(combined["A"].reindex(f.index), f["A"]) + assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases comb = float_frame.combine_first(DataFrame()) @@ -604,72 +649,74 @@ def test_combine_first(self, float_frame): assert "faz" in comb.index # #2525 - df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame(columns=['b']) + df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame(columns=["b"]) result = df.combine_first(df2) - assert 'b' in result + assert "b" in result def test_combine_first_mixed_bug(self): - idx = Index(['a', 'b', 'c', 'e']) - ser1 = Series([5.0, -9.0, 4.0, 100.], 
index=idx) - ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + idx = Index(["a", "b", "c", "e"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "e"], index=idx) ser3 = Series([12, 4, 5, 97], index=idx) - frame1 = DataFrame({"col0": ser1, - "col2": ser2, - "col3": ser3}) + frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) - idx = Index(['a', 'b', 'c', 'f']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + idx = Index(["a", "b", "c", "f"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "f"], index=idx) ser3 = Series([12, 4, 5, 97], index=idx) - frame2 = DataFrame({"col1": ser1, - "col2": ser2, - "col5": ser3}) + frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 # gh 3016 (same as in update) - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) result = df.combine_first(other) assert_frame_equal(result, df) - df.loc[0, 'A'] = np.nan + df.loc[0, "A"] = np.nan result = df.combine_first(other) - df.loc[0, 'A'] = 45 + df.loc[0, "A"] = 45 assert_frame_equal(result, df) # doc example - df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) + df1 = DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) - df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + df2 = DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) result = df1.combine_first(df2) - expected = DataFrame( - {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) + expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) assert_frame_equal(result, expected) # GH3552, return object dtype with bools df1 = DataFrame( - [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) - df2 = DataFrame( - [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) + [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] + ) + df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) result = df1.combine_first(df2)[2] expected = Series([True, True, False], name=2) assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) df1 = DataFrame({"a": [None, None, None]}) df2 = df1.combine_first(df0) assert_frame_equal(df2, df0) @@ -677,9 +724,9 @@ def test_combine_first_mixed_bug(self): df2 = df0.combine_first(df1) assert_frame_equal(df2, df0) - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) df2 = df1.combine_first(df0) result = df0.copy() @@ -691,164 +738,182 @@ def test_combine_first_mixed_bug(self): def test_combine_first_align_nan(self): # GH 7509 (not fixed) - dfa = 
pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]], - columns=['a', 'b']) - dfb = pd.DataFrame([[4], [5]], columns=['b']) - assert dfa['a'].dtype == 'datetime64[ns]' - assert dfa['b'].dtype == 'int64' + dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = pd.DataFrame([[4], [5]], columns=["b"]) + assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) - exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT], - 'b': [2., 5.]}, columns=['a', 'b']) + exp = pd.DataFrame( + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, + columns=["a", "b"], + ) tm.assert_frame_equal(res, exp) - assert res['a'].dtype == 'datetime64[ns]' + assert res["a"].dtype == "datetime64[ns]" # ToDo: this must be int64 - assert res['b'].dtype == 'float64' + assert res["b"].dtype == "float64" res = dfa.iloc[:0].combine_first(dfb) - exp = pd.DataFrame({'a': [np.nan, np.nan], - 'b': [4, 5]}, columns=['a', 'b']) + exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # ToDo: this must be datetime64 - assert res['a'].dtype == 'float64' + assert res["a"].dtype == "float64" # ToDo: this must be int64 - assert res['b'].dtype == 'int64' + assert res["b"].dtype == "int64" def test_combine_first_timezone(self): # see gh-7630 - data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') - df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], - data=data1, - index=pd.date_range('20140627', periods=1)) - data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') - df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], - data=data2, - index=pd.date_range('20140628', periods=1)) - res = df2[['UTCdatetime']].combine_first(df1) - exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01', - tz='UTC'), - pd.Timestamp('2012-12-12 12:12', - tz='UTC')], - 'abc': [pd.Timestamp('2010-01-01 01:01:00', - tz='UTC'), pd.NaT]}, - columns=['UTCdatetime', 'abc'], - index=pd.date_range('20140627', periods=2, - freq='D')) + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + df1 = pd.DataFrame( + columns=["UTCdatetime", "abc"], + data=data1, + index=pd.date_range("20140627", periods=1), + ) + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + df2 = pd.DataFrame( + columns=["UTCdatetime", "xyz"], + data=data2, + index=pd.date_range("20140628", periods=1), + ) + res = df2[["UTCdatetime"]].combine_first(df1) + exp = pd.DataFrame( + { + "UTCdatetime": [ + pd.Timestamp("2010-01-01 01:01", tz="UTC"), + pd.Timestamp("2012-12-12 12:12", tz="UTC"), + ], + "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], + }, + columns=["UTCdatetime", "abc"], + index=pd.date_range("20140627", periods=2, freq="D"), + ) tm.assert_frame_equal(res, exp) - assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]' - assert res['abc'].dtype == 'datetime64[ns, UTC]' + assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" + assert res["abc"].dtype == "datetime64[ns, UTC]" # see gh-10567 - dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['DATE'].dtype == 'datetime64[ns, UTC]' + assert 
res["DATE"].dtype == "datetime64[ns, UTC]" - dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', - '2011-01-04'], tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) - dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', - '2012-01-03'], tz='US/Eastern') - df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) + dts1 = pd.DatetimeIndex( + ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" + ) + df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex( + ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" + ) + df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT', - '2012-01-02', '2011-01-03', '2011-01-04'], - tz='US/Eastern') - exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.DatetimeIndex( + [ + "2011-01-01", + "2012-01-01", + "NaT", + "2012-01-02", + "2011-01-03", + "2011-01-04", + ], + tz="US/Eastern", + ) + exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz - dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-03', '2015-01-05') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05") + df2 = pd.DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]' + assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" - dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-01', '2015-01-03') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-01", "2015-01-03") + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) - exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-03')] - exp = pd.DataFrame({'DATE': exp_dts}) + exp_dts = [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-03"), + ] + exp = pd.DataFrame({"DATE": exp_dts}) tm.assert_frame_equal(res, exp) - assert res['DATE'].dtype == 'object' + assert res["DATE"].dtype == "object" def test_combine_first_timedelta(self): - data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day']) - df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7]) - data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day']) - df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5]) + data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) + df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) + df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT', - '11 day', '3 day', '4 day']) - exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.TimedeltaIndex( + ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] + ) + exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['TD'].dtype == 'timedelta64[ns]' + assert 
res["TD"].dtype == "timedelta64[ns]" def test_combine_first_period(self): - data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M') - df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7]) - data2 = pd.PeriodIndex(['2012-01-01', '2012-02', - '2012-03'], freq='M') - df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5]) + data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") + df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") + df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT', - '2012-02', '2011-03', '2011-04'], - freq='M') - exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.PeriodIndex( + ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" + ) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['P'].dtype == data1.dtype + assert res["P"].dtype == data1.dtype # different freq - dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02', - '2012-01-03'], freq='D') - df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5]) + dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") + df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = [pd.Period('2011-01', freq='M'), - pd.Period('2012-01-01', freq='D'), - pd.NaT, - pd.Period('2012-01-02', freq='D'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')] - exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = [ + pd.Period("2011-01", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.NaT, + pd.Period("2012-01-02", freq="D"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['P'].dtype == 'object' + assert res["P"].dtype == "object" def test_combine_first_int(self): # GH14687 - integer series that do no align exactly - df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64') - df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64') + df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['a'].dtype == 'int64' + assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 - df1 = pd.DataFrame({'isNum': [val]}) - df2 = pd.DataFrame({'isBool': [True]}) + df1 = pd.DataFrame({"isNum": [val]}) + df2 = pd.DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = pd.DataFrame({'isBool': [True], 'isNum': [val]}) + exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) @@ -856,35 +921,32 @@ def test_concat_datetime_datetime64_frame(self): # #2624 rows = [] rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), 'hi']) + rows.append([datetime(2010, 1, 2), "hi"]) - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) ind = date_range(start="2000/1/1", freq="D", periods=10) - df1 = DataFrame({'date': ind, 'test': range(10)}) + df1 = DataFrame({"date": ind, "test": range(10)}) # it works! 
pd.concat([df1, df2_obj]) class TestDataFrameUpdate: - def test_update_nan(self): # #15593 #15617 # test 1 - df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) - df2 = DataFrame({'A': [None, 2, 3]}) + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) expected = df1.copy() df1.update(df2, overwrite=False) tm.assert_frame_equal(df1, expected) # test 2 - df1 = DataFrame({'A': [1.0, None, 3], - 'B': date_range('2000', periods=3)}) - df2 = DataFrame({'A': [None, 2, 3]}) - expected = DataFrame({'A': [1.0, 2, 3], - 'B': date_range('2000', periods=3)}) + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) df1.update(df2, overwrite=False) tm.assert_frame_equal(df1, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 73a8720adb5ccc..a16ca7045cfddd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,32 +15,51 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, MultiIndex, RangeIndex, Series, Timedelta, - Timestamp, date_range, isna) + Categorical, + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, + Timedelta, + Timestamp, + date_range, + isna, +) import pandas.util.testing as tm -MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] -MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', - 'int32', 'int64'] +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] class TestDataFrameConstructors: - - @pytest.mark.parametrize('constructor', [ - lambda: DataFrame(), - lambda: DataFrame(None), - lambda: DataFrame({}), - lambda: DataFrame(()), - lambda: DataFrame([]), - lambda: DataFrame((x for x in [])), - lambda: DataFrame(range(0)), - lambda: DataFrame(data=None), - lambda: DataFrame(data={}), - lambda: DataFrame(data=()), - lambda: DataFrame(data=[]), - lambda: DataFrame(data=(x for x in [])), - lambda: DataFrame(data=range(0)), - ]) + @pytest.mark.parametrize( + "constructor", + [ + lambda: DataFrame(), + lambda: DataFrame(None), + lambda: DataFrame({}), + lambda: DataFrame(()), + lambda: DataFrame([]), + lambda: DataFrame((x for x in [])), + lambda: DataFrame(range(0)), + lambda: DataFrame(data=None), + lambda: DataFrame(data={}), + lambda: DataFrame(data=()), + lambda: DataFrame(data=[]), + lambda: DataFrame(data=(x for x in [])), + lambda: DataFrame(data=range(0)), + ], + ) def test_empty_constructor(self, constructor): expected = DataFrame() result = constructor() @@ -48,13 +67,15 @@ def test_empty_constructor(self, constructor): assert len(result.columns) == 0 tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('emptylike,expected_index,expected_columns', [ - ([[]], RangeIndex(1), RangeIndex(0)), - ([[], []], RangeIndex(2), RangeIndex(0)), - ([(x for x in [])], RangeIndex(1), RangeIndex(0)) - ]) - def test_emptylike_constructor( - self, emptylike, expected_index, expected_columns): + @pytest.mark.parametrize( + "emptylike,expected_index,expected_columns", + [ + ([[]], RangeIndex(1), RangeIndex(0)), + ([[], []], RangeIndex(2), RangeIndex(0)), + ([(x for x in [])], RangeIndex(1), RangeIndex(0)), + ], + ) + def test_emptylike_constructor(self, emptylike, expected_index, 
expected_columns): expected = DataFrame(index=expected_index, columns=expected_columns) result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) @@ -66,36 +87,33 @@ def test_constructor_mixed(self, float_string_frame): indexed_frame = DataFrame(data, index=index) # noqa unindexed_frame = DataFrame(data) # noqa - assert float_string_frame['foo'].dtype == np.object_ + assert float_string_frame["foo"].dtype == np.object_ def test_constructor_cast_failure(self): - foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) - assert foo['a'].dtype == object + foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) + assert foo["a"].dtype == object # GH 3010, constructing with odd arrays df = DataFrame(np.ones((4, 2))) # this is ok - df['foo'] = np.ones((4, 2)).tolist() + df["foo"] = np.ones((4, 2)).tolist() # this is not ok msg = "Wrong number of items passed 2, placement implies 1" with pytest.raises(ValueError, match=msg): - df['test'] = np.ones((4, 2)) + df["test"] = np.ones((4, 2)) # this is ok - df['foo2'] = np.ones((4, 2)).tolist() + df["foo2"] = np.ones((4, 2)).tolist() def test_constructor_dtype_copy(self): - orig_df = DataFrame({ - 'col1': [1.], - 'col2': [2.], - 'col3': [3.]}) + orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) new_df = pd.DataFrame(orig_df, dtype=float, copy=True) - new_df['col1'] = 200. - assert orig_df['col1'][0] == 1. + new_df["col1"] = 200.0 + assert orig_df["col1"][0] == 1.0 def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) @@ -108,10 +126,9 @@ def test_constructor_dtype_nocast_view(self): assert df.values[0, 0] == 97 def test_constructor_dtype_list_data(self): - df = DataFrame([[1, '2'], - [None, 'a']], dtype=object) + df = DataFrame([[1, "2"], [None, "a"]], dtype=object) assert df.loc[1, 0] is None - assert df.loc[0, 1] == '2' + assert df.loc[0, 1] == "2" def test_constructor_list_frames(self): # see gh-3243 @@ -122,20 +139,19 @@ def test_constructor_list_frames(self): assert isinstance(result.iloc[0, 0], DataFrame) def test_constructor_mixed_dtypes(self): - def _make_mixed_dtypes_df(typ, ad=None): - if typ == 'int': + if typ == "int": dtypes = MIXED_INT_DTYPES - arrays = [np.array(np.random.rand(10), dtype=d) - for d in dtypes] - elif typ == 'float': + arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes] + elif typ == "float": dtypes = MIXED_FLOAT_DTYPES - arrays = [np.array(np.random.randint( - 10, size=10), dtype=d) for d in dtypes] + arrays = [ + np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes + ] for d, a in zip(dtypes, arrays): - assert(a.dtype == d) + assert a.dtype == d if ad is None: ad = dict() ad.update({d: a for d, a in zip(dtypes, arrays)}) @@ -146,18 +162,18 @@ def _check_mixed_dtypes(df, dtypes=None): dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES for d in dtypes: if d in df: - assert(df.dtypes[d] == d) + assert df.dtypes[d] == d # mixed floating and integer coexist in the same frame - df = _make_mixed_dtypes_df('float') + df = _make_mixed_dtypes_df("float") _check_mixed_dtypes(df) # add lots of types - df = _make_mixed_dtypes_df('float', dict(A=1, B='foo', C='bar')) + df = _make_mixed_dtypes_df("float", dict(A=1, B="foo", C="bar")) _check_mixed_dtypes(df) # GH 622 - df = _make_mixed_dtypes_df('int') + df = _make_mixed_dtypes_df("int") _check_mixed_dtypes(df) def test_constructor_complex_dtypes(self): @@ -165,19 +181,19 @@ def test_constructor_complex_dtypes(self): a = np.random.rand(10).astype(np.complex64) b = np.random.rand(10).astype(np.complex128) - df = 
DataFrame({'a': a, 'b': b}) + df = DataFrame({"a": a, "b": b}) assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - df = DataFrame({'A': ['x', None]}, dtype=string_dtype) + df = DataFrame({"A": ["x", None]}, dtype=string_dtype) result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None - df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) + df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) def test_constructor_rec(self, float_frame): @@ -194,37 +210,46 @@ def test_constructor_rec(self, float_frame): tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] - df3 = DataFrame(rec, index=rng, columns=['C', 'B']) - expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B']) + df3 = DataFrame(rec, index=rng, columns=["C", "B"]) + expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"]) tm.assert_frame_equal(df3, expected) def test_constructor_bool(self): - df = DataFrame({0: np.ones(10, dtype=bool), - 1: np.zeros(10, dtype=bool)}) + df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)}) assert df.values.dtype == np.bool_ def test_constructor_overflow_int64(self): # see gh-14881 - values = np.array([2 ** 64 - i for i in range(1, 10)], - dtype=np.uint64) + values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) - result = DataFrame({'a': values}) - assert result['a'].dtype == np.uint64 + result = DataFrame({"a": values}) + assert result["a"].dtype == np.uint64 # see gh-2355 - data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), - (8921811264899370420, 45), (17019687244989530680, 270), - (9930107427299601010, 273)] - dtype = [('uid', 'u8'), ('score', 'u8')] + data_scores = [ + (6311132704823138710, 273), + (2685045978526272070, 23), + (8921811264899370420, 45), + (17019687244989530680, 270), + (9930107427299601010, 273), + ] + dtype = [("uid", "u8"), ("score", "u8")] data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) - assert df_crawls['uid'].dtype == np.uint64 - - @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), - np.array([2**65]), [2**64 + 1], - np.array([-2**63 - 4], dtype=object), - np.array([-2**64 - 1]), [-2**65 - 2]]) + assert df_crawls["uid"].dtype == np.uint64 + + @pytest.mark.parametrize( + "values", + [ + np.array([2 ** 64], dtype=object), + np.array([2 ** 65]), + [2 ** 64 + 1], + np.array([-2 ** 63 - 4], dtype=object), + np.array([-2 ** 64 - 1]), + [-2 ** 65 - 2], + ], + ) def test_constructor_int_overflow(self, values): # see gh-18584 value = values[0] @@ -235,10 +260,11 @@ def test_constructor_int_overflow(self, values): def test_constructor_ordereddict(self): import random + nitems = 100 nums = list(range(nitems)) random.shuffle(nums) - expected = ['A%d' % i for i in nums] + expected = ["A%d" % i for i in nums] df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) assert expected == list(df.columns) @@ -247,27 +273,29 @@ def test_constructor_dict(self): # test expects index shifted by 5 datetime_series_short = tm.makeTimeSeries(nper=30)[5:] - frame = DataFrame({'col1': datetime_series, - 'col2': datetime_series_short}) + frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short}) # col2 is padded with NaN assert len(datetime_series) == 30 assert len(datetime_series_short) == 25 - 
tm.assert_series_equal(frame['col1'], datetime_series.rename('col1')) + tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) - exp = pd.Series(np.concatenate([[np.nan] * 5, - datetime_series_short.values]), - index=datetime_series.index, name='col2') - tm.assert_series_equal(exp, frame['col2']) + exp = pd.Series( + np.concatenate([[np.nan] * 5, datetime_series_short.values]), + index=datetime_series.index, + name="col2", + ) + tm.assert_series_equal(exp, frame["col2"]) - frame = DataFrame({'col1': datetime_series, - 'col2': datetime_series_short}, - columns=['col2', 'col3', 'col4']) + frame = DataFrame( + {"col1": datetime_series, "col2": datetime_series_short}, + columns=["col2", "col3", "col4"], + ) assert len(frame) == len(datetime_series_short) - assert 'col1' not in frame - assert isna(frame['col3']).all() + assert "col1" not in frame + assert isna(frame["col3"]).all() # Corner cases assert len(DataFrame()) == 0 @@ -275,11 +303,11 @@ def test_constructor_dict(self): # mix dict and array, wrong size - no spec for which error should raise # first with pytest.raises(ValueError): - DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # Length-one dict micro-optimization - frame = DataFrame({'A': {'1': 1, '2': 2}}) - tm.assert_index_equal(frame.index, pd.Index(['1', '2'])) + frame = DataFrame({"A": {"1": 1, "2": 2}}) + tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) @@ -294,97 +322,95 @@ def test_constructor_dict(self): assert len(frame._series) == 3 # with dict of empty list and Series - frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) + frame = DataFrame({"A": [], "B": []}, columns=["A", "B"]) tm.assert_index_equal(frame.index, Index([], dtype=np.int64)) # GH 14381 # Dict with None value frame_none = DataFrame(dict(a=None), index=[0]) frame_none_list = DataFrame(dict(a=[None]), index=[0]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert frame_none.get_value(0, 'a') is None - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert frame_none_list.get_value(0, 'a') is None + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert frame_none.get_value(0, "a") is None + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert frame_none_list.get_value(0, "a") is None tm.assert_frame_equal(frame_none, frame_none_list) # GH10856 # dict with scalar values should raise error, even if columns passed - msg = 'If using all scalar values, you must pass an index' + msg = "If using all scalar values, you must pass an index" with pytest.raises(ValueError, match=msg): - DataFrame({'a': 0.7}) + DataFrame({"a": 0.7}) with pytest.raises(ValueError, match=msg): - DataFrame({'a': 0.7}, columns=['a']) + DataFrame({"a": 0.7}, columns=["a"]) - @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D']) + @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"]) def test_constructor_invalid_items_unused(self, scalar): # No error if invalid (scalar) value is in fact not used: - result = DataFrame({'a': scalar}, columns=['b']) - expected = DataFrame(columns=['b']) + result = DataFrame({"a": scalar}, columns=["b"]) + expected = DataFrame(columns=["b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def 
test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] - idx = ['a', value] + idx = ["a", value] values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} - result = DataFrame(data).sort_values(1).sort_values('a', axis=1) - expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=idx, columns=cols) + result = DataFrame(data).sort_values(1).sort_values("a", axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) tm.assert_frame_equal(result, expected) - result = DataFrame(data, index=idx).sort_values('a', axis=1) + result = DataFrame(data, index=idx).sort_values("a", axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [np.nan, None, float("nan")]) def test_constructor_dict_nan_tuple_key(self, value): # GH 18455 cols = Index([(11, 21), (value, 22), (13, value)]) - idx = Index([('a', value), (value, 2)]) + idx = Index([("a", value), (value, 2)]) values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} - result = (DataFrame(data) - .sort_values((11, 21)) - .sort_values(('a', value), axis=1)) - expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=idx, columns=cols) + result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) tm.assert_frame_equal(result, expected) - result = DataFrame(data, index=idx).sort_values(('a', value), axis=1) + result = DataFrame(data, index=idx).sort_values(("a", value), axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') + @pytest.mark.skipif(not PY36, reason="Insertion order for Python>=3.6") def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) # GH19018 # initialization ordering: by insertion order if python>= 3.6 - d = {'b': datetime_series_short, 'a': datetime_series} + d = {"b": datetime_series_short, "a": datetime_series} frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list('ba')) + expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) - @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') + @pytest.mark.skipif(PY36, reason="order by value for Python<3.6") def test_constructor_dict_order_by_values(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) # GH19018 # initialization ordering: by value if python<3.6 - d = {'b': datetime_series_short, 'a': datetime_series} + d = {"b": datetime_series_short, "a": datetime_series} frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list('ab')) + expected = DataFrame(data=d, columns=list("ab")) tm.assert_frame_equal(frame, expected) def test_constructor_multi_index(self): @@ -404,20 +430,21 @@ def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. 
with pytest.raises(ValueError, match=msg): - DataFrame(np.empty(0), columns=list('abc')) + DataFrame(np.empty(0), columns=list("abc")) msg = "Mixing dicts with non-Series may lead to ambiguous ordering." # mix dict and array, wrong size with pytest.raises(ValueError, match=msg): - DataFrame({'A': {'a': 'a', 'b': 'b'}, - 'B': ['a', 'b', 'c']}) + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # wrong size ndarray, GH 3105 msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.arange(12).reshape((4, 3)), - columns=['foo', 'bar', 'baz'], - index=pd.date_range('2000-01-01', periods=3)) + DataFrame( + np.arange(12).reshape((4, 3)), + columns=["foo", "bar", "baz"], + index=pd.date_range("2000-01-01", periods=3), + ) arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" @@ -430,37 +457,31 @@ def test_constructor_error_msgs(self): DataFrame(index=[0], columns=range(0, 4), data=arr) # higher dim raise exception - with pytest.raises(ValueError, match='Must pass 2-d input'): - DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels - msg = ("Shape of passed values " - r"is \(2, 3\), indices " - r"imply \(1, 3\)") + msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1]) + DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) - msg = ("Shape of passed values " - r"is \(2, 3\), indices " - r"imply \(2, 2\)") + msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2]) + DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) # gh-26429 msg = "2 columns passed, passed data had 10 columns" with pytest.raises(ValueError, match=msg): - DataFrame((range(10), range(10, 20)), columns=('ones', 'twos')) + DataFrame((range(10), range(10, 20)), columns=("ones", "twos")) - msg = ("If using all scalar " - "values, you must pass " - "an index") + msg = "If using all scalar " "values, you must pass " "an index" with pytest.raises(ValueError, match=msg): - DataFrame({'a': False, 'b': True}) + DataFrame({"a": False, "b": True}) def test_constructor_with_embedded_frames(self): # embedded data frames - df1 = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) + df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df2 = DataFrame([df1, df1 + 10]) df2.dtypes @@ -474,11 +495,12 @@ def test_constructor_with_embedded_frames(self): def test_constructor_subclass_dict(self, float_frame): # Test for passing dict subclass to constructor - data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), - 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} + data = { + "col1": tm.TestSubDict((x, 10.0 * x) for x in range(10)), + "col2": tm.TestSubDict((x, 20.0 * x) for x in range(10)), + } df = DataFrame(data) - refdf = DataFrame({col: dict(val.items()) - for col, val in data.items()}) + refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(data.items()) @@ -487,8 +509,9 @@ def test_constructor_subclass_dict(self, float_frame): # try with defaultdict from collections import defaultdict + data = {} 
- float_frame['B'][:10] = np.nan + float_frame["B"][:10] = np.nan for k, v in float_frame.items(): dct = defaultdict(dict) dct.update(v.to_dict()) @@ -497,48 +520,47 @@ def test_constructor_subclass_dict(self, float_frame): tm.assert_frame_equal(float_frame.sort_index(), frame) def test_constructor_dict_block(self): - expected = np.array([[4., 3., 2., 1.]]) - df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]}, - columns=['d', 'c', 'b', 'a']) + expected = np.array([[4.0, 3.0, 2.0, 1.0]]) + df = DataFrame( + {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]}, + columns=["d", "c", "b", "a"], + ) tm.assert_numpy_array_equal(df.values, expected) def test_constructor_dict_cast(self): # cast float tests - test_data = { - 'A': {'1': 1, '2': 2}, - 'B': {'1': '1', '2': '2', '3': '3'}, - } + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) assert len(frame) == 3 - assert frame['B'].dtype == np.float64 - assert frame['A'].dtype == np.float64 + assert frame["B"].dtype == np.float64 + assert frame["A"].dtype == np.float64 frame = DataFrame(test_data) assert len(frame) == 3 - assert frame['B'].dtype == np.object_ - assert frame['A'].dtype == np.float64 + assert frame["B"].dtype == np.object_ + assert frame["A"].dtype == np.float64 # can't cast to float test_data = { - 'A': dict(zip(range(20), tm.makeStringIndex(20))), - 'B': dict(zip(range(15), np.random.randn(15))) + "A": dict(zip(range(20), tm.makeStringIndex(20))), + "B": dict(zip(range(15), np.random.randn(15))), } frame = DataFrame(test_data, dtype=float) assert len(frame) == 20 - assert frame['A'].dtype == np.object_ - assert frame['B'].dtype == np.float64 + assert frame["A"].dtype == np.object_ + assert frame["B"].dtype == np.float64 def test_constructor_dict_dont_upcast(self): - d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} + d = {"Col1": {"Row1": "A String", "Row2": np.nan}} df = DataFrame(d) - assert isinstance(df['Col1']['Row2'], float) + assert isinstance(df["Col1"]["Row2"], float) - dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2]) + dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2]) assert isinstance(dm[1][1], int) def test_constructor_dict_of_tuples(self): # GH #1491 - data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + data = {"a": (1, 2, 3), "b": (4, 5, 6)} result = DataFrame(data) expected = DataFrame({k: list(v) for k, v in data.items()}) @@ -546,50 +568,56 @@ def test_constructor_dict_of_tuples(self): def test_constructor_dict_of_ranges(self): # GH 26356 - data = {'a': range(3), 'b': range(3, 6)} + data = {"a": range(3), "b": range(3, 6)} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_iterators(self): # GH 26349 - data = {'a': iter(range(3)), 'b': reversed(range(3))} + data = {"a": iter(range(3)), "b": reversed(range(3))} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [2, 1, 0]}) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_generators(self): # GH 26349 - data = {'a': (i for i in (range(3))), - 'b': (i for i in reversed(range(3)))} + data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [2, 1, 0]}) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, 
expected) def test_constructor_dict_multiindex(self): def check(result, expected): - return tm.assert_frame_equal(result, expected, check_dtype=True, - check_index_type=True, - check_column_type=True, - check_names=True) - d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, - ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, - ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} + return tm.assert_frame_equal( + result, + expected, + check_dtype=True, + check_index_type=True, + check_column_type=True, + check_names=True, + ) + + d = { + ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2}, + ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4}, + ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9}, + } _d = sorted(d.items()) df = DataFrame(d) expected = DataFrame( - [x[1] for x in _d], - index=MultiIndex.from_tuples([x[0] for x in _d])).T + [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) + ).T expected.index = MultiIndex.from_tuples(expected.index) check(df, expected) - d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111} - _d.insert(0, ('z', d['z'])) + d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111} + _d.insert(0, ("z", d["z"])) expected = DataFrame( - [x[1] for x in _d], - index=Index([x[0] for x in _d], tupleize_cols=False)).T + [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) + ).T expected.index = Index(expected.index, tupleize_cols=False) df = DataFrame(d) df = df.reindex(columns=expected.columns, index=expected.index) @@ -597,21 +625,24 @@ def check(result, expected): def test_constructor_dict_datetime64_index(self): # GH 10160 - dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] def create_data(constructor): - return {i: {constructor(s): 2 * i} - for i, s in enumerate(dates_as_str)} + return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) - data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) data_Timestamp = create_data(Timestamp) - expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}], - index=[Timestamp(dt) for dt in dates_as_str]) + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timestamp(dt) for dt in dates_as_str], + ) result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) @@ -625,18 +656,21 @@ def test_constructor_dict_timedelta64_index(self): td_as_int = [1, 2, 3, 4] def create_data(constructor): - return {i: {constructor(s): 2 * i} - for i, s in enumerate(td_as_int)} + return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} - data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D')) + data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D")) data_timedelta = create_data(lambda x: timedelta(days=x)) - data_Timedelta = create_data(lambda x: Timedelta(x, 'D')) + data_Timedelta = create_data(lambda x: Timedelta(x, "D")) - expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}], - 
index=[Timedelta(td, 'D') for td in td_as_int]) + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timedelta(td, "D") for td in td_as_int], + ) result_timedelta64 = DataFrame(data_timedelta64) result_timedelta = DataFrame(data_timedelta) @@ -647,27 +681,27 @@ def create_data(constructor): def test_constructor_period(self): # PeriodIndex - a = pd.PeriodIndex(['2012-01', 'NaT', '2012-04'], freq='M') - b = pd.PeriodIndex(['2012-02-01', '2012-03-01', 'NaT'], freq='D') - df = pd.DataFrame({'a': a, 'b': b}) - assert df['a'].dtype == a.dtype - assert df['b'].dtype == b.dtype + a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") + b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") + df = pd.DataFrame({"a": a, "b": b}) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype # list of periods - df = pd.DataFrame({'a': a.astype(object).tolist(), - 'b': b.astype(object).tolist()}) - assert df['a'].dtype == a.dtype - assert df['b'].dtype == b.dtype + df = pd.DataFrame( + {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} + ) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype def test_nested_dict_frame_constructor(self): - rng = pd.period_range('1/1/2000', periods=5) + rng = pd.period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) data = {} for col in df.columns: for row in df.index: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(col, {})[row] = df.get_value(row, col) result = DataFrame(data, columns=rng) @@ -676,8 +710,7 @@ def test_nested_dict_frame_constructor(self): data = {} for col in df.columns: for row in df.index: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(row, {})[col] = df.get_value(row, col) result = DataFrame(data, index=rng).T @@ -688,33 +721,31 @@ def _check_basic_constructor(self, empty): # objects mat = empty((2, 3), dtype=float) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 # 1-D input - frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3]) + frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3]) assert len(frame.index) == 3 assert len(frame.columns) == 1 # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.int64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # wrong size axis labels - msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(mat, columns=['A', 'B', 'C'], index=[1]) - msg = r'Shape of passed values is \(2, 3\), indices imply \(2, 2\)' + DataFrame(mat, columns=["A", "B", "C"], index=[1]) + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame(mat, columns=['A', 'B'], index=[1, 2]) + DataFrame(mat, columns=["A", "B"], index=[1, 2]) # higher dim raise exception - with pytest.raises(ValueError, match='Must pass 2-d input'): - 
DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'], - index=[1]) + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # automatic labeling frame = DataFrame(mat) @@ -724,7 +755,7 @@ def _check_basic_constructor(self, empty): frame = DataFrame(mat, index=[1, 2]) tm.assert_index_equal(frame.columns, pd.Int64Index(range(3))) - frame = DataFrame(mat, columns=['A', 'B', 'C']) + frame = DataFrame(mat, columns=["A", "B", "C"]) tm.assert_index_equal(frame.index, pd.Int64Index(range(2))) # 0-length axis @@ -737,7 +768,7 @@ def _check_basic_constructor(self, empty): def test_constructor_ndarray(self): self._check_basic_constructor(np.ones) - frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) + frame = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"]) assert len(frame) == 2 def test_constructor_maskedarray(self): @@ -747,134 +778,132 @@ def test_constructor_maskedarray(self): mat = ma.masked_all((2, 3), dtype=float) mat[0, 0] = 1.0 mat[1, 2] = 2.0 - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1.0 == frame['A'][1] - assert 2.0 == frame['C'][2] + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + assert 1.0 == frame["A"][1] + assert 2.0 == frame["C"][2] # what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) def test_constructor_maskedarray_nonfloat(self): # masked int promoted to float mat = ma.masked_all((2, 3), dtype=int) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.float64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64) assert frame.values.dtype == np.float64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1 == frame['A'][1] - assert 2 == frame['C'][2] + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"][1] + assert 2 == frame["C"][2] # masked np.datetime64 stays (use NaT as null) - mat = ma.masked_all((2, 3), dtype='M8[ns]') + mat = ma.masked_all((2, 3), dtype="M8[ns]") # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert isna(frame).values.all() # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.int64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1 == frame['A'].view('i8')[1] - assert 2 == frame['C'].view('i8')[2] + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"].view("i8")[1] + assert 2 == frame["C"].view("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, 
columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=object) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object) assert frame.values.dtype == object # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = True mat2[1, 2] = False - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert frame['A'][1] is True - assert frame['C'][2] is False + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert frame["A"][1] is True + assert frame["C"][2] is False def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) - expected = pd.DataFrame({ - 'A': [np.nan, np.nan], - 'B': [np.nan, np.nan]}, - columns=['A', 'B'], + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, + columns=["A", "B"], index=[1, 2], - dtype=float) + dtype=float, + ) tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) - expected = pd.DataFrame({ - 'A': [1.0, 1.0], - 'B': [1.0, 1.0]}, - columns=['A', 'B'], + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [1.0, 1.0], "B": [1.0, 1.0]}, + columns=["A", "B"], index=[1, 2], - dtype=float) + dtype=float, + ) tm.assert_frame_equal(result, expected) def test_constructor_maskedrecarray_dtype(self): # Ensure constructor honors dtype data = np.ma.array( - np.ma.zeros(5, dtype=[('date', '0 - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - floatname: np.array([1.] 
* 10, dtype=floatname), - intname: np.array([1] * 10, dtype=intname)}, - index=np.arange(10)) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + floatname: np.array([1.0] * 10, dtype=floatname), + intname: np.array([1] * 10, dtype=intname), + }, + index=np.arange(10), + ) result = df.dtypes - expected = Series([np.dtype('float64')] + - [np.dtype('int64')] + - [np.dtype('object')] + - [np.dtype('float64')] + - [np.dtype(intname)], - index=['a', 'b', 'c', floatname, intname]) + expected = Series( + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("object")] + + [np.dtype("float64")] + + [np.dtype(intname)], + index=["a", "b", "c", floatname, intname], + ) tm.assert_series_equal(result, expected) # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) - assert datetime_s.dtype == 'M8[ns]' + assert datetime_s.dtype == "M8[ns]" # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] - df = DataFrame(datetimes, columns=['datetimes']) - df['dates'] = dates + df = DataFrame(datetimes, columns=["datetimes"]) + df["dates"] = dates result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], - index=['datetimes', 'dates']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], + index=["datetimes", "dates"], + ) tm.assert_series_equal(result, expected) # GH 7594 # don't coerce tz-aware import pytz - tz = pytz.timezone('US/Eastern') + + tz = pytz.timezone("US/Eastern") dt = tz.localize(datetime(2012, 1, 1)) - df = DataFrame({'End Date': dt}, index=[0]) + df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt - tm.assert_series_equal(df.dtypes, Series( - {'End Date': 'datetime64[ns, US/Eastern]'})) + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) - df = DataFrame([{'End Date': dt}]) + df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt - tm.assert_series_equal(df.dtypes, Series( - {'End Date': 'datetime64[ns, US/Eastern]'})) + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) # tz-aware (UTC and other tz's) # GH 8411 - dr = date_range('20130101', periods=3) - df = DataFrame({'value': dr}) + dr = date_range("20130101", periods=3) + df = DataFrame({"value": dr}) assert df.iat[0, 0].tz is None - dr = date_range('20130101', periods=3, tz='UTC') - df = DataFrame({'value': dr}) - assert str(df.iat[0, 0].tz) == 'UTC' - dr = date_range('20130101', periods=3, tz='US/Eastern') - df = DataFrame({'value': dr}) - assert str(df.iat[0, 0].tz) == 'US/Eastern' + dr = date_range("20130101", periods=3, tz="UTC") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "US/Eastern" # GH 7822 # preserver an index with a tz on dict construction - i = date_range('1/1/2011', periods=5, freq='10s', tz='US/Eastern') + i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") - expected = DataFrame( - {'a': i.to_series(keep_tz=True).reset_index(drop=True)}) + expected = DataFrame({"a": i.to_series(keep_tz=True).reset_index(drop=True)}) df = DataFrame() - df['a'] = i + df["a"] = i tm.assert_frame_equal(df, expected) - df = DataFrame({'a': i}) + df = DataFrame({"a": i}) tm.assert_frame_equal(df, expected) # multiples - i_no_tz = 
date_range('1/1/2011', periods=5, freq='10s') - df = DataFrame({'a': i, 'b': i_no_tz}) - expected = DataFrame({'a': i.to_series(keep_tz=True) - .reset_index(drop=True), 'b': i_no_tz}) + i_no_tz = date_range("1/1/2011", periods=5, freq="10s") + df = DataFrame({"a": i, "b": i_no_tz}) + expected = DataFrame( + {"a": i.to_series(keep_tz=True).reset_index(drop=True), "b": i_no_tz} + ) tm.assert_frame_equal(df, expected) def test_constructor_datetimes_with_nulls(self): # gh-15869 - for arr in [np.array([None, None, None, None, - datetime.now(), None]), - np.array([None, None, datetime.now(), None])]: + for arr in [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + ]: result = DataFrame(arr).dtypes - expected = Series([np.dtype('datetime64[ns]')]) + expected = Series([np.dtype("datetime64[ns]")]) tm.assert_series_equal(result, expected) def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes - expected = Series([np.dtype('int64')] * 5) + expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) - df = DataFrame([np.array(np.arange(5), dtype='int32') - for x in range(5)]) + df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes - expected = Series([np.dtype('int64')] * 5) + expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) # overflow issue? (we always expecte int64 upcasting here) - df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]}) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": [2 ** 31, 2 ** 31 + 1]}) + assert df.dtypes.iloc[0] == np.dtype("int64") # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) - assert df.dtypes.iloc[0] == np.dtype('int64') + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame([1., 2.]) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame([1.0, 2.0]) + assert df.dtypes.iloc[0] == np.dtype("float64") - df = DataFrame({'a': [1, 2]}) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": [1, 2]}) + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame({'a': [1., 2.]}) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame({"a": [1.0, 2.0]}) + assert df.dtypes.iloc[0] == np.dtype("float64") - df = DataFrame({'a': 1}, index=range(3)) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": 1}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame({'a': 1.}, index=range(3)) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame({"a": 1.0}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("float64") # with object list - df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3], - 'c': list('abcd'), - 'd': [datetime(2000, 1, 1) for i in range(4)], - 'e': [1., 2, 4., 7]}) + df = DataFrame( + { + "a": [1, 2, 4, 7], + "b": [1.2, 2.3, 5.1, 6.3], + "c": list("abcd"), + "d": [datetime(2000, 1, 1) for i in range(4)], + "e": [1.0, 2, 4.0, 7], + } + ) result = df.dtypes - expected = Series([np.dtype('int64'), - np.dtype('float64'), - np.dtype('object'), - np.dtype('datetime64[ns]'), - np.dtype('float64')], - index=list('abcde')) + expected = Series( + [ + np.dtype("int64"), + np.dtype("float64"), + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("float64"), + ], + index=list("abcde"), + ) tm.assert_series_equal(result, expected) def 
test_constructor_frame_copy(self, float_frame): cop = DataFrame(float_frame, copy=True) - cop['A'] = 5 - assert (cop['A'] == 5).all() - assert not (float_frame['A'] == 5).all() + cop["A"] = 5 + assert (cop["A"] == 5).all() + assert not (float_frame["A"] == 5).all() def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) @@ -1759,10 +1846,10 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({'A': series['A']}) - df['A'][:] = 5 + df = DataFrame({"A": series["A"]}) + df["A"][:] = 5 - assert not (series['A'] == 5).all() + assert not (series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 @@ -1776,19 +1863,19 @@ def check(df): # No NaN found -> error if len(indexer) == 0: - msg = ("cannot do label indexing on" - r" " - r" with these indexers \[nan\] of ") + msg = ( + "cannot do label indexing on" + r" " + r" with these indexers \[nan\] of " + ) with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] # single nan should result in Series elif len(indexer) == 1: - tm.assert_series_equal(df.iloc[:, indexer[0]], - df.loc[:, np.nan]) + tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) # multiple nans should result in DataFrame else: - tm.assert_frame_equal(df.iloc[:, indexer], - df.loc[:, np.nan]) + tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan]) df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) check(df) @@ -1796,104 +1883,107 @@ def check(df): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]) check(df) - df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], - columns=[np.nan, 1.1, 2.2, np.nan]) + df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) check(df) - df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], - columns=[np.nan, 1.1, 2.2, np.nan]) + df = DataFrame( + [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan] + ) check(df) # GH 21428 (non-unique columns) - df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], - columns=[np.nan, 1, 2, 2]) + df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]) check(df) def test_constructor_lists_to_object_dtype(self): # from #1074 - d = DataFrame({'a': [np.nan, False]}) - assert d['a'].dtype == np.object_ - assert not d['a'][1] + d = DataFrame({"a": [np.nan, False]}) + assert d["a"].dtype == np.object_ + assert not d["a"][1] def test_constructor_categorical(self): # GH8626 # dict creation - df = DataFrame({'A': list('abc')}, dtype='category') - expected = Series(list('abc'), dtype='category', name='A') - tm.assert_series_equal(df['A'], expected) + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) # to_frame - s = Series(list('abc'), dtype='category') + s = Series(list("abc"), dtype="category") result = s.to_frame() - expected = Series(list('abc'), dtype='category', name=0) + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(result[0], expected) - result = s.to_frame(name='foo') - expected = Series(list('abc'), dtype='category', name='foo') - tm.assert_series_equal(result['foo'], expected) + result = s.to_frame(name="foo") + expected = Series(list("abc"), dtype="category", name="foo") + tm.assert_series_equal(result["foo"], expected) # list-like creation - df = DataFrame(list('abc'), dtype='category') - expected = Series(list('abc'), dtype='category', name=0) + 
df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(df[0], expected) # ndim != 1 - df = DataFrame([Categorical(list('abc'))]) - expected = DataFrame({0: Series(list('abc'), dtype='category')}) + df = DataFrame([Categorical(list("abc"))]) + expected = DataFrame({0: Series(list("abc"), dtype="category")}) tm.assert_frame_equal(df, expected) - df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))]) - expected = DataFrame({0: Series(list('abc'), dtype='category'), - 1: Series(list('abd'), dtype='category')}, - columns=[0, 1]) + df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) + expected = DataFrame( + { + 0: Series(list("abc"), dtype="category"), + 1: Series(list("abd"), dtype="category"), + }, + columns=[0, 1], + ) tm.assert_frame_equal(df, expected) # mixed - df = DataFrame([Categorical(list('abc')), list('def')]) - expected = DataFrame({0: Series(list('abc'), dtype='category'), - 1: list('def')}, columns=[0, 1]) + df = DataFrame([Categorical(list("abc")), list("def")]) + expected = DataFrame( + {0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1] + ) tm.assert_frame_equal(df, expected) # invalid (shape) msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame([Categorical(list('abc')), - Categorical(list('abdefg'))]) + DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) # ndim > 1 msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): - Categorical(np.array([list('abcd')])) + Categorical(np.array([list("abcd")])) def test_constructor_categorical_series(self): items = [1, 2, 3, 1] - exp = Series(items).astype('category') - res = Series(items, dtype='category') + exp = Series(items).astype("category") + res = Series(items, dtype="category") tm.assert_series_equal(res, exp) items = ["a", "b", "c", "a"] - exp = Series(items).astype('category') - res = Series(items, dtype='category') + exp = Series(items).astype("category") + res = Series(items, dtype="category") tm.assert_series_equal(res, exp) # insert into frame with different index # GH 8076 - index = date_range('20000101', periods=3) - expected = Series(Categorical(values=[np.nan, np.nan, np.nan], - categories=['a', 'b', 'c'])) + index = date_range("20000101", periods=3) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) expected.index = index - expected = DataFrame({'x': expected}) - df = DataFrame( - {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index) + expected = DataFrame({"x": expected}) + df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index) tm.assert_frame_equal(df, expected) def test_from_records_to_records(self): # from numpy documentation - arr = np.zeros((2,), dtype=('i4,f4,a10')) - arr[:] = [(1, 2., 'Hello'), (2, 3., "World")] + arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa @@ -1907,11 +1997,11 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) - indexed_frame = 
DataFrame.from_records(arr, index='f1') + indexed_frame = DataFrame.from_records(arr, index="f1") # what to do? records = indexed_frame.to_records() @@ -1919,46 +2009,51 @@ def test_from_records_to_records(self): records = indexed_frame.to_records(index=False) assert len(records.dtype.names) == 2 - assert 'index' not in records.dtype.names + assert "index" not in records.dtype.names def test_from_records_nones(self): - tuples = [(1, 2, None, 3), - (1, 2, None, 3), - (None, 2, 5, 3)] + tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] - df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd']) - assert np.isnan(df['c'][0]) + df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"]) + assert np.isnan(df["c"][0]) def test_from_records_iterator(self): - arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6), - (7., 7., 8, 8)], - dtype=[('x', np.float64), ('u', np.float32), - ('y', np.int64), ('z', np.int32)]) + arr = np.array( + [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)], + dtype=[ + ("x", np.float64), + ("u", np.float32), + ("y", np.int64), + ("z", np.int32), + ], + ) df = DataFrame.from_records(iter(arr), nrows=2) - xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64), - 'u': np.array([1.0, 3.0], dtype=np.float32), - 'y': np.array([2, 4], dtype=np.int64), - 'z': np.array([2, 4], dtype=np.int32)}) + xp = DataFrame( + { + "x": np.array([1.0, 3.0], dtype=np.float64), + "u": np.array([1.0, 3.0], dtype=np.float32), + "y": np.array([2, 4], dtype=np.int64), + "z": np.array([2, 4], dtype=np.int32), + } + ) tm.assert_frame_equal(df.reindex_like(xp), xp) # no dtypes specified here, so just compare with the default - arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)] - df = DataFrame.from_records(iter(arr), columns=['x', 'y'], - nrows=2) - tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']), - check_dtype=False) + arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)] + df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2) + tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False) def test_from_records_tuples_generator(self): def tuple_generator(length): for i in range(length): - letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield (i, letters[i % len(letters)], i / length) - columns_names = ['Integer', 'String', 'Float'] - columns = [[i[j] for i in tuple_generator( - 10)] for j in range(len(columns_names))] - data = {'Integer': columns[0], - 'String': columns[1], 'Float': columns[2]} + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in tuple_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} expected = DataFrame(data, columns=columns_names) generator = tuple_generator(10) @@ -1968,14 +2063,14 @@ def tuple_generator(length): def test_from_records_lists_generator(self): def list_generator(length): for i in range(length): - letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield [i, letters[i % len(letters)], i / length] - columns_names = ['Integer', 'String', 'Float'] - columns = [[i[j] for i in list_generator( - 10)] for j in range(len(columns_names))] - data = {'Integer': columns[0], - 'String': columns[1], 'Float': columns[2]} + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in list_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": 
columns[2]} expected = DataFrame(data, columns=columns_names) generator = list_generator(10) @@ -1983,111 +2078,112 @@ def list_generator(length): tm.assert_frame_equal(result, expected) def test_from_records_columns_not_modified(self): - tuples = [(1, 2, 3), - (1, 2, 3), - (2, 5, 3)] + tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] - columns = ['a', 'b', 'c'] + columns = ["a", "b", "c"] original_columns = list(columns) - df = DataFrame.from_records(tuples, columns=columns, index='a') # noqa + df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa assert columns == original_columns def test_from_records_decimal(self): from decimal import Decimal - tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)] + tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)] - df = DataFrame.from_records(tuples, columns=['a']) - assert df['a'].dtype == object + df = DataFrame.from_records(tuples, columns=["a"]) + assert df["a"].dtype == object - df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True) - assert df['a'].dtype == np.float64 - assert np.isnan(df['a'].values[-1]) + df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True) + assert df["a"].dtype == np.float64 + assert np.isnan(df["a"].values[-1]) def test_from_records_duplicates(self): - result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], - columns=['a', 'b', 'a']) + result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) - expected = DataFrame([(1, 2, 3), (4, 5, 6)], - columns=['a', 'b', 'a']) + expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) tm.assert_frame_equal(result, expected) def test_from_records_set_index_name(self): def create_dict(order_id): - return {'order_id': order_id, 'quantity': np.random.randint(1, 10), - 'price': np.random.randint(1, 10)} + return { + "order_id": order_id, + "quantity": np.random.randint(1, 10), + "price": np.random.randint(1, 10), + } + documents = [create_dict(i) for i in range(10)] # demo missing data - documents.append({'order_id': 10, 'quantity': 5}) + documents.append({"order_id": 10, "quantity": 5}) - result = DataFrame.from_records(documents, index='order_id') - assert result.index.name == 'order_id' + result = DataFrame.from_records(documents, index="order_id") + assert result.index.name == "order_id" # MultiIndex - result = DataFrame.from_records(documents, - index=['order_id', 'quantity']) - assert result.index.names == ('order_id', 'quantity') + result = DataFrame.from_records(documents, index=["order_id", "quantity"]) + assert result.index.names == ("order_id", "quantity") def test_from_records_misc_brokenness(self): # #2179 - data = {1: ['foo'], 2: ['bar']} + data = {1: ["foo"], 2: ["bar"]} - result = DataFrame.from_records(data, columns=['a', 'b']) - exp = DataFrame(data, columns=['a', 'b']) + result = DataFrame.from_records(data, columns=["a", "b"]) + exp = DataFrame(data, columns=["a", "b"]) tm.assert_frame_equal(result, exp) # overlap in index/index_names - data = {'a': [1, 2, 3], 'b': [4, 5, 6]} + data = {"a": [1, 2, 3], "b": [4, 5, 6]} - result = DataFrame.from_records(data, index=['a', 'b', 'c']) - exp = DataFrame(data, index=['a', 'b', 'c']) + result = DataFrame.from_records(data, index=["a", "b", "c"]) + exp = DataFrame(data, index=["a", "b", "c"]) tm.assert_frame_equal(result, exp) # GH 2623 rows = [] rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + 
rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], - index=['date', 'test']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"] + ) tm.assert_series_equal(result, expected) rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('int64')], - index=['date', 'test']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"] + ) tm.assert_series_equal(result, expected) def test_from_records_empty(self): # 3562 - result = DataFrame.from_records([], columns=['a', 'b', 'c']) - expected = DataFrame(columns=['a', 'b', 'c']) + result = DataFrame.from_records([], columns=["a", "b", "c"]) + expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) - result = DataFrame.from_records([], columns=['a', 'b', 'b']) - expected = DataFrame(columns=['a', 'b', 'b']) + result = DataFrame.from_records([], columns=["a", "b", "b"]) + expected = DataFrame(columns=["a", "b", "b"]) tm.assert_frame_equal(result, expected) def test_from_records_empty_with_nonempty_fields_gh3682(self): - a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)]) - df = DataFrame.from_records(a, index='id') - tm.assert_index_equal(df.index, Index([1], name='id')) - assert df.index.name == 'id' - tm.assert_index_equal(df.columns, Index(['value'])) + a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(a, index="id") + tm.assert_index_equal(df.index, Index([1], name="id")) + assert df.index.name == "id" + tm.assert_index_equal(df.columns, Index(["value"])) - b = np.array([], dtype=[('id', np.int64), ('value', np.int64)]) - df = DataFrame.from_records(b, index='id') - tm.assert_index_equal(df.index, Index([], name='id')) - assert df.index.name == 'id' + b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(b, index="id") + tm.assert_index_equal(df.index, Index([], name="id")) + assert df.index.name == "id" def test_from_records_with_datetimes(self): @@ -2098,10 +2194,10 @@ def test_from_records_with_datetimes(self): # construction with a null in a recarray # GH 6140 - expected = DataFrame({'EXPIRY': [datetime(2005, 3, 1, 0, 0), None]}) + expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [('EXPIRY', '\n' - 'To: \n' - 'Subject: Test message\n' - '\n' - 'Body would go here\n') + headers = Parser().parsestr( + "From: \n" + "To: \n" + "Subject: Test message\n" + "\n" + "Body would go here\n" + ) frame = DataFrame.from_records([headers]) - all(x in frame for x in ['Type', 'Subject', 'From']) + all(x in frame for x in ["Type", "Subject", "From"]) def test_to_records_floats(self): df = DataFrame(np.random.rand(10, 10)) @@ -132,25 +136,24 @@ def test_to_records_floats(self): def test_to_records_index_name(self): df = DataFrame(np.random.randn(3, 3)) - df.index.name = 'X' + df.index.name = "X" rs = df.to_records() - assert 'X' in rs.dtype.fields + assert "X" in rs.dtype.fields df = DataFrame(np.random.randn(3, 3)) rs = df.to_records() - 
assert 'index' in rs.dtype.fields + assert "index" in rs.dtype.fields - df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) - df.index.names = ['A', None] + df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + df.index.names = ["A", None] rs = df.to_records() - assert 'level_0' in rs.dtype.fields + assert "level_0" in rs.dtype.fields def test_to_records_with_unicode_index(self): # GH13172 # unicode_literals conflict with to_records - result = DataFrame([{'a': 'x', 'b': 'y'}]).set_index('a') \ - .to_records() - expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) + result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() + expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) tm.assert_almost_equal(result, expected) def test_to_records_with_unicode_column_names(self): @@ -163,8 +166,7 @@ def test_to_records_with_unicode_column_names(self): # to be specified using dictionary instead of list of tuples. expected = np.rec.array( [(0, 1.0)], - dtype={"names": ["index", "accented_name_é"], - "formats": ['=i8', '=f8']} + dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]}, ) tm.assert_almost_equal(result, expected) @@ -173,117 +175,154 @@ def test_to_records_with_categorical(self): # GH8626 # dict creation - df = DataFrame({'A': list('abc')}, dtype='category') - expected = Series(list('abc'), dtype='category', name='A') - tm.assert_series_equal(df['A'], expected) + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) # list-like creation - df = DataFrame(list('abc'), dtype='category') - expected = Series(list('abc'), dtype='category', name=0) + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(df[0], expected) # to record array # this coerces result = df.to_records() - expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')], - dtype=[('index', '=i8'), ('0', 'O')]) + expected = np.rec.array( + [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")] + ) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("kwargs,expected", [ - # No dtypes --> default to array dtypes. - (dict(), - np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], - dtype=[("index", " default to array dtypes. 
+ ( + dict(), + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[("index", " 0 + float_string_frame["bool"] = float_string_frame["A"] > 0 result = float_string_frame.dtypes - expected = Series({k: v.dtype for k, v in float_string_frame.items()}, - index=result.index) + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) assert_series_equal(result, expected) # compat, GH 8722 - with option_context('use_inf_as_na', True): + with option_context("use_inf_as_na", True): df = DataFrame([[1]]) result = df.dtypes - assert_series_equal(result, Series({0: np.dtype('int64')})) + assert_series_equal(result, Series({0: np.dtype("int64")})) def test_ftypes(self, mixed_float_frame): frame = mixed_float_frame - expected = Series(dict(A='float32:dense', - B='float32:dense', - C='float16:dense', - D='float64:dense')).sort_values() + expected = Series( + dict( + A="float32:dense", + B="float32:dense", + C="float16:dense", + D="float64:dense", + ) + ).sort_values() # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): @@ -433,74 +493,78 @@ def test_ftypes(self, mixed_float_frame): def test_astype_float(self, float_frame): casted = float_frame.astype(int) - expected = DataFrame(float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) casted = float_frame.astype(np.int32) - expected = DataFrame(float_frame.values.astype(np.int32), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) - float_frame['foo'] = '5' + float_frame["foo"] = "5" casted = float_frame.astype(int) - expected = DataFrame(float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) def test_astype_mixed_float(self, mixed_float_frame): # mixed casting - casted = mixed_float_frame.reindex( - columns=['A', 'B']).astype('float32') - _check_cast(casted, 'float32') + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") + _check_cast(casted, "float32") - casted = mixed_float_frame.reindex( - columns=['A', 'B']).astype('float16') - _check_cast(casted, 'float16') + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") + _check_cast(casted, "float16") def test_astype_mixed_type(self, mixed_type_frame): # mixed casting mn = mixed_type_frame._get_numeric_data().copy() - mn['little_float'] = np.array(12345., dtype='float16') - mn['big_float'] = np.array(123456789101112., dtype='float64') + mn["little_float"] = np.array(12345.0, dtype="float16") + mn["big_float"] = np.array(123456789101112.0, dtype="float64") - casted = mn.astype('float64') - _check_cast(casted, 'float64') + casted = mn.astype("float64") + _check_cast(casted, "float64") - casted = mn.astype('int64') - _check_cast(casted, 'int64') + casted = mn.astype("int64") + _check_cast(casted, "int64") - casted = mn.reindex(columns=['little_float']).astype('float16') - _check_cast(casted, 'float16') + casted = mn.reindex(columns=["little_float"]).astype("float16") + _check_cast(casted, "float16") - casted = mn.astype('float32') - 
_check_cast(casted, 'float32') + casted = mn.astype("float32") + _check_cast(casted, "float32") - casted = mn.astype('int32') - _check_cast(casted, 'int32') + casted = mn.astype("int32") + _check_cast(casted, "int32") # to object - casted = mn.astype('O') - _check_cast(casted, 'object') + casted = mn.astype("O") + _check_cast(casted, "object") def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) - df['string'] = 'foo' - casted = df.astype(int, errors='ignore') + df["string"] = "foo" + casted = df.astype(int, errors="ignore") - expected['string'] = 'foo' + expected["string"] = "foo" assert_frame_equal(casted, expected) df = float_frame.copy() expected = float_frame.astype(np.int32) - df['string'] = 'foo' - casted = df.astype(np.int32, errors='ignore') + df["string"] = "foo" + casted = df.astype(np.int32, errors="ignore") - expected['string'] = 'foo' + expected["string"] = "foo" assert_frame_equal(casted, expected) def test_astype_with_view_float(self, float_frame): @@ -515,7 +579,7 @@ def test_astype_with_view_float(self, float_frame): def test_astype_with_view_mixed_float(self, mixed_float_frame): - tf = mixed_float_frame.reindex(columns=['A', 'B', 'C']) + tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa @@ -545,16 +609,20 @@ def test_astype_str(self): # Datetime-like result = df.astype(str) - expected = DataFrame({ - "a": list(map(str, - map(lambda x: Timestamp(x)._date_repr, a._values))), - "b": list(map(str, map(Timestamp, b._values))), - "c": list(map(str, - map(lambda x: Timedelta(x)._repr_base(format="all"), - c._values))), - "d": list(map(str, d._values)), - "e": list(map(str, e._values)), - }) + expected = DataFrame( + { + "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), + "b": list(map(str, map(Timestamp, b._values))), + "c": list( + map( + str, + map(lambda x: Timedelta(x)._repr_base(format="all"), c._values), + ) + ), + "d": list(map(str, d._values)), + "e": list(map(str, e._values)), + } + ) assert_frame_equal(result, expected) @@ -568,54 +636,59 @@ def test_astype_str_float(self): # < 1.14 truncates # >= 1.14 preserves the full repr - val = ("1.12345678901" if _np_version_under1p14 - else "1.1234567890123457") + val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" expected = DataFrame([val]) assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, dtype_class): # GH7271 & GH16717 - a = Series(date_range('2010-01-04', periods=5)) + a = Series(date_range("2010-01-04", periods=5)) b = Series(range(5)) c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) - d = Series(['1.0', '2', '3.14', '4', '5.4']) - df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + d = Series(["1.0", "2", "3.14", "4", "5.4"]) + df = DataFrame({"a": a, "b": b, "c": c, "d": d}) original = df.copy(deep=True) # change type of a subset of columns - dt1 = dtype_class({'b': 'str', 'd': 'float32'}) + dt1 = dtype_class({"b": "str", "d": "float32"}) result = df.astype(dt1) - expected = DataFrame({ - 'a': a, - 'b': Series(['0', '1', '2', '3', '4']), - 'c': c, - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) + expected = DataFrame( + { + "a": a, + "b": Series(["0", "1", "2", "3", "4"]), + "c": c, + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), + } + ) assert_frame_equal(result, expected) assert_frame_equal(df, original) - dt2 = dtype_class({'b': np.float32, 'c': 
'float32', 'd': np.float64}) + dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) result = df.astype(dt2) - expected = DataFrame({ - 'a': a, - 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), - 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + expected = DataFrame( + { + "a": a, + "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), + "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), + } + ) assert_frame_equal(result, expected) assert_frame_equal(df, original) # change all columns - dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str}) - assert_frame_equal(df.astype(dt3), - df.astype(str)) + dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) + assert_frame_equal(df.astype(dt3), df.astype(str)) assert_frame_equal(df, original) # error should be raised when using something other than column labels # in the keys of the dtype dict - dt4 = dtype_class({'b': str, 2: str}) - dt5 = dtype_class({'e': str}) - msg = ("Only a column name can be used for the key in a dtype mappings" - " argument") + dt4 = dtype_class({"b": str, 2: str}) + dt5 = dtype_class({"e": str}) + msg = ( + "Only a column name can be used for the key in a dtype mappings" " argument" + ) with pytest.raises(KeyError, match=msg): df.astype(dt4) with pytest.raises(KeyError, match=msg): @@ -638,117 +711,125 @@ def test_astype_dict_like(self, dtype_class): assert_frame_equal(df, original) def test_astype_duplicate_col(self): - a1 = Series([1, 2, 3, 4, 5], name='a') - b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b') - a2 = Series([0, 1, 2, 3, 4], name='a') + a1 = Series([1, 2, 3, 4, 5], name="a") + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") + a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) result = df.astype(str) - a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a') - b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str, - name='b') - a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a') + a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) assert_frame_equal(result, expected) - result = df.astype({'a': 'str'}) + result = df.astype({"a": "str"}) expected = concat([a1_str, b, a2_str], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - 'category', - CategoricalDtype(), - CategoricalDtype(ordered=True), - CategoricalDtype(ordered=False), - CategoricalDtype(categories=list('abcdef')), - CategoricalDtype(categories=list('edba'), ordered=False), - CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list("abcdef")), + CategoricalDtype(categories=list("edba"), ordered=False), + CategoricalDtype(categories=list("edcb"), ordered=True), + ], + ids=repr, + ) def test_astype_categorical(self, dtype): # GH 18099 - d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} + d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, 
expected) - @pytest.mark.parametrize("cls", [ - pd.api.types.CategoricalDtype, - pd.api.types.DatetimeTZDtype, - pd.api.types.IntervalDtype - ]) + @pytest.mark.parametrize( + "cls", + [ + pd.api.types.CategoricalDtype, + pd.api.types.DatetimeTZDtype, + pd.api.types.IntervalDtype, + ], + ) def test_astype_categoricaldtype_class_raises(self, cls): - df = DataFrame({"A": ['a', 'a', 'b', 'c']}) + df = DataFrame({"A": ["a", "a", "b", "c"]}) xpr = "Expected an instance of {}".format(cls.__name__) with pytest.raises(TypeError, match=xpr): df.astype({"A": cls}) with pytest.raises(TypeError, match=xpr): - df['A'].astype(cls) + df["A"].astype(cls) - @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16']) + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes(self, dtype): # GH 22578 - df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b']) - - expected1 = pd.DataFrame({'a': integer_array([1, 3, 5], - dtype=dtype), - 'b': integer_array([2, 4, 6], - dtype=dtype)}) + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + + expected1 = pd.DataFrame( + { + "a": integer_array([1, 3, 5], dtype=dtype), + "b": integer_array([2, 4, 6], dtype=dtype), + } + ) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) - tm.assert_frame_equal(df.astype(dtype).astype('float64'), df) - - df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b']) - df['b'] = df['b'].astype(dtype) - expected2 = pd.DataFrame({'a': [1., 3., 5.], - 'b': integer_array([2, 4, 6], - dtype=dtype)}) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) + + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + df["b"] = df["b"].astype(dtype) + expected2 = pd.DataFrame( + {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)} + ) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16']) + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes_1d(self, dtype): # GH 22578 - df = pd.DataFrame({'a': [1., 2., 3.]}) + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) - expected1 = pd.DataFrame({'a': integer_array([1, 2, 3], - dtype=dtype)}) + expected1 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - df = pd.DataFrame({'a': [1., 2., 3.]}) - df['a'] = df['a'].astype(dtype) - expected2 = pd.DataFrame({'a': integer_array([1, 2, 3], - dtype=dtype)}) + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) + df["a"] = df["a"].astype(dtype) + expected2 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - @pytest.mark.parametrize("dtype", ['category', 'Int64']) + @pytest.mark.parametrize("dtype", ["category", "Int64"]) def test_astype_extension_dtypes_duplicate_col(self, dtype): # GH 24704 - a1 = Series([0, 
np.nan, 4], name='a') - a2 = Series([np.nan, 3, 5], name='a') + a1 = Series([0, np.nan, 4], name="a") + a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - {100: 'float64', 200: 'uint64'}, 'category', 'float64']) + @pytest.mark.parametrize( + "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] + ) def test_astype_column_metadata(self, dtype): # GH 19920 - columns = pd.UInt64Index([100, 200, 300], name='foo') + columns = pd.UInt64Index([100, 200, 300], name="foo") df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): # tests astype to object dtype # gh-19223 / gh-12425 @@ -758,14 +839,14 @@ def test_astype_from_datetimelike_to_objectt(self, dtype, unit): result = df.astype(object) assert (result.dtypes == object).all() - if dtype.startswith('M8'): + if dtype.startswith("M8"): assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit) else: assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # gh-19223 / gh-12425 @@ -777,7 +858,7 @@ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # gh-19223 @@ -789,7 +870,7 @@ def test_astype_to_datetime_unit(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns']) + @pytest.mark.parametrize("unit", ["ns"]) def test_astype_to_timedelta_unit_ns(self, unit): # preserver the timedelta conversion # gh-19223 @@ -801,7 +882,7 @@ def test_astype_to_timedelta_unit_ns(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float # gh-19223 @@ -813,7 +894,7 @@ def test_astype_to_timedelta_unit(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_incorrect_datetimelike(self, unit): # trying to astype a m to a M, or vice-versa # gh-19224 @@ -821,43 +902,58 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = "m8[{}]".format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[timedelta64\[{}\]\]").format(unit) + msg = ( + r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" + r" 
\[timedelta64\[{}\]\]" + ).format(unit) with pytest.raises(TypeError, match=msg): df.astype(other) - msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[datetime64\[{}\]\]").format(unit) + msg = ( + r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" + r" \[datetime64\[{}\]\]" + ).format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) def test_timedeltas(self): - df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, - freq='D')), - B=Series([timedelta(days=i) for i in range(3)]))) + df = DataFrame( + dict( + A=Series(date_range("2012-1-1", periods=3, freq="D")), + B=Series([timedelta(days=i) for i in range(3)]), + ) + ) result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]')], - index=list("AB")) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) assert_series_equal(result, expected) - df['C'] = df['A'] + df['B'] + df["C"] = df["A"] + df["B"] result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]'), - np.dtype('datetime64[ns]')], - index=list("ABC")) + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) assert_series_equal(result, expected) # mixed int types - df['D'] = 1 + df["D"] = 1 result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]'), - np.dtype('datetime64[ns]'), - np.dtype('int64')], - index=list("ABCD")) + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) assert_series_equal(result, expected) def test_arg_for_errors_in_astype(self): @@ -868,145 +964,208 @@ def test_arg_for_errors_in_astype(self): with pytest.raises(ValueError): df.astype(np.float64, errors=True) - df.astype(np.int8, errors='ignore') + df.astype(np.int8, errors="ignore") def test_arg_for_errors_in_astype_dictlist(self): # GH-25905 - df = pd.DataFrame([ - {'a': '1', 'b': '16.5%', 'c': 'test'}, - {'a': '2.2', 'b': '15.3', 'c': 'another_test'}]) - expected = pd.DataFrame([ - {'a': 1.0, 'b': '16.5%', 'c': 'test'}, - {'a': 2.2, 'b': '15.3', 'c': 'another_test'}]) - type_dict = {'a': 'float64', 'b': 'float64', 'c': 'object'} - - result = df.astype(dtype=type_dict, errors='ignore') + df = pd.DataFrame( + [ + {"a": "1", "b": "16.5%", "c": "test"}, + {"a": "2.2", "b": "15.3", "c": "another_test"}, + ] + ) + expected = pd.DataFrame( + [ + {"a": 1.0, "b": "16.5%", "c": "test"}, + {"a": 2.2, "b": "15.3", "c": "another_test"}, + ] + ) + type_dict = {"a": "float64", "b": "float64", "c": "object"} + + result = df.astype(dtype=type_dict, errors="ignore") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('input_vals', [ - ([1, 2]), - (['1', '2']), - (list(pd.date_range('1/1/2011', periods=2, freq='H'))), - (list(pd.date_range('1/1/2011', periods=2, freq='H', - tz='US/Eastern'))), - ([pd.Interval(left=0, right=5)]), - ]) + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - 
result = DataFrame({'A': input_vals}, dtype=string_dtype) - expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) + result = DataFrame({"A": input_vals}, dtype=string_dtype) + expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) assert_frame_equal(result, expected) def test_constructor_list_str_na(self, string_dtype): result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) - expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) assert_frame_equal(result, expected) - @pytest.mark.parametrize("data, expected", [ - # empty - (DataFrame(), True), - # multi-same - (DataFrame({"A": [1, 2], "B": [1, 2]}), True), - # multi-object - (DataFrame({"A": np.array([1, 2], dtype=object), - "B": np.array(["a", "b"], dtype=object)}), True), - # multi-extension - (DataFrame({"A": pd.Categorical(['a', 'b']), - "B": pd.Categorical(['a', 'b'])}), True), - # differ types - (DataFrame({"A": [1, 2], "B": [1., 2.]}), False), - # differ sizes - (DataFrame({"A": np.array([1, 2], dtype=np.int32), - "B": np.array([1, 2], dtype=np.int64)}), False), - # multi-extension differ - (DataFrame({"A": pd.Categorical(['a', 'b']), - "B": pd.Categorical(['b', 'c'])}), False), - - ]) + @pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + } + ), + True, + ), + # multi-extension + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])} + ), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])} + ), + False, + ), + ], + ) def test_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected def test_asarray_homogenous(self): - df = pd.DataFrame({"A": pd.Categorical([1, 2]), - "B": pd.Categorical([1, 2])}) + df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) result = np.asarray(df) # may change from object in the future - expected = np.array([[1, 1], [2, 2]], dtype='object') + expected = np.array([[1, 1], [2, 2]], dtype="object") tm.assert_numpy_array_equal(result, expected) class TestDataFrameDatetimeWithTZ: - def test_interleave(self, timezone_frame): # interleave with object - result = timezone_frame.assign(D='foo').values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', tz='CET')], - ['foo', 'foo', 'foo']], dtype=object).T + result = timezone_frame.assign(D="foo").values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 
00:00:00+0100", tz="CET"), + ], + ["foo", "foo", "foo"], + ], + dtype=object, + ).T tm.assert_numpy_array_equal(result, expected) # interleave with only datetime64[ns] result = timezone_frame.values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', - tz='CET')]], dtype=object).T + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T tm.assert_numpy_array_equal(result, expected) def test_astype(self, timezone_frame): # astype - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', - tz='CET')]], - dtype=object).T - expected = DataFrame(expected, - index=timezone_frame.index, - columns=timezone_frame.columns, dtype=object) + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + expected = DataFrame( + expected, + index=timezone_frame.index, + columns=timezone_frame.columns, + dtype=object, + ) result = timezone_frame.astype(object) assert_frame_equal(result, expected) - result = timezone_frame.astype('datetime64[ns]') - expected = DataFrame({'A': date_range('20130101', periods=3), - 'B': (date_range('20130101', periods=3, - tz='US/Eastern') - .tz_convert('UTC') - .tz_localize(None)), - 'C': (date_range('20130101', periods=3, - tz='CET') - .tz_convert('UTC') - .tz_localize(None))}) + result = timezone_frame.astype("datetime64[ns]") + expected = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": ( + date_range("20130101", periods=3, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ), + "C": ( + date_range("20130101", periods=3, tz="CET") + .tz_convert("UTC") + .tz_localize(None) + ), + } + ) expected.iloc[1, 1] = pd.NaT expected.iloc[1, 2] = pd.NaT assert_frame_equal(result, expected) @@ -1014,19 +1173,32 @@ def test_astype(self, timezone_frame): def test_astype_str(self, timezone_frame): # str formatting result = timezone_frame.astype(str) - expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00', - '2013-01-01 00:00:00+01:00'], - ['2013-01-02', 'NaT', 'NaT'], - ['2013-01-03', '2013-01-03 00:00:00-05:00', - '2013-01-03 00:00:00+01:00']], - columns=timezone_frame.columns) + expected = DataFrame( + [ + [ + "2013-01-01", + "2013-01-01 00:00:00-05:00", + "2013-01-01 00:00:00+01:00", + ], + ["2013-01-02", "NaT", "NaT"], + 
[ + "2013-01-03", + "2013-01-03 00:00:00-05:00", + "2013-01-03 00:00:00+01:00", + ], + ], + columns=timezone_frame.columns, + ) tm.assert_frame_equal(result, expected) - with option_context('display.max_columns', 20): + with option_context("display.max_columns", 20): result = str(timezone_frame) - assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 ' - '2013-01-01 00:00:00+01:00') in result - assert ('1 2013-01-02 ' - 'NaT NaT') in result - assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 ' - '2013-01-03 00:00:00+01:00') in result + assert ( + "0 2013-01-01 2013-01-01 00:00:00-05:00 " "2013-01-01 00:00:00+01:00" + ) in result + assert ( + "1 2013-01-02 " "NaT NaT" + ) in result + assert ( + "2 2013-01-03 2013-01-03 00:00:00-05:00 " "2013-01-03 00:00:00+01:00" + ) in result diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 703d273b3ca24c..0ea24777ae1f55 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -5,12 +5,10 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) def test_duplicated_with_misspelled_column_name(subset): # GH 19730 - df = DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) with pytest.raises(KeyError): df.duplicated(subset) @@ -24,8 +22,9 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): # gh-21524 # Given the wide dataframe with a lot of columns # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } df = DataFrame(data).T result = df.duplicated() @@ -36,37 +35,47 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): assert result.dtype == np.bool -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_keep(keep, expected): - df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_nan_none(keep, expected): - df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('keep', ['first', 'last', False]) -@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) 
+@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) def test_duplicated_subset(subset, keep): - df = DataFrame({'A': [0, 1, 1, 2, 0], - 'B': ['a', 'b', 'b', 'c', 'a'], - 'C': [np.nan, 3, 3, None, np.nan]}) + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) if subset is None: subset = list(df.columns) @@ -81,77 +90,77 @@ def test_duplicated_subset(subset, keep): def test_drop_duplicates(): - df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('AAA') + result = df.drop_duplicates("AAA") expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep='last') + result = df.drop_duplicates("AAA", keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep=False) + result = df.drop_duplicates("AAA", keep=False) expected = df.loc[[]] tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates(np.array(['AAA', 'B'])) + result = df.drop_duplicates(np.array(["AAA", "B"])) tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B']) + result = df.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AAA', 'B'), keep='last') + result = df.drop_duplicates(("AAA", "B"), keep="last") expected = df.loc[[0, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AAA', 'B'), keep=False) + result = df.drop_duplicates(("AAA", "B"), keep=False) expected = df.loc[[0]] tm.assert_frame_equal(result, expected) # consider everything - df2 = df.loc[:, ['AAA', 'B', 'C']] + df2 = df.loc[:, ["AAA", "B", "C"]] result = df2.drop_duplicates() # in this case only - expected = df2.drop_duplicates(['AAA', 'B']) + expected = df2.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) - result = df2.drop_duplicates(keep='last') - expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + result = df2.drop_duplicates(keep="last") + expected = df2.drop_duplicates(["AAA", "B"], keep="last") tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) - expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + expected = df2.drop_duplicates(["AAA", "B"], keep=False) tm.assert_frame_equal(result, expected) # integers - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) - df['E'] = df['C'].astype('int8') - result = df.drop_duplicates('E') + df["E"] = df["C"].astype("int8") + result = df.drop_duplicates("E") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('E', keep='last') + result = df.drop_duplicates("E", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) # GH 
11376 - df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) + df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) @@ -172,15 +181,15 @@ def test_drop_duplicates(): df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) - for keep in ['first', 'last', False]: + for keep in ["first", "last", False]: assert df.duplicated(keep=keep).sum() == 0 def test_duplicated_on_empty_frame(): # GH 25184 - df = DataFrame(columns=['a', 'b']) - dupes = df.duplicated('a') + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") result = df[dupes] expected = df.copy() @@ -189,90 +198,91 @@ def test_duplicated_on_empty_frame(): def test_drop_duplicates_with_duplicate_column_names(): # GH17836 - df = DataFrame([ - [1, 2, 5], - [3, 4, 6], - [3, 4, 7] - ], columns=['a', 'a', 'b']) + df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) result0 = df.drop_duplicates() tm.assert_frame_equal(result0, df) - result1 = df.drop_duplicates('a') + result1 = df.drop_duplicates("a") expected1 = df[:2] tm.assert_frame_equal(result1, expected1) def test_drop_duplicates_for_take_all(): - df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', - 'foo', 'bar', 'qux', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('AAA') + result = df.drop_duplicates("AAA") expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep='last') + result = df.drop_duplicates("AAA", keep="last") expected = df.iloc[[2, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep=False) + result = df.drop_duplicates("AAA", keep=False) expected = df.iloc[[2, 6]] tm.assert_frame_equal(result, expected) # multiple columns - result = df.drop_duplicates(['AAA', 'B']) + result = df.drop_duplicates(["AAA", "B"]) expected = df.iloc[[0, 1, 2, 3, 4, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B'], keep='last') + result = df.drop_duplicates(["AAA", "B"], keep="last") expected = df.iloc[[0, 1, 2, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B'], keep=False) + result = df.drop_duplicates(["AAA", "B"], keep=False) expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_tuple(): - df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates(('AA', 'AB')) + result = df.drop_duplicates(("AA", "AB")) expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AA', 'AB'), keep='last') + result = df.drop_duplicates(("AA", "AB"), keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, 
expected) - result = df.drop_duplicates(('AA', 'AB'), keep=False) + result = df.drop_duplicates(("AA", "AB"), keep=False) expected = df.loc[[]] # empty df assert len(result) == 0 tm.assert_frame_equal(result, expected) # multi column expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates((('AA', 'AB'), 'B')) + result = df.drop_duplicates((("AA", "AB"), "B")) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('df', [ - DataFrame(), - DataFrame(columns=[]), - DataFrame(columns=['A', 'B', 'C']), - DataFrame(index=[]), - DataFrame(index=['A', 'B', 'C']) -]) +@pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=["A", "B", "C"]), + DataFrame(index=[]), + DataFrame(index=["A", "B", "C"]), + ], +) def test_drop_duplicates_empty(df): # GH 20516 result = df.drop_duplicates() @@ -285,134 +295,140 @@ def test_drop_duplicates_empty(df): def test_drop_duplicates_NA(): # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': range(8), - }) + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('A') + result = df.drop_duplicates("A") expected = df.loc[[0, 2, 3]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep='last') + result = df.drop_duplicates("A", keep="last") expected = df.loc[[1, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep=False) + result = df.drop_duplicates("A", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column - result = df.drop_duplicates(['A', 'B']) + result = df.drop_duplicates(["A", "B"]) expected = df.loc[[0, 2, 3, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['A', 'B'], keep='last') + result = df.drop_duplicates(["A", "B"], keep="last") expected = df.loc[[1, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['A', 'B'], keep=False) + result = df.drop_duplicates(["A", "B"], keep=False) expected = df.loc[[6]] tm.assert_frame_equal(result, expected) # nan - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': range(8), - }) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.loc[[3, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep=False) + result = df.drop_duplicates("C", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column - result = df.drop_duplicates(['C', 'B']) + result = df.drop_duplicates(["C", "B"]) expected = df.loc[[0, 1, 2, 4]] tm.assert_frame_equal(result, 
expected) - result = df.drop_duplicates(['C', 'B'], keep='last') + result = df.drop_duplicates(["C", "B"], keep="last") expected = df.loc[[1, 3, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['C', 'B'], keep=False) + result = df.drop_duplicates(["C", "B"], keep=False) expected = df.loc[[1]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_NA_for_take_all(): # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'baz', 'bar', 'qux'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], + } + ) # single column - result = df.drop_duplicates('A') + result = df.drop_duplicates("A") expected = df.iloc[[0, 2, 3, 5, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep='last') + result = df.drop_duplicates("A", keep="last") expected = df.iloc[[1, 4, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep=False) + result = df.drop_duplicates("A", keep=False) expected = df.iloc[[5, 7]] tm.assert_frame_equal(result, expected) # nan # single column - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df.iloc[[0, 1, 5, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.iloc[[3, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep=False) + result = df.drop_duplicates("C", keep=False) expected = df.iloc[[5, 6]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_inplace(): - orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + orig = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column df = orig.copy() - df.drop_duplicates('A', inplace=True) + df.drop_duplicates("A", inplace=True) expected = orig[:2] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates('A', keep='last', inplace=True) + df.drop_duplicates("A", keep="last", inplace=True) expected = orig.loc[[6, 7]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates('A', keep=False, inplace=True) + df.drop_duplicates("A", keep=False, inplace=True) expected = orig.loc[[]] result = df tm.assert_frame_equal(result, expected) @@ -420,41 +436,41 @@ def test_drop_duplicates_inplace(): # multi column df = orig.copy() - df.drop_duplicates(['A', 'B'], inplace=True) + df.drop_duplicates(["A", "B"], inplace=True) expected = orig.loc[[0, 1, 2, 3]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + df.drop_duplicates(["A", "B"], keep="last", inplace=True) expected = orig.loc[[0, 5, 6, 7]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + df.drop_duplicates(["A", "B"], keep=False, inplace=True) expected = orig.loc[[0]] result = df tm.assert_frame_equal(result, expected) # consider everything - orig2 = orig.loc[:, ['A', 'B', 'C']].copy() + orig2 = orig.loc[:, ["A", "B", "C"]].copy() df2 = 
orig2.copy() df2.drop_duplicates(inplace=True) # in this case only - expected = orig2.drop_duplicates(['A', 'B']) + expected = orig2.drop_duplicates(["A", "B"]) result = df2 tm.assert_frame_equal(result, expected) df2 = orig2.copy() - df2.drop_duplicates(keep='last', inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep='last') + df2.drop_duplicates(keep="last", inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep="last") result = df2 tm.assert_frame_equal(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep=False, inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep=False) + expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index f8af942f676579..c2d38b2938fca2 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -11,20 +11,31 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - Timestamp, date_range, isna, notna) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, +) import pandas.core.common as com from pandas.core.indexing import IndexingError from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.tseries.offsets import BDay class TestDataFrameIndexing(TestData): - def test_getitem(self, float_frame): # Slicing sl = float_frame[:20] @@ -38,70 +49,81 @@ def test_getitem(self, float_frame): for key, _ in float_frame._series.items(): assert float_frame[key] is not None - assert 'random' not in float_frame - with pytest.raises(KeyError, match='random'): - float_frame['random'] + assert "random" not in float_frame + with pytest.raises(KeyError, match="random"): + float_frame["random"] df = float_frame.copy() - df['$10'] = np.random.randn(len(df)) + df["$10"] = np.random.randn(len(df)) ad = np.random.randn(len(df)) - df['@awesome_domain'] = ad + df["@awesome_domain"] = ad with pytest.raises(KeyError): df.__getitem__('df["$10"]') - res = df['@awesome_domain'] + res = df["@awesome_domain"] tm.assert_numpy_array_equal(ad, res.values) def test_getitem_dupe_cols(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) with pytest.raises(KeyError): - df[['baf']] + df[["baf"]] def test_get(self, float_frame): - b = float_frame.get('B') - assert_series_equal(b, float_frame['B']) - - assert float_frame.get('foo') is None - assert_series_equal(float_frame.get('foo', float_frame['B']), - float_frame['B']) - - @pytest.mark.parametrize("df", [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)) - ]) + b = float_frame.get("B") + assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + assert_series_equal(float_frame.get("foo", float_frame["B"]), float_frame["B"]) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) def test_get_none(self, df): # see gh-5652 assert df.get(None) is None - @pytest.mark.parametrize('key_type', [iter, np.array, Series, Index]) + @pytest.mark.parametrize("key_type", [iter, 
np.array, Series, Index]) def test_loc_iterable(self, float_frame, key_type): - idx = key_type(['A', 'B', 'C']) + idx = key_type(["A", "B", "C"]) result = float_frame.loc[:, idx] - expected = float_frame.loc[:, ['A', 'B', 'C']] + expected = float_frame.loc[:, ["A", "B", "C"]] assert_frame_equal(result, expected) @pytest.mark.parametrize( "idx_type", - [list, iter, Index, set, - lambda l: dict(zip(l, range(len(l)))), - lambda l: dict(zip(l, range(len(l)))).keys()], - ids=["list", "iter", "Index", "set", "dict", "dict_keys"]) + [ + list, + iter, + Index, + set, + lambda l: dict(zip(l, range(len(l)))), + lambda l: dict(zip(l, range(len(l)))).keys(), + ], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"], + ) @pytest.mark.parametrize("levels", [1, 2]) def test_getitem_listlike(self, idx_type, levels, float_frame): # GH 21294 if levels == 1: - frame, missing = float_frame, 'food' + frame, missing = float_frame, "food" else: # MultiIndex columns - frame = DataFrame(np.random.randn(8, 3), - columns=Index([('foo', 'bar'), ('baz', 'qux'), - ('peek', 'aboo')], - name=('sth', 'sth2'))) - missing = ('good', 'food') + frame = DataFrame( + np.random.randn(8, 3), + columns=Index( + [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")], + name=("sth", "sth2"), + ), + ) + missing = ("good", "food") keys = [frame.columns[1], frame.columns[0]] idx = idx_type(keys) @@ -115,16 +137,15 @@ def test_getitem_listlike(self, idx_type, levels, float_frame): assert_frame_equal(result, expected) idx = idx_type(keys + [missing]) - with pytest.raises(KeyError, match='not in index'): + with pytest.raises(KeyError, match="not in index"): frame[idx] - @pytest.mark.parametrize("val,expected", [ - (2**63 - 1, Series([1])), - (2**63, Series([2])), - ]) + @pytest.mark.parametrize( + "val,expected", [(2 ** 63 - 1, Series([1])), (2 ** 63, Series([2]))] + ) def test_loc_uint64(self, val, expected): # see gh-19399 - df = DataFrame([1, 2], index=[2**63 - 1, 2**63]) + df = DataFrame([1, 2], index=[2 ** 63 - 1, 2 ** 63]) result = df.loc[val] expected.name = val @@ -132,11 +153,11 @@ def test_loc_uint64(self, val, expected): def test_getitem_callable(self, float_frame): # GH 12533 - result = float_frame[lambda x: 'A'] - tm.assert_series_equal(result, float_frame.loc[:, 'A']) + result = float_frame[lambda x: "A"] + tm.assert_series_equal(result, float_frame.loc[:, "A"]) - result = float_frame[lambda x: ['A', 'B']] - tm.assert_frame_equal(result, float_frame.loc[:, ['A', 'B']]) + result = float_frame[lambda x: ["A", "B"]] + tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]]) df = float_frame[:3] result = df[lambda x: [True, False, True]] @@ -144,79 +165,77 @@ def test_getitem_callable(self, float_frame): def test_setitem_list(self, float_frame): - float_frame['E'] = 'foo' - data = float_frame[['A', 'B']] - float_frame[['B', 'A']] = data + float_frame["E"] = "foo" + data = float_frame[["A", "B"]] + float_frame[["B", "A"]] = data - assert_series_equal(float_frame['B'], data['A'], check_names=False) - assert_series_equal(float_frame['A'], data['B'], check_names=False) + assert_series_equal(float_frame["B"], data["A"], check_names=False) + assert_series_equal(float_frame["A"], data["B"], check_names=False) - msg = 'Columns must be same length as key' + msg = "Columns must be same length as key" with pytest.raises(ValueError, match=msg): - data[['A']] = float_frame[['A', 'B']] + data[["A"]] = float_frame[["A", "B"]] - msg = 'Length of values does not match length of index' + msg = "Length of values does not match length of 
index" with pytest.raises(ValueError, match=msg): - data['A'] = range(len(data.index) - 1) + data["A"] = range(len(data.index) - 1) - df = DataFrame(0, index=range(3), columns=['tt1', 'tt2'], - dtype=np.int_) - df.loc[1, ['tt1', 'tt2']] = [1, 2] + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df.loc[1, ["tt1", "tt2"]] = [1, 2] - result = df.loc[df.index[1], ['tt1', 'tt2']] + result = df.loc[df.index[1], ["tt1", "tt2"]] expected = Series([1, 2], df.columns, dtype=np.int_, name=1) assert_series_equal(result, expected) - df['tt1'] = df['tt2'] = '0' - df.loc[df.index[1], ['tt1', 'tt2']] = ['1', '2'] - result = df.loc[df.index[1], ['tt1', 'tt2']] - expected = Series(['1', '2'], df.columns, name=1) + df["tt1"] = df["tt2"] = "0" + df.loc[df.index[1], ["tt1", "tt2"]] = ["1", "2"] + result = df.loc[df.index[1], ["tt1", "tt2"]] + expected = Series(["1", "2"], df.columns, name=1) assert_series_equal(result, expected) def test_setitem_list_not_dataframe(self, float_frame): data = np.random.randn(len(float_frame), 2) - float_frame[['A', 'B']] = data - assert_almost_equal(float_frame[['A', 'B']].values, data) + float_frame[["A", "B"]] = data + assert_almost_equal(float_frame[["A", "B"]].values, data) def test_setitem_list_of_tuples(self, float_frame): - tuples = list(zip(float_frame['A'], float_frame['B'])) - float_frame['tuples'] = tuples + tuples = list(zip(float_frame["A"], float_frame["B"])) + float_frame["tuples"] = tuples - result = float_frame['tuples'] - expected = Series(tuples, index=float_frame.index, name='tuples') + result = float_frame["tuples"] + expected = Series(tuples, index=float_frame.index, name="tuples") assert_series_equal(result, expected) def test_setitem_mulit_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns - it = ['jim', 'joe', 'jolie'], ['first', 'last'], \ - ['left', 'center', 'right'] + it = ["jim", "joe", "jolie"], ["first", "last"], ["left", "center", "right"] cols = MultiIndex.from_product(it) - index = pd.date_range('20141006', periods=20) + index = pd.date_range("20141006", periods=20) vals = np.random.randint(1, 1000, (len(index), len(cols))) df = pd.DataFrame(vals, columns=cols, index=index) i, j = df.index.values.copy(), it[-1][:] np.random.shuffle(i) - df['jim'] = df['jolie'].loc[i, ::-1] - assert_frame_equal(df['jim'], df['jolie']) + df["jim"] = df["jolie"].loc[i, ::-1] + assert_frame_equal(df["jim"], df["jolie"]) np.random.shuffle(j) - df[('joe', 'first')] = df[('jolie', 'last')].loc[i, j] - assert_frame_equal(df[('joe', 'first')], df[('jolie', 'last')]) + df[("joe", "first")] = df[("jolie", "last")].loc[i, j] + assert_frame_equal(df[("joe", "first")], df[("jolie", "last")]) np.random.shuffle(j) - df[('joe', 'last')] = df[('jolie', 'first')].loc[i, j] - assert_frame_equal(df[('joe', 'last')], df[('jolie', 'first')]) + df[("joe", "last")] = df[("jolie", "first")].loc[i, j] + assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) def test_setitem_callable(self): # GH 12533 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}) - df[lambda x: 'A'] = [11, 12, 13, 14] + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}) + df[lambda x: "A"] = [11, 12, 13, 14] - exp = pd.DataFrame({'A': [11, 12, 13, 14], 'B': [5, 6, 7, 8]}) + exp = pd.DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) def test_setitem_other_callable(self): @@ -230,8 +249,9 @@ def inc(x): expected = pd.DataFrame([[-1, inc], [inc, -1]]) 
tm.assert_frame_equal(df, expected) - def test_getitem_boolean(self, float_string_frame, mixed_float_frame, - mixed_int_frame, datetime_frame): + def test_getitem_boolean( + self, float_string_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): # boolean indexing d = datetime_frame.index[10] indexer = datetime_frame.index > d @@ -241,13 +261,13 @@ def test_getitem_boolean(self, float_string_frame, mixed_float_frame, subframe = datetime_frame[indexer] tm.assert_index_equal(subindex, subframe.index) - with pytest.raises(ValueError, match='Item wrong length'): + with pytest.raises(ValueError, match="Item wrong length"): datetime_frame[indexer[:-1]] subframe_obj = datetime_frame[indexer_obj] assert_frame_equal(subframe_obj, subframe) - with pytest.raises(ValueError, match='boolean values only'): + with pytest.raises(ValueError, match="boolean values only"): datetime_frame[datetime_frame] # test that Series work @@ -266,16 +286,22 @@ def test_getitem_boolean(self, float_string_frame, mixed_float_frame, assert_frame_equal(subframe_obj, subframe) # test df[df > 0] - for df in [datetime_frame, float_string_frame, - mixed_float_frame, mixed_int_frame]: + for df in [ + datetime_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: continue data = df._get_numeric_data() bif = df[df > 0] - bifw = DataFrame({c: np.where(data[c] > 0, data[c], np.nan) - for c in data.columns}, - index=data.index, columns=data.columns) + bifw = DataFrame( + {c: np.where(data[c] > 0, data[c], np.nan) for c in data.columns}, + index=data.index, + columns=data.columns, + ) # add back other columns to compare for c in df.columns: @@ -292,31 +318,35 @@ def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() - df['E'] = 1 - df['E'] = df['E'].astype('int32') - df['E1'] = df['E'].copy() - df['F'] = 1 - df['F'] = df['F'].astype('int64') - df['F1'] = df['F'].copy() + df["E"] = 1 + df["E"] = df["E"].astype("int32") + df["E1"] = df["E"].copy() + df["F"] = 1 + df["F"] = df["F"].astype("int64") + df["F1"] = df["F"].copy() casted = df[df > 0] result = casted.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('int32')] * 2 + - [np.dtype('int64')] * 2, - index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] * 2 + + [np.dtype("int64")] * 2, + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) assert_series_equal(result, expected) # int block splitting - df.loc[df.index[1:3], ['E1', 'F1']] = 0 + df.loc[df.index[1:3], ["E1", "F1"]] = 0 casted = df[df > 0] result = casted.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('int32')] + - [np.dtype('float64')] + - [np.dtype('int64')] + - [np.dtype('float64')], - index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] + + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("float64")], + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) assert_series_equal(result, expected) # where dtype conversions @@ -344,7 +374,7 @@ def _checkit(lst): def test_getitem_boolean_iadd(self): arr = np.random.randn(5, 5) - df = DataFrame(arr.copy(), columns=['A', 'B', 'C', 'D', 'E']) + df = DataFrame(arr.copy(), columns=["A", "B", "C", "D", "E"]) df[df < 0] += 1 arr[arr < 0] += 1 @@ -353,8 +383,7 @@ def test_getitem_boolean_iadd(self): def test_boolean_index_empty_corner(self): # #2096 - blah = DataFrame(np.empty([0, 
1]), columns=['A'], - index=DatetimeIndex([])) + blah = DataFrame(np.empty([0, 1]), columns=["A"], index=DatetimeIndex([])) # both of these should succeed trivially k = np.array([], bool) @@ -363,8 +392,9 @@ def test_boolean_index_empty_corner(self): blah[k] = 0 def test_getitem_ix_mixed_integer(self): - df = DataFrame(np.random.randn(4, 3), - index=[1, 10, 'C', 'E'], columns=[1, 2, 3]) + df = DataFrame( + np.random.randn(4, 3), index=[1, 10, "C", "E"], columns=[1, 2, 3] + ) result = df.iloc[:-1] expected = df.loc[df.index[:-1]] @@ -377,11 +407,15 @@ def test_getitem_ix_mixed_integer(self): assert_frame_equal(result, expected) # 11320 - df = pd.DataFrame({"rna": (1.5, 2.2, 3.2, 4.5), - -1000: [11, 21, 36, 40], - 0: [10, 22, 43, 34], - 1000: [0, 10, 20, 30]}, - columns=['rna', -1000, 0, 1000]) + df = pd.DataFrame( + { + "rna": (1.5, 2.2, 3.2, 4.5), + -1000: [11, 21, 36, 40], + 0: [10, 22, 43, 34], + 1000: [0, 10, 20, 30], + }, + columns=["rna", -1000, 0, 1000], + ) result = df[[1000]] expected = df.iloc[:, [3]] assert_frame_equal(result, expected) @@ -393,22 +427,22 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, -1] - assert_series_equal(result, float_frame['D']) + assert_series_equal(result, float_frame["D"]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, [-1]] - assert_frame_equal(result, float_frame[['D']]) + assert_frame_equal(result, float_frame[["D"]]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, [-1, -2]] - assert_frame_equal(result, float_frame[['D', 'C']]) + assert_frame_equal(result, float_frame[["D", "C"]]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) float_frame.ix[:, [-1]] = 0 - assert (float_frame['D'] == 0).all() + assert (float_frame["D"] == 0).all() df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index @@ -423,8 +457,7 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): df.ix[:, [-1]] # #1942 - a = DataFrame(np.random.randn(20, 2), - index=[chr(x + 65) for x in range(20)]) + a = DataFrame(np.random.randn(20, 2), index=[chr(x + 65) for x in range(20)]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) a.ix[-1] = a.ix[-2] @@ -432,64 +465,64 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_series_equal(a.ix[-1], a.ix[-2], check_names=False) - assert a.ix[-1].name == 'T' - assert a.ix[-2].name == 'S' + assert a.ix[-1].name == "T" + assert a.ix[-2].name == "S" def test_getattr(self, float_frame): - assert_series_equal(float_frame.A, float_frame['A']) + assert_series_equal(float_frame.A, float_frame["A"]) msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" with pytest.raises(AttributeError, match=msg): float_frame.NONEXISTENT_NAME def test_setattr_column(self): - df = DataFrame({'foobar': 1}, index=range(10)) + df = DataFrame({"foobar": 1}, index=range(10)) df.foobar = 5 assert (df.foobar == 5).all() def test_setitem(self, float_frame): # not sure what else to do here - series = float_frame['A'][::2] - float_frame['col5'] = series - assert 'col5' in float_frame + series = float_frame["A"][::2] + float_frame["col5"] = series + assert "col5" in float_frame assert len(series) == 15 assert len(float_frame) == 30 exp = 
np.ravel(np.column_stack((series.values, [np.nan] * 15))) - exp = Series(exp, index=float_frame.index, name='col5') - tm.assert_series_equal(float_frame['col5'], exp) + exp = Series(exp, index=float_frame.index, name="col5") + tm.assert_series_equal(float_frame["col5"], exp) - series = float_frame['A'] - float_frame['col6'] = series - tm.assert_series_equal(series, float_frame['col6'], check_names=False) + series = float_frame["A"] + float_frame["col6"] = series + tm.assert_series_equal(series, float_frame["col6"], check_names=False) with pytest.raises(KeyError): float_frame[np.random.randn(len(float_frame) + 1)] = 1 # set ndarray arr = np.random.randn(len(float_frame)) - float_frame['col9'] = arr - assert (float_frame['col9'] == arr).all() + float_frame["col9"] = arr + assert (float_frame["col9"] == arr).all() - float_frame['col7'] = 5 - assert((float_frame['col7'] == 5).all()) + float_frame["col7"] = 5 + assert (float_frame["col7"] == 5).all() - float_frame['col0'] = 3.14 - assert((float_frame['col0'] == 3.14).all()) + float_frame["col0"] = 3.14 + assert (float_frame["col0"] == 3.14).all() - float_frame['col8'] = 'foo' - assert((float_frame['col8'] == 'foo').all()) + float_frame["col8"] = "foo" + assert (float_frame["col8"] == "foo").all() # this is partially a view (e.g. some blocks are view) # so raise/warn smaller = float_frame[:2] with pytest.raises(com.SettingWithCopyError): - smaller['col10'] = ['1', '2'] + smaller["col10"] = ["1", "2"] - assert smaller['col10'].dtype == np.object_ - assert (smaller['col10'] == ['1', '2']).all() + assert smaller["col10"].dtype == np.object_ + assert (smaller["col10"] == ["1", "2"]).all() # dtype changing GH4204 df = DataFrame([[0, 0]]) @@ -509,27 +542,26 @@ def test_setitem_dtype(self, dtype, float_frame): assert float_frame[dtype].dtype.name == dtype def test_setitem_tuple(self, float_frame): - float_frame['A', 'B'] = float_frame['A'] - assert_series_equal(float_frame['A', 'B'], float_frame[ - 'A'], check_names=False) + float_frame["A", "B"] = float_frame["A"] + assert_series_equal(float_frame["A", "B"], float_frame["A"], check_names=False) def test_setitem_always_copy(self, float_frame): - s = float_frame['A'].copy() - float_frame['E'] = s + s = float_frame["A"].copy() + float_frame["E"] = s - float_frame['E'][5:10] = np.nan + float_frame["E"][5:10] = np.nan assert notna(s[5:10]).all() def test_setitem_boolean(self, float_frame): df = float_frame.copy() values = float_frame.values - df[df['A'] > 0] = 4 + df[df["A"] > 0] = 4 values[values[:, 0] > 0] = 4 assert_almost_equal(df.values, values) # test that column reindexing works - series = df['A'] == 4 + series = df["A"] == 4 series = series.reindex(df.index[::-1]) df[series] = 1 values[values[:, 0] == 4] = 1 @@ -572,9 +604,9 @@ def test_setitem_boolean(self, float_frame): @pytest.mark.parametrize( "mask_type", - [lambda df: df > np.abs(df) / 2, - lambda df: (df > np.abs(df) / 2).values], - ids=['dataframe', 'array']) + [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], + ids=["dataframe", "array"], + ) def test_setitem_boolean_mask(self, mask_type, float_frame): # Test for issue #18582 @@ -590,63 +622,65 @@ def test_setitem_boolean_mask(self, mask_type, float_frame): assert_frame_equal(result, expected) def test_setitem_cast(self, float_frame): - float_frame['D'] = float_frame['D'].astype('i8') - assert float_frame['D'].dtype == np.int64 + float_frame["D"] = float_frame["D"].astype("i8") + assert float_frame["D"].dtype == np.int64 # #669, should not cast? 
# this is now set to int64, which means a replacement of the column to # the value dtype (and nothing to do with the existing dtype) - float_frame['B'] = 0 - assert float_frame['B'].dtype == np.int64 + float_frame["B"] = 0 + assert float_frame["B"].dtype == np.int64 # cast if pass array of course - float_frame['B'] = np.arange(len(float_frame)) - assert issubclass(float_frame['B'].dtype.type, np.integer) + float_frame["B"] = np.arange(len(float_frame)) + assert issubclass(float_frame["B"].dtype.type, np.integer) - float_frame['foo'] = 'bar' - float_frame['foo'] = 0 - assert float_frame['foo'].dtype == np.int64 + float_frame["foo"] = "bar" + float_frame["foo"] = 0 + assert float_frame["foo"].dtype == np.int64 - float_frame['foo'] = 'bar' - float_frame['foo'] = 2.5 - assert float_frame['foo'].dtype == np.float64 + float_frame["foo"] = "bar" + float_frame["foo"] = 2.5 + assert float_frame["foo"].dtype == np.float64 - float_frame['something'] = 0 - assert float_frame['something'].dtype == np.int64 - float_frame['something'] = 2 - assert float_frame['something'].dtype == np.int64 - float_frame['something'] = 2.5 - assert float_frame['something'].dtype == np.float64 + float_frame["something"] = 0 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2.5 + assert float_frame["something"].dtype == np.float64 # GH 7704 # dtype conversion on setting - df = DataFrame(np.random.rand(30, 3), columns=tuple('ABC')) - df['event'] = np.nan - df.loc[10, 'event'] = 'foo' + df = DataFrame(np.random.rand(30, 3), columns=tuple("ABC")) + df["event"] = np.nan + df.loc[10, "event"] = "foo" result = df.dtypes - expected = Series([np.dtype('float64')] * 3 + [np.dtype('object')], - index=['A', 'B', 'C', 'event']) + expected = Series( + [np.dtype("float64")] * 3 + [np.dtype("object")], + index=["A", "B", "C", "event"], + ) assert_series_equal(result, expected) # Test that data type is preserved . #5782 - df = DataFrame({'one': np.arange(6, dtype=np.int8)}) - df.loc[1, 'one'] = 6 + df = DataFrame({"one": np.arange(6, dtype=np.int8)}) + df.loc[1, "one"] = 6 assert df.dtypes.one == np.dtype(np.int8) df.one = np.int8(7) assert df.dtypes.one == np.dtype(np.int8) def test_setitem_boolean_column(self, float_frame): expected = float_frame.copy() - mask = float_frame['A'] > 0 + mask = float_frame["A"] > 0 - float_frame.loc[mask, 'B'] = 0 + float_frame.loc[mask, "B"] = 0 expected.values[mask.values, 1] = 0 assert_frame_equal(float_frame, expected) def test_frame_setitem_timestamp(self): # GH#2155 - columns = date_range(start='1/1/2012', end='2/1/2012', freq=BDay()) + columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay()) data = DataFrame(columns=columns, index=range(10)) t = datetime(2012, 11, 1) ts = Timestamp(t) @@ -655,60 +689,60 @@ def test_frame_setitem_timestamp(self): def test_setitem_corner(self, float_frame): # corner case - df = DataFrame({'B': [1., 2., 3.], - 'C': ['a', 'b', 'c']}, - index=np.arange(3)) - del df['B'] - df['B'] = [1., 2., 3.] - assert 'B' in df + df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) + del df["B"] + df["B"] = [1.0, 2.0, 3.0] + assert "B" in df assert len(df.columns) == 2 - df['A'] = 'beginning' - df['E'] = 'foo' - df['D'] = 'bar' - df[datetime.now()] = 'date' - df[datetime.now()] = 5. 
+ df["A"] = "beginning" + df["E"] = "foo" + df["D"] = "bar" + df[datetime.now()] = "date" + df[datetime.now()] = 5.0 # what to do when empty frame with index dm = DataFrame(index=float_frame.index) - dm['A'] = 'foo' - dm['B'] = 'bar' + dm["A"] = "foo" + dm["B"] = "bar" assert len(dm.columns) == 2 assert dm.values.dtype == np.object_ # upcast - dm['C'] = 1 - assert dm['C'].dtype == np.int64 + dm["C"] = 1 + assert dm["C"].dtype == np.int64 - dm['E'] = 1. - assert dm['E'].dtype == np.float64 + dm["E"] = 1.0 + assert dm["E"].dtype == np.float64 # set existing column - dm['A'] = 'bar' - assert 'bar' == dm['A'][0] + dm["A"] = "bar" + assert "bar" == dm["A"][0] dm = DataFrame(index=np.arange(3)) - dm['A'] = 1 - dm['foo'] = 'bar' - del dm['foo'] - dm['foo'] = 'bar' - assert dm['foo'].dtype == np.object_ + dm["A"] = 1 + dm["foo"] = "bar" + del dm["foo"] + dm["foo"] = "bar" + assert dm["foo"].dtype == np.object_ - dm['coercable'] = ['1', '2', '3'] - assert dm['coercable'].dtype == np.object_ + dm["coercable"] = ["1", "2", "3"] + assert dm["coercable"].dtype == np.object_ def test_setitem_corner2(self): - data = {"title": ['foobar', 'bar', 'foobar'] + ['foobar'] * 17, - "cruft": np.random.random(20)} + data = { + "title": ["foobar", "bar", "foobar"] + ["foobar"] * 17, + "cruft": np.random.random(20), + } df = DataFrame(data) - ix = df[df['title'] == 'bar'].index + ix = df[df["title"] == "bar"].index - df.loc[ix, ['title']] = 'foobar' - df.loc[ix, ['cruft']] = 0 + df.loc[ix, ["title"]] = "foobar" + df.loc[ix, ["cruft"]] = 0 - assert df.loc[1, 'title'] == 'foobar' - assert df.loc[1, 'cruft'] == 0 + assert df.loc[1, "title"] == "foobar" + assert df.loc[1, "cruft"] == 0 def test_setitem_ambig(self): # Difficulties with mixed-type data @@ -717,9 +751,8 @@ def test_setitem_ambig(self): # Created as float type dm = DataFrame(index=range(3), columns=range(3)) - coercable_series = Series([Decimal(1) for _ in range(3)], - index=range(3)) - uncoercable_series = Series(['foo', 'bzr', 'baz'], index=range(3)) + coercable_series = Series([Decimal(1) for _ in range(3)], index=range(3)) + uncoercable_series = Series(["foo", "bzr", "baz"], index=range(3)) dm[0] = np.ones(3) assert len(dm.columns) == 3 @@ -733,46 +766,44 @@ def test_setitem_ambig(self): def test_setitem_clear_caches(self): # see gh-304 - df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, - index=[0, 1, 2, 3]) - df.insert(2, 'z', np.nan) + df = DataFrame( + {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] + ) + df.insert(2, "z", np.nan) # cache it - foo = df['z'] - df.loc[df.index[2:], 'z'] = 42 + foo = df["z"] + df.loc[df.index[2:], "z"] = 42 - expected = Series([np.nan, np.nan, 42, 42], index=df.index, name='z') + expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") - assert df['z'] is not foo - tm.assert_series_equal(df['z'], expected) + assert df["z"] is not foo + tm.assert_series_equal(df["z"], expected) def test_setitem_None(self, float_frame): # GH #766 - float_frame[None] = float_frame['A'] + float_frame[None] = float_frame["A"] assert_series_equal( - float_frame.iloc[:, -1], float_frame['A'], check_names=False) - assert_series_equal(float_frame.loc[:, None], float_frame[ - 'A'], check_names=False) - assert_series_equal(float_frame[None], float_frame[ - 'A'], check_names=False) + float_frame.iloc[:, -1], float_frame["A"], check_names=False + ) + assert_series_equal( + float_frame.loc[:, None], float_frame["A"], check_names=False + ) + assert_series_equal(float_frame[None], 
float_frame["A"], check_names=False) repr(float_frame) def test_setitem_empty(self): # GH 9596 - df = pd.DataFrame({'a': ['1', '2', '3'], - 'b': ['11', '22', '33'], - 'c': ['111', '222', '333']}) + df = pd.DataFrame( + {"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]} + ) result = df.copy() - result.loc[result.b.isna(), 'a'] = result.a + result.loc[result.b.isna(), "a"] = result.a assert_frame_equal(result, df) @pytest.mark.parametrize("dtype", ["float", "int64"]) - @pytest.mark.parametrize("kwargs", [ - dict(), - dict(index=[1]), - dict(columns=["A"]) - ]) + @pytest.mark.parametrize("kwargs", [dict(), dict(index=[1]), dict(columns=["A"])]) def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): # see gh-10126 kwargs["dtype"] = dtype @@ -785,8 +816,8 @@ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): def test_setitem_scalars_no_index(self): # GH16823 / 17894 df = DataFrame() - df['foo'] = 1 - expected = DataFrame(columns=['foo']).astype(np.int64) + df["foo"] = 1 + expected = DataFrame(columns=["foo"]).astype(np.int64) assert_frame_equal(df, expected) def test_getitem_empty_frame_with_boolean(self): @@ -798,11 +829,11 @@ def test_getitem_empty_frame_with_boolean(self): def test_delitem_corner(self, float_frame): f = float_frame.copy() - del f['D'] + del f["D"] assert len(f.columns) == 3 with pytest.raises(KeyError, match=r"^'D'$"): - del f['D'] - del f['B'] + del f["D"] + del f["B"] assert len(f.columns) == 2 def test_getitem_fancy_2d(self, float_frame): @@ -810,35 +841,35 @@ def test_getitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[:, ['B', 'A']], - f.reindex(columns=['B', 'A'])) + assert_frame_equal(f.ix[:, ["B", "A"]], f.reindex(columns=["B", "A"])) subidx = float_frame.index[[5, 4, 1]] with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[subidx, ['B', 'A']], - f.reindex(index=subidx, columns=['B', 'A'])) + assert_frame_equal( + f.ix[subidx, ["B", "A"]], f.reindex(index=subidx, columns=["B", "A"]) + ) # slicing rows, etc. with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_frame_equal(f.ix[5:10], f[5:10]) assert_frame_equal(f.ix[5:10, :], f[5:10]) - assert_frame_equal(f.ix[:5, ['A', 'B']], - f.reindex(index=f.index[:5], - columns=['A', 'B'])) + assert_frame_equal( + f.ix[:5, ["A", "B"]], f.reindex(index=f.index[:5], columns=["A", "B"]) + ) # slice rows with labels, inclusive! with catch_warnings(record=True): simplefilter("ignore", FutureWarning) expected = f.ix[5:11] - result = f.ix[f.index[5]:f.index[10]] + result = f.ix[f.index[5] : f.index[10]] assert_frame_equal(expected, result) # slice columns with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[:, :2], f.reindex(columns=['A', 'B'])) + assert_frame_equal(f.ix[:, :2], f.reindex(columns=["A", "B"])) # get view with catch_warnings(record=True): @@ -910,9 +941,9 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[:, ['B', 'A']] = 1 - expected['B'] = 1. - expected['A'] = 1. 
+ frame.ix[:, ["B", "A"]] = 1 + expected["B"] = 1.0 + expected["A"] = 1.0 assert_frame_equal(frame, expected) # case 2 @@ -926,11 +957,11 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[subidx, ['B', 'A']] = values - frame2.ix[[5, 4, 1], ['B', 'A']] = values + frame.ix[subidx, ["B", "A"]] = values + frame2.ix[[5, 4, 1], ["B", "A"]] = values - expected['B'].ix[subidx] = values[:, 0] - expected['A'].ix[subidx] = values[:, 1] + expected["B"].ix[subidx] = values[:, 0] + expected["A"].ix[subidx] = values[:, 1] assert_frame_equal(frame, expected) assert_frame_equal(frame2, expected) @@ -941,8 +972,8 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) expected1 = float_frame.copy() - frame.ix[5:10] = 1. - expected1.values[5:10] = 1. + frame.ix[5:10] = 1.0 + expected1.values[5:10] = 1.0 assert_frame_equal(frame, expected1) with catch_warnings(record=True): @@ -957,7 +988,7 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame = float_frame.copy() - frame.ix[5:10, :] = 1. + frame.ix[5:10, :] = 1.0 assert_frame_equal(frame, expected1) frame.ix[5:10, :] = arr assert_frame_equal(frame, expected2) @@ -971,9 +1002,9 @@ def test_setitem_fancy_2d(self, float_frame): expected = float_frame.copy() values = np.random.randn(5, 2) - frame.ix[:5, ['A', 'B']] = values - expected['A'][:5] = values[:, 0] - expected['B'][:5] = values[:, 1] + frame.ix[:5, ["A", "B"]] = values + expected["A"][:5] = values[:, 0] + expected["B"][:5] = values[:, 1] assert_frame_equal(frame, expected) with catch_warnings(record=True): @@ -987,7 +1018,7 @@ def test_setitem_fancy_2d(self, float_frame): frame = float_frame.copy() expected = float_frame.copy() - frame.ix[frame.index[5]:frame.index[10]] = 5. + frame.ix[frame.index[5] : frame.index[10]] = 5.0 expected.values[5:11] = 5 assert_frame_equal(frame, expected) @@ -999,36 +1030,34 @@ def test_setitem_fancy_2d(self, float_frame): expected = float_frame.copy() # slice indices - frame.ix[:, 1:3] = 4. - expected.values[:, 1:3] = 4. + frame.ix[:, 1:3] = 4.0 + expected.values[:, 1:3] = 4.0 assert_frame_equal(frame, expected) # slice with labels - frame.ix[:, 'B':'C'] = 4. + frame.ix[:, "B":"C"] = 4.0 assert_frame_equal(frame, expected) # new corner case of boolean slicing / setting - frame = DataFrame(zip([2, 3, 9, 6, 7], [np.nan] * 5), - columns=['a', 'b']) + frame = DataFrame(zip([2, 3, 9, 6, 7], [np.nan] * 5), columns=["a", "b"]) lst = [100] lst.extend([np.nan] * 4) - expected = DataFrame(zip([100, 3, 9, 6, 7], lst), - columns=['a', 'b']) - frame[frame['a'] == 2] = 100 + expected = DataFrame(zip([100, 3, 9, 6, 7], lst), columns=["a", "b"]) + frame[frame["a"] == 2] = 100 assert_frame_equal(frame, expected) def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): sliced = float_string_frame.iloc[:, -3:] - assert sliced['D'].dtype == np.float64 + assert sliced["D"].dtype == np.float64 # get view with single block # setting it triggers setting with copy sliced = float_frame.iloc[:, -3:] with pytest.raises(com.SettingWithCopyError): - sliced['C'] = 4. 
+ sliced["C"] = 4.0 - assert (float_frame['C'] == 4).all() + assert (float_frame["C"] == 4).all() def test_fancy_setitem_int_labels(self): # integer index defers to label-based indexing @@ -1100,14 +1129,18 @@ def test_fancy_index_int_labels_exceptions(self, float_frame): df.ix[[0, 1, 2], [2, 3, 4]] = 5 # try to set indices not contained in frame - msg = (r"None of \[Index\(\['foo', 'bar', 'baz'\]," - r" dtype='object'\)\] are in the \[index\]") + msg = ( + r"None of \[Index\(\['foo', 'bar', 'baz'\]," + r" dtype='object'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): - float_frame.ix[['foo', 'bar', 'baz']] = 1 - msg = (r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" - r" \[columns\]") + float_frame.ix[["foo", "bar", "baz"]] = 1 + msg = ( + r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" + r" \[columns\]" + ) with pytest.raises(KeyError, match=msg): - float_frame.ix[:, ['E']] = 1 + float_frame.ix[:, ["E"]] = 1 # partial setting now allows this GH2578 # pytest.raises(KeyError, float_frame.ix.__setitem__, @@ -1117,29 +1150,27 @@ def test_setitem_fancy_mixed_2d(self, float_string_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5 - result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] + self.mixed_frame.ix[:5, ["C", "B", "A"]] = 5 + result = self.mixed_frame.ix[:5, ["C", "B", "A"]] assert (result.values == 5).all() float_string_frame.ix[5] = np.nan assert isna(float_string_frame.ix[5]).all() float_string_frame.ix[5] = float_string_frame.ix[6] - assert_series_equal(float_string_frame.ix[5], - float_string_frame.ix[6], - check_names=False) + assert_series_equal( + float_string_frame.ix[5], float_string_frame.ix[6], check_names=False + ) # #1432 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - df = DataFrame({1: [1., 2., 3.], - 2: [3, 4, 5]}) + df = DataFrame({1: [1.0, 2.0, 3.0], 2: [3, 4, 5]}) assert df._is_mixed_type df.ix[1] = [5, 10] - expected = DataFrame({1: [1., 5., 3.], - 2: [3, 10, 5]}) + expected = DataFrame({1: [1.0, 5.0, 3.0], 2: [3, 10, 5]}) assert_frame_equal(df, expected) @@ -1249,40 +1280,42 @@ def test_ix_multi_take(self): """ def test_ix_multi_take_nonint_index(self): - df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], - columns=['a', 'b']) + df = DataFrame(np.random.randn(3, 2), index=["x", "y", "z"], columns=["a", "b"]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) rs = df.ix[[0], [0]] - xp = df.reindex(['x'], columns=['a']) + xp = df.reindex(["x"], columns=["a"]) assert_frame_equal(rs, xp) def test_ix_multi_take_multiindex(self): - df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], - columns=[['a', 'b'], ['1', '2']]) + df = DataFrame( + np.random.randn(3, 2), + index=["x", "y", "z"], + columns=[["a", "b"], ["1", "2"]], + ) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) rs = df.ix[[0], [0]] - xp = df.reindex(['x'], columns=[('a', '1')]) + xp = df.reindex(["x"], columns=[("a", "1")]) assert_frame_equal(rs, xp) def test_ix_dup(self): - idx = Index(['a', 'a', 'b', 'c', 'd', 'd']) + idx = Index(["a", "a", "b", "c", "d", "d"]) df = DataFrame(np.random.randn(len(idx), 3), idx) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix[:'d'] + sub = df.ix[:"d"] assert_frame_equal(sub, df) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix['a':'c'] + sub = df.ix["a":"c"] assert_frame_equal(sub, df.ix[0:4]) with 
catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix['b':'d'] + sub = df.ix["b":"d"] assert_frame_equal(sub, df.ix[2:]) def test_getitem_fancy_1d(self, float_frame, float_string_frame): @@ -1296,8 +1329,8 @@ def test_getitem_fancy_1d(self, float_frame, float_string_frame): # low dimensional slice with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - xs1 = f.ix[2, ['C', 'B', 'A']] - xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) + xs1 = f.ix[2, ["C", "B", "A"]] + xs2 = f.xs(f.index[2]).reindex(["C", "B", "A"]) tm.assert_series_equal(xs1, xs2) with catch_warnings(record=True): @@ -1322,7 +1355,7 @@ def test_getitem_fancy_1d(self, float_frame, float_string_frame): # single column with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_series_equal(f.ix[:, 'A'], f['A']) + assert_series_equal(f.ix[:, "A"], f["A"]) # return view with catch_warnings(record=True): @@ -1353,16 +1386,16 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] - expected['C'][2] = 1. - expected['B'][2] = 2. - expected['A'][2] = 3. + frame.ix[2, ["C", "B", "A"]] = [1.0, 2.0, 3.0] + expected["C"][2] = 1.0 + expected["B"][2] = 2.0 + expected["A"][2] = 3.0 assert_frame_equal(frame, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame2 = float_frame.copy() - frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] + frame2.ix[2, [3, 2, 1]] = [1.0, 2.0, 3.0] assert_frame_equal(frame, expected) # case 2, set a section of a column @@ -1379,7 +1412,7 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame2 = float_frame.copy() - frame2.ix[5:10, 'B'] = vals + frame2.ix[5:10, "B"] = vals assert_frame_equal(frame, expected) # case 3: full xs @@ -1388,14 +1421,14 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[4] = 5. - expected.values[4] = 5. + frame.ix[4] = 5.0 + expected.values[4] = 5.0 assert_frame_equal(frame, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[frame.index[4]] = 6. - expected.values[4] = 6. + frame.ix[frame.index[4]] = 6.0 + expected.values[4] = 6.0 assert_frame_equal(frame, expected) # single column @@ -1404,8 +1437,8 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[:, 'A'] = 7. - expected['A'] = 7. 
+ frame.ix[:, "A"] = 7.0 + expected["A"] = 7.0 assert_frame_equal(frame, expected) def test_getitem_fancy_scalar(self, float_frame): @@ -1438,11 +1471,11 @@ def test_getitem_fancy_boolean(self, float_frame): f = float_frame ix = f.loc - expected = f.reindex(columns=['B', 'D']) + expected = f.reindex(columns=["B", "D"]) result = ix[:, [False, True, False, True]] assert_frame_equal(result, expected) - expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) + expected = f.reindex(index=f.index[5:10], columns=["B", "D"]) result = ix[f.index[5:10], [False, True, False, True]] assert_frame_equal(result, expected) @@ -1454,8 +1487,7 @@ def test_getitem_fancy_boolean(self, float_frame): assert_frame_equal(result, expected) result = ix[boolvec, f.columns[2:]] - expected = f.reindex(index=f.index[boolvec], - columns=['C', 'D']) + expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) assert_frame_equal(result, expected) def test_setitem_fancy_boolean(self, float_frame): @@ -1463,15 +1495,15 @@ def test_setitem_fancy_boolean(self, float_frame): frame = float_frame.copy() expected = float_frame.copy() - mask = frame['A'] > 0 - frame.loc[mask] = 0. - expected.values[mask.values] = 0. + mask = frame["A"] > 0 + frame.loc[mask] = 0.0 + expected.values[mask.values] = 0.0 assert_frame_equal(frame, expected) frame = float_frame.copy() expected = float_frame.copy() - frame.loc[mask, ['A', 'B']] = 0. - expected.values[mask.values, :2] = 0. + frame.loc[mask, ["A", "B"]] = 0.0 + expected.values[mask.values, :2] = 0.0 assert_frame_equal(frame, expected) def test_getitem_fancy_ints(self, float_frame): @@ -1485,7 +1517,7 @@ def test_getitem_fancy_ints(self, float_frame): def test_getitem_setitem_fancy_exceptions(self, float_frame): ix = float_frame.iloc - with pytest.raises(IndexingError, match='Too many indexers'): + with pytest.raises(IndexingError, match="Too many indexers"): ix[:, :, :] with pytest.raises(IndexingError): @@ -1493,7 +1525,7 @@ def test_getitem_setitem_fancy_exceptions(self, float_frame): def test_getitem_setitem_boolean_misaligned(self, float_frame): # boolean index misaligned labels - mask = float_frame['A'][::-1] > 1 + mask = float_frame["A"][::-1] > 1 result = float_frame.loc[mask] expected = float_frame.loc[mask[::-1]] @@ -1516,8 +1548,7 @@ def test_getitem_setitem_boolean_multi(self): assert_frame_equal(result, expected) expected = df.copy() - df.loc[np.array([True, False, True]), - np.array([False, True])] = 5 + df.loc[np.array([True, False, True]), np.array([False, True])] = 5 expected.loc[[0, 2], [1]] = 5 assert_frame_equal(df, expected) @@ -1554,9 +1585,11 @@ def test_getitem_setitem_float_labels(self): df = DataFrame(np.random.randn(5, 5), index=index) # positional slicing only via iloc! 
- msg = ("cannot do slice indexing on" - r" with" - r" these indexers \[1.0\] of ") + msg = ( + "cannot do slice indexing on" + r" with" + r" these indexers \[1.0\] of " + ) with pytest.raises(TypeError, match=msg): df.iloc[1.0:5] @@ -1608,38 +1641,44 @@ def test_getitem_setitem_float_labels(self): assert (result == 0).values.all() def test_setitem_single_column_mixed(self): - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['foo', 'bar', 'baz']) - df['str'] = 'qux' - df.loc[df.index[::2], 'str'] = np.nan - expected = np.array([np.nan, 'qux', np.nan, 'qux', np.nan], - dtype=object) - assert_almost_equal(df['str'].values, expected) + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["foo", "bar", "baz"], + ) + df["str"] = "qux" + df.loc[df.index[::2], "str"] = np.nan + expected = np.array([np.nan, "qux", np.nan, "qux", np.nan], dtype=object) + assert_almost_equal(df["str"].values, expected) def test_setitem_single_column_mixed_datetime(self): - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['foo', 'bar', 'baz']) + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["foo", "bar", "baz"], + ) - df['timestamp'] = Timestamp('20010102') + df["timestamp"] = Timestamp("20010102") # check our dtypes result = df.dtypes - expected = Series([np.dtype('float64')] * 3 + - [np.dtype('datetime64[ns]')], - index=['foo', 'bar', 'baz', 'timestamp']) + expected = Series( + [np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")], + index=["foo", "bar", "baz", "timestamp"], + ) assert_series_equal(result, expected) # set an allowable datetime64 type - df.loc['b', 'timestamp'] = iNaT - assert isna(df.loc['b', 'timestamp']) + df.loc["b", "timestamp"] = iNaT + assert isna(df.loc["b", "timestamp"]) # allow this syntax - df.loc['c', 'timestamp'] = np.nan - assert isna(df.loc['c', 'timestamp']) + df.loc["c", "timestamp"] = np.nan + assert isna(df.loc["c", "timestamp"]) # allow this syntax - df.loc['d', :] = np.nan - assert not isna(df.loc['c', :]).all() + df.loc["d", :] = np.nan + assert not isna(df.loc["c", :]).all() # as of GH 3216 this will now work! 
# try to set with a list like item @@ -1648,27 +1687,37 @@ def test_setitem_single_column_mixed_datetime(self): def test_setitem_mixed_datetime(self): # GH 9336 - expected = DataFrame({'a': [0, 0, 0, 0, 13, 14], - 'b': [pd.datetime(2012, 1, 1), - 1, - 'x', - 'y', - pd.datetime(2013, 1, 1), - pd.datetime(2014, 1, 1)]}) - df = pd.DataFrame(0, columns=list('ab'), index=range(6)) - df['b'] = pd.NaT - df.loc[0, 'b'] = pd.datetime(2012, 1, 1) - df.loc[1, 'b'] = 1 - df.loc[[2, 3], 'b'] = 'x', 'y' - A = np.array([[13, np.datetime64('2013-01-01T00:00:00')], - [14, np.datetime64('2014-01-01T00:00:00')]]) - df.loc[[4, 5], ['a', 'b']] = A + expected = DataFrame( + { + "a": [0, 0, 0, 0, 13, 14], + "b": [ + pd.datetime(2012, 1, 1), + 1, + "x", + "y", + pd.datetime(2013, 1, 1), + pd.datetime(2014, 1, 1), + ], + } + ) + df = pd.DataFrame(0, columns=list("ab"), index=range(6)) + df["b"] = pd.NaT + df.loc[0, "b"] = pd.datetime(2012, 1, 1) + df.loc[1, "b"] = 1 + df.loc[[2, 3], "b"] = "x", "y" + A = np.array( + [ + [13, np.datetime64("2013-01-01T00:00:00")], + [14, np.datetime64("2014-01-01T00:00:00")], + ] + ) + df.loc[[4, 5], ["a", "b"]] = A assert_frame_equal(df, expected) def test_setitem_frame_float(self, float_frame): - piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] - float_frame.loc[float_frame.index[-2]:, ['A', 'B']] = piece.values - result = float_frame.loc[float_frame.index[-2:], ['A', 'B']].values + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] + float_frame.loc[float_frame.index[-2] :, ["A", "B"]] = piece.values + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values expected = piece.values assert_almost_equal(result, expected) @@ -1677,83 +1726,80 @@ def test_setitem_frame_mixed(self, float_string_frame): # already aligned f = float_string_frame.copy() - piece = DataFrame([[1., 2.], [3., 4.]], - index=f.index[0:2], columns=['A', 'B']) - key = (slice(None, 2), ['A', 'B']) + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"] + ) + key = (slice(None, 2), ["A", "B"]) f.loc[key] = piece - assert_almost_equal(f.loc[f.index[0:2], ['A', 'B']].values, - piece.values) + assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) # rows unaligned f = float_string_frame.copy() - piece = DataFrame([[1., 2.], [3., 4.], [5., 6.], [7., 8.]], - index=list(f.index[0:2]) + ['foo', 'bar'], - columns=['A', 'B']) - key = (slice(None, 2), ['A', 'B']) + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], + index=list(f.index[0:2]) + ["foo", "bar"], + columns=["A", "B"], + ) + key = (slice(None, 2), ["A", "B"]) f.loc[key] = piece - assert_almost_equal(f.loc[f.index[0:2:], ['A', 'B']].values, - piece.values[0:2]) + assert_almost_equal(f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2]) # key is unaligned with values f = float_string_frame.copy() - piece = f.loc[f.index[:2], ['A']] + piece = f.loc[f.index[:2], ["A"]] piece.index = f.index[-2:] - key = (slice(-2, None), ['A', 'B']) + key = (slice(-2, None), ["A", "B"]) f.loc[key] = piece - piece['B'] = np.nan - assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, - piece.values) + piece["B"] = np.nan + assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) # ndarray f = float_string_frame.copy() - piece = float_string_frame.loc[f.index[:2], ['A', 'B']] - key = (slice(-2, None), ['A', 'B']) + piece = float_string_frame.loc[f.index[:2], ["A", "B"]] + key = (slice(-2, None), ["A", "B"]) f.loc[key] = piece.values - 
assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, - piece.values) + assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) def test_setitem_frame_upcast(self): # needs upcasting - df = DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['A', 'B', 'C']) + df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ['A', 'B']] = df.loc[:, ['A', 'B']] + 0.5 - expected = df.reindex(columns=['A', 'B']) + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + expected = df.reindex(columns=["A", "B"]) expected += 0.5 - expected['C'] = df['C'] + expected["C"] = df["C"] assert_frame_equal(df2, expected) def test_setitem_frame_align(self, float_frame): - piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] piece.index = float_frame.index[-2:] - piece.columns = ['A', 'B'] - float_frame.loc[float_frame.index[-2:], ['A', 'B']] = piece - result = float_frame.loc[float_frame.index[-2:], ['A', 'B']].values + piece.columns = ["A", "B"] + float_frame.loc[float_frame.index[-2:], ["A", "B"]] = piece + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values expected = piece.values assert_almost_equal(result, expected) def test_getitem_setitem_ix_duplicates(self): # #1201 - df = DataFrame(np.random.randn(5, 3), - index=['foo', 'foo', 'bar', 'baz', 'bar']) + df = DataFrame(np.random.randn(5, 3), index=["foo", "foo", "bar", "baz", "bar"]) - result = df.loc['foo'] + result = df.loc["foo"] expected = df[:2] assert_frame_equal(result, expected) - result = df.loc['bar'] + result = df.loc["bar"] expected = df.iloc[[2, 4]] assert_frame_equal(result, expected) - result = df.loc['baz'] + result = df.loc["baz"] expected = df.iloc[3] assert_series_equal(result, expected) def test_getitem_ix_boolean_duplicates_multiple(self): # #1201 - df = DataFrame(np.random.randn(5, 3), - index=['foo', 'foo', 'bar', 'baz', 'bar']) + df = DataFrame(np.random.randn(5, 3), index=["foo", "foo", "bar", "baz", "bar"]) - result = df.loc[['bar']] + result = df.loc[["bar"]] exp = df.iloc[[2, 4]] assert_frame_equal(result, exp) @@ -1767,7 +1813,7 @@ def test_getitem_ix_boolean_duplicates_multiple(self): def test_getitem_setitem_ix_bool_keyerror(self): # #2199 - df = DataFrame({'a': [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) with pytest.raises(KeyError, match=r"^False$"): df.loc[False] @@ -1782,11 +1828,11 @@ def test_getitem_setitem_ix_bool_keyerror(self): def test_getitem_list_duplicates(self): # #1943 - df = DataFrame(np.random.randn(4, 4), columns=list('AABC')) - df.columns.name = 'foo' + df = DataFrame(np.random.randn(4, 4), columns=list("AABC")) + df.columns.name = "foo" - result = df[['B', 'C']] - assert result.columns.name == 'foo' + result = df[["B", "C"]] + assert result.columns.name == "foo" expected = df.iloc[:, 2:] assert_frame_equal(result, expected) @@ -1794,8 +1840,7 @@ def test_getitem_list_duplicates(self): def test_get_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = float_frame.get_value(idx, col) expected = float_frame[col][idx] assert result == expected @@ -1815,131 +1860,125 @@ def test_lookup_mixed(self, float_string_frame): cols = list(df.columns) * len(df.index) result = df.lookup(rows, cols) - expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)], - dtype=np.object_) + 
expected = np.array( + [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ + ) tm.assert_almost_equal(result, expected) def test_lookup_bool(self): - df = DataFrame({'label': ['a', 'b', 'a', 'c'], - 'mask_a': [True, True, False, True], - 'mask_b': [True, False, False, False], - 'mask_c': [False, True, False, True]}) - df['mask'] = df.lookup(df.index, 'mask_' + df['label']) - - exp_mask = np.array([ - df.loc[r, c] for r, c in zip(df.index, 'mask_' + df['label'])]) - - tm.assert_series_equal(df['mask'], pd.Series(exp_mask, name='mask')) - assert df['mask'].dtype == np.bool_ + df = DataFrame( + { + "label": ["a", "b", "a", "c"], + "mask_a": [True, True, False, True], + "mask_b": [True, False, False, False], + "mask_c": [False, True, False, True], + } + ) + df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + + exp_mask = np.array( + [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] + ) + + tm.assert_series_equal(df["mask"], pd.Series(exp_mask, name="mask")) + assert df["mask"].dtype == np.bool_ def test_lookup_raises(self, float_frame): with pytest.raises(KeyError): - float_frame.lookup(['xyz'], ['A']) + float_frame.lookup(["xyz"], ["A"]) with pytest.raises(KeyError): - float_frame.lookup([float_frame.index[0]], ['xyz']) + float_frame.lookup([float_frame.index[0]], ["xyz"]) - with pytest.raises(ValueError, match='same size'): - float_frame.lookup(['a', 'b', 'c'], ['a']) + with pytest.raises(ValueError, match="same size"): + float_frame.lookup(["a", "b", "c"], ["a"]) def test_set_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): float_frame.set_value(idx, col, 1) assert float_frame[col][idx] == 1 def test_set_value_resize(self, float_frame): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = float_frame.set_value('foobar', 'B', 0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = float_frame.set_value("foobar", "B", 0) assert res is float_frame - assert res.index[-1] == 'foobar' - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res.get_value('foobar', 'B') == 0 + assert res.index[-1] == "foobar" + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res.get_value("foobar", "B") == 0 - float_frame.loc['foobar', 'qux'] = 0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert float_frame.get_value('foobar', 'qux') == 0 + float_frame.loc["foobar", "qux"] = 0 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert float_frame.get_value("foobar", "qux") == 0 res = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', 'sam') - assert res3['baz'].dtype == np.object_ + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", "sam") + assert res3["baz"].dtype == np.object_ res = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', True) - assert res3['baz'].dtype == np.object_ + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", True) + assert res3["baz"].dtype == np.object_ res = float_frame.copy() - with 
tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', 5) - assert is_float_dtype(res3['baz']) - assert isna(res3['baz'].drop(['foobar'])).all() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", 5) + assert is_float_dtype(res3["baz"]) + assert isna(res3["baz"].drop(["foobar"])).all() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "could not convert string to float: 'sam'" with pytest.raises(ValueError, match=msg): - res3.set_value('foobar', 'baz', 'sam') + res3.set_value("foobar", "baz", "sam") def test_set_value_with_index_dtype_change(self): - df_orig = DataFrame(np.random.randn(3, 3), - index=range(3), columns=list('ABC')) + df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) # this is actually ambiguous as the 2 is interpreted as a positional # so column is not created df = df_orig.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.set_value('C', 2, 1.0) - assert list(df.index) == list(df_orig.index) + ['C'] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.set_value("C", 2, 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] # assert list(df.columns) == list(df_orig.columns) + [2] df = df_orig.copy() - df.loc['C', 2] = 1.0 - assert list(df.index) == list(df_orig.index) + ['C'] + df.loc["C", 2] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] # assert list(df.columns) == list(df_orig.columns) + [2] # create both new df = df_orig.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.set_value('C', 'D', 1.0) - assert list(df.index) == list(df_orig.index) + ['C'] - assert list(df.columns) == list(df_orig.columns) + ['D'] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.set_value("C", "D", 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] df = df_orig.copy() - df.loc['C', 'D'] = 1.0 - assert list(df.index) == list(df_orig.index) + ['C'] - assert list(df.columns) == list(df_orig.columns) + ['D'] + df.loc["C", "D"] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] def test_get_set_value_no_partial_indexing(self): # partial w/ MultiIndex raise exception index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) df = DataFrame(index=index, columns=range(4)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(KeyError, match=r"^0$"): df.get_value(0, 1) def test_single_element_ix_dont_upcast(self, float_frame): - float_frame['E'] = 1 - assert issubclass(float_frame['E'].dtype.type, (int, np.integer)) + float_frame["E"] = 1 + assert issubclass(float_frame["E"].dtype.type, (int, np.integer)) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = float_frame.ix[float_frame.index[5], 'E'] + result = float_frame.ix[float_frame.index[5], "E"] assert is_integer(result) - result = float_frame.loc[float_frame.index[5], 'E'] + result = float_frame.loc[float_frame.index[5], "E"] assert is_integer(result) # GH 11617 @@ -1953,7 +1992,7 @@ def test_single_element_ix_dont_upcast(self, float_frame): result = df.loc[0, "b"] assert 
is_integer(result) - expected = Series([666], [0], name='b') + expected = Series([666], [0], name="b") with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = df.ix[[0], "b"] @@ -1980,10 +2019,10 @@ def test_iloc_row(self): # verify slice is view # setting it makes it raise/warn with pytest.raises(com.SettingWithCopyError): - result[2] = 0. + result[2] = 0.0 exp_col = df[2].copy() - exp_col[4:8] = 0. + exp_col[4:8] = 0.0 assert_series_equal(df[2], exp_col) # list of integers @@ -2011,7 +2050,7 @@ def test_iloc_col(self): # verify slice is view # and that we are setting a copy with pytest.raises(com.SettingWithCopyError): - result[8] = 0. + result[8] = 0.0 assert (df[8] == 0).all() @@ -2022,8 +2061,7 @@ def test_iloc_col(self): def test_iloc_duplicates(self): - df = DataFrame(np.random.rand(3, 3), columns=list('ABC'), - index=list('aab')) + df = DataFrame(np.random.rand(3, 3), columns=list("ABC"), index=list("aab")) result = df.iloc[0] with catch_warnings(record=True): @@ -2042,9 +2080,11 @@ def test_iloc_duplicates(self): assert_series_equal(result, result2) # multiindex - df = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + df = DataFrame( + np.random.randn(3, 3), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -2074,11 +2114,12 @@ def test_loc_duplicates(self): # gh-17105 # insert a duplicate element to the index - trange = pd.date_range(start=pd.Timestamp(year=2017, month=1, day=1), - end=pd.Timestamp(year=2017, month=1, day=5)) + trange = pd.date_range( + start=pd.Timestamp(year=2017, month=1, day=1), + end=pd.Timestamp(year=2017, month=1, day=5), + ) - trange = trange.insert(loc=5, - item=pd.Timestamp(year=2017, month=1, day=5)) + trange = trange.insert(loc=5, item=pd.Timestamp(year=2017, month=1, day=5)) df = pd.DataFrame(0, index=trange, columns=["A", "B"]) bool_idx = np.array([False, False, False, False, False, True]) @@ -2086,9 +2127,9 @@ def test_loc_duplicates(self): # assignment df.loc[trange[bool_idx], "A"] = 6 - expected = pd.DataFrame({'A': [0, 0, 0, 0, 6, 6], - 'B': [0, 0, 0, 0, 0, 0]}, - index=trange) + expected = pd.DataFrame( + {"A": [0, 0, 0, 0, 6, 6], "B": [0, 0, 0, 0, 0, 0]}, index=trange + ) tm.assert_frame_equal(df, expected) # in-place @@ -2099,8 +2140,9 @@ def test_loc_duplicates(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_iloc_sparse_propegate_fill_value(self): from pandas.core.sparse.api import SparseDataFrame - df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) - assert len(df['A'].sp_values) == len(df.iloc[:, 0].sp_values) + + df = SparseDataFrame({"A": [999, 1]}, default_fill_value=999) + assert len(df["A"].sp_values) == len(df.iloc[:, 0].sp_values) def test_iat(self, float_frame): @@ -2115,8 +2157,9 @@ def test_nested_exception(self): # (which may get fixed), it's just a way to trigger # the issue or reraising an outer exception without # a named argument - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], - "c": [7, 8, 9]}).set_index(["a", "b"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) index = list(df.index) index[0] = ["a", "b"] df.index = index @@ -2126,16 +2169,19 @@ def test_nested_exception(self): except Exception as e: assert type(e) != UnboundLocalError - @pytest.mark.parametrize("method,expected_values", [ - ("nearest", [0, 1, 1, 2]), - 
("pad", [np.nan, 0, 1, 1]), - ("backfill", [0, 1, 2, 2]) - ]) + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) def test_reindex_methods(self, method, expected_values): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) - expected = pd.DataFrame({'x': expected_values}, index=target) + expected = pd.DataFrame({"x": expected_values}, index=target) actual = df.reindex(target, method=method) assert_frame_equal(expected, actual) @@ -2158,9 +2204,9 @@ def test_reindex_methods(self, method, expected_values): actual = df.reindex(target[new_order], method=method) assert_frame_equal(e2, actual) - switched_method = ('pad' if method == 'backfill' - else 'backfill' if method == 'pad' - else method) + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) actual = df[::-1].reindex(target, method=switched_method) assert_frame_equal(expected, actual) @@ -2173,49 +2219,49 @@ def test_reindex_methods_nearest_special(self): assert_frame_equal(expected, actual) expected = pd.DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", - tolerance=[0.5, 0.01, 0.4, 0.1]) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) assert_frame_equal(expected, actual) def test_reindex_frame_add_nat(self): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') - df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) result = df.reindex(range(15)) - assert np.issubdtype(result['B'].dtype, np.dtype('M8[ns]')) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) - mask = com.isna(result)['B'] + mask = com.isna(result)["B"] assert mask[-5:].all() assert not mask[:-5].any() def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) - assert x[0].dtype == np.dtype('M8[ns]') + assert x[0].dtype == np.dtype("M8[ns]") def test_non_monotonic_reindex_methods(self): - dr = pd.date_range('2013-08-01', periods=6, freq='B') + dr = pd.date_range("2013-08-01", periods=6, freq="B") data = np.random.randn(6, 1) - df = pd.DataFrame(data, index=dr, columns=list('A')) - df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], - columns=list('A')) + df = pd.DataFrame(data, index=dr, columns=list("A")) + df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) # index is not monotonic increasing or decreasing msg = "index must be monotonic increasing or decreasing" with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='pad') + df_rev.reindex(df.index, method="pad") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='ffill') + df_rev.reindex(df.index, method="ffill") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='bfill') + df_rev.reindex(df.index, method="bfill") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='nearest') + df_rev.reindex(df.index, method="nearest") def test_reindex_level(self): from itertools import permutations - icol = ['jim', 'joe', 'jolie'] + + icol = ["jim", "joe", "jolie"] def verify_first_level(df, level, idx, check_index_type=True): def f(val): return np.nonzero((df[level] == val).to_numpy())[0] + i = np.concatenate(list(map(f, idx))) left = 
df.set_index(icol).reindex(idx, level=level) right = df.iloc[i].set_index(icol) @@ -2226,64 +2272,87 @@ def verify(df, level, idx, indexer, check_index_type=True): right = df.iloc[indexer].set_index(icol) assert_frame_equal(left, right, check_index_type=check_index_type) - df = pd.DataFrame({'jim': list('B' * 4 + 'A' * 2 + 'C' * 3), - 'joe': list('abcdeabcd')[::-1], - 'jolie': [10, 20, 30] * 3, - 'joline': np.random.randint(0, 1000, 9)}) - - target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'], - ['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'], - ['A', 'B'], ['B', 'A', 'C']] + df = pd.DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + + target = [ + ["C", "B", "A"], + ["F", "C", "A", "D"], + ["A"], + ["A", "B", "C"], + ["C", "A", "B"], + ["C", "B"], + ["C", "A"], + ["A", "B"], + ["B", "A", "C"], + ] for idx in target: - verify_first_level(df, 'jim', idx) + verify_first_level(df, "jim", idx) # reindex by these causes different MultiIndex levels - for idx in [['D', 'F'], ['A', 'C', 'B']]: - verify_first_level(df, 'jim', idx, check_index_type=False) - - verify(df, 'joe', list('abcde'), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, 'joe', list('abcd'), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, 'joe', list('abc'), [3, 2, 1, 8, 7, 6]) - verify(df, 'joe', list('eca'), [1, 3, 4, 6, 8]) - verify(df, 'joe', list('edc'), [0, 1, 4, 5, 6]) - verify(df, 'joe', list('eadbc'), [3, 0, 2, 1, 4, 5, 8, 7, 6]) - verify(df, 'joe', list('edwq'), [0, 4, 5]) - verify(df, 'joe', list('wq'), [], check_index_type=False) - - df = DataFrame({'jim': ['mid'] * 5 + ['btm'] * 8 + ['top'] * 7, - 'joe': ['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 + - ['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 + - ['3rd'] * 3 + ['2nd'] * 2, - # this needs to be jointly unique with jim and joe or - # reindexing will fail ~1.5% of the time, this works - # out to needing unique groups of same size as joe - 'jolie': np.concatenate([ - np.random.choice(1000, x, replace=False) - for x in [2, 3, 3, 2, 3, 2, 3, 2]]), - 'joline': np.random.randn(20).round(3) * 10}) - - for idx in permutations(df['jim'].unique()): + for idx in [["D", "F"], ["A", "C", "B"]]: + verify_first_level(df, "jim", idx, check_index_type=False) + + verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) + verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) + verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) + verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, "joe", list("edwq"), [0, 4, 5]) + verify(df, "joe", list("wq"), [], check_index_type=False) + + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + + for idx in permutations(df["jim"].unique()): for i in range(3): - verify_first_level(df, 'jim', idx[:i + 1]) + verify_first_level(df, "jim", idx[: i + 1]) - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, - 11, 12, 13, 14, 18, 19, 15, 16, 17] - 
verify(df, 'joe', ['1st', '2nd', '3rd'], i) + i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] + verify(df, "joe", ["1st", "2nd", "3rd"], i) - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, - 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, 'joe', ['3rd', '2nd', '1st'], i) + i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] + verify(df, "joe", ["3rd", "2nd", "1st"], i) i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, 'joe', ['2nd', '3rd'], i) + verify(df, "joe", ["2nd", "3rd"], i) i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, 'joe', ['3rd', '1st'], i) + verify(df, "joe", ["3rd", "1st"], i) def test_getitem_ix_float_duplicates(self): - df = pd.DataFrame(np.random.randn(3, 3), - index=[0.1, 0.2, 0.2], columns=list('abc')) + df = pd.DataFrame( + np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") + ) expect = df.iloc[1:] assert_frame_equal(df.loc[0.2], expect) with catch_warnings(record=True): @@ -2291,7 +2360,7 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) df.index = [1, 0.2, 0.2] expect = df.iloc[1:] @@ -2301,10 +2370,11 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) - df = pd.DataFrame(np.random.randn(4, 3), - index=[1, 0.2, 0.2, 1], columns=list('abc')) + df = pd.DataFrame( + np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list("abc") + ) expect = df.iloc[1:-1] assert_frame_equal(df.loc[0.2], expect) with catch_warnings(record=True): @@ -2312,7 +2382,7 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:-1, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) df.index = [0.1, 0.2, 2, 0.2] expect = df.iloc[[1, -1]] @@ -2322,96 +2392,96 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[[1, -1], 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) def test_getitem_sparse_column(self): # https://github.com/pandas-dev/pandas/issues/23559 data = pd.SparseArray([0, 1]) df = pd.DataFrame({"A": data}) expected = pd.Series(data, name="A") - result = df['A'] + result = df["A"] tm.assert_series_equal(result, expected) result = df.iloc[:, 0] tm.assert_series_equal(result, expected) - result = df.loc[:, 'A'] + result = df.loc[:, "A"] tm.assert_series_equal(result, expected) def test_setitem_with_sparse_value(self): # GH8131 - df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_array = pd.SparseArray([0, 0, 1]) - df['new_column'] = sp_array - assert_series_equal(df['new_column'], - pd.Series(sp_array, name='new_column'), - check_names=False) + df["new_column"] = sp_array + assert_series_equal( + df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False + ) def test_setitem_with_unaligned_sparse_value(self): - df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) - df['new_column'] = sp_series - exp = pd.Series(pd.SparseArray([1, 0, 0]), 
name='new_column') - assert_series_equal(df['new_column'], exp) + df["new_column"] = sp_series + exp = pd.Series(pd.SparseArray([1, 0, 0]), name="new_column") + assert_series_equal(df["new_column"], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. # Make sure timezone isn't lost - column = pd.Series(pd.date_range('2015-01-01', periods=3, tz='utc'), - name='dates') - df = pd.DataFrame({'dates': column}) - df['dates'] = column[[1, 0, 2]] - assert_series_equal(df['dates'], column) + column = pd.Series( + pd.date_range("2015-01-01", periods=3, tz="utc"), name="dates" + ) + df = pd.DataFrame({"dates": column}) + df["dates"] = column[[1, 0, 2]] + assert_series_equal(df["dates"], column) - df = pd.DataFrame({'dates': column}) - df.loc[[0, 1, 2], 'dates'] = column[[1, 0, 2]] - assert_series_equal(df['dates'], column) + df = pd.DataFrame({"dates": column}) + df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] + assert_series_equal(df["dates"], column) def test_setitem_datetime_coercion(self): # gh-1048 - df = pd.DataFrame({'c': [pd.Timestamp('2010-10-01')] * 3}) - df.loc[0:1, 'c'] = np.datetime64('2008-08-08') - assert pd.Timestamp('2008-08-08') == df.loc[0, 'c'] - assert pd.Timestamp('2008-08-08') == df.loc[1, 'c'] - df.loc[2, 'c'] = date(2005, 5, 5) - assert pd.Timestamp('2005-05-05') == df.loc[2, 'c'] + df = pd.DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) + df.loc[0:1, "c"] = np.datetime64("2008-08-08") + assert pd.Timestamp("2008-08-08") == df.loc[0, "c"] + assert pd.Timestamp("2008-08-08") == df.loc[1, "c"] + df.loc[2, "c"] = date(2005, 5, 5) + assert pd.Timestamp("2005-05-05") == df.loc[2, "c"] def test_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT one_hour = timedelta(hours=1) - df = DataFrame(index=date_range('20130101', periods=4)) - df['A'] = np.array([1 * one_hour] * 4, dtype='m8[ns]') - df.loc[:, 'B'] = np.array([2 * one_hour] * 4, dtype='m8[ns]') - df.loc[:3, 'C'] = np.array([3 * one_hour] * 3, dtype='m8[ns]') - df.loc[:, 'D'] = np.array([4 * one_hour] * 4, dtype='m8[ns]') - df.loc[df.index[:3], 'E'] = np.array([5 * one_hour] * 3, - dtype='m8[ns]') - df['F'] = np.timedelta64('NaT') - df.loc[df.index[:-1], 'F'] = np.array([6 * one_hour] * 3, - dtype='m8[ns]') - df.loc[df.index[-3]:, 'G'] = date_range('20130101', periods=3) - df['H'] = np.datetime64('NaT') + df = DataFrame(index=date_range("20130101", periods=4)) + df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]") + df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]") + df.loc[:3, "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") + df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]") + df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]") + df["F"] = np.timedelta64("NaT") + df.loc[df.index[:-1], "F"] = np.array([6 * one_hour] * 3, dtype="m8[ns]") + df.loc[df.index[-3] :, "G"] = date_range("20130101", periods=3) + df["H"] = np.datetime64("NaT") result = df.dtypes - expected = Series([np.dtype('timedelta64[ns]')] * 6 + - [np.dtype('datetime64[ns]')] * 2, - index=list('ABCDEFGH')) + expected = Series( + [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, + index=list("ABCDEFGH"), + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('idxer', ['var', ['var']]) + @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH 11365 tz = tz_naive_fixture - idx = 
date_range(start='2015-07-12', periods=3, freq='H', tz=tz) - expected = DataFrame(1.2, index=idx, columns=['var']) - result = DataFrame(index=idx, columns=['var']) + idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + expected = DataFrame(1.2, index=idx, columns=["var"]) + result = DataFrame(index=idx, columns=["var"]) result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_at_time_between_time_datetimeindex(self): - index = date_range("2012-01-01", "2012-01-05", freq='30min') + index = date_range("2012-01-01", "2012-01-05", freq="30min") df = DataFrame(np.random.randn(len(index), 5), index=index) akey = time(12, 0, 0) bkey = slice(time(13, 0, 0), time(14, 0, 0)) @@ -2466,86 +2536,84 @@ def test_xs(self, float_frame, datetime_frame): assert value == float_frame[item][idx] # mixed-type xs - test_data = { - 'A': {'1': 1, '2': 2}, - 'B': {'1': '1', '2': '2', '3': '3'}, - } + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data) - xs = frame.xs('1') + xs = frame.xs("1") assert xs.dtype == np.object_ - assert xs['A'] == 1 - assert xs['B'] == '1' + assert xs["A"] == 1 + assert xs["B"] == "1" with pytest.raises(KeyError): datetime_frame.xs(datetime_frame.index[0] - BDay()) # xs get column - series = float_frame.xs('A', axis=1) - expected = float_frame['A'] + series = float_frame.xs("A", axis=1) + expected = float_frame["A"] assert_series_equal(series, expected) # view is returned if possible - series = float_frame.xs('A', axis=1) + series = float_frame.xs("A", axis=1) series[:] = 5 assert (expected == 5).all() def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) - df['A'] = 1. - df['B'] = 'foo' - df['C'] = 2. - df['D'] = 'bar' - df['E'] = 3. 
+ df["A"] = 1.0 + df["B"] = "foo" + df["C"] = 2.0 + df["D"] = "bar" + df["E"] = 3.0 xs = df.xs(0) - exp = pd.Series([1., 'foo', 2., 'bar', 3.], - index=list('ABCDE'), name=0) + exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) tm.assert_series_equal(xs, exp) # no columns but Index(dtype=object) - df = DataFrame(index=['a', 'b', 'c']) - result = df.xs('a') - expected = Series([], name='a', index=pd.Index([], dtype=object)) + df = DataFrame(index=["a", "b", "c"]) + result = df.xs("a") + expected = Series([], name="a", index=pd.Index([], dtype=object)) assert_series_equal(result, expected) def test_xs_duplicates(self): - df = DataFrame(np.random.randn(5, 2), index=['b', 'b', 'c', 'b', 'a']) + df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"]) - cross = df.xs('c') + cross = df.xs("c") exp = df.iloc[2] assert_series_equal(cross, exp) def test_xs_keep_level(self): - df = (DataFrame({'day': {0: 'sat', 1: 'sun'}, - 'flavour': {0: 'strawberry', 1: 'strawberry'}, - 'sales': {0: 10, 1: 12}, - 'year': {0: 2008, 1: 2008}}) - .set_index(['year', 'flavour', 'day'])) - result = df.xs('sat', level='day', drop_level=False) + df = DataFrame( + { + "day": {0: "sat", 1: "sun"}, + "flavour": {0: "strawberry", 1: "strawberry"}, + "sales": {0: 10, 1: 12}, + "year": {0: 2008, 1: 2008}, + } + ).set_index(["year", "flavour", "day"]) + result = df.xs("sat", level="day", drop_level=False) expected = df[:1] assert_frame_equal(result, expected) - result = df.xs([2008, 'sat'], level=['year', 'day'], drop_level=False) + result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) assert_frame_equal(result, expected) def test_xs_view(self): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent - dm = DataFrame(np.arange(20.).reshape(4, 5), - index=range(4), columns=range(5)) + dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) dm.xs(2)[:] = 10 assert (dm.xs(2) == 10).all() def test_index_namedtuple(self): from collections import namedtuple + IndexType = namedtuple("IndexType", ["a", "b"]) idx1 = IndexType("foo", "bar") idx2 = IndexType("baz", "bof") - index = Index([idx1, idx2], - name="composite_index", tupleize_cols=False) + index = Index([idx1, idx2], name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) with catch_warnings(record=True): @@ -2558,36 +2626,56 @@ def test_index_namedtuple(self): def test_boolean_indexing(self): idx = list(range(3)) - cols = ['A', 'B', 'C'] - df1 = DataFrame(index=idx, columns=cols, - data=np.array([[0.0, 0.5, 1.0], - [1.5, 2.0, 2.5], - [3.0, 3.5, 4.0]], - dtype=float)) - df2 = DataFrame(index=idx, columns=cols, - data=np.ones((len(idx), len(cols)))) - - expected = DataFrame(index=idx, columns=cols, - data=np.array([[0.0, 0.5, 1.0], - [1.5, 2.0, -1], - [-1, -1, -1]], dtype=float)) + cols = ["A", "B", "C"] + df1 = DataFrame( + index=idx, + columns=cols, + data=np.array( + [[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float + ), + ) + df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) + + expected = DataFrame( + index=idx, + columns=cols, + data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float), + ) df1[df1 > 2.0 * df2] = -1 assert_frame_equal(df1, expected) - with pytest.raises(ValueError, match='Item wrong length'): + with pytest.raises(ValueError, match="Item wrong length"): df1[df1.index[:-1] > 2] = -1 def test_boolean_indexing_mixed(self): - df = 
DataFrame({ - 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, - 1: {35: np.nan, 40: 0.32632316859446198, 43: np.nan, - 49: 0.32632316859446198, 50: 0.39114724480578139}, - 2: {35: np.nan, 40: np.nan, 43: 0.29012581014105987, 49: np.nan, - 50: np.nan}, - 3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, - 4: {35: 0.34215328467153283, 40: np.nan, 43: np.nan, 49: np.nan, - 50: np.nan}, - 'y': {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}}) + df = DataFrame( + { + 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 1: { + 35: np.nan, + 40: 0.32632316859446198, + 43: np.nan, + 49: 0.32632316859446198, + 50: 0.39114724480578139, + }, + 2: { + 35: np.nan, + 40: np.nan, + 43: 0.29012581014105987, + 49: np.nan, + 50: np.nan, + }, + 3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 4: { + 35: 0.34215328467153283, + 40: np.nan, + 43: np.nan, + 49: np.nan, + 50: np.nan, + }, + "y": {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}, + } + ) # mixed int/float ok df2 = df.copy() @@ -2599,33 +2687,33 @@ def test_boolean_indexing_mixed(self): expected.loc[35, 4] = 1 assert_frame_equal(df2, expected) - df['foo'] = 'test' + df["foo"] = "test" msg = "not supported between instances|unorderable types" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self, float_string_frame, mixed_float_frame, - mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), - columns=['A', 'B', 'C']) + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) def _safe_add(df): # only add to the numeric items def is_ok(s): - return (issubclass(s.dtype.type, (np.integer, np.floating)) and - s.dtype != 'uint8') + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) - return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) - for c, s in df.items())) + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) rs = df.where(cond, other1) rs2 = df.where(cond.values, other1) for k, v in rs.iteritems(): - exp = Series( - np.where(cond[k], df[k], other1[k]), index=v.index) + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) assert_series_equal(v, exp, check_names=False) assert_frame_equal(rs, rs2) @@ -2634,8 +2722,12 @@ def _check_get(df, cond, check_dtypes=True): assert (rs.dtypes == df.dtypes).all() # check getting - for df in [default_frame, float_string_frame, - mixed_float_frame, mixed_int_frame]: + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: with pytest.raises(TypeError): df > 0 @@ -2644,16 +2736,23 @@ def _check_get(df, cond, check_dtypes=True): _check_get(df, cond) # upcasting case (GH # 2794) - df = DataFrame({c: Series([1] * 3, dtype=c) - for c in ['float32', 'float64', - 'int32', 'int64']}) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) df.iloc[1, :] = 0 result = df.dtypes - expected = Series([np.dtype('float32'), - np.dtype('float64'), - np.dtype('int32'), - np.dtype('int64')], - index=['float32', 'float64', 'int32', 'int64']) + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) # when we don't preserve boolean casts # @@ -2706,8 +2805,7 @@ def 
_check_align(df, cond, other, check_dtypes=True): # integers are upcast, so don't check the dtypes cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) - for s in df.dtypes) + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) _check_align(df, cond, np.nan, check_dtypes=check_dtypes) # invalid conditions @@ -2741,11 +2839,15 @@ def _check_set(df, cond, check_dtypes=True): if check_dtypes: for k, v in df.dtypes.items(): if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype('float64') + v = np.dtype("float64") assert dfi[k].dtype == v - for df in [default_frame, float_string_frame, mixed_float_frame, - mixed_int_frame]: + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: with pytest.raises(TypeError): df > 0 @@ -2763,9 +2865,9 @@ def _check_set(df, cond, check_dtypes=True): # GH 10218 # test DataFrame.where with Series slicing - df = DataFrame({'a': range(3), 'b': range(4, 7)}) - result = df.where(df['a'] == 1) - expected = df[df['a'] == 1].reindex(df.index) + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) assert_frame_equal(result, expected) @pytest.mark.parametrize("klass", [list, tuple, np.array]) @@ -2785,14 +2887,16 @@ def test_where_array_like(self, klass): result = df.where(klass(cond)) assert_frame_equal(result, expected) - @pytest.mark.parametrize("cond", [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], - [pd.NaT], [Timestamp("2017-01-02")]] - ]) + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) def test_where_invalid_input_single(self, cond): # see gh-15414: only boolean arrays accepted df = DataFrame({"a": [1, 2, 3]}) @@ -2801,16 +2905,20 @@ def test_where_invalid_input_single(self, cond): with pytest.raises(ValueError, match=msg): df.where(cond) - @pytest.mark.parametrize("cond", [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], - ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [[pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")]] - ]) + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) def test_where_invalid_input_multiple(self, cond): # see gh-15414: only boolean arrays accepted df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) @@ -2857,10 +2965,13 @@ def test_where_ndframe_align(self): def test_where_bug(self): # see gh-2793 - df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [ - 4.0, 3.0, 2.0, 1.0]}, dtype='float64') - expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [ - 4.0, 3.0, np.nan, np.nan]}, dtype='float64') + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) result = 
df.where(df > 2, np.nan) assert_frame_equal(result, expected) @@ -2870,13 +2981,17 @@ def test_where_bug(self): def test_where_bug_mixed(self, sint_dtype): # see gh-2793 - df = DataFrame({"a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], - dtype="float64")}) + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) - expected = DataFrame({"a": [np.nan, np.nan, 3.0, 4.0], - "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64") + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) result = df.where(df > 2, np.nan) assert_frame_equal(result, expected) @@ -2910,9 +3025,13 @@ def test_where_bug_transposition(self): def test_where_datetime(self): # GH 3311 - df = DataFrame(dict(A=date_range('20130102', periods=5), - B=date_range('20130104', periods=5), - C=np.random.randn(5))) + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) stamp = datetime(2013, 1, 3) with pytest.raises(TypeError): @@ -2921,23 +3040,28 @@ def test_where_datetime(self): result = df[df.iloc[:, :-1] > stamp] expected = df.copy() - expected.loc[[0, 1], 'A'] = np.nan - expected.loc[:, 'C'] = np.nan + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan assert_frame_equal(result, expected) def test_where_none(self): # GH 4667 # setting with None changes dtype - df = DataFrame({'series': Series(range(10))}).astype(float) + df = DataFrame({"series": Series(range(10))}).astype(float) df[df > 7] = None expected = DataFrame( - {'series': Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])}) + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) assert_frame_equal(df, expected) # GH 7656 - df = DataFrame([{'A': 1, 'B': np.nan, 'C': 'Test'}, { - 'A': np.nan, 'B': 'Test', 'C': np.nan}]) - msg = 'boolean setting on mixed-type' + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" with pytest.raises(TypeError, match=msg): df.where(~isna(df), None, inplace=True) @@ -2951,7 +3075,6 @@ def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): tm.assert_frame_equal(result, df) def test_where_align(self): - def create(): df = DataFrame(np.random.randn(10, 3)) df.iloc[3:5, 0] = np.nan @@ -2962,31 +3085,31 @@ def create(): # series df = create() expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis='columns') + result = df.where(pd.notna(df), df.mean(), axis="columns") assert_frame_equal(result, expected) - df.where(pd.notna(df), df.mean(), inplace=True, axis='columns') + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") assert_frame_equal(df, expected) df = create().fillna(0) expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis='index') + result = df.where(df > 0, df[0], axis="index") assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis='rows') + result = df.where(df > 0, df[0], axis="rows") assert_frame_equal(result, expected) # frame df = create() expected = df.fillna(1) - result = df.where(pd.notna(df), DataFrame( - 1, index=df.index, columns=df.columns)) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) assert_frame_equal(result, expected) def test_where_complex(self): # GH 6345 - expected = DataFrame( - [[1 
+ 1j, 2], [np.nan, 4 + 1j]], columns=['a', 'b']) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=['a', 'b']) + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) df[df.abs() >= 5] = np.nan assert_frame_equal(df, expected) @@ -2996,72 +3119,80 @@ def test_where_axis(self): mask = DataFrame([[False, False], [False, False]]) s = Series([0, 1]) - expected = DataFrame([[0, 0], [1, 1]], dtype='float64') - result = df.where(mask, s, axis='index') + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='index', inplace=True) + result.where(mask, s, axis="index", inplace=True) assert_frame_equal(result, expected) - expected = DataFrame([[0, 1], [0, 1]], dtype='float64') - result = df.where(mask, s, axis='columns') + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='columns', inplace=True) + result.where(mask, s, axis="columns", inplace=True) assert_frame_equal(result, expected) # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype='int64') + df = DataFrame([[1, 2], [3, 4]], dtype="int64") mask = DataFrame([[False, False], [False, False]]) s = Series([0, np.nan]) - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64') - result = df.where(mask, s, axis='index') + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='index', inplace=True) + result.where(mask, s, axis="index", inplace=True) assert_frame_equal(result, expected) expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis='columns') + result = df.where(mask, s, axis="columns") assert_frame_equal(result, expected) - expected = DataFrame({0: np.array([0, 0], dtype='int64'), - 1: np.array([np.nan, np.nan], dtype='float64')}) + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) result = df.copy() - result.where(mask, s, axis='columns', inplace=True) + result.where(mask, s, axis="columns", inplace=True) assert_frame_equal(result, expected) # Multiple dtypes (=> multiple Blocks) - df = pd.concat([ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype='int64')], - ignore_index=True, axis=1) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) mask = DataFrame(False, columns=df.columns, index=df.index) s1 = Series(1, index=df.columns) s2 = Series(2, index=df.index) - result = df.where(mask, s1, axis='columns') + result = df.where(mask, s1, axis="columns") expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype('int64') - expected[3] = expected[3].astype('int64') + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s1, axis='columns', inplace=True) + result.where(mask, s1, axis="columns", inplace=True) assert_frame_equal(result, expected) - result = df.where(mask, s2, axis='index') + result = df.where(mask, s2, axis="index") expected = 
DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype('int64') - expected[3] = expected[3].astype('int64') + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s2, axis='index', inplace=True) + result.where(mask, s2, axis="index", inplace=True) assert_frame_equal(result, expected) # DataFrame vs DataFrame @@ -3071,13 +3202,13 @@ def test_where_axis(self): result = df.where(mask, d1) assert_frame_equal(result, expected) - result = df.where(mask, d1, axis='index') + result = df.where(mask, d1, axis="index") assert_frame_equal(result, expected) result = df.copy() result.where(mask, d1, inplace=True) assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d1, inplace=True, axis='index') + result.where(mask, d1, inplace=True, axis="index") assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) @@ -3086,13 +3217,13 @@ def test_where_axis(self): result = df.where(mask, d2) assert_frame_equal(result, expected) - result = df.where(mask, d2, axis='columns') + result = df.where(mask, d2, axis="columns") assert_frame_equal(result, expected) result = df.copy() result.where(mask, d2, inplace=True) assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d2, inplace=True, axis='columns') + result.where(mask, d2, inplace=True, axis="columns") assert_frame_equal(result, expected) def test_where_callable(self): @@ -3113,20 +3244,22 @@ def test_where_callable(self): result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, - (df + 2).where((df + 2) > 8, (df + 2) + 10)) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame(DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture), - columns=['date']) - df2 = DataFrame(DatetimeIndex(['20150103', '20150104', '20150105'], - tz=tz_naive_fixture), - columns=['date']) - mask = DataFrame([True, True, False], columns=['date']) - exp = DataFrame(DatetimeIndex(['20150101', '20150102', '20150105'], - tz=tz_naive_fixture), - columns=['date']) + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) result = df1.where(mask, df2) assert_frame_equal(exp, result) @@ -3191,8 +3324,7 @@ def test_mask_callable(self): result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10) exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]]) tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, - (df + 2).mask((df + 2) > 8, (df + 2) + 10)) + tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) def test_head_tail(self, float_frame): assert_frame_equal(float_frame.head(), float_frame[:5]) @@ -3221,69 +3353,67 @@ def test_head_tail(self, float_frame): def test_type_error_multiindex(self): # See gh-12218 - df = DataFrame(columns=['i', 'c', 'x', 'y'], - data=[[0, 0, 1, 2], [1, 0, 3, 4], - [0, 1, 1, 2], [1, 1, 3, 4]]) - dg = df.pivot_table(index='i', columns='c', - values=['x', 'y']) + df 
= DataFrame( + columns=["i", "c", "x", "y"], + data=[[0, 0, 1, 2], [1, 0, 3, 4], [0, 1, 1, 2], [1, 1, 3, 4]], + ) + dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) with pytest.raises(TypeError, match="is an invalid key"): str(dg[:, 0]) - index = Index(range(2), name='i') - columns = MultiIndex(levels=[['x', 'y'], [0, 1]], - codes=[[0, 1], [0, 0]], - names=[None, 'c']) + index = Index(range(2), name="i") + columns = MultiIndex( + levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"] + ) expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index) result = dg.loc[:, (slice(None), 0)] assert_frame_equal(result, expected) - name = ('x', 0) - index = Index(range(2), name='i') + name = ("x", 0) + index = Index(range(2), name="i") expected = Series([1, 3], index=index, name=name) - result = dg['x', 0] + result = dg["x", 0] assert_series_equal(result, expected) def test_interval_index(self): # GH 19977 index = pd.interval_range(start=0, periods=3) - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=index, - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=index, columns=["A", "B", "C"] + ) expected = 1 - result = df.loc[0.5, 'A'] + result = df.loc[0.5, "A"] assert_almost_equal(result, expected) - index = pd.interval_range(start=0, periods=3, closed='both') - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=index, - columns=['A', 'B', 'C']) + index = pd.interval_range(start=0, periods=3, closed="both") + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=index, columns=["A", "B", "C"] + ) - index_exp = pd.interval_range(start=0, periods=2, - freq=1, closed='both') - expected = pd.Series([1, 4], index=index_exp, name='A') - result = df.loc[1, 'A'] + index_exp = pd.interval_range(start=0, periods=2, freq=1, closed="both") + expected = pd.Series([1, 4], index=index_exp, name="A") + result = df.loc[1, "A"] assert_series_equal(result, expected) class TestDataFrameIndexingDatetimeWithTZ(TestData): - def test_setitem(self, timezone_frame): df = timezone_frame - idx = df['B'].rename('foo') + idx = df["B"].rename("foo") # setitem - df['C'] = idx - assert_series_equal(df['C'], Series(idx, name='C')) + df["C"] = idx + assert_series_equal(df["C"], Series(idx, name="C")) - df['D'] = 'foo' - df['D'] = idx - assert_series_equal(df['D'], Series(idx, name='D')) - del df['D'] + df["D"] = "foo" + df["D"] = idx + assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] # assert that A & C are not sharing the same base (e.g. 
they # are copies) @@ -3296,126 +3426,132 @@ def test_setitem(self, timezone_frame): df2 = df.copy() df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2['B'] - assert_series_equal(notna(result), Series( - [True, False, True], name='B')) + result = df2["B"] + assert_series_equal(notna(result), Series([True, False, True], name="B")) assert_series_equal(df2.dtypes, df.dtypes) def test_set_reset(self): - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") # set/reset - df = DataFrame({'A': [0, 1, 2]}, index=idx) + df = DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result['foo'].dtype, 'M8[ns, US/Eastern' + assert result["foo"].dtype, "M8[ns, US/Eastern" - df = result.set_index('foo') + df = result.set_index("foo") tm.assert_index_equal(df.index, idx) def test_transpose(self, timezone_frame): result = timezone_frame.T expected = DataFrame(timezone_frame.values.T) - expected.index = ['A', 'B', 'C'] + expected.index = ["A", "B", "C"] assert_frame_equal(result, expected) def test_scalar_assignment(self): # issue #19843 df = pd.DataFrame(index=(0, 1, 2)) - df['now'] = pd.Timestamp('20130101', tz='UTC') + df["now"] = pd.Timestamp("20130101", tz="UTC") expected = pd.DataFrame( - {'now': pd.Timestamp('20130101', tz='UTC')}, index=[0, 1, 2]) + {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] + ) tm.assert_frame_equal(df, expected) class TestDataFrameIndexingUInt64(TestData): - def test_setitem(self, uint64_frame): df = uint64_frame - idx = df['A'].rename('foo') + idx = df["A"].rename("foo") # setitem - df['C'] = idx - assert_series_equal(df['C'], Series(idx, name='C')) + df["C"] = idx + assert_series_equal(df["C"], Series(idx, name="C")) - df['D'] = 'foo' - df['D'] = idx - assert_series_equal(df['D'], Series(idx, name='D')) - del df['D'] + df["D"] = "foo" + df["D"] = idx + assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] # With NaN: because uint64 has no NaN element, # the column should be cast to object. 
df2 = df.copy() df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2['B'] - assert_series_equal(notna(result), Series( - [True, False, True], name='B')) - assert_series_equal(df2.dtypes, Series([np.dtype('uint64'), - np.dtype('O'), np.dtype('O')], - index=['A', 'B', 'C'])) + result = df2["B"] + assert_series_equal(notna(result), Series([True, False, True], name="B")) + assert_series_equal( + df2.dtypes, + Series( + [np.dtype("uint64"), np.dtype("O"), np.dtype("O")], + index=["A", "B", "C"], + ), + ) def test_set_reset(self): - idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') + idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") # set/reset - df = DataFrame({'A': [0, 1, 2]}, index=idx) + df = DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result['foo'].dtype == np.dtype('uint64') + assert result["foo"].dtype == np.dtype("uint64") - df = result.set_index('foo') + df = result.set_index("foo") tm.assert_index_equal(df.index, idx) def test_transpose(self, uint64_frame): result = uint64_frame.T expected = DataFrame(uint64_frame.values.T) - expected.index = ['A', 'B'] + expected.index = ["A", "B"] assert_frame_equal(result, expected) class TestDataFrameIndexingCategorical: - def test_assignment(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) - labels = Categorical(["{0} - {1}".format(i, i + 499) - for i in range(0, 10000, 500)]) - - df = df.sort_values(by=['value'], ascending=True) + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical( + ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + ) + + df = df.sort_values(by=["value"], ascending=True) s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) d = s.values - df['D'] = d + df["D"] = d str(df) result = df.dtypes expected = Series( - [np.dtype('int32'), CategoricalDtype(categories=labels, - ordered=False)], - index=['value', 'D']) + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) tm.assert_series_equal(result, expected) - df['E'] = s + df["E"] = s str(df) result = df.dtypes - expected = Series([np.dtype('int32'), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False)], - index=['value', 'D', 'E']) + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) tm.assert_series_equal(result, expected) - result1 = df['D'] - result2 = df['E'] + result1 = df["D"] + result2 = df["E"] tm.assert_categorical_equal(result1._data._block.values, d) # sorting - s.name = 'E' + s.name = "E" tm.assert_series_equal(result2.sort_index(), s.sort_index()) cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) @@ -3438,44 +3574,37 @@ def test_assigning_ops(self): # assign a part of a column with dtype != categorical -> # exp_parts_cats_col - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], - categories=["a", "b"]) + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], - categories=["a", "b"]) + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], 
categories=["a", "b"]) idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, - "values": values1}, index=idx1) + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], - categories=["a", "b"]) + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, - "values": values2}, index=idx2) + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) # changed part of the cats column - cats3 = Categorical( - ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, - "values": values3}, index=idx3) + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) # changed single value in cats col - cats4 = Categorical( - ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame({"cats": cats4, - "values": values4}, index=idx4) + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) # iloc # ############### @@ -3521,12 +3650,12 @@ def test_assigning_ops(self): with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() - df.iloc[2:4, 0] = Categorical(list('bb'), categories=list('abc')) + df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) with pytest.raises(ValueError): # different values df = orig.copy() - df.iloc[2:4, 0] = Categorical(list('cc'), categories=list('abc')) + df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3575,21 +3704,22 @@ def test_assigning_ops(self): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b"]) + df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"]) + ["b", "b"], categories=["a", "b", "c"] + ) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"]) + ["c", "c"], categories=["a", "b", "c"] + ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3638,21 +3768,22 @@ def test_assigning_ops(self): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b"]) + df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", 
df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"]) + ["b", "b"], categories=["a", "b", "c"] + ) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"]) + ["c", "c"], categories=["a", "b", "c"] + ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3685,8 +3816,9 @@ def test_assigning_ops(self): df.at["j", "cats"] = "c" # fancy indexing - catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"], - categories=["a", "b", "c"]) + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3716,8 +3848,9 @@ def test_assigning_ops(self): tm.assert_frame_equal(df, exp) def test_functions_no_warnings(self): - df = DataFrame({'value': np.random.randint(0, 100, 20)}) + df = DataFrame({"value": np.random.randint(0, 100, 20)}) labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] with tm.assert_produces_warning(False): - df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, - labels=labels) + df["group"] = pd.cut( + df.value, range(0, 105, 10), right=False, labels=labels + ) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 3adc62609cc6a7..adace5e4784aee 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -9,46 +9,64 @@ def frame_with_period_index(): return DataFrame( data=np.arange(20).reshape(4, 5), - columns=list('abcde'), - index=period_range(start='2000', freq='A', periods=4)) + columns=list("abcde"), + index=period_range(start="2000", freq="A", periods=4), + ) @pytest.fixture def left(): - return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture def right(): - return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) @pytest.mark.parametrize( "how, sort, expected", - [('inner', False, DataFrame({'a': [20, 10], - 'b': [200, 100]}, - index=[2, 1])), - ('inner', True, DataFrame({'a': [10, 20], - 'b': [100, 200]}, - index=[1, 2])), - ('left', False, DataFrame({'a': [20, 10, 0], - 'b': [200, 100, np.nan]}, - index=[2, 1, 0])), - ('left', True, DataFrame({'a': [0, 10, 20], - 'b': [np.nan, 100, 200]}, - index=[0, 1, 2])), - ('right', False, DataFrame({'a': [np.nan, 10, 20], - 'b': [300, 100, 200]}, - index=[3, 1, 2])), - ('right', True, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, - index=[1, 2, 3])), - ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3])), - ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]))]) + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]), + ), + ( + "right", + True, + DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, 
np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], +) def test_join(left, right, how, sort, expected): result = left.join(right, how=how, sort=sort) @@ -58,59 +76,59 @@ def test_join(left, right, how, sort, expected): def test_join_index(float_frame): # left / right - f = float_frame.loc[float_frame.index[:10], ['A', 'B']] - f2 = float_frame.loc[float_frame.index[5:], ['C', 'D']].iloc[::-1] + f = float_frame.loc[float_frame.index[:10], ["A", "B"]] + f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1] joined = f.join(f2) tm.assert_index_equal(f.index, joined.index) - expected_columns = Index(['A', 'B', 'C', 'D']) + expected_columns = Index(["A", "B", "C", "D"]) tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='left') + joined = f.join(f2, how="left") tm.assert_index_equal(joined.index, f.index) tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='right') + joined = f.join(f2, how="right") tm.assert_index_equal(joined.index, f2.index) tm.assert_index_equal(joined.columns, expected_columns) # inner - joined = f.join(f2, how='inner') + joined = f.join(f2, how="inner") tm.assert_index_equal(joined.index, f.index[5:10]) tm.assert_index_equal(joined.columns, expected_columns) # outer - joined = f.join(f2, how='outer') + joined = f.join(f2, how="outer") tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - with pytest.raises(ValueError, match='join method'): - f.join(f2, how='foo') + with pytest.raises(ValueError, match="join method"): + f.join(f2, how="foo") # corner case - overlapping columns - msg = 'columns overlap but no suffix' - for how in ('outer', 'left', 'inner'): + msg = "columns overlap but no suffix" + for how in ("outer", "left", "inner"): with pytest.raises(ValueError, match=msg): float_frame.join(float_frame, how=how) def test_join_index_more(float_frame): - af = float_frame.loc[:, ['A', 'B']] - bf = float_frame.loc[::2, ['C', 'D']] + af = float_frame.loc[:, ["A", "B"]] + bf = float_frame.loc[::2, ["C", "D"]] expected = af.copy() - expected['C'] = float_frame['C'][::2] - expected['D'] = float_frame['D'][::2] + expected["C"] = float_frame["C"][::2] + expected["D"] = float_frame["D"][::2] result = af.join(bf) tm.assert_frame_equal(result, expected) - result = af.join(bf, how='right') + result = af.join(bf, how="right") tm.assert_frame_equal(result, expected[::2]) - result = bf.join(af, how='right') + result = bf.join(af, how="right") tm.assert_frame_equal(result, expected.loc[:, result.columns]) @@ -123,19 +141,19 @@ def test_join_index_series(float_frame): tm.assert_frame_equal(joined, float_frame, check_names=False) s.name = None - with pytest.raises(ValueError, match='must have a name'): + with pytest.raises(ValueError, match="must have a name"): df.join(s) def test_join_overlap(float_frame): - df1 = float_frame.loc[:, ['A', 'B', 'C']] - df2 = float_frame.loc[:, ['B', 'C', 'D']] + df1 = float_frame.loc[:, ["A", "B", "C"]] + df2 = float_frame.loc[:, ["B", "C", "D"]] - joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') + joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2") + df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1") + df2_suf = df2.loc[:, ["B", 
"C"]].add_suffix("_df2") - no_overlap = float_frame.loc[:, ['A', 'D']] + no_overlap = float_frame.loc[:, ["A", "D"]] expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted @@ -143,35 +161,35 @@ def test_join_overlap(float_frame): def test_join_period_index(frame_with_period_index): - other = frame_with_period_index.rename( - columns=lambda x: '{key}{key}'.format(key=x)) + other = frame_with_period_index.rename(columns=lambda x: "{key}{key}".format(key=x)) - joined_values = np.concatenate( - [frame_with_period_index.values] * 2, axis=1) + joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1) joined_cols = frame_with_period_index.columns.append(other.columns) joined = frame_with_period_index.join(other) expected = DataFrame( - data=joined_values, - columns=joined_cols, - index=frame_with_period_index.index) + data=joined_values, columns=joined_cols, index=frame_with_period_index.index + ) tm.assert_frame_equal(joined, expected) def test_join_left_sequence_non_unique_index(): # https://github.com/pandas-dev/pandas/issues/19607 - df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3]) - df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2]) - df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) + df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4]) - joined = df1.join([df2, df3], how='left') + joined = df1.join([df2, df3], how="left") - expected = DataFrame({ - 'a': [0, 10, 10, 20], - 'b': [np.nan, 300, 300, 200], - 'c': [np.nan, 400, 500, np.nan] - }, index=[1, 2, 2, 3]) + expected = DataFrame( + { + "a": [0, 10, 10, 20], + "b": [np.nan, 300, 300, 200], + "c": [np.nan, 400, 500, np.nan], + }, + index=[1, 2, 2, 3], + ) tm.assert_frame_equal(joined, expected) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index e40ae6dd5494dd..c63a5ba64495f5 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -18,55 +18,55 @@ def _skip_if_no_pchip(): from scipy.interpolate import pchip_interpolate # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.pchip missing') + pytest.skip("scipy.interpolate.pchip missing") -class TestDataFrameMissingData: +class TestDataFrameMissingData: def test_dropEmptyRows(self, float_frame): N = len(float_frame.index) mat = np.random.randn(N) mat[:5] = np.nan - frame = DataFrame({'foo': mat}, index=float_frame.index) - original = Series(mat, index=float_frame.index, name='foo') + frame = DataFrame({"foo": mat}, index=float_frame.index) + original = Series(mat, index=float_frame.index, name="foo") expected = original.dropna() inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() - smaller_frame = frame.dropna(how='all') + smaller_frame = frame.dropna(how="all") # check that original was preserved - assert_series_equal(frame['foo'], original) - inplace_frame1.dropna(how='all', inplace=True) - assert_series_equal(smaller_frame['foo'], expected) - assert_series_equal(inplace_frame1['foo'], expected) + assert_series_equal(frame["foo"], original) + inplace_frame1.dropna(how="all", inplace=True) + assert_series_equal(smaller_frame["foo"], expected) + assert_series_equal(inplace_frame1["foo"], expected) - smaller_frame = frame.dropna(how='all', subset=['foo']) - inplace_frame2.dropna(how='all', subset=['foo'], inplace=True) - assert_series_equal(smaller_frame['foo'], expected) - 
assert_series_equal(inplace_frame2['foo'], expected) + smaller_frame = frame.dropna(how="all", subset=["foo"]) + inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + assert_series_equal(smaller_frame["foo"], expected) + assert_series_equal(inplace_frame2["foo"], expected) def test_dropIncompleteRows(self, float_frame): N = len(float_frame.index) mat = np.random.randn(N) mat[:5] = np.nan - frame = DataFrame({'foo': mat}, index=float_frame.index) - frame['bar'] = 5 - original = Series(mat, index=float_frame.index, name='foo') + frame = DataFrame({"foo": mat}, index=float_frame.index) + frame["bar"] = 5 + original = Series(mat, index=float_frame.index, name="foo") inp_frame1, inp_frame2 = frame.copy(), frame.copy() smaller_frame = frame.dropna() - assert_series_equal(frame['foo'], original) + assert_series_equal(frame["foo"], original) inp_frame1.dropna(inplace=True) - exp = Series(mat[5:], index=float_frame.index[5:], name='foo') - tm.assert_series_equal(smaller_frame['foo'], exp) - tm.assert_series_equal(inp_frame1['foo'], exp) + exp = Series(mat[5:], index=float_frame.index[5:], name="foo") + tm.assert_series_equal(smaller_frame["foo"], exp) + tm.assert_series_equal(inp_frame1["foo"], exp) - samesize_frame = frame.dropna(subset=['bar']) - assert_series_equal(frame['foo'], original) - assert (frame['bar'] == 5).all() - inp_frame2.dropna(subset=['bar'], inplace=True) + samesize_frame = frame.dropna(subset=["bar"]) + assert_series_equal(frame["foo"], original) + assert (frame["bar"] == 5).all() + inp_frame2.dropna(subset=["bar"], inplace=True) tm.assert_index_equal(samesize_frame.index, float_frame.index) tm.assert_index_equal(inp_frame2.index, float_frame.index) @@ -117,60 +117,63 @@ def test_dropna(self): assert_frame_equal(inp, df) # all - dropped = df.dropna(axis=1, how='all') + dropped = df.dropna(axis=1, how="all") assert_frame_equal(dropped, df) df[2] = np.nan - dropped = df.dropna(axis=1, how='all') + dropped = df.dropna(axis=1, how="all") expected = df.loc[:, [0, 1, 3]] assert_frame_equal(dropped, expected) # bad input - msg = ("No axis named 3 for object type" - " ") + msg = "No axis named 3 for object type" " " with pytest.raises(ValueError, match=msg): df.dropna(axis=3) def test_drop_and_dropna_caching(self): # tst that cacher updates - original = Series([1, 2, np.nan], name='A') - expected = Series([1, 2], dtype=original.dtype, name='A') - df = pd.DataFrame({'A': original.values.copy()}) + original = Series([1, 2, np.nan], name="A") + expected = Series([1, 2], dtype=original.dtype, name="A") + df = pd.DataFrame({"A": original.values.copy()}) df2 = df.copy() - df['A'].dropna() - assert_series_equal(df['A'], original) - df['A'].dropna(inplace=True) - assert_series_equal(df['A'], expected) - df2['A'].drop([1]) - assert_series_equal(df2['A'], original) - df2['A'].drop([1], inplace=True) - assert_series_equal(df2['A'], original.drop([1])) + df["A"].dropna() + assert_series_equal(df["A"], original) + df["A"].dropna(inplace=True) + assert_series_equal(df["A"], expected) + df2["A"].drop([1]) + assert_series_equal(df2["A"], original) + df2["A"].drop([1], inplace=True) + assert_series_equal(df2["A"], original.drop([1])) def test_dropna_corner(self, float_frame): # bad input msg = "invalid how option: foo" with pytest.raises(ValueError, match=msg): - float_frame.dropna(how='foo') + float_frame.dropna(how="foo") msg = "must specify how or thresh" with pytest.raises(TypeError, match=msg): float_frame.dropna(how=None) # non-existent column - 8303 with pytest.raises(KeyError, 
match=r"^\['X'\]$"): - float_frame.dropna(subset=['A', 'X']) + float_frame.dropna(subset=["A", "X"]) def test_dropna_multiple_axes(self): - df = DataFrame([[1, np.nan, 2, 3], - [4, np.nan, 5, 6], - [np.nan, np.nan, np.nan, np.nan], - [7, np.nan, 8, 9]]) + df = DataFrame( + [ + [1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9], + ] + ) cp = df.copy() # GH20987 with tm.assert_produces_warning(FutureWarning): - result = df.dropna(how='all', axis=[0, 1]) + result = df.dropna(how="all", axis=[0, 1]) with tm.assert_produces_warning(FutureWarning): - result2 = df.dropna(how='all', axis=(0, 1)) - expected = df.dropna(how='all').dropna(how='all', axis=1) + result2 = df.dropna(how="all", axis=(0, 1)) + expected = df.dropna(how="all").dropna(how="all", axis=1) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -178,34 +181,30 @@ def test_dropna_multiple_axes(self): inp = df.copy() with tm.assert_produces_warning(FutureWarning): - inp.dropna(how='all', axis=(0, 1), inplace=True) + inp.dropna(how="all", axis=(0, 1), inplace=True) assert_frame_equal(inp, expected) def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() - dt1 = datetime.datetime(2015, 1, 1, - tzinfo=dateutil.tz.tzutc()) - dt2 = datetime.datetime(2015, 2, 2, - tzinfo=dateutil.tz.tzutc()) - df['Time'] = [dt1] + dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) + dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) + df["Time"] = [dt1] result = df.dropna(axis=0) - expected = DataFrame({'Time': [dt1]}) + expected = DataFrame({"Time": [dt1]}) assert_frame_equal(result, expected) # Ex2 - df = DataFrame({'Time': [dt1, None, np.nan, dt2]}) + df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) - expected = DataFrame([dt1, dt2], - columns=['Time'], - index=[0, 3]) + expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) assert_frame_equal(result, expected) def test_dropna_categorical_interval_index(self): # GH 25087 ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) ci = pd.CategoricalIndex(ii) - df = pd.DataFrame({'A': list('abc')}, index=ci) + df = pd.DataFrame({"A": list("abc")}, index=ci) expected = df result = df.dropna() @@ -213,63 +212,65 @@ def test_dropna_categorical_interval_index(self): def test_fillna_datetime(self, datetime_frame): tf = datetime_frame - tf.loc[tf.index[:5], 'A'] = np.nan - tf.loc[tf.index[-5:], 'A'] = np.nan + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan zero_filled = datetime_frame.fillna(0) - assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all() + assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() - padded = datetime_frame.fillna(method='pad') - assert np.isnan(padded.loc[padded.index[:5], 'A']).all() - assert (padded.loc[padded.index[-5:], 'A'] == - padded.loc[padded.index[-5], 'A']).all() + padded = datetime_frame.fillna(method="pad") + assert np.isnan(padded.loc[padded.index[:5], "A"]).all() + assert ( + padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] + ).all() msg = "Must specify a fill 'value' or 'method'" with pytest.raises(ValueError, match=msg): datetime_frame.fillna() msg = "Cannot specify both 'value' and 'method'" with pytest.raises(ValueError, match=msg): - datetime_frame.fillna(5, method='ffill') + datetime_frame.fillna(5, method="ffill") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 
'A'] = np.nan + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan # TODO: make stronger assertion here, GH 25640 mf.fillna(value=0) - mf.fillna(method='pad') + mf.fillna(method="pad") def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) - mf = mixed_float_frame.reindex(columns=['A', 'B', 'D']) - mf.loc[mf.index[-10:], 'A'] = np.nan + mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan result = mf.fillna(value=0) _check_mixed_float(result, dtype=dict(C=None)) - result = mf.fillna(method='pad') + result = mf.fillna(method="pad") _check_mixed_float(result, dtype=dict(C=None)) def test_fillna_empty(self): # empty frame (GH #2778) - df = DataFrame(columns=['x']) - for m in ['pad', 'backfill']: + df = DataFrame(columns=["x"]) + for m in ["pad", "backfill"]: df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) def test_fillna_different_dtype(self): # with different dtype (GH#3386) - df = DataFrame([['a', 'a', np.nan, 'a'], [ - 'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']]) - - result = df.fillna({2: 'foo'}) - expected = DataFrame([['a', 'a', 'foo', 'a'], - ['b', 'b', 'foo', 'b'], - ['c', 'c', 'foo', 'c']]) + df = DataFrame( + [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] + ) + + result = df.fillna({2: "foo"}) + expected = DataFrame( + [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] + ) assert_frame_equal(result, expected) - df.fillna({2: 'foo'}, inplace=True) + df.fillna({2: "foo"}, inplace=True) assert_frame_equal(df, expected) def test_fillna_limit_and_value(self): @@ -287,42 +288,59 @@ def test_fillna_limit_and_value(self): def test_fillna_datelike(self): # with datelike # GH#6344 - df = DataFrame({ - 'Date': [pd.NaT, Timestamp("2014-1-1")], - 'Date2': [Timestamp("2013-1-1"), pd.NaT] - }) + df = DataFrame( + { + "Date": [pd.NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), pd.NaT], + } + ) expected = df.copy() - expected['Date'] = expected['Date'].fillna( - df.loc[df.index[0], 'Date2']) - result = df.fillna(value={'Date': df['Date2']}) + expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) + result = df.fillna(value={"Date": df["Date2"]}) assert_frame_equal(result, expected) def test_fillna_tzaware(self): # with timezone # GH#15855 - df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.NaT]}) - exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - assert_frame_equal(df.fillna(method='pad'), exp) - - df = pd.DataFrame({'A': [pd.NaT, - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - assert_frame_equal(df.fillna(method='bfill'), exp) + df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + assert_frame_equal(df.fillna(method="pad"), exp) + + df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + assert_frame_equal(df.fillna(method="bfill"), exp) def test_fillna_tzaware_different_column(self): # with timezone in another column # GH#15522 - df = pd.DataFrame({'A': 
pd.date_range('20130101', periods=4, - tz='US/Eastern'), - 'B': [1, 2, np.nan, np.nan]}) - result = df.fillna(method='pad') - expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4, - tz='US/Eastern'), - 'B': [1., 2., 2., 2.]}) + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1, 2, np.nan, np.nan], + } + ) + result = df.fillna(method="pad") + expected = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1.0, 2.0, 2.0, 2.0], + } + ) assert_frame_equal(result, expected) def test_na_actions_categorical(self): @@ -344,11 +362,10 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - with pytest.raises(ValueError, match=("fill value must " - "be in categories")): + with pytest.raises(ValueError, match=("fill value must " "be in categories")): df.fillna(value={"cats": 4, "vals": "c"}) - res = df.fillna(method='pad') + res = df.fillna(method="pad") tm.assert_frame_equal(res, df_exp_fill) # dropna @@ -376,8 +393,7 @@ def test_fillna_categorical_nan(self): df = DataFrame({"cats": cat, "vals": val}) res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] - df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, - dtype='category') + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") tm.assert_frame_equal(res, df_exp) result = df.cats.fillna(np.nan) @@ -385,40 +401,41 @@ def test_fillna_categorical_nan(self): result = df.vals.fillna(np.nan) tm.assert_series_equal(result, df.vals) - idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', - '2011-01-01 09:00', pd.NaT, pd.NaT]) - df = DataFrame({'a': Categorical(idx)}) + idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT] + ) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', - pd.NaT, pd.NaT], freq='M') - df = DataFrame({'a': Categorical(idx)}) + idx = pd.PeriodIndex( + ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M" + ) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - idx = pd.TimedeltaIndex(['1 days', '2 days', - '1 days', pd.NaT, pd.NaT]) - df = DataFrame({'a': Categorical(idx)}) + idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT]) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 - df = pd.DataFrame({'a': [1., np.nan]}) - result = df.fillna(0, downcast='infer') - expected = pd.DataFrame({'a': [1, 0]}) + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna(0, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) assert_frame_equal(result, expected) # infer int64 from float64 when fillna value is a dict - df = pd.DataFrame({'a': [1., np.nan]}) - result = df.fillna({'a': 0}, downcast='infer') - expected = pd.DataFrame({'a': [1, 0]}) + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna({"a": 0}, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) assert_frame_equal(result, expected) def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes - expected = Series([np.dtype('object')] * 5, index=[1, 2, 3, 4, 5]) + expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) 
assert_series_equal(result, expected) result = df.fillna(1) @@ -426,75 +443,91 @@ def test_fillna_dtype_conversion(self): assert_frame_equal(result, expected) # empty block - df = DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - result = df.fillna('nan') - expected = DataFrame('nan', index=range(3), columns=['A', 'B']) + df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"]) assert_frame_equal(result, expected) # equiv of replace - df = DataFrame(dict(A=[1, np.nan], B=[1., 2.])) - for v in ['', 1, np.nan, 1.0]: + df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) + for v in ["", 1, np.nan, 1.0]: expected = df.replace(np.nan, v) result = df.fillna(v) assert_frame_equal(result, expected) def test_fillna_datetime_columns(self): # GH 7095 - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - result = df.fillna('?') - expected = pd.DataFrame({'A': [-1, -2, '?'], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', '?'], - 'D': ['foo2', 'bar2', '?']}, - index=date_range('20130110', periods=3)) + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': [pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-02'), pd.NaT], - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - result = df.fillna('?') - expected = pd.DataFrame({'A': [-1, -2, '?'], - 'B': [pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-02'), '?'], - 'C': ['foo', 'bar', '?'], - 'D': ['foo2', 'bar2', '?']}, - index=pd.date_range('20130110', periods=3)) + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"], + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=pd.date_range("20130110", periods=3), + ) tm.assert_frame_equal(result, expected) def test_ffill(self, datetime_frame): - datetime_frame['A'][:5] = np.nan - datetime_frame['A'][-5:] = np.nan + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan - assert_frame_equal(datetime_frame.ffill(), - datetime_frame.fillna(method='ffill')) + assert_frame_equal( + datetime_frame.ffill(), datetime_frame.fillna(method="ffill") + ) def test_bfill(self, datetime_frame): - datetime_frame['A'][:5] = np.nan - datetime_frame['A'][-5:] = np.nan + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan - assert_frame_equal(datetime_frame.bfill(), - datetime_frame.fillna(method='bfill')) + assert_frame_equal( + datetime_frame.bfill(), datetime_frame.fillna(method="bfill") + ) def test_frame_pad_backfill_limit(self): index = np.arange(10) df = 
DataFrame(np.random.randn(10, 4), index=index) - result = df[:2].reindex(index, method='pad', limit=5) + result = df[:2].reindex(index, method="pad", limit=5) - expected = df[:2].reindex(index).fillna(method='pad') + expected = df[:2].reindex(index).fillna(method="pad") expected.values[-3:] = np.nan tm.assert_frame_equal(result, expected) - result = df[-2:].reindex(index, method='backfill', limit=5) + result = df[-2:].reindex(index, method="backfill", limit=5) - expected = df[-2:].reindex(index).fillna(method='backfill') + expected = df[-2:].reindex(index).fillna(method="backfill") expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -503,16 +536,16 @@ def test_frame_fillna_limit(self): df = DataFrame(np.random.randn(10, 4), index=index) result = df[:2].reindex(index) - result = result.fillna(method='pad', limit=5) + result = result.fillna(method="pad", limit=5) - expected = df[:2].reindex(index).fillna(method='pad') + expected = df[:2].reindex(index).fillna(method="pad") expected.values[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index) - result = result.fillna(method='backfill', limit=5) + result = result.fillna(method="backfill", limit=5) - expected = df[-2:].reindex(index).fillna(method='backfill') + expected = df[-2:].reindex(index).fillna(method="backfill") expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -556,26 +589,30 @@ def test_fillna_inplace(self): df[1][:4] = np.nan df[3][-4:] = np.nan - expected = df.fillna(method='ffill') + expected = df.fillna(method="ffill") assert expected is not df - df.fillna(method='ffill', inplace=True) + df.fillna(method="ffill", inplace=True) tm.assert_frame_equal(df, expected) def test_fillna_dict_series(self): - df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}) + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) - result = df.fillna({'a': 0, 'b': 5}) + result = df.fillna({"a": 0, "b": 5}) expected = df.copy() - expected['a'] = expected['a'].fillna(0) - expected['b'] = expected['b'].fillna(5) + expected["a"] = expected["a"].fillna(0) + expected["b"] = expected["b"].fillna(5) assert_frame_equal(result, expected) # it works - result = df.fillna({'a': 0, 'b': 5, 'd': 7}) + result = df.fillna({"a": 0, "b": 5, "d": 7}) # Series treated same as dict result = df.fillna(df.max()) @@ -583,29 +620,41 @@ def test_fillna_dict_series(self): assert_frame_equal(result, expected) # disable this for now - with pytest.raises(NotImplementedError, match='column by column'): + with pytest.raises(NotImplementedError, match="column by column"): df.fillna(df.max(1), axis=1) def test_fillna_dataframe(self): # GH 8377 - df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}, - index=list('VWXYZ')) + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) # df2 may have different index and columns - df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40], - 'b': [50, 60, 70, 80, 90], - 'foo': ['bar'] * 5}, - index=list('VWXuZ')) + df2 = DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) result = df.fillna(df2) # only those columns and indices which are shared get filled - expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40], - 
'b': [1, 2, 3, np.nan, 90], - 'c': [np.nan, 1, 2, 3, 4]}, - index=list('VWXYZ')) + expected = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, 40], + "b": [1, 2, 3, np.nan, 90], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) assert_frame_equal(result, expected) @@ -613,31 +662,32 @@ def test_fillna_columns(self): df = DataFrame(np.random.randn(10, 10)) df.values[:, ::2] = np.nan - result = df.fillna(method='ffill', axis=1) - expected = df.T.fillna(method='pad').T + result = df.fillna(method="ffill", axis=1) + expected = df.T.fillna(method="pad").T assert_frame_equal(result, expected) - df.insert(6, 'foo', 5) - result = df.fillna(method='ffill', axis=1) - expected = df.astype(float).fillna(method='ffill', axis=1) + df.insert(6, "foo", 5) + result = df.fillna(method="ffill", axis=1) + expected = df.astype(float).fillna(method="ffill", axis=1) assert_frame_equal(result, expected) def test_fillna_invalid_method(self, float_frame): - with pytest.raises(ValueError, match='ffil'): - float_frame.fillna(method='ffil') + with pytest.raises(ValueError, match="ffil"): + float_frame.fillna(method="ffil") def test_fillna_invalid_value(self, float_frame): # list - msg = ("\"value\" parameter must be a scalar or dict, but you passed" - " a \"{}\"") - with pytest.raises(TypeError, match=msg.format('list')): + msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"' + with pytest.raises(TypeError, match=msg.format("list")): float_frame.fillna([1, 2]) # tuple - with pytest.raises(TypeError, match=msg.format('tuple')): + with pytest.raises(TypeError, match=msg.format("tuple")): float_frame.fillna((1, 2)) # frame with series - msg = ("\"value\" parameter must be a scalar, dict or Series, but you" - " passed a \"DataFrame\"") + msg = ( + '"value" parameter must be a scalar, dict or Series, but you' + ' passed a "DataFrame"' + ) with pytest.raises(TypeError, match=msg): float_frame.iloc[:, 0].fillna(float_frame) @@ -645,17 +695,17 @@ def test_fillna_col_reordering(self): cols = ["COL." 
+ str(i) for i in range(5, 0, -1)] data = np.random.rand(20, 5) df = DataFrame(index=range(20), columns=cols, data=data) - filled = df.fillna(method='ffill') + filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 'A'] = np.nan + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], 'foo'] == 0).all() - del float_string_frame['foo'] + assert (filled.loc[filled.index[5:20], "foo"] == 0).all() + del float_string_frame["foo"] empty_float = float_frame.reindex(columns=[]) @@ -664,8 +714,8 @@ def test_fill_corner(self, float_frame, float_string_frame): def test_fill_value_when_combine_const(self): # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') - df = DataFrame({'foo': dat}, index=range(6)) + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = DataFrame({"foo": dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) @@ -673,123 +723,144 @@ def test_fill_value_when_combine_const(self): class TestDataFrameInterpolate: - def test_interp_basic(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) - expected = DataFrame({'A': [1., 2., 3., 4.], - 'B': [1., 4., 9., 9.], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + expected = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0], + "B": [1.0, 4.0, 9.0, 9.0], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) result = df.interpolate() assert_frame_equal(result, expected) - result = df.set_index('C').interpolate() - expected = df.set_index('C') - expected.loc[3, 'A'] = 3 - expected.loc[5, 'B'] = 9 + result = df.set_index("C").interpolate() + expected = df.set_index("C") + expected.loc[3, "A"] = 3 + expected.loc[5, "B"] = 9 assert_frame_equal(result, expected) def test_interp_bad_method(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) with pytest.raises(ValueError): - df.interpolate(method='not_a_method') + df.interpolate(method="not_a_method") def test_interp_combo(self): - df = DataFrame({'A': [1., 2., np.nan, 4.], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) - - result = df['A'].interpolate() - expected = Series([1., 2., 3., 4.], name='A') + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + result = df["A"].interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], name="A") assert_series_equal(result, expected) - result = df['A'].interpolate(downcast='infer') - expected = Series([1, 2, 3, 4], name='A') + result = df["A"].interpolate(downcast="infer") + expected = Series([1, 2, 3, 4], name="A") assert_series_equal(result, expected) def test_interp_nan_idx(self): - df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) - df = df.set_index('A') + df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) + df = df.set_index("A") with pytest.raises(NotImplementedError): - df.interpolate(method='values') + df.interpolate(method="values") 
@td.skip_if_no_scipy def test_interp_various(self): - df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], - 'C': [1, 2, 3, 5, 8, 13, 21]}) - df = df.set_index('C') + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") expected = df.copy() - result = df.interpolate(method='polynomial', order=1) + result = df.interpolate(method="polynomial", order=1) expected.A.loc[3] = 2.66666667 expected.A.loc[13] = 5.76923076 assert_frame_equal(result, expected) - result = df.interpolate(method='cubic') + result = df.interpolate(method="cubic") # GH #15662. expected.A.loc[3] = 2.81547781 expected.A.loc[13] = 5.52964175 assert_frame_equal(result, expected) - result = df.interpolate(method='nearest') + result = df.interpolate(method="nearest") expected.A.loc[3] = 2 expected.A.loc[13] = 5 assert_frame_equal(result, expected, check_dtype=False) - result = df.interpolate(method='quadratic') + result = df.interpolate(method="quadratic") expected.A.loc[3] = 2.82150771 expected.A.loc[13] = 6.12648668 assert_frame_equal(result, expected) - result = df.interpolate(method='slinear') + result = df.interpolate(method="slinear") expected.A.loc[3] = 2.66666667 expected.A.loc[13] = 5.76923077 assert_frame_equal(result, expected) - result = df.interpolate(method='zero') - expected.A.loc[3] = 2. + result = df.interpolate(method="zero") + expected.A.loc[3] = 2.0 expected.A.loc[13] = 5 assert_frame_equal(result, expected, check_dtype=False) @td.skip_if_no_scipy def test_interp_alt_scipy(self): - df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], - 'C': [1, 2, 3, 5, 8, 13, 21]}) - result = df.interpolate(method='barycentric') + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + result = df.interpolate(method="barycentric") expected = df.copy() - expected.loc[2, 'A'] = 3 - expected.loc[5, 'A'] = 6 + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6 assert_frame_equal(result, expected) - result = df.interpolate(method='barycentric', downcast='infer') + result = df.interpolate(method="barycentric", downcast="infer") assert_frame_equal(result, expected.astype(np.int64)) - result = df.interpolate(method='krogh') + result = df.interpolate(method="krogh") expectedk = df.copy() - expectedk['A'] = expected['A'] + expectedk["A"] = expected["A"] assert_frame_equal(result, expectedk) _skip_if_no_pchip() - result = df.interpolate(method='pchip') - expected.loc[2, 'A'] = 3 - expected.loc[5, 'A'] = 6.0 + result = df.interpolate(method="pchip") + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6.0 assert_frame_equal(result, expected) def test_interp_rowwise(self): - df = DataFrame({0: [1, 2, np.nan, 4], - 1: [2, 3, 4, np.nan], - 2: [np.nan, 4, 5, 6], - 3: [4, np.nan, 6, 7], - 4: [1, 2, 3, 4]}) + df = DataFrame( + { + 0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4], + } + ) result = df.interpolate(axis=1) expected = df.copy() expected.loc[3, 1] = 5 @@ -798,7 +869,7 @@ def test_interp_rowwise(self): expected[4] = expected[4].astype(np.float64) assert_frame_equal(result, expected) - result = df.interpolate(axis=1, method='values') + result = df.interpolate(axis=1, method="values") assert_frame_equal(result, expected) result = df.interpolate(axis=0) @@ -806,84 +877,96 @@ def test_interp_rowwise(self): assert_frame_equal(result, expected) def test_rowwise_alt(self): - df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64], - 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]}) + 
df = DataFrame( + { + 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1], + } + ) df.interpolate(axis=0) - @pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) def test_interp_leading_nans(self, check_scipy): - df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0], - "B": [np.nan, -3, -3.5, np.nan, -4]}) + df = DataFrame( + {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]} + ) result = df.interpolate() expected = df.copy() - expected['B'].loc[3] = -3.75 + expected["B"].loc[3] = -3.75 assert_frame_equal(result, expected) if check_scipy: - result = df.interpolate(method='polynomial', order=1) + result = df.interpolate(method="polynomial", order=1) assert_frame_equal(result, expected) def test_interp_raise_on_only_mixed(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': [np.nan, 2, 5, 7], - 'D': [np.nan, np.nan, 9, 9], - 'E': [1, 2, 3, 4]}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": ["a", "b", "c", "d"], + "C": [np.nan, 2, 5, 7], + "D": [np.nan, np.nan, 9, 9], + "E": [1, 2, 3, 4], + } + ) with pytest.raises(TypeError): df.interpolate(axis=1) def test_interp_raise_on_all_object_dtype(self): # GH 22985 - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [4, 5, 6]}, - dtype='object') - msg = ("Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype.") + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." 
+ ) with pytest.raises(TypeError, match=msg): df.interpolate() def test_interp_inplace(self): - df = DataFrame({'a': [1., 2., np.nan, 4.]}) - expected = DataFrame({'a': [1., 2., 3., 4.]}) + df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) result = df.copy() - result['a'].interpolate(inplace=True) + result["a"].interpolate(inplace=True) assert_frame_equal(result, expected) result = df.copy() - result['a'].interpolate(inplace=True, downcast='infer') - assert_frame_equal(result, expected.astype('int64')) + result["a"].interpolate(inplace=True, downcast="infer") + assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): # GH 10395 - result = DataFrame({'a': [1., 2., 3., 4.], - 'b': [np.nan, 2., 3., 4.], - 'c': [3, 2, 2, 2]}) - expected = result.interpolate(method='linear', axis=1, inplace=False) - result.interpolate(method='linear', axis=1, inplace=True) + result = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} + ) + expected = result.interpolate(method="linear", axis=1, inplace=False) + result.interpolate(method="linear", axis=1, inplace=True) assert_frame_equal(result, expected) def test_interp_ignore_all_good(self): # GH - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 2, 3, 4], - 'C': [1., 2., np.nan, 4.], - 'D': [1., 2., 3., 4.]}) - expected = DataFrame({'A': np.array( - [1, 2, 3, 4], dtype='float64'), - 'B': np.array( - [1, 2, 3, 4], dtype='int64'), - 'C': np.array( - [1., 2., 3, 4.], dtype='float64'), - 'D': np.array( - [1., 2., 3., 4.], dtype='float64')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 2, 3, 4], + "C": [1.0, 2.0, np.nan, 4.0], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) + expected = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="float64"), + "B": np.array([1, 2, 3, 4], dtype="int64"), + "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"), + "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), + } + ) result = df.interpolate(downcast=None) assert_frame_equal(result, expected) # all good - result = df[['B', 'D']].interpolate(downcast=None) - assert_frame_equal(result, df[['B', 'D']]) + result = df[["B", "D"]].interpolate(downcast=None) + assert_frame_equal(result, df[["B", "D"]]) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index ffc2a515bc4b72..ed9eeb594f7f67 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -11,13 +11,12 @@ class TestDataFrameMutateColumns: - def test_assign(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) original = df.copy() result = df.assign(C=df.B / df.A) expected = df.copy() - expected['C'] = [4, 2.5, 2] + expected["C"] = [4, 2.5, 2] assert_frame_equal(result, expected) # lambda syntax @@ -34,13 +33,13 @@ def test_assign(self): assert_frame_equal(df, original) result = df.assign(B=df.B / df.A) - expected = expected.drop('B', axis=1).rename(columns={'C': 'B'}) + expected = expected.drop("B", axis=1).rename(columns={"C": "B"}) assert_frame_equal(result, expected) # overwrite result = df.assign(A=df.A + df.B) expected = df.copy() - expected['A'] = [5, 7, 9] + expected["A"] = [5, 7, 9] assert_frame_equal(result, expected) # lambda @@ -48,33 +47,31 @@ def test_assign(self): assert_frame_equal(result, expected) def test_assign_multiple(self): - df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) + df = DataFrame([[1, 4], [2, 5], [3, 6]], 
columns=["A", "B"]) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) - expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], - [3, 6, 9, 3, 6]], columns=list('ABCDE')) + expected = DataFrame( + [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE") + ) assert_frame_equal(result, expected) def test_assign_order(self): # GH 9818 - df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) result = df.assign(D=df.A + df.B, C=df.A - df.B) if PY36: - expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], - columns=list('ABDC')) + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) else: - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], - columns=list('ABCD')) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], - columns=list('ABCD')) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) assert_frame_equal(result, expected) def test_assign_bad(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # non-keyword argument with pytest.raises(TypeError): @@ -82,52 +79,55 @@ def test_assign_bad(self): with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) - @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python - 3.6 and above""") + @pytest.mark.skipif( + PY36, + reason="""Issue #14207: valid for python + 3.6 and above""", + ) def test_assign_dependent_old_python(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # Key C does not exist at definition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, - D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) with pytest.raises(KeyError): - df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) - @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for - python 3.5 and below""") + @pytest.mark.skipif( + not PY36, + reason="""Issue #14207: not valid for + python 3.5 and below""", + ) def test_assign_dependent(self): - df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) - result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) - expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], - columns=list('ABCD')) + result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) assert_frame_equal(result, expected) - result = df.assign(C=lambda df: df.A, - D=lambda df: df['A'] + df['C']) - expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], - columns=list('ABCD')) + result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) assert_frame_equal(result, expected) def test_insert_error_msmgs(self): # GH 7432 - df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [ - 1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo') - s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [ - 'g', 'h', 'i', 'j']}).set_index('foo') - msg = 'cannot reindex from a duplicate axis' + df = DataFrame( + {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]} + ).set_index("foo") + s = DataFrame( + {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]} + 
).set_index("foo") + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - df['newcol'] = s + df["newcol"] = s # GH 4107, more descriptive error message - df = DataFrame(np.random.randint(0, 2, (4, 4)), - columns=['a', 'b', 'c', 'd']) + df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]) - msg = 'incompatible index of inserted column with frame index' + msg = "incompatible index of inserted column with frame index" with pytest.raises(TypeError, match=msg): - df['gr'] = df.groupby(['b', 'c']).count() + df["gr"] = df.groupby(["b", "c"]).count() def test_insert_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns @@ -137,120 +137,120 @@ def test_insert_benchmark(self): new_col = np.random.randn(N) for i in range(K): df[i] = new_col - expected = DataFrame(np.repeat(new_col, K).reshape(N, K), - index=range(N)) + expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) assert_frame_equal(df, expected) def test_insert(self): - df = DataFrame(np.random.randn(5, 3), index=np.arange(5), - columns=['c', 'b', 'a']) + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) - df.insert(0, 'foo', df['a']) - tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a'])) - tm.assert_series_equal(df['a'], df['foo'], check_names=False) + df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) - df.insert(2, 'bar', df['c']) - tm.assert_index_equal(df.columns, - Index(['foo', 'c', 'bar', 'b', 'a'])) - tm.assert_almost_equal(df['c'], df['bar'], check_names=False) + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) # diff dtype # new item - df['x'] = df['a'].astype('float32') + df["x"] = df["a"].astype("float32") result = df.dtypes - expected = Series([np.dtype('float64')] * 5 + [np.dtype('float32')], - index=['foo', 'c', 'bar', 'b', 'a', 'x']) + expected = Series( + [np.dtype("float64")] * 5 + [np.dtype("float32")], + index=["foo", "c", "bar", "b", "a", "x"], + ) tm.assert_series_equal(result, expected) # replacing current (in different block) - df['a'] = df['a'].astype('float32') + df["a"] = df["a"].astype("float32") result = df.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('float32')] * 2, - index=['foo', 'c', 'bar', 'b', 'a', 'x']) + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, + index=["foo", "c", "bar", "b", "a", "x"], + ) tm.assert_series_equal(result, expected) - df['y'] = df['a'].astype('int32') + df["y"] = df["a"].astype("int32") result = df.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('float32')] * 2 + - [np.dtype('int32')], - index=['foo', 'c', 'bar', 'b', 'a', 'x', 'y']) + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], + index=["foo", "c", "bar", "b", "a", "x", "y"], + ) tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match='already exists'): - df.insert(1, 'a', df['b']) + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) msg = "cannot insert c, already exists" with pytest.raises(ValueError, match=msg): - df.insert(1, 'c', df['b']) + df.insert(1, "c", df["b"]) - df.columns.name = 'some_name' + df.columns.name = "some_name" # preserve columns name field - df.insert(0, 
'baz', df['c']) - assert df.columns.name == 'some_name' + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" # GH 13522 - df = DataFrame(index=['A', 'B', 'C']) - df['X'] = df.index - df['X'] = ['x', 'y', 'z'] - exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) + df = DataFrame(index=["A", "B", "C"]) + df["X"] = df.index + df["X"] = ["x", "y", "z"] + exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) assert_frame_equal(df, exp) def test_delitem(self, float_frame): - del float_frame['A'] - assert 'A' not in float_frame + del float_frame["A"] + assert "A" not in float_frame def test_delitem_multiindex(self): - midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) df = DataFrame(np.random.randn(4, 4), columns=midx) assert len(df.columns) == 4 - assert ('A', ) in df.columns - assert 'A' in df.columns + assert ("A",) in df.columns + assert "A" in df.columns - result = df['A'] + result = df["A"] assert isinstance(result, DataFrame) - del df['A'] + del df["A"] assert len(df.columns) == 2 # A still in the levels, BUT get a KeyError if trying # to delete - assert ('A', ) not in df.columns + assert ("A",) not in df.columns with pytest.raises(KeyError): - del df[('A',)] + del df[("A",)] # behavior of dropped/deleted MultiIndex levels changed from # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' # levels which are dropped/deleted - assert 'A' not in df.columns + assert "A" not in df.columns with pytest.raises(KeyError): - del df['A'] + del df["A"] def test_pop(self, float_frame): - float_frame.columns.name = 'baz' + float_frame.columns.name = "baz" - float_frame.pop('A') - assert 'A' not in float_frame + float_frame.pop("A") + assert "A" not in float_frame - float_frame['foo'] = 'bar' - float_frame.pop('foo') - assert 'foo' not in float_frame - assert float_frame.columns.name == 'baz' + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" # gh-10912: inplace ops cause caching issue - a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[ - 'A', 'B', 'C'], index=['X', 'Y']) - b = a.pop('B') + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") b += 1 # original frame - expected = DataFrame([[1, 3], [4, 6]], columns=[ - 'A', 'C'], index=['X', 'Y']) + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) tm.assert_frame_equal(a, expected) # result - expected = Series([2, 5], index=['X', 'Y'], name='B') + 1 + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 tm.assert_series_equal(b, expected) def test_pop_non_unique_cols(self): @@ -268,19 +268,18 @@ def test_pop_non_unique_cols(self): def test_insert_column_bug_4032(self): # GH4032, inserting a column and renaming causing errors - df = DataFrame({'b': [1.1, 2.2]}) + df = DataFrame({"b": [1.1, 2.2]}) df = df.rename(columns={}) - df.insert(0, 'a', [1, 2]) + df.insert(0, "a", [1, 2]) result = df.rename(columns={}) str(result) - expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b']) + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) assert_frame_equal(result, expected) - df.insert(0, 'c', [1.3, 2.3]) + df.insert(0, "c", [1.3, 2.3]) result = df.rename(columns={}) str(result) - expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], - columns=['c', 'a', 'b']) + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) assert_frame_equal(result, expected) diff 
--git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index e7583adff403bf..4faa0d0e3f941b 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -9,9 +9,7 @@ class TestDataFrameNonuniqueIndexes(TestData): - def test_column_dups_operations(self): - def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) @@ -22,253 +20,294 @@ def check(result, expected=None): # GH 3687 arr = np.random.randn(3, 2) idx = list(range(2)) - df = DataFrame(arr, columns=['A', 'A']) + df = DataFrame(arr, columns=["A", "A"]) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) - idx = date_range('20130101', periods=4, freq='Q-NOV') - df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], - columns=['a', 'a', 'a', 'a']) + idx = date_range("20130101", periods=4, freq="Q-NOV") + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"] + ) df.columns = idx - expected = DataFrame( - [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) + expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert - df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], - columns=['foo', 'bar', 'foo', 'hello']) - df['string'] = 'bah' - expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], - [2, 1, 3, 5, 'bah']], - columns=['foo', 'bar', 'foo', 'hello', 'string']) + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], + columns=["foo", "bar", "foo", "hello"], + ) + df["string"] = "bah" + expected = DataFrame( + [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]], + columns=["foo", "bar", "foo", "hello", "string"], + ) check(df, expected) - with pytest.raises(ValueError, match='Length of value'): - df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) + with pytest.raises(ValueError, match="Length of value"): + df.insert(0, "AnotherColumn", range(len(df.index) - 1)) # insert same dtype - df['foo2'] = 3 - expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], - [2, 1, 3, 5, 'bah', 3]], - columns=['foo', 'bar', 'foo', 'hello', - 'string', 'foo2']) + df["foo2"] = 3 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) check(df, expected) # set (non-dup) - df['foo2'] = 4 - expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], - [2, 1, 3, 5, 'bah', 4]], - columns=['foo', 'bar', 'foo', 'hello', - 'string', 'foo2']) + df["foo2"] = 4 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) check(df, expected) - df['foo2'] = 3 + df["foo2"] = 3 # delete (non dup) - del df['bar'] - expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], - [2, 3, 5, 'bah', 3]], - columns=['foo', 'foo', 'hello', 'string', 'foo2']) + del df["bar"] + expected = DataFrame( + [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]], + columns=["foo", "foo", "hello", "string", "foo2"], + ) check(df, expected) # try to delete again (its not consolidated) - del df['hello'] - expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], - [2, 3, 'bah', 3]], - columns=['foo', 'foo', 'string', 'foo2']) + del df["hello"] + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", 
"foo2"], + ) check(df, expected) # consolidate df = df._consolidate() - expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], - [2, 3, 'bah', 3]], - columns=['foo', 'foo', 'string', 'foo2']) + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", "foo2"], + ) check(df, expected) # insert - df.insert(2, 'new_col', 5.) - expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], - [2, 3, 5., 'bah', 3]], - columns=['foo', 'foo', 'new_col', 'string', - 'foo2']) + df.insert(2, "new_col", 5.0) + expected = DataFrame( + [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]], + columns=["foo", "foo", "new_col", "string", "foo2"], + ) check(df, expected) # insert a dup - with pytest.raises(ValueError, match='cannot insert'): - df.insert(2, 'new_col', 4.) - - df.insert(2, 'new_col', 4., allow_duplicates=True) - expected = DataFrame([[1, 1, 4., 5., 'bah', 3], - [1, 2, 4., 5., 'bah', 3], - [2, 3, 4., 5., 'bah', 3]], - columns=['foo', 'foo', 'new_col', - 'new_col', 'string', 'foo2']) + with pytest.raises(ValueError, match="cannot insert"): + df.insert(2, "new_col", 4.0) + + df.insert(2, "new_col", 4.0, allow_duplicates=True) + expected = DataFrame( + [ + [1, 1, 4.0, 5.0, "bah", 3], + [1, 2, 4.0, 5.0, "bah", 3], + [2, 3, 4.0, 5.0, "bah", 3], + ], + columns=["foo", "foo", "new_col", "new_col", "string", "foo2"], + ) check(df, expected) # delete (dup) - del df['foo'] - expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3], - [4., 5., 'bah', 3]], - columns=['new_col', 'new_col', 'string', 'foo2']) + del df["foo"] + expected = DataFrame( + [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]], + columns=["new_col", "new_col", "string", "foo2"], + ) assert_frame_equal(df, expected) # dup across dtypes - df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], - columns=['foo', 'bar', 'foo', 'hello']) + df = DataFrame( + [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]], + columns=["foo", "bar", "foo", "hello"], + ) check(df) - df['foo2'] = 7. 
- expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], - [2, 1, 3., 5, 7.]], - columns=['foo', 'bar', 'foo', 'hello', 'foo2']) + df["foo2"] = 7.0 + expected = DataFrame( + [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) check(df, expected) - result = df['foo'] - expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], - columns=['foo', 'foo']) + result = df["foo"] + expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"]) check(result, expected) # multiple replacements - df['foo'] = 'string' - expected = DataFrame([['string', 1, 'string', 5, 7.], - ['string', 1, 'string', 5, 7.], - ['string', 1, 'string', 5, 7.]], - columns=['foo', 'bar', 'foo', 'hello', 'foo2']) + df["foo"] = "string" + expected = DataFrame( + [ + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) check(df, expected) - del df['foo'] - expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[ - 'bar', 'hello', 'foo2']) + del df["foo"] + expected = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"] + ) check(df, expected) # values - df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) + df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"]) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) assert (result == expected).all().all() # rename, GH 4403 df4 = DataFrame( - {'RT': [0.0454], - 'TClose': [22.02], - 'TExg': [0.0422]}, - index=MultiIndex.from_tuples([(600809, 20130331)], - names=['STK_ID', 'RPT_Date'])) - - df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331], - 'STK_ID': [600809] * 3, - 'STK_Name': ['饡驦', '饡驦', '饡驦'], - 'TClose': [38.05, 41.66, 30.01]}, - index=MultiIndex.from_tuples( - [(600809, 20120930), - (600809, 20121231), - (600809, 20130331)], - names=['STK_ID', 'RPT_Date'])) - - k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) - result = k.rename( - columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'}) + {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]}, + index=MultiIndex.from_tuples( + [(600809, 20130331)], names=["STK_ID", "RPT_Date"] + ), + ) + + df5 = DataFrame( + { + "RPT_Date": [20120930, 20121231, 20130331], + "STK_ID": [600809] * 3, + "STK_Name": ["饡驦", "饡驦", "饡驦"], + "TClose": [38.05, 41.66, 30.01], + }, + index=MultiIndex.from_tuples( + [(600809, 20120930), (600809, 20121231), (600809, 20130331)], + names=["STK_ID", "RPT_Date"], + ), + ) + + k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True) + result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"}) str(result) result.dtypes - expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809, - '饡驦', 30.01]], - columns=['RT', 'TClose', 'TExg', - 'RPT_Date', 'STK_ID', 'STK_Name', - 'QT_Close']) - .set_index(['STK_ID', 'RPT_Date'], drop=False)) + expected = DataFrame( + [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]], + columns=[ + "RT", + "TClose", + "TExg", + "RPT_Date", + "STK_ID", + "STK_Name", + "QT_Close", + ], + ).set_index(["STK_ID", "RPT_Date"], drop=False) assert_frame_equal(result, expected) # reindex is invalid! 
- df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], - columns=['bar', 'a', 'a']) + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - df.reindex(columns=['bar']) + df.reindex(columns=["bar"]) with pytest.raises(ValueError, match=msg): - df.reindex(columns=['bar', 'foo']) + df.reindex(columns=["bar", "foo"]) # drop - df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], - columns=['bar', 'a', 'a']) - result = df.drop(['a'], axis=1) - expected = DataFrame([[1], [1], [1]], columns=['bar']) + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + result = df.drop(["a"], axis=1) + expected = DataFrame([[1], [1], [1]], columns=["bar"]) check(result, expected) - result = df.drop('a', axis=1) + result = df.drop("a", axis=1) check(result, expected) # describe - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['bar', 'a', 'a'], dtype='float64') + df = DataFrame( + [[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=["bar", "a", "a"], + dtype="float64", + ) result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'A']) - for index in [df.index, pd.Index(list('edcba'))]: + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "A"], + ) + for index in [df.index, pd.Index(list("edcba"))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) - expected_df = DataFrame({'A': expected_ser, - 'B': this_df['B'], - 'A': expected_ser}, - columns=['A', 'B', 'A']) - this_df['A'] = index + expected_df = DataFrame( + {"A": expected_ser, "B": this_df["B"], "A": expected_ser}, + columns=["A", "B", "A"], + ) + this_df["A"] = index check(this_df, expected_df) # operations - for op in ['__add__', '__mul__', '__sub__', '__truediv__']: + for op in ["__add__", "__mul__", "__sub__", "__truediv__"]: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) - expected.columns = ['A', 'A'] - df.columns = ['A', 'A'] + expected.columns = ["A", "A"] + df.columns = ["A", "A"] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 - df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) - expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) + df = DataFrame(np.random.randn(5, 2), columns=["that", "that"]) + expected = DataFrame(1.0, index=range(5), columns=["that", "that"]) - df['that'] = 1.0 + df["that"] = 1.0 check(df, expected) - df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) - expected = DataFrame(1, index=range(5), columns=['that', 'that']) + df = DataFrame(np.random.rand(5, 2), columns=["that", "that"]) + expected = DataFrame(1, index=range(5), columns=["that", "that"]) - df['that'] = 1 + df["that"] = 1 check(df, expected) def test_column_dups2(self): # drop buggy GH 6240 - df = DataFrame({'A': np.random.randn(5), - 'B': np.random.randn(5), - 'C': np.random.randn(5), - 'D': ['a', 'b', 'c', 'd', 'e']}) + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) expected = df.take([0, 1, 1], axis=1) df2 = df.take([2, 0, 1, 
2, 1], axis=1) - result = df2.drop('C', axis=1) + result = df2.drop("C", axis=1) assert_frame_equal(result, expected) # dropna - df = DataFrame({'A': np.random.randn(5), - 'B': np.random.randn(5), - 'C': np.random.randn(5), - 'D': ['a', 'b', 'c', 'd', 'e']}) + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) df.iloc[2, [0, 1, 2]] = np.nan df.iloc[0, 0] = np.nan df.iloc[1, 1] = np.nan df.iloc[:, 3] = np.nan - expected = df.dropna(subset=['A', 'B', 'C'], how='all') - expected.columns = ['A', 'A', 'B', 'C'] + expected = df.dropna(subset=["A", "B", "C"], how="all") + expected.columns = ["A", "A", "B", "C"] - df.columns = ['A', 'A', 'B', 'C'] + df.columns = ["A", "A", "B", "C"] - result = df.dropna(subset=['A', 'C'], how='all') + result = df.dropna(subset=["A", "C"], how="all") assert_frame_equal(result, expected) def test_column_dups_indexing(self): @@ -280,29 +319,28 @@ def check(result, expected=None): # boolean indexing # GH 4879 - dups = ['A', 'A', 'C', 'D'] - df = DataFrame(np.arange(12).reshape(3, 4), columns=[ - 'A', 'B', 'C', 'D'], dtype='float64') + dups = ["A", "A", "C", "D"] + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) expected = df[df.C > 6] expected.columns = dups - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") result = df[df.C > 6] check(result, expected) # where - df = DataFrame(np.arange(12).reshape(3, 4), columns=[ - 'A', 'B', 'C', 'D'], dtype='float64') + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) expected = df[df > 6] expected.columns = dups - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") result = df[df > 6] check(result, expected) # boolean with the duplicate raises - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df[df.A > 6] @@ -316,10 +354,8 @@ def check(result, expected=None): assert_frame_equal(result, expected) # equality - df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], - columns=['A', 'B']) - df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], - columns=['A', 'A']) + df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"]) + df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"]) # not-comparing like-labelled msg = "Can only compare identically-labeled DataFrame objects" @@ -328,105 +364,116 @@ def check(result, expected=None): df1r = df1.reindex_like(df2) result = df1r == df2 - expected = DataFrame([[False, True], [True, False], [False, False], [ - True, False]], columns=['A', 'A']) + expected = DataFrame( + [[False, True], [True, False], [False, False], [True, False]], + columns=["A", "A"], + ) assert_frame_equal(result, expected) # mixed column selection # GH 5639 - dfbool = DataFrame({'one': Series([True, True, False], - index=['a', 'b', 'c']), - 'two': Series([False, False, True, False], - index=['a', 'b', 'c', 'd']), - 'three': Series([False, True, True, True], - index=['a', 'b', 'c', 'd'])}) - expected = pd.concat( - [dfbool['one'], dfbool['three'], dfbool['one']], axis=1) - result = dfbool[['one', 'three', 
'one']] + dfbool = DataFrame( + { + "one": Series([True, True, False], index=["a", "b", "c"]), + "two": Series([False, False, True, False], index=["a", "b", "c", "d"]), + "three": Series([False, True, True, True], index=["a", "b", "c", "d"]), + } + ) + expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1) + result = dfbool[["one", "three", "one"]] check(result, expected) # multi-axis dups # GH 6121 - df = DataFrame(np.arange(25.).reshape(5, 5), - index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'C', 'D', 'E']) - z = df[['A', 'C', 'A']].copy() - expected = z.loc[['a', 'c', 'a']] - - df = DataFrame(np.arange(25.).reshape(5, 5), - index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'C', 'D', 'E']) - z = df[['A', 'C', 'A']] - result = z.loc[['a', 'c', 'a']] + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]].copy() + expected = z.loc[["a", "c", "a"]] + + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]] + result = z.loc[["a", "c", "a"]] check(result, expected) def test_column_dups_indexing2(self): # GH 8363 # datetime ops with a non-unique index - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': np.arange(1, 6, dtype='int64')}, - index=[2, 2, 3, 3, 4]) + df = DataFrame( + {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")}, + index=[2, 2, 3, 3, 4], + ) result = df.B - df.A expected = Series(1, index=[2, 2, 3, 3, 4]) assert_series_equal(result, expected) - df = DataFrame({'A': date_range('20130101', periods=5), - 'B': date_range('20130101 09:00:00', periods=5)}, - index=[2, 2, 3, 3, 4]) + df = DataFrame( + { + "A": date_range("20130101", periods=5), + "B": date_range("20130101 09:00:00", periods=5), + }, + index=[2, 2, 3, 3, 4], + ) result = df.B - df.A - expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4]) + expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4]) assert_series_equal(result, expected) def test_columns_with_dups(self): # GH 3468 related # basic - df = DataFrame([[1, 2]], columns=['a', 'a']) - df.columns = ['a', 'a.1'] + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["a", "a.1"] str(df) - expected = DataFrame([[1, 2]], columns=['a', 'a.1']) + expected = DataFrame([[1, 2]], columns=["a", "a.1"]) assert_frame_equal(df, expected) - df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a']) - df.columns = ['b', 'a', 'a.1'] + df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"]) + df.columns = ["b", "a", "a.1"] str(df) - expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1']) + expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"]) assert_frame_equal(df, expected) # with a dup index - df = DataFrame([[1, 2]], columns=['a', 'a']) - df.columns = ['b', 'b'] + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["b", "b"] str(df) - expected = DataFrame([[1, 2]], columns=['b', 'b']) + expected = DataFrame([[1, 2]], columns=["b", "b"]) assert_frame_equal(df, expected) # multi-dtype - df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], - columns=['a', 'a', 'b', 'b', 'd', 'c', 'c']) - df.columns = list('ABCDEFG') + df = DataFrame( + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], + columns=["a", "a", "b", "b", "d", "c", "c"], + ) + df.columns = list("ABCDEFG") str(df) expected = DataFrame( - [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG')) + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], 
columns=list("ABCDEFG") + ) assert_frame_equal(df, expected) - df = DataFrame([[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']) - df.columns = ['a', 'a.1', 'a.2', 'a.3'] + df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"]) + df.columns = ["a", "a.1", "a.2", "a.3"] str(df) - expected = DataFrame([[1, 2, 'foo', 'bar']], - columns=['a', 'a.1', 'a.2', 'a.3']) + expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) assert_frame_equal(df, expected) # dups across blocks - df_float = DataFrame(np.random.randn(10, 3), dtype='float64') - df_int = DataFrame(np.random.randn(10, 3), dtype='int64') - df_bool = DataFrame(True, index=df_float.index, - columns=df_float.columns) - df_object = DataFrame('foo', index=df_float.index, - columns=df_float.columns) - df_dt = DataFrame(pd.Timestamp('20010101'), - index=df_float.index, - columns=df_float.columns) + df_float = DataFrame(np.random.randn(10, 3), dtype="float64") + df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) + df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) + df_dt = DataFrame( + pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns + ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) assert len(df._data._blknos) == len(df.columns) @@ -437,27 +484,26 @@ def test_columns_with_dups(self): df.iloc[:, i] # dup columns across dtype GH 2079/2194 - vals = [[1, -1, 2.], [2, -2, 3.]] - rs = DataFrame(vals, columns=['A', 'A', 'B']) + vals = [[1, -1, 2.0], [2, -2, 3.0]] + rs = DataFrame(vals, columns=["A", "A", "B"]) xp = DataFrame(vals) - xp.columns = ['A', 'A', 'B'] + xp.columns = ["A", "A", "B"] assert_frame_equal(rs, xp) def test_values_duplicates(self): - df = DataFrame([[1, 2, 'a', 'b'], - [1, 2, 'a', 'b']], - columns=['one', 'one', 'two', 'two']) + df = DataFrame( + [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"] + ) result = df.values - expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], - dtype=object) + expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_set_value_by_index(self): # See gh-12344 df = DataFrame(np.arange(9).reshape(3, 3).T) - df.columns = list('AAA') + df.columns = list("AAA") expected = df.iloc[:, 2] df.iloc[:, 0] = 3 @@ -473,9 +519,10 @@ def test_set_value_by_index(self): def test_insert_with_columns_dups(self): # GH 14291 df = pd.DataFrame() - df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True) - df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True) - df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) - exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'], - ['c', 'f', 'i']], columns=['A', 'A', 'A']) + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = pd.DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 1e932879e9ad0f..67482ddf657fb2 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -10,79 +10,100 @@ from pandas.tests.frame.common import _check_mixed_float import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, 
assert_numpy_array_equal, assert_series_equal) + assert_frame_equal, + assert_numpy_array_equal, + assert_series_equal, +) class TestDataFrameUnaryOperators: # __pos__, __neg__, __inv__ - @pytest.mark.parametrize('df,expected', [ - (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})), - (pd.DataFrame({'a': [False, True]}), - pd.DataFrame({'a': [True, False]})), - (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), - pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))})) - ]) + @pytest.mark.parametrize( + "df,expected", + [ + (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})), + (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})), + ( + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}), + ), + ], + ) def test_neg_numeric(self, df, expected): assert_frame_equal(-df, expected) - assert_series_equal(-df['a'], expected['a']) - - @pytest.mark.parametrize('df, expected', [ - (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), - ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]), - ]) + assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df, expected", + [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]), + ], + ) def test_neg_object(self, df, expected): # GH#21380 - df = pd.DataFrame({'a': df}) - expected = pd.DataFrame({'a': expected}) + df = pd.DataFrame({"a": df}) + expected = pd.DataFrame({"a": expected}) assert_frame_equal(-df, expected) - assert_series_equal(-df['a'], expected['a']) - - @pytest.mark.parametrize('df', [ - pd.DataFrame({'a': ['a', 'b']}), - pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), - ]) + assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": ["a", "b"]}), + pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), + ], + ) def test_neg_raises(self, df): with pytest.raises(TypeError): - (- df) + (-df) with pytest.raises(TypeError): - (- df['a']) + (-df["a"]) def test_invert(self, float_frame): df = float_frame assert_frame_equal(-(df < 0), ~(df < 0)) - @pytest.mark.parametrize('df', [ - pd.DataFrame({'a': [-1, 1]}), - pd.DataFrame({'a': [False, True]}), - pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), - ]) + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [-1, 1]}), + pd.DataFrame({"a": [False, True]}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + ], + ) def test_pos_numeric(self, df): # GH#16073 assert_frame_equal(+df, df) - assert_series_equal(+df['a'], df['a']) - - @pytest.mark.parametrize('df', [ - # numpy changing behavior in the future - pytest.param(pd.DataFrame({'a': ['a', 'b']}), - marks=[pytest.mark.filterwarnings("ignore")]), - pd.DataFrame({'a': np.array([-1, 2], dtype=object)}), - pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}), - ]) + assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", + [ + # numpy changing behavior in the future + pytest.param( + pd.DataFrame({"a": ["a", "b"]}), + marks=[pytest.mark.filterwarnings("ignore")], + ), + pd.DataFrame({"a": np.array([-1, 2], dtype=object)}), + pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}), + ], + ) def test_pos_object(self, df): # GH#21380 assert_frame_equal(+df, df) - assert_series_equal(+df['a'], df['a']) + assert_series_equal(+df["a"], df["a"]) - @pytest.mark.parametrize('df', [ - 
pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), - ]) + @pytest.mark.parametrize( + "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] + ) def test_pos_raises(self, df): with pytest.raises(TypeError): - (+ df) + (+df) with pytest.raises(TypeError): - (+ df['a']) + (+df["a"]) class TestDataFrameLogicalOperators: @@ -103,14 +124,14 @@ def test_logical_ops_empty_frame(self): result = df & df2 assert_frame_equal(result, df2) - dfa = DataFrame(index=[1], columns=['A']) + dfa = DataFrame(index=[1], columns=["A"]) result = dfa & dfa assert_frame_equal(result, dfa) def test_logical_ops_bool_frame(self): # GH#5808 - df1a_bool = DataFrame(True, index=[1], columns=['A']) + df1a_bool = DataFrame(True, index=[1], columns=["A"]) result = df1a_bool & df1a_bool assert_frame_equal(result, df1a_bool) @@ -120,8 +141,8 @@ def test_logical_ops_bool_frame(self): def test_logical_ops_int_frame(self): # GH#5808 - df1a_int = DataFrame(1, index=[1], columns=['A']) - df1a_bool = DataFrame(True, index=[1], columns=['A']) + df1a_int = DataFrame(1, index=[1], columns=["A"]) + df1a_bool = DataFrame(True, index=[1], columns=["A"]) result = df1a_int | df1a_bool assert_frame_equal(result, df1a_int) @@ -129,48 +150,46 @@ def test_logical_ops_int_frame(self): def test_logical_ops_invalid(self): # GH#5808 - df1 = DataFrame(1.0, index=[1], columns=['A']) - df2 = DataFrame(True, index=[1], columns=['A']) + df1 = DataFrame(1.0, index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) with pytest.raises(TypeError): df1 | df2 - df1 = DataFrame('foo', index=[1], columns=['A']) - df2 = DataFrame(True, index=[1], columns=['A']) + df1 = DataFrame("foo", index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) with pytest.raises(TypeError): df1 | df2 def test_logical_operators(self): - def _check_bin_op(op): result = op(df1, df2) - expected = DataFrame(op(df1.values, df2.values), index=df1.index, - columns=df1.columns) + expected = DataFrame( + op(df1.values, df2.values), index=df1.index, columns=df1.columns + ) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) def _check_unary_op(op): result = op(df1) - expected = DataFrame(op(df1.values), index=df1.index, - columns=df1.columns) + expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) - df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, - 'b': {'a': False, 'b': True, 'c': False, - 'd': False, 'e': False}, - 'c': {'a': False, 'b': False, 'c': True, - 'd': False, 'e': False}, - 'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, - 'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}} - - df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, - 'b': {'a': False, 'b': True, 'c': False, - 'd': False, 'e': False}, - 'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, - 'd': {'a': False, 'b': False, 'c': False, - 'd': True, 'e': False}, - 'e': {'a': False, 'b': False, 'c': False, - 'd': False, 'e': True}} + df1 = { + "a": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "b": {"a": False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": False, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "e": {"a": True, "b": False, "c": False, "d": True, "e": True}, + } + + df2 = { + "a": {"a": True, "b": False, "c": True, "d": False, "e": False}, 
+ "b": {"a": False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": True, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": False, "b": False, "c": False, "d": True, "e": False}, + "e": {"a": False, "b": False, "c": False, "d": False, "e": True}, + } df1 = DataFrame(df1) df2 = DataFrame(df2) @@ -182,31 +201,32 @@ def _check_unary_op(op): _check_unary_op(operator.inv) # TODO: belongs elsewhere def test_logical_with_nas(self): - d = DataFrame({'a': [np.nan, False], 'b': [True, True]}) + d = DataFrame({"a": [np.nan, False], "b": [True, True]}) # GH4947 # bool comparisons should return bool - result = d['a'] | d['b'] + result = d["a"] | d["b"] expected = Series([False, True]) assert_series_equal(result, expected) # GH4604, automatic casting here - result = d['a'].fillna(False) | d['b'] + result = d["a"].fillna(False) | d["b"] expected = Series([True, True]) assert_series_equal(result, expected) - result = d['a'].fillna(False, downcast=False) | d['b'] + result = d["a"].fillna(False, downcast=False) | d["b"] expected = Series([True, True]) assert_series_equal(result, expected) class TestDataFrameOperators: - - @pytest.mark.parametrize('op', [operator.add, operator.sub, - operator.mul, operator.truediv]) + @pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] + ) def test_operators_none_as_na(self, op): - df = DataFrame({"col1": [2, 5.0, 123, None], - "col2": [1, 2, 3, 4]}, dtype=object) + df = DataFrame( + {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object + ) # since filling converts dtypes from object, changed expected to be # object @@ -227,13 +247,12 @@ def test_operators_none_as_na(self, op): result = op(df.fillna(7), df) assert_frame_equal(result, expected, check_dtype=False) - @pytest.mark.parametrize('op,res', [('__eq__', False), - ('__ne__', True)]) + @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) # TODO: not sure what's correct here. 
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") def test_logical_typeerror_with_non_valid(self, op, res, float_frame): # we are comparing floats vs a string - result = getattr(float_frame, op)('foo') + result = getattr(float_frame, op)("foo") assert bool(result.all().all()) is res def test_binary_ops_align(self): @@ -241,105 +260,109 @@ def test_binary_ops_align(self): # test aligning binary ops # GH 6681 - index = MultiIndex.from_product([list('abc'), - ['one', 'two', 'three'], - [1, 2, 3]], - names=['first', 'second', 'third']) + index = MultiIndex.from_product( + [list("abc"), ["one", "two", "three"], [1, 2, 3]], + names=["first", "second", "third"], + ) - df = DataFrame(np.arange(27 * 3).reshape(27, 3), - index=index, - columns=['value1', 'value2', 'value3']).sort_index() + df = DataFrame( + np.arange(27 * 3).reshape(27, 3), + index=index, + columns=["value1", "value2", "value3"], + ).sort_index() idx = pd.IndexSlice - for op in ['add', 'sub', 'mul', 'div', 'truediv']: + for op in ["add", "sub", "mul", "div", "truediv"]: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) - result = getattr(df, op)(x, level='third', axis=0) + result = getattr(df, op)(x, level="third", axis=0) - expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) - for i, v in x.iteritems()]).sort_index() + expected = pd.concat( + [opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()] + ).sort_index() assert_frame_equal(result, expected) - x = Series([1.0, 10.0], ['two', 'three']) - result = getattr(df, op)(x, level='second', axis=0) + x = Series([1.0, 10.0], ["two", "three"]) + result = getattr(df, op)(x, level="second", axis=0) - expected = (pd.concat([opa(df.loc[idx[:, i], :], v) - for i, v in x.iteritems()]) - .reindex_like(df).sort_index()) + expected = ( + pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]) + .reindex_like(df) + .sort_index() + ) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) - midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']]) - df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx) - s = pd.Series({'a': 1, 'b': 2}) + midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) + df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) + s = pd.Series({"a": 1, "b": 2}) df2 = df.copy() - df2.columns.names = ['lvl0', 'lvl1'] + df2.columns.names = ["lvl0", "lvl1"] s2 = s.copy() - s2.index.name = 'lvl1' + s2.index.name = "lvl1" # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) - res5 = df2.mul(s, axis=1, level='lvl1') - res6 = df2.mul(s2, axis=1, level='lvl1') + res5 = df2.mul(s, axis=1, level="lvl1") + res6 = df2.mul(s2, axis=1, level="lvl1") - exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'), - columns=midx) + exp = DataFrame( + np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx + ) for res in [res1, res2]: assert_frame_equal(res, exp) - exp.columns.names = ['lvl0', 'lvl1'] + exp.columns.names = ["lvl0", "lvl1"] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp) def test_dti_tz_convert_to_utc(self): - base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], tz='UTC') - idx1 = base.tz_convert('Asia/Tokyo')[:2] - idx2 = base.tz_convert('US/Eastern')[1:] - - df1 = DataFrame({'A': [1, 2]}, index=idx1) - df2 = DataFrame({'A': [1, 1]}, index=idx2) - exp = 
DataFrame({'A': [np.nan, 3, np.nan]}, index=base) + base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] + + df1 = DataFrame({"A": [1, 2]}, index=idx1) + df2 = DataFrame({"A": [1, 1]}, index=idx2) + exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base) assert_frame_equal(df1 + df2, exp) - def test_combineFrame(self, float_frame, mixed_float_frame, - mixed_int_frame): + def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): frame_copy = float_frame.reindex(float_frame.index[::2]) - del frame_copy['D'] - frame_copy['C'][:5] = np.nan + del frame_copy["D"] + frame_copy["C"][:5] = np.nan added = float_frame + frame_copy - indexer = added['A'].dropna().index - exp = (float_frame['A'] * 2).copy() + indexer = added["A"].dropna().index + exp = (float_frame["A"] * 2).copy() - tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer]) + tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer]) exp.loc[~exp.index.isin(indexer)] = np.nan - tm.assert_series_equal(added['A'], exp.loc[added['A'].index]) + tm.assert_series_equal(added["A"], exp.loc[added["A"].index]) - assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all() + assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all() # assert(False) - assert np.isnan(added['D']).all() + assert np.isnan(added["D"]).all() self_added = float_frame + float_frame tm.assert_index_equal(self_added.index, float_frame.index) added_rev = frame_copy + float_frame - assert np.isnan(added['D']).all() - assert np.isnan(added_rev['D']).all() + assert np.isnan(added["D"]).all() + assert np.isnan(added_rev["D"]).all() # corner cases @@ -360,9 +383,9 @@ def test_combineFrame(self, float_frame, mixed_float_frame, # mix vs float64, upcast added = float_frame + mixed_float_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") added = mixed_float_frame + float_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") # mix vs mix added = mixed_float_frame + mixed_float_frame @@ -370,10 +393,11 @@ def test_combineFrame(self, float_frame, mixed_float_frame, # with int added = float_frame + mixed_int_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") - def test_combineSeries(self, float_frame, mixed_float_frame, - mixed_int_frame, datetime_frame): + def test_combineSeries( + self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): # Series series = float_frame.xs(float_frame.index[0]) @@ -384,23 +408,23 @@ def test_combineSeries(self, float_frame, mixed_float_frame, assert_series_equal(s, float_frame[key] + series[key]) larger_series = series.to_dict() - larger_series['E'] = 1 + larger_series["E"] = 1 larger_series = Series(larger_series) larger_added = float_frame + larger_series for key, s in float_frame.items(): assert_series_equal(larger_added[key], s + series[key]) - assert 'E' in larger_added - assert np.isnan(larger_added['E']).all() + assert "E" in larger_added + assert np.isnan(larger_added["E"]).all() # no upcast needed added = mixed_float_frame + series _check_mixed_float(added) # vs mix (upcast) as needed - added = mixed_float_frame + series.astype('float32') + added = mixed_float_frame + series.astype("float32") _check_mixed_float(added, dtype=dict(C=None)) - added = mixed_float_frame + series.astype('float16') + added = mixed_float_frame + series.astype("float16") 
_check_mixed_float(added, dtype=dict(C=None)) # these raise with numexpr.....as we are adding an int64 to an @@ -414,46 +438,48 @@ def test_combineSeries(self, float_frame, mixed_float_frame, # 'int32', D = 'int64')) # TimeSeries - ts = datetime_frame['A'] + ts = datetime_frame["A"] # 10890 # we no longer allow auto timeseries broadcasting # and require explicit broadcasting - added = datetime_frame.add(ts, axis='index') + added = datetime_frame.add(ts, axis="index") for key, col in datetime_frame.items(): result = col + ts assert_series_equal(added[key], result, check_names=False) assert added[key].name == key if col.name == ts.name: - assert result.name == 'A' + assert result.name == "A" else: assert result.name is None smaller_frame = datetime_frame[:-5] - smaller_added = smaller_frame.add(ts, axis='index') + smaller_added = smaller_frame.add(ts, axis="index") tm.assert_index_equal(smaller_added.index, datetime_frame.index) smaller_ts = ts[:-5] - smaller_added2 = datetime_frame.add(smaller_ts, axis='index') + smaller_added2 = datetime_frame.add(smaller_ts, axis="index") assert_frame_equal(smaller_added, smaller_added2) # length 0, result is all-nan - result = datetime_frame.add(ts[:0], axis='index') - expected = DataFrame(np.nan, index=datetime_frame.index, - columns=datetime_frame.columns) + result = datetime_frame.add(ts[:0], axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) assert_frame_equal(result, expected) # Frame is all-nan - result = datetime_frame[:0].add(ts, axis='index') - expected = DataFrame(np.nan, index=datetime_frame.index, - columns=datetime_frame.columns) + result = datetime_frame[:0].add(ts, axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) assert_frame_equal(result, expected) # empty but with non-empty index frame = datetime_frame[:1].reindex(columns=[]) - result = frame.mul(ts, axis='index') + result = frame.mul(ts, axis="index") assert len(result) == len(ts) def test_combineFunc(self, float_frame, mixed_float_frame): @@ -463,8 +489,7 @@ def test_combineFunc(self, float_frame, mixed_float_frame): # vs mix result = mixed_float_frame * 2 for c, s in result.items(): - tm.assert_numpy_array_equal( - s.values, mixed_float_frame[c].values * 2) + tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2) _check_mixed_float(result, dtype=dict(C=None)) result = DataFrame() * 2 @@ -475,26 +500,25 @@ def test_comparisons(self, simple_frame, float_frame): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() - row = simple_frame.xs('a') + row = simple_frame.xs("a") ndim_5 = np.ones(df1.shape + (1, 1, 1)) def test_comp(func): result = func(df1, df2) - tm.assert_numpy_array_equal(result.values, - func(df1.values, df2.values)) + tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) - with pytest.raises(ValueError, match='dim must be <= 2'): + with pytest.raises(ValueError, match="dim must be <= 2"): func(df1, ndim_5) result2 = func(simple_frame, row) - tm.assert_numpy_array_equal(result2.values, - func(simple_frame.values, row.values)) + tm.assert_numpy_array_equal( + result2.values, func(simple_frame.values, row.values) + ) result3 = func(float_frame, 0) - tm.assert_numpy_array_equal(result3.values, - func(float_frame.values, 0)) + tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0)) - msg = 'Can only compare identically-labeled DataFrame' + msg = "Can only compare identically-labeled DataFrame" with 
pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) @@ -507,10 +531,10 @@ def test_comp(func): def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() - missing_df.iloc[0]['A'] = np.nan - with np.errstate(invalid='ignore'): + missing_df.iloc[0]["A"] = np.nan + with np.errstate(invalid="ignore"): expected = missing_df.values < 0 - with np.errstate(invalid='raise'): + with np.errstate(invalid="raise"): result = (missing_df < 0).values tm.assert_numpy_array_equal(result, expected) @@ -533,9 +557,9 @@ def test_boolean_comparison(self): result = df.values > b assert_numpy_array_equal(result, expected.values) - msg1d = 'Unable to coerce to Series, length must be 2: given 3' - msg2d = 'Unable to coerce to DataFrame, shape must be' - msg2db = 'operands could not be broadcast together with shapes' + msg1d = "Unable to coerce to Series, length must be 2: given 3" + msg2d = "Unable to coerce to DataFrame, shape must be" + msg2db = "operands could not be broadcast together with shapes" with pytest.raises(ValueError, match=msg1d): # wrong shape df > lst @@ -581,8 +605,9 @@ def test_boolean_comparison(self): assert df.values.shape != b_c.shape # with alignment - df = DataFrame(np.arange(6).reshape((3, 2)), - columns=list('AB'), index=list('abc')) + df = DataFrame( + np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") + ) expected.index = df.index expected.columns = df.columns @@ -594,18 +619,20 @@ def test_boolean_comparison(self): def test_combine_generic(self, float_frame): df1 = float_frame - df2 = float_frame.loc[float_frame.index[:-5], ['A', 'B', 'C']] + df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]] combined = df1.combine(df2, np.add) combined2 = df2.combine(df1, np.add) - assert combined['D'].isna().all() - assert combined2['D'].isna().all() + assert combined["D"].isna().all() + assert combined2["D"].isna().all() - chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']] - chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']] + chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]] + chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]] - exp = float_frame.loc[float_frame.index[:-5], - ['A', 'B', 'C']].reindex_like(chunk) * 2 + exp = ( + float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk) + * 2 + ) assert_frame_equal(chunk, exp) assert_frame_equal(chunk2, exp) @@ -614,13 +641,15 @@ def test_inplace_ops_alignment(self): # inplace ops / ops alignment # GH 8511 - columns = list('abcdefg') - X_orig = DataFrame(np.arange(10 * len(columns)) - .reshape(-1, len(columns)), - columns=columns, index=range(10)) + columns = list("abcdefg") + X_orig = DataFrame( + np.arange(10 * len(columns)).reshape(-1, len(columns)), + columns=columns, + index=range(10), + ) Z = 100 * X_orig.iloc[:, 1:-1].copy() - block1 = list('bedcf') - subs = list('bcdef') + block1 = list("bedcf") + subs = list("bcdef") # add X = X_orig.copy() @@ -697,42 +726,54 @@ def test_inplace_ops_identity(self): # mixed dtype arr = np.random.randint(0, 10, size=5) - df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'}) + df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) df = df_orig.copy() df2 = df - df['A'] += 1 - expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'}) + df["A"] += 1 + expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data df = df_orig.copy() df2 = df - df['A'] += 1.5 - expected = DataFrame({'A': arr.copy() + 
1.5, 'B': 'foo'}) + df["A"] += 1.5 + expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data - @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod', - 'mul', 'or', 'pow', 'sub', 'truediv', - 'xor']) + @pytest.mark.parametrize( + "op", + [ + "add", + "and", + "div", + "floordiv", + "mod", + "mul", + "or", + "pow", + "sub", + "truediv", + "xor", + ], + ) def test_inplace_ops_identity2(self, op): - if op == 'div': + if op == "div": return - df = DataFrame({'a': [1., 2., 3.], - 'b': [1, 2, 3]}) + df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) operand = 2 - if op in ('and', 'or', 'xor'): + if op in ("and", "or", "xor"): # cannot use floats for boolean ops - df['a'] = [True, False, True] + df["a"] = [True, False, True] df_copy = df.copy() - iop = '__i{}__'.format(op) - op = '__{}__'.format(op) + iop = "__i{}__".format(op) + op = "__{}__".format(op) # no id change and value is correct getattr(df, iop)(operand) @@ -742,55 +783,62 @@ def test_inplace_ops_identity2(self, op): assert id(df) == expected def test_alignment_non_pandas(self): - index = ['A', 'B', 'C'] - columns = ['X', 'Y', 'Z'] + index = ["A", "B", "C"] + columns = ["X", "Y", "Z"] df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) align = pd.core.ops._align_method_FRAME - for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64), - range(1, 4)]: - - tm.assert_series_equal(align(df, val, 'index'), - Series([1, 2, 3], index=df.index)) - tm.assert_series_equal(align(df, val, 'columns'), - Series([1, 2, 3], index=df.columns)) + for val in [ + [1, 2, 3], + (1, 2, 3), + np.array([1, 2, 3], dtype=np.int64), + range(1, 4), + ]: + + tm.assert_series_equal( + align(df, val, "index"), Series([1, 2, 3], index=df.index) + ) + tm.assert_series_equal( + align(df, val, "columns"), Series([1, 2, 3], index=df.columns) + ) # length mismatch - msg = 'Unable to coerce to Series, length must be 3: given 2' + msg = "Unable to coerce to Series, length must be 3: given 2" for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: with pytest.raises(ValueError, match=msg): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError, match=msg): - align(df, val, 'columns') + align(df, val, "columns") val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(align(df, val, 'index'), - DataFrame(val, index=df.index, - columns=df.columns)) - tm.assert_frame_equal(align(df, val, 'columns'), - DataFrame(val, index=df.index, - columns=df.columns)) + tm.assert_frame_equal( + align(df, val, "index"), DataFrame(val, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal( + align(df, val, "columns"), + DataFrame(val, index=df.index, columns=df.columns), + ) # shape mismatch - msg = 'Unable to coerce to DataFrame, shape must be' + msg = "Unable to coerce to DataFrame, shape must be" val = np.array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(ValueError, match=msg): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError, match=msg): - align(df, val, 'columns') + align(df, val, "columns") val = np.zeros((3, 3, 3)) with pytest.raises(ValueError): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError): - align(df, val, 'columns') + align(df, val, "columns") def test_no_warning(self, all_arithmetic_operators): - df = pd.DataFrame({"A": [0., 0.], "B": [0., None]}) - b = df['B'] + df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, 
None]}) + b = df["B"] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b, 0) @@ -798,7 +846,7 @@ def test_no_warning(self, all_arithmetic_operators): class TestTranspose: def test_transpose_tzaware_1col_single_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") df = pd.DataFrame(dti) assert (df.dtypes == dti.dtype).all() @@ -807,27 +855,27 @@ def test_transpose_tzaware_1col_single_tz(self): def test_transpose_tzaware_2col_single_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - df3 = pd.DataFrame({'A': dti, 'B': dti}) + df3 = pd.DataFrame({"A": dti, "B": dti}) assert (df3.dtypes == dti.dtype).all() res3 = df3.T assert (res3.dtypes == dti.dtype).all() def test_transpose_tzaware_2col_mixed_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') - dti2 = dti.tz_convert('US/Pacific') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") - df4 = pd.DataFrame({'A': dti, 'B': dti2}) + df4 = pd.DataFrame({"A": dti, "B": dti2}) assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() assert (df4.T.dtypes == object).all() tm.assert_frame_equal(df4.T.T, df4) def test_transpose_object_to_tzaware_mixed_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') - dti2 = dti.tz_convert('US/Pacific') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") # mixed all-tzaware dtypes df2 = pd.DataFrame([dti, dti2]) diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index e36f8107ba9fd6..a545db3365e365 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, Timedelta, date_range, - period_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Timedelta, + date_range, + period_range, + to_datetime, +) import pandas.util.testing as tm @@ -15,9 +22,8 @@ def _permute(obj): class TestPeriodIndex: - def test_as_frame_columns(self): - rng = period_range('1/1/2000', periods=5) + rng = period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) ts = df[rng[0]] @@ -26,116 +32,119 @@ def test_as_frame_columns(self): # GH # 1211 repr(df) - ts = df['1/1/2000'] + ts = df["1/1/2000"] tm.assert_series_equal(ts, df.iloc[:, 0]) def test_frame_setitem(self): - rng = period_range('1/1/2000', periods=5, name='index') + rng = period_range("1/1/2000", periods=5, name="index") df = DataFrame(np.random.randn(5, 3), index=rng) - df['Index'] = rng - rs = Index(df['Index']) + df["Index"] = rng + rs = Index(df["Index"]) tm.assert_index_equal(rs, rng, check_names=False) - assert rs.name == 'Index' - assert rng.name == 'index' + assert rs.name == "Index" + assert rng.name == "index" - rs = df.reset_index().set_index('index') + rs = df.reset_index().set_index("index") assert isinstance(rs.index, PeriodIndex) tm.assert_index_equal(rs.index, rng) def test_frame_to_time_stamp(self): K = 5 - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") df = DataFrame(np.random.randn(len(index), K), index=index) - df['mix'] = 'a' + df["mix"] = "a" - exp_index = date_range('1/1/2001', end='12/31/2009', 
freq='A-DEC') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') - result = df.to_timestamp('D', 'end') + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start') + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) + def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end') + result = df.to_timestamp("H", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end') + result = df.to_timestamp("T", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - result = df.to_timestamp('S', 'end') + result = df.to_timestamp("S", "end") delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) # columns df = df.T - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') - result = df.to_timestamp('D', 'end', axis=1) + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end", axis=1) tm.assert_index_equal(result.columns, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start', axis=1) + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start", axis=1) tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end', axis=1) + result = df.to_timestamp("H", "end", axis=1) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end', axis=1) + result = df.to_timestamp("T", "end", axis=1) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) - result = df.to_timestamp('S', 'end', axis=1) + result = df.to_timestamp("S", "end", axis=1) delta = timedelta(hours=23, minutes=59, seconds=59) exp_index 
= _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) # invalid axis - with pytest.raises(ValueError, match='axis'): + with pytest.raises(ValueError, match="axis"): df.to_timestamp(axis=2) - result1 = df.to_timestamp('5t', axis=1) - result2 = df.to_timestamp('t', axis=1) - expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS') + result1 = df.to_timestamp("5t", axis=1) + result2 = df.to_timestamp("t", axis=1) + expected = pd.date_range("2001-01-01", "2009-01-01", freq="AS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) # PeriodIndex.to_timestamp always use 'infer' - assert result1.columns.freqstr == 'AS-JAN' - assert result2.columns.freqstr == 'AS-JAN' + assert result1.columns.freqstr == "AS-JAN" + assert result2.columns.freqstr == "AS-JAN" def test_frame_index_to_string(self): - index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") frame = DataFrame(np.random.randn(3, 4), index=index) # it works! frame.to_string() def test_align_frame(self): - rng = period_range('1/1/2000', '1/1/2010', freq='A') + rng = period_range("1/1/2000", "1/1/2010", freq="A") ts = DataFrame(np.random.randn(len(rng), 3), index=rng) result = ts + ts[::2] diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 097477c42d249d..bbb3395fb23afd 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -8,92 +8,97 @@ class TestDataFrameQuantile: - def test_quantile(self, datetime_frame): from numpy import percentile df = datetime_frame q = df.quantile(0.1, axis=0) - assert q['A'] == percentile(df['A'], 10) + assert q["A"] == percentile(df["A"], 10) tm.assert_index_equal(q.index, df.columns) q = df.quantile(0.9, axis=1) - assert (q['2000-01-17'] == - percentile(df.loc['2000-01-17'], 90)) + assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90) tm.assert_index_equal(q.index, df.index) # test degenerate case - q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) - assert(np.isnan(q['x']) and np.isnan(q['y'])) + q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0) + assert np.isnan(q["x"]) and np.isnan(q["y"]) # non-numeric exclusion - df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) + df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) rs = df.quantile(0.5) xp = df.median().rename(0.5) assert_series_equal(rs, xp) # axis df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) - result = df.quantile([.5, .75], axis=1) - expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75], - 3: [3.5, 3.75]}, index=[0.5, 0.75]) + result = df.quantile([0.5, 0.75], axis=1) + expected = DataFrame( + {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75] + ) assert_frame_equal(result, expected, check_index_type=True) # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 - df = DataFrame([[1, 2, 3], - ['a', 'b', 4]]) - result = df.quantile(.5, axis=1) - expected = 
Series([3., 4.], index=[0, 1], name=0.5) + df = DataFrame([[1, 2, 3], ["a", "b", 4]]) + result = df.quantile(0.5, axis=1) + expected = Series([3.0, 4.0], index=[0, 1], name=0.5) assert_series_equal(result, expected) def test_quantile_axis_mixed(self): # mixed on axis=1 - df = DataFrame({"A": [1, 2, 3], - "B": [2., 3., 4.], - "C": pd.date_range('20130101', periods=3), - "D": ['foo', 'bar', 'baz']}) - result = df.quantile(.5, axis=1) + df = DataFrame( + { + "A": [1, 2, 3], + "B": [2.0, 3.0, 4.0], + "C": pd.date_range("20130101", periods=3), + "D": ["foo", "bar", "baz"], + } + ) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], name=0.5) assert_series_equal(result, expected) # must raise with pytest.raises(TypeError): - df.quantile(.5, axis=1, numeric_only=False) + df.quantile(0.5, axis=1, numeric_only=False) def test_quantile_axis_parameter(self): # GH 9543/9544 df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=0) + result = df.quantile(0.5, axis=0) - expected = Series([2., 3.], index=["A", "B"], name=0.5) + expected = Series([2.0, 3.0], index=["A", "B"], name=0.5) assert_series_equal(result, expected) - expected = df.quantile(.5, axis="index") + expected = df.quantile(0.5, axis="index") assert_series_equal(result, expected) - result = df.quantile(.5, axis=1) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) - result = df.quantile(.5, axis="columns") + result = df.quantile(0.5, axis="columns") assert_series_equal(result, expected) - msg = ("No axis named -1 for object type" - " ") + msg = ( + "No axis named -1 for object type" " " + ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) - msg = ("No axis named column for object type" - " ") + msg = ( + "No axis named column for object type" + " " + ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") @@ -102,45 +107,53 @@ def test_quantile_interpolation(self): # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1, interpolation='nearest') + result = df.quantile(0.5, axis=1, interpolation="nearest") expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) # cross-check interpolation=nearest results in original dtype - exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, - axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64') + exp = np.percentile( + np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest" + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64") tm.assert_series_equal(result, expected) # float - df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5) + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) - exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, - axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64') + exp = np.percentile( + np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), + 0.5, + axis=0, + interpolation="nearest", + ) + expected = 
Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") assert_series_equal(result, expected) # axis - result = df.quantile([.5, .75], axis=1, interpolation='lower') - expected = DataFrame({1: [1., 1.], 2: [2., 2.], - 3: [3., 3.]}, index=[0.5, 0.75]) + result = df.quantile([0.5, 0.75], axis=1, interpolation="lower") + expected = DataFrame( + {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] + ) assert_frame_equal(result, expected) # test degenerate case - df = DataFrame({'x': [], 'y': []}) - q = df.quantile(0.1, axis=0, interpolation='higher') - assert(np.isnan(q['x']) and np.isnan(q['y'])) + df = DataFrame({"x": [], "y": []}) + q = df.quantile(0.1, axis=0, interpolation="higher") + assert np.isnan(q["x"]) and np.isnan(q["y"]) # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5], interpolation='midpoint') + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5], interpolation="midpoint") # https://github.com/numpy/numpy/issues/7163 - expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], - index=[.25, .5], columns=['a', 'b', 'c']) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) assert_frame_equal(result, expected) def test_quantile_interpolation_datetime(self, datetime_frame): @@ -148,8 +161,8 @@ def test_quantile_interpolation_datetime(self, datetime_frame): # interpolation = linear (default case) df = datetime_frame - q = df.quantile(0.1, axis=0, interpolation='linear') - assert q['A'] == np.percentile(df['A'], 10) + q = df.quantile(0.1, axis=0, interpolation="linear") + assert q["A"] == np.percentile(df["A"], 10) def test_quantile_interpolation_int(self, int_frame): # see gh-10174 @@ -157,65 +170,73 @@ def test_quantile_interpolation_int(self, int_frame): df = int_frame # interpolation = linear (default case) q = df.quantile(0.1) - assert q['A'] == np.percentile(df['A'], 10) + assert q["A"] == np.percentile(df["A"], 10) # test with and without interpolation keyword - q1 = df.quantile(0.1, axis=0, interpolation='linear') - assert q1['A'] == np.percentile(df['A'], 10) + q1 = df.quantile(0.1, axis=0, interpolation="linear") + assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) def test_quantile_multi(self): - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5]) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=['a', 'b', 'c']) + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5]) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) assert_frame_equal(result, expected) # axis = 1 - result = df.quantile([.25, .5], axis=1) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=[0, 1, 2]) + result = df.quantile([0.25, 0.5], axis=1) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2] + ) # empty - result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) - expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, - index=[.1, .9]) + result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0) + expected = DataFrame( + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + ) assert_frame_equal(result, expected) def test_quantile_datetime(self): - df = 
DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]}) + df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) # exclude datetime - result = df.quantile(.5) - expected = Series([2.5], index=['b']) + result = df.quantile(0.5) + expected = Series([2.5], index=["b"]) # datetime - result = df.quantile(.5, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b'], - name=0.5) + result = df.quantile(0.5, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5 + ) assert_series_equal(result, expected) # datetime w/ multi - result = df.quantile([.5], numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], - index=[.5], columns=['a', 'b']) + result = df.quantile([0.5], numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"] + ) assert_frame_equal(result, expected) # axis = 1 - df['c'] = pd.to_datetime(['2011', '2012']) - result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')], - index=[0, 1], - name=0.5) + df["c"] = pd.to_datetime(["2011", "2012"]) + result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], + index=[0, 1], + name=0.5, + ) assert_series_equal(result, expected) - result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')]], - index=[0.5], columns=[0, 1]) + result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], + index=[0.5], + columns=[0, 1], + ) assert_frame_equal(result, expected) # empty when numeric_only=True @@ -224,96 +245,146 @@ def test_quantile_datetime(self): # result = df[['a', 'c']].quantile([.5]) def test_quantile_invalid(self, datetime_frame): - msg = 'percentiles should all be in the interval \\[0, 1\\]' + msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): datetime_frame.quantile(invalid) def test_quantile_box(self): - df = DataFrame({'A': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'C': [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')]}) + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + } + ) res = df.quantile(0.5, numeric_only=False) - exp = pd.Series([pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days')], - name=0.5, index=['A', 'B', 'C']) + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=["A", "B", "C"], + ) tm.assert_series_equal(res, exp) res = df.quantile([0.5], 
numeric_only=False) - exp = pd.DataFrame([[pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days')]], - index=[0.5], columns=['A', 'B', 'C']) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(res, exp) # DatetimeBlock may be consolidated and contain NaT in different loc - df = DataFrame({'A': [pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - 'a': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.NaT, - pd.Timestamp('2011-01-03')], - 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.NaT, - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.NaT, - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'C': [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days'), - pd.NaT], - 'c': [pd.NaT, - pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')]}, - columns=list('AaBbCc')) + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "a": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.NaT, + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + "c": [ + pd.NaT, + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + columns=list("AaBbCc"), + ) res = df.quantile(0.5, numeric_only=False) - exp = pd.Series([pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days'), - pd.Timedelta('2 days')], - name=0.5, index=list('AaBbCc')) + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=list("AaBbCc"), + ) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = pd.DataFrame([[pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days'), - pd.Timedelta('2 days')]], - index=[0.5], columns=list('AaBbCc')) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=list("AaBbCc"), + ) tm.assert_frame_equal(res, exp) def test_quantile_nan(self): # GH 14357 - float block where some cols have missing values - df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan res = 
df.quantile(0.5) - exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + exp = Series([3.0, 2.5], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5, 0.75]) - exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) res = df.quantile(0.5, axis=1) @@ -325,57 +396,62 @@ def test_quantile_nan(self): tm.assert_frame_equal(res, exp) # full-nan column - df['b'] = np.nan + df["b"] = np.nan res = df.quantile(0.5) - exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5, 0.75]) - exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, - index=[0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) def test_quantile_nat(self): # full NaT column - df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) res = df.quantile(0.5, numeric_only=False) - exp = Series([pd.NaT], index=['a'], name=0.5) + exp = Series([pd.NaT], index=["a"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) tm.assert_frame_equal(res, exp) # mixed non-null / full null column - df = DataFrame({'a': [pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2012-01-03')], - 'b': [pd.NaT, pd.NaT, pd.NaT]}) + df = DataFrame( + { + "a": [ + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-03"), + ], + "b": [pd.NaT, pd.NaT, pd.NaT], + } + ) res = df.quantile(0.5, numeric_only=False) - exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], - name=0.5) + exp = Series([pd.Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], - columns=['a', 'b']) + exp = DataFrame( + [[pd.Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] + ) tm.assert_frame_equal(res, exp) def test_quantile_empty(self): # floats - df = DataFrame(columns=['a', 'b'], dtype='float64') + df = DataFrame(columns=["a", "b"], dtype="float64") res = df.quantile(0.5) - exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5]) - exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) tm.assert_frame_equal(res, exp) # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) @@ -383,13 +459,13 @@ def test_quantile_empty(self): # res = df.quantile([0.5], axis=1) # ints - df = DataFrame(columns=['a', 'b'], dtype='int64') + df = DataFrame(columns=["a", "b"], dtype="int64") # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) # res = df.quantile(0.5) # datetimes - df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0ed484f678fdf0..70c58471dd0d46 100644 --- 
a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -11,10 +11,13 @@ from pandas.core.computation.check import _NUMEXPR_INSTALLED from pandas.tests.frame.common import TestData from pandas.util.testing import ( - assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf) + assert_frame_equal, + assert_series_equal, + makeCustomDataframe as mkdf, +) -PARSERS = 'python', 'pandas' -ENGINES = 'python', pytest.param('numexpr', marks=td.skip_if_no_ne) +PARSERS = "python", "pandas" +ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @pytest.fixture(params=PARSERS, ids=lambda x: x) @@ -28,14 +31,13 @@ def engine(request): def skip_if_no_pandas_parser(parser): - if parser != 'pandas': + if parser != "pandas": pytest.skip("cannot evaluate with parser {0!r}".format(parser)) class TestCompat: - def setup_method(self, method): - self.df = DataFrame({'A': [1, 2, 3]}) + self.df = DataFrame({"A": [1, 2, 3]}) self.expected1 = self.df[self.df.A > 0] self.expected2 = self.df.A + 1 @@ -44,44 +46,43 @@ def test_query_default(self): # GH 12749 # this should always work, whether _NUMEXPR_INSTALLED or not df = self.df - result = df.query('A>0') + result = df.query("A>0") assert_frame_equal(result, self.expected1) - result = df.eval('A+1') + result = df.eval("A+1") assert_series_equal(result, self.expected2, check_names=False) def test_query_None(self): df = self.df - result = df.query('A>0', engine=None) + result = df.query("A>0", engine=None) assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine=None) + result = df.eval("A+1", engine=None) assert_series_equal(result, self.expected2, check_names=False) def test_query_python(self): df = self.df - result = df.query('A>0', engine='python') + result = df.query("A>0", engine="python") assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine='python') + result = df.eval("A+1", engine="python") assert_series_equal(result, self.expected2, check_names=False) def test_query_numexpr(self): df = self.df if _NUMEXPR_INSTALLED: - result = df.query('A>0', engine='numexpr') + result = df.query("A>0", engine="numexpr") assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine='numexpr') + result = df.eval("A+1", engine="numexpr") assert_series_equal(result, self.expected2, check_names=False) else: with pytest.raises(ImportError): - df.query('A>0', engine='numexpr') + df.query("A>0", engine="numexpr") with pytest.raises(ImportError): - df.eval('A+1', engine='numexpr') + df.eval("A+1", engine="numexpr") class TestDataFrameEval(TestData): - def test_ops(self): # tst ops and reversed ops in evaluation @@ -90,18 +91,20 @@ def test_ops(self): # smaller hits python, larger hits numexpr for n in [4, 4000]: - df = DataFrame(1, index=range(n), columns=list('abcd')) + df = DataFrame(1, index=range(n), columns=list("abcd")) df.iloc[0] = 2 m = df.mean() - for op_str, op, rop in [('+', '__add__', '__radd__'), - ('-', '__sub__', '__rsub__'), - ('*', '__mul__', '__rmul__'), - ('/', '__truediv__', '__rtruediv__')]: + for op_str, op, rop in [ + ("+", "__add__", "__radd__"), + ("-", "__sub__", "__rsub__"), + ("*", "__mul__", "__rmul__"), + ("/", "__truediv__", "__rtruediv__"), + ]: - base = (DataFrame(np.tile(m.values, n) # noqa - .reshape(n, -1), - columns=list('abcd'))) + base = DataFrame( + np.tile(m.values, n).reshape(n, -1), columns=list("abcd") # noqa + ) expected = eval("base{op}df".format(op=op_str)) @@ -110,25 +113,25 @@ def test_ops(self): assert_frame_equal(result, 
expected) # these are commutative - if op in ['+', '*']: + if op in ["+", "*"]: result = getattr(df, op)(m) assert_frame_equal(result, expected) # these are not - elif op in ['-', '/']: + elif op in ["-", "/"]: result = getattr(df, rop)(m) assert_frame_equal(result, expected) # GH7192 df = DataFrame(dict(A=np.random.randn(25000))) df.iloc[0:5] = np.nan - expected = (1 - np.isnan(df.iloc[0:25])) + expected = 1 - np.isnan(df.iloc[0:25]) result = (1 - np.isnan(df)).iloc[0:25] assert_frame_equal(result, expected) def test_query_non_str(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'b']}) + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]}) msg = "expr must be a string to be evaluated" with pytest.raises(ValueError, match=msg): @@ -139,119 +142,116 @@ def test_query_non_str(self): def test_query_empty_string(self): # GH 13139 - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) msg = "expr cannot be an empty string" with pytest.raises(ValueError, match=msg): - df.query('') + df.query("") def test_eval_resolvers_as_list(self): # GH 14095 - df = DataFrame(np.random.randn(10, 2), columns=list('ab')) - dict1 = {'a': 1} - dict2 = {'b': 2} - assert (df.eval('a + b', resolvers=[dict1, dict2]) == - dict1['a'] + dict2['b']) - assert (pd.eval('a + b', resolvers=[dict1, dict2]) == - dict1['a'] + dict2['b']) + df = DataFrame(np.random.randn(10, 2), columns=list("ab")) + dict1 = {"a": 1} + dict2 = {"b": 2} + assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] class TestDataFrameQueryWithMultiIndex: - def test_query_with_named_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) - b = np.random.choice(['eggs', 'ham'], size=10) - index = MultiIndex.from_arrays([a, b], names=['color', 'food']) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) + index = MultiIndex.from_arrays([a, b], names=["color", "food"]) df = DataFrame(np.random.randn(10, 2), index=index) - ind = Series(df.index.get_level_values('color').values, index=index, - name='color') + ind = Series( + df.index.get_level_values("color").values, index=index, name="color" + ) # equality res1 = df.query('color == "red"', parser=parser, engine=engine) res2 = df.query('"red" == color', parser=parser, engine=engine) - exp = df[ind == 'red'] + exp = df[ind == "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('color != "red"', parser=parser, engine=engine) res2 = df.query('"red" != color', parser=parser, engine=engine) - exp = df[ind != 'red'] + exp = df[ind != "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('color == ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] == color', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('color != ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] != color', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["red"] in color', parser=parser, engine=engine) res2 = df.query('"red" in color', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = 
df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('["red"] not in color', parser=parser, engine=engine) res2 = df.query('"red" not in color', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) def test_query_with_unnamed_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) - b = np.random.choice(['eggs', 'ham'], size=10) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) index = MultiIndex.from_arrays([a, b]) df = DataFrame(np.random.randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) - exp = df[ind == 'red'] + exp = df[ind == "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) - exp = df[ind != 'red'] + exp = df[ind != "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) - res1 = df.query('["red"] not in ilevel_0', parser=parser, - engine=engine) + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) @@ -259,60 +259,61 @@ def test_query_with_unnamed_multiindex(self, parser, engine): ind = Series(df.index.get_level_values(1).values, index=index) res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) - exp = df[ind == 'eggs'] + exp = df[ind == "eggs"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine) - exp = df[ind != 'eggs'] + exp = df[ind != "eggs"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) - exp = df[ind.isin(['eggs'])] + exp = df[ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) res2 = df.query('["eggs"] != ilevel_1', 
parser=parser, engine=engine) - exp = df[~ind.isin(['eggs'])] + exp = df[~ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) - exp = df[ind.isin(['eggs'])] + exp = df[ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) - res1 = df.query('["eggs"] not in ilevel_1', parser=parser, - engine=engine) + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) - exp = df[~ind.isin(['eggs'])] + exp = df[~ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) def test_query_with_partially_named_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) + a = np.random.choice(["red", "green"], size=10) b = np.arange(10) index = MultiIndex.from_arrays([a, b]) - index.names = [None, 'rating'] + index.names = [None, "rating"] df = DataFrame(np.random.randn(10, 2), index=index) - res = df.query('rating == 1', parser=parser, engine=engine) - ind = Series(df.index.get_level_values('rating').values, index=index, - name='rating') + res = df.query("rating == 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) exp = df[ind == 1] assert_frame_equal(res, exp) - res = df.query('rating != 1', parser=parser, engine=engine) - ind = Series(df.index.get_level_values('rating').values, index=index, - name='rating') + res = df.query("rating != 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) exp = df[ind != 1] assert_frame_equal(res, exp) @@ -327,7 +328,7 @@ def test_query_with_partially_named_multiindex(self, parser, engine): assert_frame_equal(res, exp) def test_query_multiindex_get_index_resolvers(self): - df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) + df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"]) resolvers = df._get_index_resolvers() def to_series(mi, level): @@ -337,11 +338,13 @@ def to_series(mi, level): return s col_series = df.columns.to_series() - expected = {'index': df.index, - 'columns': col_series, - 'spam': to_series(df.index, 'spam'), - 'eggs': to_series(df.index, 'eggs'), - 'C0': col_series} + expected = { + "index": df.index, + "columns": col_series, + "spam": to_series(df.index, "spam"), + "eggs": to_series(df.index, "eggs"), + "C0": col_series, + } for k, v in resolvers.items(): if isinstance(v, Index): assert v.is_(expected[k]) @@ -353,11 +356,10 @@ def to_series(mi, level): @td.skip_if_no_ne class TestDataFrameQueryNumExprPandas: - @classmethod def setup_class(cls): - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" @classmethod def teardown_class(cls): @@ -367,186 +369,190 @@ def test_date_query_with_attribute_access(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + 
df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates2'] = date_range('1/1/2013', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.set_index('dates1', inplace=True, drop=True) - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def 
test_date_index_query_with_NaT_duplicates(self): engine, parser = self.engine, self.parser n = 10 d = {} - d['dates1'] = date_range('1/1/2012', periods=n) - d['dates3'] = date_range('1/1/2014', periods=n) + d["dates1"] = date_range("1/1/2012", periods=n) + d["dates3"] = date_range("1/1/2014", periods=n) df = DataFrame(d) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index.to_series() < '20130101') & - ('20130101' < df.dates3)] + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_non_date(self): engine, parser = self.engine, self.parser n = 10 - df = DataFrame({'dates': date_range('1/1/2012', periods=n), - 'nondate': np.arange(n)}) + df = DataFrame( + {"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)} + ) - result = df.query('dates == nondate', parser=parser, engine=engine) + result = df.query("dates == nondate", parser=parser, engine=engine) assert len(result) == 0 - result = df.query('dates != nondate', parser=parser, engine=engine) + result = df.query("dates != nondate", parser=parser, engine=engine) assert_frame_equal(result, df) - for op in ['<', '>', '<=', '>=']: + for op in ["<", ">", "<=", ">="]: with pytest.raises(TypeError): - df.query('dates %s nondate' % op, parser=parser, engine=engine) + df.query("dates %s nondate" % op, parser=parser, engine=engine) def test_query_syntax_error(self): engine, parser = self.engine, self.parser - df = DataFrame({"i": range(10), "+": range(3, 13), - "r": range(4, 14)}) + df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)}) with pytest.raises(SyntaxError): - df.query('i - +', engine=engine, parser=parser) + df.query("i - +", engine=engine, parser=parser) def test_query_scope(self): from pandas.core.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + df = DataFrame(np.random.randn(20, 2), columns=list("ab")) a, b = 1, 2 # noqa - res = df.query('a > b', engine=engine, parser=parser) + res = df.query("a > b", engine=engine, parser=parser) expected = df[df.a > df.b] assert_frame_equal(res, expected) - res = df.query('@a > b', engine=engine, parser=parser) + res = df.query("@a > b", engine=engine, parser=parser) expected = df[a > df.b] assert_frame_equal(res, expected) # no local variable c with pytest.raises(UndefinedVariableError): - df.query('@a > b > @c', engine=engine, parser=parser) + df.query("@a > b > @c", engine=engine, parser=parser) # no column named 'c' with pytest.raises(UndefinedVariableError): - df.query('@a > b > c', engine=engine, parser=parser) + df.query("@a > b > c", engine=engine, parser=parser) def test_query_doesnt_pickup_local(self): from pandas.core.computation.ops import UndefinedVariableError engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) # we don't pick up the local 'sin' with pytest.raises(UndefinedVariableError): - df.query('sin > 5', engine=engine, parser=parser) + df.query("sin > 5", engine=engine, 
parser=parser) def test_query_builtin(self): from pandas.core.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' - msg = 'Variables in expression.+' + df.index.name = "sin" + msg = "Variables in expression.+" with pytest.raises(NumExprClobberingError, match=msg): - df.query('sin > 5', engine=engine, parser=parser) + df.query("sin > 5", engine=engine, parser=parser) def test_query(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) - assert_frame_equal(df.query('a < b', engine=engine, parser=parser), - df[df.a < df.b]) - assert_frame_equal(df.query('a + b > b * c', engine=engine, - parser=parser), - df[df.a + df.b > df.b * df.c]) + assert_frame_equal( + df.query("a < b", engine=engine, parser=parser), df[df.a < df.b] + ) + assert_frame_equal( + df.query("a + b > b * c", engine=engine, parser=parser), + df[df.a + df.b > df.b * df.c], + ) def test_query_index_with_name(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=Index(range(10), name="blob"), + columns=["a", "b", "c"], + ) + res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser) expec = df[(df.index < 5) & (df.a < df.b)] assert_frame_equal(res, expec) - res = df.query('blob < b', engine=engine, parser=parser) + res = df.query("blob < b", engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec) def test_query_index_without_name(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=range(10), columns=['a', 'b', 'c']) + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=range(10), + columns=["a", "b", "c"], + ) # "index" should refer to the index - res = df.query('index < b', engine=engine, parser=parser) + res = df.query("index < b", engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec) # test against a scalar - res = df.query('index < 5', engine=engine, parser=parser) + res = df.query("index < 5", engine=engine, parser=parser) expec = df[df.index < 5] assert_frame_equal(res, expec) @@ -560,21 +566,20 @@ def test_nested_scope(self): df2 = DataFrame(np.random.randn(5, 3)) expected = df[(df > 0) & (df2 > 0)] - result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, - parser=parser) + result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, - parser=parser) + result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', - engine=engine, parser=parser) + result = pd.eval( + "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser + ) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(@df>0) & (@df2>0)', engine=engine, 
parser=parser) + result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser) + expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) assert_frame_equal(result, expected) def test_nested_raises_on_local_self_reference(self): @@ -584,104 +589,106 @@ def test_nested_raises_on_local_self_reference(self): # can't reference ourself b/c we're a local so @ is necessary with pytest.raises(UndefinedVariableError): - df.query('df > 0', engine=self.engine, parser=self.parser) + df.query("df > 0", engine=self.engine, parser=self.parser) def test_local_syntax(self): skip_if_no_pandas_parser(self.parser) engine, parser = self.engine, self.parser - df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij')) + df = DataFrame(np.random.randn(100, 10), columns=list("abcdefghij")) b = 1 expect = df[df.a < b] - result = df.query('a < @b', engine=engine, parser=parser) + result = df.query("a < @b", engine=engine, parser=parser) assert_frame_equal(result, expect) expect = df[df.a < df.b] - result = df.query('a < b', engine=engine, parser=parser) + result = df.query("a < b", engine=engine, parser=parser) assert_frame_equal(result, expect) def test_chained_cmp_and_in(self): skip_if_no_pandas_parser(self.parser) engine, parser = self.engine, self.parser - cols = list('abc') + cols = list("abc") df = DataFrame(np.random.randn(100, len(cols)), columns=cols) - res = df.query('a < b < c and a not in b not in c', engine=engine, - parser=parser) - ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) # noqa + res = df.query( + "a < b < c and a not in b not in c", engine=engine, parser=parser + ) + ind = ( + (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + ) # noqa expec = df[ind] assert_frame_equal(res, expec) def test_local_variable_with_in(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - a = Series(np.random.randint(3, size=15), name='a') - b = Series(np.random.randint(10, size=15), name='b') - df = DataFrame({'a': a, 'b': b}) + a = Series(np.random.randint(3, size=15), name="a") + b = Series(np.random.randint(10, size=15), name="b") + df = DataFrame({"a": a, "b": b}) expected = df.loc[(df.b - 1).isin(a)] - result = df.query('b - 1 in a', engine=engine, parser=parser) + result = df.query("b - 1 in a", engine=engine, parser=parser) assert_frame_equal(expected, result) - b = Series(np.random.randint(10, size=15), name='b') + b = Series(np.random.randint(10, size=15), name="b") expected = df.loc[(b - 1).isin(a)] - result = df.query('@b - 1 in a', engine=engine, parser=parser) + result = df.query("@b - 1 in a", engine=engine, parser=parser) assert_frame_equal(expected, result) def test_at_inside_string(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) c = 1 # noqa - df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']}) + df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]}) result = df.query('a == "@c"', engine=engine, parser=parser) expected = df[df.a == "@c"] assert_frame_equal(result, expected) def test_query_undefined_local(self): from pandas.core.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - df = DataFrame(np.random.rand(10, 2), columns=list('ab')) + df = DataFrame(np.random.rand(10, 2), columns=list("ab")) msg = "local variable 'c' is not defined" with pytest.raises(UndefinedVariableError, match=msg): - df.query('a == @c', engine=engine, parser=parser) + df.query("a == @c", engine=engine, 
parser=parser) def test_index_resolvers_come_after_columns_with_the_same_name(self): n = 1 # noqa a = np.r_[20:101:20] - df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) - df.index.name = 'index' - result = df.query('index > 5', engine=self.engine, parser=self.parser) - expected = df[df['index'] > 5] + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + df.index.name = "index" + result = df.query("index > 5", engine=self.engine, parser=self.parser) + expected = df[df["index"] > 5] assert_frame_equal(result, expected) - df = DataFrame({'index': a, - 'b': np.random.randn(a.size)}) - result = df.query('ilevel_0 > 5', engine=self.engine, - parser=self.parser) + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + result = df.query("ilevel_0 > 5", engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected) - df = DataFrame({'a': a, 'b': np.random.randn(a.size)}) - df.index.name = 'a' - result = df.query('a > 5', engine=self.engine, parser=self.parser) + df = DataFrame({"a": a, "b": np.random.randn(a.size)}) + df.index.name = "a" + result = df.query("a > 5", engine=self.engine, parser=self.parser) expected = df[df.a > 5] assert_frame_equal(result, expected) - result = df.query('index > 5', engine=self.engine, parser=self.parser) + result = df.query("index > 5", engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected) def test_inf(self): n = 10 - df = DataFrame({'a': np.random.rand(n), 'b': np.random.rand(n)}) + df = DataFrame({"a": np.random.rand(n), "b": np.random.rand(n)}) df.loc[::2, 0] = np.inf - ops = '==', '!=' + ops = "==", "!=" d = dict(zip(ops, (operator.eq, operator.ne))) for op, f in d.items(): - q = 'a %s inf' % op + q = "a %s inf" % op expected = df[f(df.a, np.inf)] result = df.query(q, engine=self.engine, parser=self.parser) assert_frame_equal(result, expected) @@ -689,82 +696,86 @@ def test_inf(self): @td.skip_if_no_ne class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" cls.frame = TestData().frame def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(dates1 < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates2'] = date_range('1/1/2013', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT - res = df.query('(dates1 < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.dates1 
< '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.set_index('dates1', inplace=True, drop=True) - res = df.query('(index < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('(index < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT_duplicates(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) with pytest.raises(NotImplementedError): - df.query('index < 20130101 < dates3', engine=engine, parser=parser) + df.query("index < 20130101 < dates3", engine=engine, parser=parser) def test_nested_scope(self): from pandas.core.computation.ops import UndefinedVariableError + engine = self.engine parser = self.parser # smoke test x = 1 # noqa - result = pd.eval('x + 1', engine=engine, parser=parser) + result = pd.eval("x + 1", engine=engine, parser=parser) assert result == 2 df = DataFrame(np.random.randn(5, 3)) @@ -772,170 +783,175 @@ def test_nested_scope(self): # don't have the pandas parser with pytest.raises(SyntaxError): - df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) with pytest.raises(UndefinedVariableError): - df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + df.query("(df>0) & (df2>0)", engine=engine, parser=parser) 
expected = df[(df > 0) & (df2 > 0)] - result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, - parser=parser) + result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser) assert_frame_equal(expected, result) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', - engine=engine, parser=parser) + result = pd.eval( + "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser + ) assert_frame_equal(expected, result) class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.frame = TestData().frame def test_query_builtin(self): engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' + df.index.name = "sin" expected = df[df.index > 5] - result = df.query('sin > 5', engine=engine, parser=parser) + result = df.query("sin > 5", engine=engine, parser=parser) assert_frame_equal(expected, result) class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = cls.parser = 'python' + cls.engine = cls.parser = "python" cls.frame = TestData().frame def test_query_builtin(self): engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' + df.index.name = "sin" expected = df[df.index > 5] - result = df.query('sin > 5', engine=engine, parser=parser) + result = df.query("sin > 5", engine=engine, parser=parser) assert_frame_equal(expected, result) class TestDataFrameQueryStrings: - def test_str_query_method(self, parser, engine): - df = DataFrame(np.random.randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - expect = df[df.strings == 'a'] + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings == "a"] - if parser != 'pandas': - col = 'strings' + if parser != "pandas": + col = "strings" lst = '"a"' lhs = [col] * 2 + [lst] * 2 rhs = lhs[::-1] - eq, ne = '==', '!=' + eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) for lhs, op, rhs in zip(lhs, ops, rhs): - ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) msg = r"'(Not)?In' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - df.query(ex, engine=engine, parser=parser, - local_dict={'strings': df.strings}) + df.query( + ex, + engine=engine, + parser=parser, + local_dict={"strings": df.strings}, + ) else: res = df.query('"a" == strings', engine=engine, parser=parser) assert_frame_equal(res, expect) res = df.query('strings == "a"', engine=engine, parser=parser) assert_frame_equal(res, expect) - assert_frame_equal(res, df[df.strings.isin(['a'])]) + assert_frame_equal(res, df[df.strings.isin(["a"])]) - expect = df[df.strings != 'a'] + expect = df[df.strings != "a"] res = df.query('strings != "a"', engine=engine, parser=parser) assert_frame_equal(res, expect) res = df.query('"a" != strings', engine=engine, parser=parser) assert_frame_equal(res, expect) - assert_frame_equal(res, df[~df.strings.isin(['a'])]) + 
assert_frame_equal(res, df[~df.strings.isin(["a"])]) def test_str_list_query_method(self, parser, engine): - df = DataFrame(np.random.randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - expect = df[df.strings.isin(['a', 'b'])] + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings.isin(["a", "b"])] - if parser != 'pandas': - col = 'strings' + if parser != "pandas": + col = "strings" lst = '["a", "b"]' lhs = [col] * 2 + [lst] * 2 rhs = lhs[::-1] - eq, ne = '==', '!=' + eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) for lhs, op, rhs in zip(lhs, ops, rhs): - ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) with pytest.raises(NotImplementedError): df.query(ex, engine=engine, parser=parser) else: - res = df.query('strings == ["a", "b"]', engine=engine, - parser=parser) + res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) assert_frame_equal(res, expect) - res = df.query('["a", "b"] == strings', engine=engine, - parser=parser) + res = df.query('["a", "b"] == strings', engine=engine, parser=parser) assert_frame_equal(res, expect) - expect = df[~df.strings.isin(['a', 'b'])] + expect = df[~df.strings.isin(["a", "b"])] - res = df.query('strings != ["a", "b"]', engine=engine, - parser=parser) + res = df.query('strings != ["a", "b"]', engine=engine, parser=parser) assert_frame_equal(res, expect) - res = df.query('["a", "b"] != strings', engine=engine, - parser=parser) + res = df.query('["a", "b"] != strings', engine=engine, parser=parser) assert_frame_equal(res, expect) def test_query_with_string_columns(self, parser, engine): - df = DataFrame({'a': list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - if parser == 'pandas': - res = df.query('a in b', parser=parser, engine=engine) + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + if parser == "pandas": + res = df.query("a in b", parser=parser, engine=engine) expec = df[df.a.isin(df.b)] assert_frame_equal(res, expec) - res = df.query('a in b and c < d', parser=parser, engine=engine) + res = df.query("a in b and c < d", parser=parser, engine=engine) expec = df[df.a.isin(df.b) & (df.c < df.d)] assert_frame_equal(res, expec) else: with pytest.raises(NotImplementedError): - df.query('a in b', parser=parser, engine=engine) + df.query("a in b", parser=parser, engine=engine) with pytest.raises(NotImplementedError): - df.query('a in b and c < d', parser=parser, engine=engine) + df.query("a in b and c < d", parser=parser, engine=engine) def test_object_array_eq_ne(self, parser, engine): - df = DataFrame({'a': list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - res = df.query('a == b', parser=parser, engine=engine) + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] assert_frame_equal(res, exp) - res = df.query('a != b', parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] assert_frame_equal(res, exp) @@ -955,126 +971,131 @@ def test_query_with_nested_strings(self, parser, engine): 
6 "page 3 load" 2/1/2014 1:02:01 6 "page 3 exit" 2/1/2014 1:02:31 """ - df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python', - parse_dates=['timestamp']) + df = pd.read_csv( + StringIO(raw), sep=r"\s{2,}", engine="python", parse_dates=["timestamp"] + ) expected = df[df.event == '"page 1 load"'] - res = df.query("""'"page 1 load"' in event""", parser=parser, - engine=engine) + res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine) assert_frame_equal(expected, res) def test_query_with_nested_special_character(self, parser, engine): skip_if_no_pandas_parser(parser) - df = DataFrame({'a': ['a', 'b', 'test & test'], - 'b': [1, 2, 3]}) + df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]}) res = df.query('a == "test & test"', parser=parser, engine=engine) - expec = df[df.a == 'test & test'] + expec = df[df.a == "test & test"] assert_frame_equal(res, expec) def test_query_lex_compare_strings(self, parser, engine): import operator as opr - a = Series(np.random.choice(list('abcde'), 20)) + a = Series(np.random.choice(list("abcde"), 20)) b = Series(np.arange(a.size)) - df = DataFrame({'X': a, 'Y': b}) + df = DataFrame({"X": a, "Y": b}) - ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge} + ops = {"<": opr.lt, ">": opr.gt, "<=": opr.le, ">=": opr.ge} for op, func in ops.items(): res = df.query('X %s "d"' % op, engine=engine, parser=parser) - expected = df[func(df.X, 'd')] + expected = df[func(df.X, "d")] assert_frame_equal(res, expected) def test_query_single_element_booleans(self, parser, engine): - columns = 'bid', 'bidsize', 'ask', 'asksize' + columns = "bid", "bidsize", "ask", "asksize" data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) - res = df.query('bid & ask', engine=engine, parser=parser) + res = df.query("bid & ask", engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected) def test_query_string_scalar_variable(self, parser, engine): skip_if_no_pandas_parser(parser) - df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'], - 'Price': [109.70, 109.72, 183.30, 183.35]}) - e = df[df.Symbol == 'BUD US'] - symb = 'BUD US' # noqa - r = df.query('Symbol == @symb', parser=parser, engine=engine) + df = pd.DataFrame( + { + "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"], + "Price": [109.70, 109.72, 183.30, 183.35], + } + ) + e = df[df.Symbol == "BUD US"] + symb = "BUD US" # noqa + r = df.query("Symbol == @symb", parser=parser, engine=engine) assert_frame_equal(e, r) class TestDataFrameEvalWithFrame: - def setup_method(self, method): - self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc')) + self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc")) def teardown_method(self, method): del self.frame def test_simple_expr(self, parser, engine): - res = self.frame.eval('a + b', engine=engine, parser=parser) + res = self.frame.eval("a + b", engine=engine, parser=parser) expect = self.frame.a + self.frame.b assert_series_equal(res, expect) def test_bool_arith_expr(self, parser, engine): - res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser) + res = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser) expect = self.frame.a[self.frame.a < 1] + self.frame.b assert_series_equal(res, expect) - @pytest.mark.parametrize('op', ['+', '-', '*', '/']) + @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): - df = DataFrame({'a': [1, 2], 'b': 
['c', 'd']}) + df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" with pytest.raises(TypeError, match=msg): - df.eval('a {0} b'.format(op), engine=engine, parser=parser) + df.eval("a {0} b".format(op), engine=engine, parser=parser) class TestDataFrameQueryBacktickQuoting: - - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def df(self): - yield DataFrame({'A': [1, 2, 3], - 'B B': [3, 2, 1], - 'C C': [4, 5, 6], - 'C_C': [8, 9, 10], - 'D_D D': [11, 1, 101]}) + yield DataFrame( + { + "A": [1, 2, 3], + "B B": [3, 2, 1], + "C C": [4, 5, 6], + "C_C": [8, 9, 10], + "D_D D": [11, 1, 101], + } + ) def test_single_backtick_variable_query(self, df): - res = df.query('1 < `B B`') - expect = df[1 < df['B B']] + res = df.query("1 < `B B`") + expect = df[1 < df["B B"]] assert_frame_equal(res, expect) def test_two_backtick_variables_query(self, df): - res = df.query('1 < `B B` and 4 < `C C`') - expect = df[(1 < df['B B']) & (4 < df['C C'])] + res = df.query("1 < `B B` and 4 < `C C`") + expect = df[(1 < df["B B"]) & (4 < df["C C"])] assert_frame_equal(res, expect) def test_single_backtick_variable_expr(self, df): - res = df.eval('A + `B B`') - expect = df['A'] + df['B B'] + res = df.eval("A + `B B`") + expect = df["A"] + df["B B"] assert_series_equal(res, expect) def test_two_backtick_variables_expr(self, df): - res = df.eval('`B B` + `C C`') - expect = df['B B'] + df['C C'] + res = df.eval("`B B` + `C C`") + expect = df["B B"] + df["C C"] assert_series_equal(res, expect) def test_already_underscore_variable(self, df): - res = df.eval('`C_C` + A') - expect = df['C_C'] + df['A'] + res = df.eval("`C_C` + A") + expect = df["C_C"] + df["A"] assert_series_equal(res, expect) def test_same_name_but_underscores(self, df): - res = df.eval('C_C + `C C`') - expect = df['C_C'] + df['C C'] + res = df.eval("C_C + `C C`") + expect = df["C_C"] + df["C C"] assert_series_equal(res, expect) def test_mixed_underscores_and_spaces(self, df): - res = df.eval('A + `D_D D`') - expect = df['A'] + df['D_D D'] + res = df.eval("A + `D_D D`") + expect = df["A"] + df["D_D D"] assert_series_equal(res, expect) def backtick_quote_name_with_no_spaces(self, df): - res = df.eval('A + `C_C`') - expect = df['A'] + df['C_C'] + res = df.eval("A + `C_C`") + expect = df["A"] + df["C_C"] assert_series_equal(res, expect) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index c93defe7c64a62..fd9c53c7d9f5bb 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -10,18 +10,17 @@ class TestRank: s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) - df = DataFrame({'A': s, 'B': s}) + df = DataFrame({"A": s, "B": s}) results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, np.nan, - 3.5, 1.5, 8.0, np.nan, 5.5]), - 'min': np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), - 'max': np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), - 'first': np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), - 'dense': np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } - @pytest.fixture(params=['average', 'min', 'max', 'first', 'dense']) + @pytest.fixture(params=["average", "min", "max", "first", "dense"]) def 
method(self, request): """ Fixture for trying all rank methods @@ -29,12 +28,12 @@ def method(self, request): return request.param def test_rank(self, float_frame): - rankdata = pytest.importorskip('scipy.stats.rankdata') + rankdata = pytest.importorskip("scipy.stats.rankdata") - float_frame['A'][::2] = np.nan - float_frame['B'][::3] = np.nan - float_frame['C'][::4] = np.nan - float_frame['D'][::5] = np.nan + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan ranks0 = float_frame.rank() ranks1 = float_frame.rank(1) @@ -73,7 +72,7 @@ def test_rank2(self): result = df.rank(0, pct=True) tm.assert_frame_equal(result, expected) - df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) + df = DataFrame([["b", "c", "a"], ["a", "c", "b"]]) expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) result = df.rank(1, numeric_only=False) tm.assert_frame_equal(result, expected) @@ -82,7 +81,7 @@ def test_rank2(self): result = df.rank(0, numeric_only=False) tm.assert_frame_equal(result, expected) - df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) + df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]]) expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]]) result = df.rank(1, numeric_only=False) tm.assert_frame_equal(result, expected) @@ -92,46 +91,44 @@ def test_rank2(self): tm.assert_frame_equal(result, expected) # f7u12, this does not work without extensive workaround - data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 1)]] + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] df = DataFrame(data) # check the rank - expected = DataFrame([[2., np.nan, 1.], - [2., 3., 1.]]) + expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]]) result = df.rank(1, numeric_only=False, ascending=True) tm.assert_frame_equal(result, expected) - expected = DataFrame([[1., np.nan, 2.], - [2., 1., 3.]]) + expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]]) result = df.rank(1, numeric_only=False, ascending=False) tm.assert_frame_equal(result, expected) - df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, - 1e60, 1e80, 1e-30]}) - exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}) + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) tm.assert_frame_equal(df.rank(), exp) def test_rank_mixed_frame(self, float_string_frame): - float_string_frame['datetime'] = datetime.now() - float_string_frame['timedelta'] = timedelta(days=1, seconds=1) + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) result = float_string_frame.rank(1) expected = float_string_frame.rank(1, numeric_only=True) tm.assert_frame_equal(result, expected) def test_rank_na_option(self, float_frame): - rankdata = pytest.importorskip('scipy.stats.rankdata') + rankdata = pytest.importorskip("scipy.stats.rankdata") - float_frame['A'][::2] = np.nan - float_frame['B'][::3] = np.nan - float_frame['C'][::4] = np.nan - float_frame['D'][::5] = np.nan + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan # bottom - ranks0 = float_frame.rank(na_option='bottom') - ranks1 = float_frame.rank(1, na_option='bottom') + ranks0 = float_frame.rank(na_option="bottom") + ranks1 = float_frame.rank(1, 
na_option="bottom") fvals = float_frame.fillna(np.inf).values @@ -142,8 +139,8 @@ def test_rank_na_option(self, float_frame): tm.assert_almost_equal(ranks1.values, exp1) # top - ranks0 = float_frame.rank(na_option='top') - ranks1 = float_frame.rank(1, na_option='top') + ranks0 = float_frame.rank(na_option="top") + ranks1 = float_frame.rank(1, na_option="top") fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values fval1 = float_frame.T @@ -159,8 +156,8 @@ def test_rank_na_option(self, float_frame): # descending # bottom - ranks0 = float_frame.rank(na_option='top', ascending=False) - ranks1 = float_frame.rank(1, na_option='top', ascending=False) + ranks0 = float_frame.rank(na_option="top", ascending=False) + ranks1 = float_frame.rank(1, na_option="top", ascending=False) fvals = float_frame.fillna(np.inf).values @@ -173,8 +170,8 @@ def test_rank_na_option(self, float_frame): # descending # top - ranks0 = float_frame.rank(na_option='bottom', ascending=False) - ranks1 = float_frame.rank(1, na_option='bottom', ascending=False) + ranks0 = float_frame.rank(na_option="bottom", ascending=False) + ranks1 = float_frame.rank(1, na_option="bottom", ascending=False) fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values fval1 = float_frame.T @@ -191,7 +188,7 @@ def test_rank_na_option(self, float_frame): msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - float_frame.rank(na_option='bad', ascending=False) + float_frame.rank(na_option="bad", ascending=False) # invalid type with pytest.raises(ValueError, match=msg): @@ -200,35 +197,34 @@ def test_rank_na_option(self, float_frame): def test_rank_axis(self): # check if using axes' names gives the same result df = DataFrame([[2, 1], [4, 3]]) - tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index')) - tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) def test_rank_methods_frame(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") xs = np.random.randint(0, 21, (100, 26)) xs = (xs - 10.0) / 10.0 - cols = [chr(ord('z') - i) for i in range(xs.shape[1])] + cols = [chr(ord("z") - i) for i in range(xs.shape[1])] for vals in [xs, xs + 1e6, xs * 1e-6]: df = DataFrame(vals, columns=cols) for ax in [0, 1]: - for m in ['average', 'min', 'max', 'first', 'dense']: + for m in ["average", "min", "max", "first", "dense"]: result = df.rank(axis=ax, method=m) sprank = np.apply_along_axis( - rankdata, ax, vals, - m if m != 'first' else 'ordinal') + rankdata, ax, vals, m if m != "first" else "ordinal" + ) sprank = sprank.astype(np.float64) - expected = DataFrame(sprank, - columns=cols).astype('float64') + expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) def test_rank_descending(self, method, dtype): - if 'i' in dtype: + if "i" in dtype: df = self.df.dropna() else: df = self.df.astype(dtype) @@ -237,27 +233,25 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() assert_frame_equal(res, expected) - if method == 'first' and dtype == 'O': + if method == "first" and dtype == "O": return expected = (df.max() - 
df).rank(method=method) - if dtype != 'O': - res2 = df.rank(method=method, ascending=False, - numeric_only=True) + if dtype != "O": + res2 = df.rank(method=method, ascending=False, numeric_only=True) assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, - numeric_only=False) + res3 = df.rank(method=method, ascending=False, numeric_only=False) assert_frame_equal(res3, expected) - @pytest.mark.parametrize('axis', [0, 1]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): df = self.df - def _check2d(df, expected, method='average', axis=0): - exp_df = DataFrame({'A': expected, 'B': expected}) + def _check2d(df, expected, method="average", axis=0): + exp_df = DataFrame({"A": expected, "B": expected}) if axis == 1: df = df.T @@ -266,33 +260,42 @@ def _check2d(df, expected, method='average', axis=0): result = df.rank(method=method, axis=axis) assert_frame_equal(result, exp_df) - disabled = {(object, 'first')} + disabled = {(object, "first")} if (dtype, method) in disabled: return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, axis=axis) @pytest.mark.parametrize( - "method,exp", [("dense", - [[1., 1., 1.], - [1., 0.5, 2. / 3], - [1., 0.5, 1. / 3]]), - ("min", - [[1. / 3, 1., 1.], - [1. / 3, 1. / 3, 2. / 3], - [1. / 3, 1. / 3, 1. / 3]]), - ("max", - [[1., 1., 1.], - [1., 2. / 3, 2. / 3], - [1., 2. / 3, 1. / 3]]), - ("average", - [[2. / 3, 1., 1.], - [2. / 3, 0.5, 2. / 3], - [2. / 3, 0.5, 1. / 3]]), - ("first", - [[1. / 3, 1., 1.], - [2. / 3, 1. / 3, 2. / 3], - [3. / 3, 2. / 3, 1. / 3]])]) + "method,exp", + [ + ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), + ( + "min", + [ + [1.0 / 3, 1.0, 1.0], + [1.0 / 3, 1.0 / 3, 2.0 / 3], + [1.0 / 3, 1.0 / 3, 1.0 / 3], + ], + ), + ( + "max", + [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]], + ), + ( + "average", + [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]], + ), + ( + "first", + [ + [1.0 / 3, 1.0, 1.0], + [2.0 / 3, 1.0 / 3, 2.0 / 3], + [3.0 / 3, 2.0 / 3, 1.0 / 3], + ], + ), + ], + ) def test_rank_pct_true(self, method, exp): # see gh-15630. 
@@ -306,7 +309,8 @@ def test_rank_pct_true(self, method, exp): @pytest.mark.high_memory def test_pct_max_many_rows(self): # GH 18271 - df = DataFrame({'A': np.arange(2**24 + 1), - 'B': np.arange(2**24 + 1, 0, -1)}) + df = DataFrame( + {"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)} + ) result = df.rank(pct=True).max() assert (result == 1).all() diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2513508822fecd..2862615ef8585a 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -14,24 +14,18 @@ @pytest.fixture def mix_ab() -> Dict[str, list]: - return {'a': list(range(4)), - 'b': list('ab..'), - } + return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture def mix_abc() -> Dict[str, list]: - return {'a': list(range(4)), - 'b': list('ab..'), - 'c': ['a', 'b', np.nan, 'd'], - } + return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} class TestDataFrameReplace(TestData): - def test_replace_inplace(self): - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = np.nan + self.tsframe["A"][:5] = np.nan + self.tsframe["A"][-5:] = np.nan tsframe = self.tsframe.copy() tsframe.replace(np.nan, 0, inplace=True) @@ -39,8 +33,8 @@ def test_replace_inplace(self): # mixed type mf = self.mixed_frame - mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan - mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = self.mixed_frame.replace(np.nan, 0) expected = self.mixed_frame.fillna(value=0) @@ -51,72 +45,72 @@ def test_replace_inplace(self): assert_frame_equal(tsframe, self.tsframe.fillna(0)) def test_regex_replace_scalar(self, mix_ab): - obj = {'a': list('ab..'), 'b': list('efgh')} + obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) # simplest cases # regex -> value # obj frame - res = dfobj.replace(r'\s*\.\s*', np.nan, regex=True) - assert_frame_equal(dfobj, res.fillna('.')) + res = dfobj.replace(r"\s*\.\s*", np.nan, regex=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed - res = dfmix.replace(r'\s*\.\s*', np.nan, regex=True) - assert_frame_equal(dfmix, res.fillna('.')) + res = dfmix.replace(r"\s*\.\s*", np.nan, regex=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame - res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + res = dfobj.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed - res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + res = dfmix.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well - res = dfobj.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True) - assert_frame_equal(dfobj, res.fillna('.')) + res = dfobj.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed - res = dfmix.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True) - assert_frame_equal(dfmix, res.fillna('.')) + res = dfmix.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame - res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + res = 
dfobj.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed - res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + res = dfmix.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) - res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1') + res = dfmix.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) - res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1') + res = dfmix.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) def test_regex_replace_scalar_inplace(self, mix_ab): - obj = {'a': list('ab..'), 'b': list('efgh')} + obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) @@ -124,209 +118,229 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # regex -> value # obj frame res = dfobj.copy() - res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() - res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, - inplace=True) + res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, - inplace=True) + res.replace(re.compile(r"\s*(\.)\s*"), 
r"\1\1\1", regex=True, inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) res = dfobj.copy() - res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() - res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', - inplace=True) + res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', - inplace=True) + res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) def test_regex_replace_list_obj(self): - obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'e|f|g'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] res = dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], - 'b': ['crap'] * 3 + ['h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] res = 
dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['e_crap', 'f_crap', 'g_crap', 'h'], - 'c': ['h', 'e_crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.replace(value=values, regex=to_replace_res) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) def test_regex_replace_list_obj_inplace(self): # same as above with inplace=True # lists of regexes and values - obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'e|f|g'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], - 'b': ['crap'] * 3 + ['h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['e_crap', 'f_crap', 'g_crap', 'h'], - 'c': ['h', 'e_crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = 
[r"\1\1", r"crap"] res = dfobj.copy() res.replace(value=values, regex=to_replace_res, inplace=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) def test_regex_replace_list_mixed(self, mix_ab): @@ -335,42 +349,39 @@ def test_regex_replace_list_mixed(self, mix_ab): # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'a'] - values = [np.nan, 'crap'] - mix2 = {'a': list(range(4)), 'b': list('ab..'), 'c': list('halo')} + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} dfmix2 = DataFrame(mix2) res = dfmix2.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix2['a'], - 'b': ['crap', 'b', np.nan, np.nan], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": mix2["a"], + "b": ["crap", "b", np.nan, np.nan], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['a_crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(regex=to_replace_res, value=values) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) def test_regex_replace_list_mixed_inplace(self, mix_ab): @@ -378,43 +389,35 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab): # the same inplace # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'a'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b', np.nan, np.nan], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = 
DataFrame({'a': mix_ab['a'], - 'b': ['a_crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() res.replace(regex=to_replace_res, value=values, inplace=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) def test_regex_replace_dict_mixed(self, mix_abc): @@ -426,76 +429,67 @@ def test_regex_replace_dict_mixed(self, mix_abc): # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole # frame - res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) + res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, - inplace=True, regex=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the # whole frame - res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) res2 = dfmix.copy() - res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True, - regex=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', '.ty', '.ty'], - 'c': mix_abc['c'], - }) + res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) - res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}) + res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) res2 = dfmix.copy() - res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}, - inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', '.ty', '.ty'], - 'c': mix_abc['c'], - }) + res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) # scalar -> dict # to_replace regex, {value: value} - expec = DataFrame({'a': mix_abc['a'], - 'b': [np.nan, 'b', '.', '.'], - 'c': mix_abc['c'], - }) - res = dfmix.replace('a', {'b': np.nan}, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = 
dfmix.copy() - res2.replace('a', {'b': np.nan}, regex=True, inplace=True) + res2.replace("a", {"b": np.nan}, regex=True, inplace=True) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) - res = dfmix.replace('a', {'b': np.nan}, regex=True) + res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace(regex='a', value={'b': np.nan}, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': [np.nan, 'b', '.', '.'], - 'c': mix_abc['c'], - }) + res2.replace(regex="a", value={"b": np.nan}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) def test_regex_replace_dict_nested(self, mix_abc): # nested dicts will not work until this is implemented for Series dfmix = DataFrame(mix_abc) - res = dfmix.replace({'b': {r'\s*\.\s*': np.nan}}, regex=True) + res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) res2 = dfmix.copy() res4 = dfmix.copy() - res2.replace({'b': {r'\s*\.\s*': np.nan}}, inplace=True, regex=True) - res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': np.nan}}) - res4.replace(regex={'b': {r'\s*\.\s*': np.nan}}, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True) + res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) + res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) @@ -503,28 +497,31 @@ def test_regex_replace_dict_nested(self, mix_abc): def test_regex_replace_dict_nested_non_first_character(self): # GH 25259 - df = pd.DataFrame({'first': ['abc', 'bca', 'cab']}) - expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']}) - result = df.replace({'a': '.'}, regex=True) + df = pd.DataFrame({"first": ["abc", "bca", "cab"]}) + expected = pd.DataFrame({"first": [".bc", "bc.", "c.b"]}) + result = df.replace({"a": "."}, regex=True) assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): - df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2}) - expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2}) - result = df.replace({'Type': {'Q': 0, 'T': 1}}) + df = pd.DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) + result = df.replace({"Type": {"Q": 0, "T": 1}}) assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) - expec = DataFrame({'a': mix_abc['a'], - 'b': np.array([np.nan] * 4), - 'c': [np.nan, np.nan, np.nan, 'd'], - }) - res = df.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True) + expec = DataFrame( + { + "a": mix_abc["a"], + "b": np.array([np.nan] * 4), + "c": [np.nan, np.nan, np.nan, "d"], + } + ) + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - res2.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True, inplace=True) - res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=np.nan, inplace=True) + res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True) + res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) @@ -532,122 +529,122 @@ def 
test_regex_replace_list_to_scalar(self, mix_abc): def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) - res = df.replace(r'\s*\.\s*', 0, regex=True) + res = df.replace(r"\s*\.\s*", 0, regex=True) res2 = df.copy() - res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True) + res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) res3 = df.copy() - res3.replace(regex=r'\s*\.\s*', value=0, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', 0, 0], - 'c': mix_abc['c'], - }) + res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) - res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True) + res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) res2 = df.copy() - res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True) + res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) res3 = df.copy() - res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 0, 0, 0], - 'c': ['a', 0, np.nan, 'd'], - }) + res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_series_of_regexes(self, mix_abc): df = DataFrame(mix_abc) - s1 = Series({'b': r'\s*\.\s*'}) - s2 = Series({'b': np.nan}) + s1 = Series({"b": r"\s*\.\s*"}) + s2 = Series({"b": np.nan}) res = df.replace(s1, s2, regex=True) res2 = df.copy() res2.replace(s1, s2, inplace=True, regex=True) res3 = df.copy() res3.replace(regex=s1, value=s2, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_numeric_to_object_conversion(self, mix_abc): df = DataFrame(mix_abc) - expec = DataFrame({'a': ['a', 1, 2, 3], - 'b': mix_abc['b'], - 'c': mix_abc['c'], - }) - res = df.replace(0, 'a') + expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) + res = df.replace(0, "a") assert_frame_equal(res, expec) assert res.a.dtype == np.object_ - @pytest.mark.parametrize('metachar', ['[]', '()', r'\d', r'\w', r'\s']) + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): - df = DataFrame({'a': [metachar, 'else']}) - result = df.replace({'a': {metachar: 'paren'}}) - expected = DataFrame({'a': ['paren', 'else']}) + df = DataFrame({"a": [metachar, "else"]}) + result = df.replace({"a": {metachar: "paren"}}) + expected = DataFrame({"a": ["paren", "else"]}) assert_frame_equal(result, expected) def test_replace(self): - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = np.nan + self.tsframe["A"][:5] = np.nan + self.tsframe["A"][-5:] = np.nan zero_filled = self.tsframe.replace(np.nan, -1e8) assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8)) assert_frame_equal(zero_filled.replace(-1e8, np.nan), self.tsframe) - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = 
np.nan - self.tsframe['B'][:5] = -1e8 + self.tsframe["A"][:5] = np.nan + self.tsframe["A"][-5:] = np.nan + self.tsframe["B"][:5] = -1e8 # empty - df = DataFrame(index=['a', 'b']) + df = DataFrame(index=["a", "b"]) assert_frame_equal(df, df.replace(5, 7)) # GH 11698 # test for mixed data types. - df = pd.DataFrame([('-', pd.to_datetime('20150101')), - ('a', pd.to_datetime('20150102'))]) - df1 = df.replace('-', np.nan) - expected_df = pd.DataFrame([(np.nan, pd.to_datetime('20150101')), - ('a', pd.to_datetime('20150102'))]) + df = pd.DataFrame( + [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + df1 = df.replace("-", np.nan) + expected_df = pd.DataFrame( + [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) assert_frame_equal(df1, expected_df) def test_replace_list(self): - obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] - to_replace_res = [r'.', r'e'] - values = [np.nan, 'crap'] + to_replace_res = [r".", r"e"] + values = [np.nan, "crap"] res = dfobj.replace(to_replace_res, values) - expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] - to_replace_res = [r'.', r'f'] - values = [r'..', r'crap'] + to_replace_res = [r".", r"f"] + values = [r"..", r"crap"] res = dfobj.replace(to_replace_res, values) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['e', 'crap', 'g', 'h'], - 'c': ['h', 'e', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e", "crap", "g", "h"], + "c": ["h", "e", "l", "o"], + } + ) assert_frame_equal(res, expec) def test_replace_with_empty_list(self): # GH 21977 - s = pd.Series([['a', 'b'], [], np.nan, [1]]) - df = pd.DataFrame({'col': s}) + s = pd.Series([["a", "b"], [], np.nan, [1]]) + df = pd.DataFrame({"col": s}) expected = df result = df.replace([], np.nan) assert_frame_equal(result, expected) @@ -656,25 +653,23 @@ def test_replace_with_empty_list(self): with pytest.raises(ValueError, match="cannot assign mismatch"): df.replace({np.nan: []}) with pytest.raises(ValueError, match="cannot assign mismatch"): - df.replace({np.nan: ['dummy', 'alt']}) + df.replace({np.nan: ["dummy", "alt"]}) def test_replace_series_dict(self): # from GH 3064 - df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}}) - result = df.replace(0, {'zero': 0.5, 'one': 1.0}) - expected = DataFrame( - {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}}) + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + result = df.replace(0, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}}) assert_frame_equal(result, expected) result = df.replace(0, df.mean()) assert_frame_equal(result, expected) # series to series/dict - df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}}) - s = Series({'zero': 0.0, 'one': 2.0}) - result = df.replace(s, {'zero': 0.5, 'one': 1.0}) - expected = DataFrame( - {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}}) + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + s = Series({"zero": 0.0, "one": 
2.0}) + result = df.replace(s, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) assert_frame_equal(result, expected) result = df.replace(s, df.mean()) @@ -682,8 +677,8 @@ def test_replace_series_dict(self): def test_replace_convert(self): # gh 3907 - df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']]) - m = {'foo': 1, 'bar': 2, 'bah': 3} + df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes @@ -691,8 +686,8 @@ def test_replace_convert(self): def test_replace_mixed(self): mf = self.mixed_frame - mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan - mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = self.mixed_frame.replace(np.nan, -18) expected = self.mixed_frame.fillna(value=-18) @@ -705,10 +700,18 @@ def test_replace_mixed(self): assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame) # int block upcasting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64')}) - expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0.5, 1], dtype='float64')}) + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + } + ) result = df.replace(0, 0.5) assert_frame_equal(result, expected) @@ -716,57 +719,78 @@ def test_replace_mixed(self): assert_frame_equal(df, expected) # int block splitting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64'), - 'C': Series([1, 2], dtype='int64')}) - expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0.5, 1], dtype='float64'), - 'C': Series([1, 2], dtype='int64')}) + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + "C": Series([1, 2], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + "C": Series([1, 2], dtype="int64"), + } + ) result = df.replace(0, 0.5) assert_frame_equal(result, expected) # to object block upcasting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64')}) - expected = DataFrame({'A': Series([1, 'foo'], dtype='object'), - 'B': Series([0, 1], dtype='int64')}) - result = df.replace(2, 'foo') + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1, "foo"], dtype="object"), + "B": Series([0, 1], dtype="int64"), + } + ) + result = df.replace(2, "foo") assert_frame_equal(result, expected) - expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'), - 'B': Series([0, 'foo'], dtype='object')}) - result = df.replace([1, 2], ['foo', 'bar']) + expected = DataFrame( + { + "A": Series(["foo", "bar"], dtype="object"), + "B": Series([0, "foo"], dtype="object"), + } + ) + result = df.replace([1, 2], ["foo", "bar"]) assert_frame_equal(result, expected) # test case from - df = DataFrame({'A': Series([3, 0], dtype='int64'), - 'B': Series([0, 3], dtype='int64')}) + df = DataFrame( + {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} + ) result = df.replace(3, 
df.mean().to_dict()) - expected = df.copy().astype('float64') + expected = df.copy().astype("float64") m = df.mean() expected.iloc[0, 0] = m[0] expected.iloc[1, 1] = m[1] assert_frame_equal(result, expected) def test_replace_simple_nested_dict(self): - df = DataFrame({'col': range(1, 5)}) - expected = DataFrame({'col': ['a', 2, 3, 'b']}) + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) - result = df.replace({'col': {1: 'a', 4: 'b'}}) + result = df.replace({"col": {1: "a", 4: "b"}}) assert_frame_equal(expected, result) # in this case, should be the same as the not nested version - result = df.replace({1: 'a', 4: 'b'}) + result = df.replace({1: "a", 4: "b"}) assert_frame_equal(expected, result) def test_replace_simple_nested_dict_with_nonexistent_value(self): - df = DataFrame({'col': range(1, 5)}) - expected = DataFrame({'col': ['a', 2, 3, 'b']}) + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) - result = df.replace({-1: '-', 1: 'a', 4: 'b'}) + result = df.replace({-1: "-", 1: "a", 4: "b"}) assert_frame_equal(expected, result) - result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}}) + result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) assert_frame_equal(expected, result) def test_replace_value_is_none(self): @@ -793,82 +817,129 @@ def test_replace_for_new_dtypes(self): # dtypes tsframe = self.tsframe.copy().astype(np.float32) - tsframe['A'][:5] = np.nan - tsframe['A'][-5:] = np.nan + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan zero_filled = tsframe.replace(np.nan, -1e8) assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) - tsframe['A'][:5] = np.nan - tsframe['A'][-5:] = np.nan - tsframe['B'][:5] = -1e8 + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan + tsframe["B"][:5] = -1e8 - b = tsframe['B'] + b = tsframe["B"] b[b == -1e8] = np.nan - tsframe['B'] = b - result = tsframe.fillna(method='bfill') - assert_frame_equal(result, tsframe.fillna(method='bfill')) - - @pytest.mark.parametrize('frame, to_replace, value, expected', [ - (DataFrame({'ints': [1, 2, 3]}), 1, 0, - DataFrame({'ints': [0, 2, 3]})), - (DataFrame({'ints': [1, 2, 3]}, dtype=np.int32), 1, 0, - DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)), - (DataFrame({'ints': [1, 2, 3]}, dtype=np.int16), 1, 0, - DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)), - (DataFrame({'bools': [True, False, True]}), False, True, - DataFrame({'bools': [True, True, True]})), - (DataFrame({'complex': [1j, 2j, 3j]}), 1j, 0, - DataFrame({'complex': [0j, 2j, 3j]})), - (DataFrame({'datetime64': Index([datetime(2018, 5, 28), - datetime(2018, 7, 28), - datetime(2018, 5, 28)])}), - datetime(2018, 5, 28), datetime(2018, 7, 28), - DataFrame({'datetime64': Index([datetime(2018, 7, 28)] * 3)})), - # GH 20380 - (DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['foo']}), - 'foo', 'bar', - DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['bar']})), - (DataFrame({'A': date_range('20130101', periods=3, tz='US/Eastern'), - 'B': [0, np.nan, 2]}), - Timestamp('20130102', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]})) - ]) + tsframe["B"] = b + result = tsframe.fillna(method="bfill") + assert_frame_equal(result, tsframe.fillna(method="bfill")) + + @pytest.mark.parametrize( + "frame, to_replace, value, expected", + [ + 
(DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), + ), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), + ), + ( + DataFrame({"bools": [True, False, True]}), + False, + True, + DataFrame({"bools": [True, True, True]}), + ), + ( + DataFrame({"complex": [1j, 2j, 3j]}), + 1j, + 0, + DataFrame({"complex": [0j, 2j, 3j]}), + ), + ( + DataFrame( + { + "datetime64": Index( + [ + datetime(2018, 5, 28), + datetime(2018, 7, 28), + datetime(2018, 5, 28), + ] + ) + } + ), + datetime(2018, 5, 28), + datetime(2018, 7, 28), + DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), + ), + # GH 20380 + ( + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), + "foo", + "bar", + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), + ), + ( + DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ), + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ), + ), + ], + ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = getattr(frame, 'replace')(to_replace, value) + result = getattr(frame, "replace")(to_replace, value) assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): # both dicts - to_rep = {'A': np.nan, 'B': 0, 'C': ''} - values = {'A': 0, 'B': -1, 'C': 'missing'} - df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + to_rep = {"A": np.nan, "B": 0, "C": ""} + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) filled = df.replace(to_rep, values) expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) result = df.replace([0, 2, 5], [5, 2, 0]) - expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0], - 'C': ['', 'asdf', 'fd']}) + expected = DataFrame( + {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} + ) assert_frame_equal(result, expected) # scalar to dict - values = {'A': 0, 'B': -1, 'C': 'missing'} - df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) filled = df.replace(np.nan, values) expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) # list to list - to_rep = [np.nan, 0, ''] - values = [-2, -1, 'missing'] + to_rep = [np.nan, 0, ""] + values = [-2, -1, "missing"] result = df.replace(to_rep, values) expected = df.copy() for i in range(len(to_rep)): @@ -880,21 +951,22 @@ def test_replace_input_formats_listlike(self): df.replace(to_rep, values[1:]) def test_replace_input_formats_scalar(self): - df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) # dict to scalar - to_rep = {'A': np.nan, 'B': 0, 'C': ''} + to_rep = {"A": np.nan, "B": 0, "C": ""} filled = df.replace(to_rep, 0) expected = {k: v.replace(to_rep[k], 
0) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) msg = "value argument must be scalar, dict, or Series" with pytest.raises(TypeError, match=msg): - df.replace(to_rep, [np.nan, 0, '']) + df.replace(to_rep, [np.nan, 0, ""]) # list to scalar - to_rep = [np.nan, 0, ''] + to_rep = [np.nan, 0, ""] result = df.replace(to_rep, -1) expected = df.copy() for i in range(len(to_rep)): @@ -905,19 +977,45 @@ def test_replace_limit(self): pass def test_replace_dict_no_regex(self): - answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: - 'Disagree', 4: 'Strongly Disagree'}) - weights = {'Agree': 4, 'Disagree': 2, 'Neutral': 3, 'Strongly Agree': - 5, 'Strongly Disagree': 1} + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) assert_series_equal(result, expected) def test_replace_series_no_regex(self): - answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: - 'Disagree', 4: 'Strongly Disagree'}) - weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3, - 'Strongly Agree': 5, 'Strongly Disagree': 1}) + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = Series( + { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) assert_series_equal(result, expected) @@ -928,7 +1026,7 @@ def test_replace_dict_tuple_list_ordering_remains_the_same(self): res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) - expected = DataFrame({'A': [0, -1e8]}) + expected = DataFrame({"A": [0, -1e8]}) assert_frame_equal(res1, res2) assert_frame_equal(res2, res3) assert_frame_equal(res3, expected) @@ -939,19 +1037,19 @@ def test_replace_doesnt_replace_without_regex(self): 1 2 vr 0 0 2 2 0 0 0 3 3 0 bt 0""" - df = pd.read_csv(StringIO(raw), sep=r'\s+') - res = df.replace({r'\D': 1}) + df = pd.read_csv(StringIO(raw), sep=r"\s+") + res = df.replace({r"\D": 1}) assert_frame_equal(df, res) def test_replace_bool_with_string(self): - df = DataFrame({'a': [True, False], 'b': list('ab')}) - result = df.replace(True, 'a') - expected = DataFrame({'a': ['a', False], 'b': df.b}) + df = DataFrame({"a": [True, False], "b": list("ab")}) + result = df.replace(True, "a") + expected = DataFrame({"a": ["a", False], "b": df.b}) assert_frame_equal(result, expected) def test_replace_pure_bool_with_string_no_op(self): df = DataFrame(np.random.rand(2, 2) > 0.5) - result = df.replace('asdf', 'fdsa') + result = df.replace("asdf", "fdsa") assert_frame_equal(df, result) def test_replace_bool_with_bool(self): @@ -962,93 +1060,102 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - with pytest.raises(TypeError, match='Cannot compare types .+'): - df.replace({'asdf': 'asdb', True: 'yes'}) + with pytest.raises(TypeError, match="Cannot compare types .+"): + df.replace({"asdf": "asdb", True: "yes"}) def test_replace_truthy(self): - df = DataFrame({'a': [True, True]}) + df = DataFrame({"a": [True, True]}) r = df.replace([np.inf, -np.inf], np.nan) e = df assert_frame_equal(r, e) def 
test_replace_int_to_int_chain(self): - df = DataFrame({'a': list(range(1, 5))}) + df = DataFrame({"a": list(range(1, 5))}) with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({'a': dict(zip(range(1, 5), range(2, 6)))}) + df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) def test_replace_str_to_str_chain(self): a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) - df = DataFrame({'a': astr}) + df = DataFrame({"a": astr}) with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({'a': dict(zip(astr, bstr))}) + df.replace({"a": dict(zip(astr, bstr))}) def test_replace_swapping_bug(self): - df = pd.DataFrame({'a': [True, False, True]}) - res = df.replace({'a': {True: 'Y', False: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + df = pd.DataFrame({"a": [True, False, True]}) + res = df.replace({"a": {True: "Y", False: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) assert_frame_equal(res, expect) - df = pd.DataFrame({'a': [0, 1, 0]}) - res = df.replace({'a': {0: 'Y', 1: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + df = pd.DataFrame({"a": [0, 1, 0]}) + res = df.replace({"a": {0: "Y", 1: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) assert_frame_equal(res, expect) def test_replace_period(self): d = { - 'fname': { - 'out_augmented_AUG_2011.json': - pd.Period(year=2011, month=8, freq='M'), - 'out_augmented_JAN_2011.json': - pd.Period(year=2011, month=1, freq='M'), - 'out_augmented_MAY_2012.json': - pd.Period(year=2012, month=5, freq='M'), - 'out_augmented_SUBSIDY_WEEK.json': - pd.Period(year=2011, month=4, freq='M'), - 'out_augmented_AUG_2012.json': - pd.Period(year=2012, month=8, freq='M'), - 'out_augmented_MAY_2011.json': - pd.Period(year=2011, month=5, freq='M'), - 'out_augmented_SEP_2013.json': - pd.Period(year=2013, month=9, freq='M')}} - - df = pd.DataFrame(['out_augmented_AUG_2012.json', - 'out_augmented_SEP_2013.json', - 'out_augmented_SUBSIDY_WEEK.json', - 'out_augmented_MAY_2012.json', - 'out_augmented_MAY_2011.json', - 'out_augmented_AUG_2011.json', - 'out_augmented_JAN_2011.json'], columns=['fname']) - assert set(df.fname.values) == set(d['fname'].keys()) + "fname": { + "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), + "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), + "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), + "out_augmented_SUBSIDY_WEEK.json": pd.Period( + year=2011, month=4, freq="M" + ), + "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), + "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), + "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) # We don't support converting object -> specialized EA in # replace yet. 
- expected = DataFrame({'fname': [d['fname'][k] - for k in df.fname.values]}, - dtype=object) + expected = DataFrame( + {"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object + ) result = df.replace(d) assert_frame_equal(result, expected) def test_replace_datetime(self): - d = {'fname': - {'out_augmented_AUG_2011.json': pd.Timestamp('2011-08'), - 'out_augmented_JAN_2011.json': pd.Timestamp('2011-01'), - 'out_augmented_MAY_2012.json': pd.Timestamp('2012-05'), - 'out_augmented_SUBSIDY_WEEK.json': pd.Timestamp('2011-04'), - 'out_augmented_AUG_2012.json': pd.Timestamp('2012-08'), - 'out_augmented_MAY_2011.json': pd.Timestamp('2011-05'), - 'out_augmented_SEP_2013.json': pd.Timestamp('2013-09')}} - - df = pd.DataFrame(['out_augmented_AUG_2012.json', - 'out_augmented_SEP_2013.json', - 'out_augmented_SUBSIDY_WEEK.json', - 'out_augmented_MAY_2012.json', - 'out_augmented_MAY_2011.json', - 'out_augmented_AUG_2011.json', - 'out_augmented_JAN_2011.json'], columns=['fname']) - assert set(df.fname.values) == set(d['fname'].keys()) - expected = DataFrame({'fname': [d['fname'][k] - for k in df.fname.values]}) + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Timestamp("2011-08"), + "out_augmented_JAN_2011.json": pd.Timestamp("2011-01"), + "out_augmented_MAY_2012.json": pd.Timestamp("2012-05"), + "out_augmented_SUBSIDY_WEEK.json": pd.Timestamp("2011-04"), + "out_augmented_AUG_2012.json": pd.Timestamp("2012-08"), + "out_augmented_MAY_2011.json": pd.Timestamp("2011-05"), + "out_augmented_SEP_2013.json": pd.Timestamp("2013-09"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) result = df.replace(d) assert_frame_equal(result, expected) @@ -1056,56 +1163,83 @@ def test_replace_datetimetz(self): # GH 11326 # behaving poorly when presented with a datetime64[ns, tz] - df = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': [0, np.nan, 2]}) + df = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ) result = df.replace(np.nan, 1) - expected = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': Series([0, 1, 2], dtype='float64')}) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": Series([0, 1, 2], dtype="float64"), + } + ) assert_frame_equal(result, expected) result = df.fillna(1) assert_frame_equal(result, expected) result = df.replace(0, np.nan) - expected = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': [np.nan, np.nan, 2]}) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [np.nan, np.nan, 2], + } + ) assert_frame_equal(result, expected) - result = df.replace(Timestamp('20130102', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = df.replace( + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + ) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), 
+ Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace( - {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern')) + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern")) assert_frame_equal(result, expected) # coerce to object result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace( - {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Pacific'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Pacific"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace({'A': np.nan}, Timestamp('20130104')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = result.replace({"A": np.nan}, Timestamp("20130104")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) def test_replace_with_empty_dictlike(self, mix_abc): @@ -1114,37 +1248,44 @@ def test_replace_with_empty_dictlike(self, mix_abc): assert_frame_equal(df, df.replace({})) assert_frame_equal(df, df.replace(Series([]))) - assert_frame_equal(df, df.replace({'b': {}})) - assert_frame_equal(df, df.replace(Series({'b': {}}))) - - @pytest.mark.parametrize("to_replace, method, expected", [ - (0, 'bfill', {'A': [1, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - (np.nan, 'bfill', {'A': [0, 1, 2], - 'B': [5.0, 7.0, 7.0], - 'C': ['a', 'b', 'c']}), - ('d', 'ffill', {'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ([0, 2], 'bfill', {'A': [1, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ([1, 2], 'pad', {'A': [0, 0, 0], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ((1, 2), 'bfill', {'A': [0, 2, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - (['b', 'c'], 'ffill', {'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'a', 'a']}), - ]) + assert_frame_equal(df, df.replace({"b": {}})) + assert_frame_equal(df, df.replace(Series({"b": {}}))) + + @pytest.mark.parametrize( + "to_replace, method, expected", + [ + (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + np.nan, + "bfill", + {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, + ), + ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + [0, 2], + "bfill", + {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + [1, 2], + "pad", + {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + (1, 2), + "bfill", + {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + ["b", "c"], + "ffill", + {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, + ), + ], + ) def test_replace_method(self, to_replace, method, expected): # GH 19632 - df = DataFrame({'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}) + df 
= DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) result = df.replace(to_replace=to_replace, value=None, method=method) expected = DataFrame(expected) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 24dba8cb964cc2..c33b758d2d62c3 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -11,7 +11,13 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, date_range, option_context, period_range) + Categorical, + DataFrame, + Series, + date_range, + option_context, + period_range, +) from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -22,7 +28,6 @@ class TestDataFrameReprInfoEtc(TestData): - def test_repr_empty(self): # empty foo = repr(self.empty) # noqa @@ -41,11 +46,11 @@ def test_repr_mixed(self): @pytest.mark.slow def test_repr_mixed_big(self): # big mixed - biggie = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=range(200)) - biggie.loc[:20, 'A'] = np.nan - biggie.loc[:20, 'B'] = np.nan + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200) + ) + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan foo = repr(biggie) # noqa @@ -57,8 +62,8 @@ def test_repr(self): self.frame.info(verbose=False, buf=buf) # even smaller - self.frame.reindex(columns=['A']).info(verbose=False, buf=buf) - self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf) + self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) # exhausting cases in DataFrame.info @@ -75,45 +80,47 @@ def test_repr(self): assert "a\n" not in repr(df) def test_repr_dimensions(self): - df = DataFrame([[1, 2, ], [3, 4]]) - with option_context('display.show_dimensions', True): + df = DataFrame([[1, 2], [3, 4]]) + with option_context("display.show_dimensions", True): assert "2 rows x 2 columns" in repr(df) - with option_context('display.show_dimensions', False): + with option_context("display.show_dimensions", False): assert "2 rows x 2 columns" not in repr(df) - with option_context('display.show_dimensions', 'truncate'): + with option_context("display.show_dimensions", "truncate"): assert "2 rows x 2 columns" not in repr(df) @pytest.mark.slow def test_repr_big(self): # big one - biggie = DataFrame(np.zeros((200, 4)), columns=range(4), - index=range(200)) + biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) repr(biggie) def test_repr_unsortable(self): # columns are not sortable import warnings + warn_filters = warnings.filters - warnings.filterwarnings('ignore', - category=FutureWarning, - module=".*format") - - unsortable = DataFrame({'foo': [1] * 50, - datetime.today(): [1] * 50, - 'bar': ['bar'] * 50, - datetime.today() + timedelta(1): ['bar'] * 50}, - index=np.arange(50)) + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") + + unsortable = DataFrame( + { + "foo": [1] * 50, + datetime.today(): [1] * 50, + "bar": ["bar"] * 50, + datetime.today() + timedelta(1): ["bar"] * 50, + }, + index=np.arange(50), + ) repr(unsortable) - fmt.set_option('display.precision', 3, 'display.column_space', 10) + fmt.set_option("display.precision", 3, "display.column_space", 10) repr(self.frame) - fmt.set_option('display.max_rows', 10, 'display.max_columns', 2) + fmt.set_option("display.max_rows", 10, "display.max_columns", 2) repr(self.frame) - fmt.set_option('display.max_rows', 
1000, 'display.max_columns', 1000) + fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) repr(self.frame) tm.reset_display_options() @@ -121,51 +128,56 @@ def test_repr_unsortable(self): warnings.filters = warn_filters def test_repr_unicode(self): - uval = '\u03c3\u03c3\u03c3\u03c3' + uval = "\u03c3\u03c3\u03c3\u03c3" # TODO(wesm): is this supposed to be used? - bval = uval.encode('utf-8') # noqa + bval = uval.encode("utf-8") # noqa - df = DataFrame({'A': [uval, uval]}) + df = DataFrame({"A": [uval, uval]}) result = repr(df) - ex_top = ' A' - assert result.split('\n')[0].rstrip() == ex_top + ex_top = " A" + assert result.split("\n")[0].rstrip() == ex_top - df = DataFrame({'A': [uval, uval]}) + df = DataFrame({"A": [uval, uval]}) result = repr(df) - assert result.split('\n')[0].rstrip() == ex_top + assert result.split("\n")[0].rstrip() == ex_top def test_unicode_string_with_unicode(self): - df = DataFrame({'A': ["\u05d0"]}) + df = DataFrame({"A": ["\u05d0"]}) str(df) def test_str_to_bytes_raises(self): # GH 26447 - df = DataFrame({'A': ["abc"]}) + df = DataFrame({"A": ["abc"]}) msg = "^'str' object cannot be interpreted as an integer$" with pytest.raises(TypeError, match=msg): bytes(df) def test_very_wide_info_repr(self): - df = DataFrame(np.random.randn(10, 20), - columns=tm.rands_array(10, 20)) + df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20)) repr(df) def test_repr_column_name_unicode_truncation_bug(self): # #1906 - df = DataFrame({'Id': [7117434], - 'StringCol': ('Is it possible to modify drop plot code' - ' so that the output graph is displayed ' - 'in iphone simulator, Is it possible to ' - 'modify drop plot code so that the ' - 'output graph is \xe2\x80\xa8displayed ' - 'in iphone simulator.Now we are adding ' - 'the CSV file externally. I want to Call' - ' the File through the code..')}) - - with option_context('display.max_columns', 20): - assert 'StringCol' in repr(df) + df = DataFrame( + { + "Id": [7117434], + "StringCol": ( + "Is it possible to modify drop plot code" + " so that the output graph is displayed " + "in iphone simulator, Is it possible to " + "modify drop plot code so that the " + "output graph is \xe2\x80\xa8displayed " + "in iphone simulator.Now we are adding " + "the CSV file externally. I want to Call" + " the File through the code.." 
+ ), + } + ) + + with option_context("display.max_columns", 20): + assert "StringCol" in repr(df) def test_latex_repr(self): result = r"""\begin{tabular}{llll} @@ -177,9 +189,8 @@ def test_latex_repr(self): \bottomrule \end{tabular} """ - with option_context("display.latex.escape", False, - 'display.latex.repr', True): - df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]]) + with option_context("display.latex.escape", False, "display.latex.repr", True): + df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) assert result == df._repr_latex_() # GH 12182 @@ -197,25 +208,30 @@ def test_info(self): def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 - df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')}) + df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) buf = StringIO() df.info(buf=buf) result = buf.getvalue() bytes = float(df.memory_usage().sum()) - expected = textwrap.dedent("""\ + expected = textwrap.dedent( + """\ RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes - """.format(bytes)) + """.format( + bytes + ) + ) assert result == expected def test_info_wide(self): from pandas import set_option, reset_option + io = StringIO() df = DataFrame(np.random.randn(5, 101)) df.info(buf=io) @@ -226,35 +242,40 @@ def test_info_wide(self): assert len(rs.splitlines()) > 100 xp = rs - set_option('display.max_info_columns', 101) + set_option("display.max_info_columns", 101) io = StringIO() df.info(buf=io) assert rs == xp - reset_option('display.max_info_columns') + reset_option("display.max_info_columns") def test_info_duplicate_columns(self): io = StringIO() # it works! - frame = DataFrame(np.random.randn(1500, 4), - columns=['a', 'a', 'b', 'b']) + frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) frame.info(buf=io) def test_info_duplicate_columns_shows_correct_dtypes(self): # GH11761 io = StringIO() - frame = DataFrame([[1, 2.0]], - columns=['a', 'a']) + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) frame.info(buf=io) io.seek(0) lines = io.readlines() - assert 'a 1 non-null int64\n' == lines[3] - assert 'a 1 non-null float64\n' == lines[4] + assert "a 1 non-null int64\n" == lines[3] + assert "a 1 non-null float64\n" == lines[4] def test_info_shows_column_dtypes(self): - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] data = {} n = 10 for i, dtype in enumerate(dtypes): @@ -264,47 +285,54 @@ def test_info_shows_column_dtypes(self): df.info(buf=buf) res = buf.getvalue() for i, dtype in enumerate(dtypes): - name = '%d %d non-null %s' % (i, n, dtype) + name = "%d %d non-null %s" % (i, n, dtype) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) for len_, verbose in [(5, None), (5, False), (10, True)]: # For verbose always ^ setting ^ summarize ^ full output - with option_context('max_info_columns', 4): + with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ for len_, verbose in [(10, None), (5, False), (10, True)]: # max_cols no exceeded - with option_context('max_info_columns', 5): + with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - assert 
len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ for len_, max_cols in [(10, 5), (5, 4)]: # setting truncates - with option_context('max_info_columns', 4): + with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ # setting wouldn't truncate - with option_context('max_info_columns', 5): + with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ def test_info_memory_usage(self): # Ensure memory usage is displayed, when asserted, on the last line - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] data = {} n = 10 for i, dtype in enumerate(dtypes): @@ -335,7 +363,7 @@ def test_info_memory_usage(self): assert not re.match(r"memory usage: [^+]+\+", res[-1]) # Test a DataFrame with duplicate columns - dtypes = ['int64', 'int64', 'int64', 'float64'] + dtypes = ["int64", "int64", "int64", "float64"] data = {} n = 100 for i, dtype in enumerate(dtypes): @@ -343,12 +371,12 @@ def test_info_memory_usage(self): df = DataFrame(data) df.columns = dtypes - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) - df_with_object_index.info(buf=buf, memory_usage='deep') + df_with_object_index.info(buf=buf, memory_usage="deep") res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+$", res[-1]) @@ -366,15 +394,12 @@ def test_info_memory_usage(self): assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() # test for validity - DataFrame(1, index=['a'], columns=['A'] - ).memory_usage(index=True) - DataFrame(1, index=['a'], columns=['A'] - ).index.nbytes + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( data=1, - index=pd.MultiIndex.from_product( - [['a'], range(1000)]), - columns=['A'] + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], ) df.index.nbytes df.memory_usage(index=True) @@ -383,39 +408,34 @@ def test_info_memory_usage(self): mem = df.memory_usage(deep=True).sum() assert mem > 0 - @pytest.mark.skipif(PYPY, - reason="on PyPy deep=True doesn't change result") + @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(self): - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) - assert (df_with_object_index.memory_usage( - index=True, deep=True).sum() > - df_with_object_index.memory_usage( - index=True).sum()) - - df_object = pd.DataFrame({'a': ['a']}) - assert (df_object.memory_usage(deep=True).sum() > - df_object.memory_usage().sum()) - - @pytest.mark.skipif(not PYPY, - reason="on PyPy deep=True does not change result") + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = pd.DataFrame({"a": ["a"]}) + assert 
df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(self): - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) - assert (df_with_object_index.memory_usage( - index=True, deep=True).sum() == - df_with_object_index.memory_usage( - index=True).sum()) + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) - df_object = pd.DataFrame({'a': ['a']}) - assert (df_object.memory_usage(deep=True).sum() == - df_object.memory_usage().sum()) + df_object = pd.DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(self): df = DataFrame( data=1, - index=pd.MultiIndex.from_product( - [['a'], range(1000)]), - columns=['A'] + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with @@ -426,30 +446,32 @@ def test_usage_via_getsizeof(self): def test_info_memory_usage_qualified(self): buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=[1, 2, 3]) + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) - assert '+' not in buf.getvalue() + assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=list('ABC')) + df = DataFrame(1, columns=list("ab"), index=list("ABC")) df.info(buf=buf) - assert '+' in buf.getvalue() + assert "+" in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=pd.MultiIndex.from_product( - [range(3), range(3)])) + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), range(3)]), + ) df.info(buf=buf) - assert '+' not in buf.getvalue() + assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=pd.MultiIndex.from_product( - [range(3), ['foo', 'bar']])) + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]), + ) df.info(buf=buf) - assert '+' in buf.getvalue() + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 @@ -462,13 +484,13 @@ def memory_usage(f): N = 100 M = len(uppercase) - index = pd.MultiIndex.from_product([list(uppercase), - pd.date_range('20160101', - periods=N)], - names=['id', 'date']) - df = DataFrame({'value': np.random.randn(N * M)}, index=index) + index = pd.MultiIndex.from_product( + [list(uppercase), pd.date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame({"value": np.random.randn(N * M)}, index=index) - unstacked = df.unstack('id') + unstacked = df.unstack("id") assert df.values.nbytes == unstacked.values.nbytes assert memory_usage(df) > memory_usage(unstacked) @@ -477,7 +499,7 @@ def memory_usage(f): def test_info_categorical(self): # GH14298 - idx = pd.CategoricalIndex(['a', 'b']) + idx = pd.CategoricalIndex(["a", "b"]) df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) buf = StringIO() @@ -487,23 +509,23 @@ def test_info_categorical_column(self): # make sure it works n = 2500 - df = DataFrame({'int64': np.random.randint(100, size=n)}) - df['category'] = Series(np.array(list('abcdefghij')).take( - np.random.randint(0, 10, 
size=n))).astype('category') + df = DataFrame({"int64": np.random.randint(100, size=n)}) + df["category"] = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") df.isna() buf = StringIO() df.info(buf=buf) - df2 = df[df['category'] == 'd'] + df2 = df[df["category"] == "d"] buf = StringIO() df2.info(buf=buf) def test_repr_categorical_dates_periods(self): # normal DataFrame - dt = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') - p = period_range('2011-01', freq='M', periods=5) - df = DataFrame({'dt': dt, 'p': p}) + dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + p = period_range("2011-01", freq="M", periods=5) + df = DataFrame({"dt": dt, "p": p}) exp = """ dt p 0 2011-01-01 09:00:00-05:00 2011-01 1 2011-01-01 10:00:00-05:00 2011-02 @@ -513,14 +535,15 @@ def test_repr_categorical_dates_periods(self): assert repr(df) == exp - df2 = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)}) + df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) assert repr(df2) == exp - @pytest.mark.parametrize('arg', [np.datetime64, np.timedelta64]) - @pytest.mark.parametrize('box, expected', [ - [Series, '0 NaT\ndtype: object'], - [DataFrame, ' 0\n0 NaT']]) + @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) + @pytest.mark.parametrize( + "box, expected", + [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], + ) def test_repr_np_nat_with_object(self, arg, box, expected): # GH 25445 - result = repr(box([arg('NaT')], dtype=object)) + result = repr(box([arg("NaT")], dtype=object)) assert result == expected diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 04c1375418e674..f3452e9a85fb3e 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -5,53 +5,56 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range) +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestDataFrameReshape(TestData): - def test_pivot(self): data = { - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.] 
+ "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } frame = DataFrame(data) - pivoted = frame.pivot( - index='index', columns='columns', values='values') + pivoted = frame.pivot(index="index", columns="columns", values="values") - expected = DataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.} - }) + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(pivoted, expected) # name tracking - assert pivoted.index.name == 'index' - assert pivoted.columns.name == 'columns' + assert pivoted.index.name == "index" + assert pivoted.columns.name == "columns" # don't specify values - pivoted = frame.pivot(index='index', columns='columns') - assert pivoted.index.name == 'index' - assert pivoted.columns.names == (None, 'columns') + pivoted = frame.pivot(index="index", columns="columns") + assert pivoted.index.name == "index" + assert pivoted.columns.names == (None, "columns") def test_pivot_duplicates(self): - data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'], - 'b': ['one', 'two', 'one', 'one', 'two'], - 'c': [1., 2., 3., 3., 4.]}) - with pytest.raises(ValueError, match='duplicate entries'): - data.pivot('a', 'b', 'c') + data = DataFrame( + { + "a": ["bar", "bar", "foo", "foo", "foo"], + "b": ["one", "two", "one", "one", "two"], + "c": [1.0, 2.0, 3.0, 3.0, 4.0], + } + ) + with pytest.raises(ValueError, match="duplicate entries"): + data.pivot("a", "b", "c") def test_pivot_empty(self): - df = DataFrame(columns=['a', 'b', 'c']) - result = df.pivot('a', 'b', 'c') + df = DataFrame(columns=["a", "b", "c"]) + result = df.pivot("a", "b", "c") expected = DataFrame() tm.assert_frame_equal(result, expected, check_names=False) @@ -60,40 +63,42 @@ def test_pivot_integer_bug(self): result = df.pivot(index=1, columns=0, values=2) repr(result) - tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0)) + tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) def test_pivot_index_none(self): # gh-3962 data = { - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.] 
+ "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } - frame = DataFrame(data).set_index('index') - result = frame.pivot(columns='columns', values='values') - expected = DataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.} - }) + frame = DataFrame(data).set_index("index") + result = frame.pivot(columns="columns", values="values") + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" assert_frame_equal(result, expected) # omit values - result = frame.pivot(columns='columns') + result = frame.pivot(columns="columns") - expected.columns = pd.MultiIndex.from_tuples([('values', 'One'), - ('values', 'Two')], - names=[None, 'columns']) - expected.index.name = 'index' + expected.columns = pd.MultiIndex.from_tuples( + [("values", "One"), ("values", "Two")], names=[None, "columns"] + ) + expected.index.name = "index" tm.assert_frame_equal(result, expected, check_names=False) - assert result.index.name == 'index' - assert result.columns.names == (None, 'columns') + assert result.index.name == "index" + assert result.columns.names == (None, "columns") expected.columns = expected.columns.droplevel(0) - result = frame.pivot(columns='columns', values='values') + result = frame.pivot(columns="columns", values="values") - expected.columns.name = 'columns' + expected.columns.name = "columns" tm.assert_frame_equal(result, expected) def test_stack_unstack(self): @@ -101,22 +106,22 @@ def test_stack_unstack(self): df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) stacked = df.stack() - stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) + stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() assert_frame_equal(unstacked, df) - assert_frame_equal(unstacked_df['bar'], df) + assert_frame_equal(unstacked_df["bar"], df) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) assert_frame_equal(unstacked_cols.T, df) - assert_frame_equal(unstacked_cols_df['bar'].T, df) + assert_frame_equal(unstacked_cols_df["bar"].T, df) def test_stack_mixed_level(self): # GH 18310 - levels = [range(3), [3, 'a', 'b'], [1, 2]] + levels = [range(3), [3, "a", "b"], [1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) @@ -125,17 +130,16 @@ def test_stack_mixed_level(self): assert_series_equal(result, expected) # MultiIndex columns: - df = DataFrame(1, index=levels[0], - columns=MultiIndex.from_product(levels[1:])) + df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) result = df.stack(1) - expected = DataFrame(1, index=MultiIndex.from_product([levels[0], - levels[2]]), - columns=levels[1]) + expected = DataFrame( + 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] + ) assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type - result = df[['a', 'b']].stack(1) - expected = expected[['a', 'b']] + result = df[["a", "b"]].stack(1) + expected = expected[["a", "b"]] assert_frame_equal(result, expected) def test_unstack_fill(self): @@ -146,31 +150,33 @@ def test_unstack_fill(self): # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), 
('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack(fill_value=-1) - expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, - index=['x', 'y', 'z'], dtype=np.int16) + expected = DataFrame( + {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16 + ) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) - expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, - index=['x', 'y', 'z'], dtype=np.float) + expected = DataFrame( + {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float + ) assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: - df = DataFrame({'x': ['a', 'a', 'b'], - 'y': ['j', 'k', 'j'], - 'z': [0, 1, 2], - 'w': [0, 1, 2]}).set_index(['x', 'y', 'z']) - unstacked = df.unstack(['x', 'y'], fill_value=0) - key = ('w', 'b', 'j') + df = DataFrame( + {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]} + ).set_index(["x", "y", "z"]) + unstacked = df.unstack(["x", "y"], fill_value=0) + key = ("w", "b", "j") expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) assert_series_equal(result, expected) - stacked = unstacked.stack(['x', 'y']) + stacked = unstacked.stack(["x", "y"]) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) @@ -178,63 +184,69 @@ def test_unstack_fill(self): assert_frame_equal(result, df) # From a series - s = df['w'] - result = s.unstack(['x', 'y'], fill_value=0) - expected = unstacked['w'] + s = df["w"] + result = s.unstack(["x", "y"], fill_value=0) + expected = unstacked["w"] assert_frame_equal(result, expected) def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] - df = DataFrame(rows, columns=list('AB'), dtype=np.int32) + df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] - expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) + expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples( - [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) assert_frame_equal(result, expected) # From a mixed type dataframe - df['A'] = df['A'].astype(np.int16) - df['B'] = df['B'].astype(np.float64) + df["A"] = df["A"].astype(np.int16) + df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) - expected['A'] = expected['A'].astype(np.int16) - expected['B'] = expected['B'].astype(np.float64) + expected["A"] = expected["A"].astype(np.int16) + expected["B"] = expected["B"].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] - expected = DataFrame(rows, index=list('xyz'), dtype=np.float) + expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples( - [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) assert_frame_equal(result, expected) def 
test_unstack_fill_frame_datetime(self): # Test unstacking with date times - dv = pd.date_range('2012-01-01', periods=4).values + dv = pd.date_range("2012-01-01", periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]], - 'b': [dv[1], dv[2], pd.NaT]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) - expected = DataFrame({'a': [dv[0], dv[0], dv[3]], - 'b': [dv[1], dv[2], dv[0]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_timedelta(self): @@ -243,228 +255,263 @@ def test_unstack_fill_frame_timedelta(self): td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [td[0], pd.NaT, td[3]], - 'b': [td[1], td[2], pd.NaT]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) - expected = DataFrame({'a': [td[0], td[1], td[3]], - 'b': [td[1], td[2], td[1]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_period(self): # Test unstacking with period - periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), - Period('2012-04')] + periods = [ + Period("2012-01"), + Period("2012-02"), + Period("2012-03"), + Period("2012-04"), + ] data = Series(periods) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [periods[0], None, periods[3]], - 'b': [periods[1], periods[2], None]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) - expected = DataFrame({'a': [periods[0], periods[1], periods[3]], - 'b': [periods[1], periods[2], periods[1]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + { + "a": [periods[0], periods[1], periods[3]], + "b": [periods[1], periods[2], periods[1]], + }, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_categorical(self): # Test unstacking with categorical - data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + data = pd.Series(["a", "b", "c", "a"], dtype="category") data.index = pd.MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')], + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) # By default missing values will be NaN result = data.unstack() - expected = DataFrame({'a': pd.Categorical(list('axa'), - categories=list('abc')), - 'b': pd.Categorical(list('bcx'), - categories=list('abc'))}, - index=list('xyz')) + expected = DataFrame( + 
{ + "a": pd.Categorical(list("axa"), categories=list("abc")), + "b": pd.Categorical(list("bcx"), categories=list("abc")), + }, + index=list("xyz"), + ) assert_frame_equal(result, expected) # Fill with non-category results in a TypeError msg = r"'fill_value' \('d'\) is not in" with pytest.raises(TypeError, match=msg): - data.unstack(fill_value='d') + data.unstack(fill_value="d") # Fill with category value replaces missing values as expected - result = data.unstack(fill_value='c') - expected = DataFrame({'a': pd.Categorical(list('aca'), - categories=list('abc')), - 'b': pd.Categorical(list('bcc'), - categories=list('abc'))}, - index=list('xyz')) + result = data.unstack(fill_value="c") + expected = DataFrame( + { + "a": pd.Categorical(list("aca"), categories=list("abc")), + "b": pd.Categorical(list("bcc"), categories=list("abc")), + }, + index=list("xyz"), + ) assert_frame_equal(result, expected) def test_unstack_preserve_dtypes(self): # Checks fix for #11847 - df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'], - index=['a', 'b', 'c'], - some_categories=pd.Series(['a', 'b', 'c'] - ).astype('category'), - A=np.random.rand(3), - B=1, - C='foo', - D=pd.Timestamp('20010102'), - E=pd.Series([1.0, 50.0, 100.0] - ).astype('float32'), - F=pd.Series([3.0, 4.0, 5.0]).astype('float64'), - G=False, - H=pd.Series([1, 200, 923442], dtype='int8'))) + df = pd.DataFrame( + dict( + state=["IL", "MI", "NC"], + index=["a", "b", "c"], + some_categories=pd.Series(["a", "b", "c"]).astype("category"), + A=np.random.rand(3), + B=1, + C="foo", + D=pd.Timestamp("20010102"), + E=pd.Series([1.0, 50.0, 100.0]).astype("float32"), + F=pd.Series([3.0, 4.0, 5.0]).astype("float64"), + G=False, + H=pd.Series([1, 200, 923442], dtype="int8"), + ) + ) def unstack_and_compare(df, column_name): unstacked1 = df.unstack([column_name]) unstacked2 = df.unstack(column_name) assert_frame_equal(unstacked1, unstacked2) - df1 = df.set_index(['state', 'index']) - unstack_and_compare(df1, 'index') + df1 = df.set_index(["state", "index"]) + unstack_and_compare(df1, "index") - df1 = df.set_index(['state', 'some_categories']) - unstack_and_compare(df1, 'some_categories') + df1 = df.set_index(["state", "some_categories"]) + unstack_and_compare(df1, "some_categories") - df1 = df.set_index(['F', 'C']) - unstack_and_compare(df1, 'F') + df1 = df.set_index(["F", "C"]) + unstack_and_compare(df1, "F") - df1 = df.set_index(['G', 'B', 'state']) - unstack_and_compare(df1, 'B') + df1 = df.set_index(["G", "B", "state"]) + unstack_and_compare(df1, "B") - df1 = df.set_index(['E', 'A']) - unstack_and_compare(df1, 'E') + df1 = df.set_index(["E", "A"]) + unstack_and_compare(df1, "E") - df1 = df.set_index(['state', 'index']) - s = df1['A'] - unstack_and_compare(s, 'index') + df1 = df.set_index(["state", "index"]) + s = df1["A"] + unstack_and_compare(s, "index") def test_stack_ints(self): - columns = MultiIndex.from_tuples(list(itertools.product(range(3), - repeat=3))) + columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame(np.random.randn(30, 27), columns=columns) - assert_frame_equal(df.stack(level=[1, 2]), - df.stack(level=1).stack(level=1)) - assert_frame_equal(df.stack(level=[-2, -1]), - df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) - assert_frame_equal(df_named.stack(level=[1, 2]), - 
df_named.stack(level=1).stack(level=1)) + assert_frame_equal( + df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) + ) def test_stack_mixed_levels(self): columns = MultiIndex.from_tuples( - [('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) - animal_hair_stacked = df.stack(level=['animal', 'hair_length']) - exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) # GH #8584: Need to check that stacking works when a number # is passed that is both a level name and in the range of # the level numbers df2 = df.copy() - df2.columns.names = ['exp', 'animal', 1] - assert_frame_equal(df2.stack(level=['animal', 1]), - animal_hair_stacked, check_names=False) - assert_frame_equal(df2.stack(level=['exp', 1]), - exp_hair_stacked, check_names=False) + df2.columns.names = ["exp", "animal", 1] + assert_frame_equal( + df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False + ) # When mixed types are passed and the ints are not level # names, raise - msg = ("level should contain all level names or all level numbers, not" - " a mixture of the two") + msg = ( + "level should contain all level names or all level numbers, not" + " a mixture of the two" + ) with pytest.raises(ValueError, match=msg): - df2.stack(level=['animal', 0]) + df2.stack(level=["animal", 0]) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth df3 = df.copy() - df3.columns.names = ['exp', 'animal', 0] - assert_frame_equal(df3.stack(level=['animal', 0]), - animal_hair_stacked, check_names=False) + df3.columns.names = ["exp", "animal", 0] + assert_frame_equal( + df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False + ) def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( - [('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) - exp_animal_stacked = df.stack(level=['exp', 'animal']) - animal_hair_stacked = df.stack(level=['animal', 'hair_length']) - exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + exp_animal_stacked = df.stack(level=["exp", "animal"]) + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) df2 = df.copy() df2.columns.names = [0, 1, 2] - assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, - check_names=False) - assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, - check_names=False) - assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, - check_names=False) + assert_frame_equal( + df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False + ) + assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # 
Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] - assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, - check_names=False) - assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, - check_names=False) - assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, - check_names=False) + assert_frame_equal( + df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False + ) + assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False) def test_unstack_bool(self): - df = DataFrame([False, False], - index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]), - columns=['col']) + df = DataFrame( + [False, False], + index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), + columns=["col"], + ) rs = df.unstack() - xp = DataFrame(np.array([[False, np.nan], [np.nan, False]], - dtype=object), - index=['a', 'b'], - columns=MultiIndex.from_arrays([['col', 'col'], - ['c', 'l']])) + xp = DataFrame( + np.array([[False, np.nan], [np.nan, False]], dtype=object), + index=["a", "b"], + columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), + ) assert_frame_equal(rs, xp) def test_unstack_level_binding(self): # GH9856 mi = pd.MultiIndex( - levels=[['foo', 'bar'], ['one', 'two'], - ['a', 'b']], + levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], - names=['first', 'second', 'third']) + names=["first", "second", "third"], + ) s = pd.Series(0, index=mi) result = s.unstack([1, 2]).stack(0) expected_mi = pd.MultiIndex( - levels=[['foo', 'bar'], ['one', 'two']], + levels=[["foo", "bar"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=['first', 'second']) + names=["first", "second"], + ) - expected = pd.DataFrame(np.array([[np.nan, 0], - [0, np.nan], - [np.nan, 0], - [0, np.nan]], - dtype=np.float64), - index=expected_mi, - columns=pd.Index(['a', 'b'], name='third')) + expected = pd.DataFrame( + np.array( + [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + ), + index=expected_mi, + columns=pd.Index(["a", "b"], name="third"), + ) assert_frame_equal(result, expected) @@ -477,12 +524,14 @@ def test_unstack_to_series(self): assert_frame_equal(undo, self.frame) # check NA handling - data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) - data.index = Index(['a', 'b', 'c']) + data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + data.index = Index(["a", "b", "c"]) result = data.unstack() - midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + midx = MultiIndex( + levels=[["x", "y"], ["a", "b", "c"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + ) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) @@ -496,89 +545,94 @@ def test_unstack_to_series(self): def test_unstack_dtypes(self): # GH 2929 - rows = [[1, 1, 3, 4], - [1, 2, 3, 4], - [2, 1, 3, 4], - [2, 2, 3, 4]] + rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] - df = DataFrame(rows, columns=list('ABCD')) + df = DataFrame(rows, columns=list("ABCD")) result = df.dtypes - expected = Series([np.dtype('int64')] * 4, - index=list('ABCD')) + expected = Series([np.dtype("int64")] * 4, index=list("ABCD")) assert_series_equal(result, expected) # single dtype - df2 = df.set_index(['A', 'B']) - df3 = df2.unstack('B') + df2 = df.set_index(["A", "B"]) + df3 = df2.unstack("B") result = 
df3.dtypes - expected = Series([np.dtype('int64')] * 4, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("int64")] * 4, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) # mixed - df2 = df.set_index(['A', 'B']) - df2['C'] = 3. - df3 = df2.unstack('B') + df2 = df.set_index(["A", "B"]) + df2["C"] = 3.0 + df3 = df2.unstack("B") result = df3.dtypes - expected = Series([np.dtype('float64')] * 2 + [np.dtype('int64')] * 2, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) - df2['D'] = 'foo' - df3 = df2.unstack('B') + df2["D"] = "foo" + df3 = df2.unstack("B") result = df3.dtypes - expected = Series([np.dtype('float64')] * 2 + [np.dtype('object')] * 2, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) # GH7405 - for c, d in (np.zeros(5), np.zeros(5)), \ - (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')): - - df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d, - 'B': pd.date_range('2012-01-01', periods=5)}) + for c, d in ( + (np.zeros(5), np.zeros(5)), + (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")), + ): + + df = DataFrame( + { + "A": ["a"] * 5, + "C": c, + "D": d, + "B": pd.date_range("2012-01-01", periods=5), + } + ) right = df.iloc[:3].copy(deep=True) - df = df.set_index(['A', 'B']) - df['D'] = df['D'].astype('int64') + df = df.set_index(["A", "B"]) + df["D"] = df["D"].astype("int64") left = df.iloc[:3].unstack(0) - right = right.set_index(['A', 'B']).unstack(0) - right[('D', 'a')] = right[('D', 'a')].astype('int64') + right = right.set_index(["A", "B"]).unstack(0) + right[("D", "a")] = right[("D", "a")].astype("int64") assert left.shape == (3, 2) tm.assert_frame_equal(left, right) def test_unstack_non_unique_index_names(self): - idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], - names=['c1', 'c1']) + idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) with pytest.raises(ValueError): - df.unstack('c1') + df.unstack("c1") with pytest.raises(ValueError): - df.T.stack('c1') + df.T.stack("c1") def test_unstack_unused_levels(self): # GH 17845: unused codes in index make unstack() cast int to float - idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] + idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1] df = pd.DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() - exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']]) - expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'], - columns=exp_col) + exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) - assert((result.columns.levels[1] == idx.levels[1]).all()) + assert (result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels levels = [[0, 1, 7], [0, 1, 2, 3]] @@ -587,67 +641,73 @@ def 
test_unstack_unused_levels(self): block = np.arange(4).reshape(2, 2) df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) result = df.unstack() - expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1], - axis=1), - columns=idx) + expected = pd.DataFrame( + np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx + ) tm.assert_frame_equal(result, expected) - assert((result.columns.levels[1] == idx.levels[1]).all()) + assert (result.columns.levels[1] == idx.levels[1]).all() # With mixed dtype and NaN - levels = [['a', 2, 'c'], [1, 3, 5, 7]] + levels = [["a", 2, "c"], [1, 3, 5, 7]] codes = [[0, -1, 1, 1], [0, 2, -1, 2]] idx = pd.MultiIndex(levels, codes) data = np.arange(8) df = pd.DataFrame(data.reshape(4, 2), index=idx) - cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11], - [np.nan, 'a', 2], [np.nan, 5, 1]), - (1, [8, 11, 1, 4, 12, 15, 13, 16], - [np.nan, 5, 1], [np.nan, 'a', 2])) + cases = ( + (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]), + (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]), + ) for level, idces, col_level, idx_level in cases: result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data cols = pd.MultiIndex.from_product([[0, 1], col_level]) - expected = pd.DataFrame(exp_data.reshape(3, 6), - index=idx_level, columns=cols) + expected = pd.DataFrame( + exp_data.reshape(3, 6), index=idx_level, columns=cols + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)]) + @pytest.mark.parametrize("cols", [["A", "C"], slice(None)]) def test_unstack_unused_level(self, cols): # GH 18562 : unused codes on the unstacked level - df = pd.DataFrame([[2010, 'a', 'I'], - [2011, 'b', 'II']], - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"] + ) - ind = df.set_index(['A', 'B', 'C'], drop=False) - selection = ind.loc[(slice(None), slice(None), 'I'), cols] + ind = df.set_index(["A", "B", "C"], drop=False) + selection = ind.loc[(slice(None), slice(None), "I"), cols] result = selection.unstack() expected = ind.iloc[[0]][cols] - expected.columns = MultiIndex.from_product([expected.columns, ['I']], - names=[None, 'C']) - expected.index = expected.index.droplevel('C') + expected.columns = MultiIndex.from_product( + [expected.columns, ["I"]], names=[None, "C"] + ) + expected.index = expected.index.droplevel("C") tm.assert_frame_equal(result, expected) def test_unstack_nan_index(self): # GH7466 - cast = lambda val: '{0:1}'.format('' if val != val else val) + cast = lambda val: "{0:1}".format("" if val != val else val) def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): - left = sorted(df.iloc[i, j].split('.')) + left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right - df = DataFrame({'jim': ['a', 'b', np.nan, 'd'], - 'joe': ['w', 'x', 'y', 'z'], - 'jolie': ['a.w', 'b.x', ' .y', 'd.z']}) + df = DataFrame( + { + "jim": ["a", "b", np.nan, "d"], + "joe": ["w", "x", "y", "z"], + "jolie": ["a.w", "b.x", " .y", "d.z"], + } + ) - left = df.set_index(['jim', 'joe']).unstack()['jolie'] - right = df.set_index(['joe', 'jim']).unstack()['jolie'].T + left = df.set_index(["jim", "joe"]).unstack()["jolie"] + right = df.set_index(["joe", "jim"]).unstack()["jolie"].T assert_frame_equal(left, right) for idx in 
itertools.permutations(df.columns[:2]): @@ -655,154 +715,208 @@ def verify(df): for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) - verify(udf['jolie']) + verify(udf["jolie"]) - df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 + - ['c'] * 3 + ['e'] * 2 + ['b'] * 5, - '2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 + - ['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2, - '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, - 50, 62, 59, 76, 52, 14, 53, 60, 51]}) + df = DataFrame( + { + "1st": ["d"] * 3 + + [np.nan] * 5 + + ["a"] * 2 + + ["c"] * 3 + + ["e"] * 2 + + ["b"] * 5, + "2nd": ["y"] * 2 + + ["w"] * 3 + + [np.nan] * 3 + + ["z"] * 4 + + [np.nan] * 3 + + ["x"] * 3 + + [np.nan] * 2, + "3rd": [ + 67, + 39, + 53, + 72, + 57, + 80, + 31, + 18, + 11, + 30, + 59, + 50, + 62, + 59, + 76, + 52, + 14, + 53, + 60, + 51, + ], + } + ) - df['4th'], df['5th'] = \ - df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ - df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) + df["4th"], df["5th"] = ( + df.apply(lambda r: ".".join(map(cast, r)), axis=1), + df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), + ) - for idx in itertools.permutations(['1st', '2nd', '3rd']): + for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) - for col in ['4th', '5th']: + for col in ["4th", "5th"]: verify(udf[col]) # GH7403 - df = pd.DataFrame( - {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)}) + df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) - vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]] + vals = [ + [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], + ] vals = list(map(list, zip(*vals))) - idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B') - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 'A']) + idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, - 'C': range(8)}) + df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) df.iloc[2, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 'A']) - idx = Index([np.nan, 0, 1, 2, 3], name='B') + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, - 'C': range(8)}) + df = pd.DataFrame( + {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)} + ) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 
'A']) - idx = Index([np.nan, 0, 1, 2, 3], name='B') + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH7401 - df = pd.DataFrame({'A': list('aaaaabbbbb'), - 'B': (date_range('2012-01-01', periods=5) - .tolist() * 2), - 'C': np.arange(10)}) + df = pd.DataFrame( + { + "A": list("aaaaabbbbb"), + "B": (date_range("2012-01-01", periods=5).tolist() * 2), + "C": np.arange(10), + } + ) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack() + left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) - idx = Index(['a', 'b'], name='A') - cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], - codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], - names=[None, 'B']) + idx = Index(["a", "b"], name="A") + cols = MultiIndex( + levels=[["C"], date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH4862 - vals = [['Hg', np.nan, np.nan, 680585148], - ['U', 0.0, np.nan, 680585148], - ['Pb', 7.07e-06, np.nan, 680585148], - ['Sn', 2.3614e-05, 0.0133, 680607017], - ['Ag', 0.0, 0.0133, 680607017], - ['Hg', -0.00015, 0.0133, 680607017]] - df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'], - index=[17263, 17264, 17265, 17266, 17267, 17268]) + vals = [ + ["Hg", np.nan, np.nan, 680585148], + ["U", 0.0, np.nan, 680585148], + ["Pb", 7.07e-06, np.nan, 680585148], + ["Sn", 2.3614e-05, 0.0133, 680607017], + ["Ag", 0.0, 0.0133, 680607017], + ["Hg", -0.00015, 0.0133, 680607017], + ] + df = DataFrame( + vals, + columns=["agent", "change", "dosage", "s_id"], + index=[17263, 17264, 17265, 17266, 17267, 17268], + ) - left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack() + left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() - vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0], - [0.0, -0.00015, np.nan, 2.3614e-05, np.nan]] + vals = [ + [np.nan, np.nan, 7.07e-06, np.nan, 0.0], + [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], + ] - idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], - codes=[[0, 1], [-1, 0]], - names=['s_id', 'dosage']) + idx = MultiIndex( + levels=[[680585148, 680607017], [0.0133]], + codes=[[0, 1], [-1, 0]], + names=["s_id", "dosage"], + ) - cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], - codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], - names=[None, 'agent']) + cols = MultiIndex( + levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], + codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], + names=[None, "agent"], + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent']) + left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls - df = DataFrame({'1st': [1, 2, 1, 2, 1, 2], - '2nd': pd.date_range('2014-02-01', periods=6, - freq='D'), - 'jim': 100 + np.arange(6), - 'joe': (np.random.randn(6) * 10).round(2)}) + df = DataFrame( + { + "1st": [1, 2, 1, 2, 1, 2], + "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), + "jim": 100 + np.arange(6), + "joe": (np.random.randn(6) * 10).round(2), + } + ) - df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02') - df.loc[1, '2nd'] = df.loc[3, '2nd'] = 
np.nan - df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan + df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") + df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan + df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan - left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd']) + left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) - for col in ['jim', 'joe']: + for col in ["jim", "joe"]: for _, r in df.iterrows(): - key = r['1st'], (col, r['2nd'], r['3rd']) + key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key] def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) - df = DataFrame( - [1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')])) + df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() - eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)]) - ecols = MultiIndex.from_tuples([(t, 'A')]) + eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) assert_frame_equal(result, expected) def test_stack_partial_multiIndex(self): # GH 8844 def _test_stack_with_multiindex(multiindex): - df = DataFrame(np.arange(3 * len(multiindex)) - .reshape(3, len(multiindex)), - columns=multiindex) + df = DataFrame( + np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), + columns=multiindex, + ) for level in (-1, 0, 1, [0, 1], [1, 0]): result = df.stack(level=level, dropna=False) @@ -816,48 +930,57 @@ def _test_stack_with_multiindex(multiindex): else: assert_frame_equal(result, expected) - df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), - names=df.columns.names) + df.columns = MultiIndex.from_tuples( + df.columns.to_numpy(), names=df.columns.names + ) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): assert_series_equal(result, expected) else: assert_frame_equal(result, expected) - full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'), - ('A', 'y'), - ('C', 'x'), ('C', 'u')], - names=['Upper', 'Lower']) - for multiindex_columns in ([0, 1, 2, 3, 4], - [0, 1, 2, 3], [0, 1, 2, 4], - [0, 1, 2], [1, 2, 3], [2, 3, 4], - [0, 1], [0, 2], [0, 3], - [0], [2], [4]): + full_multiindex = MultiIndex.from_tuples( + [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], + names=["Upper", "Lower"], + ) + for multiindex_columns in ( + [0, 1, 2, 3, 4], + [0, 1, 2, 3], + [0, 1, 2, 4], + [0, 1, 2], + [1, 2, 3], + [2, 3, 4], + [0, 1], + [0, 2], + [0, 3], + [0], + [2], + [4], + ): _test_stack_with_multiindex(full_multiindex[multiindex_columns]) if len(multiindex_columns) > 1: multiindex_columns.reverse() - _test_stack_with_multiindex( - full_multiindex[multiindex_columns]) + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) - df = DataFrame(np.arange(6).reshape(2, 3), - columns=full_multiindex[[0, 1, 3]]) + df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) result = df.stack(dropna=False) - expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]], - index=MultiIndex( - levels=[[0, 1], ['u', 'x', 'y', 'z']], - codes=[[0, 0, 1, 1], - [1, 3, 1, 3]], - names=[None, 'Lower']), - columns=Index(['B', 'C'], name='Upper'), - dtype=df.dtypes[0]) + expected = DataFrame( + [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], + index=MultiIndex( + levels=[[0, 1], ["u", "x", "y", "z"]], + codes=[[0, 0, 1, 1], [1, 3, 1, 3]], + names=[None, "Lower"], + ), + columns=Index(["B", "C"], 
name="Upper"), + dtype=df.dtypes[0], + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('ordered', [False, True]) - @pytest.mark.parametrize('labels', [list("yxz"), list("yxy")]) + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) def test_stack_preserve_categorical_dtype(self, ordered, labels): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), - ordered=ordered) + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() @@ -870,43 +993,51 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): def test_stack_preserve_categorical_dtype_values(self): # GH-23077 - cat = pd.Categorical(['a', 'a', 'b', 'c']) + cat = pd.Categorical(["a", "a", "b", "c"]) df = pd.DataFrame({"A": cat, "B": cat}) result = df.stack() - index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']]) - expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a', - 'b', 'b', 'c', 'c']), - index=index) + index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + expected = pd.Series( + pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index + ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize("level", [0, 1]) def test_unstack_mixed_extension_types(self, level): - index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)], - names=['a', 'b']) - df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]), - "B": pd.Categorical(['a', 'a', 'b'])}, index=index) + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"] + ) + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([0, 1, None]), + "B": pd.Categorical(["a", "a", "b"]), + }, + index=index, + ) result = df.unstack(level=level) expected = df.astype(object).unstack(level=level) - expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2, - index=result.columns) + expected_dtypes = pd.Series( + [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns + ) tm.assert_series_equal(result.dtypes, expected_dtypes) tm.assert_frame_equal(result.astype(object), expected) - @pytest.mark.parametrize("level", [0, 'baz']) + @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 - mi = pd.MultiIndex.from_product([[0], ['d', 'c']], - names=['bar', 'baz']) - df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A']) - df.columns.name = 'foo' - - expected = pd.DataFrame([ - [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([ - ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[ - 'baz', 'foo'])) - expected.index.name = 'bar' + mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) + df.columns.name = "foo" + + expected = pd.DataFrame( + [[3, 1, 2, 0]], + columns=pd.MultiIndex.from_tuples( + [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"] + ), + ) + expected.index.name = "bar" result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) tm.assert_frame_equal(result, expected) @@ -914,55 +1045,61 @@ def test_unstack_swaplevel_sortlevel(self, level): def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
- data = pd.Series(['a', 'b', 'c', 'a'], dtype='object') + data = pd.Series(["a", "b", "c", "a"], dtype="object") data.index = pd.MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) # By default missing values will be NaN result = data.unstack() expected = pd.DataFrame( - {'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]}, - index=list('xyz') + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") ) assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected - result = data.unstack(fill_value='d') + result = data.unstack(fill_value="d") expected = pd.DataFrame( - {'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']}, - index=list('xyz') + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") ) assert_frame_equal(result, expected) def test_unstack_timezone_aware_values(): # GH 18338 - df = pd.DataFrame({ - 'timestamp': [ - pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')], - 'a': ['a'], - 'b': ['b'], - 'c': ['c'], - }, columns=['timestamp', 'a', 'b', 'c']) - result = df.set_index(['a', 'b']).unstack() - expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000', - tz='UTC'), - 'c']], - index=pd.Index(['a'], name='a'), - columns=pd.MultiIndex( - levels=[['timestamp', 'c'], ['b']], - codes=[[0, 1], [0, 0]], - names=[None, 'b'])) + df = pd.DataFrame( + { + "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")], + "a": ["a"], + "b": ["b"], + "c": ["c"], + }, + columns=["timestamp", "a", "b", "c"], + ) + result = df.set_index(["a", "b"]).unstack() + expected = pd.DataFrame( + [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]], + index=pd.Index(["a"], name="a"), + columns=pd.MultiIndex( + levels=[["timestamp", "c"], ["b"]], + codes=[[0, 1], [0, 0]], + names=[None, "b"], + ), + ) assert_frame_equal(result, expected) def test_stack_timezone_aware_values(): # GH 19420 - ts = pd.date_range(freq="D", start="20180101", end="20180103", - tz="America/New_York") + ts = pd.date_range( + freq="D", start="20180101", end="20180103", tz="America/New_York" + ) df = pd.DataFrame({"A": ts}, index=["a", "b", "c"]) result = df.stack() - expected = pd.Series(ts, - index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']], - codes=[[0, 1, 2], [0, 0, 0]])) + expected = pd.Series( + ts, + index=pd.MultiIndex( + levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]] + ), + ) assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 3dca82a229b2b4..2bcc115bcd09cf 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -10,32 +10,34 @@ @pytest.fixture def df_none(): - return DataFrame({ - 'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 2, 2, 1, 1], - 'A': np.arange(6, 0, -1), - ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']}) + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) -@pytest.fixture(params=[ - ['outer'], - ['outer', 'inner'] -]) +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) def df_idx(request, df_none): levels = request.param return df_none.set_index(levels) -@pytest.fixture(params=[ - 'inner', # index level - ['outer'], # list of index level - 'A', # column - [('B', 5)], # list of column - 
['inner', 'outer'], # two index levels - [('B', 5), 'outer'], # index level and column - ['A', ('B', 5)], # Two columns - ['inner', 'outer'] # two index levels and column -]) +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + ["inner", "outer"], # two index levels and column + ] +) def sort_names(request): return request.param @@ -45,8 +47,7 @@ def ascending(request): return request.param -def test_sort_index_level_and_column_label( - df_none, df_idx, sort_names, ascending): +def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending): # GH 14353 @@ -54,20 +55,17 @@ def test_sort_index_level_and_column_label( levels = df_idx.index.names # Compute expected by sorting on columns and the setting index - expected = df_none.sort_values(by=sort_names, - ascending=ascending, - axis=0).set_index(levels) + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) # Compute result sorting on mix on columns and index levels - result = df_idx.sort_values(by=sort_names, - ascending=ascending, - axis=0) + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) assert_frame_equal(result, expected) -def test_sort_column_level_and_index_label( - df_none, df_idx, sort_names, ascending): +def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending): # GH 14353 @@ -77,20 +75,19 @@ def test_sort_column_level_and_index_label( # Compute expected by sorting on axis=0, setting index levels, and then # transposing. For some cases this will result in a frame with # multiple column levels - expected = df_none.sort_values(by=sort_names, - ascending=ascending, - axis=0).set_index(levels).T + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) # Compute result by transposing and sorting on axis=1. 
- result = df_idx.T.sort_values(by=sort_names, - ascending=ascending, - axis=1) + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) if len(levels) > 1: # Accessing multi-level columns that are not lexsorted raises a # performance warning - with tm.assert_produces_warning(PerformanceWarning, - check_stacklevel=False): + with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): assert_frame_equal(result, expected) else: assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 11de77f6779e63..b6442d89388436 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, IntervalIndex, MultiIndex, NaT, Series, Timestamp, - date_range) + Categorical, + DataFrame, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, +) from pandas.api.types import CategoricalDtype from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -14,44 +21,43 @@ class TestDataFrameSorting(TestData): - def test_sort_values(self): - frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], - index=[1, 2, 3], columns=list('ABC')) + frame = DataFrame( + [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") + ) # by column (axis=0) - sorted_df = frame.sort_values(by='A') - indexer = frame['A'].argsort().values + sorted_df = frame.sort_values(by="A") + indexer = frame["A"].argsort().values expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by='A', ascending=False) + sorted_df = frame.sort_values(by="A", ascending=False) indexer = indexer[::-1] expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by='A', ascending=False) + sorted_df = frame.sort_values(by="A", ascending=False) assert_frame_equal(sorted_df, expected) # GH4839 - sorted_df = frame.sort_values(by=['A'], ascending=[False]) + sorted_df = frame.sort_values(by=["A"], ascending=[False]) assert_frame_equal(sorted_df, expected) # multiple bys - sorted_df = frame.sort_values(by=['B', 'C']) + sorted_df = frame.sort_values(by=["B", "C"]) expected = frame.loc[[2, 1, 3]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=['B', 'C'], ascending=False) + sorted_df = frame.sort_values(by=["B", "C"], ascending=False) assert_frame_equal(sorted_df, expected[::-1]) - sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) + sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) assert_frame_equal(sorted_df, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): - frame.sort_values(by=['A', 'B'], axis=2, inplace=True) + frame.sort_values(by=["A", "B"], axis=2, inplace=True) # by row (axis=1): GH 10806 sorted_df = frame.sort_values(by=3, axis=1) @@ -59,32 +65,32 @@ def test_sort_values(self): assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=3, axis=1, ascending=False) - expected = frame.reindex(columns=['C', 'B', 'A']) + expected = frame.reindex(columns=["C", "B", "A"]) assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=[1, 2], axis='columns') - expected = frame.reindex(columns=['B', 'A', 'C']) + sorted_df = frame.sort_values(by=[1, 2], axis="columns") + expected = frame.reindex(columns=["B", 
"A", "C"]) assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=[1, 3], axis=1, - ascending=[True, False]) + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) - expected = frame.reindex(columns=['C', 'B', 'A']) + expected = frame.reindex(columns=["C", "B", "A"]) assert_frame_equal(sorted_df, expected) - msg = r'Length of ascending \(5\) != length of by \(2\)' + msg = r"Length of ascending \(5\) != length of by \(2\)" with pytest.raises(ValueError, match=msg): - frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) + frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5) def test_sort_values_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) sorted_df = frame.copy() - sorted_df.sort_values(by='A', inplace=True) - expected = frame.sort_values(by='A') + sorted_df.sort_values(by="A", inplace=True) + expected = frame.sort_values(by="A") assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() @@ -93,190 +99,204 @@ def test_sort_values_inplace(self): assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by='A', ascending=False, inplace=True) - expected = frame.sort_values(by='A', ascending=False) + sorted_df.sort_values(by="A", ascending=False, inplace=True) + expected = frame.sort_values(by="A", ascending=False) assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) - expected = frame.sort_values(by=['A', 'B'], ascending=False) + sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True) + expected = frame.sort_values(by=["A", "B"], ascending=False) assert_frame_equal(sorted_df, expected) def test_sort_nan(self): # GH3917 nan = np.nan - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}) + df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) # sort one column only expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 9, 2, nan, 5, 5, 4]}, - index=[2, 0, 3, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A'], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A"], na_position="first") assert_frame_equal(sorted_df, expected) expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3]) - sorted_df = df.sort_values(['A'], na_position='first', ascending=False) + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values(["A"], na_position="first", ascending=False) assert_frame_equal(sorted_df, expected) - expected = df.reindex(columns=['B', 'A']) - sorted_df = df.sort_values(by=1, axis=1, na_position='first') + expected = df.reindex(columns=["B", "A"]) + sorted_df = df.sort_values(by=1, axis=1, na_position="first") assert_frame_equal(sorted_df, expected) # na_position='last', order expected = DataFrame( - {'A': [1, 1, 2, 4, 6, 8, nan], - 'B': [2, 9, nan, 5, 5, 4, 5]}, - index=[3, 0, 1, 6, 4, 5, 2]) - sorted_df = df.sort_values(['A', 'B']) + {"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2], + ) + sorted_df = df.sort_values(["A", 
"B"]) assert_frame_equal(sorted_df, expected) # na_position='first', order expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 2, 9, nan, 5, 5, 4]}, - index=[2, 3, 0, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A', 'B'], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], na_position="first") assert_frame_equal(sorted_df, expected) # na_position='first', not order expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 9, 2, nan, 5, 5, 4]}, - index=[2, 0, 3, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A', 'B'], ascending=[ - 1, 0], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") assert_frame_equal(sorted_df, expected) # na_position='last', not order expected = DataFrame( - {'A': [8, 6, 4, 2, 1, 1, nan], - 'B': [4, 5, 5, nan, 2, 9, 5]}, - index=[5, 4, 6, 1, 3, 0, 2]) - sorted_df = df.sort_values(['A', 'B'], ascending=[ - 0, 1], na_position='last') + {"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") assert_frame_equal(sorted_df, expected) # Test DataFrame with nan label - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan]) + df = DataFrame( + {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, nan], + ) # NaN label, ascending=True, na_position='last' - sorted_df = df.sort_index( - kind='quicksort', ascending=True, na_position='last') - expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan]) + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, nan], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=True, na_position='first' - sorted_df = df.sort_index(na_position='first') - expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8], - 'B': [5, 9, nan, 5, 2, 5, 4]}, - index=[nan, 1, 2, 3, 4, 5, 6]) + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]}, + index=[nan, 1, 2, 3, 4, 5, 6], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='last' - sorted_df = df.sort_index(kind='quicksort', ascending=False) - expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4], - 'B': [4, 5, 2, 5, nan, 9, 5]}, - index=[6, 5, 4, 3, 2, 1, nan]) + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, nan], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='first' sorted_df = df.sort_index( - kind='quicksort', ascending=False, na_position='first') - expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1], - 'B': [5, 4, 5, 2, 5, nan, 9]}, - index=[nan, 6, 5, 4, 3, 2, 1]) + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]}, + index=[nan, 6, 5, 4, 3, 2, 1], + ) assert_frame_equal(sorted_df, expected) def test_stable_descending_sort(self): # GH #6399 - df 
= DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], - columns=['sort_col', 'order']) - sorted_df = df.sort_values(by='sort_col', kind='mergesort', - ascending=False) + df = DataFrame( + [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], + columns=["sort_col", "order"], + ) + sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) assert_frame_equal(df, sorted_df) def test_stable_descending_multicolumn_sort(self): nan = np.nan - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}) + df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) # test stable mergesort expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 2, 9]}, - index=[2, 5, 4, 6, 1, 3, 0]) - sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1], - na_position='first', - kind='mergesort') + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, + index=[2, 5, 4, 6, 1, 3, 0], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort" + ) assert_frame_equal(sorted_df, expected) expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3]) - sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0], - na_position='first', - kind='mergesort') + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort" + ) assert_frame_equal(sorted_df, expected) def test_sort_multi_index(self): # GH 25775, testing that sorting by index works with a multi-index. - df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0], - 'c': [0, 1, 2], 'd': list('abc')}) - result = df.set_index(list('abc')).sort_index(level=list('ba')) + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) - expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0], - 'c': [1, 2, 0], 'd': list('bca')}) - expected = expected.set_index(list('abc')) + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) tm.assert_frame_equal(result, expected) def test_stable_categorial(self): # GH 16793 - df = DataFrame({ - 'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True) - }) + df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) expected = df.copy() - sorted_df = df.sort_values('x', kind='mergesort') + sorted_df = df.sort_values("x", kind="mergesort") assert_frame_equal(sorted_df, expected) def test_sort_datetimes(self): # GH 3461, argsort / lexsort differences for a datetime column - df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], - columns=['A'], - index=date_range('20130101', periods=9)) - dts = [Timestamp(x) - for x in ['2004-02-11', '2004-01-21', '2004-01-26', - '2005-09-20', '2010-10-04', '2009-05-12', - '2008-11-12', '2010-09-28', '2010-09-28']] - df['B'] = dts[::2] + dts[1::2] - df['C'] = 2. - df['A1'] = 3. 
- - df1 = df.sort_values(by='A') - df2 = df.sort_values(by=['A']) + df = DataFrame( + ["a", "a", "a", "b", "c", "d", "e", "f", "g"], + columns=["A"], + index=date_range("20130101", periods=9), + ) + dts = [ + Timestamp(x) + for x in [ + "2004-02-11", + "2004-01-21", + "2004-01-26", + "2005-09-20", + "2010-10-04", + "2009-05-12", + "2008-11-12", + "2010-09-28", + "2010-09-28", + ] + ] + df["B"] = dts[::2] + dts[1::2] + df["C"] = 2.0 + df["A1"] = 3.0 + + df1 = df.sort_values(by="A") + df2 = df.sort_values(by=["A"]) assert_frame_equal(df1, df2) - df1 = df.sort_values(by='B') - df2 = df.sort_values(by=['B']) + df1 = df.sort_values(by="B") + df2 = df.sort_values(by=["B"]) assert_frame_equal(df1, df2) - df1 = df.sort_values(by='B') + df1 = df.sort_values(by="B") - df2 = df.sort_values(by=['C', 'B']) + df2 = df.sort_values(by=["C", "B"]) assert_frame_equal(df1, df2) def test_frame_column_inplace_sort_exception(self): - s = self.frame['A'] + s = self.frame["A"] with pytest.raises(ValueError, match="This Series is a view"): s.sort_values(inplace=True) @@ -293,13 +313,15 @@ def test_sort_nat_values_in_int_column(self): int_values = (2, int(NaT)) float_values = (2.0, -1.797693e308) - df = DataFrame(dict(int=int_values, float=float_values), - columns=["int", "float"]) + df = DataFrame( + dict(int=int_values, float=float_values), columns=["int", "float"] + ) - df_reversed = DataFrame(dict(int=int_values[::-1], - float=float_values[::-1]), - columns=["int", "float"], - index=[1, 0]) + df_reversed = DataFrame( + dict(int=int_values[::-1], float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0], + ) # NaT is not a "na" for int64 columns, so na_position must not # influence the result: @@ -315,13 +337,16 @@ def test_sort_nat_values_in_int_column(self): # and now check if NaT is still considered as "na" for datetime64 # columns: - df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], - float=float_values), columns=["datetime", "float"]) + df = DataFrame( + dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), + columns=["datetime", "float"], + ) - df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], - float=float_values[::-1]), - columns=["datetime", "float"], - index=[1, 0]) + df_reversed = DataFrame( + dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0], + ) df_sorted = df.sort_values(["datetime", "float"], na_position="first") assert_frame_equal(df_sorted, df_reversed) @@ -337,68 +362,70 @@ def test_sort_nat(self): # GH 16836 - d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01', - np.nan, '2016-01-01']] - d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01', - '2016-01-01', '2015-01-01']] - df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3]) - - d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01', - '2016-01-01', np.nan]] - d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01', - '2017-01-01', '2016-01-01']] - expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2]) - sorted_df = df.sort_values(by=['a', 'b'], ) + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = pd.DataFrame({"a": d3, 
"b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) tm.assert_frame_equal(sorted_df, expected) class TestDataFrameSortIndexKinds(TestData): - def test_sort_index_multicolumn(self): A = np.arange(5).repeat(20) B = np.tile(np.arange(5), 20) random.shuffle(A) random.shuffle(B) - frame = DataFrame({'A': A, 'B': B, - 'C': np.random.randn(100)}) + frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['A', 'B']) - result = frame.sort_values(by=['A', 'B']) - indexer = np.lexsort((frame['B'], frame['A'])) + frame.sort_index(by=["A", "B"]) + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) expected = frame.take(indexer) assert_frame_equal(result, expected) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['A', 'B'], ascending=False) - result = frame.sort_values(by=['A', 'B'], ascending=False) - indexer = np.lexsort((frame['B'].rank(ascending=False), - frame['A'].rank(ascending=False))) + frame.sort_index(by=["A", "B"], ascending=False) + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) expected = frame.take(indexer) assert_frame_equal(result, expected) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['B', 'A']) - result = frame.sort_values(by=['B', 'A']) - indexer = np.lexsort((frame['A'], frame['B'])) + frame.sort_index(by=["B", "A"]) + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) expected = frame.take(indexer) assert_frame_equal(result, expected) def test_sort_index_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) # axis=0 unordered = frame.loc[[3, 2, 4, 1]] - a_id = id(unordered['A']) + a_id = id(unordered["A"]) df = unordered.copy() df.sort_index(inplace=True) expected = frame assert_frame_equal(df, expected) - assert a_id != id(df['A']) + assert a_id != id(df["A"]) df = unordered.copy() df.sort_index(ascending=False, inplace=True) @@ -406,7 +433,7 @@ def test_sort_index_inplace(self): assert_frame_equal(df, expected) # axis=1 - unordered = frame.loc[:, ['D', 'B', 'C', 'A']] + unordered = frame.loc[:, ["D", "B", "C", "A"]] df = unordered.copy() df.sort_index(axis=1, inplace=True) expected = frame @@ -425,114 +452,115 @@ def test_sort_index_different_sortorder(self): A = A.take(indexer) B = B.take(indexer) - df = DataFrame({'A': A, 'B': B, - 'C': np.random.randn(100)}) + df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=['A', 'B'], ascending=[1, 0]) - result = df.sort_values(by=['A', 'B'], ascending=[1, 0]) + df.sort_index(by=["A", "B"], ascending=[1, 0]) + result = df.sort_values(by=["A", "B"], ascending=[1, 0]) ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) expected = df.take(ex_indexer) assert_frame_equal(result, expected) # test with multiindex, too - idf = df.set_index(['A', 'B']) + idf = df.set_index(["A", "B"]) result = idf.sort_index(ascending=[1, 0]) expected = idf.take(ex_indexer) assert_frame_equal(result, expected) # also, Series! 
- result = idf['C'].sort_index(ascending=[1, 0]) - assert_series_equal(result, expected['C']) + result = idf["C"].sort_index(ascending=[1, 0]) + assert_series_equal(result, expected["C"]) def test_sort_index_duplicates(self): # with 9816, these are all translated to .sort_values - df = DataFrame([range(5, 9), range(4)], - columns=['a', 'a', 'b', 'b']) + df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"]) - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by='a') - with pytest.raises(ValueError, match='not unique'): - df.sort_values(by='a') + df.sort_index(by="a") + with pytest.raises(ValueError, match="not unique"): + df.sort_values(by="a") - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=['a']) - with pytest.raises(ValueError, match='not unique'): - df.sort_values(by=['a']) + df.sort_index(by=["a"]) + with pytest.raises(ValueError, match="not unique"): + df.sort_values(by=["a"]) - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): # multi-column 'by' is separate codepath - df.sort_index(by=['a', 'b']) - with pytest.raises(ValueError, match='not unique'): + df.sort_index(by=["a", "b"]) + with pytest.raises(ValueError, match="not unique"): # multi-column 'by' is separate codepath - df.sort_values(by=['a', 'b']) + df.sort_values(by=["a", "b"]) # with multi-index # GH4370 - df = DataFrame(np.random.randn(4, 2), - columns=MultiIndex.from_tuples([('a', 0), ('a', 1)])) - with pytest.raises(ValueError, match='level'): + df = DataFrame( + np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]) + ) + with pytest.raises(ValueError, match="level"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by='a') - with pytest.raises(ValueError, match='level'): - df.sort_values(by='a') + df.sort_index(by="a") + with pytest.raises(ValueError, match="level"): + df.sort_values(by="a") # convert tuples to a list of tuples # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=[('a', 1)]) - expected = df.sort_values(by=[('a', 1)]) + df.sort_index(by=[("a", 1)]) + expected = df.sort_values(by=[("a", 1)]) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=('a', 1)) - result = df.sort_values(by=('a', 1)) + df.sort_index(by=("a", 1)) + result = df.sort_values(by=("a", 1)) assert_frame_equal(result, expected) def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(level='A', sort_remaining=False) + result = df.sort_index(level="A", sort_remaining=False) expected = df assert_frame_equal(result, expected) - result = df.sort_index(level=['A', 'B'], sort_remaining=False) + result = df.sort_index(level=["A", "B"], sort_remaining=False) expected = df assert_frame_equal(result, expected) # Error thrown by sort_index when # first index is sorted last (#26053) - result = df.sort_index(level=['C', 'B', 'A']) + result = df.sort_index(level=["C", "B", "A"]) expected = df.iloc[[1, 
0]] assert_frame_equal(result, expected) - result = df.sort_index(level=['B', 'C', 'A']) + result = df.sort_index(level=["B", "C", "A"]) expected = df.iloc[[1, 0]] assert_frame_equal(result, expected) - result = df.sort_index(level=['C', 'A']) + result = df.sort_index(level=["C", "A"]) expected = df.iloc[[1, 0]] assert_frame_equal(result, expected) def test_sort_index_categorical_index(self): - df = (DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')) - .astype(CategoricalDtype(list('cab')))}) - .set_index('B')) + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") result = df.sort_index() expected = df.iloc[[4, 0, 1, 5, 2, 3]] @@ -545,8 +573,11 @@ def test_sort_index_categorical_index(self): def test_sort_index(self): # GH13496 - frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) # axis=0 : sort rows by index labels unordered = frame.loc[[3, 2, 4, 1]] @@ -567,35 +598,28 @@ def test_sort_index(self): expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([ - [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 2], - [2, 1, 3]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [3, 4], - [1, 2]], index=expected_mi) + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) assert_frame_equal(result, expected) # sort_remaining=False - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 3], - [2, 1, 2]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [1, 2], - [3, 4]], index=expected_mi) + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) assert_frame_equal(result, expected) @@ -604,92 +628,112 @@ def test_sort_index_intervalindex(self): # confirming that we sort in the order of the bins y = Series(np.random.randn(100)) x1 = Series(np.sign(np.random.randn(100))) - x2 = pd.cut(Series(np.random.randn(100)), - bins=[-3, -0.5, 0, 0.5, 3]) - model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) + x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) - result = model.groupby(['X1', 'X2'], observed=True).mean().unstack() + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() expected = IntervalIndex.from_tuples( - [(-3.0, -0.5), (-0.5, 0.0), - (0.0, 0.5), (0.5, 3.0)], - closed='right') + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning 
missing value properly when column is Categorical. - categories = ['A', 'B', 'C'] + categories = ["A", "B", "C"] category_indices = [0, 2, 4] list_of_nans = [np.nan, np.nan] na_indices = [1, 3] - na_position_first = 'first' - na_position_last = 'last' - column_name = 'c' + na_position_first = "first" + na_position_last = "last" + column_name = "c" reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) reversed_na_indices = sorted(na_indices) - df = pd.DataFrame({ - column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=categories, - ordered=True)}) + df = pd.DataFrame( + { + column_name: pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True + ) + } + ) # sort ascending with na first - result = df.sort_values(by=column_name, - ascending=True, - na_position=na_position_first) - expected = DataFrame({ - column_name: Categorical(list_of_nans + categories, - categories=categories, - ordered=True) - }, index=na_indices + category_indices) + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + categories, categories=categories, ordered=True + ) + }, + index=na_indices + category_indices, + ) assert_frame_equal(result, expected) # sort ascending with na last - result = df.sort_values(by=column_name, - ascending=True, - na_position=na_position_last) - expected = DataFrame({ - column_name: Categorical(categories + list_of_nans, - categories=categories, - ordered=True) - }, index=category_indices + na_indices) + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + categories + list_of_nans, categories=categories, ordered=True + ) + }, + index=category_indices + na_indices, + ) assert_frame_equal(result, expected) # sort descending with na first - result = df.sort_values(by=column_name, - ascending=False, - na_position=na_position_first) - expected = DataFrame({ - column_name: Categorical(list_of_nans + reversed_categories, - categories=categories, - ordered=True) - }, index=reversed_na_indices + reversed_category_indices) + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + reversed_categories, + categories=categories, + ordered=True, + ) + }, + index=reversed_na_indices + reversed_category_indices, + ) assert_frame_equal(result, expected) # sort descending with na last - result = df.sort_values(by=column_name, - ascending=False, - na_position=na_position_last) - expected = DataFrame({ - column_name: Categorical(reversed_categories + list_of_nans, - categories=categories, - ordered=True) - }, index=reversed_category_indices + reversed_na_indices) + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + reversed_categories + list_of_nans, + categories=categories, + ordered=True, + ) + }, + index=reversed_category_indices + reversed_na_indices, + ) assert_frame_equal(result, expected) def test_sort_index_na_position_with_categories_raises(self): - df = pd.DataFrame({ - 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=['A', 'B', 'C'], - ordered=True)}) + df = pd.DataFrame( + { + "c": pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], + categories=["A", 
"B", "C"], + ordered=True, + ) + } + ) with pytest.raises(ValueError): - df.sort_values(by='c', - ascending=False, - na_position='bad_position') + df.sort_values(by="c", ascending=False, na_position="bad_position") diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 45b13e5159bcda..c66a97c2b294b1 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -8,19 +8,17 @@ class TestDataFrameSubclassing(TestData): - def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 class CustomSeries(Series): - @property def _constructor(self): return CustomSeries def custom_series_function(self): - return 'OK' + return "OK" class CustomDataFrame(DataFrame): """ @@ -38,10 +36,9 @@ def _constructor(self): _constructor_sliced = CustomSeries def custom_frame_function(self): - return 'OK' + return "OK" - data = {'col1': range(10), - 'col2': range(10)} + data = {"col1": range(10), "col2": range(10)} cdf = CustomDataFrame(data) # Did we get back our own DF class? @@ -50,34 +47,35 @@ def custom_frame_function(self): # Do we get back our own Series class after selecting a column? cdf_series = cdf.col1 assert isinstance(cdf_series, CustomSeries) - assert cdf_series.custom_series_function() == 'OK' + assert cdf_series.custom_series_function() == "OK" # Do we get back our own DF class after slicing row-wise? cdf_rows = cdf[1:5] assert isinstance(cdf_rows, CustomDataFrame) - assert cdf_rows.custom_frame_function() == 'OK' + assert cdf_rows.custom_frame_function() == "OK" # Make sure sliced part of multi-index frame is custom class - mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')]) + mcol = pd.MultiIndex.from_tuples([("A", "A"), ("A", "B")]) cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) - assert isinstance(cdf_multi['A'], CustomDataFrame) + assert isinstance(cdf_multi["A"], CustomDataFrame) - mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')]) + mcol = pd.MultiIndex.from_tuples([("A", ""), ("B", "")]) cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) - assert isinstance(cdf_multi2['A'], CustomSeries) + assert isinstance(cdf_multi2["A"], CustomSeries) def test_dataframe_metadata(self): - df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]}, - index=['a', 'b', 'c']) - df.testattr = 'XXX' + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + df.testattr = "XXX" - assert df.testattr == 'XXX' - assert df[['X']].testattr == 'XXX' - assert df.loc[['a', 'b'], :].testattr == 'XXX' - assert df.iloc[[0, 1], :].testattr == 'XXX' + assert df.testattr == "XXX" + assert df[["X"]].testattr == "XXX" + assert df.loc[["a", "b"], :].testattr == "XXX" + assert df.iloc[[0, 1], :].testattr == "XXX" # see gh-9776 - assert df.iloc[0:1, :].testattr == 'XXX' + assert df.iloc[0:1, :].testattr == "XXX" # see gh-10553 unpickled = tm.round_trip_pickle(df) @@ -87,64 +85,67 @@ def test_dataframe_metadata(self): def test_indexing_sliced(self): # GH 11559 - df = tm.SubclassedDataFrame({'X': [1, 2, 3], - 'Y': [4, 5, 6], - 'Z': [7, 8, 9]}, - index=['a', 'b', 'c']) - res = df.loc[:, 'X'] - exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X') + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"] + ) + res = df.loc[:, "X"] + exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X") tm.assert_series_equal(res, exp) assert isinstance(res, 
tm.SubclassedSeries) res = df.iloc[:, 1] - exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y') + exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc[:, 'Z'] - exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z') + res = df.loc[:, "Z"] + exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc['a', :] - exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a') + res = df.loc["a", :] + exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) res = df.iloc[1, :] - exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b') + exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc['c', :] - exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c') + res = df.loc["c", :] + exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) def test_subclass_attr_err_propagation(self): # GH 11808 class A(DataFrame): - @property def bar(self): return self.i_dont_exist - with pytest.raises(AttributeError, match='.*i_dont_exist.*'): + + with pytest.raises(AttributeError, match=".*i_dont_exist.*"): A().bar def test_subclass_align(self): # GH 12983 - df1 = tm.SubclassedDataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - df2 = tm.SubclassedDataFrame({'c': [1, 2, 4], - 'd': [1, 2, 4]}, index=list('ABD')) + df1 = tm.SubclassedDataFrame( + {"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE") + ) + df2 = tm.SubclassedDataFrame( + {"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD") + ) res1, res2 = df1.align(df2, axis=0) - exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) - exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan], - 'd': [1, 2, np.nan, 4, np.nan]}, - index=list('ABCDE')) + exp1 = tm.SubclassedDataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = tm.SubclassedDataFrame( + {"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]}, + index=list("ABCDE"), + ) assert isinstance(res1, tm.SubclassedDataFrame) tm.assert_frame_equal(res1, exp1) assert isinstance(res2, tm.SubclassedDataFrame) @@ -158,18 +159,17 @@ def test_subclass_align(self): def test_subclass_align_combinations(self): # GH 12983 - df = tm.SubclassedDataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x') + df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) # name is lost when - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], - index=list('ABCDE'), name='x') + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") assert isinstance(res1, tm.SubclassedDataFrame) tm.assert_frame_equal(res1, exp1) @@ -185,7 
+185,7 @@ def test_subclass_align_combinations(self): def test_subclass_iterrows(self): # GH 13977 - df = tm.SubclassedDataFrame({'a': [1]}) + df = tm.SubclassedDataFrame({"a": [1]}) for i, row in df.iterrows(): assert isinstance(row, tm.SubclassedSeries) tm.assert_series_equal(row, df.loc[i]) @@ -196,274 +196,315 @@ def test_subclass_sparse_slice(self): ssdf = tm.SubclassedSparseDataFrame(rows) ssdf.testattr = "testattr" - tm.assert_sp_frame_equal(ssdf.loc[:2], - tm.SubclassedSparseDataFrame(rows[:3])) - tm.assert_sp_frame_equal(ssdf.iloc[:2], - tm.SubclassedSparseDataFrame(rows[:2])) - tm.assert_sp_frame_equal(ssdf[:2], - tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf.loc[:2], tm.SubclassedSparseDataFrame(rows[:3])) + tm.assert_sp_frame_equal(ssdf.iloc[:2], tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf[:2], tm.SubclassedSparseDataFrame(rows[:2])) assert ssdf.loc[:2].testattr == "testattr" assert ssdf.iloc[:2].testattr == "testattr" assert ssdf[:2].testattr == "testattr" - tm.assert_sp_series_equal(ssdf.loc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False) - tm.assert_sp_series_equal(ssdf.iloc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False) + tm.assert_sp_series_equal( + ssdf.loc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False, + check_kind=False, + ) + tm.assert_sp_series_equal( + ssdf.iloc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False, + check_kind=False, + ) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_subclass_sparse_transpose(self): - ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], - [4, 5, 6]]) - essdf = tm.SubclassedSparseDataFrame([[1, 4], - [2, 5], - [3, 6]]) + ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]]) + essdf = tm.SubclassedSparseDataFrame([[1, 4], [2, 5], [3, 6]]) tm.assert_sp_frame_equal(ossdf.T, essdf) def test_subclass_stack(self): # GH 15564 - df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'b', 'c'], - columns=['X', 'Y', 'Z']) + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) res = df.stack() exp = tm.SubclassedSeries( - [1, 2, 3, 4, 5, 6, 7, 8, 9], - index=[list('aaabbbccc'), list('XYZXYZXYZ')]) + [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] + ) tm.assert_series_equal(res, exp) def test_subclass_stack_multi(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12, 13], - [20, 21, 22, 23], - [30, 31, 32, 33], - [40, 41, 42, 43]], + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 12], - [11, 13], - [20, 22], - [21, 23], - [30, 32], - [31, 33], - [40, 42], - [41, 43]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), - names=['aaa', 'ccc', 'yyy']), - columns=Index(['W', 'X'], name='www')) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43], + ], + index=MultiIndex.from_tuples( + 
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack('yyy') + res = df.stack("yyy") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 11], - [12, 13], - [20, 21], - [22, 23], - [30, 31], - [32, 33], - [40, 41], - [42, 43]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), - names=['aaa', 'ccc', 'www']), - columns=Index(['y', 'z'], name='yyy')) - - res = df.stack('www') + exp = tm.SubclassedDataFrame( + [ + [10, 11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_stack_multi_mixed(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12.0, 13.0], - [20, 21, 22.0, 23.0], - [30, 31, 32.0, 33.0], - [40, 41, 42.0, 43.0]], + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 12.0], - [11, 13.0], - [20, 22.0], - [21, 23.0], - [30, 32.0], - [31, 33.0], - [40, 42.0], - [41, 43.0]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), - names=['aaa', 'ccc', 'yyy']), - columns=Index(['W', 'X'], name='www')) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12.0], + [11, 13.0], + [20, 22.0], + [21, 23.0], + [30, 32.0], + [31, 33.0], + [40, 42.0], + [41, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack('yyy') + res = df.stack("yyy") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10.0, 11.0], - [12.0, 13.0], - [20.0, 21.0], - [22.0, 23.0], - [30.0, 31.0], - [32.0, 33.0], - [40.0, 41.0], - [42.0, 43.0]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), - names=['aaa', 'ccc', 'www']), - columns=Index(['y', 'z'], name='yyy')) - - res = df.stack('www') + exp = tm.SubclassedDataFrame( + [ + [10.0, 11.0], + [12.0, 13.0], + [20.0, 21.0], + [22.0, 23.0], + [30.0, 31.0], + [32.0, 33.0], + [40.0, 41.0], + [42.0, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): # GH 15564 - df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'b', 'c'], - columns=['X', 'Y', 'Z']) + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) res = df.unstack() exp = tm.SubclassedSeries( - [1, 4, 7, 2, 5, 8, 3, 6, 9], - 
index=[list('XXXYYYZZZ'), list('abcabcabc')]) + [1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")] + ) tm.assert_series_equal(res, exp) def test_subclass_unstack_multi(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12, 13], - [20, 21, 22, 23], - [30, 31, 32, 33], - [40, 41, 42, 43]], + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) - exp = tm.SubclassedDataFrame([ - [10, 20, 11, 21, 12, 22, 13, 23], - [30, 40, 31, 41, 32, 42, 33, 43]], - index=Index(['A', 'B'], name='aaa'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), - names=['www', 'yyy', 'ccc'])) + exp = tm.SubclassedDataFrame( + [[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) res = df.unstack() tm.assert_frame_equal(res, exp) - res = df.unstack('ccc') + res = df.unstack("ccc") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 30, 11, 31, 12, 32, 13, 33], - [20, 40, 21, 41, 22, 42, 23, 43]], - index=Index(['c', 'd'], name='ccc'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), - names=['www', 'yyy', 'aaa'])) + exp = tm.SubclassedDataFrame( + [[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) - res = df.unstack('aaa') + res = df.unstack("aaa") tm.assert_frame_equal(res, exp) def test_subclass_unstack_multi_mixed(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12.0, 13.0], - [20, 21, 22.0, 23.0], - [30, 31, 32.0, 33.0], - [40, 41, 42.0, 43.0]], + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], - [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]], - index=Index(['A', 'B'], name='aaa'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), - names=['www', 'yyy', 'ccc'])) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], + [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0], + ], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) res = df.unstack() tm.assert_frame_equal(res, exp) - res = df.unstack('ccc') + res = df.unstack("ccc") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 30, 11, 31, 
12.0, 32.0, 13.0, 33.0], - [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]], - index=Index(['c', 'd'], name='ccc'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), - names=['www', 'yyy', 'aaa'])) + exp = tm.SubclassedDataFrame( + [ + [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0], + [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0], + ], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) - res = df.unstack('aaa') + res = df.unstack("aaa") tm.assert_frame_equal(res, exp) def test_subclass_pivot(self): # GH 15564 - df = tm.SubclassedDataFrame({ - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.]}) + df = tm.SubclassedDataFrame( + { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + ) - pivoted = df.pivot( - index='index', columns='columns', values='values') + pivoted = df.pivot(index="index", columns="columns", values="values") - expected = tm.SubclassedDataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.}}) + expected = tm.SubclassedDataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(pivoted, expected) def test_subclassed_melt(self): # GH 15564 - cheese = tm.SubclassedDataFrame({ - 'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) - - melted = pd.melt(cheese, id_vars=['first', 'last']) - - expected = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 5.5], - ['Mary', 'Bo', 'height', 6.0], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + cheese = tm.SubclassedDataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) + + melted = pd.melt(cheese, id_vars=["first", "last"]) + + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) tm.assert_frame_equal(melted, expected) @@ -472,21 +513,26 @@ def test_subclassed_wide_to_long(self): np.random.seed(123) x = np.random.randn(3) - df = tm.SubclassedDataFrame({ - "A1970": {0: "a", 1: "b", 2: "c"}, - "A1980": {0: "d", 1: "e", 2: "f"}, - "B1970": {0: 2.5, 1: 1.2, 2: .7}, - "B1980": {0: 3.2, 1: 1.3, 2: .1}, - "X": dict(zip(range(3), x))}) + df = tm.SubclassedDataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = tm.SubclassedDataFrame(exp_data) - 
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(long_frame, expected) @@ -502,32 +548,34 @@ def strech(row): row["value"] += 0.5 return row - df = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 5.5], - ['Mary', 'Bo', 'height', 6.0], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + df = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) df.apply(lambda x: check_row_subclass(x)) df.apply(lambda x: check_row_subclass(x), axis=1) - expected = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 6.0], - ['Mary', 'Bo', 'height', 6.5], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 6.0], + ["Mary", "Bo", "height", 6.5], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) result = df.apply(lambda x: strech(x), axis=1) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) - expected = tm.SubclassedDataFrame([ - [1, 2, 3], - [1, 2, 3], - [1, 2, 3], - [1, 2, 3]]) + expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1) assert isinstance(result, tm.SubclassedDataFrame) @@ -537,11 +585,7 @@ def strech(row): assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) - expected = tm.SubclassedSeries([ - [1, 2, 3], - [1, 2, 3], - [1, 2, 3], - [1, 2, 3]]) + expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) result = df.apply(lambda x: [1, 2, 3], axis=1) assert not isinstance(result, tm.SubclassedDataFrame) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index bce9f70fdc20c8..92801b02dee224 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -7,12 +7,23 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, date_range, - period_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, assert_index_equal, assert_series_equal) + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets @@ -23,56 +34,71 @@ def close_open_fixture(request): class TestDataFrameTimeSeriesMethods(TestData): - def test_diff(self): the_diff = self.tsframe.diff(1) - assert_series_equal(the_diff['A'], - self.tsframe['A'] - self.tsframe['A'].shift(1)) + assert_series_equal( + the_diff["A"], self.tsframe["A"] - self.tsframe["A"].shift(1) + ) # int dtype a = 10000000000000000 b = a + 1 s = Series([a, b]) - rs = DataFrame({'s': s}).diff() + rs = DataFrame({"s": s}).diff() assert rs.s[1] == 1 # mixed numeric - tf = self.tsframe.astype('float32') + tf = self.tsframe.astype("float32") the_diff = tf.diff(1) - 
assert_series_equal(the_diff['A'], - tf['A'] - tf['A'].shift(1)) + assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) # issue 10907 - df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])}) - df.insert(0, 'x', 1) + df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) + df.insert(0, "x", 1) result = df.diff(axis=1) - expected = pd.DataFrame({'x': np.nan, 'y': pd.Series( - 1), 'z': pd.Series(1)}).astype('float64') + expected = pd.DataFrame( + {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)} + ).astype("float64") assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis0(self, tz): # GH 18578 - df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), - 1: date_range('2010', freq='D', periods=2, tz=tz)}) + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) result = df.diff(axis=0) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), - 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis1(self, tz): # GH 18578 - df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), - 1: date_range('2010', freq='D', periods=2, tz=tz)}) + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) if tz is None: result = df.diff(axis=1) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), - 1: pd.TimedeltaIndex(['0 days', - '0 days'])}) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) assert_frame_equal(result, expected) else: with pytest.raises(NotImplementedError): @@ -80,19 +106,22 @@ def test_diff_datetime_axis1(self, tz): def test_diff_timedelta(self): # GH 4533 - df = DataFrame(dict(time=[Timestamp('20130101 9:01'), - Timestamp('20130101 9:02')], - value=[1.0, 2.0])) + df = DataFrame( + dict( + time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + value=[1.0, 2.0], + ) + ) res = df.diff() - exp = DataFrame([[pd.NaT, np.nan], - [pd.Timedelta('00:01:00'), 1]], - columns=['time', 'value']) + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) assert_frame_equal(res, exp) def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) - df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) result = df.diff() assert result[0].dtype == np.float64 @@ -103,140 +132,136 @@ def test_diff_neg_n(self): assert_frame_equal(rs, xp) def test_diff_float_n(self): - rs = self.tsframe.diff(1.) 
+ rs = self.tsframe.diff(1.0) xp = self.tsframe.diff(1) assert_frame_equal(rs, xp) def test_diff_axis(self): # GH 9727 - df = DataFrame([[1., 2.], [3., 4.]]) - assert_frame_equal(df.diff(axis=1), DataFrame( - [[np.nan, 1.], [np.nan, 1.]])) - assert_frame_equal(df.diff(axis=0), DataFrame( - [[np.nan, np.nan], [2., 2.]])) + df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) + assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])) + assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])) def test_pct_change(self): rs = self.tsframe.pct_change(fill_method=None) assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) rs = self.tsframe.pct_change(2) - filled = self.tsframe.fillna(method='pad') + filled = self.tsframe.fillna(method="pad") assert_frame_equal(rs, filled / filled.shift(2) - 1) - rs = self.tsframe.pct_change(fill_method='bfill', limit=1) - filled = self.tsframe.fillna(method='bfill', limit=1) + rs = self.tsframe.pct_change(fill_method="bfill", limit=1) + filled = self.tsframe.fillna(method="bfill", limit=1) assert_frame_equal(rs, filled / filled.shift(1) - 1) - rs = self.tsframe.pct_change(freq='5D') - filled = self.tsframe.fillna(method='pad') - assert_frame_equal(rs, - (filled / filled.shift(freq='5D') - 1) - .reindex_like(filled)) + rs = self.tsframe.pct_change(freq="5D") + filled = self.tsframe.fillna(method="pad") + assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) def test_pct_change_shift_over_nas(self): - s = Series([1., 1.5, np.nan, 2.5, 3.]) + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - df = DataFrame({'a': s, 'b': s}) + df = DataFrame({"a": s, "b": s}) chg = df.pct_change() - expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) - edf = DataFrame({'a': expected, 'b': expected}) + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) assert_frame_equal(chg, edf) - @pytest.mark.parametrize("freq, periods, fill_method, limit", - [('5B', 5, None, None), - ('3B', 3, None, None), - ('3B', 3, 'bfill', None), - ('7B', 7, 'pad', 1), - ('7B', 7, 'bfill', 3), - ('14B', 14, None, None)]) + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.tsframe.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = self.tsframe.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = self.tsframe.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = self.tsframe.pct_change( + periods, fill_method=fill_method, limit=limit + ) assert_frame_equal(rs_freq, rs_periods) - empty_ts = DataFrame(index=self.tsframe.index, - columns=self.tsframe.columns) - rs_freq = empty_ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = empty_ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_frame_equal(rs_freq, rs_periods) def test_frame_ctor_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = 
date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) - df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) - assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]')) + df = DataFrame({"A": np.random.randn(len(rng)), "B": dates}) + assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]")) def test_frame_append_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") df = DataFrame(index=np.arange(len(rng))) - df['A'] = rng - assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]')) + df["A"] = rng + assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]")) def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({'year': date_range('1/1/1700', periods=50, - freq='A-DEC')}) + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) # it works! repr(df) def test_frame_append_datetime64_col_other_units(self): n = 100 - units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + units = ["h", "m", "s", "ms", "D", "M", "Y"] - ns_dtype = np.dtype('M8[ns]') + ns_dtype = np.dtype("M8[ns]") for unit in units: - dtype = np.dtype('M8[%s]' % unit) + dtype = np.dtype("M8[%s]" % unit) vals = np.arange(n, dtype=np.int64).view(dtype) - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) df[unit] = vals - ex_vals = to_datetime(vals.astype('O')).values + ex_vals = to_datetime(vals.astype("O")).values assert df[unit].dtype == ns_dtype assert (df[unit].values == ex_vals).all() # Test insertion into existing datetime64 column - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) - df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) for unit in units: - dtype = np.dtype('M8[%s]' % unit) + dtype = np.dtype("M8[%s]" % unit) vals = np.arange(n, dtype=np.int64).view(dtype) tmp = df.copy() - tmp['dates'] = vals - ex_vals = to_datetime(vals.astype('O')).values + tmp["dates"] = vals + ex_vals = to_datetime(vals.astype("O")).values - assert (tmp['dates'].values == ex_vals).all() + assert (tmp["dates"].values == ex_vals).all() def test_shift(self): # naive shift shiftedFrame = self.tsframe.shift(5) tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) - shiftedSeries = self.tsframe['A'].shift(5) - assert_series_equal(shiftedFrame['A'], shiftedSeries) + shiftedSeries = self.tsframe["A"].shift(5) + assert_series_equal(shiftedFrame["A"], shiftedSeries) shiftedFrame = self.tsframe.shift(-5) tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) - shiftedSeries = self.tsframe['A'].shift(-5) - assert_series_equal(shiftedFrame['A'], shiftedSeries) + shiftedSeries = self.tsframe["A"].shift(-5) + assert_series_equal(shiftedFrame["A"], shiftedSeries) # shift by 0 unshifted = self.tsframe.shift(0) @@ -246,13 +271,14 @@ def test_shift(self): shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay()) assert len(shiftedFrame) == len(self.tsframe) - shiftedFrame2 = self.tsframe.shift(5, freq='B') + shiftedFrame2 = self.tsframe.shift(5, freq="B") assert_frame_equal(shiftedFrame, shiftedFrame2) d = self.tsframe.index[0] shifted_d = d + offsets.BDay(5) - assert_series_equal(self.tsframe.xs(d), - shiftedFrame.xs(shifted_d), check_names=False) + assert_series_equal( + self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False + ) # shift int frame int_shifted = 
self.intframe.shift(1) # noqa @@ -263,72 +289,78 @@ def test_shift(self): unshifted = shifted.shift(-1) tm.assert_index_equal(shifted.index, ps.index) tm.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values, - ps.iloc[:-1, 0].values) + tm.assert_numpy_array_equal( + unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values + ) - shifted2 = ps.shift(1, 'B') + shifted2 = ps.shift(1, "B") shifted3 = ps.shift(1, offsets.BDay()) assert_frame_equal(shifted2, shifted3) - assert_frame_equal(ps, shifted2.shift(-1, 'B')) + assert_frame_equal(ps, shifted2.shift(-1, "B")) - msg = 'does not match PeriodIndex freq' + msg = "does not match PeriodIndex freq" with pytest.raises(ValueError, match=msg): - ps.shift(freq='D') + ps.shift(freq="D") # shift other axis # GH 6371 df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat([DataFrame(np.nan, index=df.index, - columns=[0]), - df.iloc[:, 0:-1]], - ignore_index=True, axis=1) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) result = df.shift(1, axis=1) assert_frame_equal(result, expected) # shift named axis df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat([DataFrame(np.nan, index=df.index, - columns=[0]), - df.iloc[:, 0:-1]], - ignore_index=True, axis=1) - result = df.shift(1, axis='columns') + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis="columns") assert_frame_equal(result, expected) def test_shift_bool(self): - df = DataFrame({'high': [True, False], - 'low': [False, False]}) + df = DataFrame({"high": [True, False], "low": [False, False]}) rs = df.shift(1) - xp = DataFrame(np.array([[np.nan, np.nan], - [True, False]], dtype=object), - columns=['high', 'low']) + xp = DataFrame( + np.array([[np.nan, np.nan], [True, False]], dtype=object), + columns=["high", "low"], + ) assert_frame_equal(rs, xp) def test_shift_categorical(self): # GH 9416 - s1 = pd.Series(['a', 'b', 'c'], dtype='category') - s2 = pd.Series(['A', 'B', 'C'], dtype='category') - df = DataFrame({'one': s1, 'two': s2}) + s1 = pd.Series(["a", "b", "c"], dtype="category") + s2 = pd.Series(["A", "B", "C"], dtype="category") + df = DataFrame({"one": s1, "two": s2}) rs = df.shift(1) - xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)}) + xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) assert_frame_equal(rs, xp) def test_shift_fill_value(self): # GH #24128 - df = DataFrame([1, 2, 3, 4, 5], - index=date_range('1/1/2000', periods=5, freq='H')) - exp = DataFrame([0, 1, 2, 3, 4], - index=date_range('1/1/2000', periods=5, freq='H')) + df = DataFrame( + [1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H") + ) + exp = DataFrame( + [0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H") + ) result = df.shift(1, fill_value=0) assert_frame_equal(result, exp) - exp = DataFrame([0, 0, 1, 2, 3], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = DataFrame( + [0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H") + ) result = df.shift(2, fill_value=0) assert_frame_equal(result, exp) def test_shift_empty(self): # Regression test for #8019 - df = DataFrame({'foo': []}) + df = DataFrame({"foo": []}) rs = df.shift(-1) assert_frame_equal(df, rs) @@ -349,7 +381,7 @@ def test_shift_duplicate_columns(self): # sanity check the base case nulls = shifted[0].isna().sum() - assert_series_equal(nulls, 
Series(range(1, 6), dtype='int64')) + assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) # check all answers are the same assert_frame_equal(shifted[0], shifted[1]) @@ -363,14 +395,14 @@ def test_tshift(self): assert_frame_equal(unshifted, ps) - shifted2 = ps.tshift(freq='B') + shifted2 = ps.tshift(freq="B") assert_frame_equal(shifted, shifted2) shifted3 = ps.tshift(freq=offsets.BDay()) assert_frame_equal(shifted, shifted3) - with pytest.raises(ValueError, match='does not match'): - ps.tshift(freq='M') + with pytest.raises(ValueError, match="does not match"): + ps.tshift(freq="M") # DatetimeIndex shifted = self.tsframe.tshift(1) @@ -381,9 +413,11 @@ def test_tshift(self): shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq) assert_frame_equal(shifted, shifted2) - inferred_ts = DataFrame(self.tsframe.values, - Index(np.asarray(self.tsframe.index)), - columns=self.tsframe.columns) + inferred_ts = DataFrame( + self.tsframe.values, + Index(np.asarray(self.tsframe.index)), + columns=self.tsframe.columns, + ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) assert_frame_equal(shifted, self.tsframe.tshift(1)) @@ -435,104 +469,113 @@ def test_truncate(self): msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" with pytest.raises(ValueError, match=msg): - ts.truncate(before=ts.index[-1] - ts.index.freq, - after=ts.index[0] + ts.index.freq) + ts.truncate( + before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq + ) def test_truncate_copy(self): index = self.tsframe.index truncated = self.tsframe.truncate(index[5], index[10]) - truncated.values[:] = 5. + truncated.values[:] = 5.0 assert not (self.tsframe.values[5:11] == 5).any() def test_truncate_nonsortedindex(self): # GH 17935 - df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']}, - index=[5, 3, 2, 9, 0]) - msg = 'truncate requires a sorted index' + df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): df.truncate(before=3, after=9) - rng = pd.date_range('2011-01-01', '2012-01-01', freq='W') - ts = pd.DataFrame({'A': np.random.randn(len(rng)), - 'B': np.random.randn(len(rng))}, - index=rng) - msg = 'truncate requires a sorted index' + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") + ts = pd.DataFrame( + {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng + ) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): - ts.sort_values('A', ascending=False).truncate(before='2011-11', - after='2011-12') - - df = pd.DataFrame({3: np.random.randn(5), - 20: np.random.randn(5), - 2: np.random.randn(5), - 0: np.random.randn(5)}, - columns=[3, 20, 2, 0]) - msg = 'truncate requires a sorted index' + ts.sort_values("A", ascending=False).truncate( + before="2011-11", after="2011-12" + ) + + df = pd.DataFrame( + { + 3: np.random.randn(5), + 20: np.random.randn(5), + 2: np.random.randn(5), + 0: np.random.randn(5), + }, + columns=[3, 20, 2, 0], + ) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): df.truncate(before=2, after=20, axis=1) def test_asfreq(self): offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd()) - rule_monthly = self.tsframe.asfreq('BM') + rule_monthly = self.tsframe.asfreq("BM") - tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A']) + tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"]) - filled = rule_monthly.asfreq('B', method='pad') # noqa + filled = 
rule_monthly.asfreq("B", method="pad") # noqa # TODO: actually check that this worked. # don't forget! - filled_dep = rule_monthly.asfreq('B', method='pad') # noqa + filled_dep = rule_monthly.asfreq("B", method="pad") # noqa # test does not blow up on length-0 DataFrame zero_length = self.tsframe.reindex([]) - result = zero_length.asfreq('BM') + result = zero_length.asfreq("BM") assert result is not zero_length def test_asfreq_datetimeindex(self): - df = DataFrame({'A': [1, 2, 3]}, - index=[datetime(2011, 11, 1), datetime(2011, 11, 2), - datetime(2011, 11, 3)]) - df = df.asfreq('B') + df = DataFrame( + {"A": [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)], + ) + df = df.asfreq("B") assert isinstance(df.index, DatetimeIndex) - ts = df['A'].asfreq('B') + ts = df["A"].asfreq("B") assert isinstance(ts.index, DatetimeIndex) def test_asfreq_fillvalue(self): # test for fill value during upsampling, related to issue 3715 # setup - rng = pd.date_range('1/1/2016', periods=10, freq='2S') + rng = pd.date_range("1/1/2016", periods=10, freq="2S") ts = pd.Series(np.arange(len(rng)), index=rng) - df = pd.DataFrame({'one': ts}) + df = pd.DataFrame({"one": ts}) # insert pre-existing missing value - df.loc['2016-01-01 00:00:08', 'one'] = None + df.loc["2016-01-01 00:00:08", "one"] = None - actual_df = df.asfreq(freq='1S', fill_value=9.0) - expected_df = df.asfreq(freq='1S').fillna(9.0) - expected_df.loc['2016-01-01 00:00:08', 'one'] = None + actual_df = df.asfreq(freq="1S", fill_value=9.0) + expected_df = df.asfreq(freq="1S").fillna(9.0) + expected_df.loc["2016-01-01 00:00:08", "one"] = None assert_frame_equal(expected_df, actual_df) - expected_series = ts.asfreq(freq='1S').fillna(9.0) - actual_series = ts.asfreq(freq='1S', fill_value=9.0) + expected_series = ts.asfreq(freq="1S").fillna(9.0) + actual_series = ts.asfreq(freq="1S", fill_value=9.0) assert_series_equal(expected_series, actual_series) - @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ - ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), - ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), - ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), - ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), - ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), - ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) - def test_first_last_valid(self, data, idx, - expected_first, expected_last): + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid(self, data, idx, expected_first, expected_last): N = len(self.frame.index) mat = np.random.randn(N) mat[:5] = np.nan mat[-5:] = np.nan - frame = DataFrame({'foo': mat}, index=self.frame.index) + frame = DataFrame({"foo": mat}, index=self.frame.index) index = frame.first_valid_index() assert index == frame.index[5] @@ -565,66 +608,66 @@ def test_first_last_valid(self, data, idx, assert expected_last == df.last_valid_index() def test_first_subset(self): - ts = tm.makeTimeDataFrame(freq='12h') - result = ts.first('10d') + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.first("10d") assert len(result) == 20 - ts = tm.makeTimeDataFrame(freq='D') - result = ts.first('10d') + ts = tm.makeTimeDataFrame(freq="D") + result = ts.first("10d") assert len(result) == 10 - result = ts.first('3M') 
- expected = ts[:'3/31/2000'] + result = ts.first("3M") + expected = ts[:"3/31/2000"] assert_frame_equal(result, expected) - result = ts.first('21D') + result = ts.first("21D") expected = ts[:21] assert_frame_equal(result, expected) - result = ts[:0].first('3M') + result = ts[:0].first("3M") assert_frame_equal(result, ts[:0]) def test_first_raises(self): # GH20725 df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError): # index is not a DatetimeIndex - df.first('1D') + df.first("1D") def test_last_subset(self): - ts = tm.makeTimeDataFrame(freq='12h') - result = ts.last('10d') + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.last("10d") assert len(result) == 20 - ts = tm.makeTimeDataFrame(nper=30, freq='D') - result = ts.last('10d') + ts = tm.makeTimeDataFrame(nper=30, freq="D") + result = ts.last("10d") assert len(result) == 10 - result = ts.last('21D') - expected = ts['2000-01-10':] + result = ts.last("21D") + expected = ts["2000-01-10":] assert_frame_equal(result, expected) - result = ts.last('21D') + result = ts.last("21D") expected = ts[-21:] assert_frame_equal(result, expected) - result = ts[:0].last('3M') + result = ts[:0].last("3M") assert_frame_equal(result, ts[:0]) def test_last_raises(self): # GH20725 df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError): # index is not a DatetimeIndex - df.last('1D') + df.last("1D") def test_at_time(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) rs = ts.at_time(rng[1]) assert (rs.index.hour == rng[1].hour).all() assert (rs.index.minute == rng[1].minute).all() assert (rs.index.second == rng[1].second).all() - result = ts.at_time('9:30') + result = ts.at_time("9:30") expected = ts.at_time(time(9, 30)) assert_frame_equal(result, expected) @@ -634,25 +677,26 @@ def test_at_time(self): assert_frame_equal(result, expected) # midnight, everything - rng = date_range('1/1/2000', '1/31/2000') + rng = date_range("1/1/2000", "1/31/2000") ts = DataFrame(np.random.randn(len(rng), 3), index=rng) result = ts.at_time(time(0, 0)) assert_frame_equal(result, ts) # time doesn't exist - rng = date_range('1/1/2012', freq='23Min', periods=384) + rng = date_range("1/1/2012", freq="23Min", periods=384) ts = DataFrame(np.random.randn(len(rng), 2), rng) - rs = ts.at_time('16:00') + rs = ts.at_time("16:00") assert len(rs) == 0 - @pytest.mark.parametrize('hour', ['1:00', '1:00AM', time(1), - time(1, tzinfo=pytz.UTC)]) + @pytest.mark.parametrize( + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + ) def test_at_time_errors(self, hour): # GH 24043 - dti = pd.date_range('2018', periods=3, freq='H') + dti = pd.date_range("2018", periods=3, freq="H") df = pd.DataFrame(list(range(len(dti))), index=dti) - if getattr(hour, 'tzinfo', None) is None: + if getattr(hour, "tzinfo", None) is None: result = df.at_time(hour) expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) @@ -662,9 +706,9 @@ def test_at_time_errors(self, hour): def test_at_time_tz(self): # GH 24043 - dti = pd.date_range('2018', periods=3, freq='H', tz='US/Pacific') + dti = pd.date_range("2018", periods=3, freq="H", tz="US/Pacific") df = pd.DataFrame(list(range(len(dti))), index=dti) - result = df.at_time(time(4, tzinfo=pytz.timezone('US/Eastern'))) + result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) @@ -672,27 +716,27 @@ def test_at_time_raises(self): # 
GH20725 df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError): # index is not a DatetimeIndex - df.at_time('00:00') + df.at_time("00:00") - @pytest.mark.parametrize('axis', ['index', 'columns', 0, 1]) + @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) def test_at_time_axis(self, axis): # issue 8839 - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), len(rng))) ts.index, ts.columns = rng, rng indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)] - if axis in ['index', 0]: + if axis in ["index", 0]: expected = ts.loc[indices, :] - elif axis in ['columns', 1]: + elif axis in ["columns", 1]: expected = ts.loc[:, indices] - result = ts.at_time('9:30', axis=axis) + result = ts.at_time("9:30", axis=axis) assert_frame_equal(result, expected) def test_between_time(self, close_open_fixture): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) stime = time(0, 0) etime = time(1, 0) @@ -718,12 +762,12 @@ def test_between_time(self, close_open_fixture): else: assert t < etime - result = ts.between_time('00:00', '01:00') + result = ts.between_time("00:00", "01:00") expected = ts.between_time(stime, etime) assert_frame_equal(result, expected) # across midnight - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) stime = time(22, 0) etime = time(9, 0) @@ -752,61 +796,60 @@ def test_between_time_raises(self): # GH20725 df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError): # index is not a DatetimeIndex - df.between_time(start_time='00:00', end_time='12:00') + df.between_time(start_time="00:00", end_time="12:00") def test_between_time_axis(self, axis): # issue 8839 - rng = date_range('1/1/2000', periods=100, freq='10min') + rng = date_range("1/1/2000", periods=100, freq="10min") ts = DataFrame(np.random.randn(len(rng), len(rng))) - stime, etime = ('08:00:00', '09:00:00') + stime, etime = ("08:00:00", "09:00:00") exp_len = 7 - if axis in ['index', 0]: + if axis in ["index", 0]: ts.index = rng assert len(ts.between_time(stime, etime)) == exp_len assert len(ts.between_time(stime, etime, axis=0)) == exp_len - if axis in ['columns', 1]: + if axis in ["columns", 1]: ts.columns = rng selected = ts.between_time(stime, etime, axis=1).columns assert len(selected) == exp_len def test_between_time_axis_raises(self, axis): # issue 8839 - rng = date_range('1/1/2000', periods=100, freq='10min') + rng = date_range("1/1/2000", periods=100, freq="10min") mask = np.arange(0, len(rng)) rand_data = np.random.randn(len(rng), len(rng)) ts = DataFrame(rand_data, index=rng, columns=rng) - stime, etime = ('08:00:00', '09:00:00') + stime, etime = ("08:00:00", "09:00:00") msg = "Index must be DatetimeIndex" - if axis in ['columns', 1]: + if axis in ["columns", 1]: ts.index = mask with pytest.raises(TypeError, match=msg): ts.between_time(stime, etime) with pytest.raises(TypeError, match=msg): ts.between_time(stime, etime, axis=0) - if axis in ['index', 0]: + if axis in ["index", 0]: ts.columns = mask with pytest.raises(TypeError, match=msg): ts.between_time(stime, etime, axis=1) def test_operation_on_NaT(self): # Both NaT and Timestamp are in DataFrame. 
- df = pd.DataFrame({'foo': [pd.NaT, pd.NaT, - pd.Timestamp('2012-05-01')]}) + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) res = df.min() - exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() - exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) # GH12941, only NaTs are in DataFrame. - df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]}) + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) res = df.min() exp = pd.Series([pd.NaT], index=["foo"]) @@ -818,34 +861,36 @@ def test_operation_on_NaT(self): def test_datetime_assignment_with_NaT_and_diff_time_units(self): # GH 7492 - data_ns = np.array([1, 'nat'], dtype='datetime64[ns]') + data_ns = np.array([1, "nat"], dtype="datetime64[ns]") result = pd.Series(data_ns).to_frame() - result['new'] = data_ns - expected = pd.DataFrame({0: [1, None], - 'new': [1, None]}, dtype='datetime64[ns]') + result["new"] = data_ns + expected = pd.DataFrame( + {0: [1, None], "new": [1, None]}, dtype="datetime64[ns]" + ) tm.assert_frame_equal(result, expected) # OutOfBoundsDatetime error shouldn't occur - data_s = np.array([1, 'nat'], dtype='datetime64[s]') - result['new'] = data_s - expected = pd.DataFrame({0: [1, None], - 'new': [1e9, None]}, dtype='datetime64[ns]') + data_s = np.array([1, "nat"], dtype="datetime64[s]") + result["new"] = data_s + expected = pd.DataFrame( + {0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]" + ) tm.assert_frame_equal(result, expected) def test_frame_to_period(self): K = 5 - dr = date_range('1/1/2000', '1/1/2001') - pr = period_range('1/1/2000', '1/1/2001') + dr = date_range("1/1/2000", "1/1/2001") + pr = period_range("1/1/2000", "1/1/2001") df = DataFrame(np.random.randn(len(dr), K), index=dr) - df['mix'] = 'a' + df["mix"] = "a" pts = df.to_period() exp = df.copy() exp.index = pr assert_frame_equal(pts, exp) - pts = df.to_period('M') - tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + pts = df.to_period("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) df = df.T pts = df.to_period(axis=1) @@ -853,55 +898,53 @@ def test_frame_to_period(self): exp.columns = pr assert_frame_equal(pts, exp) - pts = df.to_period('M', axis=1) - tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) + pts = df.to_period("M", axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): df.to_period(axis=2) - @pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert']) + @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"]) def test_tz_convert_and_localize(self, fn): - l0 = date_range('20140701', periods=5, freq='D') - l1 = date_range('20140701', periods=5, freq='D') + l0 = date_range("20140701", periods=5, freq="D") + l1 = date_range("20140701", periods=5, freq="D") int_idx = Index(range(5)) - if fn == 'tz_convert': - l0 = l0.tz_localize('UTC') - l1 = l1.tz_localize('UTC') + if fn == "tz_convert": + l0 = l0.tz_localize("UTC") + l1 = l1.tz_localize("UTC") for idx in [l0, l1]: - l0_expected = getattr(idx, fn)('US/Pacific') - l1_expected = getattr(idx, fn)('US/Pacific') + l0_expected = getattr(idx, fn)("US/Pacific") + l1_expected = getattr(idx, fn)("US/Pacific") df1 = DataFrame(np.ones(5), index=l0) - df1 = getattr(df1, 
fn)('US/Pacific') + df1 = getattr(df1, fn)("US/Pacific") assert_index_equal(df1.index, l0_expected) # MultiIndex # GH7846 df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) - df3 = getattr(df2, fn)('US/Pacific', level=0) + df3 = getattr(df2, fn)("US/Pacific", level=0) assert not df3.index.levels[0].equals(l0) assert_index_equal(df3.index.levels[0], l0_expected) assert_index_equal(df3.index.levels[1], l1) assert not df3.index.levels[1].equals(l1_expected) - df3 = getattr(df2, fn)('US/Pacific', level=1) + df3 = getattr(df2, fn)("US/Pacific", level=1) assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) assert_index_equal(df3.index.levels[1], l1_expected) assert not df3.index.levels[1].equals(l1) - df4 = DataFrame(np.ones(5), - MultiIndex.from_arrays([int_idx, l0])) + df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) # TODO: untested - df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa + df5 = getattr(df4, fn)("US/Pacific", level=1) # noqa assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) @@ -911,17 +954,16 @@ def test_tz_convert_and_localize(self, fn): # Bad Inputs # Not DatetimeIndex / PeriodIndex - with pytest.raises(TypeError, match='DatetimeIndex'): + with pytest.raises(TypeError, match="DatetimeIndex"): df = DataFrame(index=int_idx) - df = getattr(df, fn)('US/Pacific') + df = getattr(df, fn)("US/Pacific") # Not DatetimeIndex / PeriodIndex - with pytest.raises(TypeError, match='DatetimeIndex'): - df = DataFrame(np.ones(5), - MultiIndex.from_arrays([int_idx, l0])) - df = getattr(df, fn)('US/Pacific', level=0) + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + df = getattr(df, fn)("US/Pacific", level=0) # Invalid level - with pytest.raises(ValueError, match='not valid'): + with pytest.raises(ValueError, match="not valid"): df = DataFrame(index=l0) - df = getattr(df, fn)('US/Pacific', level=1) + df = getattr(df, fn)("US/Pacific", level=1) diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index b7c73daae00029..3e110a4b040da5 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -16,17 +16,18 @@ class TestDataFrameTimezones: - def test_frame_values_with_tz(self): tz = "US/Central" - df = DataFrame({"A": date_range('2000', periods=4, tz=tz)}) + df = DataFrame({"A": date_range("2000", periods=4, tz=tz)}) result = df.values - expected = np.array([ - [pd.Timestamp('2000-01-01', tz=tz)], - [pd.Timestamp('2000-01-02', tz=tz)], - [pd.Timestamp('2000-01-03', tz=tz)], - [pd.Timestamp('2000-01-04', tz=tz)], - ]) + expected = np.array( + [ + [pd.Timestamp("2000-01-01", tz=tz)], + [pd.Timestamp("2000-01-02", tz=tz)], + [pd.Timestamp("2000-01-03", tz=tz)], + [pd.Timestamp("2000-01-04", tz=tz)], + ] + ) tm.assert_numpy_array_equal(result, expected) # two columns, homogenous @@ -40,71 +41,78 @@ def test_frame_values_with_tz(self): est = "US/Eastern" df = df.assign(C=df.A.dt.tz_convert(est)) - new = np.array([ - [pd.Timestamp('2000-01-01T01:00:00', tz=est)], - [pd.Timestamp('2000-01-02T01:00:00', tz=est)], - [pd.Timestamp('2000-01-03T01:00:00', tz=est)], - [pd.Timestamp('2000-01-04T01:00:00', tz=est)], - ]) + new = np.array( + [ + [pd.Timestamp("2000-01-01T01:00:00", tz=est)], + [pd.Timestamp("2000-01-02T01:00:00", tz=est)], + [pd.Timestamp("2000-01-03T01:00:00", tz=est)], + [pd.Timestamp("2000-01-04T01:00:00", tz=est)], + ] + ) expected = 
np.concatenate([expected, new], axis=1) result = df.values tm.assert_numpy_array_equal(result, expected) def test_frame_from_records_utc(self): - rec = {'datum': 1.5, - 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} # it works - DataFrame.from_records([rec], index='begin_time') + DataFrame.from_records([rec], index="begin_time") def test_frame_tz_localize(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") - df = DataFrame({'a': 1}, index=rng) - result = df.tz_localize('utc') - expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) - assert result.index.tz.zone == 'UTC' + df = DataFrame({"a": 1}, index=rng) + result = df.tz_localize("utc") + expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) + assert result.index.tz.zone == "UTC" tm.assert_frame_equal(result, expected) df = df.T - result = df.tz_localize('utc', axis=1) - assert result.columns.tz.zone == 'UTC' + result = df.tz_localize("utc", axis=1) + assert result.columns.tz.zone == "UTC" tm.assert_frame_equal(result, expected.T) def test_frame_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") - df = DataFrame({'a': 1}, index=rng) - result = df.tz_convert('Europe/Berlin') - expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) - assert result.index.tz.zone == 'Europe/Berlin' + df = DataFrame({"a": 1}, index=rng) + result = df.tz_convert("Europe/Berlin") + expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + assert result.index.tz.zone == "Europe/Berlin" tm.assert_frame_equal(result, expected) df = df.T - result = df.tz_convert('Europe/Berlin', axis=1) - assert result.columns.tz.zone == 'Europe/Berlin' + result = df.tz_convert("Europe/Berlin", axis=1) + assert result.columns.tz.zone == "Europe/Berlin" tm.assert_frame_equal(result, expected.T) def test_frame_join_tzaware(self): - test1 = DataFrame(np.zeros((6, 3)), - index=date_range("2012-11-15 00:00:00", periods=6, - freq="100L", tz="US/Central")) - test2 = DataFrame(np.zeros((3, 3)), - index=date_range("2012-11-15 00:00:00", periods=3, - freq="250L", tz="US/Central"), - columns=range(3, 6)) - - result = test1.join(test2, how='outer') + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") ex_index = test1.index.union(test2.index) tm.assert_index_equal(result.index, ex_index) - assert result.index.tz.zone == 'US/Central' + assert result.index.tz.zone == "US/Central" def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a']) + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + df = DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"]) - df_moscow = df.tz_convert('Europe/Moscow') + df_moscow = df.tz_convert("Europe/Moscow") result = df + df_moscow assert result.index.tz is pytz.utc @@ -112,8 +120,8 @@ def test_frame_add_tz_mismatch_converts_to_utc(self): assert result.index.tz is pytz.utc def test_frame_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', 
tz='US/Eastern') - idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) new1, new2 = df1.align(df2) @@ -123,7 +131,7 @@ def test_frame_align_aware(self): # different timezones convert to UTC # frame with frame - df1_central = df1.tz_convert('US/Central') + df1_central = df1.tz_convert("US/Central") new1, new2 = df1.align(df1_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC @@ -137,72 +145,71 @@ def test_frame_align_aware(self): assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_no_datetime64_dtype(self, tz): # after GH#7822 # these retain the timezones on dict construction - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") dr_tz = dr.tz_localize(tz) - df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) - assert df['B'].dtype == tz_expected + df = DataFrame({"A": "foo", "B": dr_tz}, index=dr) + tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo) + assert df["B"].dtype == tz_expected # GH#2810 (with timezones) datetimes_naive = [ts.to_pydatetime() for ts in dr] datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({'dr': dr}) - df['dr_tz'] = dr_tz - df['datetimes_naive'] = datetimes_naive - df['datetimes_with_tz'] = datetimes_with_tz + df = DataFrame({"dr": dr}) + df["dr_tz"] = dr_tz + df["datetimes_naive"] = datetimes_naive + df["datetimes_with_tz"] = datetimes_with_tz result = df.dtypes - expected = Series([ - np.dtype('datetime64[ns]'), - DatetimeTZDtype(tz=tz), - np.dtype('datetime64[ns]'), - DatetimeTZDtype(tz=tz) - ], - index=['dr', 'dr_tz', 'datetimes_naive', 'datetimes_with_tz']) + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + ], + index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], + ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_reset_index(self, tz): - dr = date_range('2012-06-02', periods=10, tz=tz) + dr = date_range("2012-06-02", periods=10, tz=tz) df = DataFrame(np.random.randn(len(dr)), dr) - roundtripped = df.reset_index().set_index('index') + roundtripped = df.reset_index().set_index("index") xp = df.index.tz rs = roundtripped.index.tz assert xp == rs - @pytest.mark.parametrize('tz', [None, 'America/New_York']) + @pytest.mark.parametrize("tz", [None, "America/New_York"]) def test_boolean_compare_transpose_tzindex_with_dst(self, tz): # GH 19970 - idx = date_range('20161101', '20161130', freq='4H', tz=tz) - df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))}, - index=idx) + idx = date_range("20161101", "20161130", freq="4H", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) result = df.T == df.T - expected = DataFrame(True, index=list('ab'), columns=idx) + expected = DataFrame(True, index=list("ab"), columns=idx) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) - 
@pytest.mark.parametrize('method, tz', [ - ['tz_localize', None], - ['tz_convert', 'Europe/Berlin'] - ]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): # GH 6326 - result = DataFrame(np.arange(0, 5), - index=date_range('20131027', periods=5, - freq='1H', tz=tz)) - getattr(result, method)('UTC', copy=copy) - expected = DataFrame(np.arange(0, 5), - index=date_range('20131027', periods=5, - freq='1H', tz=tz)) + result = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) tm.assert_frame_equal(result, expected) def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): # GH 25843 tz = tz_aware_fixture - result = DataFrame({'d': [pd.Timestamp('2019', tz=tz)]}, - dtype='datetime64[ns]') - expected = DataFrame({'d': [pd.Timestamp('2019')]}) + result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") + expected = DataFrame({"d": [pd.Timestamp("2019")]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index cfe9e00a47db5c..33f29c6f8acb55 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -9,24 +9,42 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv, - to_datetime) + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + read_csv, + to_datetime, +) import pandas.core.common as com from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean, - makeCustomDataframe as mkdf) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + ensure_clean, + makeCustomDataframe as mkdf, +) from pandas.io.common import _get_handle -MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] -MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', - 'int32', 'int64'] +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] class TestDataFrameToCSV(TestData): - def read_csv(self, path, **kwargs): params = dict(index_col=0, parse_dates=True) params.update(**kwargs) @@ -35,11 +53,11 @@ def read_csv(self, path, **kwargs): def test_to_csv_from_csv1(self): - with ensure_clean('__tmp_to_csv_from_csv1__') as path: - self.frame['A'][:5] = np.nan + with ensure_clean("__tmp_to_csv_from_csv1__") as path: + self.frame["A"][:5] = np.nan self.frame.to_csv(path) - self.frame.to_csv(path, columns=['A', 'B']) + self.frame.to_csv(path, columns=["A", "B"]) self.frame.to_csv(path, header=False) self.frame.to_csv(path, index=False) @@ -48,10 +66,10 @@ def test_to_csv_from_csv1(self): recons = self.read_csv(path) assert_frame_equal(self.tsframe, recons) - self.tsframe.to_csv(path, index_label='index') + self.tsframe.to_csv(path, index_label="index") recons = self.read_csv(path, index_col=None) - assert(len(recons.columns) == len(self.tsframe.columns) + 1) + assert len(recons.columns) == len(self.tsframe.columns) + 1 # no index self.tsframe.to_csv(path, index=False) @@ -59,8 +77,12 
@@ def test_to_csv_from_csv1(self): assert_almost_equal(self.tsframe.values, recons.values) # corner case - dm = DataFrame({'s1': Series(range(3), index=np.arange(3)), - 's2': Series(range(2), index=np.arange(2))}) + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3)), + "s2": Series(range(2), index=np.arange(2)), + } + ) dm.to_csv(path) recons = self.read_csv(path) @@ -68,27 +90,25 @@ def test_to_csv_from_csv1(self): def test_to_csv_from_csv2(self): - with ensure_clean('__tmp_to_csv_from_csv2__') as path: + with ensure_clean("__tmp_to_csv_from_csv2__") as path: # duplicate index - df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'], - columns=['x', 'y', 'z']) + df = DataFrame( + np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"] + ) df.to_csv(path) result = self.read_csv(path) assert_frame_equal(result, df) - midx = MultiIndex.from_tuples( - [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)]) - df = DataFrame(np.random.randn(3, 3), index=midx, - columns=['x', 'y', 'z']) + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) df.to_csv(path) - result = self.read_csv(path, index_col=[0, 1, 2], - parse_dates=False) + result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df, check_names=False) # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) + col_aliases = Index(["AA", "X", "Y", "Z"]) self.frame2.to_csv(path, header=col_aliases) rs = self.read_csv(path) @@ -98,16 +118,16 @@ def test_to_csv_from_csv2(self): msg = "Writing 4 cols but got 2 aliases" with pytest.raises(ValueError, match=msg): - self.frame2.to_csv(path, header=['AA', 'X']) + self.frame2.to_csv(path, header=["AA", "X"]) def test_to_csv_from_csv3(self): - with ensure_clean('__tmp_to_csv_from_csv3__') as path: + with ensure_clean("__tmp_to_csv_from_csv3__") as path: df1 = DataFrame(np.random.randn(3, 1)) df2 = DataFrame(np.random.randn(3, 1)) df1.to_csv(path) - df2.to_csv(path, mode='a', header=False) + df2.to_csv(path, mode="a", header=False) xp = pd.concat([df1, df2]) rs = pd.read_csv(path, index_col=0) rs.columns = [int(label) for label in rs.columns] @@ -116,34 +136,38 @@ def test_to_csv_from_csv3(self): def test_to_csv_from_csv4(self): - with ensure_clean('__tmp_to_csv_from_csv4__') as path: + with ensure_clean("__tmp_to_csv_from_csv4__") as path: # GH 10833 (TimedeltaIndex formatting) dt = pd.Timedelta(seconds=1) - df = pd.DataFrame({'dt_data': [i * dt for i in range(3)]}, - index=pd.Index([i * dt for i in range(3)], - name='dt_index')) + df = pd.DataFrame( + {"dt_data": [i * dt for i in range(3)]}, + index=pd.Index([i * dt for i in range(3)], name="dt_index"), + ) df.to_csv(path) - result = pd.read_csv(path, index_col='dt_index') + result = pd.read_csv(path, index_col="dt_index") result.index = pd.to_timedelta(result.index) # TODO: remove renaming when GH 10875 is solved - result.index = result.index.rename('dt_index') - result['dt_data'] = pd.to_timedelta(result['dt_data']) + result.index = result.index.rename("dt_index") + result["dt_data"] = pd.to_timedelta(result["dt_data"]) assert_frame_equal(df, result, check_index_type=True) def test_to_csv_from_csv5(self): # tz, 8260 - with ensure_clean('__tmp_to_csv_from_csv5__') as path: + with ensure_clean("__tmp_to_csv_from_csv5__") as path: self.tzframe.to_csv(path) - result = pd.read_csv(path, index_col=0, parse_dates=['A']) - - converter = lambda c: to_datetime(result[c]).dt.tz_convert( - 
'UTC').dt.tz_convert(self.tzframe[c].dt.tz) - result['B'] = converter('B') - result['C'] = converter('C') + result = pd.read_csv(path, index_col=0, parse_dates=["A"]) + + converter = ( + lambda c: to_datetime(result[c]) + .dt.tz_convert("UTC") + .dt.tz_convert(self.tzframe[c].dt.tz) + ) + result["B"] = converter("B") + result["C"] = converter("C") assert_frame_equal(result, self.tzframe) def test_to_csv_cols_reordering(self): @@ -178,8 +202,7 @@ def _check_df(df, cols=None): if df.columns.is_unique: rs_c.columns = cols else: - indexer, missing = df.columns.get_indexer_non_unique( - cols) + indexer, missing = df.columns.get_indexer_non_unique(cols) rs_c.columns = df.columns.take(indexer) for c in cols: @@ -188,8 +211,7 @@ def _check_df(df, cols=None): if isinstance(obj_df, Series): assert_series_equal(obj_df, obj_rs) else: - assert_frame_equal( - obj_df, obj_rs, check_names=False) + assert_frame_equal(obj_df, obj_rs, check_names=False) # wrote in the same order else: @@ -201,11 +223,11 @@ def _check_df(df, cols=None): # dupe cols df = mkdf(N, 3) - df.columns = ['a', 'a', 'b'] + df.columns = ["a", "a", "b"] _check_df(df, None) # dupe cols with selection - cols = ['b', 'a'] + cols = ["b", "a"] _check_df(df, cols) @pytest.mark.slow @@ -216,7 +238,7 @@ def test_to_csv_dtnat(self): def make_dtnat_arr(n, nnat=None): if nnat is None: nnat = int(n * 0.1) # 10% - s = list(date_range('2000', freq='5min', periods=n)) + s = list(date_range("2000", freq="5min", periods=n)) if nnat: for i in np.random.randint(0, len(s), nnat): s[i] = NaT @@ -231,154 +253,204 @@ def make_dtnat_arr(n, nnat=None): s2 = make_dtnat_arr(chunksize + 5, 0) # s3=make_dtnjat_arr(chunksize+5,0) - with ensure_clean('1.csv') as pth: + with ensure_clean("1.csv") as pth: df = DataFrame(dict(a=s1, b=s2)) df.to_csv(pth, chunksize=chunksize) - recons = self.read_csv(pth)._convert(datetime=True, - coerce=True) - assert_frame_equal(df, recons, check_names=False, - check_less_precise=True) + recons = self.read_csv(pth)._convert(datetime=True, coerce=True) + assert_frame_equal(df, recons, check_names=False, check_less_precise=True) @pytest.mark.slow def test_to_csv_moar(self): - - def _do_test(df, r_dtype=None, c_dtype=None, - rnlvl=None, cnlvl=None, dupe_col=False): + def _do_test( + df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False + ): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: - kwargs['index_col'] = list(range(rnlvl)) - kwargs['header'] = list(range(cnlvl)) + kwargs["index_col"] = list(range(rnlvl)) + kwargs["header"] = list(range(cnlvl)) - with ensure_clean('__tmp_to_csv_moar__') as path: - df.to_csv(path, encoding='utf8', - chunksize=chunksize) + with ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: - kwargs['header'] = 0 + kwargs["header"] = 0 - with ensure_clean('__tmp_to_csv_moar__') as path: - df.to_csv(path, encoding='utf8', chunksize=chunksize) + with ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, str): - return x.decode('utf8') + return x.decode("utf8") return x + if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. 
monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: - delta_lvl = [recons.iloc[ - :, i].values for i in range(rnlvl - 1)] + delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix - recons = recons.iloc[:, rnlvl - 1:] + recons = recons.iloc[:, rnlvl - 1 :] - type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') + type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") if r_dtype: - if r_dtype == 'u': # unicode - r_dtype = 'O' + if r_dtype == "u": # unicode + r_dtype = "O" recons.index = np.array( - [_to_uni(label) for label in recons.index], - dtype=r_dtype) + [_to_uni(label) for label in recons.index], dtype=r_dtype + ) df.index = np.array( - [_to_uni(label) for label in df.index], dtype=r_dtype) - elif r_dtype == 'dt': # unicode - r_dtype = 'O' + [_to_uni(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "dt": # unicode + r_dtype = "O" recons.index = np.array( - [Timestamp(label) for label in recons.index], - dtype=r_dtype) + [Timestamp(label) for label in recons.index], dtype=r_dtype + ) df.index = np.array( - [Timestamp(label) for label in df.index], - dtype=r_dtype) - elif r_dtype == 'p': - r_dtype = 'O' + [Timestamp(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "p": + r_dtype = "O" idx_list = to_datetime(recons.index) recons.index = np.array( - [Timestamp(label) for label in idx_list], - dtype=r_dtype) + [Timestamp(label) for label in idx_list], dtype=r_dtype + ) df.index = np.array( - list(map(Timestamp, df.index.to_timestamp())), - dtype=r_dtype) + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype + ) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: - if c_dtype == 'u': - c_dtype = 'O' + if c_dtype == "u": + c_dtype = "O" recons.columns = np.array( - [_to_uni(label) for label in recons.columns], - dtype=c_dtype) + [_to_uni(label) for label in recons.columns], dtype=c_dtype + ) df.columns = np.array( - [_to_uni(label) for label in df.columns], - dtype=c_dtype) - elif c_dtype == 'dt': - c_dtype = 'O' + [_to_uni(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "dt": + c_dtype = "O" recons.columns = np.array( - [Timestamp(label) for label in recons.columns], - dtype=c_dtype) + [Timestamp(label) for label in recons.columns], dtype=c_dtype + ) df.columns = np.array( - [Timestamp(label) for label in df.columns], - dtype=c_dtype) - elif c_dtype == 'p': - c_dtype = 'O' + [Timestamp(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "p": + c_dtype = "O" col_list = to_datetime(recons.columns) recons.columns = np.array( - [Timestamp(label) for label in col_list], - dtype=c_dtype) + [Timestamp(label) for label in col_list], dtype=c_dtype + ) col_list = df.columns.to_timestamp() df.columns = np.array( - [Timestamp(label) for label in col_list], - dtype=c_dtype) + [Timestamp(label) for label in col_list], dtype=c_dtype + ) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) - assert_frame_equal(df, recons, check_names=False, - check_less_precise=True) + assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 
2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type='dt', - c_idx_type='s'), 'dt', 's') + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") for ncols in [4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type='dt', - c_idx_type='s'), 'dt', 's') + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") pass - for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), - ('p', 'p')]: + for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type, - c_idx_type=c_idx_type), - r_idx_type, c_idx_type) + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test( + mkdf( + nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ), + r_idx_type, + c_idx_type, + ) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: @@ -397,20 +469,36 @@ def _to_uni(x): _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) - for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) - _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), - rnlvl=2, cnlvl=2) + _do_test( + mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), + rnlvl=2, + cnlvl=2, + ) def test_to_csv_from_csv_w_some_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame['G'] = np.nan - f = lambda x: [np.inf, np.nan][np.random.rand() < .5] - self.frame['H'] = self.frame.index.map(f) + self.frame["G"] = np.nan + f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5] + self.frame["H"] = self.frame.index.map(f) with ensure_clean() as path: self.frame.to_csv(path) @@ -418,14 +506,15 @@ def test_to_csv_from_csv_w_some_infs(self): # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) - assert_frame_equal(np.isinf(self.frame), - np.isinf(recons), check_names=False) + assert_frame_equal( + np.isinf(self.frame), np.isinf(recons), check_names=False + ) def 
test_to_csv_from_csv_w_all_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame['E'] = np.inf - self.frame['F'] = -np.inf + self.frame["E"] = np.inf + self.frame["F"] = -np.inf with ensure_clean() as path: self.frame.to_csv(path) @@ -433,17 +522,18 @@ def test_to_csv_from_csv_w_all_infs(self): # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) - assert_frame_equal(np.isinf(self.frame), - np.isinf(recons), check_names=False) + assert_frame_equal( + np.isinf(self.frame), np.isinf(recons), check_names=False + ) def test_to_csv_no_index(self): # GH 3624, after appending columns, to_csv fails - with ensure_clean('__tmp_to_csv_no_index__') as path: - df = DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6]}) + with ensure_clean("__tmp_to_csv_no_index__") as path: + df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]}) df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) - df['c3'] = Series([7, 8, 9], dtype='int64') + df["c3"] = Series([7, 8, 9], dtype="int64") df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) @@ -452,23 +542,22 @@ def test_to_csv_with_mix_columns(self): # gh-11637: incorrect output when a mix of integer and string column # names passed as columns parameter in to_csv - df = DataFrame({0: ['a', 'b', 'c'], - 1: ['aa', 'bb', 'cc']}) - df['test'] = 'txt' - assert df.to_csv() == df.to_csv(columns=[0, 1, 'test']) + df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]}) + df["test"] = "txt" + assert df.to_csv() == df.to_csv(columns=[0, 1, "test"]) def test_to_csv_headers(self): # GH6186, the presence or absence of `index` incorrectly # causes to_csv to have different header semantics. - from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y']) - with ensure_clean('__tmp_to_csv_headers__') as path: - from_df.to_csv(path, header=['X', 'Y']) + from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"]) + with ensure_clean("__tmp_to_csv_headers__") as path: + from_df.to_csv(path, header=["X", "Y"]) recons = self.read_csv(path) assert_frame_equal(to_df, recons) - from_df.to_csv(path, index=False, header=['X', 'Y']) + from_df.to_csv(path, index=False, header=["X", "Y"]) recons = self.read_csv(path) recons.reset_index(inplace=True) @@ -479,19 +568,18 @@ def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: frame.to_csv(path, header=False) - frame.to_csv(path, columns=['A', 'B']) + frame.to_csv(path, columns=["A", "B"]) # round trip frame.to_csv(path) - df = self.read_csv(path, index_col=[0, 1], - parse_dates=False) + df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) @@ -506,7 +594,7 @@ def test_to_csv_multiindex(self): new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) - tsframe.to_csv(path, index_label=['time', 'foo']) + tsframe.to_csv(path, index_label=["time", "foo"]) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name @@ 
-525,38 +613,36 @@ def test_to_csv_multiindex(self): # needed if setUp becomes class method self.tsframe.index = old_index - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: - names = ['first', 'second'] - return DataFrame(np.random.randint(0, 10, size=(3, 3)), - columns=MultiIndex.from_tuples( - [('bah', 'foo'), - ('bah', 'bar'), - ('ban', 'baz')], names=names), - dtype='int64') + names = ["first", "second"] + return DataFrame( + np.random.randint(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names + ), + dtype="int64", + ) # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], - index_col=[0, 1]) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path) - result = read_csv( - path, header=[0, 1, 2, 3], index_col=0) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], - index_col=[0, 1, 2]) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) assert_frame_equal(df, result) # writing with no index @@ -576,14 +662,12 @@ def _make_frame(names=None): # whatsnew example df = _make_frame() df.to_csv(path) - result = read_csv(path, header=[0, 1], - index_col=[0]) + result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path) - result = read_csv(path, header=[0, 1], - index_col=[0]) + result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) # invalid options @@ -591,16 +675,16 @@ def _make_frame(names=None): df.to_csv(path) for i in [6, 7]: - msg = 'len of {i}, but only 5 lines in file'.format(i=i) + msg = "len of {i}, but only 5 lines in file".format(i=i) with pytest.raises(ParserError, match=msg): read_csv(path, header=list(range(i)), index_col=0) # write with cols - msg = 'cannot specify cols with a MultiIndex' + msg = "cannot specify cols with a MultiIndex" with pytest.raises(TypeError, match=msg): - df.to_csv(path, columns=['foo', 'bar']) + df.to_csv(path, columns=["foo", "bar"]) - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) @@ -615,38 +699,40 @@ def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan - with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path: + with ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: df.to_csv(path, na_rep=999) with open(path) as f: lines = f.readlines() - assert lines[1].split(',')[2] == '999' + assert lines[1].split(",")[2] == "999" def test_to_csv_withcommas(self): # Commas inside fields should be correctly escaped when saving as CSV. 
- df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']}) + df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) - with ensure_clean('__tmp_to_csv_withcommas__.csv') as path: + with ensure_clean("__tmp_to_csv_withcommas__.csv") as path: df.to_csv(path) df2 = self.read_csv(path) assert_frame_equal(df2, df) def test_to_csv_mixed(self): - def create_cols(name): return ["%s%03d" % (name, i) for i in range(5)] - df_float = DataFrame(np.random.randn( - 100, 5), dtype='float64', columns=create_cols('float')) - df_int = DataFrame(np.random.randn(100, 5), - dtype='int64', columns=create_cols('int')) - df_bool = DataFrame(True, index=df_float.index, - columns=create_cols('bool')) - df_object = DataFrame('foo', index=df_float.index, - columns=create_cols('object')) - df_dt = DataFrame(Timestamp('20010101'), - index=df_float.index, columns=create_cols('date')) + df_float = DataFrame( + np.random.randn(100, 5), dtype="float64", columns=create_cols("float") + ) + df_int = DataFrame( + np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + ) + df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) + df_object = DataFrame( + "foo", index=df_float.index, columns=create_cols("object") + ) + df_dt = DataFrame( + Timestamp("20010101"), index=df_float.index, columns=create_cols("date") + ) # add in some nans df_float.loc[30:50, 1:3] = np.nan @@ -658,22 +744,29 @@ def create_cols(name): # dtype dtypes = dict() - for n, dtype in [('float', np.float64), ('int', np.int64), - ('bool', np.bool), ('object', np.object)]: + for n, dtype in [ + ("float", np.float64), + ("int", np.int64), + ("bool", np.bool), + ("object", np.object), + ]: for c in create_cols(n): dtypes[c] = dtype with ensure_clean() as filename: df.to_csv(filename) - rs = read_csv(filename, index_col=0, dtype=dtypes, - parse_dates=create_cols('date')) + rs = read_csv( + filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") + ) assert_frame_equal(rs, df) def test_to_csv_dups_cols(self): - df = DataFrame(np.random.randn(1000, 30), - columns=list(range(15)) + list(range(15)), - dtype='float64') + df = DataFrame( + np.random.randn(1000, 30), + columns=list(range(15)) + list(range(15)), + dtype="float64", + ) with ensure_clean() as filename: df.to_csv(filename) # single dtype, fine @@ -681,14 +774,14 @@ def test_to_csv_dups_cols(self): result.columns = df.columns assert_frame_equal(result, df) - df_float = DataFrame(np.random.randn(1000, 3), dtype='float64') - df_int = DataFrame(np.random.randn(1000, 3), dtype='int64') + df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") + df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) - df_object = DataFrame('foo', index=df_float.index, columns=range(3)) - df_dt = DataFrame(Timestamp('20010101'), - index=df_float.index, columns=range(3)) - df = pd.concat([df_float, df_int, df_bool, df_object, - df_dt], axis=1, ignore_index=True) + df_object = DataFrame("foo", index=df_float.index, columns=range(3)) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) + df = pd.concat( + [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True + ) cols = [] for i in range(5): @@ -700,7 +793,7 @@ def test_to_csv_dups_cols(self): result = read_csv(filename, index_col=0) # date cols - for i in ['0.4', '1.4', '2.4']: + for i in ["0.4", "1.4", "2.4"]: result[i] = to_datetime(result[i]) result.columns = df.columns @@ -711,22 +804,22 @@ def 
test_to_csv_dups_cols(self): N = 10 df = mkdf(N, 3) - df.columns = ['a', 'a', 'b'] + df.columns = ["a", "a", "b"] with ensure_clean() as filename: df.to_csv(filename) # read_csv will rename the dups columns result = read_csv(filename, index_col=0) - result = result.rename(columns={'a.1': 'a'}) + result = result.rename(columns={"a.1": "a"}) assert_frame_equal(result, df) def test_to_csv_chunking(self): - aa = DataFrame({'A': range(100000)}) - aa['B'] = aa.A + 1.0 - aa['C'] = aa.A + 2.0 - aa['D'] = aa.A + 3.0 + aa = DataFrame({"A": range(100000)}) + aa["B"] = aa.A + 1.0 + aa["C"] = aa.A + 2.0 + aa["D"] = aa.A + 3.0 for chunksize in [10000, 50000, 100000]: with ensure_clean() as filename: @@ -744,9 +837,9 @@ def test_to_csv_wide_frame_formatting(self): assert_frame_equal(rs, df) def test_to_csv_bug(self): - f1 = StringIO('a,1.0\nb,2.0') + f1 = StringIO("a,1.0\nb,2.0") df = self.read_csv(f1, header=None) - newdf = DataFrame({'t': df[df.columns[0]]}) + newdf = DataFrame({"t": df[df.columns[0]]}) with ensure_clean() as path: newdf.to_csv(path) @@ -757,29 +850,29 @@ def test_to_csv_bug(self): def test_to_csv_unicode(self): - df = DataFrame({'c/\u03c3': [1, 2, 3]}) + df = DataFrame({"c/\u03c3": [1, 2, 3]}) with ensure_clean() as path: - df.to_csv(path, encoding='UTF-8') - df2 = read_csv(path, index_col=0, encoding='UTF-8') + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") assert_frame_equal(df, df2) - df.to_csv(path, encoding='UTF-8', index=False) - df2 = read_csv(path, index_col=None, encoding='UTF-8') + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") assert_frame_equal(df, df2) def test_to_csv_unicode_index_col(self): - buf = StringIO('') + buf = StringIO("") df = DataFrame( [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], - columns=["\u05d0", - "\u05d1", "\u05d2", "\u05d3"], - index=["\u05d0", "\u05d1"]) + columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], + index=["\u05d0", "\u05d1"], + ) - df.to_csv(buf, encoding='UTF-8') + df.to_csv(buf, encoding="UTF-8") buf.seek(0) - df2 = read_csv(buf, index_col=0, encoding='UTF-8') + df2 = read_csv(buf, index_col=0, encoding="UTF-8") assert_frame_equal(df, df2) def test_to_csv_stringio(self): @@ -792,93 +885,93 @@ def test_to_csv_stringio(self): def test_to_csv_float_format(self): - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) with ensure_clean() as filename: - df.to_csv(filename, float_format='%.2f') + df.to_csv(filename, float_format="%.2f") rs = read_csv(filename, index_col=0) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) assert_frame_equal(rs, xp) def test_to_csv_unicodewriter_quoting(self): - df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) + df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]}) buf = StringIO() - df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, - encoding='utf-8') + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8") result = buf.getvalue() - expected_rows = ['"A","B"', - '1,"foo"', - '2,"bar"', - '3,"baz"'] + expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'] 
expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_to_csv_quote_none(self): # GH4328 - df = DataFrame({'A': ['hello', '{"hello"}']}) - for encoding in (None, 'utf-8'): + df = DataFrame({"A": ["hello", '{"hello"}']}) + for encoding in (None, "utf-8"): buf = StringIO() - df.to_csv(buf, quoting=csv.QUOTE_NONE, - encoding=encoding, index=False) + df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) result = buf.getvalue() - expected_rows = ['A', - 'hello', - '{"hello"}'] + expected_rows = ["A", "hello", '{"hello"}'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_to_csv_index_no_leading_comma(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['one', 'two', 'three']) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) buf = StringIO() df.to_csv(buf, index_label=False) - expected_rows = ['A,B', - 'one,1,4', - 'two,2,5', - 'three,3,6'] + expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert buf.getvalue() == expected def test_to_csv_line_terminators(self): # see gh-20353 - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['one', 'two', 'three']) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) with ensure_clean() as path: # case 1: CRLF as line terminator - df.to_csv(path, line_terminator='\r\n') - expected = b',A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n' + df.to_csv(path, line_terminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" - with open(path, mode='rb') as f: + with open(path, mode="rb") as f: assert f.read() == expected with ensure_clean() as path: # case 2: LF as line terminator - df.to_csv(path, line_terminator='\n') - expected = b',A,B\none,1,4\ntwo,2,5\nthree,3,6\n' + df.to_csv(path, line_terminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" - with open(path, mode='rb') as f: + with open(path, mode="rb") as f: assert f.read() == expected with ensure_clean() as path: # case 3: The default line terminator(=os.linesep)(gh-21406) df.to_csv(path) - os_linesep = os.linesep.encode('utf-8') - expected = (b',A,B' + os_linesep + b'one,1,4' + os_linesep + - b'two,2,5' + os_linesep + b'three,3,6' + os_linesep) - - with open(path, mode='rb') as f: + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: assert f.read() == expected def test_to_csv_from_csv_categorical(self): @@ -915,95 +1008,116 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - @pytest.mark.parametrize('df,encoding', [ - (DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']), None), - # GH 21241, 21118 - (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), - (DataFrame(5 * [[123, "你好", "世界"]], - columns=['X', 'Y', 'Z']), 'gb2312'), - (DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], - columns=['X', 'Y', 'Z']), 'cp737') - ]) + @pytest.mark.parametrize( + "df,encoding", + [ + ( + DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ), + None, + ), + # GH 21241, 21118 + (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"), + 
(DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"), + ( + DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]), + "cp737", + ), + ], + ) def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv - result = read_csv(filename, compression=compression, - index_col=0, encoding=encoding) + result = read_csv( + filename, compression=compression, index_col=0, encoding=encoding + ) assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle(filename, 'w', compression=compression, - encoding=encoding) + f, _handles = _get_handle( + filename, "w", compression=compression, encoding=encoding + ) with f: df.to_csv(f, encoding=encoding) - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_frame_equal(df, result) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or 'utf8') + text = fh.read().decode(encoding or "utf8") for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: - assert_frame_equal(df, read_csv(fh, - index_col=0, - encoding=encoding)) + assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) def test_to_csv_date_format(self): - with ensure_clean('__tmp_to_csv_date_format__') as path: + with ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = self.tsframe.index datetime_frame = DataFrame( - {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) - datetime_frame.to_csv(path, date_format='%Y%m%d') + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_int = datetime_frame.applymap( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) datetime_frame_int.index = datetime_frame_int.index.map( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format='%Y-%m-%d') + datetime_frame.to_csv(path, date_format="%Y-%m-%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_str = datetime_frame.applymap( - lambda x: x.strftime('%Y-%m-%d')) + lambda x: x.strftime("%Y-%m-%d") + ) datetime_frame_str.index = datetime_frame_str.index.map( - lambda x: x.strftime('%Y-%m-%d')) + lambda x: x.strftime("%Y-%m-%d") + ) assert_frame_equal(test, datetime_frame_str) # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format='%Y%m%d') + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") test = read_csv(path, index_col=0) datetime_frame_columns = datetime_frame_columns.applymap( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) # Columns don't get converted to ints by read_csv - datetime_frame_columns.columns = ( - datetime_frame_columns.columns - .map(lambda x: x.strftime('%Y%m%d'))) + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) assert_frame_equal(test, 
datetime_frame_columns) # test NaTs nat_index = to_datetime( - ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) - nat_frame = DataFrame({'A': nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format='%Y-%m-%d') + ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") test = read_csv(path, parse_dates=[0, 1], index_col=0) @@ -1011,57 +1125,65 @@ def test_to_csv_date_format(self): def test_to_csv_with_dst_transitions(self): - with ensure_clean('csv_date_format_with_dst') as path: + with ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions - times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous='infer') - - for i in [times, times + pd.Timedelta('10s')]: - time_range = np.array(range(len(i)), dtype='int64') - df = DataFrame({'A': time_range}, index=i) + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10s")]: + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) df.to_csv(path, index=True) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) result.index = to_datetime(result.index, utc=True).tz_convert( - 'Europe/London') + "Europe/London" + ) assert_frame_equal(result, df) # GH11619 - idx = pd.date_range('2015-01-01', '2015-12-31', - freq='H', tz='Europe/Paris') - df = DataFrame({'values': 1, 'idx': idx}, - index=idx) - with ensure_clean('csv_date_format_with_dst') as path: + idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + df = DataFrame({"values": 1, "idx": idx}, index=idx) + with ensure_clean("csv_date_format_with_dst") as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) result.index = to_datetime(result.index, utc=True).tz_convert( - 'Europe/Paris') - result['idx'] = to_datetime(result['idx'], utc=True).astype( - 'datetime64[ns, Europe/Paris]') + "Europe/Paris" + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) assert_frame_equal(result, df) # assert working df.astype(str) - with ensure_clean('csv_date_format_with_dst') as path: + with ensure_clean("csv_date_format_with_dst") as path: df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) def test_to_csv_quoting(self): - df = DataFrame({ - 'c_bool': [True, False], - 'c_float': [1.0, 3.2], - 'c_int': [42, np.nan], - 'c_string': ['a', 'b,c'], - }) - - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,"b,c"'] + df = DataFrame( + { + "c_bool": [True, False], + "c_float": [1.0, 3.2], + "c_int": [42, np.nan], + "c_string": ["a", "b,c"], + } + ) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv() @@ -1070,17 +1192,21 @@ def test_to_csv_quoting(self): result = df.to_csv(quoting=None) assert result == expected - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,"b,c"'] + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_MINIMAL) assert 
result == expected - expected_rows = ['"","c_bool","c_float","c_int","c_string"', - '"0","True","1.0","42.0","a"', - '"1","False","3.2","","b,c"'] + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '"0","True","1.0","42.0","a"', + '"1","False","3.2","","b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_ALL) @@ -1088,9 +1214,11 @@ def test_to_csv_quoting(self): # see gh-12922, gh-13259: make sure changes to # the formatters do not break this behaviour - expected_rows = ['"","c_bool","c_float","c_int","c_string"', - '0,True,1.0,42.0,"a"', - '1,False,3.2,"","b,c"'] + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '0,True,1.0,42.0,"a"', + '1,False,3.2,"","b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) assert result == expected @@ -1102,41 +1230,40 @@ def test_to_csv_quoting(self): with pytest.raises(csv.Error, match=msg): df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None) - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,b!,c'] + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,b!,c", + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(quoting=csv.QUOTE_NONE, - escapechar='!') + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!") assert result == expected - expected_rows = [',c_bool,c_ffloat,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,bf,c'] + expected_rows = [ + ",c_bool,c_ffloat,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,bf,c", + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(quoting=csv.QUOTE_NONE, - escapechar='f') + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f") assert result == expected # see gh-3503: quoting Windows line terminators # presents with encoding? 
- text_rows = ['a,b,c', - '1,"test \r\n",3'] + text_rows = ["a,b,c", '1,"test \r\n",3'] text = tm.convert_rows_list_to_csv_str(text_rows) df = pd.read_csv(StringIO(text)) buf = StringIO() - df.to_csv(buf, encoding='utf-8', index=False) + df.to_csv(buf, encoding="utf-8", index=False) assert buf.getvalue() == text # xref gh-7791: make sure the quoting parameter is passed through # with multi-indexes - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - df = df.set_index(['a', 'b']) + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = df.set_index(["a", "b"]) - expected_rows = ['"a","b","c"', - '"1","3","5"', - '"2","4","6"'] + expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(quoting=csv.QUOTE_ALL) == expected @@ -1149,20 +1276,14 @@ def test_period_index_date_overflow(self): df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() - expected_rows = [',0', - '1990-01-01,4', - '2000-01-01,5', - '3005-01-01,6'] + expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected date_format = "%m-%d-%Y" result = df.to_csv(date_format=date_format) - expected_rows = [',0', - '01-01-1990,4', - '01-01-2000,5', - '01-01-3005,6'] + expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected @@ -1173,26 +1294,20 @@ def test_period_index_date_overflow(self): df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() - expected_rows = [',0', - '1990-01-01,4', - ',5', - '3005-01-01,6'] + expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_multi_index_header(self): # see gh-5539 - columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), - ("b", 1), ("b", 2)]) + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) df.columns = columns header = ["a", "b", "c", "d"] result = df.to_csv(header=header) - expected_rows = [',a,b,c,d', - '0,1,2,3,4', - '1,5,6,7,8'] + expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected @@ -1202,17 +1317,17 @@ def test_to_csv_single_level_multi_index(self): df = pd.DataFrame([[1, 2, 3]], columns=index) df = df.reindex(columns=[(1,), (3,)]) expected = ",1,3\n0,1,3\n" - result = df.to_csv(line_terminator='\n') + result = df.to_csv(line_terminator="\n") assert_almost_equal(result, expected) def test_gz_lineend(self): # GH 25311 - df = pd.DataFrame({'a': [1, 2]}) - expected_rows = ['a', '1', '2'] + df = pd.DataFrame({"a": [1, 2]}) + expected_rows = ["a", "1", "2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - with ensure_clean('__test_gz_lineend.csv.gz') as path: + with ensure_clean("__test_gz_lineend.csv.gz") as path: df.to_csv(path, index=False) - with tm.decompress_file(path, compression='gzip') as f: - result = f.read().decode('utf-8') + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") assert result == expected diff --git a/pandas/tests/frame/test_validate.py b/pandas/tests/frame/test_validate.py index 8597d91550c779..c7270322b980cd 100644 --- a/pandas/tests/frame/test_validate.py +++ b/pandas/tests/frame/test_validate.py @@ -5,18 +5,27 @@ @pytest.fixture def 
dataframe(): - return DataFrame({'a': [1, 2], 'b': [3, 4]}) + return DataFrame({"a": [1, 2], "b": [3, 4]}) class TestDataFrameValidate: """Tests for error handling related to data types of method arguments.""" - @pytest.mark.parametrize("func", ["query", "eval", "set_index", - "reset_index", "dropna", - "drop_duplicates", "sort_values"]) + @pytest.mark.parametrize( + "func", + [ + "query", + "eval", + "set_index", + "reset_index", + "dropna", + "drop_duplicates", + "sort_values", + ], + ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, dataframe, func, inplace): - msg = "For argument \"inplace\" expected type bool" + msg = 'For argument "inplace" expected type bool' kwargs = dict(inplace=inplace) if func == "query": diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 4646c7c9196a3f..ff9895cf143180 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -11,12 +11,16 @@ from pandas import DataFrame, MultiIndex, Series, date_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from .test_generic import Generic try: import xarray + _XARRAY_INSTALLED = True except ImportError: _XARRAY_INSTALLED = False @@ -27,32 +31,33 @@ class TestDataFrame(Generic): _comparator = lambda self, x, y: assert_frame_equal(x, y) def test_rename_mi(self): - df = DataFrame([ - 11, 21, 31 - ], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]])) + df = DataFrame( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) df.rename(str.lower) def test_set_axis_name(self): df = pd.DataFrame([[1, 2], [3, 4]]) - funcs = ['_set_axis_name', 'rename_axis'] + funcs = ["_set_axis_name", "rename_axis"] for func in funcs: - result = methodcaller(func, 'foo')(df) + result = methodcaller(func, "foo")(df) assert df.index.name is None - assert result.index.name == 'foo' + assert result.index.name == "foo" - result = methodcaller(func, 'cols', axis=1)(df) + result = methodcaller(func, "cols", axis=1)(df) assert df.columns.name is None - assert result.columns.name == 'cols' + assert result.columns.name == "cols" def test_set_axis_name_mi(self): df = DataFrame( np.empty((3, 3)), - index=MultiIndex.from_tuples([("A", x) for x in list('aBc')]), - columns=MultiIndex.from_tuples([('C', x) for x in list('xyz')]) + index=MultiIndex.from_tuples([("A", x) for x in list("aBc")]), + columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]), ) - level_names = ['L1', 'L2'] - funcs = ['_set_axis_name', 'rename_axis'] + level_names = ["L1", "L2"] + funcs = ["_set_axis_name", "rename_axis"] for func in funcs: result = methodcaller(func, level_names)(df) assert result.index.names == level_names @@ -80,7 +85,7 @@ def test_nonzero_single_element(self): def test_get_numeric_data_preserve_dtype(self): # get the numeric data - o = DataFrame({'A': [1, '2', 3.]}) + o = DataFrame({"A": [1, "2", 3.0]}) result = o._get_numeric_data() expected = DataFrame(index=[0, 1, 2], dtype=object) self._compare(result, expected) @@ -89,17 +94,22 @@ def test_metadata_propagation_indiv(self): # groupby df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - result = df.groupby('A').sum() + { + "A": 
["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + result = df.groupby("A").sum() self.check_metadata(df, result) # resample - df = DataFrame(np.random.randn(1000, 2), - index=date_range('20130101', periods=1000, freq='s')) - result = df.resample('1T') + df = DataFrame( + np.random.randn(1000, 2), + index=date_range("20130101", periods=1000, freq="s"), + ) + result = df.resample("1T") self.check_metadata(df, result) # merging with override @@ -108,41 +118,40 @@ def test_metadata_propagation_indiv(self): _finalize = DataFrame.__finalize__ np.random.seed(10) - df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b']) - df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd']) - DataFrame._metadata = ['filename'] - df1.filename = 'fname1.csv' - df2.filename = 'fname2.csv' + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["a", "b"]) + df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["c", "d"]) + DataFrame._metadata = ["filename"] + df1.filename = "fname1.csv" + df2.filename = "fname2.csv" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'merge': + if method == "merge": left, right = other.left, other.right - value = getattr(left, name, '') + '|' + getattr(right, - name, '') + value = getattr(left, name, "") + "|" + getattr(right, name, "") object.__setattr__(self, name, value) else: - object.__setattr__(self, name, getattr(other, name, '')) + object.__setattr__(self, name, getattr(other, name, "")) return self DataFrame.__finalize__ = finalize - result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner') - assert result.filename == 'fname1.csv|fname2.csv' + result = df1.merge(df2, left_on=["a"], right_on=["c"], how="inner") + assert result.filename == "fname1.csv|fname2.csv" # concat # GH 6927 - DataFrame._metadata = ['filename'] - df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list('ab')) - df1.filename = 'foo' + DataFrame._metadata = ["filename"] + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list("ab")) + df1.filename = "foo" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'concat': - value = '+'.join([getattr( - o, name) for o in other.objs if getattr(o, name, None) - ]) + if method == "concat": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, None)) @@ -152,7 +161,7 @@ def finalize(self, other, method=None, **kwargs): DataFrame.__finalize__ = finalize result = pd.concat([df1, df1]) - assert result.filename == 'foo+foo' + assert result.filename == "foo+foo" # reset DataFrame._metadata = _metadata @@ -161,48 +170,59 @@ def finalize(self, other, method=None, **kwargs): def test_set_attribute(self): # Test for consistent setattr behavior when an attribute and a column # have the same name (Issue #8994) - df = DataFrame({'x': [1, 2, 3]}) + df = DataFrame({"x": [1, 2, 3]}) df.y = 2 - df['y'] = [2, 4, 6] + df["y"] = [2, 4, 6] df.y = 5 assert df.y == 5 - assert_series_equal(df['y'], Series([2, 4, 6], name='y')) - - @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and - LooseVersion(xarray.__version__) < - LooseVersion('0.10.0'), - reason='xarray >= 0.10.0 required') + assert_series_equal(df["y"], Series([2, 4, 6], name="y")) + + @pytest.mark.skipif( + 
not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) @pytest.mark.parametrize( - "index", ['FloatIndex', 'IntIndex', - 'StringIndex', 'UnicodeIndex', - 'DateIndex', 'PeriodIndex', - 'CategoricalIndex', 'TimedeltaIndex']) + "index", + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "CategoricalIndex", + "TimedeltaIndex", + ], + ) def test_to_xarray_index_types(self, index): from xarray import Dataset - index = getattr(tm, 'make{}'.format(index)) - df = DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', - periods=3, - tz='US/Eastern')} - ) + index = getattr(tm, "make{}".format(index)) + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) df.index = index(3) - df.index.name = 'foo' - df.columns.name = 'bar' + df.index.name = "foo" + df.columns.name = "bar" result = df.to_xarray() - assert result.dims['foo'] == 3 + assert result.dims["foo"] == 3 assert len(result.coords) == 1 assert len(result.data_vars) == 8 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, Dataset) # idempotency @@ -210,58 +230,60 @@ def test_to_xarray_index_types(self, index): # datetimes w/tz are not preserved # column names are lost expected = df.copy() - expected['f'] = expected['f'].astype(object) - expected['h'] = expected['h'].astype('datetime64[ns]') + expected["f"] = expected["f"].astype(object) + expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None - assert_frame_equal(result.to_dataframe(), expected, - check_index_type=False, check_categorical=False) + assert_frame_equal( + result.to_dataframe(), + expected, + check_index_type=False, + check_categorical=False, + ) - @td.skip_if_no('xarray', min_version='0.7.0') + @td.skip_if_no("xarray", min_version="0.7.0") def test_to_xarray(self): from xarray import Dataset - df = DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', - periods=3, - tz='US/Eastern')} - ) - - df.index.name = 'foo' + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims['foo'] == 0 + assert result.dims["foo"] == 0 assert isinstance(result, Dataset) # available in 0.7.1 # MultiIndex - df.index = pd.MultiIndex.from_product([['a'], range(3)], - names=['one', 'two']) + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) result = df.to_xarray() - 
assert result.dims['one'] == 1 - assert result.dims['two'] == 3 + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 assert len(result.coords) == 2 assert len(result.data_vars) == 8 - assert_almost_equal(list(result.coords.keys()), ['one', 'two']) + assert_almost_equal(list(result.coords.keys()), ["one", "two"]) assert isinstance(result, Dataset) result = result.to_dataframe() expected = df.copy() - expected['f'] = expected['f'].astype(object) - expected['h'] = expected['h'].astype('datetime64[ns]') + expected["f"] = expected["f"].astype(object) + expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None - assert_frame_equal(result, - expected, - check_index_type=False) + assert_frame_equal(result, expected, check_index_type=False) def test_deepcopy_empty(self): # This test covers empty frame copying with non-empty column sets # as reported in issue GH15370 - empty_frame = DataFrame(data=[], index=[], columns=['A']) + empty_frame = DataFrame(data=[], index=[], columns=["A"]) empty_frame_copy = deepcopy(empty_frame) self._compare(empty_frame_copy, empty_frame) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index e8343a1cf318b9..aef6c3fe8070c4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -15,7 +15,6 @@ class Generic: - @property def _ndim(self): return self._typ._AXIS_LEN @@ -33,7 +32,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): shape = tuple([shape] * self._ndim) if value is not None: if is_scalar(value): - if value == 'empty': + if value == "empty": arr = None # remove the info axis @@ -59,7 +58,7 @@ def _compare(self, result, expected): def test_rename(self): # single axis - idx = list('ABCD') + idx = list("ABCD") # relabeling values passed into self.rename args = [ str.lower, @@ -75,7 +74,7 @@ def test_rename(self): # rename a single axis result = obj.rename(**{axis: arg}) expected = obj.copy() - setattr(expected, axis, list('abcd')) + setattr(expected, axis, list("abcd")) self._compare(result, expected) # multiple axes at once @@ -83,8 +82,7 @@ def test_rename(self): def test_get_numeric_data(self): n = 4 - kwargs = {self._typ._AXIS_NAMES[i]: list(range(n)) - for i in range(self._ndim)} + kwargs = {self._typ._AXIS_NAMES[i]: list(range(n)) for i in range(self._ndim)} # get the numeric data o = self._construct(n, **kwargs) @@ -93,7 +91,7 @@ def test_get_numeric_data(self): # non-inclusion result = o._get_bool_data() - expected = self._construct(n, value='empty', **kwargs) + expected = self._construct(n, value="empty", **kwargs) self._compare(result, expected) # get the bool data @@ -109,7 +107,7 @@ def test_get_default(self): # GH 7725 d0 = "a", "b", "c", "d" - d1 = np.arange(4, dtype='int64') + d1 = np.arange(4, dtype="int64") others = "e", 10 for data, index in ((d0, d1), (d1, d0)): @@ -127,8 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format( - self._typ.__name__) + msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -178,24 +175,24 @@ def test_downcast(self): o = self._construct(shape=4, value=9, dtype=np.int64) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, o) - o = self._construct(shape=4, 
value=9.) + o = self._construct(shape=4, value=9.0) expected = o.astype(np.int64) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, expected) o = self._construct(shape=4, value=9.5) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, o) # are close o = self._construct(shape=4, value=9.000000000005) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") expected = o.astype(np.int64) self._compare(result, expected) @@ -206,15 +203,16 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = ("compound dtypes are not implemented in the {} constructor" - .format(self._typ.__name__)) + msg = "compound dtypes are not implemented in the {} constructor".format( + self._typ.__name__ + ) with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) # these work (though results may be unexpected) - f('int64') - f('float64') - f('M8[ns]') + f("int64") + f("float64") + f("M8[ns]") def check_metadata(self, x, y=None): for m in x._metadata: @@ -228,26 +226,26 @@ def test_metadata_propagation(self): # check that the metadata matches up on the resulting ops o = self._construct(shape=3) - o.name = 'foo' + o.name = "foo" o2 = self._construct(shape=3) - o2.name = 'bar' + o2.name = "bar" # ---------- # preserving # ---------- # simple ops with scalars - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: result = getattr(o, op)(1) self.check_metadata(o, result) # ops with like - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: result = getattr(o, op)(o) self.check_metadata(o, result) # simple boolean - for op in ['__eq__', '__le__', '__ge__']: + for op in ["__eq__", "__le__", "__ge__"]: v1 = getattr(o, op)(o) self.check_metadata(o, v1) self.check_metadata(o, v1 & v1) @@ -266,7 +264,7 @@ def test_metadata_propagation(self): self.check_metadata(result) # simple boolean - for op in ['__eq__', '__le__', '__ge__']: + for op in ["__eq__", "__le__", "__ge__"]: # this is a name matching op v1 = getattr(o, op)(o) @@ -281,9 +279,14 @@ def test_head_tail(self): o = self._construct(shape=10) # check all index types - for index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeDateIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeFloatIndex, + tm.makeIntIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: axis = o._get_axis_name(0) setattr(o, axis, index(len(getattr(o, axis)))) @@ -318,19 +321,22 @@ def test_sample(self): for test in range(10): seed = np.random.randint(0, 100) self._compare( - o.sample(n=4, random_state=seed), o.sample(n=4, - random_state=seed)) + o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) + ) self._compare( - o.sample(frac=0.7, random_state=seed), o.sample( - frac=0.7, random_state=seed)) + o.sample(frac=0.7, random_state=seed), + o.sample(frac=0.7, random_state=seed), + ) self._compare( o.sample(n=4, random_state=np.random.RandomState(test)), - o.sample(n=4, random_state=np.random.RandomState(test))) + o.sample(n=4, random_state=np.random.RandomState(test)), + ) self._compare( o.sample(frac=0.7, 
random_state=np.random.RandomState(test)), - o.sample(frac=0.7, random_state=np.random.RandomState(test))) + o.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) os1, os2 = [], [] for _ in range(2): @@ -342,7 +348,7 @@ def test_sample(self): # Check for error when random_state argument invalid. with pytest.raises(ValueError): - o.sample(random_state='astring!') + o.sample(random_state="astring!") ### # Check behavior of `frac` and `N` @@ -412,14 +418,12 @@ def test_sample(self): # Check np.nan are replaced by zeros. weights_with_nan = [np.nan] * 10 weights_with_nan[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) + self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) # Check None are also replaced by zeros. weights_with_None = [None] * 10 weights_with_None[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) def test_size_compat(self): # GH8846 @@ -436,28 +440,28 @@ def test_split_compat(self): assert len(np.array_split(o, 2)) == 2 def test_unexpected_keyword(self): # GH8597 - df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe']) + df = DataFrame(np.random.randn(5, 2), columns=["jim", "joe"]) ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) - ts = df['joe'].copy() + ts = df["joe"].copy() ts[2] = np.nan - with pytest.raises(TypeError, match='unexpected keyword'): - df.drop('joe', axis=1, in_place=True) + with pytest.raises(TypeError, match="unexpected keyword"): + df.drop("joe", axis=1, in_place=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): df.reindex([1, 0], inplace=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): ca.fillna(0, inplace=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): ts.fillna(0, in_place=True) # See gh-12301 def test_stat_unexpected_keyword(self): obj = self._construct(5) - starwars = 'Star Wars' - errmsg = 'unexpected keyword' + starwars = "Star Wars" + errmsg = "unexpected keyword" with pytest.raises(TypeError, match=errmsg): obj.max(epic=starwars) # stat_function @@ -474,7 +478,7 @@ def test_api_compat(self): # compat for __name__, __qualname__ obj = self._construct(5) - for func in ['sum', 'cumsum', 'any', 'var']: + for func in ["sum", "cumsum", "any", "var"]: f = getattr(obj, func) assert f.__name__ == func assert f.__qualname__.endswith(func) @@ -498,29 +502,30 @@ def test_truncate_out_of_bounds(self): # small shape = [int(2e3)] + ([1] * (self._ndim - 1)) - small = self._construct(shape, dtype='int8', value=1) + small = self._construct(shape, dtype="int8", value=1) self._compare(small.truncate(), small) self._compare(small.truncate(before=0, after=3e3), small) self._compare(small.truncate(before=-1, after=2e3), small) # big shape = [int(2e6)] + ([1] * (self._ndim - 1)) - big = self._construct(shape, dtype='int8', value=1) + big = self._construct(shape, dtype="int8", value=1) self._compare(big.truncate(), big) self._compare(big.truncate(before=0, after=3e6), big) self._compare(big.truncate(before=-1, after=2e6), big) def test_validate_bool_args(self): - df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: with 
pytest.raises(ValueError): - super(DataFrame, df).rename_axis(mapper={'a': 'x', 'b': 'y'}, - axis=1, inplace=value) + super(DataFrame, df).rename_axis( + mapper={"a": "x", "b": "y"}, axis=1, inplace=value + ) with pytest.raises(ValueError): - super(DataFrame, df).drop('a', axis=1, inplace=value) + super(DataFrame, df).drop("a", axis=1, inplace=value) with pytest.raises(ValueError): super(DataFrame, df).sort_index(inplace=value) @@ -532,8 +537,7 @@ def test_validate_bool_args(self): super(DataFrame, df).fillna(value=0, inplace=value) with pytest.raises(ValueError): - super(DataFrame, df).replace(to_replace=1, value=7, - inplace=value) + super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) with pytest.raises(ValueError): super(DataFrame, df).interpolate(inplace=value) @@ -548,28 +552,33 @@ def test_copy_and_deepcopy(self): # GH 15444 for shape in [0, 1, 2]: obj = self._construct(shape) - for func in [copy, - deepcopy, - lambda x: x.copy(deep=False), - lambda x: x.copy(deep=True)]: + for func in [ + copy, + deepcopy, + lambda x: x.copy(deep=False), + lambda x: x.copy(deep=True), + ]: obj_copy = func(obj) assert obj_copy is not obj self._compare(obj_copy, obj) - @pytest.mark.parametrize("periods,fill_method,limit,exp", [ - (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), - (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), - (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), - (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), - (-1, "ffill", None, [np.nan, np.nan, -.5, -.5, -.6, 0, 0, np.nan]), - (-1, "ffill", 1, [np.nan, np.nan, -.5, -.5, -.6, 0, np.nan, np.nan]), - (-1, "bfill", None, [0, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]), - (-1, "bfill", 1, [np.nan, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]) - ]) + @pytest.mark.parametrize( + "periods,fill_method,limit,exp", + [ + (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), + (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), + (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), + (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), + (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]), + (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]), + (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + ], + ) def test_pct_change(self, periods, fill_method, limit, exp): vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] obj = self._typ(vals) - func = getattr(obj, 'pct_change') + func = getattr(obj, "pct_change") res = func(periods=periods, fill_method=fill_method, limit=limit) if type(obj) is DataFrame: tm.assert_frame_equal(res, DataFrame(exp)) @@ -588,60 +597,62 @@ def test_sample(sel): easy_weight_list = [0] * 10 easy_weight_list[5] = 1 - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10, - 'easyweights': easy_weight_list}) - sample1 = df.sample(n=1, weights='easyweights') + df = pd.DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) + sample1 = df.sample(n=1, weights="easyweights") assert_frame_equal(sample1, df.iloc[5:6]) # Ensure proper error if string given as weight for Series or # DataFrame with axis = 1. 
s = Series(range(10)) with pytest.raises(ValueError): - s.sample(n=3, weights='weight_column') + s.sample(n=3, weights="weight_column") with pytest.raises(ValueError): - df.sample(n=1, weights='weight_column', axis=1) + df.sample(n=1, weights="weight_column", axis=1) # Check weighting key error with pytest.raises(KeyError): - df.sample(n=3, weights='not_a_real_column_name') + df.sample(n=3, weights="not_a_real_column_name") # Check that re-normalizes weights that don't sum to one. weights_less_than_1 = [0] * 10 weights_less_than_1[0] = 0.5 - tm.assert_frame_equal( - df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) ### # Test axis argument ### # Test axis argument - df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10}) + df = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) second_column_weight = [0, 1] assert_frame_equal( - df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) + df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] + ) # Different axis arg types - assert_frame_equal(df.sample(n=1, axis='columns', - weights=second_column_weight), - df[['col2']]) + assert_frame_equal( + df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] + ) weight = [0] * 10 weight[5] = 0.5 - assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), - df.iloc[5:6]) - assert_frame_equal(df.sample(n=1, axis='index', weights=weight), - df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis="index", weights=weight), df.iloc[5:6]) # Check out of range axis values with pytest.raises(ValueError): df.sample(n=1, axis=2) with pytest.raises(ValueError): - df.sample(n=1, axis='not_a_name') + df.sample(n=1, axis="not_a_name") with pytest.raises(ValueError): s = pd.Series(range(10)) @@ -655,21 +666,19 @@ def test_sample(sel): easy_weight_list = [0] * 3 easy_weight_list[2] = 1 - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10}) + df = pd.DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) - assert_frame_equal(sample1, df[['colString']]) + assert_frame_equal(sample1, df[["colString"]]) # Test default axes assert_frame_equal( - df.sample(n=3, random_state=42), df.sample(n=3, axis=0, - random_state=42)) + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) + ) # Test that function aligns weights with frame - df = DataFrame( - {'col1': [5, 6, 7], - 'col2': ['a', 'b', 'c'], }, index=[9, 5, 3]) + df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) s = Series([1, 0, 0], index=[3, 5, 9]) assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) @@ -689,39 +698,38 @@ def test_sample(sel): def test_squeeze(self): # noop - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: tm.assert_series_equal(s.squeeze(), s) for df in [tm.makeTimeDataFrame()]: tm.assert_frame_equal(df.squeeze(), df) # squeezing - df = tm.makeTimeDataFrame().reindex(columns=['A']) - tm.assert_series_equal(df.squeeze(), df['A']) + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(df.squeeze(), df["A"]) # don't fail with 0 length dimensions GH11229 & GH8999 - empty_series = Series([], name='five') + empty_series = 
Series([], name="five") empty_frame = DataFrame([empty_series]) - [tm.assert_series_equal(empty_series, higher_dim.squeeze()) - for higher_dim in [empty_series, empty_frame]] + [ + tm.assert_series_equal(empty_series, higher_dim.squeeze()) + for higher_dim in [empty_series, empty_frame] + ] # axis argument df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] assert df.shape == (1, 1) tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) - tm.assert_series_equal(df.squeeze(axis='index'), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0]) tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) - tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0]) + tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - msg = ("No axis named 2 for object type ") + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): df.squeeze(axis=2) - msg = ("No axis named x for object type ") + msg = "No axis named x for object type " with pytest.raises(ValueError, match=msg): - df.squeeze(axis='x') + df.squeeze(axis="x") df = tm.makeTimeDataFrame(3) tm.assert_frame_equal(df.squeeze(axis=0), df) @@ -730,12 +738,11 @@ def test_numpy_squeeze(self): s = tm.makeFloatSeries() tm.assert_series_equal(np.squeeze(s), s) - df = tm.makeTimeDataFrame().reindex(columns=['A']) - tm.assert_series_equal(np.squeeze(df), df['A']) + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(np.squeeze(df), df["A"]) def test_transpose(self): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: # calls implementation in pandas/core/base.py tm.assert_series_equal(s.transpose(), s) for df in [tm.makeTimeDataFrame()]: @@ -758,17 +765,19 @@ def test_numpy_transpose(self): def test_take(self): indices = [1, 5, -2, 6, 3, -1] - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: out = s.take(indices) - expected = Series(data=s.values.take(indices), - index=s.index.take(indices), dtype=s.dtype) + expected = Series( + data=s.values.take(indices), index=s.index.take(indices), dtype=s.dtype + ) tm.assert_series_equal(out, expected) for df in [tm.makeTimeDataFrame()]: out = df.take(indices) - expected = DataFrame(data=df.values.take(indices, axis=0), - index=df.index.take(indices), - columns=df.columns) + expected = DataFrame( + data=df.values.take(indices, axis=0), + index=df.index.take(indices), + columns=df.columns, + ) tm.assert_frame_equal(out, expected) def test_take_invalid_kwargs(self): @@ -787,7 +796,7 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - obj.take(indices, mode='clip') + obj.take(indices, mode="clip") def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) @@ -805,34 +814,32 @@ def test_equals(self): s2[0] = 9.9 assert not s1.equals(s2) - idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) s1 = Series([1, 2, np.nan], index=idx) s2 = s1.copy() assert s1.equals(s2) # Add object dtype column with nans index = np.random.random(10) - df1 = DataFrame( - np.random.random(10, ), index=index, columns=['floats']) - df1['text'] = 'the sky is so blue. 
we could use more chocolate.'.split( - ) - df1['start'] = date_range('2000-1-1', periods=10, freq='T') - df1['end'] = date_range('2000-1-1', periods=10, freq='D') - df1['diff'] = df1['end'] - df1['start'] - df1['bool'] = (np.arange(10) % 3 == 0) + df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) + df1["text"] = "the sky is so blue. we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + df1["bool"] = np.arange(10) % 3 == 0 df1.loc[::2] = np.nan df2 = df1.copy() - assert df1['text'].equals(df2['text']) - assert df1['start'].equals(df2['start']) - assert df1['end'].equals(df2['end']) - assert df1['diff'].equals(df2['diff']) - assert df1['bool'].equals(df2['bool']) + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) assert df1.equals(df2) assert not df1.equals(object) # different dtype different = df1.copy() - different['floats'] = different['floats'].astype('float32') + different["floats"] = different["floats"].astype("float32") assert not df1.equals(different) # different index @@ -846,22 +853,22 @@ def test_equals(self): assert not df1.equals(different) # DatetimeIndex - index = pd.date_range('2000-1-1', periods=10, freq='T') + index = pd.date_range("2000-1-1", periods=10, freq="T") df1 = df1.set_index(index) df2 = df1.copy() assert df1.equals(df2) # MultiIndex - df3 = df1.set_index(['text'], append=True) - df2 = df1.set_index(['text'], append=True) + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) assert df3.equals(df2) - df2 = df1.set_index(['floats'], append=True) + df2 = df1.set_index(["floats"], append=True) assert not df3.equals(df2) # NaN in index - df3 = df1.set_index(['floats'], append=True) - df2 = df1.set_index(['floats'], append=True) + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) assert df3.equals(df2) # GH 8437 @@ -880,44 +887,45 @@ def test_equals(self): assert e.equals(f) def test_pipe(self): - df = DataFrame({'A': [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: x ** y result = df.pipe(f, 2) - expected = DataFrame({'A': [1, 4, 9]}) + expected = DataFrame({"A": [1, 4, 9]}) assert_frame_equal(result, expected) result = df.A.pipe(f, 2) assert_series_equal(result, expected.A) def test_pipe_tuple(self): - df = DataFrame({'A': [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: y - result = df.pipe((f, 'y'), 0) + result = df.pipe((f, "y"), 0) assert_frame_equal(result, df) - result = df.A.pipe((f, 'y'), 0) + result = df.A.pipe((f, "y"), 0) assert_series_equal(result, df.A) def test_pipe_tuple_error(self): df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: y with pytest.raises(ValueError): - df.pipe((f, 'y'), x=1, y=0) + df.pipe((f, "y"), x=1, y=0) with pytest.raises(ValueError): - df.A.pipe((f, 'y'), x=1, y=0) + df.A.pipe((f, "y"), x=1, y=0) - @pytest.mark.parametrize('box', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) def test_axis_classmethods(self, box): obj = box() - values = (list(box._AXIS_NAMES.keys()) + - list(box._AXIS_NUMBERS.keys()) + - list(box._AXIS_ALIASES.keys())) + values = ( + list(box._AXIS_NAMES.keys()) + + list(box._AXIS_NUMBERS.keys()) + + list(box._AXIS_ALIASES.keys()) + ) for v in 
values: assert obj._get_axis_number(v) == box._get_axis_number(v) assert obj._get_axis_name(v) == box._get_axis_name(v) - assert obj._get_block_manager_axis(v) == \ - box._get_block_manager_axis(v) + assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) def test_deprecated_to_dense(self): # GH 26557: DEPR diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 1341837c466698..d3566f16ab49f9 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -10,12 +10,10 @@ @pytest.fixture def df(): """DataFrame with columns 'L1', 'L2', and 'L3' """ - return pd.DataFrame({'L1': [1, 2, 3], - 'L2': [11, 12, 13], - 'L3': ['A', 'B', 'C']}) + return pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13], "L3": ["A", "B", "C"]}) -@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +@pytest.fixture(params=[[], ["L1"], ["L1", "L2"], ["L1", "L2", "L3"]]) def df_levels(request, df): """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ levels = request.param @@ -29,9 +27,9 @@ def df_levels(request, df): @pytest.fixture def df_ambig(df): """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ - df = df.set_index(['L1', 'L2']) + df = df.set_index(["L1", "L2"]) - df['L1'] = df['L3'] + df["L1"] = df["L3"] return df @@ -39,8 +37,8 @@ def df_ambig(df): @pytest.fixture def df_duplabels(df): """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ - df = df.set_index(['L1']) - df = pd.concat([df, df['L2']], axis=1) + df = df.set_index(["L1"]) + df = pd.concat([df, df["L2"]], axis=1) return df @@ -49,8 +47,7 @@ def df_duplabels(df): # ============================= def get_labels_levels(df_levels): expected_labels = list(df_levels.columns) - expected_levels = [name for name in df_levels.index.names - if name is not None] + expected_levels = [name for name in df_levels.index.names if name is not None] return expected_labels, expected_levels @@ -76,7 +73,7 @@ def test_is_level_or_label_reference_df_simple(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -87,19 +84,19 @@ def test_is_level_or_label_reference_df_simple(df_levels, axis): def test_is_level_reference_df_ambig(df_ambig, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_ambig = df_ambig.T # df has both an on-axis level and off-axis label named L1 # Therefore L1 should reference the label, not the level - assert_label_reference(df_ambig, ['L1'], axis=axis) + assert_label_reference(df_ambig, ["L1"], axis=axis) # df has an on-axis level named L2 and it is not ambiguous # Therefore L2 is an level reference - assert_level_reference(df_ambig, ['L2'], axis=axis) + assert_level_reference(df_ambig, ["L2"], axis=axis) # df has a column named L3 and it not an level reference - assert_label_reference(df_ambig, ['L3'], axis=axis) + assert_label_reference(df_ambig, ["L3"], axis=axis) # Series @@ -107,23 +104,23 @@ def test_is_level_reference_df_ambig(df_ambig, axis): def test_is_level_reference_series_simple_axis0(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_level_reference(s, ['L1'], axis=0) - assert not s._is_level_reference('L2') + s = df.set_index("L1").L2 + assert_level_reference(s, ["L1"], axis=0) + assert not s._is_level_reference("L2") # 
Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_level_reference(s, ['L1', 'L2'], axis=0) - assert not s._is_level_reference('L3') + s = df.set_index(["L1", "L2"]).L3 + assert_level_reference(s, ["L1", "L2"], axis=0) + assert not s._is_level_reference("L3") def test_is_level_reference_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._is_level_reference('L1', axis=1) + s._is_level_reference("L1", axis=1) # Test _check_label_or_level_ambiguity_df @@ -175,17 +172,17 @@ def test_check_label_or_level_ambiguity_series(df): def test_check_label_or_level_ambiguity_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._check_label_or_level_ambiguity('L1', axis=1) + s._check_label_or_level_ambiguity("L1", axis=1) # Test _get_label_or_level_values # =============================== def assert_label_values(frame, labels, axis): for label in labels: - if axis in {0, 'index'}: + if axis in {0, "index"}: expected = frame[label]._values else: expected = frame.loc[label]._values @@ -213,7 +210,7 @@ def test_get_label_or_level_values_df_simple(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -224,36 +221,36 @@ def test_get_label_or_level_values_df_simple(df_levels, axis): def test_get_label_or_level_values_df_ambig(df_ambig, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_ambig = df_ambig.T # df has an on-axis level named L2, and it is not ambiguous. - assert_level_values(df_ambig, ['L2'], axis=axis) + assert_level_values(df_ambig, ["L2"], axis=axis) # df has an off-axis label named L3, and it is not ambiguous. 
- assert_label_values(df_ambig, ['L3'], axis=axis) + assert_label_values(df_ambig, ["L3"], axis=axis) def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_duplabels = df_duplabels.T # df has unambiguous level 'L1' - assert_level_values(df_duplabels, ['L1'], axis=axis) + assert_level_values(df_duplabels, ["L1"], axis=axis) # df has unique label 'L3' - assert_label_values(df_duplabels, ['L3'], axis=axis) + assert_label_values(df_duplabels, ["L3"], axis=axis) # df has duplicate labels 'L2' - if axis in {0, 'index'}: + if axis in {0, "index"}: expected_msg = "The column label 'L2' is not unique" else: expected_msg = "The index label 'L2' is not unique" with pytest.raises(ValueError, match=expected_msg): - assert_label_values(df_duplabels, ['L2'], axis=axis) + assert_label_values(df_duplabels, ["L2"], axis=axis) # Series @@ -261,21 +258,21 @@ def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): def test_get_label_or_level_values_series_axis0(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_level_values(s, ['L1'], axis=0) + s = df.set_index("L1").L2 + assert_level_values(s, ["L1"], axis=0) # Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_level_values(s, ['L1', 'L2'], axis=0) + s = df.set_index(["L1", "L2"]).L3 + assert_level_values(s, ["L1", "L2"], axis=0) def test_get_label_or_level_values_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._get_label_or_level_values('L1', axis=1) + s._get_label_or_level_values("L1", axis=1) # Test _drop_labels_or_levels @@ -284,7 +281,7 @@ def assert_labels_dropped(frame, labels, axis): for label in labels: df_dropped = frame._drop_labels_or_levels(label, axis=axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: assert label in frame.columns assert label not in df_dropped.columns else: @@ -296,7 +293,7 @@ def assert_levels_dropped(frame, levels, axis): for level in levels: df_dropped = frame._drop_labels_or_levels(level, axis=axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: assert level in frame.index.names assert level not in df_dropped.index.names else: @@ -312,7 +309,7 @@ def test_drop_labels_or_levels_df(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -320,7 +317,7 @@ def test_drop_labels_or_levels_df(df_levels, axis): assert_levels_dropped(df_levels, expected_levels, axis=axis) with pytest.raises(ValueError, match="not valid labels or levels"): - df_levels._drop_labels_or_levels('L4', axis=axis) + df_levels._drop_labels_or_levels("L4", axis=axis) # Series @@ -328,15 +325,15 @@ def test_drop_labels_or_levels_df(df_levels, axis): def test_drop_labels_or_levels_series(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_levels_dropped(s, ['L1'], axis=0) + s = df.set_index("L1").L2 + assert_levels_dropped(s, ["L1"], axis=0) with pytest.raises(ValueError, match="not valid labels or levels"): - s._drop_labels_or_levels('L4', axis=0) + s._drop_labels_or_levels("L4", axis=0) # Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_levels_dropped(s, ['L1', 'L2'], axis=0) + s = df.set_index(["L1", "L2"]).L3 + assert_levels_dropped(s, ["L1", 
"L2"], axis=0) with pytest.raises(ValueError, match="not valid labels or levels"): - s._drop_labels_or_levels('L4', axis=0) + s._drop_labels_or_levels("L4", axis=0) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index e29622cb8ac171..5c3c35832356e9 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -15,6 +15,7 @@ try: import xarray + _XARRAY_INSTALLED = True except ImportError: _XARRAY_INSTALLED = False @@ -26,43 +27,46 @@ class TestSeries(Generic): def setup_method(self): self.ts = tm.makeTimeSeries() # Was at top level in test_series - self.ts.name = 'ts' + self.ts.name = "ts" self.series = tm.makeStringSeries() - self.series.name = 'series' + self.series.name = "series" def test_rename_mi(self): - s = Series([11, 21, 31], - index=MultiIndex.from_tuples( - [("A", x) for x in ["a", "B", "c"]])) + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) s.rename(str.lower) def test_set_axis_name(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - funcs = ['rename_axis', '_set_axis_name'] - name = 'foo' + s = Series([1, 2, 3], index=["a", "b", "c"]) + funcs = ["rename_axis", "_set_axis_name"] + name = "foo" for func in funcs: result = methodcaller(func, name)(s) assert s.index.name is None assert result.index.name == name def test_set_axis_name_mi(self): - s = Series([11, 21, 31], index=MultiIndex.from_tuples( - [("A", x) for x in ["a", "B", "c"]], - names=['l1', 'l2']) + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples( + [("A", x) for x in ["a", "B", "c"]], names=["l1", "l2"] + ), ) - funcs = ['rename_axis', '_set_axis_name'] + funcs = ["rename_axis", "_set_axis_name"] for func in funcs: - result = methodcaller(func, ['L1', 'L2'])(s) + result = methodcaller(func, ["L1", "L2"])(s) assert s.index.name is None - assert s.index.names == ['l1', 'l2'] + assert s.index.names == ["l1", "l2"] assert result.index.name is None - assert result.index.names, ['L1', 'L2'] + assert result.index.names, ["L1", "L2"] def test_set_axis_name_raises(self): s = pd.Series([1]) with pytest.raises(ValueError): - s._set_axis_name(name='a', axis=1) + s._set_axis_name(name="a", axis=1) def test_get_numeric_data_preserve_dtype(self): @@ -71,7 +75,7 @@ def test_get_numeric_data_preserve_dtype(self): result = o._get_numeric_data() self._compare(result, o) - o = Series([1, '2', 3.]) + o = Series([1, "2", 3.0]) result = o._get_numeric_data() expected = Series([], dtype=object, index=pd.Index([], dtype=object)) self._compare(result, expected) @@ -84,9 +88,9 @@ def test_get_numeric_data_preserve_dtype(self): result = o._get_bool_data() self._compare(result, o) - o = Series(date_range('20130101', periods=3)) + o = Series(date_range("20130101", periods=3)) result = o._get_numeric_data() - expected = Series([], dtype='M8[ns]', index=pd.Index([], dtype=object)) + expected = Series([], dtype="M8[ns]", index=pd.Index([], dtype=object)) self._compare(result, expected) def test_nonzero_single_element(self): @@ -100,8 +104,7 @@ def test_nonzero_single_element(self): msg = "The truth value of a Series is ambiguous" # single item nan to raise - for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), - Series([False])]: + for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: with pytest.raises(ValueError, match=msg): bool(s) @@ -119,7 +122,7 @@ def test_nonzero_single_element(self): s.bool() # single non-bool are an error - for s in [Series([1]), Series([0]), 
Series(['a']), Series([0.0])]: + for s in [Series([1]), Series([0]), Series(["a"]), Series([0.0])]: msg = "The truth value of a Series is ambiguous" with pytest.raises(ValueError, match=msg): bool(s) @@ -131,38 +134,40 @@ def test_metadata_propagation_indiv(self): # check that the metadata matches up on the resulting ops o = Series(range(3), range(3)) - o.name = 'foo' + o.name = "foo" o2 = Series(range(3), range(3)) - o2.name = 'bar' + o2.name = "bar" result = o.T self.check_metadata(o, result) # resample - ts = Series(np.random.rand(1000), - index=date_range('20130101', periods=1000, freq='s'), - name='foo') - result = ts.resample('1T').mean() + ts = Series( + np.random.rand(1000), + index=date_range("20130101", periods=1000, freq="s"), + name="foo", + ) + result = ts.resample("1T").mean() self.check_metadata(ts, result) - result = ts.resample('1T').min() + result = ts.resample("1T").min() self.check_metadata(ts, result) - result = ts.resample('1T').apply(lambda x: x.sum()) + result = ts.resample("1T").apply(lambda x: x.sum()) self.check_metadata(ts, result) _metadata = Series._metadata _finalize = Series.__finalize__ - Series._metadata = ['name', 'filename'] - o.filename = 'foo' - o2.filename = 'bar' + Series._metadata = ["name", "filename"] + o.filename = "foo" + o2.filename = "bar" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'concat' and name == 'filename': - value = '+'.join([getattr( - o, name) for o in other.objs if getattr(o, name, None) - ]) + if method == "concat" and name == "filename": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, None)) @@ -172,60 +177,70 @@ def finalize(self, other, method=None, **kwargs): Series.__finalize__ = finalize result = pd.concat([o, o2]) - assert result.filename == 'foo+bar' + assert result.filename == "foo+bar" assert result.name is None # reset Series._metadata = _metadata Series.__finalize__ = _finalize - @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and - LooseVersion(xarray.__version__) < - LooseVersion('0.10.0'), - reason='xarray >= 0.10.0 required') + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) @pytest.mark.parametrize( "index", - ['FloatIndex', 'IntIndex', - 'StringIndex', 'UnicodeIndex', - 'DateIndex', 'PeriodIndex', - 'TimedeltaIndex', 'CategoricalIndex']) + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "TimedeltaIndex", + "CategoricalIndex", + ], + ) def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, 'make{}'.format(index)) + index = getattr(tm, "make{}".format(index)) s = Series(range(6), index=index(6)) - s.index.name = 'foo' + s.index.name = "foo" result = s.to_xarray() repr(result) assert len(result) == 6 assert len(result.coords) == 1 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, DataArray) # idempotency - assert_series_equal(result.to_series(), s, - check_index_type=False, - check_categorical=True) + assert_series_equal( + result.to_series(), s, check_index_type=False, check_categorical=True + ) - @td.skip_if_no('xarray', min_version='0.7.0') + @td.skip_if_no("xarray", min_version="0.7.0") def 
test_to_xarray(self): from xarray import DataArray s = Series([]) - s.index.name = 'foo' + s.index.name = "foo" result = s.to_xarray() assert len(result) == 0 assert len(result.coords) == 1 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, DataArray) s = Series(range(6)) - s.index.name = 'foo' - s.index = pd.MultiIndex.from_product([['a', 'b'], range(3)], - names=['one', 'two']) + s.index.name = "foo" + s.index = pd.MultiIndex.from_product( + [["a", "b"], range(3)], names=["one", "two"] + ) result = s.to_xarray() assert len(result) == 2 - assert_almost_equal(list(result.coords.keys()), ['one', 'two']) + assert_almost_equal(list(result.coords.keys()), ["one", "two"]) assert isinstance(result, DataArray) assert_series_equal(result.to_series(), s) @@ -234,20 +249,20 @@ def test_valid_deprecated(self): with tm.assert_produces_warning(FutureWarning): pd.Series([]).valid() - @pytest.mark.parametrize("s", [ - Series([np.arange(5)]), - pd.date_range('1/1/2011', periods=24, freq='H'), - pd.Series(range(5), index=pd.date_range("2017", periods=5)) - ]) + @pytest.mark.parametrize( + "s", + [ + Series([np.arange(5)]), + pd.date_range("1/1/2011", periods=24, freq="H"), + pd.Series(range(5), index=pd.date_range("2017", periods=5)), + ], + ) @pytest.mark.parametrize("shift_size", [0, 1, 2]) def test_shift_always_copy(self, s, shift_size): # GH22397 assert s.shift(shift_size) is not s - @pytest.mark.parametrize("move_by_freq", [ - pd.Timedelta('1D'), - pd.Timedelta('1M'), - ]) + @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) def test_datetime_shift_always_copy(self, move_by_freq): # GH22397 s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ea59cde54f17bd..52d4fa76bf8794 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -23,7 +23,7 @@ def test_agg_regression1(tsframe): def test_agg_must_agg(df): - grouped = df.groupby('A')['C'] + grouped = df.groupby("A")["C"] msg = "Must produce aggregated value" with pytest.raises(Exception, match=msg): @@ -38,33 +38,57 @@ def test_agg_ser_multi_key(df): f = lambda x: x.sum() results = df.C.groupby([df.A, df.B]).aggregate(f) - expected = df.groupby(['A', 'B']).sum()['C'] + expected = df.groupby(["A", "B"]).sum()["C"] tm.assert_series_equal(results, expected) def test_groupby_aggregation_mixed_dtype(): # GH 6212 - expected = DataFrame({ - 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], - 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, - index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), - ('big', 'damp'), - ('blue', 'dry'), - ('red', 'red'), ('red', 'wet')], - names=['by1', 'by2'])) - - df = DataFrame({ - 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, - 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, - np.nan, np.nan] - }) - - g = df.groupby(['by1', 'by2']) - result = g[['v1', 'v2']].mean() + expected = DataFrame( + { + "v1": [5, 5, 7, np.nan, 3, 3, 4, 1], + "v2": [55, 55, 77, np.nan, 33, 33, 44, 11], + }, + index=MultiIndex.from_tuples( + [ + (1, 95), + (1, 99), + (2, 95), + (2, 99), + ("big", "damp"), + ("blue", "dry"), + ("red", "red"), + ("red", "wet"), + ], + names=["by1", "by2"], + ), 
+ ) + + df = DataFrame( + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) + + g = df.groupby(["by1", "by2"]) + result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) @@ -74,21 +98,19 @@ def test_agg_apply_corner(ts, tsframe): assert ts.dtype == np.float64 # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, - index=pd.Index([], dtype=np.float64)) + exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) tm.assert_series_equal(grouped.sum(), exp) tm.assert_series_equal(grouped.agg(np.sum), exp) - tm.assert_series_equal(grouped.apply(np.sum), exp, - check_index_type=False) + tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame - grouped = tsframe.groupby(tsframe['A'] * np.nan) - exp_df = DataFrame(columns=tsframe.columns, dtype=float, - index=pd.Index([], dtype=np.float64)) + grouped = tsframe.groupby(tsframe["A"] * np.nan) + exp_df = DataFrame( + columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64) + ) tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) def test_agg_grouping_is_list_tuple(ts): @@ -110,60 +132,66 @@ def test_agg_grouping_is_list_tuple(ts): def test_agg_python_multiindex(mframe): - grouped = mframe.groupby(['A', 'B']) + grouped = mframe.groupby(["A", "B"]) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('groupbyfunc', [ - lambda x: x.weekday(), - [lambda x: x.month, lambda x: x.weekday()], -]) +@pytest.mark.parametrize( + "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]] +) def test_aggregate_str_func(tsframe, groupbyfunc): grouped = tsframe.groupby(groupbyfunc) # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() + result = grouped["A"].agg("std") + expected = grouped["A"].std() tm.assert_series_equal(result, expected) # group frame by function name - result = grouped.aggregate('var') + result = grouped.aggregate("var") expected = grouped.var() tm.assert_frame_equal(result, expected) # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], - ['B', 'std'], - ['C', 'mean'], - ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var()], - ['B', grouped['B'].std()], - ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) + result = grouped.agg( + OrderedDict([["A", "var"], ["B", "std"], ["C", "mean"], ["D", "sem"]]) + ) + expected = DataFrame( + OrderedDict( + [ + ["A", grouped["A"].var()], + ["B", grouped["B"].std()], + ["C", grouped["C"].mean()], + ["D", grouped["D"].sem()], + ] + ) + ) tm.assert_frame_equal(result, expected) def test_aggregate_item_by_item(df): - grouped = df.groupby('A') + grouped = df.groupby("A") aggfun = lambda ser: ser.size result = grouped.agg(aggfun) - foo = (df.A == 'foo').sum() - bar = (df.A == 'bar').sum() + foo = (df.A == "foo").sum() + bar = (df.A == "bar").sum() K = len(result.columns) # GH5782 # odd comparisons can result here, so cast 
to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) + exp = pd.Series( + np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo" + ) + tm.assert_series_equal(result.xs("foo"), exp) - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) + exp = pd.Series( + np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar" + ) + tm.assert_almost_equal(result.xs("bar"), exp) def aggfun(ser): return ser.size @@ -174,7 +202,7 @@ def aggfun(ser): def test_wrap_agg_out(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) def func(ser): if ser.dtype == np.object: @@ -183,53 +211,53 @@ def func(ser): return ser.sum() result = grouped.aggregate(func) - exp_grouped = three_group.loc[:, three_group.columns != 'C'] - expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + exp_grouped = three_group.loc[:, three_group.columns != "C"] + expected = exp_grouped.groupby(["A", "B"]).aggregate(func) tm.assert_frame_equal(result, expected) def test_agg_multiple_functions_maintain_order(df): # GH #610 - funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] - result = df.groupby('A')['C'].agg(funcs) - exp_cols = Index(['mean', 'max', 'min']) + funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)] + result = df.groupby("A")["C"].agg(funcs) + exp_cols = Index(["mean", "max", "min"]) tm.assert_index_equal(result.columns, exp_cols) def test_multiple_functions_tuples_and_non_tuples(df): # #1359 - funcs = [('foo', 'mean'), 'std'] - ex_funcs = [('foo', 'mean'), ('std', 'std')] + funcs = [("foo", "mean"), "std"] + ex_funcs = [("foo", "mean"), ("std", "std")] - result = df.groupby('A')['C'].agg(funcs) - expected = df.groupby('A')['C'].agg(ex_funcs) + result = df.groupby("A")["C"].agg(funcs) + expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) - result = df.groupby('A').agg(funcs) - expected = df.groupby('A').agg(ex_funcs) + result = df.groupby("A").agg(funcs) + expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) def test_more_flexible_frame_multi_function(df): - grouped = df.groupby('A') + grouped = df.groupby("A") - exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) - exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) + exmean = grouped.agg(OrderedDict([["C", np.mean], ["D", np.mean]])) + exstd = grouped.agg(OrderedDict([["C", np.std], ["D", np.std]])) - expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + d = OrderedDict([["C", [np.mean, np.std]], ["D", [np.mean, np.std]]]) result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # be careful - result = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - expected = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) + result = grouped.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])) + expected = grouped.aggregate( + OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]]) + ) tm.assert_frame_equal(result, expected) def foo(x): @@ -240,12 +268,12 @@ def bar(x): # this uses column selection & renaming with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - d = OrderedDict([['C', np.mean], - ['D', OrderedDict([['foo', np.mean], - ['bar', np.std]])]]) + d = OrderedDict( + [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]] + ) result = grouped.aggregate(d) - d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]) expected = grouped.aggregate(d) tm.assert_frame_equal(result, expected) @@ -253,26 +281,29 @@ def bar(x): def test_multi_function_flexible_mix(df): # GH #1268 - grouped = df.groupby('A') + grouped = df.groupby("A") # Expected - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', {'sum': 'sum'}]]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = grouped.aggregate(d) # Test 1 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', 'sum']]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # Test 2 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', ['sum']]]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) @@ -281,65 +312,63 @@ def test_multi_function_flexible_mix(df): def test_groupby_agg_coercing_bools(): # issue 14873 - dat = pd.DataFrame( - {'a': [1, 1, 2, 2], 'b': [0, 1, 2, 3], 'c': [None, None, 1, 1]}) - gp = dat.groupby('a') + dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) + gp = dat.groupby("a") - index = Index([1, 2], name='a') + index = Index([1, 2], name="a") - result = gp['b'].aggregate(lambda x: (x != 0).all()) - expected = Series([False, True], index=index, name='b') + result = gp["b"].aggregate(lambda x: (x != 0).all()) + expected = Series([False, True], index=index, name="b") tm.assert_series_equal(result, expected) - result = gp['c'].aggregate(lambda x: x.isnull().all()) - expected = Series([True, False], index=index, name='c') + result = gp["c"].aggregate(lambda x: x.isnull().all()) + expected = Series([True, False], index=index, name="c") tm.assert_series_equal(result, expected) def test_order_aggregate_multiple_funcs(): # GH 25692 - df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby('A').agg(['sum', 'max', 'mean', 'ohlc', 'min']) + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) result = res.columns.levels[1] - expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min']) + expected = pd.Index(["sum", "max", "mean", "ohlc", "min"]) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('dtype', [np.int64, np.uint64]) -@pytest.mark.parametrize('how', ['first', 'last', 'min', - 'max', 'mean', 'median']) +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) def test_uint64_type_handling(dtype, how): # GH 26310 - df = pd.DataFrame({'x': 6903052872240755750, 'y': [1, 2]}) - 
expected = df.groupby('y').agg({'x': how}) + df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]}) + expected = df.groupby("y").agg({"x": how}) df.x = df.x.astype(dtype) - result = df.groupby('y').agg({'x': how}) + result = df.groupby("y").agg({"x": how}) result.x = result.x.astype(np.int64) tm.assert_frame_equal(result, expected, check_exact=True) class TestNamedAggregationSeries: - def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) gr = df.groupby([0, 0, 1, 1]) - result = gr.agg(a='sum', b='min') - expected = pd.DataFrame({'a': [3, 7], 'b': [1, 3]}, - columns=['a', 'b'], index=[0, 1]) + result = gr.agg(a="sum", b="min") + expected = pd.DataFrame( + {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1] + ) tm.assert_frame_equal(result, expected) - result = gr.agg(b='min', a='sum') + result = gr.agg(b="min", a="sum") # sort for 35 and earlier if compat.PY36: - expected = expected[['b', 'a']] + expected = expected[["b", "a"]] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): gr = pd.Series([1, 2]).groupby([0, 1]) - with pytest.raises(TypeError, match='Must provide'): + with pytest.raises(TypeError, match="Must provide"): gr.agg() # but we do allow this @@ -352,61 +381,63 @@ def test_series_named_agg_duplicates_raises(self): # aggregate_multiple_funcs. It could maybe be lifted in the future. gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) with pytest.raises(SpecificationError): - gr.agg(a='sum', b='sum') + gr.agg(a="sum", b="sum") def test_mangled(self): gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) result = gr.agg(a=lambda x: 0, b=lambda x: 1) - expected = pd.DataFrame({'a': [0, 0], 'b': [1, 1]}) + expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]}) tm.assert_frame_equal(result, expected) class TestNamedAggregationDataFrame: def test_agg_relabel(self): - df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) - result = df.groupby("group").agg( - a_max=("A", "max"), - b_max=("B", "max"), + df = pd.DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max")) + expected = pd.DataFrame( + {"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(["a", "b"], name="group"), + columns=["a_max", "b_max"], ) - expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]}, - index=pd.Index(['a', 'b'], name='group'), - columns=['a_max', 'b_max']) tm.assert_frame_equal(result, expected) # order invariance p98 = functools.partial(np.percentile, q=98) - result = df.groupby('group').agg( + result = df.groupby("group").agg( b_min=("B", "min"), a_min=("A", min), a_mean=("A", np.mean), a_max=("A", "max"), b_max=("B", "max"), - a_98=("A", p98) + a_98=("A", p98), + ) + expected = pd.DataFrame( + { + "b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98], + }, + index=pd.Index(["a", "b"], name="group"), + columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) - expected = pd.DataFrame({"b_min": [5, 7], - "a_min": [0, 2], - "a_mean": [0.5, 2.5], - "a_max": [1, 3], - "b_max": [6, 8], - "a_98": [0.98, 2.98]}, - index=pd.Index(['a', 'b'], name='group'), - columns=['b_min', 'a_min', 'a_mean', - 'a_max', 'b_max', 'a_98']) if not compat.PY36: - expected = expected[['a_98', 'a_max', 'a_mean', - 'a_min', 'b_max', 'b_min']] + expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]] tm.assert_frame_equal(result, expected) def 
test_agg_relabel_non_identifier(self):
-        df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
-                           "A": [0, 1, 2, 3],
-                           "B": [5, 6, 7, 8]})
+        df = pd.DataFrame(
+            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
+        )
 
-        result = df.groupby("group").agg(**{'my col': ('A', 'max')})
-        expected = pd.DataFrame({'my col': [1, 3]},
-                                index=pd.Index(['a', 'b'], name='group'))
+        result = df.groupby("group").agg(**{"my col": ("A", "max")})
+        expected = pd.DataFrame(
+            {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group")
+        )
         tm.assert_frame_equal(result, expected)
 
     def test_duplicate_raises(self):
@@ -417,22 +448,22 @@ def test_duplicate_raises(self):
             df.groupby("A").agg(a=("A", "min"), b=("A", "min"))
 
     def test_agg_relabel_with_level(self):
-        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
-                          index=pd.MultiIndex.from_product([['A', 'B'],
-                                                            ['a', 'b']]))
-        result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'),
-                                         cc=('B', 'mean'))
-        expected = pd.DataFrame({
-            'aa': [0, 1],
-            'bb': [0, 1],
-            'cc': [1.5, 3.5]
-        }, index=['A', 'B'])
+        df = pd.DataFrame(
+            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
+            index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]),
+        )
+        result = df.groupby(level=0).agg(
+            aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
+        )
+        expected = pd.DataFrame(
+            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
+        )
         tm.assert_frame_equal(result, expected)
 
     def test_agg_relabel_other_raises(self):
         df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
         grouped = df.groupby("A")
-        match = 'Must provide'
+        match = "Must provide"
         with pytest.raises(TypeError, match=match):
             grouped.agg(foo=1)
 
@@ -440,74 +471,65 @@ def test_agg_relabel_other_raises(self):
             grouped.agg()
 
         with pytest.raises(TypeError, match=match):
-            grouped.agg(a=('B', 'max'), b=(1, 2, 3))
+            grouped.agg(a=("B", "max"), b=(1, 2, 3))
 
     def test_missing_raises(self):
         df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
 
         with pytest.raises(KeyError, match="Column 'C' does not exist"):
-            df.groupby("A").agg(c=('C', 'sum'))
+            df.groupby("A").agg(c=("C", "sum"))
 
     def test_agg_namedtuple(self):
         df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
         result = df.groupby("A").agg(
-            b=pd.NamedAgg("B", "sum"),
-            c=pd.NamedAgg(column="B", aggfunc="count")
+            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
         )
-        expected = df.groupby("A").agg(b=("B", "sum"),
-                                       c=("B", "count"))
+        expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
         tm.assert_frame_equal(result, expected)
 
     def test_mangled(self):
         df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
-        result = df.groupby("A").agg(
-            b=("B", lambda x: 0),
-            c=("C", lambda x: 1)
+        result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
+        expected = pd.DataFrame(
+            {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A")
         )
-        expected = pd.DataFrame({"b": [0, 0], "c": [1, 1]},
-                                index=pd.Index([0, 1], name='A'))
         tm.assert_frame_equal(result, expected)
 
 
 class TestLambdaMangling:
-
     def test_maybe_mangle_lambdas_passthrough(self):
-        assert _maybe_mangle_lambdas('mean') == 'mean'
-        assert _maybe_mangle_lambdas(lambda x: x).__name__ == '<lambda>'
+        assert _maybe_mangle_lambdas("mean") == "mean"
+        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
 
         # don't mangel single lambda.
-        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == '<lambda>'
+        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"
 
     def test_maybe_mangle_lambdas_listlike(self):
         aggfuncs = [lambda x: 1, lambda x: 2]
         result = _maybe_mangle_lambdas(aggfuncs)
-        assert result[0].__name__ == '<lambda_0>'
-        assert result[1].__name__ == '<lambda_1>'
+        assert result[0].__name__ == "<lambda_0>"
+        assert result[1].__name__ == "<lambda_1>"
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)
 
     def test_maybe_mangle_lambdas(self):
-        func = {
-            'A': [lambda x: 0, lambda x: 1]
-        }
+        func = {"A": [lambda x: 0, lambda x: 1]}
 
         result = _maybe_mangle_lambdas(func)
-        assert result['A'][0].__name__ == '<lambda_0>'
-        assert result['A'][1].__name__ == '<lambda_1>'
+        assert result["A"][0].__name__ == "<lambda_0>"
+        assert result["A"][1].__name__ == "<lambda_1>"
 
     def test_maybe_mangle_lambdas_args(self):
-        func = {
-            'A': [lambda x, a, b=1: (0, a, b), lambda x: 1]
-        }
+        func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
 
         result = _maybe_mangle_lambdas(func)
-        assert result['A'][0].__name__ == '<lambda_0>'
-        assert result['A'][1].__name__ == '<lambda_1>'
+        assert result["A"][0].__name__ == "<lambda_0>"
+        assert result["A"][1].__name__ == "<lambda_1>"
 
-        assert func['A'][0](0, 1) == (0, 1, 1)
-        assert func['A'][0](0, 1, 2) == (0, 1, 2)
-        assert func['A'][0](0, 2, b=3) == (0, 2, 3)
+        assert func["A"][0](0, 1) == (0, 1, 1)
+        assert func["A"][0](0, 1, 2) == (0, 1, 2)
+        assert func["A"][0](0, 2, b=3) == (0, 2, 3)
 
     def test_maybe_mangle_lambdas_named(self):
-        func = OrderedDict([('C', np.mean),
-                            ('D', OrderedDict([('foo', np.mean),
-                                               ('bar', np.mean)]))])
+        func = OrderedDict(
+            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))]
+        )
         result = _maybe_mangle_lambdas(func)
         assert result == func
 
@@ -515,15 +537,16 @@ def test_basic(self):
         df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
 
         result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
-        expected = pd.DataFrame({("B", "<lambda_0>"): [0, 0],
-                                 ("B", "<lambda_1>"): [1, 1]},
-                                index=pd.Index([0, 1], name='A'))
+        expected = pd.DataFrame(
+            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
+            index=pd.Index([0, 1], name="A"),
+        )
         tm.assert_frame_equal(result, expected)
 
     def test_mangle_series_groupby(self):
         gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
         result = gr.agg([lambda x: 0, lambda x: 1])
-        expected = pd.DataFrame({'<lambda_0>': [0, 0], '<lambda_1>': [1, 1]})
+        expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]})
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.xfail(reason="GH-26611.
kwargs for multi-agg.")
@@ -531,9 +554,9 @@ def test_with_kwargs(self):
         f1 = lambda x, y, b=1: x.sum() + y + b
         f2 = lambda x, y, b=2: x.sum() + y * b
         result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
-        expected = pd.DataFrame({'<lambda_0>': [4], '<lambda_1>': [6]})
+        expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
         tm.assert_frame_equal(result, expected)
 
         result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
-        expected = pd.DataFrame({'<lambda_0>': [13], '<lambda_1>': [30]})
+        expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index c2f98b11bb33e3..5d50c044cf9f55 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -6,121 +6,135 @@
 import pytest
 
 import pandas as pd
-from pandas import (
-    DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range)
+from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
 from pandas.core.groupby.groupby import DataError
 import pandas.util.testing as tm
 
 
-@pytest.mark.parametrize('op_name', [
-    'count',
-    'sum',
-    'std',
-    'var',
-    'sem',
-    'mean',
-    pytest.param('median',
-                 # ignore mean of empty slice
-                 # and all-NaN
-                 marks=[pytest.mark.filterwarnings(
-                     "ignore::RuntimeWarning"
-                 )]),
-    'prod',
-    'min',
-    'max',
-])
+@pytest.mark.parametrize(
+    "op_name",
+    [
+        "count",
+        "sum",
+        "std",
+        "var",
+        "sem",
+        "mean",
+        pytest.param(
+            "median",
+            # ignore mean of empty slice
+            # and all-NaN
+            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
+        ),
+        "prod",
+        "min",
+        "max",
+    ],
+)
 def test_cythonized_aggers(op_name):
-    data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
-            'B': ['A', 'B'] * 6,
-            'C': np.random.randn(12)}
+    data = {
+        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
+        "B": ["A", "B"] * 6,
+        "C": np.random.randn(12),
+    }
     df = DataFrame(data)
-    df.loc[2:10:2, 'C'] = np.nan
+    df.loc[2:10:2, "C"] = np.nan
 
     op = lambda x: getattr(x, op_name)()
 
     # single column
-    grouped = df.drop(['B'], axis=1).groupby('A')
-    exp = {cat: op(group['C']) for cat, group in grouped}
-    exp = DataFrame({'C': exp})
-    exp.index.name = 'A'
+    grouped = df.drop(["B"], axis=1).groupby("A")
+    exp = {cat: op(group["C"]) for cat, group in grouped}
+    exp = DataFrame({"C": exp})
+    exp.index.name = "A"
     result = op(grouped)
     tm.assert_frame_equal(result, exp)
 
     # multiple columns
-    grouped = df.groupby(['A', 'B'])
+    grouped = df.groupby(["A", "B"])
     expd = {}
     for (cat1, cat2), group in grouped:
-        expd.setdefault(cat1, {})[cat2] = op(group['C'])
+        expd.setdefault(cat1, {})[cat2] = op(group["C"])
     exp = DataFrame(expd).T.stack(dropna=False)
-    exp.index.names = ['A', 'B']
-    exp.name = 'C'
+    exp.index.names = ["A", "B"]
+    exp.name = "C"
 
-    result = op(grouped)['C']
-    if op_name in ['sum', 'prod']:
+    result = op(grouped)["C"]
+    if op_name in ["sum", "prod"]:
         tm.assert_series_equal(result, exp)
 
 
 def test_cython_agg_boolean():
-    frame = DataFrame({'a': np.random.randint(0, 5, 50),
-                       'b': np.random.randint(0, 2, 50).astype('bool')})
-    result = frame.groupby('a')['b'].mean()
-    expected = frame.groupby('a')['b'].agg(np.mean)
+    frame = DataFrame(
+        {
+            "a": np.random.randint(0, 5, 50),
+            "b": np.random.randint(0, 2, 50).astype("bool"),
+        }
+    )
+    result = frame.groupby("a")["b"].mean()
+    expected = frame.groupby("a")["b"].agg(np.mean)
 
     tm.assert_series_equal(result, expected)
 
 
 def test_cython_agg_nothing_to_agg():
-    frame = DataFrame({'a': np.random.randint(0, 5, 50),
- 'b': ['foo', 'bar'] * 25}) + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - frame.groupby('a')['b'].mean() + frame.groupby("a")["b"].mean() - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) with pytest.raises(DataError, match=msg): - frame[['b']].groupby(frame['a']).mean() + frame[["b"]].groupby(frame["a"]).mean() def test_cython_agg_nothing_to_agg_with_dates(): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, freq='T')}) + frame = DataFrame( + { + "a": np.random.randint(0, 5, 50), + "b": ["foo", "bar"] * 25, + "dates": pd.date_range("now", periods=50, freq="T"), + } + ) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - frame.groupby('b').dates.mean() + frame.groupby("b").dates.mean() def test_cython_agg_frame_columns(): # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() def test_cython_agg_return_dict(): # GH 16741 df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict()) - expected = Series([{'two': 1, 'one': 1, 'three': 1}, - {'two': 2, 'one': 2, 'three': 1}], - index=Index(['bar', 'foo'], name='A'), - name='B') + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + + ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) + expected = Series( + [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], + index=Index(["bar", "foo"], name="A"), + name="B", + ) tm.assert_series_equal(ts, expected) def test_cython_fail_agg(): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + dr = bdate_range("1/1/2000", periods=50) + ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() @@ -128,17 +142,20 @@ def test_cython_fail_agg(): tm.assert_series_equal(summed, expected) -@pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), -]) +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", np.median), + ("var", np.var), + ("add", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ], +) def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) @@ -148,12 +165,15 @@ def 
test__cython_agg_general(op, targop): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('min', np.min), - ('max', np.max), ] +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), + ("var", lambda x: np.var(x, ddof=1)), + ("min", np.min), + ("max", np.max), + ], ) def test_cython_agg_empty_buckets(op, targop, observed): df = pd.DataFrame([11, 12, 13]) @@ -172,43 +192,47 @@ def test_cython_agg_empty_buckets(op, targop, observed): def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these - df = pd.DataFrame([11, 12, 13], columns=['a']) + df = pd.DataFrame([11, 12, 13], columns=["a"]) grps = range(0, 25, 5) # add / sum - result = df.groupby(pd.cut(df['a'], grps), - observed=observed)._cython_agg_general('add') + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "add" + ) intervals = pd.interval_range(0, 20, freq=5) expected = pd.DataFrame( {"a": [0, 0, 36, 0]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) if observed: expected = expected[expected.a != 0] tm.assert_frame_equal(result, expected) # prod - result = df.groupby(pd.cut(df['a'], grps), - observed=observed)._cython_agg_general('prod') + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "prod" + ) expected = pd.DataFrame( {"a": [1, 1, 1716, 1]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) if observed: expected = expected[expected.a != 1] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min']) -@pytest.mark.parametrize('data', [ - Timestamp('2016-10-14 21:00:44.557'), - Timedelta('17088 days 21:00:44.557'), ]) +@pytest.mark.parametrize("op", ["first", "last", "max", "min"]) +@pytest.mark.parametrize( + "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] +) def test_cython_with_timestamp_and_nat(op, data): # https://github.com/pandas-dev/pandas/issues/19526 - df = DataFrame({'a': [0, 1], 'b': [data, NaT]}) - index = Index([0, 1], name='a') + df = DataFrame({"a": [0, 1], "b": [data, NaT]}) + index = Index([0, 1], name="a") # We will group by a and test the cython aggregations - expected = DataFrame({'b': [data, NaT]}, index=index) + expected = DataFrame({"b": [data, NaT]}, index=index) - result = df.groupby('a').aggregate(op) + result = df.groupby("a").aggregate(op) tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 903ffa23173cbe..7905575a4a1a8e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -11,8 +11,14 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, PeriodIndex, Series, date_range, - period_range) + DataFrame, + Index, + MultiIndex, + PeriodIndex, + Series, + date_range, + period_range, +) from pandas.core.groupby.groupby import SpecificationError import pandas.util.testing as tm @@ -24,60 +30,73 @@ def test_agg_api(): # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different 
api for agg when passed custom function with mixed frame - df = DataFrame({'data1': np.random.randn(5), - 'data2': np.random.randn(5), - 'key1': ['a', 'a', 'b', 'b', 'a'], - 'key2': ['one', 'two', 'one', 'two', 'one']}) - grouped = df.groupby('key1') + df = DataFrame( + { + "data1": np.random.randn(5), + "data2": np.random.randn(5), + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + } + ) + grouped = df.groupby("key1") def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) - expected.columns = ['data1', 'data2'] + expected.columns = ["data1", "data2"] result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) def test_agg_datetimes_mixed(): - data = [[1, '2012-01-01', 1.0], - [2, '2012-01-02', 2.0], - [3, None, 3.0]] - - df1 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - data = [[row[0], - (dt.datetime.strptime(row[1], '%Y-%m-%d').date() - if row[1] else None), - row[2]] - for row in data] - - df2 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - df1['weights'] = df1['value'] / df1['value'].sum() - gb1 = df1.groupby('date').aggregate(np.sum) - - df2['weights'] = df1['value'] / df1['value'].sum() - gb2 = df2.groupby('date').aggregate(np.sum) - - assert (len(gb1) == len(gb2)) + data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]] + + df1 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + data = [ + [ + row[0], + (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None), + row[2], + ] + for row in data + ] + + df2 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + df1["weights"] = df1["value"] / df1["value"].sum() + gb1 = df1.groupby("date").aggregate(np.sum) + + df2["weights"] = df1["value"] / df1["value"].sum() + gb2 = df2.groupby("date").aggregate(np.sum) + + assert len(gb1) == len(gb2) def test_agg_period_index(): - prng = period_range('2012-1-1', freq='M', periods=3) + prng = period_range("2012-1-1", freq="M", periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() assert isinstance(rs.index, PeriodIndex) # GH 3579 - index = period_range(start='1999-01', periods=5, freq='M') + index = period_range(start="1999-01", periods=5, freq="M") s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) - series = [('s1', s1), ('s2', s2)] + series = [("s1", s1), ("s2", s2)] df = DataFrame.from_dict(OrderedDict(series)) grouped = df.groupby(df.index.month) list(grouped) @@ -86,38 +105,38 @@ def test_agg_period_index(): def test_agg_dict_parameter_cast_result_dtypes(): # GH 12821 - df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) - df.loc[[0, 1, 2, 5], 'time'] = None + df = DataFrame( + { + "class": ["A", "A", "B", "B", "C", "C", "D", "D"], + "time": date_range("1/1/2011", periods=8, freq="H"), + } + ) + df.loc[[0, 1, 2, 5], "time"] = None # test for `first` function - exp = df.loc[[0, 3, 4, 6]].set_index('class') - grouped = df.groupby('class') + exp = df.loc[[0, 3, 4, 6]].set_index("class") + grouped = df.groupby("class") tm.assert_frame_equal(grouped.first(), exp) - tm.assert_frame_equal(grouped.agg('first'), exp) - tm.assert_frame_equal(grouped.agg({'time': 
'first'}), exp) - tm.assert_series_equal(grouped.time.first(), exp['time']) - tm.assert_series_equal(grouped.time.agg('first'), exp['time']) + tm.assert_frame_equal(grouped.agg("first"), exp) + tm.assert_frame_equal(grouped.agg({"time": "first"}), exp) + tm.assert_series_equal(grouped.time.first(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("first"), exp["time"]) # test for `last` function - exp = df.loc[[0, 3, 4, 7]].set_index('class') - grouped = df.groupby('class') + exp = df.loc[[0, 3, 4, 7]].set_index("class") + grouped = df.groupby("class") tm.assert_frame_equal(grouped.last(), exp) - tm.assert_frame_equal(grouped.agg('last'), exp) - tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp) - tm.assert_series_equal(grouped.time.last(), exp['time']) - tm.assert_series_equal(grouped.time.agg('last'), exp['time']) + tm.assert_frame_equal(grouped.agg("last"), exp) + tm.assert_frame_equal(grouped.agg({"time": "last"}), exp) + tm.assert_series_equal(grouped.time.last(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) # count - exp = pd.Series([2, 2, 2, 2], - index=Index(list('ABCD'), name='class'), - name='time') + exp = pd.Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.agg(len), exp) tm.assert_series_equal(grouped.time.size(), exp) - exp = pd.Series([0, 1, 1, 2], - index=Index(list('ABCD'), name='class'), - name='time') + exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.count(), exp) @@ -125,19 +144,17 @@ def test_agg_cast_results_dtypes(): # similar to GH12821 # xref #11444 u = [dt.datetime(2015, x + 1, 1) for x in range(12)] - v = list('aaabbbbbbccd') - df = pd.DataFrame({'X': v, 'Y': u}) + v = list("aaabbbbbbccd") + df = pd.DataFrame({"X": v, "Y": u}) - result = df.groupby('X')['Y'].agg(len) - expected = df.groupby('X')['Y'].count() + result = df.groupby("X")["Y"].agg(len) + expected = df.groupby("X")["Y"].count() tm.assert_series_equal(result, expected) def test_aggregate_float64_no_int64(): # see gh-11199 - df = DataFrame({"a": [1, 2, 3, 4, 5], - "b": [1, 2, 2, 4, 5], - "c": [1, 2, 3, 4, 5]}) + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" @@ -145,8 +162,7 @@ def test_aggregate_float64_no_int64(): result = df.groupby("b")[["a"]].mean() tm.assert_frame_equal(result, expected) - expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, - index=[1, 2, 4, 5]) + expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" result = df.groupby("b")[["a", "c"]].mean() @@ -157,138 +173,133 @@ def test_aggregate_api_consistency(): # GH 9052 # make sure that the aggregates via dict # are consistent - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - grouped = df.groupby(['A', 'B']) - c_mean = grouped['C'].mean() - c_sum = grouped['C'].sum() - d_mean = grouped['D'].mean() - d_sum = grouped['D'].sum() - - result = grouped['D'].agg(['sum', 'mean']) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + grouped = df.groupby(["A", 
"B"]) + c_mean = grouped["C"].mean() + c_sum = grouped["C"].sum() + d_mean = grouped["D"].mean() + d_sum = grouped["D"].sum() + + result = grouped["D"].agg(["sum", "mean"]) expected = pd.concat([d_sum, d_mean], axis=1) - expected.columns = ['sum', 'mean'] + expected.columns = ["sum", "mean"] tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['sum', 'mean']]) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped[['D', 'C']].agg([np.sum, np.mean]) + result = grouped[["D", "C"]].agg([np.sum, np.mean]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], - ['sum', 'mean']]) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped.agg({'C': 'mean', 'D': 'sum'}) + result = grouped.agg({"C": "mean", "D": "sum"}) expected = pd.concat([d_sum, c_mean], axis=1) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped.agg({'C': ['mean', 'sum'], - 'D': ['mean', 'sum']}) + result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]}) expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['mean', 'sum']]) + expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) + result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) - expected.columns = MultiIndex.from_product([['r', 'r2'], - ['D', 'C']]) + expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]]) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_dict_renaming_deprecation(): # 15931 - df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - 'B': range(5), - 'C': range(5)}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w: - df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, - 'C': {'bar': ['count', 'min']}}) + df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + df.groupby("A").agg( + {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} + ) assert "using a dict with renaming" in str(w[0].message) assert "named aggregation" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) + df.groupby("A")[["B", "C"]].agg({"ma": "max"}) with tm.assert_produces_warning(FutureWarning) as w: - df.groupby('A').B.agg({'foo': 'count'}) + df.groupby("A").B.agg({"foo": "count"}) assert "using a dict on a Series for aggregation" in str(w[0].message) assert "named aggregation instead." 
in str(w[0].message) def test_agg_compat(): # GH 12334 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) - expected.columns = MultiIndex.from_tuples([('C', 'sum'), - ('C', 'std')]) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) + expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'C': ['sum', 'std']}) + result = g["D"].agg({"C": ["sum", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) - expected.columns = ['C', 'D'] + expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) + expected.columns = ["C", "D"] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'C': 'sum', 'D': 'std'}) + result = g["D"].agg({"C": "sum", "D": "std"}) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(): # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - msg = r'cannot perform renaming for r[1-2] with a nested dictionary' + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"cannot perform renaming for r[1-2] with a nested dictionary" with pytest.raises(SpecificationError, match=msg): - g.aggregate({'r1': {'C': ['mean', 'sum']}, - 'r2': {'D': ['mean', 'sum']}}) + g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) - expected = pd.concat([g['C'].mean(), g['C'].std(), - g['D'].mean(), g['D'].std()], - axis=1) + result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) + expected = pd.concat( + [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1 + ) expected.columns = pd.MultiIndex.from_tuples( - [('ra', 'mean'), ('ra', 'std'), - ('rb', 'mean'), ('rb', 'std')]) + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) tm.assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) - expected = expected.rename(columns={'result1': 'D'}) + expected = g["D"].agg({"result1": np.sum, "result2": np.mean}) + expected = expected.rename(columns={"result1": "D"}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + result = g["D"].agg({"D": np.sum, 
"result2": np.mean}) tm.assert_frame_equal(result, expected, check_like=True) @@ -296,11 +307,11 @@ def test_agg_item_by_item_raise_typeerror(): df = DataFrame(np.random.randint(10, size=(20, 10))) def raiseException(df): - pprint_thing('----------------------------------------') + pprint_thing("----------------------------------------") pprint_thing(df.to_string()) - raise TypeError('test') + raise TypeError("test") - with pytest.raises(TypeError, match='test'): + with pytest.raises(TypeError, match="test"): df.groupby(0).agg(raiseException) @@ -315,22 +326,58 @@ def test_series_agg_multikey(): def test_series_agg_multi_pure_python(): data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) def bad(x): - assert (len(x.values.base) > 0) - return 'foo' + assert len(x.values.base) > 0 + return "foo" - result = data.groupby(['A', 'B']).agg(bad) - expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + result = data.groupby(["A", "B"]).agg(bad) + expected = data.groupby(["A", "B"]).agg(lambda x: "foo") tm.assert_frame_equal(result, expected) @@ -343,12 +390,20 @@ def P1(a): except Exception: return np.nan - df = DataFrame({'col1': [1, 2, 3, 4], - 'col2': [10, 25, 26, 31], - 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), - dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) - - g = df.groupby('date') + df = DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [10, 25, 26, 31], + "date": [ + dt.date(2013, 2, 10), + dt.date(2013, 2, 10), + dt.date(2013, 2, 11), + dt.date(2013, 2, 11), + ], + } + ) + + g = df.groupby("date") expected = g.agg([P1]) expected.columns = expected.columns.levels[0] @@ -359,74 +414,80 @@ def P1(a): def test_agg_callables(): # GH 7929 - df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) + df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) class fn_class: - def __call__(self, x): return sum(x) - equiv_callables = [sum, - np.sum, - lambda x: sum(x), - lambda x: x.sum(), - partial(sum), - fn_class(), ] + equiv_callables = [ + sum, + np.sum, + lambda x: sum(x), + lambda x: x.sum(), + partial(sum), + fn_class(), + ] expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: - result = df.groupby('foo').agg(ecall) + result = df.groupby("foo").agg(ecall) tm.assert_frame_equal(result, expected) def test_agg_over_numpy_arrays(): # GH 3788 - df = pd.DataFrame([[1, np.array([10, 20, 30])], - [1, np.array([40, 50, 60])], - [2, np.array([20, 30, 40])]], - columns=['category', 'arraydata']) - result = df.groupby('category').agg(sum) + df = pd.DataFrame( + [ + [1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])], + ], + columns=["category", "arraydata"], + ) + result = df.groupby("category").agg(sum) expected_data = [[np.array([50, 70, 90])], 
[np.array([20, 30, 40])]] - expected_index = pd.Index([1, 2], name='category') - expected_column = ['arraydata'] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) + expected_index = pd.Index([1, 2], name="category") + expected_column = ["arraydata"] + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) tm.assert_frame_equal(result, expected) def test_agg_timezone_round_trip(): # GH 15426 - ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') - df = pd.DataFrame({'a': 1, - 'b': [ts + dt.timedelta(minutes=nn) - for nn in range(10)]}) + ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") + df = pd.DataFrame( + {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]} + ) - result1 = df.groupby('a')['b'].agg(np.min).iloc[0] - result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] - result3 = df.groupby('a')['b'].min().iloc[0] + result1 = df.groupby("a")["b"].agg(np.min).iloc[0] + result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] + result3 = df.groupby("a")["b"].min().iloc[0] assert result1 == ts assert result2 == ts assert result3 == ts - dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') - for i in range(1, 5)] - df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) - grouped = df.groupby('A') + dates = [ + pd.Timestamp("2016-01-0%d 12:00:00" % i, tz="US/Pacific") for i in range(1, 5) + ] + df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) + grouped = df.groupby("A") - ts = df['B'].iloc[0] - assert ts == grouped.nth(0)['B'].iloc[0] - assert ts == grouped.head(1)['B'].iloc[0] - assert ts == grouped.first()['B'].iloc[0] + ts = df["B"].iloc[0] + assert ts == grouped.nth(0)["B"].iloc[0] + assert ts == grouped.head(1)["B"].iloc[0] + assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] - ts = df['B'].iloc[2] - assert ts == grouped.last()['B'].iloc[0] + ts = df["B"].iloc[2] + assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] @@ -438,68 +499,75 @@ def test_sum_uint64_overflow(): df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 - index = pd.Index([9223372036854775808, - 9223372036854775810, - 9223372036854775812], - dtype=np.uint64) - expected = pd.DataFrame({1: [9223372036854775809, - 9223372036854775811, - 9223372036854775813]}, - index=index) + index = pd.Index( + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 + ) + expected = pd.DataFrame( + {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, + index=index, + ) expected.index.name = 0 result = df.groupby(0).sum() tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("structure, expected", [ - (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), - (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), - (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), - (3, 4): (3, 4, 4)}})), - (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], - (3, 4): [3, 4, 4]}})) -]) +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + ( + lambda x: tuple(x), + pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), + ), + ( + lambda x: 
list(x), + pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), + ), + ], +) def test_agg_structs_dataframe(structure, expected): - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], - 'C': [1, 1, 1, 3, 4, 4]}) + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) - result = df.groupby(['A', 'B']).aggregate(structure) - expected.index.names = ['A', 'B'] + result = df.groupby(["A", "B"]).aggregate(structure) + expected.index.names = ["A", "B"] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("structure, expected", [ - (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), - (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), - (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], - index=[1, 3], name='C')), - (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], - index=[1, 3], name='C')) -]) +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + ], +) def test_agg_structs_series(structure, expected): # Issue #18079 - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], - 'C': [1, 1, 1, 3, 4, 4]}) + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) - result = df.groupby('A')['C'].aggregate(structure) - expected.index.name = 'A' + result = df.groupby("A")["C"].aggregate(structure) + expected.index.name = "A" tm.assert_series_equal(result, expected) def test_agg_category_nansum(observed): - categories = ['a', 'b', 'c'] - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=categories), - 'B': [1, 2, 3]}) + categories = ["a", "b", "c"] + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} + ) result = df.groupby("A", observed=observed).B.agg(np.nansum) - expected = pd.Series([3, 3, 0], - index=pd.CategoricalIndex(['a', 'b', 'c'], - categories=categories, - name='A'), - name='B') + expected = pd.Series( + [3, 3, 0], + index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), + name="B", + ) if observed: expected = expected[expected != 0] tm.assert_series_equal(result, expected) @@ -507,25 +575,32 @@ def test_agg_category_nansum(observed): def test_agg_list_like_func(): # GH 18473 - df = pd.DataFrame({'A': [str(x) for x in range(3)], - 'B': [str(x) for x in range(3)]}) - grouped = df.groupby('A', as_index=False, sort=False) - result = grouped.agg({'B': lambda x: list(x)}) - expected = pd.DataFrame({'A': [str(x) for x in range(3)], - 'B': [[str(x)] for x in range(3)]}) + df = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]} + ) + grouped = df.groupby("A", as_index=False, sort=False) + result = grouped.agg({"B": lambda x: list(x)}) + expected = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} + ) tm.assert_frame_equal(result, expected) def test_agg_lambda_with_timezone(): # GH 23683 - df = pd.DataFrame({ - 'tag': [1, 1], - 'date': [ - pd.Timestamp('2018-01-01', tz='UTC'), - pd.Timestamp('2018-01-02', tz='UTC')] - }) - result = df.groupby('tag').agg({'date': lambda e: e.head(1)}) - expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')], - 
index=pd.Index([1], name='tag'), - columns=['date']) + df = pd.DataFrame( + { + "tag": [1, 1], + "date": [ + pd.Timestamp("2018-01-01", tz="UTC"), + pd.Timestamp("2018-01-02", tz="UTC"), + ], + } + ) + result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) + expected = pd.DataFrame( + [pd.Timestamp("2018-01-01", tz="UTC")], + index=pd.Index([1], name="tag"), + columns=["date"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 3b636c87dc584f..bdf93756b7559a 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -7,22 +7,24 @@ @pytest.fixture def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) @pytest.fixture def df(): return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) @pytest.fixture @@ -42,27 +44,61 @@ def tsframe(tsd): @pytest.fixture def df_mixed_floats(): - return DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) + return DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.array(np.random.randn(8), dtype="float32"), + } + ) @pytest.fixture def three_group(): - return DataFrame({'A': ['foo', 'foo', 'foo', - 'foo', 'bar', 'bar', - 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', - 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', - 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + return DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 8f57254eae2193..44a583bf661e89 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -25,9 +25,13 @@ def test_apply_issues(): 2011.05.18,04:00,1.40750 2011.05.18,05:00,1.40649""" - df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], - 
parse_dates=[['date', 'time']]) - df = df.set_index('date_time') + df = pd.read_csv( + StringIO(s), + header=None, + names=["date", "time", "value"], + parse_dates=[["date", "time"]], + ) + df = df.set_index("date_time") expected = df.groupby(df.index.date).idxmax() result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) @@ -35,44 +39,45 @@ def test_apply_issues(): # GH 5789 # don't auto coerce dates - df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value']) + df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) exp_idx = pd.Index( - ['2011.05.16', '2011.05.17', '2011.05.18' - ], dtype=object, name='date') - expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) - result = df.groupby('date').apply( - lambda x: x['time'][x['value'].idxmax()]) + ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ) + expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) def test_apply_trivial(): # GH 20066 # trivial apply: ignore input and return a constant dataframe. - df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df.iloc[1:]) + df = pd.DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + result = df.groupby([str(x) for x in df.dtypes], axis=1).apply( + lambda x: df.iloc[1:] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object.") +@pytest.mark.xfail( + reason="GH#20066; function passed into apply " + "returns a DataFrame with the same index " + "as the one to create GroupBy object." +) def test_apply_trivial_fail(): # GH 20066 # trivial apply fails if the constant dataframe has the same index # with the one used to create GroupBy object. 
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df, df], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df) + df = pd.DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(lambda x: df) tm.assert_frame_equal(result, expected) @@ -85,15 +90,19 @@ def test_fast_apply(): N = 1000 labels = np.random.randint(0, 2000, size=N) labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + df = DataFrame( + { + "key": labels, + "key2": labels2, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) def f(g): return 1 - g = df.groupby(['key', 'key2']) + g = df.groupby(["key", "key2"]) grouper = g.grouper @@ -108,28 +117,33 @@ def f(g): @pytest.mark.parametrize( "df, group_names", [ - (DataFrame({"a": [1, 1, 1, 2, 3], - "b": ["a", "a", "a", "b", "c"]}), - [1, 2, 3]), - (DataFrame({"a": [0, 0, 1, 1], - "b": [0, 1, 0, 1]}), - [0, 1]), - (DataFrame({"a": [1]}), - [1]), - (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], - "b": range(8)}), - [1, 2]), - (DataFrame({"a": [1, 2, 3, 1, 2, 3], - "two": [4, 5, 6, 7, 8, 9]}), - [1, 2, 3]), - (DataFrame({"a": list("aaabbbcccc"), + (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]), + (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]), + (DataFrame({"a": [1]}), [1]), + (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]), + (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]), + ( + DataFrame( + { + "a": list("aaabbbcccc"), "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], - "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), - ["a", "b", "c"]), - (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), - [1, 2]), - ], ids=['GH2936', 'GH7739 & GH10519', 'GH10519', - 'GH2656', 'GH12155', 'GH20084', 'GH21417']) + "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8], + } + ), + ["a", "b", "c"], + ), + (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]), + ], + ids=[ + "GH2936", + "GH7739 & GH10519", + "GH10519", + "GH2656", + "GH12155", + "GH20084", + "GH21417", + ], +) def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 @@ -177,8 +191,12 @@ def f_constant_df(group): def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 - df = DataFrame({'foo1': np.random.randn(6), - 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) + df = DataFrame( + { + "foo1": np.random.randn(6), + "foo2": ["one", "two", "two", "three", "one", "two"], + } + ) result = df.apply(lambda x: x, axis=1).dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -193,12 +211,16 @@ def test_apply_with_mixed_dtype(): def test_groupby_as_index_apply(df): # GH #4648 and #3417 - df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], - 'user_id': [1, 2, 1, 1, 3, 1], - 'time': range(6)}) + df = DataFrame( + { + "item_id": ["b", "b", "a", "c", "a", "b"], + "user_id": [1, 2, 1, 1, 3, 1], + "time": range(6), + } + ) - g_as = df.groupby('user_id', as_index=True) - g_not_as = df.groupby('user_id', as_index=False) + g_as = 
df.groupby("user_id", as_index=True) + g_not_as = df.groupby("user_id", as_index=False) res_as = g_as.head(2).index res_not_as = g_not_as.head(2).index @@ -211,32 +233,31 @@ def test_groupby_as_index_apply(df): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), ( - 2, 4)]) + exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None]) + exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) tm.assert_index_equal(res_as_apply, exp_as_apply) tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) - ind = Index(list('abcde')) + ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) res = df.groupby(0, as_index=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) def test_apply_concat_preserve_names(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) def desc(group): result = group.describe() - result.index.name = 'stat' + result.index.name = "stat" return result def desc2(group): result = group.describe() - result.index.name = 'stat' - result = result[:len(group)] + result.index.name = "stat" + result = result[: len(group)] # weirdo return result @@ -244,31 +265,31 @@ def desc3(group): result = group.describe() # names are different - result.index.name = 'stat_%d' % len(group) + result.index.name = "stat_%d" % len(group) - result = result[:len(group)] + result = result[: len(group)] # weirdo return result result = grouped.apply(desc) - assert result.index.names == ('A', 'B', 'stat') + assert result.index.names == ("A", "B", "stat") result2 = grouped.apply(desc2) - assert result2.index.names == ('A', 'B', 'stat') + assert result2.index.names == ("A", "B", "stat") result3 = grouped.apply(desc3) - assert result3.index.names == ('A', 'B', None) + assert result3.index.names == ("A", "B", None) def test_apply_series_to_frame(): def f(piece): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): logged = np.log(piece) - return DataFrame({'value': piece, - 'demeaned': piece - piece.mean(), - 'logged': logged}) + return DataFrame( + {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} + ) - dr = bdate_range('1/1/2000', periods=100) + dr = bdate_range("1/1/2000", periods=100) ts = Series(np.random.randn(100), index=dr) grouped = ts.groupby(lambda x: x.month) @@ -279,45 +300,49 @@ def f(piece): def test_apply_series_yield_constant(df): - result = df.groupby(['A', 'B'])['C'].apply(len) - assert result.index.names[:2] == ('A', 'B') + result = df.groupby(["A", "B"])["C"].apply(len) + assert result.index.names[:2] == ("A", "B") def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(['A', 'B']).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None - result = df.groupby(['A', 'B'])[['C', 'D']].apply(len) + result = df.groupby(["A", "B"])[["C", "D"]].apply(len) assert isinstance(result, Series) assert result.name is None def test_apply_frame_to_series(df): - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) result = grouped.apply(len) - expected = grouped.count()['C'] + expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) def 
test_apply_frame_concat_series(): def trans(group): - return group.groupby('B')['C'].sum().sort_values()[:2] + return group.groupby("B")["C"].sum().sort_values()[:2] def trans2(group): - grouped = group.groupby(df.reindex(group.index)['B']) + grouped = group.groupby(df.reindex(group.index)["B"]) return grouped.sum().sort_values()[:2] - df = DataFrame({'A': np.random.randint(0, 5, 1000), - 'B': np.random.randint(0, 5, 1000), - 'C': np.random.randn(1000)}) + df = DataFrame( + { + "A": np.random.randint(0, 5, 1000), + "B": np.random.randint(0, 5, 1000), + "C": np.random.randn(1000), + } + ) - result = df.groupby('A').apply(trans) - exp = df.groupby('A')['C'].apply(trans2) + result = df.groupby("A").apply(trans) + exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) - assert result.name == 'C' + assert result.name == "C" def test_apply_transform(ts): @@ -331,7 +356,7 @@ def test_apply_multikey_corner(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) def f(group): - return group.sort_values('A')[-5:] + return group.sort_values("A")[-5:] result = grouped.apply(f) for key, group in grouped: @@ -340,59 +365,69 @@ def f(group): def test_apply_chunk_view(): # Low level tinkering could be unsafe, make sure not - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'value': range(9)}) + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby('key', group_keys=False).apply(lambda x: x[:2]) + result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) tm.assert_frame_equal(result, expected) def test_apply_no_name_column_conflict(): - df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], - 'value': range(9, -1, -1)}) + df = DataFrame( + { + "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + "value": range(9, -1, -1), + } + ) # it works! 
#2605 - grouped = df.groupby(['name', 'name2']) - grouped.apply(lambda x: x.sort_values('value', inplace=True)) + grouped = df.groupby(["name", "name2"]) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile( - ['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + } + ) def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby('d').apply(f) + result = df.groupby("d").apply(f) expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) + expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) def test_apply_multiindex_fail(): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby('d').apply(f) + result = df.groupby("d").apply(f) expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) + expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -407,24 +442,28 @@ def test_apply_without_copy(): # GH 5545 # returning a non-copy in an applied function fails - data = DataFrame({'id_field': [100, 100, 200, 300], - 'category': ['a', 'b', 'c', 'c'], - 'value': [1, 2, 3, 4]}) + data = DataFrame( + { + "id_field": [100, 100, 200, 300], + "category": ["a", "b", "c", "c"], + "value": [1, 2, 3, 4], + } + ) def filt1(x): if x.shape[0] == 1: return x.copy() else: - return x[x.category == 'c'] + return x[x.category == "c"] def filt2(x): if x.shape[0] == 1: return x else: - return x[x.category == 'c'] + return x[x.category == "c"] - expected = data.groupby('id_field').apply(filt1) - result = data.groupby('id_field').apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -433,18 +472,22 @@ def test_apply_corner_cases(): N = 1000 labels = np.random.randint(0, 100, size=N) - df = DataFrame({'key': labels, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + df = DataFrame( + { + "key": labels, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) - grouped = df.groupby('key') + grouped = df.groupby("key") def f(g): - g['value3'] = g['value1'] * 2 + g["value3"] = g["value1"] * 2 return g result = grouped.apply(f) - assert 'value3' in result + assert "value3" in result def test_apply_numeric_coercion_when_datetime(): @@ -454,45 +497,49 @@ def test_apply_numeric_coercion_when_datetime(): # for which are here. 
# GH 15670 - df = pd.DataFrame({'Number': [1, 2], - 'Date': ["2017-03-02"] * 2, - 'Str': ["foo", "inf"]}) - expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + df = pd.DataFrame( + {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} + ) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - tm.assert_series_equal(result['Str'], expected['Str']) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 - df = pd.DataFrame({'A': [10, 20, 30], - 'B': ['foo', '3', '4'], - 'T': [pd.Timestamp("12:31:22")] * 3}) + df = pd.DataFrame( + {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} + ) def get_B(g): - return g.iloc[0][['B']] - result = df.groupby('A').apply(get_B)['B'] + return g.iloc[0][["B"]] + + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) # GH 14423 def predictions(tool): - out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) - if 'step1' in list(tool.State): - out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) - if 'step2' in list(tool.State): - out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) - out['useTime'] = str( - tool[tool.State == 'step2'].oTime.values[0]) + out = pd.Series(index=["p1", "p2", "useTime"], dtype=object) + if "step1" in list(tool.State): + out["p1"] = str(tool[tool.State == "step1"].Machine.values[0]) + if "step2" in list(tool.State): + out["p2"] = str(tool[tool.State == "step2"].Machine.values[0]) + out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0]) return out - df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], - 'State': ['step1', 'step2', 'step1', 'step2'], - 'oTime': ['', '2016-09-19 05:24:33', - '', '2016-09-19 23:59:04'], - 'Machine': ['23', '36L', '36R', '36R']}) + + df1 = pd.DataFrame( + { + "Key": ["B", "B", "A", "A"], + "State": ["step1", "step2", "step1", "step2"], + "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"], + "Machine": ["23", "36L", "36R", "36R"], + } + ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby('Key').apply(predictions).p1 - result = df2.groupby('Key').apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -502,23 +549,23 @@ def test_time_field_bug(): # that were not returned by the apply function, an exception would be # raised. 
- df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]}) + df = pd.DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]}) def func_with_no_date(batch): - return pd.Series({'c': 2}) + return pd.Series({"c": 2}) def func_with_date(batch): - return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) + return pd.Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) - dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) - dfg_no_conversion_expected.index.name = 'a' + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion_expected = pd.DataFrame({"c": 2}, index=[1]) + dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=['a']).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = pd.DataFrame( - {'b': datetime(2015, 1, 1), - 'c': 2}, index=[1]) - dfg_conversion_expected.index.name = 'a' + {"b": datetime(2015, 1, 1), "c": 2}, index=[1] + ) + dfg_conversion_expected.index.name = "a" tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected) @@ -527,14 +574,16 @@ def func_with_date(batch): def test_gb_apply_list_of_unequal_len_arrays(): # GH1738 - df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a', - 'b', 'b', 'b'], - 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd', - 'd', 'd', 'e'], - 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], - 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) + df = DataFrame( + { + "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"], + "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"], + "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], + "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3], + } + ) + df = df.set_index(["group1", "group2"]) + df_grouped = df.groupby(level=["group1", "group2"], sort=True) def noddy(value, weight): out = np.array(value * weight).repeat(3) @@ -552,61 +601,58 @@ def noddy(value, weight): def test_groupby_apply_all_none(): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. - test_df = DataFrame({'groups': [0, 0, 1, 1], - 'random_vars': [8, 7, 4, 5]}) + test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]}) def test_func(x): pass - result = test_df.groupby('groups').apply(test_func) + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) def test_groupby_apply_none_first(): # GH 12824. Tests if apply returns None first. 
- test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) - test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) + test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) + test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) def test_func(x): if x.shape[0] < 2: return None return x.iloc[[0, -1]] - result1 = test_df1.groupby('groups').apply(test_func) - result2 = test_df2.groupby('groups').apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], - names=['groups', None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], - names=['groups', None]) - expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, - index=index1) - expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, - index=index2) + result1 = test_df1.groupby("groups").apply(test_func) + result2 = test_df2.groupby("groups").apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) + expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) + expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups - df = pd.DataFrame(dict(value=[0, 1], group=['filled', 'empty'])) - groups = df.groupby('group') - result = groups.apply(lambda group: group[group.value != 1]['value']) - expected = pd.Series([0], name='value', - index=MultiIndex.from_product([['empty', 'filled'], - [0]], - names=['group', None] - ).drop('empty')) + df = pd.DataFrame(dict(value=[0, 1], group=["filled", "empty"])) + groups = df.groupby("group") + result = groups.apply(lambda group: group[group.value != 1]["value"]) + expected = pd.Series( + [0], + name="value", + index=MultiIndex.from_product( + [["empty", "filled"], [0]], names=["group", None] + ).drop("empty"), + ) tm.assert_series_equal(result, expected) def test_apply_with_mixed_types(): # gh-20949 - df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]}) - g = df.groupby('A') + df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) + g = df.groupby("A") result = g.transform(lambda x: x / x.sum()) - expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]}) + expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) result = g.apply(lambda x: x / x.sum()) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 18bb9315b68c4a..7c12b490f46d2f 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -14,6 +14,7 @@ def test_series_grouper(): from pandas import Series + obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -31,6 +32,7 @@ def test_series_grouper(): def test_series_bin_grouper(): from pandas import Series + obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -47,7 +49,6 @@ def test_series_bin_grouper(): class TestBinGroupers: - def setup_method(self, method): self.obj = np.random.randn(10, 1) self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) @@ -58,31 +59,31 @@ def test_generate_bins(self): binner = np.array([0, 3, 6, 9], dtype=np.int64) for func in [lib.generate_bins_dt64, generate_bins_generic]: - bins = func(values, binner, closed='left') - assert ((bins == np.array([2, 5, 6])).all()) + 
bins = func(values, binner, closed="left") + assert (bins == np.array([2, 5, 6])).all() - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6, 6])).all()) + bins = func(values, binner, closed="right") + assert (bins == np.array([3, 6, 6])).all() for func in [lib.generate_bins_dt64, generate_bins_generic]: values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) binner = np.array([0, 3, 6], dtype=np.int64) - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6])).all()) + bins = func(values, binner, closed="right") + assert (bins == np.array([3, 6])).all() msg = "Invalid length for values or for binner" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [], 'right') + generate_bins_generic(values, [], "right") with pytest.raises(ValueError, match=msg): - generate_bins_generic(values[:0], binner, 'right') + generate_bins_generic(values[:0], binner, "right") msg = "Values falls before first bin" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [4], 'right') + generate_bins_generic(values, [4], "right") msg = "Values falls after last bin" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [-3, -1], 'right') + generate_bins_generic(values, [-3, -1], "right") def test_group_ohlc(): @@ -92,10 +93,9 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) + labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, 'group_ohlc_%s' % dtype) + func = getattr(groupby, "group_ohlc_%s" % dtype) func(out, counts, obj[:, None], labels) def _ohlc(group): @@ -103,20 +103,18 @@ def _ohlc(group): return np.repeat(nan, 4) return [group[0], group.max(), group.min(), group[-1]] - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), - _ohlc(obj[12:])]) + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) assert_almost_equal(out, expected) - tm.assert_numpy_array_equal(counts, - np.array([6, 6, 8], dtype=np.int64)) + tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) obj[:6] = nan func(out, counts, obj[:, None], labels) expected[0] = nan assert_almost_equal(out, expected) - _check('float32') - _check('float64') + _check("float32") + _check("float64") class TestMoments: @@ -124,7 +122,6 @@ class TestMoments: class TestReducer: - def test_int_index(self): from pandas.core.series import Series @@ -133,23 +130,23 @@ def test_int_index(self): expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.reduce(arr, np.sum, axis=1, - labels=Index(np.arange(100))) + result = reduction.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) expected = arr.sum(1) assert_almost_equal(result, expected) - dummy = Series(0., index=np.arange(100)) - result = reduction.reduce(arr, np.sum, dummy=dummy, - labels=Index(np.arange(4))) + dummy = Series(0.0, index=np.arange(100)) + result = reduction.reduce(arr, np.sum, dummy=dummy, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - dummy = Series(0., index=np.arange(4)) - result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) + dummy = Series(0.0, index=np.arange(4)) + result = reduction.reduce( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = 
reduction.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) + result = reduction.reduce( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 58a43dc218d333..486b3b28b29a35 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -8,10 +8,16 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut) + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + qcut, +) import pandas.util.testing as tm -from pandas.util.testing import ( - assert_equal, assert_frame_equal, assert_series_equal) +from pandas.util.testing import assert_equal, assert_frame_equal, assert_series_equal def cartesian_product_for_groupers(result, args, names): @@ -21,9 +27,9 @@ def cartesian_product_for_groupers(result, args, names): def f(a): if isinstance(a, (CategoricalIndex, Categorical)): categories = a.categories - a = Categorical.from_codes(np.arange(len(categories)), - categories=categories, - ordered=a.ordered) + a = Categorical.from_codes( + np.arange(len(categories)), categories=categories, ordered=a.ordered + ) return a index = MultiIndex.from_product(map(f, args), names=names) @@ -34,60 +40,64 @@ def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} + return { + "min": group.min(), + "max": group.max(), + "count": group.count(), + "mean": group.mean(), + } result = df.groupby(cats, observed=False).D.apply(get_stats) - assert result.index.names[0] == 'C' + assert result.index.names[0] == "C" def test_basic(): - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) - expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) + exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True) + expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) - exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) + exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) + expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name) + x = DataFrame( + [[1, "John P. 
Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) - g = x.groupby(['person_id'], observed=False) + g = x.groupby(["person_id"], observed=False) result = g.transform(lambda x: x) - tm.assert_frame_equal(result, x[['person_name']]) + tm.assert_frame_equal(result, x[["person_name"]]) - result = x.drop_duplicates('person_name') + result = x.drop_duplicates("person_name") expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): - return x.drop_duplicates('person_name').iloc[0] + return x.drop_duplicates("person_name").iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name='person_id') - expected['person_name'] = expected['person_name'].astype('object') + expected.index = Index([1, 2], name="person_id") + expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) # GH 9921 @@ -96,56 +106,47 @@ def f(x): c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) - tm.assert_series_equal(result, df['a']) + tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( - df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df['a']) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( - df.groupby(c, observed=False).transform(sum), - df[['a']]) - tm.assert_frame_equal( - df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), - df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]] + ) # Filter - tm.assert_series_equal( - df.a.groupby(c, observed=False).filter(np.all), - df['a']) - tm.assert_frame_equal( - df.groupby(c, observed=False).filter(np.all), - df) + tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) + tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) - tm.assert_series_equal(result, df['a']) + tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( - df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df['a']) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( - df.groupby(c, observed=False).transform(sum), - df[['a']]) - tm.assert_frame_equal( - df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] + ) # GH 9603 - df = DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) + df = DataFrame({"a": [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) result = df.groupby(c, observed=False).apply(len) - exp_index = CategoricalIndex( - c.values.categories, ordered=c.values.ordered) + exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) - expected.index.name = 'a' + expected.index.name = "a" tm.assert_series_equal(result, expected) # more basic - levels = ['foo', 'bar', 'baz', 'qux'] + levels = ["foo", "bar", "baz", "qux"] codes = np.random.randint(0, 4, size=100) cats = 
Categorical.from_codes(codes, levels, ordered=True) @@ -155,8 +156,7 @@ def f(x): result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) + exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -168,73 +168,69 @@ def f(x): ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby( - exp_cats, sort=False, observed=False).describe() + exp_cats = Categorical( + ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"] + ) + expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_level_get_group(observed): # GH15155 - df = DataFrame(data=np.arange(2, 22, 2), - index=MultiIndex( - levels=[CategoricalIndex(["a", "b"]), range(10)], - codes=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"])) + df = DataFrame( + data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) g = df.groupby(level=["Index1"], observed=observed) # expected should equal test.loc[["a"]] # GH15166 - expected = DataFrame(data=np.arange(2, 12, 2), - index=MultiIndex(levels=[CategoricalIndex( - ["a", "b"]), range(5)], - codes=[[0] * 5, range(5)], - names=["Index1", "Index2"])) - result = g.get_group('a') + expected = DataFrame( + data=np.arange(2, 12, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(5)], + codes=[[0] * 5, range(5)], + names=["Index1", "Index2"], + ), + ) + result = g.get_group("a") assert_frame_equal(result, expected) @pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 - dense = Categorical(list('abc'), ordered=ordered) + dense = Categorical(list("abc"), ordered=ordered) # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) + missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered) values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense'], observed=True) + df = DataFrame({"missing": missing, "dense": dense, "values": values}) + grouped = df.groupby(["missing", "dense"], observed=True) # missing category 'b' should still exist in the output index - idx = MultiIndex.from_arrays( - [missing, dense], names=['missing', 
'dense']) - expected = DataFrame([0, 1, 2.], - index=idx, - columns=['values']) + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) + expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) result = grouped.apply(lambda x: np.mean(x)) assert_frame_equal(result, expected) # we coerce back to ints - expected = expected.astype('int') + expected = expected.astype("int") result = grouped.mean() assert_frame_equal(result, expected) @@ -242,8 +238,7 @@ def test_apply(ordered): assert_frame_equal(result, expected) # but for transform we should still get back the original index - idx = MultiIndex.from_arrays([missing, dense], - names=['missing', 'dense']) + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) result = grouped.apply(lambda x: 1) assert_series_equal(result, expected) @@ -257,88 +252,81 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - df['C'] = ['foo', 'bar'] * 2 + df["C"] = ["foo", "bar"] * 2 # multiple groupers with a non-cat - gb = df.groupby(['A', 'B', 'C'], observed=observed) + gb = df.groupby(["A", "B", "C"], observed=observed) exp_index = MultiIndex.from_arrays( - [cat1, cat2, ['foo', 'bar'] * 2], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - [1, 2, 3, 4], index=exp_index)}).sort_index() + [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"] + ) + expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index() result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, - [cat1, cat2, ['foo', 'bar']], - list('ABC')) + expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + ) tm.assert_frame_equal(result, expected) - gb = df.groupby(['A', 'B'], observed=observed) - exp_index = MultiIndex.from_arrays( - [cat1, cat2], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, 3, 4]}, - index=exp_index) + gb = df.groupby(["A", "B"], observed=observed) + exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers( - expected, - [cat1, cat2], - list('AB')) + expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) tm.assert_frame_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() - exp_index = CategoricalIndex(list('ab'), name="cat", - categories=list('abc'), - ordered=True) - expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, - index=exp_index) + exp_index = CategoricalIndex( + list("ab"), name="cat", categories=list("abc"), ordered=True + ) + expected = 
DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index) if not observed: - index = CategoricalIndex(list('abc'), name="cat", - categories=list('abc'), - ordered=True) + index = CategoricalIndex( + list("abc"), name="cat", categories=list("abc"), ordered=True + ) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) # Grouping on two columns groups_double_key = df.groupby(["cat", "ints"], observed=observed) - result = groups_double_key.agg('mean') + result = groups_double_key.agg("mean") expected = DataFrame( - {"val": [10, 30, 20, 40], - "cat": Categorical(['a', 'a', 'b', 'b'], - categories=['a', 'b', 'c'], - ordered=True), - "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) + { + "val": [10, 30, 20, 40], + "cat": Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 2, 1, 2], + } + ).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( - expected, - [df.cat.values, [1, 2]], - ['cat', 'ints']) + expected, [df.cat.values, [1, 2]], ["cat", "ints"] + ) tm.assert_frame_equal(result, expected) # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]: c, i = key result = groups_double_key.get_group(key) expected = df[(df.cat == c) & (df.ints == i)] @@ -346,37 +334,37 @@ def test_observed(observed): # gh-8869 # with as_index - d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], - 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} + d = { + "foo": [10, 8, 4, 8, 4, 1, 1], + "bar": [10, 20, 30, 40, 50, 60, 70], + "baz": ["d", "c", "e", "a", "a", "d", "c"], + } df = DataFrame(d) - cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) - df['range'] = cat - groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) - result = groups.agg('mean') + cat = pd.cut(df["foo"], np.linspace(0, 10, 3)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=False, observed=observed) + result = groups.agg("mean") - groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed) - expected = groups2.agg('mean').reset_index() + groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed) + expected = groups2.agg("mean").reset_index() tm.assert_frame_equal(result, expected) def test_observed_codes_remap(observed): - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]} df = DataFrame(d) - values = pd.cut(df['C1'], [1, 2, 3, 6]) + values = pd.cut(df["C1"], [1, 2, 3, 6]) values.name = "cat" - groups_double_key = df.groupby([values, 'C2'], observed=observed) + groups_double_key = df.groupby([values, "C2"], observed=observed) - idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], - names=["cat", "C2"]) - expected = DataFrame({"C1": [3, 3, 4, 5], - "C3": [10, 100, 200, 34]}, index=idx) + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) + expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx) if not observed: expected = cartesian_product_for_groupers( - expected, - [values.values, [1, 2, 3, 4]], - ['cat', 'C2']) + expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] + ) - result = groups_double_key.agg('mean') + result = groups_double_key.agg("mean") tm.assert_frame_equal(result, expected) @@ -384,14 +372,17 @@ def test_observed_perf(): # we create a cartesian product, so this is # non-performant if we don't use observed values # gh-14942 - df = DataFrame({ - 
'cat': np.random.randint(0, 255, size=30000), - 'int_id': np.random.randint(0, 255, size=30000), - 'other_id': np.random.randint(0, 10000, size=30000), - 'foo': 0}) - df['cat'] = df.cat.astype(str).astype('category') - - grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True) + df = DataFrame( + { + "cat": np.random.randint(0, 255, size=30000), + "int_id": np.random.randint(0, 255, size=30000), + "other_id": np.random.randint(0, 10000, size=30000), + "foo": 0, + } + ) + df["cat"] = df.cat.astype(str).astype("category") + + grouped = df.groupby(["cat", "int_id", "other_id"], observed=True) result = grouped.count() assert result.index.levels[0].nunique() == df.cat.nunique() assert result.index.levels[1].nunique() == df.int_id.nunique() @@ -402,52 +393,61 @@ def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups - cat = Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) - df = DataFrame({'cat': cat, 'vals': [1, 2, 3]}) - g = df.groupby('cat', observed=observed) + cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"]) + df = DataFrame({"cat": cat, "vals": [1, 2, 3]}) + g = df.groupby("cat", observed=observed) result = g.groups if observed: - expected = {'a': Index([0, 2], dtype='int64'), - 'c': Index([1], dtype='int64')} + expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")} else: - expected = {'a': Index([0, 2], dtype='int64'), - 'b': Index([], dtype='int64'), - 'c': Index([1], dtype='int64')} + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "c": Index([1], dtype="int64"), + } tm.assert_dict_equal(result, expected) def test_observed_groups_with_nan(observed): # GH 24740 - df = DataFrame({'cat': Categorical(['a', np.nan, 'a'], - categories=['a', 'b', 'd']), - 'vals': [1, 2, 3]}) - g = df.groupby('cat', observed=observed) + df = DataFrame( + { + "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]), + "vals": [1, 2, 3], + } + ) + g = df.groupby("cat", observed=observed) result = g.groups if observed: - expected = {'a': Index([0, 2], dtype='int64')} + expected = {"a": Index([0, 2], dtype="int64")} else: - expected = {'a': Index([0, 2], dtype='int64'), - 'b': Index([], dtype='int64'), - 'd': Index([], dtype='int64')} + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "d": Index([], dtype="int64"), + } tm.assert_dict_equal(result, expected) def test_dataframe_categorical_with_nan(observed): # GH 21151 - s1 = Categorical([np.nan, 'a', np.nan, 'a'], - categories=['a', 'b', 'c']) + s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) s2 = Series([1, 2, 3, 4]) - df = DataFrame({'s1': s1, 's2': s2}) - result = df.groupby('s1', observed=observed).first().reset_index() + df = DataFrame({"s1": s1, "s2": s2}) + result = df.groupby("s1", observed=observed).first().reset_index() if observed: - expected = DataFrame({'s1': Categorical(['a'], - categories=['a', 'b', 'c']), 's2': [2]}) + expected = DataFrame( + {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]} + ) else: - expected = DataFrame({'s1': Categorical(['a', 'b', 'c'], - categories=['a', 'b', 'c']), - 's2': [2, np.nan, np.nan]}) + expected = DataFrame( + { + "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "s2": [2, np.nan, np.nan], + } + ) tm.assert_frame_equal(result, expected) @@ -460,32 +460,35 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # Build a dataframe with cat having one unobserved 
category ('missing'), # and a Series with identical values - label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'], - categories=['a', 'b', 'missing', 'd'], - ordered=ordered) - val = Series(['d', 'a', 'b', 'a', 'd', 'b']) - df = DataFrame({'label': label, 'val': val}) + label = Categorical( + ["d", "a", "b", "a", "d", "b"], + categories=["a", "b", "missing", "d"], + ordered=ordered, + ) + val = Series(["d", "a", "b", "a", "d", "b"]) + df = DataFrame({"label": label, "val": val}) # aggregate on the Categorical - result = (df.groupby('label', observed=observed, sort=sort)['val'] - .aggregate('first')) + result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first") # If ordering works, we expect index labels equal to aggregation results, # except for 'observed=False': label 'missing' has aggregation None - label = Series(result.index.array, dtype='object') + label = Series(result.index.array, dtype="object") aggr = Series(result.array) if not observed: - aggr[aggr.isna()] = 'missing' + aggr[aggr.isna()] = "missing" if not all(label == aggr): - msg = ('Labels and aggregation results not consistently sorted\n' + - 'for (ordered={}, observed={}, sort={})\n' + - 'Result:\n{}').format(ordered, observed, sort, result) + msg = ( + "Labels and aggregation results not consistently sorted\n" + + "for (ordered={}, observed={}, sort={})\n" + + "Result:\n{}" + ).format(ordered, observed, sort, result) assert False, msg def test_datetime(): # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) + levels = pd.date_range("2014-01-01", periods=4) codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) @@ -495,9 +498,9 @@ def test_datetime(): expected = data.groupby(np.asarray(cats), observed=False).mean() expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) + expected.index = CategoricalIndex( + expected.index, categories=expected.index, ordered=True + ) assert_frame_equal(result, expected) @@ -511,54 +514,50 @@ def test_datetime(): assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( - desc_result.index.get_level_values(0), - expected.index.get_level_values(0)) + desc_result.index.get_level_values(0), expected.index.get_level_values(0) + ) # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_categorical_index(): s = np.random.RandomState(12345) - levels = ['foo', 'bar', 'baz', 'qux'] + levels = ["foo", "bar", "baz", "qux"] codes = s.randint(0, 4, size=20) cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats + df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd")) + df["cats"] = cats # with a cat index - 
result = df.set_index('cats').groupby(level=0, observed=False).sum() - expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() + result = df.set_index("cats").groupby(level=0, observed=False).sum() + expected = df[list("abcd")].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats" + ) assert_frame_equal(result, expected) # with a cat column, should produce a cat index - result = df.groupby('cats', observed=False).sum() - expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() + result = df.groupby("cats", observed=False).sum() + expected = df[list("abcd")].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats" + ) assert_frame_equal(result, expected) def test_describe_categorical_columns(): # GH 11558 - cats = CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) + cats = CategoricalIndex( + ["qux", "foo", "baz", "bar"], + categories=["foo", "bar", "baz", "qux"], + ordered=True, + ) df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() @@ -568,22 +567,20 @@ def test_describe_categorical_columns(): def test_unstack_categorical(): # GH11558 (example is taken from the original issue) - df = DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') + df = DataFrame( + {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2} + ) + df["medium"] = df["medium"].astype("category") - gcat = df.groupby( - ['artist', 'medium'], observed=False)['a'].count().unstack() + gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack() result = gcat.describe() - exp_columns = CategoricalIndex(['A', 'B'], ordered=False, - name='medium') + exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium") tm.assert_index_equal(result.columns, exp_columns) tm.assert_categorical_equal(result.columns.values, exp_columns.values) - result = gcat['A'] + gcat['B'] - expected = Series([6, 4], index=Index(['X', 'Y'], name='artist')) + result = gcat["A"] + gcat["B"] + expected = Series([6, 4], index=Index(["X", "Y"], name="artist")) tm.assert_series_equal(result, expected) @@ -599,41 +596,54 @@ def test_bins_unequal_len(): def test_as_index(): # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum() + df = DataFrame( + { + "cat": Categorical([1, 2, 2], [1, 2, 3]), + "A": [10, 11, 11], + "B": [101, 102, 103], + } + ) + result = df.groupby(["cat", "A"], as_index=False, observed=True).sum() expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 11], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) tm.assert_frame_equal(result, expected) # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False, observed=True).sum() + f = lambda r: df.loc[r, "A"] + result = df.groupby(["cat", f], 
as_index=False, observed=True).sum() expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 22], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 22], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) tm.assert_frame_equal(result, expected) # another not in-axis grouper (conflicting names in index) - s = Series(['a', 'b', 'b'], name='cat') - result = df.groupby(['cat', s], as_index=False, observed=True).sum() + s = Series(["a", "b", "b"], name="cat") + result = df.groupby(["cat", s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) # is original index dropped? - group_columns = ['cat', 'A'] + group_columns = ["cat", "A"] expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 11], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) - - for name in [None, 'X', 'B']: + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) + + for name in [None, "X", "B"]: df.index = Index(list("abc"), name=name) result = df.groupby(group_columns, as_index=False, observed=True).sum() @@ -642,79 +652,76 @@ def test_as_index(): def test_preserve_categories(): # GH-13179 - categories = list('abc') + categories = list("abc") # ordered=True - df = DataFrame({'A': Categorical(list('ba'), - categories=categories, - ordered=True)}) + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)}) index = CategoricalIndex(categories, categories, ordered=True) tm.assert_index_equal( - df.groupby('A', sort=True, observed=False).first().index, index) + df.groupby("A", sort=True, observed=False).first().index, index + ) tm.assert_index_equal( - df.groupby('A', sort=False, observed=False).first().index, index) + df.groupby("A", sort=False, observed=False).first().index, index + ) # ordered=False - df = DataFrame({'A': Categorical(list('ba'), - categories=categories, - ordered=False)}) + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False) - nosort_index = CategoricalIndex(list('bac'), list('bac'), - ordered=False) + nosort_index = CategoricalIndex(list("bac"), list("bac"), ordered=False) tm.assert_index_equal( - df.groupby('A', sort=True, observed=False).first().index, - sort_index) + df.groupby("A", sort=True, observed=False).first().index, sort_index + ) tm.assert_index_equal( - df.groupby('A', sort=False, observed=False).first().index, - nosort_index) + df.groupby("A", sort=False, observed=False).first().index, nosort_index + ) def test_preserve_categorical_dtype(): # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: + exp_full = 
DataFrame( + { + "A": [2.0, 1.0, np.nan], + "B": [25.0, 20.0, np.nan], + "C1": Categorical(list("bac"), categories=list("bac"), ordered=False), + "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), + } + ) + for col in ["C1", "C2"]: result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = df.groupby( - by=col, as_index=True, observed=False).mean().reset_index() + result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index() expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @pytest.mark.parametrize( - 'func, values', - [('first', ['second', 'first']), - ('last', ['fourth', 'third']), - ('min', ['fourth', 'first']), - ('max', ['second', 'third'])]) + "func, values", + [ + ("first", ["second", "first"]), + ("last", ["fourth", "third"]), + ("min", ["fourth", "first"]), + ("max", ["second", "third"]), + ], +) def test_preserve_on_ordered_ops(func, values): # gh-18502 # preserve the categoricals on ops - c = pd.Categorical(['first', 'second', 'third', 'fourth'], ordered=True) - df = pd.DataFrame( - {'payload': [-1, -2, -1, -2], - 'col': c}) - g = df.groupby('payload') + c = pd.Categorical(["first", "second", "third", "fourth"], ordered=True) + df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) + g = df.groupby("payload") result = getattr(g, func)() expected = pd.DataFrame( - {'payload': [-2, -1], - 'col': pd.Series(values, dtype=c.dtype)}).set_index('payload') + {"payload": [-2, -1], "col": pd.Series(values, dtype=c.dtype)} + ).set_index("payload") tm.assert_frame_equal(result, expected) @@ -727,8 +734,9 @@ def test_categorical_no_compress(): result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean() - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) @@ -736,12 +744,16 @@ def test_categorical_no_compress(): result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) assert_series_equal(result, exp) - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) result = data.groupby("b", observed=False).mean() @@ -757,15 +769,16 @@ def test_sort(): # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) - res = 
df.groupby(['value_group'], observed=False)['value_group'].count() + res = df.groupby(["value_group"], observed=False)["value_group"].count() exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] exp.index = CategoricalIndex(exp.index, name=exp.index.name) tm.assert_series_equal(res, exp) @@ -773,20 +786,27 @@ def test_sort(): def test_sort2(): # dataframe groupby sort was being ignored # GH 8868 - df = DataFrame([['(7.5, 10]', 10, 10], - ['(7.5, 10]', 8, 20], - ['(2.5, 5]', 5, 30], - ['(5, 7.5]', 6, 40], - ['(2.5, 5]', 4, 50], - ['(0, 2.5]', 1, 60], - ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range', ordered=True) - expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - col = 'range' + df = DataFrame( + [ + ["(7.5, 10]", 10, 10], + ["(7.5, 10]", 8, 20], + ["(2.5, 5]", 5, 30], + ["(5, 7.5]", 6, 40], + ["(2.5, 5]", 4, 50], + ["(0, 2.5]", 1, 60], + ["(5, 7.5]", 7, 70], + ], + columns=["range", "foo", "bar"], + ) + df["range"] = Categorical(df["range"], ordered=True) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + col = "range" result_sort = df.groupby(col, sort=True, observed=False).first() assert_frame_equal(result_sort, expected_sort) @@ -795,21 +815,24 @@ def test_sort2(): result_sort = df.groupby(col, sort=False, observed=False).first() assert_frame_equal(result_sort, expected_sort) - df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range') - expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', - '(0, 2.5]'], - categories=['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], - name='range') - expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - index=index, columns=['foo', 'bar']) - - col = 'range' + df["range"] = Categorical(df["range"], ordered=False) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range" + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + index = CategoricalIndex( + ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + categories=["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + name="range", + ) + expected_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"] + ) + + col = "range" # this is an unordered categorical, but we allow this #### result_sort = df.groupby(col, sort=True, observed=False).first() @@ -824,165 +847,207 @@ def test_sort_datetimelike(): # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month - df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), - datetime(2011, 2, 1), datetime(2011, 5, 1), - datetime(2011, 2, 1), datetime(2011, 1, 1), - datetime(2011, 5, 1)], - 'foo': [10, 8, 5, 6, 4, 1, 7], - 'bar': [10, 20, 30, 40, 50, 60, 70]}, - columns=['dt', 'foo', 'bar']) + df = DataFrame( + { + "dt": [ + datetime(2011, 7, 1), + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 2, 1), + datetime(2011, 1, 1), 
+ datetime(2011, 5, 1), + ], + "foo": [10, 8, 5, 6, 4, 1, 7], + "bar": [10, 20, 30, 40, 50, 60, 70], + }, + columns=["dt", "foo", "bar"], + ) # ordered=True - df['dt'] = Categorical(df['dt'], ordered=True) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] + df["dt"] = Categorical(df["dt"], ordered=True) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt', ordered=True) - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt', ordered=True) - - col = 'dt' - assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt", ordered=True) + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex( + index, categories=index, name="dt", ordered=True + ) + + col = "dt" + assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) # when categories is ordered, group is ordered by category's order - assert_frame_equal( - result_sort, df.groupby(col, sort=False, observed=False).first()) + assert_frame_equal(result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False - df['dt'] = Categorical(df['dt'], ordered=False) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] + df["dt"] = Categorical(df["dt"], ordered=False) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt') - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt') - - col = 'dt' + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt") + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex(index, categories=index, name="dt") + + col = "dt" + assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) - assert_frame_equal( - result_nosort, df.groupby(col, sort=False, observed=False).first()) + result_nosort, df.groupby(col, sort=False, observed=False).first() + ) def test_empty_sum(): # https://github.com/pandas-dev/pandas/issues/18678 - df = DataFrame({"A": Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = 
CategoricalIndex(['a', 'b', 'c'], name='A') + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") # 0 by default result = df.groupby("A", observed=False).B.sum() - expected = Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.sum(min_count=0) - expected = Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.sum(min_count=1) - expected = Series([3, 1, np.nan], expected_idx, name='B') + expected = Series([3, 1, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count>1 result = df.groupby("A", observed=False).B.sum(min_count=2) - expected = Series([3, np.nan, np.nan], expected_idx, name='B') + expected = Series([3, np.nan, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) def test_empty_prod(): # https://github.com/pandas-dev/pandas/issues/18678 - df = DataFrame({"A": Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) - expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") # 1 by default result = df.groupby("A", observed=False).B.prod() - expected = Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.prod(min_count=0) - expected = Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.prod(min_count=1) - expected = Series([2, 1, np.nan], expected_idx, name='B') + expected = Series([2, 1, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) def test_groupby_multiindex_categorical_datetime(): # https://github.com/pandas-dev/pandas/issues/21390 - df = DataFrame({ - 'key1': Categorical(list('abcbabcba')), - 'key2': Categorical( - list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), - 'values': np.arange(9), - }) - result = df.groupby(['key1', 'key2']).mean() + df = DataFrame( + { + "key1": Categorical(list("abcbabcba")), + "key2": Categorical( + list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3 + ), + "values": np.arange(9), + } + ) + result = df.groupby(["key1", "key2"]).mean() idx = MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], - names=['key1', 'key2']) - expected = DataFrame( - {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + [ + Categorical(["a", "b", "c"]), + Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)), + ], + names=["key1", "key2"], + ) + expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index, expected", [ - (True, Series( - index=MultiIndex.from_arrays( - [Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b'] +@pytest.mark.parametrize( + "as_index, expected", + [ + ( + True, + Series( + 
index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"] + ), + data=[1, 2, 3], + name="x", + ), ), - data=[1, 2, 3], name='x' - )), - (False, DataFrame({ - 'a': Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], - 'x': [1, 2, 3] - })) -]) + ( + False, + DataFrame( + { + "a": Series([1, 1, 2], dtype="category"), + "b": [1, 2, 2], + "x": [1, 2, 3], + } + ), + ), + ], +) def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - df = DataFrame({ - 'a': Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], - 'x': [1, 2, 3] - }) + df = DataFrame( + {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]} + ) - result = df.groupby( - ['a', 'b'], as_index=as_index, observed=True)['x'].sum() + result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum() assert_equal(result, expected) -@pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) +@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT]) def test_shift(fill_value): - ct = Categorical(['a', 'b', 'c', 'd'], - categories=['a', 'b', 'c', 'd'], ordered=False) - expected = Categorical([None, 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], ordered=False) + ct = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + expected = Categorical( + [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) @@ -1005,66 +1070,98 @@ def df_cat(df): df_cat: DataFrame """ df_cat = df.copy()[:4] # leave out some groups - df_cat['A'] = df_cat['A'].astype('category') - df_cat['B'] = df_cat['B'].astype('category') - df_cat['C'] = Series([1, 2, 3, 4]) - df_cat = df_cat.drop(['D'], axis=1) + df_cat["A"] = df_cat["A"].astype("category") + df_cat["B"] = df_cat["B"].astype("category") + df_cat["C"] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(["D"], axis=1) return df_cat -@pytest.mark.parametrize('operation, kwargs', [ - ('agg', dict(dtype='category')), - ('apply', dict())]) +@pytest.mark.parametrize( + "operation, kwargs", [("agg", dict(dtype="category")), ("apply", dict())] +) def test_seriesgroupby_observed_true(df_cat, operation, kwargs): # GH 24880 index = MultiIndex.from_frame( - DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - }, **kwargs)) - expected = Series(data=[1, 3, 2, 4], index=index, name='C') - grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] + DataFrame( + {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, + **kwargs + ) + ) + expected = Series(data=[1, 3, 2, 4], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=True)["C"] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize('operation', ['agg', 'apply']) -@pytest.mark.parametrize('observed', [False, None]) +@pytest.mark.parametrize("operation", ["agg", "apply"]) +@pytest.mark.parametrize("observed", [False, None]) def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): # GH 24880 index, _ = MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False)], - names=['A', 'B']).sortlevel() - - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], - index=index, name='C') - grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], 
ordered=False), + ], + names=["A", "B"], + ).sortlevel() + + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize("observed, index, data", [ - (True, MultiIndex.from_tuples( - [('foo', 'one', 'min'), ('foo', 'one', 'max'), - ('foo', 'two', 'min'), ('foo', 'two', 'max'), - ('bar', 'one', 'min'), ('bar', 'one', 'max'), - ('bar', 'three', 'min'), ('bar', 'three', 'max')], - names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), - (False, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), - (None, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) +@pytest.mark.parametrize( + "observed, index, data", + [ + ( + True, + MultiIndex.from_tuples( + [ + ("foo", "one", "min"), + ("foo", "one", "max"), + ("foo", "two", "min"), + ("foo", "two", "max"), + ("bar", "one", "min"), + ("bar", "one", "max"), + ("bar", "three", "min"), + ("bar", "three", "max"), + ], + names=["A", "B", None], + ), + [1, 1, 3, 3, 2, 2, 4, 4], + ), + ( + False, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ( + None, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ], +) def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): # GH 24880 - expected = Series(data=data, index=index, name='C') - result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( - lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + expected = Series(data=data, index=index, name="C") + result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply( + lambda x: OrderedDict([("min", x.min()), ("max", x.max())]) + ) assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2dd26bac1e1024..5a864b3ab8cb48 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -8,10 +8,9 @@ class TestCounting: - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"]) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3]) @@ -24,15 +23,16 @@ def test_cumcount_empty(self): se = Series().groupby(level=0) # edge case, as this is usually considered float - e = Series(dtype='int64') + e = Series(dtype="int64") assert_series_equal(e, ge.cumcount()) assert_series_equal(e, se.cumcount()) def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], 
["a"]], columns=["A"], index=[0] * 5 + ) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3], index=[0] * 5) @@ -42,9 +42,8 @@ def test_cumcount_dupe_index(self): def test_cumcount_mi(self): mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3], index=mi) @@ -53,8 +52,9 @@ def test_cumcount_mi(self): assert_series_equal(expected, sg.cumcount()) def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 + ) g = df.groupby([0, 0, 0, 1, 0]) sg = g.A @@ -64,8 +64,8 @@ def test_cumcount_groupby_not_col(self): assert_series_equal(expected, sg.cumcount()) def test_ngroup(self): - df = DataFrame({'A': list('aaaba')}) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0]) @@ -74,18 +74,18 @@ def test_ngroup(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_distinct(self): - df = DataFrame({'A': list('abcde')}) - g = df.groupby('A') + df = DataFrame({"A": list("abcde")}) + g = df.groupby("A") sg = g.A - expected = Series(range(5), dtype='int64') + expected = Series(range(5), dtype="int64") assert_series_equal(expected, g.ngroup()) assert_series_equal(expected, sg.ngroup()) def test_ngroup_one_group(self): - df = DataFrame({'A': [0] * 5}) - g = df.groupby('A') + df = DataFrame({"A": [0] * 5}) + g = df.groupby("A") sg = g.A expected = Series([0] * 5) @@ -98,21 +98,20 @@ def test_ngroup_empty(self): se = Series().groupby(level=0) # edge case, as this is usually considered float - e = Series(dtype='int64') + e = Series(dtype="int64") assert_series_equal(e, ge.ngroup()) assert_series_equal(e, se.ngroup()) def test_ngroup_series_matches_frame(self): - df = DataFrame({'A': list('aaaba')}) - s = Series(list('aaaba')) + df = DataFrame({"A": list("aaaba")}) + s = Series(list("aaaba")) - assert_series_equal(df.groupby(s).ngroup(), - s.groupby(s).ngroup()) + assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup()) def test_ngroup_dupe_index(self): - df = DataFrame({'A': list('aaaba')}, index=[0] * 5) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0], index=[0] * 5) @@ -122,8 +121,8 @@ def test_ngroup_dupe_index(self): def test_ngroup_mi(self): mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame({'A': list('aaaba')}, index=mi) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}, index=mi) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0], index=mi) @@ -131,7 +130,7 @@ def test_ngroup_mi(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_groupby_not_col(self): - df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) g = df.groupby([0, 0, 0, 1, 0]) sg = g.A @@ -141,8 +140,8 @@ def test_ngroup_groupby_not_col(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_descending(self): - df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) - g = df.groupby(['A']) + df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"]) + g = df.groupby(["A"]) ascending = Series([0, 0, 1, 0, 
1]) descending = Series([1, 1, 0, 1, 0]) @@ -153,9 +152,11 @@ def test_ngroup_descending(self): def test_ngroup_matches_cumcount(self): # verify one manually-worked out case works - df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], - ['a', 'x'], ['b', 'y']], columns=['A', 'X']) - g = df.groupby(['A', 'X']) + df = DataFrame( + [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]], + columns=["A", "X"], + ) + g = df.groupby(["A", "X"]) g_ngroup = g.ngroup() g_cumcount = g.cumcount() expected_ngroup = Series([0, 1, 2, 0, 3]) @@ -167,8 +168,8 @@ def test_ngroup_matches_cumcount(self): def test_ngroup_cumcount_pair(self): # brute force comparison for all small series for p in product(range(3), repeat=4): - df = DataFrame({'a': p}) - g = df.groupby(['a']) + df = DataFrame({"a": p}) + g = df.groupby(["a"]) order = sorted(set(p)) ngroupd = [order.index(val) for val in p] @@ -179,43 +180,43 @@ def test_ngroup_cumcount_pair(self): def test_ngroup_respects_groupby_order(self): np.random.seed(0) - df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + df = DataFrame({"a": np.random.choice(list("abcdef"), 100)}) for sort_flag in (False, True): - g = df.groupby(['a'], sort=sort_flag) - df['group_id'] = -1 - df['group_index'] = -1 + g = df.groupby(["a"], sort=sort_flag) + df["group_id"] = -1 + df["group_index"] = -1 for i, (_, group) in enumerate(g): - df.loc[group.index, 'group_id'] = i + df.loc[group.index, "group_id"] = i for j, ind in enumerate(group.index): - df.loc[ind, 'group_index'] = j - - assert_series_equal(Series(df['group_id'].values), - g.ngroup()) - assert_series_equal(Series(df['group_index'].values), - g.cumcount()) - - @pytest.mark.parametrize('datetimelike', [ - [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)], - [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)], - [Timedelta(x, unit="h") for x in range(1, 4)], - [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]]) + df.loc[ind, "group_index"] = j + + assert_series_equal(Series(df["group_id"].values), g.ngroup()) + assert_series_equal(Series(df["group_index"].values), g.cumcount()) + + @pytest.mark.parametrize( + "datetimelike", + [ + [Timestamp("2016-05-%02d 20:09:25+00:00" % i) for i in range(1, 4)], + [Timestamp("2016-05-%02d 20:09:25" % i) for i in range(1, 4)], + [Timedelta(x, unit="h") for x in range(1, 4)], + [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], + ], + ) def test_count_with_datetimelike(self, datetimelike): # test for #13393, where DataframeGroupBy.count() fails # when counting a datetimelike column. 
- df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike}) - res = df.groupby('x').count() - expected = DataFrame({'y': [2, 1]}, index=['a', 'b']) + df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike}) + res = df.groupby("x").count() + expected = DataFrame({"y": [2, 1]}, index=["a", "b"]) expected.index.name = "x" assert_frame_equal(expected, res) def test_count_with_only_nans_in_first_group(self): # GH21956 - df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]}) - result = df.groupby(['A', 'B']).C.count() - mi = MultiIndex(levels=[[], ['a', 'b']], - codes=[[], []], - names=['A', 'B']) - expected = Series([], index=mi, dtype=np.int64, name='C') + df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]}) + result = df.groupby(["A", "B"]).C.count() + mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) + expected = Series([], index=mi, dtype=np.int64, name="C") assert_series_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 52c4654ae8c73b..2ce04fc7740830 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -12,17 +12,17 @@ def test_filter_series(): expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) - tm.assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - tm.assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. tm.assert_series_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) + expected_odd.reindex(s.index), + ) tm.assert_series_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) + expected_even.reindex(s.index), + ) def test_filter_single_column_df(): @@ -31,36 +31,35 @@ def test_filter_single_column_df(): expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) grouper = df[0].apply(lambda x: x % 2) grouped = df.groupby(grouper) - tm.assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - tm.assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. 
tm.assert_frame_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) + expected_odd.reindex(df.index), + ) tm.assert_frame_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) + expected_even.reindex(df.index), + ) def test_filter_multi_column_df(): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) + grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected + ) def test_filter_mixed_df(): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) + expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) def test_filter_out_all_groups(): @@ -68,11 +67,10 @@ def test_filter_out_all_groups(): grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) def test_filter_out_no_groups(): @@ -81,25 +79,25 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) + filtered = grouped.filter(lambda x: x["A"].mean() > 0) tm.assert_frame_equal(filtered, df) def test_filter_out_all_groups_in_df(): # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3}) + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) + expected = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) 
+ expected = pd.DataFrame({"a": [], "b": []}, dtype="int64") tm.assert_frame_equal(expected, res) @@ -121,20 +119,16 @@ def raise_if_sum_is_zero(x): def test_filter_with_axis_in_groupby(): # issue 11041 index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) + data = pd.DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") + result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10) expected = data.iloc[:, 12:20] tm.assert_frame_equal(result, expected) def test_filter_bad_shapes(): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby("B") g_s = s.groupby(s) f = lambda x: x @@ -163,11 +157,9 @@ def test_filter_bad_shapes(): def test_filter_nan_is_false(): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby(df["B"]) g_s = s.groupby(s) f = lambda x: np.nan @@ -183,7 +175,7 @@ def test_filter_against_workaround(): grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] + old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) @@ -192,53 +184,53 @@ def test_filter_against_workaround(): grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] + old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) + df = DataFrame( + { + "ints": Series(np.random.randint(0, 100, N)), + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + grouped = df.groupby("ints") + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. 
- transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + grouped = df.groupby("letters") + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, + index=np.arange(2, 6), + ) tm.assert_frame_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) @@ -246,10 +238,10 @@ def test_filter_using_len(): tm.assert_frame_equal(actual, expected) # Series have always worked properly, but we'll test anyway. - s = df['B'] + s = df["B"] grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + expected = Series(4 * ["b"], index=np.arange(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) @@ -259,28 +251,29 @@ def test_filter_using_len(): def test_filter_maintains_ordering(): # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + ) + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) # Now index is sequentially decreasing. df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) @@ -288,13 +281,13 @@ def test_filter_maintains_ordering(): # Index is shuffled. 
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) @@ -302,39 +295,45 @@ def test_filter_maintains_ordering(): def test_filter_multiple_timestamp(): # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) + df = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": ["foo", "bar", "foo", "bar", "bar"], + "C": Timestamp("20130101"), + } + ) - grouped = df.groupby(['B', 'C']) + grouped = df.groupby(["B", "C"]) - result = grouped['A'].filter(lambda x: True) - tm.assert_series_equal(df['A'], result) + result = grouped["A"].filter(lambda x: True) + tm.assert_series_equal(df["A"], result) - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') + result = grouped["A"].transform(len) + expected = Series([2, 3, 2, 3, 3], name="A") tm.assert_series_equal(result, expected) result = grouped.filter(lambda x: True) tm.assert_frame_equal(df, result) - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + result = grouped.transform("sum") + expected = DataFrame({"A": [2, 8, 2, 8, 8]}) tm.assert_frame_equal(result, expected) result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + expected = DataFrame({"A": [2, 3, 2, 3, 3]}) tm.assert_frame_equal(result, expected) def test_filter_and_transform_with_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -354,13 +353,13 @@ def test_filter_and_transform_with_non_unique_int_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! 
tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -371,11 +370,13 @@ def test_filter_and_transform_with_non_unique_int_index(): def test_filter_and_transform_with_multiple_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -395,13 +396,13 @@ def test_filter_and_transform_with_multiple_non_unique_int_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -412,11 +413,13 @@ def test_filter_and_transform_with_multiple_non_unique_int_index(): def test_filter_and_transform_with_non_unique_float_index(): # GH4620 index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -436,13 +439,13 @@ def test_filter_and_transform_with_non_unique_float_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! 
tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -452,15 +455,17 @@ def test_filter_and_transform_with_non_unique_float_index(): def test_filter_and_transform_with_non_unique_timestamp_index(): # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') + t0 = Timestamp("2013-09-30 00:05:00") + t1 = Timestamp("2013-10-30 00:05:00") + t2 = Timestamp("2013-11-30 00:05:00") index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -480,13 +485,13 @@ def test_filter_and_transform_with_non_unique_timestamp_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -496,12 +501,14 @@ def test_filter_and_transform_with_non_unique_timestamp_index(): def test_filter_and_transform_with_non_unique_string_index(): # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + index = list("bbbcbbab") + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -521,13 +528,13 @@ def test_filter_and_transform_with_non_unique_string_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -536,39 +543,45 @@ def test_filter_and_transform_with_non_unique_string_index(): def test_filter_has_access_to_grouped_cols(): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") # previously didn't have access to col A #???? 
- filt = g.filter(lambda x: x['A'].sum() == 2) + filt = g.filter(lambda x: x["A"].sum() == 2) tm.assert_frame_equal(filt, df.iloc[[0, 1]]) def test_filter_enforces_scalarness(): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with pytest.raises(TypeError, match='filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') + df = pd.DataFrame( + [ + ["best", "a", "x"], + ["worst", "b", "y"], + ["best", "c", "x"], + ["best", "d", "y"], + ["worst", "d", "y"], + ["worst", "d", "y"], + ["best", "d", "z"], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("c").filter(lambda g: g["a"] == "best") def test_filter_non_bool_raises(): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with pytest.raises(TypeError, match='filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) + df = pd.DataFrame( + [ + ["best", "a", 1], + ["worst", "b", 1], + ["best", "c", 1], + ["best", "d", 1], + ["worst", "d", 1], + ["worst", "d", 1], + ["best", "d", 1], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("a").filter(lambda g: g.c.mean()) def test_filter_dropna_with_empty_groups(): @@ -576,8 +589,7 @@ def test_filter_dropna_with_empty_groups(): data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) groupped = data.groupby(level=0) result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) - expected_false = pd.Series([np.nan] * 9, - index=np.repeat([1, 2, 3], 3)) + expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) tm.assert_series_equal(result_false, expected_false) result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e4303c0a070760..68e3db3a1ccb04 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -9,58 +9,65 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm -@pytest.mark.parametrize("agg_func", ['any', 'all']) +@pytest.mark.parametrize("agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("vals", [ - ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], - [1, 2, 3], [1, 0, 0], [0, 0, 0], - [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], - [True, True, True], [True, False, False], [False, False, False], - [np.nan, np.nan, np.nan] -]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) def test_groupby_bool_aggs(agg_func, skipna, vals): - df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, 
"val": vals * 2}) # Figure out expectation using Python builtin exp = getattr(builtins, agg_func)(vals) # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == 'any': + if skipna and all(isna(vals)) and agg_func == "any": exp = False - exp_df = DataFrame([exp] * 2, columns=['val'], index=Index( - ['a', 'b'], name='key')) - result = getattr(df.groupby('key'), agg_func)(skipna=skipna) + exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key")) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) tm.assert_frame_equal(result, exp_df) def test_max_min_non_numeric(): # #2700 - aa = DataFrame({'nn': [11, 11, 22, 22], - 'ii': [1, 2, 3, 4], - 'ss': 4 * ['mama']}) + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - result = aa.groupby('nn').max() - assert 'ss' in result + result = aa.groupby("nn").max() + assert "ss" in result - result = aa.groupby('nn').max(numeric_only=False) - assert 'ss' in result + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result - result = aa.groupby('nn').min() - assert 'ss' in result + result = aa.groupby("nn").min() + assert "ss" in result - result = aa.groupby('nn').min(numeric_only=False) - assert 'ss' in result + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result def test_intercept_builtin_sum(): - s = Series([1., 2., np.nan, 3.]) + s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) result = grouped.agg(builtins.sum) @@ -73,35 +80,34 @@ def test_intercept_builtin_sum(): # @pytest.mark.parametrize("f", [max, min, sum]) # def test_builtins_apply(f): + @pytest.mark.parametrize("f", [max, min, sum]) -@pytest.mark.parametrize('keys', [ - "jim", # Single key - ["jim", "joe"] # Multi-key -]) +@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key def test_builtins_apply(keys, f): # see gh-8155 - df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), - columns=["jim", "joe"]) + df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) df["jolie"] = np.random.randn(1000) fname = f.__name__ result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = ("invalid frame shape: {} " - "(expected ({}, 3))".format(result.shape, ngroups)) + assert_msg = "invalid frame shape: {} " "(expected ({}, 3))".format( + result.shape, ngroups + ) assert result.shape == (ngroups, 3), assert_msg - tm.assert_frame_equal(result, # numpy's equivalent function - df.groupby(keys).apply(getattr(np, fname))) + tm.assert_frame_equal( + result, # numpy's equivalent function + df.groupby(keys).apply(getattr(np, fname)), + ) if f != sum: expected = df.groupby(keys).agg(fname).reset_index() expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) - tm.assert_series_equal(getattr(result, fname)(), - getattr(df, fname)()) + tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) def test_arg_passthru(): @@ -111,42 +117,54 @@ def test_arg_passthru(): # GH3668 # GH5724 df = pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 
'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - - expected_columns_numeric = Index(['int', 'float', 'category_int']) + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + expected_columns_numeric = Index(["int", "float", "category_int"]) # mean / median expected = pd.DataFrame( - {'category_int': [7.5, 9], - 'float': [4.5, 6.], - 'timedelta': [pd.Timedelta('1.5s'), - pd.Timedelta('3s')], - 'int': [1.5, 3], - 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), - pd.Timestamp('2013-01-03 00:00:00')], - 'datetimetz': [ - pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), - pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, - index=Index([1, 2], name='group'), - columns=['int', 'float', 'category_int', - 'datetime', 'datetimetz', 'timedelta']) - - for attr in ['mean', 'median']: - f = getattr(df.groupby('group'), attr) + { + "category_int": [7.5, 9], + "float": [4.5, 6.0], + "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], + "int": [1.5, 3], + "datetime": [ + pd.Timestamp("2013-01-01 12:00:00"), + pd.Timestamp("2013-01-03 00:00:00"), + ], + "datetimetz": [ + pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), + pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), + ], + }, + index=Index([1, 2], name="group"), + columns=["int", "float", "category_int", "datetime", "datetimetz", "timedelta"], + ) + + for attr in ["mean", "median"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -155,43 +173,57 @@ def test_arg_passthru(): # TODO: min, max *should* handle # categorical (ordered) dtype - expected_columns = Index(['int', 'float', 'string', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['min', 'max']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + [ + "int", + "float", + "string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["min", "max"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['first', 'last']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + [ + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["first", "last"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'string', - 'category_int', 'timedelta']) - for attr in ['sum']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "string", "category_int", "timedelta"]) + for attr in ["sum"]: + f = 
getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'category_int']) - for attr in ['prod', 'cumprod']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "category_int"]) + for attr in ["prod", "cumprod"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -199,12 +231,11 @@ def test_arg_passthru(): tm.assert_index_equal(result.columns, expected_columns) # like min, max, but don't include strings - expected_columns = Index(['int', 'float', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['cummin', 'cummax']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] + ) + for attr in ["cummin", "cummax"]: + f = getattr(df.groupby("group"), attr) result = f() # GH 15561: numeric_only=False set by default like min/max tm.assert_index_equal(result.columns, expected_columns) @@ -212,10 +243,9 @@ def test_arg_passthru(): result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'category_int', - 'timedelta']) - for attr in ['cumsum']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "category_int", "timedelta"]) + for attr in ["cumsum"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -229,54 +259,59 @@ def test_non_cython_api(): # non-cython calls should not include the grouper df = DataFrame( - [[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, 'baz']], - columns=['A', 'B', 'C']) - g = df.groupby('A') - gni = df.groupby('A', as_index=False) + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"] + ) + g = df.groupby("A") + gni = df.groupby("A", as_index=False) # mad - expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" result = g.mad() tm.assert_frame_equal(result, expected) - expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], - index=[0, 1]) + expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1]) result = gni.mad() tm.assert_frame_equal(result, expected) # describe - expected_index = pd.Index([1, 3], name='A') - expected_col = pd.MultiIndex(levels=[['B'], - ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']], - codes=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]], - index=expected_index, - columns=expected_col) + expected_index = pd.Index([1, 3], name="A") + expected_col = pd.MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = pd.DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) result = g.describe() tm.assert_frame_equal(result, expected) - expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 
3].describe().unstack().to_frame().T]) + expected = pd.concat( + [ + df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T, + ] + ) expected.index = pd.Index([0, 1]) result = gni.describe() tm.assert_frame_equal(result, expected) # any - expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" result = g.any() tm.assert_frame_equal(result, expected) # idxmax - expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" result = g.idxmax() tm.assert_frame_equal(result, expected) @@ -286,23 +321,20 @@ def test_cython_api2(): # this takes the fast apply path # cumsum (GH5614) - df = DataFrame( - [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] - ], columns=['A', 'B', 'C']) - expected = DataFrame( - [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum() + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) + result = df.groupby("A").cumsum() tm.assert_frame_equal(result, expected) # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum() + result = df.groupby("A", as_index=False).cumsum() tm.assert_frame_equal(result, expected) # GH 13994 - result = df.groupby('A').cumsum(axis=1) + result = df.groupby("A").cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) - result = df.groupby('A').cumprod(axis=1) + result = df.groupby("A").cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) @@ -335,63 +367,69 @@ def test_median_empty_bins(observed): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [ - 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) -@pytest.mark.parametrize("method,data", [ - ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}), - ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}) -]) +@pytest.mark.parametrize( + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize( + "method,data", + [ + ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}), + ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), + ], +) def test_groupby_non_arithmetic_agg_types(dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( - [{'a': 1, 'b': 1}, - {'a': 1, 'b': 2}, - {'a': 2, 'b': 3}, - {'a': 2, 'b': 4}]) + [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] + ) - df['b'] = df.b.astype(dtype) + df["b"] = df.b.astype(dtype) - if 'args' not in data: - data['args'] = [] + if "args" not in data: + data["args"] = [] - if 'out_type' in data: - out_type = data['out_type'] 
+ if "out_type" in data: + out_type = data["out_type"] else: out_type = dtype - exp = data['df'] + exp = data["df"] df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out["b"] = df_out.b.astype(out_type) + df_out.set_index("a", inplace=True) - grpd = df.groupby('a') - t = getattr(grpd, method)(*data['args']) + grpd = df.groupby("a") + t = getattr(grpd, method)(*data["args"]) tm.assert_frame_equal(t, df_out) -@pytest.mark.parametrize("i", [ - (Timestamp("2011-01-15 12:50:28.502376"), - Timestamp("2011-01-20 12:50:28.593448")), - (24650000000000001, 24650000000000002) -]) +@pytest.mark.parametrize( + "i", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) def test_groupby_non_arithmetic_agg_int_like_precision(i): # see gh-6620, gh-9311 df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) - grp_exp = {"first": {"expected": i[0]}, - "last": {"expected": i[1]}, - "min": {"expected": i[0]}, - "max": {"expected": i[1]}, - "nth": {"expected": i[1], - "args": [1]}, - "count": {"expected": 2}} + grp_exp = { + "first": {"expected": i[0]}, + "last": {"expected": i[1]}, + "min": {"expected": i[0]}, + "max": {"expected": i[1]}, + "nth": {"expected": i[1], "args": [1]}, + "count": {"expected": 2}, + } for method, data in grp_exp.items(): if "args" not in data: @@ -403,21 +441,28 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize("func, values", [ - ("idxmin", {'c_int': [0, 2], 'c_float': [1, 3], 'c_date': [1, 2]}), - ("idxmax", {'c_int': [1, 3], 'c_float': [0, 2], 'c_date': [0, 3]}) -]) +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) def test_idxmin_idxmax_returns_int_types(func, values): # GH 25444 - df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], - 'c_int': [1, 2, 3, 4], - 'c_float': [4.02, 3.03, 2.04, 1.05], - 'c_date': ['2019', '2018', '2016', '2017']}) - df['c_date'] = pd.to_datetime(df['c_date']) + df = pd.DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] = pd.to_datetime(df["c_date"]) - result = getattr(df.groupby('name'), func)() + result = getattr(df.groupby("name"), func)() - expected = pd.DataFrame(values, index=Index(['A', 'B'], name="name")) + expected = pd.DataFrame(values, index=Index(["A", "B"], name="name")) tm.assert_frame_equal(result, expected) @@ -427,59 +472,88 @@ def test_fill_consistency(): # GH9221 # pass thru keyword arguments to the generated wrapper # are set if the passed kw is None (only) - df = DataFrame(index=pd.MultiIndex.from_product( - [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), - columns=Index( - ['1', '2'], name='id')) - df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, - np.nan, 22, np.nan] - df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, - np.nan, 44, np.nan] - - expected = df.groupby(level=0, axis=0).fillna(method='ffill') - result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T + df = DataFrame( + index=pd.MultiIndex.from_product( + [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] + ), + columns=Index(["1", "2"], name="id"), + ) + df["1"] = [ + np.nan, + 1, + 
np.nan, + np.nan, + 11, + np.nan, + np.nan, + 2, + np.nan, + np.nan, + 22, + np.nan, + ] + df["2"] = [ + np.nan, + 3, + np.nan, + np.nan, + 33, + np.nan, + np.nan, + 4, + np.nan, + np.nan, + 44, + np.nan, + ] + + expected = df.groupby(level=0, axis=0).fillna(method="ffill") + result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T tm.assert_frame_equal(result, expected) def test_groupby_cumprod(): # GH 4095 - df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) + df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) - actual = df.groupby('key')['value'].cumprod() - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" tm.assert_series_equal(actual, expected) - df = pd.DataFrame({'key': ['b'] * 100, 'value': 2}) - actual = df.groupby('key')['value'].cumprod() + df = pd.DataFrame({"key": ["b"] * 100, "value": 2}) + actual = df.groupby("key")["value"].cumprod() # if overflows, groupby product casts to float # while numpy passes back invalid values - df['value'] = df['value'].astype(float) - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' + df["value"] = df["value"].astype(float) + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" tm.assert_series_equal(actual, expected) def scipy_sem(*args, **kwargs): from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) @pytest.mark.parametrize( - 'op,targop', - [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), - pytest.param( - 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) + "op,targop", + [ + ("mean", np.mean), + ("median", np.median), + ("std", np.std), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ("count", np.size), + pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy), + ], +) def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) @@ -496,56 +570,58 @@ def test_max_nan_bug(): -05-07,2013-05-07 00:00:00,OE,xlsx""" df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby('Date') - r = gb[['File']].max() - e = gb['File'].max().to_frame() + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() tm.assert_frame_equal(r, e) - assert not r['File'].isna().any() + assert not r["File"].isna().any() def test_nlargest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) + b = Series(list("a" * 5 + "b" * 5)) gb = a.groupby(b) r = gb.nlargest(3) - e = Series([ - 7, 5, 3, 10, 9, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) - e = Series([ - 3, 2, 1, 3, 3, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) - tm.assert_series_equal(gb.nlargest(3, keep='last'), e) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + 
tm.assert_series_equal(gb.nlargest(3, keep="last"), e) def test_nsmallest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) + b = Series(list("a" * 5 + "b" * 5)) gb = a.groupby(b) r = gb.nsmallest(3) - e = Series([ - 1, 2, 3, 0, 4, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) - e = Series([ - 0, 1, 1, 0, 1, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) - tm.assert_series_equal(gb.nsmallest(3, keep='last'), e) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) -@pytest.mark.parametrize("func", [ - 'mean', 'var', 'std', 'cumprod', 'cumsum' -]) +@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"]) def test_numpy_compat(func): # see gh-12811 - df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) - g = df.groupby('A') + df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") msg = "numpy operations are not valid with groupby" @@ -558,12 +634,21 @@ def test_numpy_compat(func): def test_cummin_cummax(): # GH 15048 num_types = [np.int32, np.int64, np.float32, np.float64] - num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, - np.finfo(np.float32).min, np.finfo(np.float64).min] - num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, - np.finfo(np.float32).max, np.finfo(np.float64).max] - base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], - 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + num_mins = [ + np.iinfo(np.int32).min, + np.iinfo(np.int64).min, + np.finfo(np.float32).min, + np.finfo(np.float64).min, + ] + num_max = [ + np.iinfo(np.int32).max, + np.iinfo(np.int64).max, + np.finfo(np.float32).max, + np.finfo(np.float64).max, + ] + base_df = pd.DataFrame( + {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} + ) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] @@ -571,154 +656,155 @@ def test_cummin_cummax(): df = base_df.astype(dtype) # cummin - expected = pd.DataFrame({'B': expected_mins}).astype(dtype) - result = df.groupby('A').cummin() + expected = pd.DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # Test cummin w/ min value for dtype - df.loc[[2, 6], 'B'] = min_val - expected.loc[[2, 3, 6, 7], 'B'] = min_val - result = df.groupby('A').cummin() + df.loc[[2, 6], "B"] = min_val + expected.loc[[2, 3, 6, 7], "B"] = min_val + result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # cummax - expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) - result = df.groupby('A').cummax() + expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() 
tm.assert_frame_equal(result, expected) # Test cummax w/ max value for dtype - df.loc[[2, 6], 'B'] = max_val - expected.loc[[2, 3, 6, 7], 'B'] = max_val - result = df.groupby('A').cummax() + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test nan in some values - base_df.loc[[0, 2, 4, 6], 'B'] = np.nan - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, - np.nan, 3, np.nan, 1]}) - result = base_df.groupby('A').cummin() + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummin()) - .to_frame()) + expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, - np.nan, 3, np.nan, 3]}) - result = base_df.groupby('A').cummax() + expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummax()) - .to_frame()) + expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test nan in entire column - base_df['B'] = np.nan - expected = pd.DataFrame({'B': [np.nan] * 8}) - result = base_df.groupby('A').cummin() + base_df["B"] = np.nan + expected = pd.DataFrame({"B": [np.nan] * 8}) + result = base_df.groupby("A").cummin() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').cummax() + result = base_df.groupby("A").cummax() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(expected, result) # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) - expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') - for method in ['cummax', 'cummin']: - result = getattr(df.groupby('a')['b'], method)() + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) + expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") + for method in ["cummax", "cummin"]: + result = getattr(df.groupby("a")["b"], method)() tm.assert_series_equal(expected, result) # GH 15635 df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) - result = df.groupby('a').b.cummax() - expected = pd.Series([2, 1, 2], name='b') + result = df.groupby("a").b.cummax() + expected = pd.Series([2, 1, 2], name="b") tm.assert_series_equal(result, expected) df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) - result = df.groupby('a').b.cummin() - expected = pd.Series([1, 2, 1], name='b') + result = df.groupby("a").b.cummin() + expected = pd.Series([1, 2, 1], name="b") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('in_vals, out_vals', [ - - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly 
increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), -]) +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) def test_is_monotonic_increasing(in_vals, out_vals): # GH 17015 source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_increasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expected = ( - df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('in_vals, out_vals', [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), -]) +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) def test_is_monotonic_decreasing(in_vals, out_vals): # GH 17015 source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') + result = 
df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) # describe # -------------------------------- + def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level='first') + grouped = mframe.groupby(level="first") grouped.describe() # it works! @@ -726,10 +812,9 @@ def test_series_describe_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() - tm.assert_series_equal(result['mean'], grouped.mean(), - check_names=False) - tm.assert_series_equal(result['std'], grouped.std(), check_names=False) - tm.assert_series_equal(result['min'], grouped.min(), check_names=False) + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) def test_series_describe_single(): @@ -741,9 +826,9 @@ def test_series_describe_single(): def test_series_index_name(df): - grouped = df.loc[:, ['C']].groupby(df['A']) + grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' + assert result.index.name == "A" def test_frame_describe_multikey(tsframe): @@ -755,172 +840,184 @@ def test_frame_describe_multikey(tsframe): # GH 17464 - Remove duplicate MultiIndex levels group_col = pd.MultiIndex( levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = pd.DataFrame(group.values, columns=group_col, index=group.index) desc_groups.append(group) expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) - groupedT = tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T expected.index = pd.MultiIndex( levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))]) + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) msg = "Names should be list-like for a MultiIndex" with pytest.raises(ValueError, match=msg): - df1.groupby('k').describe() + df1.groupby("k").describe() with pytest.raises(ValueError, match=msg): - df2.groupby('key').describe() + df2.groupby("key").describe() def test_frame_describe_unstacked_format(): # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df 
= pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) + prices = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 24990, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 25499, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = pd.DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = pd.DataFrame( + data, + index=pd.Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) # nunique # -------------------------------- -@pytest.mark.parametrize('n', 10 ** np.arange(2, 6)) -@pytest.mark.parametrize('m', [10, 100, 1000]) -@pytest.mark.parametrize('sort', [False, True]) -@pytest.mark.parametrize('dropna', [False, True]) -def test_series_groupby_nunique(n, m, sort, dropna): +@pytest.mark.parametrize("n", 10 ** np.arange(2, 6)) +@pytest.mark.parametrize("m", [10, 100, 1000]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) + left = gr["julie"].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) + right = gr["julie"].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) - days = date_range('2015-08-23', periods=10) + days = date_range("2015-08-23", periods=10) - frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n)}) + frame = DataFrame( + { + "jim": np.random.choice(list(ascii_lowercase), n), + "joe": np.random.choice(days, n), + "julie": np.random.randint(0, m, n), + } + ) - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None + frame.loc[1::17, "jim"] = None + frame.loc[3::37, "joe"] = None + frame.loc[7::19, "julie"] = None + frame.loc[8::19, "julie"] = None + frame.loc[9::19, "julie"] = None - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + check_nunique(frame, ["jim"], as_index=False) + check_nunique(frame, ["jim", "joe"], as_index=False) def test_nunique(): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': 
list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', as_index=False).nunique() + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() tm.assert_frame_equal(result, expected) # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() + expected.index = list("abc") + expected.index.name = "A" + result = df.groupby("A").nunique() tm.assert_frame_equal(result, expected) # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) tm.assert_frame_equal(result, expected) # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() + expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() tm.assert_frame_equal(result, expected) def test_nunique_with_object(): # GH 11077 data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], ) - result = data.groupby(['id', 'amount'])['name'].nunique() + result = data.groupby(["id", "amount"])["name"].nunique() index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) + expected = pd.Series([1] * 5, name="name", index=index) tm.assert_series_equal(result, expected) def test_nunique_with_empty_series(): # GH 12553 - data = pd.Series(name='name') + data = pd.Series(name="name") result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') + expected = pd.Series(name="name", dtype="int64") tm.assert_series_equal(result, expected) def test_nunique_with_timegrouper(): # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) def test_nunique_preserves_column_level_names(): # GH 23222 - test = pd.DataFrame([1, 2, 2], - columns=pd.Index(['A'], name="level_0")) + test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) result = test.groupby([0, 0, 0]).nunique() expected = pd.DataFrame([2], columns=test.columns) tm.assert_frame_equal(result, expected) @@ -929,40 +1026,40 @@ def test_nunique_preserves_column_level_names(): # count # -------------------------------- + def test_groupby_timedelta_cython_count(): - df = DataFrame({'g': list('ab' * 2), - 'delt': 
np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() + df = DataFrame( + {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")} + ) + expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt") + result = df.groupby("g").delt.count() tm.assert_series_equal(expected, result) def test_count(): n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', '4th']): + dr = date_range("2015-08-30", periods=n // 10, freq="T") + + df = DataFrame( + { + "1st": np.random.choice(list(ascii_lowercase), n), + "2nd": np.random.randint(0, 5, n), + "3rd": np.random.randn(n).round(3), + "4th": np.random.randint(-10, 10, n), + "5th": np.random.choice(dr, n), + "6th": np.random.randn(n).round(3), + "7th": np.random.randn(n).round(3), + "8th": np.random.choice(dr, n) - np.random.choice(dr, 1), + "9th": np.random.choice(list(ascii_lowercase), n), + } + ) + + for col in df.columns.drop(["1st", "2nd", "4th"]): df.loc[np.random.choice(n, n // 10), col] = np.nan - df['9th'] = df['9th'].astype('category') + df["9th"] = df["9th"].astype("category") - for key in ['1st', '2nd', ['1st', '2nd']]: + for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) @@ -971,70 +1068,65 @@ def test_count(): def test_count_non_nulls(): # GH#5610 # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, np.nan]], - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], + columns=["A", "B", "C"], + ) - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() + count_as = df.groupby("A").count() + count_not_as = df.groupby("A", as_index=False).count() - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) + expected.index.name = "A" tm.assert_frame_equal(count_not_as, expected.reset_index()) tm.assert_frame_equal(count_as, expected) - count_B = df.groupby('A')['B'].count() - tm.assert_series_equal(count_B, expected['B']) + count_B = df.groupby("A")["B"].count() + tm.assert_series_equal(count_B, expected["B"]) def test_count_object(): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') + df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') + df = 
pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) def test_count_cross_type(): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) + vals = np.hstack( + (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) + ) - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() + expected = df.groupby(["c", "d"]).count() - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() + for t in ["float32", "object"]: + df["a"] = df["a"].astype(t) + df["b"] = df["b"].astype(t) + result = df.groupby(["c", "d"]).count() tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) + df = DataFrame( + { + "a": np.array([0, 1, 2, 100], np.int8), + "b": np.array([1, 2, 3, 6], np.uint32), + "c": np.array([4, 5, 6, 8], np.int16), + "grp": list("ab" * 2), + } + ) + result = df.groupby("grp").count() + expected = DataFrame( + {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp") + ) tm.assert_frame_equal(result, expected) @@ -1043,8 +1135,7 @@ class RaisingObjectException(Exception): pass class RaisingObject: - - def __init__(self, msg='I will raise inside Cython'): + def __init__(self, msg="I will raise inside Cython"): super().__init__() self.msg = msg @@ -1052,116 +1143,129 @@ def __eq__(self, other): # gets called in Cython to check that raising calls the method raise RaisingObjectException(self.msg) - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) + df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) + result = df.groupby("grp").count() + expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp")) tm.assert_frame_equal(result, expected) # size # -------------------------------- + def test_size(df): - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby('B') + grouped = df.groupby("B") result = grouped.size() for key, group in grouped: assert result[key] == len(group) - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in product((False, True), ('a', 'b', ['a', 'b'])): + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc")) + for sort, key in product((False, True), ("a", "b", ["a", "b"])): left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0]) tm.assert_series_equal(left, right, 
check_names=False) # GH11699 - df = DataFrame(columns=['A', 'B']) - out = Series(dtype='int64', index=Index([], name='A')) - tm.assert_series_equal(df.groupby('A').size(), out) + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) def test_size_groupby_all_null(): # GH23050 # Assert no 'Value Error : Length of passed values is 2, index implies 0' - df = DataFrame({'A': [None, None]}) # all-null groups - result = df.groupby('A').size() - expected = Series(dtype='int64', index=Index([], name='A')) + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) tm.assert_series_equal(result, expected) # quantile # -------------------------------- -@pytest.mark.parametrize("interpolation", [ - "linear", "lower", "higher", "nearest", "midpoint"]) -@pytest.mark.parametrize("a_vals,b_vals", [ - # Ints - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), - ([1, 2, 3, 4], [4, 3, 2, 1]), - ([1, 2, 3, 4, 5], [4, 3, 2, 1]), - # Floats - ([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]), - # Missing data - ([1., np.nan, 3., np.nan, 5.], [5., np.nan, 3., np.nan, 1.]), - ([np.nan, 4., np.nan, 2., np.nan], [np.nan, 4., np.nan, 2., np.nan]), - # Timestamps - ([x for x in pd.date_range('1/1/18', freq='D', periods=5)], - [x for x in pd.date_range('1/1/18', freq='D', periods=5)][::-1]), - # All NA - ([np.nan] * 5, [np.nan] * 5), -]) -@pytest.mark.parametrize('q', [0, .25, .5, .75, 1]) +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + [x for x in pd.date_range("1/1/18", freq="D", periods=5)], + [x for x in pd.date_range("1/1/18", freq="D", periods=5)][::-1], + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) def test_quantile(interpolation, a_vals, b_vals, q): - if interpolation == 'nearest' and q == 0.5 and b_vals == [4, 3, 2, 1]: - pytest.skip("Unclear numpy expectation for nearest result with " - "equidistant data") + if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip( + "Unclear numpy expectation for nearest result with " "equidistant data" + ) a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) - df = DataFrame({ - 'key': ['a'] * len(a_vals) + ['b'] * len(b_vals), - 'val': a_vals + b_vals}) + df = DataFrame( + {"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals} + ) - expected = DataFrame([a_expected, b_expected], columns=['val'], - index=Index(['a', 'b'], name='key')) - result = df.groupby('key').quantile(q, interpolation=interpolation) + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + result = df.groupby("key").quantile(q, interpolation=interpolation) tm.assert_frame_equal(result, expected) def test_quantile_raises(): - df = pd.DataFrame([ - ['foo', 'a'], ['foo', 'b'], ['foo', 'c']], columns=['key', 'val']) + df = 
pd.DataFrame( + [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] + ) - with pytest.raises(TypeError, match="cannot be performed against " - "'object' dtypes"): - df.groupby('key').quantile() + with pytest.raises( + TypeError, match="cannot be performed against " "'object' dtypes" + ): + df.groupby("key").quantile() # pipe # -------------------------------- + def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 random_state = np.random.RandomState(1234567890) - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': random_state.randn(8), - 'C': random_state.randn(8)}) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": random_state.randn(8), + "C": random_state.randn(8), + } + ) def f(dfgb): return dfgb.B.max() - dfgb.C.min().min() @@ -1174,11 +1278,10 @@ def square(srs): # Series -> Series # This then chains the GroupBy.pipe and the # NDFrame.pipe methods - result = df.groupby('A').pipe(f).pipe(square) + result = df.groupby("A").pipe(f).pipe(square) - index = Index(['bar', 'foo'], dtype='object', name='A') - expected = pd.Series([8.99110003361, 8.17516964785], name='B', - index=index) + index = Index(["bar", "foo"], dtype="object", name="A") + expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) tm.assert_series_equal(expected, result) @@ -1187,13 +1290,18 @@ def test_pipe_args(): # Test passing args to the pipe method of DataFrameGroupBy. # Issue #17871 - df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], - 'x': [1.0, 2.0, 3.0, 2.0, 5.0], - 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + df = pd.DataFrame( + { + "group": ["A", "A", "B", "B", "C"], + "x": [1.0, 2.0, 3.0, 2.0, 5.0], + "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], + } + ) def f(dfgb, arg1): - return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) - .groupby(dfgb.grouper)) + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( + dfgb.grouper + ) def g(dfgb, arg2): return dfgb.sum() / dfgb.sum().sum() + arg2 @@ -1201,16 +1309,11 @@ def g(dfgb, arg2): def h(df, arg3): return df.x + df.y - arg3 - result = (df - .groupby('group') - .pipe(f, 0) - .pipe(g, 10) - .pipe(h, 100)) + result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) # Assert the results here - index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, -80], - index=index) + index = pd.Index(["A", "B", "C"], name="group") + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) tm.assert_series_equal(expected, result) @@ -1225,8 +1328,10 @@ def h(df, arg3): def test_groupby_mean_no_overflow(): # Regression test for (#22487) - df = pd.DataFrame({ - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744] - }) - assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840 + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d13dddac790420..2379d25ebe5aa9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,22 +9,24 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, 
read_csv) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) def test_repr(): # GH18203 - result = repr(pd.Grouper(key='A', level='B')) + result = repr(pd.Grouper(key="A", level="B")) expected = "Grouper(key='A', level='B', axis=0, sort=False)" assert result == expected -@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32']) +@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) def test_basic(dtype): data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) @@ -51,15 +53,13 @@ def test_basic(dtype): assert_series_equal(transformed, expected) value_grouped = data.groupby(data) - assert_series_equal(value_grouped.aggregate(np.mean), agged, - check_index_type=False) + assert_series_equal(value_grouped.aggregate(np.mean), agged, check_index_type=False) # complex agg agged = grouped.aggregate([np.mean, np.std]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - agged = grouped.aggregate({'one': np.mean, 'two': np.std}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + agged = grouped.aggregate({"one": np.mean, "two": np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -77,22 +77,22 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): grouped = mframe.groupby(key) result = grouped.sum() - expected = mframe.groupby(key.astype('O')).sum() + expected = mframe.groupby(key.astype("O")).sum() assert_frame_equal(result, expected) # GH 3911, mixed frame non-conversion df = df_mixed_floats.copy() - df['value'] = range(len(df)) + df["value"] = range(len(df)) def max_value(group): - return group.loc[group['value'].idxmax()] + return group.loc[group["value"].idxmax()] - applied = df.groupby('A').apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = Series([np.dtype('object')] * 2 + - [np.dtype('float64')] * 2 + - [np.dtype('int64')], - index=['A', 'B', 'C', 'D', 'value']) + expected = Series( + [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], + index=["A", "B", "C", "D", "value"], + ) assert_series_equal(result, expected) @@ -100,11 +100,13 @@ def test_groupby_return_type(): # GH2893, return a reduced type df1 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 2, "val2": 27}, - {"val1": 2, "val2": 12} - ]) + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12}, + ] + ) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -113,11 +115,13 @@ def func(dataf): assert isinstance(result, Series) df2 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 1, "val2": 27}, - {"val1": 1, "val2": 12} - ]) + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12}, + ] + ) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -126,72 +130,74 @@ def func(dataf): assert isinstance(result, Series) # GH3596, return a consistent type (regression in 0.11 from 0.10.1) - df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y']) - result = df.groupby('X', squeeze=False).count() + df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"]) + result 
= df.groupby("X", squeeze=False).count() assert isinstance(result, DataFrame) def test_inconsistent_return_type(): # GH5592 # inconsistent return type - df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', - 'Pony', 'Pony'], - B=Series(np.arange(7), dtype='int64'), - C=date_range('20130101', periods=7))) + df = DataFrame( + dict( + A=["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], + B=Series(np.arange(7), dtype="int64"), + C=date_range("20130101", periods=7), + ) + ) def f(grp): return grp.iloc[0] - expected = df.groupby('A').first()[['B']] - result = df.groupby('A').apply(f)[['B']] + expected = df.groupby("A").first()[["B"]] + result = df.groupby("A").apply(f)[["B"]] assert_frame_equal(result, expected) def f(grp): - if grp.name == 'Tiger': + if grp.name == "Tiger": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['B']] + result = df.groupby("A").apply(f)[["B"]] e = expected.copy() - e.loc['Tiger'] = np.nan + e.loc["Tiger"] = np.nan assert_frame_equal(result, e) def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['B']] + result = df.groupby("A").apply(f)[["B"]] e = expected.copy() - e.loc['Pony'] = np.nan + e.loc["Pony"] = np.nan assert_frame_equal(result, e) # 5592 revisited, with datetimes def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['C']] - e = df.groupby('A').first()[['C']] - e.loc['Pony'] = pd.NaT + result = df.groupby("A").apply(f)[["C"]] + e = df.groupby("A").first()[["C"]] + e.loc["Pony"] = pd.NaT assert_frame_equal(result, e) # scalar outputs def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None - return grp.iloc[0].loc['C'] + return grp.iloc[0].loc["C"] - result = df.groupby('A').apply(f) - e = df.groupby('A').first()['C'].copy() - e.loc['Pony'] = np.nan + result = df.groupby("A").apply(f) + e = df.groupby("A").first()["C"].copy() + e.loc["Pony"] = np.nan e.name = None assert_series_equal(result, e) def test_pass_args_kwargs(ts, tsframe): - def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -203,7 +209,7 @@ def f(x, q=None, axis=0): apply_result = ts_grouped.apply(np.percentile, 80, axis=0) trans_result = ts_grouped.transform(np.percentile, 80, axis=0) - agg_expected = ts_grouped.quantile(.8) + agg_expected = ts_grouped.quantile(0.8) trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) @@ -220,21 +226,20 @@ def f(x, q=None, axis=0): # DataFrame df_grouped = tsframe.groupby(lambda x: x.month) agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, .8) - expected = df_grouped.quantile(.8) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) assert_frame_equal(apply_result, expected, check_names=False) assert_frame_equal(agg_result, expected) agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=.8) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) assert_frame_equal(agg_result, expected) assert_frame_equal(apply_result, expected, check_names=False) def test_len(): df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) + grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) assert len(grouped) == len(df) grouped = df.groupby([lambda x: x.year, lambda x: x.month]) @@ -243,34 +248,37 @@ 
def test_len(): # issue 11016 df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) - assert len(df.groupby(('a'))) == 0 - assert len(df.groupby(('b'))) == 3 - assert len(df.groupby(['a', 'b'])) == 3 + assert len(df.groupby(("a"))) == 0 + assert len(df.groupby(("b"))) == 3 + assert len(df.groupby(["a", "b"])) == 3 def test_basic_regression(): # regression result = Series([1.0 * x for x in list(range(1, 10)) * 10]) - data = np.random.random(1100) * 10. + data = np.random.random(1100) * 10.0 groupings = Series(data) grouped = result.groupby(groupings) grouped.mean() -@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64', - 'int32', 'int16', 'int8']) +@pytest.mark.parametrize( + "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"] +) def test_with_na_groups(dtype): index = Index(np.arange(10)) values = Series(np.ones(10), index, dtype=dtype) - labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, - 'bar', 'bar', np.nan, 'foo'], index=index) + labels = Series( + [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"], + index=index, + ) # this SHOULD be an int grouped = values.groupby(labels) agged = grouped.agg(len) - expected = Series([4, 2], index=['bar', 'foo']) + expected = Series([4, 2], index=["bar", "foo"]) assert_series_equal(agged, expected, check_dtype=False) @@ -281,7 +289,7 @@ def f(x): return float(len(x)) agged = grouped.agg(f) - expected = Series([4, 2], index=['bar', 'foo']) + expected = Series([4, 2], index=["bar", "foo"]) assert_series_equal(agged, expected, check_dtype=False) assert issubclass(agged.dtype.type, np.dtype(dtype).type) @@ -294,12 +302,11 @@ def test_indices_concatenation_order(): def f1(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, - names=['b', 'c']) - res = DataFrame(columns=['a'], index=multiindex) + multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"]) + res = DataFrame(columns=["a"], index=multiindex) return res else: - y = y.set_index(['b', 'c']) + y = y.set_index(["b", "c"]) return y def f2(x): @@ -307,40 +314,41 @@ def f2(x): if y.empty: return DataFrame() else: - y = y.set_index(['b', 'c']) + y = y.set_index(["b", "c"]) return y def f3(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, - names=['foo', 'bar']) - res = DataFrame(columns=['a', 'b'], index=multiindex) + multiindex = MultiIndex( + levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"] + ) + res = DataFrame(columns=["a", "b"], index=multiindex) return res else: return y - df = DataFrame({'a': [1, 2, 2, 2], 'b': range(4), 'c': range(5, 9)}) + df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)}) - df2 = DataFrame({'a': [3, 2, 2, 2], 'b': range(4), 'c': range(5, 9)}) + df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - result1 = df.groupby('a').apply(f1) - result2 = df2.groupby('a').apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) assert_frame_equal(result1, result2) # should fail (not the same number of levels) msg = "Cannot concat indices that do not have the same number of levels" with pytest.raises(AssertionError, match=msg): - df.groupby('a').apply(f2) + df.groupby("a").apply(f2) with pytest.raises(AssertionError, match=msg): - df2.groupby('a').apply(f2) + df2.groupby("a").apply(f2) # should fail (incorrect shape) with pytest.raises(AssertionError, match=msg): - df.groupby('a').apply(f3) + df.groupby("a").apply(f3) with 
pytest.raises(AssertionError, match=msg): - df2.groupby('a').apply(f3) + df2.groupby("a").apply(f3) def test_attr_wrapper(ts): @@ -363,7 +371,7 @@ def test_attr_wrapper(ts): # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - getattr(grouped, 'foo') + getattr(grouped, "foo") def test_frame_groupby(tsframe): @@ -376,8 +384,8 @@ def test_frame_groupby(tsframe): # by string tscopy = tsframe.copy() - tscopy['weekday'] = [x.weekday() for x in tscopy.index] - stragged = tscopy.groupby('weekday').aggregate(np.mean) + tscopy["weekday"] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby("weekday").aggregate(np.mean) assert_frame_equal(stragged, aggregated, check_names=False) # transform @@ -391,8 +399,7 @@ def test_frame_groupby(tsframe): for name, group in grouped: mean = group.mean() for idx in group.index: - tm.assert_series_equal(transformed.xs(idx), mean, - check_names=False) + tm.assert_series_equal(transformed.xs(idx), mean, check_names=False) # iterate for weekday, group in grouped: @@ -408,7 +415,7 @@ def test_frame_groupby(tsframe): def test_frame_groupby_columns(tsframe): - mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} + mapping = {"A": 0, "B": 0, "C": 1, "D": 1} grouped = tsframe.groupby(mapping, axis=1) # aggregate @@ -427,109 +434,149 @@ def test_frame_groupby_columns(tsframe): def test_frame_set_name_single(df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.mean() - assert result.index.name == 'A' + assert result.index.name == "A" - result = df.groupby('A', as_index=False).mean() - assert result.index.name != 'A' + result = df.groupby("A", as_index=False).mean() + assert result.index.name != "A" result = grouped.agg(np.mean) - assert result.index.name == 'A' + assert result.index.name == "A" - result = grouped.agg({'C': np.mean, 'D': np.std}) - assert result.index.name == 'A' + result = grouped.agg({"C": np.mean, "D": np.std}) + assert result.index.name == "A" - result = grouped['C'].mean() - assert result.index.name == 'A' - result = grouped['C'].agg(np.mean) - assert result.index.name == 'A' - result = grouped['C'].agg([np.mean, np.std]) - assert result.index.name == 'A' + result = grouped["C"].mean() + assert result.index.name == "A" + result = grouped["C"].agg(np.mean) + assert result.index.name == "A" + result = grouped["C"].agg([np.mean, np.std]) + assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) - assert result.index.name == 'A' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped["C"].agg({"foo": np.mean, "bar": np.std}) + assert result.index.name == "A" def test_multi_func(df): - col1 = df['A'] - col2 = df['B'] + col1 = df["A"] + col2 = df["B"] grouped = df.groupby([col1.get, col2.get]) agged = grouped.mean() - expected = df.groupby(['A', 'B']).mean() + expected = df.groupby(["A", "B"]).mean() # TODO groupby get drops names - assert_frame_equal(agged.loc[:, ['C', 'D']], - expected.loc[:, ['C', 'D']], - check_names=False) + assert_frame_equal( + agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False + ) # some "groups" with no data - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, - index=['one', 'two', 'three', 'four', 'five', 'six']) + df = DataFrame( + { + "v1": 
np.random.randn(6), + "v2": np.random.randn(6), + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + }, + index=["one", "two", "three", "four", "five", "six"], + ) # only verify that it works for now - grouped = df.groupby(['k1', 'k2']) + grouped = df.groupby(["k1", "k2"]) grouped.agg(np.sum) def test_multi_key_multiple_functions(df): - grouped = df.groupby(['A', 'B'])['C'] + grouped = df.groupby(["A", "B"])["C"] agged = grouped.agg([np.mean, np.std]) - expected = DataFrame({'mean': grouped.agg(np.mean), - 'std': grouped.agg(np.std)}) + expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)}) assert_frame_equal(agged, expected) def test_frame_multi_key_function_list(): data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - grouped = data.groupby(['A', 'B']) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] agged = grouped.agg(funcs) - expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), - grouped['F'].agg(funcs)], - keys=['D', 'E', 'F'], axis=1) - assert (isinstance(agged.index, MultiIndex)) - assert (isinstance(expected.index, MultiIndex)) + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) + assert isinstance(agged.index, MultiIndex) + assert isinstance(expected.index, MultiIndex) assert_frame_equal(agged, expected) -@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()]) +@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()]) def test_groupby_multiple_columns(df, op): data = df - grouped = data.groupby(['A', 'B']) + grouped = data.groupby(["A", "B"]) result1 = op(grouped) keys = [] values = [] - for n1, gp1 in data.groupby('A'): - for n2, gp2 in gp1.groupby('B'): + for n1, gp1 in data.groupby("A"): + for n2, gp2 in gp1.groupby("B"): keys.append((n1, n2)) - values.append(op(gp2.loc[:, ['C', 'D']])) + values.append(op(gp2.loc[:, ["C", "D"]])) - mi = MultiIndex.from_tuples(keys, names=['A', 'B']) + mi = MultiIndex.from_tuples(keys, names=["A", "B"]) expected = pd.concat(values, axis=1).T expected.index = mi # a little bit crude - for col in ['C', 'D']: + for col in ["C", "D"]: result_col = op(grouped[col]) pivoted = result1[col] exp = expected[col] @@ -537,14 +584,14 @@ def test_groupby_multiple_columns(df, op): assert_series_equal(pivoted, exp) # test single series works the same - result = data['C'].groupby([data['A'], data['B']]).mean() - expected = data.groupby(['A', 'B']).mean()['C'] + result = data["C"].groupby([data["A"], data["B"]]).mean() + expected = data.groupby(["A", "B"]).mean()["C"] assert_series_equal(result, expected) def test_groupby_as_index_agg(df): - grouped = 
df.groupby('A', as_index=False) + grouped = df.groupby("A", as_index=False) # single-key @@ -552,48 +599,46 @@ def test_groupby_as_index_agg(df): expected = grouped.mean() assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] + expected2["D"] = grouped.sum()["D"] assert_frame_equal(result2, expected2) - grouped = df.groupby('A', as_index=True) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + grouped = df.groupby("A", as_index=True) + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result3 = grouped['C'].agg({'Q': np.sum}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result3 = grouped["C"].agg({"Q": np.sum}) assert_frame_equal(result3, expected3) # multi-key - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] + expected2["D"] = grouped.sum()["D"] assert_frame_equal(result2, expected2) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) - result3 = grouped['C'].agg({'Q': np.sum}) + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) + result3 = grouped["C"].agg({"Q": np.sum}) assert_frame_equal(result3, expected3) # GH7115 & GH8112 & GH8582 - df = DataFrame(np.random.randint(0, 100, (50, 3)), - columns=['jim', 'joe', 'jolie']) - ts = Series(np.random.randint(5, 10, 50), name='jim') + df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"]) + ts = Series(np.random.randint(5, 10, 50), name="jim") gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) - for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: + for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: gr = df.groupby(ts, as_index=False) left = getattr(gr, attr)() @@ -604,69 +649,69 @@ def test_groupby_as_index_agg(df): def test_as_index_series_return_frame(df): - grouped = df.groupby('A', as_index=False) - grouped2 = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby("A", as_index=False) + grouped2 = df.groupby(["A", "B"], as_index=False) - result = grouped['C'].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ['A', 'C']] + result = grouped["C"].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ["A", "C"]] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) - result2 = grouped2['C'].agg(np.sum) - expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] + result2 = grouped2["C"].agg(np.sum) + expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) - result = grouped['C'].sum() - expected = grouped.sum().loc[:, ['A', 'C']] + result = grouped["C"].sum() + expected = grouped.sum().loc[:, ["A", "C"]] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) - result2 = 
grouped2['C'].sum() - expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] + result2 = grouped2["C"].sum() + expected2 = grouped2.sum().loc[:, ["A", "B", "C"]] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) def test_as_index_series_column_slice_raises(df): # GH15072 - grouped = df.groupby('A', as_index=False) + grouped = df.groupby("A", as_index=False) msg = r"Column\(s\) C already selected" with pytest.raises(IndexError, match=msg): - grouped['C'].__getitem__('D') + grouped["C"].__getitem__("D") def test_groupby_as_index_cython(df): data = df # single-key - grouped = data.groupby('A', as_index=False) + grouped = data.groupby("A", as_index=False) result = grouped.mean() - expected = data.groupby(['A']).mean() - expected.insert(0, 'A', expected.index) + expected = data.groupby(["A"]).mean() + expected.insert(0, "A", expected.index) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) # multi-key - grouped = data.groupby(['A', 'B'], as_index=False) + grouped = data.groupby(["A", "B"], as_index=False) result = grouped.mean() - expected = data.groupby(['A', 'B']).mean() + expected = data.groupby(["A", "B"]).mean() arrays = list(zip(*expected.index.values)) - expected.insert(0, 'A', arrays[0]) - expected.insert(1, 'B', arrays[1]) + expected.insert(0, "A", arrays[0]) + expected.insert(1, "B", arrays[1]) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) def test_groupby_as_index_series_scalar(df): - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) # GH #421 - result = grouped['C'].agg(len) - expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] + result = grouped["C"].agg(len) + expected = grouped.agg(len).loc[:, ["A", "B", "C"]] assert_frame_equal(result, expected) @@ -682,14 +727,13 @@ def test_groupby_as_index_corner(df, ts): def test_groupby_multiple_key(df): df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) + grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) agged = grouped.sum() assert_almost_equal(df.values, agged.values) - grouped = df.T.groupby([lambda x: x.year, - lambda x: x.month, - lambda x: x.day], axis=1) + grouped = df.T.groupby( + [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1 + ) agged = grouped.agg(lambda x: x.sum()) tm.assert_index_equal(agged.index, df.columns) @@ -702,44 +746,46 @@ def test_groupby_multiple_key(df): def test_groupby_multi_corner(df): # test that having an all-NA column doesn't mess you up df = df.copy() - df['bad'] = np.nan - agged = df.groupby(['A', 'B']).mean() + df["bad"] = np.nan + agged = df.groupby(["A", "B"]).mean() - expected = df.groupby(['A', 'B']).mean() - expected['bad'] = np.nan + expected = df.groupby(["A", "B"]).mean() + expected["bad"] = np.nan assert_frame_equal(agged, expected) def test_omit_nuisance(df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.mean() - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() assert_frame_equal(result, expected) agged = grouped.agg(np.mean) exp = grouped.mean() assert_frame_equal(agged, exp) - df = df.loc[:, ['A', 'C', 'D']] - df['E'] = datetime.now() - grouped = df.groupby('A') + df = df.loc[:, ["A", "C", "D"]] + df["E"] = datetime.now() + grouped = df.groupby("A") result = grouped.agg(np.sum) expected = grouped.sum() assert_frame_equal(result, expected) # won't work with axis = 1 - 
grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) - msg = (r'\("unsupported operand type\(s\) for \+: ' - "'Timestamp' and 'float'\"" - r", 'occurred at index 0'\)") + grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) + msg = ( + r'\("unsupported operand type\(s\) for \+: ' + "'Timestamp' and 'float'\"" + r", 'occurred at index 0'\)" + ) with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) def test_omit_nuisance_python_multiple(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) agged = grouped.agg(np.mean) exp = grouped.mean() @@ -748,41 +794,45 @@ def test_omit_nuisance_python_multiple(three_group): def test_empty_groups_corner(mframe): # handle empty groups - df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2']), - 'k3': ['foo', 'bar'] * 3, - 'v1': np.random.randn(6), - 'v2': np.random.randn(6)}) - - grouped = df.groupby(['k1', 'k2']) + df = DataFrame( + { + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + "k3": ["foo", "bar"] * 3, + "v1": np.random.randn(6), + "v2": np.random.randn(6), + } + ) + + grouped = df.groupby(["k1", "k2"]) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) - agged_A = grouped['A'].apply(np.mean) - assert_series_equal(agged['A'], agged_A) - assert agged.index.name == 'first' + agged_A = grouped["A"].apply(np.mean) + assert_series_equal(agged["A"], agged_A) + assert agged.index.name == "first" def test_nonsense_func(): df = DataFrame([0]) msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" with pytest.raises(TypeError, match=msg): - df.groupby(lambda x: x + 'foo') + df.groupby(lambda x: x + "foo") def test_wrap_aggregated_output_multindex(mframe): df = mframe.T - df['baz', 'two'] = 'peekaboo' + df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] agged = df.groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): - if ser.name == ('foo', 'one'): + if ser.name == ("foo", "one"): raise TypeError else: return ser.sum() @@ -794,28 +844,28 @@ def aggfun(ser): def test_groupby_level_apply(mframe): result = mframe.groupby(level=0).count() - assert result.index.name == 'first' + assert result.index.name == "first" result = mframe.groupby(level=1).count() - assert result.index.name == 'second' + assert result.index.name == "second" - result = mframe['A'].groupby(level=0).count() - assert result.index.name == 'first' + result = mframe["A"].groupby(level=0).count() + assert result.index.name == "first" def test_groupby_level_mapper(mframe): deleveled = mframe.reset_index() - mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} - mapper1 = {'one': 0, 'two': 0, 'three': 1} + mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} + mapper1 = {"one": 0, "two": 0, "three": 1} result0 = mframe.groupby(mapper0, level=0).sum() result1 = mframe.groupby(mapper1, level=1).sum() - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) + mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]]) expected0 = mframe.groupby(mapped_level0).sum() expected1 = mframe.groupby(mapped_level1).sum() - 
expected0.index.name, expected1.index.name = 'first', 'second' + expected0.index.name, expected1.index.name = "first", "second" assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -823,10 +873,8 @@ def test_groupby_level_mapper(mframe): def test_groupby_level_nonmulti(): # GH 1313, GH 13901 - s = Series([1, 2, 3, 10, 4, 5, 20, 6], - Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) - expected = Series([11, 22, 3, 4, 5, 6], - Index(range(1, 7), name='foo')) + s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -871,25 +919,32 @@ def test_mutate_groups(): # GH3380 - df = DataFrame({ - 'cat1': ['a'] * 8 + ['b'] * 6, - 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + - ['d'] * 2 + ['e'] * 2, - 'cat3': ['g{}'.format(x) for x in range(1, 15)], - 'val': np.random.randint(100, size=14), - }) + df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "cat3": ["g{}".format(x) for x in range(1, 15)], + "val": np.random.randint(100, size=14), + } + ) def f_copy(x): x = x.copy() - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() def f_no_copy(x): - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby('cat1').apply(f_copy) - grpby_no_copy = df.groupby('cat1').apply(f_no_copy) + grpby_copy = df.groupby("cat1").apply(f_copy) + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) assert_series_equal(grpby_copy, grpby_no_copy) @@ -898,18 +953,21 @@ def test_no_mutate_but_looks_like(): # GH 8467 # first show's mutation indicator # second does not, but should yield the same results - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): - s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], - index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) - s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], - index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) + s1 = Series( + [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], + index=Index(["a", "b", "c", "d", "e", "f", "g"]), + ) + s2 = Series( + [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"]) + ) grouped = s1.groupby(s2) agged = grouped.mean() @@ -918,12 +976,18 @@ def test_groupby_series_indexed_differently(): def test_groupby_with_hier_columns(): - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', - 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', - 'one', 'two']])) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) index = MultiIndex.from_tuples(tuples) - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( - 
'B', 'cat'), ('A', 'dog')]) + columns = MultiIndex.from_tuples( + [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")] + ) df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) result = df.groupby(level=0).mean() @@ -939,23 +1003,24 @@ def test_groupby_with_hier_columns(): tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) - tm.assert_index_equal(result.columns, Index(['A', 'B'])) + tm.assert_index_equal(result.columns, Index(["A", "B"])) tm.assert_index_equal(result.index, df.index) # add a nuisance column sorted_columns, _ = columns.sortlevel(0) - df['A', 'foo'] = 'bar' + df["A", "foo"] = "bar" result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, df.columns[:-1]) def test_grouping_ndarray(df): - grouped = df.groupby(df['A'].values) + grouped = df.groupby(df["A"].values) result = grouped.sum() - expected = df.groupby('A').sum() - assert_frame_equal(result, expected, check_names=False - ) # Note: no names when grouping by value + expected = df.groupby("A").sum() + assert_frame_equal( + result, expected, check_names=False + ) # Note: no names when grouping by value def test_groupby_wrong_multi_labels(): @@ -968,7 +1033,7 @@ def test_groupby_wrong_multi_labels(): data = read_csv(StringIO(data), index_col=0) - grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + grouped = data.groupby(["foo", "bar", "baz", "spam"]) result = grouped.agg(np.mean) expected = grouped.mean() @@ -976,40 +1041,41 @@ def test_groupby_wrong_multi_labels(): def test_groupby_series_with_name(df): - result = df.groupby(df['A']).mean() - result2 = df.groupby(df['A'], as_index=False).mean() - assert result.index.name == 'A' - assert 'A' in result2 + result = df.groupby(df["A"]).mean() + result2 = df.groupby(df["A"], as_index=False).mean() + assert result.index.name == "A" + assert "A" in result2 - result = df.groupby([df['A'], df['B']]).mean() - result2 = df.groupby([df['A'], df['B']], - as_index=False).mean() - assert result.index.names == ('A', 'B') - assert 'A' in result2 - assert 'B' in result2 + result = df.groupby([df["A"], df["B"]]).mean() + result2 = df.groupby([df["A"], df["B"]], as_index=False).mean() + assert result.index.names == ("A", "B") + assert "A" in result2 + assert "B" in result2 def test_seriesgroupby_name_attr(df): # GH 6265 - result = df.groupby('A')['C'] - assert result.count().name == 'C' - assert result.mean().name == 'C' + result = df.groupby("A")["C"] + assert result.count().name == "C" + assert result.mean().name == "C" testFunc = lambda x: np.sum(x) * 2 - assert result.agg(testFunc).name == 'C' + assert result.agg(testFunc).name == "C" def test_consistency_name(): # GH 12363 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - expected = df.groupby(['A']).B.count() + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + expected = df.groupby(["A"]).B.count() result = df.B.groupby(df.A).count() assert_series_equal(result, expected) @@ -1017,23 +1083,19 @@ def test_consistency_name(): def test_groupby_name_propagation(df): # GH 6124 def summarize(df, name=None): - return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name) + return Series({"count": 1, "mean": 2, "omissions": 
3}, name=name) def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. - return Series({ - 'count': 1, - 'mean': 2, - 'omissions': 3, - }, name=df.iloc[0]['A']) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby('A').apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby('A').apply(summarize, 'metrics') - assert metrics.columns.name == 'metrics' - metrics = df.groupby('A').apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize, "metrics") + assert metrics.columns.name == "metrics" + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1047,14 +1109,13 @@ def test_groupby_nonstring_columns(): def test_groupby_mixed_type_columns(): # GH 13432, unorderable types in py3 - df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) - expected = DataFrame([[1, 2]], columns=['B', 0], - index=Index([0], name='A')) + df = DataFrame([[0, 1, 2]], columns=["A", "B", 0]) + expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A")) - result = df.groupby('A').first() + result = df.groupby("A").first() tm.assert_frame_equal(result, expected) - result = df.groupby('A').sum() + result = df.groupby("A").sum() tm.assert_frame_equal(result, expected) @@ -1087,14 +1148,14 @@ def test_series_grouper_noncontig_index(): def test_convert_objects_leave_decimal_alone(): s = Series(range(5)) - labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') + labels = np.array(["a", "b", "c", "d", "e"], dtype="O") def convert_fast(x): return Decimal(str(x.mean())) def convert_force_pure(x): # base will be length 0 - assert (len(x.values.base) > 0) + assert len(x.values.base) > 0 return Decimal(str(x.mean())) grouped = s.groupby(labels) @@ -1110,43 +1171,39 @@ def convert_force_pure(x): def test_groupby_dtype_inference_empty(): # GH 6733 - df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) - assert df['x'].dtype == np.float64 + df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")}) + assert df["x"].dtype == np.float64 - result = df.groupby('x').first() - exp_index = Index([], name='x', dtype=np.float64) - expected = DataFrame({'range': Series( - [], index=exp_index, dtype='int64')}) + result = df.groupby("x").first() + exp_index = Index([], name="x", dtype=np.float64) + expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")}) assert_frame_equal(result, expected, by_blocks=True) def test_groupby_list_infer_array_like(df): - result = df.groupby(list(df['A'])).mean() - expected = df.groupby(df['A']).mean() + result = df.groupby(list(df["A"])).mean() + expected = df.groupby(df["A"]).mean() assert_frame_equal(result, expected, check_names=False) with pytest.raises(KeyError, match=r"^'foo'$"): - df.groupby(list(df['A'][:-1])) + df.groupby(list(df["A"][:-1])) # pathological case of ambiguity - df = DataFrame({'foo': [0, 1], - 'bar': [3, 4], - 'val': np.random.randn(2)}) + df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)}) - result = df.groupby(['foo', 'bar']).mean() - expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + result = df.groupby(["foo", "bar"]).mean() + expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]] def test_groupby_keys_same_size_as_index(): # GH 11185 - freq = 's' - index = 
pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), - periods=2, freq=freq) - df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ - 'metric', 'values' - ], index=index) - result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() - expected = df.set_index([df.index, 'metric']) + freq = "s" + index = pd.date_range( + start=pd.Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq + ) + df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) + result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean() + expected = df.set_index([df.index, "metric"]) assert_frame_equal(result, expected) @@ -1154,23 +1211,33 @@ def test_groupby_keys_same_size_as_index(): def test_groupby_one_row(): # GH 11741 msg = r"^'Z'$" - df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) + df1 = pd.DataFrame(np.random.randn(1, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): - df1.groupby('Z') - df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) + df1.groupby("Z") + df2 = pd.DataFrame(np.random.randn(2, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): - df2.groupby('Z') + df2.groupby("Z") def test_groupby_nat_exclude(): # GH 6992 df = pd.DataFrame( - {'values': np.random.randn(8), - 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( - '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, - pd.Timestamp('2013-01-01')], - 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) - grouped = df.groupby('dt') + { + "values": np.random.randn(8), + "dt": [ + np.nan, + pd.Timestamp("2013-01-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-01-01"), + ], + "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"], + } + ) + grouped = df.groupby("dt") expected = [pd.Index([1, 7]), pd.Index([3, 5])] keys = sorted(grouped.groups.keys()) @@ -1185,27 +1252,26 @@ def test_groupby_nat_exclude(): assert grouped.ngroups == 2 expected = { - Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), - Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) + Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.int64), + Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.int64), } for k in grouped.indices: tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) - nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], - 'nat': [pd.NaT, pd.NaT, pd.NaT]}) - assert nan_df['nan'].dtype == 'float64' - assert nan_df['nat'].dtype == 'datetime64[ns]' + nan_df = DataFrame( + {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} + ) + assert nan_df["nan"].dtype == "float64" + assert nan_df["nat"].dtype == "datetime64[ns]" - for key in ['nan', 'nat']: + for key in ["nan", "nat"]: grouped = nan_df.groupby(key) assert grouped.groups == {} assert grouped.ngroups == 0 @@ -1218,56 +1284,55 @@ def test_groupby_nat_exclude(): def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) - d['group'] = ['g1', 'g2'] - d['zeros'] = [0, 0] - d['ones'] = [1, 1] - d['label'] = ['l1', 
'l2'] - tmp = d.groupby(['group']).mean() + d["group"] = ["g1", "g2"] + d["zeros"] = [0, 0] + d["ones"] = [1, 1] + d["label"] = ["l1", "l2"] + tmp = d.groupby(["group"]).mean() res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) - tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) def test_int32_overflow(): - B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) - )) + B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000))) A = np.arange(25000) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': np.random.randn(25000)}) - - left = df.groupby(['A', 'B', 'C', 'D']).sum() - right = df.groupby(['D', 'C', 'B', 'A']).sum() + df = DataFrame({"A": A, "B": B, "C": A, "D": B, "E": np.random.randn(25000)}) + + left = df.groupby(["A", "B", "C", "D"]).sum() + right = df.groupby(["D", "C", "B", "A"]).sum() assert len(left) == len(right) def test_groupby_sort_multi(): - df = DataFrame({'a': ['foo', 'bar', 'baz'], - 'b': [3, 2, 1], - 'c': [0, 1, 2], - 'd': np.random.randn(3)}) - - tups = [tuple(row) for row in df[['a', 'b', 'c']].values] + df = DataFrame( + { + "a": ["foo", "bar", "baz"], + "b": [3, 2, 1], + "c": [0, 1, 2], + "d": np.random.randn(3), + } + ) + + tups = [tuple(row) for row in df[["a", "b", "c"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['a', 'b', 'c'], sort=True).sum() + result = df.groupby(["a", "b", "c"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) - tups = [tuple(row) for row in df[['c', 'a', 'b']].values] + tups = [tuple(row) for row in df[["c", "a", "b"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['c', 'a', 'b'], sort=True).sum() + result = df.groupby(["c", "a", "b"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups) - tups = [tuple(x) for x in df[['b', 'c', 'a']].values] + tups = [tuple(x) for x in df[["b", "c", "a"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['b', 'c', 'a'], sort=True).sum() + result = df.groupby(["b", "c", "a"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) - df = DataFrame({'a': [0, 1, 2, 0, 1, 2], - 'b': [0, 0, 0, 1, 1, 1], - 'd': np.random.randn(6)}) - grouped = df.groupby(['a', 'b'])['d'] + df = DataFrame( + {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)} + ) + grouped = df.groupby(["a", "b"])["d"] result = grouped.sum() def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): @@ -1275,16 +1340,17 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = com.asarray_tuplesafe(tups) expected = f(df.groupby(tups)[field]) for k, v in expected.items(): - assert (result[k] == v) + assert result[k] == v - _check_groupby(df, result, ['a', 'b'], 'd') + _check_groupby(df, result, ["a", "b"], "d") def test_dont_clobber_name_column(): - df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], - 'name': ['foo', 'bar', 'baz'] * 2}) + df = DataFrame( + {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} + ) - result = df.groupby('key').apply(lambda x: x) + result = df.groupby("key").apply(lambda x: x) assert_frame_equal(result, df) @@ -1293,14 +1359,14 @@ def test_skip_group_keys(): tsf = tm.makeTimeDataFrame() grouped = tsf.groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) + result = grouped.apply(lambda 
x: x.sort_values(by="A")[:3]) - pieces = [group.sort_values(by='A')[:3] for key, group in grouped] + pieces = [group.sort_values(by="A")[:3] for key, group in grouped] expected = pd.concat(pieces) assert_frame_equal(result, expected) - grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x.sort_values()[:3]) pieces = [group.sort_values()[:3] for key, group in grouped] @@ -1311,39 +1377,39 @@ def test_skip_group_keys(): def test_no_nonsense_name(float_frame): # GH #995 - s = float_frame['C'].copy() + s = float_frame["C"].copy() s.name = None - result = s.groupby(float_frame['A']).agg(np.sum) + result = s.groupby(float_frame["A"]).agg(np.sum) assert result.name is None def test_multifunc_sum_bug(): # GH #1065 x = DataFrame(np.arange(9).reshape(3, 3)) - x['test'] = 0 - x['fl'] = [1.3, 1.5, 1.6] + x["test"] = 0 + x["fl"] = [1.3, 1.5, 1.6] - grouped = x.groupby('test') - result = grouped.agg({'fl': 'sum', 2: 'size'}) - assert result['fl'].dtype == np.float64 + grouped = x.groupby("test") + result = grouped.agg({"fl": "sum", 2: "size"}) + assert result["fl"].dtype == np.float64 def test_handle_dict_return_value(df): def f(group): - return {'max': group.max(), 'min': group.min()} + return {"max": group.max(), "min": group.min()} def g(group): - return Series({'max': group.max(), 'min': group.min()}) + return Series({"max": group.max(), "min": group.min()}) - result = df.groupby('A')['C'].apply(f) - expected = df.groupby('A')['C'].apply(g) + result = df.groupby("A")["C"].apply(f) + expected = df.groupby("A")["C"].apply(g) assert isinstance(result, Series) assert_series_equal(result, expected) -@pytest.mark.parametrize('grouper', ['A', ['A', 'B']]) +@pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) def test_set_group_name(df, grouper): def f(group): assert group.name is not None @@ -1361,25 +1427,26 @@ def foo(x): # make sure all these work grouped.apply(f) grouped.aggregate(freduce) - grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) - grouped['C'].apply(f) - grouped['C'].aggregate(freduce) - grouped['C'].aggregate([freduce, foo]) - grouped['C'].transform(f) + grouped["C"].apply(f) + grouped["C"].aggregate(freduce) + grouped["C"].aggregate([freduce, foo]) + grouped["C"].transform(f) def test_group_name_available_in_inference_pass(): # gh-15062 - df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) names = [] def f(group): names.append(group.name) return group.copy() - df.groupby('a', sort=False, group_keys=False).apply(f) + + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1387,10 +1454,10 @@ def f(group): def test_no_dummy_key_names(df): # see gh-1291 - result = df.groupby(df['A'].values).sum() + result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df['A'].values, df['B'].values]).sum() + result = df.groupby([df["A"].values, df["B"].values]).sum() assert result.index.names == (None, None) @@ -1398,26 +1465,28 @@ def test_groupby_sort_multiindex_series(): # series multiindex groupby sort argument was not being passed through # _compress_group_index # GH 9444 - index = MultiIndex(levels=[[1, 2], [1, 2]], - codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], - names=['a', 'b']) + index = MultiIndex( + levels=[[1, 2], [1, 2]], + 
codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=["a", "b"], + ) mseries = Series([0, 1, 2, 3, 4, 5], index=index) - index = MultiIndex(levels=[[1, 2], [1, 2]], - codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) + index = MultiIndex( + levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"] + ) mseries_result = Series([0, 2, 4], index=index) - result = mseries.groupby(level=['a', 'b'], sort=False).first() + result = mseries.groupby(level=["a", "b"], sort=False).first() assert_series_equal(result, mseries_result) - result = mseries.groupby(level=['a', 'b'], sort=True).first() + result = mseries.groupby(level=["a", "b"], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) def test_groupby_reindex_inside_function(): periods = 1000 - ind = date_range(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange( - periods), 'low': np.arange(periods)}, index=ind) + ind = date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind) def agg_before(hour, func, fix=False): """ @@ -1425,8 +1494,7 @@ def agg_before(hour, func, fix=False): """ def _func(data): - d = data.loc[data.index.map( - lambda x: x.hour < 11)].dropna() + d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna() if fix: data[data.index[0]] if len(d) == 0: @@ -1440,24 +1508,29 @@ def afunc(data): return np.max(d) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) - closure_bad = grouped.agg({'high': agg_before(11, np.max)}) - closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + closure_bad = grouped.agg({"high": agg_before(11, np.max)}) + closure_good = grouped.agg({"high": agg_before(11, np.max, True)}) assert_frame_equal(closure_bad, closure_good) def test_groupby_multiindex_missing_pair(): # GH9049 - df = DataFrame({'group1': ['a', 'a', 'a', 'b'], - 'group2': ['c', 'c', 'd', 'c'], - 'value': [1, 1, 1, 5]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - res = df_grouped.agg('sum') + df = DataFrame( + { + "group1": ["a", "a", "a", "b"], + "group2": ["c", "c", "d", "c"], + "value": [1, 1, 1, 5], + } + ) + df = df.set_index(["group1", "group2"]) + df_grouped = df.groupby(level=["group1", "group2"], sort=True) + + res = df_grouped.agg("sum") idx = MultiIndex.from_tuples( - [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) - exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) + [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"] + ) + exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"]) tm.assert_frame_equal(res, exp) @@ -1467,43 +1540,47 @@ def test_groupby_multiindex_not_lexsorted(): # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') + index="a", columns=["b", "c"], values="d" + ) not_lexsorted_df = not_lexsorted_df.reset_index() assert not 
not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.groupby('a').mean() + expected = lexsorted_df.groupby("a").mean() with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.groupby('a').mean() + result = not_lexsorted_df.groupby("a").mean() tm.assert_frame_equal(expected, result) # a transforming function should work regardless of sort # GH 14776 - df = DataFrame({'x': ['a', 'a', 'b', 'a'], - 'y': [1, 1, 2, 2], - 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) + df = DataFrame( + {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]} + ).set_index(["x", "y"]) assert not df.index.is_lexsorted() for level in [0, 1, [0, 1]]: for sort in [False, True]: - result = df.groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) + result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates) expected = df tm.assert_frame_equal(expected, result) - result = df.sort_index().groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) + result = ( + df.sort_index() + .groupby(level=level, sort=sort) + .apply(DataFrame.drop_duplicates) + ) expected = df.sort_index() tm.assert_frame_equal(expected, result) @@ -1511,27 +1588,27 @@ def test_groupby_multiindex_not_lexsorted(): def test_index_label_overlaps_location(): # checking we don't have any label/location confusion in the # the wake of GH5375 - df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) - g = df.groupby(list('ababb')) + df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1]) + g = df.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] - g = ser.groupby(list('ababb')) + g = ser.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) # ... 
and again, with a generic Index of floats df.index = df.index.astype(float) - g = df.groupby(list('ababb')) + g = df.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] - g = ser.groupby(list('ababb')) + g = ser.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) @@ -1541,32 +1618,37 @@ def test_transform_doesnt_clobber_ints(): # GH 7972 n = 6 x = np.arange(n) - df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) - df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) + df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x}) + df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x}) - gb = df.groupby('a') - result = gb.transform('mean') + gb = df.groupby("a") + result = gb.transform("mean") - gb2 = df2.groupby('a') - expected = gb2.transform('mean') + gb2 = df2.groupby("a") + expected = gb2.transform("mean") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings', - ['ints', 'floats'], - ['ints', 'strings']]) -@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups', - ['int_groups', 'string_groups']]) +@pytest.mark.parametrize( + "sort_column", + ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]], +) +@pytest.mark.parametrize( + "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]] +) def test_groupby_preserves_sort(sort_column, group_column): # Test to ensure that groupby always preserves sort order of original # object. Issue #8588 and #9651 df = DataFrame( - {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], - 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], - 'ints': [8, 7, 4, 5, 2, 9, 1, 1], - 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], - 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) + { + "int_groups": [3, 1, 0, 1, 0, 3, 3, 3], + "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"], + "ints": [8, 7, 4, 5, 2, 9, 1, 1], + "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], + "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"], + } + ) # Try sorting on different types and with different group types @@ -1575,6 +1657,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): assert_frame_equal(x, x.sort_values(by=sort_column)) + g.apply(test_sort) @@ -1586,15 +1669,20 @@ def test_group_shift_with_null_key(): # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `g.grouper.group_info` exactly # at those places, where the group-by key is partially missing. 
- df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) + df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) g = df.groupby(["A", "B"]) - expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 - else np.nan) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) result = g.shift(-1) assert_frame_equal(result, expected) @@ -1603,15 +1691,20 @@ def test_group_shift_with_null_key(): def test_group_shift_with_fill_value(): # GH #24128 n_rows = 24 - df = DataFrame([(i % 12, i % 3, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) g = df.groupby(["A", "B"]) - expected = DataFrame([(i + 12 if i < n_rows - 12 - else 0) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) result = g.shift(-1, fill_value=0)[["Z"]] assert_frame_equal(result, expected) @@ -1619,48 +1712,58 @@ def test_group_shift_with_fill_value(): def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 - df = pd.DataFrame({'eventDate': - pd.date_range(pd.datetime.today(), - periods=20, freq='M').tolist(), - 'thename': range(0, 20)}) + df = pd.DataFrame( + { + "eventDate": pd.date_range( + pd.datetime.today(), periods=20, freq="M" + ).tolist(), + "thename": range(0, 20), + } + ) - df['year'] = df.set_index('eventDate').index.year - df['month'] = df.set_index('eventDate').index.month + df["year"] = df.set_index("eventDate").index.year + df["month"] = df.set_index("eventDate").index.month with pytest.raises(KeyError, match="'badname'"): - df.reset_index().pivot_table(index='year', columns='month', - values='badname', aggfunc='count') + df.reset_index().pivot_table( + index="year", columns="month", values="badname", aggfunc="count" + ) def test_empty_dataframe_groupby(): # GH8093 - df = DataFrame(columns=['A', 'B', 'C']) + df = DataFrame(columns=["A", "B", "C"]) - result = df.groupby('A').sum() - expected = DataFrame(columns=['B', 'C'], dtype=np.float64) - expected.index.name = 'A' + result = df.groupby("A").sum() + expected = DataFrame(columns=["B", "C"], dtype=np.float64) + expected.index.name = "A" assert_frame_equal(result, expected) def test_tuple_warns(): # https://github.com/pandas-dev/pandas/issues/18314 - df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], - 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + df = pd.DataFrame( + { + ("a", "b"): [1, 1, 2, 2], + "a": [1, 1, 1, 2], + "b": [1, 2, 2, 2], + "c": [1, 1, 1, 1], + } + ) with tm.assert_produces_warning(FutureWarning) as w: - df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + df[["a", "b", "c"]].groupby(("a", "b")).c.mean() assert "Interpreting tuple 'by' as a list" in str(w[0].message) with tm.assert_produces_warning(None): - df.groupby(('a', 'b')).c.mean() + df.groupby(("a", "b")).c.mean() def test_tuple_warns_unhashable(): # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, 
index=business_dates, columns=['a', 'b']) + business_dates = date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = DataFrame(1, index=business_dates, columns=["a", "b"]) with tm.assert_produces_warning(FutureWarning) as w: df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) @@ -1670,28 +1773,36 @@ def test_tuple_warns_unhashable(): def test_tuple_correct_keyerror(): # https://github.com/pandas-dev/pandas/issues/18798 - df = pd.DataFrame(1, index=range(3), - columns=pd.MultiIndex.from_product([[1, 2], - [3, 4]])) + df = pd.DataFrame( + 1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) + ) with pytest.raises(KeyError, match=r"^\(7, 8\)$"): df.groupby((7, 8)).mean() def test_groupby_agg_ohlc_non_first(): # GH 21716 - df = pd.DataFrame([[1], [1]], columns=['foo'], - index=pd.date_range('2018-01-01', periods=2, freq='D')) - - expected = pd.DataFrame([ - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1] - ], columns=pd.MultiIndex.from_tuples(( - ('foo', 'sum', 'foo'), ('foo', 'ohlc', 'open'), - ('foo', 'ohlc', 'high'), ('foo', 'ohlc', 'low'), - ('foo', 'ohlc', 'close'))), index=pd.date_range( - '2018-01-01', periods=2, freq='D')) - - result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) + df = pd.DataFrame( + [[1], [1]], + columns=["foo"], + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + expected = pd.DataFrame( + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], + columns=pd.MultiIndex.from_tuples( + ( + ("foo", "sum", "foo"), + ("foo", "ohlc", "open"), + ("foo", "ohlc", "high"), + ("foo", "ohlc", "low"), + ("foo", "ohlc", "close"), + ) + ), + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + result = df.groupby(pd.Grouper(freq="D")).agg(["sum", "ohlc"]) tm.assert_frame_equal(result, expected) @@ -1699,23 +1810,23 @@ def test_groupby_agg_ohlc_non_first(): def test_groupby_multiindex_nat(): # GH 9236 values = [ - (pd.NaT, 'a'), - (datetime(2012, 1, 2), 'a'), - (datetime(2012, 1, 2), 'b'), - (datetime(2012, 1, 3), 'a') + (pd.NaT, "a"), + (datetime(2012, 1, 2), "a"), + (datetime(2012, 1, 2), "b"), + (datetime(2012, 1, 3), "a"), ] - mi = pd.MultiIndex.from_tuples(values, names=['date', None]) + mi = pd.MultiIndex.from_tuples(values, names=["date", None]) ser = pd.Series([3, 2, 2.5, 4], index=mi) result = ser.groupby(level=1).mean() - expected = pd.Series([3., 2.5], index=["a", "b"]) + expected = pd.Series([3.0, 2.5], index=["a", "b"]) assert_series_equal(result, expected) def test_groupby_empty_list_raises(): # GH 5289 values = zip(range(10), range(10)) - df = DataFrame(values, columns=['apple', 'b']) + df = DataFrame(values, columns=["apple", "b"]) msg = "Grouper and axis must be same length" with pytest.raises(ValueError, match=msg): df.groupby([[]]) @@ -1723,18 +1834,14 @@ def test_groupby_empty_list_raises(): def test_groupby_multiindex_series_keys_len_equal_group_axis(): # GH 25704 - index_array = [ - ['x', 'x'], - ['a', 'b'], - ['k', 'k'] - ] - index_names = ['first', 'second', 'third'] + index_array = [["x", "x"], ["a", "b"], ["k", "k"]] + index_names = ["first", "second", "third"] ri = pd.MultiIndex.from_arrays(index_array, names=index_names) s = pd.Series(data=[1, 2], index=ri) - result = s.groupby(['first', 'third']).sum() + result = s.groupby(["first", "third"]).sum() - index_array = [['x'], ['k']] - index_names = ['first', 'third'] + index_array = [["x"], ["k"]] + index_names = ["first", "third"] ei = pd.MultiIndex.from_arrays(index_array, names=index_names) expected = pd.Series([3], index=ei) @@ -1744,14 +1851,12 @@ def 
test_groupby_multiindex_series_keys_len_equal_group_axis(): def test_groupby_groups_in_BaseGrouper(): # GH 26326 # Test if DataFrame grouped with a pandas.Grouper has correct groups - mi = pd.MultiIndex.from_product([['A', 'B'], - ['C', 'D']], names=['alpha', 'beta']) - df = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, - index=mi) - result = df.groupby([pd.Grouper(level='alpha'), 'beta']) - expected = df.groupby(['alpha', 'beta']) - assert(result.groups == expected.groups) - - result = df.groupby(['beta', pd.Grouper(level='alpha')]) - expected = df.groupby(['beta', 'alpha']) - assert(result.groups == expected.groups) + mi = pd.MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"]) + df = pd.DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi) + result = df.groupby([pd.Grouper(level="alpha"), "beta"]) + expected = df.groupby(["alpha", "beta"]) + assert result.groups == expected.groups + + result = df.groupby(["beta", pd.Grouper(level="alpha")]) + expected = df.groupby(["beta", "alpha"]) + assert result.groups == expected.groups diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 5508c290b04298..1fd67caadf2e4e 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -5,69 +5,81 @@ import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, - date_range) + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) # selection # -------------------------------- class TestSelection: - def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) - g = df.groupby('A') - with pytest.raises(KeyError, match='"Columns not found: \'C\'"'): - g[['C']] + df = DataFrame([[1, 2]], columns=["A", "B"]) + g = df.groupby("A") + with pytest.raises(KeyError, match="\"Columns not found: 'C'\""): + g[["C"]] - with pytest.raises(KeyError, match='^[^A]+$'): + with pytest.raises(KeyError, match="^[^A]+$"): # A should not be referenced as a bad column... # will have to rethink regex if you change message! 
- g[['A', 'C']] + g[["A", "C"]] def test_groupby_duplicated_column_errormsg(self): # GH7511 - df = DataFrame(columns=['A', 'B', 'A', 'C'], - data=[range(4), range(2, 6), range(0, 8, 2)]) + df = DataFrame( + columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)] + ) msg = "Grouper for 'A' not 1-dimensional" with pytest.raises(ValueError, match=msg): - df.groupby('A') + df.groupby("A") with pytest.raises(ValueError, match=msg): - df.groupby(['A', 'B']) + df.groupby(["A", "B"]) - grouped = df.groupby('B') + grouped = df.groupby("B") c = grouped.count() assert c.columns.nlevels == 1 assert c.columns.size == 3 def test_column_select_via_attr(self, df): - result = df.groupby('A').C.sum() - expected = df.groupby('A')['C'].sum() + result = df.groupby("A").C.sum() + expected = df.groupby("A")["C"].sum() assert_series_equal(result, expected) - df['mean'] = 1.5 - result = df.groupby('A').mean() - expected = df.groupby('A').agg(np.mean) + df["mean"] = 1.5 + result = df.groupby("A").mean() + expected = df.groupby("A").agg(np.mean) assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) - result = df.groupby('A')[['C', 'D']].mean() - result2 = df.groupby('A')['C', 'D'].mean() - result3 = df.groupby('A')[df.columns[2:4]].mean() + result = df.groupby("A")[["C", "D"]].mean() + result2 = df.groupby("A")["C", "D"].mean() + result3 = df.groupby("A")[df.columns[2:4]].mean() - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -75,10 +87,14 @@ def test_getitem_list_of_columns(self): def test_getitem_numeric_column_names(self): # GH #13731 - df = DataFrame({0: list('abcd') * 2, - 2: np.random.randn(8), - 4: np.random.randn(8), - 6: np.random.randn(8)}) + df = DataFrame( + { + 0: list("abcd") * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8), + } + ) result = df.groupby(0)[df.columns[1:3]].mean() result2 = df.groupby(0)[2, 4].mean() result3 = df.groupby(0)[[2, 4]].mean() @@ -93,110 +109,123 @@ def test_getitem_numeric_column_names(self): # grouping # -------------------------------- -class TestGrouping: +class TestGrouping: def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) - for index in [tm.makeFloatIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, - tm.makePeriodIndex]: + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB")) + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: df.index = index(len(df)) - df.groupby(list('abcde')).apply(lambda x: x) + df.groupby(list("abcde")).apply(lambda x: x) df.index = list(reversed(df.index.tolist())) - df.groupby(list('abcde')).apply(lambda x: x) + df.groupby(list("abcde")).apply(lambda x: x) def test_grouper_multilevel_freq(self): # GH 7885 # with level 
and freq specified in a pd.Grouper from datetime import date, timedelta + d0 = date.today() - timedelta(days=14) dates = date_range(d0, date.today()) - date_index = pd.MultiIndex.from_product( - [dates, dates], names=['foo', 'bar']) + date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"]) df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) # Check string level - expected = df.reset_index().groupby([pd.Grouper( - key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() + expected = ( + df.reset_index() + .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")]) + .sum() + ) # reset index changes columns dtype to object - expected.columns = pd.Index([0], dtype='int64') + expected.columns = pd.Index([0], dtype="int64") - result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( - level='bar', freq='W')]).sum() + result = df.groupby( + [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")] + ).sum() assert_frame_equal(result, expected) # Check integer level - result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( - level=1, freq='W')]).sum() + result = df.groupby( + [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")] + ).sum() assert_frame_equal(result, expected) def test_grouper_creation_bug(self): # GH 8795 - df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) - g = df.groupby('A') + df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]}) + g = df.groupby("A") expected = g.sum() - g = df.groupby(pd.Grouper(key='A')) + g = df.groupby(pd.Grouper(key="A")) result = g.sum() assert_frame_equal(result, expected) result = g.apply(lambda x: x.sum()) assert_frame_equal(result, expected) - g = df.groupby(pd.Grouper(key='A', axis=0)) + g = df.groupby(pd.Grouper(key="A", axis=0)) result = g.sum() assert_frame_equal(result, expected) # GH14334 # pd.Grouper(key=...) 
may be passed in a list - df = DataFrame({'A': [0, 0, 0, 1, 1, 1], - 'B': [1, 1, 2, 2, 3, 3], - 'C': [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]} + ) # Group by single column - expected = df.groupby('A').sum() - g = df.groupby([pd.Grouper(key='A')]) + expected = df.groupby("A").sum() + g = df.groupby([pd.Grouper(key="A")]) result = g.sum() assert_frame_equal(result, expected) # Group by two columns # using a combination of strings and Grouper objects - expected = df.groupby(['A', 'B']).sum() + expected = df.groupby(["A", "B"]).sum() # Group with two Grouper objects - g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) + g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")]) result = g.sum() assert_frame_equal(result, expected) # Group with a string and a Grouper object - g = df.groupby(['A', pd.Grouper(key='B')]) + g = df.groupby(["A", pd.Grouper(key="B")]) result = g.sum() assert_frame_equal(result, expected) # Group with a Grouper object and a string - g = df.groupby([pd.Grouper(key='A'), 'B']) + g = df.groupby([pd.Grouper(key="A"), "B"]) result = g.sum() assert_frame_equal(result, expected) # GH8866 - s = Series(np.arange(8, dtype='int64'), - index=pd.MultiIndex.from_product( - [list('ab'), range(2), - date_range('20130101', periods=2)], - names=['one', 'two', 'three'])) - result = s.groupby(pd.Grouper(level='three', freq='M')).sum() - expected = Series([28], index=Index( - [Timestamp('2013-01-31')], freq='M', name='three')) + s = Series( + np.arange(8, dtype="int64"), + index=pd.MultiIndex.from_product( + [list("ab"), range(2), date_range("20130101", periods=2)], + names=["one", "two", "three"], + ), + ) + result = s.groupby(pd.Grouper(level="three", freq="M")).sum() + expected = Series( + [28], index=Index([Timestamp("2013-01-31")], freq="M", name="three") + ) assert_series_equal(result, expected) # just specifying a level breaks - result = s.groupby(pd.Grouper(level='one')).sum() - expected = s.groupby(level='one').sum() + result = s.groupby(pd.Grouper(level="one")).sum() + expected = s.groupby(level="one").sum() assert_series_equal(result, expected) def test_grouper_column_and_index(self): @@ -204,39 +233,41 @@ def test_grouper_column_and_index(self): # Grouping a multi-index frame by a column and an index level should # be equivalent to resetting the index and grouping by two columns - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)] + ) + idx.names = ["outer", "inner"] + df_multi = pd.DataFrame( + {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, + index=idx, + ) + result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_multi.reset_index().groupby(["B", "inner"]).mean() assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_multi.reset_index().groupby(["inner", "B"]).mean() assert_frame_equal(result, 
expected) # Grouping a single-index frame by a column and the index should # be equivalent to resetting the index and grouping by two columns - df_single = df_multi.reset_index('outer') - result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_single.reset_index().groupby(['B', 'inner']).mean() + df_single = df_multi.reset_index("outer") + result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_single.reset_index().groupby(["B", "inner"]).mean() assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_single.reset_index().groupby(['inner', 'B']).mean() + result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_single.reset_index().groupby(["inner", "B"]).mean() assert_frame_equal(result, expected) def test_groupby_levels_and_columns(self): # GH9344, GH9049 - idx_names = ['x', 'y'] + idx_names = ["x", "y"] idx = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) + [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names + ) df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) by_levels = df.groupby(level=idx_names).mean() @@ -250,26 +281,18 @@ def test_groupby_levels_and_columns(self): def test_groupby_categorical_index_and_columns(self, observed): # GH18432, adapted for GH25871 - columns = ['A', 'B', 'A', 'B'] - categories = ['B', 'A'] - data = np.array([[1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2]], int) - cat_columns = CategoricalIndex(columns, - categories=categories, - ordered=True) + columns = ["A", "B", "A", "B"] + categories = ["B", "A"] + data = np.array( + [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int + ) + cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) result = df.groupby(axis=1, level=0, observed=observed).sum() - expected_data = np.array([[4, 2], - [4, 2], - [4, 2], - [4, 2], - [4, 2]], int) - expected_columns = CategoricalIndex(categories, - categories=categories, - ordered=True) + expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int) + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=True + ) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected) @@ -284,20 +307,26 @@ def test_grouper_getting_correct_binner(self): # GH 10063 # using a non-time-based grouper and a time-based grouper # and specifying levels - df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( - [list('ab'), date_range('20130101', periods=80)], names=['one', - 'two'])) - result = df.groupby([pd.Grouper(level='one'), pd.Grouper( - level='two', freq='M')]).sum() - expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, - index=MultiIndex.from_product( - [list('ab'), - date_range('20130101', freq='M', periods=3)], - names=['one', 'two'])) + df = DataFrame( + {"A": 1}, + index=pd.MultiIndex.from_product( + [list("ab"), date_range("20130101", periods=80)], names=["one", "two"] + ), + ) + result = df.groupby( + [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")] + ).sum() + expected = DataFrame( + {"A": [31, 28, 21, 31, 28, 21]}, + index=MultiIndex.from_product( + [list("ab"), date_range("20130101", freq="M", periods=3)], + names=["one", "two"], + ), + ) assert_frame_equal(result, expected) def test_grouper_iter(self, df): - assert 
sorted(df.groupby('A').grouper) == ['bar', 'foo'] + assert sorted(df.groupby("A").grouper) == ["bar", "foo"] def test_empty_groups(self, df): # see gh-1048 @@ -305,7 +334,7 @@ def test_empty_groups(self, df): df.groupby([]) def test_groupby_grouper(self, df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = df.groupby(grouped.grouper).mean() expected = grouped.mean() @@ -314,13 +343,14 @@ def test_groupby_grouper(self, df): def test_groupby_dict_mapping(self): # GH #679 from pandas import Series - s = Series({'T1': 5}) - result = s.groupby({'T1': 'T2'}).agg(sum) - expected = s.groupby(['T2']).agg(sum) + + s = Series({"T1": 5}) + result = s.groupby({"T1": "T2"}).agg(sum) + expected = s.groupby(["T2"]).agg(sum) assert_series_equal(result, expected) - s = Series([1., 2., 3., 4.], index=list('abcd')) - mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} + s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")) + mapping = {"a": 0, "b": 0, "c": 1, "d": 1} result = s.groupby(mapping).mean() result2 = s.groupby(mapping).agg(np.mean) @@ -331,7 +361,7 @@ def test_groupby_dict_mapping(self): assert_series_equal(result, expected2) def test_groupby_grouper_f_sanity_checked(self): - dates = date_range('01-Jan-2013', periods=12, freq='MS') + dates = date_range("01-Jan-2013", periods=12, freq="MS") ts = Series(np.random.randn(12), index=dates) # GH3035 @@ -348,10 +378,9 @@ def test_groupby_grouper_f_sanity_checked(self): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): - msg = ("Grouper for ''" - " not 1-dimensional") + msg = "Grouper for ''" " not 1-dimensional" with pytest.raises(ValueError, match=msg): - Grouping(df.index, df[['A', 'A']]) + Grouping(df.index, df[["A", "A"]]) def test_multiindex_passthru(self): @@ -366,76 +395,77 @@ def test_multiindex_passthru(self): def test_multiindex_negative_level(self, mframe): # GH 13901 result = mframe.groupby(level=-1).sum() - expected = mframe.groupby(level='second').sum() + expected = mframe.groupby(level="second").sum() assert_frame_equal(result, expected) result = mframe.groupby(level=-2).sum() - expected = mframe.groupby(level='first').sum() + expected = mframe.groupby(level="first").sum() assert_frame_equal(result, expected) result = mframe.groupby(level=[-2, -1]).sum() expected = mframe assert_frame_equal(result, expected) - result = mframe.groupby(level=[-1, 'first']).sum() - expected = mframe.groupby(level=['second', 'first']).sum() + result = mframe.groupby(level=[-1, "first"]).sum() + expected = mframe.groupby(level=["second", "first"]).sum() assert_frame_equal(result, expected) def test_multifunc_select_col_integer_cols(self, df): df.columns = np.arange(len(df.columns)) # it works! 
- df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + df.groupby(1, as_index=False)[2].agg({"Q": np.mean}) def test_multiindex_columns_empty_level(self): - lst = [['count', 'values'], ['to filter', '']] + lst = [["count", "values"], ["to filter", ""]] midx = MultiIndex.from_tuples(lst) - df = DataFrame([[1, 'A']], columns=midx) + df = DataFrame([[1, "A"]], columns=midx) - grouped = df.groupby('to filter').groups - assert grouped['A'] == [0] + grouped = df.groupby("to filter").groups + assert grouped["A"] == [0] - grouped = df.groupby([('to filter', '')]).groups - assert grouped['A'] == [0] + grouped = df.groupby([("to filter", "")]).groups + assert grouped["A"] == [0] - df = DataFrame([[1, 'A'], [2, 'B']], columns=midx) + df = DataFrame([[1, "A"], [2, "B"]], columns=midx) - expected = df.groupby('to filter').groups - result = df.groupby([('to filter', '')]).groups + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups assert result == expected - df = DataFrame([[1, 'A'], [2, 'A']], columns=midx) + df = DataFrame([[1, "A"], [2, "A"]], columns=midx) - expected = df.groupby('to filter').groups - result = df.groupby([('to filter', '')]).groups + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups tm.assert_dict_equal(result, expected) def test_groupby_multiindex_tuple(self): # GH 17979 - df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], - columns=pd.MultiIndex.from_arrays( - [['a', 'b', 'b', 'c'], - [1, 1, 2, 2]])) - expected = df.groupby([('b', 1)]).groups - result = df.groupby(('b', 1)).groups + df = pd.DataFrame( + [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], + columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), + ) + expected = df.groupby([("b", 1)]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df2 = pd.DataFrame(df.values, - columns=pd.MultiIndex.from_arrays( - [['a', 'b', 'b', 'c'], - ['d', 'd', 'e', 'e']])) - expected = df2.groupby([('b', 'd')]).groups - result = df.groupby(('b', 1)).groups + df2 = pd.DataFrame( + df.values, + columns=pd.MultiIndex.from_arrays( + [["a", "b", "b", "c"], ["d", "d", "e", "e"]] + ), + ) + expected = df2.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df3 = pd.DataFrame(df.values, - columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c']) - expected = df3.groupby([('b', 'd')]).groups - result = df.groupby(('b', 1)).groups + df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) + expected = df3.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level(self, sort, mframe, df): # GH 17537 frame = mframe @@ -444,14 +474,14 @@ def test_groupby_level(self, sort, mframe, df): result0 = frame.groupby(level=0, sort=sort).sum() result1 = frame.groupby(level=1, sort=sort).sum() - expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() - expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() + expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum() + expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum() - expected0.index.name = 'first' - expected1.index.name = 'second' + expected0.index.name = "first" + expected1.index.name = "second" - assert result0.index.name == 'first' - assert result1.index.name == 
'second' + assert result0.index.name == "first" + assert result1.index.name == "second" assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -459,8 +489,8 @@ def test_groupby_level(self, sort, mframe, df): assert result1.index.name == frame.index.names[1] # groupby level name - result0 = frame.groupby(level='first', sort=sort).sum() - result1 = frame.groupby(level='second', sort=sort).sum() + result0 = frame.groupby(level="first", sort=sort).sum() + result1 = frame.groupby(level="second", sort=sort).sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -478,34 +508,37 @@ def test_groupby_level(self, sort, mframe, df): def test_groupby_level_index_names(self): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) - df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, - 'var1': range(6), }).set_index('exp') - df.groupby(level='exp') + df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( + "exp" + ) + df.groupby(level="exp") msg = "level name foo is not the name of the index" with pytest.raises(ValueError, match=msg): - df.groupby(level='foo') + df.groupby(level="foo") - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): # GH 17537 - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, - 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) + s = Series(np.arange(8.0), index=index) result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 22.], index=[0, 1]) + expected = Series([6.0, 22.0], index=[0, 1]) assert_series_equal(result, expected) - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, - 1, 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) + s = Series(np.arange(8.0), index=index) result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 18.], index=[0.0, 1.0]) + expected = Series([6.0, 18.0], index=[0.0, 1.0]) assert_series_equal(result, expected) def test_groupby_args(self, mframe): @@ -520,10 +553,13 @@ def test_groupby_args(self, mframe): with pytest.raises(TypeError, match=msg): frame.groupby(by=None, level=None) - @pytest.mark.parametrize('sort,labels', [ - [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], - [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] - ]) + @pytest.mark.parametrize( + "sort,labels", + [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], + ], + ) def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) @@ -537,28 +573,28 @@ def test_grouping_labels(self, mframe): def test_list_grouper_with_nat(self): # GH 14715 - df = pd.DataFrame({'date': pd.date_range('1/1/2011', - periods=365, freq='D')}) + df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT - grouper = pd.Grouper(key='date', freq='AS') + grouper = pd.Grouper(key="date", freq="AS") # Grouper in a list grouping result = df.groupby([grouper]) - expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))} + expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))} 
tm.assert_dict_equal(result.groups, expected) # Test case without a list result = df.groupby(grouper) - expected = {pd.Timestamp('2011-01-01'): 365} + expected = {pd.Timestamp("2011-01-01"): 365} tm.assert_dict_equal(result.groups, expected) @pytest.mark.parametrize( - 'func,expected', + "func,expected", [ - ('transform', pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))), - ('agg', pd.Series(name=2, index=pd.Float64Index([], name=1))), - ('apply', pd.Series(name=2, index=pd.Float64Index([], name=1))), - ]) + ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))), + ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))), + ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))), + ], + ) def test_evaluate_with_empty_groups(self, func, expected): # 26208 # test transform'ing empty groups @@ -571,7 +607,7 @@ def test_evaluate_with_empty_groups(self, func, expected): def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 - s = pd.Series([], name='name') + s = pd.Series([], name="name") gr = s.groupby([]) result = gr.mean() @@ -579,32 +615,46 @@ def test_groupby_empty(self): # check group properties assert len(gr.grouper.groupings) == 1 - tm.assert_numpy_array_equal(gr.grouper.group_info[0], - np.array([], dtype=np.dtype("intp"))) + tm.assert_numpy_array_equal( + gr.grouper.group_info[0], np.array([], dtype=np.dtype("intp")) + ) - tm.assert_numpy_array_equal(gr.grouper.group_info[1], - np.array([], dtype=np.dtype('int'))) + tm.assert_numpy_array_equal( + gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")) + ) assert gr.grouper.group_info[2] == 0 # check name - assert s.groupby(s).grouper.names == ['name'] + assert s.groupby(s).grouper.names == ["name"] # get_group # -------------------------------- + class TestGetGroup: def test_get_group(self): # GH 5267 # be datelike friendly - df = DataFrame({'DATE': pd.to_datetime( - ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', - '11-Oct-2013', '11-Oct-2013']), - 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], - 'VAL': [1, 2, 3, 4, 5, 6]}) - - g = df.groupby('DATE') + df = DataFrame( + { + "DATE": pd.to_datetime( + [ + "10-Oct-2013", + "10-Oct-2013", + "10-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + ] + ), + "label": ["foo", "foo", "bar", "foo", "foo", "bar"], + "VAL": [1, 2, 3, 4, 5, 6], + } + ) + + g = df.groupby("DATE") key = list(g.groups)[0] result1 = g.get_group(key) result2 = g.get_group(Timestamp(key).to_pydatetime()) @@ -612,7 +662,7 @@ def test_get_group(self): assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) - g = df.groupby(['DATE', 'label']) + g = df.groupby(["DATE", "label"]) key = list(g.groups)[0] result1 = g.get_group(key) @@ -624,13 +674,15 @@ def test_get_group(self): # must pass a same-length tuple with multiple keys msg = "must supply a tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): - g.get_group('foo') + g.get_group("foo") with pytest.raises(ValueError, match=msg): - g.get_group(('foo')) - msg = ("must supply a same-length tuple to get_group with multiple" - " grouping keys") + g.get_group(("foo")) + msg = ( + "must supply a same-length tuple to get_group with multiple" + " grouping keys" + ) with pytest.raises(ValueError, match=msg): - g.get_group(('foo', 'bar', 'baz')) + g.get_group(("foo", "bar", "baz")) def test_get_group_empty_bins(self, observed): @@ -650,71 +702,70 @@ def test_get_group_empty_bins(self, observed): def test_get_group_grouped_by_tuple(self): # GH 
8121 - df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T - gr = df.groupby('ids') - expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) - result = gr.get_group((1, )) + df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T + gr = df.groupby("ids") + expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2]) + result = gr.get_group((1,)) assert_frame_equal(result, expected) - dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', - '2010-01-02']) - df = DataFrame({'ids': [(x, ) for x in dt]}) - gr = df.groupby('ids') - result = gr.get_group(('2010-01-01', )) - expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) + dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"]) + df = DataFrame({"ids": [(x,) for x in dt]}) + gr = df.groupby("ids") + result = gr.get_group(("2010-01-01",)) + expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2]) assert_frame_equal(result, expected) def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () series = pd.Series(data, index) - grouper = pd.Grouper(freq='D') + grouper = pd.Grouper(freq="D") grouped = series.groupby(grouper) assert next(iter(grouped), None) is None def test_groupby_with_single_column(self): - df = pd.DataFrame({'a': list('abssbab')}) - tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + df = pd.DataFrame({"a": list("abssbab")}) + tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = pd.DataFrame(index=pd.Index(['a', 'b', 's'], name='a')) - tm.assert_frame_equal(df.groupby('a').count(), exp) - tm.assert_frame_equal(df.groupby('a').sum(), exp) - tm.assert_frame_equal(df.groupby('a').nth(1), exp) + exp = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a")) + tm.assert_frame_equal(df.groupby("a").count(), exp) + tm.assert_frame_equal(df.groupby("a").sum(), exp) + tm.assert_frame_equal(df.groupby("a").nth(1), exp) def test_gb_key_len_equal_axis_len(self): # GH16843 # test ensures that index and column keys are recognized correctly # when number of keys equals axis length of groupby - df = pd.DataFrame([['foo', 'bar', 'B', 1], - ['foo', 'bar', 'B', 2], - ['foo', 'baz', 'C', 3]], - columns=['first', 'second', 'third', 'one']) - df = df.set_index(['first', 'second']) - df = df.groupby(['first', 'second', 'third']).size() - assert df.loc[('foo', 'bar', 'B')] == 2 - assert df.loc[('foo', 'baz', 'C')] == 1 + df = pd.DataFrame( + [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]], + columns=["first", "second", "third", "one"], + ) + df = df.set_index(["first", "second"]) + df = df.groupby(["first", "second", "third"]).size() + assert df.loc[("foo", "bar", "B")] == 2 + assert df.loc[("foo", "baz", "C")] == 1 # groups & iteration # -------------------------------- -class TestIteration: +class TestIteration: def test_groups(self, df): - grouped = df.groupby(['A']) + grouped = df.groupby(["A"]) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in grouped.groups.items(): - assert (df.loc[v]['A'] == k).all() + assert (df.loc[v]["A"] == k).all() - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in grouped.groups.items(): - assert (df.loc[v]['A'] == k[0]).all() - assert (df.loc[v]['B'] == k[1]).all() + assert (df.loc[v]["A"] == k[0]).all() + assert (df.loc[v]["B"] == k[1]).all() def test_grouping_is_iterable(self, tsframe): # this code path isn't 
used anywhere else @@ -727,14 +778,18 @@ def test_grouping_is_iterable(self, tsframe): def test_multi_iter(self): s = Series(np.arange(6)) - k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) + k1 = np.array(["a", "a", "a", "b", "b", "b"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) grouped = s.groupby([k1, k2]) iterated = list(grouped) - expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), - ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] + expected = [ + ("a", "1", s[[0, 2]]), + ("a", "2", s[[1]]), + ("b", "1", s[[4]]), + ("b", "2", s[[3, 5]]), + ] for i, ((one, two), three) in enumerate(iterated): e1, e2, e3 = expected[i] assert e1 == one @@ -742,22 +797,24 @@ def test_multi_iter(self): assert_series_equal(three, e3) def test_multi_iter_frame(self, three_group): - k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': k1, 'k2': k2}, - index=['one', 'two', 'three', 'four', 'five', 'six']) + k1 = np.array(["b", "b", "b", "a", "a", "a"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) + df = DataFrame( + {"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2}, + index=["one", "two", "three", "four", "five", "six"], + ) - grouped = df.groupby(['k1', 'k2']) + grouped = df.groupby(["k1", "k2"]) # things get sorted! iterated = list(grouped) idx = df.index - expected = [('a', '1', df.loc[idx[[4]]]), - ('a', '2', df.loc[idx[[3, 5]]]), - ('b', '1', df.loc[idx[[0, 2]]]), - ('b', '2', df.loc[idx[[1]]])] + expected = [ + ("a", "1", df.loc[idx[[4]]]), + ("a", "2", df.loc[idx[[3, 5]]]), + ("b", "1", df.loc[idx[[0, 2]]]), + ("b", "2", df.loc[idx[[1]]]), + ] for i, ((one, two), three) in enumerate(iterated): e1, e2, e3 = expected[i] assert e1 == one @@ -765,82 +822,83 @@ def test_multi_iter_frame(self, three_group): assert_frame_equal(three, e3) # don't iterate through groups with no data - df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) - grouped = df.groupby(['k1', 'k2']) + df["k1"] = np.array(["b", "b", "b", "a", "a", "a"]) + df["k2"] = np.array(["1", "1", "1", "2", "2", "2"]) + grouped = df.groupby(["k1", "k2"]) groups = {key: gp for key, gp in grouped} assert len(groups) == 2 # axis = 1 - three_levels = three_group.groupby(['A', 'B', 'C']).mean() + three_levels = three_group.groupby(["A", "B", "C"]).mean() grouped = three_levels.T.groupby(axis=1, level=(1, 2)) for key, group in grouped: pass def test_dictify(self, df): - dict(iter(df.groupby('A'))) - dict(iter(df.groupby(['A', 'B']))) - dict(iter(df['C'].groupby(df['A']))) - dict(iter(df['C'].groupby([df['A'], df['B']]))) - dict(iter(df.groupby('A')['C'])) - dict(iter(df.groupby(['A', 'B'])['C'])) + dict(iter(df.groupby("A"))) + dict(iter(df.groupby(["A", "B"]))) + dict(iter(df["C"].groupby(df["A"]))) + dict(iter(df["C"].groupby([df["A"], df["B"]]))) + dict(iter(df.groupby("A")["C"])) + dict(iter(df.groupby(["A", "B"])["C"])) def test_groupby_with_small_elem(self): # GH 8542 # length=2 - df = pd.DataFrame({'event': ['start', 'start'], - 'change': [1234, 5678]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start"], "change": [1234, 5678]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups 
== 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-09-15'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0, 2], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) # length=3 - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-08-05'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups + assert (pd.Timestamp("2014-08-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) - res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-08-31"), "start")) tm.assert_frame_equal(res, df.iloc[[2], :]) def test_grouping_string_repr(self): # GH 13394 mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) df = DataFrame([[1, 2, 3]], columns=mi) - gr = df.groupby(df[('A', 'a')]) + gr = df.groupby(df[("A", "a")]) result = gr.grouper.groupings[0].__repr__() expected = "Grouping(('A', 'a'))" diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 
141381f84300b5..71d545e960566d 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -5,13 +5,17 @@ from pandas.util.testing import assert_frame_equal, assert_series_equal -@pytest.fixture(params=[['inner'], ['inner', 'outer']]) +@pytest.fixture(params=[["inner"], ["inner", "outer"]]) def frame(request): levels = request.param - df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 3, 1, 2, 3], - 'A': np.arange(6), - 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) if levels: df = df.set_index(levels) @@ -20,39 +24,49 @@ def frame(request): @pytest.fixture() def series(): - df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 3, 1, 2, 3], - 'A': np.arange(6), - 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) - s = df.set_index(['outer', 'inner', 'B'])['A'] + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + s = df.set_index(["outer", "inner", "B"])["A"] return s -@pytest.mark.parametrize('key_strs,groupers', [ - ('inner', # Index name - pd.Grouper(level='inner') - ), - (['inner'], # List of index name - [pd.Grouper(level='inner')] - ), - (['B', 'inner'], # Column and index - ['B', pd.Grouper(level='inner')] - ), - (['inner', 'B'], # Index and column - [pd.Grouper(level='inner'), 'B'])]) +@pytest.mark.parametrize( + "key_strs,groupers", + [ + ("inner", pd.Grouper(level="inner")), # Index name + (["inner"], [pd.Grouper(level="inner")]), # List of index name + (["B", "inner"], ["B", pd.Grouper(level="inner")]), # Column and index + (["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column + ], +) def test_grouper_index_level_as_string(frame, key_strs, groupers): result = frame.groupby(key_strs).mean() expected = frame.groupby(groupers).mean() assert_frame_equal(result, expected) -@pytest.mark.parametrize('levels', [ - 'inner', 'outer', 'B', - ['inner'], ['outer'], ['B'], - ['inner', 'outer'], ['outer', 'inner'], - ['inner', 'outer', 'B'], ['B', 'outer', 'inner'] -]) +@pytest.mark.parametrize( + "levels", + [ + "inner", + "outer", + "B", + ["inner"], + ["outer"], + ["B"], + ["inner", "outer"], + ["outer", "inner"], + ["inner", "outer", "B"], + ["B", "outer", "inner"], + ], +) def test_grouper_index_level_as_string_series(series, levels): # Compute expected result diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index deb0f48b9cea2c..2c4b56793580cf 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -8,10 +8,10 @@ def test_first_last_nth(df): # tests for first / last / nth - grouped = df.groupby('A') + grouped = df.groupby("A") first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[1, 0], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(first, expected) @@ -19,122 +19,120 @@ def test_first_last_nth(df): assert_frame_equal(nth, expected) last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[5, 7], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") 
assert_frame_equal(last, expected) nth = grouped.nth(-1) assert_frame_equal(nth, expected) nth = grouped.nth(1) - expected = df.loc[[2, 3], ['B', 'C', 'D']].copy() - expected.index = Index(['foo', 'bar'], name='A') + expected = df.loc[[2, 3], ["B", "C", "D"]].copy() + expected.index = Index(["foo", "bar"], name="A") expected = expected.sort_index() assert_frame_equal(nth, expected) # it works! - grouped['B'].first() - grouped['B'].last() - grouped['B'].nth(0) + grouped["B"].first() + grouped["B"].last() + grouped["B"].nth(0) - df.loc[df['A'] == 'foo', 'B'] = np.nan - assert isna(grouped['B'].first()['foo']) - assert isna(grouped['B'].last()['foo']) - assert isna(grouped['B'].nth(0)['foo']) + df.loc[df["A"] == "foo", "B"] = np.nan + assert isna(grouped["B"].first()["foo"]) + assert isna(grouped["B"].last()["foo"]) + assert isna(grouped["B"].nth(0)["foo"]) # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") result = g.first() - expected = df.iloc[[1, 2]].set_index('A') + expected = df.iloc[[1, 2]].set_index("A") assert_frame_equal(result, expected) - expected = df.iloc[[1, 2]].set_index('A') - result = g.nth(0, dropna='any') + expected = df.iloc[[1, 2]].set_index("A") + result = g.nth(0, dropna="any") assert_frame_equal(result, expected) def test_first_last_nth_dtypes(df_mixed_floats): df = df_mixed_floats.copy() - df['E'] = True - df['F'] = 1 + df["E"] = True + df["F"] = 1 # tests for first / last / nth - grouped = df.groupby('A') + grouped = df.groupby("A") first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(first, expected) last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(last, expected) nth = grouped.nth(1) - expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(nth, expected) # GH 2763, first/last shifting dtypes idx = list(range(10)) idx.append(9) - s = Series(data=range(11), index=idx, name='IntCol') - assert s.dtype == 'int64' + s = Series(data=range(11), index=idx, name="IntCol") + assert s.dtype == "int64" f = s.groupby(level=0).first() - assert f.dtype == 'int64' + assert f.dtype == "int64" def test_nth(): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) - assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) - assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) - assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) - assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), - df.loc[[0, 2], ['A', 'B']].set_index('A')) - 
- exp = df.set_index('A') - assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) - - exp['B'] = np.nan - assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A")) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A")) + assert_frame_equal(g.nth(2), df.loc[[]].set_index("A")) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A")) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A")) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A")) + assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]]) + assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A")) + + exp = df.set_index("A") + assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]]) + + exp["B"] = np.nan + assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]]) # out of bounds, regression from 0.13.1 # GH 6621 - df = DataFrame({'color': {0: 'green', - 1: 'green', - 2: 'red', - 3: 'red', - 4: 'red'}, - 'food': {0: 'ham', - 1: 'eggs', - 2: 'eggs', - 3: 'ham', - 4: 'pork'}, - 'two': {0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997}, - 'one': {0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997}}).set_index(['color', - 'food']) + df = DataFrame( + { + "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, + "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, + "two": { + 0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997, + }, + "one": { + 0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997, + }, + } + ).set_index(["color", "food"]) result = df.groupby(level=0, as_index=False).nth(2) expected = df.iloc[[-1]] @@ -146,7 +144,7 @@ def test_nth(): # GH 7559 # from the vbench - df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64") s = df[1] g = df[0] expected = s.groupby(g).first() @@ -164,93 +162,115 @@ def test_nth(): # as it keeps the order in the series (and not the group order) # related GH 7287 expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna='all') + result = s.groupby(g, sort=False).nth(0, dropna="all") assert_series_equal(result, expected) - with pytest.raises(ValueError, match='For a DataFrame groupby'): + with pytest.raises(ValueError, match="For a DataFrame groupby"): s.groupby(g, sort=False).nth(0, dropna=True) # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.B.nth(0, dropna='all') + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.B.nth(0, dropna="all") expected = g.B.first() assert_series_equal(result, expected) # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], - columns=['A', 'B']) - g = df.groupby('A') - - 
assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) - - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) + g = df.groupby("A") + + assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A")) + assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A")) + assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A")) + assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A")) + + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, fourth and last two business days for each month key = [df.index.year, df.index.month] result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) expected_dates = pd.to_datetime( - ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', - '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', - '2014/6/27', '2014/6/30']) - expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) + [ + "2014/4/1", + "2014/4/4", + "2014/4/29", + "2014/4/30", + "2014/5/1", + "2014/5/6", + "2014/5/29", + "2014/5/30", + "2014/6/2", + "2014/6/5", + "2014/6/27", + "2014/6/30", + ] + ) + expected = DataFrame(1, columns=["a", "b"], index=expected_dates) assert_frame_equal(result, expected) def test_nth_multi_index(three_group): # PR 9090, related to issue 8979 # test nth on MultiIndex, should match .first() - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = grouped.first() assert_frame_equal(result, expected) -@pytest.mark.parametrize('data, expected_first, expected_last', [ - ({'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}, - {'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}, - {'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}), - ({'id': ['A', 'B', 'A'], - 'time': [Timestamp('2012-01-01 13:00:00', - tz='America/New_York'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - Timestamp('2012-03-01 12:00:00', - tz='Europe/London')], - 'foo': [1, 2, 3]}, - {'id': ['A', 'B'], - 'time': [Timestamp('2012-01-01 13:00:00', - tz='America/New_York'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central')], - 'foo': [1, 2]}, - {'id': ['A', 'B'], - 'time': [Timestamp('2012-03-01 12:00:00', - tz='Europe/London'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central')], - 'foo': [3, 2]}) -]) +@pytest.mark.parametrize( + "data, expected_first, expected_last", + [ + ( + { + "id": ["A"], + 
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + ), + ( + { + "id": ["A", "B", "A"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + ], + "foo": [1, 2, 3], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [1, 2], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [3, 2], + }, + ), + ], +) def test_first_last_tz(data, expected_first, expected_last): # GH15884 # Test that the timezone is retained when calling first @@ -258,43 +278,50 @@ def test_first_last_tz(data, expected_first, expected_last): df = DataFrame(data) - result = df.groupby('id', as_index=False).first() + result = df.groupby("id", as_index=False).first() expected = DataFrame(expected_first) - cols = ['id', 'time', 'foo'] + cols = ["id", "time", "foo"] assert_frame_equal(result[cols], expected[cols]) - result = df.groupby('id', as_index=False)['time'].first() - assert_frame_equal(result, expected[['id', 'time']]) + result = df.groupby("id", as_index=False)["time"].first() + assert_frame_equal(result, expected[["id", "time"]]) - result = df.groupby('id', as_index=False).last() + result = df.groupby("id", as_index=False).last() expected = DataFrame(expected_last) - cols = ['id', 'time', 'foo'] + cols = ["id", "time", "foo"] assert_frame_equal(result[cols], expected[cols]) - result = df.groupby('id', as_index=False)['time'].last() - assert_frame_equal(result, expected[['id', 'time']]) + result = df.groupby("id", as_index=False)["time"].last() + assert_frame_equal(result, expected[["id", "time"]]) -@pytest.mark.parametrize('method, ts, alpha', [ - ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'], - ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b'] -]) +@pytest.mark.parametrize( + "method, ts, alpha", + [ + ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], + ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], + ], +) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 - category_string = pd.Series(list('abc')).astype( - 'category') - df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': category_string, - 'datetimetz': pd.date_range('20130101', periods=3, - tz='US/Eastern')}) - result = getattr(df.groupby('group'), method)() + category_string = pd.Series(list("abc")).astype("category") + df = pd.DataFrame( + { + "group": [1, 1, 2], + "category_string": category_string, + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + result = getattr(df.groupby("group"), method)() expected = pd.DataFrame( - {'category_string': pd.Categorical( - [alpha, 'c'], dtype=category_string.dtype), - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) + { + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), + "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], + }, + index=pd.Index([1, 2], name="group"), + ) assert_frame_equal(result, expected) @@ -302,35 +329,74 @@ def 
test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny']}) - grouped = three_group.groupby(['A', 'B']) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = DataFrame( - {'C': ['dull', 'dull', 'dull', 'dull']}, - index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], - ['one', 'two', 'one', 'two']], - names=['A', 'B'])) + {"C": ["dull", "dull", "dull", "dull"]}, + index=MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]], + names=["A", "B"], + ), + ) assert_frame_equal(result, expected) def test_groupby_head_tail(): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + g_as = df.groupby("A", as_index=True) + g_not_as = df.groupby("A", as_index=False) # as_index= False, much easier assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - empty_not_as = DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + empty_not_as = DataFrame( + columns=df.columns, index=pd.Index([], dtype=df.index.dtype) + ) + empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype) assert_frame_equal(empty_not_as, g_not_as.head(0)) assert_frame_equal(empty_not_as, g_not_as.tail(0)) assert_frame_equal(empty_not_as, g_not_as.head(-1)) @@ -346,8 +412,8 @@ def test_groupby_head_tail(): assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + empty_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_as["B"] = empty_not_as["B"].astype(df.B.dtype) assert_frame_equal(empty_as, g_as.head(0)) assert_frame_equal(empty_as, g_as.tail(0)) assert_frame_equal(empty_as, g_as.head(-1)) @@ -358,40 +424,40 @@ def test_groupby_head_tail(): # test with selection assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + assert_frame_equal(g_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + assert_frame_equal(g_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), 
df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) def test_group_selection_cache(): # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + expected = df.iloc[[0, 2]].set_index("A") - g = df.groupby('A') + g = df.groupby("A") result1 = g.head(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) - g = df.groupby('A') + g = df.groupby("A") result1 = g.tail(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) - g = df.groupby('A') + g = df.groupby("A") result1 = g.nth(0) result2 = g.head(n=2) assert_frame_equal(result1, expected) assert_frame_equal(result2, df) - g = df.groupby('A') + g = df.groupby("A") result1 = g.nth(0) result2 = g.tail(n=2) assert_frame_equal(result1, expected) @@ -400,54 +466,48 @@ def test_group_selection_cache(): def test_nth_empty(): # GH 16064 - df = DataFrame(index=[0], columns=['a', 'b', 'c']) - result = df.groupby('a').nth(10) - expected = DataFrame(index=Index([], name='a'), columns=['b', 'c']) + df = DataFrame(index=[0], columns=["a", "b", "c"]) + result = df.groupby("a").nth(10) + expected = DataFrame(index=Index([], name="a"), columns=["b", "c"]) assert_frame_equal(result, expected) - result = df.groupby(['a', 'b']).nth(10) - expected = DataFrame(index=MultiIndex([[], []], [[], []], - names=['a', 'b']), - columns=['c']) + result = df.groupby(["a", "b"]).nth(10) + expected = DataFrame( + index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"] + ) assert_frame_equal(result, expected) def test_nth_column_order(): # GH 20760 # Check that nth preserves column order - df = DataFrame([[1, 'b', 100], - [1, 'a', 50], - [1, 'a', np.nan], - [2, 'c', 200], - [2, 'd', 150]], - columns=['A', 'C', 'B']) - result = df.groupby('A').nth(0) - expected = DataFrame([['b', 100.0], - ['c', 200.0]], - columns=['C', 'B'], - index=Index([1, 2], name='A')) + df = DataFrame( + [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], + columns=["A", "C", "B"], + ) + result = df.groupby("A").nth(0) + expected = DataFrame( + [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) assert_frame_equal(result, expected) - result = df.groupby('A').nth(-1, dropna='any') - expected = DataFrame([['a', 50.0], - ['d', 150.0]], - columns=['C', 'B'], - index=Index([1, 2], name='A')) + result = df.groupby("A").nth(-1, dropna="any") + expected = DataFrame( + [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) assert_frame_equal(result, expected) -@pytest.mark.parametrize("dropna", [None, 'any', 'all']) +@pytest.mark.parametrize("dropna", [None, "any", "all"]) def test_nth_nan_in_grouper(dropna): # GH 26011 - df = DataFrame([ - [np.nan, 0, 1], - ['abc', 2, 3], - [np.nan, 4, 5], - ['def', 6, 7], - [np.nan, 8, 9], - ], columns=list('abc')) - result = df.groupby('a').nth(0, dropna=dropna) - expected = pd.DataFrame([[2, 3], [6, 7]], columns=list('bc'), - index=Index(['abc', 'def'], name='a')) + df = DataFrame( + [[np.nan, 0, 1], ["abc", 2, 3], [np.nan, 4, 5], ["def", 6, 7], [np.nan, 8, 9]], + 
columns=list("abc"), + ) + result = df.groupby("a").nth(0, dropna=dropna) + expected = pd.DataFrame( + [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 9b0396bb530a1d..a6ea793b53c413 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -12,275 +12,410 @@ def test_rank_apply(): lab1 = np.random.randint(0, 100, size=500) lab2 = np.random.randint(0, 130, size=500) - df = DataFrame({'value': np.random.randn(500), - 'key1': lev1.take(lab1), - 'key2': lev2.take(lab2)}) + df = DataFrame( + { + "value": np.random.randn(500), + "key1": lev1.take(lab1), + "key2": lev2.take(lab2), + } + ) - result = df.groupby(['key1', 'key2']).value.rank() + result = df.groupby(["key1", "key2"]).value.rank() - expected = [piece.value.rank() - for key, piece in df.groupby(['key1', 'key2'])] + expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] expected = concat(expected, axis=0) expected = expected.reindex(result.index) tm.assert_series_equal(result, expected) - result = df.groupby(['key1', 'key2']).value.rank(pct=True) + result = df.groupby(["key1", "key2"]).value.rank(pct=True) - expected = [piece.value.rank(pct=True) - for key, piece in df.groupby(['key1', 'key2'])] + expected = [ + piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) + ] expected = concat(expected, axis=0) expected = expected.reindex(result.index) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06')]]) -@pytest.mark.parametrize("ties_method,ascending,pct,exp", [ - ('average', True, False, [2., 2., 5., 2., 4.]), - ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), - ('average', False, False, [4., 4., 1., 4., 2.]), - ('average', False, True, [.8, .8, .2, .8, .4]), - ('min', True, False, [1., 1., 5., 1., 4.]), - ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), - ('min', False, False, [3., 3., 1., 3., 2.]), - ('min', False, True, [.6, .6, .2, .6, .4]), - ('max', True, False, [3., 3., 5., 3., 4.]), - ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), - ('max', False, False, [5., 5., 1., 5., 2.]), - ('max', False, True, [1., 1., .2, 1., .4]), - ('first', True, False, [1., 2., 5., 3., 4.]), - ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), - ('first', False, False, [3., 4., 1., 5., 2.]), - ('first', False, True, [.6, .8, .2, 1., .4]), - ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]), - ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. 
/ 3.]), -]) +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, 8, 2, 6], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,pct,exp", + [ + ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), + ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), + ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), + ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), + ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), + ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), + ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), + ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), + ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), + ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), + ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), + ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), + ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), + ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), + ], +) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, pct=pct) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) - exp_df = DataFrame(exp * len(grps), columns=['val']) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], -]) -@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ - ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), - ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), - ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), - ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), - ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), - ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), - ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), - ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), - ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), - ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), - ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), - ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), - ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), - ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), - ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), - ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), - ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), - ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), - ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), - ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), - ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), - ('first', False, 
'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), - ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), - ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), - ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), - ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), - ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), - ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), - ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), - ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) -]) +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,exp", + [ + ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), + ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), + ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), + ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), + ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), + ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), + ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), + ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), + ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), + ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), + ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), + ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), + ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), + ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), + ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), + ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), + ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), + ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), + ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), + ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), + ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), + ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), + ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), + ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), + ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), + ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), + ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), + ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), + ], +) def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): # GH 20561 key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option) - exp_df = DataFrame(exp * len(grps), columns=['val']) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option + ) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - 
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06'), np.nan, np.nan] -]) -@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ - ('average', True, 'keep', False, - [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), - ('average', True, 'keep', True, - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), - ('average', False, 'keep', False, - [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), - ('average', False, 'keep', True, - [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), - ('min', True, 'keep', False, - [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), - ('min', True, 'keep', True, - [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), - ('min', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('min', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('max', True, 'keep', False, - [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), - ('max', True, 'keep', True, - [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('max', False, 'keep', False, - [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), - ('max', False, 'keep', True, - [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('first', True, 'keep', False, - [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), - ('first', True, 'keep', True, - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('first', False, 'keep', False, - [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), - ('first', False, 'keep', True, - [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('dense', True, 'keep', False, - [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), - ('dense', True, 'keep', True, - [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]), - ('dense', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('dense', False, 'keep', True, - [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. 
/ 3., np.nan, np.nan]), - ('average', True, 'bottom', False, [2., 2., 7., 5., 2., 4., 7., 7.]), - ('average', True, 'bottom', True, - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), - ('average', False, 'bottom', False, [4., 4., 7., 1., 4., 2., 7., 7.]), - ('average', False, 'bottom', True, - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), - ('min', True, 'bottom', False, [1., 1., 6., 5., 1., 4., 6., 6.]), - ('min', True, 'bottom', True, - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), - ('min', False, 'bottom', False, [3., 3., 6., 1., 3., 2., 6., 6.]), - ('min', False, 'bottom', True, - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), - ('max', True, 'bottom', False, [3., 3., 8., 5., 3., 4., 8., 8.]), - ('max', True, 'bottom', True, - [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), - ('max', False, 'bottom', False, [5., 5., 8., 1., 5., 2., 8., 8.]), - ('max', False, 'bottom', True, - [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), - ('first', True, 'bottom', False, [1., 2., 6., 5., 3., 4., 7., 8.]), - ('first', True, 'bottom', True, - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), - ('first', False, 'bottom', False, [3., 4., 6., 1., 5., 2., 7., 8.]), - ('first', False, 'bottom', True, - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), - ('dense', True, 'bottom', False, [1., 1., 4., 3., 1., 2., 4., 4.]), - ('dense', True, 'bottom', True, - [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]), - ('dense', False, 'bottom', False, [3., 3., 4., 1., 3., 2., 4., 4.]), - ('dense', False, 'bottom', True, - [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.]) -]) -def test_rank_args_missing(grps, vals, ties_method, ascending, - na_option, pct, exp): +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + np.nan, + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + np.nan, + np.nan, + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,pct,exp", + [ + ( + "average", + True, + "keep", + False, + [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], + ), + ( + "average", + True, + "keep", + True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + False, + [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + True, + [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], + ), + ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), + ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ( + "min", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), + ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ( + "max", + False, + "keep", + False, + [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), + ( + "first", + True, + "keep", + False, + [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], + ), + ( + "first", + True, + "keep", + True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + False, + [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + True, + [0.6, 
0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + False, + [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + True, + [ + 1.0 / 3.0, + 1.0 / 3.0, + np.nan, + 3.0 / 3.0, + 1.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ( + "dense", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + False, + "keep", + True, + [ + 3.0 / 3.0, + 3.0 / 3.0, + np.nan, + 1.0 / 3.0, + 3.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), + ( + "average", + True, + "bottom", + True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + ), + ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), + ( + "average", + False, + "bottom", + True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + ), + ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), + ( + "min", + True, + "bottom", + True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], + ), + ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), + ( + "min", + False, + "bottom", + True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + ), + ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), + ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), + ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), + ( + "max", + False, + "bottom", + True, + [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], + ), + ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), + ( + "first", + True, + "bottom", + True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], + ), + ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), + ( + "first", + False, + "bottom", + True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], + ), + ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), + ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), + ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), + ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), + ], +) +def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) - exp_df = DataFrame(exp * len(grps), columns=['val']) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("pct,exp", [ - (False, [3., 3., 3., 3., 3.]), - (True, [.6, .6, .6, .6, .6])]) +@pytest.mark.parametrize( + "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] +) def test_rank_resets_each_group(pct, exp): df = DataFrame( - {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], - 'val': [1] * 10} + {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} ) - result = df.groupby('key').rank(pct=pct) - exp_df = DataFrame(exp * 2, columns=['val']) + result = df.groupby("key").rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=["val"]) tm.assert_frame_equal(result, 
exp_df) def test_rank_avg_even_vals(): - df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) - result = df.groupby('key').rank() - exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) + result = df.groupby("key").rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'] -]) -def test_rank_object_raises(ties_method, ascending, na_option, - pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) +@pytest.mark.parametrize( + "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] +) +def test_rank_object_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) with pytest.raises(TypeError, match="not callable"): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) @pytest.mark.parametrize("na_option", [True, "bad", 1]) -@pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'], - [1, np.nan, 2, np.nan, 3] -]) +@pytest.mark.parametrize( + "vals", + [ + ["bar", "bar", "foo", "bar", "baz"], + ["bar", np.nan, "foo", np.nan, "baz"], + [1, np.nan, 2, np.nan, 3], + ], +) def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + df = DataFrame({"key": ["foo"] * 5, "val": vals}) msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) def test_rank_empty_group(): # see gh-22519 column = "A" - df = DataFrame({ - "A": [0, 1, 0], - "B": [1., np.nan, 2.] 
- }) + df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) result = df.groupby(column).B.rank(pct=True) expected = Series([0.5, np.nan, 1.0], name="B") @@ -291,12 +426,15 @@ def test_rank_empty_group(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("input_key,input_value,output_value", [ - ([1, 2], [1, 1], [1.0, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), - ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]) -]) +@pytest.mark.parametrize( + "input_key,input_value,output_value", + [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), + ], +) def test_rank_zero_div(input_key, input_value, output_value): # GH 23666 df = DataFrame({"A": input_key, "B": input_value}) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 4ca470d316e5c8..d201b887739ec9 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -17,45 +17,47 @@ class TestGroupBy: - def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name - df_original = DataFrame({ - 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [ - datetime(2013, 9, 1, 13, 0), - datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 3, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 9, 2, 14, 0), - ] - }) + df_original = DataFrame( + { + "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) # GH 6908 change target column's order - df_reordered = df_original.sort_values(by='Quantity') + df_reordered = df_original.sort_values(by="Quantity") for df in [df_original, df_reordered]: - df = df.set_index(['Date']) + df = df.set_index(["Date"]) expected = DataFrame( - {'Quantity': 0}, - index=date_range('20130901', - '20131205', freq='5D', - name='Date', closed='left')) - expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') + {"Quantity": 0}, + index=date_range( + "20130901", "20131205", freq="5D", name="Date", closed="left" + ), + ) + expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample('5D') .sum() + result1 = df.resample("5D").sum() assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum() + result2 = df_sorted.groupby(pd.Grouper(freq="5D")).sum() assert_frame_equal(result2, expected) - result3 = df.groupby(pd.Grouper(freq='5D')).sum() + result3 = df.groupby(pd.Grouper(freq="5D")).sum() assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -63,25 +65,27 @@ def test_groupby_with_timegrouper_methods(self, should_sort): # GH 3881 # make sure API of timegrouper conforms - df = pd.DataFrame({ - 'Branch': 'A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 8, 9, 3], - 'Date': [ - datetime(2013, 1, 1, 13, 0), - datetime(2013, 1, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 12, 
2, 12, 0), - datetime(2013, 12, 2, 14, 0), - ] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A B".split(), + "Buyer": "Carl Mark Carl Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 8, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ) if should_sort: - df = df.sort_values(by='Quantity', ascending=False) + df = df.sort_values(by="Quantity", ascending=False) - df = df.set_index('Date', drop=False) - g = df.groupby(pd.Grouper(freq='6M')) + df = df.set_index("Date", drop=False) + g = df.groupby(pd.Grouper(freq="6M")) assert g.group_keys assert isinstance(g.grouper, BinGrouper) @@ -94,242 +98,291 @@ def test_timegrouper_with_reg_groups(self): # GH 3794 # allow combination of timegrouper/reg groups - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime(2013, 1, 1, 13, 0), - datetime(2013, 1, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 12, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) + df_original = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - datetime(2013, 12, 31, 0, 0), - datetime(2013, 12, 31, 0, 0), - datetime(2013, 12, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="A"), "Buyer"]).sum() assert_frame_equal(result, expected) - expected = DataFrame({ - 'Buyer': 'Carl Mark Carl Joe'.split(), - 'Quantity': [1, 3, 9, 18], - 'Date': [ - datetime(2013, 1, 1, 0, 0), - datetime(2013, 1, 1, 0, 0), - datetime(2013, 7, 1, 0, 0), - datetime(2013, 7, 1, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Mark Carl Joe".split(), + "Quantity": [1, 3, 9, 18], + "Date": [ + datetime(2013, 1, 1, 0, 0), + datetime(2013, 1, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + result = df.groupby([pd.Grouper(freq="6MS"), "Buyer"]).sum() assert_frame_equal(result, expected) - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime(2013, 10, 1, 13, 0), - datetime(2013, 10, 1, 
13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 2, 12, 0), - datetime(2013, 10, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) + df_original = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 10, 1, 13, 0), + datetime(2013, 10, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 10, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark Carl Joe'.split(), - 'Quantity': [6, 8, 3, 4, 10], - 'Date': [ - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 2, 0, 0), - datetime(2013, 10, 2, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark Carl Joe".split(), + "Quantity": [6, 8, 3, 4, 10], + "Date": [ + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 2, 0, 0), + datetime(2013, 10, 2, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="1D"), "Buyer"]).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - datetime(2013, 10, 31, 0, 0), - datetime(2013, 10, 31, 0, 0), - datetime(2013, 10, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) + result = df.groupby([pd.Grouper(freq="1M"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) assert_frame_equal(result, expected) # passing the name df = df.reset_index() - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() assert_frame_equal(result, expected) with pytest.raises(KeyError): - df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum() # passing the level - df = df.set_index('Date') - result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer' - ]).sum() + df = df.set_index("Date") + result = df.groupby([pd.Grouper(freq="1M", level="Date"), "Buyer"]).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum( - ) + result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum() assert_frame_equal(result, expected) with pytest.raises(ValueError): - df.groupby([pd.Grouper(freq='1M', level='foo'), - 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum() # multi names df = df.copy() - df['Date'] = df.index + pd.offsets.MonthEnd(2) - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ 
- datetime(2013, 11, 30, 0, 0), - datetime(2013, 11, 30, 0, 0), - datetime(2013, 11, 30, 0, 0), - ] - }).set_index(['Date', 'Buyer']) + df["Date"] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) assert_frame_equal(result, expected) # error as we have both a level and a name! with pytest.raises(ValueError): - df.groupby([pd.Grouper(freq='1M', key='Date', - level='Date'), 'Buyer']).sum() + df.groupby( + [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + ).sum() # single groupers - expected = DataFrame({'Quantity': [31], - 'Date': [datetime(2013, 10, 31, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M')).sum() + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 10, 31, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M")).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M')]).sum() + result = df.groupby([pd.Grouper(freq="1M")]).sum() assert_frame_equal(result, expected) - expected = DataFrame({'Quantity': [31], - 'Date': [datetime(2013, 11, 30, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum() + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 11, 30, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M", key="Date")).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() + result = df.groupby([pd.Grouper(freq="1M", key="Date")]).sum() assert_frame_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR']) + @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort - df = DataFrame({ - 'date': pd.to_datetime([ - '20121002', '20121007', '20130130', '20130202', '20130305', - '20121002', '20121207', '20130130', '20130202', '20130305', - '20130202', '20130305' - ]), - 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], - 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, - 359, 801], - 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] - }).set_index('date') + df = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + 364, + 280, + 259, + 201, + 623, + 90, + 312, + 359, + 301, + 359, + 801, + ], + "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12], + } + ).set_index("date") expected = ( - df.groupby('user_id')['whole_cost'] - .resample(freq) - .sum(min_count=1) # XXX - .dropna() - .reorder_levels(['date', 'user_id']) - .sort_index() - .astype('int64') + df.groupby("user_id")["whole_cost"] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(["date", "user_id"]) + .sort_index() + .astype("int64") ) - expected.name = 'whole_cost' + expected.name = "whole_cost" - result1 = df.sort_index().groupby([pd.Grouper(freq=freq), - 'user_id'])['whole_cost'].sum() + result1 = ( + df.sort_index() + .groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"] + .sum() + ) 
assert_series_equal(result1, expected) - result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ - 'whole_cost'].sum() + result2 = df.groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"].sum() assert_series_equal(result2, expected) def test_timegrouper_get_group(self): # GH 6914 - df_original = DataFrame({ - 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [datetime(2013, 9, 1, 13, 0), - datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 3, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 9, 2, 14, 0), ] - }) - df_reordered = df_original.sort_values(by='Quantity') + df_original = DataFrame( + { + "Buyer": "Carl Joe Joe Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + df_reordered = df_original.sort_values(by="Quantity") # single grouping - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] - dt_list = ['2013-09-30', '2013-10-31', '2013-12-31'] + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] + dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M', key='Date')) + grouped = df.groupby(pd.Grouper(freq="M", key="Date")) for t, expected in zip(dt_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group(dt) assert_frame_equal(result, expected) # multiple grouping - expected_list = [df_original.iloc[[1]], df_original.iloc[[3]], - df_original.iloc[[4]]] - g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), - ('Joe', '2013-12-31')] + expected_list = [ + df_original.iloc[[1]], + df_original.iloc[[3]], + df_original.iloc[[4]], + ] + g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] for df in [df_original, df_reordered]: - grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')]) + grouped = df.groupby(["Buyer", pd.Grouper(freq="M", key="Date")]) for (b, t), expected in zip(g_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group((b, dt)) assert_frame_equal(result, expected) # with index - df_original = df_original.set_index('Date') - df_reordered = df_original.sort_values(by='Quantity') + df_original = df_original.set_index("Date") + df_reordered = df_original.sort_values(by="Quantity") - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M')) + grouped = df.groupby(pd.Grouper(freq="M")) for t, expected in zip(dt_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group(dt) @@ -339,44 +392,43 @@ def test_timegrouper_apply_return_type_series(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. 
# Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) + df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_series(x): - return pd.Series([x['value'].sum()], ('sum',)) + return pd.Series([x["value"].sum()], ("sum",)) - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) - result = (df_dt.groupby(pd.Grouper(freq='M', key='date')) - .apply(sumfunc_series)) - assert_frame_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series) + assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) + df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_value(x): return x.value.sum() - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - result = (df_dt.groupby(Grouper(freq='M', key='date')) - .apply(sumfunc_value)) - assert_series_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + assert_series_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) def test_groupby_groups_datetimeindex(self): # GH#1430 periods = 1000 - ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange(periods), - 'low': np.arange(periods)}, index=ind) + ind = pd.date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame( + {"high": np.arange(periods), "low": np.arange(periods)}, index=ind + ) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) # it works! 
@@ -384,76 +436,94 @@ def test_groupby_groups_datetimeindex(self): assert isinstance(list(groups.keys())[0], datetime) # GH#11442 - index = pd.date_range('2015/01/01', periods=5, name='date') - df = pd.DataFrame({'A': [5, 6, 7, 8, 9], - 'B': [1, 2, 3, 4, 5]}, index=index) - result = df.groupby(level='date').groups - dates = ['2015-01-05', '2015-01-04', '2015-01-03', - '2015-01-02', '2015-01-01'] - expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') - for date in dates} + index = pd.date_range("2015/01/01", periods=5, name="date") + df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) + result = df.groupby(level="date").groups + dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"] + expected = { + pd.Timestamp(date): pd.DatetimeIndex([date], name="date") for date in dates + } tm.assert_dict_equal(result, expected) - grouped = df.groupby(level='date') + grouped = df.groupby(level="date") for date in dates: result = grouped.get_group(date) - data = [[df.loc[date, 'A'], df.loc[date, 'B']]] - expected_index = pd.DatetimeIndex([date], name='date') - expected = pd.DataFrame(data, - columns=list('AB'), - index=expected_index) + data = [[df.loc[date, "A"], df.loc[date, "B"]]] + expected_index = pd.DatetimeIndex([date], name="date") + expected = pd.DataFrame(data, columns=list("AB"), index=expected_index) tm.assert_frame_equal(result, expected) def test_groupby_groups_datetimeindex_tz(self): # GH 3950 - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'datetime': dates, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['datetime'] = df['datetime'].apply( - lambda d: Timestamp(d, tz='US/Pacific')) - - exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - tz='US/Pacific', name='datetime') - exp_idx2 = Index(['a', 'b'] * 3, name='label') + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": dates, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + + exp_idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Pacific", + name="datetime", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) - result = df.groupby(['datetime', 'label']).sum() + result = df.groupby(["datetime", "label"]).sum() assert_frame_equal(result, expected) # by level - didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) + didx = pd.DatetimeIndex(dates, tz="Asia/Tokyo") + df = DataFrame( + {"value1": np.arange(6, 
dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='Asia/Tokyo') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="Asia/Tokyo", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) result = df.groupby(level=0).sum() assert_frame_equal(result, expected) def test_frame_datetime64_handling_groupby(self): # it works! - df = DataFrame([(3, np.datetime64('2012-07-03')), - (3, np.datetime64('2012-07-04'))], - columns=['a', 'date']) - result = df.groupby('a').first() - assert result['date'][3] == Timestamp('2012-07-03') + df = DataFrame( + [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))], + columns=["a", "date"], + ) + result = df.groupby("a").first() + assert result["date"][3] == Timestamp("2012-07-03") def test_groupby_multi_timezone(self): @@ -465,77 +535,99 @@ def test_groupby_multi_timezone(self): 3,2000-01-31 16:50:00,America/Chicago 4,2000-01-01 16:50:00,America/New_York""" - df = pd.read_csv(StringIO(data), header=None, - names=['value', 'date', 'tz']) - result = df.groupby('tz').date.apply( - lambda x: pd.to_datetime(x).dt.tz_localize(x.name)) - - expected = Series([Timestamp('2000-01-28 16:47:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-29 16:48:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-30 16:49:00-0800', - tz='America/Los_Angeles'), - Timestamp('2000-01-31 16:50:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-01 16:50:00-0500', - tz='America/New_York')], - name='date', - dtype=object) + df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) + result = df.groupby("tz").date.apply( + lambda x: pd.to_datetime(x).dt.tz_localize(x.name) + ) + + expected = Series( + [ + Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"), + Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"), + Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"), + Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"), + Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"), + ], + name="date", + dtype=object, + ) assert_series_equal(result, expected) - tz = 'America/Chicago' - res_values = df.groupby('tz').date.get_group(tz) + tz = "America/Chicago" + res_values = df.groupby("tz").date.get_group(tz) result = pd.to_datetime(res_values).dt.tz_localize(tz) - exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00', - '2000-01-31 16:50:00'], - index=[0, 1, 3], name='date') + exp_values = Series( + ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"], + index=[0, 1, 3], + name="date", + ) expected = pd.to_datetime(exp_values).dt.tz_localize(tz) assert_series_equal(result, expected) def test_groupby_groups_periods(self): - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'period': [pd.Period(d, freq='H') for d in dates], - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - - exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - 
freq='H', name='period') - exp_idx2 = Index(['a', 'b'] * 3, name='label') + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "period": [pd.Period(d, freq="H") for d in dates], + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + + exp_idx1 = pd.PeriodIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + freq="H", + name="period", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) - result = df.groupby(['period', 'label']).sum() + result = df.groupby(["period", "label"]).sum() assert_frame_equal(result, expected) # by level - didx = pd.PeriodIndex(dates, freq='H') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) + didx = pd.PeriodIndex(dates, freq="H") + df = DataFrame( + {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) - exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], freq='H') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) + exp_idx = pd.PeriodIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + freq="H", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) result = df.groupby(level=0).sum() assert_frame_equal(result, expected) def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view('M8[ns]') + df[1] = df[1].view("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) @@ -550,50 +642,52 @@ def test_groupby_first_datetime64(self): def test_groupby_max_datetime64(self): # GH 5869 # datetimelike dtype conversion from int - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = df.groupby('A')['A'].apply(lambda x: x.max()) - result = df.groupby('A')['A'].max() + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + expected = df.groupby("A")["A"].apply(lambda x: x.max()) + result = df.groupby("A")["A"].max() assert_series_equal(result, expected) def test_groupby_datetime64_32_bit(self): # GH 6410 / numpy 4328 # 32-bit under 1.9-dev indexing issue - df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) + df = DataFrame({"A": range(2), "B": [pd.Timestamp("2000-01-1")] * 2}) result = df.groupby("A")["B"].transform(min) - expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') + expected = Series([pd.Timestamp("2000-01-1")] * 2, name="B") assert_series_equal(result, expected) def test_groupby_with_timezone_selection(self): # GH 11616 # Test that column selection returns output in correct timezone. 
np.random.seed(42) - df = pd.DataFrame({ - 'factor': np.random.randint(0, 3, size=60), - 'time': pd.date_range('01/01/2000 00:00', periods=60, - freq='s', tz='UTC') - }) - df1 = df.groupby('factor').max()['time'] - df2 = df.groupby('factor')['time'].max() + df = pd.DataFrame( + { + "factor": np.random.randint(0, 3, size=60), + "time": pd.date_range( + "01/01/2000 00:00", periods=60, freq="s", tz="UTC" + ), + } + ) + df1 = df.groupby("factor").max()["time"] + df2 = df.groupby("factor")["time"].max() tm.assert_series_equal(df1, df2) def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]}) - assert df['b'][0].tzinfo == pytz.utc - df = pd.DataFrame({'a': [1, 2, 3]}) - df['b'] = datetime.now(pytz.utc) - assert df['b'][0].tzinfo == pytz.utc + df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) + assert df["b"][0].tzinfo == pytz.utc + df = pd.DataFrame({"a": [1, 2, 3]}) + df["b"] = datetime.now(pytz.utc) + assert df["b"][0].tzinfo == pytz.utc def test_datetime_count(self): - df = DataFrame({'a': [1, 2, 3] * 2, - 'dates': pd.date_range('now', periods=6, freq='T')}) - result = df.groupby('a').dates.count() - expected = Series([ - 2, 2, 2 - ], index=Index([1, 2, 3], name='a'), name='dates') + df = DataFrame( + {"a": [1, 2, 3] * 2, "dates": pd.date_range("now", periods=6, freq="T")} + ) + result = df.groupby("a").dates.count() + expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") tm.assert_series_equal(result, expected) def test_first_last_max_min_on_time_data(self): @@ -601,16 +695,25 @@ def test_first_last_max_min_on_time_data(self): # Verify that NaT is not in the result of max, min, first and last on # Dataframe with datetime or timedelta values. 
from datetime import timedelta as td + df_test = DataFrame( - {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11', - '2015-07-23 12:12', nan], - 'td': [nan, td(days=1), td(days=2), td(days=3), nan]}) + { + "dt": [ + nan, + "2015-07-24 10:10", + "2015-07-25 11:11", + "2015-07-23 12:12", + nan, + ], + "td": [nan, td(days=1), td(days=2), td(days=3), nan], + } + ) df_test.dt = pd.to_datetime(df_test.dt) - df_test['group'] = 'A' + df_test["group"] = "A" df_ref = df_test[df_test.dt.notna()] - grouped_test = df_test.groupby('group') - grouped_ref = df_ref.groupby('group') + grouped_test = df_test.groupby("group") + grouped_ref = df_ref.groupby("group") assert_frame_equal(grouped_ref.max(), grouped_test.max()) assert_frame_equal(grouped_ref.min(), grouped_test.min()) @@ -619,28 +722,34 @@ def test_first_last_max_min_on_time_data(self): def test_nunique_with_timegrouper_and_nat(self): # GH 17575 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - pd.NaT, - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}) - - grouper = pd.Grouper(key='time', freq='h') - result = test.groupby(grouper)['data'].nunique() - expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + pd.NaT, + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ) + + grouper = pd.Grouper(key="time", freq="h") + result = test.groupby(grouper)["data"].nunique() + expected = test[test.time.notnull()].groupby(grouper)["data"].nunique() tm.assert_series_equal(result, expected) def test_scalar_call_versus_list_call(self): # Issue: 17530 data_frame = { - 'location': ['shanghai', 'beijing', 'shanghai'], - 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', - '2017-08-11 22:23:15'], - dtype='datetime64[ns]'), - 'value': [1, 2, 3] + "location": ["shanghai", "beijing", "shanghai"], + "time": pd.Series( + ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], + dtype="datetime64[ns]", + ), + "value": [1, 2, 3], } - data_frame = pd.DataFrame(data_frame).set_index('time') - grouper = pd.Grouper(freq='D') + data_frame = pd.DataFrame(data_frame).set_index("time") + grouper = pd.Grouper(freq="D") grouped = data_frame.groupby(grouper) result = grouped.count() diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6ed2e178a7fc78..705e4080cf34e3 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -10,7 +10,14 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + concat, + date_range, +) from pandas.core.groupby.groupby import DataError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -36,79 +43,92 @@ def test_transform(): # make sure that we preserve the input order df = DataFrame( - np.arange(6, dtype='int64').reshape( - 3, 2), columns=["a", "b"], index=[0, 2, 1]) + np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1] + ) key = [0, 0, 1] - expected = df.sort_index().groupby(key).transform( - lambda x: x - x.mean()).groupby(key).mean() - result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( - key).mean() + expected = ( + df.sort_index() + .groupby(key) + .transform(lambda x: x - x.mean()) + .groupby(key) + .mean() + ) + result = df.groupby(key).transform(lambda x: x 
- x.mean()).groupby(key).mean() assert_frame_equal(result, expected) def demean(arr): return arr - arr.mean() - people = DataFrame(np.random.randn(5, 5), - columns=['a', 'b', 'c', 'd', 'e'], - index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) - key = ['one', 'two', 'one', 'two', 'one'] + people = DataFrame( + np.random.randn(5, 5), + columns=["a", "b", "c", "d", "e"], + index=["Joe", "Steve", "Wes", "Jim", "Travis"], + ) + key = ["one", "two", "one", "two", "one"] result = people.groupby(key).transform(demean).groupby(key).mean() expected = people.groupby(key).apply(demean).groupby(key).mean() assert_frame_equal(result, expected) # GH 8430 df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq='M')) + g = df.groupby(pd.Grouper(freq="M")) g.transform(lambda x: x - 1) # GH 9700 - df = DataFrame({'a': range(5, 10), 'b': range(5)}) - result = df.groupby('a').transform(max) - expected = DataFrame({'b': range(5)}) + df = DataFrame({"a": range(5, 10), "b": range(5)}) + result = df.groupby("a").transform(max) + expected = DataFrame({"b": range(5)}) tm.assert_frame_equal(result, expected) def test_transform_fast(): - df = DataFrame({'id': np.arange(100000) / 3, - 'val': np.random.randn(100000)}) + df = DataFrame({"id": np.arange(100000) / 3, "val": np.random.randn(100000)}) - grp = df.groupby('id')['val'] + grp = df.groupby("id")["val"] - values = np.repeat(grp.mean().values, - ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name='val') + values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) + expected = pd.Series(values, index=df.index, name="val") result = grp.transform(np.mean) assert_series_equal(result, expected) - result = grp.transform('mean') + result = grp.transform("mean") assert_series_equal(result, expected) # GH 12737 - df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], - 'd': pd.date_range('2014-1-1', '2014-1-4'), - 'i': [1, 2, 3, 4]}, - columns=['grouping', 'f', 'i', 'd']) - result = df.groupby('grouping').transform('first') - - dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] - expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], - 'd': dates, - 'i': [1, 2, 2, 4]}, - columns=['f', 'i', 'd']) + df = pd.DataFrame( + { + "grouping": [0, 1, 1, 3], + "f": [1.1, 2.1, 3.1, 4.5], + "d": pd.date_range("2014-1-1", "2014-1-4"), + "i": [1, 2, 3, 4], + }, + columns=["grouping", "f", "i", "d"], + ) + result = df.groupby("grouping").transform("first") + + dates = [ + pd.Timestamp("2014-1-1"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.DataFrame( + {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, + columns=["f", "i", "d"], + ) assert_frame_equal(result, expected) # selection - result = df.groupby('grouping')[['f', 'i']].transform('first') - expected = expected[['f', 'i']] + result = df.groupby("grouping")[["f", "i"]].transform("first") + expected = expected[["f", "i"]] assert_frame_equal(result, expected) # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) - result = df.groupby('g').transform('first') - expected = df.drop('g', axis=1) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) + result = df.groupby("g").transform("first") + expected = df.drop("g", axis=1) assert_frame_equal(result, expected) @@ -130,8 +150,7 @@ def test_transform_broadcast(tsframe, ts): assert_fp_equal(res[col], agged[col]) # group columns - grouped = 
tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis=1) + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) tm.assert_index_equal(result.columns, tsframe.columns) @@ -152,33 +171,32 @@ def test_transform_axis(tsframe): base = tsframe.iloc[0:5] r = len(base.index) c = len(base.columns) - tso = DataFrame(np.random.randn(r, c), - index=base.index, - columns=base.columns, - dtype='float64') + tso = DataFrame( + np.random.randn(r, c), index=base.index, columns=base.columns, dtype="float64" + ) # monotonic ts = tso grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) assert_frame_equal(result, expected) ts = ts.T grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) assert_frame_equal(result, expected) # non-monotonic ts = tso.iloc[[1, 0] + list(range(2, len(base)))] grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) assert_frame_equal(result, expected) ts = ts.T grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) assert_frame_equal(result, expected) @@ -187,7 +205,7 @@ def test_transform_dtype(): # GH 9807 # Check transform dtype output is preserved df = DataFrame([[1, 3], [2, 3]]) - result = df.groupby(1).transform('mean') + result = df.groupby(1).transform("mean") expected = DataFrame([[1.5], [1.5]]) assert_frame_equal(result, expected) @@ -195,63 +213,63 @@ def test_transform_dtype(): def test_transform_bug(): # GH 5712 # transforming on a datetime column - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - result = df.groupby('A')['B'].transform( - lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name='B') + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name="B") assert_series_equal(result, expected) def test_transform_numeric_to_boolean(): # GH 16875 # inconsistency in transforming boolean values - expected = pd.Series([True, True], name='A') + expected = pd.Series([True, True], name="A") - df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) + df = pd.DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) assert_series_equal(result, expected) - df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) + df = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) assert_series_equal(result, expected) def test_transform_datetime_to_timedelta(): # GH 15429 # transforming a datetime to timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = pd.Series([ - Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + expected = pd.Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A") # this does date 
math without changing result type in transform - base_time = df['A'][0] - result = df.groupby('A')['A'].transform( - lambda x: x.max() - x.min() + base_time) - base_time + base_time = df["A"][0] + result = ( + df.groupby("A")["A"].transform(lambda x: x.max() - x.min() + base_time) + - base_time + ) assert_series_equal(result, expected) # this does date math and causes the transform to return timedelta - result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) + result = df.groupby("A")["A"].transform(lambda x: x.max() - x.min()) assert_series_equal(result, expected) def test_transform_datetime_to_numeric(): # GH 10972 # convert dt to float - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean() + ) - expected = Series([-0.5, 0.5], name='b') + expected = Series([-0.5, 0.5], name="b") assert_series_equal(result, expected) # convert dt to int - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min() + ) - expected = Series([0, 1], name='b') + expected = Series([0, 1], name="b") assert_series_equal(result, expected) @@ -271,14 +289,14 @@ def test_transform_casting(): 9 B-053 b76cd912ff "2014-10-08 19:17:48" 10 B-065 b76cd912ff "2014-10-08 19:21:38" """ - df = pd.read_csv(StringIO(data), sep=r'\s+', - index_col=[0], parse_dates=['DATETIME']) + df = pd.read_csv( + StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"] + ) - result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) + result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.dtype) - result = df[['ID3', 'DATETIME']].groupby('ID3').transform( - lambda x: x.diff()) + result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.DATETIME.dtype) @@ -294,18 +312,18 @@ def test_dispatch_transform(tsframe): grouped = df.groupby(lambda x: x.month) - filled = grouped.fillna(method='pad') - fillit = lambda x: x.fillna(method='pad') + filled = grouped.fillna(method="pad") + fillit = lambda x: x.fillna(method="pad") expected = df.groupby(lambda x: x.month).transform(fillit) assert_frame_equal(filled, expected) def test_transform_select_columns(df): f = lambda x: x.mean() - result = df.groupby('A')['C', 'D'].transform(f) + result = df.groupby("A")["C", "D"].transform(f) - selection = df[['C', 'D']] - expected = selection.groupby(df['A']).transform(f) + selection = df[["C", "D"]] + expected = selection.groupby(df["A"]).transform(f) assert_frame_equal(result, expected) @@ -315,48 +333,55 @@ def test_transform_exclude_nuisance(df): # this also tests orderings in transform between # series/frame to make sure it's consistent expected = {} - grouped = df.groupby('A') - expected['C'] = grouped['C'].transform(np.mean) - expected['D'] = grouped['D'].transform(np.mean) + grouped = df.groupby("A") + expected["C"] = grouped["C"].transform(np.mean) + expected["D"] = grouped["D"].transform(np.mean) expected = DataFrame(expected) - result = 
df.groupby('A').transform(np.mean) + result = df.groupby("A").transform(np.mean) assert_frame_equal(result, expected) def test_transform_function_aliases(df): - result = df.groupby('A').transform('mean') - expected = df.groupby('A').transform(np.mean) + result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) assert_frame_equal(result, expected) - result = df.groupby('A')['C'].transform('mean') - expected = df.groupby('A')['C'].transform(np.mean) + result = df.groupby("A")["C"].transform("mean") + expected = df.groupby("A")["C"].transform(np.mean) assert_series_equal(result, expected) def test_series_fast_transform_date(): # GH 13191 - df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], - 'd': pd.date_range('2014-1-1', '2014-1-4')}) - result = df.groupby('grouping')['d'].transform('first') - dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-4')] - expected = pd.Series(dates, name='d') + df = pd.DataFrame( + {"grouping": [np.nan, 1, 1, 3], "d": pd.date_range("2014-1-1", "2014-1-4")} + ) + result = df.groupby("grouping")["d"].transform("first") + dates = [ + pd.NaT, + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.Series(dates, name="d") assert_series_equal(result, expected) def test_transform_length(): # GH 9697 - df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) + df = pd.DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) expected = pd.Series([3.0] * 4) def nsum(x): return np.nansum(x) - results = [df.groupby('col1').transform(sum)['col2'], - df.groupby('col1')['col2'].transform(sum), - df.groupby('col1').transform(nsum)['col2'], - df.groupby('col1')['col2'].transform(nsum)] + results = [ + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), + ] for result in results: assert_series_equal(result, expected, check_names=False) @@ -366,8 +391,8 @@ def test_transform_coercion(): # 14457 # when we are transforming be sure to not coerce # via assignment - df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) - g = df.groupby('A') + df = pd.DataFrame(dict(A=["a", "a"], B=[0, 1])) + g = df.groupby("A") expected = g.transform(np.mean) result = g.transform(lambda x: np.mean(x)) @@ -379,31 +404,33 @@ def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform # floats - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'), - C=Series( - [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=Series( - [-1, 0, 1, -1, 0, 1], dtype='float64'))) + df = DataFrame( + dict( + A=[1, 1, 1, 2, 2, 2], + B=Series(1, dtype="float64"), + C=Series([1, 2, 3, 1, 2, 3], dtype="float64"), + D="foo", + ) + ) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + expected = DataFrame( + dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) + ) assert_frame_equal(result, expected) # int case - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, - C=[1, 2, 3, 1, 2, 3], D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo")) + with 
np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) assert_frame_equal(result, expected) # int that needs float conversion s = Series([2, 3, 4, 10, 5, -1]) - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo")) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) s1 = s.iloc[0:3] s1 = (s1 - s1.mean()) / s1.std() @@ -413,39 +440,43 @@ def test_groupby_transform_with_int(): assert_frame_equal(result, expected) # int downcasting - result = df.groupby('A').transform(lambda x: x * 2 / 2) + result = df.groupby("A").transform(lambda x: x * 2 / 2) expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) assert_frame_equal(result, expected) def test_groupby_transform_with_nan_group(): # GH 9941 - df = pd.DataFrame({'a': range(10), - 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)['a'].transform(max) - expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], - name='a') + df = pd.DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)["a"].transform(max) + expected = pd.Series( + [1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a" + ) assert_series_equal(result, expected) def test_transform_mixed_type(): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) def f(group): - group['g'] = group['d'] * 2 + group["g"] = group["d"] * 2 return group[:1] - grouped = df.groupby('c') + grouped = df.groupby("c") result = grouped.apply(f) - assert result['d'].dtype == np.float64 + assert result["d"].dtype == np.float64 # this is by definition a mutating operation! 
- with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): for key, group in grouped: res = f(group) assert_frame_equal(res, result.loc[key]) @@ -474,8 +505,7 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): ngroups = 1 pd_op(ans, data, labels, ngroups, is_datetimelike) - tm.assert_numpy_array_equal(np_op(data), ans[:, 0], - check_dtype=False) + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) def test_cython_group_transform_cumsum(any_real_dtype): @@ -500,38 +530,45 @@ def test_cython_group_transform_algos(): labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) ngroups = 1 - data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') + data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumprod_float64(actual, data, labels, ngroups, - is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') + groupby.group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) actual = np.zeros_like(data) actual.fill(np.nan) groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') + expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) # timedelta is_datetimelike = True - data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] - actual = np.zeros_like(data, dtype='int64') - groupby.group_cumsum(actual, data.view('int64'), labels, - ngroups, is_datetimelike) - expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( - 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), - np.timedelta64(5, 'ns')]) - tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) + data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] + actual = np.zeros_like(data, dtype="int64") + groupby.group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + expected = np.array( + [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + ) + tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) @pytest.mark.parametrize( "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) def test_cython_transform_series(op, args, targop): # GH 4095 s = Series(np.random.randn(1000)) @@ -544,64 +581,104 @@ def test_cython_transform_series(op, args, targop): # print(data.head()) expected = data.groupby(labels).transform(targop) - tm.assert_series_equal( - expected, - data.groupby(labels).transform(op, *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) + tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) -@pytest.mark.parametrize("op", ['cumprod', 'cumsum']) +@pytest.mark.parametrize("op", ["cumprod", "cumsum"]) @pytest.mark.parametrize("skipna", [False, True]) -@pytest.mark.parametrize('input, 
exp', [ - # When everything is NaN - ({'key': ['b'] * 10, 'value': np.nan}, - pd.Series([np.nan] * 10, name='value')), - # When there is a single NaN - ({'key': ['b'] * 10 + ['a'] * 2, - 'value': [3] * 3 + [np.nan] + [3] * 8}, - {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], - ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., - 2187., 6561., 19683., 3.0, 9.0], - ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], - ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., - 21., 24., 27., 3.0, 6.0]})]) +@pytest.mark.parametrize( + "input, exp", + [ + # When everything is NaN + ({"key": ["b"] * 10, "value": np.nan}, pd.Series([np.nan] * 10, name="value")), + # When there is a single NaN + ( + {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8}, + { + ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ("cumprod", True): [ + 3.0, + 9.0, + 27.0, + np.nan, + 81.0, + 243.0, + 729.0, + 2187.0, + 6561.0, + 19683.0, + 3.0, + 9.0, + ], + ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ("cumsum", True): [ + 3.0, + 6.0, + 9.0, + np.nan, + 12.0, + 15.0, + 18.0, + 21.0, + 24.0, + 27.0, + 3.0, + 6.0, + ], + }, + ), + ], +) def test_groupby_cum_skipna(op, skipna, input, exp): df = pd.DataFrame(input) - result = df.groupby('key')['value'].transform(op, skipna=skipna) + result = df.groupby("key")["value"].transform(op, skipna=skipna) if isinstance(exp, dict): expected = exp[(op, skipna)] else: expected = exp - expected = pd.Series(expected, name='value') + expected = pd.Series(expected, name="value") tm.assert_series_equal(expected, result) @pytest.mark.parametrize( "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) def test_cython_transform_frame(op, args, targop): s = Series(np.random.randn(1000)) s_missing = s.copy() s_missing.iloc[2:10] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) - strings = list('qwertyuiopasdfghjklz') + strings = list("qwertyuiopasdfghjklz") strings_missing = strings[:] strings_missing[5] = np.nan - df = DataFrame({'float': s, - 'float_missing': s_missing, - 'int': [1, 1, 1, 1, 2] * 200, - 'datetime': pd.date_range('1990-1-1', periods=1000), - 'timedelta': pd.timedelta_range(1, freq='s', - periods=1000), - 'string': strings * 50, - 'string_missing': strings_missing * 50}, - columns=['float', 'float_missing', 'int', 'datetime', - 'timedelta', 'string', 'string_missing']) - df['cat'] = df['string'].astype('category') + df = DataFrame( + { + "float": s, + "float_missing": s_missing, + "int": [1, 1, 1, 1, 2] * 200, + "datetime": pd.date_range("1990-1-1", periods=1000), + "timedelta": pd.timedelta_range(1, freq="s", periods=1000), + "string": strings * 50, + "string_missing": strings_missing * 50, + }, + columns=[ + "float", + "float_missing", + "int", + "datetime", + "timedelta", + "string", + "string_missing", + ], + ) + df["cat"] = df["string"].astype("category") df2 = df.copy() df2.index = pd.MultiIndex.from_product([range(100), range(10)]) @@ -609,37 +686,35 @@ def test_cython_transform_frame(op, args, targop): # DataFrame - Single and MultiIndex, # group by values, index level, columns for df in [df, df2]: - for gb_target in [dict(by=labels), dict(level=0), 
dict(by='string') - ]: # dict(by='string_missing')]: + for gb_target in [ + dict(by=labels), + dict(level=0), + dict(by="string"), + ]: # dict(by='string_missing')]: # dict(by=['int','string'])]: gb = df.groupby(**gb_target) # whitelisted methods set the selection before applying # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior - if op == 'shift': + if op == "shift": gb._set_group_selection() - if op != 'shift' and 'int' not in gb_target: + if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have # to apply separately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) + i = gb[["int"]].apply(targop) + f = gb[["float", "float_missing"]].apply(targop) expected = pd.concat([f, i], axis=1) else: expected = gb.apply(targop) expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal( - expected, - getattr(gb, op)(*args).sort_index(axis=1)) + tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) + tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': + if c not in ["float", "int", "float_missing"] and op != "shift": msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): gb[c].transform(op) @@ -648,84 +723,102 @@ def test_cython_transform_frame(op, args, targop): else: expected = gb[c].apply(targop) expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - getattr(gb[c], op)(*args)) + tm.assert_series_equal(expected, gb[c].transform(op, *args)) + tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) def test_transform_with_non_scalar_group(): # GH 10165 - cols = pd.MultiIndex.from_tuples([ - ('syn', 'A'), ('mis', 'A'), ('non', 'A'), - ('syn', 'C'), ('mis', 'C'), ('non', 'C'), - ('syn', 'T'), ('mis', 'T'), ('non', 'T'), - ('syn', 'G'), ('mis', 'G'), ('non', 'G')]) - df = pd.DataFrame(np.random.randint(1, 10, (4, 12)), - columns=cols, - index=['A', 'C', 'G', 'T']) - - msg = 'transform must return a scalar value for each group.*' + cols = pd.MultiIndex.from_tuples( + [ + ("syn", "A"), + ("mis", "A"), + ("non", "A"), + ("syn", "C"), + ("mis", "C"), + ("non", "C"), + ("syn", "T"), + ("mis", "T"), + ("non", "T"), + ("syn", "G"), + ("mis", "G"), + ("non", "G"), + ] + ) + df = pd.DataFrame( + np.random.randint(1, 10, (4, 12)), columns=cols, index=["A", "C", "G", "T"] + ) + + msg = "transform must return a scalar value for each group.*" with pytest.raises(ValueError, match=msg): - df.groupby(axis=1, level=1).transform( - lambda z: z.div(z.sum(axis=1), axis=0)) + df.groupby(axis=1, level=1).transform(lambda z: z.div(z.sum(axis=1), axis=0)) -@pytest.mark.parametrize('cols,exp,comp_func', [ - ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal), - (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}), - tm.assert_frame_equal) -]) -@pytest.mark.parametrize('agg_func', [ - 'count', 'rank', 'size']) +@pytest.mark.parametrize( + "cols,exp,comp_func", + [ + ("a", pd.Series([1, 1, 1], name="a"), tm.assert_series_equal), + ( + ["a", "c"], + pd.DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), + tm.assert_frame_equal, + ), + ], +) +@pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) def test_transform_numeric_ret(cols, exp, comp_func, agg_func): - if 
agg_func == 'size' and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with " - "NDFrameGroupy") + if agg_func == "size" and isinstance(cols, list): + pytest.xfail("'size' transformation not supported with " "NDFrameGroupy") # GH 19200 df = pd.DataFrame( - {'a': pd.date_range('2018-01-01', periods=3), - 'b': range(3), - 'c': range(7, 10)}) + {"a": pd.date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} + ) - result = df.groupby('b')[cols].transform(agg_func) + result = df.groupby("b")[cols].transform(agg_func) - if agg_func == 'rank': - exp = exp.astype('float') + if agg_func == "rank": + exp = exp.astype("float") comp_func(result, exp) @pytest.mark.parametrize("mix_groupings", [True, False]) @pytest.mark.parametrize("as_series", [True, False]) -@pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) -@pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) -]) -def test_group_fill_methods(mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): +@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)]) +@pytest.mark.parametrize( + "fill_method,limit,exp_vals", + [ + ( + "ffill", + None, + [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"], + ), + ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]), + ( + "bfill", + None, + ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan], + ), + ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]), + ], +) +def test_group_fill_methods( + mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals +): vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] _exp_vals = list(exp_vals) # Overwrite placeholder values for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': + if exp_val == "val1": _exp_vals[index] = val1 - elif exp_val == 'val2': + elif exp_val == "val2": _exp_vals[index] = val2 # Need to modify values and expectations depending on the # Series / DataFrame that we ultimately want to generate if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) + keys = ["a", "b"] * len(vals) def interweave(list_obj): temp = list() @@ -737,82 +830,98 @@ def interweave(list_obj): _exp_vals = interweave(_exp_vals) vals = interweave(vals) else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) + keys = ["a"] * len(vals) + ["b"] * len(vals) _exp_vals = _exp_vals * 2 vals = vals * 2 - df = DataFrame({'key': keys, 'val': vals}) + df = DataFrame({"key": keys, "val": vals}) if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') + result = getattr(df.groupby("key")["val"], fill_method)(limit=limit) + exp = Series(_exp_vals, name="val") assert_series_equal(result, exp) else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'val': _exp_vals}) + result = getattr(df.groupby("key"), fill_method)(limit=limit) + exp = DataFrame({"val": _exp_vals}) assert_frame_equal(result, exp) -@pytest.mark.parametrize("fill_method", ['ffill', 'bfill']) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) def test_pad_stable_sorting(fill_method): # GH 21207 x = [0] * 20 y = [np.nan] * 10 + [1] * 10 - if fill_method == 'bfill': + if fill_method == "bfill": y = y[::-1] - df = pd.DataFrame({'x': x, 'y': y}) - expected = df.drop('x', 1) + df = pd.DataFrame({"x": x, "y": y}) + expected = df.drop("x", 1) - result = getattr(df.groupby('x'), fill_method)() + result = getattr(df.groupby("x"), fill_method)() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("test_series", [True, False]) -@pytest.mark.parametrize("freq", [ - None, - pytest.param('D', marks=pytest.mark.xfail( - reason='GH#23918 before method uses freq in vectorized approach'))]) -@pytest.mark.parametrize("periods,fill_method,limit", [ - (1, 'ffill', None), (1, 'ffill', 1), - (1, 'bfill', None), (1, 'bfill', 1), - (-1, 'ffill', None), (-1, 'ffill', 1), - (-1, 'bfill', None), (-1, 'bfill', 1), -]) +@pytest.mark.parametrize( + "freq", + [ + None, + pytest.param( + "D", + marks=pytest.mark.xfail( + reason="GH#23918 before method uses freq in vectorized approach" + ), + ), + ], +) +@pytest.mark.parametrize( + "periods,fill_method,limit", + [ + (1, "ffill", None), + (1, "ffill", 1), + (1, "bfill", None), + (1, "bfill", 1), + (-1, "ffill", None), + (-1, "ffill", 1), + (-1, "bfill", None), + (-1, "bfill", 1), + ], +) def test_pct_change(test_series, freq, periods, fill_method, limit): # GH 21200, 21621 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] - keys = ['a', 'b'] + keys = ["a", "b"] key_v = np.repeat(keys, len(vals)) - df = DataFrame({'key': key_v, 'vals': vals * 2}) + df = DataFrame({"key": key_v, "vals": vals * 2}) - df_g = getattr(df.groupby('key'), fill_method)(limit=limit) + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) - expected = grp['vals'].obj / grp['vals'].shift(periods) - 1 + expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 if test_series: - result = df.groupby('key')['vals'].pct_change( - periods=periods, fill_method=fill_method, limit=limit, freq=freq) + result = df.groupby("key")["vals"].pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) tm.assert_series_equal(result, expected) else: - result = df.groupby('key').pct_change( - periods=periods, fill_method=fill_method, limit=limit, freq=freq) - tm.assert_frame_equal(result, expected.to_frame('vals')) + result = df.groupby("key").pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + tm.assert_frame_equal(result, expected.to_frame("vals")) @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 - df = pd.DataFrame([['foo', True], - [np.nan, 
True], - ['foo', True]], columns=['key', 'val']) + df = pd.DataFrame( + [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"] + ) - exp = pd.Series([True, np.nan, True], name='val') + exp = pd.Series([True, np.nan, True], name="val") - res = df.groupby('key')['val'].transform(func) + res = df.groupby("key")["val"].transform(func) tm.assert_series_equal(res, exp) @@ -825,51 +934,52 @@ def demean_rename(x): return result result = result.rename( - columns={c: '{}_demeaned'.format(c) for c in result.columns}) + columns={c: "{}_demeaned".format(c) for c in result.columns} + ) return result - df = pd.DataFrame({'group': list('ababa'), - 'value': [1, 1, 1, 2, 2]}) - expected = pd.DataFrame({'value': [-1. / 3, -0.5, -1. / 3, 0.5, 2. / 3]}) + df = pd.DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) + expected = pd.DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) - result = df.groupby('group').transform(demean_rename) + result = df.groupby("group").transform(demean_rename) tm.assert_frame_equal(result, expected) - result_single = df.groupby('group').value.transform(demean_rename) - tm.assert_series_equal(result_single, expected['value']) + result_single = df.groupby("group").value.transform(demean_rename) + tm.assert_series_equal(result_single, expected["value"]) -@pytest.mark.parametrize('func', [min, max, np.min, np.max, 'first', 'last']) +@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) def test_groupby_transform_timezone_column(func): # GH 24198 - ts = pd.to_datetime('now', utc=True).tz_convert('Asia/Singapore') - result = pd.DataFrame({'end_time': [ts], 'id': [1]}) - result['max_end_time'] = result.groupby('id').end_time.transform(func) - expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id', - 'max_end_time']) + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + result = pd.DataFrame({"end_time": [ts], "id": [1]}) + result["max_end_time"] = result.groupby("id").end_time.transform(func) + expected = pd.DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func, values", [ - ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), - ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]) -]) +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), + ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]), + ], +) def test_groupby_transform_with_datetimes(func, values): # GH 15306 - dates = pd.date_range('1/1/2011', periods=10, freq='D') + dates = pd.date_range("1/1/2011", periods=10, freq="D") - stocks = pd.DataFrame({'price': np.arange(10.0)}, index=dates) - stocks['week_id'] = pd.to_datetime(stocks.index).week + stocks = pd.DataFrame({"price": np.arange(10.0)}, index=dates) + stocks["week_id"] = pd.to_datetime(stocks.index).week - result = stocks.groupby(stocks['week_id'])['price'].transform(func) + result = stocks.groupby(stocks["week_id"])["price"].transform(func) - expected = pd.Series(data=pd.to_datetime(values), - index=dates, name="price") + expected = pd.Series(data=pd.to_datetime(values), index=dates, name="price") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('func', ['cumsum', 'cumprod', 'cummin', 'cummax']) +@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"]) def test_transform_absent_categories(func): # GH 16771 # cython transforms with more groups than rows @@ -882,8 +992,8 
@@ def test_transform_absent_categories(func): assert_series_equal(result, expected) -@pytest.mark.parametrize('func', ['ffill', 'bfill', 'shift']) -@pytest.mark.parametrize('key, val', [('level', 0), ('by', Series([0]))]) +@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"]) +@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))]) def test_ffill_not_in_axis(func, key, val): # GH 21521 df = pd.DataFrame([[np.nan]]) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 2b5f87aa59a8d5..c7b28822092a8f 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -16,21 +16,22 @@ # our starting frame def seed_df(seed_nans, n, m): np.random.seed(1234) - days = date_range('2015-08-24', periods=10) + days = date_range("2015-08-24", periods=10) - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) + frame = DataFrame( + { + "1st": np.random.choice(list("abcd"), n), + "2nd": np.random.choice(days, n), + "3rd": np.random.randint(1, m + 1, n), + } + ) if seed_nans: - frame.loc[1::11, '1st'] = np.nan - frame.loc[3::17, '2nd'] = np.nan - frame.loc[7::19, '3rd'] = np.nan - frame.loc[8::19, '3rd'] = np.nan - frame.loc[9::19, '3rd'] = np.nan + frame.loc[1::11, "1st"] = np.nan + frame.loc[3::17, "2nd"] = np.nan + frame.loc[7::19, "3rd"] = np.nan + frame.loc[8::19, "3rd"] = np.nan + frame.loc[9::19, "3rd"] = np.nan return frame @@ -42,8 +43,8 @@ def seed_df(seed_nans, n, m): for n, m in product((100, 1000), (5, 20)): df = seed_df(seed_nans, n, m) - bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) - keys = '1st', '2nd', ['1st', '2nd'] + bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) + keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) ids.append("{}-{}-{}".format(k, n, m)) @@ -52,24 +53,27 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) def test_series_groupby_value_counts(df, keys, bins, n, m): - def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) df.index = MultiIndex.from_arrays(arr, names=df.index.names) return df - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): + for isort, normalize, sort, ascending, dropna in product((False, True), repeat=5): - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) + kwargs = dict( + normalize=normalize, + sort=sort, + ascending=ascending, + dropna=dropna, + bins=bins, + ) gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) + left = gr["3rd"].value_counts(**kwargs) gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 2bd2f3fb00b562..03e10ff44c2990 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -11,26 +11,37 @@ from pandas import DataFrame, Index, MultiIndex, Series, date_range from pandas.util import testing 
as tm -AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', - 'mad', 'std', 'var', 'sem'] -AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] +AGG_FUNCTIONS = [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "mad", + "std", + "var", + "sem", +] +AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"] df_whitelist = [ - 'quantile', - 'fillna', - 'mad', - 'take', - 'idxmax', - 'idxmin', - 'tshift', - 'skew', - 'plot', - 'hist', - 'dtypes', - 'corrwith', - 'corr', - 'cov', - 'diff', + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtypes", + "corrwith", + "corr", + "cov", + "diff", ] @@ -40,25 +51,25 @@ def df_whitelist_fixture(request): s_whitelist = [ - 'quantile', - 'fillna', - 'mad', - 'take', - 'idxmax', - 'idxmin', - 'tshift', - 'skew', - 'plot', - 'hist', - 'dtype', - 'corr', - 'cov', - 'diff', - 'unique', - 'nlargest', - 'nsmallest', - 'is_monotonic_increasing', - 'is_monotonic_decreasing', + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtype", + "corr", + "cov", + "diff", + "unique", + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", ] @@ -69,22 +80,24 @@ def s_whitelist_fixture(request): @pytest.fixture def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) @pytest.fixture def df(): return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) @pytest.fixture @@ -92,8 +105,12 @@ def df_letters(): letters = np.array(list(ascii_lowercase)) N = 10 random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) + df = DataFrame( + { + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) return df @@ -104,7 +121,7 @@ def test_groupby_whitelist(df_letters, whitelist): # dataframe obj = df_letters else: - obj = df_letters['floats'] + obj = df_letters["floats"] gb = obj.groupby(df.letters) @@ -147,26 +164,25 @@ def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture): @pytest.fixture def raw_frame(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - raw_frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + raw_frame = DataFrame( + 
np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) raw_frame.iloc[1, [1, 2]] = np.nan raw_frame.iloc[7, [0, 1]] = np.nan return raw_frame -@pytest.mark.parametrize('op', AGG_FUNCTIONS) -@pytest.mark.parametrize('level', [0, 1]) -@pytest.mark.parametrize('axis', [0, 1]) -@pytest.mark.parametrize('skipna', [True, False]) -@pytest.mark.parametrize('sort', [True, False]) -def test_regression_whitelist_methods( - raw_frame, op, level, - axis, skipna, sort): +@pytest.mark.parametrize("op", AGG_FUNCTIONS) +@pytest.mark.parametrize("level", [0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort): # GH6944 # GH 17537 # explicitly test the whitelist methods @@ -179,8 +195,7 @@ def test_regression_whitelist_methods( if op in AGG_FUNCTIONS_WITH_SKIPNA: grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)(skipna=skipna) - expected = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) + expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) if sort: expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) @@ -198,21 +213,32 @@ def test_groupby_blacklist(df_letters): s = df_letters.floats blacklist = [ - 'eval', 'query', 'abs', 'where', - 'mask', 'align', 'groupby', 'clip', 'astype', - 'at', 'combine', 'consolidate', 'convert_objects', + "eval", + "query", + "abs", + "where", + "mask", + "align", + "groupby", + "clip", + "astype", + "at", + "combine", + "consolidate", + "convert_objects", ] - to_methods = [method for method in dir(df) if method.startswith('to_')] + to_methods = [method for method in dir(df) if method.startswith("to_")] blacklist.extend(to_methods) # e.g., to_csv - defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the " - "'apply' method$)") + defined_but_not_allowed = ( + "(?:^Cannot.+{0!r}.+{1!r}.+try using the " "'apply' method$)" + ) # e.g., query, eval not_defined = "(?:^{1!r} object has no attribute {0!r}$)" - fmt = defined_but_not_allowed + '|' + not_defined + fmt = defined_but_not_allowed + "|" + not_defined for bl in blacklist: for obj in (df, s): gb = obj.groupby(df.letters) @@ -222,58 +248,117 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): - grp = mframe.groupby(level='second') - results = {v for v in dir(grp) if not v.startswith('_')} + grp = mframe.groupby(level="second") + results = {v for v in dir(grp) if not v.startswith("_")} expected = { - 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', + "A", + "B", + "C", + "agg", + "aggregate", + "apply", + "boxplot", + "filter", + "first", + "get_group", + "groups", + "hist", + "indices", + "last", + "max", + "mean", + "median", + "min", + "ngroups", + "nth", + "ohlc", + "plot", + "prod", + "size", + "std", + "sum", + 
"transform", + "var", + "sem", + "count", + "nunique", + "head", + "describe", + "cummax", + "quantile", + "rank", + "cumprod", + "tail", + "resample", + "cummin", + "fillna", + "cumsum", + "cumcount", + "ngroup", + "all", + "shift", + "skew", + "take", + "tshift", + "pct_change", + "any", + "mad", + "corr", + "corrwith", + "cov", + "dtypes", + "ndim", + "diff", + "idxmax", + "idxmin", + "ffill", + "bfill", + "pad", + "backfill", + "rolling", + "expanding", + "pipe", } assert results == expected def test_groupby_function_rename(mframe): - grp = mframe.groupby(level='second') - for name in ['sum', 'prod', 'min', 'max', 'first', 'last']: + grp = mframe.groupby(level="second") + for name in ["sum", "prod", "min", "max", "first", "last"]: f = getattr(grp, name) assert f.__name__ == name def test_groupby_selection_with_methods(df): # some methods which require DatetimeIndex - rng = date_range('2014', periods=len(df)) + rng = date_range("2014", periods=len(df)) df.index = rng - g = df.groupby(['A'])[['C']] - g_exp = df[['C']].groupby(df['A']) + g = df.groupby(["A"])[["C"]] + g_exp = df[["C"]].groupby(df["A"]) # TODO check groupby with > 1 col ? # methods which are called as .foo() - methods = ['count', - 'corr', - 'cummax', - 'cummin', - 'cumprod', - 'describe', - 'rank', - 'quantile', - 'diff', - 'shift', - 'all', - 'any', - 'idxmin', - 'idxmax', - 'ffill', - 'bfill', - 'pct_change', - 'tshift'] + methods = [ + "count", + "corr", + "cummax", + "cummin", + "cumprod", + "describe", + "rank", + "quantile", + "diff", + "shift", + "all", + "any", + "idxmin", + "idxmax", + "ffill", + "bfill", + "pct_change", + "tshift", + ] for m in methods: res = getattr(g, m)() @@ -285,12 +370,11 @@ def test_groupby_selection_with_methods(df): # methods which aren't just .foo() tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) tm.assert_frame_equal(g.dtypes, g_exp.dtypes) - tm.assert_frame_equal(g.apply(lambda x: x.sum()), - g_exp.apply(lambda x: x.sum())) + tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum())) - tm.assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean()) - tm.assert_frame_equal(g.resample('D').ohlc(), - g_exp.resample('D').ohlc()) + tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean()) + tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc()) - tm.assert_frame_equal(g.filter(lambda x: len(x) == 3), - g_exp.filter(lambda x: len(x) == 3)) + tm.assert_frame_equal( + g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3) + ) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 674f600bc8693e..9459069f0ea2d2 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,9 +7,19 @@ import pandas as pd from pandas import ( - CategoricalIndex, DatetimeIndex, Index, Int64Index, IntervalIndex, - MultiIndex, PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, - isna) + CategoricalIndex, + DatetimeIndex, + Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + isna, +) from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin import pandas.util.testing as tm @@ -17,8 +27,9 @@ class Base: """ base class for index sub-class tests """ + _holder = None - _compat_props = ['shape', 'ndim', 'size', 'nbytes'] + _compat_props = ["shape", "ndim", "size", "nbytes"] def setup_indices(self): for name, idx in self.indices.items(): @@ 
-26,10 +37,12 @@ def setup_indices(self): def test_pickle_compat_construction(self): # need an object to create with - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed|" - r"__new__\(\) missing 1 required positional argument: 'data'|" - r"__new__\(\) takes at least 2 arguments \(1 given\)") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed|" + r"__new__\(\) missing 1 required positional argument: 'data'|" + r"__new__\(\) takes at least 2 arguments \(1 given\)" + ) with pytest.raises(TypeError, match=msg): self._holder() @@ -55,7 +68,7 @@ def test_to_series_with_arguments(self): # name kwarg idx = self.create_index() - s = idx.to_series(name='__test') + s = idx.to_series(name="__test") assert s.values is not idx.values assert s.index is not idx @@ -83,8 +96,8 @@ def test_to_frame(self, name): def test_to_frame_datetime_tz(self): # GH 25809 - idx = pd.date_range(start='2019-01-01', end='2019-01-30', freq='D') - idx = idx.tz_localize('UTC') + idx = pd.date_range(start="2019-01-01", end="2019-01-30", freq="D") + idx = idx.tz_localize("UTC") result = idx.to_frame() expected = pd.DataFrame(idx, index=idx) tm.assert_frame_equal(result, expected) @@ -105,29 +118,52 @@ def test_create_index_existing_name(self): # specified, the new index should inherit the previous object name expected = self.create_index() if not isinstance(expected, MultiIndex): - expected.name = 'foo' + expected.name = "foo" result = pd.Index(expected) tm.assert_index_equal(result, expected) - result = pd.Index(expected, name='bar') - expected.name = 'bar' + result = pd.Index(expected, name="bar") + expected.name = "bar" tm.assert_index_equal(result, expected) else: - expected.names = ['foo', 'bar'] + expected.names = ["foo", "bar"] result = pd.Index(expected) tm.assert_index_equal( - result, Index(Index([('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object'), - names=['foo', 'bar'])) - - result = pd.Index(expected, names=['A', 'B']) + result, + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["foo", "bar"], + ), + ) + + result = pd.Index(expected, names=["A", "B"]) tm.assert_index_equal( result, - Index(Index([('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')], - dtype='object'), names=['A', 'B'])) + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["A", "B"], + ), + ) def test_numeric_compat(self): @@ -141,7 +177,7 @@ def test_numeric_compat(self): with pytest.raises(TypeError, match=div_err): idx / 1 - div_err = div_err.replace(' __', ' __r') + div_err = div_err.replace(" __", " __r") with pytest.raises(TypeError, match=div_err): 1 / idx with pytest.raises(TypeError, match="cannot perform __floordiv__"): @@ -151,9 +187,9 @@ def test_numeric_compat(self): def test_logical_compat(self): idx = self.create_index() - with pytest.raises(TypeError, match='cannot perform all'): + with pytest.raises(TypeError, match="cannot perform all"): idx.all() - with pytest.raises(TypeError, match='cannot perform any'): + with pytest.raises(TypeError, match="cannot perform any"): idx.any() def test_boolean_context_compat(self): @@ -161,7 +197,7 @@ def test_boolean_context_compat(self): # boolean context compat idx = 
self.create_index() - with pytest.raises(ValueError, match='The truth value of a'): + with pytest.raises(ValueError, match="The truth value of a"): if idx: pass @@ -172,8 +208,8 @@ def test_reindex_base(self): actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) - with pytest.raises(ValueError, match='Invalid fill method'): - idx.get_indexer(idx, method='invalid') + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") def test_get_indexer_consistency(self): # See GH 16819 @@ -216,7 +252,7 @@ def test_str(self): # test the string repr idx = self.create_index() - idx.name = 'foo' + idx.name = "foo" assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) @@ -226,7 +262,7 @@ def test_repr_max_seq_item_setting(self): idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) - assert '...' not in str(idx) + assert "..." not in str(idx) def test_copy_name(self): # gh-12309: Check that the "name" argument @@ -236,7 +272,7 @@ def test_copy_name(self): if isinstance(index, MultiIndex): continue - first = index.__class__(index, copy=True, name='mario') + first = index.__class__(index, copy=True, name="mario") second = first.__class__(first, copy=False) # Even though "copy=False", we want a new object. @@ -245,8 +281,8 @@ def test_copy_name(self): # Not using tm.assert_index_equal() since names differ. assert index.equals(first) - assert first.name == 'mario' - assert second.name == 'mario' + assert first.name == "mario" + assert second.name == "mario" s1 = Series(2, index=first) s2 = Series(3, index=second[:-1]) @@ -254,7 +290,7 @@ def test_copy_name(self): if not isinstance(index, CategoricalIndex): # See gh-13365 s3 = s1 * s2 - assert s3.index.name == 'mario' + assert s3.index.name == "mario" def test_ensure_copied_data(self): # Check the "copy" argument of each Index.__new__ is honoured @@ -263,7 +299,7 @@ def test_ensure_copied_data(self): init_kwargs = {} if isinstance(index, PeriodIndex): # Needs "freq" specification: - init_kwargs['freq'] = index.freq + init_kwargs["freq"] = index.freq elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): # RangeIndex cannot be initialized from data # MultiIndex and CategoricalIndex are tested separately @@ -272,27 +308,27 @@ def test_ensure_copied_data(self): index_type = index.__class__ result = index_type(index.values, copy=True, **init_kwargs) tm.assert_index_equal(index, result) - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='copy') + tm.assert_numpy_array_equal( + index._ndarray_values, result._ndarray_values, check_same="copy" + ) if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - result = index_type(ordinal=index.asi8, copy=False, - **init_kwargs) - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='same') + result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + tm.assert_numpy_array_equal( + index._ndarray_values, result._ndarray_values, check_same="same" + ) elif isinstance(index, IntervalIndex): # checked in test_interval.py pass else: result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='same') + tm.assert_numpy_array_equal( + index.values, result.values, check_same="same" + ) + tm.assert_numpy_array_equal( + 
index._ndarray_values, result._ndarray_values, check_same="same" + ) def test_memory_usage(self): for name, index in self.indices.items(): @@ -307,7 +343,7 @@ def test_memory_usage(self): if not isinstance(index, (RangeIndex, IntervalIndex)): assert result2 > result - if index.inferred_type == 'object': + if index.inferred_type == "object": assert result3 > result2 else: @@ -319,7 +355,7 @@ def test_argsort(self): for k, ind in self.indices.items(): # separately tested - if k in ['catIndex']: + if k in ["catIndex"]: continue result = ind.argsort() @@ -346,26 +382,25 @@ def test_numpy_argsort(self): msg = "the 'kind' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(ind, kind='mergesort') + np.argsort(ind, kind="mergesort") msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(ind, order=('a', 'b')) + np.argsort(ind, order=("a", "b")) def test_take(self): indexer = [4, 3, 0, 2] for k, ind in self.indices.items(): # separate - if k in ['boolIndex', 'tuples', 'empty']: + if k in ["boolIndex", "tuples", "empty"]: continue result = ind.take(indexer) expected = ind[indexer] assert result.equals(expected) - if not isinstance(ind, - (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(ind, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): # GH 10791 with pytest.raises(AttributeError): ind.freq @@ -384,7 +419,7 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_repeat(self): rep = 2 @@ -407,7 +442,7 @@ def test_numpy_repeat(self): with pytest.raises(ValueError, match=msg): np.repeat(i, rep, axis=0) - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): i = self.create_index() @@ -422,8 +457,9 @@ def test_where(self, klass): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("case", [0.5, "xxx"]) - @pytest.mark.parametrize("method", ["intersection", "union", - "difference", "symmetric_difference"]) + @pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] + ) def test_set_ops_error_cases(self, case, method): for name, idx in self.indices.items(): # non-iterable input @@ -444,8 +480,7 @@ def test_intersection_base(self): assert tm.equalContents(intersect, second) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -467,8 +502,7 @@ def test_union_base(self): assert tm.equalContents(union, everything) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -495,15 +529,15 @@ def test_difference_base(self, sort): assert tm.equalContents(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ - tm.assert_numpy_array_equal(result.sort_values().asi8, - answer.sort_values().asi8) + 
tm.assert_numpy_array_equal( + result.sort_values().asi8, answer.sort_values().asi8 + ) else: result = first.difference(case, sort) assert tm.equalContents(result, answer) @@ -525,8 +559,7 @@ def test_symmetric_difference(self): assert tm.equalContents(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -599,7 +632,7 @@ def test_equals_op(self): # GH9947, GH10637 index_a = self.create_index() if isinstance(index_a, PeriodIndex): - pytest.skip('Skip check for PeriodIndex') + pytest.skip("Skip check for PeriodIndex") n = len(index_a) index_b = index_a[0:-1] @@ -741,8 +774,7 @@ def test_nulls(self): for name, index in self.indices.items(): if len(index) == 0: - tm.assert_numpy_array_equal( - index.isna(), np.array([], dtype=bool)) + tm.assert_numpy_array_equal(index.isna(), np.array([], dtype=bool)) elif isinstance(index, MultiIndex): idx = index.copy() msg = "isna is not defined for MultiIndex" @@ -752,9 +784,11 @@ def test_nulls(self): if not index.hasnans: tm.assert_numpy_array_equal( - index.isna(), np.zeros(len(index), dtype=bool)) + index.isna(), np.zeros(len(index), dtype=bool) + ) tm.assert_numpy_array_equal( - index.notna(), np.ones(len(index), dtype=bool)) + index.notna(), np.ones(len(index), dtype=bool) + ) else: result = isna(index) tm.assert_numpy_array_equal(index.isna(), result) @@ -778,7 +812,7 @@ def test_map(self): # we don't infer UInt64 if isinstance(index, pd.UInt64Index): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -789,7 +823,9 @@ def test_map(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): index = self.create_index() @@ -800,7 +836,7 @@ def test_map_dictlike(self, mapper): # we don't infer to UInt64 for a dict if isinstance(index, pd.UInt64Index) and isinstance(identity, dict): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -823,11 +859,11 @@ def test_putmask_with_wrong_mask(self): index.putmask(np.ones(len(index) - 1, np.bool), 1) with pytest.raises(ValueError): - index.putmask('foo', 1) + index.putmask("foo", 1) - @pytest.mark.parametrize('copy', [True, False]) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("ordered", [True, False]) def test_astype_category(self, copy, name, ordered): # GH 18630 index = self.create_index() @@ -848,7 +884,7 @@ def test_astype_category(self, copy, name, ordered): if ordered is False: # dtype='category' defaults to ordered=False, so only test once - result = index.astype('category', copy=copy) + result = index.astype("category", copy=copy) expected = CategoricalIndex(index.values, name=name) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 83f1f22b158b18..12c5fb83395494 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,22 +5,23 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm -indices_list = [tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), 
- tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - tm.makeIntervalIndex(100), - Index([]), - MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2])] +indices_list = [ + tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeRangeIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + tm.makeIntervalIndex(100), + Index([]), + MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2]), +] @pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) @@ -34,11 +35,12 @@ def one(request): return request.param -zeros = [box([0] * 5, dtype=dtype) - for box in [pd.Index, np.array] - for dtype in [np.int64, np.uint64, np.float64]] -zeros.extend([np.array(0, dtype=dtype) - for dtype in [np.int64, np.uint64, np.float64]]) +zeros = [ + box([0] * 5, dtype=dtype) + for box in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) zeros.extend([0, 0.0]) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 180033c2d2619f..1b3c4e65d252b3 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -9,7 +9,6 @@ class DatetimeLike(Base): - def test_argmax_axis_invalid(self): # GH#23081 rng = self.create_index() @@ -36,21 +35,21 @@ def test_str(self): # test the string repr idx = self.create_index() - idx.name = 'foo' + idx.name = "foo" assert not "length=%s" % len(idx) in str(idx) assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) - if hasattr(idx, 'tz'): + if hasattr(idx, "tz"): if idx.tz is not None: assert idx.tz in str(idx) - if hasattr(idx, 'freq'): + if hasattr(idx, "freq"): assert "freq='%s'" % idx.freqstr in str(idx) def test_view(self): i = self.create_index() - i_view = i.view('i8') + i_view = i.view("i8") result = self._holder(i) tm.assert_index_equal(result, i) @@ -72,7 +71,9 @@ def test_map_callable(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): expected = self.index + self.index.freq diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index e998f77582c854..4851dd5a55c1ef 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -18,44 +18,56 @@ class TestDatetimeIndexArithmetic: def test_dti_shift_tzaware(self, tz_naive_fixture): # GH#9903 tz = tz_naive_fixture - idx = pd.DatetimeIndex([], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00', - '2011-01-01 12:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00', - '2011-01-01 15:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = 
pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00', - '2011-01-01 09:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + idx = pd.DatetimeIndex([], name="xxx", tz=tz) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) + + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.DatetimeIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.DatetimeIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) def test_dti_shift_freqs(self): # test shift for DatetimeIndex and non DatetimeIndex # GH#8083 - drange = pd.date_range('20130101', periods=5) + drange = pd.date_range("20130101", periods=5) result = drange.shift(1) - expected = pd.DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', - '2013-01-06'], freq='D') + expected = pd.DatetimeIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) tm.assert_index_equal(result, expected) result = drange.shift(-1) - expected = pd.DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', - '2013-01-03', '2013-01-04'], - freq='D') + expected = pd.DatetimeIndex( + ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + freq="D", + ) tm.assert_index_equal(result, expected) - result = drange.shift(3, freq='2D') - expected = pd.DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', - '2013-01-10', - '2013-01-11'], freq='D') + result = drange.shift(3, freq="2D") + expected = pd.DatetimeIndex( + ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + freq="D", + ) tm.assert_index_equal(result, expected) def test_dti_shift_int(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 @@ -73,36 +85,39 @@ def test_dti_shift_int(self): def test_dti_shift_no_freq(self): # GH#19147 - dti = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01'], freq=None) + dti = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) with pytest.raises(NullFrequencyError): dti.shift(2) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_shift_localized(self, tzstr): - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") dr_tz = dr.tz_localize(tzstr) - result = dr_tz.shift(1, '10T') + result = dr_tz.shift(1, "10T") assert result.tz == dr_tz.tz def test_dti_shift_across_dst(self): # GH 8616 - idx = date_range('2013-11-03', tz='America/Chicago', - periods=7, freq='H') + idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") s = Series(index=idx[:-1]) - result = s.shift(freq='H') + result = s.shift(freq="H") expected = Series(index=idx[1:]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('shift, result_time', [ - [0, '2014-11-14 00:00:00'], - [-1, '2014-11-13 23:00:00'], - [1, '2014-11-14 01:00:00']]) + @pytest.mark.parametrize( + "shift, result_time", + [ + [0, "2014-11-14 00:00:00"], + [-1, "2014-11-13 23:00:00"], + [1, "2014-11-14 01:00:00"], + ], + ) def 
test_dti_shift_near_midnight(self, shift, result_time): # GH 8616 dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone('EST').localize(dt) + dt_est = pytz.timezone("EST").localize(dt) s = Series(data=[1], index=[dt_est]) - result = s.shift(shift, freq='H') - expected = Series(1, index=DatetimeIndex([result_time], tz='EST')) + result = s.shift(shift, freq="H") + expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 38a060bb0d1d3b..eabf293ae915f6 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -8,33 +8,40 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Int64Index, NaT, Period, Series, Timestamp, - date_range) + DatetimeIndex, + Index, + Int64Index, + NaT, + Period, + Series, + Timestamp, + date_range, +) import pandas.util.testing as tm class TestDatetimeIndex: - def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) result = idx.astype(object) - expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object) + expected = Index([Timestamp("2016-05-16")] + [NaT] * 3, dtype=object) tm.assert_index_equal(result, expected) result = idx.astype(int) - expected = Int64Index([1463356800000000000] + - [-9223372036854775808] * 3, dtype=np.int64) + expected = Int64Index( + [1463356800000000000] + [-9223372036854775808] * 3, dtype=np.int64 + ) tm.assert_index_equal(result, expected) - rng = date_range('1/1/2000', periods=10) - result = rng.astype('i8') + rng = date_range("1/1/2000", periods=10) + result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8)) tm.assert_numpy_array_equal(result.values, rng.asi8) def test_astype_uint(self): - arr = date_range('2000', periods=2) + arr = date_range("2000", periods=2) expected = pd.UInt64Index( np.array([946684800000000000, 946771200000000000], dtype="uint64") ) @@ -45,139 +52,167 @@ def test_astype_uint(self): def test_astype_with_tz(self): # with tz - rng = date_range('1/1/2000', periods=10, tz='US/Eastern') - result = rng.astype('datetime64[ns]') - expected = (date_range('1/1/2000', periods=10, - tz='US/Eastern') - .tz_convert('UTC').tz_localize(None)) + rng = date_range("1/1/2000", periods=10, tz="US/Eastern") + result = rng.astype("datetime64[ns]") + expected = ( + date_range("1/1/2000", periods=10, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ) tm.assert_index_equal(result, expected) # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) - expected = pd.Series( - ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) + result = pd.Series(pd.date_range("2012-01-01", periods=3)).astype(str) + expected = pd.Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) tm.assert_series_equal(result, expected) - result = Series(pd.date_range('2012-01-01', periods=3, - tz='US/Eastern')).astype(str) - expected = Series(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - dtype=object) + result = Series(pd.date_range("2012-01-01", periods=3, tz="US/Eastern")).astype( + str + ) + expected = Series( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + dtype=object, + ) 
tm.assert_series_equal(result, expected) # GH 18951: tz-aware to tz-aware - idx = date_range('20170101', periods=4, tz='US/Pacific') - result = idx.astype('datetime64[ns, US/Eastern]') - expected = date_range('20170101 03:00:00', periods=4, tz='US/Eastern') + idx = date_range("20170101", periods=4, tz="US/Pacific") + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101 03:00:00", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) # GH 18951: tz-naive to tz-aware - idx = date_range('20170101', periods=4) - result = idx.astype('datetime64[ns, US/Eastern]') - expected = date_range('20170101', periods=4, tz='US/Eastern') + idx = date_range("20170101", periods=4) + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) def test_astype_str_compat(self): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) result = idx.astype(str) - expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object) + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): # test astype string - #10442 - result = date_range('2012-01-01', periods=4, - name='test_name').astype(str) - expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', - '2012-01-04'], name='test_name', dtype=object) + result = date_range("2012-01-01", periods=4, name="test_name").astype(str) + expected = Index( + ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with tz and name - result = date_range('2012-01-01', periods=3, name='test_name', - tz='US/Eastern').astype(str) - expected = Index(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - name='test_name', dtype=object) + result = date_range( + "2012-01-01", periods=3, name="test_name", tz="US/Eastern" + ).astype(str) + expected = Index( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with freqH and name - result = date_range('1/1/2011', periods=3, freq='H', - name='test_name').astype(str) - expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', - '2011-01-01 02:00:00'], - name='test_name', dtype=object) + result = date_range("1/1/2011", periods=3, freq="H", name="test_name").astype( + str + ) + expected = Index( + ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with freqH and timezone - result = date_range('3/6/2012 00:00', periods=2, freq='H', - tz='Europe/London', name='test_name').astype(str) - expected = Index(['2012-03-06 00:00:00+00:00', - '2012-03-06 01:00:00+00:00'], - dtype=object, name='test_name') + result = date_range( + "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + ).astype(str) + expected = Index( + ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], + dtype=object, + name="test_name", + ) tm.assert_index_equal(result, expected) def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, 
np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) - result = idx.astype('datetime64[ns]') + result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) assert result is not idx - result = idx.astype('datetime64[ns]', copy=False) + result = idx.astype("datetime64[ns]", copy=False) tm.assert_index_equal(result, idx) assert result is idx - idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST') - result = idx_tz.astype('datetime64[ns]') - expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]') + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST") + result = idx_tz.astype("datetime64[ns]") + expected = DatetimeIndex( + ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], dtype="datetime64[ns]" + ) tm.assert_index_equal(result, expected) def test_astype_object(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) - casted = rng.astype('O') + casted = rng.astype("O") exp_values = list(rng) tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) assert casted.tolist() == exp_values - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_astype_object_tz(self, tz): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx', tz=tz) - expected_list = [Timestamp('2013-01-31', tz=tz), - Timestamp('2013-02-28', tz=tz), - Timestamp('2013-03-31', tz=tz), - Timestamp('2013-04-30', tz=tz)] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = pd.date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + expected_list = [ + Timestamp("2013-01-31", tz=tz), + Timestamp("2013-02-28", tz=tz), + Timestamp("2013-03-31", tz=tz), + Timestamp("2013-04-30", tz=tz), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list def test_astype_object_with_nat(self): - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [Timestamp('2013-01-01'), - Timestamp('2013-01-02'), pd.NaT, - Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = DatetimeIndex( + [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, datetime(2013, 1, 4)], + name="idx", + ) + expected_list = [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + pd.NaT, + Timestamp("2013-01-04"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - @pytest.mark.parametrize('dtype', [ - float, 'timedelta64', 'timedelta64[ns]', 'datetime64', - 'datetime64[D]']) + @pytest.mark.parametrize( + "dtype", + [float, "timedelta64", "timedelta64[ns]", "datetime64", "datetime64[D]"], + ) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - msg = 'Cannot cast DatetimeArray to dtype' + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + msg = "Cannot cast DatetimeArray to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) @@ -190,9 +225,9 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') - rng_utc = date_range('20090415', '20090519', tz='utc') + rng = 
date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="US/Eastern") + rng_utc = date_range("20090415", "20090519", tz="utc") _check_rng(rng) _check_rng(rng_eastern) @@ -207,10 +242,9 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) - rng_utc = date_range('20090415', '20090519', tz=pytz.utc) + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + rng_utc = date_range("20090415", "20090519", tz=pytz.utc) _check_rng(rng) _check_rng(rng_eastern) @@ -225,31 +259,32 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz='dateutil/US/Eastern') - rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + rng_utc = date_range("20090415", "20090519", tz=dateutil.tz.tzutc()) _check_rng(rng) _check_rng(rng_eastern) _check_rng(rng_utc) - @pytest.mark.parametrize('tz, dtype', [ - ['US/Pacific', 'datetime64[ns, US/Pacific]'], - [None, 'datetime64[ns]']]) + @pytest.mark.parametrize( + "tz, dtype", + [["US/Pacific", "datetime64[ns, US/Pacific]"], [None, "datetime64[ns]"]], + ) def test_integer_index_astype_datetime(self, tz, dtype): # GH 20997, 20964, 24559 - val = [pd.Timestamp('2018-01-01', tz=tz).value] + val = [pd.Timestamp("2018-01-01", tz=tz).value] result = pd.Index(val).astype(dtype) expected = pd.DatetimeIndex(["2018-01-01"], tz=tz) tm.assert_index_equal(result, expected) class TestToPeriod: - def setup_method(self, method): - data = [Timestamp('2007-01-01 10:11:12.123456Z'), - Timestamp('2007-01-01 10:11:13.789123Z')] + data = [ + Timestamp("2007-01-01 10:11:12.123456Z"), + Timestamp("2007-01-01 10:11:13.789123Z"), + ] self.index = DatetimeIndex(data) def test_to_period_millisecond(self): @@ -257,26 +292,27 @@ def test_to_period_millisecond(self): with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq='L') + period = index.to_period(freq="L") assert 2 == len(period) - assert period[0] == Period('2007-01-01 10:11:12.123Z', 'L') - assert period[1] == Period('2007-01-01 10:11:13.789Z', 'L') + assert period[0] == Period("2007-01-01 10:11:12.123Z", "L") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "L") def test_to_period_microsecond(self): index = self.index with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq='U') + period = index.to_period(freq="U") assert 2 == len(period) - assert period[0] == Period('2007-01-01 10:11:12.123456Z', 'U') - assert period[1] == Period('2007-01-01 10:11:13.789123Z', 'U') + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "U") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "U") - @pytest.mark.parametrize('tz', [ - 'US/Eastern', pytz.utc, tzlocal(), 'dateutil/US/Eastern', - dateutil.tz.tzutc()]) + @pytest.mark.parametrize( + "tz", + ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + ) def test_to_period_tz(self, tz): - ts = date_range('1/1/2000', '2/1/2000', tz=tz) + ts = date_range("1/1/2000", "2/1/2000", tz=tz) with 
tm.assert_produces_warning(UserWarning): # GH#21333 warning that timezone info will be lost @@ -285,7 +321,7 @@ def test_to_period_tz(self, tz): assert result == expected - expected = date_range('1/1/2000', '2/1/2000').to_period() + expected = date_range("1/1/2000", "2/1/2000").to_period() with tm.assert_produces_warning(UserWarning): # GH#21333 warning that timezone info will be lost @@ -293,45 +329,44 @@ def test_to_period_tz(self, tz): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', ['Etc/GMT-1', 'Etc/GMT+1']) + @pytest.mark.parametrize("tz", ["Etc/GMT-1", "Etc/GMT+1"]) def test_to_period_tz_utc_offset_consistency(self, tz): # GH 22905 - ts = pd.date_range('1/1/2000', '2/1/2000', tz='Etc/GMT-1') + ts = pd.date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") with tm.assert_produces_warning(UserWarning): result = ts.to_period()[0] expected = ts[0].to_period() assert result == expected def test_to_period_nofreq(self): - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) with pytest.raises(ValueError): idx.to_period() - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], - freq='infer') - assert idx.freqstr == 'D' - expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', - '2000-01-03'], freq='D') + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer") + assert idx.freqstr == "D" + expected = pd.PeriodIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="D") tm.assert_index_equal(idx.to_period(), expected) # GH 7606 - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_astype_category(self, tz): obj = pd.date_range("2000", periods=2, tz=tz) - result = obj.astype('category') - expected = pd.CategoricalIndex([pd.Timestamp('2000-01-01', tz=tz), - pd.Timestamp('2000-01-02', tz=tz)]) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) tm.assert_index_equal(result, expected) - result = obj._data.astype('category') + result = obj._data.astype("category") expected = expected.values tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_astype_array_fallback(self, tz): obj = pd.date_range("2000", periods=2, tz=tz) result = obj.astype(bool) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 56dfbfd485eb16..f22c820253ee58 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -11,26 +11,31 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Timestamp, date_range, datetime, offsets, - to_datetime) + DatetimeIndex, + Index, + Timestamp, + date_range, + datetime, + offsets, + to_datetime, +) from pandas.core.arrays import DatetimeArray, period_array import pandas.util.testing as tm class TestDatetimeIndex: - - @pytest.mark.parametrize('dt_cls', [DatetimeIndex, - DatetimeArray._from_sequence]) + @pytest.mark.parametrize("dt_cls", [DatetimeIndex, DatetimeArray._from_sequence]) def test_freq_validation_with_nat(self, dt_cls): # GH#11587 make sure 
we get a useful error message when generate_range # raises - msg = ("Inferred frequency None from passed values does not conform " - "to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, pd.Timestamp('2011-01-01')], freq='D') + dt_cls([pd.NaT, pd.Timestamp("2011-01-01")], freq="D") with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, pd.Timestamp('2011-01-01').value], - freq='D') + dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D") def test_categorical_preserves_tz(self): # GH#18664 retain tz when going DTI-->Categorical-->DTI @@ -38,8 +43,8 @@ def test_categorical_preserves_tz(self): # once CategoricalIndex(DTA) works dti = pd.DatetimeIndex( - [pd.NaT, '2015-01-01', '1999-04-06 15:14:13', '2015-01-01'], - tz='US/Eastern') + [pd.NaT, "2015-01-01", "1999-04-06 15:14:13", "2015-01-01"], tz="US/Eastern" + ) ci = pd.CategoricalIndex(dti) carr = pd.Categorical(dti) @@ -51,7 +56,7 @@ def test_categorical_preserves_tz(self): def test_dti_with_period_data_raises(self): # GH#23675 - data = pd.PeriodIndex(['2016Q1', '2016Q2'], freq='Q') + data = pd.PeriodIndex(["2016Q1", "2016Q2"], freq="Q") with pytest.raises(TypeError, match="PeriodDtype data is invalid"): DatetimeIndex(data) @@ -67,59 +72,64 @@ def test_dti_with_period_data_raises(self): def test_dti_with_timedelta64_data_deprecation(self): # GH#23675 - data = np.array([0], dtype='m8[ns]') + data = np.array([0], dtype="m8[ns]") with tm.assert_produces_warning(FutureWarning): result = DatetimeIndex(data) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = to_datetime(data) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning): result = DatetimeIndex(pd.TimedeltaIndex(data)) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = to_datetime(pd.TimedeltaIndex(data)) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") def test_construction_caching(self): - df = pd.DataFrame({'dt': pd.date_range('20130101', periods=3), - 'dttz': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT, - pd.Timestamp('20130103')], - 'dtns': pd.date_range('20130101', periods=3, - freq='ns')}) - assert df.dttz.dtype.tz.zone == 'US/Eastern' - - @pytest.mark.parametrize('kwargs', [ - {'tz': 'dtype.tz'}, - {'dtype': 'dtype'}, - {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + df = pd.DataFrame( + { + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + assert df.dttz.dtype.tz.zone == "US/Eastern" + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) def test_construction_with_alt(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} result = 
DatetimeIndex(i, **kwargs) tm.assert_index_equal(i, result) - @pytest.mark.parametrize('kwargs', [ - {'tz': 'dtype.tz'}, - {'dtype': 'dtype'}, - {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} - if str(tz) in ('UTC', 'tzutc()', 'UTC+00:00'): + if str(tz) in ("UTC", "tzutc()", "UTC+00:00"): warn = None else: warn = FutureWarning @@ -130,81 +140,110 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tm.assert_index_equal(result, expected) # localize into the provided tz - i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC') - expected = i.tz_localize(None).tz_localize('UTC') + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz="UTC") + expected = i.tz_localize(None).tz_localize("UTC") tm.assert_index_equal(i2, expected) # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, - dtype=i.dtype, tz='US/Pacific') + DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # same tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], - name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # Different tz results in Index(dtype=object) - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = 
Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) - result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) # length = 1 - result = Index([Timestamp('2011-01-01')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') + result = Index([Timestamp("2011-01-01")], name="idx") + exp = DatetimeIndex([Timestamp("2011-01-01")], name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # length = 1 with tz - result = Index( - [Timestamp('2011-01-01 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00')], tz='Asia/Tokyo', - name='idx') + result = Index([Timestamp("2011-01-01 10:00", tz="Asia/Tokyo")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00")], tz="Asia/Tokyo", name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None @@ -212,70 +251,117 @@ def test_construction_index_with_mixed_timezones(self): def test_construction_index_with_mixed_timezones_with_NaT(self): # see gh-11488 - result = Index([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') + result = Index( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + exp = DatetimeIndex( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # Same tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00"), + ], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', 
tz='US/Eastern'), - pd.NaT, - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT, - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), pd.NaT, Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # different tz results in Index(dtype=object) - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) # all NaT - result = Index([pd.NaT, pd.NaT], name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') + result = Index([pd.NaT, pd.NaT], name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # all NaT with tz - result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') + result = Index([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) @@ -287,74 +373,109 @@ def test_construction_dti_with_mixed_timezones(self): # no tz results in DatetimeIndex result = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # same tz results in DatetimeIndex - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 
10:00'), - Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # same tz results in DatetimeIndex (DST) - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', - tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # tz mismatch affecting to tz-aware raises TypeError/ValueError with pytest.raises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - - msg = 'cannot be converted to datetime64' + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + + msg = "cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): - DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) with pytest.raises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='US/Eastern', name='idx') + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="US/Eastern", + name="idx", + ) with pytest.raises(ValueError, match=msg): # passing tz should results in DatetimeIndex, then mismatch raises # TypeError - Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') + Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) def test_construction_base_constructor(self): - arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + arr = [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) - arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + arr = [np.nan, pd.NaT, pd.Timestamp("2011-01-03")] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) def test_construction_outofbounds(self): # GH 13663 - dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1)] + 
dates = [ + datetime(3000, 1, 1), + datetime(4000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + ] exp = Index(dates, dtype=object) # coerces to object tm.assert_index_equal(Index(dates), exp) @@ -365,186 +486,202 @@ def test_construction_outofbounds(self): def test_construction_with_ndarray(self): # GH 5152 - dates = [datetime(2013, 10, 7), - datetime(2013, 10, 8), - datetime(2013, 10, 9)] + dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] data = DatetimeIndex(dates, freq=pd.offsets.BDay()).values result = DatetimeIndex(data, freq=pd.offsets.BDay()) - expected = DatetimeIndex(['2013-10-07', - '2013-10-08', - '2013-10-09'], - freq='B') + expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") tm.assert_index_equal(result, expected) def test_verify_integrity_deprecated(self): # GH#23919 with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(['1/1/2000'], verify_integrity=False) + DatetimeIndex(["1/1/2000"], verify_integrity=False) def test_range_kwargs_deprecated(self): # GH#23919 with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start='1/1/2000', end='1/10/2000', freq='D') + DatetimeIndex(start="1/1/2000", end="1/10/2000", freq="D") def test_integer_values_and_tz_deprecated(self): # GH-24559 values = np.array([946684800000000000]) with tm.assert_produces_warning(FutureWarning): - result = DatetimeIndex(values, tz='US/Central') - expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central") + result = DatetimeIndex(values, tz="US/Central") + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") tm.assert_index_equal(result, expected) # but UTC is *not* deprecated. with tm.assert_produces_warning(None): - result = DatetimeIndex(values, tz='UTC') - expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central") + result = DatetimeIndex(values, tz="UTC") + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") def test_constructor_coverage(self): - rng = date_range('1/1/2000', periods=10.5) - exp = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10.5) + exp = date_range("1/1/2000", periods=10) tm.assert_index_equal(rng, exp) - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - date_range(start='1/1/2000', periods='foo', freq='D') + date_range(start="1/1/2000", periods="foo", freq="D") with pytest.raises(ValueError): with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start='1/1/2000', end='1/10/2000') + DatetimeIndex(start="1/1/2000", end="1/10/2000") with pytest.raises(TypeError): - DatetimeIndex('1/1/2000') + DatetimeIndex("1/1/2000") # generator expression gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) result = DatetimeIndex(gen) - expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) - for i in range(10)]) + expected = DatetimeIndex( + [datetime(2000, 1, 1) + timedelta(i) for i in range(10)] + ) tm.assert_index_equal(result, expected) # NumPy string array - strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) + strings = np.array(["2000-01-01", "2000-01-02", "2000-01-03"]) result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) + expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # string with NaT - strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) + strings = 
np.array(["2000-01-01", "2000-01-02", "NaT"]) result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) + expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # non-conforming - msg = ("Inferred frequency None from passed values does not conform" - " to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not conform" + " to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'], freq='D') + DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") - msg = ("Of the four parameters: start, end, periods, and freq, exactly" - " three must be specified") + msg = ( + "Of the four parameters: start, end, periods, and freq, exactly" + " three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start='2011-01-01', freq='b') + date_range(start="2011-01-01", freq="b") with pytest.raises(ValueError, match=msg): - date_range(end='2011-01-01', freq='B') + date_range(end="2011-01-01", freq="B") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq='D') + date_range(periods=10, freq="D") - @pytest.mark.parametrize('freq', ['AS', 'W-SUN']) + @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) def test_constructor_datetime64_tzformat(self, freq): # see GH#6572: ISO 8601 format results in pytz.FixedOffset - idx = date_range('2013-01-01T00:00:00-05:00', - '2016-01-01T23:59:59-05:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) + idx = date_range( + "2013-01-01T00:00:00-05:00", "2016-01-01T23:59:59-05:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) tm.assert_index_equal(idx, expected) # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - idx = date_range('2013-01-01T00:00:00+09:00', - '2016-01-01T23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) + idx = date_range( + "2013-01-01T00:00:00+09:00", "2016-01-01T23:59:59+09:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) # Non ISO 8601 format results in dateutil.tz.tzoffset - idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', - freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) + idx = date_range("2013/1/1 0:00:00-5:00", "2016/1/1 23:59:59-5:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) tm.assert_index_equal(idx, expected) # Unable to use `US/Eastern` because of DST - expected_i8 = 
date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - idx = date_range('2013/1/1 0:00:00+9:00', - '2016/1/1 23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) + idx = date_range("2013/1/1 0:00:00+9:00", "2016/1/1 23:59:59+09:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - expected = DatetimeIndex(['2013-01-01', '2013-01-02'] - ).tz_localize('US/Eastern') + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + expected = DatetimeIndex(["2013-01-01", "2013-01-02"]).tz_localize("US/Eastern") tm.assert_index_equal(idx, expected) - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - tz='US/Eastern') + idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern") tm.assert_index_equal(idx, expected) # if we already have a tz and its not the same, then raise - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - - msg = ("cannot supply both a tz and a timezone-naive dtype" - r" \(i\.e\. datetime64\[ns\]\)") + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + + msg = ( + "cannot supply both a tz and a timezone-naive dtype" + r" \(i\.e\. 
datetime64\[ns\]\)" + ) with pytest.raises(ValueError, match=msg): - DatetimeIndex(idx, dtype='datetime64[ns]') + DatetimeIndex(idx, dtype="datetime64[ns]") # this is effectively trying to convert tz's - msg = ("data is already tz-aware US/Eastern, unable to set specified" - " tz: CET") + msg = "data is already tz-aware US/Eastern, unable to set specified" " tz: CET" with pytest.raises(TypeError, match=msg): - DatetimeIndex(idx, dtype='datetime64[ns, CET]') + DatetimeIndex(idx, dtype="datetime64[ns, CET]") msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(idx, tz='CET', dtype='datetime64[ns, US/Eastern]') + DatetimeIndex(idx, tz="CET", dtype="datetime64[ns, US/Eastern]") - result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') + result = DatetimeIndex(idx, dtype="datetime64[ns, US/Eastern]") tm.assert_index_equal(idx, result) - @pytest.mark.parametrize('dtype', [object, np.int32, np.int64]) + @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) def test_constructor_invalid_dtype_raises(self, dtype): # GH 23986 with pytest.raises(ValueError): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): - idx = date_range(start='2000-01-01', periods=1, freq='A', - name='TEST') - assert idx.name == 'TEST' + idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") + assert idx.name == "TEST" def test_000constructor_resolution(self): # 2252 @@ -555,55 +692,68 @@ def test_000constructor_resolution(self): def test_disallow_setting_tz(self): # GH 3746 - dti = DatetimeIndex(['2010'], tz='UTC') + dti = DatetimeIndex(["2010"], tz="UTC") with pytest.raises(AttributeError): - dti.tz = pytz.timezone('US/Pacific') - - @pytest.mark.parametrize('tz', [ - None, 'America/Los_Angeles', pytz.timezone('America/Los_Angeles'), - Timestamp('2000', tz='America/Los_Angeles').tz]) + dti.tz = pytz.timezone("US/Pacific") + + @pytest.mark.parametrize( + "tz", + [ + None, + "America/Los_Angeles", + pytz.timezone("America/Los_Angeles"), + Timestamp("2000", tz="America/Los_Angeles").tz, + ], + ) def test_constructor_start_end_with_tz(self, tz): # GH 18595 - start = Timestamp('2013-01-01 06:00:00', tz='America/Los_Angeles') - end = Timestamp('2013-01-02 06:00:00', tz='America/Los_Angeles') - result = date_range(freq='D', start=start, end=end, tz=tz) - expected = DatetimeIndex(['2013-01-01 06:00:00', - '2013-01-02 06:00:00'], - tz='America/Los_Angeles') + start = Timestamp("2013-01-01 06:00:00", tz="America/Los_Angeles") + end = Timestamp("2013-01-02 06:00:00", tz="America/Los_Angeles") + result = date_range(freq="D", start=start, end=end, tz=tz) + expected = DatetimeIndex( + ["2013-01-01 06:00:00", "2013-01-02 06:00:00"], tz="America/Los_Angeles" + ) tm.assert_index_equal(result, expected) # Especially assert that the timezone is consistent for pytz - assert pytz.timezone('America/Los_Angeles') is result.tz + assert pytz.timezone("America/Los_Angeles") is result.tz - @pytest.mark.parametrize('tz', ['US/Pacific', 'US/Eastern', 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp('2010', tz=tz).tz - result = DatetimeIndex(['2010'], tz=non_norm_tz) + non_norm_tz = Timestamp("2010", tz=tz).tz + result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz def test_constructor_timestamp_near_dst(self): # GH 20854 - ts = [Timestamp('2016-10-30 03:00:00+0300', 
tz='Europe/Helsinki'), - Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')] + ts = [ + Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), + Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), + ] result = DatetimeIndex(ts) - expected = DatetimeIndex([ts[0].to_pydatetime(), - ts[1].to_pydatetime()]) + expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) # TODO(GH-24559): Remove the xfail for the tz-aware case. - @pytest.mark.parametrize('klass', [Index, DatetimeIndex]) - @pytest.mark.parametrize('box', [ - np.array, partial(np.array, dtype=object), list]) - @pytest.mark.parametrize('tz, dtype', [ - pytest.param('US/Pacific', 'datetime64[ns, US/Pacific]', - marks=[pytest.mark.xfail(), - pytest.mark.filterwarnings( - "ignore:\\n Passing:FutureWarning")]), - [None, 'datetime64[ns]'], - ]) + @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) + @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) + @pytest.mark.parametrize( + "tz, dtype", + [ + pytest.param( + "US/Pacific", + "datetime64[ns, US/Pacific]", + marks=[ + pytest.mark.xfail(), + pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning"), + ], + ), + [None, "datetime64[ns]"], + ], + ) def test_constructor_with_int_tz(self, klass, box, tz, dtype): # GH 20997, 20964 - ts = Timestamp('2018-01-01', tz=tz) + ts = Timestamp("2018-01-01", tz=tz) result = klass(box([ts.value]), dtype=dtype) expected = klass([ts]) assert result == expected @@ -621,56 +771,61 @@ def test_construction_int_rountrip(self, tz_naive_fixture): def test_construction_from_replaced_timestamps_with_dst(self): # GH 18785 - index = pd.date_range(pd.Timestamp(2000, 1, 1), - pd.Timestamp(2005, 1, 1), - freq='MS', tz='Australia/Melbourne') - test = pd.DataFrame({'data': range(len(index))}, index=index) - test = test.resample('Y').mean() - result = pd.DatetimeIndex([x.replace(month=6, day=1) - for x in test.index]) - expected = pd.DatetimeIndex(['2000-06-01 00:00:00', - '2001-06-01 00:00:00', - '2002-06-01 00:00:00', - '2003-06-01 00:00:00', - '2004-06-01 00:00:00', - '2005-06-01 00:00:00'], - tz='Australia/Melbourne') + index = pd.date_range( + pd.Timestamp(2000, 1, 1), + pd.Timestamp(2005, 1, 1), + freq="MS", + tz="Australia/Melbourne", + ) + test = pd.DataFrame({"data": range(len(index))}, index=index) + test = test.resample("Y").mean() + result = pd.DatetimeIndex([x.replace(month=6, day=1) for x in test.index]) + expected = pd.DatetimeIndex( + [ + "2000-06-01 00:00:00", + "2001-06-01 00:00:00", + "2002-06-01 00:00:00", + "2003-06-01 00:00:00", + "2004-06-01 00:00:00", + "2005-06-01 00:00:00", + ], + tz="Australia/Melbourne", + ) tm.assert_index_equal(result, expected) def test_construction_with_tz_and_tz_aware_dti(self): # GH 23579 - dti = date_range('2016-01-01', periods=3, tz='US/Central') + dti = date_range("2016-01-01", periods=3, tz="US/Central") with pytest.raises(TypeError): - DatetimeIndex(dti, tz='Asia/Tokyo') + DatetimeIndex(dti, tz="Asia/Tokyo") def test_construction_with_nat_and_tzlocal(self): tz = dateutil.tz.tzlocal() - result = DatetimeIndex(['2018', 'NaT'], tz=tz) - expected = DatetimeIndex([Timestamp('2018', tz=tz), pd.NaT]) + result = DatetimeIndex(["2018", "NaT"], tz=tz) + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) tm.assert_index_equal(result, expected) def test_constructor_no_precision_warns(self): # GH-24753, GH-24739 - expected = pd.DatetimeIndex(['2000'], dtype='datetime64[ns]') + expected = 
pd.DatetimeIndex(["2000"], dtype="datetime64[ns]") # we set the stacklevel for DatetimeIndex with tm.assert_produces_warning(FutureWarning): - result = pd.DatetimeIndex(['2000'], dtype='datetime64') + result = pd.DatetimeIndex(["2000"], dtype="datetime64") tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = pd.Index(['2000'], dtype='datetime64') + result = pd.Index(["2000"], dtype="datetime64") tm.assert_index_equal(result, expected) def test_constructor_wrong_precision_raises(self): with pytest.raises(ValueError): - pd.DatetimeIndex(['2000'], dtype='datetime64[us]') + pd.DatetimeIndex(["2000"], dtype="datetime64[us]") class TestTimeSeries: - def test_dti_constructor_preserve_dti_freq(self): - rng = date_range('1/1/2000', '1/2/2000', freq='5min') + rng = date_range("1/1/2000", "1/2/2000", freq="5min") rng2 = DatetimeIndex(rng) assert rng.freq == rng2.freq @@ -678,43 +833,51 @@ def test_dti_constructor_preserve_dti_freq(self): def test_dti_constructor_years_only(self, tz_naive_fixture): tz = tz_naive_fixture # GH 6961 - rng1 = date_range('2014', '2015', freq='M', tz=tz) - expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz) + rng1 = date_range("2014", "2015", freq="M", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) - rng2 = date_range('2014', '2015', freq='MS', tz=tz) - expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', tz=tz) + rng2 = date_range("2014", "2015", freq="MS", tz=tz) + expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) - rng3 = date_range('2014', '2020', freq='A', tz=tz) - expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz) + rng3 = date_range("2014", "2020", freq="A", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) - rng4 = date_range('2014', '2020', freq='AS', tz=tz) - expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', tz=tz) + rng4 = date_range("2014", "2020", freq="AS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) - for rng, expected in [(rng1, expected1), (rng2, expected2), - (rng3, expected3), (rng4, expected4)]: + for rng, expected in [ + (rng1, expected1), + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: tm.assert_index_equal(rng, expected) def test_dti_constructor_small_int(self, any_int_dtype): # see gh-13721 - exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', - '1970-01-01 00:00:00.00000001', - '1970-01-01 00:00:00.00000002']) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.00000000", + "1970-01-01 00:00:00.00000001", + "1970-01-01 00:00:00.00000002", + ] + ) arr = np.array([0, 10, 20], dtype=any_int_dtype) tm.assert_index_equal(DatetimeIndex(arr), exp) def test_ctor_str_intraday(self): - rng = DatetimeIndex(['1-1-2000 00:00:01']) + rng = DatetimeIndex(["1-1-2000 00:00:01"]) assert rng[0].second == 1 def test_is_(self): - dti = date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") assert dti.is_(dti) assert dti.is_(dti.view()) assert not dti.is_(dti.copy()) def test_index_cast_datetime64_other_units(self): - arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") idx = Index(arr) assert (idx.values == conversion.ensure_datetime64ns(arr)).all() @@ -733,48 +896,48 @@ def test_constructor_int64_nocopy(self): arr[50:100] = -1 assert (index.asi8[50:100] != -1).all() - 
@pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', 'BH', - 'T', 'S', 'L', 'U', 'H', 'N', 'C']) + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "BH", "T", "S", "L", "U", "H", "N", "C"] + ) def test_from_freq_recreate_from_data(self, freq): - org = date_range(start='2001/02/01 09:00', freq=freq, periods=1) + org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) idx = DatetimeIndex(org, freq=freq) tm.assert_index_equal(idx, org) - org = date_range(start='2001/02/01 09:00', freq=freq, - tz='US/Pacific', periods=1) - idx = DatetimeIndex(org, freq=freq, tz='US/Pacific') + org = date_range( + start="2001/02/01 09:00", freq=freq, tz="US/Pacific", periods=1 + ) + idx = DatetimeIndex(org, freq=freq, tz="US/Pacific") tm.assert_index_equal(idx, org) def test_datetimeindex_constructor_misc(self): - arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"] msg = r"(\(')?Unknown string format(:', 'Jn 3, 2005'\))?" with pytest.raises(ValueError, match=msg): DatetimeIndex(arr) - arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + arr = ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] idx1 = DatetimeIndex(arr) - arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] + arr = [datetime(2005, 1, 1), "1/2/2005", "1/3/2005", "2005-01-04"] idx2 = DatetimeIndex(arr) - arr = [Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', - '2005-01-04'] + arr = [Timestamp(datetime(2005, 1, 1)), "1/2/2005", "1/3/2005", "2005-01-04"] idx3 = DatetimeIndex(arr) - arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', - '2005-01-04'], dtype='O') + arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) idx5 = DatetimeIndex(arr) - arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' - ]) + arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) idx6 = DatetimeIndex(arr) - idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) - idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, - yearfirst=True) + idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx8 = DatetimeIndex( + ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True + ) tm.assert_index_equal(idx7, idx8) for other in [idx2, idx3, idx4, idx5, idx6]: @@ -782,30 +945,31 @@ def test_datetimeindex_constructor_misc(self): sdate = datetime(1999, 12, 25) edate = datetime(2000, 1, 1) - idx = date_range(start=sdate, freq='1B', periods=20) + idx = date_range(start=sdate, freq="1B", periods=20) assert len(idx) == 20 assert idx[0] == sdate + 0 * offsets.BDay() - assert idx.freq == 'B' + assert idx.freq == "B" - idx = date_range(end=edate, freq=('D', 5), periods=20) + idx = date_range(end=edate, freq=("D", 5), periods=20) assert len(idx) == 20 assert idx[-1] == edate - assert idx.freq == '5D' + assert idx.freq == "5D" - idx1 = date_range(start=sdate, end=edate, freq='W-SUN') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.Week(weekday=6)) + idx1 = date_range(start=sdate, end=edate, freq="W-SUN") + idx2 = date_range(start=sdate, end=edate, freq=offsets.Week(weekday=6)) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq - idx1 = date_range(start=sdate, end=edate, freq='QS') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.QuarterBegin(startingMonth=1)) + idx1 = 
date_range(start=sdate, end=edate, freq="QS") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1) + ) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq - idx1 = date_range(start=sdate, end=edate, freq='BQ') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.BQuarterEnd(startingMonth=12)) + idx1 = date_range(start=sdate, end=edate, freq="BQ") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12) + ) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 1545cc52eb1f44..54c931cd60d20e 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -18,7 +18,13 @@ import pandas.util.testing as tm from pandas.tseries.offsets import ( - BDay, CDay, DateOffset, MonthEnd, generate_range, prefix_mapping) + BDay, + CDay, + DateOffset, + MonthEnd, + generate_range, + prefix_mapping, +) START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) @@ -30,50 +36,48 @@ class TestTimestampEquivDateRange: # pertinent cases. def test_date_range_timestamp_equiv(self): - rng = date_range('20090415', '20090519', tz='US/Eastern') + rng = date_range("20090415", "20090519", tz="US/Eastern") stamp = rng[0] - ts = Timestamp('20090415', tz='US/Eastern', freq='D') + ts = Timestamp("20090415", tz="US/Eastern", freq="D") assert ts == stamp def test_date_range_timestamp_equiv_dateutil(self): - rng = date_range('20090415', '20090519', tz='dateutil/US/Eastern') + rng = date_range("20090415", "20090519", tz="dateutil/US/Eastern") stamp = rng[0] - ts = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D') + ts = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") assert ts == stamp def test_date_range_timestamp_equiv_explicit_pytz(self): - rng = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) + rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) stamp = rng[0] - ts = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D') + ts = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") assert ts == stamp @td.skip_if_windows_python_3 def test_date_range_timestamp_equiv_explicit_dateutil(self): from pandas._libs.tslibs.timezones import dateutil_gettz as gettz - rng = date_range('20090415', '20090519', tz=gettz('US/Eastern')) + rng = date_range("20090415", "20090519", tz=gettz("US/Eastern")) stamp = rng[0] - ts = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') + ts = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") assert ts == stamp def test_date_range_timestamp_equiv_from_datetime_instance(self): datetime_instance = datetime(2014, 3, 4) # build a timestamp with a frequency, since then it supports # addition/subtraction of integers - timestamp_instance = date_range(datetime_instance, periods=1, - freq='D')[0] + timestamp_instance = date_range(datetime_instance, periods=1, freq="D")[0] - ts = Timestamp(datetime_instance, freq='D') + ts = Timestamp(datetime_instance, freq="D") assert ts == timestamp_instance def test_date_range_timestamp_equiv_preserve_frequency(self): - timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] - ts = Timestamp('2014-03-05', freq='D') + timestamp_instance = date_range("2014-03-05", periods=1, freq="D")[0] + ts = Timestamp("2014-03-05", freq="D") assert timestamp_instance == ts @@ -83,9 +87,9 @@ def test_date_range_nat(self): # GH#11587 
msg = "Neither `start` nor `end` can be NaT" with pytest.raises(ValueError, match=msg): - date_range(start='2016-01-01', end=pd.NaT, freq='D') + date_range(start="2016-01-01", end=pd.NaT, freq="D") with pytest.raises(ValueError, match=msg): - date_range(start=pd.NaT, end='2016-01-01', freq='D') + date_range(start=pd.NaT, end="2016-01-01", freq="D") def test_date_range_multiplication_overflow(self): # GH#24255 @@ -93,25 +97,25 @@ def test_date_range_multiplication_overflow(self): # are caught with tm.assert_produces_warning(None): # we should _not_ be seeing a overflow RuntimeWarning - dti = date_range(start='1677-09-22', periods=213503, freq='D') + dti = date_range(start="1677-09-22", periods=213503, freq="D") - assert dti[0] == Timestamp('1677-09-22') + assert dti[0] == Timestamp("1677-09-22") assert len(dti) == 213503 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range('1969-05-04', periods=200000000, freq='30000D') + date_range("1969-05-04", periods=200000000, freq="30000D") def test_date_range_unsigned_overflow_handling(self): # GH#24255 # case where `addend = periods * stride` overflows int64 bounds # but not uint64 bounds - dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') + dti = date_range(start="1677-09-22", end="2262-04-11", freq="D") - dti2 = date_range(start=dti[0], periods=len(dti), freq='D') + dti2 = date_range(start=dti[0], periods=len(dti), freq="D") assert dti2.equals(dti) - dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') + dti3 = date_range(end=dti[-1], periods=len(dti), freq="D") assert dti3.equals(dti) def test_date_range_int64_overflow_non_recoverable(self): @@ -119,217 +123,257 @@ def test_date_range_int64_overflow_non_recoverable(self): # case with start later than 1970-01-01, overflow int64 but not uint64 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(start='1970-02-01', periods=106752 * 24, freq='H') + date_range(start="1970-02-01", periods=106752 * 24, freq="H") # case with end before 1970-01-01, overflow int64 but not uint64 with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(end='1969-11-14', periods=106752 * 24, freq='H') + date_range(end="1969-11-14", periods=106752 * 24, freq="H") def test_date_range_int64_overflow_stride_endpoint_different_signs(self): # cases where stride * periods overflow int64 and stride/endpoint # have different signs - start = Timestamp('2262-02-23') - end = Timestamp('1969-11-14') + start = Timestamp("2262-02-23") + end = Timestamp("1969-11-14") - expected = date_range(start=start, end=end, freq='-1H') + expected = date_range(start=start, end=end, freq="-1H") assert expected[0] == start assert expected[-1] == end - dti = date_range(end=end, periods=len(expected), freq='-1H') + dti = date_range(end=end, periods=len(expected), freq="-1H") tm.assert_index_equal(dti, expected) - start2 = Timestamp('1970-02-01') - end2 = Timestamp('1677-10-22') + start2 = Timestamp("1970-02-01") + end2 = Timestamp("1677-10-22") - expected2 = date_range(start=start2, end=end2, freq='-1H') + expected2 = date_range(start=start2, end=end2, freq="-1H") assert expected2[0] == start2 assert expected2[-1] == end2 - dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') + dti2 = date_range(start=start2, periods=len(expected2), freq="-1H") tm.assert_index_equal(dti2, expected2) def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): - date_range('2016-01-01', periods=100000, 
freq='D') + date_range("2016-01-01", periods=100000, freq="D") with pytest.raises(OutOfBoundsDatetime): - date_range(end='1763-10-12', periods=100000, freq='D') + date_range(end="1763-10-12", periods=100000, freq="D") def test_date_range_gen_error(self): - rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') + rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") assert len(rng) == 4 @pytest.mark.parametrize("freq", ["AS", "YS"]) def test_begin_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-01-01", "2014-01-01", - "2015-01-01", "2016-01-01", - "2017-01-01"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], + freq=freq, + ) tm.assert_index_equal(rng, exp) @pytest.mark.parametrize("freq", ["A", "Y"]) def test_end_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", - "2015-12-31", "2016-12-31"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq + ) tm.assert_index_equal(rng, exp) @pytest.mark.parametrize("freq", ["BA", "BY"]) def test_business_end_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", - "2015-12-31", "2016-12-30"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq + ) tm.assert_index_equal(rng, exp) def test_date_range_negative_freq(self): # GH 11018 - rng = date_range('2011-12-31', freq='-2A', periods=3) - exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31', - '2007-12-31'], freq='-2A') + rng = date_range("2011-12-31", freq="-2A", periods=3) + exp = pd.DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") tm.assert_index_equal(rng, exp) - assert rng.freq == '-2A' + assert rng.freq == "-2A" - rng = date_range('2011-01-31', freq='-2M', periods=3) - exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30', - '2010-09-30'], freq='-2M') + rng = date_range("2011-01-31", freq="-2M", periods=3) + exp = pd.DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") tm.assert_index_equal(rng, exp) - assert rng.freq == '-2M' + assert rng.freq == "-2M" def test_date_range_bms_bug(self): # #1645 - rng = date_range('1/1/2000', periods=10, freq='BMS') + rng = date_range("1/1/2000", periods=10, freq="BMS") - ex_first = Timestamp('2000-01-03') + ex_first = Timestamp("2000-01-03") assert rng[0] == ex_first def test_date_range_normalize(self): snap = datetime.today() n = 50 - rng = date_range(snap, periods=n, normalize=False, freq='2D') + rng = date_range(snap, periods=n, normalize=False, freq="2D") offset = timedelta(2) values = DatetimeIndex([snap + i * offset for i in range(n)]) tm.assert_index_equal(rng, values) - rng = date_range('1/1/2000 08:15', periods=n, normalize=False, - freq='B') + rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") the_time = time(8, 15) for val in rng: assert val.time() == the_time def test_date_range_fy5252(self): - dr = date_range(start="2013-01-01", periods=2, freq=offsets.FY5253( - startingMonth=1, weekday=3, variation="nearest")) - assert dr[0] == Timestamp('2013-01-31') - assert dr[1] == Timestamp('2014-01-30') + dr = date_range( + start="2013-01-01", + periods=2, + freq=offsets.FY5253(startingMonth=1, weekday=3, variation="nearest"), + ) + assert dr[0] == 
Timestamp("2013-01-31") + assert dr[1] == Timestamp("2014-01-30") def test_date_range_ambiguous_arguments(self): # #2538 start = datetime(2011, 1, 1, 5, 3, 40) end = datetime(2011, 1, 1, 8, 9, 40) - msg = ('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start, end, periods=10, freq='s') + date_range(start, end, periods=10, freq="s") def test_date_range_convenience_periods(self): # GH 20808 - result = date_range('2018-04-24', '2018-04-27', periods=3) - expected = DatetimeIndex(['2018-04-24 00:00:00', - '2018-04-25 12:00:00', - '2018-04-27 00:00:00'], freq=None) + result = date_range("2018-04-24", "2018-04-27", periods=3) + expected = DatetimeIndex( + ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + freq=None, + ) tm.assert_index_equal(result, expected) # Test if spacing remains linear if tz changes to dst in range - result = date_range('2018-04-01 01:00:00', - '2018-04-01 04:00:00', - tz='Australia/Sydney', - periods=3) - expected = DatetimeIndex([Timestamp('2018-04-01 01:00:00+1100', - tz='Australia/Sydney'), - Timestamp('2018-04-01 02:00:00+1000', - tz='Australia/Sydney'), - Timestamp('2018-04-01 04:00:00+1000', - tz='Australia/Sydney')]) + result = date_range( + "2018-04-01 01:00:00", + "2018-04-01 04:00:00", + tz="Australia/Sydney", + periods=3, + ) + expected = DatetimeIndex( + [ + Timestamp("2018-04-01 01:00:00+1100", tz="Australia/Sydney"), + Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), + Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), + ] + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start,end,result_tz', [ - ['20180101', '20180103', 'US/Eastern'], - [datetime(2018, 1, 1), datetime(2018, 1, 3), 'US/Eastern'], - [Timestamp('20180101'), Timestamp('20180103'), 'US/Eastern'], - [Timestamp('20180101', tz='US/Eastern'), - Timestamp('20180103', tz='US/Eastern'), 'US/Eastern'], - [Timestamp('20180101', tz='US/Eastern'), - Timestamp('20180103', tz='US/Eastern'), None]]) + @pytest.mark.parametrize( + "start,end,result_tz", + [ + ["20180101", "20180103", "US/Eastern"], + [datetime(2018, 1, 1), datetime(2018, 1, 3), "US/Eastern"], + [Timestamp("20180101"), Timestamp("20180103"), "US/Eastern"], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + "US/Eastern", + ], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + None, + ], + ], + ) def test_date_range_linspacing_tz(self, start, end, result_tz): # GH 20983 result = date_range(start, end, periods=3, tz=result_tz) - expected = date_range('20180101', periods=3, freq='D', tz='US/Eastern') + expected = date_range("20180101", periods=3, freq="D", tz="US/Eastern") tm.assert_index_equal(result, expected) def test_date_range_businesshour(self): - idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', - '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', - '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00'], - freq='BH') - rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH') + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-04 
16:00", freq="BH") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex( - ['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH') - rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH') + idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', - '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', - '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00', - '2014-07-07 09:00', '2014-07-07 10:00', - '2014-07-07 11:00', - '2014-07-07 12:00', '2014-07-07 13:00', - '2014-07-07 14:00', - '2014-07-07 15:00', '2014-07-07 16:00', - '2014-07-08 09:00', '2014-07-08 10:00', - '2014-07-08 11:00', - '2014-07-08 12:00', '2014-07-08 13:00', - '2014-07-08 14:00', - '2014-07-08 15:00', '2014-07-08 16:00'], - freq='BH') - rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH') + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") tm.assert_index_equal(idx, rng) def test_range_misspecified(self): # GH #1095 - msg = ('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start='1/1/2000') + date_range(start="1/1/2000") with pytest.raises(ValueError, match=msg): - date_range(end='1/1/2000') + date_range(end="1/1/2000") with pytest.raises(ValueError, match=msg): date_range(periods=10) with pytest.raises(ValueError, match=msg): - date_range(start='1/1/2000', freq='H') + date_range(start="1/1/2000", freq="H") with pytest.raises(ValueError, match=msg): - date_range(end='1/1/2000', freq='H') + date_range(end="1/1/2000", freq="H") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq='H') + date_range(periods=10, freq="H") with pytest.raises(ValueError, match=msg): date_range() @@ -337,8 +381,9 @@ def test_range_misspecified(self): def test_compat_replace(self): # https://github.com/statsmodels/statsmodels/issues/3349 # replace should take ints/longs for compat - result = date_range(Timestamp('1960-04-01 00:00:00', freq='QS-JAN'), - periods=76, freq='QS-JAN') + result = date_range( + Timestamp("1960-04-01 00:00:00", freq="QS-JAN"), periods=76, freq="QS-JAN" + ) assert len(result) == 76 def test_catch_infinite_loop(self): @@ -346,42 +391,49 @@ def test_catch_infinite_loop(self): # blow up, don't loop forever msg = "Offset did not increment date" with pytest.raises(ValueError, match=msg): - date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), - freq=offset) + date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) - @pytest.mark.parametrize('periods', (1, 2)) + @pytest.mark.parametrize("periods", (1, 2)) def test_wom_len(self, periods): # 
https://github.com/pandas-dev/pandas/issues/20517 - res = date_range(start='20110101', periods=periods, freq='WOM-1MON') + res = date_range(start="20110101", periods=periods, freq="WOM-1MON") assert len(res) == periods def test_construct_over_dst(self): # GH 20854 - pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', - ambiguous=True) - pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', - ambiguous=False) - expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'), - pre_dst, - pst_dst] + pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=True + ) + pst_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=False + ) + expect_data = [ + Timestamp("2010-11-07 00:00:00", tz="US/Pacific"), + pre_dst, + pst_dst, + ] expected = DatetimeIndex(expect_data) - result = date_range(start='2010-11-7', periods=3, - freq='H', tz='US/Pacific') + result = date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") tm.assert_index_equal(result, expected) def test_construct_with_different_start_end_string_format(self): # GH 12064 - result = date_range('2013-01-01 00:00:00+09:00', - '2013/01/01 02:00:00+09:00', freq='H') - expected = DatetimeIndex([Timestamp('2013-01-01 00:00:00+09:00'), - Timestamp('2013-01-01 01:00:00+09:00'), - Timestamp('2013-01-01 02:00:00+09:00')]) + result = date_range( + "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + ) + expected = DatetimeIndex( + [ + Timestamp("2013-01-01 00:00:00+09:00"), + Timestamp("2013-01-01 01:00:00+09:00"), + Timestamp("2013-01-01 02:00:00+09:00"), + ] + ) tm.assert_index_equal(result, expected) def test_error_with_zero_monthends(self): - msg = r'Offset <0 \* MonthEnds> did not increment date' + msg = r"Offset <0 \* MonthEnds> did not increment date" with pytest.raises(ValueError, match=msg): - date_range('1/1/2000', '1/1/2001', freq=MonthEnd(0)) + date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) def test_range_bug(self): # GH #770 @@ -394,7 +446,7 @@ def test_range_bug(self): def test_range_tz_pytz(self): # see gh-2906 - tz = timezone('US/Eastern') + tz = timezone("US/Eastern") start = tz.localize(datetime(2011, 1, 1)) end = tz.localize(datetime(2011, 1, 3)) @@ -413,25 +465,36 @@ def test_range_tz_pytz(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize('start, end', [ - [Timestamp(datetime(2014, 3, 6), tz='US/Eastern'), - Timestamp(datetime(2014, 3, 12), tz='US/Eastern')], - [Timestamp(datetime(2013, 11, 1), tz='US/Eastern'), - Timestamp(datetime(2013, 11, 6), tz='US/Eastern')] - ]) + @pytest.mark.parametrize( + "start, end", + [ + [ + Timestamp(datetime(2014, 3, 6), tz="US/Eastern"), + Timestamp(datetime(2014, 3, 12), tz="US/Eastern"), + ], + [ + Timestamp(datetime(2013, 11, 1), tz="US/Eastern"), + Timestamp(datetime(2013, 11, 6), tz="US/Eastern"), + ], + ], + ) def test_range_tz_dst_straddle_pytz(self, start, end): - dr = date_range(start, end, freq='D') + dr = date_range(start, end, freq="D") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start, end, freq='D', tz='US/Eastern') + dr = date_range(start, end, freq="D", tz="US/Eastern") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start.replace(tzinfo=None), end.replace( - tzinfo=None), freq='D', tz='US/Eastern') + dr = date_range( + start.replace(tzinfo=None), + end.replace(tzinfo=None), + freq="D", + tz="US/Eastern", + ) assert dr[0] == start assert dr[-1] == end assert 
np.all(dr.hour == 0) @@ -441,27 +504,28 @@ def test_range_tz_dateutil(self): # Use maybe_get_tz to fix filename in tz under dateutil. from pandas._libs.tslibs.timezones import maybe_get_tz - tz = lambda x: maybe_get_tz('dateutil/' + x) - start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) - end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) + tz = lambda x: maybe_get_tz("dateutil/" + x) + + start = datetime(2011, 1, 1, tzinfo=tz("US/Eastern")) + end = datetime(2011, 1, 3, tzinfo=tz("US/Eastern")) dr = date_range(start=start, periods=3) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end dr = date_range(end=end, periods=3) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end dr = date_range(start=start, end=end) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize('freq', ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) def test_range_closed(self, freq): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -482,8 +546,8 @@ def test_range_closed(self, freq): def test_range_closed_with_tz_aware_start_end(self): # GH12409, GH12684 - begin = Timestamp('2011/1/1', tz='US/Eastern') - end = Timestamp('2014/1/1', tz='US/Eastern') + begin = Timestamp("2011/1/1", tz="US/Eastern") + end = Timestamp("2014/1/1", tz="US/Eastern") for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) @@ -500,18 +564,15 @@ def test_range_closed_with_tz_aware_start_end(self): tm.assert_index_equal(expected_left, left) tm.assert_index_equal(expected_right, right) - begin = Timestamp('2011/1/1') - end = Timestamp('2014/1/1') - begintz = Timestamp('2011/1/1', tz='US/Eastern') - endtz = Timestamp('2014/1/1', tz='US/Eastern') + begin = Timestamp("2011/1/1") + end = Timestamp("2014/1/1") + begintz = Timestamp("2011/1/1", tz="US/Eastern") + endtz = Timestamp("2014/1/1", tz="US/Eastern") for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq, - tz='US/Eastern') - left = date_range(begin, end, closed="left", freq=freq, - tz='US/Eastern') - right = date_range(begin, end, closed="right", freq=freq, - tz='US/Eastern') + closed = date_range(begin, end, closed=None, freq=freq, tz="US/Eastern") + left = date_range(begin, end, closed="left", freq=freq, tz="US/Eastern") + right = date_range(begin, end, closed="right", freq=freq, tz="US/Eastern") expected_left = left expected_right = right @@ -523,20 +584,23 @@ def test_range_closed_with_tz_aware_start_end(self): tm.assert_index_equal(expected_left, left) tm.assert_index_equal(expected_right, right) - @pytest.mark.parametrize('closed', ['right', 'left', None]) + @pytest.mark.parametrize("closed", ["right", "left", None]) def test_range_closed_boundary(self, closed): # GH#11804 - right_boundary = date_range('2015-09-12', '2015-12-01', - freq='QS-MAR', closed=closed) - left_boundary = date_range('2015-09-01', '2015-09-12', - freq='QS-MAR', closed=closed) - both_boundary = date_range('2015-09-01', '2015-12-01', - freq='QS-MAR', closed=closed) + right_boundary = date_range( + "2015-09-12", "2015-12-01", freq="QS-MAR", closed=closed + ) + left_boundary = date_range( + "2015-09-01", "2015-09-12", freq="QS-MAR", closed=closed + ) + both_boundary = date_range( + "2015-09-01", "2015-12-01", freq="QS-MAR", closed=closed + ) 
expected_right = expected_left = expected_both = both_boundary - if closed == 'right': + if closed == "right": expected_left = both_boundary[1:] - if closed == 'left': + if closed == "left": expected_right = both_boundary[:-1] if closed is None: expected_right = both_boundary[1:] @@ -548,66 +612,65 @@ def test_range_closed_boundary(self, closed): def test_years_only(self): # GH 6961 - dr = date_range('2014', '2015', freq='M') + dr = date_range("2014", "2015", freq="M") assert dr[0] == datetime(2014, 1, 31) assert dr[-1] == datetime(2014, 12, 31) def test_freq_divides_end_in_nanos(self): # GH 10885 - result_1 = date_range('2005-01-12 10:00', '2005-01-12 16:00', - freq='345min') - result_2 = date_range('2005-01-13 10:00', '2005-01-13 16:00', - freq='345min') - expected_1 = DatetimeIndex(['2005-01-12 10:00:00', - '2005-01-12 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) - expected_2 = DatetimeIndex(['2005-01-13 10:00:00', - '2005-01-13 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) + result_1 = date_range("2005-01-12 10:00", "2005-01-12 16:00", freq="345min") + result_2 = date_range("2005-01-13 10:00", "2005-01-13 16:00", freq="345min") + expected_1 = DatetimeIndex( + ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) + expected_2 = DatetimeIndex( + ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) tm.assert_index_equal(result_1, expected_1) tm.assert_index_equal(result_2, expected_2) def test_cached_range_bug(self): - rng = date_range('2010-09-01 05:00:00', periods=50, - freq=DateOffset(hours=6)) + rng = date_range("2010-09-01 05:00:00", periods=50, freq=DateOffset(hours=6)) assert len(rng) == 50 assert rng[0] == datetime(2010, 9, 1, 5) def test_timezone_comparaison_bug(self): # smoke test - start = Timestamp('20130220 10:00', tz='US/Eastern') - result = date_range(start, periods=2, tz='US/Eastern') + start = Timestamp("20130220 10:00", tz="US/Eastern") + result = date_range(start, periods=2, tz="US/Eastern") assert len(result) == 2 def test_timezone_comparaison_assert(self): - start = Timestamp('20130220 10:00', tz='US/Eastern') - msg = 'Inferred time zone not equal to passed time zone' + start = Timestamp("20130220 10:00", tz="US/Eastern") + msg = "Inferred time zone not equal to passed time zone" with pytest.raises(AssertionError, match=msg): - date_range(start, periods=2, tz='Europe/Berlin') + date_range(start, periods=2, tz="Europe/Berlin") - def test_negative_non_tick_frequency_descending_dates(self, - tz_aware_fixture): + def test_negative_non_tick_frequency_descending_dates(self, tz_aware_fixture): # GH 23270 tz = tz_aware_fixture - result = pd.date_range(start='2011-06-01', end='2011-01-01', - freq='-1MS', tz=tz) - expected = pd.date_range(end='2011-06-01', start='2011-01-01', - freq='1MS', tz=tz)[::-1] + result = pd.date_range(start="2011-06-01", end="2011-01-01", freq="-1MS", tz=tz) + expected = pd.date_range( + end="2011-06-01", start="2011-01-01", freq="1MS", tz=tz + )[::-1] tm.assert_index_equal(result, expected) class TestGenRangeGeneration: - def test_generate(self): rng1 = list(generate_range(START, END, offset=BDay())) - rng2 = list(generate_range(START, END, offset='B')) + rng2 = list(generate_range(START, END, offset="B")) assert rng1 == rng2 def test_generate_cday(self): rng1 = list(generate_range(START, END, offset=CDay())) - rng2 = list(generate_range(START, END, offset='C')) + rng2 = list(generate_range(START, END, offset="C")) 
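[Editor's aside, not part of the patch] generate_range is internal, but the property asserted here, that an offset object and its one-letter string alias describe the same frequency, also holds for the public bdate_range API. A hedged sketch under the assumption of a stock pandas install of this vintage; "B" is the alias for BDay and "C" for CDay (CustomBusinessDay), and the start date below is arbitrary, not the START constant used by the tests.

    import pandas as pd
    from pandas.tseries.offsets import BDay, CDay

    # Business-day frequency: offset object vs. its string alias.
    by_obj = pd.bdate_range("2008-01-01", periods=10, freq=BDay())
    by_alias = pd.bdate_range("2008-01-01", periods=10, freq="B")
    assert by_obj.equals(by_alias)

    # Custom business days behave the same way with the default weekmask.
    cday_obj = pd.bdate_range("2008-01-01", periods=10, freq=CDay())
    cday_alias = pd.bdate_range("2008-01-01", periods=10, freq="C")
    assert cday_obj.equals(cday_alias)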
assert rng1 == rng2 def test_1(self): @@ -616,48 +679,62 @@ def test_1(self): assert rng == expected def test_2(self): - rng = list(generate_range(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3))) - expected = [datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)] + rng = list(generate_range(start=datetime(2008, 1, 1), end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), datetime(2008, 1, 2), datetime(2008, 1, 3)] assert rng == expected def test_3(self): - rng = list(generate_range(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6))) + rng = list(generate_range(start=datetime(2008, 1, 5), end=datetime(2008, 1, 6))) expected = [] assert rng == expected def test_precision_finer_than_offset(self): # GH#9907 - result1 = pd.date_range(start='2015-04-15 00:00:03', - end='2016-04-22 00:00:00', freq='Q') - result2 = pd.date_range(start='2015-04-15 00:00:03', - end='2015-06-22 00:00:04', freq='W') - expected1_list = ['2015-06-30 00:00:03', '2015-09-30 00:00:03', - '2015-12-31 00:00:03', '2016-03-31 00:00:03'] - expected2_list = ['2015-04-19 00:00:03', '2015-04-26 00:00:03', - '2015-05-03 00:00:03', '2015-05-10 00:00:03', - '2015-05-17 00:00:03', '2015-05-24 00:00:03', - '2015-05-31 00:00:03', '2015-06-07 00:00:03', - '2015-06-14 00:00:03', '2015-06-21 00:00:03'] - expected1 = DatetimeIndex(expected1_list, dtype='datetime64[ns]', - freq='Q-DEC', tz=None) - expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', - freq='W-SUN', tz=None) + result1 = pd.date_range( + start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="Q" + ) + result2 = pd.date_range( + start="2015-04-15 00:00:03", end="2015-06-22 00:00:04", freq="W" + ) + expected1_list = [ + "2015-06-30 00:00:03", + "2015-09-30 00:00:03", + "2015-12-31 00:00:03", + "2016-03-31 00:00:03", + ] + expected2_list = [ + "2015-04-19 00:00:03", + "2015-04-26 00:00:03", + "2015-05-03 00:00:03", + "2015-05-10 00:00:03", + "2015-05-17 00:00:03", + "2015-05-24 00:00:03", + "2015-05-31 00:00:03", + "2015-06-07 00:00:03", + "2015-06-14 00:00:03", + "2015-06-21 00:00:03", + ] + expected1 = DatetimeIndex( + expected1_list, dtype="datetime64[ns]", freq="Q-DEC", tz=None + ) + expected2 = DatetimeIndex( + expected2_list, dtype="datetime64[ns]", freq="W-SUN", tz=None + ) tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - dt1, dt2 = '2017-01-01', '2017-01-01' - tz1, tz2 = 'US/Eastern', 'Europe/London' - - @pytest.mark.parametrize("start,end", [ - (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), - (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), - (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), - (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) - ]) + dt1, dt2 = "2017-01-01", "2017-01-01" + tz1, tz2 = "US/Eastern", "Europe/London" + + @pytest.mark.parametrize( + "start,end", + [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)), + ], + ) def test_mismatching_tz_raises_err(self, start, end): # issue 18488 with pytest.raises(TypeError): @@ -667,20 +744,19 @@ def test_mismatching_tz_raises_err(self, start, end): class TestBusinessDateRange: - def test_constructor(self): bdate_range(START, END, freq=BDay()) bdate_range(START, periods=20, freq=BDay()) bdate_range(end=START, periods=20, freq=BDay()) - msg = 'periods must be a number, got B' + msg = "periods must be a number, got B" with pytest.raises(TypeError, 
match=msg): - date_range('2011-1-1', '2012-1-1', 'B') + date_range("2011-1-1", "2012-1-1", "B") with pytest.raises(TypeError, match=msg): - bdate_range('2011-1-1', '2012-1-1', 'B') + bdate_range("2011-1-1", "2012-1-1", "B") - msg = 'freq must be specified for bdate_range; use date_range instead' + msg = "freq must be specified for bdate_range; use date_range instead" with pytest.raises(TypeError, match=msg): bdate_range(START, END, periods=10, freq=None) @@ -688,7 +764,7 @@ def test_naive_aware_conflicts(self): naive = bdate_range(START, END, freq=BDay(), tz=None) aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") - msg = 'tz-naive.*tz-aware' + msg = "tz-naive.*tz-aware" with pytest.raises(TypeError, match=msg): naive.join(aware) @@ -705,7 +781,7 @@ def test_misc(self): assert dr[-1] == end def test_date_parse_failure(self): - badly_formed_date = '2007/100/1' + badly_formed_date = "2007/100/1" with pytest.raises(ValueError): Timestamp(badly_formed_date) @@ -721,56 +797,55 @@ def test_date_parse_failure(self): def test_daterange_bug_456(self): # GH #456 - rng1 = bdate_range('12/5/2011', '12/5/2011') - rng2 = bdate_range('12/2/2011', '12/5/2011') + rng1 = bdate_range("12/5/2011", "12/5/2011") + rng2 = bdate_range("12/2/2011", "12/5/2011") rng2.freq = BDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) - @pytest.mark.parametrize('closed', ['left', 'right']) + @pytest.mark.parametrize("closed", ["left", "right"]) def test_bdays_and_open_boundaries(self, closed): # GH 6673 - start = '2018-07-21' # Saturday - end = '2018-07-29' # Sunday - result = pd.date_range(start, end, freq='B', closed=closed) + start = "2018-07-21" # Saturday + end = "2018-07-29" # Sunday + result = pd.date_range(start, end, freq="B", closed=closed) - bday_start = '2018-07-23' # Monday - bday_end = '2018-07-27' # Friday - expected = pd.date_range(bday_start, bday_end, freq='D') + bday_start = "2018-07-23" # Monday + bday_end = "2018-07-27" # Friday + expected = pd.date_range(bday_start, bday_end, freq="D") tm.assert_index_equal(result, expected) def test_bday_near_overflow(self): # GH#24252 avoid doing unnecessary addition that _would_ overflow start = pd.Timestamp.max.floor("D").to_pydatetime() - rng = pd.date_range(start, end=None, periods=1, freq='B') - expected = pd.DatetimeIndex([start], freq='B') + rng = pd.date_range(start, end=None, periods=1, freq="B") + expected = pd.DatetimeIndex([start], freq="B") tm.assert_index_equal(rng, expected) def test_bday_overflow_error(self): # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError start = pd.Timestamp.max.floor("D").to_pydatetime() with pytest.raises(OutOfBoundsDatetime): - pd.date_range(start, periods=2, freq='B') + pd.date_range(start, periods=2, freq="B") class TestCustomDateRange: - def test_constructor(self): bdate_range(START, END, freq=CDay()) bdate_range(START, periods=20, freq=CDay()) bdate_range(end=START, periods=20, freq=CDay()) - msg = 'periods must be a number, got C' + msg = "periods must be a number, got C" with pytest.raises(TypeError, match=msg): - date_range('2011-1-1', '2012-1-1', 'C') + date_range("2011-1-1", "2012-1-1", "C") with pytest.raises(TypeError, match=msg): - bdate_range('2011-1-1', '2012-1-1', 'C') + bdate_range("2011-1-1", "2012-1-1", "C") def test_misc(self): end = datetime(2009, 5, 13) - dr = bdate_range(end=end, periods=20, freq='C') + dr = bdate_range(end=end, periods=20, freq="C") firstDate = end - 19 * CDay() assert len(dr) == 20 @@ -779,77 +854,95 @@ def test_misc(self): def 
test_daterange_bug_456(self): # GH #456 - rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C') - rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C') + rng1 = bdate_range("12/5/2011", "12/5/2011", freq="C") + rng2 = bdate_range("12/2/2011", "12/5/2011", freq="C") rng2.freq = CDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) def test_cdaterange(self): - result = bdate_range('2013-05-01', periods=3, freq='C') - expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) + result = bdate_range("2013-05-01", periods=3, freq="C") + expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-03"]) tm.assert_index_equal(result, expected) def test_cdaterange_weekmask(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - weekmask='Sun Mon Tue Wed Thu') - expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) + result = bdate_range( + "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu" + ) + expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-05"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu') + bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu") def test_cdaterange_holidays(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - holidays=['2013-05-01']) - expected = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) + result = bdate_range("2013-05-01", periods=3, freq="C", holidays=["2013-05-01"]) + expected = DatetimeIndex(["2013-05-02", "2013-05-03", "2013-05-06"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) + bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"]) def test_cdaterange_weekmask_and_holidays(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - expected = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) + result = bdate_range( + "2013-05-01", + periods=3, + freq="C", + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + ) + expected = DatetimeIndex(["2013-05-02", "2013-05-05", "2013-05-06"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - - @pytest.mark.parametrize('freq', [freq for freq in prefix_mapping - if freq.startswith('C')]) + bdate_range( + "2013-05-01", + periods=3, + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + ) + + @pytest.mark.parametrize( + "freq", [freq for freq in prefix_mapping if freq.startswith("C")] + ) def test_all_custom_freq(self, 
freq): # should not raise - bdate_range(START, END, freq=freq, weekmask='Mon Wed Fri', - holidays=['2009-03-14']) + bdate_range( + START, END, freq=freq, weekmask="Mon Wed Fri", holidays=["2009-03-14"] + ) - bad_freq = freq + 'FOO' - msg = 'invalid custom frequency string: {freq}' + bad_freq = freq + "FOO" + msg = "invalid custom frequency string: {freq}" with pytest.raises(ValueError, match=msg.format(freq=bad_freq)): bdate_range(START, END, freq=bad_freq) - @pytest.mark.parametrize('start_end', [ - ('2018-01-01T00:00:01.000Z', '2018-01-03T00:00:01.000Z'), - ('2018-01-01T00:00:00.010Z', '2018-01-03T00:00:00.010Z'), - ('2001-01-01T00:00:00.010Z', '2001-01-03T00:00:00.010Z')]) + @pytest.mark.parametrize( + "start_end", + [ + ("2018-01-01T00:00:01.000Z", "2018-01-03T00:00:01.000Z"), + ("2018-01-01T00:00:00.010Z", "2018-01-03T00:00:00.010Z"), + ("2001-01-01T00:00:00.010Z", "2001-01-03T00:00:00.010Z"), + ], + ) def test_range_with_millisecond_resolution(self, start_end): # https://github.com/pandas-dev/pandas/issues/24110 start, end = start_end - result = pd.date_range(start=start, end=end, periods=2, closed='left') + result = pd.date_range(start=start, end=end, periods=2, closed="left") expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 01649cb4646de4..aeff489861f5dd 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -5,8 +5,7 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets) +from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -14,20 +13,19 @@ class TestDatetimeIndex: - def test_roundtrip_pickle_with_tz(self): # GH 8367 # round-trip of timezone - index = date_range('20130101', periods=3, tz='US/Eastern', name='foo') + index = date_range("20130101", periods=3, tz="US/Eastern", name="foo") unpickled = tm.round_trip_pickle(index) tm.assert_index_equal(index, unpickled) def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): # GH7774 - index = date_range('20130101', periods=3, tz='US/Eastern') - assert str(index.reindex([])[0].tz) == 'US/Eastern' - assert str(index.reindex(np.array([]))[0].tz) == 'US/Eastern' + index = date_range("20130101", periods=3, tz="US/Eastern") + assert str(index.reindex([])[0].tz) == "US/Eastern" + assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" def test_time_loc(self): # GH8667 from datetime import time @@ -39,12 +37,11 @@ def test_time_loc(self): # GH8667 step = 24 * 3600 for n in ns: - idx = pd.date_range('2014-11-26', periods=n, freq='S') + idx = pd.date_range("2014-11-26", periods=n, freq="S") ts = pd.Series(np.random.randn(n), index=idx) i = np.arange(start, n, step) - tm.assert_numpy_array_equal(ts.index.get_loc(key), i, - check_dtype=False) + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, check_dtype=False) tm.assert_series_equal(ts[key], ts.iloc[i]) left, right = ts.copy(), ts.copy() @@ -61,10 +58,10 @@ def test_time_overflow_for_32bit_machines(self): # overflow. 
periods = np.int_(1000) - idx1 = pd.date_range(start='2000', periods=periods, freq='S') + idx1 = pd.date_range(start="2000", periods=periods, freq="S") assert len(idx1) == periods - idx2 = pd.date_range(end='2000', periods=periods, freq='S') + idx2 = pd.date_range(end="2000", periods=periods, freq="S") assert len(idx2) == periods def test_nat(self): @@ -85,62 +82,64 @@ def test_week_of_month_frequency(self): tm.assert_index_equal(result_union, expected) # GH 5115 - result = date_range("2013-1-1", periods=4, freq='WOM-1SAT') - dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06'] - expected = DatetimeIndex(dates, freq='WOM-1SAT') + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT") + dates = ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"] + expected = DatetimeIndex(dates, freq="WOM-1SAT") tm.assert_index_equal(result, expected) def test_hash_error(self): - index = date_range('20010101', periods=10) - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + index = date_range("20010101", periods=10) + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(index) def test_stringified_slice_with_tz(self): # GH#2658 - start = '2013-01-07' - idx = date_range(start=start, freq="1d", periods=10, tz='US/Eastern') + start = "2013-01-07" + idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here def test_append_join_nondatetimeindex(self): - rng = date_range('1/1/2000', periods=10) - idx = Index(['a', 'b', 'c', 'd']) + rng = date_range("1/1/2000", periods=10) + idx = Index(["a", "b", "c", "d"]) result = rng.append(idx) assert isinstance(result[0], Timestamp) # it works - rng.join(idx, how='outer') + rng.join(idx, how="outer") def test_map(self): - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) - f = lambda x: x.strftime('%Y%m%d') + f = lambda x: x.strftime("%Y%m%d") result = rng.map(f) - exp = Index([f(x) for x in rng], dtype='= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -253,28 +280,33 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_take_fill_value_with_timezone(self): - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', tz='US/Eastern') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, 
fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -285,56 +317,72 @@ def test_take_fill_value_with_timezone(self): class TestDatetimeIndex: - @pytest.mark.parametrize('null', [None, np.nan, pd.NaT]) - @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern']) + @pytest.mark.parametrize("null", [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_nat(self, tz, null): # GH#16537, GH#18295 (test missing) - idx = pd.DatetimeIndex(['2017-01-01'], tz=tz) - expected = pd.DatetimeIndex(['NaT', '2017-01-01'], tz=tz) + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + expected = pd.DatetimeIndex(["NaT", "2017-01-01"], tz=tz) res = idx.insert(0, null) tm.assert_index_equal(res, expected) def test_insert(self): - idx = DatetimeIndex( - ['2000-01-04', '2000-01-01', '2000-01-02'], name='idx') + idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") result = idx.insert(2, datetime(2000, 1, 5)) - exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05', - '2000-01-02'], name='idx') + exp = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" + ) tm.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([datetime(2000, 1, 4), 'inserted', - datetime(2000, 1, 1), - datetime(2000, 1, 2)], name='idx') + result = idx.insert(1, "inserted") + expected = Index( + [ + datetime(2000, 1, 4), + "inserted", + datetime(2000, 1, 1), + datetime(2000, 1, 2), + ], + name="idx", + ) assert not isinstance(result, DatetimeIndex) tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = date_range('1/1/2000', periods=3, freq='M', name='idx') + idx = date_range("1/1/2000", periods=3, freq="M", name="idx") # preserve freq - expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29', - '2000-03-31'], name='idx', freq='M') - expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', - '2000-04-30'], name='idx', freq='M') + expected_0 = DatetimeIndex( + ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq="M", + ) + expected_3 = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], + name="idx", + freq="M", + ) # reset freq to None - expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31', - '2000-02-29', - '2000-03-31'], name='idx', - freq=None) - expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29', - '2000-03-31', - '2000-01-02'], name='idx', - freq=None) - - cases = [(0, datetime(1999, 12, 31), expected_0), - (-3, datetime(1999, 12, 31), expected_0), - (3, datetime(2000, 4, 30), expected_3), - (1, datetime(2000, 1, 31), expected_1_nofreq), - (3, datetime(2000, 1, 2), expected_3_nofreq)] + expected_1_nofreq = DatetimeIndex( + ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq=None, + ) + expected_3_nofreq = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + + cases = [ + (0, datetime(1999, 12, 31), expected_0), 
+ (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq), + ] for n, d, expected in cases: result = idx.insert(n, d) @@ -344,33 +392,36 @@ def test_insert(self): # reset freq to None result = idx.insert(3, datetime(2000, 1, 2)) - expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', - '2000-01-02'], name='idx', freq=None) + expected = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq is None # see gh-7299 - idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo', - name='idx') + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") with pytest.raises(ValueError): - idx.insert(3, pd.Timestamp('2000-01-04')) + idx.insert(3, pd.Timestamp("2000-01-04")) with pytest.raises(ValueError): idx.insert(3, datetime(2000, 1, 4)) with pytest.raises(ValueError): - idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern')) + idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) with pytest.raises(ValueError): - idx.insert(3, datetime(2000, 1, 4, - tzinfo=pytz.timezone('US/Eastern'))) + idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) - for tz in ['US/Pacific', 'Asia/Singapore']: - idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz, - name='idx') + for tz in ["US/Pacific", "Asia/Singapore"]: + idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") # preserve freq - expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz, - name='idx') - for d in [pd.Timestamp('2000-01-01 15:00', tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]: + expected = date_range( + "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + ) + for d in [ + pd.Timestamp("2000-01-01 15:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), + ]: result = idx.insert(6, d) tm.assert_index_equal(result, expected) @@ -378,15 +429,25 @@ def test_insert(self): assert result.freq == expected.freq assert result.tz == expected.tz - expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', - '2000-01-01 12:00', '2000-01-01 13:00', - '2000-01-01 14:00', - '2000-01-01 10:00'], name='idx', - tz=tz, freq=None) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ) # reset freq to None - for d in [pd.Timestamp('2000-01-01 10:00', tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]: + for d in [ + pd.Timestamp("2000-01-01 10:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), + ]: result = idx.insert(6, d) tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -394,23 +455,26 @@ def test_insert(self): assert result.freq is None def test_delete(self): - idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') + idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") # prserve freq - expected_0 = date_range(start='2000-02-01', periods=4, freq='M', - name='idx') - expected_4 = date_range(start='2000-01-01', periods=4, freq='M', - name='idx') + expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") + expected_4 = 
date_range(start="2000-01-01", periods=4, freq="M", name="idx") # reset freq to None - expected_1 = DatetimeIndex(['2000-01-31', '2000-03-31', '2000-04-30', - '2000-05-31'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -421,43 +485,56 @@ def test_delete(self): # either depending on numpy version idx.delete(5) - for tz in [None, 'Asia/Tokyo', 'US/Pacific']: - idx = date_range(start='2000-01-01 09:00', periods=10, freq='H', - name='idx', tz=tz) + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ) - expected = date_range(start='2000-01-01 10:00', periods=9, - freq='H', name='idx', tz=tz) + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz + ) result = idx.delete(0) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == 'H' + assert result.freqstr == "H" assert result.tz == expected.tz - expected = date_range(start='2000-01-01 09:00', periods=9, - freq='H', name='idx', tz=tz) + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + ) result = idx.delete(-1) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == 'H' + assert result.freqstr == "H" assert result.tz == expected.tz def test_delete_slice(self): - idx = date_range(start='2000-01-01', periods=10, freq='D', name='idx') + idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") # prserve freq - expected_0_2 = date_range(start='2000-01-04', periods=7, freq='D', - name='idx') - expected_7_9 = date_range(start='2000-01-01', periods=7, freq='D', - name='idx') + expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") + expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") # reset freq to None - expected_3_5 = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', - '2000-01-07', '2000-01-08', '2000-01-09', - '2000-01-10'], freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -469,13 +546,18 @@ def test_delete_slice(self): assert result.name == expected.name assert result.freq == expected.freq - for tz in [None, 'Asia/Tokyo', 'US/Pacific']: - ts = pd.Series(1, index=pd.date_range( - '2000-01-01 09:00', periods=10, freq='H', name='idx', tz=tz)) + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + ts = pd.Series( + 1, + index=pd.date_range( + "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ), + ) # preserve freq result = ts.drop(ts.index[:5]).index - expected = pd.date_range('2000-01-01 14:00', periods=5, freq='H', - name='idx', tz=tz) + expected = pd.date_range( + "2000-01-01 14:00", 
periods=5, freq="H", name="idx", tz=tz + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq @@ -483,128 +565,155 @@ def test_delete_slice(self): # reset freq to None result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index - expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 11:00', - '2000-01-01 13:00', - '2000-01-01 15:00', '2000-01-01 17:00'], - freq=None, name='idx', tz=tz) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + "2000-01-01 15:00", + "2000-01-01 17:00", + ], + freq=None, + name="idx", + tz=tz, + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq assert result.tz == expected.tz def test_get_loc(self): - idx = pd.date_range('2000-01-01', periods=3) + idx = pd.date_range("2000-01-01", periods=3) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 if method is not None: - assert idx.get_loc(idx[1], method, - tolerance=pd.Timedelta('0 days')) == 1 - - assert idx.get_loc('2000-01-01', method='nearest') == 0 - assert idx.get_loc('2000-01-01T12', method='nearest') == 1 - - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance='1 day') == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=pd.Timedelta('1D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=np.timedelta64(1, 'D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=timedelta(1)) == 1 - with pytest.raises(ValueError, match='unit abbreviation w/o a number'): - idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') + assert ( + idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1 + ) + + assert idx.get_loc("2000-01-01", method="nearest") == 0 + assert idx.get_loc("2000-01-01T12", method="nearest") == 1 + + assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo") with pytest.raises(KeyError): - idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') + idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") with pytest.raises( - ValueError, - match='tolerance size must match target index size'): - idx.get_loc('2000-01-01', method='nearest', - tolerance=[pd.Timedelta('1day').to_timedelta64(), - pd.Timedelta('1day').to_timedelta64()]) - - assert idx.get_loc('2000', method='nearest') == slice(0, 3) - assert idx.get_loc('2000-01', method='nearest') == slice(0, 3) - - assert idx.get_loc('1999', method='nearest') == 0 - assert idx.get_loc('2001', method='nearest') == 2 + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-01", + method="nearest", + tolerance=[ + pd.Timedelta("1day").to_timedelta64(), + pd.Timedelta("1day").to_timedelta64(), + ], + ) + + assert idx.get_loc("2000", method="nearest") == slice(0, 3) + assert idx.get_loc("2000-01", 
method="nearest") == slice(0, 3) + + assert idx.get_loc("1999", method="nearest") == 0 + assert idx.get_loc("2001", method="nearest") == 2 with pytest.raises(KeyError): - idx.get_loc('1999', method='pad') + idx.get_loc("1999", method="pad") with pytest.raises(KeyError): - idx.get_loc('2001', method='backfill') + idx.get_loc("2001", method="backfill") with pytest.raises(KeyError): - idx.get_loc('foobar') + idx.get_loc("foobar") with pytest.raises(TypeError): idx.get_loc(slice(2)) - idx = pd.to_datetime(['2000-01-01', '2000-01-04']) - assert idx.get_loc('2000-01-02', method='nearest') == 0 - assert idx.get_loc('2000-01-03', method='nearest') == 1 - assert idx.get_loc('2000-01', method='nearest') == slice(0, 2) + idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) + assert idx.get_loc("2000-01-02", method="nearest") == 0 + assert idx.get_loc("2000-01-03", method="nearest") == 1 + assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) # time indexing - idx = pd.date_range('2000-01-01', periods=24, freq='H') - tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12]), check_dtype=False) - tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([]), check_dtype=False) + idx = pd.date_range("2000-01-01", periods=24, freq="H") + tm.assert_numpy_array_equal( + idx.get_loc(time(12)), np.array([12]), check_dtype=False + ) + tm.assert_numpy_array_equal( + idx.get_loc(time(12, 30)), np.array([]), check_dtype=False + ) with pytest.raises(NotImplementedError): - idx.get_loc(time(12, 30), method='pad') + idx.get_loc(time(12, 30), method="pad") def test_get_indexer(self): - idx = pd.date_range('2000-01-01', periods=3) + idx = pd.date_range("2000-01-01", periods=3) exp = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) - target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', - '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) + target = idx[0] + pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1], dtype=np.intp)) - tol_raw = [pd.Timedelta('1 hour'), - pd.Timedelta('1 hour'), - pd.Timedelta('1 hour').to_timedelta64(), ] + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=[np.timedelta64(x) for x in tol_raw]), - np.array([0, -1, 1], dtype=np.intp)) - tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), - pd.Timedelta('1 hour').to_timedelta64(), - 'foo', ] - with pytest.raises( - ValueError, match='abbreviation w/o a number'): - idx.get_indexer(target, 'nearest', tolerance=tol_bad) + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 hour")), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour").to_timedelta64(), + ] + tm.assert_numpy_array_equal( + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + 
np.array([0, -1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + "foo", + ] + with pytest.raises(ValueError, match="abbreviation w/o a number"): + idx.get_indexer(target, "nearest", tolerance=tol_bad) with pytest.raises(ValueError): - idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') + idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") def test_reasonable_key_error(self): # GH#1062 - index = DatetimeIndex(['1/3/2000']) - with pytest.raises(KeyError, match='2000'): - index.get_loc('1/1/2000') + index = DatetimeIndex(["1/3/2000"]) + with pytest.raises(KeyError, match="2000"): + index.get_loc("1/1/2000") - @pytest.mark.parametrize('key', [pd.Timedelta(0), - pd.Timedelta(1), - timedelta(0)]) + @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) def test_timedelta_invalid_key(self, key): # GH#20464 - dti = pd.date_range('1970-01-01', periods=10) + dti = pd.date_range("1970-01-01", periods=10) with pytest.raises(TypeError): dti.get_loc(key) def test_get_loc_nat(self): # GH#20464 - index = DatetimeIndex(['1/3/2000', 'NaT']) + index = DatetimeIndex(["1/3/2000", "NaT"]) assert index.get_loc(pd.NaT) == 1 diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 91e614cd516b92..4ea32359b8d4a4 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -6,16 +6,14 @@ import pytest import pandas as pd -from pandas import ( - DatetimeIndex, Index, Timestamp, date_range, datetime, offsets) +from pandas import DatetimeIndex, Index, Timestamp, date_range, datetime, offsets import pandas.util.testing as tm class TestTimeSeries: - def test_pass_datetimeindex_to_index(self): # Bugs in #1396 - rng = date_range('1/1/2000', '3/1/2000') + rng = date_range("1/1/2000", "3/1/2000") idx = Index(rng, dtype=object) expected = Index(rng.to_pydatetime(), dtype=object) @@ -24,78 +22,126 @@ def test_pass_datetimeindex_to_index(self): def test_range_edges(self): # GH#13672 - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'), - end=Timestamp('1970-01-01 00:00:00.000000004'), - freq='N') - exp = DatetimeIndex(['1970-01-01 00:00:00.000000001', - '1970-01-01 00:00:00.000000002', - '1970-01-01 00:00:00.000000003', - '1970-01-01 00:00:00.000000004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000004"), + freq="N", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000000001", + "1970-01-01 00:00:00.000000002", + "1970-01-01 00:00:00.000000003", + "1970-01-01 00:00:00.000000004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000004'), - end=Timestamp('1970-01-01 00:00:00.000000001'), - freq='N') + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000004"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) exp = DatetimeIndex([]) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'), - end=Timestamp('1970-01-01 00:00:00.000000001'), - freq='N') - exp = DatetimeIndex(['1970-01-01 00:00:00.000000001']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) + exp = DatetimeIndex(["1970-01-01 00:00:00.000000001"]) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 
00:00:00.000001'), - end=Timestamp('1970-01-01 00:00:00.000004'), - freq='U') - exp = DatetimeIndex(['1970-01-01 00:00:00.000001', - '1970-01-01 00:00:00.000002', - '1970-01-01 00:00:00.000003', - '1970-01-01 00:00:00.000004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000001"), + end=Timestamp("1970-01-01 00:00:00.000004"), + freq="U", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000001", + "1970-01-01 00:00:00.000002", + "1970-01-01 00:00:00.000003", + "1970-01-01 00:00:00.000004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.001'), - end=Timestamp('1970-01-01 00:00:00.004'), - freq='L') - exp = DatetimeIndex(['1970-01-01 00:00:00.001', - '1970-01-01 00:00:00.002', - '1970-01-01 00:00:00.003', - '1970-01-01 00:00:00.004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.001"), + end=Timestamp("1970-01-01 00:00:00.004"), + freq="L", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.001", + "1970-01-01 00:00:00.002", + "1970-01-01 00:00:00.003", + "1970-01-01 00:00:00.004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:01'), - end=Timestamp('1970-01-01 00:00:04'), freq='S') - exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02', - '1970-01-01 00:00:03', '1970-01-01 00:00:04']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:01"), + end=Timestamp("1970-01-01 00:00:04"), + freq="S", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + "1970-01-01 00:00:04", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:01'), - end=Timestamp('1970-01-01 00:04'), freq='T') - exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02', - '1970-01-01 00:03', '1970-01-01 00:04']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:01"), + end=Timestamp("1970-01-01 00:04"), + freq="T", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:01", + "1970-01-01 00:02", + "1970-01-01 00:03", + "1970-01-01 00:04", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 01:00'), - end=Timestamp('1970-01-01 04:00'), freq='H') - exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00', - '1970-01-01 03:00', '1970-01-01 04:00']) + idx = pd.date_range( + start=Timestamp("1970-01-01 01:00"), + end=Timestamp("1970-01-01 04:00"), + freq="H", + ) + exp = DatetimeIndex( + [ + "1970-01-01 01:00", + "1970-01-01 02:00", + "1970-01-01 03:00", + "1970-01-01 04:00", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01'), - end=Timestamp('1970-01-04'), freq='D') - exp = DatetimeIndex(['1970-01-01', '1970-01-02', - '1970-01-03', '1970-01-04']) + idx = pd.date_range( + start=Timestamp("1970-01-01"), end=Timestamp("1970-01-04"), freq="D" + ) + exp = DatetimeIndex(["1970-01-01", "1970-01-02", "1970-01-03", "1970-01-04"]) tm.assert_index_equal(idx, exp) class TestDatetime64: - def test_datetimeindex_accessors(self): - dti_naive = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365) + dti_naive = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) # GH#13303 - dti_tz = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365, tz='US/Eastern') + dti_tz = pd.date_range( + freq="D", start=datetime(1998, 1, 1), periods=365, tz="US/Eastern" + ) for dti in [dti_naive, dti_tz]: assert dti.year[0] == 1998 @@ -156,14 +202,14 @@ def test_datetimeindex_accessors(self): assert 
len(dti.is_year_end) == 365 assert len(dti.weekday_name) == 365 - dti.name = 'name' + dti.name = "name" # non boolean accessors -> return Index for accessor in DatetimeIndex._field_ops: res = getattr(dti, accessor) assert len(res) == 365 assert isinstance(res, Index) - assert res.name == 'name' + assert res.name == "name" # boolean accessors -> return array for accessor in DatetimeIndex._bool_ops: @@ -176,11 +222,10 @@ def test_datetimeindex_accessors(self): exp = dti[[0, 90, 181, 273]] tm.assert_index_equal(res, exp) res = dti[dti.is_leap_year] - exp = DatetimeIndex([], freq='D', tz=dti.tz, name='name') + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") tm.assert_index_equal(res, exp) - dti = pd.date_range(freq='BQ-FEB', start=datetime(1998, 1, 1), - periods=4) + dti = pd.date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) assert sum(dti.is_quarter_start) == 0 assert sum(dti.is_quarter_end) == 4 @@ -188,49 +233,50 @@ def test_datetimeindex_accessors(self): assert sum(dti.is_year_end) == 1 # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, - bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') + bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) msg = "Custom business days is not supported by is_month_start" with pytest.raises(ValueError, match=msg): dti.is_month_start - dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert dti.is_month_start[0] == 1 tests = [ - (Timestamp('2013-06-01', freq='M').is_month_start, 1), - (Timestamp('2013-06-01', freq='BM').is_month_start, 0), - (Timestamp('2013-06-03', freq='M').is_month_start, 0), - (Timestamp('2013-06-03', freq='BM').is_month_start, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0), - (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), - (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), - (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), - (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), - (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), - (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), - (Timestamp('2012-02-01').days_in_month, 29), - (Timestamp('2013-02-01').days_in_month, 28)] + (Timestamp("2013-06-01", freq="M").is_month_start, 1), + 
(Timestamp("2013-06-01", freq="BM").is_month_start, 0), + (Timestamp("2013-06-03", freq="M").is_month_start, 0), + (Timestamp("2013-06-03", freq="BM").is_month_start, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_month_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_quarter_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_year_end, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_month_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_quarter_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_year_start, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_month_end, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_quarter_end, 0), + (Timestamp("2013-03-31", freq="QS-FEB").is_year_end, 0), + (Timestamp("2013-02-01", freq="QS-FEB").is_month_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_quarter_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_year_start, 1), + (Timestamp("2013-06-30", freq="BQ").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQ").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_quarter_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQS-APR").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQS-APR").is_quarter_end, 1), + (Timestamp("2013-03-29", freq="BQS-APR").is_year_end, 1), + (Timestamp("2013-11-01", freq="AS-NOV").is_year_start, 1), + (Timestamp("2013-10-31", freq="AS-NOV").is_year_end, 1), + (Timestamp("2012-02-01").days_in_month, 29), + (Timestamp("2013-02-01").days_in_month, 28), + ] for ts, value in tests: assert ts == value @@ -244,37 +290,59 @@ def test_datetimeindex_accessors(self): assert [d.weekofyear for d in dates] == expected # GH 12806 - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes - expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - expected_months = ['January', 'February', 'March', 'April', 'May', - 'June', 'July', 'August', 'September', - 'October', 'November', 'December'] + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] # GH#11128 - dti = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365) - english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - for day, name, eng_name in zip(range(4, 11), - expected_days, - english_days): + dti = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + 
"Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert dti.weekday_name[day] == eng_name assert dti.day_name(locale=time_locale)[day] == name ts = Timestamp(datetime(2016, 4, day)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert ts.weekday_name == eng_name assert ts.day_name(locale=time_locale) == name dti = dti.append(DatetimeIndex([pd.NaT])) @@ -283,7 +351,7 @@ def test_datetime_name_accessors(self, time_locale): assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 - dti = pd.date_range(freq='M', start='2012', end='2013') + dti = pd.date_range(freq="M", start="2012", end="2013") result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) @@ -308,5 +376,4 @@ def test_datetime_name_accessors(self, time_locale): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - tm.assert_index_equal(dti.nanosecond, - pd.Index(np.arange(10, dtype=np.int64))) + tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 5a6f2fa86b11ff..6d94319b33b02e 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -5,48 +5,58 @@ class TestDatetimeIndex: - - @pytest.mark.parametrize('tz', ['US/Eastern', 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_fillna_datetime64(self, tz): # GH 11343 - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00']) + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"]) - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00']) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"] + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) # tz mismatch - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00', tz=tz), - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00", tz=tz), + pd.Timestamp("2011-01-01 11:00"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) - - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], tz=tz) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], tz=tz) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + exp = pd.Index( + [pd.Timestamp("2011-01-01 09:00"), "x", pd.Timestamp("2011-01-01 11:00")], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 
11:00"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], tz=tz + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - 'x', - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + "x", + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 4a8f691987f8e1..d4dff2cbce89b5 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -8,8 +8,14 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, PeriodIndex, Series, Timestamp, bdate_range, - date_range) + DatetimeIndex, + Index, + PeriodIndex, + Series, + Timestamp, + bdate_range, + date_range, +) from pandas.tests.test_base import Ops import pandas.util.testing as tm @@ -19,11 +25,9 @@ class TestDatetimeIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) - mask = lambda x: (isinstance(x, DatetimeIndex) or - isinstance(x, PeriodIndex)) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) self.is_valid_objs = [o for o in self.objs if mask(o)] self.not_valid_objs = [o for o in self.objs if not mask(o)] @@ -38,7 +42,7 @@ def test_ops_properties_basic(self): # sanity check that the behavior didn't change # GH#7206 msg = "'Series' object has no attribute '{}'" - for op in ['year', 'day', 'second', 'weekday']: + for op in ["year", "day", "second", "weekday"]: with pytest.raises(AttributeError, match=msg.format(op)): getattr(self.dt_series, op) @@ -53,32 +57,43 @@ def test_ops_properties_basic(self): def test_repeat_range(self, tz_naive_fixture): tz = tz_naive_fixture - rng = date_range('1/1/2000', '1/1/2001') + rng = date_range("1/1/2000", "1/1/2001") result = rng.repeat(5) assert result.freq is None assert len(result) == 5 * len(rng) - index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-02', '2001-01-02'], tz=tz) + index = pd.date_range("2001-01-01", periods=2, freq="D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz + ) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-03', '2001-01-03'], tz=tz) + index = pd.date_range("2001-01-01", periods=2, freq="2D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz + ) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], - tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', - 'NaT', 'NaT', 'NaT', - '2003-01-01', '2003-01-01', '2003-01-01'], - tz=tz) + index = pd.DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) + exp = 
pd.DatetimeIndex( + [ + "2001-01-01", + "2001-01-01", + "2001-01-01", + "NaT", + "NaT", + "NaT", + "2003-01-01", + "2003-01-01", + "2003-01-01", + ], + tz=tz, + ) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None @@ -88,15 +103,16 @@ def test_repeat(self, tz_naive_fixture): reps = 2 msg = "the 'axis' parameter is not supported" - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) + rng = pd.date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - ]) + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + ] + ) res = rng.repeat(reps) tm.assert_index_equal(res, expected_rng) @@ -108,75 +124,93 @@ def test_repeat(self, tz_naive_fixture): def test_resolution(self, tz_naive_fixture): tz = tz_naive_fixture - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', - 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', 'hour', - 'minute', 'second', 'millisecond', - 'microsecond']): - idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + idx = pd.date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) assert idx.resolution == expected def test_value_counts_unique(self, tz_naive_fixture): tz = tz_naive_fixture # GH 7735 - idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + idx = pd.date_range("2011-01-01 09:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), - tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) - exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, - tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + exp_idx = pd.date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, - tz=tz) + expected = pd.date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(idx.unique(), expected) - idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 08:00', pd.NaT], tz=tz) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - tz=tz) + idx = DatetimeIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + pd.NaT, + ], + tz=tz, + ) + + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', - pd.NaT], tz=tz) + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", 
pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), - expected) + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) def test_nonunique_contains(self): # GH 9512 - for idx in map(DatetimeIndex, - ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['2015', '2015', '2016'], ['2015', '2015', '2014'])): + for idx in map( + DatetimeIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["2015", "2015", "2016"], + ["2015", "2015", "2014"], + ), + ): assert idx[0] in idx - @pytest.mark.parametrize('idx', - [ - DatetimeIndex( - ['2011-01-01', - '2011-01-02', - '2011-01-03'], - freq='D', name='idx'), - DatetimeIndex( - ['2011-01-01 09:00', - '2011-01-01 10:00', - '2011-01-01 11:00'], - freq='H', name='tzidx', tz='Asia/Tokyo') - ]) + @pytest.mark.parametrize( + "idx", + [ + DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" + ), + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="H", + name="tzidx", + tz="Asia/Tokyo", + ), + ], + ) def test_order_with_freq(self, idx): ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -190,41 +224,39 @@ def test_order_with_freq(self, idx): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) expected = idx[::-1] tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, - np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) assert ordered.freq == expected.freq assert ordered.freq.n == -1 - @pytest.mark.parametrize('index_dates,expected_dates', [ - (['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - ['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05']), - (['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - ['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05']), - ([pd.NaT, '2011-01-03', '2011-01-05', - '2011-01-02', pd.NaT], - [pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - '2011-01-05']) - ]) - def test_order_without_freq(self, index_dates, expected_dates, - tz_naive_fixture): + @pytest.mark.parametrize( + "index_dates,expected_dates", + [ + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + [pd.NaT, "2011-01-03", "2011-01-05", "2011-01-02", pd.NaT], + [pd.NaT, pd.NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ], + ) + def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture): tz = tz_naive_fixture # without freq - index = DatetimeIndex(index_dates, tz=tz, name='idx') - expected = DatetimeIndex(expected_dates, tz=tz, name='idx') + index = DatetimeIndex(index_dates, tz=tz, name="idx") + expected = DatetimeIndex(expected_dates, tz=tz, name="idx") ordered = index.sort_values() 
tm.assert_index_equal(ordered, expected) @@ -241,8 +273,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - ordered, indexer = index.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = index.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) @@ -251,7 +282,7 @@ def test_order_without_freq(self, index_dates, expected_dates, def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -264,7 +295,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -272,10 +303,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -283,14 +314,34 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - @pytest.mark.parametrize('freq', [ - 'A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', - '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', - '-3S']) + @pytest.mark.parametrize( + "freq", + [ + "A", + "2A", + "-2A", + "Q", + "-1Q", + "M", + "-1M", + "D", + "3D", + "-3D", + "W", + "-1W", + "H", + "2H", + "-2H", + "T", + "2T", + "S", + "-3S", + ], + ) def test_infer_freq(self, freq): # GH 11018 - idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) - result = pd.DatetimeIndex(idx.asi8, freq='infer') + idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + result = pd.DatetimeIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) assert result.freq == freq @@ -299,25 +350,23 @@ def test_nat(self, tz_naive_fixture): assert pd.DatetimeIndex._na_value is pd.NaT assert pd.DatetimeIndex([])._na_value is pd.NaT - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + idx = pd.DatetimeIndex(["2011-01-01", "NaT"], tz=tz) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) def test_equals(self): # GH 13107 - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + idx = 
pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"]) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -326,8 +375,7 @@ def test_equals(self): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], - tz='US/Pacific') + idx2 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"], tz="US/Pacific") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -336,7 +384,7 @@ def test_equals(self): assert not idx.equals(pd.Series(idx2)) # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific") tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) @@ -345,11 +393,9 @@ def test_equals(self): assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) - @pytest.mark.parametrize('values', [ - ['20180101', '20180103', '20180105'], []]) - @pytest.mark.parametrize('freq', [ - '2D', Day(2), '2B', BDay(2), '48H', Hour(48)]) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_freq_setter(self, values, freq, tz): # GH 20678 idx = DatetimeIndex(values, tz=tz) @@ -365,21 +411,23 @@ def test_freq_setter(self, values, freq, tz): def test_freq_setter_errors(self): # GH 20678 - idx = DatetimeIndex(['20180101', '20180103', '20180105']) + idx = DatetimeIndex(["20180101", "20180103", "20180105"]) # setting with an incompatible freq - msg = ('Inferred frequency 2D from passed values does not conform to ' - 'passed frequency 5D') + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) with pytest.raises(ValueError, match=msg): - idx.freq = '5D' + idx.freq = "5D" # setting with non-freq string - with pytest.raises(ValueError, match='Invalid frequency'): - idx.freq = 'foo' + with pytest.raises(ValueError, match="Invalid frequency"): + idx.freq = "foo" def test_offset_deprecated(self): # GH 20716 - idx = pd.DatetimeIndex(['20180101', '20180102']) + idx = pd.DatetimeIndex(["20180101", "20180102"]) # getter deprecated with tm.assert_produces_warning(FutureWarning): @@ -391,7 +439,6 @@ def test_offset_deprecated(self): class TestBusinessDatetimeIndex: - def setup_method(self, method): self.rng = bdate_range(START, END) @@ -437,10 +484,10 @@ def test_identical(self): assert t1.identical(t2) # name - t1 = t1.rename('foo') + t1 = t1.rename("foo") assert t1.equals(t2) assert not t1.identical(t2) - t2 = t2.rename('foo') + t2 = t2.rename("foo") assert t1.identical(t2) # freq @@ -451,7 +498,7 @@ def test_identical(self): class TestCustomDatetimeIndex: def setup_method(self, method): - self.rng = bdate_range(START, END, freq='C') + self.rng = bdate_range(START, END, freq="C") def test_comparison(self): d = self.rng[10] @@ -490,8 +537,7 @@ def test_shift_periods(self): idx = pd.date_range(start=START, end=END, periods=3) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): tm.assert_index_equal(idx.shift(n=0), idx) def 
test_pickle_unpickle(self): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 6ec8568ce72428..3095bf9657277f 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -8,37 +8,43 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range) + DataFrame, + DatetimeIndex, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) from pandas.core.indexing import IndexingError from pandas.util import testing as tm class TestSlicing: def test_dti_slicing(self): - dti = date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") dti2 = dti[[1, 3, 5]] v1 = dti2[0] v2 = dti2[1] v3 = dti2[2] - assert v1 == Timestamp('2/28/2005') - assert v2 == Timestamp('4/30/2005') - assert v3 == Timestamp('6/30/2005') + assert v1 == Timestamp("2/28/2005") + assert v2 == Timestamp("4/30/2005") + assert v3 == Timestamp("6/30/2005") # don't carry freq through irregular slicing assert dti2.freq is None def test_slice_keeps_name(self): # GH4226 - st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') - et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') - dr = pd.date_range(st, et, freq='H', name='timebucket') + st = pd.Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") + et = pd.Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") + dr = pd.date_range(st, et, freq="H", name="timebucket") assert dr[1:].name == dr.name def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -46,50 +52,51 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) + assert_slices_equivalent(SLC[Timestamp("2014-10-01") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10-01"::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[: Timestamp("2014-10-01") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10-01":-1], SLC[:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( - '2014-10-01'):-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], - SLC[13:8:-1]) + assert_slices_equivalent(SLC["2015-02-01":"2014-10-01":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC["2015-02-01" : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : "2014-10-01" : -1], SLC[13:8:-1] + ) - assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) + assert_slices_equivalent(SLC["2014-10-01":"2015-02-01":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = 
Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] def test_slice_bounds_empty(self): # GH#14354 - empty_idx = date_range(freq='1H', periods=0, end='2015') + empty_idx = date_range(freq="1H", periods=0, end="2015") - right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') - exp = Timestamp('2015-01-02 23:59:59.999999999') + right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right", "loc") + exp = Timestamp("2015-01-02 23:59:59.999999999") assert right == exp - left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') - exp = Timestamp('2015-01-02 00:00:00') + left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left", "loc") + exp = Timestamp("2015-01-02 00:00:00") assert left == exp def test_slice_duplicate_monotonic(self): # https://github.com/pandas-dev/pandas/issues/16515 - idx = pd.DatetimeIndex(['2017', '2017']) - result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') - expected = Timestamp('2017-01-01') + idx = pd.DatetimeIndex(["2017", "2017"]) + result = idx._maybe_cast_slice_bound("2017-01-01", "left", "loc") + expected = Timestamp("2017-01-01") assert result == expected def test_monotone_DTI_indexing_bug(self): @@ -98,136 +105,139 @@ def test_monotone_DTI_indexing_bug(self): # partial string indexing. 
df = pd.DataFrame(list(range(5))) - date_list = ['2018-01-02', '2017-02-10', '2016-03-10', - '2015-03-15', '2014-03-16'] + date_list = [ + "2018-01-02", + "2017-02-10", + "2016-03-10", + "2015-03-15", + "2014-03-16", + ] date_index = pd.to_datetime(date_list) - df['date'] = date_index - expected = pd.DataFrame({0: list(range(5)), 'date': date_index}) + df["date"] = date_index + expected = pd.DataFrame({0: list(range(5)), "date": date_index}) tm.assert_frame_equal(df, expected) - df = pd.DataFrame({'A': [1, 2, 3]}, - index=pd.date_range('20170101', - periods=3)[::-1]) - expected = pd.DataFrame({'A': 1}, - index=pd.date_range('20170103', - periods=1)) - tm.assert_frame_equal(df.loc['2017-01-03'], expected) + df = pd.DataFrame( + {"A": [1, 2, 3]}, index=pd.date_range("20170101", periods=3)[::-1] + ) + expected = pd.DataFrame({"A": 1}, index=pd.date_range("20170103", periods=1)) + tm.assert_frame_equal(df.loc["2017-01-03"], expected) def test_slice_year(self): - dti = date_range(freq='B', start=datetime(2005, 1, 1), periods=500) + dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - result = s['2005'] + result = s["2005"] expected = s[s.index.year == 2005] tm.assert_series_equal(result, expected) df = DataFrame(np.random.rand(len(dti), 5), index=dti) - result = df.loc['2005'] + result = df.loc["2005"] expected = df[df.index.year == 2005] tm.assert_frame_equal(result, expected) - rng = date_range('1/1/2000', '1/1/2010') + rng = date_range("1/1/2000", "1/1/2010") - result = rng.get_loc('2009') + result = rng.get_loc("2009") expected = slice(3288, 3653) assert result == expected def test_slice_quarter(self): - dti = date_range(freq='D', start=datetime(2000, 6, 1), periods=500) + dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - assert len(s['2001Q1']) == 90 + assert len(s["2001Q1"]) == 90 df = DataFrame(np.random.rand(len(dti), 5), index=dti) - assert len(df.loc['1Q01']) == 90 + assert len(df.loc["1Q01"]) == 90 def test_slice_month(self): - dti = date_range(freq='D', start=datetime(2005, 1, 1), periods=500) + dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - assert len(s['2005-11']) == 30 + assert len(s["2005-11"]) == 30 df = DataFrame(np.random.rand(len(dti), 5), index=dti) - assert len(df.loc['2005-11']) == 30 + assert len(df.loc["2005-11"]) == 30 - tm.assert_series_equal(s['2005-11'], s['11-2005']) + tm.assert_series_equal(s["2005-11"], s["11-2005"]) def test_partial_slice(self): - rng = date_range(freq='D', start=datetime(2005, 1, 1), periods=500) + rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-05':'2006-02'] - expected = s['20050501':'20060228'] + result = s["2005-05":"2006-02"] + expected = s["20050501":"20060228"] tm.assert_series_equal(result, expected) - result = s['2005-05':] - expected = s['20050501':] + result = s["2005-05":] + expected = s["20050501":] tm.assert_series_equal(result, expected) - result = s[:'2006-02'] - expected = s[:'20060228'] + result = s[:"2006-02"] + expected = s[:"20060228"] tm.assert_series_equal(result, expected) - result = s['2005-1-1'] + result = s["2005-1-1"] assert result == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31'$"): - s['2004-12-31'] + s["2004-12-31"] def test_partial_slice_daily(self): - rng = date_range(freq='H', start=datetime(2005, 1, 31), periods=500) + rng = 
date_range(freq="H", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-31'] + result = s["2005-1-31"] tm.assert_series_equal(result, s.iloc[:24]) with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"): - s['2004-12-31 00'] + s["2004-12-31 00"] def test_partial_slice_hourly(self): - rng = date_range(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), - periods=500) + rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-1'] - tm.assert_series_equal(result, s.iloc[:60 * 4]) + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[: 60 * 4]) - result = s['2005-1-1 20'] + result = s["2005-1-1 20"] tm.assert_series_equal(result, s.iloc[:60]) - assert s['2005-1-1 20:00'] == s.iloc[0] + assert s["2005-1-1 20:00"] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"): - s['2004-12-31 00:15'] + s["2004-12-31 00:15"] def test_partial_slice_minutely(self): - rng = date_range(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), - periods=500) + rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-1 23:59'] + result = s["2005-1-1 23:59"] tm.assert_series_equal(result, s.iloc[:60]) - result = s['2005-1-1'] + result = s["2005-1-1"] tm.assert_series_equal(result, s.iloc[:60]) - assert s[Timestamp('2005-1-1 23:59:00')] == s.iloc[0] + assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"): - s['2004-12-31 00:00:00'] + s["2004-12-31 00:00:00"] def test_partial_slice_second_precision(self): - rng = date_range(start=datetime(2005, 1, 1, 0, 0, 59, - microsecond=999990), - periods=20, freq='US') + rng = date_range( + start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), + periods=20, + freq="US", + ) s = Series(np.arange(20), rng) - tm.assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) - tm.assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10]) - tm.assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) - tm.assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:]) - assert s[Timestamp('2005-1-1 00:00:59.999990')] == s.iloc[0] - with pytest.raises(KeyError, match='2005-1-1 00:00:00'): - s['2005-1-1 00:00:00'] + assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0] + with pytest.raises(KeyError, match="2005-1-1 00:00:00"): + s["2005-1-1 00:00:00"] def test_partial_slicing_dataframe(self): # GH14856 @@ -237,17 +247,22 @@ def test_partial_slicing_dataframe(self): # string is considered a slice # - If string resolution is equal to or more precise than index # resolution, string is considered an exact match - formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', - '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] - resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] + formats = [ + "%Y", + "%Y-%m", + "%Y-%m-%d", + "%Y-%m-%d %H", + "%Y-%m-%d %H:%M", + "%Y-%m-%d %H:%M:%S", + ] + resolutions = ["year", "month", "day", "hour", "minute", "second"] for rnum, resolution in enumerate(resolutions[2:], 2): # we check only 'day', 'hour', 'minute' and 'second' unit = Timedelta("1 " + resolution) middate = datetime(2012, 1, 1, 0, 0, 0) - index = 
DatetimeIndex([middate - unit, - middate, middate + unit]) + index = DatetimeIndex([middate - unit, middate, middate + unit]) values = [1, 2, 3] - df = DataFrame({'a': values}, index, dtype=np.int64) + df = DataFrame({"a": values}, index, dtype=np.int64) assert df.index.resolution == resolution # Timestamp with the same resolution as index @@ -256,7 +271,7 @@ def test_partial_slicing_dataframe(self): for timestamp, expected in zip(index, values): ts_string = timestamp.strftime(formats[rnum]) # make ts_string as precise as index - result = df['a'][ts_string] + result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == expected msg = r"^'{}'$".format(ts_string) @@ -265,13 +280,12 @@ def test_partial_slicing_dataframe(self): # Timestamp with resolution less precise than index for fmt in formats[:rnum]: - for element, theslice in [[0, slice(None, 1)], - [1, slice(1, None)]]: + for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: ts_string = index[element].strftime(fmt) # Series should return slice - result = df['a'][ts_string] - expected = df['a'][theslice] + result = df["a"][ts_string] + expected = df["a"][theslice] tm.assert_series_equal(result, expected) # Frame should return slice as well @@ -283,9 +297,9 @@ def test_partial_slicing_dataframe(self): # Compatible with existing key # Should return scalar for Series # and raise KeyError for Frame - for fmt in formats[rnum + 1:]: + for fmt in formats[rnum + 1 :]: ts_string = index[1].strftime(fmt) - result = df['a'][ts_string] + result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == 2 msg = r"^'{}'$".format(ts_string) @@ -294,12 +308,12 @@ def test_partial_slicing_dataframe(self): # Not compatible with existing key # Should raise KeyError - for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: + for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: ts = index[1] + Timedelta("1 " + res) ts_string = ts.strftime(fmt) msg = r"^'{}'$".format(ts_string) with pytest.raises(KeyError, match=msg): - df['a'][ts_string] + df["a"][ts_string] with pytest.raises(KeyError, match=msg): df[ts_string] @@ -307,109 +321,140 @@ def test_partial_slicing_with_multiindex(self): # GH 4758 # partial string indexing with a multi-index buggy - df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], - 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], - 'val': [1, 2, 3, 4]}, - index=date_range("2013-06-19 09:30:00", - periods=4, freq='5T')) - df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) - - expected = DataFrame([ - [1] - ], index=Index(['ABC'], name='TICKER'), columns=['val']) - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] + df = DataFrame( + { + "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], + "val": [1, 2, 3, 4], + }, + index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), + ) + df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) + + expected = DataFrame( + [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"] + ) + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")] tm.assert_frame_equal(result, expected) expected = df_multi.loc[ - (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] + (pd.Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC") + ] + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] tm.assert_series_equal(result, expected) # this is an IndexingError as we don't do partial string selection on # 
multi-levels. msg = "Too many indexers" with pytest.raises(IndexingError, match=msg): - df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] + df_multi.loc[("2013-06-19", "ACCT1", "ABC")] # GH 4294 # partial slice on a series mi - s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range( - '2000-1-1', periods=1000)).stack() + s = pd.DataFrame( + np.random.rand(1000, 1000), index=pd.date_range("2000-1-1", periods=1000) + ).stack() s2 = s[:-1].copy() - expected = s2['2000-1-4'] - result = s2[pd.Timestamp('2000-1-4')] + expected = s2["2000-1-4"] + result = s2[pd.Timestamp("2000-1-4")] tm.assert_series_equal(result, expected) - result = s[pd.Timestamp('2000-1-4')] - expected = s['2000-1-4'] + result = s[pd.Timestamp("2000-1-4")] + expected = s["2000-1-4"] tm.assert_series_equal(result, expected) df2 = pd.DataFrame(s) - expected = df2.xs('2000-1-4') - result = df2.loc[pd.Timestamp('2000-1-4')] + expected = df2.xs("2000-1-4") + result = df2.loc[pd.Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected) def test_partial_slice_doesnt_require_monotonicity(self): # For historical reasons. - s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10)) + s = pd.Series(np.arange(10), pd.date_range("2014-01-01", periods=10)) nonmonotonic = s[[3, 5, 4]] expected = nonmonotonic.iloc[:0] - timestamp = pd.Timestamp('2014-01-10') + timestamp = pd.Timestamp("2014-01-10") - tm.assert_series_equal(nonmonotonic['2014-01-10':], expected) - with pytest.raises(KeyError, - match=r"Timestamp\('2014-01-10 00:00:00'\)"): + tm.assert_series_equal(nonmonotonic["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic[timestamp:] - tm.assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) - with pytest.raises(KeyError, - match=r"Timestamp\('2014-01-10 00:00:00'\)"): + tm.assert_series_equal(nonmonotonic.loc["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic.loc[timestamp:] def test_loc_datetime_length_one(self): # GH16071 - df = pd.DataFrame(columns=['1'], - index=pd.date_range('2016-10-01T00:00:00', - '2016-10-01T23:59:59')) - result = df.loc[datetime(2016, 10, 1):] + df = pd.DataFrame( + columns=["1"], + index=pd.date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"), + ) + result = df.loc[datetime(2016, 10, 1) :] tm.assert_frame_equal(result, df) - result = df.loc['2016-10-01T00:00:00':] + result = df.loc["2016-10-01T00:00:00":] tm.assert_frame_equal(result, df) - @pytest.mark.parametrize('datetimelike', [ - Timestamp('20130101'), datetime(2013, 1, 1), - np.datetime64('2013-01-01T00:00', 'ns')]) - @pytest.mark.parametrize('op,expected', [ - (op.lt, [True, False, False, False]), - (op.le, [True, True, False, False]), - (op.eq, [False, True, False, False]), - (op.gt, [False, False, False, True])]) + @pytest.mark.parametrize( + "datetimelike", + [ + Timestamp("20130101"), + datetime(2013, 1, 1), + np.datetime64("2013-01-01T00:00", "ns"), + ], + ) + @pytest.mark.parametrize( + "op,expected", + [ + (op.lt, [True, False, False, False]), + (op.le, [True, True, False, False]), + (op.eq, [False, True, False, False]), + (op.gt, [False, False, False, True]), + ], + ) def test_selection_by_datetimelike(self, datetimelike, op, expected): # GH issue #17965, test for ability to compare datetime64[ns] columns # to datetimelike - df = DataFrame({'A': [pd.Timestamp('20120101'), - pd.Timestamp('20130101'), - np.nan, pd.Timestamp('20130103')]}) + df = DataFrame( + { + "A": [ + 
pd.Timestamp("20120101"), + pd.Timestamp("20130101"), + np.nan, + pd.Timestamp("20130103"), + ] + } + ) result = op(df.A, datetimelike) - expected = Series(expected, name='A') + expected = Series(expected, name="A") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('start', [ - '2018-12-02 21:50:00+00:00', pd.Timestamp('2018-12-02 21:50:00+00:00'), - pd.Timestamp('2018-12-02 21:50:00+00:00').to_pydatetime() - ]) - @pytest.mark.parametrize('end', [ - '2018-12-02 21:52:00+00:00', pd.Timestamp('2018-12-02 21:52:00+00:00'), - pd.Timestamp('2018-12-02 21:52:00+00:00').to_pydatetime() - ]) + @pytest.mark.parametrize( + "start", + [ + "2018-12-02 21:50:00+00:00", + pd.Timestamp("2018-12-02 21:50:00+00:00"), + pd.Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(), + ], + ) + @pytest.mark.parametrize( + "end", + [ + "2018-12-02 21:52:00+00:00", + pd.Timestamp("2018-12-02 21:52:00+00:00"), + pd.Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(), + ], + ) def test_getitem_with_datestring_with_UTC_offset(self, start, end): # GH 24076 - idx = pd.date_range(start='2018-12-02 14:50:00-07:00', - end='2018-12-02 14:50:00-07:00', freq='1min') - df = pd.DataFrame(1, index=idx, columns=['A']) + idx = pd.date_range( + start="2018-12-02 14:50:00-07:00", + end="2018-12-02 14:50:00-07:00", + freq="1min", + ) + df = pd.DataFrame(1, index=idx, columns=["A"]) result = df[start:end] expected = df.iloc[0:3, :] tm.assert_frame_equal(result, expected) @@ -418,7 +463,7 @@ def test_getitem_with_datestring_with_UTC_offset(self, start, end): start = str(start) end = str(end) with pytest.raises(ValueError, match="Both dates must"): - df[start:end[:-4] + '1:00'] + df[start : end[:-4] + "1:00"] with pytest.raises(ValueError, match="The index must be timezone"): df = df.tz_localize(None) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index ae14396c753989..00310f4fba7c74 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -17,38 +17,48 @@ class TestDatetimeIndexOps: def test_dti_time(self): - rng = date_range('1/1/2000', freq='12min', periods=10) + rng = date_range("1/1/2000", freq="12min", periods=10) result = pd.Index(rng).time expected = [t.time() for t in rng] assert (result == expected).all() def test_dti_date(self): - rng = date_range('1/1/2000', freq='12H', periods=10) + rng = date_range("1/1/2000", freq="12H", periods=10) result = pd.Index(rng).date expected = [t.date() for t in rng] assert (result == expected).all() - @pytest.mark.parametrize('data', [ - ['1400-01-01'], - [datetime(1400, 1, 1)]]) + @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 msg = "Out of bounds nanosecond timestamp: 1400-01-01 00:00:00" with pytest.raises(OutOfBoundsDatetime, match=msg): DatetimeIndex(data) - @pytest.mark.parametrize('field', [ - 'dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', - 'days_in_month', 'is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'weekday_name']) + @pytest.mark.parametrize( + "field", + [ + "dayofweek", + "dayofyear", + "week", + "weekofyear", + "quarter", + "days_in_month", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "weekday_name", + ], + ) def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like 
quarter and week idx = tm.makeDateIndex(100) expected = getattr(idx, field)[-1] - if field == 'weekday_name': - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + if field == "weekday_name": + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = getattr(Timestamp(idx[-1]), field) else: result = getattr(Timestamp(idx[-1]), field) @@ -65,117 +75,141 @@ def test_dti_timestamp_freq_fields(self): # DatetimeIndex.round def test_round_daily(self): - dti = date_range('20130101 09:10:11', periods=5) - result = dti.round('D') - expected = date_range('20130101', periods=5) + dti = date_range("20130101 09:10:11", periods=5) + result = dti.round("D") + expected = date_range("20130101", periods=5) tm.assert_index_equal(result, expected) - dti = dti.tz_localize('UTC').tz_convert('US/Eastern') - result = dti.round('D') - expected = date_range('20130101', - periods=5).tz_localize('US/Eastern') + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + result = dti.round("D") + expected = date_range("20130101", periods=5).tz_localize("US/Eastern") tm.assert_index_equal(result, expected) - result = dti.round('s') + result = dti.round("s") tm.assert_index_equal(result, dti) - @pytest.mark.parametrize('freq, error_msg', [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]) + @pytest.mark.parametrize( + "freq, error_msg", + [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) def test_round_invalid(self, freq, error_msg): - dti = date_range('20130101 09:10:11', periods=5) - dti = dti.tz_localize('UTC').tz_convert('US/Eastern') + dti = date_range("20130101 09:10:11", periods=5) + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") with pytest.raises(ValueError, match=error_msg): dti.round(freq) def test_round(self, tz_naive_fixture): tz = tz_naive_fixture - rng = date_range(start='2016-01-01', periods=5, - freq='30Min', tz=tz) + rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz) elt = rng[1] - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - ]) + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 01:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + ] + ) expected_elt = expected_rng[1] - tm.assert_index_equal(rng.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt + tm.assert_index_equal(rng.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - rng.round(freq='foo') + rng.round(freq="foo") with pytest.raises(ValueError, match=msg): - elt.round(freq='foo') + elt.round(freq="foo") msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - rng.round(freq='M') + rng.round(freq="M") with pytest.raises(ValueError, match=msg): - elt.round(freq='M') + elt.round(freq="M") # GH#14440 & GH#15578 - index = DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) - result = index.round('ms') - 
expected = DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) + index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz) tm.assert_index_equal(result, expected) - for freq in ['us', 'ns']: + for freq in ["us", "ns"]: tm.assert_index_equal(index, index.round(freq)) - index = DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) - result = index.round('ms') - expected = DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) + index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz) tm.assert_index_equal(result, expected) - index = DatetimeIndex(['2016-10-17 12:00:00.001501031']) - result = index.round('10ns') - expected = DatetimeIndex(['2016-10-17 12:00:00.001501030']) + index = DatetimeIndex(["2016-10-17 12:00:00.001501031"]) + result = index.round("10ns") + expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"]) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(False): - ts = '2016-10-17 12:00:00.001501031' - DatetimeIndex([ts]).round('1010ns') + ts = "2016-10-17 12:00:00.001501031" + DatetimeIndex([ts]).round("1010ns") def test_no_rounding_occurs(self, tz_naive_fixture): # GH 21262 tz = tz_naive_fixture - rng = date_range(start='2016-01-01', periods=5, - freq='2Min', tz=tz) - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'), - ]) - - tm.assert_index_equal(rng.round(freq='2T'), expected_rng) - - @pytest.mark.parametrize('test_input, rounder, freq, expected', [ - (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), - (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), - (['2117-01-01 00:00:45.000000012'], 'floor', '10ns', - ['2117-01-01 00:00:45.000000010']), - (['1823-01-01 00:00:01.000000012'], 'ceil', '10ns', - ['1823-01-01 00:00:01.000000020']), - (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), - (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), - (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']), - (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']), - (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']), - (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']), - (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', - ('NaT', '1823-01-01 00:00:01')), - (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', - ('NaT', '1823-01-01 00:00:01')) - ]) + rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:02:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:04:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:06:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:08:00", tz=tz, freq="2T"), + ] + ) + + tm.assert_index_equal(rng.round(freq="2T"), expected_rng) + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), + (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), + ( + ["2117-01-01 00:00:45.000000012"], + "floor", + "10ns", + ["2117-01-01 00:00:45.000000010"], + ), + ( + ["1823-01-01 
00:00:01.000000012"], + "ceil", + "10ns", + ["1823-01-01 00:00:01.000000020"], + ), + (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), + (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), + (["2018-01-01 00:15:00"], "ceil", "15T", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15T", ["2018-01-01 00:15:00"]), + (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), + ( + ("NaT", "1823-01-01 00:00:01"), + "floor", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ( + ("NaT", "1823-01-01 00:00:01"), + "ceil", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ], + ) def test_ceil_floor_edge(self, test_input, rounder, freq, expected): dt = DatetimeIndex(list(test_input)) func = getattr(dt, rounder) @@ -183,17 +217,34 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = DatetimeIndex(list(expected)) assert expected.equals(result) - @pytest.mark.parametrize('start, index_freq, periods', [ - ('2018-01-01', '12H', 25), - ('2018-01-01 0:0:0.124999', '1ns', 1000), - ]) - @pytest.mark.parametrize('round_freq', [ - '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', - '250ns', '500ns', '750ns', - '1us', '19us', '250us', '500us', '750us', - '1s', '2s', '3s', - '12H', '1D', - ]) + @pytest.mark.parametrize( + "start, index_freq, periods", + [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + ) + @pytest.mark.parametrize( + "round_freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "12H", + "1D", + ], + ) def test_round_int64(self, start, index_freq, periods, round_freq): dt = date_range(start=start, freq=index_freq, periods=periods) unit = to_offset(round_freq).nanos @@ -227,19 +278,21 @@ def test_round_int64(self, start, index_freq, periods, round_freq): # DatetimeIndex.normalize def test_normalize(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D') + rng = date_range("1/1/2000 9:30", periods=10, freq="D") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D') + expected = date_range("1/1/2000", periods=10, freq="D") tm.assert_index_equal(result, expected) - arr_ns = np.array([1380585623454345752, - 1380585612343234312]).astype("datetime64[ns]") + arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( + "datetime64[ns]" + ) rng_ns = DatetimeIndex(arr_ns) rng_ns_normalized = rng_ns.normalize() - arr_ns = np.array([1380585600000000000, - 1380585600000000000]).astype("datetime64[ns]") + arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( + "datetime64[ns]" + ) expected = DatetimeIndex(arr_ns) tm.assert_index_equal(rng_ns_normalized, expected) @@ -247,44 +300,43 @@ def test_normalize(self): assert not rng.is_normalized def test_normalize_nat(self): - dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')]) + dti = DatetimeIndex([pd.NaT, Timestamp("2018-01-01 01:00:00")]) result = dti.normalize() - expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')]) + expected = DatetimeIndex([pd.NaT, Timestamp("2018-01-01")]) tm.assert_index_equal(result, expected) class TestDateTimeIndexToJulianDate: - def test_1700(self): - dr = date_range(start=Timestamp('1710-10-01'), periods=5, freq='D') + dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert 
isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_2000(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='D') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_hour(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='H') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_minute(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='T') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="T") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_second(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='S') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="S") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index fd666f3d56c9d4..67fc70c17d7bc3 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -7,8 +7,15 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, Int64Index, Series, bdate_range, - date_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + Int64Index, + Series, + bdate_range, + date_range, + to_datetime, +) import pandas.util.testing as tm from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd @@ -17,8 +24,14 @@ class TestDatetimeIndexSetOps: - tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', - 'dateutil/US/Pacific'] + tz = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Singapore", + "dateutil/US/Pacific", + ] # TODO: moved from test_datetimelike; dedup with version below @pytest.mark.parametrize("sort", [None, False]) @@ -37,9 +50,9 @@ def test_union3(self, sort, box): second = everything[5:] # GH 10149 - expected = first.astype('O').union( - pd.Index(second.values, dtype='O') - ).astype('O') + expected = ( + first.astype("O").union(pd.Index(second.values, dtype="O")).astype("O") + ) case = box(second.values) result = first.union(case, sort=sort) tm.assert_index_equal(result, expected) @@ -47,27 +60,26 @@ def test_union3(self, sort, box): @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) def test_union(self, tz, sort): - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + rng1 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = pd.date_range("1/1/2000", freq="D", periods=10, tz=tz) expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + rng2 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", 
periods=5, tz=tz) + expected2 = pd.date_range("1/1/2000", freq="D", periods=8, tz=tz) expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + rng3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + expected3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) expected3_notsorted = rng3 - for rng, other, exp, exp_notsorted in [(rng1, other1, expected1, - expected1_notsorted), - (rng2, other2, expected2, - expected2_notsorted), - (rng3, other3, expected3, - expected3_notsorted)]: + for rng, other, exp, exp_notsorted in [ + (rng1, other1, expected1, expected1_notsorted), + (rng2, other2, expected2, expected2_notsorted), + (rng3, other3, expected3, expected3_notsorted), + ]: result_union = rng.union(other, sort=sort) tm.assert_index_equal(result_union, exp) @@ -80,8 +92,8 @@ def test_union(self, tz, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_coverage(self, sort): - idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) - ordered = DatetimeIndex(idx.sort_values(), freq='infer') + idx = DatetimeIndex(["2000-01-03", "2000-01-01", "2000-01-02"]) + ordered = DatetimeIndex(idx.sort_values(), freq="infer") result = ordered.union(idx, sort=sort) tm.assert_index_equal(result, ordered) @@ -91,8 +103,8 @@ def test_union_coverage(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1730(self, sort): - rng_a = date_range('1/1/2012', periods=4, freq='3H') - rng_b = date_range('1/1/2012', periods=4, freq='4H') + rng_a = date_range("1/1/2012", periods=4, freq="3H") + rng_b = date_range("1/1/2012", periods=4, freq="4H") result = rng_a.union(rng_b, sort=sort) exp = list(rng_a) + list(rng_b[1:]) @@ -104,16 +116,24 @@ def test_union_bug_1730(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1745(self, sort): - left = DatetimeIndex(['2012-05-11 15:19:49.695000']) - right = DatetimeIndex(['2012-05-29 13:04:21.322000', - '2012-05-11 15:27:24.873000', - '2012-05-11 15:31:05.350000']) + left = DatetimeIndex(["2012-05-11 15:19:49.695000"]) + right = DatetimeIndex( + [ + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) result = left.union(right, sort=sort) - exp = DatetimeIndex(['2012-05-11 15:19:49.695000', - '2012-05-29 13:04:21.322000', - '2012-05-11 15:27:24.873000', - '2012-05-11 15:31:05.350000']) + exp = DatetimeIndex( + [ + "2012-05-11 15:19:49.695000", + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) if sort is None: exp = exp.sort_values() tm.assert_index_equal(result, exp) @@ -121,6 +141,7 @@ def test_union_bug_1745(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_4564(self, sort): from pandas import DateOffset + left = date_range("2013-01-01", "2013-02-01") right = left + DateOffset(minutes=15) @@ -135,7 +156,7 @@ def test_union_bug_4564(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_freq_both_none(self, sort): # GH11086 - expected = bdate_range('20150101', periods=10) + expected = bdate_range("20150101", periods=10) expected.freq = None result = expected.union(expected, sort=sort) @@ -143,20 +164,20 @@ def test_union_freq_both_none(self, sort): assert result.freq is None def test_union_dataframe_index(self): - rng1 = date_range('1/1/1999', '1/1/2012', 
freq='MS') + rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") s1 = Series(np.random.randn(len(rng1)), rng1) - rng2 = date_range('1/1/1980', '12/1/2001', freq='MS') + rng2 = date_range("1/1/1980", "12/1/2001", freq="MS") s2 = Series(np.random.randn(len(rng2)), rng2) - df = DataFrame({'s1': s1, 's2': s2}) + df = DataFrame({"s1": s1, "s2": s2}) - exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') + exp = pd.date_range("1/1/1980", "1/1/2012", freq="MS") tm.assert_index_equal(df.index, exp) @pytest.mark.parametrize("sort", [None, False]) def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) - i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D') + i2 = date_range(start="2012-01-03 00:00:00", periods=10, freq="D") # Works i1.union(i2, sort=sort) # Fails with "AttributeError: can't set attribute" @@ -175,31 +196,35 @@ def test_intersection2(self): result = first.intersection(case) assert tm.equalContents(result, second) - third = Index(['a', 'b', 'c']) + third = Index(["a", "b", "c"]) result = first.intersection(third) expected = pd.Index([], dtype=object) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific']) + @pytest.mark.parametrize( + "tz", [None, "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, tz, sort): # GH 4690 (with tz) - base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + base = date_range("6/1/2000", "6/30/2000", freq="D", name="idx") # if target has the same name, it is preserved - rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') - expected2 = date_range('6/1/2000', '6/20/2000', freq='D', name='idx') + rng2 = date_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = date_range("6/1/2000", "6/20/2000", freq="D", name="idx") # if target name is different, it will be reset - rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') - expected3 = date_range('6/1/2000', '6/20/2000', freq='D', name=None) + rng3 = date_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = DatetimeIndex([], name='idx') + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = DatetimeIndex([], name="idx") - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng) tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -207,29 +232,31 @@ def test_intersection(self, tz, sort): assert result.tz == expected.tz # non-monotonic - base = DatetimeIndex(['2011-01-05', '2011-01-04', - '2011-01-02', '2011-01-03'], - tz=tz, name='idx') - - rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='idx') - expected2 = DatetimeIndex(['2011-01-04', '2011-01-02'], - tz=tz, name='idx') - - rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='other') - expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'], - tz=tz, name=None) + base = DatetimeIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx" + ) + + rng2 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx" + 
) + expected2 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name="idx") + + rng3 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + tz=tz, + name="other", + ) + expected3 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name=None) # GH 7880 - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, - name='idx') - expected4 = DatetimeIndex([], tz=tz, name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() @@ -240,7 +267,7 @@ def test_intersection(self, tz, sort): def test_intersection_empty(self): # empty same freq GH2129 - rng = date_range('6/1/2000', '6/15/2000', freq='T') + rng = date_range("6/1/2000", "6/15/2000", freq="T") result = rng[0:0].intersection(rng) assert len(result) == 0 @@ -249,7 +276,8 @@ def test_intersection_empty(self): def test_intersection_bug_1708(self): from pandas import DateOffset - index_1 = date_range('1/1/2012', periods=4, freq='12H') + + index_1 = date_range("1/1/2012", periods=4, freq="12H") index_2 = index_1 + DateOffset(hours=1) result = index_1 & index_2 @@ -258,24 +286,25 @@ def test_intersection_bug_1708(self): @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, tz, sort): - rng_dates = ['1/2/2000', '1/3/2000', '1/1/2000', '1/4/2000', - '1/5/2000'] + rng_dates = ["1/2/2000", "1/3/2000", "1/1/2000", "1/4/2000", "1/5/2000"] rng1 = pd.DatetimeIndex(rng_dates, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) expected1 = pd.DatetimeIndex(rng_dates, tz=tz) rng2 = pd.DatetimeIndex(rng_dates, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", periods=5, tz=tz) expected2 = pd.DatetimeIndex(rng_dates[:3], tz=tz) rng3 = pd.DatetimeIndex(rng_dates, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) expected3 = pd.DatetimeIndex(rng_dates, tz=tz) - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + ]: result_diff = rng.difference(other, sort) if sort is None: expected = expected.sort_values() @@ -290,29 +319,27 @@ def test_difference_freq(self, sort): expected = DatetimeIndex(["20160920", "20160925"], freq=None) idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) expected = DatetimeIndex(["20160920", "20160921"], freq=None) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_diff(self, sort): - dti1 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=100) - dti2 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=98) + dti1 = date_range(freq="Q-JAN", start=datetime(1997, 12, 
31), periods=100) + dti2 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=98) assert len(dti1.difference(dti2, sort)) == 2 @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_union_join_empty(self, sort): - dti = date_range(start='1/1/2001', end='2/1/2001', freq='D') + dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) result = dti.union(empty, sort=sort) - expected = dti.astype('O') + expected = dti.astype("O") tm.assert_index_equal(result, expected) result = dti.join(empty) @@ -320,16 +347,13 @@ def test_datetimeindex_union_join_empty(self, sort): tm.assert_index_equal(result, dti) def test_join_nonunique(self): - idx1 = to_datetime(['2012-11-06 16:00:11.477563', - '2012-11-06 16:00:11.477563']) - idx2 = to_datetime(['2012-11-06 15:11:09.006507', - '2012-11-06 15:11:09.006507']) - rs = idx1.join(idx2, how='outer') + idx1 = to_datetime(["2012-11-06 16:00:11.477563", "2012-11-06 16:00:11.477563"]) + idx2 = to_datetime(["2012-11-06 15:11:09.006507", "2012-11-06 15:11:09.006507"]) + rs = idx1.join(idx2, how="outer") assert rs.is_monotonic class TestBusinessDatetimeIndex: - def setup_method(self, method): self.rng = bdate_range(START, END) @@ -376,14 +400,14 @@ def test_outer_join(self): left = self.rng[:10] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @@ -391,19 +415,19 @@ def test_outer_join(self): left = self.rng[:5] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_join = self.rng.join(rng, how='outer') + the_join = self.rng.join(rng, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @pytest.mark.parametrize("sort", [None, False]) def test_union_not_cacheable(self, sort): - rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng = date_range("1/1/2000", periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2, sort=sort) @@ -420,7 +444,7 @@ def test_union_not_cacheable(self, sort): tm.assert_index_equal(the_union, expected) def test_intersection(self): - rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng = date_range("1/1/2000", periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_int = rng1.intersection(rng2) @@ -439,15 +463,16 @@ def test_intersection(self): def test_intersection_bug(self): # GH #771 - a = bdate_range('11/30/2011', '12/31/2011') - b = bdate_range('12/10/2011', '12/20/2011') + a = bdate_range("11/30/2011", "12/31/2011") + b = bdate_range("12/10/2011", "12/20/2011") result = a.intersection(b) tm.assert_index_equal(result, b) @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_pytz(self, sort): from pytz import timezone - tz = timezone('US/Eastern') + + tz = timezone("US/Eastern") early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) @@ -455,10 +480,8 @@ def test_month_range_union_tz_pytz(self, sort): late_start = datetime(2011, 3, 1) late_end = datetime(2011, 5, 1) - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = 
date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) early_dr.union(late_dr, sort=sort) @@ -466,7 +489,8 @@ def test_month_range_union_tz_pytz(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz - tz = dateutil_gettz('US/Eastern') + + tz = dateutil_gettz("US/Eastern") early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) @@ -474,18 +498,15 @@ def test_month_range_union_tz_dateutil(self, sort): late_start = datetime(2011, 3, 1) late_end = datetime(2011, 5, 1) - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) early_dr.union(late_dr, sort=sort) class TestCustomDatetimeIndex: - def setup_method(self, method): - self.rng = bdate_range(START, END, freq='C') + self.rng = bdate_range(START, END, freq="C") @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): @@ -527,14 +548,14 @@ def test_outer_join(self): left = self.rng[:10] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @@ -542,19 +563,19 @@ def test_outer_join(self): left = self.rng[:5] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_join = self.rng.join(rng, how='outer') + the_join = self.rng.join(rng, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None def test_intersection_bug(self): # GH #771 - a = bdate_range('11/30/2011', '12/31/2011', freq='C') - b = bdate_range('12/10/2011', '12/20/2011', freq='C') + a = bdate_range("11/30/2011", "12/31/2011", freq="C") + b = bdate_range("12/10/2011", "12/20/2011", freq="C") result = a.intersection(b) tm.assert_index_equal(result, b) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index af0183379790a5..059dbb00019d8e 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -14,8 +14,14 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Timestamp, bdate_range, date_range, isna, - to_datetime) + DatetimeIndex, + Index, + Timestamp, + bdate_range, + date_range, + isna, + to_datetime, +) import pandas.util.testing as tm @@ -36,7 +42,7 @@ def dst(self, dt): return timedelta(0) -fixed_off = FixedOffset(-420, '-07:00') +fixed_off = FixedOffset(-420, "-07:00") fixed_off_no_name = FixedOffset(-330, None) @@ -47,43 +53,43 @@ def test_tz_convert_nat(self): # GH#5546 dates = [pd.NaT] idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = 
idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) - - dates = ['2010-12-01 00:00', '2010-12-02 00:00', pd.NaT] + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) + + dates = ["2010-12-01 00:00", "2010-12-02 00:00", pd.NaT] idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 03:00', '2010-12-02 03:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 03:00", "2010-12-02 03:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) idx = idx + pd.offsets.Hour(5) - expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - idx = idx.tz_convert('US/Pacific') - expected = ['2010-12-01 05:00', '2010-12-02 05:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + idx = idx.tz_convert("US/Pacific") + expected = ["2010-12-01 05:00", "2010-12-02 05:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - idx = idx + np.timedelta64(3, 'h') - expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + idx = idx + np.timedelta64(3, "h") + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 11:00', '2010-12-02 11:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 11:00", "2010-12-02 11:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_tz_convert_compat_timestamp(self, prefix): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - idx = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") - conv = idx[0].tz_convert(prefix + 'US/Pacific') - expected = idx.tz_convert(prefix + 'US/Pacific')[0] + conv = idx[0].tz_convert(prefix + "US/Pacific") + expected = idx.tz_convert(prefix + "US/Pacific")[0] assert conv == expected @@ -92,196 +98,203 @@ def test_dti_tz_convert_hour_overflow_dst(self): # https://github.com/pandas-dev/pandas/issues/13306 # sorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2009-05-12 09:50:32'] - tt = DatetimeIndex(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] + tt = 
DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # sorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2009-05-12 13:50:32'] - tt = DatetimeIndex(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) # unsorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2008-05-12 09:50:32'] - tt = DatetimeIndex(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # unsorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2008-05-12 13:50:32'] - tt = DatetimeIndex(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): # Regression test for GH#13306 # sorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2009-05-12 09:50:32', tz=tz)] + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2009-05-12 09:50:32", tz=tz), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('UTC') + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # sorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2009-05-12 13:50:32', tz='UTC')] + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2009-05-12 13:50:32", tz="UTC"), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('US/Eastern') + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) # unsorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2008-05-12 09:50:32', tz=tz)] + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2008-05-12 09:50:32", tz=tz), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('UTC') + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # unsorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2008-05-12 13:50:32', tz='UTC')] + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2008-05-12 13:50:32", tz="UTC"), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('US/Eastern') + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) - 
@pytest.mark.parametrize('freq, n', [('H', 1), ('T', 60), ('S', 3600)]) + @pytest.mark.parametrize("freq, n", [("H", 1), ("T", 60), ("S", 3600)]) def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. - idx = date_range(datetime(2011, 3, 26, 23), - datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') + idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize("UTC") + idx = idx.tz_convert("Europe/Moscow") expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) tm.assert_index_equal(idx.hour, Index(expected)) def test_dti_tz_convert_dst(self): - for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + for freq, n in [("H", 1), ("T", 60), ("S", 3600)]: # Start DST - idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, - 0, 1, 3, 4, 5]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) - idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) # End DST - idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([19, 20, 21, 22, 23, - 0, 1, 1, 2, 3, 4]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) - idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10]), - np.array([n, n, n, n, n, n, n, n, n, - n, n, n, 1])) + idx = date_range( + "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) # daily # Start DST - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') + idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") tm.assert_index_equal(idx.hour, Index([19, 19])) - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') + idx = date_range( + "2014-03-08 00:00", 
"2014-03-09 00:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") tm.assert_index_equal(idx.hour, Index([5, 5])) # End DST - idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') + idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") tm.assert_index_equal(idx.hour, Index([20, 20])) - idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') + idx = date_range( + "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") tm.assert_index_equal(idx.hour, Index([4, 4])) def test_tz_convert_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', - tz='UTC') - exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', - tz='UTC') - exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") + exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', - tz='UTC') - exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', - tz='UTC') - exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="T", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="T") - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), - (idx4, exp4)]: + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: converted = idx.tz_convert(tz) reset = converted.tz_convert(None) tm.assert_index_equal(reset, expected) assert reset.tzinfo is None - expected = converted.tz_convert('UTC').tz_localize(None) + expected = converted.tz_convert("UTC").tz_localize(None) tm.assert_index_equal(reset, expected) def test_dti_tz_convert_tzlocal(self): # GH#13583 # tz_convert doesn't affect to internal - dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") dti2 = dti.tz_convert(dateutil.tz.tzlocal()) tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) dti2 = dti.tz_convert(None) tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', - pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tz) # Values are unmodified @@ -289,9 +302,9 @@ def 
test_dti_tz_convert_utc_to_local_no_modify(self, tz): assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_convert_unsorted(self, tzstr): - dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') + dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") dr = dr.tz_convert(tzstr) result = dr[::-1].hour @@ -303,80 +316,80 @@ def test_tz_convert_unsorted(self, tzstr): def test_dti_tz_localize_nonexistent_raise_coerce(self): # GH#13057 - times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) - tz = 'US/Eastern' + tz = "US/Eastern" with pytest.raises(pytz.NonExistentTimeError): index.tz_localize(tz=tz) with pytest.raises(pytz.NonExistentTimeError): with tm.assert_produces_warning(FutureWarning): - index.tz_localize(tz=tz, errors='raise') - - with tm.assert_produces_warning(FutureWarning, - clear=FutureWarning, - check_stacklevel=False): - result = index.tz_localize(tz=tz, errors='coerce') - test_times = ['2015-03-08 01:00-05:00', 'NaT', - '2015-03-08 03:00-04:00'] + index.tz_localize(tz=tz, errors="raise") + + with tm.assert_produces_warning( + FutureWarning, clear=FutureWarning, check_stacklevel=False + ): + result = index.tz_localize(tz=tz, errors="coerce") + test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] dti = to_datetime(test_times, utc=True) - expected = dti.tz_convert('US/Eastern') + expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) with pytest.raises(pytz.AmbiguousTimeError): dr.tz_localize(tz) # With repeated hours, we can infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='infer') + localized = di.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous="infer")) # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) localized = dr.tz_localize(tz) - localized_infer = dr.tz_localize(tz, ambiguous='infer') + localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - 
gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM - dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) with pytest.raises(pytz.NonExistentTimeError): dr.tz_localize(tz) # after dst transition, it works - dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=pd.offsets.Hour(), tz=tz) + dr = date_range( + datetime(2011, 3, 13, 3, 30), periods=3, freq=pd.offsets.Hour(), tz=tz + ) # November 6, 2011, fall back, repeat 2 AM hour - dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) with pytest.raises(pytz.AmbiguousTimeError): dr.tz_localize(tz) # UTC is OK - dr = date_range(datetime(2011, 3, 13), periods=48, - freq=pd.offsets.Minute(30), tz=pytz.utc) + dr = date_range( + datetime(2011, 3, 13), periods=48, freq=pd.offsets.Minute(30), tz=pytz.utc + ) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] idx = DatetimeIndex(strdates) conv = idx.tz_localize(tzstr) @@ -386,47 +399,51 @@ def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): assert conv.tz == fromdates.tz tm.assert_numpy_array_equal(conv.values, fromdates.values) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_tz_localize(self, prefix): - tzstr = prefix + 'US/Eastern' - dti = pd.date_range(start='1/1/2005', end='1/1/2005 0:00:30.256', - freq='L') + tzstr = prefix + "US/Eastern" + dti = pd.date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="L") dti2 = dti.tz_localize(tzstr) - dti_utc = pd.date_range(start='1/1/2005 05:00', - end='1/1/2005 5:00:30.256', freq='L', tz='utc') + dti_utc = pd.date_range( + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="L", tz="utc" + ) tm.assert_numpy_array_equal(dti2.values, dti_utc.values) - dti3 = dti2.tz_convert(prefix + 'US/Pacific') + dti3 = dti2.tz_convert(prefix + "US/Pacific") tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - dti = pd.date_range(start='11/6/2011 1:59', end='11/6/2011 2:00', - freq='L') + dti = pd.date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") with pytest.raises(pytz.AmbiguousTimeError): dti.tz_localize(tzstr) - dti = pd.date_range(start='3/13/2011 1:59', end='3/13/2011 2:00', - freq='L') + dti = pd.date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") with pytest.raises(pytz.NonExistentTimeError): dti.tz_localize(tzstr) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', - pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities # 2) convert to UTC - rng = date_range('3/10/2012', '3/11/2012', freq='30T') + rng = date_range("3/10/2012", "3/11/2012", freq="30T") converted = rng.tz_localize(tz) expected_naive = rng + 
pd.offsets.Hour(5) tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') + rng = date_range("3/11/2012", "3/12/2012", freq="30T") # Is this really how it should fail?? with pytest.raises(pytz.NonExistentTimeError): rng.tz_localize(tz) @@ -435,7 +452,7 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): # note: this tz tests that a tz-naive index can be localized # and de-localized successfully, when there are no DST transitions # in the range. - idx = date_range(start='2014-06-01', end='2014-08-30', freq='15T') + idx = date_range(start="2014-06-01", end="2014-08-30", freq="15T") tz = tz_aware_fixture localized = idx.tz_localize(tz) # cant localize a tz-aware object @@ -446,10 +463,10 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tm.assert_index_equal(reset, idx) def test_dti_tz_localize_naive(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") - conv = rng.tz_localize('US/Pacific') - exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + conv = rng.tz_localize("US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") tm.assert_index_equal(conv, exp) @@ -458,55 +475,66 @@ def test_dti_tz_localize_tzlocal(self): offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) offset = int(offset.total_seconds() * 1000000000) - dti = date_range(start='2001-01-01', end='2001-03-01') + dti = date_range(start="2001-01-01", end="2001-03-01") dti2 = dti.tz_localize(dateutil.tz.tzlocal()) tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_nat(self, tz): - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='NaT') + localized = di.tz_localize(tz, ambiguous="NaT") - times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', - '11/06/2011 03:00'] - di_test = DatetimeIndex(times, tz='US/Eastern') + times = [ + "11/06/2011 00:00", + np.NaN, + np.NaN, + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di_test = DatetimeIndex(times, tz="US/Eastern") # left dtype is datetime64[ns, US/Eastern] # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_flags(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # Pass in flags to determine right dst transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + dr = date_range( + 
datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] # Test tz_localize di = DatetimeIndex(times) is_dst = [1, 1, 0, 0, 0] localized = di.tz_localize(tz, ambiguous=is_dst) tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous=is_dst)) localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) tm.assert_index_equal(dr, localized) - localized = di.tz_localize(tz, - ambiguous=np.array(is_dst).astype('bool')) + localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) tm.assert_index_equal(dr, localized) # Test constructor @@ -528,76 +556,81 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): tm.assert_index_equal(dr, localized) # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) tm.assert_index_equal(localized, localized_is_dst) # TODO: belongs outside tz_localize tests? - @pytest.mark.parametrize('tz', ['Europe/London', 'dateutil/Europe/London']) + @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) def test_dti_construction_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 with pytest.raises(pytz.AmbiguousTimeError): - date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", freq="H") + date_range( + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" + ) - times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", - tz=tz, ambiguous='infer') - assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") + times = date_range( + "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" + ) + assert times[0] == Timestamp("2013-10-26 23:00", tz=tz, freq="H") - if str(tz).startswith('dateutil'): + if str(tz).startswith("dateutil"): # fixed ambiguous behavior # see GH#14621 - assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', - tz=tz, freq="H") + assert times[-1] == Timestamp("2013-10-27 01:00:00+0100", tz=tz, freq="H") else: - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - - @pytest.mark.parametrize('tz, option, expected', [ - ['US/Pacific', 'shift_forward', "2019-03-10 03:00"], - ['dateutil/US/Pacific', 'shift_forward', "2019-03-10 03:00"], - ['US/Pacific', 'shift_backward', "2019-03-10 01:00"], - pytest.param('dateutil/US/Pacific', 'shift_backward', - "2019-03-10 01:00", - marks=pytest.mark.xfail(reason="GH 24329")), - ['US/Pacific', timedelta(hours=1), "2019-03-10 03:00"] - ]) + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H") + + @pytest.mark.parametrize( + "tz, option, expected", + [ + ["US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["US/Pacific", "shift_backward", "2019-03-10 01:00"], + pytest.param( + "dateutil/US/Pacific", + "shift_backward", + "2019-03-10 01:00", + marks=pytest.mark.xfail(reason="GH 24329"), + ), + ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], + ], + ) def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): # construction 
with an nonexistent end-point with pytest.raises(pytz.NonExistentTimeError): - date_range("2019-03-10 00:00", "2019-03-10 02:00", - tz="US/Pacific", freq="H") + date_range( + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" + ) - times = date_range("2019-03-10 00:00", "2019-03-10 02:00", freq="H", - tz=tz, nonexistent=option) + times = date_range( + "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option + ) assert times[-1] == Timestamp(expected, tz=tz, freq="H") def test_dti_tz_localize_bdate_range(self): - dr = pd.bdate_range('1/1/2009', '1/1/2010') - dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + dr = pd.bdate_range("1/1/2009", "1/1/2010") + dr_utc = pd.bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) localized = dr.tz_localize(pytz.utc) tm.assert_index_equal(dr_utc, localized) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) - @pytest.mark.parametrize('method, exp', [ - ['NaT', pd.NaT], - ['raise', None], - ['foo', 'invalid'] - ]) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", [["NaT", pd.NaT], ["raise", None], ["foo", "invalid"]] + ) def test_dti_tz_localize_nonexistent(self, tz, method, exp): # GH 8917 n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') - if method == 'raise': + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") + if method == "raise": with pytest.raises(pytz.NonExistentTimeError): dti.tz_localize(tz, nonexistent=method) - elif exp == 'invalid': + elif exp == "invalid": with pytest.raises(ValueError): dti.tz_localize(tz, nonexistent=method) else: @@ -605,106 +638,135 @@ def test_dti_tz_localize_nonexistent(self, tz, method, exp): expected = DatetimeIndex([exp] * n, tz=tz) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [ - ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00', - 'forward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:59:59.999999999', 'backward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 03:20:00', timedelta(hours=1)], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:20:00', timedelta(hours=-1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00', - 'forward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999', - 'backward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00', - timedelta(hours=1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00', - timedelta(hours=-1)] - ]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_dti_tz_localize_nonexistent_shift(self, start_ts, tz, - end_ts, shift, - tz_type): + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", 
+ timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): # GH 8917 tz = tz_type + tz if isinstance(shift, str): - shift = 'shift_' + shift + shift = "shift_" + shift dti = DatetimeIndex([Timestamp(start_ts)]) result = dti.tz_localize(tz, nonexistent=shift) expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('offset', [-1, 1]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): # GH 8917 - tz = tz_type + 'Europe/Warsaw' - dti = DatetimeIndex([Timestamp('2015-03-29 02:20:00')]) + tz = tz_type + "Europe/Warsaw" + dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) msg = "The provided timedelta will relocalize on a nonexistent time" with pytest.raises(ValueError, match=msg): dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_dti_tz_localize_errors_deprecation(self): # GH 22644 - tz = 'Europe/Warsaw' + tz = "Europe/Warsaw" n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(ValueError): - dti.tz_localize(tz, errors='foo') + dti.tz_localize(tz, errors="foo") # make sure errors='coerce' gets mapped correctly to nonexistent - result = dti.tz_localize(tz, errors='coerce') - expected = dti.tz_localize(tz, nonexistent='NaT') + result = dti.tz_localize(tz, errors="coerce") + expected = dti.tz_localize(tz, nonexistent="NaT") tm.assert_index_equal(result, expected) # ------------------------------------------------------------- # DatetimeIndex.normalize def test_normalize_tz(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz='US/Eastern') + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz='US/Eastern') + expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') + expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized @td.skip_if_windows - @pytest.mark.parametrize('timezone', ['US/Pacific', 'US/Eastern', 'UTC', - 'Asia/Kolkata', 'Asia/Shanghai', - 'Australia/Canberra']) + @pytest.mark.parametrize( + "timezone", + [ + "US/Pacific", + 
"US/Eastern", + "UTC", + "Asia/Kolkata", + "Asia/Shanghai", + "Australia/Canberra", + ], + ) def test_normalize_tz_local(self, timezone): # GH#13459 with tm.set_timezone(timezone): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz=tzlocal()) + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz=tzlocal()) + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) tm.assert_index_equal(result, expected) assert result.is_normalized @@ -713,15 +775,15 @@ def test_normalize_tz_local(self, timezone): # ------------------------------------------------------------ # DatetimeIndex.__new__ - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_constructor_static_tzinfo(self, prefix): # it works! - index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + 'EST') + index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") index.hour index[0] def test_dti_constructor_with_fixed_tz(self): - off = FixedOffset(420, '+07:00') + off = FixedOffset(420, "+07:00") start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) rng = date_range(start=start, end=end) @@ -730,44 +792,39 @@ def test_dti_constructor_with_fixed_tz(self): rng2 = date_range(start, periods=len(rng), tz=off) tm.assert_index_equal(rng, rng2) - rng3 = date_range('3/11/2012 05:00:00+07:00', - '6/11/2012 05:00:00+07:00') + rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") assert (rng.values == rng3.values).all() - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_convert_datetime_list(self, tzstr): - dr = date_range('2012-06-02', periods=10, - tz=tzstr, name='foo') - dr2 = DatetimeIndex(list(dr), name='foo') + dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") + dr2 = DatetimeIndex(list(dr), name="foo") tm.assert_index_equal(dr, dr2) assert dr.tz == dr2.tz - assert dr2.name == 'foo' + assert dr2.name == "foo" def test_dti_construction_univalent(self): - rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', - tz='US/Eastern') - rng2 = DatetimeIndex(data=rng, tz='US/Eastern') + rng = date_range("03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern") + rng2 = DatetimeIndex(data=rng, tz="US/Eastern") tm.assert_index_equal(rng, rng2) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] index = DatetimeIndex(d) assert timezones.tz_compare(index.tz, tz) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_constructors(self, tzstr): """ Test different DatetimeIndex constructions with timezone Follow-up of GH#4229 """ - arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] + arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = pd.date_range(start="2005-11-10 08:00:00", freq='H', periods=2, - tz=tzstr) + idx2 = pd.date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) idx3 = DatetimeIndex(arr, tz=tzstr) idx4 = DatetimeIndex(np.array(arr), tz=tzstr) @@ -778,10 +835,10 @@ 
def test_dti_tz_constructors(self, tzstr): # Unsorted def test_join_utc_convert(self, join_type): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") - left = rng.tz_convert('US/Eastern') - right = rng.tz_convert('Europe/Berlin') + left = rng.tz_convert("US/Eastern") + right = rng.tz_convert("Europe/Berlin") result = left.join(left[:-5], how=join_type) assert isinstance(result, DatetimeIndex) @@ -789,30 +846,30 @@ def test_join_utc_convert(self, join_type): result = left.join(right[:-5], how=join_type) assert isinstance(result, DatetimeIndex) - assert result.tz.zone == 'UTC' + assert result.tz.zone == "UTC" - @pytest.mark.parametrize("dtype", [ - None, 'datetime64[ns, CET]', - 'datetime64[ns, EST]', 'datetime64[ns, UTC]' - ]) + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) def test_date_accessor(self, dtype): # Regression test for GH#21230 expected = np.array([date(2018, 6, 4), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype) + index = DatetimeIndex(["2018-06-04 10:00:00", pd.NaT], dtype=dtype) result = index.date tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [ - None, 'datetime64[ns, CET]', - 'datetime64[ns, EST]', 'datetime64[ns, UTC]' - ]) + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) def test_time_accessor(self, dtype): # Regression test for GH#21267 expected = np.array([time(10, 20, 30), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype) + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], dtype=dtype) result = index.time tm.assert_numpy_array_equal(result, expected) @@ -823,7 +880,7 @@ def test_timetz_accessor(self, tz_naive_fixture): expected = np.array([time(10, 20, 30, tzinfo=tz), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], tz=tz) + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], tz=tz) result = index.timetz tm.assert_numpy_array_equal(result, expected) @@ -837,10 +894,10 @@ def test_dti_drop_dont_lose_tz(self): def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t3 = DatetimeIndex(['2019-01-01 10:00'], freq='H') + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq - t4 = DatetimeIndex(['2019-01-02 12:00'], tz='UTC', freq='T') - assert t4.tz_convert(tz='UTC').freq == t4.freq + t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="T") + assert t4.tz_convert(tz="UTC").freq == t4.freq def test_drop_dst_boundary(self): # see gh-18031 @@ -851,32 +908,52 @@ def test_drop_dst_boundary(self): end = pd.Timestamp("201710290300", tz=tz) index = pd.date_range(start=start, end=end, freq=freq) - expected = DatetimeIndex(["201710290115", "201710290130", - "201710290145", "201710290200", - "201710290215", "201710290230", - "201710290245", "201710290200", - "201710290215", "201710290230", - "201710290245", "201710290300"], - tz=tz, freq=freq, - ambiguous=[True, True, True, True, - True, True, True, False, - False, False, False, False]) + expected = DatetimeIndex( + [ + "201710290115", + "201710290130", + "201710290145", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290300", + ], + tz=tz, + freq=freq, + ambiguous=[ + True, + True, + True, + True, + True, + True, + True, 
+ False, + False, + False, + False, + False, + ], + ) result = index.drop(index[0]) tm.assert_index_equal(result, expected) def test_date_range_localize(self): - rng = date_range('3/11/2012 03:00', periods=15, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], - tz='US/Eastern') - rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') - rng3 = rng3.tz_localize('US/Eastern') + rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") + rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + rng3 = rng3.tz_localize("US/Eastern") tm.assert_index_equal(rng, rng3) # DST transition time val = rng[0] - exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 03:00", tz="US/Eastern") assert val.hour == 3 assert exp.hour == 3 @@ -884,26 +961,23 @@ def test_date_range_localize(self): tm.assert_index_equal(rng[:2], rng2) # Right before the DST transition - rng = date_range('3/11/2012 00:00', periods=2, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], - tz='US/Eastern') + rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern") tm.assert_index_equal(rng, rng2) - exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") assert exp.hour == 0 assert rng[0] == exp - exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 01:00", tz="US/Eastern") assert exp.hour == 1 assert rng[1] == exp - rng = date_range('3/11/2012 00:00', periods=10, freq='H', - tz='US/Eastern') + rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") assert rng[2].hour == 3 def test_timestamp_equality_different_timezones(self): - utc_range = date_range('1/1/2000', periods=20, tz='UTC') - eastern_range = utc_range.tz_convert('US/Eastern') - berlin_range = utc_range.tz_convert('Europe/Berlin') + utc_range = date_range("1/1/2000", periods=20, tz="UTC") + eastern_range = utc_range.tz_convert("US/Eastern") + berlin_range = utc_range.tz_convert("Europe/Berlin") for a, b, c in zip(utc_range, eastern_range, berlin_range): assert a == b @@ -915,7 +989,7 @@ def test_timestamp_equality_different_timezones(self): assert (berlin_range == eastern_range).all() def test_dti_intersection(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") left = rng[10:90][::-1] right = rng[20:80][::-1] @@ -925,24 +999,24 @@ def test_dti_intersection(self): assert result.tz == left.tz def test_dti_equals_with_tz(self): - left = date_range('1/1/2011', periods=100, freq='H', tz='utc') - right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') + left = date_range("1/1/2011", periods=100, freq="H", tz="utc") + right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") assert not left.equals(right) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_nat(self, tzstr): idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def 
test_dti_astype_asobject_tzinfos(self, tzstr): # GH#1345 # dates around a dst transition - rng = date_range('2/13/2010', '5/6/2010', tz=tzstr) + rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) objs = rng.astype(object) for i, x in enumerate(objs): @@ -956,28 +1030,28 @@ def test_dti_astype_asobject_tzinfos(self, tzstr): assert x == exval assert x.tzinfo == exval.tzinfo - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_with_timezone_repr(self, tzstr): - rng = date_range('4/13/2010', '5/6/2010') + rng = date_range("4/13/2010", "5/6/2010") rng_eastern = rng.tz_localize(tzstr) rng_repr = repr(rng_eastern) - assert '2010-04-13 00:00:00' in rng_repr + assert "2010-04-13 00:00:00" in rng_repr - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_take_dont_lose_meta(self, tzstr): - rng = date_range('1/1/2000', periods=20, tz=tzstr) + rng = date_range("1/1/2000", periods=20, tz=tzstr) result = rng.take(range(5)) assert result.tz == rng.tz assert result.freq == rng.freq - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tzstr) expected = rng[-1].astimezone(tz) @@ -987,15 +1061,16 @@ def test_utc_box_timestamp_and_localize(self, tzstr): assert stamp.tzinfo == expected.tzinfo # right tzinfo - rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tzstr) # test not valid for dateutil timezones. 
# assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert ('EDT' in repr(rng_eastern[0].tzinfo) or - 'tzfile' in repr(rng_eastern[0].tzinfo)) + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) def test_dti_to_pydatetime(self): - dt = dateutil.parser.parse('2012-06-13T01:39:00Z') + dt = dateutil.parser.parse("2012-06-13T01:39:00Z") dt = dt.replace(tzinfo=tzlocal()) arr = np.array([dt], dtype=object) @@ -1003,15 +1078,19 @@ def test_dti_to_pydatetime(self): result = to_datetime(arr, utc=True) assert result.tz is pytz.utc - rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) + rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) arr = rng.to_pydatetime() result = to_datetime(arr, utc=True) assert result.tz is pytz.utc def test_dti_to_pydatetime_fizedtz(self): - dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)]) + dates = np.array( + [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + ) dti = DatetimeIndex(dates) result = dti.to_pydatetime() @@ -1020,8 +1099,7 @@ def test_dti_to_pydatetime_fizedtz(self): result = dti._mpl_repr() tm.assert_numpy_array_equal(dates, result) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Central'), - gettz('US/Central')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) def test_with_tz(self, tz): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) @@ -1029,8 +1107,8 @@ def test_with_tz(self, tz): assert dr.tz is pytz.utc # DateRange with naive datetimes - dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) - dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized central = dr.tz_convert(tz) @@ -1045,31 +1123,30 @@ def test_with_tz(self, tz): assert central[0].tz is comp # datetimes with tzinfo set - dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - datetime(2009, 1, 1, tzinfo=pytz.utc)) + dr = bdate_range( + datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + ) with pytest.raises(Exception): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', - tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_field_access_localize(self, prefix): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - rng = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") assert (rng.hour == 0).all() # a more unusual time zone, #1946 - dr = date_range('2011-10-02 00:00', freq='h', periods=10, - tz=prefix + 'America/Atikokan') + dr = date_range( + "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" + ) expected = Index(np.arange(10, dtype=np.int64)) tm.assert_index_equal(dr.hour, expected) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 - dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)] + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), 
datetime(2000, 1, 3)] dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates] result = DatetimeIndex(dates_aware) @@ -1082,70 +1159,74 @@ def test_dti_convert_tz_aware_datetime_datetime(self, tz): def test_dti_union_aware(self): # non-overlapping - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", - tz="US/Central") + rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", - tz="US/Eastern") + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") result = rng.union(rng2) - expected = rng.astype('O').union(rng2.astype('O')) + expected = rng.astype("O").union(rng2.astype("O")) tm.assert_index_equal(result, expected) - assert result[0].tz.zone == 'US/Central' - assert result[-1].tz.zone == 'US/Eastern' + assert result[0].tz.zone == "US/Central" + assert result[-1].tz.zone == "US/Eastern" def test_dti_union_mixed(self): # GH 21671 - rng = DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT]) - rng2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz='Asia/Tokyo') + rng = DatetimeIndex([pd.Timestamp("2011-01-01"), pd.NaT]) + rng2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") result = rng.union(rng2) - expected = Index([pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), - pd.Timestamp('2012-01-02', tz='Asia/Tokyo')], - dtype=object) + expected = Index( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), + pd.Timestamp("2012-01-02", tz="Asia/Tokyo"), + ], + dtype=object, + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", - dateutil.tz.tzoffset(None, -28800)]) + @pytest.mark.parametrize( + "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] + ) @pytest.mark.usefixtures("datetime_tz_utc") def test_iteration_preserves_nanoseconds(self, tz): # GH 19603 - index = DatetimeIndex(["2018-02-08 15:00:00.168456358", - "2018-02-08 15:00:00.168456359"], tz=tz) + index = DatetimeIndex( + ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz + ) for i, ts in enumerate(index): assert ts == index[i] class TestDateRange: """Tests for date_range with timezones""" + def test_hongkong_tz_convert(self): # GH#1673 smoke test - dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') + dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") # it works! 
dr.hour - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_span_dst_transition(self, tzstr): # GH#1778 # Standard -> Daylight Savings Time - dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', - tz='US/Eastern') + dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") assert (dr.hour == 0).all() - dr = date_range('2012-11-02', periods=10, tz=tzstr) + dr = date_range("2012-11-02", periods=10, tz=tzstr) result = dr.hour expected = Index([0] * 10) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_timezone_str_argument(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - result = date_range('1/1/2000', periods=10, tz=tzstr) - expected = date_range('1/1/2000', periods=10, tz=tz) + result = date_range("1/1/2000", periods=10, tz=tzstr) + expected = date_range("1/1/2000", periods=10, tz=tz) tm.assert_index_equal(result, expected) @@ -1159,29 +1240,30 @@ def test_date_range_with_fixedoffset_noname(self): idx = Index([start, end]) assert off == idx.tz - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_with_tz(self, tzstr): - stamp = Timestamp('3/11/2012 05:00', tz=tzstr) + stamp = Timestamp("3/11/2012 05:00", tz=tzstr) assert stamp.hour == 5 - rng = date_range('3/11/2012 04:00', periods=10, freq='H', - tz=tzstr) + rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) assert stamp == rng[1] class TestToDatetime: """Tests for the to_datetime constructor with timezones""" + def test_to_datetime_utc(self): - arr = np.array([dateutil.parser.parse('2012-06-13T01:39:00Z')], - dtype=object) + arr = np.array([dateutil.parser.parse("2012-06-13T01:39:00Z")], dtype=object) result = to_datetime(arr, utc=True) assert result.tz is pytz.utc def test_to_datetime_fixed_offset(self): - dates = [datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)] + dates = [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] result = to_datetime(dates) assert result.tz == fixed_off diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 784633b2512cec..ec4310dbc8396b 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -19,8 +19,16 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, NaT, Series, Timestamp, date_range, isna, - to_datetime) + DataFrame, + DatetimeIndex, + Index, + NaT, + Series, + Timestamp, + date_range, + isna, + to_datetime, +) from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools from pandas.util import testing as tm @@ -28,23 +36,21 @@ class TestTimeConversionFormats: - - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format(self, cache): - values = ['1/1/2000', '1/2/2000', '1/3/2000'] - - results1 = [Timestamp('20000101'), Timestamp('20000201'), - Timestamp('20000301')] - results2 = [Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')] - for vals, expecteds in 
[(values, (Index(results1), Index(results2))), - (Series(values), - (Series(results1), Series(results2))), - (values[0], (results1[0], results2[0])), - (values[1], (results1[1], results2[1])), - (values[2], (results1[2], results2[2]))]: - - for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): + values = ["1/1/2000", "1/2/2000", "1/3/2000"] + + results1 = [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")] + results2 = [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")] + for vals, expecteds in [ + (values, (Index(results1), Index(results2))), + (Series(values), (Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], (results1[2], results2[2])), + ]: + + for i, fmt in enumerate(["%d/%m/%Y", "%m/%d/%Y"]): result = to_datetime(vals, format=fmt, cache=cache) expected = expecteds[i] @@ -55,123 +61,138 @@ def test_to_datetime_format(self, cache): else: tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_YYYYMMDD(self, cache): s = Series([19801222, 19801222] + [19810105] * 5) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m%d', cache=cache) + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) - result = to_datetime(s.apply(str), format='%Y%m%d', cache=cache) + result = to_datetime(s.apply(str), format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # with NaT - expected = Series([Timestamp("19801222"), Timestamp("19801222")] + - [Timestamp("19810105")] * 5) + expected = Series( + [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + ) expected[2] = np.nan s[2] = np.nan - result = to_datetime(s, format='%Y%m%d', cache=cache) + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # string with NaT s = s.apply(str) - s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d', cache=cache) + s[2] = "nat" + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # coercion # GH 7930 s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', - cache=cache) - expected = Series([datetime(2012, 12, 31), - datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object) + result = pd.to_datetime(s, format="%Y%m%d", errors="ignore", cache=cache) + expected = Series( + [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object, + ) tm.assert_series_equal(result, expected) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') + result = pd.to_datetime(s, format="%Y%m%d", errors="coerce", cache=cache) + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("input_s, expected", [ - # NaN before strings with invalid date values - [Series(['19801222', np.nan, '20010012', '10019999']), - Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], - # NaN after strings with invalid date values - [Series(['19801222', '20010012', '10019999', np.nan]), - Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], - # NaN before integers with invalid date values - [Series([20190813, np.nan, 20010012, 20019999]), - Series([Timestamp('20190813'), np.nan, 
np.nan, np.nan])], - # NaN after integers with invalid date values - [Series([20190813, 20010012, np.nan, 20019999]), - Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]]) + @pytest.mark.parametrize( + "input_s, expected", + [ + # NaN before strings with invalid date values + [ + Series(["19801222", np.nan, "20010012", "10019999"]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN after strings with invalid date values + [ + Series(["19801222", "20010012", "10019999", np.nan]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN before integers with invalid date values + [ + Series([20190813, np.nan, 20010012, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + # NaN after integers with invalid date values + [ + Series([20190813, 20010012, np.nan, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + ], + ) def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): # GH 25512 # format='%Y%m%d', errors='coerce' - result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce') + result = pd.to_datetime(input_s, format="%Y%m%d", errors="coerce") assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_integer(self, cache): # GH 10178 s = Series([2000, 2001, 2002]) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y', cache=cache) + result = to_datetime(s, format="%Y", cache=cache) assert_series_equal(result, expected) s = Series([200001, 200105, 200206]) - expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) - ]) + expected = Series([Timestamp(x[:4] + "-" + x[4:]) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m', cache=cache) + result = to_datetime(s, format="%Y%m", cache=cache) assert_series_equal(result, expected) - @pytest.mark.parametrize('int_date, expected', [ - # valid date, length == 8 - [20121030, datetime(2012, 10, 30)], - # short valid date, length == 6 - [199934, datetime(1999, 3, 4)], - # long integer date partially parsed to datetime(2012,1,1), length > 8 - [2012010101, 2012010101], - # invalid date partially parsed to datetime(2012,9,9), length == 8 - [20129930, 20129930], - # short integer date partially parsed to datetime(2012,9,9), length < 8 - [2012993, 2012993], - # short invalid date, length == 4 - [2121, 2121]]) - def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, - expected): + @pytest.mark.parametrize( + "int_date, expected", + [ + # valid date, length == 8 + [20121030, datetime(2012, 10, 30)], + # short valid date, length == 6 + [199934, datetime(1999, 3, 4)], + # long integer date partially parsed to datetime(2012,1,1), length > 8 + [2012010101, 2012010101], + # invalid date partially parsed to datetime(2012,9,9), length == 8 + [20129930, 20129930], + # short integer date partially parsed to datetime(2012,9,9), length < 8 + [2012993, 2012993], + # short invalid date, length == 4 + [2121, 2121], + ], + ) + def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): # GH 26583 - result = to_datetime(int_date, format='%Y%m%d', errors='ignore') + result = to_datetime(int_date, format="%Y%m%d", errors="ignore") assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_microsecond(self, cache): # these are locale dependent lang, _ = 
locale.getlocale() month_abbr = calendar.month_abbr[4] - val = '01-{}-2011 00:00:01.978'.format(month_abbr) + val = "01-{}-2011 00:00:01.978".format(month_abbr) - format = '%d-%b-%Y %H:%M:%S.%f' + format = "%d-%b-%Y %H:%M:%S.%f" result = to_datetime(val, format=format, cache=cache) exp = datetime.strptime(val, format) assert result == exp - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_time(self, cache): data = [ - ['01/10/2010 15:20', '%m/%d/%Y %H:%M', - Timestamp('2010-01-10 15:20')], - ['01/10/2010 05:43', '%m/%d/%Y %I:%M', - Timestamp('2010-01-10 05:43')], - ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S', - Timestamp('2010-01-10 13:56:01')] # , + ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")], + ["01/10/2010 05:43", "%m/%d/%Y %I:%M", Timestamp("2010-01-10 05:43")], + [ + "01/10/2010 13:56:01", + "%m/%d/%Y %H:%M:%S", + Timestamp("2010-01-10 13:56:01"), + ] # , # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', # Timestamp('2010-01-10 20:14')], # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', @@ -183,75 +204,97 @@ def test_to_datetime_format_time(self, cache): assert to_datetime(s, format=format, cache=cache) == dt @td.skip_if_has_locale - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_non_exact(self, cache): # GH 10834 # 8904 # exact kw - s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', - '19MAY11 00:00:00Z']) - result = to_datetime(s, format='%d%b%y', exact=False, cache=cache) - expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), - format='%d%b%y', cache=cache) + s = Series( + ["19MAY11", "foobar19MAY11", "19MAY11:00:00:00", "19MAY11 00:00:00Z"] + ) + result = to_datetime(s, format="%d%b%y", exact=False, cache=cache) + expected = to_datetime( + s.str.extract(r"(\d+\w+\d+)", expand=False), format="%d%b%y", cache=cache + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parse_nanoseconds_with_formula(self, cache): # GH8989 # truncating the nanoseconds when a format was provided - for v in ["2012-01-01 09:00:00.000000001", - "2012-01-01 09:00:00.000001", - "2012-01-01 09:00:00.001", - "2012-01-01 09:00:00.001000", - "2012-01-01 09:00:00.001000000", ]: + for v in [ + "2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", + ]: expected = pd.to_datetime(v, cache=cache) - result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", - cache=cache) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_weeks(self, cache): data = [ - ['2009324', '%Y%W%w', Timestamp('2009-08-13')], - ['2013020', '%Y%U%w', Timestamp('2013-01-13')] + ["2009324", "%Y%W%w", Timestamp("2009-08-13")], + ["2013020", "%Y%U%w", Timestamp("2013-01-13")], ] for s, format, dt in data: assert to_datetime(s, format=format, cache=cache) == dt - @pytest.mark.parametrize("fmt,dates,expected_dates", [ - ['%Y-%m-%d %H:%M:%S %Z', - ['2010-01-01 12:00:00 UTC'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2], - ['%Y-%m-%d %H:%M:%S %Z', - ['2010-01-01 12:00:00 UTC', - '2010-01-01 12:00:00 GMT', - '2010-01-01 12:00:00 US/Pacific'], - 
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC'), - pd.Timestamp('2010-01-01 12:00:00', tz='GMT'), - pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]], - ['%Y-%m-%d %H:%M:%S%z', - ['2010-01-01 12:00:00+0100'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60))] * 2], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 +0100'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60))] * 2], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'], - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60)), - pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(-60))]], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'], - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(0)), # pytz coerces to UTC - pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(0))]]]) - def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, - expected_dates): + @pytest.mark.parametrize( + "fmt,dates,expected_dates", + [ + [ + "%Y-%m-%d %H:%M:%S %Z", + ["2010-01-01 12:00:00 UTC"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tz="UTC")] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %Z", + [ + "2010-01-01 12:00:00 UTC", + "2010-01-01 12:00:00 GMT", + "2010-01-01 12:00:00 US/Pacific", + ], + [ + pd.Timestamp("2010-01-01 12:00:00", tz="UTC"), + pd.Timestamp("2010-01-01 12:00:00", tz="GMT"), + pd.Timestamp("2010-01-01 12:00:00", tz="US/Pacific"), + ], + ], + [ + "%Y-%m-%d %H:%M:%S%z", + ["2010-01-01 12:00:00+0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100"], + [ + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60)), + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(-60)), + ], + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], + [ + pd.Timestamp( + "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) + ), # pytz coerces to UTC + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + ], + ], + ], + ) + def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # GH 13486 result = pd.to_datetime(dates, format=fmt) expected = pd.Index(expected_dates) @@ -260,69 +303,120 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, with pytest.raises(ValueError): pd.to_datetime(dates, format=fmt, utc=True) - @pytest.mark.parametrize('offset', [ - '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', '']) + @pytest.mark.parametrize( + "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] + ) def test_to_datetime_parse_timezone_malformed(self, offset): - fmt = '%Y-%m-%d %H:%M:%S %z' - date = '2010-01-01 12:00:00 ' + offset + fmt = "%Y-%m-%d %H:%M:%S %z" + date = "2010-01-01 12:00:00 " + offset with pytest.raises(ValueError): pd.to_datetime([date], format=fmt) def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 - fmt = '%Y-%m-%d %H:%M:%S %z' - arg = pd.Index(['2010-01-01 12:00:00 Z'], name='foo') + fmt = "%Y-%m-%d %H:%M:%S %z" + arg = pd.Index(["2010-01-01 12:00:00 Z"], name="foo") result = pd.to_datetime(arg, format=fmt) - expected = pd.DatetimeIndex(['2010-01-01 12:00:00'], tz='UTC', - name='foo') + expected = pd.DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") tm.assert_index_equal(result, 
expected) class TestToDatetime: - @pytest.mark.parametrize("s, _format, dt", [ - ['2015-1-1', '%G-%V-%u', datetime(2014, 12, 29, 0, 0)], - ['2015-1-4', '%G-%V-%u', datetime(2015, 1, 1, 0, 0)], - ['2015-1-7', '%G-%V-%u', datetime(2015, 1, 4, 0, 0)] - ]) + @pytest.mark.parametrize( + "s, _format, dt", + [ + ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], + ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], + ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ], + ) def test_to_datetime_iso_week_year_format(self, s, _format, dt): # See GH#16607 assert to_datetime(s, format=_format) == dt - @pytest.mark.parametrize("msg, s, _format", [ - ["ISO week directive '%V' must be used with the ISO year directive " - "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 50", - "%Y %V"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 51", - "%G %V"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 " - "Monday", "%G %A"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 Mon", - "%G %a"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", - "%G %w"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", - "%G %u"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "2051", - "%G"], - ["Day of the year directive '%j' is not compatible with ISO year " - "directive '%G'. Use '%Y' instead.", "1999 51 6 256", "%G %V %u %j"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sunday", "%Y %V %A"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sun", "%Y %V %a"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %w"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. 
Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %u"], - ["ISO week directive '%V' must be used with the ISO year directive " - "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "20", "%V"] - ]) + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 50", + "%Y %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 51", + "%G %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 " "Monday", + "%G %A", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 Mon", + "%G %a", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G %w", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G %u", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "2051", + "%G", + ], + [ + "Day of the year directive '%j' is not compatible with ISO year " + "directive '%G'. Use '%Y' instead.", + "1999 51 6 256", + "%G %V %u %j", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sunday", + "%Y %V %A", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sun", + "%Y %V %a", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %w", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %u", + ], + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "20", + "%V", + ], + ], + ) def test_error_iso_week_year(self, msg, s, _format): # See GH#16607 # This test checks for errors thrown when giving the wrong format @@ -331,15 +425,17 @@ def test_error_iso_week_year(self, msg, s, _format): # locale specific, but the test data is in english. # Therefore, the tests only run when locale is not overwritten, # as a sort of solution to this problem. 
- if (locale.getlocale() != ('zh_CN', 'UTF-8') and - locale.getlocale() != ('it_IT', 'UTF-8')): + if locale.getlocale() != ("zh_CN", "UTF-8") and locale.getlocale() != ( + "it_IT", + "UTF-8", + ): with pytest.raises(ValueError, match=msg): to_datetime(s, format=_format) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_to_datetime_dtarr(self, tz): # DatetimeArray - dti = date_range('1965-04-03', periods=19, freq='2W', tz=tz) + dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) arr = DatetimeArray(dti) result = to_datetime(arr) @@ -353,21 +449,21 @@ def test_to_datetime_pydatetime(self): assert actual == datetime(2008, 1, 15) def test_to_datetime_YYYYMMDD(self): - actual = pd.to_datetime('20080115') + actual = pd.to_datetime("20080115") assert actual == datetime(2008, 1, 15) def test_to_datetime_unparseable_ignore(self): # unparseable - s = 'Month 1, 1999' - assert pd.to_datetime(s, errors='ignore') == s + s = "Month 1, 1999" + assert pd.to_datetime(s, errors="ignore") == s @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): # See GH#18666 - with tm.set_timezone('US/Eastern'): - npnow = np.datetime64('now').astype('datetime64[ns]') - pdnow = pd.to_datetime('now') - pdnow2 = pd.to_datetime(['now'])[0] + with tm.set_timezone("US/Eastern"): + npnow = np.datetime64("now").astype("datetime64[ns]") + pdnow = pd.to_datetime("now") + pdnow2 = pd.to_datetime(["now"])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -385,13 +481,12 @@ def test_to_datetime_today(self): # Unfortunately this test between 12 and 1 AM Samoa time # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
- with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC - nptoday = np.datetime64('today')\ - .astype('datetime64[ns]').astype(np.int64) - pdtoday = pd.to_datetime('today') - pdtoday2 = pd.to_datetime(['today'])[0] + with tm.set_timezone("Pacific/Auckland"): # 12-13 hours ahead of UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] - tstoday = pd.Timestamp('today') + tstoday = pd.Timestamp("today") tstoday2 = pd.Timestamp.today() # These should all be equal with infinite perf; this gives @@ -404,11 +499,10 @@ def test_to_datetime_today(self): assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None - with tm.set_timezone('US/Samoa'): # 11 hours behind UTC - nptoday = np.datetime64('today')\ - .astype('datetime64[ns]').astype(np.int64) - pdtoday = pd.to_datetime('today') - pdtoday2 = pd.to_datetime(['today'])[0] + with tm.set_timezone("US/Samoa"): # 11 hours behind UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -419,175 +513,190 @@ def test_to_datetime_today(self): assert pdtoday2.tzinfo is None def test_to_datetime_today_now_unicode_bytes(self): - to_datetime(['now']) - to_datetime(['today']) + to_datetime(["now"]) + to_datetime(["today"]) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s(self, cache): - in_bound_dts = [ - np.datetime64('2000-01-01'), - np.datetime64('2000-01-02'), - ] + in_bound_dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] for dt in in_bound_dts: assert pd.to_datetime(dt, cache=cache) == Timestamp(dt) - @pytest.mark.parametrize('dt', [np.datetime64('1000-01-01'), - np.datetime64('5000-01-02')]) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] + ) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): msg = "Out of bounds nanosecond timestamp: {}".format(dt) with pytest.raises(OutOfBoundsDatetime, match=msg): - pd.to_datetime(dt, errors='raise') + pd.to_datetime(dt, errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt) - assert pd.to_datetime(dt, errors='coerce', cache=cache) is NaT + assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_array_of_dt64s(self, cache): - dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] + dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing tm.assert_index_equal( pd.to_datetime(dts, cache=cache), - pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]) + pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]), ) # A list of datetimes where the last one is out of bounds - dts_with_oob = dts + [np.datetime64('9999-01-01')] + dts_with_oob = dts + [np.datetime64("9999-01-01")] msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00" with pytest.raises(OutOfBoundsDatetime, match=msg): - pd.to_datetime(dts_with_oob, errors='raise') + pd.to_datetime(dts_with_oob, errors="raise") 
tm.assert_index_equal( - pd.to_datetime(dts_with_oob, errors='coerce', - cache=cache), + pd.to_datetime(dts_with_oob, errors="coerce", cache=cache), pd.DatetimeIndex( [ Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8, - pd.NaT + pd.NaT, ] - ) + ), ) # With errors='ignore', out of bounds datetime64s # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date tm.assert_index_equal( - pd.to_datetime(dts_with_oob, errors='ignore', - cache=cache), - pd.Index( - [dt.item() for dt in dts_with_oob] - ) + pd.to_datetime(dts_with_oob, errors="ignore", cache=cache), + pd.Index([dt.item() for dt in dts_with_oob]), ) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex - arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] + arr = [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] result = pd.to_datetime(arr, cache=cache) expected = DatetimeIndex( - ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific') + ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" + ) tm.assert_index_equal(result, expected) # mixed tzs will raise - arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] - msg = ("Tz-aware datetime.datetime cannot be converted to datetime64" - " unless utc=True") + arr = [ + pd.Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), + ] + msg = ( + "Tz-aware datetime.datetime cannot be converted to datetime64" + " unless utc=True" + ) with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_different_offsets(self, cache): # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark # see GH-26097 for more - ts_string_1 = 'March 1, 2018 12:00:00+0400' - ts_string_2 = 'March 1, 2018 12:00:00+0500' + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 expected = pd.Index([parse(x) for x in arr]) result = pd.to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz_pytz(self, cache): # see gh-8260 - us_eastern = pytz.timezone('US/Eastern') - arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, - hour=3, minute=0)), - us_eastern.localize(datetime(year=2000, month=6, day=1, - hour=3, minute=0))], - dtype=object) + us_eastern = pytz.timezone("US/Eastern") + arr = np.array( + [ + us_eastern.localize( + datetime(year=2000, month=1, day=1, hour=3, minute=0) + ), + us_eastern.localize( + datetime(year=2000, month=6, day=1, hour=3, minute=0) + ), + ], + dtype=object, + ) result = pd.to_datetime(arr, utc=True, cache=cache) - expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', - '2000-06-01 07:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) tm.assert_index_equal(result, expected) - 
@pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize("init_constructor, end_constructor, test_method", - [(Index, DatetimeIndex, tm.assert_index_equal), - (list, DatetimeIndex, tm.assert_index_equal), - (np.array, DatetimeIndex, tm.assert_index_equal), - (Series, Series, tm.assert_series_equal)]) - def test_to_datetime_utc_true(self, - cache, - init_constructor, - end_constructor, - test_method): + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "init_constructor, end_constructor, test_method", + [ + (Index, DatetimeIndex, tm.assert_index_equal), + (list, DatetimeIndex, tm.assert_index_equal), + (np.array, DatetimeIndex, tm.assert_index_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_to_datetime_utc_true( + self, cache, init_constructor, end_constructor, test_method + ): # See gh-11934 & gh-6415 - data = ['20100102 121314', '20100102 121315'] - expected_data = [pd.Timestamp('2010-01-02 12:13:14', tz='utc'), - pd.Timestamp('2010-01-02 12:13:15', tz='utc')] - - result = pd.to_datetime(init_constructor(data), - format='%Y%m%d %H%M%S', - utc=True, - cache=cache) + data = ["20100102 121314", "20100102 121315"] + expected_data = [ + pd.Timestamp("2010-01-02 12:13:14", tz="utc"), + pd.Timestamp("2010-01-02 12:13:15", tz="utc"), + ] + + result = pd.to_datetime( + init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache + ) expected = end_constructor(expected_data) test_method(result, expected) # Test scalar case as well for scalar, expected in zip(data, expected_data): - result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True, - cache=cache) + result = pd.to_datetime( + scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache + ) assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(ts, tz='utc')]) + expected = pd.Series([pd.Timestamp(ts, tz="utc")]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): - ts = '2013-01-01 00:00:00-01:00' - expected_ts = '2013-01-01 01:00:00' + ts = "2013-01-01 00:00:00-01:00" + expected_ts = "2013-01-01 01:00:00" data = pd.Series([ts] * 3) result = pd.to_datetime(data, utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3) + expected = pd.Series([pd.Timestamp(expected_ts, tz="utc")] * 3) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize('date, dtype', - [('2013-01-01 01:00:00', 'datetime64[ns]'), - ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')]) - def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, - dtype): - expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')]) - result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, - cache=cache) + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "date, dtype", + [ + ("2013-01-01 01:00:00", "datetime64[ns]"), + ("2013-01-01 01:00:00", "datetime64[ns, UTC]"), + ], + ) + def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): + expected = pd.Series([pd.Timestamp("2013-01-01 01:00:00", 
tz="UTC")]) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, cache=cache) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 @@ -599,36 +708,43 @@ def test_to_datetime_tz_psycopg2(self, cache): # misc cases tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None) tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None) - arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1), - datetime(2000, 6, 1, 3, 0, tzinfo=tz2)], - dtype=object) - - result = pd.to_datetime(arr, errors='coerce', utc=True, cache=cache) - expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', - '2000-06-01 07:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + arr = np.array( + [ + datetime(2000, 1, 1, 3, 0, tzinfo=tz1), + datetime(2000, 6, 1, 3, 0, tzinfo=tz2), + ], + dtype=object, + ) + + result = pd.to_datetime(arr, errors="coerce", utc=True, cache=cache) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) tm.assert_index_equal(result, expected) # dtype coercion - i = pd.DatetimeIndex([ - '2000-01-01 08:00:00' - ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) + i = pd.DatetimeIndex( + ["2000-01-01 08:00:00"], + tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None), + ) assert is_datetime64_ns_dtype(i) # tz coercion - result = pd.to_datetime(i, errors='coerce', cache=cache) + result = pd.to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) - result = pd.to_datetime(i, errors='coerce', utc=True, cache=cache) - expected = pd.DatetimeIndex(['2000-01-01 13:00:00'], - dtype='datetime64[ns, UTC]') + result = pd.to_datetime(i, errors="coerce", utc=True, cache=cache) + expected = pd.DatetimeIndex( + ["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]" + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - 'cache', - [pytest.param(True, - marks=pytest.mark.skipif(True, reason="GH 18111")), - False]) + "cache", + [pytest.param(True, marks=pytest.mark.skipif(True, reason="GH 18111")), False], + ) def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): @@ -642,13 +758,13 @@ def test_datetime_bool(self, cache): with pytest.raises(TypeError): to_datetime([False, datetime.today()], cache=cache) with pytest.raises(TypeError): - to_datetime(['20130101', True], cache=cache) - tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], - errors="coerce", cache=cache), - DatetimeIndex([to_datetime(0, cache=cache), - NaT, - NaT, - to_datetime(0, cache=cache)])) + to_datetime(["20130101", True], cache=cache) + tm.assert_index_equal( + to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), + DatetimeIndex( + [to_datetime(0, cache=cache), NaT, NaT, to_datetime(0, cache=cache)] + ), + ) def test_datetime_invalid_datatype(self): # GH13176 @@ -658,69 +774,78 @@ def test_datetime_invalid_datatype(self): with pytest.raises(TypeError): pd.to_datetime(pd.to_datetime) - @pytest.mark.parametrize('value', ["a", "00:01:99"]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("value", ["a", "00:01:99"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_invalid_scalar(self, value, format, infer): # GH24763 - res = pd.to_datetime(value, 
errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) assert res == value - res = pd.to_datetime(value, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) assert res is pd.NaT with pytest.raises(ValueError): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) - @pytest.mark.parametrize('value', ["3000/12/11 00:00:00"]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_outofbounds_scalar(self, value, format, infer): # GH24763 - res = pd.to_datetime(value, errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) assert res == value - res = pd.to_datetime(value, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) assert res is pd.NaT if format is not None: with pytest.raises(ValueError): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) else: with pytest.raises(OutOfBoundsDatetime): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) - @pytest.mark.parametrize('values', [["a"], ["00:01:99"], - ["a", "b", "99:00:00"]]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_invalid_index(self, values, format, infer): # GH24763 - res = pd.to_datetime(values, errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + values, errors="ignore", format=format, infer_datetime_format=infer + ) tm.assert_index_equal(res, pd.Index(values)) - res = pd.to_datetime(values, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + values, errors="coerce", format=format, infer_datetime_format=infer + ) tm.assert_index_equal(res, pd.DatetimeIndex([pd.NaT] * len(values))) with pytest.raises(ValueError): - pd.to_datetime(values, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + values, errors="raise", format=format, infer_datetime_format=infer + ) @pytest.mark.parametrize("utc", [True, None]) - @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) def test_to_datetime_cache(self, utc, format, constructor): - date = '20130101 00:00:00' - test_dates = [date] * 10**5 + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 data = constructor(test_dates) result = pd.to_datetime(data, utc=utc, 
format=format, cache=True) @@ -729,28 +854,32 @@ def test_to_datetime_cache(self, utc, format, constructor): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("utc", [True, None]) - @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) def test_to_datetime_cache_series(self, utc, format): - date = '20130101 00:00:00' - test_dates = [date] * 10**5 + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 data = pd.Series(test_dates) result = pd.to_datetime(data, utc=utc, format=format, cache=True) expected = pd.to_datetime(data, utc=utc, format=format, cache=False) tm.assert_series_equal(result, expected) def test_to_datetime_cache_scalar(self): - date = '20130101 00:00:00' + date = "20130101 00:00:00" result = pd.to_datetime(date, cache=True) - expected = pd.Timestamp('20130101 00:00:00') + expected = pd.Timestamp("20130101 00:00:00") assert result == expected - @pytest.mark.parametrize('date, format', - [('2017-20', '%Y-%W'), - ('20 Sunday', '%W %A'), - ('20 Sun', '%W %a'), - ('2017-21', '%Y-%U'), - ('20 Sunday', '%U %A'), - ('20 Sun', '%U %a')]) + @pytest.mark.parametrize( + "date, format", + [ + ("2017-20", "%Y-%W"), + ("20 Sunday", "%W %A"), + ("20 Sun", "%W %a"), + ("2017-21", "%Y-%U"), + ("20 Sunday", "%U %A"), + ("20 Sun", "%U %a"), + ], + ) def test_week_without_day_and_calendar_year(self, date, format): # GH16774 @@ -760,15 +889,19 @@ def test_week_without_day_and_calendar_year(self, date, format): def test_to_datetime_coerce(self): # GH 26122 - ts_strings = ['March 1, 2018 12:00:00+0400', - 'March 1, 2018 12:00:00+0500', - '20100240'] - result = to_datetime(ts_strings, errors='coerce') - expected = Index([datetime(2018, 3, 1, 12, 0, - tzinfo=tzoffset(None, 14400)), - datetime(2018, 3, 1, 12, 0, - tzinfo=tzoffset(None, 18000)), - NaT]) + ts_strings = [ + "March 1, 2018 12:00:00+0400", + "March 1, 2018 12:00:00+0500", + "20100240", + ] + result = to_datetime(ts_strings, errors="coerce") + expected = Index( + [ + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 14400)), + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 18000)), + NaT, + ] + ) tm.assert_index_equal(result, expected) def test_iso_8601_strings_with_same_offset(self): @@ -787,93 +920,109 @@ def test_iso_8601_strings_with_same_offset(self): def test_iso_8601_strings_same_offset_no_box(self): # GH 22446 - data = ['2018-01-04 09:01:00+09:00', '2018-01-04 09:02:00+09:00'] + data = ["2018-01-04 09:01:00+09:00", "2018-01-04 09:02:00+09:00"] with tm.assert_produces_warning(FutureWarning): result = pd.to_datetime(data, box=False) - expected = np.array([ - datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), - datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)) - ], - dtype=object) + expected = np.array( + [ + datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), + datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) def test_iso_8601_strings_with_different_offsets(self): # GH 17697, 11736 - ts_strings = ["2015-11-18 15:30:00+05:30", - "2015-11-18 16:30:00+06:30", - NaT] + ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] result = to_datetime(ts_strings) - expected = np.array([datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 16, 30, - tzinfo=tzoffset(None, 23400)), - NaT], - dtype=object) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + 
datetime(2015, 11, 18, 16, 30, tzinfo=tzoffset(None, 23400)), + NaT, + ], + dtype=object, + ) # GH 21864 expected = Index(expected) tm.assert_index_equal(result, expected) result = to_datetime(ts_strings, utc=True) - expected = DatetimeIndex([Timestamp(2015, 11, 18, 10), - Timestamp(2015, 11, 18, 10), - NaT], tz='UTC') + expected = DatetimeIndex( + [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" + ) tm.assert_index_equal(result, expected) def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 - result = pd.to_datetime([ - '2018-11-28T00:00:00', - '2018-11-28T00:00:00+12:00', - '2018-11-28T00:00:00', - '2018-11-28T00:00:00+06:00', - '2018-11-28T00:00:00' - ], utc=True) - expected = pd.to_datetime([ - '2018-11-28T00:00:00', - '2018-11-27T12:00:00', - '2018-11-28T00:00:00', - '2018-11-27T18:00:00', - '2018-11-28T00:00:00' - ], utc=True) + result = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+12:00", + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+06:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) + expected = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-27T12:00:00", + "2018-11-28T00:00:00", + "2018-11-27T18:00:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) tm.assert_index_equal(result, expected) - items = ['2018-11-28T00:00:00+12:00', '2018-11-28T00:00:00'] + items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] result = pd.to_datetime(items, utc=True) expected = pd.to_datetime(list(reversed(items)), utc=True)[::-1] tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 - s = pd.Series([ - 'nan', - pd.Timestamp("1990-01-01"), - "2015-03-14T16:15:14.123-08:00", - "2019-03-04T21:56:32.620-07:00", - None, - ]) + s = pd.Series( + [ + "nan", + pd.Timestamp("1990-01-01"), + "2015-03-14T16:15:14.123-08:00", + "2019-03-04T21:56:32.620-07:00", + None, + ] + ) with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): pd.to_datetime(s) def test_non_iso_strings_with_tz_offset(self): - result = to_datetime(['March 1, 2018 12:00:00+0400'] * 2) - expected = DatetimeIndex([datetime(2018, 3, 1, 12, - tzinfo=pytz.FixedOffset(240))] * 2) + result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) + expected = DatetimeIndex( + [datetime(2018, 3, 1, 12, tzinfo=pytz.FixedOffset(240))] * 2 + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('ts, expected', [ - (Timestamp('2018-01-01'), - Timestamp('2018-01-01', tz='UTC')), - (Timestamp('2018-01-01', tz='US/Pacific'), - Timestamp('2018-01-01 08:00', tz='UTC'))]) + @pytest.mark.parametrize( + "ts, expected", + [ + (Timestamp("2018-01-01"), Timestamp("2018-01-01", tz="UTC")), + ( + Timestamp("2018-01-01", tz="US/Pacific"), + Timestamp("2018-01-01 08:00", tz="UTC"), + ), + ], + ) def test_timestamp_utc_true(self, ts, expected): # GH 24415 result = to_datetime(ts, utc=True) assert result == expected def test_to_datetime_box_deprecated(self): - expected = np.datetime64('2018-09-09') + expected = np.datetime64("2018-09-09") # Deprecated - see GH24416 with tm.assert_produces_warning(FutureWarning): @@ -884,85 +1033,89 @@ def test_to_datetime_box_deprecated(self): class TestToDatetimeUnit: - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit(self, cache): # GH 11758 # test proper behavior with erros with pytest.raises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d', cache=cache) - - values = [11111111, 1, 1.0, iNaT, NaT, np.nan, - 
'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore', cache=cache) - expected = Index([11111111, Timestamp('1970-01-02'), - Timestamp('1970-01-02'), NaT, - NaT, NaT, NaT, NaT], - dtype=object) + to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + + values = [11111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] + result = to_datetime(values, unit="D", errors="ignore", cache=cache) + expected = Index( + [ + 11111111, + Timestamp("1970-01-02"), + Timestamp("1970-01-02"), + NaT, + NaT, + NaT, + NaT, + NaT, + ], + dtype=object, + ) tm.assert_index_equal(result, expected) - result = to_datetime(values, unit='D', errors='coerce', cache=cache) - expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', - 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + result = to_datetime(values, unit="D", errors="coerce", cache=cache) + expected = DatetimeIndex( + ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"] + ) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise', cache=cache) + to_datetime(values, unit="D", errors="raise", cache=cache) - values = [1420043460000, iNaT, NaT, np.nan, 'NaT'] + values = [1420043460000, iNaT, NaT, np.nan, "NaT"] - result = to_datetime(values, errors='ignore', unit='s', cache=cache) - expected = Index([1420043460000, NaT, NaT, - NaT, NaT], dtype=object) + result = to_datetime(values, errors="ignore", unit="s", cache=cache) + expected = Index([1420043460000, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, errors='coerce', unit='s', cache=cache) - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + result = to_datetime(values, errors="coerce", unit="s", cache=cache) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s', cache=cache) + to_datetime(values, errors="raise", unit="s", cache=cache) # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: + for val in ["foo", Timestamp("20130101")]: try: - to_datetime(val, errors='raise', unit='s', cache=cache) + to_datetime(val, errors="raise", unit="s", cache=cache) except tslib.OutOfBoundsDatetime: raise AssertionError("incorrect exception raised") except ValueError: pass - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_consistency(self, cache): # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise', - cache=cache) + expected = Timestamp("1970-05-09 14:25:11") + result = pd.to_datetime(11111111, unit="s", errors="raise", cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='coerce', - cache=cache) + result = pd.to_datetime(11111111, unit="s", errors="coerce", cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='ignore', - cache=cache) + result = pd.to_datetime(11111111, unit="s", errors="ignore", cache=cache) assert result == expected assert isinstance(result, Timestamp) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_with_numeric(self, cache): # GH 13180 # coercions from floats/ints 
are ok - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr1 = [1.434692e+18, 1.432766e+18] - arr2 = np.array(arr1).astype('int64') - for errors in ['ignore', 'raise', 'coerce']: + expected = DatetimeIndex(["2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr1 = [1.434692e18, 1.432766e18] + arr2 = np.array(arr1).astype("int64") + for errors in ["ignore", "raise", "coerce"]: result = pd.to_datetime(arr1, errors=errors, cache=cache) tm.assert_index_equal(result, expected) @@ -971,127 +1124,137 @@ def test_unit_with_numeric(self, cache): # but we want to make sure that we are coercing # if we have ints/strings - expected = DatetimeIndex(['NaT', - '2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr = ["foo", 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20', - 'NaT', - 'NaT']) - arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex( + ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"] + ) + arr = [1.434692e18, 1.432766e18, "foo", "NaT"] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_mixed(self, cache): # mixed integers/datetimes - expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) - arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["2013-01-01", "NaT", "NaT"]) + arr = [pd.Timestamp("20130101"), 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise', cache=cache) + pd.to_datetime(arr, errors="raise", cache=cache) - expected = DatetimeIndex(['NaT', - 'NaT', - '2013-01-01']) - arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["NaT", "NaT", "2013-01-01"]) + arr = [1.434692e18, 1.432766e18, pd.Timestamp("20130101")] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise', cache=cache) + pd.to_datetime(arr, errors="raise", cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_rounding(self, cache): # GH 14156: argument will incur floating point errors but no # premature rounding - result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache) - expected = pd.Timestamp('2015-06-19 19:55:31.877000093') + result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache) + expected = pd.Timestamp("2015-06-19 19:55:31.877000093") assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_ignore_keeps_name(self, cache): # GH 21697 - expected = pd.Index([15e9] * 2, name='name') - result = pd.to_datetime(expected, errors='ignore', unit='s', - cache=cache) + expected = 
pd.Index([15e9] * 2, name="name") + result = pd.to_datetime(expected, errors="ignore", unit="s", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dataframe(self, cache): - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [6, 7], - 'minute': [58, 59], - 'second': [10, 11], - 'ms': [1, 1], - 'us': [2, 2], - 'ns': [3, 3]}) - - result = to_datetime({'year': df['year'], - 'month': df['month'], - 'day': df['day']}, cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:0:00')]) + df = DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "hour": [6, 7], + "minute": [58, 59], + "second": [10, 11], + "ms": [1, 1], + "us": [2, 2], + "ns": [3, 3], + } + ) + + result = to_datetime( + {"year": df["year"], "month": df["month"], "day": df["day"]}, cache=cache + ) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:0:00")] + ) assert_series_equal(result, expected) # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict(), - cache=cache) + result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) assert_series_equal(result, expected) # dict but with constructable - df2 = df[['year', 'month', 'day']].to_dict() - df2['month'] = 2 + df2 = df[["year", "month", "day"]].to_dict() + df2["month"] = 2 result = to_datetime(df2, cache=cache) - expected2 = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160205 00:0:00')]) + expected2 = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + ) assert_series_equal(result, expected2) # unit mappings - units = [{'year': 'years', - 'month': 'months', - 'day': 'days', - 'hour': 'hours', - 'minute': 'minutes', - 'second': 'seconds'}, - {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second'}, - ] + units = [ + { + "year": "years", + "month": "months", + "day": "days", + "hour": "hours", + "minute": "minutes", + "second": "seconds", + }, + { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + }, + ] for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d), - cache=cache) - expected = Series([Timestamp('20150204 06:58:10'), - Timestamp('20160305 07:59:11')]) + result = to_datetime(df[list(d.keys())].rename(columns=d), cache=cache) + expected = Series( + [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] + ) assert_series_equal(result, expected) - d = {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second', - 'ms': 'ms', - 'us': 'us', - 'ns': 'ns'} + d = { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + "ms": "ms", + "us": "us", + "ns": "ns", + } result = to_datetime(df.rename(columns=d), cache=cache) - expected = Series([Timestamp('20150204 06:58:10.001002003'), - Timestamp('20160305 07:59:11.001002003')]) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) assert_series_equal(result, expected) # coerce back to int @@ -1099,110 +1262,103 @@ def test_dataframe(self, cache): assert_series_equal(result, expected) # passing coerce - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) + df2 = DataFrame({"year": [2015, 
2016], "month": [2, 20], "day": [4, 5]}) - msg = ("cannot assemble the datetimes: time data .+ does not " - r"match format '%Y%m%d' \(match\)") + msg = ( + "cannot assemble the datetimes: time data .+ does not " + r"match format '%Y%m%d' \(match\)" + ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - result = to_datetime(df2, errors='coerce', cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - NaT]) + result = to_datetime(df2, errors="coerce", cache=cache) + expected = Series([Timestamp("20150204 00:00:00"), NaT]) assert_series_equal(result, expected) # extra columns - msg = ("extra keys have been passed to the datetime assemblage: " - r"\[foo\]") + msg = "extra keys have been passed to the datetime assemblage: " r"\[foo\]" with pytest.raises(ValueError, match=msg): df2 = df.copy() - df2['foo'] = 1 + df2["foo"] = 1 to_datetime(df2, cache=cache) # not enough - msg = (r'to assemble mappings requires at least that \[year, month, ' - r'day\] be specified: \[.+\] is missing') - for c in [['year'], - ['year', 'month'], - ['year', 'month', 'second'], - ['month', 'day'], - ['year', 'day', 'second']]: + msg = ( + r"to assemble mappings requires at least that \[year, month, " + r"day\] be specified: \[.+\] is missing" + ) + for c in [ + ["year"], + ["year", "month"], + ["year", "month", "second"], + ["month", "day"], + ["year", "day", "second"], + ]: with pytest.raises(ValueError, match=msg): to_datetime(df[c], cache=cache) # duplicates - msg = 'cannot assemble with duplicate keys' - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - df2.columns = ['year', 'year', 'day'] + msg = "cannot assemble with duplicate keys" + df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) + df2.columns = ["year", "year", "day"] with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5], - 'hour': [4, 5]}) - df2.columns = ['year', 'month', 'day', 'day'] + df2 = DataFrame( + {"year": [2015, 2016], "month": [2, 20], "day": [4, 5], "hour": [4, 5]} + ) + df2.columns = ["year", "month", "day", "day"] with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dataframe_dtypes(self, cache): # #13451 - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) # int16 - result = to_datetime(df.astype('int16'), cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) + result = to_datetime(df.astype("int16"), cache=cache) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) assert_series_equal(result, expected) # mixed dtypes - df['month'] = df['month'].astype('int8') - df['day'] = df['day'].astype('int8') + df["month"] = df["month"].astype("int8") + df["day"] = df["day"].astype("int8") result = to_datetime(df, cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) assert_series_equal(result, expected) # float - df = DataFrame({'year': [2000, 2001], - 'month': [1.5, 1], - 'day': [1, 1]}) + df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) with pytest.raises(ValueError): to_datetime(df, cache=cache) 
def test_dataframe_box_false(self): # GH 23760 - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) with tm.assert_produces_warning(FutureWarning): result = pd.to_datetime(df, box=False) - expected = np.array(['2015-02-04', '2016-03-05'], - dtype='datetime64[ns]') + expected = np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) def test_dataframe_utc_true(self): # GH 23760 - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = pd.to_datetime(df, utc=True) - expected = pd.Series(np.array(['2015-02-04', '2016-03-05'], - dtype='datetime64[ns]')).dt.tz_localize('UTC') + expected = pd.Series( + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) def test_to_datetime_errors_ignore_utc_true(self): # GH 23758 - result = pd.to_datetime([1], unit='s', utc=True, errors='ignore') - expected = DatetimeIndex(['1970-01-01 00:00:01'], tz='UTC') + result = pd.to_datetime([1], unit="s", utc=True, errors="ignore") + expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") tm.assert_index_equal(result, expected) @@ -1211,24 +1367,24 @@ def test_to_datetime_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime - arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) with pytest.raises(OutOfBoundsDatetime): to_datetime(arr) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) exp = Timestamp("2012-01-01 00:00:00") assert result[0] == exp - result = to_datetime(['20121001'], cache=cache) # bad iso 8601 - exp = Timestamp('2012-10-01') + result = to_datetime(["20121001"], cache=cache) # bad iso 8601 + exp = Timestamp("2012-10-01") assert result[0] == exp - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_default(self, cache): - rs = to_datetime('2001', cache=cache) + rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) assert rs == xp @@ -1238,63 +1394,59 @@ def test_to_datetime_default(self, cache): # pytest.raises(ValueError, to_datetime('01-13-2012', # dayfirst=True)) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_on_datetime64_series(self, cache): # #2699 - s = Series(date_range('1/1/2000', periods=10)) + s = Series(date_range("1/1/2000", periods=10)) result = to_datetime(s, cache=cache) assert result[0] == s[0] - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_space_in_series(self, cache): # GH 6428 - s = Series(['10/18/2006', '10/18/2008', ' ']) + s = Series(["10/18/2006", "10/18/2008", " "]) msg = r"(\(')?String does not contain a date(:', ' '\))?" 
with pytest.raises(ValueError, match=msg): - to_datetime(s, errors='raise', cache=cache) - result_coerce = to_datetime(s, errors='coerce', cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), - datetime(2008, 10, 18), - NaT]) + to_datetime(s, errors="raise", cache=cache) + result_coerce = to_datetime(s, errors="coerce", cache=cache) + expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(s, errors='ignore', cache=cache) + result_ignore = to_datetime(s, errors="ignore", cache=cache) tm.assert_series_equal(result_ignore, s) @td.skip_if_has_locale - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales # GH 5195 # with a format and coerce a single item to_datetime fails - td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) - expected = pd.to_datetime(td, format='%b %y', cache=cache) - result = td.apply(pd.to_datetime, format='%b %y', cache=cache) + td = Series(["May 04", "Jun 02", "Dec 11"], index=[1, 2, 3]) + expected = pd.to_datetime(td, format="%b %y", cache=cache) + result = td.apply(pd.to_datetime, format="%b %y", cache=cache) assert_series_equal(result, expected) - td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) + td = pd.Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) msg = r"time data '' does not match format '%b %y' \(match\)" with pytest.raises(ValueError, match=msg): - pd.to_datetime(td, format='%b %y', errors='raise', cache=cache) + pd.to_datetime(td, format="%b %y", errors="raise", cache=cache) with pytest.raises(ValueError, match=msg): - td.apply(pd.to_datetime, format='%b %y', - errors='raise', cache=cache) - expected = pd.to_datetime(td, format='%b %y', errors='coerce', - cache=cache) + td.apply(pd.to_datetime, format="%b %y", errors="raise", cache=cache) + expected = pd.to_datetime(td, format="%b %y", errors="coerce", cache=cache) result = td.apply( - lambda x: pd.to_datetime(x, format='%b %y', errors='coerce', - cache=cache)) + lambda x: pd.to_datetime(x, format="%b %y", errors="coerce", cache=cache) + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_types(self, cache): # empty string - result = to_datetime('', cache=cache) + result = to_datetime("", cache=cache) assert result is NaT - result = to_datetime(['', ''], cache=cache) + result = to_datetime(["", ""], cache=cache) assert isna(result).all() # ints @@ -1303,12 +1455,12 @@ def test_to_datetime_types(self, cache): assert result == expected # GH 3888 (strings) - expected = to_datetime(['2012'], cache=cache)[0] - result = to_datetime('2012', cache=cache) + expected = to_datetime(["2012"], cache=cache)[0] + result = to_datetime("2012", cache=cache) assert result == expected # array = ['2012','20120101','20120101 12:01:01'] - array = ['20120101', '20120101 12:01:01'] + array = ["20120101", "20120101 12:01:01"] expected = list(to_datetime(array, cache=cache)) result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) @@ -1318,31 +1470,31 @@ def test_to_datetime_types(self, cache): # expected = to_datetime('2012') # assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_unprocessable_input(self, cache): # 
GH 4928 # GH 21864 - result = to_datetime([1, '1'], errors='ignore', cache=cache) + result = to_datetime([1, "1"], errors="ignore", cache=cache) - expected = Index(np.array([1, '1'], dtype='O')) + expected = Index(np.array([1, "1"], dtype="O")) tm.assert_equal(result, expected) msg = "invalid string coercion to datetime" with pytest.raises(TypeError, match=msg): - to_datetime([1, '1'], errors='raise', cache=cache) + to_datetime([1, "1"], errors="raise", cache=cache) def test_to_datetime_other_datetime64_units(self): # 5/25/2012 - scalar = np.int64(1337904000000000).view('M8[us]') - as_obj = scalar.astype('O') + scalar = np.int64(1337904000000000).view("M8[us]") + as_obj = scalar.astype("O") index = DatetimeIndex([scalar]) - assert index[0] == scalar.astype('O') + assert index[0] == scalar.astype("O") value = Timestamp(scalar) assert value == as_obj def test_to_datetime_list_of_integers(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) rng = DatetimeIndex(rng.values) ints = list(rng.asi8) @@ -1356,16 +1508,17 @@ def test_to_datetime_overflow(self): # we are overflowing Timedelta range here with pytest.raises(OverflowError): - date_range(start='1/1/1700', freq='B', periods=100000) + date_range(start="1/1/1700", freq="B", periods=100000) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_string_na_nat_conversion(self, cache): # GH #999, #858 - strings = np.array(['1/1/2000', '1/2/2000', np.nan, - '1/4/2000, 12:34:56'], dtype=object) + strings = np.array( + ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object + ) - expected = np.empty(4, dtype='M8[ns]') + expected = np.empty(4, dtype="M8[ns]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -1379,33 +1532,41 @@ def test_string_na_nat_conversion(self, cache): assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) - malformed = np.array(['1/100/2000', np.nan], dtype=object) + malformed = np.array(["1/100/2000", np.nan], dtype=object) # GH 10636, default is now 'raise' - msg = (r"Unknown string format:|day is out of range for month") + msg = r"Unknown string format:|day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors='raise', cache=cache) + to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors='ignore', cache=cache) + result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 expected = Index(malformed) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors='raise', cache=cache) + to_datetime(malformed, errors="raise", cache=cache) - idx = ['a', 'b', 'c', 'd', 'e'] - series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, - '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000', cache=cache), np.nan, - to_datetime('1/3/2000', cache=cache), np.nan, - to_datetime('1/5/2000', cache=cache)], - index=idx, name='foo') + idx = ["a", "b", "c", "d", "e"] + series = Series( + ["1/1/2000", np.nan, "1/3/2000", np.nan, "1/5/2000"], index=idx, name="foo" + ) + dseries = Series( + [ + to_datetime("1/1/2000", cache=cache), + np.nan, + to_datetime("1/3/2000", cache=cache), + np.nan, + to_datetime("1/5/2000", cache=cache), + ], + index=idx, + name="foo", + ) result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype='M8[ns]'), 
index=idx) + expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) for i in range(5): x = series[i] if isna(x): @@ -1414,32 +1575,41 @@ def test_string_na_nat_conversion(self, cache): expected[i] = to_datetime(x, cache=cache) assert_series_equal(result, expected, check_names=False) - assert result.name == 'foo' + assert result.name == "foo" assert_series_equal(dresult, expected, check_names=False) - assert dresult.name == 'foo' + assert dresult.name == "foo" - @pytest.mark.parametrize('dtype', [ - 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', - 'datetime64[us]', 'datetime64[ns]']) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + @pytest.mark.parametrize("cache", [True, False]) def test_dti_constructor_numpy_timeunits(self, cache, dtype): # GH 9114 - base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'], - cache=cache) + base = pd.to_datetime( + ["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache + ) values = base.values.astype(dtype) tm.assert_index_equal(DatetimeIndex(values), base) tm.assert_index_equal(to_datetime(values, cache=cache), base) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dayfirst(self, cache): # GH 5917 - arr = ['10/02/2014', '11/02/2014', '12/02/2014'] - expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), - datetime(2014, 2, 12)]) + arr = ["10/02/2014", "11/02/2014", "12/02/2014"] + expected = DatetimeIndex( + [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] + ) idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -1455,218 +1625,237 @@ def test_dayfirst(self, cache): class TestGuessDatetimeFormat: - @td.skip_if_not_us_locale def test_guess_datetime_format_for_array(self): - expected_format = '%Y-%m-%d %H:%M:%S.%f' + expected_format = "%Y-%m-%d %H:%M:%S.%f" dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) test_arrays = [ - np.array([dt_string, dt_string, dt_string], dtype='O'), - np.array([np.nan, np.nan, dt_string], dtype='O'), - np.array([dt_string, 'random_string'], dtype='O'), + np.array([dt_string, dt_string, dt_string], dtype="O"), + np.array([np.nan, np.nan, dt_string], dtype="O"), + np.array([dt_string, "random_string"], dtype="O"), ] for test_array in test_arrays: - assert tools._guess_datetime_format_for_array( - test_array) == expected_format + assert tools._guess_datetime_format_for_array(test_array) == expected_format format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array( - [np.nan, np.nan, np.nan], dtype='O')) + np.array([np.nan, np.nan, np.nan], dtype="O") + ) assert format_for_string_of_nans is None class TestToDatetimeInferFormat: - - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_consistent_format(self, cache): - s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) + s = pd.Series(pd.date_range("20000101", periods=50, freq="H")) - test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', - '%Y-%m-%dT%H:%M:%S.%f'] + test_formats = ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] for test_format in test_formats: s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) - 
with_format = pd.to_datetime(s_as_dt_strings, format=test_format, - cache=cache) - no_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=False, - cache=cache) - yes_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=True, - cache=cache) + with_format = pd.to_datetime( + s_as_dt_strings, format=test_format, cache=cache + ) + no_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=False, cache=cache + ) + yes_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=True, cache=cache + ) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same tm.assert_series_equal(with_format, no_infer) tm.assert_series_equal(no_infer, yes_infer) - @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, - cache): - s = pd.Series(np.array(['01/01/2011 00:00:00', - '01-02-2011 00:00:00', - '2011-01-03T00:00:00'])) + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): + s = pd.Series( + np.array( + ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] + ) + ) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) - s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) + s = pd.Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): - s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, - '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_infer_datetime_format_series_start_with_nans(self, - cache): - s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', - '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) + s = pd.Series( + np.array(["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan]) + ) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): + s = pd.Series( + np.array( + [ + np.nan, + np.nan, + "01/01/2011 00:00:00", + "01/02/2011 00:00:00", + "01/03/2011 00:00:00", + ] + ) + ) + + tm.assert_series_equal( + pd.to_datetime(s, 
infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 - s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) - expected = pd.Series([pd.Timestamp('2014-01-01'), - pd.Timestamp('2014-02-02'), - pd.Timestamp('2015-03-03')]) + s = pd.Series(["2014-1-1", "2014-2-2", "2015-3-3"]) + expected = pd.Series( + [ + pd.Timestamp("2014-01-01"), + pd.Timestamp("2014-02-02"), + pd.Timestamp("2015-03-03"), + ] + ) tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected) - tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d', - cache=cache), expected) + tm.assert_series_equal( + pd.to_datetime(s, format="%Y-%m-%d", cache=cache), expected + ) class TestDaysInMonth: # tests for issue #10154 - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_coerce(self, cache): - assert isna(to_datetime('2015-02-29', errors='coerce', cache=cache)) - assert isna(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce', cache=cache)) - assert isna(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce', cache=cache)) - assert isna(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce', cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) + assert isna(to_datetime("2015-02-29", errors="coerce", cache=cache)) + assert isna( + to_datetime("2015-02-29", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-02-32", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-04-31", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-29', errors='raise', cache=cache) + to_datetime("2015-02-29", errors="raise", cache=cache) msg = "time data 2015-02-29 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-29', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-02-29", errors="raise", format="%Y-%m-%d", cache=cache) msg = "time data 2015-02-32 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-32', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-02-32", errors="raise", format="%Y-%m-%d", cache=cache) msg = "time data 2015-04-31 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-04-31', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-04-31", errors="raise", format="%Y-%m-%d", cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_ignore(self, cache): - assert to_datetime('2015-02-29', errors='ignore', - cache=cache) == '2015-02-29' - assert to_datetime('2015-02-29', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-02-29' - assert to_datetime('2015-02-32', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-02-32' - assert to_datetime('2015-04-31', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-04-31' + assert to_datetime("2015-02-29", errors="ignore", cache=cache) == "2015-02-29" + assert ( + to_datetime("2015-02-29", errors="ignore", format="%Y-%m-%d", cache=cache) + == 
"2015-02-29" + ) + assert ( + to_datetime("2015-02-32", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-02-32" + ) + assert ( + to_datetime("2015-04-31", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-04-31" + ) class TestDatetimeParsingWrappers: - - @pytest.mark.parametrize('date_str,expected', list({ - '2011-01-01': datetime(2011, 1, 1), - '2Q2005': datetime(2005, 4, 1), - '2Q05': datetime(2005, 4, 1), - '2005Q1': datetime(2005, 1, 1), - '05Q1': datetime(2005, 1, 1), - '2011Q3': datetime(2011, 7, 1), - '11Q3': datetime(2011, 7, 1), - '3Q2011': datetime(2011, 7, 1), - '3Q11': datetime(2011, 7, 1), - - # quarterly without space - '2000Q4': datetime(2000, 10, 1), - '00Q4': datetime(2000, 10, 1), - '4Q2000': datetime(2000, 10, 1), - '4Q00': datetime(2000, 10, 1), - '2000q4': datetime(2000, 10, 1), - '2000-Q4': datetime(2000, 10, 1), - '00-Q4': datetime(2000, 10, 1), - '4Q-2000': datetime(2000, 10, 1), - '4Q-00': datetime(2000, 10, 1), - '00q4': datetime(2000, 10, 1), - '2005': datetime(2005, 1, 1), - '2005-11': datetime(2005, 11, 1), - '2005 11': datetime(2005, 11, 1), - '11-2005': datetime(2005, 11, 1), - '11 2005': datetime(2005, 11, 1), - '200511': datetime(2020, 5, 11), - '20051109': datetime(2005, 11, 9), - '20051109 10:15': datetime(2005, 11, 9, 10, 15), - '20051109 08H': datetime(2005, 11, 9, 8, 0), - '2005-11-09 10:15': datetime(2005, 11, 9, 10, 15), - '2005-11-09 08H': datetime(2005, 11, 9, 8, 0), - '2005/11/09 10:15': datetime(2005, 11, 9, 10, 15), - '2005/11/09 08H': datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), - "Thu Sep 25 2003": datetime(2003, 9, 25), - "Sep 25 2003": datetime(2003, 9, 25), - "January 1 2014": datetime(2014, 1, 1), - - # GHE10537 - '2014-06': datetime(2014, 6, 1), - '06-2014': datetime(2014, 6, 1), - '2014-6': datetime(2014, 6, 1), - '6-2014': datetime(2014, 6, 1), - - '20010101 12': datetime(2001, 1, 1, 12), - '20010101 1234': datetime(2001, 1, 1, 12, 34), - '20010101 123456': datetime(2001, 1, 1, 12, 34, 56)}.items())) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "date_str,expected", + list( + { + "2011-01-01": datetime(2011, 1, 1), + "2Q2005": datetime(2005, 4, 1), + "2Q05": datetime(2005, 4, 1), + "2005Q1": datetime(2005, 1, 1), + "05Q1": datetime(2005, 1, 1), + "2011Q3": datetime(2011, 7, 1), + "11Q3": datetime(2011, 7, 1), + "3Q2011": datetime(2011, 7, 1), + "3Q11": datetime(2011, 7, 1), + # quarterly without space + "2000Q4": datetime(2000, 10, 1), + "00Q4": datetime(2000, 10, 1), + "4Q2000": datetime(2000, 10, 1), + "4Q00": datetime(2000, 10, 1), + "2000q4": datetime(2000, 10, 1), + "2000-Q4": datetime(2000, 10, 1), + "00-Q4": datetime(2000, 10, 1), + "4Q-2000": datetime(2000, 10, 1), + "4Q-00": datetime(2000, 10, 1), + "00q4": datetime(2000, 10, 1), + "2005": datetime(2005, 1, 1), + "2005-11": datetime(2005, 11, 1), + "2005 11": datetime(2005, 11, 1), + "11-2005": datetime(2005, 11, 1), + "11 2005": datetime(2005, 11, 1), + "200511": datetime(2020, 5, 11), + "20051109": datetime(2005, 11, 9), + "20051109 10:15": datetime(2005, 11, 9, 10, 15), + "20051109 08H": datetime(2005, 11, 9, 8, 0), + "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), + "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), + "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), + "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), + "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), + "Thu Sep 25 2003": datetime(2003, 9, 25), + "Sep 25 2003": datetime(2003, 9, 25), + 
"January 1 2014": datetime(2014, 1, 1), + # GHE10537 + "2014-06": datetime(2014, 6, 1), + "06-2014": datetime(2014, 6, 1), + "2014-6": datetime(2014, 6, 1), + "6-2014": datetime(2014, 6, 1), + "20010101 12": datetime(2001, 1, 1, 12), + "20010101 1234": datetime(2001, 1, 1, 12, 34), + "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), + }.items() + ), + ) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers(self, date_str, expected, cache): # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _, _ = parsing.parse_time_string(date_str, - yearfirst=yearfirst) + result1, _, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below - result4 = to_datetime(np.array([date_str], dtype=object), - yearfirst=yearfirst, cache=cache) + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -1682,31 +1871,30 @@ def test_parsers(self, date_str, expected, cache): if not yearfirst: result5 = Timestamp(date_str) assert result5 == expected - result7 = date_range(date_str, freq='S', periods=1, - yearfirst=yearfirst) + result7 = date_range(date_str, freq="S", periods=1, yearfirst=yearfirst) assert result7 == expected - @pytest.mark.parametrize('cache', [True, False]) - def test_na_values_with_cache(self, cache, unique_nulls_fixture, - unique_nulls_fixture2): + @pytest.mark.parametrize("cache", [True, False]) + def test_na_values_with_cache( + self, cache, unique_nulls_fixture, unique_nulls_fixture2 + ): # GH22305 - expected = Index([NaT, NaT], dtype='datetime64[ns]') - result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], - cache=cache) + expected = Index([NaT, NaT], dtype="datetime64[ns]") + result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) def test_parsers_nat(self): # Test that each of several string-accepting methods return pd.NaT - result1, _, _ = parsing.parse_time_string('NaT') - result2 = to_datetime('NaT') - result3 = Timestamp('NaT') - result4 = DatetimeIndex(['NaT'])[0] + result1, _, _ = parsing.parse_time_string("NaT") + result2 = to_datetime("NaT") + result3 = Timestamp("NaT") + result4 = DatetimeIndex(["NaT"])[0] assert result1 is NaT assert result2 is NaT assert result3 is NaT assert result4 is NaT - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers_dayfirst_yearfirst(self, cache): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 @@ -1749,55 +1937,58 @@ def test_parsers_dayfirst_yearfirst(self, cache): # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 # str : dayfirst, yearfirst, expected - cases = {'10-11-12': [(False, False, - datetime(2012, 10, 11)), - (True, False, - datetime(2012, 11, 10)), - (False, True, - datetime(2010, 11, 12)), - (True, True, - datetime(2010, 12, 11))], - '20/12/21': [(False, False, - datetime(2021, 12, 20)), - (True, False, - datetime(2021, 12, 20)), - (False, True, - datetime(2020, 12, 21)), - (True, True, - datetime(2020, 12, 21))]} + cases = { + "10-11-12": [ + (False, False, datetime(2012, 10, 11)), + (True, False, datetime(2012, 11, 10)), + (False, True, datetime(2010, 
11, 12)), + (True, True, datetime(2010, 12, 11)), + ], + "20/12/21": [ + (False, False, datetime(2021, 12, 20)), + (True, False, datetime(2021, 12, 20)), + (False, True, datetime(2020, 12, 21)), + (True, True, datetime(2020, 12, 21)), + ], + } for date_str, values in cases.items(): for dayfirst, yearfirst, expected in values: # compare with dateutil result - dateutil_result = parse(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) + dateutil_result = parse( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) assert dateutil_result == expected - result1, _, _ = parsing.parse_time_string(date_str, - dayfirst=dayfirst, - yearfirst=yearfirst) + result1, _, _ = parsing.parse_time_string( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) # we don't support dayfirst/yearfirst here: if not dayfirst and not yearfirst: result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime(date_str, dayfirst=dayfirst, - yearfirst=yearfirst, cache=cache) + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) - result4 = DatetimeIndex([date_str], dayfirst=dayfirst, - yearfirst=yearfirst)[0] + result4 = DatetimeIndex( + [date_str], dayfirst=dayfirst, yearfirst=yearfirst + )[0] assert result1 == expected assert result3 == expected assert result4 == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers_timestring(self, cache): # must be the same as dateutil result - cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)), - '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))} + cases = { + "10:15": (parse("10:15"), datetime(1, 1, 1, 10, 15)), + "9:05": (parse("9:05"), datetime(1, 1, 1, 9, 5)), + } for date_str, (exp_now, exp_def) in cases.items(): result1, _, _ = parsing.parse_time_string(date_str) @@ -1817,8 +2008,17 @@ def test_parsers_timestring(self, cache): @td.skip_if_has_locale def test_parsers_time(self): # GH11818 - strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500", - "2:15:00pm", "021500pm", time(14, 15)] + strings = [ + "14:15", + "1415", + "2:15pm", + "0215pm", + "14:15:00", + "141500", + "2:15:00pm", + "021500pm", + time(14, 15), + ] expected = time(14, 15) for time_string in strings: @@ -1835,8 +2035,7 @@ def test_parsers_time(self): assert tools.to_time(arg) == expected_arr assert tools.to_time(arg, format="%H:%M") == expected_arr assert tools.to_time(arg, infer_time_format=True) == expected_arr - assert tools.to_time(arg, format="%I:%M%p", - errors="coerce") == [None, None] + assert tools.to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] res = tools.to_time(arg, format="%I:%M%p", errors="ignore") tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) @@ -1844,30 +2043,42 @@ def test_parsers_time(self): with pytest.raises(ValueError): tools.to_time(arg, format="%I:%M%p", errors="raise") - tm.assert_series_equal(tools.to_time(Series(arg, name="test")), - Series(expected_arr, name="test")) + tm.assert_series_equal( + tools.to_time(Series(arg, name="test")), Series(expected_arr, name="test") + ) res = tools.to_time(np.array(arg)) assert isinstance(res, list) assert res == expected_arr - @pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize('dt_string, tz, dt_string_repr', [ - ('2013-01-01 05:45+0545', pytz.FixedOffset(345), - "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')"), - ('2013-01-01 05:30+0530', pytz.FixedOffset(330), - "Timestamp('2013-01-01 05:30:00+0530', 
tz='pytz.FixedOffset(330)')")]) - def test_parsers_timezone_minute_offsets_roundtrip(self, cache, dt_string, - tz, dt_string_repr): + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "dt_string, tz, dt_string_repr", + [ + ( + "2013-01-01 05:45+0545", + pytz.FixedOffset(345), + "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')", + ), + ( + "2013-01-01 05:30+0530", + pytz.FixedOffset(330), + "Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')", + ), + ], + ) + def test_parsers_timezone_minute_offsets_roundtrip( + self, cache, dt_string, tz, dt_string_repr + ): # GH11708 base = to_datetime("2013-01-01 00:00:00", cache=cache) - base = base.tz_localize('UTC').tz_convert(tz) + base = base.tz_localize("UTC").tz_convert(tz) dt_time = to_datetime(dt_string, cache=cache) assert base == dt_time assert dt_string_repr == repr(dt_time) -@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns']) +@pytest.fixture(params=["D", "s", "ms", "us", "ns"]) def units(request): """Day and some time units. @@ -1883,7 +2094,7 @@ def units(request): @pytest.fixture def epoch_1960(): """Timestamp at 1960-01-01.""" - return Timestamp('1960-01-01') + return Timestamp("1960-01-01") @pytest.fixture @@ -1891,7 +2102,7 @@ def units_from_epochs(): return list(range(5)) -@pytest.fixture(params=['timestamp', 'pydatetime', 'datetime64', 'str_1960']) +@pytest.fixture(params=["timestamp", "pydatetime", "datetime64", "str_1960"]) def epochs(epoch_1960, request): """Timestamp at 1960-01-01 in various forms. @@ -1900,11 +2111,10 @@ def epochs(epoch_1960, request): * numpy.datetime64 * str """ - assert request.param in {'timestamp', 'pydatetime', 'datetime64', - "str_1960"} - if request.param == 'timestamp': + assert request.param in {"timestamp", "pydatetime", "datetime64", "str_1960"} + if request.param == "timestamp": return epoch_1960 - elif request.param == 'pydatetime': + elif request.param == "pydatetime": return epoch_1960.to_pydatetime() elif request.param == "datetime64": return epoch_1960.to_datetime64() @@ -1914,50 +2124,47 @@ def epochs(epoch_1960, request): @pytest.fixture def julian_dates(): - return pd.date_range('2014-1-1', periods=10).to_julian_date().values + return pd.date_range("2014-1-1", periods=10).to_julian_date().values class TestOrigin: - def test_to_basic(self, julian_dates): # gh-11276, gh-11745 # for origin as julian - result = Series(pd.to_datetime( - julian_dates, unit='D', origin='julian')) - expected = Series(pd.to_datetime( - julian_dates - pd.Timestamp(0).to_julian_date(), unit='D')) + result = Series(pd.to_datetime(julian_dates, unit="D", origin="julian")) + expected = Series( + pd.to_datetime(julian_dates - pd.Timestamp(0).to_julian_date(), unit="D") + ) assert_series_equal(result, expected) - result = Series(pd.to_datetime( - [0, 1, 2], unit='D', origin='unix')) - expected = Series([Timestamp('1970-01-01'), - Timestamp('1970-01-02'), - Timestamp('1970-01-03')]) + result = Series(pd.to_datetime([0, 1, 2], unit="D", origin="unix")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) assert_series_equal(result, expected) # default - result = Series(pd.to_datetime( - [0, 1, 2], unit='D')) - expected = Series([Timestamp('1970-01-01'), - Timestamp('1970-01-02'), - Timestamp('1970-01-03')]) + result = Series(pd.to_datetime([0, 1, 2], unit="D")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) assert_series_equal(result, expected) def 
test_julian_round_trip(self): - result = pd.to_datetime(2456658, origin='julian', unit='D') + result = pd.to_datetime(2456658, origin="julian", unit="D") assert result.to_julian_date() == 2456658 # out-of-bounds with pytest.raises(ValueError): - pd.to_datetime(1, origin="julian", unit='D') + pd.to_datetime(1, origin="julian", unit="D") def test_invalid_unit(self, units, julian_dates): # checking for invalid combination of origin='julian' and unit != D - if units != 'D': + if units != "D": with pytest.raises(ValueError): - pd.to_datetime(julian_dates, unit=units, origin='julian') + pd.to_datetime(julian_dates, unit=units, origin="julian") def test_invalid_origin(self): @@ -1966,38 +2173,37 @@ def test_invalid_origin(self): pd.to_datetime("2005-01-01", origin="1960-01-01") with pytest.raises(ValueError): - pd.to_datetime("2005-01-01", origin="1960-01-01", unit='D') + pd.to_datetime("2005-01-01", origin="1960-01-01", unit="D") def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): expected = Series( - [pd.Timedelta(x, unit=units) + - epoch_1960 for x in units_from_epochs]) + [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] + ) - result = Series(pd.to_datetime( - units_from_epochs, unit=units, origin=epochs)) + result = Series(pd.to_datetime(units_from_epochs, unit=units, origin=epochs)) assert_series_equal(result, expected) - @pytest.mark.parametrize("origin, exc", - [('random_string', ValueError), - ('epoch', ValueError), - ('13-24-1990', ValueError), - (datetime(1, 1, 1), tslib.OutOfBoundsDatetime)]) + @pytest.mark.parametrize( + "origin, exc", + [ + ("random_string", ValueError), + ("epoch", ValueError), + ("13-24-1990", ValueError), + (datetime(1, 1, 1), tslib.OutOfBoundsDatetime), + ], + ) def test_invalid_origins(self, origin, exc, units, units_from_epochs): with pytest.raises(exc): - pd.to_datetime(units_from_epochs, unit=units, - origin=origin) + pd.to_datetime(units_from_epochs, unit=units, origin=origin) def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError): - pd.to_datetime(1, unit='D', - origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + pd.to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) - @pytest.mark.parametrize("format", [ - None, "%Y-%m-%d %H:%M:%S" - ]) + @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) def test_to_datetime_out_of_bounds_with_format_arg(self, format): # see gh-23830 msg = "Out of bounds nanosecond timestamp" @@ -2008,45 +2214,53 @@ def test_processing_order(self): # make sure we handle out-of-bounds *before* # constructing the dates - result = pd.to_datetime(200 * 365, unit='D') - expected = Timestamp('2169-11-13 00:00:00') + result = pd.to_datetime(200 * 365, unit="D") + expected = Timestamp("2169-11-13 00:00:00") assert result == expected - result = pd.to_datetime(200 * 365, unit='D', origin='1870-01-01') - expected = Timestamp('2069-11-13 00:00:00') + result = pd.to_datetime(200 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2069-11-13 00:00:00") assert result == expected - result = pd.to_datetime(300 * 365, unit='D', origin='1870-01-01') - expected = Timestamp('2169-10-20 00:00:00') + result = pd.to_datetime(300 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2169-10-20 00:00:00") assert result == expected - @pytest.mark.parametrize('offset,utc,exp', [ - ["Z", True, "2019-01-01T00:00:00.000Z"], - ["Z", None, "2019-01-01T00:00:00.000Z"], - ["-01:00", True, "2019-01-01T01:00:00.000Z"], - ["-01:00", None, 
"2019-01-01T00:00:00.000-01:00"], - ]) + @pytest.mark.parametrize( + "offset,utc,exp", + [ + ["Z", True, "2019-01-01T00:00:00.000Z"], + ["Z", None, "2019-01-01T00:00:00.000Z"], + ["-01:00", True, "2019-01-01T01:00:00.000Z"], + ["-01:00", None, "2019-01-01T00:00:00.000-01:00"], + ], + ) def test_arg_tz_ns_unit(self, offset, utc, exp): # GH 25546 arg = "2019-01-01T00:00:00.000" + offset - result = to_datetime([arg], unit='ns', utc=utc) + result = to_datetime([arg], unit="ns", utc=utc) expected = to_datetime([exp]) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('listlike,do_caching', [ - ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), - ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True) -]) +@pytest.mark.parametrize( + "listlike,do_caching", + [([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)], +) def test_should_cache(listlike, do_caching): - assert tools.should_cache(listlike, check_count=len(listlike), - unique_share=0.7) == do_caching - - -@pytest.mark.parametrize('unique_share,check_count, err_message', [ - (0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'), - (10, 2, r'unique_share must be in next bounds: \(0; 1\)') -]) + assert ( + tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7) + == do_caching + ) + + +@pytest.mark.parametrize( + "unique_share,check_count, err_message", + [ + (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"), + (10, 2, r"unique_share must be in next bounds: \(0; 1\)"), + ], +) def test_should_cache_errors(unique_share, check_count, err_message): arg = [5] * 10 diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 16bcb459a22f99..91022fef165217 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -4,8 +4,14 @@ from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype from pandas import ( - CategoricalIndex, Index, IntervalIndex, NaT, Timedelta, Timestamp, - interval_range) + CategoricalIndex, + Index, + IntervalIndex, + NaT, + Timedelta, + Timestamp, + interval_range, +) import pandas.util.testing as tm @@ -13,7 +19,7 @@ class Base: """Tests common to IntervalIndex with any subtype""" def test_astype_idempotent(self, index): - result = index.astype('interval') + result = index.astype("interval") tm.assert_index_equal(result, index) result = index.astype(index.dtype) @@ -21,12 +27,12 @@ def test_astype_idempotent(self, index): def test_astype_object(self, index): result = index.astype(object) - expected = Index(index.values, dtype='object') + expected = Index(index.values, dtype="object") tm.assert_index_equal(result, expected) assert not result.equals(index) def test_astype_category(self, index): - result = index.astype('category') + result = index.astype("category") expected = CategoricalIndex(index.values) tm.assert_index_equal(result, expected) @@ -37,64 +43,77 @@ def test_astype_category(self, index): categories = index.dropna().unique().values[:-1] dtype = CategoricalDtype(categories=categories, ordered=True) result = index.astype(dtype) - expected = CategoricalIndex( - index.values, categories=categories, ordered=True) + expected = CategoricalIndex(index.values, categories=categories, ordered=True) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - 'int64', 'uint64', 'float64', 'complex128', 'period[M]', - 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]', - 'datetime64[ns, US/Eastern]']) + 
@pytest.mark.parametrize( + "dtype", + [ + "int64", + "uint64", + "float64", + "complex128", + "period[M]", + "timedelta64", + "timedelta64[ns]", + "datetime64", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) def test_astype_cannot_cast(self, index, dtype): - msg = 'Cannot cast IntervalIndex to dtype' + msg = "Cannot cast IntervalIndex to dtype" with pytest.raises(TypeError, match=msg): index.astype(dtype) def test_astype_invalid_dtype(self, index): msg = "data type 'fake_dtype' not understood" with pytest.raises(TypeError, match=msg): - index.astype('fake_dtype') + index.astype("fake_dtype") class TestIntSubtype(Base): """Tests specific to IntervalIndex with integer-like subtype""" indexes = [ - IntervalIndex.from_breaks(np.arange(-10, 11, dtype='int64')), - IntervalIndex.from_breaks( - np.arange(100, dtype='uint64'), closed='left'), + IntervalIndex.from_breaks(np.arange(-10, 11, dtype="int64")), + IntervalIndex.from_breaks(np.arange(100, dtype="uint64"), closed="left"), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', [ - 'float64', 'datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize( + "subtype", ["float64", "datetime64[ns]", "timedelta64[ns]"] + ) def test_subtype_conversion(self, index, subtype): dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('subtype_start, subtype_end', [ - ('int64', 'uint64'), ('uint64', 'int64')]) + @pytest.mark.parametrize( + "subtype_start, subtype_end", [("int64", "uint64"), ("uint64", "int64")] + ) def test_subtype_integer(self, subtype_start, subtype_end): index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) dtype = IntervalDtype(subtype_end) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype_end), - index.right.astype(subtype_end), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype_end), + index.right.astype(subtype_end), + closed=index.closed, + ) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(reason='GH#15832') + @pytest.mark.xfail(reason="GH#15832") def test_subtype_integer_errors(self): # int64 -> uint64 fails with negative values index = interval_range(-10, 10) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) @@ -103,53 +122,53 @@ class TestFloatSubtype(Base): """Tests specific to IntervalIndex with float subtype""" indexes = [ - interval_range(-10.0, 10.0, closed='neither'), - IntervalIndex.from_arrays([-1.5, np.nan, 0., 0., 1.5], - [-0.5, np.nan, 1., 1., 3.], - closed='both'), + interval_range(-10.0, 10.0, closed="neither"), + IntervalIndex.from_arrays( + [-1.5, np.nan, 0.0, 0.0, 1.5], [-0.5, np.nan, 1.0, 1.0, 3.0], closed="both" + ), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', ['int64', 'uint64']) + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, subtype): index = interval_range(0.0, 10.0) dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - 
index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) # raises with NA - msg = 'Cannot convert NA to integer' + msg = "Cannot convert NA to integer" with pytest.raises(ValueError, match=msg): index.insert(0, np.nan).astype(dtype) - @pytest.mark.xfail(reason='GH#15832') + @pytest.mark.xfail(reason="GH#15832") def test_subtype_integer_errors(self): # float64 -> uint64 fails with negative values index = interval_range(-10.0, 10.0) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) # float64 -> integer-like fails with non-integer valued floats index = interval_range(0.0, 10.0, freq=0.25) - dtype = IntervalDtype('int64') + dtype = IntervalDtype("int64") with pytest.raises(ValueError): index.astype(dtype) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) - @pytest.mark.parametrize('subtype', ['datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_subtype_datetimelike(self, index, subtype): dtype = IntervalDtype(subtype) - msg = 'Cannot convert .* to .*; subtypes are incompatible' + msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) @@ -158,47 +177,47 @@ class TestDatetimelikeSubtype(Base): """Tests specific to IntervalIndex with datetime-like subtype""" indexes = [ - interval_range(Timestamp('2018-01-01'), periods=10, closed='neither'), - interval_range(Timestamp('2018-01-01'), periods=10).insert(2, NaT), - interval_range(Timestamp('2018-01-01', tz='US/Eastern'), periods=10), - interval_range(Timedelta('0 days'), periods=10, closed='both'), - interval_range(Timedelta('0 days'), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01"), periods=10, closed="neither"), + interval_range(Timestamp("2018-01-01"), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01", tz="US/Eastern"), periods=10), + interval_range(Timedelta("0 days"), periods=10, closed="both"), + interval_range(Timedelta("0 days"), periods=10).insert(2, NaT), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', ['int64', 'uint64']) + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, index, subtype): dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) def test_subtype_float(self, index): - dtype = IntervalDtype('float64') - msg = 'Cannot convert .* to .*; subtypes are incompatible' + dtype = IntervalDtype("float64") + msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) def test_subtype_datetimelike(self): # datetime -> timedelta raises - dtype = IntervalDtype('timedelta64[ns]') - msg = 'Cannot convert .* to .*; subtypes are incompatible' + dtype = IntervalDtype("timedelta64[ns]") + msg = "Cannot convert .* to .*; subtypes are incompatible" - index = interval_range(Timestamp('2018-01-01'), periods=10) + index = 
interval_range(Timestamp("2018-01-01"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) - index = interval_range(Timestamp('2018-01-01', tz='CET'), periods=10) + index = interval_range(Timestamp("2018-01-01", tz="CET"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) # timedelta -> datetime raises - dtype = IntervalDtype('datetime64[ns]') - index = interval_range(Timedelta('0 days'), periods=10) + dtype = IntervalDtype("datetime64[ns]") + index = interval_range(Timedelta("0 days"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index aabaaa0f297f9d..e2abb4531525ae 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -7,14 +7,24 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas import ( - Categorical, CategoricalIndex, Float64Index, Index, Int64Index, Interval, - IntervalIndex, date_range, notna, period_range, timedelta_range) + Categorical, + CategoricalIndex, + Float64Index, + Index, + Int64Index, + Interval, + IntervalIndex, + date_range, + notna, + period_range, + timedelta_range, +) from pandas.core.arrays import IntervalArray import pandas.core.common as com import pandas.util.testing as tm -@pytest.fixture(params=[None, 'foo']) +@pytest.fixture(params=[None, "foo"]) def name(request): return request.param @@ -26,31 +36,39 @@ class Base: get_kwargs_from_breaks to the expected format. """ - @pytest.mark.parametrize('breaks', [ - [3, 14, 15, 92, 653], - np.arange(10, dtype='int64'), - Int64Index(range(-10, 11)), - Float64Index(np.arange(20, 30, 0.5)), - date_range('20180101', periods=10), - date_range('20180101', periods=10, tz='US/Eastern'), - timedelta_range('1 day', periods=10)]) + @pytest.mark.parametrize( + "breaks", + [ + [3, 14, 15, 92, 653], + np.arange(10, dtype="int64"), + Int64Index(range(-10, 11)), + Float64Index(np.arange(20, 30, 0.5)), + date_range("20180101", periods=10), + date_range("20180101", periods=10, tz="US/Eastern"), + timedelta_range("1 day", periods=10), + ], + ) def test_constructor(self, constructor, breaks, closed, name): result_kwargs = self.get_kwargs_from_breaks(breaks, closed) result = constructor(closed=closed, name=name, **result_kwargs) assert result.closed == closed assert result.name == name - assert result.dtype.subtype == getattr(breaks, 'dtype', 'int64') + assert result.dtype.subtype == getattr(breaks, "dtype", "int64") tm.assert_index_equal(result.left, Index(breaks[:-1])) tm.assert_index_equal(result.right, Index(breaks[1:])) - @pytest.mark.parametrize('breaks, subtype', [ - (Int64Index([0, 1, 2, 3, 4]), 'float64'), - (Int64Index([0, 1, 2, 3, 4]), 'datetime64[ns]'), - (Int64Index([0, 1, 2, 3, 4]), 'timedelta64[ns]'), - (Float64Index([0, 1, 2, 3, 4]), 'int64'), - (date_range('2017-01-01', periods=5), 'int64'), - (timedelta_range('1 day', periods=5), 'int64')]) + @pytest.mark.parametrize( + "breaks, subtype", + [ + (Int64Index([0, 1, 2, 3, 4]), "float64"), + (Int64Index([0, 1, 2, 3, 4]), "datetime64[ns]"), + (Int64Index([0, 1, 2, 3, 4]), "timedelta64[ns]"), + (Float64Index([0, 1, 2, 3, 4]), "int64"), + (date_range("2017-01-01", periods=5), "int64"), + (timedelta_range("1 day", periods=5), "int64"), + ], + ) def test_constructor_dtype(self, constructor, breaks, subtype): # GH 19262: conversion via dtype parameter expected_kwargs = 
self.get_kwargs_from_breaks(breaks.astype(subtype)) @@ -62,8 +80,7 @@ def test_constructor_dtype(self, constructor, breaks, subtype): result = constructor(dtype=dtype, **result_kwargs) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('breaks', [ - [np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) + @pytest.mark.parametrize("breaks", [[np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) def test_constructor_nan(self, constructor, breaks, closed): # GH 18421 result_kwargs = self.get_kwargs_from_breaks(breaks) @@ -76,46 +93,55 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.dtype.subtype == expected_subtype tm.assert_numpy_array_equal(result._ndarray_values, expected_values) - @pytest.mark.parametrize('breaks', [ - [], - np.array([], dtype='int64'), - np.array([], dtype='float64'), - np.array([], dtype='datetime64[ns]'), - np.array([], dtype='timedelta64[ns]')]) + @pytest.mark.parametrize( + "breaks", + [ + [], + np.array([], dtype="int64"), + np.array([], dtype="float64"), + np.array([], dtype="datetime64[ns]"), + np.array([], dtype="timedelta64[ns]"), + ], + ) def test_constructor_empty(self, constructor, breaks, closed): # GH 18421 result_kwargs = self.get_kwargs_from_breaks(breaks) result = constructor(closed=closed, **result_kwargs) expected_values = np.array([], dtype=object) - expected_subtype = getattr(breaks, 'dtype', np.int64) + expected_subtype = getattr(breaks, "dtype", np.int64) assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype tm.assert_numpy_array_equal(result._ndarray_values, expected_values) - @pytest.mark.parametrize('breaks', [ - tuple('0123456789'), - list('abcdefghij'), - np.array(list('abcdefghij'), dtype=object), - np.array(list('abcdefghij'), dtype=' self.index tm.assert_numpy_array_equal(actual, expected) - actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], "left") tm.assert_numpy_array_equal(actual, expected) actual = self.index == self.index.values @@ -718,9 +785,9 @@ def test_comparison(self): actual = self.index == self.index.left tm.assert_numpy_array_equal(actual, np.array([False, False])) - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): self.index > 0 - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): self.index <= 0 msg = r"unorderable types: Interval\(\) > int\(\)" with pytest.raises(TypeError, match=msg): @@ -730,20 +797,22 @@ def test_comparison(self): self.index > np.arange(3) def test_missing_values(self, closed): - idx = Index([np.nan, Interval(0, 1, closed=closed), - Interval(1, 2, closed=closed)]) - idx2 = IntervalIndex.from_arrays( - [np.nan, 0, 1], [np.nan, 1, 2], closed=closed) + idx = Index( + [np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed)] + ) + idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) - msg = ("missing values must be missing in the same location both left" - " and right sides") + msg = ( + "missing values must be missing in the same location both left" + " and right sides" + ) with pytest.raises(ValueError, match=msg): IntervalIndex.from_arrays( - [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed + ) - tm.assert_numpy_array_equal(isna(idx), - np.array([True, False, False])) + 
tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) def test_sort_values(self, closed): index = self.create_index(closed=closed) @@ -765,44 +834,46 @@ def test_sort_values(self, closed): expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_datetime(self, tz): - start = Timestamp('2000-01-01', tz=tz) + start = Timestamp("2000-01-01", tz=tz) dates = date_range(start=start, periods=10) index = IntervalIndex.from_breaks(dates) # test mid - start = Timestamp('2000-01-01T12:00', tz=tz) + start = Timestamp("2000-01-01T12:00", tz=tz) expected = date_range(start=start, periods=9) tm.assert_index_equal(index.mid, expected) # __contains__ doesn't check individual points - assert Timestamp('2000-01-01', tz=tz) not in index - assert Timestamp('2000-01-01T12', tz=tz) not in index - assert Timestamp('2000-01-02', tz=tz) not in index - iv_true = Interval(Timestamp('2000-01-02', tz=tz), - Timestamp('2000-01-03', tz=tz)) - iv_false = Interval(Timestamp('1999-12-31', tz=tz), - Timestamp('2000-01-01', tz=tz)) + assert Timestamp("2000-01-01", tz=tz) not in index + assert Timestamp("2000-01-01T12", tz=tz) not in index + assert Timestamp("2000-01-02", tz=tz) not in index + iv_true = Interval( + Timestamp("2000-01-02", tz=tz), Timestamp("2000-01-03", tz=tz) + ) + iv_false = Interval( + Timestamp("1999-12-31", tz=tz), Timestamp("2000-01-01", tz=tz) + ) assert iv_true in index assert iv_false not in index # .contains does check individual points - assert not index.contains(Timestamp('2000-01-01', tz=tz)).any() - assert index.contains(Timestamp('2000-01-01T12', tz=tz)).any() - assert index.contains(Timestamp('2000-01-02', tz=tz)).any() + assert not index.contains(Timestamp("2000-01-01", tz=tz)).any() + assert index.contains(Timestamp("2000-01-01T12", tz=tz)).any() + assert index.contains(Timestamp("2000-01-02", tz=tz)).any() # test get_indexer - start = Timestamp('1999-12-31T12:00', tz=tz) - target = date_range(start=start, periods=7, freq='12H') + start = Timestamp("1999-12-31T12:00", tz=tz) + target = date_range(start=start, periods=7, freq="12H") actual = index.get_indexer(target) - expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype='intp') + expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") tm.assert_numpy_array_equal(actual, expected) - start = Timestamp('2000-01-08T18:00', tz=tz) - target = date_range(start=start, periods=7, freq='6H') + start = Timestamp("2000-01-08T18:00", tz=tz) + target = date_range(start=start, periods=7, freq="6H") actual = index.get_indexer(target) - expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype='intp') + expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") tm.assert_numpy_array_equal(actual, expected) def test_append(self, closed): @@ -811,20 +882,23 @@ def test_append(self, closed): index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) result = index1.append(index2) - expected = IntervalIndex.from_arrays( - [0, 1, 1, 2], [1, 2, 2, 3], closed=closed) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) result = index1.append([index1, index2]) expected = IntervalIndex.from_arrays( - [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed) + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed + ) tm.assert_index_equal(result, expected) - msg = ('can only append two IntervalIndex objects that are closed 
' - 'on the same side') - for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + msg = ( + "can only append two IntervalIndex objects that are closed " + "on the same side" + ) + for other_closed in {"left", "right", "both", "neither"} - {closed}: index_other_closed = IntervalIndex.from_arrays( - [0, 1], [1, 2], closed=other_closed) + [0, 1], [1, 2], closed=other_closed + ) with pytest.raises(ValueError, match=msg): index1.append(index_other_closed) @@ -854,24 +928,27 @@ def test_is_non_overlapping_monotonic(self, closed): assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', otherwise True (GH16560) - if closed == 'both': + if closed == "both": idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is False else: idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True - @pytest.mark.parametrize('start, shift, na_value', [ - (0, 1, np.nan), - (Timestamp('2018-01-01'), Timedelta('1 day'), pd.NaT), - (Timedelta('0 days'), Timedelta('1 day'), pd.NaT)]) + @pytest.mark.parametrize( + "start, shift, na_value", + [ + (0, 1, np.nan), + (Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT), + (Timedelta("0 days"), Timedelta("1 day"), pd.NaT), + ], + ) def test_is_overlapping(self, start, shift, na_value, closed): # GH 23309 # see test_interval_tree.py for extensive tests; interface tests here # non-overlapping - tuples = [(start + n * shift, start + (n + 1) * shift) - for n in (0, 2, 4)] + tuples = [(start + n * shift, start + (n + 1) * shift) for n in (0, 2, 4)] index = IntervalIndex.from_tuples(tuples, closed=closed) assert index.is_overlapping is False @@ -881,8 +958,7 @@ def test_is_overlapping(self, start, shift, na_value, closed): assert index.is_overlapping is False # overlapping - tuples = [(start + n * shift, start + (n + 2) * shift) - for n in range(3)] + tuples = [(start + n * shift, start + (n + 2) * shift) for n in range(3)] index = IntervalIndex.from_tuples(tuples, closed=closed) assert index.is_overlapping is True @@ -892,11 +968,10 @@ def test_is_overlapping(self, start, shift, na_value, closed): assert index.is_overlapping is True # common endpoints - tuples = [(start + n * shift, start + (n + 1) * shift) - for n in range(3)] + tuples = [(start + n * shift, start + (n + 1) * shift) for n in range(3)] index = IntervalIndex.from_tuples(tuples, closed=closed) result = index.is_overlapping - expected = closed == 'both' + expected = closed == "both" assert result is expected # common endpoints with NA @@ -905,12 +980,24 @@ def test_is_overlapping(self, start, shift, na_value, closed): result = index.is_overlapping assert result is expected - @pytest.mark.parametrize('tuples', [ - list(zip(range(10), range(1, 11))), - list(zip(date_range('20170101', periods=10), - date_range('20170101', periods=10))), - list(zip(timedelta_range('0 days', periods=10), - timedelta_range('1 day', periods=10)))]) + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))), + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ), + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ), + ], + ) def test_to_tuples(self, tuples): # GH 18756 idx = IntervalIndex.from_tuples(tuples) @@ -918,13 +1005,27 @@ def test_to_tuples(self, tuples): expected = Index(com.asarray_tuplesafe(tuples)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tuples', [ - 
list(zip(range(10), range(1, 11))) + [np.nan], - list(zip(date_range('20170101', periods=10), - date_range('20170101', periods=10))) + [np.nan], - list(zip(timedelta_range('0 days', periods=10), - timedelta_range('1 day', periods=10))) + [np.nan]]) - @pytest.mark.parametrize('na_tuple', [True, False]) + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))) + [np.nan], + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ) + + [np.nan], + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ) + + [np.nan], + ], + ) + @pytest.mark.parametrize("na_tuple", [True, False]) def test_to_tuples_na(self, tuples, na_tuple): # GH 18756 idx = IntervalIndex.from_tuples(tuples) @@ -946,8 +1047,8 @@ def test_to_tuples_na(self, tuples, na_tuple): def test_nbytes(self): # GH 19209 - left = np.arange(0, 4, dtype='i8') - right = np.arange(1, 5, dtype='i8') + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") result = IntervalIndex.from_arrays(left, right).nbytes expected = 64 # 4 * 8 * 2 @@ -955,8 +1056,8 @@ def test_nbytes(self): def test_itemsize(self): # GH 19209 - left = np.arange(0, 4, dtype='i8') - right = np.arange(1, 5, dtype='i8') + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") expected = 16 # 8 * 2 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -964,8 +1065,7 @@ def test_itemsize(self): assert result == expected - @pytest.mark.parametrize('new_closed', [ - 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) def test_set_closed(self, name, closed, new_closed): # GH 21670 index = interval_range(0, 5, closed=closed, name=name) @@ -973,7 +1073,7 @@ def test_set_closed(self, name, closed, new_closed): expected = interval_range(0, 5, closed=new_closed, name=name) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('bad_closed', ['foo', 10, 'LEFT', True, False]) + @pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False]) def test_set_closed_errors(self, bad_closed): # GH 21670 index = interval_range(0, 5) @@ -983,7 +1083,8 @@ def test_set_closed_errors(self, bad_closed): def test_is_all_dates(self): # GH 23576 - year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'), - pd.Timestamp('2018-01-01 00:00:00')) + year_2017 = pd.Interval( + pd.Timestamp("2017-01-01 00:00:00"), pd.Timestamp("2018-01-01 00:00:00") + ) year_2017_index = pd.IntervalIndex([year_2017]) assert not year_2017_index.is_all_dates diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index cef230e98a6eec..ab9f7ef1c3e262 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -7,14 +7,12 @@ class TestIntervalIndex: - - @pytest.mark.parametrize("side", ['right', 'left', 'both', 'neither']) + @pytest.mark.parametrize("side", ["right", "left", "both", "neither"]) def test_get_loc_interval(self, closed, side): idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) - for bound in [[0, 1], [1, 2], [2, 3], [3, 4], - [0, 2], [2.5, 3], [-1, 4]]: + for bound in [[0, 1], [1, 2], [2, 3], [3, 4], [0, 2], [2.5, 3], [-1, 4]]: # if get_loc is supplied an interval, it should only search # for exact matches, not overlaps or covers, else KeyError. 
if closed == side: @@ -34,10 +32,12 @@ def test_get_loc_scalar(self, closed, scalar): # correct = {side: {query: answer}}. # If query is not in the dict, that query should raise a KeyError - correct = {'right': {0.5: 0, 1: 0, 2.5: 1, 3: 1}, - 'left': {0: 0, 0.5: 0, 2: 1, 2.5: 1}, - 'both': {0: 0, 0.5: 0, 1: 0, 2: 1, 2.5: 1, 3: 1}, - 'neither': {0.5: 0, 2.5: 1}} + correct = { + "right": {0.5: 0, 1: 0, 2.5: 1, 3: 1}, + "left": {0: 0, 0.5: 0, 2: 1, 2.5: 1}, + "both": {0: 0, 0.5: 0, 1: 0, 2: 1, 2.5: 1, 3: 1}, + "neither": {0.5: 0, 2.5: 1}, + } idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) @@ -54,35 +54,29 @@ def test_slice_locs_with_interval(self): # increasing monotonically index = IntervalIndex.from_tuples([(0, 2), (1, 3), (2, 4)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 1) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 1) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 1) # decreasing monotonically index = IntervalIndex.from_tuples([(2, 4), (1, 3), (0, 2)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (2, 1) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (2, 1) assert index.slice_locs(start=Interval(0, 2)) == (2, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 1) assert index.slice_locs(end=Interval(0, 2)) == (0, 3) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (0, 3) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (0, 3) # sorted duplicates index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 2) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) # unsorted duplicates index = IntervalIndex.from_tuples([(0, 2), (2, 4), (0, 2)]) @@ -104,13 +98,11 @@ def test_slice_locs_with_interval(self): # another unsorted duplicates index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4), (1, 3)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 4) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 2) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) def test_slice_locs_with_ints_and_floats_succeeds(self): @@ -133,108 +125,125 @@ def test_slice_locs_with_ints_and_floats_succeeds(self): assert index.slice_locs(3, 4) == (1, 1) assert index.slice_locs(0, 4) == (3, 1) - @pytest.mark.parametrize("query", [ - [0, 1], [0, 2], [0, 3], [0, 4]]) - @pytest.mark.parametrize("tuples", [ - [(0, 2), (1, 3), (2, 4)], - [(2, 4), (1, 3), (0, 2)], - [(0, 2), (0, 2), (2, 4)], - [(0, 2), (2, 4), (0, 2)], - [(0, 2), (0, 
2), (2, 4), (1, 3)]]) + @pytest.mark.parametrize("query", [[0, 1], [0, 2], [0, 3], [0, 4]]) + @pytest.mark.parametrize( + "tuples", + [ + [(0, 2), (1, 3), (2, 4)], + [(2, 4), (1, 3), (0, 2)], + [(0, 2), (0, 2), (2, 4)], + [(0, 2), (2, 4), (0, 2)], + [(0, 2), (0, 2), (2, 4), (1, 3)], + ], + ) def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): start, stop = query index = IntervalIndex.from_tuples(tuples) with pytest.raises(KeyError): index.slice_locs(start, stop) - @pytest.mark.parametrize('query, expected', [ - ([Interval(2, 4, closed='right')], [1]), - ([Interval(2, 4, closed='left')], [-1]), - ([Interval(2, 4, closed='both')], [-1]), - ([Interval(2, 4, closed='neither')], [-1]), - ([Interval(1, 4, closed='right')], [-1]), - ([Interval(0, 4, closed='right')], [-1]), - ([Interval(0.5, 1.5, closed='right')], [-1]), - ([Interval(2, 4, closed='right'), Interval(0, 1, closed='right')], - [1, -1]), - ([Interval(2, 4, closed='right'), Interval(2, 4, closed='right')], - [1, 1]), - ([Interval(5, 7, closed='right'), Interval(2, 4, closed='right')], - [2, 1]), - ([Interval(2, 4, closed='right'), Interval(2, 4, closed='left')], - [1, -1])]) + @pytest.mark.parametrize( + "query, expected", + [ + ([Interval(2, 4, closed="right")], [1]), + ([Interval(2, 4, closed="left")], [-1]), + ([Interval(2, 4, closed="both")], [-1]), + ([Interval(2, 4, closed="neither")], [-1]), + ([Interval(1, 4, closed="right")], [-1]), + ([Interval(0, 4, closed="right")], [-1]), + ([Interval(0.5, 1.5, closed="right")], [-1]), + ([Interval(2, 4, closed="right"), Interval(0, 1, closed="right")], [1, -1]), + ([Interval(2, 4, closed="right"), Interval(2, 4, closed="right")], [1, 1]), + ([Interval(5, 7, closed="right"), Interval(2, 4, closed="right")], [2, 1]), + ([Interval(2, 4, closed="right"), Interval(2, 4, closed="left")], [1, -1]), + ], + ) def test_get_indexer_with_interval(self, query, expected): tuples = [(0, 2), (2, 4), (5, 7)] - index = IntervalIndex.from_tuples(tuples, closed='right') + index = IntervalIndex.from_tuples(tuples, closed="right") result = index.get_indexer(query) - expected = np.array(expected, dtype='intp') + expected = np.array(expected, dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('query, expected', [ - ([-0.5], [-1]), - ([0], [-1]), - ([0.5], [0]), - ([1], [0]), - ([1.5], [1]), - ([2], [1]), - ([2.5], [-1]), - ([3], [-1]), - ([3.5], [2]), - ([4], [2]), - ([4.5], [-1]), - ([1, 2], [0, 1]), - ([1, 2, 3], [0, 1, -1]), - ([1, 2, 3, 4], [0, 1, -1, 2]), - ([1, 2, 3, 4, 2], [0, 1, -1, 2, 1])]) + @pytest.mark.parametrize( + "query, expected", + [ + ([-0.5], [-1]), + ([0], [-1]), + ([0.5], [0]), + ([1], [0]), + ([1.5], [1]), + ([2], [1]), + ([2.5], [-1]), + ([3], [-1]), + ([3.5], [2]), + ([4], [2]), + ([4.5], [-1]), + ([1, 2], [0, 1]), + ([1, 2, 3], [0, 1, -1]), + ([1, 2, 3, 4], [0, 1, -1, 2]), + ([1, 2, 3, 4, 2], [0, 1, -1, 2, 1]), + ], + ) def test_get_indexer_with_int_and_float(self, query, expected): tuples = [(0, 1), (1, 2), (3, 4)] - index = IntervalIndex.from_tuples(tuples, closed='right') + index = IntervalIndex.from_tuples(tuples, closed="right") result = index.get_indexer(query) - expected = np.array(expected, dtype='intp') + expected = np.array(expected, dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('tuples, closed', [ - ([(0, 2), (1, 3), (3, 4)], 'neither'), - ([(0, 5), (1, 4), (6, 7)], 'left'), - ([(0, 1), (0, 1), (1, 2)], 'right'), - ([(0, 1), (2, 3), (3, 4)], 'both')]) + @pytest.mark.parametrize( + 
"tuples, closed", + [ + ([(0, 2), (1, 3), (3, 4)], "neither"), + ([(0, 5), (1, 4), (6, 7)], "left"), + ([(0, 1), (0, 1), (1, 2)], "right"), + ([(0, 1), (2, 3), (3, 4)], "both"), + ], + ) def test_get_indexer_errors(self, tuples, closed): # IntervalIndex needs non-overlapping for uniqueness when querying index = IntervalIndex.from_tuples(tuples, closed=closed) - msg = ('cannot handle overlapping indices; use ' - 'IntervalIndex.get_indexer_non_unique') + msg = ( + "cannot handle overlapping indices; use " + "IntervalIndex.get_indexer_non_unique" + ) with pytest.raises(InvalidIndexError, match=msg): index.get_indexer([0, 2]) - @pytest.mark.parametrize('query, expected', [ - ([-0.5], ([-1], [0])), - ([0], ([0], [])), - ([0.5], ([0], [])), - ([1], ([0, 1], [])), - ([1.5], ([0, 1], [])), - ([2], ([0, 1, 2], [])), - ([2.5], ([1, 2], [])), - ([3], ([2], [])), - ([3.5], ([2], [])), - ([4], ([-1], [0])), - ([4.5], ([-1], [0])), - ([1, 2], ([0, 1, 0, 1, 2], [])), - ([1, 2, 3], ([0, 1, 0, 1, 2, 2], [])), - ([1, 2, 3, 4], ([0, 1, 0, 1, 2, 2, -1], [3])), - ([1, 2, 3, 4, 2], ([0, 1, 0, 1, 2, 2, -1, 0, 1, 2], [3]))]) + @pytest.mark.parametrize( + "query, expected", + [ + ([-0.5], ([-1], [0])), + ([0], ([0], [])), + ([0.5], ([0], [])), + ([1], ([0, 1], [])), + ([1.5], ([0, 1], [])), + ([2], ([0, 1, 2], [])), + ([2.5], ([1, 2], [])), + ([3], ([2], [])), + ([3.5], ([2], [])), + ([4], ([-1], [0])), + ([4.5], ([-1], [0])), + ([1, 2], ([0, 1, 0, 1, 2], [])), + ([1, 2, 3], ([0, 1, 0, 1, 2, 2], [])), + ([1, 2, 3, 4], ([0, 1, 0, 1, 2, 2, -1], [3])), + ([1, 2, 3, 4, 2], ([0, 1, 0, 1, 2, 2, -1, 0, 1, 2], [3])), + ], + ) def test_get_indexer_non_unique_with_int_and_float(self, query, expected): tuples = [(0, 2.5), (1, 3), (2, 4)] - index = IntervalIndex.from_tuples(tuples, closed='left') + index = IntervalIndex.from_tuples(tuples, closed="left") result_indexer, result_missing = index.get_indexer_non_unique(query) - expected_indexer = np.array(expected[0], dtype='intp') - expected_missing = np.array(expected[1], dtype='intp') + expected_indexer = np.array(expected[0], dtype="intp") + expected_missing = np.array(expected[1], dtype="intp") tm.assert_numpy_array_equal(result_indexer, expected_indexer) tm.assert_numpy_array_equal(result_missing, expected_missing) @@ -244,17 +253,17 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): def test_contains_dunder(self): - index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right') + index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right") # __contains__ requires perfect matches to intervals. 
assert 0 not in index assert 1 not in index assert 2 not in index - assert Interval(0, 1, closed='right') in index - assert Interval(0, 2, closed='right') not in index - assert Interval(0, 0.5, closed='right') not in index - assert Interval(3, 5, closed='right') not in index - assert Interval(-1, 0, closed='left') not in index - assert Interval(0, 1, closed='left') not in index - assert Interval(0, 1, closed='both') not in index + assert Interval(0, 1, closed="right") in index + assert Interval(0, 2, closed="right") not in index + assert Interval(0, 0.5, closed="right") not in index + assert Interval(3, 5, closed="right") not in index + assert Interval(-1, 0, closed="left") not in index + assert Interval(0, 1, closed="left") not in index + assert Interval(0, 1, closed="both") not in index diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 572fe5fbad1005..b102444b4ec9c4 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -6,22 +6,27 @@ from pandas.core.dtypes.common import is_integer from pandas import ( - DateOffset, Interval, IntervalIndex, Timedelta, Timestamp, date_range, - interval_range, timedelta_range) + DateOffset, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + interval_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.tseries.offsets import Day -@pytest.fixture(scope='class', params=[None, 'foo']) +@pytest.fixture(scope="class", params=[None, "foo"]) def name(request): return request.param class TestIntervalRange: - - @pytest.mark.parametrize('freq, periods', [ - (1, 100), (2.5, 40), (5, 20), (25, 4)]) + @pytest.mark.parametrize("freq, periods", [(1, 100), (2.5, 40), (5, 20), (25, 4)]) def test_constructor_numeric(self, closed, name, freq, periods): start, end = 0, 100 breaks = np.arange(101, step=freq) @@ -29,106 +34,127 @@ def test_constructor_numeric(self, closed, name, freq, periods): # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods result = interval_range( - start=start, end=end, periods=periods, name=name, closed=closed) + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) - @pytest.mark.parametrize('freq, periods', [ - ('D', 364), ('2D', 182), ('22D18H', 16), ('M', 11)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + @pytest.mark.parametrize( + "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): - start, end = Timestamp('20180101', tz=tz), Timestamp('20181231', tz=tz) + start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) breaks = date_range(start=start, 
end=end, freq=freq) expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods if not breaks.freq.isAnchored() and tz is None: # matches expected only for non-anchored offsets and tz naive # (anchored/DST transitions cause unequal spacing in expected) - result = interval_range(start=start, end=end, periods=periods, - name=name, closed=closed) + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq, periods', [ - ('D', 100), ('2D12H', 40), ('5D', 20), ('25D', 4)]) + @pytest.mark.parametrize( + "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + ) def test_constructor_timedelta(self, closed, name, freq, periods): - start, end = Timedelta('0 days'), Timedelta('100 days') + start, end = Timedelta("0 days"), Timedelta("100 days") breaks = timedelta_range(start=start, end=end, freq=freq) expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods result = interval_range( - start=start, end=end, periods=periods, name=name, closed=closed) + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start, end, freq, expected_endpoint', [ - (0, 10, 3, 9), - (0, 10, 1.5, 9), - (0.5, 10, 3, 9.5), - (Timedelta('0D'), Timedelta('10D'), '2D4H', Timedelta('8D16H')), - (Timestamp('2018-01-01'), - Timestamp('2018-02-09'), - 'MS', - Timestamp('2018-02-01')), - (Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-20', tz='US/Eastern'), - '5D12H', - Timestamp('2018-01-17 12:00:00', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "start, end, freq, expected_endpoint", + [ + (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), + (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + ( + Timestamp("2018-01-01"), + Timestamp("2018-02-09"), + "MS", + Timestamp("2018-02-01"), + ), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-20", tz="US/Eastern"), + "5D12H", + 
Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), + ), + ], + ) def test_early_truncation(self, start, end, freq, expected_endpoint): # index truncates early if freq causes end to be skipped result = interval_range(start=start, end=end, freq=freq) result_endpoint = result.right[-1] assert result_endpoint == expected_endpoint - @pytest.mark.parametrize('start, end, freq', [ - (0.5, None, None), - (None, 4.5, None), - (0.5, None, 1.5), - (None, 6.5, 1.5)]) + @pytest.mark.parametrize( + "start, end, freq", + [(0.5, None, None), (None, 4.5, None), (0.5, None, 1.5), (None, 6.5, 1.5)], + ) def test_no_invalid_float_truncation(self, start, end, freq): # GH 21161 if freq is None: @@ -140,13 +166,21 @@ def test_no_invalid_float_truncation(self, start, end, freq): result = interval_range(start=start, end=end, periods=4, freq=freq) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start, mid, end', [ - (Timestamp('2018-03-10', tz='US/Eastern'), - Timestamp('2018-03-10 23:30:00', tz='US/Eastern'), - Timestamp('2018-03-12', tz='US/Eastern')), - (Timestamp('2018-11-03', tz='US/Eastern'), - Timestamp('2018-11-04 00:30:00', tz='US/Eastern'), - Timestamp('2018-11-05', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "start, mid, end", + [ + ( + Timestamp("2018-03-10", tz="US/Eastern"), + Timestamp("2018-03-10 23:30:00", tz="US/Eastern"), + Timestamp("2018-03-12", tz="US/Eastern"), + ), + ( + Timestamp("2018-11-03", tz="US/Eastern"), + Timestamp("2018-11-04 00:30:00", tz="US/Eastern"), + Timestamp("2018-11-05", tz="US/Eastern"), + ), + ], + ) def test_linspace_dst_transition(self, start, mid, end): # GH 20976: linspace behavior defined from start/end/periods # accounts for the hour gained/lost during DST transition @@ -154,9 +188,9 @@ def test_linspace_dst_transition(self, start, mid, end): expected = IntervalIndex.from_breaks([start, mid, end]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', [2, 2.0]) - @pytest.mark.parametrize('end', [10, 10.0]) - @pytest.mark.parametrize('start', [0, 0.0]) + @pytest.mark.parametrize("freq", [2, 2.0]) + @pytest.mark.parametrize("end", [10, 10.0]) + @pytest.mark.parametrize("start", [0, 0.0]) def test_float_subtype(self, start, end, freq): # Has float subtype if any of start/end/freq are float, even if all # resulting endpoints can safely be upcast to integers @@ -164,25 +198,25 @@ def test_float_subtype(self, start, end, freq): # defined from start/end/freq index = interval_range(start=start, end=end, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(start + end + freq) else 'float64' + expected = "int64" if is_integer(start + end + freq) else "float64" assert result == expected # defined from start/periods/freq index = interval_range(start=start, periods=5, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(start + freq) else 'float64' + expected = "int64" if is_integer(start + freq) else "float64" assert result == expected # defined from end/periods/freq index = interval_range(end=end, periods=5, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(end + freq) else 'float64' + expected = "int64" if is_integer(end + freq) else "float64" assert result == expected # GH 20976: linspace behavior defined from start/end/periods index = interval_range(start=start, end=end, periods=5) result = index.dtype.subtype - expected = 'int64' if is_integer(start + end) else 'float64' + expected = "int64" if is_integer(start + end) else "float64" assert result == expected 
def test_constructor_coverage(self): @@ -192,19 +226,23 @@ def test_constructor_coverage(self): tm.assert_index_equal(result, expected) # equivalent timestamp-like start/end - start, end = Timestamp('2017-01-01'), Timestamp('2017-01-15') + start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") expected = interval_range(start=start, end=end) - result = interval_range(start=start.to_pydatetime(), - end=end.to_pydatetime()) + result = interval_range(start=start.to_pydatetime(), end=end.to_pydatetime()) tm.assert_index_equal(result, expected) result = interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) # equivalent freq with timestamp - equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1), - DateOffset(days=1)] + equiv_freq = [ + "D", + Day(), + Timedelta(days=1), + timedelta(days=1), + DateOffset(days=1), + ] for freq in equiv_freq: result = interval_range(start=start, end=end, freq=freq) tm.assert_index_equal(result, expected) @@ -213,23 +251,24 @@ def test_constructor_coverage(self): start, end = Timedelta(days=1), Timedelta(days=10) expected = interval_range(start=start, end=end) - result = interval_range(start=start.to_pytimedelta(), - end=end.to_pytimedelta()) + result = interval_range(start=start.to_pytimedelta(), end=end.to_pytimedelta()) tm.assert_index_equal(result, expected) result = interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) # equivalent freq with timedelta - equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1)] + equiv_freq = ["D", Day(), Timedelta(days=1), timedelta(days=1)] for freq in equiv_freq: result = interval_range(start=start, end=end, freq=freq) tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the four parameters: start, end, periods, and freq, ' - 'exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): interval_range(start=0) @@ -248,67 +287,69 @@ def test_errors(self): interval_range(start=0, end=5, periods=6, freq=1.5) # mixed units - msg = 'start, end, freq need to be type compatible' + msg = "start, end, freq need to be type compatible" with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=Timestamp('20130101'), freq=2) + interval_range(start=0, end=Timestamp("20130101"), freq=2) with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=Timedelta('1 day'), freq=2) + interval_range(start=0, end=Timedelta("1 day"), freq=2) with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=10, freq='D') + interval_range(start=0, end=10, freq="D") with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), end=10, freq='D') + interval_range(start=Timestamp("20130101"), end=10, freq="D") with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), - end=Timedelta('1 day'), freq='D') + interval_range( + start=Timestamp("20130101"), end=Timedelta("1 day"), freq="D" + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), - end=Timestamp('20130110'), freq=2) + interval_range( + start=Timestamp("20130101"), end=Timestamp("20130110"), freq=2 + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timedelta('1 day'), end=10, freq='D') + interval_range(start=Timedelta("1 day"), end=10, freq="D") with pytest.raises(TypeError, match=msg): - 
interval_range(start=Timedelta('1 day'), - end=Timestamp('20130110'), freq='D') + interval_range( + start=Timedelta("1 day"), end=Timestamp("20130110"), freq="D" + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timedelta('1 day'), - end=Timedelta('10 days'), freq=2) + interval_range(start=Timedelta("1 day"), end=Timedelta("10 days"), freq=2) # invalid periods - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - interval_range(start=0, periods='foo') + interval_range(start=0, periods="foo") # invalid start - msg = 'start must be numeric or datetime-like, got foo' + msg = "start must be numeric or datetime-like, got foo" with pytest.raises(ValueError, match=msg): - interval_range(start='foo', periods=10) + interval_range(start="foo", periods=10) # invalid end - msg = r'end must be numeric or datetime-like, got \(0, 1\]' + msg = r"end must be numeric or datetime-like, got \(0, 1\]" with pytest.raises(ValueError, match=msg): interval_range(end=Interval(0, 1), periods=10) # invalid freq for datetime-like - msg = 'freq must be numeric or convertible to DateOffset, got foo' + msg = "freq must be numeric or convertible to DateOffset, got foo" with pytest.raises(ValueError, match=msg): - interval_range(start=0, end=10, freq='foo') + interval_range(start=0, end=10, freq="foo") with pytest.raises(ValueError, match=msg): - interval_range(start=Timestamp('20130101'), periods=10, freq='foo') + interval_range(start=Timestamp("20130101"), periods=10, freq="foo") with pytest.raises(ValueError, match=msg): - interval_range(end=Timedelta('1 day'), periods=10, freq='foo') + interval_range(end=Timedelta("1 day"), periods=10, freq="foo") # mixed tz - start = Timestamp('2017-01-01', tz='US/Eastern') - end = Timestamp('2017-01-07', tz='US/Pacific') - msg = 'Start and end cannot both be tz-aware with different timezones' + start = Timestamp("2017-01-01", tz="US/Eastern") + end = Timestamp("2017-01-07", tz="US/Pacific") + msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(TypeError, match=msg): interval_range(start=start, end=end) diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index a3868a5675177c..b7104242b5ccc6 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -14,13 +14,15 @@ def skipif_32bit(param): Skip parameters in a parametrize on 32bit systems. Specifically used here to skip leaf_size parameters related to GH 23440. 
""" - marks = pytest.mark.skipif(compat.is_platform_32bit(), - reason='GH 23440: int type mismatch on 32bit') + marks = pytest.mark.skipif( + compat.is_platform_32bit(), reason="GH 23440: int type mismatch on 32bit" + ) return pytest.param(param, marks=marks) @pytest.fixture( - scope='class', params=['int32', 'int64', 'float32', 'float64', 'uint64']) + scope="class", params=["int32", "int64", "float32", "float64", "uint64"] +) def dtype(request): return request.param @@ -34,28 +36,30 @@ def leaf_size(request): return request.param -@pytest.fixture(params=[ - np.arange(5, dtype='int64'), - np.arange(5, dtype='int32'), - np.arange(5, dtype='uint64'), - np.arange(5, dtype='float64'), - np.arange(5, dtype='float32'), - np.array([0, 1, 2, 3, 4, np.nan], dtype='float64'), - np.array([0, 1, 2, 3, 4, np.nan], dtype='float32')]) +@pytest.fixture( + params=[ + np.arange(5, dtype="int64"), + np.arange(5, dtype="int32"), + np.arange(5, dtype="uint64"), + np.arange(5, dtype="float64"), + np.arange(5, dtype="float32"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"), + ] +) def tree(request, leaf_size): left = request.param return IntervalTree(left, left + 2, leaf_size=leaf_size) class TestIntervalTree: - def test_get_loc(self, tree): result = tree.get_loc(1) - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(tree.get_loc(2)) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): @@ -63,30 +67,29 @@ def test_get_loc(self, tree): def test_get_indexer(self, tree): result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) - expected = np.array([0, 4, -1], dtype='intp') + expected = np.array([0, 4, -1], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): tree.get_indexer(np.array([3.0])) def test_get_indexer_non_unique(self, tree): - indexer, missing = tree.get_indexer_non_unique( - np.array([1.0, 2.0, 6.5])) + indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) result = indexer[:1] - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(indexer[1:3]) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(indexer[3:]) - expected = np.array([-1], dtype='intp') + expected = np.array([-1], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = missing - expected = np.array([2], dtype='intp') + expected = np.array([2], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_duplicates(self, dtype): @@ -94,7 +97,7 @@ def test_duplicates(self, dtype): tree = IntervalTree(left, left + 1) result = np.sort(tree.get_loc(0.5)) - expected = np.array([0, 1, 2], dtype='intp') + expected = np.array([0, 1, 2], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): @@ -102,31 +105,31 @@ def test_duplicates(self, dtype): indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) result = np.sort(indexer) - expected = np.array([0, 1, 2], dtype='intp') + expected = np.array([0, 1, 2], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = missing - expected = np.array([], dtype='intp') + expected = np.array([], dtype="intp") 
tm.assert_numpy_array_equal(result, expected) def test_get_loc_closed(self, closed): tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), - (1, tree.open_right)]: + for p, errors in [(0, tree.open_left), (1, tree.open_right)]: if errors: with pytest.raises(KeyError): tree.get_loc(p) else: result = tree.get_loc(p) - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('leaf_size', [ - skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000]) + @pytest.mark.parametrize( + "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] + ) def test_get_indexer_closed(self, closed, leaf_size): - x = np.arange(1000, dtype='float64') - found = x.astype('intp') - not_found = (-1 * np.ones(1000)).astype('intp') + x = np.arange(1000, dtype="float64") + found = x.astype("intp") + not_found = (-1 * np.ones(1000)).astype("intp") tree = IntervalTree(x, x + 0.5, closed=closed, leaf_size=leaf_size) tm.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) @@ -137,40 +140,48 @@ def test_get_indexer_closed(self, closed, leaf_size): expected = found if tree.closed_right else not_found tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) - @pytest.mark.parametrize('left, right, expected', [ - (np.array([0, 1, 4]), np.array([2, 3, 5]), True), - (np.array([0, 1, 2]), np.array([5, 4, 3]), True), - (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), - (np.array([0, 2, 4]), np.array([1, 3, 5]), False), - (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False)]) - @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + @pytest.mark.parametrize( + "left, right, expected", + [ + (np.array([0, 1, 4]), np.array([2, 3, 5]), True), + (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), + (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), + ], + ) + @pytest.mark.parametrize("order", map(list, permutations(range(3)))) def test_is_overlapping(self, closed, order, left, right, expected): # GH 23309 tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping assert result is expected - @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + @pytest.mark.parametrize("order", map(list, permutations(range(3)))) def test_is_overlapping_endpoints(self, closed, order): """shared endpoints are marked as overlapping""" # GH 23309 left, right = np.arange(3), np.arange(1, 4) tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping - expected = closed == 'both' + expected = closed == "both" assert result is expected - @pytest.mark.parametrize('left, right', [ - (np.array([], dtype='int64'), np.array([], dtype='int64')), - (np.array([0], dtype='int64'), np.array([1], dtype='int64')), - (np.array([np.nan]), np.array([np.nan])), - (np.array([np.nan] * 3), np.array([np.nan] * 3))]) + @pytest.mark.parametrize( + "left, right", + [ + (np.array([], dtype="int64"), np.array([], dtype="int64")), + (np.array([0], dtype="int64"), np.array([1], dtype="int64")), + (np.array([np.nan]), np.array([np.nan])), + (np.array([np.nan] * 3), np.array([np.nan] * 3)), + ], + ) def test_is_overlapping_trivial(self, closed, left, right): # GH 23309 tree = IntervalTree(left, right, closed=closed) assert tree.is_overlapping is False - 
@pytest.mark.skipif(compat.is_platform_32bit(), reason='GH 23440') + @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") def test_construction_overflow(self): # GH 25485 left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 9ab0d15cbe6a36..89e733c30b1e31 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -5,7 +5,7 @@ import pandas.util.testing as tm -@pytest.fixture(scope='class', params=[None, 'foo']) +@pytest.fixture(scope="class", params=[None, "foo"]) def name(request): return request.param @@ -15,17 +15,15 @@ def sort(request): return request.param -def monotonic_index(start, end, dtype='int64', closed='right'): - return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), - closed=closed) +def monotonic_index(start, end, dtype="int64", closed="right"): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) -def empty_index(dtype='int64', closed='right'): +def empty_index(dtype="int64", closed="right"): return IntervalIndex(np.array([], dtype=dtype), closed=closed) class TestIntervalIndex: - def test_union(self, closed, sort): index = monotonic_index(0, 11, closed=closed) other = monotonic_index(5, 13, closed=closed) @@ -45,12 +43,12 @@ def test_union(self, closed, sort): tm.assert_index_equal(index.union(index[:1], sort=sort), index) # GH 19101: empty result, same dtype - index = empty_index(dtype='int64', closed=closed) + index = empty_index(dtype="int64", closed=closed) result = index.union(index, sort=sort) tm.assert_index_equal(result, index) # GH 19101: empty result, different dtypes - other = empty_index(dtype='float64', closed=closed) + other = empty_index(dtype="float64", closed=closed) result = index.union(other, sort=sort) tm.assert_index_equal(result, index) @@ -73,12 +71,12 @@ def test_intersection(self, closed, sort): # GH 19101: empty result, same dtype other = monotonic_index(300, 314, closed=closed) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes - other = monotonic_index(300, 314, dtype='float64', closed=closed) + other = monotonic_index(300, 314, dtype="float64", closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) @@ -111,9 +109,7 @@ def test_intersection(self, closed, sort): tm.assert_index_equal(result, expected) def test_difference(self, closed, sort): - index = IntervalIndex.from_arrays([1, 0, 3, 2], - [1, 2, 3, 4], - closed=closed) + index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) result = index.difference(index[:1], sort=sort) expected = index[1:] if sort is None: @@ -122,12 +118,13 @@ def test_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.difference(index, sort=sort) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) result = index.difference(other, sort=sort) tm.assert_index_equal(result, 
expected) @@ -141,19 +138,21 @@ def test_symmetric_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) result = index.symmetric_difference(other, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('op_name', [ - 'union', 'intersection', 'difference', 'symmetric_difference']) + @pytest.mark.parametrize( + "op_name", ["union", "intersection", "difference", "symmetric_difference"] + ) @pytest.mark.parametrize("sort", [None, False]) def test_set_incompatible_types(self, closed, op_name, sort): index = monotonic_index(0, 11, closed=closed) @@ -161,24 +160,28 @@ def test_set_incompatible_types(self, closed, op_name, sort): # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - if op_name == 'difference': + if op_name == "difference": expected = index else: - expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + expected = getattr(index.astype("O"), op_name)(Index([1, 2, 3])) result = set_op(Index([1, 2, 3]), sort=sort) tm.assert_index_equal(result, expected) # mixed closed - msg = ('can only do set operations between two IntervalIndex objects ' - 'that are closed on the same side') - for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: + msg = ( + "can only do set operations between two IntervalIndex objects " + "that are closed on the same side" + ) + for other_closed in {"right", "left", "both", "neither"} - {closed}: other = monotonic_index(0, 11, closed=other_closed) with pytest.raises(ValueError, match=msg): set_op(other, sort=sort) # GH 19016: incompatible dtypes - other = interval_range(Timestamp('20180101'), periods=9, closed=closed) - msg = ('can only do {op} between two IntervalIndex objects that have ' - 'compatible dtypes').format(op=op_name) + other = interval_range(Timestamp("20180101"), periods=9, closed=closed) + msg = ( + "can only do {op} between two IntervalIndex objects that have " + "compatible dtypes" + ).format(op=op_name) with pytest.raises(TypeError, match=msg): set_op(other, sort=sort) diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 307772347e8f5b..acaea4ff96ff53 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -9,30 +9,36 @@ def idx(): # a MultiIndex used to test the general functionality of the # general functionality of this object - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ['first', 'second'] - mi = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, verify_integrity=False) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) return mi @pytest.fixture def 
idx_dup(): # compare tests/indexes/multi/conftest.py - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 1, 0, 1, 1]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ['first', 'second'] - mi = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, verify_integrity=False) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) return mi @@ -40,7 +46,7 @@ def idx_dup(): def index_names(): # names that match those in the idx fixture for testing equality of # names assigned to the idx - return ['first', 'second'] + return ["first", "second"] @pytest.fixture @@ -52,7 +58,7 @@ def holder(): @pytest.fixture def compat_props(): # a MultiIndex must have these properties associated with it - return ['shape', 'ndim', 'size'] + return ["shape", "ndim", "size"] @pytest.fixture @@ -61,10 +67,9 @@ def narrow_multi_index(): Return a MultiIndex that is narrower than the display (<80 characters). """ n = 1000 - ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) - dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) - return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], - names=['a', 'b', 'dti']) + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) @pytest.fixture @@ -73,8 +78,8 @@ def wide_multi_index(): Return a MultiIndex that is wider than the display (>80 characters). """ n = 1000 - ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) - dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) levels = [ci, ci.codes + 9, dti, dti, dti] - names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3'] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] return pd.MultiIndex.from_arrays(levels, names=names) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index f886d78da6da24..7f5d57db8da886 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -37,11 +37,12 @@ def test_truncate(): major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) result = index.truncate(before=1) - assert 'foo' not in result.levels[0] + assert "foo" not in result.levels[0] assert 1 in result.levels[0] result = index.truncate(after=1) @@ -57,16 +58,16 @@ def test_truncate(): def test_where(): - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) msg = r"\.where is not supported for MultiIndex operations" with pytest.raises(NotImplementedError, match=msg): i.where(True) -@pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) +@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where_array_like(klass): - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) cond = [False, True] msg = 
r"\.where is not supported for MultiIndex operations" with pytest.raises(NotImplementedError, match=msg): @@ -78,19 +79,17 @@ def test_where_array_like(klass): def test_reorder_levels(idx): # this blows up - with pytest.raises(IndexError, match='^Too many levels'): + with pytest.raises(IndexError, match="^Too many levels"): idx.reorder_levels([2, 1, 0]) def test_numpy_repeat(): reps = 2 numbers = [1, 2, 3] - names = np.array(['foo', 'bar']) + names = np.array(["foo", "bar"]) - m = MultiIndex.from_product([ - numbers, names], names=names) - expected = MultiIndex.from_product([ - numbers, names.repeat(reps)], names=names) + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) tm.assert_index_equal(np.repeat(m, reps), expected) msg = "the 'axis' parameter is not supported" @@ -100,36 +99,50 @@ def test_numpy_repeat(): def test_append_mixed_dtypes(): # GH 13660 - dti = date_range('2011-01-01', freq='M', periods=3, ) - dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern') - pi = period_range('2011-01', freq='M', periods=3) - - mi = MultiIndex.from_arrays([[1, 2, 3], - [1.1, np.nan, 3.3], - ['a', 'b', 'c'], - dti, dti_tz, pi]) + dti = date_range("2011-01-01", freq="M", periods=3) + dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + pi = period_range("2011-01", freq="M", periods=3) + + mi = MultiIndex.from_arrays( + [[1, 2, 3], [1.1, np.nan, 3.3], ["a", "b", "c"], dti, dti_tz, pi] + ) assert mi.nlevels == 6 res = mi.append(mi) - exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], - [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], - ['a', 'b', 'c', 'a', 'b', 'c'], - dti.append(dti), - dti_tz.append(dti_tz), - pi.append(pi)]) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ["a", "b", "c", "a", "b", "c"], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi), + ] + ) tm.assert_index_equal(res, exp) - other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'], - ['x', 'y', 'z'], ['x', 'y', 'z'], - ['x', 'y', 'z'], ['x', 'y', 'z']]) + other = MultiIndex.from_arrays( + [ + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ] + ) res = mi.append(other) - exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], - [1.1, np.nan, 3.3, 'x', 'y', 'z'], - ['a', 'b', 'c', 'x', 'y', 'z'], - dti.append(pd.Index(['x', 'y', 'z'])), - dti_tz.append(pd.Index(['x', 'y', 'z'])), - pi.append(pd.Index(['x', 'y', 'z']))]) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, "x", "y", "z"], + [1.1, np.nan, 3.3, "x", "y", "z"], + ["a", "b", "c", "x", "y", "z"], + dti.append(pd.Index(["x", "y", "z"])), + dti_tz.append(pd.Index(["x", "y", "z"])), + pi.append(pd.Index(["x", "y", "z"])), + ] + ) tm.assert_index_equal(res, exp) @@ -162,41 +175,46 @@ def test_take_invalid_kwargs(idx): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_take_fill_value(): # GH 12631 - vals = [['A', 'B'], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]] - idx = pd.MultiIndex.from_product(vals, names=['str', 'dt']) + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) result = idx.take(np.array([1, 0, -1])) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - ('B', 
pd.Timestamp('2011-01-02'))] - expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - (np.nan, pd.NaT)] - expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + (np.nan, pd.NaT), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - ('B', pd.Timestamp('2011-01-02'))] - expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -209,8 +227,14 @@ def test_take_fill_value(): def test_iter(idx): result = list(idx) - expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] + expected = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] assert result == expected @@ -237,7 +261,7 @@ def test_map(idx): # we don't infer UInt64 if isinstance(index, pd.UInt64Index): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -249,7 +273,9 @@ def test_map(idx): "mapper", [ lambda values, idx: {i: e for e, i in zip(values, idx)}, - lambda values, idx: pd.Series(values, idx)]) + lambda values, idx: pd.Series(values, idx), + ], +) def test_map_dictlike(idx, mapper): if isinstance(idx, (pd.CategoricalIndex, pd.IntervalIndex)): @@ -259,7 +285,7 @@ def test_map_dictlike(idx, mapper): # we don't infer to UInt64 for a dict if isinstance(idx, pd.UInt64Index) and isinstance(identity, dict): - expected = idx.astype('int64') + expected = idx.astype("int64") else: expected = idx @@ -272,13 +298,34 @@ def test_map_dictlike(idx, mapper): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('func', [ - np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10, - np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin, - np.arccos, np.arctan, np.sinh, np.cosh, np.tanh, - np.arcsinh, np.arccosh, np.arctanh, np.deg2rad, - np.rad2deg -], ids=lambda func: func.__name__) +@pytest.mark.parametrize( + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda 
func: func.__name__, +) def test_numpy_ufuncs(idx, func): # test ufuncs of numpy. see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -288,18 +335,24 @@ def test_numpy_ufuncs(idx, func): msg = "'tuple' object has no attribute '{}'".format(func.__name__) else: expected_exception = TypeError - msg = ("loop of ufunc does not support argument 0 of type tuple which" - " has no callable {} method").format(func.__name__) + msg = ( + "loop of ufunc does not support argument 0 of type tuple which" + " has no callable {} method" + ).format(func.__name__) with pytest.raises(expected_exception, match=msg): func(idx) -@pytest.mark.parametrize('func', [ - np.isfinite, np.isinf, np.isnan, np.signbit -], ids=lambda func: func.__name__) +@pytest.mark.parametrize( + "func", + [np.isfinite, np.isinf, np.isnan, np.signbit], + ids=lambda func: func.__name__, +) def test_numpy_type_funcs(idx, func): - msg = ("ufunc '{}' not supported for the input types, and the inputs" - " could not be safely coerced to any supported types according to" - " the casting rule ''safe''").format(func.__name__) + msg = ( + "ufunc '{}' not supported for the input types, and the inputs" + " could not be safely coerced to any supported types according to" + " the casting rule ''safe''" + ).format(func.__name__) with pytest.raises(TypeError, match=msg): func(idx) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 5ee44fde730d77..4adcdd0112b264 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -8,7 +8,7 @@ def test_astype(idx): expected = idx.copy() - actual = idx.astype('O') + actual = idx.astype("O") assert_copy(actual.levels, expected.levels) assert_copy(actual.codes, expected.codes) assert [level.name for level in actual.levels] == list(expected.names) @@ -17,14 +17,14 @@ def test_astype(idx): idx.astype(np.dtype(int)) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_astype_category(idx, ordered): # GH 18630 - msg = '> 1 ndim Categorical are not supported at this time' + msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): idx.astype(CategoricalDtype(ordered=ordered)) if ordered is False: # dtype='category' defaults to ordered=False, so only test once with pytest.raises(NotImplementedError, match=msg): - idx.astype('category') + idx.astype("category") diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index bd194cab8e2049..b02f87dc4aacb8 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -45,8 +45,8 @@ def test_boolean_context_compat2(): # boolean context compat # GH7897 - i1 = MultiIndex.from_tuples([('A', 1), ('A', 2)]) - i2 = MultiIndex.from_tuples([('A', 1), ('A', 3)]) + i1 = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + i2 = MultiIndex.from_tuples([("A", 1), ("A", 3)]) common = i1.intersection(i2) with pytest.raises(ValueError): @@ -54,8 +54,8 @@ def test_boolean_context_compat2(): def test_inplace_mutation_resets_values(): - levels = [['a', 'b', 'c'], [4]] - levels2 = [[1, 2, 3], ['a']] + levels = [["a", "b", "c"], [4]] + levels2 = [[1, 2, 3], ["a"]] codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] mi1 = MultiIndex(levels=levels, codes=codes) @@ -82,7 +82,7 @@ def test_inplace_mutation_resets_values(): # Make sure label setting works too codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 
0]] exp_values = np.empty((6,), dtype=object) - exp_values[:] = [(1, 'a')] * 6 + exp_values[:] = [(1, "a")] * 6 # Must be 1d array of tuples assert exp_values.shape == (6,) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 7cab05660ac49e..1b6177ede30ec9 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -13,12 +13,13 @@ def test_constructor_single_level(): - result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + result = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) assert isinstance(result, MultiIndex) - expected = Index(['foo', 'bar', 'baz', 'qux'], name='first') + expected = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_index_equal(result.levels[0], expected) - assert result.names == ['first'] + assert result.names == ["first"] def test_constructor_no_levels(): @@ -35,18 +36,20 @@ def test_constructor_no_levels(): def test_constructor_nonhashable_names(): # GH 20527 - levels = [[1, 2], ['one', 'two']] + levels = [[1, 2], ["one", "two"]] codes = [[0, 0, 1, 1], [0, 1, 0, 1]] - names = (['foo'], ['bar']) + names = (["foo"], ["bar"]) msg = r"MultiIndex\.name must be a hashable type" with pytest.raises(TypeError, match=msg): MultiIndex(levels=levels, codes=codes, names=names) # With .rename() - mi = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=('foo', 'bar')) - renamed = [['foor'], ['barr']] + mi = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=("foo", "bar"), + ) + renamed = [["foor"], ["barr"]] with pytest.raises(TypeError, match=msg): mi.rename(names=renamed) @@ -63,70 +66,76 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." - " NOTE: this index is in an inconsistent state") + length_error = ( + r"On level 0, code max \(3\) >= length of level \(1\)\." + " NOTE: this index is in an inconsistent state" + ) label_error = r"Unequal code lengths: \[4, 2\]" code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. 
with pytest.raises(ValueError, match=length_error): - MultiIndex(levels=[['a'], ['b']], - codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) with pytest.raises(ValueError, match=label_error): - MultiIndex(levels=[['a'], ['b']], codes=[[0, 0, 0, 0], [0, 0]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 0, 0, 0], [0, 0]]) # external API with pytest.raises(ValueError, match=length_error): - idx.copy().set_levels([['a'], ['b']]) + idx.copy().set_levels([["a"], ["b"]]) with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) # test set_codes with verify_integrity=False # the setting should not raise any value error - idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], - verify_integrity=False) + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], verify_integrity=False) # code value smaller than -1 with pytest.raises(ValueError, match=code_value_error): - MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, -2], [0, 0]]) def test_na_levels(): # GH26408 # test if codes are re-assigned value -1 for levels # with mising values (NaN, NaT, None) - result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) - expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[-1, -1, -1, -1, 3, 4]]) + result = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[-1, -1, -1, -1, 3, 4]] + ) tm.assert_index_equal(result, expected) - result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[0, -1, 1, 2, 3, 4]]) - expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[-1, -1, 1, -1, 3, -1]]) + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[-1, -1, 1, -1, 3, -1]] + ) tm.assert_index_equal(result, expected) # verify set_levels and set_codes result = MultiIndex( - levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( - [[np.nan, 's', pd.NaT, 128, None]]) + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]] + ).set_levels([[np.nan, "s", pd.NaT, 128, None]]) tm.assert_index_equal(result, expected) result = MultiIndex( - levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[1, 2, 2, 2, 2, 2]]).set_codes( - [[0, -1, 1, 2, 3, 4]]) + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[1, 2, 2, 2, 2, 2]] + ).set_codes([[0, -1, 1, 2, 3, 4]]) tm.assert_index_equal(result, expected) def test_labels_deprecated(idx): # GH23752 with tm.assert_produces_warning(FutureWarning): - MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) + MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], + labels=[[0, 1, 2, 3]], + names=["first"], + ) with tm.assert_produces_warning(FutureWarning): idx.labels @@ -135,8 +144,7 @@ def test_copy_in_constructor(): levels = np.array(["a", "b", "c"]) codes = np.array([1, 1, 2, 0, 0, 1, 1]) val = codes[0] - mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], - copy=True) + mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], copy=True) assert mi.codes[0][0] == val codes[0] = 15 assert mi.codes[0][0] == val @@ -149,24 +157,27 @@ def test_copy_in_constructor(): # from_arrays # ---------------------------------------------------------------------------- def 
test_from_arrays(idx): - arrays = [np.asarray(lev).take(level_codes) - for lev, level_codes in zip(idx.levels, idx.codes)] + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] # list of arrays as input result = MultiIndex.from_arrays(arrays, names=idx.names) tm.assert_index_equal(result, idx) # infer correctly - result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')], - ['a', 'b']]) - assert result.levels[0].equals(Index([Timestamp('20130101')])) - assert result.levels[1].equals(Index(['a', 'b'])) + result = MultiIndex.from_arrays([[pd.NaT, Timestamp("20130101")], ["a", "b"]]) + assert result.levels[0].equals(Index([Timestamp("20130101")])) + assert result.levels[1].equals(Index(["a", "b"])) def test_from_arrays_iterator(idx): # GH 18434 - arrays = [np.asarray(lev).take(level_codes) - for lev, level_codes in zip(idx.levels, idx.codes)] + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] # iterator as input result = MultiIndex.from_arrays(iter(arrays), names=idx.names) @@ -179,8 +190,10 @@ def test_from_arrays_iterator(idx): def test_from_arrays_tuples(idx): - arrays = tuple(tuple(np.asarray(lev).take(level_codes)) - for lev, level_codes in zip(idx.levels, idx.codes)) + arrays = tuple( + tuple(np.asarray(lev).take(level_codes)) + for lev, level_codes in zip(idx.levels, idx.codes) + ) # tuple of tuples as input result = MultiIndex.from_arrays(arrays, names=idx.names) @@ -188,10 +201,8 @@ def test_from_arrays_tuples(idx): def test_from_arrays_index_series_datetimetz(): - idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') - idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3, - tz='Asia/Tokyo') + idx1 = pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3, tz="Asia/Tokyo") result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -204,8 +215,8 @@ def test_from_arrays_index_series_datetimetz(): def test_from_arrays_index_series_timedelta(): - idx1 = pd.timedelta_range('1 days', freq='D', periods=3) - idx2 = pd.timedelta_range('2 hours', freq='H', periods=3) + idx1 = pd.timedelta_range("1 days", freq="D", periods=3) + idx2 = pd.timedelta_range("2 hours", freq="H", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -218,8 +229,8 @@ def test_from_arrays_index_series_timedelta(): def test_from_arrays_index_series_period(): - idx1 = pd.period_range('2011-01-01', freq='D', periods=3) - idx2 = pd.period_range('2015-01-01', freq='H', periods=3) + idx1 = pd.period_range("2011-01-01", freq="D", periods=3) + idx2 = pd.period_range("2015-01-01", freq="H", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -232,11 +243,10 @@ def test_from_arrays_index_series_period(): def test_from_arrays_index_datetimelike_mixed(): - idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') - idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3) - idx3 = pd.timedelta_range('1 days', freq='D', periods=3) - idx4 = pd.period_range('2011-01-01', freq='D', periods=3) + idx1 = pd.date_range("2015-01-01 10:00", 
freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3) + idx3 = pd.timedelta_range("1 days", freq="D", periods=3) + idx4 = pd.period_range("2011-01-01", freq="D", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2, idx3, idx4]) tm.assert_index_equal(result.get_level_values(0), idx1) @@ -244,10 +254,9 @@ def test_from_arrays_index_datetimelike_mixed(): tm.assert_index_equal(result.get_level_values(2), idx3) tm.assert_index_equal(result.get_level_values(3), idx4) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), - pd.Series(idx2), - pd.Series(idx3), - pd.Series(idx4)]) + result2 = pd.MultiIndex.from_arrays( + [pd.Series(idx1), pd.Series(idx2), pd.Series(idx3), pd.Series(idx4)] + ) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) tm.assert_index_equal(result2.get_level_values(2), idx3) @@ -258,10 +267,8 @@ def test_from_arrays_index_datetimelike_mixed(): def test_from_arrays_index_series_categorical(): # GH13743 - idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=False) - idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=True) + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=True) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) @@ -283,41 +290,60 @@ def test_from_arrays_empty(): MultiIndex.from_arrays(arrays=[]) # 1 level - result = MultiIndex.from_arrays(arrays=[[]], names=['A']) + result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) assert isinstance(result, MultiIndex) - expected = Index([], name='A') + expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) # N levels for N in [2, 3]: arrays = [[]] * N - names = list('ABC')[:N] + names = list("ABC")[:N] result = MultiIndex.from_arrays(arrays=arrays, names=names) - expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, - names=names) + expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, names=names) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('invalid_sequence_of_arrays', [ - 1, [1], [1, 2], [[1], 2], [1, [2]], 'a', ['a'], ['a', 'b'], [['a'], 'b'], - (1,), (1, 2), ([1], 2), (1, [2]), 'a', ('a',), ('a', 'b'), (['a'], 'b'), - [(1,), 2], [1, (2,)], [('a',), 'b'], - ((1,), 2), (1, (2,)), (('a',), 'b') -]) +@pytest.mark.parametrize( + "invalid_sequence_of_arrays", + [ + 1, + [1], + [1, 2], + [[1], 2], + [1, [2]], + "a", + ["a"], + ["a", "b"], + [["a"], "b"], + (1,), + (1, 2), + ([1], 2), + (1, [2]), + "a", + ("a",), + ("a", "b"), + (["a"], "b"), + [(1,), 2], + [1, (2,)], + [("a",), "b"], + ((1,), 2), + (1, (2,)), + (("a",), "b"), + ], +) def test_from_arrays_invalid_input(invalid_sequence_of_arrays): msg = "Input must be a list / sequence of array-likes" with pytest.raises(TypeError, match=msg): MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays) -@pytest.mark.parametrize('idx1, idx2', [ - ([1, 2, 3], ['a', 'b']), - ([], ['a', 'b']), - ([1, 2, 3], []) -]) +@pytest.mark.parametrize( + "idx1, idx2", [([1, 2, 3], ["a", "b"]), ([], ["a", "b"]), ([1, 2, 3], [])] +) def test_from_arrays_different_lengths(idx1, idx2): # see gh-13599 - msg = '^all arrays must be same length$' + msg = "^all arrays must be same length$" with pytest.raises(ValueError, match=msg): MultiIndex.from_arrays([idx1, idx2]) @@ -326,40 +352,39 @@ def 
test_from_arrays_different_lengths(idx1, idx2): # from_tuples # ---------------------------------------------------------------------------- def test_from_tuples(): - msg = 'Cannot infer number of levels from empty list' + msg = "Cannot infer number of levels from empty list" with pytest.raises(TypeError, match=msg): MultiIndex.from_tuples([]) - expected = MultiIndex(levels=[[1, 3], [2, 4]], - codes=[[0, 1], [0, 1]], - names=['a', 'b']) + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) # input tuples - result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) + result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=["a", "b"]) tm.assert_index_equal(result, expected) def test_from_tuples_iterator(): # GH 18434 # input iterator for tuples - expected = MultiIndex(levels=[[1, 3], [2, 4]], - codes=[[0, 1], [0, 1]], - names=['a', 'b']) + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) - result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b']) + result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"]) tm.assert_index_equal(result, expected) # input non-iterables - msg = 'Input must be a list / sequence of tuple-likes.' + msg = "Input must be a list / sequence of tuple-likes." with pytest.raises(TypeError, match=msg): MultiIndex.from_tuples(0) def test_from_tuples_empty(): # GH 16777 - result = MultiIndex.from_tuples([], names=['a', 'b']) - expected = MultiIndex.from_arrays(arrays=[[], []], - names=['a', 'b']) + result = MultiIndex.from_tuples([], names=["a", "b"]) + expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) tm.assert_index_equal(result, expected) @@ -374,17 +399,18 @@ def test_tuples_with_name_string(): li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] msg = "Names should be list-like for a MultiIndex" with pytest.raises(ValueError, match=msg): - pd.Index(li, name='abc') + pd.Index(li, name="abc") with pytest.raises(ValueError, match=msg): - pd.Index(li, name='a') + pd.Index(li, name="a") def test_from_tuples_with_tuple_label(): # GH 15457 - expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], - columns=['a', 'b', 'c']).set_index(['a', 'b']) - idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) - result = pd.DataFrame([2, 3], columns=['c'], index=idx) + expected = pd.DataFrame( + [[2, 1, 2], [4, (1, 2), 3]], columns=["a", "b", "c"] + ).set_index(["a", "b"]) + idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=("a", "b")) + result = pd.DataFrame([2, 3], columns=["c"], index=idx) tm.assert_frame_equal(expected, result) @@ -399,79 +425,64 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): - result = MultiIndex.from_product([[]], names=['A']) - expected = pd.Index([], name='A') + result = MultiIndex.from_product([[]], names=["A"]) + expected = pd.Index([], name="A") tm.assert_index_equal(result.levels[0], expected) -@pytest.mark.parametrize('first, second', [ - ([], []), - (['foo', 'bar', 'baz'], []), - ([], ['a', 'b', 'c']), -]) +@pytest.mark.parametrize( + "first, second", [([], []), (["foo", "bar", "baz"], []), ([], ["a", "b", "c"])] +) def test_from_product_empty_two_levels(first, second): - names = ['A', 'B'] + names = ["A", "B"] result = MultiIndex.from_product([first, second], names=names) - expected = MultiIndex(levels=[first, second], - codes=[[], []], names=names) + expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) tm.assert_index_equal(result, expected) 
-@pytest.mark.parametrize('N', list(range(4))) +@pytest.mark.parametrize("N", list(range(4))) def test_from_product_empty_three_levels(N): # GH12258 - names = ['A', 'B', 'C'] + names = ["A", "B", "C"] lvl2 = list(range(N)) result = MultiIndex.from_product([[], lvl2, []], names=names) - expected = MultiIndex(levels=[[], lvl2, []], - codes=[[], [], []], names=names) + expected = MultiIndex(levels=[[], lvl2, []], codes=[[], [], []], names=names) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('invalid_input', [ - 1, - [1], - [1, 2], - [[1], 2], - 'a', - ['a'], - ['a', 'b'], - [['a'], 'b'], -]) +@pytest.mark.parametrize( + "invalid_input", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] +) def test_from_product_invalid_input(invalid_input): - msg = (r"Input must be a list / sequence of iterables|" - "Input must be list-like") + msg = r"Input must be a list / sequence of iterables|" "Input must be list-like" with pytest.raises(TypeError, match=msg): MultiIndex.from_product(iterables=invalid_input) def test_from_product_datetimeindex(): - dt_index = date_range('2000-01-01', periods=2) + dt_index = date_range("2000-01-01", periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) - etalon = construct_1d_object_array_from_listlike([ - (1, pd.Timestamp('2000-01-01')), - (1, pd.Timestamp('2000-01-02')), - (2, pd.Timestamp('2000-01-01')), - (2, pd.Timestamp('2000-01-02')), - ]) + etalon = construct_1d_object_array_from_listlike( + [ + (1, pd.Timestamp("2000-01-01")), + (1, pd.Timestamp("2000-01-02")), + (2, pd.Timestamp("2000-01-01")), + (2, pd.Timestamp("2000-01-02")), + ] + ) tm.assert_numpy_array_equal(mi.values, etalon) -@pytest.mark.parametrize('ordered', [False, True]) -@pytest.mark.parametrize('f', [ - lambda x: x, - lambda x: pd.Series(x), - lambda x: x.values -]) +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize("f", [lambda x: x, lambda x: pd.Series(x), lambda x: x.values]) def test_from_product_index_series_categorical(ordered, f): # GH13743 - first = ['foo', 'bar'] + first = ["foo", "bar"] - idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=ordered) - expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"), - categories=list("bac"), - ordered=ordered) + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=ordered) + expected = pd.CategoricalIndex( + list("abcaab") + list("abcaab"), categories=list("bac"), ordered=ordered + ) result = pd.MultiIndex.from_product([first, f(idx)]) tm.assert_index_equal(result.get_level_values(1), expected) @@ -479,14 +490,22 @@ def test_from_product_index_series_categorical(ordered, f): def test_from_product(): - first = ['foo', 'bar', 'buz'] - second = ['a', 'b', 'c'] - names = ['first', 'second'] + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] result = MultiIndex.from_product([first, second], names=names) - tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), - ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), - ('buz', 'c')] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] expected = MultiIndex.from_tuples(tuples, names=names) tm.assert_index_equal(result, expected) @@ -494,12 +513,20 @@ def test_from_product(): def test_from_product_iterator(): # GH 18434 - first = ['foo', 'bar', 'buz'] - second = ['a', 'b', 'c'] - names = ['first', 'second'] - tuples = [('foo', 
'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), - ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), - ('buz', 'c')] + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] expected = MultiIndex.from_tuples(tuples, names=names) # iterator as input @@ -517,28 +544,38 @@ def test_create_index_existing_name(idx): # GH11193, when an existing index is passed, and a new name is not # specified, the new index should inherit the previous object name index = idx - index.names = ['foo', 'bar'] + index.names = ["foo", "bar"] result = pd.Index(index) expected = Index( - Index([ - ('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object' + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", ), - names=['foo', 'bar'] + names=["foo", "bar"], ) tm.assert_index_equal(result, expected) - result = pd.Index(index, names=['A', 'B']) + result = pd.Index(index, names=["A", "B"]) expected = Index( - Index([ - ('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object' + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", ), - names=['A', 'B'] + names=["A", "B"], ) tm.assert_index_equal(result, expected) @@ -548,45 +585,56 @@ def test_create_index_existing_name(idx): # ---------------------------------------------------------------------------- def test_from_frame(): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=['L1', 'L2']) - expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), - ('b', 'a'), ('b', 'b')], - names=['L1', 'L2']) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=["L1", "L2"] + ) + expected = pd.MultiIndex.from_tuples( + [("a", "a"), ("a", "b"), ("b", "a"), ("b", "b")], names=["L1", "L2"] + ) result = pd.MultiIndex.from_frame(df) tm.assert_index_equal(expected, result) -@pytest.mark.parametrize('non_frame', [ - pd.Series([1, 2, 3, 4]), - [1, 2, 3, 4], - [[1, 2], [3, 4], [5, 6]], - pd.Index([1, 2, 3, 4]), - np.array([[1, 2], [3, 4], [5, 6]]), - 27 -]) +@pytest.mark.parametrize( + "non_frame", + [ + pd.Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + pd.Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 27, + ], +) def test_from_frame_error(non_frame): # GH 22420 - with pytest.raises(TypeError, match='Input must be a DataFrame'): + with pytest.raises(TypeError, match="Input must be a DataFrame"): pd.MultiIndex.from_frame(non_frame) def test_from_frame_dtype_fidelity(): # GH 22420 - df = pd.DataFrame(OrderedDict([ - ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), - ('a', [1, 1, 1, 2, 2, 2]), - ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), - ('c', ['x', 'x', 'y', 'z', 'x', 'y']) - ])) + df = pd.DataFrame( + OrderedDict( + [ + ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), + ("a", [1, 1, 1, 2, 2, 2]), + ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), + ("c", ["x", "x", "y", "z", "x", "y"]), + ] + ) + ) original_dtypes = df.dtypes.to_dict() - expected_mi = pd.MultiIndex.from_arrays([ - pd.date_range('19910905', periods=6, 
tz='US/Eastern'), - [1, 1, 1, 2, 2, 2], - pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - ['x', 'x', 'y', 'z', 'x', 'y'] - ], names=['dates', 'a', 'b', 'c']) + expected_mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) mi = pd.MultiIndex.from_frame(df) mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} @@ -594,28 +642,31 @@ def test_from_frame_dtype_fidelity(): assert original_dtypes == mi_dtypes -@pytest.mark.parametrize('names_in,names_out', [ - (None, [('L1', 'x'), ('L2', 'y')]), - (['x', 'y'], ['x', 'y']), -]) +@pytest.mark.parametrize( + "names_in,names_out", [(None, [("L1", "x"), ("L2", "y")]), (["x", "y"], ["x", "y"])] +) def test_from_frame_valid_names(names_in, names_out): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), - ('L2', 'y')])) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) mi = pd.MultiIndex.from_frame(df, names=names_in) assert mi.names == names_out -@pytest.mark.parametrize('names,expected_error_msg', [ - ('bad_input', "Names should be list-like for a MultiIndex"), - (['a', 'b', 'c'], - "Length of names must match number of levels in MultiIndex") -]) +@pytest.mark.parametrize( + "names,expected_error_msg", + [ + ("bad_input", "Names should be list-like for a MultiIndex"), + (["a", "b", "c"], "Length of names must match number of levels in MultiIndex"), + ], +) def test_from_frame_invalid_names(names, expected_error_msg): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), - ('L2', 'y')])) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) with pytest.raises(ValueError, match=expected_error_msg): pd.MultiIndex.from_frame(df, names=names) diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 4b6934d445fd0b..21b71613f00f0d 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -9,44 +9,42 @@ def test_contains_top_level(): - midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) - assert 'A' in midx - assert 'A' not in midx._engine + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + assert "A" in midx + assert "A" not in midx._engine def test_contains_with_nat(): # MI with a NaT - mi = MultiIndex(levels=[['C'], - pd.date_range('2012-01-01', periods=5)], - codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], - names=[None, 'B']) - assert ('C', pd.Timestamp('2012-01-01')) in mi + mi = MultiIndex( + levels=[["C"], pd.date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + assert ("C", pd.Timestamp("2012-01-01")) in mi for val in mi.values: assert val in mi def test_contains(idx): - assert ('foo', 'two') in idx - assert ('bar', 'two') not in idx + assert ("foo", "two") in idx + assert ("bar", "two") not in idx assert None not in idx @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") def test_isin_nan_pypy(): - idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) - 
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), - np.array([False, True])) - tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), - np.array([False, True])) + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, True]) + ) def test_isin(): - values = [('foo', 2), ('bar', 3), ('quux', 4)] + values = [("foo", 2), ("bar", 3), ("quux", 4)] - idx = MultiIndex.from_arrays([ - ['qux', 'baz', 'foo', 'bar'], - np.arange(4) - ]) + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) result = idx.isin(values) expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -60,18 +58,17 @@ def test_isin(): @pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") def test_isin_nan_not_pypy(): - idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) - tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), - np.array([False, False])) - tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), - np.array([False, False])) + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, False]) + ) def test_isin_level_kwarg(): - idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange( - 4)]) + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) - vals_0 = ['foo', 'bar', 'quux'] + vals_0 = ["foo", "bar", "quux"] vals_1 = [2, 3, 10] expected = np.array([False, False, True, True]) @@ -84,8 +81,7 @@ def test_isin_level_kwarg(): msg = "Too many levels: Index has only 2 levels, not 6" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=5) - msg = ("Too many levels: Index has only 2 levels, -5 is not a valid level" - " number") + msg = "Too many levels: Index has only 2 levels, -5 is not a valid level" " number" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=-5) @@ -94,11 +90,11 @@ def test_isin_level_kwarg(): with pytest.raises(KeyError, match=r"'Level -1\.0 not found'"): idx.isin(vals_1, level=-1.0) with pytest.raises(KeyError, match="'Level A not found'"): - idx.isin(vals_1, level='A') + idx.isin(vals_1, level="A") - idx.names = ['A', 'B'] - tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A')) - tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B')) + idx.names = ["A", "B"] + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level="A")) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level="B")) with pytest.raises(KeyError, match="'Level C not found'"): - idx.isin(vals_1, level='C') + idx.isin(vals_1, level="C") diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 877904a91b953b..3fc73dd05bc726 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -21,7 +21,7 @@ def test_to_numpy(idx): def test_to_frame(): - tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False) @@ -32,11 +32,11 @@ def test_to_frame(): expected.index = index tm.assert_frame_equal(result, expected) - tuples = [(1, 'one'), (1, 'two'), (2, 'one'), 
(2, 'two')] - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) result = index.to_frame(index=False) expected = DataFrame(tuples) - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) result = index.to_frame() @@ -45,31 +45,33 @@ def test_to_frame(): # See GH-22580 index = MultiIndex.from_tuples(tuples) - result = index.to_frame(index=False, name=['first', 'second']) + result = index.to_frame(index=False, name=["first", "second"]) expected = DataFrame(tuples) - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) - result = index.to_frame(name=['first', 'second']) + result = index.to_frame(name=["first", "second"]) expected.index = index - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) msg = "'name' must be a list / sequence of column names." with pytest.raises(TypeError, match=msg): - index.to_frame(name='first') + index.to_frame(name="first") msg = "'name' should have same length as number of levels on index." with pytest.raises(ValueError, match=msg): - index.to_frame(name=['first']) + index.to_frame(name=["first"]) # Tests for datetime index - index = MultiIndex.from_product([range(5), - pd.date_range('20130101', periods=3)]) + index = MultiIndex.from_product([range(5), pd.date_range("20130101", periods=3)]) result = index.to_frame(index=False) expected = DataFrame( - {0: np.repeat(np.arange(5, dtype='int64'), 3), - 1: np.tile(pd.date_range('20130101', periods=3), 5)}) + { + 0: np.repeat(np.arange(5, dtype="int64"), 3), + 1: np.tile(pd.date_range("20130101", periods=3), 5), + } + ) tm.assert_frame_equal(result, expected) result = index.to_frame() @@ -77,34 +79,43 @@ def test_to_frame(): tm.assert_frame_equal(result, expected) # See GH-22580 - result = index.to_frame(index=False, name=['first', 'second']) + result = index.to_frame(index=False, name=["first", "second"]) expected = DataFrame( - {'first': np.repeat(np.arange(5, dtype='int64'), 3), - 'second': np.tile(pd.date_range('20130101', periods=3), 5)}) + { + "first": np.repeat(np.arange(5, dtype="int64"), 3), + "second": np.tile(pd.date_range("20130101", periods=3), 5), + } + ) tm.assert_frame_equal(result, expected) - result = index.to_frame(name=['first', 'second']) + result = index.to_frame(name=["first", "second"]) expected.index = index tm.assert_frame_equal(result, expected) def test_to_frame_dtype_fidelity(): # GH 22420 - mi = pd.MultiIndex.from_arrays([ - pd.date_range('19910905', periods=6, tz='US/Eastern'), - [1, 1, 1, 2, 2, 2], - pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - ['x', 'x', 'y', 'z', 'x', 'y'] - ], names=['dates', 'a', 'b', 'c']) - original_dtypes = {name: mi.levels[i].dtype - for i, name in enumerate(mi.names)} - - expected_df = pd.DataFrame(OrderedDict([ - ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), - ('a', [1, 1, 1, 2, 2, 2]), - ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), - ('c', ['x', 'x', 'y', 'z', 'x', 'y']) - ])) + mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + original_dtypes = {name: 
mi.levels[i].dtype for i, name in enumerate(mi.names)} + + expected_df = pd.DataFrame( + OrderedDict( + [ + ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), + ("a", [1, 1, 1, 2, 2, 2]), + ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), + ("c", ["x", "x", "y", "z", "x", "y"]), + ] + ) + ) df = mi.to_frame(index=False) df_dtypes = df.dtypes.to_dict() @@ -114,48 +125,61 @@ def test_to_frame_dtype_fidelity(): def test_to_frame_resulting_column_order(): # GH 22420 - expected = ['z', 0, 'a'] - mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], - ['q', 'w', 'e']], names=expected) + expected = ["z", 0, "a"] + mi = pd.MultiIndex.from_arrays( + [["a", "b", "c"], ["x", "y", "z"], ["q", "w", "e"]], names=expected + ) result = mi.to_frame().columns.tolist() assert result == expected def test_to_hierarchical(): - index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( - 2, 'two')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + index = MultiIndex.from_tuples([(1, "one"), (1, "two"), (2, "one"), (2, "two")]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(3) - expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + expected = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[ + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1], + ], + ) tm.assert_index_equal(result, expected) assert result.names == index.names # K > 1 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(3, 2) - expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) + expected = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[ + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + ], + ) tm.assert_index_equal(result, expected) assert result.names == index.names # non-sorted - index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'), - (2, 'a'), (2, 'b')], - names=['N1', 'N2']) + index = MultiIndex.from_tuples( + [(2, "c"), (1, "b"), (2, "a"), (2, "b")], names=["N1", "N2"] + ) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(2) - expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'), - (1, 'b'), - (2, 'a'), (2, 'a'), - (2, 'b'), (2, 'b')], - names=['N1', 'N2']) + expected = MultiIndex.from_tuples( + [ + (2, "c"), + (2, "c"), + (1, "b"), + (1, "b"), + (2, "a"), + (2, "a"), + (2, "b"), + (2, "b"), + ], + names=["N1", "N2"], + ) tm.assert_index_equal(result, expected) assert result.names == index.names @@ -166,9 +190,9 @@ def test_roundtrip_pickle_with_tz(): # GH 8367 # round-trip of timezone index = MultiIndex.from_product( - [[1, 2], ['a', 'b'], date_range('20130101', periods=3, - tz='US/Eastern') - ], names=['one', 'two', 'three']) + [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")], + names=["one", "two", "three"], + ) unpickled = tm.round_trip_pickle(index) assert index.equal_levels(unpickled) @@ -178,7 +202,7 @@ def test_pickle(indices): unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) - original_name, indices.name = indices.name, 'foo' 
+ original_name, indices.name = indices.name, "foo" unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) indices.name = original_name @@ -205,7 +229,7 @@ def test_to_series_with_arguments(idx): # name kwarg idx = idx - s = idx.to_series(name='__test') + s = idx.to_series(name="__test") assert s.values is not idx.values assert s.index is not idx @@ -213,8 +237,16 @@ def test_to_series_with_arguments(idx): def test_to_flat_index(idx): - expected = pd.Index((('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')), - tupleize_cols=False) + expected = pd.Index( + ( + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ), + tupleize_cols=False, + ) result = idx.to_flat_index() tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 17e81a348f1865..35a5cccc0ec45d 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -46,46 +46,49 @@ def test_view(idx): assert_multiindex_copied(i_view, idx) -@pytest.mark.parametrize('func', [copy, deepcopy]) +@pytest.mark.parametrize("func", [copy, deepcopy]) def test_copy_and_deepcopy(func): idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) idx_copy = func(idx) assert idx_copy is not idx assert idx_copy.equals(idx) -@pytest.mark.parametrize('deep', [True, False]) +@pytest.mark.parametrize("deep", [True, False]) def test_copy_method(deep): idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) idx_copy = idx.copy(deep=deep) assert idx_copy.equals(idx) -@pytest.mark.parametrize('deep', [True, False]) -@pytest.mark.parametrize('kwarg, value', [ - ('names', ['thrid', 'fourth']), - ('levels', [['foo2', 'bar2'], ['fizz2', 'buzz2']]), - ('codes', [[1, 0, 0, 0], [1, 1, 0, 0]]) -]) +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize( + "kwarg, value", + [ + ("names", ["thrid", "fourth"]), + ("levels", [["foo2", "bar2"], ["fizz2", "buzz2"]]), + ("codes", [[1, 0, 0, 0], [1, 1, 0, 0]]), + ], +) def test_copy_method_kwargs(deep, kwarg, value): # gh-12309: Check that the "name" argument as well other kwargs are honored idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) return - idx_copy = idx.copy(**{kwarg: value, 'deep': deep}) - if kwarg == 'names': + idx_copy = idx.copy(**{kwarg: value, "deep": deep}) + if kwarg == "names": assert getattr(idx_copy, kwarg) == value else: assert [list(i) for i in getattr(idx_copy, kwarg)] == value diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 555ed948f82a13..2c24c5bd570856 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -9,78 +9,81 @@ def test_drop(idx): - dropped = idx.drop([('foo', 'two'), ('qux', 'one')]) + dropped = idx.drop([("foo", "two"), ("qux", "one")]) - index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')]) + index = MultiIndex.from_tuples([("foo", "two"), ("qux", "one")]) dropped2 = idx.drop(index) 
expected = idx[[0, 2, 3, 5]] tm.assert_index_equal(dropped, expected) tm.assert_index_equal(dropped2, expected) - dropped = idx.drop(['bar']) + dropped = idx.drop(["bar"]) expected = idx[[0, 1, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop('foo') + dropped = idx.drop("foo") expected = idx[[2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - index = MultiIndex.from_tuples([('bar', 'two')]) + index = MultiIndex.from_tuples([("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): - idx.drop([('bar', 'two')]) + idx.drop([("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): idx.drop(index) with pytest.raises(KeyError, match=r"^'two'$"): - idx.drop(['foo', 'two']) + idx.drop(["foo", "two"]) # partially correct argument - mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')]) + mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): idx.drop(mixed_index) # error='ignore' - dropped = idx.drop(index, errors='ignore') + dropped = idx.drop(index, errors="ignore") expected = idx[[0, 1, 2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop(mixed_index, errors='ignore') + dropped = idx.drop(mixed_index, errors="ignore") expected = idx[[0, 1, 2, 3, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop(['foo', 'two'], errors='ignore') + dropped = idx.drop(["foo", "two"], errors="ignore") expected = idx[[2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) # mixed partial / full drop - dropped = idx.drop(['foo', ('qux', 'one')]) + dropped = idx.drop(["foo", ("qux", "one")]) expected = idx[[2, 3, 5]] tm.assert_index_equal(dropped, expected) # mixed partial / full drop / error='ignore' - mixed_index = ['foo', ('qux', 'one'), 'two'] + mixed_index = ["foo", ("qux", "one"), "two"] with pytest.raises(KeyError, match=r"^'two'$"): idx.drop(mixed_index) - dropped = idx.drop(mixed_index, errors='ignore') + dropped = idx.drop(mixed_index, errors="ignore") expected = idx[[2, 3, 5]] tm.assert_index_equal(dropped, expected) def test_droplevel_with_names(idx): - index = idx[idx.get_loc('foo')] + index = idx[idx.get_loc("foo")] dropped = index.droplevel(0) - assert dropped.name == 'second' + assert dropped.name == "second" index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - names=['one', 'two', 'three']) + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) dropped = index.droplevel(0) - assert dropped.names == ('two', 'three') + assert dropped.names == ("two", "three") - dropped = index.droplevel('two') + dropped = index.droplevel("two") expected = index.droplevel(1) assert dropped.equals(expected) @@ -88,12 +91,15 @@ def test_droplevel_with_names(idx): def test_droplevel_list(): index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - names=['one', 'two', 'three']) - - dropped = index[:2].droplevel(['three', 'one']) + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + + dropped = index[:2].droplevel(["three", "one"]) expected = 
index[:2].droplevel(2).droplevel(0) assert dropped.equals(expected) @@ -101,27 +107,30 @@ def test_droplevel_list(): expected = index[:2] assert dropped.equals(expected) - msg = ("Cannot remove 3 levels from an index with 3 levels: at least one" - " level must be left") + msg = ( + "Cannot remove 3 levels from an index with 3 levels: at least one" + " level must be left" + ) with pytest.raises(ValueError, match=msg): - index[:2].droplevel(['one', 'two', 'three']) + index[:2].droplevel(["one", "two", "three"]) with pytest.raises(KeyError, match="'Level four not found'"): - index[:2].droplevel(['one', 'four']) + index[:2].droplevel(["one", "four"]) def test_drop_not_lexsorted(): # GH 12078 # define the lexsorted version of the multi-index - tuples = [('a', ''), ('b1', 'c1'), ('b2', 'c2')] - lexsorted_mi = MultiIndex.from_tuples(tuples, names=['b', 'c']) + tuples = [("a", ""), ("b1", "c1"), ("b2", "c2")] + lexsorted_mi = MultiIndex.from_tuples(tuples, names=["b", "c"]) assert lexsorted_mi.is_lexsorted() # and the not-lexsorted version - df = pd.DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]]) - df = df.pivot_table(index='a', columns=['b', 'c'], values='d') + df = pd.DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + df = df.pivot_table(index="a", columns=["b", "c"], values="d") df = df.reset_index() not_lexsorted_mi = df.columns assert not not_lexsorted_mi.is_lexsorted() @@ -129,5 +138,4 @@ def test_drop_not_lexsorted(): # compare the results tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): - tm.assert_index_equal(lexsorted_mi.drop('a'), - not_lexsorted_mi.drop('a')) + tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index b1eff00d07484a..518bd093b23b19 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('names', [None, ['first', 'second']]) +@pytest.mark.parametrize("names", [None, ["first", "second"]]) def test_unique(names): mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) @@ -17,15 +17,14 @@ def test_unique(names): exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names) tm.assert_index_equal(res, exp) - mi = MultiIndex.from_arrays([list('aaaa'), list('abab')], - names=names) + mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names) res = mi.unique() - exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names) + exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names) tm.assert_index_equal(res, exp) - mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names) + mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names) res = mi.unique() - exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names) + exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names) tm.assert_index_equal(res, exp) # GH #20568 - empty MI @@ -35,22 +34,24 @@ def test_unique(names): def test_unique_datetimelike(): - idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01', - '2015-01-01', 'NaT', 'NaT']) - idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02', - '2015-01-02', 'NaT', '2015-01-01'], - tz='Asia/Tokyo') + idx1 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", 
"NaT", "NaT"] + ) + idx2 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"], + tz="Asia/Tokyo", + ) result = MultiIndex.from_arrays([idx1, idx2]).unique() - eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT']) - eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02', - 'NaT', '2015-01-01'], - tz='Asia/Tokyo') + eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"]) + eidx2 = DatetimeIndex( + ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo" + ) exp = MultiIndex.from_arrays([eidx1, eidx2]) tm.assert_index_equal(result, exp) -@pytest.mark.parametrize('level', [0, 'first', 1, 'second']) +@pytest.mark.parametrize("level", [0, "first", 1, "second"]) def test_unique_level(idx, level): # GH #17896 - with level= argument result = idx.unique(level=level) @@ -58,19 +59,18 @@ def test_unique_level(idx, level): tm.assert_index_equal(result, expected) # With already unique level - mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], - names=['first', 'second']) + mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"]) result = mi.unique(level=level) expected = mi.get_level_values(level) tm.assert_index_equal(result, expected) # With empty MI - mi = MultiIndex.from_arrays([[], []], names=['first', 'second']) + mi = MultiIndex.from_arrays([[], []], names=["first", "second"]) result = mi.unique(level=level) expected = mi.get_level_values(level) -@pytest.mark.parametrize('dropna', [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) def test_get_unique_index(idx, dropna): mi = idx[[0, 1, 0, 1, 1, 0, 0]] expected = mi._shallow_copy(mi[[0, 1]]) @@ -84,18 +84,15 @@ def test_duplicate_multiindex_codes(): # GH 17464 # Make sure that a MultiIndex with duplicate levels throws a ValueError with pytest.raises(ValueError): - mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) + mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)]) # And that using set_levels with duplicate levels fails - mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], - [1, 2, 1, 2, 3]]) + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) with pytest.raises(ValueError): - mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], - inplace=True) + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) -@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], - [1, 'a', 1]]) +@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) def test_duplicate_level_names(names): # GH18872, GH19029 mi = MultiIndex.from_product([[0, 1]] * 3, names=names) @@ -115,14 +112,15 @@ def test_duplicate_level_names(names): def test_duplicate_meta_data(): # GH 10115 mi = MultiIndex( - levels=[[0, 1], [0, 1, 2]], - codes=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) - - for idx in [mi, - mi.set_names([None, None]), - mi.set_names([None, 'Num']), - mi.set_names(['Upper', 'Num']), ]: + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + + for idx in [ + mi, + mi.set_names([None, None]), + mi.set_names([None, "Num"]), + mi.set_names(["Upper", "Num"]), + ]: assert idx.has_duplicates assert idx.drop_duplicates().names == idx.names @@ -134,45 +132,49 @@ def test_has_duplicates(idx, idx_dup): assert idx_dup.is_unique is False assert idx_dup.has_duplicates is True - mi = MultiIndex(levels=[[0, 1], [0, 1, 2]], - codes=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], 
codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) assert mi.is_unique is False assert mi.has_duplicates is True # single instance of NaN - mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]], - codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]) + mi_nan = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]] + ) assert mi_nan.is_unique is True assert mi_nan.has_duplicates is False # multiple instances of NaN - mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]], - codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]) + mi_nan_dup = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]] + ) assert mi_nan_dup.is_unique is False assert mi_nan_dup.has_duplicates is True def test_has_duplicates_from_tuples(): # GH 9075 - t = [('x', 'out', 'z', 5, 'y', 'in', 'z', 169), - ('x', 'out', 'z', 7, 'y', 'in', 'z', 119), - ('x', 'out', 'z', 9, 'y', 'in', 'z', 135), - ('x', 'out', 'z', 13, 'y', 'in', 'z', 145), - ('x', 'out', 'z', 14, 'y', 'in', 'z', 158), - ('x', 'out', 'z', 16, 'y', 'in', 'z', 122), - ('x', 'out', 'z', 17, 'y', 'in', 'z', 160), - ('x', 'out', 'z', 18, 'y', 'in', 'z', 180), - ('x', 'out', 'z', 20, 'y', 'in', 'z', 143), - ('x', 'out', 'z', 21, 'y', 'in', 'z', 128), - ('x', 'out', 'z', 22, 'y', 'in', 'z', 129), - ('x', 'out', 'z', 25, 'y', 'in', 'z', 111), - ('x', 'out', 'z', 28, 'y', 'in', 'z', 114), - ('x', 'out', 'z', 29, 'y', 'in', 'z', 121), - ('x', 'out', 'z', 31, 'y', 'in', 'z', 126), - ('x', 'out', 'z', 32, 'y', 'in', 'z', 155), - ('x', 'out', 'z', 33, 'y', 'in', 'z', 123), - ('x', 'out', 'z', 12, 'y', 'in', 'z', 144)] + t = [ + ("x", "out", "z", 5, "y", "in", "z", 169), + ("x", "out", "z", 7, "y", "in", "z", 119), + ("x", "out", "z", 9, "y", "in", "z", 135), + ("x", "out", "z", 13, "y", "in", "z", 145), + ("x", "out", "z", 14, "y", "in", "z", 158), + ("x", "out", "z", 16, "y", "in", "z", 122), + ("x", "out", "z", 17, "y", "in", "z", 160), + ("x", "out", "z", 18, "y", "in", "z", 180), + ("x", "out", "z", 20, "y", "in", "z", 143), + ("x", "out", "z", 21, "y", "in", "z", 128), + ("x", "out", "z", 22, "y", "in", "z", 129), + ("x", "out", "z", 25, "y", "in", "z", 111), + ("x", "out", "z", 28, "y", "in", "z", 114), + ("x", "out", "z", 29, "y", "in", "z", 121), + ("x", "out", "z", 31, "y", "in", "z", 126), + ("x", "out", "z", 32, "y", "in", "z", 155), + ("x", "out", "z", 33, "y", "in", "z", 123), + ("x", "out", "z", 12, "y", "in", "z", 144), + ] mi = MultiIndex.from_tuples(t) assert not mi.has_duplicates @@ -202,8 +204,10 @@ def check(nlevels, with_nulls): # with a dup if with_nulls: + def f(a): return np.insert(a, 1000, a[0]) + codes = list(map(f, codes)) mi = MultiIndex(levels=levels, codes=codes) else: @@ -221,17 +225,20 @@ def f(a): check(8, True) -@pytest.mark.parametrize('keep, expected', [ - ('first', np.array([False, False, False, True, True, False])), - ('last', np.array([False, True, True, False, False, False])), - (False, np.array([False, True, True, True, True, False])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", np.array([False, False, False, True, True, False])), + ("last", np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])), + ], +) def test_duplicated(idx_dup, keep, expected): result = idx_dup.duplicated(keep=keep) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize('keep', ['first', 'last', False]) +@pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated_large(keep): # GH 9125 n, k 
= 200, 5000 @@ -254,22 +261,23 @@ def test_get_duplicates(): # Deprecated - see GH20239 assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal(mi.duplicated(), - np.zeros(2, dtype='bool')) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool")) for n in range(1, 6): # 1st level shape for m in range(1, 5): # 2nd level shape # all possible unique combinations, including nan codes = product(range(-1, n), range(-1, m)) - mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]], - codes=np.random.permutation(list(codes)).T) + mi = MultiIndex( + levels=[list("abcde")[:n], list("WXYZ")[:m]], + codes=np.random.permutation(list(codes)).T, + ) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 - assert mi.get_duplicates().equals(MultiIndex.from_arrays( - [[], []])) + assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal(mi.duplicated(), - np.zeros(len(mi), dtype='bool')) + tm.assert_numpy_array_equal( + mi.duplicated(), np.zeros(len(mi), dtype="bool") + ) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 3bdccbb8ab38d7..f61ba0132ab975 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -94,13 +94,14 @@ def test_equals_multi(idx): assert not idx.equals(idx[-1]) # different number of levels - index = MultiIndex(levels=[Index(list(range(4))), - Index(list(range(4))), - Index(list(range(4)))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(list(range(4))), Index(list(range(4))), Index(list(range(4)))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1]) assert not index.equals(index2) @@ -113,20 +114,22 @@ def test_equals_multi(idx): major_codes = np.array([0, 0, 1, 2, 2, 3]) minor_codes = np.array([0, 1, 0, 0, 1, 0]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert not idx.equals(index) assert not idx.equal_levels(index) # some of the labels are different - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 2, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert not idx.equals(index) @@ -135,11 +138,11 @@ def test_identical(idx): mi2 = idx.copy() assert mi.identical(mi2) - mi = mi.set_names(['new1', 'new2']) + mi = mi.set_names(["new1", "new2"]) assert mi.equals(mi2) assert not mi.identical(mi2) - mi2 = mi2.set_names(['new1', 'new2']) + mi2 = mi2.set_names(["new1", "new2"]) assert mi.identical(mi2) mi3 = Index(mi.tolist(), names=mi.names) @@ -156,8 +159,7 @@ def test_equals_operator(idx): def test_equals_missing_values(): # make sure take is not using -1 - i = pd.MultiIndex.from_tuples([(0, pd.NaT), - (0, pd.Timestamp('20130101'))]) + i = 
pd.MultiIndex.from_tuples([(0, pd.NaT), (0, pd.Timestamp("20130101"))]) result = i[0:1].equals(i[0]) assert not result result = i[1:2].equals(i[1]) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 8413fc1318d0b2..a7f58b9ea78bde 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -20,20 +20,20 @@ def test_format(idx): def test_format_integer_names(): - index = MultiIndex(levels=[[0, 1], [0, 1]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) + index = MultiIndex( + levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] + ) index.format(names=True) def test_format_sparse_config(idx): warn_filters = warnings.filters - warnings.filterwarnings('ignore', category=FutureWarning, - module=".*format") + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") # GH1538 - pd.set_option('display.multi_sparse', False) + pd.set_option("display.multi_sparse", False) result = idx.format() - assert result[1] == 'foo two' + assert result[1] == "foo two" tm.reset_display_options() @@ -41,24 +41,29 @@ def test_format_sparse_config(idx): def test_format_sparse_display(): - index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]], - codes=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]) + index = MultiIndex( + levels=[[0, 1], [0, 1], [0, 1], [0]], + codes=[ + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0], + ], + ) result = index.format() - assert result[3] == '1 0 0 0' + assert result[3] == "1 0 0 0" def test_repr_with_unicode_data(): - with pd.option_context("display.encoding", 'UTF-8'): + with pd.option_context("display.encoding", "UTF-8"): d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} index = pd.DataFrame(d).set_index(["a", "b"]).index assert "\\" not in repr(index) # we don't want unicode-escaped def test_repr_roundtrip_raises(): - mi = MultiIndex.from_product([list('ab'), range(3)], - names=['first', 'second']) + mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"]) with pytest.raises(TypeError): eval(repr(mi)) @@ -74,11 +79,10 @@ def test_repr_max_seq_item_setting(idx): idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) - assert '...' not in str(idx) + assert "..." 
not in str(idx) class TestRepr: - def test_repr(self, idx): result = idx[:1].__repr__() expected = """\ @@ -97,7 +101,7 @@ def test_repr(self, idx): names=['first', 'second'])""" assert result == expected - with pd.option_context('display.max_seq_items', 5): + with pd.option_context("display.max_seq_items", 5): result = idx.__repr__() expected = """\ MultiIndex([('foo', 'one'), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index e07f1ce7c4e92f..5ab817d8468c3a 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -23,68 +23,66 @@ def test_get_level_number_integer(idx): msg = "Too many levels: Index has only 2 levels, not 3" with pytest.raises(IndexError, match=msg): idx._get_level_number(2) - with pytest.raises(KeyError, match='Level fourth not found'): - idx._get_level_number('fourth') + with pytest.raises(KeyError, match="Level fourth not found"): + idx._get_level_number("fourth") def test_get_level_values(idx): result = idx.get_level_values(0) - expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], - name='first') + expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") tm.assert_index_equal(result, expected) - assert result.name == 'first' + assert result.name == "first" - result = idx.get_level_values('first') + result = idx.get_level_values("first") expected = idx.get_level_values(0) tm.assert_index_equal(result, expected) # GH 10460 index = MultiIndex( - levels=[CategoricalIndex(['A', 'B']), - CategoricalIndex([1, 2, 3])], - codes=[np.array([0, 0, 0, 1, 1, 1]), - np.array([0, 1, 2, 0, 1, 2])]) + levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], + codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], + ) - exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B']) + exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) tm.assert_index_equal(index.get_level_values(0), exp) exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) tm.assert_index_equal(index.get_level_values(1), exp) def test_get_value_duplicates(): - index = MultiIndex(levels=[['D', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) + index = MultiIndex( + levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) - assert index.get_loc('D') == slice(0, 3) + assert index.get_loc("D") == slice(0, 3) with pytest.raises(KeyError, match=r"^'D'$"): - index._engine.get_value(np.array([]), 'D') + index._engine.get_value(np.array([]), "D") def test_get_level_values_all_na(): # GH 17924 when level entirely consists of nan - arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(['a', np.nan, 1], dtype=object) + expected = pd.Index(["a", np.nan, 1], dtype=object) tm.assert_index_equal(result, expected) def test_get_level_values_int_with_na(): # GH 17924 - arrays = [['a', 'b', 'b'], [1, np.nan, 2]] + arrays = [["a", "b", "b"], [1, np.nan, 2]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = Index([1, np.nan, 2]) tm.assert_index_equal(result, expected) - arrays = 
[['a', 'b', 'b'], [np.nan, np.nan, 2]] + arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = Index([np.nan, np.nan, 2]) @@ -92,17 +90,17 @@ def test_get_level_values_int_with_na(): def test_get_level_values_na(): - arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) expected = pd.Index([np.nan, np.nan, np.nan]) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(['a', np.nan, 1]) + expected = pd.Index(["a", np.nan, 1]) tm.assert_index_equal(result, expected) - arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])] + arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = pd.DatetimeIndex([0, 1, pd.NaT]) @@ -153,7 +151,7 @@ def test_set_levels_codes_directly(idx): # setting levels/codes directly raises AttributeError levels = idx.levels - new_levels = [[lev + 'a' for lev in level] for level in levels] + new_levels = [[lev + "a" for lev in level] for level in levels] codes = idx.codes major_codes, minor_codes = codes @@ -172,7 +170,7 @@ def test_set_levels(idx): # side note - you probably wouldn't want to use levels and codes # directly like this - but it is possible. levels = idx.levels - new_levels = [[lev + 'a' for lev in level] for level in levels] + new_levels = [[lev + "a" for lev in level] for level in levels] # level changing [w/o mutation] ind2 = idx.set_levels(new_levels) @@ -214,8 +212,7 @@ def test_set_levels(idx): # level changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, level=[0, 1], - inplace=True) + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) assert_matching(idx.levels, levels) @@ -225,25 +222,20 @@ def test_set_levels(idx): original_index = idx.copy() for inplace in [True, False]: with pytest.raises(ValueError, match="^On"): - idx.set_levels(['c'], level=0, inplace=inplace) - assert_matching(idx.levels, original_index.levels, - check_dtype=True) + idx.set_levels(["c"], level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(ValueError, match="^On"): - idx.set_codes([0, 1, 2, 3, 4, 5], level=0, - inplace=inplace) - assert_matching(idx.codes, original_index.codes, - check_dtype=True) + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): - idx.set_levels('c', level=0, inplace=inplace) - assert_matching(idx.levels, original_index.levels, - check_dtype=True) + idx.set_levels("c", level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(TypeError, match="^Codes"): idx.set_codes(1, level=0, inplace=inplace) - assert_matching(idx.codes, original_index.codes, - check_dtype=True) + assert_matching(idx.codes, original_index.codes, check_dtype=True) def test_set_codes(idx): @@ -295,8 +287,7 @@ def test_set_codes(idx): # codes changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, level=[0, 1], - inplace=True) + inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) assert 
inplace_return is None assert_matching(ind2.codes, new_codes) assert_matching(idx.codes, codes) @@ -304,8 +295,7 @@ def test_set_codes(idx): # label changing for levels of different magnitude of categories ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_codes = range(129, -1, -1) - expected = pd.MultiIndex.from_tuples( - [(0, i) for i in new_codes]) + expected = pd.MultiIndex.from_tuples([(0, i) for i in new_codes]) # [w/o mutation] result = ind.set_codes(codes=new_codes, level=1) @@ -324,8 +314,7 @@ def test_set_labels_deprecated(): # GH23752 ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_labels = range(129, -1, -1) - expected = pd.MultiIndex.from_tuples( - [(0, i) for i in new_labels]) + expected = pd.MultiIndex.from_tuples([(0, i) for i in new_labels]) # [w/o mutation] with tm.assert_produces_warning(FutureWarning): @@ -343,59 +332,57 @@ def test_set_levels_codes_names_bad_input(idx): levels, codes = idx.levels, idx.codes names = idx.names - with pytest.raises(ValueError, match='Length of levels'): + with pytest.raises(ValueError, match="Length of levels"): idx.set_levels([levels[0]]) - with pytest.raises(ValueError, match='Length of codes'): + with pytest.raises(ValueError, match="Length of codes"): idx.set_codes([codes[0]]) - with pytest.raises(ValueError, match='Length of names'): + with pytest.raises(ValueError, match="Length of names"): idx.set_names([names[0]]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_levels(levels[0]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_codes(codes[0]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_names(names[0]) # should have equal lengths - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_levels(levels[0], level=[0, 1]) - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_levels(levels, level=0) # should have equal lengths - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_codes(codes[0], level=[0, 1]) - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_codes(codes, level=0) # should have equal lengths - with pytest.raises(ValueError, match='Length of names'): + with pytest.raises(ValueError, match="Length of names"): idx.set_names(names[0], level=[0, 1]) - with pytest.raises(TypeError, match='Names must be a'): + with pytest.raises(TypeError, match="Names must be a"): idx.set_names(names, level=0) -@pytest.mark.parametrize('inplace', [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) def test_set_names_with_nlevel_1(inplace): # GH 21149 # Ensure that .set_names for MultiIndex with # nlevels == 1 does not raise any errors - expected = pd.MultiIndex(levels=[[0, 1]], - codes=[[0, 1]], - names=['first']) + expected = pd.MultiIndex(levels=[[0, 1]], codes=[[0, 1]], names=["first"]) m = pd.MultiIndex.from_product([[0, 1]]) - result = m.set_names('first', level=0, inplace=inplace) + result = m.set_names("first", 
level=0, inplace=inplace) if inplace: result = m @@ -403,50 +390,49 @@ def test_set_names_with_nlevel_1(inplace): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_set_levels_categorical(ordered): # GH13854 index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) cidx = CategoricalIndex(list("bac"), ordered=ordered) result = index.set_levels(cidx, 0) - expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], - codes=index.codes) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], codes=index.codes) tm.assert_index_equal(result, expected) result_lvl = result.get_level_values(0) - expected_lvl = CategoricalIndex(list("bacb"), - categories=cidx.categories, - ordered=cidx.ordered) + expected_lvl = CategoricalIndex( + list("bacb"), categories=cidx.categories, ordered=cidx.ordered + ) tm.assert_index_equal(result_lvl, expected_lvl) def test_set_value_keeps_names(): # motivating example from #3742 - lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] - lev2 = ['1', '2', '3'] * 2 - idx = pd.MultiIndex.from_arrays([lev1, lev2], names=['Name', 'Number']) + lev1 = ["hans", "hans", "hans", "grethe", "grethe", "grethe"] + lev2 = ["1", "2", "3"] * 2 + idx = pd.MultiIndex.from_arrays([lev1, lev2], names=["Name", "Number"]) df = pd.DataFrame( - np.random.randn(6, 4), - columns=['one', 'two', 'three', 'four'], - index=idx) + np.random.randn(6, 4), columns=["one", "two", "three", "four"], index=idx + ) df = df.sort_index() assert df._is_copy is None - assert df.index.names == ('Name', 'Number') - df.at[('grethe', '4'), 'one'] = 99.34 + assert df.index.names == ("Name", "Number") + df.at[("grethe", "4"), "one"] = 99.34 assert df._is_copy is None - assert df.index.names == ('Name', 'Number') + assert df.index.names == ("Name", "Number") def test_set_levels_with_iterable(): # GH23273 sizes = [1, 2, 3] - colors = ['black'] * 3 - index = pd.MultiIndex.from_arrays([sizes, colors], names=['size', 'color']) + colors = ["black"] * 3 + index = pd.MultiIndex.from_arrays([sizes, colors], names=["size", "color"]) - result = index.set_levels(map(int, ['3', '2', '1']), level='size') + result = index.set_levels(map(int, ["3", "2", "1"]), level="size") expected_sizes = [3, 2, 1] - expected = pd.MultiIndex.from_arrays([expected_sizes, colors], - names=['size', 'color']) + expected = pd.MultiIndex.from_arrays( + [expected_sizes, colors], names=["size", "color"] + ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 3acd194b28a050..75dea68eadbf7d 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -5,8 +5,13 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, Index, IntervalIndex, MultiIndex, - date_range) + Categorical, + CategoricalIndex, + Index, + IntervalIndex, + MultiIndex, + date_range, +) from pandas.core.indexes.base import InvalidIndexError import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -15,16 +20,16 @@ def test_slice_locs_partial(idx): sorted_idx, _ = idx.sortlevel(0) - result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one')) + result = sorted_idx.slice_locs(("foo", "two"), ("qux", "one")) assert result == (1, 5) - result = sorted_idx.slice_locs(None, ('qux', 'one')) + result = sorted_idx.slice_locs(None, ("qux", "one")) assert result == (0, 5) - result = 
sorted_idx.slice_locs(('foo', 'two'), None) + result = sorted_idx.slice_locs(("foo", "two"), None) assert result == (1, len(sorted_idx)) - result = sorted_idx.slice_locs('bar', 'baz') + result = sorted_idx.slice_locs("bar", "baz") assert result == (2, 4) @@ -38,8 +43,11 @@ def test_slice_locs(): expected = df[5:16].stack() tm.assert_almost_equal(sliced.values, expected.values) - slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30), - df.index[15] - timedelta(seconds=30))) + slob = slice( + *idx.slice_locs( + df.index[5] + timedelta(seconds=30), df.index[15] - timedelta(seconds=30) + ) + ) sliced = stacked[slob] expected = df[6:15].stack() tm.assert_almost_equal(sliced.values, expected.values) @@ -49,28 +57,29 @@ def test_slice_locs_with_type_mismatch(): df = tm.makeTimeDataFrame() stacked = df.stack() idx = stacked.index - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) df = tm.makeCustomDataframe(5, 5) stacked = df.stack() idx = stacked.index - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(timedelta(seconds=30)) # TODO: Try creating a UnicodeDecodeError in exception message - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[1], (16, "a")) def test_slice_locs_not_sorted(): - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) msg = "[Kk]ey length.*greater than MultiIndex lexsort depth" with pytest.raises(KeyError, match=msg): index.slice_locs((1, 0, 1), (2, 1, 0)) @@ -84,9 +93,11 @@ def test_slice_locs_not_sorted(): def test_slice_locs_not_contained(): # some searchsorted action - index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], - codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], - [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0) + index = MultiIndex( + levels=[[0, 2, 4, 6], [0, 2, 4]], + codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]], + sortorder=0, + ) result = index.slice_locs((1, 0), (5, 2)) assert result == (3, 6) @@ -118,7 +129,7 @@ def test_putmask_with_wrong_mask(idx): idx.putmask(np.ones(len(idx) - 1, np.bool), 1) with pytest.raises(ValueError, match=msg): - idx.putmask('foo', 1) + idx.putmask("foo", 1) def test_get_indexer(): @@ -128,32 +139,33 @@ def test_get_indexer(): major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) idx1 = index[:5] idx2 = index[[1, 3, 5]] r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) - r1 = idx2.get_indexer(idx1, method='pad') + r1 = idx2.get_indexer(idx1, method="pad") e1 = np.array([-1, 0, 
0, 1, 1], dtype=np.intp) assert_almost_equal(r1, e1) - r2 = idx2.get_indexer(idx1[::-1], method='pad') + r2 = idx2.get_indexer(idx1[::-1], method="pad") assert_almost_equal(r2, e1[::-1]) - rffill1 = idx2.get_indexer(idx1, method='ffill') + rffill1 = idx2.get_indexer(idx1, method="ffill") assert_almost_equal(r1, rffill1) - r1 = idx2.get_indexer(idx1, method='backfill') + r1 = idx2.get_indexer(idx1, method="backfill") e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) - r2 = idx2.get_indexer(idx1[::-1], method='backfill') + r2 = idx2.get_indexer(idx1[::-1], method="backfill") assert_almost_equal(r2, e1[::-1]) - rbfill1 = idx2.get_indexer(idx1, method='bfill') + rbfill1 = idx2.get_indexer(idx1, method="bfill") assert_almost_equal(r1, rbfill1) # pass non-MultiIndex @@ -174,19 +186,20 @@ def test_get_indexer(): def test_get_indexer_nearest(): - midx = MultiIndex.from_tuples([('a', 1), ('b', 2)]) - msg = ("method='nearest' not implemented yet for MultiIndex; see GitHub" - " issue 9365") + midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = ( + "method='nearest' not implemented yet for MultiIndex; see GitHub" " issue 9365" + ) with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(['a'], method='nearest') + midx.get_indexer(["a"], method="nearest") msg = "tolerance not implemented yet for MultiIndex" with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(['a'], method='pad', tolerance=2) + midx.get_indexer(["a"], method="pad", tolerance=2) def test_getitem(idx): # scalar - assert idx[2] == ('bar', 'one') + assert idx[2] == ("bar", "one") # slice result = idx[2:5] @@ -203,8 +216,8 @@ def test_getitem(idx): def test_getitem_group_select(idx): sorted_idx, _ = idx.sortlevel(0) - assert sorted_idx.get_loc('baz') == slice(3, 4) - assert sorted_idx.get_loc('foo') == slice(0, 2) + assert sorted_idx.get_loc("baz") == slice(3, 4) + assert sorted_idx.get_loc("foo") == slice(0, 2) def test_get_indexer_consistency(idx): @@ -226,54 +239,55 @@ def test_get_indexer_consistency(idx): assert indexer.dtype == np.intp -@pytest.mark.parametrize('ind1', [[True] * 5, pd.Index([True] * 5)]) -@pytest.mark.parametrize('ind2', [[True, False, True, False, False], - pd.Index([True, False, True, False, - False])]) +@pytest.mark.parametrize("ind1", [[True] * 5, pd.Index([True] * 5)]) +@pytest.mark.parametrize( + "ind2", + [[True, False, True, False, False], pd.Index([True, False, True, False, False])], +) def test_getitem_bool_index_all(ind1, ind2): # GH#22533 - idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), - (40, 4), (50, 5)]) + idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)]) tm.assert_index_equal(idx[ind1], idx) expected = MultiIndex.from_tuples([(10, 1), (30, 3)]) tm.assert_index_equal(idx[ind2], expected) -@pytest.mark.parametrize('ind1', [[True], pd.Index([True])]) -@pytest.mark.parametrize('ind2', [[False], pd.Index([False])]) +@pytest.mark.parametrize("ind1", [[True], pd.Index([True])]) +@pytest.mark.parametrize("ind2", [[False], pd.Index([False])]) def test_getitem_bool_index_single(ind1, ind2): # GH#22533 idx = MultiIndex.from_tuples([(10, 1)]) tm.assert_index_equal(idx[ind1], idx) - expected = pd.MultiIndex(levels=[np.array([], dtype=np.int64), - np.array([], dtype=np.int64)], - codes=[[], []]) + expected = pd.MultiIndex( + levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)], + codes=[[], []], + ) tm.assert_index_equal(idx[ind2], expected) def test_get_loc(idx): - assert idx.get_loc(('foo', 'two')) == 
1 - assert idx.get_loc(('baz', 'two')) == 3 + assert idx.get_loc(("foo", "two")) == 1 + assert idx.get_loc(("baz", "two")) == 3 with pytest.raises(KeyError, match=r"^10$"): - idx.get_loc(('bar', 'two')) + idx.get_loc(("bar", "two")) with pytest.raises(KeyError, match=r"^'quux'$"): - idx.get_loc('quux') + idx.get_loc("quux") - msg = ("only the default get_loc method is currently supported for" - " MultiIndex") + msg = "only the default get_loc method is currently supported for" " MultiIndex" with pytest.raises(NotImplementedError, match=msg): - idx.get_loc('foo', method='nearest') + idx.get_loc("foo", method="nearest") # 3 levels - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) with pytest.raises(KeyError, match=r"^\(1, 1\)$"): index.get_loc((1, 1)) assert index.get_loc((2, 0)) == slice(3, 5) @@ -286,20 +300,21 @@ def test_get_loc_duplicates(): assert result == expected # pytest.raises(Exception, index.get_loc, 2) - index = Index(['c', 'a', 'a', 'b', 'b']) - rs = index.get_loc('c') + index = Index(["c", "a", "a", "b", "b"]) + rs = index.get_loc("c") xp = 0 assert rs == xp def test_get_loc_level(): - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) loc, new_index = index.get_loc_level((0, 1)) expected = slice(1, 2) exp_index = index[expected].droplevel(0).droplevel(0) @@ -320,32 +335,31 @@ def test_get_loc_level(): with pytest.raises(KeyError, match=r"^2$"): index.drop(1, level=2).get_loc_level(2, level=2) - index = MultiIndex(levels=[[2000], list(range(4))], - codes=[np.array([0, 0, 0, 0]), - np.array([0, 1, 2, 3])], - ) + index = MultiIndex( + levels=[[2000], list(range(4))], + codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])], + ) result, new_index = index.get_loc_level((2000, slice(None, None))) expected = slice(None, None) assert result == expected assert new_index.equals(index.droplevel(0)) -@pytest.mark.parametrize('dtype1', [int, float, bool, str]) -@pytest.mark.parametrize('dtype2', [int, float, bool, str]) +@pytest.mark.parametrize("dtype1", [int, float, bool, str]) +@pytest.mark.parametrize("dtype2", [int, float, bool, str]) def test_get_loc_multiple_dtypes(dtype1, dtype2): # GH 18520 - levels = [np.array([0, 1]).astype(dtype1), - np.array([0, 1]).astype(dtype2)] + levels = [np.array([0, 1]).astype(dtype1), np.array([0, 1]).astype(dtype2)] idx = pd.MultiIndex.from_product(levels) assert idx.get_loc(idx[2]) == 2 -@pytest.mark.parametrize('level', [0, 1]) -@pytest.mark.parametrize('dtypes', [[int, float], [float, int]]) +@pytest.mark.parametrize("level", [0, 1]) +@pytest.mark.parametrize("dtypes", [[int, float], [float, int]]) def test_get_loc_implicit_cast(level, dtypes): # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa - levels = [['a', 'b'], ['c', 'd']] - key = 
['b', 'd'] + levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] lev_dtype, key_dtype = dtypes levels[level] = np.array([0, 1], dtype=lev_dtype) key[level] = key_dtype(1) @@ -355,7 +369,7 @@ def test_get_loc_implicit_cast(level, dtypes): def test_get_loc_cast_bool(): # GH 19086 : int is casted to bool, but not vice-versa - levels = [[False, True], np.arange(2, dtype='int64')] + levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) assert idx.get_loc((0, 1)) == 1 @@ -367,11 +381,11 @@ def test_get_loc_cast_bool(): idx.get_loc((True, False)) -@pytest.mark.parametrize('level', [0, 1]) +@pytest.mark.parametrize("level", [0, 1]) def test_get_loc_nan(level, nulls_fixture): # GH 18485 : NaN in MultiIndex - levels = [['a', 'b'], ['c', 'd']] - key = ['b', 'd'] + levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture)) key[level] = nulls_fixture idx = MultiIndex.from_product(levels) @@ -393,28 +407,37 @@ def test_get_loc_missing_nan(): def test_get_indexer_categorical_time(): # https://github.com/pandas-dev/pandas/issues/21390 midx = MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) result = midx.get_indexer(midx) tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 - idx = pd.MultiIndex.from_product([ - pd.date_range("2019-01-01T00:15:33", periods=100, freq="H", - name="date"), - ['x'], - [3] - ]) - df = pd.DataFrame({'foo': np.arange(len(idx))}, idx) - result = df.loc[pd.IndexSlice['2019-1-2':, "x", :], 'foo'] - qidx = pd.MultiIndex.from_product([ - pd.date_range(start="2019-01-02T00:15:33", end='2019-01-05T02:15:33', - freq="H", name="date"), - ['x'], - [3] - ]) - should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, - name="foo") + idx = pd.MultiIndex.from_product( + [ + pd.date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"), + ["x"], + [3], + ] + ) + df = pd.DataFrame({"foo": np.arange(len(idx))}, idx) + result = df.loc[pd.IndexSlice["2019-1-2":, "x", :], "foo"] + qidx = pd.MultiIndex.from_product( + [ + pd.date_range( + start="2019-01-02T00:15:33", + end="2019-01-05T02:15:33", + freq="H", + name="date", + ), + ["x"], + [3], + ] + ) + should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index a0bdb4c98e6c70..dba75b6247a20e 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -13,28 +13,31 @@ def test_labels_dtypes(): # GH 8456 - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) - assert i.codes[0].dtype == 'int8' - assert i.codes[1].dtype == 'int8' - - i = MultiIndex.from_product([['a'], range(40)]) - assert i.codes[1].dtype == 'int8' - i = MultiIndex.from_product([['a'], range(400)]) - assert i.codes[1].dtype == 'int16' - i = MultiIndex.from_product([['a'], range(40000)]) - assert i.codes[1].dtype == 'int32' - - i = pd.MultiIndex.from_product([['a'], range(1000)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + assert i.codes[0].dtype == "int8" + assert i.codes[1].dtype == "int8" + + i = MultiIndex.from_product([["a"], range(40)]) + assert 
i.codes[1].dtype == "int8" + i = MultiIndex.from_product([["a"], range(400)]) + assert i.codes[1].dtype == "int16" + i = MultiIndex.from_product([["a"], range(40000)]) + assert i.codes[1].dtype == "int32" + + i = pd.MultiIndex.from_product([["a"], range(1000)]) assert (i.codes[0] >= 0).all() assert (i.codes[1] >= 0).all() def test_values_boxed(): - tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT), - (3, pd.Timestamp('2000-01-03')), - (1, pd.Timestamp('2000-01-04')), - (2, pd.Timestamp('2000-01-02')), - (3, pd.Timestamp('2000-01-03'))] + tuples = [ + (1, pd.Timestamp("2000-01-01")), + (2, pd.NaT), + (3, pd.Timestamp("2000-01-03")), + (1, pd.Timestamp("2000-01-04")), + (2, pd.Timestamp("2000-01-02")), + (3, pd.Timestamp("2000-01-03")), + ] result = pd.MultiIndex.from_tuples(tuples) expected = construct_1d_object_array_from_listlike(tuples) tm.assert_numpy_array_equal(result.values, expected) @@ -48,7 +51,7 @@ def test_values_multiindex_datetimeindex(): naive = pd.DatetimeIndex(ints) # TODO(GH-24559): Remove the FutureWarning with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - aware = pd.DatetimeIndex(ints, tz='US/Central') + aware = pd.DatetimeIndex(ints, tz="US/Central") idx = pd.MultiIndex.from_arrays([naive, aware]) result = idx.values @@ -72,7 +75,7 @@ def test_values_multiindex_datetimeindex(): def test_values_multiindex_periodindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) - pidx = pd.PeriodIndex(ints, freq='D') + pidx = pd.PeriodIndex(ints, freq="D") idx = pd.MultiIndex.from_arrays([ints, pidx]) result = idx.values @@ -102,14 +105,16 @@ def test_consistency(): minor_codes = np.repeat(range(10), 7000) # the fact that is works means it's consistent - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) # inconsistent major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert index.is_unique is False @@ -117,11 +122,11 @@ def test_consistency(): def test_hash_collisions(): # non-smoke test that we don't get hash collisions - index = MultiIndex.from_product([np.arange(1000), np.arange(1000)], - names=['one', 'two']) + index = MultiIndex.from_product( + [np.arange(1000), np.arange(1000)], names=["one", "two"] + ) result = index.get_indexer(index.values) - tm.assert_numpy_array_equal(result, np.arange( - len(index), dtype='intp')) + tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp")) for i in [0, 1, len(index) - 2, len(index) - 1]: result = index.get_loc(index[i]) @@ -133,9 +138,8 @@ def test_dims(): def take_invalid_kwargs(): - vals = [['A', 'B'], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]] - idx = pd.MultiIndex.from_product(vals, names=['str', 'dt']) + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) indices = [1, 2] msg = r"take\(\) got an unexpected keyword argument 'foo'" @@ -148,7 +152,7 @@ def take_invalid_kwargs(): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_isna_behavior(idx): @@ -163,30 +167,31 @@ def 
test_isna_behavior(idx): def test_large_multiindex_error(): # GH12527 df_below_1000000 = pd.DataFrame( - 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]), - columns=['dest']) + 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]), columns=["dest"] + ) with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_below_1000000.loc[(-1, 0), 'dest'] + df_below_1000000.loc[(-1, 0), "dest"] with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_below_1000000.loc[(3, 0), 'dest'] + df_below_1000000.loc[(3, 0), "dest"] df_above_1000000 = pd.DataFrame( - 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]), - columns=['dest']) + 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]), columns=["dest"] + ) with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_above_1000000.loc[(-1, 0), 'dest'] + df_above_1000000.loc[(-1, 0), "dest"] with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_above_1000000.loc[(3, 0), 'dest'] + df_above_1000000.loc[(3, 0), "dest"] def test_million_record_attribute_error(): # GH 18165 r = list(range(1000000)) - df = pd.DataFrame({'a': r, 'b': r}, - index=pd.MultiIndex.from_tuples([(x, x) for x in r])) + df = pd.DataFrame( + {"a": r, "b": r}, index=pd.MultiIndex.from_tuples([(x, x) for x in r]) + ) msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - df['a'].foo() + df["a"].foo() def test_can_hold_identifiers(idx): @@ -197,7 +202,7 @@ def test_can_hold_identifiers(idx): def test_metadata_immutable(idx): levels, codes = idx.levels, idx.codes # shouldn't be able to set at either the top level or base level - mutable_regex = re.compile('does not support mutable operations') + mutable_regex = re.compile("does not support mutable operations") with pytest.raises(TypeError, match=mutable_regex): levels[0] = levels[0] with pytest.raises(TypeError, match=mutable_regex): @@ -214,11 +219,9 @@ def test_metadata_immutable(idx): def test_level_setting_resets_attributes(): - ind = pd.MultiIndex.from_arrays([ - ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] - ]) + ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert ind.is_monotonic - ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) + ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. 
assert not ind.is_monotonic @@ -227,30 +230,30 @@ def test_rangeindex_fallback_coercion_bug(): # GH 12893 foo = pd.DataFrame(np.arange(100).reshape((10, 10))) bar = pd.DataFrame(np.arange(100).reshape((10, 10))) - df = pd.concat({'foo': foo.stack(), 'bar': bar.stack()}, axis=1) - df.index.names = ['fizz', 'buzz'] + df = pd.concat({"foo": foo.stack(), "bar": bar.stack()}, axis=1) + df.index.names = ["fizz", "buzz"] str(df) - expected = pd.DataFrame({'bar': np.arange(100), - 'foo': np.arange(100)}, - index=pd.MultiIndex.from_product( - [range(10), range(10)], - names=['fizz', 'buzz'])) + expected = pd.DataFrame( + {"bar": np.arange(100), "foo": np.arange(100)}, + index=pd.MultiIndex.from_product( + [range(10), range(10)], names=["fizz", "buzz"] + ), + ) tm.assert_frame_equal(df, expected, check_like=True) - result = df.index.get_level_values('fizz') - expected = pd.Int64Index(np.arange(10), name='fizz').repeat(10) + result = df.index.get_level_values("fizz") + expected = pd.Int64Index(np.arange(10), name="fizz").repeat(10) tm.assert_index_equal(result, expected) - result = df.index.get_level_values('buzz') - expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz') + result = df.index.get_level_values("buzz") + expected = pd.Int64Index(np.tile(np.arange(10), 10), name="buzz") tm.assert_index_equal(result, expected) def test_hash_error(indices): index = indices - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + with pytest.raises(TypeError, match=("unhashable type: %r" % type(index).__name__)): hash(indices) @@ -279,7 +282,7 @@ def test_memory_usage(idx): if not isinstance(idx, (RangeIndex, IntervalIndex)): assert result2 > result - if idx.inferred_type == 'object': + if idx.inferred_type == "object": assert result3 > result2 else: diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index ea26f210daaf60..42d8cf761842ec 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -6,30 +6,27 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('other', [ - Index(['three', 'one', 'two']), - Index(['one']), - Index(['one', 'three']), -]) +@pytest.mark.parametrize( + "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])] +) def test_join_level(idx, other, join_type): - join_index, lidx, ridx = other.join(idx, how=join_type, - level='second', - return_indexers=True) + join_index, lidx, ridx = other.join( + idx, how=join_type, level="second", return_indexers=True + ) exp_level = other.join(idx.levels[1], how=join_type) assert join_index.levels[0].equals(idx.levels[0]) assert join_index.levels[1].equals(exp_level) # pare down levels - mask = np.array( - [x[1] in exp_level for x in idx], dtype=bool) + mask = np.array([x[1] in exp_level for x in idx], dtype=bool) exp_values = idx.values[mask] tm.assert_numpy_array_equal(join_index.values, exp_values) - if join_type in ('outer', 'inner'): - join_index2, ridx2, lidx2 = \ - idx.join(other, how=join_type, level='second', - return_indexers=True) + if join_type in ("outer", "inner"): + join_index2, ridx2, lidx2 = idx.join( + other, how=join_type, level="second", return_indexers=True + ) assert join_index.equals(join_index2) tm.assert_numpy_array_equal(lidx, lidx2) @@ -39,8 +36,8 @@ def test_join_level(idx, other, join_type): def test_join_level_corner_case(idx): # some corner cases - index = Index(['three', 'one', 'two']) - result = index.join(idx, level='second') + index = Index(["three", 
"one", "two"]) + result = index.join(idx, level="second") assert isinstance(result, MultiIndex) with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"): @@ -54,34 +51,33 @@ def test_join_self(idx, join_type): def test_join_multi(): # GH 10665 - midx = pd.MultiIndex.from_product( - [np.arange(4), np.arange(4)], names=['a', 'b']) - idx = pd.Index([1, 2, 5], name='b') + midx = pd.MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"]) + idx = pd.Index([1, 2, 5], name="b") # inner - jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True) - exp_idx = pd.MultiIndex.from_product( - [np.arange(4), [1, 2]], names=['a', 'b']) + jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True) + exp_idx = pd.MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"]) exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp) exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp) tm.assert_index_equal(jidx, exp_idx) tm.assert_numpy_array_equal(lidx, exp_lidx) tm.assert_numpy_array_equal(ridx, exp_ridx) # flip - jidx, ridx, lidx = idx.join(midx, how='inner', return_indexers=True) + jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True) tm.assert_index_equal(jidx, exp_idx) tm.assert_numpy_array_equal(lidx, exp_lidx) tm.assert_numpy_array_equal(ridx, exp_ridx) # keep MultiIndex - jidx, lidx, ridx = midx.join(idx, how='left', return_indexers=True) - exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, - 1, -1], dtype=np.intp) + jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True) + exp_ridx = np.array( + [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp + ) tm.assert_index_equal(jidx, midx) assert lidx is None tm.assert_numpy_array_equal(ridx, exp_ridx) # flip - jidx, ridx, lidx = idx.join(midx, how='right', return_indexers=True) + jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True) tm.assert_index_equal(jidx, midx) assert lidx is None tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 1928c303a1bcdd..15bbd2ce97c3c0 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -13,7 +13,7 @@ def test_fillna(idx): # GH 11343 # TODO: Remove or Refactor. 
Not Implemented for MultiIndex - for name, index in [('idx', idx), ]: + for name, index in [("idx", idx)]: if len(index) == 0: pass elif isinstance(index, MultiIndex): @@ -54,39 +54,42 @@ def test_fillna(idx): def test_dropna(): # GH 6194 - idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5], - [1, 2, np.nan, np.nan, 5], - ['a', 'b', 'c', np.nan, 'e']]) - - exp = pd.MultiIndex.from_arrays([[1, 5], - [1, 5], - ['a', 'e']]) + idx = pd.MultiIndex.from_arrays( + [ + [1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ["a", "b", "c", np.nan, "e"], + ] + ) + + exp = pd.MultiIndex.from_arrays([[1, 5], [1, 5], ["a", "e"]]) tm.assert_index_equal(idx.dropna(), exp) - tm.assert_index_equal(idx.dropna(how='any'), exp) + tm.assert_index_equal(idx.dropna(how="any"), exp) - exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5], - [1, 2, np.nan, 5], - ['a', 'b', 'c', 'e']]) - tm.assert_index_equal(idx.dropna(how='all'), exp) + exp = pd.MultiIndex.from_arrays( + [[1, np.nan, 3, 5], [1, 2, np.nan, 5], ["a", "b", "c", "e"]] + ) + tm.assert_index_equal(idx.dropna(how="all"), exp) msg = "invalid how option: xxx" with pytest.raises(ValueError, match=msg): - idx.dropna(how='xxx') + idx.dropna(how="xxx") # GH26408 # test if missing values are dropped for multiindex constructed # from codes and values - idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], - [np.nan, None, pd.NaT, "128", 2]], - codes=[[0, -1, 1, 2, 3, 4], - [0, -1, 3, 3, 3, 4]]) + idx = MultiIndex( + levels=[[np.nan, None, pd.NaT, "128", 2], [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], [0, -1, 3, 3, 3, 4]], + ) expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) tm.assert_index_equal(idx.dropna(), expected) - tm.assert_index_equal(idx.dropna(how='any'), expected) + tm.assert_index_equal(idx.dropna(how="any"), expected) - expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2], - ["128", "128", "128", 2]]) - tm.assert_index_equal(idx.dropna(how='all'), expected) + expected = MultiIndex.from_arrays( + [[np.nan, np.nan, "128", 2], ["128", "128", "128", 2]] + ) + tm.assert_index_equal(idx.dropna(how="all"), expected) def test_nulls(idx): @@ -123,13 +126,11 @@ def test_hasnans_isnans(idx): def test_nan_stays_float(): # GH 7031 - idx0 = pd.MultiIndex(levels=[["A", "B"], []], - codes=[[1, 0], [-1, -1]], - names=[0, 1]) - idx1 = pd.MultiIndex(levels=[["C"], ["D"]], - codes=[[0], [0]], - names=[0, 1]) - idxm = idx0.join(idx1, how='outer') + idx0 = pd.MultiIndex( + levels=[["A", "B"], []], codes=[[1, 0], [-1, -1]], names=[0, 1] + ) + idx1 = pd.MultiIndex(levels=[["C"], ["D"]], codes=[[0], [0]], names=[0, 1]) + idxm = idx0.join(idx1, how="outer") assert pd.isna(idx0.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(idxm.get_level_values(1)[:-1]).all() diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index a160dc26781579..b5c73d5e97745c 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -7,50 +7,50 @@ def test_is_monotonic_increasing(): - i = MultiIndex.from_product([np.arange(10), - np.arange(10)], names=['one', 'two']) + i = MultiIndex.from_product([np.arange(10), np.arange(10)], names=["one", "two"]) assert i.is_monotonic is True assert i._is_strictly_monotonic_increasing is True assert Index(i.values).is_monotonic is True assert i._is_strictly_monotonic_increasing is True - i = MultiIndex.from_product([np.arange(10, 0, -1), - np.arange(10)], names=['one', 'two']) + i 
= MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex.from_product([np.arange(10), - np.arange(10, 0, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ["a", "b", "c"]]) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False # string ordering - i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic is False assert Index(i.values).is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], - ['mom', 'next', 'zenith']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["bar", "baz", "foo", "qux"], ["mom", "next", "zenith"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic is True assert Index(i.values).is_monotonic is True assert i._is_strictly_monotonic_increasing is True @@ -58,11 +58,19 @@ def test_is_monotonic_increasing(): # mixed levels, hits the TypeError i = MultiIndex( - levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', - 'nl0000289783', - 'nl0000289965', 'nl0000301109']], + levels=[ + [1, 2, 3, 4], + [ + "gb00b03mlx29", + "lu0197800237", + "nl0000289783", + "nl0000289965", + "nl0000301109", + ], + ], codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], - names=['household_id', 'asset_id']) + names=["household_id", "asset_id"], + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False @@ -76,51 +84,52 @@ def test_is_monotonic_increasing(): def test_is_monotonic_decreasing(): - i = MultiIndex.from_product([np.arange(9, -1, -1), - np.arange(9, -1, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True - i = MultiIndex.from_product([np.arange(10), - np.arange(10, 0, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert 
Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex.from_product([np.arange(10, 0, -1), - np.arange(10)], names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex.from_product([[2.0, np.nan, 1.0], ['c', 'b', 'a']]) + i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]]) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False # string ordering - i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], - ['three', 'two', 'one']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], - ['zenith', 'next', 'mom']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True @@ -128,11 +137,19 @@ def test_is_monotonic_decreasing(): # mixed levels, hits the TypeError i = MultiIndex( - levels=[[4, 3, 2, 1], ['nl0000301109', 'nl0000289965', - 'nl0000289783', 'lu0197800237', - 'gb00b03mlx29']], + levels=[ + [4, 3, 2, 1], + [ + "nl0000301109", + "nl0000289965", + "nl0000289783", + "lu0197800237", + "gb00b03mlx29", + ], + ], codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], - names=['household_id', 'asset_id']) + names=["household_id", "asset_id"], + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False @@ -146,15 +163,17 @@ def test_is_monotonic_decreasing(): def test_is_strictly_monotonic_increasing(): - idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']], - codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) + idx = pd.MultiIndex( + levels=[["bar", "baz"], ["mom", "next"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) assert idx.is_monotonic_increasing is True assert idx._is_strictly_monotonic_increasing is False def test_is_strictly_monotonic_decreasing(): - idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']], - codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) + idx = pd.MultiIndex( + levels=[["baz", "bar"], ["next", "mom"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is False @@ -180,32 +199,32 @@ def test_searchsorted_monotonic(indices): # test _searchsorted_monotonic in all cases # test searchsorted only for increasing if indices.is_monotonic_increasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + 
ssm_left = indices._searchsorted_monotonic(value, side="left") assert is_scalar(ssm_left) assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert is_scalar(ssm_right) assert expected_right == ssm_right - ss_left = indices.searchsorted(value, side='left') + ss_left = indices.searchsorted(value, side="left") assert is_scalar(ss_left) assert expected_left == ss_left - ss_right = indices.searchsorted(value, side='right') + ss_right = indices.searchsorted(value, side="right") assert is_scalar(ss_right) assert expected_right == ss_right elif indices.is_monotonic_decreasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert is_scalar(ssm_left) assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert is_scalar(ssm_right) assert expected_right == ssm_right else: # non-monotonic should raise. with pytest.raises(ValueError): - indices._searchsorted_monotonic(value, side='left') + indices._searchsorted_monotonic(value, side="left") diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 4785e1590b75da..5856cb56b307b8 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -10,22 +10,19 @@ def check_level_names(index, names): def test_slice_keep_name(): - x = MultiIndex.from_tuples([('a', 'b'), (1, 2), ('c', 'd')], - names=['x', 'y']) + x = MultiIndex.from_tuples([("a", "b"), (1, 2), ("c", "d")], names=["x", "y"]) assert x[1:].names == x.names def test_index_name_retained(): # GH9857 - result = pd.DataFrame({'x': [1, 2, 6], - 'y': [2, 2, 8], - 'z': [-5, 0, 5]}) - result = result.set_index('z') + result = pd.DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}) + result = result.set_index("z") result.loc[10] = [9, 10] - df_expected = pd.DataFrame({'x': [1, 2, 6, 9], - 'y': [2, 2, 8, 10], - 'z': [-5, 0, 5, 10]}) - df_expected = df_expected.set_index('z') + df_expected = pd.DataFrame( + {"x": [1, 2, 6, 9], "y": [2, 2, 8, 10], "z": [-5, 0, 5, 10]} + ) + df_expected = df_expected.set_index("z") tm.assert_frame_equal(result, df_expected) @@ -62,24 +59,24 @@ def test_take_preserve_name(idx): def test_copy_names(): # Check that adding a "names" parameter to the copy is honored # GH14302 - multi_idx = pd.Index([(1, 2), (3, 4)], names=['MyName1', 'MyName2']) + multi_idx = pd.Index([(1, 2), (3, 4)], names=["MyName1", "MyName2"]) multi_idx1 = multi_idx.copy() assert multi_idx.equals(multi_idx1) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx1.names == ['MyName1', 'MyName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx1.names == ["MyName1", "MyName2"] - multi_idx2 = multi_idx.copy(names=['NewName1', 'NewName2']) + multi_idx2 = multi_idx.copy(names=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx2) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx2.names == ['NewName1', 'NewName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx2.names == ["NewName1", "NewName2"] - multi_idx3 = multi_idx.copy(name=['NewName1', 'NewName2']) + multi_idx3 = multi_idx.copy(name=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx3) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx3.names == 
['NewName1', 'NewName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx3.names == ["NewName1", "NewName2"] def test_names(idx, index_names): @@ -100,13 +97,17 @@ def test_names(idx, index_names): major_axis, minor_axis = idx.levels major_codes, minor_codes = idx.codes with pytest.raises(ValueError, match="^Length of names"): - MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=['first']) + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first"], + ) with pytest.raises(ValueError, match="^Length of names"): - MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=['first', 'second', 'third']) + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first", "second", "third"], + ) # names are assigned index.names = ["a", "b"] @@ -117,6 +118,6 @@ def test_names(idx, index_names): def test_duplicate_level_names_access_raises(idx): # GH19029 - idx.names = ['foo', 'foo'] - with pytest.raises(ValueError, match='name foo occurs multiple times'): - idx._get_level_number('foo') + idx.names = ["foo", "foo"] + with pytest.raises(ValueError, match="name foo occurs multiple times"): + idx._get_level_number("foo") diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index b75396a3136664..d6799e86683a9e 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -8,10 +8,10 @@ def test_partial_string_timestamp_multiindex(): # GH10331 - dr = pd.date_range('2016-01-01', '2016-01-03', freq='12H') - abc = ['a', 'b', 'c'] + dr = pd.date_range("2016-01-01", "2016-01-03", freq="12H") + abc = ["a", "b", "c"] ix = pd.MultiIndex.from_product([dr, abc]) - df = pd.DataFrame({'c1': range(0, 15)}, index=ix) + df = pd.DataFrame({"c1": range(0, 15)}, index=ix) idx = pd.IndexSlice # c1 @@ -32,23 +32,21 @@ def test_partial_string_timestamp_multiindex(): # c 14 # partial string matching on a single index - for df_swap in (df.swaplevel(), - df.swaplevel(0), - df.swaplevel(0, 1)): + for df_swap in (df.swaplevel(), df.swaplevel(0), df.swaplevel(0, 1)): df_swap = df_swap.sort_index() - just_a = df_swap.loc['a'] - result = just_a.loc['2016-01-01'] - expected = df.loc[idx[:, 'a'], :].iloc[0:2] + just_a = df_swap.loc["a"] + result = just_a.loc["2016-01-01"] + expected = df.loc[idx[:, "a"], :].iloc[0:2] expected.index = expected.index.droplevel(1) tm.assert_frame_equal(result, expected) # indexing with IndexSlice - result = df.loc[idx['2016-01-01':'2016-02-01', :], :] + result = df.loc[idx["2016-01-01":"2016-02-01", :], :] expected = df tm.assert_frame_equal(result, expected) # match on secondary index - result = df_swap.loc[idx[:, '2016-01-01':'2016-01-01'], :] + result = df_swap.loc[idx[:, "2016-01-01":"2016-01-01"], :] expected = df_swap.iloc[[0, 1, 5, 6, 10, 11]] tm.assert_frame_equal(result, expected) @@ -57,42 +55,42 @@ def test_partial_string_timestamp_multiindex(): # in multi-indexes. This would amount to selecting a scalar from a # column. 
with pytest.raises(KeyError): - df['2016-01-01'] + df["2016-01-01"] # partial string match on year only - result = df.loc['2016'] + result = df.loc["2016"] expected = df tm.assert_frame_equal(result, expected) # partial string match on date - result = df.loc['2016-01-01'] + result = df.loc["2016-01-01"] expected = df.iloc[0:6] tm.assert_frame_equal(result, expected) # partial string match on date and hour, from middle - result = df.loc['2016-01-02 12'] + result = df.loc["2016-01-02 12"] expected = df.iloc[9:12] tm.assert_frame_equal(result, expected) # partial string match on secondary index - result = df_swap.loc[idx[:, '2016-01-02'], :] + result = df_swap.loc[idx[:, "2016-01-02"], :] expected = df_swap.iloc[[2, 3, 7, 8, 12, 13]] tm.assert_frame_equal(result, expected) # tuple selector with partial string match on date - result = df.loc[('2016-01-01', 'a'), :] + result = df.loc[("2016-01-01", "a"), :] expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) # Slicing date on first level should break (of course) with pytest.raises(KeyError): - df_swap.loc['2016-01-01'] + df_swap.loc["2016-01-01"] # GH12685 (partial string with daily resolution or below) - dr = date_range('2013-01-01', periods=100, freq='D') - ix = MultiIndex.from_product([dr, ['a', 'b']]) - df = DataFrame(np.random.randn(200, 1), columns=['A'], index=ix) + dr = date_range("2013-01-01", periods=100, freq="D") + ix = MultiIndex.from_product([dr, ["a", "b"]]) + df = DataFrame(np.random.randn(200, 1), columns=["A"], index=ix) - result = df.loc[idx['2013-03':'2013-03', :], :] + result = df.loc[idx["2013-03":"2013-03", :], :] expected = df.iloc[118:180] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 6f13a374f6cc94..88de4d1e803868 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -22,13 +22,13 @@ def test_reindex(idx): def test_reindex_level(idx): - index = Index(['one']) + index = Index(["one"]) - target, indexer = idx.reindex(index, level='second') - target2, indexer2 = index.reindex(idx, level='second') + target, indexer = idx.reindex(index, level="second") + target2, indexer2 = index.reindex(idx, level="second") - exp_index = idx.join(index, level='second', how='right') - exp_index2 = idx.join(index, level='second', how='left') + exp_index = idx.join(index, level="second", how="right") + exp_index2 = idx.join(index, level="second", how="left") assert target.equals(exp_index) exp_indexer = np.array([0, 2, 4]) @@ -39,10 +39,10 @@ def test_reindex_level(idx): tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False) with pytest.raises(TypeError, match="Fill method not supported"): - idx.reindex(idx, method='pad', level='second') + idx.reindex(idx, method="pad", level="second") with pytest.raises(TypeError, match="Fill method not supported"): - index.reindex(index, method='bfill', level='first') + index.reindex(index, method="bfill", level="first") def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): @@ -61,26 +61,25 @@ def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): assert idx.reindex(other_dtype.tolist())[0].names == [None, None] assert idx.reindex(other_dtype.values)[0].names == [None, None] - idx.names = ['foo', 'bar'] - assert idx.reindex([])[0].names == ['foo', 'bar'] - assert idx.reindex(np.array([]))[0].names == ['foo', 'bar'] - assert idx.reindex(target.tolist())[0].names == ['foo', 'bar'] - assert 
idx.reindex(target.values)[0].names == ['foo', 'bar'] - assert idx.reindex(other_dtype.tolist())[0].names == ['foo', 'bar'] - assert idx.reindex(other_dtype.values)[0].names == ['foo', 'bar'] + idx.names = ["foo", "bar"] + assert idx.reindex([])[0].names == ["foo", "bar"] + assert idx.reindex(np.array([]))[0].names == ["foo", "bar"] + assert idx.reindex(target.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(target.values)[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.values)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_names_when_target_is_list_or_array(): # GH7774 - idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']], - names=['foo', 'bar']) - assert idx.reindex([], level=0)[0].names == ['foo', 'bar'] - assert idx.reindex([], level=1)[0].names == ['foo', 'bar'] + idx = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["foo", "bar"]) + assert idx.reindex([], level=0)[0].names == ["foo", "bar"] + assert idx.reindex([], level=1)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array(): # GH7774 - idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64 assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_ @@ -92,8 +91,8 @@ def test_reindex_base(idx): actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) - with pytest.raises(ValueError, match='Invalid fill method'): - idx.get_indexer(idx, method='invalid') + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") def test_reindex_non_unique(): @@ -101,6 +100,6 @@ def test_reindex_non_unique(): a = pd.Series(np.arange(4), index=idx) new_idx = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - msg = 'cannot handle a non-unique multi-index!' + msg = "cannot handle a non-unique multi-index!" 
with pytest.raises(ValueError, match=msg): a.reindex(new_idx) diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 87a2751631fe63..a30e6f33d14998 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -8,67 +8,75 @@ def test_insert(idx): # key contained in all levels - new_index = idx.insert(0, ('bar', 'two')) + new_index = idx.insert(0, ("bar", "two")) assert new_index.equal_levels(idx) - assert new_index[0] == ('bar', 'two') + assert new_index[0] == ("bar", "two") # key not contained in all levels - new_index = idx.insert(0, ('abc', 'three')) + new_index = idx.insert(0, ("abc", "three")) - exp0 = Index(list(idx.levels[0]) + ['abc'], name='first') + exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) - exp1 = Index(list(idx.levels[1]) + ['three'], name='second') + exp1 = Index(list(idx.levels[1]) + ["three"], name="second") tm.assert_index_equal(new_index.levels[1], exp1) - assert new_index[0] == ('abc', 'three') + assert new_index[0] == ("abc", "three") # key wrong length msg = "Item must have length equal to number of levels" with pytest.raises(ValueError, match=msg): - idx.insert(0, ('foo2',)) - - left = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1]], - columns=['1st', '2nd', '3rd']) - left.set_index(['1st', '2nd'], inplace=True) - ts = left['3rd'].copy(deep=True) - - left.loc[('b', 'x'), '3rd'] = 2 - left.loc[('b', 'a'), '3rd'] = -1 - left.loc[('b', 'b'), '3rd'] = 3 - left.loc[('a', 'x'), '3rd'] = 4 - left.loc[('a', 'w'), '3rd'] = 5 - left.loc[('a', 'a'), '3rd'] = 6 - - ts.loc[('b', 'x')] = 2 - ts.loc['b', 'a'] = -1 - ts.loc[('b', 'b')] = 3 - ts.loc['a', 'x'] = 4 - ts.loc[('a', 'w')] = 5 - ts.loc['a', 'a'] = 6 - - right = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1], ['b', 'x', 2], - ['b', 'a', -1], ['b', 'b', 3], ['a', 'x', 4], - ['a', 'w', 5], ['a', 'a', 6]], - columns=['1st', '2nd', '3rd']) - right.set_index(['1st', '2nd'], inplace=True) + idx.insert(0, ("foo2",)) + + left = pd.DataFrame([["a", "b", 0], ["b", "d", 1]], columns=["1st", "2nd", "3rd"]) + left.set_index(["1st", "2nd"], inplace=True) + ts = left["3rd"].copy(deep=True) + + left.loc[("b", "x"), "3rd"] = 2 + left.loc[("b", "a"), "3rd"] = -1 + left.loc[("b", "b"), "3rd"] = 3 + left.loc[("a", "x"), "3rd"] = 4 + left.loc[("a", "w"), "3rd"] = 5 + left.loc[("a", "a"), "3rd"] = 6 + + ts.loc[("b", "x")] = 2 + ts.loc["b", "a"] = -1 + ts.loc[("b", "b")] = 3 + ts.loc["a", "x"] = 4 + ts.loc[("a", "w")] = 5 + ts.loc["a", "a"] = 6 + + right = pd.DataFrame( + [ + ["a", "b", 0], + ["b", "d", 1], + ["b", "x", 2], + ["b", "a", -1], + ["b", "b", 3], + ["a", "x", 4], + ["a", "w", 5], + ["a", "a", 6], + ], + columns=["1st", "2nd", "3rd"], + ) + right.set_index(["1st", "2nd"], inplace=True) # FIXME data types changes to float because # of intermediate nan insertion; tm.assert_frame_equal(left, right, check_dtype=False) - tm.assert_series_equal(ts, right['3rd']) + tm.assert_series_equal(ts, right["3rd"]) # GH9250 - idx = [('test1', i) for i in range(5)] + \ - [('test2', i) for i in range(6)] + \ - [('test', 17), ('test', 18)] + idx = ( + [("test1", i) for i in range(5)] + + [("test2", i) for i in range(6)] + + [("test", 17), ("test", 18)] + ) - left = pd.Series(np.linspace(0, 10, 11), - pd.MultiIndex.from_tuples(idx[:-2])) + left = pd.Series(np.linspace(0, 10, 11), pd.MultiIndex.from_tuples(idx[:-2])) - left.loc[('test', 17)] = 11 - left.loc[('test', 18)] = 12 + left.loc[("test", 17)] = 11 
+ left.loc[("test", 18)] = 12 - right = pd.Series(np.linspace(0, 12, 13), - pd.MultiIndex.from_tuples(idx)) + right = pd.Series(np.linspace(0, 12, 13), pd.MultiIndex.from_tuples(idx)) tm.assert_series_equal(left, right) @@ -89,12 +97,10 @@ def test_append(idx): def test_repeat(): reps = 2 numbers = [1, 2, 3] - names = np.array(['foo', 'bar']) + names = np.array(["foo", "bar"]) - m = MultiIndex.from_product([ - numbers, names], names=names) - expected = MultiIndex.from_product([ - numbers, names.repeat(reps)], names=names) + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) tm.assert_index_equal(m.repeat(reps), expected) diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 640e121d32b3a4..835784054261ee 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -8,8 +8,9 @@ @pytest.mark.parametrize("case", [0.5, "xxx"]) @pytest.mark.parametrize("sort", [None, False]) -@pytest.mark.parametrize("method", ["intersection", "union", - "difference", "symmetric_difference"]) +@pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] +) def test_set_ops_error_cases(idx, case, sort, method): # non-iterable input msg = "Input must be Index or array-like" @@ -28,8 +29,7 @@ def test_intersection_base(idx, sort): assert tm.equalContents(intersect, second) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case, sort=sort) if sort is None: @@ -52,8 +52,7 @@ def test_union_base(idx, sort): assert tm.equalContents(union, everything) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.union(case, sort=sort) if sort is None: @@ -78,8 +77,7 @@ def test_difference_base(idx, sort): tm.assert_index_equal(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = idx.difference(case, sort=sort) tm.assert_index_equal(result, answer) @@ -102,8 +100,7 @@ def test_symmetric_difference(idx, sort): tm.assert_index_equal(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.symmetric_difference(case, sort=sort) tm.assert_index_equal(result, answer) @@ -129,9 +126,7 @@ def test_difference(idx, sort): if sort is None: vals = sorted(vals) - expected = MultiIndex.from_tuples(vals, - sortorder=0, - names=idx.names) + expected = MultiIndex.from_tuples(vals, sortorder=0, names=idx.names) assert isinstance(result, MultiIndex) assert result.equals(expected) @@ -158,7 +153,7 @@ def test_difference(idx, sort): # names not the same chunklet = idx[-3:] - chunklet.names = ['foo', 'baz'] + chunklet.names = ["foo", "baz"] result = first.difference(chunklet, sort=sort) assert result.names == (None, None) @@ -176,9 +171,10 @@ def test_difference(idx, sort): assert first.names == result.names # name from non-empty array - result = first.difference([('foo', 'one')], sort=sort) - expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 
'two'), ( - 'foo', 'two'), ('qux', 'one'), ('qux', 'two')]) + result = first.difference([("foo", "one")], sort=sort) + expected = pd.MultiIndex.from_tuples( + [("bar", "one"), ("baz", "two"), ("foo", "two"), ("qux", "one"), ("qux", "two")] + ) expected.names = first.names assert first.names == result.names @@ -189,7 +185,7 @@ def test_difference(idx, sort): def test_difference_sort_special(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) # sort=None, the default result = idx.difference([]) tm.assert_index_equal(result, idx) @@ -198,19 +194,17 @@ def test_difference_sort_special(): @pytest.mark.xfail(reason="Not implemented.") def test_difference_sort_special_true(): # TODO decide on True behaviour - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) result = idx.difference([], sort=True) - expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + expected = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(result, expected) def test_difference_sort_incomparable(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], - ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) - other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], - ['c', 'd']]) + other = pd.MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) # sort=None, the default # MultiIndex.difference deviates here from other difference # implementations in not catching the TypeError @@ -226,10 +220,8 @@ def test_difference_sort_incomparable(): def test_difference_sort_incomparable_true(): # TODO decide on True behaviour # # sort=True, raises - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], - ['a', 'b']]) - other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], - ['c', 'd']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) + other = pd.MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) with pytest.raises(TypeError): idx.difference(other, sort=True) @@ -299,7 +291,7 @@ def test_intersection(idx, sort): def test_intersect_equal_sort(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) @@ -307,15 +299,15 @@ def test_intersect_equal_sort(): @pytest.mark.xfail(reason="Not implemented.") def test_intersect_equal_sort_true(): # TODO decide on True behaviour - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) - sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) -@pytest.mark.parametrize('slice_', [slice(None), slice(0)]) +@pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_other_empty(slice_): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) # default, sort=None other = idx[slice_] @@ -331,16 +323,16 @@ def test_union_sort_other_empty(slice_): def test_union_sort_other_empty_sort(slice_): # TODO decide on True behaviour # # sort=True - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + 
idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) other = idx[:0] result = idx.union(other, sort=True) - expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + expected = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(result, expected) def test_union_sort_other_incomparable(): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None result = idx.union(idx[:1]) @@ -355,16 +347,17 @@ def test_union_sort_other_incomparable(): def test_union_sort_other_incomparable_sort(): # TODO decide on True behaviour # # sort=True - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) - with pytest.raises(TypeError, match='Cannot compare'): + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) + with pytest.raises(TypeError, match="Cannot compare"): idx.union(idx[:1], sort=True) -@pytest.mark.parametrize("method", ['union', 'intersection', 'difference', - 'symmetric_difference']) +@pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] +) def test_setops_disallow_true(method): - idx1 = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) - idx2 = pd.MultiIndex.from_product([['b', 'c'], [1, 2]]) + idx1 = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + idx2 = pd.MultiIndex.from_product([["b", "c"], [1, 2]]) with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index b3c0bd69475e31..c62bc80cfb53fd 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -33,14 +33,20 @@ def test_sortlevel(idx): def test_sortlevel_not_sort_remaining(): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - sorted_idx, _ = mi.sortlevel('A', sort_remaining=False) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + sorted_idx, _ = mi.sortlevel("A", sort_remaining=False) assert sorted_idx.equals(mi) def test_sortlevel_deterministic(): - tuples = [('bar', 'one'), ('foo', 'two'), ('qux', 'two'), - ('foo', 'one'), ('baz', 'two'), ('qux', 'one')] + tuples = [ + ("bar", "one"), + ("foo", "two"), + ("qux", "two"), + ("foo", "one"), + ("baz", "two"), + ("qux", "one"), + ] index = MultiIndex.from_tuples(tuples) @@ -84,55 +90,55 @@ def test_numpy_argsort(idx): msg = "the 'kind' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(idx, kind='mergesort') + np.argsort(idx, kind="mergesort") msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(idx, order=('a', 'b')) + np.argsort(idx, order=("a", "b")) def test_unsortedindex(): # GH 11897 - mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), - ('x', 'b'), ('y', 'a'), ('z', 'b')], - names=['one', 'two']) - df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, - columns=['one', 'two']) + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) # GH 16734: not sorted, but no real slicing - result = df.loc(axis=0)['z', 'a'] + result = df.loc(axis=0)["z", "a"] expected = df.iloc[0] 
tm.assert_series_equal(result, expected) with pytest.raises(UnsortedIndexError): - df.loc(axis=0)['z', slice('a')] + df.loc(axis=0)["z", slice("a")] df.sort_index(inplace=True) - assert len(df.loc(axis=0)['z', :]) == 2 + assert len(df.loc(axis=0)["z", :]) == 2 with pytest.raises(KeyError): - df.loc(axis=0)['q', :] + df.loc(axis=0)["q", :] def test_unsortedindex_doc_examples(): # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa - dfm = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) + dfm = DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) - dfm = dfm.set_index(['jim', 'joe']) + dfm = dfm.set_index(["jim", "joe"]) with tm.assert_produces_warning(PerformanceWarning): - dfm.loc[(1, 'z')] + dfm.loc[(1, "z")] with pytest.raises(UnsortedIndexError): - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(0, "y"):(1, "z")] assert not dfm.index.is_lexsorted() assert dfm.index.lexsort_depth == 1 # sort it dfm = dfm.sort_index() - dfm.loc[(1, 'z')] - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(1, "z")] + dfm.loc[(0, "y"):(1, "z")] assert dfm.index.is_lexsorted() assert dfm.index.lexsort_depth == 2 @@ -141,9 +147,7 @@ def test_unsortedindex_doc_examples(): def test_reconstruct_sort(): # starts off lexsorted & monotonic - mi = MultiIndex.from_arrays([ - ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] - ]) + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert mi.is_lexsorted() assert mi.is_monotonic @@ -156,9 +160,10 @@ def test_reconstruct_sort(): assert Index(mi.values).equals(Index(recons.values)) # cannot convert to lexsorted - mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), - ('x', 'b'), ('y', 'a'), ('z', 'b')], - names=['one', 'two']) + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) assert not mi.is_lexsorted() assert not mi.is_monotonic @@ -170,9 +175,11 @@ def test_reconstruct_sort(): assert Index(mi.values).equals(Index(recons.values)) # cannot convert to lexsorted - mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], - codes=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=['col1', 'col2']) + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) assert not mi.is_lexsorted() assert not mi.is_monotonic @@ -186,25 +193,27 @@ def test_reconstruct_sort(): def test_reconstruct_remove_unused(): # xref to GH 2770 - df = DataFrame([['deleteMe', 1, 9], - ['keepMe', 2, 9], - ['keepMeToo', 3, 9]], - columns=['first', 'second', 'third']) - df2 = df.set_index(['first', 'second'], drop=False) - df2 = df2[df2['first'] != 'deleteMe'] + df = DataFrame( + [["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]], + columns=["first", "second", "third"], + ) + df2 = df.set_index(["first", "second"], drop=False) + df2 = df2[df2["first"] != "deleteMe"] # removed levels are there - expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'], - [1, 2, 3]], - codes=[[1, 2], [1, 2]], - names=['first', 'second']) + expected = MultiIndex( + levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]], + codes=[[1, 2], [1, 2]], + names=["first", "second"], + ) result = df2.index tm.assert_index_equal(result, expected) - expected = MultiIndex(levels=[['keepMe', 'keepMeToo'], - [2, 3]], - codes=[[0, 1], [0, 1]], - names=['first', 'second']) + expected = MultiIndex( + levels=[["keepMe", "keepMeToo"], [2, 3]], + codes=[[0, 1], 
[0, 1]], + names=["first", "second"], + ) result = df2.index.remove_unused_levels() tm.assert_index_equal(result, expected) @@ -214,10 +223,9 @@ def test_reconstruct_remove_unused(): assert result2.is_(result) -@pytest.mark.parametrize('first_type,second_type', [ - ('int64', 'int64'), - ('datetime64[D]', 'str') -]) +@pytest.mark.parametrize( + "first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")] +) def test_remove_unused_levels_large(first_type, second_type): # GH16556 @@ -227,11 +235,14 @@ def test_remove_unused_levels_large(first_type, second_type): rng = np.random.RandomState(4) # seed is arbitrary value that works size = 1 << 16 - df = DataFrame(dict( - first=rng.randint(0, 1 << 13, size).astype(first_type), - second=rng.randint(0, 1 << 10, size).astype(second_type), - third=rng.rand(size))) - df = df.groupby(['first', 'second']).sum() + df = DataFrame( + dict( + first=rng.randint(0, 1 << 13, size).astype(first_type), + second=rng.randint(0, 1 << 10, size).astype(second_type), + third=rng.rand(size), + ) + ) + df = df.groupby(["first", "second"]).sum() df = df[df.third < 0.1] result = df.index.remove_unused_levels() @@ -239,23 +250,24 @@ def test_remove_unused_levels_large(first_type, second_type): assert len(result.levels[1]) < len(df.index.levels[1]) assert result.equals(df.index) - expected = df.reset_index().set_index(['first', 'second']).index + expected = df.reset_index().set_index(["first", "second"]).index tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('level0', [['a', 'd', 'b'], - ['a', 'd', 'b', 'unused']]) -@pytest.mark.parametrize('level1', [['w', 'x', 'y', 'z'], - ['w', 'x', 'y', 'z', 'unused']]) +@pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]]) +@pytest.mark.parametrize( + "level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]] +) def test_remove_unused_nan(level0, level1): # GH 18417 - mi = pd.MultiIndex(levels=[level0, level1], - codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]]) + mi = pd.MultiIndex( + levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]] + ) result = mi.remove_unused_levels() tm.assert_index_equal(result, mi) for level in 0, 1: - assert('unused' not in result.levels[level]) + assert "unused" not in result.levels[level] def test_argsort(idx): diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index d41e33a2411415..1057ca7bbd6629 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -11,96 +11,111 @@ class TestPeriodIndexArithmetic: # PeriodIndex.shift is used by __add__ and __sub__ def test_pi_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', '2010-12'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) def test_shift(self): - pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='A', 
start='1/1/2002', end='12/1/2010') + pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="A", start="1/1/2002", end="12/1/2010") tm.assert_index_equal(pi1.shift(0), pi1) assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='A', start='1/1/2000', end='12/1/2008') + pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="A", start="1/1/2000", end="12/1/2008") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) - pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='M', start='2/1/2001', end='1/1/2010') + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="2/1/2001", end="1/1/2010") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='M', start='12/1/2000', end='11/1/2009') + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="12/1/2000", end="11/1/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) - pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='D', start='1/2/2001', end='12/2/2009') + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="1/2/2001", end="12/2/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='D', start='12/31/2000', end='11/30/2009') + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="12/31/2000", end="11/30/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) def test_shift_corner_cases(self): # GH#9903 - idx = pd.PeriodIndex([], name='xxx', freq='H') + idx = pd.PeriodIndex([], name="xxx", freq="H") with pytest.raises(TypeError): # period shift doesn't accept freq - idx.shift(1, freq='H') + idx.shift(1, freq="H") tm.assert_index_equal(idx.shift(0), idx) tm.assert_index_equal(idx.shift(3), idx) - idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', freq='H') + idx = pd.PeriodIndex( + ["2011-01-01 10:00", "2011-01-01 11:00" "2011-01-01 12:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(0), idx) - exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', freq='H') + exp = pd.PeriodIndex( + ["2011-01-01 13:00", "2011-01-01 14:00" "2011-01-01 15:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(3), exp) - exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', freq='H') + exp = pd.PeriodIndex( + ["2011-01-01 07:00", "2011-01-01 08:00" "2011-01-01 09:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(-3), exp) def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', '2011-05'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2011-03", "NaT", "2011-05"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) assert 
result.name == expected.name def test_shift_gh8083(self): # test shift for PeriodIndex # GH#8083 - drange = pd.period_range('20130101', periods=5, freq='D') + drange = pd.period_range("20130101", periods=5, freq="D") result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') + expected = PeriodIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) tm.assert_index_equal(result, expected) def test_shift_periods(self): # GH #22458 : argument 'n' was deprecated in favor of 'periods' - idx = period_range(freq='A', start='1/1/2001', end='12/1/2009') + idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): tm.assert_index_equal(idx.shift(n=0), idx) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index 373f42b930425c..2a6e84da229e18 100644 --- a/pandas/tests/indexes/period/test_asfreq.py +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -7,148 +7,143 @@ class TestPeriodIndex: - def test_asfreq(self): - pi1 = period_range(freq='A', start='1/1/2001', end='1/1/2001') - pi2 = period_range(freq='Q', start='1/1/2001', end='1/1/2001') - pi3 = period_range(freq='M', start='1/1/2001', end='1/1/2001') - pi4 = period_range(freq='D', start='1/1/2001', end='1/1/2001') - pi5 = period_range(freq='H', start='1/1/2001', end='1/1/2001 00:00') - pi6 = period_range(freq='Min', start='1/1/2001', end='1/1/2001 00:00') - pi7 = period_range(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') - - assert pi1.asfreq('Q', 'S') == pi2 - assert pi1.asfreq('Q', 's') == pi2 - assert pi1.asfreq('M', 'start') == pi3 - assert pi1.asfreq('D', 'StarT') == pi4 - assert pi1.asfreq('H', 'beGIN') == pi5 - assert pi1.asfreq('Min', 'S') == pi6 - assert pi1.asfreq('S', 'S') == pi7 - - assert pi2.asfreq('A', 'S') == pi1 - assert pi2.asfreq('M', 'S') == pi3 - assert pi2.asfreq('D', 'S') == pi4 - assert pi2.asfreq('H', 'S') == pi5 - assert pi2.asfreq('Min', 'S') == pi6 - assert pi2.asfreq('S', 'S') == pi7 - - assert pi3.asfreq('A', 'S') == pi1 - assert pi3.asfreq('Q', 'S') == pi2 - assert pi3.asfreq('D', 'S') == pi4 - assert pi3.asfreq('H', 'S') == pi5 - assert pi3.asfreq('Min', 'S') == pi6 - assert pi3.asfreq('S', 'S') == pi7 - - assert pi4.asfreq('A', 'S') == pi1 - assert pi4.asfreq('Q', 'S') == pi2 - assert pi4.asfreq('M', 'S') == pi3 - assert pi4.asfreq('H', 'S') == pi5 - assert pi4.asfreq('Min', 'S') == pi6 - assert pi4.asfreq('S', 'S') == pi7 - - assert pi5.asfreq('A', 'S') == pi1 - assert pi5.asfreq('Q', 'S') == pi2 - assert pi5.asfreq('M', 'S') == pi3 - assert pi5.asfreq('D', 'S') == pi4 - assert pi5.asfreq('Min', 'S') == pi6 - assert pi5.asfreq('S', 'S') == pi7 - - assert pi6.asfreq('A', 'S') == pi1 - assert pi6.asfreq('Q', 'S') == pi2 - assert pi6.asfreq('M', 'S') == pi3 - assert pi6.asfreq('D', 'S') == pi4 - assert pi6.asfreq('H', 'S') == pi5 - assert pi6.asfreq('S', 'S') == pi7 - - assert pi7.asfreq('A', 'S') == pi1 - assert pi7.asfreq('Q', 'S') == pi2 - assert pi7.asfreq('M', 'S') == pi3 - assert pi7.asfreq('D', 'S') == pi4 - assert pi7.asfreq('H', 'S') == pi5 - assert pi7.asfreq('Min', 'S') == pi6 + pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi2 = period_range(freq="Q", start="1/1/2001", 
end="1/1/2001") + pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") + pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") + pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") + pi7 = period_range(freq="S", start="1/1/2001", end="1/1/2001 00:00:00") + + assert pi1.asfreq("Q", "S") == pi2 + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("M", "start") == pi3 + assert pi1.asfreq("D", "StarT") == pi4 + assert pi1.asfreq("H", "beGIN") == pi5 + assert pi1.asfreq("Min", "S") == pi6 + assert pi1.asfreq("S", "S") == pi7 + + assert pi2.asfreq("A", "S") == pi1 + assert pi2.asfreq("M", "S") == pi3 + assert pi2.asfreq("D", "S") == pi4 + assert pi2.asfreq("H", "S") == pi5 + assert pi2.asfreq("Min", "S") == pi6 + assert pi2.asfreq("S", "S") == pi7 + + assert pi3.asfreq("A", "S") == pi1 + assert pi3.asfreq("Q", "S") == pi2 + assert pi3.asfreq("D", "S") == pi4 + assert pi3.asfreq("H", "S") == pi5 + assert pi3.asfreq("Min", "S") == pi6 + assert pi3.asfreq("S", "S") == pi7 + + assert pi4.asfreq("A", "S") == pi1 + assert pi4.asfreq("Q", "S") == pi2 + assert pi4.asfreq("M", "S") == pi3 + assert pi4.asfreq("H", "S") == pi5 + assert pi4.asfreq("Min", "S") == pi6 + assert pi4.asfreq("S", "S") == pi7 + + assert pi5.asfreq("A", "S") == pi1 + assert pi5.asfreq("Q", "S") == pi2 + assert pi5.asfreq("M", "S") == pi3 + assert pi5.asfreq("D", "S") == pi4 + assert pi5.asfreq("Min", "S") == pi6 + assert pi5.asfreq("S", "S") == pi7 + + assert pi6.asfreq("A", "S") == pi1 + assert pi6.asfreq("Q", "S") == pi2 + assert pi6.asfreq("M", "S") == pi3 + assert pi6.asfreq("D", "S") == pi4 + assert pi6.asfreq("H", "S") == pi5 + assert pi6.asfreq("S", "S") == pi7 + + assert pi7.asfreq("A", "S") == pi1 + assert pi7.asfreq("Q", "S") == pi2 + assert pi7.asfreq("M", "S") == pi3 + assert pi7.asfreq("D", "S") == pi4 + assert pi7.asfreq("H", "S") == pi5 + assert pi7.asfreq("Min", "S") == pi6 msg = "How must be one of S or E" with pytest.raises(ValueError, match=msg): - pi7.asfreq('T', 'foo') - result1 = pi1.asfreq('3M') - result2 = pi1.asfreq('M') - expected = period_range(freq='M', start='2001-12', end='2001-12') + pi7.asfreq("T", "foo") + result1 = pi1.asfreq("3M") + result2 = pi1.asfreq("M") + expected = period_range(freq="M", start="2001-12", end="2001-12") tm.assert_numpy_array_equal(result1.asi8, expected.asi8) - assert result1.freqstr == '3M' + assert result1.freqstr == "3M" tm.assert_numpy_array_equal(result2.asi8, expected.asi8) - assert result2.freqstr == 'M' + assert result2.freqstr == "M" def test_asfreq_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') - result = idx.asfreq(freq='Q') - expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') + idx = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-04"], freq="M") + result = idx.asfreq(freq="Q") + expected = PeriodIndex(["2011Q1", "2011Q1", "NaT", "2011Q2"], freq="Q") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', '3D']) + @pytest.mark.parametrize("freq", ["D", "3D"]) def test_asfreq_mult_pi(self, freq): - pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') + pi = PeriodIndex(["2001-01", "2001-02", "NaT", "2001-03"], freq="2M") result = pi.asfreq(freq) - exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', - '2001-04-30'], freq=freq) + exp = PeriodIndex(["2001-02-28", "2001-03-31", "NaT", "2001-04-30"], freq=freq) tm.assert_index_equal(result, exp) assert 
result.freq == exp.freq - result = pi.asfreq(freq, how='S') - exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', - '2001-03-01'], freq=freq) + result = pi.asfreq(freq, how="S") + exp = PeriodIndex(["2001-01-01", "2001-02-01", "NaT", "2001-03-01"], freq=freq) tm.assert_index_equal(result, exp) assert result.freq == exp.freq def test_asfreq_combined_pi(self): - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): + pi = pd.PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") + for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): result = pi.asfreq(freq, how=how) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - for freq in ['1D1H', '1H1D']: - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H') - exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], - freq='H') + for freq in ["1D1H", "1H1D"]: + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") tm.assert_index_equal(result, exp) assert result.freq == exp.freq - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H', how='S') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") tm.assert_index_equal(result, exp) assert result.freq == exp.freq def test_asfreq_ts(self): - index = period_range(freq='A', start='1/1/2001', end='12/31/2010') + index = period_range(freq="A", start="1/1/2001", end="12/31/2010") ts = Series(np.random.randn(len(index)), index=index) df = DataFrame(np.random.randn(len(index), 3), index=index) - result = ts.asfreq('D', how='end') - df_result = df.asfreq('D', how='end') - exp_index = index.asfreq('D', how='end') + result = ts.asfreq("D", how="end") + df_result = df.asfreq("D", how="end") + exp_index = index.asfreq("D", how="end") assert len(result) == len(ts) tm.assert_index_equal(result.index, exp_index) tm.assert_index_equal(df_result.index, exp_index) - result = ts.asfreq('D', how='start') + result = ts.asfreq("D", how="start") assert len(result) == len(ts) - tm.assert_index_equal(result.index, index.asfreq('D', how='start')) + tm.assert_index_equal(result.index, index.asfreq("D", how="start")) def test_astype_asfreq(self): - pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D') - exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') - tm.assert_index_equal(pi1.asfreq('M'), exp) - tm.assert_index_equal(pi1.astype('period[M]'), exp) - - exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M') - tm.assert_index_equal(pi1.asfreq('3M'), exp) - tm.assert_index_equal(pi1.astype('period[3M]'), exp) + pi1 = PeriodIndex(["2011-01-01", "2011-02-01", "2011-03-01"], freq="D") + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + tm.assert_index_equal(pi1.asfreq("M"), exp) + tm.assert_index_equal(pi1.astype("period[M]"), exp) + + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="3M") + 
tm.assert_index_equal(pi1.asfreq("3M"), exp) + tm.assert_index_equal(pi1.astype("period[3M]"), exp) diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py index bcf1109358d225..fa57ec2b1f7ca5 100644 --- a/pandas/tests/indexes/period/test_astype.py +++ b/pandas/tests/indexes/period/test_astype.py @@ -7,73 +7,73 @@ class TestPeriodIndexAsType: - @pytest.mark.parametrize('dtype', [ - float, 'timedelta64', 'timedelta64[ns]']) + @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) def test_astype_raises(self, dtype): # GH#13149, GH#13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - msg = 'Cannot cast PeriodArray to dtype' + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + msg = "Cannot cast PeriodArray to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) def test_astype_conversion(self): # GH#13149, GH#13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") result = idx.astype(object) - expected = Index([Period('2016-05-16', freq='D')] + - [Period(NaT, freq='D')] * 3, dtype='object') + expected = Index( + [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, + dtype="object", + ) tm.assert_index_equal(result, expected) result = idx.astype(np.int64) - expected = Int64Index([16937] + [-9223372036854775808] * 3, - dtype=np.int64) + expected = Int64Index([16937] + [-9223372036854775808] * 3, dtype=np.int64) tm.assert_index_equal(result, expected) result = idx.astype(str) expected = Index(str(x) for x in idx) tm.assert_index_equal(result, expected) - idx = period_range('1990', '2009', freq='A') - result = idx.astype('i8') + idx = period_range("1990", "2009", freq="A") + result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8)) tm.assert_numpy_array_equal(result.values, idx.asi8) def test_astype_uint(self): - arr = period_range('2000', periods=2) - expected = pd.UInt64Index(np.array([10957, 10958], dtype='uint64')) + arr = period_range("2000", periods=2) + expected = pd.UInt64Index(np.array([10957, 10958], dtype="uint64")) tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) def test_astype_object(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=object) tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) # TODO: de-duplicate this version (from test_ops) with the one above # (from test_period) def test_astype_object2(self): - idx = pd.period_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [pd.Period('2013-01-31', freq='M'), - 
pd.Period('2013-02-28', freq='M'), - pd.Period('2013-03-31', freq='M'), - pd.Period('2013-04-30', freq='M')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = pd.period_range(start="2013-01-01", periods=4, freq="M", name="idx") + expected_list = [ + pd.Period("2013-01-31", freq="M"), + pd.Period("2013-02-28", freq="M"), + pd.Period("2013-03-31", freq="M"), + pd.Period("2013-04-30", freq="M"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -81,13 +81,16 @@ def test_astype_object2(self): assert result.name == expected.name assert idx.tolist() == expected_list - idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', - '2013-01-04'], freq='D', name='idx') - expected_list = [pd.Period('2013-01-01', freq='D'), - pd.Period('2013-01-02', freq='D'), - pd.Period('NaT', freq='D'), - pd.Period('2013-01-04', freq='D')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = PeriodIndex( + ["2013-01-01", "2013-01-02", "NaT", "2013-01-04"], freq="D", name="idx" + ) + expected_list = [ + pd.Period("2013-01-01", freq="D"), + pd.Period("2013-01-02", freq="D"), + pd.Period("NaT", freq="D"), + pd.Period("2013-01-04", freq="D"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -104,12 +107,13 @@ def test_astype_object2(self): def test_astype_category(self): obj = pd.period_range("2000", periods=2) - result = obj.astype('category') - expected = pd.CategoricalIndex([pd.Period('2000-01-01', freq="D"), - pd.Period('2000-01-02', freq="D")]) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Period("2000-01-01", freq="D"), pd.Period("2000-01-02", freq="D")] + ) tm.assert_index_equal(result, expected) - result = obj._data.astype('category') + result = obj._data.astype("category") expected = expected.values tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 2b420dd7259982..7c10239faad420 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -6,43 +6,38 @@ from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd -from pandas import ( - Index, Period, PeriodIndex, Series, date_range, offsets, period_range) +from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range import pandas.core.indexes.period as period import pandas.util.testing as tm class TestPeriodIndex: - def setup_method(self, method): pass def test_construction_base_constructor(self): # GH 13664 - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='M')] + arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")] tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) - arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + arr = [np.nan, pd.NaT, pd.Period("2011-03", freq="M")] tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - 
pd.Period('2011-03', freq='D')] + arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="D")] tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.Index(np.array(arr), dtype=object)) + tm.assert_index_equal( + pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object) + ) def test_constructor_use_start_freq(self): # GH #1118 - p = Period('4/2/2012', freq='B') + p = Period("4/2/2012", freq="B") with tm.assert_produces_warning(FutureWarning): index = PeriodIndex(start=p, periods=10) - expected = period_range(start='4/2/2012', periods=10, freq='B') + expected = period_range(start="4/2/2012", periods=10, freq="B") tm.assert_index_equal(index, expected) index = period_range(start=p, periods=10) @@ -54,11 +49,11 @@ def test_constructor_field_arrays(self): years = np.arange(1990, 2010).repeat(4)[2:-2] quarters = np.tile(np.arange(1, 5), 20)[2:-2] - index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') - expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") tm.assert_index_equal(index, expected) - index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") tm.assert_numpy_array_equal(index.asi8, index2.asi8) index = PeriodIndex(year=years, quarter=quarters) @@ -69,33 +64,40 @@ def test_constructor_field_arrays(self): msg = "Mismatched Period array lengths" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='M') + PeriodIndex(year=years, month=months, freq="M") with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='2M') + PeriodIndex(year=years, month=months, freq="2M") msg = "Can either instantiate from fields or endpoints, but not both" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='M', - start=Period('2007-01', freq='M')) + PeriodIndex( + year=years, month=months, freq="M", start=Period("2007-01", freq="M") + ) years = [2007, 2007, 2007] months = [1, 2, 3] - idx = PeriodIndex(year=years, month=months, freq='M') - exp = period_range('2007-01', periods=3, freq='M') + idx = PeriodIndex(year=years, month=months, freq="M") + exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) def test_constructor_U(self): # U was used as undefined period with pytest.raises(ValueError, match="Invalid frequency: X"): - period_range('2007-1-1', periods=500, freq='X') + period_range("2007-1-1", periods=500, freq="X") def test_constructor_nano(self): - idx = period_range(start=Period(ordinal=1, freq='N'), - end=Period(ordinal=4, freq='N'), freq='N') - exp = PeriodIndex([Period(ordinal=1, freq='N'), - Period(ordinal=2, freq='N'), - Period(ordinal=3, freq='N'), - Period(ordinal=4, freq='N')], freq='N') + idx = period_range( + start=Period(ordinal=1, freq="N"), end=Period(ordinal=4, freq="N"), freq="N" + ) + exp = PeriodIndex( + [ + Period(ordinal=1, freq="N"), + Period(ordinal=2, freq="N"), + Period(ordinal=3, freq="N"), + Period(ordinal=4, freq="N"), + ], + freq="N", + ) tm.assert_index_equal(idx, exp) def test_constructor_arrays_negative_year(self): @@ -110,34 +112,35 @@ def test_constructor_arrays_negative_year(self): def test_constructor_invalid_quarters(self): msg = "Quarter must be 1 <= q <= 4" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=range(2000, 2004), 
quarter=list(range(4)), - freq='Q-DEC') + PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC") def test_constructor_corner(self): msg = "Not enough parameters to construct Period range" with pytest.raises(ValueError, match=msg): - PeriodIndex(periods=10, freq='A') + PeriodIndex(periods=10, freq="A") - start = Period('2007', freq='A-JUN') - end = Period('2010', freq='A-DEC') + start = Period("2007", freq="A-JUN") + end = Period("2010", freq="A-DEC") msg = "start and end must have same freq" with pytest.raises(ValueError, match=msg): PeriodIndex(start=start, end=end) - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): PeriodIndex(start=start) with pytest.raises(ValueError, match=msg): PeriodIndex(end=end) - result = period_range('2007-01', periods=10.5, freq='M') - exp = period_range('2007-01', periods=10, freq='M') + result = period_range("2007-01", periods=10.5, freq="M") + exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) def test_constructor_fromarraylike(self): - idx = period_range('2007-01', periods=20, freq='M') + idx = period_range("2007-01", periods=20, freq="M") # values is an array of Period, thus can retrieve freq tm.assert_index_equal(PeriodIndex(idx.values), idx) @@ -151,7 +154,7 @@ def test_constructor_fromarraylike(self): msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): - PeriodIndex(data=Period('2007', freq='A')) + PeriodIndex(data=Period("2007", freq="A")) result = PeriodIndex(iter(idx)) tm.assert_index_equal(result, idx) @@ -159,329 +162,344 @@ def test_constructor_fromarraylike(self): result = PeriodIndex(idx) tm.assert_index_equal(result, idx) - result = PeriodIndex(idx, freq='M') + result = PeriodIndex(idx, freq="M") tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq=offsets.MonthEnd()) tm.assert_index_equal(result, idx) - assert result.freq == 'M' + assert result.freq == "M" - result = PeriodIndex(idx, freq='2M') - tm.assert_index_equal(result, idx.asfreq('2M')) - assert result.freq == '2M' + result = PeriodIndex(idx, freq="2M") + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) - tm.assert_index_equal(result, idx.asfreq('2M')) - assert result.freq == '2M' + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" - result = PeriodIndex(idx, freq='D') - exp = idx.asfreq('D', 'e') + result = PeriodIndex(idx, freq="D") + exp = idx.asfreq("D", "e") tm.assert_index_equal(result, exp) def test_constructor_datetime64arr(self): vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) - vals = vals.view(np.dtype('M8[us]')) + vals = vals.view(np.dtype("M8[us]")) msg = r"Wrong dtype: datetime64\[us\]" with pytest.raises(ValueError, match=msg): - PeriodIndex(vals, freq='D') + PeriodIndex(vals, freq="D") - @pytest.mark.parametrize('box', [None, 'series', 'index']) + @pytest.mark.parametrize("box", [None, "series", "index"]) def test_constructor_datetime64arr_ok(self, box): # https://github.com/pandas-dev/pandas/issues/23438 - data = pd.date_range('2017', periods=4, freq="M") + data = pd.date_range("2017", periods=4, freq="M") if box is None: data = data._values - elif box == 'series': + elif box == "series": data = pd.Series(data) - result = PeriodIndex(data, freq='D') - expected = 
PeriodIndex([ - '2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30' - ], freq="D") + result = PeriodIndex(data, freq="D") + expected = PeriodIndex( + ["2017-01-31", "2017-02-28", "2017-03-31", "2017-04-30"], freq="D" + ) tm.assert_index_equal(result, expected) def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-03'], freq='M') + idx = PeriodIndex(["2013-01", "2013-03"], dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-03"], freq="M") tm.assert_index_equal(idx, exp) - assert idx.dtype == 'period[M]' + assert idx.dtype == "period[M]" - idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') - exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') + idx = PeriodIndex(["2013-01-05", "2013-03-05"], dtype="period[3D]") + exp = PeriodIndex(["2013-01-05", "2013-03-05"], freq="3D") tm.assert_index_equal(idx, exp) - assert idx.dtype == 'period[3D]' + assert idx.dtype == "period[3D]" # if we already have a freq and its not the same, then asfreq # (not changed) - idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') + idx = PeriodIndex(["2013-01-01", "2013-01-02"], freq="D") - res = PeriodIndex(idx, dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-01'], freq='M') + res = PeriodIndex(idx, dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-01"], freq="M") tm.assert_index_equal(res, exp) - assert res.dtype == 'period[M]' + assert res.dtype == "period[M]" - res = PeriodIndex(idx, freq='M') + res = PeriodIndex(idx, freq="M") tm.assert_index_equal(res, exp) - assert res.dtype == 'period[M]' + assert res.dtype == "period[M]" - msg = 'specified freq and dtype are different' + msg = "specified freq and dtype are different" with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(['2011-01'], freq='M', dtype='period[D]') + PeriodIndex(["2011-01"], freq="M", dtype="period[D]") def test_constructor_empty(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") assert isinstance(idx, PeriodIndex) assert len(idx) == 0 - assert idx.freq == 'M' + assert idx.freq == "M" - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): pd.PeriodIndex([]) def test_constructor_pi_nat(self): - idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')])) + idx = PeriodIndex( + np.array([Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")]) + ) tm.assert_index_equal(idx, exp) - idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') + idx = PeriodIndex( + [pd.NaT, pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["NaT", "NaT", "2011-01", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(np.array([pd.NaT, pd.NaT, - Period('2011-01', freq='M'), - Period('2011-01', freq='M')])) + idx = PeriodIndex( + np.array( + [ + pd.NaT, + pd.NaT, + Period("2011-01", freq="M"), + Period("2011-01", freq="M"), + ] + ) + ) 
tm.assert_index_equal(idx, exp) - idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', '2011-01'], freq='M') + idx = PeriodIndex([pd.NaT, pd.NaT, "2011-01", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): PeriodIndex([pd.NaT, pd.NaT]) - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): PeriodIndex(np.array([pd.NaT, pd.NaT])) - with pytest.raises(ValueError, match='freq not specified'): - PeriodIndex(['NaT', 'NaT']) + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(["NaT", "NaT"]) - with pytest.raises(ValueError, match='freq not specified'): - PeriodIndex(np.array(['NaT', 'NaT'])) + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array(["NaT", "NaT"])) def test_constructor_incompat_freq(self): msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')]) + PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')])) + PeriodIndex( + np.array( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) + ) # first element is pd.NaT with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')]) + PeriodIndex( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')])) + PeriodIndex( + np.array( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) + ) def test_constructor_mixed(self): - idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex(["2011-01", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex(["NaT", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["NaT", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, - '2012-01-01']) - exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') + idx = PeriodIndex([Period("2011-01-01", freq="D"), pd.NaT, "2012-01-01"]) + exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") tm.assert_index_equal(idx, exp) def test_constructor_simple_new(self): - idx = period_range('2007-01', name='p', periods=2, freq='M') - result = idx._simple_new(idx, name='p', freq=idx.freq) + idx = period_range("2007-01", name="p", periods=2, freq="M") + result = idx._simple_new(idx, name="p", freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype('i8'), name='p', freq=idx.freq) + result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) tm.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): # GH13079 - idx = PeriodIndex([], freq='M', name='p') - result = 
idx._simple_new(idx, name='p', freq='M') + idx = PeriodIndex([], freq="M", name="p") + result = idx._simple_new(idx, name="p", freq="M") tm.assert_index_equal(result, idx) - @pytest.mark.parametrize('floats', [[1.1, 2.1], np.array([1.1, 2.1])]) + @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, floats): msg = r"PeriodIndex\._simple_new does not accept floats" with pytest.raises(TypeError, match=msg): - pd.PeriodIndex._simple_new(floats, freq='M') + pd.PeriodIndex._simple_new(floats, freq="M") msg = "PeriodIndex does not allow floating point in construction" with pytest.raises(TypeError, match=msg): - pd.PeriodIndex(floats, freq='M') + pd.PeriodIndex(floats, freq="M") def test_constructor_nat(self): msg = "start and end must not be NaT" with pytest.raises(ValueError, match=msg): - period_range(start='NaT', end='2011-01-01', freq='M') + period_range(start="NaT", end="2011-01-01", freq="M") with pytest.raises(ValueError, match=msg): - period_range(start='2011-01-01', end='NaT', freq='M') + period_range(start="2011-01-01", end="NaT", freq="M") def test_constructor_year_and_quarter(self): year = pd.Series([2001, 2002, 2003]) quarter = year - 2000 idx = PeriodIndex(year=year, quarter=quarter) - strs = ['%dQ%d' % t for t in zip(quarter, year)] + strs = ["%dQ%d" % t for t in zip(quarter, year)] lops = list(map(Period, strs)) p = PeriodIndex(lops) tm.assert_index_equal(p, idx) - @pytest.mark.parametrize('func, warning', [ - (PeriodIndex, FutureWarning), - (period_range, None) - ]) + @pytest.mark.parametrize( + "func, warning", [(PeriodIndex, FutureWarning), (period_range, None)] + ) def test_constructor_freq_mult(self, func, warning): # GH #7811 with tm.assert_produces_warning(warning): # must be the same, but for sure... 
- pidx = func(start='2014-01', freq='2M', periods=4) - expected = PeriodIndex(['2014-01', '2014-03', - '2014-05', '2014-07'], freq='2M') + pidx = func(start="2014-01", freq="2M", periods=4) + expected = PeriodIndex(["2014-01", "2014-03", "2014-05", "2014-07"], freq="2M") tm.assert_index_equal(pidx, expected) with tm.assert_produces_warning(warning): - pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') - expected = PeriodIndex(['2014-01-02', '2014-01-05', - '2014-01-08', '2014-01-11', - '2014-01-14'], freq='3D') + pidx = func(start="2014-01-02", end="2014-01-15", freq="3D") + expected = PeriodIndex( + ["2014-01-02", "2014-01-05", "2014-01-08", "2014-01-11", "2014-01-14"], + freq="3D", + ) tm.assert_index_equal(pidx, expected) with tm.assert_produces_warning(warning): - pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) - expected = PeriodIndex(['2014-01-01 09:00', '2014-01-01 13:00', - '2014-01-01 17:00'], freq='4H') + pidx = func(end="2014-01-01 17:00", freq="4H", periods=3) + expected = PeriodIndex( + ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4H" + ) tm.assert_index_equal(pidx, expected) - msg = ('Frequency must be positive, because it' - ' represents span: -1M') + msg = "Frequency must be positive, because it" " represents span: -1M" with pytest.raises(ValueError, match=msg): - PeriodIndex(['2011-01'], freq='-1M') + PeriodIndex(["2011-01"], freq="-1M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" with pytest.raises(ValueError, match=msg): - PeriodIndex(['2011-01'], freq='0M') + PeriodIndex(["2011-01"], freq="0M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" with pytest.raises(ValueError, match=msg): - period_range('2011-01', periods=3, freq='0M') + period_range("2011-01", periods=3, freq="0M") - @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'T', 'S']) - @pytest.mark.parametrize('mult', [1, 2, 3, 4, 5]) + @pytest.mark.parametrize("freq", ["A", "M", "D", "T", "S"]) + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) def test_constructor_freq_mult_dti_compat(self, mult, freq): freqstr = str(mult) + freq - pidx = period_range(start='2014-04-01', freq=freqstr, periods=10) - expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freqstr) + pidx = period_range(start="2014-04-01", freq=freqstr, periods=10) + expected = date_range(start="2014-04-01", freq=freqstr, periods=10).to_period( + freqstr + ) tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): - for freq in ['1D1H', '1H1D']: - pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], - freq='25H') - for freq in ['1D1H', '1H1D']: - pidx = period_range(start='2016-01-01', periods=2, freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], - freq='25H') + for freq in ["1D1H", "1H1D"]: + pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25H") + for freq in ["1D1H", "1H1D"]: + pidx = period_range(start="2016-01-01", periods=2, freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") tm.assert_index_equal(pidx, expected) def test_constructor_range_based_deprecated(self): with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(freq='A', 
start='1/1/2001', end='12/1/2009') + pi = PeriodIndex(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 def test_constructor_range_based_deprecated_different_freq(self): with tm.assert_produces_warning(FutureWarning) as m: - PeriodIndex(start='2000', periods=2) + PeriodIndex(start="2000", periods=2) warning, = m assert 'freq="A-DEC"' in str(warning.message) def test_constructor(self): - pi = period_range(freq='A', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") assert len(pi) == 4 * 9 - pi = period_range(freq='M', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") assert len(pi) == 12 * 9 - pi = period_range(freq='D', start='1/1/2001', end='12/31/2009') + pi = period_range(freq="D", start="1/1/2001", end="12/31/2009") assert len(pi) == 365 * 9 + 2 - pi = period_range(freq='B', start='1/1/2001', end='12/31/2009') + pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") assert len(pi) == 261 * 9 - pi = period_range(freq='H', start='1/1/2001', end='12/31/2001 23:00') + pi = period_range(freq="H", start="1/1/2001", end="12/31/2001 23:00") assert len(pi) == 365 * 24 - pi = period_range(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") assert len(pi) == 24 * 60 - pi = period_range(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + pi = period_range(freq="S", start="1/1/2001", end="1/1/2001 23:59:59") assert len(pi) == 24 * 60 * 60 - start = Period('02-Apr-2005', 'B') + start = Period("02-Apr-2005", "B") i1 = period_range(start=start, periods=20) assert len(i1) == 20 assert i1.freq == start.freq assert i1[0] == start - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) assert len(i1) == 10 assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period('2006-12-31', '1w') + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2006-12-31', ('w', 1)) + end_intv = Period("2006-12-31", ("w", 1)) i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2005-05-01', 'B') + end_intv = Period("2005-05-01", "B") i1 = period_range(start=start, end=end_intv) # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) assert len(i2) == 2 assert i2[0] == end_intv - i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) assert len(i2) == 2 assert i2[0] == end_intv # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) @@ -490,28 +508,31 @@ def test_constructor(self): PeriodIndex(vals) def test_constructor_error(self): - start = Period('02-Apr-2005', 'B') - end_intv = Period('2006-12-31', ('w', 1)) + start = Period("02-Apr-2005", "B") + end_intv = Period("2006-12-31", ("w", 1)) - msg = 'start and end must have same 
freq' + msg = "start and end must have same freq" with pytest.raises(ValueError, match=msg): PeriodIndex(start=start, end=end_intv) - msg = ('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) with pytest.raises(ValueError, match=msg): PeriodIndex(start=start) - @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', - 'T', 'S', 'L', 'U', 'N', 'H']) + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] + ) def test_recreate_from_data(self, freq): - org = period_range(start='2001/04/01', freq=freq, periods=1) + org = period_range(start="2001/04/01", freq=freq, periods=1) idx = PeriodIndex(org.values, freq=freq) tm.assert_index_equal(idx, org) def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] - index = PeriodIndex(raw, freq='A') + index = PeriodIndex(raw, freq="A") expected = Index([str(num) for num in raw]) res = index.map(str) @@ -527,18 +548,15 @@ def test_map_with_string_constructor(self): class TestSeriesPeriod: - def setup_method(self, method): - self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) def test_constructor_cant_cast_period(self): msg = "Cannot cast PeriodArray to dtype float64" with pytest.raises(TypeError, match=msg): - Series(period_range('2000-01-01', periods=10, freq='D'), - dtype=float) + Series(period_range("2000-01-01", periods=10, freq="D"), dtype=float) def test_constructor_cast_object(self): - s = Series(period_range('1/1/2000', periods=10), - dtype=PeriodDtype("D")) - exp = Series(period_range('1/1/2000', periods=10)) + s = Series(period_range("1/1/2000", periods=10), dtype=PeriodDtype("D")) + exp = Series(period_range("1/1/2000", periods=10)) tm.assert_series_equal(s, exp) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index e1d1dd307ea76c..c5566f74af11e4 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -7,125 +7,122 @@ def test_to_native_types(): - index = PeriodIndex(['2017-01-01', '2017-01-02', - '2017-01-03'], freq='D') + index = PeriodIndex(["2017-01-01", "2017-01-02", "2017-01-03"], freq="D") # First, with no arguments. 
- expected = np.array(['2017-01-01', '2017-01-02', - '2017-01-03'], dtype='=U10') + expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype="=U10") result = index.to_native_types() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index.to_native_types(na_rep='pandas') + result = index.to_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure slicing works - expected = np.array(['2017-01-01', '2017-01-03'], dtype='=U10') + expected = np.array(["2017-01-01", "2017-01-03"], dtype="=U10") result = index.to_native_types([0, 2]) tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works - expected = np.array(['01-2017-01', '01-2017-02', - '01-2017-03'], dtype='=U10') + expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype="=U10") - result = index.to_native_types(date_format='%m-%Y-%d') + result = index.to_native_types(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work - index = PeriodIndex(['2017-01-01', pd.NaT, '2017-01-03'], freq='D') - expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object) + index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") + expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) result = index.to_native_types() tm.assert_numpy_array_equal(result, expected) - expected = np.array(['2017-01-01', 'pandas', - '2017-01-03'], dtype=object) + expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index.to_native_types(na_rep='pandas') + result = index.to_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) class TestPeriodIndexRendering: - def test_frame_repr(self): - df = pd.DataFrame({"A": [1, 2, 3]}, - index=pd.date_range('2000', periods=3)) + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.date_range("2000", periods=3)) result = repr(df) - expected = ( - ' A\n' - '2000-01-01 1\n' - '2000-01-02 2\n' - '2000-01-03 3') + expected = " A\n" "2000-01-01 1\n" "2000-01-02 2\n" "2000-01-03 3" assert result == expected - @pytest.mark.parametrize('method', ['__repr__', '__str__']) + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): # GH#7601 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") + idx10 = PeriodIndex(["2011-01-01", "2011-02-01"], freq="3D") exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" exp2 = 
"""PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" - exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " - "freq='D')") - - exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]', freq='D')") - - exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " - "freq='A-DEC')") - - exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='period[H]', freq='H')") - - exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " - "dtype='period[Q-DEC]', freq='Q-DEC')") - - exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " - "dtype='period[3D]', freq='3D')") - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9, idx10], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9, exp10]): + exp3 = ( + "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " "freq='D')" + ) + + exp4 = ( + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='period[D]', freq='D')" + ) + + exp5 = ( + "PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " + "freq='A-DEC')" + ) + + exp6 = ( + "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " + "dtype='period[H]', freq='H')" + ) + + exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " "freq='Q-DEC')" + + exp8 = ( + "PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " "freq='Q-DEC')" + ) + + exp9 = ( + "PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')" + ) + + exp10 = ( + "PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')" + ) + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9, idx10], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9, exp10], + ): result = getattr(idx, method)() assert result == expected def test_representation_to_series(self): # GH#10971 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") exp1 = """Series([], dtype: period[D])""" @@ -163,27 +160,25 @@ def test_representation_to_series(self): 2 2013Q3 dtype: period[Q-DEC]""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): 
result = repr(pd.Series(idx)) assert result == expected def test_summary(self): # GH#9116 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") exp1 = """PeriodIndex: 0 entries Freq: D""" @@ -212,9 +207,9 @@ def test_summary(self): exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3 Freq: Q-DEC""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): result = idx._summary() assert result == expected diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 27a690e58b70f1..3f66891caddc33 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -6,102 +6,101 @@ from pandas._libs.tslibs import period as libperiod import pandas as pd -from pandas import ( - DatetimeIndex, Period, PeriodIndex, Series, notna, period_range) +from pandas import DatetimeIndex, Period, PeriodIndex, Series, notna, period_range from pandas.util import testing as tm class TestGetItem: def test_ellipsis(self): # GH#21282 - idx = period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx[...] 
assert result.equals(idx) assert result is not idx def test_getitem(self): - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") for idx in [idx1]: result = idx[0] - assert result == pd.Period('2011-01-01', freq='D') + assert result == pd.Period("2011-01-01", freq="D") result = idx[-1] - assert result == pd.Period('2011-01-31', freq='D') + assert result == pd.Period("2011-01-31", freq="D") result = idx[0:5] - expected = pd.period_range('2011-01-01', '2011-01-05', freq='D', - name='idx') + expected = pd.period_range("2011-01-01", "2011-01-05", freq="D", name="idx") tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[0:10:2] - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05', - '2011-01-07', '2011-01-09'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-07", "2011-01-09"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[-20:-5:3] - expected = pd.PeriodIndex(['2011-01-12', '2011-01-15', - '2011-01-18', - '2011-01-21', '2011-01-24'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-12", "2011-01-15", "2011-01-18", "2011-01-21", "2011-01-24"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[4::-1] - expected = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-03', - '2011-01-02', '2011-01-01'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" def test_getitem_index(self): - idx = period_range('2007-01', periods=10, freq='M', name='x') + idx = period_range("2007-01", periods=10, freq="M", name="x") result = idx[[1, 3, 5]] - exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], - freq='M', name='x') + exp = pd.PeriodIndex(["2007-02", "2007-04", "2007-06"], freq="M", name="x") tm.assert_index_equal(result, exp) - result = idx[[True, True, False, False, False, - True, True, False, False, False]] - exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], - freq='M', name='x') + result = idx[[True, True, False, False, False, True, True, False, False, False]] + exp = pd.PeriodIndex( + ["2007-01", "2007-02", "2007-06", "2007-07"], freq="M", name="x" + ) tm.assert_index_equal(result, exp) def test_getitem_partial(self): - rng = period_range('2007-01', periods=50, freq='M') + rng = period_range("2007-01", periods=50, freq="M") ts = Series(np.random.randn(len(rng)), rng) with pytest.raises(KeyError, match=r"^'2006'$"): - ts['2006'] + ts["2006"] - result = ts['2008'] + result = ts["2008"] assert (result.index.year == 2008).all() - result = ts['2008':'2009'] + result = ts["2008":"2009"] assert len(result) == 24 - result = ts['2008-1':'2009-12'] + result = ts["2008-1":"2009-12"] assert len(result) == 24 - result = ts['2008Q1':'2009Q4'] + result = ts["2008Q1":"2009Q4"] assert len(result) == 24 - result = ts[:'2009'] + result = ts[:"2009"] assert len(result) == 36 - result = ts['2009':] + result = ts["2009":] assert len(result) == 50 - 
24 exp = result @@ -111,10 +110,10 @@ def test_getitem_partial(self): ts = ts[10:].append(ts[10:]) msg = "left slice bound for non-unique label: '2008'" with pytest.raises(KeyError, match=msg): - ts[slice('2008', '2009')] + ts[slice("2008", "2009")] def test_getitem_datetime(self): - rng = period_range(start='2012-01-01', periods=10, freq='W-MON') + rng = period_range(start="2012-01-01", periods=10, freq="W-MON") ts = Series(range(len(rng)), index=rng) dt1 = datetime(2011, 10, 2) @@ -124,36 +123,38 @@ def test_getitem_datetime(self): tm.assert_series_equal(rs, ts) def test_getitem_nat(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') - assert idx[0] == pd.Period('2011-01', freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") + assert idx[0] == pd.Period("2011-01", freq="M") assert idx[1] is pd.NaT s = pd.Series([0, 1, 2], index=idx) assert s[pd.NaT] == 1 s = pd.Series(idx, index=idx) - assert (s[pd.Period('2011-01', freq='M')] == - pd.Period('2011-01', freq='M')) + assert s[pd.Period("2011-01", freq="M")] == pd.Period("2011-01", freq="M") assert s[pd.NaT] is pd.NaT def test_getitem_list_periods(self): # GH 7710 - rng = period_range(start='2012-01-01', periods=10, freq='D') + rng = period_range(start="2012-01-01", periods=10, freq="D") ts = Series(range(len(rng)), index=rng) exp = ts.iloc[[1]] - tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + tm.assert_series_equal(ts[[Period("2012-01-02", freq="D")]], exp) def test_getitem_seconds(self): # GH#6716 - didx = pd.date_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) - pidx = period_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) + didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) for idx in [didx, pidx]: # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: # GH7116 # these show deprecations as we are trying @@ -163,21 +164,26 @@ def test_getitem_seconds(self): continue s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) - tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) - for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s["2013/01/01 10:00"], s[3600:3660]) + tm.assert_series_equal(s["2013/01/01 9H"], s[:3600]) + for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(s[d], s) def test_getitem_day(self): # GH#6716 # Confirm DatetimeIndex and PeriodIndex works identically - didx = pd.date_range(start='2013/01/01', freq='D', periods=400) - pidx = period_range(start='2013/01/01', freq='D', periods=400) + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) for idx in [didx, pidx]: # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: # GH7116 @@ -188,46 +194,44 @@ def test_getitem_day(self): continue s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01'], s[0:31]) - tm.assert_series_equal(s['2013/02'], s[31:59]) - tm.assert_series_equal(s['2014'], s[365:]) + 
tm.assert_series_equal(s["2013/01"], s[0:31]) + tm.assert_series_equal(s["2013/02"], s[31:59]) + tm.assert_series_equal(s["2014"], s[365:]) - invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: with pytest.raises(KeyError): s[v] class TestWhere: - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): - i = period_range('20130101', periods=5, freq='D') + i = period_range("20130101", periods=5, freq="D") cond = [True] * len(i) expected = i result = i.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) - expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') + expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq="D") result = i.where(klass(cond)) tm.assert_index_equal(result, expected) def test_where_other(self): - i = period_range('20130101', periods=5, freq='D') + i = period_range("20130101", periods=5, freq="D") for arr in [np.nan, pd.NaT]: result = i.where(notna(i), other=np.nan) expected = i tm.assert_index_equal(result, expected) i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") result = i.where(notna(i2), i2) tm.assert_index_equal(result, i2) i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") result = i.where(notna(i2), i2.values) tm.assert_index_equal(result, i2) @@ -235,58 +239,65 @@ def test_where_other(self): class TestTake: def test_take(self): # GH#10295 - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") for idx in [idx1]: result = idx.take([0]) - assert result == pd.Period('2011-01-01', freq='D') + assert result == pd.Period("2011-01-01", freq="D") result = idx.take([5]) - assert result == pd.Period('2011-01-06', freq='D') + assert result == pd.Period("2011-01-06", freq="D") result = idx.take([0, 1, 2]) - expected = pd.period_range('2011-01-01', '2011-01-03', freq='D', - name='idx') + expected = pd.period_range("2011-01-01", "2011-01-03", freq="D", name="idx") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" assert result.freq == expected.freq result = idx.take([0, 2, 4]) - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05'], freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([7, 4, 1]) - expected = pd.PeriodIndex(['2011-01-08', '2011-01-05', - '2011-01-02'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-08", "2011-01-05", "2011-01-02"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([3, 2, 5]) - expected = PeriodIndex(['2011-01-04', '2011-01-03', '2011-01-06'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([-3, 2, 
5]) - expected = PeriodIndex(['2011-01-29', '2011-01-03', '2011-01-06'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" def test_take_misc(self): - index = period_range(start='1/1/10', end='12/31/12', freq='D', - name='idx') - expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), - datetime(2010, 1, 9), datetime(2010, 1, 13)], - freq='D', name='idx') + index = period_range(start="1/1/10", end="12/31/12", freq="D", name="idx") + expected = PeriodIndex( + [ + datetime(2010, 1, 6), + datetime(2010, 1, 7), + datetime(2010, 1, 9), + datetime(2010, 1, 13), + ], + freq="D", + name="idx", + ) taken1 = index.take([5, 6, 8, 12]) taken2 = index[[5, 6, 8, 12]] @@ -299,28 +310,33 @@ def test_take_misc(self): def test_take_fill_value(self): # GH#12631 - idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', freq='D') + idx = pd.PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", freq="D" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', freq='D') + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -332,10 +348,9 @@ def test_take_fill_value(self): class TestIndexing: - def test_get_loc_msg(self): - idx = period_range('2000-1-1', freq='A', periods=10) - bad_period = Period('2012', 'A') + idx = period_range("2000-1-1", freq="A", periods=10) + bad_period = Period("2012", "A") with pytest.raises(KeyError, match=r"^Period\('2012', 'A-DEC'\)$"): idx.get_loc(bad_period) @@ -345,21 +360,21 @@ def test_get_loc_msg(self): assert inst.args[0] == bad_period def test_get_loc_nat(self): - didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) - pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + didx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-03"]) + pidx = PeriodIndex(["2011-01-01", "NaT", "2011-01-03"], freq="M") # check DatetimeIndex compat for idx in [didx, pidx]: assert idx.get_loc(pd.NaT) == 1 assert idx.get_loc(None) == 1 - assert idx.get_loc(float('nan')) == 1 + assert idx.get_loc(float("nan")) == 1 assert idx.get_loc(np.nan) == 1 def test_get_loc(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = 
pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") # get the location of p1/p2 from # monotonic increasing PeriodIndex with non-duplicate @@ -374,12 +389,14 @@ def test_get_loc(self): msg = "Cannot interpret 'foo' as period" with pytest.raises(KeyError, match=msg): - idx0.get_loc('foo') + idx0.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.1$"): idx0.get_loc(1.1) - msg = (r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key") + msg = ( + r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) with pytest.raises(TypeError, match=msg): idx0.get_loc(idx0) @@ -396,13 +413,15 @@ def test_get_loc(self): msg = "Cannot interpret 'foo' as period" with pytest.raises(KeyError, match=msg): - idx1.get_loc('foo') + idx1.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.1$"): idx1.get_loc(1.1) - msg = (r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key") + msg = ( + r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) with pytest.raises(TypeError, match=msg): idx1.get_loc(idx1) @@ -419,9 +438,9 @@ def test_get_loc(self): def test_is_monotonic_increasing(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx_inc0 = pd.PeriodIndex([p0, p1, p2]) idx_inc1 = pd.PeriodIndex([p0, p1, p1]) @@ -437,9 +456,9 @@ def test_is_monotonic_increasing(self): def test_is_monotonic_decreasing(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx_inc0 = pd.PeriodIndex([p0, p1, p2]) idx_inc1 = pd.PeriodIndex([p0, p1, p1]) @@ -455,10 +474,10 @@ def test_is_monotonic_decreasing(self): def test_contains(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') - p3 = pd.Period('2017-09-04') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 = pd.Period("2017-09-04") ps0 = [p0, p1, p2] idx0 = pd.PeriodIndex(ps0) @@ -467,17 +486,17 @@ def test_contains(self): assert p in idx0 assert str(p) in idx0 - assert '2017-09-01 00:00:01' in idx0 + assert "2017-09-01 00:00:01" in idx0 - assert '2017-09' in idx0 + assert "2017-09" in idx0 assert p3 not in idx0 def test_get_value(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx0 = pd.PeriodIndex([p0, p1, p2]) input0 = np.array([1, 2, 3]) @@ -502,39 +521,41 @@ def test_get_value(self): def test_get_indexer(self): # GH 17717 - p1 = pd.Period('2017-09-01') - p2 = pd.Period('2017-09-04') - p3 = pd.Period('2017-09-07') + p1 = pd.Period("2017-09-01") + p2 = pd.Period("2017-09-04") + p3 = pd.Period("2017-09-07") - tp0 = pd.Period('2017-08-31') - tp1 = pd.Period('2017-09-02') - tp2 = pd.Period('2017-09-05') - tp3 = pd.Period('2017-09-09') + tp0 = pd.Period("2017-08-31") + tp1 = pd.Period("2017-09-02") + tp2 = pd.Period("2017-09-05") + tp3 = pd.Period("2017-09-09") idx = pd.PeriodIndex([p1, p2, p3]) 
- tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) target = pd.PeriodIndex([tp0, tp1, tp2, tp3]) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2, -1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 0, 1, 2], dtype=np.intp)) - - res = idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 day')) - tm.assert_numpy_array_equal(res, - np.array([0, 0, 1, -1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2, -1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 0, 1, 2], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 day")) + tm.assert_numpy_array_equal(res, np.array([0, 0, 1, -1], dtype=np.intp)) def test_get_indexer_non_unique(self): # GH 17717 - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') - p3 = pd.Period('2017-09-04') - p4 = pd.Period('2017-09-05') + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 = pd.Period("2017-09-04") + p4 = pd.Period("2017-09-05") idx1 = pd.PeriodIndex([p1, p2, p1]) idx2 = pd.PeriodIndex([p2, p1, p3, p4]) @@ -548,85 +569,108 @@ def test_get_indexer_non_unique(self): # TODO: This method came from test_period; de-dup with version above def test_get_loc2(self): - idx = pd.period_range('2000-01-01', periods=3) + idx = pd.period_range("2000-01-01", periods=3) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].asfreq('H', how='start'), method) == 1 + assert idx.get_loc(idx[1].asfreq("H", how="start"), method) == 1 assert idx.get_loc(idx[1].to_timestamp(), method) == 1 - assert idx.get_loc(idx[1].to_timestamp() - .to_pydatetime(), method) == 1 + assert idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 - idx = pd.period_range('2000-01-01', periods=5)[::2] - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance='1 day') == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=pd.Timedelta('1D')) == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=np.timedelta64(1, 'D')) == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=timedelta(1)) == 1 - - msg = 'unit abbreviation w/o a number' + idx = pd.period_range("2000-01-01", periods=5)[::2] + assert idx.get_loc("2000-01-02T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-02T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + + msg = "unit abbreviation w/o a number" with pytest.raises(ValueError, match=msg): - idx.get_loc('2000-01-10', method='nearest', tolerance='foo') + idx.get_loc("2000-01-10", method="nearest", tolerance="foo") - msg = 'Input has different freq=None from PeriodArray\\(freq=D\\)' + msg = "Input has 
different freq=None from PeriodArray\\(freq=D\\)" with pytest.raises(ValueError, match=msg): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour') + idx.get_loc("2000-01-10", method="nearest", tolerance="1 hour") with pytest.raises(KeyError, match=r"^Period\('2000-01-10', 'D'\)$"): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + idx.get_loc("2000-01-10", method="nearest", tolerance="1 day") with pytest.raises( - ValueError, - match='list-like tolerance size must match target index size'): - idx.get_loc('2000-01-10', method='nearest', - tolerance=[pd.Timedelta('1 day').to_timedelta64(), - pd.Timedelta('1 day').to_timedelta64()]) + ValueError, match="list-like tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-10", + method="nearest", + tolerance=[ + pd.Timedelta("1 day").to_timedelta64(), + pd.Timedelta("1 day").to_timedelta64(), + ], + ) # TODO: This method came from test_period; de-dup with version above def test_get_indexer2(self): - idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', - '2000-01-02T01'], freq='H') - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 hour'), - np.array([0, -1, 1], dtype=np.intp)) - - msg = 'Input has different freq=None from PeriodArray\\(freq=H\\)' + idx = pd.period_range("2000-01-01", periods=3).asfreq("H", how="start") + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.PeriodIndex( + ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="H" + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 hour"), + np.array([0, -1, 1], dtype=np.intp), + ) + + msg = "Input has different freq=None from PeriodArray\\(freq=H\\)" with pytest.raises(ValueError, match=msg): - idx.get_indexer(target, 'nearest', tolerance='1 minute') - - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 day'), - np.array([0, 1, 1], dtype=np.intp)) - tol_raw = [pd.Timedelta('1 hour'), - pd.Timedelta('1 hour'), - np.timedelta64(1, 'D'), ] + idx.get_indexer(target, "nearest", tolerance="1 minute") + + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 day"), + np.array([0, 1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + np.timedelta64(1, "D"), + ] tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=[np.timedelta64(x) for x in tol_raw]), - np.array([0, -1, 1], dtype=np.intp)) - tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), - pd.Timedelta('1 hour').to_timedelta64(), - np.timedelta64(1, 'M'), ] + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + np.array([0, 
-1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + np.timedelta64(1, "M"), + ] with pytest.raises( - libperiod.IncompatibleFrequency, - match='Input has different freq=None from'): - idx.get_indexer(target, 'nearest', tolerance=tol_bad) + libperiod.IncompatibleFrequency, match="Input has different freq=None from" + ): + idx.get_indexer(target, "nearest", tolerance=tol_bad) def test_indexing(self): # GH 4390, iat incorrectly indexing - index = period_range('1/1/2001', periods=10) + index = period_range("1/1/2001", periods=10) s = Series(np.random.randn(10), index=index) expected = s[index[0]] result = s.iat[0] @@ -634,7 +678,7 @@ def test_indexing(self): def test_period_index_indexer(self): # GH4125 - idx = pd.period_range('2002-01', '2003-12', freq='M') + idx = pd.period_range("2002-01", "2003-12", freq="M") df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) tm.assert_frame_equal(df, df.loc[idx]) tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 766919735c1913..96042f4dbaba2d 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -10,11 +9,9 @@ class TestPeriodIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) - mask = lambda x: (isinstance(x, DatetimeIndex) or - isinstance(x, PeriodIndex)) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) self.is_valid_objs = [o for o in self.objs if mask(o)] self.not_valid_objs = [o for o in self.objs if not mask(o)] @@ -25,50 +22,72 @@ def test_ops_properties(self): self.check_ops_properties(PeriodArray._bool_ops, f) def test_resolution(self): - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', - 'T', 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', - 'hour', 'minute', 'second', - 'millisecond', 'microsecond']): - - idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + + idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) assert idx.resolution == expected def test_value_counts_unique(self): # GH 7735 - idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + idx = pd.period_range("2011-01-01 09:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), - freq='H') - - exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', - '2011-01-01 16:00', '2011-01-01 15:00', - '2011-01-01 14:00', '2011-01-01 13:00', - '2011-01-01 12:00', '2011-01-01 11:00', - '2011-01-01 10:00', - '2011-01-01 09:00'], freq='H') - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), freq="H") + + exp_idx = PeriodIndex( + [ + "2011-01-01 18:00", + "2011-01-01 17:00", + "2011-01-01 16:00", + "2011-01-01 15:00", + "2011-01-01 14:00", + "2011-01-01 13:00", + "2011-01-01 12:00", + "2011-01-01 11:00", + "2011-01-01 10:00", + "2011-01-01 09:00", + ], + freq="H", + ) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = 
pd.period_range('2011-01-01 09:00', freq='H', - periods=10) + expected = pd.period_range("2011-01-01 09:00", freq="H", periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 08:00', NaT], freq='H') + idx = PeriodIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + NaT, + ], + freq="H", + ) - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - freq='H') + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', - NaT], freq='H') + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: @@ -78,7 +97,7 @@ def test_value_counts_unique(self): def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -90,8 +109,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -99,10 +117,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -115,9 +133,9 @@ def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): assert index.freq == expected_index.freq - pidx = PeriodIndex(['2011', '2012', '2013'], name='pidx', freq='A') + pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A") # for compatibility check - iidx = Index([2011, 2012, 2013], name='idx') + iidx = Index([2011, 2012, 2013], name="idx") for idx in [pidx, iidx]: ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -129,24 +147,23 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) _check_freq(ordered, idx) - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, idx[::-1]) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) _check_freq(ordered, idx[::-1]) - pidx = PeriodIndex(['2011', '2013', '2015', '2012', - '2011'], name='pidx', freq='A') + pidx = PeriodIndex( + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ) 
pexpected = PeriodIndex( - ['2011', '2011', '2012', '2013', '2015'], name='pidx', freq='A') + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ) # for compatibility check - iidx = Index([2011, 2013, 2015, 2012, 2011], name='idx') - iexpected = Index([2011, 2011, 2012, 2013, 2015], name='idx') + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx") for idx, expected in [(pidx, pexpected), (iidx, iexpected)]: ordered = idx.sort_values() tm.assert_index_equal(ordered, expected) @@ -163,33 +180,30 @@ def _check_freq(index, expected_index): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) - pidx = PeriodIndex(['2011', '2013', 'NaT', '2011'], name='pidx', - freq='D') + pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") result = pidx.sort_values() - expected = PeriodIndex(['NaT', '2011', '2011', '2013'], - name='pidx', freq='D') + expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" result = pidx.sort_values(ascending=False) - expected = PeriodIndex( - ['2013', '2011', '2011', 'NaT'], name='pidx', freq='D') + expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" def test_order(self): - for freq in ['D', '2D', '4D']: - idx = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq=freq, name='idx') + for freq in ["D", "2D", "4D"]: + idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" + ) ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -203,60 +217,68 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq assert ordered.freq == freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) expected = idx[::-1] tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) assert ordered.freq == expected.freq assert ordered.freq == freq - idx1 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], freq='D', name='idx1') - exp1 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], freq='D', name='idx1') + idx1 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx1", + ) + exp1 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx1", + ) - idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - freq='D', name='idx2') - exp2 = PeriodIndex(['2011-01-01', 
'2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], - freq='D', name='idx2') + idx2 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx2", + ) + exp2 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx2", + ) - idx3 = PeriodIndex([NaT, '2011-01-03', '2011-01-05', - '2011-01-02', NaT], freq='D', name='idx3') - exp3 = PeriodIndex([NaT, NaT, '2011-01-02', '2011-01-03', - '2011-01-05'], freq='D', name='idx3') + idx3 = PeriodIndex( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], freq="D", name="idx3" + ) + exp3 = PeriodIndex( + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], freq="D", name="idx3" + ) for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: ordered = idx.sort_values() tm.assert_index_equal(ordered, expected) - assert ordered.freq == 'D' + assert ordered.freq == "D" ordered = idx.sort_values(ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - assert ordered.freq == 'D' + assert ordered.freq == "D" ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, expected) exp = np.array([0, 4, 3, 1, 2]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == 'D' + assert ordered.freq == "D" - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == 'D' + assert ordered.freq == "D" def test_shift(self): # This is tested in test_arithmetic @@ -264,29 +286,26 @@ def test_shift(self): def test_nat(self): assert pd.PeriodIndex._na_value is NaT - assert pd.PeriodIndex([], freq='M')._na_value is NaT + assert pd.PeriodIndex([], freq="M")._na_value is NaT - idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D') + idx = pd.PeriodIndex(["2011-01-01", "NaT"], freq="D") assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) - @pytest.mark.parametrize('freq', ['D', 'M']) + @pytest.mark.parametrize("freq", ["D", "M"]) def test_equals(self, freq): # GH#13107 - idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq=freq) + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq=freq) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -295,8 +314,7 @@ def test_equals(self, freq): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq='H') + idx2 = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -318,7 +336,7 @@ def test_equals(self, freq): def 
test_freq_setter_deprecated(self): # GH 20678 - idx = pd.period_range('2018Q1', periods=4, freq='Q') + idx = pd.period_range("2018Q1", periods=4, freq="Q") # no warning for getter with tm.assert_produces_warning(None): diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index dbde7ecf3826d4..00b9803980bc8d 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -7,13 +7,11 @@ class TestPeriodIndex: - def setup_method(self, method): pass def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -21,112 +19,117 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) + assert_slices_equivalent(SLC[Period("2014-10") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10"::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[: Period("2014-10") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10":-1], SLC[:8:-1]) - assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1], - SLC[13:8:-1]) + assert_slices_equivalent(SLC["2015-02":"2014-10":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Period("2015-02") : Period("2014-10") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent(SLC["2015-02" : Period("2014-10") : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period("2015-02") : "2014-10" : -1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0]) + assert_slices_equivalent(SLC["2014-10":"2015-02":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] def test_slice_keep_name(self): - idx = period_range('20010101', periods=10, freq='D', name='bob') + idx = period_range("20010101", periods=10, freq="D", name="bob") assert idx.name == idx[1:].name def test_pindex_slice_index(self): - pi = period_range(start='1/1/10', end='12/31/12', freq='M') + pi = period_range(start="1/1/10", end="12/31/12", freq="M") s = Series(np.random.rand(len(pi)), index=pi) - res = s['2010'] + res = s["2010"] exp = s[0:12] tm.assert_series_equal(res, exp) - res = s['2011'] + res = s["2011"] exp = s[12:24] tm.assert_series_equal(res, exp) def test_range_slice_day(self): # GH#6716 
- didx = pd.date_range(start='2013/01/01', freq='D', periods=400) - pidx = period_range(start='2013/01/01', freq='D', periods=400) + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) for idx in [didx, pidx]: # slices against index should raise IndexError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: with pytest.raises(TypeError): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/02':], s[1:]) - tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) - tm.assert_series_equal(s['2013/02':], s[31:]) - tm.assert_series_equal(s['2014':], s[365:]) + tm.assert_series_equal(s["2013/01/02":], s[1:]) + tm.assert_series_equal(s["2013/01/02":"2013/01/05"], s[1:5]) + tm.assert_series_equal(s["2013/02":], s[31:]) + tm.assert_series_equal(s["2014":], s[365:]) - invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: with pytest.raises(TypeError): idx[v:] def test_range_slice_seconds(self): # GH#6716 - didx = pd.date_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) - pidx = period_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) + didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) for idx in [didx, pidx]: # slices against index should raise IndexError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: with pytest.raises(TypeError): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], - s[300:660]) - tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], - s[3600:3960]) - tm.assert_series_equal(s['2013/01/01 10H':], s[3600:]) - tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) - for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s["2013/01/01 09:05":"2013/01/01 09:10"], s[300:660]) + tm.assert_series_equal( + s["2013/01/01 10:00":"2013/01/01 10:05"], s[3600:3960] + ) + tm.assert_series_equal(s["2013/01/01 10H":], s[3600:]) + tm.assert_series_equal(s[:"2013/01/01 09:30"], s[:1860]) + for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(s[d:], s) def test_range_slice_outofbounds(self): # GH#5407 - didx = pd.date_range(start='2013/10/01', freq='D', periods=10) - pidx = period_range(start='2013/10/01', freq='D', periods=10) + didx = pd.date_range(start="2013/10/01", freq="D", periods=10) + pidx = period_range(start="2013/10/01", freq="D", periods=10) for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) - empty = DataFrame(index=idx.__class__([], freq='D'), - columns=['units']) - empty['units'] = empty['units'].astype('int64') - - tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) - tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) - tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2]) - tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty) - tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty) - tm.assert_frame_equal(df['2013-06':'2013-09'], empty) - tm.assert_frame_equal(df['2013-11':'2013-12'], empty) + 
empty = DataFrame(index=idx.__class__([], freq="D"), columns=["units"]) + empty["units"] = empty["units"].astype("int64") + + tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/09/30":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/01":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/02":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) + tm.assert_frame_equal(df["2013-06":"2013-09"], empty) + tm.assert_frame_equal(df["2013-11":"2013-12"], empty) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index b33982f3d62f35..8b3b66bd1ee6b3 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -6,8 +6,17 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex, Series, - date_range, offsets, period_range) + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + Series, + date_range, + offsets, + period_range, +) from pandas.util import testing as tm from ..datetimelike import DatetimeLike @@ -17,20 +26,21 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex def setup_method(self, method): - self.indices = dict(index=tm.makePeriodIndex(10), - index_dec=period_range('20130101', periods=10, - freq='D')[::-1]) + self.indices = dict( + index=tm.makePeriodIndex(10), + index_dec=period_range("20130101", periods=10, freq="D")[::-1], + ) self.setup_indices() def create_index(self): - return period_range('20130101', periods=5, freq='D') + return period_range("20130101", periods=5, freq="D") def test_pickle_compat_construction(self): pass - @pytest.mark.parametrize('freq', ['D', 'M', 'A']) + @pytest.mark.parametrize("freq", ["D", "M", "A"]) def test_pickle_round_trip(self, freq): - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq=freq) + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) @@ -38,11 +48,15 @@ def test_where(self): # This is handled in test_indexing pass - @pytest.mark.parametrize('use_numpy', [True, False]) - @pytest.mark.parametrize('index', [ - pd.period_range('2000-01-01', periods=3, freq='D'), - pd.period_range('2001-01-01', periods=3, freq='2D'), - pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M')]) + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + pd.period_range("2000-01-01", periods=3, freq="D"), + pd.period_range("2001-01-01", periods=3, freq="2D"), + pd.PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) def test_repeat_freqstr(self, index, use_numpy): # GH10183 expected = PeriodIndex([p for p in index for _ in range(3)]) @@ -52,23 +66,32 @@ def test_repeat_freqstr(self, index, use_numpy): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], freq='H') - - exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], freq='H') - tm.assert_index_equal( - idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x', - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), - pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - tm.assert_index_equal(idx.fillna( 
- pd.Period('2011-01-01', freq='D')), exp) + idx = pd.PeriodIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"], freq="H") + + exp = pd.PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01 10:00", freq="H")), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + "x", + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + pd.Period("2011-01-01", freq="D"), + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01", freq="D")), exp) def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" @@ -87,32 +110,32 @@ def test_difference_freq(self, sort): index = period_range("20160920", "20160925", freq="D") other = period_range("20160921", "20160924", freq="D") - expected = PeriodIndex(["20160920", "20160925"], freq='D') + expected = PeriodIndex(["20160920", "20160925"], freq="D") idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = period_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = PeriodIndex(["20160920", "20160921"], freq='D') + expected = PeriodIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) def test_hash_error(self): - index = period_range('20010101', periods=10) + index = period_range("20010101", periods=10) msg = "unhashable type: '{}'".format(type(index).__name__) with pytest.raises(TypeError, match=msg): hash(index) def test_make_time_series(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) assert isinstance(series, Series) def test_shallow_copy_empty(self): # GH13067 - idx = PeriodIndex([], freq='M') + idx = PeriodIndex([], freq="M") result = idx._shallow_copy() expected = idx @@ -131,36 +154,36 @@ def test_shallow_copy_changing_freq_raises(self): pi._shallow_copy(pi, freq="H") def test_dtype_str(self): - pi = pd.PeriodIndex([], freq='M') + pi = pd.PeriodIndex([], freq="M") with tm.assert_produces_warning(FutureWarning): - assert pi.dtype_str == 'period[M]' + assert pi.dtype_str == "period[M]" assert pi.dtype_str == str(pi.dtype) with tm.assert_produces_warning(FutureWarning): - pi = pd.PeriodIndex([], freq='3M') - assert pi.dtype_str == 'period[3M]' + pi = pd.PeriodIndex([], freq="3M") + assert pi.dtype_str == "period[3M]" assert pi.dtype_str == str(pi.dtype) def test_view_asi8(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) exp = np.array([14975, -9223372036854775808], 
dtype=np.int64) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.view('i8'), exp) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) def test_values(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=np.object) tm.assert_numpy_array_equal(idx.values, exp) @@ -170,52 +193,51 @@ def test_values(self): exp = np.array([], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): - pi = period_range(freq='A', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") assert len(pi) == 4 * 9 - pi = period_range(freq='M', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") assert len(pi) == 12 * 9 - start = Period('02-Apr-2005', 'B') + start = Period("02-Apr-2005", "B") i1 = period_range(start=start, periods=20) assert len(i1) == 20 assert i1.freq == start.freq assert i1[0] == start - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) assert len(i1) == 10 assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period('2006-12-31', '1w') + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2006-12-31', ('w', 1)) + end_intv = Period("2006-12-31", ("w", 1)) i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -225,25 +247,27 @@ def test_period_index_length(self): with pytest.raises(ValueError, match=msg): period_range(start=start, end=end_intv) - end_intv = Period('2005-05-01', 'B') + end_intv = Period("2005-05-01", "B") i1 = period_range(start=start, end=end_intv) - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): period_range(start=start) # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) assert len(i2) == 2 assert i2[0] == end_intv - i2 = 
PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) assert len(i2) == 2 assert i2[0] == end_intv # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) @@ -255,39 +279,52 @@ def test_fields(self): # year, month, day, hour, minute # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter # qyear - pi = period_range(freq='A', start='1/1/2001', end='12/1/2005') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2005") self._check_all_fields(pi) - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2002') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2002") self._check_all_fields(pi) - pi = period_range(freq='M', start='1/1/2001', end='1/1/2002') + pi = period_range(freq="M", start="1/1/2001", end="1/1/2002") self._check_all_fields(pi) - pi = period_range(freq='D', start='12/1/2001', end='6/1/2001') + pi = period_range(freq="D", start="12/1/2001", end="6/1/2001") self._check_all_fields(pi) - pi = period_range(freq='B', start='12/1/2001', end='6/1/2001') + pi = period_range(freq="B", start="12/1/2001", end="6/1/2001") self._check_all_fields(pi) - pi = period_range(freq='H', start='12/31/2001', end='1/1/2002 23:00') + pi = period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00") self._check_all_fields(pi) - pi = period_range(freq='Min', start='12/31/2001', end='1/1/2002 00:20') + pi = period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20") self._check_all_fields(pi) - pi = period_range(freq='S', start='12/31/2001 00:00:00', - end='12/31/2001 00:05:00') + pi = period_range( + freq="S", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" + ) self._check_all_fields(pi) - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) self._check_all_fields(i1) def _check_all_fields(self, periodindex): - fields = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'week', 'dayofweek', 'dayofyear', - 'quarter', 'qyear', 'days_in_month'] + fields = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + ] periods = list(periodindex) s = pd.Series(periodindex) @@ -309,8 +346,8 @@ def _check_all_fields(self, periodindex): def test_period_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = period_range('2011/01/01', periods=6, freq='M') - idx2 = period_range('2013', periods=6, freq='A') + idx1 = period_range("2011/01/01", periods=6, freq="M") + idx2 = period_range("2013", periods=6, freq="A") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -318,11 +355,12 @@ def test_period_set_index_reindex(self): tm.assert_index_equal(df.index, idx2) def test_factorize(self): - idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', - '2014-03', '2014-03'], freq='M') + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) @@ -332,8 +370,9 @@ def 
test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', - '2014-03', '2014-01'], freq='M') + idx2 = pd.PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) arr, idx = idx2.factorize(sort=True) @@ -341,14 +380,13 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) - exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') + exp_idx = PeriodIndex(["2014-03", "2014-02", "2014-01"], freq="M") arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) def test_is_(self): - create_index = lambda: period_range(freq='A', start='1/1/2001', - end='12/1/2009') + create_index = lambda: period_range(freq="A", start="1/1/2001", end="12/1/2009") index = create_index() assert index.is_(index) assert not index.is_(create_index()) @@ -359,55 +397,57 @@ def test_is_(self): index.name = "Apple" assert ind2.is_(index) assert not index.is_(index[:]) - assert not index.is_(index.asfreq('M')) - assert not index.is_(index.asfreq('A')) + assert not index.is_(index.asfreq("M")) + assert not index.is_(index.asfreq("A")) assert not index.is_(index - 2) assert not index.is_(index - 0) def test_contains(self): - rng = period_range('2007-01', freq='M', periods=10) + rng = period_range("2007-01", freq="M", periods=10) - assert Period('2007-01', freq='M') in rng - assert not Period('2007-01', freq='D') in rng - assert not Period('2007-01', freq='2M') in rng + assert Period("2007-01", freq="M") in rng + assert not Period("2007-01", freq="D") in rng + assert not Period("2007-01", freq="2M") in rng def test_contains_nat(self): # see gh-13582 - idx = period_range('2007-01', freq='M', periods=10) + idx = period_range("2007-01", freq="M", periods=10) assert pd.NaT not in idx assert None not in idx - assert float('nan') not in idx + assert float("nan") not in idx assert np.nan not in idx - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") assert pd.NaT in idx assert None in idx - assert float('nan') in idx + assert float("nan") in idx assert np.nan in idx def test_periods_number_check(self): - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): - period_range('2011-1-1', '2012-1-1', 'B') + period_range("2011-1-1", "2012-1-1", "B") def test_start_time(self): # GH 17157 - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="MS") tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): # GH 17157 - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - expected_index = expected_index.shift(1, freq='D').shift(-1, freq='ns') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + expected_index = expected_index.shift(1, 
freq="D").shift(-1, freq="ns") tm.assert_index_equal(index.end_time, expected_index) def test_index_duplicate_periods(self): # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) result = ts[2007] @@ -417,7 +457,7 @@ def test_index_duplicate_periods(self): assert (ts[1:3] == 1).all() # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) result = ts[2007] @@ -425,15 +465,13 @@ def test_index_duplicate_periods(self): tm.assert_series_equal(result, expected) def test_index_unique(self): - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', - tz='US/Eastern') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', - tz='US/Eastern') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN", tz="US/Eastern") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN", tz="US/Eastern") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 @@ -446,68 +484,71 @@ def test_ndarray_compat_properties(self): super().test_ndarray_compat_properties() def test_negative_ordinals(self): - Period(ordinal=-1000, freq='A') - Period(ordinal=0, freq='A') + Period(ordinal=-1000, freq="A") + Period(ordinal=0, freq="A") - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="A") + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="A") tm.assert_index_equal(idx1, idx2) def test_pindex_fieldaccessor_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2012-03', '2012-04'], freq='D', name='name') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2012-03", "2012-04"], freq="D", name="name" + ) - exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name='name') + exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name="name") tm.assert_index_equal(idx.year, exp) - exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name='name') + exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name="name") tm.assert_index_equal(idx.month, exp) def test_pindex_qaccess(self): - pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") s = Series(np.random.rand(len(pi)), index=pi).cumsum() # Todo: fix these accessors! 
- assert s['05Q4'] == s[2] + assert s["05Q4"] == s[2] def test_pindex_multiples(self): with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M') - expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07', - '2011-09', '2011-11'], freq='2M') + pi = PeriodIndex(start="1/1/11", end="12/31/11", freq="2M") + expected = PeriodIndex( + ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], + freq="2M", + ) tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" - pi = period_range(start='1/1/11', end='12/31/11', freq='2M') + pi = period_range(start="1/1/11", end="12/31/11", freq="2M") tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" - pi = period_range(start='1/1/11', periods=6, freq='2M') + pi = period_range(start="1/1/11", periods=6, freq="2M") tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" def test_iteration(self): - index = period_range(start='1/1/10', periods=4, freq='B') + index = period_range(start="1/1/10", periods=4, freq="B") result = list(index) assert isinstance(result[0], Period) assert result[0].freq == index.freq def test_is_full(self): - index = PeriodIndex([2005, 2007, 2009], freq='A') + index = PeriodIndex([2005, 2007, 2009], freq="A") assert not index.is_full - index = PeriodIndex([2005, 2006, 2007], freq='A') + index = PeriodIndex([2005, 2006, 2007], freq="A") assert index.is_full - index = PeriodIndex([2005, 2005, 2007], freq='A') + index = PeriodIndex([2005, 2005, 2007], freq="A") assert not index.is_full - index = PeriodIndex([2005, 2005, 2006], freq='A') + index = PeriodIndex([2005, 2005, 2006], freq="A") assert index.is_full - index = PeriodIndex([2006, 2005, 2005], freq='A') + index = PeriodIndex([2006, 2005, 2005], freq="A") with pytest.raises(ValueError, match="Index is not monotonic"): index.is_full @@ -515,8 +556,8 @@ def test_is_full(self): def test_with_multi_index(self): # #1705 - index = date_range('1/1/2012', periods=4, freq='12H') - index_as_arrays = [index.to_period(freq='D'), index.hour] + index = date_range("1/1/2012", periods=4, freq="12H") + index_as_arrays = [index.to_period(freq="D"), index.hour] s = Series([0, 1, 2, 3], index_as_arrays) @@ -525,7 +566,7 @@ def test_with_multi_index(self): assert isinstance(s.index.values[0][0], Period) def test_convert_array_of_periods(self): - rng = period_range('1/1/2000', periods=20, freq='D') + rng = period_range("1/1/2000", periods=20, freq="D") periods = list(rng) result = pd.Index(periods) @@ -533,8 +574,8 @@ def test_convert_array_of_periods(self): def test_append_concat(self): # #1815 - d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') - d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC') + d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") s1 = Series(np.random.randn(10), d1) s2 = Series(np.random.randn(10), d2) @@ -549,35 +590,36 @@ def test_append_concat(self): def test_pickle_freq(self): # GH2891 - prng = period_range('1/1/2011', '1/1/2012', freq='M') + prng = period_range("1/1/2011", "1/1/2012", freq="M") new_prng = tm.round_trip_pickle(prng) assert new_prng.freq == offsets.MonthEnd() - assert new_prng.freqstr == 'M' + assert new_prng.freqstr == "M" def test_map(self): # test_map_dictlike generally tests - index = 
PeriodIndex([2005, 2007, 2009], freq='A') + index = PeriodIndex([2005, 2007, 2009], freq="A") result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) def test_join_self(self, join_type): - index = period_range('1/1/2000', periods=10) + index = period_range("1/1/2000", periods=10) joined = index.join(index, how=join_type) assert index is joined def test_insert(self): # GH 18295 (test missing) expected = PeriodIndex( - ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q') + ["2017Q1", pd.NaT, "2017Q2", "2017Q3", "2017Q4"], freq="Q" + ) for na in (np.nan, pd.NaT, None): - result = period_range('2017Q1', periods=4, freq='Q').insert(1, na) + result = period_range("2017Q1", periods=4, freq="Q").insert(1, na) tm.assert_index_equal(result, expected) def test_maybe_convert_timedelta(): - pi = PeriodIndex(['2000', '2001'], freq='D') + pi = PeriodIndex(["2000", "2001"], freq="D") offset = offsets.Day(2) assert pi._maybe_convert_timedelta(offset) == 2 assert pi._maybe_convert_timedelta(2) == 2 diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index ca75635e561616..828fab08daceb8 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -5,71 +5,75 @@ class TestPeriodRange: - - @pytest.mark.parametrize('freq', ['D', 'W', 'M', 'Q', 'A']) + @pytest.mark.parametrize("freq", ["D", "W", "M", "Q", "A"]) def test_construction_from_string(self, freq): # non-empty - expected = date_range(start='2017-01-01', periods=5, - freq=freq, name='foo').to_period() + expected = date_range( + start="2017-01-01", periods=5, freq=freq, name="foo" + ).to_period() start, end = str(expected[0]), str(expected[-1]) - result = period_range(start=start, end=end, freq=freq, name='foo') + result = period_range(start=start, end=end, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=start, periods=5, freq=freq, name='foo') + result = period_range(start=start, periods=5, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=5, freq=freq, name='foo') + result = period_range(end=end, periods=5, freq=freq, name="foo") tm.assert_index_equal(result, expected) # empty - expected = PeriodIndex([], freq=freq, name='foo') + expected = PeriodIndex([], freq=freq, name="foo") - result = period_range(start=start, periods=0, freq=freq, name='foo') + result = period_range(start=start, periods=0, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=0, freq=freq, name='foo') + result = period_range(end=end, periods=0, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=end, end=start, freq=freq, name='foo') + result = period_range(start=end, end=start, freq=freq, name="foo") tm.assert_index_equal(result, expected) def test_construction_from_period(self): # upsampling - start, end = Period('2017Q1', freq='Q'), Period('2018Q1', freq='Q') - expected = date_range(start='2017-03-31', end='2018-03-31', freq='M', - name='foo').to_period() - result = period_range(start=start, end=end, freq='M', name='foo') + start, end = Period("2017Q1", freq="Q"), Period("2018Q1", freq="Q") + expected = date_range( + start="2017-03-31", end="2018-03-31", freq="M", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="M", name="foo") tm.assert_index_equal(result, expected) # 
downsampling - start, end = Period('2017-1', freq='M'), Period('2019-12', freq='M') - expected = date_range(start='2017-01-31', end='2019-12-31', freq='Q', - name='foo').to_period() - result = period_range(start=start, end=end, freq='Q', name='foo') + start, end = Period("2017-1", freq="M"), Period("2019-12", freq="M") + expected = date_range( + start="2017-01-31", end="2019-12-31", freq="Q", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="Q", name="foo") tm.assert_index_equal(result, expected) # empty - expected = PeriodIndex([], freq='W', name='foo') + expected = PeriodIndex([], freq="W", name="foo") - result = period_range(start=start, periods=0, freq='W', name='foo') + result = period_range(start=start, periods=0, freq="W", name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=0, freq='W', name='foo') + result = period_range(end=end, periods=0, freq="W", name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=end, end=start, freq='W', name='foo') + result = period_range(start=end, end=start, freq="W", name="foo") tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1') + period_range(start="2017Q1") with pytest.raises(ValueError, match=msg): - period_range(end='2017Q1') + period_range(end="2017Q1") with pytest.raises(ValueError, match=msg): period_range(periods=5) @@ -79,17 +83,17 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1', end='2018Q1', periods=8, freq='Q') + period_range(start="2017Q1", end="2018Q1", periods=8, freq="Q") # start/end NaT - msg = 'start and end must not be NaT' + msg = "start and end must not be NaT" with pytest.raises(ValueError, match=msg): - period_range(start=NaT, end='2018Q1') + period_range(start=NaT, end="2018Q1") with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1', end=NaT) + period_range(start="2017Q1", end=NaT) # invalid periods param - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - period_range(start='2017Q1', periods='foo') + period_range(start="2017Q1", periods="foo") diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py index ac01b4aad81c99..7956b9f26e6efe 100644 --- a/pandas/tests/indexes/period/test_scalar_compat.py +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -6,12 +6,12 @@ class TestPeriodIndexOps: def test_start_time(self): - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="MS") tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - expected_index += Timedelta(1, 'D') - Timedelta(1, 'ns') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", 
freq="M") + expected_index += Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index a9102aeec060cf..94b061330002fd 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -12,9 +12,8 @@ def _permute(obj): class TestPeriodIndex: - def test_joins(self, join_type): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") joined = index.join(index[:-5], how=join_type) @@ -22,86 +21,140 @@ def test_joins(self, join_type): assert joined.freq == index.freq def test_join_self(self, join_type): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") res = index.join(index, how=join_type) assert index is res def test_join_does_not_recur(self): df = tm.makeCustomDataframe( - 3, 2, data_gen_f=lambda *args: np.random.randint(2), - c_idx_type='p', r_idx_type='dt') + 3, + 2, + data_gen_f=lambda *args: np.random.randint(2), + c_idx_type="p", + r_idx_type="dt", + ) s = df.iloc[:2, 0] - res = s.index.join(df.columns, how='outer') - expected = Index([s.index[0], s.index[1], - df.columns[0], df.columns[1]], object) + res = s.index.join(df.columns, how="outer") + expected = Index([s.index[0], s.index[1], df.columns[0], df.columns[1]], object) tm.assert_index_equal(res, expected) @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # union - other1 = pd.period_range('1/1/2000', freq='D', periods=5) - rng1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.PeriodIndex(['2000-01-06', '2000-01-07', - '2000-01-08', '2000-01-09', - '2000-01-10', '2000-01-01', - '2000-01-02', '2000-01-03', - '2000-01-04', '2000-01-05'], - freq='D') - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=8) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', '2000-01-01 12:00', - '2000-01-01 13:00', '2000-01-02 09:00', - '2000-01-02 10:00', '2000-01-02 11:00', - '2000-01-02 12:00', '2000-01-02 13:00'], - freq='H') - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' - '2000-01-01 09:08'], - freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05', '2000-01-01 09:08'], - freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=10) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.PeriodIndex(['2003', '2004', '2005', '2006', '2007', - '1998', '1999', '2000', '2001', '2002'], - freq='A') - - rng8 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000', - '1/5/2000', '1/4/2000'], freq='D') - other8 = pd.period_range('1/6/2000', freq='D', periods=5) - 
expected8 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000', - '1/5/2000', '1/4/2000', '1/6/2000', - '1/7/2000', '1/8/2000', '1/9/2000', - '1/10/2000'], freq='D') - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), - (rng4, other4, expected4), - (rng5, other5, expected5), - (rng6, other6, expected6), - (rng7, other7, expected7), - (rng8, other8, expected8)]: + other1 = pd.period_range("1/1/2000", freq="D", periods=5) + rng1 = pd.period_range("1/6/2000", freq="D", periods=5) + expected1 = pd.PeriodIndex( + [ + "2000-01-06", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-04", + "2000-01-05", + ], + freq="D", + ) + + rng2 = pd.period_range("1/1/2000", freq="D", periods=5) + other2 = pd.period_range("1/4/2000", freq="D", periods=5) + expected2 = pd.period_range("1/1/2000", freq="D", periods=8) + + rng3 = pd.period_range("1/1/2000", freq="D", periods=5) + other3 = pd.PeriodIndex([], freq="D") + expected3 = pd.period_range("1/1/2000", freq="D", periods=5) + + rng4 = pd.period_range("2000-01-01 09:00", freq="H", periods=5) + other4 = pd.period_range("2000-01-02 09:00", freq="H", periods=5) + expected4 = pd.PeriodIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-02 09:00", + "2000-01-02 10:00", + "2000-01-02 11:00", + "2000-01-02 12:00", + "2000-01-02 13:00", + ], + freq="H", + ) + + rng5 = pd.PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="T" + ) + other5 = pd.PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:05" "2000-01-01 09:08"], freq="T" + ) + expected5 = pd.PeriodIndex( + [ + "2000-01-01 09:01", + "2000-01-01 09:03", + "2000-01-01 09:05", + "2000-01-01 09:08", + ], + freq="T", + ) + + rng6 = pd.period_range("2000-01-01", freq="M", periods=7) + other6 = pd.period_range("2000-04-01", freq="M", periods=7) + expected6 = pd.period_range("2000-01-01", freq="M", periods=10) + + rng7 = pd.period_range("2003-01-01", freq="A", periods=5) + other7 = pd.period_range("1998-01-01", freq="A", periods=8) + expected7 = pd.PeriodIndex( + [ + "2003", + "2004", + "2005", + "2006", + "2007", + "1998", + "1999", + "2000", + "2001", + "2002", + ], + freq="A", + ) + + rng8 = pd.PeriodIndex( + ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"], freq="D" + ) + other8 = pd.period_range("1/6/2000", freq="D", periods=5) + expected8 = pd.PeriodIndex( + [ + "1/3/2000", + "1/2/2000", + "1/1/2000", + "1/5/2000", + "1/4/2000", + "1/6/2000", + "1/7/2000", + "1/8/2000", + "1/9/2000", + "1/10/2000", + ], + freq="D", + ) + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + (rng8, other8, expected8), + ]: result_union = rng.union(other, sort=sort) if sort is None: @@ -110,7 +163,7 @@ def test_union(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_misc(self, sort): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") result = index[:-5].union(index[10:], sort=sort) tm.assert_index_equal(result, index) @@ -122,29 +175,29 @@ def test_union_misc(self, sort): assert tm.equalContents(result, index) # raise if different frequencies - index = period_range('1/1/2000', '1/20/2000', freq='D') - index2 
= period_range('1/1/2000', '1/20/2000', freq='W-WED') + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") with pytest.raises(period.IncompatibleFrequency): index.union(index2, sort=sort) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + index3 = period_range("1/1/2000", "1/20/2000", freq="2D") with pytest.raises(period.IncompatibleFrequency): index.join(index3) def test_union_dataframe_index(self): - rng1 = pd.period_range('1/1/1999', '1/1/2012', freq='M') + rng1 = pd.period_range("1/1/1999", "1/1/2012", freq="M") s1 = pd.Series(np.random.randn(len(rng1)), rng1) - rng2 = pd.period_range('1/1/1980', '12/1/2001', freq='M') + rng2 = pd.period_range("1/1/1980", "12/1/2001", freq="M") s2 = pd.Series(np.random.randn(len(rng2)), rng2) - df = pd.DataFrame({'s1': s1, 's2': s2}) + df = pd.DataFrame({"s1": s1, "s2": s2}) - exp = pd.period_range('1/1/1980', '1/1/2012', freq='M') + exp = pd.period_range("1/1/1980", "1/1/2012", freq="M") tm.assert_index_equal(df.index, exp) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") result = index[:-5].intersection(index[10:], sort=sort) tm.assert_index_equal(result, index[10:-5]) @@ -158,69 +211,78 @@ def test_intersection(self, sort): assert tm.equalContents(result, index[10:-5]) # raise if different frequencies - index = period_range('1/1/2000', '1/20/2000', freq='D') - index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") with pytest.raises(period.IncompatibleFrequency): index.intersection(index2, sort=sort) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + index3 = period_range("1/1/2000", "1/20/2000", freq="2D") with pytest.raises(period.IncompatibleFrequency): index.intersection(index3, sort=sort) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_cases(self, sort): - base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx') + base = period_range("6/1/2000", "6/30/2000", freq="D", name="idx") # if target has the same name, it is preserved - rng2 = period_range('5/15/2000', '6/20/2000', freq='D', name='idx') - expected2 = period_range('6/1/2000', '6/20/2000', freq='D', - name='idx') + rng2 = period_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = period_range("6/1/2000", "6/20/2000", freq="D", name="idx") # if target name is different, it will be reset - rng3 = period_range('5/15/2000', '6/20/2000', freq='D', name='other') - expected3 = period_range('6/1/2000', '6/20/2000', freq='D', - name=None) + rng3 = period_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = period_range("6/1/2000", "6/20/2000", freq="D", name=None) - rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = PeriodIndex([], name='idx', freq='D') + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], name="idx", freq="D") - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq # non-monotonic - base = 
PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-02', - '2011-01-03'], freq='D', name='idx') - - rng2 = PeriodIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - freq='D', name='idx') - expected2 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', - name='idx') - - rng3 = PeriodIndex(['2011-01-04', '2011-01-02', '2011-02-02', - '2011-02-03'], - freq='D', name='other') - expected3 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', - name=None) - - rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = PeriodIndex([], freq='D', name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + base = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], + freq="D", + name="idx", + ) + + rng2 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="idx", + ) + expected2 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name="idx") + + rng3 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="other", + ) + expected3 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name=None) + + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], freq="D", name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freq == 'D' + assert result.freq == "D" # empty same freq - rng = date_range('6/1/2000', '6/15/2000', freq='T') + rng = date_range("6/1/2000", "6/15/2000", freq="T") result = rng[0:0].intersection(rng) assert len(result) == 0 @@ -230,54 +292,63 @@ def test_intersection_cases(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, sort): # diff - period_rng = ['1/3/2000', '1/2/2000', '1/1/2000', '1/5/2000', - '1/4/2000'] - rng1 = pd.PeriodIndex(period_rng, freq='D') - other1 = pd.period_range('1/6/2000', freq='D', periods=5) + period_rng = ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"] + rng1 = pd.PeriodIndex(period_rng, freq="D") + other1 = pd.period_range("1/6/2000", freq="D", periods=5) expected1 = rng1 - rng2 = pd.PeriodIndex(period_rng, freq='D') - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000'], - freq='D') + rng2 = pd.PeriodIndex(period_rng, freq="D") + other2 = pd.period_range("1/4/2000", freq="D", periods=5) + expected2 = pd.PeriodIndex(["1/3/2000", "1/2/2000", "1/1/2000"], freq="D") - rng3 = pd.PeriodIndex(period_rng, freq='D') - other3 = pd.PeriodIndex([], freq='D') + rng3 = pd.PeriodIndex(period_rng, freq="D") + other3 = pd.PeriodIndex([], freq="D") expected3 = rng3 - period_rng = ['2000-01-01 10:00', '2000-01-01 09:00', - '2000-01-01 12:00', '2000-01-01 11:00', - '2000-01-01 13:00'] - rng4 = pd.PeriodIndex(period_rng, freq='H') - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) + period_rng = [ + "2000-01-01 10:00", + "2000-01-01 09:00", + "2000-01-01 12:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + ] + rng4 = pd.PeriodIndex(period_rng, freq="H") + other4 = pd.period_range("2000-01-02 09:00", freq="H", periods=5) expected4 = rng4 - rng5 = pd.PeriodIndex(['2000-01-01 09:03', '2000-01-01 09:01', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex( - ['2000-01-01 09:01', 
'2000-01-01 09:05'], freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T') - - period_rng = ['2000-02-01', '2000-01-01', '2000-06-01', - '2000-07-01', '2000-05-01', '2000-03-01', - '2000-04-01'] - rng6 = pd.PeriodIndex(period_rng, freq='M') - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.PeriodIndex(['2000-02-01', '2000-01-01', '2000-03-01'], - freq='M') - - period_rng = ['2003', '2007', '2006', '2005', '2004'] - rng7 = pd.PeriodIndex(period_rng, freq='A') - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.PeriodIndex(['2007', '2006'], freq='A') - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), - (rng4, other4, expected4), - (rng5, other5, expected5), - (rng6, other6, expected6), - (rng7, other7, expected7), ]: + rng5 = pd.PeriodIndex( + ["2000-01-01 09:03", "2000-01-01 09:01", "2000-01-01 09:05"], freq="T" + ) + other5 = pd.PeriodIndex(["2000-01-01 09:01", "2000-01-01 09:05"], freq="T") + expected5 = pd.PeriodIndex(["2000-01-01 09:03"], freq="T") + + period_rng = [ + "2000-02-01", + "2000-01-01", + "2000-06-01", + "2000-07-01", + "2000-05-01", + "2000-03-01", + "2000-04-01", + ] + rng6 = pd.PeriodIndex(period_rng, freq="M") + other6 = pd.period_range("2000-04-01", freq="M", periods=7) + expected6 = pd.PeriodIndex(["2000-02-01", "2000-01-01", "2000-03-01"], freq="M") + + period_rng = ["2003", "2007", "2006", "2005", "2004"] + rng7 = pd.PeriodIndex(period_rng, freq="A") + other7 = pd.period_range("1998-01-01", freq="A", periods=8) + expected7 = pd.PeriodIndex(["2007", "2006"], freq="A") + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + ]: result_difference = rng.difference(other, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 7d69723b4a1181..e52954a1145788 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -7,8 +7,16 @@ import pandas as pd from pandas import ( - DatetimeIndex, Period, PeriodIndex, Series, Timedelta, Timestamp, - date_range, period_range, to_datetime) + DatetimeIndex, + Period, + PeriodIndex, + Series, + Timedelta, + Timestamp, + date_range, + period_range, + to_datetime, +) import pandas.core.indexes.period as period import pandas.util.testing as tm @@ -25,20 +33,19 @@ def _check_freq(self, freq, base_date): tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): - self._check_freq('A', 1970) + self._check_freq("A", 1970) def test_monthly(self): - self._check_freq('M', '1970-01') + self._check_freq("M", "1970-01") - @pytest.mark.parametrize('freq', ['W-THU', 'D', 'B', 'H', 'T', - 'S', 'L', 'U', 'N']) + @pytest.mark.parametrize("freq", ["W-THU", "D", "B", "H", "T", "S", "L", "U", "N"]) def test_freq(self, freq): - self._check_freq(freq, '1970-01-01') + self._check_freq(freq, "1970-01-01") def test_negone_ordinals(self): - freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] + freqs = ["A", "M", "Q", "D", "H", "T", "S"] - period = Period(ordinal=-1, freq='D') + period = Period(ordinal=-1, freq="D") for freq in freqs: repr(period.asfreq(freq)) @@ -47,188 +54,195 @@ def test_negone_ordinals(self): repr(period) assert period.year == 1969 - period = Period(ordinal=-1, freq='B') + 
period = Period(ordinal=-1, freq="B") repr(period) - period = Period(ordinal=-1, freq='W') + period = Period(ordinal=-1, freq="W") repr(period) class TestPeriodIndex: def test_to_timestamp(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') - series = Series(1, index=index, name='foo') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + series = Series(1, index=index, name="foo") - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - result = series.to_timestamp(how='end') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + result = series.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - assert result.name == 'foo' + assert result.name == "foo" - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = series.to_timestamp(how='start') + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = series.to_timestamp(how="start") tm.assert_index_equal(result.index, exp_index) - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) + def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) delta = timedelta(hours=23) - result = series.to_timestamp('H', 'end') + result = series.to_timestamp("H", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) - result = series.to_timestamp('T', 'end') + result = series.to_timestamp("T", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - result = series.to_timestamp('S', 'end') + result = series.to_timestamp("S", "end") delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - index = period_range(freq='H', start='1/1/2001', end='1/2/2001') - series = Series(1, index=index, name='foo') + index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + series = Series(1, index=index, name="foo") - exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', - freq='H') - result = series.to_timestamp(how='end') - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + result = series.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - assert result.name == 'foo' + assert result.name == "foo" def test_to_timestamp_freq(self): - idx = pd.period_range('2017', periods=12, freq="A-DEC") + idx = pd.period_range("2017", periods=12, freq="A-DEC") result = idx.to_timestamp() expected = pd.date_range("2017", periods=12, freq="AS-JAN") tm.assert_index_equal(result, expected) def test_to_timestamp_repr_is_code(self): - zs = [Timestamp('99-04-17 00:00:00', tz='UTC'), - 
Timestamp('2001-04-17 00:00:00', tz='UTC'), - Timestamp('2001-04-17 00:00:00', tz='America/Los_Angeles'), - Timestamp('2001-04-17 00:00:00', tz=None)] + zs = [ + Timestamp("99-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="America/Los_Angeles"), + Timestamp("2001-04-17 00:00:00", tz=None), + ] for z in zs: assert eval(repr(z)) == z def test_to_timestamp_to_period_astype(self): - idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') + idx = DatetimeIndex([pd.NaT, "2011-01-01", "2011-02-01"], name="idx") - res = idx.astype('period[M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + res = idx.astype("period[M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") tm.assert_index_equal(res, exp) - res = idx.astype('period[3M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + res = idx.astype("period[3M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") tm.assert_index_equal(res, exp) def test_dti_to_period(self): - dti = pd.date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = pd.date_range(start="1/1/2005", end="12/1/2005", freq="M") pi1 = dti.to_period() - pi2 = dti.to_period(freq='D') - pi3 = dti.to_period(freq='3D') - - assert pi1[0] == Period('Jan 2005', freq='M') - assert pi2[0] == Period('1/31/2005', freq='D') - assert pi3[0] == Period('1/31/2005', freq='3D') - - assert pi1[-1] == Period('Nov 2005', freq='M') - assert pi2[-1] == Period('11/30/2005', freq='D') - assert pi3[-1], Period('11/30/2005', freq='3D') - - tm.assert_index_equal(pi1, period_range('1/1/2005', '11/1/2005', - freq='M')) - tm.assert_index_equal(pi2, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('D')) - tm.assert_index_equal(pi3, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('3D')) - - @pytest.mark.parametrize('month', MONTHS) + pi2 = dti.to_period(freq="D") + pi3 = dti.to_period(freq="3D") + + assert pi1[0] == Period("Jan 2005", freq="M") + assert pi2[0] == Period("1/31/2005", freq="D") + assert pi3[0] == Period("1/31/2005", freq="3D") + + assert pi1[-1] == Period("Nov 2005", freq="M") + assert pi2[-1] == Period("11/30/2005", freq="D") + assert pi3[-1], Period("11/30/2005", freq="3D") + + tm.assert_index_equal(pi1, period_range("1/1/2005", "11/1/2005", freq="M")) + tm.assert_index_equal( + pi2, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("D") + ) + tm.assert_index_equal( + pi3, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("3D") + ) + + @pytest.mark.parametrize("month", MONTHS) def test_to_period_quarterly(self, month): # make sure we can make the round trip - freq = 'Q-%s' % month - rng = period_range('1989Q3', '1991Q3', freq=freq) + freq = "Q-%s" % month + rng = period_range("1989Q3", "1991Q3", freq=freq) stamps = rng.to_timestamp() result = stamps.to_period(freq) tm.assert_index_equal(rng, result) - @pytest.mark.parametrize('off', ['BQ', 'QS', 'BQS']) + @pytest.mark.parametrize("off", ["BQ", "QS", "BQS"]) def test_to_period_quarterlyish(self, off): - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == 'Q-DEC' + assert prng.freq == "Q-DEC" - @pytest.mark.parametrize('off', ['BA', 'AS', 'BAS']) + @pytest.mark.parametrize("off", ["BA", "AS", "BAS"]) def test_to_period_annualish(self, off): - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, 
freq=off) prng = rng.to_period() - assert prng.freq == 'A-DEC' + assert prng.freq == "A-DEC" def test_to_period_monthish(self): - offsets = ['MS', 'BM'] + offsets = ["MS", "BM"] for off in offsets: - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == 'M' + assert prng.freq == "M" - rng = date_range('01-Jan-2012', periods=8, freq='M') + rng = date_range("01-Jan-2012", periods=8, freq="M") prng = rng.to_period() - assert prng.freq == 'M' + assert prng.freq == "M" msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - date_range('01-Jan-2012', periods=8, freq='EOM') + date_range("01-Jan-2012", periods=8, freq="EOM") def test_period_dt64_round_trip(self): - dti = date_range('1/1/2000', '1/7/2002', freq='B') + dti = date_range("1/1/2000", "1/7/2002", freq="B") pi = dti.to_period() tm.assert_index_equal(pi.to_timestamp(), dti) - dti = date_range('1/1/2000', '1/7/2002', freq='B') - pi = dti.to_period(freq='H') + dti = date_range("1/1/2000", "1/7/2002", freq="B") + pi = dti.to_period(freq="H") tm.assert_index_equal(pi.to_timestamp(), dti) def test_combine_first(self): # GH#3367 - didx = pd.date_range(start='1950-01-31', end='1950-07-31', freq='M') - pidx = pd.period_range(start=pd.Period('1950-1'), - end=pd.Period('1950-7'), freq='M') + didx = pd.date_range(start="1950-01-31", end="1950-07-31", freq="M") + pidx = pd.period_range( + start=pd.Period("1950-1"), end=pd.Period("1950-7"), freq="M" + ) # check to be consistent with DatetimeIndex for idx in [didx, pidx]: a = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx) b = pd.Series([9, 9, 9, 9, 9, 9, 9], index=idx) result = a.combine_first(b) - expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, - dtype=np.float64) + expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, dtype=np.float64) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', '2D']) + @pytest.mark.parametrize("freq", ["D", "2D"]) def test_searchsorted(self, freq): - pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03', - '2014-01-04', '2014-01-05'], freq=freq) + pidx = pd.PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq=freq, + ) - p1 = pd.Period('2014-01-01', freq=freq) + p1 = pd.Period("2014-01-01", freq=freq) assert pidx.searchsorted(p1) == 0 - p2 = pd.Period('2014-01-04', freq=freq) + p2 = pd.Period("2014-01-04", freq=freq) assert pidx.searchsorted(p2) == 3 msg = "Input has different freq=H from PeriodIndex" with pytest.raises(period.IncompatibleFrequency, match=msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='H')) + pidx.searchsorted(pd.Period("2014-01-01", freq="H")) msg = "Input has different freq=5D from PeriodIndex" with pytest.raises(period.IncompatibleFrequency, match=msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + pidx.searchsorted(pd.Period("2014-01-01", freq="5D")) class TestPeriodIndexConversion: def test_tolist(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") rs = index.tolist() for x in rs: assert isinstance(x, Period) @@ -238,37 +252,34 @@ def test_tolist(self): def test_to_timestamp_pi_nat(self): # GH#7228 - index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', - name='idx') + index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") - result = index.to_timestamp('D') - expected = 
DatetimeIndex([pd.NaT, datetime(2011, 1, 1), - datetime(2011, 2, 1)], name='idx') + result = index.to_timestamp("D") + expected = DatetimeIndex( + [pd.NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx" + ) tm.assert_index_equal(result, expected) - assert result.name == 'idx' + assert result.name == "idx" - result2 = result.to_period(freq='M') + result2 = result.to_period(freq="M") tm.assert_index_equal(result2, index) - assert result2.name == 'idx' + assert result2.name == "idx" - result3 = result.to_period(freq='3M') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], - freq='3M', name='idx') + result3 = result.to_period(freq="3M") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") tm.assert_index_equal(result3, exp) - assert result3.freqstr == '3M' + assert result3.freqstr == "3M" - msg = ('Frequency must be positive, because it' - ' represents span: -2A') + msg = "Frequency must be positive, because it" " represents span: -2A" with pytest.raises(ValueError, match=msg): - result.to_period(freq='-2A') + result.to_period(freq="-2A") def test_to_timestamp_preserve_name(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009', - name='foo') - assert index.name == 'foo' + index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + assert index.name == "foo" - conv = index.to_timestamp('D') - assert conv.name == 'foo' + conv = index.to_timestamp("D") + assert conv.name == "foo" def test_to_timestamp_quarterly_bug(self): years = np.arange(1960, 2000).repeat(4) @@ -276,69 +287,66 @@ def test_to_timestamp_quarterly_bug(self): pindex = PeriodIndex(year=years, quarter=quarters) - stamps = pindex.to_timestamp('D', 'end') - expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + stamps = pindex.to_timestamp("D", "end") + expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) tm.assert_index_equal(stamps, expected) def test_to_timestamp_pi_mult(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], - freq='2M', name='idx') + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(['2011-01-01', 'NaT', '2011-02-01'], - name='idx') + expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex(['2011-02-28', 'NaT', '2011-03-31'], - name='idx') - expected = expected + Timedelta(1, 'D') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E") + expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx") + expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_combined(self): - idx = period_range(start='2011', periods=2, freq='1D1H', name='idx') + idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(['2011-01-01 00:00', '2011-01-02 01:00'], - name='idx') + expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex(['2011-01-02 00:59:59', - '2011-01-03 01:59:59'], - name='idx') - expected = expected + Timedelta(1, 's') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx" + ) + expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") 
tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E', freq='H') - expected = DatetimeIndex(['2011-01-02 00:00', '2011-01-03 01:00'], - name='idx') - expected = expected + Timedelta(1, 'h') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E", freq="H") + expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") + expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) def test_period_astype_to_timestamp(self): - pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) - tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) - exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns') - tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - tz='US/Eastern') - res = pi.astype('datetime64[ns, US/Eastern]') - tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], - tz='US/Eastern') - exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns') - res = pi.astype('datetime64[ns, US/Eastern]', how='end') + pi = pd.PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + + exp = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) + tm.assert_index_equal(pi.astype("datetime64[ns]"), exp) + + exp = pd.DatetimeIndex(["2011-01-31", "2011-02-28", "2011-03-31"]) + exp = exp + Timedelta(1, "D") - Timedelta(1, "ns") + tm.assert_index_equal(pi.astype("datetime64[ns]", how="end"), exp) + + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern" + ) + res = pi.astype("datetime64[ns, US/Eastern]") + tm.assert_index_equal(pi.astype("datetime64[ns, US/Eastern]"), exp) + + exp = pd.DatetimeIndex( + ["2011-01-31", "2011-02-28", "2011-03-31"], tz="US/Eastern" + ) + exp = exp + Timedelta(1, "D") - Timedelta(1, "ns") + res = pi.astype("datetime64[ns, US/Eastern]", how="end") tm.assert_index_equal(res, exp) def test_to_timestamp_1703(self): - index = period_range('1/1/2012', periods=4, freq='D') + index = period_range("1/1/2012", periods=4, freq="D") result = index.to_timestamp() - assert result[0] == Timestamp('1/1/2012') + assert result[0] == Timestamp("1/1/2012") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a3563838e048da..f0382a040e0631 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -19,11 +19,25 @@ import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, Int64Index, - PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, date_range, - isna, period_range) + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + date_range, + isna, + period_range, +) from pandas.core.index import ( - _get_combined_index, ensure_index, ensure_index_from_sequences) + _get_combined_index, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.api import Index, MultiIndex from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base @@ -35,25 +49,26 @@ class TestIndex(Base): _holder = Index def setup_method(self, method): - self.indices = dict(unicodeIndex=tm.makeUnicodeIndex(100), - strIndex=tm.makeStringIndex(100), - 
dateIndex=tm.makeDateIndex(100), - periodIndex=tm.makePeriodIndex(100), - tdIndex=tm.makeTimedeltaIndex(100), - intIndex=tm.makeIntIndex(100), - uintIndex=tm.makeUIntIndex(100), - rangeIndex=tm.makeRangeIndex(100), - floatIndex=tm.makeFloatIndex(100), - boolIndex=Index([True, False]), - catIndex=tm.makeCategoricalIndex(100), - empty=Index([]), - tuples=MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - repeats=Index([0, 0, 1, 1, 2, 2])) + self.indices = dict( + unicodeIndex=tm.makeUnicodeIndex(100), + strIndex=tm.makeStringIndex(100), + dateIndex=tm.makeDateIndex(100), + periodIndex=tm.makePeriodIndex(100), + tdIndex=tm.makeTimedeltaIndex(100), + intIndex=tm.makeIntIndex(100), + uintIndex=tm.makeUIntIndex(100), + rangeIndex=tm.makeRangeIndex(100), + floatIndex=tm.makeFloatIndex(100), + boolIndex=Index([True, False]), + catIndex=tm.makeCategoricalIndex(100), + empty=Index([]), + tuples=MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + repeats=Index([0, 0, 1, 1, 2, 2]), + ) self.setup_indices() def create_index(self): - return Index(list('abcde')) + return Index(list("abcde")) def generate_index_types(self, skip_index_keys=[]): """ @@ -76,9 +91,9 @@ def test_new_axis(self): def test_copy_and_deepcopy(self): new_copy2 = self.intIndex.copy(dtype=int) - assert new_copy2.dtype.kind == 'i' + assert new_copy2.dtype.kind == "i" - @pytest.mark.parametrize("attr", ['strIndex', 'dateIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "dateIndex"]) def test_constructor_regular(self, attr): # regular instance creation index = getattr(self, attr) @@ -94,9 +109,9 @@ def test_constructor_casting(self): def test_constructor_copy(self): # copy arr = np.array(self.strIndex) - index = Index(arr, copy=True, name='name') + index = Index(arr, copy=True, name="name") assert isinstance(index, Index) - assert index.name == 'name' + assert index.name == "name" tm.assert_numpy_array_equal(arr, index.values) arr[0] = "SOMEBIGLONGSTRING" assert index[0] != "SOMEBIGLONGSTRING" @@ -107,13 +122,14 @@ def test_constructor_copy(self): def test_constructor_corner(self): # corner case - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - " kind, 0 was passed") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + " kind, 0 was passed" + ) with pytest.raises(TypeError, match=msg): Index(0) - @pytest.mark.parametrize("index_vals", [ - [('A', 1), 'B'], ['B', ('A', 1)]]) + @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]]) def test_construction_list_mixed_tuples(self, index_vals): # see gh-10697: if we are constructing from a mixed list of tuples, # make sure that we are independent of the sorting order. 
@@ -121,23 +137,31 @@ def test_construction_list_mixed_tuples(self, index_vals): assert isinstance(index, Index) assert not isinstance(index, MultiIndex) - @pytest.mark.parametrize('na_value', [None, np.nan]) - @pytest.mark.parametrize('vtype', [list, tuple, iter]) + @pytest.mark.parametrize("na_value", [None, np.nan]) + @pytest.mark.parametrize("vtype", [list, tuple, iter]) def test_construction_list_tuples_nan(self, na_value, vtype): # GH 18505 : valid tuples containing NaN - values = [(1, 'two'), (3., na_value)] + values = [(1, "two"), (3.0, na_value)] result = Index(vtype(values)) expected = MultiIndex.from_tuples(values) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("cast_as_obj", [True, False]) - @pytest.mark.parametrize("index", [ - pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern', name='Green Eggs & Ham'), # DTI with tz - pd.date_range('2015-01-01 10:00', freq='D', periods=3), # DTI no tz - pd.timedelta_range('1 days', freq='D', periods=3), # td - pd.period_range('2015-01-01', freq='D', periods=3) # period - ]) + @pytest.mark.parametrize( + "index", + [ + pd.date_range( + "2015-01-01 10:00", + freq="D", + periods=3, + tz="US/Eastern", + name="Green Eggs & Ham", + ), # DTI with tz + pd.date_range("2015-01-01 10:00", freq="D", periods=3), # DTI no tz + pd.timedelta_range("1 days", freq="D", periods=3), # td + pd.period_range("2015-01-01", freq="D", periods=3), # period + ], + ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: result = pd.Index(index.astype(object)) @@ -157,12 +181,17 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): assert result.dtype == np.object_ assert list(result) == list(index) - @pytest.mark.parametrize("index,has_tz", [ - (pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern'), True), # datetimetz - (pd.timedelta_range('1 days', freq='D', periods=3), False), # td - (pd.period_range('2015-01-01', freq='D', periods=3), False) # period - ]) + @pytest.mark.parametrize( + "index,has_tz", + [ + ( + pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern"), + True, + ), # datetimetz + (pd.timedelta_range("1 days", freq="D", periods=3), False), # td + (pd.period_range("2015-01-01", freq="D", periods=3), False), # period + ], + ) def test_constructor_from_series_dtlike(self, index, has_tz): result = pd.Index(pd.Series(index)) tm.assert_index_equal(result, index) @@ -172,50 +201,56 @@ def test_constructor_from_series_dtlike(self, index, has_tz): @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) def test_constructor_from_series(self, klass): - expected = DatetimeIndex([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')]) - s = Series([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')]) + expected = DatetimeIndex( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + ) + s = Series( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + ) result = klass(s) tm.assert_index_equal(result, expected) def test_constructor_from_series_freq(self): # GH 6273 # create from a series, passing a freq - dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] - expected = DatetimeIndex(dts, freq='MS') + dts = ["1-1-1990", "2-1-1990", "3-1-1990", "4-1-1990", "5-1-1990"] + expected = DatetimeIndex(dts, freq="MS") s = Series(pd.to_datetime(dts)) - result = DatetimeIndex(s, freq='MS') + result = DatetimeIndex(s, freq="MS") tm.assert_index_equal(result, expected) def 
test_constructor_from_frame_series_freq(self): # GH 6273 # create from a series, passing a freq - dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] - expected = DatetimeIndex(dts, freq='MS') + dts = ["1-1-1990", "2-1-1990", "3-1-1990", "4-1-1990", "5-1-1990"] + expected = DatetimeIndex(dts, freq="MS") df = pd.DataFrame(np.random.rand(5, 3)) - df['date'] = dts - result = DatetimeIndex(df['date'], freq='MS') + df["date"] = dts + result = DatetimeIndex(df["date"], freq="MS") - assert df['date'].dtype == object - expected.name = 'date' + assert df["date"].dtype == object + expected.name = "date" tm.assert_index_equal(result, expected) - expected = pd.Series(dts, name='date') - tm.assert_series_equal(df['date'], expected) + expected = pd.Series(dts, name="date") + tm.assert_series_equal(df["date"], expected) # GH 6274 # infer freq of same - freq = pd.infer_freq(df['date']) - assert freq == 'MS' + freq = pd.infer_freq(df["date"]) + assert freq == "MS" - @pytest.mark.parametrize("array", [ - np.arange(5), np.array(['a', 'b', 'c']), date_range( - '2000-01-01', periods=3).values - ]) + @pytest.mark.parametrize( + "array", + [ + np.arange(5), + np.array(["a", "b", "c"]), + date_range("2000-01-01", periods=3).values, + ], + ) def test_constructor_ndarray_like(self, array): # GH 5460#issuecomment-44474502 # it should be possible to convert any object that satisfies the numpy @@ -231,9 +266,10 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32', - 'uint16', 'uint8']) + @pytest.mark.parametrize( + "dtype", + [int, "int64", "int32", "int16", "int8", "uint64", "uint32", "uint16", "uint8"], + ) def test_constructor_int_dtype_float(self, dtype): # GH 18400 if is_unsigned_integer_dtype(dtype): @@ -242,17 +278,17 @@ def test_constructor_int_dtype_float(self, dtype): index_type = Int64Index expected = index_type([0, 1, 2, 3]) - result = Index([0., 1., 2., 3.], dtype=dtype) + result = Index([0.0, 1.0, 2.0, 3.0], dtype=dtype) tm.assert_index_equal(result, expected) def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] expected = Float64Index(data) - result = Index(data, dtype='float') + result = Index(data, dtype="float") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("dtype", ['int64', 'uint64']) + @pytest.mark.parametrize("dtype", ["int64", "uint64"]) def test_constructor_int_dtype_nan_raises(self, dtype): # see gh-15187 data = [np.nan] @@ -266,10 +302,13 @@ def test_constructor_no_pandas_array(self): expected = pd.Index([1, 2, 3]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass,dtype,na_val", [ - (pd.Float64Index, np.float64, np.nan), - (pd.DatetimeIndex, 'datetime64[ns]', pd.NaT) - ]) + @pytest.mark.parametrize( + "klass,dtype,na_val", + [ + (pd.Float64Index, np.float64, np.nan), + (pd.DatetimeIndex, "datetime64[ns]", pd.NaT), + ], + ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): # GH 13467 na_list = [na_val, na_val] @@ -283,12 +322,14 @@ def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("pos", [0, 1]) - @pytest.mark.parametrize("klass,dtype,ctor", [ - (pd.DatetimeIndex, 'datetime64[ns]', np.datetime64('nat')), - (pd.TimedeltaIndex, 'timedelta64[ns]', np.timedelta64('nat')) - ]) - def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, - nulls_fixture): + 
@pytest.mark.parametrize( + "klass,dtype,ctor", + [ + (pd.DatetimeIndex, "datetime64[ns]", np.datetime64("nat")), + (pd.TimedeltaIndex, "timedelta64[ns]", np.timedelta64("nat")), + ], + ) + def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, nulls_fixture): expected = klass([pd.NaT, pd.NaT]) assert expected.dtype == dtype data = [ctor] @@ -303,7 +344,7 @@ def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, @pytest.mark.parametrize("swap_objs", [True, False]) def test_index_ctor_nat_result(self, swap_objs): # mixed np.datetime64/timedelta64 nat results in object - data = [np.datetime64('nat'), np.timedelta64('nat')] + data = [np.datetime64("nat"), np.timedelta64("nat")] if swap_objs: data = data[::-1] @@ -312,41 +353,57 @@ def test_index_ctor_nat_result(self, swap_objs): tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) def test_index_ctor_infer_periodindex(self): - xp = period_range('2012-1-1', freq='M', periods=3) + xp = period_range("2012-1-1", freq="M", periods=3) rs = Index(xp) tm.assert_index_equal(rs, xp) assert isinstance(rs, PeriodIndex) - @pytest.mark.parametrize("vals,dtype", [ - ([1, 2, 3, 4, 5], 'int'), ([1.1, np.nan, 2.2, 3.0], 'float'), - (['A', 'B', 'C', np.nan], 'obj') - ]) + @pytest.mark.parametrize( + "vals,dtype", + [ + ([1, 2, 3, 4, 5], "int"), + ([1.1, np.nan, 2.2, 3.0], "float"), + (["A", "B", "C", np.nan], "obj"), + ], + ) def test_constructor_simple_new(self, vals, dtype): index = Index(vals, name=dtype) result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("vals", [ - [1, 2, 3], np.array([1, 2, 3]), np.array([1, 2, 3], dtype=int), - # below should coerce - [1., 2., 3.], np.array([1., 2., 3.], dtype=float) - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3]), + np.array([1, 2, 3], dtype=int), + # below should coerce + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) def test_constructor_dtypes_to_int64(self, vals): index = Index(vals, dtype=int) assert isinstance(index, Int64Index) - @pytest.mark.parametrize("vals", [ - [1, 2, 3], [1., 2., 3.], np.array([1., 2., 3.]), - np.array([1, 2, 3], dtype=int), np.array([1., 2., 3.], dtype=float) - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + np.array([1, 2, 3], dtype=int), + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) def test_constructor_dtypes_to_float64(self, vals): index = Index(vals, dtype=float) assert isinstance(index, Float64Index) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - [True, False, True], np.array([True, False, True], dtype=bool) - ]) + @pytest.mark.parametrize( + "vals", [[True, False, True], np.array([True, False, True], dtype=bool)] + ) def test_constructor_dtypes_to_object(self, cast_index, vals): if cast_index: index = Index(vals, dtype=bool) @@ -356,23 +413,36 @@ def test_constructor_dtypes_to_object(self, cast_index, vals): assert isinstance(index, Index) assert index.dtype == object - @pytest.mark.parametrize("vals", [ - [1, 2, 3], np.array([1, 2, 3], dtype=int), - np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), - [datetime(2011, 1, 1), datetime(2011, 1, 2)] - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3], dtype=int), + np.array( + [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] + ), + [datetime(2011, 1, 1), datetime(2011, 1, 2)], + ], + 
) def test_constructor_dtypes_to_categorical(self, vals): - index = Index(vals, dtype='category') + index = Index(vals, dtype="category") assert isinstance(index, CategoricalIndex) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')])), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]) - - ]) + @pytest.mark.parametrize( + "vals", + [ + Index( + np.array( + [ + np_datetime64_compat("2011-01-01"), + np_datetime64_compat("2011-01-02"), + ] + ) + ), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), + ], + ) def test_constructor_dtypes_to_datetime(self, cast_index, vals): if cast_index: index = Index(vals, dtype=object) @@ -383,10 +453,13 @@ def test_constructor_dtypes_to_datetime(self, cast_index, vals): assert isinstance(index, DatetimeIndex) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - np.array([np.timedelta64(1, 'D'), np.timedelta64(1, 'D')]), - [timedelta(1), timedelta(1)] - ]) + @pytest.mark.parametrize( + "vals", + [ + np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]), + [timedelta(1), timedelta(1)], + ], + ) def test_constructor_dtypes_to_timedelta(self, cast_index, vals): if cast_index: index = Index(vals, dtype=object) @@ -396,24 +469,24 @@ def test_constructor_dtypes_to_timedelta(self, cast_index, vals): index = Index(vals) assert isinstance(index, TimedeltaIndex) - @pytest.mark.parametrize("attr, utc", [ - ['values', False], - ['asi8', True]]) + @pytest.mark.parametrize("attr, utc", [["values", False], ["asi8", True]]) @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) - def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, - klass): + def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, klass): # Test constructing with a datetimetz dtype # .values produces numpy datetimes, so these are considered naive # .asi8 produces integers, so these are considered epoch timestamps # ^the above will be true in a later version. Right now we `.view` # the i8 values as NS_DTYPE, effectively treating them as wall times. 
- index = pd.date_range('2011-01-01', periods=5) + index = pd.date_range("2011-01-01", periods=5) arg = getattr(index, attr) index = index.tz_localize(tz_naive_fixture) dtype = index.dtype - if (tz_naive_fixture and attr == "asi8" and - str(tz_naive_fixture) not in ('UTC', 'tzutc()', 'UTC+00:00')): + if ( + tz_naive_fixture + and attr == "asi8" + and str(tz_naive_fixture) not in ("UTC", "tzutc()", "UTC+00:00") + ): ex_warn = FutureWarning else: ex_warn = None @@ -436,10 +509,10 @@ def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, result = klass(list(arg), dtype=dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("attr", ['values', 'asi8']) + @pytest.mark.parametrize("attr", ["values", "asi8"]) @pytest.mark.parametrize("klass", [pd.Index, pd.TimedeltaIndex]) def test_constructor_dtypes_timedelta(self, attr, klass): - index = pd.timedelta_range('1 days', periods=5) + index = pd.timedelta_range("1 days", periods=5) dtype = index.dtype values = getattr(index, attr) @@ -451,119 +524,136 @@ def test_constructor_dtypes_timedelta(self, attr, klass): tm.assert_index_equal(result, index) @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) - @pytest.mark.parametrize("klass", - [Index, Float64Index, Int64Index, UInt64Index, - CategoricalIndex, DatetimeIndex, TimedeltaIndex]) + @pytest.mark.parametrize( + "klass", + [ + Index, + Float64Index, + Int64Index, + UInt64Index, + CategoricalIndex, + DatetimeIndex, + TimedeltaIndex, + ], + ) def test_constructor_empty(self, value, klass): empty = klass(value) assert isinstance(empty, klass) assert not len(empty) - @pytest.mark.parametrize("empty,klass", [ - (PeriodIndex([], freq='B'), PeriodIndex), - (PeriodIndex(iter([]), freq='B'), PeriodIndex), - (PeriodIndex((x for x in []), freq='B'), PeriodIndex), - (RangeIndex(step=1), pd.RangeIndex), - (MultiIndex(levels=[[1, 2], ['blue', 'red']], - codes=[[], []]), MultiIndex) - ]) + @pytest.mark.parametrize( + "empty,klass", + [ + (PeriodIndex([], freq="B"), PeriodIndex), + (PeriodIndex(iter([]), freq="B"), PeriodIndex), + (PeriodIndex((x for x in []), freq="B"), PeriodIndex), + (RangeIndex(step=1), pd.RangeIndex), + (MultiIndex(levels=[[1, 2], ["blue", "red"]], codes=[[], []]), MultiIndex), + ], + ) def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) def test_constructor_overflow_int64(self): # see gh-15832 - msg = ("The elements provided in the data cannot " - "all be casted to the dtype int64") + msg = ( + "The elements provided in the data cannot " + "all be casted to the dtype int64" + ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") - @pytest.mark.xfail(reason="see GH#21311: Index " - "doesn't enforce dtype argument") + @pytest.mark.xfail(reason="see GH#21311: Index " "doesn't enforce dtype argument") def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) def test_view_with_args(self): - restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', - 'empty'] + restricted = ["unicodeIndex", "strIndex", "catIndex", "boolIndex", "empty"] for i in list(set(self.indices.keys()) - set(restricted)): ind = self.indices[i] - ind.view('i8') - - @pytest.mark.parametrize('index_type', [ - 'unicodeIndex', - 'strIndex', - pytest.param('catIndex', marks=pytest.mark.xfail(reason="gh-25464")), - 'boolIndex', - 'empty']) + ind.view("i8") + + 
@pytest.mark.parametrize( + "index_type", + [ + "unicodeIndex", + "strIndex", + pytest.param("catIndex", marks=pytest.mark.xfail(reason="gh-25464")), + "boolIndex", + "empty", + ], + ) def test_view_with_args_object_array_raises(self, index_type): ind = self.indices[index_type] msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): - ind.view('i8') + ind.view("i8") def test_astype(self): - casted = self.intIndex.astype('i8') + casted = self.intIndex.astype("i8") # it works! casted.get_loc(5) # pass on name - self.intIndex.name = 'foobar' - casted = self.intIndex.astype('i8') - assert casted.name == 'foobar' + self.intIndex.name = "foobar" + casted = self.intIndex.astype("i8") + assert casted.name == "foobar" def test_equals_object(self): # same - assert Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c'])) + assert Index(["a", "b", "c"]).equals(Index(["a", "b", "c"])) - @pytest.mark.parametrize("comp", [ - Index(['a', 'b']), Index(['a', 'b', 'd']), ['a', 'b', 'c']]) + @pytest.mark.parametrize( + "comp", [Index(["a", "b"]), Index(["a", "b", "d"]), ["a", "b", "c"]] + ) def test_not_equals_object(self, comp): - assert not Index(['a', 'b', 'c']).equals(comp) + assert not Index(["a", "b", "c"]).equals(comp) def test_insert(self): # GH 7256 # validate neg/pos inserts - result = Index(['b', 'c', 'd']) + result = Index(["b", "c", "d"]) # test 0th element - tm.assert_index_equal(Index(['a', 'b', 'c', 'd']), - result.insert(0, 'a')) + tm.assert_index_equal(Index(["a", "b", "c", "d"]), result.insert(0, "a")) # test Nth element that follows Python list behavior - tm.assert_index_equal(Index(['b', 'c', 'e', 'd']), - result.insert(-1, 'e')) + tm.assert_index_equal(Index(["b", "c", "e", "d"]), result.insert(-1, "e")) # test loc +/- neq (0, -1) - tm.assert_index_equal(result.insert(1, 'z'), result.insert(-2, 'z')) + tm.assert_index_equal(result.insert(1, "z"), result.insert(-2, "z")) # test empty null_index = Index([]) - tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) def test_insert_missing(self, nulls_fixture): # GH 22295 # test there is no mangling of NA values - expected = Index(['a', nulls_fixture, 'b', 'c']) - result = Index(list('abc')).insert(1, nulls_fixture) + expected = Index(["a", nulls_fixture, "b", "c"]) + result = Index(list("abc")).insert(1, nulls_fixture) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("pos,expected", [ - (0, Index(['b', 'c', 'd'], name='index')), - (-1, Index(['a', 'b', 'c'], name='index')) - ]) + @pytest.mark.parametrize( + "pos,expected", + [ + (0, Index(["b", "c", "d"], name="index")), + (-1, Index(["a", "b", "c"], name="index")), + ], + ) def test_delete(self, pos, expected): - index = Index(['a', 'b', 'c', 'd'], name='index') + index = Index(["a", "b", "c", "d"], name="index") result = index.delete(pos) tm.assert_index_equal(result, expected) assert result.name == expected.name def test_delete_raises(self): - index = Index(['a', 'b', 'c', 'd'], name='index') + index = Index(["a", "b", "c", "d"], name="index") msg = "index 5 is out of bounds for axis 0 with size 4" with pytest.raises(IndexError, match=msg): index.delete(5) @@ -571,20 +661,20 @@ def test_delete_raises(self): def test_identical(self): # index - i1 = Index(['a', 'b', 'c']) - i2 = Index(['a', 'b', 'c']) + i1 = Index(["a", "b", "c"]) + i2 = Index(["a", "b", "c"]) assert i1.identical(i2) - i1 = i1.rename('foo') + i1 = i1.rename("foo") assert i1.equals(i2) assert not 
i1.identical(i2) - i2 = i2.rename('foo') + i2 = i2.rename("foo") assert i1.identical(i2) - i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')]) - i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False) + i3 = Index([("a", "a"), ("a", "b"), ("b", "a")]) + i4 = Index([("a", "a"), ("a", "b"), ("b", "a")], tupleize_cols=False) assert not i3.identical(i4) def test_is_(self): @@ -600,7 +690,7 @@ def test_is_(self): # quasi-implementation dependent assert ind.is_(ind.view()) ind2 = ind.view() - ind2.name = 'bob' + ind2.name = "bob" assert ind.is_(ind2) assert ind2.is_(ind) # doesn't matter if Indices are *actually* views of underlying data, @@ -622,14 +712,14 @@ def test_asof(self): assert isinstance(self.dateIndex.asof(d), Timestamp) def test_asof_datetime_partial(self): - index = pd.date_range('2010-01-01', periods=2, freq='m') - expected = Timestamp('2010-02-28') - result = index.asof('2010-02') + index = pd.date_range("2010-01-01", periods=2, freq="m") + expected = Timestamp("2010-02-28") + result = index.asof("2010-02") assert result == expected assert not isinstance(result, Index) def test_nanosecond_index_access(self): - s = Series([Timestamp('20130101')]).values.view('i8')[0] + s = Series([Timestamp("20130101")]).values.view("i8")[0] r = DatetimeIndex([s + 50 + i for i in range(100)]) x = Series(np.random.randn(100), index=r) @@ -638,8 +728,9 @@ def test_nanosecond_index_access(self): # this does not yet work, as parsing strings is done via dateutil # assert first_value == x['2013-01-01 00:00:00.000000050+0000'] - expected_ts = np_datetime64_compat('2013-01-01 00:00:00.000000050+' - '0000', 'ns') + expected_ts = np_datetime64_compat( + "2013-01-01 00:00:00.000000050+" "0000", "ns" + ) assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self): @@ -660,8 +751,7 @@ def test_fancy(self): for i in sl: assert i == sl[sl.get_loc(i)] - @pytest.mark.parametrize("attr", [ - 'strIndex', 'intIndex', 'floatIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) def test_empty_fancy(self, attr, dtype): empty_arr = np.array([], dtype=dtype) @@ -671,8 +761,7 @@ def test_empty_fancy(self, attr, dtype): assert index[[]].identical(empty_index) assert index[empty_arr].identical(empty_index) - @pytest.mark.parametrize("attr", [ - 'strIndex', 'intIndex', 'floatIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) def test_empty_fancy_raises(self, attr): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. 
@@ -699,27 +788,34 @@ def test_intersection(self, sort): inter = first.intersection(first, sort=sort) assert inter is first - @pytest.mark.parametrize("index2,keeps_name", [ - (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name - (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names - (Index([3, 4, 5, 6, 7]), False)]) + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name + (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names + (Index([3, 4, 5, 6, 7]), False), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation(self, index2, keeps_name, sort): - index1 = Index([1, 2, 3, 4, 5], name='index') + index1 = Index([1, 2, 3, 4, 5], name="index") expected = Index([3, 4, 5]) result = index1.intersection(index2, sort) if keeps_name: - expected.name = 'index' + expected.name = "index" assert result.name == expected.name tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("first_name,second_name,expected_name", [ - ('A', 'A', 'A'), ('A', 'B', None), (None, 'B', None)]) + @pytest.mark.parametrize( + "first_name,second_name,expected_name", + [("A", "A", "A"), ("A", "B", None), (None, "B", None)], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_name_preservation2(self, first_name, second_name, - expected_name, sort): + def test_intersection_name_preservation2( + self, first_name, second_name, expected_name, sort + ): first = self.strIndex[5:20] second = self.strIndex[:10] first.name = first_name @@ -727,12 +823,16 @@ def test_intersection_name_preservation2(self, first_name, second_name, intersect = first.intersection(second, sort=sort) assert intersect.name == expected_name - @pytest.mark.parametrize("index2,keeps_name", [ - (Index([4, 7, 6, 5, 3], name='index'), True), - (Index([4, 7, 6, 5, 3], name='other'), False)]) + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([4, 7, 6, 5, 3], name="index"), True), + (Index([4, 7, 6, 5, 3], name="other"), False), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_monotonic(self, index2, keeps_name, sort): - index1 = Index([5, 3, 2, 4, 1], name='index') + index1 = Index([5, 3, 2, 4, 1], name="index") expected = Index([5, 3, 4]) if keeps_name: @@ -743,15 +843,15 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index2,expected_arr", [ - (Index(['B', 'D']), ['B']), - (Index(['B', 'D', 'A']), ['A', 'B', 'A'])]) + @pytest.mark.parametrize( + "index2,expected_arr", + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, - sort): + def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique - index1 = Index(['A', 'B', 'A', 'C']) - expected = Index(expected_arr, dtype='object') + index1 = Index(["A", "B", "A", "C"]) + expected = Index(expected_arr, dtype="object") result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() @@ -762,41 +862,41 @@ def test_intersect_str_dates(self, sort): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] i1 = Index(dt_dates, dtype=object) - i2 = Index(['aa'], dtype=object) + i2 = Index(["aa"], dtype=object) result = i2.intersection(i1, sort=sort) 
assert len(result) == 0 def test_intersect_nosort(self): - result = pd.Index(['c', 'b', 'a']).intersection(['b', 'a']) - expected = pd.Index(['b', 'a']) + result = pd.Index(["c", "b", "a"]).intersection(["b", "a"]) + expected = pd.Index(["b", "a"]) tm.assert_index_equal(result, expected) def test_intersection_equal_sort(self): - idx = pd.Index(['c', 'a', 'b']) + idx = pd.Index(["c", "a", "b"]) tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) @pytest.mark.xfail(reason="Not implemented") def test_intersection_equal_sort_true(self): # TODO decide on True behaviour - idx = pd.Index(['c', 'a', 'b']) - sorted_ = pd.Index(['a', 'b', 'c']) + idx = pd.Index(["c", "a", "b"]) + sorted_ = pd.Index(["a", "b", "c"]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) @pytest.mark.parametrize("sort", [None, False]) def test_chained_union(self, sort): # Chained unions handles names correctly - i1 = Index([1, 2], name='i1') - i2 = Index([5, 6], name='i2') - i3 = Index([3, 4], name='i3') + i1 = Index([1, 2], name="i1") + i2 = Index([5, 6], name="i2") + i3 = Index([3, 4], name="i3") union = i1.union(i2.union(i3, sort=sort), sort=sort) expected = i1.union(i2, sort=sort).union(i3, sort=sort) tm.assert_index_equal(union, expected) - j1 = Index([1, 2], name='j1') - j2 = Index([], name='j2') - j3 = Index([], name='j3') + j1 = Index([1, 2], name="j1") + j2 = Index([], name="j2") + j3 = Index([], name="j3") union = j1.union(j2.union(j3, sort=sort), sort=sort) expected = j1.union(j2, sort=sort).union(j3, sort=sort) tm.assert_index_equal(union, expected) @@ -813,7 +913,7 @@ def test_union(self, sort): tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) - @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_other_special(self, slice_): # https://github.com/pandas-dev/pandas/issues/24959 @@ -827,7 +927,7 @@ def test_union_sort_other_special(self, slice_): tm.assert_index_equal(idx.union(other, sort=False), idx) @pytest.mark.xfail(reason="Not implemented") - @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_special_true(self, slice_): # TODO decide on True behaviour # sort=True @@ -841,7 +941,7 @@ def test_union_sort_special_true(self, slice_): def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.Index([1, pd.Timestamp('2000')]) + idx = pd.Index([1, pd.Timestamp("2000")]) # default (sort=None) with tm.assert_produces_warning(RuntimeWarning): result = idx.union(idx[:1]) @@ -861,12 +961,11 @@ def test_union_sort_other_incomparable(self): def test_union_sort_other_incomparable_true(self): # TODO decide on True behaviour # sort=True - idx = pd.Index([1, pd.Timestamp('2000')]) - with pytest.raises(TypeError, match='.*'): + idx = pd.Index([1, pd.Timestamp("2000")]) + with pytest.raises(TypeError, match=".*"): idx.union(idx[:1], sort=True) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("sort", [None, False]) def test_union_from_iterables(self, klass, sort): # GH 10149 @@ -898,13 +997,16 @@ def test_union_identity(self, sort): union = Index([]).union(first, sort=sort) assert (union is first) is (not sort) - @pytest.mark.parametrize("first_list", [list('ba'), 
list()]) - @pytest.mark.parametrize("second_list", [list('ab'), list()]) - @pytest.mark.parametrize("first_name, second_name, expected_name", [ - ('A', 'B', None), (None, 'B', None), ('A', None, None)]) + @pytest.mark.parametrize("first_list", [list("ba"), list()]) + @pytest.mark.parametrize("second_list", [list("ab"), list()]) + @pytest.mark.parametrize( + "first_name, second_name, expected_name", + [("A", "B", None), (None, "B", None), ("A", None, None)], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_union_name_preservation(self, first_list, second_list, first_name, - second_name, expected_name, sort): + def test_union_name_preservation( + self, first_list, second_list, first_name, second_name, expected_name, sort + ): first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -927,7 +1029,7 @@ def test_union_dt_as_obj(self, sort): if self.dateIndex.dtype == np.object_: appended = np.append(self.strIndex, self.dateIndex) else: - appended = np.append(self.strIndex, self.dateIndex.astype('O')) + appended = np.append(self.strIndex, self.dateIndex.astype("O")) assert tm.equalContents(firstCat, appended) assert tm.equalContents(secondCat, self.strIndex) @@ -935,11 +1037,12 @@ def test_union_dt_as_obj(self, sort): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) - @pytest.mark.parametrize("method", ['union', 'intersection', 'difference', - 'symmetric_difference']) + @pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] + ) def test_setops_disallow_true(self, method): - idx1 = pd.Index(['a', 'b']) - idx2 = pd.Index(['b', 'c']) + idx1 = pd.Index(["a", "b"]) + idx2 = pd.Index(["b", "c"]) with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) @@ -969,13 +1072,14 @@ def test_map_with_tuples(self): def test_map_with_tuples_mi(self): # Test that returning a single object from a MultiIndex # returns an Index. 
- first_level = ['foo', 'bar', 'baz'] + first_level = ["foo", "bar", "baz"] multi_index = MultiIndex.from_tuples(zip(first_level, [1, 2, 3])) reduced_index = multi_index.map(lambda x: x[0]) tm.assert_index_equal(reduced_index, Index(first_level)) - @pytest.mark.parametrize("attr", [ - 'makeDateIndex', 'makePeriodIndex', 'makeTimedeltaIndex']) + @pytest.mark.parametrize( + "attr", ["makeDateIndex", "makePeriodIndex", "makeTimedeltaIndex"] + ) def test_map_tseries_indices_return_index(self, attr): index = getattr(tm, attr)(10) expected = Index([1] * 10) @@ -983,28 +1087,30 @@ def test_map_tseries_indices_return_index(self, attr): tm.assert_index_equal(expected, result) def test_map_tseries_indices_accsr_return_index(self): - date_index = tm.makeDateIndex(24, freq='h', name='hourly') - expected = Index(range(24), name='hourly') + date_index = tm.makeDateIndex(24, freq="h", name="hourly") + expected = Index(range(24), name="hourly") tm.assert_index_equal(expected, date_index.map(lambda x: x.hour)) @pytest.mark.parametrize( "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): # GH 12756 - expected = Index(['foo', 'bar', 'baz']) + expected = Index(["foo", "bar", "baz"]) index = tm.makeIntIndex(3) result = index.map(mapper(expected.values, index)) tm.assert_index_equal(result, expected) # TODO: replace with fixture for name in self.indices.keys(): - if name == 'catIndex': + if name == "catIndex": # Tested in test_categorical continue - elif name == 'repeats': + elif name == "repeats": # Cannot map duplicated index continue @@ -1012,18 +1118,19 @@ def test_map_dictlike(self, mapper): expected = Index(np.arange(len(index), 0, -1)) # to match proper result coercion for uints - if name == 'empty': + if name == "empty": expected = Index([]) result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("mapper", [ - Series(['foo', 2., 'baz'], index=[0, 2, -1]), - {0: 'foo', 2: 2.0, -1: 'baz'}]) + @pytest.mark.parametrize( + "mapper", + [Series(["foo", 2.0, "baz"], index=[0, 2, -1]), {0: "foo", 2: 2.0, -1: "baz"}], + ) def test_map_with_non_function_missing_values(self, mapper): # GH 12756 - expected = Index([2., np.nan, 'foo']) + expected = Index([2.0, np.nan, "foo"]) result = Index([2, 1, 0]).map(mapper) tm.assert_index_equal(expected, result) @@ -1031,20 +1138,20 @@ def test_map_with_non_function_missing_values(self, mapper): def test_map_na_exclusion(self): index = Index([1.5, np.nan, 3, np.nan, 5]) - result = index.map(lambda x: x * 2, na_action='ignore') + result = index.map(lambda x: x * 2, na_action="ignore") expected = index * 2 tm.assert_index_equal(result, expected) def test_map_defaultdict(self): index = Index([1, 2, 3]) - default_dict = defaultdict(lambda: 'blank') - default_dict[1] = 'stuff' + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" result = index.map(default_dict) - expected = Index(['stuff', 'blank', 'blank']) + expected = Index(["stuff", "blank", "blank"]) tm.assert_index_equal(result, expected) def test_append_multiple(self): - index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + index = Index(["a", "b", "c", "d", "e", "f"]) foos = [index[:2], index[2:4], index[4:]] result = foos[0].append(foos[1:]) @@ -1054,17 +1161,15 @@ def test_append_multiple(self): result = index.append([]) tm.assert_index_equal(result, index) - 
@pytest.mark.parametrize("name,expected", [ - ('foo', 'foo'), ('bar', None)]) + @pytest.mark.parametrize("name,expected", [("foo", "foo"), ("bar", None)]) def test_append_empty_preserve_name(self, name, expected): - left = Index([], name='foo') + left = Index([], name="foo") right = Index([1, 2, 3], name=name) result = left.append(right) assert result.name == expected - @pytest.mark.parametrize("second_name,expected", [ - (None, None), ('name', 'name')]) + @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) @pytest.mark.parametrize("sort", [None, False]) def test_difference_name_preservation(self, second_name, expected, sort): # TODO: replace with fixturesult @@ -1072,7 +1177,7 @@ def test_difference_name_preservation(self, second_name, expected, sort): second = self.strIndex[:10] answer = self.strIndex[10:20] - first.name = 'name' + first.name = "name" second.name = second_name result = first.difference(second, sort=sort) @@ -1086,7 +1191,7 @@ def test_difference_name_preservation(self, second_name, expected, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference_empty_arg(self, sort): first = self.strIndex[5:20] - first.name == 'name' + first.name == "name" result = first.difference([], sort) assert tm.equalContents(result, first) @@ -1095,7 +1200,7 @@ def test_difference_empty_arg(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference_identity(self, sort): first = self.strIndex[5:20] - first.name == 'name' + first.name == "name" result = first.difference(first, sort) assert len(result) == 0 @@ -1117,7 +1222,7 @@ def test_difference_sort(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, sort): # smoke - index1 = Index([5, 2, 3, 4], name='index1') + index1 = Index([5, 2, 3, 4], name="index1") index2 = Index([2, 3, 4, 1]) result = index1.symmetric_difference(index2, sort=sort) expected = Index([5, 1]) @@ -1132,16 +1237,16 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None - @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) def test_difference_incomparable(self, opname): - a = pd.Index([3, pd.Timestamp('2000'), 1]) - b = pd.Index([2, pd.Timestamp('1999'), 1]) + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) # sort=None, the default result = op(a) - expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) - if opname == 'difference': + expected = pd.Index([3, pd.Timestamp("2000"), 2, pd.Timestamp("1999")]) + if opname == "difference": expected = expected[:2] tm.assert_index_equal(result, expected) @@ -1151,31 +1256,35 @@ def test_difference_incomparable(self, opname): tm.assert_index_equal(result, expected) @pytest.mark.xfail(reason="Not implemented") - @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) def test_difference_incomparable_true(self, opname): # TODO decide on True behaviour # # sort=True, raises - a = pd.Index([3, pd.Timestamp('2000'), 1]) - b = pd.Index([2, pd.Timestamp('1999'), 1]) + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b, sort=True) - with pytest.raises(TypeError, match='Cannot compare'): + with 
pytest.raises(TypeError, match="Cannot compare"): op(a) @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(self.tuples) - index2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) result = index1.symmetric_difference(index2, sort=sort) - expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)]) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) - @pytest.mark.parametrize("index2,expected", [ - (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), - (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0]))]) + @pytest.mark.parametrize( + "index2,expected", + [ + (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), + (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0])), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_missing(self, index2, expected, sort): # GH 13514 change: {nan} - {nan} == {} @@ -1189,24 +1298,23 @@ def test_symmetric_difference_missing(self, index2, expected, sort): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_non_index(self, sort): - index1 = Index([1, 2, 3, 4], name='index1') + index1 = Index([1, 2, 3, 4], name="index1") index2 = np.array([2, 3, 4, 5]) expected = Index([1, 5]) result = index1.symmetric_difference(index2, sort=sort) assert tm.equalContents(result, expected) - assert result.name == 'index1' + assert result.name == "index1" - result = index1.symmetric_difference(index2, result_name='new_name', - sort=sort) + result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) assert tm.equalContents(result, expected) - assert result.name == 'new_name' + assert result.name == "new_name" @pytest.mark.parametrize("sort", [None, False]) def test_difference_type(self, sort): # GH 20040 # If taking difference of a set and itself, it # needs to preserve the type of the index - skip_index_keys = ['repeats'] + skip_index_keys = ["repeats"] for key, index in self.generate_index_types(skip_index_keys): result = index.difference(index, sort=sort) expected = index.drop(index) @@ -1218,42 +1326,66 @@ def test_intersection_difference(self, sort): # Test that the intersection of an index with an # empty index produces the same index as the difference # of an index with itself. 
Test for all types - skip_index_keys = ['repeats'] + skip_index_keys = ["repeats"] for key, index in self.generate_index_types(skip_index_keys): inter = index.intersection(index.drop(index)) diff = index.difference(index, sort=sort) tm.assert_index_equal(inter, diff) - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', False), ('boolIndex', False), ('catIndex', False), - ('intIndex', True), ('dateIndex', False), ('floatIndex', True)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", False), + ("boolIndex", False), + ("catIndex", False), + ("intIndex", True), + ("dateIndex", False), + ("floatIndex", True), + ], + ) def test_is_numeric(self, attr, expected): assert getattr(self, attr).is_numeric() == expected - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', True), ('boolIndex', True), ('catIndex', False), - ('intIndex', False), ('dateIndex', False), ('floatIndex', False)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", True), + ("boolIndex", True), + ("catIndex", False), + ("intIndex", False), + ("dateIndex", False), + ("floatIndex", False), + ], + ) def test_is_object(self, attr, expected): assert getattr(self, attr).is_object() == expected - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', False), ('boolIndex', False), ('catIndex', False), - ('intIndex', False), ('dateIndex', True), ('floatIndex', False)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", False), + ("boolIndex", False), + ("catIndex", False), + ("intIndex", False), + ("dateIndex", True), + ("floatIndex", False), + ], + ) def test_is_all_dates(self, attr, expected): assert getattr(self, attr).is_all_dates == expected def test_summary(self): self._check_method_works(Index._summary) # GH3869 - ind = Index(['{other}%s', "~:{range}:0"], name='A') + ind = Index(["{other}%s", "~:{range}:0"], name="A") result = ind._summary() # shouldn't be formatted accidentally. 
- assert '~:{range}:0' in result - assert '{other}%s' in result + assert "~:{range}:0" in result + assert "{other}%s" in result # GH18217 def test_summary_deprecated(self): - ind = Index(['{other}%s', "~:{range}:0"], name='A') + ind = Index(["{other}%s", "~:{range}:0"], name="A") with tm.assert_produces_warning(FutureWarning): ind.summary() @@ -1274,8 +1406,7 @@ def test_format(self): self.strIndex[:0].format() - @pytest.mark.parametrize("vals", [ - [1, 2.0 + 3.0j, 4.], ['a', 'b', 'c']]) + @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) def test_format_missing(self, vals, nulls_fixture): # 2845 vals = list(vals) # Copy for each iteration @@ -1283,7 +1414,7 @@ def test_format_missing(self, vals, nulls_fixture): index = Index(vals) formatted = index.format() - expected = [str(index[0]), str(index[1]), str(index[2]), 'NaN'] + expected = [str(index[0]), str(index[1]), str(index[2]), "NaN"] assert formatted == expected assert index[3] is nulls_fixture @@ -1291,20 +1422,20 @@ def test_format_missing(self, vals, nulls_fixture): def test_format_with_name_time_info(self): # bug I fixed 12/20/2011 inc = timedelta(hours=4) - dates = Index([dt + inc for dt in self.dateIndex], name='something') + dates = Index([dt + inc for dt in self.dateIndex], name="something") formatted = dates.format(name=True) - assert formatted[0] == 'something' + assert formatted[0] == "something" def test_format_datetime_with_time(self): t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) result = t.format() - expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00'] + expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] assert len(result) == 2 assert result == expected - @pytest.mark.parametrize("op", ['any', 'all']) + @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op): index = self.create_index() assert getattr(index, op)() == getattr(index.values, op)() @@ -1328,11 +1459,15 @@ def test_get_indexer(self): assert_almost_equal(r1, e1) @pytest.mark.parametrize("reverse", [True, False]) - @pytest.mark.parametrize("expected,method", [ - (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'pad'), - (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'ffill'), - (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'backfill'), - (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'bfill')]) + @pytest.mark.parametrize( + "expected,method", + [ + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "pad"), + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "ffill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "backfill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "bfill"), + ], + ) def test_get_indexer_methods(self, reverse, expected, method): index1 = Index([1, 2, 3, 4, 5]) index2 = Index([2, 4, 6]) @@ -1348,66 +1483,70 @@ def test_get_indexer_invalid(self): # GH10411 index = Index(np.arange(10)) - with pytest.raises(ValueError, match='tolerance argument'): + with pytest.raises(ValueError, match="tolerance argument"): index.get_indexer([1, 0], tolerance=1) - with pytest.raises(ValueError, match='limit argument'): + with pytest.raises(ValueError, match="limit argument"): index.get_indexer([1, 0], limit=1) @pytest.mark.parametrize( - 'method, tolerance, indexer, expected', + "method, tolerance, indexer, expected", [ - ('pad', None, [0, 5, 9], [0, 5, 9]), - ('backfill', None, [0, 5, 9], [0, 5, 9]), - ('nearest', None, [0, 5, 9], [0, 5, 9]), - ('pad', 0, [0, 5, 9], [0, 5, 9]), - ('backfill', 0, [0, 5, 9], [0, 5, 9]), - ('nearest', 0, [0, 5, 9], [0, 5, 9]), - - ('pad', None, [0.2, 1.8, 8.5], [0, 1, 8]), - 
('backfill', None, [0.2, 1.8, 8.5], [1, 2, 9]), - ('nearest', None, [0.2, 1.8, 8.5], [0, 2, 9]), - ('pad', 1, [0.2, 1.8, 8.5], [0, 1, 8]), - ('backfill', 1, [0.2, 1.8, 8.5], [1, 2, 9]), - ('nearest', 1, [0.2, 1.8, 8.5], [0, 2, 9]), - - ('pad', 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), - ('backfill', 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), - ('nearest', 0.2, [0.2, 1.8, 8.5], [0, 2, -1])]) + ("pad", None, [0, 5, 9], [0, 5, 9]), + ("backfill", None, [0, 5, 9], [0, 5, 9]), + ("nearest", None, [0, 5, 9], [0, 5, 9]), + ("pad", 0, [0, 5, 9], [0, 5, 9]), + ("backfill", 0, [0, 5, 9], [0, 5, 9]), + ("nearest", 0, [0, 5, 9], [0, 5, 9]), + ("pad", None, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", None, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", None, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 1, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", 1, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", 1, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), + ("backfill", 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), + ("nearest", 0.2, [0.2, 1.8, 8.5], [0, 2, -1]), + ], + ) def test_get_indexer_nearest(self, method, tolerance, indexer, expected): index = Index(np.arange(10)) actual = index.get_indexer(indexer, method=method, tolerance=tolerance) - tm.assert_numpy_array_equal(actual, np.array(expected, - dtype=np.intp)) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) - @pytest.mark.parametrize('listtype', [list, tuple, Series, np.array]) + @pytest.mark.parametrize("listtype", [list, tuple, Series, np.array]) @pytest.mark.parametrize( - 'tolerance, expected', - list(zip([[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], - [0.1, 0.5, 0.5]], - [[0, 2, -1], [0, -1, -1], - [-1, 2, 9]]))) - def test_get_indexer_nearest_listlike_tolerance(self, tolerance, - expected, listtype): + "tolerance, expected", + list( + zip( + [[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], [0.1, 0.5, 0.5]], + [[0, 2, -1], [0, -1, -1], [-1, 2, 9]], + ) + ), + ) + def test_get_indexer_nearest_listlike_tolerance( + self, tolerance, expected, listtype + ): index = Index(np.arange(10)) - actual = index.get_indexer([0.2, 1.8, 8.5], method='nearest', - tolerance=listtype(tolerance)) - tm.assert_numpy_array_equal(actual, np.array(expected, - dtype=np.intp)) + actual = index.get_indexer( + [0.2, 1.8, 8.5], method="nearest", tolerance=listtype(tolerance) + ) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) def test_get_indexer_nearest_error(self): index = Index(np.arange(10)) - with pytest.raises(ValueError, match='limit argument'): - index.get_indexer([1, 0], method='nearest', limit=1) + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], method="nearest", limit=1) - with pytest.raises(ValueError, match='tolerance size must match'): - index.get_indexer([1, 0], method='nearest', - tolerance=[1, 2, 3]) + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_indexer([1, 0], method="nearest", tolerance=[1, 2, 3]) - @pytest.mark.parametrize("method,expected", [ - ('pad', [8, 7, 0]), ('backfill', [9, 8, 1]), ('nearest', [9, 7, 0])]) + @pytest.mark.parametrize( + "method,expected", + [("pad", [8, 7, 0]), ("backfill", [9, 8, 1]), ("nearest", [9, 7, 0])], + ) def test_get_indexer_nearest_decreasing(self, method, expected): index = Index(np.arange(10))[::-1] @@ -1417,31 +1556,35 @@ def test_get_indexer_nearest_decreasing(self, method, expected): actual = index.get_indexer([0.2, 1.8, 8.5], method=method) tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) - 
@pytest.mark.parametrize("method,expected", [ - ('pad', np.array([-1, 0, 1, 1], dtype=np.intp)), - ('backfill', np.array([0, 0, 1, -1], dtype=np.intp))]) + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", np.array([-1, 0, 1, 1], dtype=np.intp)), + ("backfill", np.array([0, 0, 1, -1], dtype=np.intp)), + ], + ) def test_get_indexer_strings(self, method, expected): - index = pd.Index(['b', 'c']) - actual = index.get_indexer(['a', 'b', 'c', 'd'], method=method) + index = pd.Index(["b", "c"]) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) def test_get_indexer_strings_raises(self): - index = pd.Index(['b', 'c']) + index = pd.Index(["b", "c"]) msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='nearest') + index.get_indexer(["a", "b", "c", "d"], method="nearest") with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='pad', - tolerance=[2, 2, 2, 2]) + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) - @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, - Float64Index]) + @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, Float64Index]) def test_get_indexer_numeric_index_boolean_target(self, idx_class): # GH 16877 @@ -1450,22 +1593,23 @@ def test_get_indexer_numeric_index_boolean_target(self, idx_class): expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - def test_get_indexer_with_NA_values(self, unique_nulls_fixture, - unique_nulls_fixture2): + def test_get_indexer_with_NA_values( + self, unique_nulls_fixture, unique_nulls_fixture2 + ): # GH 22332 # check pairwise, that no pair of na values # is mangled if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values are not unique - arr = np.array([unique_nulls_fixture, - unique_nulls_fixture2], dtype=np.object) + arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) index = pd.Index(arr, dtype=np.object) - result = index.get_indexer([unique_nulls_fixture, - unique_nulls_fixture2, 'Unknown']) + result = index.get_indexer( + [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"] + ) expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc(self, method): index = pd.Index([0, 1, 2]) assert index.get_loc(1, method=method) == 1 @@ -1473,58 +1617,59 @@ def test_get_loc(self, method): if method: assert index.get_loc(1, method=method, tolerance=0) == 1 - @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc_raises_bad_label(self, method): index = pd.Index([0, 1, 2]) if method: # Messages vary across versions if PY36: - msg = 'not supported between' + msg = "not supported between" else: - msg = 'unorderable types' + msg = "unorderable types" else: - msg = 'invalid key' + msg = "invalid key" with pytest.raises(TypeError, match=msg): index.get_loc([1, 2], method=method) - @pytest.mark.parametrize("method,loc", [ - ('pad', 
1), ('backfill', 2), ('nearest', 1)]) + @pytest.mark.parametrize( + "method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)] + ) def test_get_loc_tolerance(self, method, loc): index = pd.Index([0, 1, 2]) assert index.get_loc(1.1, method) == loc assert index.get_loc(1.1, method, tolerance=1) == loc - @pytest.mark.parametrize("method", ['pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) def test_get_loc_outside_tolerance_raises(self, method): index = pd.Index([0, 1, 2]) - with pytest.raises(KeyError, match='1.1'): + with pytest.raises(KeyError, match="1.1"): index.get_loc(1.1, method, tolerance=0.05) def test_get_loc_bad_tolerance_raises(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='must be numeric'): - index.get_loc(1.1, 'nearest', tolerance='invalid') + with pytest.raises(ValueError, match="must be numeric"): + index.get_loc(1.1, "nearest", tolerance="invalid") def test_get_loc_tolerance_no_method_raises(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='tolerance .* valid if'): + with pytest.raises(ValueError, match="tolerance .* valid if"): index.get_loc(1.1, tolerance=1) def test_get_loc_raises_missized_tolerance(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='tolerance size must match'): - index.get_loc(1.1, 'nearest', tolerance=[1, 1]) + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_loc(1.1, "nearest", tolerance=[1, 1]) def test_get_loc_raises_object_nearest(self): - index = pd.Index(['a', 'c']) - with pytest.raises(TypeError, match='unsupported operand type'): - index.get_loc('a', method='nearest') + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="nearest") def test_get_loc_raises_object_tolerance(self): - index = pd.Index(['a', 'c']) - with pytest.raises(TypeError, match='unsupported operand type'): - index.get_loc('a', method='pad', tolerance='invalid') + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="pad", tolerance="invalid") @pytest.mark.parametrize("dtype", [int, float]) def test_slice_locs(self, dtype): @@ -1555,17 +1700,17 @@ def test_slice_float_locs(self, dtype): assert index2.slice_locs(10.5, -1) == (0, n) def test_slice_locs_dup(self): - index = Index(['a', 'a', 'b', 'c', 'd', 'd']) - assert index.slice_locs('a', 'd') == (0, 6) - assert index.slice_locs(end='d') == (0, 6) - assert index.slice_locs('a', 'c') == (0, 4) - assert index.slice_locs('b', 'd') == (2, 6) + index = Index(["a", "a", "b", "c", "d", "d"]) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) index2 = index[::-1] - assert index2.slice_locs('d', 'a') == (0, 6) - assert index2.slice_locs(end='a') == (0, 6) - assert index2.slice_locs('d', 'b') == (0, 4) - assert index2.slice_locs('c', 'a') == (2, 6) + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) @pytest.mark.parametrize("dtype", [int, float]) def test_slice_locs_dup_numeric(self, dtype): @@ -1587,30 +1732,39 @@ def test_slice_locs_na(self): def test_slice_locs_na_raises(self): index = Index([np.nan, 1, 2]) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, 
match=""): index.slice_locs(start=1.5) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): index.slice_locs(end=1.5) - @pytest.mark.parametrize("in_slice,expected", [ - (pd.IndexSlice[::-1], 'yxdcb'), (pd.IndexSlice['b':'y':-1], ''), - (pd.IndexSlice['b'::-1], 'b'), (pd.IndexSlice[:'b':-1], 'yxdcb'), - (pd.IndexSlice[:'y':-1], 'y'), (pd.IndexSlice['y'::-1], 'yxdcb'), - (pd.IndexSlice['y'::-4], 'yb'), - # absent labels - (pd.IndexSlice[:'a':-1], 'yxdcb'), (pd.IndexSlice[:'a':-2], 'ydb'), - (pd.IndexSlice['z'::-1], 'yxdcb'), (pd.IndexSlice['z'::-3], 'yc'), - (pd.IndexSlice['m'::-1], 'dcb'), (pd.IndexSlice[:'m':-1], 'yx'), - (pd.IndexSlice['a':'a':-1], ''), (pd.IndexSlice['z':'z':-1], ''), - (pd.IndexSlice['m':'m':-1], '') - ]) + @pytest.mark.parametrize( + "in_slice,expected", + [ + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), + (pd.IndexSlice["b"::-1], "b"), + (pd.IndexSlice[:"b":-1], "yxdcb"), + (pd.IndexSlice[:"y":-1], "y"), + (pd.IndexSlice["y"::-1], "yxdcb"), + (pd.IndexSlice["y"::-4], "yb"), + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), + (pd.IndexSlice[:"a":-2], "ydb"), + (pd.IndexSlice["z"::-1], "yxdcb"), + (pd.IndexSlice["z"::-3], "yc"), + (pd.IndexSlice["m"::-1], "dcb"), + (pd.IndexSlice[:"m":-1], "yx"), + (pd.IndexSlice["a":"a":-1], ""), + (pd.IndexSlice["z":"z":-1], ""), + (pd.IndexSlice["m":"m":-1], ""), + ], + ) def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list('bcdxy')) + index = Index(list("bcdxy")) - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, - in_slice.step) - result = index[s_start:s_stop:in_slice.step] + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] expected = pd.Index(list(expected)) tm.assert_index_equal(result, expected) @@ -1627,9 +1781,9 @@ def test_drop_by_str_label(self): expected = self.strIndex[1:] tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("keys", [['foo', 'bar'], ['1', 'bar']]) + @pytest.mark.parametrize("keys", [["foo", "bar"], ["1", "bar"]]) def test_drop_by_str_label_raises_missing_keys(self, keys): - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): self.strIndex.drop(keys) def test_drop_by_str_label_errors_ignore(self): @@ -1638,13 +1792,13 @@ def test_drop_by_str_label_errors_ignore(self): # errors='ignore' n = len(self.strIndex) drop = self.strIndex[list(range(5, 10))] - mixed = drop.tolist() + ['foo'] - dropped = self.strIndex.drop(mixed, errors='ignore') + mixed = drop.tolist() + ["foo"] + dropped = self.strIndex.drop(mixed, errors="ignore") expected = self.strIndex[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = self.strIndex.drop(['foo', 'bar'], errors='ignore') + dropped = self.strIndex.drop(["foo", "bar"], errors="ignore") expected = self.strIndex[list(range(n))] tm.assert_index_equal(dropped, expected) @@ -1658,25 +1812,27 @@ def test_drop_by_numeric_label_loc(self): def test_drop_by_numeric_label_raises_missing_keys(self): index = Index([1, 2, 3]) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): index.drop([3, 4]) - @pytest.mark.parametrize("key,expected", [ - (4, Index([1, 2, 3])), ([3, 4, 5], Index([1, 2]))]) + @pytest.mark.parametrize( + "key,expected", [(4, Index([1, 2, 3])), ([3, 4, 5], Index([1, 2]))] + ) def test_drop_by_numeric_label_errors_ignore(self, key, expected): index = Index([1, 2, 3]) - dropped = 
index.drop(key, errors='ignore') + dropped = index.drop(key, errors="ignore") tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("values", [['a', 'b', ('c', 'd')], - ['a', ('c', 'd'), 'b'], - [('c', 'd'), 'a', 'b']]) - @pytest.mark.parametrize("to_drop", [[('c', 'd'), 'a'], ['a', ('c', 'd')]]) + @pytest.mark.parametrize( + "values", + [["a", "b", ("c", "d")], ["a", ("c", "d"), "b"], [("c", "d"), "a", "b"]], + ) + @pytest.mark.parametrize("to_drop", [[("c", "d"), "a"], ["a", ("c", "d")]]) def test_drop_tuple(self, values, to_drop): # GH 18304 index = pd.Index(values) - expected = pd.Index(['b']) + expected = pd.Index(["b"]) result = index.drop(to_drop) tm.assert_index_equal(result, expected) @@ -1687,31 +1843,53 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) - msg = r"\"\[{}\] not found in axis\"".format( - re.escape(to_drop[1].__repr__())) + msg = r"\"\[{}\] not found in axis\"".format(re.escape(to_drop[1].__repr__())) for drop_me in to_drop[1], [to_drop[1]]: with pytest.raises(KeyError, match=msg): removed.drop(drop_me) - @pytest.mark.parametrize("method,expected,sort", [ - ('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')]), - False), - - ('intersection', np.array([(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')]), - None), - - ('union', np.array([(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'), - (2, 'C')], dtype=[('num', int), ('let', 'a1')]), - None) - ]) + @pytest.mark.parametrize( + "method,expected,sort", + [ + ( + "intersection", + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ), + False, + ), + ( + "intersection", + np.array( + [(1, "A"), (1, "B"), (2, "A"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ), + None, + ), + ( + "union", + np.array( + [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], + dtype=[("num", int), ("let", "a1")], + ), + None, + ), + ], + ) def test_tuple_union_bug(self, method, expected, sort): - index1 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')])) - index2 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), - (2, 'B'), (1, 'C'), (2, 'C')], - dtype=[('num', int), ('let', 'a1')])) + index1 = Index( + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ) + ) + index2 = Index( + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], + dtype=[("num", int), ("let", "a1")], + ) + ) result = getattr(index1, method)(index2, sort=sort) assert result.ndim == 1 @@ -1719,10 +1897,15 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("attr", [ - 'is_monotonic_increasing', 'is_monotonic_decreasing', - '_is_strictly_monotonic_increasing', - '_is_strictly_monotonic_decreasing']) + @pytest.mark.parametrize( + "attr", + [ + "is_monotonic_increasing", + "is_monotonic_decreasing", + "_is_strictly_monotonic_increasing", + "_is_strictly_monotonic_decreasing", + ], + ) def test_is_monotonic_incomparable(self, attr): index = Index([5, datetime.now(), 7]) assert not getattr(index, attr) @@ -1737,13 +1920,14 @@ def test_get_set_value(self): self.dateIndex.set_value(values, date, 10) assert values[67] == 10 - @pytest.mark.parametrize("values", [ - ['foo', 'bar', 'quux'], {'foo', 'bar', 'quux'}]) - 
@pytest.mark.parametrize("index,expected", [ - (Index(['qux', 'baz', 'foo', 'bar']), - np.array([False, False, True, True])), - (Index([]), np.array([], dtype=bool)) # empty - ]) + @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) + @pytest.mark.parametrize( + "index,expected", + [ + (Index(["qux", "baz", "foo", "bar"]), np.array([False, False, True, True])), + (Index([]), np.array([], dtype=bool)), # empty + ], + ) def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) @@ -1753,62 +1937,75 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): # mangle the various types (save a corner case with PyPy) # all nans are the same - if (isinstance(nulls_fixture, float) and - isinstance(nulls_fixture2, float) and - math.isnan(nulls_fixture) and - math.isnan(nulls_fixture2)): - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, True])) + if ( + isinstance(nulls_fixture, float) + and isinstance(nulls_fixture2, float) + and math.isnan(nulls_fixture) + and math.isnan(nulls_fixture2) + ): + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, True]), + ) elif nulls_fixture is nulls_fixture2: # should preserve NA type - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, True])) + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, True]), + ) else: - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, False])) + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, False]), + ) def test_isin_nan_common_float64(self, nulls_fixture): if nulls_fixture is pd.NaT: pytest.skip("pd.NaT not compatible with Float64Index") # Float64Index overrides isin, so must be checked separately - tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin( - [np.nan]), np.array([False, True])) + tm.assert_numpy_array_equal( + Float64Index([1.0, nulls_fixture]).isin([np.nan]), np.array([False, True]) + ) # we cannot compare NaT with NaN - tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin( - [pd.NaT]), np.array([False, False])) + tm.assert_numpy_array_equal( + Float64Index([1.0, nulls_fixture]).isin([pd.NaT]), np.array([False, False]) + ) @pytest.mark.parametrize("level", [0, -1]) - @pytest.mark.parametrize("index", [ - Index(['qux', 'baz', 'foo', 'bar']), - # Float64Index overrides isin, so must be checked separately - Float64Index([1.0, 2.0, 3.0, 4.0])]) + @pytest.mark.parametrize( + "index", + [ + Index(["qux", "baz", "foo", "bar"]), + # Float64Index overrides isin, so must be checked separately + Float64Index([1.0, 2.0, 3.0, 4.0]), + ], + ) def test_isin_level_kwarg(self, level, index): - values = index.tolist()[-2:] + ['nonexisting'] + values = index.tolist()[-2:] + ["nonexisting"] expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(expected, index.isin(values, level=level)) - index.name = 'foobar' - tm.assert_numpy_array_equal(expected, - index.isin(values, level='foobar')) + index.name = "foobar" + tm.assert_numpy_array_equal(expected, index.isin(values, level="foobar")) @pytest.mark.parametrize("level", [2, 10, -3]) def test_isin_level_kwarg_bad_level_raises(self, level, indices): index = indices - with pytest.raises(IndexError, match='Too many levels'): + with 
pytest.raises(IndexError, match="Too many levels"): index.isin([], level=level) - @pytest.mark.parametrize("label", [1.0, 'foobar', 'xyzzy', np.nan]) + @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan]) def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): - index = index.rename(['foo', 'bar']) + index = index.rename(["foo", "bar"]) msg = "'Level {} not found'" else: - index = index.rename('foo') + index = index.rename("foo") msg = r"'Level {} must be same as name \(foo\)'" with pytest.raises(KeyError, match=msg.format(label)): index.isin([], level=label) @@ -1822,21 +2019,24 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @pytest.mark.parametrize("values", [ - [1, 2, 3, 4], - [1., 2., 3., 4.], - [True, True, True, True], - ["foo", "bar", "baz", "qux"], - pd.date_range('2018-01-01', freq='D', periods=4)]) + @pytest.mark.parametrize( + "values", + [ + [1, 2, 3, 4], + [1.0, 2.0, 3.0, 4.0], + [True, True, True, True], + ["foo", "bar", "baz", "qux"], + pd.date_range("2018-01-01", freq="D", periods=4), + ], + ) def test_boolean_cmp(self, values): index = Index(values) - result = (index == values) + result = index == values expected = np.array([True, True, True, True], dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("name,level", [ - (None, 0), ('a', 'a')]) + @pytest.mark.parametrize("name,level", [(None, 0), ("a", "a")]) def test_get_level_values(self, name, level): expected = self.strIndex.copy() if name: @@ -1846,43 +2046,55 @@ def test_get_level_values(self, name, level): tm.assert_index_equal(result, expected) def test_slice_keep_name(self): - index = Index(['a', 'b'], name='asdf') + index = Index(["a", "b"], name="asdf") assert index.name == index[1:].name # instance attributes of the form self.Index - @pytest.mark.parametrize('index_kind', - ['unicode', 'str', 'date', 'int', 'float']) + @pytest.mark.parametrize("index_kind", ["unicode", "str", "date", "int", "float"]) def test_join_self(self, join_type, index_kind): - res = getattr(self, '{0}Index'.format(index_kind)) + res = getattr(self, "{0}Index".format(index_kind)) joined = res.join(res, how=join_type) assert res is joined - @pytest.mark.parametrize("method", ['strip', 'rstrip', 'lstrip']) + @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): # GH9068 - index = Index([' jack', 'jill ', ' jesse ', 'frank']) + index = Index([" jack", "jill ", " jesse ", "frank"]) expected = Index([getattr(str, method)(x) for x in index.values]) result = getattr(index.str, method)() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index", [ - Index(range(5)), tm.makeDateIndex(10), - MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]), - period_range(start='2000', end='2010', freq='A')]) + @pytest.mark.parametrize( + "index", + [ + Index(range(5)), + tm.makeDateIndex(10), + MultiIndex.from_tuples([("foo", "1"), ("bar", "3")]), + period_range(start="2000", end="2010", freq="A"), + ], + ) def test_str_attribute_raises(self, index): - with pytest.raises(AttributeError, match='only use .str accessor'): + with pytest.raises(AttributeError, match="only use .str accessor"): index.str.repeat(2) - @pytest.mark.parametrize("expand,expected", [ - (None, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])), - (False, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])), - (True, MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', 
np.nan), - ('f', np.nan, np.nan)]))]) + @pytest.mark.parametrize( + "expand,expected", + [ + (None, Index([["a", "b", "c"], ["d", "e"], ["f"]])), + (False, Index([["a", "b", "c"], ["d", "e"], ["f"]])), + ( + True, + MultiIndex.from_tuples( + [("a", "b", "c"), ("d", "e", np.nan), ("f", np.nan, np.nan)] + ), + ), + ], + ) def test_str_split(self, expand, expected): - index = Index(['a b c', 'd e', 'f']) + index = Index(["a b c", "d e", "f"]) if expand is not None: result = index.str.split(expand=expand) else: @@ -1892,30 +2104,31 @@ def test_str_split(self, expand, expected): def test_str_bool_return(self): # test boolean case, should return np.array instead of boolean Index - index = Index(['a1', 'a2', 'b1', 'b2']) - result = index.str.startswith('a') + index = Index(["a1", "a2", "b1", "b2"]) + result = index.str.startswith("a") expected = np.array([True, True, False, False]) tm.assert_numpy_array_equal(result, expected) assert isinstance(result, np.ndarray) def test_str_bool_series_indexing(self): - index = Index(['a1', 'a2', 'b1', 'b2']) + index = Index(["a1", "a2", "b1", "b2"]) s = Series(range(4), index=index) - result = s[s.index.str.startswith('a')] - expected = Series(range(2), index=['a1', 'a2']) + result = s[s.index.str.startswith("a")] + expected = Series(range(2), index=["a1", "a2"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("index,expected", [ - (Index(list('abcd')), True), (Index(range(4)), False)]) + @pytest.mark.parametrize( + "index,expected", [(Index(list("abcd")), True), (Index(range(4)), False)] + ) def test_tab_completion(self, index, expected): # GH 9910 - result = 'str' in dir(index) + result = "str" in dir(index) assert result == expected def test_indexing_doesnt_change_class(self): - index = Index([1, 2, 3, 'a', 'b', 'c']) + index = Index([1, 2, 3, "a", "b", "c"]) assert index[1:3].identical(pd.Index([2, 3], dtype=np.object_)) assert index[[0, 1]].identical(pd.Index([1, 2], dtype=np.object_)) @@ -1925,44 +2138,44 @@ def test_outer_join_sort(self): right_index = tm.makeDateIndex(10) with tm.assert_produces_warning(RuntimeWarning): - result = left_index.join(right_index, how='outer') + result = left_index.join(right_index, how="outer") # right_index in this case because DatetimeIndex has join precedence # over Int64Index with tm.assert_produces_warning(RuntimeWarning): - expected = right_index.astype(object).union( - left_index.astype(object)) + expected = right_index.astype(object).union(left_index.astype(object)) tm.assert_index_equal(result, expected) def test_nan_first_take_datetime(self): - index = Index([pd.NaT, Timestamp('20130101'), Timestamp('20130102')]) + index = Index([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) result = index.take([-1, 0, 1]) expected = Index([index[-1], index[0], index[1]]) tm.assert_index_equal(result, expected) def test_take_fill_value(self): # GH 12631 - index = pd.Index(list('ABC'), name='xxx') + index = pd.Index(list("ABC"), name="xxx") result = index.take(np.array([1, 0, -1])) - expected = pd.Index(list('BAC'), name='xxx') + expected = pd.Index(list("BAC"), name="xxx") tm.assert_index_equal(result, expected) # fill_value result = index.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Index(['B', 'A', np.nan], name='xxx') + expected = pd.Index(["B", "A", np.nan], name="xxx") tm.assert_index_equal(result, expected) # allow_fill=False - result = index.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Index(['B', 'A', 'C'], name='xxx') + result = 
index.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Index(["B", "A", "C"], name="xxx") tm.assert_index_equal(result, expected) def test_take_fill_value_none_raises(self): - index = pd.Index(list('ABC'), name='xxx') - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + index = pd.Index(list("ABC"), name="xxx") + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): index.take(np.array([1, 0, -2]), fill_value=True) @@ -1970,47 +2183,55 @@ def test_take_fill_value_none_raises(self): index.take(np.array([1, 0, -5]), fill_value=True) def test_take_bad_bounds_raises(self): - index = pd.Index(list('ABC'), name='xxx') - with pytest.raises(IndexError, match='out of bounds'): + index = pd.Index(list("ABC"), name="xxx") + with pytest.raises(IndexError, match="out of bounds"): index.take(np.array([1, -5])) - @pytest.mark.parametrize("name", [None, 'foobar']) - @pytest.mark.parametrize("labels", [ - [], np.array([]), ['A', 'B', 'C'], ['C', 'B', 'A'], - np.array(['A', 'B', 'C']), np.array(['C', 'B', 'A']), - # Must preserve name even if dtype changes - pd.date_range('20130101', periods=3).values, - pd.date_range('20130101', periods=3).tolist()]) - def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, - labels): + @pytest.mark.parametrize("name", [None, "foobar"]) + @pytest.mark.parametrize( + "labels", + [ + [], + np.array([]), + ["A", "B", "C"], + ["C", "B", "A"], + np.array(["A", "B", "C"]), + np.array(["C", "B", "A"]), + # Must preserve name even if dtype changes + pd.date_range("20130101", periods=3).values, + pd.date_range("20130101", periods=3).tolist(), + ], + ) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, labels): # GH6552 index = pd.Index([0, 1, 2]) index.name = name assert index.reindex(labels)[0].name == name - @pytest.mark.parametrize("labels", [ - [], np.array([]), np.array([], dtype=np.int64)]) - def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, - labels): + @pytest.mark.parametrize("labels", [[], np.array([]), np.array([], dtype=np.int64)]) + def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): # GH7774 - index = pd.Index(list('abc')) + index = pd.Index(list("abc")) assert index.reindex(labels)[0].dtype.type == np.object_ - @pytest.mark.parametrize("labels,dtype", [ - (pd.Int64Index([]), np.int64), - (pd.Float64Index([]), np.float64), - (pd.DatetimeIndex([]), np.datetime64)]) - def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, - labels, - dtype): + @pytest.mark.parametrize( + "labels,dtype", + [ + (pd.Int64Index([]), np.int64), + (pd.Float64Index([]), np.float64), + (pd.DatetimeIndex([]), np.datetime64), + ], + ) + def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, labels, dtype): # GH7774 - index = pd.Index(list('abc')) + index = pd.Index(list("abc")) assert index.reindex(labels)[0].dtype.type == dtype def test_reindex_no_type_preserve_target_empty_mi(self): - index = pd.Index(list('abc')) - result = index.reindex(pd.MultiIndex( - [pd.Int64Index([]), pd.Float64Index([])], [[], []]))[0] + index = pd.Index(list("abc")) + result = index.reindex( + pd.MultiIndex([pd.Int64Index([]), pd.Float64Index([])], [[], []]) + )[0] assert result.levels[0].dtype.type == np.int64 assert result.levels[1].dtype.type == np.float64 @@ -2021,121 +2242,161 @@ def test_groupby(self): tm.assert_dict_equal(result, expected) - 
@pytest.mark.parametrize("mi,expected", [ - (MultiIndex.from_tuples([(1, 2), (4, 5)]), np.array([True, True])), - (MultiIndex.from_tuples([(1, 2), (4, 6)]), np.array([True, False]))]) + @pytest.mark.parametrize( + "mi,expected", + [ + (MultiIndex.from_tuples([(1, 2), (4, 5)]), np.array([True, True])), + (MultiIndex.from_tuples([(1, 2), (4, 6)]), np.array([True, False])), + ], + ) def test_equals_op_multiindex(self, mi, expected): # GH9785 # test comparisons of multiindex - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) result = df.index == mi tm.assert_numpy_array_equal(result, expected) def test_equals_op_multiindex_identify(self): - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) result = df.index == df.index expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("index", [ - MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]), - Index(['foo', 'bar', 'baz'])]) + @pytest.mark.parametrize( + "index", + [ + MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]), + Index(["foo", "bar", "baz"]), + ], + ) def test_equals_op_mismatched_multiindex_raises(self, index): - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) with pytest.raises(ValueError, match="Lengths must match"): df.index == index def test_equals_op_index_vs_mi_same_length(self): mi = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]) - index = Index(['foo', 'bar', 'baz']) + index = Index(["foo", "bar", "baz"]) result = mi == index expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dt_conv", [ - pd.to_datetime, pd.to_timedelta]) + @pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) def test_dt_conversion_preserves_name(self, dt_conv): # GH 10875 - index = pd.Index(['01:02:03', '01:02:04'], name='label') + index = pd.Index(["01:02:03", "01:02:04"], name="label") assert index.name == dt_conv(index).name - @pytest.mark.parametrize("index,expected", [ - # ASCII - # short - (pd.Index(['a', 'bb', 'ccc']), - """Index(['a', 'bb', 'ccc'], dtype='object')"""), - # multiple lines - (pd.Index(['a', 'bb', 'ccc'] * 10), - """\ + @pytest.mark.parametrize( + "index,expected", + [ + # ASCII + # short + ( + pd.Index(["a", "bb", "ccc"]), + """Index(['a', 'bb', 'ccc'], dtype='object')""", + ), + # multiple lines + ( + pd.Index(["a", "bb", "ccc"] * 10), + """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - dtype='object')"""), - # truncated - (pd.Index(['a', 'bb', 'ccc'] * 100), - """\ + dtype='object')""", + ), + # truncated + ( + pd.Index(["a", "bb", "ccc"] * 100), + """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 
'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - dtype='object', length=300)"""), - - # Non-ASCII - # short - (pd.Index(['あ', 'いい', 'ううう']), - """Index(['あ', 'いい', 'ううう'], dtype='object')"""), - # multiple lines - (pd.Index(['あ', 'いい', 'ううう'] * 10), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう'],\n" - " dtype='object')")), - # truncated - (pd.Index(['あ', 'いい', 'ううう'] * 100), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ',\n" - " ...\n" - " 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう'],\n" - " dtype='object', length=300)"))]) + dtype='object', length=300)""", + ), + # Non-ASCII + # short + ( + pd.Index(["あ", "いい", "ううう"]), + """Index(['あ', 'いい', 'ううう'], dtype='object')""", + ), + # multiple lines + ( + pd.Index(["あ", "いい", "ううう"] * 10), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう'],\n" + " dtype='object')" + ), + ), + # truncated + ( + pd.Index(["あ", "いい", "ううう"] * 100), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ',\n" + " ...\n" + " 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう'],\n" + " dtype='object', length=300)" + ), + ), + ], + ) def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.parametrize("index,expected", [ - # short - (pd.Index(['あ', 'いい', 'ううう']), - ("Index(['あ', 'いい', 'ううう'], " - "dtype='object')")), - # multiple lines - (pd.Index(['あ', 'いい', 'ううう'] * 10), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう'],\n" - " dtype='object')""")), - # truncated - (pd.Index(['あ', 'いい', 'ううう'] * 100), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ',\n" - " ...\n" - " 'ううう', 'あ', 'いい', 'ううう', 'あ', " - "'いい', 'ううう', 'あ', 'いい',\n" - " 'ううう'],\n" - " dtype='object', length=300)"))]) + @pytest.mark.parametrize( + "index,expected", + [ + # short + ( + pd.Index(["あ", "いい", "ううう"]), + ("Index(['あ', 'いい', 'ううう'], " "dtype='object')"), + ), + # multiple lines + ( + pd.Index(["あ", "いい", "ううう"] * 10), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう'],\n" + " dtype='object')" + "" + ), + ), + # truncated + ( + pd.Index(["あ", "いい", "ううう"] * 100), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ',\n" + " ...\n" + " 'ううう', 'あ', 'いい', 'ううう', 'あ', " + "'いい', 'ううう', 'あ', 'いい',\n" + " 'ううう'],\n" + " dtype='object', length=300)" + ), + ), + ], + ) def test_string_index_repr_with_unicode_option(self, index, expected): # Enable Unicode option ----------------------------------------- - with cf.option_context('display.unicode.east_asian_width', True): + with cf.option_context("display.unicode.east_asian_width", True): result = repr(index) assert result == expected @@ -2151,14 +2412,14 @@ def test_get_duplicates_deprecated(self): def test_tab_complete_warning(self, ip): # 
https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('idx.', 4)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("idx.", 4)) def test_deprecated_contains(self): for index in self.indices.values(): @@ -2174,7 +2435,7 @@ class TestMixedIntIndex(Base): _holder = Index def setup_method(self, method): - self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.indices = dict(mixedIndex=Index([0, "a", 1, "b", 2, "c"])) self.setup_indices() def create_index(self): @@ -2203,42 +2464,42 @@ def test_copy_name(self): # GH12309 index = self.create_index() - first = index.__class__(index, copy=True, name='mario') + first = index.__class__(index, copy=True, name="mario") second = first.__class__(first, copy=False) # Even though "copy=False", we want a new object. assert first is not second tm.assert_index_equal(first, second) - assert first.name == 'mario' - assert second.name == 'mario' + assert first.name == "mario" + assert second.name == "mario" s1 = Series(2, index=first) s2 = Series(3, index=second[:-1]) s3 = s1 * s2 - assert s3.index.name == 'mario' + assert s3.index.name == "mario" def test_copy_name2(self): # Check that adding a "name" parameter to the copy is honored # GH14302 - index = pd.Index([1, 2], name='MyName') + index = pd.Index([1, 2], name="MyName") index1 = index.copy() tm.assert_index_equal(index, index1) - index2 = index.copy(name='NewName') + index2 = index.copy(name="NewName") tm.assert_index_equal(index, index2, check_names=False) - assert index.name == 'MyName' - assert index2.name == 'NewName' + assert index.name == "MyName" + assert index2.name == "NewName" - index3 = index.copy(names=['NewName']) + index3 = index.copy(names=["NewName"]) tm.assert_index_equal(index, index3, check_names=False) - assert index.name == 'MyName' - assert index.names == ['MyName'] - assert index3.name == 'NewName' - assert index3.names == ['NewName'] + assert index.name == "MyName" + assert index.names == ["MyName"] + assert index3.name == "NewName" + assert index3.names == ["NewName"] def test_union_base(self): index = self.create_index() @@ -2247,11 +2508,10 @@ def test_union_base(self): result = first.union(second) - expected = Index([0, 1, 2, 'a', 'b', 'c']) + expected = Index([0, 1, 2, "a", "b", "c"]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) def test_union_different_type_base(self, klass): # GH 10149 index = self.create_index() @@ -2263,8 +2523,8 @@ def test_union_different_type_base(self, klass): assert tm.equalContents(result, index) def test_unique_na(self): - idx = pd.Index([2, np.nan, 2, 1], name='my_index') - expected = pd.Index([2, np.nan, 1], name='my_index') + idx = pd.Index([2, np.nan, 2, 1], name="my_index") + expected = pd.Index([2, np.nan, 1], name="my_index") result = idx.unique() tm.assert_index_equal(result, expected) @@ -2275,12 +2535,11 @@ def test_intersection_base(self, sort): first = index[:5] second = index[:3] - expected = Index([0, 1, 'a']) if sort is None else Index([0, 'a', 1]) + expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1]) result = 
first.intersection(second, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_different_type_base(self, klass, sort): # GH 10149 @@ -2299,7 +2558,7 @@ def test_difference_base(self, sort): second = index[3:] result = first.difference(second, sort) - expected = Index([0, 'a', 1]) + expected = Index([0, "a", 1]) if sort is None: expected = Index(safe_sort(expected)) tm.assert_index_equal(result, expected) @@ -2311,7 +2570,7 @@ def test_symmetric_difference(self): second = index[3:] result = first.symmetric_difference(second) - expected = Index([0, 1, 2, 'a', 'c']) + expected = Index([0, 1, 2, "a", "c"]) tm.assert_index_equal(result, expected) def test_logical_compat(self): @@ -2319,14 +2578,18 @@ def test_logical_compat(self): assert index.all() == index.values.all() assert index.any() == index.values.any() - @pytest.mark.parametrize("how", ['any', 'all']) - @pytest.mark.parametrize("dtype", [ - None, object, 'category']) - @pytest.mark.parametrize("vals,expected", [ - ([1, 2, 3], [1, 2, 3]), ([1., 2., 3.], [1., 2., 3.]), - ([1., 2., np.nan, 3.], [1., 2., 3.]), - (['A', 'B', 'C'], ['A', 'B', 'C']), - (['A', np.nan, 'B', 'C'], ['A', 'B', 'C'])]) + @pytest.mark.parametrize("how", ["any", "all"]) + @pytest.mark.parametrize("dtype", [None, object, "category"]) + @pytest.mark.parametrize( + "vals,expected", + [ + ([1, 2, 3], [1, 2, 3]), + ([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]), + ([1.0, 2.0, np.nan, 3.0], [1.0, 2.0, 3.0]), + (["A", "B", "C"], ["A", "B", "C"]), + (["A", np.nan, "B", "C"], ["A", "B", "C"]), + ], + ) def test_dropna(self, how, dtype, vals, expected): # GH 6194 index = pd.Index(vals, dtype=dtype) @@ -2334,20 +2597,36 @@ def test_dropna(self, how, dtype, vals, expected): expected = pd.Index(expected, dtype=dtype) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("how", ['any', 'all']) - @pytest.mark.parametrize("index,expected", [ - (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']), - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])), - (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', pd.NaT]), - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])), - (pd.TimedeltaIndex(['1 days', '2 days', '3 days']), - pd.TimedeltaIndex(['1 days', '2 days', '3 days'])), - (pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', '3 days', pd.NaT]), - pd.TimedeltaIndex(['1 days', '2 days', '3 days'])), - (pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'), - pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M')), - (pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], freq='M'), - pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'))]) + @pytest.mark.parametrize("how", ["any", "all"]) + @pytest.mark.parametrize( + "index,expected", + [ + ( + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + ), + ( + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", pd.NaT]), + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + ), + ( + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + ), + ( + pd.TimedeltaIndex([pd.NaT, "1 days", "2 days", "3 days", pd.NaT]), + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + ), + ( + pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + 
pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + ), + ( + pd.PeriodIndex(["2012-02", "2012-04", "NaT", "2012-05"], freq="M"), + pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + ), + ], + ) def test_dropna_dt_like(self, how, index, expected): result = index.dropna(how=how) tm.assert_index_equal(result, expected) @@ -2355,7 +2634,7 @@ def test_dropna_dt_like(self, how, index, expected): def test_dropna_invalid_how_raises(self): msg = "invalid how option: xxx" with pytest.raises(ValueError, match=msg): - pd.Index([1, 2, 3]).dropna(how='xxx') + pd.Index([1, 2, 3]).dropna(how="xxx") def test_get_combined_index(self): result = _get_combined_index([]) @@ -2370,12 +2649,19 @@ def test_repeat(self): result = index.repeat(repeats) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index", [ - pd.Index([np.nan]), pd.Index([np.nan, 1]), - pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]), - pd.to_datetime(['NaT']), pd.to_datetime(['NaT', '2000-01-01']), - pd.to_datetime(['2000-01-01', 'NaT', '2000-01-02']), - pd.to_timedelta(['1 day', 'NaT'])]) + @pytest.mark.parametrize( + "index", + [ + pd.Index([np.nan]), + pd.Index([np.nan, 1]), + pd.Index([1, 2, np.nan]), + pd.Index(["a", "b", np.nan]), + pd.to_datetime(["NaT"]), + pd.to_datetime(["NaT", "2000-01-01"]), + pd.to_datetime(["2000-01-01", "NaT", "2000-01-02"]), + pd.to_timedelta(["1 day", "NaT"]), + ], + ) def test_is_monotonic_na(self, index): assert index.is_monotonic_increasing is False assert index.is_monotonic_decreasing is False @@ -2383,21 +2669,19 @@ def test_is_monotonic_na(self, index): assert index._is_strictly_monotonic_decreasing is False def test_repr_summary(self): - with cf.option_context('display.max_seq_items', 10): + with cf.option_context("display.max_seq_items", 10): result = repr(pd.Index(np.arange(1000))) assert len(result) < 200 assert "..." 
in result @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_int_name_format(self, klass): - index = Index(['a', 'b', 'c'], name=0) + index = Index(["a", "b", "c"], name=0) result = klass(list(range(3)), index=index) - assert '0' in repr(result) + assert "0" in repr(result) def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], - "\u05d1": [4, 5, 6], - "c": [7, 8, 9]}) + df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) repr(df.columns) # should not raise UnicodeDecodeError def test_str_to_bytes_raises(self): @@ -2411,7 +2695,7 @@ def test_intersect_str_dates(self): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] index1 = Index(dt_dates, dtype=object) - index2 = Index(['aa'], dtype=object) + index2 = Index(["aa"], dtype=object) result = index2.intersection(index1) expected = Index([], dtype=object) @@ -2419,78 +2703,107 @@ def test_intersect_str_dates(self): class TestIndexUtils: - - @pytest.mark.parametrize('data, names, expected', [ - ([[1, 2, 3]], None, Index([1, 2, 3])), - ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')), - ([['a', 'a'], ['c', 'd']], None, - MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])), - ([['a', 'a'], ['c', 'd']], ['L1', 'L2'], - MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]], - names=['L1', 'L2'])), - ]) + @pytest.mark.parametrize( + "data, names, expected", + [ + ([[1, 2, 3]], None, Index([1, 2, 3])), + ([[1, 2, 3]], ["name"], Index([1, 2, 3], name="name")), + ( + [["a", "a"], ["c", "d"]], + None, + MultiIndex([["a"], ["c", "d"]], [[0, 0], [0, 1]]), + ), + ( + [["a", "a"], ["c", "d"]], + ["L1", "L2"], + MultiIndex([["a"], ["c", "d"]], [[0, 0], [0, 1]], names=["L1", "L2"]), + ), + ], + ) def test_ensure_index_from_sequences(self, data, names, expected): result = ensure_index_from_sequences(data, names) tm.assert_index_equal(result, expected) def test_ensure_index_mixed_closed_intervals(self): # GH27172 - intervals = [pd.Interval(0, 1, closed='left'), - pd.Interval(1, 2, closed='right'), - pd.Interval(2, 3, closed='neither'), - pd.Interval(3, 4, closed='both')] + intervals = [ + pd.Interval(0, 1, closed="left"), + pd.Interval(1, 2, closed="right"), + pd.Interval(2, 3, closed="neither"), + pd.Interval(3, 4, closed="both"), + ] result = ensure_index(intervals) expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt', - 'add', 'radd', 'sub', 'rsub', - 'mul', 'rmul', 'truediv', 'rtruediv', - 'floordiv', 'rfloordiv', - 'pow', 'rpow', 'mod', 'divmod']) +@pytest.mark.parametrize( + "opname", + [ + "eq", + "ne", + "le", + "lt", + "ge", + "gt", + "add", + "radd", + "sub", + "rsub", + "mul", + "rmul", + "truediv", + "rtruediv", + "floordiv", + "rfloordiv", + "pow", + "rpow", + "mod", + "divmod", + ], +) def test_generated_op_names(opname, indices): index = indices - if isinstance(index, ABCIndex) and opname == 'rsub': + if isinstance(index, ABCIndex) and opname == "rsub": # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return - opname = '__{name}__'.format(name=opname) + opname = "__{name}__".format(name=opname) method = getattr(index, opname) assert method.__name__ == opname -@pytest.mark.parametrize('index_maker', tm.index_subclass_makers_generator()) +@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) def test_index_subclass_constructor_wrong_kwargs(index_maker): # GH #19348 - with pytest.raises(TypeError, match='unexpected keyword argument'): - index_maker(foo='bar') + with pytest.raises(TypeError, match="unexpected keyword argument"): + index_maker(foo="bar") def test_deprecated_fastpath(): with tm.assert_produces_warning(FutureWarning): - idx = pd.Index( - np.array(['a', 'b'], dtype=object), name='test', fastpath=True) + idx = pd.Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True) - expected = pd.Index(['a', 'b'], name='test') + expected = pd.Index(["a", "b"], name="test") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): idx = pd.Int64Index( - np.array([1, 2, 3], dtype='int64'), name='test', fastpath=True) + np.array([1, 2, 3], dtype="int64"), name="test", fastpath=True + ) - expected = pd.Index([1, 2, 3], name='test', dtype='int64') + expected = pd.Index([1, 2, 3], name="test", dtype="int64") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): - idx = pd.RangeIndex(0, 5, 2, name='test', fastpath=True) + idx = pd.RangeIndex(0, 5, 2, name="test", fastpath=True) - expected = pd.RangeIndex(0, 5, 2, name='test') + expected = pd.RangeIndex(0, 5, 2, name="test") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): - idx = pd.CategoricalIndex(['a', 'b', 'c'], name='test', fastpath=True) + idx = pd.CategoricalIndex(["a", "b", "c"], name="test", fastpath=True) - expected = pd.CategoricalIndex(['a', 'b', 'c'], name='test') + expected = pd.CategoricalIndex(["a", "b", "c"], name="test") tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 3b5092c9010619..d52bc818c95aaa 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -25,18 +25,17 @@ def setup_method(self, method): def create_index(self, categories=None, ordered=False): if categories is None: - categories = list('cab') - return CategoricalIndex( - list('aabbca'), categories=categories, ordered=ordered) + categories = list("cab") + return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) def test_can_hold_identifiers(self): - idx = self.create_index(categories=list('abcd')) + idx = self.create_index(categories=list("abcd")) key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True def test_construction(self): - ci = self.create_index(categories=list('abcd')) + ci = self.create_index(categories=list("abcd")) categories = ci.categories result = Index(ci) @@ -50,56 +49,57 @@ def test_construction(self): # empty result = CategoricalIndex(categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, np.array([], dtype='int8')) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) assert not result.ordered # passing categories - result = CategoricalIndex(list('aabbca'), categories=categories) + result = CategoricalIndex(list("aabbca"), categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - 
np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) - c = pd.Categorical(list('aabbca')) + c = pd.Categorical(list("aabbca")) result = CategoricalIndex(c) - tm.assert_index_equal(result.categories, Index(list('abc'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered result = CategoricalIndex(c, categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered - ci = CategoricalIndex(c, categories=list('abcd')) + ci = CategoricalIndex(c, categories=list("abcd")) result = CategoricalIndex(ci) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered - result = CategoricalIndex(ci, categories=list('ab')) - tm.assert_index_equal(result.categories, Index(list('ab'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, -1, 0], dtype='int8')) + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) assert not result.ordered - result = CategoricalIndex(ci, categories=list('ab'), ordered=True) - tm.assert_index_equal(result.categories, Index(list('ab'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, -1, 0], dtype='int8')) + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) assert result.ordered - result = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True) - expected = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True, - dtype='category') + result = pd.CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = pd.CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) tm.assert_index_equal(result, expected, exact=True) # turn me to an Index @@ -110,24 +110,22 @@ def test_construction(self): def test_construction_with_dtype(self): # specify dtype - ci = self.create_index(categories=list('abc')) + ci = self.create_index(categories=list("abc")) - result = Index(np.array(ci), dtype='category') + result = Index(np.array(ci), dtype="category") tm.assert_index_equal(result, ci, exact=True) - result = Index(np.array(ci).tolist(), dtype='category') + result = Index(np.array(ci).tolist(), dtype="category") tm.assert_index_equal(result, ci, exact=True) # these are generally only equal when the categories are reordered ci = self.create_index() - result = Index( - np.array(ci), dtype='category').reorder_categories(ci.categories) + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) tm.assert_index_equal(result, ci, exact=True) # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], 
- ordered=True) + expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) idx = Index(range(3)) result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) @@ -141,7 +139,7 @@ def test_construction_empty_with_bool_categories(self): def test_construction_with_categorical_dtype(self): # construction with CategoricalDtype # GH18109 - data, cats, ordered = 'a a b b'.split(), 'c b a'.split(), True + data, cats, ordered = "a a b b".split(), "c b a".split(), True dtype = CategoricalDtype(categories=cats, ordered=ordered) result = CategoricalIndex(data, dtype=dtype) @@ -170,93 +168,108 @@ def test_create_categorical(self): # https://github.com/pandas-dev/pandas/pull/17513 # The public CI constructor doesn't hit this code path with # instances of CategoricalIndex, but we still want to test the code - ci = CategoricalIndex(['a', 'b', 'c']) + ci = CategoricalIndex(["a", "b", "c"]) # First ci is self, second ci is data. result = CategoricalIndex._create_categorical(ci, ci) - expected = Categorical(['a', 'b', 'c']) + expected = Categorical(["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('func,op_name', [ - (lambda idx: idx - idx, '__sub__'), - (lambda idx: idx + idx, '__add__'), - (lambda idx: idx - ['a', 'b'], '__sub__'), - (lambda idx: idx + ['a', 'b'], '__add__'), - (lambda idx: ['a', 'b'] - idx, '__rsub__'), - (lambda idx: ['a', 'b'] + idx, '__radd__'), - ]) + @pytest.mark.parametrize( + "func,op_name", + [ + (lambda idx: idx - idx, "__sub__"), + (lambda idx: idx + idx, "__add__"), + (lambda idx: idx - ["a", "b"], "__sub__"), + (lambda idx: idx + ["a", "b"], "__add__"), + (lambda idx: ["a", "b"] - idx, "__rsub__"), + (lambda idx: ["a", "b"] + idx, "__radd__"), + ], + ) def test_disallow_set_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError - idx = pd.Index(pd.Categorical(['a', 'b'])) + idx = pd.Index(pd.Categorical(["a", "b"])) msg = "cannot perform {} with this index type: CategoricalIndex" with pytest.raises(TypeError, match=msg.format(op_name)): func(idx) def test_method_delegation(self): - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) - result = ci.set_categories(list('cab')) - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cab'))) + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.set_categories(list("cab")) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cab")) + ) - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.rename_categories(list('efg')) - tm.assert_index_equal(result, CategoricalIndex( - list('ffggef'), categories=list('efg'))) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.rename_categories(list("efg")) + tm.assert_index_equal( + result, CategoricalIndex(list("ffggef"), categories=list("efg")) + ) # GH18862 (let rename_categories take callables) result = ci.rename_categories(lambda x: x.upper()) - tm.assert_index_equal(result, CategoricalIndex( - list('AABBCA'), categories=list('CAB'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.add_categories(['d']) - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cabd'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.remove_categories(['c']) - tm.assert_index_equal(result, CategoricalIndex( - list('aabb') + [np.nan] + ['a'], 
categories=list('ab'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + tm.assert_index_equal( + result, CategoricalIndex(list("AABBCA"), categories=list("CAB")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.add_categories(["d"]) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cabd")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.remove_categories(["c"]) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabb") + [np.nan] + ["a"], categories=list("ab")), + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_unordered() tm.assert_index_equal(result, ci) - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_ordered() - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cabdef'), ordered=True)) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True), + ) # invalid msg = "cannot use inplace with CategoricalIndex" with pytest.raises(ValueError, match=msg): - ci.set_categories(list('cab'), inplace=True) + ci.set_categories(list("cab"), inplace=True) def test_contains(self): - ci = self.create_index(categories=list('cabdef')) + ci = self.create_index(categories=list("cabdef")) - assert 'a' in ci - assert 'z' not in ci - assert 'e' not in ci + assert "a" in ci + assert "z" not in ci + assert "e" not in ci assert np.nan not in ci # assert codes NOT in index assert 0 not in ci assert 1 not in ci - ci = CategoricalIndex( - list('aabbca') + [np.nan], categories=list('cabdef')) + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) assert np.nan in ci - @pytest.mark.parametrize('item, expected', [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ('a', False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False)], ids=str) + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) @@ -264,49 +277,47 @@ def test_contains_interval(self, item, expected): assert result is expected def test_map(self): - ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), - ordered=True) + ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'), - ordered=True) + exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) tm.assert_index_equal(result, exp) - ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), - ordered=False, name='XXX') + ci = pd.CategoricalIndex( + list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" + ) result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'), - ordered=False, name='XXX') + exp = pd.CategoricalIndex( + list("ababc"), categories=list("bac"), ordered=False, name="XXX" + ) tm.assert_index_equal(result, exp) # GH 12766: Return an index not an array - tm.assert_index_equal(ci.map(lambda x: 1), - Index(np.array([1] * 5, dtype=np.int64), - name='XXX')) + tm.assert_index_equal( + 
ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") + ) # change categories dtype - ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), - ordered=False) + ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) def f(x): - return {'A': 10, 'B': 20, 'C': 30}.get(x) + return {"A": 10, "B": 20, "C": 30}.get(x) result = ci.map(f) - exp = pd.CategoricalIndex([10, 20, 10, 20, 30], - categories=[20, 10, 30], - ordered=False) + exp = pd.CategoricalIndex( + [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False + ) tm.assert_index_equal(result, exp) - result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) + result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) - result = ci.map({'A': 10, 'B': 20, 'C': 30}) + result = ci.map({"A": 10, "B": 20, "C": 30}) tm.assert_index_equal(result, exp) def test_map_with_categorical_series(self): # GH 12756 a = pd.Index([1, 2, 3, 4]) - b = pd.Series(["even", "odd", "even", "odd"], - dtype="category") + b = pd.Series(["even", "odd", "even", "odd"], dtype="category") c = pd.Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) @@ -315,18 +326,16 @@ def test_map_with_categorical_series(self): tm.assert_index_equal(a.map(c), exp) @pytest.mark.parametrize( - ( - 'data', - 'f' - ), + ("data", "f"), ( ([1, 1, np.nan], pd.isna), ([1, 2, np.nan], pd.isna), ([1, 1, np.nan], {1: False}), ([1, 2, np.nan], {1: False, 2: False}), ([1, 1, np.nan], pd.Series([False, False])), - ([1, 2, np.nan], pd.Series([False, False, False])) - )) + ([1, 2, np.nan], pd.Series([False, False, False])), + ), + ) def test_map_with_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) @@ -337,7 +346,7 @@ def test_map_with_nan(self, data, f): # GH 24241 expected = pd.Index([False, False, np.nan]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() cond = [True] * len(i) @@ -346,8 +355,7 @@ def test_where(self, klass): tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) - expected = CategoricalIndex([np.nan] + i[1:].tolist(), - categories=i.categories) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) result = i.where(klass(cond)) tm.assert_index_equal(result, expected) @@ -371,31 +379,31 @@ def test_append(self): # appending with different categories or reordered is not ok msg = "all inputs must be Index" with pytest.raises(TypeError, match=msg): - ci.append(ci.values.set_categories(list('abcd'))) + ci.append(ci.values.set_categories(list("abcd"))) with pytest.raises(TypeError, match=msg): - ci.append(ci.values.reorder_categories(list('abc'))) + ci.append(ci.values.reorder_categories(list("abc"))) # with objects - result = ci.append(Index(['c', 'a'])) - expected = CategoricalIndex(list('aabbcaca'), categories=categories) + result = ci.append(Index(["c", "a"])) + expected = CategoricalIndex(list("aabbcaca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid objects msg = "cannot append a non-category item to a CategoricalIndex" with pytest.raises(TypeError, match=msg): - ci.append(Index(['a', 'd'])) + ci.append(Index(["a", "d"])) # GH14298 - if base object is not categorical -> coerce to object - result = Index(['c', 'a']).append(ci) - expected 
= Index(list('caaabbca')) + result = Index(["c", "a"]).append(ci) + expected = Index(list("caaabbca")) tm.assert_index_equal(result, expected, exact=True) def test_append_to_another(self): # hits _concat_index_asobject - fst = Index(['a', 'b']) - snd = CategoricalIndex(['d', 'e']) + fst = Index(["a", "b"]) + snd = CategoricalIndex(["d", "e"]) result = fst.append(snd) - expected = Index(['a', 'b', 'd', 'e']) + expected = Index(["a", "b", "d", "e"]) tm.assert_index_equal(result, expected) def test_insert(self): @@ -404,30 +412,32 @@ def test_insert(self): categories = ci.categories # test 0th element - result = ci.insert(0, 'a') - expected = CategoricalIndex(list('aaabbca'), categories=categories) + result = ci.insert(0, "a") + expected = CategoricalIndex(list("aaabbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test Nth element that follows Python list behavior - result = ci.insert(-1, 'a') - expected = CategoricalIndex(list('aabbcaa'), categories=categories) + result = ci.insert(-1, "a") + expected = CategoricalIndex(list("aabbcaa"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test empty - result = CategoricalIndex(categories=categories).insert(0, 'a') - expected = CategoricalIndex(['a'], categories=categories) + result = CategoricalIndex(categories=categories).insert(0, "a") + expected = CategoricalIndex(["a"], categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid - msg = ("cannot insert an item into a CategoricalIndex that is not" - " already an existing category") + msg = ( + "cannot insert an item into a CategoricalIndex that is not" + " already an existing category" + ) with pytest.raises(TypeError, match=msg): - ci.insert(0, 'd') + ci.insert(0, "d") # GH 18295 (test missing) - expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) + expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) for na in (np.nan, pd.NaT, None): - result = CategoricalIndex(list('aabcb')).insert(1, na) + result = CategoricalIndex(list("aabcb")).insert(1, na) tm.assert_index_equal(result, expected) def test_delete(self): @@ -436,11 +446,11 @@ def test_delete(self): categories = ci.categories result = ci.delete(0) - expected = CategoricalIndex(list('abbca'), categories=categories) + expected = CategoricalIndex(list("abbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) result = ci.delete(-1) - expected = CategoricalIndex(list('aabbc'), categories=categories) + expected = CategoricalIndex(list("aabbc"), categories=categories) tm.assert_index_equal(result, expected, exact=True) with pytest.raises((IndexError, ValueError)): @@ -459,23 +469,22 @@ def test_astype(self): assert not isinstance(result, CategoricalIndex) # interval - ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], - right=[2, 4], - closed='right') + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") - ci = CategoricalIndex(Categorical.from_codes( - [0, 1, -1], categories=ii, ordered=True)) + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) - result = ci.astype('interval') + result = ci.astype("interval") expected = ii.take([0, 1, -1]) tm.assert_index_equal(result, expected) result = IntervalIndex(result.values) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('index_ordered', [True, False]) + 
@pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) def test_astype_category(self, name, dtype_ordered, index_ordered): # GH 18630 index = self.create_index(ordered=index_ordered) @@ -485,10 +494,12 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = index.astype(dtype) - expected = CategoricalIndex(index.tolist(), - name=name, - categories=index.categories, - ordered=dtype_ordered) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) tm.assert_index_equal(result, expected) # non-standard categories @@ -499,17 +510,18 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once - result = index.astype('category') + result = index.astype("category") expected = index tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_astype_category_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) - idx = CategoricalIndex(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) + idx = CategoricalIndex(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning): idx.astype(cdt2) @@ -550,59 +562,52 @@ def test_reindexing(self): tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): - c = CategoricalIndex(['a', 'b', 'c', 'a']) - res, indexer = c.reindex(['a', 'c']) - tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(["a", "c"]) + tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a']) - res, indexer = c.reindex(Categorical(['a', 'c'])) + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - res, indexer = c.reindex(['a', 'c']) - exp = Index(['a', 'a', 'c'], dtype='object') + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(["a", "c"]) + exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - 
res, indexer = c.reindex(Categorical(['a', 'c'])) - exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): # See GH23963 - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - with pytest.raises(ValueError, match='non-unique indexer'): - c.reindex(['a', 'a', 'c']) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match="non-unique indexer"): + c.reindex(["a", "a", "c"]) - with pytest.raises(ValueError, match='non-unique indexer'): - c.reindex(CategoricalIndex(['a', 'a', 'c'], - categories=['a', 'b', 'c', 'd'])) + with pytest.raises(ValueError, match="non-unique indexer"): + c.reindex( + CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) + ) def test_reindex_empty_index(self): # See GH16770 c = CategoricalIndex([]) - res, indexer = c.reindex(['a', 'b']) - tm.assert_index_equal(res, Index(['a', 'b']), exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([-1, -1], dtype=np.intp)) - - @pytest.mark.parametrize('data, non_lexsorted_data', [ - [[1, 2, 3], [9, 0, 1, 2, 3]], - [list('abc'), list('fabcd')], - ]) + res, indexer = c.reindex(["a", "b"]) + tm.assert_index_equal(res, Index(["a", "b"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) + + @pytest.mark.parametrize( + "data, non_lexsorted_data", + [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], + ) def test_is_monotonic(self, data, non_lexsorted_data): c = CategoricalIndex(data) assert c.is_monotonic_increasing is True @@ -639,86 +644,89 @@ def test_is_monotonic(self, data, non_lexsorted_data): def test_has_duplicates(self): - idx = CategoricalIndex([0, 0, 0], name='foo') + idx = CategoricalIndex([0, 0, 0], name="foo") assert idx.is_unique is False assert idx.has_duplicates is True def test_drop_duplicates(self): - idx = CategoricalIndex([0, 0, 0], name='foo') - expected = CategoricalIndex([0], name='foo') + idx = CategoricalIndex([0, 0, 0], name="foo") + expected = CategoricalIndex([0], name="foo") tm.assert_index_equal(idx.drop_duplicates(), expected) tm.assert_index_equal(idx.unique(), expected) def test_get_indexer(self): - idx1 = CategoricalIndex(list('aabcde'), categories=list('edabc')) - idx2 = CategoricalIndex(list('abf')) + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) - for indexer in [idx2, list('abf'), Index(list('abf'))]: + for indexer in [idx2, list("abf"), Index(list("abf"))]: r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) - msg = ("method='pad' and method='backfill' not implemented yet for" - " CategoricalIndex") + msg = ( + "method='pad' and method='backfill' not implemented yet for" + " CategoricalIndex" + ) with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='pad') + idx2.get_indexer(idx1, method="pad") with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='backfill') + idx2.get_indexer(idx1, method="backfill") msg = "method='nearest' not implemented yet for 
CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='nearest') + idx2.get_indexer(idx1, method="nearest") def test_get_loc(self): # GH 12531 - cidx1 = CategoricalIndex(list('abcde'), categories=list('edabc')) - idx1 = Index(list('abcde')) - assert cidx1.get_loc('a') == idx1.get_loc('a') - assert cidx1.get_loc('e') == idx1.get_loc('e') + cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) + idx1 = Index(list("abcde")) + assert cidx1.get_loc("a") == idx1.get_loc("a") + assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: with pytest.raises(KeyError): - i.get_loc('NOT-EXIST') + i.get_loc("NOT-EXIST") # non-unique - cidx2 = CategoricalIndex(list('aacded'), categories=list('edabc')) - idx2 = Index(list('aacded')) + cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) + idx2 = Index(list("aacded")) # results in bool array - res = cidx2.get_loc('d') - tm.assert_numpy_array_equal(res, idx2.get_loc('d')) - tm.assert_numpy_array_equal(res, np.array([False, False, False, - True, False, True])) + res = cidx2.get_loc("d") + tm.assert_numpy_array_equal(res, idx2.get_loc("d")) + tm.assert_numpy_array_equal( + res, np.array([False, False, False, True, False, True]) + ) # unique element results in scalar - res = cidx2.get_loc('e') - assert res == idx2.get_loc('e') + res = cidx2.get_loc("e") + assert res == idx2.get_loc("e") assert res == 4 for i in [cidx2, idx2]: with pytest.raises(KeyError): - i.get_loc('NOT-EXIST') + i.get_loc("NOT-EXIST") # non-unique, sliceable - cidx3 = CategoricalIndex(list('aabbb'), categories=list('abc')) - idx3 = Index(list('aabbb')) + cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) + idx3 = Index(list("aabbb")) # results in slice - res = cidx3.get_loc('a') - assert res == idx3.get_loc('a') + res = cidx3.get_loc("a") + assert res == idx3.get_loc("a") assert res == slice(0, 2, None) - res = cidx3.get_loc('b') - assert res == idx3.get_loc('b') + res = cidx3.get_loc("b") + assert res == idx3.get_loc("b") assert res == slice(2, 5, None) for i in [cidx3, idx3]: with pytest.raises(KeyError): - i.get_loc('c') + i.get_loc("c") def test_repr_roundtrip(self): - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) tm.assert_index_equal(eval(repr(ci)), ci, exact=True) @@ -732,30 +740,30 @@ def test_repr_roundtrip(self): def test_isin(self): - ci = CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) tm.assert_numpy_array_equal( - ci.isin(['c']), - np.array([False, False, False, True, False, False])) + ci.isin(["c"]), np.array([False, False, False, True, False, False]) + ) tm.assert_numpy_array_equal( - ci.isin(['c', 'a', 'b']), np.array([True] * 5 + [False])) + ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False]) + ) tm.assert_numpy_array_equal( - ci.isin(['c', 'a', 'b', np.nan]), np.array([True] * 6)) + ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6) + ) # mismatched categorical -> coerced to ndarray so doesn't matter - result = ci.isin(ci.set_categories(list('abcdefghi'))) + result = ci.isin(ci.set_categories(list("abcdefghi"))) expected = np.array([True] * 6) tm.assert_numpy_array_equal(result, expected) - result = ci.isin(ci.set_categories(list('defghi'))) + result = ci.isin(ci.set_categories(list("defghi"))) expected = np.array([False] * 5 + [True]) 
tm.assert_numpy_array_equal(result, expected) def test_identical(self): - ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) - ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], - ordered=True) + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.identical(ci1) assert ci1.identical(ci1.copy()) assert not ci1.identical(ci2) @@ -777,9 +785,8 @@ def test_ensure_copied_data(self): assert _base(index.values) is _base(result.values) def test_equals_categorical(self): - ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) - ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], - ordered=True) + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.equals(ci1) assert not ci1.equals(ci2) @@ -794,71 +801,69 @@ def test_equals_categorical(self): assert (ci1 >= ci1).all() assert not (ci1 == 1).all() - assert (ci1 == Index(['a', 'b'])).all() + assert (ci1 == Index(["a", "b"])).all() assert (ci1 == ci1.values).all() # invalid comparisons with pytest.raises(ValueError, match="Lengths must match"): - ci1 == Index(['a', 'b', 'c']) + ci1 == Index(["a", "b", "c"]) - msg = ("categorical index comparisons must have the same categories" - " and ordered attributes") + msg = ( + "categorical index comparisons must have the same categories" + " and ordered attributes" + ) with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, ordered=False) with pytest.raises(TypeError, match=msg): - ci1 == Categorical(ci1.values, categories=list('abc')) + ci1 == Categorical(ci1.values, categories=list("abc")) # tests # make sure that we are testing for category inclusion properly - ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca')) + ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) # Same categories, but different order # Unordered - assert ci.equals(CategoricalIndex(list('aabca'))) + assert ci.equals(CategoricalIndex(list("aabca"))) # Ordered - assert not ci.equals(CategoricalIndex(list('aabca'), ordered=True)) + assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) assert ci.equals(ci.copy()) - ci = CategoricalIndex(list('aabca') + [np.nan], - categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca')) - assert not ci.equals(CategoricalIndex(list('aabca'))) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + assert not ci.equals(CategoricalIndex(list("aabca"))) assert ci.equals(ci.copy()) - ci = CategoricalIndex(list('aabca') + [np.nan], - categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca') + [np.nan]) - assert ci.equals(CategoricalIndex(list('aabca') + [np.nan])) - assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan], - ordered=True)) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca") + [np.nan]) + assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) + assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy()) def test_equals_categoridcal_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 - a = pd.CategoricalIndex(['A'], 
categories=['A', 'B']) - b = pd.CategoricalIndex(['A'], categories=['B', 'A']) - c = pd.CategoricalIndex(['C'], categories=['B', 'A']) + a = pd.CategoricalIndex(["A"], categories=["A", "B"]) + b = pd.CategoricalIndex(["A"], categories=["B", "A"]) + c = pd.CategoricalIndex(["C"], categories=["B", "A"]) assert a.equals(b) assert not a.equals(c) assert not b.equals(c) def test_frame_repr(self): - df = pd.DataFrame({"A": [1, 2, 3]}, - index=pd.CategoricalIndex(['a', 'b', 'c'])) + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"])) result = repr(df) - expected = ' A\na 1\nb 2\nc 3' + expected = " A\na 1\nb 2\nc 3" assert result == expected def test_string_categorical_index_repr(self): # short - idx = pd.CategoricalIndex(['a', 'bb', 'ccc']) + idx = pd.CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 10) + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], @@ -867,7 +872,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 100) + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], @@ -876,7 +881,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('abcdefghijklmmo')) + idx = pd.CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa @@ -884,12 +889,12 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # short - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう']) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 10) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], @@ -898,7 +903,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 100) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], @@ -907,7 +912,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('あいうえおかきくけこさしすせそ')) + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa @@ -915,15 +920,15 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # Emable Unicode option ----------------------------------------- - with cf.option_context('display.unicode.east_asian_width', True): + with cf.option_context("display.unicode.east_asian_width", True): # short - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう']) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 10) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -933,7 +938,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 100) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... @@ -944,7 +949,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('あいうえおかきくけこさしすせそ')) + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa @@ -953,13 +958,13 @@ def test_string_categorical_index_repr(self): def test_fillna_categorical(self): # GH 11343 - idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name='x') + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") # fill by value in categories - exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name='x') + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = 'fill value must be in categories' + msg = "fill value must be in categories" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) @@ -967,53 +972,55 @@ def test_take_fill_value(self): # GH 12631 # numeric category - idx = pd.CategoricalIndex([1, 2, 3], name='xxx') + idx = pd.CategoricalIndex([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex([2, 1, 3], name='xxx') + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], - name='xxx') + expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, 
- fill_value=True) - expected = pd.CategoricalIndex([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category - idx = pd.CategoricalIndex(list('CBA'), categories=list('ABC'), - ordered=True, name='xxx') + idx = pd.CategoricalIndex( + list("CBA"), categories=list("ABC"), ordered=True, name="xxx" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), - ordered=True, name='xxx') + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex(['B', 'C', np.nan], - categories=list('ABC'), ordered=True, - name='xxx') + expected = pd.CategoricalIndex( + ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), - ordered=True, name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -1025,33 +1032,34 @@ def test_take_fill_value(self): def test_take_fill_value_datetime(self): # datetime category - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx') + idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") idx = pd.CategoricalIndex(idx) result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) expected = pd.CategoricalIndex(expected) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx') - exp_cats = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) expected = pd.CategoricalIndex(expected, categories=exp_cats) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) expected = pd.CategoricalIndex(expected) 
tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -1061,7 +1069,7 @@ def test_take_fill_value_datetime(self): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): - idx = pd.CategoricalIndex([1, 2, 3], name='foo') + idx = pd.CategoricalIndex([1, 2, 3], name="foo") indices = [1, 0, -1] msg = r"take\(\) got an unexpected keyword argument 'foo'" @@ -1074,14 +1082,17 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') - - @pytest.mark.parametrize('dtype, engine_type', [ - (np.int8, libindex.Int8Engine), - (np.int16, libindex.Int16Engine), - (np.int32, libindex.Int32Engine), - (np.int64, libindex.Int64Engine), - ]) + idx.take(indices, mode="clip") + + @pytest.mark.parametrize( + "dtype, engine_type", + [ + (np.int8, libindex.Int8Engine), + (np.int16, libindex.Int16Engine), + (np.int32, libindex.Int32Engine), + (np.int64, libindex.Int64Engine), + ], + ) def test_engine_type(self, dtype, engine_type): if dtype != np.int64: # num. of uniques required to push CategoricalIndex.codes to a @@ -1092,6 +1103,6 @@ def test_engine_type(self, dtype, engine_type): # having 2**32 - 2**31 categories would be very memory-intensive, # so we cheat a bit with the dtype ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) - ci.values._codes = ci.values._codes.astype('int64') + ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 3cb907c6f58442..465b7f5e23bb8a 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -18,7 +18,6 @@ class TestCommon: - def test_droplevel(self, indices): # GH 21115 if isinstance(indices, MultiIndex): @@ -34,7 +33,7 @@ def test_droplevel(self, indices): with pytest.raises(ValueError): indices.droplevel(level) - for level in 'wrong', ['wrong']: + for level in "wrong", ["wrong"]: with pytest.raises(KeyError): indices.droplevel(level) @@ -45,7 +44,7 @@ def test_constructor_non_hashable_name(self, indices): pytest.skip("multiindex handled in test_multi.py") message = "Index.name must be a hashable type" - renamed = [['1']] + renamed = [["1"]] # With .rename() with pytest.raises(TypeError, match=message): @@ -62,7 +61,7 @@ def test_constructor_unwraps_index(self, indices): b = type(a)(a) tm.assert_equal(a._data, b._data) - @pytest.mark.parametrize("itm", [101, 'no_int']) + @pytest.mark.parametrize("itm", [101, "no_int"]) # FutureWarning from non-tuple sequence of nd indexing @pytest.mark.filterwarnings("ignore::FutureWarning") def test_getitem_error(self, indices, itm): @@ -70,14 +69,15 @@ def test_getitem_error(self, indices, itm): indices[itm] @pytest.mark.parametrize( - 'fname, sname, expected_name', + "fname, sname, expected_name", [ - ('A', 'A', 'A'), - ('A', 'B', None), - ('A', None, None), - (None, 'B', None), + ("A", "A", "A"), + ("A", "B", None), + ("A", None, None), + (None, "B", None), (None, None, None), - ]) + ], + ) def test_corner_union(self, indices, fname, sname, expected_name): # GH 9943 9862 # Test unions with various 
name combinations @@ -131,7 +131,7 @@ def test_set_name_methods(self, indices): # don't tests a MultiIndex here (as its tested separated) if isinstance(indices, MultiIndex): - pytest.skip('Skip check for MultiIndex') + pytest.skip("Skip check for MultiIndex") original_name = indices.name new_ind = indices.set_names([new_name]) assert new_ind.name == new_name @@ -149,7 +149,7 @@ def test_set_name_methods(self, indices): indices.set_names("a", level=0) # rename in place just leaves tuples and other containers alone - name = ('A', 'B') + name = ("A", "B") indices.rename(name, inplace=True) assert indices.name == name assert indices.names == [name] @@ -162,15 +162,16 @@ def test_dtype_str(self, indices): def test_hash_error(self, indices): index = indices - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(indices) def test_copy_and_deepcopy(self, indices): from copy import copy, deepcopy if isinstance(indices, MultiIndex): - pytest.skip('Skip check for MultiIndex') + pytest.skip("Skip check for MultiIndex") for func in (copy, deepcopy): idx_copy = func(indices) @@ -184,7 +185,7 @@ def test_unique(self, indices): # don't test a MultiIndex here (as its tested separated) # don't test a CategoricalIndex because categories change (GH 18291) if isinstance(indices, (MultiIndex, CategoricalIndex)): - pytest.skip('Skip check for MultiIndex/CategoricalIndex') + pytest.skip("Skip check for MultiIndex/CategoricalIndex") # GH 17896 expected = indices.drop_duplicates() @@ -197,14 +198,15 @@ def test_unique(self, indices): indices.unique(level=3) msg = r"Level wrong must be same as name \({}\)".format( - re.escape(indices.name.__repr__())) + re.escape(indices.name.__repr__()) + ) with pytest.raises(KeyError, match=msg): - indices.unique(level='wrong') + indices.unique(level="wrong") def test_get_unique_index(self, indices): # MultiIndex tested separately if not len(indices) or isinstance(indices, MultiIndex): - pytest.skip('Skip check for empty Index and MultiIndex') + pytest.skip("Skip check for empty Index and MultiIndex") idx = indices[[0] * 5] idx_unique = indices[[0]] @@ -223,7 +225,7 @@ def test_get_unique_index(self, indices): # nans: if not indices._can_hold_na: - pytest.skip('Skip na-check if index cannot hold na') + pytest.skip("Skip na-check if index cannot hold na") if needs_i8_conversion(indices): vals = indices.asi8[[0] * 5] @@ -240,9 +242,7 @@ def test_get_unique_index(self, indices): assert idx_nan.dtype == indices.dtype assert idx_unique_nan.dtype == indices.dtype - for dropna, expected in zip([False, True], - [idx_unique_nan, - idx_unique]): + for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: result = i._get_unique_index(dropna=dropna) tm.assert_index_equal(result, expected) @@ -254,7 +254,7 @@ def test_sort(self, indices): def test_mutability(self, indices): if not len(indices): - pytest.skip('Skip check for empty Index') + pytest.skip("Skip check for empty Index") msg = "Index does not support mutable operations" with pytest.raises(TypeError, match=msg): indices[0] = indices[0] @@ -270,11 +270,11 @@ def test_searchsorted_monotonic(self, indices): # not implemented for tuple searches in MultiIndex # or Intervals searches in IntervalIndex if isinstance(indices, (MultiIndex, pd.IntervalIndex)): - pytest.skip('Skip check for MultiIndex/IntervalIndex') + pytest.skip("Skip check for 
MultiIndex/IntervalIndex") # nothing to test if the index is empty if indices.empty: - pytest.skip('Skip check for empty Index') + pytest.skip("Skip check for empty Index") value = indices[0] # determine the expected results (handle dupes for 'right') @@ -286,41 +286,41 @@ def test_searchsorted_monotonic(self, indices): # test _searchsorted_monotonic in all cases # test searchsorted only for increasing if indices.is_monotonic_increasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right - ss_left = indices.searchsorted(value, side='left') + ss_left = indices.searchsorted(value, side="left") assert expected_left == ss_left - ss_right = indices.searchsorted(value, side='right') + ss_right = indices.searchsorted(value, side="right") assert expected_right == ss_right elif indices.is_monotonic_decreasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right else: # non-monotonic should raise. with pytest.raises(ValueError): - indices._searchsorted_monotonic(value, side='left') + indices._searchsorted_monotonic(value, side="left") def test_pickle(self, indices): - original_name, indices.name = indices.name, 'foo' + original_name, indices.name = indices.name, "foo" unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) indices.name = original_name - @pytest.mark.parametrize('keep', ['first', 'last', False]) + @pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated(self, indices, keep): if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates - pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex') + pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex") holder = type(indices) @@ -348,8 +348,7 @@ def test_has_duplicates(self, indices): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates. # RangeIndex is unique by definition. 
- pytest.skip('Skip check for empty Index, MultiIndex, ' - 'and RangeIndex') + pytest.skip("Skip check for empty Index, MultiIndex, " "and RangeIndex") idx = holder([indices[0]] * 5) assert idx.is_unique is False diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 56efd4bbfd62a6..57acc8ee72b70c 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -9,7 +9,7 @@ class TestFrozenList(CheckImmutable, CheckStringMixin): - mutable_methods = ('extend', 'pop', 'remove', 'insert') + mutable_methods = ("extend", "pop", "remove", "insert") unicode_container = FrozenList(["\u05d0", "\u05d1", "c"]) def setup_method(self, _): @@ -58,7 +58,7 @@ def test_tricky_container_to_bytes_raises(self): class TestFrozenNDArray(CheckImmutable, CheckStringMixin): - mutable_methods = ('put', 'itemset', 'fill') + mutable_methods = ("put", "itemset", "fill") def setup_method(self, _): self.lst = [3, 5, 7, -2] diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 3437f501aa9109..1feb82a923b197 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -14,7 +14,6 @@ class Numeric(Base): - def test_can_hold_identifiers(self): idx = self.create_index() key = idx[0] @@ -27,10 +26,10 @@ def test_explicit_conversions(self): # GH 8608 # add/sub are overridden explicitly for Float/Int Index - idx = self._holder(np.arange(5, dtype='int64')) + idx = self._holder(np.arange(5, dtype="int64")) # float conversions - arr = np.arange(5, dtype='int64') * 3.2 + arr = np.arange(5, dtype="int64") * 3.2 expected = Float64Index(arr) fidx = idx * 3.2 tm.assert_index_equal(fidx, expected) @@ -39,40 +38,44 @@ def test_explicit_conversions(self): # interops with numpy arrays expected = Float64Index(arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = fidx - a tm.assert_index_equal(result, expected) expected = Float64Index(-arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = a - fidx tm.assert_index_equal(result, expected) def test_index_groupby(self): int_idx = Index(range(6)) float_idx = Index(np.arange(0, 0.6, 0.1)) - obj_idx = Index('A B C D E F'.split()) - dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + obj_idx = Index("A B C D E F".split()) + dt_idx = pd.date_range("2013-01-01", freq="M", periods=6) for idx in [int_idx, float_idx, obj_idx, dt_idx]: to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) - tm.assert_dict_equal(idx.groupby(to_groupby), - {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]}) - - to_groupby = Index([datetime(2011, 11, 1), - datetime(2011, 12, 1), - pd.NaT, - pd.NaT, - datetime(2011, 12, 1), - datetime(2011, 11, 1)], - tz='UTC').values - - ex_keys = [Timestamp('2011-11-01'), Timestamp('2011-12-01')] - expected = {ex_keys[0]: idx[[0, 5]], - ex_keys[1]: idx[[1, 4]]} + tm.assert_dict_equal( + idx.groupby(to_groupby), {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]} + ) + + to_groupby = Index( + [ + datetime(2011, 11, 1), + datetime(2011, 12, 1), + pd.NaT, + pd.NaT, + datetime(2011, 12, 1), + datetime(2011, 11, 1), + ], + tz="UTC", + ).values + + ex_keys = [Timestamp("2011-11-01"), Timestamp("2011-12-01")] + expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]} tm.assert_dict_equal(idx.groupby(to_groupby), expected) - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): i = self.create_index() cond = 
[True] * len(i) @@ -96,14 +99,16 @@ class TestFloat64Index(Numeric): _holder = Float64Index def setup_method(self, method): - self.indices = dict(mixed=Float64Index([1.5, 2, 3, 4, 5]), - float=Float64Index(np.arange(5) * 2.5), - mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), - float_dec=Float64Index(np.arange(4, -1, -1) * 2.5)) + self.indices = dict( + mixed=Float64Index([1.5, 2, 3, 4, 5]), + float=Float64Index(np.arange(5) * 2.5), + mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), + float_dec=Float64Index(np.arange(4, -1, -1) * 2.5), + ) self.setup_indices() def create_index(self): - return Float64Index(np.arange(5, dtype='float64')) + return Float64Index(np.arange(5, dtype="float64")) def test_repr_roundtrip(self): for ind in (self.mixed, self.float): @@ -126,17 +131,17 @@ def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) assert isinstance(index, Float64Index) - expected = np.array([1, 2, 3, 4, 5], dtype='float64') + expected = np.array([1, 2, 3, 4, 5], dtype="float64") tm.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) assert isinstance(index, Float64Index) - index = Float64Index([1., 2, 3, 4, 5]) + index = Float64Index([1.0, 2, 3, 4, 5]) assert isinstance(index, Float64Index) - index = Float64Index(np.array([1., 2, 3, 4, 5])) + index = Float64Index(np.array([1.0, 2, 3, 4, 5])) assert isinstance(index, Float64Index) assert index.dtype == float - index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32) + index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 @@ -155,34 +160,37 @@ def test_constructor(self): def test_constructor_invalid(self): # invalid - msg = (r"Float64Index\(\.\.\.\) must be called with a collection of" - r" some kind, 0\.0 was passed") + msg = ( + r"Float64Index\(\.\.\.\) must be called with a collection of" + r" some kind, 0\.0 was passed" + ) with pytest.raises(TypeError, match=msg): - Float64Index(0.) 
- msg = ("String dtype not supported, you may need to explicitly cast to" - " a numeric type") + Float64Index(0.0) + msg = ( + "String dtype not supported, you may need to explicitly cast to" + " a numeric type" + ) with pytest.raises(TypeError, match=msg): - Float64Index(['a', 'b', 0.]) - msg = (r"float\(\) argument must be a string or a number, not" - " 'Timestamp'") + Float64Index(["a", "b", 0.0]) + msg = r"float\(\) argument must be a string or a number, not" " 'Timestamp'" with pytest.raises(TypeError, match=msg): - Float64Index([Timestamp('20130101')]) + Float64Index([Timestamp("20130101")]) def test_constructor_coerce(self): self.check_coerce(self.mixed, Index([1.5, 2, 3, 4, 5])) self.check_coerce(self.float, Index(np.arange(5) * 2.5)) - self.check_coerce(self.float, Index(np.array( - np.arange(5) * 2.5, dtype=object))) + self.check_coerce(self.float, Index(np.array(np.arange(5) * 2.5, dtype=object))) def test_constructor_explicit(self): # these don't auto convert - self.check_coerce(self.float, - Index((np.arange(5) * 2.5), dtype=object), - is_float_index=False) - self.check_coerce(self.mixed, Index( - [1.5, 2, 3, 4, 5], dtype=object), is_float_index=False) + self.check_coerce( + self.float, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False + ) + self.check_coerce( + self.mixed, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False + ) def test_astype(self): @@ -192,7 +200,7 @@ def test_astype(self): self.check_is_index(result) i = self.mixed.copy() - i.name = 'foo' + i.name = "foo" result = i.astype(object) assert result.equals(i) assert i.equals(result) @@ -200,7 +208,7 @@ def test_astype(self): # GH 12881 # a float astype int - for dtype in ['int16', 'int32', 'int64']: + for dtype in ["int16", "int32", "int64"]: i = Float64Index([0, 1, 2]) result = i.astype(dtype) expected = Int64Index([0, 1, 2]) @@ -211,7 +219,7 @@ def test_astype(self): expected = Int64Index([0, 1, 2]) tm.assert_index_equal(result, expected) - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: i = Float64Index([0, 1, 2]) result = i.astype(dtype) expected = i @@ -223,14 +231,16 @@ def test_astype(self): tm.assert_index_equal(result, expected) # invalid - for dtype in ['M8[ns]', 'm8[ns]']: - msg = ("Cannot convert Float64Index to dtype {}; integer values" - " are required for conversion").format(pandas_dtype(dtype)) + for dtype in ["M8[ns]", "m8[ns]"]: + msg = ( + "Cannot convert Float64Index to dtype {}; integer values" + " are required for conversion" + ).format(pandas_dtype(dtype)) with pytest.raises(TypeError, match=re.escape(msg)): i.astype(dtype) # GH 13149 - for dtype in ['int16', 'int32', 'int64']: + for dtype in ["int16", "int32", "int64"]: i = Float64Index([0, 1.1, np.NAN]) msg = "Cannot convert NA to integer" with pytest.raises(ValueError, match=msg): @@ -266,49 +276,53 @@ def test_equals_numeric(self): def test_get_indexer(self): idx = Float64Index([0.0, 1.0, 2.0]) - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) target = [-0.1, 0.5, 1.1] - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 
1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) def test_get_loc(self): idx = Float64Index([0.0, 1.0, 2.0]) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(1, method) == 1 if method is not None: assert idx.get_loc(1, method, tolerance=0) == 1 - for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: assert idx.get_loc(1.1, method) == loc assert idx.get_loc(1.1, method, tolerance=0.9) == loc with pytest.raises(KeyError, match="^'foo'$"): - idx.get_loc('foo') + idx.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.5$"): idx.get_loc(1.5) with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5, method='pad', tolerance=0.1) + idx.get_loc(1.5, method="pad", tolerance=0.1) with pytest.raises(KeyError, match="^True$"): idx.get_loc(True) with pytest.raises(KeyError, match="^False$"): idx.get_loc(False) - with pytest.raises(ValueError, match='must be numeric'): - idx.get_loc(1.4, method='nearest', tolerance='foo') + with pytest.raises(ValueError, match="must be numeric"): + idx.get_loc(1.4, method="nearest", tolerance="foo") - with pytest.raises(ValueError, match='must contain numeric elements'): - idx.get_loc(1.4, method='nearest', tolerance=np.array(['foo'])) + with pytest.raises(ValueError, match="must contain numeric elements"): + idx.get_loc(1.4, method="nearest", tolerance=np.array(["foo"])) with pytest.raises( - ValueError, - match='tolerance size must match target index size'): - idx.get_loc(1.4, method='nearest', tolerance=np.array([1, 2])) + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc(1.4, method="nearest", tolerance=np.array([1, 2])) def test_get_loc_na(self): idx = Float64Index([np.nan, 1, 2]) @@ -359,16 +373,14 @@ def test_doesnt_contain_all_the_things(self): def test_nan_multiple_containment(self): i = Float64Index([1.0, np.nan]) tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False])) - tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), - np.array([False, False])) + tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False])) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True])) - tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), - np.array([True, True])) + tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True])) i = Float64Index([1.0, 2.0]) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) def test_astype_from_object(self): - index = Index([1.0, np.nan, 0.2], dtype='object') + index = Index([1.0, np.nan, 0.2], dtype="object") result = index.astype(float) expected = Float64Index([1.0, np.nan, 0.2]) assert result.dtype == expected.dtype @@ -376,39 +388,40 @@ def test_astype_from_object(self): def test_fillna_float64(self): # GH 11343 - idx = Index([1.0, np.nan, 3.0], dtype=float, name='x') + idx = Index([1.0, np.nan, 3.0], dtype=float, name="x") # can't downcast - exp = Index([1.0, 0.1, 3.0], name='x') + exp = Index([1.0, 0.1, 3.0], name="x") tm.assert_index_equal(idx.fillna(0.1), exp) # downcast - exp = Float64Index([1.0, 2.0, 3.0], name='x') + exp = Float64Index([1.0, 2.0, 3.0], name="x") tm.assert_index_equal(idx.fillna(2), exp) # object - exp = Index([1.0, 'obj', 3.0], name='x') - 
tm.assert_index_equal(idx.fillna('obj'), exp) + exp = Index([1.0, "obj", 3.0], name="x") + tm.assert_index_equal(idx.fillna("obj"), exp) def test_take_fill_value(self): # GH 12631 - idx = pd.Float64Index([1., 2., 3.], name='xxx') + idx = pd.Float64Index([1.0, 2.0, 3.0], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.Float64Index([2., 1., 3.], name='xxx') + expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Float64Index([2., 1., np.nan], name='xxx') + expected = pd.Float64Index([2.0, 1.0, np.nan], name="xxx") tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Float64Index([2., 1., 3.], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -419,17 +432,16 @@ def test_take_fill_value(self): class NumericInt(Numeric): - def test_view(self): - i = self._holder([], name='Foo') + i = self._holder([], name="Foo") i_view = i.view() - assert i_view.name == 'Foo' + assert i_view.name == "Foo" i_view = i.view(self._dtype) - tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + tm.assert_index_equal(i, self._holder(i_view, name="Foo")) i_view = i.view(self._holder) - tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + tm.assert_index_equal(i, self._holder(i_view, name="Foo")) def test_is_monotonic(self): assert self.index.is_monotonic is True @@ -478,15 +490,16 @@ def test_identical(self): assert not i.identical(same_values_different_type) i = self.index.copy(dtype=object) - i = i.rename('foo') + i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) assert not i.identical(self.index) - assert Index(same_values, name='foo', dtype=object).identical(i) + assert Index(same_values, name="foo", dtype=object).identical(i) assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype=self._dtype)) + self.index.copy(dtype=self._dtype) + ) def test_join_non_unique(self): left = Index([4, 4, 3, 3]) @@ -502,13 +515,14 @@ def test_join_non_unique(self): exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) - @pytest.mark.parametrize('kind', ['outer', 'inner', 'left', 'right']) + @pytest.mark.parametrize("kind", ["outer", "inner", "left", "right"]) def test_join_self(self, kind): joined = self.index.join(self.index, how=kind) assert self.index is joined def test_union_noncomparable(self): from datetime import datetime, timedelta + # corner case, non-Int64Index now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) @@ -521,15 +535,17 @@ def test_union_noncomparable(self): tm.assert_index_equal(result, expected) def test_cant_or_shouldnt_cast(self): - msg = ("String dtype not supported, you may need to explicitly cast to" - " a numeric type") + msg = ( + "String dtype not supported, you may need to explicitly cast to" + " a numeric type" + ) # can't - data = ['foo', 
'bar', 'baz'] + data = ["foo", "bar", "baz"] with pytest.raises(TypeError, match=msg): self._holder(data) # shouldn't - data = ['0', '1', '2'] + data = ["0", "1", "2"] with pytest.raises(TypeError, match=msg): self._holder(data) @@ -537,33 +553,33 @@ def test_view_index(self): self.index.view(Index) def test_prevent_casting(self): - result = self.index.astype('O') + result = self.index.astype("O") assert result.dtype == np.object_ def test_take_preserve_name(self): - index = self._holder([1, 2, 3, 4], name='foo') + index = self._holder([1, 2, 3, 4], name="foo") taken = index.take([3, 0, 1]) assert index.name == taken.name def test_take_fill_value(self): # see gh-12631 - idx = self._holder([1, 2, 3], name='xxx') + idx = self._holder([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = self._holder([2, 1, 3], name='xxx') + expected = self._holder([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = ("Unable to fill values because " - "{name} cannot contain NA").format(name=name) + msg = ("Unable to fill values because " "{name} cannot contain NA").format( + name=name + ) # fill_value=True with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -1]), fill_value=True) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = self._holder([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = self._holder([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): @@ -575,21 +591,23 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_slice_keep_name(self): - idx = self._holder([1, 2], name='asdf') + idx = self._holder([1, 2], name="asdf") assert idx.name == idx[1:].name class TestInt64Index(NumericInt): - _dtype = 'int64' + _dtype = "int64" _holder = Int64Index def setup_method(self, method): - self.indices = dict(index=Int64Index(np.arange(0, 20, 2)), - index_dec=Int64Index(np.arange(19, -1, -1))) + self.indices = dict( + index=Int64Index(np.arange(0, 20, 2)), + index_dec=Int64Index(np.arange(19, -1, -1)), + ) self.setup_indices() def create_index(self): - return Int64Index(np.arange(5, dtype='int64')) + return Int64Index(np.arange(5, dtype="int64")) def test_constructor(self): # pass list, coerce fine @@ -602,8 +620,10 @@ def test_constructor(self): tm.assert_index_equal(index, expected) # scalar raise Exception - msg = (r"Int64Index\(\.\.\.\) must be called with a collection of some" - " kind, 5 was passed") + msg = ( + r"Int64Index\(\.\.\.\) must be called with a collection of some" + " kind, 5 was passed" + ) with pytest.raises(TypeError, match=msg): Int64Index(5) @@ -620,9 +640,11 @@ def test_constructor(self): # interpret list-like expected = Int64Index([5, 0]) for cls in [Index, Int64Index]: - for idx in [cls([5, 0], dtype='int64'), - cls(np.array([5, 0]), dtype='int64'), - cls(Series([5, 0]), dtype='int64')]: + for idx in [ + cls([5, 0], dtype="int64"), + cls(np.array([5, 0]), dtype="int64"), + cls(Series([5, 0]), dtype="int64"), + ]: tm.assert_index_equal(idx, expected) def test_constructor_corner(self): @@ -632,12 +654,12 @@ def test_constructor_corner(self): tm.assert_index_equal(index, Index(arr)) # preventing casting - arr = np.array([1, '2', 3, '4'], dtype=object) - with pytest.raises(TypeError, match='casting'): + arr = np.array([1, "2", 3, "4"], dtype=object) + with pytest.raises(TypeError, match="casting"): Int64Index(arr) 
arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] - with pytest.raises(TypeError, match='casting'): + with pytest.raises(TypeError, match="casting"): Int64Index(arr_with_floats) def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): @@ -651,7 +673,7 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): def test_constructor_unwraps_index(self): idx = pd.Index([1, 2]) result = pd.Int64Index(idx) - expected = np.array([1, 2], dtype='int64') + expected = np.array([1, 2], dtype="int64") tm.assert_numpy_array_equal(result._data, expected) def test_coerce_list(self): @@ -670,25 +692,25 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(indexer, expected) target = Int64Index(np.arange(10)) - indexer = self.index.get_indexer(target, method='pad') + indexer = self.index.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) target = Int64Index(np.arange(10)) - indexer = self.index.get_indexer(target, method='backfill') + indexer = self.index.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = self.index.intersection(other) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) def test_join_inner(self): @@ -696,8 +718,7 @@ def test_join_inner(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -715,8 +736,7 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) res2 = self.index.intersection(other_mono) tm.assert_index_equal(res, res2) @@ -733,11 +753,9 @@ def test_join_left(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index - eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], - dtype=np.intp) + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) @@ -745,10 +763,8 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='left', - return_indexers=True) - eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], - dtype=np.intp) + res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) assert lidx is None @@ -757,7 +773,7 @@ def 
test_join_left(self): # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -770,8 +786,7 @@ def test_join_right(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) @@ -781,8 +796,7 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) eres = other_mono elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) assert isinstance(other, Int64Index) @@ -793,7 +807,7 @@ def test_join_right(self): # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -804,28 +818,28 @@ def test_join_right(self): def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') + inner = self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_outer(self): @@ -834,16 +848,15 @@ def test_join_outer(self): # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], - dtype=np.intp) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) 
+ eridx = np.array( + [-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], dtype=np.intp + ) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) @@ -851,15 +864,14 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='outer', - return_indexers=True) - noidx_res = self.index.join(other_mono, how='outer') + res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) + noidx_res = self.index.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], - dtype=np.intp) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], dtype=np.intp + ) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -868,75 +880,72 @@ def test_join_outer(self): class TestUInt64Index(NumericInt): - _dtype = 'uint64' + _dtype = "uint64" _holder = UInt64Index def setup_method(self, method): - vals = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] - self.indices = dict(index=UInt64Index(vals), - index_dec=UInt64Index(reversed(vals))) + vals = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] + self.indices = dict( + index=UInt64Index(vals), index_dec=UInt64Index(reversed(vals)) + ) self.setup_indices() def create_index(self): - return UInt64Index(np.arange(5, dtype='uint64')) + return UInt64Index(np.arange(5, dtype="uint64")) def test_constructor(self): idx = UInt64Index([1, 2, 3]) res = Index([1, 2, 3], dtype=np.uint64) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2**63]) - res = Index([1, 2**63], dtype=np.uint64) + idx = UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63], dtype=np.uint64) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2**63]) - res = Index([1, 2**63]) + idx = UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63]) tm.assert_index_equal(res, idx) - idx = Index([-1, 2**63], dtype=object) - res = Index(np.array([-1, 2**63], dtype=object)) + idx = Index([-1, 2 ** 63], dtype=object) + res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) def test_get_indexer(self): - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, 2, 3, 4, - -1, -1, -1, -1], dtype=np.intp) + expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) - indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 2, 3, 4, - 4, 4, 4, 4], dtype=np.intp) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = self.index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) - indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 3, 4, - -1, -1, -1, -1], dtype=np.intp) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = self.index.get_indexer(target, method="backfill") + expected = 
np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_intersection(self): - other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20]) + other = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20]) result = self.index.intersection(other) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) def test_join_inner(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -944,7 +953,7 @@ def test_join_inner(self): lidx = lidx.take(ind) ridx = ridx.take(ind) - eres = UInt64Index(2**63 + np.array([10, 25], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([10, 25], dtype="uint64")) elidx = np.array([1, 4], dtype=np.intp) eridx = np.array([5, 2], dtype=np.intp) @@ -954,8 +963,7 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) res2 = self.index.intersection(other_mono) tm.assert_index_equal(res, res2) @@ -969,14 +977,13 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_left(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) @@ -986,8 +993,7 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp) assert isinstance(res, UInt64Index) @@ -996,13 +1002,12 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # non-unique - idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) - idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) - res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 
9], dtype="uint64")) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) # 1 is in idx2, so it should be x2 - eres = UInt64Index(2**63 + np.array( - [1, 1, 2, 5, 7, 9], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -1011,14 +1016,13 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) @@ -1028,8 +1032,7 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) eres = other_mono elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) @@ -1039,13 +1042,12 @@ def test_join_right(self): assert ridx is None # non-unique - idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) - idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) - res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) # 1 is in idx2, so it should be x2 - eres = UInt64Index(2**63 + np.array( - [1, 1, 2, 5, 7, 9], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -1054,49 +1056,51 @@ def test_join_right(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_non_int_index(self): - other = Index(2**63 + np.array( - [1, 5, 7, 10, 20], dtype='uint64'), dtype=object) - - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') - expected = Index(2**63 + np.array( - [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64')) + other = Index( + 2 ** 63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object + ) + + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") + expected = Index( + 2 ** 63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") + ) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') - expected = Index(2**63 + np.array([10, 20], dtype='uint64')) + inner = self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") + expected = Index(2 ** 63 + np.array([10, 20], dtype="uint64")) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = 
other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_outer(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) - eres = UInt64Index(2**63 + np.array( - [0, 1, 2, 7, 10, 12, 15, 20, 25], dtype='uint64')) + eres = UInt64Index( + 2 ** 63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64") + ) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) @@ -1106,9 +1110,8 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='outer', - return_indexers=True) - noidx_res = self.index.join(other_mono, how='outer') + res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) + noidx_res = self.index.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) @@ -1120,7 +1123,7 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) -@pytest.mark.parametrize("dtype", ['int64', 'uint64']) +@pytest.mark.parametrize("dtype", ["int64", "uint64"]) def test_int_float_union_dtype(dtype): # https://github.com/pandas-dev/pandas/issues/26778 # [u]int | float -> float diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 349d10f5079e86..f9ca1bca041653 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -2,19 +2,46 @@ import pytest from pandas import ( - DatetimeIndex, Float64Index, Index, Int64Index, TimedeltaIndex, - UInt64Index, _np_version_under1p17) + DatetimeIndex, + Float64Index, + Index, + Int64Index, + TimedeltaIndex, + UInt64Index, + _np_version_under1p17, +) from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.util import testing as tm @pytest.mark.parametrize( - 'func', [np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10, - np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin, - np.arccos, np.arctan, np.sinh, np.cosh, np.tanh, - np.arcsinh, np.arccosh, np.arctanh, np.deg2rad, - np.rad2deg], - ids=lambda x: x.__name__) + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda x: x.__name__, +) def test_numpy_ufuncs_basic(indices, func): # test ufuncs of numpy, see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -23,11 +50,11 @@ def 
test_numpy_ufuncs_basic(indices, func): if isinstance(idx, DatetimeIndexOpsMixin): # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): func(idx) elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): # coerces to float (e.g. np.sin) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(idx) exp = Index(func(idx.values), name=idx.name) @@ -39,13 +66,13 @@ def test_numpy_ufuncs_basic(indices, func): pass else: with pytest.raises(Exception): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): func(idx) @pytest.mark.parametrize( - 'func', [np.isfinite, np.isinf, np.isnan, np.signbit], - ids=lambda x: x.__name__) + "func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__ +) def test_numpy_ufuncs_other(indices, func): # test ufuncs of numpy, see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -90,6 +117,5 @@ def test_elementwise_comparison_warning(): # When NumPy dev actually enforces this change, we'll need to skip # this test. idx = Index([1, 2]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - idx == 'a' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + idx == "a" diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7cdf5db64b3a9f..213d9c65052291 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -20,11 +20,13 @@ class TestRangeIndex(Numeric): _holder = RangeIndex - _compat_props = ['shape', 'ndim', 'size'] + _compat_props = ["shape", "ndim", "size"] def setup_method(self, method): - self.indices = dict(index=RangeIndex(0, 20, 2, name='foo'), - index_dec=RangeIndex(18, -1, -2, name='bar')) + self.indices = dict( + index=RangeIndex(0, 20, 2, name="foo"), + index_dec=RangeIndex(18, -1, -2, name="bar"), + ) self.setup_indices() def create_index(self): @@ -39,19 +41,22 @@ def test_too_many_names(self): with pytest.raises(ValueError, match="^Length"): self.index.names = ["roger", "harold"] - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('args, kwargs, start, stop, step', [ - ((5,), dict(), 0, 5, 1), - ((1, 5), dict(), 1, 5, 1), - ((1, 5, 2), dict(), 1, 5, 2), - ((0,), dict(), 0, 0, 1), - ((0, 0), dict(), 0, 0, 1), - (tuple(), dict(start=0), 0, 0, 1), - (tuple(), dict(stop=0), 0, 0, 1)]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize( + "args, kwargs, start, stop, step", + [ + ((5,), dict(), 0, 5, 1), + ((1, 5), dict(), 1, 5, 1), + ((1, 5, 2), dict(), 1, 5, 2), + ((0,), dict(), 0, 0, 1), + ((0, 0), dict(), 0, 0, 1), + (tuple(), dict(start=0), 0, 0, 1), + (tuple(), dict(stop=0), 0, 0, 1), + ], + ) def test_constructor(self, args, kwargs, start, stop, step, name): result = RangeIndex(*args, name=name, **kwargs) - expected = Index(np.arange(start, stop, step, dtype=np.int64), - name=name) + expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) assert isinstance(result, RangeIndex) assert result.name is name assert result._range == range(start, stop, step) @@ -63,18 +68,28 @@ def test_constructor_invalid_args(self): RangeIndex() with pytest.raises(TypeError, match=msg): - RangeIndex(name='Foo') + RangeIndex(name="Foo") # invalid args - for i in [Index(['a', 'b']), Series(['a', 'b']), np.array(['a', 'b']), - [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10), - np.array([1]), [1]]: + for i in [ + Index(["a", 
"b"]), + Series(["a", "b"]), + np.array(["a", "b"]), + [], + "foo", + datetime(2000, 1, 1, 0, 0), + np.arange(0, 10), + np.array([1]), + [1], + ]: with pytest.raises(TypeError): RangeIndex(i) # we don't allow on a bare Index - msg = (r'Index\(\.\.\.\) must be called with a collection of some ' - r'kind, 0 was passed') + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, 0 was passed" + ) with pytest.raises(TypeError, match=msg): Index(0, 1000) @@ -92,7 +107,7 @@ def test_constructor_same(self): tm.assert_index_equal(result, index, exact=True) with pytest.raises(TypeError): - RangeIndex(index, dtype='float64') + RangeIndex(index, dtype="float64") def test_constructor_range(self): @@ -122,29 +137,29 @@ def test_constructor_range(self): tm.assert_index_equal(result, expected, exact=True) with pytest.raises(TypeError): - Index(range(1, 5, 2), dtype='float64') - msg = r'^from_range\(\) got an unexpected keyword argument' + Index(range(1, 5, 2), dtype="float64") + msg = r"^from_range\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): pd.RangeIndex.from_range(range(10), copy=True) def test_constructor_name(self): # GH12288 orig = RangeIndex(10) - orig.name = 'original' + orig.name = "original" copy = RangeIndex(orig) - copy.name = 'copy' + copy.name = "copy" - assert orig.name == 'original' - assert copy.name == 'copy' + assert orig.name == "original" + assert copy.name == "copy" new = Index(copy) - assert new.name == 'copy' + assert new.name == "copy" - new.name = 'new' - assert orig.name == 'original' - assert copy.name == 'copy' - assert new.name == 'new' + new.name = "new" + assert orig.name == "original" + assert copy.name == "copy" + assert new.name == "new" def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) @@ -154,26 +169,30 @@ def test_constructor_corner(self): # non-int raise Exception with pytest.raises(TypeError): - RangeIndex('1', '10', '1') + RangeIndex("1", "10", "1") with pytest.raises(TypeError): RangeIndex(1.1, 10.2, 1.3) # invalid passed type with pytest.raises(TypeError): - RangeIndex(1, 5, dtype='float64') - - @pytest.mark.parametrize('index, start, stop, step', [ - (RangeIndex(5), 0, 5, 1), - (RangeIndex(0, 5), 0, 5, 1), - (RangeIndex(5, step=2), 0, 5, 2), - (RangeIndex(1, 5, 2), 1, 5, 2)]) + RangeIndex(1, 5, dtype="float64") + + @pytest.mark.parametrize( + "index, start, stop, step", + [ + (RangeIndex(5), 0, 5, 1), + (RangeIndex(0, 5), 0, 5, 1), + (RangeIndex(5, step=2), 0, 5, 2), + (RangeIndex(1, 5, 2), 1, 5, 2), + ], + ) def test_start_stop_step_attrs(self, index, start, stop, step): # GH 25710 assert index.start == start assert index.stop == stop assert index.step == step - @pytest.mark.parametrize('attr_name', ['_start', '_stop', '_step']) + @pytest.mark.parametrize("attr_name", ["_start", "_stop", "_step"]) def test_deprecated_start_stop_step_attrs(self, attr_name): # GH 26581 idx = self.create_index() @@ -181,15 +200,15 @@ def test_deprecated_start_stop_step_attrs(self, attr_name): getattr(idx, attr_name) def test_copy(self): - i = RangeIndex(5, name='Foo') + i = RangeIndex(5, name="Foo") i_copy = i.copy() assert i_copy is not i assert i_copy.identical(i) assert i_copy._range == range(0, 5, 1) - assert i_copy.name == 'Foo' + assert i_copy.name == "Foo" def test_repr(self): - i = RangeIndex(5, name='Foo') + i = RangeIndex(5, name="Foo") result = repr(i) expected = "RangeIndex(start=0, stop=5, step=1, name='Foo')" assert result == expected @@ -207,7 +226,7 @@ def test_repr(self): def 
test_insert(self): - idx = RangeIndex(5, name='Foo') + idx = RangeIndex(5, name="Foo") result = idx[1:4] # test 0th element @@ -221,7 +240,7 @@ def test_insert(self): def test_delete(self): - idx = RangeIndex(5, name='Foo') + idx = RangeIndex(5, name="Foo") expected = idx[1:].astype(int) result = idx.delete(0) tm.assert_index_equal(result, expected) @@ -237,11 +256,11 @@ def test_delete(self): result = idx.delete(len(idx)) def test_view(self): - i = RangeIndex(0, name='Foo') + i = RangeIndex(0, name="Foo") i_view = i.view() - assert i_view.name == 'Foo' + assert i_view.name == "Foo" - i_view = i.view('i8') + i_view = i.view("i8") tm.assert_numpy_array_equal(i.values, i_view) i_view = i.view(RangeIndex) @@ -287,7 +306,7 @@ def test_cached_data(self): idx.any() assert idx._cached_data is None - df = pd.DataFrame({'a': range(10)}, index=idx) + df = pd.DataFrame({"a": range(10)}, index=idx) df.loc[50] assert idx._cached_data is None @@ -341,10 +360,12 @@ def test_is_monotonic(self): assert index._is_strictly_monotonic_decreasing is True def test_equals_range(self): - equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), - (RangeIndex(0), RangeIndex(1, -1, 3)), - (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), - (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2))] + equiv_pairs = [ + (RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), + (RangeIndex(0), RangeIndex(1, -1, 3)), + (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), + (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2)), + ] for left, right in equiv_pairs: assert left.equals(right) assert right.equals(left) @@ -366,15 +387,16 @@ def test_identical(self): assert not i.identical(same_values_different_type) i = self.index.copy(dtype=object) - i = i.rename('foo') + i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(self.index.copy(dtype=object)) assert not i.identical(self.index) - assert Index(same_values, name='foo', dtype=object).identical(i) + assert Index(same_values, name="foo", dtype=object).identical(i) assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype='int64')) + self.index.copy(dtype="int64") + ) def test_get_indexer(self): target = RangeIndex(10) @@ -384,13 +406,13 @@ def test_get_indexer(self): def test_get_indexer_pad(self): target = RangeIndex(10) - indexer = self.index.get_indexer(target, method='pad') + indexer = self.index.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = RangeIndex(10) - indexer = self.index.get_indexer(target, method='backfill') + indexer = self.index.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) @@ -398,17 +420,21 @@ def test_join_outer(self): # join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) - eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25]) - elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, - -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) - eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, - 5, 4, 3, 2, 1, 0], dtype=np.intp) + eres = Int64Index( 
+ [0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + ) + elidx = np.array( + [0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, -1, -1, -1, -1, -1, -1, -1], + dtype=np.intp, + ) + eridx = np.array( + [-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + dtype=np.intp, + ) assert isinstance(res, Int64Index) assert not isinstance(res, RangeIndex) @@ -419,9 +445,8 @@ def test_join_outer(self): # join with RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) assert isinstance(res, Int64Index) @@ -434,8 +459,7 @@ def test_join_inner(self): # Join with non-RangeIndex other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -455,8 +479,7 @@ def test_join_inner(self): # Join two RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -467,8 +490,7 @@ def test_join_left(self): # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp) @@ -480,8 +502,7 @@ def test_join_left(self): # Join withRangeIndex other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -492,11 +513,9 @@ def test_join_right(self): # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other - elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], - dtype=np.intp) + elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], dtype=np.intp) assert isinstance(other, Int64Index) tm.assert_index_equal(res, eres) @@ -506,8 +525,7 @@ def test_join_right(self): # Join withRangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other assert isinstance(other, RangeIndex) @@ -518,28 +536,28 @@ def test_join_right(self): def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') + inner = 
self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_non_unique(self): @@ -549,15 +567,14 @@ def test_join_non_unique(self): eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp) - eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], - dtype=np.intp) + eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) def test_join_self(self): - kinds = 'outer', 'inner', 'left', 'right' + kinds = "outer", "inner", "left", "right" for kind in kinds: joined = self.index.join(self.index, how=kind) assert self.index is joined @@ -567,27 +584,25 @@ def test_intersection(self, sort): # intersect with Int64Index other = Index(np.arange(1, 6)) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index, sort=sort) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 0, -1) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) # reversed (GH 17296) @@ -634,9 +649,10 @@ def test_intersection(self, sort): expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('sort', [False, None]) + @pytest.mark.parametrize("sort", [False, None]) def test_union_noncomparable(self, sort): from datetime import datetime, timedelta + # corner case, non-Int64Index now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) @@ -648,40 +664,72 @@ def test_union_noncomparable(self, sort): expected = Index(np.concatenate((other, self.index))) tm.assert_index_equal(result, expected) - @pytest.fixture(params=[ - (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, -10, -1), RI(0, -10, -1), 
RI(0, -10, -1), RI(0, -10, -1)), - (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), - I64(range(0, -20, -1))), - (RI(0, 10, 2), RI(1, 10, 2), RI(0, 10, 1), - I64(list(range(0, 10, 2)) + list(range(1, 10, 2)))), - (RI(0, 11, 2), RI(1, 12, 2), RI(0, 12, 1), - I64(list(range(0, 11, 2)) + list(range(1, 12, 2)))), - (RI(0, 21, 4), RI(-2, 24, 4), RI(-2, 24, 2), - I64(list(range(0, 21, 4)) + list(range(-2, 24, 4)))), - (RI(0, -20, -2), RI(-1, -21, -2), RI(-19, 1, 1), - I64(list(range(0, -20, -2)) + list(range(-1, -21, -2)))), - (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), - (RI(0, -100, -5), RI(5, -100, -20), RI(-95, 10, 5), - I64(list(range(0, -100, -5)) + [5])), - (RI(0, -11, -1), RI(1, -12, -4), RI(-11, 2, 1), - I64(list(range(0, -11, -1)) + [1, -11])), - (RI(0), RI(0), RI(0), RI(0)), - (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), - (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), - I64(range(0, 102, 2))), - (RI(0, -100, -2), RI(-100, 50, 102), RI(-100, 4, 2), - I64(list(range(0, -100, -2)) + [-100, 2])), - (RI(0, -100, -1), RI(0, -50, -3), RI(-99, 1, 1), - I64(list(range(0, -100, -1)))), - (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), - (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), - (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), - (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])) - ]) + @pytest.fixture( + params=[ + (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), + (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), + (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), + (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)), + (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), I64(range(0, -20, -1))), + ( + RI(0, 10, 2), + RI(1, 10, 2), + RI(0, 10, 1), + I64(list(range(0, 10, 2)) + list(range(1, 10, 2))), + ), + ( + RI(0, 11, 2), + RI(1, 12, 2), + RI(0, 12, 1), + I64(list(range(0, 11, 2)) + list(range(1, 12, 2))), + ), + ( + RI(0, 21, 4), + RI(-2, 24, 4), + RI(-2, 24, 2), + I64(list(range(0, 21, 4)) + list(range(-2, 24, 4))), + ), + ( + RI(0, -20, -2), + RI(-1, -21, -2), + RI(-19, 1, 1), + I64(list(range(0, -20, -2)) + list(range(-1, -21, -2))), + ), + (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), + ( + RI(0, -100, -5), + RI(5, -100, -20), + RI(-95, 10, 5), + I64(list(range(0, -100, -5)) + [5]), + ), + ( + RI(0, -11, -1), + RI(1, -12, -4), + RI(-11, 2, 1), + I64(list(range(0, -11, -1)) + [1, -11]), + ), + (RI(0), RI(0), RI(0), RI(0)), + (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), + (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), I64(range(0, 102, 2))), + ( + RI(0, -100, -2), + RI(-100, 50, 102), + RI(-100, 4, 2), + I64(list(range(0, -100, -2)) + [-100, 2]), + ), + ( + RI(0, -100, -1), + RI(0, -50, -3), + RI(-99, 1, 1), + I64(list(range(0, -100, -1))), + ), + (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), + (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), + (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), + (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), + (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])), + ] + ) def unions(self, request): """Inputs and expected outputs for RangeIndex.union tests""" @@ -715,29 +763,29 @@ def test_nbytes(self): def test_cant_or_shouldnt_cast(self): # can't with pytest.raises(TypeError): - RangeIndex('foo', 'bar', 'baz') + RangeIndex("foo", "bar", "baz") # shouldn't with 
pytest.raises(TypeError): - RangeIndex('0', '1', '2') + RangeIndex("0", "1", "2") def test_view_Index(self): self.index.view(Index) def test_prevent_casting(self): - result = self.index.astype('O') + result = self.index.astype("O") assert result.dtype == np.object_ def test_take_preserve_name(self): - index = RangeIndex(1, 5, name='foo') + index = RangeIndex(1, 5, name="foo") taken = index.take([3, 0, 1]) assert index.name == taken.name def test_take_fill_value(self): # GH 12631 - idx = pd.RangeIndex(1, 4, name='xxx') + idx = pd.RangeIndex(1, 4, name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.Int64Index([2, 1, 3], name='xxx') + expected = pd.Int64Index([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) # fill_value @@ -746,9 +794,8 @@ def test_take_fill_value(self): idx.take(np.array([1, 0, -1]), fill_value=True) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Int64Index([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Int64Index([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) msg = "Unable to fill values because RangeIndex cannot contain NA" @@ -761,16 +808,14 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], - "\u05d1": [4, 5, 6], - "c": [7, 8, 9]}) + df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) repr(df.columns) # should not raise UnicodeDecodeError def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(self.index)), self.index) def test_slice_keep_name(self): - idx = RangeIndex(1, 2, name='asdf') + idx = RangeIndex(1, 2, name="asdf") assert idx.name == idx[1:].name def test_explicit_conversions(self): @@ -780,7 +825,7 @@ def test_explicit_conversions(self): idx = RangeIndex(5) # float conversions - arr = np.arange(5, dtype='int64') * 3.2 + arr = np.arange(5, dtype="int64") * 3.2 expected = Float64Index(arr) fidx = idx * 3.2 tm.assert_index_equal(fidx, expected) @@ -789,12 +834,12 @@ def test_explicit_conversions(self): # interops with numpy arrays expected = Float64Index(arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = fidx - a tm.assert_index_equal(result, expected) expected = Float64Index(-arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = a - fidx tm.assert_index_equal(result, expected) @@ -874,38 +919,38 @@ def test_slice_specialised(self): # positive slice values index = self.index[7:10:2] - expected = Index(np.array([14, 18]), name='foo') + expected = Index(np.array([14, 18]), name="foo") tm.assert_index_equal(index, expected) # negative slice values index = self.index[-1:-5:-2] - expected = Index(np.array([18, 14]), name='foo') + expected = Index(np.array([18, 14]), name="foo") tm.assert_index_equal(index, expected) # stop overshoot index = self.index[2:100:4] - expected = Index(np.array([4, 12]), name='foo') + expected = Index(np.array([4, 12]), name="foo") tm.assert_index_equal(index, expected) # reverse index = self.index[::-1] - expected = Index(self.index.values[::-1], name='foo') + expected = Index(self.index.values[::-1], name="foo") tm.assert_index_equal(index, expected) index = self.index[-8::-1] - expected = Index(np.array([4, 2, 0]), name='foo') + expected = Index(np.array([4, 2, 0]), name="foo") tm.assert_index_equal(index, expected) index = self.index[-40::-1] - expected = Index(np.array([], 
dtype=np.int64), name='foo') + expected = Index(np.array([], dtype=np.int64), name="foo") tm.assert_index_equal(index, expected) index = self.index[40::-1] - expected = Index(self.index.values[40::-1], name='foo') + expected = Index(self.index.values[40::-1], name="foo") tm.assert_index_equal(index, expected) index = self.index[10::-1] - expected = Index(self.index.values[::-1], name='foo') + expected = Index(self.index.values[::-1], name="foo") tm.assert_index_equal(index, expected) def test_len_specialised(self): @@ -931,27 +976,29 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) assert len(i) == 0 - @pytest.fixture(params=[ - ([RI(1, 12, 5)], RI(1, 12, 5)), - ([RI(0, 6, 4)], RI(0, 6, 4)), - ([RI(1, 3), RI(3, 7)], RI(1, 7)), - ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), - ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), - ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), - ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), - ([RI(-4, -8), RI(3, -4)], RI(0, 0)), - ([RI(-4, -8), RI(3, 5)], RI(3, 5)), - ([RI(-4, -2), RI(3, 5)], I64([-4, -3, 3, 4])), - ([RI(-2,), RI(3, 5)], RI(3, 5)), - ([RI(2,), RI(2)], I64([0, 1, 0, 1])), - ([RI(2,), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), - ([RI(2,), RI(3, 5), RI(5, 8, 4)], I64([0, 1, 3, 4, 5])), - ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), - ([RI(3,), I64([-1, 3, 15])], I64([0, 1, 2, -1, 3, 15])), - ([RI(3,), F64([-1, 3.1, 15.])], F64([0, 1, 2, -1, 3.1, 15.])), - ([RI(3,), OI(['a', None, 14])], OI([0, 1, 2, 'a', None, 14])), - ([RI(3, 1), OI(['a', None, 14])], OI(['a', None, 14])) - ]) + @pytest.fixture( + params=[ + ([RI(1, 12, 5)], RI(1, 12, 5)), + ([RI(0, 6, 4)], RI(0, 6, 4)), + ([RI(1, 3), RI(3, 7)], RI(1, 7)), + ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), + ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), + ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), + ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), + ([RI(-4, -8), RI(3, -4)], RI(0, 0)), + ([RI(-4, -8), RI(3, 5)], RI(3, 5)), + ([RI(-4, -2), RI(3, 5)], I64([-4, -3, 3, 4])), + ([RI(-2), RI(3, 5)], RI(3, 5)), + ([RI(2), RI(2)], I64([0, 1, 0, 1])), + ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), + ([RI(2), RI(3, 5), RI(5, 8, 4)], I64([0, 1, 3, 4, 5])), + ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), + ([RI(3), I64([-1, 3, 15])], I64([0, 1, 2, -1, 3, 15])), + ([RI(3), F64([-1, 3.1, 15.0])], F64([0, 1, 2, -1, 3.1, 15.0])), + ([RI(3), OI(["a", None, 14])], OI([0, 1, 2, "a", None, 14])), + ([RI(3, 1), OI(["a", None, 14])], OI(["a", None, 14])), + ] + ) def appends(self, request): """Inputs and expected outputs for RangeIndex.append test""" @@ -977,15 +1024,16 @@ def test_engineless_lookup(self): idx = RangeIndex(2, 10, 3) assert idx.get_loc(5) == 1 - tm.assert_numpy_array_equal(idx.get_indexer([2, 8]), - ensure_platform_int(np.array([0, 2]))) + tm.assert_numpy_array_equal( + idx.get_indexer([2, 8]), ensure_platform_int(np.array([0, 2])) + ) with pytest.raises(KeyError): idx.get_loc(3) - assert '_engine' not in idx._cache + assert "_engine" not in idx._cache # The engine is still required for lookup of a different dtype scalar: with pytest.raises(KeyError): - assert idx.get_loc('a') == -1 + assert idx.get_loc("a") == -1 - assert '_engine' in idx._cache + assert "_engine" in idx._cache diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 8c0762c7e7e5a3..b3850f7a4e09e0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -1,7 +1,7 @@ -''' +""" The tests in this package are to ensure the proper resultant dtypes of set operations. 
-''' +""" from collections import OrderedDict import itertools as it @@ -16,16 +16,20 @@ from pandas.tests.indexes.conftest import indices_list import pandas.util.testing as tm -COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([ - ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), - ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), -]) +COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict( + [ + ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), + ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), + ] +) -@pytest.fixture(params=list(it.combinations(indices_list, 2)), - ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__) +@pytest.fixture( + params=list(it.combinations(indices_list, 2)), + ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__, +) def index_pair(request): """ Create all combinations of 2 index types. @@ -46,13 +50,13 @@ def test_union_different_types(index_pair): idx1, idx2 = index_pair type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: - pytest.xfail('This test only considers non compatible indexes.') + pytest.xfail("This test only considers non compatible indexes.") if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): - pytest.xfail('This test doesn\'t consider multiindixes.') + pytest.xfail("This test doesn't consider multiindixes.") if is_dtype_equal(idx1.dtype, idx2.dtype): - pytest.xfail('This test only considers non matching dtypes.') + pytest.xfail("This test only considers non matching dtypes.") # A union with a CategoricalIndex (even as dtype('O')) and a # non-CategoricalIndex can only be made if both indices are monotonic. 
@@ -63,12 +67,11 @@ def test_union_different_types(index_pair): idx1 = idx1.sort_values() idx2 = idx2.sort_values() - assert idx1.union(idx2).dtype == np.dtype('O') - assert idx2.union(idx1).dtype == np.dtype('O') + assert idx1.union(idx2).dtype == np.dtype("O") + assert idx2.union(idx1).dtype == np.dtype("O") -@pytest.mark.parametrize('idx_fact1,idx_fact2', - COMPATIBLE_INCONSISTENT_PAIRS.values()) +@pytest.mark.parametrize("idx_fact1,idx_fact2", COMPATIBLE_INCONSISTENT_PAIRS.values()) def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): # GH 23525 idx1 = idx_fact1(10) @@ -81,23 +84,26 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): assert res2.dtype in (idx1.dtype, idx2.dtype) -@pytest.mark.parametrize('left, right, expected', [ - ('int64', 'int64', 'int64'), - ('int64', 'uint64', 'object'), - ('int64', 'float64', 'float64'), - ('uint64', 'float64', 'float64'), - ('uint64', 'uint64', 'uint64'), - ('float64', 'float64', 'float64'), - ('datetime64[ns]', 'int64', 'object'), - ('datetime64[ns]', 'uint64', 'object'), - ('datetime64[ns]', 'float64', 'object'), - ('datetime64[ns, CET]', 'int64', 'object'), - ('datetime64[ns, CET]', 'uint64', 'object'), - ('datetime64[ns, CET]', 'float64', 'object'), - ('Period[D]', 'int64', 'object'), - ('Period[D]', 'uint64', 'object'), - ('Period[D]', 'float64', 'object'), -]) +@pytest.mark.parametrize( + "left, right, expected", + [ + ("int64", "int64", "int64"), + ("int64", "uint64", "object"), + ("int64", "float64", "float64"), + ("uint64", "float64", "float64"), + ("uint64", "uint64", "uint64"), + ("float64", "float64", "float64"), + ("datetime64[ns]", "int64", "object"), + ("datetime64[ns]", "uint64", "object"), + ("datetime64[ns]", "float64", "object"), + ("datetime64[ns, CET]", "int64", "object"), + ("datetime64[ns, CET]", "uint64", "object"), + ("datetime64[ns, CET]", "float64", "object"), + ("Period[D]", "int64", "object"), + ("Period[D]", "uint64", "object"), + ("Period[D]", "float64", "object"), + ], +) def test_union_dtypes(left, right, expected): left = pandas_dtype(left) right = pandas_dtype(right) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 5ede6a289d42fd..0f51a6333ab2d4 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -10,15 +10,21 @@ import pandas.util.testing as tm -@pytest.fixture(params=[pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)], - ids=str) +@pytest.fixture( + params=[ + pd.offsets.Hour(2), + timedelta(hours=2), + np.timedelta64(2, "h"), + Timedelta(hours=2), + ], + ids=str, +) def delta(request): # Several ways of representing two hours return request.param -@pytest.fixture(params=['B', 'D']) +@pytest.fixture(params=["B", "D"]) def freq(request): return request.param @@ -31,52 +37,63 @@ class TestTimedeltaIndexArithmetic: def test_tdi_shift_empty(self): # GH#9903 - idx = pd.TimedeltaIndex([], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) + idx = pd.TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) def test_tdi_shift_hours(self): # GH#9903 - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') - 
tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) def test_tdi_shift_minutes(self): # GH#9903 - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='T'), idx) - exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], - name='xxx') - tm.assert_index_equal(idx.shift(3, freq='T'), exp) - exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], - name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="T"), idx) + exp = pd.TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="T"), exp) + exp = pd.TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="T"), exp) def test_tdi_shift_int(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) result = trange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) tm.assert_index_equal(result, expected) def test_tdi_shift_nonstandard_freq(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - result = trange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) tm.assert_index_equal(result, expected) def test_shift_no_freq(self): # GH#19147 - tdi = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00'], freq=None) + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) with pytest.raises(NullFrequencyError): tdi.shift(2) @@ -85,32 +102,32 @@ def test_shift_no_freq(self): def test_tdi_add_int(self, one): # Variants of `one` for #19012 - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng + one - expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 10:00:00", freq="H", periods=10) tm.assert_index_equal(result, expected) def test_tdi_iadd_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + expected = 
timedelta_range("1 days 10:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 rng += one tm.assert_index_equal(rng, expected) def test_tdi_sub_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng - one - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) tm.assert_index_equal(result, expected) def test_tdi_isub_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 rng -= one @@ -119,12 +136,12 @@ def test_tdi_isub_int(self, one): # ------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_add_integer_array(self, box): # GH#19959 - rng = timedelta_range('1 days 09:00:00', freq='H', periods=3) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) other = box([4, 3, 2]) - expected = TimedeltaIndex(['1 day 13:00:00'] * 3) + expected = TimedeltaIndex(["1 day 13:00:00"] * 3) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng + other @@ -135,12 +152,12 @@ def test_tdi_add_integer_array(self, box): result = other + rng tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_sub_integer_array(self, box): # GH#19959 - rng = timedelta_range('9H', freq='H', periods=3) + rng = timedelta_range("9H", freq="H", periods=3) other = box([4, 3, 2]) - expected = TimedeltaIndex(['5H', '7H', '9H']) + expected = TimedeltaIndex(["5H", "7H", "9H"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng - other @@ -151,10 +168,10 @@ def test_tdi_sub_integer_array(self, box): result = other - rng tm.assert_index_equal(result, -expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_addsub_integer_array_no_freq(self, box): # GH#19959 - tdi = TimedeltaIndex(['1 Day', 'NaT', '3 Hours']) + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) other = box([14, -1, 16]) with pytest.raises(NullFrequencyError): tdi + other @@ -172,16 +189,15 @@ def test_tdi_addsub_integer_array_no_freq(self, box): def test_tdi_iadd_timedeltalike(self, delta): # only test adding/sub offsets as + is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") rng += delta tm.assert_index_equal(rng, expected) def test_tdi_isub_timedeltalike(self, delta): # only test adding/sub offsets as - is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + rng = timedelta_range("1 days", "10 days") + expected = 
timedelta_range("0 days 22:00:00", "9 days 22:00:00") rng -= delta tm.assert_index_equal(rng, expected) @@ -189,89 +205,90 @@ def test_tdi_isub_timedeltalike(self, delta): # TODO: after #24365 this probably belongs in scalar tests def test_ops_ndarray(self): - td = Timedelta('1 day') + td = Timedelta("1 day") # timedelta, timedelta - other = pd.to_timedelta(['1 day']).values - expected = pd.to_timedelta(['2 days']).values + other = pd.to_timedelta(["1 day"]).values + expected = pd.to_timedelta(["2 days"]).values tm.assert_numpy_array_equal(td + other, expected) tm.assert_numpy_array_equal(other + td, expected) msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td + np.array([1]) - msg = (r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" - " 'Timedelta'") + msg = ( + r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" " 'Timedelta'" + ) with pytest.raises(TypeError, match=msg): np.array([1]) + td - expected = pd.to_timedelta(['0 days']).values + expected = pd.to_timedelta(["0 days"]).values tm.assert_numpy_array_equal(td - other, expected) tm.assert_numpy_array_equal(-other + td, expected) msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td - np.array([1]) - msg = (r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" - " 'Timedelta'") + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" " 'Timedelta'" with pytest.raises(TypeError, match=msg): np.array([1]) - td - expected = pd.to_timedelta(['2 days']).values + expected = pd.to_timedelta(["2 days"]).values tm.assert_numpy_array_equal(td * np.array([2]), expected) tm.assert_numpy_array_equal(np.array([2]) * td, expected) - msg = ("ufunc '?multiply'? 
cannot use operands with types" - r" dtype\('= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -169,46 +161,57 @@ def test_take_fill_value(self): class TestTimedeltaIndex: - def test_insert(self): - idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") result = idx.insert(2, timedelta(days=5)) - exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") tm.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), - Timedelta('2day')], name='idx') + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) assert not isinstance(result, TimedeltaIndex) tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") # preserve freq - expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', - '1day 00:00:03'], - name='idx', freq='s') - expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:04'], - name='idx', freq='s') + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) # reset freq to None - expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', - '1day 00:00:02', '1day 00:00:03'], - name='idx', freq=None) - expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:05'], - name='idx', freq=None) - - cases = [(0, Timedelta('1day'), expected_0), - (-3, Timedelta('1day'), expected_0), - (3, Timedelta('1day 00:00:04'), expected_3), - (1, Timedelta('1day 00:00:01'), expected_1_nofreq), - (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] for n, d, expected in cases: result = idx.insert(n, d) @@ -217,29 +220,30 @@ def test_insert(self): assert result.freq == expected.freq # GH 18295 (test missing) - expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) for na in (np.nan, pd.NaT, None): - result = timedelta_range('1day', '3day').insert(1, na) + result = timedelta_range("1day", "3day").insert(1, na) tm.assert_index_equal(result, expected) def test_delete(self): - idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') + idx = timedelta_range(start="1 Days", periods=5, freq="D", 
name="idx") # prserve freq - expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', - name='idx') - expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', - name='idx') + expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") + expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") # reset freq to None expected_1 = TimedeltaIndex( - ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} + ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -251,22 +255,22 @@ def test_delete(self): idx.delete(5) def test_delete_slice(self): - idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') + idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") # prserve freq - expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', - name='idx') - expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', - name='idx') + expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") + expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") # reset freq to None - expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', - '7 d', '8 d', '9 d', '10d'], - freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} + expected_3_5 = TimedeltaIndex( + ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -279,60 +283,63 @@ def test_delete_slice(self): assert result.freq == expected.freq def test_get_loc(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=Timedelta(0)) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=np.timedelta64(0, 's')) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 - with pytest.raises(ValueError, match='unit abbreviation w/o a number'): - idx.get_loc(idx[1], method='nearest', tolerance='foo') + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc(idx[1], method="nearest", tolerance="foo") - with pytest.raises( - ValueError, - match='tolerance size must match'): - idx.get_loc(idx[1], method='nearest', - tolerance=[Timedelta(0).to_timedelta64(), - Timedelta(0).to_timedelta64()]) + with pytest.raises(ValueError, match="tolerance size must match"): + idx.get_loc( + idx[1], + method="nearest", + tolerance=[ + Timedelta(0).to_timedelta64(), + Timedelta(0).to_timedelta64(), + ], + ) - for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 
1)]: - assert idx.get_loc('1 day 1 hour', method) == loc + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: + assert idx.get_loc("1 day 1 hour", method) == loc # GH 16909 assert idx.get_loc(idx[1].to_timedelta64()) == 1 # GH 16896 - assert idx.get_loc('0 days') == 0 + assert idx.get_loc("0 days") == 0 def test_get_loc_nat(self): - tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) assert tidx.get_loc(pd.NaT) == 1 assert tidx.get_loc(None) == 1 - assert tidx.get_loc(float('nan')) == 1 + assert tidx.get_loc(float("nan")) == 1 assert tidx.get_loc(np.nan) == 1 def test_get_indexer(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - - res = idx.get_indexer(target, 'nearest', - tolerance=Timedelta('1 hour')) + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 9e96b7d99e35dc..d7d8b103478615 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -27,31 +26,37 @@ def test_ops_properties(self): def test_value_counts_unique(self): # GH 7735 - idx = timedelta_range('1 days 09:00:00', freq='H', periods=10) + idx = timedelta_range("1 days 09:00:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) - exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 09:00:00", freq="H", periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', - '1 days 09:00:00', '1 days 08:00:00', - '1 days 08:00:00', pd.NaT]) - - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) + idx = TimedeltaIndex( + [ + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 08:00:00", + "1 days 08:00:00", + pd.NaT, + ] + ) + + 
exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', - pd.NaT]) + exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: @@ -61,26 +66,31 @@ def test_value_counts_unique(self): def test_nonunique_contains(self): # GH 9512 - for idx in map(TimedeltaIndex, ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['00:01:00', '00:01:00', '00:02:00'], - ['00:01:00', '00:01:00', '00:00:01'])): + for idx in map( + TimedeltaIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["00:01:00", "00:01:00", "00:02:00"], + ["00:01:00", "00:01:00", "00:00:01"], + ), + ): assert idx[0] in idx def test_unknown_attribute(self): # see gh-9680 - tdi = pd.timedelta_range(start=0, periods=10, freq='1s') + tdi = pd.timedelta_range(start=0, periods=10, freq="1s") ts = pd.Series(np.random.normal(size=10), index=tdi) - assert 'foo' not in ts.__dict__.keys() + assert "foo" not in ts.__dict__.keys() msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): ts.foo def test_order(self): # GH 10295 - idx1 = TimedeltaIndex(['1 day', '2 day', '3 day'], freq='D', - name='idx') - idx2 = TimedeltaIndex( - ['1 hour', '2 hour', '3 hour'], freq='H', name='idx') + idx1 = TimedeltaIndex(["1 day", "2 day", "3 day"], freq="D", name="idx") + idx2 = TimedeltaIndex(["1 hour", "2 hour", "3 hour"], freq="H", name="idx") for idx in [idx1, idx2]: ordered = idx.sort_values() @@ -95,23 +105,24 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, idx[::-1]) assert ordered.freq == expected.freq assert ordered.freq.n == -1 - idx1 = TimedeltaIndex(['1 hour', '3 hour', '5 hour', - '2 hour ', '1 hour'], name='idx1') - exp1 = TimedeltaIndex(['1 hour', '1 hour', '2 hour', - '3 hour', '5 hour'], name='idx1') + idx1 = TimedeltaIndex( + ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" + ) + exp1 = TimedeltaIndex( + ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" + ) - idx2 = TimedeltaIndex(['1 day', '3 day', '5 day', - '2 day', '1 day'], name='idx2') + idx2 = TimedeltaIndex( + ["1 day", "3 day", "5 day", "2 day", "1 day"], name="idx2" + ) # TODO(wesm): unused? 
# exp2 = TimedeltaIndex(['1 day', '1 day', '2 day', @@ -138,8 +149,7 @@ def test_order(self): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) @@ -148,7 +158,7 @@ def test_order(self): def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -161,7 +171,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -169,10 +179,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -180,13 +190,13 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - @pytest.mark.parametrize('freq', ['D', '3D', '-3D', - 'H', '2H', '-2H', - 'T', '2T', 'S', '-3S']) + @pytest.mark.parametrize( + "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"] + ) def test_infer_freq(self, freq): # GH#11018 - idx = pd.timedelta_range('1', freq=freq, periods=10) - result = pd.TimedeltaIndex(idx.asi8, freq='infer') + idx = pd.timedelta_range("1", freq=freq, periods=10) + result = pd.TimedeltaIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) assert result.freq == freq @@ -194,16 +204,26 @@ def test_shift(self): pass # handled in test_arithmetic.py def test_repeat(self): - index = pd.timedelta_range('1 days', periods=2, freq='D') - exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days']) + index = pd.timedelta_range("1 days", periods=2, freq="D") + exp = pd.TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = TimedeltaIndex(['1 days', 'NaT', '3 days']) - exp = TimedeltaIndex(['1 days', '1 days', '1 days', - 'NaT', 'NaT', 'NaT', - '3 days', '3 days', '3 days']) + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None @@ -212,25 +232,23 @@ def test_nat(self): assert pd.TimedeltaIndex._na_value is pd.NaT assert pd.TimedeltaIndex([])._na_value is pd.NaT - idx = pd.TimedeltaIndex(['1 days', '2 days']) + idx = pd.TimedeltaIndex(["1 days", "2 days"]) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, 
- np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.TimedeltaIndex(['1 days', 'NaT']) + idx = pd.TimedeltaIndex(["1 days", "NaT"]) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) def test_equals(self): # GH 13107 - idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) + idx = pd.TimedeltaIndex(["1 days", "2 days", "NaT"]) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -239,7 +257,7 @@ def test_equals(self): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) + idx2 = pd.TimedeltaIndex(["2 days", "1 days", "NaT"]) assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -248,8 +266,8 @@ def test_equals(self): assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) - @pytest.mark.parametrize('values', [['0 days', '2 days', '4 days'], []]) - @pytest.mark.parametrize('freq', ['2D', Day(2), '48H', Hour(48)]) + @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) def test_freq_setter(self, values, freq): # GH 20678 idx = TimedeltaIndex(values) @@ -265,19 +283,21 @@ def test_freq_setter(self, values, freq): def test_freq_setter_errors(self): # GH 20678 - idx = TimedeltaIndex(['0 days', '2 days', '4 days']) + idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) # setting with an incompatible freq - msg = ('Inferred frequency 2D from passed values does not conform to ' - 'passed frequency 5D') + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) with pytest.raises(ValueError, match=msg): - idx.freq = '5D' + idx.freq = "5D" # setting with a non-fixed frequency - msg = r'<2 \* BusinessDays> is a non-fixed frequency' + msg = r"<2 \* BusinessDays> is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - idx.freq = '2B' + idx.freq = "2B" # setting with non-freq string - with pytest.raises(ValueError, match='Invalid frequency'): - idx.freq = 'foo' + with pytest.raises(ValueError, match="Invalid frequency"): + idx.freq = "foo" diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 0c1ecffec2bf7d..446b67d5f501d5 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -9,51 +9,51 @@ class TestSlicing: def test_slice_keeps_name(self): # GH4226 - dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket') + dr = pd.timedelta_range("1d", "5d", freq="H", name="timebucket") assert dr[1:].name == dr.name def test_partial_slice(self): - rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['5 day':'6 day'] + result = s["5 day":"6 day"] expected = s.iloc[86:134] assert_series_equal(result, expected) - result = s['5 day':] + result = s["5 day":] expected = s.iloc[86:] assert_series_equal(result, expected) - result = s[:'6 day'] + result = s[:"6 day"] expected = s.iloc[:134] assert_series_equal(result, expected) - 
result = s['6 days, 23:11:12'] + result = s["6 days, 23:11:12"] assert result == s.iloc[133] msg = r"^Timedelta\('50 days 00:00:00'\)$" with pytest.raises(KeyError, match=msg): - s['50 days'] + s["50 days"] def test_partial_slice_high_reso(self): # higher reso - rng = timedelta_range('1 day 10:11:12', freq='us', periods=2000) + rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) s = Series(np.arange(len(rng)), index=rng) - result = s['1 day 10:11:12':] + result = s["1 day 10:11:12":] expected = s.iloc[0:] assert_series_equal(result, expected) - result = s['1 day 10:11:12.001':] + result = s["1 day 10:11:12.001":] expected = s.iloc[1000:] assert_series_equal(result, expected) - result = s['1 days, 10:11:12.001001'] + result = s["1 days, 10:11:12.001001"] assert result == s.iloc[1001] def test_slice_with_negative_step(self): - ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -61,27 +61,30 @@ def assert_slices_equivalent(l_slc, i_slc): assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) - assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) + assert_slices_equivalent(SLC[Timedelta(hours=7) :: -1], SLC[7::-1]) + assert_slices_equivalent(SLC["7 hours"::-1], SLC[7::-1]) - assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1]) - assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1]) + assert_slices_equivalent(SLC[: Timedelta(hours=7) : -1], SLC[:6:-1]) + assert_slices_equivalent(SLC[:"7 hours":-1], SLC[:6:-1]) - assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):Timedelta(hours=7):- - 1], SLC[15:6:-1]) - assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1], - SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1], - SLC[15:6:-1]) + assert_slices_equivalent(SLC["15 hours":"7 hours":-1], SLC[15:6:-1]) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC["15 hours" : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : "7 hours" : -1], SLC[15:6:-1] + ) - assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0]) + assert_slices_equivalent(SLC["7 hours":"15 hours":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index fd3c0f7eabe0c9..38f1d2c7d4a1bf 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -14,10 +14,11 @@ class TestVectorizedTimedelta: def test_tdi_total_seconds(self): 
# GH#10939 # test index - rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, - freq='s') - expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, - 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9] + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + expt = [ + 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, + 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456.0 / 1e9, + ] tm.assert_almost_equal(rng.total_seconds(), Index(expt)) # test Series @@ -27,37 +28,44 @@ def test_tdi_total_seconds(self): # with nat ser[1] = np.nan - s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 + - 12 + 100123456. / 1e9, np.nan], index=[0, 1]) + s_expt = Series( + [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, np.nan], + index=[0, 1], + ) tm.assert_series_equal(ser.dt.total_seconds(), s_expt) # with both nat - ser = Series([np.nan, np.nan], dtype='timedelta64[ns]') - tm.assert_series_equal(ser.dt.total_seconds(), - Series([np.nan, np.nan], index=[0, 1])) + ser = Series([np.nan, np.nan], dtype="timedelta64[ns]") + tm.assert_series_equal( + ser.dt.total_seconds(), Series([np.nan, np.nan], index=[0, 1]) + ) def test_tdi_round(self): - td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') + td = pd.timedelta_range(start="16801 days", periods=5, freq="30Min") elt = td[1] - expected_rng = TimedeltaIndex([Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 01:00:00'), - Timedelta('16801 days 02:00:00'), - Timedelta('16801 days 02:00:00')]) + expected_rng = TimedeltaIndex( + [ + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 01:00:00"), + Timedelta("16801 days 02:00:00"), + Timedelta("16801 days 02:00:00"), + ] + ) expected_elt = expected_rng[1] - tm.assert_index_equal(td.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt + tm.assert_index_equal(td.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - td.round(freq='foo') + td.round(freq="foo") with pytest.raises(ValueError, match=msg): - elt.round(freq='foo') + elt.round(freq="foo") msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - td.round(freq='M') + td.round(freq="M") with pytest.raises(ValueError, match=msg): - elt.round(freq='M') + elt.round(freq="M") diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index a0dc74408a4aff..861067480b5fad 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -9,24 +9,23 @@ class TestTimedeltaIndex: - def test_union(self): - i1 = timedelta_range('1day', periods=5) - i2 = timedelta_range('3day', periods=5) + i1 = timedelta_range("1day", periods=5) + i2 = timedelta_range("3day", periods=5) result = i1.union(i2) - expected = timedelta_range('1day', periods=7) + expected = timedelta_range("1day", periods=7) tm.assert_index_equal(result, expected) i1 = Int64Index(np.arange(0, 20, 2)) - i2 = timedelta_range(start='1 day', periods=10, freq='D') + i2 = timedelta_range(start="1 day", periods=10, freq="D") i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" def test_union_coverage(self): - idx = TimedeltaIndex(['3d', '1d', '2d']) - ordered = TimedeltaIndex(idx.sort_values(), freq='infer') + idx = TimedeltaIndex(["3d", "1d", "2d"]) + ordered = 
TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -36,8 +35,8 @@ def test_union_coverage(self): def test_union_bug_1730(self): - rng_a = timedelta_range('1 day', periods=4, freq='3H') - rng_b = timedelta_range('1 day', periods=4, freq='4H') + rng_a = timedelta_range("1 day", periods=4, freq="3H") + rng_b = timedelta_range("1 day", periods=4, freq="4H") result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) @@ -45,10 +44,10 @@ def test_union_bug_1730(self): def test_union_bug_1745(self): - left = TimedeltaIndex(['1 day 15:19:49.695000']) - right = TimedeltaIndex(['2 day 13:04:21.322000', - '1 day 15:27:24.873000', - '1 day 15:31:05.350000']) + left = TimedeltaIndex(["1 day 15:19:49.695000"]) + right = TimedeltaIndex( + ["2 day 13:04:21.322000", "1 day 15:27:24.873000", "1 day 15:31:05.350000"] + ) result = left.union(right) exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) @@ -64,25 +63,25 @@ def test_union_bug_4564(self): tm.assert_index_equal(result, exp) def test_intersection_bug_1708(self): - index_1 = timedelta_range('1 day', periods=4, freq='h') + index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(5) result = index_1 & index_2 assert len(result) == 0 - index_1 = timedelta_range('1 day', periods=4, freq='h') + index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(1) result = index_1 & index_2 - expected = timedelta_range('1 day 01:00:00', periods=3, freq='h') + expected = timedelta_range("1 day 01:00:00", periods=3, freq="h") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_equal(self, sort): # GH 24471 Test intersection outcome given the sort keyword # for equal indicies intersection should return the original index - first = timedelta_range('1 day', periods=4, freq='h') - second = timedelta_range('1 day', periods=4, freq='h') + first = timedelta_range("1 day", periods=4, freq="h") + second = timedelta_range("1 day", periods=4, freq="h") intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) @@ -96,17 +95,17 @@ def test_intersection_equal(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_intersection_zero_length(self, period_1, period_2, sort): # GH 24471 test for non overlap the intersection should be zero length - index_1 = timedelta_range('1 day', periods=period_1, freq='h') - index_2 = timedelta_range('1 day', periods=period_2, freq='h') - expected = timedelta_range('1 day', periods=0, freq='h') + index_1 = timedelta_range("1 day", periods=period_1, freq="h") + index_2 = timedelta_range("1 day", periods=period_2, freq="h") + expected = timedelta_range("1 day", periods=0, freq="h") result = index_1.intersection(index_2, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('sort', [None, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_zero_length_input_index(self, sort): # GH 24966 test for 0-len intersections are copied - index_1 = timedelta_range('1 day', periods=0, freq='h') - index_2 = timedelta_range('1 day', periods=3, freq='h') + index_1 = timedelta_range("1 day", periods=0, freq="h") + index_2 = timedelta_range("1 day", periods=3, freq="h") result = index_1.intersection(index_2, sort=sort) assert index_1 is not result assert index_2 is not result @@ -116,18 +115,26 @@ def 
test_zero_length_input_index(self, sort): "rng, expected", # if target has the same name, it is preserved [ - (timedelta_range('1 day', periods=5, freq='h', name='idx'), - timedelta_range('1 day', periods=4, freq='h', name='idx')), + ( + timedelta_range("1 day", periods=5, freq="h", name="idx"), + timedelta_range("1 day", periods=4, freq="h", name="idx"), + ), # if target name is different, it will be reset - (timedelta_range('1 day', periods=5, freq='h', name='other'), - timedelta_range('1 day', periods=4, freq='h', name=None)), + ( + timedelta_range("1 day", periods=5, freq="h", name="other"), + timedelta_range("1 day", periods=4, freq="h", name=None), + ), # if no overlap exists return empty index - (timedelta_range('1 day', periods=10, freq='h', name='idx')[5:], - TimedeltaIndex([], name='idx'))]) + ( + timedelta_range("1 day", periods=10, freq="h", name="idx")[5:], + TimedeltaIndex([], name="idx"), + ), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, rng, expected, sort): # GH 4690 (with tz) - base = timedelta_range('1 day', periods=4, freq='h', name='idx') + base = timedelta_range("1 day", periods=4, freq="h", name="idx") result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() @@ -139,23 +146,28 @@ def test_intersection(self, rng, expected, sort): "rng, expected", # part intersection works [ - (TimedeltaIndex(['5 hour', '2 hour', '4 hour', '9 hour'], - name='idx'), - TimedeltaIndex(['2 hour', '4 hour'], name='idx')), + ( + TimedeltaIndex(["5 hour", "2 hour", "4 hour", "9 hour"], name="idx"), + TimedeltaIndex(["2 hour", "4 hour"], name="idx"), + ), # reordered part intersection - (TimedeltaIndex(['2 hour', '5 hour', '5 hour', '1 hour'], - name='other'), - TimedeltaIndex(['1 hour', '2 hour'], name=None)), + ( + TimedeltaIndex(["2 hour", "5 hour", "5 hour", "1 hour"], name="other"), + TimedeltaIndex(["1 hour", "2 hour"], name=None), + ), # reveresed index - (TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx')[::-1], - TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx'))]) + ( + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx")[ + ::-1 + ], + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx"), + ), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_non_monotonic(self, rng, expected, sort): # 24471 non-monotonic - base = TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx') + base = TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx") result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 79d064c57fa400..018ccfb2439dc1 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -6,11 +6,21 @@ import pandas as pd from pandas import ( - DataFrame, Index, Int64Index, Series, Timedelta, TimedeltaIndex, - date_range, timedelta_range) + DataFrame, + Index, + Int64Index, + Series, + Timedelta, + TimedeltaIndex, + date_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_index_equal, assert_series_equal) + assert_almost_equal, + assert_index_equal, + assert_series_equal, +) from ..datetimelike import DatetimeLike @@ -25,7 +35,7 @@ def setup_method(self, method): self.setup_indices() def 
create_index(self): - return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) def test_numeric_compat(self): # Dummy method to override super's version; this test is now done @@ -40,17 +50,18 @@ def test_pickle_compat_construction(self): def test_fillna_timedelta(self): # GH 11343 - idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day']) + idx = pd.TimedeltaIndex(["1 day", pd.NaT, "3 day"]) - exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day']) - tm.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp) + exp = pd.TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(pd.Timedelta("2 day")), exp) - exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day']) - idx.fillna(pd.Timedelta('3 hour')) + exp = pd.TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(pd.Timedelta("3 hour")) exp = pd.Index( - [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) + [pd.Timedelta("1 day"), "x", pd.Timedelta("3 day")], dtype=object + ) + tm.assert_index_equal(idx.fillna("x"), exp) @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): @@ -62,19 +73,20 @@ def test_difference_freq(self, sort): expected = TimedeltaIndex(["0 days", "5 days"], freq=None) idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) expected = TimedeltaIndex(["0 days", "1 days"], freq=None) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) @pytest.mark.parametrize("sort", [None, False]) def test_difference_sort(self, sort): - index = pd.TimedeltaIndex(["5 days", "3 days", "2 days", "4 days", - "1 days", "0 days"]) + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) other = timedelta_range("1 days", "4 days", freq="D") idx_diff = index.difference(other, sort) @@ -85,7 +97,7 @@ def test_difference_sort(self, sort): expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) @@ -95,7 +107,7 @@ def test_difference_sort(self, sort): expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) def test_isin(self): @@ -106,15 +118,15 @@ def test_isin(self): result = index.isin(list(index)) assert result.all() - assert_almost_equal(index.isin([index[2], 5]), - np.array([False, False, True, False])) + assert_almost_equal( + index.isin([index[2], 5]), np.array([False, False, True, False]) + ) def test_factorize(self): - idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', - '3 day']) + idx1 = TimedeltaIndex(["1 day", "1 day", "2 day", "2 day", "3 day", "3 day"]) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) + exp_idx = TimedeltaIndex(["1 day", "2 day", "3 day"]) arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) @@ -125,32 +137,36 @@ def test_factorize(self): 
tm.assert_index_equal(idx, exp_idx) # freq must be preserved - idx3 = timedelta_range('1 day', periods=4, freq='s') + idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) def test_join_self(self, join_type): - index = timedelta_range('1 day', periods=10) + index = timedelta_range("1 day", periods=10) joined = index.join(index, how=join_type) tm.assert_index_equal(index, joined) def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe(10, 10, - data_gen_f=lambda *args, **kwargs: randn(), - r_idx_type='i', c_idx_type='td') + df = tm.makeCustomDataframe( + 10, + 10, + data_gen_f=lambda *args, **kwargs: randn(), + r_idx_type="i", + c_idx_type="td", + ) str(df) - cols = df.columns.join(df.index, how='outer') + cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) - assert cols.dtype == np.dtype('O') + assert cols.dtype == np.dtype("O") assert cols.dtype == joined.dtype tm.assert_index_equal(cols, joined) def test_sort_values(self): - idx = TimedeltaIndex(['4d', '1d', '2d']) + idx = TimedeltaIndex(["4d", "1d", "2d"]) ordered = idx.sort_values() assert ordered.is_monotonic @@ -161,48 +177,44 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) assert ordered.is_monotonic - tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), check_dtype=False) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) assert ordered[::-1].is_monotonic - tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), - check_dtype=False) + tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) def test_get_duplicates(self): - idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day', - '4day']) + idx = TimedeltaIndex(["1 day", "2 day", "2 day", "3 day", "3day", "4day"]) with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 result = idx.get_duplicates() - ex = TimedeltaIndex(['2 day', '3day']) + ex = TimedeltaIndex(["2 day", "3day"]) tm.assert_index_equal(result, ex) def test_argmin_argmax(self): - idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01', - '1 day 00:00:02']) + idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) assert idx.argmin() == 1 assert idx.argmax() == 0 def test_misc_coverage(self): - rng = timedelta_range('1 day', periods=5) + rng = timedelta_range("1 day", periods=5) result = rng.groupby(rng.days) assert isinstance(list(result.values())[0][0], Timedelta) - idx = TimedeltaIndex(['3d', '1d', '2d']) + idx = TimedeltaIndex(["3d", "1d", "2d"]) assert not idx.equals(list(idx)) - non_td = Index(list('abc')) + non_td = Index(list("abc")) assert not idx.equals(list(non_td)) def test_map(self): # test_map_dictlike generally tests - rng = timedelta_range('1 day', periods=10) + rng = timedelta_range("1 day", periods=10) f = lambda x: x.days result = rng.map(f) @@ -211,7 +223,7 @@ def test_map(self): def test_pass_TimedeltaIndex_to_index(self): - rng = timedelta_range('1 days', '10 days') + rng = timedelta_range("1 days", "10 days") idx = Index(rng, dtype=object) expected = Index(rng.to_pytimedelta(), dtype=object) @@ -220,56 +232,55 @@ def test_pass_TimedeltaIndex_to_index(self): def test_pickle(self): - rng = timedelta_range('1 days', periods=10) + rng = timedelta_range("1 days", periods=10) rng_p = 
tm.round_trip_pickle(rng) tm.assert_index_equal(rng, rng_p) def test_hash_error(self): - index = timedelta_range('1 days', periods=10) - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + index = timedelta_range("1 days", periods=10) + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(index) def test_append_join_nondatetimeindex(self): - rng = timedelta_range('1 days', periods=10) - idx = Index(['a', 'b', 'c', 'd']) + rng = timedelta_range("1 days", periods=10) + idx = Index(["a", "b", "c", "d"]) result = rng.append(idx) assert isinstance(result[0], Timedelta) # it works - rng.join(idx, how='outer') + rng.join(idx, how="outer") def test_append_numpy_bug_1681(self): - td = timedelta_range('1 days', '10 days', freq='2D') + td = timedelta_range("1 days", "10 days", freq="2D") a = DataFrame() - c = DataFrame({'A': 'foo', 'B': td}, index=td) + c = DataFrame({"A": "foo", "B": td}, index=td) str(c) result = a.append(c) - assert (result['B'] == td).all() + assert (result["B"] == td).all() def test_fields(self): - rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, - freq='s') - tm.assert_index_equal(rng.days, Index([1, 1], dtype='int64')) + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) tm.assert_index_equal( rng.seconds, - Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], - dtype='int64')) + Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], dtype="int64"), + ) tm.assert_index_equal( - rng.microseconds, - Index([100 * 1000 + 123, 100 * 1000 + 123], dtype='int64')) - tm.assert_index_equal(rng.nanoseconds, - Index([456, 456], dtype='int64')) + rng.microseconds, Index([100 * 1000 + 123, 100 * 1000 + 123], dtype="int64") + ) + tm.assert_index_equal(rng.nanoseconds, Index([456, 456], dtype="int64")) msg = "'TimedeltaIndex' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # with nat @@ -277,71 +288,69 @@ def test_fields(self): s[1] = np.nan tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) - tm.assert_series_equal(s.dt.seconds, Series( - [10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1])) + tm.assert_series_equal( + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + ) # preserve name (GH15589) - rng.name = 'name' - assert rng.days.name == 'name' + rng.name = "name" + assert rng.days.name == "name" def test_freq_conversion(self): # doc example # series - td = Series(date_range('20130101', periods=4)) - \ - Series(date_range('20121201', periods=4)) + td = Series(date_range("20130101", periods=4)) - Series( + date_range("20121201", periods=4) + ) td[2] += timedelta(minutes=5, seconds=3) td[3] = np.nan - result = td / np.timedelta64(1, 'D') - expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan - ]) + result = td / np.timedelta64(1, "D") + expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) assert_series_equal(result, expected) - result = td.astype('timedelta64[D]') + result = td.astype("timedelta64[D]") 
expected = Series([31, 31, 31, np.nan]) assert_series_equal(result, expected) - result = td / np.timedelta64(1, 's') - expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, - np.nan]) + result = td / np.timedelta64(1, "s") + expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) assert_series_equal(result, expected) - result = td.astype('timedelta64[s]') + result = td.astype("timedelta64[s]") assert_series_equal(result, expected) # tdi td = TimedeltaIndex(td) - result = td / np.timedelta64(1, 'D') + result = td / np.timedelta64(1, "D") expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) assert_index_equal(result, expected) - result = td.astype('timedelta64[D]') + result = td.astype("timedelta64[D]") expected = Index([31, 31, 31, np.nan]) assert_index_equal(result, expected) - result = td / np.timedelta64(1, 's') - expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, - np.nan]) + result = td / np.timedelta64(1, "s") + expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) assert_index_equal(result, expected) - result = td.astype('timedelta64[s]') + result = td.astype("timedelta64[s]") assert_index_equal(result, expected) - @pytest.mark.parametrize('unit', ['Y', 'y', 'M']) + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_deprecated(self, unit): with tm.assert_produces_warning(FutureWarning) as w: TimedeltaIndex([1, 3, 7], unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w[0].message)) class TestTimeSeries: - def test_series_box_timedelta(self): - rng = timedelta_range('1 day 1 s', periods=5, freq='h') + rng = timedelta_range("1 day 1 s", periods=5, freq="h") s = Series(rng) assert isinstance(s[1], Timedelta) assert isinstance(s.iat[2], Timedelta) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 971cbe65b5da14..1c1d0f1a735cf1 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -9,28 +9,26 @@ class TestTimedeltas: - def test_timedelta_range(self): - expected = to_timedelta(np.arange(5), unit='D') - result = timedelta_range('0 days', periods=5, freq='D') + expected = to_timedelta(np.arange(5), unit="D") + result = timedelta_range("0 days", periods=5, freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(11), unit='D') - result = timedelta_range('0 days', '10 days', freq='D') + expected = to_timedelta(np.arange(11), unit="D") + result = timedelta_range("0 days", "10 days", freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(5), unit='D') + Second(2) + Day() - result = timedelta_range('1 days, 00:00:02', '5 days, 00:00:02', - freq='D') + expected = to_timedelta(np.arange(5), unit="D") + Second(2) + Day() + result = timedelta_range("1 days, 00:00:02", "5 days, 00:00:02", freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta([1, 3, 5, 7, 9], unit='D') + Second(2) - result = timedelta_range('1 days, 00:00:02', periods=5, freq='2D') + expected = to_timedelta([1, 3, 5, 7, 9], unit="D") + Second(2) + result = timedelta_range("1 days, 00:00:02", periods=5, freq="2D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(50), unit='T') * 30 - result = timedelta_range('0 days', freq='30T', periods=50) + expected = to_timedelta(np.arange(50), unit="T") * 30 + result = 
timedelta_range("0 days", freq="30T", periods=50) tm.assert_index_equal(result, expected) # GH 11776 @@ -39,34 +37,37 @@ def test_timedelta_range(self): for arg in (arr, df): with pytest.raises(TypeError, match="1-d array"): to_timedelta(arg) - for errors in ['ignore', 'raise', 'coerce']: + for errors in ["ignore", "raise", "coerce"]: with pytest.raises(TypeError, match="1-d array"): to_timedelta(arg, errors=errors) # issue10583 df = pd.DataFrame(np.random.normal(size=(10, 4))) - df.index = pd.timedelta_range(start='0s', periods=10, freq='s') - expected = df.loc[pd.Timedelta('0s'):, :] - result = df.loc['0s':, :] + df.index = pd.timedelta_range(start="0s", periods=10, freq="s") + expected = df.loc[pd.Timedelta("0s") :, :] + result = df.loc["0s":, :] tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('periods, freq', [ - (3, '2D'), (5, 'D'), (6, '19H12T'), (7, '16H'), (9, '12H')]) + @pytest.mark.parametrize( + "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12T"), (7, "16H"), (9, "12H")] + ) def test_linspace_behavior(self, periods, freq): # GH 20976 - result = timedelta_range(start='0 days', end='4 days', periods=periods) - expected = timedelta_range(start='0 days', end='4 days', freq=freq) + result = timedelta_range(start="0 days", end="4 days", periods=periods) + expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the four parameters: start, end, periods, and freq, ' - 'exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - timedelta_range(start='0 days') + timedelta_range(start="0 days") with pytest.raises(ValueError, match=msg): - timedelta_range(end='5 days') + timedelta_range(end="5 days") with pytest.raises(ValueError, match=msg): timedelta_range(periods=2) @@ -76,4 +77,4 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - timedelta_range(start='0 days', end='5 days', periods=10, freq='H') + timedelta_range(start="0 days", end="5 days", periods=10, freq="H") diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 81e51fed788e4f..4aed0b1af81a6d 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -12,49 +12,52 @@ class TestTimedeltas: - def test_to_timedelta(self): def conv(v): - return v.astype('m8[ns]') + return v.astype("m8[ns]") - d1 = np.timedelta64(1, 'D') + d1 = np.timedelta64(1, "D") with tm.assert_produces_warning(FutureWarning): - assert (to_timedelta('1 days 06:05:01.00003', box=False) == - conv(d1 + np.timedelta64(6 * 3600 + 5 * 60 + 1, 's') + - np.timedelta64(30, 'us'))) + assert to_timedelta("1 days 06:05:01.00003", box=False) == conv( + d1 + + np.timedelta64(6 * 3600 + 5 * 60 + 1, "s") + + np.timedelta64(30, "us") + ) with tm.assert_produces_warning(FutureWarning): - assert (to_timedelta('15.5us', box=False) == - conv(np.timedelta64(15500, 'ns'))) + assert to_timedelta("15.5us", box=False) == conv( + np.timedelta64(15500, "ns") + ) # empty string - result = to_timedelta('', box=False) - assert result.astype('int64') == iNaT + result = to_timedelta("", box=False) + assert result.astype("int64") == iNaT - result = to_timedelta(['', '']) + result = to_timedelta(["", ""]) assert isna(result).all() # pass thru - result = to_timedelta(np.array([np.timedelta64(1, 
's')])) - expected = pd.Index(np.array([np.timedelta64(1, 's')])) + result = to_timedelta(np.array([np.timedelta64(1, "s")])) + expected = pd.Index(np.array([np.timedelta64(1, "s")])) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # ints - result = np.timedelta64(0, 'ns') + result = np.timedelta64(0, "ns") expected = to_timedelta(0, box=False) assert result == expected # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(['1d', '1days 00:00:01'])) + result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) # with units - result = TimedeltaIndex([np.timedelta64(0, 'ns'), np.timedelta64( - 10, 's').astype('m8[ns]')]) - expected = to_timedelta([0, 10], unit='s') + result = TimedeltaIndex( + [np.timedelta64(0, "ns"), np.timedelta64(10, "s").astype("m8[ns]")] + ) + expected = to_timedelta([0, 10], unit="s") tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): @@ -71,51 +74,51 @@ def conv(v): assert result == expected # arrays of various dtypes - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='s') - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="s") + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='m') - expected = TimedeltaIndex([np.timedelta64(1, 'm')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="m") + expected = TimedeltaIndex([np.timedelta64(1, "m")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='h') - expected = TimedeltaIndex([np.timedelta64(1, 'h')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="h") + expected = TimedeltaIndex([np.timedelta64(1, "h")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='timedelta64[s]') + arr = np.array([1] * 5, dtype="timedelta64[s]") result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='timedelta64[D]') + arr = np.array([1] * 5, dtype="timedelta64[D]") result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5) + expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # Test with lists as input when box=false - expected = np.array(np.arange(3) * 1000000000, - dtype='timedelta64[ns]') - result = to_timedelta(range(3), unit='s', box=False) + expected = np.array(np.arange(3) * 1000000000, dtype="timedelta64[ns]") + result = to_timedelta(range(3), unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): - result = to_timedelta(np.arange(3), unit='s', box=False) + result = to_timedelta(np.arange(3), unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): - result = to_timedelta([0, 1, 2], unit='s', box=False) + result = to_timedelta([0, 1, 2], unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): # Tests with fractional seconds as input: expected = 
np.array( - [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') - result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False) + [0, 500000000, 800000000, 1200000000], dtype="timedelta64[ns]" + ) + result = to_timedelta([0.0, 0.5, 0.8, 1.2], unit="s", box=False) tm.assert_numpy_array_equal(expected, result) def test_to_timedelta_invalid(self): @@ -123,85 +126,92 @@ def test_to_timedelta_invalid(self): # bad value for errors parameter msg = "errors must be one of" with pytest.raises(ValueError, match=msg): - to_timedelta(['foo'], errors='never') + to_timedelta(["foo"], errors="never") # these will error msg = "invalid unit abbreviation: foo" with pytest.raises(ValueError, match=msg): - to_timedelta([1, 2], unit='foo') + to_timedelta([1, 2], unit="foo") with pytest.raises(ValueError, match=msg): - to_timedelta(1, unit='foo') + to_timedelta(1, unit="foo") # time not supported ATM - msg = ("Value must be Timedelta, string, integer, float, timedelta or" - " convertible") + msg = ( + "Value must be Timedelta, string, integer, float, timedelta or" + " convertible" + ) with pytest.raises(ValueError, match=msg): to_timedelta(time(second=1)) - assert to_timedelta(time(second=1), errors='coerce') is pd.NaT + assert to_timedelta(time(second=1), errors="coerce") is pd.NaT msg = "unit abbreviation w/o a number" with pytest.raises(ValueError, match=msg): - to_timedelta(['foo', 'bar']) - tm.assert_index_equal(TimedeltaIndex([pd.NaT, pd.NaT]), - to_timedelta(['foo', 'bar'], errors='coerce')) + to_timedelta(["foo", "bar"]) + tm.assert_index_equal( + TimedeltaIndex([pd.NaT, pd.NaT]), + to_timedelta(["foo", "bar"], errors="coerce"), + ) - tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']), - to_timedelta(['1 day', 'bar', '1 min'], - errors='coerce')) + tm.assert_index_equal( + TimedeltaIndex(["1 day", pd.NaT, "1 min"]), + to_timedelta(["1 day", "bar", "1 min"], errors="coerce"), + ) # gh-13613: these should not error because errors='ignore' - invalid_data = 'apple' - assert invalid_data == to_timedelta(invalid_data, errors='ignore') + invalid_data = "apple" + assert invalid_data == to_timedelta(invalid_data, errors="ignore") - invalid_data = ['apple', '1 days'] + invalid_data = ["apple", "1 days"] tm.assert_numpy_array_equal( np.array(invalid_data, dtype=object), - to_timedelta(invalid_data, errors='ignore')) + to_timedelta(invalid_data, errors="ignore"), + ) - invalid_data = pd.Index(['apple', '1 days']) - tm.assert_index_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) + invalid_data = pd.Index(["apple", "1 days"]) + tm.assert_index_equal(invalid_data, to_timedelta(invalid_data, errors="ignore")) - invalid_data = Series(['apple', '1 days']) - tm.assert_series_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) + invalid_data = Series(["apple", "1 days"]) + tm.assert_series_equal( + invalid_data, to_timedelta(invalid_data, errors="ignore") + ) def test_to_timedelta_via_apply(self): # GH 5458 - expected = Series([np.timedelta64(1, 's')]) - result = Series(['00:00:01']).apply(to_timedelta) + expected = Series([np.timedelta64(1, "s")]) + result = Series(["00:00:01"]).apply(to_timedelta) tm.assert_series_equal(result, expected) - result = Series([to_timedelta('00:00:01')]) + result = Series([to_timedelta("00:00:01")]) tm.assert_series_equal(result, expected) def test_to_timedelta_on_missing_values(self): # GH5438 - timedelta_NaT = np.timedelta64('NaT') + timedelta_NaT = np.timedelta64("NaT") - actual = pd.to_timedelta(Series(['00:00:01', np.nan])) - 
expected = Series([np.timedelta64(1000000000, 'ns'), - timedelta_NaT], dtype='%-8.8s,obj->%-8.8s," - "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % - (name, result, t, o, method1, method2, a, error or '')) + v = ( + "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," + "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" + % (name, result, t, o, method1, method2, a, error or "") + ) if _verbose: pprint_thing(v) @@ -187,7 +213,7 @@ def _print(result, error=None): try: xp = self.get_result(obj, method2, k2, a) except Exception: - result = 'no comp' + result = "no comp" _print(result) return @@ -200,18 +226,18 @@ def _print(result, error=None): tm.assert_series_equal(rs, xp) elif xp.ndim == 2: tm.assert_frame_equal(rs, xp) - result = 'ok' + result = "ok" except AssertionError as e: detail = str(e) - result = 'fail' + result = "fail" # reverse the checks if fails is True: - if result == 'fail': - result = 'ok (fail)' + if result == "fail": + result = "ok (fail)" _print(result) - if not result.startswith('ok'): + if not result.startswith("ok"): raise AssertionError(detail) except AssertionError: @@ -221,7 +247,7 @@ def _print(result, error=None): # if we are in fails, the ok, otherwise raise it if fails is not None: if isinstance(detail, fails): - result = 'ok (%s)' % type(detail).__name__ + result = "ok (%s)" % type(detail).__name__ _print(result) return diff --git a/pandas/tests/indexing/conftest.py b/pandas/tests/indexing/conftest.py index be1cf4800a2ef3..142bedaa943a62 100644 --- a/pandas/tests/indexing/conftest.py +++ b/pandas/tests/indexing/conftest.py @@ -4,17 +4,20 @@ from pandas._libs import index as libindex -@pytest.fixture(params=[ - (libindex.Int64Engine, np.int64), - (libindex.Int32Engine, np.int32), - (libindex.Int16Engine, np.int16), - (libindex.Int8Engine, np.int8), - (libindex.UInt64Engine, np.uint64), - (libindex.UInt32Engine, np.uint32), - (libindex.UInt16Engine, np.uint16), - (libindex.UInt8Engine, np.uint8), - (libindex.Float64Engine, np.float64), - (libindex.Float32Engine, np.float32), -], ids=lambda x: x[0].__name__) +@pytest.fixture( + params=[ + (libindex.Int64Engine, np.int64), + (libindex.Int32Engine, np.int32), + (libindex.Int16Engine, np.int16), + (libindex.Int8Engine, np.int8), + (libindex.UInt64Engine, np.uint64), + (libindex.UInt32Engine, np.uint32), + (libindex.UInt16Engine, np.uint16), + (libindex.UInt8Engine, np.uint8), + (libindex.Float64Engine, np.float64), + (libindex.Float32Engine, np.float32), + ], + ids=lambda x: x[0].__name__, +) def numeric_indexing_engine_type_and_dtype(request): return request.param diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 76f0b94ea39048..1bdb665101d416 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -7,7 +7,6 @@ class TestIntervalIndex: - def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) @@ -28,14 +27,14 @@ def test_getitem_with_scalar(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s >= 2]) - @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) + @pytest.mark.parametrize("direction", ["increasing", "decreasing"]) def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] - if direction == 'decreasing': + if direction == "decreasing": tpls = tpls[::-1] idx = IntervalIndex.from_tuples(tpls, closed=closed) - s = Series(list('abc'), idx) + s = Series(list("abc"), idx) for key, expected 
in zip(idx.left, s): if idx.closed_left: @@ -73,8 +72,9 @@ def test_non_matching(self): s.loc[[-1, 3]] def test_large_series(self): - s = Series(np.arange(1000000), - index=IntervalIndex.from_breaks(np.arange(1000001))) + s = Series( + np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) + ) result1 = s.loc[:80000] result2 = s.loc[0:80000] @@ -84,10 +84,10 @@ def test_large_series(self): def test_loc_getitem_frame(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = pd.cut(df.A, 5) - df['B'] = s - df = df.set_index('B') + df["B"] = s + df = df.set_index("B") result = df.loc[4] expected = df.iloc[4:6] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index aa016ac5dd1a74..92c71bbc6eb327 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -6,7 +6,6 @@ class TestIntervalIndex: - def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) @@ -32,10 +31,10 @@ def test_loc_with_interval(self): # missing or not exact with pytest.raises(KeyError): - s.loc[Interval(3, 5, closed='left')] + s.loc[Interval(3, 5, closed="left")] with pytest.raises(KeyError): - s[Interval(3, 5, closed='left')] + s[Interval(3, 5, closed="left")] with pytest.raises(KeyError): s[Interval(3, 5)] @@ -99,29 +98,29 @@ def test_loc_with_slices(self): # slice of interval expected = s.iloc[:3] - result = s.loc[Interval(0, 1):Interval(2, 3)] + result = s.loc[Interval(0, 1) : Interval(2, 3)] tm.assert_series_equal(expected, result) - result = s[Interval(0, 1):Interval(2, 3)] + result = s[Interval(0, 1) : Interval(2, 3)] tm.assert_series_equal(expected, result) expected = s.iloc[3:] - result = s.loc[Interval(3, 4):] + result = s.loc[Interval(3, 4) :] tm.assert_series_equal(expected, result) - result = s[Interval(3, 4):] + result = s[Interval(3, 4) :] tm.assert_series_equal(expected, result) - msg = 'Interval objects are not currently supported' + msg = "Interval objects are not currently supported" with pytest.raises(NotImplementedError, match=msg): - s.loc[Interval(3, 6):] + s.loc[Interval(3, 6) :] with pytest.raises(NotImplementedError, match=msg): - s[Interval(3, 6):] + s[Interval(3, 6) :] with pytest.raises(NotImplementedError, match=msg): - s.loc[Interval(3, 4, closed='left'):] + s.loc[Interval(3, 4, closed="left") :] with pytest.raises(NotImplementedError, match=msg): - s[Interval(3, 4, closed='left'):] + s[Interval(3, 4, closed="left") :] # TODO with non-existing intervals ? # s.loc[Interval(-1, 0):Interval(2, 3)] @@ -190,17 +189,17 @@ def test_loc_with_overlap(self): # slices with interval (only exact matches) expected = s - result = s.loc[Interval(1, 5):Interval(3, 7)] + result = s.loc[Interval(1, 5) : Interval(3, 7)] tm.assert_series_equal(expected, result) - result = s[Interval(1, 5):Interval(3, 7)] + result = s[Interval(1, 5) : Interval(3, 7)] tm.assert_series_equal(expected, result) with pytest.raises(KeyError): - s.loc[Interval(1, 6):Interval(3, 8)] + s.loc[Interval(1, 6) : Interval(3, 8)] with pytest.raises(KeyError): - s[Interval(1, 6):Interval(3, 8)] + s[Interval(1, 6) : Interval(3, 8)] # slices with scalar raise for overlapping intervals # TODO KeyError is the appropriate error? 
@@ -229,11 +228,11 @@ def test_non_unique_moar(self): tm.assert_series_equal(expected, result) expected = s - result = s.loc[Interval(1, 3):] + result = s.loc[Interval(1, 3) :] tm.assert_series_equal(expected, result) expected = s - result = s[Interval(1, 3):] + result = s[Interval(1, 3) :] tm.assert_series_equal(expected, result) expected = s.iloc[[0, 1]] diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index 545e092d9ce651..23149944f3c38d 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -8,13 +8,14 @@ @pytest.fixture def multiindex_dataframe_random_data(): """DataFrame with 2 level MultiIndex with random data""" - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) @pytest.fixture @@ -22,10 +23,8 @@ def multiindex_year_month_day_dataframe_random_data(): """DataFrame with 3 level MultiIndex (year, month, day) covering first 100 business days from 2000-01-01 with random data""" tdf = tm.makeTimeDataFrame(100) - ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]).sum() + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels], - inplace=True) - ymd.index.set_names(['year', 'month', 'day'], inplace=True) + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index ff59e446a7b2e5..3183721eeb54f8 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -13,37 +13,38 @@ def test_detect_chained_assignment(): b = [123, None] c = [1234, 2345] d = [12345, 23456] - tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), - ('ears', 'right')] - events = {('eyes', 'left'): a, - ('eyes', 'right'): b, - ('ears', 'left'): c, - ('ears', 'right'): d} - multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) - zed = DataFrame(events, index=['a', 'b'], columns=multiind) + tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] + events = { + ("eyes", "left"): a, + ("eyes", "right"): b, + ("ears", "left"): c, + ("ears", "right"): d, + } + multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) + zed = DataFrame(events, index=["a", "b"], columns=multiind) with pytest.raises(com.SettingWithCopyError): - zed['eyes']['right'].fillna(value=555, inplace=True) + zed["eyes"]["right"].fillna(value=555, inplace=True) def test_cache_updating(): # 5216 # make sure that we don't try to set a dead cache a = np.random.rand(10, 3) - df = DataFrame(a, columns=['x', 'y', 'z']) + df = DataFrame(a, columns=["x", "y", "z"]) tuples = [(i, j) for 
i in range(5) for j in range(2)] index = MultiIndex.from_tuples(tuples) df.index = index # setting via chained assignment # but actually works, since everything is a view - df.loc[0]['z'].iloc[0] = 1. - result = df.loc[(0, 0), 'z'] + df.loc[0]["z"].iloc[0] = 1.0 + result = df.loc[(0, 0), "z"] assert result == 1 # correct setting - df.loc[(0, 0), 'z'] = 2 - result = df.loc[(0, 0), 'z'] + df.loc[(0, 0), "z"] = 2 + result = df.loc[(0, 0), "z"] assert result == 2 diff --git a/pandas/tests/indexing/multiindex/test_datetime.py b/pandas/tests/indexing/multiindex/test_datetime.py index a270ab32e9b041..907d20cd5bd537 100644 --- a/pandas/tests/indexing/multiindex/test_datetime.py +++ b/pandas/tests/indexing/multiindex/test_datetime.py @@ -8,15 +8,15 @@ def test_multiindex_period_datetime(): # GH4861, using datetime in period of multiindex raises exception - idx1 = Index(['a', 'a', 'a', 'b', 'b']) - idx2 = period_range('2012-01', periods=len(idx1), freq='M') + idx1 = Index(["a", "a", "a", "b", "b"]) + idx2 = period_range("2012-01", periods=len(idx1), freq="M") s = Series(np.random.randn(len(idx1)), [idx1, idx2]) # try Period as index expected = s.iloc[0] - result = s.loc['a', Period('2012-01')] + result = s.loc["a", Period("2012-01")] assert result == expected # try datetime as index - result = s.loc['a', datetime(2012, 1, 1)] + result = s.loc["a", datetime(2012, 1, 1)] assert result == expected diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 2fbbdef33b6339..0c61644eb46aee 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -10,13 +10,14 @@ # ---------------------------------------------------------------------------- -@pytest.mark.parametrize('access_method', [lambda s, x: s[:, x], - lambda s, x: s.loc[:, x], - lambda s, x: s.xs(x, level=1)]) -@pytest.mark.parametrize('level1_value, expected', [ - (0, Series([1], index=[0])), - (1, Series([2, 3], index=[1, 2])) -]) +@pytest.mark.parametrize( + "access_method", + [lambda s, x: s[:, x], lambda s, x: s.loc[:, x], lambda s, x: s.xs(x, level=1)], +) +@pytest.mark.parametrize( + "level1_value, expected", + [(0, Series([1], index=[0])), (1, Series([2, 3], index=[1, 2]))], +) def test_series_getitem_multiindex(access_method, level1_value, expected): # GH 6018 @@ -28,40 +29,37 @@ def test_series_getitem_multiindex(access_method, level1_value, expected): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('level0_value', ['D', 'A']) +@pytest.mark.parametrize("level0_value", ["D", "A"]) def test_series_getitem_duplicates_multiindex(level0_value): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! 
- index = MultiIndex(levels=[[level0_value, 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) + index = MultiIndex( + levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) arr = np.random.randn(len(index), 1) - df = DataFrame(arr, index=index, columns=['val']) + df = DataFrame(arr, index=index, columns=["val"]) # confirm indexing on missing value raises KeyError - if level0_value != 'A': + if level0_value != "A": with pytest.raises(KeyError, match=r"^'A'$"): - df.val['A'] + df.val["A"] with pytest.raises(KeyError, match=r"^'X'$"): - df.val['X'] + df.val["X"] result = df.val[level0_value] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) + expected = Series( + arr.ravel()[0:3], name="val", index=Index([26, 37, 57], name="day") + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda s: s[2000, 3], - lambda s: s.loc[2000, 3] -]) -def test_series_getitem( - multiindex_year_month_day_dataframe_random_data, indexer): - s = multiindex_year_month_day_dataframe_random_data['A'] +@pytest.mark.parametrize("indexer", [lambda s: s[2000, 3], lambda s: s.loc[2000, 3]]) +def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[42:65]) expected.index = expected.index.droplevel(0).droplevel(0) @@ -69,40 +67,50 @@ def test_series_getitem( tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda s: s[2000, 3, 10], - lambda s: s.loc[2000, 3, 10] -]) +@pytest.mark.parametrize( + "indexer", [lambda s: s[2000, 3, 10], lambda s: s.loc[2000, 3, 10]] +) def test_series_getitem_returns_scalar( - multiindex_year_month_day_dataframe_random_data, indexer): - s = multiindex_year_month_day_dataframe_random_data['A'] + multiindex_year_month_day_dataframe_random_data, indexer +): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.iloc[49] result = indexer(s) assert result == expected -@pytest.mark.parametrize('indexer,expected_error,expected_error_msg', [ - (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356$"), - (lambda s: s[(2000, 3, 4)], KeyError, r"^356$"), - (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^356$"), - (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, 'Too many indexers'), - (lambda s: s.__getitem__(len(s)), IndexError, 'index out of bounds'), - (lambda s: s[len(s)], IndexError, 'index out of bounds'), - (lambda s: s.iloc[len(s)], IndexError, - 'single positional indexer is out-of-bounds') -]) +@pytest.mark.parametrize( + "indexer,expected_error,expected_error_msg", + [ + (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356$"), + (lambda s: s[(2000, 3, 4)], KeyError, r"^356$"), + (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^356$"), + (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), + (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), + (lambda s: s[len(s)], IndexError, "index out of bounds"), + ( + lambda s: s.iloc[len(s)], + IndexError, + "single positional indexer is out-of-bounds", + ), + ], +) def test_series_getitem_indexing_errors( - multiindex_year_month_day_dataframe_random_data, indexer, - expected_error, expected_error_msg): - s = multiindex_year_month_day_dataframe_random_data['A'] + 
multiindex_year_month_day_dataframe_random_data, + indexer, + expected_error, + expected_error_msg, +): + s = multiindex_year_month_day_dataframe_random_data["A"] with pytest.raises(expected_error, match=expected_error_msg): indexer(s) def test_series_getitem_corner_generator( - multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] + multiindex_year_month_day_dataframe_random_data +): + s = multiindex_year_month_day_dataframe_random_data["A"] result = s[(x > 0 for x in s)] expected = s[s > 0] tm.assert_series_equal(result, expected) @@ -112,42 +120,55 @@ def test_series_getitem_corner_generator( # test indexing of DataFrame with multi-level Index # ---------------------------------------------------------------------------- + def test_getitem_simple(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data.T expected = df.values[:, 0] - result = df['foo', 'one'].values + result = df["foo", "one"].values tm.assert_almost_equal(result, expected) -@pytest.mark.parametrize('indexer,expected_error_msg', [ - (lambda df: df[('foo', 'four')], r"^\('foo', 'four'\)$"), - (lambda df: df['foobar'], r"^'foobar'$") -]) +@pytest.mark.parametrize( + "indexer,expected_error_msg", + [ + (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"), + (lambda df: df["foobar"], r"^'foobar'$"), + ], +) def test_frame_getitem_simple_key_error( - multiindex_dataframe_random_data, indexer, expected_error_msg): + multiindex_dataframe_random_data, indexer, expected_error_msg +): df = multiindex_dataframe_random_data.T with pytest.raises(KeyError, match=expected_error_msg): indexer(df) def test_frame_getitem_multicolumn_empty_level(): - df = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) - df.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'], - ['level3 item1', 'level3 item2']] - - result = df['level1 item1'] - expected = DataFrame([['1'], ['2'], ['3']], index=df.index, - columns=['level3 item1']) + df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) + df.columns = [ + ["level1 item1", "level1 item2"], + ["", "level2 item2"], + ["level3 item1", "level3 item2"], + ] + + result = df["level1 item1"] + expected = DataFrame( + [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer,expected_slice', [ - (lambda df: df['foo'], slice(3)), - (lambda df: df['bar'], slice(3, 5)), - (lambda df: df.loc[:, 'bar'], slice(3, 5)) -]) +@pytest.mark.parametrize( + "indexer,expected_slice", + [ + (lambda df: df["foo"], slice(3)), + (lambda df: df["bar"], slice(3, 5)), + (lambda df: df.loc[:, "bar"], slice(3, 5)), + ], +) def test_frame_getitem_toplevel( - multiindex_dataframe_random_data, indexer, expected_slice): + multiindex_dataframe_random_data, indexer, expected_slice +): df = multiindex_dataframe_random_data.T expected = df.reindex(columns=df.columns[expected_slice]) expected.columns = expected.columns.droplevel(0) @@ -156,21 +177,23 @@ def test_frame_getitem_toplevel( def test_frame_mixed_depth_get(): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(4, 6), columns=index) - result = 
df['a'] - expected = df['a', '', ''].rename('a') + result = df["a"] + expected = df["a", "", ""].rename("a") tm.assert_series_equal(result, expected) - result = df['routine1', 'result1'] - expected = df['routine1', 'result1', ''] - expected = expected.rename(('routine1', 'result1')) + result = df["routine1", "result1"] + expected = df["routine1", "result1", ""] + expected = expected.rename(("routine1", "result1")) tm.assert_series_equal(result, expected) @@ -178,30 +201,29 @@ def test_frame_mixed_depth_get(): # test indexing of DataFrame with multi-level Index with duplicates # ---------------------------------------------------------------------------- + @pytest.fixture def dataframe_with_duplicate_index(): """Fixture for DataFrame used in tests for gh-4145 and gh-4146""" - data = [['a', 'd', 'e', 'c', 'f', 'b'], - [1, 4, 5, 3, 6, 2], - [1, 4, 5, 3, 6, 2]] - index = ['h1', 'h3', 'h5'] + data = [["a", "d", "e", "c", "f", "b"], [1, 4, 5, 3, 6, 2], [1, 4, 5, 3, 6, 2]] + index = ["h1", "h3", "h5"] columns = MultiIndex( - levels=[['A', 'B'], ['A1', 'A2', 'B1', 'B2']], + levels=[["A", "B"], ["A1", "A2", "B1", "B2"]], codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]], - names=['main', 'sub']) + names=["main", "sub"], + ) return DataFrame(data, index=index, columns=columns) -@pytest.mark.parametrize('indexer', [ - lambda df: df[('A', 'A1')], - lambda df: df.loc[:, ('A', 'A1')] -]) +@pytest.mark.parametrize( + "indexer", [lambda df: df[("A", "A1")], lambda df: df.loc[:, ("A", "A1")]] +) def test_frame_mi_access(dataframe_with_duplicate_index, indexer): # GH 4145 df = dataframe_with_duplicate_index - index = Index(['h1', 'h3', 'h5']) - columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub']) - expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T + index = Index(["h1", "h3", "h5"]) + columns = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"]) + expected = DataFrame([["a", 1, 1]], index=columns, columns=index).T result = indexer(df) tm.assert_frame_equal(result, expected) @@ -213,16 +235,18 @@ def test_frame_mi_access_returns_series(dataframe_with_duplicate_index): # as of 4879, this returns a Series (which is similar to what happens # with a non-unique) df = dataframe_with_duplicate_index - expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1') - result = df['A']['A1'] + expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1") + result = df["A"]["A1"] tm.assert_series_equal(result, expected) def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index): # selecting a non_unique from the 2nd level df = dataframe_with_duplicate_index - expected = DataFrame([['d', 4, 4], ['e', 5, 5]], - index=Index(['B2', 'B2'], name='sub'), - columns=['h1', 'h3', 'h5'], ).T - result = df['A']['B2'] + expected = DataFrame( + [["d", 4, 4], ["e", 5, 5]], + index=Index(["B2", "B2"], name="sub"), + columns=["h1", "h3", "h5"], + ).T + result = df["A"]["B2"] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_iloc.py b/pandas/tests/indexing/multiindex/test_iloc.py index bdd505804c82b5..1335ee9dc2bdf0 100644 --- a/pandas/tests/indexing/multiindex/test_iloc.py +++ b/pandas/tests/indexing/multiindex/test_iloc.py @@ -12,23 +12,34 @@ def simple_multiindex_dataframe(): both columns and row MultiIndex using supplied data or random data by default. 
""" + def _simple_multiindex_dataframe(data=None): if data is None: data = np.random.randn(3, 3) - return DataFrame(data, columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + return DataFrame( + data, columns=[[2, 2, 4], [6, 8, 10]], index=[[4, 4, 8], [8, 10, 12]] + ) + return _simple_multiindex_dataframe -@pytest.mark.parametrize('indexer, expected', [ - (lambda df: df.iloc[0], - lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8))), - (lambda df: df.iloc[2], - lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12))), - (lambda df: df.iloc[:, 2], - lambda arr: Series( - arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10))) -]) +@pytest.mark.parametrize( + "indexer, expected", + [ + ( + lambda df: df.iloc[0], + lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8)), + ), + ( + lambda df: df.iloc[2], + lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12)), + ), + ( + lambda df: df.iloc[:, 2], + lambda arr: Series(arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10)), + ), + ], +) def test_iloc_returns_series(indexer, expected, simple_multiindex_dataframe): arr = np.random.randn(3, 3) df = simple_multiindex_dataframe(arr) @@ -54,20 +65,22 @@ def test_iloc_returns_scalar(simple_multiindex_dataframe): def test_iloc_getitem_multiple_items(): # GH 5528 - tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + tup = zip(*[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) index = MultiIndex.from_tuples(tup) df = DataFrame(np.random.randn(4, 4), index=index) result = df.iloc[[2, 3]] - expected = df.xs('b', drop_level=False) + expected = df.xs("b", drop_level=False) tm.assert_frame_equal(result, expected) def test_iloc_getitem_labels(): # this is basically regular indexing arr = np.random.randn(4, 3) - df = DataFrame(arr, - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y', 'Y']]) + df = DataFrame( + arr, + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j", "k"], ["X", "X", "Y", "Y"]], + ) result = df.iloc[2, 2] expected = arr[2, 2] assert result == expected @@ -91,23 +104,30 @@ def test_frame_setitem_slice(multiindex_dataframe_random_data): def test_indexing_ambiguity_bug_1678(): # GH 1678 columns = MultiIndex.from_tuples( - [('Ohio', 'Green'), ('Ohio', 'Red'), ('Colorado', 'Green')]) - index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]) + [("Ohio", "Green"), ("Ohio", "Red"), ("Colorado", "Green")] + ) + index = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) df = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns) result = df.iloc[:, 1] - expected = df.loc[:, ('Ohio', 'Red')] + expected = df.loc[:, ("Ohio", "Red")] tm.assert_series_equal(result, expected) def test_iloc_integer_locations(): # GH 13797 - data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], - ['str30', 'str31'], ['str40', 'str41']] + data = [ + ["str00", "str01"], + ["str10", "str11"], + ["str20", "srt21"], + ["str30", "str31"], + ["str40", "str41"], + ] index = MultiIndex.from_tuples( - [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) + [("CC", "A"), ("CC", "B"), ("CC", "B"), ("BB", "a"), ("BB", "b")] + ) expected = DataFrame(data) df = DataFrame(data, index=index) @@ -118,28 +138,28 @@ def test_iloc_integer_locations(): @pytest.mark.parametrize( - 'data, indexes, values, expected_k', [ + "data, indexes, values, expected_k", + [ # test without indexer value in first level of MultiIndex ([[2, 22, 5], 
[2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]), # test like code sample 1 in the issue - ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], - [755, 1066]), + ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], [755, 1066]), # test like code sample 2 in the issue ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]), # test like code sample 3 in the issue - ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], - [8, 15, 13]) - ]) + ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], [8, 15, 13]), + ], +) def test_iloc_setitem_int_multiindex_series(data, indexes, values, expected_k): # GH17148 - df = DataFrame(data=data, columns=['i', 'j', 'k']) - df = df.set_index(['i', 'j']) + df = DataFrame(data=data, columns=["i", "j", "k"]) + df = df.set_index(["i", "j"]) series = df.k.copy() for i, v in zip(indexes, values): series.iloc[i] += v - df['k'] = expected_k + df["k"] = expected_k expected = df.k tm.assert_series_equal(series, expected) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index 1eb137eb65fa1d..aab44daf8d17f0 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -16,36 +16,37 @@ def test_multiindex_get_loc(): # GH7724, GH2646 # test indexing into a multi-index before & past the lexsort depth from numpy.random import randint, choice, randn - cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] + + cols = ["jim", "joe", "jolie", "joline", "jolia"] def validate(mi, df, key): - mask = np.ones(len(df)).astype('bool') + mask = np.ones(len(df)).astype("bool") # test for all partials of this key for i, k in enumerate(key): mask &= df.iloc[:, i] == k if not mask.any(): - assert key[:i + 1] not in mi.index + assert key[: i + 1] not in mi.index continue - assert key[:i + 1] in mi.index + assert key[: i + 1] in mi.index right = df[mask].copy() if i + 1 != len(key): # partial key - right.drop(cols[:i + 1], axis=1, inplace=True) - right.set_index(cols[i + 1:-1], inplace=True) - tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + right.drop(cols[: i + 1], axis=1, inplace=True) + right.set_index(cols[i + 1 : -1], inplace=True) + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) else: # full key right.set_index(cols[:-1], inplace=True) if len(right) == 1: # single hit - right = Series(right['jolia'].values, - name=right.index[0], - index=['jolia']) - tm.assert_series_equal(mi.loc[key[:i + 1]], right) + right = Series( + right["jolia"].values, name=right.index[0], index=["jolia"] + ) + tm.assert_series_equal(mi.loc[key[: i + 1]], right) else: # multi hit - tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) def loop(mi, df, keys): for key in keys: @@ -53,19 +54,24 @@ def loop(mi, df, keys): n, m = 1000, 50 - vals = [randint(0, 10, n), choice( - list('abcdefghij'), n), choice( - pd.date_range('20141009', periods=10).tolist(), n), choice( - list('ZYXWVUTSRQ'), n), randn(n)] + vals = [ + randint(0, 10, n), + choice(list("abcdefghij"), n), + choice(pd.date_range("20141009", periods=10).tolist(), n), + choice(list("ZYXWVUTSRQ"), n), + randn(n), + ] vals = list(map(tuple, zip(*vals))) # bunch of keys for testing - keys = [randint(0, 11, m), choice( - list('abcdefghijk'), m), choice( - pd.date_range('20141009', periods=11).tolist(), m), choice( - list('ZYXWVUTSRQP'), m)] + keys = [ + randint(0, 11, m), + choice(list("abcdefghijk"), m), + 
choice(pd.date_range("20141009", periods=11).tolist(), m), + choice(list("ZYXWVUTSRQP"), m), + ] keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[::n // m])) + keys += list(map(lambda t: t[:-1], vals[:: n // m])) # covers both unique index and non-unique index df = DataFrame(vals, columns=cols) @@ -73,8 +79,7 @@ def loop(mi, df, keys): for frame in a, b: for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values( - by=cols[:i]) + df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) mi = df.set_index(cols[:-1]) assert not mi.index.lexsort_depth < i loop(mi, df, keys) @@ -84,4 +89,4 @@ def loop(mi, df, keys): def test_large_mi_dataframe_indexing(): # GH10645 result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) - assert (not (10 ** 6, 0) in result) + assert not (10 ** 6, 0) in result diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 5ea172f14f6f6c..d43115d60c0294 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -11,34 +11,35 @@ @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexIx: - def test_frame_setitem_ix(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data - frame.loc[('bar', 'two'), 'B'] = 5 - assert frame.loc[('bar', 'two'), 'B'] == 5 + frame.loc[("bar", "two"), "B"] = 5 + assert frame.loc[("bar", "two"), "B"] == 5 # with integer labels df = frame.copy() df.columns = list(range(3)) - df.loc[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 + df.loc[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) df = frame.copy() df.columns = list(range(3)) - df.ix[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 + df.ix[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 def test_ix_general(self): # ix general issues # GH 2817 - data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, - 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, - 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} - df = DataFrame(data).set_index(keys=['col', 'year']) + data = { + "amount": {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + "col": {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + "year": {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}, + } + df = DataFrame(data).set_index(keys=["col", "year"]) key = 4.0, 2012 # emits a PerformanceWarning, ok @@ -50,23 +51,25 @@ def test_ix_general(self): res = df.loc[key] # col has float dtype, result should be Float64Index - index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3], - names=['col', 'year']) - expected = DataFrame({'amount': [222, 333, 444]}, index=index) + index = MultiIndex.from_arrays([[4.0] * 3, [2012] * 3], names=["col", "year"]) + expected = DataFrame({"amount": [222, 333, 444]}, index=index) tm.assert_frame_equal(res, expected) def test_ix_multiindex_missing_label_raises(self): # GH 21593 - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) with pytest.raises(KeyError, match=r"^2$"): df.ix[2] def test_series_ix_getitem_fancy( - self, multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] + self, multiindex_year_month_day_dataframe_random_data + ): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[49:51]) result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 962976b8ded559..9188adc7d6e93d 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -12,8 +12,9 @@ @pytest.fixture def single_level_multiindex(): """single level MultiIndex""" - return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + return MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) @pytest.fixture @@ -25,17 +26,17 @@ def frame_random_data_integer_multi_index(): class TestMultiIndexLoc: - def test_loc_getitem_series(self): # GH14730 # passing a series as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) x = Series(index=index, data=range(9), dtype=np.float64) y = Series([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) result = x.loc[y] tm.assert_series_equal(result, expected) @@ -48,99 +49,107 @@ def test_loc_getitem_series(self): tm.assert_series_equal(result, expected) empty = Series(data=[], dtype=np.float64) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) + expected = Series( + [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + ) result = x.loc[empty] tm.assert_series_equal(result, expected) def test_loc_getitem_array(self): # GH15434 # passing an array as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) x = Series(index=index, data=range(9), dtype=np.float64) y = np.array([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) result = x.loc[y] tm.assert_series_equal(result, expected) # empty array: empty = np.array([]) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) + expected = Series( + [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + ) result = x.loc[empty] tm.assert_series_equal(result, expected) # 0-dim array (scalar): scalar = 
np.int64(1) - expected = Series( - data=[0, 1, 2], - index=['A', 'B', 'C'], - dtype=np.float64) + expected = Series(data=[0, 1, 2], index=["A", "B", "C"], dtype=np.float64) result = x.loc[scalar] tm.assert_series_equal(result, expected) def test_loc_multiindex_labels(self): - df = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + df = DataFrame( + np.random.randn(3, 3), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) # the first 2 rows expected = df.iloc[[0, 1]].droplevel(0) - result = df.loc['i'] + result = df.loc["i"] tm.assert_frame_equal(result, expected) # 2nd (last) column expected = df.iloc[:, [2]].droplevel(0, axis=1) - result = df.loc[:, 'j'] + result = df.loc[:, "j"] tm.assert_frame_equal(result, expected) # bottom right corner expected = df.iloc[[2], [2]].droplevel(0).droplevel(0, axis=1) - result = df.loc['j'].loc[:, 'j'] + result = df.loc["j"].loc[:, "j"] tm.assert_frame_equal(result, expected) # with a tuple expected = df.iloc[[0, 1]] - result = df.loc[('i', 'X')] + result = df.loc[("i", "X")] tm.assert_frame_equal(result, expected) def test_loc_multiindex_ints(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) expected = df.iloc[[0, 1]].droplevel(0) result = df.loc[4] tm.assert_frame_equal(result, expected) def test_loc_multiindex_missing_label_raises(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) with pytest.raises(KeyError, match=r"^2$"): df.loc[2] def test_loc_multiindex_too_many_dims_raises(self): # GH 14885 - s = Series(range(8), index=MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']])) + s = Series( + range(8), + index=MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + ) with pytest.raises(KeyError, match=r"^\('a', 'b'\)$"): - s.loc['a', 'b'] + s.loc["a", "b"] with pytest.raises(KeyError, match=r"^\('a', 'd', 'g'\)$"): - s.loc['a', 'd', 'g'] - with pytest.raises(IndexingError, match='Too many indexers'): - s.loc['a', 'd', 'g', 'j'] + s.loc["a", "d", "g"] + with pytest.raises(IndexingError, match="Too many indexers"): + s.loc["a", "d", "g", "j"] def test_loc_multiindex_indexer_none(self): # GH6788 # multi-index indexer is None (meaning take all) - attributes = ['Attribute' + str(i) for i in range(1)] - attribute_values = ['Value' + str(i) for i in range(5)] + attributes = ["Attribute" + str(i) for i in range(1)] + attribute_values = ["Value" + str(i) for i in range(5)] index = MultiIndex.from_product([attributes, attribute_values]) df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 @@ -150,11 +159,12 @@ def test_loc_multiindex_indexer_none(self): # GH 7349 # loc with a multi-index seems to be doing fallback - df = DataFrame(np.arange(12).reshape(-1, 1), - index=MultiIndex.from_product([[1, 2, 3, 4], - [1, 2, 3]])) + df = DataFrame( + np.arange(12).reshape(-1, 1), + index=MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]]), + ) - expected = df.loc[([1, 2], ), :] + expected = df.loc[([1, 2],), :] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) @@ -162,34 +172,37 @@ def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers - s = 
Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = s.loc[:, 'a':'c'] + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.loc[:, "a":"c"] - result = s.loc[0:4, 'a':'c'] + result = s.loc[0:4, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) - result = s.loc[:4, 'a':'c'] + result = s.loc[:4, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) - result = s.loc[0:, 'a':'c'] + result = s.loc[0:, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) # GH 7400 # multiindexer gettitem with list of indexers skips wrong element - s = Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) expected = s.iloc[[6, 7, 8, 12, 13, 14]] - result = s.loc[2:4:2, 'a':'c'] + result = s.loc[2:4:2, "a":"c"] tm.assert_series_equal(result, expected) def test_get_loc_single_level(self, single_level_multiindex): single_level = single_level_multiindex - s = Series(np.random.randn(len(single_level)), - index=single_level) + s = Series(np.random.randn(len(single_level)), index=single_level) for k in single_level.values: s[k] @@ -197,16 +210,17 @@ def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices - index = MultiIndex.from_tuples([t for t in itertools.product( - [6, 7, 8], ['a', 'b'])]) + index = MultiIndex.from_tuples( + [t for t in itertools.product([6, 7, 8], ["a", "b"])] + ) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) - index = MultiIndex.from_tuples([t - for t in itertools.product( - [10, 20, 30], ['a', 'b'])]) + index = MultiIndex.from_tuples( + [t for t in itertools.product([10, 20, 30], ["a", "b"])] + ) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] @@ -215,7 +229,7 @@ def test_loc_getitem_int_slice(self): # doc examples result = df.loc[10, :] expected = df.iloc[0:2] - expected.index = ['a', 'b'] + expected.index = ["a", "b"] tm.assert_frame_equal(result, expected) result = df.loc[:, 10] @@ -223,11 +237,11 @@ def test_loc_getitem_int_slice(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - 'indexer_type_1', - (list, tuple, set, slice, np.ndarray, Series, Index)) + "indexer_type_1", (list, tuple, set, slice, np.ndarray, Series, Index) + ) @pytest.mark.parametrize( - 'indexer_type_2', - (list, tuple, set, slice, np.ndarray, Series, Index)) + "indexer_type_2", (list, tuple, set, slice, np.ndarray, Series, Index) + ) def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): # GH #19686 # .loc should work with nested indexers which can be @@ -244,8 +258,8 @@ def convert_nested_indexer(indexer_type, keys): b = [1, 2, 3] index = MultiIndex.from_product([a, b]) df = DataFrame( - np.arange(len(index), dtype='int64'), - index=index, columns=['Data']) + np.arange(len(index), dtype="int64"), index=index, columns=["Data"] + ) keys = ([10, 20], [2, 3]) types = (indexer_type_1, indexer_type_2) @@ -254,43 +268,52 @@ def convert_nested_indexer(indexer_type, keys): # of all the valid types indexer = tuple( convert_nested_indexer(indexer_type, k) - for indexer_type, k in zip(types, keys)) + for indexer_type, k in zip(types, keys) 
+ ) - result = df.loc[indexer, 'Data'] + result = df.loc[indexer, "Data"] expected = Series( - [1, 2, 4, 5], name='Data', - index=MultiIndex.from_product(keys)) + [1, 2, 4, 5], name="Data", index=MultiIndex.from_product(keys) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer, is_level1, expected_error', [ - ([], False, None), # empty ok - (['A'], False, None), - (['A', 'D'], False, None), - (['D'], False, r"\['D'\] not in index"), # not any values found - (pd.IndexSlice[:, ['foo']], True, None), - (pd.IndexSlice[:, ['foo', 'bah']], True, None) -]) -def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1, - expected_error): +@pytest.mark.parametrize( + "indexer, is_level1, expected_error", + [ + ([], False, None), # empty ok + (["A"], False, None), + (["A", "D"], False, None), + (["D"], False, r"\['D'\] not in index"), # not any values found + (pd.IndexSlice[:, ["foo"]], True, None), + (pd.IndexSlice[:, ["foo", "bah"]], True, None), + ], +) +def test_loc_getitem_duplicates_multiindex_missing_indexers( + indexer, is_level1, expected_error +): # GH 7866 # multi-index slicing with missing indexers - idx = MultiIndex.from_product([['A', 'B', 'C'], - ['foo', 'bar', 'baz']], - names=['one', 'two']) - s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() + idx = MultiIndex.from_product( + [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + s = Series(np.arange(9, dtype="int64"), index=idx).sort_index() if indexer == []: expected = s.iloc[[]] elif is_level1: - expected = Series([0, 3, 6], index=MultiIndex.from_product( - [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() + expected = Series( + [0, 3, 6], + index=MultiIndex.from_product( + [["A", "B", "C"], ["foo"]], names=["one", "two"] + ), + ).sort_index() else: - exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], - names=['one', 'two']) - expected = Series(np.arange(3, dtype='int64'), - index=exp_idx).sort_index() + exp_idx = MultiIndex.from_product( + [["A"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + expected = Series(np.arange(3, dtype="int64"), index=exp_idx).sort_index() if expected_error is not None: with pytest.raises(KeyError, match=expected_error): @@ -300,23 +323,18 @@ def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1, tm.assert_series_equal(result, expected) -def test_series_loc_getitem_fancy( - multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] +def test_series_loc_getitem_fancy(multiindex_year_month_day_dataframe_random_data): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[49:51]) result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('columns_indexer', [ - ([], slice(None)), - (['foo'], []) -]) +@pytest.mark.parametrize("columns_indexer", [([], slice(None)), (["foo"], [])]) def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): # GH 8737 # empty indexer - multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], - ['alpha', 'beta'])) + multi_index = MultiIndex.from_product((["foo", "bar", "baz"], ["alpha", "beta"])) df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index) df = df.sort_index(level=0, axis=1) @@ -328,22 +346,26 @@ def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): def test_loc_getitem_duplicates_multiindex_non_scalar_type_object(): # regression from < 
0.14.0 # GH 7914 - df = DataFrame([[np.mean, np.median], ['mean', 'median']], - columns=MultiIndex.from_tuples([('functs', 'mean'), - ('functs', 'median')]), - index=['function', 'name']) - result = df.loc['function', ('functs', 'mean')] + df = DataFrame( + [[np.mean, np.median], ["mean", "median"]], + columns=MultiIndex.from_tuples([("functs", "mean"), ("functs", "median")]), + index=["function", "name"], + ) + result = df.loc["function", ("functs", "mean")] expected = np.mean assert result == expected def test_loc_getitem_tuple_plus_slice(): # GH 671 - df = DataFrame({'a': np.arange(10), - 'b': np.arange(10), - 'c': np.random.randn(10), - 'd': np.random.randn(10)} - ).set_index(['a', 'b']) + df = DataFrame( + { + "a": np.arange(10), + "b": np.arange(10), + "c": np.random.randn(10), + "d": np.random.randn(10), + } + ).set_index(["a", "b"]) expected = df.loc[0, 0] result = df.loc[(0, 0), :] tm.assert_series_equal(result, expected) @@ -357,8 +379,7 @@ def test_loc_getitem_int(frame_random_data_integer_multi_index): tm.assert_frame_equal(result, expected) -def test_loc_getitem_int_raises_exception( - frame_random_data_integer_multi_index): +def test_loc_getitem_int_raises_exception(frame_random_data_integer_multi_index): df = frame_random_data_integer_multi_index with pytest.raises(KeyError, match=r"^3$"): df.loc[3] @@ -369,10 +390,10 @@ def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): # test setup - check key not in dataframe with pytest.raises(KeyError, match=r"^11$"): - df.loc[('bar', 'three'), 'B'] + df.loc[("bar", "three"), "B"] # in theory should be inserting in a sorted space???? - df.loc[('bar', 'three'), 'B'] = 0 + df.loc[("bar", "three"), "B"] = 0 expected = 0 - result = df.sort_index().loc[('bar', 'three'), 'B'] + result = df.sort_index().loc[("bar", "three"), "B"] assert result == expected diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 6dc8d67a971d3a..ccaaa2ae02499a 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -11,20 +10,22 @@ class TestMultiIndexBasic: - def test_multiindex_perf_warn(self): - df = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) + df = DataFrame( + { + "jim": [0, 0, 1, 1], + "joe": ["x", "x", "z", "y"], + "jolie": np.random.rand(4), + } + ).set_index(["jim", "joe"]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.index]): - df.loc[(1, 'z')] + with tm.assert_produces_warning(PerformanceWarning, clear=[pd.core.index]): + df.loc[(1, "z")] df = df.iloc[[2, 1, 3, 0]] with tm.assert_produces_warning(PerformanceWarning): - df.loc[(0, )] + df.loc[(0,)] def test_multiindex_contains_dropped(self): # GH 19027 @@ -40,17 +41,20 @@ def test_multiindex_contains_dropped(self): assert 2 not in idx # also applies to strings - idx = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - assert 'a' in idx - idx = idx.drop('a') - assert 'a' in idx.levels[0] - assert 'a' not in idx - - @pytest.mark.parametrize("data, expected", [ - (MultiIndex.from_product([(), ()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), - ]) + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + 
@pytest.mark.parametrize( + "data, expected", + [ + (MultiIndex.from_product([(), ()]), True), + (MultiIndex.from_product([(1, 2), (3, 4)]), True), + (MultiIndex.from_product([("a", "b"), (1, 2)]), False), + ], + ) def test_multiindex_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected @@ -60,8 +64,7 @@ def test_indexing_over_hashtable_size_cutoff(self): old_cutoff = _index._SIZE_CUTOFF _index._SIZE_CUTOFF = 20000 - s = Series(np.arange(n), - MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) # hai it works! assert s[("a", 5)] == 5 @@ -73,22 +76,31 @@ def test_indexing_over_hashtable_size_cutoff(self): def test_multi_nan_indexing(self): # GH 3588 - df = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'], - 'b': ["C1", "C2", "C3", "C4"], - "c": [10, 15, np.nan, 20]}) - result = df.set_index(['a', 'b'], drop=False) - expected = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'], - 'b': ["C1", "C2", "C3", "C4"], - "c": [10, 15, np.nan, 20]}, - index=[Index(['R1', 'R2', np.nan, 'R4'], - name='a'), - Index(['C1', 'C2', 'C3', 'C4'], name='b')]) + df = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + result = df.set_index(["a", "b"], drop=False) + expected = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + }, + index=[ + Index(["R1", "R2", np.nan, "R4"], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) tm.assert_frame_equal(result, expected) def test_contains(self): # GH 24570 - tx = pd.timedelta_range('09:30:00', '16:00:00', freq='30 min') + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) assert tx[0] in idx - assert 'element_not_exit' not in idx - assert '0 day 09:30:00' in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 20830bbe4680b7..3c65f1b8abddb8 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -8,27 +8,25 @@ class TestMultiIndexPartial: - def test_getitem_partial_int(self): # GH 12416 # with single item l1 = [10, 20] - l2 = ['a', 'b'] - df = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1, l2])) - expected = DataFrame(index=range(2), - columns=l2) + l2 = ["a", "b"] + df = DataFrame(index=range(2), columns=MultiIndex.from_product([l1, l2])) + expected = DataFrame(index=range(2), columns=l2) result = df[20] tm.assert_frame_equal(result, expected) # with list - expected = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1[1:], l2])) + expected = DataFrame( + index=range(2), columns=MultiIndex.from_product([l1[1:], l2]) + ) result = df[[20]] tm.assert_frame_equal(result, expected) # missing item: - with pytest.raises(KeyError, match='1'): + with pytest.raises(KeyError, match="1"): df[1] with pytest.raises(KeyError, match=r"'\[1\] not in index'"): df[[1]] @@ -36,13 +34,16 @@ def test_getitem_partial_int(self): def test_series_slice_partial(self): pass - def test_xs_partial(self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): + def test_xs_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): frame = multiindex_dataframe_random_data ymd = 
multiindex_year_month_day_dataframe_random_data - result = frame.xs('foo') - result2 = frame.loc['foo'] - expected = frame.T['foo'].T + result = frame.xs("foo") + result2 = frame.loc["foo"] + expected = frame.T["foo"].T tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result, result2) @@ -51,19 +52,21 @@ def test_xs_partial(self, multiindex_dataframe_random_data, tm.assert_frame_equal(result, expected) # ex from #1796 - index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - codes=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) - df = DataFrame(np.random.randn(8, 4), index=index, - columns=list('abcd')) - - result = df.xs(['foo', 'one']) - expected = df.loc['foo', 'one'] + index = MultiIndex( + levels=[["foo", "bar"], ["one", "two"], [-1, 1]], + codes=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + ) + df = DataFrame(np.random.randn(8, 4), index=index, columns=list("abcd")) + + result = df.xs(["foo", "one"]) + expected = df.loc["foo", "one"] tm.assert_frame_equal(result, expected) - def test_getitem_partial( - self, multiindex_year_month_day_dataframe_random_data): + def test_getitem_partial(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data ymd = ymd.T result = ymd[2000, 2] @@ -73,10 +76,12 @@ def test_getitem_partial( tm.assert_frame_equal(result, expected) def test_fancy_slice_partial( - self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): frame = multiindex_dataframe_random_data - result = frame.loc['bar':'baz'] + result = frame.loc["bar":"baz"] expected = frame[3:7] tm.assert_frame_equal(result, expected) @@ -87,28 +92,29 @@ def test_fancy_slice_partial( tm.assert_frame_equal(result, expected) def test_getitem_partial_column_select(self): - idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], - levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) + idx = MultiIndex( + codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[["a", "b"], ["x", "y"], ["p", "q"]], + ) df = DataFrame(np.random.rand(3, 2), index=idx) - result = df.loc[('a', 'y'), :] - expected = df.loc[('a', 'y')] + result = df.loc[("a", "y"), :] + expected = df.loc[("a", "y")] tm.assert_frame_equal(result, expected) - result = df.loc[('a', 'y'), [1, 0]] - expected = df.loc[('a', 'y')][[1, 0]] + result = df.loc[("a", "y"), [1, 0]] + expected = df.loc[("a", "y")][[1, 0]] tm.assert_frame_equal(result, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = df.ix[('a', 'y'), [1, 0]] + result = df.ix[("a", "y"), [1, 0]] tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): - df.loc[('a', 'foo'), :] + df.loc[("a", "foo"), :] - def test_partial_set( - self, multiindex_year_month_day_dataframe_random_data): + def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data df = ymd.copy() @@ -117,8 +123,8 @@ def test_partial_set( exp.loc[2000, 4].values[:] = 0 tm.assert_frame_equal(df, exp) - df['A'].loc[2000, 4] = 1 - exp['A'].loc[2000, 4].values[:] = 1 + df["A"].loc[2000, 4] = 1 + exp["A"].loc[2000, 4].values[:] = 1 tm.assert_frame_equal(df, exp) df.loc[2000] = 5 @@ -126,19 +132,18 @@ def test_partial_set( tm.assert_frame_equal(df, exp) # this works...for now - df['A'].iloc[14] = 5 - 
assert df['A'][14] == 5 + df["A"].iloc[14] = 5 + assert df["A"][14] == 5 # --------------------------------------------------------------------- # AMBIGUOUS CASES! - def test_partial_loc_missing( - self, multiindex_year_month_day_dataframe_random_data): + def test_partial_loc_missing(self, multiindex_year_month_day_dataframe_random_data): pytest.skip("skipping for now") ymd = multiindex_year_month_day_dataframe_random_data result = ymd.loc[2000, 0] - expected = ymd.loc[2000]['A'] + expected = ymd.loc[2000]["A"] tm.assert_series_equal(result, expected) # need to put in some work here @@ -158,28 +163,28 @@ def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data expected = frame.copy() result = frame.copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_frame_equal(result, expected) expected = frame.copy() result = frame.copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_frame_equal(result, expected) - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_series_equal(result, expected) - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_set_ops.py index 6c7d209333d62e..5d0bc61e9957c3 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_set_ops.py @@ -5,22 +5,22 @@ class TestMultiIndexSetOps: - def test_multiindex_symmetric_difference(self): # GH 13490 - idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], - names=['a', 'b']) + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) result = idx ^ idx assert result.names == idx.names - idx2 = idx.copy().rename(['A', 'B']) + idx2 = idx.copy().rename(["A", "B"]) result = idx ^ idx2 assert result.names == [None, None] def test_mixed_depth_insert(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) @@ -28,15 +28,14 @@ def test_mixed_depth_insert(self): result = df.copy() expected = df.copy() - result['b'] = [1, 2, 3, 4] - expected['b', '', ''] = [1, 2, 3, 4] + result["b"] = [1, 2, 3, 4] + expected["b", "", ""] = [1, 2, 3, 4] tm.assert_frame_equal(result, expected) def test_dataframe_insert_column_all_na(self): # GH #1534 - mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') - ]) + mix = MultiIndex.from_tuples([("1a", "2a"), ("1a", "2b"), ("1a", "2c")]) df = DataFrame([[1, 2], [3, 4], [5, 6]], 
index=mix) s = Series({(1, 1): 1, (1, 2): 2}) - df['new'] = s - assert df['new'].isna().all() + df["new"] = s + assert df["new"].isna().all() diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 44aae4cd55e352..261d2e9c04e773 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -5,19 +5,17 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna) +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna import pandas.core.common as com from pandas.util import testing as tm @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexSetItem: - def test_setitem_multiindex(self): with catch_warnings(record=True): - for index_fn in ('ix', 'loc'): + for index_fn in ("ix", "loc"): def assert_equal(a, b): assert a == b @@ -29,136 +27,168 @@ def check(target, indexers, value, compare_fn, expected=None): if expected is None: expected = value compare_fn(result, expected) + # GH7190 - index = MultiIndex.from_product([np.arange(0, 100), - np.arange(0, 80)], - names=['time', 'firm']) + index = MultiIndex.from_product( + [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"] + ) t, n = 0, 2 - df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=0, - compare_fn=assert_equal) - - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=1, - compare_fn=assert_equal) - - df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=2, - compare_fn=assert_equal) + df = DataFrame( + np.nan, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, indexers=((t, n), "X"), value=0, compare_fn=assert_equal + ) + + df = DataFrame( + -999, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, indexers=((t, n), "X"), value=1, compare_fn=assert_equal + ) + + df = DataFrame( + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index + ) + check( + target=df, indexers=((t, n), "X"), value=2, compare_fn=assert_equal + ) # gh-7218: assigning with 0-dim arrays - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, - indexers=((t, n), 'X'), - value=np.array(3), - compare_fn=assert_equal, - expected=3, ) + df = DataFrame( + -999, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, + indexers=((t, n), "X"), + value=np.array(3), + compare_fn=assert_equal, + expected=3, + ) # GH5206 - df = DataFrame(np.arange(25).reshape(5, 5), - columns='A,B,C,D,E'.split(','), dtype=float) - df['F'] = 99 - row_selection = df['A'] % 2 == 0 - col_selection = ['B', 'C'] + df = DataFrame( + np.arange(25).reshape(5, 5), + columns="A,B,C,D,E".split(","), + dtype=float, + ) + df["F"] = 99 + row_selection = df["A"] % 2 == 0 + col_selection = ["B", "C"] with catch_warnings(record=True): - df.ix[row_selection, col_selection] = df['F'] - output = DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) + df.ix[row_selection, col_selection] = df["F"] + output = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"]) with catch_warnings(record=True): - 
tm.assert_frame_equal(df.ix[row_selection, col_selection], - output) - check(target=df, - indexers=(row_selection, col_selection), - value=df['F'], - compare_fn=tm.assert_frame_equal, - expected=output, ) + tm.assert_frame_equal(df.ix[row_selection, col_selection], output) + check( + target=df, + indexers=(row_selection, col_selection), + value=df["F"], + compare_fn=tm.assert_frame_equal, + expected=output, + ) # GH11372 - idx = MultiIndex.from_product([ - ['A', 'B', 'C'], - date_range('2015-01-01', '2015-04-01', freq='MS')]) - cols = MultiIndex.from_product([ - ['foo', 'bar'], - date_range('2016-01-01', '2016-02-01', freq='MS')]) + idx = MultiIndex.from_product( + [["A", "B", "C"], date_range("2015-01-01", "2015-04-01", freq="MS")] + ) + cols = MultiIndex.from_product( + [["foo", "bar"], date_range("2016-01-01", "2016-02-01", freq="MS")] + ) - df = DataFrame(np.random.random((12, 4)), - index=idx, columns=cols) + df = DataFrame(np.random.random((12, 4)), index=idx, columns=cols) subidx = MultiIndex.from_tuples( - [('A', Timestamp('2015-01-01')), - ('A', Timestamp('2015-02-01'))]) + [("A", Timestamp("2015-01-01")), ("A", Timestamp("2015-02-01"))] + ) subcols = MultiIndex.from_tuples( - [('foo', Timestamp('2016-01-01')), - ('foo', Timestamp('2016-02-01'))]) - - vals = DataFrame(np.random.random((2, 2)), - index=subidx, columns=subcols) - check(target=df, - indexers=(subidx, subcols), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # set all columns + [("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01"))] + ) + vals = DataFrame( - np.random.random((2, 4)), index=subidx, columns=cols) - check(target=df, - indexers=(subidx, slice(None, None, None)), - value=vals, - compare_fn=tm.assert_frame_equal, ) + np.random.random((2, 2)), index=subidx, columns=subcols + ) + check( + target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # set all columns + vals = DataFrame(np.random.random((2, 4)), index=subidx, columns=cols) + check( + target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, + ) # identity copy = df.copy() - check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=tm.assert_frame_equal, expected=copy) + check( + target=df, + indexers=(df.index, df.columns), + value=df, + compare_fn=tm.assert_frame_equal, + expected=copy, + ) def test_multiindex_setitem(self): # GH 3738 # setting with a multi-index right hand side - arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), - np.array(['one', 'two', 'one', 'one', 'two', 'one']), - np.arange(0, 6, 1)] + arrays = [ + np.array(["bar", "bar", "baz", "qux", "qux", "bar"]), + np.array(["one", "two", "one", "one", "two", "one"]), + np.arange(0, 6, 1), + ] - df_orig = DataFrame(np.random.randn(6, 3), index=arrays, - columns=['A', 'B', 'C']).sort_index() + df_orig = DataFrame( + np.random.randn(6, 3), index=arrays, columns=["A", "B", "C"] + ).sort_index() - expected = df_orig.loc[['bar']] * 2 + expected = df_orig.loc[["bar"]] * 2 df = df_orig.copy() - df.loc[['bar']] *= 2 - tm.assert_frame_equal(df.loc[['bar']], expected) + df.loc[["bar"]] *= 2 + tm.assert_frame_equal(df.loc[["bar"]], expected) # raise because these have differing levels with pytest.raises(TypeError): - df.loc['bar'] *= 2 + df.loc["bar"] *= 2 # from SO # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation - df_orig = DataFrame.from_dict({'price': { - ('DE', 'Coal', 'Stock'): 2, - ('DE', 'Gas', 
'Stock'): 4, - ('DE', 'Elec', 'Demand'): 1, - ('FR', 'Gas', 'Stock'): 5, - ('FR', 'Solar', 'SupIm'): 0, - ('FR', 'Wind', 'SupIm'): 0 - }}) - df_orig.index = MultiIndex.from_tuples(df_orig.index, - names=['Sit', 'Com', 'Type']) + df_orig = DataFrame.from_dict( + { + "price": { + ("DE", "Coal", "Stock"): 2, + ("DE", "Gas", "Stock"): 4, + ("DE", "Elec", "Demand"): 1, + ("FR", "Gas", "Stock"): 5, + ("FR", "Solar", "SupIm"): 0, + ("FR", "Wind", "SupIm"): 0, + } + } + ) + df_orig.index = MultiIndex.from_tuples( + df_orig.index, names=["Sit", "Com", "Type"] + ) expected = df_orig.copy() expected.iloc[[0, 2, 3]] *= 2 idx = pd.IndexSlice df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], :] *= 2 + df.loc[idx[:, :, "Stock"], :] *= 2 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], 'price'] *= 2 + df.loc[idx[:, :, "Stock"], "price"] *= 2 tm.assert_frame_equal(df, expected) def test_multiindex_assignment(self): @@ -166,60 +196,66 @@ def test_multiindex_assignment(self): # GH3777 part 2 # mixed dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - df['d'] = np.nan - arr = np.array([0., 1.]) + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) + df["d"] = np.nan + arr = np.array([0.0, 1.0]) with catch_warnings(record=True): - df.ix[4, 'd'] = arr - tm.assert_series_equal(df.ix[4, 'd'], - Series(arr, index=[8, 10], name='d')) + df.ix[4, "d"] = arr + tm.assert_series_equal(df.ix[4, "d"], Series(arr, index=[8, 10], name="d")) # single dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) with catch_warnings(record=True): - df.ix[4, 'c'] = arr - exp = Series(arr, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + df.ix[4, "c"] = arr + exp = Series(arr, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.ix[4, "c"], exp) # scalar ok with catch_warnings(record=True): - df.ix[4, 'c'] = 10 - exp = Series(10, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + df.ix[4, "c"] = 10 + exp = Series(10, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.ix[4, "c"], exp) # invalid assignments with pytest.raises(ValueError): with catch_warnings(record=True): - df.ix[4, 'c'] = [0, 1, 2, 3] + df.ix[4, "c"] = [0, 1, 2, 3] with pytest.raises(ValueError): with catch_warnings(record=True): - df.ix[4, 'c'] = [0] + df.ix[4, "c"] = [0] # groupby example NUM_ROWS = 100 NUM_COLS = 10 - col_names = ['A' + num for num in - map(str, np.arange(NUM_COLS).tolist())] + col_names = ["A" + num for num in map(str, np.arange(NUM_COLS).tolist())] index_cols = col_names[:5] - df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), - dtype=np.int64, columns=col_names) + df = DataFrame( + np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, + columns=col_names, + ) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) - df['new_col'] = np.nan + df["new_col"] = np.nan f_index = np.arange(5) def f(name, df2): - return Series(np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) + return Series(np.arange(df2.shape[0]), name=df2.index.values[0]).reindex( + f_index + ) # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -229,12 +265,11 @@ def f(name, df2): for name, df2 in grp: new_vals = np.arange(df2.shape[0]) with catch_warnings(record=True): - df.ix[name, 'new_col'] = new_vals + df.ix[name, "new_col"] = new_vals - def test_series_setitem( - self, multiindex_year_month_day_dataframe_random_data): + def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] + s = ymd["A"] s[2000, 3] = np.nan assert isna(s.values[42:65]).all() @@ -244,8 +279,7 @@ def test_series_setitem( s[2000, 3, 10] = np.nan assert isna(s[49]) - def test_frame_getitem_setitem_boolean( - self, multiindex_dataframe_random_data): + def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T.copy() values = df.values @@ -267,88 +301,90 @@ def test_frame_getitem_setitem_boolean( np.putmask(values[:-1], values[:-1] < 0, 2) tm.assert_almost_equal(df.values, values) - with pytest.raises(TypeError, match='boolean values only'): + with pytest.raises(TypeError, match="boolean values only"): df[df * 0] = 2 def test_frame_getitem_setitem_multislice(self): - levels = [['t1', 't2'], ['a', 'b', 'c']] + levels = [["t1", "t2"], ["a", "b", "c"]] codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id']) - df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) + midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"]) + df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx) - result = df.loc[:, 'value'] - tm.assert_series_equal(df['value'], result) + result = df.loc[:, "value"] + tm.assert_series_equal(df["value"], result) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = df.ix[:, 'value'] - tm.assert_series_equal(df['value'], result) + result = df.ix[:, "value"] + tm.assert_series_equal(df["value"], result) - result = df.loc[df.index[1:3], 'value'] - tm.assert_series_equal(df['value'][1:3], result) + result = df.loc[df.index[1:3], "value"] + tm.assert_series_equal(df["value"][1:3], result) result = df.loc[:, :] tm.assert_frame_equal(df, result) result = df - df.loc[:, 'value'] = 10 - result['value'] = 10 + df.loc[:, "value"] = 10 + result["value"] = 10 tm.assert_frame_equal(df, result) df.loc[:, :] = 10 tm.assert_frame_equal(df, result) def test_frame_setitem_multi_column(self): - df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], - [0, 1, 0, 1]]) + df = DataFrame(randn(10, 4), columns=[["a", "a", "b", "b"], [0, 1, 0, 1]]) cp = df.copy() - cp['a'] = cp['b'] - tm.assert_frame_equal(cp['a'], cp['b']) + cp["a"] = cp["b"] + tm.assert_frame_equal(cp["a"], cp["b"]) # set with ndarray cp = df.copy() - cp['a'] = cp['b'].values - tm.assert_frame_equal(cp['a'], cp['b']) + cp["a"] = cp["b"].values + tm.assert_frame_equal(cp["a"], cp["b"]) # --------------------------------------- # #1803 - columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) + columns = MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]) df = DataFrame(index=[1, 3, 5], columns=columns) # Works, but adds a column instead of updating the two existing ones - df['A'] = 0.0 # Doesn't work - assert (df['A'].values == 0).all() + df["A"] = 0.0 # Doesn't work + assert (df["A"].values == 0).all() # it broadcasts - df['B', '1'] = [1, 2, 3] - df['A'] = df['B', '1'] + df["B", "1"] = [1, 2, 3] + df["A"] = df["B", "1"] - sliced_a1 = df['A', '1'] - 
sliced_a2 = df['A', '2'] - sliced_b1 = df['B', '1'] + sliced_a1 = df["A", "1"] + sliced_a2 = df["A", "2"] + sliced_b1 = df["B", "1"] tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) - assert sliced_a1.name == ('A', '1') - assert sliced_a2.name == ('A', '2') - assert sliced_b1.name == ('B', '1') + assert sliced_a1.name == ("A", "1") + assert sliced_a2.name == ("A", "2") + assert sliced_b1.name == ("B", "1") def test_getitem_setitem_tuple_plus_columns( - self, multiindex_year_month_day_dataframe_random_data): + self, multiindex_year_month_day_dataframe_random_data + ): # GH #1013 ymd = multiindex_year_month_day_dataframe_random_data df = ymd[:5] - result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] - expected = df.loc[2000, 1, 6][['A', 'B', 'C']] + result = df.loc[(2000, 1, 6), ["A", "B", "C"]] + expected = df.loc[2000, 1, 6][["A", "B", "C"]] tm.assert_series_equal(result, expected) def test_getitem_setitem_slice_integers(self): - index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] + ) - frame = DataFrame(np.random.randn(len(index), 4), index=index, - columns=['a', 'b', 'c', 'd']) + frame = DataFrame( + np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"] + ) res = frame.loc[1:2] exp = frame.reindex(frame.index[2:]) tm.assert_frame_equal(res, exp) @@ -368,13 +404,13 @@ def test_getitem_setitem_slice_integers(self): def test_setitem_change_dtype(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data dft = frame.T - s = dft['foo', 'two'] - dft['foo', 'two'] = s > s.median() - tm.assert_series_equal(dft['foo', 'two'], s > s.median()) + s = dft["foo", "two"] + dft["foo", "two"] = s > s.median() + tm.assert_series_equal(dft["foo", "two"], s > s.median()) # assert isinstance(dft._data.blocks[1].items, MultiIndex) - reindexed = dft.reindex(columns=[('foo', 'two')]) - tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) + reindexed = dft.reindex(columns=[("foo", "two")]) + tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -383,31 +419,31 @@ def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() - col = frame['B'] + col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, 'B'] == 97).all() + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): - df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], - columns=list("ABCD")) + df = DataFrame( + [[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD") + ) - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) ix = MultiIndex.from_tuples([(1, 1)]) - df.loc[ix, "C"] = '_' + df.loc[ix, "C"] = "_" - assert (df.xs((1, 1))['C'] == '_').all() + assert (df.xs((1, 1))["C"] == "_").all() def test_astype_assignment_with_dups(self): # GH 4686 # assignment with dups that has a dtype change - cols = MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')]) - df = DataFrame(np.arange(3).reshape((1, 3)), - columns=cols, dtype=object) + cols = MultiIndex.from_tuples([("A", "1"), ("B", "1"), ("A", "2")]) + df = DataFrame(np.arange(3).reshape((1, 3)), columns=cols, dtype=object) index = 
df.index.copy() - df['A'] = df['A'].astype(np.float64) + df["A"] = df["A"].astype(np.float64) tm.assert_index_equal(df.index, index) @@ -415,8 +451,8 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): # this works because we are modifying the underlying array # really a no-no df = multiindex_dataframe_random_data.T - df['foo'].values[:] = 0 - assert (df['foo'].values == 0).all() + df["foo"].values[:] = 0 + assert (df["foo"].values == 0).all() def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): @@ -424,7 +460,7 @@ def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data.T msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - df['foo']['one'] = 2 + df["foo"]["one"] = 2 def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): @@ -433,7 +469,7 @@ def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): df = frame.copy() msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - df['foo']['one'] = 2 + df["foo"]["one"] = 2 result = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 2431f27bff78ab..421ca71428bcc7 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -14,40 +14,48 @@ @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexSlicers: - def test_per_axis_per_level_getitem(self): # GH6134 # example test case - ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( - 'C', 4), _mklbl('D', 2)]) + ix = MultiIndex.from_product( + [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)] + ) df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] tm.assert_frame_equal(result, expected) - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C2' or c == 'C3')]] - result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") + and (c == "C1" or c == "C2" or c == "C3") + ] + ] + result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :] tm.assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) df = DataFrame( - np.arange(16, 
dtype='int64').reshape( - 4, 4), index=index, columns=columns) + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) df = df.sort_index(axis=0).sort_index(axis=1) # identity @@ -68,19 +76,21 @@ def test_per_axis_per_level_getitem(self): tm.assert_frame_equal(result, expected) # columns - result = df.loc[:, (slice(None), ['foo'])] + result = df.loc[:, (slice(None), ["foo"])] expected = df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) # both - result = df.loc[(slice(None), 1), (slice(None), ['foo'])] + result = df.loc[(slice(None), 1), (slice(None), ["foo"])] expected = df.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(result, expected) - result = df.loc['A', 'a'] - expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), - index=Index([1, 2, 3], name='two'), - columns=Index(['bar', 'foo'], name='lvl1')) + result = df.loc["A", "a"] + expected = DataFrame( + dict(bar=[1, 5, 9], foo=[0, 4, 8]), + index=Index([1, 2, 3], name="two"), + columns=Index(["bar", "foo"], name="lvl1"), + ) tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1, 2]), :] @@ -89,15 +99,18 @@ def test_per_axis_per_level_getitem(self): # multi-level series s = Series(np.arange(len(ix.to_numpy())), index=ix) - result = s.loc['A1':'A3', :, ['C1', 'C3']] - expected = s.loc[[tuple([a, b, c, d]) - for a, b, c, d in s.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] + result = s.loc["A1":"A3", :, ["C1", "C3"]] + expected = s.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in s.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] tm.assert_series_equal(result, expected) # boolean indexers - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] expected = df.iloc[[2, 3]] tm.assert_frame_equal(result, expected) @@ -119,28 +132,39 @@ def test_per_axis_per_level_getitem(self): df = df.sort_index(level=1, axis=0) assert df.index.lexsort_depth == 0 - msg = ('MultiIndex slicing requires the index to be ' - r'lexsorted: slicing on levels \[1\], lexsort depth 0') + msg = ( + "MultiIndex slicing requires the index to be " + r"lexsorted: slicing on levels \[1\], lexsort depth 0" + ) with pytest.raises(UnsortedIndexError, match=msg): - df.loc[(slice(None), slice('bar')), :] + df.loc[(slice(None), slice("bar")), :] # GH 16734: not sorted, but no real slicing - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] tm.assert_frame_equal(result, df.iloc[[1, 3], :]) def test_multiindex_slicers_non_unique(self): # GH 7106 # non-unique mi index support - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 3], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 3], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) result = df.loc[(slice(None), slice(None), 1), :] tm.assert_frame_equal(result, expected) @@ -148,25 +172,56 @@ def test_multiindex_slicers_non_unique(self): result = df.xs(1, level=2, 
drop_level=False) tm.assert_frame_equal(result, expected) - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 2], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 2], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) result = df.loc[(slice(None), slice(None), 1), :] assert not result.index.is_unique tm.assert_frame_equal(result, expected) # GH12896 # numpy-implementation dependent bug - ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, - 17, 18, 19, 200000, 200000] + ints = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 12, + 13, + 14, + 14, + 16, + 17, + 18, + 19, + 200000, + 200000, + ] n = len(ints) - idx = MultiIndex.from_arrays([['a'] * n, ints]) + idx = MultiIndex.from_arrays([["a"] * n, ints]) result = Series([1] * n, index=idx) result = result.sort_index() result = result.loc[(slice(None), slice(100000))] @@ -178,100 +233,139 @@ def test_multiindex_slicers_datetimelike(self): # GH 7429 # buggy/inconsistent behavior when slicing with datetime-like import datetime - dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + - datetime.timedelta(days=i) for i in range(6)] + + dates = [ + datetime.datetime(2012, 1, 1, 12, 12, 12) + datetime.timedelta(days=i) + for i in range(6) + ] freq = [1, 2] - index = MultiIndex.from_product( - [dates, freq], names=['date', 'frequency']) + index = MultiIndex.from_product([dates, freq], names=["date", "frequency"]) df = DataFrame( - np.arange(6 * 2 * 4, dtype='int64').reshape( - -1, 4), index=index, columns=list('ABCD')) + np.arange(6 * 2 * 4, dtype="int64").reshape(-1, 4), + index=index, + columns=list("ABCD"), + ) # multi-axis slicing idx = pd.IndexSlice expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( - '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), 1), - slice('A', 'B')] + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + slice(1, 1), + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + idx[ + Timestamp("2012-01-01 12:12:12") : Timestamp("2012-01-03 12:12:12") + ], + idx[1:1], + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + 1, + ), + slice("A", "B"), + ] tm.assert_frame_equal(result, expected) # with strings - result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), - slice(1, 1)), slice('A', 'B')] + result = df.loc[ + (slice("2012-01-01 12:12:12", "2012-01-03 12:12:12"), slice(1, 1)), + slice("A", "B"), + ] tm.assert_frame_equal(result, expected) - result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), - idx['A', 'B']] 
+ result = df.loc[ + (idx["2012-01-01 12:12:12":"2012-01-03 12:12:12"], 1), idx["A", "B"] + ] tm.assert_frame_equal(result, expected) def test_multiindex_slicers_edges(self): # GH 8132 # various edge cases df = DataFrame( - {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, - 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, - 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", - "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", - "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", - "2013-07-09", "2013-08-06", "2013-09-03"], - 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) - - df['DATE'] = pd.to_datetime(df['DATE']) - df1 = df.set_index(['A', 'B', 'DATE']) + { + "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5, + "B": ["B0", "B0", "B1", "B1", "B2"] * 3, + "DATE": [ + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-09-03", + "2013-10-01", + "2013-07-09", + "2013-08-06", + "2013-09-03", + ], + "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2], + } + ) + + df["DATE"] = pd.to_datetime(df["DATE"]) + df1 = df.set_index(["A", "B", "DATE"]) df1 = df1.sort_index() # A1 - Get all values under "A0" and "A1" - result = df1.loc[(slice('A1')), :] + result = df1.loc[(slice("A1")), :] expected = df1.iloc[0:10] tm.assert_frame_equal(result, expected) # A2 - Get all values from the start to "A2" - result = df1.loc[(slice('A2')), :] + result = df1.loc[(slice("A2")), :] expected = df1 tm.assert_frame_equal(result, expected) # A3 - Get all values under "B1" or "B2" - result = df1.loc[(slice(None), slice('B1', 'B2')), :] + result = df1.loc[(slice(None), slice("B1", "B2")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] tm.assert_frame_equal(result, expected) # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), - slice('20130702', '20130709')), :] + result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected) # B1 - Get all values in B0 that are also under A0, A1 and A2 - result = df1.loc[(slice('A2'), slice('B0')), :] + result = df1.loc[(slice("A2"), slice("B0")), :] expected = df1.iloc[[0, 1, 5, 6, 10, 11]] tm.assert_frame_equal(result, expected) # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for # the As) - result = df1.loc[(slice(None), slice('B2')), :] + result = df1.loc[(slice(None), slice("B2")), :] expected = df1 tm.assert_frame_equal(result, expected) # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), - slice('2013-08-06')), :] + result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] tm.assert_frame_equal(result, expected) # B4 - Same as A4 but the start of the date slice is not a key. 
# shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), - slice('20130701', '20130709')), :] + result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected) @@ -281,92 +375,122 @@ def test_per_axis_per_level_doc_examples(self): idx = pd.IndexSlice # from indexing.rst / advanced - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, columns=columns) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx[:, :, ['C1', 'C3']], :] + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ["C1", "C3"]], :] tm.assert_frame_equal(result, expected) # not sorted with pytest.raises(UnsortedIndexError): - df.loc['A1', ('a', slice('foo'))] + df.loc["A1", ("a", slice("foo"))] # GH 16734: not sorted, but no real slicing - tm.assert_frame_equal(df.loc['A1', (slice(None), 'foo')], - df.loc['A1'].iloc[:, [0, 2]]) + tm.assert_frame_equal( + df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]] + ) df = df.sort_index(axis=1) # slicing - df.loc['A1', (slice(None), 'foo')] - df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] + df.loc["A1", (slice(None), "foo")] + df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")] # setitem - df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + df.loc(axis=0)[:, :, ["C1", "C3"]] = -10 def test_loc_axis_arguments(self): - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - 
index=index, - columns=columns).sort_index().sort_index(axis=1) + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = ( + DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + .sort_index() + .sort_index(axis=1) + ) # axis 0 - result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='index')[:, :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] + result = df.loc(axis=0)["A1":"A3", :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="index")[:, :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] tm.assert_frame_equal(result, expected) # axis 1 - result = df.loc(axis=1)[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] + result = df.loc(axis=1)[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] tm.assert_frame_equal(result, expected) - result = df.loc(axis='columns')[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] + result = df.loc(axis="columns")[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] tm.assert_frame_equal(result, expected) # invalid axis with pytest.raises(ValueError): - df.loc(axis=-1)[:, :, ['C1', 'C3']] + df.loc(axis=-1)[:, :, ["C1", "C3"]] with pytest.raises(ValueError): - df.loc(axis=2)[:, :, ['C1', 'C3']] + df.loc(axis=2)[:, :, ["C1", "C3"]] with pytest.raises(ValueError): - df.loc(axis='foo')[:, :, ['C1', 'C3']] + df.loc(axis="foo")[:, :, ["C1", "C3"]] def test_per_axis_per_level_setitem(self): @@ -374,16 +498,17 @@ def test_per_axis_per_level_setitem(self): idx = pd.IndexSlice # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) df_orig = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) # identity @@ -432,34 +557,35 @@ def test_per_axis_per_level_setitem(self): # columns df = df_orig.copy() - df.loc[:, (slice(None), ['foo'])] = 100 + df.loc[:, (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[:, [1, 3]] = 100 tm.assert_frame_equal(df, expected) # both df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 + df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, 
expected) df = df_orig.copy() - df.loc[idx[:, 1], idx[:, ['foo']]] = 100 + df.loc[idx[:, 1], idx[:, ["foo"]]] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc['A', 'a'] = 100 + df.loc["A", "a"] = 100 expected = df_orig.copy() expected.iloc[0:3, 0:2] = 100 tm.assert_frame_equal(df, expected) # setting with a list-like df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100, 100], [100, 100]], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100, 100], [100, 100]], dtype="int64" + ) expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) @@ -468,39 +594,44 @@ def test_per_axis_per_level_setitem(self): df = df_orig.copy() with pytest.raises(ValueError): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100], [100, 100]], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100], [100, 100]], dtype="int64" + ) with pytest.raises(ValueError): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [100, 100, 100, 100], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [100, 100, 100, 100], dtype="int64" + ) # with an alignable rhs df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice( - None), 1), (slice(None), ['foo'])] * 5 + df.loc[(slice(None), 1), (slice(None), ["foo"])] = ( + df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5 + ) expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( - None), 1), (slice(None), ['foo'])] + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[ + (slice(None), 1), (slice(None), ["foo"]) + ] expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(df, expected) - rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() - rhs.loc[:, ('c', 'bah')] = 10 + rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy() + rhs.loc[:, ("c", "bah")] = 10 df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(df, expected) def test_multiindex_label_slicing_with_negative_step(self): - s = Series(np.arange(20), - MultiIndex.from_product([list('abcde'), np.arange(4)])) + s = Series( + np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)]) + ) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -511,46 +642,45 @@ def assert_slices_equivalent(l_slc, i_slc): assert_slices_equivalent(SLC[::-1], SLC[::-1]) - assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) + assert_slices_equivalent(SLC["d"::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[("d",)::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) - assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:"d":-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:("d",):-1], SLC[:11:-1]) - assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['d':('b', ):-1], 
SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) + assert_slices_equivalent(SLC["d":"b":-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):"b":-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC["d":("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC["b":"d":-1], SLC[:0]) - assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) - assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) - assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) + assert_slices_equivalent(SLC[("c", 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:("c", 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]) def test_multiindex_slice_first_level(self): # GH 12697 - freq = ['a', 'b', 'c', 'd'] + freq = ["a", "b", "c", "d"] idx = MultiIndex.from_product([freq, np.arange(500)]) - df = DataFrame(list(range(2000)), index=idx, columns=['Test']) + df = DataFrame(list(range(2000)), index=idx, columns=["Test"]) df_slice = df.loc[pd.IndexSlice[:, 30:70], :] - result = df_slice.loc['a'] - expected = DataFrame(list(range(30, 71)), - columns=['Test'], index=range(30, 71)) + result = df_slice.loc["a"] + expected = DataFrame(list(range(30, 71)), columns=["Test"], index=range(30, 71)) tm.assert_frame_equal(result, expected) - result = df_slice.loc['d'] - expected = DataFrame(list(range(1530, 1571)), - columns=['Test'], index=range(30, 71)) + result = df_slice.loc["d"] + expected = DataFrame( + list(range(1530, 1571)), columns=["Test"], index=range(30, 71) + ) tm.assert_frame_equal(result, expected) - def test_int_series_slicing( - self, multiindex_year_month_day_dataframe_random_data): + def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] + s = ymd["A"] result = s[5:] expected = s.reindex(s.index[5:]) tm.assert_series_equal(result, expected) - exp = ymd['A'].copy() + exp = ymd["A"].copy() s[5:] = 0 exp.values[5:] = 0 tm.assert_numpy_array_equal(s.values, exp.values) @@ -562,16 +692,16 @@ def test_int_series_slicing( def test_non_reducing_slice_on_multiindex(self): # GH 19861 dic = { - ('a', 'd'): [1, 4], - ('a', 'c'): [2, 3], - ('b', 'c'): [3, 2], - ('b', 'd'): [4, 1] + ("a", "d"): [1, 4], + ("a", "c"): [2, 3], + ("b", "c"): [3, 2], + ("b", "d"): [4, 1], } df = pd.DataFrame(dic, index=[0, 1]) idx = pd.IndexSlice - slice_ = idx[:, idx['b', 'd']] + slice_ = idx[:, idx["b", "d"]] tslice_ = _non_reducing_slice(slice_) result = df.loc[tslice_] - expected = pd.DataFrame({('b', 'd'): [4, 1]}) + expected = pd.DataFrame({("b", "d"): [4, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 7fb6c806ae793a..43ad66b7d11168 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -8,14 +8,15 @@ class TestMultiIndexSorted: def test_getitem_multilevel_index_tuple_not_sorted(self): index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) + df = DataFrame( + [[0, 1, 0, "x"], [0, 0, 1, "y"]], columns=index_columns + ["data"] + ) df = df.set_index(index_columns) query_index = df.index[:1] rs = df.loc[query_index, "data"] - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], 
index=xp_idx, name='data') + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=["a", "b", "c"]) + xp = Series(["x"], index=xp_idx, name="data") tm.assert_series_equal(rs, xp) def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): @@ -23,21 +24,25 @@ def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): df = frame.sort_index(level=1).T # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] + result = df.iloc[:, : np.int32(3)] expected = df.reindex(columns=df.columns[:3]) tm.assert_frame_equal(result, expected) def test_frame_getitem_not_sorted2(self): # 13431 - df = DataFrame({'col1': ['b', 'd', 'b', 'a'], - 'col2': [3, 1, 1, 2], - 'data': ['one', 'two', 'three', 'four']}) - - df2 = df.set_index(['col1', 'col2']) + df = DataFrame( + { + "col1": ["b", "d", "b", "a"], + "col2": [3, 1, 1, 2], + "data": ["one", "two", "three", "four"], + } + ) + + df2 = df.set_index(["col1", "col2"]) df2_original = df2.copy() - df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True) + df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic @@ -54,37 +59,39 @@ def test_frame_getitem_not_sorted2(self): def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T - df['foo', 'four'] = 'foo' + df["foo", "four"] = "foo" arrays = [np.array(x) for x in zip(*df.columns.values)] - result = df['foo'] - result2 = df.loc[:, 'foo'] - expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + result = df["foo"] + result2 = df.loc[:, "foo"] + expected = df.reindex(columns=df.columns[arrays[0] == "foo"]) expected.columns = expected.columns.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) df = df.T - result = df.xs('foo') - result2 = df.loc['foo'] - expected = df.reindex(df.index[arrays[0] == 'foo']) + result = df.xs("foo") + result2 = df.loc["foo"] + expected = df.reindex(df.index[arrays[0] == "foo"]) expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) def test_series_getitem_not_sorted(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) arrays = [np.array(x) for x in zip(*index.values)] - result = s['qux'] - result2 = s.loc['qux'] - expected = s[arrays[0] == 'qux'] + result = s["qux"] + result2 = s.loc["qux"] + expected = s[arrays[0] == "qux"] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index bbc55c75c5b77b..5e58b3ec155ed9 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -10,26 +10,36 @@ @pytest.fixture def four_level_index_dataframe(): - arr = np.array([[-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.358, 0.89145, 2.5838]]) + arr = np.array( + [ + 
[-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], + ] + ) index = MultiIndex( - levels=[['a', 'x'], ['b', 'q'], [10.0032, 20.0, 30.0], [3, 4, 5]], + levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], - names=['one', 'two', 'three', 'four']) - return DataFrame(arr, index=index, columns=list('ABCDE')) - - -@pytest.mark.parametrize('key, level, exp_arr, exp_index', [ - ('a', 'lvl0', lambda x: x[:, 0:2], Index(['bar', 'foo'], name='lvl1')), - ('foo', 'lvl1', lambda x: x[:, 1:2], Index(['a'], name='lvl0')) -]) + names=["one", "two", "three", "four"], + ) + return DataFrame(arr, index=index, columns=list("ABCDE")) + + +@pytest.mark.parametrize( + "key, level, exp_arr, exp_index", + [ + ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), + ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), + ], +) def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): # see gh-2903 arr = np.random.randn(4, 4) - index = MultiIndex(levels=[['a', 'b'], ['bar', 'foo', 'hello', 'world']], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]], - names=['lvl0', 'lvl1']) + index = MultiIndex( + levels=[["a", "b"], ["bar", "foo", "hello", "world"]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + names=["lvl0", "lvl1"], + ) df = DataFrame(arr, columns=index) result = df.xs(key, level=level, axis=1) expected = DataFrame(exp_arr(arr), columns=exp_index) @@ -38,15 +48,15 @@ def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): def test_xs_values(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs(('bar', 'two')).values + result = df.xs(("bar", "two")).values expected = df.values[4] tm.assert_almost_equal(result, expected) def test_xs_loc_equality(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs(('bar', 'two')) - expected = df.loc[('bar', 'two')] + result = df.xs(("bar", "two")) + expected = df.loc[("bar", "two")] tm.assert_series_equal(result, expected) @@ -54,33 +64,31 @@ def test_xs_missing_values_in_index(): # see gh-6574 # missing values in returned index should be preserved acc = [ - ('a', 'abcde', 1), - ('b', 'bbcde', 2), - ('y', 'yzcde', 25), - ('z', 'xbcde', 24), - ('z', None, 26), - ('z', 'zbcde', 25), - ('z', 'ybcde', 26), + ("a", "abcde", 1), + ("b", "bbcde", 2), + ("y", "yzcde", 25), + ("z", "xbcde", 24), + ("z", None, 26), + ("z", "zbcde", 25), + ("z", "ybcde", 26), ] - df = DataFrame(acc, - columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) - expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( - ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) + df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) + expected = DataFrame( + {"cnt": [24, 26, 25, 26]}, + index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), + ) - result = df.xs('z', level='a1') + result = df.xs("z", level="a1") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('key, level', [ - ('one', 'second'), - (['one'], ['second']) -]) +@pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): # see gh-13719 frame = multiindex_dataframe_random_data df = concat([frame] * 2) assert df.index.is_unique is False - expected = concat([frame.xs('one', level='second')] * 2) + expected = concat([frame.xs("one", level="second")] * 2) result = df.xs(key, 
level=level) tm.assert_frame_equal(result, expected) @@ -88,36 +96,38 @@ def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): def test_xs_level(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs('two', level='second') - expected = df[df.index.get_level_values(1) == 'two'] - expected.index = Index(['foo', 'bar', 'baz', 'qux'], name='first') + result = df.xs("two", level="second") + expected = df[df.index.get_level_values(1) == "two"] + expected.index = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_frame_equal(result, expected) def test_xs_level_eq_2(): arr = np.random.randn(3, 5) index = MultiIndex( - levels=[['a', 'p', 'x'], ['b', 'q', 'y'], ['c', 'r', 'z']], - codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]]) + levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], + codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], + ) df = DataFrame(arr, index=index) - expected = DataFrame(arr[1:2], index=[['a'], ['b']]) - result = df.xs('c', level=2) + expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) + result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda df: df.xs(('a', 4), level=['one', 'four']), - lambda df: df.xs('a').xs(4, level='four') -]) +@pytest.mark.parametrize( + "indexer", + [ + lambda df: df.xs(("a", 4), level=["one", "four"]), + lambda df: df.xs("a").xs(4, level="four"), + ], +) def test_xs_level_multiple(indexer, four_level_index_dataframe): df = four_level_index_dataframe expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] expected_index = MultiIndex( - levels=[['q'], [20.0]], - codes=[[0], [0]], - names=['two', 'three']) - expected = DataFrame( - expected_values, index=expected_index, columns=list('ABCDE')) + levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) result = indexer(df) tm.assert_frame_equal(result, expected) @@ -125,11 +135,11 @@ def test_xs_level_multiple(indexer, four_level_index_dataframe): def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): # this is a copy in 0.14 df = multiindex_dataframe_random_data - result = df.xs('two', level='second') + result = df.xs("two", level="second") # setting this will give a SettingWithCopyError # as we are trying to write a view - msg = 'A value is trying to be set on a copy of a slice from a DataFrame' + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): result[:] = 10 @@ -137,11 +147,11 @@ def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): # this is a copy in 0.14 df = four_level_index_dataframe - result = df.xs(('a', 4), level=['one', 'four']) + result = df.xs(("a", 4), level=["one", "four"]) # setting this will give a SettingWithCopyError # as we are trying to write a view - msg = 'A value is trying to be set on a copy of a slice from a DataFrame' + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): result[:] = 10 @@ -149,32 +159,32 @@ def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): def test_xs_integer_key(): # see gh-2107 dates = range(20111201, 20111205) - ids = 'abcde' + ids = "abcde" index = MultiIndex.from_tuples( - [x for x in product(dates, ids)], - names=['date', 'secid']) - df = DataFrame( - 
np.random.randn(len(index), 3), index, ['X', 'Y', 'Z']) + [x for x in product(dates, ids)], names=["date", "secid"] + ) + df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) - result = df.xs(20111201, level='date') + result = df.xs(20111201, level="date") expected = df.loc[20111201, :] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda df: df.xs('a', level=0), - lambda df: df.xs('a') -]) +@pytest.mark.parametrize( + "indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")] +) def test_xs_level0(indexer, four_level_index_dataframe): df = four_level_index_dataframe - expected_values = [[-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] + expected_values = [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + ] expected_index = MultiIndex( - levels=[['b', 'q'], [10.0032, 20.0], [4, 5]], + levels=[["b", "q"], [10.0032, 20.0], [4, 5]], codes=[[0, 1], [0, 1], [1, 0]], - names=['two', 'three', 'four']) - expected = DataFrame( - expected_values, index=expected_index, columns=list('ABCDE')) + names=["two", "three", "four"], + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) result = indexer(df) tm.assert_frame_equal(result, expected) @@ -184,9 +194,9 @@ def test_xs_level_series(multiindex_dataframe_random_data): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor df = multiindex_dataframe_random_data - s = df['A'] - result = s[:, 'two'] - expected = df.xs('two', level=1)['A'] + s = df["A"] + result = s[:, "two"] + expected = df.xs("two", level=1)["A"] tm.assert_series_equal(result, expected) @@ -194,44 +204,46 @@ def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor df = multiindex_year_month_day_dataframe_random_data - s = df['A'] + s = df["A"] result = s[2000, 5] - expected = df.loc[2000, 5]['A'] + expected = df.loc[2000, 5]["A"] tm.assert_series_equal(result, expected) def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data): + multiindex_year_month_day_dataframe_random_data +): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor # not implementing this for now df = multiindex_year_month_day_dataframe_random_data - s = df['A'] + s = df["A"] - msg = r'\(2000, slice\(3, 4, None\)\)' + msg = r"\(2000, slice\(3, 4, None\)\)" with pytest.raises(TypeError, match=msg): s[2000, 3:4] def test_series_getitem_multiindex_xs(): # GH6258 - dt = list(date_range('20130903', periods=3)) - idx = MultiIndex.from_product([list('AB'), dt]) + dt = list(date_range("20130903", periods=3)) + idx = MultiIndex.from_product([list("AB"), dt]) s = Series([1, 3, 4, 1, 3, 4], index=idx) - expected = Series([1, 1], index=list('AB')) + expected = Series([1, 1], index=list("AB")) - result = s.xs('20130903', level=1) + result = s.xs("20130903", level=1) tm.assert_series_equal(result, expected) def test_series_getitem_multiindex_xs_by_label(): # GH5684 - idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')]) + idx = MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] + ) s = Series([1, 2, 3, 4], index=idx) - s.index.set_names(['L1', 'L2'], inplace=True) - expected = Series([1, 3], index=['a', 'b']) - expected.index.set_names(['L1'], 
inplace=True) + s.index.set_names(["L1", "L2"], inplace=True) + expected = Series([1, 3], index=["a", "b"]) + expected.index.set_names(["L1"], inplace=True) - result = s.xs('one', level='L2') + result = s.xs("one", level="L2") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 657309170cac31..78aaf80b532fb0 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -5,11 +5,9 @@ class TestIndexingCallable: - def test_frame_loc_callable(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'), - 'C': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) # iloc cannot use boolean Series (see GH3635) # return bool indexer @@ -19,168 +17,161 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2, ] - tm.assert_frame_equal(res, df.loc[df.A > 2, ]) + res = df.loc[lambda x: x.A > 2,] + tm.assert_frame_equal(res, df.loc[df.A > 2,]) - res = df.loc[lambda x: x.A > 2, ] - tm.assert_frame_equal(res, df.loc[df.A > 2, ]) + res = df.loc[lambda x: x.A > 2,] + tm.assert_frame_equal(res, df.loc[df.A > 2,]) - res = df.loc[lambda x: x.B == 'b', :] - tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - res = df.loc[lambda x: x.B == 'b', :] - tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - res = df.loc[lambda x: x.A > 2, lambda x: 'B'] - tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - res = df.loc[lambda x: x.A > 2, lambda x: 'B'] - tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) - res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) # scalar - res = df.loc[lambda x: 1, lambda x: 'A'] - assert res == df.loc[1, 'A'] + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] - res = df.loc[lambda 
x: 1, lambda x: 'A'] - assert res == df.loc[1, 'A'] + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] def test_frame_loc_callable_mixture(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'), - 'C': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) - res = df.loc[lambda x: x.A > 2, ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A > 2, ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[[2, 3], lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - res = df.loc[[2, 3], lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - res = df.loc[3, lambda x: ['A', 'B']] - tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) - res = df.loc[3, lambda x: ['A', 'B']] - tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) def test_frame_loc_callable_labels(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return label - res = df.loc[lambda x: ['A', 'C']] - tm.assert_frame_equal(res, df.loc[['A', 'C']]) + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ['A', 'C'], ] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ]) + res = df.loc[lambda x: ["A", "C"],] + tm.assert_frame_equal(res, df.loc[["A", "C"],]) - res = df.loc[lambda x: ['A', 'C'], :] - tm.assert_frame_equal(res, df.loc[['A', 'C'], :]) + res = df.loc[lambda x: ["A", "C"], :] + tm.assert_frame_equal(res, df.loc[["A", "C"], :]) - res = df.loc[lambda x: ['A', 'C'], lambda x: 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[lambda x: ["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[lambda x: ['A', 'C'], lambda x: ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[lambda x: ["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) # mixture - res = df.loc[['A', 'C'], lambda x: 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[['A', 'C'], lambda x: ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) - res = df.loc[lambda x: ['A', 'C'], 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[lambda x: ["A", "C"], "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[lambda x: ['A', 'C'], ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[lambda x: ["A", "C"], ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) def 
test_frame_loc_callable_setitem(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return label res = df.copy() - res.loc[lambda x: ['A', 'C']] = -20 + res.loc[lambda x: ["A", "C"]] = -20 exp = df.copy() - exp.loc[['A', 'C']] = -20 + exp.loc[["A", "C"]] = -20 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], :] = 20 + res.loc[lambda x: ["A", "C"], :] = 20 exp = df.copy() - exp.loc[['A', 'C'], :] = 20 + exp.loc[["A", "C"], :] = 20 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], lambda x: 'X'] = -1 + res.loc[lambda x: ["A", "C"], lambda x: "X"] = -1 exp = df.copy() - exp.loc[['A', 'C'], 'X'] = -1 + exp.loc[["A", "C"], "X"] = -1 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], lambda x: ['X']] = [5, 10] + res.loc[lambda x: ["A", "C"], lambda x: ["X"]] = [5, 10] exp = df.copy() - exp.loc[['A', 'C'], ['X']] = [5, 10] + exp.loc[["A", "C"], ["X"]] = [5, 10] tm.assert_frame_equal(res, exp) # mixture res = df.copy() - res.loc[['A', 'C'], lambda x: 'X'] = np.array([-1, -2]) + res.loc[["A", "C"], lambda x: "X"] = np.array([-1, -2]) exp = df.copy() - exp.loc[['A', 'C'], 'X'] = np.array([-1, -2]) + exp.loc[["A", "C"], "X"] = np.array([-1, -2]) tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[['A', 'C'], lambda x: ['X']] = 10 + res.loc[["A", "C"], lambda x: ["X"]] = 10 exp = df.copy() - exp.loc[['A', 'C'], ['X']] = 10 + exp.loc[["A", "C"], ["X"]] = 10 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], 'X'] = -2 + res.loc[lambda x: ["A", "C"], "X"] = -2 exp = df.copy() - exp.loc[['A', 'C'], 'X'] = -2 + exp.loc[["A", "C"], "X"] = -2 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], ['X']] = -4 + res.loc[lambda x: ["A", "C"], ["X"]] = -4 exp = df.copy() - exp.loc[['A', 'C'], ['X']] = -4 + exp.loc[["A", "C"], ["X"]] = -4 tm.assert_frame_equal(res, exp) def test_frame_iloc_callable(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return location res = df.iloc[lambda x: [1, 3]] @@ -210,9 +201,7 @@ def test_frame_iloc_callable(self): def test_frame_iloc_callable_setitem(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return location res = df.copy() diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1ec89af42a1e15..3549d81623e107 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -6,67 +6,84 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Index, Interval, Series, - Timestamp) + Categorical, + CategoricalIndex, + DataFrame, + Index, + Interval, + Series, + Timestamp, +) from pandas.api.types import CategoricalDtype as CDT from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestCategoricalIndex: - def setup_method(self, method): - self.df = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')).astype( - CDT(list('cab')))}).set_index('B') - self.df2 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': 
Series(list('aabbca')).astype( - CDT(list('cabe')))}).set_index('B') - self.df3 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': (Series([1, 1, 2, 1, 3, 2]) - .astype(CDT([3, 2, 1], ordered=True))) - }).set_index('B') - self.df4 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': (Series([1, 1, 2, 1, 3, 2]) - .astype(CDT([3, 2, 1], ordered=False))) - }).set_index('B') + self.df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cab"))), + } + ).set_index("B") + self.df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cabe"))), + } + ).set_index("B") + self.df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=True))), + } + ).set_index("B") + self.df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), + } + ).set_index("B") def test_loc_scalar(self): - result = self.df.loc['a'] - expected = (DataFrame({'A': [0, 1, 5], - 'B': (Series(list('aaa')) - .astype(CDT(list('cab'))))}) - .set_index('B')) + result = self.df.loc["a"] + expected = DataFrame( + {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))} + ).set_index("B") assert_frame_equal(result, expected) df = self.df.copy() - df.loc['a'] = 20 - expected = (DataFrame({'A': [20, 20, 2, 3, 4, 20], - 'B': (Series(list('aabbca')) - .astype(CDT(list('cab'))))}) - .set_index('B')) + df.loc["a"] = 20 + expected = DataFrame( + { + "A": [20, 20, 2, 3, 4, 20], + "B": (Series(list("aabbca")).astype(CDT(list("cab")))), + } + ).set_index("B") assert_frame_equal(df, expected) # value not in the categories with pytest.raises(KeyError, match=r"^'d'$"): - df.loc['d'] + df.loc["d"] msg = "cannot append a non-category item to a CategoricalIndex" with pytest.raises(TypeError, match=msg): - df.loc['d'] = 10 + df.loc["d"] = 10 - msg = ("cannot insert an item into a CategoricalIndex that is not" - " already an existing category") + msg = ( + "cannot insert an item into a CategoricalIndex that is not" + " already an existing category" + ) with pytest.raises(TypeError, match=msg): - df.loc['d', 'A'] = 10 + df.loc["d", "A"] = 10 with pytest.raises(TypeError, match=msg): - df.loc['d', 'C'] = 10 + df.loc["d", "C"] = 10 def test_getitem_scalar(self): - cats = Categorical([Timestamp('12-31-1999'), - Timestamp('12-31-2000')]) + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) s = Series([1, 2], index=cats) @@ -79,7 +96,7 @@ def test_slicing_directly(self): sliced = cat[3] assert sliced == "d" sliced = cat[3:5] - expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd']) + expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) tm.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.categories, expected.categories) @@ -89,20 +106,22 @@ def test_slicing(self): exp = np.array([4, 3, 2, 1], dtype=np.int64) tm.assert_numpy_array_equal(reversed.__array__(), exp) - df = DataFrame({'value': (np.arange(100) + 1).astype('int64')}) - df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) + df = DataFrame({"value": (np.arange(100) + 1).astype("int64")}) + df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) - expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10) + expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10) result = df.iloc[10] tm.assert_series_equal(result, expected) - expected = 
DataFrame({'value': np.arange(11, 21).astype('int64')}, - index=np.arange(10, 20).astype('int64')) - expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100]) + expected = DataFrame( + {"value": np.arange(11, 21).astype("int64")}, + index=np.arange(10, 20).astype("int64"), + ) + expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100]) result = df.iloc[10:20] tm.assert_frame_equal(result, expected) - expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8) + expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8) result = df.loc[8] tm.assert_series_equal(result, expected) @@ -116,7 +135,8 @@ def test_slicing_and_getting_ops(self): # - returning a single value cats = Categorical( - ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]) + ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"] + ) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 2, 3, 4, 5, 6, 7] df = DataFrame({"cats": cats, "values": values}, index=idx) @@ -130,11 +150,10 @@ def test_slicing_and_getting_ops(self): exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2) # :,"cats" | :,0 - exp_col = Series(cats, index=idx, name='cats') + exp_col = Series(cats, index=idx, name="cats") # "j",: | 2,: - exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", - name="j") + exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j") # "j","cats | 2,0 exp_val = "b" @@ -248,39 +267,47 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) + cats = Categorical( + ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"] + ) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), - "values": [2, 2]}, index=['j', 'k']) + { + "cats": Categorical(["b", "b"], categories=["a", "b", "c"]), + "values": [2, 2], + }, + index=["j", "k"], + ) tm.assert_frame_equal(result, expected) result = df.iloc[2:4, :].dtypes - expected = Series(['category', 'int64'], ['cats', 'values']) + expected = Series(["category", "int64"], ["cats", "values"]) tm.assert_series_equal(result, expected) result = df.loc["h":"j", "cats"] - expected = Series(Categorical(['a', 'b', 'b'], - categories=['a', 'b', 'c']), - index=['h', 'i', 'j'], name='cats') + expected = Series( + Categorical(["a", "b", "b"], categories=["a", "b", "c"]), + index=["h", "i", "j"], + name="cats", + ) tm.assert_series_equal(result, expected) result = df.loc["h":"j", df.columns[0:1]] - expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], - categories=['a', 'b', 'c'])}, - index=['h', 'i', 'j']) + expected = DataFrame( + {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])}, + index=["h", "i", "j"], + ) tm.assert_frame_equal(result, expected) def test_getitem_category_type(self): # GH 14580 # test iloc() on Series with Categorical data - s = Series([1, 2, 3]).astype('category') + s = Series([1, 2, 3]).astype("category") # get slice result = s.iloc[0:2] @@ -300,156 +327,147 @@ def test_getitem_category_type(self): def test_loc_listlike(self): # list of labels - result = self.df.loc[['c', 'a']] + result = self.df.loc[["c", "a"]] expected = self.df.iloc[[4, 0, 1, 5]] assert_frame_equal(result, 
expected, check_index_type=True) - result = self.df2.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + result = self.df2.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) # element in the categories but not in the values with pytest.raises(KeyError, match=r"^'e'$"): - self.df2.loc['e'] + self.df2.loc["e"] # assign is ok df = self.df2.copy() - df.loc['e'] = 20 - result = df.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index) + df.loc["e"] = 20 + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) assert_frame_equal(result, expected) df = self.df2.copy() - result = df.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories with pytest.raises(KeyError): - self.df2.loc[['a', 'd']] + self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes - index = CategoricalIndex(['a', 'b', 'c']) - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + index = CategoricalIndex(["a", "b", "c"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice - res = df.loc[['a', 'b']] - exp_index = CategoricalIndex(['a', 'b'], - categories=index.categories) - exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) + res = df.loc[["a", "b"]] + exp_index = CategoricalIndex(["a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] + res = df.loc[["a", "a", "b"]] - exp_index = CategoricalIndex(['a', 'a', 'b'], - categories=index.categories) - exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) + exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include ' - 'values that are in the categories') + msg = "a list-indexer must only include " "values that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] # duplicated categories and codes - index = CategoricalIndex(['a', 'b', 'a']) - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + index = CategoricalIndex(["a", "b", "a"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice - res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 3, 2], - 'B': [4, 6, 5]}, - index=CategoricalIndex(['a', 'a', 'b'])) + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", 
"a", "b"]) + ) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] + res = df.loc[["a", "a", "b"]] exp = DataFrame( - {'A': [1, 3, 1, 3, 2], - 'B': [4, 6, 4, 6, 5 - ]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) + {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"]), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include values ' - 'that are in the categories') + msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] # contains unused category - index = CategoricalIndex( - ['a', 'b', 'a', 'c'], categories=list('abcde')) - df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) - - res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]}, - index=CategoricalIndex(['a', 'a', 'b'], - categories=list('abcde'))) + index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) + + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [5, 7, 6]}, + index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - res = df.loc[['a', 'e']] - exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]}, - index=CategoricalIndex(['a', 'a', 'e'], - categories=list('abcde'))) + res = df.loc[["a", "e"]] + exp = DataFrame( + {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, + index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] - exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]}, - index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'], - categories=list('abcde'))) + res = df.loc[["a", "a", "b"]] + exp = DataFrame( + {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include values ' - 'that are in the categories') + msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] def test_get_indexer_array(self): - arr = np.array([Timestamp('1999-12-31 00:00:00'), - Timestamp('2000-12-31 00:00:00')], dtype=object) - cats = [Timestamp('1999-12-31 00:00:00'), - Timestamp('2000-12-31 00:00:00')] - ci = CategoricalIndex(cats, - categories=cats, - ordered=False, dtype='category') + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") result = ci.get_indexer(arr) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_same_order(self): - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - result = ci.get_indexer(CategoricalIndex(['b', 'b'], - categories=['a', 'b'])) - expected = np.array([1, 1], dtype='intp') + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = 
np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19551 - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - result = ci.get_indexer(CategoricalIndex(['b', 'b'], - categories=['b', 'a'])) - expected = np.array([1, 1], dtype='intp') + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_getitem_with_listlike(self): # GH 16115 - cats = Categorical([Timestamp('12-31-1999'), - Timestamp('12-31-2000')]) + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) - expected = DataFrame([[1, 0], [0, 1]], dtype='uint8', - index=[0, 1], columns=cats) + expected = DataFrame( + [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats + ) dummies = pd.get_dummies(cats) result = dummies[[c for c in dummies.columns]] assert_frame_equal(result, expected) @@ -459,65 +477,65 @@ def test_setitem_listlike(self): # GH 9469 # properly coerce the input indexers np.random.seed(1) - c = Categorical(np.random.randint(0, 5, size=150000).astype( - np.int8)).add_categories([-1000]) + c = Categorical( + np.random.randint(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) indexer = np.array([100000]).astype(np.int64) c[indexer] = -1000 # we are asserting the code result here # which maps to the -1000 category result = c.codes[np.array([100000]).astype(np.int64)] - tm.assert_numpy_array_equal(result, np.array([5], dtype='int8')) + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) def test_ix_categorical_index(self): # GH 12531 - df = DataFrame(np.random.randn(3, 3), - index=list('ABC'), columns=list('XYZ')) + df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) cdf = df.copy() cdf.index = CategoricalIndex(df.index) cdf.columns = CategoricalIndex(df.columns) - expect = Series(df.loc['A', :], index=cdf.columns, name='A') - assert_series_equal(cdf.loc['A', :], expect) + expect = Series(df.loc["A", :], index=cdf.columns, name="A") + assert_series_equal(cdf.loc["A", :], expect) - expect = Series(df.loc[:, 'X'], index=cdf.index, name='X') - assert_series_equal(cdf.loc[:, 'X'], expect) + expect = Series(df.loc[:, "X"], index=cdf.index, name="X") + assert_series_equal(cdf.loc[:, "X"], expect) - exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) - expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=exp_index) - assert_frame_equal(cdf.loc[['A', 'B'], :], expect) + exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"]) + expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index) + assert_frame_equal(cdf.loc[["A", "B"], :], expect) - exp_columns = CategoricalIndex(list('XY'), - categories=['X', 'Y', 'Z']) - expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=exp_columns) - assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) + exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"]) + expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) + assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) # non-unique - df = DataFrame(np.random.randn(3, 3), - index=list('ABA'), columns=list('XYX')) + df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX")) cdf = df.copy() cdf.index = CategoricalIndex(df.index) 
cdf.columns = CategoricalIndex(df.columns) - exp_index = CategoricalIndex(list('AA'), categories=['A', 'B']) - expect = DataFrame(df.loc['A', :], columns=cdf.columns, - index=exp_index) - assert_frame_equal(cdf.loc['A', :], expect) + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + assert_frame_equal(cdf.loc["A", :], expect) - exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y']) - expect = DataFrame(df.loc[:, 'X'], index=cdf.index, - columns=exp_columns) - assert_frame_equal(cdf.loc[:, 'X'], expect) + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + assert_frame_equal(cdf.loc[:, "X"], expect) - expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=CategoricalIndex(list('AAB'))) - assert_frame_equal(cdf.loc[['A', 'B'], :], expect) + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + assert_frame_equal(cdf.loc[["A", "B"], :], expect) - expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=CategoricalIndex(list('XXY'))) - assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) def test_read_only_source(self): # GH 10043 @@ -542,95 +560,101 @@ def test_reindexing(self): # reindexing # convert to a regular index - result = self.df2.reindex(['a', 'b', 'e']) - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan], - 'B': Series(list('aaabbe'))}).set_index('B') + result = self.df2.reindex(["a", "b", "e"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b']) - expected = DataFrame({'A': [0, 1, 5, 2, 3], - 'B': Series(list('aaabb'))}).set_index('B') + result = self.df2.reindex(["a", "b"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['e']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['e'])}).set_index('B') + result = self.df2.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['d']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['d'])}).set_index('B') + result = self.df2.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # since we are actually reindexing with a Categorical # then return a Categorical - cats = list('cabe') + cats = list("cabe") - result = self.df2.reindex(Categorical(['a', 'd'], categories=cats)) - expected = DataFrame({'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(cats))}).set_index('B') + result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + expected = DataFrame( + {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(['a'], categories=cats)) - expected = DataFrame({'A': [0, 1, 5], - 'B': Series(list('aaa')).astype( - CDT(cats))}).set_index('B') + 
result = self.df2.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b', 'e']) - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan], - 'B': Series(list('aaabbe'))}).set_index('B') + result = self.df2.reindex(["a", "b", "e"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b']) - expected = DataFrame({'A': [0, 1, 5, 2, 3], - 'B': Series(list('aaabb'))}).set_index('B') + result = self.df2.reindex(["a", "b"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['e']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['e'])}).set_index('B') + result = self.df2.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex(Categorical( - ['a', 'd'], categories=cats, ordered=True)) + result = self.df2.reindex( + Categorical(["a", "d"], categories=cats, ordered=True) + ) expected = DataFrame( - {'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(cats, ordered=True))}).set_index('B') + { + "A": [0, 1, 5, np.nan], + "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), + } + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical( - ['a', 'd'], categories=['a', 'd'])) - expected = DataFrame({'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(['a', 'd']))}).set_index('B') + result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed msg = "cannot reindex with a non-unique indexer" with pytest.raises(ValueError, match=msg): - self.df2.reindex(['a', 'a']) + self.df2.reindex(["a", "a"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" - with pytest.raises(NotImplementedError, match=msg.format('method')): - self.df2.reindex(['a'], method='ffill') - with pytest.raises(NotImplementedError, match=msg.format('level')): - self.df2.reindex(['a'], level=1) - with pytest.raises(NotImplementedError, match=msg.format('limit')): - self.df2.reindex(['a'], limit=2) + with pytest.raises(NotImplementedError, match=msg.format("method")): + self.df2.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + self.df2.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + self.df2.reindex(["a"], limit=2) def test_loc_slice(self): # slicing # not implemented ATM # GH9748 - msg = ("cannot do slice indexing on {klass} with these " - r"indexers \[1\] of {kind}".format( - klass=str(CategoricalIndex), kind=str(int))) + msg = ( + "cannot do slice indexing on {klass} with these " + r"indexers \[1\] of {kind}".format( + klass=str(CategoricalIndex), kind=str(int) + ) + ) with pytest.raises(TypeError, match=msg): self.df.loc[1:5] @@ 
-641,23 +665,24 @@ def test_loc_slice(self): def test_loc_and_at_with_categorical_index(self): # GH 20629 s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"])) - assert s.loc['A'] == 1 - assert s.at['A'] == 1 - df = DataFrame([[1, 2], [3, 4], [5, 6]], - index=pd.CategoricalIndex(["A", "B", "C"])) - assert df.loc['B', 1] == 4 - assert df.at['B', 1] == 4 + assert s.loc["A"] == 1 + assert s.at["A"] == 1 + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=pd.CategoricalIndex(["A", "B", "C"]) + ) + assert df.loc["B", 1] == 4 + assert df.at["B", 1] == 4 def test_boolean_selection(self): df3 = self.df3 df4 = self.df4 - result = df3[df3.index == 'a'] + result = df3[df3.index == "a"] expected = df3.iloc[[]] assert_frame_equal(result, expected) - result = df4[df4.index == 'a'] + result = df4[df4.index == "a"] expected = df4.iloc[[]] assert_frame_equal(result, expected) @@ -701,31 +726,31 @@ def test_indexing_with_category(self): # https://github.com/pandas-dev/pandas/issues/12564 # consistent result if comparing as Dataframe - cat = DataFrame({'A': ['foo', 'bar', 'baz']}) - exp = DataFrame({'A': [True, False, False]}) + cat = DataFrame({"A": ["foo", "bar", "baz"]}) + exp = DataFrame({"A": [True, False, False]}) - res = (cat[['A']] == 'foo') + res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) - cat['A'] = cat['A'].astype('category') + cat["A"] = cat["A"].astype("category") - res = (cat[['A']] == 'foo') + res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) def test_map_with_dict_or_series(self): - orig_values = ['a', 'B', 1, 'a'] - new_values = ['one', 2, 3.0, 'one'] - cur_index = pd.CategoricalIndex(orig_values, name='XXX') - expected = pd.CategoricalIndex(new_values, - name='XXX', categories=[3.0, 2, 'one']) + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = pd.CategoricalIndex(orig_values, name="XXX") + expected = pd.CategoricalIndex( + new_values, name="XXX", categories=[3.0, 2, "one"] + ) mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) - mapper = {o: n for o, n in - zip(orig_values[:-1], new_values[:-1])} + mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b94d3000a58417..7d47063623d87b 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -8,11 +8,10 @@ class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): # this is chained assignment, but will 'work' - with option_context('chained_assignment', None): + with option_context("chained_assignment", None): # #3970 df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5}) @@ -27,164 +26,168 @@ def test_slice_consolidate_invalidate_item_cache(self): repr(df) # Assignment to wrong series - df['bb'].iloc[0] = 0.17 + df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df['bb'][0], 0.17) + tm.assert_almost_equal(df["bb"][0], 0.17) def test_setitem_cache_updating(self): # GH 5424 - cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven'] + cont = ["one", "two", "three", "four", "five", "six", "seven"] for do_ref in [False, False]: - df = DataFrame({'a': cont, - "b": cont[3:] + cont[:3], - 'c': 
np.arange(7)}) + df = DataFrame({"a": cont, "b": cont[3:] + cont[:3], "c": np.arange(7)}) # ref the cache if do_ref: df.loc[0, "c"] # set it - df.loc[7, 'c'] = 1 + df.loc[7, "c"] = 1 - assert df.loc[0, 'c'] == 0.0 - assert df.loc[7, 'c'] == 1.0 + assert df.loc[0, "c"] == 0.0 + assert df.loc[7, "c"] == 1.0 # GH 7084 # not updating cache on series setting with slices - expected = DataFrame({'A': [600, 600, 600]}, - index=date_range('5/7/2014', '5/9/2014')) - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) - df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]}) + expected = DataFrame( + {"A": [600, 600, 600]}, index=date_range("5/7/2014", "5/9/2014") + ) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + df = DataFrame({"C": ["A", "A", "A"], "D": [100, 200, 300]}) # loop through df to update out - six = Timestamp('5/7/2014') - eix = Timestamp('5/9/2014') + six = Timestamp("5/7/2014") + eix = Timestamp("5/9/2014") for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] + out.loc[six:eix, row["C"]] = out.loc[six:eix, row["C"]] + row["D"] tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) # try via a chain indexing # this actually works - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): - v = out[row['C']][six:eix] + row['D'] - out[row['C']][six:eix] = v + v = out[row["C"]][six:eix] + row["D"] + out[row["C"]][six:eix] = v tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] += row['D'] + out.loc[six:eix, row["C"]] += row["D"] tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) class TestChaining: - def test_setitem_chained_setfault(self): # GH6026 - data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout'] - mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none'] + data = ["right", "left", "left", "left", "right", "left", "timeout"] + mdata = ["right", "left", "left", "left", "right", "left", "none"] - df = DataFrame({'response': np.array(data)}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) + df = DataFrame({"response": np.array(data)}) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) - recarray = np.rec.fromarrays([data], names=['response']) + recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) - df = DataFrame({'response': data, 'response1': data}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata, - 'response1': data})) + df = DataFrame({"response": data, "response1": data}) + 
mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) # GH 6056 - expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) - df['A'].iloc[0] = np.nan + expected = DataFrame(dict(A=[np.nan, "bar", "bah", "foo", "bar"])) + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) + df["A"].iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) df.A.iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) def test_detect_chained_assignment(self): - pd.set_option('chained_assignment', 'raise') + pd.set_option("chained_assignment", "raise") # work with the chain - expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) - df = DataFrame(np.arange(4).reshape(2, 2), - columns=list('AB'), dtype='int64') + expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) + df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64") assert df._is_copy is None - df['A'][0] = -5 - df['A'][1] = -6 + df["A"][0] = -5 + df["A"][1] = -6 tm.assert_frame_equal(df, expected) # test with the chaining - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) assert df._is_copy is None with pytest.raises(com.SettingWithCopyError): - df['A'][0] = -5 + df["A"][0] = -5 with pytest.raises(com.SettingWithCopyError): - df['A'][1] = np.nan + df["A"][1] = np.nan - assert df['A']._is_copy is None + assert df["A"]._is_copy is None # Using a copy (the chain), fails - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) with pytest.raises(com.SettingWithCopyError): - df.loc[0]['A'] = -5 + df.loc[0]["A"] = -5 # Doc example - df = DataFrame({'a': ['one', 'one', 'two', 'three', - 'two', 'one', 'six'], - 'c': Series(range(7), dtype='int64')}) + df = DataFrame( + { + "a": ["one", "one", "two", "three", "two", "one", "six"], + "c": Series(range(7), dtype="int64"), + } + ) assert df._is_copy is None with pytest.raises(com.SettingWithCopyError): - indexer = df.a.str.startswith('o') - df[indexer]['c'] = 42 + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 - expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]}) - df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]}) + expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) with pytest.raises(com.SettingWithCopyError): - df['A'][0] = 111 + df["A"][0] = 111 with pytest.raises(com.SettingWithCopyError): - df.loc[0]['A'] = 111 + df.loc[0]["A"] = 111 - df.loc[0, 'A'] = 111 + df.loc[0, "A"] = 111 tm.assert_frame_equal(df, expected) # gh-5475: Make sure that is_copy is picked up reconstruction df = DataFrame({"A": [1, 2]}) assert df._is_copy is None - with tm.ensure_clean('__tmp__pickle') as path: + with tm.ensure_clean("__tmp__pickle") as path: df.to_pickle(path) df2 = pd.read_pickle(path) df2["B"] = df2["A"] @@ -199,9 +202,9 @@ def random_text(nobs=100): idx = 
np.random.randint(len(letters), size=2) idx.sort() - df.append([letters[idx[0]:idx[1]]]) + df.append([letters[idx[0] : idx[1]]]) - return DataFrame(df, columns=['letters']) + return DataFrame(df, columns=["letters"]) df = random_text(100000) @@ -217,7 +220,7 @@ def random_text(nobs=100): df = df.loc[indexer].copy() assert df._is_copy is None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) # Implicitly take df = random_text(100000) @@ -225,7 +228,7 @@ def random_text(nobs=100): df = df.loc[indexer] assert df._is_copy is not None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) # Implicitly take 2 df = random_text(100000) @@ -233,23 +236,22 @@ def random_text(nobs=100): df = df.loc[indexer] assert df._is_copy is not None - df.loc[:, 'letters'] = df['letters'].apply(str.lower) + df.loc[:, "letters"] = df["letters"].apply(str.lower) # Should be ok even though it's a copy! assert df._is_copy is None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) assert df._is_copy is None df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) - df.loc[indexer, 'letters'] = ( - df.loc[indexer, 'letters'].apply(str.lower)) + df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower) # an identical take, so no copy - df = DataFrame({'a': [1]}).dropna() + df = DataFrame({"a": [1]}).dropna() assert df._is_copy is None - df['a'] += 1 + df["a"] += 1 df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0].sort_values() @@ -258,62 +260,64 @@ def random_text(nobs=100): tm.assert_series_equal(s, df[0].sort_values()) # see gh-6025: false positives - df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) + df = DataFrame({"column1": ["a", "a", "a"], "column2": [4, 8, 9]}) str(df) - df['column1'] = df['column1'] + 'b' + df["column1"] = df["column1"] + "b" str(df) - df = df[df['column2'] != 8] + df = df[df["column2"] != 8] str(df) - df['column1'] = df['column1'] + 'c' + df["column1"] = df["column1"] + "c" str(df) # from SO: # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc - df = DataFrame(np.arange(0, 9), columns=['count']) - df['group'] = 'b' + df = DataFrame(np.arange(0, 9), columns=["count"]) + df["group"] = "b" with pytest.raises(com.SettingWithCopyError): - df.iloc[0:5]['group'] = 'a' + df.iloc[0:5]["group"] = "a" # Mixed type setting but same dtype & changing dtype - df = DataFrame(dict(A=date_range('20130101', periods=5), - B=np.random.randn(5), - C=np.arange(5, dtype='int64'), - D=list('abcde'))) + df = DataFrame( + dict( + A=date_range("20130101", periods=5), + B=np.random.randn(5), + C=np.arange(5, dtype="int64"), + D=list("abcde"), + ) + ) with pytest.raises(com.SettingWithCopyError): - df.loc[2]['D'] = 'foo' + df.loc[2]["D"] = "foo" with pytest.raises(com.SettingWithCopyError): - df.loc[2]['C'] = 'foo' + df.loc[2]["C"] = "foo" with pytest.raises(com.SettingWithCopyError): - df['C'][2] = 'foo' + df["C"][2] = "foo" def test_setting_with_copy_bug(self): # operating on a copy - df = DataFrame({'a': list(range(4)), - 'b': list('ab..'), - 'c': ['a', 'b', np.nan, 'd']}) + df = DataFrame( + {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + ) mask = pd.isna(df.c) - msg = ("A value is trying to be set on a copy of a slice from a" - " DataFrame") + msg = "A value is trying to be set on a copy of a slice from a" " DataFrame" with 
pytest.raises(com.SettingWithCopyError, match=msg): - df[['c']][mask] = df[['b']][mask] + df[["c"]][mask] = df[["b"]][mask] # invalid warning as we are returning a new object # GH 8730 - df1 = DataFrame({'x': Series(['a', 'b', 'c']), - 'y': Series(['d', 'e', 'f'])}) - df2 = df1[['x']] + df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])}) + df2 = df1[["x"]] # this should not raise - df2['y'] = ['g', 'h', 'i'] + df2["y"] = ["g", "h", "i"] def test_detect_chained_assignment_warnings(self): with option_context("chained_assignment", "warn"): @@ -325,14 +329,16 @@ def test_detect_chained_assignment_warnings(self): def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): # xref gh-13017. with option_context("chained_assignment", "warn"): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], - columns=["a", "a", "c"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) with tm.assert_produces_warning(com.SettingWithCopyWarning): df.c.loc[df.c > 0] = None - expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], - columns=["a", "a", "c"]) + expected = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) tm.assert_frame_equal(df, expected) def test_chained_getitem_with_lists(self): @@ -344,15 +350,15 @@ def check(result, expected): tm.assert_numpy_array_equal(result, expected) assert isinstance(result, np.ndarray) - df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]}) - expected = df['A'].iloc[2] - result = df.loc[2, 'A'] + df = DataFrame({"A": 5 * [np.zeros(3)], "B": 5 * [np.ones(3)]}) + expected = df["A"].iloc[2] + result = df.loc[2, "A"] check(result, expected) - result2 = df.iloc[2]['A'] + result2 = df.iloc[2]["A"] check(result2, expected) - result3 = df['A'].loc[2] + result3 = df["A"].loc[2] check(result3, expected) - result4 = df['A'].iloc[2] + result4 = df["A"].iloc[2] check(result4, expected) @pytest.mark.filterwarnings("ignore::FutureWarning") @@ -360,26 +366,32 @@ def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem df = tm.makeDataFrame() - df['A'] # cache series + df["A"] # cache series df.ix["Hello Friend"] = df.ix[0] - assert "Hello Friend" in df['A'].index - assert "Hello Friend" in df['B'].index + assert "Hello Friend" in df["A"].index + assert "Hello Friend" in df["B"].index # 10264 - df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e'], index=range(5)) - df['f'] = 0 + df = DataFrame( + np.zeros((5, 5), dtype="int64"), + columns=["a", "b", "c", "d", "e"], + index=range(5), + ) + df["f"] = 0 df.f.values[3] = 1 # TODO(wesm): unused? 
# y = df.iloc[np.arange(2, len(df))] df.f.values[3] = 2 - expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) - expected.at[3, 'f'] = 2 + expected = DataFrame( + np.zeros((5, 6), dtype="int64"), + columns=["a", "b", "c", "d", "e", "f"], + index=range(5), + ) + expected.at[3, "f"] = 2 tm.assert_frame_equal(df, expected) - expected = Series([0, 0, 0, 2, 0], name='f') + expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) def test_deprecate_is_copy(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index e9c1b85e7d40c8..f46fbcdb504e91 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -13,7 +13,7 @@ ############################################################### -@pytest.fixture(autouse=True, scope='class') +@pytest.fixture(autouse=True, scope="class") def check_comprehensiveness(request): # Iterate over combination of dtype, method and klass # and ensure that each are contained within a collected test @@ -23,12 +23,13 @@ def check_comprehensiveness(request): def has_test(combo): klass, dtype, method = combo cls_funcs = request.node.session.items - return any(klass in x.name and dtype in x.name and - method in x.name for x in cls_funcs) + return any( + klass in x.name and dtype in x.name and method in x.name for x in cls_funcs + ) for combo in combos: if not has_test(combo): - msg = 'test method is not defined: {0}, {1}' + msg = "test method is not defined: {0}, {1}" raise AssertionError(msg.format(cls.__name__, combo)) yield @@ -36,9 +37,18 @@ def has_test(combo): class CoercionBase: - klasses = ['index', 'series'] - dtypes = ['object', 'int64', 'float64', 'complex128', 'bool', - 'datetime64', 'datetime64tz', 'timedelta64', 'period'] + klasses = ["index", "series"] + dtypes = [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "timedelta64", + "period", + ] @property def method(self): @@ -58,10 +68,11 @@ def _assert(self, left, right, dtype): class TestSetitemCoercion(CoercionBase): - method = 'setitem' + method = "setitem" - def _assert_setitem_series_conversion(self, original_series, loc_value, - expected_series, expected_dtype): + def _assert_setitem_series_conversion( + self, original_series, loc_value, expected_series, expected_dtype + ): """ test series value's coercion triggered by assignment """ temp = original_series.copy() temp[1] = loc_value @@ -74,23 +85,21 @@ def _assert_setitem_series_conversion(self, original_series, loc_value, # temp.loc[1] = loc_value # tm.assert_series_equal(temp, expected_series) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_setitem_series_object(self, val, exp_dtype): - obj = pd.Series(list('abcd')) + obj = pd.Series(list("abcd")) assert obj.dtype == np.object - exp = pd.Series(['a', val, 'c', 'd']) + exp = pd.Series(["a", val, "c", "d"]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) def test_setitem_series_int64(self, val, 
exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.dtype == np.int64 @@ -103,9 +112,9 @@ def test_setitem_series_int64(self, val, exp_dtype): exp = pd.Series([1, val, 3, 4]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (np.int32(1), np.int8), - (np.int16(2**9), np.int16)]) + @pytest.mark.parametrize( + "val,exp_dtype", [(np.int32(1), np.int8), (np.int16(2 ** 9), np.int16)] + ) def test_setitem_series_int8(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], dtype=np.int8) assert obj.dtype == np.int8 @@ -118,11 +127,15 @@ def test_setitem_series_int8(self, val, exp_dtype): exp = pd.Series([1, val, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_setitem_series_float64(self, val, exp_dtype): obj = pd.Series([1.1, 2.2, 3.3, 4.4]) assert obj.dtype == np.float64 @@ -130,11 +143,15 @@ def test_setitem_series_float64(self, val, exp_dtype): exp = pd.Series([1.1, val, 3.3, 4.4]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_setitem_series_complex128(self, val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -142,12 +159,16 @@ def test_setitem_series_complex128(self, val, exp_dtype): exp = pd.Series([1 + 1j, val, 3 + 3j, 4 + 4j]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.int64), - (3, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.bool)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.int64), + (3, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.bool), + ], + ) def test_setitem_series_bool(self, val, exp_dtype): obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool @@ -168,62 +189,89 @@ def test_setitem_series_bool(self, val, exp_dtype): exp = pd.Series([True, val, True, False]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (1, np.object), + ("x", np.object), + ], + ) def test_setitem_series_datetime64(self, val, exp_dtype): - obj = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' - - exp = pd.Series([pd.Timestamp('2011-01-01'), - val, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) 
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]'), - (pd.Timestamp('2012-01-01', tz='US/Pacific'), np.object), - (pd.Timestamp('2012-01-01'), np.object), - (1, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Pacific"), np.object), + (pd.Timestamp("2012-01-01"), np.object), + (1, np.object), + ], + ) def test_setitem_series_datetime64tz(self, val, exp_dtype): - tz = 'US/Eastern' - obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - val, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) + tz = "US/Eastern" + obj = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timedelta('12 day'), 'timedelta64[ns]'), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(pd.Timedelta("12 day"), "timedelta64[ns]"), (1, np.object), ("x", np.object)], + ) def test_setitem_series_timedelta64(self, val, exp_dtype): - obj = pd.Series([pd.Timedelta('1 day'), - pd.Timedelta('2 day'), - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) - assert obj.dtype == 'timedelta64[ns]' - - exp = pd.Series([pd.Timedelta('1 day'), - val, - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) + obj = pd.Series( + [ + pd.Timedelta("1 day"), + pd.Timedelta("2 day"), + pd.Timedelta("3 day"), + pd.Timedelta("4 day"), + ] + ) + assert obj.dtype == "timedelta64[ns]" + + exp = pd.Series( + [pd.Timedelta("1 day"), val, pd.Timedelta("3 day"), pd.Timedelta("4 day")] + ) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def _assert_setitem_index_conversion(self, original_series, loc_key, - expected_index, expected_dtype): + def _assert_setitem_index_conversion( + self, original_series, loc_key, expected_index, expected_dtype + ): """ test index's coercion triggered by assign key """ temp = original_series.copy() temp[loc_key] = 5 @@ -239,12 +287,11 @@ def _assert_setitem_index_conversion(self, original_series, loc_key, # check dtype explicitly for sure assert temp.index.dtype == expected_dtype - @pytest.mark.parametrize("val,exp_dtype", [ - ('x', np.object), - (5, IndexError), - (1.1, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", [("x", np.object), (5, IndexError), (1.1, np.object)] + ) def test_setitem_index_object(self, val, exp_dtype): - obj = pd.Series([1, 2, 3, 4], index=list('abcd')) + obj = pd.Series([1, 2, 3, 4], index=list("abcd")) assert obj.index.dtype == np.object if exp_dtype is IndexError: @@ -252,14 +299,12 @@ def test_setitem_index_object(self, val, exp_dtype): with pytest.raises(exp_dtype): temp[5] = 5 else: - exp_index = pd.Index(list('abcd') + [val]) - self._assert_setitem_index_conversion(obj, val, exp_index, - 
exp_dtype) - - @pytest.mark.parametrize("val,exp_dtype", [ - (5, np.int64), - (1.1, np.float64), - ('x', np.object)]) + exp_index = pd.Index(list("abcd") + [val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", np.object)] + ) def test_setitem_index_int64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.index.dtype == np.int64 @@ -267,10 +312,9 @@ def test_setitem_index_int64(self, val, exp_dtype): exp_index = pd.Index([0, 1, 2, 3, val]) self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (5, IndexError), - (5.1, np.float64), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", [(5, IndexError), (5.1, np.float64), ("x", np.object)] + ) def test_setitem_index_float64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) assert obj.index.dtype == np.float64 @@ -309,34 +353,41 @@ def test_setitem_index_period(self): class TestInsertIndexCoercion(CoercionBase): - klasses = ['index'] - method = 'insert' + klasses = ["index"] + method = "insert" - def _assert_insert_conversion(self, original, value, - expected, expected_dtype): + def _assert_insert_conversion(self, original, value, expected, expected_dtype): """ test coercion triggered by insert """ target = original.copy() res = target.insert(1, value) tm.assert_index_equal(res, expected) assert res.dtype == expected_dtype - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1, np.object), - (1.1, 1.1, np.object), - (False, False, np.object), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.object), + (1.1, 1.1, np.object), + (False, False, np.object), + ("x", "x", np.object), + ], + ) def test_insert_index_object(self, insert, coerced_val, coerced_dtype): - obj = pd.Index(list('abcd')) + obj = pd.Index(list("abcd")) assert obj.dtype == np.object - exp = pd.Index(['a', coerced_val, 'b', 'c', 'd']) + exp = pd.Index(["a", coerced_val, "b", "c", "d"]) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1, np.int64), - (1.1, 1.1, np.float64), - (False, 0, np.int64), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.int64), + (1.1, 1.1, np.float64), + (False, 0, np.int64), + ("x", "x", np.object), + ], + ) def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): obj = pd.Int64Index([1, 2, 3, 4]) assert obj.dtype == np.int64 @@ -344,39 +395,49 @@ def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): exp = pd.Index([1, coerced_val, 2, 3, 4]) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1., np.float64), - (1.1, 1.1, np.float64), - (False, 0., np.float64), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1.0, np.float64), + (1.1, 1.1, np.float64), + (False, 0.0, np.float64), + ("x", "x", np.object), + ], + ) def test_insert_index_float64(self, insert, coerced_val, coerced_dtype): - obj = pd.Float64Index([1., 2., 3., 4.]) + obj = pd.Float64Index([1.0, 2.0, 3.0, 4.0]) assert obj.dtype == np.float64 - exp = pd.Index([1., coerced_val, 2., 3., 4.]) + exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0]) 
self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize('fill_val,exp_dtype', [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]')], - ids=['datetime64', 'datetime64tz']) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + ], + ids=["datetime64", "datetime64tz"], + ) def test_insert_index_datetimes(self, fill_val, exp_dtype): - obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04'], tz=fill_val.tz) + obj = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz + ) assert obj.dtype == exp_dtype - exp = pd.DatetimeIndex(['2011-01-01', fill_val.date(), '2011-01-02', - '2011-01-03', '2011-01-04'], tz=fill_val.tz) + exp = pd.DatetimeIndex( + ["2011-01-01", fill_val.date(), "2011-01-02", "2011-01-03", "2011-01-04"], + tz=fill_val.tz, + ) self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) msg = "Passed item and index have different timezone" if fill_val.tz: with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01')) + obj.insert(1, pd.Timestamp("2012-01-01")) with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo')) + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) msg = "cannot insert DatetimeIndex with incompatible label" with pytest.raises(TypeError, match=msg): @@ -385,44 +446,53 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): pytest.xfail("ToDo: must coerce to object") def test_insert_index_timedelta64(self): - obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day']) - assert obj.dtype == 'timedelta64[ns]' + obj = pd.TimedeltaIndex(["1 day", "2 day", "3 day", "4 day"]) + assert obj.dtype == "timedelta64[ns]" # timedelta64 + timedelta64 => timedelta64 - exp = pd.TimedeltaIndex(['1 day', '10 day', '2 day', '3 day', '4 day']) - self._assert_insert_conversion(obj, pd.Timedelta('10 day'), - exp, 'timedelta64[ns]') + exp = pd.TimedeltaIndex(["1 day", "10 day", "2 day", "3 day", "4 day"]) + self._assert_insert_conversion( + obj, pd.Timedelta("10 day"), exp, "timedelta64[ns]" + ) # ToDo: must coerce to object msg = "cannot insert TimedeltaIndex with incompatible label" with pytest.raises(TypeError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01')) + obj.insert(1, pd.Timestamp("2012-01-01")) # ToDo: must coerce to object msg = "cannot insert TimedeltaIndex with incompatible label" with pytest.raises(TypeError, match=msg): obj.insert(1, 1) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (pd.Period('2012-01', freq='M'), '2012-01', 'period[M]'), - (pd.Timestamp('2012-01-01'), pd.Timestamp('2012-01-01'), np.object), - (1, 1, np.object), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (pd.Period("2012-01", freq="M"), "2012-01", "period[M]"), + (pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01"), np.object), + (1, 1, np.object), + ("x", "x", np.object), + ], + ) def test_insert_index_period(self, insert, coerced_val, coerced_dtype): - obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M') - assert obj.dtype == 'period[M]' + obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") + assert obj.dtype == "period[M]" if isinstance(insert, pd.Period): index_type = 
pd.PeriodIndex else: index_type = pd.Index - exp = index_type([pd.Period('2011-01', freq='M'), - coerced_val, - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')], freq='M') + exp = index_type( + [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ], + freq="M", + ) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) def test_insert_index_complex128(self): @@ -434,24 +504,23 @@ def test_insert_index_bool(self): class TestWhereCoercion(CoercionBase): - method = 'where' + method = "where" - def _assert_where_conversion(self, original, cond, values, - expected, expected_dtype): + def _assert_where_conversion( + self, original, cond, values, expected, expected_dtype + ): """ test coercion triggered by where """ target = original.copy() res = target.where(cond, values) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_where_object(self, klass, fill_val, exp_dtype): - obj = klass(list('abcd')) + obj = klass(list("abcd")) assert obj.dtype == np.object cond = klass([True, False, True, False]) @@ -460,7 +529,7 @@ def test_where_object(self, klass, fill_val, exp_dtype): else: ret_val = fill_val - exp = klass(['a', ret_val, 'c', ret_val]) + exp = klass(["a", ret_val, "c", ret_val]) self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) if fill_val is True: @@ -468,16 +537,14 @@ def test_where_object(self, klass, fill_val, exp_dtype): else: values = klass(fill_val * x for x in [5, 6, 7, 8]) - exp = klass(['a', values[1], 'c', values[3]]) + exp = klass(["a", values[1], "c", values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) def test_where_int64(self, klass, fill_val, exp_dtype): if klass is pd.Index and exp_dtype is np.complex128: pytest.skip("Complex Index not supported") @@ -495,13 +562,16 @@ def test_where_int64(self, klass, fill_val, exp_dtype): exp = klass([1, values[1], 3, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val, exp_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val, exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_where_float64(self, klass, fill_val, exp_dtype): if klass is pd.Index and exp_dtype is np.complex128: 
pytest.skip("Complex Index not supported") @@ -519,11 +589,15 @@ def test_where_float64(self, klass, fill_val, exp_dtype): exp = klass([1.1, values[1], 3.3, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_where_series_complex128(self, fill_val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -539,11 +613,10 @@ def test_where_series_complex128(self, fill_val, exp_dtype): exp = pd.Series([1 + 1j, values[1], 3 + 3j, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.bool)]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.bool)], + ) def test_where_series_bool(self, fill_val, exp_dtype): obj = pd.Series([True, False, True, False]) @@ -560,81 +633,112 @@ def test_where_series_bool(self, fill_val, exp_dtype): exp = pd.Series([True, values[1], True, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], - ids=['datetime64', 'datetime64tz']) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + ], + ids=["datetime64", "datetime64tz"], + ) def test_where_series_datetime64(self, fill_val, exp_dtype): - obj = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Series([True, False, True, False]) - exp = pd.Series([pd.Timestamp('2011-01-01'), fill_val, - pd.Timestamp('2011-01-03'), fill_val]) + exp = pd.Series( + [pd.Timestamp("2011-01-01"), fill_val, pd.Timestamp("2011-01-03"), fill_val] + ) self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) values = pd.Series(pd.date_range(fill_val, periods=4)) if fill_val.tz: - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02 00:00', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04 00:00', - tz='US/Eastern')]) + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02 00:00", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04 00:00", tz="US/Eastern"), + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - exp = pd.Series([pd.Timestamp('2011-01-01'), values[1], - pd.Timestamp('2011-01-03'), values[3]]) + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + values[1], + pd.Timestamp("2011-01-03"), + values[3], + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) def test_where_index_datetime(self): - fill_val = pd.Timestamp('2012-01-01') - exp_dtype = 'datetime64[ns]' - obj = pd.Index([pd.Timestamp('2011-01-01'), 
- pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + fill_val = pd.Timestamp("2012-01-01") + exp_dtype = "datetime64[ns]" + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, False]) - msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " - "of some kind") + msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) values = pd.Index(pd.date_range(fill_val, periods=4)) - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04')]) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04"), + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.xfail( - reason="GH 22839: do not ignore timezone, must be object") + @pytest.mark.xfail(reason="GH 22839: do not ignore timezone, must be object") def test_where_index_datetimetz(self): - fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern') + fill_val = pd.Timestamp("2012-01-01", tz="US/Eastern") exp_dtype = np.object - obj = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, False]) - msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " - "of some kind") + msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) values = pd.Index(pd.date_range(fill_val, periods=4)) - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04', tz='US/Eastern')], - dtype=exp_dtype) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04", tz="US/Eastern"), + ], + dtype=exp_dtype, + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) @@ -667,39 +771,39 @@ class TestFillnaSeriesCoercion(CoercionBase): # not indexing, but place here for consistency - method = 'fillna' + method = "fillna" def test_has_comprehensive_tests(self): pass - def _assert_fillna_conversion(self, original, value, - expected, expected_dtype): + def _assert_fillna_conversion(self, original, value, expected, expected_dtype): """ test coercion triggered by fillna """ target = original.copy() res = target.fillna(value) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val, fill_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val, fill_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_fillna_object(self, klass, 
fill_val, fill_dtype): - obj = klass(['a', np.nan, 'c', 'd']) + obj = klass(["a", np.nan, "c", "d"]) assert obj.dtype == np.object - exp = klass(['a', fill_val, 'c', 'd']) + exp = klass(["a", fill_val, "c", "d"]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_fillna_float64(self, klass, fill_val, fill_dtype): obj = klass([1.1, np.nan, 3.3, 4.4]) assert obj.dtype == np.float64 @@ -712,11 +816,15 @@ def test_fillna_float64(self, klass, fill_val, fill_dtype): fill_dtype = np.object self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_fillna_series_complex128(self, fill_val, fill_dtype): obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -724,47 +832,70 @@ def test_fillna_series_complex128(self, fill_val, fill_dtype): exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object), - (1, np.object), ('x', np.object)], - ids=['datetime64', 'datetime64tz', 'object', 'object']) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + (1, np.object), + ("x", np.object), + ], + ids=["datetime64", "datetime64tz", "object", "object"], + ) def test_fillna_datetime(self, klass, fill_val, fill_dtype): - obj = klass([pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' - - exp = klass([pd.Timestamp('2011-01-01'), - fill_val, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) + obj = klass( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = klass( + [ + pd.Timestamp("2011-01-01"), + fill_val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.parametrize("klass", [pd.Series, pd.Index]) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]'), - (pd.Timestamp('2012-01-01'), np.object), - (pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), np.object), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + 
(pd.Timestamp("2012-01-01"), np.object), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), np.object), + (1, np.object), + ("x", np.object), + ], + ) def test_fillna_datetime64tz(self, klass, fill_val, fill_dtype): - tz = 'US/Eastern' - - obj = klass([pd.Timestamp('2011-01-01', tz=tz), - pd.NaT, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - exp = klass([pd.Timestamp('2011-01-01', tz=tz), - fill_val, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) + tz = "US/Eastern" + + obj = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.NaT, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + fill_val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) def test_fillna_series_int64(self): @@ -794,83 +925,114 @@ def test_fillna_index_period(self): class TestReplaceSeriesCoercion(CoercionBase): - klasses = ['series'] - method = 'replace' + klasses = ["series"] + method = "replace" rep = {} - rep['object'] = ['a', 'b'] - rep['int64'] = [4, 5] - rep['float64'] = [1.1, 2.2] - rep['complex128'] = [1 + 1j, 2 + 2j] - rep['bool'] = [True, False] - rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-03')] - - for tz in ['UTC', 'US/Eastern']: + rep["object"] = ["a", "b"] + rep["int64"] = [4, 5] + rep["float64"] = [1.1, 2.2] + rep["complex128"] = [1 + 1j, 2 + 2j] + rep["bool"] = [True, False] + rep["datetime64[ns]"] = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-03")] + + for tz in ["UTC", "US/Eastern"]: # to test tz => different tz replacement - key = 'datetime64[ns, {0}]'.format(tz) - rep[key] = [pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-03', tz=tz)] - - rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), - pd.Timedelta('2 day')] - - @pytest.mark.parametrize('how', ['dict', 'series']) - @pytest.mark.parametrize('to_key', [ - 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', - 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]' - ], ids=['object', 'int64', 'float64', 'complex128', 'bool', - 'datetime64', 'datetime64tz', 'datetime64tz', 'timedelta64']) - @pytest.mark.parametrize('from_key', [ - 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', - 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]'] + key = "datetime64[ns, {0}]".format(tz) + rep[key] = [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + ] + + rep["timedelta64[ns]"] = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] + + @pytest.mark.parametrize("how", ["dict", "series"]) + @pytest.mark.parametrize( + "to_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + "datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], + ids=[ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "datetime64tz", + "timedelta64", + ], + ) + @pytest.mark.parametrize( + "from_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + "datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], ) def test_replace_series(self, how, to_key, from_key): - if from_key == 'bool' and how == 'series': + if from_key == "bool" and how == 
"series": # doesn't work in PY3, though ...dict_from_bool works fine pytest.skip("doesn't work as in PY3") - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if (from_key.startswith('datetime') and to_key.startswith('datetime')): + if from_key.startswith("datetime") and to_key.startswith("datetime"): # tested below return - elif from_key in ['datetime64[ns, US/Eastern]', 'datetime64[ns, UTC]']: + elif from_key in ["datetime64[ns, US/Eastern]", "datetime64[ns, UTC]"]: # tested below return - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - if ((from_key == 'float64' and to_key in ('int64')) or - (from_key == 'complex128' and - to_key in ('int64', 'float64'))): + if (from_key == "float64" and to_key in ("int64")) or ( + from_key == "complex128" and to_key in ("int64", "float64") + ): if compat.is_platform_32bit() or compat.is_platform_windows(): - pytest.skip("32-bit platform buggy: {0} -> {1}".format - (from_key, to_key)) + pytest.skip( + "32-bit platform buggy: {0} -> {1}".format(from_key, to_key) + ) # Expected: do not downcast by replacement - exp = pd.Series(self.rep[to_key], index=index, - name='yyy', dtype=from_key) + exp = pd.Series(self.rep[to_key], index=index, name="yyy", dtype=from_key) else: - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) # TODO(jbrockmendel) commented out to only have a single xfail printed - @pytest.mark.xfail(reason='GH #18376, tzawareness-compat bug ' - 'in BlockManager.replace_list') + @pytest.mark.xfail( + reason="GH #18376, tzawareness-compat bug " "in BlockManager.replace_list" + ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', ['timedelta64[ns]', 'bool', 'object', # 'complex128', 'float64', 'int64']) @@ -878,31 +1040,32 @@ def test_replace_series(self, how, to_key, from_key): # 'datetime64[ns, US/Eastern]']) # def test_replace_series_datetime_tz(self, how, to_key, from_key): def test_replace_series_datetime_tz(self): - how = 'series' - from_key = 'datetime64[ns, US/Eastern]' - to_key = 'timedelta64[ns]' + how = "series" + from_key = "datetime64[ns, US/Eastern]" + to_key = "timedelta64[ns]" - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) # TODO(jreback) commented out to only have a single xfail printed - @pytest.mark.xfail(reason="different tz, " - "currently mask_missing raises SystemError", - strict=False) + @pytest.mark.xfail( + reason="different tz, " 
"currently mask_missing raises SystemError", + strict=False, + ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', [ # 'datetime64[ns]', 'datetime64[ns, UTC]', @@ -912,23 +1075,23 @@ def test_replace_series_datetime_tz(self): # 'datetime64[ns, US/Eastern]']) # def test_replace_series_datetime_datetime(self, how, to_key, from_key): def test_replace_series_datetime_datetime(self): - how = 'dict' - to_key = 'datetime64[ns]' - from_key = 'datetime64[ns]' + how = "dict" + to_key = "datetime64[ns]" + from_key = "datetime64[ns]" - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 278fa6bd44f99b..31e9cff68445e1 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -9,14 +9,13 @@ class TestDatetimeIndex: - def test_setitem_with_datetime_tz(self): # 16889 # support .loc with alignment and tz-aware DatetimeIndex mask = np.array([True, False, True, False]) - idx = date_range('20010101', periods=4, tz='UTC') - df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64') + idx = date_range("20010101", periods=4, tz="UTC") + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") result = df.copy() result.loc[mask, :] = df.loc[mask, :] @@ -26,8 +25,8 @@ def test_setitem_with_datetime_tz(self): result.loc[mask] = df.loc[mask] tm.assert_frame_equal(result, df) - idx = date_range('20010101', periods=4) - df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64') + idx = date_range("20010101", periods=4) + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") result = df.copy() result.loc[mask, :] = df.loc[mask, :] @@ -42,31 +41,37 @@ def test_indexing_with_datetime_tz(self): # GH#8260 # support datetime64 with tz - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') - dr = date_range('20130110', periods=3) - df = DataFrame({'A': idx, 'B': dr}) - df['C'] = idx + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + dr = date_range("20130110", periods=3) + df = DataFrame({"A": idx, "B": dr}) + df["C"] = idx df.iloc[1, 1] = pd.NaT df.iloc[1, 2] = pd.NaT # indexing result = df.iloc[1] - expected = Series([Timestamp('2013-01-02 00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), np.nan, np.nan], + index=list("ABC"), + dtype="object", + name=1, + ) tm.assert_series_equal(result, expected) result = df.loc[1] - expected = Series([Timestamp('2013-01-02 00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), np.nan, np.nan], + index=list("ABC"), + dtype="object", + name=1, + ) 
tm.assert_series_equal(result, expected) # indexing - fast_xs - df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) + df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")}) result = df.iloc[5] - expected = Series([Timestamp('2014-01-06 00:00:00+0000', tz='UTC')], - index=['a'], name=5) + expected = Series( + [Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], index=["a"], name=5 + ) tm.assert_series_equal(result, expected) result = df.loc[5] @@ -78,34 +83,33 @@ def test_indexing_with_datetime_tz(self): tm.assert_frame_equal(result, expected) # indexing - setting an element - df = DataFrame(data=pd.to_datetime( - ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) - df['new_col'] = ['new', 'old'] - df.time = df.set_index('time').index.tz_localize('UTC') - v = df[df.new_col == 'new'].set_index('time').index.tz_convert( - 'US/Pacific') + df = DataFrame( + data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), + columns=["time"], + ) + df["new_col"] = ["new", "old"] + df.time = df.set_index("time").index.tz_localize("UTC") + v = df[df.new_col == "new"].set_index("time").index.tz_convert("US/Pacific") # trying to set a single element on a part of a different timezone # this converts to object df2 = df.copy() - df2.loc[df2.new_col == 'new', 'time'] = v + df2.loc[df2.new_col == "new", "time"] = v - expected = Series([v[0], df.loc[1, 'time']], name='time') + expected = Series([v[0], df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') - df.loc[df.new_col == 'new', 'time'] = v - tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) + v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") + df.loc[df.new_col == "new", "time"] = v + tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) def test_consistency_with_tz_aware_scalar(self): # xef gh-12938 # various ways of indexing the same tz-aware scalar - df = Series([Timestamp('2016-03-30 14:35:25', - tz='Europe/Brussels')]).to_frame() + df = Series([Timestamp("2016-03-30 14:35:25", tz="Europe/Brussels")]).to_frame() df = pd.concat([df, df]).reset_index(drop=True) - expected = Timestamp('2016-03-30 14:35:25+0200', - tz='Europe/Brussels') + expected = Timestamp("2016-03-30 14:35:25+0200", tz="Europe/Brussels") result = df[0][0] assert result == expected @@ -132,9 +136,9 @@ def test_indexing_with_datetimeindex_tz(self): # GH 12050 # indexing on a series with a datetimeindex with tz - index = date_range('2015-01-01', periods=2, tz='utc') + index = date_range("2015-01-01", periods=2, tz="utc") - ser = Series(range(2), index=index, dtype='int64') + ser = Series(range(2), index=index, dtype="int64") # list-like indexing @@ -181,108 +185,128 @@ def test_partial_setting_with_datetimelike_dtype(self): # GH9478 # a datetimeindex alignment issue with partial setting - df = DataFrame(np.arange(6.).reshape(3, 2), columns=list('AB'), - index=date_range('1/1/2000', periods=3, freq='1H')) + df = DataFrame( + np.arange(6.0).reshape(3, 2), + columns=list("AB"), + index=date_range("1/1/2000", periods=3, freq="1H"), + ) expected = df.copy() - expected['C'] = [expected.index[0]] + [pd.NaT, pd.NaT] + expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] mask = df.A < 1 - df.loc[mask, 'C'] = df.loc[mask].index + df.loc[mask, "C"] = df.loc[mask].index tm.assert_frame_equal(df, expected) def test_loc_setitem_datetime(self): # GH 9516 - dt1 = Timestamp('20130101 09:00:00') - dt2 = Timestamp('20130101 
10:00:00') + dt1 = Timestamp("20130101 09:00:00") + dt2 = Timestamp("20130101 10:00:00") - for conv in [lambda x: x, lambda x: x.to_datetime64(), - lambda x: x.to_pydatetime(), lambda x: np.datetime64(x)]: + for conv in [ + lambda x: x, + lambda x: x.to_datetime64(), + lambda x: x.to_pydatetime(), + lambda x: np.datetime64(x), + ]: df = DataFrame() - df.loc[conv(dt1), 'one'] = 100 - df.loc[conv(dt2), 'one'] = 200 + df.loc[conv(dt1), "one"] = 100 + df.loc[conv(dt2), "one"] = 200 - expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) + expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) tm.assert_frame_equal(df, expected) def test_series_partial_set_datetime(self): # GH 11497 - idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = date_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") - result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] - exp = Series([0.1, 0.2], index=idx, name='s') + result = ser.loc[[Timestamp("2011-01-01"), Timestamp("2011-01-02")]] + exp = Series([0.1, 0.2], index=idx, name="s") tm.assert_series_equal(result, exp, check_index_type=True) - keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), - Timestamp('2011-01-01')] - exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), - name='s') + keys = [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-01"), + ] + exp = Series( + [0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name="idx"), name="s" + ) tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), - Timestamp('2011-01-03')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.DatetimeIndex(keys, name='idx'), name='s') - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + keys = [ + Timestamp("2011-01-03"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ] + exp = Series( + [np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name="idx"), name="s" + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) def test_series_partial_set_period(self): # GH 11497 - idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = pd.period_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") - result = ser.loc[[pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-02', freq='D')]] - exp = Series([0.1, 0.2], index=idx, name='s') + result = ser.loc[ + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-01-02", freq="D")] + ] + exp = Series([0.1, 0.2], index=idx, name="s") tm.assert_series_equal(result, exp, check_index_type=True) - keys = [pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-01', freq='D')] - exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), - name='s') + keys = [ + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-01", freq="D"), + ] + exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name="idx"), name="s") tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - keys = [pd.Period('2011-01-03', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-03', freq='D')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.PeriodIndex(keys, 
name='idx'), name='s') - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + keys = [ + pd.Period("2011-01-03", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-03", freq="D"), + ] + exp = Series( + [np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name="idx"), name="s" + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[keys] tm.assert_series_equal(result, exp) def test_nanosecond_getitem_setitem_with_tz(self): # GH 11679 - data = ['2016-06-28 08:30:00.123456789'] - index = pd.DatetimeIndex(data, dtype='datetime64[ns, America/Chicago]') - df = DataFrame({'a': [10]}, index=index) + data = ["2016-06-28 08:30:00.123456789"] + index = pd.DatetimeIndex(data, dtype="datetime64[ns, America/Chicago]") + df = DataFrame({"a": [10]}, index=index) result = df.loc[df.index[0]] - expected = Series(10, index=['a'], name=df.index[0]) + expected = Series(10, index=["a"], name=df.index[0]) tm.assert_series_equal(result, expected) result = df.copy() - result.loc[df.index[0], 'a'] = -1 - expected = DataFrame(-1, index=index, columns=['a']) + result.loc[df.index[0], "a"] = -1 + expected = DataFrame(-1, index=index, columns=["a"]) tm.assert_frame_equal(result, expected) def test_loc_getitem_across_dst(self): # GH 21846 - idx = pd.date_range('2017-10-29 01:30:00', - tz='Europe/Berlin', periods=5, freq='30 min') - series2 = pd.Series([0, 1, 2, 3, 4], - index=idx) - - t_1 = pd.Timestamp('2017-10-29 02:30:00+02:00', tz='Europe/Berlin', - freq='30min') - t_2 = pd.Timestamp('2017-10-29 02:00:00+01:00', tz='Europe/Berlin', - freq='30min') + idx = pd.date_range( + "2017-10-29 01:30:00", tz="Europe/Berlin", periods=5, freq="30 min" + ) + series2 = pd.Series([0, 1, 2, 3, 4], index=idx) + + t_1 = pd.Timestamp( + "2017-10-29 02:30:00+02:00", tz="Europe/Berlin", freq="30min" + ) + t_2 = pd.Timestamp( + "2017-10-29 02:00:00+01:00", tz="Europe/Berlin", freq="30min" + ) result = series2.loc[t_1:t_2] expected = pd.Series([2, 3], index=idx[2:4]) tm.assert_series_equal(result, expected) @@ -303,20 +327,22 @@ def test_loc_incremental_setitem_with_dst(self): def test_loc_setitem_with_existing_dst(self): # GH 18308 - start = pd.Timestamp('2017-10-29 00:00:00+0200', tz='Europe/Madrid') - end = pd.Timestamp('2017-10-29 03:00:00+0100', tz='Europe/Madrid') - ts = pd.Timestamp('2016-10-10 03:00:00', tz='Europe/Madrid') - idx = pd.date_range(start, end, closed='left', freq="H") - result = pd.DataFrame(index=idx, columns=['value']) - result.loc[ts, 'value'] = 12 - expected = pd.DataFrame([np.nan] * len(idx) + [12], - index=idx.append(pd.DatetimeIndex([ts])), - columns=['value'], - dtype=object) + start = pd.Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") + end = pd.Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") + ts = pd.Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") + idx = pd.date_range(start, end, closed="left", freq="H") + result = pd.DataFrame(index=idx, columns=["value"]) + result.loc[ts, "value"] = 12 + expected = pd.DataFrame( + [np.nan] * len(idx) + [12], + index=idx.append(pd.DatetimeIndex([ts])), + columns=["value"], + dtype=object, + ) tm.assert_frame_equal(result, expected) def test_loc_str_slicing(self): - ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") ser = ix.to_series() result = ser.loc[:"2017-12"] expected = ser.iloc[:-1] @@ -324,9 +350,9 @@ def test_loc_str_slicing(self): tm.assert_series_equal(result, expected) def 
test_loc_label_slicing(self): - ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") ser = ix.to_series() - result = ser.loc[:ix[-2]] + result = ser.loc[: ix[-2]] expected = ser.iloc[:-1] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index ada613110d9bf9..78ff6580bb1e1d 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -3,8 +3,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series) +from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal, assert_series_equal @@ -12,7 +11,6 @@ class TestFloatIndexers: - def check(self, result, original, indexer, getitem): """ comparator for results @@ -38,23 +36,29 @@ def test_scalar_error(self): # but is specifically testing for the error # message - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex, tm.makeIntIndex, - tm.makeRangeIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + tm.makeIntIndex, + tm.makeRangeIndex, + ]: i = index(5) s = Series(np.arange(len(i)), index=i) - msg = 'Cannot index by location index' + msg = "Cannot index by location index" with pytest.raises(TypeError, match=msg): s.iloc[3.0] - msg = ("cannot do positional indexing on {klass} with these " - r"indexers \[3\.0\] of {kind}".format( - klass=type(i), kind=str(float))) + msg = ( + "cannot do positional indexing on {klass} with these " + r"indexers \[3\.0\] of {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 @@ -65,22 +69,28 @@ def test_scalar_non_numeric(self): # float_indexers should raise exceptions # on appropriate Index types & accessors - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: i = index(5) - for s in [Series( - np.arange(len(i)), index=i), DataFrame( - np.random.randn( - len(i), len(i)), index=i, columns=i)]: + for s in [ + Series(np.arange(len(i)), index=i), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: # getting - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.iloc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.iloc, False), + (lambda x: x, True), + ]: # gettitem on a DataFrame is a KeyError as it is indexing # via labels on the columns @@ -89,26 +99,28 @@ def test_scalar_non_numeric(self): msg = r"^3(\.0)?$" else: error = TypeError - msg = (r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}|" - "Cannot index by location index with a" - " non-integer key" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index|positional) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}|" + "Cannot index by location index with a" + " non-integer key".format(klass=type(i), 
kind=str(float)) + ) with catch_warnings(record=True): with pytest.raises(error, match=msg): idxr(s)[3.0] # label based can be a TypeError or KeyError - if s.index.inferred_type in ['string', 'unicode', 'mixed']: + if s.index.inferred_type in ["string", "unicode", "mixed"]: error = KeyError msg = r"^3$" else: error = TypeError - msg = (r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(error, match=msg): s.loc[3.0] @@ -116,19 +128,19 @@ def test_scalar_non_numeric(self): assert 3.0 not in s # setting with a float fails with iloc - msg = (r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index|positional) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 # setting with an indexer - if s.index.inferred_type in ['categorical']: + if s.index.inferred_type in ["categorical"]: # Value or Type Error pass - elif s.index.inferred_type in ['datetime64', 'timedelta64', - 'period']: + elif s.index.inferred_type in ["datetime64", "timedelta64", "period"]: # these should prob work # and are inconsisten between series/dataframe ATM @@ -146,8 +158,7 @@ def test_scalar_non_numeric(self): s2.loc[3.0] = 10 assert s2.index.is_object() - for idxr in [lambda x: x.ix, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x]: s2 = s.copy() with catch_warnings(record=True): idxr(s2)[3.0] = 0 @@ -156,30 +167,32 @@ def test_scalar_non_numeric(self): # fallsback to position selection, series only s = Series(np.arange(len(i)), index=i) s[3] - msg = (r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[3.0] @ignore_ix def test_scalar_with_mixed(self): - s2 = Series([1, 2, 3], index=['a', 'b', 'c']) - s3 = Series([1, 2, 3], index=['a', 'b', 1.5]) + s2 = Series([1, 2, 3], index=["a", "b", "c"]) + s3 = Series([1, 2, 3], index=["a", "b", 1.5]) # lookup in a pure stringstr # with an invalid indexer - for idxr in [lambda x: x.ix, - lambda x: x, - lambda x: x.iloc]: - - msg = (r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}|" - "Cannot index by location index with a non-integer key" - .format(klass=str(Index), kind=str(float))) + for idxr in [lambda x: x.ix, lambda x: x, lambda x: x.iloc]: + + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}|" + "Cannot index by location index with a non-integer key".format( + klass=str(Index), kind=str(float) + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s2)[1.0] @@ -187,7 +200,7 @@ def test_scalar_with_mixed(self): with pytest.raises(KeyError, match=r"^1$"): s2.loc[1.0] - result = s2.loc['b'] + result = s2.loc["b"] expected = 2 assert result == expected @@ -195,10 +208,11 @@ def test_scalar_with_mixed(self): # indexing for idxr in [lambda x: x]: - msg = (r"cannot do 
label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}" - .format(klass=str(Index), kind=str(float))) + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}".format(klass=str(Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -211,10 +225,11 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x.ix]: with catch_warnings(record=True): - msg = (r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}" - .format(klass=str(Index), kind=str(float))) + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}".format(klass=str(Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -240,36 +255,41 @@ def test_scalar_integer(self): # integer index for i in [Int64Index(range(5)), RangeIndex(5)]: - for s in [Series(np.arange(len(i))), - DataFrame(np.random.randn(len(i), len(i)), - index=i, columns=i)]: + for s in [ + Series(np.arange(len(i))), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: # coerce to equal int - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: with catch_warnings(record=True): result = idxr(s)[3.0] self.check(result, s, 3, getitem) # coerce to equal int - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: if isinstance(s, Series): + def compare(x, y): assert x == y + expected = 100 else: compare = tm.assert_series_equal if getitem: - expected = Series(100, - index=range(len(s)), name=3) + expected = Series(100, index=range(len(s)), name=3) else: - expected = Series(100., - index=range(len(s)), name=3) + expected = Series(100.0, index=range(len(s)), name=3) s2 = s.copy() with catch_warnings(record=True): @@ -289,16 +309,21 @@ def compare(x, y): def test_scalar_float(self): # scalar float indexers work on a float index - index = Index(np.arange(5.)) - for s in [Series(np.arange(len(index)), index=index), - DataFrame(np.random.randn(len(index), len(index)), - index=index, columns=index)]: + index = Index(np.arange(5.0)) + for s in [ + Series(np.arange(len(index)), index=index), + DataFrame( + np.random.randn(len(index), len(index)), index=index, columns=index + ), + ]: # assert all operations except for iloc are ok indexer = index[3] - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: # getting result = idxr(s)[indexer] @@ -332,10 +357,11 @@ def test_scalar_float(self): with pytest.raises(TypeError, match=msg): s.iloc[3.0] - msg = (r"cannot do positional indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=str(Float64Index), kind=str(float))) + msg = ( + r"cannot do positional indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=str(Float64Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s2.iloc[3.0] = 0 @@ -346,65 +372,79 @@ def test_slice_non_numeric(self): # float_indexers should raise exceptions # on appropriate Index types & accessors - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - 
tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: index = index(5) - for s in [Series(range(5), index=index), - DataFrame(np.random.randn(5, 2), index=index)]: + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: # getitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[l] - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x.iloc, - lambda x: x]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})" - .format(klass=type(index), - kind_float=str(float), - kind_int=str(int))) + for idxr in [ + lambda x: x.ix, + lambda x: x.loc, + lambda x: x.iloc, + lambda x: x, + ]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers" + r" \[(3|4)(\.0)?\]" + r" of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s)[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[l] = 0 - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x.iloc, - lambda x: x]: - msg = ("cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})" - .format(klass=type(index), - kind_float=str(float), - kind_int=str(int))) + for idxr in [ + lambda x: x.ix, + lambda x: x.loc, + lambda x: x.iloc, + lambda x: x, + ]: + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers" + r" \[(3|4)(\.0)?\]" + r" of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s)[l] = 0 @@ -416,20 +456,19 @@ def test_slice_integer(self): # these coerce to a like integer # oob indicates if we are out of bounds # of positional indexing - for index, oob in [(Int64Index(range(5)), False), - (RangeIndex(5), False), - (Int64Index(range(5)) + 10, True)]: + for index, oob in [ + (Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True), + ]: # s is an in-range index s = Series(range(5), index=index) # getitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -444,19 +483,18 @@ def 
test_slice_integer(self): self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # getitem out-of-bounds - for l in [slice(-6, 6), - slice(-6.0, 6.0)]: + for l in [slice(-6, 6), slice(-6.0, 6.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -470,20 +508,22 @@ def test_slice_integer(self): self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[-6\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[-6\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[slice(-6.0, 6.0)] # getitem odd floats - for l, res1 in [(slice(2.5, 4), slice(3, 5)), - (slice(2, 3.5), slice(2, 4)), - (slice(2.5, 3.5), slice(3, 4))]: + for l, res1 in [ + (slice(2.5, 4), slice(3, 5)), + (slice(2, 3.5), slice(2, 4)), + (slice(2.5, 3.5), slice(3, 4)), + ]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -495,20 +535,18 @@ def test_slice_integer(self): self.check(result, s, res, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(2|3)\.5\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(2|3)\.5\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: sc = s.copy() with catch_warnings(record=True): idxr(sc)[l] = 0 @@ -516,10 +554,11 @@ def test_slice_integer(self): assert (result == 0).all() # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -533,18 +572,16 @@ def test_integer_positional_indexing(self): expected = s.iloc[2:4] assert_series_equal(result, expected) - for idxr in [lambda x: x, - lambda x: x.iloc]: + for idxr in [lambda x: x, lambda x: x.iloc]: - for l in [slice(2, 4.0), - slice(2.0, 4), - slice(2.0, 4.0)]: + for l in [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)]: klass = RangeIndex - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(2|4)\.0\] of" - " {kind}" - .format(klass=str(klass), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(2|4)\.0\] of" + " {kind}".format(klass=str(klass), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s)[l] @@ -559,57 +596,57 @@ def 
test_slice_integer_frame_getitem(self): def f(idxr): # getitem - for l in [slice(0.0, 1), - slice(0, 1.0), - slice(0.0, 1.0)]: + for l in [slice(0.0, 1), slice(0, 1.0), slice(0.0, 1.0)]: result = idxr(s)[l] indexer = slice(0, 2) self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(0|1)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # getitem out-of-bounds - for l in [slice(-10, 10), - slice(-10.0, 10.0)]: + for l in [slice(-10, 10), slice(-10.0, 10.0)]: result = idxr(s)[l] self.check(result, s, slice(-10, 10), True) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[-10\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[-10\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[slice(-10.0, 10.0)] # getitem odd floats - for l, res in [(slice(0.5, 1), slice(1, 2)), - (slice(0, 0.5), slice(0, 1)), - (slice(0.5, 1.5), slice(1, 2))]: + for l, res in [ + (slice(0.5, 1), slice(1, 2)), + (slice(0, 0.5), slice(0, 1)), + (slice(0.5, 1.5), slice(1, 2)), + ]: result = idxr(s)[l] self.check(result, s, res, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[0\.5\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[0\.5\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: sc = s.copy() idxr(sc)[l] = 0 @@ -617,10 +654,11 @@ def f(idxr): assert (result == 0).all() # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -632,18 +670,16 @@ def f(idxr): def test_slice_float(self): # same as above, but for floats - index = Index(np.arange(5.)) + 0.1 - for s in [Series(range(5), index=index), - DataFrame(np.random.randn(5, 2), index=index)]: + index = Index(np.arange(5.0)) + 0.1 + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: expected = s.iloc[3:4] - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: # getitem with catch_warnings(record=True): @@ -714,7 +750,7 @@ def test_floating_misc(self): assert_series_equal(s.loc[fancy_idx], expected) assert_series_equal(s.loc[fancy_idx], expected) - expected = Series([2, 0], index=Index([5, 0], dtype='int64')) + expected = Series([2, 0], index=Index([5, 0], dtype="int64")) for fancy_idx in [[5, 0], np.array([5, 0])]: # int assert_series_equal(s[fancy_idx], expected) 
assert_series_equal(s.loc[fancy_idx], expected) @@ -771,8 +807,7 @@ def test_floating_misc(self): result3 = s.loc[[1.6, 5, 10]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) - assert_series_equal(result1, Series( - [np.nan, 2, 4], index=[1.6, 5, 10])) + assert_series_equal(result1, Series([np.nan, 2, 4], index=[1.6, 5, 10])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result1 = s[[0, 1, 2]] @@ -782,8 +817,7 @@ def test_floating_misc(self): result3 = s.loc[[0, 1, 2]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) - assert_series_equal(result1, Series( - [0.0, np.nan, np.nan], index=[0, 1, 2])) + assert_series_equal(result1, Series([0.0, np.nan, np.nan], index=[0, 1, 2])) result1 = s.loc[[2.5, 5]] result2 = s.loc[[2.5, 5]] @@ -799,168 +833,170 @@ def test_floating_misc(self): def test_floating_tuples(self): # see gh-13509 - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name='foo') + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name="foo") result = s[0.0] assert result == (1, 1) - expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo') - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name='foo') + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name="foo") result = s[0.0] tm.assert_series_equal(result, expected) def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index - ser = {256: 2321.0, - 1: 78.0, - 2: 2716.0, - 3: 0.0, - 4: 369.0, - 5: 0.0, - 6: 269.0, - 7: 0.0, - 8: 0.0, - 9: 0.0, - 10: 3536.0, - 11: 0.0, - 12: 24.0, - 13: 0.0, - 14: 931.0, - 15: 0.0, - 16: 101.0, - 17: 78.0, - 18: 9643.0, - 19: 0.0, - 20: 0.0, - 21: 0.0, - 22: 63761.0, - 23: 0.0, - 24: 446.0, - 25: 0.0, - 26: 34773.0, - 27: 0.0, - 28: 729.0, - 29: 78.0, - 30: 0.0, - 31: 0.0, - 32: 3374.0, - 33: 0.0, - 34: 1391.0, - 35: 0.0, - 36: 361.0, - 37: 0.0, - 38: 61808.0, - 39: 0.0, - 40: 0.0, - 41: 0.0, - 42: 6677.0, - 43: 0.0, - 44: 802.0, - 45: 0.0, - 46: 2691.0, - 47: 0.0, - 48: 3582.0, - 49: 0.0, - 50: 734.0, - 51: 0.0, - 52: 627.0, - 53: 70.0, - 54: 2584.0, - 55: 0.0, - 56: 324.0, - 57: 0.0, - 58: 605.0, - 59: 0.0, - 60: 0.0, - 61: 0.0, - 62: 3989.0, - 63: 10.0, - 64: 42.0, - 65: 0.0, - 66: 904.0, - 67: 0.0, - 68: 88.0, - 69: 70.0, - 70: 8172.0, - 71: 0.0, - 72: 0.0, - 73: 0.0, - 74: 64902.0, - 75: 0.0, - 76: 347.0, - 77: 0.0, - 78: 36605.0, - 79: 0.0, - 80: 379.0, - 81: 70.0, - 82: 0.0, - 83: 0.0, - 84: 3001.0, - 85: 0.0, - 86: 1630.0, - 87: 7.0, - 88: 364.0, - 89: 0.0, - 90: 67404.0, - 91: 9.0, - 92: 0.0, - 93: 0.0, - 94: 7685.0, - 95: 0.0, - 96: 1017.0, - 97: 0.0, - 98: 2831.0, - 99: 0.0, - 100: 2963.0, - 101: 0.0, - 102: 854.0, - 103: 0.0, - 104: 0.0, - 105: 0.0, - 106: 0.0, - 107: 0.0, - 108: 0.0, - 109: 0.0, - 110: 0.0, - 111: 0.0, - 112: 0.0, - 113: 0.0, - 114: 0.0, - 115: 0.0, - 116: 0.0, - 117: 0.0, - 118: 0.0, - 119: 0.0, - 120: 0.0, - 121: 0.0, - 122: 0.0, - 123: 0.0, - 124: 0.0, - 125: 0.0, - 126: 67744.0, - 127: 22.0, - 128: 264.0, - 129: 0.0, - 260: 197.0, - 268: 0.0, - 265: 0.0, - 269: 0.0, - 261: 0.0, - 266: 1198.0, - 267: 0.0, - 262: 2629.0, - 258: 775.0, - 257: 0.0, - 263: 0.0, - 259: 0.0, - 264: 163.0, - 250: 10326.0, - 251: 0.0, - 252: 1228.0, - 253: 0.0, - 254: 2769.0, - 255: 0.0} + ser = { + 256: 2321.0, + 1: 78.0, + 2: 2716.0, + 3: 0.0, + 4: 369.0, + 5: 0.0, + 6: 269.0, + 7: 0.0, + 8: 0.0, + 9: 0.0, + 10: 3536.0, + 11: 0.0, + 12: 24.0, + 13: 0.0, + 14: 931.0, 
+ 15: 0.0, + 16: 101.0, + 17: 78.0, + 18: 9643.0, + 19: 0.0, + 20: 0.0, + 21: 0.0, + 22: 63761.0, + 23: 0.0, + 24: 446.0, + 25: 0.0, + 26: 34773.0, + 27: 0.0, + 28: 729.0, + 29: 78.0, + 30: 0.0, + 31: 0.0, + 32: 3374.0, + 33: 0.0, + 34: 1391.0, + 35: 0.0, + 36: 361.0, + 37: 0.0, + 38: 61808.0, + 39: 0.0, + 40: 0.0, + 41: 0.0, + 42: 6677.0, + 43: 0.0, + 44: 802.0, + 45: 0.0, + 46: 2691.0, + 47: 0.0, + 48: 3582.0, + 49: 0.0, + 50: 734.0, + 51: 0.0, + 52: 627.0, + 53: 70.0, + 54: 2584.0, + 55: 0.0, + 56: 324.0, + 57: 0.0, + 58: 605.0, + 59: 0.0, + 60: 0.0, + 61: 0.0, + 62: 3989.0, + 63: 10.0, + 64: 42.0, + 65: 0.0, + 66: 904.0, + 67: 0.0, + 68: 88.0, + 69: 70.0, + 70: 8172.0, + 71: 0.0, + 72: 0.0, + 73: 0.0, + 74: 64902.0, + 75: 0.0, + 76: 347.0, + 77: 0.0, + 78: 36605.0, + 79: 0.0, + 80: 379.0, + 81: 70.0, + 82: 0.0, + 83: 0.0, + 84: 3001.0, + 85: 0.0, + 86: 1630.0, + 87: 7.0, + 88: 364.0, + 89: 0.0, + 90: 67404.0, + 91: 9.0, + 92: 0.0, + 93: 0.0, + 94: 7685.0, + 95: 0.0, + 96: 1017.0, + 97: 0.0, + 98: 2831.0, + 99: 0.0, + 100: 2963.0, + 101: 0.0, + 102: 854.0, + 103: 0.0, + 104: 0.0, + 105: 0.0, + 106: 0.0, + 107: 0.0, + 108: 0.0, + 109: 0.0, + 110: 0.0, + 111: 0.0, + 112: 0.0, + 113: 0.0, + 114: 0.0, + 115: 0.0, + 116: 0.0, + 117: 0.0, + 118: 0.0, + 119: 0.0, + 120: 0.0, + 121: 0.0, + 122: 0.0, + 123: 0.0, + 124: 0.0, + 125: 0.0, + 126: 67744.0, + 127: 22.0, + 128: 264.0, + 129: 0.0, + 260: 197.0, + 268: 0.0, + 265: 0.0, + 269: 0.0, + 261: 0.0, + 266: 1198.0, + 267: 0.0, + 262: 2629.0, + 258: 775.0, + 257: 0.0, + 263: 0.0, + 259: 0.0, + 264: 163.0, + 250: 10326.0, + 251: 0.0, + 252: 1228.0, + 253: 0.0, + 254: 2769.0, + 255: 0.0, + } # smoke test for the repr s = Series(ser) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 8b54907131b8c2..760d8c70b94342 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -13,15 +13,14 @@ class TestiLoc(Base): - def test_iloc_exceeds_bounds(self): # GH6296 # iloc should allow indexers that exceed the bounds - df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) + df = DataFrame(np.random.random_sample((20, 5)), columns=list("ABCDE")) # lists of positions should raise IndexError! 
- msg = 'positional indexers are out-of-bounds' + msg = "positional indexers are out-of-bounds" with pytest.raises(IndexError, match=msg): df.iloc[:, [0, 1, 2, 3, 4, 5]] with pytest.raises(IndexError, match=msg): @@ -31,14 +30,14 @@ def test_iloc_exceeds_bounds(self): with pytest.raises(IndexError, match=msg): df.iloc[[100]] - s = df['A'] + s = df["A"] with pytest.raises(IndexError, match=msg): s.iloc[[100]] with pytest.raises(IndexError, match=msg): s.iloc[[-100]] # still raise on a single indexer - msg = 'single positional indexer is out-of-bounds' + msg = "single positional indexer is out-of-bounds" with pytest.raises(IndexError, match=msg): df.iloc[30] with pytest.raises(IndexError, match=msg): @@ -104,7 +103,7 @@ def check(result, expected): result.dtypes tm.assert_frame_equal(result, expected) - dfl = DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfl = DataFrame(np.random.randn(5, 2), columns=list("AB")) check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) check(dfl.iloc[4:6], dfl.iloc[[4]]) @@ -116,40 +115,57 @@ def check(result, expected): with pytest.raises(IndexError, match=msg): dfl.iloc[:, 4] - @pytest.mark.parametrize("index,columns", [(np.arange(20), list('ABCDE'))]) - @pytest.mark.parametrize("index_vals,column_vals", [ - ([slice(None), ['A', 'D']]), - (['1', '2'], slice(None)), - ([pd.datetime(2019, 1, 1)], slice(None))]) - def test_iloc_non_integer_raises(self, index, columns, - index_vals, column_vals): + @pytest.mark.parametrize("index,columns", [(np.arange(20), list("ABCDE"))]) + @pytest.mark.parametrize( + "index_vals,column_vals", + [ + ([slice(None), ["A", "D"]]), + (["1", "2"], slice(None)), + ([pd.datetime(2019, 1, 1)], slice(None)), + ], + ) + def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): # GH 25753 - df = DataFrame(np.random.randn(len(index), len(columns)), - index=index, - columns=columns) - msg = '.iloc requires numeric indexers, got' + df = DataFrame( + np.random.randn(len(index), len(columns)), index=index, columns=columns + ) + msg = ".iloc requires numeric indexers, got" with pytest.raises(IndexError, match=msg): df.iloc[index_vals, column_vals] def test_iloc_getitem_int(self): # integer - self.check_result('integer', 'iloc', 2, 'ix', - {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints']) - self.check_result('integer', 'iloc', 2, 'indexer', 2, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "integer", "iloc", 2, "ix", {0: 4, 1: 6, 2: 8}, typs=["ints", "uints"] + ) + self.check_result( + "integer", + "iloc", + 2, + "indexer", + 2, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_neg_int(self): # neg integer - self.check_result('neg int', 'iloc', -1, 'ix', - {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints']) - self.check_result('neg int', 'iloc', -1, 'indexer', -1, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - @pytest.mark.parametrize('dims', [1, 2]) + self.check_result( + "neg int", "iloc", -1, "ix", {0: 6, 1: 9, 2: 12}, typs=["ints", "uints"] + ) + self.check_result( + "neg int", + "iloc", + -1, + "indexer", + -1, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + @pytest.mark.parametrize("dims", [1, 2]) def test_iloc_getitem_invalid_scalar(self, dims): # GH 21982 @@ -158,19 +174,18 @@ def test_iloc_getitem_invalid_scalar(self, dims): else: s = DataFrame(np.arange(100).reshape(10, 10)) - with pytest.raises(TypeError, match='Cannot 
index by location index'): - s.iloc['a'] + with pytest.raises(TypeError, match="Cannot index by location index"): + s.iloc["a"] def test_iloc_array_not_mutating_negative_indices(self): # GH 21867 array_with_neg_numbers = np.array([1, 2, -1]) array_copy = array_with_neg_numbers.copy() - df = pd.DataFrame({ - 'A': [100, 101, 102], - 'B': [103, 104, 105], - 'C': [106, 107, 108]}, - index=[1, 2, 3]) + df = pd.DataFrame( + {"A": [100, 101, 102], "B": [103, 104, 105], "C": [106, 107, 108]}, + index=[1, 2, 3], + ) df.iloc[array_with_neg_numbers] tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy) df.iloc[:, array_with_neg_numbers] @@ -179,33 +194,65 @@ def test_iloc_array_not_mutating_negative_indices(self): def test_iloc_getitem_list_int(self): # list of ints - self.check_result('list int', 'iloc', [0, 1, 2], 'ix', - {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=['ints', 'uints']) - self.check_result('list int', 'iloc', [2], 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "list int", + "iloc", + [0, 1, 2], + "ix", + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "list int", + "iloc", + [2], + "ix", + {0: [4], 1: [6], 2: [8]}, + typs=["ints", "uints"], + ) + self.check_result( + "list int", + "iloc", + [0, 1, 2], + "indexer", + [0, 1, 2], + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) # array of ints (GH5006), make sure that a single indexer is returning # the correct type - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix', - {0: [0, 2, 4], - 1: [0, 3, 6], - 2: [0, 4, 8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([2]), 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', - [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "array int", + "iloc", + np.array([0, 1, 2]), + "ix", + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "array int", + "iloc", + np.array([2]), + "ix", + {0: [4], 1: [6], 2: [8]}, + typs=["ints", "uints"], + ) + self.check_result( + "array int", + "iloc", + np.array([0, 1, 2]), + "indexer", + [0, 1, 2], + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_neg_int_can_reach_first_index(self): # GH10547 and GH10779 # negative integers should be able to reach index 0 - df = DataFrame({'A': [2, 3, 5], 'B': [7, 11, 13]}) - s = df['A'] + df = DataFrame({"A": [2, 3, 5], "B": [7, 11, 13]}) + s = df["A"] expected = df.iloc[0] result = df.iloc[-3] @@ -224,19 +271,25 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): tm.assert_series_equal(result, expected) # check the length 1 Series case highlighted in GH10547 - expected = Series(['a'], index=['A']) + expected = Series(["a"], index=["A"]) result = expected.iloc[[-1]] tm.assert_series_equal(result, expected) def test_iloc_getitem_dups(self): - self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', - {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - objs=['series', 'frame'], typs=['ints', 'uints']) + self.check_result( + "list int (dups)", + "iloc", + [0, 1, 1, 3], + "ix", + {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, + objs=["series", "frame"], + typs=["ints", "uints"], + ) # GH 6766 - df1 = 
DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) df = concat([df1, df2], axis=1) # cross-sectional indexing @@ -244,53 +297,74 @@ def test_iloc_getitem_dups(self): assert isna(result) result = df.iloc[0, :] - expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], - name=0) + expected = Series([np.nan, 1, 3, 3], index=["A", "B", "A", "B"], name=0) tm.assert_series_equal(result, expected) def test_iloc_getitem_array(self): # array like s = Series(index=range(1, 4)) - self.check_result('array like', 'iloc', s.index, 'ix', - {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, - typs=['ints', 'uints']) + self.check_result( + "array like", + "iloc", + s.index, + "ix", + {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, + typs=["ints", "uints"], + ) def test_iloc_getitem_bool(self): # boolean indexers - b = [True, False, True, False, ] - self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints']) - self.check_result('bool', 'iloc', b, 'ix', b, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - @pytest.mark.parametrize('index', [[True, False], - [True, False, True, False]]) + b = [True, False, True, False] + self.check_result("bool", "iloc", b, "ix", b, typs=["ints", "uints"]) + self.check_result( + "bool", + "iloc", + b, + "ix", + b, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_iloc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises(IndexError, - match=('Item wrong length {} instead of {}.'.format( - len(index), len(s)))): + with pytest.raises( + IndexError, + match=("Item wrong length {} instead of {}.".format(len(index), len(s))), + ): _ = s.iloc[index] def test_iloc_getitem_slice(self): # slices - self.check_result('slice', 'iloc', slice(1, 3), 'ix', - {0: [2, 4], 1: [3, 6], 2: [4, 8]}, - typs=['ints', 'uints']) - self.check_result('slice', 'iloc', slice(1, 3), 'indexer', - slice(1, 3), - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "slice", + "iloc", + slice(1, 3), + "ix", + {0: [2, 4], 1: [3, 6], 2: [4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "slice", + "iloc", + slice(1, 3), + "indexer", + slice(1, 3), + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_slice_dups(self): - df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), - columns=['A', 'C']) + df1 = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df2 = DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ) # axis=1 df = concat([df1, df2], axis=1) @@ -332,14 +406,17 @@ def test_iloc_setitem(self): def test_iloc_setitem_list(self): # setitem with an iloc list - df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], - columns=["A", "B", "C"]) + df = DataFrame( + np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"] + ) df.iloc[[0, 1], [1, 2]] df.iloc[[0, 1], [1, 2]] += 100 expected = DataFrame( np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), - index=["A", "B", "C"], columns=["A", "B", "C"]) + index=["A", "B", "C"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(df, expected) 
def test_iloc_setitem_pandas_object(self): @@ -359,12 +436,12 @@ def test_iloc_setitem_dups(self): # GH 6766 # iloc with a mask aligning from another iloc - df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected['A'] = expected['A'].astype('float64') + expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -372,8 +449,8 @@ def test_iloc_setitem_dups(self): # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) - expected.columns = ['B', 'B'] - del df['A'] + expected.columns = ["B", "B"] + del df["A"] tm.assert_frame_equal(df, expected) # assign back to self @@ -381,15 +458,14 @@ def test_iloc_setitem_dups(self): tm.assert_frame_equal(df, expected) # reversed x 2 - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) def test_iloc_getitem_frame(self): - df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), - columns=range(0, 8, 2)) + df = DataFrame( + np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0, 8, 2) + ) result = df.iloc[2] with catch_warnings(record=True): @@ -453,20 +529,21 @@ def test_iloc_getitem_frame(self): def test_iloc_getitem_labelled_frame(self): # try with labelled frame - df = DataFrame(np.random.randn(10, 4), - index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) result = df.iloc[1, 1] - exp = df.loc['b', 'B'] + exp = df.loc["b", "B"] assert result == exp result = df.iloc[:, 2:3] - expected = df.loc[:, ['C']] + expected = df.loc[:, ["C"]] tm.assert_frame_equal(result, expected) # negative indexing result = df.iloc[-1, -1] - exp = df.loc['j', 'D'] + exp = df.loc["j", "D"] assert result == exp # out-of-bounds exception @@ -475,11 +552,13 @@ def test_iloc_getitem_labelled_frame(self): df.iloc[10, 5] # trying to use a label - msg = (r"Location based indexing can only have \[integer, integer" - r" slice \(START point is INCLUDED, END point is EXCLUDED\)," - r" listlike of integers, boolean array\] types") + msg = ( + r"Location based indexing can only have \[integer, integer" + r" slice \(START point is INCLUDED, END point is EXCLUDED\)," + r" listlike of integers, boolean array\] types" + ) with pytest.raises(ValueError, match=msg): - df.iloc['j', 'D'] + df.iloc["j", "D"] def test_iloc_getitem_doc_issue(self): @@ -487,8 +566,8 @@ def test_iloc_getitem_doc_issue(self): # surfaced in GH 6059 arr = np.random.randn(6, 4) - index = date_range('20130101', periods=6) - columns = list('ABCD') + index = date_range("20130101", periods=6) + columns = list("ABCD") df = DataFrame(arr, index=index, columns=columns) # defines ref_locs @@ -498,18 +577,16 @@ def test_iloc_getitem_doc_issue(self): str(result) result.dtypes - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=columns[0:2]) + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) tm.assert_frame_equal(result, expected) # for dups - df.columns = list('aaaa') + 
df.columns = list("aaaa") result = df.iloc[3:5, 0:2] str(result) result.dtypes - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=list('aa')) + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=list("aa")) tm.assert_frame_equal(result, expected) # related @@ -522,13 +599,13 @@ def test_iloc_getitem_doc_issue(self): result = df.iloc[1:5, 2:4] str(result) result.dtypes - expected = DataFrame(arr[1:5, 2:4], index=index[1:5], - columns=columns[2:4]) + expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) tm.assert_frame_equal(result, expected) def test_iloc_setitem_series(self): - df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), - columns=list('ABCD')) + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -561,46 +638,41 @@ def test_iloc_setitem_list_of_lists(self): # GH 7551 # list-of-list is set incorrectly in mixed vs. single dtyped frames - df = DataFrame(dict(A=np.arange(5, dtype='int64'), - B=np.arange(5, 10, dtype='int64'))) + df = DataFrame( + dict(A=np.arange(5, dtype="int64"), B=np.arange(5, 10, dtype="int64")) + ) df.iloc[2:4] = [[10, 11], [12, 13]] expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - df = DataFrame( - dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) - df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], - B=[5, 6, 11, 13, 9])) + df = DataFrame(dict(A=list("abcde"), B=np.arange(5, 10, dtype="int64"))) + df.iloc[2:4] = [["x", 11], ["y", 13]] + expected = DataFrame(dict(A=["a", "b", "x", "y", "e"], B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - 'indexer', [[0], slice(None, 1, None), np.array([0])]) - @pytest.mark.parametrize( - 'value', [['Z'], np.array(['Z'])]) + @pytest.mark.parametrize("indexer", [[0], slice(None, 1, None), np.array([0])]) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) def test_iloc_setitem_with_scalar_index(self, indexer, value): # GH #19474 # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". 
- df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.iloc[0, indexer] = value result = df.iloc[0, 0] - assert is_scalar(result) and result == 'Z' + assert is_scalar(result) and result == "Z" def test_iloc_mask(self): # GH 3631, iloc with a mask (of a series) should raise - df = DataFrame(list(range(5)), index=list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) - msg = ("iLocation based boolean indexing cannot use an indexable as" - " a mask") + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "iLocation based boolean indexing cannot use an indexable as" " a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] mask.index = range(len(mask)) - msg = ("iLocation based boolean indexing on an integer type is not" - " available") + msg = "iLocation based boolean indexing on an integer type is not" " available" with pytest.raises(NotImplementedError, match=msg): df.iloc[mask] @@ -612,41 +684,44 @@ def test_iloc_mask(self): locs = np.arange(4) nums = 2 ** locs reps = [bin(num) for num in nums] - df = DataFrame({'locs': locs, 'nums': nums}, reps) + df = DataFrame({"locs": locs, "nums": nums}, reps) expected = { - (None, ''): '0b1100', - (None, '.loc'): '0b1100', - (None, '.iloc'): '0b1100', - ('index', ''): '0b11', - ('index', '.loc'): '0b11', - ('index', '.iloc'): ('iLocation based boolean indexing ' - 'cannot use an indexable as a mask'), - ('locs', ''): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the indexed ' - 'object do not match).', - ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the ' - 'indexed object do not match).', - ('locs', '.iloc'): ('iLocation based boolean indexing on an ' - 'integer type is not available'), + (None, ""): "0b1100", + (None, ".loc"): "0b1100", + (None, ".iloc"): "0b1100", + ("index", ""): "0b11", + ("index", ".loc"): "0b11", + ("index", ".iloc"): ( + "iLocation based boolean indexing " "cannot use an indexable as a mask" + ), + ("locs", ""): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the indexed " + "object do not match).", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the " + "indexed object do not match).", + ("locs", ".iloc"): ( + "iLocation based boolean indexing on an " + "integer type is not available" + ), } # UserWarnings from reindex of a boolean mask with catch_warnings(record=True): simplefilter("ignore", UserWarning) result = dict() - for idx in [None, 'index', 'locs']: + for idx in [None, "index", "locs"]: mask = (df.nums > 2).values if idx: mask = Series(mask, list(reversed(getattr(df, idx)))) - for method in ['', '.loc', '.iloc']: + for method in ["", ".loc", ".iloc"]: try: if method: accessor = getattr(df, method[1:]) else: accessor = df - ans = str(bin(accessor[mask]['nums'].sum())) + ans = str(bin(accessor[mask]["nums"].sum())) except Exception as e: ans = str(e) @@ -654,13 +729,13 @@ def test_iloc_mask(self): r = expected.get(key) if r != ans: raise AssertionError( - "[%s] does not match [%s], received [%s]" - % (key, ans, r)) + "[%s] does not match [%s], received [%s]" % (key, ans, r) + ) def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) - df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000}) + df = DataFrame({"A": [0.1] * 3000, "B": [1] * 3000}) idx = np.arange(30) * 99 
expected = df.iloc[idx] @@ -669,7 +744,7 @@ def test_iloc_non_unique_indexing(self): tm.assert_frame_equal(result, expected) - df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) + df2 = DataFrame({"A": [0.1] * 1000, "B": [1] * 1000}) df2 = concat([df2, 2 * df2, 3 * df2]) sidx = df2.index.to_series() @@ -682,35 +757,43 @@ def test_iloc_non_unique_indexing(self): new_list.append(s * 3) expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], - sort=True) + expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], sort=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) def test_iloc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.iloc[:, []], + df.iloc[:, :0], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.iloc[[], :], + df.iloc[:0, :], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_identity_slice_returns_new_object(self): # GH13873 - original_df = DataFrame({'a': [1, 2, 3]}) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] assert sliced_df is not original_df # should be a shallow copy - original_df['a'] = [4, 4, 4] - assert (sliced_df['a'] == 4).all() + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 92966e721aedc5..e06047b52ac15f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -13,7 +13,10 @@ from pandas import DataFrame, Index, NaT, Series from pandas.core.generic import NDFrame from pandas.core.indexing import ( - _maybe_numeric_slice, _non_reducing_slice, validate_indices) + _maybe_numeric_slice, + _non_reducing_slice, + validate_indices, +) from pandas.tests.indexing.common import Base, _mklbl import pandas.util.testing as tm @@ -31,70 +34,89 @@ def test_setitem_ndarray_1d(self): # len of indexer vs length of the 1d ndarray df = DataFrame(index=Index(np.arange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) # invalid with pytest.raises(ValueError): - df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) + df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) # valid - df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) + df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) - result = df.loc[df.index[2:6], 'bar'] - expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], - name='bar') + result = df.loc[df.index[2:6], "bar"] + expected = Series( + [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], 
name="bar" + ) tm.assert_series_equal(result, expected) # dtype getting changed? df = DataFrame(index=Index(np.arange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) with pytest.raises(ValueError): df[2:5] = np.arange(1, 4) * 1j - @pytest.mark.parametrize('index', tm.all_index_generator(5), - ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('obj', [ - lambda i: Series(np.arange(len(i)), index=i), - lambda i: DataFrame( - np.random.randn(len(i), len(i)), index=i, columns=i) - ], ids=['Series', 'DataFrame']) - @pytest.mark.parametrize('idxr, idxr_id', [ - (lambda x: x, 'getitem'), - (lambda x: x.loc, 'loc'), - (lambda x: x.iloc, 'iloc'), - pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) - ]) + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "getitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + pytest.param(lambda x: x.ix, "ix", marks=ignore_ix), + ], + ) def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - msg = (r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Cannot index with multidimensional key|" - r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" - "No matching signature found|" # TypeError - "unhashable type: 'numpy.ndarray'" # TypeError - ) - - if (isinstance(obj, Series) and idxr_id == 'getitem' and - index.inferred_type in [ - 'string', 'datetime64', 'period', 'timedelta64', - 'boolean', 'categorical']): + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Cannot index with multidimensional key|" + r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" + "No matching signature found|" # TypeError + "unhashable type: 'numpy.ndarray'" # TypeError + ) + + if ( + isinstance(obj, Series) + and idxr_id == "getitem" + and index.inferred_type + in [ + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ): idxr[nd3] else: - if (isinstance(obj, DataFrame) and idxr_id == 'getitem' and - index.inferred_type == 'boolean'): + if ( + isinstance(obj, DataFrame) + and idxr_id == "getitem" + and index.inferred_type == "boolean" + ): error = TypeError - elif idxr_id == 'getitem' and index.inferred_type == 'interval': + elif idxr_id == "getitem" and index.inferred_type == "interval": error = TypeError else: error = ValueError @@ -102,49 +124,74 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): with pytest.raises(error, match=msg): idxr[nd3] - @pytest.mark.parametrize('index', tm.all_index_generator(5), - ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('obj', [ - lambda i: Series(np.arange(len(i)), index=i), - lambda i: DataFrame( - np.random.randn(len(i), len(i)), index=i, columns=i) - ], ids=['Series', 'DataFrame']) - @pytest.mark.parametrize('idxr, idxr_id', [ - (lambda x: x, 'setitem'), - (lambda x: x.loc, 'loc'), - (lambda x: x.iloc, 'iloc'), - pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) - ]) + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "setitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + pytest.param(lambda x: x.ix, "ix", marks=ignore_ix), + ], + ) def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - msg = (r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" - "'pandas._libs.interval.IntervalTree' object has no attribute" - " 'set_value'|" # AttributeError - "unhashable type: 'numpy.ndarray'|" # TypeError - "No matching signature found|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError - ) - - if ((idxr_id == 'iloc') - or ((isinstance(obj, Series) and idxr_id == 'setitem' - and index.inferred_type in [ - 'floating', 'string', 'datetime64', 'period', 'timedelta64', - 'boolean', 'categorical'])) - or (idxr_id == 'ix' and index.inferred_type in [ - 'string', 'datetime64', 'period', 'boolean'])): + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Only 1-dimensional input arrays are supported|" + "'pandas._libs.interval.IntervalTree' object has no attribute" + " 'set_value'|" # AttributeError + "unhashable type: 'numpy.ndarray'|" # TypeError + "No matching signature found|" # TypeError + r"^\[\[\[" # pandas.core.indexing.IndexingError + ) + + if ( + (idxr_id == "iloc") + or ( + ( + isinstance(obj, Series) + and idxr_id == "setitem" + and index.inferred_type + in [ + "floating", + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ) + ) + or ( + idxr_id == "ix" + and index.inferred_type in ["string", 
"datetime64", "period", "boolean"] + ) + ): idxr[nd3] = 0 else: with pytest.raises( - (ValueError, AttributeError, TypeError, - pd.core.indexing.IndexingError), match=msg): + (ValueError, AttributeError, TypeError, pd.core.indexing.IndexingError), + match=msg, + ): idxr[nd3] = 0 def test_inf_upcast(self): @@ -179,58 +226,67 @@ def test_setitem_dtype_upcast(self): # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) - df['c'] = np.nan - assert df['c'].dtype == np.float64 + df["c"] = np.nan + assert df["c"].dtype == np.float64 - df.loc[0, 'c'] = 'foo' - expected = DataFrame([{"a": 1, "c": 'foo'}, - {"a": 3, "b": 2, "c": np.nan}]) + df.loc[0, "c"] = "foo" + expected = DataFrame([{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]) tm.assert_frame_equal(df, expected) # GH10280 - df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=list('ab'), - columns=['foo', 'bar', 'baz']) + df = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), + index=list("ab"), + columns=["foo", "bar", "baz"], + ) - for val in [3.14, 'wxyz']: + for val in [3.14, "wxyz"]: left = df.copy() - left.loc['a', 'bar'] = val - right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), - columns=['foo', 'bar', 'baz']) + left.loc["a", "bar"] = val + right = DataFrame( + [[0, val, 2], [3, 4, 5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) tm.assert_frame_equal(left, right) - assert is_integer_dtype(left['foo']) - assert is_integer_dtype(left['baz']) - - left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, - index=list('ab'), - columns=['foo', 'bar', 'baz']) - left.loc['a', 'bar'] = 'wxyz' - - right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), - columns=['foo', 'bar', 'baz']) + assert is_integer_dtype(left["foo"]) + assert is_integer_dtype(left["baz"]) + + left = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3) / 10.0, + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + left.loc["a", "bar"] = "wxyz" + + right = DataFrame( + [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) tm.assert_frame_equal(left, right) - assert is_float_dtype(left['foo']) - assert is_float_dtype(left['baz']) + assert is_float_dtype(left["foo"]) + assert is_float_dtype(left["baz"]) def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(10, 3) - df.columns = ['a', 'a', 'b'] - result = df[['b', 'a']].columns - expected = Index(['b', 'a', 'a']) + df.columns = ["a", "a", "b"] + result = df[["b", "a"]].columns + expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) # across dtypes - df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], - columns=list('aaaaaaa')) + df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) - result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) - result.columns = list('aaaaaaa') + result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) + result.columns = list("aaaaaaa") # TODO(wesm): unused? 
df_v = df.iloc[:, 4] # noqa @@ -240,45 +296,51 @@ def test_dups_fancy_indexing(self): # GH 3561, dups not in selected order df = DataFrame( - {'test': [5, 7, 9, 11], - 'test1': [4., 5, 6, 7], - 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) - rows = ['C', 'B'] + {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")}, + index=["A", "A", "B", "C"], + ) + rows = ["C", "B"] expected = DataFrame( - {'test': [11, 9], - 'test1': [7., 6], - 'other': ['d', 'c']}, index=rows) + {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows + ) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) - rows = ['C', 'B', 'E'] + rows = ["C", "B", "E"] expected = DataFrame( - {'test': [11, 9, np.nan], - 'test1': [7., 6, np.nan], - 'other': ['d', 'c', np.nan]}, index=rows) + { + "test": [11, 9, np.nan], + "test1": [7.0, 6, np.nan], + "other": ["d", "c", np.nan], + }, + index=rows, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer - rows = ['F', 'G', 'H', 'C', 'B', 'E'] - expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], - 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], - 'other': [np.nan, np.nan, np.nan, - 'd', 'c', np.nan]}, - index=rows) + rows = ["F", "G", "H", "C", "B", "E"] + expected = DataFrame( + { + "test": [np.nan, np.nan, np.nan, 11, 9, np.nan], + "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan], + "other": [np.nan, np.nan, np.nan, "d", "c", np.nan], + }, + index=rows, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label - dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) + dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) with pytest.raises(KeyError): - dfnu.loc[['E']] + dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 @@ -289,50 +351,52 @@ def test_dups_fancy_indexing(self): expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) - df = DataFrame({"A": list('abc')}) + df = DataFrame({"A": list("abc")}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] - expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) + expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector - df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) + df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) expected = DataFrame( - {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) + {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"] + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[['A', 'A', 'E']] + result = df.loc[["A", "A", "E"]] tm.assert_frame_equal(result, expected) def test_dups_fancy_indexing2(self): # GH 5835 # dups on index and missing values - df = DataFrame( - np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) + df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"]) expected = pd.concat( - [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], - index=df.index)], axis=1) + [df.loc[:, ["A", "B"]], DataFrame(np.nan, columns=["C"], 
index=df.index)], + axis=1, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[:, ['A', 'B', 'C']] + result = df.loc[:, ["A", "B", "C"]] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing - df = DataFrame(np.random.randn(9, 2), - index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) + df = DataFrame( + np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"] + ) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df - result = df.loc[:, ['a', 'b']] + result = df.loc[:, ["a", "b"]] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] - result = df.loc[[1, 2], ['a', 'b']] + result = df.loc[[1, 2], ["a", "b"]] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('case', [lambda s: s, lambda s: s.loc]) + @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc]) def test_duplicate_int_indexing(self, case): # GH 17347 s = pd.Series(range(3), index=[1, 1, 3]) @@ -343,17 +407,18 @@ def test_duplicate_int_indexing(self, case): def test_indexing_mixed_frame_bug(self): # GH3492 - df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'}, - 'b': {1: 111, 2: 222, 3: 333}}) + df = DataFrame( + {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}} + ) # this works, new column is created correctly - df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x) + df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x) # this does not work, ie column test is not changed - idx = df['test'] == '_' - temp = df.loc[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x) - df.loc[idx, 'test'] = temp - assert df.iloc[0, 2] == '-----' + idx = df["test"] == "_" + temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x) + df.loc[idx, "test"] = temp + assert df.iloc[0, 2] == "-----" # if I look at df, then element [0,2] equals '_'. 
If instead I type # df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I @@ -361,8 +426,7 @@ def test_indexing_mixed_frame_bug(self): def test_multitype_list_index_access(self): # GH 10610 - df = DataFrame(np.random.random((10, 5)), - columns=["a"] + [20, 21, 22, 23]) + df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) with pytest.raises(KeyError): df[[22, 26, -8]] @@ -371,88 +435,110 @@ def test_multitype_list_index_access(self): def test_set_index_nan(self): # GH 3586 - df = DataFrame({'PRuid': {17: 'nonQC', - 18: 'nonQC', - 19: 'nonQC', - 20: '10', - 21: '11', - 22: '12', - 23: '13', - 24: '24', - 25: '35', - 26: '46', - 27: '47', - 28: '48', - 29: '59', - 30: '10'}, - 'QC': {17: 0.0, - 18: 0.0, - 19: 0.0, - 20: np.nan, - 21: np.nan, - 22: np.nan, - 23: np.nan, - 24: 1.0, - 25: np.nan, - 26: np.nan, - 27: np.nan, - 28: np.nan, - 29: np.nan, - 30: np.nan}, - 'data': {17: 7.9544899999999998, - 18: 8.0142609999999994, - 19: 7.8591520000000008, - 20: 0.86140349999999999, - 21: 0.87853110000000001, - 22: 0.8427041999999999, - 23: 0.78587700000000005, - 24: 0.73062459999999996, - 25: 0.81668560000000001, - 26: 0.81927080000000008, - 27: 0.80705009999999999, - 28: 0.81440240000000008, - 29: 0.80140849999999997, - 30: 0.81307740000000006}, - 'year': {17: 2006, - 18: 2007, - 19: 2008, - 20: 1985, - 21: 1985, - 22: 1985, - 23: 1985, - 24: 1985, - 25: 1985, - 26: 1985, - 27: 1985, - 28: 1985, - 29: 1985, - 30: 1986}}).reset_index() - - result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex( - columns=df.columns) + df = DataFrame( + { + "PRuid": { + 17: "nonQC", + 18: "nonQC", + 19: "nonQC", + 20: "10", + 21: "11", + 22: "12", + 23: "13", + 24: "24", + 25: "35", + 26: "46", + 27: "47", + 28: "48", + 29: "59", + 30: "10", + }, + "QC": { + 17: 0.0, + 18: 0.0, + 19: 0.0, + 20: np.nan, + 21: np.nan, + 22: np.nan, + 23: np.nan, + 24: 1.0, + 25: np.nan, + 26: np.nan, + 27: np.nan, + 28: np.nan, + 29: np.nan, + 30: np.nan, + }, + "data": { + 17: 7.9544899999999998, + 18: 8.0142609999999994, + 19: 7.8591520000000008, + 20: 0.86140349999999999, + 21: 0.87853110000000001, + 22: 0.8427041999999999, + 23: 0.78587700000000005, + 24: 0.73062459999999996, + 25: 0.81668560000000001, + 26: 0.81927080000000008, + 27: 0.80705009999999999, + 28: 0.81440240000000008, + 29: 0.80140849999999997, + 30: 0.81307740000000006, + }, + "year": { + 17: 2006, + 18: 2007, + 19: 2008, + 20: 1985, + 21: 1985, + 22: 1985, + 23: 1985, + 24: 1985, + 25: 1985, + 26: 1985, + 27: 1985, + 28: 1985, + 29: 1985, + 30: 1986, + }, + } + ).reset_index() + + result = ( + df.set_index(["year", "PRuid", "QC"]) + .reset_index() + .reindex(columns=df.columns) + ) tm.assert_frame_equal(result, df) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df - df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': list(range(6)), - 'col2': list(range(6, 12)), - }) + df = DataFrame( + { + "FC": ["a", "b", "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": list(range(6)), + "col2": list(range(6, 12)), + } + ) df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isna() - cols = ['col1', 'col2'] + cols = ["col1", "col2"] dft = df2 * 2 dft.iloc[3, 3] = np.nan - expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': Series([0, 1, 4, 6, 8, 10]), - 'col2': [12, 7, 16, np.nan, 20, 22]}) + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": Series([0, 
1, 4, 6, 8, 10]), + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) # frame on rhs df2.loc[mask, cols] = dft.loc[mask, cols] @@ -464,10 +550,14 @@ def test_multi_assign(self): # with an ndarray on rhs # coerces to float64 because values has float64 dtype # GH 14001 - expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': [0., 1., 4., 6., 8., 10.], - 'col2': [12, 7, 16, np.nan, 20, 22]}) + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) @@ -475,15 +565,21 @@ def test_multi_assign(self): tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required - df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[ - 0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7])) + df = DataFrame( + dict( + A=[1, 2, 0, 0, 0], + B=[0, 0, 0, 10, 11], + C=[0, 0, 0, 10, 11], + D=[3, 4, 5, 6, 7], + ) + ) expected = df.copy() - mask = expected['A'] == 0 - for col in ['A', 'B']: - expected.loc[mask, col] = df['D'] + mask = expected["A"] == 0 + for col in ["A", "B"]: + expected.loc[mask, col] = df["D"] - df.loc[df['A'] == 0, ['A', 'B']] = df['D'] + df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected) def test_setitem_list(self): @@ -505,7 +601,6 @@ def test_setitem_list(self): # ix with an object class TO: - def __init__(self, value): self.value = value @@ -547,104 +642,113 @@ def test_string_slice(self): # GH 14424 # string indexing against datetimelike with object # dtype should properly raises KeyError - df = DataFrame([1], Index([pd.Timestamp('2011-01-01')], dtype=object)) + df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) assert df.index.is_all_dates with pytest.raises(KeyError): - df['2011'] + df["2011"] with pytest.raises(KeyError): - df.loc['2011', 0] + df.loc["2011", 0] df = DataFrame() assert not df.index.is_all_dates with pytest.raises(KeyError): - df['2011'] + df["2011"] with pytest.raises(KeyError): - df.loc['2011', 0] + df.loc["2011", 0] def test_astype_assignment(self): # GH4312 (iloc) - df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df_orig = DataFrame( + [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) - expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() - df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64) - expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + expected = DataFrame( + [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64) - expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df.loc[:, ["B", "C"]] = 
df.loc[:, ["B", "C"]].astype(np.int64) + expected = DataFrame( + [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) # full replacements / no nans - df = DataFrame({'A': [1., 2., 3., 4.]}) - df.iloc[:, 0] = df['A'].astype(np.int64) - expected = DataFrame({'A': [1, 2, 3, 4]}) + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.iloc[:, 0] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - df = DataFrame({'A': [1., 2., 3., 4.]}) - df.loc[:, 'A'] = df['A'].astype(np.int64) - expected = DataFrame({'A': [1, 2, 3, 4]}) + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.loc[:, "A"] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, 2]), 2), - (Index([0, 1, '2']), '2'), - (Index([0, 1, 2, np.inf, 4]), 4), - (Index([0, 1, 2, np.nan, 4]), 4), - (Index([0, 1, 2, np.inf]), np.inf), - (Index([0, 1, 2, np.nan]), np.nan), - ]) + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), 2), + (Index([0, 1, "2"]), "2"), + (Index([0, 1, 2, np.inf, 4]), 4), + (Index([0, 1, 2, np.nan, 4]), 4), + (Index([0, 1, 2, np.inf]), np.inf), + (Index([0, 1, 2, np.nan]), np.nan), + ], + ) def test_index_contains(self, index, val): assert val in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, 2]), '2'), - (Index([0, 1, '2']), 2), - (Index([0, 1, 2, np.inf]), 4), - (Index([0, 1, 2, np.nan]), 4), - (Index([0, 1, 2, np.inf]), np.nan), - (Index([0, 1, 2, np.nan]), np.inf), - # Checking if np.inf in Int64Index should not cause an OverflowError - # Related to GH 16957 - (pd.Int64Index([0, 1, 2]), np.inf), - (pd.Int64Index([0, 1, 2]), np.nan), - (pd.UInt64Index([0, 1, 2]), np.inf), - (pd.UInt64Index([0, 1, 2]), np.nan), - ]) + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), "2"), + (Index([0, 1, "2"]), 2), + (Index([0, 1, 2, np.inf]), 4), + (Index([0, 1, 2, np.nan]), 4), + (Index([0, 1, 2, np.inf]), np.nan), + (Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (pd.Int64Index([0, 1, 2]), np.inf), + (pd.Int64Index([0, 1, 2]), np.nan), + (pd.UInt64Index([0, 1, 2]), np.inf), + (pd.UInt64Index([0, 1, 2]), np.nan), + ], + ) def test_index_not_contains(self, index, val): assert val not in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, '2']), 0), - (Index([0, 1, '2']), '2'), - ]) + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] + ) def test_mixed_index_contains(self, index, val): # GH 19860 assert val in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, '2']), '1'), - (Index([0, 1, '2']), 2), - ]) + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] + ) def test_mixed_index_not_contains(self, index, val): # GH 19860 assert val not in index @@ -676,14 +780,11 @@ def test_index_type_coercion(self): # then we need to coerce to object # integer indexes - for s in [Series(range(5)), - Series(range(5), index=range(1, 6))]: + for s in [Series(range(5)), Series(range(5), index=range(1, 6))]: assert s.index.is_integer() - for indexer in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() indexer(s2)[0.1] = 0 assert s2.index.is_floating() @@ -697,16 +798,14 @@ def test_index_type_coercion(self): 
tm.assert_index_equal(s2.index, exp) s2 = s.copy() - indexer(s2)['0'] = 0 + indexer(s2)["0"] = 0 assert s2.index.is_object() - for s in [Series(range(5), index=np.arange(5.))]: + for s in [Series(range(5), index=np.arange(5.0))]: assert s.index.is_floating() - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() idxr(s2)[0.1] = 0 @@ -718,27 +817,25 @@ def test_index_type_coercion(self): tm.assert_index_equal(s2.index, s.index) s2 = s.copy() - idxr(s2)['0'] = 0 + idxr(s2)["0"] = 0 assert s2.index.is_object() class TestMisc(Base): - def test_float_index_to_mixed(self): df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)}) - df['a'] = 10 - tm.assert_frame_equal(DataFrame({0.0: df[0.0], - 1.0: df[1.0], - 'a': [10] * 10}), - df) + df["a"] = 10 + tm.assert_frame_equal( + DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10}), df + ) def test_float_index_non_scalar_assignment(self): - df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.]) + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df.loc[df.index[:2]] = 1 - expected = DataFrame({'a': [1, 1, 3], 'b': [1, 1, 5]}, index=df.index) + expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) tm.assert_frame_equal(expected, df) - df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.]) + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df2 = df.copy() df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) @@ -752,15 +849,15 @@ def test_float_index_at_iat(self): def test_mixed_index_assignment(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) - s.at['a'] = 11 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) + s.at["a"] = 11 assert s.iat[0] == 11 s.at[1] = 22 assert s.iat[3] == 22 def test_mixed_index_no_fallback(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) with pytest.raises(KeyError): s.at[0] with pytest.raises(KeyError): @@ -771,8 +868,8 @@ def test_rhs_alignment(self): # assigned to. 
covers both uniform data-type & multi-type cases def run_tests(df, rhs, right): # label, index, slice - lbl_one, idx_one, slice_one = list('bcd'), [1, 2, 3], slice(1, 4) - lbl_two, idx_two, slice_two = ['joe', 'jolie'], [1, 2], slice(1, 3) + lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4) + lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3) left = df.copy() left.loc[lbl_one, lbl_two] = rhs @@ -802,8 +899,8 @@ def run_tests(df, rhs, right): tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) - cols = ['jim', 'joe', 'jolie', 'joline'] - df = DataFrame(xs, columns=cols, index=list('abcde')) + cols = ["jim", "joe", "jolie", "joline"] + df = DataFrame(xs, columns=cols, index=list("abcde")) # right hand side; permute the indices and multiplpy by -2 rhs = -2 * df.iloc[3:0:-1, 2:0:-1] @@ -817,8 +914,8 @@ def run_tests(df, rhs, right): # make frames multi-type & re-run tests for frame in [df, rhs, right]: - frame['joe'] = frame['joe'].astype('float64') - frame['jolie'] = frame['jolie'].map('@{0}'.format) + frame["joe"] = frame["joe"].astype("float64") + frame["jolie"] = frame["jolie"].map("@{0}".format) run_tests(df, rhs, right) @@ -833,30 +930,27 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - for idx in [_mklbl('A', 20), np.arange(20) + 100, - np.linspace(100, 150, 20)]: + for idx in [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: idx = Index(idx) s = Series(np.arange(20), index=idx) - assert_slices_equivalent(SLC[idx[9]::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:idx[9]:-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[idx[13]:idx[9]:-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[idx[9]:idx[13]:-1], SLC[:0]) + assert_slices_equivalent(SLC[idx[9] :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC[: idx[9] : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[idx[9] : idx[13] : -1], SLC[:0]) def test_slice_with_zero_step_raises(self): - s = Series(np.arange(20), index=_mklbl('A', 20)) - with pytest.raises(ValueError, match='slice step cannot be zero'): + s = Series(np.arange(20), index=_mklbl("A", 20)) + with pytest.raises(ValueError, match="slice step cannot be zero"): s[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): s.loc[::0] with catch_warnings(record=True): simplefilter("ignore") - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): s.ix[::0] def test_indexing_assignment_dict_already_exists(self): - df = DataFrame({'x': [1, 2, 6], - 'y': [2, 2, 8], - 'z': [-5, 0, 5]}).set_index('z') + df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}).set_index("z") expected = df.copy() rhs = dict(x=9, y=99) df.loc[5] = rhs @@ -865,16 +959,16 @@ def test_indexing_assignment_dict_already_exists(self): def test_indexing_dtypes_on_empty(self): # Check that .iloc and .ix return correct dtypes GH9983 - df = DataFrame({'a': [1, 2, 3], 'b': ['b', 'b2', 'b3']}) + df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]}) with catch_warnings(record=True): simplefilter("ignore") df2 = df.ix[[], :] - assert df2.loc[:, 'a'].dtype == np.int64 - tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) + assert df2.loc[:, "a"].dtype == np.int64 + tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 
0]) with catch_warnings(record=True): simplefilter("ignore") - tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) + tm.assert_series_equal(df2.loc[:, "a"], df2.ix[:, 0]) def test_range_in_series_indexing(self): # range can cause an indexing error @@ -902,7 +996,7 @@ def test_non_reducing_slice(self): slice(None, None, None), [0, 1], np.array([0, 1]), - Series([0, 1]) + Series([0, 1]), ] for slice_ in slices: tslice_ = _non_reducing_slice(slice_) @@ -910,45 +1004,46 @@ def test_non_reducing_slice(self): def test_list_slice(self): # like dataframe getitem - slices = [['A'], Series(['A']), np.array(['A'])] - df = DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['A', 'B']) - expected = pd.IndexSlice[:, ['A']] + slices = [["A"], Series(["A"]), np.array(["A"])] + df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) + expected = pd.IndexSlice[:, ["A"]] for subset in slices: result = _non_reducing_slice(subset) tm.assert_frame_equal(df.loc[result], df.loc[expected]) def test_maybe_numeric_slice(self): - df = DataFrame({'A': [1, 2], 'B': ['c', 'd'], 'C': [True, False]}) + df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) result = _maybe_numeric_slice(df, slice_=None) - expected = pd.IndexSlice[:, ['A']] + expected = pd.IndexSlice[:, ["A"]] assert result == expected result = _maybe_numeric_slice(df, None, include_bool=True) - expected = pd.IndexSlice[:, ['A', 'C']] + expected = pd.IndexSlice[:, ["A", "C"]] result = _maybe_numeric_slice(df, [1]) expected = [1] assert result == expected def test_partial_boolean_frame_indexing(self): # GH 17170 - df = DataFrame(np.arange(9.).reshape(3, 3), - index=list('abc'), columns=list('ABC')) - index_df = DataFrame(1, index=list('ab'), columns=list('AB')) + df = DataFrame( + np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC") + ) + index_df = DataFrame(1, index=list("ab"), columns=list("AB")) result = df[index_df.notnull()] - expected = DataFrame(np.array([[0., 1., np.nan], - [3., 4., np.nan], - [np.nan] * 3]), - index=list('abc'), - columns=list('ABC')) + expected = DataFrame( + np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]), + index=list("abc"), + columns=list("ABC"), + ) tm.assert_frame_equal(result, expected) def test_no_reference_cycle(self): - df = DataFrame({'a': [0, 1], 'b': [2, 3]}) - for name in ('loc', 'iloc', 'at', 'iat'): + df = DataFrame({"a": [0, 1], "b": [2, 3]}) + for name in ("loc", "iloc", "at", "iat"): getattr(df, name) with catch_warnings(record=True): simplefilter("ignore") - getattr(df, 'ix') + getattr(df, "ix") wr = weakref.ref(df) del df assert wr() is None @@ -959,11 +1054,11 @@ class TestSeriesNoneCoercion: # For numeric series, we should coerce to NaN. ([1, 2, 3], [np.nan, 2, 3]), ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), - # For datetime series, we should coerce to NaT. - ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), # For objects, we should preserve the None value. (["foo", "bar", "baz"], [None, "bar", "baz"]), ] @@ -1006,56 +1101,60 @@ class TestDataframeNoneCoercion: # For numeric series, we should coerce to NaN. ([1, 2, 3], [np.nan, 2, 3]), ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), - # For datetime series, we should coerce to NaT. 
- ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), # For objects, we should preserve the None value. (["foo", "bar", "baz"], [None, "bar", "baz"]), ] def test_coercion_with_loc(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe.loc[0, ['foo']] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[0, ["foo"]] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_coercion_with_setitem_and_dataframe(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][ - 0]] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_loc_and_dataframe(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe.loc[start_dataframe['foo'] == start_dataframe[ - 'foo'][0]] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[ + start_dataframe["foo"] == start_dataframe["foo"][0] + ] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_mixed_dtypes(self): - start_dataframe = DataFrame({ - 'a': [1, 2, 3], - 'b': [1.0, 2.0, 3.0], - 'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, - 3)], - 'd': ['a', 'b', 'c'] - }) + start_dataframe = DataFrame( + { + "a": [1, 2, 3], + "b": [1.0, 2.0, 3.0], + "c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": ["a", "b", "c"], + } + ) start_dataframe.iloc[0] = None - exp = DataFrame({'a': [np.nan, 2, 3], - 'b': [np.nan, 2.0, 3.0], - 'c': [NaT, datetime(2000, 1, 2), - datetime(2000, 1, 3)], - 'd': [None, 'b', 'c']}) + exp = DataFrame( + { + "a": [np.nan, 2, 3], + "b": [np.nan, 2.0, 3.0], + "c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": [None, "b", "c"], + } + ) tm.assert_frame_equal(start_dataframe, exp) @@ -1085,13 +1184,17 @@ def test_validate_indices_empty(): def test_extension_array_cross_section(): # A cross-section of a homogeneous EA should be an EA - df = pd.DataFrame({ - "A": pd.core.arrays.integer_array([1, 2]), - "B": pd.core.arrays.integer_array([3, 4]) - }, index=['a', 'b']) - expected = pd.Series(pd.core.arrays.integer_array([1, 3]), - index=['A', 'B'], name='a') - result = df.loc['a'] + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([1, 2]), + "B": pd.core.arrays.integer_array([3, 4]), + }, + index=["a", "b"], + ) + expected = pd.Series( + 
pd.core.arrays.integer_array([1, 3]), index=["A", "B"], name="a" + ) + result = df.loc["a"] tm.assert_series_equal(result, expected) result = df.iloc[0] @@ -1099,34 +1202,40 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): - df = pd.DataFrame({ - "A": pd.core.arrays.integer_array([1, 2]), - "B": np.array([1, 2]), - }, index=['a', 'b']) - result = df.loc['a'] - expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a') + df = pd.DataFrame( + {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])}, + index=["a", "b"], + ) + result = df.loc["a"] + expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('idxr, error, error_message', [ - (lambda x: x, - AttributeError, - "'numpy.ndarray' object has no attribute 'get'"), - (lambda x: x.loc, - AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), - (lambda x: x.iloc, - AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), - pytest.param( - lambda x: x.ix, - ValueError, - "NDFrameIndexer does not support NDFrame objects with ndim > 2", - marks=ignore_ix) -]) +@pytest.mark.parametrize( + "idxr, error, error_message", + [ + (lambda x: x, AttributeError, "'numpy.ndarray' object has no attribute 'get'"), + ( + lambda x: x.loc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + ), + ( + lambda x: x.iloc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + ), + pytest.param( + lambda x: x.ix, + ValueError, + "NDFrameIndexer does not support NDFrame objects with ndim > 2", + marks=ignore_ix, + ), + ], +) def test_ndframe_indexing_raises(idxr, error, error_message): # GH 25567 frame = NDFrame(np.random.randint(5, size=(2, 2, 2))) diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py index 71a797741bbdb4..7303c1ff3d1112 100644 --- a/pandas/tests/indexing/test_indexing_engines.py +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -61,8 +61,7 @@ def test_get_loc(self, numeric_indexing_engine_type_and_dtype): result = engine.get_loc(2) assert (result == expected).all() - def test_get_backfill_indexer( - self, numeric_indexing_engine_type_and_dtype): + def test_get_backfill_indexer(self, numeric_indexing_engine_type_and_dtype): engine_type, dtype = numeric_indexing_engine_type_and_dtype arr = np.array([1, 5, 10], dtype=dtype) @@ -74,8 +73,7 @@ def test_get_backfill_indexer( expected = libalgos.backfill(arr, new) tm.assert_numpy_array_equal(result, expected) - def test_get_pad_indexer( - self, numeric_indexing_engine_type_and_dtype): + def test_get_pad_indexer(self, numeric_indexing_engine_type_and_dtype): engine_type, dtype = numeric_indexing_engine_type_and_dtype arr = np.array([1, 5, 10], dtype=dtype) @@ -91,13 +89,12 @@ def test_get_pad_indexer( class TestObjectEngine: engine_type = libindex.ObjectEngine dtype = np.object_ - values = list('abc') + values = list("abc") def test_is_monotonic(self): num = 1000 - arr = np.array(['a'] * num + ['a'] * num + ['c'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["a"] * num + ["c"] * num, dtype=self.dtype) # monotonic increasing engine = self.engine_type(lambda: arr, len(arr)) @@ -110,8 +107,7 @@ def test_is_monotonic(self): assert engine.is_monotonic_decreasing is True # neither monotonic increasing or decreasing - arr = np.array(['a'] 
* num + ['b'] * num + ['a'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["b"] * num + ["a"] * num, dtype=self.dtype) engine = self.engine_type(lambda: arr[::-1], len(arr)) assert engine.is_monotonic_increasing is False assert engine.is_monotonic_decreasing is False @@ -123,7 +119,7 @@ def test_is_unique(self): assert engine.is_unique is True # not unique - arr = np.array(['a', 'b', 'a'], dtype=self.dtype) + arr = np.array(["a", "b", "a"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) assert engine.is_unique is False @@ -131,37 +127,36 @@ def test_get_loc(self): # unique arr = np.array(self.values, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - assert engine.get_loc('b') == 1 + assert engine.get_loc("b") == 1 # monotonic num = 1000 - arr = np.array(['a'] * num + ['b'] * num + ['c'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["b"] * num + ["c"] * num, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - assert engine.get_loc('b') == slice(1000, 2000) + assert engine.get_loc("b") == slice(1000, 2000) # not monotonic arr = np.array(self.values * num, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) expected = np.array([False, True, False] * num, dtype=bool) - result = engine.get_loc('b') + result = engine.get_loc("b") assert (result == expected).all() def test_get_backfill_indexer(self): - arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + arr = np.array(["a", "e", "j"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - new = np.array(list('abcdefghij'), dtype=self.dtype) + new = np.array(list("abcdefghij"), dtype=self.dtype) result = engine.get_backfill_indexer(new) expected = libalgos.backfill["object"](arr, new) tm.assert_numpy_array_equal(result, expected) def test_get_pad_indexer(self): - arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + arr = np.array(["a", "e", "j"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - new = np.array(list('abcdefghij'), dtype=self.dtype) + new = np.array(list("abcdefghij"), dtype=self.dtype) result = engine.get_pad_indexer(new) expected = libalgos.pad["object"](arr, new) diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index 67467a5f34c441..bf8c6afd005614 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -5,11 +5,10 @@ class TestIndexingSlow: - @pytest.mark.slow def test_large_dataframe_indexing(self): # GH10692 - result = DataFrame({'x': range(10 ** 6)}, dtype='int64') + result = DataFrame({"x": range(10 ** 6)}, dtype="int64") result.loc[len(result)] = len(result) + 1 - expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') + expected = DataFrame({"x": range(10 ** 6 + 1)}, dtype="int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index 270fa8c5502a65..ee62c91ad9698c 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -15,15 +15,13 @@ def test_ix_deprecation(): # GH 15114 - df = DataFrame({'A': [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): - df.ix[1, 'A'] + df = DataFrame({"A": [1, 2, 3]}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): + df.ix[1, "A"] @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestIX: - def test_ix_loc_setitem_consistency(self): # GH 5771 @@ -35,45 +33,51 @@ def 
test_ix_loc_setitem_consistency(self): # GH 5928 # chained indexing assignment - df = DataFrame({'a': [0, 1, 2]}) + df = DataFrame({"a": [0, 1, 2]}) expected = df.copy() with catch_warnings(record=True): - expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] + expected.ix[[0, 1, 2], "a"] = -expected.ix[[0, 1, 2], "a"] with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] + df["a"].ix[[0, 1, 2]] = -df["a"].ix[[0, 1, 2]] tm.assert_frame_equal(df, expected) - df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]}) with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype( - 'float64') + 0.5 - expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) + df["a"].ix[[0, 1, 2]] = -df["a"].ix[[0, 1, 2]].astype("float64") + 0.5 + expected = DataFrame({"a": [0.5, -0.5, -1.5], "b": [0, 1, 2]}) tm.assert_frame_equal(df, expected) # GH 8607 # ix setitem consistency - df = DataFrame({'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470], - 'timestamp': [1413840976, 1413842580, 1413760580]}) - expected = DataFrame({'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470], - 'timestamp': pd.to_datetime( - [1413840976, 1413842580, 1413760580], - unit='s') - }) + df = DataFrame( + { + "delta": [1174, 904, 161], + "elapsed": [7673, 9277, 1470], + "timestamp": [1413840976, 1413842580, 1413760580], + } + ) + expected = DataFrame( + { + "delta": [1174, 904, 161], + "elapsed": [7673, 9277, 1470], + "timestamp": pd.to_datetime( + [1413840976, 1413842580, 1413760580], unit="s" + ), + } + ) df2 = df.copy() - df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + df2["timestamp"] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) df2 = df.copy() - df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + df2.loc[:, "timestamp"] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) df2 = df.copy() with catch_warnings(record=True): - df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') + df2.ix[:, 2] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) def test_ix_loc_consistency(self): @@ -89,34 +93,47 @@ def compare(result, expected): assert expected.equals(result) # failure cases for .loc, but these work for .ix - df = DataFrame(np.random.randn(5, 4), columns=list('ABCD')) - for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]), - tuple([slice(0, 2), df.columns[0:2]])]: - - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeDateIndex, tm.makePeriodIndex, - tm.makeTimedeltaIndex]: + df = DataFrame(np.random.randn(5, 4), columns=list("ABCD")) + for key in [ + slice(1, 3), + tuple([slice(0, 2), slice(0, 2)]), + tuple([slice(0, 2), df.columns[0:2]]), + ]: + + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + tm.makeTimedeltaIndex, + ]: df.index = index(len(df.index)) with catch_warnings(record=True): df.ix[key] - msg = (r"cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\] of" - r" {kind}" - .format(klass=type(df.index), kind=str(int))) + msg = ( + r"cannot do slice indexing" + r" on {klass} with these indexers \[(0|1)\] of" + r" {kind}".format(klass=type(df.index), kind=str(int)) + ) with pytest.raises(TypeError, match=msg): df.loc[key] - df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'), - index=pd.date_range('2012-01-01', periods=5)) - - for key in ['2012-01-03', - '2012-01-31', - 
slice('2012-01-03', '2012-01-03'), - slice('2012-01-03', '2012-01-04'), - slice('2012-01-03', '2012-01-06', 2), - slice('2012-01-03', '2012-01-31'), - tuple([[True, True, True, False, True]]), ]: + df = DataFrame( + np.random.randn(5, 4), + columns=list("ABCD"), + index=pd.date_range("2012-01-01", periods=5), + ) + + for key in [ + "2012-01-03", + "2012-01-31", + slice("2012-01-03", "2012-01-03"), + slice("2012-01-03", "2012-01-04"), + slice("2012-01-03", "2012-01-06", 2), + slice("2012-01-03", "2012-01-31"), + tuple([[True, True, True, False, True]]), + ]: # getitem @@ -142,12 +159,12 @@ def compare(result, expected): compare(df2, df1) # edge cases - s = Series([1, 2, 3, 4], index=list('abde')) + s = Series([1, 2, 3, 4], index=list("abde")) - result1 = s['a':'c'] + result1 = s["a":"c"] with catch_warnings(record=True): - result2 = s.ix['a':'c'] - result3 = s.loc['a':'c'] + result2 = s.ix["a":"c"] + result3 = s.loc["a":"c"] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) @@ -166,65 +183,57 @@ def compare(result, expected): def test_ix_weird_slicing(self): # http://stackoverflow.com/q/17056560/1240268 - df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], - 'two': [1, 2, 3, 4, 5]}) - df.loc[df['one'] > 1, 'two'] = -df['two'] - - expected = DataFrame({'one': {0: 1.0, - 1: 2.0, - 2: 3.0, - 3: np.nan, - 4: np.nan}, - 'two': {0: 1, - 1: -2, - 2: -3, - 3: 4, - 4: 5}}) + df = DataFrame({"one": [1, 2, 3, np.nan, np.nan], "two": [1, 2, 3, 4, 5]}) + df.loc[df["one"] > 1, "two"] = -df["two"] + + expected = DataFrame( + { + "one": {0: 1.0, 1: 2.0, 2: 3.0, 3: np.nan, 4: np.nan}, + "two": {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}, + } + ) tm.assert_frame_equal(df, expected) def test_ix_assign_column_mixed(self, float_frame): # GH #1142 df = float_frame - df['foo'] = 'bar' + df["foo"] = "bar" - orig = df.loc[:, 'B'].copy() - df.loc[:, 'B'] = df.loc[:, 'B'] + 1 + orig = df.loc[:, "B"].copy() + df.loc[:, "B"] = df.loc[:, "B"] + 1 tm.assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value - df = DataFrame({'x': np.arange(10), - 'y': np.arange(10, 20), - 'z': 'bar'}) + df = DataFrame({"x": np.arange(10), "y": np.arange(10, 20), "z": "bar"}) expected = df.copy() for i in range(5): indexer = i * 2 v = 1000 + i * 200 - expected.loc[indexer, 'y'] = v - assert expected.loc[indexer, 'y'] == v + expected.loc[indexer, "y"] = v + assert expected.loc[indexer, "y"] == v - df.loc[df.x % 2 == 0, 'y'] = df.loc[df.x % 2 == 0, 'y'] * 100 + df.loc[df.x % 2 == 0, "y"] = df.loc[df.x % 2 == 0, "y"] * 100 tm.assert_frame_equal(df, expected) # GH 4508, making sure consistency of assignments - df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]}) - df.loc[[0, 2, ], 'b'] = [100, -100] - expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]}) + df = DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) + df.loc[[0, 2], "b"] = [100, -100] + expected = DataFrame({"a": [1, 2, 3], "b": [100, 1, -100]}) tm.assert_frame_equal(df, expected) - df = DataFrame({'a': list(range(4))}) - df['b'] = np.nan - df.loc[[1, 3], 'b'] = [100, -100] - expected = DataFrame({'a': [0, 1, 2, 3], - 'b': [np.nan, 100, np.nan, -100]}) + df = DataFrame({"a": list(range(4))}) + df["b"] = np.nan + df.loc[[1, 3], "b"] = [100, -100] + expected = DataFrame({"a": [0, 1, 2, 3], "b": [np.nan, 100, np.nan, -100]}) tm.assert_frame_equal(df, expected) # ok, but chained assignments are dangerous # if we turn off chained assignment it will work - with option_context('chained_assignment', None): - df = DataFrame({'a': list(range(4))}) - df['b'] 
= np.nan - df['b'].loc[[1, 3]] = [100, -100] + with option_context("chained_assignment", None): + df = DataFrame({"a": list(range(4))}) + df["b"] = np.nan + df["b"].loc[[1, 3]] = [100, -100] tm.assert_frame_equal(df, expected) def test_ix_get_set_consistency(self): @@ -232,58 +241,60 @@ def test_ix_get_set_consistency(self): # GH 4544 # ix/loc get/set not consistent when # a mixed int/string index - df = DataFrame(np.arange(16).reshape((4, 4)), - columns=['a', 'b', 8, 'c'], - index=['e', 7, 'f', 'g']) + df = DataFrame( + np.arange(16).reshape((4, 4)), + columns=["a", "b", 8, "c"], + index=["e", 7, "f", "g"], + ) with catch_warnings(record=True): - assert df.ix['e', 8] == 2 - assert df.loc['e', 8] == 2 + assert df.ix["e", 8] == 2 + assert df.loc["e", 8] == 2 with catch_warnings(record=True): - df.ix['e', 8] = 42 - assert df.ix['e', 8] == 42 - assert df.loc['e', 8] == 42 + df.ix["e", 8] = 42 + assert df.ix["e", 8] == 42 + assert df.loc["e", 8] == 42 - df.loc['e', 8] = 45 + df.loc["e", 8] = 45 with catch_warnings(record=True): - assert df.ix['e', 8] == 45 - assert df.loc['e', 8] == 45 + assert df.ix["e", 8] == 45 + assert df.loc["e", 8] == 45 def test_ix_slicing_strings(self): # see gh-3836 - data = {'Classification': - ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'], - 'Random': [1, 2, 3, 4, 5], - 'X': ['correct', 'wrong', 'correct', 'correct', 'wrong']} + data = { + "Classification": ["SA EQUITY CFD", "bbb", "SA EQUITY", "SA SSF", "aaa"], + "Random": [1, 2, 3, 4, 5], + "X": ["correct", "wrong", "correct", "correct", "wrong"], + } df = DataFrame(data) - x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF' - ])] + x = df[~df.Classification.isin(["SA EQUITY CFD", "SA EQUITY", "SA SSF"])] with catch_warnings(record=True): - df.ix[x.index, 'X'] = df['Classification'] - - expected = DataFrame({'Classification': {0: 'SA EQUITY CFD', - 1: 'bbb', - 2: 'SA EQUITY', - 3: 'SA SSF', - 4: 'aaa'}, - 'Random': {0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5}, - 'X': {0: 'correct', - 1: 'bbb', - 2: 'correct', - 3: 'correct', - 4: 'aaa'}}) # bug was 4: 'bbb' + df.ix[x.index, "X"] = df["Classification"] + + expected = DataFrame( + { + "Classification": { + 0: "SA EQUITY CFD", + 1: "bbb", + 2: "SA EQUITY", + 3: "SA SSF", + 4: "aaa", + }, + "Random": {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + "X": {0: "correct", 1: "bbb", 2: "correct", 3: "correct", 4: "aaa"}, + } + ) # bug was 4: 'bbb' tm.assert_frame_equal(df, expected) def test_ix_setitem_out_of_bounds_axis_0(self): df = DataFrame( - np.random.randn(2, 5), index=["row%s" % i for i in range(2)], - columns=["col%s" % i for i in range(5)]) + np.random.randn(2, 5), + index=["row%s" % i for i in range(2)], + columns=["col%s" % i for i in range(5)], + ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" with pytest.raises(ValueError, match=msg): @@ -291,8 +302,10 @@ def test_ix_setitem_out_of_bounds_axis_0(self): def test_ix_setitem_out_of_bounds_axis_1(self): df = DataFrame( - np.random.randn(5, 2), index=["row%s" % i for i in range(5)], - columns=["col%s" % i for i in range(2)]) + np.random.randn(5, 2), + index=["row%s" % i for i in range(5)], + columns=["col%s" % i for i in range(2)], + ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" with pytest.raises(ValueError, match=msg): @@ -301,24 +314,32 @@ def test_ix_setitem_out_of_bounds_axis_1(self): def test_ix_empty_list_indexer_is_ok(self): with catch_warnings(record=True): from pandas.util.testing import 
makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[:, []], + df.iloc[:, :0], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[[], :], + df.iloc[:0, :], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_ix_duplicate_returns_series(self): - df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], - columns=list('abc')) + df = DataFrame( + np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") + ) with catch_warnings(record=True): - r = df.ix[0.2, 'a'] - e = df.loc[0.2, 'a'] + r = df.ix[0.2, "a"] + e = df.loc[0.2, "a"] tm.assert_series_equal(r, e) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2f6e908717071f..d749e697c8282b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,25 +13,30 @@ class TestLoc(Base): - def test_loc_getitem_dups(self): # GH 5678 # repeated getitems on a dup index returning a ndarray df = DataFrame( - np.random.random_sample((20, 5)), - index=['ABCDE' [x % 5] for x in range(20)]) - expected = df.loc['A', 0] - result = df.loc[:, 0].loc['A'] + np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)] + ) + expected = df.loc["A", 0] + result = df.loc[:, 0].loc["A"] tm.assert_series_equal(result, expected) def test_loc_getitem_dups2(self): # GH4726 # dup indexing with iloc/loc - df = DataFrame([[1, 2, 'foo', 'bar', Timestamp('20130101')]], - columns=['a', 'a', 'a', 'a', 'a'], index=[1]) - expected = Series([1, 2, 'foo', 'bar', Timestamp('20130101')], - index=['a', 'a', 'a', 'a', 'a'], name=1) + df = DataFrame( + [[1, 2, "foo", "bar", Timestamp("20130101")]], + columns=["a", "a", "a", "a", "a"], + index=[1], + ) + expected = Series( + [1, 2, "foo", "bar", Timestamp("20130101")], + index=["a", "a", "a", "a", "a"], + name=1, + ) result = df.iloc[0] tm.assert_series_equal(result, expected) @@ -43,22 +48,25 @@ def test_loc_setitem_dups(self): # GH 6541 df_orig = DataFrame( - {'me': list('rttti'), - 'foo': list('aaade'), - 'bar': np.arange(5, dtype='float64') * 1.34 + 2, - 'bar2': np.arange(5, dtype='float64') * -.34 + 2}).set_index('me') - - indexer = tuple(['r', ['bar', 'bar2']]) + { + "me": list("rttti"), + "foo": list("aaade"), + "bar": np.arange(5, dtype="float64") * 1.34 + 2, + "bar2": np.arange(5, dtype="float64") * -0.34 + 2, + } + ).set_index("me") + + indexer = tuple(["r", ["bar", "bar2"]]) df = df_orig.copy() df.loc[indexer] *= 2.0 tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) - indexer = tuple(['r', 'bar']) + indexer = tuple(["r", "bar"]) df = df_orig.copy() df.loc[indexer] *= 2.0 assert df.loc[indexer] == 2.0 * df_orig.loc[indexer] - indexer = tuple(['t', ['bar', 'bar2']]) + indexer = tuple(["t", ["bar", "bar2"]]) df = df_orig.copy() df.loc[indexer] *= 2.0 tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) @@ -67,109 +75,193 @@ def test_loc_setitem_slice(self): # GH10503 # assigning the same type should not change the type - df1 = DataFrame({'a': [0, 1, 1], - 'b': 
Series([100, 200, 300], dtype='uint32')}) - ix = df1['a'] == 1 - newb1 = df1.loc[ix, 'b'] + 1 - df1.loc[ix, 'b'] = newb1 - expected = DataFrame({'a': [0, 1, 1], - 'b': Series([100, 201, 301], dtype='uint32')}) + df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")}) + ix = df1["a"] == 1 + newb1 = df1.loc[ix, "b"] + 1 + df1.loc[ix, "b"] = newb1 + expected = DataFrame( + {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")} + ) tm.assert_frame_equal(df1, expected) # assigning a new type should get the inferred type - df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') - ix = df1['a'] == 1 - newb2 = df2.loc[ix, 'b'] - df1.loc[ix, 'b'] = newb2 - expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') + df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") + ix = df1["a"] == 1 + newb2 = df2.loc[ix, "b"] + df1.loc[ix, "b"] = newb2 + expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") tm.assert_frame_equal(df2, expected) def test_loc_getitem_int(self): # int label - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['ints', 'uints'], axes=0) - self.check_result('int label', 'loc', 3, 'ix', 3, - typs=['ints', 'uints'], axes=1) - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['label'], fails=KeyError) + self.check_result( + "int label", "loc", 2, "ix", 2, typs=["ints", "uints"], axes=0 + ) + self.check_result( + "int label", "loc", 3, "ix", 3, typs=["ints", "uints"], axes=1 + ) + self.check_result( + "int label", "loc", 2, "ix", 2, typs=["label"], fails=KeyError + ) def test_loc_getitem_label(self): # label - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['labels'], - axes=0) - self.check_result('label', 'loc', 'null', 'ix', 'null', typs=['mixed'], - axes=0) - self.check_result('label', 'loc', 8, 'ix', 8, typs=['mixed'], axes=0) - self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, - typs=['ts'], axes=0) - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['empty'], - fails=KeyError) + self.check_result("label", "loc", "c", "ix", "c", typs=["labels"], axes=0) + self.check_result("label", "loc", "null", "ix", "null", typs=["mixed"], axes=0) + self.check_result("label", "loc", 8, "ix", 8, typs=["mixed"], axes=0) + self.check_result( + "label", "loc", Timestamp("20130102"), "ix", 1, typs=["ts"], axes=0 + ) + self.check_result( + "label", "loc", "c", "ix", "c", typs=["empty"], fails=KeyError + ) def test_loc_getitem_label_out_of_range(self): # out of range label - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['ints', 'uints', 'labels', 'mixed', 'ts'], - fails=KeyError) - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['floats'], fails=KeyError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['ints', 'uints', 'mixed'], fails=KeyError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['labels'], fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], - axes=0, fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'], - axes=0, fails=KeyError) + self.check_result( + "label range", + "loc", + "f", + "ix", + "f", + typs=["ints", "uints", "labels", "mixed", "ts"], + fails=KeyError, + ) + self.check_result( + "label range", "loc", "f", "ix", "f", typs=["floats"], fails=KeyError + ) + self.check_result( + "label range", + "loc", + 20, + "ix", + 20, + typs=["ints", "uints", "mixed"], + fails=KeyError, + ) + 
self.check_result( + "label range", "loc", 20, "ix", 20, typs=["labels"], fails=TypeError + ) + self.check_result( + "label range", "loc", 20, "ix", 20, typs=["ts"], axes=0, fails=TypeError + ) + self.check_result( + "label range", "loc", 20, "ix", 20, typs=["floats"], axes=0, fails=KeyError + ) def test_loc_getitem_label_list(self): # list of labels - self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], - typs=['ints', 'uints'], axes=1) - self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', - ['a', 'b', 'd'], typs=['labels'], axes=0) - self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', - ['A', 'B', 'C'], typs=['labels'], axes=1) - self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix', - [2, 8, 'null'], typs=['mixed'], axes=0) - self.check_result('list lbl', 'loc', - [Timestamp('20130102'), Timestamp('20130103')], 'ix', - [Timestamp('20130102'), Timestamp('20130103')], - typs=['ts'], axes=0) + self.check_result( + "list lbl", + "loc", + [0, 2, 4], + "ix", + [0, 2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + [3, 6, 9], + "ix", + [3, 6, 9], + typs=["ints", "uints"], + axes=1, + ) + self.check_result( + "list lbl", + "loc", + ["a", "b", "d"], + "ix", + ["a", "b", "d"], + typs=["labels"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + ["A", "B", "C"], + "ix", + ["A", "B", "C"], + typs=["labels"], + axes=1, + ) + self.check_result( + "list lbl", + "loc", + [2, 8, "null"], + "ix", + [2, 8, "null"], + typs=["mixed"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + [Timestamp("20130102"), Timestamp("20130103")], + "ix", + [Timestamp("20130102"), Timestamp("20130103")], + typs=["ts"], + axes=0, + ) def test_loc_getitem_label_list_with_missing(self): - self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], - typs=['empty'], fails=KeyError) + self.check_result( + "list lbl", + "loc", + [0, 1, 2], + "indexer", + [0, 1, 2], + typs=["empty"], + fails=KeyError, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [0, 2, 10], 'ix', [0, 2, 10], - typs=['ints', 'uints', 'floats'], - axes=0, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [0, 2, 10], + "ix", + [0, 2, 10], + typs=["ints", "uints", "floats"], + axes=0, + fails=KeyError, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], - typs=['ints', 'uints', 'floats'], - axes=1, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [3, 6, 7], + "ix", + [3, 6, 7], + typs=["ints", "uints", "floats"], + axes=1, + fails=KeyError, + ) # GH 17758 - MultiIndex and missing keys with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [(1, 3), (1, 4), (2, 5)], - 'ix', [(1, 3), (1, 4), (2, 5)], - typs=['multi'], - axes=0) + self.check_result( + "list lbl", + "loc", + [(1, 3), (1, 4), (2, 5)], + "ix", + [(1, 3), (1, 4), (2, 5)], + typs=["multi"], + axes=0, + ) def test_getitem_label_list_with_missing(self): - s = Series(range(3), index=['a', 'b', 'c']) + s = Series(range(3), index=["a", "b", "c"]) # consistency with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s[['a', 'd']] + s[["a", "d"]] s = Series(range(3)) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -177,53 +269,95 @@ 
def test_getitem_label_list_with_missing(self): def test_loc_getitem_label_list_fails(self): # fails - self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints', 'uints'], axes=1, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [20, 30, 40], + "ix", + [20, 30, 40], + typs=["ints", "uints"], + axes=1, + fails=KeyError, + ) def test_loc_getitem_label_array_like(self): # array like - self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, - 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0) - self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, - 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) + self.check_result( + "array like", + "loc", + Series(index=[0, 2, 4]).index, + "ix", + [0, 2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "array like", + "loc", + Series(index=[3, 6, 9]).index, + "ix", + [3, 6, 9], + typs=["ints", "uints"], + axes=1, + ) def test_loc_getitem_bool(self): # boolean indexers b = [True, False, True, False] - self.check_result('bool', 'loc', b, 'ix', b, - typs=['ints', 'uints', 'labels', - 'mixed', 'ts', 'floats']) - self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], - fails=IndexError) - - @pytest.mark.parametrize('index', [[True, False], - [True, False, True, False]]) + self.check_result( + "bool", + "loc", + b, + "ix", + b, + typs=["ints", "uints", "labels", "mixed", "ts", "floats"], + ) + self.check_result("bool", "loc", b, "ix", b, typs=["empty"], fails=IndexError) + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_loc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises(IndexError, - match=('Item wrong length {} instead of {}.'.format( - len(index), len(s)))): + with pytest.raises( + IndexError, + match=("Item wrong length {} instead of {}.".format(len(index), len(s))), + ): _ = s.loc[index] def test_loc_getitem_int_slice(self): # ok - self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], - typs=['ints', 'uints'], axes=1) + self.check_result( + "int slice2", + "loc", + slice(2, 4), + "ix", + [2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "int slice2", + "loc", + slice(3, 6), + "ix", + [3, 6], + typs=["ints", "uints"], + axes=1, + ) def test_loc_to_fail(self): # GH3449 - df = DataFrame(np.random.random((3, 3)), - index=['a', 'b', 'c'], - columns=['e', 'f', 'g']) + df = DataFrame( + np.random.random((3, 3)), index=["a", "b", "c"], columns=["e", "f", "g"] + ) # raise a KeyError? 
- msg = (r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): df.loc[[1, 2], [1, 2]] @@ -232,45 +366,51 @@ def test_loc_to_fail(self): s = Series() s.loc[1] = 1 - s.loc['a'] = 2 + s.loc["a"] = 2 with pytest.raises(KeyError, match=r"^-1$"): s.loc[-1] - msg = (r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] - msg = (r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): - s.loc[['4']] + s.loc[["4"]] s.loc[-1] = 3 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[-1, -2]] expected = Series([3, np.nan], index=[-1, -2]) tm.assert_series_equal(result, expected) - s['a'] = 2 - msg = (r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + s["a"] = 2 + msg = ( + r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): s.loc[[-2]] - del s['a'] + del s["a"] with pytest.raises(KeyError, match=msg): s.loc[[-2]] = 0 # inconsistency between .loc[values] and .loc[values,:] # GH 7999 - df = DataFrame([['a'], ['b']], index=[1, 2], columns=['value']) + df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) - msg = (r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): df.loc[[3], :] @@ -297,35 +437,98 @@ def test_loc_getitem_list_with_fail(self): def test_loc_getitem_label_slice(self): # label slices (with ints) - self.check_result('lab slice', 'loc', slice(1, 3), - 'ix', slice(1, 3), - typs=['labels', 'mixed', 'empty', 'ts', 'floats'], - fails=TypeError) + self.check_result( + "lab slice", + "loc", + slice(1, 3), + "ix", + slice(1, 3), + typs=["labels", "mixed", "empty", "ts", "floats"], + fails=TypeError, + ) # real label slices - self.check_result('lab slice', 'loc', slice('a', 'c'), - 'ix', slice('a', 'c'), typs=['labels'], axes=0) - self.check_result('lab slice', 'loc', slice('A', 'C'), - 'ix', slice('A', 'C'), typs=['labels'], axes=1) - - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=0) - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=1, fails=TypeError) + self.check_result( + "lab slice", + "loc", + slice("a", "c"), + "ix", + slice("a", "c"), + typs=["labels"], + axes=0, + ) + self.check_result( + "lab slice", + "loc", + slice("A", "C"), + "ix", + slice("A", "C"), + typs=["labels"], + axes=1, + ) + + self.check_result( + "ts slice", + "loc", + slice("20130102", "20130104"), + "ix", + slice("20130102", "20130104"), + typs=["ts"], + axes=0, + ) + self.check_result( + "ts slice", + "loc", + slice("20130102", "20130104"), + "ix", + slice("20130102", "20130104"), + typs=["ts"], + axes=1, + fails=TypeError, + ) # GH 14316 - self.check_result('ts 
slice rev', 'loc', slice('20130104', '20130102'), - 'indexer', [0, 1, 2], typs=['ts_rev'], axes=0) - - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=0, fails=TypeError) - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=1, fails=KeyError) - - self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( - 2, 4, 2), typs=['mixed'], axes=0, fails=TypeError) + self.check_result( + "ts slice rev", + "loc", + slice("20130104", "20130102"), + "indexer", + [0, 1, 2], + typs=["ts_rev"], + axes=0, + ) + + self.check_result( + "mixed slice", + "loc", + slice(2, 8), + "ix", + slice(2, 8), + typs=["mixed"], + axes=0, + fails=TypeError, + ) + self.check_result( + "mixed slice", + "loc", + slice(2, 8), + "ix", + slice(2, 8), + typs=["mixed"], + axes=1, + fails=KeyError, + ) + + self.check_result( + "mixed slice", + "loc", + slice(2, 4, 2), + "ix", + slice(2, 4, 2), + typs=["mixed"], + axes=0, + fails=TypeError, + ) def test_loc_index(self): # gh-17131 @@ -333,7 +536,8 @@ def test_loc_index(self): df = DataFrame( np.random.random(size=(5, 10)), - index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"]) + index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"], + ) mask = df.index.map(lambda x: "alpha" in x) expected = df.loc[np.array(mask)] @@ -347,74 +551,106 @@ def test_loc_index(self): def test_loc_general(self): df = DataFrame( - np.random.rand(4, 4), columns=['A', 'B', 'C', 'D'], - index=['A', 'B', 'C', 'D']) + np.random.rand(4, 4), + columns=["A", "B", "C", "D"], + index=["A", "B", "C", "D"], + ) # want this to work result = df.loc[:, "A":"B"].iloc[0:2, :] - assert (result.columns == ['A', 'B']).all() - assert (result.index == ['A', 'B']).all() + assert (result.columns == ["A", "B"]).all() + assert (result.index == ["A", "B"]).all() # mixed type - result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] - expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) + result = DataFrame({"a": [Timestamp("20130101")], "b": [1]}).iloc[0] + expected = Series([Timestamp("20130101"), 1], index=["a", "b"], name=0) tm.assert_series_equal(result, expected) assert result.dtype == object def test_loc_setitem_consistency(self): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame({'date': Series(0, index=range(5), - dtype=np.int64), - 'val': Series(range(5), dtype=np.int64)}) - - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 0 + expected = DataFrame( + { + "date": Series(0, index=range(5), dtype=np.int64), + "val": Series(range(5), dtype=np.int64), + } + ) + + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 0 tm.assert_frame_equal(df, expected) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array(0, dtype=np.int64) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array(0, dtype=np.int64) tm.assert_frame_equal(df, expected) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": 
Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array([0, 0, 0, 0, 0], dtype=np.int64) tm.assert_frame_equal(df, expected) - expected = DataFrame({'date': Series('foo', index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 'foo' + expected = DataFrame( + { + "date": Series("foo", index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) - expected = DataFrame({'date': Series(1.0, index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 1.0 + expected = DataFrame( + { + "date": Series(1.0, index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) # GH 15494 # setting on frame with single row - df = DataFrame({'date': Series([Timestamp('20180101')])}) - df.loc[:, 'date'] = 'string' - expected = DataFrame({'date': Series(['string'])}) + df = DataFrame({"date": Series([Timestamp("20180101")])}) + df.loc[:, "date"] = "string" + expected = DataFrame({"date": Series(["string"])}) tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_empty(self): # empty (essentially noops) - expected = DataFrame(columns=['x', 'y']) - expected['x'] = expected['x'].astype(np.int64) - df = DataFrame(columns=['x', 'y']) - df.loc[:, 'x'] = 1 + expected = DataFrame(columns=["x", "y"]) + expected["x"] = expected["x"].astype(np.int64) + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = 1 tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['x', 'y']) - df['x'] = 1 + df = DataFrame(columns=["x", "y"]) + df["x"] = 1 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_slice_column_len(self): @@ -429,130 +665,147 @@ def test_loc_setitem_consistency_slice_column_len(self): Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No""" df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2]) - df.loc[:, ('Respondent', 'StartDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'StartDate')]) - df.loc[:, ('Respondent', 'EndDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'EndDate')]) - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'EndDate')] - df.loc[:, ('Respondent', 'StartDate')] - - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'Duration')].astype('timedelta64[s]') - expected = Series([1380, 720, 840, 2160.], index=df.index, - name=('Respondent', 'Duration')) - tm.assert_series_equal(df[('Respondent', 'Duration')], expected) + df.loc[:, ("Respondent", "StartDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + df.loc[:, ("Respondent", "EndDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + df.loc[:, ("Respondent", "Duration")] = ( + df.loc[:, ("Respondent", "EndDate")] + - df.loc[:, ("Respondent", "StartDate")] + ) + + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ].astype("timedelta64[s]") + expected = Series( + [1380, 720, 840, 2160.0], index=df.index, name=("Respondent", 
"Duration") + ) + tm.assert_series_equal(df[("Respondent", "Duration")], expected) def test_loc_setitem_frame(self): df = self.frame_labels result = df.iloc[0, 0] - df.loc['a', 'A'] = 1 - result = df.loc['a', 'A'] + df.loc["a", "A"] = 1 + result = df.loc["a", "A"] assert result == 1 result = df.iloc[0, 0] assert result == 1 - df.loc[:, 'B':'D'] = 0 - expected = df.loc[:, 'B':'D'] + df.loc[:, "B":"D"] = 0 + expected = df.loc[:, "B":"D"] result = df.iloc[:, 1:] tm.assert_frame_equal(result, expected) # GH 6254 # setting issue - df = DataFrame(index=[3, 5, 4], columns=['A']) - df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') - expected = DataFrame(dict(A=Series( - [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) + df = DataFrame(index=[3, 5, 4], columns=["A"]) + df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") + expected = DataFrame(dict(A=Series([1, 2, 3], index=[4, 3, 5]))).reindex( + index=[3, 5, 4] + ) tm.assert_frame_equal(df, expected) # GH 6252 # setting with an empty frame - keys1 = ['@' + str(i) for i in range(5)] - val1 = np.arange(5, dtype='int64') + keys1 = ["@" + str(i) for i in range(5)] + val1 = np.arange(5, dtype="int64") - keys2 = ['@' + str(i) for i in range(4)] - val2 = np.arange(4, dtype='int64') + keys2 = ["@" + str(i) for i in range(4)] + val2 = np.arange(4, dtype="int64") index = list(set(keys1).union(keys2)) df = DataFrame(index=index) - df['A'] = np.nan - df.loc[keys1, 'A'] = val1 + df["A"] = np.nan + df.loc[keys1, "A"] = val1 - df['B'] = np.nan - df.loc[keys2, 'B'] = val2 + df["B"] = np.nan + df.loc[keys2, "B"] = val2 - expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( - val2, index=keys2))).reindex(index=index) + expected = DataFrame( + dict(A=Series(val1, index=keys1), B=Series(val2, index=keys2)) + ).reindex(index=index) tm.assert_frame_equal(df, expected) # GH 8669 # invalid coercion of nan -> int - df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - df.loc[df.B > df.A, 'B'] = df.A - expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) + df = DataFrame({"A": [1, 2, 3], "B": np.nan}) + df.loc[df.B > df.A, "B"] = df.A + expected = DataFrame({"A": [1, 2, 3], "B": np.nan}) tm.assert_frame_equal(df, expected) # GH 6546 # setting with mixed labels - df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']}) + df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]}) result = df.loc[0, [1, 2]] expected = Series([1, 3], index=[1, 2], dtype=object, name=0) tm.assert_series_equal(result, expected) - expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) + expected = DataFrame({1: [5, 2], 2: [6, 4], "a": ["a", "b"]}) df.loc[0, [1, 2]] = [5, 6] tm.assert_frame_equal(df, expected) def test_loc_setitem_frame_multiples(self): # multiple setting - df = DataFrame({'A': ['foo', 'bar', 'baz'], - 'B': Series( - range(3), dtype=np.int64)}) + df = DataFrame( + {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} + ) rhs = df.loc[1:2] rhs.index = df.index[0:2] df.loc[0:1] = rhs - expected = DataFrame({'A': ['bar', 'baz', 'baz'], - 'B': Series( - [1, 2, 2], dtype=np.int64)}) + expected = DataFrame( + {"A": ["bar", "baz", "baz"], "B": Series([1, 2, 2], dtype=np.int64)} + ) tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( - '20000102'), Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')], - 'val': Series( - 
[0, 1, 0, 1, 2], dtype=np.int64)}) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + expected = DataFrame( + { + "date": [ + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000103"), + ], + "val": Series([0, 1, 0, 1, 2], dtype=np.int64), + } + ) rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - 'indexer', [['A'], slice(None, 'A', None), np.array(['A'])]) - @pytest.mark.parametrize( - 'value', [['Z'], np.array(['Z'])]) + "indexer", [["A"], slice(None, "A", None), np.array(["A"])] + ) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) def test_loc_setitem_with_scalar_index(self, indexer, value): # GH #19474 # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.loc[0, indexer] = value - result = df.loc[0, 'A'] + result = df.loc[0, "A"] - assert is_scalar(result) and result == 'Z' + assert is_scalar(result) and result == "Z" def test_loc_coerceion(self): # 12411 - df = DataFrame({'date': [Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) + df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) expected = df.dtypes result = df.iloc[[0]] @@ -563,8 +816,10 @@ def test_loc_coerceion(self): # 12045 import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) expected = df.dtypes result = df.iloc[[0]] @@ -574,7 +829,7 @@ def test_loc_coerceion(self): tm.assert_series_equal(result.dtypes, expected) # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) expected = df.dtypes result = df.iloc[0:2] @@ -589,8 +844,9 @@ def test_loc_non_unique(self): # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs # these are going to raise because the we are non monotonic - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ) msg = "'Cannot get left slice bound for non-unique label: 1'" with pytest.raises(KeyError, match=msg): df.loc[1:] @@ -602,20 +858,18 @@ def test_loc_non_unique(self): df.loc[1:2] # monotonic are ok - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, - index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0) + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ).sort_index(axis=0) result = df.loc[1:] - expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]}, - index=[1, 1, 2, 3]) + expected = DataFrame({"A": [2, 4, 5, 6], "B": [4, 6, 7, 8]}, index=[1, 1, 2, 3]) tm.assert_frame_equal(result, expected) result = df.loc[0:] tm.assert_frame_equal(result, df) result = df.loc[1:2] - expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, - index=[1, 1, 2]) + expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2]) tm.assert_frame_equal(result, expected) def test_loc_non_unique_memory_error(self): @@ -623,22 +877,35 @@ def test_loc_non_unique_memory_error(self): # GH 4280 # non_unique index with a large selection triggers a memory error - columns 
= list('ABCDEFG') + columns = list("ABCDEFG") def gen_test(l, l2): - return pd.concat([ - DataFrame(np.random.randn(l, len(columns)), - index=np.arange(l), columns=columns), - DataFrame(np.ones((l2, len(columns))), - index=[0] * l2, columns=columns)]) + return pd.concat( + [ + DataFrame( + np.random.randn(l, len(columns)), + index=np.arange(l), + columns=columns, + ), + DataFrame( + np.ones((l2, len(columns))), index=[0] * l2, columns=columns + ), + ] + ) def gen_expected(df, mask): len_mask = len(mask) - return pd.concat([df.take([0]), - DataFrame(np.ones((len_mask, len(columns))), - index=[0] * len_mask, - columns=columns), - df.take(mask[1:])]) + return pd.concat( + [ + df.take([0]), + DataFrame( + np.ones((len_mask, len(columns))), + index=[0] * len_mask, + columns=columns, + ), + df.take(mask[1:]), + ] + ) df = gen_test(900, 100) assert df.index.is_unique is False @@ -659,42 +926,45 @@ def gen_expected(df, mask): def test_loc_name(self): # GH 3880 df = DataFrame([[1, 1], [1, 1]]) - df.index.name = 'index_name' + df.index.name = "index_name" result = df.iloc[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" with catch_warnings(record=True): filterwarnings("ignore", "\\n.ix", FutureWarning) result = df.ix[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" result = df.loc[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" def test_loc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True + ) # horizontal empty - tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.loc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) # horizontal empty - tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_identity_slice_returns_new_object(self): # GH13873 - original_df = DataFrame({'a': [1, 2, 3]}) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.loc[:] assert sliced_df is not original_df assert original_df[:] is not original_df # should be a shallow copy - original_df['a'] = [4, 4, 4] - assert (sliced_df['a'] == 4).all() + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] @@ -713,53 +983,53 @@ def test_identity_slice_returns_new_object(self): def test_loc_uint64(self): # GH20722 # Test whether loc accept uint64 max value as index. 
- s = pd.Series([1, 2], - index=[np.iinfo('uint64').max - 1, - np.iinfo('uint64').max]) + s = pd.Series( + [1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max] + ) - result = s.loc[np.iinfo('uint64').max - 1] + result = s.loc[np.iinfo("uint64").max - 1] expected = s.iloc[0] assert result == expected - result = s.loc[[np.iinfo('uint64').max - 1]] + result = s.loc[[np.iinfo("uint64").max - 1]] expected = s.iloc[[0]] tm.assert_series_equal(result, expected) - result = s.loc[[np.iinfo('uint64').max - 1, - np.iinfo('uint64').max]] + result = s.loc[[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]] tm.assert_series_equal(result, s) def test_loc_setitem_empty_append(self): # GH6173, various appends to an empty dataframe data = [1, 2, 3] - expected = DataFrame({'x': data, 'y': [None] * len(data)}) + expected = DataFrame({"x": data, "y": [None] * len(data)}) # appends to fit length of data - df = DataFrame(columns=['x', 'y']) - df.loc[:, 'x'] = data + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = data tm.assert_frame_equal(df, expected) # only appends one value - expected = DataFrame({'x': [1.0], 'y': [np.nan]}) - df = DataFrame(columns=['x', 'y'], - dtype=np.float) - df.loc[0, 'x'] = expected.loc[0, 'x'] + expected = DataFrame({"x": [1.0], "y": [np.nan]}) + df = DataFrame(columns=["x", "y"], dtype=np.float) + df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe data = [1, 2] - df = DataFrame(columns=['x', 'y']) - msg = (r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " - r"are in the \[index\]") + df = DataFrame(columns=["x", "y"]) + msg = ( + r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " + r"are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): - df.loc[[0, 1], 'x'] = data + df.loc[[0, 1], "x"] = data msg = "cannot copy sequence with size 2 to array axis with dimension 0" with pytest.raises(ValueError, match=msg): - df.loc[0:2, 'x'] = data + df.loc[0:2, "x"] = data def test_indexing_zerodim_np_array(self): # GH24924 diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f2696f282c2c44..68e93f06e43dc6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -15,7 +15,6 @@ class TestPartialSetting: - @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") def test_partial_setting(self): @@ -35,100 +34,101 @@ def test_partial_setting(self): tm.assert_series_equal(s, expected) s = s_orig.copy() - s[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) + s[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) tm.assert_series_equal(s, expected) s = s_orig.copy() - s.loc[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) + s.loc[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) tm.assert_series_equal(s, expected) # iloc/iat raise s = s_orig.copy() with pytest.raises(IndexError): - s.iloc[3] = 5. + s.iloc[3] = 5.0 with pytest.raises(IndexError): - s.iat[3] = 5. + s.iat[3] = 5.0 # ## frame ## df_orig = DataFrame( - np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64') + np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" + ) # iloc/iat raise df = df_orig.copy() with pytest.raises(IndexError): - df.iloc[4, 2] = 5. + df.iloc[4, 2] = 5.0 with pytest.raises(IndexError): - df.iat[4, 2] = 5. 
+ df.iat[4, 2] = 5.0 # row setting where it exists - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) df = df_orig.copy() df.iloc[1] = df.iloc[2] tm.assert_frame_equal(df, expected) - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) df = df_orig.copy() df.loc[1] = df.loc[2] tm.assert_frame_equal(df, expected) # like 2578, partial setting with dtype preservation - expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) + expected = DataFrame(dict({"A": [0, 2, 4, 4], "B": [1, 3, 5, 5]})) df = df_orig.copy() df.loc[3] = df.loc[2] tm.assert_frame_equal(df, expected) # single dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) + expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]})) df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'B'] = df.ix[:, 'A'] + df.ix[:, "B"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) df = df_orig.copy() - df['B'] = df['B'].astype(np.float64) + df["B"] = df["B"].astype(np.float64) with catch_warnings(record=True): - df.ix[:, 'B'] = df.ix[:, 'A'] + df.ix[:, "B"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # single dtype frame, partial setting expected = df_orig.copy() - expected['C'] = df['A'] + expected["C"] = df["A"] df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'C'] = df.ix[:, 'A'] + df.ix[:, "C"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # mixed frame, partial setting expected = df_orig.copy() - expected['C'] = df['A'] + expected["C"] = df["A"] df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'C'] = df.ix[:, 'A'] + df.ix[:, "C"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # GH 8473 - dates = date_range('1/1/2000', periods=8) - df_orig = DataFrame(np.random.randn(8, 4), index=dates, - columns=['A', 'B', 'C', 'D']) - - expected = pd.concat([df_orig, - DataFrame({'A': 7}, - index=[dates[-1] + dates.freq])], - sort=True) + dates = date_range("1/1/2000", periods=8) + df_orig = DataFrame( + np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"] + ) + + expected = pd.concat( + [df_orig, DataFrame({"A": 7}, index=[dates[-1] + dates.freq])], sort=True + ) df = df_orig.copy() - df.loc[dates[-1] + dates.freq, 'A'] = 7 + df.loc[dates[-1] + dates.freq, "A"] = 7 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.at[dates[-1] + dates.freq, 'A'] = 7 + df.at[dates[-1] + dates.freq, "A"] = 7 tm.assert_frame_equal(df, expected) exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq]) @@ -155,30 +155,28 @@ def test_partial_setting_mixed_dtype(self): tm.assert_frame_equal(df, expected) # columns will align - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=range(4)) - tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) + tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) # columns will align - df = DataFrame(columns=['A', 'B']) - df.loc[0] = Series(1, index=['B']) + df = DataFrame(columns=["A", "B"]) + df.loc[0] = Series(1, index=["B"]) - exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], - index=[0], dtype='float64') + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") tm.assert_frame_equal(df, exp) # 
list-like must conform - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) with pytest.raises(ValueError): df.loc[0] = [1, 2, 3] # TODO: #15657, these are left as object and not coerced - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[3] = [6, 7] - exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], - dtype='object') + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object") tm.assert_frame_equal(df, exp) def test_series_partial_set(self): @@ -195,29 +193,31 @@ def test_series_partial_set(self): result = ser.reindex([3, 2, 3]) tm.assert_series_equal(result, expected, check_index_type=True) - expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) + expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, "x"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3, 'x']] + result = ser.loc[[3, 2, 3, "x"]] tm.assert_series_equal(result, expected, check_index_type=True) - result = ser.reindex([3, 2, 3, 'x']) + result = ser.reindex([3, 2, 3, "x"]) tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) result = ser.loc[[2, 2, 1]] tm.assert_series_equal(result, expected, check_index_type=True) - expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) + expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, "x", 1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 'x', 1]] + result = ser.loc[[2, 2, "x", 1]] tm.assert_series_equal(result, expected, check_index_type=True) - result = ser.reindex([2, 2, 'x', 1]) + result = ser.reindex([2, 2, "x", 1]) tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index - msg = (r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] @@ -238,8 +238,7 @@ def test_series_partial_set(self): result = s.reindex([3, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[5, 3, 3]] @@ -248,8 +247,7 @@ def test_series_partial_set(self): result = s.reindex([5, 3, 3]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[5, 4, 4]] @@ -258,8 +256,7 @@ def test_series_partial_set(self): result = s.reindex([5, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[4, 5, 6, 7]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[7, 2, 2]] @@ -268,8 +265,7 @@ def test_series_partial_set(self): result = s.reindex([7, 2, 2]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = 
Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[4, 5, 5]] @@ -286,90 +282,85 @@ def test_series_partial_set(self): def test_series_partial_set_with_name(self): # GH 11497 - idx = Index([1, 2], dtype='int64', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = Index([1, 2], dtype="int64", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") # loc - exp_idx = Index([3, 2, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') + exp_idx = Index([3, 2, 3], dtype="int64", name="idx") + expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[[3, 2, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') - expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, - name='s') + exp_idx = Index([3, 2, 3, "x"], dtype="object", name="idx") + expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3, 'x']] + result = ser.loc[[3, 2, 3, "x"]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([2, 2, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1], index=exp_idx, name="s") result = ser.loc[[2, 2, 1]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') - expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, "x", 1], dtype="object", name="idx") + expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 'x', 1]] + result = ser.loc[[2, 2, "x", 1]] tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index - msg = (r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," - r" name='idx'\)\] are in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," + r" name='idx'\)\] are in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] - exp_idx = Index([2, 2, 3], dtype='int64', name='idx') - expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') + exp_idx = Index([2, 2, 3], dtype="int64", name="idx") + expected = Series([0.2, 0.2, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[[2, 2, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([3, 4, 4], dtype='int64', name='idx') - expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3], dtype='int64', name='idx') + exp_idx = Index([3, 4, 4], dtype="int64", name="idx") + expected = Series([0.3, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([1, 2, 3], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3], - index=idx, - name='s').loc[[3, 4, 4]] + result = Series([0.1, 0.2, 0.3], index=idx, name="s").loc[[3, 4, 4]] tm.assert_series_equal(result, expected, 
check_index_type=True) - exp_idx = Index([5, 3, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([5, 3, 3], dtype="int64", name="idx") + expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 3, 3]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 3, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([5, 4, 4], dtype='int64', name='idx') - expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([5, 4, 4], dtype="int64", name="idx") + expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 4, 4]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 4, 4]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([7, 2, 2], dtype='int64', name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([4, 5, 6, 7], dtype='int64', name='idx') + exp_idx = Index([7, 2, 2], dtype="int64", name="idx") + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([4, 5, 6, 7], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[7, 2, 2]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[7, 2, 2]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([4, 5, 5], dtype='int64', name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([4, 5, 5], dtype="int64", name="idx") + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[4, 5, 5]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[4, 5, 5]] tm.assert_series_equal(result, expected, check_index_type=True) # iloc - exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, 1, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name="s") result = ser.iloc[[1, 1, 0, 0]] tm.assert_series_equal(result, expected, check_index_type=True) @@ -402,11 +393,11 @@ def test_partial_set_invalid(self): # allow object conversion here df = orig.copy() with catch_warnings(record=True): - df.loc['a', :] = df.ix[0] - exp = orig.append(Series(df.ix[0], name='a')) + df.loc["a", :] = df.ix[0] + exp = orig.append(Series(df.ix[0], name="a")) tm.assert_frame_equal(df, exp) - tm.assert_index_equal(df.index, Index(orig.index.tolist() + ['a'])) - assert df.index.dtype == 'object' + tm.assert_index_equal(df.index, Index(orig.index.tolist() + ["a"])) + assert df.index.dtype == "object" def test_partial_set_empty_series(self): @@ -420,18 +411,18 
@@ def test_partial_set_empty_series(self): tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) s = Series() - s.loc[1] = 1. - tm.assert_series_equal(s, Series([1.], index=[1])) - s.loc[3] = 3. - tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) + s.loc[1] = 1.0 + tm.assert_series_equal(s, Series([1.0], index=[1])) + s.loc[3] = 3.0 + tm.assert_series_equal(s, Series([1.0, 3.0], index=[1, 3])) s = Series() - s.loc['foo'] = 1 - tm.assert_series_equal(s, Series([1], index=['foo'])) - s.loc['bar'] = 3 - tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) + s.loc["foo"] = 1 + tm.assert_series_equal(s, Series([1], index=["foo"])) + s.loc["bar"] = 3 + tm.assert_series_equal(s, Series([1, 3], index=["foo", "bar"])) s.loc[3] = 4 - tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + tm.assert_series_equal(s, Series([1, 3, 4], index=["foo", "bar", 3])) def test_partial_set_empty_frame(self): @@ -443,7 +434,7 @@ def test_partial_set_empty_frame(self): df.loc[1] = 1 with pytest.raises(ValueError): - df.loc[1] = Series([1], index=['foo']) + df.loc[1] = Series([1], index=["foo"]) with pytest.raises(ValueError): df.loc[:, 1] = 1 @@ -451,75 +442,74 @@ def test_partial_set_empty_frame(self): # these work as they don't really change # anything but the index # GH5632 - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) def f(): df = DataFrame() - df['foo'] = Series([], dtype='object') + df["foo"] = Series([], dtype="object") return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = Series(df.index) + df["foo"] = Series(df.index) return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = df.index + df["foo"] = df.index return df tm.assert_frame_equal(f(), expected) - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") def f(): df = DataFrame() - df['foo'] = [] + df["foo"] = [] return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = Series(np.arange(len(df)), dtype='float64') + df["foo"] = Series(np.arange(len(df)), dtype="float64") return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - tm.assert_index_equal(df.index, Index([], dtype='object')) - df['foo'] = range(len(df)) + tm.assert_index_equal(df.index, Index([], dtype="object")) + df["foo"] = range(len(df)) return df - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") tm.assert_frame_equal(f(), expected) df = DataFrame() tm.assert_index_equal(df.columns, Index([], dtype=object)) df2 = DataFrame() - df2[1] = Series([1], index=['foo']) - df.loc[:, 1] = Series([1], index=['foo']) - tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) + df2[1] = Series([1], index=["foo"]) + df.loc[:, 1] = Series([1], index=["foo"]) + tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1])) tm.assert_frame_equal(df, df2) # no index to start - expected = DataFrame({0: Series(1, index=range(4))}, - columns=['A', 'B', 0]) + expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0]) - df = 
DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df[0] = Series(1, index=range(4)) df.dtypes str(df) tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[:, 0] = Series(1, index=range(4)) df.dtypes str(df) @@ -528,34 +518,32 @@ def f(): def test_partial_set_empty_frame_row(self): # GH5720, GH5744 # don't create rows when empty - expected = DataFrame(columns=['A', 'B', 'New'], - index=Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['New'] = expected['New'].astype('float64') + expected = DataFrame(columns=["A", "B", "New"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["New"] = expected["New"].astype("float64") df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] - y['New'] = np.nan + y["New"] = np.nan tm.assert_frame_equal(y, expected) # tm.assert_frame_equal(y,expected) - expected = DataFrame(columns=['a', 'b', 'c c', 'd']) - expected['d'] = expected['d'].astype('int64') - df = DataFrame(columns=['a', 'b', 'c c']) - df['d'] = 3 + expected = DataFrame(columns=["a", "b", "c c", "d"]) + expected["d"] = expected["d"].astype("int64") + df = DataFrame(columns=["a", "b", "c c"]) + df["d"] = 3 tm.assert_frame_equal(df, expected) - tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) + tm.assert_series_equal(df["c c"], Series(name="c c", dtype=object)) # reindex columns is ok df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] - result = y.reindex(columns=['A', 'B', 'C']) - expected = DataFrame(columns=['A', 'B', 'C'], - index=Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['C'] = expected['C'].astype('float64') + result = y.reindex(columns=["A", "B", "C"]) + expected = DataFrame(columns=["A", "B", "C"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["C"] = expected["C"].astype("float64") tm.assert_frame_equal(result, expected) def test_partial_set_empty_frame_set_series(self): @@ -564,33 +552,32 @@ def test_partial_set_empty_frame_set_series(self): df = DataFrame(Series()) tm.assert_frame_equal(df, DataFrame({0: Series()})) - df = DataFrame(Series(name='foo')) - tm.assert_frame_equal(df, DataFrame({'foo': Series()})) + df = DataFrame(Series(name="foo")) + tm.assert_frame_equal(df, DataFrame({"foo": Series()})) def test_partial_set_empty_frame_empty_copy_assignment(self): # GH 5932 # copy on empty with assignment fails df = DataFrame(index=[0]) df = df.copy() - df['a'] = 0 - expected = DataFrame(0, index=[0], columns=['a']) + df["a"] = 0 + expected = DataFrame(0, index=[0], columns=["a"]) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_empty_consistencies(self): # GH 6171 # consistency on empty frames - df = DataFrame(columns=['x', 'y']) - df['x'] = [1, 2] + df = DataFrame(columns=["x", "y"]) + df["x"] = [1, 2] expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) tm.assert_frame_equal(df, expected, check_dtype=False) - df = DataFrame(columns=['x', 'y']) - df['x'] = ['1', '2'] - expected = DataFrame( - dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) + df = DataFrame(columns=["x", "y"]) + df["x"] = ["1", "2"] + expected = DataFrame(dict(x=["1", "2"], y=[np.nan, np.nan]), 
dtype=object) tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['x', 'y']) - df.loc[0, 'x'] = 1 + df = DataFrame(columns=["x", "y"]) + df.loc[0, "x"] = 1 expected = DataFrame(dict(x=[1], y=[np.nan])) tm.assert_frame_equal(df, expected, check_dtype=False) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 20053264ac4f1c..a6e1273a229dc7 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -9,7 +9,6 @@ class TestScalar(Base): - def test_at_and_iat_get(self): def _check(f, func, values=False): @@ -25,19 +24,18 @@ def _check(f, func, values=False): d = getattr(self, o) # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) - for f in [d['labels'], d['ts'], d['floats']]: + for f in [d["labels"], d["ts"], d["floats"]]: if f is not None: msg = "iAt based indexing can only have integer indexers" with pytest.raises(ValueError, match=msg): - self.check_values(f, 'iat') + self.check_values(f, "iat") # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") def test_at_and_iat_set(self): def _check(f, func, values=False): @@ -54,28 +52,25 @@ def _check(f, func, values=False): d = getattr(self, t) # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) - for f in [d['labels'], d['ts'], d['floats']]: + for f in [d["labels"], d["ts"], d["floats"]]: if f is not None: msg = "iAt based indexing can only have integer indexers" with pytest.raises(ValueError, match=msg): - _check(f, 'iat') + _check(f, "iat") # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") def test_at_iat_coercion(self): # as timestamp is not a tuple! 
- dates = date_range('1/1/2000', periods=8) - df = DataFrame(np.random.randn(8, 4), - index=dates, - columns=['A', 'B', 'C', 'D']) - s = df['A'] + dates = date_range("1/1/2000", periods=8) + df = DataFrame(np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"]) + s = df["A"] result = s.at[dates[5]] xp = s.values[5] @@ -83,15 +78,15 @@ def test_at_iat_coercion(self): # GH 7729 # make sure we are boxing the returns - s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]') - expected = Timestamp('2014-02-02') + s = Series(["2014-01-01", "2014-02-02"], dtype="datetime64[ns]") + expected = Timestamp("2014-02-02") for r in [lambda: s.iat[1], lambda: s.iloc[1]]: result = r() assert result == expected - s = Series(['1 days', '2 days'], dtype='timedelta64[ns]') - expected = Timedelta('2 days') + s = Series(["1 days", "2 days"], dtype="timedelta64[ns]") + expected = Timedelta("2 days") for r in [lambda: s.iat[1], lambda: s.iloc[1]]: result = r() @@ -105,7 +100,7 @@ def test_imethods_with_dups(self): # GH6493 # iat/iloc with dups - s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64') + s = Series(range(5), index=[1, 1, 2, 2, 3], dtype="int64") result = s.iloc[2] assert result == 2 result = s.iat[2] @@ -119,7 +114,7 @@ def test_imethods_with_dups(self): s.iat[-10] result = s.iloc[[2, 3]] - expected = Series([2, 3], [2, 2], dtype='int64') + expected = Series([2, 3], [2, 2], dtype="int64") tm.assert_series_equal(result, expected) df = s.to_frame() @@ -133,61 +128,68 @@ def test_imethods_with_dups(self): def test_at_to_fail(self): # at should not fallback # GH 7814 - s = Series([1, 2, 3], index=list('abc')) - result = s.at['a'] + s = Series([1, 2, 3], index=list("abc")) + result = s.at["a"] assert result == 1 - msg = ("At based indexing on an non-integer index can only have" - " non-integer indexers") + msg = ( + "At based indexing on an non-integer index can only have" + " non-integer indexers" + ) with pytest.raises(ValueError, match=msg): s.at[0] - df = DataFrame({'A': [1, 2, 3]}, index=list('abc')) - result = df.at['a', 'A'] + df = DataFrame({"A": [1, 2, 3]}, index=list("abc")) + result = df.at["a", "A"] assert result == 1 with pytest.raises(ValueError, match=msg): - df.at['a', 0] + df.at["a", 0] s = Series([1, 2, 3], index=[3, 2, 1]) result = s.at[1] assert result == 3 - msg = ("At based indexing on an integer index can only have integer" - " indexers") + msg = "At based indexing on an integer index can only have integer" " indexers" with pytest.raises(ValueError, match=msg): - s.at['a'] + s.at["a"] df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) result = df.at[1, 0] assert result == 3 with pytest.raises(ValueError, match=msg): - df.at['a', 0] + df.at["a", 0] # GH 13822, incorrect error string with non-unique columns when missing # column is accessed - df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) - df.columns = ['x', 'x', 'z'] + df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}) + df.columns = ["x", "x", "z"] # Check that we get the correct value in the KeyError with pytest.raises(KeyError, match=r"\['y'\] not in index"): - df[['x', 'y', 'z']] + df[["x", "y", "z"]] def test_at_with_tz(self): # gh-15822 - df = DataFrame({'name': ['John', 'Anderson'], - 'date': [Timestamp(2017, 3, 13, 13, 32, 56), - Timestamp(2017, 2, 16, 12, 10, 3)]}) - df['date'] = df['date'].dt.tz_localize('Asia/Shanghai') - - expected = Timestamp('2017-03-13 13:32:56+0800', tz='Asia/Shanghai') - - result = df.loc[0, 'date'] + df = DataFrame( + { + "name": ["John", "Anderson"], + "date": [ + 
Timestamp(2017, 3, 13, 13, 32, 56), + Timestamp(2017, 2, 16, 12, 10, 3), + ], + } + ) + df["date"] = df["date"].dt.tz_localize("Asia/Shanghai") + + expected = Timestamp("2017-03-13 13:32:56+0800", tz="Asia/Shanghai") + + result = df.loc[0, "date"] assert result == expected - result = df.at[0, 'date'] + result = df.at[0, "date"] assert result == expected def test_series_set_tz_timestamp(self, tz_naive_fixture): # GH 25506 - ts = Timestamp('2017-08-05 00:00:00+0100', tz=tz_naive_fixture) + ts = Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) result = Series(ts) result.at[1] = ts expected = Series([ts, ts]) @@ -195,7 +197,7 @@ def test_series_set_tz_timestamp(self, tz_naive_fixture): def test_mixed_index_at_iat_loc_iloc_series(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) for el, item in s.iteritems(): assert s.at[el] == s.loc[el] == item for i in range(len(s)): @@ -208,8 +210,9 @@ def test_mixed_index_at_iat_loc_iloc_series(self): def test_mixed_index_at_iat_loc_iloc_dataframe(self): # GH 19860 - df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], - columns=['a', 'b', 'c', 1, 2]) + df = DataFrame( + [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], columns=["a", "b", "c", 1, 2] + ) for rowIdx, row in df.iterrows(): for el, item in row.iteritems(): assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item @@ -225,7 +228,7 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): def test_iat_setter_incompatible_assignment(self): # GH 23236 - result = DataFrame({'a': [0, 1], 'b': [4, 5]}) + result = DataFrame({"a": [0, 1], "b": [4, 5]}) result.iat[0, 0] = None expected = DataFrame({"a": [None, 1], "b": [4, 5]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index e3f5bcff4a22e2..7628aa53ef3cbe 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -8,75 +8,82 @@ class TestTimedeltaIndexing: def test_boolean_indexing(self): # GH 14946 - df = pd.DataFrame({'x': range(10)}) - df.index = pd.to_timedelta(range(10), unit='s') - conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3] - expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10], - [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], - [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] + df = pd.DataFrame({"x": range(10)}) + df.index = pd.to_timedelta(range(10), unit="s") + conditions = [df["x"] > 3, df["x"] == 3, df["x"] < 3] + expected_data = [ + [0, 1, 2, 3, 10, 10, 10, 10, 10, 10], + [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], + [10, 10, 10, 3, 4, 5, 6, 7, 8, 9], + ] for cond, data in zip(conditions, expected_data): - result = df.assign(x=df.mask(cond, 10).astype('int64')) - expected = pd.DataFrame(data, - index=pd.to_timedelta(range(10), unit='s'), - columns=['x'], - dtype='int64') + result = df.assign(x=df.mask(cond, 10).astype("int64")) + expected = pd.DataFrame( + data, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "indexer, expected", - [(0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]), - ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9])]) + [ + (0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]), + ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9]), + ], + ) def test_list_like_indexing(self, indexer, expected): # GH 16637 - df = pd.DataFrame({'x': range(10)}, dtype="int64") - df.index = 
pd.to_timedelta(range(10), unit='s') + df = pd.DataFrame({"x": range(10)}, dtype="int64") + df.index = pd.to_timedelta(range(10), unit="s") - df.loc[df.index[indexer], 'x'] = 20 + df.loc[df.index[indexer], "x"] = 20 - expected = pd.DataFrame(expected, - index=pd.to_timedelta(range(10), unit='s'), - columns=['x'], - dtype="int64") + expected = pd.DataFrame( + expected, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) tm.assert_frame_equal(expected, df) def test_string_indexing(self): # GH 16896 - df = pd.DataFrame({'x': range(3)}, - index=pd.to_timedelta(range(3), unit='days')) + df = pd.DataFrame({"x": range(3)}, index=pd.to_timedelta(range(3), unit="days")) expected = df.iloc[0] - sliced = df.loc['0 days'] + sliced = df.loc["0 days"] tm.assert_series_equal(sliced, expected) - @pytest.mark.parametrize( - "value", - [None, pd.NaT, np.nan]) + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) def test_masked_setitem(self, value): # issue (#18586) - series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") series[series == series[0]] = value - expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) - @pytest.mark.parametrize( - "value", - [None, pd.NaT, np.nan]) + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) def test_listlike_setitem(self, value): # issue (#18586) - series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") series.iloc[0] = value - expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) - @pytest.mark.parametrize('start,stop, expected_slice', [ - [np.timedelta64(0, 'ns'), None, slice(0, 11)], - [np.timedelta64(1, 'D'), np.timedelta64(6, 'D'), slice(1, 7)], - [None, np.timedelta64(4, 'D'), slice(0, 5)]]) - def test_numpy_timedelta_scalar_indexing(self, start, stop, - expected_slice): + @pytest.mark.parametrize( + "start,stop, expected_slice", + [ + [np.timedelta64(0, "ns"), None, slice(0, 11)], + [np.timedelta64(1, "D"), np.timedelta64(6, "D"), slice(1, 7)], + [None, np.timedelta64(4, "D"), slice(0, 5)], + ], + ) + def test_numpy_timedelta_scalar_indexing(self, start, stop, expected_slice): # GH 20393 - s = pd.Series(range(11), pd.timedelta_range('0 days', '10 days')) + s = pd.Series(range(11), pd.timedelta_range("0 days", "10 days")) result = s.loc[slice(start, stop)] expected = s.iloc[expected_slice] tm.assert_series_equal(result, expected) @@ -85,19 +92,19 @@ def test_roundtrip_thru_setitem(self): # PR 23462 dt1 = pd.Timedelta(0) dt2 = pd.Timedelta(28767471428571405) - df = pd.DataFrame({'dt': pd.Series([dt1, dt2])}) + df = pd.DataFrame({"dt": pd.Series([dt1, dt2])}) df_copy = df.copy() s = pd.Series([dt1]) - expected = df['dt'].iloc[1].value + expected = df["dt"].iloc[1].value df.loc[[True, False]] = s - result = df['dt'].iloc[1].value + result = df["dt"].iloc[1].value assert expected == result tm.assert_frame_equal(df, df_copy) def test_loc_str_slicing(self): - ix = pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") ser = ix.to_series() result = ser.loc[:"1 days"] expected = ser.iloc[:-1] @@ -105,9 +112,9 @@ def test_loc_str_slicing(self): tm.assert_series_equal(result, expected) def test_loc_slicing(self): - ix = 
pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") ser = ix.to_series() - result = ser.loc[:ix[-2]] + result = ser.loc[: ix[-2]] expected = ser.iloc[:-1] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 411146843d60fa..9ce1062a6ec26b 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -13,25 +13,36 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - SparseArray) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + SparseArray, +) import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, TimedeltaArray from pandas.core.internals import BlockManager, SingleBlockManager, make_block import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, randn) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + randn, +) # in 3.6.1 a c-api slicing function changed, see src/compat_helper.h -PY361 = LooseVersion(sys.version) >= LooseVersion('3.6.1') +PY361 = LooseVersion(sys.version) >= LooseVersion("3.6.1") @pytest.fixture def mgr(): return create_mgr( - 'a: f8; b: object; c: f8; d: object; e: f8;' - 'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;' - 'k: M8[ns, US/Eastern]; l: M8[ns, CET];') + "a: f8; b: object; c: f8; d: object; e: f8;" + "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;" + "k: M8[ns, US/Eastern]; l: M8[ns, CET];" + ) def assert_block_equal(left, right): @@ -39,14 +50,14 @@ def assert_block_equal(left, right): assert left.dtype == right.dtype assert isinstance(left.mgr_locs, BlockPlacement) assert isinstance(right.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(left.mgr_locs.as_array, - right.mgr_locs.as_array) + tm.assert_numpy_array_equal(left.mgr_locs.as_array, right.mgr_locs.as_array) def get_numeric_mat(shape): arr = np.arange(shape[0]) - return np.lib.stride_tricks.as_strided(x=arr, shape=shape, strides=( - arr.itemsize, ) + (0, ) * (len(shape) - 1)).copy() + return np.lib.stride_tricks.as_strided( + x=arr, shape=shape, strides=(arr.itemsize,) + (0,) * (len(shape) - 1) + ).copy() N = 10 @@ -73,50 +84,64 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): num_items = len(placement) if item_shape is None: - item_shape = (N, ) + item_shape = (N,) - shape = (num_items, ) + item_shape + shape = (num_items,) + item_shape mat = get_numeric_mat(shape) - if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1', - 'uint', 'u8', 'u4', 'u2', 'u1'): + if typestr in ( + "float", + "f8", + "f4", + "f2", + "int", + "i8", + "i4", + "i2", + "i1", + "uint", + "u8", + "u4", + "u2", + "u1", + ): values = mat.astype(typestr) + num_offset - elif typestr in ('complex', 'c16', 'c8'): - values = 1.j * (mat.astype(typestr) + num_offset) - elif typestr in ('object', 'string', 'O'): - values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], - shape) - elif typestr in ('b', 'bool', ): + elif typestr in ("complex", "c16", "c8"): + values = 1.0j * (mat.astype(typestr) + num_offset) + elif typestr in ("object", "string", "O"): + values = np.reshape(["A%d" % i for i in mat.ravel() + num_offset], shape) + elif typestr in ("b", "bool"): values = np.ones(shape, dtype=np.bool_) - elif typestr in ('datetime', 'dt', 'M8[ns]'): - 
values = (mat * 1e9).astype('M8[ns]') - elif typestr.startswith('M8[ns'): + elif typestr in ("datetime", "dt", "M8[ns]"): + values = (mat * 1e9).astype("M8[ns]") + elif typestr.startswith("M8[ns"): # datetime with tz - m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr) + m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr) assert m is not None, "incompatible typestr -> {0}".format(typestr) tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) - elif typestr in ('timedelta', 'td', 'm8[ns]'): - values = (mat * 1).astype('m8[ns]') - elif typestr in ('category', ): + elif typestr in ("timedelta", "td", "m8[ns]"): + values = (mat * 1).astype("m8[ns]") + elif typestr in ("category",): values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4]) - elif typestr in ('category2', ): - values = Categorical(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd' - ]) - elif typestr in ('sparse', 'sparse_na'): + elif typestr in ("category2",): + values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"]) + elif typestr in ("sparse", "sparse_na"): # FIXME: doesn't support num_rows != 10 assert shape[-1] == 10 assert all(s == 1 for s in shape[:-1]) - if typestr.endswith('_na'): + if typestr.endswith("_na"): fill_value = np.nan else: fill_value = 0.0 - values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value, - 4, 5, fill_value, 6], fill_value=fill_value) + values = SparseArray( + [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6], + fill_value=fill_value, + ) arr = values.sp_values.view() - arr += (num_offset - 1) + arr += num_offset - 1 else: raise ValueError('Unsupported typestr: "%s"' % typestr) @@ -129,7 +154,8 @@ def create_single_mgr(typestr, num_rows=None): return SingleBlockManager( create_block(typestr, placement=slice(0, num_rows), item_shape=()), - np.arange(num_rows)) + np.arange(num_rows), + ) def create_mgr(descr, item_shape=None): @@ -154,18 +180,18 @@ def create_mgr(descr, item_shape=None): """ if item_shape is None: - item_shape = (N, ) + item_shape = (N,) offset = 0 mgr_items = [] block_placements = OrderedDict() - for d in descr.split(';'): + for d in descr.split(";"): d = d.strip() if not len(d): continue - names, blockstr = d.partition(':')[::2] + names, blockstr = d.partition(":")[::2] blockstr = blockstr.strip() - names = names.strip().split(',') + names = names.strip().split(",") mgr_items.extend(names) placement = list(np.arange(len(names)) + offset) @@ -180,19 +206,21 @@ def create_mgr(descr, item_shape=None): blocks = [] num_offset = 0 for blockstr, placement in block_placements.items(): - typestr = blockstr.split('-')[0] - blocks.append(create_block(typestr, - placement, - item_shape=item_shape, - num_offset=num_offset, )) + typestr = blockstr.split("-")[0] + blocks.append( + create_block( + typestr, placement, item_shape=item_shape, num_offset=num_offset + ) + ) num_offset += len(placement) - return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]), - [mgr_items] + [np.arange(n) for n in item_shape]) + return BlockManager( + sorted(blocks, key=lambda b: b.mgr_locs[0]), + [mgr_items] + [np.arange(n) for n in item_shape], + ) class TestBlock: - def setup_method(self, method): # self.fblock = get_float_ex() # a,c,e # self.cblock = get_complex_ex() # @@ -200,14 +228,14 @@ def setup_method(self, method): # self.bool_block = get_bool_ex() # self.int_block = get_int_ex() - self.fblock = create_block('float', [0, 2, 4]) - self.cblock = create_block('complex', [7]) - 
self.oblock = create_block('object', [1, 3]) - self.bool_block = create_block('bool', [5]) - self.int_block = create_block('int', [6]) + self.fblock = create_block("float", [0, 2, 4]) + self.cblock = create_block("complex", [7]) + self.oblock = create_block("object", [1, 3]) + self.bool_block = create_block("bool", [5]) + self.int_block = create_block("int", [6]) def test_constructor(self): - int32block = create_block('i4', [0]) + int32block = create_block("i4", [0]) assert int32block.dtype == np.int32 def test_pickle(self): @@ -221,8 +249,9 @@ def _check(blk): def test_mgr_locs(self): assert isinstance(self.fblock.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array, - np.array([0, 2, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64) + ) def test_attrs(self): assert self.fblock.shape == self.fblock.values.shape @@ -233,13 +262,14 @@ def test_merge(self): avals = randn(2, 10) bvals = randn(2, 10) - ref_cols = Index(['e', 'a', 'b', 'd', 'f']) + ref_cols = Index(["e", "a", "b", "d", "f"]) - ablock = make_block(avals, ref_cols.get_indexer(['e', 'b'])) - bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd'])) + ablock = make_block(avals, ref_cols.get_indexer(["e", "b"])) + bblock = make_block(bvals, ref_cols.get_indexer(["a", "d"])) merged = ablock.merge(bblock) - tm.assert_numpy_array_equal(merged.mgr_locs.as_array, - np.array([0, 1, 2, 3], dtype=np.int64)) + tm.assert_numpy_array_equal( + merged.mgr_locs.as_array, np.array([0, 1, 2, 3], dtype=np.int64) + ) tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals)) tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals)) @@ -263,21 +293,24 @@ def test_delete(self): newb = self.fblock.copy() newb.delete(0) assert isinstance(newb.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([2, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64) + ) assert (newb.values[0] == 1).all() newb = self.fblock.copy() newb.delete(1) assert isinstance(newb.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([0, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64) + ) assert (newb.values[1] == 2).all() newb = self.fblock.copy() newb.delete(2) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([0, 2], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64) + ) assert (newb.values[1] == 1).all() newb = self.fblock.copy() @@ -286,50 +319,45 @@ def test_delete(self): def test_make_block_same_class(self): # issue 19431 - block = create_block('M8[ns, US/Eastern]', [3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - block.make_block_same_class(block.values, - dtype=block.values.dtype) + block = create_block("M8[ns, US/Eastern]", [3]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + block.make_block_same_class(block.values, dtype=block.values.dtype) class TestDatetimeBlock: - def test_try_coerce_arg(self): - block = create_block('datetime', [0]) + block = create_block("datetime", [0]) # coerce None none_coerced = block._try_coerce_args(None) assert pd.Timestamp(none_coerced) is pd.NaT # coerce different types of date bojects - vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10), - date(2010, 10, 10)) + vals = 
(np.datetime64("2010-10-10"), datetime(2010, 10, 10), date(2010, 10, 10)) for val in vals: coerced = block._try_coerce_args(val) assert np.int64 == type(coerced) - assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced) + assert pd.Timestamp("2010-10-10") == pd.Timestamp(coerced) class TestBlockManager: - def test_constructor_corner(self): pass def test_attrs(self): - mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2') + mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2") assert mgr.nblocks == 2 assert len(mgr) == 6 def test_is_mixed_dtype(self): - assert not create_mgr('a,b:f8').is_mixed_type - assert not create_mgr('a:f8-1; b:f8-2').is_mixed_type + assert not create_mgr("a,b:f8").is_mixed_type + assert not create_mgr("a:f8-1; b:f8-2").is_mixed_type - assert create_mgr('a,b:f8; c,d: f4').is_mixed_type - assert create_mgr('a,b:f8; c,d: object').is_mixed_type + assert create_mgr("a,b:f8; c,d: f4").is_mixed_type + assert create_mgr("a,b:f8; c,d: object").is_mixed_type def test_duplicate_ref_loc_failure(self): - tmp_mgr = create_mgr('a:bool; a: f8') + tmp_mgr = create_mgr("a:bool; a: f8") axes, blocks = tmp_mgr.axes, tmp_mgr.blocks @@ -346,8 +374,8 @@ def test_duplicate_ref_loc_failure(self): mgr.iget(1) def test_contains(self, mgr): - assert 'a' in mgr - assert 'baz' not in mgr + assert "a" in mgr + assert "baz" not in mgr def test_pickle(self, mgr): @@ -367,78 +395,77 @@ def test_pickle(self, mgr): def test_non_unique_pickle(self): - mgr = create_mgr('a,a,a:f8') + mgr = create_mgr("a,a,a:f8") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - mgr = create_mgr('a: f8; a: i8') + mgr = create_mgr("a: f8; a: i8") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) def test_categorical_block_pickle(self): - mgr = create_mgr('a: category') + mgr = create_mgr("a: category") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - smgr = create_single_mgr('category') + smgr = create_single_mgr("category") smgr2 = tm.round_trip_pickle(smgr) assert_series_equal(Series(smgr), Series(smgr2)) def test_get(self): - cols = Index(list('abc')) + cols = Index(list("abc")) values = np.random.rand(3, 3) block = make_block(values=values.copy(), placement=np.arange(3)) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) - assert_almost_equal(mgr.get('a', fastpath=False), values[0]) - assert_almost_equal(mgr.get('b', fastpath=False), values[1]) - assert_almost_equal(mgr.get('c', fastpath=False), values[2]) - assert_almost_equal(mgr.get('a').internal_values(), values[0]) - assert_almost_equal(mgr.get('b').internal_values(), values[1]) - assert_almost_equal(mgr.get('c').internal_values(), values[2]) + assert_almost_equal(mgr.get("a", fastpath=False), values[0]) + assert_almost_equal(mgr.get("b", fastpath=False), values[1]) + assert_almost_equal(mgr.get("c", fastpath=False), values[2]) + assert_almost_equal(mgr.get("a").internal_values(), values[0]) + assert_almost_equal(mgr.get("b").internal_values(), values[1]) + assert_almost_equal(mgr.get("c").internal_values(), values[2]) def test_set(self): - mgr = create_mgr('a,b,c: int', item_shape=(3, )) - - mgr.set('d', np.array(['foo'] * 3)) - mgr.set('b', np.array(['bar'] * 3)) - tm.assert_numpy_array_equal(mgr.get('a').internal_values(), - np.array([0] * 3)) - tm.assert_numpy_array_equal(mgr.get('b').internal_values(), - np.array(['bar'] * 3, dtype=np.object_)) - tm.assert_numpy_array_equal(mgr.get('c').internal_values(), - np.array([2] * 3)) - 
tm.assert_numpy_array_equal(mgr.get('d').internal_values(), - np.array(['foo'] * 3, dtype=np.object_)) + mgr = create_mgr("a,b,c: int", item_shape=(3,)) + + mgr.set("d", np.array(["foo"] * 3)) + mgr.set("b", np.array(["bar"] * 3)) + tm.assert_numpy_array_equal(mgr.get("a").internal_values(), np.array([0] * 3)) + tm.assert_numpy_array_equal( + mgr.get("b").internal_values(), np.array(["bar"] * 3, dtype=np.object_) + ) + tm.assert_numpy_array_equal(mgr.get("c").internal_values(), np.array([2] * 3)) + tm.assert_numpy_array_equal( + mgr.get("d").internal_values(), np.array(["foo"] * 3, dtype=np.object_) + ) def test_set_change_dtype(self, mgr): - mgr.set('baz', np.zeros(N, dtype=bool)) + mgr.set("baz", np.zeros(N, dtype=bool)) - mgr.set('baz', np.repeat('foo', N)) - assert mgr.get('baz').dtype == np.object_ + mgr.set("baz", np.repeat("foo", N)) + assert mgr.get("baz").dtype == np.object_ mgr2 = mgr.consolidate() - mgr2.set('baz', np.repeat('foo', N)) - assert mgr2.get('baz').dtype == np.object_ + mgr2.set("baz", np.repeat("foo", N)) + assert mgr2.get("baz").dtype == np.object_ - mgr2.set('quux', randn(N).astype(int)) - assert mgr2.get('quux').dtype == np.int_ + mgr2.set("quux", randn(N).astype(int)) + assert mgr2.get("quux").dtype == np.int_ - mgr2.set('quux', randn(N)) - assert mgr2.get('quux').dtype == np.float_ + mgr2.set("quux", randn(N)) + assert mgr2.get("quux").dtype == np.float_ def test_set_change_dtype_slice(self): # GH8850 - cols = MultiIndex.from_tuples([('1st', 'a'), ('2nd', 'b'), ('3rd', 'c') - ]) + cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")]) df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) - df['2nd'] = df['2nd'] * 2.0 + df["2nd"] = df["2nd"] * 2.0 blocks = df._to_dict_of_blocks() - assert sorted(blocks.keys()) == ['float64', 'int64'] - assert_frame_equal(blocks['float64'], DataFrame( - [[1.0, 4.0], [4.0, 10.0]], columns=cols[:2])) - assert_frame_equal(blocks['int64'], DataFrame( - [[3], [6]], columns=cols[2:])) + assert sorted(blocks.keys()) == ["float64", "int64"] + assert_frame_equal( + blocks["float64"], DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]) + ) + assert_frame_equal(blocks["int64"], DataFrame([[3], [6]], columns=cols[2:])) def test_copy(self, mgr): cp = mgr.copy(deep=False) @@ -464,71 +491,70 @@ def test_copy(self, mgr): assert cp_blk.values.base is None and blk.values.base is None def test_sparse(self): - mgr = create_mgr('a: sparse-1; b: sparse-2') + mgr = create_mgr("a: sparse-1; b: sparse-2") # what to test here? assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): - mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') + mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8") assert len(mgr.blocks) == 3 assert isinstance(mgr, BlockManager) # what to test here? 
def test_as_array_float(self): - mgr = create_mgr('c: f4; d: f2; e: f8') + mgr = create_mgr("c: f4; d: f2; e: f8") assert mgr.as_array().dtype == np.float64 - mgr = create_mgr('c: f4; d: f2') + mgr = create_mgr("c: f4; d: f2") assert mgr.as_array().dtype == np.float32 def test_as_array_int_bool(self): - mgr = create_mgr('a: bool-1; b: bool-2') + mgr = create_mgr("a: bool-1; b: bool-2") assert mgr.as_array().dtype == np.bool_ - mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') + mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") assert mgr.as_array().dtype == np.int64 - mgr = create_mgr('c: i4; d: i2; e: u1') + mgr = create_mgr("c: i4; d: i2; e: u1") assert mgr.as_array().dtype == np.int32 def test_as_array_datetime(self): - mgr = create_mgr('h: datetime-1; g: datetime-2') - assert mgr.as_array().dtype == 'M8[ns]' + mgr = create_mgr("h: datetime-1; g: datetime-2") + assert mgr.as_array().dtype == "M8[ns]" def test_as_array_datetime_tz(self): - mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') - assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]' - assert mgr.get('g').dtype == 'datetime64[ns, CET]' - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]") + assert mgr.get("h").dtype == "datetime64[ns, US/Eastern]" + assert mgr.get("g").dtype == "datetime64[ns, CET]" + assert mgr.as_array().dtype == "object" def test_astype(self): # coerce all - mgr = create_mgr('c: f4; d: f2; e: f8') - for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + mgr = create_mgr("c: f4; d: f2; e: f8") + for t in ["float16", "float32", "float64", "int32", "int64"]: t = np.dtype(t) tmgr = mgr.astype(t) - assert tmgr.get('c').dtype.type == t - assert tmgr.get('d').dtype.type == t - assert tmgr.get('e').dtype.type == t + assert tmgr.get("c").dtype.type == t + assert tmgr.get("d").dtype.type == t + assert tmgr.get("e").dtype.type == t # mixed - mgr = create_mgr('a,b: object; c: bool; d: datetime;' - 'e: f4; f: f2; g: f8') - for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + mgr = create_mgr("a,b: object; c: bool; d: datetime;" "e: f4; f: f2; g: f8") + for t in ["float16", "float32", "float64", "int32", "int64"]: t = np.dtype(t) - tmgr = mgr.astype(t, errors='ignore') - assert tmgr.get('c').dtype.type == t - assert tmgr.get('e').dtype.type == t - assert tmgr.get('f').dtype.type == t - assert tmgr.get('g').dtype.type == t - - assert tmgr.get('a').dtype.type == np.object_ - assert tmgr.get('b').dtype.type == np.object_ + tmgr = mgr.astype(t, errors="ignore") + assert tmgr.get("c").dtype.type == t + assert tmgr.get("e").dtype.type == t + assert tmgr.get("f").dtype.type == t + assert tmgr.get("g").dtype.type == t + + assert tmgr.get("a").dtype.type == np.object_ + assert tmgr.get("b").dtype.type == np.object_ if t != np.int64: - assert tmgr.get('d').dtype.type == np.datetime64 + assert tmgr.get("d").dtype.type == np.datetime64 else: - assert tmgr.get('d').dtype.type == t + assert tmgr.get("d").dtype.type == t def test_convert(self): def _compare(old_mgr, new_mgr): @@ -555,101 +581,101 @@ def _compare(old_mgr, new_mgr): assert found # noops - mgr = create_mgr('f: i8; g: f8') + mgr = create_mgr("f: i8; g: f8") new_mgr = mgr.convert() _compare(mgr, new_mgr) - mgr = create_mgr('a, b: object; f: i8; g: f8') + mgr = create_mgr("a, b: object; f: i8; g: f8") new_mgr = mgr.convert() _compare(mgr, new_mgr) # convert - mgr = create_mgr('a,b,foo: object; f: i8; g: f8') - mgr.set('a', np.array(['1'] * N, dtype=np.object_)) - mgr.set('b', 
np.array(['2.'] * N, dtype=np.object_)) - mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + mgr = create_mgr("a,b,foo: object; f: i8; g: f8") + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get('a').dtype == np.int64 - assert new_mgr.get('b').dtype == np.float64 - assert new_mgr.get('foo').dtype == np.object_ - assert new_mgr.get('f').dtype == np.int64 - assert new_mgr.get('g').dtype == np.float64 - - mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;' - 'i: i8; g: f8; h: f2') - mgr.set('a', np.array(['1'] * N, dtype=np.object_)) - mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) - mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int64 + assert new_mgr.get("g").dtype == np.float64 + + mgr = create_mgr( + "a,b,foo: object; f: i4; bool: bool; dt: datetime;" "i: i8; g: f8; h: f2" + ) + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get('a').dtype == np.int64 - assert new_mgr.get('b').dtype == np.float64 - assert new_mgr.get('foo').dtype == np.object_ - assert new_mgr.get('f').dtype == np.int32 - assert new_mgr.get('bool').dtype == np.bool_ - assert new_mgr.get('dt').dtype.type, np.datetime64 - assert new_mgr.get('i').dtype == np.int64 - assert new_mgr.get('g').dtype == np.float64 - assert new_mgr.get('h').dtype == np.float16 + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int32 + assert new_mgr.get("bool").dtype == np.bool_ + assert new_mgr.get("dt").dtype.type, np.datetime64 + assert new_mgr.get("i").dtype == np.int64 + assert new_mgr.get("g").dtype == np.float64 + assert new_mgr.get("h").dtype == np.float16 def test_interleave(self): # self - for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]', - 'm8[ns]']: - mgr = create_mgr('a: {0}'.format(dtype)) + for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: + mgr = create_mgr("a: {0}".format(dtype)) assert mgr.as_array().dtype == dtype - mgr = create_mgr('a: {0}; b: {0}'.format(dtype)) + mgr = create_mgr("a: {0}; b: {0}".format(dtype)) assert mgr.as_array().dtype == dtype # will be converted according the actual dtype of the underlying - mgr = create_mgr('a: category') - assert mgr.as_array().dtype == 'i8' - mgr = create_mgr('a: category; b: category') - assert mgr.as_array().dtype == 'i8' - mgr = create_mgr('a: category; b: category2') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: category2') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: category2; b: category2') - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("a: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2; b: category2") + assert 
mgr.as_array().dtype == "object" # combinations - mgr = create_mgr('a: f8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f8; b: i8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f4; b: i8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f4; b: i8; d: object') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: bool; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: complex') - assert mgr.as_array().dtype == 'complex' - mgr = create_mgr('a: f8; b: category') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: category') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: bool') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: m8[ns]; b: bool') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: m8[ns]; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: m8[ns]') - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("a: f8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f8; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8; d: object") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: bool; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: complex") + assert mgr.as_array().dtype == "complex" + mgr = create_mgr("a: f8; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: m8[ns]") + assert mgr.as_array().dtype == "object" def test_interleave_non_unique_cols(self): - df = DataFrame([ - [pd.Timestamp('20130101'), 3.5], - [pd.Timestamp('20130102'), 4.5]], - columns=['x', 'x'], - index=[1, 2]) + df = DataFrame( + [[pd.Timestamp("20130101"), 3.5], [pd.Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) df_unique = df.copy() - df_unique.columns = ['x', 'y'] + df_unique.columns = ["x", "y"] assert df_unique.values.shape == df.values.shape tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) @@ -658,11 +684,11 @@ def test_consolidate(self): pass def test_consolidate_ordering_issues(self, mgr): - mgr.set('f', randn(N)) - mgr.set('d', randn(N)) - mgr.set('b', randn(N)) - mgr.set('g', randn(N)) - mgr.set('h', randn(N)) + mgr.set("f", randn(N)) + mgr.set("d", randn(N)) + mgr.set("b", randn(N)) + mgr.set("g", randn(N)) + mgr.set("h", randn(N)) # we have datetime/tz blocks in mgr cons = mgr.consolidate() @@ -670,103 +696,122 @@ def test_consolidate_ordering_issues(self, mgr): cons = mgr.consolidate().get_numeric_data() assert cons.nblocks == 1 assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array, - np.arange(len(cons.items), dtype=np.int64)) + tm.assert_numpy_array_equal( + cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64) + ) def 
test_reindex_index(self): pass def test_reindex_items(self): # mgr is not consolidated, f8 & f8-2 blocks - mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;' - 'f: bool; g: f8-2') + mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8;" "f: bool; g: f8-2") - reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) + reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 2 - tm.assert_index_equal(reindexed.items, pd.Index(['g', 'c', 'a', 'd'])) + tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) assert_almost_equal( - mgr.get('g', fastpath=False), reindexed.get('g', fastpath=False)) + mgr.get("g", fastpath=False), reindexed.get("g", fastpath=False) + ) assert_almost_equal( - mgr.get('c', fastpath=False), reindexed.get('c', fastpath=False)) + mgr.get("c", fastpath=False), reindexed.get("c", fastpath=False) + ) assert_almost_equal( - mgr.get('a', fastpath=False), reindexed.get('a', fastpath=False)) + mgr.get("a", fastpath=False), reindexed.get("a", fastpath=False) + ) assert_almost_equal( - mgr.get('d', fastpath=False), reindexed.get('d', fastpath=False)) + mgr.get("d", fastpath=False), reindexed.get("d", fastpath=False) + ) assert_almost_equal( - mgr.get('g').internal_values(), - reindexed.get('g').internal_values()) + mgr.get("g").internal_values(), reindexed.get("g").internal_values() + ) assert_almost_equal( - mgr.get('c').internal_values(), - reindexed.get('c').internal_values()) + mgr.get("c").internal_values(), reindexed.get("c").internal_values() + ) assert_almost_equal( - mgr.get('a').internal_values(), - reindexed.get('a').internal_values()) + mgr.get("a").internal_values(), reindexed.get("a").internal_values() + ) assert_almost_equal( - mgr.get('d').internal_values(), - reindexed.get('d').internal_values()) + mgr.get("d").internal_values(), reindexed.get("d").internal_values() + ) def test_get_numeric_data(self): - mgr = create_mgr('int: int; float: float; complex: complex;' - 'str: object; bool: bool; obj: object; dt: datetime', - item_shape=(3, )) - mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([1, 2, 3], dtype=np.object_)) numeric = mgr.get_numeric_data() - tm.assert_index_equal(numeric.items, - pd.Index(['int', 'float', 'complex', 'bool'])) + tm.assert_index_equal( + numeric.items, pd.Index(["int", "float", "complex", "bool"]) + ) assert_almost_equal( - mgr.get('float', fastpath=False), numeric.get('float', - fastpath=False)) + mgr.get("float", fastpath=False), numeric.get("float", fastpath=False) + ) assert_almost_equal( - mgr.get('float').internal_values(), - numeric.get('float').internal_values()) + mgr.get("float").internal_values(), numeric.get("float").internal_values() + ) # Check sharing - numeric.set('float', np.array([100., 200., 300.])) + numeric.set("float", np.array([100.0, 200.0, 300.0])) assert_almost_equal( - mgr.get('float', fastpath=False), np.array([100., 200., 300.])) + mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) + ) assert_almost_equal( - mgr.get('float').internal_values(), np.array([100., 200., 300.])) + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal(numeric.items, - pd.Index(['int', 'float', 'complex', 'bool'])) - numeric2.set('float', np.array([1000., 2000., 3000.])) + tm.assert_index_equal( + numeric.items, 
pd.Index(["int", "float", "complex", "bool"]) + ) + numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) assert_almost_equal( - mgr.get('float', fastpath=False), np.array([100., 200., 300.])) + mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) + ) assert_almost_equal( - mgr.get('float').internal_values(), np.array([100., 200., 300.])) + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) def test_get_bool_data(self): - mgr = create_mgr('int: int; float: float; complex: complex;' - 'str: object; bool: bool; obj: object; dt: datetime', - item_shape=(3, )) - mgr.set('obj', np.array([True, False, True], dtype=np.object_)) + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([True, False, True], dtype=np.object_)) bools = mgr.get_bool_data() - tm.assert_index_equal(bools.items, pd.Index(['bool'])) - assert_almost_equal(mgr.get('bool', fastpath=False), - bools.get('bool', fastpath=False)) + tm.assert_index_equal(bools.items, pd.Index(["bool"])) + assert_almost_equal( + mgr.get("bool", fastpath=False), bools.get("bool", fastpath=False) + ) assert_almost_equal( - mgr.get('bool').internal_values(), - bools.get('bool').internal_values()) + mgr.get("bool").internal_values(), bools.get("bool").internal_values() + ) - bools.set('bool', np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), - np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), - np.array([True, False, True])) + bools.set("bool", np.array([True, False, True])) + tm.assert_numpy_array_equal( + mgr.get("bool", fastpath=False), np.array([True, False, True]) + ) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) # Check sharing bools2 = mgr.get_bool_data(copy=True) - bools2.set('bool', np.array([False, True, False])) - tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), - np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), - np.array([True, False, True])) + bools2.set("bool", np.array([False, True, False])) + tm.assert_numpy_array_equal( + mgr.get("bool", fastpath=False), np.array([True, False, True]) + ) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) def test_unicode_repr_doesnt_raise(self): - repr(create_mgr('b,\u05d0: object')) + repr(create_mgr("b,\u05d0: object")) def test_missing_unicode_key(self): df = DataFrame({"a": [1]}) @@ -777,11 +822,11 @@ def test_missing_unicode_key(self): def test_equals(self): # unique items - bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) - bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') + bm1 = create_mgr("a,a,a: i8-1; b,b,b: i8-2") bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) @@ -805,12 +850,12 @@ def test_equals_block_order_different_dtypes(self): assert bm_this.equals(bm) def test_single_mgr_ctor(self): - mgr = create_single_mgr('f8', num_rows=5) - assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.] 
+ mgr = create_single_mgr("f8", num_rows=5) + assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] - bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") for value in invalid_values: with pytest.raises(ValueError): @@ -827,20 +872,18 @@ class TestIndexing: # and are disabled. MANAGERS = [ - create_single_mgr('f8', N), - create_single_mgr('i8', N), - + create_single_mgr("f8", N), + create_single_mgr("i8", N), # 2-dim - create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)), - create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)), - create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)), - create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)), - + create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)), # 3-dim - create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)), - create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)), - create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)), - create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)), + create_mgr("a,b,c,d,e,f: f8", item_shape=(N, N)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N, N)), ] # MANAGERS = [MANAGERS[6]] @@ -854,12 +897,14 @@ def assert_slice_ok(mgr, axis, slobj): if isinstance(slobj, np.ndarray): ax = mgr.axes[axis] if len(ax) and len(slobj) and len(slobj) != len(ax): - slobj = np.concatenate([slobj, np.zeros( - len(ax) - len(slobj), dtype=bool)]) + slobj = np.concatenate( + [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] + ) sliced = mgr.get_slice(slobj, axis=axis) - mat_slobj = (slice(None), ) * axis + (slobj, ) - tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(), - check_dtype=False) + mat_slobj = (slice(None),) * axis + (slobj,) + tm.assert_numpy_array_equal( + mat[mat_slobj], sliced.as_array(), check_dtype=False + ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) for mgr in self.MANAGERS: @@ -872,22 +917,15 @@ def assert_slice_ok(mgr, axis, slobj): assert_slice_ok(mgr, ax, slice(3, 0, -2)) # boolean mask - assert_slice_ok( - mgr, ax, np.array([], dtype=np.bool_)) - assert_slice_ok( - mgr, ax, - np.ones(mgr.shape[ax], dtype=np.bool_)) - assert_slice_ok( - mgr, ax, - np.zeros(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) assert_slice_ok( - mgr, ax, - np.arange(mgr.shape[ax]) % 3 == 0) - assert_slice_ok( - mgr, ax, np.array( - [True, True, False], dtype=np.bool_)) + mgr, ax, np.array([True, True, False], dtype=np.bool_) + ) # fancy indexer assert_slice_ok(mgr, ax, []) @@ -901,10 +939,10 @@ def test_take(self): def assert_take_ok(mgr, axis, indexer): mat = mgr.as_array() taken = mgr.take(indexer, axis) - tm.assert_numpy_array_equal(np.take(mat, indexer, axis), - taken.as_array(), check_dtype=False) - tm.assert_index_equal(mgr.axes[axis].take(indexer), - taken.axes[axis]) + tm.assert_numpy_array_equal( + np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + ) + tm.assert_index_equal(mgr.axes[axis].take(indexer), 
taken.axes[axis]) for mgr in self.MANAGERS: for ax in range(mgr.ndim): @@ -922,97 +960,106 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): mat = mgr.as_array() indexer = mgr.axes[axis].get_indexer_for(new_labels) - reindexed = mgr.reindex_axis(new_labels, axis, - fill_value=fill_value) - tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis, - fill_value=fill_value), - reindexed.as_array(), - check_dtype=False) + reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) + tm.assert_numpy_array_equal( + algos.take_nd(mat, indexer, axis, fill_value=fill_value), + reindexed.as_array(), + check_dtype=False, + ) tm.assert_index_equal(reindexed.axes[axis], new_labels) for mgr in self.MANAGERS: for ax in range(mgr.ndim): - for fill_value in (None, np.nan, 100.): - assert_reindex_axis_is_ok( - mgr, ax, - pd.Index([]), fill_value) - assert_reindex_axis_is_ok( - mgr, ax, mgr.axes[ax], - fill_value) + for fill_value in (None, np.nan, 100.0): + assert_reindex_axis_is_ok(mgr, ax, pd.Index([]), fill_value) + assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][[0, 0, 0]], fill_value) + mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), fill_value) + mgr, ax, pd.Index(["foo", "bar", "baz"]), fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - fill_value) + mgr, ax, pd.Index(["foo", mgr.axes[ax][0], "baz"]), fill_value + ) if mgr.shape[ax] >= 3: assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][:-3], fill_value) + mgr, ax, mgr.axes[ax][:-3], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][-3::-1], fill_value) + mgr, ax, mgr.axes[ax][-3::-1], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value) + mgr, ax, mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value + ) def test_reindex_indexer(self): - - def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, - fill_value): + def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): mat = mgr.as_array() - reindexed_mat = algos.take_nd(mat, indexer, axis, - fill_value=fill_value) - reindexed = mgr.reindex_indexer(new_labels, indexer, axis, - fill_value=fill_value) - tm.assert_numpy_array_equal(reindexed_mat, - reindexed.as_array(), - check_dtype=False) + reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) + reindexed = mgr.reindex_indexer( + new_labels, indexer, axis, fill_value=fill_value + ) + tm.assert_numpy_array_equal( + reindexed_mat, reindexed.as_array(), check_dtype=False + ) tm.assert_index_equal(reindexed.axes[axis], new_labels) for mgr in self.MANAGERS: for ax in range(mgr.ndim): - for fill_value in (None, np.nan, 100.): + for fill_value in (None, np.nan, 100.0): + assert_reindex_indexer_is_ok(mgr, ax, pd.Index([]), [], fill_value) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index([]), [], fill_value) + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value + ) assert_reindex_indexer_is_ok( - mgr, ax, - mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value) + mgr, + ax, + pd.Index(["foo"] * mgr.shape[ax]), + np.arange(mgr.shape[ax]), + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo'] * mgr.shape[ax]), - np.arange(mgr.shape[ax]), fill_value) + mgr, + ax, + mgr.axes[ax][::-1], + np.arange(mgr.shape[ax]), + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - mgr.axes[ax][::-1], 
np.arange(mgr.shape[ax]), - fill_value) + mgr, + ax, + mgr.axes[ax], + np.arange(mgr.shape[ax])[::-1], + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax], - np.arange(mgr.shape[ax])[::-1], fill_value) + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 0, 0], fill_value) + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [-1, 0, -1], + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [-1, 0, -1], fill_value) - assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - [-1, -1, -1], fill_value) + mgr, + ax, + pd.Index(["foo", mgr.axes[ax][0], "baz"]), + [-1, -1, -1], + fill_value, + ) if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 1, 2], fill_value) + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [0, 1, 2], + fill_value, + ) # test_get_slice(slice_like, axis) # take(indexer, axis) @@ -1021,7 +1068,6 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, class TestBlockPlacement: - def test_slice_len(self): assert len(BlockPlacement(slice(0, 4))) == 4 assert len(BlockPlacement(slice(0, 4, 2))) == 2 @@ -1111,8 +1157,8 @@ def test_slice_iter(self): def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, asarray): tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, - np.asarray(asarray, dtype=np.int64)) + BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) + ) assert_as_array_equals(slice(0, 3), [0, 1, 2]) assert_as_array_equals(slice(0, 0), []) @@ -1188,45 +1234,56 @@ def any(self, axis=None): class TestCanHoldElement: - @pytest.mark.parametrize('value, dtype', [ - (1, 'i8'), - (1.0, 'f8'), - (2**63, 'f8'), - (1j, 'complex128'), - (2**63, 'complex128'), - (True, 'bool'), - (np.timedelta64(20, 'ns'), '= 1.11 otherwise, @@ -49,36 +49,34 @@ def s3_resource(tips_file, jsonl_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip('moto') + moto = pytest.importorskip("moto") test_s3_files = [ - ('tips#1.csv', tips_file), - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ('items.jsonl', jsonl_file), + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), ] def add_tips_files(bucket_name): for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) + with open(file_name, "rb") as f: + conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: s3 = moto.mock_s3() s3.start() # see gh-16135 - bucket = 'pandas-test' + bucket = "pandas-test" conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket=bucket) add_tips_files(bucket) - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') + conn.create_bucket(Bucket="cant_get_it", ACL="private") + add_tips_files("cant_get_it") yield conn finally: s3.stop() diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index dd96fb2366152c..54acd2128369d3 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -25,12 +25,11 @@ def df_ref(): """ Obtain the reference data from read_csv with the Python engine. 
""" - df_ref = read_csv('test1.csv', index_col=0, - parse_dates=True, engine='python') + df_ref = read_csv("test1.csv", index_col=0, parse_dates=True, engine="python") return df_ref -@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods']) +@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"]) def read_ext(request): """ Valid extensions for reading Excel files. diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 76b3fe19a0771c..76871eddf1cee2 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -12,15 +12,14 @@ @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): func = functools.partial(pd.read_excel, engine="odf") - monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.setattr(pd, "read_excel", func) monkeypatch.chdir(datapath("io", "data")) def test_read_invalid_types_raises(): # the invalid_value_type.ods required manually editing # of the included content.xml file - with pytest.raises(ValueError, - match="Unrecognized type awesome_new_type"): + with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"): pd.read_excel("invalid_value_type.ods") @@ -28,12 +27,12 @@ def test_read_writer_table(): # Also test reading tables from an text OpenDocument file # (.odt) index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") - expected = pd.DataFrame([ - [1, np.nan, 7], - [2, np.nan, 8], - [3, np.nan, 9], - ], index=index, columns=["Column 1", "Unnamed: 2", "Column 3"]) + expected = pd.DataFrame( + [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]], + index=index, + columns=["Column 1", "Unnamed: 2", "Column 3"], + ) - result = pd.read_excel("writertable.odt", 'Table1', index_col=0) + result = pd.read_excel("writertable.odt", "Table1", index_col=0) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 6815d2aa079f89..79fc87a62ad08c 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -7,83 +7,62 @@ openpyxl = pytest.importorskip("openpyxl") -pytestmark = pytest.mark.parametrize("ext", ['.xlsx']) +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) def test_to_excel_styleconverter(ext): from openpyxl import styles hstyle = { - "font": { - "color": '00FF0000', - "bold": True, - }, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": { - "horizontal": "center", - "vertical": "top", - }, - "fill": { - "patternType": 'solid', - 'fgColor': { - 'rgb': '006666FF', - 'tint': 0.3, - }, - }, - "number_format": { - "format_code": "0.00" - }, - "protection": { - "locked": True, - "hidden": False, - }, + "font": {"color": "00FF0000", "bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}}, + "number_format": {"format_code": "0.00"}, + "protection": {"locked": True, "hidden": False}, } - font_color = styles.Color('00FF0000') + font_color = styles.Color("00FF0000") font = styles.Font(bold=True, color=font_color) side = styles.Side(style=styles.borders.BORDER_THIN) border = styles.Border(top=side, right=side, bottom=side, left=side) - alignment = styles.Alignment(horizontal='center', vertical='top') - fill_color = styles.Color(rgb='006666FF', tint=0.3) - fill = styles.PatternFill(patternType='solid', fgColor=fill_color) + 
alignment = styles.Alignment(horizontal="center", vertical="top") + fill_color = styles.Color(rgb="006666FF", tint=0.3) + fill = styles.PatternFill(patternType="solid", fgColor=fill_color) - number_format = '0.00' + number_format = "0.00" protection = styles.Protection(locked=True, hidden=False) kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) - assert kw['font'] == font - assert kw['border'] == border - assert kw['alignment'] == alignment - assert kw['fill'] == fill - assert kw['number_format'] == number_format - assert kw['protection'] == protection + assert kw["font"] == font + assert kw["border"] == border + assert kw["alignment"] == alignment + assert kw["fill"] == fill + assert kw["number_format"] == number_format + assert kw["protection"] == protection def test_write_cells_merge_styled(ext): from pandas.io.formats.excel import ExcelCell - sheet_name = 'merge_styled' + sheet_name = "merge_styled" - sty_b1 = {'font': {'color': '00FF0000'}} - sty_a2 = {'font': {'color': '0000FF00'}} + sty_b1 = {"font": {"color": "00FF0000"}} + sty_a2 = {"font": {"color": "0000FF00"}} initial_cells = [ ExcelCell(col=1, row=0, val=42, style=sty_b1), ExcelCell(col=0, row=1, val=99, style=sty_a2), ] - sty_merged = {'font': {'color': '000000FF', 'bold': True}} + sty_merged = {"font": {"color": "000000FF", "bold": True}} sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) - openpyxl_sty_merged = sty_kwargs['font'] + openpyxl_sty_merged = sty_kwargs["font"] merge_cells = [ - ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), + ExcelCell( + col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged + ) ] with ensure_clean(ext) as path: @@ -92,27 +71,28 @@ def test_write_cells_merge_styled(ext): writer.write_cells(merge_cells, sheet_name=sheet_name) wks = writer.sheets[sheet_name] - xcell_b1 = wks['B1'] - xcell_a2 = wks['A2'] + xcell_b1 = wks["B1"] + xcell_a2 = wks["A2"] assert xcell_b1.font == openpyxl_sty_merged assert xcell_a2.font == openpyxl_sty_merged -@pytest.mark.parametrize("mode,expected", [ - ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) +@pytest.mark.parametrize( + "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] +) def test_write_append_mode(ext, mode, expected): - df = DataFrame([1], columns=['baz']) + df = DataFrame([1], columns=["baz"]) with ensure_clean(ext) as f: wb = openpyxl.Workbook() - wb.worksheets[0].title = 'foo' - wb.worksheets[0]['A1'].value = 'foo' - wb.create_sheet('bar') - wb.worksheets[1]['A1'].value = 'bar' + wb.worksheets[0].title = "foo" + wb.worksheets[0]["A1"].value = "foo" + wb.create_sheet("bar") + wb.worksheets[1]["A1"].value = "bar" wb.save(f) - writer = ExcelWriter(f, engine='openpyxl', mode=mode) - df.to_excel(writer, sheet_name='baz', index=False) + writer = ExcelWriter(f, engine="openpyxl", mode=mode) + df.to_excel(writer, sheet_name="baz", index=False) writer.save() wb2 = openpyxl.load_workbook(f) @@ -120,4 +100,4 @@ def test_write_append_mode(ext, mode, expected): assert result == expected for index, cell_value in enumerate(expected): - assert wb2.worksheets[index]['A1'].value == cell_value + assert wb2.worksheets[index]["A1"].value == cell_value diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index ae69c2302e60a7..cd8848828f6c4a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -25,30 +25,42 @@ def ignore_xlrd_time_clock_warning(): """ with warnings.catch_warnings(): warnings.filterwarnings( - 
action='ignore', - message='time.clock has been deprecated', - category=DeprecationWarning) + action="ignore", + message="time.clock has been deprecated", + category=DeprecationWarning, + ) yield -@pytest.fixture(params=[ - # Add any engines to test here - # When defusedxml is installed it triggers deprecation warnings for - # xlrd and openpyxl, so catch those here - pytest.param('xlrd', marks=[ - td.skip_if_no('xlrd'), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ]), - pytest.param('openpyxl', marks=[ - td.skip_if_no('openpyxl'), - pytest.mark.filterwarnings("ignore:.*html argument"), - ]), - pytest.param(None, marks=[ - td.skip_if_no('xlrd'), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ]), - pytest.param("odf", marks=td.skip_if_no("odf")), -]) +@pytest.fixture( + params=[ + # Add any engines to test here + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param( + "xlrd", + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param( + "openpyxl", + marks=[ + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), + ], + ), + pytest.param( + None, + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param("odf", marks=td.skip_if_no("odf")), + ] +) def engine(request): """ A fixture for Excel reader engines. @@ -57,41 +69,42 @@ def engine(request): class TestReaders: - @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for read_excel calls. """ - if engine == 'openpyxl' and read_ext == '.xls': + if engine == "openpyxl" and read_ext == ".xls": pytest.skip() - if engine == 'odf' and read_ext != '.ods': + if engine == "odf" and read_ext != ".ods": pytest.skip() if read_ext == ".ods" and engine != "odf": pytest.skip() func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data")) - monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.setattr(pd, "read_excel", func) def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, usecols=3) + df1 = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=3 + ) # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], - index_col=0, usecols=3) + df2 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 + ) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) @@ -99,11 +112,13 @@ def test_usecols_int(self, read_ext, df_ref): def test_usecols_list(self, read_ext, df_ref): - df_ref = df_ref.reindex(columns=['B', 'C']) - df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = pd.read_excel('test1' + 
read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) + df_ref = df_ref.reindex(columns=["B", "C"]) + df1 = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=[0, 2, 3] + ) + df2 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=[0, 2, 3] + ) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) @@ -111,50 +126,45 @@ def test_usecols_list(self, read_ext, df_ref): def test_usecols_str(self, read_ext, df_ref): - df1 = df_ref.reindex(columns=['A', 'B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A:D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') + df1 = df_ref.reindex(columns=["A", "B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A:D" + ) # TODO add index to xls, read xls ignores index name ? tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C,D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C,D" + ) # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C:D" + ) tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - @pytest.mark.parametrize("usecols", [ - [0, 1, 3], [0, 3, 1], - [1, 0, 3], [1, 3, 0], - [3, 0, 1], [3, 1, 0], - ]) - def test_usecols_diff_positional_int_columns_order( - self, read_ext, usecols, df_ref): + @pytest.mark.parametrize( + "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] + ) + def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): expected = df_ref[["A", "C"]] - result = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, usecols=usecols) + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=usecols + ) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.parametrize("usecols", [ - ["B", "D"], ["D", "B"] - ]) - def test_usecols_diff_positional_str_columns_order( - self, read_ext, usecols, df_ref): + @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) + def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): expected = df_ref[["B", "D"]] expected.index = range(len(expected)) @@ -168,8 +178,9 @@ def test_read_excel_without_slicing(self, read_ext, df_ref): def test_usecols_excel_range_str(self, read_ext, df_ref): expected = df_ref[["C", "D"]] - result = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, 
usecols="A,D:E") + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E" + ) tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str_invalid(self, read_ext): @@ -182,188 +193,215 @@ def test_index_col_label_error(self, read_ext): msg = "list indices must be integers.*, not str" with pytest.raises(TypeError, match=msg): - pd.read_excel("test1" + read_ext, "Sheet1", index_col=["A"], - usecols=["A", "C"]) + pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=["A"], usecols=["A", "C"] + ) def test_index_col_empty(self, read_ext): # see gh-9208 - result = pd.read_excel("test1" + read_ext, "Sheet3", - index_col=["A", "B", "C"]) - expected = DataFrame(columns=["D", "E", "F"], - index=MultiIndex(levels=[[]] * 3, - codes=[[]] * 3, - names=["A", "B", "C"])) + result = pd.read_excel("test1" + read_ext, "Sheet3", index_col=["A", "B", "C"]) + expected = DataFrame( + columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]), + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("index_col", [None, 2]) def test_index_col_with_unnamed(self, read_ext, index_col): # see gh-18792 - result = pd.read_excel( - "test1" + read_ext, "Sheet4", index_col=index_col) - expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], - columns=["Unnamed: 0", "col1", "col2"]) + result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col) + expected = DataFrame( + [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"] + ) if index_col: expected = expected.set_index(expected.columns[index_col]) tm.assert_frame_equal(result, expected) def test_usecols_pass_non_existent_column(self, read_ext): - msg = ("Usecols do not match columns, " - "columns expected but not found: " + r"\['E'\]") + msg = ( + "Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]" + ) with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, usecols=["E"]) def test_usecols_wrong_type(self, read_ext): - msg = ("'usecols' must either be list-like of " - "all strings, all unicode, all integers or a callable.") + msg = ( + "'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable." 
+ ) with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, usecols=["E1", 0]) def test_excel_stop_iterator(self, read_ext): - parsed = pd.read_excel('test2' + read_ext, 'Sheet1') - expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + parsed = pd.read_excel("test2" + read_ext, "Sheet1") + expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, read_ext): - parsed = pd.read_excel('test3' + read_ext, 'Sheet1') - expected = DataFrame([[np.nan]], columns=['Test']) + parsed = pd.read_excel("test3" + read_ext, "Sheet1") + expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) def test_excel_table(self, read_ext, df_ref): - df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0) - df2 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0) + df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) + df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0) # TODO add index to file tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = pd.read_excel( - 'test1' + read_ext, 'Sheet1', index_col=0, skipfooter=1) + df3 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, read_ext): - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, 4, 0]), - ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), - ("BoolCol", [True, False, True, True, False]), - ("StrCol", [1, 2, 3, 4, 5]), - # GH5394 - this is why convert_float isn't vectorized - ("Str2Col", ["a", 3, "c", "d", "e"]), - ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), - datetime(1905, 1, 1), datetime(2013, 12, 14), - datetime(2015, 3, 14)]) - ])) - basename = 'test_types' + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ( + "DateCol", + [ + datetime(2013, 10, 30), + datetime(2013, 10, 31), + datetime(1905, 1, 1), + datetime(2013, 12, 14), + datetime(2015, 3, 14), + ], + ), + ] + ) + ) + basename = "test_types" # should read in correctly and infer types - actual = pd.read_excel(basename + read_ext, 'Sheet1') + actual = pd.read_excel(basename + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = pd.read_excel( - basename + read_ext, 'Sheet1', convert_float=False) + actual = pd.read_excel(basename + read_ext, "Sheet1", convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = pd.read_excel( - basename + read_ext, 'Sheet1', index_col=icol) + actual = pd.read_excel(basename + read_ext, "Sheet1", index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = pd.read_excel(basename + read_ext, 
'Sheet1', - converters={"StrCol": str}) + actual = pd.read_excel( + basename + read_ext, "Sheet1", converters={"StrCol": str} + ) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) actual = pd.read_excel( - basename + read_ext, 'Sheet1', - convert_float=False, converters={"StrCol": str}) + basename + read_ext, + "Sheet1", + convert_float=False, + converters={"StrCol": str}, + ) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values def test_reader_converters(self, read_ext): - basename = 'test_converters' - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, -1000, 0]), - ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), - ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), - ("StrCol", ['1', np.nan, '3', '4', '5']), - ])) - - converters = {'IntCol': lambda x: int(x) if x != '' else -1000, - 'FloatCol': lambda x: 10 * x if x else np.nan, - 2: lambda x: 'Found' if x != '' else 'Not found', - 3: lambda x: str(x) if x else '', - } + basename = "test_converters" + + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ["Found", "Found", "Found", "Not found", "Found"]), + ("StrCol", ["1", np.nan, "3", "4", "5"]), + ] + ) + ) + + converters = { + "IntCol": lambda x: int(x) if x != "" else -1000, + "FloatCol": lambda x: 10 * x if x else np.nan, + 2: lambda x: "Found" if x != "" else "Not found", + 3: lambda x: str(x) if x else "", + } # should read in correctly and set types of single cells (not array # dtypes) - actual = pd.read_excel( - basename + read_ext, 'Sheet1', converters=converters) + actual = pd.read_excel(basename + read_ext, "Sheet1", converters=converters) tm.assert_frame_equal(actual, expected) def test_reader_dtype(self, read_ext): # GH 8212 - basename = 'testdtype' + basename = "testdtype" actual = pd.read_excel(basename + read_ext) - expected = DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [2.5, 3.5, 4.5, 5.5], - 'c': [1, 2, 3, 4], - 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( - columns=['a', 'b', 'c', 'd']) + expected = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ).reindex(columns=["a", "b", "c", "d"]) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(basename + read_ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str}) + actual = pd.read_excel( + basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str} + ) - expected['a'] = expected['a'].astype('float64') - expected['b'] = expected['b'].astype('float32') - expected['c'] = ['001', '002', '003', '004'] + expected["a"] = expected["a"].astype("float64") + expected["b"] = expected["b"].astype("float32") + expected["c"] = ["001", "002", "003", "004"] tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - pd.read_excel(basename + read_ext, dtype={'d': 'int64'}) - - @pytest.mark.parametrize("dtype,expected", [ - (None, - DataFrame({ - "a": [1, 2, 3, 4], - "b": [2.5, 3.5, 4.5, 5.5], - "c": [1, 2, 3, 4], - "d": [1.0, 2.0, np.nan, 4.0] - })), - ({"a": "float64", - "b": "float32", - "c": str, - "d": str - }, - DataFrame({ - "a": Series([1, 2, 3, 4], dtype="float64"), - "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": ["001", "002", "003", "004"], - "d": ["1", "2", np.nan, "4"] - })), - ]) + pd.read_excel(basename + read_ext, 
dtype={"d": "int64"}) + + @pytest.mark.parametrize( + "dtype,expected", + [ + ( + None, + DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ), + ), + ( + {"a": "float64", "b": "float32", "c": str, "d": str}, + DataFrame( + { + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"], + } + ), + ), + ], + ) def test_reader_dtype_str(self, read_ext, dtype, expected): # see gh-20377 basename = "testdtype" @@ -375,10 +413,10 @@ def test_reading_all_sheets(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. # See PR #9450 - basename = 'test_multisheet' + basename = "test_multisheet" dfs = pd.read_excel(basename + read_ext, sheet_name=None) # ensure this is not alphabetical to test order preservation - expected_keys = ['Charlie', 'Alpha', 'Beta'] + expected_keys = ["Charlie", "Alpha", "Beta"] tm.assert_contains_all(expected_keys, dfs.keys()) # Issue 9930 # Ensure sheet order is preserved @@ -390,9 +428,9 @@ def test_reading_multiple_specific_sheets(self, read_ext): # references (positions/names) are removed properly. # Ensure a dict is returned # See PR #9450 - basename = 'test_multisheet' + basename = "test_multisheet" # Explicitly request duplicates. Only the set should be returned. - expected_keys = [2, 'Charlie', 'Charlie'] + expected_keys = [2, "Charlie", "Charlie"] dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) expected_keys = list(set(expected_keys)) tm.assert_contains_all(expected_keys, dfs.keys()) @@ -402,81 +440,86 @@ def test_reading_all_sheets_with_blank(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # In the case where some sheets are blank. 
# Issue #11711 - basename = 'blank_with_header' + basename = "blank_with_header" dfs = pd.read_excel(basename + read_ext, sheet_name=None) - expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] + expected_keys = ["Sheet1", "Sheet2", "Sheet3"] tm.assert_contains_all(expected_keys, dfs.keys()) # GH6403 def test_read_excel_blank(self, read_ext): - actual = pd.read_excel('blank' + read_ext, 'Sheet1') + actual = pd.read_excel("blank" + read_ext, "Sheet1") tm.assert_frame_equal(actual, DataFrame()) def test_read_excel_blank_with_header(self, read_ext): - expected = DataFrame(columns=['col_1', 'col_2']) - actual = pd.read_excel('blank_with_header' + read_ext, 'Sheet1') + expected = DataFrame(columns=["col_1", "col_2"]) + actual = pd.read_excel("blank_with_header" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) def test_date_conversion_overflow(self, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], - [pd.Timestamp('2016-03-16'), 'Jack Black'], - [1e+20, 'Timothy Brown']], - columns=['DateColWithBigInt', 'StringCol']) - - if pd.read_excel.keywords['engine'] == 'openpyxl': + expected = pd.DataFrame( + [ + [pd.Timestamp("2016-03-12"), "Marc Johnson"], + [pd.Timestamp("2016-03-16"), "Jack Black"], + [1e20, "Timothy Brown"], + ], + columns=["DateColWithBigInt", "StringCol"], + ) + + if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") - result = pd.read_excel('testdateoverflow' + read_ext) + result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1" - df1 = pd.read_excel(filename + read_ext, - sheet_name=sheet_name, index_col=0) # doc + df1 = pd.read_excel( + filename + read_ext, sheet_name=sheet_name, index_col=0 + ) # doc with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel(filename + read_ext, index_col=0, - sheet_name=sheet_name) + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) def test_excel_read_buffer(self, read_ext): - pth = 'test1' + read_ext - expected = pd.read_excel(pth, 'Sheet1', index_col=0) - with open(pth, 'rb') as f: - actual = pd.read_excel(f, 'Sheet1', index_col=0) + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0) + with open(pth, "rb") as f: + actual = pd.read_excel(f, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_bad_engine_raises(self, read_ext): - bad_engine = 'foo' + bad_engine = "foo" with pytest.raises(ValueError, match="Unknown engine: foo"): - pd.read_excel('', engine=bad_engine) + pd.read_excel("", engine=bad_engine) @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == '.ods': # TODO: remove once on master + if read_ext == ".ods": # TODO: remove once on master pytest.skip() - url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + read_ext) + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/test1" + read_ext + ) url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + read_ext) + local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py - 
with open('test1' + read_ext, "rb") as f: - s3_resource.Bucket("pandas-test").put_object( - Key="test1" + read_ext, Body=f) + with open("test1" + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) - url = ('s3://pandas-test/test1' + read_ext) + url = "s3://pandas-test/test1" + read_ext url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + read_ext) + local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow @@ -485,16 +528,16 @@ def test_read_from_s3_url(self, read_ext, s3_resource): def test_read_from_file_url(self, read_ext, datapath): # FILE - localtable = os.path.join(datapath("io", "data"), 'test1' + read_ext) + localtable = os.path.join(datapath("io", "data"), "test1" + read_ext) local_table = pd.read_excel(localtable) try: - url_table = pd.read_excel('file://localhost/' + localtable) + url_table = pd.read_excel("file://localhost/" + localtable) except URLError: # fails on some systems import platform - pytest.skip("failing on %s" % - ' '.join(platform.uname()).strip()) + + pytest.skip("failing on %s" % " ".join(platform.uname()).strip()) tm.assert_frame_equal(url_table, local_table) @@ -503,47 +546,53 @@ def test_read_from_pathlib_path(self, read_ext): # GH12655 from pathlib import Path - str_path = 'test1' + read_ext - expected = pd.read_excel(str_path, 'Sheet1', index_col=0) + str_path = "test1" + read_ext + expected = pd.read_excel(str_path, "Sheet1", index_col=0) - path_obj = Path('test1' + read_ext) - actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = Path("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_read_from_py_localpath(self, read_ext): # GH12655 from py.path import local as LocalPath - str_path = os.path.join('test1' + read_ext) - expected = pd.read_excel(str_path, 'Sheet1', index_col=0) + str_path = os.path.join("test1" + read_ext) + expected = pd.read_excel(str_path, "Sheet1", index_col=0) - path_obj = LocalPath().join('test1' + read_ext) - actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = LocalPath().join("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_reader_seconds(self, read_ext): # Test reading times with and without milliseconds. GH5945. 
- expected = DataFrame.from_dict({"Time": [time(1, 2, 3), - time(2, 45, 56, 100000), - time(4, 29, 49, 200000), - time(6, 13, 42, 300000), - time(7, 57, 35, 400000), - time(9, 41, 28, 500000), - time(11, 25, 21, 600000), - time(13, 9, 14, 700000), - time(14, 53, 7, 800000), - time(16, 37, 0, 900000), - time(18, 20, 54)]}) - - actual = pd.read_excel('times_1900' + read_ext, 'Sheet1') + expected = DataFrame.from_dict( + { + "Time": [ + time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54), + ] + } + ) + + actual = pd.read_excel("times_1900" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('times_1904' + read_ext, 'Sheet1') + actual = pd.read_excel("times_1904" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, read_ext): @@ -552,14 +601,17 @@ def test_read_excel_multiindex(self, read_ext): mi_file = "testmultiindex" + read_ext # "mi_column" sheet - expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], - [2, 3.5, pd.Timestamp("2015-01-02"), False], - [3, 4.5, pd.Timestamp("2015-01-03"), False], - [4, 5.5, pd.Timestamp("2015-01-04"), True]], - columns=mi) - - actual = pd.read_excel( - mi_file, "mi_column", header=[0, 1], index_col=0) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + ) + + actual = pd.read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # "mi_index" sheet @@ -572,45 +624,40 @@ def test_read_excel_multiindex(self, read_ext): # "both" sheet expected.columns = mi - actual = pd.read_excel( - mi_file, "both", index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = pd.read_excel( - mi_file, "mi_index_name", index_col=[0, 1]) + actual = pd.read_excel(mi_file, "mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet expected.index = list(range(4)) expected.columns = mi.set_names(["c1", "c2"]) - actual = pd.read_excel(mi_file, "mi_column_name", - header=[0, 1], index_col=0) + actual = pd.read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # see gh-11317 # "name_with_int" sheet - expected.columns = mi.set_levels( - [1, 2], level=1).set_names(["c1", "c2"]) + expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) - actual = pd.read_excel(mi_file, "name_with_int", - index_col=0, header=[0, 1]) + actual = pd.read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_name" sheet expected.columns = mi.set_names(["c1", "c2"]) expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = pd.read_excel(mi_file, "both_name", - index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_skiprows" sheet - actual = pd.read_excel(mi_file, "both_name_skiprows", 
index_col=[0, 1], - header=[0, 1], skiprows=2) + actual = pd.read_excel( + mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2 + ) tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex_header_only(self, read_ext): @@ -631,21 +678,28 @@ def test_excel_old_index_format(self, read_ext): # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. - data = np.array([[None, None, None, None, None], - ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + data = np.array( + [ + [None, None, None, None, None], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", - "R_l0_g2", "R_l0_g3", "R_l0_g4"], - ["R1", "R_l1_g0", "R_l1_g1", - "R_l1_g2", "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], - names=[None, None]) - si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) + mi = MultiIndex( + levels=[ + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None], + ) + si = Index( + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None + ) expected = pd.DataFrame(data, index=si, columns=columns) @@ -659,20 +713,25 @@ def test_excel_old_index_format(self, read_ext): # The analogous versions of the "names" version data # where there are explicitly no names for the indices. 
- data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + data = np.array( + [ + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], - ["R_l1_g0", "R_l1_g1", "R_l1_g2", - "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=[None, None]) - si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) + mi = MultiIndex( + levels=[ + ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None], + ) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) @@ -688,154 +747,167 @@ def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - pd.read_excel('test1' + read_ext, header=arg) + pd.read_excel("test1" + read_ext, header=arg) def test_read_excel_chunksize(self, read_ext): # GH 8011 with pytest.raises(NotImplementedError): - pd.read_excel('test1' + read_ext, chunksize=100) + pd.read_excel("test1" + read_ext, chunksize=100) def test_read_excel_skiprows_list(self, read_ext): # GH 4903 - actual = pd.read_excel('testskiprows' + read_ext, - 'skiprows_list', skiprows=[0, 2]) - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns=['a', 'b', 'c', 'd']) + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=[0, 2] + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('testskiprows' + read_ext, - 'skiprows_list', skiprows=np.array([0, 2])) + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=np.array([0, 2]) + ) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 - actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) - expected = pd.read_excel('test1' + read_ext) + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + expected = pd.read_excel("test1" + read_ext) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): # GH 16645 - expected = pd.read_excel('test1' + read_ext) + expected = pd.read_excel("test1" + read_ext) num_records_in_file = len(expected) num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_non_integer_parameter(self, read_ext): # 
GH 16645 msg = "'nrows' must be an integer >=0" with pytest.raises(ValueError, match=msg): - pd.read_excel('test1' + read_ext, nrows='5') + pd.read_excel("test1" + read_ext, nrows="5") def test_read_excel_squeeze(self, read_ext): # GH 12157 - f = 'test_squeeze' + read_ext + f = "test_squeeze" + read_ext - actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) - expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') - expected.index.name = 'a' + actual = pd.read_excel(f, "two_columns", index_col=0, squeeze=True) + expected = pd.Series([2, 3, 4], [4, 5, 6], name="b") + expected.index.name = "a" tm.assert_series_equal(actual, expected) - actual = pd.read_excel(f, 'two_columns', squeeze=True) - expected = pd.DataFrame({'a': [4, 5, 6], - 'b': [2, 3, 4]}) + actual = pd.read_excel(f, "two_columns", squeeze=True) + expected = pd.DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(f, 'one_column', squeeze=True) - expected = pd.Series([1, 2, 3], name='a') + actual = pd.read_excel(f, "one_column", squeeze=True) + expected = pd.Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) class TestExcelFileRead: - @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. """ - if engine == 'odf' and read_ext != '.ods': + if engine == "odf" and read_ext != ".ods": pytest.skip() if read_ext == ".ods" and engine != "odf": pytest.skip() - if engine == 'openpyxl' and read_ext == '.xls': + if engine == "openpyxl" and read_ext == ".xls": pytest.skip() func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data")) - monkeypatch.setattr(pd, 'ExcelFile', func) + monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): - with pd.ExcelFile('test4' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - with pd.ExcelFile('test4' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) # 13967 - with pd.ExcelFile('test5' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - with pd.ExcelFile('test5' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], 
[np.nan], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) + @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"]) def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 - kwarg = {arg: 'Sheet1'} + kwarg = {arg: "Sheet1"} msg = "unexpected keyword argument `{}`".format(arg) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: with pytest.raises(TypeError, match=msg): pd.read_excel(excel, **kwarg) def test_excel_table_sheet_by_index(self, read_ext, df_ref): - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, 0, index_col=0) df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - raise_on_extra_warnings=False): - with pd.ExcelFile('test1' + read_ext) as excel: + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): + with pd.ExcelFile("test1" + read_ext) as excel: df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) tm.assert_frame_equal(df3, df4) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df3 = excel.parse(0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) @@ -848,27 +920,26 @@ def test_sheet_name(self, read_ext, df_ref): df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc with pd.ExcelFile(filename + read_ext) as excel: - df2_parse = excel.parse(index_col=0, - sheet_name=sheet_name) + df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_excel_read_buffer(self, engine, read_ext): - pth = 'test1' + read_ext - expected = pd.read_excel(pth, 'Sheet1', index_col=0, engine=engine) + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0, engine=engine) - with open(pth, 'rb') as f: + with open(pth, "rb") as f: with pd.ExcelFile(f) as xls: - actual = pd.read_excel(xls, 'Sheet1', index_col=0) + actual = pd.read_excel(xls, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, engine, read_ext): - f = open('test1' + read_ext, 'rb') + f = open("test1" + read_ext, "rb") with pd.ExcelFile(f) as xlsx: # parses okay - pd.read_excel(xlsx, 'Sheet1', index_col=0, engine=engine) + pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) assert f.closed @@ -878,4 +949,4 @@ def test_conflicting_excel_engines(self, 
read_ext): with pd.ExcelFile("test1" + read_ext) as xl: with pytest.raises(ValueError, match=msg): - pd.read_excel(xl, engine='foo') + pd.read_excel(xl, engine="foo") diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index d8971777f6eb47..76b27bce11b085 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -8,34 +8,48 @@ from pandas.io.formats.excel import ExcelFormatter -@pytest.mark.parametrize('engine', [ - pytest.param('xlwt', - marks=pytest.mark.xfail(reason='xlwt does not support ' - 'openpyxl-compatible ' - 'style dicts')), - 'xlsxwriter', - 'openpyxl', -]) +@pytest.mark.parametrize( + "engine", + [ + pytest.param( + "xlwt", + marks=pytest.mark.xfail( + reason="xlwt does not support " "openpyxl-compatible " "style dicts" + ), + ), + "xlsxwriter", + "openpyxl", + ], +) def test_styler_to_excel(engine): def style(df): # XXX: RGB colors not supported in xlwt - return DataFrame([['font-weight: bold', '', ''], - ['', 'color: blue', ''], - ['', '', 'text-decoration: underline'], - ['border-style: solid', '', ''], - ['', 'font-style: italic', ''], - ['', '', 'text-align: right'], - ['background-color: red', '', ''], - ['number-format: 0%', '', ''], - ['', '', ''], - ['', '', ''], - ['', '', '']], - index=df.index, columns=df.columns) + return DataFrame( + [ + ["font-weight: bold", "", ""], + ["", "color: blue", ""], + ["", "", "text-decoration: underline"], + ["border-style: solid", "", ""], + ["", "font-style: italic", ""], + ["", "", "text-align: right"], + ["background-color: red", "", ""], + ["number-format: 0%", "", ""], + ["", "", ""], + ["", "", ""], + ["", "", ""], + ], + index=df.index, + columns=df.columns, + ) def assert_equal_style(cell1, cell2, engine): - if engine in ['xlsxwriter', 'openpyxl']: - pytest.xfail(reason=("GH25351: failing on some attribute " - "comparisons in {}".format(engine))) + if engine in ["xlsxwriter", "openpyxl"]: + pytest.xfail( + reason=( + "GH25351: failing on some attribute " + "comparisons in {}".format(engine) + ) + ) # XXX: should find a better way to check equality assert cell1.alignment.__dict__ == cell2.alignment.__dict__ assert cell1.border.__dict__ == cell2.border.__dict__ @@ -46,36 +60,36 @@ def assert_equal_style(cell1, cell2, engine): def custom_converter(css): # use bold iff there is custom style attached to the cell - if css.strip(' \n;'): - return {'font': {'bold': True}} + if css.strip(" \n;"): + return {"font": {"bold": True}} return {} - pytest.importorskip('jinja2') + pytest.importorskip("jinja2") pytest.importorskip(engine) # Prepare spreadsheets df = DataFrame(np.random.randn(11, 3)) - with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path: + with ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path: writer = ExcelWriter(path, engine=engine) - df.to_excel(writer, sheet_name='frame') - df.style.to_excel(writer, sheet_name='unstyled') + df.to_excel(writer, sheet_name="frame") + df.style.to_excel(writer, sheet_name="unstyled") styled = df.style.apply(style, axis=None) - styled.to_excel(writer, sheet_name='styled') + styled.to_excel(writer, sheet_name="styled") ExcelFormatter(styled, style_converter=custom_converter).write( - writer, sheet_name='custom') + writer, sheet_name="custom" + ) writer.save() - if engine not in ('openpyxl', 'xlsxwriter'): + if engine not in ("openpyxl", "xlsxwriter"): # For other engines, we only smoke test return - openpyxl = pytest.importorskip('openpyxl') + openpyxl = pytest.importorskip("openpyxl") wb = 
openpyxl.load_workbook(path) # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['unstyled'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["unstyled"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): assert cell1.value == cell2.value @@ -88,47 +102,47 @@ def custom_converter(css): # (2) check styling with default converter # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF - alpha = '00' if engine == 'openpyxl' else 'FF' + alpha = "00" if engine == "openpyxl" else "FF" n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['styled'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["styled"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) + ref = "%s%d" % (cell2.column, cell2.row) # XXX: this isn't as strong a test as ideal; we should # confirm that differences are exclusive - if ref == 'B2': + if ref == "B2": assert not cell1.font.bold assert cell2.font.bold - elif ref == 'C3': + elif ref == "C3": assert cell1.font.color.rgb != cell2.font.color.rgb - assert cell2.font.color.rgb == alpha + '0000FF' - elif ref == 'D4': + assert cell2.font.color.rgb == alpha + "0000FF" + elif ref == "D4": assert cell1.font.underline != cell2.font.underline - assert cell2.font.underline == 'single' - elif ref == 'B5': + assert cell2.font.underline == "single" + elif ref == "B5": assert not cell1.border.left.style - assert (cell2.border.top.style == - cell2.border.right.style == - cell2.border.bottom.style == - cell2.border.left.style == - 'medium') - elif ref == 'C6': + assert ( + cell2.border.top.style + == cell2.border.right.style + == cell2.border.bottom.style + == cell2.border.left.style + == "medium" + ) + elif ref == "C6": assert not cell1.font.italic assert cell2.font.italic - elif ref == 'D7': - assert (cell1.alignment.horizontal != - cell2.alignment.horizontal) - assert cell2.alignment.horizontal == 'right' - elif ref == 'B8': + elif ref == "D7": + assert cell1.alignment.horizontal != cell2.alignment.horizontal + assert cell2.alignment.horizontal == "right" + elif ref == "B8": assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb assert cell1.fill.patternType != cell2.fill.patternType - assert cell2.fill.fgColor.rgb == alpha + 'FF0000' - assert cell2.fill.patternType == 'solid' - elif ref == 'B9': - assert cell1.number_format == 'General' - assert cell2.number_format == '0%' + assert cell2.fill.fgColor.rgb == alpha + "FF0000" + assert cell2.fill.patternType == "solid" + elif ref == "B9": + assert cell1.number_format == "General" + assert cell2.number_format == "0%" else: assert_equal_style(cell1, cell2, engine) @@ -139,12 +153,11 @@ def custom_converter(css): # (3) check styling with custom converter n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['custom'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["custom"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) - if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'): + ref = "%s%d" % (cell2.column, cell2.row) + if ref in ("B2", "C3", "D4", "B5", "C6", "D7", "B8", "B9"): assert not cell1.font.bold assert cell2.font.bold else: diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index d65bebe16804cf..cf26b20e5d0042 100644 --- a/pandas/tests/io/excel/test_writers.py +++ 
b/pandas/tests/io/excel/test_writers.py @@ -16,29 +16,28 @@ from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf from pandas.io.excel import ( - ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, - register_writer) + ExcelFile, + ExcelWriter, + _OpenpyxlWriter, + _XlsxWriter, + _XlwtWriter, + register_writer, +) -@td.skip_if_no('xlrd') -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +@td.skip_if_no("xlrd") +@pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) class TestRoundTrip: - @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], + ) def test_read_one_empty_col_no_header(self, ext, header, expected): # xref gh-12292 filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with ensure_clean(ext) as path: df.to_excel(path, filename, index=False, header=False) @@ -48,46 +47,41 @@ def test_read_one_empty_col_no_header(self, ext, header, expected): @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], + ) def test_read_one_empty_col_with_header(self, ext, header, expected): filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) + df.to_excel(path, "with_header", index=False, header=True) result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("openpyxl") + @td.skip_if_no("xlwt") def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) + refdf = pd.DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) with ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) + refdf.to_excel(writer, "Data_no_head", header=False, index=False) + refdf.to_excel(writer, "Data_with_head", index=False) - refdf.columns = ['A', 'B'] + refdf.columns = ["A", "B"] with ExcelFile(pth) as reader: - xlsdf_no_head = pd.read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) + xlsdf_no_head = pd.read_excel( + reader, "Data_no_head", header=None, names=["A", "B"] + ) xlsdf_with_head = pd.read_excel( - reader, 'Data_with_head', index_col=None, names=['A', 'B']) + reader, "Data_with_head", index_col=None, names=["A", "B"] + ) tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) @@ -122,37 +116,45 @@ def tdf(col_sheet_name): def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with ensure_clean(ext) as path: - df = DataFrame({ - ("One", 
"x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) + df = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0}, + } + ) + + expected = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0}, + } + ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) + df = pd.DataFrame( + { + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + expected = pd.DataFrame( + { + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) @@ -163,37 +165,47 @@ def test_read_excel_multiindex_empty_level(self, ext): @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels): + def test_excel_multindex_roundtrip( + self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + ): # see gh-4679 with ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") + pytest.skip( + "Column index name cannot be " "serialized unless it's a MultiIndex" + ) # Empty name case current read in as # unnamed levels, not Nones. 
check_names = r_idx_names or r_idx_levels <= 1 - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) + df = mkdf(5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels) df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) @td.skip_if_no("xlwt") @@ -201,8 +213,8 @@ def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) + {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} + ) df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") @@ -216,13 +228,13 @@ def test_read_excel_parse_dates(self, ext): tm.assert_frame_equal(df, res) date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = pd.read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) + res = pd.read_excel( + pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 + ) tm.assert_frame_equal(df, res) class _WriterBase: - @pytest.fixture(autouse=True) def set_engine_and_path(self, engine, ext): """Fixture to set engine and open file for use in each test case @@ -241,7 +253,7 @@ def set_engine_and_path(self, engine, ext): class and any subclasses, on account of the `autouse=True` argument """ - option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) + option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) prev_engine = get_option(option_name) set_option(option_name, engine) with ensure_clean(ext) as path: @@ -250,21 +262,24 @@ class and any subclasses, on account of the `autouse=True` set_option(option_name, prev_engine) # Roll back option change -@td.skip_if_no('xlrd') -@pytest.mark.parametrize("engine,ext", [ - pytest.param('openpyxl', '.xlsx', marks=td.skip_if_no('openpyxl')), - pytest.param('openpyxl', '.xlsm', marks=td.skip_if_no('openpyxl')), - pytest.param('xlwt', '.xls', marks=td.skip_if_no('xlwt')), - pytest.param('xlsxwriter', '.xlsx', marks=td.skip_if_no('xlsxwriter')) -]) +@td.skip_if_no("xlrd") +@pytest.mark.parametrize( + "engine,ext", + [ + pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), + pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), + pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), + ], +) class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. 
def test_excel_sheet_size(self, engine, ext): # GH 26080 - breaking_row_count = 2**20 + 1 - breaking_col_count = 2**14 + 1 + breaking_row_count = 2 ** 20 + 1 + breaking_col_count = 2 ** 14 + 1 # purposely using two arrays to prevent memory issues while testing row_arr = np.zeros(shape=(breaking_row_count, 1)) col_arr = np.zeros(shape=(1, breaking_col_count)) @@ -308,45 +323,42 @@ def test_excel_writer_context_manager(self, frame, engine, ext): def test_roundtrip(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # test roundtrip - frame.to_excel(self.path, 'test1') - recons = pd.read_excel(self.path, 'test1', index_col=0) + frame.to_excel(self.path, "test1") + recons = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', index=False) - recons = pd.read_excel(self.path, 'test1', index_col=None) + frame.to_excel(self.path, "test1", index=False) + recons = pd.read_excel(self.path, "test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', na_rep='NA') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=['NA']) + frame.to_excel(self.path, "test1", na_rep="NA") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(self.path, 'test1', na_rep='88') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=['88']) + frame.to_excel(self.path, "test1", na_rep="88") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', na_rep='88') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=[88, 88.0]) + frame.to_excel(self.path, "test1", na_rep="88") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(self.path, 'Sheet1') + frame.to_excel(self.path, "Sheet1") recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, '0') + frame.to_excel(self.path, "0") recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(frame, recons) @@ -358,11 +370,11 @@ def test_roundtrip(self, engine, ext, frame): def test_mixed(self, engine, ext, frame): mixed_frame = frame.copy() - mixed_frame['foo'] = 'bar' + mixed_frame["foo"] = "bar" - mixed_frame.to_excel(self.path, 'test1') + mixed_frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) def test_ts_frame(self, tsframe, engine, ext): @@ -376,19 +388,17 @@ def test_ts_frame(self, tsframe, engine, ext): def test_basics_with_nan(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', 
index=False) - - @pytest.mark.parametrize("np_type", [ - np.int8, np.int16, np.int32, np.int64]) + frame["A"][:5] = nan + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) + + @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, engine, ext, np_type): # Test np.int values read come back as int # (rather than float which is Excel's format). - df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) + df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -402,14 +412,12 @@ def test_int_types(self, engine, ext, np_type): # Test with convert_float=False comes back as float. float_frame = df.astype(float) - recons = pd.read_excel(self.path, "test1", - convert_float=False, index_col=0) - tm.assert_frame_equal(recons, float_frame, - check_index_type=False, - check_column_type=False) - - @pytest.mark.parametrize("np_type", [ - np.float16, np.float32, np.float64]) + recons = pd.read_excel(self.path, "test1", convert_float=False, index_col=0) + tm.assert_frame_equal( + recons, float_frame, check_index_type=False, check_column_type=False + ) + + @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, engine, ext, np_type): # Test np.float values read come back as float. df = DataFrame(np.random.random_sample(10), dtype=np_type) @@ -423,7 +431,7 @@ def test_float_types(self, engine, ext, np_type): @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, engine, ext, np_type): # Test np.bool values read come back as float. 
- df = (DataFrame([1, 0, True, False], dtype=np_type)) + df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -442,102 +450,99 @@ def test_inf_roundtrip(self, engine, ext): def test_sheets(self, engine, ext, frame, tsframe): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # Test writing to separate sheets writer = ExcelWriter(self.path) - frame.to_excel(writer, 'test1') - tsframe.to_excel(writer, 'test2') + frame.to_excel(writer, "test1") + tsframe.to_excel(writer, "test2") writer.save() reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(frame, recons) - recons = pd.read_excel(reader, 'test2', index_col=0) + recons = pd.read_excel(reader, "test2", index_col=0) tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) - assert 'test1' == reader.sheet_names[0] - assert 'test2' == reader.sheet_names[1] + assert "test1" == reader.sheet_names[0] + assert "test2" == reader.sheet_names[1] def test_colaliases(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) - frame.to_excel(self.path, 'test1', header=col_aliases) + col_aliases = Index(["AA", "X", "Y", "Z"]) + frame.to_excel(self.path, "test1", header=col_aliases) reader = ExcelFile(self.path) - rs = pd.read_excel(reader, 'test1', index_col=0) + rs = pd.read_excel(reader, "test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # test index_label - df = (DataFrame(np.random.randn(10, 2)) >= 0) - df.to_excel(self.path, 'test1', - index_label=['test'], - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(self.path, "test1", index_label=["test"], merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] assert df.index.names == recons.index.names - df = (DataFrame(np.random.randn(10, 2)) >= 
0) - df.to_excel(self.path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel( + self.path, + "test1", + index_label=["test", "dummy", "dummy2"], + merge_cells=merge_cells, + ) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] assert df.index.names == recons.index.names - df = (DataFrame(np.random.randn(10, 2)) >= 0) - df.to_excel(self.path, - 'test1', - index_label='test', - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(self.path, "test1", index_label="test", merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] tm.assert_frame_equal(df, recons.astype(bool)) - frame.to_excel(self.path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=merge_cells) + frame.to_excel( + self.path, + "test1", + columns=["A", "B", "C", "D"], + index=False, + merge_cells=merge_cells, + ) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') df = frame.copy() - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons, check_less_precise=True) def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): df = DataFrame(np.random.randn(10, 4)) - df.index.name = 'foo' + df.index.name = "foo" df.to_excel(self.path, merge_cells=merge_cells) @@ -545,7 +550,7 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) - assert result.index.name == 'foo' + assert result.index.name == "foo" def test_excel_roundtrip_datetime(self, merge_cells, tsframe, engine, ext): # datetime.date, not sure what to test here exactly @@ -563,22 +568,30 @@ def test_excel_date_datetime_format(self, engine, ext): # see gh-4133 # # Excel output format strings - df = DataFrame([[date(2014, 1, 31), - date(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) - df_expected = DataFrame([[datetime(2014, 1, 31), - datetime(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) + df = DataFrame( + [ + [date(2014, 1, 31), date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + df_expected = DataFrame( + [ + [datetime(2014, 1, 31), datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) with ensure_clean(ext) as filename2: writer1 = ExcelWriter(self.path) - writer2 = ExcelWriter(filename2, - date_format="DD.MM.YYYY", - datetime_format="DD.MM.YYYY HH-MM-SS") + writer2 = ExcelWriter( + filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS", + ) df.to_excel(writer1, "test1") df.to_excel(writer2, "test1") @@ -602,8 +615,7 @@ def 
test_to_excel_interval_no_labels(self, engine, ext): # see gh-19242 # # Test writing Interval without labels. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() df["new"] = pd.cut(df[0], 10) @@ -619,11 +631,11 @@ def test_to_excel_interval_labels(self, engine, ext): # see gh-19242 # # Test writing Interval with labels. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() - intervals = pd.cut(df[0], 10, labels=["A", "B", "C", "D", "E", - "F", "G", "H", "I", "J"]) + intervals = pd.cut( + df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] + ) df["new"] = intervals expected["new"] = pd.Series(list(intervals)) @@ -637,13 +649,15 @@ def test_to_excel_timedelta(self, engine, ext): # see gh-19242, gh-9155 # # Test writing timedelta to xls. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=["A"], dtype=np.int64) + df = DataFrame( + np.random.randint(-10, 10, size=(20, 1)), columns=["A"], dtype=np.int64 + ) expected = df.copy() df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( - lambda x: timedelta(seconds=x).total_seconds() / float(86400)) + lambda x: timedelta(seconds=x).total_seconds() / float(86400) + ) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -652,35 +666,34 @@ def test_to_excel_timedelta(self, engine, ext): tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, engine, ext, tsframe): - xp = tsframe.resample('M', kind='period').mean() + xp = tsframe.resample("M", kind="period").mean() - xp.to_excel(self.path, 'sht1') + xp.to_excel(self.path, "sht1") reader = ExcelFile(self.path) - rs = pd.read_excel(reader, 'sht1', index_col=0) - tm.assert_frame_equal(xp, rs.to_period('M')) + rs = pd.read_excel(reader, "sht1", index_col=0) + tm.assert_frame_equal(xp, rs.to_period("M")) def test_to_excel_multiindex(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", columns=["A", "B"]) # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + frame.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - df = pd.read_excel(reader, 'test1', index_col=[0, 1]) + df = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): - df = pd.DataFrame({'A': [None, 2, 3], - 'B': [10, 20, 30], - 'C': np.random.sample(3)}) - df = df.set_index(['A', 'B']) + df = pd.DataFrame( + {"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)} + ) + df = df.set_index(["A", "B"]) df.to_excel(self.path, merge_cells=merge_cells) df1 = pd.read_excel(self.path, index_col=[0, 1]) @@ -691,46 +704,42 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): # merge_cells def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - 
new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), - (50, 1), (50, 2)]) + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) frame.columns = new_cols_index header = [0, 1] if not merge_cells: header = 0 # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + frame.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - df = pd.read_excel(reader, 'test1', header=header, index_col=[0, 1]) + df = pd.read_excel(reader, "test1", header=header, index_col=[0, 1]) if not merge_cells: - fm = frame.columns.format(sparsify=False, - adjoin=False, names=False) + fm = frame.columns.format(sparsify=False, adjoin=False, names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates( - self, merge_cells, engine, ext, tsframe): + def test_to_excel_multiindex_dates(self, merge_cells, engine, ext, tsframe): # try multiindex with dates new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) - tsframe.index.names = ['time', 'foo'] - tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) + tsframe.index.names = ["time", "foo"] + tsframe.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) - assert recons.index.names == ('time', 'foo') + assert recons.index.names == ("time", "foo") def test_to_excel_multiindex_no_write_index(self, engine, ext): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. - frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) + frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) # Add a MI. frame2 = frame1.copy() @@ -738,40 +747,44 @@ def test_to_excel_multiindex_no_write_index(self, engine, ext): frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(self.path, 'test1', index=False) + frame2.to_excel(self.path, "test1", index=False) # Read it back in. reader = ExcelFile(self.path) - frame3 = pd.read_excel(reader, 'test1') + frame3 = pd.read_excel(reader, "test1") # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) def test_to_excel_float_format(self, engine, ext): - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) df.to_excel(self.path, "test1", float_format="%.2f") reader = ExcelFile(self.path) result = pd.read_excel(reader, "test1", index_col=0) - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, engine, ext): # Avoid mixed inferred_type. 
- df = DataFrame([["\u0192", "\u0193", "\u0194"], - ["\u0195", "\u0196", "\u0197"]], - index=["A\u0192", "B"], - columns=["X\u0193", "Y", "Z"]) + df = DataFrame( + [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], + index=["A\u0192", "B"], + columns=["X\u0193", "Y", "Z"], + ) with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") - result = pd.read_excel(filename, "TestSheet", - encoding="utf8", index_col=0) + result = pd.read_excel(filename, "TestSheet", encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, engine, ext): @@ -783,17 +796,21 @@ def test_to_excel_unicode_filename(self, engine, ext): else: f.close() - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) result = pd.read_excel(reader, "test1", index_col=0) - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) tm.assert_frame_equal(result, expected) # def test_to_excel_header_styling_xls(self, engine, ext): @@ -899,12 +916,13 @@ def test_to_excel_unicode_filename(self, engine, ext): @pytest.mark.parametrize("use_headers", [True, False]) @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) - def test_excel_010_hemstring(self, merge_cells, engine, ext, - c_idx_nlevels, r_idx_nlevels, use_headers): - + def test_excel_010_hemstring( + self, merge_cells, engine, ext, c_idx_nlevels, r_idx_nlevels, use_headers + ): def roundtrip(data, header=True, parser_hdr=0, index=True): - data.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) + data.to_excel( + self.path, header=header, merge_cells=merge_cells, index=index + ) xf = ExcelFile(self.path) return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) @@ -921,11 +939,13 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): ncols = 3 from pandas.util.testing import makeCustomDataframe as mkdf + # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 - df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, - c_idx_nlevels=c_idx_nlevels) + df = mkdf( + nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels + ) # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. @@ -948,29 +968,28 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): def test_duplicated_columns(self, engine, ext): # see gh-5235 - df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B"]) + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) df.to_excel(self.path, "test1") - expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B.1"]) + expected = DataFrame( + [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] + ) # By default, we mangle. result = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. 
- result = pd.read_excel(self.path, "test1", index_col=0, - mangle_dupe_cols=True) + result = pd.read_excel(self.path, "test1", index_col=0, mangle_dupe_cols=True) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A", "B"]) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(self.path, "test1") result = pd.read_excel(self.path, "test1", index_col=0) - expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A.1", "B.1"]) + expected = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] + ) tm.assert_frame_equal(result, expected) # see gh-10982 @@ -982,27 +1001,23 @@ def test_duplicated_columns(self, engine, ext): msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): - pd.read_excel( - self.path, "test1", header=None, mangle_dupe_cols=False) + pd.read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, engine, ext): # Test for issue #5427. - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) - write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + write_frame.to_excel(self.path, "test1", columns=["B", "A"]) - read_frame = pd.read_excel(self.path, 'test1', header=0) + read_frame = pd.read_excel(self.path, "test1", header=0) - tm.assert_series_equal(write_frame['A'], read_frame['A']) - tm.assert_series_equal(write_frame['B'], read_frame['B']) + tm.assert_series_equal(write_frame["A"], read_frame["A"]) + tm.assert_series_equal(write_frame["B"], read_frame["B"]) def test_invalid_columns(self, engine, ext): # see gh-10982 - write_frame = DataFrame({"A": [1, 1, 1], - "B": [2, 2, 2]}) + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): write_frame.to_excel(self.path, "test1", columns=["B", "C"]) expected = write_frame.reindex(columns=["B", "C"]) @@ -1018,8 +1033,7 @@ def test_comment_arg(self, engine, ext): # Test the comment argument functionality to pd.read_excel. # Create file to read in. - df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(self.path, "test_c") # Read file without comment arg. @@ -1037,13 +1051,12 @@ def test_comment_default(self, engine, ext): # Test the comment argument default to pd.read_excel # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") # Read file with default and explicit comment=None - result1 = pd.read_excel(self.path, 'test_c') - result2 = pd.read_excel(self.path, 'test_c', comment=None) + result1 = pd.read_excel(self.path, "test_c") + result2 = pd.read_excel(self.path, "test_c", comment=None) tm.assert_frame_equal(result1, result2) def test_comment_used(self, engine, ext): @@ -1052,13 +1065,11 @@ def test_comment_used(self, engine, ext): # Test the comment argument is working as expected when used. # Create file to read in. 
- df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(self.path, "test_c") # Test read_frame_comment against manually produced expected output. - expected = DataFrame({"A": ["one", None, "one"], - "B": ["two", None, None]}) + expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) result = pd.read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) @@ -1066,34 +1077,36 @@ def test_comment_empty_line(self, engine, ext): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file - df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) df.to_excel(self.path, index=False) # Test that all-comment lines at EoF are ignored - expected = DataFrame({'a': [1], 'b': [2]}) - result = pd.read_excel(self.path, comment='#') + expected = DataFrame({"a": [1], "b": [2]}) + result = pd.read_excel(self.path, comment="#") tm.assert_frame_equal(result, expected) def test_datetimes(self, engine, ext): # Test writing and reading datetimes. For issue #9139. (xref #9185) - datetimes = [datetime(2013, 1, 13, 1, 2, 3), - datetime(2013, 1, 13, 2, 45, 56), - datetime(2013, 1, 13, 4, 29, 49), - datetime(2013, 1, 13, 6, 13, 42), - datetime(2013, 1, 13, 7, 57, 35), - datetime(2013, 1, 13, 9, 41, 28), - datetime(2013, 1, 13, 11, 25, 21), - datetime(2013, 1, 13, 13, 9, 14), - datetime(2013, 1, 13, 14, 53, 7), - datetime(2013, 1, 13, 16, 37, 0), - datetime(2013, 1, 13, 18, 20, 52)] - - write_frame = DataFrame({'A': datetimes}) - write_frame.to_excel(self.path, 'Sheet1') - read_frame = pd.read_excel(self.path, 'Sheet1', header=0) - - tm.assert_series_equal(write_frame['A'], read_frame['A']) + datetimes = [ + datetime(2013, 1, 13, 1, 2, 3), + datetime(2013, 1, 13, 2, 45, 56), + datetime(2013, 1, 13, 4, 29, 49), + datetime(2013, 1, 13, 6, 13, 42), + datetime(2013, 1, 13, 7, 57, 35), + datetime(2013, 1, 13, 9, 41, 28), + datetime(2013, 1, 13, 11, 25, 21), + datetime(2013, 1, 13, 13, 9, 14), + datetime(2013, 1, 13, 14, 53, 7), + datetime(2013, 1, 13, 16, 37, 0), + datetime(2013, 1, 13, 18, 20, 52), + ] + + write_frame = DataFrame({"A": datetimes}) + write_frame.to_excel(self.path, "Sheet1") + read_frame = pd.read_excel(self.path, "Sheet1", header=0) + + tm.assert_series_equal(write_frame["A"], read_frame["A"]) def test_bytes_io(self, engine, ext): # see gh-7074 @@ -1111,9 +1124,13 @@ def test_bytes_io(self, engine, ext): def test_write_lists_dict(self, engine, ext): # see gh-8188. 
- df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], - "numeric": [1, 2, 3.0], - "str": ["apple", "banana", "cherry"]}) + df = DataFrame( + { + "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"], + } + ) df.to_excel(self.path, "Sheet1") read = pd.read_excel(self.path, "Sheet1", header=0, index_col=0) @@ -1129,8 +1146,9 @@ def test_true_and_false_value_options(self, engine, ext): expected = df.replace({"foo": True, "bar": False}) df.to_excel(self.path) - read_frame = pd.read_excel(self.path, true_values=["foo"], - false_values=["bar"], index_col=0) + read_frame = pd.read_excel( + self.path, true_values=["foo"], false_values=["bar"], index_col=0 + ) tm.assert_frame_equal(read_frame, expected) def test_freeze_panes(self, engine, ext): @@ -1146,8 +1164,7 @@ def test_path_path_lib(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): @@ -1155,32 +1172,36 @@ def test_path_local_path(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, engine, merge_cells, ext): # see GH-27006 - mi = MultiIndex.from_tuples([(pd.Period('2018'), pd.Period('2018Q1')), - (pd.Period('2018'), pd.Period('2018Q2'))]) + mi = MultiIndex.from_tuples( + [ + (pd.Period("2018"), pd.Period("2018Q1")), + (pd.Period("2018"), pd.Period("2018Q2")), + ] + ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(self.path) - result = pd.read_excel(self.path, header=[0, 1], - index_col=0, convert_float=False) + result = pd.read_excel( + self.path, header=[0, 1], index_col=0, convert_float=False + ) # need to convert PeriodIndexes to standard Indexes for assert equal - expected.columns.set_levels([[str(i) for i in mi.levels[0]], - [str(i) for i in mi.levels[1]]], - level=[0, 1], - inplace=True) + expected.columns.set_levels( + [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], + level=[0, 1], + inplace=True, + ) expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('dtype', [None, object]) - def test_raise_when_saving_timezones(self, engine, ext, dtype, - tz_aware_fixture): + @pytest.mark.parametrize("dtype", [None, object]) + def test_raise_when_saving_timezones(self, engine, ext, dtype, tz_aware_fixture): # GH 27008, GH 7056 tz = tz_aware_fixture - data = pd.Timestamp('2019', tz=tz) + data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(self.path) @@ -1192,25 +1213,26 @@ def test_raise_when_saving_timezones(self, engine, ext, dtype, class TestExcelWriterEngineTests: - - @pytest.mark.parametrize('klass,ext', [ - pytest.param(_XlsxWriter, '.xlsx', marks=td.skip_if_no('xlsxwriter')), - pytest.param( - _OpenpyxlWriter, '.xlsx', marks=td.skip_if_no('openpyxl')), - pytest.param(_XlwtWriter, '.xls', marks=td.skip_if_no('xlwt')) - ]) + @pytest.mark.parametrize( + "klass,ext", + [ + 
pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param(_XlwtWriter, ".xls", marks=td.skip_if_no("xlwt")), + ], + ) def test_ExcelWriter_dispatch(self, klass, ext): with ensure_clean(ext) as path: writer = ExcelWriter(path) - if ext == '.xlsx' and td.safe_import('xlsxwriter'): + if ext == ".xlsx" and td.safe_import("xlsxwriter"): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): - with pytest.raises(ValueError, match='No engine'): - ExcelWriter('nothing') + with pytest.raises(ValueError, match="No engine"): + ExcelWriter("nothing") def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works @@ -1220,8 +1242,8 @@ def test_register_writer(self): class DummyClass(ExcelWriter): called_save = False called_write_cells = False - supported_extensions = ['xlsx', 'xls'] - engine = 'dummy' + supported_extensions = ["xlsx", "xls"] + engine = "dummy" def save(self): called_save.append(True) @@ -1236,24 +1258,21 @@ def check_called(func): del called_save[:] del called_write_cells[:] - with pd.option_context('io.excel.xlsx.writer', 'dummy'): + with pd.option_context("io.excel.xlsx.writer", "dummy"): register_writer(DummyClass) - writer = ExcelWriter('something.xlsx') + writer = ExcelWriter("something.xlsx") assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) - check_called(lambda: df.to_excel('something.xlsx')) - check_called( - lambda: df.to_excel( - 'something.xls', engine='dummy')) + check_called(lambda: df.to_excel("something.xlsx")) + check_called(lambda: df.to_excel("something.xls", engine="dummy")) -@td.skip_if_no('xlrd') -@td.skip_if_no('openpyxl') -@pytest.mark.skipif(not PY36, reason='requires fspath') +@td.skip_if_no("xlrd") +@td.skip_if_no("openpyxl") +@pytest.mark.skipif(not PY36, reason="requires fspath") class TestFSPath: - def test_excelfile_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: + with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) xl = ExcelFile(path) @@ -1261,6 +1280,6 @@ def test_excelfile_fspath(self): assert result == path def test_excelwriter_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: + with tm.ensure_clean("foo.xlsx") as path: writer = ExcelWriter(path) assert os.fspath(writer) == str(path) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index d749f0ec3e2525..c4d99c827318de 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -30,14 +30,13 @@ def test_read_xlrd_book(read_ext, frame): result = pd.read_excel(xl, sheet_name, index_col=0) tm.assert_frame_equal(df, result) - result = pd.read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) + result = pd.read_excel(book, sheet_name=sheet_name, engine=engine, index_col=0) tm.assert_frame_equal(df, result) # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): - path = datapath("io", "data", 'test1{}'.format(read_ext)) + path = datapath("io", "data", "test1{}".format(read_ext)) with pd.ExcelFile(path) as excel: with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, 'asdf') + pd.read_excel(excel, "asdf") diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 391a1085161f00..4dae3db2e7abdb 100644 --- 
a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -9,7 +9,7 @@ xlsxwriter = pytest.importorskip("xlsxwriter") -pytestmark = pytest.mark.parametrize("ext", ['.xlsx']) +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) def test_column_format(ext): @@ -21,33 +21,32 @@ def test_column_format(ext): openpyxl = pytest.importorskip("openpyxl") with ensure_clean(ext) as path: - frame = DataFrame({'A': [123456, 123456], - 'B': [123456, 123456]}) + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) writer = ExcelWriter(path) frame.to_excel(writer) # Add a number format to col B and ensure it is applied to cells. - num_format = '#,##0' + num_format = "#,##0" write_workbook = writer.book write_worksheet = write_workbook.worksheets()[0] - col_format = write_workbook.add_format({'num_format': num_format}) - write_worksheet.set_column('B:B', None, col_format) + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) writer.save() read_workbook = openpyxl.load_workbook(path) try: - read_worksheet = read_workbook['Sheet1'] + read_worksheet = read_workbook["Sheet1"] except TypeError: # compat - read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') + read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") # Get the number format from the cell. try: - cell = read_worksheet['B2'] + cell = read_worksheet["B2"] except TypeError: # compat - cell = read_worksheet.cell('B2') + cell = read_worksheet.cell("B2") try: read_num_format = cell.number_format @@ -62,4 +61,4 @@ def test_write_append_mode_raises(ext): with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine='xlsxwriter', mode='a') + ExcelWriter(f, engine="xlsxwriter", mode="a") diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index 9c687f1f514f9a..51f94a2f01b849 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -9,14 +9,14 @@ xlwt = pytest.importorskip("xlwt") -pytestmark = pytest.mark.parametrize("ext,", ['.xls']) +pytestmark = pytest.mark.parametrize("ext,", [".xls"]) def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): # MultiIndex as columns is not yet implemented 9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = DataFrame(np.random.randn(10, 3), columns=cols) with pytest.raises(NotImplementedError): with ensure_clean(ext) as path: @@ -24,9 +24,9 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): def test_excel_multiindex_columns_and_index_true(ext): - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = pd.DataFrame(np.random.randn(10, 3), columns=cols) with ensure_clean(ext) as path: df.to_excel(path, index=True) @@ -34,21 +34,20 @@ def test_excel_multiindex_columns_and_index_true(ext): def test_excel_multiindex_index(ext): # MultiIndex as index works so assert no error #9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = DataFrame(np.random.randn(3, 10), index=cols) with ensure_clean(ext) as path: df.to_excel(path, index=False) def 
test_to_excel_styleconverter(ext): - hstyle = {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", "vertical": "top"}} + hstyle = { + "font": {"bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + } xls_style = _XlwtWriter._convert_to_style(hstyle) assert xls_style.font.bold @@ -65,4 +64,4 @@ def test_write_append_mode_raises(ext): with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine='xlwt', mode='a') + ExcelWriter(f, engine="xlwt", mode="a") diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py index 2f012c4d019125..f4bee99296a834 100644 --- a/pandas/tests/io/formats/test_console.py +++ b/pandas/tests/io/formats/test_console.py @@ -9,6 +9,7 @@ class MockEncoding: # TODO(py27): replace with mock side effect is a str in nature, the value will be returned. Otherwise, the side effect should be an exception that will be raised. """ + def __init__(self, encoding): super().__init__() self.val = encoding @@ -25,50 +26,45 @@ def raise_or_return(val): raise val -@pytest.mark.parametrize('empty,filled', [ - ['stdin', 'stdout'], - ['stdout', 'stdin'] -]) +@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]]) def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled): # Ensures that when sys.stdout.encoding or sys.stdin.encoding is used when # they have values filled. # GH 21552 with monkeypatch.context() as context: - context.setattr('sys.{}'.format(empty), MockEncoding('')) - context.setattr('sys.{}'.format(filled), MockEncoding(filled)) + context.setattr("sys.{}".format(empty), MockEncoding("")) + context.setattr("sys.{}".format(filled), MockEncoding(filled)) assert detect_console_encoding() == filled -@pytest.mark.parametrize('encoding', [ - AttributeError, - IOError, - 'ascii' -]) +@pytest.mark.parametrize("encoding", [AttributeError, IOError, "ascii"]) def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): # GH 21552 with monkeypatch.context() as context: - context.setattr('locale.getpreferredencoding', lambda: 'foo') - context.setattr('sys.stdout', MockEncoding(encoding)) - assert detect_console_encoding() == 'foo' + context.setattr("locale.getpreferredencoding", lambda: "foo") + context.setattr("sys.stdout", MockEncoding(encoding)) + assert detect_console_encoding() == "foo" -@pytest.mark.parametrize('std,locale', [ - ['ascii', 'ascii'], - ['ascii', Exception], - [AttributeError, 'ascii'], - [AttributeError, Exception], - [IOError, 'ascii'], - [IOError, Exception] -]) +@pytest.mark.parametrize( + "std,locale", + [ + ["ascii", "ascii"], + ["ascii", Exception], + [AttributeError, "ascii"], + [AttributeError, Exception], + [IOError, "ascii"], + [IOError, Exception], + ], +) def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): # When both the stdout/stdin encoding and locale preferred encoding checks # fail (or return 'ascii', we should default to the sys default encoding. 
# GH 21552 with monkeypatch.context() as context: context.setattr( - 'locale.getpreferredencoding', - lambda: MockEncoding.raise_or_return(locale) + "locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale) ) - context.setattr('sys.stdout', MockEncoding(std)) - context.setattr('sys.getdefaultencoding', lambda: 'sysDefaultEncoding') - assert detect_console_encoding() == 'sysDefaultEncoding' + context.setattr("sys.stdout", MockEncoding(std)) + context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding") + assert detect_console_encoding() == "sysDefaultEncoding" diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index f251bd983509e9..74e78b033bac98 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -18,35 +18,44 @@ def assert_same_resolution(css1, css2, inherited=None): assert resolved1 == resolved2 -@pytest.mark.parametrize('name,norm,abnorm', [ - ('whitespace', 'hello: world; foo: bar', - ' \t hello \t :\n world \n ; \n foo: \tbar\n\n'), - ('case', 'hello: world; foo: bar', 'Hello: WORLD; foO: bar'), - ('empty-decl', 'hello: world; foo: bar', - '; hello: world;; foo: bar;\n; ;'), - ('empty-list', '', ';'), -]) +@pytest.mark.parametrize( + "name,norm,abnorm", + [ + ( + "whitespace", + "hello: world; foo: bar", + " \t hello \t :\n world \n ; \n foo: \tbar\n\n", + ), + ("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"), + ("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"), + ("empty-list", "", ";"), + ], +) def test_css_parse_normalisation(name, norm, abnorm): assert_same_resolution(norm, abnorm) @pytest.mark.parametrize( - 'invalid_css,remainder', [ + "invalid_css,remainder", + [ # No colon - ('hello-world', ''), - ('border-style: solid; hello-world', 'border-style: solid'), - ('border-style: solid; hello-world; font-weight: bold', - 'border-style: solid; font-weight: bold'), + ("hello-world", ""), + ("border-style: solid; hello-world", "border-style: solid"), + ( + "border-style: solid; hello-world; font-weight: bold", + "border-style: solid; font-weight: bold", + ), # Unclosed string fail # Invalid size - ('font-size: blah', 'font-size: 1em'), - ('font-size: 1a2b', 'font-size: 1em'), - ('font-size: 1e5pt', 'font-size: 1em'), - ('font-size: 1+6pt', 'font-size: 1em'), - ('font-size: 1unknownunit', 'font-size: 1em'), - ('font-size: 10', 'font-size: 1em'), - ('font-size: 10 pt', 'font-size: 1em'), - ]) + ("font-size: blah", "font-size: 1em"), + ("font-size: 1a2b", "font-size: 1em"), + ("font-size: 1e5pt", "font-size: 1em"), + ("font-size: 1+6pt", "font-size: 1em"), + ("font-size: 1unknownunit", "font-size: 1em"), + ("font-size: 10", "font-size: 1em"), + ("font-size: 10 pt", "font-size: 1em"), + ], +) def test_css_parse_invalid(invalid_css, remainder): with tm.assert_produces_warning(CSSWarning): assert_same_resolution(invalid_css, remainder) @@ -55,59 +64,89 @@ def test_css_parse_invalid(invalid_css, remainder): @pytest.mark.parametrize( - 'shorthand,expansions', - [('margin', ['margin-top', 'margin-right', - 'margin-bottom', 'margin-left']), - ('padding', ['padding-top', 'padding-right', - 'padding-bottom', 'padding-left']), - ('border-width', ['border-top-width', 'border-right-width', - 'border-bottom-width', 'border-left-width']), - ('border-color', ['border-top-color', 'border-right-color', - 'border-bottom-color', 'border-left-color']), - ('border-style', ['border-top-style', 'border-right-style', - 'border-bottom-style', 'border-left-style']), - ]) 
+ "shorthand,expansions", + [ + ("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]), + ("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]), + ( + "border-width", + [ + "border-top-width", + "border-right-width", + "border-bottom-width", + "border-left-width", + ], + ), + ( + "border-color", + [ + "border-top-color", + "border-right-color", + "border-bottom-color", + "border-left-color", + ], + ), + ( + "border-style", + [ + "border-top-style", + "border-right-style", + "border-bottom-style", + "border-left-style", + ], + ), + ], +) def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions - assert_resolves('{shorthand}: 1pt'.format(shorthand=shorthand), - {top: '1pt', right: '1pt', - bottom: '1pt', left: '1pt'}) + assert_resolves( + "{shorthand}: 1pt".format(shorthand=shorthand), + {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '1pt', left: '4pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt 2pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '2pt', left: '4pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt 2pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt 2pt 0pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '2pt', left: '0pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt 2pt 0pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, + ) with tm.assert_produces_warning(CSSWarning): assert_resolves( - '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand), {}) - - -@pytest.mark.parametrize('style,inherited,equiv', [ - ('margin: 1px; margin: 2px', '', - 'margin: 2px'), - ('margin: 1px', 'margin: 2px', - 'margin: 1px'), - ('margin: 1px; margin: inherit', 'margin: 2px', - 'margin: 2px'), - ('margin: 1px; margin-top: 2px', '', - 'margin-left: 1px; margin-right: 1px; ' + - 'margin-bottom: 1px; margin-top: 2px'), - ('margin-top: 2px', 'margin: 1px', - 'margin: 1px; margin-top: 2px'), - ('margin: 1px', 'margin-top: 2px', - 'margin: 1px'), - ('margin: 1px; margin-top: inherit', 'margin: 2px', - 'margin: 1px; margin-top: 2px'), -]) + "{shorthand}: 1pt 1pt 1pt 1pt 1pt".format(shorthand=shorthand), {} + ) + + +@pytest.mark.parametrize( + "style,inherited,equiv", + [ + ("margin: 1px; margin: 2px", "", "margin: 2px"), + ("margin: 1px", "margin: 2px", "margin: 1px"), + ("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"), + ( + "margin: 1px; margin-top: 2px", + "", + "margin-left: 1px; margin-right: 1px; " + + "margin-bottom: 1px; margin-top: 2px", + ), + ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), + ("margin: 1px", "margin-top: 2px", "margin: 1px"), + ( + "margin: 1px; margin-top: inherit", + "margin: 2px", + "margin: 1px; margin-top: 2px", + ), + ], +) def test_css_precedence(style, inherited, equiv): resolve = CSSResolver() inherited_props = resolve(inherited) @@ -116,72 +155,86 @@ def test_css_precedence(style, inherited, equiv): assert style_props == equiv_props -@pytest.mark.parametrize('style,equiv', [ - ('margin: 1px; margin-top: inherit', - 'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'), - ('margin-top: inherit', ''), - ('margin-top: initial', ''), -]) 
+@pytest.mark.parametrize( + "style,equiv", + [ + ( + "margin: 1px; margin-top: inherit", + "margin-bottom: 1px; margin-right: 1px; margin-left: 1px", + ), + ("margin-top: inherit", ""), + ("margin-top: initial", ""), + ], +) def test_css_none_absent(style, equiv): assert_same_resolution(style, equiv) -@pytest.mark.parametrize('size,resolved', [ - ('xx-small', '6pt'), - ('x-small', '{pt:f}pt'.format(pt=7.5)), - ('small', '{pt:f}pt'.format(pt=9.6)), - ('medium', '12pt'), - ('large', '{pt:f}pt'.format(pt=13.5)), - ('x-large', '18pt'), - ('xx-large', '24pt'), - - ('8px', '6pt'), - ('1.25pc', '15pt'), - ('.25in', '18pt'), - ('02.54cm', '72pt'), - ('25.4mm', '72pt'), - ('101.6q', '72pt'), - ('101.6q', '72pt'), -]) -@pytest.mark.parametrize('relative_to', # invariant to inherited size - [None, '16pt']) +@pytest.mark.parametrize( + "size,resolved", + [ + ("xx-small", "6pt"), + ("x-small", "{pt:f}pt".format(pt=7.5)), + ("small", "{pt:f}pt".format(pt=9.6)), + ("medium", "12pt"), + ("large", "{pt:f}pt".format(pt=13.5)), + ("x-large", "18pt"), + ("xx-large", "24pt"), + ("8px", "6pt"), + ("1.25pc", "15pt"), + (".25in", "18pt"), + ("02.54cm", "72pt"), + ("25.4mm", "72pt"), + ("101.6q", "72pt"), + ("101.6q", "72pt"), + ], +) +@pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size def test_css_absolute_font_size(size, relative_to, resolved): if relative_to is None: inherited = None else: - inherited = {'font-size': relative_to} - assert_resolves('font-size: {size}'.format(size=size), - {'font-size': resolved}, inherited=inherited) - - -@pytest.mark.parametrize('size,relative_to,resolved', [ - ('1em', None, '12pt'), - ('1.0em', None, '12pt'), - ('1.25em', None, '15pt'), - ('1em', '16pt', '16pt'), - ('1.0em', '16pt', '16pt'), - ('1.25em', '16pt', '20pt'), - ('1rem', '16pt', '12pt'), - ('1.0rem', '16pt', '12pt'), - ('1.25rem', '16pt', '15pt'), - ('100%', None, '12pt'), - ('125%', None, '15pt'), - ('100%', '16pt', '16pt'), - ('125%', '16pt', '20pt'), - ('2ex', None, '12pt'), - ('2.0ex', None, '12pt'), - ('2.50ex', None, '15pt'), - ('inherit', '16pt', '16pt'), - - ('smaller', None, '10pt'), - ('smaller', '18pt', '15pt'), - ('larger', None, '{pt:f}pt'.format(pt=14.4)), - ('larger', '15pt', '18pt'), -]) + inherited = {"font-size": relative_to} + assert_resolves( + "font-size: {size}".format(size=size), + {"font-size": resolved}, + inherited=inherited, + ) + + +@pytest.mark.parametrize( + "size,relative_to,resolved", + [ + ("1em", None, "12pt"), + ("1.0em", None, "12pt"), + ("1.25em", None, "15pt"), + ("1em", "16pt", "16pt"), + ("1.0em", "16pt", "16pt"), + ("1.25em", "16pt", "20pt"), + ("1rem", "16pt", "12pt"), + ("1.0rem", "16pt", "12pt"), + ("1.25rem", "16pt", "15pt"), + ("100%", None, "12pt"), + ("125%", None, "15pt"), + ("100%", "16pt", "16pt"), + ("125%", "16pt", "20pt"), + ("2ex", None, "12pt"), + ("2.0ex", None, "12pt"), + ("2.50ex", None, "15pt"), + ("inherit", "16pt", "16pt"), + ("smaller", None, "10pt"), + ("smaller", "18pt", "15pt"), + ("larger", None, "{pt:f}pt".format(pt=14.4)), + ("larger", "15pt", "18pt"), + ], +) def test_css_relative_font_size(size, relative_to, resolved): if relative_to is None: inherited = None else: - inherited = {'font-size': relative_to} - assert_resolves('font-size: {size}'.format(size=size), - {'font-size': resolved}, inherited=inherited) + inherited = {"font-size": relative_to} + assert_resolves( + "font-size: {size}".format(size=size), + {"font-size": resolved}, + inherited=inherited, + ) diff --git 
a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index fc9886bec766f5..b122e4f6c3f33e 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -8,35 +8,32 @@ class TestEngFormatter: - def test_eng_float_formatter(self): - df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) + df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) fmt.set_eng_float_format() result = df.to_string() - expected = (' A\n' - '0 1.410E+00\n' - '1 141.000E+00\n' - '2 14.100E+03\n' - '3 1.410E+06') + expected = ( + " A\n" + "0 1.410E+00\n" + "1 141.000E+00\n" + "2 14.100E+03\n" + "3 1.410E+06" + ) assert result == expected fmt.set_eng_float_format(use_eng_prefix=True) result = df.to_string() - expected = (' A\n' - '0 1.410\n' - '1 141.000\n' - '2 14.100k\n' - '3 1.410M') + expected = ( + " A\n" "0 1.410\n" "1 141.000\n" "2 14.100k\n" "3 1.410M" + ) assert result == expected fmt.set_eng_float_format(accuracy=0) result = df.to_string() - expected = (' A\n' - '0 1E+00\n' - '1 141E+00\n' - '2 14E+03\n' - '3 1E+06') + expected = ( + " A\n" "0 1E+00\n" "1 141E+00\n" "2 14E+03\n" "3 1E+06" + ) assert result == expected tm.reset_display_options() @@ -64,32 +61,58 @@ def test_exponents_with_eng_prefix(self): formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) f = np.sqrt(2) in_out = [ - (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"), - (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"), - (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"), - (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"), - (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"), - (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"), - (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"), - (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"), - (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"), - (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"), - (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"), - (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"), - (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"), - (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"), - (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"), - (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"), - (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"), - (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"), - (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"), - (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"), - (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"), - (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"), - (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"), - (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"), - (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"), - (f * 10 ** 26, " 141.421Y")] + (f * 10 ** -24, " 1.414y"), + (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), + (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), + (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), + (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), + (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), + (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), + (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), + (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), + (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), + (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), + (f * 10 ** -3, " 1.414m"), + (f * 10 
** -2, " 14.142m"), + (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), + (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), + (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), + (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), + (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), + (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), + (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), + (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), + (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), + (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), + (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), + (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), + (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), + (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y"), + ] self.compare_all(formatter, in_out) def test_exponents_without_eng_prefix(self): @@ -146,44 +169,66 @@ def test_exponents_without_eng_prefix(self): (f * 10 ** 23, " 314.1593E+21"), (f * 10 ** 24, " 3.1416E+24"), (f * 10 ** 25, " 31.4159E+24"), - (f * 10 ** 26, " 314.1593E+24")] + (f * 10 ** 26, " 314.1593E+24"), + ] self.compare_all(formatter, in_out) def test_rounding(self): formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) - in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'), - (555.555, ' 555.555'), (5555.55, ' 5.556k'), - (55555.5, ' 55.556k'), (555555, ' 555.555k')] + in_out = [ + (5.55555, " 5.556"), + (55.5555, " 55.556"), + (555.555, " 555.555"), + (5555.55, " 5.556k"), + (55555.5, " 55.556k"), + (555555, " 555.555k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) - in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'), - (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')] + in_out = [ + (5.55555, " 5.6"), + (55.5555, " 55.6"), + (555.555, " 555.6"), + (5555.55, " 5.6k"), + (55555.5, " 55.6k"), + (555555, " 555.6k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) - in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'), - (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')] + in_out = [ + (5.55555, " 6"), + (55.5555, " 56"), + (555.555, " 556"), + (5555.55, " 6k"), + (55555.5, " 56k"), + (555555, " 556k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) result = formatter(0) - assert result == ' 0.000' + assert result == " 0.000" def test_nan(self): # Issue #11981 formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.nan) - assert result == 'NaN' - - df = pd.DataFrame({'a': [1.5, 10.3, 20.5], - 'b': [50.3, 60.67, 70.12], - 'c': [100.2, 101.33, 120.33]}) - pt = df.pivot_table(values='a', index='b', columns='c') + assert result == "NaN" + + df = pd.DataFrame( + { + "a": [1.5, 10.3, 20.5], + "b": [50.3, 60.67, 70.12], + "c": [100.2, 101.33, 120.33], + } + ) + pt = df.pivot_table(values="a", index="b", columns="c") fmt.set_eng_float_format(accuracy=1) result = pt.to_string() - assert 'NaN' in result + assert "NaN" in result tm.reset_display_options() def test_inf(self): @@ -191,4 +236,4 @@ def test_inf(self): formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.inf) - assert result == 'inf' + assert result == "inf" diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 7098a382cad45b..af862b11c756c4 100644 --- 
a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -21,8 +21,19 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, NaT, Series, Timestamp, date_range, - get_option, option_context, read_csv, reset_option, set_option) + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + get_option, + option_context, + read_csv, + reset_option, + set_option, +) import pandas.util.testing as tm import pandas.io.formats.format as fmt @@ -38,8 +49,8 @@ def curpath(): def has_info_repr(df): r = repr(df) - c1 = r.split('\n')[0].startswith(" max_len: - assert '...' in line + assert "..." in line else: - assert '...' not in line + assert "..." not in line with option_context("display.max_colwidth", 999999): - assert '...' not in repr(df) + assert "..." not in repr(df) with option_context("display.max_colwidth", max_len + 2): - assert '...' not in repr(df) + assert "..." not in repr(df) def test_repr_chop_threshold(self): df = DataFrame([[0.1, 0.5], [0.5, -0.1]]) pd.reset_option("display.chop_threshold") # default None - assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1' + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" with option_context("display.chop_threshold", 0.2): - assert repr(df) == ' 0 1\n0 0.0 0.5\n1 0.5 0.0' + assert repr(df) == " 0 1\n0 0.0 0.5\n1 0.5 0.0" with option_context("display.chop_threshold", 0.6): - assert repr(df) == ' 0 1\n0 0.0 0.0\n1 0.0 0.0' + assert repr(df) == " 0 1\n0 0.0 0.0\n1 0.0 0.0" with option_context("display.chop_threshold", None): - assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1' + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" def test_repr_chop_threshold_column_below(self): # GH 6839: validation case - df = pd.DataFrame([[10, 20, 30, 40], - [8e-10, -1e-11, 2e-9, -2e-11]]).T + df = pd.DataFrame([[10, 20, 30, 40], [8e-10, -1e-11, 2e-9, -2e-11]]).T with option_context("display.chop_threshold", 0): - assert repr(df) == (' 0 1\n' - '0 10.0 8.000000e-10\n' - '1 20.0 -1.000000e-11\n' - '2 30.0 2.000000e-09\n' - '3 40.0 -2.000000e-11') + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 -1.000000e-11\n" + "2 30.0 2.000000e-09\n" + "3 40.0 -2.000000e-11" + ) with option_context("display.chop_threshold", 1e-8): - assert repr(df) == (' 0 1\n' - '0 10.0 0.000000e+00\n' - '1 20.0 0.000000e+00\n' - '2 30.0 0.000000e+00\n' - '3 40.0 0.000000e+00') + assert repr(df) == ( + " 0 1\n" + "0 10.0 0.000000e+00\n" + "1 20.0 0.000000e+00\n" + "2 30.0 0.000000e+00\n" + "3 40.0 0.000000e+00" + ) with option_context("display.chop_threshold", 5e-11): - assert repr(df) == (' 0 1\n' - '0 10.0 8.000000e-10\n' - '1 20.0 0.000000e+00\n' - '2 30.0 2.000000e-09\n' - '3 40.0 0.000000e+00') + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 0.000000e+00\n" + "2 30.0 2.000000e-09\n" + "3 40.0 0.000000e+00" + ) def test_repr_obeys_max_seq_limit(self): with option_context("display.max_seq_items", 2000): @@ -227,12 +245,12 @@ def test_repr_obeys_max_seq_limit(self): assert len(printing.pprint_thing(list(range(1000)))) < 100 def test_repr_set(self): - assert printing.pprint_thing({1}) == '{1}' + assert printing.pprint_thing({1}) == "{1}" def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather then # stylized - idx = Index(['a', 'b']) + idx = Index(["a", "b"]) res = eval("pd." 
+ repr(idx)) tm.assert_series_equal(Series(res), Series(idx)) @@ -249,20 +267,27 @@ def test_repr_should_return_str(self): assert type(df.__repr__()) == str # both py2 / 3 def test_repr_no_backslash(self): - with option_context('mode.sim_interactive', True): + with option_context("mode.sim_interactive", True): df = DataFrame(np.random.randn(10, 4)) - assert '\\' not in repr(df) + assert "\\" not in repr(df) def test_expand_frame_repr(self): - df_small = DataFrame('hello', index=[0], columns=[0]) - df_wide = DataFrame('hello', index=[0], columns=range(10)) - df_tall = DataFrame('hello', index=range(30), columns=range(5)) - - with option_context('mode.sim_interactive', True): - with option_context('display.max_columns', 10, 'display.width', 20, - 'display.max_rows', 20, - 'display.show_dimensions', True): - with option_context('display.expand_frame_repr', True): + df_small = DataFrame("hello", index=[0], columns=[0]) + df_wide = DataFrame("hello", index=[0], columns=range(10)) + df_tall = DataFrame("hello", index=range(30), columns=range(5)) + + with option_context("mode.sim_interactive", True): + with option_context( + "display.max_columns", + 10, + "display.width", + 20, + "display.max_rows", + 20, + "display.show_dimensions", + True, + ): + with option_context("display.expand_frame_repr", True): assert not has_truncated_repr(df_small) assert not has_expanded_repr(df_small) assert not has_truncated_repr(df_wide) @@ -270,7 +295,7 @@ def test_expand_frame_repr(self): assert has_vertically_truncated_repr(df_tall) assert has_expanded_repr(df_tall) - with option_context('display.expand_frame_repr', False): + with option_context("display.expand_frame_repr", False): assert not has_truncated_repr(df_small) assert not has_expanded_repr(df_small) assert not has_horizontally_truncated_repr(df_wide) @@ -281,10 +306,11 @@ def test_expand_frame_repr(self): def test_repr_non_interactive(self): # in non interactive mode, there can be no dependency on the # result of terminal auto size detection - df = DataFrame('hello', index=range(1000), columns=range(5)) + df = DataFrame("hello", index=range(1000), columns=range(5)) - with option_context('mode.sim_interactive', False, 'display.width', 0, - 'display.max_rows', 5000): + with option_context( + "mode.sim_interactive", False, "display.width", 0, "display.max_rows", 5000 + ): assert not has_truncated_repr(df) assert not has_expanded_repr(df) @@ -292,85 +318,99 @@ def test_repr_truncates_terminal_size(self, monkeypatch): # see gh-21180 terminal_size = (118, 96) - monkeypatch.setattr('pandas.io.formats.format.get_terminal_size', - lambda: terminal_size) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) index = range(5) - columns = pd.MultiIndex.from_tuples([ - ('This is a long title with > 37 chars.', 'cat'), - ('This is a loooooonger title with > 43 chars.', 'dog'), - ]) + columns = pd.MultiIndex.from_tuples( + [ + ("This is a long title with > 37 chars.", "cat"), + ("This is a loooooonger title with > 43 chars.", "dog"), + ] + ) df = pd.DataFrame(1, index=index, columns=columns) result = repr(df) - h1, h2 = result.split('\n')[:2] - assert 'long' in h1 - assert 'loooooonger' in h1 - assert 'cat' in h2 - assert 'dog' in h2 + h1, h2 = result.split("\n")[:2] + assert "long" in h1 + assert "loooooonger" in h1 + assert "cat" in h2 + assert "dog" in h2 # regular columns - df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]}) + df2 = pd.DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) result = repr(df2) - assert 
df2.columns[0] in result.split('\n')[0] + assert df2.columns[0] in result.split("\n")[0] def test_repr_truncates_terminal_size_full(self, monkeypatch): # GH 22984 ensure entire window is filled terminal_size = (80, 24) df = pd.DataFrame(np.random.rand(1, 7)) - monkeypatch.setattr('pandas.io.formats.format.get_terminal_size', - lambda: terminal_size) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) assert "..." not in str(df) def test_repr_truncation_column_size(self): # dataframe with last column very wide -> check it is not used to # determine size of truncation (...) column - df = pd.DataFrame({'a': [108480, 30830], 'b': [12345, 12345], - 'c': [12345, 12345], 'd': [12345, 12345], - 'e': ['a' * 50] * 2}) + df = pd.DataFrame( + { + "a": [108480, 30830], + "b": [12345, 12345], + "c": [12345, 12345], + "d": [12345, 12345], + "e": ["a" * 50] * 2, + } + ) assert "..." in str(df) assert " ... " not in str(df) def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: - pytest.skip("terminal size too small, " - "{0} x {1}".format(term_width, term_height)) + pytest.skip( + "terminal size too small, " "{0} x {1}".format(term_width, term_height) + ) def mkframe(n): - index = ['{i:05d}'.format(i=i) for i in range(n)] + index = ["{i:05d}".format(i=i) for i in range(n)] return DataFrame(0, index, index) df6 = mkframe(6) df10 = mkframe(10) - with option_context('mode.sim_interactive', True): - with option_context('display.width', term_width * 2): - with option_context('display.max_rows', 5, - 'display.max_columns', 5): + with option_context("mode.sim_interactive", True): + with option_context("display.width", term_width * 2): + with option_context("display.max_rows", 5, "display.max_columns", 5): assert not has_expanded_repr(mkframe(4)) assert not has_expanded_repr(mkframe(5)) assert not has_expanded_repr(df6) assert has_doubly_truncated_repr(df6) - with option_context('display.max_rows', 20, - 'display.max_columns', 10): + with option_context("display.max_rows", 20, "display.max_columns", 10): # Out off max_columns boundary, but no extending # since not exceeding width assert not has_expanded_repr(df6) assert not has_truncated_repr(df6) - with option_context('display.max_rows', 9, - 'display.max_columns', 10): + with option_context("display.max_rows", 9, "display.max_columns", 10): # out vertical bounds can not result in exanded repr assert not has_expanded_repr(df10) assert has_vertically_truncated_repr(df10) # width=None in terminal, auto detection - with option_context('display.max_columns', 100, 'display.max_rows', - term_width * 20, 'display.width', None): + with option_context( + "display.max_columns", + 100, + "display.max_rows", + term_width * 20, + "display.width", + None, + ): df = mkframe((term_width // 7) - 2) assert not has_expanded_repr(df) df = mkframe((term_width // 7) + 2) @@ -378,51 +418,58 @@ def mkframe(n): assert has_expanded_repr(df) def test_repr_min_rows(self): - df = pd.DataFrame({'a': range(20)}) + df = pd.DataFrame({"a": range(20)}) # default setting no truncation even if above min_rows - assert '..' not in repr(df) + assert ".." not in repr(df) - df = pd.DataFrame({'a': range(61)}) + df = pd.DataFrame({"a": range(61)}) # default of max_rows 60 triggers truncation if above - assert '..' in repr(df) + assert ".." 
in repr(df) - with option_context('display.max_rows', 10, 'display.min_rows', 4): + with option_context("display.max_rows", 10, "display.min_rows", 4): # truncated after first two rows - assert '..' in repr(df) - assert '2 ' not in repr(df) + assert ".." in repr(df) + assert "2 " not in repr(df) - with option_context('display.max_rows', 12, 'display.min_rows', None): + with option_context("display.max_rows", 12, "display.min_rows", None): # when set to None, follow value of max_rows - assert '5 5' in repr(df) + assert "5 5" in repr(df) - with option_context('display.max_rows', 10, 'display.min_rows', 12): + with option_context("display.max_rows", 10, "display.min_rows", 12): # when set value higher as max_rows, use the minimum - assert '5 5' not in repr(df) + assert "5 5" not in repr(df) - with option_context('display.max_rows', None, 'display.min_rows', 12): + with option_context("display.max_rows", None, "display.min_rows", 12): # max_rows of None -> never truncate - assert '..' not in repr(df) + assert ".." not in repr(df) def test_str_max_colwidth(self): # GH 7856 - df = pd.DataFrame([{'a': 'foo', - 'b': 'bar', - 'c': 'uncomfortably long line with lots of stuff', - 'd': 1}, {'a': 'foo', - 'b': 'bar', - 'c': 'stuff', - 'd': 1}]) - df.set_index(['a', 'b', 'c']) + df = pd.DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "uncomfortably long line with lots of stuff", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) assert str(df) == ( - ' a b c d\n' - '0 foo bar uncomfortably long line with lots of stuff 1\n' - '1 foo bar stuff 1') - with option_context('max_colwidth', 20): - assert str(df) == (' a b c d\n' - '0 foo bar uncomfortably lo... 1\n' - '1 foo bar stuff 1') + " a b c d\n" + "0 foo bar uncomfortably long line with lots of stuff 1\n" + "1 foo bar stuff 1" + ) + with option_context("max_colwidth", 20): + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably lo... 1\n" + "1 foo bar stuff 1" + ) def test_auto_detect(self): term_width, term_height = get_terminal_size() @@ -430,50 +477,50 @@ def test_auto_detect(self): cols = range(int(term_width * fac)) index = range(10) df = DataFrame(index=index, columns=cols) - with option_context('mode.sim_interactive', True): - with option_context('max_rows', None): - with option_context('max_columns', None): + with option_context("mode.sim_interactive", True): + with option_context("max_rows", None): + with option_context("max_columns", None): # Wrap around with None assert has_expanded_repr(df) - with option_context('max_rows', 0): - with option_context('max_columns', 0): + with option_context("max_rows", 0): + with option_context("max_columns", 0): # Truncate with auto detection. 
assert has_horizontally_truncated_repr(df) index = range(int(term_height * fac)) df = DataFrame(index=index, columns=cols) - with option_context('max_rows', 0): - with option_context('max_columns', None): + with option_context("max_rows", 0): + with option_context("max_columns", None): # Wrap around with None assert has_expanded_repr(df) # Truncate vertically assert has_vertically_truncated_repr(df) - with option_context('max_rows', None): - with option_context('max_columns', 0): + with option_context("max_rows", None): + with option_context("max_columns", 0): assert has_horizontally_truncated_repr(df) def test_to_string_repr_unicode(self): buf = StringIO() - unicode_values = ['\u03c3'] * 10 + unicode_values = ["\u03c3"] * 10 unicode_values = np.array(unicode_values, dtype=object) - df = DataFrame({'unicode': unicode_values}) + df = DataFrame({"unicode": unicode_values}) df.to_string(col_space=10, buf=buf) # it works! repr(df) - idx = Index(['abc', '\u03c3a', 'aegdvg']) + idx = Index(["abc", "\u03c3a", "aegdvg"]) ser = Series(np.random.randn(len(idx)), idx) - rs = repr(ser).split('\n') + rs = repr(ser).split("\n") line_len = len(rs[0]) for line in rs[1:]: try: line = line.decode(get_option("display.encoding")) except AttributeError: pass - if not line.startswith('dtype:'): + if not line.startswith("dtype:"): assert len(line) == line_len # it works even if sys.stdin in None @@ -485,7 +532,7 @@ def test_to_string_repr_unicode(self): sys.stdin = _stdin def test_to_string_unicode_columns(self, float_frame): - df = DataFrame({'\u03c3': np.arange(10.)}) + df = DataFrame({"\u03c3": np.arange(10.0)}) buf = StringIO() df.to_string(buf=buf) @@ -499,308 +546,385 @@ def test_to_string_unicode_columns(self, float_frame): assert isinstance(result, str) def test_to_string_utf8_columns(self): - n = "\u05d0".encode('utf-8') + n = "\u05d0".encode("utf-8") - with option_context('display.max_rows', 1): + with option_context("display.max_rows", 1): df = DataFrame([1, 2], columns=[n]) repr(df) def test_to_string_unicode_two(self): - dm = DataFrame({'c/\u03c3': []}) + dm = DataFrame({"c/\u03c3": []}) buf = StringIO() dm.to_string(buf) def test_to_string_unicode_three(self): - dm = DataFrame(['\xc2']) + dm = DataFrame(["\xc2"]) buf = StringIO() dm.to_string(buf) def test_to_string_with_formatters(self): - df = DataFrame({'int': [1, 2, 3], - 'float': [1.0, 2.0, 3.0], - 'object': [(1, 2), True, False]}, - columns=['int', 'float', 'object']) - - formatters = [('int', lambda x: '0x{x:x}'.format(x=x)), - ('float', lambda x: '[{x: 4.1f}]'.format(x=x)), - ('object', lambda x: '-{x!s}-'.format(x=x))] + df = DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], + }, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: "0x{x:x}".format(x=x)), + ("float", lambda x: "[{x: 4.1f}]".format(x=x)), + ("object", lambda x: "-{x!s}-".format(x=x)), + ] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" + ) assert result == result2 def test_to_string_with_datetime64_monthformatter(self): months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({'months': months}) + x = DataFrame({"months": months}) def format_func(x): - return x.strftime('%Y-%m') - 
result = x.to_string(formatters={'months': format_func}) - expected = 'months\n0 2016-01\n1 2016-02' + return x.strftime("%Y-%m") + + result = x.to_string(formatters={"months": format_func}) + expected = "months\n0 2016-01\n1 2016-02" assert result.strip() == expected def test_to_string_with_datetime64_hourformatter(self): - x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')}) + x = DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ) def format_func(x): - return x.strftime('%H:%M') + return x.strftime("%H:%M") - result = x.to_string(formatters={'hod': format_func}) - expected = 'hod\n0 10:10\n1 12:12' + result = x.to_string(formatters={"hod": format_func}) + expected = "hod\n0 10:10\n1 12:12" assert result.strip() == expected def test_to_string_with_formatters_unicode(self): - df = DataFrame({'c/\u03c3': [1, 2, 3]}) - result = df.to_string( - formatters={'c/\u03c3': lambda x: '{x}'.format(x=x)}) - assert result == ' c/\u03c3\n' + '0 1\n1 2\n2 3' + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": lambda x: "{x}".format(x=x)}) + assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" def test_east_asian_unicode_false(self): # not aligned properly because of east asian width # mid col - df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], - 'b': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # last col - df = DataFrame({'a': [1, 222, 33333, 4], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na 1 あ\n" - "bb 222 いいい\nc 33333 う\n" - "ddd 4 ええええええ") + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) assert repr(df) == expected # all col - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あああああ あ\n" - "bb い いいい\nc う う\n" - "ddd えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あああああ あ\n" + "bb い いいい\nc う う\n" + "ddd えええ ええええええ" + ) assert repr(df) == expected # column name - df = DataFrame({'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" b あああああ\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # index - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['あああ', 'いいいいいい', 'うう', 'え']) - expected = (" a b\nあああ あああああ あ\n" - "いいいいいい い いいい\nうう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\nあああ あああああ あ\n" + "いいいいいい い いいい\nうう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # index name - df = 
DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=pd.Index(['あ', 'い', 'うう', 'え'], - name='おおおお')) - expected = (" a b\n" - "おおおお \n" - "あ あああああ あ\n" - "い い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # all - df = DataFrame({'あああ': ['あああ', 'い', 'う', 'えええええ'], - 'いいいいい': ['あ', 'いいい', 'う', 'ええ']}, - index=pd.Index(['あ', 'いいい', 'うう', 'え'], - name='お')) - expected = (" あああ いいいいい\n" - "お \n" - "あ あああ あ\n" - "いいい い いいい\n" - "うう う う\n" - "え えええええ ええ") + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples([('あ', 'いい'), ('う', 'え'), ( - 'おおお', 'かかかか'), ('き', 'くく')]) - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=idx) - expected = (" a b\n" - "あ いい あああああ あ\n" - "う え い いいい\n" - "おおお かかかか う う\n" - "き くく えええ ええええええ") + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) assert repr(df) == expected # truncate - with option_context('display.max_rows', 3, 'display.max_columns', 3): - df = pd.DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'c': ['お', 'か', 'ききき', 'くくくくくく'], - 'ああああ': ['さ', 'し', 'す', 'せ']}, - columns=['a', 'b', 'c', 'ああああ']) - - expected = (" a ... ああああ\n0 あああああ ... さ\n" - ".. ... ... ...\n3 えええ ... せ\n" - "\n[4 rows x 4 columns]") + with option_context("display.max_rows", 3, "display.max_columns", 3): + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n0 あああああ ... さ\n" + ".. ... ... ...\n3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected - df.index = ['あああ', 'いいいい', 'う', 'aaa'] - expected = (" a ... ああああ\nあああ あああああ ... さ\n" - ".. ... ... ...\naaa えええ ... せ\n" - "\n[4 rows x 4 columns]") + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\nあああ あああああ ... さ\n" + ".. ... ... ...\naaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected def test_east_asian_unicode_true(self): # Enable Unicode option ----------------------------------------- - with option_context('display.unicode.east_asian_width', True): + with option_context("display.unicode.east_asian_width", True): # mid col - df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], - 'b': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # last col - df = DataFrame({'a': [1, 222, 33333, 4], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na 1 あ\n" - "bb 222 いいい\nc 33333 う\n" - "ddd 4 ええええええ") + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) assert repr(df) == expected # all col - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\n" - "a あああああ あ\n" - "bb い いいい\n" - "c う う\n" - "ddd えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\n" + "a あああああ あ\n" + "bb い いいい\n" + "c う う\n" + "ddd えええ ええええええ" + ) assert repr(df) == expected # column name - df = DataFrame({'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" b あああああ\n" - "a あ 1\n" - "bb いいい 222\n" - "c う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # index - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['あああ', 'いいいいいい', 'うう', 'え']) - expected = (" a b\n" - "あああ あああああ あ\n" - "いいいいいい い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\n" + "あああ あああああ あ\n" + "いいいいいい い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # index name - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=pd.Index(['あ', 'い', 'うう', 'え'], - name='おおおお')) - expected = (" a b\n" - "おおおお \n" - "あ あああああ あ\n" - "い い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # all - df = DataFrame({'あああ': ['あああ', 'い', 'う', 'えええええ'], - 'いいいいい': ['あ', 'いいい', 'う', 'ええ']}, - index=pd.Index(['あ', 'いいい', 'うう', 'え'], - name='お')) - expected = (" あああ いいいいい\n" - "お \n" - "あ あああ あ\n" - "いいい い いいい\n" - "うう う う\n" - "え えええええ ええ") + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい 
い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples([('あ', 'いい'), ('う', 'え'), ( - 'おおお', 'かかかか'), ('き', 'くく')]) - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=idx) - expected = (" a b\n" - "あ いい あああああ あ\n" - "う え い いいい\n" - "おおお かかかか う う\n" - "き くく えええ ええええええ") + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) assert repr(df) == expected # truncate - with option_context('display.max_rows', 3, 'display.max_columns', - 3): - - df = pd.DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'c': ['お', 'か', 'ききき', 'くくくくくく'], - 'ああああ': ['さ', 'し', 'す', 'せ']}, - columns=['a', 'b', 'c', 'ああああ']) - - expected = (" a ... ああああ\n" - "0 あああああ ... さ\n" - ".. ... ... ...\n" - "3 えええ ... せ\n" - "\n[4 rows x 4 columns]") + with option_context("display.max_rows", 3, "display.max_columns", 3): + + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n" + "0 あああああ ... さ\n" + ".. ... ... ...\n" + "3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected - df.index = ['あああ', 'いいいい', 'う', 'aaa'] - expected = (" a ... ああああ\n" - "あああ あああああ ... さ\n" - "... ... ... ...\n" - "aaa えええ ... せ\n" - "\n[4 rows x 4 columns]") + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\n" + "あああ あああああ ... さ\n" + "... ... ... ...\n" + "aaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected # ambiguous unicode - df = DataFrame({'b': ['あ', 'いいい', '¡¡', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', '¡¡¡']) - expected = (" b あああああ\n" - "a あ 1\n" - "bb いいい 222\n" - "c ¡¡ 33333\n" - "¡¡¡ ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "¡¡", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "¡¡¡"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c ¡¡ 33333\n" + "¡¡¡ ええええええ 4" + ) assert repr(df) == expected def test_to_string_buffer_all_unicode(self): buf = StringIO() - empty = DataFrame({'c/\u03c3': Series()}) - nonempty = DataFrame({'c/\u03c3': Series([1, 2, 3])}) + empty = DataFrame({"c/\u03c3": Series()}) + nonempty = DataFrame({"c/\u03c3": Series([1, 2, 3])}) print(empty, file=buf) print(nonempty, file=buf) @@ -823,37 +947,41 @@ def test_to_string_with_col_space(self): assert len(with_header_row1) == len(no_header) def test_to_string_truncate_indices(self): - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: for column in [tm.makeStringIndex]: for h in [10, 20]: for w in [10, 20]: - with option_context("display.expand_frame_repr", - False): + with option_context("display.expand_frame_repr", False): df = DataFrame(index=index(h), columns=column(w)) with option_context("display.max_rows", 15): if h == 20: assert has_vertically_truncated_repr(df) else: - assert not has_vertically_truncated_repr( - df) + assert not has_vertically_truncated_repr(df) with option_context("display.max_columns", 15): if w == 20: assert has_horizontally_truncated_repr(df) else: - assert not ( - has_horizontally_truncated_repr(df)) - with option_context("display.max_rows", 15, - "display.max_columns", 15): + assert not (has_horizontally_truncated_repr(df)) + with option_context( + "display.max_rows", 15, "display.max_columns", 15 + ): if h == 20 and w == 20: assert has_doubly_truncated_repr(df) else: - assert not has_doubly_truncated_repr( - df) + assert not has_doubly_truncated_repr(df) def test_to_string_truncate_multilevel(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] df = DataFrame(index=arrays, columns=arrays) with option_context("display.max_rows", 7, "display.max_columns", 7): assert has_doubly_truncated_repr(df) @@ -865,337 +993,358 @@ def test_truncate_with_different_dtypes(self): # 11594 import datetime - s = Series([datetime.datetime(2012, 1, 1)] * 10 + - [datetime.datetime(1012, 1, 2)] + [ - datetime.datetime(2012, 1, 3)] * 10) - with pd.option_context('display.max_rows', 8): + s = Series( + [datetime.datetime(2012, 1, 1)] * 10 + + [datetime.datetime(1012, 1, 2)] + + [datetime.datetime(2012, 1, 3)] * 10 + ) + + with pd.option_context("display.max_rows", 8): result = str(s) - assert 'object' in result + assert "object" in result # 12045 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) - with pd.option_context('display.max_rows', 8, - 'display.max_columns', 3): + with pd.option_context("display.max_rows", 8, "display.max_columns", 3): result = str(df) - assert 'None' in result - assert 'NaN' not in 
result + assert "None" in result + assert "NaN" not in result def test_datetimelike_frame(self): # GH 12211 df = DataFrame( - {'date': [pd.Timestamp('20130101').tz_localize('UTC')] + - [pd.NaT] * 5}) + {"date": [pd.Timestamp("20130101").tz_localize("UTC")] + [pd.NaT] * 5} + ) with option_context("display.max_rows", 5): result = str(df) - assert '2013-01-01 00:00:00+00:00' in result - assert 'NaT' in result - assert '...' in result - assert '[6 rows x 1 columns]' in result - - dts = [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5 + [pd.NaT] * 5 - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 2011-01-01 00:00:00-05:00 1\n' - '1 2011-01-01 00:00:00-05:00 2\n' - '.. ... ..\n' - '8 NaT 9\n' - '9 NaT 10\n\n' - '[10 rows x 2 columns]') + assert "2013-01-01 00:00:00+00:00" in result + assert "NaT" in result + assert "..." in result + assert "[6 rows x 1 columns]" in result + + dts = [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [pd.NaT] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00-05:00 1\n" + "1 2011-01-01 00:00:00-05:00 2\n" + ".. ... ..\n" + "8 NaT 9\n" + "9 NaT 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - dts = [pd.NaT] * 5 + [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5 - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 NaT 1\n' - '1 NaT 2\n' - '.. ... ..\n' - '8 2011-01-01 00:00:00-05:00 9\n' - '9 2011-01-01 00:00:00-05:00 10\n\n' - '[10 rows x 2 columns]') + dts = [pd.NaT] * 5 + [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 NaT 1\n" + "1 NaT 2\n" + ".. ... ..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - dts = ([pd.Timestamp('2011-01-01', tz='Asia/Tokyo')] * 5 + - [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5) - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 2011-01-01 00:00:00+09:00 1\n' - '1 2011-01-01 00:00:00+09:00 2\n' - '.. ... ..\n' - '8 2011-01-01 00:00:00-05:00 9\n' - '9 2011-01-01 00:00:00-05:00 10\n\n' - '[10 rows x 2 columns]') + dts = [pd.Timestamp("2011-01-01", tz="Asia/Tokyo")] * 5 + [ + pd.Timestamp("2011-01-01", tz="US/Eastern") + ] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00+09:00 1\n" + "1 2011-01-01 00:00:00+09:00 2\n" + ".. ... 
..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - @pytest.mark.parametrize('start_date', [ - '2017-01-01 23:59:59.999999999', - '2017-01-01 23:59:59.99999999', - '2017-01-01 23:59:59.9999999', - '2017-01-01 23:59:59.999999', - '2017-01-01 23:59:59.99999', - '2017-01-01 23:59:59.9999', - ]) + @pytest.mark.parametrize( + "start_date", + [ + "2017-01-01 23:59:59.999999999", + "2017-01-01 23:59:59.99999999", + "2017-01-01 23:59:59.9999999", + "2017-01-01 23:59:59.999999", + "2017-01-01 23:59:59.99999", + "2017-01-01 23:59:59.9999", + ], + ) def test_datetimeindex_highprecision(self, start_date): # GH19030 # Check that high-precision time values for the end of day are # included in repr for DatetimeIndex - df = DataFrame({'A': date_range(start=start_date, - freq='D', periods=5)}) + df = DataFrame({"A": date_range(start=start_date, freq="D", periods=5)}) result = str(df) assert start_date in result - dti = date_range(start=start_date, - freq='D', periods=5) - df = DataFrame({'A': range(5)}, index=dti) + dti = date_range(start=start_date, freq="D", periods=5) + df = DataFrame({"A": range(5)}, index=dti) result = str(df.index) assert start_date in result def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() - lines = rep_str.split('\n') + lines = rep_str.split("\n") assert len(lines[1]) == len(lines[2]) def test_unicode_problem_decoding_as_ascii(self): - dm = DataFrame({'c/\u03c3': Series({'test': np.nan})}) + dm = DataFrame({"c/\u03c3": Series({"test": np.nan})}) str(dm.to_string()) def test_string_repr_encoding(self, datapath): - filepath = datapath('io', 'parser', 'data', 'unicode_series.csv') - df = pd.read_csv(filepath, header=None, encoding='latin1') + filepath = datapath("io", "parser", "data", "unicode_series.csv") + df = pd.read_csv(filepath, header=None, encoding="latin1") repr(df) repr(df[1]) def test_repr_corner(self): # representing infs poses no problems - df = DataFrame({'foo': [-np.inf, np.inf]}) + df = DataFrame({"foo": [-np.inf, np.inf]}) repr(df) def test_frame_info_encoding(self): - index = ['\'Til There Was You (1997)', - 'ldum klaka (Cold Fever) (1994)'] - fmt.set_option('display.max_rows', 1) - df = DataFrame(columns=['a', 'b', 'c'], index=index) + index = ["'Til There Was You (1997)", "ldum klaka (Cold Fever) (1994)"] + fmt.set_option("display.max_rows", 1) + df = DataFrame(columns=["a", "b", "c"], index=index) repr(df) repr(df.T) - fmt.set_option('display.max_rows', 200) + fmt.set_option("display.max_rows", 200) def test_wide_repr(self): - with option_context('mode.sim_interactive', True, - 'display.show_dimensions', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context( + "mode.sim_interactive", + True, + "display.show_dimensions", + True, + "display.max_columns", + 20, + ): + max_cols = get_option("display.max_columns") df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - set_option('display.expand_frame_repr', False) + set_option("display.expand_frame_repr", False) rep_str = repr(df) assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 120): + with option_context("display.width", 120): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - 
reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_wide_columns(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - df = DataFrame(np.random.randn(5, 3), - columns=['a' * 90, 'b' * 90, 'c' * 90]) + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + df = DataFrame( + np.random.randn(5, 3), columns=["a" * 90, "b" * 90, "c" * 90] + ) rep_str = repr(df) assert len(rep_str.splitlines()) == 20 def test_wide_repr_named(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - df.index.name = 'DataFrame Index' - set_option('display.expand_frame_repr', False) + df.index.name = "DataFrame Index" + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) for line in wide_repr.splitlines()[1::13]: - assert 'DataFrame Index' in line + assert "DataFrame Index" in line - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_multiindex(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - max_cols = get_option('display.max_columns') - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), - index=midx) - df.index.names = ['Level 0', 'Level 1'] - set_option('display.expand_frame_repr', False) + max_cols = get_option("display.max_columns") + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx) + df.index.names = ["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) for line in wide_repr.splitlines()[1::13]: - assert 'Level 0 Level 1' in line + assert "Level 0 Level 1" in line - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_multiindex_cols(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - mcols = MultiIndex.from_arrays( - tm.rands_array(3, size=(2, max_cols - 1))) - df = DataFrame(tm.rands_array(25, (10, max_cols - 1)), - index=midx, columns=mcols) - df.index.names = ['Level 0', 'Level 1'] - set_option('display.expand_frame_repr', False) + mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - 1))) + df = DataFrame( + tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols + ) + df.index.names = 
["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150, 'display.max_columns', 20): + with option_context("display.width", 150, "display.max_columns", 20): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_unicode(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): max_cols = 20 df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - set_option('display.expand_frame_repr', False) + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_wide_long_columns(self): - with option_context('mode.sim_interactive', True): - df = DataFrame({'a': ['a' * 30, 'b' * 30], - 'b': ['c' * 70, 'd' * 80]}) + with option_context("mode.sim_interactive", True): + df = DataFrame({"a": ["a" * 30, "b" * 30], "b": ["c" * 70, "d" * 80]}) result = repr(df) - assert 'ccccc' in result - assert 'ddddd' in result + assert "ccccc" in result + assert "ddddd" in result def test_long_series(self): n = 1000 s = Series( np.random.randint(-50, 50, n), - index=['s{x:04d}'.format(x=x) for x in range(n)], dtype='int64') + index=["s{x:04d}".format(x=x) for x in range(n)], + dtype="int64", + ) import re + str_rep = str(s) - nmatches = len(re.findall('dtype', str_rep)) + nmatches = len(re.findall("dtype", str_rep)) assert nmatches == 1 def test_index_with_nan(self): # GH 2850 - df = DataFrame({'id1': {0: '1a3', - 1: '9h4'}, - 'id2': {0: np.nan, - 1: 'd67'}, - 'id3': {0: '78d', - 1: '79d'}, - 'value': {0: 123, - 1: 64}}) + df = DataFrame( + { + "id1": {0: "1a3", 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: "78d", 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) # multi-index - y = df.set_index(['id1', 'id2', 'id3']) + y = df.set_index(["id1", "id2", "id3"]) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - '1a3 NaN 78d 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) assert result == expected # index - y = df.set_index('id2') + y = df.set_index("id2") result = y.to_string() - expected = (' id1 id3 value\nid2 \n' - 'NaN 1a3 78d 123\nd67 9h4 79d 64') + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nd67 9h4 79d 64" + ) assert result == expected # with append (this failed in 0.12) - y = df.set_index(['id1', 'id2']).set_index('id3', append=True) + y = df.set_index(["id1", "id2"]).set_index("id3", append=True) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - '1a3 NaN 78d 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) assert result == expected # all-nan in mi df2 = df.copy() - df2.loc[:, 'id2'] = np.nan - y = df2.set_index('id2') + df2.loc[:, "id2"] = np.nan + y = df2.set_index("id2") result = y.to_string() - expected = 
(' id1 id3 value\nid2 \n' - 'NaN 1a3 78d 123\nNaN 9h4 79d 64') + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nNaN 9h4 79d 64" + ) assert result == expected # partial nan in mi df2 = df.copy() - df2.loc[:, 'id2'] = np.nan - y = df2.set_index(['id2', 'id3']) + df2.loc[:, "id2"] = np.nan + y = df2.set_index(["id2", "id3"]) result = y.to_string() - expected = (' id1 value\nid2 id3 \n' - 'NaN 78d 1a3 123\n 79d 9h4 64') + expected = ( + " id1 value\nid2 id3 \n" + "NaN 78d 1a3 123\n 79d 9h4 64" + ) assert result == expected - df = DataFrame({'id1': {0: np.nan, - 1: '9h4'}, - 'id2': {0: np.nan, - 1: 'd67'}, - 'id3': {0: np.nan, - 1: '79d'}, - 'value': {0: 123, - 1: 64}}) + df = DataFrame( + { + "id1": {0: np.nan, 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: np.nan, 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) - y = df.set_index(['id1', 'id2', 'id3']) + y = df.set_index(["id1", "id2", "id3"]) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - 'NaN NaN NaN 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "NaN NaN NaN 123\n9h4 d67 79d 64" + ) assert result == expected def test_to_string(self): # big mixed - biggie = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=np.arange(200)) + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) - biggie.loc[:20, 'A'] = np.nan - biggie.loc[:20, 'B'] = np.nan + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan s = biggie.to_string() buf = StringIO() @@ -1206,37 +1355,37 @@ def test_to_string(self): assert isinstance(s, str) # print in right order - result = biggie.to_string(columns=['B', 'A'], col_space=17, - float_format='%.5f'.__mod__) - lines = result.split('\n') + result = biggie.to_string( + columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ + ) + lines = result.split("\n") header = lines[0].strip().split() - joined = '\n'.join(re.sub(r'\s+', ' ', x).strip() for x in lines[1:]) - recons = read_csv(StringIO(joined), names=header, - header=None, sep=' ') - tm.assert_series_equal(recons['B'], biggie['B']) - assert recons['A'].count() == biggie['A'].count() - assert (np.abs(recons['A'].dropna() - - biggie['A'].dropna()) < 0.1).all() + joined = "\n".join(re.sub(r"\s+", " ", x).strip() for x in lines[1:]) + recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") + tm.assert_series_equal(recons["B"], biggie["B"]) + assert recons["A"].count() == biggie["A"].count() + assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() # expected = ['B', 'A'] # assert header == expected - result = biggie.to_string(columns=['A'], col_space=17) - header = result.split('\n')[0].strip().split() - expected = ['A'] + result = biggie.to_string(columns=["A"], col_space=17) + header = result.split("\n")[0].strip().split() + expected = ["A"] assert header == expected - biggie.to_string(columns=['B', 'A'], - formatters={'A': lambda x: '{x:.1f}'.format(x=x)}) + biggie.to_string( + columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)} + ) - biggie.to_string(columns=['B', 'A'], float_format=str) - biggie.to_string(columns=['B', 'A'], col_space=12, float_format=str) + biggie.to_string(columns=["B", "A"], float_format=str) + biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) frame = DataFrame(index=np.arange(200)) frame.to_string() def test_to_string_no_header(self): - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 
6]}) df_s = df.to_string(header=False) expected = "0 1 4\n1 2 5\n2 3 6" @@ -1244,50 +1393,46 @@ def test_to_string_no_header(self): assert df_s == expected def test_to_string_specified_header(self): - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - df_s = df.to_string(header=['X', 'Y']) - expected = ' X Y\n0 1 4\n1 2 5\n2 3 6' + df_s = df.to_string(header=["X", "Y"]) + expected = " X Y\n0 1 4\n1 2 5\n2 3 6" assert df_s == expected with pytest.raises(ValueError): - df.to_string(header=['X']) + df.to_string(header=["X"]) def test_to_string_no_index(self): # GH 16839, GH 13032 - df = DataFrame({'x': [11, 22], 'y': [33, -44], 'z': ['AAA', ' ']}) + df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) df_s = df.to_string(index=False) # Leading space is expected for positive numbers. - expected = (" x y z\n" - " 11 33 AAA\n" - " 22 -44 ") + expected = " x y z\n" " 11 33 AAA\n" " 22 -44 " assert df_s == expected - df_s = df[['y', 'x', 'z']].to_string(index=False) - expected = (" y x z\n" - " 33 11 AAA\n" - "-44 22 ") + df_s = df[["y", "x", "z"]].to_string(index=False) + expected = " y x z\n" " 33 11 AAA\n" "-44 22 " assert df_s == expected def test_to_string_line_width_no_index(self): # GH 13998, GH 22505 - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected - df = DataFrame({'x': [11, 22, 33], 'y': [4, 5, 6]}) + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected - df = DataFrame({'x': [11, 22, -33], 'y': [4, 5, -6]}) + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " @@ -1296,129 +1441,147 @@ def test_to_string_line_width_no_index(self): def test_to_string_float_formatting(self): tm.reset_display_options() - fmt.set_option('display.precision', 5, 'display.column_space', 12, - 'display.notebook_repr_html', False) + fmt.set_option( + "display.precision", + 5, + "display.column_space", + 12, + "display.notebook_repr_html", + False, + ) - df = DataFrame({'x': [0, 0.25, 3456.000, 12e+45, 1.64e+6, 1.7e+8, - 1.253456, np.pi, -1e6]}) + df = DataFrame( + {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} + ) df_s = df.to_string() if _three_digit_exp(): - expected = (' x\n0 0.00000e+000\n1 2.50000e-001\n' - '2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n' - '5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n' - '8 -1.00000e+006') + expected = ( + " x\n0 0.00000e+000\n1 2.50000e-001\n" + "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" + "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" + "8 -1.00000e+006" + ) else: - expected = (' x\n0 0.00000e+00\n1 2.50000e-01\n' - '2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n' - '5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n' - '8 -1.00000e+06') + expected = ( + " x\n0 0.00000e+00\n1 2.50000e-01\n" + "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" + "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" + "8 -1.00000e+06" + ) assert df_s == expected - df = DataFrame({'x': [3234, 0.253]}) + df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string() - expected = (' x\n' '0 3234.000\n' '1 0.253') + expected = " x\n" "0 3234.000\n" "1 0.253" assert df_s == expected 
tm.reset_display_options() assert get_option("display.precision") == 6 - df = DataFrame({'x': [1e9, 0.2512]}) + df = DataFrame({"x": [1e9, 0.2512]}) df_s = df.to_string() if _three_digit_exp(): - expected = (' x\n' - '0 1.000000e+009\n' - '1 2.512000e-001') + expected = " x\n" "0 1.000000e+009\n" "1 2.512000e-001" else: - expected = (' x\n' - '0 1.000000e+09\n' - '1 2.512000e-01') + expected = " x\n" "0 1.000000e+09\n" "1 2.512000e-01" assert df_s == expected def test_to_string_float_format_no_fixed_width(self): # GH 21625 - df = DataFrame({'x': [0.19999]}) - expected = ' x\n0 0.200' - assert df.to_string(float_format='%.3f') == expected + df = DataFrame({"x": [0.19999]}) + expected = " x\n0 0.200" + assert df.to_string(float_format="%.3f") == expected # GH 22270 - df = DataFrame({'x': [100.0]}) - expected = ' x\n0 100' - assert df.to_string(float_format='%.0f') == expected + df = DataFrame({"x": [100.0]}) + expected = " x\n0 100" + assert df.to_string(float_format="%.0f") == expected def test_to_string_small_float_values(self): - df = DataFrame({'a': [1.5, 1e-17, -5.5e-7]}) + df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) result = df.to_string() # sadness per above - if '{x:.4g}'.format(x=1.7e8) == '1.7e+008': - expected = (' a\n' - '0 1.500000e+000\n' - '1 1.000000e-017\n' - '2 -5.500000e-007') + if "{x:.4g}".format(x=1.7e8) == "1.7e+008": + expected = ( + " a\n" + "0 1.500000e+000\n" + "1 1.000000e-017\n" + "2 -5.500000e-007" + ) else: - expected = (' a\n' - '0 1.500000e+00\n' - '1 1.000000e-17\n' - '2 -5.500000e-07') + expected = ( + " a\n" + "0 1.500000e+00\n" + "1 1.000000e-17\n" + "2 -5.500000e-07" + ) assert result == expected # but not all exactly zero df = df * 0 result = df.to_string() - expected = (' 0\n' '0 0\n' '1 0\n' '2 -0') + expected = " 0\n" "0 0\n" "1 0\n" "2 -0" def test_to_string_float_index(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.arange(5), index=index) result = df.to_string() - expected = (' 0\n' - '1.5 0\n' - '2.0 1\n' - '3.0 2\n' - '4.0 3\n' - '5.0 4') + expected = " 0\n" "1.5 0\n" "2.0 1\n" "3.0 2\n" "4.0 3\n" "5.0 4" assert result == expected def test_to_string_complex_float_formatting(self): # GH #25514 - with pd.option_context('display.precision', 5): - df = DataFrame({'x': [ - (0.4467846931321966 + 0.0715185102060818j), - (0.2739442392974528 + 0.23515228785438969j), - (0.26974928742135185 + 0.3250604054898979j)]}) + with pd.option_context("display.precision", 5): + df = DataFrame( + { + "x": [ + (0.4467846931321966 + 0.0715185102060818j), + (0.2739442392974528 + 0.23515228785438969j), + (0.26974928742135185 + 0.3250604054898979j), + ] + } + ) result = df.to_string() - expected = (' x\n0 0.44678+0.07152j\n' - '1 0.27394+0.23515j\n' - '2 0.26975+0.32506j') + expected = ( + " x\n0 0.44678+0.07152j\n" + "1 0.27394+0.23515j\n" + "2 0.26975+0.32506j" + ) assert result == expected def test_to_string_ascii_error(self): - data = [('0 ', ' .gitignore ', ' 5 ', - ' \xe2\x80\xa2\xe2\x80\xa2\xe2\x80' - '\xa2\xe2\x80\xa2\xe2\x80\xa2')] + data = [ + ( + "0 ", + " .gitignore ", + " 5 ", + " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80" "\xa2\xe2\x80\xa2\xe2\x80\xa2", + ) + ] df = DataFrame(data) # it works! 
repr(df) def test_to_string_int_formatting(self): - df = DataFrame({'x': [-15, 20, 25, -35]}) - assert issubclass(df['x'].dtype.type, np.integer) + df = DataFrame({"x": [-15, 20, 25, -35]}) + assert issubclass(df["x"].dtype.type, np.integer) output = df.to_string() - expected = (' x\n' '0 -15\n' '1 20\n' '2 25\n' '3 -35') + expected = " x\n" "0 -15\n" "1 20\n" "2 25\n" "3 -35" assert output == expected def test_to_string_index_formatter(self): df = DataFrame([range(5), range(5, 10), range(10, 15)]) - rs = df.to_string(formatters={'__index__': lambda x: 'abc' [x]}) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) xp = """\ 0 1 2 3 4 @@ -1431,220 +1594,280 @@ def test_to_string_index_formatter(self): def test_to_string_left_justify_cols(self): tm.reset_display_options() - df = DataFrame({'x': [3234, 0.253]}) - df_s = df.to_string(justify='left') - expected = (' x \n' '0 3234.000\n' '1 0.253') + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string(justify="left") + expected = " x \n" "0 3234.000\n" "1 0.253" assert df_s == expected def test_to_string_format_na(self): tm.reset_display_options() - df = DataFrame({'A': [np.nan, -1, -2.1234, 3, 4], - 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + df = DataFrame( + { + "A": [np.nan, -1, -2.1234, 3, 4], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 NaN NaN\n' - '1 -1.0000 foo\n' - '2 -2.1234 foooo\n' - '3 3.0000 fooooo\n' - '4 4.0000 bar') + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0000 foo\n" + "2 -2.1234 foooo\n" + "3 3.0000 fooooo\n" + "4 4.0000 bar" + ) assert result == expected - df = DataFrame({'A': [np.nan, -1., -2., 3., 4.], - 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + df = DataFrame( + { + "A": [np.nan, -1.0, -2.0, 3.0, 4.0], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 NaN NaN\n' - '1 -1.0 foo\n' - '2 -2.0 foooo\n' - '3 3.0 fooooo\n' - '4 4.0 bar') + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0 foo\n" + "2 -2.0 foooo\n" + "3 3.0 fooooo\n" + "4 4.0 bar" + ) assert result == expected def test_to_string_format_inf(self): # Issue #24861 tm.reset_display_options() - df = DataFrame({ - 'A': [-np.inf, np.inf, -1, -2.1234, 3, 4], - 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar'] - }) + df = DataFrame( + { + "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 -inf -inf\n' - '1 inf inf\n' - '2 -1.0000 foo\n' - '3 -2.1234 foooo\n' - '4 3.0000 fooooo\n' - '5 4.0000 bar') + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0000 foo\n" + "3 -2.1234 foooo\n" + "4 3.0000 fooooo\n" + "5 4.0000 bar" + ) assert result == expected - df = DataFrame({ - 'A': [-np.inf, np.inf, -1., -2., 3., 4.], - 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar'] - }) + df = DataFrame( + { + "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 -inf -inf\n' - '1 inf inf\n' - '2 -1.0 foo\n' - '3 -2.0 foooo\n' - '4 3.0 fooooo\n' - '5 4.0 bar') + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0 foo\n" + "3 -2.0 foooo\n" + "4 3.0 fooooo\n" + "5 4.0 bar" + ) assert result == expected def test_to_string_decimal(self): # Issue #23614 - df = DataFrame({'A': [6.0, 3.1, 2.2]}) - expected = ' A\n0 6,0\n1 3,1\n2 2,2' - assert df.to_string(decimal=',') 
== expected + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + expected = " A\n0 6,0\n1 3,1\n2 2,2" + assert df.to_string(decimal=",") == expected def test_to_string_line_width(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) s = df.to_string(line_width=80) - assert max(len(l) for l in s.split('\n')) == 80 + assert max(len(l) for l in s.split("\n")) == 80 def test_show_dimensions(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', True): - assert '5 rows' in str(df) - assert '5 rows' in df._repr_html_() - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', False): - assert '5 rows' not in str(df) - assert '5 rows' not in df._repr_html_() - with option_context('display.max_rows', 2, 'display.max_columns', 2, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', 'truncate'): - assert '5 rows' in str(df) - assert '5 rows' in df._repr_html_() - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', 'truncate'): - assert '5 rows' not in str(df) - assert '5 rows' not in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + True, + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + False, + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + with option_context( + "display.max_rows", + 2, + "display.max_columns", + 2, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() def test_repr_html(self, float_frame): df = float_frame df._repr_html_() - fmt.set_option('display.max_rows', 1, 'display.max_columns', 1) + fmt.set_option("display.max_rows", 1, "display.max_columns", 1) df._repr_html_() - fmt.set_option('display.notebook_repr_html', False) + fmt.set_option("display.notebook_repr_html", False) df._repr_html_() tm.reset_display_options() df = DataFrame([[1, 2], [3, 4]]) - fmt.set_option('display.show_dimensions', True) - assert '2 rows' in df._repr_html_() - fmt.set_option('display.show_dimensions', False) - assert '2 rows' not in df._repr_html_() + fmt.set_option("display.show_dimensions", True) + assert "2 rows" in df._repr_html_() + fmt.set_option("display.show_dimensions", False) + assert "2 rows" not in df._repr_html_() tm.reset_display_options() def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) - assert 'tex2jax_ignore' not in df._repr_html_() + assert "tex2jax_ignore" not in df._repr_html_() - with pd.option_context('display.html.use_mathjax', False): - assert 'tex2jax_ignore' in 
df._repr_html_() + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in df._repr_html_() def test_repr_html_wide(self): max_cols = 20 df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." not in df._repr_html_() wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1))) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." in wide_df._repr_html_() def test_repr_html_wide_multiindex_cols(self): max_cols = 20 - mcols = MultiIndex.from_product([np.arange(max_cols // 2), - ['foo', 'bar']], - names=['first', 'second']) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), - columns=mcols) + mcols = MultiIndex.from_product( + [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) reg_repr = df._repr_html_() - assert '...' not in reg_repr + assert "..." not in reg_repr - mcols = MultiIndex.from_product((np.arange(1 + (max_cols // 2)), - ['foo', 'bar']), - names=['first', 'second']) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), - columns=mcols) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' in df._repr_html_() + mcols = MultiIndex.from_product( + (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_repr_html_long(self): - with option_context('display.max_rows', 60): - max_rows = get_option('display.max_rows') + with option_context("display.max_rows", 60): + max_rows = get_option("display.max_rows") h = max_rows - 1 - df = DataFrame({'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}) + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) reg_repr = df._repr_html_() - assert '..' not in reg_repr + assert ".." not in reg_repr assert str(41 + max_rows // 2) in reg_repr h = max_rows + 1 - df = DataFrame({'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}) + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) long_repr = df._repr_html_() - assert '..' in long_repr + assert ".." in long_repr assert str(41 + max_rows // 2) not in long_repr - assert '{h} rows '.format(h=h) in long_repr - assert '2 columns' in long_repr + assert "{h} rows ".format(h=h) in long_repr + assert "2 columns" in long_repr def test_repr_html_float(self): - with option_context('display.max_rows', 60): + with option_context("display.max_rows", 60): - max_rows = get_option('display.max_rows') + max_rows = get_option("display.max_rows") h = max_rows - 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") reg_repr = df._repr_html_() - assert '..' not in reg_repr - assert '{val}'.format(val=str(40 + h)) in reg_repr + assert ".." 
not in reg_repr + assert "{val}".format(val=str(40 + h)) in reg_repr h = max_rows + 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") long_repr = df._repr_html_() - assert '..' in long_repr - assert '{val}'.format(val='31') not in long_repr - assert '{h} rows '.format(h=h) in long_repr - assert '2 columns' in long_repr + assert ".." in long_repr + assert "{val}".format(val="31") not in long_repr + assert "{h} rows ".format(h=h) in long_repr + assert "2 columns" in long_repr def test_repr_html_long_multiindex(self): max_rows = 60 max_L1 = max_rows // 2 - tuples = list(itertools.product(np.arange(max_L1), ['foo', 'bar'])) - idx = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx, - columns=['A', 'B']) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx, columns=["A", "B"]) + with option_context("display.max_rows", 60, "display.max_columns", 20): reg_repr = df._repr_html_() - assert '...' not in reg_repr + assert "..." not in reg_repr - tuples = list(itertools.product(np.arange(max_L1 + 1), ['foo', 'bar'])) - idx = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(np.random.randn((max_L1 + 1) * 2, 2), index=idx, - columns=['A', 'B']) + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.randn((max_L1 + 1) * 2, 2), index=idx, columns=["A", "B"] + ) long_repr = df._repr_html_() - assert '...' in long_repr + assert "..." in long_repr def test_repr_html_long_and_wide(self): max_cols = 20 @@ -1652,13 +1875,13 @@ def test_repr_html_long_and_wide(self): h, w = max_rows - 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' not in df._repr_html_() + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() h, w = max_rows + 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' in df._repr_html_() + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_info_repr(self): # GH#21746 For tests inside a terminal (i.e. 
not CI) we need to detect @@ -1671,28 +1894,39 @@ def test_info_repr(self): h, w = max_rows + 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_vertically_truncated_repr(df) - with option_context('display.large_repr', 'info'): + with option_context("display.large_repr", "info"): assert has_info_repr(df) # Wide h, w = max_rows - 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_horizontally_truncated_repr(df) - with option_context('display.large_repr', 'info', - 'display.max_columns', max_cols): + with option_context( + "display.large_repr", "info", "display.max_columns", max_cols + ): assert has_info_repr(df) def test_info_repr_max_cols(self): # GH #6939 df = DataFrame(np.random.randn(10, 5)) - with option_context('display.large_repr', 'info', - 'display.max_columns', 1, - 'display.max_info_columns', 4): + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 4, + ): assert has_non_verbose_info_repr(df) - with option_context('display.large_repr', 'info', - 'display.max_columns', 1, - 'display.max_info_columns', 5): + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 5, + ): assert not has_non_verbose_info_repr(df) # test verbose overrides @@ -1704,32 +1938,32 @@ def test_info_repr_html(self): # Long h, w = max_rows + 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert r'<class' not in df._repr_html_() - with option_context('display.large_repr', 'info'): - assert r'<class' in df._repr_html_() + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() # Wide h, w = max_rows - 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert ' never truncate - assert '..' not in repr(s) + assert ".." 
not in repr(s) def test_to_string_name(self): - s = Series(range(100), dtype='int64') - s.name = 'myser' + s = Series(range(100), dtype="int64") + s.name = "myser" res = s.to_string(max_rows=2, name=True) - exp = '0 0\n ..\n99 99\nName: myser' + exp = "0 0\n ..\n99 99\nName: myser" assert res == exp res = s.to_string(max_rows=2, name=False) - exp = '0 0\n ..\n99 99' + exp = "0 0\n ..\n99 99" assert res == exp def test_to_string_dtype(self): - s = Series(range(100), dtype='int64') + s = Series(range(100), dtype="int64") res = s.to_string(max_rows=2, dtype=True) - exp = '0 0\n ..\n99 99\ndtype: int64' + exp = "0 0\n ..\n99 99\ndtype: int64" assert res == exp res = s.to_string(max_rows=2, dtype=False) - exp = '0 0\n ..\n99 99' + exp = "0 0\n ..\n99 99" assert res == exp def test_to_string_length(self): - s = Series(range(100), dtype='int64') + s = Series(range(100), dtype="int64") res = s.to_string(max_rows=2, length=True) - exp = '0 0\n ..\n99 99\nLength: 100' + exp = "0 0\n ..\n99 99\nLength: 100" assert res == exp def test_to_string_na_rep(self): s = pd.Series(index=range(100)) - res = s.to_string(na_rep='foo', max_rows=2) - exp = '0 foo\n ..\n99 foo' + res = s.to_string(na_rep="foo", max_rows=2) + exp = "0 foo\n ..\n99 foo" assert res == exp def test_to_string_float_format(self): - s = pd.Series(range(10), dtype='float64') - res = s.to_string(float_format=lambda x: '{0:2.1f}'.format(x), - max_rows=2) - exp = '0 0.0\n ..\n9 9.0' + s = pd.Series(range(10), dtype="float64") + res = s.to_string(float_format=lambda x: "{0:2.1f}".format(x), max_rows=2) + exp = "0 0.0\n ..\n9 9.0" assert res == exp def test_to_string_header(self): - s = pd.Series(range(10), dtype='int64') - s.index.name = 'foo' + s = pd.Series(range(10), dtype="int64") + s.index.name = "foo" res = s.to_string(header=True, max_rows=2) - exp = 'foo\n0 0\n ..\n9 9' + exp = "foo\n0 0\n ..\n9 9" assert res == exp res = s.to_string(header=False, max_rows=2) - exp = '0 0\n ..\n9 9' + exp = "0 0\n ..\n9 9" assert res == exp def test_to_string_multindex_header(self): # GH 16718 - df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) - .set_index(['a', 'b'])) - res = df.to_string(header=['r1', 'r2']) - exp = ' r1 r2\na b \n0 1 2 3' + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + res = df.to_string(header=["r1", "r2"]) + exp = " r1 r2\na b \n0 1 2 3" assert res == exp def _three_digit_exp(): - return '{x:.4g}'.format(x=1.7e8) == '1.7e+008' + return "{x:.4g}".format(x=1.7e8) == "1.7e+008" class TestFloatArrayFormatter: - def test_misc(self): obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) result = obj.get_result() @@ -2418,57 +2724,70 @@ def test_output_significant_digits(self): # Issue #9764 # In case default display precision changes: - with pd.option_context('display.precision', 6): + with pd.option_context("display.precision", 6): # DataFrame example from issue #9764 d = pd.DataFrame( - {'col1': [9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, - 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, - 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) + { + "col1": [ + 9.999e-8, + 1e-7, + 1.0001e-7, + 2e-7, + 4.999e-7, + 5e-7, + 5.0001e-7, + 6e-7, + 9.999e-7, + 1e-6, + 1.0001e-6, + 2e-6, + 4.999e-6, + 5e-6, + 5.0001e-6, + 6e-6, + ] + } + ) expected_output = { - (0, 6): - ' col1\n' - '0 9.999000e-08\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 4.999000e-07\n' - '5 5.000000e-07', - (1, 6): - ' col1\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 
4.999000e-07\n' - '5 5.000000e-07', - (1, 8): - ' col1\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 4.999000e-07\n' - '5 5.000000e-07\n' - '6 5.000100e-07\n' - '7 6.000000e-07', - (8, 16): - ' col1\n' - '8 9.999000e-07\n' - '9 1.000000e-06\n' - '10 1.000100e-06\n' - '11 2.000000e-06\n' - '12 4.999000e-06\n' - '13 5.000000e-06\n' - '14 5.000100e-06\n' - '15 6.000000e-06', - (9, 16): - ' col1\n' - '9 0.000001\n' - '10 0.000001\n' - '11 0.000002\n' - '12 0.000005\n' - '13 0.000005\n' - '14 0.000005\n' - '15 0.000006' + (0, 6): " col1\n" + "0 9.999000e-08\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 6): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 8): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07\n" + "6 5.000100e-07\n" + "7 6.000000e-07", + (8, 16): " col1\n" + "8 9.999000e-07\n" + "9 1.000000e-06\n" + "10 1.000100e-06\n" + "11 2.000000e-06\n" + "12 4.999000e-06\n" + "13 5.000000e-06\n" + "14 5.000100e-06\n" + "15 6.000000e-06", + (9, 16): " col1\n" + "9 0.000001\n" + "10 0.000001\n" + "11 0.000002\n" + "12 0.000005\n" + "13 0.000005\n" + "14 0.000005\n" + "15 0.000006", } for (start, stop), v in expected_output.items(): @@ -2476,24 +2795,23 @@ def test_output_significant_digits(self): def test_too_long(self): # GH 10451 - with pd.option_context('display.precision', 4): + with pd.option_context("display.precision", 4): # need both a number > 1e6 and something that normally formats to # having length > display.precision + 6 df = pd.DataFrame(dict(x=[12345.6789])) - assert str(df) == ' x\n0 12345.6789' + assert str(df) == " x\n0 12345.6789" df = pd.DataFrame(dict(x=[2e6])) - assert str(df) == ' x\n0 2000000.0' + assert str(df) == " x\n0 2000000.0" df = pd.DataFrame(dict(x=[12345.6789, 2e6])) - assert str(df) == ' x\n0 1.2346e+04\n1 2.0000e+06' + assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" class TestRepr_timedelta64: - def test_none(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") drepr = lambda x: x._repr_base() assert drepr(delta_1d) == "1 days" @@ -2507,12 +2825,12 @@ def test_none(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_sub_day(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") - drepr = lambda x: x._repr_base(format='sub_day') + drepr = lambda x: x._repr_base(format="sub_day") assert drepr(delta_1d) == "1 days" assert drepr(-delta_1d) == "-1 days" assert drepr(delta_0d) == "00:00:00" @@ -2524,12 +2842,12 @@ def test_sub_day(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_long(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = 
pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") - drepr = lambda x: x._repr_base(format='long') + drepr = lambda x: x._repr_base(format="long") assert drepr(delta_1d) == "1 days 00:00:00" assert drepr(-delta_1d) == "-1 days +00:00:00" assert drepr(delta_0d) == "0 days 00:00:00" @@ -2541,11 +2859,11 @@ def test_long(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_all(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1ns = pd.to_timedelta(1, unit='ns') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1ns = pd.to_timedelta(1, unit="ns") - drepr = lambda x: x._repr_base(format='all') + drepr = lambda x: x._repr_base(format="all") assert drepr(delta_1d) == "1 days 00:00:00.000000000" assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" assert drepr(delta_0d) == "0 days 00:00:00.000000000" @@ -2554,9 +2872,8 @@ def test_all(self): class TestTimedelta64Formatter: - def test_days(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'1 days'" @@ -2572,35 +2889,34 @@ def test_days(self): assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(-x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'-1 days'" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(y, box=True).get_result() assert result[0].strip() == "'00:00:00'" assert result[1].strip() == "'00:00:01'" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(-y, box=True).get_result() assert result[0].strip() == "'00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" - x = pd.to_timedelta(list(range(1)), unit='D') + x = pd.to_timedelta(list(range(1)), unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" class TestDatetime64Formatter: - def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) result = fmt.Datetime64Formatter(x).get_result() @@ -2622,35 +2938,35 @@ def test_dates_display(self): # 10170 # make sure that we are consistently display date formatting - x = Series(date_range('20130101 09:00:00', periods=5, freq='D')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" - x = Series(date_range('20130101 09:00:00', periods=5, freq='s')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = 
np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" - x = Series(date_range('20130101 09:00:00', periods=5, freq='ms')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" - x = Series(date_range('20130101 09:00:00', periods=5, freq='us')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" - x = Series(date_range('20130101 09:00:00', periods=5, freq='N')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="N")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" @@ -2661,27 +2977,27 @@ def test_datetime64formatter_yearmonth(self): x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) def format_func(x): - return x.strftime('%Y-%m') + return x.strftime("%Y-%m") formatter = fmt.Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() - assert result == ['2016-01', '2016-02'] + assert result == ["2016-01", "2016-02"] def test_datetime64formatter_hoursecond(self): - x = Series(pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')) + x = Series( + pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") + ) def format_func(x): - return x.strftime('%H:%M') + return x.strftime("%H:%M") formatter = fmt.Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() - assert result == ['10:10', '12:12'] + assert result == ["10:10", "12:12"] class TestNaTFormatting: - def test_repr(self): assert repr(pd.NaT) == "NaT" @@ -2690,7 +3006,6 @@ def test_str(self): class TestDatetimeIndexFormat: - def test_datetime(self): formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" @@ -2705,34 +3020,34 @@ def test_date_tz(self): formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - formatted = pd.to_datetime( - [datetime(2013, 1, 1), pd.NaT], utc=True).format() + formatted = pd.to_datetime([datetime(2013, 1, 1), pd.NaT], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" def test_date_explicit_date_format(self): formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( - date_format="%m-%d-%Y", na_rep="UT") + date_format="%m-%d-%Y", na_rep="UT" + ) assert formatted[0] == "02-01-2003" assert formatted[1] == "UT" class TestDatetimeIndexUnicode: - def test_dates(self): - text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1) - ])) + text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) assert "['2013-01-01'," in text assert ", '2014-01-01']" in text def test_mixed(self): - text = str(pd.to_datetime([datetime(2013, 1, 1), datetime( - 2014, 1, 1, 12), datetime(2014, 1, 1)])) + text = str( + pd.to_datetime( + [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] + ) + ) assert "'2013-01-01 00:00:00'," in text assert "'2014-01-01 00:00:00']" in text class 
TestStringRepTimestamp: - def test_no_tz(self): dt_date = datetime(2013, 1, 2) assert str(dt_date) == str(Timestamp(dt_date)) @@ -2772,17 +3087,17 @@ def test_tz_dateutil(self): assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_nat_representations(self): - for f in (str, repr, methodcaller('isoformat')): - assert f(pd.NaT) == 'NaT' + for f in (str, repr, methodcaller("isoformat")): + assert f(pd.NaT) == "NaT" def test_format_percentiles(): result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) - expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + expected = ["1.999%", "2.001%", "50%", "66.667%", "99.99%"] assert result == expected result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) - expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + expected = ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"] assert result == expected msg = r"percentiles should all be in the interval \[0,1\]" @@ -2793,19 +3108,31 @@ def test_format_percentiles(): with pytest.raises(ValueError, match=msg): fmt.format_percentiles([2, 0.1, 0.5]) with pytest.raises(ValueError, match=msg): - fmt.format_percentiles([0.1, 0.5, 'a']) + fmt.format_percentiles([0.1, 0.5, "a"]) def test_format_percentiles_integer_idx(): # Issue #26660 result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) - expected = ['0%', '10%', '20%', '30%', '40%', '50%', - '60%', '70%', '80%', '90%', '100%'] + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] assert result == expected def test_repr_html_ipython_config(ip): - code = textwrap.dedent("""\ + code = textwrap.dedent( + """\ import pandas as pd df = pd.DataFrame({"A": [1, 2]}) df._repr_html_() @@ -2813,6 +3140,7 @@ def test_repr_html_ipython_config(ip): cfg = get_ipython().config cfg['IPKernelApp']['parent_appname'] df._repr_html_() - """) + """ + ) result = ip.run_cell(code) assert not result.error_in_exec diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 1ea7c4d94bbbe8..f0d5ef19c44682 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -10,22 +10,23 @@ def test_adjoin(): - data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] - expected = 'a dd ggg\nb ee hhh\nc ff iii' + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" adjoined = printing.adjoin(2, *data) - assert (adjoined == expected) + assert adjoined == expected def test_repr_binary_type(): import string + letters = string.ascii_letters try: - raw = bytes(letters, encoding=cf.get_option('display.encoding')) + raw = bytes(letters, encoding=cf.get_option("display.encoding")) except TypeError: raw = bytes(letters) - b = str(raw.decode('utf-8')) + b = str(raw.decode("utf-8")) res = printing.pprint_thing(b, quote_strings=True) assert res == repr(b) res = printing.pprint_thing(b, quote_strings=False) @@ -33,18 +34,17 @@ def test_repr_binary_type(): class TestFormattBase: - def test_adjoin(self): - data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] - expected = 'a dd ggg\nb ee hhh\nc ff iii' + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" adjoined = printing.adjoin(2, *data) assert adjoined == expected def test_adjoin_unicode(self): - data = [['あ', 'b', 'c'], ['dd', 'ええ', 'ff'], ['ggg', 'hhh', 'いいい']] - expected = 'あ dd ggg\nb ええ hhh\nc ff 
いいい' + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]] + expected = "あ dd ggg\nb ええ hhh\nc ff いいい" adjoined = printing.adjoin(2, *data) assert adjoined == expected @@ -56,7 +56,7 @@ def test_adjoin_unicode(self): adjoined = adj.adjoin(2, *data) assert adjoined == expected - cols = adjoined.split('\n') + cols = adjoined.split("\n") assert adj.len(cols[0]) == 13 assert adj.len(cols[1]) == 13 assert adj.len(cols[2]) == 16 @@ -67,7 +67,7 @@ def test_adjoin_unicode(self): adjoined = adj.adjoin(7, *data) assert adjoined == expected - cols = adjoined.split('\n') + cols = adjoined.split("\n") assert adj.len(cols[0]) == 23 assert adj.len(cols[1]) == 23 assert adj.len(cols[2]) == 26 @@ -79,92 +79,95 @@ def just(x, *args, **kwargs): # wrapper to test single str return adj.justify([x], *args, **kwargs)[0] - assert just('abc', 5, mode='left') == 'abc ' - assert just('abc', 5, mode='center') == ' abc ' - assert just('abc', 5, mode='right') == ' abc' - assert just('abc', 5, mode='left') == 'abc ' - assert just('abc', 5, mode='center') == ' abc ' - assert just('abc', 5, mode='right') == ' abc' + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" - assert just('パンダ', 5, mode='left') == 'パンダ' - assert just('パンダ', 5, mode='center') == 'パンダ' - assert just('パンダ', 5, mode='right') == 'パンダ' + assert just("パンダ", 5, mode="left") == "パンダ" + assert just("パンダ", 5, mode="center") == "パンダ" + assert just("パンダ", 5, mode="right") == "パンダ" - assert just('パンダ', 10, mode='left') == 'パンダ ' - assert just('パンダ', 10, mode='center') == ' パンダ ' - assert just('パンダ', 10, mode='right') == ' パンダ' + assert just("パンダ", 10, mode="left") == "パンダ " + assert just("パンダ", 10, mode="center") == " パンダ " + assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): adj = fmt.EastAsianTextAdjustment() - assert adj.len('abc') == 3 - assert adj.len('abc') == 3 + assert adj.len("abc") == 3 + assert adj.len("abc") == 3 - assert adj.len('パンダ') == 6 - assert adj.len('パンダ') == 5 - assert adj.len('パンダpanda') == 11 - assert adj.len('パンダpanda') == 10 + assert adj.len("パンダ") == 6 + assert adj.len("パンダ") == 5 + assert adj.len("パンダpanda") == 11 + assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): adj = fmt.EastAsianTextAdjustment() - assert adj.len('¡¡ab') == 4 + assert adj.len("¡¡ab") == 4 - with cf.option_context('display.unicode.ambiguous_as_wide', True): + with cf.option_context("display.unicode.ambiguous_as_wide", True): adj = fmt.EastAsianTextAdjustment() - assert adj.len('¡¡ab') == 6 + assert adj.len("¡¡ab") == 6 - data = [['あ', 'b', 'c'], ['dd', 'ええ', 'ff'], - ['ggg', '¡¡ab', 'いいい']] - expected = 'あ dd ggg \nb ええ ¡¡ab\nc ff いいい' + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] + expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい" adjoined = adj.adjoin(2, *data) assert adjoined == expected class TestTableSchemaRepr: - @classmethod def setup_class(cls): - pytest.importorskip('IPython') + pytest.importorskip("IPython") from IPython.core.interactiveshell import InteractiveShell + cls.display_formatter = InteractiveShell.instance().display_formatter def test_publishes(self): df = pd.DataFrame({"A": [1, 2]}) - objects = [df['A'], df, df] # dataframe / series + objects = [df["A"], df, df] # dataframe / series expected_keys = [ - {'text/plain', 
'application/vnd.dataresource+json'}, - {'text/plain', 'text/html', 'application/vnd.dataresource+json'}, + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, ] - opt = pd.option_context('display.html.table_schema', True) + opt = pd.option_context("display.html.table_schema", True) for obj, expected in zip(objects, expected_keys): with opt: formatted = self.display_formatter.format(obj) assert set(formatted[0].keys()) == expected - with_latex = pd.option_context('display.latex.repr', True) + with_latex = pd.option_context("display.latex.repr", True) with opt, with_latex: formatted = self.display_formatter.format(obj) - expected = {'text/plain', 'text/html', 'text/latex', - 'application/vnd.dataresource+json'} + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } assert set(formatted[0].keys()) == expected def test_publishes_not_implemented(self): # column MultiIndex # GH 15996 - midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']]) + midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx) - opt = pd.option_context('display.html.table_schema', True) + opt = pd.option_context("display.html.table_schema", True) with opt: formatted = self.display_formatter.format(df) - expected = {'text/plain', 'text/html'} + expected = {"text/plain", "text/html"} assert set(formatted[0].keys()) == expected def test_config_on(self): @@ -184,19 +187,19 @@ def test_config_default_off(self): def test_enable_data_resource_formatter(self): # GH 10491 formatters = self.display_formatter.formatters - mimetype = 'application/vnd.dataresource+json' + mimetype = "application/vnd.dataresource+json" - with pd.option_context('display.html.table_schema', True): - assert 'application/vnd.dataresource+json' in formatters + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters assert formatters[mimetype].enabled # still there, just disabled - assert 'application/vnd.dataresource+json' in formatters + assert "application/vnd.dataresource+json" in formatters assert not formatters[mimetype].enabled # able to re-set - with pd.option_context('display.html.table_schema', True): - assert 'application/vnd.dataresource+json' in formatters + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters assert formatters[mimetype].enabled # smoke test that it works self.display_formatter.format(cf) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index dce3bb3b420d48..7bd27b2ad9be32 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -11,30 +11,29 @@ from pandas import DataFrame import pandas.util.testing as tm -jinja2 = pytest.importorskip('jinja2') +jinja2 = pytest.importorskip("jinja2") from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip class TestStyler: - def setup_method(self, method): np.random.seed(24) - self.s = DataFrame({'A': np.random.permutation(range(6))}) - self.df = DataFrame({'A': [0, 1], 'B': np.random.randn(2)}) + self.s = DataFrame({"A": np.random.permutation(range(6))}) + self.df = DataFrame({"A": [0, 1], "B": np.random.randn(2)}) self.f = lambda x: x self.g = lambda x: x - def h(x, foo='bar'): - return pd.Series( - 'color: {foo}'.format(foo=foo), index=x.index, name=x.name) + def 
h(x, foo="bar"): + return pd.Series("color: {foo}".format(foo=foo), index=x.index, name=x.name) self.h = h self.styler = Styler(self.df) - self.attrs = pd.DataFrame({'A': ['color: red', 'color: blue']}) + self.attrs = pd.DataFrame({"A": ["color: red", "color: blue"]}) self.dataframes = [ self.df, - pd.DataFrame({'f': [1., 2.], 'o': ['a', 'b'], - 'c': pd.Categorical(['a', 'b'])}) + pd.DataFrame( + {"f": [1.0, 2.0], "o": ["a", "b"], "c": pd.Categorical(["a", "b"])} + ), ] def test_init_non_pandas(self): @@ -50,31 +49,32 @@ def test_repr_html_ok(self): def test_repr_html_mathjax(self): # gh-19824 - assert 'tex2jax_ignore' not in self.styler._repr_html_() + assert "tex2jax_ignore" not in self.styler._repr_html_() - with pd.option_context('display.html.use_mathjax', False): - assert 'tex2jax_ignore' in self.styler._repr_html_() + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in self.styler._repr_html_() def test_update_ctx(self): self.styler._update_ctx(self.attrs) - expected = {(0, 0): ['color: red'], - (1, 0): ['color: blue']} + expected = {(0, 0): ["color: red"], (1, 0): ["color: blue"]} assert self.styler.ctx == expected def test_update_ctx_flatten_multi(self): - attrs = DataFrame({"A": ['color: red; foo: bar', - 'color: blue; foo: baz']}) + attrs = DataFrame({"A": ["color: red; foo: bar", "color: blue; foo: baz"]}) self.styler._update_ctx(attrs) - expected = {(0, 0): ['color: red', ' foo: bar'], - (1, 0): ['color: blue', ' foo: baz']} + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } assert self.styler.ctx == expected def test_update_ctx_flatten_multi_traliing_semi(self): - attrs = DataFrame({"A": ['color: red; foo: bar;', - 'color: blue; foo: baz;']}) + attrs = DataFrame({"A": ["color: red; foo: bar;", "color: blue; foo: baz;"]}) self.styler._update_ctx(attrs) - expected = {(0, 0): ['color: red', ' foo: bar'], - (1, 0): ['color: blue', ' foo: baz']} + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } assert self.styler.ctx == expected def test_copy(self): @@ -111,7 +111,7 @@ def test_clear(self): def test_render(self): df = pd.DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) - s = Styler(df, uuid='AB').apply(style) + s = Styler(df, uuid="AB").apply(style) s.render() # it worked? @@ -120,23 +120,23 @@ def test_render_empty_dfs(self): es = Styler(empty_df) es.render() # An index but no columns - DataFrame(columns=['a']).style.render() + DataFrame(columns=["a"]).style.render() # A column but no index - DataFrame(index=['a']).style.render() + DataFrame(index=["a"]).style.render() # No IndexError raised? def test_render_double(self): df = pd.DataFrame({"A": [0, 1]}) - style = lambda x: pd.Series(["color: red; border: 1px", - "color: blue; border: 2px"], name=x.name) - s = Styler(df, uuid='AB').apply(style) + style = lambda x: pd.Series( + ["color: red; border: 1px", "color: blue; border: 2px"], name=x.name + ) + s = Styler(df, uuid="AB").apply(style) s.render() # it worked? 
def test_set_properties(self): df = pd.DataFrame({"A": [0, 1]}) - result = df.style.set_properties(color='white', - size='10px')._compute().ctx + result = df.style.set_properties(color="white", size="10px")._compute().ctx # order is deterministic v = ["color: white", "size: 10px"] expected = {(0, 0): v, (1, 0): v} @@ -145,77 +145,130 @@ def test_set_properties(self): assert sorted(v1) == sorted(v2) def test_set_properties_subset(self): - df = pd.DataFrame({'A': [0, 1]}) - result = df.style.set_properties(subset=pd.IndexSlice[0, 'A'], - color='white')._compute().ctx - expected = {(0, 0): ['color: white']} + df = pd.DataFrame({"A": [0, 1]}) + result = ( + df.style.set_properties(subset=pd.IndexSlice[0, "A"], color="white") + ._compute() + .ctx + ) + expected = {(0, 0): ["color: white"]} assert result == expected def test_empty_index_name_doesnt_display(self): # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.style._translate() - expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', - 'is_visible': True, 'display_value': ''}, - {'class': 'col_heading level0 col0', - 'display_value': 'A', - 'type': 'th', - 'value': 'A', - 'is_visible': True, - }, - {'class': 'col_heading level0 col1', - 'display_value': 'B', - 'type': 'th', - 'value': 'B', - 'is_visible': True, - }, - {'class': 'col_heading level0 col2', - 'display_value': 'C', - 'type': 'th', - 'value': 'C', - 'is_visible': True, - }]] - - assert result['head'] == expected + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "class": "col_heading level0 col0", + "display_value": "A", + "type": "th", + "value": "A", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "display_value": "B", + "type": "th", + "value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col2", + "display_value": "C", + "type": "th", + "value": "C", + "is_visible": True, + }, + ] + ] + + assert result["head"] == expected def test_index_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) - result = df.set_index('A').style._translate() - - expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'B', 'display_value': 'B', 'is_visible': True}, - {'class': 'col_heading level0 col1', 'type': 'th', - 'value': 'C', 'display_value': 'C', 'is_visible': True}], - [{'class': 'index_name level0', 'type': 'th', - 'value': 'A'}, - {'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'blank', 'type': 'th', 'value': ''}]] - - assert result['head'] == expected + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index("A").style._translate() + + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "B", + "display_value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "blank", "type": "th", "value": ""}, + {"class": "blank", "type": "th", "value": ""}, 
+ ], + ] + + assert result["head"] == expected def test_multiindex_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) - result = df.set_index(['A', 'B']).style._translate() - - expected = [[ - {'class': 'blank', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'blank level0', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'C', 'display_value': 'C', 'is_visible': True}], - [{'class': 'index_name level0', 'type': 'th', - 'value': 'A'}, - {'class': 'index_name level1', 'type': 'th', - 'value': 'B'}, - {'class': 'blank', 'type': 'th', 'value': ''}]] - - assert result['head'] == expected + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index(["A", "B"]).style._translate() + + expected = [ + [ + { + "class": "blank", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "index_name level1", "type": "th", "value": "B"}, + {"class": "blank", "type": "th", "value": ""}, + ], + ] + + assert result["head"] == expected def test_numeric_columns(self): # https://github.com/pandas-dev/pandas/issues/12125 @@ -224,19 +277,27 @@ def test_numeric_columns(self): df.style._translate() def test_apply_axis(self): - df = pd.DataFrame({'A': [0, 0], 'B': [1, 1]}) - f = lambda x: ['val: {max}'.format(max=x.max()) for v in x] + df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) + f = lambda x: ["val: {max}".format(max=x.max()) for v in x] result = df.style.apply(f, axis=1) assert len(result._todo) == 1 assert len(result.ctx) == 0 result._compute() - expected = {(0, 0): ['val: 1'], (0, 1): ['val: 1'], - (1, 0): ['val: 1'], (1, 1): ['val: 1']} + expected = { + (0, 0): ["val: 1"], + (0, 1): ["val: 1"], + (1, 0): ["val: 1"], + (1, 1): ["val: 1"], + } assert result.ctx == expected result = df.style.apply(f, axis=0) - expected = {(0, 0): ['val: 0'], (0, 1): ['val: 1'], - (1, 0): ['val: 0'], (1, 1): ['val: 1']} + expected = { + (0, 0): ["val: 0"], + (0, 1): ["val: 1"], + (1, 0): ["val: 0"], + (1, 1): ["val: 1"], + } result._compute() assert result.ctx == expected result = df.style.apply(f) # default @@ -245,35 +306,50 @@ def test_apply_axis(self): def test_apply_subset(self): axes = [0, 1] - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for ax in axes: for slice_ in slices: - result = self.df.style.apply(self.h, axis=ax, subset=slice_, - foo='baz')._compute().ctx - expected = {(r, c): ['color: baz'] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + result = ( + self.df.style.apply(self.h, axis=ax, subset=slice_, foo="baz") + ._compute() + .ctx + ) + expected = { + (r, c): ["color: baz"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and 
col in self.df.loc[slice_].columns + } assert result == expected def test_applymap_subset(self): def f(x): - return 'foo: bar' - - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + return "foo: bar" + + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: result = self.df.style.applymap(f, subset=slice_)._compute().ctx - expected = {(r, c): ['foo: bar'] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + expected = { + (r, c): ["foo: bar"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } assert result == expected def test_applymap_subset_multiindex(self): @@ -285,34 +361,34 @@ def color_negative_red(val): the css property `'color: red'` for negative strings, black otherwise. """ - color = 'red' if val < 0 else 'black' - return 'color: %s' % color + color = "red" if val < 0 else "black" + return "color: %s" % color dic = { - ('a', 'd'): [-1.12, 2.11], - ('a', 'c'): [2.78, -2.88], - ('b', 'c'): [-3.99, 3.77], - ('b', 'd'): [4.21, -1.22], + ("a", "d"): [-1.12, 2.11], + ("a", "c"): [2.78, -2.88], + ("b", "c"): [-3.99, 3.77], + ("b", "d"): [4.21, -1.22], } idx = pd.IndexSlice df = pd.DataFrame(dic, index=[0, 1]) - (df.style - .applymap(color_negative_red, subset=idx[:, idx['b', 'd']]) - .render()) + (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) def test_where_with_one_style(self): # GH 17474 def f(x): return x > 0.5 - style1 = 'foo: bar' + style1 = "foo: bar" result = self.df.style.where(f, style1)._compute().ctx - expected = {(r, c): [style1 if f(self.df.loc[row, col]) else ''] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns)} + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else ""] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + } assert result == expected def test_where_subset(self): @@ -320,22 +396,28 @@ def test_where_subset(self): def f(x): return x > 0.5 - style1 = 'foo: bar' - style2 = 'baz: foo' + style1 = "foo: bar" + style2 = "baz: foo" - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: - result = self.df.style.where(f, style1, style2, - subset=slice_)._compute().ctx - expected = {(r, c): - [style1 if f(self.df.loc[row, col]) else style2] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else style2] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } assert result == expected def test_where_subset_compare_with_applymap(self): @@ -343,395 +425,579 @@ def test_where_subset_compare_with_applymap(self): def 
f(x): return x > 0.5 - style1 = 'foo: bar' - style2 = 'baz: foo' + style1 = "foo: bar" + style2 = "baz: foo" def g(x): return style1 if f(x) else style2 - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: - result = self.df.style.where(f, style1, style2, - subset=slice_)._compute().ctx + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) expected = self.df.style.applymap(g, subset=slice_)._compute().ctx assert result == expected def test_empty(self): - df = pd.DataFrame({'A': [1, 0]}) + df = pd.DataFrame({"A": [1, 0]}) s = df.style - s.ctx = {(0, 0): ['color: red'], - (1, 0): ['']} + s.ctx = {(0, 0): ["color: red"], (1, 0): [""]} - result = s._translate()['cellstyle'] - expected = [{'props': [['color', ' red']], 'selector': 'row0_col0'}, - {'props': [['', '']], 'selector': 'row1_col0'}] + result = s._translate()["cellstyle"] + expected = [ + {"props": [["color", " red"]], "selector": "row0_col0"}, + {"props": [["", ""]], "selector": "row1_col0"}, + ] assert result == expected def test_bar_align_left(self): - df = pd.DataFrame({'A': [0, 1, 2]}) + df = pd.DataFrame({"A": [0, 1, 2]}) result = df.style.bar()._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,#d65f5f 50.0%, transparent 50.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected - result = df.style.bar(color='red', width=50)._compute().ctx + result = df.style.bar(color="red", width=50)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,red 25.0%, transparent 25.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,red 50.0%, transparent 50.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" "90deg,red 25.0%, transparent 25.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" "90deg,red 50.0%, transparent 50.0%)", + ], } assert result == expected - df['C'] = ['a'] * len(df) - result = df.style.bar(color='red', width=50)._compute().ctx + df["C"] = ["a"] * len(df) + result = df.style.bar(color="red", width=50)._compute().ctx assert result == expected - df['C'] = df['C'].astype('category') - result = df.style.bar(color='red', width=50)._compute().ctx + df["C"] = df["C"].astype("category") + result = df.style.bar(color="red", width=50)._compute().ctx assert result == expected def test_bar_align_left_0points(self): df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) result = df.style.bar()._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%'], - (0, 1): ['width: 10em', ' height: 80%'], - (0, 2): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 
'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (1, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)']} + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): ["width: 10em", " height: 80%"], + (0, 2): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } assert result == expected result = df.style.bar(axis=1)._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (0, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (1, 0): ['width: 10em', ' height: 80%'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%' - ', transparent 50.0%)'], - (1, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 0): ['width: 10em', ' height: 80%'], - (2, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%' - ', transparent 50.0%)'], - (2, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)']} + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (0, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 0): ["width: 10em", " height: 80%"], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } assert result == expected def 
test_bar_align_mid_pos_and_neg(self): - df = pd.DataFrame({'A': [-10, 0, 20, 90]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 10.0%, transparent 10.0%)'], - (1, 0): ['width: 10em', ' height: 80%', ], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, #5fba7d 10.0%' - ', #5fba7d 30.0%, transparent 30.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, ' - '#5fba7d 10.0%, #5fba7d 100.0%, ' - 'transparent 100.0%)']} + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 10.0%, transparent 10.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #5fba7d 10.0%" + ", #5fba7d 30.0%, transparent 30.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, " + "#5fba7d 10.0%, #5fba7d 100.0%, " + "transparent 100.0%)", + ], + } assert result == expected def test_bar_align_mid_all_pos(self): - df = pd.DataFrame({'A': [10, 20, 50, 100]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 10.0%, transparent 10.0%)'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 20.0%, transparent 20.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 50.0%, transparent 50.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 100.0%, transparent 100.0%)']} + df = pd.DataFrame({"A": [10, 20, 50, 100]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 10.0%, transparent 10.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 20.0%, transparent 20.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 50.0%, transparent 50.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 100.0%, transparent 100.0%)", + ], + } assert result == expected def test_bar_align_mid_all_neg(self): - df = pd.DataFrame({'A': [-100, -60, -30, -20]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 100.0%, transparent 100.0%)'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, ' - '#d65f5f 40.0%, #d65f5f 100.0%, ' - 'transparent 100.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 70.0%, ' - '#d65f5f 70.0%, #d65f5f 100.0%, ' - 'transparent 100.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 80.0%, ' - '#d65f5f 80.0%, #d65f5f 100.0%, ' - 'transparent 
100.0%)']} + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, " + "#d65f5f 40.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 70.0%, " + "#d65f5f 70.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 80.0%, " + "#d65f5f 80.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + } assert result == expected def test_bar_align_zero_pos_and_neg(self): # See https://github.com/pandas-dev/pandas/pull/14757 - df = pd.DataFrame({'A': [-10, 0, 20, 90]}) - - result = df.style.bar(align='zero', color=[ - '#d65f5f', '#5fba7d'], width=90)._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, #d65f5f 40.0%, ' - '#d65f5f 45.0%, transparent 45.0%)'], - (1, 0): ['width: 10em', ' height: 80%'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 45.0%, #5fba7d 45.0%, ' - '#5fba7d 55.0%, transparent 55.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 45.0%, #5fba7d 45.0%, ' - '#5fba7d 90.0%, transparent 90.0%)']} + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = ( + df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90) + ._compute() + .ctx + ) + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 45.0%, transparent 45.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 55.0%, transparent 55.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 90.0%, transparent 90.0%)", + ], + } assert result == expected def test_bar_align_left_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [2, 4]}) + df = pd.DataFrame({"A": [0, 1], "B": [2, 4]}) result = df.style.bar(axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_zero_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = 
df.style.bar(align='zero', axis=None)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 62.5%, transparent 62.5%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 62.5%, transparent 62.5%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 33.3%, #d65f5f 33.3%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 33.3%, transparent 33.3%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 33.3%, #d65f5f 33.3%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 33.3%, transparent 33.3%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_vmin(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, vmin=-6)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 60.0%, #d65f5f 60.0%, ' - '#d65f5f 70.0%, transparent 70.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, #d65f5f 40.0%, ' - '#d65f5f 60.0%, transparent 60.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 60.0%, #d65f5f 60.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + 
"transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_vmax(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, vmax=8)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 20.0%, #d65f5f 20.0%, ' - '#d65f5f 30.0%, transparent 30.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 20.0%, transparent 20.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 20.0%, #d65f5f 20.0%, ' - '#d65f5f 60.0%, transparent 60.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 20.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 20.0%, transparent 20.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 20.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], } assert result == expected def test_bar_align_mid_vmin_vmax_wide(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, - vmin=-3, vmax=7)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 30.0%, #d65f5f 30.0%, ' - '#d65f5f 40.0%, transparent 40.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, #d65f5f 10.0%, ' - '#d65f5f 30.0%, transparent 30.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 30.0%, #d65f5f 30.0%, ' - '#d65f5f 70.0%, transparent 70.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 40.0%, transparent 40.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #d65f5f 10.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], } assert result == expected def test_bar_align_mid_vmin_vmax_clipping(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, - vmin=-1, vmax=3)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], 
- (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_nans(self): - df = pd.DataFrame({'A': [1, None], 'B': [-1, 3]}) - result = df.style.bar(align='mid', axis=None)._compute().ctx + df = pd.DataFrame({"A": [1, None], "B": [-1, 3]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 0): [''], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_zero_nans(self): - df = pd.DataFrame({'A': [1, None], 'B': [-1, 2]}) - result = df.style.bar(align='zero', axis=None)._compute().ctx + df = pd.DataFrame({"A": [1, None], "B": [-1, 2]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 75.0%, transparent 75.0%)'], - (1, 0): [''], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 75.0%, transparent 75.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } 
assert result == expected def test_bar_bad_align_raises(self): - df = pd.DataFrame({'A': [-100, -60, -30, -20]}) + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) with pytest.raises(ValueError): - df.style.bar(align='poorly', color=['#d65f5f', '#5fba7d']) + df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) - def test_highlight_null(self, null_color='red'): - df = pd.DataFrame({'A': [0, np.nan]}) + def test_highlight_null(self, null_color="red"): + df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx - expected = {(0, 0): [''], - (1, 0): ['background-color: red']} + expected = {(0, 0): [""], (1, 0): ["background-color: red"]} assert result == expected def test_nonunique_raises(self): - df = pd.DataFrame([[1, 2]], columns=['A', 'A']) + df = pd.DataFrame([[1, 2]], columns=["A", "A"]) with pytest.raises(ValueError): df.style @@ -739,38 +1005,38 @@ def test_nonunique_raises(self): Styler(df) def test_caption(self): - styler = Styler(self.df, caption='foo') + styler = Styler(self.df, caption="foo") result = styler.render() - assert all(['caption' in result, 'foo' in result]) + assert all(["caption" in result, "foo" in result]) styler = self.df.style - result = styler.set_caption('baz') + result = styler.set_caption("baz") assert styler is result - assert styler.caption == 'baz' + assert styler.caption == "baz" def test_uuid(self): - styler = Styler(self.df, uuid='abc123') + styler = Styler(self.df, uuid="abc123") result = styler.render() - assert 'abc123' in result + assert "abc123" in result styler = self.df.style - result = styler.set_uuid('aaa') + result = styler.set_uuid("aaa") assert result is styler - assert result.uuid == 'aaa' + assert result.uuid == "aaa" def test_unique_id(self): # See https://github.com/pandas-dev/pandas/issues/16780 - df = pd.DataFrame({'a': [1, 3, 5, 6], 'b': [2, 4, 12, 21]}) - result = df.style.render(uuid='test') - assert 'test' in result + df = pd.DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) + result = df.style.render(uuid="test") + assert "test" in result ids = re.findall('id="(.*?)"', result) assert np.unique(ids).size == len(ids) def test_table_styles(self): - style = [{'selector': 'th', 'props': [('foo', 'bar')]}] + style = [{"selector": "th", "props": [("foo", "bar")]}] styler = Styler(self.df, table_styles=style) - result = ' '.join(styler.render().split()) - assert 'th { foo: bar; }' in result + result = " ".join(styler.render().split()) + assert "th { foo: bar; }" in result styler = self.df.style result = styler.set_table_styles(style) @@ -787,7 +1053,7 @@ def test_table_attributes(self): assert 'class="foo" data-bar' in result def test_precision(self): - with pd.option_context('display.precision', 10): + with pd.option_context("display.precision", 10): s = Styler(self.df) assert s.precision == 10 s = Styler(self.df, precision=2) @@ -799,65 +1065,75 @@ def test_precision(self): def test_apply_none(self): def f(x): - return pd.DataFrame(np.where(x == x.max(), 'color: red', ''), - index=x.index, columns=x.columns) - result = (pd.DataFrame([[1, 2], [3, 4]]) - .style.apply(f, axis=None)._compute().ctx) - assert result[(1, 1)] == ['color: red'] + return pd.DataFrame( + np.where(x == x.max(), "color: red", ""), + index=x.index, + columns=x.columns, + ) + + result = pd.DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx + assert result[(1, 1)] == ["color: red"] def test_trim(self): result = self.df.style.render() # trim=True - assert result.count('#') == 0 + assert result.count("#") == 0 result 
= self.df.style.highlight_max().render() - assert result.count('#') == len(self.df.columns) + assert result.count("#") == len(self.df.columns) def test_highlight_max(self): - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) # max(df) = min(-df) for max_ in [True, False]: if max_: - attr = 'highlight_max' + attr = "highlight_max" else: df = -df - attr = 'highlight_min' + attr = "highlight_min" result = getattr(df.style, attr)()._compute().ctx - assert result[(1, 1)] == ['background-color: yellow'] + assert result[(1, 1)] == ["background-color: yellow"] - result = getattr(df.style, attr)(color='green')._compute().ctx - assert result[(1, 1)] == ['background-color: green'] + result = getattr(df.style, attr)(color="green")._compute().ctx + assert result[(1, 1)] == ["background-color: green"] - result = getattr(df.style, attr)(subset='A')._compute().ctx - assert result[(1, 0)] == ['background-color: yellow'] + result = getattr(df.style, attr)(subset="A")._compute().ctx + assert result[(1, 0)] == ["background-color: yellow"] result = getattr(df.style, attr)(axis=0)._compute().ctx - expected = {(1, 0): ['background-color: yellow'], - (1, 1): ['background-color: yellow'], - (0, 1): [''], (0, 0): ['']} + expected = { + (1, 0): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 1): [""], + (0, 0): [""], + } assert result == expected result = getattr(df.style, attr)(axis=1)._compute().ctx - expected = {(0, 1): ['background-color: yellow'], - (1, 1): ['background-color: yellow'], - (0, 0): [''], (1, 0): ['']} + expected = { + (0, 1): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 0): [""], + (1, 0): [""], + } assert result == expected # separate since we can't negate the strs - df['C'] = ['a', 'b'] + df["C"] = ["a", "b"] result = df.style.highlight_max()._compute().ctx - expected = {(1, 1): ['background-color: yellow']} + expected = {(1, 1): ["background-color: yellow"]} result = df.style.highlight_min()._compute().ctx - expected = {(0, 0): ['background-color: yellow']} + expected = {(0, 0): ["background-color: yellow"]} def test_export(self): - f = lambda x: 'color: red' if x > 0 else 'color: blue' - g = lambda x, y, z: 'color: {z}'.format(z=z) \ - if x > 0 else 'color: {z}'.format(z=z) + f = lambda x: "color: red" if x > 0 else "color: blue" + g = ( + lambda x, y, z: "color: {z}".format(z=z) + if x > 0 + else "color: {z}".format(z=z) + ) style1 = self.styler - style1.applymap(f)\ - .applymap(g, y='a', z='b')\ - .highlight_max() + style1.applymap(f).applymap(g, y="a", z="b").highlight_max() result = style1.export() style2 = self.df.style style2.use(result) @@ -868,11 +1144,11 @@ def test_display_format(self): df = pd.DataFrame(np.random.random(size=(2, 2))) ctx = df.style.format("{:0.1f}")._translate() - assert all(['display_value' in c for c in row] - for row in ctx['body']) - assert all([len(c['display_value']) <= 3 for c in row[1:]] - for row in ctx['body']) - assert len(ctx['body'][0][1]['display_value'].lstrip('-')) <= 3 + assert all(["display_value" in c for c in row] for row in ctx["body"]) + assert all( + [len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"] + ) + assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3 def test_display_format_raises(self): df = pd.DataFrame(np.random.randn(2, 2)) @@ -882,165 +1158,202 @@ def test_display_format_raises(self): df.style.format(True) def test_display_subset(self): - df = pd.DataFrame([[.1234, .1234], 
[1.1234, 1.1234]], - columns=['a', 'b']) - ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"}, - subset=pd.IndexSlice[0, :])._translate() - expected = '0.1' - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == '1.1234' - assert ctx['body'][0][2]['display_value'] == '12.34%' - - raw_11 = '1.1234' - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, :])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, :])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice['a'])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][0][2]['display_value'] == '0.1234' - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, 'a'])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[[0, 1], ['a']])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == '1.1' - assert ctx['body'][0][2]['display_value'] == '0.1234' - assert ctx['body'][1][2]['display_value'] == '1.1234' + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + ctx = df.style.format( + {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=pd.IndexSlice[0, :] + )._translate() + expected = "0.1" + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1234" + assert ctx["body"][0][2]["display_value"] == "12.34%" + + raw_11 = "1.1234" + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice["a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][0][2]["display_value"] == "0.1234" + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, "a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format( + "{:0.1f}", subset=pd.IndexSlice[[0, 1], ["a"]] + )._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1" + assert ctx["body"][0][2]["display_value"] == "0.1234" + assert ctx["body"][1][2]["display_value"] == "1.1234" def test_display_dict(self): - df = pd.DataFrame([[.1234, .1234], [1.1234, 1.1234]], - columns=['a', 'b']) + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"})._translate() - assert ctx['body'][0][1]['display_value'] == '0.1' - assert ctx['body'][0][2]['display_value'] == '12.34%' - df['c'] = ['aaa', 'bbb'] + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][2]["display_value"] == "12.34%" + df["c"] = ["aaa", "bbb"] ctx = df.style.format({"a": "{:0.1f}", "c": str.upper})._translate() - assert ctx['body'][0][1]['display_value'] == '0.1' - assert 
ctx['body'][0][3]['display_value'] == 'AAA' + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][3]["display_value"] == "AAA" def test_bad_apply_shape(self): df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(ValueError): - df.style._apply(lambda x: 'x', subset=pd.IndexSlice[[0, 1], :]) + df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) with pytest.raises(ValueError): - df.style._apply(lambda x: [''], subset=pd.IndexSlice[[0, 1], :]) + df.style._apply(lambda x: [""], subset=pd.IndexSlice[[0, 1], :]) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', '', '']) + df.style._apply(lambda x: ["", "", "", ""]) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', ''], subset=1) + df.style._apply(lambda x: ["", "", ""], subset=1) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', ''], axis=1) + df.style._apply(lambda x: ["", "", ""], axis=1) def test_apply_bad_return(self): def f(x): - return '' + return "" + df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(TypeError): df.style._apply(f, axis=None) def test_apply_bad_labels(self): def f(x): - return pd.DataFrame(index=[1, 2], columns=['a', 'b']) + return pd.DataFrame(index=[1, 2], columns=["a", "b"]) + df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(ValueError): df.style._apply(f, axis=None) def test_get_level_lengths(self): - index = pd.MultiIndex.from_product([['a', 'b'], [0, 1, 2]]) - expected = {(0, 0): 3, (0, 3): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, - (1, 3): 1, (1, 4): 1, (1, 5): 1} + index = pd.MultiIndex.from_product([["a", "b"], [0, 1, 2]]) + expected = { + (0, 0): 3, + (0, 3): 3, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } result = _get_level_lengths(index) tm.assert_dict_equal(result, expected) def test_get_level_lengths_un_sorted(self): - index = pd.MultiIndex.from_arrays([ - [1, 1, 2, 1], - ['a', 'b', 'b', 'd'] - ]) - expected = {(0, 0): 2, (0, 2): 1, (0, 3): 1, - (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1} + index = pd.MultiIndex.from_arrays([[1, 1, 2, 1], ["a", "b", "b", "d"]]) + expected = { + (0, 0): 2, + (0, 2): 1, + (0, 3): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + } result = _get_level_lengths(index) tm.assert_dict_equal(result, expected) def test_mi_sparse(self): - df = pd.DataFrame({'A': [1, 2]}, - index=pd.MultiIndex.from_arrays([['a', 'a'], - [0, 1]])) + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) result = df.style._translate() - body_0 = result['body'][0][0] + body_0 = result["body"][0][0] expected_0 = { - "value": "a", "display_value": "a", "is_visible": True, - "type": "th", "attributes": ["rowspan=2"], - "class": "row_heading level0 row0", "id": "level0_row0" + "value": "a", + "display_value": "a", + "is_visible": True, + "type": "th", + "attributes": ["rowspan=2"], + "class": "row_heading level0 row0", + "id": "level0_row0", } tm.assert_dict_equal(body_0, expected_0) - body_1 = result['body'][0][1] + body_1 = result["body"][0][1] expected_1 = { - "value": 0, "display_value": 0, "is_visible": True, - "type": "th", "class": "row_heading level1 row0", - "id": "level1_row0" + "value": 0, + "display_value": 0, + "is_visible": True, + "type": "th", + "class": "row_heading level1 row0", + "id": "level1_row0", } tm.assert_dict_equal(body_1, expected_1) - body_10 = result['body'][1][0] + body_10 = result["body"][1][0] expected_10 = { - "value": 'a', "display_value": 'a', "is_visible": False, - "type": "th", "class": 
"row_heading level0 row1", - "id": "level0_row1" + "value": "a", + "display_value": "a", + "is_visible": False, + "type": "th", + "class": "row_heading level0 row1", + "id": "level0_row1", } tm.assert_dict_equal(body_10, expected_10) - head = result['head'][0] + head = result["head"][0] expected = [ - {'type': 'th', 'class': 'blank', 'value': '', - 'is_visible': True, "display_value": ''}, - {'type': 'th', 'class': 'blank level0', 'value': '', - 'is_visible': True, 'display_value': ''}, - {'type': 'th', 'class': 'col_heading level0 col0', 'value': 'A', - 'is_visible': True, 'display_value': 'A'}] + { + "type": "th", + "class": "blank", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "blank level0", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "col_heading level0 col0", + "value": "A", + "is_visible": True, + "display_value": "A", + }, + ] assert head == expected def test_mi_sparse_disabled(self): - with pd.option_context('display.multi_sparse', False): - df = pd.DataFrame({'A': [1, 2]}, - index=pd.MultiIndex.from_arrays([['a', 'a'], - [0, 1]])) + with pd.option_context("display.multi_sparse", False): + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) result = df.style._translate() - body = result['body'] + body = result["body"] for row in body: - assert 'attributes' not in row[0] + assert "attributes" not in row[0] def test_mi_sparse_index_names(self): - df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( - [['a', 'a'], [0, 1]], - names=['idx_level_0', 'idx_level_1']) + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), ) result = df.style._translate() - head = result['head'][1] - expected = [{ - 'class': 'index_name level0', 'value': 'idx_level_0', - 'type': 'th'}, - {'class': 'index_name level1', 'value': 'idx_level_1', - 'type': 'th'}, - {'class': 'blank', 'value': '', 'type': 'th'}] + head = result["head"][1] + expected = [ + {"class": "index_name level0", "value": "idx_level_0", "type": "th"}, + {"class": "index_name level1", "value": "idx_level_1", "type": "th"}, + {"class": "blank", "value": "", "type": "th"}, + ] assert head == expected @@ -1048,42 +1361,58 @@ def test_mi_sparse_column_names(self): df = pd.DataFrame( np.arange(16).reshape(4, 4), index=pd.MultiIndex.from_arrays( - [['a', 'a', 'b', 'a'], [0, 1, 1, 2]], - names=['idx_level_0', 'idx_level_1']), + [["a", "a", "b", "a"], [0, 1, 1, 2]], + names=["idx_level_0", "idx_level_1"], + ), columns=pd.MultiIndex.from_arrays( - [['C1', 'C1', 'C2', 'C2'], [1, 0, 1, 0]], - names=['col_0', 'col_1'] - ) + [["C1", "C1", "C2", "C2"], [1, 0, 1, 0]], names=["col_0", "col_1"] + ), ) result = df.style._translate() - head = result['head'][1] + head = result["head"][1] expected = [ - {'class': 'blank', 'value': '', 'display_value': '', - 'type': 'th', 'is_visible': True}, - {'class': 'index_name level1', 'value': 'col_1', - 'display_value': 'col_1', 'is_visible': True, 'type': 'th'}, - {'class': 'col_heading level1 col0', - 'display_value': 1, - 'is_visible': True, - 'type': 'th', - 'value': 1}, - {'class': 'col_heading level1 col1', - 'display_value': 0, - 'is_visible': True, - 'type': 'th', - 'value': 0}, - - {'class': 'col_heading level1 col2', - 'display_value': 1, - 'is_visible': True, - 'type': 'th', - 'value': 1}, - - {'class': 'col_heading level1 col3', - 'display_value': 0, - 'is_visible': True, 
- 'type': 'th', - 'value': 0}, + { + "class": "blank", + "value": "", + "display_value": "", + "type": "th", + "is_visible": True, + }, + { + "class": "index_name level1", + "value": "col_1", + "display_value": "col_1", + "is_visible": True, + "type": "th", + }, + { + "class": "col_heading level1 col0", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col1", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, + { + "class": "col_heading level1 col2", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col3", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, ] assert head == expected @@ -1091,195 +1420,206 @@ def test_hide_single_index(self): # GH 14194 # single unnamed index ctx = self.df.style._translate() - assert ctx['body'][0][0]['is_visible'] - assert ctx['head'][0][0]['is_visible'] + assert ctx["body"][0][0]["is_visible"] + assert ctx["head"][0][0]["is_visible"] ctx2 = self.df.style.hide_index()._translate() - assert not ctx2['body'][0][0]['is_visible'] - assert not ctx2['head'][0][0]['is_visible'] + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["head"][0][0]["is_visible"] # single named index - ctx3 = self.df.set_index('A').style._translate() - assert ctx3['body'][0][0]['is_visible'] - assert len(ctx3['head']) == 2 # 2 header levels - assert ctx3['head'][0][0]['is_visible'] + ctx3 = self.df.set_index("A").style._translate() + assert ctx3["body"][0][0]["is_visible"] + assert len(ctx3["head"]) == 2 # 2 header levels + assert ctx3["head"][0][0]["is_visible"] - ctx4 = self.df.set_index('A').style.hide_index()._translate() - assert not ctx4['body'][0][0]['is_visible'] - assert len(ctx4['head']) == 1 # only 1 header levels - assert not ctx4['head'][0][0]['is_visible'] + ctx4 = self.df.set_index("A").style.hide_index()._translate() + assert not ctx4["body"][0][0]["is_visible"] + assert len(ctx4["head"]) == 1 # only 1 header levels + assert not ctx4["head"][0][0]["is_visible"] def test_hide_multiindex(self): # GH 14194 - df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( - [['a', 'a'], [0, 1]], - names=['idx_level_0', 'idx_level_1']) + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), ) ctx1 = df.style._translate() # tests for 'a' and '0' - assert ctx1['body'][0][0]['is_visible'] - assert ctx1['body'][0][1]['is_visible'] + assert ctx1["body"][0][0]["is_visible"] + assert ctx1["body"][0][1]["is_visible"] # check for blank header rows - assert ctx1['head'][0][0]['is_visible'] - assert ctx1['head'][0][1]['is_visible'] + assert ctx1["head"][0][0]["is_visible"] + assert ctx1["head"][0][1]["is_visible"] ctx2 = df.style.hide_index()._translate() # tests for 'a' and '0' - assert not ctx2['body'][0][0]['is_visible'] - assert not ctx2['body'][0][1]['is_visible'] + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["body"][0][1]["is_visible"] # check for blank header rows - assert not ctx2['head'][0][0]['is_visible'] - assert not ctx2['head'][0][1]['is_visible'] + assert not ctx2["head"][0][0]["is_visible"] + assert not ctx2["head"][0][1]["is_visible"] def test_hide_columns_single_level(self): # GH 14194 # test hiding single column ctx = self.df.style._translate() - assert ctx['head'][0][1]['is_visible'] - assert ctx['head'][0][1]['display_value'] == 'A' - assert ctx['head'][0][2]['is_visible'] 
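A short sketch of what hide_index() toggles, under the same assumption about the private _translate() structure: index and header cells keep their values but are flagged as not visible.

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2]})
    shown = df.style._translate()                # private API
    hidden = df.style.hide_index()._translate()
    assert shown["body"][0][0]["is_visible"]
    assert not hidden["body"][0][0]["is_visible"]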
- assert ctx['head'][0][2]['display_value'] == 'B' - assert ctx['body'][0][1]['is_visible'] # col A, row 1 - assert ctx['body'][1][2]['is_visible'] # col B, row 1 - - ctx = self.df.style.hide_columns('A')._translate() - assert not ctx['head'][0][1]['is_visible'] - assert not ctx['body'][0][1]['is_visible'] # col A, row 1 - assert ctx['body'][1][2]['is_visible'] # col B, row 1 + assert ctx["head"][0][1]["is_visible"] + assert ctx["head"][0][1]["display_value"] == "A" + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][0][2]["display_value"] == "B" + assert ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + ctx = self.df.style.hide_columns("A")._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 # test hiding mulitiple columns - ctx = self.df.style.hide_columns(['A', 'B'])._translate() - assert not ctx['head'][0][1]['is_visible'] - assert not ctx['head'][0][2]['is_visible'] - assert not ctx['body'][0][1]['is_visible'] # col A, row 1 - assert not ctx['body'][1][2]['is_visible'] # col B, row 1 + ctx = self.df.style.hide_columns(["A", "B"])._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["head"][0][2]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert not ctx["body"][1][2]["is_visible"] # col B, row 1 def test_hide_columns_mult_levels(self): # GH 14194 # setup dataframe with multiple column levels and indices - i1 = pd.MultiIndex.from_arrays([['a', 'a'], [0, 1]], - names=['idx_level_0', - 'idx_level_1']) - i2 = pd.MultiIndex.from_arrays([['b', 'b'], [0, 1]], - names=['col_level_0', - 'col_level_1']) + i1 = pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ) + i2 = pd.MultiIndex.from_arrays( + [["b", "b"], [0, 1]], names=["col_level_0", "col_level_1"] + ) df = pd.DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) ctx = df.style._translate() # column headers - assert ctx['head'][0][2]['is_visible'] - assert ctx['head'][1][2]['is_visible'] - assert ctx['head'][1][3]['display_value'] == 1 + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][1][2]["is_visible"] + assert ctx["head"][1][3]["display_value"] == 1 # indices - assert ctx['body'][0][0]['is_visible'] + assert ctx["body"][0][0]["is_visible"] # data - assert ctx['body'][1][2]['is_visible'] - assert ctx['body'][1][2]['display_value'] == 3 - assert ctx['body'][1][3]['is_visible'] - assert ctx['body'][1][3]['display_value'] == 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 # hide top column level, which hides both columns - ctx = df.style.hide_columns('b')._translate() - assert not ctx['head'][0][2]['is_visible'] # b - assert not ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['body'][1][2]['is_visible'] # 3 - assert ctx['body'][0][0]['is_visible'] # index + ctx = df.style.hide_columns("b")._translate() + assert not ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][0][0]["is_visible"] # index # hide first column only - ctx = df.style.hide_columns([('b', 0)])._translate() - assert ctx['head'][0][2]['is_visible'] # b - assert not ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['body'][1][2]['is_visible'] # 3 - 
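The hide_columns() forms exercised here, as a rough sketch: a single label or a list of labels for flat columns, a top-level label to hide a whole group of MultiIndex columns, or a full tuple to hide one column.

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    df.style.hide_columns("A")          # hide one column
    df.style.hide_columns(["A", "B"])   # hide several columns
    # With MultiIndex columns, hide_columns("b") hides every column under "b",
    # while hide_columns([("b", 0)]) hides only that single column.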
assert ctx['body'][1][3]['is_visible'] - assert ctx['body'][1][3]['display_value'] == 4 + ctx = df.style.hide_columns([("b", 0)])._translate() + assert ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 # hide second column and index - ctx = df.style.hide_columns([('b', 1)]).hide_index()._translate() - assert not ctx['body'][0][0]['is_visible'] # index - assert ctx['head'][0][2]['is_visible'] # b - assert ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['head'][1][3]['is_visible'] # 1 - assert not ctx['body'][1][3]['is_visible'] # 4 - assert ctx['body'][1][2]['is_visible'] - assert ctx['body'][1][2]['display_value'] == 3 + ctx = df.style.hide_columns([("b", 1)]).hide_index()._translate() + assert not ctx["body"][0][0]["is_visible"] # index + assert ctx["head"][0][2]["is_visible"] # b + assert ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["head"][1][3]["is_visible"] # 1 + assert not ctx["body"][1][3]["is_visible"] # 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 def test_pipe(self): def set_caption_from_template(styler, a, b): return styler.set_caption( - 'Dataframe with a = {a} and b = {b}'.format(a=a, b=b)) + "Dataframe with a = {a} and b = {b}".format(a=a, b=b) + ) - styler = self.df.style.pipe(set_caption_from_template, 'A', b='B') - assert 'Dataframe with a = A and b = B' in styler.render() + styler = self.df.style.pipe(set_caption_from_template, "A", b="B") + assert "Dataframe with a = A and b = B" in styler.render() # Test with an argument that is a (callable, keyword_name) pair. def f(a, b, styler): return (a, b, styler) styler = self.df.style - result = styler.pipe((f, 'styler'), a=1, b=2) + result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) @td.skip_if_no_mpl class TestStylerMatplotlibDep: - def test_background_gradient(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) - for c_map in [None, 'YlOrRd']: + for c_map in [None, "YlOrRd"]: result = df.style.background_gradient(cmap=c_map)._compute().ctx assert all("#" in x[0] for x in result.values()) assert result[(0, 0)] == result[(0, 1)] assert result[(1, 0)] == result[(1, 1)] - result = df.style.background_gradient( - subset=pd.IndexSlice[1, 'A'])._compute().ctx + result = ( + df.style.background_gradient(subset=pd.IndexSlice[1, "A"])._compute().ctx + ) - assert result[(1, 0)] == ['background-color: #fff7fb', - 'color: #000000'] + assert result[(1, 0)] == ["background-color: #fff7fb", "color: #000000"] @pytest.mark.parametrize( - 'c_map,expected', [ - (None, { - (0, 0): ['background-color: #440154', 'color: #f1f1f1'], - (1, 0): ['background-color: #fde725', 'color: #000000']}), - ('YlOrRd', { - (0, 0): ['background-color: #ffffcc', 'color: #000000'], - (1, 0): ['background-color: #800026', 'color: #f1f1f1']})]) + "c_map,expected", + [ + ( + None, + { + (0, 0): ["background-color: #440154", "color: #f1f1f1"], + (1, 0): ["background-color: #fde725", "color: #000000"], + }, + ), + ( + "YlOrRd", + { + (0, 0): ["background-color: #ffffcc", "color: #000000"], + (1, 0): ["background-color: #800026", "color: #f1f1f1"], + }, + ), + ], + ) def test_text_color_threshold(self, c_map, expected): - df = pd.DataFrame([1, 2], columns=['A']) + df = pd.DataFrame([1, 2], columns=["A"]) result = 
df.style.background_gradient(cmap=c_map)._compute().ctx assert result == expected - @pytest.mark.parametrize("text_color_threshold", [1.1, '1', -1, [2, 2]]) + @pytest.mark.parametrize("text_color_threshold", [1.1, "1", -1, [2, 2]]) def test_text_color_threshold_raises(self, text_color_threshold): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) msg = "`text_color_threshold` must be a value from 0 to 1." with pytest.raises(ValueError, match=msg): df.style.background_gradient( - text_color_threshold=text_color_threshold)._compute() + text_color_threshold=text_color_threshold + )._compute() @td.skip_if_no_mpl def test_background_gradient_axis(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) - low = ['background-color: #f7fbff', 'color: #000000'] - high = ['background-color: #08306b', 'color: #f1f1f1'] - mid = ['background-color: #abd0e6', 'color: #000000'] - result = df.style.background_gradient(cmap='Blues', - axis=0)._compute().ctx + low = ["background-color: #f7fbff", "color: #000000"] + high = ["background-color: #08306b", "color: #f1f1f1"] + mid = ["background-color: #abd0e6", "color: #000000"] + result = df.style.background_gradient(cmap="Blues", axis=0)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == low assert result[(1, 0)] == high assert result[(1, 1)] == high - result = df.style.background_gradient(cmap='Blues', - axis=1)._compute().ctx + result = df.style.background_gradient(cmap="Blues", axis=1)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == high assert result[(1, 0)] == low assert result[(1, 1)] == high - result = df.style.background_gradient(cmap='Blues', - axis=None)._compute().ctx + result = df.style.background_gradient(cmap="Blues", axis=None)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == mid assert result[(1, 0)] == mid @@ -1289,10 +1629,23 @@ def test_background_gradient_axis(self): def test_block_names(): # catch accidental removal of a block expected = { - 'before_style', 'style', 'table_styles', 'before_cellstyle', - 'cellstyle', 'before_table', 'table', 'caption', 'thead', 'tbody', - 'after_table', 'before_head_rows', 'head_tr', 'after_head_rows', - 'before_rows', 'tr', 'after_rows', + "before_style", + "style", + "table_styles", + "before_cellstyle", + "cellstyle", + "before_table", + "table", + "caption", + "thead", + "tbody", + "after_table", + "before_head_rows", + "head_tr", + "after_head_rows", + "before_rows", + "tr", + "after_rows", } result = set(Styler.template.blocks) assert result == expected @@ -1300,14 +1653,17 @@ def test_block_names(): def test_from_custom_template(tmpdir): p = tmpdir.mkdir("templates").join("myhtml.tpl") - p.write(textwrap.dedent("""\ + p.write( + textwrap.dedent( + """\ {% extends "html.tpl" %} {% block table %}

{{ table_title|default("My Table") }}

{{ super() }} - {% endblock table %}""")) - result = Styler.from_custom_template(str(tmpdir.join('templates')), - 'myhtml.tpl') + {% endblock table %}""" + ) + ) + result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl") assert issubclass(result, Styler) assert result.env is not Styler.env assert result.template is not Styler.template diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 0acdaaa7f82cd8..7b493266144b06 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -10,10 +10,10 @@ class TestToCSV: - - @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5), - reason=("Python csv library bug " - "(see https://bugs.python.org/issue32255)")) + @pytest.mark.xfail( + (3, 6, 5) > sys.version_info >= (3, 5), + reason=("Python csv library bug " "(see https://bugs.python.org/issue32255)"), + ) def test_to_csv_with_single_column(self): # see gh-18676, https://bugs.python.org/issue32255 # @@ -27,9 +27,9 @@ def test_to_csv_with_single_column(self): "" 1.0 """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df1.to_csv(path, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected1 df2 = DataFrame([1, None]) @@ -37,31 +37,31 @@ def test_to_csv_with_single_column(self): 1.0 "" """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df2.to_csv(path, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected2 def test_to_csv_defualt_encoding(self): # GH17097 - df = DataFrame({'col': ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) + df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: # the default to_csv encoding is uft-8. 
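As a minimal sketch of the round trip this relies on (the file name below is only illustrative), to_csv writes UTF-8 when no encoding is given, so non-ASCII values survive a plain read_csv:

    import pandas as pd

    df = pd.DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß"]})
    df.to_csv("roundtrip.csv")                 # illustrative path; utf-8 by default
    pd.read_csv("roundtrip.csv", index_col=0)  # reads the same frame back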
df.to_csv(path) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) def test_to_csv_quotechar(self): - df = DataFrame({'col': [1, 2]}) + df = DataFrame({"col": [1, 2]}) expected = """\ "","col" "0","1" "1","2" """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected expected = """\ @@ -70,396 +70,377 @@ def test_to_csv_quotechar(self): $1$,$2$ """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, quotechar="$") - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected - with tm.ensure_clean('test.csv') as path: - with pytest.raises(TypeError, match='quotechar'): + with tm.ensure_clean("test.csv") as path: + with pytest.raises(TypeError, match="quotechar"): df.to_csv(path, quoting=1, quotechar=None) def test_to_csv_doublequote(self): - df = DataFrame({'col': ['a"a', '"bb"']}) + df = DataFrame({"col": ['a"a', '"bb"']}) expected = '''\ "","col" "0","a""a" "1","""bb""" ''' - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected from _csv import Error - with tm.ensure_clean('test.csv') as path: - with pytest.raises(Error, match='escapechar'): + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(Error, match="escapechar"): df.to_csv(path, doublequote=False) # no escapechar set def test_to_csv_escapechar(self): - df = DataFrame({'col': ['a"a', '"bb"']}) - expected = '''\ + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = """\ "","col" "0","a\\"a" "1","\\"bb\\"" -''' +""" - with tm.ensure_clean('test.csv') as path: # QUOTE_ALL - df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') - with open(path, 'r') as f: + with tm.ensure_clean("test.csv") as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + with open(path, "r") as f: assert f.read() == expected - df = DataFrame({'col': ['a,a', ',bb,']}) + df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ ,col 0,a\\,a 1,\\,bb\\, """ - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE - with open(path, 'r') as f: + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + with open(path, "r") as f: assert f.read() == expected def test_csv_to_string(self): - df = DataFrame({'col': [1, 2]}) - expected_rows = [',col', - '0,1', - '1,2'] + df = DataFrame({"col": [1, 2]}) + expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected def test_to_csv_decimal(self): # see gh-781 - df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) + df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) - expected_rows = [',col1,col2,col3', - '0,1,a,10.1'] + expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected_default - expected_rows = [';col1;col2;col3', - '0;1;a;10,1'] - expected_european_excel = tm.convert_rows_list_to_csv_str( - expected_rows) - assert df.to_csv(decimal=',', sep=';') == expected_european_excel + expected_rows = [";col1;col2;col3", "0;1;a;10,1"] + expected_european_excel = 
tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(decimal=",", sep=";") == expected_european_excel - expected_rows = [',col1,col2,col3', - '0,1,a,10.10'] - expected_float_format_default = tm.convert_rows_list_to_csv_str( - expected_rows) - assert df.to_csv(float_format='%.2f') == expected_float_format_default + expected_rows = [",col1,col2,col3", "0,1,a,10.10"] + expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(float_format="%.2f") == expected_float_format_default - expected_rows = [';col1;col2;col3', - '0;1;a;10,10'] + expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(decimal=',', sep=';', - float_format='%.2f') == expected_float_format + assert ( + df.to_csv(decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) # see gh-11553: testing if decimal is taken into account for '0.0' - df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1}) + df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) - expected_rows = ['a,b,c', - '0^0,2^2,1', - '1^1,3^3,1'] + expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False, decimal='^') == expected + assert df.to_csv(index=False, decimal="^") == expected # same but for an index - assert df.set_index('a').to_csv(decimal='^') == expected + assert df.set_index("a").to_csv(decimal="^") == expected # same for a multi-index - assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected + assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected def test_to_csv_float_format(self): # testing if float_format is taken into account for the index # GH 11553 - df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1}) + df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) - expected_rows = ['a,b,c', - '0,2.20,1', - '1,3.30,1'] + expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(float_format='%.2f') == expected + assert df.set_index("a").to_csv(float_format="%.2f") == expected # same for a multi-index - assert df.set_index(['a', 'b']).to_csv( - float_format='%.2f') == expected + assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected def test_to_csv_na_rep(self): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
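A rough sketch of the na_rep behaviour asserted below, assuming default settings otherwise: NaN in the index is rendered with the replacement string just like NaN in the data.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]})
    df.set_index("a").to_csv(na_rep="_")
    # rows: "a,b,c", "0.0,0,2", "_,1,3"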
- df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '0.0,0,2', - '_,1,3'] + df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs - df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '_,0,2', - '_,1,3'] + df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # check if na_rep parameter does not break anything when no NaN - df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '0,0,2', - '0,1,3'] + df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected def test_to_csv_date_format(self): # GH 10209 - df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s') - }) - df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d') - }) - - expected_rows = [',A', - '0,2013-01-01 00:00:00', - '1,2013-01-01 00:00:01', - '2,2013-01-01 00:00:02', - '3,2013-01-01 00:00:03', - '4,2013-01-01 00:00:04'] + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-01 00:00:01", + "2,2013-01-01 00:00:02", + "3,2013-01-01 00:00:03", + "4,2013-01-01 00:00:04", + ] expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) assert df_sec.to_csv() == expected_default_sec - expected_rows = [',A', - '0,2013-01-01 00:00:00', - '1,2013-01-02 00:00:00', - '2,2013-01-03 00:00:00', - '3,2013-01-04 00:00:00', - '4,2013-01-05 00:00:00'] + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-02 00:00:00", + "2,2013-01-03 00:00:00", + "3,2013-01-04 00:00:00", + "4,2013-01-05 00:00:00", + ] expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') == - expected_ymdhms_day) - - expected_rows = [',A', - '0,2013-01-01', - '1,2013-01-01', - '2,2013-01-01', - '3,2013-01-01', - '4,2013-01-01'] + assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-01", + "2,2013-01-01", + "3,2013-01-01", + "4,2013-01-01", + ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec - - expected_rows = [',A', - '0,2013-01-01', - '1,2013-01-02', - '2,2013-01-03', - '3,2013-01-04', - '4,2013-01-05'] + assert 
df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-02", + "2,2013-01-03", + "3,2013-01-04", + "4,2013-01-05", + ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) assert df_day.to_csv() == expected_default_day - assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day + assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day # see gh-7791 # # Testing if date_format parameter is taken into account # for multi-indexed DataFrames. - df_sec['B'] = 0 - df_sec['C'] = 1 + df_sec["B"] = 0 + df_sec["C"] = 1 - expected_rows = ['A,B,C', - '2013-01-01,0,1'] + expected_rows = ["A,B,C", "2013-01-01,0,1"] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B']) - assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') == - expected_ymd_sec) + df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) + assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) - exp_rows = [',1', - ',2', - '0,1'] + exp_rows = [",1", ",2", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['1', '2', '1'] + exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]), - index=pd.MultiIndex.from_arrays([[1], [2]])) + df = DataFrame( + [1], + columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]]), + ) - exp_rows = [',,1', ',,2', '1,2,1'] + exp_rows = [",,1", ",,2", "1,2,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['1', '2', '1'] + exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - df = DataFrame( - [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']])) + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) - exp_rows = [',foo', ',bar', '0,1'] + exp_rows = [",foo", ",bar", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['foo', 'bar', '1'] + exp_rows = ["foo", "bar", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - @pytest.mark.parametrize("ind,expected", [ - (pd.MultiIndex(levels=[[1.0]], - codes=[[0]], - names=["x"]), - "x,data\n1.0,1\n"), - (pd.MultiIndex(levels=[[1.], [2.]], - codes=[[0], [0]], - names=["x", "y"]), - "x,y,data\n1.0,2.0,1\n") - ]) - @pytest.mark.parametrize("klass", [ - pd.DataFrame, pd.Series - ]) + @pytest.mark.parametrize( + "ind,expected", + [ + ( + pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]), + "x,data\n1.0,1\n", + ), + ( + pd.MultiIndex( + levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"] + ), + "x,y,data\n1.0,2.0,1\n", + ), + ], + ) + @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series]) def test_to_csv_single_level_multi_index(self, ind, expected, klass): # see gh-19589 result = klass(pd.Series([1], ind, name="data")).to_csv( - line_terminator="\n", header=True) + line_terminator="\n", header=True + ) assert result == expected def test_to_csv_string_array_ascii(self): # GH 10813 - str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}] + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", 
"qux"]}] df = pd.DataFrame(str_array) - expected_ascii = '''\ + expected_ascii = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" -''' - with tm.ensure_clean('str_test.csv') as path: - df.to_csv(path, encoding='ascii') - with open(path, 'r') as f: +""" + with tm.ensure_clean("str_test.csv") as path: + df.to_csv(path, encoding="ascii") + with open(path, "r") as f: assert f.read() == expected_ascii @pytest.mark.xfail(strict=False) def test_to_csv_string_array_utf8(self): # GH 10813 - str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}] + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = pd.DataFrame(str_array) - expected_utf8 = '''\ + expected_utf8 = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" -''' - with tm.ensure_clean('unicode_test.csv') as path: - df.to_csv(path, encoding='utf-8') - with open(path, 'r') as f: +""" + with tm.ensure_clean("unicode_test.csv") as path: + df.to_csv(path, encoding="utf-8") + with open(path, "r") as f: assert f.read() == expected_utf8 def test_to_csv_string_with_lf(self): # GH 20353 - data = { - 'int': [1, 2, 3], - 'str_lf': ['abc', 'd\nef', 'g\nh\n\ni'] - } + data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = pd.DataFrame(data) - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode('utf-8') + os_linesep = os.linesep.encode("utf-8") expected_noarg = ( - b'int,str_lf' + os_linesep + - b'1,abc' + os_linesep + - b'2,"d\nef"' + os_linesep + - b'3,"g\nh\n\ni"' + os_linesep + b"int,str_lf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\nef"' + + os_linesep + + b'3,"g\nh\n\ni"' + + os_linesep ) df.to_csv(path, index=False) - with open(path, 'rb') as f: + with open(path, "rb") as f: assert f.read() == expected_noarg - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator - expected_lf = ( - b'int,str_lf\n' - b'1,abc\n' - b'2,"d\nef"\n' - b'3,"g\nh\n\ni"\n' - ) - df.to_csv(path, line_terminator='\n', index=False) - with open(path, 'rb') as f: + expected_lf = b"int,str_lf\n" b"1,abc\n" b'2,"d\nef"\n' b'3,"g\nh\n\ni"\n' + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_lf - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = ( - b'int,str_lf\r\n' - b'1,abc\r\n' - b'2,"d\nef"\r\n' - b'3,"g\nh\n\ni"\r\n' + b"int,str_lf\r\n" b"1,abc\r\n" b'2,"d\nef"\r\n' b'3,"g\nh\n\ni"\r\n' ) - df.to_csv(path, line_terminator='\r\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_string_with_crlf(self): # GH 20353 - data = { - 'int': [1, 2, 3], - 'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni'] - } + data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = pd.DataFrame(data) - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode('utf-8') + os_linesep = os.linesep.encode("utf-8") expected_noarg = ( - b'int,str_crlf' + os_linesep + - b'1,abc' + os_linesep + - b'2,"d\r\nef"' + os_linesep + - b'3,"g\r\nh\r\n\r\ni"' + os_linesep + 
b"int,str_crlf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\r\nef"' + + os_linesep + + b'3,"g\r\nh\r\n\r\ni"' + + os_linesep ) df.to_csv(path, index=False) - with open(path, 'rb') as f: + with open(path, "rb") as f: assert f.read() == expected_noarg - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator expected_lf = ( - b'int,str_crlf\n' - b'1,abc\n' - b'2,"d\r\nef"\n' - b'3,"g\r\nh\r\n\r\ni"\n' + b"int,str_crlf\n" b"1,abc\n" b'2,"d\r\nef"\n' b'3,"g\r\nh\r\n\r\ni"\n' ) - df.to_csv(path, line_terminator='\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_lf - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = ( - b'int,str_crlf\r\n' - b'1,abc\r\n' + b"int,str_crlf\r\n" + b"1,abc\r\n" b'2,"d\r\nef"\r\n' b'3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(path, line_terminator='\r\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_stdout_file(self, capsys): # GH 21561 - df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], - columns=['name_1', 'name_2']) - expected_rows = [',name_1,name_2', - '0,foo,bar', - '1,baz,qux'] + df = pd.DataFrame( + [["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"] + ) + expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) - df.to_csv(sys.stdout, encoding='ascii') + df.to_csv(sys.stdout, encoding="ascii") captured = capsys.readouterr() assert captured.out == expected_ascii @@ -467,52 +448,53 @@ def test_to_csv_stdout_file(self, capsys): @pytest.mark.xfail( compat.is_platform_windows(), - reason=("Especially in Windows, file stream should not be passed" - "to csv writer without newline='' option." - "(https://docs.python.org/3.6/library/csv.html#csv.writer)")) + reason=( + "Especially in Windows, file stream should not be passed" + "to csv writer without newline='' option." 
+ "(https://docs.python.org/3.6/library/csv.html#csv.writer)" + ), + ) def test_to_csv_write_to_open_file(self): # GH 21696 - df = pd.DataFrame({'a': ['x', 'y', 'z']}) - expected = '''\ + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected = """\ manual header x y z -''' - with tm.ensure_clean('test.txt') as path: - with open(path, 'w') as f: - f.write('manual header\n') +""" + with tm.ensure_clean("test.txt") as path: + with open(path, "w") as f: + f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected def test_to_csv_write_to_open_file_with_newline_py3(self): # see gh-21696 # see gh-20353 - df = pd.DataFrame({'a': ['x', 'y', 'z']}) - expected_rows = ["x", - "y", - "z"] - expected = ("manual header\n" + - tm.convert_rows_list_to_csv_str(expected_rows)) - with tm.ensure_clean('test.txt') as path: - with open(path, 'w', newline='') as f: - f.write('manual header\n') + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected_rows = ["x", "y", "z"] + expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("test.txt") as path: + with open(path, "w", newline="") as f: + f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, 'rb') as f: - assert f.read() == bytes(expected, 'utf-8') + with open(path, "rb") as f: + assert f.read() == bytes(expected, "utf-8") @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) - def test_to_csv_compression(self, compression_only, - read_infer, to_infer): + def test_to_csv_compression(self, compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only if compression == "zip": - pytest.skip("{compression} is not supported " - "for to_csv".format(compression=compression)) + pytest.skip( + "{compression} is not supported " + "for to_csv".format(compression=compression) + ) # We'll complete file extension subsequently. filename = "test." 
@@ -531,6 +513,5 @@ def test_to_csv_compression(self, compression_only, with tm.ensure_clean(filename) as path: df.to_csv(path, compression=to_compression) - result = pd.read_csv(path, index_col=0, - compression=read_compression) + result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 13eb517fcab6a1..1440b0a6f06f12 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -11,172 +11,200 @@ from pandas.io.formats.excel import CSSToExcelConverter -@pytest.mark.parametrize('css,expected', [ - # FONT - # - name - ('font-family: foo,bar', {'font': {'name': 'foo'}}), - ('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}), - ('font-family: foo,\nbar', {'font': {'name': 'foo'}}), - ('font-family: foo, bar, baz', {'font': {'name': 'foo'}}), - ('font-family: bar, foo', {'font': {'name': 'bar'}}), - ('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}), - ('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}), - ('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}), - ('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}), - # - family - ('font-family: serif', {'font': {'name': 'serif', 'family': 1}}), - ('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}), - ('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}), - ('font-family: roman, sans-serif', {'font': {'name': 'roman', - 'family': 2}}), - ('font-family: roman, sans serif', {'font': {'name': 'roman'}}), - ('font-family: roman, sansserif', {'font': {'name': 'roman'}}), - ('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}), - ('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}), - # - size - ('font-size: 1em', {'font': {'size': 12}}), - ('font-size: xx-small', {'font': {'size': 6}}), - ('font-size: x-small', {'font': {'size': 7.5}}), - ('font-size: small', {'font': {'size': 9.6}}), - ('font-size: medium', {'font': {'size': 12}}), - ('font-size: large', {'font': {'size': 13.5}}), - ('font-size: x-large', {'font': {'size': 18}}), - ('font-size: xx-large', {'font': {'size': 24}}), - ('font-size: 50%', {'font': {'size': 6}}), - # - bold - ('font-weight: 100', {'font': {'bold': False}}), - ('font-weight: 200', {'font': {'bold': False}}), - ('font-weight: 300', {'font': {'bold': False}}), - ('font-weight: 400', {'font': {'bold': False}}), - ('font-weight: normal', {'font': {'bold': False}}), - ('font-weight: lighter', {'font': {'bold': False}}), - ('font-weight: bold', {'font': {'bold': True}}), - ('font-weight: bolder', {'font': {'bold': True}}), - ('font-weight: 700', {'font': {'bold': True}}), - ('font-weight: 800', {'font': {'bold': True}}), - ('font-weight: 900', {'font': {'bold': True}}), - # - italic - ('font-style: italic', {'font': {'italic': True}}), - ('font-style: oblique', {'font': {'italic': True}}), - # - underline - ('text-decoration: underline', - {'font': {'underline': 'single'}}), - ('text-decoration: overline', - {}), - ('text-decoration: none', - {}), - # - strike - ('text-decoration: line-through', - {'font': {'strike': True}}), - ('text-decoration: underline line-through', - {'font': {'strike': True, 'underline': 'single'}}), - ('text-decoration: underline; text-decoration: line-through', - {'font': {'strike': True}}), - # - color - ('color: red', {'font': {'color': 'FF0000'}}), - ('color: #ff0000', 
{'font': {'color': 'FF0000'}}), - ('color: #f0a', {'font': {'color': 'FF00AA'}}), - # - shadow - ('text-shadow: none', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}), - ('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}), - ('text-shadow: 0px -2em', {'font': {'shadow': True}}), - - # FILL - # - color, fillType - ('background-color: red', {'fill': {'fgColor': 'FF0000', - 'patternType': 'solid'}}), - ('background-color: #ff0000', {'fill': {'fgColor': 'FF0000', - 'patternType': 'solid'}}), - ('background-color: #f0a', {'fill': {'fgColor': 'FF00AA', - 'patternType': 'solid'}}), - # BORDER - # - style - ('border-style: solid', - {'border': {'top': {'style': 'medium'}, - 'bottom': {'style': 'medium'}, - 'left': {'style': 'medium'}, - 'right': {'style': 'medium'}}}), - ('border-style: solid; border-width: thin', - {'border': {'top': {'style': 'thin'}, - 'bottom': {'style': 'thin'}, - 'left': {'style': 'thin'}, - 'right': {'style': 'thin'}}}), - - ('border-top-style: solid; border-top-width: thin', - {'border': {'top': {'style': 'thin'}}}), - ('border-top-style: solid; border-top-width: 1pt', - {'border': {'top': {'style': 'thin'}}}), - ('border-top-style: solid', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: medium', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: 2pt', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: thick', - {'border': {'top': {'style': 'thick'}}}), - ('border-top-style: solid; border-top-width: 4pt', - {'border': {'top': {'style': 'thick'}}}), - - ('border-top-style: dotted', - {'border': {'top': {'style': 'mediumDashDotDot'}}}), - ('border-top-style: dotted; border-top-width: thin', - {'border': {'top': {'style': 'dotted'}}}), - ('border-top-style: dashed', - {'border': {'top': {'style': 'mediumDashed'}}}), - ('border-top-style: dashed; border-top-width: thin', - {'border': {'top': {'style': 'dashed'}}}), - ('border-top-style: double', - {'border': {'top': {'style': 'double'}}}), - # - color - ('border-style: solid; border-color: #0000ff', - {'border': {'top': {'style': 'medium', 'color': '0000FF'}, - 'right': {'style': 'medium', 'color': '0000FF'}, - 'bottom': {'style': 'medium', 'color': '0000FF'}, - 'left': {'style': 'medium', 'color': '0000FF'}}}), - ('border-top-style: double; border-top-color: blue', - {'border': {'top': {'style': 'double', 'color': '0000FF'}}}), - ('border-top-style: solid; border-top-color: #06c', - {'border': {'top': {'style': 'medium', 'color': '0066CC'}}}), - # ALIGNMENT - # - horizontal - ('text-align: center', - {'alignment': {'horizontal': 'center'}}), - ('text-align: left', - {'alignment': {'horizontal': 'left'}}), - ('text-align: right', - {'alignment': {'horizontal': 'right'}}), - ('text-align: justify', - {'alignment': {'horizontal': 'justify'}}), - # - vertical - ('vertical-align: top', - {'alignment': {'vertical': 'top'}}), - ('vertical-align: text-top', - {'alignment': {'vertical': 'top'}}), - ('vertical-align: middle', - {'alignment': {'vertical': 'center'}}), - ('vertical-align: bottom', - {'alignment': {'vertical': 'bottom'}}), - 
('vertical-align: text-bottom', - {'alignment': {'vertical': 'bottom'}}), - # - wrap_text - ('white-space: nowrap', - {'alignment': {'wrap_text': False}}), - ('white-space: pre', - {'alignment': {'wrap_text': False}}), - ('white-space: pre-line', - {'alignment': {'wrap_text': False}}), - ('white-space: normal', - {'alignment': {'wrap_text': True}}), - # NUMBER FORMAT - ('number-format: 0%', - {'number_format': {'format_code': '0%'}}), -]) +@pytest.mark.parametrize( + "css,expected", + [ + # FONT + # - name + ("font-family: foo,bar", {"font": {"name": "foo"}}), + ('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}), + ("font-family: foo,\nbar", {"font": {"name": "foo"}}), + ("font-family: foo, bar, baz", {"font": {"name": "foo"}}), + ("font-family: bar, foo", {"font": {"name": "bar"}}), + ("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}), + ("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}), + ('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}), + ('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}), + # - family + ("font-family: serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: Serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}), + ("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}), + ("font-family: roman, sans serif", {"font": {"name": "roman"}}), + ("font-family: roman, sansserif", {"font": {"name": "roman"}}), + ("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}), + ("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}), + # - size + ("font-size: 1em", {"font": {"size": 12}}), + ("font-size: xx-small", {"font": {"size": 6}}), + ("font-size: x-small", {"font": {"size": 7.5}}), + ("font-size: small", {"font": {"size": 9.6}}), + ("font-size: medium", {"font": {"size": 12}}), + ("font-size: large", {"font": {"size": 13.5}}), + ("font-size: x-large", {"font": {"size": 18}}), + ("font-size: xx-large", {"font": {"size": 24}}), + ("font-size: 50%", {"font": {"size": 6}}), + # - bold + ("font-weight: 100", {"font": {"bold": False}}), + ("font-weight: 200", {"font": {"bold": False}}), + ("font-weight: 300", {"font": {"bold": False}}), + ("font-weight: 400", {"font": {"bold": False}}), + ("font-weight: normal", {"font": {"bold": False}}), + ("font-weight: lighter", {"font": {"bold": False}}), + ("font-weight: bold", {"font": {"bold": True}}), + ("font-weight: bolder", {"font": {"bold": True}}), + ("font-weight: 700", {"font": {"bold": True}}), + ("font-weight: 800", {"font": {"bold": True}}), + ("font-weight: 900", {"font": {"bold": True}}), + # - italic + ("font-style: italic", {"font": {"italic": True}}), + ("font-style: oblique", {"font": {"italic": True}}), + # - underline + ("text-decoration: underline", {"font": {"underline": "single"}}), + ("text-decoration: overline", {}), + ("text-decoration: none", {}), + # - strike + ("text-decoration: line-through", {"font": {"strike": True}}), + ( + "text-decoration: underline line-through", + {"font": {"strike": True, "underline": "single"}}, + ), + ( + "text-decoration: underline; text-decoration: line-through", + {"font": {"strike": True}}, + ), + # - color + ("color: red", {"font": {"color": "FF0000"}}), + ("color: #ff0000", {"font": {"color": "FF0000"}}), + ("color: #f0a", {"font": {"color": "FF00AA"}}), + # - shadow + ("text-shadow: none", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": 
False}}), + ("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}), + ("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em", {"font": {"shadow": True}}), + # FILL + # - color, fillType + ( + "background-color: red", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #ff0000", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #f0a", + {"fill": {"fgColor": "FF00AA", "patternType": "solid"}}, + ), + # BORDER + # - style + ( + "border-style: solid", + { + "border": { + "top": {"style": "medium"}, + "bottom": {"style": "medium"}, + "left": {"style": "medium"}, + "right": {"style": "medium"}, + } + }, + ), + ( + "border-style: solid; border-width: thin", + { + "border": { + "top": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + "right": {"style": "thin"}, + } + }, + ), + ( + "border-top-style: solid; border-top-width: thin", + {"border": {"top": {"style": "thin"}}}, + ), + ( + "border-top-style: solid; border-top-width: 1pt", + {"border": {"top": {"style": "thin"}}}, + ), + ("border-top-style: solid", {"border": {"top": {"style": "medium"}}}), + ( + "border-top-style: solid; border-top-width: medium", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: 2pt", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: thick", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: solid; border-top-width: 4pt", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: dotted", + {"border": {"top": {"style": "mediumDashDotDot"}}}, + ), + ( + "border-top-style: dotted; border-top-width: thin", + {"border": {"top": {"style": "dotted"}}}, + ), + ("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}), + ( + "border-top-style: dashed; border-top-width: thin", + {"border": {"top": {"style": "dashed"}}}, + ), + ("border-top-style: double", {"border": {"top": {"style": "double"}}}), + # - color + ( + "border-style: solid; border-color: #0000ff", + { + "border": { + "top": {"style": "medium", "color": "0000FF"}, + "right": {"style": "medium", "color": "0000FF"}, + "bottom": {"style": "medium", "color": "0000FF"}, + "left": {"style": "medium", "color": "0000FF"}, + } + }, + ), + ( + "border-top-style: double; border-top-color: blue", + {"border": {"top": {"style": "double", "color": "0000FF"}}}, + ), + ( + "border-top-style: solid; border-top-color: #06c", + {"border": {"top": {"style": "medium", "color": "0066CC"}}}, + ), + # ALIGNMENT + # - horizontal + ("text-align: center", {"alignment": {"horizontal": "center"}}), + ("text-align: left", {"alignment": {"horizontal": "left"}}), + ("text-align: right", {"alignment": {"horizontal": "right"}}), + ("text-align: justify", {"alignment": {"horizontal": "justify"}}), + # - vertical + ("vertical-align: top", {"alignment": {"vertical": "top"}}), + ("vertical-align: text-top", {"alignment": {"vertical": "top"}}), + ("vertical-align: middle", {"alignment": {"vertical": "center"}}), + ("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}), + ("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}), + # 
- wrap_text + ("white-space: nowrap", {"alignment": {"wrap_text": False}}), + ("white-space: pre", {"alignment": {"wrap_text": False}}), + ("white-space: pre-line", {"alignment": {"wrap_text": False}}), + ("white-space: normal", {"alignment": {"wrap_text": True}}), + # NUMBER FORMAT + ("number-format: 0%", {"number_format": {"format_code": "0%"}}), + ], +) def test_css_to_excel(css, expected): convert = CSSToExcelConverter() assert expected == convert(css) @@ -184,7 +212,8 @@ def test_css_to_excel(css, expected): def test_css_to_excel_multiple(): convert = CSSToExcelConverter() - actual = convert(''' + actual = convert( + """ font-weight: bold; text-decoration: underline; color: red; @@ -192,63 +221,71 @@ def test_css_to_excel_multiple(): text-align: center; vertical-align: top; unused: something; - ''') - assert {"font": {"bold": True, "underline": "single", "color": "FF0000"}, - "border": {"top": {"style": "thin"}, - "right": {"style": "thin"}, - "bottom": {"style": "thin"}, - "left": {"style": "thin"}}, - "alignment": {"horizontal": "center", - "vertical": "top"}} == actual + """ + ) + assert { + "font": {"bold": True, "underline": "single", "color": "FF0000"}, + "border": { + "top": {"style": "thin"}, + "right": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } == actual -@pytest.mark.parametrize('css,inherited,expected', [ - ('font-weight: bold', '', - {'font': {'bold': True}}), - ('', 'font-weight: bold', - {'font': {'bold': True}}), - ('font-weight: bold', 'font-style: italic', - {'font': {'bold': True, 'italic': True}}), - ('font-style: normal', 'font-style: italic', - {'font': {'italic': False}}), - ('font-style: inherit', '', {}), - ('font-style: normal; font-style: inherit', 'font-style: italic', - {'font': {'italic': True}}), -]) +@pytest.mark.parametrize( + "css,inherited,expected", + [ + ("font-weight: bold", "", {"font": {"bold": True}}), + ("", "font-weight: bold", {"font": {"bold": True}}), + ( + "font-weight: bold", + "font-style: italic", + {"font": {"bold": True, "italic": True}}, + ), + ("font-style: normal", "font-style: italic", {"font": {"italic": False}}), + ("font-style: inherit", "", {}), + ( + "font-style: normal; font-style: inherit", + "font-style: italic", + {"font": {"italic": True}}, + ), + ], +) def test_css_to_excel_inherited(css, inherited, expected): convert = CSSToExcelConverter(inherited) assert expected == convert(css) -@pytest.mark.parametrize("input_color,output_color", ( - [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + - [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + - [("#F0F", "FF00FF"), ("#ABC", "AABBCC")]) +@pytest.mark.parametrize( + "input_color,output_color", + ( + [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] + ), ) def test_css_to_excel_good_colors(input_color, output_color): # see gh-18392 - css = ("border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}").format(color=input_color) + css = ( + "border-top-color: {color}; " + "border-right-color: {color}; " + "border-bottom-color: {color}; " + "border-left-color: {color}; " + "background-color: {color}; " + "color: {color}" + ).format(color=input_color) expected = dict() - 
expected["fill"] = { - "patternType": "solid", - "fgColor": output_color - } + expected["fill"] = {"patternType": "solid", "fgColor": output_color} - expected["font"] = { - "color": output_color - } + expected["font"] = {"color": output_color} expected["border"] = { - k: { - "color": output_color, - } for k in ("top", "right", "bottom", "left") + k: {"color": output_color} for k in ("top", "right", "bottom", "left") } with tm.assert_produces_warning(None): @@ -259,19 +296,19 @@ def test_css_to_excel_good_colors(input_color, output_color): @pytest.mark.parametrize("input_color", [None, "not-a-color"]) def test_css_to_excel_bad_colors(input_color): # see gh-18392 - css = ("border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}").format(color=input_color) + css = ( + "border-top-color: {color}; " + "border-right-color: {color}; " + "border-bottom-color: {color}; " + "border-left-color: {color}; " + "background-color: {color}; " + "color: {color}" + ).format(color=input_color) expected = dict() if input_color is not None: - expected["fill"] = { - "patternType": "solid" - } + expected["fill"] = {"patternType": "solid"} with tm.assert_produces_warning(CSSWarning): convert = CSSToExcelConverter() diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 97d51f079fb2d2..448e869df950dd 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -18,7 +18,8 @@ " ea commodo consequat. Duis aute irure dolor in reprehenderit in" " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" " sint occaecat cupidatat non proident, sunt in culpa qui officia" - " deserunt mollit anim id est laborum.") + " deserunt mollit anim id est laborum." +) def expected_html(datapath, name): @@ -36,24 +37,25 @@ def expected_html(datapath, name): ------- str : contents of HTML file. 
""" - filename = '.'.join([name, 'html']) - filepath = datapath('io', 'formats', 'data', 'html', filename) - with open(filepath, encoding='utf-8') as f: + filename = ".".join([name, "html"]) + filepath = datapath("io", "formats", "data", "html", filename) + with open(filepath, encoding="utf-8") as f: html = f.read() return html.rstrip() -@pytest.fixture(params=['mixed', 'empty']) +@pytest.fixture(params=["mixed", "empty"]) def biggie_df_fixture(request): """Fixture for a big mixed Dataframe and an empty Dataframe""" - if request.param == 'mixed': - df = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=np.arange(200)) - df.loc[:20, 'A'] = np.nan - df.loc[:20, 'B'] = np.nan + if request.param == "mixed": + df = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) + df.loc[:20, "A"] = np.nan + df.loc[:20, "B"] = np.nan return df - elif request.param == 'empty': + elif request.param == "empty": df = DataFrame(index=np.arange(200)) return df @@ -63,7 +65,7 @@ def justify(request): return request.param -@pytest.mark.parametrize('col_space', [30, 50]) +@pytest.mark.parametrize("col_space", [30, 50]) def test_to_html_with_col_space(col_space): df = DataFrame(np.random.random(size=(1, 3))) # check that col_space affects HTML generation @@ -78,16 +80,19 @@ def test_to_html_with_col_space(col_space): def test_to_html_with_empty_string_label(): # GH 3547, to_html regards empty string labels as repeated labels - data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]} - df = DataFrame(data).set_index(['c1', 'c2']) + data = {"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]} + df = DataFrame(data).set_index(["c1", "c2"]) result = df.to_html() assert "rowspan" not in result -@pytest.mark.parametrize('df,expected', [ - (DataFrame({'\u03c3': np.arange(10.)}), 'unicode_1'), - (DataFrame({'A': ['\u03c3']}), 'unicode_2') -]) +@pytest.mark.parametrize( + "df,expected", + [ + (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"), + (DataFrame({"A": ["\u03c3"]}), "unicode_2"), + ], +) def test_to_html_unicode(df, expected, datapath): expected = expected_html(datapath, expected) result = df.to_html() @@ -96,99 +101,112 @@ def test_to_html_unicode(df, expected, datapath): def test_to_html_decimal(datapath): # GH 12031 - df = DataFrame({'A': [6.0, 3.1, 2.2]}) - result = df.to_html(decimal=',') - expected = expected_html(datapath, 'gh12031_expected_output') + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + result = df.to_html(decimal=",") + expected = expected_html(datapath, "gh12031_expected_output") assert result == expected -@pytest.mark.parametrize('kwargs,string,expected', [ - (dict(), "", 'escaped'), - (dict(escape=False), "bold", 'escape_disabled') -]) +@pytest.mark.parametrize( + "kwargs,string,expected", + [ + (dict(), "", "escaped"), + (dict(escape=False), "bold", "escape_disabled"), + ], +) def test_to_html_escaped(kwargs, string, expected, datapath): - a = 'strl2': {a: string, - b: string}} + test_dict = {"col2": {a: string, b: string}} result = DataFrame(test_dict).to_html(**kwargs) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('index_is_named', [True, False]) +@pytest.mark.parametrize("index_is_named", [True, False]) def test_to_html_multiindex_index_false(index_is_named, datapath): # GH 8452 - df = DataFrame({ - 'a': range(2), - 'b': range(3, 5), - 'c': range(5, 7), - 'd': range(3, 5) - }) - df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) + df = DataFrame( + {"a": 
range(2), "b": range(3, 5), "c": range(5, 7), "d": range(3, 5)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) if index_is_named: - df.index = Index(df.index.values, name='idx') + df.index = Index(df.index.values, name="idx") result = df.to_html(index=False) - expected = expected_html(datapath, 'gh8452_expected_output') + expected = expected_html(datapath, "gh8452_expected_output") assert result == expected -@pytest.mark.parametrize('multi_sparse,expected', [ - (False, 'multiindex_sparsify_false_multi_sparse_1'), - (False, 'multiindex_sparsify_false_multi_sparse_2'), - (True, 'multiindex_sparsify_1'), - (True, 'multiindex_sparsify_2') -]) +@pytest.mark.parametrize( + "multi_sparse,expected", + [ + (False, "multiindex_sparsify_false_multi_sparse_1"), + (False, "multiindex_sparsify_false_multi_sparse_2"), + (True, "multiindex_sparsify_1"), + (True, "multiindex_sparsify_2"), + ], +) def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath): - index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], - names=['foo', None]) + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], names=["foo", None]) df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) - if expected.endswith('2'): + if expected.endswith("2"): df.columns = index[::2] - with option_context('display.multi_sparse', multi_sparse): + with option_context("display.multi_sparse", multi_sparse): result = df.to_html() expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('max_rows,expected', [ - (60, 'gh14882_expected_output_1'), - - # Test that ... appears in a middle level - (56, 'gh14882_expected_output_2') -]) +@pytest.mark.parametrize( + "max_rows,expected", + [ + (60, "gh14882_expected_output_1"), + # Test that ... 
appears in a middle level + (56, "gh14882_expected_output_2"), + ], +) def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath): # GH 14882 - Issue on truncation with odd length DataFrame - index = MultiIndex.from_product([[100, 200, 300], - [10, 20, 30], - [1, 2, 3, 4, 5, 6, 7]], - names=['a', 'b', 'c']) - df = DataFrame({'n': range(len(index))}, index=index) + index = MultiIndex.from_product( + [[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], names=["a", "b", "c"] + ) + df = DataFrame({"n": range(len(index))}, index=index) result = df.to_html(max_rows=max_rows) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('df,formatters,expected', [ - (DataFrame( - [[0, 1], [2, 3], [4, 5], [6, 7]], - columns=['foo', None], index=np.arange(4)), - {'__index__': lambda x: 'abcd' [x]}, - 'index_formatter'), - - (DataFrame( - {'months': [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), - {'months': lambda x: x.strftime('%Y-%m')}, - 'datetime64_monthformatter'), - - (DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')}), - {'hod': lambda x: x.strftime('%H:%M')}, - 'datetime64_hourformatter') -]) +@pytest.mark.parametrize( + "df,formatters,expected", + [ + ( + DataFrame( + [[0, 1], [2, 3], [4, 5], [6, 7]], + columns=["foo", None], + index=np.arange(4), + ), + {"__index__": lambda x: "abcd"[x]}, + "index_formatter", + ), + ( + DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), + {"months": lambda x: x.strftime("%Y-%m")}, + "datetime64_monthformatter", + ), + ( + DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ), + {"hod": lambda x: x.strftime("%H:%M")}, + "datetime64_hourformatter", + ), + ], +) def test_to_html_formatters(df, formatters, expected, datapath): expected = expected_html(datapath, expected) result = df.to_html(formatters=formatters) @@ -196,55 +214,63 @@ def test_to_html_formatters(df, formatters, expected, datapath): def test_to_html_regression_GH6098(): - df = DataFrame({ - 'clé1': ['a', 'a', 'b', 'b', 'a'], - 'clé2': ['1er', '2ème', '1er', '2ème', '1er'], - 'données1': np.random.randn(5), - 'données2': np.random.randn(5)}) + df = DataFrame( + { + "clé1": ["a", "a", "b", "b", "a"], + "clé2": ["1er", "2ème", "1er", "2ème", "1er"], + "données1": np.random.randn(5), + "données2": np.random.randn(5), + } + ) # it works - df.pivot_table(index=['clé1'], columns=['clé2'])._repr_html_() + df.pivot_table(index=["clé1"], columns=["clé2"])._repr_html_() def test_to_html_truncate(datapath): - index = pd.date_range(start='20010101', freq='D', periods=20) + index = pd.date_range(start="20010101", freq="D", periods=20) df = DataFrame(index=index, columns=range(20)) result = df.to_html(max_rows=8, max_cols=4) - expected = expected_html(datapath, 'truncate') + expected = expected_html(datapath, "truncate") assert result == expected -@pytest.mark.parametrize('sparsify,expected', [ - (True, 'truncate_multi_index'), - (False, 'truncate_multi_index_sparse_off') -]) +@pytest.mark.parametrize( + "sparsify,expected", + [(True, "truncate_multi_index"), (False, "truncate_multi_index_sparse_off")], +) def test_to_html_truncate_multi_index(sparsify, expected, datapath): - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] df = 
DataFrame(index=arrays, columns=arrays) result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('option,result,expected', [ - (None, lambda df: df.to_html(), '1'), - (None, lambda df: df.to_html(border=0), '0'), - (0, lambda df: df.to_html(), '0'), - (0, lambda df: df._repr_html_(), '0'), -]) +@pytest.mark.parametrize( + "option,result,expected", + [ + (None, lambda df: df.to_html(), "1"), + (None, lambda df: df.to_html(border=0), "0"), + (0, lambda df: df.to_html(), "0"), + (0, lambda df: df._repr_html_(), "0"), + ], +) def test_to_html_border(option, result, expected): - df = DataFrame({'A': [1, 2]}) + df = DataFrame({"A": [1, 2]}) if option is None: result = result(df) else: - with option_context('display.html.border', option): + with option_context("display.html.border", option): result = result(df) expected = 'border="{}"'.format(expected) assert expected in result -@pytest.mark.parametrize('biggie_df_fixture', ['mixed'], indirect=True) +@pytest.mark.parametrize("biggie_df_fixture", ["mixed"], indirect=True) def test_to_html(biggie_df_fixture): # TODO: split this test df = biggie_df_fixture @@ -257,15 +283,14 @@ def test_to_html(biggie_df_fixture): assert isinstance(s, str) - df.to_html(columns=['B', 'A'], col_space=17) - df.to_html(columns=['B', 'A'], - formatters={'A': lambda x: '{x:.1f}'.format(x=x)}) + df.to_html(columns=["B", "A"], col_space=17) + df.to_html(columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)}) - df.to_html(columns=['B', 'A'], float_format=str) - df.to_html(columns=['B', 'A'], col_space=12, float_format=str) + df.to_html(columns=["B", "A"], float_format=str) + df.to_html(columns=["B", "A"], col_space=12, float_format=str) -@pytest.mark.parametrize('biggie_df_fixture', ['empty'], indirect=True) +@pytest.mark.parametrize("biggie_df_fixture", ["empty"], indirect=True) def test_to_html_empty_dataframe(biggie_df_fixture): df = biggie_df_fixture df.to_html() @@ -274,55 +299,62 @@ def test_to_html_empty_dataframe(biggie_df_fixture): def test_to_html_filename(biggie_df_fixture, tmpdir): df = biggie_df_fixture expected = df.to_html() - path = tmpdir.join('test.html') + path = tmpdir.join("test.html") df.to_html(path) result = path.read() assert result == expected def test_to_html_with_no_bold(): - df = DataFrame({'x': np.random.randn(5)}) + df = DataFrame({"x": np.random.randn(5)}) html = df.to_html(bold_rows=False) result = html[html.find("")] - assert 'B' not in result - - -@pytest.mark.parametrize('columns,justify,expected', [ - (MultiIndex.from_tuples( - list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), - names=['CL0', 'CL1']), - 'left', - 'multiindex_1'), - - (MultiIndex.from_tuples( - list(zip(range(4), np.mod(range(4), 2)))), - 'right', - 'multiindex_2') -]) + result = float_frame.to_html(columns=["A"]) + assert "B" not in result + + +@pytest.mark.parametrize( + "columns,justify,expected", + [ + ( + MultiIndex.from_tuples( + list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), + names=["CL0", "CL1"], + ), + "left", + "multiindex_1", + ), + ( + MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))), + "right", + "multiindex_2", + ), + ], +) def test_to_html_multiindex(columns, justify, expected, datapath): - df = DataFrame([list('abcd'), list('efgh')], columns=columns) + df = DataFrame([list("abcd"), list("efgh")], columns=columns) result = df.to_html(justify=justify) expected = expected_html(datapath, expected) assert result == 
expected def test_to_html_justify(justify, datapath): - df = DataFrame({'A': [6, 30000, 2], - 'B': [1, 2, 70000], - 'C': [223442, 0, 1]}, - columns=['A', 'B', 'C']) + df = DataFrame( + {"A": [6, 30000, 2], "B": [1, 2, 70000], "C": [223442, 0, 1]}, + columns=["A", "B", "C"], + ) result = df.to_html(justify=justify) - expected = expected_html(datapath, 'justify').format(justify=justify) + expected = expected_html(datapath, "justify").format(justify=justify) assert result == expected -@pytest.mark.parametrize("justify", ["super-right", "small-left", - "noinherit", "tiny", "pandas"]) +@pytest.mark.parametrize( + "justify", ["super-right", "small-left", "noinherit", "tiny", "pandas"] +) def test_to_html_invalid_justify(justify): # GH 17527 df = DataFrame() @@ -334,50 +366,47 @@ def test_to_html_invalid_justify(justify): def test_to_html_index(datapath): # TODO: split this test - index = ['foo', 'bar', 'baz'] - df = DataFrame({'A': [1, 2, 3], - 'B': [1.2, 3.4, 5.6], - 'C': ['one', 'two', np.nan]}, - columns=['A', 'B', 'C'], - index=index) - expected_with_index = expected_html(datapath, 'index_1') + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + expected_with_index = expected_html(datapath, "index_1") assert df.to_html() == expected_with_index - expected_without_index = expected_html(datapath, 'index_2') + expected_without_index = expected_html(datapath, "index_2") result = df.to_html(index=False) for i in index: assert i not in result assert result == expected_without_index - df.index = Index(['foo', 'bar', 'baz'], name='idx') - expected_with_index = expected_html(datapath, 'index_3') + df.index = Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") assert df.to_html() == expected_with_index assert df.to_html(index=False) == expected_without_index - tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] df.index = MultiIndex.from_tuples(tuples) - expected_with_index = expected_html(datapath, 'index_4') + expected_with_index = expected_html(datapath, "index_4") assert df.to_html() == expected_with_index result = df.to_html(index=False) - for i in ['foo', 'bar', 'car', 'bike']: + for i in ["foo", "bar", "car", "bike"]: assert i not in result # must be the same result as normal index assert result == expected_without_index - df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2']) - expected_with_index = expected_html(datapath, 'index_5') + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") assert df.to_html() == expected_with_index assert df.to_html(index=False) == expected_without_index -@pytest.mark.parametrize('classes', [ - "sortable draggable", - ["sortable", "draggable"] -]) +@pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) def test_to_html_with_classes(classes, datapath): df = DataFrame() - expected = expected_html(datapath, 'with_classes') + expected = expected_html(datapath, "with_classes") result = df.to_html(classes=classes) assert result == expected @@ -386,160 +415,195 @@ def test_to_html_no_index_max_rows(datapath): # GH 14998 df = DataFrame({"A": [1, 2, 3, 4]}) result = df.to_html(index=False, max_rows=1) - expected = expected_html(datapath, 'gh14998_expected_output') + expected = expected_html(datapath, "gh14998_expected_output") 
assert result == expected def test_to_html_multiindex_max_cols(datapath): # GH 6131 - index = MultiIndex(levels=[['ba', 'bb', 'bc'], ['ca', 'cb', 'cc']], - codes=[[0, 1, 2], [0, 1, 2]], - names=['b', 'c']) - columns = MultiIndex(levels=[['d'], ['aa', 'ab', 'ac']], - codes=[[0, 0, 0], [0, 1, 2]], - names=[None, 'a']) + index = MultiIndex( + levels=[["ba", "bb", "bc"], ["ca", "cb", "cc"]], + codes=[[0, 1, 2], [0, 1, 2]], + names=["b", "c"], + ) + columns = MultiIndex( + levels=[["d"], ["aa", "ab", "ac"]], + codes=[[0, 0, 0], [0, 1, 2]], + names=[None, "a"], + ) data = np.array( - [[1., np.nan, np.nan], [np.nan, 2., np.nan], [np.nan, np.nan, 3.]]) + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]] + ) df = DataFrame(data, index, columns) result = df.to_html(max_cols=2) - expected = expected_html(datapath, 'gh6131_expected_output') + expected = expected_html(datapath, "gh6131_expected_output") assert result == expected def test_to_html_multi_indexes_index_false(datapath): # GH 22579 - df = DataFrame({'a': range(10), 'b': range(10, 20), 'c': range(10, 20), - 'd': range(10, 20)}) - df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - df.index = MultiIndex.from_product([['a', 'b'], - ['c', 'd', 'e', 'f', 'g']]) + df = DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(10, 20), "d": range(10, 20)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + df.index = MultiIndex.from_product([["a", "b"], ["c", "d", "e", "f", "g"]]) result = df.to_html(index=False) - expected = expected_html(datapath, 'gh22579_expected_output') + expected = expected_html(datapath, "gh22579_expected_output") assert result == expected -@pytest.mark.parametrize('index_names', [True, False]) -@pytest.mark.parametrize('header', [True, False]) -@pytest.mark.parametrize('index', [True, False]) -@pytest.mark.parametrize('column_index, column_type', [ - (Index([0, 1]), 'unnamed_standard'), - (Index([0, 1], name='columns.name'), 'named_standard'), - (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a'], ['b', 'c']], names=['columns.name.0', - 'columns.name.1']), 'named_multi') -]) -@pytest.mark.parametrize('row_index, row_type', [ - (Index([0, 1]), 'unnamed_standard'), - (Index([0, 1], name='index.name'), 'named_standard'), - (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a'], ['b', 'c']], names=['index.name.0', - 'index.name.1']), 'named_multi') -]) +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="columns.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["columns.name.0", "columns.name.1"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="index.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["index.name.0", "index.name.1"] + ), + "named_multi", + ), + ], +) def test_to_html_basic_alignment( - datapath, row_index, row_type, column_index, column_type, - index, header, index_names): + datapath, row_index, row_type, column_index, column_type, 
index, header, index_names +): # GH 22747, GH 22579 - df = DataFrame(np.zeros((2, 2), dtype=int), - index=row_index, columns=column_index) - result = df.to_html( - index=index, header=header, index_names=index_names) + df = DataFrame(np.zeros((2, 2), dtype=int), index=row_index, columns=column_index) + result = df.to_html(index=index, header=header, index_names=index_names) if not index: - row_type = 'none' - elif not index_names and row_type.startswith('named'): - row_type = 'un' + row_type + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type if not header: - column_type = 'none' - elif not index_names and column_type.startswith('named'): - column_type = 'un' + column_type + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type - filename = 'index_' + row_type + '_columns_' + column_type + filename = "index_" + row_type + "_columns_" + column_type expected = expected_html(datapath, filename) assert result == expected -@pytest.mark.parametrize('index_names', [True, False]) -@pytest.mark.parametrize('header', [True, False]) -@pytest.mark.parametrize('index', [True, False]) -@pytest.mark.parametrize('column_index, column_type', [ - (Index(np.arange(8)), 'unnamed_standard'), - (Index(np.arange(8), name='columns.name'), 'named_standard'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']), - 'named_multi') -]) -@pytest.mark.parametrize('row_index, row_type', [ - (Index(np.arange(8)), 'unnamed_standard'), - (Index(np.arange(8), name='index.name'), 'named_standard'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']), - 'named_multi') -]) +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="columns.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="index.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) def test_to_html_alignment_with_truncation( - datapath, row_index, row_type, column_index, column_type, - index, header, index_names): + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): # GH 22747, GH 22579 - df = DataFrame(np.arange(64).reshape(8, 8), - index=row_index, columns=column_index) + df = DataFrame(np.arange(64).reshape(8, 8), index=row_index, columns=column_index) result = df.to_html( - max_rows=4, max_cols=4, - index=index, header=header, index_names=index_names) + max_rows=4, max_cols=4, index=index, header=header, index_names=index_names + ) if not index: - row_type = 'none' - elif not index_names and 
row_type.startswith('named'): - row_type = 'un' + row_type + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type if not header: - column_type = 'none' - elif not index_names and column_type.startswith('named'): - column_type = 'un' + column_type + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type - filename = 'trunc_df_index_' + row_type + '_columns_' + column_type + filename = "trunc_df_index_" + row_type + "_columns_" + column_type expected = expected_html(datapath, filename) assert result == expected -@pytest.mark.parametrize('index', [False, 0]) +@pytest.mark.parametrize("index", [False, 0]) def test_to_html_truncation_index_false_max_rows(datapath, index): # GH 15019 - data = [[1.764052, 0.400157], - [0.978738, 2.240893], - [1.867558, -0.977278], - [0.950088, -0.151357], - [-0.103219, 0.410599]] + data = [ + [1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599], + ] df = DataFrame(data) result = df.to_html(max_rows=4, index=index) - expected = expected_html(datapath, 'gh15019_expected_output') + expected = expected_html(datapath, "gh15019_expected_output") assert result == expected -@pytest.mark.parametrize('index', [False, 0]) -@pytest.mark.parametrize('col_index_named, expected_output', [ - (False, 'gh22783_expected_output'), - (True, 'gh22783_named_columns_index') -]) +@pytest.mark.parametrize("index", [False, 0]) +@pytest.mark.parametrize( + "col_index_named, expected_output", + [(False, "gh22783_expected_output"), (True, "gh22783_named_columns_index")], +) def test_to_html_truncation_index_false_max_cols( - datapath, index, col_index_named, expected_output): + datapath, index, col_index_named, expected_output +): # GH 22783 - data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558], - [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]] + data = [ + [1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599], + ] df = DataFrame(data) if col_index_named: - df.columns.rename('columns.name', inplace=True) + df.columns.rename("columns.name", inplace=True) result = df.to_html(max_cols=4, index=index) expected = expected_html(datapath, expected_output) assert result == expected -@pytest.mark.parametrize('notebook', [True, False]) +@pytest.mark.parametrize("notebook", [True, False]) def test_to_html_notebook_has_style(notebook): df = DataFrame({"A": [1, 2, 3]}) result = df.to_html(notebook=notebook) @@ -556,59 +620,62 @@ def test_to_html_notebook_has_style(notebook): def test_to_html_with_index_names_false(): # GH 16493 - df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'], - name='myindexname')) + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) result = df.to_html(index_names=False) - assert 'myindexname' not in result + assert "myindexname" not in result def test_to_html_with_id(): # GH 8496 - df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'], - name='myindexname')) + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) result = df.to_html(index_names=False, table_id="TEST_ID") assert ' id="TEST_ID"' in result -@pytest.mark.parametrize('value,float_format,expected', [ - (0.19999, '%.3f', 'gh21625_expected_output'), - (100.0, '%.0f', 'gh22270_expected_output'), -]) -def test_to_html_float_format_no_fixed_width( - value, float_format, expected, datapath): +@pytest.mark.parametrize( + 
"value,float_format,expected", + [ + (0.19999, "%.3f", "gh21625_expected_output"), + (100.0, "%.0f", "gh22270_expected_output"), + ], +) +def test_to_html_float_format_no_fixed_width(value, float_format, expected, datapath): # GH 21625, GH 22270 - df = DataFrame({'x': [value]}) + df = DataFrame({"x": [value]}) expected = expected_html(datapath, expected) result = df.to_html(float_format=float_format) assert result == expected -@pytest.mark.parametrize("render_links,expected", [ - (True, 'render_links_true'), - (False, 'render_links_false'), -]) +@pytest.mark.parametrize( + "render_links,expected", + [(True, "render_links_true"), (False, "render_links_false")], +) def test_to_html_render_links(render_links, expected, datapath): # GH 2679 data = [ - [0, 'http://pandas.pydata.org/?q1=a&q2=b', 'pydata.org'], - [0, 'www.pydata.org', 'pydata.org'] + [0, "http://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "www.pydata.org", "pydata.org"], ] - df = DataFrame(data, columns=['foo', 'bar', None]) + df = DataFrame(data, columns=["foo", "bar", None]) result = df.to_html(render_links=render_links) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('method,expected', [ - ('to_html', lambda x:lorem_ipsum), - ('_repr_html_', lambda x:lorem_ipsum[:x - 4] + '...') # regression case -]) -@pytest.mark.parametrize('max_colwidth', [10, 20, 50, 100]) +@pytest.mark.parametrize( + "method,expected", + [ + ("to_html", lambda x: lorem_ipsum), + ("_repr_html_", lambda x: lorem_ipsum[: x - 4] + "..."), # regression case + ], +) +@pytest.mark.parametrize("max_colwidth", [10, 20, 50, 100]) def test_ignore_display_max_colwidth(method, expected, max_colwidth): # see gh-17004 df = DataFrame([lorem_ipsum]) - with pd.option_context('display.max_colwidth', max_colwidth): + with pd.option_context("display.max_colwidth", max_colwidth): result = getattr(df, method)() expected = expected(max_colwidth) assert expected in result @@ -627,22 +694,22 @@ def test_to_html_invalid_classes_type(classes): def test_to_html_round_column_headers(): # GH 17280 df = DataFrame([1], columns=[0.55555]) - with pd.option_context('display.precision', 3): + with pd.option_context("display.precision", 3): html = df.to_html(notebook=False) notebook = df.to_html(notebook=True) assert "0.55555" in html assert "0.556" in notebook -@pytest.mark.parametrize("unit", ['100px', '10%', '5em', 150]) +@pytest.mark.parametrize("unit", ["100px", "10%", "5em", 150]) def test_to_html_with_col_space_units(unit): # GH 25941 df = DataFrame(np.random.random(size=(1, 3))) result = df.to_html(col_space=unit) - result = result.split('tbody')[0] + result = result.split("tbody")[0] hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] if isinstance(unit, int): - unit = str(unit) + 'px' + unit = str(unit) + "px" for h in hdrs: expected = ''.format(unit=unit) assert expected in h diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index b9f28ec36d021a..a8a6a96f60d606 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -9,32 +9,31 @@ class TestToLatex: - def test_to_latex_filename(self, float_frame): - with tm.ensure_clean('test.tex') as path: + with tm.ensure_clean("test.tex") as path: float_frame.to_latex(path) - with open(path, 'r') as f: + with open(path, "r") as f: assert float_frame.to_latex() == f.read() # test with utf-8 and encoding option (GH 7061) - df = DataFrame([['au\xdfgangen']]) - with 
tm.ensure_clean('test.tex') as path: - df.to_latex(path, encoding='utf-8') - with codecs.open(path, 'r', encoding='utf-8') as f: + df = DataFrame([["au\xdfgangen"]]) + with tm.ensure_clean("test.tex") as path: + df.to_latex(path, encoding="utf-8") + with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() # test with utf-8 without encoding option - with tm.ensure_clean('test.tex') as path: + with tm.ensure_clean("test.tex") as path: df.to_latex(path) - with codecs.open(path, 'r', encoding='utf-8') as f: + with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() def test_to_latex(self, float_frame): # it works! float_frame.to_latex() - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex() withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -63,10 +62,10 @@ def test_to_latex(self, float_frame): def test_to_latex_format(self, float_frame): # GH Bug #9402 - float_frame.to_latex(column_format='ccc') + float_frame.to_latex(column_format="ccc") - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(column_format='ccc') + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(column_format="ccc") withindex_expected = r"""\begin{tabular}{ccc} \toprule {} & a & b \\ @@ -103,19 +102,26 @@ def test_to_latex_empty(self): assert result == expected def test_to_latex_with_formatters(self): - df = DataFrame({'datetime64': [datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3)], - 'float': [1.0, 2.0, 3.0], - 'int': [1, 2, 3], - 'object': [(1, 2), True, False], - }) - - formatters = {'datetime64': lambda x: x.strftime('%Y-%m'), - 'float': lambda x: '[{x: 4.1f}]'.format(x=x), - 'int': lambda x: '0x{x:x}'.format(x=x), - 'object': lambda x: '-{x!s}-'.format(x=x), - '__index__': lambda x: 'index: {x}'.format(x=x)} + df = DataFrame( + { + "datetime64": [ + datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3), + ], + "float": [1.0, 2.0, 3.0], + "int": [1, 2, 3], + "object": [(1, 2), True, False], + } + ) + + formatters = { + "datetime64": lambda x: x.strftime("%Y-%m"), + "float": lambda x: "[{x: 4.1f}]".format(x=x), + "int": lambda x: "0x{x:x}".format(x=x), + "object": lambda x: "-{x!s}-".format(x=x), + "__index__": lambda x: "index: {x}".format(x=x), + } result = df.to_latex(formatters=dict(formatters)) expected = r"""\begin{tabular}{llrrl} @@ -131,7 +137,7 @@ def test_to_latex_with_formatters(self): assert result == expected def test_to_latex_multiindex(self): - df = DataFrame({('x', 'y'): ['a']}) + df = DataFrame({("x", "y"): ["a"]}) result = df.to_latex() expected = r"""\begin{tabular}{ll} \toprule @@ -157,13 +163,15 @@ def test_to_latex_multiindex(self): assert result == expected - df = DataFrame.from_dict({ - ('c1', 0): pd.Series({x: x for x in range(4)}), - ('c1', 1): pd.Series({x: x + 4 for x in range(4)}), - ('c2', 0): pd.Series({x: x for x in range(4)}), - ('c2', 1): pd.Series({x: x + 4 for x in range(4)}), - ('c3', 0): pd.Series({x: x for x in range(4)}), - }).T + df = DataFrame.from_dict( + { + ("c1", 0): pd.Series({x: x for x in range(4)}), + ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c2", 0): pd.Series({x: x for x in range(4)}), + ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c3", 0): pd.Series({x: x for x in range(4)}), + } + ).T result = df.to_latex() expected = r"""\begin{tabular}{llrrrr} \toprule @@ -182,7 +190,7 @@ def 
test_to_latex_multiindex(self): # GH 14184 df = df.T - df.columns.names = ['a', 'b'] + df.columns.names = ["a", "b"] result = df.to_latex() expected = r"""\begin{tabular}{lrrrrr} \toprule @@ -199,10 +207,8 @@ def test_to_latex_multiindex(self): assert result == expected # GH 10660 - df = pd.DataFrame({'a': [0, 0, 1, 1], - 'b': list('abab'), - 'c': [1, 2, 3, 4]}) - result = df.set_index(['a', 'b']).to_latex() + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + result = df.set_index(["a", "b"]).to_latex() expected = r"""\begin{tabular}{llr} \toprule & & c \\ @@ -218,7 +224,7 @@ def test_to_latex_multiindex(self): assert result == expected - result = df.groupby('a').describe().to_latex() + result = df.groupby("a").describe().to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ @@ -241,8 +247,9 @@ def test_to_latex_multiindex_dupe_level(self): # ONLY happen if all higher order indices (to the left) are # equal too. In this test, 'c' has to be printed both times # because the higher order index 'A' != 'B'. - df = pd.DataFrame(index=pd.MultiIndex.from_tuples( - [('A', 'c'), ('B', 'c')]), columns=['col']) + df = pd.DataFrame( + index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] + ) result = df.to_latex() expected = r"""\begin{tabular}{lll} \toprule @@ -256,13 +263,15 @@ def test_to_latex_multiindex_dupe_level(self): assert result == expected def test_to_latex_multicolumnrow(self): - df = pd.DataFrame({ - ('c1', 0): {x: x for x in range(5)}, - ('c1', 1): {x: x + 5 for x in range(5)}, - ('c2', 0): {x: x for x in range(5)}, - ('c2', 1): {x: x + 5 for x in range(5)}, - ('c3', 0): {x: x for x in range(5)} - }) + df = pd.DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) result = df.to_latex() expected = r"""\begin{tabular}{lrrrrr} \toprule @@ -313,8 +322,7 @@ def test_to_latex_multicolumnrow(self): assert result == expected df.index = df.T.index - result = df.T.to_latex(multirow=True, multicolumn=True, - multicolumn_format='c') + result = df.T.to_latex(multirow=True, multicolumn=True, multicolumn_format="c") expected = r"""\begin{tabular}{llrrrrr} \toprule & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ @@ -333,19 +341,15 @@ def test_to_latex_multicolumnrow(self): assert result == expected def test_to_latex_escape(self): - a = 'a' - b = 'b' + a = "a" + b = "b" - test_dict = {'co$e^x$': {a: "a", - b: "b"}, - 'co^l1': {a: "a", - b: "b"}} + test_dict = {"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}} unescaped_result = DataFrame(test_dict).to_latex(escape=False) - escaped_result = DataFrame(test_dict).to_latex( - ) # default: escape=True + escaped_result = DataFrame(test_dict).to_latex() # default: escape=True - unescaped_expected = r'''\begin{tabular}{lll} + unescaped_expected = r"""\begin{tabular}{lll} \toprule {} & co$e^x$ & co^l1 \\ \midrule @@ -353,9 +357,9 @@ def test_to_latex_escape(self): b & b & b \\ \bottomrule \end{tabular} -''' +""" - escaped_expected = r'''\begin{tabular}{lll} + escaped_expected = r"""\begin{tabular}{lll} \toprule {} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ \midrule @@ -363,7 +367,7 @@ def test_to_latex_escape(self): b & b & b \\ \bottomrule \end{tabular} -''' +""" assert unescaped_result == unescaped_expected assert escaped_result == escaped_expected @@ -387,7 +391,7 @@ def 
test_to_latex_special_escape(self): def test_to_latex_longtable(self, float_frame): float_frame.to_latex(longtable=True) - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(longtable=True) withindex_expected = r"""\begin{longtable}{lrl} \toprule @@ -427,17 +431,16 @@ def test_to_latex_longtable(self, float_frame): assert withoutindex_result == withoutindex_expected - df = DataFrame({'a': [1, 2]}) + df = DataFrame({"a": [1, 2]}) with1column_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{1}" in with1column_result - df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) with3columns_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{3}" in with3columns_result def test_to_latex_escape_special_chars(self): - special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^', - '\\'] + special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) observed = df.to_latex() expected = r"""\begin{tabular}{ll} @@ -462,7 +465,7 @@ def test_to_latex_escape_special_chars(self): def test_to_latex_no_header(self): # GH 7124 - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(header=False) withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -487,8 +490,8 @@ def test_to_latex_no_header(self): def test_to_latex_specified_header(self): # GH 7124 - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(header=['AA', 'BB']) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(header=["AA", "BB"]) withindex_expected = r"""\begin{tabular}{lrl} \toprule {} & AA & BB \\ @@ -501,7 +504,7 @@ def test_to_latex_specified_header(self): assert withindex_result == withindex_expected - withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False) + withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule AA & BB \\ @@ -514,7 +517,7 @@ def test_to_latex_specified_header(self): assert withoutindex_result == withoutindex_expected - withoutescape_result = df.to_latex(header=['$A$', '$B$'], escape=False) + withoutescape_result = df.to_latex(header=["$A$", "$B$"], escape=False) withoutescape_expected = r"""\begin{tabular}{lrl} \toprule {} & $A$ & $B$ \\ @@ -528,14 +531,14 @@ def test_to_latex_specified_header(self): assert withoutescape_result == withoutescape_expected with pytest.raises(ValueError): - df.to_latex(header=['A']) + df.to_latex(header=["A"]) def test_to_latex_decimal(self, float_frame): # GH 12031 float_frame.to_latex() - df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(decimal=',') + df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(decimal=",") withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -550,7 +553,7 @@ def test_to_latex_decimal(self, float_frame): assert withindex_result == withindex_expected def test_to_latex_series(self): - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) withindex_result = s.to_latex() withindex_expected = r"""\begin{tabular}{ll} \toprule @@ -566,7 +569,7 @@ def test_to_latex_series(self): def test_to_latex_bold_rows(self): # GH 16707 - df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) observed = 
df.to_latex(bold_rows=True) expected = r"""\begin{tabular}{lrl} \toprule @@ -581,7 +584,7 @@ def test_to_latex_bold_rows(self): def test_to_latex_no_bold_rows(self): # GH 16707 - df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) observed = df.to_latex(bold_rows=False) expected = r"""\begin{tabular}{lrl} \toprule @@ -594,9 +597,9 @@ def test_to_latex_no_bold_rows(self): """ assert observed == expected - @pytest.mark.parametrize('name0', [None, 'named0']) - @pytest.mark.parametrize('name1', [None, 'named1']) - @pytest.mark.parametrize('axes', [[0], [1], [0, 1]]) + @pytest.mark.parametrize("name0", [None, "named0"]) + @pytest.mark.parametrize("name1", [None, "named1"]) + @pytest.mark.parametrize("axes", [[0], [1], [0, 1]]) def test_to_latex_multiindex_names(self, name0, name1, axes): # GH 18667 names = [name0, name1] @@ -605,12 +608,14 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): for idx in axes: df.axes[idx].names = names - idx_names = tuple(n or '{}' for n in names) - idx_names_row = ('%s & %s & & & & \\\\\n' % idx_names - if (0 in axes and any(names)) else '') - placeholder = '{}' if any(names) and 1 in axes else ' ' - col_names = [n if (bool(n) and 1 in axes) else placeholder - for n in names] + idx_names = tuple(n or "{}" for n in names) + idx_names_row = ( + "%s & %s & & & & \\\\\n" % idx_names + if (0 in axes and any(names)) + else "" + ) + placeholder = "{}" if any(names) and 1 in axes else " " + col_names = [n if (bool(n) and 1 in axes) else placeholder for n in names] observed = df.to_latex() expected = r"""\begin{tabular}{llrrrr} \toprule @@ -623,16 +628,18 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): & 4 & -1 & -1 & -1 & -1 \\ \bottomrule \end{tabular} -""" % tuple(list(col_names) + [idx_names_row]) +""" % tuple( + list(col_names) + [idx_names_row] + ) assert observed == expected - @pytest.mark.parametrize('one_row', [True, False]) + @pytest.mark.parametrize("one_row", [True, False]) def test_to_latex_multiindex_nans(self, one_row): # GH 14249 - df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]}) + df = pd.DataFrame({"a": [None, 1], "b": [2, 3], "c": [4, 5]}) if one_row: df = df.iloc[[0]] - observed = df.set_index(['a', 'b']).to_latex() + observed = df.set_index(["a", "b"]).to_latex() expected = r"""\begin{tabular}{llr} \toprule & & c \\ @@ -665,8 +672,8 @@ def test_to_latex_non_string_index(self): def test_to_latex_midrule_location(self): # GH 18326 - df = pd.DataFrame({'a': [1, 2]}) - df.index.name = 'foo' + df = pd.DataFrame({"a": [1, 2]}) + df.index.name = "foo" observed = df.to_latex(index_names=False) expected = r"""\begin{tabular}{lr} \toprule @@ -682,7 +689,7 @@ def test_to_latex_midrule_location(self): def test_to_latex_multiindex_empty_name(self): # GH 18669 - mi = pd.MultiIndex.from_product([[1, 2]], names=['']) + mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) df = pd.DataFrame(-1, index=mi, columns=range(4)) observed = df.to_latex() expected = r"""\begin{tabular}{lrrrr} @@ -700,7 +707,7 @@ def test_to_latex_multiindex_empty_name(self): def test_to_latex_float_format_no_fixed_width(self): # GH 21625 - df = DataFrame({'x': [0.19999]}) + df = DataFrame({"x": [0.19999]}) expected = r"""\begin{tabular}{lr} \toprule {} & x \\ @@ -709,10 +716,10 @@ def test_to_latex_float_format_no_fixed_width(self): \bottomrule \end{tabular} """ - assert df.to_latex(float_format='%.3f') == expected + assert df.to_latex(float_format="%.3f") == expected # GH 22270 - df = 
DataFrame({'x': [100.0]}) + df = DataFrame({"x": [100.0]}) expected = r"""\begin{tabular}{lr} \toprule {} & x \\ @@ -721,13 +728,14 @@ def test_to_latex_float_format_no_fixed_width(self): \bottomrule \end{tabular} """ - assert df.to_latex(float_format='%.0f') == expected + assert df.to_latex(float_format="%.0f") == expected def test_to_latex_multindex_header(self): # GH 16718 - df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) - .set_index(['a', 'b'])) - observed = df.to_latex(header=['r1', 'r2']) + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + observed = df.to_latex(header=["r1", "r2"]) expected = r"""\begin{tabular}{llrr} \toprule & & r1 & r2 \\ diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 5bb7fe8e6367d0..3ccb29f07dc837 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -45,15 +45,46 @@ import pandas from pandas import ( - Categorical, DataFrame, Index, MultiIndex, NaT, Period, RangeIndex, Series, - SparseDataFrame, SparseSeries, Timestamp, bdate_range, date_range, - period_range, timedelta_range, to_msgpack) + Categorical, + DataFrame, + Index, + MultiIndex, + NaT, + Period, + RangeIndex, + Series, + SparseDataFrame, + SparseSeries, + Timestamp, + bdate_range, + date_range, + period_range, + timedelta_range, + to_msgpack, +) from pandas.tseries.offsets import ( - FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day, - Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin, - QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin, - YearEnd) + FY5253, + BusinessDay, + BusinessHour, + CustomBusinessDay, + DateOffset, + Day, + Easter, + Hour, + LastWeekOfMonth, + Minute, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) _loose_version = LooseVersion(pandas.__version__) @@ -66,8 +97,8 @@ def _create_sp_series(): arr[7:12] = nan arr[-1:] = nan - bseries = SparseSeries(arr, kind='block') - bseries.name = 'bseries' + bseries = SparseSeries(arr, kind="block") + bseries.name = "bseries" return bseries @@ -79,21 +110,23 @@ def _create_sp_tsseries(): arr[7:12] = nan arr[-1:] = nan - date_index = bdate_range('1/1/2011', periods=len(arr)) - bseries = SparseSeries(arr, index=date_index, kind='block') - bseries.name = 'btsseries' + date_index = bdate_range("1/1/2011", periods=len(arr)) + bseries = SparseSeries(arr, index=date_index, kind="block") + bseries.name = "btsseries" return bseries def _create_sp_frame(): nan = np.nan - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10).astype(np.int64), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + data = { + "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C": np.arange(10).astype(np.int64), + "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } - dates = bdate_range('1/1/2011', periods=10) + dates = bdate_range("1/1/2011", periods=10) return SparseDataFrame(data, index=dates) @@ -101,132 +134,165 @@ def create_data(): """ create the pickle/msgpack data """ data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] 
+ "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], } - scalars = dict(timestamp=Timestamp('20130101'), - period=Period('2012', 'M')) + scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M")) - index = dict(int=Index(np.arange(10)), - date=date_range('20130101', periods=10), - period=period_range('2013-01-01', freq='M', periods=10), - float=Index(np.arange(10, dtype=np.float64)), - uint=Index(np.arange(10, dtype=np.uint64)), - timedelta=timedelta_range('00:00:00', freq='30T', periods=10)) + index = dict( + int=Index(np.arange(10)), + date=date_range("20130101", periods=10), + period=period_range("2013-01-01", freq="M", periods=10), + float=Index(np.arange(10, dtype=np.float64)), + uint=Index(np.arange(10, dtype=np.uint64)), + timedelta=timedelta_range("00:00:00", freq="30T", periods=10), + ) - index['range'] = RangeIndex(10) + index["range"] = RangeIndex(10) - if _loose_version >= LooseVersion('0.21'): + if _loose_version >= LooseVersion("0.21"): from pandas import interval_range - index['interval'] = interval_range(0, periods=10) - - mi = dict(reg2=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', - 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', - 'two', 'one', 'two']])), - names=['first', 'second'])) - - series = dict(float=Series(data['A']), - int=Series(data['B']), - mixed=Series(data['E']), - ts=Series(np.arange(10).astype(np.int64), - index=date_range('20130101', periods=10)), - mi=Series(np.arange(5).astype(np.float64), - index=MultiIndex.from_tuples( - tuple(zip(*[[1, 1, 2, 2, 2], - [3, 4, 3, 4, 5]])), - names=['one', 'two'])), - dup=Series(np.arange(5).astype(np.float64), - index=['A', 'B', 'C', 'D', 'A']), - cat=Series(Categorical(['foo', 'bar', 'baz'])), - dt=Series(date_range('20130101', periods=5)), - dt_tz=Series(date_range('20130101', periods=5, - tz='US/Eastern')), - period=Series([Period('2000Q1')] * 5)) + + index["interval"] = interval_range(0, periods=10) + + mi = dict( + reg2=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ), + names=["first", "second"], + ) + ) + + series = dict( + float=Series(data["A"]), + int=Series(data["B"]), + mixed=Series(data["E"]), + ts=Series( + np.arange(10).astype(np.int64), index=date_range("20130101", periods=10) + ), + mi=Series( + np.arange(5).astype(np.float64), + index=MultiIndex.from_tuples( + tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] + ), + ), + dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), + cat=Series(Categorical(["foo", "bar", "baz"])), + dt=Series(date_range("20130101", periods=5)), + dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")), + period=Series([Period("2000Q1")] * 5), + ) mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") - frame = dict(float=DataFrame({'A': series['float'], - 'B': series['float'] + 1}), - int=DataFrame({'A': series['int'], - 'B': series['int'] + 1}), - mixed=DataFrame({k: data[k] - for k in ['A', 'B', 'C', 'D']}), - mi=DataFrame({'A': np.arange(5).astype(np.float64), - 'B': np.arange(5).astype(np.int64)}, - index=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', - 'baz', 'baz'], - ['one', 'two', 'one', - 'two', 'three']])), - names=['first', 'second'])), - 
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A']), - cat_onecol=DataFrame({'A': Categorical(['foo', 'bar'])}), - cat_and_float=DataFrame({ - 'A': Categorical(['foo', 'bar', 'baz']), - 'B': np.arange(3).astype(np.int64)}), - mixed_dup=mixed_dup_df, - dt_mixed_tzs=DataFrame({ - 'A': Timestamp('20130102', tz='US/Eastern'), - 'B': Timestamp('20130603', tz='CET')}, index=range(5)), - dt_mixed2_tzs=DataFrame({ - 'A': Timestamp('20130102', tz='US/Eastern'), - 'B': Timestamp('20130603', tz='CET'), - 'C': Timestamp('20130603', tz='UTC')}, index=range(5)) - ) - - cat = dict(int8=Categorical(list('abcdefg')), - int16=Categorical(np.arange(1000)), - int32=Categorical(np.arange(10000))) - - timestamp = dict(normal=Timestamp('2011-01-01'), - nat=NaT, - tz=Timestamp('2011-01-01', tz='US/Eastern')) - - timestamp['freq'] = Timestamp('2011-01-01', freq='D') - timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', - freq='M') - - off = {'DateOffset': DateOffset(years=1), - 'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824), - 'BusinessDay': BusinessDay(offset=timedelta(seconds=9)), - 'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'), - 'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'), - 'SemiMonthBegin': SemiMonthBegin(day_of_month=9), - 'SemiMonthEnd': SemiMonthEnd(day_of_month=24), - 'MonthBegin': MonthBegin(1), - 'MonthEnd': MonthEnd(1), - 'QuarterBegin': QuarterBegin(1), - 'QuarterEnd': QuarterEnd(1), - 'Day': Day(1), - 'YearBegin': YearBegin(1), - 'YearEnd': YearEnd(1), - 'Week': Week(1), - 'Week_Tues': Week(2, normalize=False, weekday=1), - 'WeekOfMonth': WeekOfMonth(week=3, weekday=4), - 'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3), - 'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"), - 'Easter': Easter(), - 'Hour': Hour(1), - 'Minute': Minute(1)} - - return dict(series=series, - frame=frame, - index=index, - scalars=scalars, - mi=mi, - sp_series=dict(float=_create_sp_series(), - ts=_create_sp_tsseries()), - sp_frame=dict(float=_create_sp_frame()), - cat=cat, - timestamp=timestamp, - offsets=off) + frame = dict( + float=DataFrame({"A": series["float"], "B": series["float"] + 1}), + int=DataFrame({"A": series["int"], "B": series["int"] + 1}), + mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}), + mi=DataFrame( + {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)}, + index=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "baz"], + ["one", "two", "one", "two", "three"], + ] + ) + ), + names=["first", "second"], + ), + ), + dup=DataFrame( + np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] + ), + cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}), + cat_and_float=DataFrame( + { + "A": Categorical(["foo", "bar", "baz"]), + "B": np.arange(3).astype(np.int64), + } + ), + mixed_dup=mixed_dup_df, + dt_mixed_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ), + dt_mixed2_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + "C": Timestamp("20130603", tz="UTC"), + }, + index=range(5), + ), + ) + + cat = dict( + int8=Categorical(list("abcdefg")), + int16=Categorical(np.arange(1000)), + int32=Categorical(np.arange(10000)), + ) + + timestamp = dict( + normal=Timestamp("2011-01-01"), + nat=NaT, + tz=Timestamp("2011-01-01", tz="US/Eastern"), + ) + + timestamp["freq"] = Timestamp("2011-01-01", freq="D") 
+ timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M") + + off = { + "DateOffset": DateOffset(years=1), + "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824), + "BusinessDay": BusinessDay(offset=timedelta(seconds=9)), + "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"), + "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"), + "SemiMonthBegin": SemiMonthBegin(day_of_month=9), + "SemiMonthEnd": SemiMonthEnd(day_of_month=24), + "MonthBegin": MonthBegin(1), + "MonthEnd": MonthEnd(1), + "QuarterBegin": QuarterBegin(1), + "QuarterEnd": QuarterEnd(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "YearEnd": YearEnd(1), + "Week": Week(1), + "Week_Tues": Week(2, normalize=False, weekday=1), + "WeekOfMonth": WeekOfMonth(week=3, weekday=4), + "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3), + "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"), + "Easter": Easter(), + "Hour": Hour(1), + "Minute": Minute(1), + } + + return dict( + series=series, + frame=frame, + index=index, + scalars=scalars, + mi=mi, + sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), + sp_frame=dict(float=_create_sp_frame()), + cat=cat, + timestamp=timestamp, + offsets=off, + ) def create_pickle_data(): @@ -242,38 +308,47 @@ def _u(x): def create_msgpack_data(): data = create_data() # Not supported - del data['sp_series'] - del data['sp_frame'] - del data['series']['cat'] - del data['series']['period'] - del data['frame']['cat_onecol'] - del data['frame']['cat_and_float'] - del data['scalars']['period'] - if _loose_version >= LooseVersion('0.21') and ( - _loose_version < LooseVersion('0.23.0')): - del data['index']['interval'] - del data['offsets'] + del data["sp_series"] + del data["sp_frame"] + del data["series"]["cat"] + del data["series"]["period"] + del data["frame"]["cat_onecol"] + del data["frame"]["cat_and_float"] + del data["scalars"]["period"] + if _loose_version >= LooseVersion("0.21") and ( + _loose_version < LooseVersion("0.23.0") + ): + del data["index"]["interval"] + del data["offsets"] return _u(data) def platform_name(): - return '_'.join([str(pandas.__version__), str(pl.machine()), - str(pl.system().lower()), str(pl.python_version())]) + return "_".join( + [ + str(pandas.__version__), + str(pl.machine()), + str(pl.system().lower()), + str(pl.python_version()), + ] + ) def write_legacy_pickles(output_dir): version = pandas.__version__ - print("This script generates a storage file for the current arch, system, " - "and python version") + print( + "This script generates a storage file for the current arch, system, " + "and python version" + ) print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: pickle") - pth = '{0}.pickle'.format(platform_name()) + pth = "{0}.pickle".format(platform_name()) - fh = open(os.path.join(output_dir, pth), 'wb') + fh = open(os.path.join(output_dir, pth), "wb") pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) fh.close() @@ -284,26 +359,29 @@ def write_legacy_msgpack(output_dir, compress): version = pandas.__version__ - print("This script generates a storage file for the current arch, " - "system, and python version") + print( + "This script generates a storage file for the current arch, " + "system, and python version" + ) print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: msgpack") - pth = '{0}.msgpack'.format(platform_name()) - to_msgpack(os.path.join(output_dir, pth), 
create_msgpack_data(), - compress=compress) + pth = "{0}.msgpack".format(platform_name()) + to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(), compress=compress) print("created msgpack file: %s" % pth) def write_legacy_file(): # force our cwd to be the first searched - sys.path.insert(0, '.') + sys.path.insert(0, ".") if not (3 <= len(sys.argv) <= 4): - exit("Specify output directory and storage type: generate_legacy_" - "storage_files.py " - "") + exit( + "Specify output directory and storage type: generate_legacy_" + "storage_files.py " + "" + ) output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) @@ -312,13 +390,13 @@ def write_legacy_file(): except IndexError: compress_type = None - if storage_type == 'pickle': + if storage_type == "pickle": write_legacy_pickles(output_dir=output_dir) - elif storage_type == 'msgpack': + elif storage_type == "msgpack": write_legacy_msgpack(output_dir=output_dir, compress=compress_type) else: exit("storage_type must be one of {'pickle', 'msgpack'}") -if __name__ == '__main__': +if __name__ == "__main__": write_legacy_file() diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 430acbdac804ad..56be84bccc51a0 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -8,18 +8,19 @@ def test_compression_roundtrip(compression): - df = pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + df = pd.DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) with tm.ensure_clean() as path: df.to_json(path, compression=compression) - assert_frame_equal(df, pd.read_json(path, - compression=compression)) + assert_frame_equal(df, pd.read_json(path, compression=compression)) # explicitly ensure file was compressed. 
with tm.decompress_file(path, compression) as fh: - result = fh.read().decode('utf8') + result = fh.read().decode("utf8") assert_frame_equal(df, pd.read_json(result)) @@ -28,7 +29,7 @@ def test_read_zipped_json(datapath): uncompressed_df = pd.read_json(uncompressed_path) compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") - compressed_df = pd.read_json(compressed_path, compression='zip') + compressed_df = pd.read_json(compressed_path, compression="zip") assert_frame_equal(uncompressed_df, compressed_df) @@ -41,11 +42,10 @@ def test_with_s3_url(compression, s3_resource): with tm.ensure_clean() as path: df.to_json(path, compression=compression) - with open(path, 'rb') as f: - s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f) + with open(path, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json('s3://pandas-test/test-1', - compression=compression) + roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) assert_frame_equal(df, roundtripped_df) @@ -53,10 +53,8 @@ def test_lines_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') - df.to_json(path, orient='records', lines=True, - compression=compression) - roundtripped_df = pd.read_json(path, lines=True, - compression=compression) + df.to_json(path, orient="records", lines=True, compression=compression) + roundtripped_df = pd.read_json(path, lines=True, compression=compression) assert_frame_equal(df, roundtripped_df) @@ -64,11 +62,9 @@ def test_chunksize_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') - df.to_json(path, orient='records', lines=True, - compression=compression) + df.to_json(path, orient="records", lines=True, compression=compression) - res = pd.read_json(path, lines=True, chunksize=1, - compression=compression) + res = pd.read_json(path, lines=True, chunksize=1, compression=compression) roundtripped_df = pd.concat(res) assert_frame_equal(df, roundtripped_df) @@ -90,14 +86,15 @@ def test_read_unsupported_compression_type(): @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) -def test_to_json_compression(compression_only, - read_infer, to_infer): +def test_to_json_compression(compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only if compression == "zip": - pytest.skip("{compression} is not supported " - "for to_csv".format(compression=compression)) + pytest.skip( + "{compression} is not supported " + "for to_csv".format(compression=compression) + ) # We'll complete file extension subsequently. filename = "test." 
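Aside: every compression test above reduces to the same write/read round trip. A minimal, self-contained sketch of that pattern (the frame, the scratch path, and the hard-coded "gzip" codec are illustrative choices, not taken from the suite):

    import os
    import tempfile

    import pandas as pd
    from pandas.util.testing import assert_frame_equal

    # Small frame with already-sorted labels so the JSON round trip
    # preserves row and column order.
    df = pd.DataFrame({"x": [0.1, 0.2], "y": [1, 2]}, index=["a", "b"])

    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "frame.json.gz")
        # Write compressed JSON, then read it back with the same codec.
        df.to_json(path, compression="gzip")
        roundtripped = pd.read_json(path, compression="gzip")
        assert_frame_equal(df, roundtripped)

The tests themselves parametrize the codec through the shared compression fixture and write to tm.ensure_clean() paths rather than a fixed temporary directory.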
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 4cc62d3db124f7..28c8837731ec16 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -5,559 +5,703 @@ import numpy as np import pytest -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, PeriodDtype) +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype import pandas as pd from pandas import DataFrame import pandas.util.testing as tm from pandas.io.json.table_schema import ( - as_json_table_type, build_table_schema, convert_json_field_to_pandas_type, - convert_pandas_type_to_json_field, set_default_names) + as_json_table_type, + build_table_schema, + convert_json_field_to_pandas_type, + convert_pandas_type_to_json_field, + set_default_names, +) class TestBuildSchema: - def setup_method(self, method): self.df = DataFrame( - {'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=4), - 'D': pd.timedelta_range('1H', periods=4, freq='T'), - }, - index=pd.Index(range(4), name='idx')) + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=4), + "D": pd.timedelta_range("1H", periods=4, freq="T"), + }, + index=pd.Index(range(4), name="idx"), + ) def test_build_table_schema(self): result = build_table_schema(self.df, version=False) expected = { - 'fields': [{'name': 'idx', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}, - {'name': 'D', 'type': 'duration'}, - ], - 'primaryKey': ['idx'] + "fields": [ + {"name": "idx", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["idx"], } assert result == expected result = build_table_schema(self.df) assert "pandas_version" in result def test_series(self): - s = pd.Series([1, 2, 3], name='foo') + s = pd.Series([1, 2, 3], name="foo") result = build_table_schema(s, version=False) - expected = {'fields': [{'name': 'index', 'type': 'integer'}, - {'name': 'foo', 'type': 'integer'}], - 'primaryKey': ['index']} + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "foo", "type": "integer"}, + ], + "primaryKey": ["index"], + } assert result == expected result = build_table_schema(s) - assert 'pandas_version' in result + assert "pandas_version" in result def test_series_unnamed(self): result = build_table_schema(pd.Series([1, 2, 3]), version=False) - expected = {'fields': [{'name': 'index', 'type': 'integer'}, - {'name': 'values', 'type': 'integer'}], - 'primaryKey': ['index']} + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "values", "type": "integer"}, + ], + "primaryKey": ["index"], + } assert result == expected def test_multiindex(self): df = self.df.copy() - idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)]) + idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) df.index = idx result = build_table_schema(df, version=False) expected = { - 'fields': [{'name': 'level_0', 'type': 'string'}, - {'name': 'level_1', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}, - {'name': 'D', 'type': 'duration'}, - ], - 'primaryKey': ['level_0', 'level_1'] + "fields": [ + {"name": "level_0", "type": 
"string"}, + {"name": "level_1", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["level_0", "level_1"], } assert result == expected - df.index.names = ['idx0', None] - expected['fields'][0]['name'] = 'idx0' - expected['primaryKey'] = ['idx0', 'level_1'] + df.index.names = ["idx0", None] + expected["fields"][0]["name"] = "idx0" + expected["primaryKey"] = ["idx0", "level_1"] result = build_table_schema(df, version=False) assert result == expected class TestTableSchemaType: - - @pytest.mark.parametrize('int_type', [ - np.int, np.int16, np.int32, np.int64]) + @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_data(self, int_type): int_data = [1, 2, 3] - assert as_json_table_type(np.array( - int_data, dtype=int_type)) == 'integer' + assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer" - @pytest.mark.parametrize('float_type', [ - np.float, np.float16, np.float32, np.float64]) + @pytest.mark.parametrize( + "float_type", [np.float, np.float16, np.float32, np.float64] + ) def test_as_json_table_type_float_data(self, float_type): - float_data = [1., 2., 3.] - assert as_json_table_type(np.array( - float_data, dtype=float_type)) == 'number' + float_data = [1.0, 2.0, 3.0] + assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number" - @pytest.mark.parametrize('bool_type', [bool, np.bool]) + @pytest.mark.parametrize("bool_type", [bool, np.bool]) def test_as_json_table_type_bool_data(self, bool_type): bool_data = [True, False] - assert as_json_table_type(np.array( - bool_data, dtype=bool_type)) == 'boolean' - - @pytest.mark.parametrize('date_data', [ - pd.to_datetime(['2016']), - pd.to_datetime(['2016'], utc=True), - pd.Series(pd.to_datetime(['2016'])), - pd.Series(pd.to_datetime(['2016'], utc=True)), - pd.period_range('2016', freq='A', periods=3) - ]) + assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean" + + @pytest.mark.parametrize( + "date_data", + [ + pd.to_datetime(["2016"]), + pd.to_datetime(["2016"], utc=True), + pd.Series(pd.to_datetime(["2016"])), + pd.Series(pd.to_datetime(["2016"], utc=True)), + pd.period_range("2016", freq="A", periods=3), + ], + ) def test_as_json_table_type_date_data(self, date_data): - assert as_json_table_type(date_data) == 'datetime' + assert as_json_table_type(date_data) == "datetime" - @pytest.mark.parametrize('str_data', [ - pd.Series(['a', 'b']), pd.Index(['a', 'b'])]) + @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) def test_as_json_table_type_string_data(self, str_data): - assert as_json_table_type(str_data) == 'string' - - @pytest.mark.parametrize('cat_data', [ - pd.Categorical(['a']), - pd.Categorical([1]), - pd.Series(pd.Categorical([1])), - pd.CategoricalIndex([1]), - pd.Categorical([1])]) + assert as_json_table_type(str_data) == "string" + + @pytest.mark.parametrize( + "cat_data", + [ + pd.Categorical(["a"]), + pd.Categorical([1]), + pd.Series(pd.Categorical([1])), + pd.CategoricalIndex([1]), + pd.Categorical([1]), + ], + ) def test_as_json_table_type_categorical_data(self, cat_data): - assert as_json_table_type(cat_data) == 'any' + assert as_json_table_type(cat_data) == "any" # ------ # dtypes # ------ - @pytest.mark.parametrize('int_dtype', [ - np.int, np.int16, np.int32, np.int64]) + @pytest.mark.parametrize("int_dtype", [np.int, np.int16, np.int32, np.int64]) def 
test_as_json_table_type_int_dtypes(self, int_dtype): - assert as_json_table_type(int_dtype) == 'integer' + assert as_json_table_type(int_dtype) == "integer" - @pytest.mark.parametrize('float_dtype', [ - np.float, np.float16, np.float32, np.float64]) + @pytest.mark.parametrize( + "float_dtype", [np.float, np.float16, np.float32, np.float64] + ) def test_as_json_table_type_float_dtypes(self, float_dtype): - assert as_json_table_type(float_dtype) == 'number' + assert as_json_table_type(float_dtype) == "number" - @pytest.mark.parametrize('bool_dtype', [bool, np.bool]) + @pytest.mark.parametrize("bool_dtype", [bool, np.bool]) def test_as_json_table_type_bool_dtypes(self, bool_dtype): - assert as_json_table_type(bool_dtype) == 'boolean' - - @pytest.mark.parametrize('date_dtype', [ - np.datetime64, np.dtype("=1" with pytest.raises(ValueError, match=msg): - pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) @pytest.mark.parametrize("chunksize", [None, 1, 2]) @@ -163,9 +167,10 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + orig = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: test = pd.concat(test) tm.assert_frame_equal( - orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize)) + orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize) + ) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 0d84221b8d4e34..69a246487ddf15 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -40,12 +40,9 @@ def _clean_dict(d): return {str(k): v for k, v in d.items()} -@pytest.fixture(params=[ - None, # Column indexed by default. - "split", - "records", - "values", - "index"]) +@pytest.fixture( + params=[None, "split", "records", "values", "index"] # Column indexed by default. +) def orient(request): return request.param @@ -56,9 +53,9 @@ def numpy(request): class TestUltraJSONTests: - - @pytest.mark.skipif(compat.is_platform_32bit(), - reason="not compliant on 32-bit, xref #15865") + @pytest.mark.skipif( + compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" + ) def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.encode(sut, double_precision=15) @@ -117,15 +114,15 @@ def test_encode_decimal(self): @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): string_input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = ('"A string \\\\ \\/ \\b \\f \\n ' - '\\r \\t <\\/script> &"') - html_encoded = ('"A string \\\\ \\/ \\b \\f \\n \\r \\t ' - '\\u003c\\/script\\u003e \\u0026"') + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n ' '\\r \\t <\\/script> &"' + html_encoded = ( + '"A string \\\\ \\/ \\b \\f \\n \\r \\t ' '\\u003c\\/script\\u003e \\u0026"' + ) def helper(expected_output, **encode_kwargs): - output = ujson.encode(string_input, - ensure_ascii=ensure_ascii, - **encode_kwargs) + output = ujson.encode( + string_input, ensure_ascii=ensure_ascii, **encode_kwargs + ) assert output == expected_output assert string_input == json.loads(output) @@ -140,9 +137,9 @@ def helper(expected_output, **encode_kwargs): # Make sure explicit encode_html_chars=True does the encoding. 
helper(html_encoded, encode_html_chars=True) - @pytest.mark.parametrize("long_number", [ - -4342969734183514, -12345678901234.56789012, -528656961.4399388 - ]) + @pytest.mark.parametrize( + "long_number", [-4342969734183514, -12345678901234.56789012, -528656961.4399388] + ) def test_double_long_numbers(self, long_number): sut = {"a": long_number} encoded = ujson.encode(sut, double_precision=15) @@ -177,17 +174,14 @@ def test_encode_double_tiny_exponential(self): num = -1e-145 assert np.allclose(num, ujson.decode(ujson.encode(num))) - @pytest.mark.parametrize("unicode_key", [ - "key1", "بن" - ]) + @pytest.mark.parametrize("unicode_key", ["key1", "بن"]) def test_encode_dict_with_unicode_keys(self, unicode_key): unicode_dict = {unicode_key: "value1"} assert unicode_dict == ujson.decode(ujson.encode(unicode_dict)) - @pytest.mark.parametrize("double_input", [ - math.pi, - -math.pi # Should work with negatives too. - ]) + @pytest.mark.parametrize( + "double_input", [math.pi, -math.pi] # Should work with negatives too. + ) def test_encode_double_conversion(self, double_input): output = ujson.encode(double_input) assert round(double_input, 5) == round(json.loads(output), 5) @@ -207,19 +201,20 @@ def test_encode_array_of_nested_arrays(self): assert nested_input == ujson.decode(output) nested_input = np.array(nested_input) - tm.assert_numpy_array_equal(nested_input, ujson.decode( - output, numpy=True, dtype=nested_input.dtype)) + tm.assert_numpy_array_equal( + nested_input, ujson.decode(output, numpy=True, dtype=nested_input.dtype) + ) def test_encode_array_of_doubles(self): - doubles_input = [31337.31337, 31337.31337, - 31337.31337, 31337.31337] * 10 + doubles_input = [31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 output = ujson.encode(doubles_input) assert doubles_input == json.loads(output) assert doubles_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(doubles_input), - ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(doubles_input), ujson.decode(output, numpy=True) + ) def test_double_precision(self): double_input = 30.012345678901234 @@ -229,20 +224,16 @@ def test_double_precision(self): assert double_input == ujson.decode(output) for double_precision in (3, 9): - output = ujson.encode(double_input, - double_precision=double_precision) + output = ujson.encode(double_input, double_precision=double_precision) rounded_input = round(double_input, double_precision) assert rounded_input == json.loads(output) assert rounded_input == ujson.decode(output) - @pytest.mark.parametrize("invalid_val", [ - 20, -1, "9", None - ]) + @pytest.mark.parametrize("invalid_val", [20, -1, "9", None]) def test_invalid_double_precision(self, invalid_val): double_input = 30.12345678901234567890 - expected_exception = (ValueError if isinstance(invalid_val, int) - else TypeError) + expected_exception = ValueError if isinstance(invalid_val, int) else TypeError with pytest.raises(expected_exception): ujson.encode(double_input, double_precision=invalid_val) @@ -255,10 +246,10 @@ def test_encode_string_conversion2(self): assert string_input == ujson.decode(output) assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"' - @pytest.mark.parametrize("unicode_input", [ - "Räksmörgås اسامة بن محمد بن عوض بن لادن", - "\xe6\x97\xa5\xd1\x88" - ]) + @pytest.mark.parametrize( + "unicode_input", + ["Räksmörgås اسامة بن محمد بن عوض بن لادن", "\xe6\x97\xa5\xd1\x88"], + ) def test_encode_unicode_conversion(self, unicode_input): enc = ujson.encode(unicode_input) dec = 
ujson.decode(enc) @@ -307,14 +298,18 @@ def test_encode_array_in_array(self): assert output == json.dumps(arr_in_arr_input) assert arr_in_arr_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(arr_in_arr_input), - ujson.decode(output, numpy=True)) - - @pytest.mark.parametrize("num_input", [ - 31337, - -31337, # Negative number. - -9223372036854775808 # Large negative number. - ]) + tm.assert_numpy_array_equal( + np.array(arr_in_arr_input), ujson.decode(output, numpy=True) + ) + + @pytest.mark.parametrize( + "num_input", + [ + 31337, + -31337, # Negative number. + -9223372036854775808, # Large negative number. + ], + ) def test_encode_num_conversion(self, num_input): output = ujson.encode(num_input) assert num_input == json.loads(output) @@ -328,8 +323,9 @@ def test_encode_list_conversion(self): assert list_input == json.loads(output) assert list_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(list_input), - ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(list_input), ujson.decode(output, numpy=True) + ) def test_encode_dict_conversion(self): dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4} @@ -363,11 +359,10 @@ def test_encode_date_conversion(self): assert int(expected) == json.loads(output) assert int(expected) == ujson.decode(output) - @pytest.mark.parametrize("test", [ - datetime.time(), - datetime.time(1, 2, 3), - datetime.time(10, 12, 15, 343243), - ]) + @pytest.mark.parametrize( + "test", + [datetime.time(), datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243)], + ) def test_encode_time_conversion_basic(self, test): output = ujson.encode(test) expected = '"{iso}"'.format(iso=test.isoformat()) @@ -387,13 +382,9 @@ def test_encode_time_conversion_dateutil(self): expected = '"{iso}"'.format(iso=test.isoformat()) assert expected == output - @pytest.mark.parametrize("decoded_input", [ - NaT, - np.datetime64("NaT"), - np.nan, - np.inf, - -np.inf - ]) + @pytest.mark.parametrize( + "decoded_input", [NaT, np.datetime64("NaT"), np.nan, np.inf, -np.inf] + ) def test_encode_as_null(self, decoded_input): assert ujson.encode(decoded_input) == "null", "Expected null" @@ -401,21 +392,21 @@ def test_datetime_units(self): val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val) - roundtrip = ujson.decode(ujson.encode(val, date_unit='s')) - assert roundtrip == stamp.value // 10**9 + roundtrip = ujson.decode(ujson.encode(val, date_unit="s")) + assert roundtrip == stamp.value // 10 ** 9 - roundtrip = ujson.decode(ujson.encode(val, date_unit='ms')) - assert roundtrip == stamp.value // 10**6 + roundtrip = ujson.decode(ujson.encode(val, date_unit="ms")) + assert roundtrip == stamp.value // 10 ** 6 - roundtrip = ujson.decode(ujson.encode(val, date_unit='us')) - assert roundtrip == stamp.value // 10**3 + roundtrip = ujson.decode(ujson.encode(val, date_unit="us")) + assert roundtrip == stamp.value // 10 ** 3 - roundtrip = ujson.decode(ujson.encode(val, date_unit='ns')) + roundtrip = ujson.decode(ujson.encode(val, date_unit="ns")) assert roundtrip == stamp.value msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): - ujson.encode(val, date_unit='foo') + ujson.encode(val, date_unit="foo") def test_encode_to_utf8(self): unencoded = "\xe6\x97\xa5\xd1\x88" @@ -427,7 +418,7 @@ def test_encode_to_utf8(self): assert dec == json.loads(enc) def test_decode_from_unicode(self): - unicode_input = "{\"obj\": 31337}" + unicode_input = '{"obj": 31337}' dec1 = ujson.decode(unicode_input) dec2 
= ujson.decode(str(unicode_input)) @@ -458,57 +449,59 @@ def test_decode_jibberish(self): with pytest.raises(ValueError): ujson.decode(jibberish) - @pytest.mark.parametrize("broken_json", [ - "[", # Broken array start. - "{", # Broken object start. - "]", # Broken array end. - "}", # Broken object end. - ]) + @pytest.mark.parametrize( + "broken_json", + [ + "[", # Broken array start. + "{", # Broken object start. + "]", # Broken array end. + "}", # Broken object end. + ], + ) def test_decode_broken_json(self, broken_json): with pytest.raises(ValueError): ujson.decode(broken_json) - @pytest.mark.parametrize("too_big_char", [ - "[", - "{", - ]) + @pytest.mark.parametrize("too_big_char", ["[", "{"]) def test_decode_depth_too_big(self, too_big_char): with pytest.raises(ValueError): ujson.decode(too_big_char * (1024 * 1024)) - @pytest.mark.parametrize("bad_string", [ - "\"TESTING", # Unterminated. - "\"TESTING\\\"", # Unterminated escape. - "tru", # Broken True. - "fa", # Broken False. - "n", # Broken None. - ]) + @pytest.mark.parametrize( + "bad_string", + [ + '"TESTING', # Unterminated. + '"TESTING\\"', # Unterminated escape. + "tru", # Broken True. + "fa", # Broken False. + "n", # Broken None. + ], + ) def test_decode_bad_string(self, bad_string): with pytest.raises(ValueError): ujson.decode(bad_string) - @pytest.mark.parametrize("broken_json", [ - '{{1337:""}}', - '{{"key":"}', - '[[[true', - ]) + @pytest.mark.parametrize("broken_json", ['{{1337:""}}', '{{"key":"}', "[[[true"]) def test_decode_broken_json_leak(self, broken_json): for _ in range(1000): with pytest.raises(ValueError): ujson.decode(broken_json) - @pytest.mark.parametrize("invalid_dict", [ - "{{{{31337}}}}", # No key. - "{{{{\"key\":}}}}", # No value. - "{{{{\"key\"}}}}", # No colon or value. - ]) + @pytest.mark.parametrize( + "invalid_dict", + [ + "{{{{31337}}}}", # No key. + '{{{{"key":}}}}', # No value. + '{{{{"key"}}}}', # No colon or value. + ], + ) def test_decode_invalid_dict(self, invalid_dict): with pytest.raises(ValueError): ujson.decode(invalid_dict) - @pytest.mark.parametrize("numeric_int_as_str", [ - "31337", "-31337" # Should work with negatives. - ]) + @pytest.mark.parametrize( + "numeric_int_as_str", ["31337", "-31337"] # Should work with negatives. 
+ ) def test_decode_numeric_int(self, numeric_int_as_str): assert int(numeric_int_as_str) == ujson.decode(numeric_int_as_str) @@ -529,21 +522,26 @@ def test_encode_null_character(self): assert '" \\u0000\\r\\n "' == ujson.dumps(" \u0000\r\n ") def test_decode_null_character(self): - wrapped_input = "\"31337 \\u0000 31337\"" + wrapped_input = '"31337 \\u0000 31337"' assert ujson.decode(wrapped_input) == json.loads(wrapped_input) def test_encode_list_long_conversion(self): - long_input = [9223372036854775807, 9223372036854775807, - 9223372036854775807, 9223372036854775807, - 9223372036854775807, 9223372036854775807] + long_input = [ + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + ] output = ujson.encode(long_input) assert long_input == json.loads(output) assert long_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(long_input), - ujson.decode(output, numpy=True, - dtype=np.int64)) + tm.assert_numpy_array_equal( + np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) + ) def test_encode_long_conversion(self): long_input = 9223372036854775807 @@ -553,9 +551,9 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("int_exp", [ - "1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4" - ]) + @pytest.mark.parametrize( + "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] + ) def test_decode_numeric_int_exp(self, int_exp): assert ujson.decode(int_exp) == json.loads(int_exp) @@ -565,8 +563,9 @@ def test_loads_non_str_bytes_raises(self): ujson.loads(None) def test_version(self): - assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ - "ujson.__version__ must be a string like '1.4.0'" + assert re.match( + r"^\d+\.\d+(\.\d+)?$", ujson.__version__ + ), "ujson.__version__ must be a string like '1.4.0'" def test_encode_numeric_overflow(self): with pytest.raises(OverflowError): @@ -580,9 +579,7 @@ class Nested: with pytest.raises(OverflowError): ujson.encode(Nested()) - @pytest.mark.parametrize("val", [ - 3590016419, 2**31, 2**32, (2**32) - 1 - ]) + @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) def test_decode_number_with_32bit_sign_bit(self, val): # Test that numbers that fit within 32 bits but would have the # sign bit set (2**31 <= x < 2**32) are decoded properly. @@ -592,14 +589,14 @@ def test_decode_number_with_32bit_sign_bit(self, val): def test_encode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): - base = '\u00e5'.encode("utf-8") + base = "\u00e5".encode("utf-8") escape_input = base * 1024 * 1024 * 2 ujson.encode(escape_input) def test_decode_big_escape(self): # Make sure no Exception is raised. 
for _ in range(10): - base = '\u00e5'.encode("utf-8") + base = "\u00e5".encode("utf-8") quote = b'"' escape_input = quote + (base * 1024 * 1024 * 2) + quote @@ -619,9 +616,7 @@ def toDict(self): assert dec == d def test_default_handler(self): - class _TestObject: - def __init__(self, val): self.val = val @@ -635,14 +630,14 @@ def __str__(self): msg = "Maximum recursion level reached" with pytest.raises(OverflowError, match=msg): ujson.encode(_TestObject("foo")) - assert '"foo"' == ujson.encode(_TestObject("foo"), - default_handler=str) + assert '"foo"' == ujson.encode(_TestObject("foo"), default_handler=str) def my_handler(_): return "foobar" - assert '"foobar"' == ujson.encode(_TestObject("foo"), - default_handler=my_handler) + assert '"foobar"' == ujson.encode( + _TestObject("foo"), default_handler=my_handler + ) def my_handler_raises(_): raise TypeError("I raise for anything") @@ -653,34 +648,39 @@ def my_handler_raises(_): def my_int_handler(_): return 42 - assert ujson.decode(ujson.encode(_TestObject("foo"), - default_handler=my_int_handler)) == 42 + assert ( + ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_int_handler) + ) + == 42 + ) def my_obj_handler(_): return datetime.datetime(2013, 2, 3) - assert (ujson.decode(ujson.encode(datetime.datetime(2013, 2, 3))) == - ujson.decode(ujson.encode(_TestObject("foo"), - default_handler=my_obj_handler))) + assert ujson.decode( + ujson.encode(datetime.datetime(2013, 2, 3)) + ) == ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_obj_handler) + ) obj_list = [_TestObject("foo"), _TestObject("bar")] - assert (json.loads(json.dumps(obj_list, default=str)) == - ujson.decode(ujson.encode(obj_list, default_handler=str))) + assert json.loads(json.dumps(obj_list, default=str)) == ujson.decode( + ujson.encode(obj_list, default_handler=str) + ) class TestNumpyJSONTests: - @pytest.mark.parametrize("bool_input", [True, False]) def test_bool(self, bool_input): b = np.bool(bool_input) assert ujson.decode(ujson.encode(b)) == b def test_bool_array(self): - bool_array = np.array([ - True, False, True, True, - False, True, False, False], dtype=np.bool) - output = np.array(ujson.decode( - ujson.encode(bool_array)), dtype=np.bool) + bool_array = np.array( + [True, False, True, True, False, True, False, False], dtype=np.bool + ) + output = np.array(ujson.decode(ujson.encode(bool_array)), dtype=np.bool) tm.assert_numpy_array_equal(bool_array, output) def test_int(self, any_int_dtype): @@ -693,8 +693,9 @@ def test_int_array(self, any_int_dtype): arr = np.arange(100, dtype=np.int) arr_input = arr.astype(any_int_dtype) - arr_output = np.array(ujson.decode(ujson.encode(arr_input)), - dtype=any_int_dtype) + arr_output = np.array( + ujson.decode(ujson.encode(arr_input)), dtype=any_int_dtype + ) tm.assert_numpy_array_equal(arr_input, arr_output) def test_int_max(self, any_int_dtype): @@ -722,47 +723,49 @@ def test_float_array(self, float_dtype): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) float_input = arr.astype(float_dtype) - float_output = np.array(ujson.decode( - ujson.encode(float_input, double_precision=15)), - dtype=float_dtype) + float_output = np.array( + ujson.decode(ujson.encode(float_input, double_precision=15)), + dtype=float_dtype, + ) tm.assert_almost_equal(float_input, float_output) def test_float_max(self, float_dtype): klass = np.dtype(float_dtype).type num = klass(np.finfo(float_dtype).max / 10) - tm.assert_almost_equal(klass(ujson.decode( - ujson.encode(num, double_precision=15))), num) + 
tm.assert_almost_equal( + klass(ujson.decode(ujson.encode(num, double_precision=15))), num + ) def test_array_basic(self): arr = np.arange(96) arr = arr.reshape((2, 2, 2, 2, 3, 2)) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode( - ujson.encode(arr), numpy=True), arr) - - @pytest.mark.parametrize("shape", [ - (10, 10), - (5, 5, 4), - (100, 1), - ]) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + @pytest.mark.parametrize("shape", [(10, 10), (5, 5, 4), (100, 1)]) def test_array_reshaped(self, shape): arr = np.arange(100) arr = arr.reshape(shape) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode( - ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) def test_array_list(self): - arr_list = ["a", list(), dict(), dict(), list(), - 42, 97.8, ["a", "b"], {"key": "val"}] + arr_list = [ + "a", + list(), + dict(), + dict(), + list(), + 42, + 97.8, + ["a", "b"], + {"key": "val"}, + ] arr = np.array(arr_list) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) def test_array_float(self): dtype = np.float32 @@ -780,70 +783,69 @@ def test_0d_array(self): with pytest.raises(TypeError): ujson.encode(np.array(1)) - @pytest.mark.parametrize("bad_input,exc_type,kwargs", [ - ([{}, []], ValueError, {}), - ([42, None], TypeError, {}), - ([["a"], 42], ValueError, {}), - ([42, {}, "a"], TypeError, {}), - ([42, ["a"], 42], ValueError, {}), - (["a", "b", [], "c"], ValueError, {}), - ([{"a": "b"}], ValueError, dict(labelled=True)), - ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), - ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)) - ]) + @pytest.mark.parametrize( + "bad_input,exc_type,kwargs", + [ + ([{}, []], ValueError, {}), + ([42, None], TypeError, {}), + ([["a"], 42], ValueError, {}), + ([42, {}, "a"], TypeError, {}), + ([42, ["a"], 42], ValueError, {}), + (["a", "b", [], "c"], ValueError, {}), + ([{"a": "b"}], ValueError, dict(labelled=True)), + ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), + ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)), + ], + ) def test_array_numpy_except(self, bad_input, exc_type, kwargs): with pytest.raises(exc_type): ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs) def test_array_numpy_labelled(self): labelled_input = {"a": []} - output = ujson.loads(ujson.dumps(labelled_input), - numpy=True, labelled=True) + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() assert (np.array(["a"]) == output[1]).all() assert output[2] is None labelled_input = [{"a": 42}] - output = ujson.loads(ujson.dumps(labelled_input), - numpy=True, labelled=True) + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) assert (np.array(["a"]) == output[2]).all() assert (np.array([42]) == output[0]).all() assert output[1] is None # see gh-10837: write out the dump explicitly # so there is no dependency on iteration order - input_dumps = ('[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' - '{"a": 2.4, "b": 78}]') + input_dumps = '[{"a": 42, "b":31}, {"a": 
24, "c": 99}, ' '{"a": 2.4, "b": 78}]' output = ujson.loads(input_dumps, numpy=True, labelled=True) - expected_vals = np.array( - [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() assert output[1] is None assert (np.array(["a", "b"]) == output[2]).all() - input_dumps = ('{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' - '"3": {"a": 2.4, "b": 78}}') + input_dumps = ( + '{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' + '"3": {"a": 2.4, "b": 78}}' + ) output = ujson.loads(input_dumps, numpy=True, labelled=True) - expected_vals = np.array( - [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() assert (np.array(["1", "2", "3"]) == output[1]).all() assert (np.array(["a", "b"]) == output[2]).all() class TestPandasJSONTests: - def test_dataframe(self, orient, numpy): if orient == "records" and numpy: pytest.skip("Not idiomatic pandas") - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"]) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) - output = ujson.decode(ujson.encode(df, **encode_kwargs), - **decode_kwargs) + output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) # Ensure proper DataFrame initialization. if orient == "split": @@ -864,26 +866,34 @@ def test_dataframe(self, orient, numpy): tm.assert_frame_equal(output, df, check_dtype=False) def test_dataframe_nested(self, orient): - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"]) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) nested = {"df1": df, "df2": df.copy()} kwargs = {} if orient is None else dict(orient=orient) - exp = {"df1": ujson.decode(ujson.encode(df, **kwargs)), - "df2": ujson.decode(ujson.encode(df, **kwargs))} + exp = { + "df1": ujson.decode(ujson.encode(df, **kwargs)), + "df2": ujson.decode(ujson.encode(df, **kwargs)), + } assert ujson.decode(ujson.encode(nested, **kwargs)) == exp def test_dataframe_numpy_labelled(self, orient): if orient in ("split", "values"): pytest.skip("Incompatible with labelled=True") - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"], dtype=np.int) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["a", "b"], + columns=["x", "y", "z"], + dtype=np.int, + ) kwargs = {} if orient is None else dict(orient=orient) - output = DataFrame(*ujson.decode(ujson.encode(df, **kwargs), - numpy=True, labelled=True)) + output = DataFrame( + *ujson.decode(ujson.encode(df, **kwargs), numpy=True, labelled=True) + ) if orient is None: df = df.T @@ -893,14 +903,14 @@ def test_dataframe_numpy_labelled(self, orient): tm.assert_frame_equal(output, df) def test_series(self, orient, numpy): - s = Series([10, 20, 30, 40, 50, 60], name="series", - index=[6, 7, 8, 9, 10, 15]).sort_values() + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) - output = ujson.decode(ujson.encode(s, **encode_kwargs), - **decode_kwargs) + output = ujson.decode(ujson.encode(s, **encode_kwargs), 
**decode_kwargs) if orient == "split": dec = _clean_dict(output) @@ -919,13 +929,16 @@ def test_series(self, orient, numpy): tm.assert_series_equal(output, s, check_dtype=False) def test_series_nested(self, orient): - s = Series([10, 20, 30, 40, 50, 60], name="series", - index=[6, 7, 8, 9, 10, 15]).sort_values() + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() nested = {"s1": s, "s2": s.copy()} kwargs = {} if orient is None else dict(orient=orient) - exp = {"s1": ujson.decode(ujson.encode(s, **kwargs)), - "s2": ujson.decode(ujson.encode(s, **kwargs))} + exp = { + "s1": ujson.decode(ujson.encode(s, **kwargs)), + "s2": ujson.decode(ujson.encode(s, **kwargs)), + } assert ujson.decode(ujson.encode(nested, **kwargs)) == exp def test_index(self): @@ -944,35 +957,34 @@ def test_index(self): tm.assert_index_equal(i, output) assert i.name == output.name - dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), - numpy=True)) + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), numpy=True)) output = Index(**dec) tm.assert_index_equal(i, output) assert i.name == output.name - output = Index(ujson.decode(ujson.encode(i, orient="values")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="values")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="values"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="values"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="records")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="records")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="records"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="records"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="index")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="index")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="index"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="index"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) def test_datetime_index(self): @@ -991,31 +1003,30 @@ def test_datetime_index(self): decoded.index = DatetimeIndex(idx_values) tm.assert_series_equal(ts, decoded) - @pytest.mark.parametrize("invalid_arr", [ - "[31337,]", # Trailing comma. - "[,31337]", # Leading comma. - "[]]", # Unmatched bracket. - "[,]", # Only comma. - ]) + @pytest.mark.parametrize( + "invalid_arr", + [ + "[31337,]", # Trailing comma. + "[,31337]", # Leading comma. + "[]]", # Unmatched bracket. + "[,]", # Only comma. 
+ ], + ) def test_decode_invalid_array(self, invalid_arr): with pytest.raises(ValueError): ujson.decode(invalid_arr) - @pytest.mark.parametrize("arr", [ - [], [31337] - ]) + @pytest.mark.parametrize("arr", [[], [31337]]) def test_decode_array(self, arr): assert arr == ujson.decode(str(arr)) - @pytest.mark.parametrize("extreme_num", [ - 9223372036854775807, -9223372036854775808 - ]) + @pytest.mark.parametrize("extreme_num", [9223372036854775807, -9223372036854775808]) def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize("too_extreme_num", [ - "9223372036854775808", "-90223372036854775809" - ]) + @pytest.mark.parametrize( + "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] + ) def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises(ValueError): ujson.decode(too_extreme_num) @@ -1031,17 +1042,26 @@ def test_decode_array_with_big_int(self): with pytest.raises(ValueError): ujson.loads("[18446098363113800555]") - @pytest.mark.parametrize("float_number", [ - 1.1234567893, 1.234567893, 1.34567893, - 1.4567893, 1.567893, 1.67893, - 1.7893, 1.893, 1.3, - ]) + @pytest.mark.parametrize( + "float_number", + [ + 1.1234567893, + 1.234567893, + 1.34567893, + 1.4567893, + 1.567893, + 1.67893, + 1.7893, + 1.893, + 1.3, + ], + ) @pytest.mark.parametrize("sign", [-1, 1]) def test_decode_floating_point(self, sign, float_number): float_number *= sign - tm.assert_almost_equal(float_number, - ujson.loads(str(float_number)), - check_less_precise=15) + tm.assert_almost_equal( + float_number, ujson.loads(str(float_number)), check_less_precise=15 + ) def test_encode_big_set(self): s = set() diff --git a/pandas/tests/io/msgpack/test_buffer.py b/pandas/tests/io/msgpack/test_buffer.py index e36dc5bbdb4ba6..fe1f4e73eba24b 100644 --- a/pandas/tests/io/msgpack/test_buffer.py +++ b/pandas/tests/io/msgpack/test_buffer.py @@ -7,15 +7,16 @@ def test_unpack_buffer(): from array import array - buf = array('b') - frombytes(buf, packb((b'foo', b'bar'))) + + buf = array("b") + frombytes(buf, packb((b"foo", b"bar"))) obj = unpackb(buf, use_list=1) - assert [b'foo', b'bar'] == obj + assert [b"foo", b"bar"] == obj def test_unpack_bytearray(): - buf = bytearray(packb(('foo', 'bar'))) + buf = bytearray(packb(("foo", "bar"))) obj = unpackb(buf, use_list=1) - assert [b'foo', b'bar'] == obj + assert [b"foo", b"bar"] == obj expected_type = bytes assert all(type(s) == expected_type for s in obj) diff --git a/pandas/tests/io/msgpack/test_case.py b/pandas/tests/io/msgpack/test_case.py index c0e76b37ee46d4..15b7090c11badc 100644 --- a/pandas/tests/io/msgpack/test_case.py +++ b/pandas/tests/io/msgpack/test_case.py @@ -5,14 +5,23 @@ def check(length, obj): v = packb(obj) - assert len(v) == length, \ - "%r length should be %r but get %r" % (obj, length, len(v)) + assert len(v) == length, "%r length should be %r but get %r" % (obj, length, len(v)) assert unpackb(v, use_list=0) == obj def test_1(): - for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1, - -((1 << 5) - 1), -(1 << 5)]: + for o in [ + None, + True, + False, + 0, + 1, + (1 << 6), + (1 << 7) - 1, + -1, + -((1 << 5) - 1), + -(1 << 5), + ]: check(1, o) @@ -32,8 +41,16 @@ def test_5(): def test_9(): - for o in [1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63), 1.0, 0.1, - -0.1, -1.0]: + for o in [ + 1 << 32, + (1 << 64) - 1, + -((1 << 31) + 1), + -(1 << 63), + 1.0, + 0.1, + -0.1, + -1.0, + ]: check(9, o) @@ -56,7 +73,7 @@ def test_raw32(): def 
check_array(overhead, num): - check(num + overhead, (None, ) * num) + check(num + overhead, (None,) * num) def test_fixarray(): @@ -80,31 +97,46 @@ def match(obj, buf): def test_match(): cases = [ - (None, b'\xc0'), - (False, b'\xc2'), - (True, b'\xc3'), - (0, b'\x00'), - (127, b'\x7f'), - (128, b'\xcc\x80'), - (256, b'\xcd\x01\x00'), - (-1, b'\xff'), - (-33, b'\xd0\xdf'), - (-129, b'\xd1\xff\x7f'), - ({1: 1}, b'\x81\x01\x01'), + (None, b"\xc0"), + (False, b"\xc2"), + (True, b"\xc3"), + (0, b"\x00"), + (127, b"\x7f"), + (128, b"\xcc\x80"), + (256, b"\xcd\x01\x00"), + (-1, b"\xff"), + (-33, b"\xd0\xdf"), + (-129, b"\xd1\xff\x7f"), + ({1: 1}, b"\x81\x01\x01"), (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"), - ((), b'\x90'), - (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" - b"\x0a\x0b\x0c\x0d\x0e")), - (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07" - b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")), - ({}, b'\x80'), - ({x: x for x in range(15)}, - (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07' - b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')), - ({x: x for x in range(16)}, - (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06' - b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e' - b'\x0f\x0f')), + ((), b"\x90"), + ( + tuple(range(15)), + (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" b"\x0a\x0b\x0c\x0d\x0e"), + ), + ( + tuple(range(16)), + ( + b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07" + b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + ), + ), + ({}, b"\x80"), + ( + {x: x for x in range(15)}, + ( + b"\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07" + b"\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" + ), + ), + ( + {x: x for x in range(16)}, + ( + b"\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06" + b"\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" + b"\x0f\x0f" + ), + ), ] for v, p in cases: @@ -112,4 +144,4 @@ def test_match(): def test_unicode(): - assert unpackb(packb('foobar'), use_list=1) == b'foobar' + assert unpackb(packb("foobar"), use_list=1) == b"foobar" diff --git a/pandas/tests/io/msgpack/test_except.py b/pandas/tests/io/msgpack/test_except.py index 0eeda3389a9351..60c1dcca162a9b 100644 --- a/pandas/tests/io/msgpack/test_except.py +++ b/pandas/tests/io/msgpack/test_except.py @@ -12,9 +12,8 @@ class DummyException(Exception): class TestExceptions: - def test_raise_on_find_unsupported_value(self): - msg = "can\'t serialize datetime" + msg = "can't serialize datetime" with pytest.raises(TypeError, match=msg): packb(datetime.now()) @@ -25,13 +24,13 @@ def hook(_): with pytest.raises(DummyException): unpackb(packb({}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': 'buzz'}), object_hook=hook) + unpackb(packb({"fizz": "buzz"}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': 'buzz'}), object_pairs_hook=hook) + unpackb(packb({"fizz": "buzz"}), object_pairs_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': {'buzz': 'spam'}}), object_hook=hook) + unpackb(packb({"fizz": {"buzz": "spam"}}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': {'buzz': 'spam'}}), object_pairs_hook=hook) + unpackb(packb({"fizz": {"buzz": "spam"}}), object_pairs_hook=hook) def test_invalid_value(self): msg = "Unpack failed: error" diff --git a/pandas/tests/io/msgpack/test_extension.py b/pandas/tests/io/msgpack/test_extension.py index 1bbfde5b9cd585..12f27459f5afe5 100644 --- 
a/pandas/tests/io/msgpack/test_extension.py +++ b/pandas/tests/io/msgpack/test_extension.py @@ -12,50 +12,52 @@ def p(s): packer.pack_ext_type(0x42, s) return packer.bytes() - assert p(b'A') == b'\xd4\x42A' # fixext 1 - assert p(b'AB') == b'\xd5\x42AB' # fixext 2 - assert p(b'ABCD') == b'\xd6\x42ABCD' # fixext 4 - assert p(b'ABCDEFGH') == b'\xd7\x42ABCDEFGH' # fixext 8 - assert p(b'A' * 16) == b'\xd8\x42' + b'A' * 16 # fixext 16 - assert p(b'ABC') == b'\xc7\x03\x42ABC' # ext 8 - assert p(b'A' * 0x0123) == b'\xc8\x01\x23\x42' + b'A' * 0x0123 # ext 16 - assert (p(b'A' * 0x00012345) == - b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345) # ext 32 + assert p(b"A") == b"\xd4\x42A" # fixext 1 + assert p(b"AB") == b"\xd5\x42AB" # fixext 2 + assert p(b"ABCD") == b"\xd6\x42ABCD" # fixext 4 + assert p(b"ABCDEFGH") == b"\xd7\x42ABCDEFGH" # fixext 8 + assert p(b"A" * 16) == b"\xd8\x42" + b"A" * 16 # fixext 16 + assert p(b"ABC") == b"\xc7\x03\x42ABC" # ext 8 + assert p(b"A" * 0x0123) == b"\xc8\x01\x23\x42" + b"A" * 0x0123 # ext 16 + assert ( + p(b"A" * 0x00012345) == b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345 + ) # ext 32 def test_unpack_ext_type(): def check(b, expected): assert msgpack.unpackb(b) == expected - check(b'\xd4\x42A', ExtType(0x42, b'A')) # fixext 1 - check(b'\xd5\x42AB', ExtType(0x42, b'AB')) # fixext 2 - check(b'\xd6\x42ABCD', ExtType(0x42, b'ABCD')) # fixext 4 - check(b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH')) # fixext 8 - check(b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16)) # fixext 16 - check(b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC')) # ext 8 - check(b'\xc8\x01\x23\x42' + b'A' * 0x0123, - ExtType(0x42, b'A' * 0x0123)) # ext 16 - check(b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345, - ExtType(0x42, b'A' * 0x00012345)) # ext 32 + check(b"\xd4\x42A", ExtType(0x42, b"A")) # fixext 1 + check(b"\xd5\x42AB", ExtType(0x42, b"AB")) # fixext 2 + check(b"\xd6\x42ABCD", ExtType(0x42, b"ABCD")) # fixext 4 + check(b"\xd7\x42ABCDEFGH", ExtType(0x42, b"ABCDEFGH")) # fixext 8 + check(b"\xd8\x42" + b"A" * 16, ExtType(0x42, b"A" * 16)) # fixext 16 + check(b"\xc7\x03\x42ABC", ExtType(0x42, b"ABC")) # ext 8 + check(b"\xc8\x01\x23\x42" + b"A" * 0x0123, ExtType(0x42, b"A" * 0x0123)) # ext 16 + check( + b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345, + ExtType(0x42, b"A" * 0x00012345), + ) # ext 32 def test_extension_type(): def default(obj): - print('default called', obj) + print("default called", obj) if isinstance(obj, array.array): typecode = 123 # application specific typecode data = tobytes(obj) return ExtType(typecode, data) - raise TypeError("Unknown type object %r" % (obj, )) + raise TypeError("Unknown type object %r" % (obj,)) def ext_hook(code, data): - print('ext_hook called', code, data) + print("ext_hook called", code, data) assert code == 123 - obj = array.array('d') + obj = array.array("d") frombytes(obj, data) return obj - obj = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])] + obj = [42, b"hello", array.array("d", [1.1, 2.2, 3.3])] s = msgpack.packb(obj, default=default) obj2 = msgpack.unpackb(s, ext_hook=ext_hook) assert obj == obj2 diff --git a/pandas/tests/io/msgpack/test_format.py b/pandas/tests/io/msgpack/test_format.py index 3659602e1381f5..46d0116bc39263 100644 --- a/pandas/tests/io/msgpack/test_format.py +++ b/pandas/tests/io/msgpack/test_format.py @@ -8,84 +8,77 @@ def check(src, should, use_list=0): def testSimpleValue(): - check(b"\x93\xc0\xc2\xc3", (None, False, True, )) + check(b"\x93\xc0\xc2\xc3", (None, False, True)) def testFixnum(): - 
check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0, - 64, - 127, ), - (-32, - -16, - -1, ), )) + check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0, 64, 127), (-32, -16, -1))) def testFixArray(): - check(b"\x92\x90\x91\x91\xc0", ((), ((None, ), ), ), ) + check(b"\x92\x90\x91\x91\xc0", ((), ((None,),))) def testFixRaw(): - check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def", ), ) + check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def")) def testFixMap(): - check(b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", - {False: {None: None}, - True: {None: {}}}, ) + check( + b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", {False: {None: None}, True: {None: {}}} + ) def testUnsignedInt(): - check(b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" - b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" - b"\xce\xff\xff\xff\xff", - (0, - 128, - 255, - 0, - 32768, - 65535, - 0, - 2147483648, - 4294967295, ), ) + check( + b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" + b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" + b"\xce\xff\xff\xff\xff", + (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295), + ) def testSignedInt(): - check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" - b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" - b"\xd2\xff\xff\xff\xff", (0, - -128, - -1, - 0, - -32768, - -1, - 0, - -2147483648, - -1, )) + check( + b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" + b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" + b"\xd2\xff\xff\xff\xff", + (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1), + ) def testRaw(): - check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" - b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", - (b"", b"a", b"ab", b"", b"a", b"ab")) + check( + b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" + b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", + (b"", b"a", b"ab", b"", b"a", b"ab"), + ) def testArray(): - check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" - b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" - b"\xc2\xc3", ((), (None, ), (False, True), (), (None, ), - (False, True))) + check( + b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" + b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" + b"\xc2\xc3", + ((), (None,), (False, True), (), (None,), (False, True)), + ) def testMap(): - check(b"\x96" - b"\xde\x00\x00" - b"\xde\x00\x01\xc0\xc2" - b"\xde\x00\x02\xc0\xc2\xc3\xc2" - b"\xdf\x00\x00\x00\x00" - b"\xdf\x00\x00\x00\x01\xc0\xc2" - b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", ({}, {None: False}, - {True: False, - None: False}, {}, - {None: False}, - {True: False, - None: False})) + check( + b"\x96" + b"\xde\x00\x00" + b"\xde\x00\x01\xc0\xc2" + b"\xde\x00\x02\xc0\xc2\xc3\xc2" + b"\xdf\x00\x00\x00\x00" + b"\xdf\x00\x00\x00\x01\xc0\xc2" + b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", + ( + {}, + {None: False}, + {True: False, None: False}, + {}, + {None: False}, + {True: False, None: False}, + ), + ) diff --git a/pandas/tests/io/msgpack/test_limits.py b/pandas/tests/io/msgpack/test_limits.py index d90a9adfa5c873..4c0697f8faf643 100644 --- a/pandas/tests/io/msgpack/test_limits.py +++ b/pandas/tests/io/msgpack/test_limits.py @@ -5,12 +5,13 @@ class TestLimits: - def test_integer(self): x = -(2 ** 63) assert unpackb(packb(x)) == x - msg = (r"((long |Python )?(int )?too (big|large) to convert" - r"( to C (unsigned )?long))?") + msg = ( + r"((long |Python )?(int )?too (big|large) to convert" + r"( to C (unsigned )?long))?" 
+ ) with pytest.raises((OverflowError, ValueError), match=msg): packb(x - 1) x = 2 ** 64 - 1 @@ -31,14 +32,14 @@ def test_map_header(self): packer.pack_array_header(2 ** 32) def test_max_str_len(self): - d = 'x' * 3 + d = "x" * 3 packed = packb(d) - unpacker = Unpacker(max_str_len=3, encoding='utf-8') + unpacker = Unpacker(max_str_len=3, encoding="utf-8") unpacker.feed(packed) assert unpacker.unpack() == d - unpacker = Unpacker(max_str_len=2, encoding='utf-8') + unpacker = Unpacker(max_str_len=2, encoding="utf-8") unpacker.feed(packed) msg = "3 exceeds max_str_len" @@ -46,7 +47,7 @@ def test_max_str_len(self): unpacker.unpack() def test_max_bin_len(self): - d = b'x' * 3 + d = b"x" * 3 packed = packb(d, use_bin_type=True) unpacker = Unpacker(max_bin_len=3) diff --git a/pandas/tests/io/msgpack/test_newspec.py b/pandas/tests/io/msgpack/test_newspec.py index d92c649c5e1cac..a1cf966b9d253f 100644 --- a/pandas/tests/io/msgpack/test_newspec.py +++ b/pandas/tests/io/msgpack/test_newspec.py @@ -4,65 +4,65 @@ def test_str8(): - header = b'\xd9' - data = b'x' * 32 + header = b"\xd9" + data = b"x" * 32 b = packb(data.decode(), use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\x20' + assert b[0:2] == header + b"\x20" assert b[2:] == data assert unpackb(b) == data - data = b'x' * 255 + data = b"x" * 255 b = packb(data.decode(), use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\xff' + assert b[0:2] == header + b"\xff" assert b[2:] == data assert unpackb(b) == data def test_bin8(): - header = b'\xc4' - data = b'' + header = b"\xc4" + data = b"" b = packb(data, use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\x00' + assert b[0:2] == header + b"\x00" assert b[2:] == data assert unpackb(b) == data - data = b'x' * 255 + data = b"x" * 255 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\xff' + assert b[0:2] == header + b"\xff" assert b[2:] == data assert unpackb(b) == data def test_bin16(): - header = b'\xc5' - data = b'x' * 256 + header = b"\xc5" + data = b"x" * 256 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 3 assert b[0:1] == header - assert b[1:3] == b'\x01\x00' + assert b[1:3] == b"\x01\x00" assert b[3:] == data assert unpackb(b) == data - data = b'x' * 65535 + data = b"x" * 65535 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 3 assert b[0:1] == header - assert b[1:3] == b'\xff\xff' + assert b[1:3] == b"\xff\xff" assert b[3:] == data assert unpackb(b) == data def test_bin32(): - header = b'\xc6' - data = b'x' * 65536 + header = b"\xc6" + data = b"x" * 65536 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 5 assert b[0:1] == header - assert b[1:5] == b'\x00\x01\x00\x00' + assert b[1:5] == b"\x00\x01\x00\x00" assert b[5:] == data assert unpackb(b) == data @@ -72,21 +72,19 @@ def check(ext, packed): assert packb(ext) == packed assert unpackb(packed) == ext - check(ExtType(0x42, b'Z'), b'\xd4\x42Z') # fixext 1 - check(ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ') # fixext 2 - check(ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4) # fixext 4 - check(ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8) # fixext 8 - check(ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16) # fixext 16 + check(ExtType(0x42, b"Z"), b"\xd4\x42Z") # fixext 1 + check(ExtType(0x42, b"ZZ"), b"\xd5\x42ZZ") # fixext 2 + check(ExtType(0x42, b"Z" * 4), b"\xd6\x42" + b"Z" * 4) # fixext 4 + check(ExtType(0x42, b"Z" * 8), b"\xd7\x42" + b"Z" * 8) # fixext 8 + 
check(ExtType(0x42, b"Z" * 16), b"\xd8\x42" + b"Z" * 16) # fixext 16 # ext 8 - check(ExtType(0x42, b''), b'\xc7\x00\x42') - check(ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255) + check(ExtType(0x42, b""), b"\xc7\x00\x42") + check(ExtType(0x42, b"Z" * 255), b"\xc7\xff\x42" + b"Z" * 255) # ext 16 - check(ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256) - check(ExtType(0x42, b'Z' * 0xffff), b'\xc8\xff\xff\x42' + b'Z' * 0xffff) + check(ExtType(0x42, b"Z" * 256), b"\xc8\x01\x00\x42" + b"Z" * 256) + check(ExtType(0x42, b"Z" * 0xFFFF), b"\xc8\xff\xff\x42" + b"Z" * 0xFFFF) # ext 32 - check( - ExtType(0x42, b'Z' * - 0x10000), b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000) + check(ExtType(0x42, b"Z" * 0x10000), b"\xc9\x00\x01\x00\x00\x42" + b"Z" * 0x10000) # needs large memory # check(ExtType(0x42, b'Z'*0xffffffff), # b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff) diff --git a/pandas/tests/io/msgpack/test_obj.py b/pandas/tests/io/msgpack/test_obj.py index 342c00f49ebffb..03d8807c0922c2 100644 --- a/pandas/tests/io/msgpack/test_obj.py +++ b/pandas/tests/io/msgpack/test_obj.py @@ -10,47 +10,44 @@ class DecodeError(Exception): class TestObj: - def _arr_to_str(self, arr): - return ''.join(str(c) for c in arr) + return "".join(str(c) for c in arr) def bad_complex_decoder(self, o): raise DecodeError("Ooops!") def _decode_complex(self, obj): - if b'__complex__' in obj: - return complex(obj[b'real'], obj[b'imag']) + if b"__complex__" in obj: + return complex(obj[b"real"], obj[b"imag"]) return obj def _encode_complex(self, obj): if isinstance(obj, complex): - return {b'__complex__': True, b'real': 1, b'imag': 2} + return {b"__complex__": True, b"real": 1, b"imag": 2} return obj def test_encode_hook(self): packed = packb([3, 1 + 2j], default=self._encode_complex) unpacked = unpackb(packed, use_list=1) - assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2} + assert unpacked[1] == {b"__complex__": True, b"real": 1, b"imag": 2} def test_decode_hook(self): - packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}]) - unpacked = unpackb(packed, object_hook=self._decode_complex, - use_list=1) + packed = packb([3, {b"__complex__": True, b"real": 1, b"imag": 2}]) + unpacked = unpackb(packed, object_hook=self._decode_complex, use_list=1) assert unpacked[1] == 1 + 2j def test_decode_pairs_hook(self): packed = packb([3, {1: 2, 3: 4}]) prod_sum = 1 * 2 + 3 * 4 unpacked = unpackb( - packed, object_pairs_hook=lambda l: sum(k * v for k, v in l), - use_list=1) + packed, object_pairs_hook=lambda l: sum(k * v for k, v in l), use_list=1 + ) assert unpacked[1] == prod_sum def test_only_one_obj_hook(self): msg = "object_pairs_hook and object_hook are mutually exclusive" with pytest.raises(TypeError, match=msg): - unpackb(b'', object_hook=lambda x: x, - object_pairs_hook=lambda x: x) + unpackb(b"", object_hook=lambda x: x, object_pairs_hook=lambda x: x) def test_bad_hook(self): msg = r"can't serialize \(1\+2j\)" @@ -61,14 +58,14 @@ def test_bad_hook(self): def test_array_hook(self): packed = packb([1, 2, 3]) unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1) - assert unpacked == '123' + assert unpacked == "123" def test_an_exception_in_objecthook1(self): - with pytest.raises(DecodeError, match='Ooops!'): - packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}}) + with pytest.raises(DecodeError, match="Ooops!"): + packed = packb({1: {"__complex__": True, "real": 1, "imag": 2}}) unpackb(packed, object_hook=self.bad_complex_decoder) def 
test_an_exception_in_objecthook2(self): - with pytest.raises(DecodeError, match='Ooops!'): - packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]}) + with pytest.raises(DecodeError, match="Ooops!"): + packed = packb({1: [{"__complex__": True, "real": 1, "imag": 2}]}) unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1) diff --git a/pandas/tests/io/msgpack/test_pack.py b/pandas/tests/io/msgpack/test_pack.py index ba9f1ae57741d1..5fc24027589cb5 100644 --- a/pandas/tests/io/msgpack/test_pack.py +++ b/pandas/tests/io/msgpack/test_pack.py @@ -9,19 +9,38 @@ class TestPack: - def check(self, data, use_list=False): re = unpackb(packb(data), use_list=use_list) assert re == data def testPack(self): test_data = [ - 0, 1, 127, 128, 255, 256, 65535, 65536, - -1, -32, -33, -128, -129, -32768, -32769, + 0, + 1, + 127, + 128, + 255, + 256, + 65535, + 65536, + -1, + -32, + -33, + -128, + -129, + -32768, + -32769, 1.0, - b"", b"a", b"a" * 31, b"a" * 32, - None, True, False, - (), ((),), ((), None,), + b"", + b"a", + b"a" * 31, + b"a" * 32, + None, + True, + False, + (), + ((),), + ((), None), {None: 0}, (1 << 23), ] @@ -29,50 +48,54 @@ def testPack(self): self.check(td) def testPackUnicode(self): - test_data = ["", "abcd", ["defgh"], "Русский текст", ] + test_data = ["", "abcd", ["defgh"], "Русский текст"] for td in test_data: - re = unpackb( - packb(td, encoding='utf-8'), use_list=1, encoding='utf-8') + re = unpackb(packb(td, encoding="utf-8"), use_list=1, encoding="utf-8") assert re == td - packer = Packer(encoding='utf-8') + packer = Packer(encoding="utf-8") data = packer.pack(td) - re = Unpacker(BytesIO(data), encoding='utf-8', use_list=1).unpack() + re = Unpacker(BytesIO(data), encoding="utf-8", use_list=1).unpack() assert re == td def testPackUTF32(self): test_data = ["", "abcd", ["defgh"], "Русский текст"] for td in test_data: - re = unpackb( - packb(td, encoding='utf-32'), use_list=1, encoding='utf-32') + re = unpackb(packb(td, encoding="utf-32"), use_list=1, encoding="utf-32") assert re == td def testPackBytes(self): - test_data = [b"", b"abcd", (b"defgh", ), ] + test_data = [b"", b"abcd", (b"defgh",)] for td in test_data: self.check(td) def testIgnoreUnicodeErrors(self): re = unpackb( - packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', - use_list=1) + packb(b"abc\xeddef"), encoding="utf-8", unicode_errors="ignore", use_list=1 + ) assert re == "abcdef" def testStrictUnicodeUnpack(self): - msg = (r"'utf-*8' codec can't decode byte 0xed in position 3:" - " invalid continuation byte") + msg = ( + r"'utf-*8' codec can't decode byte 0xed in position 3:" + " invalid continuation byte" + ) with pytest.raises(UnicodeDecodeError, match=msg): - unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1) + unpackb(packb(b"abc\xeddef"), encoding="utf-8", use_list=1) def testStrictUnicodePack(self): - msg = (r"'ascii' codec can't encode character '\\xed' in position 3:" - r" ordinal not in range\(128\)") + msg = ( + r"'ascii' codec can't encode character '\\xed' in position 3:" + r" ordinal not in range\(128\)" + ) with pytest.raises(UnicodeEncodeError, match=msg): - packb("abc\xeddef", encoding='ascii', unicode_errors='strict') + packb("abc\xeddef", encoding="ascii", unicode_errors="strict") def testIgnoreErrorsPack(self): re = unpackb( - packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), - encoding='utf-8', use_list=1) + packb("abcФФФdef", encoding="ascii", unicode_errors="ignore"), + encoding="utf-8", + use_list=1, + ) assert re == "abcdef" def 
testNoEncoding(self): @@ -85,10 +108,8 @@ def testDecodeBinary(self): assert re == b"abc" def testPackFloat(self): - assert packb(1.0, - use_single_float=True) == b'\xca' + struct.pack('>f', 1.0) - assert packb( - 1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0) + assert packb(1.0, use_single_float=True) == b"\xca" + struct.pack(">f", 1.0) + assert packb(1.0, use_single_float=False) == b"\xcb" + struct.pack(">d", 1.0) def testArraySize(self, sizes=[0, 5, 50, 1000]): bio = BytesIO() @@ -116,7 +137,7 @@ def test_manualreset(self, sizes=[0, 5, 50, 1000]): assert unpacker.unpack() == list(range(size)) packer.reset() - assert packer.bytes() == b'' + assert packer.bytes() == b"" def testMapSize(self, sizes=[0, 5, 50, 1000]): bio = BytesIO() @@ -133,18 +154,17 @@ def testMapSize(self, sizes=[0, 5, 50, 1000]): assert unpacker.unpack() == {i: i * 2 for i in range(size)} def test_odict(self): - seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)] + seq = [(b"one", 1), (b"two", 2), (b"three", 3), (b"four", 4)] od = OrderedDict(seq) assert unpackb(packb(od), use_list=1) == dict(seq) def pair_hook(seq): return list(seq) - assert unpackb( - packb(od), object_pairs_hook=pair_hook, use_list=1) == seq + assert unpackb(packb(od), object_pairs_hook=pair_hook, use_list=1) == seq def test_pairlist(self): - pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')] + pairlist = [(b"a", 1), (2, b"b"), (b"foo", b"bar")] packer = Packer() packed = packer.pack_map_pairs(pairlist) unpacked = unpackb(packed, object_pairs_hook=list) diff --git a/pandas/tests/io/msgpack/test_read_size.py b/pandas/tests/io/msgpack/test_read_size.py index 42791b571e8e77..7d2b539f120858 100644 --- a/pandas/tests/io/msgpack/test_read_size.py +++ b/pandas/tests/io/msgpack/test_read_size.py @@ -6,29 +6,29 @@ def test_read_array_header(): unpacker = Unpacker() - unpacker.feed(packb(['a', 'b', 'c'])) + unpacker.feed(packb(["a", "b", "c"])) assert unpacker.read_array_header() == 3 - assert unpacker.unpack() == b'a' - assert unpacker.unpack() == b'b' - assert unpacker.unpack() == b'c' + assert unpacker.unpack() == b"a" + assert unpacker.unpack() == b"b" + assert unpacker.unpack() == b"c" try: unpacker.unpack() - assert 0, 'should raise exception' + assert 0, "should raise exception" except OutOfData: - assert 1, 'okay' + assert 1, "okay" def test_read_map_header(): unpacker = Unpacker() - unpacker.feed(packb({'a': 'A'})) + unpacker.feed(packb({"a": "A"})) assert unpacker.read_map_header() == 1 - assert unpacker.unpack() == B'a' - assert unpacker.unpack() == B'A' + assert unpacker.unpack() == b"a" + assert unpacker.unpack() == b"A" try: unpacker.unpack() - assert 0, 'should raise exception' + assert 0, "should raise exception" except OutOfData: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_array(): @@ -36,9 +36,9 @@ def test_incorrect_type_array(): unpacker.feed(packb(1)) try: unpacker.read_array_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_map(): @@ -46,26 +46,26 @@ def test_incorrect_type_map(): unpacker.feed(packb(1)) try: unpacker.read_map_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_correct_type_nested_array(): unpacker = Unpacker() - unpacker.feed(packb({'a': ['b', 'c', 'd']})) + unpacker.feed(packb({"a": ["b", "c", "d"]})) try: unpacker.read_array_header() - 
assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_nested_map(): unpacker = Unpacker() - unpacker.feed(packb([{'a': 'b'}])) + unpacker.feed(packb([{"a": "b"}])) try: unpacker.read_map_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" diff --git a/pandas/tests/io/msgpack/test_seq.py b/pandas/tests/io/msgpack/test_seq.py index 68be8c2d975aa9..c4ac13980bc671 100644 --- a/pandas/tests/io/msgpack/test_seq.py +++ b/pandas/tests/io/msgpack/test_seq.py @@ -8,7 +8,7 @@ def gen_binary_data(idx): - return binarydata[:idx % 300] + return binarydata[: idx % 300] def test_exceeding_unpacker_read_size(): diff --git a/pandas/tests/io/msgpack/test_sequnpack.py b/pandas/tests/io/msgpack/test_sequnpack.py index ea1e5035c78342..79feb78b3b0137 100644 --- a/pandas/tests/io/msgpack/test_sequnpack.py +++ b/pandas/tests/io/msgpack/test_sequnpack.py @@ -7,7 +7,6 @@ class TestPack: - def test_partial_data(self): unpacker = Unpacker() msg = "No more data to unpack" @@ -22,34 +21,34 @@ def test_partial_data(self): def test_foobar(self): unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'b') - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") + assert unpacker.unpack() == ord(b"o") + assert unpacker.unpack() == ord(b"o") + assert unpacker.unpack() == ord(b"b") + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") msg = "No more data to unpack" with pytest.raises(OutOfData, match=msg): unpacker.unpack() - unpacker.feed(b'foo') - unpacker.feed(b'bar') + unpacker.feed(b"foo") + unpacker.feed(b"bar") k = 0 - for o, e in zip(unpacker, 'foobarbaz'): + for o, e in zip(unpacker, "foobarbaz"): assert o == ord(e) k += 1 - assert k == len(b'foobar') + assert k == len(b"foobar") def test_foobar_skip(self): unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") unpacker.skip() - assert unpacker.unpack() == ord(b'o') + assert unpacker.unpack() == ord(b"o") unpacker.skip() - assert unpacker.unpack() == ord(b'a') + assert unpacker.unpack() == ord(b"a") unpacker.skip() msg = "No more data to unpack" with pytest.raises(OutOfData, match=msg): @@ -62,42 +61,42 @@ def test_maxbuffersize_read_size_exceeds_max_buffer_size(self): def test_maxbuffersize_bufferfull(self): unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b'foo') - with pytest.raises(BufferFull, match=r'^$'): - unpacker.feed(b'b') + unpacker.feed(b"foo") + with pytest.raises(BufferFull, match=r"^$"): + unpacker.feed(b"b") def test_maxbuffersize(self): unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b'foo') - assert ord('f') == next(unpacker) - unpacker.feed(b'b') - assert ord('o') == next(unpacker) - assert ord('o') == next(unpacker) - assert ord('b') == next(unpacker) + unpacker.feed(b"foo") + assert ord("f") == next(unpacker) + unpacker.feed(b"b") + assert ord("o") == next(unpacker) + assert ord("o") == next(unpacker) + assert ord("b") == next(unpacker) def test_readbytes(self): unpacker 
= Unpacker(read_size=3) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.read_bytes(3) == b'oob' - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") + assert unpacker.read_bytes(3) == b"oob" + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") # Test buffer refill - unpacker = Unpacker(BytesIO(b'foobar'), read_size=3) - assert unpacker.unpack() == ord(b'f') - assert unpacker.read_bytes(3) == b'oob' - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker = Unpacker(BytesIO(b"foobar"), read_size=3) + assert unpacker.unpack() == ord(b"f") + assert unpacker.read_bytes(3) == b"oob" + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") def test_issue124(self): unpacker = Unpacker() - unpacker.feed(b'\xa1?\xa1!') - assert tuple(unpacker) == (b'?', b'!') + unpacker.feed(b"\xa1?\xa1!") + assert tuple(unpacker) == (b"?", b"!") assert tuple(unpacker) == () unpacker.feed(b"\xa1?\xa1") - assert tuple(unpacker) == (b'?', ) + assert tuple(unpacker) == (b"?",) assert tuple(unpacker) == () unpacker.feed(b"!") - assert tuple(unpacker) == (b'!', ) + assert tuple(unpacker) == (b"!",) assert tuple(unpacker) == () diff --git a/pandas/tests/io/msgpack/test_subtype.py b/pandas/tests/io/msgpack/test_subtype.py index 8af7e0b91d9b7f..c82f6f6d3bf4e0 100644 --- a/pandas/tests/io/msgpack/test_subtype.py +++ b/pandas/tests/io/msgpack/test_subtype.py @@ -17,7 +17,7 @@ class MyTuple(tuple): pass -MyNamedTuple = namedtuple('MyNamedTuple', 'x y') +MyNamedTuple = namedtuple("MyNamedTuple", "x y") def test_types(): diff --git a/pandas/tests/io/msgpack/test_unpack.py b/pandas/tests/io/msgpack/test_unpack.py index f33e0865a11451..483e09efe6bb8f 100644 --- a/pandas/tests/io/msgpack/test_unpack.py +++ b/pandas/tests/io/msgpack/test_unpack.py @@ -7,7 +7,6 @@ class TestUnpack: - def test_unpack_array_header_from_file(self): f = BytesIO(packb([1, 2, 3, 4])) unpacker = Unpacker(f) @@ -21,8 +20,8 @@ def test_unpack_array_header_from_file(self): unpacker.unpack() def test_unpacker_hook_refcnt(self): - if not hasattr(sys, 'getrefcount'): - pytest.skip('no sys.getrefcount()') + if not hasattr(sys, "getrefcount"): + pytest.skip("no sys.getrefcount()") result = [] def hook(x): @@ -47,9 +46,8 @@ def hook(x): def test_unpacker_ext_hook(self): class MyUnpacker(Unpacker): - def __init__(self): - super().__init__(ext_hook=self._hook, encoding='utf-8') + super().__init__(ext_hook=self._hook, encoding="utf-8") def _hook(self, code, data): if code == 1: @@ -58,9 +56,9 @@ def _hook(self, code, data): return ExtType(code, data) unpacker = MyUnpacker() - unpacker.feed(packb({'a': 1}, encoding='utf-8')) - assert unpacker.unpack() == {'a': 1} - unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8')) - assert unpacker.unpack() == {'a': 123} - unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8')) - assert unpacker.unpack() == {'a': ExtType(2, b'321')} + unpacker.feed(packb({"a": 1}, encoding="utf-8")) + assert unpacker.unpack() == {"a": 1} + unpacker.feed(packb({"a": ExtType(1, b"123")}, encoding="utf-8")) + assert unpacker.unpack() == {"a": 123} + unpacker.feed(packb({"a": ExtType(2, b"321")}, encoding="utf-8")) + assert unpacker.unpack() == {"a": ExtType(2, b"321")} diff --git a/pandas/tests/io/msgpack/test_unpack_raw.py b/pandas/tests/io/msgpack/test_unpack_raw.py index 09ebb681d87092..f844553bfc34a4 100644 --- 
a/pandas/tests/io/msgpack/test_unpack_raw.py +++ b/pandas/tests/io/msgpack/test_unpack_raw.py @@ -7,16 +7,16 @@ def test_write_bytes(): unpacker = Unpacker() - unpacker.feed(b'abc') + unpacker.feed(b"abc") f = io.BytesIO() - assert unpacker.unpack(f.write) == ord('a') - assert f.getvalue() == b'a' + assert unpacker.unpack(f.write) == ord("a") + assert f.getvalue() == b"a" f = io.BytesIO() assert unpacker.skip(f.write) is None - assert f.getvalue() == b'b' + assert f.getvalue() == b"b" f = io.BytesIO() assert unpacker.skip() is None - assert f.getvalue() == b'' + assert f.getvalue() == b"" def test_write_bytes_multi_buffer(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8e35b58b90c480..2c347a096006a9 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -12,8 +12,7 @@ class BaseParser: def update_kwargs(self, kwargs): kwargs = kwargs.copy() - kwargs.update(dict(engine=self.engine, - low_memory=self.low_memory)) + kwargs.update(dict(engine=self.engine, low_memory=self.low_memory)) return kwargs @@ -67,19 +66,16 @@ def csv1(csv_dir_path): _all_parser_ids = _c_parser_ids + _py_parser_ids -@pytest.fixture(params=_all_parsers, - ids=_all_parser_ids) +@pytest.fixture(params=_all_parsers, ids=_all_parser_ids) def all_parsers(request): return request.param -@pytest.fixture(params=_c_parsers_only, - ids=_c_parser_ids) +@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) def c_parser_only(request): return request.param -@pytest.fixture(params=_py_parsers_only, - ids=_py_parser_ids) +@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): return request.param diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 13f547bde692fd..77b52eb90d61fc 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -22,10 +22,9 @@ @pytest.mark.parametrize( "malformed", - ["1\r1\r1\r 1\r 1\r", - "1\r1\r1\r 1\r 1\r11\r", - "1\r1\r1\r 1\r 1\r11\r1\r"], - ids=["words pointer", "stream pointer", "lines pointer"]) + ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], + ids=["words pointer", "stream pointer", "lines pointer"], +) def test_buffer_overflow(c_parser_only, malformed): # see gh-9205: test certain malformed input files that cause # buffer overflows in tokenizer.c @@ -41,17 +40,17 @@ def test_buffer_rd_bytes(c_parser_only): # to a segfault if a corrupt gzip file is read with 'read_csv', and the # buffer is filled more than once before gzip raises an Exception. 
- data = "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" \ - "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" \ - "\xA6\x4D" + "\x55" * 267 + \ - "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" \ - "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" + data = ( + "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" + "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" + "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" + "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" + ) parser = c_parser_only for _ in range(100): try: - parser.read_csv(StringIO(data), compression="gzip", - delim_whitespace=True) + parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True) except Exception: pass @@ -61,10 +60,8 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - df = parser.read_csv(StringIO(data), lineterminator="~", - delim_whitespace=True) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["a", "b", "c"]) + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -82,18 +79,15 @@ def test_dtype_and_names_error(c_parser_only): expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), sep=r"\s+", - header=None, names=["a", "b"]) - expected = DataFrame( - [[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # fallback casting - result = parser.read_csv(StringIO( - data), sep=r"\s+", header=None, - names=["a", "b"], dtype={"a": np.int32}) - expected = DataFrame([[1, 1], [2, 2], [3, 3]], - columns=["a", "b"]) + result = parser.read_csv( + StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} + ) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(result, expected) @@ -104,31 +98,46 @@ def test_dtype_and_names_error(c_parser_only): """ # fallback casting, but not castable with pytest.raises(ValueError, match="cannot safely convert"): - parser.read_csv(StringIO(data), sep=r"\s+", header=None, - names=["a", "b"], dtype={"a": np.int32}) - - -@pytest.mark.parametrize("match,kwargs", [ - # For each of these cases, all of the dtypes are valid, just unsupported. 
- (("the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead"), - dict(dtype={"A": "datetime64", "B": "float64"})), - - (("the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead"), - dict(dtype={"A": "datetime64", "B": "float64"}, - parse_dates=["B"])), + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) - ("the dtype timedelta64 is not supported for parsing", - dict(dtype={"A": "timedelta64", "B": "float64"})), - ("the dtype 262144b) parser = c_parser_only - header_narrow = "\t".join(["COL_HEADER_" + str(i) - for i in range(10)]) + "\n" - data_narrow = "\t".join(["somedatasomedatasomedata1" - for _ in range(10)]) + "\n" - header_wide = "\t".join(["COL_HEADER_" + str(i) - for i in range(15)]) + "\n" - data_wide = "\t".join(["somedatasomedatasomedata2" - for _ in range(15)]) + "\n" - test_input = (header_narrow + data_narrow * 1050 + - header_wide + data_wide * 2) + header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" + data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" + header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" + data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" + test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) @@ -414,8 +434,7 @@ def test_read_nrows_large(c_parser_only): def test_float_precision_round_trip_with_text(c_parser_only): # see gh-15140 parser = c_parser_only - df = parser.read_csv(StringIO("a"), header=None, - float_precision="round_trip") + df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") tm.assert_frame_equal(df, DataFrame({0: ["a"]})) @@ -439,7 +458,7 @@ def test_data_after_quote(c_parser_only): # see gh-15910 parser = c_parser_only - data = "a\n1\n\"b\"a" + data = 'a\n1\n"b"a' result = parser.read_csv(StringIO(data)) expected = DataFrame({"a": ["1", "ba"]}) @@ -459,18 +478,19 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = parser.read_csv(StringIO(test_input), comment="#", header=None, - delimiter="\\s+", skiprows=0, - error_bad_lines=False) + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + error_bad_lines=False, + ) captured = capsys.readouterr() # skipped lines 2, 3, 4, 9 for line_num in (2, 3, 4, 9): assert "Skipping line {}".format(line_num) in captured.err - expected = DataFrame([[1, 2], - [5, 2], - [6, 2], - [7, np.nan], - [8, np.nan]]) + expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) @@ -531,8 +551,7 @@ def test_bytes_exceed_2gb(c_parser_only): if parser.low_memory: pytest.skip("not a high_memory test") - csv = StringIO("strings\n" + "\n".join( - ["x" * (1 << 20) for _ in range(2100)])) + csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) df = parser.read_csv(csv) assert not df.empty diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 89c9f300b2cf41..e1d422142ab0bb 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -18,18 +18,16 @@ def test_comment(all_parsers, na_values): 1,2.,4.#hello world 5.,NaN,10.0 """ - expected = DataFrame([[1., 2., 4.], [5., np.nan, 
10.]], - columns=["A", "B", "C"]) - result = parser.read_csv(StringIO(data), comment="#", - na_values=na_values) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("read_kwargs", [ - dict(), - dict(lineterminator="*"), - dict(delim_whitespace=True), -]) +@pytest.mark.parametrize( + "read_kwargs", [dict(), dict(lineterminator="*"), dict(delim_whitespace=True)] +) def test_line_comment(all_parsers, read_kwargs): parser = all_parsers data = """# empty @@ -49,8 +47,9 @@ def test_line_comment(all_parsers, read_kwargs): read_kwargs["comment"] = "#" result = parser.read_csv(StringIO(data), **read_kwargs) - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) tm.assert_frame_equal(result, expected) @@ -65,8 +64,9 @@ def test_comment_skiprows(all_parsers): 5.,NaN,10.0 """ # This should ignore the first four lines (including comments). - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", skiprows=4) tm.assert_frame_equal(result, expected) @@ -81,8 +81,9 @@ def test_comment_header(all_parsers): 5.,NaN,10.0 """ # Header should begin at the second non-comment line. - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", header=1) tm.assert_frame_equal(result, expected) @@ -101,8 +102,9 @@ def test_comment_skiprows_header(all_parsers): # Skiprows should skip the first 4 lines (including comments), # while header should start from the second non-commented line, # starting with line 5. 
- expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) tm.assert_frame_equal(result, expected) @@ -111,8 +113,9 @@ def test_comment_skiprows_header(all_parsers): def test_custom_comment_char(all_parsers, comment_char): parser = all_parsers data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" - result = parser.read_csv(StringIO(data.replace("#", comment_char)), - comment=comment_char) + result = parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c74e57627d679b..7d5bf9ec850bcf 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -55,17 +55,16 @@ def _set_noconvert_columns(self): parse_dates = [[1, 2]] cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) parser = MyTextFileReader() - parser.options = {"usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ","} + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() @@ -104,14 +103,14 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): # and swallowing the exception that caused read to fail. path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") codec = codecs.lookup("utf-8") - utf8 = codecs.lookup('utf-8') + utf8 = codecs.lookup("utf-8") parser = all_parsers msg = "'utf-8' codec can't decode byte" # Stream must be binary UTF8. 
with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) as stream: + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: with pytest.raises(UnicodeDecodeError, match=msg): parser.read_csv(stream) @@ -124,21 +123,30 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007]], - columns=["A", "B", "C", "D"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11)], name="index")) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) tm.assert_frame_equal(result, expected) @@ -148,11 +156,7 @@ def test_1000_sep(all_parsers): 1|2,334|5 10|13|10. """ - expected = DataFrame({ - "A": [1, 10], - "B": [2334, 13], - "C": [5, 10.] 
- }) + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected) @@ -168,8 +172,7 @@ def test_squeeze(all_parsers): index = Index(["a", "b", "c"], name=0) expected = Series([1, 2, 3], name=1, index=index) - result = parser.read_csv(StringIO(data), index_col=0, - header=None, squeeze=True) + result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) tm.assert_series_equal(result, expected) # see gh-8217 @@ -203,9 +206,10 @@ def test_malformed_chunks(all_parsers, nrows): 2,3,4 """ parser = all_parsers - msg = 'Expected 3 fields in line 6, saw 5' - reader = parser.read_csv(StringIO(data), header=1, comment="#", - iterator=True, chunksize=1, skiprows=[2]) + msg = "Expected 3 fields in line 6, saw 5" + reader = parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) with pytest.raises(ParserError, match=msg): reader.read(nrows) @@ -218,12 +222,11 @@ def test_unnamed_columns(all_parsers): 11,12,13,14,15 """ parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], - dtype=np.int64, columns=["A", "B", "C", - "Unnamed: 3", - "Unnamed: 4"]) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -235,9 +238,7 @@ def test_csv_mixed_type(all_parsers): c,4,5 """ parser = all_parsers - expected = DataFrame({"A": ["a", "b", "c"], - "B": [1, 3, 4], - "C": [2, 4, 5]}) + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -254,8 +255,7 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): 2,2,3,4 3,3,4,5 """ - result = parser.read_csv(StringIO(data), low_memory=True, - index_col=0, nrows=0) + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) expected = DataFrame(columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -264,21 +264,30 @@ def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007]], - columns=["A", "B", "C", "D"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11)], name="index")) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 
10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) tm.assert_frame_equal(result, expected) @@ -287,22 +296,25 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): csv2 = os.path.join(csv_dir_path, "test2.csv") result = parser.read_csv(csv2, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, - -1.159738, "foo"], - [1.047916, -0.041232, -0.16181208307, - 0.212549, "bar"], - [0.498581, 0.731168, -0.537677223318, - 1.346270, "baz"], - [1.120202, 1.567621, 0.00364077397681, - 0.675253, "qux"], - [-0.487094, 0.571455, -1.6116394093, - 0.103469, "foo2"]], - columns=["A", "B", "C", "D", "E"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7)])) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) tm.assert_frame_equal(result, expected) @@ -341,12 +353,18 @@ def test_read_duplicate_index_explicit(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=0) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], - [12, 13, 14, 15], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", - "qux", "foo", "bar"], name="index")) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) tm.assert_frame_equal(result, expected) @@ -362,29 +380,49 @@ def test_read_duplicate_index_implicit(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], - [12, 13, 14, 15], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", - "qux", "foo", "bar"])) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("A,B\nTrue,1\nFalse,2\nTrue,3", dict(), - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), - ("A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", - dict(true_values=["yes", "Yes", "YES"], - false_values=["no", "NO", "No"]), - DataFrame([[True, 1], [False, 2], [True, 3], - [False, 3], [True, 3]], columns=["A", "B"])), - ("A,B\nTRUE,1\nFALSE,2\nTRUE,3", dict(), - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), - ("A,B\nfoo,bar\nbar,foo", dict(true_values=["foo"], - false_values=["bar"]), - DataFrame([[True, False], [False, True]], columns=["A", "B"])) -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + 
dict(true_values=["yes", "Yes", "YES"], false_values=["no", "NO", "No"]), + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + dict(true_values=["foo"], false_values=["bar"]), + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) def test_parse_bool(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) @@ -415,10 +453,10 @@ def test_read_nrows(all_parsers, nrows): foo2,12,13,14,15 bar2,12,13,14,15 """ - expected = DataFrame([["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"]) + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) parser = all_parsers result = parser.read_csv(StringIO(data), nrows=nrows) @@ -455,13 +493,17 @@ def test_read_chunksize_with_index(all_parsers, index_col): """ reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) - expected = DataFrame([["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15], - ["qux", 12, 13, 14, 15], - ["foo2", 12, 13, 14, 15], - ["bar2", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"]) + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) expected = expected.set_index("index") chunks = list(reader) @@ -578,8 +620,7 @@ def test_read_data_list(all_parsers): kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" - data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], - ["bar", "4", "5", "6"]] + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] expected = parser.read_csv(StringIO(data), **kwargs) parser = TextParser(data_list, chunksize=2, **kwargs) @@ -622,9 +663,11 @@ def test_iterator2(all_parsers): reader = parser.read_csv(StringIO(data), iterator=True) result = list(reader) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result[0], expected) @@ -685,18 +728,17 @@ def test_iterator_stop_on_chunksize(all_parsers): result = list(reader) assert len(result) == 3 - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(concat(result), expected) -@pytest.mark.parametrize("kwargs", [ - dict(iterator=True, - chunksize=1), - dict(iterator=True), - dict(chunksize=1) -]) +@pytest.mark.parametrize( + "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] +) def test_iterator_skipfooter_errors(all_parsers, kwargs): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers @@ -715,33 +757,62 @@ def test_nrows_skipfooter_errors(all_parsers): parser.read_csv(StringIO(data), skipfooter=1, nrows=5) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""foo,2,3,4,5 +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + 
"""foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 -""", dict(index_col=0, names=["index", "A", "B", "C", "D"]), - DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]], - index=Index(["foo", "bar", "baz", "qux", - "foo2", "bar2"], name="index"), - columns=["A", "B", "C", "D"])), - ("""foo,one,2,3,4,5 +""", + dict(index_col=0, names=["index", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 foo,two,7,8,9,10 foo,three,12,13,14,15 bar,one,12,13,14,15 bar,two,12,13,14,15 -""", dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), - DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - index=MultiIndex.from_tuples([ - ("foo", "one"), ("foo", "two"), ("foo", "three"), - ("bar", "one"), ("bar", "two")], - names=["index1", "index2"]), - columns=["A", "B", "C", "D"])), -]) +""", + dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) def test_pass_names_with_index(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) @@ -757,14 +828,14 @@ def test_multi_index_no_level_names(all_parsers, index_col): bar,one,12,13,14,15 bar,two,12,13,14,15 """ - headless_data = '\n'.join(data.split("\n")[1:]) + headless_data = "\n".join(data.split("\n")[1:]) names = ["A", "B", "C", "D"] parser = all_parsers - result = parser.read_csv(StringIO(headless_data), - index_col=index_col, - header=None, names=names) + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) expected = parser.read_csv(StringIO(data), index_col=index_col) # No index names in headless data. 
@@ -783,20 +854,39 @@ def test_multi_index_no_level_names_implicit(all_parsers): """ result = parser.read_csv(StringIO(data)) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=MultiIndex.from_tuples([ - ("foo", "one"), ("foo", "two"), ("foo", "three"), - ("bar", "one"), ("bar", "two")])) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected,header", [ - ("a,b", DataFrame(columns=["a", "b"]), [0]), - ("a,b\nc,d", DataFrame(columns=MultiIndex.from_tuples( - [("a", "c"), ("b", "d")])), [0, 1]), -]) +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) @pytest.mark.parametrize("round_trip", [True, False]) def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): # see gh-14545 @@ -815,9 +905,10 @@ def test_no_unnamed_index(all_parsers): 2 2 2 e f """ result = parser.read_csv(StringIO(data), sep=" ") - expected = DataFrame([[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], - [2, 2, 2, "e", "f"]], columns=["Unnamed: 0", "id", - "c0", "c1", "c2"]) + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) tm.assert_frame_equal(result, expected) @@ -840,8 +931,10 @@ def test_url(all_parsers, csv_dir_path): parser = all_parsers kwargs = dict(sep="\t") - url = ("https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/parser/data/salaries.csv") + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) url_result = parser.read_csv(url, **kwargs) local_path = os.path.join(csv_dir_path, "salaries.csv") @@ -869,8 +962,7 @@ def test_local_file(all_parsers, csv_dir_path): def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() - result = tm.round_trip_pathlib( - df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) @@ -878,7 +970,8 @@ def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) tm.assert_frame_equal(df, result) @@ -888,14 +981,12 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = "%s.csv" % tm.rands(10) - msg = ("does not exist" if parser.engine == "c" - else r"\[Errno 2\]") + msg = "does not exist" if parser.engine == "c" else r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) filename = e.value.filename - filename = filename.decode() if isinstance( - filename, bytes) else filename + filename = filename.decode() if isinstance(filename, bytes) else filename assert path == filename @@ -908,30 +999,70 @@ def test_missing_trailing_delimiters(all_parsers): 1,4,5""" result = parser.read_csv(StringIO(data)) - expected = DataFrame([[1, 2, 3, 4], [1, 3, 3, np.nan], - [1, 4, 5, 
np.nan]], columns=["A", "B", "C", "D"]) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) tm.assert_frame_equal(result, expected) def test_skip_initial_space(all_parsers): - data = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' - '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' - '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' - '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' - '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') - parser = all_parsers - - result = parser.read_csv(StringIO(data), names=list(range(33)), - header=None, na_values=["-9999.0"], - skipinitialspace=True) - expected = DataFrame([["09-Apr-2012", "01:10:18.300", 2456026.548822908, - 12849, 1.00361, 1.12551, 330.65659, - 355626618.16711, 73.48821, 314.11625, 1917.09447, - 179.71425, 80.0, 240.0, -350, 70.06056, 344.9837, - 1, 1, -0.689265, -0.692787, 0.212036, 14.7674, - 41.605, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, 0, 12, 128]]) + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) tm.assert_frame_equal(result, expected) @@ -944,13 +1075,16 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): skip this too A,B,C 1,2,3 -4,5,6""".replace(",", sep) +4,5,6""".replace( + ",", sep + ) path = "__%s__.csv" % tm.rands(10) kwargs = dict(sep=sep, skiprows=2) utf8 = "utf-8" with tm.ensure_clean(path) as path: from io import TextIOWrapper + bytes_data = data.encode(encoding) with open(path, "wb") as f: @@ -981,7 +1115,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): result = result.set_index(0) got = result[1][1632] - expected = '\xc1 k\xf6ldum klaka (Cold Fever) (1994)' + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" assert got == expected @@ -1007,13 +1141,14 @@ def test_escapechar(all_parsers): "SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa parser = all_parsers - result = parser.read_csv(StringIO(data), escapechar='\\', - quotechar='"', encoding='utf-8') + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) - assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", ' - 'IKEA:s 1700-tals serie') - tm.assert_index_equal(result.columns, - Index(['SEARCH_TERM', 'ACTUAL_URL'])) + assert result["SEARCH_TERM"][2] == ( + 'SLAGBORD, "Bergslagen", ' "IKEA:s 1700-tals serie" + ) + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) def 
test_int64_min_issues(all_parsers): @@ -1040,16 +1175,22 @@ def test_parse_integers_above_fp_precision(all_parsers): 17007000002000194""" parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame({"Numbers": [17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194]}) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) tm.assert_frame_equal(result, expected) @@ -1101,10 +1242,12 @@ def test_catch_too_many_names(all_parsers): 7,8,9 10,11,12\n""" parser = all_parsers - msg = ("Too many columns specified: " - "expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file") + msg = ( + "Too many columns specified: " "expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did not match " + "number of header fields in the file" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) @@ -1146,9 +1289,9 @@ def test_empty_with_multi_index(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=["x", "y"]) - expected = DataFrame(columns=["z"], - index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"])) + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) tm.assert_frame_equal(result, expected) @@ -1157,9 +1300,9 @@ def test_empty_with_reversed_multi_index(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=[1, 0]) - expected = DataFrame(columns=["z"], - index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"])) + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) tm.assert_frame_equal(result, expected) @@ -1175,15 +1318,14 @@ def test_float_parser(all_parsers): def test_scientific_no_exponent(all_parsers): # see gh-12215 - df = DataFrame.from_dict(OrderedDict([("w", ["2e"]), ("x", ["3E"]), - ("y", ["42e"]), - ("z", ["632E"])])) + df = DataFrame.from_dict( + OrderedDict([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])]) + ) data = df.to_csv(index=False) parser = all_parsers for precision in parser.float_precision_choices: - df_roundtrip = parser.read_csv(StringIO(data), - float_precision=precision) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) tm.assert_frame_equal(df_roundtrip, df) @@ -1203,31 +1345,36 @@ def test_int64_overflow(all_parsers, conv): # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. 
result = parser.read_csv(StringIO(data)) - expected = DataFrame(["00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166"], columns=["ID"]) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) tm.assert_frame_equal(result, expected) else: # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. - msg = ("(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)") + msg = ( + "(Python int too large to convert to C long)|" + "(long too big to convert)|" + "(int too big to convert)" + ) with pytest.raises(OverflowError, match=msg): parser.read_csv(StringIO(data), converters={"ID": conv}) -@pytest.mark.parametrize("val", [ - np.iinfo(np.uint64).max, - np.iinfo(np.int64).max, - np.iinfo(np.int64).min -]) +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] +) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 # range, so they should be parsed as string. @@ -1238,10 +1385,9 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("val", [ - np.iinfo(np.uint64).max + 1, - np.iinfo(np.int64).min - 1 -]) +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) def test_outside_int64_uint64_range(all_parsers, val): # These numbers fall just outside the int64-uint64 # range, so they should be parsed as string. @@ -1252,8 +1398,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], - [str(2**63), str(-1)]]) +@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 # values, so they should be cast as string. 
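
As background for the int64/uint64 overflow hunks just above, a minimal sketch of the behaviour those tests assert; the value mirrors what the tests construct and is for illustration only:

    import numpy as np
    import pandas as pd
    from io import StringIO

    # A value one past the uint64 maximum fits no integer dtype, so
    # read_csv keeps it as a string and the column ends up object dtype.
    too_big = np.iinfo(np.uint64).max + 1
    result = pd.read_csv(StringIO(str(too_big)), header=None)
    assert result[0].dtype == object
    assert result[0][0] == str(too_big)
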
@@ -1282,55 +1427,101 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected,msg", [ - # gh-10728: WHITESPACE_LINE - ("a,b,c\n4,5,6\n ", dict(), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # gh-10548: EAT_LINE_COMMENT - ("a,b,c\n4,5,6\n#comment", dict(comment="#"), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_CRNL_NOP - ("a,b,c\n4,5,6\n\r", dict(), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_COMMENT - ("a,b,c\n4,5,6#comment", dict(comment="#"), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # SKIP_LINE - ("a,b,c\n4,5,6\nskipme", dict(skiprows=[2]), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_LINE_COMMENT - ("a,b,c\n4,5,6\n#comment", dict(comment="#", skip_blank_lines=False), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # IN_FIELD - ("a,b,c\n4,5,6\n ", dict(skip_blank_lines=False), - DataFrame([["4", 5, 6], [" ", None, None]], - columns=["a", "b", "c"]), None), - - # EAT_CRNL - ("a,b,c\n4,5,6\n\r", dict(skip_blank_lines=False), - DataFrame([[4, 5, 6], [None, None, None]], - columns=["a", "b", "c"]), None), - - # ESCAPED_CHAR - ("a,b,c\n4,5,6\n\\", dict(escapechar="\\"), - None, "(EOF following escape character)|(unexpected end of data)"), - - # ESCAPE_IN_QUOTED_FIELD - ('a,b,c\n4,5,6\n"\\', dict(escapechar="\\"), - None, "(EOF inside string starting at row 2)|(unexpected end of data)"), - - # IN_QUOTED_FIELD - ('a,b,c\n4,5,6\n"', dict(escapechar="\\"), - None, "(EOF inside string starting at row 2)|(unexpected end of data)"), -], ids=["whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment", - "skip-line", "eat-line-comment", "in-field", "eat-crnl", - "escaped-char", "escape-in-quoted-field", "in-quoted-field"]) +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + dict(skiprows=[2]), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#", skip_blank_lines=False), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + dict(skip_blank_lines=False), + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + dict(skip_blank_lines=False), + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + dict(escapechar="\\"), + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + 
"eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) def test_eof_states(all_parsers, data, kwargs, expected, msg): # see gh-10728, gh-10548 parser = all_parsers @@ -1359,25 +1550,31 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) else: - expected = DataFrame({ - "a": [0, 3, 8], - "b": [1, 4, 9] - }) + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - # First, check to see that the response of parser when faced with no - # provided columns raises the correct error, with or without usecols. - ("", dict(), None), - ("", dict(usecols=["X"]), None), - (",,", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), - DataFrame(columns=["X"], index=[0], dtype=np.float64)), - ("", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), - DataFrame(columns=["X"])), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. + ("", dict(), None), + ("", dict(usecols=["X"]), None), + ( + ",,", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"]), + ), + ], +) def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): # see gh-12493 parser = all_parsers @@ -1391,19 +1588,29 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - # gh-8661, gh-8679: this should ignore six lines, including - # lines with trailing whitespace and blank lines. - (dict(header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6], - skip_blank_lines=True), DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]])), - - # gh-8983: test skipping set of rows after a row with trailing spaces. - (dict(delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], - skip_blank_lines=True), DataFrame({"A": [1., 5.1], - "B": [2., np.nan], - "C": [4., 10]})), -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + dict( + header=None, + delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], + skip_blank_lines=True, + ), + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. 
+ ( + dict( + delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True + ), + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) def test_trailing_spaces(all_parsers, kwargs, expected): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa parser = all_parsers @@ -1433,18 +1640,31 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): b\n""" expected = DataFrame({"MyColumn": list("abab")}) - result = parser.read_csv(StringIO(data), skipinitialspace=True, - delim_whitespace=delim_whitespace) + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep,skip_blank_lines,exp_data", [ - (",", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), - (r"\s+", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), - (",", False, [[1., 2., 4.], [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], [5., np.nan, 10.], - [np.nan, np.nan, np.nan], [-70., .4, 1.]]), -]) +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): parser = all_parsers data = """\ @@ -1460,8 +1680,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): if sep == r"\s+": data = data.replace(",", " ") - result = parser.read_csv(StringIO(data), sep=sep, - skip_blank_lines=skip_blank_lines) + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -1476,22 +1695,32 @@ def test_whitespace_lines(all_parsers): \t 1,2.,4. 5.,NaN,10.0 """ - expected = DataFrame([[1, 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - (""" A B C D +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D a 1 2 3 4 b 1 2 3 4 c 1 2 3 4 -""", DataFrame([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - columns=["A", "B", "C", "D"], index=["a", "b", "c"])), - (" a b c\n1 2 3 \n4 5 6\n 7 8 9", - DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])), -]) +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) def test_whitespace_regex_separator(all_parsers, data, expected): # see gh-6607 parser = all_parsers @@ -1563,27 +1792,34 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data,thousands,decimal", [ - ("""A|B|C +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C 1|2,334.01|5 10|13|10. 
-""", ",", "."), - ("""A|B|C +""", + ",", + ".", + ), + ( + """A|B|C 1|2.334,01|5 10|13|10, -""", ".", ","), -]) +""", + ".", + ",", + ), + ], +) def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): parser = all_parsers - expected = DataFrame({ - "A": [1, 10], - "B": [2334.01, 13], - "C": [5, 10.] - }) + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - result = parser.read_csv(StringIO(data), sep="|", - thousands=thousands, - decimal=decimal) + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) tm.assert_frame_equal(result, expected) @@ -1595,11 +1831,14 @@ def test_euro_decimal_format(all_parsers): 3;878,158;108013,434;GHI;rez;2,735694704""" result = parser.read_csv(StringIO(data), sep=";", decimal=",") - expected = DataFrame([ - [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], - [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], - [3, 878.158, 108013.434, "GHI", "rez", 2.735694704] - ], columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) tm.assert_frame_equal(result, expected) @@ -1618,9 +1857,10 @@ def test_inf_parsing(all_parsers, na_filter): h,-INf i,inF j,-inF""" - expected = DataFrame({"A": [float("inf"), float("-inf")] * 5}, - index=["a", "b", "c", "d", "e", - "f", "g", "h", "i", "j"]) + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -1639,11 +1879,9 @@ def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers - expected = DataFrame({ - "a": [1, 2, 3], - "b": ["one", "two", "three"], - "c": ["I", "II", "III"] - }) + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) result = parser.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(result, expected) @@ -1665,23 +1903,25 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@pytest.mark.parametrize("data,kwargs,expected", [ - # Basic test - ("a\n1", dict(), DataFrame({"a": [1]})), - - # "Regular" quoting - ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), - - # Test in a data row instead of header - ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), - - # Test in empty data row with skipping - ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), - - # Test in empty data row without skipping - ("\n1", dict(names=["a"], skip_blank_lines=False), - DataFrame({"a": [np.nan, 1]})), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) def test_utf8_bom(all_parsers, data, kwargs, 
expected): # see gh-4793 parser = all_parsers @@ -1692,8 +1932,7 @@ def _encode_data_with_bom(_data): bom_data = (bom + _data).encode(utf8) return BytesIO(bom_data) - result = parser.read_csv(_encode_data_with_bom(data), - encoding=utf8, **kwargs) + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) @@ -1715,8 +1954,7 @@ def test_temporary_file(all_parsers): @pytest.mark.parametrize("byte", [8, 16]) -@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", - "UTF-{0}", "UTF_{0}"]) +@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) def test_read_csv_utf_aliases(all_parsers, byte, fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -1743,8 +1981,7 @@ def test_internal_eof_byte_to_file(all_parsers): # see gh-16559 parser = all_parsers data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = DataFrame([["test \x1a test", " test"]], - columns=["c1", "c2"]) + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) path = "__%s__.csv" % tm.rands(10) with tm.ensure_clean(path) as path: @@ -1834,14 +2071,13 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(), # Default is True. - dict(error_bad_lines=True), # Explicitly pass in. -]) -@pytest.mark.parametrize("warn_kwargs", [ - dict(), dict(warn_bad_lines=True), - dict(warn_bad_lines=False) -]) +@pytest.mark.parametrize( + "kwargs", + [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. +) +@pytest.mark.parametrize( + "warn_kwargs", [dict(), dict(warn_bad_lines=True), dict(warn_bad_lines=False)] +) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 parser = all_parsers @@ -1859,9 +2095,7 @@ def test_warn_bad_lines(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=True) + result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() @@ -1875,17 +2109,19 @@ def test_suppress_error_output(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=False) + result = parser.read_csv( + StringIO(data), error_bad_lines=False, warn_bad_lines=False + ) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert captured.err == "" -@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36, - reason="On Python < 3.6 won't pass on Windows") +@pytest.mark.skipif( + compat.is_platform_windows() and not compat.PY36, + reason="On Python < 3.6 won't pass on Windows", +) @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"]) def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. 
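# Illustrative sketch, not part of the patch: the bad-line handling that
# test_warn_bad_lines and test_suppress_error_output above verify, using the
# error_bad_lines/warn_bad_lines keywords as they exist at this point in the
# series. Malformed rows are dropped; warn_bad_lines only decides whether a
# "Skipping line ..." message is written to stderr.
import io

import pandas as pd

data = "a\n1\n1,2,3\n4\n5,6,7"
df = pd.read_csv(io.StringIO(data), error_bad_lines=False, warn_bad_lines=True)
print(df)  # one column "a" with values [1, 4]; rows with extra fields are skipped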
@@ -1932,6 +2168,6 @@ def test_first_row_bom(all_parsers): parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' - result = parser.read_csv(StringIO(data), delimiter='\t') + result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index a04a5c4e4ad6b1..06ae2c0fef1b9e 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -76,8 +76,7 @@ def test_zip_error_invalid_zip(parser_and_data): with tm.ensure_clean() as path: with open(path, "wb") as f: - with pytest.raises(zipfile.BadZipfile, - match="File is not a zip file"): + with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"): parser.read_csv(f, compression="zip") @@ -90,8 +89,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: - pytest.skip("Cannot deduce compression from " - "buffer of compressed data.") + pytest.skip("Cannot deduce compression from " "buffer of compressed data.") with tm.ensure_clean(filename=filename) as path: tm.write_to_compressed(compress_type, path, data) @@ -130,12 +128,13 @@ def test_compression_utf16_encoding(all_parsers, csv_dir_path): parser = all_parsers path = os.path.join(csv_dir_path, "utf16_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", - compression="zip", sep="\t") - expected = pd.DataFrame({ - "Country": ["Venezuela", "Venezuela"], - "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."] - }) + result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + expected = pd.DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) tm.assert_frame_equal(result, expected) @@ -145,8 +144,7 @@ def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) - msg = ("Unrecognized compression " - "type: {compression}".format(**compress_kwargs)) + msg = "Unrecognized compression " "type: {compression}".format(**compress_kwargs) with pytest.raises(ValueError, match=msg): parser.read_csv("test_file.zip", **compress_kwargs) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 4df99d396b7ac4..2a3b1dc82fc59f 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -24,10 +24,9 @@ def test_converters_type_must_be_dict(all_parsers): @pytest.mark.parametrize("column", [3, "D"]) -@pytest.mark.parametrize("converter", [ - parse, - lambda x: int(x.split("/")[2]) # Produce integer. -]) +@pytest.mark.parametrize( + "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer. +) def test_converters(all_parsers, column, converter): parser = all_parsers data = """A,B,C,D @@ -49,8 +48,7 @@ def test_converters_no_implicit_conv(all_parsers): data = """000102,1.2,A\n001245,2,B""" converters = {0: lambda x: x.strip()} - result = parser.read_csv(StringIO(data), header=None, - converters=converters) + result = parser.read_csv(StringIO(data), header=None, converters=converters) # Column 0 should not be casted to numeric and should remain as object. 
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]]) @@ -66,15 +64,19 @@ def test_converters_euro_decimal_format(all_parsers): 1;1521,1541;187101,9543;ABC;poi;4,7387 2;121,12;14897,76;DEF;uyt;0,3773 3;878,158;108013,434;GHI;rez;2,7356""" - converters["Number1"] = converters["Number2"] =\ - converters["Number3"] = lambda x: float(x.replace(",", ".")) + converters["Number1"] = converters["Number2"] = converters[ + "Number3" + ] = lambda x: float(x.replace(",", ".")) result = parser.read_csv(StringIO(data), sep=";", converters=converters) - expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], - [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], - [3, 878.158, 108013.434, "GHI", "rez", 2.7356]], - columns=["Id", "Number1", "Number2", - "Text1", "Text2", "Number3"]) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], + [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], + [3, 878.158, 108013.434, "GHI", "rez", 2.7356], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) tm.assert_frame_equal(result, expected) @@ -134,10 +136,11 @@ def convert_score(x): results = [] for day_converter in [convert_days, convert_days_sentinel]: - result = parser.read_csv(StringIO(data), - converters={"score": convert_score, - "days": day_converter}, - na_values=["", None]) + result = parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) assert pd.isna(result["days"][1]) results.append(result) @@ -149,8 +152,9 @@ def test_converter_index_col_bug(all_parsers): parser = all_parsers data = "A;B\n1;2\n3;4" - rs = parser.read_csv(StringIO(data), sep=";", index_col="A", - converters={"A": lambda x: x}) + rs = parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x} + ) xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 6f2878fd2363a6..dc10352bc64601 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -17,8 +17,14 @@ @pytest.fixture def custom_dialect(): dialect_name = "weird" - dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":", - skipinitialspace=False, quotechar="~", quoting=3) + dialect_kwargs = dict( + doublequote=False, + escapechar="~", + delimiter=":", + skipinitialspace=False, + quotechar="~", + quoting=3, + ) return dialect_name, dialect_kwargs @@ -40,7 +46,7 @@ def test_dialect(all_parsers): index2,b,d,f """ exp = parser.read_csv(StringIO(data)) - exp.replace("a", "\"a", inplace=True) + exp.replace("a", '"a', inplace=True) tm.assert_frame_equal(df, exp) @@ -52,10 +58,7 @@ def test_dialect_str(all_parsers): apple:broccoli pear:tomato """ - exp = DataFrame({ - "fruit": ["apple", "pear"], - "vegetable": ["broccoli", "tomato"] - }) + exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) with tm.with_csv_dialect(dialect_name, delimiter=":"): df = parser.read_csv(StringIO(data), dialect=dialect_name) @@ -74,11 +77,12 @@ class InvalidDialect: parser.read_csv(StringIO(data), dialect=InvalidDialect) -@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar", - "skipinitialspace", "quotechar", "quoting"]) +@pytest.mark.parametrize( + "arg", + [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], +) @pytest.mark.parametrize("value", ["dialect", "default", "other"]) -def 
test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, - arg, value): +def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value): # see gh-23761. dialect_name, dialect_kwargs = custom_dialect parser = all_parsers @@ -95,6 +99,7 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, kwds[arg] = dialect_kwargs[arg] elif "value" == "default": # Default --> no warning. from pandas.io.parsers import _parser_defaults + kwds[arg] = _parser_defaults[arg] else: # Non-default + conflict with dialect --> warning. warning_klass = ParserWarning @@ -102,23 +107,30 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, with tm.with_csv_dialect(dialect_name, **dialect_kwargs): with tm.assert_produces_warning(warning_klass): - result = parser.read_csv(StringIO(data), - dialect=dialect_name, **kwds) + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwds) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,warning_klass", [ - (dict(sep=","), None), # sep is default --> sep_override=True - (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False - (dict(delimiter=":"), None), # No conflict - (dict(delimiter=None), None), # Default arguments --> sep_override=True - (dict(delimiter=","), ParserWarning), # Conflict - (dict(delimiter="."), ParserWarning), # Conflict -], ids=["sep-override-true", "sep-override-false", - "delimiter-no-conflict", "delimiter-default-arg", - "delimiter-conflict", "delimiter-conflict2"]) -def test_dialect_conflict_delimiter(all_parsers, custom_dialect, - kwargs, warning_klass): +@pytest.mark.parametrize( + "kwargs,warning_klass", + [ + (dict(sep=","), None), # sep is default --> sep_override=True + (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False + (dict(delimiter=":"), None), # No conflict + (dict(delimiter=None), None), # Default arguments --> sep_override=True + (dict(delimiter=","), ParserWarning), # Conflict + (dict(delimiter="."), ParserWarning), # Conflict + ], + ids=[ + "sep-override-true", + "sep-override-false", + "delimiter-no-conflict", + "delimiter-default-arg", + "delimiter-conflict", + "delimiter-conflict2", + ], +) +def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass): # see gh-23761. 
dialect_name, dialect_kwargs = custom_dialect parser = all_parsers @@ -128,6 +140,5 @@ def test_dialect_conflict_delimiter(all_parsers, custom_dialect, with tm.with_csv_dialect(dialect_name, **dialect_kwargs): with tm.assert_produces_warning(warning_klass): - result = parser.read_csv(StringIO(data), - dialect=dialect_name, **kwargs) + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 738b9d96937507..92c91565e1c23d 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -13,8 +13,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat) +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas.util.testing as tm @@ -24,8 +23,11 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): # see gh-3795, gh-6607 parser = all_parsers - df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"]) + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: df.to_csv(path) @@ -58,13 +60,13 @@ def test_dtype_per_column(all_parsers): 2,3.5 3,4.5 4,5.5""" - expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], - columns=["one", "two"]) + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) expected["one"] = expected["one"].astype(np.float64) expected["two"] = expected["two"].astype(object) - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, - 1: str}) + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -81,13 +83,14 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@pytest.mark.parametrize("dtype", [ - "category", - CategoricalDtype(), - {"a": "category", - "b": "category", - "c": CategoricalDtype()} -]) +@pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) def test_categorical_dtype(all_parsers, dtype): # see gh-10153 parser = all_parsers @@ -95,17 +98,18 @@ def test_categorical_dtype(all_parsers, dtype): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) -@pytest.mark.parametrize("dtype", [ - {"b": "category"}, - {1: "category"} -]) +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 parser = all_parsers @@ -113,9 +117,9 @@ def test_categorical_dtype_single(all_parsers, dtype): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame({"a": [1, 1, 2], - "b": Categorical(["a", "a", "b"]), - "c": [3.4, 3.4, 4.5]}) + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) actual = 
parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) @@ -127,9 +131,13 @@ def test_categorical_dtype_unsorted(all_parsers): 1,b,3.4 1,b,3.4 2,a,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @@ -141,9 +149,13 @@ def test_categorical_dtype_missing(all_parsers): 1,b,3.4 1,nan,3.4 2,a,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @@ -155,10 +167,10 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): data = np.sort([str(i) for i in range(524289)]) expected = DataFrame({"a": Categorical(data, ordered=True)}) - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), - dtype="category") + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True) + np.sort(actual.a.cat.categories), ordered=True + ) tm.assert_frame_equal(actual, expected) @@ -171,8 +183,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): expected = parser.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) - actual = parser.read_csv(pth, header=None, encoding=encoding, - dtype={1: "category"}) + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) tm.assert_frame_equal(actual, expected) @@ -198,13 +209,11 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): 1,b 1,b 2,c""" - expecteds = [DataFrame({"a": [1, 1], - "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], - "b": Categorical(["b", "c"])}, - index=[2, 3])] - actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, - chunksize=2) + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) @@ -219,13 +228,12 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): 1,b 2,c""" cats = ["a", "b", "c"] - expecteds = [DataFrame({"a": [1, 1], - "b": Categorical(["a", "b"], - categories=cats)}), - DataFrame({"a": [1, 2], - "b": Categorical(["b", "c"], - categories=cats)}, - index=[2, 3])] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] + ), + ] dtype = CategoricalDtype(cats) actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) @@ -234,12 +242,10 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): @pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize("categories", [ - ["a", "b", "c"], - ["a", "c", "b"], - ["a", "b", "c", "d"], - ["c", "b", 
"a"], -]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) def test_categorical_category_dtype(all_parsers, categories, ordered): parser = all_parsers data = """a,b @@ -247,15 +253,16 @@ def test_categorical_category_dtype(all_parsers, categories, ordered): 1,b 1,b 2,c""" - expected = DataFrame({ - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], - categories=categories, - ordered=ordered) - }) - - dtype = {"b": CategoricalDtype(categories=categories, - ordered=ordered)} + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @@ -268,10 +275,12 @@ def test_categorical_category_dtype_unsorted(all_parsers): 1,b 2,c""" dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame({ - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]) - }) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) result = parser.read_csv(StringIO(data), dtype={"b": dtype}) tm.assert_frame_equal(result, expected) @@ -321,12 +330,15 @@ def test_categorical_coerces_timedelta(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data", [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", -]) +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) def test_categorical_dtype_coerces_boolean(all_parsers, data): # see gh-20498 parser = all_parsers @@ -342,8 +354,7 @@ def test_categorical_unexpected_categories(all_parsers): dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), - dtype=dtype["b"])}) + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @@ -355,9 +366,10 @@ def test_empty_pass_dtype(all_parsers): data = "one,two" result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "two": np.empty(0, dtype=np.object)}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=np.object)}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) @@ -365,11 +377,13 @@ def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers data = "one,two" - result = parser.read_csv(StringIO(data), index_col=["one"], - dtype={"one": "u1", 1: "f"}) + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) - expected = DataFrame({"two": np.empty(0, dtype="f")}, - index=Index([], dtype="u1", name="one")) + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) tm.assert_frame_equal(result, expected) @@ -377,14 +391,14 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers data = "one,two,three" - result = parser.read_csv(StringIO(data), index_col=["one", "two"], - dtype={"one": "u1", 1: "f8"}) - - exp_idx = 
MultiIndex.from_arrays([np.empty(0, dtype="u1"), - np.empty(0, dtype=np.float64)], - names=["one", "two"]) - expected = DataFrame({"three": np.empty(0, dtype=np.object)}, - index=exp_idx) + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] + ) + expected = DataFrame({"three": np.empty(0, dtype=np.object)}, index=exp_idx) tm.assert_frame_equal(result, expected) @@ -394,9 +408,10 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): data = "one,one" result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) @@ -406,17 +421,20 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): data = "one,one" result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers - expected = concat([Series([], name="one", dtype="u1"), - Series([], name="one.1", dtype="f")], axis=1) + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) expected.index = expected.index.astype(object) data = "one,one" @@ -427,14 +445,15 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): # see gh-9424 parser = all_parsers - expected = concat([Series([], name="one", dtype="u1"), - Series([], name="one.1", dtype="f")], axis=1) + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) expected.index = expected.index.astype(object) - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): data = "" - parser.read_csv(StringIO(data), names=["one", "one"], - dtype={0: "u1", 1: "f"}) + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) def test_raise_on_passed_int_dtype_with_nas(all_parsers): @@ -445,11 +464,13 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): 2001,,11 2001,106380451,67""" - msg = ("Integer column has NA values" if parser.engine == "c" else - "Unable to convert column DOY") + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, - skipinitialspace=True) + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) def test_dtype_with_converters(all_parsers): @@ -460,36 +481,56 @@ def test_dtype_with_converters(all_parsers): # Dtype spec ignored if converted specified. 
with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv(StringIO(data), dtype={"a": "i8"}, - converters={"a": lambda x: str(x)}) + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype,expected", [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), - "b": Categorical([])}, - index=[])), - (dict(a="category", b="category"), - DataFrame({"a": Categorical([]), - "b": Categorical([])}, - index=[])), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]")}, - index=[])), - (dict(a=np.int64, - b=np.int32), DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), - ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), - ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), -]) +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), + ( + dict(a="category", b="category"), + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + dict(a=np.int64, b=np.int32), + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], +) def test_empty_dtype(all_parsers, dtype, expected): # see gh-14712 parser = all_parsers @@ -499,8 +540,9 @@ def test_empty_dtype(all_parsers, dtype, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", list(np.typecodes["AllInteger"] + - np.typecodes["Float"])) +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) def test_numeric_dtype(all_parsers, dtype): data = "0\n1" parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index faf62f98be2e00..ff1dd10bdd0d9e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -46,10 +46,10 @@ def test_no_header_prefix(all_parsers): 11,12,13,14,15 """ result = parser.read_csv(StringIO(data), prefix="Field", header=None) - expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], - columns=["Field0", "Field1", "Field2", - "Field3", "Field4"]) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + columns=["Field0", "Field1", "Field2", "Field3", "Field4"], + ) tm.assert_frame_equal(result, expected) @@ -62,9 +62,11 @@ def test_header_with_index_col(all_parsers): names = ["A", "B", "C"] result = parser.read_csv(StringIO(data), 
names=names) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) @@ -90,8 +92,7 @@ def test_header_not_first_line(all_parsers): def test_header_multi_index(all_parsers): parser = all_parsers - expected = tm.makeCustomDataframe( - 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -106,23 +107,31 @@ def test_header_multi_index(all_parsers): R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) + result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(index_col=["foo", "bar"]), ("index_col must only contain " - "row numbers when specifying " - "a multi-index header")), - (dict(index_col=[0, 1], names=["foo", "bar"]), ("cannot specify names " - "when specifying a " - "multi-index header")), - (dict(index_col=[0, 1], usecols=["foo", "bar"]), ("cannot specify " - "usecols when " - "specifying a " - "multi-index header")), -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + ( + dict(index_col=["foo", "bar"]), + ( + "index_col must only contain " + "row numbers when specifying " + "a multi-index header" + ), + ), + ( + dict(index_col=[0, 1], names=["foo", "bar"]), + ("cannot specify names " "when specifying a " "multi-index header"), + ), + ( + dict(index_col=[0, 1], usecols=["foo", "bar"]), + ("cannot specify " "usecols when " "specifying a " "multi-index header"), + ), + ], +) def test_header_multi_index_invalid(all_parsers, kwargs, msg): data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -146,23 +155,43 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=3, - names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=3, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=3, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=3, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format1(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) data = """,a,a,a,b,c,c ,q,r,s,t,u,v ,,,,,, @@ -173,23 +202,43 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=2, 
- names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=2, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format2(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) data = """,a,a,a,b,c,c ,q,r,s,t,u,v one,1,2,3,4,5,6 @@ -199,23 +248,43 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=2, - names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=2, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format3(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) expected = expected.reset_index(drop=True) data = """a,a,a,b,c,c q,r,s,t,u,v @@ -228,14 +297,15 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), index=Index([1, 7]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["r", "s", "t", - "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=["a", "q"])) + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=["a", "q"], + ), + ) data = """a,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 @@ -247,14 +317,15 @@ def test_header_multi_index_common_format_malformed1(all_parsers): def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - 
[[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), index=Index([1, 7]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["r", "s", "t", - "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[None, "q"])) + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, "q"], + ), + ) data = """,a,a,b,c,c q,r,s,t,u,v @@ -267,14 +338,15 @@ def test_header_multi_index_common_format_malformed2(all_parsers): def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), - index=MultiIndex(levels=[[1, 7], [2, 8]], - codes=[[0, 1], [0, 1]]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["s", "t", "u", "v"]], - codes=[[0, 1, 2, 2], [0, 1, 2, 3]], - names=[None, "q"])) + expected = DataFrame( + np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), + index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["s", "t", "u", "v"]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, "q"], + ), + ) data = """,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 @@ -284,24 +356,19 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) -@pytest.mark.parametrize("data,header", [ - ("1,2,3\n4,5,6", None), - ("foo,bar,baz\n1,2,3\n4,5,6", 0), -]) +@pytest.mark.parametrize( + "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] +) def test_header_names_backward_compat(all_parsers, data, header): # see gh-2539 parser = all_parsers - expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), - names=["a", "b", "c"]) + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), names=["a", "b", "c"], - header=header) + result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(), dict(index_col=False) -]) +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 parser = all_parsers @@ -311,29 +378,31 @@ def test_read_only_header_no_rows(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,names", [ - (dict(), [0, 1, 2, 3, 4]), - (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), - (dict(names=["foo", "bar", "baz", "quux", "panda"]), - ["foo", "bar", "baz", "quux", "panda"]) -]) +@pytest.mark.parametrize( + "kwargs,names", + [ + (dict(), [0, 1, 2, 3, 4]), + (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), + ( + dict(names=["foo", "bar", "baz", "quux", "panda"]), + ["foo", "bar", "baz", "quux", "panda"], + ), + ], +) def test_no_header(all_parsers, kwargs, names): parser = all_parsers data = """1,2,3,4,5 6,7,8,9,10 11,12,13,14,15 """ - expected = DataFrame([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], columns=names) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names + ) result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("header", [ - ["a", "b"], - "string_header" -]) +@pytest.mark.parametrize("header", [["a", "b"], "string_header"]) def test_non_int_header(all_parsers, header): # see 
gh-16338 msg = "header must be integer or list of integers" @@ -354,24 +423,44 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - ("A,A,A,B\none,one,one,two\n0,40,34,0.1", - DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.2"), ("B", "two")]))), - ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1", - DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.1.1"), ("B", "two")]))), - ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", - DataFrame([[0, 40, 34, 0.1, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.1.1"), ("B", "two"), - ("B", "two.1")]))) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ( + "A,A,A,B\none,one,one,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B\none,one,one.1,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", + DataFrame( + [[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [ + ("A", "one"), + ("A", "one.1"), + ("A", "one.1.1"), + ("B", "two"), + ("B", "two.1"), + ] + ), + ), + ), + ], +) def test_mangles_multi_index(all_parsers, data, expected): # see gh-18062 parser = all_parsers @@ -381,10 +470,9 @@ def test_mangles_multi_index(all_parsers, data, expected): @pytest.mark.parametrize("index_col", [None, [0]]) -@pytest.mark.parametrize("columns", [None, - (["", "Unnamed"]), - (["Unnamed", ""]), - (["Unnamed", "NotUnnamed"])]) +@pytest.mark.parametrize( + "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] +) def test_multi_index_unnamed(all_parsers, index_col, columns): # see gh-23687 # @@ -400,18 +488,17 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): if index_col is None: data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" else: - data = (",".join([""] + (columns or ["", ""])) + - "\n,0,1\n0,2,3\n1,4,5\n") + data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" if columns is None: - msg = (r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns") + msg = ( + r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, - index_col=index_col) + parser.read_csv(StringIO(data), header=header, index_col=index_col) else: - result = parser.read_csv(StringIO(data), header=header, - index_col=index_col) + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) template = "Unnamed: {i}_level_0" exp_columns = [] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index de212880d15766..8199d632223c1f 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -21,7 +21,9 @@ def test_index_col_named(all_parsers, with_header): KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - header = 
"ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa + header = ( + "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + ) # noqa if with_header: data = header + no_header @@ -45,14 +47,13 @@ def test_index_col_named2(all_parsers): 9,10,11,12,foo """ - expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10], - "c": [3, 7, 11], "d": [4, 8, 12]}, - index=Index(["hello", "world", "foo"], - name="message")) + expected = DataFrame( + {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]}, + index=Index(["hello", "world", "foo"], name="message"), + ) names = ["a", "b", "c", "d", "message"] - result = parser.read_csv(StringIO(data), names=names, - index_col=["message"]) + result = parser.read_csv(StringIO(data), names=names, index_col=["message"]) tm.assert_frame_equal(result, expected) @@ -61,8 +62,9 @@ def test_index_col_is_true(all_parsers): data = "a,b\n1,2" parser = all_parsers - with pytest.raises(ValueError, match="The value of index_col " - "couldn't be 'True'"): + with pytest.raises( + ValueError, match="The value of index_col " "couldn't be 'True'" + ): parser.read_csv(StringIO(data), index_col=True) @@ -75,28 +77,49 @@ def test_infer_index_col(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("index_col,kwargs", [ - (None, dict(columns=["x", "y", "z"])), - (False, dict(columns=["x", "y", "z"])), - (0, dict(columns=["y", "z"], index=Index([], name="x"))), - (1, dict(columns=["x", "z"], index=Index([], name="y"))), - ("x", dict(columns=["y", "z"], index=Index([], name="x"))), - ("y", dict(columns=["x", "z"], index=Index([], name="y"))), - ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"]))), - (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"]))), - ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"]))), - (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"]))), -]) +@pytest.mark.parametrize( + "index_col,kwargs", + [ + (None, dict(columns=["x", "y", "z"])), + (False, dict(columns=["x", "y", "z"])), + (0, dict(columns=["y", "z"], index=Index([], name="x"))), + (1, dict(columns=["x", "z"], index=Index([], name="y"))), + ("x", dict(columns=["y", "z"], index=Index([], name="x"))), + ("y", dict(columns=["x", "z"], index=Index([], name="y"))), + ( + [0, 1], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + ["x", "y"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + [1, 0], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ( + ["y", "x"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ], +) def test_index_col_empty_data(all_parsers, index_col, kwargs): data = "x,y,z" parser = all_parsers @@ -116,13 +139,16 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("index_names", [ - ["", ""], - ["foo", ""], - ["", "bar"], - ["foo", "bar"], - ["NotReallyUnnamed", "Unnamed: 0"], -]) +@pytest.mark.parametrize( + 
"index_names", + [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ], +) def test_multi_index_naming(all_parsers, index_names): parser = all_parsers @@ -130,9 +156,9 @@ def test_multi_index_naming(all_parsers, index_names): data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) result = parser.read_csv(StringIO(data), index_col=[0, 1]) - expected = DataFrame({"col": [1, 2, 3, 4]}, - index=MultiIndex.from_product([["a", "b"], - ["c", "d"]])) + expected = DataFrame( + {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]]) + ) expected.index.names = [name if name else None for name in index_names] tm.assert_frame_equal(result, expected) @@ -142,8 +168,10 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" result = parser.read_csv(StringIO(data), index_col=[0, 2]) - expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, - index=MultiIndex( - levels=[['a', 'b'], [1, 2, 3, 4]], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]])) + expected = DataFrame( + {"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]] + ), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 6ab761398631b9..d1444210902744 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -20,8 +20,7 @@ def test_basic(all_parsers, kwargs): data = "a,a,b,b,b\n1,2,3,4,5" result = parser.read_csv(StringIO(data), sep=",", **kwargs) - expected = DataFrame([[1, 2, 3, 4, 5]], - columns=["a", "a.1", "b", "b.1", "b.2"]) + expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) tm.assert_frame_equal(result, expected) @@ -30,8 +29,7 @@ def test_basic_names(all_parsers): parser = all_parsers data = "a,b,a\n0,1,2\n3,4,5" - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=["a", "b", "a.1"]) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -42,20 +40,30 @@ def test_basic_names_raise(all_parsers): parser = all_parsers data = "0,1,2\n3,4,5" - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@pytest.mark.parametrize("data,expected", [ - ("a,a,a.1\n1,2,3", - DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), - ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", - DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"])), - ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", - DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"])) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), + ( + "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", + DataFrame( + [[1, 2, 3, 4, 5, 6]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", + DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) def test_thorough_mangle_columns(all_parsers, data, expected): # see gh-17060 parser = all_parsers @@ -64,29 +72,42 @@ def 
test_thorough_mangle_columns(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,names,expected", [ - ("a,b,b\n1,2,3", - ["a.1", "a.1", "a.1.1"], - DataFrame([["a", "b", "b"], ["1", "2", "3"]], - columns=["a.1", "a.1.1", "a.1.1.1"])), - ("a,b,c,d,e,f\n1,2,3,4,5,6", - ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], - DataFrame([["a", "b", "c", "d", "e", "f"], - ["1", "2", "3", "4", "5", "6"]], - columns=["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"])), - ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7", - ["a", "a", "a.3", "a.1", "a.2", "a", "a"], - DataFrame([["a", "b", "c", "d", "e", "f", "g"], - ["1", "2", "3", "4", "5", "6", "7"]], - columns=["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"])), -]) +@pytest.mark.parametrize( + "data,names,expected", + [ + ( + "a,b,b\n1,2,3", + ["a.1", "a.1", "a.1.1"], + DataFrame( + [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] + ), + ), + ( + "a,b,c,d,e,f\n1,2,3,4,5,6", + ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + DataFrame( + [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,b,c,d,e,f,g\n1,2,3,4,5,6,7", + ["a", "a", "a.3", "a.1", "a.2", "a", "a"], + DataFrame( + [ + ["a", "b", "c", "d", "e", "f", "g"], + ["1", "2", "3", "4", "5", "6", "7"], + ], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) def test_thorough_mangle_names(all_parsers, data, names, expected): # see gh-17095 parser = all_parsers - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=names) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index b749a8984284d0..392628ee74ba2f 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -29,9 +29,7 @@ def _construct_dataframe(num_rows): df["foo"] = "foo" df["bar"] = "bar" df["baz"] = "baz" - df["date"] = pd.date_range("20000101 09:00:00", - periods=num_rows, - freq="s") + df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s") df["int"] = np.arange(num_rows, dtype="int64") return df @@ -43,9 +41,9 @@ def test_multi_thread_string_io_read_csv(all_parsers): num_files = 100 bytes_to_df = [ - "\n".join( - ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)] - ).encode() for _ in range(num_files)] + "\n".join(["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]).encode() + for _ in range(num_files) + ] files = [BytesIO(b) for b in bytes_to_df] # Read all files in many threads. @@ -77,6 +75,7 @@ def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): ------- df : DataFrame """ + def reader(arg): """ Create a reader for part of the CSV. 
@@ -98,16 +97,21 @@ def reader(arg): start, nrows = arg if not start: - return parser.read_csv(path, index_col=0, header=0, - nrows=nrows, parse_dates=["date"]) - - return parser.read_csv(path, index_col=0, header=None, - skiprows=int(start) + 1, - nrows=nrows, parse_dates=[9]) + return parser.read_csv( + path, index_col=0, header=0, nrows=nrows, parse_dates=["date"] + ) + + return parser.read_csv( + path, + index_col=0, + header=None, + skiprows=int(start) + 1, + nrows=nrows, + parse_dates=[9], + ) tasks = [ - (num_rows * i // num_tasks, - num_rows // num_tasks) for i in range(num_tasks) + (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) ] pool = ThreadPool(processes=num_tasks) @@ -134,6 +138,7 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe(parser, path, - num_rows, num_tasks) + final_dataframe = _generate_multi_thread_dataframe( + parser, path, num_rows, num_tasks + ) tm.assert_frame_equal(df, final_dataframe) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 2367f71a2557e6..f154d09358dc13 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -21,10 +21,10 @@ def test_string_nas(all_parsers): ,g,h """ result = parser.read_csv(StringIO(data)) - expected = DataFrame([["a", "b", "c"], - ["d", np.nan, "f"], - [np.nan, "g", "h"]], - columns=["A", "B", "C"]) + expected = DataFrame( + [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) @@ -35,46 +35,70 @@ def test_detect_string_na(all_parsers): NA,baz NaN,nan """ - expected = DataFrame([["foo", "bar"], [np.nan, "baz"], - [np.nan, np.nan]], columns=["A", "B"]) + expected = DataFrame( + [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] + ) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_values", [ - ["-999.0", "-999"], - [-999, -999.0], - [-999.0, -999], - ["-999.0"], ["-999"], - [-999.0], [-999] -]) -@pytest.mark.parametrize("data", [ - """A,B +@pytest.mark.parametrize( + "na_values", + [ + ["-999.0", "-999"], + [-999, -999.0], + [-999.0, -999], + ["-999.0"], + ["-999"], + [-999.0], + [-999], + ], +) +@pytest.mark.parametrize( + "data", + [ + """A,B -999,1.2 2,-999 3,4.5 """, - """A,B + """A,B -999,1.200 2,-999.000 3,4.500 -""" -]) +""", + ], +) def test_non_string_na_values(all_parsers, data, na_values): # see gh-3611: with an odd float format, we can't match # the string "999.0" exactly but still need float matching parser = all_parsers - expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], - [3.0, 4.5]], columns=["A", "B"]) + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) result = parser.read_csv(StringIO(data), na_values=na_values) tm.assert_frame_equal(result, expected) def test_default_na_values(all_parsers): - _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A", - "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan", - "-NaN", "-nan", "#N/A N/A", ""} + _NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A", + "N/A", + "n/a", + "NA", + "#NA", + "NULL", + "null", + "NaN", + "nan", + "-NaN", + "-nan", + "#N/A N/A", + "", + } assert _NA_VALUES == com._NA_VALUES parser = all_parsers @@ -109,8 +133,9 @@ def test_custom_na_values(all_parsers, na_values): 
-1.#IND,5,baz 7,8,NaN """ - expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan], - [7, 8, np.nan]], columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) tm.assert_frame_equal(result, expected) @@ -122,9 +147,13 @@ def test_bool_na_values(all_parsers): False,NA,True""" parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object), - "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True]}) + expected = DataFrame( + { + "A": np.array([True, np.nan, False], dtype=object), + "B": np.array([False, True, np.nan], dtype=object), + "C": [True, False, True], + } + ) tm.assert_frame_equal(result, expected) @@ -135,54 +164,95 @@ def test_na_value_dict(all_parsers): foo,bar,NA bar,foo,foo""" parser = all_parsers - df = parser.read_csv(StringIO(data), - na_values={"A": ["foo"], "B": ["bar"]}) - expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"], - "B": [np.nan, "foo", np.nan, "foo"], - "C": [np.nan, "foo", np.nan, "foo"]}) + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + expected = DataFrame( + { + "A": [np.nan, "bar", np.nan, "bar"], + "B": [np.nan, "foo", np.nan, "foo"], + "C": [np.nan, "foo", np.nan, "foo"], + } + ) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("index_col,expected", [ - ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]}, - index=Index([0], name="a"))), - ([0, 2], DataFrame({"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples( - [(0, 1)], names=["a", "c"]))), - (["a", "c"], DataFrame({"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples( - [(0, 1)], names=["a", "c"]))), -]) +@pytest.mark.parametrize( + "index_col,expected", + [ + ( + [0], + DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")), + ), + ( + [0, 2], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ( + ["a", "c"], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ], +) def test_na_value_dict_multi_index(all_parsers, index_col, expected): data = """\ a,b,c,d 0,NA,1,5 """ parser = all_parsers - result = parser.read_csv(StringIO(data), na_values=set(), - index_col=index_col) + result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, "five", - np.nan, "seven"]})), - (dict(na_values={"A": [], "C": []}, keep_default_na=False), - DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"]})), - (dict(na_values=["a"], keep_default_na=False), - DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"]})), - (dict(na_values={"A": [], "C": []}), - DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, - "five", np.nan, "seven"]})), -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + ( + dict(), + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 
3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ( + dict(na_values={"A": [], "C": []}, keep_default_na=False), + DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + dict(na_values=["a"], keep_default_na=False), + DataFrame( + { + "A": [np.nan, "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + dict(na_values={"A": [], "C": []}), + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ], +) def test_na_values_keep_default(all_parsers, kwargs, expected): data = """\ A,B,C @@ -215,10 +285,13 @@ def test_no_na_values_no_keep_default(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), keep_default_na=False) - expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["None", "two", "None", "nan", - "five", "", "seven"]}) + expected = DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["None", "two", "None", "nan", "five", "", "seven"], + } + ) tm.assert_frame_equal(result, expected) @@ -226,8 +299,9 @@ def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" parser = all_parsers - result = parser.read_csv(StringIO(data), na_values={"b": ["2"]}, - keep_default_na=False) + result = parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) expected = DataFrame({"a": [""], "b": [np.nan]}) tm.assert_frame_equal(result, expected) @@ -238,42 +312,47 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # Scalar values shouldn't cause the parsing to crash or fail. 
data = "a,b\n1,2" parser = all_parsers - df = parser.read_csv(StringIO(data), na_values={"b": 2}, - keep_default_na=False) + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) expected = DataFrame({"a": [1], "b": [np.nan]}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("col_zero_na_values", [ - 113125, "113125" -]) -def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, - col_zero_na_values): +@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) +def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 data = """\ 113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 729639,"qwer","",asdfkj,466.681,,252.373 """ parser = all_parsers - expected = DataFrame({0: [np.nan, 729639.0], - 1: [np.nan, "qwer"], - 2: ["/blaha", np.nan], - 3: ["kjsdkj", "asdfkj"], - 4: [412.166, 466.681], - 5: ["225.874", ""], - 6: [np.nan, 252.373]}) - - result = parser.read_csv(StringIO(data), header=None, - keep_default_na=False, - na_values={2: "", 6: "214.008", - 1: "blah", 0: col_zero_na_values}) + expected = DataFrame( + { + 0: [np.nan, 729639.0], + 1: [np.nan, "qwer"], + 2: ["/blaha", np.nan], + 3: ["kjsdkj", "asdfkj"], + 4: [412.166, 466.681], + 5: ["225.874", ""], + 6: [np.nan, 252.373], + } + ) + + result = parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_filter,row_data", [ - (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), - (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), -]) +@pytest.mark.parametrize( + "na_filter,row_data", + [ + (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), + (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), + ], +) def test_na_values_na_filter_override(all_parsers, na_filter, row_data): data = """\ A,B @@ -282,8 +361,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): 3,C """ parser = all_parsers - result = parser.read_csv(StringIO(data), na_values=["B"], - na_filter=na_filter) + result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -297,18 +375,32 @@ def test_na_trailing_columns(all_parsers): # Trailing columns should be all NaN. 
result = parser.read_csv(StringIO(data)) - expected = DataFrame([ - ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], - ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], - ], columns=["Date", "Currency", "Symbol", "Type", - "Units", "UnitPrice", "Cost", "Tax"]) + expected = DataFrame( + [ + ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], + ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], + ], + columns=[ + "Date", + "Currency", + "Symbol", + "Type", + "Units", + "UnitPrice", + "Cost", + "Tax", + ], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_values,row_data", [ - (1, [[np.nan, 2.0], [2.0, np.nan]]), - ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), -]) +@pytest.mark.parametrize( + "na_values,row_data", + [ + (1, [[np.nan, 2.0], [2.0, np.nan]]), + ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), + ], +) def test_na_values_scalar(all_parsers, na_values, row_data): # see gh-12224 parser = all_parsers @@ -346,14 +438,18 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - (str(2**63) + "\n" + str(2**63 + 1), - dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])), - (str(2**63) + ",1" + "\n,2", - dict(), DataFrame([[str(2**63), 1], ['', 2]])), - (str(2**63) + "\n1", - dict(na_values=[2**63]), DataFrame([np.nan, 1])), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + str(2 ** 63) + "\n" + str(2 ** 63 + 1), + dict(na_values=[2 ** 63]), + DataFrame([str(2 ** 63), str(2 ** 63 + 1)]), + ), + (str(2 ** 63) + ",1" + "\n,2", dict(), DataFrame([[str(2 ** 63), 1], ["", 2]])), + (str(2 ** 63) + "\n1", dict(na_values=[2 ** 63]), DataFrame([np.nan, 1])), + ], +) def test_na_values_uint64(all_parsers, data, kwargs, expected): # see gh-14983 parser = all_parsers @@ -367,15 +463,13 @@ def test_empty_na_values_no_default_with_index(all_parsers): parser = all_parsers expected = DataFrame({"1": [2]}, index=Index(["b"], name="a")) - result = parser.read_csv(StringIO(data), index_col=0, - keep_default_na=False) + result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_filter,index_data", [ - (False, ["", "5"]), - (True, [np.nan, 5.0]), -]) +@pytest.mark.parametrize( + "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] +) def test_no_na_filter_on_index(all_parsers, na_filter, index_data): # see gh-5239 # @@ -383,10 +477,8 @@ def test_no_na_filter_on_index(all_parsers, na_filter, index_data): parser = all_parsers data = "a,b,c\n1,,3\n4,5,6" - expected = DataFrame({"a": [1, 4], "c": [3, 6]}, - index=Index(index_data, name="b")) - result = parser.read_csv(StringIO(data), index_col=[1], - na_filter=na_filter) + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) + result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -396,10 +488,10 @@ def test_inf_na_values_with_int_index(all_parsers): data = "idx,col1,col2\n1,3,4\n2,inf,-inf" # Don't fail with OverflowError with inf's and integer index column. 
- out = parser.read_csv(StringIO(data), index_col=[0], - na_values=["inf", "-inf"]) - expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, - index=Index([1, 2], name="idx")) + out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"]) + expected = DataFrame( + {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx") + ) tm.assert_frame_equal(out, expected) @@ -412,27 +504,35 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # na_filter=True --> missing value becomes NaN. # na_filter=False --> missing value remains empty string. empty = np.nan if na_filter else "" - expected = DataFrame({"a": ["1", "4"], - "b": [empty, "5"], - "c": ["3", "6"]}) + expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]}) result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data, na_values", [ - ("false,1\n,1\ntrue", None), - ("false,1\nnull,1\ntrue", None), - ("false,1\nnan,1\ntrue", None), - ("false,1\nfoo,1\ntrue", 'foo'), - ("false,1\nfoo,1\ntrue", ['foo']), - ("false,1\nfoo,1\ntrue", {'a': 'foo'}), -]) +@pytest.mark.parametrize( + "data, na_values", + [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", "foo"), + ("false,1\nfoo,1\ntrue", ["foo"]), + ("false,1\nfoo,1\ntrue", {"a": "foo"}), + ], +) def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers - msg = ("(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - "bool for object dtyped data in column 0)") + msg = ( + "(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0)" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=None, names=['a', 'b'], - dtype={'a': 'bool'}, na_values=na_values) + parser.read_csv( + StringIO(data), + header=None, + names=["a", "b"], + dtype={"a": "bool"}, + na_values=na_values, + ) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 20bd5b74f1784d..dbe721b10a3ce9 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -17,57 +17,56 @@ @pytest.mark.network -@pytest.mark.parametrize("compress_type, extension", [ - ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'), ('xz', '.xz')]) -@pytest.mark.parametrize('mode', ['explicit', 'infer']) -@pytest.mark.parametrize('engine', ['python', 'c']) -def test_compressed_urls(salaries_table, compress_type, extension, mode, - engine): - check_compressed_urls(salaries_table, compress_type, extension, mode, - engine) +@pytest.mark.parametrize( + "compress_type, extension", + [("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")], +) +@pytest.mark.parametrize("mode", ["explicit", "infer"]) +@pytest.mark.parametrize("engine", ["python", "c"]) +def test_compressed_urls(salaries_table, compress_type, extension, mode, engine): + check_compressed_urls(salaries_table, compress_type, extension, mode, engine) @tm.network -def check_compressed_urls(salaries_table, compression, extension, mode, - engine): +def check_compressed_urls(salaries_table, compression, extension, mode, engine): # test reading compressed urls with various engines and # extension inference - base_url = ('https://github.com/pandas-dev/pandas/raw/master/' - 
'pandas/tests/io/parser/data/salaries.csv') + base_url = ( + "https://github.com/pandas-dev/pandas/raw/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) url = base_url + extension - if mode != 'explicit': + if mode != "explicit": compression = mode - url_table = read_csv(url, sep='\t', compression=compression, engine=engine) + url_table = read_csv(url, sep="\t", compression=compression, engine=engine) tm.assert_frame_equal(url_table, salaries_table) @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" - return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + return read_csv(datapath("io", "parser", "data", "tips.csv")) @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: - def test_parse_public_s3_bucket(self, tips_df): - pytest.importorskip('s3fs') + pytest.importorskip("s3fs") # more of an integration test due to the not-public contents portion # can probably mock this though. - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv('s3://cant_get_it/tips.csv') + df = read_csv("s3://cant_get_it/tips.csv") assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) @@ -75,22 +74,21 @@ def test_parse_public_s3_bucket(self, tips_df): def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL - df = read_csv('s3n://pandas-test/tips.csv', nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL - df = read_csv('s3a://pandas-test/tips.csv', nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3_bucket_nrows(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) @@ -98,9 +96,10 @@ def test_parse_public_s3_bucket_nrows(self, tips_df): def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df_reader = read_csv( + "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: # Read a couple of chunks and make sure we see them @@ -108,70 +107,75 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] + true_df = tips_df.iloc[chunksize * i_chunk 
: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp, - engine='python') + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df_reader = read_csv( + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + engine="python", + ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: # Read a couple of chunks and make sure we see them properly. df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] + true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_python(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) def test_infer_s3_compression(self, tips_df): - for ext in ['', '.gz', '.bz2']: - df = read_csv('s3://pandas-test/tips.csv' + ext, - engine='python', compression='infer') + for ext in ["", ".gz", ".bz2"]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) def test_parse_public_s3_bucket_nrows_python(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - nrows=10, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + engine="python", + nrows=10, + compression=comp, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): - read_csv('s3://nyqpug/asdf.csv') + read_csv("s3://nyqpug/asdf.csv") # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. 
with pytest.raises(IOError): - read_csv('s3://cant_get_it/') + read_csv("s3://cant_get_it/") - def test_read_csv_handles_boto_s3_object(self, - s3_resource, - tips_file): + def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 s3_object = s3_resource.meta.client.get_object( - Bucket='pandas-test', - Key='tips.csv') + Bucket="pandas-test", Key="tips.csv" + ) - result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8') + result = read_csv(BytesIO(s3_object["Body"].read()), encoding="utf8") assert isinstance(result, DataFrame) assert not result.empty @@ -180,24 +184,22 @@ def test_read_csv_handles_boto_s3_object(self, def test_read_csv_chunked_download(self, s3_resource, caplog): # 8 MB, S3FS usees 5MB chunks - df = DataFrame(np.random.randn(100000, 4), columns=list('abcd')) + df = DataFrame(np.random.randn(100000, 4), columns=list("abcd")) buf = BytesIO() str_buf = StringIO() df.to_csv(str_buf) - buf = BytesIO(str_buf.getvalue().encode('utf-8')) + buf = BytesIO(str_buf.getvalue().encode("utf-8")) - s3_resource.Bucket("pandas-test").put_object( - Key="large-file.csv", - Body=buf) + s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf) - with caplog.at_level(logging.DEBUG, logger='s3fs.core'): + with caplog.at_level(logging.DEBUG, logger="s3fs.core"): read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) - assert ((0, 5505024) in {x.args[-2:] for x in caplog.records}) + assert (0, 5505024) in {x.args[-2:] for x in caplog.records} def test_read_s3_with_hash_in_key(self, tips_df): # GH 25945 - result = read_csv('s3://pandas-test/tips#1.csv') + result = read_csv("s3://pandas-test/tips#1.csv") tm.assert_frame_equal(tips_df, result) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 25589a1682f7a6..99e4e5c022ecb4 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -42,11 +42,17 @@ def test_separator_date_conflict(all_parsers): # date parsing do not conflict. 
parser = all_parsers data = "06-02-2013;13:00;1-000.215" - expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], - columns=["Date", 2]) + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] + ) - df = parser.read_csv(StringIO(data), sep=";", thousands="-", - parse_dates={"Date": [0, 1]}, header=None) + df = parser.read_csv( + StringIO(data), + sep=";", + thousands="-", + parse_dates={"Date": [0, 1]}, + header=None, + ) tm.assert_frame_equal(df, expected) @@ -77,32 +83,109 @@ def date_parser(*date_cols): """ return parsing.try_parse_dates(parsing._concat_date_cols(date_cols)) - result = parser.read_csv(StringIO(data), header=None, - date_parser=date_parser, prefix="X", - parse_dates={"actual": [1, 2], - "nominal": [1, 3]}, - keep_date_col=keep_date_col) - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", "19990127", " 19:00:00", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", "19990127", " 20:00:00", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", "19990127", " 21:00:00", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", "19990127", " 21:00:00", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", "19990127", " 22:00:00", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", "19990127", " 23:00:00", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["actual", "nominal", "X0", "X1", "X2", - "X3", "X4", "X5", "X6", "X7", "X8"]) + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=date_parser, + prefix="X", + parse_dates={"actual": [1, 2], "nominal": [1, 3]}, + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "actual", + "nominal", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) @@ -139,30 +222,108 @@ def test_multiple_date_col(all_parsers, keep_date_col): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, - prefix="X", parse_dates=[[1, 2], [1, 3]], - keep_date_col=keep_date_col) - expected = 
DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", "19990127", " 19:00:00", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", "19990127", " 20:00:00", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", "19990127", " 21:00:00", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", "19990127", " 21:00:00", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", "19990127", " 22:00:00", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", "19990127", " 23:00:00", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2", - "X3", "X4", "X5", "X6", "X7", "X8"]) + result = parser.read_csv( + StringIO(data), + header=None, + prefix="X", + parse_dates=[[1, 2], [1, 3]], + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "X1_X2", + "X1_X3", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) @@ -181,49 +342,84 @@ def test_date_col_as_index_col(all_parsers): KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 """ parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, prefix="X", - parse_dates=[1], index_col=1) - - index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 22, 0)], name="X1") - expected = DataFrame([ - ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], - ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], - ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], - ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], - ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], - ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index) + result = parser.read_csv( + StringIO(data), header=None, prefix="X", parse_dates=[1], index_col=1 + ) + + index = Index( + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 22, 0), + ], + name="X1", + ) + expected = DataFrame( + [ + ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], + 
["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], + ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], + ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], + ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], + ], + columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], + index=index, + ) tm.assert_frame_equal(result, expected) def test_multiple_date_cols_int_cast(all_parsers): - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) parse_dates = {"actual": [1, 2], "nominal": [1, 3]} parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, - date_parser=conv.parse_date_time, - parse_dates=parse_dates, prefix="X") - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", 0.01], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", -0.99], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", -0.59], - ], columns=["actual", "nominal", "X0", "X4"]) + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=conv.parse_date_time, + parse_dates=parse_dates, + prefix="X", + ) + expected = DataFrame( + [ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", "X0", "X4"], + ) # Python can sometimes be flaky about how # the aggregated columns are entered, so @@ -237,14 +433,32 @@ def test_multiple_date_col_timestamp_parse(all_parsers): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]], - header=None, date_parser=Timestamp) - expected = DataFrame([ - [Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, 1, "E", 0, np.nan, 1306.25], - [Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, 8, "E", 0, np.nan, 1306.25] - ], columns=["0_1", 2, 3, 4, 5, 6, 7]) + result = parser.read_csv( + StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp + ) + expected = DataFrame( + [ + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 1, + "E", + 0, + np.nan, + 1306.25, + ], + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 8, + "E", + 0, + np.nan, + 1306.25, + ], + ], + columns=["0_1", 2, 3, 4, 5, 6, 7], + ) tm.assert_frame_equal(result, expected) @@ -260,35 +474,104 @@ 
def test_multiple_date_cols_with_header(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "ActualTime", "TDew", - "TAir", "Windspeed", "Precip", "WindDir"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,parse_dates,msg", [ - ("""\ +@pytest.mark.parametrize( + "data,parse_dates,msg", + [ + ( + """\ date_NominalTime,date,NominalTime KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already " - "in dict date_NominalTime")), - ("""\ +KORD2,19990127, 20:00:00""", + [[1, 2]], + ("New date column already " "in dict date_NominalTime"), + ), + ( + """\ ID,date,nominalTime KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict") -]) +KORD,19990127, 20:00:00""", + dict(ID=[1, 2]), + "Date column ID already in dict", + ), + ], +) def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser = all_parsers @@ -299,30 +582,61 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers - data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows," - "accountid,userid,contactid,level,silo,method\n" - "1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n") + data = ( + "posix_timestamp,elapsed,sys,user,queries,query_time,rows," + "accountid,userid,contactid,level,silo,method\n" + "1343103150,0.062353,0,4,6,0.01690,3," + "12345,1,-1,3,invoice_InvoiceResource,search\n" + ) result = parser.read_csv( - StringIO(data), index_col=0, parse_dates=[0], - date_parser=lambda x: datetime.utcfromtimestamp(int(x))) - expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1, - 3, "invoice_InvoiceResource", "search"]], - columns=["elapsed", "sys", "user", "queries", - "query_time", "rows", "accountid", - "userid", "contactid", "level", - "silo", "method"], - index=Index([Timestamp("2012-07-24 04:12:30")], - name="posix_timestamp")) + StringIO(data), + index_col=0, + parse_dates=[0], + 
date_parser=lambda x: datetime.utcfromtimestamp(int(x)), + ) + expected = DataFrame( + [ + [ + 0.062353, + 0, + 4, + 6, + 0.01690, + 3, + 12345, + 1, + -1, + 3, + "invoice_InvoiceResource", + "search", + ] + ], + columns=[ + "elapsed", + "sys", + "user", + "queries", + "query_time", + "rows", + "accountid", + "userid", + "contactid", + "level", + "silo", + "method", + ], + index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), + ) tm.assert_frame_equal(result, expected) def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers - df = DataFrame(dict({"A": np.arange(10, dtype="float64"), - "B": pd.Timestamp("20010101")})) + df = DataFrame( + dict({"A": np.arange(10, dtype="float64"), "B": pd.Timestamp("20010101")}) + ) df.iloc[3:6, :] = np.nan with tm.ensure_clean("__nat_parse_.csv") as path: @@ -340,8 +654,8 @@ def test_csv_custom_parser(all_parsers): """ parser = all_parsers result = parser.read_csv( - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d")) + StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d") + ) expected = parser.read_csv(StringIO(data), parse_dates=True) tm.assert_frame_equal(result, expected) @@ -355,8 +669,7 @@ def test_parse_dates_implicit_first_col(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), parse_dates=True) - expected = parser.read_csv(StringIO(data), index_col=0, - parse_dates=True) + expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) tm.assert_frame_equal(result, expected) @@ -367,23 +680,20 @@ def test_parse_dates_string(all_parsers): 20090103,c,4,5 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col="date", - parse_dates=["date"]) + result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) index = date_range("1/1/2009", periods=3) index.name = "date" - expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], - "C": [2, 4, 5]}, index=index) + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index + ) tm.assert_frame_equal(result, expected) # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [ - [["date", "time"]], - [[0, 1]] -]) +@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): data = """date,time,B,C 090131,0010,1,2 @@ -391,12 +701,16 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): 090331,0830,5,6 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0, - parse_dates=parse_dates) - index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name="date_time") + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + index = DatetimeIndex( + [ + datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0), + ], + dtype=object, + name="date_time", + ) expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) tm.assert_frame_equal(result, expected) @@ -406,12 +720,14 @@ def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" parser = all_parsers - expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1], - "c": [datetime(2010, 2, 15)]}) + expected = DataFrame( + 
{"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} + ) expected = expected.set_index(["a", "b"]) - result = parser.read_csv(StringIO(data), index_col=[0, 1], - parse_dates=parse_dates, dayfirst=True) + result = parser.read_csv( + StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True + ) tm.assert_frame_equal(result, expected) @@ -429,27 +745,38 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers - index = MultiIndex.from_product([ - (datetime(2009, 1, 1), datetime(2009, 1, 2), - datetime(2009, 1, 3)), ("one", "two", "three")], - names=["index1", "index2"]) + index = MultiIndex.from_product( + [ + (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + ("one", "two", "three"), + ], + names=["index1", "index2"], + ) # Out of order. if index_col == [1, 0]: index = index.swaplevel(0, 1) - expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5], - ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], - ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]], - columns=["A", "B", "C"], index=index) - result = parser.read_csv(StringIO(data), index_col=index_col, - parse_dates=True) + expected = DataFrame( + [ + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ], + columns=["A", "B", "C"], + index=index, + ) + result = parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(dayfirst=True), dict(day_first=True) -]) +@pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers data = """foo,bar,baz @@ -458,22 +785,37 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): 02/02/2010,1,2 """ if "dayfirst" in kwargs: - df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, index_col=0, parse_dates=True, - na_values=["NA"]) - exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), - datetime(2010, 2, 2)], name="time") - expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, columns=["Q", "NTU"]) + df = parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + header=0, + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + exp_index = Index( + [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], + name="time", + ) + expected = DataFrame( + {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, + index=exp_index, + columns=["Q", "NTU"], + ) tm.assert_frame_equal(df, expected) else: msg = "got an unexpected keyword argument 'day_first'" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], index_col=0, parse_dates=True, - na_values=["NA"]) + parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + skiprows=[0], + index_col=0, + parse_dates=True, + na_values=["NA"], + ) def test_parse_tz_aware(all_parsers): @@ -481,19 +823,18 @@ def test_parse_tz_aware(all_parsers): parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" - result = parser.read_csv(StringIO(data), index_col=0, - parse_dates=True) - expected = DataFrame({"x": [0.5]}, index=Index([Timestamp( - "2012-06-13 01:39:00+00:00")], 
name="Date")) + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + expected = DataFrame( + {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") + ) tm.assert_frame_equal(result, expected) assert result.index.tz is pytz.utc -@pytest.mark.parametrize("parse_dates,index_col", [ - ({"nominal": [1, 2]}, "nominal"), - ({"nominal": [1, 2]}, 0), - ([[1, 2]], 0), -]) +@pytest.mark.parametrize( + "parse_dates,index_col", + [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], +) def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): parser = all_parsers data = """ @@ -505,28 +846,88 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "ActualTime", "TDew", - "TAir", "Windspeed", "Precip", "WindDir"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD1", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD2", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD3", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD4", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD5", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD6", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) expected = expected.set_index("nominal") if not isinstance(parse_dates, dict): expected.index.name = "date_NominalTime" - result = parser.read_csv(StringIO(data), parse_dates=parse_dates, - index_col=index_col) + result = parser.read_csv( + StringIO(data), parse_dates=parse_dates, index_col=index_col + ) tm.assert_frame_equal(result, expected) @@ -542,24 +943,79 @@ def test_multiple_date_cols_chunked(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 
0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], + ) expected = expected.set_index("nominal") - reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}, - index_col="nominal", chunksize=2) + reader = parser.read_csv( + StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal", + chunksize=2, + ) chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) @@ -579,12 +1035,14 @@ def test_multiple_date_col_named_index_compat(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - with_indices = parser.read_csv(StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal") - with_names = parser.read_csv(StringIO(data), index_col="nominal", - parse_dates={"nominal": [ - "date", "nominalTime"]}) + with_indices = parser.read_csv( + StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" + ) + with_names = parser.read_csv( + StringIO(data), + index_col="nominal", + parse_dates={"nominal": ["date", "nominalTime"]}, + ) tm.assert_frame_equal(with_indices, with_names) @@ -599,10 +1057,10 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"], - parse_dates={"nominal": [1, 2]}) - expected = parser.read_csv(StringIO(data), - parse_dates={"nominal": [1, 2]}) + result = parser.read_csv( + StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + ) + expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = expected.set_index(["nominal", "ID"]) tm.assert_frame_equal(result, expected) @@ -612,8 +1070,10 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ("Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) data = """A,B,C 1,2,2003-11-1""" @@ -621,13 +1081,13 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): parser.read_csv(StringIO(data), parse_dates="C", **kwargs) -@pytest.mark.parametrize("parse_dates", [ - (1,), np.array([4, 5]), {1, 3, 3} -]) +@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ("Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) data = """A,B,C 1,2,2003-11-1""" @@ -636,44 +1096,66 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): 
@pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", [ - 'nan', '0', '']) +@pytest.mark.parametrize("value", ["nan", "0", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers - s = StringIO(('%s,\n' % value) * 50000) + s = StringIO(("%s,\n" % value) * 50000) - parser.read_csv(s, - header=None, names=['foo', 'bar'], parse_dates=['foo'], - infer_datetime_format=False, - cache_dates=cache_dates) + parser.read_csv( + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + infer_datetime_format=False, + cache_dates=cache_dates, + ) def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers data = "Date,test\n2012-01-01,1\n,2" - result = parser.read_csv(StringIO(data), parse_dates=["Date"], - na_filter=False) + result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) - expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]], - columns=["Date", "test"]) + expected = DataFrame( + [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("a\n04.15.2016", dict(parse_dates=["a"]), - DataFrame([datetime(2016, 4, 15)], columns=["a"])), - ("a\n04.15.2016", dict(parse_dates=True, index_col=0), - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))), - ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]), - DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]], - columns=["a", "b"])), - ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]), - DataFrame(index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "a\n04.15.2016", + dict(parse_dates=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"]), + ), + ( + "a\n04.15.2016", + dict(parse_dates=True, index_col=0), + DataFrame(index=DatetimeIndex(["2016-04-15"], name="a")), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=["a", "b"]), + DataFrame( + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + ), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=True, index_col=[0, 1]), + DataFrame( + index=MultiIndex.from_tuples( + [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + ) + ), + ), + ], +) def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): # see gh-14066 parser = all_parsers @@ -690,51 +1172,96 @@ def test_parse_date_time_multi_level_column_name(all_parsers): 2001-01-06, 00:00:00, 1.0, 11. 
""" parser = all_parsers - result = parser.read_csv(StringIO(data), header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=conv.parse_date_time) - - expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.], - [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]] - expected = DataFrame(expected_data, - columns=["date_time", ("A", "a"), ("B", "b")]) + result = parser.read_csv( + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=conv.parse_date_time, + ) + + expected_data = [ + [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], + ] + expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""\ +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """\ date,time,a,b 2001-01-05, 10:00:00, 0.0, 10. 2001-01-05, 00:00:00, 1., 11. -""", dict(header=0, parse_dates={"date_time": [0, 1]}), - DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]], - columns=["date_time", "a", "b"])), - (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900"), - dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), - DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", 0.01], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", -0.99], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", -0.59]], columns=["actual", "nominal", 0, 4])), -]) +""", + dict(header=0, parse_dates={"date_time": [0, 1]}), + DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], + [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], + ], + columns=["date_time", "a", "b"], + ), + ), + ( + ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ), + dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), + DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + 0.81, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + 0.01, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", 0, 4], + ), + ), + ], +) def test_parse_date_time(all_parsers, data, kwargs, expected): parser = all_parsers - result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, - **kwargs) + result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, **kwargs) # Python can 
sometimes be flaky about how # the aggregated columns are entered, so @@ -745,14 +1272,18 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): def test_parse_date_fields(all_parsers): parser = all_parsers - data = ("year,month,day,a\n2001,01,10,10.\n" - "2001,02,1,11.") - result = parser.read_csv(StringIO(data), header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=conv.parse_date_fields) - - expected = DataFrame([[datetime(2001, 1, 10), 10.], - [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"]) + data = "year,month,day,a\n2001,01,10,10.\n" "2001,02,1,11." + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=conv.parse_date_fields, + ) + + expected = DataFrame( + [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], + columns=["ymd", "a"], + ) tm.assert_frame_equal(result, expected) @@ -763,12 +1294,19 @@ def test_parse_date_all_fields(all_parsers): 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. """ - result = parser.read_csv(StringIO(data), header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) - expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]], - columns=["ymdHMS", "a", "b"]) + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) tm.assert_frame_equal(result, expected) @@ -779,14 +1317,19 @@ def test_datetime_fractional_seconds(all_parsers): 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. """ - result = parser.read_csv(StringIO(data), header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) - expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0, - microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, - microsecond=500000), 1.0, 11.0]], - columns=["ymdHMS", "a", "b"]) + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) tm.assert_frame_equal(result, expected) @@ -794,14 +1337,16 @@ def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
- result = parser.read_csv(StringIO(data), header=0, - parse_dates={"ym": [0, 1]}, - date_parser=lambda y, m: date(year=int(y), - month=int(m), - day=1)) - expected = DataFrame([[date(2001, 1, 1), 10, 10.], - [date(2001, 2, 1), 1, 11.]], - columns=["ym", "day", "a"]) + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=lambda y, m: date(year=int(y), month=int(m), day=1), + ) + expected = DataFrame( + [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], + columns=["ym", "day", "a"], + ) tm.assert_frame_equal(result, expected) @@ -816,19 +1361,25 @@ def test_date_parser_resolution_if_not_ns(all_parsers): """ def date_parser(dt, time): - return np_array_datetime64_compat(dt + "T" + time + "Z", - dtype="datetime64[s]") - - result = parser.read_csv(StringIO(data), date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"]) - - datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3, - dtype="datetime64[s]") - expected = DataFrame(data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_tuples( - [(datetimes[0], 126), (datetimes[1], 23), - (datetimes[2], 13)], names=["datetime", "prn"])) + return np_array_datetime64_compat(dt + "T" + time + "Z", dtype="datetime64[s]") + + result = parser.read_csv( + StringIO(data), + date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"], + ) + + datetimes = np_array_datetime64_compat( + ["2013-11-03T19:00:00Z"] * 3, dtype="datetime64[s]" + ) + expected = DataFrame( + data={"rxstatus": ["00E80000"] * 3}, + index=MultiIndex.from_tuples( + [(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)], + names=["datetime", "prn"], + ), + ) tm.assert_frame_equal(result, expected) @@ -838,22 +1389,24 @@ def test_parse_date_column_with_empty_string(all_parsers): data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) - expected_data = [[7, "10/18/2006"], - [7, "10/18/2008"], - [621, " "]] + expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] expected = DataFrame(expected_data, columns=["case", "opdate"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - ("a\n135217135789158401\n1352171357E+5", - DataFrame({"a": [135217135789158401, - 135217135700000]}, dtype="float64")), - ("a\n99999999999\n123456789012345\n1234E+0", - DataFrame({"a": [99999999999, - 123456789012345, - 1234]}, dtype="float64")) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ( + "a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), + ), + ( + "a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), + ), + ], +) @pytest.mark.parametrize("parse_dates", [True, False]) def test_parse_date_float(all_parsers, data, expected, parse_dates): # see gh-2697 @@ -877,47 +1430,48 @@ def test_parse_timezone(all_parsers): 2018-01-04 09:05:00+09:00,23400""" result = parser.read_csv(StringIO(data), parse_dates=["dt"]) - dti = pd.date_range(start="2018-01-04 09:01:00", - end="2018-01-04 09:05:00", freq="1min", - tz=pytz.FixedOffset(540)) + dti = pd.date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=pytz.FixedOffset(540), + ) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) 
-@pytest.mark.parametrize("date_string", [ - "32/32/2019", - "02/30/2019", - "13/13/2019", - "13/2019", - "a3/11/2018", - "10/11/2o17" -]) +@pytest.mark.parametrize( + "date_string", + ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], +) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv(StringIO(date_string), - header=None, parse_dates=[0]) + result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("date_string,dayfirst,expected", [ - # %d/%m/%Y; month > 12 thus replacement - ("13/02/2019", False, datetime(2019, 2, 13)), - ("13/02/2019", True, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement - ("02/13/2019", False, datetime(2019, 2, 13)), - ("02/13/2019", True, datetime(2019, 2, 13)), - # %d/%m/%Y; dayfirst==True thus replacement - ("04/02/2019", True, datetime(2019, 2, 4)) -]) -def test_parse_delimited_date_swap(all_parsers, date_string, - dayfirst, expected): +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 thus replacement + ("13/02/2019", False, datetime(2019, 2, 13)), + ("13/02/2019", True, datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 thus there will be no replacement + ("02/13/2019", False, datetime(2019, 2, 13)), + ("02/13/2019", True, datetime(2019, 2, 13)), + # %d/%m/%Y; dayfirst==True thus replacement + ("04/02/2019", True, datetime(2019, 2, 4)), + ], +) +def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - result = parser.read_csv(StringIO(date_string), header=None, - dayfirst=dayfirst, parse_dates=[0]) + result = parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) tm.assert_frame_equal(result, expected) @@ -935,31 +1489,30 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) @pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize("date_format", [ - "%d %m %Y", - "%m %d %Y", - "%m %Y", - "%Y %m %d", - "%y %m %d", - "%Y%m%d", - "%y%m%d", -]) -def test_hypothesis_delimited_date(date_format, dayfirst, - delimiter, test_datetime): +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_datetime): if date_format == "%m %Y" and delimiter == ".": - pytest.skip("parse_datetime_string cannot reliably tell whether \ - e.g. %m.%Y is a float or a date, thus we skip it") + pytest.skip( + "parse_datetime_string cannot reliably tell whether \ + e.g. 
%m.%Y is a float or a date, thus we skip it" + ) result, expected = None, None except_in_dateutil, except_out_dateutil = None, None - date_string = test_datetime.strftime(date_format.replace(' ', delimiter)) + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) except_out_dateutil, result = _helper_hypothesis_delimited_date( - parse_datetime_string, date_string, - dayfirst=dayfirst) + parse_datetime_string, date_string, dayfirst=dayfirst + ) except_in_dateutil, expected = _helper_hypothesis_delimited_date( - du_parse, date_string, + du_parse, + date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=False) + dayfirst=dayfirst, + yearfirst=False, + ) assert except_out_dateutil == except_in_dateutil assert result == expected diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 51c44c08cb80cf..5b381e43e3e199 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -49,10 +49,7 @@ def test_invalid_skipfooter_negative(python_parser_only): parser.read_csv(StringIO(data), skipfooter=-1) -@pytest.mark.parametrize("kwargs", [ - dict(sep=None), - dict(delimiter="|") -]) +@pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")]) def test_sniff_delimiter(python_parser_only, kwargs): data = """index|A|B|C foo|1|2|3 @@ -61,9 +58,11 @@ def test_sniff_delimiter(python_parser_only, kwargs): """ parser = python_parser_only result = parser.read_csv(StringIO(data), index_col=0, **kwargs) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index")) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) tm.assert_frame_equal(result, expected) @@ -80,25 +79,26 @@ def test_sniff_delimiter_encoding(python_parser_only, encoding): if encoding is not None: from io import TextIOWrapper + data = data.encode(encoding) data = BytesIO(data) data = TextIOWrapper(data, encoding=encoding) else: data = StringIO(data) - result = parser.read_csv(data, index_col=0, sep=None, - skiprows=2, encoding=encoding) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index")) + result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) tm.assert_frame_equal(result, expected) def test_single_line(python_parser_only): # see gh-6607: sniff separator parser = python_parser_only - result = parser.read_csv(StringIO("1,2"), names=["a", "b"], - header=None, sep=None) + result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) @@ -117,15 +117,13 @@ def test_skipfooter(python_parser_only, kwargs): parser = python_parser_only result = parser.read_csv(StringIO(data), **kwargs) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"]) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("compression,klass", [ - ("gzip", "GzipFile"), - ("bz2", "BZ2File"), -]) +@pytest.mark.parametrize( + "compression,klass", [("gzip", "GzipFile"), ("bz2", 
"BZ2File")] +) def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): # see gh-6607 parser = python_parser_only @@ -144,8 +142,7 @@ def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): tmp.write(data) tmp.close() - result = parser.read_csv(path, sep="::", - compression=compression) + result = parser.read_csv(path, sep="::", compression=compression) tm.assert_frame_equal(result, expected) @@ -158,15 +155,18 @@ def test_read_csv_buglet_4x_multi_index(python_parser_only): x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" parser = python_parser_only - expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]], - columns=["A", "B", "C", "D", "E"], - index=MultiIndex.from_tuples([ - ("a", "b", 10.0032, 5), - ("a", "q", 20, 4), - ("x", "q", 30, 3), - ], names=["one", "two", "three", "four"])) + expected = DataFrame( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], + ], + columns=["A", "B", "C", "D", "E"], + index=MultiIndex.from_tuples( + [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], + names=["one", "two", "three", "four"], + ), + ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -178,7 +178,9 @@ def test_read_csv_buglet_4x_multi_index2(python_parser_only): expected = DataFrame.from_records( [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], - columns=list("abcABC"), index=list("abc")) + columns=list("abcABC"), + index=list("abc"), + ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -198,15 +200,16 @@ def test_skipfooter_with_decimal(python_parser_only, add_footer): else: kwargs = dict() - result = parser.read_csv(StringIO(data), names=["a"], - decimal="#", **kwargs) + result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5", - "%!c!d", "@@#4:2", "_!pd#_"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le", - "utf-32", "cp037"]) +@pytest.mark.parametrize( + "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] +) +@pytest.mark.parametrize( + "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] +) def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): # see gh-3404 expected = DataFrame({"a": [1], "b": [2]}) @@ -215,8 +218,9 @@ def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): data = "1" + sep + "2" encoded_data = data.encode(encoding) - result = parser.read_csv(BytesIO(encoded_data), sep=sep, - names=["a", "b"], encoding=encoding) + result = parser.read_csv( + BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding + ) tm.assert_frame_equal(result, expected) @@ -251,17 +255,16 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, - sep=None, warn_bad_lines=True, - error_bad_lines=False) + result = parser.read_csv( + StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False + ) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert "Skipping line 3" in captured.err -@pytest.mark.parametrize("data", [ - 'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) +@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) def test_skipfooter_bad_row(python_parser_only, data, skipfooter): # see gh-13879 and gh-15910 @@ -292,5 +295,4 @@ def test_malformed_skipfooter(python_parser_only): """ msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, - comment="#", skipfooter=1) + parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 71d23077737911..94858226d0b441 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -14,12 +14,17 @@ import pandas.util.testing as tm -@pytest.mark.parametrize("kwargs,msg", [ - (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'), - (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), - "quotechar must be set if quoting enabled"), - (dict(quotechar=2), '"quotechar" must be string, not int') -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'), + ( + dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), + "quotechar must be set if quoting enabled", + ), + (dict(quotechar=2), '"quotechar" must be string, not int'), + ], +) def test_bad_quote_char(all_parsers, kwargs, msg): data = "1,2,3" parser = all_parsers @@ -28,10 +33,13 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) -@pytest.mark.parametrize("quoting,msg", [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] -]) +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) def test_bad_quoting(all_parsers, quoting, msg): data = "1,2,3" parser = all_parsers @@ -43,8 +51,7 @@ def test_bad_quoting(all_parsers, quoting, msg): def test_quote_char_basic(all_parsers): parser = all_parsers data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], - columns=["a", "b", "c"]) + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) result = parser.read_csv(StringIO(data), quotechar='"') tm.assert_frame_equal(result, expected) @@ -53,8 +60,7 @@ def test_quote_char_basic(all_parsers): @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], - columns=["a", "b", "c"]) + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) data = 'a,b,c\n1,2,"cat"' new_data = data.replace('"', quote_char) @@ -82,23 +88,22 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,exp_data", [ - (dict(), [[1, 2, "foo"]]), # Test default. - - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. 
- (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), - - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), - - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. - (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), - - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]) -]) +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + (dict(), [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]), + ], +) def test_quoting_various(all_parsers, kwargs, exp_data): data = '1,2,"foo"' parser = all_parsers @@ -109,21 +114,19 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("doublequote,exp_data", [ - (True, [[3, '4 " 5']]), - (False, [[3, '4 " 5"']]), -]) +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) def test_double_quote(all_parsers, doublequote, exp_data): parser = all_parsers data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar='"', - doublequote=doublequote) + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) expected = DataFrame(exp_data, columns=["a", "b"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("quotechar", ['"', '\u0001']) +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) def test_quotechar_unicode(all_parsers, quotechar): # see gh-14477 data = "a\n1" @@ -138,7 +141,7 @@ def test_quotechar_unicode(all_parsers, quotechar): def test_unbalanced_quoting(all_parsers, balanced): # see gh-22789. parser = all_parsers - data = "a,b,c\n1,2,\"3" + data = 'a,b,c\n1,2,"3' if balanced: # Re-balance the quoting and read in without errors. 
@@ -146,8 +149,11 @@ def test_unbalanced_quoting(all_parsers, balanced): result = parser.read_csv(StringIO(data + '"')) tm.assert_frame_equal(result, expected) else: - msg = ("EOF inside string starting at row 1" if parser.engine == "c" - else "unexpected end of data") + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 10859bc41d5084..72885315e06bcd 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -27,12 +27,16 @@ def test_basic(): 201162 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data)) - expected = DataFrame([[201158, 360.242940, 149.910199, 11950.7], - [201159, 444.953632, 166.985655, 11788.4], - [201160, 364.136849, 183.628767, 11806.2], - [201161, 413.836124, 184.375703, 11916.8], - [201162, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D"]) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [201160, 364.136849, 183.628767, 11806.2], + [201161, 413.836124, 184.375703, 11916.8], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) tm.assert_frame_equal(result, expected) @@ -48,12 +52,16 @@ def test_colspecs(): colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs) - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -68,12 +76,16 @@ def test_widths(): """ result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -95,12 +107,16 @@ def test_non_space_filler(): colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~") - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + 
[2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -157,8 +173,9 @@ def test_read_csv_compat(): def test_bytes_io_input(): - result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), - widths=[2, 2], encoding="utf8") + result = read_fwf( + BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8" + ) expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) tm.assert_frame_equal(result, expected) @@ -195,12 +212,15 @@ def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): read_fwf(StringIO(data), [("a", 1)]) -@pytest.mark.parametrize("colspecs,exp_data", [ - ([(0, 3), (3, None)], [[123, 456], [456, 789]]), - ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), - ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), - ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), -]) +@pytest.mark.parametrize( + "colspecs,exp_data", + [ + ([(0, 3), (3, None)], [[123, 456], [456, 789]]), + ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), + ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), + ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), + ], +) def test_fwf_colspecs_none(colspecs, exp_data): # see gh-7079 data = """\ @@ -213,13 +233,15 @@ def test_fwf_colspecs_none(colspecs, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("infer_nrows,exp_data", [ - # infer_nrows --> colspec == [(2, 3), (5, 6)] - (1, [[1, 2], [3, 8]]), - - # infer_nrows > number of rows - (10, [[1, 2], [123, 98]]), -]) +@pytest.mark.parametrize( + "infer_nrows,exp_data", + [ + # infer_nrows --> colspec == [(2, 3), (5, 6)] + (1, [[1, 2], [3, 8]]), + # infer_nrows > number of rows + (10, [[1, 2], [123, 98]]), + ], +) def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): # see gh-15138 data = """\ @@ -247,39 +269,59 @@ def test_fwf_regression(): 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - result = read_fwf(StringIO(data), index_col=0, header=None, names=names, - widths=widths, parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S")) - expected = DataFrame([ - [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], - [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], - [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], - [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], - [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], - ], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00", - "2009-06-13 20:40:00", "2009-06-13 20:50:00", - "2009-06-13 21:00:00"]), - columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"]) + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), + ) + expected = DataFrame( + [ + [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], + [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], + [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], + [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], + [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], + ], + index=DatetimeIndex( + [ + "2009-06-13 20:20:00", + "2009-06-13 20:30:00", + "2009-06-13 20:40:00", + "2009-06-13 20:50:00", + "2009-06-13 21:00:00", + ] + ), + columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], + ) tm.assert_frame_equal(result, expected) def 
test_fwf_for_uint8(): data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa - df = read_fwf(StringIO(data), - colspecs=[(0, 17), (25, 26), (33, 37), - (49, 51), (58, 62), (63, 1000)], - names=["time", "pri", "pgn", "dst", "src", "data"], - converters={ - "pgn": lambda x: int(x, 16), - "src": lambda x: int(x, 16), - "dst": lambda x: int(x, 16), - "data": lambda x: len(x.split(" "))}) - - expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8], - [1421302964.226776, 6, 61442, None, 71, 8]], - columns=["time", "pri", "pgn", - "dst", "src", "data"]) + df = read_fwf( + StringIO(data), + colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], + names=["time", "pri", "pgn", "dst", "src", "data"], + converters={ + "pgn": lambda x: int(x, 16), + "src": lambda x: int(x, 16), + "dst": lambda x: int(x, 16), + "data": lambda x: len(x.split(" ")), + }, + ) + + expected = DataFrame( + [ + [1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8], + ], + columns=["time", "pri", "pgn", "dst", "src", "data"], + ) expected["dst"] = expected["dst"].astype(object) tm.assert_frame_equal(df, expected) @@ -293,10 +335,9 @@ def test_fwf_comment(comment): data = data.replace("#", comment) colspecs = [(0, 3), (4, 9), (9, 25)] - expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]]) + expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) - result = read_fwf(StringIO(data), colspecs=colspecs, - header=None, comment=comment) + result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) tm.assert_almost_equal(result, expected) @@ -309,10 +350,11 @@ def test_fwf_thousands(thousands): data = data.replace(",", thousands) colspecs = [(0, 3), (3, 11), (12, 16)] - expected = DataFrame([[1, 2334., 5], [10, 13, 10.]]) + expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) - result = read_fwf(StringIO(data), header=None, - colspecs=colspecs, thousands=thousands) + result = read_fwf( + StringIO(data), header=None, colspecs=colspecs, thousands=thousands + ) tm.assert_almost_equal(result, expected) @@ -374,7 +416,9 @@ def test_full_file_with_spaces(): 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -391,7 +435,9 @@ def test_full_file_with_spaces_and_missing(): 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -408,7 +454,9 @@ def test_messed_up_data(): 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -424,7 +472,9 @@ def test_multiple_delimiters(): ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray -""".strip("\r\n") +""".strip( + "\r\n" + ) delimiter = " +~.\\" colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) @@ -438,19 +488,20 @@ def test_variable_width_unicode(): שלום שלום ום שלל של 
ום -""".strip("\r\n") +""".strip( + "\r\n" + ) encoding = "utf8" kwargs = dict(header=None, encoding=encoding) - expected = read_fwf(BytesIO(data.encode(encoding)), - colspecs=[(0, 4), (5, 9)], **kwargs) + expected = read_fwf( + BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs + ) result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [ - dict(), {"a": "float64", "b": str, "c": "int32"} -]) +@pytest.mark.parametrize("dtype", [dict(), {"a": "float64", "b": str, "c": "int32"}]) def test_dtype(dtype): data = """ a b c 1 2 3.2 @@ -459,9 +510,9 @@ def test_dtype(dtype): colspecs = [(0, 5), (5, 10), (10, None)] result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) - expected = pd.DataFrame({ - "a": [1, 3], "b": [2, 4], - "c": [3.2, 5.2]}, columns=["a", "b", "c"]) + expected = pd.DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] + ) for col, dt in dtype.items(): expected[col] = expected[col].astype(dt) @@ -479,8 +530,7 @@ def test_skiprows_inference(): 101.6 956.1 """.strip() skiprows = 2 - expected = read_csv(StringIO(data), skiprows=skiprows, - delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -495,8 +545,7 @@ def test_skiprows_by_index_inference(): 456 78 9 456 """.strip() skiprows = [0, 2] - expected = read_csv(StringIO(data), skiprows=skiprows, - delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -524,8 +573,9 @@ def test_whitespace_preservation(): fwf_data = """ a bbb ccdd """ - result = read_fwf(StringIO(fwf_data), widths=[3, 3], - header=header, skiprows=[0], delimiter="\n\t") + result = read_fwf( + StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + ) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) @@ -539,8 +589,7 @@ def test_default_delimiter(): fwf_data = """ a \tbbb cc\tdd """ - result = read_fwf(StringIO(fwf_data), widths=[3, 3], - header=header, skiprows=[0]) + result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index e63aac9d6ff6fe..d4f219d13ac532 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -29,24 +29,28 @@ def test_skip_rows_bug(all_parsers, skiprows): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None, - index_col=0, parse_dates=True) - index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)], name=0) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], index=index) + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) tm.assert_frame_equal(result, expected) def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers - data 
= "a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) - for i in range(10)]) - condensed_data = "a,b,c\n" + "\n".join([ - ",".join([str(i), str(i + 1), str(i + 2)]) - for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] + ) result = parser.read_csv(StringIO(data), skiprows=[6, 8]) condensed_result = parser.read_csv(StringIO(condensed_data)) @@ -66,38 +70,52 @@ def test_skip_rows_blank(all_parsers): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - data = parser.read_csv(StringIO(text), skiprows=6, header=None, - index_col=0, parse_dates=True) - index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)], name=0) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], - index=index) + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) tm.assert_frame_equal(data, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""id,text,num_lines +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - dict(skiprows=[1]), - DataFrame([[2, "line 21\nline 22", 2], - [3, "line 31", 1]], columns=["id", "text", "num_lines"])), - ("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - dict(quotechar="~", skiprows=[2]), - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])), - (("Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3"), - dict(quotechar="~", skiprows=[1, 3]), - DataFrame([['example\n sentence\n two', 'url2']], - columns=["Text", "url"])) -]) + dict(skiprows=[1]), + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + dict(quotechar="~", skiprows=[2]), + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + dict(quotechar="~", skiprows=[1, 3]), + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) def test_skip_row_with_newline(all_parsers, data, kwargs, expected): # see gh-12775 and gh-10911 parser = all_parsers @@ -113,35 +131,39 @@ def test_skip_row_with_quote(all_parsers): 2,"line '21' line 22",2 3,"line '31' line 32",1""" - exp_data = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=[ - "id", "text", "num_lines"]) + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) result = parser.read_csv(StringIO(data), skiprows=[1]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,exp_data", [ - ("""id,text,num_lines +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines 1,"line \n'11' line 12",2 2,"line \n'21' line 22",2 3,"line \n'31' line 32",1""", - [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]]), - 
("""id,text,num_lines + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines 1,"line '11\n' line 12",2 2,"line '21\n' line 22",2 3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]]), - ("""id,text,num_lines + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines 1,"line '11\n' \r\tline 12",2 2,"line '21\n' \r\tline 22",2 3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]]), -]) + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): # see gh-12775 and gh-10911 parser = all_parsers @@ -151,54 +173,64 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("line_terminator", [ - "\n", # "LF" - "\r\n", # "CRLF" - "\r" # "CR" -]) +@pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) def test_skiprows_lineterminator(all_parsers, line_terminator): # see gh-9079 parser = all_parsers - data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M "]) - expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"]], - columns=["date", "time", "var", "flag", - "oflag"]) + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) if parser.engine == "python" and line_terminator == "\r": pytest.skip("'CR' not respect with the Python parser yet") data = data.replace("\n", line_terminator) - result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"]) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers - data = "a\"\nb\"\na\n1" + data = 'a"\nb"\na\n1' expected = DataFrame({"a": [1]}) result = parser.read_csv(StringIO(data), skiprows=2) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - (dict(), DataFrame({"1": [3, 5]})), - (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})) -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(), DataFrame({"1": [3, 5]})), + (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})), + ], +) def test_skip_rows_callable(all_parsers, kwargs, expected): parser = all_parsers data = "a\n1\n2\n3\n4\n5" - result = parser.read_csv(StringIO(data), - skiprows=lambda x: x % 2 == 0, - **kwargs) + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 2177d6bb931089..57096a2652b883 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ 
-20,16 +20,15 @@ class TestTextReader: - @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.dirpath = datapath('io', 'parser', 'data') - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.dirpath = datapath("io", "parser", "data") + self.csv1 = os.path.join(self.dirpath, "test1.csv") + self.csv2 = os.path.join(self.dirpath, "test2.csv") + self.xls1 = os.path.join(self.dirpath, "test.xls") def test_file_handle(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: reader = TextReader(f) reader.read() @@ -38,12 +37,12 @@ def test_string_filename(self): reader.read() def test_file_handle_mmap(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: reader = TextReader(f, memory_map=True, header=None) reader.read() def test_StringIO(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: text = f.read() src = BytesIO(text) reader = TextReader(src, header=None) @@ -51,28 +50,26 @@ def test_StringIO(self): def test_string_factorize(self): # should this be optional? - data = 'a\nb\na\nb\na' + data = "a\nb\na\nb\na" reader = TextReader(StringIO(data), header=None) result = reader.read() assert len(set(map(id, result[0]))) == 2 def test_skipinitialspace(self): - data = ('a, b\n' - 'a, b\n' - 'a, b\n' - 'a, b') + data = "a, b\n" "a, b\n" "a, b\n" "a, b" - reader = TextReader(StringIO(data), skipinitialspace=True, - header=None) + reader = TextReader(StringIO(data), skipinitialspace=True, header=None) result = reader.read() - tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'], - dtype=np.object_)) - tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'], - dtype=np.object_)) + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b", "b"], dtype=np.object_) + ) def test_parse_booleans(self): - data = 'True\nFalse\nTrue\nTrue' + data = "True\nFalse\nTrue\nTrue" reader = TextReader(StringIO(data), header=None) result = reader.read() @@ -82,14 +79,15 @@ def test_parse_booleans(self): def test_delimit_whitespace(self): data = 'a b\na\t\t "b"\n"a"\t \t b' - reader = TextReader(StringIO(data), delim_whitespace=True, - header=None) + reader = TextReader(StringIO(data), delim_whitespace=True, header=None) result = reader.read() - tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'], - dtype=np.object_)) - tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'], - dtype=np.object_)) + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b"], dtype=np.object_) + ) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -97,34 +95,33 @@ def test_embedded_newline(self): reader = TextReader(StringIO(data), header=None) result = reader.read() - expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_) + expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) tm.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): - data = '12345,67\n345,678' + data = "12345,67\n345,678" - reader = TextReader(StringIO(data), delimiter=':', - decimal=',', header=None) + reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) result = reader.read() expected = np.array([12345.67, 345.678]) 
tm.assert_almost_equal(result[0], expected) def test_integer_thousands(self): - data = '123,456\n12,500' + data = "123,456\n12,500" - reader = TextReader(StringIO(data), delimiter=':', - thousands=',', header=None) + reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) result = reader.read() expected = np.array([123456, 12500], dtype=np.int64) tm.assert_almost_equal(result[0], expected) def test_integer_thousands_alt(self): - data = '123.456\n12.500' + data = "123.456\n12.500" - reader = TextFileReader(StringIO(data), delimiter=':', - thousands='.', header=None) + reader = TextFileReader( + StringIO(data), delimiter=":", thousands=".", header=None + ) result = reader.read() expected = DataFrame([123456, 12500]) @@ -132,65 +129,61 @@ def test_integer_thousands_alt(self): def test_skip_bad_lines(self, capsys): # too many lines, see #2430 for why - data = ('a:b:c\n' - 'd:e:f\n' - 'g:h:i\n' - 'j:k:l:m\n' - 'l:m:n\n' - 'o:p:q:r') - - reader = TextReader(StringIO(data), delimiter=':', - header=None) - msg = (r"Error tokenizing data\. C error: Expected 3 fields in" - " line 4, saw 4") + data = "a:b:c\n" "d:e:f\n" "g:h:i\n" "j:k:l:m\n" "l:m:n\n" "o:p:q:r" + + reader = TextReader(StringIO(data), delimiter=":", header=None) + msg = r"Error tokenizing data\. C error: Expected 3 fields in" " line 4, saw 4" with pytest.raises(parser.ParserError, match=msg): reader.read() - reader = TextReader(StringIO(data), delimiter=':', - header=None, - error_bad_lines=False, - warn_bad_lines=False) + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=False, + ) result = reader.read() - expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object), - 1: np.array(['b', 'e', 'h', 'm'], dtype=object), - 2: np.array(['c', 'f', 'i', 'n'], dtype=object)} + expected = { + 0: np.array(["a", "d", "g", "l"], dtype=object), + 1: np.array(["b", "e", "h", "m"], dtype=object), + 2: np.array(["c", "f", "i", "n"], dtype=object), + } assert_array_dicts_equal(result, expected) - reader = TextReader(StringIO(data), delimiter=':', - header=None, - error_bad_lines=False, - warn_bad_lines=True) + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=True, + ) reader.read() captured = capsys.readouterr() - assert 'Skipping line 4' in captured.err - assert 'Skipping line 6' in captured.err + assert "Skipping line 4" in captured.err + assert "Skipping line 6" in captured.err def test_header_not_enough_lines(self): - data = ('skip this\n' - 'skip this\n' - 'a,b,c\n' - '1,2,3\n' - '4,5,6') + data = "skip this\n" "skip this\n" "a,b,c\n" "1,2,3\n" "4,5,6" - reader = TextReader(StringIO(data), delimiter=',', header=2) + reader = TextReader(StringIO(data), delimiter=",", header=2) header = reader.header - expected = [['a', 'b', 'c']] + expected = [["a", "b", "c"]] assert header == expected recs = reader.read() - expected = {0: np.array([1, 4], dtype=np.int64), - 1: np.array([2, 5], dtype=np.int64), - 2: np.array([3, 6], dtype=np.int64)} + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64), + } assert_array_dicts_equal(recs, expected) def test_escapechar(self): - data = ('\\"hello world\"\n' - '\\"hello world\"\n' - '\\"hello world\"') + data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"' - reader = TextReader(StringIO(data), delimiter=',', header=None, - escapechar='\\') + reader = TextReader(StringIO(data), 
delimiter=",", header=None, escapechar="\\") result = reader.read() expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) @@ -211,24 +204,23 @@ def test_numpy_string_dtype(self): aaaaa,5""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', header=None, - **kwds) + return TextReader(StringIO(data), delimiter=",", header=None, **kwds) - reader = _make_reader(dtype='S5,i4') + reader = _make_reader(dtype="S5,i4") result = reader.read() - assert result[0].dtype == 'S5' + assert result[0].dtype == "S5" - ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5') + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5") assert (result[0] == ex_values).all() - assert result[1].dtype == 'i4' + assert result[1].dtype == "i4" - reader = _make_reader(dtype='S4') + reader = _make_reader(dtype="S4") result = reader.read() - assert result[0].dtype == 'S4' - ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4') + assert result[0].dtype == "S4" + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4") assert (result[0] == ex_values).all() - assert result[1].dtype == 'S4' + assert result[1].dtype == "S4" def test_pass_dtype(self): data = """\ @@ -239,23 +231,22 @@ def test_pass_dtype(self): 4,d""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds) - reader = _make_reader(dtype={'one': 'u1', 1: 'S1'}) + reader = _make_reader(dtype={"one": "u1", 1: "S1"}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'S1' + assert result[0].dtype == "u1" + assert result[1].dtype == "S1" - reader = _make_reader(dtype={'one': np.uint8, 1: object}) + reader = _make_reader(dtype={"one": np.uint8, 1: object}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'O' + assert result[0].dtype == "u1" + assert result[1].dtype == "O" - reader = _make_reader(dtype={'one': np.dtype('u1'), - 1: np.dtype('O')}) + reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'O' + assert result[0].dtype == "u1" + assert result[1].dtype == "O" def test_usecols(self): data = """\ @@ -266,7 +257,7 @@ def test_usecols(self): 10,11,12""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds) reader = _make_reader(usecols=(1, 2)) result = reader.read() @@ -278,72 +269,76 @@ def _make_reader(**kwds): def test_cr_delimited(self): def _test(text, **kwargs): - nice_text = text.replace('\r', '\r\n') + nice_text = text.replace("\r", "\r\n") result = TextReader(StringIO(text), **kwargs).read() expected = TextReader(StringIO(nice_text), **kwargs).read() assert_array_dicts_equal(result, expected) - data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12' - _test(data, delimiter=',') + data = "a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12" + _test(data, delimiter=",") - data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12' + data = "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12" _test(data, delim_whitespace=True) - data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12' - _test(data, delimiter=',') + data = "a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12" + _test(data, delimiter=",") - sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r' - 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r' - ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0') - _test(sample, delimiter=',') + 
sample = ( + "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r" + "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r" + ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0" + ) + _test(sample, delimiter=",") - data = 'A B C\r 2 3\r4 5 6' + data = "A B C\r 2 3\r4 5 6" _test(data, delim_whitespace=True) - data = 'A B C\r2 3\r4 5 6' + data = "A B C\r2 3\r4 5 6" _test(data, delim_whitespace=True) def test_empty_field_eof(self): - data = 'a,b,c\n1,2,3\n4,,' + data = "a,b,c\n1,2,3\n4,," - result = TextReader(StringIO(data), delimiter=',').read() + result = TextReader(StringIO(data), delimiter=",").read() - expected = {0: np.array([1, 4], dtype=np.int64), - 1: np.array(['2', ''], dtype=object), - 2: np.array(['3', ''], dtype=object)} + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array(["2", ""], dtype=object), + 2: np.array(["3", ""], dtype=object), + } assert_array_dicts_equal(result, expected) # GH5664 - a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c']) - b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], - columns=list('abcd'), - index=[1, 1]) - c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], - [8, 9, 10, 11], [13, 14, nan, nan]], - columns=list('abcd'), - index=[0, 5, 7, 12]) + a = DataFrame([["b"], [nan]], columns=["a"], index=["a", "c"]) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) + c = DataFrame( + [[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]], + columns=list("abcd"), + index=[0, 5, 7, 12], + ) for _ in range(100): - df = read_csv(StringIO('a,b\nc\n'), skiprows=0, - names=['a'], engine='c') + df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") assert_frame_equal(df, a) - df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2), - names=list("abcd"), engine='c') + df = read_csv( + StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" + ) assert_frame_equal(df, b) - df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'), - names=list('abcd'), engine='c') + df = read_csv( + StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), + names=list("abcd"), + engine="c", + ) assert_frame_equal(df, c) def test_empty_csv_input(self): # GH14867 - df = read_csv(StringIO(), chunksize=20, header=None, - names=['a', 'b', 'c']) + df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) assert isinstance(df, TextFileReader) def assert_array_dicts_equal(left, right): for k, v in left.items(): - tm.assert_numpy_array_equal(np.asarray(v), - np.asarray(right[k])) + tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k])) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index a8748c88e0e558..f135fac65f56a7 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -24,40 +24,37 @@ def python_engine(request): class TestUnsupportedFeatures: - def test_mangle_dupe_cols_false(self): # see gh-12935 - data = 'a b c\n1 2 3' - msg = 'is not supported' + data = "a b c\n1 2 3" + msg = "is not supported" - for engine in ('c', 'python'): + for engine in ("c", "python"): with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine=engine, - mangle_dupe_cols=False) + read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) def test_c_engine(self): # see gh-6607 - data = 'a b c\n1 2 3' - msg = 'does not support' + data = "a b c\n1 2 3" + msg = "does not support" # specify C engine with unsupported options (raise) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', - 
sep=None, delim_whitespace=False) + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', sep=r'\s') + read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128)) + read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128)) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', skipfooter=1) + read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep=r'\s') + read_csv(StringIO(data), sep=r"\s") with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep='\t', quotechar=chr(128)) + read_csv(StringIO(data), sep="\t", quotechar=chr(128)) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), skipfooter=1) @@ -66,12 +63,12 @@ def test_c_engine(self): a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - msg = 'Error tokenizing data' + msg = "Error tokenizing data" with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), sep='\\s+') + read_csv(StringIO(text), sep="\\s+") with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), engine='c', sep='\\s+') + read_csv(StringIO(text), engine="c", sep="\\s+") msg = "Only length-1 thousands markers supported" data = """A|B|C @@ -79,14 +76,14 @@ def test_c_engine(self): 10|13|10. 
""" with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands=',,') + read_csv(StringIO(data), thousands=",,") with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands='') + read_csv(StringIO(data), thousands="") msg = "Only length-1 line terminators supported" - data = 'a,b,c~~1,2,3~~4,5,6' + data = "a,b,c~~1,2,3~~4,5,6" with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), lineterminator='~~') + read_csv(StringIO(data), lineterminator="~~") def test_python_engine(self, python_engine): from pandas.io.parsers import _python_unsupported as py_unsupported @@ -98,8 +95,10 @@ def test_python_engine(self, python_engine): 1,2,3,4,""" for default in py_unsupported: - msg = ('The %r option is not supported ' - 'with the %r engine' % (default, python_engine)) + msg = "The %r option is not supported " "with the %r engine" % ( + default, + python_engine, + ) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 92cd0e873c02f9..b449e848a0b5a0 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -12,19 +12,25 @@ from pandas import DataFrame, Index import pandas.util.testing as tm -_msg_validate_usecols_arg = ("'usecols' must either be list-like " - "of all strings, all unicode, all " - "integers or a callable.") -_msg_validate_usecols_names = ("Usecols do not match columns, columns " - "expected but not found: {0}") - - -@pytest.mark.parametrize("names,usecols,missing", [ - (None, [0, 3], r"\[3\]"), - (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), - (None, [3], r"\[3\]"), - (["a"], [3], r"\[3\]") -]) +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns " "expected but not found: {0}" +) + + +@pytest.mark.parametrize( + "names,usecols,missing", + [ + (None, [0, 3], r"\[3\]"), + (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), + (None, [3], r"\[3\]"), + (["a"], [3], r"\[3\]"), + ], +) def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): # See gh-25623 data = "a,b,c\n1,2,3\n4,5,6" @@ -59,8 +65,7 @@ def test_usecols(all_parsers, usecols): parser = all_parsers result = parser.read_csv(StringIO(data), usecols=usecols) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=["b", "c"]) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) @@ -73,18 +78,15 @@ def test_usecols_with_names(all_parsers): 10,11,12""" parser = all_parsers names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, - usecols=[1, 2], header=0) + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=names) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("names,usecols", [ - (["b", "c"], [1, 2]), - (["a", "b", "c"], ["b", "c"]) -]) +@pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] +) def test_usecols_relative_to_names(all_parsers, names, usecols): data = """\ 1,2,3 @@ -92,11 +94,9 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): 7,8,9 10,11,12""" parser = all_parsers - result = parser.read_csv(StringIO(data), names=names, - header=None, usecols=usecols) + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=["b", "c"]) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) @@ -108,11 +108,11 @@ def test_usecols_relative_to_names2(all_parsers): 7,8,9 10,11,12""" parser = all_parsers - result = parser.read_csv(StringIO(data), names=["a", "b"], - header=None, usecols=[0, 1]) + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) - expected = DataFrame([[1, 2], [4, 5], [7, 8], - [10, 11]], columns=["a", "b"]) + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -123,14 +123,14 @@ def test_usecols_name_length_conflict(all_parsers): 7,8,9 10,11,12""" parser = all_parsers - msg = ("Number of passed names did not " - "match number of header fields in the file" - if parser.engine == "python" else - "Passed header names mismatches usecols") + msg = ( + "Number of passed names did not " "match number of header fields in the file" + if parser.engine == "python" + else "Passed header names mismatches usecols" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], - header=None, usecols=[1]) + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) def test_usecols_single_string(all_parsers): @@ -144,8 +144,9 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", - "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]) +@pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] +) def 
test_usecols_index_col_false(all_parsers, data): # see gh-9082 parser = all_parsers @@ -164,8 +165,7 @@ def test_usecols_index_col_conflict(all_parsers, usecols, index_col): data = "a,b,c,d\nA,a,1,one\nB,b,2,two" expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - result = parser.read_csv(StringIO(data), usecols=usecols, - index_col=index_col) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) tm.assert_frame_equal(result, expected) @@ -177,8 +177,9 @@ def test_usecols_index_col_conflict2(all_parsers): expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) expected = expected.set_index(["b", "c"]) - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], - index_col=["b", "c"]) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) tm.assert_frame_equal(result, expected) @@ -188,8 +189,7 @@ def test_usecols_implicit_index_col(all_parsers): data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) @@ -199,8 +199,7 @@ def test_usecols_regex_sep(all_parsers): data = "a b c\n4 apple bat 5.7\n8 orange cow 10" result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) @@ -208,22 +207,20 @@ def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), delim_whitespace=True, - usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,expected", [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], - columns=["2", "0"])), - - # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], - columns=["0", "1"])), -]) +@pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. 
+ (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), + ], +) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): parser = all_parsers data = """2,0,1 @@ -245,14 +242,10 @@ def test_usecols_with_parse_dates(all_parsers, usecols): cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @@ -267,16 +260,25 @@ def test_usecols_with_parse_dates2(all_parsers): usecols = names[:] parse_dates = [0] - index = Index([Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00")], - name="date") + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) cols = {"values": [1032.43, 1042.54, 1051.65]} expected = DataFrame(cols, index=index) - result = parser.read_csv(StringIO(data), parse_dates=parse_dates, - index_col=0, usecols=usecols, - header=None, names=names) + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) tm.assert_frame_equal(result, expected) @@ -289,14 +291,21 @@ def test_usecols_with_parse_dates3(all_parsers): usecols = list("abcdefghij") parse_dates = [0] - cols = {"a": Timestamp("2016-09-21"), - "b": [1], "c": [1], "d": [2], - "e": [3], "f": [4], "g": [5], - "h": [6], "i": [7], "j": [8]} + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } expected = DataFrame(cols, columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @@ -306,21 +315,31 @@ def test_usecols_with_parse_dates4(all_parsers): parse_dates = [[0, 1]] parser = all_parsers - cols = {"a_b": "2016/09/21 1", - "c": [1], "d": [2], "e": [3], "f": [4], - "g": [5], "h": [6], "i": [7], "j": [8]} + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize("names", [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. -]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. 
+ ], +) def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): # see gh-9755 s = """0,1,20140101,0900,4 @@ -330,16 +349,13 @@ def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(s), names=names, - parse_dates=parse_dates, - usecols=usecols) + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) tm.assert_frame_equal(result, expected) @@ -352,12 +368,8 @@ def test_usecols_with_unicode_strings(all_parsers): parser = all_parsers exp_data = { - "AAA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "BBB": {0: 8, 1: 2, 2: 7} + "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "BBB": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -374,12 +386,8 @@ def test_usecols_with_single_byte_unicode_strings(all_parsers): parser = all_parsers exp_data = { - "A": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "B": {0: 8, 1: 2, 2: 7} + "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "B": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -399,10 +407,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): parser.read_csv(StringIO(data), usecols=usecols) -@pytest.mark.parametrize("usecols", [ - ["あああ", "いい"], - ["あああ", "いい"] -]) +@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) def test_usecols_with_multi_byte_characters(all_parsers, usecols): data = """あああ,いい,ううう,ええええ 0.056674973,8,True,a @@ -411,12 +416,8 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols): parser = all_parsers exp_data = { - "あああ": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "いい": {0: 8, 1: 2, 2: 7} + "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "いい": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -444,19 +445,26 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,expected", [ - (lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame({ - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"} - })), - (lambda x: False, DataFrame()), -]) +@pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], +) def test_callable_usecols(all_parsers, usecols, expected): # see gh-14154 data = """AaA,bBb,CCC,ddd @@ -481,18 +489,31 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,usecols,kwargs,expected", [ - # see gh-8985 - ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2], - dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])), - - # see gh-9549 - (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" - "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"], - dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 
2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7]})), -]) +@pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + dict(header=None), + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" "1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + dict(), + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], +) def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): # see gh-8985 parser = all_parsers @@ -500,27 +521,50 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,kwargs,expected,msg", [ - (["a", "b", "c", "d"], dict(), - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None), - (["a", "b", "c", "f"], dict(), None, - _msg_validate_usecols_names.format(r"\['f'\]")), - (["a", "b", "f"], dict(), None, - _msg_validate_usecols_names.format(r"\['f'\]")), - (["a", "b", "f", "g"], dict(), None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")), - - # see gh-14671 - (None, dict(header=0, names=["A", "B", "C", "D"]), - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], - "D": [4, 8]}), None), - (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]), - None, _msg_validate_usecols_names.format(r"\['f'\]")), - (["A", "B", "f"], dict(names=["A", "B", "C", "D"]), - None, _msg_validate_usecols_names.format(r"\['f'\]")), -]) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, - kwargs, expected, msg): +@pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + dict(), + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + dict(), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], dict(), None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + dict(), + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + dict(header=0, names=["A", "B", "C", "D"]), + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + dict(header=0, names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + dict(names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], +) +def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" kwargs.update(usecols=usecols) parser = all_parsers @@ -534,15 +578,14 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, @pytest.mark.xfail( - reason="see gh-16469: works on the C engine but not the Python engine", - strict=False) + reason="see gh-16469: works on the C engine but not the Python engine", strict=False +) @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" names = ["A", "B", "C", "D"] parser = all_parsers - result = parser.read_csv(StringIO(data), header=0, - names=names, usecols=usecols) + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) expected = DataFrame({"A": [1, 5], "C": [3, 7]}) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index 34ed066dd37488..f5f73beab6d609 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -4,7 +4,7 @@ from pandas.tests.io.pytables.test_pytables import ensure_clean_path from pandas.util.testing import assert_frame_equal -tables = pytest.importorskip('tables') +tables = pytest.importorskip("tables") @pytest.fixture @@ -12,26 +12,26 @@ def pytables_hdf5_file(): """Use PyTables to create a simple HDF5 file.""" table_schema = { - 'c0': tables.Time64Col(pos=0), - 'c1': tables.StringCol(5, pos=1), - 'c2': tables.Int64Col(pos=2), + "c0": tables.Time64Col(pos=0), + "c1": tables.StringCol(5, pos=1), + "c2": tables.Int64Col(pos=2), } t0 = 1561105000.0 testsamples = [ - {'c0': t0, 'c1': 'aaaaa', 'c2': 1}, - {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2}, - {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5}, - {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295}, + {"c0": t0, "c1": "aaaaa", "c2": 1}, + {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, + {"c0": t0 + 2, "c1": "ccccc", "c2": 10 ** 5}, + {"c0": t0 + 3, "c1": "ddddd", "c2": 4294967295}, ] - objname = 'pandas_test_timeseries' + objname = "pandas_test_timeseries" - with ensure_clean_path('written_with_pytables.h5') as path: + with ensure_clean_path("written_with_pytables.h5") as path: # The `ensure_clean_path` context mgr removes the temp file upon exit. - with tables.open_file(path, mode='w') as f: - t = f.create_table('/', name=objname, description=table_schema) + with tables.open_file(path, mode="w") as f: + t = f.create_table("/", name=objname, description=table_schema) for sample in testsamples: for key, value in sample.items(): t.row[key] = value diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 00062b04d07d8a..fee7e1cb2ba5ff 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -17,33 +17,53 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Index, Int64Index, - MultiIndex, RangeIndex, Series, Timestamp, bdate_range, concat, date_range, - isna, timedelta_range) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + RangeIndex, + Series, + Timestamp, + bdate_range, + concat, + date_range, + isna, + timedelta_range, +) import pandas.util.testing as tm -from pandas.util.testing import ( - assert_frame_equal, assert_series_equal, set_timezone) +from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( - ClosedFileError, HDFStore, PossibleDataLossError, Term, read_hdf) + ClosedFileError, + HDFStore, + PossibleDataLossError, + Term, + read_hdf, +) from pandas.io.pytables import TableIterator # noqa:E402 -tables = pytest.importorskip('tables') +tables = pytest.importorskip("tables") # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 # and pytables 3.4.4 xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion('1.16') and - LooseVersion(tables.__version__) < LooseVersion('3.5.1'), - reason=('gh-25511, gh-24839. 
pytables needs a ' - 'release beyong 3.4.4 to support numpy 1.16x')) + LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. pytables needs a " + "release beyong 3.4.4 to support numpy 1.16x" + ), +) -_default_compressor = 'blosc' +_default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( @@ -82,8 +102,7 @@ def create_tempfile(path): @contextmanager -def ensure_clean_store(path, mode='a', complevel=None, complib=None, - fletcher32=False): +def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): try: @@ -91,12 +110,13 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, if not len(os.path.dirname(path)): path = create_tempfile(path) - store = HDFStore(path, mode=mode, complevel=complevel, - complib=complib, fletcher32=False) + store = HDFStore( + path, mode=mode, complevel=complevel, complib=complib, fletcher32=False + ) yield store finally: safe_close(store) - if mode == 'w' or mode == 'a': + if mode == "w" or mode == "a": safe_remove(path) @@ -135,7 +155,6 @@ def _maybe_remove(store, key): class Base: - @classmethod def setup_class(cls): @@ -149,7 +168,7 @@ def teardown_class(cls): tm.set_testing_mode() def setup_method(self, method): - self.path = 'tmp.__%s__.h5' % tm.rands(10) + self.path = "tmp.__%s__.h5" % tm.rands(10) def teardown_method(self, method): pass @@ -157,7 +176,6 @@ def teardown_method(self, method): @pytest.mark.single class TestHDFStore(Base): - def test_format_kwarg_in_constructor(self): # GH 13291 with ensure_clean_path(self.path) as path: @@ -168,7 +186,7 @@ def test_context(self): path = create_tempfile(self.path) try: with HDFStore(path) as tbl: - raise ValueError('blah') + raise ValueError("blah") except ValueError: pass finally: @@ -176,34 +194,35 @@ def test_context(self): try: with HDFStore(path) as tbl: - tbl['a'] = tm.makeDataFrame() + tbl["a"] = tm.makeDataFrame() with HDFStore(path) as tbl: assert len(tbl) == 1 - assert type(tbl['a']) == DataFrame + assert type(tbl["a"]) == DataFrame finally: safe_remove(path) def test_conv_read_write(self): path = create_tempfile(self.path) try: + def roundtrip(key, obj, **kwargs): obj.to_hdf(path, key, **kwargs) return read_hdf(path, key) o = tm.makeTimeSeries() - assert_series_equal(o, roundtrip('series', o)) + assert_series_equal(o, roundtrip("series", o)) o = tm.makeStringSeries() - assert_series_equal(o, roundtrip('string_series', o)) + assert_series_equal(o, roundtrip("string_series", o)) o = tm.makeDataFrame() - assert_frame_equal(o, roundtrip('frame', o)) + assert_frame_equal(o, roundtrip("frame", o)) # table df = DataFrame(dict(A=range(5), B=range(5))) - df.to_hdf(path, 'table', append=True) - result = read_hdf(path, 'table', where=['index>2']) + df.to_hdf(path, "table", append=True) + result = read_hdf(path, "table", where=["index>2"]) assert_frame_equal(df[df.index > 2], result) finally: @@ -212,13 +231,14 @@ def roundtrip(key, obj, **kwargs): def test_long_strings(self): # GH6166 - df = DataFrame({'a': tm.rands_array(100, size=10)}, - index=tm.rands_array(100, size=10)) + df = DataFrame( + {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['a']) + store.append("df", df, data_columns=["a"]) - result = store.select('df') + result = store.select("df") assert_frame_equal(df, result) def test_api(self): @@ -228,68 +248,68 @@ def 
test_api(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, 'df', append=True, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=True, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, 'df', append=False, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, 'df', append=True) - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=True) + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, 'df', append=False, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True) - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True) + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', append=False, format='fixed') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False, format="fixed") + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df', append=False, format='f') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False, format="f") + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df', append=False) - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False) + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df") + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_store(self.path) as store: path = store._path df = tm.makeDataFrame() - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=True, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) # append to False - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) # formats - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], 
append=True, format="table") + assert_frame_equal(store.select("df"), df) - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, format=None) - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format=None) + assert_frame_equal(store.select("df"), df) with ensure_clean_path(self.path) as path: # Invalid. @@ -318,53 +338,53 @@ def test_api_default_format(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - pd.set_option('io.hdf.default_format', 'fixed') - _maybe_remove(store, 'df') - store.put('df', df) - assert not store.get_storer('df').is_table + pd.set_option("io.hdf.default_format", "fixed") + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table with pytest.raises(ValueError): store.append("df2", df) - pd.set_option('io.hdf.default_format', 'table') - _maybe_remove(store, 'df') - store.put('df', df) - assert store.get_storer('df').is_table - _maybe_remove(store, 'df2') - store.append('df2', df) - assert store.get_storer('df').is_table + pd.set_option("io.hdf.default_format", "table") + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table - pd.set_option('io.hdf.default_format', None) + pd.set_option("io.hdf.default_format", None) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - pd.set_option('io.hdf.default_format', 'fixed') - df.to_hdf(path, 'df') + pd.set_option("io.hdf.default_format", "fixed") + df.to_hdf(path, "df") with HDFStore(path) as store: - assert not store.get_storer('df').is_table + assert not store.get_storer("df").is_table with pytest.raises(ValueError): df.to_hdf(path, "df2", append=True) - pd.set_option('io.hdf.default_format', 'table') - df.to_hdf(path, 'df3') + pd.set_option("io.hdf.default_format", "table") + df.to_hdf(path, "df3") with HDFStore(path) as store: - assert store.get_storer('df3').is_table - df.to_hdf(path, 'df4', append=True) + assert store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) with HDFStore(path) as store: - assert store.get_storer('df4').is_table + assert store.get_storer("df4").is_table - pd.set_option('io.hdf.default_format', None) + pd.set_option("io.hdf.default_format", None) def test_keys(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeStringSeries() - store['c'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() assert len(store) == 3 - expected = {'/a', '/b', '/c'} + expected = {"/a", "/b", "/c"} assert set(store.keys()) == expected assert set(store) == expected @@ -396,31 +416,31 @@ def test_repr(self): with ensure_clean_store(self.path) as store: repr(store) store.info() - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeStringSeries() - store['c'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - 
df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) - df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ['obj1']] = np.nan + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) - store['df'] = df + store["df"] = df # make a random group in hdf space - store._handle.create_group(store._handle.root, 'bah') + store._handle.create_group(store._handle.root, "bah") assert store.filename in repr(store) assert store.filename in str(store) @@ -430,9 +450,9 @@ def test_repr(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - store.append('df', df) + store.append("df", df) - s = store.get_storer('df') + s = store.get_storer("df") repr(s) str(s) @@ -440,42 +460,42 @@ def test_repr(self): def test_contains(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeDataFrame() - store['foo/bar'] = tm.makeDataFrame() - assert 'a' in store - assert 'b' in store - assert 'c' not in store - assert 'foo/bar' in store - assert '/foo/bar' in store - assert '/foo/b' not in store - assert 'bar' not in store + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store # gh-2694: tables.NaturalNameWarning with catch_warnings(record=True): - store['node())'] = tm.makeDataFrame() - assert 'node())' in store + store["node())"] = tm.makeDataFrame() + assert "node())" in store def test_versioning(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df1') - store.append('df1', df[:10]) - store.append('df1', df[10:]) - assert store.root.a._v_attrs.pandas_version == '0.15.2' - assert store.root.b._v_attrs.pandas_version == '0.15.2' - assert store.root.df1._v_attrs.pandas_version == '0.15.2' + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" # write a file and wipe its versioning - _maybe_remove(store, 'df2') - store.append('df2', df) + _maybe_remove(store, "df2") + store.append("df2", df) # this is an error because its table_type is appendable, but no # version info - store.get_node('df2')._v_attrs.pandas_version = None + store.get_node("df2")._v_attrs.pandas_version = None with pytest.raises(Exception): store.select("df2") @@ -488,7 +508,7 @@ def check(mode): with ensure_clean_path(self.path) as path: # constructor - if mode in ['r', 'r+']: + if mode in ["r", "r+"]: with pytest.raises(IOError): HDFStore(path, mode=mode) @@ -500,7 +520,7 @@ def check(mode): with ensure_clean_path(self.path) as path: # context - if mode in ['r', 'r+']: + if mode 
in ["r", "r+"]: with pytest.raises(IOError): with HDFStore(path, mode=mode) as store: # noqa pass @@ -511,41 +531,41 @@ def check(mode): with ensure_clean_path(self.path) as path: # conv write - if mode in ['r', 'r+']: + if mode in ["r", "r+"]: with pytest.raises(IOError): df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, 'df', mode='w') + df.to_hdf(path, "df", mode="w") else: - df.to_hdf(path, 'df', mode=mode) + df.to_hdf(path, "df", mode=mode) # conv read - if mode in ['w']: + if mode in ["w"]: with pytest.raises(ValueError): read_hdf(path, "df", mode=mode) else: - result = read_hdf(path, 'df', mode=mode) + result = read_hdf(path, "df", mode=mode) assert_frame_equal(result, df) def check_default_mode(): # read_hdf uses default mode with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w') - result = read_hdf(path, 'df') + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") assert_frame_equal(result, df) - check('r') - check('r+') - check('a') - check('w') + check("r") + check("r+") + check("a") + check("w") check_default_mode() def test_reopen_handle(self): with ensure_clean_path(self.path) as path: - store = HDFStore(path, mode='a') - store['a'] = tm.makeTimeSeries() + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() # invalid mode change with pytest.raises(PossibleDataLossError): @@ -555,36 +575,36 @@ def test_reopen_handle(self): assert not store.is_open # truncation ok here - store.open('w') + store.open("w") assert store.is_open assert len(store) == 0 store.close() assert not store.is_open - store = HDFStore(path, mode='a') - store['a'] = tm.makeTimeSeries() + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() # reopen as read - store.open('r') + store.open("r") assert store.is_open assert len(store) == 1 - assert store._mode == 'r' + assert store._mode == "r" store.close() assert not store.is_open # reopen as append - store.open('a') + store.open("a") assert store.is_open assert len(store) == 1 - assert store._mode == 'a' + assert store._mode == "a" store.close() assert not store.is_open # reopen as append (again) - store.open('a') + store.open("a") assert store.is_open assert len(store) == 1 - assert store._mode == 'a' + assert store._mode == "a" store.close() assert not store.is_open @@ -595,13 +615,14 @@ def test_open_args(self): df = tm.makeDataFrame() # create an in memory store - store = HDFStore(path, mode='a', driver='H5FD_CORE', - driver_core_backing_store=0) - store['df'] = df - store.append('df2', df) + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) - tm.assert_frame_equal(store['df'], df) - tm.assert_frame_equal(store['df2'], df) + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) store.close() @@ -611,61 +632,70 @@ def test_open_args(self): def test_flush(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() + store["a"] = tm.makeTimeSeries() store.flush() store.flush(fsync=True) def test_get(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - left = store.get('a') - right = store['a'] + store["a"] = tm.makeTimeSeries() + left = store.get("a") + right = store["a"] tm.assert_series_equal(left, right) - left = store.get('/a') - right = store['/a'] + left = store.get("/a") + right = store["/a"] tm.assert_series_equal(left, right) with pytest.raises(KeyError): store.get("b") - @pytest.mark.parametrize('where, expected', [ 
- ('/', { - '': ({'first_group', 'second_group'}, set()), - '/first_group': (set(), {'df1', 'df2'}), - '/second_group': ({'third_group'}, {'df3', 's1'}), - '/second_group/third_group': (set(), {'df4'}), - }), - ('/second_group', { - '/second_group': ({'third_group'}, {'df3', 's1'}), - '/second_group/third_group': (set(), {'df4'}), - }) - ]) + @pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], + ) def test_walk(self, where, expected): # GH10143 objs = { - 'df1': pd.DataFrame([1, 2, 3]), - 'df2': pd.DataFrame([4, 5, 6]), - 'df3': pd.DataFrame([6, 7, 8]), - 'df4': pd.DataFrame([9, 10, 11]), - 's1': pd.Series([10, 9, 8]), + "df1": pd.DataFrame([1, 2, 3]), + "df2": pd.DataFrame([4, 5, 6]), + "df3": pd.DataFrame([6, 7, 8]), + "df4": pd.DataFrame([9, 10, 11]), + "s1": pd.Series([10, 9, 8]), # Next 3 items aren't pandas objects and should be ignored - 'a1': np.array([[1, 2, 3], [4, 5, 6]]), - 'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'), - 'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i') + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), } - with ensure_clean_store('walk_groups.hdf', mode='w') as store: - store.put('/first_group/df1', objs['df1']) - store.put('/first_group/df2', objs['df2']) - store.put('/second_group/df3', objs['df3']) - store.put('/second_group/s1', objs['s1']) - store.put('/second_group/third_group/df4', objs['df4']) + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) # Create non-pandas objects - store._handle.create_array('/first_group', 'a1', objs['a1']) - store._handle.create_table('/first_group', 'tb1', obj=objs['tb1']) - store._handle.create_table('/second_group', 'tb2', obj=objs['tb2']) + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) assert len(list(store.walk(where=where))) == len(expected) for path, groups, leaves in store.walk(where=where): @@ -674,9 +704,9 @@ def test_walk(self, where, expected): assert expected_groups == set(groups) assert expected_frames == set(leaves) for leaf in leaves: - frame_path = '/'.join([path, leaf]) + frame_path = "/".join([path, leaf]) obj = store.get(frame_path) - if 'df' in leaf: + if "df" in leaf: tm.assert_frame_equal(obj, objs[leaf]) else: tm.assert_series_equal(obj, objs[leaf]) @@ -686,16 +716,16 @@ def test_getattr(self): with ensure_clean_store(self.path) as store: s = tm.makeTimeSeries() - store['a'] = s + store["a"] = s # test attribute access result = store.a tm.assert_series_equal(result, s) - result = getattr(store, 'a') + result = getattr(store, "a") tm.assert_series_equal(result, s) df = tm.makeTimeDataFrame() - store['df'] = df + store["df"] = df result = store.df tm.assert_frame_equal(result, df) @@ -705,7 +735,7 @@ def test_getattr(self): 
getattr(store, x) # not stores - for x in ['mode', 'path', 'handle', 'complib']: + for x in ["mode", "path", "handle", "complib"]: getattr(store, "_%s" % x) def test_put(self): @@ -714,12 +744,12 @@ def test_put(self): ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() - store['a'] = ts - store['b'] = df[:10] - store['foo/bar/bah'] = df[:10] - store['foo'] = df[:10] - store['/foo'] = df[:10] - store.put('c', df[:10], format='table') + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") # not OK, not a table with pytest.raises(ValueError): @@ -727,7 +757,7 @@ def test_put(self): # node does not currently exist, test _is_table_type returns False # in this case - _maybe_remove(store, 'f') + _maybe_remove(store, "f") with pytest.raises(ValueError): store.put("f", df[10:], append=True) @@ -736,43 +766,43 @@ def test_put(self): store.put("c", df[10:], append=True) # overwrite table - store.put('c', df[:10], format='table', append=False) - tm.assert_frame_equal(df[:10], store['c']) + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) def test_put_string_index(self): with ensure_clean_store(self.path) as store: - index = Index( - ["I am a very long string index: %s" % i for i in range(20)]) + index = Index(["I am a very long string index: %s" % i for i in range(20)]) s = Series(np.arange(20), index=index) - df = DataFrame({'A': s, 'B': s}) + df = DataFrame({"A": s, "B": s}) - store['a'] = s - tm.assert_series_equal(store['a'], s) + store["a"] = s + tm.assert_series_equal(store["a"], s) - store['b'] = df - tm.assert_frame_equal(store['b'], df) + store["b"] = df + tm.assert_frame_equal(store["b"], df) # mixed length - index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + - ["I am a very long string index: %s" % i - for i in range(20)]) + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + ["I am a very long string index: %s" % i for i in range(20)] + ) s = Series(np.arange(21), index=index) - df = DataFrame({'A': s, 'B': s}) - store['a'] = s - tm.assert_series_equal(store['a'], s) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) - store['b'] = df - tm.assert_frame_equal(store['b'], df) + store["b"] = df + tm.assert_frame_equal(store["b"], df) def test_put_compression(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame() - store.put('c', df, format='table', complib='zlib') - tm.assert_frame_equal(store['c'], df) + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) # can't compress if format='fixed' with pytest.raises(ValueError): @@ -786,10 +816,10 @@ def test_put_compression_blosc(self): # can't compress if format='fixed' with pytest.raises(ValueError): - store.put('b', df, format='fixed', complib='blosc') + store.put("b", df, format="fixed", complib="blosc") - store.put('c', df, format='table', complib='blosc') - tm.assert_frame_equal(store['c'], df) + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) def test_complibs_default_settings(self): # GH15943 @@ -798,51 +828,51 @@ def test_complibs_default_settings(self): # Set complevel and check if complib is automatically set to # default value with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df', complevel=9) - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df", complevel=9) + result = 
pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 9 - assert node.filters.complib == 'zlib' + assert node.filters.complib == "zlib" # Set complib and check to see if compression is disabled with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df', complib='zlib') - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df", complib="zlib") + result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if not setting complib or complevel results in no compression with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df') - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df") + result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if file-defaults can be overridden on a per table basis with ensure_clean_path(self.path) as tmpfile: store = pd.HDFStore(tmpfile) - store.append('dfc', df, complevel=9, complib='blosc') - store.append('df', df) + store.append("dfc", df, complevel=9, complib="blosc") + store.append("df", df) store.close() - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None - for node in h5file.walk_nodes(where='/dfc', classname='Leaf'): + for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): assert node.filters.complevel == 9 - assert node.filters.complib == 'blosc' + assert node.filters.complib == "blosc" def test_complibs(self): # GH14478 @@ -851,8 +881,8 @@ def test_complibs(self): # Building list of all complibs and complevels tuples all_complibs = tables.filters.all_complibs # Remove lzo if its not available on this platform - if not tables.which_lib_version('lzo'): - all_complibs.remove('lzo') + if not tables.which_lib_version("lzo"): + all_complibs.remove("lzo") # Remove bzip2 if its not available on this platform if not tables.which_lib_version("bzip2"): all_complibs.remove("bzip2") @@ -862,7 +892,7 @@ def test_complibs(self): for (lib, lvl) in all_tests: with ensure_clean_path(self.path) as tmpfile: - gname = 'foo' + gname = "foo" # Write and read file to see if data is consistent df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) @@ -871,9 +901,8 @@ def test_complibs(self): # Open file and check metadata # for correct amount of compression - h5table = tables.open_file(tmpfile, mode='r') - for node in h5table.walk_nodes(where='/' + gname, - classname='Leaf'): + h5table = tables.open_file(tmpfile, mode="r") + for node in h5table.walk_nodes(where="/" + 
gname, classname="Leaf"):
                 assert node.filters.complevel == lvl
                 if lvl == 0:
                     assert node.filters.complib is None
@@ -889,29 +918,29 @@ def test_put_integer(self):
     @xfail_non_writeable
     def test_put_mixed_type(self):
         df = tm.makeTimeDataFrame()
-        df['obj1'] = 'foo'
-        df['obj2'] = 'bar'
-        df['bool1'] = df['A'] > 0
-        df['bool2'] = df['B'] > 0
-        df['bool3'] = True
-        df['int1'] = 1
-        df['int2'] = 2
-        df['timestamp1'] = Timestamp('20010102')
-        df['timestamp2'] = Timestamp('20010103')
-        df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0)
-        df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0)
-        df.loc[3:6, ['obj1']] = np.nan
+        df["obj1"] = "foo"
+        df["obj2"] = "bar"
+        df["bool1"] = df["A"] > 0
+        df["bool2"] = df["B"] > 0
+        df["bool3"] = True
+        df["int1"] = 1
+        df["int2"] = 2
+        df["timestamp1"] = Timestamp("20010102")
+        df["timestamp2"] = Timestamp("20010103")
+        df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
+        df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
+        df.loc[3:6, ["obj1"]] = np.nan
         df = df._consolidate()._convert(datetime=True)

         with ensure_clean_store(self.path) as store:
-            _maybe_remove(store, 'df')
+            _maybe_remove(store, "df")

             # PerformanceWarning
             with catch_warnings(record=True):
                 simplefilter("ignore", pd.errors.PerformanceWarning)
-                store.put('df', df)
+                store.put("df", df)

-            expected = store.get('df')
+            expected = store.get("df")
             tm.assert_frame_equal(expected, df)

     @pytest.mark.filterwarnings(
@@ -926,56 +955,64 @@ def test_append(self):

         with catch_warnings(record=True):
             df = tm.makeTimeDataFrame()
-            _maybe_remove(store, 'df1')
-            store.append('df1', df[:10])
-            store.append('df1', df[10:])
-            tm.assert_frame_equal(store['df1'], df)
+            _maybe_remove(store, "df1")
+            store.append("df1", df[:10])
+            store.append("df1", df[10:])
+            tm.assert_frame_equal(store["df1"], df)

-            _maybe_remove(store, 'df2')
-            store.put('df2', df[:10], format='table')
-            store.append('df2', df[10:])
-            tm.assert_frame_equal(store['df2'], df)
+            _maybe_remove(store, "df2")
+            store.put("df2", df[:10], format="table")
+            store.append("df2", df[10:])
+            tm.assert_frame_equal(store["df2"], df)

-            _maybe_remove(store, 'df3')
-            store.append('/df3', df[:10])
-            store.append('/df3', df[10:])
-            tm.assert_frame_equal(store['df3'], df)
+            _maybe_remove(store, "df3")
+            store.append("/df3", df[:10])
+            store.append("/df3", df[10:])
+            tm.assert_frame_equal(store["df3"], df)

             # this is allowed but almost always don't want to do it
             # tables.NaturalNameWarning
-            _maybe_remove(store, '/df3 foo')
-            store.append('/df3 foo', df[:10])
-            store.append('/df3 foo', df[10:])
-            tm.assert_frame_equal(store['df3 foo'], df)
+            _maybe_remove(store, "/df3 foo")
+            store.append("/df3 foo", df[:10])
+            store.append("/df3 foo", df[10:])
+            tm.assert_frame_equal(store["df3 foo"], df)

             # dtype issues - mixed type in a single object column
             df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
-            df['mixed_column'] = 'testing'
-            df.loc[2, 'mixed_column'] = np.nan
-            _maybe_remove(store, 'df')
-            store.append('df', df)
-            tm.assert_frame_equal(store['df'], df)
+            df["mixed_column"] = "testing"
+            df.loc[2, "mixed_column"] = np.nan
+            _maybe_remove(store, "df")
+            store.append("df", df)
+            tm.assert_frame_equal(store["df"], df)

             # uints - test storage of uints
-            uint_data = DataFrame({
-                'u08': Series(np.random.randint(0, high=255, size=5),
-                              dtype=np.uint8),
-                'u16': Series(np.random.randint(0, high=65535, size=5),
-                              dtype=np.uint16),
-                'u32': Series(np.random.randint(0, high=2**30, size=5),
-                              dtype=np.uint32),
-                'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
-                              dtype=np.uint64)}, index=np.arange(5))
-            _maybe_remove(store, 'uints')
-            store.append('uints', uint_data)
-            tm.assert_frame_equal(store['uints'], uint_data)
+            uint_data = DataFrame(
+                {
+                    "u08": Series(
+                        np.random.randint(0, high=255, size=5), dtype=np.uint8
+                    ),
+                    "u16": Series(
+                        np.random.randint(0, high=65535, size=5), dtype=np.uint16
+                    ),
+                    "u32": Series(
+                        np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32
+                    ),
+                    "u64": Series(
+                        [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62],
+                        dtype=np.uint64,
+                    ),
+                },
+                index=np.arange(5),
+            )
+            _maybe_remove(store, "uints")
+            store.append("uints", uint_data)
+            tm.assert_frame_equal(store["uints"], uint_data)

             # uints - test storage of uints in indexable columns
-            _maybe_remove(store, 'uints')
+            _maybe_remove(store, "uints")
             # 64-bit indices not yet supported
-            store.append('uints', uint_data, data_columns=[
-                         'u08', 'u16', 'u32'])
-            tm.assert_frame_equal(store['uints'], uint_data)
+            store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
+            tm.assert_frame_equal(store["uints"], uint_data)

     def test_append_series(self):

@@ -986,42 +1023,42 @@ def test_append_series(self):
         with ensure_clean_store(self.path) as store:

             # basic
             ss = tm.makeStringSeries()
             ts = tm.makeTimeSeries()
             ns = Series(np.arange(100))

-            store.append('ss', ss)
-            result = store['ss']
+            store.append("ss", ss)
+            result = store["ss"]
             tm.assert_series_equal(result, ss)
             assert result.name is None

-            store.append('ts', ts)
-            result = store['ts']
+            store.append("ts", ts)
+            result = store["ts"]
             tm.assert_series_equal(result, ts)
             assert result.name is None

-            ns.name = 'foo'
-            store.append('ns', ns)
-            result = store['ns']
+            ns.name = "foo"
+            store.append("ns", ns)
+            result = store["ns"]
             tm.assert_series_equal(result, ns)
             assert result.name == ns.name

             # select on the values
             expected = ns[ns > 60]
-            result = store.select('ns', 'foo>60')
+            result = store.select("ns", "foo>60")
             tm.assert_series_equal(result, expected)

             # select on the index and values
             expected = ns[(ns > 70) & (ns.index < 90)]
-            result = store.select('ns', 'foo>70 and index<90')
+            result = store.select("ns", "foo>70 and index<90")
             tm.assert_series_equal(result, expected)

             # multi-index
-            mi = DataFrame(np.random.randn(5, 1), columns=['A'])
-            mi['B'] = np.arange(len(mi))
-            mi['C'] = 'foo'
-            mi.loc[3:5, 'C'] = 'bar'
-            mi.set_index(['C', 'B'], inplace=True)
+            mi = DataFrame(np.random.randn(5, 1), columns=["A"])
+            mi["B"] = np.arange(len(mi))
+            mi["C"] = "foo"
+            mi.loc[3:5, "C"] = "bar"
+            mi.set_index(["C", "B"], inplace=True)
             s = mi.stack()
             s.index = s.index.droplevel(2)
-            store.append('mi', s)
-            tm.assert_series_equal(store['mi'], s)
+            store.append("mi", s)
+            tm.assert_series_equal(store["mi"], s)

     def test_store_index_types(self):
         # GH5386
@@ -1030,76 +1067,82 @@ def test_store_index_types(self):
         with ensure_clean_store(self.path) as store:

             def check(format, index):
-                df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
+                df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
                 df.index = index(len(df))
-                _maybe_remove(store, 'df')
-                store.put('df', df, format=format)
-                assert_frame_equal(df, store['df'])
+                _maybe_remove(store, "df")
+                store.put("df", df, format=format)
+                assert_frame_equal(df, store["df"])

-            for index in [tm.makeFloatIndex, tm.makeStringIndex,
-                          tm.makeIntIndex, tm.makeDateIndex]:
+            for index in [
+                tm.makeFloatIndex,
+                tm.makeStringIndex,
+                tm.makeIntIndex,
+                tm.makeDateIndex,
+            ]:

-                check('table', index)
-                check('fixed', index)
+                check("table", index)
+                check("fixed", index)

             # period index currently broken for table
             # see GH7796 FIXME
-            check('fixed', 
tm.makePeriodIndex) + check("fixed", tm.makePeriodIndex) # check('table',tm.makePeriodIndex) # unicode index = tm.makeUnicodeIndex - check('table', index) - check('fixed', index) + check("table", index) + check("fixed", index) - @pytest.mark.skipif(not is_platform_little_endian(), - reason="reason platform is not little endian") + @pytest.mark.skipif( + not is_platform_little_endian(), reason="reason platform is not little endian" + ) def test_encoding(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(A='foo', B='bar'), index=range(5)) - df.loc[2, 'A'] = np.nan - df.loc[3, 'B'] = np.nan - _maybe_remove(store, 'df') - store.append('df', df, encoding='ascii') - tm.assert_frame_equal(store['df'], df) - - expected = df.reindex(columns=['A']) - result = store.select('df', Term('columns=A', encoding='ascii')) + df = DataFrame(dict(A="foo", B="bar"), index=range(5)) + df.loc[2, "A"] = np.nan + df.loc[3, "B"] = np.nan + _maybe_remove(store, "df") + store.append("df", df, encoding="ascii") + tm.assert_frame_equal(store["df"], df) + + expected = df.reindex(columns=["A"]) + result = store.select("df", Term("columns=A", encoding="ascii")) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('val', [ - [b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c'] - ]) - @pytest.mark.parametrize('dtype', ['category', object]) + @pytest.mark.parametrize( + "val", + [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ], + ) + @pytest.mark.parametrize("dtype", ["category", object]) def test_latin_encoding(self, dtype, val): - enc = 'latin-1' - nan_rep = '' - key = 'data' + enc = "latin-1" + nan_rep = "" + key = "data" val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] ser = pd.Series(val, dtype=dtype) with ensure_clean_path(self.path) as store: - ser.to_hdf(store, key, format='table', encoding=enc, - nan_rep=nan_rep) + ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = ser.replace(nan_rep, np.nan) if is_categorical_dtype(s_nan): assert is_categorical_dtype(retr) - assert_series_equal(s_nan, retr, check_dtype=False, - check_categorical=False) + assert_series_equal(s_nan, retr, check_dtype=False, check_categorical=False) else: assert_series_equal(s_nan, retr) @@ -1111,139 +1154,156 @@ def test_latin_encoding(self, dtype, val): def test_append_some_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'), - 'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar', - 'D': Timestamp("20010101"), - 'E': datetime.datetime(2001, 1, 2, 0, 0)}, - index=np.arange(20)) + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) # some nans - _maybe_remove(store, 
'df1') - df.loc[0:15, ['A1', 'B', 'D', 'E']] = np.nan - store.append('df1', df[:10]) - store.append('df1', df[10:]) - tm.assert_frame_equal(store['df1'], df) + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) # first column df1 = df.copy() - df1.loc[:, 'A1'] = np.nan - _maybe_remove(store, 'df1') - store.append('df1', df1[:10]) - store.append('df1', df1[10:]) - tm.assert_frame_equal(store['df1'], df1) + df1.loc[:, "A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) # 2nd column df2 = df.copy() - df2.loc[:, 'A2'] = np.nan - _maybe_remove(store, 'df2') - store.append('df2', df2[:10]) - store.append('df2', df2[10:]) - tm.assert_frame_equal(store['df2'], df2) + df2.loc[:, "A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) # datetimes df3 = df.copy() - df3.loc[:, 'E'] = np.nan - _maybe_remove(store, 'df3') - store.append('df3', df3[:10]) - store.append('df3', df3[10:]) - tm.assert_frame_equal(store['df3'], df3) + df3.loc[:, "E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) def test_append_all_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20)}, - index=np.arange(20)) + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan # nan some entire rows (dropna=True) - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df[-4:]) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) # nan some entire rows (dropna=False) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # tests the option io.hdf.dropna_table - pd.set_option('io.hdf.dropna_table', False) - _maybe_remove(store, 'df3') - store.append('df3', df[:10]) - store.append('df3', df[10:]) - tm.assert_frame_equal(store['df3'], df) - - pd.set_option('io.hdf.dropna_table', True) - _maybe_remove(store, 'df4') - store.append('df4', df[:10]) - store.append('df4', df[10:]) - tm.assert_frame_equal(store['df4'], df[-4:]) + pd.set_option("io.hdf.dropna_table", False) + _maybe_remove(store, "df3") + store.append("df3", df[:10]) + store.append("df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + pd.set_option("io.hdf.dropna_table", True) + _maybe_remove(store, "df4") + store.append("df4", df[:10]) + store.append("df4", df[10:]) + tm.assert_frame_equal(store["df4"], df[-4:]) # nan some entire rows (string are still written!) 
- df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar'}, - index=np.arange(20)) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + }, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # nan some entire rows (but since we have dates they are still # written!) - df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar', - 'D': Timestamp("20010101"), - 'E': datetime.datetime(2001, 1, 2, 0, 0)}, - index=np.arange(20)) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # Test to make sure defaults are to not drop. 
# Corresponding to Issue 9382 df_with_missing = DataFrame( - {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]}) + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) with ensure_clean_path(self.path) as path: - df_with_missing.to_hdf(path, 'df_with_missing', format='table') - reloaded = read_hdf(path, 'df_with_missing') + df_with_missing.to_hdf(path, "df_with_missing", format="table") + reloaded = read_hdf(path, "df_with_missing") tm.assert_frame_equal(df_with_missing, reloaded) def test_read_missing_key_close_store(self): # GH 25766 with ensure_clean_path(self.path) as path: - df = pd.DataFrame({'a': range(2), 'b': range(2)}) - df.to_hdf(path, 'k1') + df = pd.DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") with pytest.raises(KeyError): - pd.read_hdf(path, 'k2') + pd.read_hdf(path, "k2") # smoke test to test that file is properly closed after # read with KeyError before another write - df.to_hdf(path, 'k2') + df.to_hdf(path, "k2") def test_append_frame_column_oriented(self): @@ -1251,25 +1311,23 @@ def test_append_frame_column_oriented(self): # column oriented df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df1') - store.append('df1', df.iloc[:, :2], axes=['columns']) - store.append('df1', df.iloc[:, 2:]) - tm.assert_frame_equal(store['df1'], df) + _maybe_remove(store, "df1") + store.append("df1", df.iloc[:, :2], axes=["columns"]) + store.append("df1", df.iloc[:, 2:]) + tm.assert_frame_equal(store["df1"], df) - result = store.select('df1', 'columns=A') - expected = df.reindex(columns=['A']) + result = store.select("df1", "columns=A") + expected = df.reindex(columns=["A"]) tm.assert_frame_equal(expected, result) # selection on the non-indexable - result = store.select( - 'df1', ('columns=A', 'index=df.index[0:4]')) - expected = df.reindex(columns=['A'], index=df.index[0:4]) + result = store.select("df1", ("columns=A", "index=df.index[0:4]")) + expected = df.reindex(columns=["A"], index=df.index[0:4]) tm.assert_frame_equal(expected, result) # this isn't supported with pytest.raises(TypeError): - store.select('df1', - 'columns=A and index>df.index[4]') + store.select("df1", "columns=A and index>df.index[4]") def test_append_with_different_block_ordering(self): @@ -1278,42 +1336,41 @@ def test_append_with_different_block_ordering(self): for i in range(10): - df = DataFrame(np.random.randn(10, 2), columns=list('AB')) - df['index'] = range(10) - df['index'] += i * 10 - df['int64'] = Series([1] * len(df), dtype='int64') - df['int16'] = Series([1] * len(df), dtype='int16') + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df["index"] = range(10) + df["index"] += i * 10 + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") if i % 2 == 0: - del df['int64'] - df['int64'] = Series([1] * len(df), dtype='int64') + del df["int64"] + df["int64"] = Series([1] * len(df), dtype="int64") if i % 3 == 0: - a = df.pop('A') - df['A'] = a + a = df.pop("A") + df["A"] = a - df.set_index('index', inplace=True) + df.set_index("index", inplace=True) - store.append('df', df) + store.append("df", df) # test a different ordering but with more fields (like invalid # combinate) with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(10, 2), - columns=list('AB'), dtype='float64') - df['int64'] = Series([1] * len(df), dtype='int64') - df['int16'] = Series([1] * len(df), dtype='int16') - store.append('df', df) + df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") + df["int64"] = 
Series([1] * len(df), dtype="int64")
+            df["int16"] = Series([1] * len(df), dtype="int16")
+            store.append("df", df)

             # store additional fields in different blocks
-            df['int16_2'] = Series([1] * len(df), dtype='int16')
+            df["int16_2"] = Series([1] * len(df), dtype="int16")
             with pytest.raises(ValueError):
-                store.append('df', df)
+                store.append("df", df)

             # store multiple additional fields in different blocks
-            df['float_3'] = Series([1.] * len(df), dtype='float64')
+            df["float_3"] = Series([1.0] * len(df), dtype="float64")
             with pytest.raises(ValueError):
-                store.append('df', df)
+                store.append("df", df)

     def test_append_with_strings(self):

@@ -1321,309 +1378,311 @@ def test_append_with_strings(self):
         with catch_warnings(record=True):

             def check_col(key, name, size):
-                assert getattr(store.get_storer(key)
-                               .table.description, name).itemsize == size
+                assert (
+                    getattr(store.get_storer(key).table.description, name).itemsize
+                    == size
+                )

             # avoid truncation on elements
-            df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
-            store.append('df_big', df)
-            tm.assert_frame_equal(store.select('df_big'), df)
-            check_col('df_big', 'values_block_1', 15)
+            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
+            store.append("df_big", df)
+            tm.assert_frame_equal(store.select("df_big"), df)
+            check_col("df_big", "values_block_1", 15)

             # appending smaller string ok
-            df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']])
-            store.append('df_big', df2)
+            df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
+            store.append("df_big", df2)
             expected = concat([df, df2])
-            tm.assert_frame_equal(store.select('df_big'), expected)
-            check_col('df_big', 'values_block_1', 15)
+            tm.assert_frame_equal(store.select("df_big"), expected)
+            check_col("df_big", "values_block_1", 15)

             # avoid truncation on elements
-            df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']])
-            store.append('df_big2', df, min_itemsize={'values': 50})
-            tm.assert_frame_equal(store.select('df_big2'), df)
-            check_col('df_big2', 'values_block_1', 50)
+            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
+            store.append("df_big2", df, min_itemsize={"values": 50})
+            tm.assert_frame_equal(store.select("df_big2"), df)
+            check_col("df_big2", "values_block_1", 50)

             # bigger string on next append
-            store.append('df_new', df)
+            store.append("df_new", df)
             df_new = DataFrame(
-                [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']])
+                [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
+            )
             with pytest.raises(ValueError):
-                store.append('df_new', df_new)
+                store.append("df_new", df_new)

             # min_itemsize on Series index (GH 11412)
-            df = tm.makeMixedDataFrame().set_index('C')
-            store.append('ss', df['B'], min_itemsize={'index': 4})
-            tm.assert_series_equal(store.select('ss'), df['B'])
+            df = tm.makeMixedDataFrame().set_index("C")
+            store.append("ss", df["B"], min_itemsize={"index": 4})
+            tm.assert_series_equal(store.select("ss"), df["B"])

             # same as above, with data_columns=True
-            store.append('ss2', df['B'], data_columns=True,
-                         min_itemsize={'index': 4})
-            tm.assert_series_equal(store.select('ss2'), df['B'])
+            store.append(
+                "ss2", df["B"], data_columns=True, min_itemsize={"index": 4}
+            )
+            tm.assert_series_equal(store.select("ss2"), df["B"])

             # min_itemsize in index without appending (GH 10381)
-            store.put('ss3', df, format='table',
-                      min_itemsize={'index': 6})
+            store.put("ss3", df, format="table", min_itemsize={"index": 6})
             # just make sure there is a longer string:
-            df2 = 
df.copy().reset_index().assign(C='longer').set_index('C') - store.append('ss3', df2) - tm.assert_frame_equal(store.select('ss3'), - pd.concat([df, df2])) + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + store.append("ss3", df2) + tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) # same as above, with a Series - store.put('ss4', df['B'], format='table', - min_itemsize={'index': 6}) - store.append('ss4', df2['B']) - tm.assert_series_equal(store.select('ss4'), - pd.concat([df['B'], df2['B']])) + store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) + store.append("ss4", df2["B"]) + tm.assert_series_equal( + store.select("ss4"), pd.concat([df["B"], df2["B"]]) + ) # with nans - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.loc[1:4, 'string'] = np.nan - df['string2'] = 'bar' - df.loc[4:8, 'string2'] = np.nan - df['string3'] = 'bah' - df.loc[1:, 'string3'] = np.nan - store.append('df', df) - result = store.select('df') + df["string"] = "foo" + df.loc[1:4, "string"] = np.nan + df["string2"] = "bar" + df.loc[4:8, "string2"] = np.nan + df["string3"] = "bah" + df.loc[1:, "string3"] = np.nan + store.append("df", df) + result = store.select("df") tm.assert_frame_equal(result, df) with ensure_clean_store(self.path) as store: def check_col(key, name, size): - assert getattr(store.get_storer(key) - .table.description, name).itemsize, size + assert getattr( + store.get_storer(key).table.description, name + ).itemsize, size - df = DataFrame(dict(A='foo', B='bar'), index=range(10)) + df = DataFrame(dict(A="foo", B="bar"), index=range(10)) # a min_itemsize that creates a data_column - _maybe_remove(store, 'df') - store.append('df', df, min_itemsize={'A': 200}) - check_col('df', 'A', 200) - assert store.get_storer('df').data_columns == ['A'] + _maybe_remove(store, "df") + store.append("df", df, min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["A"] # a min_itemsize that creates a data_column2 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['B'], min_itemsize={'A': 200}) - check_col('df', 'A', 200) - assert store.get_storer('df').data_columns == ['B', 'A'] + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["B", "A"] # a min_itemsize that creates a data_column2 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=[ - 'B'], min_itemsize={'values': 200}) - check_col('df', 'B', 200) - check_col('df', 'values_block_0', 200) - assert store.get_storer('df').data_columns == ['B'] + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) + check_col("df", "B", 200) + check_col("df", "values_block_0", 200) + assert store.get_storer("df").data_columns == ["B"] # infer the .typ on subsequent appends - _maybe_remove(store, 'df') - store.append('df', df[:5], min_itemsize=200) - store.append('df', df[5:], min_itemsize=200) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:5], min_itemsize=200) + store.append("df", df[5:], min_itemsize=200) + tm.assert_frame_equal(store["df"], df) # invalid min_itemsize keys - df = DataFrame(['foo', 'foo', 'foo', 'barh', - 'barh', 'barh'], columns=['A']) - _maybe_remove(store, 'df') + df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) + _maybe_remove(store, 
"df") with pytest.raises(ValueError): - store.append('df', df, min_itemsize={'foo': 20, 'foobar': 20}) + store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) def test_append_with_empty_string(self): with ensure_clean_store(self.path) as store: # with all empty strings (GH 12242) - df = DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', '']}) - store.append('df', df[:-1], min_itemsize={'x': 1}) - store.append('df', df[-1:], min_itemsize={'x': 1}) - tm.assert_frame_equal(store.select('df'), df) + df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) + store.append("df", df[:-1], min_itemsize={"x": 1}) + store.append("df", df[-1:], min_itemsize={"x": 1}) + tm.assert_frame_equal(store.select("df"), df) def test_to_hdf_with_min_itemsize(self): with ensure_clean_path(self.path) as path: # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index('C') - df.to_hdf(path, 'ss3', format='table', min_itemsize={'index': 6}) + df = tm.makeMixedDataFrame().set_index("C") + df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C='longer').set_index('C') - df2.to_hdf(path, 'ss3', append=True, format='table') - tm.assert_frame_equal(pd.read_hdf(path, 'ss3'), - pd.concat([df, df2])) + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + df2.to_hdf(path, "ss3", append=True, format="table") + tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) # same as above, with a Series - df['B'].to_hdf(path, 'ss4', format='table', - min_itemsize={'index': 6}) - df2['B'].to_hdf(path, 'ss4', append=True, format='table') - tm.assert_series_equal(pd.read_hdf(path, 'ss4'), - pd.concat([df['B'], df2['B']])) + df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, "ss4", append=True, format="table") + tm.assert_series_equal( + pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) + ) @pytest.mark.parametrize( - "format", - [pytest.param('fixed', marks=xfail_non_writeable), - 'table']) + "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"] + ) def test_to_hdf_errors(self, format): - data = ['\ud800foo'] + data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) with ensure_clean_path(self.path) as path: # GH 20835 - ser.to_hdf(path, 'table', format=format, errors='surrogatepass') + ser.to_hdf(path, "table", format=format, errors="surrogatepass") - result = pd.read_hdf(path, 'table', errors='surrogatepass') + result = pd.read_hdf(path, "table", errors="surrogatepass") tm.assert_series_equal(result, ser) def test_append_with_data_columns(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame() - df.iloc[0, df.columns.get_loc('B')] = 1. 
- _maybe_remove(store, 'df') - store.append('df', df[:2], data_columns=['B']) - store.append('df', df[2:]) - tm.assert_frame_equal(store['df'], df) + df.iloc[0, df.columns.get_loc("B")] = 1.0 + _maybe_remove(store, "df") + store.append("df", df[:2], data_columns=["B"]) + store.append("df", df[2:]) + tm.assert_frame_equal(store["df"], df) # check that we have indices created - assert(store._handle.root.df.table.cols.index.is_indexed is True) - assert(store._handle.root.df.table.cols.B.is_indexed is True) + assert store._handle.root.df.table.cols.index.is_indexed is True + assert store._handle.root.df.table.cols.B.is_indexed is True # data column searching - result = store.select('df', 'B>0') + result = store.select("df", "B>0") expected = df[df.B > 0] tm.assert_frame_equal(result, expected) # data column searching (with an indexable and a data_columns) - result = store.select( - 'df', 'B>0 and index>df.index[3]') + result = store.select("df", "B>0 and index>df.index[3]") df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) # data column selection with a string data_column df_new = df.copy() - df_new['string'] = 'foo' - df_new.loc[1:4, 'string'] = np.nan - df_new.loc[5:6, 'string'] = 'bar' - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string']) - result = store.select('df', "string='foo'") - expected = df_new[df_new.string == 'foo'] + df_new["string"] = "foo" + df_new.loc[1:4, "string"] = np.nan + df_new.loc[5:6, "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] tm.assert_frame_equal(result, expected) # using min_itemsize and a data column def check_col(key, name, size): - assert getattr(store.get_storer(key) - .table.description, name).itemsize == size + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string'], - min_itemsize={'string': 30}) - check_col('df', 'string', 30) - _maybe_remove(store, 'df') + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], min_itemsize={"string": 30} + ) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") store.append( - 'df', df_new, data_columns=['string'], min_itemsize=30) - check_col('df', 'string', 30) - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string'], - min_itemsize={'values': 30}) - check_col('df', 'string', 30) + "df", df_new, data_columns=["string"], min_itemsize={"values": 30} + ) + check_col("df", "string", 30) with ensure_clean_store(self.path) as store: - df_new['string2'] = 'foobarbah' - df_new['string_block1'] = 'foobarbah1' - df_new['string_block2'] = 'foobarbah2' - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string', 'string2'], - min_itemsize={'string': 30, 'string2': 40, - 'values': 50}) - check_col('df', 'string', 30) - check_col('df', 'string2', 40) - check_col('df', 'values_block_1', 50) + df_new["string2"] = "foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, 
"string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) with ensure_clean_store(self.path) as store: # multiple data columns df_new = df.copy() - df_new.iloc[0, df_new.columns.get_loc('A')] = 1. - df_new.iloc[0, df_new.columns.get_loc('B')] = -1. - df_new['string'] = 'foo' + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" - sl = df_new.columns.get_loc('string') + sl = df_new.columns.get_loc("string") df_new.iloc[1:4, sl] = np.nan - df_new.iloc[5:6, sl] = 'bar' + df_new.iloc[5:6, sl] = "bar" - df_new['string2'] = 'foo' - sl = df_new.columns.get_loc('string2') + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") df_new.iloc[2:5, sl] = np.nan - df_new.iloc[7:8, sl] = 'bar' - _maybe_remove(store, 'df') - store.append( - 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', - "string='foo' and string2='foo'" - " and A>0 and B<0") - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select( + "df", "string='foo' and string2='foo'" " and A>0 and B<0" + ) + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] tm.assert_frame_equal(result, expected, check_index_type=False) # yield an empty frame - result = store.select('df', "string='foo' and string2='cool'") - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'cool')] + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: # doc example df_dc = df.copy() - df_dc['string'] = 'foo' - df_dc.loc[4:6, 'string'] = np.nan - df_dc.loc[7:9, 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc['datetime'] = Timestamp('20010102') + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) - df_dc.loc[3:5, ['A', 'B', 'datetime']] = np.nan + df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan - _maybe_remove(store, 'df_dc') - store.append('df_dc', df_dc, - data_columns=['B', 'C', 'string', - 'string2', 'datetime']) - result = store.select('df_dc', 'B>0') + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected, check_index_type=False) - result = store.select( - 'df_dc', ['B > 0', 'C > 0', 'string == foo']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( - df_dc.string == 'foo')] + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: # doc example part 2 np.random.seed(1234) - index = date_range('1/1/2000', periods=8) - df_dc = DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) - df_dc['string'] = 'foo' - df_dc.loc[4:6, 'string'] = 
np.nan - df_dc.loc[7:9, 'string'] = 'bar' - df_dc.loc[:, ['B', 'C']] = df_dc.loc[:, ['B', 'C']].abs() - df_dc['string2'] = 'cool' + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame( + np.random.randn(8, 3), index=index, columns=["A", "B", "C"] + ) + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() + df_dc["string2"] = "cool" # on-disk operations - store.append('df_dc', df_dc, data_columns=[ - 'B', 'C', 'string', 'string2']) + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) - result = store.select('df_dc', 'B>0') + result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) - result = store.select( - 'df_dc', ['B > 0', 'C > 0', 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & - (df_dc.string == 'foo')] + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected) def test_create_table_index(self): @@ -1631,64 +1690,64 @@ def test_create_table_index(self): with ensure_clean_store(self.path) as store: with catch_warnings(record=True): + def col(t, column): return getattr(store.get_storer(t).table.cols, column) # data columns df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df['string2'] = 'bar' - store.append('f', df, data_columns=['string', 'string2']) - assert(col('f', 'index').is_indexed is True) - assert(col('f', 'string').is_indexed is True) - assert(col('f', 'string2').is_indexed is True) + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True # specify index=columns store.append( - 'f2', df, index=['string'], - data_columns=['string', 'string2']) - assert(col('f2', 'index').is_indexed is False) - assert(col('f2', 'string').is_indexed is True) - assert(col('f2', 'string2').is_indexed is False) + "f2", df, index=["string"], data_columns=["string", "string2"] + ) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False # try to index a non-table - _maybe_remove(store, 'f2') - store.put('f2', df) + _maybe_remove(store, "f2") + store.put("f2", df) with pytest.raises(TypeError): - store.create_table_index('f2') + store.create_table_index("f2") def test_append_hierarchical(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(self.path) as store: - store.append('mi', df) - result = store.select('mi') + store.append("mi", df) + result = store.select("mi") tm.assert_frame_equal(result, df) # GH 3748 - result = store.select('mi', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + result = store.select("mi", columns=["A", "B"]) + expected = 
df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) - with ensure_clean_path('test.hdf') as path: - df.to_hdf(path, 'df', format='table') - result = read_hdf(path, 'df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + with ensure_clean_path("test.hdf") as path: + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_column_multiindex(self): # GH 4710 # recreate multi-indexes properly - index = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), - ('B', 'a'), ('B', 'b')], - names=['first', 'second']) + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) df = DataFrame(np.arange(12).reshape(3, 4), columns=index) expected = df.copy() if isinstance(expected.index, RangeIndex): @@ -1696,41 +1755,42 @@ def test_column_multiindex(self): with ensure_clean_store(self.path) as store: - store.put('df', df) - tm.assert_frame_equal(store['df'], expected, - check_index_type=True, - check_column_type=True) + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) - store.put('df1', df, format='table') - tm.assert_frame_equal(store['df1'], expected, - check_index_type=True, - check_column_type=True) + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) with pytest.raises(ValueError): - store.put('df2', df, format='table', data_columns=['A']) + store.put("df2", df, format="table", data_columns=["A"]) with pytest.raises(ValueError): - store.put('df3', df, format='table', data_columns=True) + store.put("df3", df, format="table", data_columns=True) # appending multi-column on existing table (see GH 6167) with ensure_clean_store(self.path) as store: - store.append('df2', df) - store.append('df2', df) + store.append("df2", df) + store.append("df2", df) - tm.assert_frame_equal(store['df2'], concat((df, df))) + tm.assert_frame_equal(store["df2"], concat((df, df))) # non_index_axes name - df = DataFrame(np.arange(12).reshape(3, 4), - columns=Index(list('ABCD'), name='foo')) + df = DataFrame( + np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") + ) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: - store.put('df1', df, format='table') - tm.assert_frame_equal(store['df1'], expected, - check_index_type=True, - check_column_type=True) + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) def test_store_multiindex(self): @@ -1739,97 +1799,108 @@ def test_store_multiindex(self): with ensure_clean_store(self.path) as store: def make_index(names=None): - return MultiIndex.from_tuples([(datetime.datetime(2013, 12, d), - s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3)], - names=names) + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) # no names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index()) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], 
index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) # partial names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', None, None])) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) # series - _maybe_remove(store, 's') - s = Series(np.zeros(12), index=make_index(['date', None, None])) - store.append('s', s) - xp = Series(np.zeros(12), index=make_index( - ['date', 'level_1', 'level_2'])) - tm.assert_series_equal(store.select('s'), xp) + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) # dup with column - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', 'a', 't'])) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) # dup within level - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], - index=make_index(['date', 'date', 'date'])) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) # fully names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', 's', 't'])) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) def test_select_columns_in_where(self): # GH 6169 # recreate multi-indexes when columns is passed # in the `where` argument - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo_name', 'bar_name']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo_name", "bar_name"], + ) # With a DataFrame - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table') - expected = df[['A']] + store.put("df", df, format="table") + expected = df[["A"]] - tm.assert_frame_equal(store.select('df', columns=['A']), expected) + tm.assert_frame_equal(store.select("df", columns=["A"]), expected) - tm.assert_frame_equal(store.select( - 'df', where="columns=['A']"), expected) + tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) # With a Series - s = Series(np.random.randn(10), index=index, - name='A') + s = Series(np.random.randn(10), index=index, name="A") with ensure_clean_store(self.path) as store: - store.put('s', s, format='table') - 
tm.assert_series_equal(store.select('s', where="columns=['A']"), s) + store.put("s", s, format="table") + tm.assert_series_equal(store.select("s", where="columns=['A']"), s) def test_mi_data_columns(self): # GH 14435 - idx = pd.MultiIndex.from_arrays([date_range('2000-01-01', periods=5), - range(5)], names=['date', 'id']) - df = pd.DataFrame({'a': [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + idx = pd.MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=True) + store.append("df", df, data_columns=True) - actual = store.select('df', where='id == 1') + actual = store.select("df", where="id == 1") expected = df.iloc[[1], :] tm.assert_frame_equal(actual, expected) @@ -1838,62 +1909,62 @@ def test_pass_spec_to_storer(self): df = tm.makeDataFrame() with ensure_clean_store(self.path) as store: - store.put('df', df) + store.put("df", df) with pytest.raises(TypeError): - store.select('df', columns=['A']) + store.select("df", columns=["A"]) with pytest.raises(TypeError): - store.select('df', where=[('columns=A')]) + store.select("df", where=[("columns=A")]) @xfail_non_writeable def test_append_misc(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - store.append('df', df, chunksize=1) - result = store.select('df') + store.append("df", df, chunksize=1) + result = store.select("df") tm.assert_frame_equal(result, df) - store.append('df1', df, expectedrows=10) - result = store.select('df1') + store.append("df1", df, expectedrows=10) + result = store.select("df1") tm.assert_frame_equal(result, df) # more chunksize in append tests def check(obj, comparator): for c in [10, 200, 1000]: - with ensure_clean_store(self.path, mode='w') as store: - store.append('obj', obj, chunksize=c) - result = store.select('obj') + with ensure_clean_store(self.path, mode="w") as store: + store.append("obj", obj, chunksize=c) + result = store.select("obj") comparator(result, obj) df = tm.makeDataFrame() - df['string'] = 'foo' - df['float322'] = 1. 
- df['float322'] = df['float322'].astype('float32') - df['bool'] = df['float322'] > 0 - df['time1'] = Timestamp('20130101') - df['time2'] = Timestamp('20130102') + df["string"] = "foo" + df["float322"] = 1.0 + df["float322"] = df["float322"].astype("float32") + df["bool"] = df["float322"] > 0 + df["time1"] = Timestamp("20130101") + df["time2"] = Timestamp("20130102") check(df, tm.assert_frame_equal) # empty frame, GH4273 with ensure_clean_store(self.path) as store: # 0 len - df_empty = DataFrame(columns=list('ABC')) - store.append('df', df_empty) + df_empty = DataFrame(columns=list("ABC")) + store.append("df", df_empty) with pytest.raises(KeyError): - store.select('df') + store.select("df") # repeated append of 0/non-zero frames - df = DataFrame(np.random.rand(10, 3), columns=list('ABC')) - store.append('df', df) - assert_frame_equal(store.select('df'), df) - store.append('df', df_empty) - assert_frame_equal(store.select('df'), df) + df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) + store.append("df", df) + assert_frame_equal(store.select("df"), df) + store.append("df", df_empty) + assert_frame_equal(store.select("df"), df) # store - df = DataFrame(columns=list('ABC')) - store.put('df2', df) - assert_frame_equal(store.select('df2'), df) + df = DataFrame(columns=list("ABC")) + store.put("df2", df) + assert_frame_equal(store.select("df2"), df) def test_append_raise(self): @@ -1903,93 +1974,104 @@ def test_append_raise(self): # list in column df = tm.makeDataFrame() - df['invalid'] = [['a']] * len(df) - assert df.dtypes['invalid'] == np.object_ + df["invalid"] = [["a"]] * len(df) + assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # multiple invalid columns - df['invalid2'] = [['a']] * len(df) - df['invalid3'] = [['a']] * len(df) + df["invalid2"] = [["a"]] * len(df) + df["invalid3"] = [["a"]] * len(df) with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # datetime with embedded nans as object df = tm.makeDataFrame() s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) s[0:5] = np.nan - df['invalid'] = s - assert df.dtypes['invalid'] == np.object_ + df["invalid"] = s + assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # directly ndarray with pytest.raises(TypeError): - store.append('df', np.arange(10)) + store.append("df", np.arange(10)) # series directly with pytest.raises(TypeError): - store.append('df', Series(np.arange(10))) + store.append("df", Series(np.arange(10))) # appending an incompatible table df = tm.makeDataFrame() - store.append('df', df) + store.append("df", df) - df['foo'] = 'foo' + df["foo"] = "foo" with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) def test_table_index_incompatible_dtypes(self): - df1 = DataFrame({'a': [1, 2, 3]}) - df2 = DataFrame({'a': [4, 5, 6]}, - index=date_range('1/1/2000', periods=3)) + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) with ensure_clean_store(self.path) as store: - store.put('frame', df1, format='table') + store.put("frame", df1, format="table") with pytest.raises(TypeError): - store.put('frame', df2, format='table', append=True) + store.put("frame", df2, format="table", append=True) def test_table_values_dtypes_roundtrip(self): with ensure_clean_store(self.path) as store: - df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') - 
store.append('df_f8', df1) - assert_series_equal(df1.dtypes, store['df_f8'].dtypes) + df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") + store.append("df_f8", df1) + assert_series_equal(df1.dtypes, store["df_f8"].dtypes) - df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') - store.append('df_i8', df2) - assert_series_equal(df2.dtypes, store['df_i8'].dtypes) + df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") + store.append("df_i8", df2) + assert_series_equal(df2.dtypes, store["df_i8"].dtypes) # incompatible dtype with pytest.raises(ValueError): - store.append('df_i8', df1) + store.append("df_i8", df1) # check creation/storage/retrieval of float32 (a bit hacky to # actually create them thought) - df1 = DataFrame( - np.array([[1], [2], [3]], dtype='f4'), columns=['A']) - store.append('df_f4', df1) - assert_series_equal(df1.dtypes, store['df_f4'].dtypes) - assert df1.dtypes[0] == 'float32' + df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) + store.append("df_f4", df1) + assert_series_equal(df1.dtypes, store["df_f4"].dtypes) + assert df1.dtypes[0] == "float32" # check with mixed dtypes - df1 = DataFrame({c: Series(np.random.randint(5), dtype=c) - for c in ['float32', 'float64', 'int32', - 'int64', 'int16', 'int8']}) - df1['string'] = 'foo' - df1['float322'] = 1. - df1['float322'] = df1['float322'].astype('float32') - df1['bool'] = df1['float32'] > 0 - df1['time1'] = Timestamp('20130101') - df1['time2'] = Timestamp('20130102') - - store.append('df_mixed_dtypes1', df1) - result = store.select('df_mixed_dtypes1').dtypes.value_counts() + df1 = DataFrame( + { + c: Series(np.random.randint(5), dtype=c) + for c in ["float32", "float64", "int32", "int64", "int16", "int8"] + } + ) + df1["string"] = "foo" + df1["float322"] = 1.0 + df1["float322"] = df1["float322"].astype("float32") + df1["bool"] = df1["float32"] > 0 + df1["time1"] = Timestamp("20130101") + df1["time2"] = Timestamp("20130102") + + store.append("df_mixed_dtypes1", df1) + result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] - expected = Series({'float32': 2, 'float64': 1, 'int32': 1, - 'bool': 1, 'int16': 1, 'int8': 1, - 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) + expected = Series( + { + "float32": 2, + "float64": 1, + "int32": 1, + "bool": 1, + "int16": 1, + "int8": 1, + "int64": 1, + "object": 1, + "datetime64[ns]": 2, + } + ) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) @@ -1998,87 +2080,93 @@ def test_table_mixed_dtypes(self): # frame df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) - df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ['obj1']] = np.nan + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: - store.append('df1_mixed', df) - tm.assert_frame_equal(store.select('df1_mixed'), df) + 
store.append("df1_mixed", df) + tm.assert_frame_equal(store.select("df1_mixed"), df) def test_unimplemented_dtypes_table_columns(self): with ensure_clean_store(self.path) as store: - dtypes = [('date', datetime.date(2001, 1, 2))] + dtypes = [("date", datetime.date(2001, 1, 2))] # currently not supported dtypes #### for n, f in dtypes: df = tm.makeDataFrame() df[n] = f with pytest.raises(TypeError): - store.append('df1_%s' % n, df) + store.append("df1_%s" % n, df) # frame df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['datetime1'] = datetime.date(2001, 1, 2) + df["obj1"] = "foo" + df["obj2"] = "bar" + df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: # this fails because we have a date in the object block...... with pytest.raises(TypeError): - store.append('df_unimplemented', df) + store.append("df_unimplemented", df) @xfail_non_writeable @pytest.mark.skipif( - LooseVersion(np.__version__) == LooseVersion('1.15.0'), - reason=("Skipping pytables test when numpy version is " - "exactly equal to 1.15.0: gh-22098")) + LooseVersion(np.__version__) == LooseVersion("1.15.0"), + reason=( + "Skipping pytables test when numpy version is " + "exactly equal to 1.15.0: gh-22098" + ), + ) def test_calendar_roundtrip_issue(self): # 8591 # doc example from tseries holiday section - weekmask_egypt = 'Sun Mon Tue Wed Thu' - holidays = ['2012-05-01', - datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')] + weekmask_egypt = "Sun Mon Tue Wed Thu" + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] bday_egypt = pd.offsets.CustomBusinessDay( - holidays=holidays, weekmask=weekmask_egypt) + holidays=holidays, weekmask=weekmask_egypt + ) dt = datetime.datetime(2013, 4, 30) dts = date_range(dt, periods=5, freq=bday_egypt) - s = (Series(dts.weekday, dts).map( - Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) with ensure_clean_store(self.path) as store: - store.put('fixed', s) - result = store.select('fixed') + store.put("fixed", s) + result = store.select("fixed") assert_series_equal(result, s) - store.append('table', s) - result = store.select('table') + store.append("table", s) + result = store.select("table") assert_series_equal(result, s) def test_roundtrip_tz_aware_index(self): # GH 17618 - time = pd.Timestamp('2000-01-01 01:00:00', tz='US/Eastern') + time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern") df = pd.DataFrame(data=[0], index=[time]) with ensure_clean_store(self.path) as store: - store.put('frame', df, format='fixed') - recons = store['frame'] + store.put("frame", df, format="fixed") + recons = store["frame"] tm.assert_frame_equal(recons, df) assert recons.index[0].value == 946706400000000000 @@ -2086,42 +2174,49 @@ def test_append_with_timedelta(self): # GH 3577 # append timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp( - '20130101') + timedelta(days=i, seconds=10) for i in range(10)])) - df['C'] = df['A'] - df['B'] - df.loc[3:5, 'C'] = np.nan + df = DataFrame( + dict( + A=Timestamp("20130101"), + B=[ + Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) + ], + ) + ) + df["C"] = df["A"] - df["B"] + df.loc[3:5, "C"] = np.nan with ensure_clean_store(self.path) as store: # table - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - result = store.select('df') + _maybe_remove(store, "df") + 
store.append("df", df, data_columns=True) + result = store.select("df") assert_frame_equal(result, df) - result = store.select('df', where="C<100000") + result = store.select("df", where="C<100000") assert_frame_equal(result, df) - result = store.select('df', where="C') + store.select("df", "index>") # from the docs with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10, 4), columns=list( - 'ABCD'), index=date_range('20130101', periods=10)) - dfq.to_hdf(path, 'dfq', format='table', data_columns=True) + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table", data_columns=True) # check ok - read_hdf(path, 'dfq', - where="index>Timestamp('20130104') & columns=['A', 'B']") - read_hdf(path, 'dfq', where="A>0 or C>0") + read_hdf( + path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']" + ) + read_hdf(path, "dfq", where="A>0 or C>0") # catch the invalid reference with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10, 4), columns=list( - 'ABCD'), index=date_range('20130101', periods=10)) - dfq.to_hdf(path, 'dfq', format='table') + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table") with pytest.raises(ValueError): - read_hdf(path, 'dfq', where="A>0 or C>0") + read_hdf(path, "dfq", where="A>0 or C>0") def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: import pandas as pd - df = DataFrame(np.random.randn(20, 2), - index=pd.date_range('20130101', periods=20)) - store.put('df', df, format='table') - expected = df[df.index > pd.Timestamp('20130105')] + + df = DataFrame( + np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) + ) + store.put("df", df, format="table") + expected = df[df.index > pd.Timestamp("20130105")] import datetime # noqa - result = store.select('df', 'index>datetime.datetime(2013,1,5)') + + result = store.select("df", "index>datetime.datetime(2013,1,5)") assert_frame_equal(result, expected) from datetime import datetime # noqa # technically an error, but allow it - result = store.select('df', 'index>datetime.datetime(2013,1,5)') + result = store.select("df", "index>datetime.datetime(2013,1,5)") assert_frame_equal(result, expected) - result = store.select('df', 'index>datetime(2013,1,5)') + result = store.select("df", "index>datetime(2013,1,5)") assert_frame_equal(result, expected) def test_series(self): @@ -2239,10 +2344,8 @@ def test_series(self): ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), - dtype=object)) - self._check_roundtrip(ts3, tm.assert_series_equal, - check_index_type=False) + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) @ignore_sparse @ignore_series_tosparse @@ -2251,16 +2354,13 @@ def test_sparse_series(self): s = tm.makeStringSeries() s.iloc[3:5] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_series_equal, - check_series_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_series_equal, 
check_series_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) @ignore_sparse @ignore_dataframe_tosparse @@ -2271,16 +2371,13 @@ def test_sparse_frame(self): s.iloc[8:10, -2] = np.nan ss = s.to_sparse() - self._check_double_roundtrip(ss, tm.assert_frame_equal, - check_frame_type=True) + self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_double_roundtrip(ss2, tm.assert_frame_equal, - check_frame_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_double_roundtrip(ss3, tm.assert_frame_equal, - check_frame_type=True) + self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) def test_float_index(self): @@ -2294,7 +2391,7 @@ def test_tuple_index(self): # GH #492 col = np.arange(10) - idx = [(0., 1.), (2., 3.), (4., 5.)] + idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) @@ -2309,13 +2406,12 @@ def test_index_types(self): with catch_warnings(record=True): values = np.random.randn(2) - func = lambda l, r: tm.assert_series_equal(l, r, - check_dtype=True, - check_index_type=True, - check_series_type=True) + func = lambda l, r: tm.assert_series_equal( + l, r, check_dtype=True, check_index_type=True, check_series_type=True + ) with catch_warnings(record=True): - ser = Series(values, [0, 'y']) + ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func) with catch_warnings(record=True): @@ -2323,28 +2419,28 @@ def test_index_types(self): self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, ['y', 0]) + ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, [datetime.date.today(), 'a']) + ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, [0, 'y']) + ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func) ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) - ser = Series(values, ['y', 0]) + ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.date.today(), 'a']) + ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func) - ser = Series(values, [1.23, 'b']) + ser = Series(values, [1.23, "b"]) self._check_roundtrip(ser, func) ser = Series(values, [1, 1.53]) @@ -2353,23 +2449,24 @@ def test_index_types(self): ser = Series(values, [1, 5]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + ser = Series( + values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] + ) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): - dr = bdate_range('1/1/1940', '1/1/1960') + dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) try: self._check_roundtrip(ts, tm.assert_series_equal) except OverflowError: - pytest.skip('known failer on some windows platforms') + pytest.skip("known failer on some windows platforms") @xfail_non_writeable - @pytest.mark.parametrize("compression", [ - False, pytest.param(True, 
marks=td.skip_if_windows_python_3) - ]) + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) def test_frame(self, compression): df = tm.makeDataFrame() @@ -2378,20 +2475,17 @@ def test_frame(self, compression): df.values[0, 0] = np.nan df.values[5, 3] = np.nan - self._check_roundtrip_table(df, tm.assert_frame_equal, - compression=compression) - self._check_roundtrip(df, tm.assert_frame_equal, - compression=compression) + self._check_roundtrip_table(df, tm.assert_frame_equal, compression=compression) + self._check_roundtrip(df, tm.assert_frame_equal, compression=compression) tdf = tm.makeTimeDataFrame() - self._check_roundtrip(tdf, tm.assert_frame_equal, - compression=compression) + self._check_roundtrip(tdf, tm.assert_frame_equal, compression=compression) with ensure_clean_store(self.path) as store: # not consolidated - df['foo'] = np.random.randn(len(df)) - store['df'] = df - recons = store['df'] + df["foo"] = np.random.randn(len(df)) + store["df"] = df + recons = store["df"] assert recons._data.is_consolidated() # empty @@ -2400,10 +2494,10 @@ def test_frame(self, compression): @xfail_non_writeable def test_empty_series_frame(self): s0 = Series() - s1 = Series(name='myseries') + s1 = Series(name="myseries") df0 = DataFrame() - df1 = DataFrame(index=['a', 'b', 'c']) - df2 = DataFrame(columns=['d', 'e', 'f']) + df1 = DataFrame(index=["a", "b", "c"]) + df2 = DataFrame(columns=["d", "e", "f"]) self._check_roundtrip(s0, tm.assert_series_equal) self._check_roundtrip(s1, tm.assert_series_equal) @@ -2413,73 +2507,75 @@ def test_empty_series_frame(self): @xfail_non_writeable @pytest.mark.parametrize( - 'dtype', [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']) + "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] + ) def test_empty_series(self, dtype): s = Series(dtype=dtype) self._check_roundtrip(s, tm.assert_series_equal) def test_can_serialize_dates(self): - rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] + rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) self._check_roundtrip(frame, tm.assert_frame_equal) def test_store_hierarchical(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) self._check_roundtrip(frame, tm.assert_frame_equal) self._check_roundtrip(frame.T, tm.assert_frame_equal) - self._check_roundtrip(frame['A'], tm.assert_series_equal) + self._check_roundtrip(frame["A"], tm.assert_series_equal) # check that the names are stored with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] + store["frame"] = frame + recons = store["frame"] tm.assert_frame_equal(recons, frame) def test_store_index_name(self): df = tm.makeDataFrame() - df.index.name = 'foo' + df.index.name = "foo" with ensure_clean_store(self.path) as store: - store['frame'] = df - recons = store['frame'] + store["frame"] = df + recons = store["frame"] tm.assert_frame_equal(recons, df) def test_store_index_name_with_tz(self): # GH 13884 
- df = pd.DataFrame({'A': [1, 2]}) + df = pd.DataFrame({"A": [1, 2]}) df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) - df.index = df.index.tz_localize('UTC') - df.index.name = 'foo' + df.index = df.index.tz_localize("UTC") + df.index.name = "foo" with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') - recons = store['frame'] + store.put("frame", df, format="table") + recons = store["frame"] tm.assert_frame_equal(recons, df) - @pytest.mark.parametrize('table_format', ['table', 'fixed']) + @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(self, table_format): # GH #13492 - idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1), - datetime.date(2000, 1, 2)]), - name='cols\u05d2') - idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1), - datetime.date(2010, 1, 2)]), - name='rows\u05d0') + idx = pd.Index( + pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = pd.Index( + pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), + name="rows\u05d0", + ) df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format=table_format) - df2 = read_hdf(path, 'df') + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") assert_frame_equal(df, df2, check_names=True) @@ -2488,27 +2584,26 @@ def test_store_index_name_numpy_str(self, table_format): def test_store_series_name(self): df = tm.makeDataFrame() - series = df['A'] + series = df["A"] with ensure_clean_store(self.path) as store: - store['series'] = series - recons = store['series'] + store["series"] = series + recons = store["series"] tm.assert_series_equal(recons, series) @xfail_non_writeable - @pytest.mark.parametrize("compression", [ - False, pytest.param(True, marks=td.skip_if_windows_python_3) - ]) + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) def test_store_mixed(self, compression): - def _make_one(): df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['int1'] = 1 - df['int2'] = 2 + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["int1"] = 1 + df["int2"] = 2 return df._consolidate() df1 = _make_one() @@ -2518,18 +2613,21 @@ def _make_one(): self._check_roundtrip(df2, tm.assert_frame_equal) with ensure_clean_store(self.path) as store: - store['obj'] = df1 - tm.assert_frame_equal(store['obj'], df1) - store['obj'] = df2 - tm.assert_frame_equal(store['obj'], df2) + store["obj"] = df1 + tm.assert_frame_equal(store["obj"], df1) + store["obj"] = df2 + tm.assert_frame_equal(store["obj"], df2) # check that can store Series of all of these types - self._check_roundtrip(df1['obj1'], tm.assert_series_equal, - compression=compression) - self._check_roundtrip(df1['bool1'], tm.assert_series_equal, - compression=compression) - self._check_roundtrip(df1['int1'], tm.assert_series_equal, - compression=compression) + self._check_roundtrip( + df1["obj1"], tm.assert_series_equal, compression=compression + ) + self._check_roundtrip( + df1["bool1"], tm.assert_series_equal, compression=compression + ) + self._check_roundtrip( + df1["int1"], tm.assert_series_equal, compression=compression + ) @pytest.mark.filterwarnings( 
"ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" @@ -2537,70 +2635,73 @@ def _make_one(): def test_select_with_dups(self): # single dtypes - df = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df.index = date_range('20130101 9:30', periods=10, freq='T') + df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(self.path) as store: - store.append('df', df) + store.append("df", df) - result = store.select('df') + result = store.select("df") expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=df.columns) + result = store.select("df", columns=df.columns) expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=['A']) - expected = df.loc[:, ['A']] + result = store.select("df", columns=["A"]) + expected = df.loc[:, ["A"]] assert_frame_equal(result, expected) # dups across dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - df.index = date_range('20130101 9:30', periods=10, freq='T') + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(self.path) as store: - store.append('df', df) + store.append("df", df) - result = store.select('df') + result = store.select("df") expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=df.columns) + result = store.select("df", columns=df.columns) expected = df assert_frame_equal(result, expected, by_blocks=True) - expected = df.loc[:, ['A']] - result = store.select('df', columns=['A']) + expected = df.loc[:, ["A"]] + result = store.select("df", columns=["A"]) assert_frame_equal(result, expected, by_blocks=True) - expected = df.loc[:, ['B', 'A']] - result = store.select('df', columns=['B', 'A']) + expected = df.loc[:, ["B", "A"]] + result = store.select("df", columns=["B", "A"]) assert_frame_equal(result, expected, by_blocks=True) # duplicates on both index and columns with ensure_clean_store(self.path) as store: - store.append('df', df) - store.append('df', df) + store.append("df", df) + store.append("df", df) - expected = df.loc[:, ['B', 'A']] + expected = df.loc[:, ["B", "A"]] expected = concat([expected, expected]) - result = store.select('df', columns=['B', 'A']) + result = store.select("df", columns=["B", "A"]) assert_frame_equal(result, expected, by_blocks=True) def test_overwrite_node(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeDataFrame() + store["a"] = tm.makeTimeDataFrame() ts = tm.makeTimeSeries() - store['a'] = ts + store["a"] = ts - tm.assert_series_equal(store['a'], ts) + tm.assert_series_equal(store["a"], ts) @ignore_sparse @ignore_dataframe_tosparse @@ -2609,32 +2710,32 @@ def test_sparse_with_compression(self): # GH 2931 # make sparse dataframe - arr = np.random.binomial(n=1, p=.01, size=(1000, 10)) + arr = np.random.binomial(n=1, p=0.01, size=(1000, 10)) df = DataFrame(arr).to_sparse(fill_value=0) # case 1: store uncompressed - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression=False, - check_frame_type=True) + self._check_double_roundtrip( + df, 
tm.assert_frame_equal, compression=False, check_frame_type=True + ) # case 2: store compressed (works) - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression='zlib', - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression="zlib", check_frame_type=True + ) # set one series to be completely sparse df[0] = np.zeros(1000) # case 3: store df with completely sparse series uncompressed - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression=False, - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression=False, check_frame_type=True + ) # case 4: try storing df with completely sparse series compressed # (fails) - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression='zlib', - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression="zlib", check_frame_type=True + ) def test_select(self): @@ -2644,113 +2745,113 @@ def test_select(self): # select with columns= df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # equivalently - result = store.select('df', [("columns=['A', 'B']")]) - expected = df.reindex(columns=['A', 'B']) + result = store.select("df", [("columns=['A', 'B']")]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # all a data columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column, but different columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['C', 'D']) - expected = df[df.A > 0].reindex(columns=['C', 'D']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["C", "D"]) + expected = df[df.A > 0].reindex(columns=["C", "D"]) tm.assert_frame_equal(expected, result) def test_select_dtypes(self): with ensure_clean_store(self.path) as store: # with a Timestamp data column (GH #2637) - df = DataFrame(dict( - ts=bdate_range('2012-01-01', periods=300), - A=np.random.randn(300))) - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['ts', 'A']) - - result = store.select('df', "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp('2012-02-01')] + df = DataFrame( + dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) + ) + _maybe_remove(store, "df") + store.append("df", df, 
data_columns=["ts", "A"]) + + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # bool columns (GH #2849) - df = DataFrame(np.random.randn(5, 2), columns=['A', 'B']) - df['object'] = 'foo' - df.loc[4:5, 'object'] = 'bar' - df['boolv'] = df['A'] > 0 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - - expected = (df[df.boolv == True] # noqa - .reindex(columns=['A', 'boolv'])) - for v in [True, 'true', 1]: - result = store.select('df', 'boolv == %s' % str(v), - columns=['A', 'boolv']) + df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + df["object"] = "foo" + df.loc[4:5, "object"] = "bar" + df["boolv"] = df["A"] > 0 + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa + for v in [True, "true", 1]: + result = store.select( + "df", "boolv == %s" % str(v), columns=["A", "boolv"] + ) tm.assert_frame_equal(expected, result) - expected = (df[df.boolv == False] # noqa - .reindex(columns=['A', 'boolv'])) - for v in [False, 'false', 0]: + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + for v in [False, "false", 0]: result = store.select( - 'df', 'boolv == %s' % str(v), columns=['A', 'boolv']) + "df", "boolv == %s" % str(v), columns=["A", "boolv"] + ) tm.assert_frame_equal(expected, result) # integer index df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - _maybe_remove(store, 'df_int') - store.append('df_int', df) - result = store.select( - 'df_int', "index<10 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=['A']) + _maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) # float index - df = DataFrame(dict(A=np.random.rand( - 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) - _maybe_remove(store, 'df_float') - store.append('df_float', df) - result = store.select( - 'df_float', "index<10.0 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=['A']) + df = DataFrame( + dict( + A=np.random.rand(20), + B=np.random.rand(20), + index=np.arange(20, dtype="f8"), + ) + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) with ensure_clean_store(self.path) as store: # floats w/o NaN - df = DataFrame( - dict(cols=range(11), values=range(11)), dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) - store.append('df1', df, data_columns=True) - result = store.select( - 'df1', where='values>2.0') - expected = df[df['values'] > 2.0] + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] tm.assert_frame_equal(expected, result) # floats with NaN df.iloc[0] = np.nan - expected = df[df['values'] > 2.0] + expected = df[df["values"] > 2.0] - store.append('df2', df, data_columns=True, index=False) - result = store.select( - 'df2', where='values>2.0') + store.append("df2", df, data_columns=True, index=False) + result = 
store.select("df2", where="values>2.0") tm.assert_frame_equal(expected, result) # https://github.com/PyTables/PyTables/issues/282 @@ -2761,16 +2862,14 @@ def test_select_dtypes(self): # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too - df = DataFrame( - dict(cols=range(11), values=range(11)), dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) df.iloc[1] = np.nan - expected = df[df['values'] > 2.0] + expected = df[df["values"] > 2.0] - store.append('df4', df, data_columns=True) - result = store.select( - 'df4', where='values>2.0') + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") tm.assert_frame_equal(expected, result) # test selection with comparison against numpy scalar @@ -2778,56 +2877,62 @@ def test_select_dtypes(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - expected = df[df['A'] > 0] + expected = df[df["A"] > 0] - store.append('df', df, data_columns=True) + store.append("df", df, data_columns=True) np_zero = np.float64(0) # noqa - result = store.select('df', where=["A>np_zero"]) + result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result) def test_select_with_many_inputs(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), - A=np.random.randn(300), - B=range(300), - users=['a'] * 50 + ['b'] * 50 + ['c'] * 100 + - ['a%03d' % i for i in range(100)])) - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) + df = DataFrame( + dict( + ts=bdate_range("2012-01-01", periods=300), + A=np.random.randn(300), + B=range(300), + users=["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + ["a%03d" % i for i in range(100)], + ) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", "users"]) # regular select - result = store.select('df', "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp('2012-02-01')] + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # small selector result = store.select( - 'df', - "ts>=Timestamp('2012-02-01') & users=['a','b','c']") - expected = df[(df.ts >= Timestamp('2012-02-01')) & - df.users.isin(['a', 'b', 'c'])] + "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" + ) + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] tm.assert_frame_equal(expected, result) # big selector along the columns - selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)] + selector = ["a", "b", "c"] + ["a%03d" % i for i in range(60)] result = store.select( - 'df', - "ts>=Timestamp('2012-02-01') and users=selector") - expected = df[(df.ts >= Timestamp('2012-02-01')) & - df.users.isin(selector)] + "df", "ts>=Timestamp('2012-02-01') and users=selector" + ) + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] tm.assert_frame_equal(expected, result) selector = range(100, 200) - result = store.select('df', 'B=selector') + result = store.select("df", "B=selector") expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) assert len(result) == 100 # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', 'ts=selector') + result = store.select("df", "ts=selector") 
expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) assert len(result) == 100 @@ -2838,65 +2943,67 @@ def test_select_iterator(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame(500) - _maybe_remove(store, 'df') - store.append('df', df) + _maybe_remove(store, "df") + store.append("df", df) - expected = store.select('df') + expected = store.select("df") - results = [s for s in store.select('df', iterator=True)] + results = [s for s in store.select("df", iterator=True)] result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select('df', chunksize=100)] + results = [s for s in store.select("df", chunksize=100)] assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select('df', chunksize=150)] + results = [s for s in store.select("df", chunksize=150)] result = concat(results) tm.assert_frame_equal(result, expected) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path, 'df_non_table') + df.to_hdf(path, "df_non_table") with pytest.raises(TypeError): - read_hdf(path, 'df_non_table', chunksize=100) + read_hdf(path, "df_non_table", chunksize=100) with pytest.raises(TypeError): - read_hdf(path, 'df_non_table', iterator=True) + read_hdf(path, "df_non_table", iterator=True) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path, 'df', format='table') + df.to_hdf(path, "df", format="table") - results = [s for s in read_hdf(path, 'df', chunksize=100)] + results = [s for s in read_hdf(path, "df", chunksize=100)] result = concat(results) assert len(results) == 5 tm.assert_frame_equal(result, df) - tm.assert_frame_equal(result, read_hdf(path, 'df')) + tm.assert_frame_equal(result, read_hdf(path, "df")) # multiple with ensure_clean_store(self.path) as store: df1 = tm.makeTimeDataFrame(500) - store.append('df1', df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename( - columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' - store.append('df2', df2) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x) + df2["foo"] = "bar" + store.append("df2", df2) df = concat([df1, df2], axis=1) # full selection - expected = store.select_as_multiple( - ['df1', 'df2'], selector='df1') - results = [s for s in store.select_as_multiple( - ['df1', 'df2'], selector='df1', chunksize=150)] + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = [ + s + for s in store.select_as_multiple( + ["df1", "df2"], selector="df1", chunksize=150 + ) + ] result = concat(results) tm.assert_frame_equal(expected, result) @@ -2909,68 +3016,65 @@ def test_select_iterator_complete_8014(self): # no iterator with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/o iteration and no where clause works - result = store.select('df') + result = store.select("df") tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, begin # of range, works where = "index >= '%s'" % beg_dt - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where 
clause, single term, end # of range, works where = "index <= '%s'" % end_dt - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # with iterator, full range with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [s for s in store.select('df', chunksize=chunksize)] + results = [s for s in store.select("df", chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) @@ -2983,51 +3087,48 @@ def test_select_iterator_non_complete_8014(self): # with iterator, non complete range with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[1] end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & - (expected.index <= 
end_dt)] + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] tm.assert_frame_equal(rexpected, result) # with iterator, empty where with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range where = "index > '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert 0 == len(results) def test_select_iterator_many_empty_frames(self): @@ -3040,25 +3141,23 @@ def test_select_iterator_many_empty_frames(self): # with iterator, range limited to the first chunk with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100000, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert len(results) == 1 result = concat(results) @@ -3067,14 +3166,14 @@ def test_select_iterator_many_empty_frames(self): # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be 1, is 10 assert len(results) == 1 result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & - (expected.index <= end_dt)] + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause which selects @@ -3085,8 +3184,7 @@ def test_select_iterator_many_empty_frames(self): # True. 
where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be [] assert len(results) == 0 @@ -3097,44 +3195,59 @@ def test_select_iterator_many_empty_frames(self): def test_retain_index_attributes(self): # GH 3499, losing frequency info on index recreation - df = DataFrame(dict( - A=Series(range(3), - index=date_range('2000-1-1', periods=3, freq='H')))) + df = DataFrame( + dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) + ) with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'data') - store.put('data', df, format='table') + _maybe_remove(store, "data") + store.put("data", df, format="table") - result = store.get('data') + result = store.get("data") tm.assert_frame_equal(df, result) - for attr in ['freq', 'tz', 'name']: - for idx in ['index', 'columns']: - assert (getattr(getattr(df, idx), attr, None) == - getattr(getattr(result, idx), attr, None)) + for attr in ["freq", "tz", "name"]: + for idx in ["index", "columns"]: + assert getattr(getattr(df, idx), attr, None) == getattr( + getattr(result, idx), attr, None + ) # try to append a table with a different frequency with catch_warnings(record=True): - df2 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', - periods=3, freq='D')))) - store.append('data', df2) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("data", df2) - assert store.get_storer('data').info['index']['freq'] is None + assert store.get_storer("data").info["index"]["freq"] is None # this is ok - _maybe_remove(store, 'df2') - df2 = DataFrame(dict( - A=Series(range(3), - index=[Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20020101')]))) - store.append('df2', df2) - df3 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', periods=3, - freq='D')))) - store.append('df2', df3) + _maybe_remove(store, "df2") + df2 = DataFrame( + dict( + A=Series( + range(3), + index=[ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20020101"), + ], + ) + ) + ) + store.append("df2", df2) + df3 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("df2", df3) @pytest.mark.filterwarnings( "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" @@ -3144,60 +3257,66 @@ def test_retain_index_attributes2(self): with catch_warnings(record=True): - df = DataFrame(dict( - A=Series(range(3), - index=date_range('2000-1-1', - periods=3, freq='H')))) - df.to_hdf(path, 'data', mode='w', append=True) - df2 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', periods=3, - freq='D')))) - df2.to_hdf(path, 'data', append=True) - - idx = date_range('2000-1-1', periods=3, freq='H') - idx.name = 'foo' + df = DataFrame( + dict( + A=Series( + range(3), index=date_range("2000-1-1", periods=3, freq="H") + ) + ) + ) + df.to_hdf(path, "data", mode="w", append=True) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + df2.to_hdf(path, "data", append=True) + + idx = date_range("2000-1-1", periods=3, freq="H") + idx.name = "foo" df = DataFrame(dict(A=Series(range(3), index=idx))) - df.to_hdf(path, 'data', mode='w', append=True) + df.to_hdf(path, "data", mode="w", append=True) - assert read_hdf(path, 'data').index.name == 'foo' + 
assert read_hdf(path, "data").index.name == "foo" with catch_warnings(record=True): - idx2 = date_range('2001-1-1', periods=3, freq='H') - idx2.name = 'bar' + idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2.name = "bar" df2 = DataFrame(dict(A=Series(range(3), index=idx2))) - df2.to_hdf(path, 'data', append=True) + df2.to_hdf(path, "data", append=True) - assert read_hdf(path, 'data').index.name is None + assert read_hdf(path, "data").index.name is None def test_frame_select(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') + store.put("frame", df, format="table") date = df.index[len(df) // 2] - crit1 = Term('index>=date') - assert crit1.env.scope['date'] == date + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date - crit2 = ("columns=['A', 'D']") - crit3 = ('columns=A') + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" - result = store.select('frame', [crit1, crit2]) - expected = df.loc[date:, ['A', 'D']] + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] tm.assert_frame_equal(result, expected) - result = store.select('frame', [crit3]) - expected = df.loc[:, ['A']] + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] tm.assert_frame_equal(result, expected) # invalid terms df = tm.makeTimeDataFrame() - store.append('df_time', df) + store.append("df_time", df) with pytest.raises(ValueError): - store.select('df_time', "index>0") + store.select("df_time", "index>0") # can't select if not written as table # store['frame'] = df @@ -3208,113 +3327,114 @@ def test_frame_select_complex(self): # select via complex criteria df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.loc[df.index[0:4], 'string'] = 'bar' + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', data_columns=['string']) + store.put("df", df, format="table", data_columns=["string"]) # empty - result = store.select('df', 'index>df.index[3] & string="bar"') - expected = df.loc[(df.index > df.index[3]) & (df.string == 'bar')] + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] tm.assert_frame_equal(result, expected) - result = store.select('df', 'index>df.index[3] & string="foo"') - expected = df.loc[(df.index > df.index[3]) & (df.string == 'foo')] + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] tm.assert_frame_equal(result, expected) # or - result = store.select('df', 'index>df.index[3] | string="bar"') - expected = df.loc[(df.index > df.index[3]) | (df.string == 'bar')] + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] tm.assert_frame_equal(result, expected) - result = store.select('df', '(index>df.index[3] & ' - 'index<=df.index[6]) | string="bar"') - expected = df.loc[((df.index > df.index[3]) & ( - df.index <= df.index[6])) | (df.string == 'bar')] + result = store.select( + "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + ) + expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] tm.assert_frame_equal(result, expected) # invert - result = store.select('df', 'string!="bar"') - expected = df.loc[df.string != 'bar'] + result = store.select("df", 'string!="bar"') + 
expected = df.loc[df.string != "bar"] tm.assert_frame_equal(result, expected) # invert not implemented in numexpr :( with pytest.raises(NotImplementedError): - store.select('df', '~(string="bar")') + store.select("df", '~(string="bar")') # invert ok for filters - result = store.select('df', "~(columns=['A','B'])") - expected = df.loc[:, df.columns.difference(['A', 'B'])] + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] tm.assert_frame_equal(result, expected) # in - result = store.select( - 'df', "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index > df.index[3]].reindex(columns=[ - 'A', 'B']) + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_frame_select_complex2(self): - with ensure_clean_path(['parms.hdf', 'hist.hdf']) as paths: + with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: pp, hh = paths # use non-trivial selection criteria - parms = DataFrame({'A': [1, 1, 2, 2, 3]}) - parms.to_hdf(pp, 'df', mode='w', - format='table', data_columns=['A']) + parms = DataFrame({"A": [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) - selection = read_hdf(pp, 'df', where='A=[2,3]') - hist = DataFrame(np.random.randn(25, 1), - columns=['data'], - index=MultiIndex.from_tuples( - [(i, j) for i in range(5) - for j in range(5)], - names=['l1', 'l2'])) + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) - hist.to_hdf(hh, 'df', mode='w', format='table') + hist.to_hdf(hh, "df", mode="w", format="table") - expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]') + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") # scope with list like l = selection.index.tolist() # noqa store = HDFStore(hh) - result = store.select('df', where='l1=l') + result = store.select("df", where="l1=l") assert_frame_equal(result, expected) store.close() - result = read_hdf(hh, 'df', where='l1=l') + result = read_hdf(hh, "df", where="l1=l") assert_frame_equal(result, expected) # index index = selection.index # noqa - result = read_hdf(hh, 'df', where='l1=index') + result = read_hdf(hh, "df", where="l1=index") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=selection.index') + result = read_hdf(hh, "df", where="l1=selection.index") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=selection.index.tolist()') + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=list(selection.index)') + result = read_hdf(hh, "df", where="l1=list(selection.index)") assert_frame_equal(result, expected) # scope with index store = HDFStore(hh) - result = store.select('df', where='l1=index') + result = store.select("df", where="l1=index") assert_frame_equal(result, expected) - result = store.select('df', where='l1=selection.index') + result = store.select("df", where="l1=selection.index") assert_frame_equal(result, expected) - result = store.select('df', where='l1=selection.index.tolist()') + result = store.select("df", where="l1=selection.index.tolist()") assert_frame_equal(result, expected) - result = store.select('df', where='l1=list(selection.index)') + result = 
store.select("df", where="l1=list(selection.index)") assert_frame_equal(result, expected) store.close() @@ -3326,15 +3446,15 @@ def test_invalid_filtering(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - store.put('df', df, format='table') + store.put("df", df, format="table") # not implemented with pytest.raises(NotImplementedError): - store.select('df', "columns=['A'] | columns=['B']") + store.select("df", "columns=['A'] | columns=['B']") # in theory we could deal with this with pytest.raises(NotImplementedError): - store.select('df', "columns=['A','B'] & columns=['C']") + store.select("df", "columns=['A','B'] & columns=['C']") def test_string_select(self): # GH 2973 @@ -3343,18 +3463,18 @@ def test_string_select(self): df = tm.makeTimeDataFrame() # test string ==/!= - df['x'] = 'none' - df.loc[2:7, 'x'] = '' + df["x"] = "none" + df.loc[2:7, "x"] = "" - store.append('df', df, data_columns=['x']) + store.append("df", df, data_columns=["x"]) - result = store.select('df', 'x=none') - expected = df[df.x == 'none'] + result = store.select("df", "x=none") + expected = df[df.x == "none"] assert_frame_equal(result, expected) try: - result = store.select('df', 'x!=none') - expected = df[df.x != 'none'] + result = store.select("df", "x!=none") + expected = df[df.x != "none"] assert_frame_equal(result, expected) except Exception as detail: pprint_thing("[{0}]".format(detail)) @@ -3362,24 +3482,24 @@ def test_string_select(self): pprint_thing(expected) df2 = df.copy() - df2.loc[df2.x == '', 'x'] = np.nan + df2.loc[df2.x == "", "x"] = np.nan - store.append('df2', df2, data_columns=['x']) - result = store.select('df2', 'x!=none') + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") expected = df2[isna(df2.x)] assert_frame_equal(result, expected) # int ==/!= - df['int'] = 1 - df.loc[2:7, 'int'] = 2 + df["int"] = 1 + df.loc[2:7, "int"] = 2 - store.append('df3', df, data_columns=['int']) + store.append("df3", df, data_columns=["int"]) - result = store.select('df3', 'int=2') + result = store.select("df3", "int=2") expected = df[df.int == 2] assert_frame_equal(result, expected) - result = store.select('df3', 'int!=2') + result = store.select("df3", "int!=2") expected = df[df.int != 2] assert_frame_equal(result, expected) @@ -3388,71 +3508,70 @@ def test_read_column(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') + _maybe_remove(store, "df") # GH 17912 # HDFStore.select_column should raise a KeyError # exception if the key is not a valid store - with pytest.raises(KeyError, - match='No object named df in the file'): - store.select_column('df', 'index') + with pytest.raises(KeyError, match="No object named df in the file"): + store.select_column("df", "index") - store.append('df', df) + store.append("df", df) # error with pytest.raises(KeyError): - store.select_column('df', 'foo') + store.select_column("df", "foo") with pytest.raises(Exception): - store.select_column('df', 'index', where=['index>5']) + store.select_column("df", "index", where=["index>5"]) # valid - result = store.select_column('df', 'index') + result = store.select_column("df", "index") tm.assert_almost_equal(result.values, Series(df.index).values) assert isinstance(result, Series) # not a data indexable column with pytest.raises(ValueError): - store.select_column('df', 'values_block_0') + store.select_column("df", "values_block_0") # a data column df2 = df.copy() - df2['string'] = 'foo' - store.append('df2', df2, 
data_columns=['string']) - result = store.select_column('df2', 'string') - tm.assert_almost_equal(result.values, df2['string'].values) + df2["string"] = "foo" + store.append("df2", df2, data_columns=["string"]) + result = store.select_column("df2", "string") + tm.assert_almost_equal(result.values, df2["string"].values) # a data column with NaNs, result excludes the NaNs df3 = df.copy() - df3['string'] = 'foo' - df3.loc[4:6, 'string'] = np.nan - store.append('df3', df3, data_columns=['string']) - result = store.select_column('df3', 'string') - tm.assert_almost_equal(result.values, df3['string'].values) + df3["string"] = "foo" + df3.loc[4:6, "string"] = np.nan + store.append("df3", df3, data_columns=["string"]) + result = store.select_column("df3", "string") + tm.assert_almost_equal(result.values, df3["string"].values) # start/stop - result = store.select_column('df3', 'string', start=2) - tm.assert_almost_equal(result.values, df3['string'].values[2:]) + result = store.select_column("df3", "string", start=2) + tm.assert_almost_equal(result.values, df3["string"].values[2:]) - result = store.select_column('df3', 'string', start=-2) - tm.assert_almost_equal(result.values, df3['string'].values[-2:]) + result = store.select_column("df3", "string", start=-2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:]) - result = store.select_column('df3', 'string', stop=2) - tm.assert_almost_equal(result.values, df3['string'].values[:2]) + result = store.select_column("df3", "string", stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[:2]) - result = store.select_column('df3', 'string', stop=-2) - tm.assert_almost_equal(result.values, df3['string'].values[:-2]) + result = store.select_column("df3", "string", stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[:-2]) - result = store.select_column('df3', 'string', start=2, stop=-2) - tm.assert_almost_equal(result.values, df3['string'].values[2:-2]) + result = store.select_column("df3", "string", start=2, stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) - result = store.select_column('df3', 'string', start=-2, stop=2) - tm.assert_almost_equal(result.values, df3['string'].values[-2:2]) + result = store.select_column("df3", "string", start=-2, stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) # GH 10392 - make sure column name is preserved - df4 = DataFrame({'A': np.random.randn(10), 'B': 'foo'}) - store.append('df4', df4, data_columns=True) - expected = df4['B'] - result = store.select_column('df4', 'B') + df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) + store.append("df4", df4, data_columns=True) + expected = df4["B"] + result = store.select_column("df4", "B") tm.assert_series_equal(result, expected) def test_coordinates(self): @@ -3460,42 +3579,42 @@ def test_coordinates(self): with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') - store.append('df', df) + _maybe_remove(store, "df") + store.append("df", df) # all - c = store.select_as_coordinates('df') - assert((c.values == np.arange(len(df.index))).all()) + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() # get coordinates back & test vs frame - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = DataFrame(dict(A=range(5), B=range(5))) - store.append('df', df) - c = store.select_as_coordinates('df', ['index<3']) - assert((c.values == np.arange(3)).all()) - result = store.select('df', where=c) + store.append("df", df) + c = 
store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) expected = df.loc[0:2, :] tm.assert_frame_equal(result, expected) - c = store.select_as_coordinates('df', ['index>=3', 'index<=4']) - assert((c.values == np.arange(2) + 3).all()) - result = store.select('df', where=c) + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) expected = df.loc[3:4, :] tm.assert_frame_equal(result, expected) assert isinstance(c, Index) # multiple tables - _maybe_remove(store, 'df1') - _maybe_remove(store, 'df2') + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - store.append('df1', df1, data_columns=['A', 'B']) - store.append('df2', df2) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) - c = store.select_as_coordinates('df1', ['A>0', 'B>0']) - df1_result = store.select('df1', c) - df2_result = store.select('df2', c) + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = store.select("df1", c) + df2_result = store.select("df2", c) result = concat([df1_result, df2_result], axis=1) expected = concat([df1, df2], axis=1) @@ -3505,63 +3624,64 @@ def test_coordinates(self): # pass array/mask as the coordinates with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(1000, 2), - index=date_range('20000101', periods=1000)) - store.append('df', df) - c = store.select_column('df', 'index') + df = DataFrame( + np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") where = c[DatetimeIndex(c).month == 5].index expected = df.iloc[where] # locations - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # boolean - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # invalid with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df), dtype='float64')) + store.select("df", where=np.arange(len(df), dtype="float64")) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df) + 1)) + store.select("df", where=np.arange(len(df) + 1)) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df)), start=5) + store.select("df", where=np.arange(len(df)), start=5) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df)), start=5, stop=10) + store.select("df", where=np.arange(len(df)), start=5, stop=10) # selection with filter - selection = date_range('20000101', periods=500) - result = store.select('df', where='index in selection') + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") expected = df[df.index.isin(selection)] tm.assert_frame_equal(result, expected) # list df = DataFrame(np.random.randn(10, 2)) - store.append('df2', df) - result = store.select('df2', where=[0, 3, 5]) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) expected = df.iloc[[0, 3, 5]] tm.assert_frame_equal(result, expected) # boolean where = [True] * 10 where[-2] = False - result = store.select('df2', where=where) + result = store.select("df2", where=where) expected = df.loc[where] tm.assert_frame_equal(result, expected) # start/stop - result = 
store.select('df2', start=5, stop=10) + result = store.select("df2", start=5, stop=10) expected = df[5:10] tm.assert_frame_equal(result, expected) def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' + df2["foo"] = "bar" df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: @@ -3569,148 +3689,154 @@ def test_append_to_multiple(self): # exceptions with pytest.raises(ValueError): store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df3') + {"df1": ["A", "B"], "df2": None}, df, selector="df3" + ) with pytest.raises(ValueError): - store.append_to_multiple( - {'df1': None, 'df2': None}, df, selector='df3') + store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") with pytest.raises(ValueError): - store.append_to_multiple('df1', df, 'df1') + store.append_to_multiple("df1", df, "df1") # regular operation store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1') + {"df1": ["A", "B"], "df2": None}, df, selector="df1" + ) result = store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) expected = df[(df.A > 0) & (df.B > 0)] tm.assert_frame_equal(result, expected) def test_append_to_multiple_dropna(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: # dropna=True should guarantee rows are synchronized store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', - dropna=True) - result = store.select_as_multiple(['df1', 'df2']) + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) expected = df.dropna() tm.assert_frame_equal(result, expected) - tm.assert_index_equal(store.select('df1').index, - store.select('df2').index) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) - @pytest.mark.xfail(run=False, - reason="append_to_multiple_dropna_false " - "is not raising as failed") + @pytest.mark.xfail( + run=False, reason="append_to_multiple_dropna_false " "is not raising as failed" + ) def test_append_to_multiple_dropna_false(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: # dropna=False shouldn't synchronize row indexes store.append_to_multiple( - {'df1a': ['A', 'B'], 'df2a': None}, df, selector='df1a', - dropna=False) + {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False + ) with pytest.raises(ValueError): - store.select_as_multiple(['df1a', 'df2a']) + store.select_as_multiple(["df1a", "df2a"]) - assert not store.select('df1a').index.equals( - store.select('df2a').index) + assert not store.select("df1a").index.equals(store.select("df2a").index) def test_select_as_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' + df2["foo"] = "bar" with ensure_clean_store(self.path) as store: # no tables stored with pytest.raises(Exception): - 
store.select_as_multiple( - None, where=['A>0', 'B>0'], selector='df1') + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - store.append('df1', df1, data_columns=['A', 'B']) - store.append('df2', df2) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) # exceptions with pytest.raises(Exception): - store.select_as_multiple(None, where=['A>0', 'B>0'], - selector='df1') + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") with pytest.raises(Exception): - store.select_as_multiple([None], where=['A>0', 'B>0'], - selector='df1') + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") with pytest.raises(KeyError): store.select_as_multiple( - ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) with pytest.raises(KeyError): - store.select_as_multiple( - ['df3'], where=['A>0', 'B>0'], selector='df1') + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") with pytest.raises(KeyError): store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df4') + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) # default select - result = store.select('df1', ['A>0', 'B>0']) + result = store.select("df1", ["A>0", "B>0"]) expected = store.select_as_multiple( - ['df1'], where=['A>0', 'B>0'], selector='df1') + ["df1"], where=["A>0", "B>0"], selector="df1" + ) tm.assert_frame_equal(result, expected) expected = store.select_as_multiple( - 'df1', where=['A>0', 'B>0'], selector='df1') + "df1", where=["A>0", "B>0"], selector="df1" + ) tm.assert_frame_equal(result, expected) # multiple result = store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) # multiple (diff selector) result = store.select_as_multiple( - ['df1', 'df2'], where='index>df2.index[4]', selector='df2') + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) # test exception for diff rows - store.append('df3', tm.makeTimeDataFrame(nper=50)) + store.append("df3", tm.makeTimeDataFrame(nper=50)) with pytest.raises(ValueError): store.select_as_multiple( - ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) @pytest.mark.skipif( - LooseVersion(tables.__version__) < LooseVersion('3.1.0'), - reason=("tables version does not support fix for nan selection " - "bug: GH 4858")) + LooseVersion(tables.__version__) < LooseVersion("3.1.0"), + reason=( + "tables version does not support fix for nan selection " "bug: GH 4858" + ), + ) def test_nan_selection_bug_4858(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(cols=range(6), values=range(6)), - dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) df.iloc[0] = np.nan - expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'], values=[ - 3., 4., 5.]), index=[3, 4, 5]) + expected = DataFrame( + dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), + index=[3, 4, 5], + ) # write w/o the index on that particular column - store.append('df', df, data_columns=True, index=['cols']) - result = 
store.select('df', where='values>2.0') + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") assert_frame_equal(result, expected) def test_start_stop_table(self): @@ -3719,18 +3845,16 @@ def test_start_stop_table(self): # table df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - store.append('df', df) + store.append("df", df) - result = store.select( - 'df', "columns=['A']", start=0, stop=5) - expected = df.loc[0:4, ['A']] + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] tm.assert_frame_equal(result, expected) # out of range - result = store.select( - 'df', "columns=['A']", start=30, stop=40) + result = store.select("df", "columns=['A']", start=30, stop=40) assert len(result) == 0 - expected = df.loc[30:40, ['A']] + expected = df.loc[30:40, ["A"]] tm.assert_frame_equal(result, expected) def test_start_stop_multiple(self): @@ -3740,12 +3864,13 @@ def test_start_stop_multiple(self): df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - store.append_to_multiple({'selector': ['foo'], 'data': None}, df, - selector='selector') - result = store.select_as_multiple(['selector', 'data'], - selector='selector', start=0, - stop=1) - expected = df.loc[[0], ['foo', 'bar']] + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" + ) + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 + ) + expected = df.loc[[0], ["foo", "bar"]] tm.assert_frame_equal(result, expected) @ignore_sparse @@ -3755,35 +3880,33 @@ def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: # fixed, GH 8287 - df = DataFrame(dict(A=np.random.rand(20), - B=np.random.rand(20)), - index=pd.date_range('20130101', periods=20)) - store.put('df', df) + df = DataFrame( + dict(A=np.random.rand(20), B=np.random.rand(20)), + index=pd.date_range("20130101", periods=20), + ) + store.put("df", df) - result = store.select( - 'df', start=0, stop=5) + result = store.select("df", start=0, stop=5) expected = df.iloc[0:5, :] tm.assert_frame_equal(result, expected) - result = store.select( - 'df', start=5, stop=10) + result = store.select("df", start=5, stop=10) expected = df.iloc[5:10, :] tm.assert_frame_equal(result, expected) # out of range - result = store.select( - 'df', start=30, stop=40) + result = store.select("df", start=30, stop=40) expected = df.iloc[30:40, :] tm.assert_frame_equal(result, expected) # series s = df.A - store.put('s', s) - result = store.select('s', start=0, stop=5) + store.put("s", s) + result = store.select("s", start=0, stop=5) expected = s.iloc[0:5] tm.assert_series_equal(result, expected) - result = store.select('s', start=5, stop=10) + result = store.select("s", start=5, stop=10) expected = s.iloc[5:10] tm.assert_series_equal(result, expected) @@ -3792,46 +3915,50 @@ def test_start_stop_fixed(self): df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan dfs = df.to_sparse() - store.put('dfs', dfs) + store.put("dfs", dfs) with pytest.raises(NotImplementedError): - store.select('dfs', start=0, stop=5) + store.select("dfs", start=0, stop=5) def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) - df.index = ['%.3d' % c for c in df.index] - df.columns = ['%.3d' % c for c in df.columns] + df.index = ["%.3d" % c for c in df.index] + df.columns = ["%.3d" % c for c in df.columns] with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') + store.put("frame", df, 
format="table") - crit = 'columns=df.columns[:75]' - result = store.select('frame', [crit]) + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - crit = 'columns=df.columns[:75:2]' - result = store.select('frame', [crit]) + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) def test_path_pathlib(self): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, 'df'), - lambda p: pd.read_hdf(p, 'df')) + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) tm.assert_frame_equal(df, result) - @pytest.mark.parametrize('start, stop', [(0, 2), (1, 2), (None, None)]) + @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) def test_contiguous_mixed_data_table(self, start, stop): # GH 17021 # ValueError when reading a contiguous mixed-data table ft. VLArray - df = DataFrame({'a': Series([20111010, 20111011, 20111012]), - 'b': Series(['ab', 'cd', 'ab'])}) + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) with ensure_clean_store(self.path) as store: - store.append('test_dataset', df) + store.append("test_dataset", df) - result = store.select('test_dataset', start=start, stop=stop) + result = store.select("test_dataset", start=start, stop=stop) assert_frame_equal(df[start:stop], result) def test_path_pathlib_hdfstore(self): @@ -3839,11 +3966,11 @@ def test_path_pathlib_hdfstore(self): def writer(path): with pd.HDFStore(path) as store: - df.to_hdf(store, 'df') + df.to_hdf(store, "df") def reader(path): with pd.HDFStore(path) as store: - return pd.read_hdf(store, 'df') + return pd.read_hdf(store, "df") result = tm.round_trip_pathlib(writer, reader) tm.assert_frame_equal(df, result) @@ -3851,8 +3978,8 @@ def reader(path): def test_pickle_path_localpath(self): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, 'df'), - lambda p: pd.read_hdf(p, 'df')) + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) tm.assert_frame_equal(df, result) def test_path_localpath_hdfstore(self): @@ -3860,11 +3987,11 @@ def test_path_localpath_hdfstore(self): def writer(path): with pd.HDFStore(path) as store: - df.to_hdf(store, 'df') + df.to_hdf(store, "df") def reader(path): with pd.HDFStore(path) as store: - return pd.read_hdf(store, 'df') + return pd.read_hdf(store, "df") result = tm.round_trip_localpath(writer, reader) tm.assert_frame_equal(df, result) @@ -3873,35 +4000,34 @@ def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: - options['complib'] = _default_compressor + options["complib"] = _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store['obj'] = obj - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] comparator(retrieved, obj, **kwargs) - def _check_double_roundtrip(self, obj, comparator, compression=False, - **kwargs): + def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: - options['complib'] = compression or _default_compressor + options["complib"] = compression or _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store['obj'] = obj - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + 
store["obj"] = obj + retrieved = store["obj"] comparator(retrieved, obj, **kwargs) - store['obj'] = retrieved - again = store['obj'] + store["obj"] = retrieved + again = store["obj"] comparator(again, obj, **kwargs) def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: - options['complib'] = _default_compressor + options["complib"] = _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store.put('obj', obj, format='table') - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + store.put("obj", obj, format="table") + retrieved = store["obj"] comparator(retrieved, obj) @@ -3911,15 +4037,15 @@ def test_multiple_open_close(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', mode='w', format='table') + df.to_hdf(path, "df", mode="w", format="table") # single store = HDFStore(path) - assert 'CLOSED' not in store.info() + assert "CLOSED" not in store.info() assert store.is_open store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open with ensure_clean_path(self.path) as path: @@ -3939,55 +4065,55 @@ def test_multiple_open_close(self): store1 = HDFStore(path) store2 = HDFStore(path) - assert 'CLOSED' not in store1.info() - assert 'CLOSED' not in store2.info() + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() assert store1.is_open assert store2.is_open store1.close() - assert 'CLOSED' in store1.info() + assert "CLOSED" in store1.info() assert not store1.is_open - assert 'CLOSED' not in store2.info() + assert "CLOSED" not in store2.info() assert store2.is_open store2.close() - assert 'CLOSED' in store1.info() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() assert not store1.is_open assert not store2.is_open # nested close - store = HDFStore(path, mode='w') - store.append('df', df) + store = HDFStore(path, mode="w") + store.append("df", df) store2 = HDFStore(path) - store2.append('df2', df) + store2.append("df2", df) store2.close() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store2.info() assert not store2.is_open store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open # double closing - store = HDFStore(path, mode='w') - store.append('df', df) + store = HDFStore(path, mode="w") + store.append("df", df) store2 = HDFStore(path) store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open store2.close() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store2.info() assert not store2.is_open # ops on a closed store with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', mode='w', format='table') + df.to_hdf(path, "df", mode="w", format="table") store = HDFStore(path) store.close() @@ -3996,53 +4122,54 @@ def test_multiple_open_close(self): store.keys() with pytest.raises(ClosedFileError): - 'df' in store + "df" in store with pytest.raises(ClosedFileError): len(store) with pytest.raises(ClosedFileError): - store['df'] + store["df"] with pytest.raises(AttributeError): store.df with pytest.raises(ClosedFileError): - store.select('df') + store.select("df") with pytest.raises(ClosedFileError): - store.get('df') + store.get("df") with pytest.raises(ClosedFileError): - store.append('df2', df) + store.append("df2", df) with pytest.raises(ClosedFileError): - store.put('df3', 
df) + store.put("df3", df) with pytest.raises(ClosedFileError): - store.get_storer('df2') + store.get_storer("df2") with pytest.raises(ClosedFileError): - store.remove('df2') + store.remove("df2") - with pytest.raises(ClosedFileError, match='file is not open'): - store.select('df') + with pytest.raises(ClosedFileError, match="file is not open"): + store.select("df") def test_pytables_native_read(self, datapath): with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), - mode='r') as store: - d2 = store['detector/readout'] + datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" + ) as store: + d2 = store["detector/readout"] assert isinstance(d2, DataFrame) - @pytest.mark.skipif(is_platform_windows(), - reason="native2 read fails oddly on windows") + @pytest.mark.skipif( + is_platform_windows(), reason="native2 read fails oddly on windows" + ) def test_pytables_native2_read(self, datapath): with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), - mode='r') as store: + datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" + ) as store: str(store) - d1 = store['detector'] + d1 = store["detector"] assert isinstance(d1, DataFrame) @xfail_non_writeable @@ -4050,46 +4177,43 @@ def test_legacy_table_fixed_format_read_py2(self, datapath): # GH 24510 # legacy table with fixed format written in Python 2 with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'legacy_table_fixed_py2.h5'), - mode='r') as store: - result = store.select('df') - expected = pd.DataFrame([[1, 2, 3, 'D']], - columns=['A', 'B', 'C', 'D'], - index=pd.Index(['ABC'], - name='INDEX_NAME')) + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" + ) as store: + result = store.select("df") + expected = pd.DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=pd.Index(["ABC"], name="INDEX_NAME"), + ) assert_frame_equal(expected, result) def test_legacy_table_read_py2(self, datapath): # issue: 24925 # legacy table written in Python 2 with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'legacy_table_py2.h5'), - mode='r') as store: - result = store.select('table') - - expected = pd.DataFrame({ - "a": ["a", "b"], - "b": [2, 3] - }) + datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" + ) as store: + result = store.select("table") + + expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) assert_frame_equal(expected, result) def test_copy(self): with catch_warnings(record=True): - def do_copy(f, new_f=None, keys=None, - propindexes=True, **kwargs): + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): try: - store = HDFStore(f, 'r') + store = HDFStore(f, "r") if new_f is None: import tempfile + fd, new_f = tempfile.mkstemp() tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs) + new_f, keys=keys, propindexes=propindexes, **kwargs + ) # check keys if keys is None: @@ -4125,7 +4249,7 @@ def do_copy(f, new_f=None, keys=None, try: path = create_tempfile(self.path) st = HDFStore(path) - st.append('df', df, data_columns=['A']) + st.append("df", df, data_columns=["A"]) st.close() do_copy(f=path) do_copy(f=path, propindexes=False) @@ -4137,16 +4261,16 @@ def test_store_datetime_fractional_secs(self): with ensure_clean_store(self.path) as store: dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) - store['a'] = series - assert store['a'].index[0] == dt + store["a"] = series + assert store["a"].index[0] == dt def 
test_tseries_indices_series(self): with ensure_clean_store(self.path) as store: idx = tm.makeDateIndex(10) ser = Series(np.random.randn(len(idx)), idx) - store['a'] = ser - result = store['a'] + store["a"] = ser + result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq @@ -4154,8 +4278,8 @@ def test_tseries_indices_series(self): idx = tm.makePeriodIndex(10) ser = Series(np.random.randn(len(idx)), idx) - store['a'] = ser - result = store['a'] + store["a"] = ser + result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq @@ -4166,27 +4290,25 @@ def test_tseries_indices_frame(self): with ensure_clean_store(self.path) as store: idx = tm.makeDateIndex(10) df = DataFrame(np.random.randn(len(idx), 3), index=idx) - store['a'] = df - result = store['a'] + store["a"] = df + result = store["a"] assert_frame_equal(result, df) assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, - obj="dataframe index") + tm.assert_class_equal(result.index, df.index, obj="dataframe index") idx = tm.makePeriodIndex(10) df = DataFrame(np.random.randn(len(idx), 3), idx) - store['a'] = df - result = store['a'] + store["a"] = df + result = store["a"] assert_frame_equal(result, df) assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, - obj="dataframe index") + tm.assert_class_equal(result.index, df.index, obj="dataframe index") def test_unicode_index(self): - unicode_values = ['\u03c3', '\u03c3\u03c3'] + unicode_values = ["\u03c3", "\u03c3\u03c3"] # PerformanceWarning with catch_warnings(record=True): @@ -4196,26 +4318,25 @@ def test_unicode_index(self): def test_unicode_longer_encoded(self): # GH 11234 - char = '\u0394' - df = pd.DataFrame({'A': [char]}) + char = "\u0394" + df = pd.DataFrame({"A": [char]}) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', encoding='utf-8') - result = store.get('df') + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") tm.assert_frame_equal(result, df) - df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']}) + df = pd.DataFrame({"A": ["a", char], "B": ["b", "b"]}) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', encoding='utf-8') - result = store.get('df') + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") tm.assert_frame_equal(result, df) @xfail_non_writeable def test_store_datetime_mixed(self): - df = DataFrame( - {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']}) + df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) ts = tm.makeTimeSeries() - df['d'] = ts.index[:3] + df["d"] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) # def test_cant_write_multiindex_table(self): @@ -4229,13 +4350,13 @@ def test_store_datetime_mixed(self): def test_append_with_diff_col_name_types_raises_value_error(self): df = DataFrame(np.random.randn(10, 1)) - df2 = DataFrame({'a': np.random.randn(10)}) + df2 = DataFrame({"a": np.random.randn(10)}) df3 = DataFrame({(1, 2): np.random.randn(10)}) - df4 = DataFrame({('1', 2): np.random.randn(10)}) - df5 = DataFrame({('1', 2, object): np.random.randn(10)}) + df4 = DataFrame({("1", 2): np.random.randn(10)}) + df5 = DataFrame({("1", 2, object): np.random.randn(10)}) with ensure_clean_store(self.path) as store: - name = 'df_%s' % tm.rands(10) + name = "df_%s" % tm.rands(10) store.append(name, df) for d in (df2, df3, df4, df5): @@ -4243,13 
+4364,16 @@ def test_append_with_diff_col_name_types_raises_value_error(self): store.append(name, d) def test_query_with_nested_special_character(self): - df = DataFrame({'a': ['a', 'a', 'c', 'b', - 'test & test', 'c', 'b', 'e'], - 'b': [1, 2, 3, 4, 5, 6, 7, 8]}) - expected = df[df.a == 'test & test'] + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] with ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) - result = store.select('test', 'a = "test & test"') + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') tm.assert_frame_equal(expected, result) def test_categorical(self): @@ -4257,111 +4381,126 @@ def test_categorical(self): with ensure_clean_store(self.path) as store: # Basic - _maybe_remove(store, 's') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], ordered=False)) - store.append('s', s, format='table') - result = store.select('s') + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") tm.assert_series_equal(s, result) - _maybe_remove(store, 's_ordered') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], ordered=True)) - store.append('s_ordered', s, format='table') - result = store.select('s_ordered') + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") tm.assert_series_equal(s, result) - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) - store.append('df', df, format='table') - result = store.select('df') + store.append("df", df, format="table") + result = store.select("df") tm.assert_frame_equal(result, df) # Dtypes - _maybe_remove(store, 'si') - s = Series([1, 1, 2, 2, 3, 4, 5]).astype('category') - store.append('si', s) - result = store.select('si') + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") tm.assert_series_equal(result, s) - _maybe_remove(store, 'si2') - s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype('category') - store.append('si2', s) - result = store.select('si2') + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") tm.assert_series_equal(result, s) # Multiple - _maybe_remove(store, 'df2') + _maybe_remove(store, "df2") df2 = df.copy() - df2['s2'] = Series(list('abcdefg')).astype('category') - store.append('df2', df2) - result = store.select('df2') + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") tm.assert_frame_equal(result, df2) # Make sure the metadata is OK info = store.info() - assert '/df2 ' in info + assert "/df2 " in info # assert '/df2/meta/values_block_0/meta' in info - assert '/df2/meta/values_block_1/meta' in info + assert "/df2/meta/values_block_1/meta" in info # unordered - _maybe_remove(store, 's2') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], 
ordered=False)) - store.append('s2', s, format='table') - result = store.select('s2') + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + result = store.select("s2") tm.assert_series_equal(result, s) # Query - _maybe_remove(store, 'df3') - store.append('df3', df, data_columns=['s']) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s in ["b","c"]']) + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s = ["b","c"]']) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s = ["b","c"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['d'])] - result = store.select('df3', where=['s in ["d"]']) + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['f'])] - result = store.select('df3', where=['s in ["f"]']) + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) tm.assert_frame_equal(result, expected) # Appending with same categories is ok - store.append('df3', df) + store.append("df3", df) df = concat([df, df]) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s in ["b","c"]']) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) # Appending must have the same categories df3 = df.copy() - df3['s'].cat.remove_unused_categories(inplace=True) + df3["s"].cat.remove_unused_categories(inplace=True) with pytest.raises(ValueError): - store.append('df3', df3) + store.append("df3", df3) # Remove, and make sure meta data is removed (its a recursive # removal so should be). - result = store.select('df3/meta/s/meta') + result = store.select("df3/meta/s/meta") assert result is not None - store.remove('df3') + store.remove("df3") with pytest.raises(KeyError): - store.select('df3/meta/s/meta') + store.select("df3/meta/s/meta") def test_categorical_conversion(self): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if # where criteria isn't met. 
- obsids = ['ESP_012345_6789', 'ESP_987654_3210'] - imgids = ['APF00006np', 'APF0001imm'] + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] data = [4.3, 9.8] # Test without categories @@ -4370,38 +4509,40 @@ def test_categorical_conversion(self): # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df', where='obsids=B') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) # Test with categories - df.obsids = df.obsids.astype('category') - df.imgids = df.imgids.astype('category') + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df', where='obsids=B') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) def test_categorical_nan_only_columns(self): # GH18413 # Check that read_hdf with categorical columns with NaN-only values can # be read back. - df = pd.DataFrame({ - 'a': ['a', 'b', 'c', np.nan], - 'b': [np.nan, np.nan, np.nan, np.nan], - 'c': [1, 2, 3, 4], - 'd': pd.Series([None] * 4, dtype=object) - }) - df['a'] = df.a.astype('category') - df['b'] = df.b.astype('category') - df['d'] = df.b.astype('category') + df = pd.DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": pd.Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") expected = df with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) def test_duplicate_column_name(self): @@ -4409,10 +4550,10 @@ def test_duplicate_column_name(self): with ensure_clean_path(self.path) as path: with pytest.raises(ValueError): - df.to_hdf(path, 'df', format='fixed') + df.to_hdf(path, "df", format="fixed") - df.to_hdf(path, 'df', format='table') - other = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) @@ -4423,8 +4564,8 @@ def test_round_trip_equals(self): df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - other = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) @@ -4434,35 +4575,35 @@ def test_preserve_timedeltaindex_type(self): # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. 
df = DataFrame(np.random.normal(size=(10, 5))) - df.index = timedelta_range( - start='0s', periods=10, freq='1s', name='example') + df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") with ensure_clean_store(self.path) as store: - store['df'] = df - assert_frame_equal(store['df'], df) + store["df"] = df + assert_frame_equal(store["df"], df) def test_columns_multiindex_modified(self): # BUG: 7212 # read_hdf store.select modified the passed columns parameters # when multi-indexed. - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) data_columns = df.index.names + df.columns.tolist() with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', - mode='a', - append=True, - data_columns=data_columns, - index=False) - cols2load = list('BCD') + df.to_hdf( + path, + "df", + mode="a", + append=True, + data_columns=data_columns, + index=False, + ) + cols2load = list("BCD") cols2load_original = list(cols2load) - df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa + df_loaded = read_hdf(path, "df", columns=cols2load) # noqa assert cols2load_original == cols2load @ignore_natural_naming_warning @@ -4471,11 +4612,18 @@ def test_to_hdf_with_object_column_names(self): # Writing HDF5 table format should only work for string-like # column types - types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex] - types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex, - tm.makeUnicodeIndex] + types_should_fail = [ + tm.makeIntIndex, + tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + tm.makeUnicodeIndex, + ] for index in types_should_fail: df = DataFrame(np.random.randn(10, 2), columns=index(2)) @@ -4483,107 +4631,97 @@ def test_to_hdf_with_object_column_names(self): with catch_warnings(record=True): msg = "cannot have non-object label DataIndexableCol" with pytest.raises(ValueError, match=msg): - df.to_hdf(path, 'df', format='table', - data_columns=True) + df.to_hdf(path, "df", format="table", data_columns=True) for index in types_should_run: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: with catch_warnings(record=True): - df.to_hdf(path, 'df', format='table', data_columns=True) + df.to_hdf(path, "df", format="table", data_columns=True) result = pd.read_hdf( - path, 'df', where="index = [{0}]".format(df.index[0])) - assert(len(result)) + path, "df", where="index = [{0}]".format(df.index[0]) + ) + assert len(result) def test_read_hdf_open_store(self): # GH10330 # No check for non-string path_or-buf, and no test of open store - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w') - direct = read_hdf(path, 'df') - store = HDFStore(path, mode='r') - indirect = read_hdf(store, 'df') + df.to_hdf(path, "df", mode="w") + direct = read_hdf(path, "df") + store = 
HDFStore(path, mode="r") + indirect = read_hdf(store, "df") tm.assert_frame_equal(direct, indirect) assert store.is_open store.close() def test_read_hdf_iterator(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w', format='t') - direct = read_hdf(path, 'df') - iterator = read_hdf(path, 'df', iterator=True) + df.to_hdf(path, "df", mode="w", format="t") + direct = read_hdf(path, "df") + iterator = read_hdf(path, "df", iterator=True) assert isinstance(iterator, TableIterator) indirect = next(iterator.__iter__()) tm.assert_frame_equal(direct, indirect) iterator.store.close() def test_read_hdf_errors(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with ensure_clean_path(self.path) as path: with pytest.raises(IOError): - read_hdf(path, 'key') + read_hdf(path, "key") - df.to_hdf(path, 'df') - store = HDFStore(path, mode='r') + df.to_hdf(path, "df") + store = HDFStore(path, mode="r") store.close() with pytest.raises(IOError): - read_hdf(store, 'df') + read_hdf(store, "df") def test_read_hdf_generic_buffer_errors(self): with pytest.raises(NotImplementedError): - read_hdf(BytesIO(b''), 'df') + read_hdf(BytesIO(b""), "df") def test_invalid_complib(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with ensure_clean_path(self.path) as path: with pytest.raises(ValueError): - df.to_hdf(path, 'df', complib='foolib') + df.to_hdf(path, "df", complib="foolib") + # GH10443 def test_read_nokey(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) # Categorical dtype not supported for "fixed" format. So no need # to test with that dtype in the dataframe here. 
with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='a') + df.to_hdf(path, "df", mode="a") reread = read_hdf(path) assert_frame_equal(df, reread) - df.to_hdf(path, 'df2', mode='a') + df.to_hdf(path, "df2", mode="a") with pytest.raises(ValueError): read_hdf(path) def test_read_nokey_table(self): # GH13231 - df = DataFrame({'i': range(5), - 'c': Series(list('abacd'), dtype='category')}) + df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='a', format='table') + df.to_hdf(path, "df", mode="a", format="table") reread = read_hdf(path) assert_frame_equal(df, reread) - df.to_hdf(path, 'df2', mode='a', format='table') + df.to_hdf(path, "df2", mode="a", format="table") with pytest.raises(ValueError): read_hdf(path) @@ -4596,121 +4734,123 @@ def test_read_nokey_empty(self): with pytest.raises(ValueError): read_hdf(path) - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_read_from_pathlib_path(self): # GH11773 from pathlib import Path - expected = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) with ensure_clean_path(self.path) as filename: path_obj = Path(filename) - expected.to_hdf(path_obj, 'df', mode='a') - actual = read_hdf(path_obj, 'df') + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_read_from_py_localpath(self): # GH11773 from py.path import local as LocalPath - expected = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) with ensure_clean_path(self.path) as filename: path_obj = LocalPath(filename) - expected.to_hdf(path_obj, 'df', mode='a') - actual = read_hdf(path_obj, 'df') + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) def test_query_long_float_literal(self): # GH 14241 - df = pd.DataFrame({'A': [1000000000.0009, - 1000000000.0011, - 1000000000.0015]}) + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) with ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) + store.append("test", df, format="table", data_columns=True) cutoff = 1000000000.0006 - result = store.select('test', "A < %.4f" % cutoff) + result = store.select("test", "A < %.4f" % cutoff) assert result.empty cutoff = 1000000000.0010 - result = store.select('test', "A > %.4f" % cutoff) + result = store.select("test", "A > %.4f" % cutoff) expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = store.select('test', 'A == %.4f' % exact) + result = store.select("test", "A == %.4f" % exact) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) def test_query_compare_column_type(self): # GH 15492 - df = pd.DataFrame({'date': ['2014-01-01', '2014-01-02'], - 'real_date': date_range('2014-01-01', periods=2), - 'float': [1.1, 1.2], - 'int': [1, 2]}, - columns=['date', 'real_date', 'float', 'int']) + df = pd.DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) with 
ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) + store.append("test", df, format="table", data_columns=True) - ts = pd.Timestamp('2014-01-01') # noqa - result = store.select('test', where='real_date > ts') + ts = pd.Timestamp("2014-01-01") # noqa + result = store.select("test", where="real_date > ts") expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) - for op in ['<', '>', '==']: + for op in ["<", ">", "=="]: # non strings to string column always fail - for v in [2.1, True, pd.Timestamp('2014-01-01'), - pd.Timedelta(1, 's')]: - query = 'date {op} v'.format(op=op) + for v in [2.1, True, pd.Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = "date {op} v".format(op=op) with pytest.raises(TypeError): - store.select('test', where=query) + store.select("test", where=query) # strings to other columns must be convertible to type - v = 'a' - for col in ['int', 'float', 'real_date']: - query = '{col} {op} v'.format(op=op, col=col) + v = "a" + for col in ["int", "float", "real_date"]: + query = "{col} {op} v".format(op=op, col=col) with pytest.raises(ValueError): - store.select('test', where=query) + store.select("test", where=query) - for v, col in zip(['1', '1.1', '2014-01-01'], - ['int', 'float', 'real_date']): - query = '{col} {op} v'.format(op=op, col=col) - result = store.select('test', where=query) + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = "{col} {op} v".format(op=op, col=col) + result = store.select("test", where=query) - if op == '==': + if op == "==": expected = df.loc[[0], :] - elif op == '>': + elif op == ">": expected = df.loc[[1], :] else: expected = df.loc[[], :] tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('format', ['fixed', 'table']) + @pytest.mark.parametrize("format", ["fixed", "table"]) def test_read_hdf_series_mode_r(self, format): # GH 16583 # Tests that reading a Series saved to an HDF file # still works if a mode='r' argument is supplied series = tm.makeFloatSeries() with ensure_clean_path(self.path) as path: - series.to_hdf(path, key='data', format=format) - result = pd.read_hdf(path, key='data', mode='r') + series.to_hdf(path, key="data", format=format) + result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) @pytest.mark.skipif(not PY36, reason="Need python 3.6") def test_fspath(self): - with tm.ensure_clean('foo.h5') as path: + with tm.ensure_clean("foo.h5") as path: with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) @@ -4725,17 +4865,21 @@ def test_read_py2_hdf_file_in_py3(self, datapath): # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - expected = pd.DataFrame([1., 2, 3], index=pd.PeriodIndex( - ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) + expected = pd.DataFrame( + [1.0, 2, 3], + index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), + ) with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), - mode='r') as store: - result = store['p'] + datapath( + "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" + ), + mode="r", + ) as store: + result = store["p"] assert_frame_equal(result, expected) - @pytest.mark.parametrize("where", ["", (), (None, ), [], [None]]) + @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) def test_select_empty_where(self, where): # GH26610 @@ -4750,118 
+4894,139 @@ def test_select_empty_where(self, where): result = pd.read_hdf(store, "df", where=where) assert_frame_equal(result, df) - @pytest.mark.parametrize('idx', [ - date_range('2019', freq='D', periods=3, tz='UTC'), - CategoricalIndex(list('abc')) - ]) + @pytest.mark.parametrize( + "idx", + [ + date_range("2019", freq="D", periods=3, tz="UTC"), + CategoricalIndex(list("abc")), + ], + ) def test_to_hdf_multiindex_extension_dtype(self, idx): # GH 7775 mi = MultiIndex.from_arrays([idx, idx]) - df = pd.DataFrame(0, index=mi, columns=['a']) + df = pd.DataFrame(0, index=mi, columns=["a"]) with ensure_clean_path(self.path) as path: - with pytest.raises(NotImplementedError, - match="Saving a MultiIndex"): - df.to_hdf(path, 'df') + with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): + df.to_hdf(path, "df") class TestHDFComplexValues(Base): # GH10447 def test_complex_fixed(self): - df = DataFrame(np.random.rand(4, 5).astype(np.complex64), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) - df = DataFrame(np.random.rand(4, 5).astype(np.complex128), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_table(self): - df = DataFrame(np.random.rand(4, 5).astype(np.complex64), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) - df = DataFrame(np.random.rand(4, 5).astype(np.complex128), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', mode='w') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) @xfail_non_writeable def test_complex_mixed_fixed(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, - 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex64, - 'D': complex128, - 'E': [1.0, 2.0, 3.0, 4.0]}, - index=list('abcd')) + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + 
reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_mixed_table(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, - 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex64, - 'D': complex128, - 'E': [1.0, 2.0, 3.0, 4.0]}, - index=list('abcd')) + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['A', 'B']) - result = store.select('df', where='A>2') + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") assert_frame_equal(df.loc[df.A > 2], result) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_across_dimensions_fixed(self): with catch_warnings(record=True): - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) - df = DataFrame({'A': s, 'B': s}) + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) objs = [s, df] comps = [tm.assert_series_equal, tm.assert_frame_equal] for obj, comp in zip(objs, comps): with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='fixed') - reread = read_hdf(path, 'obj') + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") comp(obj, reread) def test_complex_across_dimensions(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) - df = DataFrame({'A': s, 'B': s}) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) with catch_warnings(record=True): @@ -4869,47 +5034,48 @@ def test_complex_across_dimensions(self): comps = [tm.assert_frame_equal] for obj, comp in zip(objs, comps): with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='table') - reread = read_hdf(path, 'obj') + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") comp(obj, reread) def test_complex_indexing_error(self): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex128}, - index=list('abcd')) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) with ensure_clean_store(self.path) as store: with pytest.raises(TypeError): - store.append('df', df, data_columns=['C']) + store.append("df", df, data_columns=["C"]) def test_complex_series_error(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) + s = Series(complex128, index=list("abcd")) with ensure_clean_path(self.path) as path: with 
pytest.raises(TypeError): - s.to_hdf(path, 'obj', format='t') + s.to_hdf(path, "obj", format="t") with ensure_clean_path(self.path) as path: - s.to_hdf(path, 'obj', format='t', index=False) - reread = read_hdf(path, 'obj') + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") tm.assert_series_equal(s, reread) def test_complex_append(self): - df = DataFrame({'a': np.random.randn(100).astype(np.complex128), - 'b': np.random.randn(100)}) + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['b']) - store.append('df', df) - result = store.select('df') + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") assert_frame_equal(pd.concat([df, df], 0), result) class TestTimezones(Base): - def _compare_with_tz(self, a, b): tm.assert_frame_equal(a, b) @@ -4919,8 +5085,7 @@ def _compare_with_tz(self, a, b): a_e = a.loc[i, c] b_e = b.loc[i, c] if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError( - "invalid tz comparison [%s] [%s]" % (a_e, b_e)) + raise AssertionError("invalid tz comparison [%s] [%s]" % (a_e, b_e)) def test_append_with_timezones_dateutil(self): @@ -4929,74 +5094,98 @@ def test_append_with_timezones_dateutil(self): # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows # filename issues. from pandas._libs.tslibs.timezones import maybe_get_tz - gettz = lambda x: maybe_get_tz('dateutil/' + x) + + gettz = lambda x: maybe_get_tz("dateutil/" + x) # as columns with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', tz=gettz( - 'US/Eastern')) + timedelta(hours=1) * i for i in range(5)])) - - store.append('df_tz', df, data_columns=['A']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # select with tz aware expected = df[df.A >= df.A[3]] - result = store.select('df_tz', where='A>=df.A[3]') + result = store.select("df_tz", where="A>=df.A[3]") self._compare_with_tz(result, expected) # ensure we include dates in DST and STD time here. 
- _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130603', - tz=gettz('US/Eastern'))), - index=range(5)) - store.append('df_tz', df) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130603", tz=gettz("US/Eastern")), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130102', tz=gettz('EET'))), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("EET")), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz', df, data_columns=['A', 'B']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130102', tz=gettz('CET'))), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("CET")), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=date_range( - '2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) - - _maybe_remove(store, 'df') - store.put('df', df) - result = store.select('df') + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") assert_frame_equal(result, df) - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df') + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_append_with_timezones_pytz(self): @@ -5006,65 +5195,91 @@ def test_append_with_timezones_pytz(self): # as columns with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', - tz='US/Eastern') + - timedelta(hours=1) * i - for i in range(5)])) - store.append('df_tz', df, data_columns=['A']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz="US/Eastern") + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # select with tz aware - self._compare_with_tz(store.select( - 'df_tz', where='A>=df.A[3]'), df[df.A >= df.A[3]]) + self._compare_with_tz( + store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] + ) - _maybe_remove(store, 'df_tz') + _maybe_remove(store, "df_tz") # ensure we include dates in DST and STD time here. 
- df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='US/Eastern')), - index=range(5)) - store.append('df_tz', df) - result = store['df_tz'] + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="US/Eastern"), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) - df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130102', tz='EET')), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="EET"), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz', df, data_columns=['A', 'B']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130102', tz='CET')), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="CET"), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=date_range( - '2000-1-1', periods=3, freq='H', tz='US/Eastern')))) - - _maybe_remove(store, 'df') - store.put('df', df) - result = store.select('df') + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz="US/Eastern" + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") assert_frame_equal(result, df) - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df') + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_tseries_select_index_column(self): @@ -5073,60 +5288,65 @@ def test_tseries_select_index_column(self): # not preserve UTC tzinfo set before storing # check that no tz still works - rng = date_range('1/1/2000', '1/30/2000') + rng = date_range("1/1/2000", "1/30/2000") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = store.select_column("frame", "index") assert rng.tz == DatetimeIndex(result.values).tz # check utc - rng = date_range('1/1/2000', '1/30/2000', tz='UTC') + rng = date_range("1/1/2000", "1/30/2000", tz="UTC") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = store.select_column("frame", "index") assert rng.tz == result.dt.tz # double check non-utc - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = 
store.select_column("frame", "index") assert rng.tz == result.dt.tz def test_timezones_fixed(self): with ensure_clean_store(self.path) as store: # index - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") df = DataFrame(np.random.randn(len(rng), 4), index=rng) - store['df'] = df - result = store['df'] + store["df"] = df + result = store["df"] assert_frame_equal(result, df) # as data # GH11411 - _maybe_remove(store, 'df') - df = DataFrame({'A': rng, - 'B': rng.tz_convert('UTC').tz_localize(None), - 'C': rng.tz_convert('CET'), - 'D': range(len(rng))}, index=rng) - store['df'] = df - result = store['df'] + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] assert_frame_equal(result, df) def test_fixed_offset_tz(self): - rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] + store["frame"] = frame + recons = store["frame"] tm.assert_index_equal(recons.index, rng) assert rng.tz == recons.index.tz @@ -5141,63 +5361,67 @@ def test_store_timezone(self): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) - store['obj1'] = df - result = store['obj1'] + store["obj1"] = df + result = store["obj1"] assert_frame_equal(result, df) # with tz setting with ensure_clean_store(self.path) as store: - with set_timezone('EST5EDT'): + with set_timezone("EST5EDT"): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) - store['obj1'] = df + store["obj1"] = df - with set_timezone('CST6CDT'): - result = store['obj1'] + with set_timezone("CST6CDT"): + result = store["obj1"] assert_frame_equal(result, df) def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 - expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), - index=range(5)) + expected = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), - mode='r') as store: - result = store['df'] + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] assert_frame_equal(result, expected) def test_dst_transitions(self): # make sure we are not failing on transitions with ensure_clean_store(self.path) as store: - times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous='infer') - - for i in [times, times + pd.Timedelta('10min')]: - _maybe_remove(store, 'df') - df = DataFrame({'A': range(len(i)), 'B': i}, index=i) - store.append('df', df) - result = store.select('df') + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_read_with_where_tz_aware_index(self): # GH 11926 periods = 
10 - dts = pd.date_range('20151201', periods=periods, - freq='D', tz='UTC') - mi = pd.MultiIndex.from_arrays([dts, range(periods)], - names=['DATE', 'NO']) - expected = pd.DataFrame({'MYCOL': 0}, index=mi) + dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = pd.DataFrame({"MYCOL": 0}, index=mi) - key = 'mykey' + key = "mykey" with ensure_clean_path(self.path) as path: with pd.HDFStore(path) as store: - store.append(key, expected, format='table', append=True) - result = pd.read_hdf(path, key, - where="DATE > 20151130") + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") assert_frame_equal(result, expected) diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index 134aa810db5be7..fcd2e0e35ad9e1 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -7,19 +7,20 @@ class TestSas: - def test_sas_buffer_format(self): # see gh-14947 b = StringIO("") - msg = ("If this is a buffer object rather than a string " - "name, you must specify a format string") + msg = ( + "If this is a buffer object rather than a string " + "name, you must specify a format string" + ) with pytest.raises(ValueError, match=msg): read_sas(b) def test_sas_read_no_format_or_extension(self): # see gh-24548 - msg = ("unable to infer format of SAS file") - with tm.ensure_clean('test_file_no_extension') as path: + msg = "unable to infer format of SAS file" + with tm.ensure_clean("test_file_no_extension") as path: with pytest.raises(ValueError, match=msg): read_sas(path) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 2c8d1281f2c340..e37561c865c7a0 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -14,20 +14,18 @@ # https://github.com/cython/cython/issues/1720 @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestSAS7BDAT: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: - fname = os.path.join( - self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) + fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) df = pd.read_csv(fname) epoch = pd.datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit='d') + t1 = pd.to_timedelta(df["Column4"], unit="d") df["Column4"] = epoch + t1 - t2 = pd.to_timedelta(df["Column12"], unit='d') + t2 = pd.to_timedelta(df["Column12"], unit="d") df["Column12"] = epoch + t2 for k in range(df.shape[1]): col = df.iloc[:, k] @@ -39,22 +37,21 @@ def test_from_file(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - df = pd.read_sas(fname, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - with open(fname, 'rb') as f: + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas( + 
buf, format="sas7bdat", iterator=True, encoding="utf-8" + ) df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) rdr.close() @@ -63,35 +60,36 @@ def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, iterator=True, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_path_pathlib(self): from pathlib import Path + for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = Path(os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k))) - df = pd.read_sas(fname, encoding='utf-8') + fname = Path(os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_path_localpath(self): from py.path import local as LocalPath + for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = LocalPath(os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k))) - df = pd.read_sas(fname, encoding='utf-8') + fname = LocalPath( + os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + ) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) def test_iterator_loop(self): @@ -99,9 +97,8 @@ def test_iterator_loop(self): for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") y = 0 for x in rdr: y += x.shape[0] @@ -112,8 +109,7 @@ def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") d1 = rdr.read(rdr.row_count + 20) rdr.close() @@ -126,27 +122,28 @@ def test_iterator_read_too_much(self): def test_encoding_options(datapath): fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) - df2 = pd.read_sas(fname, encoding='utf-8') + df2 = pd.read_sas(fname, encoding="utf-8") for col in df1.columns: try: - df1[col] = df1[col].str.decode('utf-8') + df1[col] = df1[col].str.decode("utf-8") except AttributeError: pass tm.assert_frame_equal(df1, df2) from pandas.io.sas.sas7bdat import SAS7BDATReader + rdr = SAS7BDATReader(fname, convert_header_text=False) df3 = rdr.read() rdr.close() for x, y in zip(df1.columns, df3.columns): - assert(x == y.decode()) + assert x == y.decode() def test_productsales(datapath): fname = datapath("io", "sas", "data", "productsales.sas7bdat") - df = pd.read_sas(fname, encoding='utf-8') + df = pd.read_sas(fname, encoding="utf-8") fname = datapath("io", "sas", "data", "productsales.csv") - df0 = pd.read_csv(fname, parse_dates=['MONTH']) + df0 = pd.read_csv(fname, parse_dates=["MONTH"]) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) @@ -175,42 +172,43 @@ def 
test_date_time(datapath): fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) fname = datapath("io", "sas", "data", "datetime.csv") - df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', - 'DateTimeHi', 'Taiw']) + df0 = pd.read_csv( + fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] + ) # GH 19732: Timestamps imported from sas will incur floating point errors - df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') + df.iloc[:, 3] = df.iloc[:, 3].dt.round("us") tm.assert_frame_equal(df, df0) def test_compact_numerical_values(datapath): # Regression test for #21616 fname = datapath("io", "sas", "data", "cars.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") # The two columns CYL and WGT in cars.sas7bdat have column # width < 8 and only contain integral values. # Test that pandas doesn't corrupt the numbers by adding # decimals. - result = df['WGT'] - expected = df['WGT'].round() + result = df["WGT"] + expected = df["WGT"].round() tm.assert_series_equal(result, expected, check_exact=True) - result = df['CYL'] - expected = df['CYL'].round() + result = df["CYL"] + expected = df["CYL"].round() tm.assert_series_equal(result, expected, check_exact=True) def test_many_columns(datapath): # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") fname = datapath("io", "sas", "data", "many_columns.csv") - df0 = pd.read_csv(fname, encoding='latin-1') + df0 = pd.read_csv(fname, encoding="latin-1") tm.assert_frame_equal(df, df0) def test_inconsistent_number_of_rows(datapath): # Regression test for issue #16615. (PR #22628) fname = datapath("io", "sas", "data", "load_log.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") assert len(df) == 2097 diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 9024216fb60fa2..7893877be2033f 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -16,12 +16,11 @@ def numeric_as_float(data): for v in data.columns: - if data[v].dtype is np.dtype('int64'): + if data[v].dtype is np.dtype("int64"): data[v] = data[v].astype(np.float64) class TestXport: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") @@ -85,20 +84,16 @@ def test1_index(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - reader = read_sas(self.file01, index="SEQN", format="xport", - iterator=True) + reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) data = reader.read(10) reader.close() - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], - check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. 
- reader = read_sas(self.file01, index="SEQN", format="xport", - chunksize=10) + reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) data = reader.get_chunk() reader.close() - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], - check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) def test1_incremental(self): # Test with DEMO_G.xpt, reading full file incrementally @@ -143,4 +138,4 @@ def test_truncated_float_support(self): data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) data = read_sas(self.file04, format="xport") - tm.assert_frame_equal(data.astype('int64'), data_csv) + tm.assert_frame_equal(data.astype("int64"), data_csv) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 04223efd6eacb2..fccd52f9916b84 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -13,7 +13,7 @@ from pandas.io.clipboard.exceptions import PyperclipException try: - DataFrame({'A': [1, 2]}).to_clipboard() + DataFrame({"A": [1, 2]}).to_clipboard() _DEPS_INSTALLED = 1 except (PyperclipException, RuntimeError): _DEPS_INSTALLED = 0 @@ -21,57 +21,90 @@ def build_kwargs(sep, excel): kwargs = {} - if excel != 'default': - kwargs['excel'] = excel - if sep != 'default': - kwargs['sep'] = sep + if excel != "default": + kwargs["excel"] = excel + if sep != "default": + kwargs["sep"] = sep return kwargs -@pytest.fixture(params=['delims', 'utf8', 'utf16', 'string', 'long', - 'nonascii', 'colwidth', 'mixed', 'float', 'int']) +@pytest.fixture( + params=[ + "delims", + "utf8", + "utf16", + "string", + "long", + "nonascii", + "colwidth", + "mixed", + "float", + "int", + ] +) def df(request): data_type = request.param - if data_type == 'delims': - return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], - 'b': ['hi\'j', 'k\'\'lm']}) - elif data_type == 'utf8': - return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) - elif data_type == 'utf16': - return pd.DataFrame({'a': ['\U0001f44d\U0001f44d', - '\U0001f44d\U0001f44d'], - 'b': ['abc', 'def']}) - elif data_type == 'string': - return mkdf(5, 3, c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'long': - max_rows = get_option('display.max_rows') - return mkdf(max_rows + 1, 3, - data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'nonascii': - return pd.DataFrame({'en': 'in English'.split(), - 'es': 'en español'.split()}) - elif data_type == 'colwidth': - _cw = get_option('display.max_colwidth') + 1 - return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'mixed': - return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, - 'b': np.arange(1, 6), - 'c': list('abcde')}) - elif data_type == 'float': - return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'int': - return mkdf(5, 3, data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) + if data_type == "delims": + return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]}) + elif data_type == "utf8": + return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) + elif data_type == "utf16": + return pd.DataFrame( + {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", 
"def"]} + ) + elif data_type == "string": + return mkdf( + 5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None] + ) + elif data_type == "long": + max_rows = get_option("display.max_rows") + return mkdf( + max_rows + 1, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "nonascii": + return pd.DataFrame({"en": "in English".split(), "es": "en español".split()}) + elif data_type == "colwidth": + _cw = get_option("display.max_colwidth") + 1 + return mkdf( + 5, + 3, + data_gen_f=lambda *args: "x" * _cw, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "mixed": + return DataFrame( + {"a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde")} + ) + elif data_type == "float": + return mkdf( + 5, + 3, + data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "int": + return mkdf( + 5, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) else: raise ValueError @@ -108,6 +141,7 @@ def _mock_get(): @pytest.mark.clipboard def test_mock_clipboard(mock_clipboard): import pandas.io.clipboard + pandas.io.clipboard.clipboard_set("abc") assert "abc" in set(mock_clipboard.values()) result = pandas.io.clipboard.clipboard_get() @@ -116,16 +150,12 @@ def test_mock_clipboard(mock_clipboard): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, - reason="clipboard primitives not installed") +@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.usefixtures("mock_clipboard") class TestClipboard: - - def check_round_trip_frame(self, data, excel=None, sep=None, - encoding=None): + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - result = read_clipboard(sep=sep or '\t', index_col=0, - encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) # Test that default arguments copy as tab delimited @@ -133,7 +163,7 @@ def test_round_trip_frame(self, df): self.check_round_trip_frame(df) # Test that explicit delimiters are respected - @pytest.mark.parametrize('sep', ['\t', ',', '|']) + @pytest.mark.parametrize("sep", ["\t", ",", "|"]) def test_round_trip_frame_sep(self, df, sep): self.check_round_trip_frame(df, sep=sep) @@ -148,63 +178,67 @@ def test_round_trip_frame_string(self, df): # Test that multi-character separators are not silently passed def test_excel_sep_warning(self, df): with tm.assert_produces_warning(): - df.to_clipboard(excel=True, sep=r'\t') + df.to_clipboard(excel=True, sep=r"\t") # Separator is ignored when excel=False and should produce a warning def test_copy_delim_warning(self, df): with tm.assert_produces_warning(): - df.to_clipboard(excel=False, sep='\t') + df.to_clipboard(excel=False, sep="\t") # Tests that the default behavior of to_clipboard is tab # delimited and excel="True" - @pytest.mark.parametrize('sep', ['\t', None, 'default']) - @pytest.mark.parametrize('excel', [True, None, 'default']) - def test_clipboard_copy_tabs_default(self, sep, excel, df, request, - mock_clipboard): + @pytest.mark.parametrize("sep", ["\t", None, "default"]) + @pytest.mark.parametrize("excel", [True, None, 
"default"]) + def test_clipboard_copy_tabs_default(self, sep, excel, df, request, mock_clipboard): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - assert mock_clipboard[request.node.name] == df.to_csv(sep='\t') + assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") # Tests reading of white space separated tables - @pytest.mark.parametrize('sep', [None, 'default']) - @pytest.mark.parametrize('excel', [False]) + @pytest.mark.parametrize("sep", [None, "default"]) + @pytest.mark.parametrize("excel", [False]) def test_clipboard_copy_strings(self, sep, excel, df): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - result = read_clipboard(sep=r'\s+') + result = read_clipboard(sep=r"\s+") assert result.to_string() == df.to_string() assert df.shape == result.shape - def test_read_clipboard_infer_excel(self, request, - mock_clipboard): + def test_read_clipboard_infer_excel(self, request, mock_clipboard): # gh-19010: avoid warnings clip_kwargs = dict(engine="python") - text = dedent(""" + text = dedent( + """ John James Charlie Mingus 1 2 4 Harry Carney - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text df = pd.read_clipboard(**clip_kwargs) # excel data is parsed correctly - assert df.iloc[1][1] == 'Harry Carney' + assert df.iloc[1][1] == "Harry Carney" # having diff tab counts doesn't trigger it - text = dedent(""" + text = dedent( + """ a\t b 1 2 3 4 - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text res = pd.read_clipboard(**clip_kwargs) - text = dedent(""" + text = dedent( + """ a b 1 2 3 4 - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text exp = pd.read_clipboard(**clip_kwargs) @@ -213,20 +247,19 @@ def test_read_clipboard_infer_excel(self, request, def test_invalid_encoding(self, df): # test case for testing invalid encoding with pytest.raises(ValueError): - df.to_clipboard(encoding='ascii') + df.to_clipboard(encoding="ascii") with pytest.raises(NotImplementedError): - pd.read_clipboard(encoding='ascii') + pd.read_clipboard(encoding="ascii") - @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) def test_round_trip_valid_encodings(self, enc, df): self.check_round_trip_frame(df, encoding=enc) @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, - reason="clipboard primitives not installed") -@pytest.mark.parametrize('data', ['\U0001f44d...', 'Ωœ∑´...', 'abcd...']) +@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") +@pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 04faf5aee4b6d4..426698bfa1e940 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,6 +18,7 @@ class CustomFSPath: """For testing fspath on unknown objects""" + def __init__(self, path): self.path = path @@ -30,12 +31,14 @@ def __fspath__(self): try: from pathlib import Path + path_types.append(Path) except ImportError: pass try: from py.path import local as LocalPath + path_types.append(LocalPath) except ImportError: pass @@ -57,7 +60,7 @@ class TestCommonIOCapabilities: """ def test_expand_user(self): - filename = '~/sometest' + filename = "~/sometest" expanded_name = icom._expand_user(filename) assert expanded_name != filename @@ -65,48 
+68,44 @@ def test_expand_user(self): assert os.path.expanduser(filename) == expanded_name def test_expand_user_normal_path(self): - filename = '/somefolder/sometest' + filename = "/somefolder/sometest" expanded_name = icom._expand_user(filename) assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_stringify_path_pathlib(self): - rel_path = icom._stringify_path(Path('.')) - assert rel_path == '.' - redundant_path = icom._stringify_path(Path('foo//bar')) - assert redundant_path == os.path.join('foo', 'bar') + rel_path = icom._stringify_path(Path(".")) + assert rel_path == "." + redundant_path = icom._stringify_path(Path("foo//bar")) + assert redundant_path == os.path.join("foo", "bar") - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_stringify_path_localpath(self): - path = os.path.join('foo', 'bar') + path = os.path.join("foo", "bar") abs_path = os.path.abspath(path) lpath = LocalPath(path) assert icom._stringify_path(lpath) == abs_path def test_stringify_path_fspath(self): - p = CustomFSPath('foo/bar.csv') + p = CustomFSPath("foo/bar.csv") result = icom._stringify_path(p) - assert result == 'foo/bar.csv' - - @pytest.mark.parametrize('extension,expected', [ - ('', None), - ('.gz', 'gzip'), - ('.bz2', 'bz2'), - ('.zip', 'zip'), - ('.xz', 'xz'), - ]) - @pytest.mark.parametrize('path_type', path_types) + assert result == "foo/bar.csv" + + @pytest.mark.parametrize( + "extension,expected", + [("", None), (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz")], + ) + @pytest.mark.parametrize("path_type", path_types) def test_infer_compression_from_path(self, extension, expected, path_type): - path = path_type('foo/bar.csv' + extension) - compression = icom._infer_compression(path, compression='infer') + path = path_type("foo/bar.csv" + extension) + compression = icom._infer_compression(path, compression="infer") assert compression == expected def test_get_filepath_or_buffer_with_path(self): - filename = '~/sometest' - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( - filename) + filename = "~/sometest" + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename) assert filepath_or_buffer != filename assert os.path.isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer @@ -115,7 +114,8 @@ def test_get_filepath_or_buffer_with_path(self): def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( - input_buffer) + input_buffer + ) assert filepath_or_buffer == input_buffer assert not should_close @@ -131,82 +131,100 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) - @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ - (pd.read_csv, 'os', FileNotFoundError, 'csv'), - (pd.read_fwf, 'os', FileNotFoundError, 'txt'), - (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), - (pd.read_feather, 'feather', Exception, 'feather'), - (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), - (pd.read_stata, 'os', FileNotFoundError, 'dta'), - (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), - (pd.read_json, 'os', ValueError, 'json'), - (pd.read_msgpack, 'os', ValueError, 'mp'), - (pd.read_pickle, 'os', FileNotFoundError, 'pickle'), - ]) + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", 
FileNotFoundError, "csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) def test_read_non_existant(self, reader, module, error_class, fn_ext): pytest.importorskip(module) - path = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext) - msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist" - .format(fn_ext)) - msg2 = (r"\[Errno 2\] No such file or directory: '.+does_not_exist" - r"\.{}'").format(fn_ext) + path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) + msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg2 = ( + r"\[Errno 2\] No such file or directory: '.+does_not_exist" r"\.{}'" + ).format(fn_ext) msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" - msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext) - with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format( - msg1, msg2, msg3, msg4, msg5)): + msg5 = ( + r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" + r" '.+does_not_exist\.{}'" + ).format(fn_ext, fn_ext) + with pytest.raises( + error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + ): reader(path) - @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ - (pd.read_csv, 'os', FileNotFoundError, 'csv'), - (pd.read_table, 'os', FileNotFoundError, 'csv'), - (pd.read_fwf, 'os', FileNotFoundError, 'txt'), - (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), - (pd.read_feather, 'feather', Exception, 'feather'), - (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), - (pd.read_stata, 'os', FileNotFoundError, 'dta'), - (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), - (pd.read_json, 'os', ValueError, 'json'), - (pd.read_msgpack, 'os', ValueError, 'mp'), - (pd.read_pickle, 'os', FileNotFoundError, 'pickle'), - ]) - def test_read_expands_user_home_dir(self, reader, module, - error_class, fn_ext, monkeypatch): + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", FileNotFoundError, "csv"), + (pd.read_table, "os", FileNotFoundError, "csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) + def test_read_expands_user_home_dir( + self, reader, module, error_class, fn_ext, monkeypatch + ): pytest.importorskip(module) - path = os.path.join('~', 'does_not_exist.' + fn_ext) - monkeypatch.setattr(icom, '_expand_user', - lambda x: os.path.join('foo', x)) + path = os.path.join("~", "does_not_exist." + fn_ext) + monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = (r"File (b')?.+does_not_exist\.{}'? 
does not exist" - .format(fn_ext)) - msg2 = (r"\[Errno 2\] No such file or directory:" - r" '.+does_not_exist\.{}'").format(fn_ext) + msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg2 = ( + r"\[Errno 2\] No such file or directory:" r" '.+does_not_exist\.{}'" + ).format(fn_ext) msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" - msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext) - - with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format( - msg1, msg2, msg3, msg4, msg5)): + msg5 = ( + r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" + r" '.+does_not_exist\.{}'" + ).format(fn_ext, fn_ext) + + with pytest.raises( + error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + ): reader(path) - @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), - (pd.read_table, 'os', ('io', 'data', 'iris.csv')), - (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), - (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), - (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), - (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), - ]) + @pytest.mark.parametrize( + "reader, module, path", + [ + (pd.read_csv, "os", ("io", "data", "iris.csv")), + (pd.read_table, "os", ("io", "data", "iris.csv")), + (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")), + (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")), + (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")), + ( + pd.read_hdf, + "tables", + ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + ), + (pd.read_stata, "os", ("io", "data", "stata10_115.dta")), + (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), + (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), + (pd.read_msgpack, "os", ("io", "msgpack", "data", "frame.mp")), + (pd.read_pickle, "os", ("io", "data", "categorical_0_14_1.pickle")), + ], + ) def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) path = datapath(*path) @@ -215,26 +233,29 @@ def test_read_fspath_all(self, reader, module, path, datapath): result = reader(mypath) expected = reader(path) - if path.endswith('.pickle'): + if path.endswith(".pickle"): # categorical tm.assert_categorical_equal(result, expected) else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ - ('to_csv', {}, 'os'), - ('to_excel', {'engine': 'xlwt'}, 'xlwt'), - ('to_feather', {}, 'feather'), - ('to_html', {}, 'os'), - ('to_json', {}, 'os'), - ('to_latex', {}, 'os'), - ('to_msgpack', {}, 'os'), - ('to_pickle', {}, 'os'), - ('to_stata', {'time_stamp': pd.to_datetime('2019-01-01 00:00')}, 'os'), - ]) + @pytest.mark.parametrize( + "writer_name, writer_kwargs, module", + [ + ("to_csv", {}, "os"), + ("to_excel", {"engine": "xlwt"}, "xlwt"), + ("to_feather", {}, "feather"), + ("to_html", {}, "os"), + ("to_json", {}, "os"), + ("to_latex", {}, "os"), + ("to_msgpack", {}, "os"), + ("to_pickle", {}, 
"os"), + ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), + ], + ) def test_write_fspath_all(self, writer_name, writer_kwargs, module): - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") df = pd.DataFrame({"A": [1, 2]}) with p1 as string, p2 as fspath: @@ -243,11 +264,11 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): writer = getattr(df, writer_name) writer(string, **writer_kwargs) - with open(string, 'rb') as f: + with open(string, "rb") as f: expected = f.read() writer(mypath, **writer_kwargs) - with open(fspath, 'rb') as f: + with open(fspath, "rb") as f: result = f.read() assert result == expected @@ -256,32 +277,31 @@ def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll # have to read and compare equality - pytest.importorskip('tables') + pytest.importorskip("tables") df = pd.DataFrame({"A": [1, 2]}) - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") with p1 as string, p2 as fspath: mypath = CustomFSPath(fspath) - df.to_hdf(mypath, key='bar') - df.to_hdf(string, key='bar') + df.to_hdf(mypath, key="bar") + df.to_hdf(string, key="bar") - result = pd.read_hdf(fspath, key='bar') - expected = pd.read_hdf(string, key='bar') + result = pd.read_hdf(fspath, key="bar") + expected = pd.read_hdf(string, key="bar") tm.assert_frame_equal(result, expected) @pytest.fixture def mmap_file(datapath): - return datapath('io', 'data', 'test_mmap.csv') + return datapath("io", "data", "test_mmap.csv") class TestMMapWrapper: - def test_constructor_bad_file(self, mmap_file): - non_file = StringIO('I am not a file') + non_file = StringIO("I am not a file") non_file.fileno = lambda: -1 # the error raised is different on Windows @@ -295,7 +315,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(err, match=msg): icom.MMapWrapper(non_file) - target = open(mmap_file, 'r') + target = open(mmap_file, "r") target.close() msg = "I/O operation on closed file" @@ -303,21 +323,20 @@ def test_constructor_bad_file(self, mmap_file): icom.MMapWrapper(target) def test_get_attr(self, mmap_file): - with open(mmap_file, 'r') as target: + with open(mmap_file, "r") as target: wrapper = icom.MMapWrapper(target) attrs = dir(wrapper.mmap) - attrs = [attr for attr in attrs - if not attr.startswith('__')] - attrs.append('__next__') + attrs = [attr for attr in attrs if not attr.startswith("__")] + attrs.append("__next__") for attr in attrs: assert hasattr(wrapper, attr) - assert not hasattr(wrapper, 'foo') + assert not hasattr(wrapper, "foo") def test_next(self, mmap_file): - with open(mmap_file, 'r') as target: + with open(mmap_file, "r") as target: wrapper = icom.MMapWrapper(target) lines = target.readlines() @@ -325,12 +344,12 @@ def test_next(self, mmap_file): next_line = next(wrapper) assert next_line.strip() == line.strip() - with pytest.raises(StopIteration, match=r'^$'): + with pytest.raises(StopIteration, match=r"^$"): next(wrapper) def test_unknown_engine(self): with tm.ensure_clean() as path: df = tm.makeDataFrame() df.to_csv(path) - with pytest.raises(ValueError, match='Unknown engine'): - pd.read_csv(path, engine='pyt') + with pytest.raises(ValueError, match="Unknown engine"): + pd.read_csv(path, engine="pyt") diff --git a/pandas/tests/io/test_compression.py 
b/pandas/tests/io/test_compression.py index c0e19f07c148d0..ce459ab24afe0d 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -21,12 +21,17 @@ def catch_to_csv_depr(): yield -@pytest.mark.parametrize('obj', [ - pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): with tm.ensure_clean() as path: with catch_to_csv_depr(): @@ -37,15 +42,20 @@ def test_compression_size(obj, method, compression_only): assert uncompressed_size > compressed_size -@pytest.mark.parametrize('obj', [ - pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_csv', 'to_json']) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=compression_only) + f, handles = icom._get_handle(path, "w", compression=compression_only) with catch_to_csv_depr(): with f: getattr(obj, method)(f) @@ -53,7 +63,7 @@ def test_compression_size_fh(obj, method, compression_only): assert f.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=None) + f, handles = icom._get_handle(path, "w", compression=None) with catch_to_csv_depr(): with f: getattr(obj, method)(f) @@ -63,35 +73,41 @@ def test_compression_size_fh(obj, method, compression_only): assert uncompressed_size > compressed_size -@pytest.mark.parametrize('write_method, write_kwargs, read_method', [ - ('to_csv', {'index': False}, pd.read_csv), - ('to_json', {}, pd.read_json), - ('to_pickle', {}, pd.read_pickle), -]) +@pytest.mark.parametrize( + "write_method, write_kwargs, read_method", + [ + ("to_csv", {"index": False}, pd.read_csv), + ("to_json", {}, pd.read_json), + ("to_pickle", {}, pd.read_pickle), + ], +) def test_dataframe_compression_defaults_to_infer( - write_method, write_kwargs, read_method, compression_only): + write_method, write_kwargs, read_method, compression_only +): # GH22004 - input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=['X', 'Y', 'Z']) + input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) extension = icom._compression_to_extension[compression_only] - with tm.ensure_clean('compressed' + extension) as path: + with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) tm.assert_frame_equal(output, input) -@pytest.mark.parametrize('write_method,write_kwargs,read_method,read_kwargs', [ - ('to_csv', {'index': False, 'header': True}, - 
pd.read_csv, {'squeeze': True}), - ('to_json', {}, pd.read_json, {'typ': 'series'}), - ('to_pickle', {}, pd.read_pickle, {}), -]) +@pytest.mark.parametrize( + "write_method,write_kwargs,read_method,read_kwargs", + [ + ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}), + ("to_json", {}, pd.read_json, {"typ": "series"}), + ("to_pickle", {}, pd.read_pickle, {}), + ], +) def test_series_compression_defaults_to_infer( - write_method, write_kwargs, read_method, read_kwargs, - compression_only): + write_method, write_kwargs, read_method, read_kwargs, compression_only +): # GH22004 - input = pd.Series([0, 5, -2, 10], name='X') + input = pd.Series([0, 5, -2, 10], name="X") extension = icom._compression_to_extension[compression_only] - with tm.ensure_clean('compressed' + extension) as path: + with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only, **read_kwargs) tm.assert_series_equal(output, input, check_names=False) @@ -100,12 +116,12 @@ def test_series_compression_defaults_to_infer( def test_compression_warning(compression_only): # Assert that passing a file object to to_csv while explicitly specifying a # compression protocol triggers a RuntimeWarning, as per GH21227. - df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']) + df = pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=compression_only) - with tm.assert_produces_warning(RuntimeWarning, - check_stacklevel=False): + f, handles = icom._get_handle(path, "w", compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index c5a94883aa6098..2fa5e3b30d6af8 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -8,10 +8,9 @@ def test_parse_date_time(): - dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) - times = np.array(['05:07:09', '06:08:00'], dtype=object) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) + dates = np.array(["2007/1/3", "2008/2/4"], dtype=object) + times = np.array(["05:07:09", "06:08:00"], dtype=object) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) result = conv.parse_date_time(dates, times) tm.assert_numpy_array_equal(result, expected) @@ -36,8 +35,6 @@ def test_parse_all_fields(): years = np.array([2007, 2008]) months = np.array([1, 2]) - result = conv.parse_all_fields(years, months, days, - hours, minutes, seconds) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) + result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 805ce67e76e280..fa63f102580fff 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -10,7 +10,7 @@ from pandas.io.feather_format import read_feather, to_feather # noqa:E402 -pyarrow = pytest.importorskip('pyarrow') +pyarrow = 
pytest.importorskip("pyarrow") pyarrow_version = LooseVersion(pyarrow.__version__) @@ -18,7 +18,6 @@ @pytest.mark.single class TestFeather: - def check_error_on_write(self, df, exc): # check that we are raising the exception # on writing @@ -40,37 +39,46 @@ def check_round_trip(self, df, expected=None, **kwargs): def test_error(self): - for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'), - np.array([1, 2, 3])]: + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: self.check_error_on_write(obj, ValueError) def test_basic(self): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_null': [1., np.nan, 3], - 'bool': [True, False, True], - 'bool_with_null': [True, np.nan, False], - 'cat': pd.Categorical(list('abc')), - 'dt': pd.date_range('20130101', periods=3), - 'dttz': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT, - pd.Timestamp('20130103')], - 'dtns': pd.date_range('20130101', periods=3, - freq='ns')}) - - assert df.dttz.dtype.tz.zone == 'US/Eastern' + df = pd.DataFrame( + { + "string": list("abc"), + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_null": [1.0, np.nan, 3], + "bool": [True, False, True], + "bool_with_null": [True, np.nan, False], + "cat": pd.Categorical(list("abc")), + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + + assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) def test_duplicate_columns(self): # https://github.com/wesm/feather/issues/53 # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, ValueError) def test_stringify_columns(self): @@ -80,71 +88,71 @@ def test_stringify_columns(self): def test_read_columns(self): # GH 24025 - df = pd.DataFrame({'col1': list('abc'), - 'col2': list(range(1, 4)), - 'col3': list('xyz'), - 'col4': list(range(4, 7))}) - columns = ['col1', 'col3'] - self.check_round_trip(df, expected=df[columns], - columns=columns) + df = pd.DataFrame( + { + "col1": list("abc"), + "col2": list(range(1, 4)), + "col3": list("xyz"), + "col4": list(range(4, 7)), + } + ) + columns = ["col1", "col3"] + self.check_round_trip(df, expected=df[columns], columns=columns) def test_unsupported_other(self): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # Some versions raise ValueError, others raise ArrowInvalid. 
self.check_error_on_write(df, Exception) def test_rw_nthreads(self): - df = pd.DataFrame({'A': np.arange(100000)}) + df = pd.DataFrame({"A": np.arange(100000)}) expected_warning = ( - "the 'nthreads' keyword is deprecated, " - "use 'use_threads' instead" + "the 'nthreads' keyword is deprecated, " "use 'use_threads' instead" ) # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=2) # we have an extra FutureWarning because of #GH23752 assert any(expected_warning in str(x) for x in w) # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=1) # we have an extra FutureWarnings because of #GH23752 assert any(expected_warning in str(x) for x in w) def test_rw_use_threads(self): - df = pd.DataFrame({'A': np.arange(100000)}) + df = pd.DataFrame({"A": np.arange(100000)}) self.check_round_trip(df, use_threads=True) self.check_round_trip(df, use_threads=False) def test_write_with_index(self): - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) self.check_round_trip(df) # non-default index - for index in [[2, 3, 4], - pd.date_range('20130101', periods=3), - list('abc'), - [1, 3, 4], - pd.MultiIndex.from_tuples([('a', 1), ('a', 2), - ('b', 1)]), - ]: + for index in [ + [2, 3, 4], + pd.date_range("20130101", periods=3), + list("abc"), + [1, 3, 4], + pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), + ]: df.index = index self.check_error_on_write(df, ValueError) # index with meta-data df.index = [0, 1, 2] - df.index.name = 'foo' + df.index.name = "foo" self.check_error_on_write(df, ValueError) # column multi-index df.index = [0, 1, 2] - df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]), + df.columns = (pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),) self.check_error_on_write(df, ValueError) def test_path_pathlib(self): diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 21e0a63bf4ce79..6ca6da01a6d6f6 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -18,9 +18,9 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = 'pydata_pandas_bq_testing_py3' +DATASET_ID = "pydata_pandas_bq_testing_py3" -TABLE_ID = 'new_test' +TABLE_ID = "new_test" DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) VERSION = platform.python_version() @@ -28,43 +28,42 @@ def _skip_if_no_project_id(): if not _get_project_id(): - pytest.skip( - "Cannot run integration tests without a project id") + pytest.skip("Cannot run integration tests without a project id") def _skip_if_no_private_key_path(): if not _get_private_key_path(): - pytest.skip("Cannot run integration tests without a " - "private key json file path") + pytest.skip( + "Cannot run integration tests without a " "private key json file path" + ) def _in_travis_environment(): - return 'TRAVIS_BUILD_DIR' in os.environ and \ - 'GBQ_PROJECT_ID' in os.environ + return "TRAVIS_BUILD_DIR" in os.environ and "GBQ_PROJECT_ID" in os.environ def _get_project_id(): if _in_travis_environment(): - return os.environ.get('GBQ_PROJECT_ID') - return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID') + return os.environ.get("GBQ_PROJECT_ID") + return 
PROJECT_ID or os.environ.get("GBQ_PROJECT_ID") def _get_private_key_path(): if _in_travis_environment(): - return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', - 'travis_gbq.json']) + return os.path.join( + *[os.environ.get("TRAVIS_BUILD_DIR"), "ci", "travis_gbq.json"] + ) private_key_path = PRIVATE_KEY_JSON_PATH if not private_key_path: - private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS') + private_key_path = os.environ.get("GBQ_GOOGLE_APPLICATION_CREDENTIALS") return private_key_path def _get_credentials(): private_key_path = _get_private_key_path() if private_key_path: - return service_account.Credentials.from_service_account_file( - private_key_path) + return service_account.Credentials.from_service_account_file(private_key_path) def _get_client(): @@ -79,14 +78,17 @@ def make_mixed_dataframe_v2(test_size): flts = np.random.randn(1, test_size) ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) - times = [datetime.now(pytz.timezone('US/Arizona')) - for t in range(test_size)] - return DataFrame({'bools': bools[0], - 'flts': flts[0], - 'ints': ints[0], - 'strs': strs[0], - 'times': times[0]}, - index=range(test_size)) + times = [datetime.now(pytz.timezone("US/Arizona")) for t in range(test_size)] + return DataFrame( + { + "bools": bools[0], + "flts": flts[0], + "ints": ints[0], + "strs": strs[0], + "times": times[0], + }, + index=range(test_size), + ) def test_read_gbq_with_deprecated_kwargs(monkeypatch): @@ -146,7 +148,6 @@ def mock_read_gbq(sql, **kwargs): @pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath: - @classmethod def setup_class(cls): # - GLOBAL CLASS FIXTURES - @@ -179,12 +180,17 @@ def test_roundtrip(self): test_size = 20001 df = make_mixed_dataframe_v2(test_size) - df.to_gbq(destination_table, _get_project_id(), chunksize=None, - credentials=_get_credentials()) - - result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" - .format(destination_table), - project_id=_get_project_id(), - credentials=_get_credentials(), - dialect="standard") - assert result['num_rows'][0] == test_size + df.to_gbq( + destination_table, + _get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == test_size diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4fa0f3246910a8..2ca56230b5b8c1 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -16,57 +16,76 @@ def test_is_gcs_url(): assert not is_gcs_url("s3://pandas/somethingelse.com") -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) class MockGCSFileSystem: def open(*args): return StringIO(df1.to_csv(index=False)) - monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem) - df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) assert_frame_equal(df1, df2) -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def 
test_to_csv_gcs(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) s = StringIO() class MockGCSFileSystem: def open(*args): return s - monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem) - df1.to_csv('gs://test/test.csv', index=True) - df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0) + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_csv("gs://test/test.csv", index=True) + df2 = read_csv(StringIO(s.getvalue()), parse_dates=["dt"], index_col=0) assert_frame_equal(df1, df2) -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def test_gcs_get_filepath_or_buffer(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) def mock_get_filepath_or_buffer(*args, **kwargs): - return (StringIO(df1.to_csv(index=False)), - None, None, False) + return (StringIO(df1.to_csv(index=False)), None, None, False) - monkeypatch.setattr('pandas.io.gcs.get_filepath_or_buffer', - mock_get_filepath_or_buffer) - df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + monkeypatch.setattr( + "pandas.io.gcs.get_filepath_or_buffer", mock_get_filepath_or_buffer + ) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) assert_frame_equal(df1, df2) -@pytest.mark.skipif(td.safe_import('gcsfs'), - reason='Only check when gcsfs not installed') +@pytest.mark.skipif( + td.safe_import("gcsfs"), reason="Only check when gcsfs not installed" +) def test_gcs_not_present_exception(): with pytest.raises(ImportError) as e: - read_csv('gs://test/test.csv') - assert 'gcsfs library is required' in str(e.value) + read_csv("gs://test/test.csv") + assert "gcsfs library is required" in str(e.value) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 63184dd1a8f839..d3d05b6281d5b5 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,8 +13,7 @@ from pandas.errors import ParserError import pandas.util._test_decorators as td -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -25,37 +24,46 @@ HERE = os.path.dirname(__file__) -@pytest.fixture(params=[ - 'chinese_utf-16.html', - 'chinese_utf-32.html', - 'chinese_utf-8.html', - 'letz_latin1.html', -]) +@pytest.fixture( + params=[ + "chinese_utf-16.html", + "chinese_utf-32.html", + "chinese_utf-8.html", + "letz_latin1.html", + ] +) def html_encoding_file(request, datapath): """Parametrized fixture for HTML encoding test filenames.""" - return datapath('io', 'data', 'html_encoding', request.param) + return datapath("io", "data", "html_encoding", request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): - assert len(list1) == len(list2), ('lists are not of equal size ' - 'len(list1) == {0}, ' - 'len(list2) == {1}'.format(len(list1), - len(list2))) - msg = 'not all list elements are DataFrames' - both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and - isinstance(y, DataFrame), list1, list2)) 
+ assert len(list1) == len(list2), ( + "lists are not of equal size " + "len(list1) == {0}, " + "len(list2) == {1}".format(len(list1), len(list2)) + ) + msg = "not all list elements are DataFrames" + both_frames = all( + map( + lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame), + list1, + list2, + ) + ) assert both_frames, msg for frame_i, frame_j in zip(list1, list2): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) - assert not frame_i.empty, 'frames are both empty' + assert not frame_i.empty, "frames are both empty" -@td.skip_if_no('bs4') +@td.skip_if_no("bs4") def test_bs4_version_fails(monkeypatch, datapath): import bs4 - monkeypatch.setattr(bs4, '__version__', '4.2') + + monkeypatch.setattr(bs4, "__version__", "4.2") with pytest.raises(ImportError, match="Pandas requires version"): - read_html(datapath("io", "data", "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor="bs4") def test_invalid_flavor(): @@ -67,25 +75,29 @@ def test_invalid_flavor(): read_html(url, "google", flavor=flavor) -@td.skip_if_no('bs4') -@td.skip_if_no('lxml') +@td.skip_if_no("bs4") +@td.skip_if_no("lxml") def test_same_ordering(datapath): - filename = datapath('io', 'data', 'valid_markup.html') - dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) - dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + filename = datapath("io", "data", "valid_markup.html") + dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) + dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) assert_framelist_equal(dfs_lxml, dfs_bs4) -@pytest.mark.parametrize("flavor", [ - pytest.param('bs4', marks=td.skip_if_no('lxml')), - pytest.param('lxml', marks=td.skip_if_no('lxml'))], scope="class") +@pytest.mark.parametrize( + "flavor", + [ + pytest.param("bs4", marks=td.skip_if_no("lxml")), + pytest.param("lxml", marks=td.skip_if_no("lxml")), + ], + scope="class", +) class TestReadHtml: - @pytest.fixture(autouse=True) def set_files(self, datapath): - self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data = datapath("io", "data", "spam.html") self.spam_data_kwargs = {} - self.spam_data_kwargs['encoding'] = 'UTF-8' + self.spam_data_kwargs["encoding"] = "UTF-8" self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") @@ -94,46 +106,56 @@ def set_defaults(self, flavor, request): yield def test_to_html_compat(self): - df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, - r_idx_names=False).applymap('{0:.3f}'.format).astype(float) + df = ( + mkdf( + 4, + 3, + data_gen_f=lambda *args: rand(), + c_idx_names=False, + r_idx_names=False, + ) + .applymap("{0:.3f}".format) + .astype(float) + ) out = df.to_html() - res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] + res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) @network def test_banklist_url(self): - url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - df1 = self.read_html(url, 'First Federal Bank of Florida', - attrs={"id": 'table'}) - df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) + url = "http://www.fdic.gov/bank/individual/failed/banklist.html" + df1 = self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "table"} + ) + df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) @network def test_spam_url(self): - url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&' - 
'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') - df1 = self.read_html(url, '.*Water.*') - df2 = self.read_html(url, 'Unit') + url = ( + "http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&" + "lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam" + ) + df1 = self.read_html(url, ".*Water.*") + df2 = self.read_html(url, "Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow def test_banklist(self): - df1 = self.read_html(self.banklist_data, '.*Florida.*', - attrs={'id': 'table'}) - df2 = self.read_html(self.banklist_data, 'Metcalf Bank', - attrs={'id': 'table'}) + df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"}) + df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) def test_spam(self): - df1 = self.read_html(self.spam_data, '.*Water.*') - df2 = self.read_html(self.spam_data, 'Unit') + df1 = self.read_html(self.spam_data, ".*Water.*") + df2 = self.read_html(self.spam_data, "Unit") assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' + assert df1[0].iloc[0, 0] == "Proximates" + assert df1[0].columns[0] == "Nutrient" def test_spam_no_match(self): dfs = self.read_html(self.spam_data) @@ -141,90 +163,88 @@ def test_spam_no_match(self): assert isinstance(df, DataFrame) def test_banklist_no_match(self): - dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) + dfs = self.read_html(self.banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) def test_spam_header(self): - df = self.read_html(self.spam_data, '.*Water.*', header=2)[0] - assert df.columns[0] == 'Proximates' + df = self.read_html(self.spam_data, ".*Water.*", header=2)[0] + assert df.columns[0] == "Proximates" assert not df.empty def test_skiprows_int(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] - df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0] + df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0] tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows={1, 2}) - df2 = self.read_html(self.spam_data, 'Unit', skiprows={2, 1}) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2}) + df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): - df1 = self.read_html(self.spam_data, '.*Water.*', 
skiprows=slice(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): - df1 = self.read_html(self.spam_data, '.*Water.*', - skiprows=np.arange(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - with pytest.raises(TypeError, match=('is not a valid type ' - 'for skipping rows')): - self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') + with pytest.raises( + TypeError, match=("is not a valid type " "for skipping rows") + ): + self.read_html(self.spam_data, ".*Water.*", skiprows="asdf") def test_index(self): - df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): - df1 = self.read_html(self.spam_data, '.*Water.*', header=1, - index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): - df1 = self.read_html(self.spam_data, '.*Water.*', header=1, - index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self): # 10892 infer_types removed - df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) assert_framelist_equal(df1, df2) def test_string_io(self): @@ -234,49 +254,48 @@ def test_string_io(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, '.*Water.*') - df2 = self.read_html(data2, 'Unit') + df1 = self.read_html(data1, ".*Water.*") + df2 = self.read_html(data2, "Unit") assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() - df1 = self.read_html(data, '.*Water.*') - df2 = self.read_html(data, 'Unit') + df1 = self.read_html(data, ".*Water.*") + df2 = self.read_html(data, "Unit") assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data, **self.spam_data_kwargs) as f: - df1 = self.read_html(f, '.*Water.*') + df1 = self.read_html(f, ".*Water.*") with open(self.spam_data, **self.spam_data_kwargs) as f: - 
df2 = self.read_html(f, 'Unit') + df2 = self.read_html(f, "Unit") assert_framelist_equal(df1, df2) @network def test_bad_url_protocol(self): with pytest.raises(URLError): - self.read_html('git://github.com', match='.*Water.*') + self.read_html("git://github.com", match=".*Water.*") @network @pytest.mark.slow def test_invalid_url(self): try: with pytest.raises(URLError): - self.read_html('http://www.a23950sdfa908sd.com', - match='.*Water.*') + self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") except ValueError as e: - assert 'No tables found' in str(e) + assert "No tables found" in str(e) @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(os.path.abspath(url)), - 'First', - attrs={'id': 'table'}) + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"} + ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) @@ -284,13 +303,15 @@ def test_file_url(self): @pytest.mark.slow def test_invalid_table_attrs(self): url = self.banklist_data - with pytest.raises(ValueError, match='No tables found'): - self.read_html(url, 'First Federal Bank of Florida', - attrs={'id': 'tasdfable'}) + with pytest.raises(ValueError, match="No tables found"): + self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "tasdfable"} + ) def _bank_data(self, *args, **kwargs): - return self.read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'}, *args, **kwargs) + return self.read_html( + self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs + ) @pytest.mark.slow def test_multiindex_header(self): @@ -327,37 +348,38 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(os.path.abspath(url)), - match=re.compile(re.compile('Florida')), - attrs={'id': 'table'}) + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), + match=re.compile(re.compile("Florida")), + attrs={"id": "table"}, + ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) def test_negative_skiprows(self): - msg = r'\(you passed a negative value\)' + msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(self.spam_data, 'Water', skiprows=-1) + self.read_html(self.spam_data, "Water", skiprows=-1) @network def test_multiple_matches(self): - url = 'https://docs.python.org/2/' - dfs = self.read_html(url, match='Python') + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") assert len(dfs) > 1 @network def test_python_docs_table(self): - url = 'https://docs.python.org/2/' - dfs = self.read_html(url, match='Python') + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] - assert sorted(zz) == sorted(['Repo', 'What']) + assert sorted(zz) == sorted(["Repo", "What"]) @pytest.mark.slow def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 macau_data = datapath("io", "data", "macau.html") - dfs = self.read_html(macau_data, index_col=0, - attrs={'class': 'style1'}) + dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) df = dfs[all_non_nan_table_index] assert not any(s.isna().any() for _, s in df.iteritems()) @@ -365,7 +387,7 @@ def test_thousands_macau_stats(self, datapath): @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - 
macau_data = datapath('io', 'data', 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -375,7 +397,8 @@ def test_empty_tables(self): """ Make sure that read_html ignores empty tables. """ - result = self.read_html(''' + result = self.read_html( + """ @@ -394,14 +417,16 @@ def test_empty_tables(self):
- ''') + """ + ) assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. - result = self.read_html(''' + result = self.read_html( + """
@@ -420,9 +445,10 @@ def test_multiple_tbody(self): -
A4
''')[0] + """ + )[0] - expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B']) + expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -431,7 +457,8 @@ def test_header_and_one_column(self): Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - result = self.read_html(''' + result = self.read_html( + """
@@ -442,9 +469,10 @@ def test_header_and_one_column(self): -
Headerfirst
''')[0] + """ + )[0] - expected = DataFrame(data={'Header': 'first'}, index=[0]) + expected = DataFrame(data={"Header": "first"}, index=[0]) tm.assert_frame_equal(result, expected) @@ -452,7 +480,8 @@ def test_thead_without_tr(self): """ Ensure parser adds within on malformed HTML. """ - result = self.read_html(''' + result = self.read_html( + """
@@ -467,10 +496,13 @@ def test_thead_without_tr(self): -
Country1944
''')[0] + """ + )[0] - expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]], - columns=['Country', 'Municipality', 'Year']) + expected = DataFrame( + data=[["Ukraine", "Odessa", 1944]], + columns=["Country", "Municipality", "Year"], + ) tm.assert_frame_equal(result, expected) @@ -479,7 +511,7 @@ def test_tfoot_read(self): Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot """ - data_template = ''' + data_template = """
@@ -495,16 +527,16 @@ def test_tfoot_read(self): {footer} -
A
''' + """ - expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B']) + expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"]) - expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']], - columns=['A', 'B']) + expected2 = DataFrame( + data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"] + ) data1 = data_template.format(footer="") - data2 = data_template.format( - footer="footAfootB") + data2 = data_template.format(footer="footAfootB") result1 = self.read_html(data1)[0] result2 = self.read_html(data2)[0] @@ -515,7 +547,8 @@ def test_tfoot_read(self): def test_parse_header_of_non_string_column(self): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str - result = self.read_html(''' + result = self.read_html( + """ @@ -526,19 +559,27 @@ def test_parse_header_of_non_string_column(self):
S1944
- ''', header=0)[0] + """, + header=0, + )[0] - expected = DataFrame([['text', 1944]], columns=('S', 'I')) + expected = DataFrame([["text", 1944]], columns=("S", "I")) tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): - data = datapath('io', 'data', 'nyse_wsj.html') - df = self.read_html(data, index_col=0, header=0, - attrs={'class': 'mdcTable'})[0] - - expected = Index(['Issue(Roll over for charts and headlines)', - 'Volume', 'Price', 'Chg', '% Chg']) + data = datapath("io", "data", "nyse_wsj.html") + df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] + + expected = Index( + [ + "Issue(Roll over for charts and headlines)", + "Volume", + "Price", + "Chg", + "% Chg", + ] + ) nrows = 100 assert df.shape[0] == nrows tm.assert_index_equal(df.columns, expected) @@ -553,48 +594,57 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'})[0] - ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), - converters={'Updated Date': Timestamp, - 'Closing Date': Timestamp}) + df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] + ground_truth = read_csv( + datapath("io", "data", "banklist.csv"), + converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, + ) assert df.shape == ground_truth.shape - old = ['First Vietnamese American BankIn Vietnamese', - 'Westernbank Puerto RicoEn Espanol', - 'R-G Premier Bank of Puerto RicoEn Espanol', - 'EurobankEn Espanol', 'Sanderson State BankEn Espanol', - 'Washington Mutual Bank(Including its subsidiary Washington ' - 'Mutual Bank FSB)', - 'Silver State BankEn Espanol', - 'AmTrade International BankEn Espanol', - 'Hamilton Bank, NAEn Espanol', - 'The Citizens Savings BankPioneer Community Bank, Inc.'] - new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico', - 'R-G Premier Bank of Puerto Rico', 'Eurobank', - 'Sanderson State Bank', 'Washington Mutual Bank', - 'Silver State Bank', 'AmTrade International Bank', - 'Hamilton Bank, NA', 'The Citizens Savings Bank'] + old = [ + "First Vietnamese American BankIn Vietnamese", + "Westernbank Puerto RicoEn Espanol", + "R-G Premier Bank of Puerto RicoEn Espanol", + "EurobankEn Espanol", + "Sanderson State BankEn Espanol", + "Washington Mutual Bank(Including its subsidiary Washington " + "Mutual Bank FSB)", + "Silver State BankEn Espanol", + "AmTrade International BankEn Espanol", + "Hamilton Bank, NAEn Espanol", + "The Citizens Savings BankPioneer Community Bank, Inc.", + ] + new = [ + "First Vietnamese American Bank", + "Westernbank Puerto Rico", + "R-G Premier Bank of Puerto Rico", + "Eurobank", + "Sanderson State Bank", + "Washington Mutual Bank", + "Silver State Bank", + "AmTrade International Bank", + "Hamilton Bank, NA", + "The Citizens Savings Bank", + ] dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew._convert(datetime=True, numeric=True) - date_cols = ['Closing Date', 'Updated Date'] - converted[date_cols] = converted[date_cols]._convert(datetime=True, - coerce=True) + date_cols = ["Closing Date", "Updated Date"] + converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True) tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow def test_gold_canyon(self): - gc = 'Gold Canyon' - with open(self.banklist_data, 'r') as f: + gc = "Gold Canyon" + with open(self.banklist_data, "r") as f: raw_text = f.read() assert gc in raw_text - df = 
self.read_html(self.banklist_data, 'Gold Canyon', - attrs={'id': 'table'})[0] + df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0] assert gc in df.to_string() def test_different_number_of_cols(self): - expected = self.read_html(""" + expected = self.read_html( + """
@@ -623,9 +673,12 @@ def test_different_number_of_cols(self): -
0.222
""", index_col=0)[0] + """, + index_col=0, + )[0] - result = self.read_html(""" + result = self.read_html( + """
@@ -651,13 +704,16 @@ def test_different_number_of_cols(self): -
0.222
""", index_col=0)[0] + """, + index_col=0, + )[0] tm.assert_frame_equal(result, expected) def test_colspan_rowspan_1(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """ @@ -670,9 +726,10 @@ def test_colspan_rowspan_1(self):
Ac
- """)[0] + """ + )[0] - expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C']) + expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -684,7 +741,8 @@ def test_colspan_rowspan_copy_values(self): # X x Y Z W # A B b z C - result = self.read_html(""" + result = self.read_html( + """ @@ -698,10 +756,13 @@ def test_colspan_rowspan_copy_values(self):
XC
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']], - columns=['X', 'X.1', 'Y', 'Z', 'W']) + expected = DataFrame( + data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"] + ) tm.assert_frame_equal(result, expected) @@ -713,7 +774,8 @@ def test_colspan_rowspan_both_not_1(self): # A B b b C # a b b b D - result = self.read_html(""" + result = self.read_html( + """ @@ -724,10 +786,13 @@ def test_colspan_rowspan_both_not_1(self):
AD
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']], - columns=['A', 'B', 'B.1', 'B.2', 'C']) + expected = DataFrame( + data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"] + ) tm.assert_frame_equal(result, expected) @@ -739,7 +804,8 @@ def test_rowspan_at_end_of_row(self): # A B # C b - result = self.read_html(""" + result = self.read_html( + """ @@ -749,32 +815,37 @@ def test_rowspan_at_end_of_row(self):
AC
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['C', 'B']], columns=['A', 'B']) + expected = DataFrame(data=[["C", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_rowspan_only_rows(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """
A B
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B'], ['A', 'B']], - columns=['A', 'B']) + expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_header_inferred_from_rows_with_only_th(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """ @@ -789,53 +860,63 @@ def test_header_inferred_from_rows_with_only_th(self):
A2
- """)[0] + """ + )[0] - columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - codes=[[0, 1], [0, 1]]) + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_parse_dates_list(self): - df = DataFrame({'date': date_range('1/1/2001', periods=10)}) + df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() res = self.read_html(expected, parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=['date'], index_col=0) + res = self.read_html(expected, parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self): - raw_dates = Series(date_range('1/1/2001', periods=10)) - df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), - 'time': raw_dates.map(lambda x: str(x.time()))}) - res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, - index_col=1) - newdf = DataFrame({'datetime': raw_dates}) + raw_dates = Series(date_range("1/1/2001", periods=10)) + df = DataFrame( + { + "date": raw_dates.map(lambda x: str(x.date())), + "time": raw_dates.map(lambda x: str(x.time())), + } + ) + res = self.read_html( + df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1 + ) + newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) def test_computer_sales_page(self, datapath): - data = datapath('io', 'data', 'computer_sales_page.html') - msg = (r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns") + data = datapath("io", "data", "computer_sales_page.html") + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): self.read_html(data, header=[0, 1]) - data = datapath('io', 'data', 'computer_sales_page.html') + data = datapath("io", "data", "computer_sales_page.html") assert self.read_html(data, header=[1, 2]) def test_wikipedia_states_table(self, datapath): - data = datapath('io', 'data', 'wikipedia_states.html') - assert os.path.isfile(data), '%r is not a file' % data - assert os.path.getsize(data), '%r is an empty file' % data - result = self.read_html(data, 'Arizona', header=1)[0] - assert result['sq mi'].dtype == np.dtype('float64') + data = datapath("io", "data", "wikipedia_states.html") + assert os.path.isfile(data), "%r is not a file" % data + assert os.path.getsize(data), "%r is an empty file" % data + result = self.read_html(data, "Arizona", header=1)[0] + assert result["sq mi"].dtype == np.dtype("float64") def test_parser_error_on_empty_header_row(self): - msg = (r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns") + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): - self.read_html(""" + self.read_html( + """ @@ -845,11 +926,14 @@ def test_parser_error_on_empty_header_row(self):
ab
- """, header=[0, 1]) + """, + header=[0, 1], + ) def test_decimal_rows(self): # GH 12907 - result = self.read_html(''' + result = self.read_html( + """ @@ -864,11 +948,13 @@ def test_decimal_rows(self):
- ''', decimal='#')[0] + """, + decimal="#", + )[0] - expected = DataFrame(data={'Header': 1100.101}, index=[0]) + expected = DataFrame(data={"Header": 1100.101}, index=[0]) - assert result['Header'].dtype == np.dtype('float64') + assert result["Header"].dtype == np.dtype("float64") tm.assert_frame_equal(result, expected) def test_bool_header_arg(self): @@ -895,10 +981,10 @@ def test_converters(self): """, - converters={'a': str} + converters={"a": str}, )[0] - expected = DataFrame({'a': ['0.763', '0.244']}) + expected = DataFrame({"a": ["0.763", "0.244"]}) tm.assert_frame_equal(result, expected) @@ -920,9 +1006,10 @@ def test_na_values(self): """, - na_values=[0.244])[0] + na_values=[0.244], + )[0] - expected = DataFrame({'a': [0.763, np.nan]}) + expected = DataFrame({"a": [0.763, np.nan]}) tm.assert_frame_equal(result, expected) @@ -943,16 +1030,17 @@ def test_keep_default_na(self): """ - expected_df = DataFrame({'a': ['N/A', 'NA']}) + expected_df = DataFrame({"a": ["N/A", "NA"]}) html_df = self.read_html(html_data, keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) - expected_df = DataFrame({'a': [np.nan, np.nan]}) + expected_df = DataFrame({"a": [np.nan, np.nan]}) html_df = self.read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_preserve_empty_rows(self): - result = self.read_html(""" + result = self.read_html( + """ @@ -967,15 +1055,16 @@ def test_preserve_empty_rows(self):
A
- """)[0] + """ + )[0] - expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]], - columns=['A', 'B']) + expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_ignore_empty_rows_when_inferring_header(self): - result = self.read_html(""" + result = self.read_html( + """ @@ -986,50 +1075,56 @@ def test_ignore_empty_rows_when_inferring_header(self):
12
- """)[0] + """ + )[0] - columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - codes=[[0, 1], [0, 1]]) + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_multiple_header_rows(self): # Issue #13434 - expected_df = DataFrame(data=[("Hillary", 68, "D"), - ("Bernie", 74, "D"), - ("Donald", 69, "R")]) - expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], - ["Name", "Unnamed: 1_level_1", - "Unnamed: 2_level_1"]] + expected_df = DataFrame( + data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] + ) + expected_df.columns = [ + ["Unnamed: 0_level_0", "Age", "Party"], + ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], + ] html = expected_df.to_html(index=False) - html_df = self.read_html(html, )[0] + html_df = self.read_html(html)[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): - filename = datapath('io', 'data', 'valid_markup.html') + filename = datapath("io", "data", "valid_markup.html") dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath): - banklist_data = datapath('io', 'data', 'banklist.html') - self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) + banklist_data = datapath("io", "data", "banklist.html") + self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): - rng = date_range('2000-01-01', periods=10) + rng = date_range("2000-01-01", periods=10) df = DataFrame(np.random.randn(10, 4), index=rng) result = df.to_html() - assert '2000-01-01' in result - - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ - (True, DataFrame(["foo"]), None), - (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + assert "2000-01-01" in result + + @pytest.mark.parametrize( + "displayed_only,exp0,exp1", + [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), + ], + ) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 - data = StringIO(""" + data = StringIO( + """ @@ -1047,7 +1142,8 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
- """) + """ + ) dfs = self.read_html(data, displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) @@ -1058,49 +1154,51 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table def test_encode(self, html_encoding_file): - _, encoding = os.path.splitext( - os.path.basename(html_encoding_file) - )[0].split('_') + _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split( + "_" + ) try: - with open(html_encoding_file, 'rb') as fobj: - from_string = self.read_html(fobj.read(), encoding=encoding, - index_col=0).pop() - - with open(html_encoding_file, 'rb') as fobj: - from_file_like = self.read_html(BytesIO(fobj.read()), - encoding=encoding, - index_col=0).pop() - - from_filename = self.read_html(html_encoding_file, - encoding=encoding, - index_col=0).pop() + with open(html_encoding_file, "rb") as fobj: + from_string = self.read_html( + fobj.read(), encoding=encoding, index_col=0 + ).pop() + + with open(html_encoding_file, "rb") as fobj: + from_file_like = self.read_html( + BytesIO(fobj.read()), encoding=encoding, index_col=0 + ).pop() + + from_filename = self.read_html( + html_encoding_file, encoding=encoding, index_col=0 + ).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) except Exception: # seems utf-16/32 fail on windows if is_platform_windows(): - if '16' in encoding or '32' in encoding: + if "16" in encoding or "32" in encoding: pytest.skip() raise def test_parse_failure_unseekable(self): # Issue #17975 - if self.read_html.keywords.get('flavor') == 'lxml': + if self.read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): def seekable(self): return False - bad = UnseekableStringIO(''' -
spameggs
''') + bad = UnseekableStringIO( + """ +
spameggs
""" + ) assert self.read_html(bad) - with pytest.raises(ValueError, - match='passed a non-rewindable file object'): + with pytest.raises(ValueError, match="passed a non-rewindable file object"): self.read_html(bad) def test_parse_failure_rewinds(self): @@ -1112,7 +1210,7 @@ def __init__(self, data): self.at_end = False def read(self, size=None): - data = '' if self.at_end else self.data + data = "" if self.at_end else self.data self.at_end = True return data @@ -1122,8 +1220,8 @@ def seek(self, offset): def seekable(self): return True - good = MockFile('
spam
eggs
') - bad = MockFile('
spameggs
') + good = MockFile("
spam
eggs
") + bad = MockFile("
spameggs
") assert self.read_html(good) assert self.read_html(bad) @@ -1144,7 +1242,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = datapath('io', 'data', 'valid_markup.html') + filename = datapath("io", "data", "valid_markup.html") helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 203b550b8936ae..83c11cd9ab996e 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -12,12 +12,27 @@ import pandas from pandas import ( - Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Period, Series, - Timestamp, bdate_range, date_range, period_range) + Categorical, + DataFrame, + Index, + Interval, + MultiIndex, + NaT, + Period, + Series, + Timestamp, + bdate_range, + date_range, + period_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_categorical_equal, assert_frame_equal, assert_index_equal, - assert_series_equal, ensure_clean) + assert_categorical_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, + ensure_clean, +) from pandas.io.packers import read_msgpack, to_msgpack @@ -38,26 +53,26 @@ _ZLIB_INSTALLED = True -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def current_packers_data(): # our current version packers data - from pandas.tests.io.generate_legacy_storage_files import ( - create_msgpack_data) + from pandas.tests.io.generate_legacy_storage_files import create_msgpack_data + return create_msgpack_data() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def all_packers_data(): # our all of our current version packers data - from pandas.tests.io.generate_legacy_storage_files import ( - create_data) + from pandas.tests.io.generate_legacy_storage_files import create_data + return create_data() def check_arbitrary(a, b): if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)): - assert(len(a) == len(b)) + assert len(a) == len(b) for a_, b_ in zip(a, b): check_arbitrary(a_, b_) elif isinstance(a, DataFrame): @@ -70,7 +85,7 @@ def check_arbitrary(a, b): # Temp, # Categorical.categories is changed from str to bytes in PY3 # maybe the same as GH 13591 - if b.categories.inferred_type == 'string': + if b.categories.inferred_type == "string": pass else: tm.assert_categorical_equal(a, b) @@ -80,14 +95,13 @@ def check_arbitrary(a, b): assert a == b assert a.freq == b.freq else: - assert(a == b) + assert a == b @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestPackers: - def setup_method(self, method): - self.path = '__%s__.msg' % tm.rands(10) + self.path = "__%s__.msg" % tm.rands(10) def teardown_method(self, method): pass @@ -100,7 +114,6 @@ def encode_decode(self, x, compress=None, **kwargs): @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestAPI(TestPackers): - def test_string_io(self): df = DataFrame(np.random.randn(10, 2)) @@ -123,7 +136,7 @@ def test_string_io(self): with ensure_clean(self.path) as p: s = df.to_msgpack() - with open(p, 'wb') as fh: + with open(p, "wb") as fh: fh.write(s) result = read_msgpack(p) tm.assert_frame_equal(result, df) @@ -148,22 +161,20 @@ def test_iterator_with_string_io(self): def test_invalid_arg(self): # GH10369 class A: - def __init__(self): self.read = 0 msg = "Invalid file path or buffer object type: " - with pytest.raises(ValueError, 
match=msg.format('NoneType')): + with pytest.raises(ValueError, match=msg.format("NoneType")): read_msgpack(path_or_buf=None) - with pytest.raises(ValueError, match=msg.format('dict')): + with pytest.raises(ValueError, match=msg.format("dict")): read_msgpack(path_or_buf={}) - with pytest.raises(ValueError, match=msg.format(r'.*\.A')): + with pytest.raises(ValueError, match=msg.format(r".*\.A")): read_msgpack(path_or_buf=A()) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestNumpy(TestPackers): - def test_numpy_scalar_float(self): x = np.float32(np.random.rand()) x_rec = self.encode_decode(x) @@ -203,12 +214,12 @@ def test_list_numpy_float(self): tm.assert_almost_equal(tuple(x), x_rec) def test_list_numpy_float_complex(self): - if not hasattr(np, 'complex128'): - pytest.skip('numpy can not handle complex128') + if not hasattr(np, "complex128"): + pytest.skip("numpy can not handle complex128") - x = [np.float32(np.random.rand()) for i in range(5)] + \ - [np.complex128(np.random.rand() + 1j * np.random.rand()) - for i in range(5)] + x = [np.float32(np.random.rand()) for i in range(5)] + [ + np.complex128(np.random.rand() + 1j * np.random.rand()) for i in range(5) + ] x_rec = self.encode_decode(x) assert np.allclose(x, x_rec) @@ -222,18 +233,19 @@ def test_list_float(self): tm.assert_almost_equal(tuple(x), x_rec) def test_list_float_complex(self): - x = [np.random.rand() for i in range(5)] + \ - [(np.random.rand() + 1j * np.random.rand()) for i in range(5)] + x = [np.random.rand() for i in range(5)] + [ + (np.random.rand() + 1j * np.random.rand()) for i in range(5) + ] x_rec = self.encode_decode(x) assert np.allclose(x, x_rec) def test_dict_float(self): - x = {'foo': 1.0, 'bar': 2.0} + x = {"foo": 1.0, "bar": 2.0} x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) def test_dict_complex(self): - x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j} + x = {"foo": 1.0 + 1.0j, "bar": 2.0 + 2.0j} x_rec = self.encode_decode(x) tm.assert_dict_equal(x, x_rec) @@ -241,13 +253,12 @@ def test_dict_complex(self): tm.assert_class_equal(x[key], x_rec[key], obj="complex value") def test_dict_numpy_float(self): - x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)} + x = {"foo": np.float32(1.0), "bar": np.float32(2.0)} x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) def test_dict_numpy_complex(self): - x = {'foo': np.complex128(1.0 + 1.0j), - 'bar': np.complex128(2.0 + 2.0j)} + x = {"foo": np.complex128(1.0 + 1.0j), "bar": np.complex128(2.0 + 2.0j)} x_rec = self.encode_decode(x) tm.assert_dict_equal(x, x_rec) @@ -259,7 +270,7 @@ def test_numpy_array_float(self): # run multiple times for n in range(10): x = np.random.rand(10) - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: x = x.astype(dtype) x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) @@ -267,11 +278,10 @@ def test_numpy_array_float(self): def test_numpy_array_complex(self): x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128) x_rec = self.encode_decode(x) - assert (all(map(lambda x, y: x == y, x, x_rec)) and - x.dtype == x_rec.dtype) + assert all(map(lambda x, y: x == y, x, x_rec)) and x.dtype == x_rec.dtype def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), 'foo', np.bool_(1)] + x = [1.0, np.float32(3.5), np.complex128(4.25), "foo", np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -282,12 +292,13 @@ def test_list_mixed(self): 
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestBasic(TestPackers): - def test_timestamp(self): - for i in [Timestamp( - '20130101'), Timestamp('20130101', tz='US/Eastern'), - Timestamp('201301010501')]: + for i in [ + Timestamp("20130101"), + Timestamp("20130101", tz="US/Eastern"), + Timestamp("201301010501"), + ]: i_rec = self.encode_decode(i) assert i == i_rec @@ -297,62 +308,70 @@ def test_nat(self): def test_datetimes(self): - for i in [datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 1, 5, 1), - datetime.date(2013, 1, 1), - np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]: + for i in [ + datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 1, 5, 1), + datetime.date(2013, 1, 1), + np.datetime64(datetime.datetime(2013, 1, 5, 2, 15)), + ]: i_rec = self.encode_decode(i) assert i == i_rec def test_timedeltas(self): - for i in [datetime.timedelta(days=1), - datetime.timedelta(days=1, seconds=10), - np.timedelta64(1000000)]: + for i in [ + datetime.timedelta(days=1), + datetime.timedelta(days=1, seconds=10), + np.timedelta64(1000000), + ]: i_rec = self.encode_decode(i) assert i == i_rec def test_periods(self): # 13463 - for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]: + for i in [Period("2010-09", "M"), Period("2014-Q1", "Q")]: i_rec = self.encode_decode(i) assert i == i_rec def test_intervals(self): # 19967 - for i in [Interval(0, 1), Interval(0, 1, 'left'), - Interval(10, 25., 'right')]: + for i in [Interval(0, 1), Interval(0, 1, "left"), Interval(10, 25.0, "right")]: i_rec = self.encode_decode(i) assert i == i_rec @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestIndex(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = { - 'string': tm.makeStringIndex(100), - 'date': tm.makeDateIndex(100), - 'int': tm.makeIntIndex(100), - 'rng': tm.makeRangeIndex(100), - 'float': tm.makeFloatIndex(100), - 'empty': Index([]), - 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])), - 'period': Index(period_range('2012-1-1', freq='M', periods=3)), - 'date2': Index(date_range('2013-01-1', periods=10)), - 'bdate': Index(bdate_range('2013-01-02', periods=10)), - 'cat': tm.makeCategoricalIndex(100), - 'interval': tm.makeIntervalIndex(100), - 'timedelta': tm.makeTimedeltaIndex(100, 'H') + "string": tm.makeStringIndex(100), + "date": tm.makeDateIndex(100), + "int": tm.makeIntIndex(100), + "rng": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "empty": Index([]), + "tuple": Index(zip(["foo", "bar", "baz"], [1, 2, 3])), + "period": Index(period_range("2012-1-1", freq="M", periods=3)), + "date2": Index(date_range("2013-01-1", periods=10)), + "bdate": Index(bdate_range("2013-01-02", periods=10)), + "cat": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "timedelta": tm.makeTimedeltaIndex(100, "H"), } self.mi = { - 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), - ('foo', 'two'), - ('qux', 'one'), ('qux', 'two')], - names=['first', 'second']), + "reg": MultiIndex.from_tuples( + [ + ("bar", "one"), + ("baz", "two"), + ("foo", "two"), + ("qux", "one"), + ("qux", "two"), + ], + names=["first", "second"], + ) } def test_basic_index(self): @@ -362,13 +381,14 @@ def test_basic_index(self): tm.assert_index_equal(i, i_rec) # datetime with no freq (GH5506) - i = Index([Timestamp('20130101'), Timestamp('20130103')]) + i = Index([Timestamp("20130101"), Timestamp("20130103")]) i_rec = self.encode_decode(i) tm.assert_index_equal(i, i_rec) # datetime with timezone - i 
= Index([Timestamp('20130101 9:00:00'), Timestamp( - '20130103 11:00:00')]).tz_localize('US/Eastern') + i = Index( + [Timestamp("20130101 9:00:00"), Timestamp("20130103 11:00:00")] + ).tz_localize("US/Eastern") i_rec = self.encode_decode(i) tm.assert_index_equal(i, i_rec) @@ -387,52 +407,51 @@ def test_unicode(self): def categorical_index(self): # GH15487 df = DataFrame(np.random.randn(10, 2)) - df = df.astype({0: 'category'}).set_index(0) + df = df.astype({0: "category"}).set_index(0) result = self.encode_decode(df) tm.assert_frame_equal(result, df) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSeries(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = {} s = tm.makeStringSeries() - s.name = 'string' - self.d['string'] = s + s.name = "string" + self.d["string"] = s s = tm.makeObjectSeries() - s.name = 'object' - self.d['object'] = s + s.name = "object" + self.d["object"] = s - s = Series(iNaT, dtype='M8[ns]', index=range(5)) - self.d['date'] = s + s = Series(iNaT, dtype="M8[ns]", index=range(5)) + self.d["date"] = s data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], - 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + - [Timestamp('20130603', tz='CET')] * 3, - 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, - 'H': Categorical([1, 2, 3, 4, 5]), - 'I': Categorical([1, 2, 3, 4, 5], ordered=True), - 'J': (np.bool_(1), 2, 3, 4, 5), + "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], + "F": [Timestamp("20130102", tz="US/Eastern")] * 2 + + [Timestamp("20130603", tz="CET")] * 3, + "G": [Timestamp("20130102", tz="US/Eastern")] * 5, + "H": Categorical([1, 2, 3, 4, 5]), + "I": Categorical([1, 2, 3, 4, 5], ordered=True), + "J": (np.bool_(1), 2, 3, 4, 5), } - self.d['float'] = Series(data['A']) - self.d['int'] = Series(data['B']) - self.d['mixed'] = Series(data['E']) - self.d['dt_tz_mixed'] = Series(data['F']) - self.d['dt_tz'] = Series(data['G']) - self.d['cat_ordered'] = Series(data['H']) - self.d['cat_unordered'] = Series(data['I']) - self.d['numpy_bool_mixed'] = Series(data['J']) + self.d["float"] = Series(data["A"]) + self.d["int"] = Series(data["B"]) + self.d["mixed"] = Series(data["E"]) + self.d["dt_tz_mixed"] = Series(data["F"]) + self.d["dt_tz"] = Series(data["G"]) + self.d["cat_ordered"] = Series(data["H"]) + self.d["cat_unordered"] = Series(data["I"]) + self.d["numpy_bool_mixed"] = Series(data["J"]) def test_basic(self): @@ -445,18 +464,18 @@ def test_basic(self): @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCategorical(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = {} - self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e']) - self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'], - ordered=True) + self.d["plain_str"] = Categorical(["a", "b", "c", "d", "e"]) + self.d["plain_str_ordered"] = Categorical( + ["a", "b", "c", "d", "e"], ordered=True + ) - self.d['plain_int'] = Categorical([5, 6, 7, 8]) - self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True) + self.d["plain_int"] = Categorical([5, 6, 7, 8]) + self.d["plain_int_ordered"] = Categorical([5, 6, 7, 8], ordered=True) def test_basic(self): @@ -469,26 +488,26 @@ def 
test_basic(self): @pytest.mark.filterwarnings("ignore:msgpack:FutureWarning") class TestNDFrame(TestPackers): - def setup_method(self, method): super().setup_method(method) data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], - 'F': [Timestamp('20130102', tz='US/Eastern')] * 5, - 'G': [Timestamp('20130603', tz='CET')] * 5, - 'H': Categorical(['a', 'b', 'c', 'd', 'e']), - 'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True), + "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], + "F": [Timestamp("20130102", tz="US/Eastern")] * 5, + "G": [Timestamp("20130603", tz="CET")] * 5, + "H": Categorical(["a", "b", "c", "d", "e"]), + "I": Categorical(["a", "b", "c", "d", "e"], ordered=True), } self.frame = { - 'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)), - 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)), - 'mixed': DataFrame(data)} + "float": DataFrame(dict(A=data["A"], B=Series(data["A"]) + 1)), + "int": DataFrame(dict(A=data["B"], B=Series(data["B"]) + 1)), + "mixed": DataFrame(data), + } def test_basic_frame(self): @@ -502,22 +521,31 @@ def test_multi(self): for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) - packed_items = tuple([self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None]) + packed_items = tuple( + [self.frame["float"], self.frame["float"].A, self.frame["float"].B, None] + ) l_rec = self.encode_decode(packed_items) check_arbitrary(packed_items, l_rec) # this is an oddity in that packed lists will be returned as tuples - packed_items = [self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None] + packed_items = [ + self.frame["float"], + self.frame["float"].A, + self.frame["float"].B, + None, + ] l_rec = self.encode_decode(packed_items) assert isinstance(l_rec, tuple) check_arbitrary(packed_items, l_rec) def test_iterator(self): - packed_items = [self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None] + packed_items = [ + self.frame["float"], + self.frame["float"].A, + self.frame["float"].B, + None, + ] with ensure_clean(self.path) as path: to_msgpack(path, *packed_items) @@ -528,22 +556,22 @@ def tests_datetimeindex_freq_issue(self): # GH 5947 # inferring freq on the datetimeindex - df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013')) + df = DataFrame([1, 2, 3], index=date_range("1/1/2013", "1/3/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) - df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013')) + df = DataFrame([1, 2], index=date_range("1/1/2013", "1/2/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) def test_dataframe_duplicate_column_names(self): # GH 9618 - expected_1 = DataFrame(columns=['a', 'a']) + expected_1 = DataFrame(columns=["a", "a"]) expected_2 = DataFrame(columns=[1] * 100) expected_2.loc[0] = np.random.randn(100) expected_3 = DataFrame(columns=[1, 1]) - expected_3.loc[0] = ['abc', np.nan] + expected_3.loc[0] = ["abc", np.nan] result_1 = self.encode_decode(expected_1) result_2 = self.encode_decode(expected_2) @@ -559,7 +587,6 @@ def test_dataframe_duplicate_column_names(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") 
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSparse(TestPackers): - def _check_roundtrip(self, obj, comparator, **kwargs): # currently these are not implemetned @@ -574,16 +601,13 @@ def test_sparse_series(self): s = tm.makeStringSeries() s[3:5] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_series_equal, - check_series_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) def test_sparse_frame(self): @@ -592,16 +616,13 @@ def test_sparse_frame(self): s.loc[8:10, -2] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_frame_equal, - check_frame_type=True) + self._check_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_frame_equal, - check_frame_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_frame_equal, - check_frame_type=True) + self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") @@ -612,6 +633,7 @@ class TestCompression(TestPackers): def setup_method(self, method): try: from sqlalchemy import create_engine + self._create_sql_engine = create_engine except ImportError: self._SQLALCHEMY_INSTALLED = False @@ -620,16 +642,16 @@ def setup_method(self, method): super().setup_method(method) data = { - 'A': np.arange(1000, dtype=np.float64), - 'B': np.arange(1000, dtype=np.int32), - 'C': list(100 * 'abcdefghij'), - 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), - 'E': [datetime.timedelta(days=x) for x in range(1000)], + "A": np.arange(1000, dtype=np.float64), + "B": np.arange(1000, dtype=np.int32), + "C": list(100 * "abcdefghij"), + "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), + "E": [datetime.timedelta(days=x) for x in range(1000)], } self.frame = { - 'float': DataFrame({k: data[k] for k in ['A', 'A']}), - 'int': DataFrame({k: data[k] for k in ['B', 'B']}), - 'mixed': DataFrame(data), + "float": DataFrame({k: data[k] for k in ["A", "A"]}), + "int": DataFrame({k: data[k] for k in ["B", "B"]}), + "mixed": DataFrame(data), } def test_plain(self): @@ -649,16 +671,15 @@ def _test_compression(self, compress): def test_compression_zlib(self): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_compression('zlib') + pytest.skip("no zlib") + self._test_compression("zlib") def test_compression_blosc(self): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_compression('blosc') + pytest.skip("no blosc") + self._test_compression("blosc") - def _test_compression_warns_when_decompress_caches( - self, monkeypatch, compress): + def _test_compression_warns_when_decompress_caches(self, monkeypatch, compress): not_garbage = [] control = [] # copied data @@ -676,19 +697,20 @@ def decompress(ob): # types mapped to values to add in place. 
rhs = { - np.dtype('float64'): 1.0, - np.dtype('int32'): 1, - np.dtype('object'): 'a', - np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'), - np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'), + np.dtype("float64"): 1.0, + np.dtype("int32"): 1, + np.dtype("object"): "a", + np.dtype("datetime64[ns]"): np.timedelta64(1, "ns"), + np.dtype("timedelta64[ns]"): np.timedelta64(1, "ns"), } - with monkeypatch.context() as m, \ - tm.assert_produces_warning(PerformanceWarning) as ws: - m.setattr(compress_module, 'decompress', decompress) + with monkeypatch.context() as m, tm.assert_produces_warning( + PerformanceWarning + ) as ws: + m.setattr(compress_module, "decompress", decompress) with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) i_rec = self.encode_decode(self.frame, compress=compress) for k in self.frame.keys(): @@ -704,9 +726,11 @@ def decompress(ob): for w in ws: # check the messages from our warnings - assert str(w.message) == ('copying data after decompressing; ' - 'this may mean that decompress is ' - 'caching its result') + assert str(w.message) == ( + "copying data after decompressing; " + "this may mean that decompress is " + "caching its result" + ) for buf, control_buf in zip(not_garbage, control): # make sure none of our mutations above affected the @@ -715,121 +739,115 @@ def decompress(ob): def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_compression_warns_when_decompress_caches( - monkeypatch, 'zlib') + pytest.skip("no zlib") + self._test_compression_warns_when_decompress_caches(monkeypatch, "zlib") def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_compression_warns_when_decompress_caches( - monkeypatch, 'blosc') + pytest.skip("no blosc") + self._test_compression_warns_when_decompress_caches(monkeypatch, "blosc") def _test_small_strings_no_warn(self, compress): - empty = np.array([], dtype='uint8') + empty = np.array([], dtype="uint8") with tm.assert_produces_warning(None): with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) empty_unpacked = self.encode_decode(empty, compress=compress) tm.assert_numpy_array_equal(empty_unpacked, empty) assert empty_unpacked.flags.writeable - char = np.array([ord(b'a')], dtype='uint8') + char = np.array([ord(b"a")], dtype="uint8") with tm.assert_produces_warning(None): with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) char_unpacked = self.encode_decode(char, compress=compress) tm.assert_numpy_array_equal(char_unpacked, char) assert char_unpacked.flags.writeable # if this test fails I am sorry because the interpreter is now in a # bad state where b'a' points to 98 == ord(b'b'). - char_unpacked[0] = ord(b'b') + char_unpacked[0] = ord(b"b") # we compare the ord of bytes b'a' with unicode 'a' because the should # always be the same (unless we were able to mutate the shared # character singleton in which case ord(b'a') == ord(b'b'). 
- assert ord(b'a') == ord('a') - tm.assert_numpy_array_equal( - char_unpacked, - np.array([ord(b'b')], dtype='uint8'), - ) + assert ord(b"a") == ord("a") + tm.assert_numpy_array_equal(char_unpacked, np.array([ord(b"b")], dtype="uint8")) def test_small_strings_no_warn_zlib(self): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_small_strings_no_warn('zlib') + pytest.skip("no zlib") + self._test_small_strings_no_warn("zlib") def test_small_strings_no_warn_blosc(self): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_small_strings_no_warn('blosc') + pytest.skip("no blosc") + self._test_small_strings_no_warn("blosc") def test_readonly_axis_blosc(self): # GH11880 if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - df1 = DataFrame({'A': list('abcd')}) - df2 = DataFrame(df1, index=[1., 2., 3., 4.]) - assert 1 in self.encode_decode(df1['A'], compress='blosc') - assert 1. in self.encode_decode(df2['A'], compress='blosc') + pytest.skip("no blosc") + df1 = DataFrame({"A": list("abcd")}) + df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) + assert 1 in self.encode_decode(df1["A"], compress="blosc") + assert 1.0 in self.encode_decode(df2["A"], compress="blosc") def test_readonly_axis_zlib(self): # GH11880 - df1 = DataFrame({'A': list('abcd')}) - df2 = DataFrame(df1, index=[1., 2., 3., 4.]) - assert 1 in self.encode_decode(df1['A'], compress='zlib') - assert 1. in self.encode_decode(df2['A'], compress='zlib') + df1 = DataFrame({"A": list("abcd")}) + df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) + assert 1 in self.encode_decode(df1["A"], compress="zlib") + assert 1.0 in self.encode_decode(df2["A"], compress="zlib") def test_readonly_axis_blosc_to_sql(self): # GH11880 if not _BLOSC_INSTALLED: - pytest.skip('no blosc') + pytest.skip("no blosc") if not self._SQLALCHEMY_INSTALLED: - pytest.skip('no sqlalchemy') - expected = DataFrame({'A': list('abcd')}) - df = self.encode_decode(expected, compress='blosc') + pytest.skip("no sqlalchemy") + expected = DataFrame({"A": list("abcd")}) + df = self.encode_decode(expected, compress="blosc") eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql('test', eng, if_exists='append') - result = pandas.read_sql_table('test', eng, index_col='index') + df.to_sql("test", eng, if_exists="append") + result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) def test_readonly_axis_zlib_to_sql(self): # GH11880 if not _ZLIB_INSTALLED: - pytest.skip('no zlib') + pytest.skip("no zlib") if not self._SQLALCHEMY_INSTALLED: - pytest.skip('no sqlalchemy') - expected = DataFrame({'A': list('abcd')}) - df = self.encode_decode(expected, compress='zlib') + pytest.skip("no sqlalchemy") + expected = DataFrame({"A": list("abcd")}) + df = self.encode_decode(expected, compress="zlib") eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql('test', eng, if_exists='append') - result = pandas.read_sql_table('test', eng, index_col='index') + df.to_sql("test", eng, if_exists="append") + result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestEncoding(TestPackers): - def setup_method(self, method): super().setup_method(method) data = { - 'A': ['\u2019'] * 1000, - 'B': np.arange(1000, dtype=np.int32), - 'C': list(100 * 'abcdefghij'), - 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), - 'E': [datetime.timedelta(days=x) for x in 
range(1000)], - 'G': [400] * 1000 + "A": ["\u2019"] * 1000, + "B": np.arange(1000, dtype=np.int32), + "C": list(100 * "abcdefghij"), + "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), + "E": [datetime.timedelta(days=x) for x in range(1000)], + "G": [400] * 1000, } self.frame = { - 'float': DataFrame({k: data[k] for k in ['A', 'A']}), - 'int': DataFrame({k: data[k] for k in ['B', 'B']}), - 'mixed': DataFrame(data), + "float": DataFrame({k: data[k] for k in ["A", "A"]}), + "int": DataFrame({k: data[k] for k in ["B", "B"]}), + "mixed": DataFrame(data), } - self.utf_encodings = ['utf8', 'utf16', 'utf32'] + self.utf_encodings = ["utf8", "utf16", "utf32"] def test_utf(self): # GH10581 @@ -841,14 +859,15 @@ def test_utf(self): def test_default_encoding(self): for frame in self.frame.values(): result = frame.to_msgpack() - expected = frame.to_msgpack(encoding='utf8') + expected = frame.to_msgpack(encoding="utf8") assert result == expected result = self.encode_decode(frame) assert_frame_equal(result, frame) -files = glob.glob(os.path.join(os.path.dirname(__file__), "data", - "legacy_msgpack", "*", "*.msgpack")) +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_msgpack", "*", "*.msgpack") +) @pytest.fixture(params=files) @@ -869,11 +888,12 @@ class TestMsgpack: 3. Move the created pickle to "data/legacy_msgpack/" directory. """ - minimum_structure = {'series': ['float', 'int', 'mixed', - 'ts', 'mi', 'dup'], - 'frame': ['float', 'int', 'mixed', 'mi'], - 'index': ['int', 'date', 'period'], - 'mi': ['reg2']} + minimum_structure = { + "series": ["float", "int", "mixed", "ts", "mi", "dup"], + "frame": ["float", "int", "mixed", "mi"], + "index": ["int", "date", "period"], + "mi": ["reg2"], + } def check_min_structure(self, data, version): for typ, v in self.minimum_structure.items(): @@ -888,12 +908,13 @@ def compare(self, current_data, all_data, vf, version): self.check_min_structure(data, version) for typ, dv in data.items(): - assert typ in all_data, ('unpacked data contains ' - 'extra key "{0}"' - .format(typ)) + assert typ in all_data, "unpacked data contains " 'extra key "{0}"'.format( + typ + ) for dt, result in dv.items(): - assert dt in current_data[typ], ('data["{0}"] contains extra ' - 'key "{1}"'.format(typ, dt)) + assert ( + dt in current_data[typ] + ), 'data["{0}"] contains extra ' 'key "{1}"'.format(typ, dt) try: expected = current_data[typ][dt] except KeyError: @@ -916,21 +937,23 @@ def compare_series_dt_tz(self, result, expected, typ, version): def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): tm.assert_frame_equal(result, expected) - def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - legacy_packer, datapath): + def test_msgpacks_legacy( + self, current_packers_data, all_packers_data, legacy_packer, datapath + ): version = os.path.basename(os.path.dirname(legacy_packer)) try: with catch_warnings(record=True): - self.compare(current_packers_data, all_packers_data, - legacy_packer, version) + self.compare( + current_packers_data, all_packers_data, legacy_packer, version + ) except ImportError: # blosc not installed pass def test_msgpack_period_freq(self): # https://github.com/pandas-dev/pandas/issues/24135 - s = Series(np.random.rand(5), index=date_range('20130101', periods=5)) + s = Series(np.random.rand(5), index=date_range("20130101", periods=5)) r = read_msgpack(s.to_msgpack()) repr(r) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f5f8dac71d095f..f3e045be2e790f 
100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -12,31 +12,45 @@ from pandas.util import testing as tm from pandas.io.parquet import ( - FastParquetImpl, PyArrowImpl, get_engine, read_parquet, to_parquet) + FastParquetImpl, + PyArrowImpl, + get_engine, + read_parquet, + to_parquet, +) try: import pyarrow # noqa + _HAVE_PYARROW = True except ImportError: _HAVE_PYARROW = False try: import fastparquet # noqa + _HAVE_FASTPARQUET = True except ImportError: _HAVE_FASTPARQUET = False # setup engines & skips -@pytest.fixture(params=[ - pytest.param('fastparquet', - marks=pytest.mark.skipif(not _HAVE_FASTPARQUET, - reason='fastparquet is ' - 'not installed')), - pytest.param('pyarrow', - marks=pytest.mark.skipif(not _HAVE_PYARROW, - reason='pyarrow is ' - 'not installed'))]) +@pytest.fixture( + params=[ + pytest.param( + "fastparquet", + marks=pytest.mark.skipif( + not _HAVE_FASTPARQUET, reason="fastparquet is " "not installed" + ), + ), + pytest.param( + "pyarrow", + marks=pytest.mark.skipif( + not _HAVE_PYARROW, reason="pyarrow is " "not installed" + ), + ), + ] +) def engine(request): return request.param @@ -45,59 +59,73 @@ def engine(request): def pa(): if not _HAVE_PYARROW: pytest.skip("pyarrow is not installed") - return 'pyarrow' + return "pyarrow" @pytest.fixture def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - return 'fastparquet' + return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'}) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) @pytest.fixture def df_cross_compat(): - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - # 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3), - # 'g': pd.date_range('20130101', periods=3, - # tz='US/Eastern'), - # 'h': pd.date_range('20130101', periods=3, freq='ns') - }) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + # 'c': np.arange(3, 6).astype('u1'), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + # 'g': pd.date_range('20130101', periods=3, + # tz='US/Eastern'), + # 'h': pd.date_range('20130101', periods=3, freq='ns') + } + ) return df @pytest.fixture def df_full(): return pd.DataFrame( - {'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': ['foo', 'bar', 'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'datetime': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')]}) - - -def check_round_trip(df, engine=None, path=None, - write_kwargs=None, read_kwargs=None, - expected=None, check_names=True, - repeat=2): + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "bytes": [b"foo", b"bar", b"baz"], + "unicode": ["foo", "bar", "baz"], + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "datetime": pd.date_range("20130101", periods=3), + "datetime_with_nat": [ + pd.Timestamp("20130101"), + pd.NaT, + 
pd.Timestamp("20130103"), + ], + } + ) + + +def check_round_trip( + df, + engine=None, + path=None, + write_kwargs=None, + read_kwargs=None, + expected=None, + check_names=True, + repeat=2, +): """Verify parquet serializer and deserializer produce the same results. Performs a pandas to disk and disk to pandas round trip, @@ -119,23 +147,22 @@ def check_round_trip(df, engine=None, path=None, How many times to repeat the test """ - write_kwargs = write_kwargs or {'compression': None} + write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} if expected is None: expected = df if engine: - write_kwargs['engine'] = engine - read_kwargs['engine'] = engine + write_kwargs["engine"] = engine + read_kwargs["engine"] = engine def compare(repeat): for _ in range(repeat): df.to_parquet(path, **write_kwargs) with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) - tm.assert_frame_equal(expected, actual, - check_names=check_names) + tm.assert_frame_equal(expected, actual, check_names=check_names) if path is None: with tm.ensure_clean() as path: @@ -146,48 +173,48 @@ def compare(repeat): def test_invalid_engine(df_compat): with pytest.raises(ValueError): - check_round_trip(df_compat, 'foo', 'bar') + check_round_trip(df_compat, "foo", "bar") def test_options_py(df_compat, pa): # use the set option - with pd.option_context('io.parquet.engine', 'pyarrow'): + with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) def test_options_fp(df_compat, fp): # use the set option - with pd.option_context('io.parquet.engine', 'fastparquet'): + with pd.option_context("io.parquet.engine", "fastparquet"): check_round_trip(df_compat) def test_options_auto(df_compat, fp, pa): # use the set option - with pd.option_context('io.parquet.engine', 'auto'): + with pd.option_context("io.parquet.engine", "auto"): check_round_trip(df_compat) def test_options_get_engine(fp, pa): - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'pyarrow'): - assert isinstance(get_engine('auto'), PyArrowImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "pyarrow"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'fastparquet'): - assert isinstance(get_engine('auto'), FastParquetImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "fastparquet"): + assert isinstance(get_engine("auto"), FastParquetImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'auto'): - assert isinstance(get_engine('auto'), PyArrowImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "auto"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert 
isinstance(get_engine("fastparquet"), FastParquetImpl) def test_cross_engine_pa_fp(df_cross_compat, pa, fp): @@ -200,8 +227,8 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=fp, columns=['a', 'd']) - tm.assert_frame_equal(result, df[['a', 'd']]) + result = read_parquet(path, engine=fp, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) def test_cross_engine_fp_pa(df_cross_compat, pa, fp): @@ -215,12 +242,11 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=pa, columns=['a', 'd']) - tm.assert_frame_equal(result, df[['a', 'd']]) + result = read_parquet(path, engine=pa, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) class Base: - def check_error_on_write(self, df, engine, exc): # check that we are raising the exception on writing with tm.ensure_clean() as path: @@ -229,68 +255,72 @@ def check_error_on_write(self, df, engine, exc): class TestBasic(Base): - def test_error(self, engine): - for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'), - np.array([1, 2, 3])]: + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: self.check_error_on_write(obj, engine, ValueError) def test_columns_dtypes(self, engine): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) # unicode - df.columns = ['foo', 'bar'] + df.columns = ["foo", "bar"] check_round_trip(df, engine) def test_columns_dtypes_invalid(self, engine): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) # numeric df.columns = [0, 1] self.check_error_on_write(df, engine, ValueError) # bytes - df.columns = [b'foo', b'bar'] + df.columns = [b"foo", b"bar"] self.check_error_on_write(df, engine, ValueError) # python object - df.columns = [datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1)] + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] self.check_error_on_write(df, engine, ValueError) - @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli']) + @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): - if compression == 'snappy': - pytest.importorskip('snappy') + if compression == "snappy": + pytest.importorskip("snappy") - elif compression == 'brotli': - pytest.importorskip('brotli') + elif compression == "brotli": + pytest.importorskip("brotli") - df = pd.DataFrame({'A': [1, 2, 3]}) - check_round_trip(df, engine, write_kwargs={'compression': compression}) + df = pd.DataFrame({"A": [1, 2, 3]}) + check_round_trip(df, engine, write_kwargs={"compression": compression}) def test_read_columns(self, engine): # GH18154 - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - expected = pd.DataFrame({'string': list('abc')}) - check_round_trip(df, engine, expected=expected, - read_kwargs={'columns': ['string']}) + expected = pd.DataFrame({"string": list("abc")}) + check_round_trip( + df, engine, expected=expected, read_kwargs={"columns": ["string"]} + ) def test_write_index(self, engine): - 
check_names = engine != 'fastparquet' + check_names = engine != "fastparquet" - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine) indexes = [ [2, 3, 4], - pd.date_range('20130101', periods=3), - list('abc'), + pd.date_range("20130101", periods=3), + list("abc"), [1, 3, 4], ] # non-default index @@ -300,122 +330,118 @@ def test_write_index(self, engine): # index with meta-data df.index = [0, 1, 2] - df.index.name = 'foo' + df.index.name = "foo" check_round_trip(df, engine) def test_write_multiindex(self, pa): # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa - df = pd.DataFrame({'A': [1, 2, 3]}) - index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]) + df = pd.DataFrame({"A": [1, 2, 3]}) + index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df.index = index check_round_trip(df, engine) def test_write_column_multiindex(self, engine): # column multi-index - mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]) + mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) self.check_error_on_write(df, engine, ValueError) def test_multiindex_with_columns(self, pa): engine = pa - dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS') - df = pd.DataFrame(np.random.randn(2 * len(dates), 3), - columns=list('ABC')) + dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") + df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC")) index1 = pd.MultiIndex.from_product( - [['Level1', 'Level2'], dates], - names=['level', 'date']) + [["Level1", "Level2"], dates], names=["level", "date"] + ) index2 = index1.copy(names=None) for index in [index1, index2]: df.index = index check_round_trip(df, engine) - check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']}, - expected=df[['A', 'B']]) + check_round_trip( + df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]] + ) def test_write_ignoring_index(self, engine): # ENH 20768 # Ensure index=False omits the index from the written Parquet file. - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]}) - write_kwargs = { - 'compression': None, - 'index': False, - } + write_kwargs = {"compression": None, "index": False} # Because we're dropping the index, we expect the loaded dataframe to # have the default integer index. expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) # Ignore custom index - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, - index=['zyx', 'wvu', 'tsr']) + df = pd.DataFrame( + {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"] + ) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) # Ignore multi-indexes as well. 
- arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - df = pd.DataFrame({'one': [i for i in range(8)], - 'two': [-i for i in range(8)]}, index=arrays) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame( + {"one": [i for i in range(8)], "two": [-i for i in range(8)]}, index=arrays + ) expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) class TestParquetPyArrow(Base): - def test_basic(self, pa, df_full): df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') - df['bool_with_none'] = [True, None, True] + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") + df["bool_with_none"] = [True, None, True] check_round_trip(df, pa) # TODO: This doesn't fail on all systems; track down which - @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)", - strict=False) + @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)", strict=False) def test_basic_subset_columns(self, pa, df_full): # GH18628 df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") - check_round_trip(df, pa, expected=df[['string', 'int']], - read_kwargs={'columns': ['string', 'int']}) + check_round_trip( + df, + pa, + expected=df[["string", "int"]], + read_kwargs={"columns": ["string", "int"]}, + ) def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid self.check_error_on_write(df, pa, Exception) # timedelta - df = pd.DataFrame({'a': pd.timedelta_range('1 day', - periods=3)}) + df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) self.check_error_on_write(df, pa, NotImplementedError) # mixed python objects - df = pd.DataFrame({'a': ['a', 1, 2.0]}) + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid self.check_error_on_write(df, pa, Exception) @@ -423,7 +449,7 @@ def test_unsupported(self, pa): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) + df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) # de-serialized as object expected = df.assign(a=df.a.astype(object)) @@ -431,104 +457,112 @@ def test_categorical(self, pa): def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 - check_round_trip(df_compat, pa, - path='s3://pandas-test/pyarrow.parquet') + check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") def test_partition_cols_supported(self, pa, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as 
path: - df.to_parquet(path, partition_cols=partition_cols, - compression=None) + df.to_parquet(path, partition_cols=partition_cols, compression=None) import pyarrow.parquet as pq + dataset = pq.ParquetDataset(path, validate_schema=False) assert len(dataset.partitions.partition_names) == 2 assert dataset.partitions.partition_names == set(partition_cols) class TestParquetFastParquet(Base): - - @td.skip_if_no('fastparquet', min_version="0.2.1") + @td.skip_if_no("fastparquet", min_version="0.2.1") def test_basic(self, fp, df_full): df = df_full - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='US/Eastern') - df['timedelta'] = pd.timedelta_range('1 day', periods=3) + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="US/Eastern") + df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) @pytest.mark.skip(reason="not supported") def test_duplicate_columns(self, fp): # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, fp, ValueError) def test_bool_with_none(self, fp): - df = pd.DataFrame({'a': [True, None, False]}) - expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16') + df = pd.DataFrame({"a": [True, None, False]}) + expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") check_round_trip(df, fp, expected=expected) def test_unsupported(self, fp): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) self.check_error_on_write(df, fp, ValueError) # mixed - df = pd.DataFrame({'a': ['a', 1, 2.0]}) + df = pd.DataFrame({"a": ["a", 1, 2.0]}) self.check_error_on_write(df, fp, ValueError) def test_categorical(self, fp): - df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) + df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {'a': list(range(0, 3))} + d = {"a": list(range(0, 3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: - df.to_parquet(path, fp, compression=None, - row_group_offsets=1) - result = read_parquet(path, fp, filters=[('a', '==', 0)]) + df.to_parquet(path, fp, compression=None, row_group_offsets=1) + result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 def test_s3_roundtrip(self, df_compat, s3_resource, fp): # GH #19134 - check_round_trip(df_compat, fp, - path='s3://pandas-test/fastparquet.parquet') + check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet") def test_partition_cols_supported(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", - partition_cols=partition_cols, compression=None) + df.to_parquet( + path, + engine="fastparquet", + partition_cols=partition_cols, + compression=None, + ) assert os.path.exists(path) import fastparquet # noqa: F811 + actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 def test_partition_on_supported(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", compression=None, - partition_on=partition_cols) + df.to_parquet( + path, + 
engine="fastparquet", + compression=None, + partition_on=partition_cols, + ) assert os.path.exists(path) import fastparquet # noqa: F811 + actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with pytest.raises(ValueError): with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", compression=None, - partition_on=partition_cols, - partition_cols=partition_cols) + df.to_parquet( + path, + engine="fastparquet", + compression=None, + partition_on=partition_cols, + partition_cols=partition_cols, + ) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index eb912908d28f49..7aba2a3677f84c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -31,11 +31,11 @@ from pandas.tseries.offsets import Day, MonthEnd -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def current_pickle_data(): # our current version pickle data - from pandas.tests.io.generate_legacy_storage_files import ( - create_pickle_data) + from pandas.tests.io.generate_legacy_storage_files import create_pickle_data + return create_pickle_data() @@ -47,18 +47,17 @@ def compare_element(result, expected, typ, version=None): tm.assert_index_equal(expected, result) return - if typ.startswith('sp_'): + if typ.startswith("sp_"): comparator = getattr(tm, "assert_%s_equal" % typ) comparator(result, expected, exact_indices=False) - elif typ == 'timestamp': + elif typ == "timestamp": if expected is pd.NaT: assert result is pd.NaT else: assert result == expected assert result.freq == expected.freq else: - comparator = getattr(tm, "assert_%s_equal" % - typ, tm.assert_almost_equal) + comparator = getattr(tm, "assert_%s_equal" % typ, tm.assert_almost_equal) comparator(result, expected) @@ -75,7 +74,7 @@ def compare(data, vf, version): # if available comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) - comparator = m.get(comparator, m['compare_element']) + comparator = m.get(comparator, m["compare_element"]) comparator(result, expected, typ, version) return data @@ -128,7 +127,7 @@ def compare_index_period(result, expected, typ, version): tm.assert_index_equal(result, expected) assert isinstance(result.freq, MonthEnd) assert result.freq == MonthEnd() - assert result.freqstr == 'M' + assert result.freqstr == "M" tm.assert_index_equal(result.shift(2), expected.shift(2)) @@ -136,8 +135,9 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) -files = glob.glob(os.path.join(os.path.dirname(__file__), "data", - "legacy_pickle", "*", "*.pickle")) +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") +) @pytest.fixture(params=files) @@ -161,13 +161,12 @@ def test_pickles(current_pickle_data, legacy_pickle): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_round_trip_current(current_pickle_data): - def python_pickler(obj, path): - with open(path, 'wb') as fh: + with open(path, "wb") as fh: pickle.dump(obj, fh, protocol=-1) def python_unpickler(path): - with open(path, 'rb') as fh: + with open(path, "rb") as fh: fh.seek(0) return pickle.load(fh) @@ -194,9 +193,10 @@ def python_unpickler(path): def test_pickle_v0_14_1(datapath): - cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, - categories=['a', 'b', 'c', 'd']) - 
pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') + cat = pd.Categorical( + values=["a", "b", "c"], ordered=False, categories=["a", "b", "c", "d"] + ) + pickle_path = datapath("io", "data", "categorical_0_14_1.pickle") # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -210,9 +210,10 @@ def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 - cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, - categories=['a', 'b', 'c', 'd']) - pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') + cat = pd.Categorical( + values=["a", "b", "c"], ordered=False, categories=["a", "b", "c", "d"] + ) + pickle_path = datapath("io", "data", "categorical_0_15_2.pickle") # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -238,19 +239,20 @@ def test_pickle_path_localpath(): # test pickle compression # --------------------- + @pytest.fixture def get_random_path(): - return '__%s__.pickle' % tm.rands(10) + return "__%s__.pickle" % tm.rands(10) class TestCompression: _compression_to_extension = { None: ".none", - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', + "gzip": ".gz", + "bz2": ".bz2", + "zip": ".zip", + "xz": ".xz", } def compress_file(self, src_path, dest_path, compression): @@ -258,18 +260,17 @@ def compress_file(self, src_path, dest_path, compression): shutil.copyfile(src_path, dest_path) return - if compression == 'gzip': + if compression == "gzip": f = gzip.open(dest_path, "w") - elif compression == 'bz2': + elif compression == "bz2": f = bz2.BZ2File(dest_path, "w") - elif compression == 'zip': - with zipfile.ZipFile(dest_path, "w", - compression=zipfile.ZIP_DEFLATED) as f: + elif compression == "zip": + with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) - elif compression == 'xz': + elif compression == "xz": f = lzma.LZMAFile(dest_path, "w") else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) if compression != "zip": @@ -297,14 +298,14 @@ def test_write_explicit(self, compression, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) + @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"]) def test_write_explicit_bad(self, compression, get_random_path): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean(get_random_path) as path: df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.no_compress', '.xz']) + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".no_compress", ".xz"]) def test_write_infer(self, ext, get_random_path): base = get_random_path path1 = base + ext @@ -350,8 +351,7 @@ def test_read_explicit(self, compression, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize('ext', [ - '', '.gz', '.bz2', '.zip', '.no_compress', '.xz']) + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".zip", ".no_compress", ".xz"]) def test_read_infer(self, ext, get_random_path): base = get_random_path path1 = base + ".raw" @@ -381,9 +381,9 @@ def test_read_infer(self, ext, get_random_path): # test pickle compression # --------------------- -class TestProtocol: - 
@pytest.mark.parametrize('protocol', [-1, 0, 1, 2]) +class TestProtocol: + @pytest.mark.parametrize("protocol", [-1, 0, 1, 2]) def test_read(self, protocol, get_random_path): with tm.ensure_clean(get_random_path) as path: df = tm.makeDataFrame() diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 23075db2b38ce2..04c6979596ecab 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -8,7 +8,6 @@ class TestS3URL: - def test_is_s3_url(self): assert is_s3_url("s3://pandas/somethingelse.com") assert not is_s3_url("s4://pandas/somethingelse.com") @@ -17,13 +16,10 @@ def test_is_s3_url(self): def test_streaming_s3_objects(): # GH17135 # botocore gained iteration support in 1.10.47, can now be used in read_* - pytest.importorskip('botocore', minversion='1.10.47') + pytest.importorskip("botocore", minversion="1.10.47") from botocore.response import StreamingBody - data = [ - b'foo,bar,baz\n1,2,3\n4,5,6\n', - b'just,the,header\n', - ] + data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] for el in data: body = StreamingBody(BytesIO(el), content_length=len(el)) read_csv(body) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index b9f58f9bf6cf65..b7a62b7ba431b6 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -54,10 +54,9 @@ def test_spss_umlauts(datapath): fname = datapath("io", "data", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) - expected = pd.DataFrame({"var1": ["the ä umlaut", - "the ü umlaut", - "the ä umlaut", - "the ö umlaut"]}) + expected = pd.DataFrame( + {"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", "the ö umlaut"]} + ) expected["var1"] = pd.Categorical(expected["var1"]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b053afa4dd7d5d..4fc90ea41718dd 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -28,13 +28,21 @@ from pandas.compat import PY36 -from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.common import is_datetime64_dtype, is_datetime64tz_dtype import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, concat, date_range, isna, - to_datetime, to_timedelta) + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + to_datetime, + to_timedelta, +) import pandas.util.testing as tm import pandas.io.sql as sql @@ -46,41 +54,42 @@ import sqlalchemy.sql.sqltypes as sqltypes from sqlalchemy.ext import declarative from sqlalchemy.orm import session as sa_session + SQLALCHEMY_INSTALLED = True except ImportError: SQLALCHEMY_INSTALLED = False SQL_STRINGS = { - 'create_iris': { - 'sqlite': """CREATE TABLE iris ( + "create_iris": { + "sqlite": """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, "PetalLength" REAL, "PetalWidth" REAL, "Name" TEXT )""", - 'mysql': """CREATE TABLE iris ( + "mysql": """CREATE TABLE iris ( `SepalLength` DOUBLE, `SepalWidth` DOUBLE, `PetalLength` DOUBLE, `PetalWidth` DOUBLE, `Name` VARCHAR(200) )""", - 'postgresql': """CREATE TABLE iris ( + "postgresql": """CREATE TABLE iris ( "SepalLength" DOUBLE PRECISION, "SepalWidth" DOUBLE PRECISION, "PetalLength" DOUBLE PRECISION, "PetalWidth" DOUBLE PRECISION, "Name" VARCHAR(200) - )""" + )""", }, - 'insert_iris': { - 'sqlite': """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", - 'mysql': """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", - 'postgresql': """INSERT 
INTO iris VALUES(%s, %s, %s, %s, %s);""" + "insert_iris": { + "sqlite": """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", + "mysql": """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", + "postgresql": """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);""", }, - 'create_test_types': { - 'sqlite': """CREATE TABLE types_test_data ( + "create_test_types": { + "sqlite": """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TEXT, "IntDateCol" INTEGER, @@ -91,7 +100,7 @@ "IntColWithNull" INTEGER, "BoolColWithNull" INTEGER )""", - 'mysql': """CREATE TABLE types_test_data ( + "mysql": """CREATE TABLE types_test_data ( `TextCol` TEXT, `DateCol` DATETIME, `IntDateCol` INTEGER, @@ -102,7 +111,7 @@ `IntColWithNull` INTEGER, `BoolColWithNull` BOOLEAN )""", - 'postgresql': """CREATE TABLE types_test_data ( + "postgresql": """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TIMESTAMP, "DateColWithTz" TIMESTAMP WITH TIME ZONE, @@ -113,95 +122,112 @@ "BoolCol" BOOLEAN, "IntColWithNull" INTEGER, "BoolColWithNull" BOOLEAN - )""" + )""", }, - 'insert_test_types': { - 'sqlite': { - 'query': """ + "insert_test_types": { + "sqlite": { + "query": """ INSERT INTO types_test_data VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) """, - 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', - 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', - 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, - 'mysql': { - 'query': """ + "mysql": { + "query": """ INSERT INTO types_test_data VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s) """, - 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', - 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', - 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, - 'postgresql': { - 'query': """ + "postgresql": { + "query": """ INSERT INTO types_test_data VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, - 'fields': ( - 'TextCol', 'DateCol', 'DateColWithTz', - 'IntDateCol', 'IntDateOnlyCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "DateColWithTz", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, }, - 'read_parameters': { - 'sqlite': "SELECT * FROM iris WHERE Name=? AND SepalLength=?", - 'mysql': 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', - 'postgresql': 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s' + "read_parameters": { + "sqlite": "SELECT * FROM iris WHERE Name=? 
AND SepalLength=?", + "mysql": 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', + "postgresql": 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s', }, - 'read_named_parameters': { - 'sqlite': """ + "read_named_parameters": { + "sqlite": """ SELECT * FROM iris WHERE Name=:name AND SepalLength=:length """, - 'mysql': """ + "mysql": """ SELECT * FROM iris WHERE `Name`="%(name)s" AND `SepalLength`=%(length)s """, - 'postgresql': """ + "postgresql": """ SELECT * FROM iris WHERE "Name"=%(name)s AND "SepalLength"=%(length)s - """ + """, }, - 'create_view': { - 'sqlite': """ + "create_view": { + "sqlite": """ CREATE VIEW iris_view AS SELECT * FROM iris """ - } + }, } class MixInBase: - def teardown_method(self, method): # if setup fails, there may not be a connection to close. - if hasattr(self, 'conn'): + if hasattr(self, "conn"): for tbl in self._get_all_tables(): self.drop_table(tbl) self._close_conn() class MySQLMixIn(MixInBase): - def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute("DROP TABLE IF EXISTS %s" % - sql._get_valid_mysql_name(table_name)) + cur.execute("DROP TABLE IF EXISTS %s" % sql._get_valid_mysql_name(table_name)) self.conn.commit() def _get_all_tables(self): cur = self.conn.cursor() - cur.execute('SHOW TABLES') + cur.execute("SHOW TABLES") return [table[0] for table in cur.fetchall()] def _close_conn(self): from pymysql.err import Error + try: self.conn.close() except Error: @@ -209,15 +235,14 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): - def drop_table(self, table_name): - self.conn.execute("DROP TABLE IF EXISTS %s" % - sql._get_valid_sqlite_name(table_name)) + self.conn.execute( + "DROP TABLE IF EXISTS %s" % sql._get_valid_sqlite_name(table_name) + ) self.conn.commit() def _get_all_tables(self): - c = self.conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'") + c = self.conn.execute("SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] def _close_conn(self): @@ -225,7 +250,6 @@ def _close_conn(self): class SQLAlchemyMixIn(MixInBase): - def drop_table(self, table_name): sql.SQLDatabase(self.conn).drop_table(table_name) @@ -246,116 +270,144 @@ class PandasSQLTest: """ def _get_exec(self): - if hasattr(self.conn, 'execute'): + if hasattr(self.conn, "execute"): return self.conn else: return self.conn.cursor() - @pytest.fixture(params=[('io', 'data', 'iris.csv')]) + @pytest.fixture(params=[("io", "data", "iris.csv")]) def load_iris_data(self, datapath, request): import io + iris_csv_file = datapath(*request.param) - if not hasattr(self, 'conn'): + if not hasattr(self, "conn"): self.setup_connect() - self.drop_table('iris') - self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) + self.drop_table("iris") + self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor]) - with io.open(iris_csv_file, mode='r', newline=None) as iris_csv: + with io.open(iris_csv_file, mode="r", newline=None) as iris_csv: r = csv.reader(iris_csv) next(r) # skip header row - ins = SQL_STRINGS['insert_iris'][self.flavor] + ins = SQL_STRINGS["insert_iris"][self.flavor] for row in r: self._get_exec().execute(ins, row) def _load_iris_view(self): - self.drop_table('iris_view') - self._get_exec().execute(SQL_STRINGS['create_view'][self.flavor]) + self.drop_table("iris_view") + self._get_exec().execute(SQL_STRINGS["create_view"][self.flavor]) def _check_iris_loaded_frame(self, iris_frame): pytype = iris_frame.dtypes[0].type row = iris_frame.iloc[0] assert issubclass(pytype, np.floating) - 
tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def _load_test1_data(self): - columns = ['index', 'A', 'B', 'C', 'D'] - data = [( - '2000-01-03 00:00:00', 0.980268513777, 3.68573087906, - -0.364216805298, -1.15973806169), - ('2000-01-04 00:00:00', 1.04791624281, - - 0.0412318367011, -0.16181208307, 0.212549316967), - ('2000-01-05 00:00:00', 0.498580885705, - 0.731167677815, -0.537677223318, 1.34627041952), - ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543, - 0.00364077397681, 0.67525259227)] + columns = ["index", "A", "B", "C", "D"] + data = [ + ( + "2000-01-03 00:00:00", + 0.980268513777, + 3.68573087906, + -0.364216805298, + -1.15973806169, + ), + ( + "2000-01-04 00:00:00", + 1.04791624281, + -0.0412318367011, + -0.16181208307, + 0.212549316967, + ), + ( + "2000-01-05 00:00:00", + 0.498580885705, + 0.731167677815, + -0.537677223318, + 1.34627041952, + ), + ( + "2000-01-06 00:00:00", + 1.12020151869, + 1.56762092543, + 0.00364077397681, + 0.67525259227, + ), + ] self.test_frame1 = DataFrame(data, columns=columns) def _load_test2_data(self): - df = DataFrame(dict(A=[4, 1, 3, 6], - B=['asd', 'gsq', 'ylt', 'jkl'], - C=[1.1, 3.1, 6.9, 5.3], - D=[False, True, True, False], - E=['1990-11-22', '1991-10-26', - '1993-11-26', '1995-12-12'])) - df['E'] = to_datetime(df['E']) + df = DataFrame( + dict( + A=[4, 1, 3, 6], + B=["asd", "gsq", "ylt", "jkl"], + C=[1.1, 3.1, 6.9, 5.3], + D=[False, True, True, False], + E=["1990-11-22", "1991-10-26", "1993-11-26", "1995-12-12"], + ) + ) + df["E"] = to_datetime(df["E"]) self.test_frame2 = df def _load_test3_data(self): - columns = ['index', 'A', 'B'] - data = [( - '2000-01-03 00:00:00', 2 ** 31 - 1, -1.987670), - ('2000-01-04 00:00:00', -29, -0.0412318367011), - ('2000-01-05 00:00:00', 20000, 0.731167677815), - ('2000-01-06 00:00:00', -290867, 1.56762092543)] + columns = ["index", "A", "B"] + data = [ + ("2000-01-03 00:00:00", 2 ** 31 - 1, -1.987670), + ("2000-01-04 00:00:00", -29, -0.0412318367011), + ("2000-01-05 00:00:00", 20000, 0.731167677815), + ("2000-01-06 00:00:00", -290867, 1.56762092543), + ] self.test_frame3 = DataFrame(data, columns=columns) def _load_raw_sql(self): - self.drop_table('types_test_data') - self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) - ins = SQL_STRINGS['insert_test_types'][self.flavor] + self.drop_table("types_test_data") + self._get_exec().execute(SQL_STRINGS["create_test_types"][self.flavor]) + ins = SQL_STRINGS["insert_test_types"][self.flavor] data = [ { - 'TextCol': 'first', - 'DateCol': '2000-01-03 00:00:00', - 'DateColWithTz': '2000-01-01 00:00:00-08:00', - 'IntDateCol': 535852800, - 'IntDateOnlyCol': 20101010, - 'FloatCol': 10.10, - 'IntCol': 1, - 'BoolCol': False, - 'IntColWithNull': 1, - 'BoolColWithNull': False, + "TextCol": "first", + "DateCol": "2000-01-03 00:00:00", + "DateColWithTz": "2000-01-01 00:00:00-08:00", + "IntDateCol": 535852800, + "IntDateOnlyCol": 20101010, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": 1, + "BoolColWithNull": False, }, { - 'TextCol': 'first', - 'DateCol': '2000-01-04 00:00:00', - 'DateColWithTz': '2000-06-01 00:00:00-07:00', - 'IntDateCol': 1356998400, - 'IntDateOnlyCol': 20101212, - 'FloatCol': 10.10, - 'IntCol': 1, - 'BoolCol': False, - 'IntColWithNull': None, - 'BoolColWithNull': None, + "TextCol": "first", + "DateCol": "2000-01-04 00:00:00", + "DateColWithTz": "2000-06-01 00:00:00-07:00", + "IntDateCol": 1356998400, + "IntDateOnlyCol": 
20101212, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": None, + "BoolColWithNull": None, }, ] for d in data: self._get_exec().execute( - ins['query'], - [d[field] for field in ins['fields']] + ins["query"], [d[field] for field in ins["fields"]] ) def _count_rows(self, table_name): - result = self._get_exec().execute( - "SELECT count(*) AS count_1 FROM %s" % table_name).fetchone() + result = ( + self._get_exec() + .execute("SELECT count(*) AS count_1 FROM %s" % table_name) + .fetchone() + ) return result[0] def _read_sql_iris(self): @@ -363,81 +415,75 @@ def _read_sql_iris(self): self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_parameter(self): - query = SQL_STRINGS['read_parameters'][self.flavor] - params = ['Iris-setosa', 5.1] + query = SQL_STRINGS["read_parameters"][self.flavor] + params = ["Iris-setosa", 5.1] iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_named_parameter(self): - query = SQL_STRINGS['read_named_parameters'][self.flavor] - params = {'name': 'Iris-setosa', 'length': 5.1} + query = SQL_STRINGS["read_named_parameters"][self.flavor] + params = {"name": "Iris-setosa", "length": 5.1} iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _to_sql(self, method=None): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=method) - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=method) + assert self.pandasSQL.has_table("test_frame1") num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_empty(self): - self.drop_table('test_frame1') - self.pandasSQL.to_sql(self.test_frame1.iloc[:0], 'test_frame1') + self.drop_table("test_frame1") + self.pandasSQL.to_sql(self.test_frame1.iloc[:0], "test_frame1") def _to_sql_fail(self): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + assert self.pandasSQL.has_table("test_frame1") msg = "Table 'test_frame1' already exists" with pytest.raises(ValueError, match=msg): - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_replace(self): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='replace') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="replace") + assert self.pandasSQL.has_table("test_frame1") num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_append(self): # 
Nuke table just in case - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='append') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="append") + assert self.pandasSQL.has_table("test_frame1") num_entries = 2 * len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_method_callable(self): check = [] # used to double check function below is really being used @@ -446,25 +492,25 @@ def sample(pd_table, conn, keys, data_iter): check.append(1) data = [dict(zip(keys, row)) for row in data_iter] conn.execute(pd_table.table.insert(), data) - self.drop_table('test_frame1') - self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=sample) - assert self.pandasSQL.has_table('test_frame1') + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=sample) + assert self.pandasSQL.has_table("test_frame1") assert check == [1] num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _roundtrip(self): - self.drop_table('test_frame_roundtrip') - self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip') - result = self.pandasSQL.read_query( - 'SELECT * FROM test_frame_roundtrip') + self.drop_table("test_frame_roundtrip") + self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - result.set_index('level_0', inplace=True) + result.set_index("level_0", inplace=True) # result.index.astype(int) result.index.name = None @@ -475,14 +521,15 @@ def _execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done iris_results = self.pandasSQL.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def _to_sql_save_index(self): - df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], - columns=['A', 'B', 'C'], index=['A']) - self.pandasSQL.to_sql(df, 'test_to_sql_saves_index') - ix_cols = self._get_index_columns('test_to_sql_saves_index') - assert ix_cols == [['A', ], ] + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] + ) + self.pandasSQL.to_sql(df, "test_to_sql_saves_index") + ix_cols = self._get_index_columns("test_to_sql_saves_index") + assert ix_cols == [["A"]] def _transaction_test(self): self.pandasSQL.execute("CREATE TABLE test_trans (A INT, B TEXT)") @@ -493,23 +540,24 @@ def _transaction_test(self): try: with self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - raise Exception('error') + raise Exception("error") except Exception: # ignore raised exception pass - res = self.pandasSQL.read_query('SELECT * FROM test_trans') + res = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res) == 0 # Make sure when transaction is committed, rows do get inserted with 
self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - res2 = self.pandasSQL.read_query('SELECT * FROM test_trans') + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res2) == 1 # ----------------------------------------------------------------------------- # -- Testing the public API + class _TestSQLApi(PandasSQLTest): """ @@ -527,7 +575,8 @@ class _TestSQLApi(PandasSQLTest): we don't use drop_table because that isn't part of the public api """ - flavor = 'sqlite' + + flavor = "sqlite" mode = None def setup_connect(self): @@ -545,95 +594,87 @@ def load_test_data_and_sql(self): self._load_raw_sql() def test_read_sql_iris(self): - iris_frame = sql.read_sql_query( - "SELECT * FROM iris", self.conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris", self.conn) self._check_iris_loaded_frame(iris_frame) def test_read_sql_view(self): - iris_frame = sql.read_sql_query( - "SELECT * FROM iris_view", self.conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) self._check_iris_loaded_frame(iris_frame) def test_to_sql(self): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn) - assert sql.has_table('test_frame1', self.conn) + sql.to_sql(self.test_frame1, "test_frame1", self.conn) + assert sql.has_table("test_frame1", self.conn) def test_to_sql_fail(self): - sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, if_exists='fail') - assert sql.has_table('test_frame2', self.conn) + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + assert sql.has_table("test_frame2", self.conn) msg = "Table 'test_frame2' already exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") def test_to_sql_replace(self): - sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, if_exists='replace') - assert sql.has_table('test_frame3', self.conn) + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="replace") + assert sql.has_table("test_frame3", self.conn) num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame3') + num_rows = self._count_rows("test_frame3") assert num_rows == num_entries def test_to_sql_append(self): - sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, if_exists='append') - assert sql.has_table('test_frame4', self.conn) + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="append") + assert sql.has_table("test_frame4", self.conn) num_entries = 2 * len(self.test_frame1) - num_rows = self._count_rows('test_frame4') + num_rows = self._count_rows("test_frame4") assert num_rows == num_entries def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False) + sql.to_sql(self.test_frame3, "test_frame5", self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) tm.assert_frame_equal(self.test_frame3, result) def test_to_sql_series(self): - s = Series(np.arange(5, dtype='int64'), name='series') + s = Series(np.arange(5, dtype="int64"), name="series") sql.to_sql(s, "test_series", self.conn, index=False) s2 = 
sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) def test_roundtrip(self): - sql.to_sql(self.test_frame1, 'test_frame_roundtrip', - con=self.conn) - result = sql.read_sql_query( - 'SELECT * FROM test_frame_roundtrip', - con=self.conn) + sql.to_sql(self.test_frame1, "test_frame_roundtrip", con=self.conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) # HACK! result.index = self.test_frame1.index - result.set_index('level_0', inplace=True) + result.set_index("level_0", inplace=True) result.index.astype(int) result.index.name = None tm.assert_frame_equal(result, self.test_frame1) def test_roundtrip_chunksize(self): - sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, chunksize=2) - result = sql.read_sql_query( - 'SELECT * FROM test_frame_roundtrip', - con=self.conn) + sql.to_sql( + self.test_frame1, + "test_frame_roundtrip", + con=self.conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) tm.assert_frame_equal(result, self.test_frame1) def test_execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done iris_results = sql.execute("SELECT * FROM iris", con=self.conn) row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def test_date_parsing(self): # Test date parsing in read_sql @@ -641,52 +682,64 @@ def test_date_parsing(self): df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn) assert not issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates=['DateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["DateCol"] + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) assert df.DateCol.tolist() == [ pd.Timestamp(2000, 1, 3, 0, 0, 0), - pd.Timestamp(2000, 1, 4, 0, 0, 0) + pd.Timestamp(2000, 1, 4, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) assert df.DateCol.tolist() == [ pd.Timestamp(2000, 1, 3, 0, 0, 0), - pd.Timestamp(2000, 1, 4, 0, 0, 0) + pd.Timestamp(2000, 1, 4, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates=['IntDateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["IntDateCol"] + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) assert df.IntDateCol.tolist() == [ pd.Timestamp(1986, 12, 25, 0, 0, 0), - pd.Timestamp(2013, 1, 1, 0, 0, 0) + pd.Timestamp(2013, 1, 1, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'IntDateCol': 's'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) assert df.IntDateCol.tolist() == [ pd.Timestamp(1986, 12, 25, 0, 0, 0), - pd.Timestamp(2013, 1, 1, 0, 0, 0) + pd.Timestamp(2013, 1, 1, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'IntDateOnlyCol': '%Y%m%d'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + 
parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) assert df.IntDateOnlyCol.tolist() == [ - pd.Timestamp('2010-10-10'), - pd.Timestamp('2010-12-12') + pd.Timestamp("2010-10-10"), + pd.Timestamp("2010-12-12"), ] def test_date_and_index(self): # Test case where same column appears in parse_date and index_col - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - index_col='DateCol', - parse_dates=['DateCol', 'IntDateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) assert issubclass(df.index.dtype.type, np.datetime64) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) @@ -694,120 +747,139 @@ def test_date_and_index(self): def test_timedelta(self): # see #6921 - df = to_timedelta( - Series(['00:00:01', '00:00:03'], name='foo')).to_frame() + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() with tm.assert_produces_warning(UserWarning): - df.to_sql('test_timedelta', self.conn) - result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn) - tm.assert_series_equal(result['foo'], df['foo'].astype('int64')) + df.to_sql("test_timedelta", self.conn) + result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) + tm.assert_series_equal(result["foo"], df["foo"].astype("int64")) def test_complex_raises(self): - df = DataFrame({'a': [1 + 1j, 2j]}) + df = DataFrame({"a": [1 + 1j, 2j]}) msg = "Complex datatypes not supported" with pytest.raises(ValueError, match=msg): - df.to_sql('test_complex', self.conn) - - @pytest.mark.parametrize("index_name,index_label,expected", [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ]) - def test_to_sql_index_label(self, index_name, - index_label, expected): - temp_frame = DataFrame({'col1': range(4)}) + df.to_sql("test_complex", self.conn) + + @pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), + # using the index name + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], + ) + def test_to_sql_index_label(self, index_name, index_label, expected): + temp_frame = DataFrame({"col1": range(4)}) temp_frame.index.name = index_name - query = 'SELECT * FROM test_index_label' - sql.to_sql(temp_frame, 'test_index_label', self.conn, - index_label=index_label) + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) frame = sql.read_sql_query(query, self.conn) assert frame.columns[0] == expected def test_to_sql_index_label_multiindex(self): - temp_frame = DataFrame({'col1': range(4)}, - index=MultiIndex.from_product( - [('A0', 'A1'), ('B0', 'B1')])) + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) # no index name, defaults to 'level_0' and 
'level_1' - sql.to_sql(temp_frame, 'test_index_label', self.conn) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == 'level_0' - assert frame.columns[1] == 'level_1' + sql.to_sql(temp_frame, "test_index_label", self.conn) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" # specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label=['A', 'B']) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['A', 'B'] + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["A", "B"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] # using the index name - temp_frame.index.names = ['A', 'B'] - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace') - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['A', 'B'] + temp_frame.index.names = ["A", "B"] + sql.to_sql(temp_frame, "test_index_label", self.conn, if_exists="replace") + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] # has index name, but specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label=['C', 'D']) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['C', 'D'] + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["C", "D"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["C", "D"] - msg = ("Length of 'index_label' should match number of levels, which" - " is 2") + msg = "Length of 'index_label' should match number of levels, which" " is 2" with pytest.raises(ValueError, match=msg): - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label='C') + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label="C", + ) def test_multiindex_roundtrip(self): - df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], - columns=['A', 'B', 'C'], index=['A', 'B']) + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) - df.to_sql('test_multiindex_roundtrip', self.conn) - result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip', - self.conn, index_col=['A', 'B']) + df.to_sql("test_multiindex_roundtrip", self.conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] + ) tm.assert_frame_equal(df, result, check_index_type=True) def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, - if_exists='replace') + sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn) - assert 'CREATE' in create_sql + create_sql = sql.get_schema(self.test_frame1, "test", con=self.conn) + assert "CREATE" in create_sql def test_get_schema_dtypes(self): - float_frame = DataFrame({'a': [1.1, 
1.2], 'b': [2.1, 2.2]}) - dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' - create_sql = sql.get_schema(float_frame, 'test', - con=self.conn, dtype={'b': dtype}) - assert 'CREATE' in create_sql - assert 'INTEGER' in create_sql + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) + dtype = sqlalchemy.Integer if self.mode == "sqlalchemy" else "INTEGER" + create_sql = sql.get_schema( + float_frame, "test", con=self.conn, dtype={"b": dtype} + ) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql def test_get_schema_keys(self): - frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) - create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1') + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' assert constraint_sentence in create_sql # multiple columns as key (GH10385) - create_sql = sql.get_schema(self.test_frame1, 'test', - con=self.conn, keys=['A', 'B']) + create_sql = sql.get_schema( + self.test_frame1, "test", con=self.conn, keys=["A", "B"] + ) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' assert constraint_sentence in create_sql def test_chunksize_read(self): - df = DataFrame(np.random.randn(22, 5), columns=list('abcde')) - df.to_sql('test_chunksize', self.conn, index=False) + df = DataFrame(np.random.randn(22, 5), columns=list("abcde")) + df.to_sql("test_chunksize", self.conn, index=False) # reading the query in one time res1 = sql.read_sql_query("select * from test_chunksize", self.conn) @@ -817,8 +889,9 @@ def test_chunksize_read(self): i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_query("select * from test_chunksize", - self.conn, chunksize=5): + for chunk in sql.read_sql_query( + "select * from test_chunksize", self.conn, chunksize=5 + ): res2 = concat([res2, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 @@ -826,13 +899,12 @@ def test_chunksize_read(self): tm.assert_frame_equal(res1, res2) # reading the query in chunks with read_sql_query - if self.mode == 'sqlalchemy': + if self.mode == "sqlalchemy": res3 = DataFrame() i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_table("test_chunksize", self.conn, - chunksize=5): + for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): res3 = concat([res3, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 @@ -843,35 +915,36 @@ def test_categorical(self): # GH8624 # test that categorical gets written correctly as dense column df = DataFrame( - {'person_id': [1, 2, 3], - 'person_name': ['John P. Doe', 'Jane Dove', 'John P. Doe']}) + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], + } + ) df2 = df.copy() - df2['person_name'] = df2['person_name'].astype('category') + df2["person_name"] = df2["person_name"].astype("category") - df2.to_sql('test_categorical', self.conn, index=False) - res = sql.read_sql_query('SELECT * FROM test_categorical', self.conn) + df2.to_sql("test_categorical", self.conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) tm.assert_frame_equal(res, df) def test_unicode_column_name(self): # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=['\xe9', 'b']) - df.to_sql('test_unicode', self.conn, index=False) + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql("test_unicode", self.conn, index=False) def test_escaped_table_name(self): # GH 13206 - df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) - df.to_sql('d1187b08-4943-4c8d-a7f6', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("d1187b08-4943-4c8d-a7f6", self.conn, index=False) - res = sql.read_sql_query('SELECT * FROM `d1187b08-4943-4c8d-a7f6`', - self.conn) + res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) tm.assert_frame_equal(res, df) @pytest.mark.single -@pytest.mark.skipif( - not SQLALCHEMY_INSTALLED, reason='SQLAlchemy not installed') +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): """ Test the public API as it would be used directly @@ -880,46 +953,44 @@ class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): sqlalchemy mode. """ - flavor = 'sqlite' - mode = 'sqlalchemy' + + flavor = "sqlite" + mode = "sqlalchemy" def connect(self): - return sqlalchemy.create_engine('sqlite:///:memory:') + return sqlalchemy.create_engine("sqlite:///:memory:") def test_read_table_columns(self): # test columns argument in read_table - sql.to_sql(self.test_frame1, 'test_frame', self.conn) + sql.to_sql(self.test_frame1, "test_frame", self.conn) - cols = ['A', 'B'] - result = sql.read_sql_table('test_frame', self.conn, columns=cols) + cols = ["A", "B"] + result = sql.read_sql_table("test_frame", self.conn, columns=cols) assert result.columns.tolist() == cols def test_read_table_index_col(self): # test columns argument in read_table - sql.to_sql(self.test_frame1, 'test_frame', self.conn) + sql.to_sql(self.test_frame1, "test_frame", self.conn) - result = sql.read_sql_table('test_frame', self.conn, index_col="index") + result = sql.read_sql_table("test_frame", self.conn, index_col="index") assert result.index.names == ["index"] - result = sql.read_sql_table( - 'test_frame', self.conn, index_col=["A", "B"]) + result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) assert result.index.names == ["A", "B"] - result = sql.read_sql_table('test_frame', self.conn, - index_col=["A", "B"], - columns=["C", "D"]) + result = sql.read_sql_table( + "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] + ) assert result.index.names == ["A", "B"] assert result.columns.tolist() == ["C", "D"] def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query( - "SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql( - "SELECT * FROM iris", self.conn) + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) tm.assert_frame_equal(iris_frame1, iris_frame2) - iris_frame1 = sql.read_sql_table('iris', self.conn) - iris_frame2 = sql.read_sql('iris', self.conn) + iris_frame1 = sql.read_sql_table("iris", 
self.conn) + iris_frame2 = sql.read_sql("iris", self.conn) tm.assert_frame_equal(iris_frame1, iris_frame2) def test_not_reflect_all_tables(self): @@ -933,8 +1004,8 @@ def test_not_reflect_all_tables(self): # Cause all warnings to always be triggered. warnings.simplefilter("always") # Trigger a warning. - sql.read_sql_table('other_table', self.conn) - sql.read_sql_query('SELECT * FROM other_table', self.conn) + sql.read_sql_table("other_table", self.conn) + sql.read_sql_query("SELECT * FROM other_table", self.conn) # Verify some things assert len(w) == 0 @@ -948,26 +1019,28 @@ def test_warning_case_insensitive_table_name(self): # Cause all warnings to always be triggered. warnings.simplefilter("always") # This should not trigger a Warning - self.test_frame1.to_sql('CaseSensitive', self.conn) + self.test_frame1.to_sql("CaseSensitive", self.conn) # Verify some things assert len(w) == 0 def _get_index_columns(self, tbl_name): from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) - ixs = insp.get_indexes('test_index_saved') - ixs = [i['column_names'] for i in ixs] + ixs = insp.get_indexes("test_index_saved") + ixs = [i["column_names"] for i in ixs] return ixs def test_sqlalchemy_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], - utc=True)}) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c['time'].type, sqltypes.TIMESTAMP) + assert isinstance(table.table.c["time"].type, sqltypes.TIMESTAMP) def test_database_uri_string(self): @@ -977,12 +1050,12 @@ def test_database_uri_string(self): # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near # "iris": syntax error [SQL: 'iris'] with tm.ensure_clean() as name: - db_uri = 'sqlite:///' + name - table = 'iris' - test_frame1.to_sql(table, db_uri, if_exists='replace', index=False) + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(table, db_uri, if_exists="replace", index=False) test_frame2 = sql.read_sql(table, db_uri) test_frame3 = sql.read_sql_table(table, db_uri) - query = 'SELECT * FROM iris' + query = "SELECT * FROM iris" test_frame4 = sql.read_sql_query(query, db_uri) tm.assert_frame_equal(test_frame1, test_frame2) tm.assert_frame_equal(test_frame1, test_frame3) @@ -993,6 +1066,7 @@ def test_database_uri_string(self): try: # the rest of this test depends on pg8000's being absent import pg8000 # noqa + pytest.skip("pg8000 is installed") except ImportError: pass @@ -1004,34 +1078,35 @@ def test_database_uri_string(self): def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() - iris = sa.Table('iris', metadata, - sa.Column('SepalLength', sa.REAL), - sa.Column('SepalWidth', sa.REAL), - sa.Column('PetalLength', sa.REAL), - sa.Column('PetalWidth', sa.REAL), - sa.Column('Name', sa.TEXT) - ) + iris = sa.Table( + "iris", + metadata, + sa.Column("SepalLength", sa.REAL), + sa.Column("SepalWidth", sa.REAL), + sa.Column("PetalLength", sa.REAL), + sa.Column("PetalWidth", sa.REAL), + sa.Column("Name", sa.TEXT), + ) return iris def test_query_by_text_obj(self): # WIP : GH10846 - name_text = sqlalchemy.text('select * from iris where name=:name') - iris_df = sql.read_sql(name_text, self.conn, params={ - 'name': 'Iris-versicolor'}) - all_names 
= set(iris_df['Name']) - assert all_names == {'Iris-versicolor'} + name_text = sqlalchemy.text("select * from iris where name=:name") + iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} def test_query_by_select_obj(self): # WIP : GH10846 iris = self._make_iris_table_metadata() name_select = sqlalchemy.select([iris]).where( - iris.c.Name == sqlalchemy.bindparam('name')) - iris_df = sql.read_sql(name_select, self.conn, - params={'name': 'Iris-setosa'}) - all_names = set(iris_df['Name']) - assert all_names == {'Iris-setosa'} + iris.c.Name == sqlalchemy.bindparam("name") + ) + iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} class _EngineToConnMixin: @@ -1070,8 +1145,9 @@ class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): Test the public sqlite connection fallback API """ - flavor = 'sqlite' - mode = 'fallback' + + flavor = "sqlite" + mode = "fallback" def connect(self, database=":memory:"): return sqlite3.connect(database) @@ -1083,20 +1159,18 @@ def test_sql_open_close(self): with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", - conn, index=False) + sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, index=False) conn.close() conn = self.connect(name) - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", - conn) + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) conn.close() tm.assert_frame_equal(self.test_frame3, result) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason='SQLAlchemy is installed') + @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") def test_con_string_import_error(self): - conn = 'mysql://root@localhost/pandas_nosetest' + conn = "mysql://root@localhost/pandas_nosetest" msg = "Using URI string without sqlalchemy installed" with pytest.raises(ImportError, match=msg): sql.read_sql("SELECT * FROM iris", conn) @@ -1108,36 +1182,37 @@ def test_read_sql_delegate(self): msg = "Execution failed on sql 'iris': near \"iris\": syntax error" with pytest.raises(sql.DatabaseError, match=msg): - sql.read_sql('iris', self.conn) + sql.read_sql("iris", self.conn) def test_safe_names_warning(self): # GH 6798 - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b "]) # has a space # warns on create table with spaces in names with tm.assert_produces_warning(): sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) def test_get_schema2(self): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, 'test') - assert 'CREATE' in create_sql + create_sql = sql.get_schema(self.test_frame1, "test") + assert "CREATE" in create_sql def _get_sqlite_column_type(self, schema, column): - for col in schema.split('\n'): + for col in schema.split("\n"): if col.split()[0].strip('""') == column: return col.split()[1] - raise ValueError('Column %s not found' % (column)) + raise ValueError("Column %s not found" % (column)) def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], - utc=True)}) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) db = sql.SQLiteDatabase(self.conn) table = 
sql.SQLiteTable("test_type", db, frame=df) schema = table.sql_schema() - assert self._get_sqlite_column_type(schema, 'time') == "TIMESTAMP" + assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" # ----------------------------------------------------------------------------- @@ -1152,9 +1227,10 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): deviate for each flavor are overwritten there. """ + flavor = None - @pytest.fixture(autouse=True, scope='class') + @pytest.fixture(autouse=True, scope="class") def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1173,7 +1249,7 @@ def setup_method(self, load_iris_data): def setup_import(cls): # Skip this test if SQLAlchemy not available if not SQLALCHEMY_INSTALLED: - pytest.skip('SQLAlchemy not installed') + pytest.skip("SQLAlchemy not installed") @classmethod def setup_driver(cls): @@ -1190,8 +1266,7 @@ def setup_connect(self): # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: - pytest.skip( - "Can't connect to {0} server".format(self.flavor)) + pytest.skip("Can't connect to {0} server".format(self.flavor)) def test_read_sql(self): self._read_sql_iris() @@ -1218,7 +1293,7 @@ def test_to_sql_append(self): self._to_sql_append() def test_to_sql_method_multi(self): - self._to_sql(method='multi') + self._to_sql(method="multi") def test_to_sql_method_callable(self): self._to_sql_method_callable() @@ -1226,27 +1301,29 @@ def test_to_sql_method_callable(self): def test_create_table(self): temp_conn = self.connect() temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) pandasSQL = sql.SQLDatabase(temp_conn) - pandasSQL.to_sql(temp_frame, 'temp_frame') + pandasSQL.to_sql(temp_frame, "temp_frame") - assert temp_conn.has_table('temp_frame') + assert temp_conn.has_table("temp_frame") def test_drop_table(self): temp_conn = self.connect() temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) pandasSQL = sql.SQLDatabase(temp_conn) - pandasSQL.to_sql(temp_frame, 'temp_frame') + pandasSQL.to_sql(temp_frame, "temp_frame") - assert temp_conn.has_table('temp_frame') + assert temp_conn.has_table("temp_frame") - pandasSQL.drop_table('temp_frame') + pandasSQL.drop_table("temp_frame") - assert not temp_conn.has_table('temp_frame') + assert not temp_conn.has_table("temp_frame") def test_roundtrip(self): self._roundtrip() @@ -1260,9 +1337,9 @@ def test_read_table(self): def test_read_table_columns(self): iris_frame = sql.read_sql_table( - "iris", con=self.conn, columns=['SepalLength', 'SepalLength']) - tm.equalContents( - iris_frame.columns.values, ['SepalLength', 'SepalLength']) + "iris", con=self.conn, columns=["SepalLength", "SepalLength"] + ) + tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) def test_read_table_absent_raises(self): msg = "Table this_doesnt_exist not found" @@ -1283,9 +1360,9 @@ def test_default_type_conversion(self): def test_bigint(self): # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={'i64': [2**62]}) - df.to_sql('test_bigint', self.conn, index=False) - result = sql.read_sql_table('test_bigint', self.conn) + df = DataFrame(data={"i64": [2 ** 62]}) + df.to_sql("test_bigint", self.conn, index=False) + result = sql.read_sql_table("test_bigint", self.conn) tm.assert_frame_equal(df, result) @@ -1308,32 +1385,35 @@ def check(col): # "2000-01-01 
00:00:00-08:00" should convert to # "2000-01-01 08:00:00" - assert col[0] == Timestamp('2000-01-01 08:00:00') + assert col[0] == Timestamp("2000-01-01 08:00:00") # "2000-06-01 00:00:00-07:00" should convert to # "2000-06-01 07:00:00" - assert col[1] == Timestamp('2000-06-01 07:00:00') + assert col[1] == Timestamp("2000-06-01 07:00:00") elif is_datetime64tz_dtype(col.dtype): - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" # "2000-06-01 00:00:00-07:00" should convert to # "2000-06-01 07:00:00" # GH 6415 - expected_data = [Timestamp('2000-01-01 08:00:00', tz='UTC'), - Timestamp('2000-06-01 07:00:00', tz='UTC')] + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] expected = Series(expected_data, name=col.name) tm.assert_series_equal(col, expected) else: - raise AssertionError("DateCol loaded with incorrect type " - "-> {0}".format(col.dtype)) + raise AssertionError( + "DateCol loaded with incorrect type " "-> {0}".format(col.dtype) + ) # GH11216 df = pd.read_sql_query("select * from types_test_data", self.conn) - if not hasattr(df, 'DateColWithTz'): + if not hasattr(df, "DateColWithTz"): pytest.skip("no column with datetime with time zone") # this is parsed on Travis (linux), but not on macosx for some reason @@ -1342,21 +1422,27 @@ def check(col): col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - df = pd.read_sql_query("select * from types_test_data", - self.conn, parse_dates=['DateColWithTz']) - if not hasattr(df, 'DateColWithTz'): + df = pd.read_sql_query( + "select * from types_test_data", self.conn, parse_dates=["DateColWithTz"] + ) + if not hasattr(df, "DateColWithTz"): pytest.skip("no column with datetime with time zone") col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" check(df.DateColWithTz) - df = pd.concat(list(pd.read_sql_query("select * from types_test_data", - self.conn, chunksize=1)), - ignore_index=True) + df = pd.concat( + list( + pd.read_sql_query( + "select * from types_test_data", self.conn, chunksize=1 + ) + ), + ignore_index=True, + ) col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" expected = sql.read_sql_table("types_test_data", self.conn) col = expected.DateColWithTz assert is_datetime64tz_dtype(col.dtype) @@ -1372,106 +1458,111 @@ def test_datetime_with_timezone_roundtrip(self): # Write datetimetz data to a db and read it back # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned - expected = DataFrame({'A': date_range( - '2013-01-01 09:00:00', periods=3, tz='US/Pacific' - )}) - expected.to_sql('test_datetime_tz', self.conn, index=False) + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + expected.to_sql("test_datetime_tz", self.conn, index=False) - if self.flavor == 'postgresql': + if self.flavor == "postgresql": # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC - expected['A'] = expected['A'].dt.tz_convert('UTC') + expected["A"] = expected["A"].dt.tz_convert("UTC") else: # Otherwise, timestamps are returned as local, naive - expected['A'] = expected['A'].dt.tz_localize(None) + expected["A"] = expected["A"].dt.tz_localize(None) - result = sql.read_sql_table('test_datetime_tz', self.conn) + result = sql.read_sql_table("test_datetime_tz", self.conn) tm.assert_frame_equal(result, expected) - result = sql.read_sql_query( - 'SELECT * FROM test_datetime_tz', self.conn - ) - if self.flavor == 'sqlite': + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) + if self.flavor == "sqlite": # read_sql_query does not return datetime type like read_sql_table - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A']) + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) tm.assert_frame_equal(result, expected) def test_naive_datetimeindex_roundtrip(self): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC - dates = date_range('2018-01-01', periods=5, freq='6H') - expected = DataFrame({'nums': range(5)}, index=dates) - expected.to_sql('foo_table', self.conn, index_label='info_date') - result = sql.read_sql_table('foo_table', self.conn, - index_col='info_date') + dates = date_range("2018-01-01", periods=5, freq="6H") + expected = DataFrame({"nums": range(5)}, index=dates) + expected.to_sql("foo_table", self.conn, index_label="info_date") + result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") # result index with gain a name from a set_index operation; expected tm.assert_frame_equal(result, expected, check_names=False) def test_date_parsing(self): # No Parsing df = sql.read_sql_table("types_test_data", self.conn) - expected_type = object if self.flavor == 'sqlite' else np.datetime64 + expected_type = object if self.flavor == "sqlite" else np.datetime64 assert issubclass(df.DateCol.dtype.type, expected_type) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates=['DateCol']) + df = sql.read_sql_table("types_test_data", self.conn, parse_dates=["DateCol"]) assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, parse_dates={ - 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}}) + df = sql.read_sql_table( + "types_test_data", + self.conn, + parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) df = sql.read_sql_table( - "types_test_data", self.conn, parse_dates=['IntDateCol']) + "types_test_data", self.conn, parse_dates=["IntDateCol"] + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) df = sql.read_sql_table( - "types_test_data", self.conn, parse_dates={'IntDateCol': 's'}) + "types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates={'IntDateCol': {'unit': 's'}}) + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) def test_datetime(self): - df = DataFrame({'A': 
date_range('2013-01-01 09:00:00', periods=3), - 'B': np.arange(3.0)}) - df.to_sql('test_datetime', self.conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.to_sql("test_datetime", self.conn) # with read_table -> type information from schema used - result = sql.read_sql_table('test_datetime', self.conn) - result = result.drop('index', axis=1) + result = sql.read_sql_table("test_datetime", self.conn) + result = result.drop("index", axis=1) tm.assert_frame_equal(result, df) # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) - result = result.drop('index', axis=1) - if self.flavor == 'sqlite': - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A']) + result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) + result = result.drop("index", axis=1) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) tm.assert_frame_equal(result, df) else: tm.assert_frame_equal(result, df) def test_datetime_NaT(self): - df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), - 'B': np.arange(3.0)}) - df.loc[1, 'A'] = np.nan - df.to_sql('test_datetime', self.conn, index=False) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + df.to_sql("test_datetime", self.conn, index=False) # with read_table -> type information from schema used - result = sql.read_sql_table('test_datetime', self.conn) + result = sql.read_sql_table("test_datetime", self.conn) tm.assert_frame_equal(result, df) # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) - if self.flavor == 'sqlite': - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A'], errors='coerce') + result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") tm.assert_frame_equal(result, df) else: tm.assert_frame_equal(result, df) @@ -1479,18 +1570,18 @@ def test_datetime_NaT(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False) - res = read_sql_table('test_date', self.conn) - result = res['a'] - expected = to_datetime(df['a']) + df.to_sql("test_date", self.conn, index=False) + res = read_sql_table("test_date", self.conn) + result = res["a"] + expected = to_datetime(df["a"]) # comes back as datetime64 tm.assert_series_equal(result, expected) def test_datetime_time(self): # test support for datetime.time df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False) - res = read_sql_table('test_time', self.conn) + df.to_sql("test_time", self.conn, index=False) + res = read_sql_table("test_time", self.conn) tm.assert_frame_equal(res, df) # GH8341 @@ -1502,7 +1593,7 @@ def test_datetime_time(self): tm.assert_frame_equal(ref, res) # check if adapter is in place # then test if sqlalchemy is unaffected by the sqlite adapter sql.to_sql(df, "test_time3", self.conn, index=False) - if self.flavor == 'sqlite': + if self.flavor == "sqlite": res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) ref = df.applymap(lambda _: 
_.strftime("%H:%M:%S.%f")) tm.assert_frame_equal(ref, res) @@ -1511,9 +1602,9 @@ def test_datetime_time(self): def test_mixed_dtype_insert(self): # see GH6509 - s1 = Series(2**25 + 1, dtype=np.int32) + s1 = Series(2 ** 25 + 1, dtype=np.int32) s2 = Series(0.0, dtype=np.float32) - df = DataFrame({'s1': s1, 's2': s2}) + df = DataFrame({"s1": s1, "s2": s2}) # write and read again df.to_sql("test_read_write", self.conn, index=False) @@ -1523,53 +1614,54 @@ def test_mixed_dtype_insert(self): def test_nan_numeric(self): # NaNs in numeric float column - df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("test_nan", self.conn, index=False) # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def test_nan_fullcolumn(self): # full NaN column (numeric float column) - df = DataFrame({'A': [0, 1, 2], 'B': [np.nan, np.nan, np.nan]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) + df.to_sql("test_nan", self.conn, index=False) # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql -> not type info from table -> stays None - df['B'] = df['B'].astype('object') - df['B'] = None - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + df["B"] = df["B"].astype("object") + df["B"] = None + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def test_nan_string(self): # NaNs in string column - df = DataFrame({'A': [0, 1, 2], 'B': ['a', 'b', np.nan]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) + df.to_sql("test_nan", self.conn, index=False) # NaNs are coming back as None - df.loc[2, 'B'] = None + df.loc[2, "B"] = None # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def _get_index_columns(self, tbl_name): from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) ixs = insp.get_indexes(tbl_name) - ixs = [i['column_names'] for i in ixs] + ixs = [i["column_names"] for i in ixs] return ixs def test_to_sql_save_index(self): @@ -1584,112 +1676,117 @@ def test_get_schema_create_table(self): # mismatch) self._load_test3_data() - tbl = 'test_get_schema_create_table' + tbl = "test_get_schema_create_table" create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn) blank_test_df = self.test_frame3.iloc[:0] self.drop_table(tbl) self.conn.execute(create_sql) returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, - check_index_type=False) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) self.drop_table(tbl) def test_dtype(self): - cols = ['A', 'B'] - data = [(0.8, True), - (0.9, None)] + cols = ["A", "B"] + data = 
[(0.8, True), (0.9, None)] df = DataFrame(data, columns=cols) - df.to_sql('dtype_test', self.conn) - df.to_sql('dtype_test2', self.conn, dtype={'B': sqlalchemy.TEXT}) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", self.conn, dtype={"B": sqlalchemy.TEXT}) meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - sqltype = meta.tables['dtype_test2'].columns['B'].type + sqltype = meta.tables["dtype_test2"].columns["B"].type assert isinstance(sqltype, sqlalchemy.TEXT) msg = "The type of B is not a SQLAlchemy type" with pytest.raises(ValueError, match=msg): - df.to_sql('error', self.conn, dtype={'B': str}) + df.to_sql("error", self.conn, dtype={"B": str}) # GH9083 - df.to_sql('dtype_test3', self.conn, dtype={'B': sqlalchemy.String(10)}) + df.to_sql("dtype_test3", self.conn, dtype={"B": sqlalchemy.String(10)}) meta.reflect() - sqltype = meta.tables['dtype_test3'].columns['B'].type + sqltype = meta.tables["dtype_test3"].columns["B"].type assert isinstance(sqltype, sqlalchemy.String) assert sqltype.length == 10 # single dtype - df.to_sql('single_dtype_test', self.conn, dtype=sqlalchemy.TEXT) + df.to_sql("single_dtype_test", self.conn, dtype=sqlalchemy.TEXT) meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - sqltypea = meta.tables['single_dtype_test'].columns['A'].type - sqltypeb = meta.tables['single_dtype_test'].columns['B'].type + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type assert isinstance(sqltypea, sqlalchemy.TEXT) assert isinstance(sqltypeb, sqlalchemy.TEXT) def test_notna_dtype(self): - cols = {'Bool': Series([True, None]), - 'Date': Series([datetime(2012, 5, 1), None]), - 'Int': Series([1, None], dtype='object'), - 'Float': Series([1.1, None]) - } + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } df = DataFrame(cols) - tbl = 'notna_dtype_test' + tbl = "notna_dtype_test" df.to_sql(tbl, self.conn) returned_df = sql.read_sql_table(tbl, self.conn) # noqa meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - if self.flavor == 'mysql': + if self.flavor == "mysql": my_type = sqltypes.Integer else: my_type = sqltypes.Boolean col_dict = meta.tables[tbl].columns - assert isinstance(col_dict['Bool'].type, my_type) - assert isinstance(col_dict['Date'].type, sqltypes.DateTime) - assert isinstance(col_dict['Int'].type, sqltypes.Integer) - assert isinstance(col_dict['Float'].type, sqltypes.Float) + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, sqltypes.DateTime) + assert isinstance(col_dict["Int"].type, sqltypes.Integer) + assert isinstance(col_dict["Float"].type, sqltypes.Float) def test_double_precision(self): V = 1.23456789101112131415 - df = DataFrame({'f32': Series([V, ], dtype='float32'), - 'f64': Series([V, ], dtype='float64'), - 'f64_as_f32': Series([V, ], dtype='float64'), - 'i32': Series([5, ], dtype='int32'), - 'i64': Series([5, ], dtype='int64'), - }) + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), + } + ) - df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace', - dtype={'f64_as_f32': sqlalchemy.Float(precision=23)}) - res = sql.read_sql_table('test_dtypes', self.conn) + df.to_sql( + "test_dtypes", + 
self.conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": sqlalchemy.Float(precision=23)}, + ) + res = sql.read_sql_table("test_dtypes", self.conn) # check precision of float64 - assert (np.round(df['f64'].iloc[0], 14) == - np.round(res['f64'].iloc[0], 14)) + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) # check sql types meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - col_dict = meta.tables['test_dtypes'].columns - assert str(col_dict['f32'].type) == str(col_dict['f64_as_f32'].type) - assert isinstance(col_dict['f32'].type, sqltypes.Float) - assert isinstance(col_dict['f64'].type, sqltypes.Float) - assert isinstance(col_dict['i32'].type, sqltypes.Integer) - assert isinstance(col_dict['i64'].type, sqltypes.BigInteger) + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert isinstance(col_dict["f32"].type, sqltypes.Float) + assert isinstance(col_dict["f64"].type, sqltypes.Float) + assert isinstance(col_dict["i32"].type, sqltypes.Integer) + assert isinstance(col_dict["i64"].type, sqltypes.BigInteger) def test_connectable_issue_example(self): # This tests the example raised in issue # https://github.com/pandas-dev/pandas/issues/10104 def foo(connection): - query = 'SELECT test_foo_data FROM test_foo_data' + query = "SELECT test_foo_data FROM test_foo_data" return sql.read_sql_query(query, con=connection) def bar(connection, data): - data.to_sql(name='test_foo_data', - con=connection, if_exists='append') + data.to_sql(name="test_foo_data", con=connection, if_exists="append") def main(connectable): with connectable.connect() as conn: @@ -1697,18 +1794,17 @@ def main(connectable): foo_data = conn.run_callable(foo) conn.run_callable(bar, foo_data) - DataFrame({'test_foo_data': [0, 1, 2]}).to_sql( - 'test_foo_data', self.conn) + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql("test_foo_data", self.conn) main(self.conn) def test_temporary_table(self): - test_data = 'Hello, World!' - expected = DataFrame({'spam': [test_data]}) + test_data = "Hello, World!" + expected = DataFrame({"spam": [test_data]}) Base = declarative.declarative_base() class Temporary(Base): - __tablename__ = 'temp_test' - __table_args__ = {'prefixes': ['TEMPORARY']} + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) spam = sqlalchemy.Column(sqlalchemy.Unicode(30), nullable=False) @@ -1719,19 +1815,14 @@ class Temporary(Base): Temporary.__table__.create(conn) session.add(Temporary(spam=test_data)) session.flush() - df = sql.read_sql_query( - sql=sqlalchemy.select([Temporary.spam]), - con=conn, - ) + df = sql.read_sql_query(sql=sqlalchemy.select([Temporary.spam]), con=conn) tm.assert_frame_equal(df, expected) class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): - def test_transactions(self): - pytest.skip( - "Nested transactions rollbacks don't work with Pandas") + pytest.skip("Nested transactions rollbacks don't work with Pandas") class _TestSQLiteAlchemy: @@ -1739,11 +1830,12 @@ class _TestSQLiteAlchemy: Test the sqlalchemy backend against an in-memory sqlite database. 
""" - flavor = 'sqlite' + + flavor = "sqlite" @classmethod def connect(cls): - return sqlalchemy.create_engine('sqlite:///:memory:') + return sqlalchemy.create_engine("sqlite:///:memory:") @classmethod def setup_driver(cls): @@ -1773,12 +1865,12 @@ def test_default_date_load(self): def test_bigint_warning(self): # test no warning for BIGINT (to support int64) is raised (GH7433) - df = DataFrame({'a': [1, 2]}, dtype='int64') - df.to_sql('test_bigintwarning', self.conn, index=False) + df = DataFrame({"a": [1, 2]}, dtype="int64") + df.to_sql("test_bigintwarning", self.conn, index=False) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - sql.read_sql_table('test_bigintwarning', self.conn) + sql.read_sql_table("test_bigintwarning", self.conn) assert len(w) == 0 @@ -1787,20 +1879,21 @@ class _TestMySQLAlchemy: Test the sqlalchemy backend against an MySQL database. """ - flavor = 'mysql' + + flavor = "mysql" @classmethod def connect(cls): - url = 'mysql+{driver}://root@localhost/pandas_nosetest' - return sqlalchemy.create_engine(url.format(driver=cls.driver), - connect_args=cls.connect_args) + url = "mysql+{driver}://root@localhost/pandas_nosetest" + return sqlalchemy.create_engine( + url.format(driver=cls.driver), connect_args=cls.connect_args + ) @classmethod def setup_driver(cls): - pymysql = pytest.importorskip('pymysql') - cls.driver = 'pymysql' - cls.connect_args = { - 'client_flag': pymysql.constants.CLIENT.MULTI_STATEMENTS} + pymysql = pytest.importorskip("pymysql") + cls.driver = "pymysql" + cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} def test_default_type_conversion(self): df = sql.read_sql_table("types_test_data", self.conn) @@ -1819,10 +1912,11 @@ def test_default_type_conversion(self): def test_read_procedure(self): import pymysql + # see GH7324. Although it is more an api test, it is added to the # mysql tests as sqlite does not have stored procedures - df = DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}) - df.to_sql('test_procedure', self.conn, index=False) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df.to_sql("test_procedure", self.conn, index=False) proc = """DROP PROCEDURE IF EXISTS get_testdb; @@ -1854,48 +1948,49 @@ class _TestPostgreSQLAlchemy: Test the sqlalchemy backend against an PostgreSQL database. 
""" - flavor = 'postgresql' + + flavor = "postgresql" @classmethod def connect(cls): - url = 'postgresql+{driver}://postgres@localhost/pandas_nosetest' + url = "postgresql+{driver}://postgres@localhost/pandas_nosetest" return sqlalchemy.create_engine(url.format(driver=cls.driver)) @classmethod def setup_driver(cls): - pytest.importorskip('psycopg2') - cls.driver = 'psycopg2' + pytest.importorskip("psycopg2") + cls.driver = "psycopg2" def test_schema_support(self): # only test this for postgresql (schema's not supported in # mysql/sqlite) - df = DataFrame({'col1': [1, 2], 'col2': [ - 0.1, 0.2], 'col3': ['a', 'n']}) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) # create a schema self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") self.conn.execute("CREATE SCHEMA other;") # write dataframe to different schema's - df.to_sql('test_schema_public', self.conn, index=False) - df.to_sql('test_schema_public_explicit', self.conn, index=False, - schema='public') - df.to_sql('test_schema_other', self.conn, index=False, schema='other') + df.to_sql("test_schema_public", self.conn, index=False) + df.to_sql( + "test_schema_public_explicit", self.conn, index=False, schema="public" + ) + df.to_sql("test_schema_other", self.conn, index=False, schema="other") # read dataframes back in - res1 = sql.read_sql_table('test_schema_public', self.conn) + res1 = sql.read_sql_table("test_schema_public", self.conn) tm.assert_frame_equal(df, res1) - res2 = sql.read_sql_table('test_schema_public_explicit', self.conn) + res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) tm.assert_frame_equal(df, res2) - res3 = sql.read_sql_table('test_schema_public_explicit', self.conn, - schema='public') + res3 = sql.read_sql_table( + "test_schema_public_explicit", self.conn, schema="public" + ) tm.assert_frame_equal(df, res3) - res4 = sql.read_sql_table('test_schema_other', self.conn, - schema='other') + res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") tm.assert_frame_equal(df, res4) msg = "Table test_schema_other not found" with pytest.raises(ValueError, match=msg): - sql.read_sql_table('test_schema_other', self.conn, schema='public') + sql.read_sql_table("test_schema_other", self.conn, schema="public") # different if_exists options @@ -1904,13 +1999,22 @@ def test_schema_support(self): self.conn.execute("CREATE SCHEMA other;") # write dataframe with different if_exists options - df.to_sql('test_schema_other', self.conn, schema='other', index=False) - df.to_sql('test_schema_other', self.conn, schema='other', index=False, - if_exists='replace') - df.to_sql('test_schema_other', self.conn, schema='other', index=False, - if_exists='append') - res = sql.read_sql_table( - 'test_schema_other', self.conn, schema='other') + df.to_sql("test_schema_other", self.conn, schema="other", index=False) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="replace", + ) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="append", + ) + res = sql.read_sql_table("test_schema_other", self.conn, schema="other") tm.assert_frame_equal(concat([df, df], ignore_index=True), res) # specifying schema in user-provided meta @@ -1919,16 +2023,13 @@ def test_schema_support(self): # because of transactional schemas if isinstance(self.conn, sqlalchemy.engine.Engine): engine2 = self.connect() - meta = sqlalchemy.MetaData(engine2, schema='other') + meta = sqlalchemy.MetaData(engine2, schema="other") pdsql = 
sql.SQLDatabase(engine2, meta=meta) - pdsql.to_sql(df, 'test_schema_other2', index=False) - pdsql.to_sql(df, 'test_schema_other2', - index=False, if_exists='replace') - pdsql.to_sql(df, 'test_schema_other2', - index=False, if_exists='append') - res1 = sql.read_sql_table( - 'test_schema_other2', self.conn, schema='other') - res2 = pdsql.read_table('test_schema_other2') + pdsql.to_sql(df, "test_schema_other2", index=False) + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") + res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") + res2 = pdsql.read_table("test_schema_other2") tm.assert_frame_equal(res1, res2) def test_copy_from_callable_insertion_method(self): @@ -1944,21 +2045,22 @@ def psql_insert_copy(table, conn, keys, data_iter): writer.writerows(data_iter) s_buf.seek(0) - columns = ', '.join('"{}"'.format(k) for k in keys) + columns = ", ".join('"{}"'.format(k) for k in keys) if table.schema: - table_name = '{}.{}'.format(table.schema, table.name) + table_name = "{}.{}".format(table.schema, table.name) else: table_name = table.name - sql_query = 'COPY {} ({}) FROM STDIN WITH CSV'.format( - table_name, columns) + sql_query = "COPY {} ({}) FROM STDIN WITH CSV".format( + table_name, columns + ) cur.copy_expert(sql=sql_query, file=s_buf) - expected = DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2], - 'col3': ['a', 'n']}) - expected.to_sql('test_copy_insert', self.conn, index=False, - method=psql_insert_copy) - result = sql.read_sql_table('test_copy_insert', self.conn) + expected = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + expected.to_sql( + "test_copy_insert", self.conn, index=False, method=psql_insert_copy + ) + result = sql.read_sql_table("test_copy_insert", self.conn) tm.assert_frame_equal(result, expected) @@ -1999,17 +2101,19 @@ class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn): # ----------------------------------------------------------------------------- # -- Test Sqlite / MySQL fallback + @pytest.mark.single class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): """ Test the fallback mode against an in-memory sqlite database. 
""" - flavor = 'sqlite' + + flavor = "sqlite" @classmethod def connect(cls): - return sqlite3.connect(':memory:') + return sqlite3.connect(":memory:") def setup_connect(self): self.conn = self.connect() @@ -2048,15 +2152,16 @@ def test_to_sql_append(self): def test_create_and_drop_table(self): temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) - self.pandasSQL.to_sql(temp_frame, 'drop_test_frame') + self.pandasSQL.to_sql(temp_frame, "drop_test_frame") - assert self.pandasSQL.has_table('drop_test_frame') + assert self.pandasSQL.has_table("drop_test_frame") - self.pandasSQL.drop_table('drop_test_frame') + self.pandasSQL.drop_table("drop_test_frame") - assert not self.pandasSQL.has_table('drop_test_frame') + assert not self.pandasSQL.has_table("drop_test_frame") def test_roundtrip(self): self._roundtrip() @@ -2067,32 +2172,33 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False) - res = read_sql_query('SELECT * FROM test_date', self.conn) - if self.flavor == 'sqlite': + df.to_sql("test_date", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_date", self.conn) + if self.flavor == "sqlite": # comes back as strings tm.assert_frame_equal(res, df.astype(str)) - elif self.flavor == 'mysql': + elif self.flavor == "mysql": tm.assert_frame_equal(res, df) def test_datetime_time(self): # test support for datetime.time, GH #8341 df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False) - res = read_sql_query('SELECT * FROM test_time', self.conn) - if self.flavor == 'sqlite': + df.to_sql("test_time", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_time", self.conn) + if self.flavor == "sqlite": # comes back as strings expected = df.applymap(lambda _: _.strftime("%H:%M:%S.%f")) tm.assert_frame_equal(res, expected) def _get_index_columns(self, tbl_name): ixs = sql.read_sql_query( - "SELECT * FROM sqlite_master WHERE type = 'index' " + - "AND tbl_name = '%s'" % tbl_name, self.conn) + "SELECT * FROM sqlite_master WHERE type = 'index' " + + "AND tbl_name = '%s'" % tbl_name, + self.conn, + ) ix_cols = [] for ix_name in ixs.name: - ix_info = sql.read_sql_query( - "PRAGMA index_info(%s)" % ix_name, self.conn) + ix_info = sql.read_sql_query("PRAGMA index_info(%s)" % ix_name, self.conn) ix_cols.append(ix_info.name.tolist()) return ix_cols @@ -2105,76 +2211,81 @@ def test_transactions(self): self._transaction_test() def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute('PRAGMA table_info(%s)' % table) + recs = self.conn.execute("PRAGMA table_info(%s)" % table) for cid, name, ctype, not_null, default, pk in recs: if name == column: return ctype - raise ValueError('Table %s, column %s not found' % (table, column)) + raise ValueError("Table %s, column %s not found" % (table, column)) def test_dtype(self): - if self.flavor == 'mysql': - pytest.skip('Not applicable to MySQL legacy') - cols = ['A', 'B'] - data = [(0.8, True), - (0.9, None)] + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] df = DataFrame(data, columns=cols) - df.to_sql('dtype_test', self.conn) - df.to_sql('dtype_test2', self.conn, dtype={'B': 'STRING'}) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", 
self.conn, dtype={"B": "STRING"}) # sqlite stores Boolean values as INTEGER - assert self._get_sqlite_column_type( - 'dtype_test', 'B') == 'INTEGER' + assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" - assert self._get_sqlite_column_type( - 'dtype_test2', 'B') == 'STRING' + assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" msg = r"B \(\) not a string" with pytest.raises(ValueError, match=msg): - df.to_sql('error', self.conn, dtype={'B': bool}) + df.to_sql("error", self.conn, dtype={"B": bool}) # single dtype - df.to_sql('single_dtype_test', self.conn, dtype='STRING') - assert self._get_sqlite_column_type( - 'single_dtype_test', 'A') == 'STRING' - assert self._get_sqlite_column_type( - 'single_dtype_test', 'B') == 'STRING' + df.to_sql("single_dtype_test", self.conn, dtype="STRING") + assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" + assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" def test_notna_dtype(self): - if self.flavor == 'mysql': - pytest.skip('Not applicable to MySQL legacy') - - cols = {'Bool': Series([True, None]), - 'Date': Series([datetime(2012, 5, 1), None]), - 'Int': Series([1, None], dtype='object'), - 'Float': Series([1.1, None]) - } + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } df = DataFrame(cols) - tbl = 'notna_dtype_test' + tbl = "notna_dtype_test" df.to_sql(tbl, self.conn) - assert self._get_sqlite_column_type(tbl, 'Bool') == 'INTEGER' - assert self._get_sqlite_column_type(tbl, 'Date') == 'TIMESTAMP' - assert self._get_sqlite_column_type(tbl, 'Int') == 'INTEGER' - assert self._get_sqlite_column_type(tbl, 'Float') == 'REAL' + assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" + assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Float") == "REAL" def test_illegal_names(self): # For sqlite, these should work fine - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) msg = "Empty table or column name specified" with pytest.raises(ValueError, match=msg): df.to_sql("", self.conn) for ndx, weird_name in enumerate( - ['test_weird_name]', 'test_weird_name[', - 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', - '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', - '99beginswithnumber', '12345', '\xe9']): + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): df.to_sql(weird_name, self.conn) sql.table_exists(weird_name, self.conn) - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) - c_tbl = 'test_weird_col_name%d' % ndx + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = "test_weird_col_name%d" % ndx df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) @@ -2185,7 +2296,7 @@ def test_illegal_names(self): def date_format(dt): """Returns date in YYYYMMDD format.""" - return dt.strftime('%Y%m%d') + return dt.strftime("%Y%m%d") _formatters = { @@ -2227,17 +2338,16 @@ def tquery(query, con=None, cur=None): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - 
@pytest.fixture(autouse=True) def setup_method(self, request, datapath): self.method = request.function - self.conn = sqlite3.connect(':memory:') + self.conn = sqlite3.connect(":memory:") # In some test cases we may close db connection # Re-open conn here so we can perform cleanup in teardown yield self.method = request.function - self.conn = sqlite3.connect(':memory:') + self.conn = sqlite3.connect(":memory:") def test_basic(self): frame = tm.makeTimeDataFrame() @@ -2247,7 +2357,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.iloc[0, 0] = np.nan - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(create_sql) @@ -2266,7 +2376,7 @@ def test_write_row_by_row(self): def test_execute(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(create_sql) ins = "INSERT INTO test VALUES (?, ?, ?, ?)" @@ -2281,15 +2391,15 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") lines = create_sql.splitlines() for l in lines: - tokens = l.split(' ') - if len(tokens) == 2 and tokens[0] == 'A': - assert tokens[1] == 'DATETIME' + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) lines = create_sql.splitlines() assert 'PRIMARY KEY ("A", "B")' in create_sql cur = self.conn.cursor() @@ -2337,7 +2447,7 @@ def test_na_roundtrip(self): pass def _check_roundtrip(self, frame): - sql.to_sql(frame, name='test_table', con=self.conn, index=False) + sql.to_sql(frame, name="test_table", con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. 
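# Editor's note: a minimal, hedged sketch (not part of the patch) of the
# pandas.io.sql helpers exercised by the TestXSQLite tests above -- generating
# CREATE TABLE DDL with get_schema (optionally with composite keys, which adds a
# PRIMARY KEY clause as asserted in test_schema) and round-tripping a frame
# through an in-memory sqlite3 connection. Table and column names here are
# illustrative only.
import sqlite3

import pandas as pd
from pandas.io import sql

conn = sqlite3.connect(":memory:")
frame = pd.DataFrame({"A": [1.0, 2.0], "B": ["x", "y"]})

# get_schema returns the CREATE TABLE statement pandas would emit for this frame;
# passing keys adds the composite primary key checked in the test above.
create_sql = sql.get_schema(frame, "test", keys=["A", "B"])
assert 'PRIMARY KEY ("A", "B")' in create_sql

# Round-trip: write without the index, then read the rows back.
sql.to_sql(frame, name="test_table", con=conn, index=False)
result = sql.read_sql("select * from test_table", conn)
assert len(result) == len(frame)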
@@ -2346,32 +2456,30 @@ def _check_roundtrip(self, frame): expected = frame tm.assert_frame_equal(result, expected) - frame['txt'] = ['a'] * len(frame) + frame["txt"] = ["a"] * len(frame) frame2 = frame.copy() new_idx = Index(np.arange(len(frame2))) + 10 - frame2['Idx'] = new_idx.copy() - sql.to_sql(frame2, name='test_table2', con=self.conn, index=False) - result = sql.read_sql("select * from test_table2", self.conn, - index_col='Idx') + frame2["Idx"] = new_idx.copy() + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") expected = frame.copy() expected.index = new_idx - expected.index.name = 'Idx' + expected.index.name = "Idx" tm.assert_frame_equal(expected, result) def test_keyword_as_column_names(self): - df = DataFrame({'From': np.ones(5)}) - sql.to_sql(df, con=self.conn, name='testkeywords', index=False) + df = DataFrame({"From": np.ones(5)}) + sql.to_sql(df, con=self.conn, name="testkeywords", index=False) def test_onecolumn_of_integer(self): # GH 3628 # a column_of_integers dataframe should transfer well to sql - mono_df = DataFrame([1, 2], columns=['c0']) - sql.to_sql(mono_df, con=self.conn, name='mono_df', index=False) + mono_df = DataFrame([1, 2], columns=["c0"]) + sql.to_sql(mono_df, con=self.conn, name="mono_df", index=False) # computing the sum via sql con_x = self.conn - the_sum = sum(my_c0[0] - for my_c0 in con_x.execute("select * from mono_df")) + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) # it should not fail, and gives 3 ( Issue #3628 ) assert the_sum == 3 @@ -2379,10 +2487,9 @@ def test_onecolumn_of_integer(self): tm.assert_frame_equal(result, mono_df) def test_if_exists(self): - df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame( - {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) - table_name = 'table_if_exists' + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" sql_select = "SELECT * FROM %s" % table_name def clean_up(test_table_to_drop): @@ -2394,81 +2501,113 @@ def clean_up(test_table_to_drop): msg = "'notvalidvalue' is not valid for if_exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='notvalidvalue') + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, - name=table_name, if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) msg = "Table 'table_if_exists' already exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) # test if_exists='replace' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert (tquery(sql_select, con=self.conn) == - [(3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert 
tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] clean_up(table_name) # test if_exists='append' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='append', index=False) - assert (tquery(sql_select, con=self.conn) == - [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] clean_up(table_name) @pytest.mark.single @pytest.mark.db -@pytest.mark.skip(reason="gh-13611: there is no support for MySQL " - "if SQLAlchemy is not installed") +@pytest.mark.skip( + reason="gh-13611: there is no support for MySQL " "if SQLAlchemy is not installed" +) class TestXMySQL(MySQLMixIn): - - @pytest.fixture(autouse=True, scope='class') + @pytest.fixture(autouse=True, scope="class") def setup_class(cls): - pymysql = pytest.importorskip('pymysql') - pymysql.connect(host='localhost', user='root', passwd='', - db='pandas_nosetest') + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") try: - pymysql.connect(read_default_group='pandas') + pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError: raise RuntimeError( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) except pymysql.Error: raise RuntimeError( "Cannot connect to database. " "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) @pytest.fixture(autouse=True) def setup_method(self, request, datapath): - pymysql = pytest.importorskip('pymysql') - pymysql.connect(host='localhost', user='root', passwd='', - db='pandas_nosetest') + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") try: - pymysql.connect(read_default_group='pandas') + pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError: raise RuntimeError( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) except pymysql.Error: raise RuntimeError( "Cannot connect to database. " "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." 
+ ) self.method = request.function @@ -2480,7 +2619,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.iloc[0, 0] = np.nan drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2501,18 +2640,19 @@ def test_chunksize_read_type(self): drop_sql = "DROP TABLE IF EXISTS test" cur = self.conn.cursor() cur.execute(drop_sql) - sql.to_sql(frame, name='test', con=self.conn) + sql.to_sql(frame, name="test", con=self.conn) query = "select * from test" chunksize = 5 - chunk_gen = pd.read_sql_query(sql=query, con=self.conn, - chunksize=chunksize, index_col="index") + chunk_gen = pd.read_sql_query( + sql=query, con=self.conn, chunksize=chunksize, index_col="index" + ) chunk_df = next(chunk_gen) tm.assert_frame_equal(frame[:chunksize], chunk_df) def test_execute(self): frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") @@ -2530,18 +2670,18 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") lines = create_sql.splitlines() for l in lines: - tokens = l.split(' ') - if len(tokens) == 2 and tokens[0] == 'A': - assert tokens[1] == 'DATETIME' + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) lines = create_sql.splitlines() - assert 'PRIMARY KEY (`A`, `B`)' in create_sql + assert "PRIMARY KEY (`A`, `B`)" in create_sql cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2600,7 +2740,7 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', con=self.conn, index=False) + sql.to_sql(frame, name="test_table", con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. @@ -2610,19 +2750,17 @@ def _check_roundtrip(self, frame): expected = frame tm.assert_frame_equal(result, expected) - frame['txt'] = ['a'] * len(frame) + frame["txt"] = ["a"] * len(frame) frame2 = frame.copy() index = Index(np.arange(len(frame2))) + 10 - frame2['Idx'] = index + frame2["Idx"] = index drop_sql = "DROP TABLE IF EXISTS test_table2" cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame2, name='test_table2', - con=self.conn, index=False) - result = sql.read_sql("select * from test_table2", self.conn, - index_col='Idx') + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") expected = frame.copy() # HACK! Change this once indexes are handled properly. 
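# Editor's note: a hedged illustration (not part of the patch) of the chunked-read
# behaviour covered by test_chunksize_read_type above. With chunksize set,
# pd.read_sql_query returns an iterator of DataFrames rather than a single frame.
# The in-memory sqlite connection and table name below are assumptions for the
# example only; the test itself runs against MySQL.
import sqlite3

import pandas as pd

conn = sqlite3.connect(":memory:")
frame = pd.DataFrame({"value": range(20)})
frame.to_sql("test", conn, index_label="index")

chunk_gen = pd.read_sql_query(
    "select * from test", conn, chunksize=5, index_col="index"
)
first_chunk = next(chunk_gen)  # a DataFrame holding only the first 5 rows
assert len(first_chunk) == 5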
@@ -2631,15 +2769,15 @@ def _check_roundtrip(self, frame): tm.assert_frame_equal(expected, result) def test_keyword_as_column_names(self): - df = DataFrame({'From': np.ones(5)}) - sql.to_sql(df, con=self.conn, name='testkeywords', - if_exists='replace', index=False) + df = DataFrame({"From": np.ones(5)}) + sql.to_sql( + df, con=self.conn, name="testkeywords", if_exists="replace", index=False + ) def test_if_exists(self): - df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame( - {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) - table_name = 'table_if_exists' + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" sql_select = "SELECT * FROM %s" % table_name def clean_up(test_table_to_drop): @@ -2651,33 +2789,67 @@ def clean_up(test_table_to_drop): # test if invalid value for if_exists raises appropriate error with pytest.raises(ValueError, match=""): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='notvalidvalue') + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) with pytest.raises(ValueError, match=""): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) # test if_exists='replace' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert (tquery(sql_select, con=self.conn) == - [(3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] clean_up(table_name) # test if_exists='append' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='append', index=False) - assert (tquery(sql_select, con=self.conn) == - [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] clean_up(table_name) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e6fe6e3b7888f9..715c7e370210fd 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -18,8 +18,12 @@ from pandas.io.parsers import read_csv 
from pandas.io.stata import ( - InvalidColumnName, PossiblePrecisionLoss, StataMissingValue, StataReader, - read_stata) + InvalidColumnName, + PossiblePrecisionLoss, + StataMissingValue, + StataReader, + read_stata, +) @pytest.fixture @@ -29,77 +33,75 @@ def dirpath(datapath): @pytest.fixture def parsed_114(dirpath): - dta14_114 = os.path.join(dirpath, 'stata5_114.dta') + dta14_114 = os.path.join(dirpath, "stata5_114.dta") parsed_114 = read_stata(dta14_114, convert_dates=True) - parsed_114.index.name = 'index' + parsed_114.index.name = "index" return parsed_114 class TestStata: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "data") - self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') - self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta") + self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta") - self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') - self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') - self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') - self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + self.dta2_113 = os.path.join(self.dirpath, "stata2_113.dta") + self.dta2_114 = os.path.join(self.dirpath, "stata2_114.dta") + self.dta2_115 = os.path.join(self.dirpath, "stata2_115.dta") + self.dta2_117 = os.path.join(self.dirpath, "stata2_117.dta") - self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') - self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') - self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') - self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') - self.csv3 = os.path.join(self.dirpath, 'stata3.csv') + self.dta3_113 = os.path.join(self.dirpath, "stata3_113.dta") + self.dta3_114 = os.path.join(self.dirpath, "stata3_114.dta") + self.dta3_115 = os.path.join(self.dirpath, "stata3_115.dta") + self.dta3_117 = os.path.join(self.dirpath, "stata3_117.dta") + self.csv3 = os.path.join(self.dirpath, "stata3.csv") - self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') - self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') - self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') - self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta4_113 = os.path.join(self.dirpath, "stata4_113.dta") + self.dta4_114 = os.path.join(self.dirpath, "stata4_114.dta") + self.dta4_115 = os.path.join(self.dirpath, "stata4_115.dta") + self.dta4_117 = os.path.join(self.dirpath, "stata4_117.dta") - self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta_encoding_118 = os.path.join(self.dirpath, - 'stata1_encoding_118.dta') + self.dta_encoding = os.path.join(self.dirpath, "stata1_encoding.dta") + self.dta_encoding_118 = os.path.join(self.dirpath, "stata1_encoding_118.dta") - self.csv14 = os.path.join(self.dirpath, 'stata5.csv') - self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') - self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') - self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') - self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta') + self.csv14 = os.path.join(self.dirpath, "stata5.csv") + self.dta14_113 = os.path.join(self.dirpath, "stata5_113.dta") + self.dta14_114 = os.path.join(self.dirpath, "stata5_114.dta") + self.dta14_115 = os.path.join(self.dirpath, "stata5_115.dta") + self.dta14_117 = os.path.join(self.dirpath, "stata5_117.dta") - self.csv15 = os.path.join(self.dirpath, 'stata6.csv') - 
self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') - self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') - self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') - self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta') + self.csv15 = os.path.join(self.dirpath, "stata6.csv") + self.dta15_113 = os.path.join(self.dirpath, "stata6_113.dta") + self.dta15_114 = os.path.join(self.dirpath, "stata6_114.dta") + self.dta15_115 = os.path.join(self.dirpath, "stata6_115.dta") + self.dta15_117 = os.path.join(self.dirpath, "stata6_117.dta") - self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta') - self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta') + self.dta16_115 = os.path.join(self.dirpath, "stata7_115.dta") + self.dta16_117 = os.path.join(self.dirpath, "stata7_117.dta") - self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta') - self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta') - self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta') + self.dta17_113 = os.path.join(self.dirpath, "stata8_113.dta") + self.dta17_115 = os.path.join(self.dirpath, "stata8_115.dta") + self.dta17_117 = os.path.join(self.dirpath, "stata8_117.dta") - self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') - self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta18_115 = os.path.join(self.dirpath, "stata9_115.dta") + self.dta18_117 = os.path.join(self.dirpath, "stata9_117.dta") - self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta') - self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + self.dta19_115 = os.path.join(self.dirpath, "stata10_115.dta") + self.dta19_117 = os.path.join(self.dirpath, "stata10_117.dta") - self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') - self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') + self.dta20_115 = os.path.join(self.dirpath, "stata11_115.dta") + self.dta20_117 = os.path.join(self.dirpath, "stata11_117.dta") - self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta') + self.dta21_117 = os.path.join(self.dirpath, "stata12_117.dta") - self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta') - self.dta23 = os.path.join(self.dirpath, 'stata15.dta') + self.dta22_118 = os.path.join(self.dirpath, "stata14_118.dta") + self.dta23 = os.path.join(self.dirpath, "stata15.dta") - self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta') - self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta') + self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta") + self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta") - self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') + self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta") def read_dta(self, file): # Legacy default reader configuration @@ -108,9 +110,9 @@ def read_dta(self, file): def read_csv(self, file): return read_csv(file, parse_dates=True) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_empty_dta(self, version): - empty_ds = DataFrame(columns=['unit']) + empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file with tm.ensure_clean() as path: empty_ds.to_stata(path, write_index=False, version=version) @@ -127,8 +129,7 @@ def test_data_method(self): parsed_114_read = rdr.read() tm.assert_frame_equal(parsed_114_data, parsed_114_read) - @pytest.mark.parametrize( - 'file', ['dta1_114', 'dta1_117']) + @pytest.mark.parametrize("file", ["dta1_114", "dta1_117"]) def 
test_read_dta1(self, file): file = getattr(self, file) @@ -136,13 +137,14 @@ def test_read_dta1(self, file): # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. - expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], - columns=['float_miss', 'double_miss', 'byte_miss', - 'int_miss', 'long_miss']) + expected = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) # this is an oddity as really the nan should be float64, but # the casting doesn't fail so need to match stata here - expected['float_miss'] = expected['float_miss'].astype(np.float32) + expected["float_miss"] = expected["float_miss"].astype(np.float32) tm.assert_frame_equal(parsed, expected) @@ -158,7 +160,7 @@ def test_read_dta2(self): datetime(2010, 1, 1), datetime(1974, 7, 1), datetime(2010, 1, 1), - datetime(2010, 1, 1) + datetime(2010, 1, 1), ), ( datetime(1959, 12, 31, 20, 3, 20), @@ -168,24 +170,22 @@ def test_read_dta2(self): datetime(1955, 1, 1), datetime(1955, 7, 1), datetime(1955, 1, 1), - datetime(2, 1, 1) + datetime(2, 1, 1), ), - ( - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - ) + (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT), + ], + columns=[ + "datetime_c", + "datetime_big_c", + "date", + "weekly_date", + "monthly_date", + "quarterly_date", + "half_yearly_date", + "yearly_date", ], - columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date', - 'monthly_date', 'quarterly_date', 'half_yearly_date', - 'yearly_date'] ) - expected['yearly_date'] = expected['yearly_date'].astype('O') + expected["yearly_date"] = expected["yearly_date"].astype("O") with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -204,15 +204,11 @@ def test_read_dta2(self): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, - check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, - check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, - check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) - @pytest.mark.parametrize( - 'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117']) + @pytest.mark.parametrize("file", ["dta3_113", "dta3_114", "dta3_115", "dta3_117"]) def test_read_dta3(self, file): file = getattr(self, file) @@ -221,13 +217,12 @@ def test_read_dta3(self, file): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int16) - expected['quarter'] = expected['quarter'].astype(np.int8) + expected["year"] = expected["year"].astype(np.int16) + expected["quarter"] = expected["quarter"].astype(np.int8) tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - 'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117']) + @pytest.mark.parametrize("file", ["dta4_113", "dta4_114", "dta4_115", "dta4_117"]) def test_read_dta4(self, file): file = getattr(self, file) @@ -244,14 +239,21 @@ def test_read_dta4(self, file): ["seven", "four", 7, np.nan, "seven"], ["eight", "three", 8, np.nan, "eight"], 
["nine", "two", 9, np.nan, "nine"], - ["ten", "one", "ten", np.nan, "ten"] + ["ten", "one", "ten", np.nan, "ten"], ], - columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', - 'labeled_with_missings', 'float_labelled']) + columns=[ + "fully_labeled", + "fully_labeled2", + "incompletely_labeled", + "labeled_with_missings", + "float_labelled", + ], + ) # these are all categoricals - expected = pd.concat([expected[col].astype('category') - for col in expected], axis=1) + expected = pd.concat( + [expected[col].astype("category") for col in expected], axis=1 + ) # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected, check_categorical=False) @@ -265,105 +267,122 @@ def test_read_dta12(self): [3, "cba", "qwertywertyqwerty"], [93, "", "strl"], ], - columns=['x', 'y', 'z']) + columns=["x", "y", "z"], + ) tm.assert_frame_equal(parsed_117, expected, check_dtype=False) def test_read_dta18(self): parsed_118 = self.read_dta(self.dta22_118) - parsed_118["Bytes"] = parsed_118["Bytes"].astype('O') + parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( - [['Cat', 'Bogota', 'Bogotá', 1, 1.0, 'option b Ünicode', 1.0], - ['Dog', 'Boston', 'Uzunköprü', np.nan, np.nan, np.nan, np.nan], - ['Plane', 'Rome', 'Tromsø', 0, 0.0, 'option a', 0.0], - ['Potato', 'Tokyo', 'Elâzığ', -4, 4.0, 4, 4], - ['', '', '', 0, 0.3332999, 'option a', 1 / 3.] - ], - columns=['Things', 'Cities', 'Unicode_Cities_Strl', - 'Ints', 'Floats', 'Bytes', 'Longs']) + [ + ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0], + ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan], + ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0], + ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4], + ["", "", "", 0, 0.3332999, "option a", 1 / 3.0], + ], + columns=[ + "Things", + "Cities", + "Unicode_Cities_Strl", + "Ints", + "Floats", + "Bytes", + "Longs", + ], + ) expected["Floats"] = expected["Floats"].astype(np.float32) for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) with StataReader(self.dta22_118) as rdr: vl = rdr.variable_labels() - vl_expected = {'Unicode_Cities_Strl': - 'Here are some strls with Ünicode chars', - 'Longs': 'long data', - 'Things': 'Here are some things', - 'Bytes': 'byte data', - 'Ints': 'int data', - 'Cities': 'Here are some cities', - 'Floats': 'float data'} + vl_expected = { + "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", + "Longs": "long data", + "Things": "Here are some things", + "Bytes": "byte data", + "Ints": "int data", + "Cities": "Here are some cities", + "Floats": "float data", + } tm.assert_dict_equal(vl, vl_expected) - assert rdr.data_label == 'This is a Ünicode data label' + assert rdr.data_label == "This is a Ünicode data label" def test_read_write_dta5(self): - original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], - columns=['float_miss', 'double_miss', 'byte_miss', - 'int_miss', 'long_miss']) - original.index.name = 'index' + original = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) + original.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) def test_write_dta6(self): original = self.read_csv(self.csv3) - original.index.name = 'index' + 
original.index.name = "index" original.index = original.index.astype(np.int32) - original['year'] = original['year'].astype(np.int32) - original['quarter'] = original['quarter'].astype(np.int32) + original["year"] = original["year"].astype(np.int32) + original["quarter"] = original["quarter"].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original, check_index_type=False) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_write_dta10(self, version): - original = DataFrame(data=[["string", "object", 1, 1.1, - np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', - 'floating', 'datetime']) + original = DataFrame( + data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], + columns=["string", "object", "integer", "floating", "datetime"], + ) original["object"] = Series(original["object"], dtype=object) - original.index.name = 'index' + original.index.name = "index" original.index = original.index.astype(np.int32) - original['integer'] = original['integer'].astype(np.int32) + original["integer"] = original["integer"].astype(np.int32) with tm.ensure_clean() as path: - original.to_stata(path, {'datetime': 'tc'}, version=version) + original.to_stata(path, {"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original, check_index_type=False) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: - df = DataFrame(np.random.randn(10, 2), columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.to_stata(path) def test_write_preserves_original(self): # 9795 np.random.seed(423) - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) - df.loc[2, 'a':'c'] = np.nan + df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + df.loc[2, "a":"c"] = np.nan df_copy = df.copy() with tm.ensure_clean() as path: df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) with tm.assert_produces_warning(FutureWarning): - encoded = read_stata(self.dta_encoding, encoding='latin-1') + encoded = read_stata(self.dta_encoding, encoding="latin-1") result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -372,19 +391,27 @@ def test_encoding(self, version): with tm.ensure_clean() as path: with tm.assert_produces_warning(FutureWarning): - encoded.to_stata(path, write_index=False, version=version, - encoding='latin-1') + encoded.to_stata( + path, write_index=False, version=version, encoding="latin-1" + ) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) def test_read_write_dta11(self): - original = DataFrame([(1, 2, 3, 4)], - columns=['good', 'b\u00E4d', '8number', - 'astringwithmorethan32characters______']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['good', 'b_d', '_8number', - 'astringwithmorethan32characters_']) - 
formatted.index.name = 'index' + original = DataFrame( + [(1, 2, 3, 4)], + columns=[ + "good", + "b\u00E4d", + "8number", + "astringwithmorethan32characters______", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4)], + columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"], + ) + formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: @@ -392,216 +419,217 @@ def test_read_write_dta11(self): original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_write_dta12(self, version): - original = DataFrame([(1, 2, 3, 4, 5, 6)], - columns=['astringwithmorethan32characters_1', - 'astringwithmorethan32characters_2', - '+', - '-', - 'short', - 'delete']) - formatted = DataFrame([(1, 2, 3, 4, 5, 6)], - columns=['astringwithmorethan32characters_', - '_0astringwithmorethan32character', - '_', - '_1_', - '_short', - '_delete']) - formatted.index.name = 'index' + original = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_1", + "astringwithmorethan32characters_2", + "+", + "-", + "short", + "delete", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_", + "_0astringwithmorethan32character", + "_", + "_1_", + "_short", + "_delete", + ], + ) + formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always', InvalidColumnName) + warnings.simplefilter("always", InvalidColumnName) original.to_stata(path, None, version=version) # should get a warning for that format. 
assert len(w) == 1 written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) def test_read_write_dta13(self): s1 = Series(2 ** 9, dtype=np.int16) s2 = Series(2 ** 17, dtype=np.int32) s3 = Series(2 ** 33, dtype=np.int64) - original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3}) - original.index.name = 'index' + original = DataFrame({"int16": s1, "int32": s2, "int64": s3}) + original.index.name = "index" formatted = original - formatted['int64'] = formatted['int64'].astype(np.float64) + formatted["int64"] = formatted["int64"].astype(np.float64) with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) @pytest.mark.parametrize( - 'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117']) + "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] + ) def test_read_write_reread_dta14(self, file, parsed_114, version): file = getattr(self, file) parsed = self.read_dta(file) - parsed.index.name = 'index' + parsed.index.name = "index" expected = self.read_csv(self.csv14) - cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + cols = ["byte_", "int_", "long_", "float_", "double_"] for col in cols: expected[col] = expected[col]._convert(datetime=True, numeric=True) - expected['float_'] = expected['float_'].astype(np.float32) - expected['date_td'] = pd.to_datetime( - expected['date_td'], errors='coerce') + expected["float_"] = expected["float_"].astype(np.float32) + expected["date_td"] = pd.to_datetime(expected["date_td"], errors="coerce") tm.assert_frame_equal(parsed_114, parsed) with tm.ensure_clean() as path: - parsed_114.to_stata(path, {'date_td': 'td'}, version=version) + parsed_114.to_stata(path, {"date_td": "td"}, version=version) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), parsed_114) + tm.assert_frame_equal(written_and_read_again.set_index("index"), parsed_114) @pytest.mark.parametrize( - 'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117']) + "file", ["dta15_113", "dta15_114", "dta15_115", "dta15_117"] + ) def test_read_write_reread_dta15(self, file): expected = self.read_csv(self.csv15) - expected['byte_'] = expected['byte_'].astype(np.int8) - expected['int_'] = expected['int_'].astype(np.int16) - expected['long_'] = expected['long_'].astype(np.int32) - expected['float_'] = expected['float_'].astype(np.float32) - expected['double_'] = expected['double_'].astype(np.float64) - expected['date_td'] = expected['date_td'].apply( - datetime.strptime, args=('%Y-%m-%d',)) + expected["byte_"] = expected["byte_"].astype(np.int8) + expected["int_"] = expected["int_"].astype(np.int16) + expected["long_"] = expected["long_"].astype(np.int32) + expected["float_"] = expected["float_"].astype(np.float32) + expected["double_"] = expected["double_"].astype(np.float64) + expected["date_td"] = expected["date_td"].apply( + datetime.strptime, args=("%Y-%m-%d",) + ) file = getattr(self, file) parsed = self.read_dta(file) tm.assert_frame_equal(expected, parsed) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def 
test_timestamp_and_label(self, version): - original = DataFrame([(1,)], columns=['variable']) + original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) - data_label = 'This is a data file.' + data_label = "This is a data file." with tm.ensure_clean() as path: - original.to_stata(path, time_stamp=time_stamp, - data_label=data_label, - version=version) + original.to_stata( + path, time_stamp=time_stamp, data_label=data_label, version=version + ) with StataReader(path) as reader: - assert reader.time_stamp == '29 Feb 2000 14:21' + assert reader.time_stamp == "29 Feb 2000 14:21" assert reader.data_label == data_label - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_invalid_timestamp(self, version): - original = DataFrame([(1,)], columns=['variable']) - time_stamp = '01 Jan 2000, 00:00:00' + original = DataFrame([(1,)], columns=["variable"]) + time_stamp = "01 Jan 2000, 00:00:00" with tm.ensure_clean() as path: msg = "time_stamp should be datetime type" with pytest.raises(ValueError, match=msg): - original.to_stata(path, time_stamp=time_stamp, - version=version) + original.to_stata(path, time_stamp=time_stamp, version=version) def test_numeric_column_names(self): original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) - original.index.name = 'index' + original.index.name = "index" with tm.ensure_clean() as path: # should get a warning for that format. with tm.assert_produces_warning(InvalidColumnName): original.to_stata(path) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") columns = list(written_and_read_again.columns) convert_col_name = lambda x: int(x[1]) written_and_read_again.columns = map(convert_col_name, columns) tm.assert_frame_equal(original, written_and_read_again) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_nan_to_missing_value(self, version): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), dtype=np.float64) s1[::2] = np.nan s2[1::2] = np.nan - original = DataFrame({'s1': s1, 's2': s2}) - original.index.name = 'index' + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") tm.assert_frame_equal(written_and_read_again, original) def test_no_index(self): - columns = ['x', 'y'] - original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), - columns=columns) - original.index.name = 'index_not_written' + columns = ["x", "y"] + original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns) + original.index.name = "index_not_written" with tm.ensure_clean() as path: original.to_stata(path, write_index=False) written_and_read_again = self.read_dta(path) with pytest.raises(KeyError, match=original.index.name): - written_and_read_again['index_not_written'] + written_and_read_again["index_not_written"] def test_string_no_dates(self): - s1 = Series(['a', 'A longer string']) + s1 = Series(["a", "A longer string"]) s2 = Series([1.0, 2.0], dtype=np.float64) - original = DataFrame({'s1': s1, 's2': s2}) - original.index.name = 'index' + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name 
= "index" with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) def test_large_value_conversion(self): s0 = Series([1, 99], dtype=np.int8) s1 = Series([1, 127], dtype=np.int8) s2 = Series([1, 2 ** 15 - 1], dtype=np.int16) s3 = Series([1, 2 ** 63 - 1], dtype=np.int64) - original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3}) - original.index.name = 'index' + original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) + original.index.name = "index" with tm.ensure_clean() as path: with tm.assert_produces_warning(PossiblePrecisionLoss): original.to_stata(path) written_and_read_again = self.read_dta(path) modified = original.copy() - modified['s1'] = Series(modified['s1'], dtype=np.int16) - modified['s2'] = Series(modified['s2'], dtype=np.int32) - modified['s3'] = Series(modified['s3'], dtype=np.float64) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - modified) + modified["s1"] = Series(modified["s1"], dtype=np.int16) + modified["s2"] = Series(modified["s2"], dtype=np.int32) + modified["s3"] = Series(modified["s3"], dtype=np.float64) + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) def test_dates_invalid_column(self): original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) - original.index.name = 'index' + original.index.name = "index" with tm.ensure_clean() as path: with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path, {0: 'tc'}) + original.to_stata(path, {0: "tc"}) written_and_read_again = self.read_dta(path) modified = original.copy() - modified.columns = ['_0'] - tm.assert_frame_equal(written_and_read_again.set_index('index'), - modified) + modified.columns = ["_0"] + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) def test_105(self): # Data obtained from: # http://go.worldbank.org/ZXY29PVJ21 - dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") df = pd.read_stata(dpath) df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]] df0 = pd.DataFrame(df0) df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"] - df0['clustnum'] = df0["clustnum"].astype(np.int16) - df0['pri_schl'] = df0["pri_schl"].astype(np.int8) - df0['psch_num'] = df0["psch_num"].astype(np.int8) - df0['psch_dis'] = df0["psch_dis"].astype(np.float32) + df0["clustnum"] = df0["clustnum"].astype(np.int16) + df0["pri_schl"] = df0["pri_schl"].astype(np.int8) + df0["psch_num"] = df0["psch_num"].astype(np.int8) + df0["psch_dis"] = df0["psch_dis"].astype(np.float32) tm.assert_frame_equal(df.head(3), df0) def test_value_labels_old_format(self): @@ -609,45 +637,45 @@ def test_value_labels_old_format(self): # # Test that value_labels() returns an empty dict if the file format # predates supporting value labels. 
- dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") reader = StataReader(dpath) assert reader.value_labels() == {} reader.close() def test_date_export_formats(self): - columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] + columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"] conversions = {c: c for c in columns} data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) original = DataFrame([data], columns=columns) - original.index.name = 'index' - expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time - datetime(2006, 11, 20), # Day - datetime(2006, 11, 19), # Week - datetime(2006, 11, 1), # Month - datetime(2006, 10, 1), # Quarter year - datetime(2006, 7, 1), # Half year - datetime(2006, 1, 1)] # Year + original.index.name = "index" + expected_values = [ + datetime(2006, 11, 20, 23, 13, 20), # Time + datetime(2006, 11, 20), # Day + datetime(2006, 11, 19), # Week + datetime(2006, 11, 1), # Month + datetime(2006, 10, 1), # Quarter year + datetime(2006, 7, 1), # Half year + datetime(2006, 1, 1), + ] # Year expected = DataFrame([expected_values], columns=columns) - expected.index.name = 'index' + expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, conversions) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - expected) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) def test_write_missing_strings(self): original = DataFrame([["1"], [None]], columns=["foo"]) expected = DataFrame([["1"], [""]], columns=["foo"]) - expected.index.name = 'index' + expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - expected) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - @pytest.mark.parametrize('version', [114, 117]) - @pytest.mark.parametrize('byteorder', ['>', '<']) + @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) s1 = Series([0, 1, 100], dtype=np.uint8) @@ -657,19 +685,27 @@ def test_bool_uint(self, byteorder, version): s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32) s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32) - original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3, - 's4': s4, 's5': s5, 's6': s6}) - original.index.name = 'index' + original = DataFrame( + {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6} + ) + original.index.name = "index" expected = original.copy() - expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32, - np.int32, np.float64) + expected_types = ( + np.int8, + np.int8, + np.int16, + np.int16, + np.int32, + np.int32, + np.float64, + ) for c, t in zip(expected.columns, expected_types): expected[c] = expected[c].astype(t) with tm.ensure_clean() as path: original.to_stata(path, byteorder=byteorder, version=version) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") tm.assert_frame_equal(written_and_read_again, expected) def test_variable_labels(self): @@ -677,8 +713,8 @@ def test_variable_labels(self): sr_115 = rdr.variable_labels() with StataReader(self.dta16_117) as rdr: sr_117 = rdr.variable_labels() - keys = 
('var1', 'var2', 'var3') - labels = ('label1', 'label2', 'label3') + keys = ("var1", "var2", "var3") + labels = ("label1", "label2", "label3") for k, v in sr_115.items(): assert k in sr_117 assert v == sr_117[k] @@ -689,8 +725,9 @@ def test_minimal_size_col(self): str_lens = (1, 100, 244) s = {} for str_len in str_lens: - s['s' + str(str_len)] = Series(['a' * str_len, - 'b' * str_len, 'c' * str_len]) + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) original = DataFrame(s) with tm.ensure_clean() as path: original.to_stata(path, write_index=False) @@ -707,26 +744,29 @@ def test_excessively_long_string(self): str_lens = (1, 244, 500) s = {} for str_len in str_lens: - s['s' + str(str_len)] = Series(['a' * str_len, - 'b' * str_len, 'c' * str_len]) + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) original = DataFrame(s) - msg = (r"Fixed width strings in Stata \.dta files are limited to 244" - r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" - r" this restriction\. Use the\n'version=117' parameter to write" - r" the newer \(Stata 13 and later\) format\.") + msg = ( + r"Fixed width strings in Stata \.dta files are limited to 244" + r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" + r" this restriction\. Use the\n'version=117' parameter to write" + r" the newer \(Stata 13 and later\) format\." + ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: original.to_stata(path) def test_missing_value_generator(self): - types = ('b', 'h', 'l') - df = DataFrame([[0.0]], columns=['float_']) + types = ("b", "h", "l") + df = DataFrame([[0.0]], columns=["float_"]) with tm.ensure_clean() as path: df.to_stata(path) with StataReader(path) as rdr: valid_range = rdr.VALID_RANGE - expected_values = ['.' + chr(97 + i) for i in range(26)] - expected_values.insert(0, '.') + expected_values = ["." + chr(97 + i) for i in range(26)] + expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] for i in range(0, 27): @@ -734,23 +774,24 @@ def test_missing_value_generator(self): assert val.string == expected_values[i] # Test extremes for floats - val = StataMissingValue(struct.unpack('= 1.5 """ - return [v[field] for v in rcParams['axes.prop_cycle']] + return [v[field] for v in rcParams["axes.prop_cycle"]] -def _check_plot_works(f, filterwarnings='always', **kwargs): +def _check_plot_works(f, filterwarnings="always", **kwargs): import matplotlib.pyplot as plt + ret = None with warnings.catch_warnings(): warnings.simplefilter(filterwarnings) try: try: - fig = kwargs['figure'] + fig = kwargs["figure"] except KeyError: fig = plt.gcf() plt.clf() - ax = kwargs.get('ax', fig.add_subplot(211)) # noqa + ax = kwargs.get("ax", fig.add_subplot(211)) # noqa ret = f(**kwargs) assert_is_valid_plot_return_object(ret) try: - kwargs['ax'] = fig.add_subplot(212) + kwargs["ax"] = fig.add_subplot(212) ret = f(**kwargs) except Exception: pass diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 65e1d690d5f8f6..51f2abb6cc2f4d 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -4,30 +4,35 @@ def test_matplotlib_backend_error(): - msg = ('matplotlib is required for plotting when the default backend ' - '"matplotlib" is selected.') + msg = ( + "matplotlib is required for plotting when the default backend " + '"matplotlib" is selected.' 
+ ) try: import matplotlib # noqa except ImportError: with pytest.raises(ImportError, match=msg): - pandas.set_option('plotting.backend', 'matplotlib') + pandas.set_option("plotting.backend", "matplotlib") def test_backend_is_not_module(): - msg = ('"not_an_existing_module" does not seem to be an installed module. ' - 'A pandas plotting backend must be a module that can be imported') + msg = ( + '"not_an_existing_module" does not seem to be an installed module. ' + "A pandas plotting backend must be a module that can be imported" + ) with pytest.raises(ValueError, match=msg): - pandas.set_option('plotting.backend', 'not_an_existing_module') + pandas.set_option("plotting.backend", "not_an_existing_module") def test_backend_is_correct(monkeypatch): - monkeypatch.setattr('pandas.core.config_init.importlib.import_module', - lambda name: None) - pandas.set_option('plotting.backend', 'correct_backend') - assert pandas.get_option('plotting.backend') == 'correct_backend' + monkeypatch.setattr( + "pandas.core.config_init.importlib.import_module", lambda name: None + ) + pandas.set_option("plotting.backend", "correct_backend") + assert pandas.get_option("plotting.backend") == "correct_backend" # Restore backend for other tests (matplotlib can be not installed) try: - pandas.set_option('plotting.backend', 'matplotlib') + pandas.set_option("plotting.backend", "matplotlib") except ImportError: pass diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index de1ac0c2931890..cab0efe53f1fc4 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -20,64 +20,63 @@ @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @pytest.mark.slow def test_boxplot_legacy1(self): - df = DataFrame(np.random.randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) - df['indic'] = ['foo', 'bar'] * 3 - df['indic2'] = ['foo', 'bar', 'foo'] * 2 - - _check_plot_works(df.boxplot, return_type='dict') - _check_plot_works(df.boxplot, column=[ - 'one', 'two'], return_type='dict') + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + df["indic"] = ["foo", "bar"] * 3 + df["indic2"] = ["foo", "bar", "foo"] * 2 + + _check_plot_works(df.boxplot, return_type="dict") + _check_plot_works(df.boxplot, column=["one", "two"], return_type="dict") # _check_plot_works adds an ax so catch warning. 
see GH #13188 with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, column=['one', 'two'], - by='indic') - _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + _check_plot_works(df.boxplot, column=["one", "two"], by="indic") + _check_plot_works(df.boxplot, column="one", by=["indic", "indic2"]) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic') + _check_plot_works(df.boxplot, by="indic") with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by=['indic', 'indic2']) - _check_plot_works(plotting._core.boxplot, data=df['one'], - return_type='dict') - _check_plot_works(df.boxplot, notch=1, return_type='dict') + _check_plot_works(df.boxplot, by=["indic", "indic2"]) + _check_plot_works(plotting._core.boxplot, data=df["one"], return_type="dict") + _check_plot_works(df.boxplot, notch=1, return_type="dict") with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic', notch=1) + _check_plot_works(df.boxplot, by="indic", notch=1) @pytest.mark.slow def test_boxplot_legacy2(self): - df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = Series(['A'] * 10) + df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = Series(["A"] * 10) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='X') + _check_plot_works(df.boxplot, by="X") # When ax is supplied and required number of axes is 1, # passed ax should be used: fig, ax = self.plt.subplots() - axes = df.boxplot('Col1', by='X', ax=ax) + axes = df.boxplot("Col1", by="X", ax=ax) ax_axes = ax.axes assert ax_axes is axes fig, ax = self.plt.subplots() - axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') + axes = df.groupby("Y").boxplot(ax=ax, return_type="axes") ax_axes = ax.axes - assert ax_axes is axes['A'] + assert ax_axes is axes["A"] # Multiple columns with an ax argument should use same figure fig, ax = self.plt.subplots() with tm.assert_produces_warning(UserWarning): - axes = df.boxplot(column=['Col1', 'Col2'], - by='X', ax=ax, return_type='axes') - assert axes['Col1'].get_figure() is fig + axes = df.boxplot( + column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" + ) + assert axes["Col1"].get_figure() is fig # When by is None, check that all relevant lines are present in the # dict fig, ax = self.plt.subplots() - d = df.boxplot(ax=ax, return_type='dict') + d = df.boxplot(ax=ax, return_type="dict") lines = list(itertools.chain.from_iterable(d.values())) assert len(ax.get_lines()) == len(lines) @@ -92,51 +91,52 @@ def test_boxplot_return_type_legacy(self): # API change in https://github.com/pandas-dev/pandas/pull/7096 import matplotlib as mpl # noqa - df = DataFrame(np.random.randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) with pytest.raises(ValueError): - df.boxplot(return_type='NOTATYPE') + df.boxplot(return_type="NOTATYPE") result = df.boxplot() - self._check_box_return_type(result, 'axes') + self._check_box_return_type(result, "axes") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='dict') - self._check_box_return_type(result, 'dict') + result = df.boxplot(return_type="dict") + 
self._check_box_return_type(result, "dict") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='axes') - self._check_box_return_type(result, 'axes') + result = df.boxplot(return_type="axes") + self._check_box_return_type(result, "axes") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='both') - self._check_box_return_type(result, 'both') + result = df.boxplot(return_type="both") + self._check_box_return_type(result, "both") @pytest.mark.slow def test_boxplot_axis_limits(self): - def _check_ax_limits(col, ax): y_min, y_max = ax.get_ylim() assert y_min <= col.min() assert y_max >= col.max() df = self.hist_df.copy() - df['age'] = np.random.randint(1, 20, df.shape[0]) + df["age"] = np.random.randint(1, 20, df.shape[0]) # One full row - height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) + height_ax, weight_ax = df.boxplot(["height", "weight"], by="category") + _check_ax_limits(df["height"], height_ax) + _check_ax_limits(df["weight"], weight_ax) assert weight_ax._sharey == height_ax # Two rows, one partial - p = df.boxplot(['height', 'weight', 'age'], by='category') + p = df.boxplot(["height", "weight", "age"], by="category") height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] dummy_ax = p[1, 1] - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - _check_ax_limits(df['age'], age_ax) + _check_ax_limits(df["height"], height_ax) + _check_ax_limits(df["weight"], weight_ax) + _check_ax_limits(df["age"], age_ax) assert weight_ax._sharey == height_ax assert age_ax._sharey == height_ax assert dummy_ax._sharey is None @@ -145,60 +145,54 @@ def _check_ax_limits(col, ax): def test_boxplot_empty_column(self): df = DataFrame(np.random.randn(20, 4)) df.loc[:, 0] = np.nan - _check_plot_works(df.boxplot, return_type='axes') + _check_plot_works(df.boxplot, return_type="axes") @pytest.mark.slow def test_figsize(self): - df = DataFrame(np.random.rand(10, 5), - columns=['A', 'B', 'C', 'D', 'E']) - result = df.boxplot(return_type='axes', figsize=(12, 8)) + df = DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) + result = df.boxplot(return_type="axes", figsize=(12, 8)) assert result.figure.bbox_inches.width == 12 assert result.figure.bbox_inches.height == 8 def test_fontsize(self): df = DataFrame({"a": [1, 2, 3, 4, 5, 6]}) - self._check_ticks_props(df.boxplot("a", fontsize=16), - xlabelsize=16, ylabelsize=16) + self._check_ticks_props( + df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16 + ) @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - @pytest.mark.slow def test_boxplot_legacy1(self): - grouped = self.hist_df.groupby(by='gender') + grouped = self.hist_df.groupby(by="gender") with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow def test_boxplot_legacy2(self): tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), 
index=MultiIndex.from_tuples(tuples)) grouped = df.groupby(level=1) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow def test_boxplot_legacy3(self): tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) grouped = df.unstack(level=1).groupby(level=0, axis=1) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow @@ -207,22 +201,22 @@ def test_grouped_plot_fignums(self): weight = Series(np.random.normal(166, 20, size=n)) height = Series(np.random.normal(60, 10, size=n)) with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) - gb = df.groupby('gender') + gender = np.random.choice(["male", "female"], size=n) + df = DataFrame({"height": height, "weight": weight, "gender": gender}) + gb = df.groupby("gender") res = gb.plot() assert len(self.plt.get_fignums()) == 2 assert len(res) == 2 tm.close() - res = gb.boxplot(return_type='axes') + res = gb.boxplot(return_type="axes") assert len(self.plt.get_fignums()) == 1 assert len(res) == 2 tm.close() # now works with GH 5610 as gender is excluded - res = df.groupby('gender').hist() + res = df.groupby("gender").hist() tm.close() @pytest.mark.slow @@ -230,36 +224,34 @@ def test_grouped_box_return_type(self): df = self.hist_df # old style: return_type=None - result = df.boxplot(by='gender') + result = df.boxplot(by="gender") assert isinstance(result, np.ndarray) self._check_box_return_type( - result, None, - expected_keys=['height', 'weight', 'category']) + result, None, expected_keys=["height", "weight", "category"] + ) # now for groupby - result = df.groupby('gender').boxplot(return_type='dict') - self._check_box_return_type( - result, 'dict', expected_keys=['Male', 'Female']) + result = df.groupby("gender").boxplot(return_type="dict") + self._check_box_return_type(result, "dict", expected_keys=["Male", "Female"]) - columns2 = 'X B C D A G Y N Q O'.split() + columns2 = "X B C D A G Y N Q O".split() df2 = DataFrame(random.randn(50, 10), columns=columns2) - categories2 = 'A B C D E F G H I J'.split() - df2['category'] = categories2 * 5 + categories2 = "A B C D E F G H I J".split() + df2["category"] = categories2 * 5 - for t in ['dict', 'axes', 'both']: - returned = df.groupby('classroom').boxplot(return_type=t) - self._check_box_return_type( - returned, t, expected_keys=['A', 'B', 'C']) + for t in ["dict", "axes", "both"]: + returned = df.groupby("classroom").boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=["A", "B", "C"]) - returned = 
df.boxplot(by='classroom', return_type=t) + returned = df.boxplot(by="classroom", return_type=t) self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category']) + returned, t, expected_keys=["height", "weight", "category"] + ) - returned = df2.groupby('category').boxplot(return_type=t) + returned = df2.groupby("category").boxplot(return_type=t) self._check_box_return_type(returned, t, expected_keys=categories2) - returned = df2.boxplot(by='category', return_type=t) + returned = df2.boxplot(by="category", return_type=t) self._check_box_return_type(returned, t, expected_keys=columns2) @pytest.mark.slow @@ -268,79 +260,92 @@ def test_grouped_box_layout(self): msg = "Layout of 1x1 must be larger than required size 2" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['weight', 'height'], by=df.gender, - layout=(1, 1)) + df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1)) msg = "The 'layout' keyword is not supported when 'by' is None" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['height', 'weight', 'category'], - layout=(2, 1), return_type='dict') + df.boxplot( + column=["height", "weight", "category"], + layout=(2, 1), + return_type="dict", + ) msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['weight', 'height'], by=df.gender, - layout=(-1, -1)) + df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1)) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('gender').boxplot, - column='height', return_type='dict') + box = _check_plot_works( + df.groupby("gender").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) # GH 6769 with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('classroom').boxplot, - column='height', return_type='dict') + box = _check_plot_works( + df.groupby("classroom").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) # GH 5897 - axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', - return_type='axes') + axes = df.boxplot( + column=["height", "weight", "category"], by="gender", return_type="axes" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - for ax in [axes['height']]: + for ax in [axes["height"]]: self._check_visible(ax.get_xticklabels(), visible=False) self._check_visible([ax.xaxis.get_label()], visible=False) - for ax in [axes['weight'], axes['category']]: + for ax in [axes["weight"], axes["category"]]: self._check_visible(ax.get_xticklabels()) self._check_visible([ax.xaxis.get_label()]) - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], return_type='dict') + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - 
layout=(3, 2), return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, 2), + return_type="dict", + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, -1), return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, -1), + return_type="dict", + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(4, 1)) + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(4, 1) + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(-1, 1)) + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(-1, 1) + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], layout=(1, 4), - return_type='dict') + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], layout=(1, 4), return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) - box = df.groupby('classroom').boxplot( # noqa - column=['height', 'weight', 'category'], layout=(1, -1), - return_type='dict') + box = df.groupby("classroom").boxplot( # noqa + column=["height", "weight", "category"], layout=(1, -1), return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) @pytest.mark.slow @@ -355,15 +360,17 @@ def test_grouped_box_multiple_axes(self): # which has earlier alphabetical order with tm.assert_produces_warning(UserWarning): fig, axes = self.plt.subplots(2, 2) - df.groupby('category').boxplot( - column='height', return_type='axes', ax=axes) - self._check_axes_shape(self.plt.gcf().axes, - axes_num=4, layout=(2, 2)) + df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) fig, axes = self.plt.subplots(2, 3) with tm.assert_produces_warning(UserWarning): - returned = df.boxplot(column=['height', 'weight', 'category'], - by='gender', return_type='axes', ax=axes[0]) + returned = df.boxplot( + column=["height", "weight", "category"], + by="gender", + return_type="axes", + ax=axes[0], + ) returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[0]) @@ -371,9 +378,9 @@ def test_grouped_box_multiple_axes(self): # draw on second row with tm.assert_produces_warning(UserWarning): - returned = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], - return_type='axes', ax=axes[1]) + returned = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="axes", ax=axes[1] + ) returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[1]) @@ -383,9 +390,10 @@ def test_grouped_box_multiple_axes(self): fig, axes = self.plt.subplots(2, 3) # pass different number of axes from required with tm.assert_produces_warning(UserWarning): - axes = df.groupby('classroom').boxplot(ax=axes) + axes = df.groupby("classroom").boxplot(ax=axes) def 
test_fontsize(self): df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}) - self._check_ticks_props(df.boxplot("a", by="b", fontsize=16), - xlabelsize=16, ylabelsize=16) + self._check_ticks_props( + df.boxplot("a", by="b", fontsize=16), xlabelsize=16, ylabelsize=16 + ) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 92d207e46b7ab8..35d12706f05902 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -13,7 +13,9 @@ import pandas.util.testing as tm from pandas.plotting import ( - deregister_matplotlib_converters, register_matplotlib_converters) + deregister_matplotlib_converters, + register_matplotlib_converters, +) from pandas.tseries.offsets import Day, Micro, Milli, Second try: @@ -23,7 +25,7 @@ # causing an improprer skip pass -pytest.importorskip('matplotlib.pyplot') +pytest.importorskip("matplotlib.pyplot") def test_initial_warning(): @@ -33,35 +35,35 @@ def test_initial_warning(): "fig, ax = plt.subplots(); " "ax.plot(s.index, s.values)" ) - call = [sys.executable, '-c', code] + call = [sys.executable, "-c", code] out = subprocess.check_output(call, stderr=subprocess.STDOUT).decode() - assert 'Using an implicitly' in out + assert "Using an implicitly" in out def test_timtetonum_accepts_unicode(): - assert (converter.time2num("00:01") == converter.time2num("00:01")) + assert converter.time2num("00:01") == converter.time2num("00:01") class TestRegistration: - def test_register_by_default(self): # Run in subprocess to ensure a clean state - code = ("'import matplotlib.units; " - "import pandas as pd; " - "units = dict(matplotlib.units.registry); " - "assert pd.Timestamp in units)'") - call = [sys.executable, '-c', code] + code = ( + "'import matplotlib.units; " + "import pandas as pd; " + "units = dict(matplotlib.units.registry); " + "assert pd.Timestamp in units)'" + ) + call = [sys.executable, "-c", code] assert subprocess.check_call(call) == 0 def test_warns(self): plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warning" state, in case this isn't the first test run converter._WARN = True - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: ax.plot(s.index, s.values) plt.close() @@ -70,7 +72,7 @@ def test_warns(self): def test_registering_no_warning(self): plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run @@ -83,7 +85,7 @@ def test_registering_no_warning(self): def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run converter._WARN = True with tm.assert_produces_warning(None) as w: @@ -95,8 +97,7 @@ def test_matplotlib_formatters(self): units = pytest.importorskip("matplotlib.units") assert Timestamp in units.registry - ctx = cf.option_context("plotting.matplotlib.register_converters", - False) + ctx = cf.option_context("plotting.matplotlib.register_converters", False) with ctx: assert Timestamp 
not in units.registry @@ -104,10 +105,9 @@ def test_matplotlib_formatters(self): def test_option_no_warning(self): pytest.importorskip("matplotlib.pyplot") - ctx = cf.option_context("plotting.matplotlib.register_converters", - False) + ctx = cf.option_context("plotting.matplotlib.register_converters", False) plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() converter._WARN = True @@ -155,15 +155,14 @@ def test_registry_resets(self): def test_old_import_warns(self): with tm.assert_produces_warning(FutureWarning) as w: from pandas.tseries import converter + converter.register() assert len(w) - assert ('pandas.plotting.register_matplotlib_converters' in - str(w[0].message)) + assert "pandas.plotting.register_matplotlib_converters" in str(w[0].message) class TestDateTimeConverter: - def setup_method(self, method): self.dtc = converter.DatetimeConverter() self.tc = converter.TimeFormatter(None) @@ -171,14 +170,14 @@ def setup_method(self, method): def test_convert_accepts_unicode(self): r1 = self.dtc.convert("12:22", None, None) r2 = self.dtc.convert("12:22", None, None) - assert (r1 == r2), "DatetimeConverter.convert should accept unicode" + assert r1 == r2, "DatetimeConverter.convert should accept unicode" def test_conversion(self): - rs = self.dtc.convert(['2012-1-1'], None, None)[0] + rs = self.dtc.convert(["2012-1-1"], None, None)[0] xp = datetime(2012, 1, 1).toordinal() assert rs == xp - rs = self.dtc.convert('2012-1-1', None, None) + rs = self.dtc.convert("2012-1-1", None, None) assert rs == xp rs = self.dtc.convert(date(2012, 1, 1), None, None) @@ -187,31 +186,36 @@ def test_conversion(self): rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None) assert rs == xp - rs = self.dtc.convert('2012-1-1', None, None) + rs = self.dtc.convert("2012-1-1", None, None) assert rs == xp - rs = self.dtc.convert(Timestamp('2012-1-1'), None, None) + rs = self.dtc.convert(Timestamp("2012-1-1"), None, None) assert rs == xp # also testing datetime64 dtype (GH8614) - rs = self.dtc.convert(np_datetime64_compat('2012-01-01'), None, None) + rs = self.dtc.convert(np_datetime64_compat("2012-01-01"), None, None) assert rs == xp - rs = self.dtc.convert(np_datetime64_compat( - '2012-01-01 00:00:00+0000'), None, None) + rs = self.dtc.convert( + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, None + ) assert rs == xp - rs = self.dtc.convert(np.array([ - np_datetime64_compat('2012-01-01 00:00:00+0000'), - np_datetime64_compat('2012-01-02 00:00:00+0000')]), None, None) + rs = self.dtc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + None, + ) assert rs[0] == xp # we have a tz-aware date (constructed to that when we turn to utc it # is the same as our sample) - ts = (Timestamp('2012-01-01') - .tz_localize('UTC') - .tz_convert('US/Eastern') - ) + ts = Timestamp("2012-01-01").tz_localize("UTC").tz_convert("US/Eastern") rs = self.dtc.convert(ts, None, None) assert rs == xp @@ -221,20 +225,19 @@ def test_conversion(self): rs = self.dtc.convert(Index([ts - Day(1), ts]), None, None) assert rs[1] == xp - rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), - None, None) + rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), None, None) assert rs[1] == xp def test_conversion_float(self): decimals = 9 - rs = self.dtc.convert( - Timestamp('2012-1-1 
01:02:03', tz='UTC'), None, None) - xp = converter.dates.date2num(Timestamp('2012-1-1 01:02:03', tz='UTC')) + rs = self.dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) + xp = converter.dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert( - Timestamp('2012-1-1 09:02:03', tz='Asia/Hong_Kong'), None, None) + Timestamp("2012-1-1 09:02:03", tz="Asia/Hong_Kong"), None, None + ) tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) @@ -258,13 +261,16 @@ def test_conversion_outofbounds_datetime(self): xp = converter.dates.date2num(values[0]) assert rs == xp - @pytest.mark.parametrize('time,format_expected', [ - (0, '00:00'), # time2num(datetime.time.min) - (86399.999999, '23:59:59.999999'), # time2num(datetime.time.max) - (90000, '01:00'), - (3723, '01:02:03'), - (39723.2, '11:02:03.200') - ]) + @pytest.mark.parametrize( + "time,format_expected", + [ + (0, "00:00"), # time2num(datetime.time.min) + (86399.999999, "23:59:59.999999"), # time2num(datetime.time.max) + (90000, "01:00"), + (3723, "01:02:03"), + (39723.2, "11:02:03.200"), + ], + ) def test_time_formatter(self, time, format_expected): # issue 18478 result = self.tc(time) @@ -273,7 +279,7 @@ def test_time_formatter(self, time, format_expected): def test_dateindex_conversion(self): decimals = 9 - for freq in ('B', 'L', 'S'): + for freq in ("B", "L", "S"): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) xp = converter.dates.date2num(dateindex._mpl_repr()) @@ -284,18 +290,17 @@ def _assert_less(ts1, ts2): val1 = self.dtc.convert(ts1, None, None) val2 = self.dtc.convert(ts2, None, None) if not val1 < val2: - raise AssertionError('{0} is not less than {1}.'.format(val1, - val2)) + raise AssertionError("{0} is not less than {1}.".format(val1, val2)) # Matplotlib's time representation using floats cannot distinguish # intervals smaller than ~10 microsecond in the common range of years. 
- ts = Timestamp('2012-1-1') + ts = Timestamp("2012-1-1") _assert_less(ts, ts + Second()) _assert_less(ts, ts + Milli()) _assert_less(ts, ts + Micro(50)) def test_convert_nested(self): - inner = [Timestamp('2017-01-01'), Timestamp('2017-01-02')] + inner = [Timestamp("2017-01-01"), Timestamp("2017-01-02")] data = [inner, inner] result = self.dtc.convert(data, None, None) expected = [self.dtc.convert(x, None, None) for x in data] @@ -303,7 +308,6 @@ def test_convert_nested(self): class TestPeriodConverter: - def setup_method(self, method): self.pc = converter.PeriodConverter() @@ -311,7 +315,7 @@ class Axis: pass self.axis = Axis() - self.axis.freq = 'D' + self.axis.freq = "D" def test_convert_accepts_unicode(self): r1 = self.pc.convert("2012-1-1", None, self.axis) @@ -319,11 +323,11 @@ def test_convert_accepts_unicode(self): assert r1 == r2 def test_conversion(self): - rs = self.pc.convert(['2012-1-1'], None, self.axis)[0] - xp = Period('2012-1-1').ordinal + rs = self.pc.convert(["2012-1-1"], None, self.axis)[0] + xp = Period("2012-1-1").ordinal assert rs == xp - rs = self.pc.convert('2012-1-1', None, self.axis) + rs = self.pc.convert("2012-1-1", None, self.axis) assert rs == xp rs = self.pc.convert([date(2012, 1, 1)], None, self.axis)[0] @@ -332,24 +336,30 @@ def test_conversion(self): rs = self.pc.convert(date(2012, 1, 1), None, self.axis) assert rs == xp - rs = self.pc.convert([Timestamp('2012-1-1')], None, self.axis)[0] + rs = self.pc.convert([Timestamp("2012-1-1")], None, self.axis)[0] assert rs == xp - rs = self.pc.convert(Timestamp('2012-1-1'), None, self.axis) + rs = self.pc.convert(Timestamp("2012-1-1"), None, self.axis) assert rs == xp - rs = self.pc.convert( - np_datetime64_compat('2012-01-01'), None, self.axis) + rs = self.pc.convert(np_datetime64_compat("2012-01-01"), None, self.axis) assert rs == xp rs = self.pc.convert( - np_datetime64_compat('2012-01-01 00:00:00+0000'), None, self.axis) + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, self.axis + ) assert rs == xp - rs = self.pc.convert(np.array([ - np_datetime64_compat('2012-01-01 00:00:00+0000'), - np_datetime64_compat('2012-01-02 00:00:00+0000')]), - None, self.axis) + rs = self.pc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + self.axis, + ) assert rs[0] == xp def test_integer_passthrough(self): @@ -359,7 +369,7 @@ def test_integer_passthrough(self): assert rs == xp def test_convert_nested(self): - data = ['2012-1-1', '2012-1-2'] + data = ["2012-1-1", "2012-1-2"] r1 = self.pc.convert([data, data], None, self.axis) r2 = [self.pc.convert(data, None, self.axis) for _ in range(2)] assert r1 == r2 diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index c3d824389aa4db..ecd575020eca60 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -22,24 +22,24 @@ @td.skip_if_no_mpl class TestTSPlot(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) - self.freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A'] - idx = [ - period_range('12/31/1999', freq=x, periods=100) for x in self.freq] + self.freq = ["S", "T", "H", "D", "W", "M", "Q", "A"] + idx = [period_range("12/31/1999", freq=x, periods=100) for x in self.freq] self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] - self.period_df = [DataFrame(np.random.randn(len(x), 3), index=x, - columns=['A', 'B', 'C']) - for x in idx] + 
self.period_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] - freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] - idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] + freq = ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + idx = [date_range("12/31/1999", freq=x, periods=100) for x in freq] self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] - self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x, - columns=['A', 'B', 'C']) - for x in idx] + self.datetime_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] def teardown_method(self, method): tm.close() @@ -47,8 +47,7 @@ def teardown_method(self, method): @pytest.mark.slow def test_ts_plot_with_tz(self): # GH2877 - index = date_range('1/1/2011', periods=2, freq='H', - tz='Europe/Brussels') + index = date_range("1/1/2011", periods=2, freq="H", tz="Europe/Brussels") ts = Series([188.5, 328.25], index=index) _check_plot_works(ts.plot) @@ -57,13 +56,13 @@ def test_fontsize_set_correctly(self): df = DataFrame(np.random.randn(10, 9), index=range(10)) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) - for label in (ax.get_xticklabels() + ax.get_yticklabels()): + for label in ax.get_xticklabels() + ax.get_yticklabels(): assert label.get_fontsize() == 2 @pytest.mark.slow def test_frame_inferred(self): # inferred freq - idx = date_range('1/1/1987', freq='MS', periods=100) + idx = date_range("1/1/1987", freq="MS", periods=100) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame(np.random.randn(len(idx), 3), index=idx) @@ -75,7 +74,7 @@ def test_frame_inferred(self): _check_plot_works(df2.plot) # N > 1 - idx = date_range('2008-1-1 00:15:00', freq='15T', periods=10) + idx = date_range("2008-1-1 00:15:00", freq="15T", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame(np.random.randn(len(idx), 3), index=idx) _check_plot_works(df.plot) @@ -88,8 +87,8 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - idx = date_range('1/1/1987', freq='A', periods=3) - df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}, idx) + idx = date_range("1/1/1987", freq="A", periods=3) + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) fig, ax = self.plt.subplots() df.plot(ax=ax) # it works @@ -98,7 +97,7 @@ def test_nonnumeric_exclude(self): msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): - df['A'].plot() + df["A"].plot() def test_tsplot_deprecated(self): from pandas.tseries.plotting import tsplot @@ -134,40 +133,41 @@ def f(*args, **kwds): _check_plot_works(s.plot, ax=ax) _, ax = self.plt.subplots() - ts.plot(style='k', ax=ax) - color = (0., 0., 0., 1) + ts.plot(style="k", ax=ax) + color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() def test_both_style_and_color(self): ts = tm.makeTimeSeries() - msg = ("Cannot pass 'style' string with a color symbol and 'color' " - "keyword argument. Please use one or the other or pass 'style'" - " without a color symbol") + msg = ( + "Cannot pass 'style' string with a color symbol and 'color' " + "keyword argument. 
Please use one or the other or pass 'style'" + " without a color symbol" + ) with pytest.raises(ValueError, match=msg): - ts.plot(style='b-', color='#000099') + ts.plot(style="b-", color="#000099") s = ts.reset_index(drop=True) with pytest.raises(ValueError, match=msg): - s.plot(style='b-', color='#000099') + s.plot(style="b-", color="#000099") @pytest.mark.slow def test_high_freq(self): - freaks = ['ms', 'us'] + freaks = ["ms", "us"] for freq in freaks: _, ax = self.plt.subplots() - rng = date_range('1/1/2012', periods=100, freq=freq) + rng = date_range("1/1/2012", periods=100, freq=freq) ser = Series(np.random.randn(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): from pandas.plotting._matplotlib.converter import get_datevalue - assert get_datevalue(None, 'D') is None - assert get_datevalue(1987, 'A') == 1987 - assert (get_datevalue(Period(1987, 'A'), 'M') == - Period('1987-12', 'M').ordinal) - assert (get_datevalue('1/1/1987', 'D') == - Period('1987-1-1', 'D').ordinal) + + assert get_datevalue(None, "D") is None + assert get_datevalue(1987, "A") == 1987 + assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal @pytest.mark.slow def test_ts_plot_format_coord(self): @@ -178,32 +178,32 @@ def check_format_of_first_point(ax, expected_string): try: assert expected_string == ax.format_coord(first_x, first_y) except (ValueError): - pytest.skip("skipping test because issue forming " - "test comparison GH7664") + pytest.skip( + "skipping test because issue forming " "test comparison GH7664" + ) - annual = Series(1, index=date_range('2014-01-01', periods=3, - freq='A-DEC')) + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) _, ax = self.plt.subplots() annual.plot(ax=ax) - check_format_of_first_point(ax, 't = 2014 y = 1.000000') + check_format_of_first_point(ax, "t = 2014 y = 1.000000") # note this is added to the annual plot already in existence, and # changes its freq field - daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D')) + daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) daily.plot(ax=ax) - check_format_of_first_point(ax, - 't = 2014-01-01 y = 1.000000') + check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() # tsplot from pandas.tseries.plotting import tsplot + _, ax = self.plt.subplots() with tm.assert_produces_warning(FutureWarning): tsplot(annual, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, 't = 2014 y = 1.000000') + check_format_of_first_point(ax, "t = 2014 y = 1.000000") with tm.assert_produces_warning(FutureWarning): tsplot(daily, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') + check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") @pytest.mark.slow def test_line_plot_period_series(self): @@ -212,11 +212,12 @@ def test_line_plot_period_series(self): @pytest.mark.slow @pytest.mark.parametrize( - 'frqncy', ['1S', '3S', '5T', '7H', '4D', '8W', '11M', '3A']) + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. 
tests resolution of issue #14763 - idx = period_range('12/31/1999', freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=100) s = Series(np.random.randn(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -232,14 +233,14 @@ def test_line_plot_period_frame(self): @pytest.mark.slow @pytest.mark.parametrize( - 'frqncy', ['1S', '3S', '5T', '7H', '4D', '8W', '11M', '3A']) + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. tests resolution of issue # #14763 - idx = period_range('12/31/1999', freq=frqncy, periods=100) - df = DataFrame(np.random.randn(len(idx), 3), index=idx, - columns=['A', 'B', 'C']) + idx = period_range("12/31/1999", freq=frqncy, periods=100) + df = DataFrame(np.random.randn(len(idx), 3), index=idx, columns=["A", "B", "C"]) freq = df.index.asfreq(df.index.freq.rule_code).freq _check_plot_works(df.plot, freq) @@ -260,33 +261,32 @@ def test_line_plot_inferred_freq(self): def test_fake_inferred_business(self): _, ax = self.plt.subplots() - rng = date_range('2001-1-1', '2001-1-10') + rng = date_range("2001-1-1", "2001-1-10") ts = Series(range(len(rng)), index=rng) ts = ts[:3].append(ts[5:]) ts.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") @pytest.mark.slow def test_plot_offset_freq(self): ser = tm.makeTimeSeries() _check_plot_works(ser.plot) - dr = date_range(ser.index[0], freq='BQS', periods=10) + dr = date_range(ser.index[0], freq="BQS", periods=10) ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) @pytest.mark.slow def test_plot_multiple_inferred_freq(self): - dr = Index([datetime(2000, 1, 1), - datetime(2000, 1, 6), - datetime(2000, 1, 11)]) + dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)]) ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) @pytest.mark.slow def test_uhf(self): import pandas.plotting._matplotlib.converter as conv - idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) + + idx = date_range("2012-6-22 21:59:51.960928", freq="L", periods=500) df = DataFrame(np.random.randn(len(idx), 2), index=idx) _, ax = self.plt.subplots() @@ -296,14 +296,14 @@ def test_uhf(self): tlocs = axis.get_ticklocs() tlabels = axis.get_ticklabels() for loc, label in zip(tlocs, tlabels): - xp = conv._from_ordinal(loc).strftime('%H:%M:%S.%f') + xp = conv._from_ordinal(loc).strftime("%H:%M:%S.%f") rs = str(label.get_text()) if len(rs): assert xp == rs @pytest.mark.slow def test_irreg_hf(self): - idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) + idx = date_range("2012-6-22 21:59:51", freq="S", periods=100) df = DataFrame(np.random.randn(len(idx), 2), index=idx) irreg = df.iloc[[0, 1, 3, 4]] @@ -311,7 +311,7 @@ def test_irreg_hf(self): irreg.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() - sec = 1. 
/ 24 / 60 / 60 + sec = 1.0 / 24 / 60 / 60 assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() _, ax = self.plt.subplots() @@ -339,22 +339,21 @@ def test_business_freq(self): bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() - assert PeriodIndex(data=idx).freqstr == 'B' + assert PeriodIndex(data=idx).freqstr == "B" @pytest.mark.slow def test_business_freq_convert(self): - bts = tm.makeTimeSeries(300).asfreq('BM') - ts = bts.to_period('M') + bts = tm.makeTimeSeries(300).asfreq("BM") + ts = bts.to_period("M") _, ax = self.plt.subplots() bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() - assert PeriodIndex(data=idx).freqstr == 'M' + assert PeriodIndex(data=idx).freqstr == "M" def test_nonzero_base(self): # GH2571 - idx = (date_range('2012-12-20', periods=24, freq='H') + timedelta( - minutes=30)) + idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) df = DataFrame(np.arange(24), index=idx) _, ax = self.plt.subplots() df.plot(ax=ax) @@ -362,7 +361,7 @@ def test_nonzero_base(self): assert not Index(rs).is_normalized def test_dataframe(self): - bts = DataFrame({'a': tm.makeTimeSeries()}) + bts = DataFrame({"a": tm.makeTimeSeries()}) _, ax = self.plt.subplots() bts.plot(ax=ax) idx = ax.get_lines()[0].get_xdata() @@ -370,7 +369,6 @@ def test_dataframe(self): @pytest.mark.slow def test_axis_limits(self): - def _test(ax): xlim = ax.get_xlim() ax.set_xlim(xlim[0] - 5, xlim[1] + 10) @@ -379,16 +377,14 @@ def _test(ax): assert result[1] == xlim[1] + 10 # string - expected = (Period('1/1/2000', ax.freq), - Period('4/1/2000', ax.freq)) - ax.set_xlim('1/1/2000', '4/1/2000') + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) + ax.set_xlim("1/1/2000", "4/1/2000") result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal # datetime - expected = (Period('1/1/2000', ax.freq), - Period('4/1/2000', ax.freq)) + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal @@ -402,11 +398,11 @@ def _test(ax): _test(ax) _, ax = self.plt.subplots() - df = DataFrame({'a': ser, 'b': ser + 1}) + df = DataFrame({"a": ser, "b": ser + 1}) df.plot(ax=ax) _test(ax) - df = DataFrame({'a': ser, 'b': ser + 1}) + df = DataFrame({"a": ser, "b": ser + 1}) axes = df.plot(subplots=True) for ax in axes: @@ -415,22 +411,22 @@ def _test(ax): def test_get_finder(self): import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder('B') == conv._daily_finder - assert conv.get_finder('D') == conv._daily_finder - assert conv.get_finder('M') == conv._monthly_finder - assert conv.get_finder('Q') == conv._quarterly_finder - assert conv.get_finder('A') == conv._annual_finder - assert conv.get_finder('W') == conv._daily_finder + assert conv.get_finder("B") == conv._daily_finder + assert conv.get_finder("D") == conv._daily_finder + assert conv.get_finder("M") == conv._monthly_finder + assert conv.get_finder("Q") == conv._quarterly_finder + assert conv.get_finder("A") == conv._annual_finder + assert conv.get_finder("W") == conv._daily_finder @pytest.mark.slow def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] - xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) + xpl1 = xpl2 = [Period("1999-1-1", freq="B").ordinal] 
* len(day_lst) rs1 = [] rs2 = [] for i, n in enumerate(day_lst): - rng = bdate_range('1999-1-1', periods=n) + rng = bdate_range("1999-1-1", periods=n) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -449,11 +445,11 @@ def test_finder_daily(self): def test_finder_quarterly(self): yrs = [3.5, 11] - xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) + xpl1 = xpl2 = [Period("1988Q1").ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): - rng = period_range('1987Q2', periods=int(n * 4), freq='Q') + rng = period_range("1987Q2", periods=int(n * 4), freq="Q") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -472,11 +468,11 @@ def test_finder_quarterly(self): def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] - xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) + xpl1 = xpl2 = [Period("Jan 1988").ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): - rng = period_range('1987Q2', periods=int(n * 12), freq='M') + rng = period_range("1987Q2", periods=int(n * 12), freq="M") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -492,22 +488,22 @@ def test_finder_monthly(self): assert rs2 == xpl2 def test_finder_monthly_long(self): - rng = period_range('1988Q1', periods=24 * 12, freq='M') + rng = period_range("1988Q1", periods=24 * 12, freq="M") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1989Q1', 'M').ordinal + xp = Period("1989Q1", "M").ordinal assert rs == xp @pytest.mark.slow def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - xp = [Period(x, freq='A').ordinal for x in xp] + xp = [Period(x, freq="A").ordinal for x in xp] rs = [] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): - rng = period_range('1987', periods=nyears, freq='A') + rng = period_range("1987", periods=nyears, freq="A") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -520,25 +516,25 @@ def test_finder_annual(self): @pytest.mark.slow def test_finder_minutely(self): nminutes = 50 * 24 * 60 - rng = date_range('1/1/1999', freq='Min', periods=nminutes) + rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1/1/1999', freq='Min').ordinal + xp = Period("1/1/1999", freq="Min").ordinal assert rs == xp def test_finder_hourly(self): nhours = 23 - rng = date_range('1/1/1999', freq='H', periods=nhours) + rng = date_range("1/1/1999", freq="H", periods=nhours) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1/1/1999', freq='H').ordinal + xp = Period("1/1/1999", freq="H").ordinal assert rs == xp @@ -604,7 +600,7 @@ def test_gap_upsample(self): _, ax = self.plt.subplots() low.plot(ax=ax) - idxh = date_range(low.index[0], low.index[-1], freq='12h') + idxh = date_range(low.index[0], low.index[-1], freq="12h") s = Series(np.random.randn(len(idxh)), idxh) s.plot(secondary_y=True) lines = ax.get_lines() @@ -626,51 +622,49 @@ def test_secondary_y(self): ser2 = Series(np.random.randn(10)) fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + 
assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() line = ax.get_lines()[0] xp = Series(line.get_ydata(), line.get_xdata()) assert_series_equal(ser, xp) - assert ax.get_yaxis().get_ticks_position() == 'right' + assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() self.plt.close(fig) _, ax2 = self.plt.subplots() ser2.plot(ax=ax2) - assert (ax2.get_yaxis().get_ticks_position() == - self.default_tick_position) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) assert ax.get_yaxis().get_visible() - assert not hasattr(ax, 'left_ax') - assert hasattr(ax, 'right_ax') - assert hasattr(ax2, 'left_ax') - assert not hasattr(ax2, 'right_ax') + assert not hasattr(ax, "left_ax") + assert hasattr(ax, "right_ax") + assert hasattr(ax2, "left_ax") + assert not hasattr(ax2, "right_ax") @pytest.mark.slow def test_secondary_y_ts(self): - idx = date_range('1/1/2000', periods=10) + idx = date_range("1/1/2000", periods=10) ser = Series(np.random.randn(10), idx) ser2 = Series(np.random.randn(10), idx) fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() line = ax.get_lines()[0] xp = Series(line.get_ydata(), line.get_xdata()).to_timestamp() assert_series_equal(ser, xp) - assert ax.get_yaxis().get_ticks_position() == 'right' + assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() self.plt.close(fig) _, ax2 = self.plt.subplots() ser2.plot(ax=ax2) - assert (ax2.get_yaxis().get_ticks_position() == - self.default_tick_position) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position self.plt.close(ax2.get_figure()) ax = ser2.plot() @@ -683,37 +677,35 @@ def test_secondary_kde(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() - ax = ser.plot(secondary_y=True, kind='density', ax=ax) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + ax = ser.plot(secondary_y=True, kind="density", ax=ax) + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() - assert axes[1].get_yaxis().get_ticks_position() == 'right' + assert axes[1].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() - ser.plot(secondary_y=True, kind='bar', ax=ax) + ser.plot(secondary_y=True, kind="bar", ax=ax) axes = fig.get_axes() - assert axes[1].get_yaxis().get_ticks_position() == 'right' + assert axes[1].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def test_secondary_frame(self): - df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) - axes = df.plot(secondary_y=['a', 'c'], subplots=True) - assert axes[0].get_yaxis().get_ticks_position() == 'right' - assert (axes[1].get_yaxis().get_ticks_position() == - self.default_tick_position) - assert axes[2].get_yaxis().get_ticks_position() == 'right' + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def 
test_secondary_bar_frame(self): - df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) - axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) - assert axes[0].get_yaxis().get_ticks_position() == 'right' - assert (axes[1].get_yaxis().get_ticks_position() == - self.default_tick_position) - assert axes[2].get_yaxis().get_ticks_position() == 'right' + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" def test_mixed_freq_regular_first(self): # TODO @@ -724,13 +716,13 @@ def test_mixed_freq_regular_first(self): _, ax = self.plt.subplots() s1.plot(ax=ax) - ax2 = s2.plot(style='g', ax=ax) + ax2 = s2.plot(style="g", ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) idx2 = PeriodIndex(lines[1].get_xdata()) - tm.assert_index_equal(idx1, s1.index.to_period('B')) - tm.assert_index_equal(idx2, s2.index.to_period('B')) + tm.assert_index_equal(idx1, s1.index.to_period("B")) + tm.assert_index_equal(idx2, s2.index.to_period("B")) left, right = ax2.get_xlim() pidx = s1.index.to_period() @@ -742,9 +734,9 @@ def test_mixed_freq_irregular_first(self): s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] _, ax = self.plt.subplots() - s2.plot(style='g', ax=ax) + s2.plot(style="g", ax=ax) s1.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") lines = ax.get_lines() x1 = lines[0].get_xdata() tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) @@ -757,12 +749,12 @@ def test_mixed_freq_regular_first_df(self): s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = self.plt.subplots() s1.plot(ax=ax) - ax2 = s2.plot(style='g', ax=ax) + ax2 = s2.plot(style="g", ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) idx2 = PeriodIndex(lines[1].get_xdata()) - assert idx1.equals(s1.index.to_period('B')) - assert idx2.equals(s2.index.to_period('B')) + assert idx1.equals(s1.index.to_period("B")) + assert idx2.equals(s2.index.to_period("B")) left, right = ax2.get_xlim() pidx = s1.index.to_period() assert left <= pidx[0].ordinal @@ -774,9 +766,9 @@ def test_mixed_freq_irregular_first_df(self): s1 = tm.makeTimeSeries().to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = self.plt.subplots() - s2.plot(style='g', ax=ax) + s2.plot(style="g", ax=ax) s1.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") lines = ax.get_lines() x1 = lines[0].get_xdata() tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) @@ -784,60 +776,60 @@ def test_mixed_freq_irregular_first_df(self): tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_hf_first(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() high.plot(ax=ax) low.plot(ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'D' + assert PeriodIndex(data=l.get_xdata()).freq == "D" @pytest.mark.slow def test_mixed_freq_alignment(self): - ts_ind = date_range('2012-01-01 13:00', '2012-01-02', freq='H') + ts_ind = 
date_range("2012-01-01 13:00", "2012-01-02", freq="H") ts_data = np.random.randn(12) ts = Series(ts_data, index=ts_ind) - ts2 = ts.asfreq('T').interpolate() + ts2 = ts.asfreq("T").interpolate() _, ax = self.plt.subplots() ax = ts.plot(ax=ax) - ts2.plot(style='r', ax=ax) + ts2.plot(style="r", ax=ax) assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] @pytest.mark.slow def test_mixed_freq_lf_first(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(legend=True, ax=ax) high.plot(legend=True, ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'D' + assert PeriodIndex(data=l.get_xdata()).freq == "D" leg = ax.get_legend() assert len(leg.texts) == 2 self.plt.close(ax.get_figure()) - idxh = date_range('1/1/1999', periods=240, freq='T') - idxl = date_range('1/1/1999', periods=4, freq='H') + idxh = date_range("1/1/1999", periods=240, freq="T") + idxl = date_range("1/1/1999", periods=4, freq="H") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(ax=ax) high.plot(ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'T' + assert PeriodIndex(data=l.get_xdata()).freq == "T" def test_mixed_freq_irreg_period(self): ts = tm.makeTimeSeries() irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] - rng = period_range('1/3/2000', periods=30, freq='B') + rng = period_range("1/3/2000", periods=30, freq="B") ps = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() irreg.plot(ax=ax) @@ -846,7 +838,7 @@ def test_mixed_freq_irreg_period(self): def test_mixed_freq_shared_ax(self): # GH13341, using sharex=True - idx1 = date_range('2015-01-01', periods=3, freq='M') + idx1 = date_range("2015-01-01", periods=3, freq="M") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -855,10 +847,9 @@ def test_mixed_freq_shared_ax(self): s1.plot(ax=ax1) s2.plot(ax=ax2) - assert ax1.freq == 'M' - assert ax2.freq == 'M' - assert (ax1.lines[0].get_xydata()[0, 0] == - ax2.lines[0].get_xydata()[0, 0]) + assert ax1.freq == "M" + assert ax2.freq == "M" + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] # using twinx fig, ax1 = self.plt.subplots() @@ -866,8 +857,7 @@ def test_mixed_freq_shared_ax(self): s1.plot(ax=ax1) s2.plot(ax=ax2) - assert (ax1.lines[0].get_xydata()[0, 0] == - ax2.lines[0].get_xydata()[0, 0]) + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] # TODO (GH14330, GH14322) # plotting the irregular first does not yet work @@ -882,7 +872,7 @@ def test_nat_handling(self): _, ax = self.plt.subplots() - dti = DatetimeIndex(['2015-01-01', NaT, '2015-01-03']) + dti = DatetimeIndex(["2015-01-01", NaT, "2015-01-03"]) s = Series(range(len(dti)), dti) s.plot(ax=ax) xdata = ax.get_lines()[0].get_xdata() @@ -892,8 +882,8 @@ def test_nat_handling(self): @pytest.mark.slow def test_to_weekly_resampling(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), 
idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() @@ -904,6 +894,7 @@ def test_to_weekly_resampling(self): _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): tsplot(high, self.plt.Axes.plot, ax=ax) with tm.assert_produces_warning(FutureWarning): @@ -913,8 +904,8 @@ def test_to_weekly_resampling(self): @pytest.mark.slow def test_from_weekly_resampling(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() @@ -922,8 +913,10 @@ def test_from_weekly_resampling(self): high.plot(ax=ax) expected_h = idxh.to_period().asi8.astype(np.float64) - expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, - 1549, 1553, 1558, 1562], dtype=np.float64) + expected_l = np.array( + [1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, 1549, 1553, 1558, 1562], + dtype=np.float64, + ) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq xdata = l.get_xdata(orig=False) @@ -935,6 +928,7 @@ def test_from_weekly_resampling(self): _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): tsplot(low, self.plt.Axes.plot, ax=ax) with tm.assert_produces_warning(FutureWarning): @@ -949,33 +943,43 @@ def test_from_weekly_resampling(self): @pytest.mark.slow def test_from_resampling_area_line_mixed(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') - high = DataFrame(np.random.rand(len(idxh), 3), - index=idxh, columns=[0, 1, 2]) - low = DataFrame(np.random.rand(len(idxl), 3), - index=idxl, columns=[0, 1, 2]) + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = DataFrame(np.random.rand(len(idxh), 3), index=idxh, columns=[0, 1, 2]) + low = DataFrame(np.random.rand(len(idxl), 3), index=idxl, columns=[0, 1, 2]) # low to high - for kind1, kind2 in [('line', 'area'), ('area', 'line')]: + for kind1, kind2 in [("line", "area"), ("area", "line")]: _, ax = self.plt.subplots() low.plot(kind=kind1, stacked=True, ax=ax) high.plot(kind=kind2, stacked=True, ax=ax) # check low dataframe result - expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, - 1544, 1549, 1553, 1558, 1562], - dtype=np.float64) + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 1553, + 1558, + 1562, + ], + dtype=np.float64, + ) expected_y = np.zeros(len(expected_x), dtype=np.float64) for i in range(3): line = ax.lines[i] assert PeriodIndex(line.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) # check stacked values are correct expected_y += low[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # check high dataframe result expected_x = idxh.to_period().asi8.astype(np.float64) @@ -983,14 +987,12 @@ def test_from_resampling_area_line_mixed(self): for i in range(3): line = ax.lines[3 + i] assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq - 
tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) expected_y += high[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # high to low - for kind1, kind2 in [('line', 'area'), ('area', 'line')]: + for kind1, kind2 in [("line", "area"), ("area", "line")]: _, ax = self.plt.subplots() high.plot(kind=kind1, stacked=True, ax=ax) low.plot(kind=kind2, stacked=True, ax=ax) @@ -1001,31 +1003,41 @@ def test_from_resampling_area_line_mixed(self): for i in range(3): line = ax.lines[i] assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) expected_y += high[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # check low dataframe result - expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, - 1544, 1549, 1553, 1558, 1562], - dtype=np.float64) + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 1553, + 1558, + 1562, + ], + dtype=np.float64, + ) expected_y = np.zeros(len(expected_x), dtype=np.float64) for i in range(3): lines = ax.lines[3 + i] assert PeriodIndex(data=lines.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(lines.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(lines.get_xdata(orig=False), expected_x) expected_y += low[i].values - tm.assert_numpy_array_equal(lines.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y) @pytest.mark.slow def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range('2014-07-01 09:00', freq='S', periods=50) - idxl = date_range('2014-07-01 09:00', freq='100L', periods=500) + idxh = date_range("2014-07-01 09:00", freq="S", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) # high to low @@ -1034,7 +1046,7 @@ def test_mixed_freq_second_millisecond(self): low.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'L' + assert PeriodIndex(data=l.get_xdata()).freq == "L" tm.close() # low to high @@ -1043,7 +1055,7 @@ def test_mixed_freq_second_millisecond(self): high.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'L' + assert PeriodIndex(data=l.get_xdata()).freq == "L" @pytest.mark.slow def test_irreg_dtypes(self): @@ -1053,7 +1065,7 @@ def test_irreg_dtypes(self): _check_plot_works(df.plot) # np.datetime64 - idx = date_range('1/1/2000', periods=10) + idx = date_range("1/1/2000", periods=10) idx = idx[[0, 2, 5, 9]].astype(object) df = DataFrame(np.random.randn(len(idx), 3), idx) _, ax = self.plt.subplots() @@ -1064,9 +1076,9 @@ def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() 
df.plot(ax=ax) @@ -1079,9 +1091,9 @@ def test_time(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs @pytest.mark.slow @@ -1090,9 +1102,9 @@ def test_time_change_xlim(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() df.plot(ax=ax) @@ -1105,13 +1117,13 @@ def test_time_change_xlim(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs # change xlim - ax.set_xlim('1:30', '5:00') + ax.set_xlim("1:30", "5:00") # check tick labels again ticks = ax.get_xticks() @@ -1122,20 +1134,19 @@ def test_time_change_xlim(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs @pytest.mark.slow def test_time_musec(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() - ts = np.array([(t + timedelta(microseconds=int(x))).time() - for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + ts = np.array([(t + timedelta(microseconds=int(x))).time() for x in deltas]) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() ax = df.plot(ax=ax) @@ -1151,30 +1162,30 @@ def test_time_musec(self): rs = l.get_text() if len(rs) > 0: if (us % 1000) != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S.%f') + xp = time(h, m, s, us).strftime("%H:%M:%S.%f") elif (us // 1000) != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S.%f')[:-3] + xp = time(h, m, s, us).strftime("%H:%M:%S.%f")[:-3] elif s != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S') + xp = time(h, m, s, us).strftime("%H:%M:%S") else: - xp = time(h, m, s, us).strftime('%H:%M') + xp = time(h, m, s, us).strftime("%H:%M") assert xp == rs @pytest.mark.slow def test_secondary_upsample(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(ax=ax) ax = high.plot(secondary_y=True, ax=ax) for l in ax.get_lines(): - assert PeriodIndex(l.get_xdata()).freq == 'D' - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert PeriodIndex(l.get_xdata()).freq == "D" + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") for l in ax.left_ax.get_lines(): - assert PeriodIndex(l.get_xdata()).freq == 'D' + assert PeriodIndex(l.get_xdata()).freq == "D" @pytest.mark.slow def test_secondary_legend(self): @@ -1183,13 +1194,13 @@ def test_secondary_legend(self): # ts df = tm.makeTimeDataFrame() - df.plot(secondary_y=['A', 'B'], ax=ax) + 
df.plot(secondary_y=["A", "B"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 - assert leg.get_texts()[0].get_text() == 'A (right)' - assert leg.get_texts()[1].get_text() == 'B (right)' - assert leg.get_texts()[2].get_text() == 'C' - assert leg.get_texts()[3].get_text() == 'D' + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B (right)" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" assert ax.right_ax.get_legend() is None colors = set() for line in leg.get_lines(): @@ -1201,33 +1212,33 @@ def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) - df.plot(secondary_y=['A', 'C'], mark_right=False, ax=ax) + df.plot(secondary_y=["A", "C"], mark_right=False, ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 - assert leg.get_texts()[0].get_text() == 'A' - assert leg.get_texts()[1].get_text() == 'B' - assert leg.get_texts()[2].get_text() == 'C' - assert leg.get_texts()[3].get_text() == 'D' + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" self.plt.close(fig) fig, ax = self.plt.subplots() - df.plot(kind='bar', secondary_y=['A'], ax=ax) + df.plot(kind="bar", secondary_y=["A"], ax=ax) leg = ax.get_legend() - assert leg.get_texts()[0].get_text() == 'A (right)' - assert leg.get_texts()[1].get_text() == 'B' + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B" self.plt.close(fig) fig, ax = self.plt.subplots() - df.plot(kind='bar', secondary_y=['A'], mark_right=False, ax=ax) + df.plot(kind="bar", secondary_y=["A"], mark_right=False, ax=ax) leg = ax.get_legend() - assert leg.get_texts()[0].get_text() == 'A' - assert leg.get_texts()[1].get_text() == 'B' + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" self.plt.close(fig) fig = self.plt.figure() ax = fig.add_subplot(211) df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['C', 'D'], ax=ax) + ax = df.plot(secondary_y=["C", "D"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1243,7 +1254,7 @@ def test_secondary_legend(self): df = tm.makeDataFrame() fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'B'], ax=ax) + ax = df.plot(secondary_y=["A", "B"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1257,7 +1268,7 @@ def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['C', 'D'], ax=ax) + ax = df.plot(secondary_y=["C", "D"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1269,7 +1280,7 @@ def test_secondary_legend(self): assert len(colors) == 4 def test_format_date_axis(self): - rng = date_range('1/1/2012', periods=12, freq='M') + rng = date_range("1/1/2012", periods=12, freq="M") df = DataFrame(np.random.randn(len(rng), 3), rng) _, ax = self.plt.subplots() ax = df.plot(ax=ax) @@ -1280,10 +1291,10 @@ def test_format_date_axis(self): @pytest.mark.slow def test_ax_plot(self): - x = date_range(start='2012-01-02', periods=10, freq='D') + x = date_range(start="2012-01-02", periods=10, freq="D") y = list(range(len(x))) _, ax = self.plt.subplots() - lines = ax.plot(x, y, label='Y') + lines = ax.plot(x, y, label="Y") 
tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) @pytest.mark.slow @@ -1292,7 +1303,7 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = dict(fmt='-', lw=4) + kw = dict(fmt="-", lw=4) _, ax = self.plt.subplots() ax.plot_date([x.toordinal() for x in dates], values1, **kw) @@ -1341,8 +1352,8 @@ def test_secondary_y_non_ts_xlim(self): @pytest.mark.slow def test_secondary_y_regular_ts_xlim(self): # GH 3490 - regular-timeseries with secondary y - index_1 = date_range(start='2000-01-01', periods=4, freq='D') - index_2 = date_range(start='2000-01-05', periods=4, freq='D') + index_1 = date_range(start="2000-01-01", periods=4, freq="D") + index_2 = date_range(start="2000-01-05", periods=4, freq="D") s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) @@ -1358,13 +1369,13 @@ def test_secondary_y_regular_ts_xlim(self): @pytest.mark.slow def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y - rng = date_range('2000-01-01', periods=10000, freq='min') + rng = date_range("2000-01-01", periods=10000, freq="min") ts = Series(1, index=rng) _, ax = self.plt.subplots() ts.plot(ax=ax) left_before, right_before = ax.get_xlim() - ts.resample('D').mean().plot(secondary_y=True, ax=ax) + ts.resample("D").mean().plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() # a downsample should not have changed either limit @@ -1399,10 +1410,9 @@ def test_plot_outofbounds_datetime(self): def test_format_timedelta_ticks_narrow(self): - expected_labels = (['00:00:00.0000000{:0>2d}'.format(i) - for i in range(10)]) + expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in range(10)] - rng = timedelta_range('0', periods=10, freq='ns') + rng = timedelta_range("0", periods=10, freq="ns") df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) @@ -1415,18 +1425,18 @@ def test_format_timedelta_ticks_narrow(self): def test_format_timedelta_ticks_wide(self): expected_labels = [ - '00:00:00', - '1 days 03:46:40', - '2 days 07:33:20', - '3 days 11:20:00', - '4 days 15:06:40', - '5 days 18:53:20', - '6 days 22:40:00', - '8 days 02:26:40', - '9 days 06:13:20', + "00:00:00", + "1 days 03:46:40", + "2 days 07:33:20", + "3 days 11:20:00", + "4 days 15:06:40", + "5 days 18:53:20", + "6 days 22:40:00", + "8 days 02:26:40", + "9 days 06:13:20", ] - rng = timedelta_range('0', periods=10, freq='1 d') + rng = timedelta_range("0", periods=10, freq="1 d") df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1439,42 +1449,50 @@ def test_format_timedelta_ticks_wide(self): def test_timedelta_plot(self): # test issue #8711 - s = Series(range(5), timedelta_range('1day', periods=5)) + s = Series(range(5), timedelta_range("1day", periods=5)) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) # test long period - index = timedelta_range('1 day 2 hr 30 min 10 s', - periods=10, freq='1 d') + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") s = Series(np.random.randn(len(index)), index) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) # test short period - index = timedelta_range('1 day 2 hr 30 min 10 s', - periods=10, freq='1 ns') + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 ns") s = Series(np.random.randn(len(index)), index) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) def test_hist(self): # 
https://github.com/matplotlib/matplotlib/issues/8459 - rng = date_range('1/1/2011', periods=10, freq='H') + rng = date_range("1/1/2011", periods=10, freq="H") x = rng - w1 = np.arange(0, 1, .1) - w2 = np.arange(0, 1, .1)[::-1] + w1 = np.arange(0, 1, 0.1) + w2 = np.arange(0, 1, 0.1)[::-1] _, ax = self.plt.subplots() ax.hist([x, x], weights=[w1, w2]) @pytest.mark.slow def test_overlapping_datetime(self): # GB 6608 - s1 = Series([1, 2, 3], index=[datetime(1995, 12, 31), - datetime(2000, 12, 31), - datetime(2005, 12, 31)]) - s2 = Series([1, 2, 3], index=[datetime(1997, 12, 31), - datetime(2003, 12, 31), - datetime(2008, 12, 31)]) + s1 = Series( + [1, 2, 3], + index=[ + datetime(1995, 12, 31), + datetime(2000, 12, 31), + datetime(2005, 12, 31), + ], + ) + s2 = Series( + [1, 2, 3], + index=[ + datetime(1997, 12, 31), + datetime(2003, 12, 31), + datetime(2008, 12, 31), + ], + ) # plot first series, then add the second series to those axes, # then try adding the first series again @@ -1483,23 +1501,20 @@ def test_overlapping_datetime(self): s2.plot(ax=ax) s1.plot(ax=ax) - @pytest.mark.xfail(reason="GH9053 matplotlib does not use" - " ax.xaxis.converter") + @pytest.mark.xfail(reason="GH9053 matplotlib does not use" " ax.xaxis.converter") def test_add_matplotlib_datetime64(self): # GH9053 - ensure that a plot with PeriodConverter still understands # datetime64 data. This still fails because matplotlib overrides the # ax.xaxis.converter with a DatetimeConverter - s = Series(np.random.randn(10), - index=date_range('1970-01-02', periods=10)) + s = Series(np.random.randn(10), index=date_range("1970-01-02", periods=10)) ax = s.plot() - ax.plot(s.index, s.values, color='g') + ax.plot(s.index, s.values, color="g") l1, l2 = ax.lines tm.assert_numpy_array_equal(l1.get_xydata(), l2.get_xydata()) def test_matplotlib_scatter_datetime64(self): # https://github.com/matplotlib/matplotlib/issues/11391 - df = DataFrame(np.random.RandomState(0).rand(10, 2), - columns=["x", "y"]) + df = DataFrame(np.random.RandomState(0).rand(10, 2), columns=["x", "y"]) df["time"] = date_range("2018-01-01", periods=10, freq="D") fig, ax = self.plt.subplots() ax.scatter(x="time", y="y", data=df) @@ -1520,13 +1535,13 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): try: plt.clf() ax = fig.add_subplot(211) - orig_ax = kwargs.pop('ax', plt.gca()) - orig_axfreq = getattr(orig_ax, 'freq', None) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) ret = f(*args, **kwargs) assert ret is not None # do something more intelligent - ax = kwargs.pop('ax', plt.gca()) + ax = kwargs.pop("ax", plt.gca()) if series is not None: dfreq = series.index.freq if isinstance(dfreq, DateOffset): @@ -1539,7 +1554,7 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): ax = fig.add_subplot(212) try: - kwargs['ax'] = ax + kwargs["ax"] = ax ret = f(*args, **kwargs) assert ret is not None # do something more intelligent except Exception: @@ -1554,7 +1569,7 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): # TODO(statsmodels 0.10.0): Remove the statsmodels check # https://github.com/pandas-dev/pandas/issues/24088 # https://github.com/statsmodels/statsmodels/issues/4772 - if 'statsmodels' not in sys.modules: + if "statsmodels" not in sys.modules: with ensure_clean(return_filelike=True) as path: pickle.dump(fig, path) finally: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 272f01a12156bd..0215b79cb993d5 100644 --- 
a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -15,8 +15,7 @@ from pandas.core.dtypes.api import is_list_like import pandas as pd -from pandas import ( - DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range) +from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range from pandas.core.arrays import integer_array from pandas.tests.plotting.common import TestPlotBase, _check_plot_works import pandas.util.testing as tm @@ -27,17 +26,20 @@ @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.tdf = tm.makeTimeDataFrame() - self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), - "B": np.random.uniform(size=20), - "C": np.arange(20) + np.random.uniform( - size=20)}) + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) def _assert_ytickslabels_visibility(self, axes, expected): for ax, exp in zip(axes, expected): @@ -55,21 +57,18 @@ def test_plot(self): _check_plot_works(df.plot, grid=False) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True) + axes = _check_plot_works(df.plot, subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True, layout=(-1, 2)) + axes = _check_plot_works(df.plot, subplots=True, layout=(-1, 2)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True, use_index=False) + axes = _check_plot_works(df.plot, subplots=True, use_index=False) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) if _mpl_ge_3_1_0(): msg = "'Line2D' object has no property 'blarg'" else: @@ -77,8 +76,7 @@ def test_plot(self): with pytest.raises(AttributeError, match=msg): df.plot.line(blarg=True) - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) _check_plot_works(df.plot, use_index=True) _check_plot_works(df.plot, sort_columns=False) @@ -87,14 +85,14 @@ def test_plot(self): _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.plot, subplots=True, title='blah') + _check_plot_works(df.plot, subplots=True, title="blah") # We have to redo it here because _check_plot_works does two plots, # once without an ax kwarg and once with an ax kwarg and the new sharex # behaviour does not remove the visibility of the latter axis (as ax is # present). 
see: https://github.com/pandas-dev/pandas/issues/9737 - axes = df.plot(subplots=True, title='blah') + axes = df.plot(subplots=True, title="blah") self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) # axes[0].figure.savefig("test.png") for ax in axes[:2]: @@ -108,32 +106,35 @@ def test_plot(self): self._check_visible([ax.xaxis.get_label()]) self._check_ticks_props(ax, xrot=0) - _check_plot_works(df.plot, title='blah') + _check_plot_works(df.plot, title="blah") tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) _check_plot_works(df.plot, use_index=True) # unicode - index = MultiIndex.from_tuples([('\u03b1', 0), - ('\u03b1', 1), - ('\u03b2', 2), - ('\u03b2', 3), - ('\u03b3', 4), - ('\u03b3', 5), - ('\u03b4', 6), - ('\u03b4', 7)], names=['i0', 'i1']) + index = MultiIndex.from_tuples( + [ + ("\u03b1", 0), + ("\u03b1", 1), + ("\u03b2", 2), + ("\u03b2", 3), + ("\u03b3", 4), + ("\u03b3", 5), + ("\u03b4", 6), + ("\u03b4", 7), + ], + names=["i0", "i1"], + ) columns = MultiIndex.from_tuples( - [('bar', '\u0394'), ('bar', '\u0395')], names=['c0', 'c1']) - df = DataFrame(np.random.randint(0, 10, (8, 2)), - columns=columns, - index=index) - _check_plot_works(df.plot, title='\u03A3') + [("bar", "\u0394"), ("bar", "\u0395")], names=["c0", "c1"] + ) + df = DataFrame(np.random.randint(0, 10, (8, 2)), columns=columns, index=index) + _check_plot_works(df.plot, title="\u03A3") # GH 6951 # Test with single column - df = DataFrame({'x': np.random.rand(10)}) + df = DataFrame({"x": np.random.rand(10)}) axes = _check_plot_works(df.plot.bar, subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @@ -157,82 +158,82 @@ def test_integer_array_plot(self): _check_plot_works(s.plot.hist) _check_plot_works(s.plot.pie) - df = DataFrame({'x': arr, 'y': arr}) + df = DataFrame({"x": arr, "y": arr}) _check_plot_works(df.plot.line) _check_plot_works(df.plot.bar) _check_plot_works(df.plot.hist) - _check_plot_works(df.plot.pie, y='y') - _check_plot_works(df.plot.scatter, x='x', y='y') - _check_plot_works(df.plot.hexbin, x='x', y='y') + _check_plot_works(df.plot.pie, y="y") + _check_plot_works(df.plot.scatter, x="x", y="y") + _check_plot_works(df.plot.hexbin, x="x", y="y") def test_mpl2_color_cycle_str(self): # GH 15516 - colors = ['C' + str(x) for x in range(10)] - df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) + colors = ["C" + str(x) for x in range(10)] + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) for c in colors: _check_plot_works(df.plot, color=c) def test_color_single_series_list(self): # GH 3486 df = DataFrame({"A": [1, 2, 3]}) - _check_plot_works(df.plot, color=['red']) + _check_plot_works(df.plot, color=["red"]) def test_rgb_tuple_color(self): # GH 16695 - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0)) - _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0, 0.5)) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) def test_color_empty_string(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): - df.plot(color='') + df.plot(color="") def test_color_and_style_arguments(self): - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) # passing both 'color' and 'style' arguments should be allowed # if there is no 
color symbol in the style strings: - ax = df.plot(color=['red', 'black'], style=['-', '--']) + ax = df.plot(color=["red", "black"], style=["-", "--"]) # check that the linestyles are correctly set: linestyle = [line.get_linestyle() for line in ax.lines] - assert linestyle == ['-', '--'] + assert linestyle == ["-", "--"] # check that the colors are correctly set: color = [line.get_color() for line in ax.lines] - assert color == ['red', 'black'] + assert color == ["red", "black"] # passing both 'color' and 'style' arguments should not be allowed # if there is a color symbol in the style strings: with pytest.raises(ValueError): - df.plot(color=['red', 'black'], style=['k-', 'r--']) + df.plot(color=["red", "black"], style=["k-", "r--"]) def test_nonnumeric_exclude(self): - df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}) + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) ax = df.plot() assert len(ax.get_lines()) == 1 # B was plotted @pytest.mark.slow def test_implicit_label(self): - df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) - ax = df.plot(x='a', y='b') - self._check_text_labels(ax.xaxis.get_label(), 'a') + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) + ax = df.plot(x="a", y="b") + self._check_text_labels(ax.xaxis.get_label(), "a") @pytest.mark.slow def test_donot_overwrite_index_name(self): # GH 8494 - df = DataFrame(randn(2, 2), columns=['a', 'b']) - df.index.name = 'NAME' - df.plot(y='b', label='LABEL') - assert df.index.name == 'NAME' + df = DataFrame(randn(2, 2), columns=["a", "b"]) + df.index.name = "NAME" + df.plot(y="b", label="LABEL") + assert df.index.name == "NAME" @pytest.mark.slow def test_plot_xy(self): # columns.inferred_type == 'string' df = self.tdf - self._check_data(df.plot(x=0, y=1), df.set_index('A')['B'].plot()) - self._check_data(df.plot(x=0), df.set_index('A').plot()) + self._check_data(df.plot(x=0, y=1), df.set_index("A")["B"].plot()) + self._check_data(df.plot(x=0), df.set_index("A").plot()) self._check_data(df.plot(y=0), df.B.plot()) - self._check_data(df.plot(x='A', y='B'), df.set_index('A').B.plot()) - self._check_data(df.plot(x='A'), df.set_index('A').plot()) - self._check_data(df.plot(y='B'), df.B.plot()) + self._check_data(df.plot(x="A", y="B"), df.set_index("A").B.plot()) + self._check_data(df.plot(x="A"), df.set_index("A").plot()) + self._check_data(df.plot(y="B"), df.B.plot()) # columns.inferred_type == 'integer' df.columns = np.arange(1, len(df.columns) + 1) @@ -241,21 +242,19 @@ def test_plot_xy(self): self._check_data(df.plot(y=1), df[1].plot()) # figsize and title - ax = df.plot(x=1, y=2, title='Test', figsize=(16, 8)) - self._check_text_labels(ax.title, 'Test') - self._check_axes_shape(ax, axes_num=1, layout=(1, 1), - figsize=(16., 8.)) + ax = df.plot(x=1, y=2, title="Test", figsize=(16, 8)) + self._check_text_labels(ax.title, "Test") + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16.0, 8.0)) # columns.inferred_type == 'mixed' # TODO add MultiIndex test @pytest.mark.slow - @pytest.mark.parametrize("input_log, expected_log", [ - (True, 'log'), - ('sym', 'symlog') - ]) + @pytest.mark.parametrize( + "input_log, expected_log", [(True, "log"), ("sym", "symlog")] + ) def test_logscales(self, input_log, expected_log): - df = DataFrame({'a': np.arange(100)}, index=np.arange(100)) + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) ax = df.plot(logy=input_log) self._check_ax_scales(ax, yaxis=expected_log) @@ -273,7 +272,7 @@ def test_logscales(self, input_log, expected_log): 
@pytest.mark.parametrize("input_param", ["logx", "logy", "loglog"]) def test_invalid_logscale(self, input_param): # GH: 24867 - df = DataFrame({'a': np.arange(100)}, index=np.arange(100)) + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) msg = "Boolean, None and 'sym' are valid options, 'sm' is given." with pytest.raises(ValueError, match=msg): @@ -289,13 +288,13 @@ def test_xcompat(self): assert not isinstance(lines[0].get_xdata(), PeriodIndex) tm.close() - pd.plotting.plot_params['xaxis.compat'] = True + pd.plotting.plot_params["xaxis.compat"] = True ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) tm.close() - pd.plotting.plot_params['x_compat'] = False + pd.plotting.plot_params["x_compat"] = False ax = df.plot() lines = ax.get_lines() @@ -304,7 +303,7 @@ def test_xcompat(self): tm.close() # useful if you're plotting a bunch together - with pd.plotting.plot_params.use('x_compat', True): + with pd.plotting.plot_params.use("x_compat", True): ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) @@ -321,19 +320,21 @@ def test_period_compat(self): df = DataFrame( np.random.rand(21, 2), index=bdate_range(datetime(2000, 1, 1), datetime(2000, 1, 31)), - columns=['a', 'b']) + columns=["a", "b"], + ) df.plot() self.plt.axhline(y=0) tm.close() def test_unsorted_index(self): - df = DataFrame({'y': np.arange(100)}, index=np.arange(99, -1, -1), - dtype=np.int64) + df = DataFrame( + {"y": np.arange(100)}, index=np.arange(99, -1, -1), dtype=np.int64 + ) ax = df.plot() lines = ax.get_lines()[0] rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y') + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") tm.assert_series_equal(rs, df.y, check_index_type=False) tm.close() @@ -341,27 +342,29 @@ def test_unsorted_index(self): ax = df.plot() lines = ax.get_lines()[0] rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y') + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") tm.assert_series_equal(rs, df.y) def test_unsorted_index_lims(self): - df = DataFrame({'y': [0., 1., 2., 3.]}, index=[1., 0., 3., 2.]) + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0]}, index=[1.0, 0.0, 3.0, 2.0]) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) assert xmax >= np.nanmax(lines[0].get_data()[0]) - df = DataFrame({'y': [0., 1., np.nan, 3., 4., 5., 6.]}, - index=[1., 0., 3., 2., np.nan, 3., 2.]) + df = DataFrame( + {"y": [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0]}, + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) assert xmax >= np.nanmax(lines[0].get_data()[0]) - df = DataFrame({'y': [0., 1., 2., 3.], 'z': [91., 90., 93., 92.]}) - ax = df.plot(x='z', y='y') + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0], "z": [91.0, 90.0, 93.0, 92.0]}) + ax = df.plot(x="z", y="y") xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) @@ -369,25 +372,22 @@ def test_unsorted_index_lims(self): @pytest.mark.slow def test_subplots(self): - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - for kind in ['bar', 'barh', 'line', 'area']: + for kind in ["bar", "barh", "line", "area"]: axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) 
self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - assert axes.shape == (3, ) + assert axes.shape == (3,) for ax, column in zip(axes, df.columns): - self._check_legend_labels(ax, - labels=[pprint_thing(column)]) + self._check_legend_labels(ax, labels=[pprint_thing(column)]) for ax in axes[:-2]: self._check_visible(ax.xaxis) # xaxis must be visible for grid self._check_visible(ax.get_xticklabels(), visible=False) - if not (kind == 'bar' and self.mpl_ge_3_1_0): + if not (kind == "bar" and self.mpl_ge_3_1_0): # change https://github.com/pandas-dev/pandas/issues/26714 - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) self._check_visible(ax.xaxis.get_label(), visible=False) self._check_visible(ax.get_yticklabels()) @@ -414,23 +414,27 @@ def test_groupby_boxplot_sharey(self): # sharey can now be switched check whether the right # pair of axes is turned on or off - df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], - 'b': [0.56, 0.84, 0.29, 0.56, 0.85], - 'c': [0, 1, 2, 3, 1]}, - index=[0, 1, 2, 3, 4]) + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) # behavior without keyword - axes = df.groupby('c').boxplot() + axes = df.groupby("c").boxplot() expected = [True, False, True, False] self._assert_ytickslabels_visibility(axes, expected) # set sharey=True should be identical - axes = df.groupby('c').boxplot(sharey=True) + axes = df.groupby("c").boxplot(sharey=True) expected = [True, False, True, False] self._assert_ytickslabels_visibility(axes, expected) # sharey=False, all yticklabels should be visible - axes = df.groupby('c').boxplot(sharey=False) + axes = df.groupby("c").boxplot(sharey=False) expected = [True, True, True, True] self._assert_ytickslabels_visibility(axes, expected) @@ -439,33 +443,37 @@ def test_groupby_boxplot_sharex(self): # sharex can now be switched check whether the right # pair of axes is turned on or off - df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], - 'b': [0.56, 0.84, 0.29, 0.56, 0.85], - 'c': [0, 1, 2, 3, 1]}, - index=[0, 1, 2, 3, 4]) + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) # behavior without keyword - axes = df.groupby('c').boxplot() + axes = df.groupby("c").boxplot() expected = [True, True, True, True] self._assert_xtickslabels_visibility(axes, expected) # set sharex=False should be identical - axes = df.groupby('c').boxplot(sharex=False) + axes = df.groupby("c").boxplot(sharex=False) expected = [True, True, True, True] self._assert_xtickslabels_visibility(axes, expected) # sharex=True, yticklabels should be visible # only for bottom plots - axes = df.groupby('c').boxplot(sharex=True) + axes = df.groupby("c").boxplot(sharex=True) expected = [False, False, True, True] self._assert_xtickslabels_visibility(axes, expected) @pytest.mark.slow def test_subplots_timeseries(self): - idx = date_range(start='2014-07-01', freq='M', periods=10) + idx = date_range(start="2014-07-01", freq="M", periods=10) df = DataFrame(np.random.rand(10, 3), index=idx) - for kind in ['line', 'area']: + for kind in ["line", "area"]: axes = df.plot(kind=kind, subplots=True, sharex=True) self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) @@ -473,8 +481,7 @@ def test_subplots_timeseries(self): # GH 7801 self._check_visible(ax.xaxis) # xaxis must be 
visible for grid self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) self._check_visible(ax.xaxis.get_label(), visible=False) self._check_visible(ax.get_yticklabels()) @@ -485,54 +492,64 @@ def test_subplots_timeseries(self): self._check_visible(axes[-1].get_yticklabels()) self._check_ticks_props(axes, xrot=0) - axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, - fontsize=7) + axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: self._check_visible(ax.xaxis) self._check_visible(ax.get_xticklabels()) self._check_visible(ax.get_xticklabels(minor=True)) self._check_visible(ax.xaxis.get_label()) self._check_visible(ax.get_yticklabels()) - self._check_ticks_props(ax, xlabelsize=7, xrot=45, - ylabelsize=7) + self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) def test_subplots_timeseries_y_axis(self): # GH16953 - data = {"numeric": np.array([1, 2, 5]), - "timedelta": [pd.Timedelta(-10, unit="s"), - pd.Timedelta(10, unit="m"), - pd.Timedelta(10, unit="h")], - "datetime_no_tz": [pd.to_datetime("2017-08-01 00:00:00"), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00")], - "datetime_all_tz": [pd.to_datetime("2017-08-01 00:00:00", - utc=True), - pd.to_datetime("2017-08-01 02:00:00", - utc=True), - pd.to_datetime("2017-08-02 00:00:00", - utc=True)], - "text": ["This", "should", "fail"]} + data = { + "numeric": np.array([1, 2, 5]), + "timedelta": [ + pd.Timedelta(-10, unit="s"), + pd.Timedelta(10, unit="m"), + pd.Timedelta(10, unit="h"), + ], + "datetime_no_tz": [ + pd.to_datetime("2017-08-01 00:00:00"), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + "datetime_all_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00", utc=True), + pd.to_datetime("2017-08-02 00:00:00", utc=True), + ], + "text": ["This", "should", "fail"], + } testdata = DataFrame(data) ax_numeric = testdata.plot(y="numeric") - assert (ax_numeric.get_lines()[0].get_data()[1] == - testdata["numeric"].values).all() + assert ( + ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values + ).all() ax_timedelta = testdata.plot(y="timedelta") - assert (ax_timedelta.get_lines()[0].get_data()[1] == - testdata["timedelta"].values).all() + assert ( + ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values + ).all() ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") - assert (ax_datetime_no_tz.get_lines()[0].get_data()[1] == - testdata["datetime_no_tz"].values).all() + assert ( + ax_datetime_no_tz.get_lines()[0].get_data()[1] + == testdata["datetime_no_tz"].values + ).all() ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") - assert (ax_datetime_all_tz.get_lines()[0].get_data()[1] == - testdata["datetime_all_tz"].values).all() + assert ( + ax_datetime_all_tz.get_lines()[0].get_data()[1] + == testdata["datetime_all_tz"].values + ).all() msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): testdata.plot(y="text") - @pytest.mark.xfail(reason='not support for period, categorical, ' - 'datetime_mixed_tz') + @pytest.mark.xfail( + reason="not support for period, categorical, " "datetime_mixed_tz" + ) def test_subplots_timeseries_y_axis_not_supported(self): """ This test will fail for: @@ -551,34 +568,42 @@ def test_subplots_timeseries_y_axis_not_supported(self): generally 
converting ``datetime`` objects in a tz-aware form could help with this problem """ - data = {"numeric": np.array([1, 2, 5]), - "period": [pd.Period('2017-08-01 00:00:00', freq='H'), - pd.Period('2017-08-01 02:00', freq='H'), - pd.Period('2017-08-02 00:00:00', freq='H')], - "categorical": pd.Categorical(["c", "b", "a"], - categories=["a", "b", "c"], - ordered=False), - "datetime_mixed_tz": [pd.to_datetime("2017-08-01 00:00:00", - utc=True), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00")]} + data = { + "numeric": np.array([1, 2, 5]), + "period": [ + pd.Period("2017-08-01 00:00:00", freq="H"), + pd.Period("2017-08-01 02:00", freq="H"), + pd.Period("2017-08-02 00:00:00", freq="H"), + ], + "categorical": pd.Categorical( + ["c", "b", "a"], categories=["a", "b", "c"], ordered=False + ), + "datetime_mixed_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + } testdata = pd.DataFrame(data) ax_period = testdata.plot(x="numeric", y="period") - assert (ax_period.get_lines()[0].get_data()[1] == - testdata["period"].values).all() + assert ( + ax_period.get_lines()[0].get_data()[1] == testdata["period"].values + ).all() ax_categorical = testdata.plot(x="numeric", y="categorical") - assert (ax_categorical.get_lines()[0].get_data()[1] == - testdata["categorical"].values).all() - ax_datetime_mixed_tz = testdata.plot(x="numeric", - y="datetime_mixed_tz") - assert (ax_datetime_mixed_tz.get_lines()[0].get_data()[1] == - testdata["datetime_mixed_tz"].values).all() + assert ( + ax_categorical.get_lines()[0].get_data()[1] + == testdata["categorical"].values + ).all() + ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") + assert ( + ax_datetime_mixed_tz.get_lines()[0].get_data()[1] + == testdata["datetime_mixed_tz"].values + ).all() @pytest.mark.slow def test_subplots_layout(self): # GH 6667 - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) @@ -610,11 +635,10 @@ def test_subplots_layout(self): df.plot(subplots=True, layout=(-1, -1)) # single column - df = DataFrame(np.random.rand(10, 1), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1, ) + assert axes.shape == (1,) axes = df.plot(subplots=True, layout=(3, 3)) self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) @@ -627,27 +651,25 @@ def test_subplots_warnings(self): df = DataFrame(np.random.randn(100, 4)) df.plot(subplots=True, layout=(3, 2)) - df = DataFrame(np.random.randn(100, 4), - index=date_range('1/1/2000', periods=100)) + df = DataFrame( + np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) + ) df.plot(subplots=True, layout=(3, 2)) @pytest.mark.slow def test_subplots_multiple_axes(self): # GH 5353, 6970, GH 7069 fig, axes = self.plt.subplots(2, 3) - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - returned = df.plot(subplots=True, ax=axes[0], sharex=False, - sharey=False) + returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) self._check_axes_shape(returned, axes_num=3, 
layout=(1, 3)) - assert returned.shape == (3, ) + assert returned.shape == (3,) assert returned[0].figure is fig # draw on second row - returned = df.plot(subplots=True, ax=axes[1], sharex=False, - sharey=False) + returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3, ) + assert returned.shape == (3,) assert returned[0].figure is fig self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) tm.close() @@ -664,32 +686,33 @@ def test_subplots_multiple_axes(self): fig, axes = self.plt.subplots(2, 2) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) - df = DataFrame(np.random.rand(10, 4), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) - returned = df.plot(subplots=True, ax=axes, layout=(2, 1), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) - returned = df.plot(subplots=True, ax=axes, layout=(2, -1), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) - returned = df.plot(subplots=True, ax=axes, layout=(-1, 2), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) # single column fig, axes = self.plt.subplots(1, 1) - df = DataFrame(np.random.rand(10, 1), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1, ) + assert axes.shape == (1,) def test_subplots_ts_share_axes(self): # GH 3964 @@ -697,7 +720,8 @@ def test_subplots_ts_share_axes(self): self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) df = DataFrame( np.random.randn(10, 9), - index=date_range(start='2014-07-01', freq='M', periods=10)) + index=date_range(start="2014-07-01", freq="M", periods=10), + ) for i, ax in enumerate(axes.ravel()): df[i].plot(ax=ax, fontsize=5) @@ -721,11 +745,11 @@ def test_subplots_ts_share_axes(self): def test_subplots_sharex_axes_existing_axes(self): # GH 9158 - d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]} - df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14')) + d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 1.0], "C": [5, 1, 3, 4]} + df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) - axes = df[['A', 'B']].plot(subplots=True) - df['C'].plot(ax=axes[0], secondary_y=True) + axes = df[["A", "B"]].plot(subplots=True) + df["C"].plot(ax=axes[0], secondary_y=True) self._check_visible(axes[0].get_xticklabels(), visible=False) self._check_visible(axes[1].get_xticklabels(), visible=True) @@ -735,29 +759,31 @@ def test_subplots_sharex_axes_existing_axes(self): @pytest.mark.slow def test_subplots_dup_columns(self): # GH 10962 - df = DataFrame(np.random.rand(5, 5), columns=list('aaaaa')) + df = DataFrame(np.random.rand(5, 5), 
columns=list("aaaaa")) axes = df.plot(subplots=True) for ax in axes: - self._check_legend_labels(ax, labels=['a']) + self._check_legend_labels(ax, labels=["a"]) assert len(ax.lines) == 1 tm.close() - axes = df.plot(subplots=True, secondary_y='a') + axes = df.plot(subplots=True, secondary_y="a") for ax in axes: # (right) is only attached when subplots=False - self._check_legend_labels(ax, labels=['a']) + self._check_legend_labels(ax, labels=["a"]) assert len(ax.lines) == 1 tm.close() - ax = df.plot(secondary_y='a') - self._check_legend_labels(ax, labels=['a (right)'] * 5) + ax = df.plot(secondary_y="a") + self._check_legend_labels(ax, labels=["a (right)"] * 5) assert len(ax.lines) == 0 assert len(ax.right_ax.lines) == 5 def test_negative_log(self): - df = - DataFrame(rand(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = -DataFrame( + rand(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) with pytest.raises(ValueError): df.plot.area(logy=True) @@ -773,19 +799,20 @@ def _compare_stacked_y_cood(self, normal_lines, stacked_lines): def test_line_area_stacked(self): with tm.RNGContext(42): - df = DataFrame(rand(6, 4), columns=['w', 'x', 'y', 'z']) + df = DataFrame(rand(6, 4), columns=["w", "x", "y", "z"]) neg_df = -df # each column has either positive or negative value - sep_df = DataFrame({'w': rand(6), - 'x': rand(6), - 'y': -rand(6), - 'z': -rand(6)}) + sep_df = DataFrame( + {"w": rand(6), "x": rand(6), "y": -rand(6), "z": -rand(6)} + ) # each column has positive-negative mixed value - mixed_df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['w', 'x', 'y', 'z']) + mixed_df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["w", "x", "y", "z"], + ) - for kind in ['line', 'area']: + for kind in ["line", "area"]: ax1 = _check_plot_works(df.plot, kind=kind, stacked=False) ax2 = _check_plot_works(df.plot, kind=kind, stacked=True) self._compare_stacked_y_cood(ax1.lines, ax2.lines) @@ -811,9 +838,8 @@ def test_line_area_stacked(self): def test_line_area_nan_df(self): values1 = [1, 2, np.nan, 3] values2 = [3, np.nan, 2, 1] - df = DataFrame({'a': values1, 'b': values2}) - tdf = DataFrame({'a': values1, - 'b': values2}, index=tm.makeDateIndex(k=4)) + df = DataFrame({"a": values1, "b": values2}) + tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4)) for d in [df, tdf]: ax = _check_plot_works(d.plot) @@ -827,29 +853,29 @@ def test_line_area_nan_df(self): exp = np.array([3, 2, 1], dtype=np.float64) tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) tm.assert_numpy_array_equal( - masked1.mask, np.array([False, False, True, False])) + masked1.mask, np.array([False, False, True, False]) + ) tm.assert_numpy_array_equal( - masked2.mask, np.array([False, True, False, False])) + masked2.mask, np.array([False, True, False, False]) + ) expected1 = np.array([1, 2, 0, 3], dtype=np.float64) expected2 = np.array([3, 0, 2, 1], dtype=np.float64) ax = _check_plot_works(d.plot, stacked=True) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), - expected1 + expected2) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) ax = _check_plot_works(d.plot.area) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), - expected1 + expected2) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) 
ax = _check_plot_works(d.plot.area, stacked=False) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) def test_line_lim(self): - df = DataFrame(rand(6, 3), columns=['x', 'y', 'z']) + df = DataFrame(rand(6, 3), columns=["x", "y", "z"]) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() @@ -865,15 +891,15 @@ def test_line_lim(self): axes = df.plot(secondary_y=True, subplots=True) self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) for ax in axes: - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= lines[0].get_data()[0][0] assert xmax >= lines[0].get_data()[0][-1] def test_area_lim(self): - df = DataFrame(rand(6, 4), columns=['x', 'y', 'z', 'four']) + df = DataFrame(rand(6, 4), columns=["x", "y", "z", "four"]) neg_df = -df for stacked in [True, False]: @@ -892,6 +918,7 @@ def test_area_lim(self): @pytest.mark.slow def test_bar_colors(self): import matplotlib.pyplot as plt + default_colors = self._unpack_cycler(plt.rcParams) df = DataFrame(randn(5, 5)) @@ -899,14 +926,15 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" ax = df.plot.bar(color=custom_colors) self._check_colors(ax.patches[::5], facecolors=custom_colors) tm.close() from matplotlib import cm + # Test str -> colormap functionality - ax = df.plot.bar(colormap='jet') + ax = df.plot.bar(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] self._check_colors(ax.patches[::5], facecolors=rgba_colors) tm.close() @@ -917,26 +945,28 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=rgba_colors) tm.close() - ax = df.loc[:, [0]].plot.bar(color='DodgerBlue') - self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) tm.close() - ax = df.plot(kind='bar', color='green') - self._check_colors(ax.patches[::5], facecolors=['green'] * 5) + ax = df.plot(kind="bar", color="green") + self._check_colors(ax.patches[::5], facecolors=["green"] * 5) tm.close() def test_bar_user_colors(self): - df = pd.DataFrame({"A": range(4), - "B": range(1, 5), - "color": ['red', 'blue', 'blue', 'red']}) + df = pd.DataFrame( + {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} + ) # This should *only* work when `y` is specified, else # we use one color per column - ax = df.plot.bar(y='A', color=df['color']) + ax = df.plot.bar(y="A", color=df["color"]) result = [p.get_facecolor() for p in ax.patches] - expected = [(1., 0., 0., 1.), - (0., 0., 1., 1.), - (0., 0., 1., 1.), - (1., 0., 0., 1.)] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] assert result == expected @pytest.mark.slow @@ -1001,25 +1031,29 @@ def test_bar_barwidth(self): @pytest.mark.slow def test_bar_barwidth_position(self): df = DataFrame(randn(5, 5)) - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9, - position=0.2) - 
self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, - position=0.2) + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, position=0.2 + ) @pytest.mark.slow def test_bar_barwidth_position_int(self): # GH 12979 df = DataFrame(randn(5, 5)) - for w in [1, 1.]: + for w in [1, 1.0]: ax = df.plot.bar(stacked=True, width=w) ticks = ax.xaxis.get_ticklocs() tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) @@ -1028,11 +1062,11 @@ def test_bar_barwidth_position_int(self): assert ax.patches[0].get_x() == -0.5 assert ax.patches[-1].get_x() == 3.5 - self._check_bar_alignment(df, kind='bar', stacked=True, width=1) - self._check_bar_alignment(df, kind='barh', stacked=False, width=1) - self._check_bar_alignment(df, kind='barh', stacked=True, width=1) - self._check_bar_alignment(df, kind='bar', subplots=True, width=1) - self._check_bar_alignment(df, kind='barh', subplots=True, width=1) + self._check_bar_alignment(df, kind="bar", stacked=True, width=1) + self._check_bar_alignment(df, kind="barh", stacked=False, width=1) + self._check_bar_alignment(df, kind="barh", stacked=True, width=1) + self._check_bar_alignment(df, kind="bar", subplots=True, width=1) + self._check_bar_alignment(df, kind="barh", subplots=True, width=1) @pytest.mark.slow def test_bar_bottom_left(self): @@ -1065,9 +1099,7 @@ def test_bar_bottom_left(self): @pytest.mark.slow def test_bar_nan(self): - df = DataFrame({'A': [10, np.nan, 20], - 'B': [5, 10, 20], - 'C': [1, 2, 3]}) + df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]}) ax = df.plot.bar() expected = [10, 0, 20, 5, 10, 20, 1, 2, 3] result = [p.get_height() for p in ax.patches] @@ -1084,13 +1116,17 @@ def test_bar_nan(self): @pytest.mark.slow def test_bar_categorical(self): # GH 13019 - df1 = pd.DataFrame(np.random.randn(6, 5), - index=pd.Index(list('ABCDEF')), - columns=pd.Index(list('abcde'))) + df1 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.Index(list("ABCDEF")), + columns=pd.Index(list("abcde")), + ) # categorical index must behave the same - df2 = pd.DataFrame(np.random.randn(6, 5), - index=pd.CategoricalIndex(list('ABCDEF')), - columns=pd.CategoricalIndex(list('abcde'))) + df2 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.CategoricalIndex(list("ABCDEF")), + columns=pd.CategoricalIndex(list("abcde")), + ) for df in [df1, df2]: ax = df.plot.bar() @@ -1109,20 +1145,22 @@ def test_bar_categorical(self): @pytest.mark.slow def test_plot_scatter(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) - _check_plot_works(df.plot.scatter, x='x', y='y') + _check_plot_works(df.plot.scatter, x="x", y="y") _check_plot_works(df.plot.scatter, x=1, y=2) with pytest.raises(TypeError): - df.plot.scatter(x='x') + df.plot.scatter(x="x") with pytest.raises(TypeError): - df.plot.scatter(y='y') + df.plot.scatter(y="y") # GH 6951 - 
axes = df.plot(x='x', y='y', kind='scatter', subplots=True) + axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow @@ -1131,26 +1169,22 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) - ax1 = df.plot.scatter(x='A label', y='B label') - ax2 = df.plot.scatter(x='A label', y='B label', c='C label') + ax1 = df.plot.scatter(x="A label", y="B label") + ax2 = df.plot.scatter(x="A label", y="B label", c="C label") - vis1 = [vis.get_visible() for vis in - ax1.xaxis.get_minorticklabels()] - vis2 = [vis.get_visible() for vis in - ax2.xaxis.get_minorticklabels()] + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] assert vis1 == vis2 - vis1 = [vis.get_visible() for vis in - ax1.xaxis.get_majorticklabels()] - vis2 = [vis.get_visible() for vis in - ax2.xaxis.get_majorticklabels()] + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] assert vis1 == vis2 - assert (ax1.xaxis.get_label().get_visible() == - ax2.xaxis.get_label().get_visible()) + assert ( + ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() + ) @pytest.mark.slow def test_if_hexbin_xaxis_label_is_visible(self): @@ -1158,82 +1192,79 @@ def test_if_hexbin_xaxis_label_is_visible(self): # interfere with x-axis label and ticklabels with # ipython inline backend. 
random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) - - ax = df.plot.hexbin('A label', 'B label', gridsize=12) - assert all(vis.get_visible() for vis in - ax.xaxis.get_minorticklabels()) - assert all(vis.get_visible() for vis in - ax.xaxis.get_majorticklabels()) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + + ax = df.plot.hexbin("A label", "B label", gridsize=12) + assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) + assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) assert ax.xaxis.get_label().get_visible() @pytest.mark.slow def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt + random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) fig, axes = plt.subplots(1, 2) - df.plot.scatter('A label', 'B label', c='C label', ax=axes[0]) - df.plot.scatter('A label', 'B label', c='C label', ax=axes[1]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) plt.tight_layout() - points = np.array([ax.get_position().get_points() - for ax in fig.axes]) + points = np.array([ax.get_position().get_points() for ax in fig.axes]) axes_x_coords = points[:, :, 0] parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] - assert np.isclose(parent_distance, - colorbar_distance, atol=1e-7).all() + assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() @pytest.mark.slow def test_plot_scatter_with_categorical_data(self): # GH 16199 - df = pd.DataFrame({'x': [1, 2, 3, 4], - 'y': pd.Categorical(['a', 'b', 'a', 'c'])}) + df = pd.DataFrame( + {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} + ) with pytest.raises(ValueError) as ve: - df.plot(x='x', y='y', kind='scatter') - ve.match('requires y column to be numeric') + df.plot(x="x", y="y", kind="scatter") + ve.match("requires y column to be numeric") with pytest.raises(ValueError) as ve: - df.plot(x='y', y='x', kind='scatter') - ve.match('requires x column to be numeric') + df.plot(x="y", y="x", kind="scatter") + ve.match("requires x column to be numeric") with pytest.raises(ValueError) as ve: - df.plot(x='y', y='y', kind='scatter') - ve.match('requires x column to be numeric') + df.plot(x="y", y="y", kind="scatter") + ve.match("requires x column to be numeric") @pytest.mark.slow def test_plot_scatter_with_c(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) - axes = [df.plot.scatter(x='x', y='y', c='z'), - df.plot.scatter(x=0, y=1, c=2)] + axes = [df.plot.scatter(x="x", y="y", c="z"), df.plot.scatter(x=0, y=1, c=2)] for ax in axes: # default to Greys - assert ax.collections[0].cmap.name == 'Greys' + assert ax.collections[0].cmap.name == "Greys" # n.b. 
there appears to be no public method # to get the colorbar label - assert ax.collections[0].colorbar._label == 'z' + assert ax.collections[0].colorbar._label == "z" - cm = 'cubehelix' - ax = df.plot.scatter(x='x', y='y', c='z', colormap=cm) + cm = "cubehelix" + ax = df.plot.scatter(x="x", y="y", c="z", colormap=cm) assert ax.collections[0].cmap.name == cm # verify turning off colorbar works - ax = df.plot.scatter(x='x', y='y', c='z', colorbar=False) + ax = df.plot.scatter(x="x", y="y", c="z", colorbar=False) assert ax.collections[0].colorbar is None # verify that we can still plot a solid color - ax = df.plot.scatter(x=0, y=1, c='red') + ax = df.plot.scatter(x=0, y=1, c="red") assert ax.collections[0].colorbar is None - self._check_colors(ax.collections, facecolors=['r']) + self._check_colors(ax.collections, facecolors=["r"]) # Ensure that we can pass an np.array straight through to matplotlib, # this functionality was accidentally removed previously. @@ -1241,44 +1272,48 @@ def test_plot_scatter_with_c(self): # # Exercise colormap path and non-colormap path as they are independent # - df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) red_rgba = [1.0, 0.0, 0.0, 1.0] green_rgba = [0.0, 1.0, 0.0, 1.0] rgba_array = np.array([red_rgba, green_rgba]) - ax = df.plot.scatter(x='A', y='B', c=rgba_array) + ax = df.plot.scatter(x="A", y="B", c=rgba_array) # expect the face colors of the points in the non-colormap path to be # identical to the values we supplied, normally we'd be on shaky ground # comparing floats for equality but here we expect them to be # identical. - tm.assert_numpy_array_equal(ax.collections[0] - .get_facecolor(), rgba_array) + tm.assert_numpy_array_equal(ax.collections[0].get_facecolor(), rgba_array) # we don't test the colors of the faces in this next plot because they # are dependent on the spring colormap, which may change its colors # later. 
float_array = np.array([0.0, 1.0]) - df.plot.scatter(x='A', y='B', c=float_array, cmap='spring') + df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") def test_scatter_colors(self): - df = DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) with pytest.raises(TypeError): - df.plot.scatter(x='a', y='b', c='c', color='green') + df.plot.scatter(x="a", y="b", c="c", color="green") default_colors = self._unpack_cycler(self.plt.rcParams) - ax = df.plot.scatter(x='a', y='b', c='c') + ax = df.plot.scatter(x="a", y="b", c="c") tm.assert_numpy_array_equal( ax.collections[0].get_facecolor()[0], - np.array(self.colorconverter.to_rgba(default_colors[0]))) + np.array(self.colorconverter.to_rgba(default_colors[0])), + ) - ax = df.plot.scatter(x='a', y='b', color='white') - tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - np.array([1, 1, 1, 1], dtype=np.float64)) + ax = df.plot.scatter(x="a", y="b", color="white") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array([1, 1, 1, 1], dtype=np.float64), + ) @pytest.mark.slow def test_plot_bar(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) _check_plot_works(df.plot.bar) _check_plot_works(df.plot.bar, legend=False) @@ -1287,12 +1322,12 @@ def test_plot_bar(self): _check_plot_works(df.plot.bar, subplots=True) _check_plot_works(df.plot.bar, stacked=True) - df = DataFrame(randn(10, 15), - index=list(string.ascii_letters[:10]), - columns=range(15)) + df = DataFrame( + randn(10, 15), index=list(string.ascii_letters[:10]), columns=range(15) + ) _check_plot_works(df.plot.bar) - df = DataFrame({'a': [0, 1], 'b': [1, 0]}) + df = DataFrame({"a": [0, 1], "b": [1, 0]}) ax = _check_plot_works(df.plot.bar) self._check_ticks_props(ax, xrot=90) @@ -1305,22 +1340,36 @@ def test_plot_bar(self): ax = df.plot.barh(rot=55, fontsize=11) self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) - def _check_bar_alignment(self, df, kind='bar', stacked=False, - subplots=False, align='center', width=0.5, - position=0.5): - - axes = df.plot(kind=kind, stacked=stacked, subplots=subplots, - align=align, width=width, position=position, grid=True) + def _check_bar_alignment( + self, + df, + kind="bar", + stacked=False, + subplots=False, + align="center", + width=0.5, + position=0.5, + ): + + axes = df.plot( + kind=kind, + stacked=stacked, + subplots=subplots, + align=align, + width=width, + position=position, + grid=True, + ) axes = self._flatten_visible(axes) for ax in axes: - if kind == 'bar': + if kind == "bar": axis = ax.xaxis ax_min, ax_max = ax.get_xlim() min_edge = min(p.get_x() for p in ax.patches) max_edge = max(p.get_x() + p.get_width() for p in ax.patches) - elif kind == 'barh': + elif kind == "barh": axis = ax.yaxis ax_min, ax_max = ax.get_ylim() min_edge = min(p.get_y() for p in ax.patches) @@ -1334,18 +1383,17 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, tm.assert_almost_equal(ax_max, max_edge + 0.25) p = ax.patches[0] - if kind == 'bar' and (stacked is True or subplots is True): + if kind == "bar" and (stacked is True or subplots is True): edge = p.get_x() center = edge + p.get_width() * position - elif kind == 'bar' and stacked is False: + elif kind == "bar" and stacked is False: center = p.get_x() + p.get_width() * 
len(df.columns) * position edge = p.get_x() - elif kind == 'barh' and (stacked is True or subplots is True): + elif kind == "barh" and (stacked is True or subplots is True): center = p.get_y() + p.get_height() * position edge = p.get_y() - elif kind == 'barh' and stacked is False: - center = p.get_y() + p.get_height() * len( - df.columns) * position + elif kind == "barh" and stacked is False: + center = p.get_y() + p.get_height() * len(df.columns) * position edge = p.get_y() else: raise ValueError @@ -1353,10 +1401,10 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, # Check the ticks locates on integer assert (axis.get_ticklocs() == np.arange(len(df))).all() - if align == 'center': + if align == "center": # Check whether the bar locates on center tm.assert_almost_equal(axis.get_ticklocs()[0], center) - elif align == 'edge': + elif align == "edge": # Check whether the bar's edge starts from the tick tm.assert_almost_equal(axis.get_ticklocs()[0], edge) else: @@ -1367,80 +1415,85 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, @pytest.mark.slow def test_bar_stacked_center(self): # GH2157 - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', stacked=True) - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9) - self._check_bar_alignment(df, kind='barh', stacked=True) - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) @pytest.mark.slow def test_bar_center(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', stacked=False) - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9) - self._check_bar_alignment(df, kind='barh', stacked=False) - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) @pytest.mark.slow def test_bar_subplots_center(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', subplots=True) - self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9) - self._check_bar_alignment(df, kind='barh', subplots=True) - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) + self._check_bar_alignment(df, kind="barh", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) @pytest.mark.slow def test_bar_align_single_column(self): df = DataFrame(randn(5)) - self._check_bar_alignment(df, kind='bar', stacked=False) - self._check_bar_alignment(df, kind='bar', stacked=True) - self._check_bar_alignment(df, kind='barh', stacked=False) - self._check_bar_alignment(df, kind='barh', stacked=True) - 
self._check_bar_alignment(df, kind='bar', subplots=True) - self._check_bar_alignment(df, kind='barh', subplots=True) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True) @pytest.mark.slow def test_bar_edge(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - - self._check_bar_alignment(df, kind='bar', stacked=True, align='edge') - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', stacked=True, align='edge') - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9, - align='edge') - - self._check_bar_alignment(df, kind='bar', stacked=False, align='edge') - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', stacked=False, align='edge') - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9, - align='edge') - - self._check_bar_alignment(df, kind='bar', subplots=True, align='edge') - self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', subplots=True, align='edge') - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, - align='edge') + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + + self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") + self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, align="edge" + ) @pytest.mark.slow def test_bar_log_no_subplots(self): # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 # regressions in 1.2.1 - expected = np.array([.1, 1., 10., 100]) + expected = np.array([0.1, 1.0, 10.0, 100]) # no subplots - df = DataFrame({'A': [3] * 5, 'B': list(range(1, 6))}, index=range(5)) + df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) ax = df.plot.bar(grid=True, log=True) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) @pytest.mark.slow def test_bar_log_subplots(self): - expected = np.array([.1, 1., 10., 100., 1000., 1e4]) + expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( - log=True, subplots=True) + log=True, subplots=True + ) tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) @@ -1448,14 +1501,15 @@ def test_bar_log_subplots(self): 
@pytest.mark.slow def test_boxplot(self): df = self.hist_df - series = df['height'] + series = df["height"] numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] ax = _check_plot_works(df.plot.box) self._check_text_labels(ax.get_xticklabels(), labels) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), - np.arange(1, len(numeric_cols) + 1)) + tm.assert_numpy_array_equal( + ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1) + ) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) axes = series.plot.box(rot=40) @@ -1486,10 +1540,9 @@ def test_boxplot_vertical(self): # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.box, - subplots=True, vert=False, logx=True) + axes = _check_plot_works(df.plot.box, subplots=True, vert=False, logx=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) - self._check_ax_scales(axes, xaxis='log') + self._check_ax_scales(axes, xaxis="log") for ax, label in zip(axes, labels): self._check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == self.bp_n_objects @@ -1502,23 +1555,25 @@ def test_boxplot_vertical(self): @pytest.mark.slow def test_boxplot_return_type(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) with pytest.raises(ValueError): - df.plot.box(return_type='NOTATYPE') + df.plot.box(return_type="NOTATYPE") - result = df.plot.box(return_type='dict') - self._check_box_return_type(result, 'dict') + result = df.plot.box(return_type="dict") + self._check_box_return_type(result, "dict") - result = df.plot.box(return_type='axes') - self._check_box_return_type(result, 'axes') + result = df.plot.box(return_type="axes") + self._check_box_return_type(result, "axes") result = df.plot.box() # default axes - self._check_box_return_type(result, 'axes') + self._check_box_return_type(result, "axes") - result = df.plot.box(return_type='both') - self._check_box_return_type(result, 'both') + result = df.plot.box(return_type="both") + self._check_box_return_type(result, "both") @pytest.mark.slow def test_boxplot_subplots_return_type(self): @@ -1527,42 +1582,44 @@ def test_boxplot_subplots_return_type(self): # normal style: return_type=None result = df.plot.box(subplots=True) assert isinstance(result, Series) - self._check_box_return_type(result, None, expected_keys=[ - 'height', 'weight', 'category']) + self._check_box_return_type( + result, None, expected_keys=["height", "weight", "category"] + ) - for t in ['dict', 'axes', 'both']: + for t in ["dict", "axes", "both"]: returned = df.plot.box(return_type=t, subplots=True) self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category'], - check_ax_title=False) + returned, + t, + expected_keys=["height", "weight", "category"], + check_ax_title=False, + ) @pytest.mark.slow @td.skip_if_no_scipy def test_kde_df(self): df = DataFrame(randn(100, 4)) - ax = _check_plot_works(df.plot, kind='kde') + ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] self._check_legend_labels(ax, labels=expected) self._check_ticks_props(ax, xrot=0) - ax = df.plot(kind='kde', rot=20, fontsize=5) + ax = df.plot(kind="kde", rot=20, fontsize=5) self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) with 
tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, kind='kde', - subplots=True) + axes = _check_plot_works(df.plot, kind="kde", subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.plot(kind='kde', logy=True, subplots=True) - self._check_ax_scales(axes, yaxis='log') + axes = df.plot(kind="kde", logy=True, subplots=True) + self._check_ax_scales(axes, yaxis="log") @pytest.mark.slow @td.skip_if_no_scipy def test_kde_missing_vals(self): df = DataFrame(np.random.uniform(size=(100, 4))) df.loc[0, 0] = np.nan - _check_plot_works(df.plot, kind='kde') + _check_plot_works(df.plot, kind="kde") @pytest.mark.slow def test_hist_df(self): @@ -1576,10 +1633,9 @@ def test_hist_df(self): self._check_legend_labels(ax, labels=expected) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.hist, - subplots=True, logy=True) + axes = _check_plot_works(df.plot.hist, subplots=True, logy=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - self._check_ax_scales(axes, yaxis='log') + self._check_ax_scales(axes, yaxis="log") axes = series.plot.hist(rot=40) self._check_ticks_props(axes, xrot=40, yrot=0) @@ -1598,11 +1654,17 @@ def test_hist_df(self): tm.close() # if horizontal, yticklabels are rotated - axes = df.plot.hist(rot=50, fontsize=8, orientation='horizontal') + axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) - def _check_box_coord(self, patches, expected_y=None, expected_h=None, - expected_x=None, expected_w=None): + def _check_box_coord( + self, + patches, + expected_y=None, + expected_h=None, + expected_x=None, + expected_w=None, + ): result_y = np.array([p.get_y() for p in patches]) result_height = np.array([p.get_height() for p in patches]) result_x = np.array([p.get_x() for p in patches]) @@ -1610,106 +1672,145 @@ def _check_box_coord(self, patches, expected_y=None, expected_h=None, # dtype is depending on above values, no need to check if expected_y is not None: - tm.assert_numpy_array_equal(result_y, expected_y, - check_dtype=False) + tm.assert_numpy_array_equal(result_y, expected_y, check_dtype=False) if expected_h is not None: - tm.assert_numpy_array_equal(result_height, expected_h, - check_dtype=False) + tm.assert_numpy_array_equal(result_height, expected_h, check_dtype=False) if expected_x is not None: - tm.assert_numpy_array_equal(result_x, expected_x, - check_dtype=False) + tm.assert_numpy_array_equal(result_x, expected_x, check_dtype=False) if expected_w is not None: - tm.assert_numpy_array_equal(result_width, expected_w, - check_dtype=False) + tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) @pytest.mark.slow def test_hist_df_coord(self): - normal_df = DataFrame({'A': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([10, 9, 8, 7, 6])), - 'B': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([8, 8, 8, 8, 8])), - 'C': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([6, 7, 8, 9, 10]))}, - columns=['A', 'B', 'C']) - - nan_df = DataFrame({'A': np.repeat(np.array([np.nan, 1, 2, 3, 4, 5]), - np.array([3, 10, 9, 8, 7, 6])), - 'B': np.repeat(np.array([1, np.nan, 2, 3, 4, 5]), - np.array([8, 3, 8, 8, 8, 8])), - 'C': np.repeat(np.array([1, 2, 3, np.nan, 4, 5]), - np.array([6, 7, 8, 3, 9, 10]))}, - columns=['A', 'B', 'C']) + normal_df = DataFrame( + { + "A": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([10, 9, 8, 7, 6])), + "B": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([8, 8, 8, 8, 8])), + "C": 
np.repeat(np.array([1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])), + }, + columns=["A", "B", "C"], + ) + + nan_df = DataFrame( + { + "A": np.repeat( + np.array([np.nan, 1, 2, 3, 4, 5]), np.array([3, 10, 9, 8, 7, 6]) + ), + "B": np.repeat( + np.array([1, np.nan, 2, 3, 4, 5]), np.array([8, 3, 8, 8, 8, 8]) + ), + "C": np.repeat( + np.array([1, 2, 3, np.nan, 4, 5]), np.array([6, 7, 8, 3, 9, 10]) + ), + }, + columns=["A", "B", "C"], + ) for df in [normal_df, nan_df]: ax = df.plot.hist(bins=5) - self._check_box_coord(ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) ax = df.plot.hist(bins=5, stacked=True) - self._check_box_coord(ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_y=np.array([10, 9, 8, 7, 6]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_y=np.array([18, 17, 16, 15, 14]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([10, 9, 8, 7, 6]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_y=np.array([18, 17, 16, 15, 14]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) axes = df.plot.hist(bins=5, stacked=True, subplots=True) - self._check_box_coord(axes[0].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(axes[1].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(axes[2].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + axes[0].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) # horizontal - ax = df.plot.hist(bins=5, orientation='horizontal') - self._check_box_coord(ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10])) - - ax = df.plot.hist(bins=5, stacked=True, - orientation='horizontal') - self._check_box_coord(ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_x=np.array([10, 9, 8, 
7, 6]), - expected_w=np.array([8, 8, 8, 8, 8])) + ax = df.plot.hist(bins=5, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) + + ax = df.plot.hist(bins=5, stacked=True, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([10, 9, 8, 7, 6]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) self._check_box_coord( ax.patches[10:], expected_x=np.array([18, 17, 16, 15, 14]), - expected_w=np.array([6, 7, 8, 9, 10])) + expected_w=np.array([6, 7, 8, 9, 10]), + ) - axes = df.plot.hist(bins=5, stacked=True, subplots=True, - orientation='horizontal') - self._check_box_coord(axes[0].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(axes[1].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(axes[2].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10])) + axes = df.plot.hist( + bins=5, stacked=True, subplots=True, orientation="horizontal" + ) + self._check_box_coord( + axes[0].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) @pytest.mark.slow def test_plot_int_columns(self): @@ -1718,11 +1819,11 @@ def test_plot_int_columns(self): @pytest.mark.slow def test_df_legend_labels(self): - kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] - df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) - df2 = DataFrame(rand(3, 3), columns=['d', 'e', 'f']) - df3 = DataFrame(rand(3, 3), columns=['g', 'h', 'i']) - df4 = DataFrame(rand(3, 3), columns=['j', 'k', 'l']) + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) + df2 = DataFrame(rand(3, 3), columns=["d", "e", "f"]) + df3 = DataFrame(rand(3, 3), columns=["g", "h", "i"]) + df4 = DataFrame(rand(3, 3), columns=["j", "k", "l"]) for kind in kinds: @@ -1735,82 +1836,82 @@ def test_df_legend_labels(self): ax = df3.plot(kind=kind, legend=True, ax=ax) self._check_legend_labels(ax, labels=df.columns.union(df3.columns)) - ax = df4.plot(kind=kind, legend='reverse', ax=ax) - expected = list(df.columns.union(df3.columns)) + list(reversed( - df4.columns)) + ax = df4.plot(kind=kind, legend="reverse", ax=ax) + expected = list(df.columns.union(df3.columns)) + list(reversed(df4.columns)) self._check_legend_labels(ax, labels=expected) # Secondary Y - ax = df.plot(legend=True, secondary_y='b') - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) - ax = df3.plot(kind='bar', legend=True, secondary_y='h', ax=ax) + self._check_legend_labels(ax, 
labels=["a", "b (right)", "c"]) + ax = df3.plot(kind="bar", legend=True, secondary_y="h", ax=ax) self._check_legend_labels( - ax, labels=['a', 'b (right)', 'c', 'g', 'h (right)', 'i']) + ax, labels=["a", "b (right)", "c", "g", "h (right)", "i"] + ) # Time Series - ind = date_range('1/1/2014', periods=3) - df = DataFrame(randn(3, 3), columns=['a', 'b', 'c'], index=ind) - df2 = DataFrame(randn(3, 3), columns=['d', 'e', 'f'], index=ind) - df3 = DataFrame(randn(3, 3), columns=['g', 'h', 'i'], index=ind) - ax = df.plot(legend=True, secondary_y='b') - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ind = date_range("1/1/2014", periods=3) + df = DataFrame(randn(3, 3), columns=["a", "b", "c"], index=ind) + df2 = DataFrame(randn(3, 3), columns=["d", "e", "f"], index=ind) + df3 = DataFrame(randn(3, 3), columns=["g", "h", "i"], index=ind) + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df3.plot(legend=True, ax=ax) - self._check_legend_labels( - ax, labels=['a', 'b (right)', 'c', 'g', 'h', 'i']) + self._check_legend_labels(ax, labels=["a", "b (right)", "c", "g", "h", "i"]) # scatter - ax = df.plot.scatter(x='a', y='b', label='data1') - self._check_legend_labels(ax, labels=['data1']) - ax = df2.plot.scatter(x='d', y='e', legend=False, label='data2', ax=ax) - self._check_legend_labels(ax, labels=['data1']) - ax = df3.plot.scatter(x='g', y='h', label='data3', ax=ax) - self._check_legend_labels(ax, labels=['data1', 'data3']) + ax = df.plot.scatter(x="a", y="b", label="data1") + self._check_legend_labels(ax, labels=["data1"]) + ax = df2.plot.scatter(x="d", y="e", legend=False, label="data2", ax=ax) + self._check_legend_labels(ax, labels=["data1"]) + ax = df3.plot.scatter(x="g", y="h", label="data3", ax=ax) + self._check_legend_labels(ax, labels=["data1", "data3"]) # ensure label args pass through and # index name does not mutate # column names don't mutate - df5 = df.set_index('a') - ax = df5.plot(y='b') - self._check_legend_labels(ax, labels=['b']) - ax = df5.plot(y='b', label='LABEL_b') - self._check_legend_labels(ax, labels=['LABEL_b']) - self._check_text_labels(ax.xaxis.get_label(), 'a') - ax = df5.plot(y='c', label='LABEL_c', ax=ax) - self._check_legend_labels(ax, labels=['LABEL_b', 'LABEL_c']) - assert df5.columns.tolist() == ['b', 'c'] + df5 = df.set_index("a") + ax = df5.plot(y="b") + self._check_legend_labels(ax, labels=["b"]) + ax = df5.plot(y="b", label="LABEL_b") + self._check_legend_labels(ax, labels=["LABEL_b"]) + self._check_text_labels(ax.xaxis.get_label(), "a") + ax = df5.plot(y="c", label="LABEL_c", ax=ax) + self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) + assert df5.columns.tolist() == ["b", "c"] def test_legend_name(self): - multi = DataFrame(randn(4, 4), - columns=[np.array(['a', 'a', 'b', 'b']), - np.array(['x', 'y', 'x', 'y'])]) - multi.columns.names = ['group', 'individual'] + multi = DataFrame( + randn(4, 4), + columns=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], + ) + multi.columns.names = ["group", "individual"] ax = multi.plot() leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") df = DataFrame(randn(5, 5)) ax = df.plot(legend=True, ax=ax) leg_title = ax.legend_.get_title() - 
self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") - df.columns.name = 'new' + df.columns.name = "new" ax = df.plot(legend=False, ax=ax) leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") ax = df.plot(legend=True, ax=ax) leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'new') + self._check_text_labels(leg_title, "new") @pytest.mark.slow def test_no_legend(self): - kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] - df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) for kind in kinds: @@ -1820,17 +1921,20 @@ def test_no_legend(self): @pytest.mark.slow def test_style_by_column(self): import matplotlib.pyplot as plt + fig = plt.gcf() df = DataFrame(randn(100, 3)) - for markers in [{0: '^', - 1: '+', - 2: 'o'}, {0: '^', - 1: '+'}, ['^', '+', 'o'], ['^', '+']]: + for markers in [ + {0: "^", 1: "+", 2: "o"}, + {0: "^", 1: "+"}, + ["^", "+", "o"], + ["^", "+"], + ]: fig.clf() fig.add_subplot(111) ax = df.plot(style=markers) - for i, l in enumerate(ax.get_lines()[:len(markers)]): + for i, l in enumerate(ax.get_lines()[: len(markers)]): assert l.get_marker() == markers[i] @pytest.mark.slow @@ -1840,13 +1944,13 @@ def test_line_label_none(self): assert ax.get_legend() is None ax = s.plot(legend=True) - assert ax.get_legend().get_texts()[0].get_text() == 'None' + assert ax.get_legend().get_texts()[0].get_text() == "None" @pytest.mark.slow def test_line_colors(self): from matplotlib import cm - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(randn(5, 5)) ax = df.plot(color=custom_colors) @@ -1862,7 +1966,7 @@ def test_line_colors(self): tm.close() - ax = df.plot(colormap='jet') + ax = df.plot(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=rgba_colors) tm.close() @@ -1874,28 +1978,28 @@ def test_line_colors(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - ax = df.loc[:, [0]].plot(color='DodgerBlue') - self._check_colors(ax.lines, linecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot(color="DodgerBlue") + self._check_colors(ax.lines, linecolors=["DodgerBlue"]) - ax = df.plot(color='red') - self._check_colors(ax.get_lines(), linecolors=['red'] * 5) + ax = df.plot(color="red") + self._check_colors(ax.get_lines(), linecolors=["red"] * 5) tm.close() # GH 10299 - custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] ax = df.plot(color=custom_colors) self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() with pytest.raises(ValueError): # Color contains shorthand hex value results in ValueError - custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] # Forced show plot _check_plot_works(df.plot, color=custom_colors) @pytest.mark.slow def test_dont_modify_colors(self): - colors = ['r', 'g', 'b'] + colors = ["r", "g", "b"] pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 @@ -1903,6 +2007,7 @@ def test_dont_modify_colors(self): def test_line_colors_and_styles_subplots(self): # GH 9894 from matplotlib import cm + default_colors = self._unpack_cycler(self.plt.rcParams) df = 
DataFrame(randn(5, 5)) @@ -1914,18 +2019,18 @@ def test_line_colors_and_styles_subplots(self): tm.close() # single color char - axes = df.plot(subplots=True, color='k') + axes = df.plot(subplots=True, color="k") for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['k']) + self._check_colors(ax.get_lines(), linecolors=["k"]) tm.close() # single color str - axes = df.plot(subplots=True, color='green') + axes = df.plot(subplots=True, color="green") for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['green']) + self._check_colors(ax.get_lines(), linecolors=["green"]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" axes = df.plot(color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1937,7 +2042,7 @@ def test_line_colors_and_styles_subplots(self): tm.close() # GH 10299 - custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] axes = df.plot(color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1945,14 +2050,14 @@ def test_line_colors_and_styles_subplots(self): with pytest.raises(ValueError): # Color contains shorthand hex value results in ValueError - custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] # Forced show plot # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): _check_plot_works(df.plot, color=custom_colors, subplots=True) rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ['jet', cm.jet]: + for cmap in ["jet", cm.jet]: axes = df.plot(colormap=cmap, subplots=True) for ax, c in zip(axes, rgba_colors): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1960,17 +2065,17 @@ def test_line_colors_and_styles_subplots(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(color='DodgerBlue', subplots=True) - self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) + axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) # single character style - axes = df.plot(style='r', subplots=True) + axes = df.plot(style="r", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['r']) + self._check_colors(ax.get_lines(), linecolors=["r"]) tm.close() # list of styles - styles = list('rgcby') + styles = list("rgcby") axes = df.plot(style=styles, subplots=True) for ax, c in zip(axes, styles): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1981,7 +2086,7 @@ def test_area_colors(self): from matplotlib import cm from matplotlib.collections import PolyCollection - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(rand(5, 5)) ax = df.plot.area(color=custom_colors) @@ -1996,7 +2101,7 @@ def test_area_colors(self): assert h.get_alpha() is None tm.close() - ax = df.plot.area(colormap='jet') + ax = df.plot.area(colormap="jet") jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=jet_colors) poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] @@ -2017,7 +2122,7 @@ def test_area_colors(self): handles, labels = ax.get_legend_handles_labels() linecolors = jet_with_alpha - 
self._check_colors(handles[:len(jet_colors)], linecolors=linecolors) + self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) for h in handles: assert h.get_alpha() == 0.5 @@ -2030,14 +2135,15 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" ax = df.plot.hist(color=custom_colors) self._check_colors(ax.patches[::10], facecolors=custom_colors) tm.close() from matplotlib import cm + # Test str -> colormap functionality - ax = df.plot.hist(colormap='jet') + ax = df.plot.hist(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] self._check_colors(ax.patches[::10], facecolors=rgba_colors) tm.close() @@ -2048,11 +2154,11 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=rgba_colors) tm.close() - ax = df.loc[:, [0]].plot.hist(color='DodgerBlue') - self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - ax = df.plot(kind='hist', color='green') - self._check_colors(ax.patches[::10], facecolors=['green'] * 5) + ax = df.plot(kind="hist", color="green") + self._check_colors(ax.patches[::10], facecolors=["green"] * 5) tm.close() @pytest.mark.slow @@ -2060,14 +2166,14 @@ def test_hist_colors(self): def test_kde_colors(self): from matplotlib import cm - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(rand(5, 5)) ax = df.plot.kde(color=custom_colors) self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() - ax = df.plot.kde(colormap='jet') + ax = df.plot.kde(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=rgba_colors) tm.close() @@ -2080,129 +2186,133 @@ def test_kde_colors(self): @td.skip_if_no_scipy def test_kde_colors_and_styles_subplots(self): from matplotlib import cm + default_colors = self._unpack_cycler(self.plt.rcParams) df = DataFrame(randn(5, 5)) - axes = df.plot(kind='kde', subplots=True) + axes = df.plot(kind="kde", subplots=True) for ax, c in zip(axes, list(default_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() # single color char - axes = df.plot(kind='kde', color='k', subplots=True) + axes = df.plot(kind="kde", color="k", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['k']) + self._check_colors(ax.get_lines(), linecolors=["k"]) tm.close() # single color str - axes = df.plot(kind='kde', color='red', subplots=True) + axes = df.plot(kind="kde", color="red", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['red']) + self._check_colors(ax.get_lines(), linecolors=["red"]) tm.close() - custom_colors = 'rgcby' - axes = df.plot(kind='kde', color=custom_colors, subplots=True) + custom_colors = "rgcby" + axes = df.plot(kind="kde", color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ['jet', cm.jet]: - axes = df.plot(kind='kde', colormap=cmap, subplots=True) + for cmap in ["jet", cm.jet]: + axes = df.plot(kind="kde", colormap=cmap, subplots=True) for ax, c in zip(axes, rgba_colors): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = 
df.loc[:, [0]].plot(kind='kde', color='DodgerBlue', - subplots=True) - self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) + axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) # single character style - axes = df.plot(kind='kde', style='r', subplots=True) + axes = df.plot(kind="kde", style="r", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['r']) + self._check_colors(ax.get_lines(), linecolors=["r"]) tm.close() # list of styles - styles = list('rgcby') - axes = df.plot(kind='kde', style=styles, subplots=True) + styles = list("rgcby") + axes = df.plot(kind="kde", style=styles, subplots=True) for ax, c in zip(axes, styles): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() @pytest.mark.slow def test_boxplot_colors(self): - def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', - fliers_c=None): + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): # TODO: outside this func? if fliers_c is None: - fliers_c = 'k' - self._check_colors(bp['boxes'], - linecolors=[box_c] * len(bp['boxes'])) - self._check_colors(bp['whiskers'], - linecolors=[whiskers_c] * len(bp['whiskers'])) - self._check_colors(bp['medians'], - linecolors=[medians_c] * len(bp['medians'])) - self._check_colors(bp['fliers'], - linecolors=[fliers_c] * len(bp['fliers'])) - self._check_colors(bp['caps'], - linecolors=[caps_c] * len(bp['caps'])) + fliers_c = "k" + self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) + self._check_colors( + bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) + ) + self._check_colors( + bp["medians"], linecolors=[medians_c] * len(bp["medians"]) + ) + self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) + self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) default_colors = self._unpack_cycler(self.plt.rcParams) df = DataFrame(randn(5, 5)) - bp = df.plot.box(return_type='dict') - _check_colors(bp, default_colors[0], default_colors[0], - default_colors[2]) + bp = df.plot.box(return_type="dict") + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) tm.close() - dict_colors = dict(boxes='#572923', whiskers='#982042', - medians='#804823', caps='#123456') - bp = df.plot.box(color=dict_colors, sym='r+', return_type='dict') - _check_colors(bp, dict_colors['boxes'], dict_colors['whiskers'], - dict_colors['medians'], dict_colors['caps'], 'r') + dict_colors = dict( + boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" + ) + bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") + _check_colors( + bp, + dict_colors["boxes"], + dict_colors["whiskers"], + dict_colors["medians"], + dict_colors["caps"], + "r", + ) tm.close() # partial colors - dict_colors = dict(whiskers='c', medians='m') - bp = df.plot.box(color=dict_colors, return_type='dict') - _check_colors(bp, default_colors[0], 'c', 'm') + dict_colors = dict(whiskers="c", medians="m") + bp = df.plot.box(color=dict_colors, return_type="dict") + _check_colors(bp, default_colors[0], "c", "m") tm.close() from matplotlib import cm + # Test str -> colormap functionality - bp = df.plot.box(colormap='jet', return_type='dict') + bp = df.plot.box(colormap="jet", return_type="dict") jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) tm.close() # Test colormap functionality - bp = df.plot.box(colormap=cm.jet, 
return_type='dict') + bp = df.plot.box(colormap=cm.jet, return_type="dict") _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) tm.close() # string color is applied to all artists except fliers - bp = df.plot.box(color='DodgerBlue', return_type='dict') - _check_colors(bp, 'DodgerBlue', 'DodgerBlue', 'DodgerBlue', - 'DodgerBlue') + bp = df.plot.box(color="DodgerBlue", return_type="dict") + _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") # tuple is also applied to all artists except fliers - bp = df.plot.box(color=(0, 1, 0), sym='#123456', return_type='dict') - _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), - (0, 1, 0), '#123456') + bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") with pytest.raises(ValueError): # Color contains invalid key results in ValueError - df.plot.box(color=dict(boxes='red', xxxx='blue')) + df.plot.box(color=dict(boxes="red", xxxx="blue")) def test_default_color_cycle(self): import matplotlib.pyplot as plt import cycler - colors = list('rgbk') - plt.rcParams['axes.prop_cycle'] = cycler.cycler('color', colors) + + colors = list("rgbk") + plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) df = DataFrame(randn(5, 3)) ax = df.plot() @@ -2211,11 +2321,11 @@ def test_default_color_cycle(self): self._check_colors(ax.get_lines(), linecolors=expected) def test_unordered_ts(self): - df = DataFrame(np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), - date(2012, 9, 1), - date(2012, 8, 1)], - columns=['test']) + df = DataFrame( + np.array([3.0, 2.0, 1.0]), + index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + columns=["test"], + ) ax = df.plot() xticks = ax.lines[0].get_xdata() assert xticks[0] < xticks[1] @@ -2224,17 +2334,17 @@ def test_unordered_ts(self): @td.skip_if_no_scipy def test_kind_both_ways(self): - df = DataFrame({'x': [1, 2, 3]}) + df = DataFrame({"x": [1, 2, 3]}) for kind in plotting.PlotAccessor._common_kinds: df.plot(kind=kind) getattr(df.plot, kind)() - for kind in ['scatter', 'hexbin']: - df.plot('x', 'x', kind=kind) - getattr(df.plot, kind)('x', 'x') + for kind in ["scatter", "hexbin"]: + df.plot("x", "x", kind=kind) + getattr(df.plot, kind)("x", "x") def test_all_invalid_plot_data(self): - df = DataFrame(list('abcd')) + df = DataFrame(list("abcd")) for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" @@ -2245,7 +2355,7 @@ def test_all_invalid_plot_data(self): def test_partially_invalid_plot_data(self): with tm.RNGContext(42): df = DataFrame(randn(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = 'a' + df[np.random.rand(df.shape[0]) > 0.5] = "a" for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" @@ -2254,9 +2364,9 @@ def test_partially_invalid_plot_data(self): with tm.RNGContext(42): # area plot doesn't support positive/negative mixed data - kinds = ['area'] + kinds = ["area"] df = DataFrame(rand(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = 'a' + df[np.random.rand(df.shape[0]) > 0.5] = "a" for kind in kinds: with pytest.raises(TypeError): df.plot(kind=kind) @@ -2264,50 +2374,50 @@ def test_partially_invalid_plot_data(self): def test_invalid_kind(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): - df.plot(kind='aasdf') - - @pytest.mark.parametrize("x,y,lbl", [ - (['B', 'C'], 'A', 'a'), - (['A'], ['B', 'C'], ['b', 'c']), - ('A', ['B', 'C'], 'badlabel') - ]) + df.plot(kind="aasdf") + + 
@pytest.mark.parametrize( + "x,y,lbl", + [ + (["B", "C"], "A", "a"), + (["A"], ["B", "C"], ["b", "c"]), + ("A", ["B", "C"], "badlabel"), + ], + ) def test_invalid_xy_args(self, x, y, lbl): # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]}) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) with pytest.raises(ValueError): df.plot(x=x, y=y, label=lbl) - @pytest.mark.parametrize("x,y", [ - ('A', 'B'), - (['A'], 'B') - ]) + @pytest.mark.parametrize("x,y", [("A", "B"), (["A"], "B")]) def test_invalid_xy_args_dup_cols(self, x, y): # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list('AAB')) + df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list("AAB")) with pytest.raises(ValueError): df.plot(x=x, y=y) - @pytest.mark.parametrize("x,y,lbl,colors", [ - ('A', ['B'], ['b'], ['red']), - ('A', ['B', 'C'], ['b', 'c'], ['red', 'blue']), - (0, [1, 2], ['bokeh', 'cython'], ['green', 'yellow']) - ]) + @pytest.mark.parametrize( + "x,y,lbl,colors", + [ + ("A", ["B"], ["b"], ["red"]), + ("A", ["B", "C"], ["b", "c"], ["red", "blue"]), + (0, [1, 2], ["bokeh", "cython"], ["green", "yellow"]), + ], + ) def test_y_listlike(self, x, y, lbl, colors): # GH 19699: tests list-like y and verifies lbls & colors - df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]}) - _check_plot_works(df.plot, x='A', y=y, label=lbl) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + _check_plot_works(df.plot, x="A", y=y, label=lbl) ax = df.plot(x=x, y=y, label=lbl, color=colors) assert len(ax.lines) == len(y) self._check_colors(ax.get_lines(), linecolors=colors) - @pytest.mark.parametrize("x,y,colnames", [ - (0, 1, ['A', 'B']), - (1, 0, [0, 1]) - ]) + @pytest.mark.parametrize("x,y,colnames", [(0, 1, ["A", "B"]), (1, 0, [0, 1])]) def test_xy_args_integer(self, x, y, colnames): # GH 20056: tests integer args for xy and checks col names - df = DataFrame({"A": [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) df.columns = colnames _check_plot_works(df.plot, x=x, y=y) @@ -2315,12 +2425,12 @@ def test_xy_args_integer(self, x, y, colnames): def test_hexbin_basic(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', gridsize=10) + ax = df.plot.hexbin(x="A", y="B", gridsize=10) # TODO: need better way to test. This just does existence. 
assert len(ax.collections) == 1 # GH 6951 - axes = df.plot.hexbin(x='A', y='B', subplots=True) + axes = df.plot.hexbin(x="A", y="B", subplots=True) # hexbin should have 2 axes in the figure, 1 for plotting and another # is colorbar assert len(axes[0].figure.axes) == 2 @@ -2331,10 +2441,10 @@ def test_hexbin_basic(self): def test_hexbin_with_c(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', C='C') + ax = df.plot.hexbin(x="A", y="B", C="C") assert len(ax.collections) == 1 - ax = df.plot.hexbin(x='A', y='B', C='C', reduce_C_function=np.std) + ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std) assert len(ax.collections) == 1 @pytest.mark.slow @@ -2342,38 +2452,41 @@ def test_hexbin_cmap(self): df = self.hexbin_df # Default to BuGn - ax = df.plot.hexbin(x='A', y='B') - assert ax.collections[0].cmap.name == 'BuGn' + ax = df.plot.hexbin(x="A", y="B") + assert ax.collections[0].cmap.name == "BuGn" - cm = 'cubehelix' - ax = df.plot.hexbin(x='A', y='B', colormap=cm) + cm = "cubehelix" + ax = df.plot.hexbin(x="A", y="B", colormap=cm) assert ax.collections[0].cmap.name == cm @pytest.mark.slow def test_no_color_bar(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', colorbar=None) + ax = df.plot.hexbin(x="A", y="B", colorbar=None) assert ax.collections[0].colorbar is None @pytest.mark.slow def test_allow_cmap(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', cmap='YlGn') - assert ax.collections[0].cmap.name == 'YlGn' + ax = df.plot.hexbin(x="A", y="B", cmap="YlGn") + assert ax.collections[0].cmap.name == "YlGn" with pytest.raises(TypeError): - df.plot.hexbin(x='A', y='B', cmap='YlGn', colormap='BuGn') + df.plot.hexbin(x="A", y="B", cmap="YlGn", colormap="BuGn") @pytest.mark.slow def test_pie_df(self): - df = DataFrame(np.random.rand(5, 3), columns=['X', 'Y', 'Z'], - index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame( + np.random.rand(5, 3), + columns=["X", "Y", "Z"], + index=["a", "b", "c", "d", "e"], + ) with pytest.raises(ValueError): df.plot.pie() - ax = _check_plot_works(df.plot.pie, y='Y') + ax = _check_plot_works(df.plot.pie, y="Y") self._check_text_labels(ax.texts, df.index) ax = _check_plot_works(df.plot.pie, y=2) @@ -2381,20 +2494,19 @@ def test_pie_df(self): # _check_plot_works adds an ax so catch warning. 
see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.pie, - subplots=True) + axes = _check_plot_works(df.plot.pie, subplots=True) assert len(axes) == len(df.columns) for ax in axes: self._check_text_labels(ax.texts, df.index) for ax, ylabel in zip(axes, df.columns): assert ax.get_ylabel() == ylabel - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.pie, - subplots=True, labels=labels, - colors=color_args) + axes = _check_plot_works( + df.plot.pie, subplots=True, labels=labels, colors=color_args + ) assert len(axes) == len(df.columns) for ax in axes: @@ -2408,24 +2520,25 @@ def test_pie_df_nan(self): fig, axes = self.plt.subplots(ncols=4) df.plot.pie(subplots=True, ax=axes, legend=True) - base_expected = ['0', '1', '2', '3'] + base_expected = ["0", "1", "2", "3"] for i, ax in enumerate(axes): expected = list(base_expected) # force copy - expected[i] = '' + expected[i] = "" result = [x.get_text() for x in ax.texts] assert result == expected # legend labels # NaN's not included in legend with subplots # see https://github.com/pandas-dev/pandas/issues/8390 - assert ([x.get_text() for x in ax.get_legend().get_texts()] == - base_expected[:i] + base_expected[i + 1:]) + assert [x.get_text() for x in ax.get_legend().get_texts()] == base_expected[ + :i + ] + base_expected[i + 1 :] @pytest.mark.slow def test_errorbar_plot(self): with warnings.catch_warnings(): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} df_err = DataFrame(d_err) # check line plots @@ -2436,58 +2549,57 @@ def test_errorbar_plot(self): ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) self._check_has_errorbars(ax, xerr=0, yerr=2) - kinds = ['line', 'bar', 'barh'] + kinds = ["line", "bar", "barh"] for kind in kinds: - ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind) + ax = _check_plot_works(df.plot, yerr=df_err["x"], kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, - kind=kind) + ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err['x'], - xerr=df_err['x'], - kind=kind) + ax = _check_plot_works( + df.plot, yerr=df_err["x"], xerr=df_err["x"], kind=kind + ) self._check_has_errorbars(ax, xerr=2, yerr=2) ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) # _check_plot_works adds an ax so catch warning. 
see GH #13188 - axes = _check_plot_works(df.plot, - yerr=df_err, xerr=df_err, - subplots=True, - kind=kind) + axes = _check_plot_works( + df.plot, yerr=df_err, xerr=df_err, subplots=True, kind=kind + ) self._check_has_errorbars(axes, xerr=1, yerr=1) - ax = _check_plot_works((df + 1).plot, yerr=df_err, - xerr=df_err, kind='bar', log=True) + ax = _check_plot_works( + (df + 1).plot, yerr=df_err, xerr=df_err, kind="bar", log=True + ) self._check_has_errorbars(ax, xerr=2, yerr=2) # yerr is raw error values - ax = _check_plot_works(df['y'].plot, yerr=np.ones(12) * 0.4) + ax = _check_plot_works(df["y"].plot, yerr=np.ones(12) * 0.4) self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) self._check_has_errorbars(ax, xerr=0, yerr=2) # yerr is iterator import itertools - ax = _check_plot_works(df.plot, - yerr=itertools.repeat(0.1, len(df))) + + ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) self._check_has_errorbars(ax, xerr=0, yerr=2) # yerr is column name - for yerr in ['yerr', '誤差']: + for yerr in ["yerr", "誤差"]: s_df = df.copy() s_df[yerr] = np.ones(12) * 0.2 ax = _check_plot_works(s_df.plot, yerr=yerr) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr) + ax = _check_plot_works(s_df.plot, y="y", x="x", yerr=yerr) self._check_has_errorbars(ax, xerr=0, yerr=1) with pytest.raises(ValueError): df.plot(yerr=np.random.randn(11)) - df_err = DataFrame({'x': ['zzz'] * 12, 'y': ['zzz'] * 12}) + df_err = DataFrame({"x": ["zzz"] * 12, "y": ["zzz"] * 12}) with pytest.raises((ValueError, TypeError)): df.plot(yerr=df_err) @@ -2505,20 +2617,20 @@ def test_errorbar_with_integer_column_names(self): def test_errorbar_with_partial_columns(self): df = DataFrame(np.random.randn(10, 3)) df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) - kinds = ['line', 'bar'] + kinds = ["line", "bar"] for kind in kinds: ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - ix = date_range('1/1/2000', periods=10, freq='M') + ix = date_range("1/1/2000", periods=10, freq="M") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) - ax = _check_plot_works(df.plot, yerr=df_err, kind='line') + ax = _check_plot_works(df.plot, yerr=df_err, kind="line") self._check_has_errorbars(ax, xerr=0, yerr=2) - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) - d_err = {'x': np.ones(12) * 0.2, 'z': np.ones(12) * 0.4} + d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4} df_err = DataFrame(d_err) for err in [d_err, df_err]: ax = _check_plot_works(df.plot, yerr=err) @@ -2528,32 +2640,31 @@ def test_errorbar_with_partial_columns(self): def test_errorbar_timeseries(self): with warnings.catch_warnings(): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} # check time-series plots - ix = date_range('1/1/2000', '1/1/2001', freq='M') + ix = date_range("1/1/2000", "1/1/2001", freq="M") tdf = DataFrame(d, index=ix) tdf_err = DataFrame(d_err, index=ix) - kinds = ['line', 'bar', 'barh'] + kinds = ["line", "bar", "barh"] for kind in kinds: ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) 
self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'], - kind=kind) + ax = _check_plot_works(tdf.plot, y="y", yerr=tdf_err["x"], kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind) + ax = _check_plot_works(tdf.plot, y="y", yerr="x", kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) # _check_plot_works adds an ax so catch warning. see GH #13188 - axes = _check_plot_works(tdf.plot, - kind=kind, yerr=tdf_err, - subplots=True) + axes = _check_plot_works( + tdf.plot, kind=kind, yerr=tdf_err, subplots=True + ) self._check_has_errorbars(axes, xerr=0, yerr=1) def test_errorbar_asymmetrical(self): @@ -2576,8 +2687,7 @@ def test_errorbar_asymmetrical(self): tm.close() def test_table(self): - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) _check_plot_works(df.plot, table=True) _check_plot_works(df.plot, table=df) @@ -2587,26 +2697,24 @@ def test_table(self): assert len(ax.tables) == 1 def test_errorbar_scatter(self): - df = DataFrame( - np.random.randn(5, 2), index=range(5), columns=['x', 'y']) - df_err = DataFrame(np.random.randn(5, 2) / 5, - index=range(5), columns=['x', 'y']) + df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) + df_err = DataFrame( + np.random.randn(5, 2) / 5, index=range(5), columns=["x", "y"] + ) - ax = _check_plot_works(df.plot.scatter, x='x', y='y') + ax = _check_plot_works(df.plot.scatter, x="x", y="y") self._check_has_errorbars(ax, xerr=0, yerr=0) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=0) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', yerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", yerr=df_err) self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err, - yerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err, yerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=1) - def _check_errorbar_color(containers, expected, has_err='has_xerr'): + def _check_errorbar_color(containers, expected, has_err="has_xerr"): lines = [] - errs = [c.lines - for c in ax.containers if getattr(c, has_err, False)][0] + errs = [c.lines for c in ax.containers if getattr(c, has_err, False)][0] for el in errs: if is_list_like(el): lines.extend(el) @@ -2614,19 +2722,19 @@ def _check_errorbar_color(containers, expected, has_err='has_xerr'): lines.append(el) err_lines = [x for x in lines if x in ax.collections] self._check_colors( - err_lines, linecolors=np.array([expected] * len(err_lines))) + err_lines, linecolors=np.array([expected] * len(err_lines)) + ) # GH 8081 - df = DataFrame( - np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e']) - ax = df.plot.scatter(x='a', y='b', xerr='d', yerr='e', c='red') + df = DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"]) + ax = df.plot.scatter(x="a", y="b", xerr="d", yerr="e", c="red") self._check_has_errorbars(ax, xerr=1, yerr=1) - _check_errorbar_color(ax.containers, 'red', has_err='has_xerr') - _check_errorbar_color(ax.containers, 'red', has_err='has_yerr') + _check_errorbar_color(ax.containers, "red", has_err="has_xerr") + 
_check_errorbar_color(ax.containers, "red", has_err="has_yerr") - ax = df.plot.scatter(x='a', y='b', yerr='e', color='green') + ax = df.plot.scatter(x="a", y="b", yerr="e", color="green") self._check_has_errorbars(ax, xerr=0, yerr=1) - _check_errorbar_color(ax.containers, 'green', has_err='has_yerr') + _check_errorbar_color(ax.containers, "green", has_err="has_yerr") @pytest.mark.slow def test_sharex_and_ax(self): @@ -2634,13 +2742,18 @@ def test_sharex_and_ax(self): # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - df = DataFrame({"a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) def _check(axes): for ax in axes: @@ -2648,12 +2761,10 @@ def _check(axes): self._check_visible(ax.get_yticklabels(), visible=True) for ax in [axes[0], axes[2]]: self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) for ax in [axes[1], axes[3]]: self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) for ax in axes: df.plot(x="a", y="b", title="title", ax=ax, sharex=True) @@ -2689,17 +2800,20 @@ def test_sharey_and_ax(self): gs, axes = _generate_4_axes_via_gridspec() - df = DataFrame({"a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) def _check(axes): for ax in axes: assert len(ax.lines) == 1 self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) for ax in [axes[0], axes[1]]: self._check_visible(ax.get_yticklabels(), visible=True) for ax in [axes[2], axes[3]]: @@ -2741,10 +2855,10 @@ def test_memory_leak(self): for kind in plotting.PlotAccessor._all_kinds: args = {} - if kind in ['hexbin', 'scatter', 'pie']: + if kind in ["hexbin", "scatter", "pie"]: df = self.hexbin_df - args = {'x': 'A', 'y': 'B'} - elif kind == 'area': + args = {"x": "A", "y": "B"} + elif kind == "area": df = self.tdf.abs() else: df = self.tdf @@ -2768,9 +2882,11 @@ def test_df_subplots_patterns_minorticks(self): # GH 10657 import matplotlib.pyplot as plt - df = DataFrame(np.random.randn(10, 2), - index=date_range('1/1/2000', periods=10), - columns=list('AB')) + df = DataFrame( + np.random.randn(10, 2), + index=date_range("1/1/2000", periods=10), + columns=list("AB"), + ) # shared subplots fig, axes = plt.subplots(2, 1, sharex=True) @@ -2814,11 +2930,9 @@ def test_df_gridspec_patterns(self): import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec - ts = Series(np.random.randn(10), - index=date_range('1/1/2000', periods=10)) + ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) - df = DataFrame(np.random.randn(10, 2), index=ts.index, - columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), index=ts.index, 
columns=list("AB")) def _get_vertical_grid(): gs = gridspec.GridSpec(3, 1) @@ -2842,8 +2956,7 @@ def _get_horizontal_grid(): for ax in [ax1, ax2]: self._check_visible(ax.get_yticklabels(), visible=True) self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() # subplots=True @@ -2854,15 +2967,13 @@ def _get_horizontal_grid(): for ax in axes: self._check_visible(ax.get_yticklabels(), visible=True) self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() # vertical / subplots / sharex=True / sharey=True ax1, ax2 = _get_vertical_grid() with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, - sharey=True) + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 for ax in [ax1, ax2]: @@ -2878,8 +2989,7 @@ def _get_horizontal_grid(): # horizontal / subplots / sharex=True / sharey=True ax1, ax2 = _get_horizontal_grid() with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, - sharey=True) + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 self._check_visible(axes[0].get_yticklabels(), visible=True) @@ -2902,8 +3012,7 @@ def _get_boxed_grid(): return ax1, ax2, ax3, ax4 axes = _get_boxed_grid() - df = DataFrame(np.random.randn(10, 4), - index=ts.index, columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), index=ts.index, columns=list("ABCD")) axes = df.plot(subplots=True, ax=axes) for ax in axes: assert len(ax.lines) == 1 @@ -2935,14 +3044,16 @@ def _get_boxed_grid(): def test_df_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( - DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}), - plotting.PlotAccessor._dataframe_kinds, kws={'x': 'a', 'y': 'b'}) + DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}), + plotting.PlotAccessor._dataframe_kinds, + kws={"x": "a", "y": "b"}, + ) def test_invalid_colormap(self): - df = DataFrame(randn(3, 2), columns=['A', 'B']) + df = DataFrame(randn(3, 2), columns=["A", "B"]) with pytest.raises(ValueError): - df.plot(colormap='invalid_colormap') + df.plot(colormap="invalid_colormap") def test_plain_axes(self): @@ -2954,14 +3065,15 @@ def test_plain_axes(self): # supplied ax itself is a plain Axes, but because the cmap keyword # a new ax is created for the colorbar -> also multiples axes (GH11520) - df = DataFrame({'a': randn(8), 'b': randn(8)}) + df = DataFrame({"a": randn(8), "b": randn(8)}) fig = self.plt.figure() ax = fig.add_axes((0, 0, 1, 1)) - df.plot(kind='scatter', ax=ax, x='a', y='b', c='a', cmap='hsv') + df.plot(kind="scatter", ax=ax, x="a", y="b", c="a", cmap="hsv") # other examples fig, ax = self.plt.subplots() from mpl_toolkits.axes_grid1 import make_axes_locatable + divider = make_axes_locatable(ax) cax = divider.append_axes("right", size="5%", pad=0.05) Series(rand(10)).plot(ax=ax) @@ -2969,12 +3081,14 @@ def test_plain_axes(self): fig, ax = self.plt.subplots() from mpl_toolkits.axes_grid1.inset_locator import inset_axes - iax = inset_axes(ax, width="30%", height=1., loc=3) + + iax = inset_axes(ax, width="30%", height=1.0, loc=3) 
Series(rand(10)).plot(ax=ax) Series(rand(10)).plot(ax=iax) def test_passed_bar_colors(self): import matplotlib as mpl + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] colormap = mpl.colors.ListedColormap(color_tuples) barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) @@ -2982,62 +3096,63 @@ def test_passed_bar_colors(self): def test_rcParams_bar_colors(self): import matplotlib as mpl + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - with mpl.rc_context( - rc={'axes.prop_cycle': mpl.cycler("color", color_tuples)}): + with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] - @pytest.mark.parametrize('method', ['line', 'barh', 'bar']) + @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 - df = (pd.DataFrame(np.random.randn(15, 2), - columns=list('AB')) - .assign(C=lambda df: df.B.cumsum()) - .assign(D=lambda df: df.C * 1.1)) + df = ( + pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) + .assign(C=lambda df: df.B.cumsum()) + .assign(D=lambda df: df.C * 1.1) + ) fontsize = 20 - sy = ['C', 'D'] + sy = ["C", "D"] - kwargs = dict(secondary_y=sy, fontsize=fontsize, - mark_right=True) + kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True) ax = getattr(df.plot, method)(**kwargs) - self._check_ticks_props(axes=ax.right_ax, - ylabelsize=fontsize) + self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) @pytest.mark.slow def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 - df = pd.DataFrame({'sales': [3, 2, 3], - 'visits': [20, 42, 28], - 'day': ['Monday', 'Tuesday', 'Wednesday']}) - ax = df.plot.area(x='day') + df = pd.DataFrame( + { + "sales": [3, 2, 3], + "visits": [20, 42, 28], + "day": ["Monday", "Tuesday", "Wednesday"], + } + ) + ax = df.plot.area(x="day") ax.set_xlim(-1, 3) xticklabels = [t.get_text() for t in ax.get_xticklabels()] labels_position = dict(zip(xticklabels, ax.get_xticks())) # Testing if the label stayed at the right position - assert labels_position['Monday'] == 0.0 - assert labels_position['Tuesday'] == 1.0 - assert labels_position['Wednesday'] == 2.0 + assert labels_position["Monday"] == 0.0 + assert labels_position["Tuesday"] == 1.0 + assert labels_position["Wednesday"] == 2.0 @pytest.mark.slow def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = pd.DataFrame(np.random.randn(4, 2), - columns=['A', 'B'], - index=index) + df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) ax = df.plot() ax.set_xlim(-1, 4) xticklabels = [t.get_text() for t in ax.get_xticklabels()] labels_position = dict(zip(xticklabels, ax.get_xticks())) # Testing if the label stayed at the right position - assert labels_position['(2012, 1)'] == 0.0 - assert labels_position['(2012, 2)'] == 1.0 - assert labels_position['(2013, 1)'] == 2.0 - assert labels_position['(2013, 2)'] == 3.0 + assert labels_position["(2012, 1)"] == 0.0 + assert labels_position["(2012, 2)"] == 1.0 + assert labels_position["(2013, 1)"] == 2.0 + assert labels_position["(2013, 2)"] == 3.0 def _generate_4_axes_via_gridspec(): diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 
5a5ee75928c977..bb1747710fe187 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -14,13 +14,12 @@ @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - def test_series_groupby_plotting_nominally_works(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) height = Series(np.random.normal(60, 10, size=n)) with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) + gender = np.random.choice(["male", "female"], size=n) weight.groupby(gender).plot() tm.close() @@ -32,44 +31,39 @@ def test_series_groupby_plotting_nominally_works(self): def test_plotting_with_float_index_works(self): # GH 7025 - df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'val': np.random.randn(9)}, - index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) + df = DataFrame( + {"def": [1, 1, 1, 2, 2, 2, 3, 3, 3], "val": np.random.randn(9)}, + index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0], + ) - df.groupby('def')['val'].plot() + df.groupby("def")["val"].plot() tm.close() - df.groupby('def')['val'].apply(lambda x: x.plot()) + df.groupby("def")["val"].apply(lambda x: x.plot()) tm.close() def test_hist_single_row(self): # GH10214 bins = np.arange(80, 100 + 2, 1) - df = DataFrame({"Name": ["AAA", "BBB"], - "ByCol": [1, 2], - "Mark": [85, 89]}) + df = DataFrame({"Name": ["AAA", "BBB"], "ByCol": [1, 2], "Mark": [85, 89]}) df["Mark"].hist(by=df["ByCol"], bins=bins) df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) df["Mark"].hist(by=df["ByCol"], bins=bins) def test_plot_submethod_works(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - df.groupby('z').plot.scatter('x', 'y') + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) + df.groupby("z").plot.scatter("x", "y") tm.close() - df.groupby('z')['x'].plot.line() + df.groupby("z")["x"].plot.line() tm.close() def test_plot_kwargs(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) - res = df.groupby('z').plot(kind='scatter', x='x', y='y') + res = df.groupby("z").plot(kind="scatter", x="x", y="y") # check that a scatter plot is effectively plotted: the axes should # contain a PathCollection from the scatter plot (GH11805) - assert len(res['a'].collections) == 1 + assert len(res["a"].collections) == 1 - res = df.groupby('z').plot.scatter(x='x', y='y') - assert len(res['a'].collections) == 1 + res = df.groupby("z").plot.scatter(x="x", y="y") + assert len(res["a"].collections) == 1 diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 79ce4187680448..14cb2bc9d7b623 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -15,14 +15,14 @@ @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' + self.ts.name = "ts" @pytest.mark.slow def test_hist_legacy(self): @@ -71,47 +71,40 @@ def test_hist_layout_with_by(self): # so we get a warning about an axis being cleared, even # though we don't explicing pass one, see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(2, 1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) 
self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.category, - layout=(4, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(2, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(-1, 4)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.classroom, layout=(2, 2)) + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) y = Series(randn(2)) subplot(121) @@ -131,6 +124,7 @@ def test_hist_by_no_extra_plots(self): @pytest.mark.slow def test_plot_fails_when_ax_differs_from_figure(self): from pylab import figure + fig1 = figure() fig2 = figure() ax1 = fig1.add_subplot(111) @@ -140,10 +134,10 @@ def test_plot_fails_when_ax_differs_from_figure(self): @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @pytest.mark.slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle + with tm.assert_produces_warning(UserWarning): _check_plot_works(self.hist_df.hist) @@ -180,14 +174,16 @@ def test_hist_df_legacy(self): xf, yf = 20, 18 xrot, yrot = 30, 40 axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) xf, yf = 20, 18 xrot, yrot = 30, 40 axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) tm.close() @@ -199,13 +195,13 @@ def test_hist_df_legacy(self): tm.close() ax = ser.hist(log=True) # scale of y must be 'log' - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") tm.close() # propagate attr exception from matplotlib.Axes.hist with pytest.raises(AttributeError): - ser.hist(foo='bar') + ser.hist(foo="bar") @pytest.mark.slow def test_hist_non_numerical_raises(self): @@ -222,20 +218,20 @@ def 
test_hist_layout(self): df = DataFrame(randn(100, 3)) layout_to_expected_size = ( - {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 - {'layout': (2, 2), 'expected_size': (2, 2)}, - {'layout': (4, 1), 'expected_size': (4, 1)}, - {'layout': (1, 4), 'expected_size': (1, 4)}, - {'layout': (3, 3), 'expected_size': (3, 3)}, - {'layout': (-1, 4), 'expected_size': (1, 4)}, - {'layout': (4, -1), 'expected_size': (4, 1)}, - {'layout': (-1, 2), 'expected_size': (2, 2)}, - {'layout': (2, -1), 'expected_size': (2, 2)} + {"layout": None, "expected_size": (2, 2)}, # default is 2x2 + {"layout": (2, 2), "expected_size": (2, 2)}, + {"layout": (4, 1), "expected_size": (4, 1)}, + {"layout": (1, 4), "expected_size": (1, 4)}, + {"layout": (3, 3), "expected_size": (3, 3)}, + {"layout": (-1, 4), "expected_size": (1, 4)}, + {"layout": (4, -1), "expected_size": (4, 1)}, + {"layout": (-1, 2), "expected_size": (2, 2)}, + {"layout": (2, -1), "expected_size": (2, 2)}, ) for layout_test in layout_to_expected_size: - axes = df.hist(layout=layout_test['layout']) - expected = layout_test['expected_size'] + axes = df.hist(layout=layout_test["layout"]) + expected = layout_test["expected_size"] self._check_axes_shape(axes, axes_num=3, layout=expected) # layout too small for all 4 plots @@ -260,15 +256,14 @@ def test_tight_layout(self): @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle from pandas.plotting._matplotlib.hist import _grouped_hist - df = DataFrame(randn(500, 2), columns=['A', 'B']) - df['C'] = np.random.randint(0, 4, 500) - df['D'] = ['X'] * 500 + df = DataFrame(randn(500, 2), columns=["A", "B"]) + df["C"] = np.random.randint(0, 4, 500) + df["D"] = ["X"] * 500 axes = _grouped_hist(df.A, by=df.C) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) @@ -279,7 +274,7 @@ def test_grouped_hist_legacy(self): tm.close() # group by a key with single value - axes = df.hist(by='D', rot=30) + axes = df.hist(by="D", rot=30) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) self._check_ticks_props(axes, xrot=30) @@ -288,29 +283,38 @@ def test_grouped_hist_legacy(self): xf, yf = 20, 18 xrot, yrot = 30, 40 - axes = _grouped_hist(df.A, by=df.C, cumulative=True, - bins=4, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot, density=True) + axes = _grouped_hist( + df.A, + by=df.C, + cumulative=True, + bins=4, + xlabelsize=xf, + xrot=xrot, + ylabelsize=yf, + yrot=yrot, + density=True, + ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) tm.close() axes = _grouped_hist(df.A, by=df.C, log=True) # scale of y must be 'log' - self._check_ax_scales(axes, yaxis='log') + self._check_ax_scales(axes, yaxis="log") tm.close() # propagate attr exception from matplotlib.Axes.hist with pytest.raises(AttributeError): - _grouped_hist(df.A, by=df.C, foo='bar') + _grouped_hist(df.A, by=df.C, foo="bar") with tm.assert_produces_warning(FutureWarning): - df.hist(by='C', figsize='default') + df.hist(by="C", figsize="default") @pytest.mark.slow def test_grouped_hist_legacy2(self): @@ -319,9 +323,8 @@ def test_grouped_hist_legacy2(self): height = Series(np.random.normal(60, 10, size=n)) with 
tm.RNGContext(42): gender_int = np.random.choice([0, 1], size=n) - df_int = DataFrame({'height': height, 'weight': weight, - 'gender': gender_int}) - gb = df_int.groupby('gender') + df_int = DataFrame({"height": height, "weight": weight, "gender": gender_int}) + gb = df_int.groupby("gender") axes = gb.hist() assert len(axes) == 2 assert len(self.plt.get_fignums()) == 2 @@ -332,53 +335,54 @@ def test_grouped_hist_layout(self): df = self.hist_df msg = "Layout of 1x1 must be larger than required size 2" with pytest.raises(ValueError, match=msg): - df.hist(column='weight', by=df.gender, layout=(1, 1)) + df.hist(column="weight", by=df.gender, layout=(1, 1)) msg = "Layout of 1x3 must be larger than required size 4" with pytest.raises(ValueError, match=msg): - df.hist(column='height', by=df.category, layout=(1, 3)) + df.hist(column="height", by=df.category, layout=(1, 3)) msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - df.hist(column='height', by=df.category, layout=(-1, -1)) + df.hist(column="height", by=df.category, layout=(-1, -1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, 1)) + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, 1) + ) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, -1)) + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, -1) + ) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = df.hist(column='height', by=df.category, layout=(4, 1)) + axes = df.hist(column="height", by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.hist(column='height', by=df.category, layout=(-1, 1)) + axes = df.hist(column="height", by=df.category, layout=(-1, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.hist(column='height', by=df.category, - layout=(4, 2), figsize=(12, 8)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + axes = df.hist(column="height", by=df.category, layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) tm.close() # GH 6769 with tm.assert_produces_warning(UserWarning): axes = _check_plot_works( - df.hist, column='height', by='classroom', layout=(2, 2)) + df.hist, column="height", by="classroom", layout=(2, 2) + ) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) # without column with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, by='classroom') + axes = _check_plot_works(df.hist, by="classroom") self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - axes = df.hist(by='gender', layout=(3, 5)) + axes = df.hist(by="gender", layout=(3, 5)) self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) - axes = df.hist(column=['height', 'weight', 'category']) + axes = df.hist(column=["height", "weight", "category"]) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) @pytest.mark.slow @@ -387,11 +391,11 @@ def test_grouped_hist_multiple_axes(self): df = self.hist_df fig, axes = self.plt.subplots(2, 3) - returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) + returned = df.hist(column=["height", "weight", "category"], ax=axes[0]) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[0]) assert 
returned[0].figure is fig - returned = df.hist(by='classroom', ax=axes[1]) + returned = df.hist(by="classroom", ax=axes[1]) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[1]) assert returned[0].figure is fig @@ -399,13 +403,13 @@ def test_grouped_hist_multiple_axes(self): with pytest.raises(ValueError): fig, axes = self.plt.subplots(2, 3) # pass different number of axes from required - axes = df.hist(column='height', ax=axes) + axes = df.hist(column="height", ax=axes) @pytest.mark.slow def test_axis_share_x(self): df = self.hist_df # GH4089 - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True) # share x assert ax1._shared_x_axes.joined(ax1, ax2) @@ -418,7 +422,7 @@ def test_axis_share_x(self): @pytest.mark.slow def test_axis_share_y(self): df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True) # share y assert ax1._shared_y_axes.joined(ax1, ax2) @@ -431,8 +435,7 @@ def test_axis_share_y(self): @pytest.mark.slow def test_axis_share_xy(self): df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, - sharey=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True) # share both x and y assert ax1._shared_x_axes.joined(ax1, ax2) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index b27df946aeacfd..6cb6f818d40fdd 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -28,71 +28,76 @@ def test_import_error_message(): def test_get_accessor_args(): func = plotting._core.PlotAccessor._get_call_args - msg = 'Called plot accessor for type list, expected Series or DataFrame' + msg = "Called plot accessor for type list, expected Series or DataFrame" with pytest.raises(TypeError, match=msg): - func(backend_name='', data=[], args=[], kwargs={}) + func(backend_name="", data=[], args=[], kwargs={}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - x, y, kind, kwargs = func(backend_name='', data=Series(), - args=['line', None], kwargs={}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + x, y, kind, kwargs = func( + backend_name="", data=Series(), args=["line", None], kwargs={} + ) assert x is None assert y is None - assert kind == 'line' - assert kwargs == {'ax': None} - - x, y, kind, kwargs = func(backend_name='', data=DataFrame(), - args=['x'], kwargs={'y': 'y', - 'kind': 'bar', - 'grid': False}) - assert x == 'x' - assert y == 'y' - assert kind == 'bar' - assert kwargs == {'grid': False} - - x, y, kind, kwargs = func(backend_name='pandas.plotting._matplotlib', - data=Series(), args=[], kwargs={}) + assert kind == "line" + assert kwargs == {"ax": None} + + x, y, kind, kwargs = func( + backend_name="", + data=DataFrame(), + args=["x"], + kwargs={"y": "y", "kind": "bar", "grid": False}, + ) + assert x == "x" + assert y == "y" + assert kind == "bar" + assert kwargs == {"grid": False} + + x, y, kind, kwargs = func( + backend_name="pandas.plotting._matplotlib", data=Series(), args=[], kwargs={} + ) assert x is None assert y is None - assert kind == 'line' + assert kind == "line" assert len(kwargs) == 22 @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - 
self.ts.name = 'ts' + self.ts.name = "ts" @pytest.mark.slow def test_autocorrelation_plot(self): from pandas.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, series=self.ts) _check_plot_works(autocorrelation_plot, series=self.ts.values) - ax = autocorrelation_plot(self.ts, label='Test') - self._check_legend_labels(ax, labels=['Test']) + ax = autocorrelation_plot(self.ts, label="Test") + self._check_legend_labels(ax, labels=["Test"]) @pytest.mark.slow def test_lag_plot(self): from pandas.plotting import lag_plot + _check_plot_works(lag_plot, series=self.ts) _check_plot_works(lag_plot, series=self.ts, lag=5) @pytest.mark.slow def test_bootstrap_plot(self): from pandas.plotting import bootstrap_plot + _check_plot_works(bootstrap_plot, series=self.ts, size=10) @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @td.skip_if_no_scipy def test_scatter_matrix_axis(self): scatter_matrix = plotting.scatter_matrix @@ -102,27 +107,27 @@ def test_scatter_matrix_axis(self): # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ['-2', '0', '2'] + expected = ["-2", "0", "2"] self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - df[0] = ((df[0] - 2) / 3) + df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ['-1.0', '-0.5', '0.0'] + expected = ["-1.0", "-0.5", "0.0"] self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves(self, iris): @@ -131,63 +136,76 @@ def test_andrews_curves(self, iris): df = iris - _check_plot_works(andrews_curves, frame=df, class_column='Name') + _check_plot_works(andrews_curves, frame=df, class_column="Name") - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in 
np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) length = 10 - df = DataFrame({"A": random.rand(length), - "B": random.rand(length), - "C": random.rand(length), - "Name": ["A"] * length}) - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) + df = DataFrame( + { + "A": random.rand(length), + "B": random.rand(length), + "C": random.rand(length), + "Name": ["A"] * length, + } + ) + + _check_plot_works(andrews_curves, frame=df, class_column="Name") + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = andrews_curves(df, 'Name', color=colors) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) + + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = andrews_curves(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) with tm.assert_produces_warning(FutureWarning): - andrews_curves(data=df, class_column='Name') + andrews_curves(data=df, class_column="Name") @pytest.mark.slow def test_parallel_coordinates(self, iris): @@ -196,46 +214,49 @@ def test_parallel_coordinates(self, iris): df = iris - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name') + ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = 
_check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', axvlines=False) + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", axvlines=False + ) assert len(ax.get_lines()) == (nlines - nxticks) - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = parallel_coordinates(df, 'Name', color=colors) + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = parallel_coordinates(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(data=df, class_column='Name') + parallel_coordinates(data=df, class_column="Name") with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(df, 'Name', colors=colors) + parallel_coordinates(df, "Name", colors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") @@ -243,18 +264,24 @@ def test_parallel_coordinates_with_sorted_labels(self): """ For #15908 """ from pandas.plotting import parallel_coordinates - df = DataFrame({"feat": [i for i in range(30)], - "class": [2 for _ in range(10)] + - [3 for _ in range(10)] + - [1 for _ in range(10)]}) - ax = parallel_coordinates(df, 'class', sort_labels=True) + df = DataFrame( + { + "feat": [i for i in range(30)], + "class": [2 for _ in range(10)] + + [3 for _ in range(10)] + + [1 for _ in range(10)], + } + ) + ax = parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() - color_label_tuples = \ - zip([polyline.get_color() for polyline in polylines], labels) - ordered_color_label_tuples = sorted(color_label_tuples, - key=lambda x: x[1]) - prev_next_tupels = zip([i for i in ordered_color_label_tuples[0:-1]], - [i for i in ordered_color_label_tuples[1:]]) + color_label_tuples = zip( + [polyline.get_color() for polyline in polylines], labels + ) + ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) + prev_next_tupels = zip( + [i for i in ordered_color_label_tuples[0:-1]], + [i for i in ordered_color_label_tuples[1:]], + ) for prev, nxt in prev_next_tupels: # labels and colors are ordered strictly increasing assert prev[1] < nxt[1] and prev[0] < nxt[0] @@ -265,41 +292,35 @@ def test_radviz(self, iris): from matplotlib import cm df = iris - _check_plot_works(radviz, frame=df, class_column='Name') + _check_plot_works(radviz, frame=df, class_column="Name") - rgba = ('#556270', '#4ECDC4', '#C7F464') - 
ax = _check_plot_works( - radviz, frame=df, class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works(radviz, frame=df, class_column="Name", color=rgba) # skip Circle drawn as ticks - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors( - patches[:10], facecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) - - _check_plot_works(radviz, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) - - colors = [[0., 0., 1., 1.], - [0., 0.5, 1., 1.], - [1., 0., 0., 1.]] - df = DataFrame({"A": [1, 2, 3], - "B": [2, 1, 3], - "C": [3, 2, 1], - "Name": ['b', 'g', 'r']}) - ax = radviz(df, 'Name', color=colors) + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches[:10], facecolors=rgba, mapping=df["Name"][:10]) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + _check_plot_works(radviz, frame=df, class_column="Name", color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cnames, mapping=df["Name"][:10]) + + _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) + + colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] + df = DataFrame( + {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} + ) + ax = radviz(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, facecolors=colors) @pytest.mark.slow def test_subplot_titles(self, iris): - df = iris.drop('Name', axis=1).head() + df = iris.drop("Name", axis=1).head() # Use the column names as the subplot titles title = list(df.columns) @@ -308,8 +329,10 @@ def test_subplot_titles(self, iris): assert [p.get_title() for p in plot] == title # Case len(title) > len(df) - msg = ("The length of `title` must equal the number of columns if" - " using `title` of type `list` and `subplots=True`") + msg = ( + "The length of `title` must equal the number of columns if" + " using `title` of type `list` and `subplots=True`" + ) with pytest.raises(ValueError, match=msg): df.plot(subplots=True, title=title + ["kittens > puppies"]) @@ -318,16 +341,19 @@ def test_subplot_titles(self, iris): df.plot(subplots=True, title=title[:2]) # Case subplots=False and title is of type list - msg = ("Using `title` of type `list` is not supported unless" - " `subplots=True` is passed") + msg = ( + "Using `title` of type `list` is not supported unless" + " `subplots=True` is passed" + ) with pytest.raises(ValueError, match=msg): df.plot(subplots=False, title=title) # Case df with 3 numeric columns but layout of (2,2) - plot = df.drop('SepalWidth', axis=1).plot(subplots=True, layout=(2, 2), - title=title[:-1]) + plot = df.drop("SepalWidth", axis=1).plot( + subplots=True, layout=(2, 2), title=title[:-1] + ) title_list = [ax.get_title() for sublist in plot 
for ax in sublist] - assert title_list == title[:3] + [''] + assert title_list == title[:3] + [""] def test_get_standard_colors_random_seed(self): # GH17525 @@ -342,32 +368,34 @@ def test_get_standard_colors_random_seed(self): # Make sure it produces the same colors every time it's called from pandas.plotting._matplotlib.style import _get_standard_colors - color1 = _get_standard_colors(1, color_type='random') - color2 = _get_standard_colors(1, color_type='random') + + color1 = _get_standard_colors(1, color_type="random") + color2 = _get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): from pandas.plotting._matplotlib.style import _get_standard_colors # Make sure the default color_types returns the specified amount - color1 = _get_standard_colors(1, color_type='default') - color2 = _get_standard_colors(9, color_type='default') - color3 = _get_standard_colors(20, color_type='default') + color1 = _get_standard_colors(1, color_type="default") + color2 = _get_standard_colors(9, color_type="default") + color3 = _get_standard_colors(20, color_type="default") assert len(color1) == 1 assert len(color2) == 9 assert len(color3) == 20 def test_plot_single_color(self): # Example from #20585. All 3 bars should have the same color - df = DataFrame({'account-start': ['2017-02-03', '2017-03-03', - '2017-01-01'], - 'client': ['Alice Anders', 'Bob Baker', - 'Charlie Chaplin'], - 'balance': [-1432.32, 10.43, 30000.00], - 'db-id': [1234, 2424, 251], - 'proxy-id': [525, 1525, 2542], - 'rank': [52, 525, 32], - }) + df = DataFrame( + { + "account-start": ["2017-02-03", "2017-03-03", "2017-01-01"], + "client": ["Alice Anders", "Bob Baker", "Charlie Chaplin"], + "balance": [-1432.32, 10.43, 30000.00], + "db-id": [1234, 2424, 251], + "proxy-id": [525, 1525, 2542], + "rank": [52, 525, 32], + } + ) ax = df.client.value_counts().plot.bar() colors = [rect.get_facecolor() for rect in ax.get_children()[0:3]] assert all(color == colors[0] for color in colors) @@ -379,6 +407,7 @@ def test_get_standard_colors_no_appending(self): # correctly. 
from matplotlib import cm from pandas.plotting._matplotlib.style import _get_standard_colors + color_before = cm.gnuplot(range(5)) color_after = _get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -387,5 +416,4 @@ def test_get_standard_colors_no_appending(self): color_list = cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) - assert (p.patches[1].get_facecolor() - == p.patches[17].get_facecolor()) + assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index d10620b4e75471..4c5b1e66d00751 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -22,47 +22,47 @@ @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' + self.ts.name = "ts" self.series = tm.makeStringSeries() - self.series.name = 'series' + self.series.name = "series" self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' + self.iseries.name = "iseries" @pytest.mark.slow def test_plot(self): - _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, label="foo") _check_plot_works(self.ts.plot, use_index=False) axes = _check_plot_works(self.ts.plot, rot=0) self._check_ticks_props(axes, xrot=0) - ax = _check_plot_works(self.ts.plot, style='.', logy=True) - self._check_ax_scales(ax, yaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", logy=True) + self._check_ax_scales(ax, yaxis="log") - ax = _check_plot_works(self.ts.plot, style='.', logx=True) - self._check_ax_scales(ax, xaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", logx=True) + self._check_ax_scales(ax, xaxis="log") - ax = _check_plot_works(self.ts.plot, style='.', loglog=True) - self._check_ax_scales(ax, xaxis='log', yaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", loglog=True) + self._check_ax_scales(ax, xaxis="log", yaxis="log") _check_plot_works(self.ts[:10].plot.bar) _check_plot_works(self.ts.plot.area, stacked=False) _check_plot_works(self.iseries.plot) - for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: + for kind in ["line", "bar", "barh", "kde", "hist", "box"]: _check_plot_works(self.series[:5].plot, kind=kind) _check_plot_works(self.series[:10].plot.barh) - ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') - self._check_colors([ax.patches[0]], facecolors=['black']) + ax = _check_plot_works(Series(randn(10)).plot.bar, color="black") + self._check_colors([ax.patches[0]], facecolors=["black"]) # GH 6951 ax = _check_plot_works(self.ts.plot, subplots=True) @@ -77,13 +77,13 @@ def test_plot(self): def test_plot_figsize_and_title(self): # figsize and title _, ax = self.plt.subplots() - ax = self.series.plot(title='Test', figsize=(16, 8), ax=ax) - self._check_text_labels(ax.title, 'Test') + ax = self.series.plot(title="Test", figsize=(16, 8), ax=ax) + self._check_text_labels(ax.title, "Test") self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) def test_dont_modify_rcParams(self): # GH 8242 - key = 'axes.prop_cycle' + key = "axes.prop_cycle" colors = self.plt.rcParams[key] _, ax = self.plt.subplots() Series([1, 2, 3]).plot(ax=ax) @@ -123,7 +123,7 @@ def test_ts_area_lim(self): tm.close() tz_ts = self.ts.copy() - tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') + 
tz_ts.index = tz_ts.tz_localize("GMT").tz_convert("CET") _, ax = self.plt.subplots() ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() @@ -142,30 +142,30 @@ def test_ts_area_lim(self): def test_label(self): s = Series([1, 2]) _, ax = self.plt.subplots() - ax = s.plot(label='LABEL', legend=True, ax=ax) - self._check_legend_labels(ax, labels=['LABEL']) + ax = s.plot(label="LABEL", legend=True, ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) self.plt.close() _, ax = self.plt.subplots() ax = s.plot(legend=True, ax=ax) - self._check_legend_labels(ax, labels=['None']) + self._check_legend_labels(ax, labels=["None"]) self.plt.close() # get name from index - s.name = 'NAME' + s.name = "NAME" _, ax = self.plt.subplots() ax = s.plot(legend=True, ax=ax) - self._check_legend_labels(ax, labels=['NAME']) + self._check_legend_labels(ax, labels=["NAME"]) self.plt.close() # override the default _, ax = self.plt.subplots() - ax = s.plot(legend=True, label='LABEL', ax=ax) - self._check_legend_labels(ax, labels=['LABEL']) + ax = s.plot(legend=True, label="LABEL", ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) self.plt.close() # Add lebel info, but don't draw _, ax = self.plt.subplots() - ax = s.plot(legend=False, label='LABEL', ax=ax) + ax = s.plot(legend=False, label="LABEL", ax=ax) assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it - self._check_legend_labels(ax, labels=['LABEL']) + self._check_legend_labels(ax, labels=["LABEL"]) def test_line_area_nan_series(self): values = [1, 2, np.nan, 3] @@ -179,7 +179,8 @@ def test_line_area_nan_series(self): exp = np.array([1, 2, 3], dtype=np.float64) tm.assert_numpy_array_equal(np.delete(masked.data, 2), exp) tm.assert_numpy_array_equal( - masked.mask, np.array([False, False, True, False])) + masked.mask, np.array([False, False, True, False]) + ) expected = np.array([1, 2, 0, 3], dtype=np.float64) ax = _check_plot_works(d.plot, stacked=True) @@ -190,16 +191,16 @@ def test_line_area_nan_series(self): tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) def test_line_use_index_false(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - s.index.name = 'The Index' + s = Series([1, 2, 3], index=["a", "b", "c"]) + s.index.name = "The Index" _, ax = self.plt.subplots() ax = s.plot(use_index=False, ax=ax) label = ax.get_xlabel() - assert label == '' + assert label == "" _, ax = self.plt.subplots() ax2 = s.plot.bar(use_index=False, ax=ax) label2 = ax2.get_xlabel() - assert label2 == '' + assert label2 == "" @pytest.mark.slow def test_bar_log(self): @@ -219,7 +220,7 @@ def test_bar_log(self): expected = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]) _, ax = self.plt.subplots() - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar', ax=ax) + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="bar", ax=ax) ymin = 0.0007943282347242822 ymax = 0.12589254117941673 res = ax.get_ylim() @@ -229,7 +230,7 @@ def test_bar_log(self): tm.close() _, ax = self.plt.subplots() - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh', ax=ax) + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="barh", ax=ax) res = ax.get_xlim() tm.assert_almost_equal(res[0], ymin) tm.assert_almost_equal(res[1], ymax) @@ -237,19 +238,21 @@ def test_bar_log(self): @pytest.mark.slow def test_bar_ignore_index(self): - df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) _, ax = self.plt.subplots() ax = df.plot.bar(use_index=False, ax=ax) - 
self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) + self._check_text_labels(ax.get_xticklabels(), ["0", "1", "2", "3"]) def test_bar_user_colors(self): s = Series([1, 2, 3, 4]) - ax = s.plot.bar(color=['red', 'blue', 'blue', 'red']) + ax = s.plot.bar(color=["red", "blue", "blue", "red"]) result = [p.get_facecolor() for p in ax.patches] - expected = [(1., 0., 0., 1.), - (0., 0., 1., 1.), - (0., 0., 1., 1.), - (1., 0., 0., 1.)] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] assert result == expected def test_rotation(self): @@ -264,18 +267,20 @@ def test_rotation(self): self._check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - rng = date_range('1/1/2000', '3/1/2000') + rng = date_range("1/1/2000", "3/1/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(randn(len(rng)), rng) _, ax = self.plt.subplots() ax = ser.plot(ax=ax) xp = datetime(1999, 1, 1).toordinal() - ax.set_xlim('1/1/1999', '1/1/2001') + ax.set_xlim("1/1/1999", "1/1/2001") assert xp == ax.get_xlim()[0] def test_unsorted_index_xlim(self): - ser = Series([0., 1., np.nan, 3., 4., 5., 6.], - index=[1., 0., 3., 2., np.nan, 3., 2.]) + ser = Series( + [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0], + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) _, ax = self.plt.subplots() ax = ser.plot(ax=ax) xmin, xmax = ax.get_xlim() @@ -287,36 +292,36 @@ def test_unsorted_index_xlim(self): def test_pie_series(self): # if sum of values is less than 1.0, pie handle them as rate and draw # semicircle. - series = Series(np.random.randint(1, 5), - index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') + series = Series( + np.random.randint(1, 5), index=["a", "b", "c", "d", "e"], name="YLABEL" + ) ax = _check_plot_works(series.plot.pie) self._check_text_labels(ax.texts, series.index) - assert ax.get_ylabel() == 'YLABEL' + assert ax.get_ylabel() == "YLABEL" # without wedge labels ax = _check_plot_works(series.plot.pie, labels=None) - self._check_text_labels(ax.texts, [''] * 5) + self._check_text_labels(ax.texts, [""] * 5) # with less colors than elements - color_args = ['r', 'g', 'b'] + color_args = ["r", "g", "b"] ax = _check_plot_works(series.plot.pie, colors=color_args) - color_expected = ['r', 'g', 'b', 'r', 'g'] + color_expected = ["r", "g", "b", "r", "g"] self._check_colors(ax.patches, facecolors=color_expected) # with labels and colors - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] - ax = _check_plot_works(series.plot.pie, labels=labels, - colors=color_args) + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] + ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args) self._check_text_labels(ax.texts, labels) self._check_colors(ax.patches, facecolors=color_args) # with autopct and fontsize - ax = _check_plot_works(series.plot.pie, colors=color_args, - autopct='%.2f', fontsize=7) - pcts = ['{0:.2f}'.format(s * 100) - for s in series.values / float(series.sum())] + ax = _check_plot_works( + series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 + ) + pcts = ["{0:.2f}".format(s * 100) for s in series.values / float(series.sum())] expected_texts = list(chain.from_iterable(zip(series.index, pcts))) self._check_text_labels(ax.texts, expected_texts) for t in ax.texts: @@ -324,20 +329,19 @@ def test_pie_series(self): # includes negative value with pytest.raises(ValueError): - series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) + series = Series([1, 2, 0, 4, -1], 
index=["a", "b", "c", "d", "e"]) series.plot.pie() # includes nan - series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], - name='YLABEL') + series = Series([1, 2, np.nan, 4], index=["a", "b", "c", "d"], name="YLABEL") ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) + self._check_text_labels(ax.texts, ["a", "b", "", "d"]) def test_pie_nan(self): s = Series([1, np.nan, 1, 1]) _, ax = self.plt.subplots() ax = s.plot.pie(legend=True, ax=ax) - expected = ['0', '', '2', '3'] + expected = ["0", "", "2", "3"] result = [x.get_text() for x in ax.texts] assert result == expected @@ -352,9 +356,8 @@ def test_hist_df_kwargs(self): def test_hist_df_with_nonnumerics(self): # GH 9853 with tm.RNGContext(1): - df = DataFrame( - np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df['E'] = ['x', 'y'] * 5 + df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df["E"] = ["x", "y"] * 5 _, ax = self.plt.subplots() ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 20 @@ -370,11 +373,9 @@ def test_hist_legacy(self): _check_plot_works(self.ts.hist, figsize=(8, 10)) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month) + _check_plot_works(self.ts.hist, by=self.ts.index.month) with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month, bins=5) + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) fig, ax = self.plt.subplots(1, 1) _check_plot_works(self.ts.hist, ax=ax) @@ -410,47 +411,40 @@ def test_hist_layout_with_by(self): # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(2, 1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(4, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(2, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(-1, 4)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.classroom, layout=(2, 2)) + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, 
layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 2), - figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) y = Series(randn(2)) subplot(121) @@ -464,39 +458,38 @@ def test_hist_no_overlap(self): @pytest.mark.slow def test_hist_secondary_legend(self): # GH 9610 - df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) + df = DataFrame(np.random.randn(30, 4), columns=list("abcd")) # primary -> secondary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, ax=ax) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + ax = df["a"].plot.hist(legend=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b (right)']) + self._check_legend_labels(ax, labels=["a", "b (right)"]) assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() tm.close() # secondary -> secondary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are draw on left ax # left axis must be invisible, right axis must be visible - self._check_legend_labels(ax.left_ax, - labels=['a (right)', 'b (right)']) + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b (right)"]) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() # secondary -> primary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) # right axes is returned - df['b'].plot.hist(ax=ax, legend=True) + df["b"].plot.hist(ax=ax, legend=True) # both legends are draw on left ax # left and right axis must be visible - self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b']) + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b"]) assert ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() @@ -504,8 +497,8 @@ def test_hist_secondary_legend(self): @pytest.mark.slow def test_df_series_secondary_legend(self): # GH 9779 - df = DataFrame(np.random.randn(30, 3), columns=list('abc')) - s = Series(np.random.randn(30), name='x') + df = DataFrame(np.random.randn(30, 3), columns=list("abc")) + s = Series(np.random.randn(30), name="x") # primary -> secondary (without passing ax) _, ax = self.plt.subplots() @@ -513,7 +506,7 @@ def test_df_series_secondary_legend(self): s.plot(legend=True, secondary_y=True, ax=ax) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() tm.close() @@ -524,7 +517,7 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) assert ax.get_yaxis().get_visible() assert 
ax.right_ax.get_yaxis().get_visible() tm.close() @@ -535,7 +528,7 @@ def test_df_series_secondary_legend(self): s.plot(legend=True, secondary_y=True, ax=ax) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] self._check_legend_labels(ax.left_ax, labels=expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() @@ -547,7 +540,7 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] self._check_legend_labels(ax.left_ax, expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() @@ -559,17 +552,16 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a', 'b', 'c', 'x (right)'] + expected = ["a", "b", "c", "x (right)"] self._check_legend_labels(ax.left_ax, expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() @pytest.mark.slow - @pytest.mark.parametrize("input_logy, expected_scale", [ - (True, 'log'), - ('sym', 'symlog') - ]) + @pytest.mark.parametrize( + "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] + ) def test_secondary_logy(self, input_logy, expected_scale): # GH 25545 s1 = Series(np.random.randn(30)) @@ -587,7 +579,7 @@ def test_plot_fails_with_dupe_color_and_style(self): x = Series(randn(2)) with pytest.raises(ValueError): _, ax = self.plt.subplots() - x.plot(style='k--', color='k', ax=ax) + x.plot(style="k--", color="k", ax=ax) @pytest.mark.slow @td.skip_if_no_scipy @@ -595,38 +587,36 @@ def test_hist_kde(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() # ticks are values, thus ticklabels are blank - self._check_text_labels(xlabels, [''] * len(xlabels)) + self._check_text_labels(xlabels, [""] * len(xlabels)) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) _check_plot_works(self.ts.plot.kde) _check_plot_works(self.ts.plot.density) _, ax = self.plt.subplots() ax = self.ts.plot.kde(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [''] * len(xlabels)) + self._check_text_labels(xlabels, [""] * len(xlabels)) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) @pytest.mark.slow @td.skip_if_no_scipy def test_kde_kwargs(self): sample_points = np.linspace(-100, 100, 20) - _check_plot_works(self.ts.plot.kde, bw_method='scott', ind=20) + _check_plot_works(self.ts.plot.kde, bw_method="scott", ind=20) _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20) _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20)) - _check_plot_works(self.ts.plot.kde, bw_method=.5, ind=sample_points) - _check_plot_works(self.ts.plot.density, bw_method=.5, 
- ind=sample_points) + _check_plot_works(self.ts.plot.kde, bw_method=0.5, ind=sample_points) + _check_plot_works(self.ts.plot.density, bw_method=0.5, ind=sample_points) _, ax = self.plt.subplots() - ax = self.ts.plot.kde(logy=True, bw_method=.5, ind=sample_points, - ax=ax) - self._check_ax_scales(ax, yaxis='log') - self._check_text_labels(ax.yaxis.get_label(), 'Density') + ax = self.ts.plot.kde(logy=True, bw_method=0.5, ind=sample_points, ax=ax) + self._check_ax_scales(ax, yaxis="log") + self._check_text_labels(ax.yaxis.get_label(), "Density") @pytest.mark.slow @td.skip_if_no_scipy @@ -643,49 +633,50 @@ def test_hist_kwargs(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 5 - self._check_text_labels(ax.yaxis.get_label(), 'Frequency') + self._check_text_labels(ax.yaxis.get_label(), "Frequency") tm.close() _, ax = self.plt.subplots() - ax = self.ts.plot.hist(orientation='horizontal', ax=ax) - self._check_text_labels(ax.xaxis.get_label(), 'Frequency') + ax = self.ts.plot.hist(orientation="horizontal", ax=ax) + self._check_text_labels(ax.xaxis.get_label(), "Frequency") tm.close() _, ax = self.plt.subplots() - ax = self.ts.plot.hist(align='left', stacked=True, ax=ax) + ax = self.ts.plot.hist(align="left", stacked=True, ax=ax) tm.close() @pytest.mark.slow @td.skip_if_no_scipy def test_hist_kde_color(self): _, ax = self.plt.subplots() - ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax) - self._check_ax_scales(ax, yaxis='log') + ax = self.ts.plot.hist(logy=True, bins=10, color="b", ax=ax) + self._check_ax_scales(ax, yaxis="log") assert len(ax.patches) == 10 - self._check_colors(ax.patches, facecolors=['b'] * 10) + self._check_colors(ax.patches, facecolors=["b"] * 10) _, ax = self.plt.subplots() - ax = self.ts.plot.kde(logy=True, color='r', ax=ax) - self._check_ax_scales(ax, yaxis='log') + ax = self.ts.plot.kde(logy=True, color="r", ax=ax) + self._check_ax_scales(ax, yaxis="log") lines = ax.get_lines() assert len(lines) == 1 - self._check_colors(lines, ['r']) + self._check_colors(lines, ["r"]) @pytest.mark.slow def test_boxplot_series(self): _, ax = self.plt.subplots() ax = self.ts.plot.box(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() self._check_text_labels(xlabels, [self.ts.name]) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) @pytest.mark.slow def test_kind_both_ways(self): s = Series(range(3)) - kinds = (plotting.PlotAccessor._common_kinds + - plotting.PlotAccessor._series_kinds) + kinds = ( + plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds + ) _, ax = self.plt.subplots() for kind in kinds: @@ -694,7 +685,7 @@ def test_kind_both_ways(self): @pytest.mark.slow def test_invalid_plot_data(self): - s = Series(list('abcd')) + s = Series(list("abcd")) _, ax = self.plt.subplots() for kind in plotting.PlotAccessor._common_kinds: @@ -709,7 +700,7 @@ def test_valid_object_plot(self): _check_plot_works(s.plot, kind=kind) def test_partially_invalid_plot_data(self): - s = Series(['a', 'b', 1.0, 2]) + s = Series(["a", "b", 1.0, 2]) _, ax = self.plt.subplots() for kind in plotting.PlotAccessor._common_kinds: @@ -720,12 +711,12 @@ def test_partially_invalid_plot_data(self): def test_invalid_kind(self): s = Series([1, 2]) with pytest.raises(ValueError): - s.plot(kind='aasdf') + s.plot(kind="aasdf") @pytest.mark.slow def 
test_dup_datetime_index_plot(self): - dr1 = date_range('1/1/2009', periods=4) - dr2 = date_range('1/2/2009', periods=4) + dr1 = date_range("1/1/2009", periods=4) + dr2 = date_range("1/2/2009", periods=4) index = dr1.append(dr2) values = randn(index.size) s = Series(values, index=index) @@ -734,11 +725,11 @@ def test_dup_datetime_index_plot(self): @pytest.mark.slow def test_errorbar_plot(self): - s = Series(np.arange(10), name='x') + s = Series(np.arange(10), name="x") s_err = np.random.randn(10) - d_err = DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) + d_err = DataFrame(randn(10, 2), index=s.index, columns=["x", "y"]) # test line and bar plots - kinds = ['line', 'bar'] + kinds = ["line", "bar"] for kind in kinds: ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) @@ -755,10 +746,10 @@ def test_errorbar_plot(self): self._check_has_errorbars(ax, xerr=1, yerr=0) # test time series plotting - ix = date_range('1/1/2000', '1/1/2001', freq='M') - ts = Series(np.arange(12), index=ix, name='x') + ix = date_range("1/1/2000", "1/1/2001", freq="M") + ts = Series(np.arange(12), index=ix, name="x") ts_err = Series(np.random.randn(12), index=ix) - td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) + td_err = DataFrame(randn(12, 2), index=ix, columns=["x", "y"]) ax = _check_plot_works(ts.plot, yerr=ts_err) self._check_has_errorbars(ax, xerr=0, yerr=1) @@ -769,7 +760,7 @@ def test_errorbar_plot(self): with pytest.raises(ValueError): s.plot(yerr=np.arange(11)) - s_err = ['zzz'] * 10 + s_err = ["zzz"] * 10 with pytest.raises(TypeError): s.plot(yerr=s_err) @@ -780,15 +771,16 @@ def test_table(self): @pytest.mark.slow def test_series_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - self._check_grid_settings(Series([1, 2, 3]), - plotting.PlotAccessor._series_kinds + - plotting.PlotAccessor._common_kinds) + self._check_grid_settings( + Series([1, 2, 3]), + plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds, + ) @pytest.mark.slow def test_standard_colors(self): from pandas.plotting._matplotlib.style import _get_standard_colors - for c in ['r', 'red', 'green', '#FF0000']: + for c in ["r", "red", "green", "#FF0000"]: result = _get_standard_colors(1, color=c) assert result == [c] @@ -837,21 +829,22 @@ def test_standard_colors_all(self): def test_series_plot_color_kwargs(self): # GH1890 _, ax = self.plt.subplots() - ax = Series(np.arange(12) + 1).plot(color='green', ax=ax) - self._check_colors(ax.get_lines(), linecolors=['green']) + ax = Series(np.arange(12) + 1).plot(color="green", ax=ax) + self._check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_kwargs(self): # #1890 _, ax = self.plt.subplots() - ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot(color='green', ax=ax) - self._check_colors(ax.get_lines(), linecolors=['green']) + ax = Series(np.arange(12) + 1, index=date_range("1/1/2000", periods=12)).plot( + color="green", ax=ax + ) + self._check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): import matplotlib as mpl def_colors = self._unpack_cycler(mpl.rcParams) - index = date_range('1/1/2000', periods=12) + index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) ncolors = 3 @@ -863,18 +856,24 @@ def test_time_series_plot_color_with_empty_kwargs(self): def test_xticklabels(self): # GH11529 - s = Series(np.arange(10), 
index=['P%02d' % i for i in range(10)]) + s = Series(np.arange(10), index=["P%02d" % i for i in range(10)]) _, ax = self.plt.subplots() ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) - exp = ['P%02d' % i for i in [0, 3, 5, 9]] + exp = ["P%02d" % i for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) def test_custom_business_day_freq(self): # GH7222 from pandas.tseries.offsets import CustomBusinessDay - s = Series(range(100, 121), index=pd.bdate_range( - start='2014-05-01', end='2014-06-01', - freq=CustomBusinessDay(holidays=['2014-05-26']))) + + s = Series( + range(100, 121), + index=pd.bdate_range( + start="2014-05-01", + end="2014-06-01", + freq=CustomBusinessDay(holidays=["2014-05-26"]), + ), + ) _check_plot_works(s.plot) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 61ab759aa8d9bc..1e7a40b9040b7a 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -5,27 +5,40 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex, - RangeIndex, Series, Timedelta, TimedeltaIndex, Timestamp, isna, - timedelta_range, to_timedelta) + Categorical, + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, + timedelta_range, + to_timedelta, +) from pandas.core import nanops import pandas.util.testing as tm def get_objs(): indexes = [ - tm.makeBoolIndex(10, name='a'), - tm.makeIntIndex(10, name='a'), - tm.makeFloatIndex(10, name='a'), - tm.makeDateIndex(10, name='a'), - tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern'), - tm.makePeriodIndex(10, name='a'), - tm.makeStringIndex(10, name='a'), - tm.makeUnicodeIndex(10, name='a') + tm.makeBoolIndex(10, name="a"), + tm.makeIntIndex(10, name="a"), + tm.makeFloatIndex(10, name="a"), + tm.makeDateIndex(10, name="a"), + tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), + tm.makePeriodIndex(10, name="a"), + tm.makeStringIndex(10, name="a"), + tm.makeUnicodeIndex(10, name="a"), ] arr = np.random.randn(10) - series = [Series(arr, index=idx, name='a') for idx in indexes] + series = [Series(arr, index=idx, name="a") for idx in indexes] objs = indexes + series return objs @@ -35,30 +48,29 @@ def get_objs(): class TestReductions: - - @pytest.mark.parametrize('opname', ['max', 'min']) - @pytest.mark.parametrize('obj', objs) + @pytest.mark.parametrize("opname", ["max", "min"]) + @pytest.mark.parametrize("obj", objs) def test_ops(self, opname, obj): result = getattr(obj, opname)() if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), - freq=obj.freq) + ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq + ) try: assert result == expected except TypeError: # comparing tz-aware series with np.array results in # TypeError - expected = expected.astype('M8[ns]').astype('int64') + expected = expected.astype("M8[ns]").astype("int64") assert result.value == expected def test_nanops(self): # GH#7261 - for opname in ['max', 'min']: + for opname in ["max", "min"]: for klass in [Index, Series]: - arg_op = 'arg' + opname if klass is Index else 'idx' + opname + arg_op = "arg" + opname if klass is Index else "idx" + opname obj = klass([np.nan, 2.0]) assert getattr(obj, opname)() == 2.0 @@ -108,7 +120,7 @@ def test_nanops(self): getattr(obj, arg_op)(skipna=False) # argmin/max - obj = 
Index(np.arange(5, dtype='int64')) + obj = Index(np.arange(5, dtype="int64")) assert obj.argmin() == 0 assert obj.argmax() == 4 @@ -124,8 +136,7 @@ def test_nanops(self): assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 - obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), - pd.NaT]) + obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT]) assert obj.argmin() == 1 assert obj.argmax() == 2 assert obj.argmin(skipna=False) == -1 @@ -137,15 +148,13 @@ def test_nanops(self): assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 - @pytest.mark.parametrize('op, expected_col', [ - ['max', 'a'], ['min', 'b'] - ]) + @pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]]) def test_same_tz_min_max_axis_1(self, op, expected_col): # GH 10390 - df = DataFrame(pd.date_range('2016-01-01 00:00:00', periods=3, - tz='UTC'), - columns=['a']) - df['b'] = df.a.subtract(pd.Timedelta(seconds=3600)) + df = DataFrame( + pd.date_range("2016-01-01 00:00:00", periods=3, tz="UTC"), columns=["a"] + ) + df["b"] = df.a.subtract(pd.Timedelta(seconds=3600)) result = getattr(df, op)(axis=1) expected = df[expected_col].rename(None) tm.assert_series_equal(result, expected) @@ -156,9 +165,16 @@ class TestIndexReductions: # were moved from a Index-specific test file, _not_ that these tests are # intended long-term to be Index-specific - @pytest.mark.parametrize('start,stop,step', - [(0, 400, 3), (500, 0, -6), (-10**6, 10**6, 4), - (10**6, -10**6, -4), (0, 10, 20)]) + @pytest.mark.parametrize( + "start,stop,step", + [ + (0, 400, 3), + (500, 0, -6), + (-10 ** 6, 10 ** 6, 4), + (10 ** 6, -10 ** 6, -4), + (0, 10, 20), + ], + ) def test_max_min_range(self, start, stop, step): # GH#17607 idx = RangeIndex(start, stop, step) @@ -186,20 +202,20 @@ def test_max_min_range(self, start, stop, step): def test_minmax_timedelta64(self): # monotonic - idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) + idx1 = TimedeltaIndex(["1 days", "2 days", "3 days"]) assert idx1.is_monotonic # non-monotonic - idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) + idx2 = TimedeltaIndex(["1 days", np.nan, "3 days", "NaT"]) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert idx.min() == Timedelta('1 days') - assert idx.max() == Timedelta('3 days') + assert idx.min() == Timedelta("1 days") + assert idx.max() == Timedelta("3 days") assert idx.argmin() == 0 assert idx.argmax() == 2 - for op in ['min', 'max']: + for op in ["min", "max"]: # Return NaT obj = TimedeltaIndex([]) assert pd.isna(getattr(obj, op)()) @@ -211,10 +227,10 @@ def test_minmax_timedelta64(self): assert pd.isna(getattr(obj, op)()) def test_numpy_minmax_timedelta64(self): - td = timedelta_range('16815 days', '16820 days', freq='D') + td = timedelta_range("16815 days", "16820 days", freq="D") - assert np.min(td) == Timedelta('16815 days') - assert np.max(td) == Timedelta('16820 days') + assert np.min(td) == Timedelta("16815 days") + assert np.max(td) == Timedelta("16820 days") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -234,8 +250,9 @@ def test_numpy_minmax_timedelta64(self): def test_timedelta_ops(self): # GH#4984 # make sure ops return Timedelta - s = Series([Timestamp('20130101') + timedelta(seconds=i * i) - for i in range(10)]) + s = Series( + [Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)] + ) td = s.diff() result = td.mean() @@ -245,12 +262,12 @@ def test_timedelta_ops(self): result = 
td.to_frame().mean() assert result[0] == expected - result = td.quantile(.1) - expected = Timedelta(np.timedelta64(2600, 'ms')) + result = td.quantile(0.1) + expected = Timedelta(np.timedelta64(2600, "ms")) assert result == expected result = td.median() - expected = to_timedelta('00:00:09') + expected = to_timedelta("00:00:09") assert result == expected result = td.to_frame().median() @@ -259,7 +276,7 @@ def test_timedelta_ops(self): # GH#6462 # consistency in returned values for sum result = td.sum() - expected = to_timedelta('00:01:21') + expected = to_timedelta("00:01:21") assert result == expected result = td.to_frame().sum() @@ -274,39 +291,40 @@ def test_timedelta_ops(self): assert result[0] == expected # invalid ops - for op in ['skew', 'kurt', 'sem', 'prod']: + for op in ["skew", "kurt", "sem", "prod"]: msg = "reduction operation '{}' not allowed for this dtype" with pytest.raises(TypeError, match=msg.format(op)): getattr(td, op)() # GH#10040 # make sure NaT is properly handled by median() - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')]) + s = Series([Timestamp("2015-02-03"), Timestamp("2015-02-07")]) assert s.diff().median() == timedelta(days=4) - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), - Timestamp('2015-02-15')]) + s = Series( + [Timestamp("2015-02-03"), Timestamp("2015-02-07"), Timestamp("2015-02-15")] + ) assert s.diff().median() == timedelta(days=6) def test_minmax_tz(self, tz_naive_fixture): tz = tz_naive_fixture # monotonic - idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], tz=tz) + idx1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz=tz) assert idx1.is_monotonic # non-monotonic - idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', - '2011-01-02', pd.NaT], tz=tz) + idx2 = pd.DatetimeIndex( + ["2011-01-01", pd.NaT, "2011-01-03", "2011-01-02", pd.NaT], tz=tz + ) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert idx.min() == Timestamp('2011-01-01', tz=tz) - assert idx.max() == Timestamp('2011-01-03', tz=tz) + assert idx.min() == Timestamp("2011-01-01", tz=tz) + assert idx.max() == Timestamp("2011-01-03", tz=tz) assert idx.argmin() == 0 assert idx.argmax() == 2 - @pytest.mark.parametrize('op', ['min', 'max']) + @pytest.mark.parametrize("op", ["min", "max"]) def test_minmax_nat_datetime64(self, op): # Return NaT obj = DatetimeIndex([]) @@ -372,10 +390,10 @@ def test_numpy_minmax_range(self): # is the same as basic integer index def test_numpy_minmax_datetime64(self): - dr = pd.date_range(start='2016-01-15', end='2016-01-20') + dr = pd.date_range(start="2016-01-15", end="2016-01-20") - assert np.min(dr) == Timestamp('2016-01-15 00:00:00', freq='D') - assert np.max(dr) == Timestamp('2016-01-20 00:00:00', freq='D') + assert np.min(dr) == Timestamp("2016-01-15 00:00:00", freq="D") + assert np.max(dr) == Timestamp("2016-01-20 00:00:00", freq="D") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -397,42 +415,42 @@ def test_numpy_minmax_datetime64(self): def test_minmax_period(self): # monotonic - idx1 = pd.PeriodIndex([NaT, '2011-01-01', '2011-01-02', - '2011-01-03'], freq='D') + idx1 = pd.PeriodIndex([NaT, "2011-01-01", "2011-01-02", "2011-01-03"], freq="D") assert idx1.is_monotonic # non-monotonic - idx2 = pd.PeriodIndex(['2011-01-01', NaT, '2011-01-03', - '2011-01-02', NaT], freq='D') + idx2 = pd.PeriodIndex( + ["2011-01-01", NaT, "2011-01-03", "2011-01-02", NaT], freq="D" + ) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert 
idx.min() == pd.Period('2011-01-01', freq='D') - assert idx.max() == pd.Period('2011-01-03', freq='D') + assert idx.min() == pd.Period("2011-01-01", freq="D") + assert idx.max() == pd.Period("2011-01-03", freq="D") assert idx1.argmin() == 1 assert idx2.argmin() == 0 assert idx1.argmax() == 3 assert idx2.argmax() == 2 - for op in ['min', 'max']: + for op in ["min", "max"]: # Return NaT - obj = PeriodIndex([], freq='M') + obj = PeriodIndex([], freq="M") result = getattr(obj, op)() assert result is NaT - obj = PeriodIndex([NaT], freq='M') + obj = PeriodIndex([NaT], freq="M") result = getattr(obj, op)() assert result is NaT - obj = PeriodIndex([NaT, NaT, NaT], freq='M') + obj = PeriodIndex([NaT, NaT, NaT], freq="M") result = getattr(obj, op)() assert result is NaT def test_numpy_minmax_period(self): - pr = pd.period_range(start='2016-01-15', end='2016-01-20') + pr = pd.period_range(start="2016-01-15", end="2016-01-20") - assert np.min(pr) == Period('2016-01-15', freq='D') - assert np.max(pr) == Period('2016-01-20', freq='D') + assert np.min(pr) == Period("2016-01-15", freq="D") + assert np.max(pr) == Period("2016-01-20", freq="D") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -451,19 +469,15 @@ def test_numpy_minmax_period(self): def test_min_max_categorical(self): - ci = pd.CategoricalIndex(list('aabbca'), - categories=list('cab'), - ordered=False) + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) with pytest.raises(TypeError): ci.min() with pytest.raises(TypeError): ci.max() - ci = pd.CategoricalIndex(list('aabbca'), - categories=list('cab'), - ordered=True) - assert ci.min() == 'c' - assert ci.max() == 'b' + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=True) + assert ci.min() == "c" + assert ci.max() == "b" class TestSeriesReductions: @@ -480,7 +494,7 @@ def test_sum_inf(self): assert np.isinf(s.sum()) - arr = np.random.randn(100, 100).astype('f4') + arr = np.random.randn(100, 100).astype("f4") arr[:, 2] = np.inf with pd.option_context("mode.use_inf_as_na", True): @@ -490,10 +504,7 @@ def test_sum_inf(self): assert np.isinf(res).all() @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method, unit", [ - ("sum", 0.0), - ("prod", 1.0) - ]) + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 @@ -584,30 +595,28 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(min_count=2) assert pd.isna(result) - @pytest.mark.parametrize('method, unit', [ - ('sum', 0.0), - ('prod', 1.0), - ]) + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty_multi(self, method, unit): - s = pd.Series([1, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + s = pd.Series( + [1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]), + ) # 1 / 0 by default result = getattr(s, method)(level=0) - expected = pd.Series([1, unit], index=['a', 'b']) + expected = pd.Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=0 result = getattr(s, method)(level=0, min_count=0) - expected = pd.Series([1, unit], index=['a', 'b']) + expected = pd.Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=1 result = getattr(s, method)(level=0, min_count=1) - 
expected = pd.Series([1, np.nan], index=['a', 'b']) + expected = pd.Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "method", ['mean', 'median', 'std', 'var']) + @pytest.mark.parametrize("method", ["mean", "median", "std", "var"]) def test_ops_consistency_on_empty(self, method): # GH#7869 @@ -618,7 +627,7 @@ def test_ops_consistency_on_empty(self, method): assert pd.isna(result) # timedelta64[ns] - result = getattr(Series(dtype='m8[ns]'), method)() + result = getattr(Series(dtype="m8[ns]"), method)() assert result is pd.NaT def test_nansum_buglet(self): @@ -629,21 +638,21 @@ def test_nansum_buglet(self): @pytest.mark.parametrize("use_bottleneck", [True, False]) def test_sum_overflow(self, use_bottleneck): - with pd.option_context('use_bottleneck', use_bottleneck): + with pd.option_context("use_bottleneck", use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes - for dtype in ['int32', 'int64']: + for dtype in ["int32", "int64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) - assert int(result) == v.sum(dtype='int64') + assert int(result) == v.sum(dtype="int64") result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) @@ -656,7 +665,7 @@ def test_sum_overflow(self, use_bottleneck): def test_empty_timeseries_reductions_return_nat(self): # covers GH#11245 - for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): + for dtype in ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"): assert Series([], dtype=dtype).min() is pd.NaT assert Series([], dtype=dtype).max() is pd.NaT assert Series([], dtype=dtype).min(skipna=False) is pd.NaT @@ -681,8 +690,7 @@ def test_numpy_argmin_deprecated(self): assert result == 1 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.argmin(s, out=data) @@ -705,8 +713,7 @@ def test_numpy_argmax_deprecated(self): assert result == 10 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.argmax(s, out=data) @@ -714,7 +721,7 @@ def test_numpy_argmax_deprecated(self): def test_idxmin(self): # test idxmin # _check_stat_op approach can not be used here because of isna check. - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") # add some NaNs string_series[5:15] = np.NaN @@ -726,15 +733,14 @@ def test_idxmin(self): # no NaNs nona = string_series.dropna() assert nona[nona.idxmin()] == nona.min() - assert (nona.index.values.tolist().index(nona.idxmin()) == - nona.values.argmin()) + assert nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin() # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmin()) # datetime64[ns] - s = Series(pd.date_range('20130102', periods=6)) + s = Series(pd.date_range("20130102", periods=6)) result = s.idxmin() assert result == 0 @@ -745,7 +751,7 @@ def test_idxmin(self): def test_idxmax(self): # test idxmax # _check_stat_op approach can not be used here because of isna check. 
- string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") # add some NaNs string_series[5:15] = np.NaN @@ -757,15 +763,15 @@ def test_idxmax(self): # no NaNs nona = string_series.dropna() assert nona[nona.idxmax()] == nona.max() - assert (nona.index.values.tolist().index(nona.idxmax()) == - nona.values.argmax()) + assert nona.index.values.tolist().index(nona.idxmax()) == nona.values.argmax() # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmax()) from pandas import date_range - s = Series(date_range('20130102', periods=6)) + + s = Series(date_range("20130102", periods=6)) result = s.idxmax() assert result == 5 @@ -794,8 +800,8 @@ def test_all_any(self): assert bool_series.any() # Alternative types, with implicit 'object' dtype. - s = Series(['abc', True]) - assert 'abc' == s.any() # 'abc' || True => 'abc' + s = Series(["abc", True]) + assert "abc" == s.any() # 'abc' || True => 'abc' def test_all_any_params(self): # Check skipna, with implicit 'object' dtype. @@ -807,8 +813,7 @@ def test_all_any_params(self): assert not s2.any(skipna=True) # Check level. - s = pd.Series([False, False, True, True, False, True], - index=[0, 0, 1, 1, 2, 2]) + s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) tm.assert_series_equal(s.all(level=0), Series([False, True, False])) tm.assert_series_equal(s.any(level=0), Series([False, True, True])) @@ -820,15 +825,15 @@ def test_all_any_params(self): # bool_only is not implemented alone. with pytest.raises(NotImplementedError): - s.any(bool_only=True,) + s.any(bool_only=True) with pytest.raises(NotImplementedError): s.all(bool_only=True) def test_timedelta64_analytics(self): # index min/max - dti = pd.date_range('2012-1-1', periods=3, freq='D') - td = Series(dti) - pd.Timestamp('20120101') + dti = pd.date_range("2012-1-1", periods=3, freq="D") + td = Series(dti) - pd.Timestamp("20120101") result = td.idxmin() assert result == 0 @@ -847,8 +852,8 @@ def test_timedelta64_analytics(self): assert result == 2 # abs - s1 = Series(pd.date_range('20120101', periods=3)) - s2 = Series(pd.date_range('20120102', periods=3)) + s1 = Series(pd.date_range("20120101", periods=3)) + s2 = Series(pd.date_range("20120102", periods=3)) expected = Series(s2 - s1) # FIXME: don't leave commented-out code @@ -861,28 +866,23 @@ def test_timedelta64_analytics(self): # max/min result = td.max() - expected = pd.Timedelta('2 days') + expected = pd.Timedelta("2 days") assert result == expected result = td.min() - expected = pd.Timedelta('1 days') + expected = pd.Timedelta("1 days") assert result == expected @pytest.mark.parametrize( "test_input,error_type", [ (pd.Series([]), ValueError), - # For strings, or any Series with dtype 'O' - (pd.Series(['foo', 'bar', 'baz']), TypeError), + (pd.Series(["foo", "bar", "baz"]), TypeError), (pd.Series([(1,), (2,)]), TypeError), - # For mixed data types - ( - pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), - TypeError - ), - ] + (pd.Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), + ], ) def test_assert_idxminmax_raises(self, test_input, error_type): """ @@ -909,7 +909,7 @@ def test_idxminmax_with_inf(self): # Using old-style behavior that treats floating point nan, -inf, and # +inf as missing - with pd.option_context('mode.use_inf_as_na', True): + with pd.option_context("mode.use_inf_as_na", True): assert s.idxmin() == 0 assert np.isnan(s.idxmin(skipna=False)) assert s.idxmax() == 0 @@ -921,10 +921,14 @@ class 
TestDatetime64SeriesReductions: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - @pytest.mark.parametrize('nat_ser', [ - Series([pd.NaT, pd.NaT]), - Series([pd.NaT, pd.Timedelta('nat')]), - Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + @pytest.mark.parametrize( + "nat_ser", + [ + Series([pd.NaT, pd.NaT]), + Series([pd.NaT, pd.Timedelta("nat")]), + Series([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) def test_minmax_nat_series(self, nat_ser): # GH#23282 assert nat_ser.min() is pd.NaT @@ -932,10 +936,14 @@ def test_minmax_nat_series(self, nat_ser): assert nat_ser.min(skipna=False) is pd.NaT assert nat_ser.max(skipna=False) is pd.NaT - @pytest.mark.parametrize('nat_df', [ - pd.DataFrame([pd.NaT, pd.NaT]), - pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), - pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + @pytest.mark.parametrize( + "nat_df", + [ + pd.DataFrame([pd.NaT, pd.NaT]), + pd.DataFrame([pd.NaT, pd.Timedelta("nat")]), + pd.DataFrame([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) def test_minmax_nat_dataframe(self, nat_df): # GH#23282 assert nat_df.min()[0] is pd.NaT @@ -944,7 +952,7 @@ def test_minmax_nat_dataframe(self, nat_df): assert nat_df.max(skipna=False)[0] is pd.NaT def test_min_max(self): - rng = pd.date_range('1/1/2000', '12/31/2000') + rng = pd.date_range("1/1/2000", "12/31/2000") rng2 = rng.take(np.random.permutation(len(rng))) the_min = rng2.min() @@ -958,9 +966,9 @@ def test_min_max(self): assert rng.max() == rng[-1] def test_min_max_series(self): - rng = pd.date_range('1/1/2000', periods=10, freq='4h') - lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] - df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) + rng = pd.date_range("1/1/2000", periods=10, freq="4h") + lvls = ["A", "A", "A", "B", "B", "B", "C", "C", "C", "C"] + df = DataFrame({"TS": rng, "V": np.random.randn(len(rng)), "L": lvls}) result = df.TS.max() exp = pd.Timestamp(df.TS.iat[-1]) @@ -992,23 +1000,33 @@ def test_min_max(self): assert _min == "a" assert _max == "d" - cat = Series(Categorical(["a", "b", "c", "d"], categories=[ - 'd', 'c', 'b', 'a'], ordered=True)) + cat = Series( + Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + ) _min = cat.min() _max = cat.max() assert _min == "d" assert _max == "a" - cat = Series(Categorical( - [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' - ], ordered=True)) + cat = Series( + Categorical( + [np.nan, "b", "c", np.nan], + categories=["d", "c", "b", "a"], + ordered=True, + ) + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == "b" - cat = Series(Categorical( - [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) + cat = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -1017,8 +1035,9 @@ def test_min_max(self): def test_min_max_numeric_only(self): # TODO deprecate numeric_only argument for Categorical and use # skipna as well, see GH25303 - cat = Series(Categorical( - ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True)) + cat = Series( + Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) + ) _min = cat.min() _max = cat.max() @@ -1041,24 +1060,26 @@ class TestSeriesMode: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - @pytest.mark.parametrize('dropna, 
expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) + @pytest.mark.parametrize( + "dropna, expected", + [(True, Series([], dtype=np.float64)), (False, Series([], dtype=np.float64))], + ) def test_mode_empty(self, dropna, expected): s = Series([], dtype=np.float64) result = s.mode(dropna) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) + "dropna, data, expected", + [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ], + ) + @pytest.mark.parametrize( + "dt", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) ) def test_mode_numerical(self, dropna, data, expected, dt): s = Series(data, dtype=dt) @@ -1066,134 +1087,173 @@ def test_mode_numerical(self, dropna, data, expected, dt): expected = Series(expected, dtype=dt) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) + @pytest.mark.parametrize("dropna, expected", [(True, [1.0]), (False, [1, np.nan])]) def test_mode_numerical_nan(self, dropna, expected): s = Series([1, 1, 2, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + ) def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. 
- data = ['a'] * 2 + ['b'] * 3 + data = ["a"] * 2 + ["b"] * 3 - s = Series(data, dtype='c') + s = Series(data, dtype="c") result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') + expected1 = Series(expected1, dtype="c") tm.assert_series_equal(result, expected1) - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object) result = s.mode(dropna) expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object).astype(str) result = s.mode(dropna) expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], + ) def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) + s = Series([1, "foo", "foo"]) result = s.mode(dropna) expected = Series(expected1) tm.assert_series_equal(result, expected) - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + ( + True, + ["1900-05-03", "2011-01-03", "2013-01-02"], + ["2011-01-03", "2013-01-02"], + ), + (False, [np.nan], [np.nan, "2011-01-03", "2013-01-02"]), + ], + ) def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"], dtype="M8[ns]" + ) result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') + expected1 = Series(expected1, dtype="M8[ns]") tm.assert_series_equal(result, expected1) - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') + s = Series( + [ + "2011-01-03", + "2013-01-02", + "1900-05-03", + "2011-01-03", + "2013-01-02", + "nan", + "nan", + ], + dtype="M8[ns]", + ) result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') + expected2 = Series(expected2, dtype="M8[ns]") tm.assert_series_equal(result, expected2) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["-1 days", "0 days", "1 days"], ["2 min", "1 day"]), + (False, [np.nan], [np.nan, "2 min", "1 day"]), + ], + ) def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. 
- s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') + s = Series( + ["1 days", "-1 days", "0 days", "nan", "nan"], dtype="timedelta64[ns]" + ) result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') + expected1 = Series(expected1, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected1) - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') + s = Series( + [ + "1 day", + "1 day", + "-1 day", + "-1 day 2 min", + "2 min", + "2 min", + "nan", + "nan", + ], + dtype="timedelta64[ns]", + ) result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') + expected2 = Series(expected2, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected2) - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [ + ( + True, + Categorical([1, 2], categories=[1, 2]), + Categorical(["a"], categories=[1, "a"]), + Categorical([3, 1], categories=[3, 2, 1], ordered=True), + ), + ( + False, + Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, "a"], categories=[1, "a"]), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True), + ), + ], + ) def test_mode_category(self, dropna, expected1, expected2, expected3): s = Series(Categorical([1, 2, np.nan, np.nan])) result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') + expected1 = Series(expected1, dtype="category") tm.assert_series_equal(result, expected1) - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + s = Series(Categorical([1, "a", "a", np.nan, np.nan])) result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') + expected2 = Series(expected2, dtype="category") tm.assert_series_equal(result, expected2) - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) + s = Series( + Categorical( + [1, 1, 2, 3, 3, np.nan, np.nan], categories=[3, 2, 1], ordered=True + ) + ) result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') + expected3 = Series(expected3, dtype="category") tm.assert_series_equal(result, expected3) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, [2 ** 63], [1, 2 ** 63]), (False, [2 ** 63], [1, 2 ** 63])], + ) def test_mode_intoverflow(self, dropna, expected1, expected2): # Test for uint64 overflow. 
- s = Series([1, 2**63, 2**63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) result = s.mode(dropna) expected1 = Series(expected1, dtype=np.uint64) tm.assert_series_equal(result, expected1) - s = Series([1, 2**63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) result = s.mode(dropna) expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) @@ -1202,8 +1262,8 @@ def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) + expected = Series(["foo", np.nan]) + s = Series([1, "foo", "foo", np.nan, np.nan]) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = s.mode(dropna=False) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index b0fd2f290031e7..432811b5a8264b 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -15,37 +15,36 @@ class TestDatetimeLikeStatReductions: - - @pytest.mark.parametrize('box', [Series, pd.Index, DatetimeArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray]) def test_dt64_mean(self, tz_naive_fixture, box): tz = tz_naive_fixture - dti = pd.date_range('2001-01-01', periods=11, tz=tz) + dti = pd.date_range("2001-01-01", periods=11, tz=tz) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) dtarr = dti._data obj = box(dtarr) - assert obj.mean() == pd.Timestamp('2001-01-06', tz=tz) - assert obj.mean(skipna=False) == pd.Timestamp('2001-01-06', tz=tz) + assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz) + assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz) # dtarr[-2] will be the first date 2001-01-1 dtarr[-2] = pd.NaT obj = box(dtarr) - assert obj.mean() == pd.Timestamp('2001-01-06 07:12:00', tz=tz) + assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) assert obj.mean(skipna=False) is pd.NaT - @pytest.mark.parametrize('box', [Series, pd.Index, PeriodArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) def test_period_mean(self, box): # GH#24757 - dti = pd.date_range('2001-01-01', periods=11) + dti = pd.date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) # use hourly frequency to avoid rounding errors in expected results # TODO: flesh this out with different frequencies - parr = dti._data.to_period('H') + parr = dti._data.to_period("H") obj = box(parr) with pytest.raises(TypeError, match="ambiguous"): obj.mean() @@ -60,10 +59,9 @@ def test_period_mean(self, box): with pytest.raises(TypeError, match="ambiguous"): obj.mean(skipna=True) - @pytest.mark.parametrize('box', [Series, pd.Index, TimedeltaArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray]) def test_td64_mean(self, box): - tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], - unit='D') + tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") tdarr = tdi._data obj = box(tdarr) @@ -79,7 +77,7 @@ def test_td64_mean(self, box): assert result2 == tdi[1:].mean() # exact equality fails by 1 nanosecond - assert result2.round('us') == (result * 11. 
/ 10).round('us') + assert result2.round("us") == (result * 11.0 / 10).round("us") class TestSeriesStatReductions: @@ -87,18 +85,19 @@ class TestSeriesStatReductions: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - def _check_stat_op(self, name, alternate, string_series_, - check_objects=False, check_allna=False): + def _check_stat_op( + self, name, alternate, string_series_, check_objects=False, check_allna=False + ): - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): f = getattr(Series, name) # add some NaNs string_series_[5:15] = np.NaN # mean, idxmax, idxmin, min, and max are valid for dates - if name not in ['max', 'min', 'mean']: - ds = Series(pd.date_range('1/1/2001', periods=10)) + if name not in ["max", "min", "mean"]: + ds = Series(pd.date_range("1/1/2001", periods=10)) with pytest.raises(TypeError): f(ds) @@ -123,67 +122,67 @@ def _check_stat_op(self, name, alternate, string_series_, # GH#2888 items = [0] items.extend(range(2 ** 40, 2 ** 40 + 1000)) - s = Series(items, dtype='int64') + s = Series(items, dtype="int64") tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) # check date range if check_objects: - s = Series(pd.bdate_range('1/1/2000', periods=10)) + s = Series(pd.bdate_range("1/1/2000", periods=10)) res = f(s) exp = alternate(s) assert res == exp # check on string data - if name not in ['sum', 'min', 'max']: + if name not in ["sum", "min", "max"]: with pytest.raises(TypeError): - f(Series(list('abc'))) + f(Series(list("abc"))) # Invalid axis. with pytest.raises(ValueError): f(string_series_, axis=1) # Unimplemented numeric_only parameter. - if 'numeric_only' in inspect.getfullargspec(f).args: + if "numeric_only" in inspect.getfullargspec(f).args: with pytest.raises(NotImplementedError, match=name): f(string_series_, numeric_only=True) def test_sum(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('sum', np.sum, string_series, check_allna=False) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("sum", np.sum, string_series, check_allna=False) def test_mean(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('mean', np.mean, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("mean", np.mean, string_series) def test_median(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('median', np.median, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("median", np.median, string_series) # test with integers, test failure int_ts = Series(np.ones(10, dtype=int), index=range(10)) tm.assert_almost_equal(np.median(int_ts), int_ts.median()) def test_prod(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('prod', np.prod, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("prod", np.prod, string_series) def test_min(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('min', np.min, string_series, check_objects=True) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("min", np.min, string_series, check_objects=True) def test_max(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('max', np.max, string_series, check_objects=True) + string_series = 
tm.makeStringSeries().rename("series") + self._check_stat_op("max", np.max, string_series, check_objects=True) def test_var_std(self): - string_series = tm.makeStringSeries().rename('series') - datetime_series = tm.makeTimeSeries().rename('ts') + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") alt = lambda x: np.std(x, ddof=1) - self._check_stat_op('std', alt, string_series) + self._check_stat_op("std", alt, string_series) alt = lambda x: np.var(x, ddof=1) - self._check_stat_op('var', alt, string_series) + self._check_stat_op("var", alt, string_series) result = datetime_series.std(ddof=4) expected = np.std(datetime_series.values, ddof=4) @@ -202,15 +201,16 @@ def test_var_std(self): assert pd.isna(result) def test_sem(self): - string_series = tm.makeStringSeries().rename('series') - datetime_series = tm.makeTimeSeries().rename('ts') + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) - self._check_stat_op('sem', alt, string_series) + self._check_stat_op("sem", alt, string_series) result = datetime_series.sem(ddof=4) - expected = np.std(datetime_series.values, - ddof=4) / np.sqrt(len(datetime_series.values)) + expected = np.std(datetime_series.values, ddof=4) / np.sqrt( + len(datetime_series.values) + ) tm.assert_almost_equal(result, expected) # 1 - element series with ddof=1 @@ -222,10 +222,10 @@ def test_sem(self): def test_skew(self): from scipy.stats import skew - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") alt = lambda x: skew(x, bias=False) - self._check_stat_op('skew', alt, string_series) + self._check_stat_op("skew", alt, string_series) # test corner cases, skew() returns NaN unless there's at least 3 # values @@ -244,17 +244,17 @@ def test_skew(self): def test_kurt(self): from scipy.stats import kurtosis - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") alt = lambda x: kurtosis(x, bias=False) - self._check_stat_op('kurt', alt, string_series) + self._check_stat_op("kurt", alt, string_series) index = pd.MultiIndex( - levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]] + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) s = Series(np.random.randn(6), index=index) - tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) + tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"]) # test corner cases, kurt() returns NaN unless there's at least 4 # values diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d0f78f6d5b4398..bb4f7ced3350fc 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -8,10 +8,23 @@ from pandas.core.indexes.period import period_range # The various methods we support -downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', - 'median', 'prod', 'var', 'std', 'ohlc', 'quantile'] -upsample_methods = ['count', 'size'] -series_methods = ['nunique'] +downsample_methods = [ + "min", + "max", + "first", + "last", + "sum", + "mean", + "sem", + "median", + "prod", + "var", + "std", + "ohlc", + "quantile", +] +upsample_methods = ["count", "size"] +series_methods = ["nunique"] resample_methods = downsample_methods + upsample_methods + series_methods @@ 
-38,9 +51,11 @@ def simple_date_range_series(): """ Series with date range index and random data for test purposes. """ - def _simple_date_range_series(start, end, freq='D'): + + def _simple_date_range_series(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) + return _simple_date_range_series @@ -49,9 +64,11 @@ def simple_period_range_series(): """ Series with period range index and random data for test purposes. """ - def _simple_period_range_series(start, end, freq='D'): + + def _simple_period_range_series(start, end, freq="D"): rng = period_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) + return _simple_period_range_series @@ -70,7 +87,7 @@ def _index_end(): @pytest.fixture def _index_freq(): """Fixture for parametrization of index, series and frame.""" - return 'D' + return "D" @pytest.fixture @@ -83,8 +100,7 @@ def _index_name(): def index(_index_factory, _index_start, _index_end, _index_freq, _index_name): """Fixture for parametrization of date_range, period_range and timedelta_range indexes""" - return _index_factory( - _index_start, _index_end, freq=_index_freq, name=_index_name) + return _index_factory(_index_start, _index_end, freq=_index_freq, name=_index_name) @pytest.fixture @@ -121,7 +137,7 @@ def frame(index, _series_name, _static_values): """Fixture for parametrization of DataFrame with date_range, period_range and timedelta_range indexes""" # _series_name is intentionally unused - return DataFrame({'value': _static_values}, index=index) + return DataFrame({"value": _static_values}, index=index) @pytest.fixture diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 63fa2007e401d7..51e309130e45d0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -12,8 +12,11 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) # a fixture value can be overridden by the test parameter value. 
Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -21,14 +24,13 @@ # see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa # in this module we override the fixture values defined in conftest.py # tuples of '_index_factory,_series_name,_index_start,_index_end' -DATE_RANGE = (date_range, 'dti', datetime(2005, 1, 1), datetime(2005, 1, 10)) -PERIOD_RANGE = ( - period_range, 'pi', datetime(2005, 1, 1), datetime(2005, 1, 10)) -TIMEDELTA_RANGE = (timedelta_range, 'tdi', '1 day', '10 day') +DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") all_ts = pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", + [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], ) @@ -37,13 +39,13 @@ def create_index(_index_factory): def _create_index(*args, **kwargs): """ return the _index_factory created using the args, kwargs """ return _index_factory(*args, **kwargs) + return _create_index -@pytest.mark.parametrize('freq', ['2D', '1H']) +@pytest.mark.parametrize("freq", ["2D", "1H"]) @pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) def test_asfreq(series_and_frame, freq, create_index): obj = series_and_frame @@ -55,24 +57,22 @@ def test_asfreq(series_and_frame, freq, create_index): @pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) def test_asfreq_fill_value(series, create_index): # test for fill value during resampling, issue 3715 s = series - result = s.resample('1H').asfreq() - new_index = create_index(s.index[0], s.index[-1], freq='1H') + result = s.resample("1H").asfreq() + new_index = create_index(s.index[0], s.index[-1], freq="1H") expected = s.reindex(new_index) assert_series_equal(result, expected) - frame = s.to_frame('value') + frame = s.to_frame("value") frame.iloc[1] = None - result = frame.resample('1H').asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], - frame.index[-1], freq='1H') + result = frame.resample("1H").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1H") expected = frame.reindex(new_index, fill_value=4.0) assert_frame_equal(result, expected) @@ -82,26 +82,28 @@ def test_resample_interpolate(frame): # # 12925 df = frame assert_frame_equal( - df.resample('1T').asfreq().interpolate(), - df.resample('1T').interpolate()) + df.resample("1T").asfreq().interpolate(), df.resample("1T").interpolate() + ) def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() - msg = ("Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Index'") + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," + " but got an instance of 'Index'" + ) with pytest.raises(TypeError, match=msg): - xp.resample('A').mean() + xp.resample("A").mean() @all_ts -@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +@pytest.mark.parametrize("freq", ["M", "D", "H"]) def 
test_resample_empty_series(freq, empty_series, resample_method): # GH12771 & GH12868 - if resample_method == 'ohlc': - pytest.skip('need to test for ohlc from GH13083') + if resample_method == "ohlc": + pytest.skip("need to test for ohlc from GH13083") s = empty_series result = getattr(s.resample(freq), resample_method)() @@ -117,13 +119,13 @@ def test_resample_empty_series(freq, empty_series, resample_method): @all_ts -@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +@pytest.mark.parametrize("freq", ["M", "D", "H"]) def test_resample_empty_dataframe(empty_frame, freq, resample_method): # GH13212 df = empty_frame # count retains dimensions too result = getattr(df.resample(freq), resample_method)() - if resample_method != 'size': + if resample_method != "size": expected = df.copy() else: # GH14962 @@ -141,9 +143,7 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) -@pytest.mark.parametrize( - "dtype", - [np.float, np.int, np.object, 'datetime64[ns]']) +@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions @@ -151,7 +151,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # them to ensure they no longer do. (GH #10228) empty_series = Series([], index, dtype) try: - getattr(empty_series.resample('d'), resample_method)() + getattr(empty_series.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -162,30 +162,25 @@ def test_resample_empty_dtypes(index, dtype, resample_method): def test_resample_loffset_arg_type(frame, create_index): # GH 13218, 15002 df = frame - expected_means = [df.values[i:i + 2].mean() - for i in range(0, len(df.values), 2)] - expected_index = create_index(df.index[0], - periods=len(df.index) / 2, - freq='2D') + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") # loffset coerces PeriodIndex to DateTimeIndex if isinstance(expected_index, PeriodIndex): expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) - expected = DataFrame({'value': expected_means}, index=expected_index) + expected = DataFrame({"value": expected_means}, index=expected_index) - for arg in ['mean', {'value': 'mean'}, ['mean']]: + for arg in ["mean", {"value": "mean"}, ["mean"]]: - result_agg = df.resample('2D', loffset='2H').agg(arg) + result_agg = df.resample("2D", loffset="2H").agg(arg) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result_how = df.resample('2D', how=arg, loffset='2H') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result_how = df.resample("2D", how=arg, loffset="2H") if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([('value', - 'mean')]) + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex if isinstance(expected.index, TimedeltaIndex): @@ -203,7 +198,7 @@ def test_resample_loffset_arg_type(frame, create_index): def test_apply_to_empty_series(empty_series): # GH 14313 s = empty_series - for freq in ['M', 'D', 'H']: + for freq in ["M", "D", "H"]: result = s.resample(freq).apply(lambda x: 1) expected = 
s.resample(freq).apply(np.sum) @@ -213,8 +208,8 @@ def test_apply_to_empty_series(empty_series): @all_ts def test_resampler_is_iterable(series): # GH 15314 - freq = 'H' - tg = Grouper(freq=freq, convention='start') + freq = "H" + tg = Grouper(freq=freq, convention="start") grouped = series.groupby(tg) resampled = series.resample(freq) for (rk, rv), (gk, gv) in zip(resampled, grouped): @@ -227,7 +222,7 @@ def test_resample_quantile(series): # GH 15023 s = series q = 0.75 - freq = 'H' + freq = "H" result = s.resample(freq).quantile(q) expected = s.resample(freq).agg(lambda x: x.quantile(q)).rename(s.name) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 830ba6062cc720..929bd1725b30a7 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -16,7 +16,10 @@ from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets from pandas.tseries.offsets import BDay, Minute @@ -29,7 +32,7 @@ def _index_factory(): @pytest.fixture def _index_freq(): - return 'Min' + return "Min" @pytest.fixture @@ -40,20 +43,20 @@ def _static_values(index): def test_custom_grouper(index): dti = index - s = Series(np.array([1] * len(dti)), index=dti, dtype='int64') + s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") b = Grouper(freq=Minute(5)) g = s.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) - b = Grouper(freq=Minute(5), closed='right', label='right') + b = Grouper(freq=Minute(5), closed="right", label="right") g = s.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) @@ -70,8 +73,7 @@ def test_custom_grouper(index): result = g.agg(np.sum) assert_series_equal(result, expect) - df = DataFrame(np.random.rand(len(dti), 10), - index=dti, dtype='float64') + df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64") r = df.groupby(b).agg(np.sum) assert len(r.columns) == 10 @@ -79,77 +81,90 @@ def test_custom_grouper(index): @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) -@pytest.mark.parametrize('closed, expected', [ - ('right', - lambda s: Series( - [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=date_range( - '1/1/2000', periods=4, freq='5min', name='index'))), - ('left', - lambda s: Series( - [s[:5].mean(), s[5:10].mean(), s[10:].mean()], - index=date_range( - '1/1/2000 00:05', periods=3, freq='5min', name='index')) - ) -]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "closed, expected", + [ + ( + "right", + lambda s: Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + ), + ), + ( + "left", + lambda s: Series( + [s[:5].mean(), s[5:10].mean(), s[10:].mean()], + index=date_range( + "1/1/2000 00:05", periods=3, 
freq="5min", name="index" + ), + ), + ), + ], +) def test_resample_basic(series, closed, expected): s = series expected = expected(s) - result = s.resample('5min', closed=closed, label='right').mean() + result = s.resample("5min", closed=closed, label="right").mean() assert_series_equal(result, expected) def test_resample_integerarray(): # GH 25580, resample on IntegerArray - ts = pd.Series(range(9), - index=pd.date_range('1/1/2000', periods=9, freq='T'), - dtype='Int64') - result = ts.resample('3T').sum() - expected = Series([3, 12, 21], - index=pd.date_range('1/1/2000', periods=3, freq='3T'), - dtype="Int64") + ts = pd.Series( + range(9), index=pd.date_range("1/1/2000", periods=9, freq="T"), dtype="Int64" + ) + result = ts.resample("3T").sum() + expected = Series( + [3, 12, 21], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="Int64", + ) assert_series_equal(result, expected) - result = ts.resample('3T').mean() - expected = Series([1, 4, 7], - index=pd.date_range('1/1/2000', periods=3, freq='3T'), - dtype='Int64') + result = ts.resample("3T").mean() + expected = Series( + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" + ) assert_series_equal(result, expected) def test_resample_basic_grouper(series): s = series - result = s.resample('5Min').last() - grouper = Grouper(freq=Minute(5), closed='left', label='left') + result = s.resample("5Min").last() + grouper = Grouper(freq=Minute(5), closed="left", label="left") expected = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expected) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) -@pytest.mark.parametrize('keyword,value', [ - ('label', 'righttt'), - ('closed', 'righttt'), - ('convention', 'starttt') -]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "keyword,value", + [("label", "righttt"), ("closed", "righttt"), ("convention", "starttt")], +) def test_resample_string_kwargs(series, keyword, value): # see gh-19303 # Check that wrong keyword argument strings raise an error msg = "Unsupported value {value} for `{keyword}`".format( - value=value, keyword=keyword) + value=value, keyword=keyword + ) with pytest.raises(ValueError, match=msg): - series.resample('5min', **({keyword: value})) + series.resample("5min", **({keyword: value})) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) def test_resample_how(series, downsample_method): - if downsample_method == 'ohlc': - pytest.skip('covered by test_resample_how_ohlc') + if downsample_method == "ohlc": + pytest.skip("covered by test_resample_how_ohlc") s = series grouplist = np.ones_like(s) @@ -158,17 +173,18 @@ def test_resample_how(series, downsample_method): grouplist[6:11] = 2 grouplist[11:] = 3 expected = s.groupby(grouplist).agg(downsample_method) - expected.index = date_range( - '1/1/2000', periods=4, freq='5min', name='index') + expected.index = date_range("1/1/2000", periods=4, freq="5min", name="index") - result = getattr(s.resample( - '5min', closed='right', label='right'), downsample_method)() + result = getattr( + s.resample("5min", closed="right", label="right"), downsample_method + )() assert_series_equal(result, expected) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', 
- [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) def test_resample_how_ohlc(series): s = series grouplist = np.ones_like(s) @@ -184,20 +200,19 @@ def _ohlc(group): expected = DataFrame( s.groupby(grouplist).agg(_ohlc).values.tolist(), - index=date_range('1/1/2000', periods=4, freq='5min', name='index'), - columns=['open', 'high', 'low', 'close']) + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + columns=["open", "high", "low", "close"], + ) - result = s.resample('5min', closed='right', label='right').ohlc() + result = s.resample("5min", closed="right", label="right").ohlc() assert_frame_equal(result, expected) -@pytest.mark.parametrize( - 'func', ['min', 'max', 'sum', 'prod', 'mean', 'var', 'std']) +@pytest.mark.parametrize("func", ["min", "max", "sum", "prod", "mean", "var", "std"]) def test_numpy_compat(func): # see gh-12811 - s = Series([1, 2, 3, 4, 5], index=date_range( - '20130101', periods=5, freq='s')) - r = s.resample('2s') + s = Series([1, 2, 3, 4, 5], index=date_range("20130101", periods=5, freq="s")) + r = s.resample("2s") msg = "numpy operations are not valid with resample" @@ -210,14 +225,13 @@ def test_numpy_compat(func): def test_resample_how_callables(): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start='2014-01-01', periods=len(data), freq='d') + ind = date_range(start="2014-01-01", periods=len(data), freq="d") df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): return str(type(x)) class FnClass: - def __call__(self, x): return str(type(x)) @@ -258,120 +272,126 @@ def test_resample_rounding(): 11-08-2014,00:00:20.674,1 11-08-2014,00:00:21.191,1""" - df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [ - 'date', 'time']}, index_col='timestamp') + df = pd.read_csv( + StringIO(data), + parse_dates={"timestamp": ["date", "time"]}, + index_col="timestamp", + ) df.index.name = None - result = df.resample('6s').sum() - expected = DataFrame({'value': [ - 4, 9, 4, 2 - ]}, index=date_range('2014-11-08', freq='6s', periods=4)) + result = df.resample("6s").sum() + expected = DataFrame( + {"value": [4, 9, 4, 2]}, index=date_range("2014-11-08", freq="6s", periods=4) + ) assert_frame_equal(result, expected) - result = df.resample('7s').sum() - expected = DataFrame({'value': [ - 4, 10, 4, 1 - ]}, index=date_range('2014-11-08', freq='7s', periods=4)) + result = df.resample("7s").sum() + expected = DataFrame( + {"value": [4, 10, 4, 1]}, index=date_range("2014-11-08", freq="7s", periods=4) + ) assert_frame_equal(result, expected) - result = df.resample('11s').sum() - expected = DataFrame({'value': [ - 11, 8 - ]}, index=date_range('2014-11-08', freq='11s', periods=2)) + result = df.resample("11s").sum() + expected = DataFrame( + {"value": [11, 8]}, index=date_range("2014-11-08", freq="11s", periods=2) + ) assert_frame_equal(result, expected) - result = df.resample('13s').sum() - expected = DataFrame({'value': [ - 13, 6 - ]}, index=date_range('2014-11-08', freq='13s', periods=2)) + result = df.resample("13s").sum() + expected = DataFrame( + {"value": [13, 6]}, index=date_range("2014-11-08", freq="13s", periods=2) + ) assert_frame_equal(result, expected) - result = df.resample('17s').sum() - expected = DataFrame({'value': [ - 16, 3 - ]}, index=date_range('2014-11-08', freq='17s', periods=2)) + result = df.resample("17s").sum() + expected = DataFrame( + {"value": [16, 3]}, index=date_range("2014-11-08", freq="17s", 
periods=2) + ) assert_frame_equal(result, expected) def test_resample_basic_from_daily(): # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D', name='index') + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) s = Series(np.random.rand(len(dti)), dti) # to weekly - result = s.resample('w-sun').last() + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() - assert result.iloc[0] == s['1/2/2005'] - assert result.iloc[1] == s['1/9/2005'] + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/9/2005"] assert result.iloc[2] == s.iloc[-1] - result = s.resample('W-MON').last() + result = s.resample("W-MON").last() assert len(result) == 2 assert (result.index.dayofweek == [0, 0]).all() - assert result.iloc[0] == s['1/3/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/3/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-TUE').last() + result = s.resample("W-TUE").last() assert len(result) == 2 assert (result.index.dayofweek == [1, 1]).all() - assert result.iloc[0] == s['1/4/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/4/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-WED').last() + result = s.resample("W-WED").last() assert len(result) == 2 assert (result.index.dayofweek == [2, 2]).all() - assert result.iloc[0] == s['1/5/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/5/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-THU').last() + result = s.resample("W-THU").last() assert len(result) == 2 assert (result.index.dayofweek == [3, 3]).all() - assert result.iloc[0] == s['1/6/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/6/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-FRI').last() + result = s.resample("W-FRI").last() assert len(result) == 2 assert (result.index.dayofweek == [4, 4]).all() - assert result.iloc[0] == s['1/7/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/7/2005"] + assert result.iloc[1] == s["1/10/2005"] # to biz day - result = s.resample('B').last() + result = s.resample("B").last() assert len(result) == 7 assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all() - assert result.iloc[0] == s['1/2/2005'] - assert result.iloc[1] == s['1/3/2005'] - assert result.iloc[5] == s['1/9/2005'] - assert result.index.name == 'index' + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/3/2005"] + assert result.iloc[5] == s["1/9/2005"] + assert result.index.name == "index" def test_resample_upsampling_picked_but_not_correct(): # Test for issue #3020 - dates = date_range('01-Jan-2014', '05-Jan-2014', freq='D') + dates = date_range("01-Jan-2014", "05-Jan-2014", freq="D") series = Series(1, index=dates) - result = series.resample('D').mean() + result = series.resample("D").mean() assert result.index[0] == dates[0] # GH 5955 # incorrect deciding to upsample when the axis frequency matches the # resample frequency - s = Series(np.arange(1., 6), index=[datetime( - 1975, 1, i, 12, 0) for i in range(1, 6)]) - expected = Series(np.arange(1., 6), index=date_range( - '19750101', periods=5, freq='D')) + s = Series( + np.arange(1.0, 6), index=[datetime(1975, 1, i, 12, 0) for i in range(1, 6)] + ) + expected = Series( + np.arange(1.0, 6), 
index=date_range("19750101", periods=5, freq="D") + ) - result = s.resample('D').count() + result = s.resample("D").count() assert_series_equal(result, Series(1, index=expected.index)) - result1 = s.resample('D').sum() - result2 = s.resample('D').mean() + result1 = s.resample("D").sum() + result2 = s.resample("D").mean() assert_series_equal(result1, expected) assert_series_equal(result2, expected) @@ -379,77 +399,76 @@ def test_resample_upsampling_picked_but_not_correct(): def test_resample_frame_basic(): df = tm.makeTimeDataFrame() - b = Grouper(freq='M') + b = Grouper(freq="M") g = df.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) - result = df.resample('A').mean() - assert_series_equal(result['A'], df['A'].resample('A').mean()) + result = df.resample("A").mean() + assert_series_equal(result["A"], df["A"].resample("A").mean()) - result = df.resample('M').mean() - assert_series_equal(result['A'], df['A'].resample('M').mean()) + result = df.resample("M").mean() + assert_series_equal(result["A"], df["A"].resample("M").mean()) - df.resample('M', kind='period').mean() - df.resample('W-WED', kind='period').mean() + df.resample("M", kind="period").mean() + df.resample("W-WED", kind="period").mean() -@pytest.mark.parametrize('loffset', [timedelta(minutes=1), - '1min', Minute(1), - np.timedelta64(1, 'm')]) +@pytest.mark.parametrize( + "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] +) def test_resample_loffset(loffset): # GH 7687 - rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', closed='right', label='right', - loffset=loffset).mean() - idx = date_range('1/1/2000', periods=4, freq='5min') - expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=idx + timedelta(minutes=1)) + result = s.resample("5min", closed="right", label="right", loffset=loffset).mean() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1), + ) assert_series_equal(result, expected) assert result.index.freq == Minute(5) # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D') + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") ser = Series(np.random.rand(len(dti)), dti) # to weekly - result = ser.resample('w-sun').last() + result = ser.resample("w-sun").last() business_day_offset = BDay() - expected = ser.resample('w-sun', loffset=-business_day_offset).last() + expected = ser.resample("w-sun", loffset=-business_day_offset).last() assert result.index[0] - business_day_offset == expected.index[0] def test_resample_loffset_upsample(): # GH 20744 - rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', closed='right', label='right', - loffset=timedelta(minutes=1)).ffill() - idx = date_range('1/1/2000', periods=4, freq='5min') - expected = Series([s[0], s[5], s[10], s[-1]], - index=idx + timedelta(minutes=1)) + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() + idx = 
date_range("1/1/2000", periods=4, freq="5min") + expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) assert_series_equal(result, expected) def test_resample_loffset_count(): # GH 12725 - start_time = '1/1/2000 00:00:00' - rng = date_range(start_time, periods=100, freq='S') + start_time = "1/1/2000 00:00:00" + rng = date_range(start_time, periods=100, freq="S") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('10S', loffset='1s').count() + result = ts.resample("10S", loffset="1s").count() - expected_index = ( - date_range(start_time, periods=10, freq='10S') + - timedelta(seconds=1) + expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( + seconds=1 ) expected = Series(10, index=expected_index) @@ -457,70 +476,78 @@ def test_resample_loffset_count(): # Same issue should apply to .size() since it goes through # same code path - result = ts.resample('10S', loffset='1s').size() + result = ts.resample("10S", loffset="1s").size() assert_series_equal(result, expected) def test_resample_upsample(): # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D', name='index') + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) s = Series(np.random.rand(len(dti)), dti) # to minutely, by padding - result = s.resample('Min').pad() + result = s.resample("Min").pad() assert len(result) == 12961 assert result[0] == s[0] assert result[-1] == s[-1] - assert result.index.name == 'index' + assert result.index.name == "index" def test_resample_how_method(): # GH9915 - s = Series([11, 22], - index=[Timestamp('2015-03-31 21:48:52.672000'), - Timestamp('2015-03-31 21:49:52.739000')]) - expected = Series([11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], - index=[Timestamp('2015-03-31 21:48:50'), - Timestamp('2015-03-31 21:49:00'), - Timestamp('2015-03-31 21:49:10'), - Timestamp('2015-03-31 21:49:20'), - Timestamp('2015-03-31 21:49:30'), - Timestamp('2015-03-31 21:49:40'), - Timestamp('2015-03-31 21:49:50')]) + s = Series( + [11, 22], + index=[ + Timestamp("2015-03-31 21:48:52.672000"), + Timestamp("2015-03-31 21:49:52.739000"), + ], + ) + expected = Series( + [11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], + index=[ + Timestamp("2015-03-31 21:48:50"), + Timestamp("2015-03-31 21:49:00"), + Timestamp("2015-03-31 21:49:10"), + Timestamp("2015-03-31 21:49:20"), + Timestamp("2015-03-31 21:49:30"), + Timestamp("2015-03-31 21:49:40"), + Timestamp("2015-03-31 21:49:50"), + ], + ) assert_series_equal(s.resample("10S").mean(), expected) def test_resample_extra_index_point(): # GH#9756 - index = date_range(start='20150101', end='20150331', freq='BM') - expected = DataFrame({'A': Series([21, 41, 63], index=index)}) + index = date_range(start="20150101", end="20150331", freq="BM") + expected = DataFrame({"A": Series([21, 41, 63], index=index)}) - index = date_range(start='20150101', end='20150331', freq='B') - df = DataFrame( - {'A': Series(range(len(index)), index=index)}, dtype='int64') - result = df.resample('BM').last() + index = date_range(start="20150101", end="20150331", freq="B") + df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64") + result = df.resample("BM").last() assert_frame_equal(result, expected) def test_upsample_with_limit(): - rng = date_range('1/1/2000', periods=3, freq='5t') + rng = date_range("1/1/2000", periods=3, freq="5t") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('t').ffill(limit=2) - 
expected = ts.reindex(result.index, method='ffill', limit=2) + result = ts.resample("t").ffill(limit=2) + expected = ts.reindex(result.index, method="ffill", limit=2) assert_series_equal(result, expected) def test_nearest_upsample_with_limit(): - rng = date_range('1/1/2000', periods=3, freq='5t') + rng = date_range("1/1/2000", periods=3, freq="5t") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('t').nearest(limit=2) - expected = ts.reindex(result.index, method='nearest', limit=2) + result = ts.resample("t").nearest(limit=2) + expected = ts.reindex(result.index, method="nearest", limit=2) assert_series_equal(result, expected) @@ -529,75 +556,93 @@ def test_resample_ohlc(series): grouper = Grouper(freq=Minute(5)) expect = s.groupby(grouper).agg(lambda x: x[-1]) - result = s.resample('5Min').ohlc() + result = s.resample("5Min").ohlc() assert len(result) == len(expect) assert len(result.columns) == 4 xs = result.iloc[-2] - assert xs['open'] == s[-6] - assert xs['high'] == s[-6:-1].max() - assert xs['low'] == s[-6:-1].min() - assert xs['close'] == s[-2] + assert xs["open"] == s[-6] + assert xs["high"] == s[-6:-1].max() + assert xs["low"] == s[-6:-1].min() + assert xs["close"] == s[-2] xs = result.iloc[0] - assert xs['open'] == s[0] - assert xs['high'] == s[:5].max() - assert xs['low'] == s[:5].min() - assert xs['close'] == s[4] + assert xs["open"] == s[0] + assert xs["high"] == s[:5].max() + assert xs["low"] == s[:5].min() + assert xs["close"] == s[4] def test_resample_ohlc_result(): # GH 12332 - index = pd.date_range('1-1-2000', '2-15-2000', freq='h') - index = index.union(pd.date_range('4-15-2000', '5-15-2000', freq='h')) + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index = index.union(pd.date_range("4-15-2000", "5-15-2000", freq="h")) s = Series(range(len(index)), index=index) - a = s.loc[:'4-15-2000'].resample('30T').ohlc() + a = s.loc[:"4-15-2000"].resample("30T").ohlc() assert isinstance(a, DataFrame) - b = s.loc[:'4-14-2000'].resample('30T').ohlc() + b = s.loc[:"4-14-2000"].resample("30T").ohlc() assert isinstance(b, DataFrame) # GH12348 # raising on odd period - rng = date_range('2013-12-30', '2014-01-07') - index = rng.drop([Timestamp('2014-01-01'), - Timestamp('2013-12-31'), - Timestamp('2014-01-04'), - Timestamp('2014-01-05')]) + rng = date_range("2013-12-30", "2014-01-07") + index = rng.drop( + [ + Timestamp("2014-01-01"), + Timestamp("2013-12-31"), + Timestamp("2014-01-04"), + Timestamp("2014-01-05"), + ] + ) df = DataFrame(data=np.arange(len(index)), index=index) - result = df.resample('B').mean() - expected = df.reindex(index=date_range(rng[0], rng[-1], freq='B')) + result = df.resample("B").mean() + expected = df.reindex(index=date_range(rng[0], rng[-1], freq="B")) assert_frame_equal(result, expected) def test_resample_ohlc_dataframe(): df = ( - DataFrame({ - 'PRICE': { - Timestamp('2011-01-06 10:59:05', tz=None): 24990, - Timestamp('2011-01-06 12:43:33', tz=None): 25499, - Timestamp('2011-01-06 12:54:09', tz=None): 25499}, - 'VOLUME': { - Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - Timestamp('2011-01-06 12:54:09', tz=None): 100000000}}) - ).reindex(['VOLUME', 'PRICE'], axis=1) - res = df.resample('H').ohlc() - exp = pd.concat([df['VOLUME'].resample('H').ohlc(), - df['PRICE'].resample('H').ohlc()], - axis=1, - keys=['VOLUME', 'PRICE']) + DataFrame( + { + "PRICE": { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + 
Timestamp("2011-01-06 12:54:09", tz=None): 25499, + }, + "VOLUME": { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + }, + } + ) + ).reindex(["VOLUME", "PRICE"], axis=1) + res = df.resample("H").ohlc() + exp = pd.concat( + [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + axis=1, + keys=["VOLUME", "PRICE"], + ) assert_frame_equal(exp, res) - df.columns = [['a', 'b'], ['c', 'd']] - res = df.resample('H').ohlc() - exp.columns = pd.MultiIndex.from_tuples([ - ('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'), - ('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'), - ('b', 'd', 'low'), ('b', 'd', 'close')]) + df.columns = [["a", "b"], ["c", "d"]] + res = df.resample("H").ohlc() + exp.columns = pd.MultiIndex.from_tuples( + [ + ("a", "c", "open"), + ("a", "c", "high"), + ("a", "c", "low"), + ("a", "c", "close"), + ("b", "d", "open"), + ("b", "d", "high"), + ("b", "d", "low"), + ("b", "d", "close"), + ] + ) assert_frame_equal(exp, res) # dupe columns fail atm @@ -608,49 +653,49 @@ def test_resample_dup_index(): # GH 4812 # dup columns with resample raising - df = DataFrame(np.random.randn(4, 12), index=[2000, 2000, 2000, 2000], - columns=[Period(year=2000, month=i + 1, freq='M') - for i in range(12)]) + df = DataFrame( + np.random.randn(4, 12), + index=[2000, 2000, 2000, 2000], + columns=[Period(year=2000, month=i + 1, freq="M") for i in range(12)], + ) df.iloc[3, :] = np.nan - result = df.resample('Q', axis=1).mean() + result = df.resample("Q", axis=1).mean() expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean() - expected.columns = [ - Period(year=2000, quarter=i + 1, freq='Q') for i in range(4)] + expected.columns = [Period(year=2000, quarter=i + 1, freq="Q") for i in range(4)] assert_frame_equal(result, expected) def test_resample_reresample(): - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D') + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") s = Series(np.random.rand(len(dti)), dti) - bs = s.resample('B', closed='right', label='right').mean() - result = bs.resample('8H').mean() + bs = s.resample("B", closed="right", label="right").mean() + result = bs.resample("8H").mean() assert len(result) == 22 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) def test_resample_timestamp_to_period(simple_date_range_series): - ts = simple_date_range_series('1/1/1990', '1/1/2000') + ts = simple_date_range_series("1/1/1990", "1/1/2000") - result = ts.resample('A-DEC', kind='period').mean() - expected = ts.resample('A-DEC').mean() - expected.index = period_range('1990', '2000', freq='a-dec') + result = ts.resample("A-DEC", kind="period").mean() + expected = ts.resample("A-DEC").mean() + expected.index = period_range("1990", "2000", freq="a-dec") assert_series_equal(result, expected) - result = ts.resample('A-JUN', kind='period').mean() - expected = ts.resample('A-JUN').mean() - expected.index = period_range('1990', '2000', freq='a-jun') + result = ts.resample("A-JUN", kind="period").mean() + expected = ts.resample("A-JUN").mean() + expected.index = period_range("1990", "2000", freq="a-jun") assert_series_equal(result, expected) - result = ts.resample('M', kind='period').mean() - expected = ts.resample('M').mean() - expected.index = period_range('1990-01', '2000-01', freq='M') + result = ts.resample("M", 
kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") assert_series_equal(result, expected) - result = ts.resample('M', kind='period').mean() - expected = ts.resample('M').mean() - expected.index = period_range('1990-01', '2000-01', freq='M') + result = ts.resample("M", kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") assert_series_equal(result, expected) @@ -660,27 +705,26 @@ def _ohlc(group): return np.repeat(np.nan, 4) return [group[0], group.max(), group.min(), group[-1]] - rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 5:59:50", freq="10s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', closed='right', - label='right').ohlc() + resampled = ts.resample("5min", closed="right", label="right").ohlc() - assert (resampled.loc['1/1/2000 00:00'] == ts[0]).all() + assert (resampled.loc["1/1/2000 00:00"] == ts[0]).all() exp = _ohlc(ts[1:31]) - assert (resampled.loc['1/1/2000 00:05'] == exp).all() + assert (resampled.loc["1/1/2000 00:05"] == exp).all() - exp = _ohlc(ts['1/1/2000 5:55:01':]) - assert (resampled.loc['1/1/2000 6:00:00'] == exp).all() + exp = _ohlc(ts["1/1/2000 5:55:01":]) + assert (resampled.loc["1/1/2000 6:00:00"] == exp).all() def test_downsample_non_unique(): - rng = date_range('1/1/2000', '2/29/2000') + rng = date_range("1/1/2000", "2/29/2000") rng2 = rng.repeat(5).values ts = Series(np.random.randn(len(rng2)), index=rng2) - result = ts.resample('M').mean() + result = ts.resample("M").mean() expected = ts.groupby(lambda x: x.month).mean() assert len(result) == 2 @@ -690,22 +734,21 @@ def test_downsample_non_unique(): def test_asfreq_non_unique(): # GH #1077 - rng = date_range('1/1/2000', '2/29/2000') + rng = date_range("1/1/2000", "2/29/2000") rng2 = rng.repeat(2).values ts = Series(np.random.randn(len(rng2)), index=rng2) - msg = 'cannot reindex from a duplicate axis' + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - ts.asfreq('B') + ts.asfreq("B") def test_resample_axis1(): - rng = date_range('1/1/2000', '2/29/2000') - df = DataFrame(np.random.randn(3, len(rng)), columns=rng, - index=['a', 'b', 'c']) + rng = date_range("1/1/2000", "2/29/2000") + df = DataFrame(np.random.randn(3, len(rng)), columns=rng, index=["a", "b", "c"]) - result = df.resample('M', axis=1).mean() - expected = df.T.resample('M').mean().T + result = df.resample("M", axis=1).mean() + expected = df.T.resample("M").mean().T tm.assert_frame_equal(result, expected) @@ -715,53 +758,52 @@ def test_resample_anchored_ticks(): # than starting from the first timestamp which might start in the # middle of a desired interval - rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s') + rng = date_range("1/1/2000 04:00:00", periods=86400, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) ts[:2] = np.nan # so results are the same - freqs = ['t', '5t', '15t', '30t', '4h', '12h'] + freqs = ["t", "5t", "15t", "30t", "4h", "12h"] for freq in freqs: - result = ts[2:].resample(freq, closed='left', label='left').mean() - expected = ts.resample(freq, closed='left', label='left').mean() + result = ts[2:].resample(freq, closed="left", label="left").mean() + expected = ts.resample(freq, closed="left", label="left").mean() assert_series_equal(result, expected) def test_resample_single_group(): mysum = lambda x: x.sum() - rng = 
date_range('2000-1-1', '2000-2-10', freq='D') + rng = date_range("2000-1-1", "2000-2-10", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M').sum(), - ts.resample('M').apply(mysum)) + assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) - rng = date_range('2000-1-1', '2000-1-10', freq='D') + rng = date_range("2000-1-1", "2000-1-10", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M').sum(), - ts.resample('M').apply(mysum)) + assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) # GH 3849 - s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'), - Timestamp('20070915 15:40:00')]) - expected = Series([0.75], index=[Timestamp('20070915')]) - result = s.resample('D').apply(lambda x: np.std(x)) + s = Series( + [30.1, 31.6], + index=[Timestamp("20070915 15:30:00"), Timestamp("20070915 15:40:00")], + ) + expected = Series([0.75], index=[Timestamp("20070915")]) + result = s.resample("D").apply(lambda x: np.std(x)) assert_series_equal(result, expected) def test_resample_base(): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', base=2).mean() - exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57', - freq='5min') + resampled = ts.resample("5min", base=2).mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") tm.assert_index_equal(resampled.index, exp_rng) def test_resample_float_base(): # GH25161 - dt = pd.to_datetime(["2018-11-26 16:17:43.51", - "2018-11-26 16:17:44.51", - "2018-11-26 16:17:45.51"]) + dt = pd.to_datetime( + ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] + ) s = Series(np.arange(3), index=dt) base = 17 + 43.51 / 60 @@ -771,35 +813,37 @@ def test_resample_float_base(): def test_resample_daily_anchored(): - rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T') + rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") ts = Series(np.random.randn(len(rng)), index=rng) ts[:2] = np.nan # so results are the same - result = ts[2:].resample('D', closed='left', label='left').mean() - expected = ts.resample('D', closed='left', label='left').mean() + result = ts[2:].resample("D", closed="left", label="left").mean() + expected = ts.resample("D", closed="left", label="left").mean() assert_series_equal(result, expected) def test_resample_to_period_monthly_buglet(): # GH #1259 - rng = date_range('1/1/2000', '12/31/2000') + rng = date_range("1/1/2000", "12/31/2000") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('M', kind='period').mean() - exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') + result = ts.resample("M", kind="period").mean() + exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) def test_period_with_agg(): # aggregate a period resampler with a lambda - s2 = Series(np.random.randint(0, 5, 50), - index=pd.period_range('2012-01-01', freq='H', periods=50), - dtype='float64') + s2 = Series( + np.random.randint(0, 5, 50), + index=pd.period_range("2012-01-01", freq="H", periods=50), + dtype="float64", + ) - expected = s2.to_timestamp().resample('D').mean().to_period() - result = s2.resample('D').agg(lambda x: x.mean()) + expected = s2.to_timestamp().resample("D").mean().to_period() + result = 
s2.resample("D").agg(lambda x: x.mean()) assert_series_equal(result, expected) @@ -810,11 +854,12 @@ def test_resample_segfault(): (1, datetime(2013, 10, 1, 16, 20), 1, 0), (2, datetime(2013, 10, 1, 16, 10), 1, 0), (2, datetime(2013, 10, 1, 18, 15), 1, 0), - (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0)] + (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0), + ] - df = DataFrame.from_records(all_wins_and_wagers, - columns=("ID", "timestamp", "A", "B") - ).set_index("timestamp") + df = DataFrame.from_records( + all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") + ).set_index("timestamp") result = df.groupby("ID").resample("5min").sum() expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) assert_frame_equal(result, expected) @@ -825,34 +870,30 @@ def test_resample_dtype_preservation(): # GH 12202 # validation tests for dtype preservation - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, freq='W'), - 'group': [1, 1, 2, 2], - 'val': Series([5, 6, 7, 8], - dtype='int32')} - ).set_index('date') - - result = df.resample('1D').ffill() + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": Series([5, 6, 7, 8], dtype="int32"), + } + ).set_index("date") + + result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby('group').resample('1D').ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 def test_resample_dtype_coerceion(): - pytest.importorskip('scipy.interpolate') + pytest.importorskip("scipy.interpolate") # GH 16361 df = {"a": [1, 3, 1, 4]} df = DataFrame(df, index=pd.date_range("2017-01-01", "2017-01-04")) - expected = (df.astype("float64") - .resample("H") - .mean() - ["a"] - .interpolate("cubic") - ) + expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") result = df.resample("H")["a"].mean().interpolate("cubic") tm.assert_series_equal(result, expected) @@ -863,20 +904,20 @@ def test_resample_dtype_coerceion(): def test_weekly_resample_buglet(): # #1327 - rng = date_range('1/1/2000', freq='B', periods=20) + rng = date_range("1/1/2000", freq="B", periods=20) ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('W').mean() - expected = ts.resample('W-SUN').mean() + resampled = ts.resample("W").mean() + expected = ts.resample("W-SUN").mean() assert_series_equal(resampled, expected) def test_monthly_resample_error(): # #1451 - dates = date_range('4/16/2012 20:00', periods=5000, freq='h') + dates = date_range("4/16/2012 20:00", periods=5000, freq="h") ts = Series(np.random.randn(len(dates)), index=dates) # it works! 
- ts.resample('M') + ts.resample("M") def test_nanosecond_resample_error(): @@ -884,20 +925,12 @@ def test_nanosecond_resample_error(): # Resampling using pd.tseries.offsets.Nano as period start = 1443707890427 exp_start = 1443707890400 - indx = pd.date_range( - start=pd.to_datetime(start), - periods=10, - freq='100n' - ) + indx = pd.date_range(start=pd.to_datetime(start), periods=10, freq="100n") ts = Series(range(len(indx)), index=indx) r = ts.resample(pd.tseries.offsets.Nano(100)) - result = r.agg('mean') + result = r.agg("mean") - exp_indx = pd.date_range( - start=pd.to_datetime(exp_start), - periods=10, - freq='100n' - ) + exp_indx = pd.date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") exp = Series(range(len(exp_indx)), index=exp_indx) assert_series_equal(result, exp) @@ -906,48 +939,44 @@ def test_nanosecond_resample_error(): def test_resample_anchored_intraday(simple_date_range_series): # #1471, #1458 - rng = date_range('1/1/2012', '4/1/2012', freq='100min') + rng = date_range("1/1/2012", "4/1/2012", freq="100min") df = DataFrame(rng.month, index=rng) - result = df.resample('M').mean() - expected = df.resample( - 'M', kind='period').mean().to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("M").mean() + expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - result = df.resample('M', closed='left').mean() - exp = df.tshift(1, freq='D').resample('M', kind='period').mean() - exp = exp.to_timestamp(how='end') + result = df.resample("M", closed="left").mean() + exp = df.tshift(1, freq="D").resample("M", kind="period").mean() + exp = exp.to_timestamp(how="end") - exp.index = exp.index + Timedelta(1, 'ns') - Timedelta(1, 'D') + exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, exp) - rng = date_range('1/1/2012', '4/1/2012', freq='100min') + rng = date_range("1/1/2012", "4/1/2012", freq="100min") df = DataFrame(rng.month, index=rng) - result = df.resample('Q').mean() - expected = df.resample( - 'Q', kind='period').mean().to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("Q").mean() + expected = df.resample("Q", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - result = df.resample('Q', closed='left').mean() - expected = df.tshift(1, freq='D').resample('Q', kind='period', - closed='left').mean() - expected = expected.to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("Q", closed="left").mean() + expected = df.tshift(1, freq="D").resample("Q", kind="period", closed="left").mean() + expected = expected.to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - ts = simple_date_range_series('2012-04-29 23:00', '2012-04-30 5:00', - freq='h') - resampled = ts.resample('M').mean() + ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") + resampled = ts.resample("M").mean() assert len(resampled) == 1 def test_resample_anchored_monthstart(simple_date_range_series): - ts = simple_date_range_series('1/1/2000', '12/31/2002') + ts = simple_date_range_series("1/1/2000", "12/31/2002") - freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN'] + freqs = ["MS", "BMS", "QS-MAR", 
"AS-DEC", "AS-JUN"] for freq in freqs: ts.resample(freq).mean() @@ -961,115 +990,113 @@ def test_resample_anchored_multiday(): # See: https://github.com/pandas-dev/pandas/issues/8683 index = pd.date_range( - '2014-10-14 23:06:23.206', periods=3, freq='400L' - ) | pd.date_range( - '2014-10-15 23:00:00', periods=2, freq='2200L') + "2014-10-14 23:06:23.206", periods=3, freq="400L" + ) | pd.date_range("2014-10-15 23:00:00", periods=2, freq="2200L") s = Series(np.random.randn(5), index=index) # Ensure left closing works - result = s.resample('2200L').mean() - assert result.index[-1] == Timestamp('2014-10-15 23:00:02.000') + result = s.resample("2200L").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:02.000") # Ensure right closing works - result = s.resample('2200L', label='right').mean() - assert result.index[-1] == Timestamp('2014-10-15 23:00:04.200') + result = s.resample("2200L", label="right").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:04.200") -def test_corner_cases(simple_period_range_series, - simple_date_range_series): +def test_corner_cases(simple_period_range_series, simple_date_range_series): # miscellaneous test coverage - rng = date_range('1/1/2000', periods=12, freq='t') + rng = date_range("1/1/2000", periods=12, freq="t") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('5t', closed='right', label='left').mean() - ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') + result = ts.resample("5t", closed="right", label="left").mean() + ex_index = date_range("1999-12-31 23:55", periods=4, freq="5t") tm.assert_index_equal(result.index, ex_index) - len0pts = simple_period_range_series( - '2007-01', '2010-05', freq='M')[:0] + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] # it works - result = len0pts.resample('A-DEC').mean() + result = len0pts.resample("A-DEC").mean() assert len(result) == 0 # resample to periods - ts = simple_date_range_series( - '2000-04-28', '2000-04-30 11:00', freq='h') - result = ts.resample('M', kind='period').mean() + ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") + result = ts.resample("M", kind="period").mean() assert len(result) == 1 - assert result.index[0] == Period('2000-04', freq='M') + assert result.index[0] == Period("2000-04", freq="M") def test_anchored_lowercase_buglet(): - dates = date_range('4/16/2012 20:00', periods=50000, freq='s') + dates = date_range("4/16/2012 20:00", periods=50000, freq="s") ts = Series(np.random.randn(len(dates)), index=dates) # it works! 
- ts.resample('d').mean() + ts.resample("d").mean() def test_upsample_apply_functions(): # #1596 - rng = pd.date_range('2012-06-12', periods=4, freq='h') + rng = pd.date_range("2012-06-12", periods=4, freq="h") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('20min').aggregate(['mean', 'sum']) + result = ts.resample("20min").aggregate(["mean", "sum"]) assert isinstance(result, DataFrame) def test_resample_not_monotonic(): - rng = pd.date_range('2012-06-12', periods=200, freq='h') + rng = pd.date_range("2012-06-12", periods=200, freq="h") ts = Series(np.random.randn(len(rng)), index=rng) ts = ts.take(np.random.permutation(len(ts))) - result = ts.resample('D').sum() - exp = ts.sort_index().resample('D').sum() + result = ts.resample("D").sum() + exp = ts.sort_index().resample("D").sum() assert_series_equal(result, exp) def test_resample_median_bug_1688(): - for dtype in ['int64', 'int32', 'float64', 'float32']: - df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0), - datetime(2012, 1, 1, 0, 5, 0)], - dtype=dtype) + for dtype in ["int64", "int32", "float64", "float32"]: + df = DataFrame( + [1, 2], + index=[datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)], + dtype=dtype, + ) result = df.resample("T").apply(lambda x: x.mean()) - exp = df.asfreq('T') + exp = df.asfreq("T") tm.assert_frame_equal(result, exp) result = df.resample("T").median() - exp = df.asfreq('T') + exp = df.asfreq("T") tm.assert_frame_equal(result, exp) def test_how_lambda_functions(simple_date_range_series): - ts = simple_date_range_series('1/1/2000', '4/1/2000') + ts = simple_date_range_series("1/1/2000", "4/1/2000") - result = ts.resample('M').apply(lambda x: x.mean()) - exp = ts.resample('M').mean() + result = ts.resample("M").apply(lambda x: x.mean()) + exp = ts.resample("M").mean() tm.assert_series_equal(result, exp) - foo_exp = ts.resample('M').mean() - foo_exp.name = 'foo' - bar_exp = ts.resample('M').std() - bar_exp.name = 'bar' + foo_exp = ts.resample("M").mean() + foo_exp.name = "foo" + bar_exp = ts.resample("M").std() + bar_exp.name = "bar" - result = ts.resample('M').apply( - [lambda x: x.mean(), lambda x: x.std(ddof=1)]) - result.columns = ['foo', 'bar'] - tm.assert_series_equal(result['foo'], foo_exp) - tm.assert_series_equal(result['bar'], bar_exp) + result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result.columns = ["foo", "bar"] + tm.assert_series_equal(result["foo"], foo_exp) + tm.assert_series_equal(result["bar"], bar_exp) # this is a MI Series, so comparing the names of the results # doesn't make sense - result = ts.resample('M').aggregate({'foo': lambda x: x.mean(), - 'bar': lambda x: x.std(ddof=1)}) - tm.assert_series_equal(result['foo'], foo_exp, check_names=False) - tm.assert_series_equal(result['bar'], bar_exp, check_names=False) + result = ts.resample("M").aggregate( + {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} + ) + tm.assert_series_equal(result["foo"], foo_exp, check_names=False) + tm.assert_series_equal(result["bar"], bar_exp, check_names=False) def test_resample_unequal_times(): @@ -1078,10 +1105,10 @@ def test_resample_unequal_times(): # end hour is less than start end = datetime(2012, 7, 31, 4) bad_ind = date_range(start, end, freq="30min") - df = DataFrame({'close': 1}, index=bad_ind) + df = DataFrame({"close": 1}, index=bad_ind) # it works! 
- df.resample('AS').sum() + df.resample("AS").sum() def test_resample_consistency(): @@ -1089,18 +1116,18 @@ def test_resample_consistency(): # GH 6418 # resample with bfill / limit / reindex consistency - i30 = pd.date_range('2002-02-02', periods=4, freq='30T') - s = Series(np.arange(4.), index=i30) + i30 = pd.date_range("2002-02-02", periods=4, freq="30T") + s = Series(np.arange(4.0), index=i30) s[2] = np.NaN # Upsample by factor 3 with reindex() and resample() methods: - i10 = pd.date_range(i30[0], i30[-1], freq='10T') + i10 = pd.date_range(i30[0], i30[-1], freq="10T") - s10 = s.reindex(index=i10, method='bfill') - s10_2 = s.reindex(index=i10, method='bfill', limit=2) - rl = s.reindex_like(s10, method='bfill', limit=2) - r10_2 = s.resample('10Min').bfill(limit=2) - r10 = s.resample('10Min').bfill() + s10 = s.reindex(index=i10, method="bfill") + s10_2 = s.reindex(index=i10, method="bfill", limit=2) + rl = s.reindex_like(s10, method="bfill", limit=2) + r10_2 = s.resample("10Min").bfill(limit=2) + r10 = s.resample("10Min").bfill() # s10_2, r10, r10_2, rl should all be equal assert_series_equal(s10_2, r10) @@ -1110,110 +1137,118 @@ def test_resample_consistency(): def test_resample_timegrouper(): # GH 7227 - dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3), - datetime(2014, 11, 5), datetime(2014, 9, 5), - datetime(2014, 10, 8), datetime(2014, 7, 15)] + dates1 = [ + datetime(2014, 10, 1), + datetime(2014, 9, 3), + datetime(2014, 11, 5), + datetime(2014, 9, 5), + datetime(2014, 10, 8), + datetime(2014, 7, 15), + ] dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:] dates3 = [pd.NaT] + dates1 + [pd.NaT] for dates in [dates1, dates2, dates3]: df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) - result = df.set_index('A').resample('M').count() - exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', - '2014-09-30', - '2014-10-31', '2014-11-30'], - freq='M', name='A') - expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx) + result = df.set_index("A").resample("M").count() + exp_idx = pd.DatetimeIndex( + ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], + freq="M", + name="A", + ) + expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) assert_frame_equal(result, expected) - result = df.groupby(pd.Grouper(freq='M', key='A')).count() + result = df.groupby(pd.Grouper(freq="M", key="A")).count() assert_frame_equal(result, expected) - df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange( - len(dates)))) - result = df.set_index('A').resample('M').count() - expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]}, - index=exp_idx, columns=['B', 'C']) + df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates)))) + result = df.set_index("A").resample("M").count() + expected = DataFrame( + {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, + index=exp_idx, + columns=["B", "C"], + ) assert_frame_equal(result, expected) - result = df.groupby(pd.Grouper(freq='M', key='A')).count() + result = df.groupby(pd.Grouper(freq="M", key="A")).count() assert_frame_equal(result, expected) def test_resample_nunique(): # GH 12352 - df = DataFrame({ - 'ID': {Timestamp('2015-06-05 00:00:00'): '0010100903', - Timestamp('2015-06-08 00:00:00'): '0010150847'}, - 'DATE': {Timestamp('2015-06-05 00:00:00'): '2015-06-05', - Timestamp('2015-06-08 00:00:00'): '2015-06-08'}}) - r = df.resample('D') - g = df.groupby(pd.Grouper(freq='D')) - expected = df.groupby(pd.Grouper(freq='D')).ID.apply(lambda x: - x.nunique()) - assert expected.name 
== 'ID' + df = DataFrame( + { + "ID": { + Timestamp("2015-06-05 00:00:00"): "0010100903", + Timestamp("2015-06-08 00:00:00"): "0010150847", + }, + "DATE": { + Timestamp("2015-06-05 00:00:00"): "2015-06-05", + Timestamp("2015-06-08 00:00:00"): "2015-06-08", + }, + } + ) + r = df.resample("D") + g = df.groupby(pd.Grouper(freq="D")) + expected = df.groupby(pd.Grouper(freq="D")).ID.apply(lambda x: x.nunique()) + assert expected.name == "ID" for t in [r, g]: result = r.ID.nunique() assert_series_equal(result, expected) - result = df.ID.resample('D').nunique() + result = df.ID.resample("D").nunique() assert_series_equal(result, expected) - result = df.ID.groupby(pd.Grouper(freq='D')).nunique() + result = df.ID.groupby(pd.Grouper(freq="D")).nunique() assert_series_equal(result, expected) def test_resample_nunique_preserves_column_level_names(): # see gh-23222 df = tm.makeTimeDataFrame(freq="1D").abs() - df.columns = pd.MultiIndex.from_arrays([df.columns.tolist()] * 2, - names=["lev0", "lev1"]) + df.columns = pd.MultiIndex.from_arrays( + [df.columns.tolist()] * 2, names=["lev0", "lev1"] + ) result = df.resample("1h").nunique() tm.assert_index_equal(df.columns, result.columns) def test_resample_nunique_with_date_gap(): # GH 13453 - index = pd.date_range('1-1-2000', '2-15-2000', freq='h') - index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h') + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index2 = pd.date_range("4-15-2000", "5-15-2000", freq="h") index3 = index.append(index2) - s = Series(range(len(index3)), index=index3, dtype='int64') - r = s.resample('M') + s = Series(range(len(index3)), index=index3, dtype="int64") + r = s.resample("M") # Since all elements are unique, these should all be the same - results = [ - r.count(), - r.nunique(), - r.agg(Series.nunique), - r.agg('nunique') - ] + results = [r.count(), r.nunique(), r.agg(Series.nunique), r.agg("nunique")] assert_series_equal(results[0], results[1]) assert_series_equal(results[0], results[2]) assert_series_equal(results[0], results[3]) -@pytest.mark.parametrize('n', [10000, 100000]) -@pytest.mark.parametrize('k', [10, 100, 1000]) +@pytest.mark.parametrize("n", [10000, 100000]) +@pytest.mark.parametrize("k", [10, 100, 1000]) def test_resample_group_info(n, k): # GH10914 # use a fixed seed to always have the same uniques prng = np.random.RandomState(1234) - dr = date_range(start='2015-08-27', periods=n // 10, freq='T') - ts = Series(prng.randint(0, n // k, n).astype('int64'), - index=prng.choice(dr, n)) + dr = date_range(start="2015-08-27", periods=n // 10, freq="T") + ts = Series(prng.randint(0, n // k, n).astype("int64"), index=prng.choice(dr, n)) - left = ts.resample('30T').nunique() - ix = date_range(start=ts.index.min(), end=ts.index.max(), - freq='30T') + left = ts.resample("30T").nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30T") vals = ts.values - bins = np.searchsorted(ix.values, ts.index, side='right') + bins = np.searchsorted(ix.values, ts.index, side="right") sorter = np.lexsort((vals, bins)) vals, bins = vals[sorter], bins[sorter] @@ -1221,8 +1256,7 @@ def test_resample_group_info(n, k): mask = np.r_[True, vals[1:] != vals[:-1]] mask |= np.r_[True, bins[1:] != bins[:-1]] - arr = np.bincount(bins[mask] - 1, - minlength=len(ix)).astype('int64', copy=False) + arr = np.bincount(bins[mask] - 1, minlength=len(ix)).astype("int64", copy=False) right = Series(arr, index=ix) assert_series_equal(left, right) @@ -1230,15 +1264,14 @@ def test_resample_group_info(n, k): def 
test_resample_size(): n = 10000 - dr = date_range('2015-09-19', periods=n, freq='T') + dr = date_range("2015-09-19", periods=n, freq="T") ts = Series(np.random.randn(n), index=np.random.choice(dr, n)) - left = ts.resample('7T').size() - ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T') + left = ts.resample("7T").size() + ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7T") - bins = np.searchsorted(ix.values, ts.index.values, side='right') - val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype('int64', - copy=False) + bins = np.searchsorted(ix.values, ts.index.values, side="right") + val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False) right = Series(val, index=ix) assert_series_equal(left, right) @@ -1252,20 +1285,24 @@ def test_resample_across_dst(): # The DatetimeIndex we will start with # (note that DST happens at 03:00+02:00 -> 02:00+01:00) # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00 - df1 = DataFrame([1477786980, 1477790580], columns=['ts']) - dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s') - .dt.tz_localize('UTC') - .dt.tz_convert('Europe/Madrid')) + df1 = DataFrame([1477786980, 1477790580], columns=["ts"]) + dti1 = DatetimeIndex( + pd.to_datetime(df1.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) # The expected DatetimeIndex after resampling. # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00 - df2 = DataFrame([1477785600, 1477789200], columns=['ts']) - dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s') - .dt.tz_localize('UTC') - .dt.tz_convert('Europe/Madrid')) + df2 = DataFrame([1477785600, 1477789200], columns=["ts"]) + dti2 = DatetimeIndex( + pd.to_datetime(df2.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) df = DataFrame([5, 5], index=dti1) - result = df.resample(rule='H').sum() + result = df.resample(rule="H").sum() expected = DataFrame([5, 5], index=dti2) assert_frame_equal(result, expected) @@ -1273,13 +1310,15 @@ def test_resample_across_dst(): def test_groupby_with_dst_time_change(): # GH 24972 - index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], - tz='UTC').tz_convert('America/Chicago') + index = pd.DatetimeIndex( + [1478064900001000000, 1480037118776792000], tz="UTC" + ).tz_convert("America/Chicago") df = pd.DataFrame([1, 2], index=index) - result = df.groupby(pd.Grouper(freq='1d')).last() - expected_index_values = pd.date_range('2016-11-02', '2016-11-24', - freq='d', tz='America/Chicago') + result = df.groupby(pd.Grouper(freq="1d")).last() + expected_index_values = pd.date_range( + "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + ) index = pd.DatetimeIndex(expected_index_values) expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) @@ -1288,132 +1327,151 @@ def test_groupby_with_dst_time_change(): def test_resample_dst_anchor(): # 5172 - dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') + dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz="US/Eastern") df = DataFrame([5], index=dti) - assert_frame_equal(df.resample(rule='D').sum(), - DataFrame([5], index=df.index.normalize())) - df.resample(rule='MS').sum() assert_frame_equal( - df.resample(rule='MS').sum(), - DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], - tz='US/Eastern'))) + df.resample(rule="D").sum(), DataFrame([5], index=df.index.normalize()) + ) + df.resample(rule="MS").sum() + assert_frame_equal( + df.resample(rule="MS").sum(), + DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 
1)], tz="US/Eastern")), + ) - dti = date_range('2013-09-30', '2013-11-02', freq='30Min', - tz='Europe/Paris') + dti = date_range("2013-09-30", "2013-11-02", freq="30Min", tz="Europe/Paris") values = range(dti.size) - df = DataFrame({"a": values, - "b": values, - "c": values}, index=dti, dtype='int64') + df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype="int64") how = {"a": "min", "b": "max", "c": "count"} assert_frame_equal( df.resample("W-MON").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 384, 720, 1056, 1394], - "b": [47, 383, 719, 1055, 1393, 1586], - "c": [48, 336, 336, 336, 338, 193]}, - index=date_range('9/30/2013', '11/4/2013', - freq='W-MON', tz='Europe/Paris')), - 'W-MON Frequency') + DataFrame( + { + "a": [0, 48, 384, 720, 1056, 1394], + "b": [47, 383, 719, 1055, 1393, 1586], + "c": [48, 336, 336, 336, 338, 193], + }, + index=date_range("9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris"), + ), + "W-MON Frequency", + ) assert_frame_equal( df.resample("2W-MON").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 720, 1394], - "b": [47, 719, 1393, 1586], - "c": [48, 672, 674, 193]}, - index=date_range('9/30/2013', '11/11/2013', - freq='2W-MON', tz='Europe/Paris')), - '2W-MON Frequency') + DataFrame( + { + "a": [0, 48, 720, 1394], + "b": [47, 719, 1393, 1586], + "c": [48, 672, 674, 193], + }, + index=date_range( + "9/30/2013", "11/11/2013", freq="2W-MON", tz="Europe/Paris" + ), + ), + "2W-MON Frequency", + ) assert_frame_equal( df.resample("MS").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 1538], - "b": [47, 1537, 1586], - "c": [48, 1490, 49]}, - index=date_range('9/1/2013', '11/1/2013', - freq='MS', tz='Europe/Paris')), - 'MS Frequency') + DataFrame( + {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris"), + ), + "MS Frequency", + ) assert_frame_equal( df.resample("2MS").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 1538], - "b": [1537, 1586], - "c": [1538, 49]}, - index=date_range('9/1/2013', '11/1/2013', - freq='2MS', tz='Europe/Paris')), - '2MS Frequency') - - df_daily = df['10/26/2013':'10/29/2013'] + DataFrame( + {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris"), + ), + "2MS Frequency", + ) + + df_daily = df["10/26/2013":"10/29/2013"] assert_frame_equal( - df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"}) - [["a", "b", "c"]], - DataFrame({"a": [1248, 1296, 1346, 1394], - "b": [1295, 1345, 1393, 1441], - "c": [48, 50, 48, 48]}, - index=date_range('10/26/2013', '10/29/2013', - freq='D', tz='Europe/Paris')), - 'D Frequency') + df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})[ + ["a", "b", "c"] + ], + DataFrame( + { + "a": [1248, 1296, 1346, 1394], + "b": [1295, 1345, 1393, 1441], + "c": [48, 50, 48, 48], + }, + index=date_range("10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris"), + ), + "D Frequency", + ) def test_downsample_across_dst(): # GH 8531 - tz = pytz.timezone('Europe/Berlin') + tz = pytz.timezone("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq='2H') - result = Series(5, index=dates).resample('H').mean() - expected = Series([5., np.nan] * 3 + [5.], - index=date_range(tz.localize(dt), periods=7, - freq='H')) + dates = date_range(tz.localize(dt), periods=4, freq="2H") + result = Series(5, index=dates).resample("H").mean() + expected = Series( + [5.0, np.nan] * 3 + [5.0], + 
index=date_range(tz.localize(dt), periods=7, freq="H"), + ) tm.assert_series_equal(result, expected) def test_downsample_across_dst_weekly(): # GH 9119, GH 21459 - df = DataFrame(index=DatetimeIndex([ - '2017-03-25', '2017-03-26', '2017-03-27', - '2017-03-28', '2017-03-29' - ], tz='Europe/Amsterdam'), - data=[11, 12, 13, 14, 15]) - result = df.resample('1W').sum() - expected = DataFrame([23, 42], index=pd.DatetimeIndex([ - '2017-03-26', '2017-04-02' - ], tz='Europe/Amsterdam')) + df = DataFrame( + index=DatetimeIndex( + ["2017-03-25", "2017-03-26", "2017-03-27", "2017-03-28", "2017-03-29"], + tz="Europe/Amsterdam", + ), + data=[11, 12, 13, 14, 15], + ) + result = df.resample("1W").sum() + expected = DataFrame( + [23, 42], + index=pd.DatetimeIndex(["2017-03-26", "2017-04-02"], tz="Europe/Amsterdam"), + ) tm.assert_frame_equal(result, expected) - idx = pd.date_range("2013-04-01", "2013-05-01", tz='Europe/London', - freq='H') + idx = pd.date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H") s = Series(index=idx) - result = s.resample('W').mean() - expected = Series(index=pd.date_range( - '2013-04-07', freq='W', periods=5, tz='Europe/London' - )) + result = s.resample("W").mean() + expected = Series( + index=pd.date_range("2013-04-07", freq="W", periods=5, tz="Europe/London") + ) tm.assert_series_equal(result, expected) def test_resample_with_nat(): # GH 13020 - index = DatetimeIndex([pd.NaT, - '1970-01-01 00:00:00', - pd.NaT, - '1970-01-01 00:00:01', - '1970-01-01 00:00:02']) + index = DatetimeIndex( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + ] + ) frame = DataFrame([2, 3, 5, 7, 11], index=index) - index_1s = DatetimeIndex(['1970-01-01 00:00:00', - '1970-01-01 00:00:01', - '1970-01-01 00:00:02']) + index_1s = DatetimeIndex( + ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] + ) frame_1s = DataFrame([3, 7, 11], index=index_1s) - assert_frame_equal(frame.resample('1s').mean(), frame_1s) + assert_frame_equal(frame.resample("1s").mean(), frame_1s) - index_2s = DatetimeIndex(['1970-01-01 00:00:00', - '1970-01-01 00:00:02']) + index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]) frame_2s = DataFrame([5, 11], index=index_2s) - assert_frame_equal(frame.resample('2s').mean(), frame_2s) + assert_frame_equal(frame.resample("2s").mean(), frame_2s) - index_3s = DatetimeIndex(['1970-01-01 00:00:00']) + index_3s = DatetimeIndex(["1970-01-01 00:00:00"]) frame_3s = DataFrame([7], index=index_3s) - assert_frame_equal(frame.resample('3s').mean(), frame_3s) + assert_frame_equal(frame.resample("3s").mean(), frame_3s) - assert_frame_equal(frame.resample('60s').mean(), frame_3s) + assert_frame_equal(frame.resample("60s").mean(), frame_3s) def test_resample_datetime_values(): @@ -1422,15 +1480,17 @@ def test_resample_datetime_values(): # introduced by the resampling dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)] - df = DataFrame({'timestamp': dates}, index=dates) + df = DataFrame({"timestamp": dates}, index=dates) - exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], - index=date_range('2016-01-15', periods=3, freq='2D'), - name='timestamp') + exp = Series( + [datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], + index=date_range("2016-01-15", periods=3, freq="2D"), + name="timestamp", + ) - res = df.resample('2D').first()['timestamp'] + res = df.resample("2D").first()["timestamp"] tm.assert_series_equal(res, exp) - res = df['timestamp'].resample('2D').first() + res = 
df["timestamp"].resample("2D").first() tm.assert_series_equal(res, exp) @@ -1440,43 +1500,45 @@ def f(data, add_arg): return np.mean(data) * add_arg multiplier = 10 - result = series.resample('D').apply(f, multiplier) - expected = series.resample('D').mean().multiply(multiplier) + result = series.resample("D").apply(f, multiplier) + expected = series.resample("D").mean().multiply(multiplier) tm.assert_series_equal(result, expected) # Testing as kwarg - result = series.resample('D').apply(f, add_arg=multiplier) - expected = series.resample('D').mean().multiply(multiplier) + result = series.resample("D").apply(f, add_arg=multiplier) + expected = series.resample("D").mean().multiply(multiplier) tm.assert_series_equal(result, expected) # Testing dataframe - df = pd.DataFrame({"A": 1, "B": 2}, - index=pd.date_range('2017', periods=10)) + df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) - expected = df.groupby("A").resample('D').mean().multiply(multiplier) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) assert_frame_equal(result, expected) -@pytest.mark.parametrize('k', [1, 2, 3]) -@pytest.mark.parametrize('n1, freq1, n2, freq2', [ - (30, 'S', 0.5, 'Min'), - (60, 'S', 1, 'Min'), - (3600, 'S', 1, 'H'), - (60, 'Min', 1, 'H'), - (21600, 'S', 0.25, 'D'), - (86400, 'S', 1, 'D'), - (43200, 'S', 0.5, 'D'), - (1440, 'Min', 1, 'D'), - (12, 'H', 0.5, 'D'), - (24, 'H', 1, 'D'), -]) +@pytest.mark.parametrize("k", [1, 2, 3]) +@pytest.mark.parametrize( + "n1, freq1, n2, freq2", + [ + (30, "S", 0.5, "Min"), + (60, "S", 1, "Min"), + (3600, "S", 1, "H"), + (60, "Min", 1, "H"), + (21600, "S", 0.25, "D"), + (86400, "S", 1, "D"), + (43200, "S", 0.5, "D"), + (1440, "Min", 1, "D"), + (12, "H", 0.5, "D"), + (24, "H", 1, "D"), + ], +) def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): # GH 24127 n1_ = n1 * k n2_ = n2 * k - s = pd.Series(0, index=pd.date_range('19910905 13:00', - '19911005 07:00', - freq=freq1)) + s = pd.Series( + 0, index=pd.date_range("19910905 13:00", "19911005 07:00", freq=freq1) + ) s = s + range(len(s)) result1 = s.resample(str(n1_) + freq1).mean() @@ -1484,17 +1546,18 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): assert_series_equal(result1, result2) -@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ - ('19910905', '19920406', 'D', '19910905', '19920407'), - ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'), - ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', - '19920406 07:00'), - ('19910906', '19920406', 'M', '19910831', '19920430'), - ('19910831', '19920430', 'M', '19910831', '19920531'), - ('1991-08', '1992-04', 'M', '19910831', '19920531'), -]) -def test_get_timestamp_range_edges(first, last, offset, - exp_first, exp_last): +@pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920407"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), + ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), + ("19910906", "19920406", "M", "19910831", "19920430"), + ("19910831", "19920430", "M", "19910831", "19920531"), + ("1991-08", "1992-04", "M", "19910831", "19920531"), + ], +) +def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): first = pd.Period(first) first = first.to_timestamp(first.freq) last = pd.Period(last) diff --git a/pandas/tests/resample/test_period_index.py 
b/pandas/tests/resample/test_period_index.py index 228de8a14c5068..2ced955652c213 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -16,7 +16,10 @@ from pandas.core.resample import _get_period_range_edges import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets @@ -28,25 +31,23 @@ def _index_factory(): @pytest.fixture def _series_name(): - return 'pi' + return "pi" class TestPeriodIndex: - - @pytest.mark.parametrize('freq', ['2D', '1H', '2H']) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 # make sure .asfreq() returns PeriodIndex (except kind='timestamp') obj = series_and_frame - if kind == 'timestamp': + if kind == "timestamp": expected = obj.to_timestamp().resample(freq).asfreq() else: - start = obj.index[0].to_timestamp(how='start') - end = (obj.index[-1] + obj.index.freq).to_timestamp(how='start') - new_index = date_range(start=start, end=end, freq=freq, - closed='left') + start = obj.index[0].to_timestamp(how="start") + end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") + new_index = date_range(start=start, end=end, freq=freq, closed="left") expected = obj.to_timestamp().reindex(new_index).to_period(freq) result = obj.resample(freq, kind=kind).asfreq() assert_almost_equal(result, expected) @@ -55,45 +56,52 @@ def test_asfreq_fill_value(self, series): # test for fill value during resampling, issue 3715 s = series - new_index = date_range(s.index[0].to_timestamp(how='start'), - (s.index[-1]).to_timestamp(how='start'), - freq='1H') + new_index = date_range( + s.index[0].to_timestamp(how="start"), + (s.index[-1]).to_timestamp(how="start"), + freq="1H", + ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - result = s.resample('1H', kind='timestamp').asfreq(fill_value=4.0) + result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) assert_series_equal(result, expected) - frame = s.to_frame('value') - new_index = date_range(frame.index[0].to_timestamp(how='start'), - (frame.index[-1]).to_timestamp(how='start'), - freq='1H') + frame = s.to_frame("value") + new_index = date_range( + frame.index[0].to_timestamp(how="start"), + (frame.index[-1]).to_timestamp(how="start"), + freq="1H", + ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0) + result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) assert_frame_equal(result, expected) - @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W']) - @pytest.mark.parametrize('kind', [None, 'period', 'timestamp']) - @pytest.mark.parametrize('kwargs', [dict(on='date'), dict(level='d')]) + @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) + @pytest.mark.parametrize("kwargs", [dict(on="date"), dict(level="d")]) def test_selection(self, index, freq, kind, kwargs): # This is a bug, these should be implemented # GH 14008 rng = np.arange(len(index), dtype=np.int64) - df = DataFrame({'date': index, 'a': rng}, - index=pd.MultiIndex.from_arrays([rng, index], - names=['v', 'd'])) - msg = ("Resampling from 
level= or on= selection with a PeriodIndex is" - r" not currently supported, use \.set_index\(\.\.\.\) to" - " explicitly set index") + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + msg = ( + "Resampling from level= or on= selection with a PeriodIndex is" + r" not currently supported, use \.set_index\(\.\.\.\) to" + " explicitly set index" + ) with pytest.raises(NotImplementedError, match=msg): df.resample(freq, kind=kind, **kwargs) - @pytest.mark.parametrize('month', MONTHS) - @pytest.mark.parametrize('meth', ['ffill', 'bfill']) - @pytest.mark.parametrize('conv', ['start', 'end']) - @pytest.mark.parametrize('targ', ['D', 'B', 'M']) - def test_annual_upsample_cases(self, targ, conv, meth, month, - simple_period_range_series): - ts = simple_period_range_series( - '1/1/1990', '12/31/1991', freq='A-%s' % month) + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("meth", ["ffill", "bfill"]) + @pytest.mark.parametrize("conv", ["start", "end"]) + @pytest.mark.parametrize("targ", ["D", "B", "M"]) + def test_annual_upsample_cases( + self, targ, conv, meth, month, simple_period_range_series + ): + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq="A-%s" % month) result = getattr(ts.resample(targ, convention=conv), meth)() expected = result.to_timestamp(targ, how=conv) @@ -101,269 +109,289 @@ def test_annual_upsample_cases(self, targ, conv, meth, month, assert_series_equal(result, expected) def test_basic_downsample(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec').mean() + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result = ts.resample("a-dec").mean() expected = ts.groupby(ts.index.year).mean() - expected.index = period_range('1/1/1990', '6/30/1995', freq='a-dec') + expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") assert_series_equal(result, expected) # this is ok - assert_series_equal(ts.resample('a-dec').mean(), result) - assert_series_equal(ts.resample('a').mean(), result) - - @pytest.mark.parametrize('rule,expected_error_msg', [ - ('a-dec', ''), - ('q-mar', ''), - ('M', ''), - ('w-thu', '') - ]) - def test_not_subperiod( - self, simple_period_range_series, rule, expected_error_msg): + assert_series_equal(ts.resample("a-dec").mean(), result) + assert_series_equal(ts.resample("a").mean(), result) + + @pytest.mark.parametrize( + "rule,expected_error_msg", + [ + ("a-dec", ""), + ("q-mar", ""), + ("M", ""), + ("w-thu", ""), + ], + ) + def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='w-wed') - msg = ("Frequency cannot be resampled to {}, as they" - " are not sub or super periods").format(expected_error_msg) + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + msg = ( + "Frequency cannot be resampled to {}, as they" + " are not sub or super periods" + ).format(expected_error_msg) with pytest.raises(IncompatibleFrequency, match=msg): ts.resample(rule).mean() - @pytest.mark.parametrize('freq', ['D', '2D']) + @pytest.mark.parametrize("freq", ["D", "2D"]) def test_basic_upsample(self, freq, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec').mean() + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result 
= ts.resample("a-dec").mean() - resampled = result.resample(freq, convention='end').ffill() - expected = result.to_timestamp(freq, how='end') - expected = expected.asfreq(freq, 'ffill').to_period(freq) + resampled = result.resample(freq, convention="end").ffill() + expected = result.to_timestamp(freq, how="end") + expected = expected.asfreq(freq, "ffill").to_period(freq) assert_series_equal(resampled, expected) def test_upsample_with_limit(self): - rng = period_range('1/1/2000', periods=5, freq='A') + rng = period_range("1/1/2000", periods=5, freq="A") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('M', convention='end').ffill(limit=2) - expected = ts.asfreq('M').reindex(result.index, method='ffill', - limit=2) + result = ts.resample("M", convention="end").ffill(limit=2) + expected = ts.asfreq("M").reindex(result.index, method="ffill", limit=2) assert_series_equal(result, expected) def test_annual_upsample(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='A-DEC') - df = DataFrame({'a': ts}) - rdf = df.resample('D').ffill() - exp = df['a'].resample('D').ffill() - assert_series_equal(rdf['a'], exp) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + df = DataFrame({"a": ts}) + rdf = df.resample("D").ffill() + exp = df["a"].resample("D").ffill() + assert_series_equal(rdf["a"], exp) - rng = period_range('2000', '2003', freq='A-DEC') + rng = period_range("2000", "2003", freq="A-DEC") ts = Series([1, 2, 3, 4], index=rng) - result = ts.resample('M').ffill() - ex_index = period_range('2000-01', '2003-12', freq='M') + result = ts.resample("M").ffill() + ex_index = period_range("2000-01", "2003-12", freq="M") - expected = ts.asfreq('M', how='start').reindex(ex_index, - method='ffill') + expected = ts.asfreq("M", how="start").reindex(ex_index, method="ffill") assert_series_equal(result, expected) - @pytest.mark.parametrize('month', MONTHS) - @pytest.mark.parametrize('target', ['D', 'B', 'M']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_quarterly_upsample(self, month, target, convention, - simple_period_range_series): - freq = 'Q-{month}'.format(month=month) - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq) + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("target", ["D", "B", "M"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_quarterly_upsample( + self, month, target, convention, simple_period_range_series + ): + freq = "Q-{month}".format(month=month) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) - @pytest.mark.parametrize('target', ['D', 'B']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_monthly_upsample(self, target, convention, - simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M') + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_monthly_upsample(self, target, convention, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - 
expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) def test_resample_basic(self): # GH3609 - s = Series(range(100), index=date_range( - '20130101', freq='s', periods=100, name='idx'), dtype='float') + s = Series( + range(100), + index=date_range("20130101", freq="s", periods=100, name="idx"), + dtype="float", + ) s[10:30] = np.nan - index = PeriodIndex([ - Period('2013-01-01 00:00', 'T'), - Period('2013-01-01 00:01', 'T')], name='idx') + index = PeriodIndex( + [Period("2013-01-01 00:00", "T"), Period("2013-01-01 00:01", "T")], + name="idx", + ) expected = Series([34.5, 79.5], index=index) - result = s.to_period().resample('T', kind='period').mean() + result = s.to_period().resample("T", kind="period").mean() assert_series_equal(result, expected) - result2 = s.resample('T', kind='period').mean() + result2 = s.resample("T", kind="period").mean() assert_series_equal(result2, expected) - @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]), - ('2M', [31 + 29, 31 + 9])]) + @pytest.mark.parametrize( + "freq,expected_vals", [("M", [31, 29, 31, 9]), ("2M", [31 + 29, 31 + 9])] + ) def test_resample_count(self, freq, expected_vals): # GH12774 - series = Series(1, index=pd.period_range(start='2000', periods=100)) + series = Series(1, index=pd.period_range(start="2000", periods=100)) result = series.resample(freq).count() - expected_index = pd.period_range(start='2000', freq=freq, - periods=len(expected_vals)) + expected_index = pd.period_range( + start="2000", freq=freq, periods=len(expected_vals) + ) expected = Series(expected_vals, index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self, resample_method): # GH12770 - series = Series(range(3), index=pd.period_range( - start='2000', periods=3, freq='M')) + series = Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ) expected = series - result = getattr(series.resample('M'), resample_method)() + result = getattr(series.resample("M"), resample_method)() assert_series_equal(result, expected) def test_resample_incompat_freq(self): - msg = ("Frequency cannot be resampled to ," - " as they are not sub or super periods") + msg = ( + "Frequency cannot be resampled to ," + " as they are not sub or super periods" + ) with pytest.raises(IncompatibleFrequency, match=msg): - Series(range(3), index=pd.period_range( - start='2000', periods=3, freq='M')).resample('W').mean() + Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ).resample("W").mean() def test_with_local_timezone_pytz(self): # see gh-5430 - local_timezone = pytz.timezone('America/Los_Angeles') + local_timezone = pytz.timezone("America/Los_Angeles") - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, - tzinfo=pytz.utc) + start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, - tzinfo=pytz.utc) + end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) - index = pd.date_range(start, end, freq='H') + index = pd.date_range(start, end, freq="H") series = Series(1, index=index) series = series.tz_convert(local_timezone) - result = series.resample('D', kind='period').mean() + result = series.resample("D", kind="period").mean() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to # Pacific - expected_index = 
(pd.period_range(start=start, end=end, freq='D') - - offsets.Day()) + expected_index = pd.period_range(start=start, end=end, freq="D") - offsets.Day() expected = Series(1, index=expected_index) assert_series_equal(result, expected) def test_resample_with_pytz(self): # GH 13238 - s = Series(2, index=pd.date_range('2017-01-01', periods=48, freq="H", - tz="US/Eastern")) + s = Series( + 2, index=pd.date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") + ) result = s.resample("D").mean() - expected = Series(2, index=pd.DatetimeIndex(['2017-01-01', - '2017-01-02'], - tz="US/Eastern")) + expected = Series( + 2, index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz="US/Eastern") + ) assert_series_equal(result, expected) # Especially assert that the timezone is LMT for pytz - assert result.index.tz == pytz.timezone('US/Eastern') + assert result.index.tz == pytz.timezone("US/Eastern") def test_with_local_timezone_dateutil(self): # see gh-5430 - local_timezone = 'dateutil/America/Los_Angeles' + local_timezone = "dateutil/America/Los_Angeles" - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, - tzinfo=dateutil.tz.tzutc()) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, - tzinfo=dateutil.tz.tzutc()) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) - index = pd.date_range(start, end, freq='H', name='idx') + index = pd.date_range(start, end, freq="H", name="idx") series = Series(1, index=index) series = series.tz_convert(local_timezone) - result = series.resample('D', kind='period').mean() + result = series.resample("D", kind="period").mean() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to # Pacific - expected_index = (pd.period_range(start=start, end=end, freq='D', - name='idx') - offsets.Day()) + expected_index = ( + pd.period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() + ) expected = Series(1, index=expected_index) assert_series_equal(result, expected) def test_resample_nonexistent_time_bin_edge(self): # GH 19375 - index = date_range('2017-03-12', '2017-03-12 1:45:00', freq='15T') + index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15T") s = Series(np.zeros(len(index)), index=index) - expected = s.tz_localize('US/Pacific') - result = expected.resample('900S').mean() + expected = s.tz_localize("US/Pacific") + result = expected.resample("900S").mean() tm.assert_series_equal(result, expected) # GH 23742 - index = date_range(start='2017-10-10', end='2017-10-20', freq='1H') - index = index.tz_localize('UTC').tz_convert('America/Sao_Paulo') + index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") df = DataFrame(data=list(range(len(index))), index=index) - result = df.groupby(pd.Grouper(freq='1D')).count() - expected = date_range(start='2017-10-09', end='2017-10-20', freq='D', - tz="America/Sao_Paulo", - nonexistent='shift_forward', closed='left') + result = df.groupby(pd.Grouper(freq="1D")).count() + expected = date_range( + start="2017-10-09", + end="2017-10-20", + freq="D", + tz="America/Sao_Paulo", + nonexistent="shift_forward", + closed="left", + ) tm.assert_index_equal(result.index, expected) def test_resample_ambiguous_time_bin_edge(self): # GH 10117 - idx = pd.date_range("2014-10-25 22:00:00", "2014-10-26 00:30:00", - freq="30T", 
tz="Europe/London") + idx = pd.date_range( + "2014-10-25 22:00:00", "2014-10-26 00:30:00", freq="30T", tz="Europe/London" + ) expected = Series(np.zeros(len(idx)), index=idx) - result = expected.resample('30T').mean() + result = expected.resample("30T").mean() tm.assert_series_equal(result, expected) def test_fill_method_and_how_upsample(self): # GH2073 - s = Series(np.arange(9, dtype='int64'), - index=date_range('2010-01-01', periods=9, freq='Q')) - last = s.resample('M').ffill() - both = s.resample('M').ffill().resample('M').last().astype('int64') + s = Series( + np.arange(9, dtype="int64"), + index=date_range("2010-01-01", periods=9, freq="Q"), + ) + last = s.resample("M").ffill() + both = s.resample("M").ffill().resample("M").last().astype("int64") assert_series_equal(last, both) - @pytest.mark.parametrize('day', DAYS) - @pytest.mark.parametrize('target', ['D', 'B']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_weekly_upsample(self, day, target, convention, - simple_period_range_series): - freq = 'W-{day}'.format(day=day) - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq) + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_weekly_upsample(self, day, target, convention, simple_period_range_series): + freq = "W-{day}".format(day=day) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) def test_resample_to_timestamps(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M') + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - result = ts.resample('A-DEC', kind='timestamp').mean() - expected = ts.to_timestamp(how='start').resample('A-DEC').mean() + result = ts.resample("A-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("A-DEC").mean() assert_series_equal(result, expected) def test_resample_to_quarterly(self, simple_period_range_series): for month in MONTHS: - ts = simple_period_range_series( - '1990', '1992', freq='A-%s' % month) - quar_ts = ts.resample('Q-%s' % month).ffill() + ts = simple_period_range_series("1990", "1992", freq="A-%s" % month) + quar_ts = ts.resample("Q-%s" % month).ffill() - stamps = ts.to_timestamp('D', how='start') - qdates = period_range(ts.index[0].asfreq('D', 'start'), - ts.index[-1].asfreq('D', 'end'), - freq='Q-%s' % month) + stamps = ts.to_timestamp("D", how="start") + qdates = period_range( + ts.index[0].asfreq("D", "start"), + ts.index[-1].asfreq("D", "end"), + freq="Q-%s" % month, + ) - expected = stamps.reindex(qdates.to_timestamp('D', 's'), - method='ffill') + expected = stamps.reindex(qdates.to_timestamp("D", "s"), method="ffill") expected.index = qdates assert_series_equal(quar_ts, expected) # conforms, but different month - ts = simple_period_range_series('1990', '1992', freq='A-JUN') + ts = simple_period_range_series("1990", "1992", freq="A-JUN") - for how in ['start', 'end']: - result = ts.resample('Q-MAR', convention=how).ffill() - expected = ts.asfreq('Q-MAR', how=how) - expected = expected.reindex(result.index, method='ffill') + for how in ["start", "end"]: + result = ts.resample("Q-MAR", convention=how).ffill() + 
expected = ts.asfreq("Q-MAR", how=how) + expected = expected.reindex(result.index, method="ffill") # .to_timestamp('D') # expected = expected.resample('Q-MAR').ffill() @@ -371,149 +399,154 @@ def test_resample_to_quarterly(self, simple_period_range_series): assert_series_equal(result, expected) def test_resample_fill_missing(self): - rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A') + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") s = Series(np.random.randn(4), index=rng) stamps = s.to_timestamp() - filled = s.resample('A').ffill() - expected = stamps.resample('A').ffill().to_period('A') + filled = s.resample("A").ffill() + expected = stamps.resample("A").ffill().to_period("A") assert_series_equal(filled, expected) def test_cant_fill_missing_dups(self): - rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A') + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") s = Series(np.random.randn(5), index=rng) msg = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=msg): - s.resample('A').ffill() + s.resample("A").ffill() - @pytest.mark.parametrize('freq', ['5min']) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize("freq", ["5min"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_resample_5minute(self, freq, kind): - rng = period_range('1/1/2000', '1/5/2000', freq='T') + rng = period_range("1/1/2000", "1/5/2000", freq="T") ts = Series(np.random.randn(len(rng)), index=rng) expected = ts.to_timestamp().resample(freq).mean() - if kind != 'timestamp': + if kind != "timestamp": expected = expected.to_period(freq) result = ts.resample(freq, kind=kind).mean() assert_series_equal(result, expected) def test_upsample_daily_business_daily(self, simple_period_range_series): - ts = simple_period_range_series('1/1/2000', '2/1/2000', freq='B') + ts = simple_period_range_series("1/1/2000", "2/1/2000", freq="B") - result = ts.resample('D').asfreq() - expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000')) + result = ts.resample("D").asfreq() + expected = ts.asfreq("D").reindex(period_range("1/3/2000", "2/1/2000")) assert_series_equal(result, expected) - ts = simple_period_range_series('1/1/2000', '2/1/2000') - result = ts.resample('H', convention='s').asfreq() - exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H') - expected = ts.asfreq('H', how='s').reindex(exp_rng) + ts = simple_period_range_series("1/1/2000", "2/1/2000") + result = ts.resample("H", convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") + expected = ts.asfreq("H", how="s").reindex(exp_rng) assert_series_equal(result, expected) def test_resample_irregular_sparse(self): - dr = date_range(start='1/1/2012', freq='5min', periods=1000) + dr = date_range(start="1/1/2012", freq="5min", periods=1000) s = Series(np.array(100), index=dr) # subset the data. 
- subset = s[:'2012-01-04 06:55'] + subset = s[:"2012-01-04 06:55"] - result = subset.resample('10min').apply(len) - expected = s.resample('10min').apply(len).loc[result.index] + result = subset.resample("10min").apply(len) + expected = s.resample("10min").apply(len).loc[result.index] assert_series_equal(result, expected) def test_resample_weekly_all_na(self): - rng = date_range('1/1/2000', periods=10, freq='W-WED') + rng = date_range("1/1/2000", periods=10, freq="W-WED") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('W-THU').asfreq() + result = ts.resample("W-THU").asfreq() assert result.isna().all() - result = ts.resample('W-THU').asfreq().ffill()[:-1] - expected = ts.asfreq('W-THU').ffill() + result = ts.resample("W-THU").asfreq().ffill()[:-1] + expected = ts.asfreq("W-THU").ffill() assert_series_equal(result, expected) def test_resample_tz_localized(self): - dr = date_range(start='2012-4-13', end='2012-5-1') + dr = date_range(start="2012-4-13", end="2012-5-1") ts = Series(range(len(dr)), index=dr) - ts_utc = ts.tz_localize('UTC') - ts_local = ts_utc.tz_convert('America/Los_Angeles') + ts_utc = ts.tz_localize("UTC") + ts_local = ts_utc.tz_convert("America/Los_Angeles") - result = ts_local.resample('W').mean() + result = ts_local.resample("W").mean() ts_local_naive = ts_local.copy() - ts_local_naive.index = [x.replace(tzinfo=None) - for x in ts_local_naive.index.to_pydatetime()] + ts_local_naive.index = [ + x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime() + ] - exp = ts_local_naive.resample( - 'W').mean().tz_localize('America/Los_Angeles') + exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles") assert_series_equal(result, exp) # it works - result = ts_local.resample('D').mean() + result = ts_local.resample("D").mean() # #2245 - idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T', - tz='Australia/Sydney') + idx = date_range( + "2001-09-20 15:59", "2001-09-20 16:00", freq="T", tz="Australia/Sydney" + ) s = Series([1, 2], index=idx) - result = s.resample('D', closed='right', label='right').mean() - ex_index = date_range('2001-09-21', periods=1, freq='D', - tz='Australia/Sydney') + result = s.resample("D", closed="right", label="right").mean() + ex_index = date_range("2001-09-21", periods=1, freq="D", tz="Australia/Sydney") expected = Series([1.5], index=ex_index) assert_series_equal(result, expected) # for good measure - result = s.resample('D', kind='period').mean() - ex_index = period_range('2001-09-20', periods=1, freq='D') + result = s.resample("D", kind="period").mean() + ex_index = period_range("2001-09-20", periods=1, freq="D") expected = Series([1.5], index=ex_index) assert_series_equal(result, expected) # GH 6397 # comparing an offset that doesn't propagate tz's - rng = date_range('1/1/2011', periods=20000, freq='H') - rng = rng.tz_localize('EST') + rng = date_range("1/1/2011", periods=20000, freq="H") + rng = rng.tz_localize("EST") ts = DataFrame(index=rng) - ts['first'] = np.random.randn(len(rng)) - ts['second'] = np.cumsum(np.random.randn(len(rng))) + ts["first"] = np.random.randn(len(rng)) + ts["second"] = np.cumsum(np.random.randn(len(rng))) expected = DataFrame( { - 'first': ts.resample('A').sum()['first'], - 'second': ts.resample('A').mean()['second']}, - columns=['first', 'second']) - result = ts.resample( - 'A').agg({'first': np.sum, - 'second': np.mean}).reindex(columns=['first', 'second']) + "first": ts.resample("A").sum()["first"], + "second": ts.resample("A").mean()["second"], + }, + 
columns=["first", "second"], + ) + result = ( + ts.resample("A") + .agg({"first": np.sum, "second": np.mean}) + .reindex(columns=["first", "second"]) + ) assert_frame_equal(result, expected) def test_closed_left_corner(self): # #1465 - s = Series(np.random.randn(21), - index=date_range(start='1/1/2012 9:30', - freq='1min', periods=21)) + s = Series( + np.random.randn(21), + index=date_range(start="1/1/2012 9:30", freq="1min", periods=21), + ) s[0] = np.nan - result = s.resample('10min', closed='left', label='right').mean() - exp = s[1:].resample('10min', closed='left', label='right').mean() + result = s.resample("10min", closed="left", label="right").mean() + exp = s[1:].resample("10min", closed="left", label="right").mean() assert_series_equal(result, exp) - result = s.resample('10min', closed='left', label='left').mean() - exp = s[1:].resample('10min', closed='left', label='left').mean() + result = s.resample("10min", closed="left", label="left").mean() + exp = s[1:].resample("10min", closed="left", label="left").mean() - ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3) + ex_index = date_range(start="1/1/2012 9:30", freq="10min", periods=3) tm.assert_index_equal(result.index, ex_index) assert_series_equal(result, exp) def test_quarterly_resampling(self): - rng = period_range('2000Q1', periods=10, freq='Q-DEC') + rng = period_range("2000Q1", periods=10, freq="Q-DEC") ts = Series(np.arange(10), index=rng) - result = ts.resample('A').mean() - exp = ts.to_timestamp().resample('A').mean().to_period() + result = ts.resample("A").mean() + exp = ts.to_timestamp().resample("A").mean().to_period() assert_series_equal(result, exp) def test_resample_weekly_bug_1726(self): @@ -521,48 +554,51 @@ def test_resample_weekly_bug_1726(self): ind = date_range(start="8/6/2012", end="8/26/2012", freq="D") n = len(ind) data = [[x] * 5 for x in range(n)] - df = DataFrame(data, columns=['open', 'high', 'low', 'close', 'vol'], - index=ind) + df = DataFrame(data, columns=["open", "high", "low", "close", "vol"], index=ind) # it works! 
- df.resample('W-MON', closed='left', label='left').first() + df.resample("W-MON", closed="left", label="left").first() def test_resample_with_dst_time_change(self): # GH 15549 index = ( pd.DatetimeIndex([1457537600000000000, 1458059600000000000]) - .tz_localize("UTC").tz_convert('America/Chicago') + .tz_localize("UTC") + .tz_convert("America/Chicago") ) df = pd.DataFrame([1, 2], index=index) - result = df.resample('12h', closed='right', - label='right').last().ffill() - - expected_index_values = ['2016-03-09 12:00:00-06:00', - '2016-03-10 00:00:00-06:00', - '2016-03-10 12:00:00-06:00', - '2016-03-11 00:00:00-06:00', - '2016-03-11 12:00:00-06:00', - '2016-03-12 00:00:00-06:00', - '2016-03-12 12:00:00-06:00', - '2016-03-13 00:00:00-06:00', - '2016-03-13 13:00:00-05:00', - '2016-03-14 01:00:00-05:00', - '2016-03-14 13:00:00-05:00', - '2016-03-15 01:00:00-05:00', - '2016-03-15 13:00:00-05:00'] + result = df.resample("12h", closed="right", label="right").last().ffill() + + expected_index_values = [ + "2016-03-09 12:00:00-06:00", + "2016-03-10 00:00:00-06:00", + "2016-03-10 12:00:00-06:00", + "2016-03-11 00:00:00-06:00", + "2016-03-11 12:00:00-06:00", + "2016-03-12 00:00:00-06:00", + "2016-03-12 12:00:00-06:00", + "2016-03-13 00:00:00-06:00", + "2016-03-13 13:00:00-05:00", + "2016-03-14 01:00:00-05:00", + "2016-03-14 13:00:00-05:00", + "2016-03-15 01:00:00-05:00", + "2016-03-15 13:00:00-05:00", + ] index = pd.to_datetime(expected_index_values, utc=True).tz_convert( - 'America/Chicago') - expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 2.0], index=index) + "America/Chicago" + ) + expected = pd.DataFrame( + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + index=index, + ) assert_frame_equal(result, expected) def test_resample_bms_2752(self): # GH2753 - foo = Series(index=pd.bdate_range('20000101', '20000201')) + foo = Series(index=pd.bdate_range("20000101", "20000201")) res1 = foo.resample("BMS").mean() res2 = foo.resample("BMS").mean().resample("B").mean() - assert res1.index[0] == Timestamp('20000103') + assert res1.index[0] == Timestamp("20000103") assert res1.index[0] == res2.index[0] # def test_monthly_convention_span(self): @@ -579,28 +615,30 @@ def test_resample_bms_2752(self): # assert_series_equal(result, expected) def test_default_right_closed_label(self): - end_freq = ['D', 'Q', 'M', 'D'] - end_types = ['M', 'A', 'Q', 'W'] + end_freq = ["D", "Q", "M", "D"] + end_types = ["M", "A", "Q", "W"] for from_freq, to_freq in zip(end_freq, end_types): - idx = date_range(start='8/15/2012', periods=100, freq=from_freq) + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() - assert_frame_equal(resampled, df.resample(to_freq, closed='right', - label='right').mean()) + assert_frame_equal( + resampled, df.resample(to_freq, closed="right", label="right").mean() + ) def test_default_left_closed_label(self): - others = ['MS', 'AS', 'QS', 'D', 'H'] - others_freq = ['D', 'Q', 'M', 'H', 'T'] + others = ["MS", "AS", "QS", "D", "H"] + others_freq = ["D", "Q", "M", "H", "T"] for from_freq, to_freq in zip(others_freq, others): - idx = date_range(start='8/15/2012', periods=100, freq=from_freq) + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() - assert_frame_equal(resampled, df.resample(to_freq, closed='left', - label='left').mean()) + 
assert_frame_equal( + resampled, df.resample(to_freq, closed="left", label="left").mean() + ) def test_all_values_single_bin(self): # 2070 @@ -614,151 +652,218 @@ def test_evenly_divisible_with_no_extra_bins(self): # 4076 # when the frequency is evenly divisible, sometimes extra bins - df = DataFrame(np.random.randn(9, 3), - index=date_range('2000-1-1', periods=9)) - result = df.resample('5D').mean() - expected = pd.concat( - [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T - expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')] + df = DataFrame(np.random.randn(9, 3), index=date_range("2000-1-1", periods=9)) + result = df.resample("5D").mean() + expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T + expected.index = [Timestamp("2000-1-1"), Timestamp("2000-1-6")] assert_frame_equal(result, expected) - index = date_range(start='2001-5-4', periods=28) + index = date_range(start="2001-5-4", periods=28) df = DataFrame( - [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90, - 'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 + - [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10, - 'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28, - index=index.append(index)).sort_index() - - index = date_range('2001-5-4', periods=4, freq='7D') + [ + { + "REST_KEY": 1, + "DLY_TRN_QT": 80, + "DLY_SLS_AMT": 90, + "COOP_DLY_TRN_QT": 30, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28 + + [ + { + "REST_KEY": 2, + "DLY_TRN_QT": 70, + "DLY_SLS_AMT": 10, + "COOP_DLY_TRN_QT": 50, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28, + index=index.append(index), + ).sort_index() + + index = date_range("2001-5-4", periods=4, freq="7D") expected = DataFrame( - [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14, - 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4, - index=index) - result = df.resample('7D').count() + [ + { + "REST_KEY": 14, + "DLY_TRN_QT": 14, + "DLY_SLS_AMT": 14, + "COOP_DLY_TRN_QT": 14, + "COOP_DLY_SLS_AMT": 14, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").count() assert_frame_equal(result, expected) expected = DataFrame( - [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700, - 'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4, - index=index) - result = df.resample('7D').sum() + [ + { + "REST_KEY": 21, + "DLY_TRN_QT": 1050, + "DLY_SLS_AMT": 700, + "COOP_DLY_TRN_QT": 560, + "COOP_DLY_SLS_AMT": 280, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").sum() assert_frame_equal(result, expected) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) - @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) + @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): # make sure passing loffset returns DatetimeIndex in all cases # basic method taken from Base.test_resample_loffset_arg_type() df = frame - expected_means = [df.values[i:i + 2].mean() - for i in range(0, len(df.values), 2)] - expected_index = period_range( - df.index[0], periods=len(df.index) / 2, freq='2D') + expected_means = [ + df.values[i : i + 2].mean() for i in range(0, len(df.values), 2) + ] + expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") # loffset coerces PeriodIndex to DateTimeIndex expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) - expected = DataFrame({'value': expected_means}, index=expected_index) + expected = 
DataFrame({"value": expected_means}, index=expected_index) - result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg) + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result_how = df.resample('2D', how=agg_arg, loffset='2H', - kind=kind) + result_how = df.resample("2D", how=agg_arg, loffset="2H", kind=kind) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')]) + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) assert_frame_equal(result_agg, expected) assert_frame_equal(result_how, expected) - @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)]) - @pytest.mark.parametrize('kind', [None, 'period']) + @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 - pi = period_range(start='2000', freq='D', periods=10) + pi = period_range(start="2000", freq="D", periods=10) s = Series(range(len(pi)), index=pi) expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) # timestamp-based resampling doesn't include all sub-periods # of the last original period, so extend accordingly: - new_index = period_range(start='2000', freq=freq, - periods=period_mult * len(pi)) + new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi)) expected = expected.reindex(new_index) result = s.resample(freq, kind=kind).ohlc() assert_frame_equal(result, expected) - @pytest.mark.parametrize('periods, values', - [([pd.NaT, '1970-01-01 00:00:00', pd.NaT, - '1970-01-01 00:00:02', '1970-01-01 00:00:03'], - [2, 3, 5, 7, 11]), - ([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT, - pd.NaT, pd.NaT, '1970-01-01 00:00:02', - '1970-01-01 00:00:03', pd.NaT, pd.NaT], - [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])]) - @pytest.mark.parametrize('freq, expected_values', - [('1s', [3, np.NaN, 7, 11]), - ('2s', [3, int((7 + 11) / 2)]), - ('3s', [int((3 + 7) / 2), 11])]) + @pytest.mark.parametrize( + "periods, values", + [ + ( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + ], + [2, 3, 5, 7, 11], + ), + ( + [ + pd.NaT, + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + pd.NaT, + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + pd.NaT, + pd.NaT, + ], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13], + ), + ], + ) + @pytest.mark.parametrize( + "freq, expected_values", + [ + ("1s", [3, np.NaN, 7, 11]), + ("2s", [3, int((7 + 11) / 2)]), + ("3s", [int((3 + 7) / 2), 11]), + ], + ) def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 - index = PeriodIndex(periods, freq='S') + index = PeriodIndex(periods, freq="S") frame = DataFrame(values, index=index) - expected_index = period_range('1970-01-01 00:00:00', - periods=len(expected_values), freq=freq) + expected_index = period_range( + "1970-01-01 00:00:00", periods=len(expected_values), freq=freq + ) expected = DataFrame(expected_values, index=expected_index) result = frame.resample(freq).mean() assert_frame_equal(result, expected) def test_resample_with_only_nat(self): # GH 13224 - pi = PeriodIndex([pd.NaT] * 3, freq='S') + pi = PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index) - result = frame.resample('1s').mean() + result = frame.resample("1s").mean() 
assert_frame_equal(result, expected) - @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [ - ('19910905', '19910909 03:00', 'H', '24H', 10), - ('19910905', '19910909 12:00', 'H', '24H', 10), - ('19910905', '19910909 23:00', 'H', '24H', 10), - ('19910905 10:00', '19910909', 'H', '24H', 10), - ('19910905 10:00', '19910909 10:00', 'H', '24H', 10), - ('19910905', '19910909 10:00', 'H', '24H', 10), - ('19910905 12:00', '19910909', 'H', '24H', 10), - ('19910905 12:00', '19910909 03:00', 'H', '24H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '24H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '24H', 34), - ('19910905 12:00', '19910909 12:00', 'H', '17H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '17H', 3), - ('19910905 12:00', '19910909 1:00', 'H', 'M', 3), - ('19910905', '19910913 06:00', '2H', '24H', 10), - ('19910905', '19910905 01:39', 'Min', '5Min', 3), - ('19910905', '19910905 03:18', '2Min', '5Min', 3), - ]) - def test_resample_with_non_zero_base(self, start, end, start_freq, - end_freq, base): + @pytest.mark.parametrize( + "start,end,start_freq,end_freq,base", + [ + ("19910905", "19910909 03:00", "H", "24H", 10), + ("19910905", "19910909 12:00", "H", "24H", 10), + ("19910905", "19910909 23:00", "H", "24H", 10), + ("19910905 10:00", "19910909", "H", "24H", 10), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10), + ("19910905", "19910909 10:00", "H", "24H", 10), + ("19910905 12:00", "19910909", "H", "24H", 10), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3), + ("19910905 12:00", "19910909 1:00", "H", "M", 3), + ("19910905", "19910913 06:00", "2H", "24H", 10), + ("19910905", "19910905 01:39", "Min", "5Min", 3), + ("19910905", "19910905 03:18", "2Min", "5Min", 3), + ], + ) + def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, base): # GH 23882 s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) result = s.resample(end_freq, base=base).mean() result = result.to_timestamp(end_freq) # to_timestamp casts 24H -> D - result = result.asfreq(end_freq) if end_freq == '24H' else result + result = result.asfreq(end_freq) if end_freq == "24H" else result expected = s.to_timestamp().resample(end_freq, base=base).mean() assert_series_equal(result, expected) - @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ - ('19910905', '19920406', 'D', '19910905', '19920406'), - ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'), - ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', - '19920406 06:00'), - ('19910906', '19920406', 'M', '1991-09', '1992-04'), - ('19910831', '19920430', 'M', '1991-08', '1992-04'), - ('1991-08', '1992-04', 'M', '1991-08', '1992-04'), - ]) - def test_get_period_range_edges(self, first, last, offset, - exp_first, exp_last): + @pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ( + "19910905 06:00", + "19920406 06:00", + "H", + "19910905 06:00", + "19920406 06:00", + ), + ("19910906", "19920406", "M", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ], + ) + def test_get_period_range_edges(self, 
first, last, offset, exp_first, exp_last): first = pd.Period(first) last = pd.Period(last) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 6943d30276a21b..94bc884d668355 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -10,12 +10,10 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='Min') +dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") test_series = Series(np.random.rand(len(dti)), dti) -_test_frame = DataFrame( - {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))}) +_test_frame = DataFrame({"A": test_series, "B": test_series, "C": np.arange(len(dti))}) @pytest.fixture @@ -25,19 +23,21 @@ def test_frame(): def test_str(): - r = test_series.resample('H') - assert ('DatetimeIndexResampler [freq=, axis=0, closed=left, ' - 'label=left, convention=start, base=0]' in str(r)) + r = test_series.resample("H") + assert ( + "DatetimeIndexResampler [freq=, axis=0, closed=left, " + "label=left, convention=start, base=0]" in str(r) + ) def test_api(): - r = test_series.resample('H') + r = test_series.resample("H") result = r.mean() assert isinstance(result, Series) assert len(result) == 217 - r = test_series.to_frame().resample('H') + r = test_series.to_frame().resample("H") result = r.mean() assert isinstance(result, DataFrame) assert len(result) == 217 @@ -48,21 +48,22 @@ def test_groupby_resample_api(): # GH 12448 # .groupby(...).resample(...) hitting warnings # when appropriate - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") # replication step - i = pd.date_range('2016-01-03', periods=8).tolist() + \ - pd.date_range('2016-01-17', periods=8).tolist() - index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], - names=['group', 'date']) - expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]}, - index=index) - result = df.groupby('group').apply( - lambda x: x.resample('1D').ffill())[['val']] + i = ( + pd.date_range("2016-01-03", periods=8).tolist() + + pd.date_range("2016-01-17", periods=8).tolist() + ) + index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) + expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] assert_frame_equal(result, expected) @@ -71,13 +72,17 @@ def test_groupby_resample_on_api(): # GH 15021 # .groupby(...).resample(on=...) results in an unexpected # keyword warning. 
- df = DataFrame({'key': ['A', 'B'] * 5, - 'dates': pd.date_range('2016-01-01', periods=10), - 'values': np.random.randn(10)}) + df = DataFrame( + { + "key": ["A", "B"] * 5, + "dates": pd.date_range("2016-01-01", periods=10), + "values": np.random.randn(10), + } + ) - expected = df.set_index('dates').groupby('key').resample('D').mean() + expected = df.set_index("dates").groupby("key").resample("D").mean() - result = df.groupby('key').resample('D', on='dates').mean() + result = df.groupby("key").resample("D", on="dates").mean() assert_frame_equal(result, expected) @@ -85,13 +90,13 @@ def test_pipe(test_frame): # GH17905 # series - r = test_series.resample('H') + r = test_series.resample("H") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_series_equal(result, expected) # dataframe - r = test_frame.resample('H') + r = test_frame.resample("H") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_frame_equal(result, expected) @@ -99,25 +104,23 @@ def test_pipe(test_frame): def test_getitem(test_frame): - r = test_frame.resample('H') + r = test_frame.resample("H") tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) - r = test_frame.resample('H')['B'] + r = test_frame.resample("H")["B"] assert r._selected_obj.name == test_frame.columns[1] # technically this is allowed - r = test_frame.resample('H')['A', 'B'] - tm.assert_index_equal(r._selected_obj.columns, - test_frame.columns[[0, 1]]) + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) - r = test_frame.resample('H')['A', 'B'] - tm.assert_index_equal(r._selected_obj.columns, - test_frame.columns[[0, 1]]) + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) -@pytest.mark.parametrize('key', [['D'], ['A', 'D']]) +@pytest.mark.parametrize("key", [["D"], ["A", "D"]]) def test_select_bad_cols(key, test_frame): - g = test_frame.resample('H') + g = test_frame.resample("H") # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! 
msg = r"^\"Columns not found: 'D'\"$" @@ -127,18 +130,18 @@ def test_select_bad_cols(key, test_frame): def test_attribute_access(test_frame): - r = test_frame.resample('H') - tm.assert_series_equal(r.A.sum(), r['A'].sum()) + r = test_frame.resample("H") + tm.assert_series_equal(r.A.sum(), r["A"].sum()) def test_api_compat_before_use(): # make sure that we are setting the binner # on these attributes - for attr in ['groups', 'ngroups', 'indices']: - rng = pd.date_range('1/1/2012', periods=100, freq='S') + for attr in ["groups", "ngroups", "indices"]: + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = Series(np.arange(len(rng)), index=rng) - rs = ts.resample('30s') + rs = ts.resample("30s") # before use getattr(rs, attr) @@ -151,13 +154,13 @@ def test_api_compat_before_use(): def tests_skip_nuisance(test_frame): df = test_frame - df['D'] = 'foo' - r = df.resample('H') - result = r[['A', 'B']].sum() + df["D"] = "foo" + r = df.resample("H") + result = r[["A", "B"]].sum() expected = pd.concat([r.A.sum(), r.B.sum()], axis=1) assert_frame_equal(result, expected) - expected = r[['A', 'B', 'C']].sum() + expected = r[["A", "B", "C"]].sum() result = r.sum() assert_frame_equal(result, expected) @@ -165,13 +168,13 @@ def tests_skip_nuisance(test_frame): def test_downsample_but_actually_upsampling(): # this is reindex / asfreq - rng = pd.date_range('1/1/2012', periods=100, freq='S') - ts = Series(np.arange(len(rng), dtype='int64'), index=rng) - result = ts.resample('20s').asfreq() - expected = Series([0, 20, 40, 60, 80], - index=pd.date_range('2012-01-01 00:00:00', - freq='20s', - periods=5)) + rng = pd.date_range("1/1/2012", periods=100, freq="S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + result = ts.resample("20s").asfreq() + expected = Series( + [0, 20, 40, 60, 80], + index=pd.date_range("2012-01-01 00:00:00", freq="20s", periods=5), + ) assert_series_equal(result, expected) @@ -181,43 +184,43 @@ def test_combined_up_downsampling_of_irregular(): # ts2.resample('2s').mean().ffill() # preserve these semantics - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = Series(np.arange(len(rng)), index=rng) ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = ts2.resample('2s', how='mean', fill_method='ffill') - expected = ts2.resample('2s').mean().ffill() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = ts2.resample("2s", how="mean", fill_method="ffill") + expected = ts2.resample("2s").mean().ffill() assert_series_equal(result, expected) def test_transform(): - r = test_series.resample('20min') - expected = test_series.groupby( - pd.Grouper(freq='20min')).transform('mean') - result = r.transform('mean') + r = test_series.resample("20min") + expected = test_series.groupby(pd.Grouper(freq="20min")).transform("mean") + result = r.transform("mean") assert_series_equal(result, expected) def test_fillna(): # need to upsample here - rng = pd.date_range('1/1/2012', periods=10, freq='2S') - ts = Series(np.arange(len(rng), dtype='int64'), index=rng) - r = ts.resample('s') + rng = pd.date_range("1/1/2012", periods=10, freq="2S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + r = ts.resample("s") expected = r.ffill() - result = r.fillna(method='ffill') + result = r.fillna(method="ffill") assert_series_equal(result, expected) expected = r.bfill() - result = r.fillna(method='bfill') + 
result = r.fillna(method="bfill") assert_series_equal(result, expected) - msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got 0") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. Got 0" + ) with pytest.raises(ValueError, match=msg): r.fillna(0) @@ -225,8 +228,8 @@ def test_fillna(): def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation - r = test_series.resample('20min') - g = test_series.groupby(pd.Grouper(freq='20min')) + r = test_series.resample("20min") + g = test_series.groupby(pd.Grouper(freq="20min")) for t in [g, r]: result = t.apply(lambda x: x) @@ -237,18 +240,20 @@ def test_agg_consistency(): # make sure that we are consistent across # similar aggregations with and w/o selection list - df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) + df = DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) - r = df.resample('3T') + r = df.resample("3T") - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = r[["A", "B", "C"]].agg({"r1": "mean", "r2": "sum"}) + result = r.agg({"r1": "mean", "r2": "sum"}) assert_frame_equal(result, expected, check_like=True) + # TODO: once GH 14008 is fixed, move these tests into # `Base` test class @@ -257,184 +262,167 @@ def test_agg(): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) - r = df.resample('2D') + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] - a_mean = r['A'].mean() - a_std = r['A'].std() - a_sum = r['A'].sum() - b_mean = r['B'].mean() - b_std = r['B'].std() - b_sum = r['B'].sum() + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([['A', 'B'], - ['mean', 'std']]) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) for t in cases: - result = t.aggregate({'A': np.mean, - 'B': np.std}) + result = t.aggregate({"A": np.mean, "B": np.std}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std], axis=1) - expected.columns = 
pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'std')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) for t in cases: - result = t.aggregate({'A': ['mean', 'std']}) + result = t.aggregate({"A": ["mean", "std"]}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = ['mean', 'sum'] + expected.columns = ["mean", "sum"] for t in cases: - result = t['A'].aggregate(['mean', 'sum']) + result = t["A"].aggregate(["mean", "sum"]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum'), - ('B', 'mean2'), - ('B', 'sum2')]) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] + ) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, - 'B': {'mean2': 'mean', 'sum2': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) for t in cases: - result = t.aggregate({'A': ['mean', 'std'], - 'B': ['mean', 'std']}) + result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'), - ('r1', 'A', 'sum'), - ('r2', 'B', 'mean'), - ('r2', 'B', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("r1", "A", "mean"), + ("r1", "A", "sum"), + ("r2", "B", "mean"), + ("r2", "B", "sum"), + ] + ) def test_agg_misc(): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) - r = df.resample('2D') + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", 
on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] # passed lambda for t in cases: - result = t.agg({'A': np.sum, - 'B': lambda x: np.std(x, ddof=1)}) - rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) - expected = pd.concat([r['A'].sum(), rcustom], axis=1) + result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([r["A"].sum(), rcustom], axis=1) assert_frame_equal(result, expected, check_like=True) # agg with renamers - expected = pd.concat([t['A'].sum(), - t['B'].sum(), - t['A'].mean(), - t['B'].mean()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'), - ('result1', 'B'), - ('result2', 'A'), - ('result2', 'B')]) + expected = pd.concat( + [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] + ) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), - ('result2', np.mean)])) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t[["A", "B"]].agg( + OrderedDict([("result1", np.sum), ("result2", np.mean)]) + ) assert_frame_equal(result, expected, check_like=True) # agg with different hows - expected = pd.concat([t['A'].sum(), - t['A'].std(), - t['B'].mean(), - t['B'].std()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) + expected = pd.concat( + [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) for t in cases: - result = t.agg(OrderedDict([('A', ['sum', 'std']), - ('B', ['mean', 'std'])])) + result = t.agg(OrderedDict([("A", ["sum", "std"]), ("B", ["mean", "std"])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not for t in cases: - result = t[['A', 'B']].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) # series like aggs for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t['A'].agg({'A': ['sum', 'std']}) - expected = pd.concat([t['A'].sum(), - t['A'].std()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std')]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t["A"].agg({"A": ["sum", "std"]}) + expected = pd.concat([t["A"].sum(), t["A"].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std")]) assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([t['A'].agg(['sum', 'std']), - t['A'].agg(['mean', 'std'])], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t['A'].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + expected = pd.concat( + [t["A"].agg(["sum", "std"]), t["A"].agg(["mean", "std"])], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) # errors @@ -442,133 +430,138 @@ def test_agg_misc(): msg = "\"Column 'B' does not exist!\"" for t in cases: with pytest.raises(KeyError, match=msg): - t[['A']].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) def test_agg_nested_dicts(): np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) - r = df.resample('2D') + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] msg = r"cannot perform renaming for r(1|2) with a nested dictionary" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): - t.aggregate({'r1': {'A': ['mean', 'sum']}, - 'r2': {'B': ['mean', 'sum']}}) + t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) for t in cases: - expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), - t['B'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + expected = pd.concat( + [t["A"].mean(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t[["A", "B"]].agg( + {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} + ) assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) assert_frame_equal(result, expected, check_like=True) def test_try_aggregate_non_existing_column(): # GH 16766 data = [ - {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0}, - {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0}, - {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5} + {"dt": datetime(2017, 6, 1, 0), "x": 1.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 1), "x": 2.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 2), "x": 3.0, "y": 1.5}, ] - df = DataFrame(data).set_index('dt') + df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column msg = "\"Column 'z' does not exist!\"" with pytest.raises(KeyError, match=msg): - df.resample('30T').agg({'x': ['mean'], - 'y': ['median'], - 'z': ['sum']}) 
+ df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) def test_selection_api_validation(): # GH 13500 - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") rng = np.arange(len(index), dtype=np.int64) - df = DataFrame({'date': index, 'a': rng}, - index=pd.MultiIndex.from_arrays([rng, index], - names=['v', 'd'])) - df_exp = DataFrame({'a': rng}, index=index) + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + df_exp = DataFrame({"a": rng}, index=index) # non DatetimeIndex - msg = ("Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Int64Index'") + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," + " but got an instance of 'Int64Index'" + ) with pytest.raises(TypeError, match=msg): - df.resample('2D', level='v') + df.resample("2D", level="v") msg = "The Grouper cannot specify both a key and a level!" with pytest.raises(ValueError, match=msg): - df.resample('2D', on='date', level='d') + df.resample("2D", on="date", level="d") msg = "unhashable type: 'list'" with pytest.raises(TypeError, match=msg): - df.resample('2D', on=['a', 'date']) + df.resample("2D", on=["a", "date"]) msg = r"\"Level \['a', 'date'\] not found\"" with pytest.raises(KeyError, match=msg): - df.resample('2D', level=['a', 'date']) + df.resample("2D", level=["a", "date"]) # upsampling not allowed - msg = ("Upsampling from level= or on= selection is not supported, use" - r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like") + msg = ( + "Upsampling from level= or on= selection is not supported, use" + r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like" + ) with pytest.raises(ValueError, match=msg): - df.resample('2D', level='d').asfreq() + df.resample("2D", level="d").asfreq() with pytest.raises(ValueError, match=msg): - df.resample('2D', on='date').asfreq() + df.resample("2D", on="date").asfreq() - exp = df_exp.resample('2D').sum() - exp.index.name = 'date' - assert_frame_equal(exp, df.resample('2D', on='date').sum()) + exp = df_exp.resample("2D").sum() + exp.index.name = "date" + assert_frame_equal(exp, df.resample("2D", on="date").sum()) - exp.index.name = 'd' - assert_frame_equal(exp, df.resample('2D', level='d').sum()) + exp.index.name = "d" + assert_frame_equal(exp, df.resample("2D", level="d").sum()) -@pytest.mark.parametrize('col_name', ['t2', 't2x', 't2q', 'T_2M', - 't2p', 't2m', 't2m1', 'T2M']) +@pytest.mark.parametrize( + "col_name", ["t2", "t2x", "t2q", "T_2M", "t2p", "t2m", "t2m1", "T2M"] +) def test_agg_with_datetime_index_list_agg_func(col_name): # GH 22660 # The parametrized column names would get converted to dates by our # date parser. Some would result in OutOfBoundsError (ValueError) while # others would result in OverflowError when passed into Timestamp. # We catch these errors and move on to the correct branch. 
- df = pd.DataFrame(list(range(200)), - index=pd.date_range(start='2017-01-01', freq='15min', - periods=200, tz='Europe/Berlin'), - columns=[col_name]) - result = df.resample('1d').aggregate(['mean']) - expected = pd.DataFrame([47.5, 143.5, 195.5], - index=pd.date_range(start='2017-01-01', freq='D', - periods=3, tz='Europe/Berlin'), - columns=pd.MultiIndex(levels=[[col_name], - ['mean']], - codes=[[0], [0]])) + df = pd.DataFrame( + list(range(200)), + index=pd.date_range( + start="2017-01-01", freq="15min", periods=200, tz="Europe/Berlin" + ), + columns=[col_name], + ) + result = df.resample("1d").aggregate(["mean"]) + expected = pd.DataFrame( + [47.5, 143.5, 195.5], + index=pd.date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 959b6febcf1c95..9053a7ebfea2b8 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -8,71 +8,84 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -test_frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}, - index=date_range('1/1/2000', - freq='s', - periods=40)) +test_frame = DataFrame( + {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, + index=date_range("1/1/2000", freq="s", periods=40), +) def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter - code = dedent("""\ + + code = dedent( + """\ import pandas.util.testing as tm s = tm.makeTimeSeries() rs = s.resample("D") - """) + """ + ) ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('rs.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("rs.", 1)) def test_deferred_with_groupby(): # GH 12486 # support deferred resample ops with groupby - data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3], - ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7], - ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5], - ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1], - ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]] - - df = DataFrame(data, columns=['date', 'id', 'score']) + data = [ + ["2010-01-01", "A", 2], + ["2010-01-02", "A", 3], + ["2010-01-05", "A", 8], + ["2010-01-10", "A", 7], + ["2010-01-13", "A", 3], + ["2010-01-01", "B", 5], + ["2010-01-03", "B", 2], + ["2010-01-04", "B", 1], + ["2010-01-11", "B", 7], + ["2010-01-14", "B", 3], + ] + + df = DataFrame(data, columns=["date", "id", "score"]) df.date = pd.to_datetime(df.date) def f(x): - return x.set_index('date').resample('D').asfreq() - expected = df.groupby('id').apply(f) - result = df.set_index('date').groupby('id').resample('D').asfreq() + return x.set_index("date").resample("D").asfreq() + + expected = df.groupby("id").apply(f) + result = df.set_index("date").groupby("id").resample("D").asfreq() assert_frame_equal(result, expected) - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") def f(x): - return x.resample('1D').ffill() - expected = 
df.groupby('group').apply(f) - result = df.groupby('group').resample('1D').ffill() + return x.resample("1D").ffill() + + expected = df.groupby("group").apply(f) + result = df.groupby("group").resample("1D").ffill() assert_frame_equal(result, expected) def test_getitem(): - g = test_frame.groupby('A') + g = test_frame.groupby("A") - expected = g.B.apply(lambda x: x.resample('2s').mean()) + expected = g.B.apply(lambda x: x.resample("2s").mean()) - result = g.resample('2s').B.mean() + result = g.resample("2s").B.mean() assert_series_equal(result, expected) - result = g.B.resample('2s').mean() + result = g.B.resample("2s").mean() assert_series_equal(result, expected) - result = g.resample('2s').mean().B + result = g.resample("2s").mean().B assert_series_equal(result, expected) @@ -80,29 +93,31 @@ def test_getitem_multiple(): # GH 13174 # multiple calls after selection causing an issue with aliasing - data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}] - df = DataFrame(data, index=pd.date_range('2016-01-01', periods=2)) - r = df.groupby('id').resample('1D') - result = r['buyer'].count() - expected = Series([1, 1], - index=pd.MultiIndex.from_tuples( - [(1, Timestamp('2016-01-01')), - (2, Timestamp('2016-01-02'))], - names=['id', None]), - name='buyer') + data = [{"id": 1, "buyer": "A"}, {"id": 2, "buyer": "B"}] + df = DataFrame(data, index=pd.date_range("2016-01-01", periods=2)) + r = df.groupby("id").resample("1D") + result = r["buyer"].count() + expected = Series( + [1, 1], + index=pd.MultiIndex.from_tuples( + [(1, Timestamp("2016-01-01")), (2, Timestamp("2016-01-02"))], + names=["id", None], + ), + name="buyer", + ) assert_series_equal(result, expected) - result = r['buyer'].count() + result = r["buyer"].count() assert_series_equal(result, expected) def test_groupby_resample_on_api_with_getitem(): # GH 17813 - df = pd.DataFrame({'id': list('aabbb'), - 'date': pd.date_range('1-1-2016', periods=5), - 'data': 1}) - exp = df.set_index('date').groupby('id').resample('2D')['data'].sum() - result = df.groupby('id').resample('2D', on='date')['data'].sum() + df = pd.DataFrame( + {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} + ) + exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() + result = df.groupby("id").resample("2D", on="date")["data"].sum() assert_series_equal(result, exp) @@ -110,78 +125,84 @@ def test_nearest(): # GH 17496 # Resample nearest - index = pd.date_range('1/1/2000', periods=3, freq='T') - result = Series(range(3), index=index).resample('20s').nearest() + index = pd.date_range("1/1/2000", periods=3, freq="T") + result = Series(range(3), index=index).resample("20s").nearest() expected = Series( [0, 0, 1, 1, 1, 2, 2], index=pd.DatetimeIndex( - ['2000-01-01 00:00:00', '2000-01-01 00:00:20', - '2000-01-01 00:00:40', '2000-01-01 00:01:00', - '2000-01-01 00:01:20', '2000-01-01 00:01:40', - '2000-01-01 00:02:00'], - dtype='datetime64[ns]', - freq='20S')) + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:20", + "2000-01-01 00:00:40", + "2000-01-01 00:01:00", + "2000-01-01 00:01:20", + "2000-01-01 00:01:40", + "2000-01-01 00:02:00", + ], + dtype="datetime64[ns]", + freq="20S", + ), + ) assert_series_equal(result, expected) def test_methods(): - g = test_frame.groupby('A') - r = g.resample('2s') + g = test_frame.groupby("A") + r = g.resample("2s") - for f in ['first', 'last', 'median', 'sem', 'sum', 'mean', - 'min', 'max']: + for f in ["first", "last", "median", "sem", "sum", "mean", "min", "max"]: result = getattr(r, f)() - expected = 
g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) - for f in ['size']: + for f in ["size"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_series_equal(result, expected) - for f in ['count']: + for f in ["count"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) # series only - for f in ['nunique']: + for f in ["nunique"]: result = getattr(r.B, f)() - expected = g.B.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.B.apply(lambda x: getattr(x.resample("2s"), f)()) assert_series_equal(result, expected) - for f in ['nearest', 'backfill', 'ffill', 'asfreq']: + for f in ["nearest", "backfill", "ffill", "asfreq"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) result = r.ohlc() - expected = g.apply(lambda x: x.resample('2s').ohlc()) + expected = g.apply(lambda x: x.resample("2s").ohlc()) assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample('2s'), f)(ddof=1)) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) assert_frame_equal(result, expected) def test_apply(): - g = test_frame.groupby('A') - r = g.resample('2s') + g = test_frame.groupby("A") + r = g.resample("2s") # reduction - expected = g.resample('2s').sum() + expected = g.resample("2s").sum() def f(x): - return x.resample('2s').sum() + return x.resample("2s").sum() result = r.apply(f) assert_frame_equal(result, expected) def f(x): - return x.resample('2s').apply(lambda y: y.sum()) + return x.resample("2s").apply(lambda y: y.sum()) result = g.apply(f) assert_frame_equal(result, expected) @@ -189,41 +210,40 @@ def f(x): def test_apply_with_mutated_index(): # GH 15169 - index = pd.date_range('1-1-2015', '12-31-15', freq='D') - df = DataFrame(data={'col1': np.random.rand(len(index))}, index=index) + index = pd.date_range("1-1-2015", "12-31-15", freq="D") + df = DataFrame(data={"col1": np.random.rand(len(index))}, index=index) def f(x): - s = Series([1, 2], index=['a', 'b']) + s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq='M')).apply(f) + expected = df.groupby(pd.Grouper(freq="M")).apply(f) - result = df.resample('M').apply(f) + result = df.resample("M").apply(f) assert_frame_equal(result, expected) # A case for series - expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f) - result = df['col1'].resample('M').apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) + result = df["col1"].resample("M").apply(f) assert_series_equal(result, expected) def test_resample_groupby_with_label(): # GH 13235 - index = date_range('2000-01-01', freq='2D', periods=5) - df = DataFrame(index=index, - data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]} - ) - result = df.groupby('col0').resample('1W', label='left').sum() - - mi = [np.array([0, 0, 1, 2]), - pd.to_datetime(np.array(['1999-12-26', '2000-01-02', - '2000-01-02', '2000-01-02']) - ) - ] - mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None]) - expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': 
[1, 1, 2, 1]}, - index=mindex - ) + index = date_range("2000-01-01", freq="2D", periods=5) + df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) + result = df.groupby("col0").resample("1W", label="left").sum() + + mi = [ + np.array([0, 0, 1, 2]), + pd.to_datetime( + np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + ), + ] + mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) + expected = DataFrame( + data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex + ) assert_frame_equal(result, expected) @@ -232,12 +252,12 @@ def test_consistency_with_window(): # consistent return values with window df = test_frame - expected = pd.Int64Index([1, 2, 3], name='A') - result = df.groupby('A').resample('2s').mean() + expected = pd.Int64Index([1, 2, 3], name="A") + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) - result = df.groupby('A').rolling(20).mean() + result = df.groupby("A").rolling(20).mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -245,12 +265,14 @@ def test_consistency_with_window(): def test_median_duplicate_columns(): # GH 14233 - df = DataFrame(np.random.randn(20, 3), - columns=list('aaa'), - index=pd.date_range('2012-01-01', periods=20, freq='s')) + df = DataFrame( + np.random.randn(20, 3), + columns=list("aaa"), + index=pd.date_range("2012-01-01", periods=20, freq="s"), + ) df2 = df.copy() - df2.columns = ['a', 'b', 'c'] - expected = df2.resample('5s').median() - result = df.resample('5s').median() + df2.columns = ["a", "b", "c"] + expected = df2.resample("5s").median() + result = df.resample("5s").median() expected.columns = result.columns assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3f767f8e7100fa..648d78d92e7d45 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -11,12 +11,11 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -test_series = Series(np.random.randn(1000), - index=date_range('1/1/2000', periods=1000)) +test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) def test_apply(): - grouper = Grouper(freq='A', label='right', closed='right') + grouper = Grouper(freq="A", label="right", closed="right") grouped = test_series.groupby(grouper) @@ -36,18 +35,18 @@ def test_count(): expected = test_series.groupby(lambda x: x.year).count() - grouper = Grouper(freq='A', label='right', closed='right') + grouper = Grouper(freq="A", label="right", closed="right") result = test_series.groupby(grouper).count() expected.index = result.index assert_series_equal(result, expected) - result = test_series.resample('A').count() + result = test_series.resample("A").count() expected.index = result.index assert_series_equal(result, expected) def test_numpy_reduction(): - result = test_series.resample('A', closed='right').prod() + result = test_series.resample("A", closed="right").prod() expected = test_series.groupby(lambda x: x.year).agg(np.prod) expected.index = result.index @@ -59,8 +58,8 @@ def test_apply_iteration(): # #2300 N = 1000 ind = pd.date_range(start="2000-01-01", freq="D", periods=N) - df = DataFrame({'open': 1, 'close': 2}, index=ind) - tg = Grouper(freq='M') + df = DataFrame({"open": 1, "close": 2}, index=ind) + tg = Grouper(freq="M") _, 
grouper, _ = tg._get_grouper(df) @@ -68,28 +67,33 @@ def test_apply_iteration(): grouped = df.groupby(grouper, group_keys=False) def f(df): - return df['close'] / df['open'] + return df["close"] / df["open"] # it works! result = grouped.apply(f) tm.assert_index_equal(result.index, df.index) -@pytest.mark.parametrize('name, func', [ - ('Int64Index', tm.makeIntIndex), - ('Index', tm.makeUnicodeIndex), - ('Float64Index', tm.makeFloatIndex), - ('MultiIndex', lambda m: tm.makeCustomIndex(m, 2)) -]) +@pytest.mark.parametrize( + "name, func", + [ + ("Int64Index", tm.makeIntIndex), + ("Index", tm.makeUnicodeIndex), + ("Float64Index", tm.makeFloatIndex), + ("MultiIndex", lambda m: tm.makeCustomIndex(m, 2)), + ], +) def test_fails_on_no_datetime_index(name, func): n = 2 index = func(n) - df = DataFrame({'a': np.random.randn(n)}, index=index) + df = DataFrame({"a": np.random.randn(n)}, index=index) - msg = ("Only valid with DatetimeIndex, TimedeltaIndex " - "or PeriodIndex, but got an instance of '{}'".format(name)) + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex " + "or PeriodIndex, but got an instance of '{}'".format(name) + ) with pytest.raises(TypeError, match=msg): - df.groupby(Grouper(freq='D')) + df.groupby(Grouper(freq="D")) def test_aaa_group_order(): @@ -97,46 +101,48 @@ def test_aaa_group_order(): # check TimeGrouper perform stable sorts n = 20 data = np.random.randn(n, 4) - df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), - datetime(2013, 1, 3), datetime(2013, 1, 4), - datetime(2013, 1, 5)] * 4 - grouped = df.groupby(Grouper(key='key', freq='D')) - - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), - df[::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), - df[1::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), - df[2::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), - df[3::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), - df[4::5]) + df = DataFrame(data, columns=["A", "B", "C", "D"]) + df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + grouped = df.groupby(Grouper(key="key", freq="D")) + + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), df[1::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), df[2::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), df[3::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5]) def test_aggregate_normal(resample_method): """Check TimeGrouper's aggregation is identical as normal groupby.""" - if resample_method == 'ohlc': - pytest.xfail(reason='DataError: No numeric types to aggregate') + if resample_method == "ohlc": + pytest.xfail(reason="DataError: No numeric types to aggregate") data = np.random.randn(20, 4) - normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - normal_df['key'] = [1, 2, 3, 4, 5] * 4 + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, 3, 4, 5] * 4 - dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), - datetime(2013, 1, 3), datetime(2013, 1, 4), - datetime(2013, 1, 5)] * 4 + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + 
datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 - normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") tm.assert_equal(expected, dt_result) # if TimeGrouper is used included, 'nth' doesn't work yet @@ -151,74 +157,81 @@ def test_aggregate_normal(resample_method): """ -@pytest.mark.parametrize('method, method_args, unit', [ - ('sum', dict(), 0), - ('sum', dict(min_count=0), 0), - ('sum', dict(min_count=1), np.nan), - ('prod', dict(), 1), - ('prod', dict(min_count=0), 1), - ('prod', dict(min_count=1), np.nan) -]) +@pytest.mark.parametrize( + "method, method_args, unit", + [ + ("sum", dict(), 0), + ("sum", dict(min_count=0), 0), + ("sum", dict(min_count=1), np.nan), + ("prod", dict(), 1), + ("prod", dict(min_count=0), 1), + ("prod", dict(min_count=1), np.nan), + ], +) def test_resample_entirly_nat_window(method, method_args, unit): - s = pd.Series([0] * 2 + [np.nan] * 2, - index=pd.date_range('2017', periods=4)) + s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) result = methodcaller(method, **method_args)(s.resample("2d")) - expected = pd.Series([0.0, unit], - index=pd.to_datetime(['2017-01-01', - '2017-01-03'])) + expected = pd.Series( + [0.0, unit], index=pd.to_datetime(["2017-01-01", "2017-01-03"]) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('func, fill_value', [ - ('min', np.nan), - ('max', np.nan), - ('sum', 0), - ('prod', 1), - ('count', 0), -]) +@pytest.mark.parametrize( + "func, fill_value", + [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)], +) def test_aggregate_with_nat(func, fill_value): # check TimeGrouper's aggregation is identical as normal groupby # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet n = 20 - data = np.random.randn(n, 4).astype('int64') - normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - normal_df['key'] = [1, 2, np.nan, 4, 5] * 4 - - dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, - datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - - normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) + data = np.random.randn(n, 4).astype("int64") + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() - pad = DataFrame([[fill_value] * 4], index=[3], - columns=['A', 'B', 'C', 'D']) + pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"]) expected = normal_result.append(pad) expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') + expected.index = date_range(start="2013-01-01", freq="D", periods=5, 
name="key")
     assert_frame_equal(expected, dt_result)
-    assert dt_result.index.name == 'key'
+    assert dt_result.index.name == "key"
 
 
 def test_aggregate_with_nat_size():
     # GH 9925
     n = 20
-    data = np.random.randn(n, 4).astype('int64')
-    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
-    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
-
-    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
-    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
-                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
-
-    normal_grouped = normal_df.groupby('key')
-    dt_grouped = dt_df.groupby(Grouper(key='key', freq='D'))
+    data = np.random.randn(n, 4).astype("int64")
+    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4
+
+    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    dt_df["key"] = [
+        datetime(2013, 1, 1),
+        datetime(2013, 1, 2),
+        pd.NaT,
+        datetime(2013, 1, 4),
+        datetime(2013, 1, 5),
+    ] * 4
+
+    normal_grouped = normal_df.groupby("key")
+    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
 
     normal_result = normal_grouped.size()
     dt_result = dt_grouped.size()
@@ -226,37 +239,41 @@ def test_aggregate_with_nat_size():
     pad = Series([0], index=[3])
     expected = normal_result.append(pad)
     expected = expected.sort_index()
-    expected.index = date_range(start='2013-01-01', freq='D',
-                                periods=5, name='key')
+    expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
     assert_series_equal(expected, dt_result)
-    assert dt_result.index.name == 'key'
+    assert dt_result.index.name == "key"
 
 
 def test_repr():
     # GH18203
-    result = repr(Grouper(key='A', freq='H'))
-    expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
-                "closed='left', label='left', how='mean', "
-                "convention='e', base=0)")
+    result = repr(Grouper(key="A", freq="H"))
+    expected = (
+        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
+        "closed='left', label='left', how='mean', "
+        "convention='e', base=0)"
+    )
     assert result == expected
 
 
-@pytest.mark.parametrize('method, method_args, expected_values', [
-    ('sum', dict(), [1, 0, 1]),
-    ('sum', dict(min_count=0), [1, 0, 1]),
-    ('sum', dict(min_count=1), [1, np.nan, 1]),
-    ('sum', dict(min_count=2), [np.nan, np.nan, np.nan]),
-    ('prod', dict(), [1, 1, 1]),
-    ('prod', dict(min_count=0), [1, 1, 1]),
-    ('prod', dict(min_count=1), [1, np.nan, 1]),
-    ('prod', dict(min_count=2), [np.nan, np.nan, np.nan]),
-])
+@pytest.mark.parametrize(
+    "method, method_args, expected_values",
+    [
+        ("sum", dict(), [1, 0, 1]),
+        ("sum", dict(min_count=0), [1, 0, 1]),
+        ("sum", dict(min_count=1), [1, np.nan, 1]),
+        ("sum", dict(min_count=2), [np.nan, np.nan, np.nan]),
+        ("prod", dict(), [1, 1, 1]),
+        ("prod", dict(min_count=0), [1, 1, 1]),
+        ("prod", dict(min_count=1), [1, np.nan, 1]),
+        ("prod", dict(min_count=2), [np.nan, np.nan, np.nan]),
+    ],
+)
 def test_upsample_sum(method, method_args, expected_values):
     s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
     resampled = s.resample("30T")
-    index = pd.to_datetime(['2017-01-01T00:00:00',
-                            '2017-01-01T00:30:00',
-                            '2017-01-01T01:00:00'])
+    index = pd.to_datetime(
+        ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"]
+    )
     result = methodcaller(method, **method_args)(resampled)
     expected = pd.Series(expected_values, index=index)
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
index 3498d30d116894..9ce419d5cd20ac 100644
--- a/pandas/tests/resample/test_timedelta.py
+++ 
b/pandas/tests/resample/test_timedelta.py @@ -10,72 +10,72 @@ def test_asfreq_bug(): - df = DataFrame(data=[1, 3], - index=[timedelta(), timedelta(minutes=3)]) - result = df.resample('1T').asfreq() - expected = DataFrame(data=[1, np.nan, np.nan, 3], - index=timedelta_range('0 day', - periods=4, - freq='1T')) + df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)]) + result = df.resample("1T").asfreq() + expected = DataFrame( + data=[1, np.nan, np.nan, 3], + index=timedelta_range("0 day", periods=4, freq="1T"), + ) assert_frame_equal(result, expected) def test_resample_with_nat(): # GH 13223 - index = pd.to_timedelta(['0s', pd.NaT, '2s']) - result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean() - expected = DataFrame({'value': [2.5, np.nan, 5.0]}, - index=timedelta_range('0 day', - periods=3, - freq='1S')) + index = pd.to_timedelta(["0s", pd.NaT, "2s"]) + result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean() + expected = DataFrame( + {"value": [2.5, np.nan, 5.0]}, + index=timedelta_range("0 day", periods=3, freq="1S"), + ) assert_frame_equal(result, expected) def test_resample_as_freq_with_subperiod(): # GH 13022 - index = timedelta_range('00:00:00', '00:10:00', freq='5T') - df = DataFrame(data={'value': [1, 5, 10]}, index=index) - result = df.resample('2T').asfreq() - expected_data = {'value': [1, np.nan, np.nan, np.nan, np.nan, 10]} - expected = DataFrame(data=expected_data, - index=timedelta_range('00:00:00', - '00:10:00', freq='2T')) + index = timedelta_range("00:00:00", "00:10:00", freq="5T") + df = DataFrame(data={"value": [1, 5, 10]}, index=index) + result = df.resample("2T").asfreq() + expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]} + expected = DataFrame( + data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2T") + ) tm.assert_frame_equal(result, expected) def test_resample_with_timedeltas(): - expected = DataFrame({'A': np.arange(1480)}) + expected = DataFrame({"A": np.arange(1480)}) expected = expected.groupby(expected.index // 30).sum() - expected.index = pd.timedelta_range('0 days', freq='30T', periods=50) + expected.index = pd.timedelta_range("0 days", freq="30T", periods=50) - df = DataFrame({'A': np.arange(1480)}, index=pd.to_timedelta( - np.arange(1480), unit='T')) - result = df.resample('30T').sum() + df = DataFrame( + {"A": np.arange(1480)}, index=pd.to_timedelta(np.arange(1480), unit="T") + ) + result = df.resample("30T").sum() assert_frame_equal(result, expected) - s = df['A'] - result = s.resample('30T').sum() - assert_series_equal(result, expected['A']) + s = df["A"] + result = s.resample("30T").sum() + assert_series_equal(result, expected["A"]) def test_resample_single_period_timedelta(): - s = Series(list(range(5)), index=pd.timedelta_range( - '1 day', freq='s', periods=5)) - result = s.resample('2s').sum() - expected = Series([1, 5, 4], index=pd.timedelta_range( - '1 day', freq='2s', periods=3)) + s = Series(list(range(5)), index=pd.timedelta_range("1 day", freq="s", periods=5)) + result = s.resample("2s").sum() + expected = Series( + [1, 5, 4], index=pd.timedelta_range("1 day", freq="2s", periods=3) + ) assert_series_equal(result, expected) def test_resample_timedelta_idempotency(): # GH 12072 - index = pd.timedelta_range('0', periods=9, freq='10L') + index = pd.timedelta_range("0", periods=9, freq="10L") series = Series(range(9), index=index) - result = series.resample('10L').mean() + result = series.resample("10L").mean() expected = series assert_series_equal(result, expected) 
@@ -83,14 +83,14 @@ def test_resample_timedelta_idempotency(): def test_resample_base_with_timedeltaindex(): # GH 10530 - rng = timedelta_range(start='0s', periods=25, freq='s') + rng = timedelta_range(start="0s", periods=25, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - with_base = ts.resample('2s', base=5).mean() - without_base = ts.resample('2s').mean() + with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() - exp_without_base = timedelta_range(start='0s', end='25s', freq='2s') - exp_with_base = timedelta_range(start='5s', end='29s', freq='2s') + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") tm.assert_index_equal(without_base.index, exp_without_base) tm.assert_index_equal(with_base.index, exp_with_base) @@ -98,15 +98,15 @@ def test_resample_base_with_timedeltaindex(): def test_resample_categorical_data_with_timedeltaindex(): # GH #12169 - df = DataFrame({'Group_obj': 'A'}, - index=pd.to_timedelta(list(range(20)), unit='s')) - df['Group'] = df['Group_obj'].astype('category') - result = df.resample('10s').agg(lambda x: (x.value_counts().index[0])) - expected = DataFrame({'Group_obj': ['A', 'A'], - 'Group': ['A', 'A']}, - index=pd.to_timedelta([0, 10], unit='s')) - expected = expected.reindex(['Group_obj', 'Group'], axis=1) - expected['Group'] = expected['Group_obj'].astype('category') + df = DataFrame({"Group_obj": "A"}, index=pd.to_timedelta(list(range(20)), unit="s")) + df["Group"] = df["Group_obj"].astype("category") + result = df.resample("10s").agg(lambda x: (x.value_counts().index[0])) + expected = DataFrame( + {"Group_obj": ["A", "A"], "Group": ["A", "A"]}, + index=pd.to_timedelta([0, 10], unit="s"), + ) + expected = expected.reindex(["Group_obj", "Group"], axis=1) + expected["Group"] = expected["Group_obj"].astype("category") tm.assert_frame_equal(result, expected) @@ -115,14 +115,14 @@ def test_resample_timedelta_values(): # check that timedelta dtype is preserved when NaT values are # introduced by the resampling - times = timedelta_range('1 day', '4 day', freq='4D') - df = DataFrame({'time': times}, index=times) + times = timedelta_range("1 day", "4 day", freq="4D") + df = DataFrame({"time": times}, index=times) - times2 = timedelta_range('1 day', '4 day', freq='2D') - exp = Series(times2, index=times2, name='time') + times2 = timedelta_range("1 day", "4 day", freq="2D") + exp = Series(times2, index=times2, name="time") exp.iloc[1] = pd.NaT - res = df.resample('2D').first()['time'] + res = df.resample("2D").first()["time"] tm.assert_series_equal(res, exp) - res = df['time'].resample('2D').first() + res = df["time"].resample("2D").first() tm.assert_series_equal(res, exp) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 72d04e26234a3a..16cfe3a469b340 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -14,28 +14,35 @@ class TestJoin: - def setup_method(self, method): # aggregate multiple columns - self.df = DataFrame({'key1': get_test_data(), - 'key2': get_test_data(), - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) # exclude a couple keys for fun - self.df = self.df[self.df['key2'] > 1] + self.df = self.df[self.df["key2"] > 1] - self.df2 = DataFrame({'key1': get_test_data(n=N // 
5), - 'key2': get_test_data(ngroups=NGROUPS // 2, - n=N // 5), - 'value': np.random.randn(N // 5)}) + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), + "value": np.random.randn(N // 5), + } + ) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value - self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, - index=data['C']) + self.source = DataFrame( + {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] + ) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -44,13 +51,11 @@ def test_cython_left_outer_join(self): ls, rs = libjoin.left_outer_join(left, right, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5, -1, -1]) + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -68,15 +73,35 @@ def test_cython_right_outer_join(self): rs, ls = libjoin.left_outer_join(right, left, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") # 0 1 1 1 - exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, - # 2 2 4 - 6, 7, 8, 6, 7, 8, -1]) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, - 4, 4, 4, 5, 5, 5, 6]) + exp_li = a_( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 3, + 4, + 5, + 3, + 4, + 5, + # 2 2 4 + 6, + 7, + 8, + 6, + 7, + 8, + -1, + ] + ) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -94,13 +119,11 @@ def test_cython_inner_join(self): ls, rs = libjoin.inner_join(left, right, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5]) + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -112,207 +135,209 @@ def test_cython_inner_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + joined_key2 = merge(self.df, self.df2, on="key2") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='left') + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left") def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='right') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + joined_key2 = merge(self.df, self.df2, on="key2", how="right") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="right") - joined_both = 
merge(self.df, self.df2, how='right') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='right') + joined_both = merge(self.df, self.df2, how="right") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right") def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='outer') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + joined_key2 = merge(self.df, self.df2, on="key2", how="outer") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer") - joined_both = merge(self.df, self.df2, how='outer') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='outer') + joined_both = merge(self.df, self.df2, how="outer") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer") def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='inner') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + joined_key2 = merge(self.df, self.df2, on="key2", how="inner") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner") - joined_both = merge(self.df, self.df2, how='inner') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='inner') + joined_both = merge(self.df, self.df2, how="inner") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): - joined = merge(self.df, self.df2, on='key2', - suffixes=['.foo', '.bar']) + joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) - assert 'key1.foo' in joined - assert 'key1.bar' in joined + assert "key1.foo" in joined + assert "key1.bar" in joined def test_handle_overlap_arbitrary_key(self): - joined = merge(self.df, self.df2, - left_on='key2', right_on='key1', - suffixes=['.foo', '.bar']) - assert 'key1.foo' in joined - assert 'key2.bar' in joined + joined = merge( + self.df, + self.df2, + left_on="key2", + right_on="key1", + suffixes=[".foo", ".bar"], + ) + assert "key1.foo" in joined + assert "key2.bar" in joined def test_join_on(self): target = self.target source = self.source - merged = target.join(source, on='C') - tm.assert_series_equal(merged['MergedA'], target['A'], - check_names=False) - tm.assert_series_equal(merged['MergedD'], target['D'], - check_names=False) + merged = target.join(source, on="C") + tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False) + tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], - 'value': [0, 0, 1, 1, 2]}) + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) + joined = df.join(df2, on="key") + expected = DataFrame( + {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]} + ) assert_frame_equal(joined, expected) # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - assert np.isnan(joined['two']['c']) - assert np.isnan(joined['three']['c']) + df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], 
columns=["one"]) + df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"]) + df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"]) + joined = df_a.join(df_b, on="one") + joined = joined.join(df_c, on="one") + assert np.isnan(joined["two"]["c"]) + assert np.isnan(joined["three"]["c"]) # merge column not p resent with pytest.raises(KeyError, match="^'E'$"): - target.join(source, on='E') + target.join(source, on="E") # overlap source_copy = source.copy() - source_copy['A'] = 0 - msg = ("You are trying to merge on float64 and object columns. If" - " you wish to proceed you should use pd.concat") + source_copy["A"] = 0 + msg = ( + "You are trying to merge on float64 and object columns. If" + " you wish to proceed you should use pd.concat" + ) with pytest.raises(ValueError, match=msg): - target.join(source_copy, on='A') + target.join(source_copy, on="A") def test_join_on_fails_with_different_right_index(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - msg = (r'len\(left_on\) must equal the number of levels in the index' - ' of "right"') + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) + msg = ( + r"len\(left_on\) must equal the number of levels in the index" ' of "right"' + ) with pytest.raises(ValueError, match=msg): - merge(df, df2, left_on='a', right_index=True) + merge(df, df2, left_on="a", right_index=True) def test_join_on_fails_with_different_left_index(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}, - index=tm.makeCustomIndex(3, 2)) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}) - msg = (r'len\(right_on\) must equal the number of levels in the index' - ' of "left"') + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}, + index=tm.makeCustomIndex(3, 2), + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} + ) + msg = ( + r"len\(right_on\) must equal the number of levels in the index" ' of "left"' + ) with pytest.raises(ValueError, match=msg): - merge(df, df2, right_on='b', left_index=True) + merge(df, df2, right_on="b", left_index=True) def test_join_on_fails_with_different_column_counts(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): - merge(df, df2, right_on='a', left_on=['a', 'b']) + merge(df, df2, right_on="a", left_on=["a", "b"]) - @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) + @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])]) def test_join_on_fails_with_wrong_object_type(self, wrong_type): # GH12081 - original issue # GH21220 - merging of Series and DataFrame is now allowed # Edited test to remove the Series 
object from test parameters - df = DataFrame({'a': [1, 1]}) - msg = ("Can only merge Series or DataFrame objects, a {} was passed" - .format(str(type(wrong_type)))) + df = DataFrame({"a": [1, 1]}) + msg = "Can only merge Series or DataFrame objects, a {} was passed".format( + str(type(wrong_type)) + ) with pytest.raises(TypeError, match=msg): - merge(wrong_type, df, left_on='a', right_on='a') + merge(wrong_type, df, left_on="a", right_on="a") with pytest.raises(TypeError, match=msg): - merge(df, wrong_type, left_on='a', right_on='a') + merge(df, wrong_type, left_on="a", right_on="a") def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on='C') - del expected['C'] + expected = self.target.join(self.source, on="C") + del expected["C"] - join_col = self.target.pop('C') + join_col = self.target.pop("C") result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge - merged = self.target.join(self.source.reindex([]), on='C') + merged = self.target.join(self.source.reindex([]), on="C") for col in self.source: assert col in merged assert merged[col].isna().all() - merged2 = self.target.join(self.source.reindex([]), on='C', - how='inner') + merged2 = self.target.join(self.source.reindex([]), on="C", how="inner") tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): - df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1]}, index=["a", "b"]) - joined = df.join(df2, on='key', how='inner') + joined = df.join(df2, on="key", how="inner") - expected = df.join(df2, on='key') - expected = expected[expected['value'].notna()] - tm.assert_series_equal(joined['key'], expected['key'], - check_dtype=False) - tm.assert_series_equal(joined['value'], expected['value'], - check_dtype=False) + expected = df.join(df2, on="key") + expected = expected[expected["value"].notna()] + tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False) + tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') + joined = df.join(df2, on=["key"]) + expected = df.join(df2, on="key") assert_frame_equal(joined, expected) def test_join_on_series(self): - result = self.target.join(self.source['MergedA'], on='C') - expected = self.target.join(self.source[['MergedA']], on='C') + result = self.target.join(self.source["MergedA"], on="C") + expected = self.target.join(self.source[["MergedA"]], on="C") assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 - df = DataFrame({'a': [1, 1]}) - ds = Series([2], index=[1], name='b') - result = df.join(ds, on='a') - expected = DataFrame({'a': [1, 1], - 'b': [2, 2]}, index=df.index) + df = DataFrame({"a": [1, 1]}) + ds = Series([2], index=[1], name="b") + result = df.join(ds, on="a") + expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # 
no overlapping blocks df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' + df1["bool"] = True + df1["string"] = "foo" df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. + df2["int"] = 1 + df2["float"] = 1.0 joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) @@ -323,20 +348,32 @@ def test_join_index_mixed(self, join_type): assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): - df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - assert df1['B'].dtype == np.int64 - assert df1['D'].dtype == np.bool_ + df1 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(10), + columns=["A", "B", "C", "D"], + ) + assert df1["B"].dtype == np.int64 + assert df1["D"].dtype == np.bool_ - df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) + df2 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(0, 10, 2), + columns=["A", "B", "C", "D"], + ) # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] + joined = df1.join(df2, lsuffix="_one", rsuffix="_two") + expected_columns = [ + "A_one", + "B_one", + "C_one", + "D_one", + "A_two", + "B_two", + "C_two", + "D_two", + ] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) @@ -345,37 +382,37 @@ def test_join_index_mixed_overlap(self): def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + x.join(DataFrame([3], index=[0], columns=["A"]), how="outer") def test_join_unconsolidated(self): # GH #331 - a = DataFrame(randn(30, 2), columns=['a', 'b']) + a = DataFrame(randn(30, 2), columns=["a", "b"]) c = Series(randn(30)) - a['c'] = c - d = DataFrame(randn(30, 1), columns=['q']) + a["c"] = c + d = DataFrame(randn(30, 1), columns=["q"]) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): - index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) + index1 = MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) - index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) + index2 = MultiIndex.from_arrays( + [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) - df1 = DataFrame(data=np.random.randn(6), index=index1, - columns=['var X']) - df2 = DataFrame(data=np.random.randn(6), index=index2, - columns=['var Y']) + df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"]) + df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"]) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) - joined = df1.join(df2, how='outer') + joined = df1.join(df2, how="outer") ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names @@ -385,7 +422,7 @@ def test_join_multiindex(self): df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) - joined = df1.join(df2, how='outer').sort_index(level=0) + joined = df1.join(df2, how="outer").sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names @@ -394,38 +431,62 @@ def test_join_multiindex(self): assert joined.index.names == index1.names def test_join_inner_multiindex(self): - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] data = np.random.randn(len(key1)) - data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - joined = data.join(to_join, on=['key1', 'key2'], how='inner') - expected = merge(data, to_join.reset_index(), - left_on=['key1', 'key2'], - right_on=['first', 'second'], how='inner', - sort=False) - - expected2 = merge(to_join, data, - right_on=['key1', 'key2'], left_index=True, - how='inner', sort=False) + data = DataFrame({"key1": key1, "key2": key2, "data": data}) + + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + to_join = DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) + + joined = data.join(to_join, on=["key1", "key2"], how="inner") + expected = merge( + data, + to_join.reset_index(), + left_on=["key1", "key2"], + right_on=["first", "second"], + how="inner", + sort=False, + ) + + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) assert_frame_equal(joined, expected2.reindex_like(joined)) - expected2 = merge(to_join, data, right_on=['key1', 'key2'], - 
left_index=True, how='inner', sort=False) + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) - expected = expected.drop(['first', 'second'], axis=1) + expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic @@ -435,39 +496,38 @@ def test_join_inner_multiindex(self): def test_join_hierarchical_mixed(self): # GH 2024 - df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) - new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) - other_df = DataFrame( - [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) - other_df.set_index('a', inplace=True) + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) + new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) + other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) + other_df.set_index("a", inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) - assert ('b', 'mean') in result - assert 'b' in result + assert ("b", "mean") in result + assert "b" in result def test_join_float64_float32(self): - a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) - b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) + a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32) joined = a.join(b) - assert joined.dtypes['a'] == 'float64' - assert joined.dtypes['b'] == 'float64' - assert joined.dtypes['c'] == 'float32' - - a = np.random.randint(0, 5, 100).astype('int64') - b = np.random.random(100).astype('float64') - c = np.random.random(100).astype('float32') - df = DataFrame({'a': a, 'b': b, 'c': c}) - xpdf = DataFrame({'a': a, 'b': b, 'c': c}) - s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) - rs = df.merge(s, left_on='a', right_index=True) - assert rs.dtypes['a'] == 'int64' - assert rs.dtypes['b'] == 'float64' - assert rs.dtypes['c'] == 'float32' - assert rs.dtypes['md'] == 'float32' - - xp = xpdf.merge(s, left_on='a', right_index=True) + assert joined.dtypes["a"] == "float64" + assert joined.dtypes["b"] == "float64" + assert joined.dtypes["c"] == "float32" + + a = np.random.randint(0, 5, 100).astype("int64") + b = np.random.random(100).astype("float64") + c = np.random.random(100).astype("float32") + df = DataFrame({"a": a, "b": b, "c": c}) + xpdf = DataFrame({"a": a, "b": b, "c": c}) + s = DataFrame(np.random.random(5).astype("float32"), columns=["md"]) + rs = df.merge(s, left_on="a", right_index=True) + assert rs.dtypes["a"] == "int64" + assert rs.dtypes["b"] == "float64" + assert rs.dtypes["c"] == "float32" + assert rs.dtypes["md"] == "float32" + + xp = xpdf.merge(s, left_on="a", right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): @@ -478,119 +538,124 @@ def test_join_many_non_unique_index(self): idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='outer') + result = idf1.join([idf2, idf3], how="outer") - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer") result = result.reset_index() expected = expected[result.columns] - expected['a'] = expected.a.astype('int64') - expected['b'] = 
expected.b.astype('int64') + expected["a"] = expected.a.astype("int64") + expected["b"] = expected.b.astype("int64") assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) - df3 = DataFrame( - {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='inner') + result = idf1.join([idf2, idf3], how="inner") - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner") result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - s = Series(np.repeat(np.arange(8), 2), - index=np.repeat(np.arange(8), 2), name='TEST') - inner = df.join(s, how='inner') - outer = df.join(s, how='outer') - left = df.join(s, how='left') - right = df.join(s, how='right') + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + s = Series( + np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST" + ) + inner = df.join(s, how="inner") + outer = df.join(s, how="outer") + left = df.join(s, how="left") + right = df.join(s, how="right") assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): - left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'value2': ['a', 'b', 'c']}, - index=['bar', 'baz', 'foo']) - - joined = left.join(right, on='key', sort=True) - expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], - 'value': [2, 3, 1, 4], - 'value2': ['a', 'b', 'c', 'c']}, - index=[1, 2, 0, 3]) + left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) assert_frame_equal(joined, expected) # smoke test - joined = left.join(right, on='key', sort=False) + joined = left.join(right, on="key", sort=False) tm.assert_index_equal(joined.index, pd.Index(list(range(4)))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index - df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) - df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"]) + df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) - expected = DataFrame({'a': [1, 2, 3, 3, 4], - 'b': [5, np.nan, 6, 7, np.nan]}, - index=[1, 2, 3, 3, 'a']) + expected = DataFrame( + {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, "a"], 
+ ) tm.assert_frame_equal(result, expected) - df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) - df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"]) + df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) - expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, - index=[1, 2, 2, 'a']) + expected = DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"] + ) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 - index = pd.period_range('2016-01-01', periods=16, freq='M') - df = DataFrame([i for i in range(len(index))], - index=index, columns=['pnum']) + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) df2 = concat([df, df]) - result = df.join(df2, how='inner', rsuffix='_df2') + result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), - columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) + columns=["pnum", "pnum_df2"], + index=df2.sort_index().index, + ) tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 - df = DataFrame(np.random.randn(20, 6), - columns=['a', 'b', 'c', 'd', 'e', 'f']) - df.insert(0, 'id', 0) - df.insert(5, 'dt', 'foo') + df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) + df.insert(0, "id", 0) + df.insert(5, "dt", "foo") - grouped = df.groupby('id') + grouped = df.groupby("id") mn = grouped.mean() cn = grouped.count() # it works! - mn.join(cn, rsuffix='_right') + mn.join(cn, rsuffix="_right") def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + df = DataFrame(np.random.randn(10, 6), columns=list("abcdef")) + df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] + df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] @@ -598,25 +663,25 @@ def _check_diff_index(df_list, result, exp_index): tm.assert_frame_equal(result, expected) # different join types - joined = df_list[0].join(df_list[1:], how='outer') + joined = df_list[0].join(df_list[1:], how="outer") _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) - joined = df_list[0].join(df_list[1:], how='inner') + joined = df_list[0].join(df_list[1:], how="inner") _check_diff_index(df_list, joined, df.index[2:8]) msg = "Joining multiple DataFrames only supported for joining on index" with pytest.raises(ValueError, match=msg): - df_list[0].join(df_list[1:], on='a') + df_list[0].join(df_list[1:], on="a") def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.loc[:, ['A', 'B']] - df2 = df.loc[:, ['C', 'D']] - df3 = df.loc[:, ['key']] + df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"]) + df["key"] = ["foo", "bar"] * 4 + df1 = df.loc[:, ["A", "B"]] + df2 = df.loc[:, ["C", "D"]] + df3 = df.loc[:, ["key"]] result = df1.join([df2, df3]) 
assert_frame_equal(result, df) @@ -624,15 +689,18 @@ def test_join_many_mixed(self): def test_join_dups(self): # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') + result = df.join(df, rsuffix="_2") result.columns = expected.columns assert_frame_equal(result, expected) @@ -643,70 +711,72 @@ def test_join_dups(self): z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") + z, left_index=True, right_index=True, how="outer" + ) dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"] assert_frame_equal(dta, expected) def test_join_multi_to_multi(self, join_type): # GH 20475 - leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], - names=['abc', 'xy', 'num']) - left = DataFrame({'v1': range(12)}, index=leftindex) - - rightindex = MultiIndex.from_product([list('abc'), list('xy')], - names=['abc', 'xy']) - right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, - index=rightindex) - - result = left.join(right, on=['abc', 'xy'], how=join_type) - expected = (left.reset_index() - .merge(right.reset_index(), - on=['abc', 'xy'], how=join_type) - .set_index(['abc', 'xy', 'num']) - ) + leftindex = MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = DataFrame({"v1": range(12)}, index=leftindex) + + rightindex = MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) + right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) + + result = left.join(right, on=["abc", "xy"], how=join_type) + expected = ( + left.reset_index() + .merge(right.reset_index(), on=["abc", "xy"], how=join_type) + .set_index(["abc", "xy", "num"]) + ) assert_frame_equal(expected, result) - msg = (r'len\(left_on\) must equal the number of levels in the index' - ' of "right"') + msg = ( + r"len\(left_on\) must equal the number of levels in the index" ' of "right"' + ) with pytest.raises(ValueError, match=msg): - left.join(right, on='xy', how=join_type) + left.join(right, on="xy", how=join_type) with pytest.raises(ValueError, match=msg): - right.join(left, on=['abc', 'xy'], how=join_type) + right.join(left, on=["abc", "xy"], how=join_type) def test_join_on_tz_aware_datetimeindex(self): # GH 23931, 26335 df1 = pd.DataFrame( { - 'date': pd.date_range(start='2018-01-01', periods=5, - tz='America/Chicago'), - 'vals': list('abcde') + "date": pd.date_range( + start="2018-01-01", periods=5, tz="America/Chicago" + ), + "vals": list("abcde"), } ) df2 = pd.DataFrame( { - 'date': pd.date_range(start='2018-01-03', periods=5, - tz='America/Chicago'), - 'vals_2': list('tuvwx') + "date": pd.date_range( + start="2018-01-03", periods=5, tz="America/Chicago" + ), + "vals_2": list("tuvwx"), } ) - result = df1.join(df2.set_index('date'), on='date') + result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - 
expected['vals_2'] = pd.Series([np.nan] * 2 + list('tuv'), - dtype=object) + expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) assert_frame_equal(result, expected) -def _check_join(left, right, result, join_col, how='left', - lsuffix='_x', rsuffix='_y'): +def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): # some smoke tests for c in join_col: - assert(result[c].notna().all()) + assert result[c].notna().all() left_grouped = left.groupby(join_col) right_grouped = right.groupby(join_col) @@ -718,9 +788,10 @@ def _check_join(left, right, result, join_col, how='left', try: lgroup = left_grouped.get_group(group_key) except KeyError: - if how in ('left', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) + if how in ("left", "inner"): + raise AssertionError( + "key %s should not have been in the join" % str(group_key) + ) _assert_all_na(l_joined, left.columns, join_col) else: @@ -729,9 +800,10 @@ def _check_join(left, right, result, join_col, how='left', try: rgroup = right_grouped.get_group(group_key) except KeyError: - if how in ('right', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) + if how in ("right", "inner"): + raise AssertionError( + "key %s should not have been in the join" % str(group_key) + ) _assert_all_na(r_joined, right.columns, join_col) else: @@ -739,14 +811,15 @@ def _check_join(left, right, result, join_col, how='left', def _restrict_to_columns(group, columns, suffix): - found = [c for c in group.columns - if c in columns or c.replace(suffix, '') in columns] + found = [ + c for c in group.columns if c in columns or c.replace(suffix, "") in columns + ] # filter group = group.loc[:, found] # get rid of suffixes, if any - group = group.rename(columns=lambda x: x.replace(suffix, '')) + group = group.rename(columns=lambda x: x.replace(suffix, "")) # put in the right order... 
group = group.loc[:, columns] @@ -761,18 +834,18 @@ def _assert_same_contents(join_chunk, source): svalues = source.fillna(NA_SENTINEL).drop_duplicates().values rows = {tuple(row) for row in jvalues} - assert(len(rows) == len(source)) - assert(all(tuple(row) in rows for row in svalues)) + assert len(rows) == len(source) + assert all(tuple(row) in rows for row in svalues) def _assert_all_na(join_chunk, source_columns, join_col): for c in source_columns: if c in join_col: continue - assert(join_chunk[c].isna().all()) + assert join_chunk[c].isna().all() -def _join_by_hand(a, b, how='left'): +def _join_by_hand(a, b, how="left"): join_index = a.index.join(b.index, how=how) a_re = a.reindex(join_index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8eb41415552602..80365e34fa87af 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -12,9 +12,20 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series, - TimedeltaIndex, UInt64Index) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge @@ -30,7 +41,7 @@ def get_test_data(ngroups=NGROUPS, n=N): arr = np.asarray(np.tile(unique_groups, n // ngroups)) if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)]) + arr = np.asarray(list(arr) + unique_groups[: n - len(arr)]) random.shuffle(arr) return arr @@ -38,21 +49,21 @@ def get_test_data(ngroups=NGROUPS, n=N): def get_series(): return [ - pd.Series([1], dtype='int64'), - pd.Series([1], dtype='Int64'), + pd.Series([1], dtype="int64"), + pd.Series([1], dtype="Int64"), pd.Series([1.23]), - pd.Series(['foo']), + pd.Series(["foo"]), pd.Series([True]), - pd.Series([pd.Timestamp('2018-01-01')]), - pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + pd.Series([pd.Timestamp("2018-01-01")]), + pd.Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), ] def get_series_na(): return [ - pd.Series([np.nan], dtype='Int64'), - pd.Series([np.nan], dtype='float'), - pd.Series([np.nan], dtype='object'), + pd.Series([np.nan], dtype="Int64"), + pd.Series([np.nan], dtype="float"), + pd.Series([np.nan], dtype="object"), pd.Series([pd.NaT]), ] @@ -85,82 +96,90 @@ def series_of_dtype_all_na(request): class TestMerge: - def setup_method(self, method): # aggregate multiple columns - self.df = DataFrame({'key1': get_test_data(), - 'key2': get_test_data(), - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) # exclude a couple keys for fun - self.df = self.df[self.df['key2'] > 1] - - self.df2 = DataFrame({'key1': get_test_data(n=N // 5), - 'key2': get_test_data(ngroups=NGROUPS // 2, - n=N // 5), - 'value': np.random.randn(N // 5)}) + self.df = self.df[self.df["key2"] > 1] + + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), + "value": np.random.randn(N // 5), + } + ) - self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': 
np.random.randn(7)}) - self.right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) + self.left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + self.right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) def test_merge_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() - df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") result = pd.merge(df_empty, df_a, left_index=True, right_index=True) - expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + expected = pd.DataFrame({"a": []}, index=[], dtype="int64") assert_frame_equal(result, expected) def test_merge_common(self): joined = merge(self.df, self.df2) - exp = merge(self.df, self.df2, on=['key1', 'key2']) + exp = merge(self.df, self.df2, on=["key1", "key2"]) tm.assert_frame_equal(joined, exp) def test_merge_index_as_on_arg(self): # GH14355 - left = self.df.set_index('key1') - right = self.df2.set_index('key1') - result = merge(left, right, on='key1') - expected = merge(self.df, self.df2, on='key1').set_index('key1') + left = self.df.set_index("key1") + right = self.df2.set_index("key1") + result = merge(left, right, on="key1") + expected = merge(self.df, self.df2, on="key1").set_index("key1") assert_frame_equal(result, expected) def test_merge_index_singlekey_right_vs_left(self): - left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': np.random.randn(7)}) - right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) - - merged1 = merge(left, right, left_on='key', - right_index=True, how='left', sort=False) - merged2 = merge(right, left, right_on='key', - left_index=True, how='right', sort=False) + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=False + ) assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) - merged1 = merge(left, right, left_on='key', - right_index=True, how='left', sort=True) - merged2 = merge(right, left, right_on='key', - left_index=True, how='right', sort=True) + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=True + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=True + ) assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) def test_merge_index_singlekey_inner(self): - left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': np.random.randn(7)}) - right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) # inner join - result = merge(left, right, left_on='key', right_index=True, - how='inner') - expected = left.join(right, on='key').loc[result.index] + result = merge(left, right, left_on="key", right_index=True, how="inner") + expected = left.join(right, on="key").loc[result.index] assert_frame_equal(result, expected) - result = merge(right, left, right_on='key', left_index=True, - how='inner') - expected = left.join(right, on='key').loc[result.index] + result = merge(right, left, right_on="key", left_index=True, how="inner") 
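
Editorial aside (not part of the patch): the single-key index-merge tests in this hunk assert that merging a key column of one frame against the other frame's index agrees with DataFrame.join(..., on=...) once the rows are aligned. A minimal, self-contained sketch of that equivalence, using made-up frames:

    import numpy as np
    import pandas as pd

    # Hypothetical data; every key in `left` is present in `right.index`.
    left = pd.DataFrame({"key": ["a", "b", "c", "a"], "v1": np.arange(4)})
    right = pd.DataFrame({"v2": [10, 20, 30]}, index=["a", "b", "c"])

    # Merge a column of `left` against the index of `right` ...
    merged = pd.merge(left, right, left_on="key", right_index=True, how="inner")
    # ... and compare with a join on the same key, aligned to the merge's row
    # order -- the same comparison the surrounding test performs.
    joined = left.join(right, on="key")
    pd.testing.assert_frame_equal(merged, joined.loc[merged.index])
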
+ expected = left.join(right, on="key").loc[result.index] assert_frame_equal(result, expected.loc[:, result.columns]) def test_merge_misspecified(self): @@ -171,166 +190,182 @@ def test_merge_misspecified(self): with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.right, right_index=True) - msg = ('Can only pass argument "on" OR "left_on" and "right_on", not' - ' a combination of both') + msg = ( + 'Can only pass argument "on" OR "left_on" and "right_on", not' + " a combination of both" + ) with pytest.raises(pd.errors.MergeError, match=msg): - merge(self.left, self.left, left_on='key', on='key') + merge(self.left, self.left, left_on="key", on="key") msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, left_on=['key1'], - right_on=['key1', 'key2']) + merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"]) def test_index_and_on_parameters_confusion(self): msg = "right_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=False, right_index=['key1', 'key2']) + merge( + self.df, + self.df2, + how="left", + left_index=False, + right_index=["key1", "key2"], + ) msg = "left_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=['key1', 'key2'], right_index=False) + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=False, + ) with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=['key1', 'key2'], right_index=['key1', 'key2']) + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=["key1", "key2"], + ) def test_merge_overlap(self): - merged = merge(self.left, self.left, on='key') - exp_len = (self.left['key'].value_counts() ** 2).sum() + merged = merge(self.left, self.left, on="key") + exp_len = (self.left["key"].value_counts() ** 2).sum() assert len(merged) == exp_len - assert 'v1_x' in merged - assert 'v1_y' in merged + assert "v1_x" in merged + assert "v1_y" in merged def test_merge_different_column_key_names(self): - left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'], - 'value': [5, 6, 7, 8]}) + left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]}) - merged = left.merge(right, left_on='lkey', right_on='rkey', - how='outer', sort=True) + merged = left.merge( + right, left_on="lkey", right_on="rkey", how="outer", sort=True + ) - exp = pd.Series(['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan], - name='lkey') - tm.assert_series_equal(merged['lkey'], exp) + exp = pd.Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey") + tm.assert_series_equal(merged["lkey"], exp) - exp = pd.Series(['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'], - name='rkey') - tm.assert_series_equal(merged['rkey'], exp) + exp = pd.Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey") + tm.assert_series_equal(merged["rkey"], exp) - exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name='value_x') - tm.assert_series_equal(merged['value_x'], exp) + exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x") + tm.assert_series_equal(merged["value_x"], exp) - exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name='value_y') - 
tm.assert_series_equal(merged['value_y'], exp) + exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y") + tm.assert_series_equal(merged["value_y"], exp) def test_merge_copy(self): - left = DataFrame({'a': 0, 'b': 1}, index=range(10)) - right = DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10)) + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, - right_index=True, copy=True) + merged = merge(left, right, left_index=True, right_index=True, copy=True) - merged['a'] = 6 - assert (left['a'] == 0).all() + merged["a"] = 6 + assert (left["a"] == 0).all() - merged['d'] = 'peekaboo' - assert (right['d'] == 'bar').all() + merged["d"] = "peekaboo" + assert (right["d"] == "bar").all() def test_merge_nocopy(self): - left = DataFrame({'a': 0, 'b': 1}, index=range(10)) - right = DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10)) + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, - right_index=True, copy=False) + merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged['a'] = 6 - assert (left['a'] == 6).all() + merged["a"] = 6 + assert (left["a"] == 6).all() - merged['d'] = 'peekaboo' - assert (right['d'] == 'peekaboo').all() + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame - left = DataFrame({'key': [1, 1, 2, 2, 3], - 'value': list(range(5))}, - columns=['value', 'key']) - right = DataFrame({'key': [1, 1, 2, 3, 4, 5], - 'rvalue': list(range(6))}) - - joined = merge(left, right, on='key', how='outer') - expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], - 'value': np.array([0, 0, 1, 1, 2, 3, 4, - np.nan, np.nan]), - 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]}, - columns=['value', 'key', 'rvalue']) + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"] + ) + right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))}) + + joined = merge(left, right, on="key", how="outer") + expected = DataFrame( + { + "key": [1, 1, 1, 1, 2, 2, 3, 4, 5], + "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), + "rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5], + }, + columns=["value", "key", "rvalue"], + ) assert_frame_equal(joined, expected) def test_merge_join_key_dtype_cast(self): # #8596 - df1 = DataFrame({'key': [1], 'v1': [10]}) - df2 = DataFrame({'key': [2], 'v1': [20]}) - df = merge(df1, df2, how='outer') - assert df['key'].dtype == 'int64' + df1 = DataFrame({"key": [1], "v1": [10]}) + df2 = DataFrame({"key": [2], "v1": [20]}) + df = merge(df1, df2, how="outer") + assert df["key"].dtype == "int64" - df1 = DataFrame({'key': [True], 'v1': [1]}) - df2 = DataFrame({'key': [False], 'v1': [0]}) - df = merge(df1, df2, how='outer') + df1 = DataFrame({"key": [True], "v1": [1]}) + df2 = DataFrame({"key": [False], "v1": [0]}) + df = merge(df1, df2, how="outer") # GH13169 # this really should be bool - assert df['key'].dtype == 'object' + assert df["key"].dtype == "object" - df1 = DataFrame({'val': [1]}) - df2 = DataFrame({'val': [2]}) + df1 = DataFrame({"val": [1]}) + df2 = DataFrame({"val": [2]}) lkey = np.array([1]) rkey = np.array([2]) - df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') - assert df['key_0'].dtype == 'int64' + df = merge(df1, df2, left_on=lkey, 
right_on=rkey, how="outer") + assert df["key_0"].dtype == "int64" def test_handle_join_key_pass_array(self): - left = DataFrame({'key': [1, 1, 2, 2, 3], - 'value': np.arange(5)}, - columns=['value', 'key']) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"] + ) + right = DataFrame({"rvalue": np.arange(6)}) key = np.array([1, 1, 2, 3, 4, 5]) - merged = merge(left, right, left_on='key', right_on=key, how='outer') - merged2 = merge(right, left, left_on=key, right_on='key', how='outer') + merged = merge(left, right, left_on="key", right_on=key, how="outer") + merged2 = merge(right, left, left_on=key, right_on="key", how="outer") - assert_series_equal(merged['key'], merged2['key']) - assert merged['key'].notna().all() - assert merged2['key'].notna().all() + assert_series_equal(merged["key"], merged2["key"]) + assert merged["key"].notna().all() + assert merged2["key"].notna().all() - left = DataFrame({'value': np.arange(5)}, columns=['value']) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame({"value": np.arange(5)}, columns=["value"]) + right = DataFrame({"rvalue": np.arange(6)}) lkey = np.array([1, 1, 2, 2, 3]) rkey = np.array([1, 1, 2, 3, 4, 5]) - merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - tm.assert_series_equal(merged['key_0'], Series([1, 1, 1, 1, 2, - 2, 3, 4, 5], - name='key_0')) + merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") + tm.assert_series_equal( + merged["key_0"], Series([1, 1, 1, 1, 2, 2, 3, 4, 5], name="key_0") + ) - left = DataFrame({'value': np.arange(3)}) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame({"value": np.arange(3)}) + right = DataFrame({"rvalue": np.arange(6)}) key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) - merged = merge(left, right, left_index=True, right_on=key, how='outer') - tm.assert_series_equal(merged['key_0'], Series(key, name='key_0')) + merged = merge(left, right, left_index=True, right_on=key, how="outer") + tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) def test_no_overlap_more_informative_error(self): dt = datetime.now() - df1 = DataFrame({'x': ['a']}, index=[dt]) + df1 = DataFrame({"x": ["a"]}, index=[dt]) - df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) - msg = ('No common columns to perform merge on. ' - 'Merge options: left_on={lon}, right_on={ron}, ' - 'left_index={lidx}, right_index={ridx}' - .format(lon=None, ron=None, lidx=False, ridx=False)) + msg = ( + "No common columns to perform merge on. 
" + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=None, ron=None, lidx=False, ridx=False + ) + ) with pytest.raises(MergeError, match=msg): merge(df1, df2) @@ -342,165 +377,191 @@ def test_merge_non_unique_indexes(self): dt3 = datetime(2012, 5, 3) dt4 = datetime(2012, 5, 4) - df1 = DataFrame({'x': ['a']}, index=[dt]) - df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + df1 = DataFrame({"x": ["a"]}, index=[dt]) + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) _check_merge(df1, df2) # Not monotonic - df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4]) - df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']}, - index=[dt3, dt3, dt2, dt2, dt, dt]) + df1 = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4]) + df2 = DataFrame( + {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt] + ) _check_merge(df1, df2) - df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt]) - df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt]) + df1 = DataFrame({"x": ["a", "b"]}, index=[dt, dt]) + df2 = DataFrame({"y": ["c", "d"]}, index=[dt, dt]) _check_merge(df1, df2) def test_merge_non_unique_index_many_to_many(self): dt = datetime(2012, 5, 1) dt2 = datetime(2012, 5, 2) dt3 = datetime(2012, 5, 3) - df1 = DataFrame({'x': ['a', 'b', 'c', 'd']}, - index=[dt2, dt2, dt, dt]) - df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']}, - index=[dt2, dt2, dt3, dt, dt]) + df1 = DataFrame({"x": ["a", "b", "c", "d"]}, index=[dt2, dt2, dt, dt]) + df2 = DataFrame( + {"y": ["e", "f", "g", " h", "i"]}, index=[dt2, dt2, dt3, dt, dt] + ) _check_merge(df1, df2) def test_left_merge_empty_dataframe(self): - left = DataFrame({'key': [1], 'value': [2]}) - right = DataFrame({'key': []}) + left = DataFrame({"key": [1], "value": [2]}) + right = DataFrame({"key": []}) - result = merge(left, right, on='key', how='left') + result = merge(left, right, on="key", how="left") assert_frame_equal(result, left) - result = merge(right, left, on='key', how='right') + result = merge(right, left, on="key", how="right") assert_frame_equal(result, left) - @pytest.mark.parametrize('kwarg', - [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]) + @pytest.mark.parametrize( + "kwarg", + [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ], + ) def test_merge_left_empty_right_empty(self, join_type, kwarg): # GH 10824 - left = pd.DataFrame(columns=['a', 'b', 'c']) - right = pd.DataFrame(columns=['x', 'y', 'z']) + left = pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) - exp_in = pd.DataFrame(columns=['a', 'b', 'c', 'x', 'y', 'z'], - index=pd.Index([], dtype=object), - dtype=object) + exp_in = pd.DataFrame( + columns=["a", "b", "c", "x", "y", "z"], + index=pd.Index([], dtype=object), + dtype=object, + ) result = pd.merge(left, right, how=join_type, **kwarg) tm.assert_frame_equal(result, exp_in) def test_merge_left_empty_right_notempty(self): # GH 10824 - left = pd.DataFrame(columns=['a', 'b', 'c']) - right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['x', 'y', 'z']) - - exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object), - 'b': np.array([np.nan] * 3, dtype=object), - 'c': np.array([np.nan] * 3, dtype=object), - 'x': [1, 4, 7], - 'y': [2, 5, 8], - 'z': [3, 6, 9]}, - columns=['a', 'b', 'c', 'x', 'y', 'z']) + 
left = pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": np.array([np.nan] * 3, dtype=object), + "b": np.array([np.nan] * 3, dtype=object), + "c": np.array([np.nan] * 3, dtype=object), + "x": [1, 4, 7], + "y": [2, 5, 8], + "z": [3, 6, 9], + }, + columns=["a", "b", "c", "x", "y", "z"], + ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype # result will have object dtype exp_in.index = exp_in.index.astype(object) def check1(exp, kwarg): - result = pd.merge(left, right, how='inner', **kwarg) + result = pd.merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='left', **kwarg) + result = pd.merge(left, right, how="left", **kwarg) tm.assert_frame_equal(result, exp) def check2(exp, kwarg): - result = pd.merge(left, right, how='right', **kwarg) + result = pd.merge(left, right, how="right", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='outer', **kwarg) + result = pd.merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x')]: + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + ]: check1(exp_in, kwarg) check2(exp_out, kwarg) - kwarg = dict(left_on='a', right_index=True) + kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) - exp_out['a'] = [0, 1, 2] + exp_out["a"] = [0, 1, 2] check2(exp_out, kwarg) - kwarg = dict(left_on='a', right_on='x') + kwarg = dict(left_on="a", right_on="x") check1(exp_in, kwarg) - exp_out['a'] = np.array([np.nan] * 3, dtype=object) + exp_out["a"] = np.array([np.nan] * 3, dtype=object) check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): # GH 10824 - left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['a', 'b', 'c']) - right = pd.DataFrame(columns=['x', 'y', 'z']) - - exp_out = pd.DataFrame({'a': [1, 4, 7], - 'b': [2, 5, 8], - 'c': [3, 6, 9], - 'x': np.array([np.nan] * 3, dtype=object), - 'y': np.array([np.nan] * 3, dtype=object), - 'z': np.array([np.nan] * 3, dtype=object)}, - columns=['a', 'b', 'c', 'x', 'y', 'z']) + left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": [1, 4, 7], + "b": [2, 5, 8], + "c": [3, 6, 9], + "x": np.array([np.nan] * 3, dtype=object), + "y": np.array([np.nan] * 3, dtype=object), + "z": np.array([np.nan] * 3, dtype=object), + }, + columns=["a", "b", "c", "x", "y", "z"], + ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype # result will have object dtype exp_in.index = exp_in.index.astype(object) def check1(exp, kwarg): - result = pd.merge(left, right, how='inner', **kwarg) + result = pd.merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='right', **kwarg) + result = pd.merge(left, right, how="right", **kwarg) tm.assert_frame_equal(result, exp) def check2(exp, kwarg): - result = pd.merge(left, right, how='left', **kwarg) + result = pd.merge(left, right, how="left", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='outer', **kwarg) + result = pd.merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - 
dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ]: check1(exp_in, kwarg) check2(exp_out, kwarg) def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): # GH 25183 - df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2}, - columns=['key', 'value']) + df = pd.DataFrame( + {"key": series_of_dtype, "value": series_of_dtype2}, + columns=["key", "value"], + ) df_empty = df[:0] - expected = pd.DataFrame({ - 'value_x': pd.Series(dtype=df.dtypes['value']), - 'key': pd.Series(dtype=df.dtypes['key']), - 'value_y': pd.Series(dtype=df.dtypes['value']), - }, columns=['value_x', 'key', 'value_y']) - actual = df_empty.merge(df, on='key') + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=df.dtypes["value"]), + "key": pd.Series(dtype=df.dtypes["key"]), + "value_y": pd.Series(dtype=df.dtypes["value"]), + }, + columns=["value_x", "key", "value_y"], + ) + actual = df_empty.merge(df, on="key") assert_frame_equal(actual, expected) - def test_merge_all_na_column(self, series_of_dtype, - series_of_dtype_all_na): + def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): # GH 25183 df_left = pd.DataFrame( - {'key': series_of_dtype, 'value': series_of_dtype_all_na}, - columns=['key', 'value']) + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) df_right = pd.DataFrame( - {'key': series_of_dtype, 'value': series_of_dtype_all_na}, - columns=['key', 'value']) - expected = pd.DataFrame({ - 'key': series_of_dtype, - 'value_x': series_of_dtype_all_na, - 'value_y': series_of_dtype_all_na, - }, columns=['key', 'value_x', 'value_y']) - actual = df_left.merge(df_right, on='key') + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) + expected = pd.DataFrame( + { + "key": series_of_dtype, + "value_x": series_of_dtype_all_na, + "value_y": series_of_dtype_all_na, + }, + columns=["key", "value_x", "value_y"], + ) + actual = df_left.merge(df_right, on="key") assert_frame_equal(actual, expected) def test_merge_nosort(self): @@ -508,26 +569,29 @@ def test_merge_nosort(self): from datetime import datetime - d = {"var1": np.random.randint(0, 10, size=10), - "var2": np.random.randint(0, 10, size=10), - "var3": [datetime(2012, 1, 12), - datetime(2011, 2, 4), - datetime(2010, 2, 3), - datetime(2012, 1, 12), - datetime(2011, 2, 4), - datetime(2012, 4, 3), - datetime(2012, 3, 4), - datetime(2008, 5, 1), - datetime(2010, 2, 3), - datetime(2012, 2, 3)]} + d = { + "var1": np.random.randint(0, 10, size=10), + "var2": np.random.randint(0, 10, size=10), + "var3": [ + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2010, 2, 3), + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2012, 4, 3), + datetime(2012, 3, 4), + datetime(2008, 5, 1), + datetime(2010, 2, 3), + datetime(2012, 2, 3), + ], + } df = DataFrame.from_dict(d) var3 = df.var3.unique() var3.sort() - new = DataFrame.from_dict({"var3": var3, - "var8": np.random.random(7)}) + new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)}) result = df.merge(new, on="var3", sort=False) - exp = merge(df, new, on='var3', sort=False) + exp = merge(df, new, on="var3", sort=False) assert_frame_equal(result, exp) assert (df.var3.unique() == result.var3.unique()).all() @@ -536,33 +600,42 @@ def test_merge_nan_right(self): df1 = 
DataFrame({"i1": [0, 1], "i2": [0, 1]}) df2 = DataFrame({"i1": [0], "i3": [0]}) result = df1.join(df2, on="i1", rsuffix="_") - expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1}, - 'i1_': {0: 0, 1: np.nan}, - 'i3': {0: 0.0, 1: np.nan}, - None: {0: 0, 1: 0}}) - .set_index(None) - .reset_index()[['i1', 'i2', 'i1_', 'i3']]) + expected = ( + DataFrame( + { + "i1": {0: 0.0, 1: 1}, + "i2": {0: 0, 1: 1}, + "i1_": {0: 0, 1: np.nan}, + "i3": {0: 0.0, 1: np.nan}, + None: {0: 0, 1: 0}, + } + ) + .set_index(None) + .reset_index()[["i1", "i2", "i1_", "i3"]] + ) assert_frame_equal(result, expected, check_dtype=False) df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]}) df2 = DataFrame({"i1": [0], "i3": [0.7]}) - result = df1.join(df2, rsuffix="_", on='i1') - expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan}, - 'i2': {0: 0.5, 1: 1.5}, - 'i3': {0: 0.69999999999999996, - 1: nan}}) - [['i1', 'i2', 'i1_', 'i3']]) + result = df1.join(df2, rsuffix="_", on="i1") + expected = DataFrame( + { + "i1": {0: 0, 1: 1}, + "i1_": {0: 0.0, 1: nan}, + "i2": {0: 0.5, 1: 1.5}, + "i3": {0: 0.69999999999999996, 1: nan}, + } + )[["i1", "i2", "i1_", "i3"]] assert_frame_equal(result, expected) def test_merge_type(self): class NotADataFrame(DataFrame): - @property def _constructor(self): return NotADataFrame nad = NotADataFrame(self.df) - result = nad.merge(self.df2, on='key1') + result = nad.merge(self.df2, on="key1") assert isinstance(result, NotADataFrame) @@ -574,237 +647,318 @@ def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 - d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)} - df = DataFrame(columns=list('dt')) + d = {"d": dt.datetime(2013, 11, 5, 5, 56), "t": dt.timedelta(0, 22500)} + df = DataFrame(columns=list("dt")) df = df.append(d, ignore_index=True) result = df.append(d, ignore_index=True) - expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56), - dt.datetime(2013, 11, 5, 5, 56)], - 't': [dt.timedelta(0, 22500), - dt.timedelta(0, 22500)]}) + expected = DataFrame( + { + "d": [dt.datetime(2013, 11, 5, 5, 56), dt.datetime(2013, 11, 5, 5, 56)], + "t": [dt.timedelta(0, 22500), dt.timedelta(0, 22500)], + } + ) assert_frame_equal(result, expected) td = np.timedelta64(300000000) lhs = DataFrame(Series([td, td], index=["A", "B"])) rhs = DataFrame(Series([td], index=["A"])) - result = lhs.join(rhs, rsuffix='r', how="left") - expected = DataFrame({'0': Series([td, td], index=list('AB')), - '0r': Series([td, NaT], index=list('AB'))}) + result = lhs.join(rhs, rsuffix="r", how="left") + expected = DataFrame( + { + "0": Series([td, td], index=list("AB")), + "0r": Series([td, NaT], index=list("AB")), + } + ) assert_frame_equal(result, expected) def test_other_datetime_unit(self): # GH 13389 - df1 = pd.DataFrame({'entity_id': [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name='days') - - for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', - 'datetime64[ns]']: - - df2 = s.astype(dtype).to_frame('days') + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") + + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: + + df2 = s.astype(dtype).to_frame("days") # coerces to datetime64[ns], thus should not be affected - assert df2['days'].dtype == 'datetime64[ns]' + assert df2["days"].dtype == "datetime64[ns]" - 
result = df1.merge(df2, left_on='entity_id', right_index=True) + result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame({'entity_id': [101, 102], - 'days': np.array(['nat', 'nat'], - dtype='datetime64[ns]')}, - columns=['entity_id', 'days']) + exp = pd.DataFrame( + { + "entity_id": [101, 102], + "days": np.array(["nat", "nat"], dtype="datetime64[ns]"), + }, + columns=["entity_id", "days"], + ) tm.assert_frame_equal(result, exp) - @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_other_timedelta_unit(self, unit): # GH 13389 - df1 = pd.DataFrame({'entity_id': [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name='days') + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") dtype = "m8[{}]".format(unit) - df2 = s.astype(dtype).to_frame('days') - assert df2['days'].dtype == 'm8[ns]' + df2 = s.astype(dtype).to_frame("days") + assert df2["days"].dtype == "m8[ns]" - result = df1.merge(df2, left_on='entity_id', right_index=True) + result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame({'entity_id': [101, 102], - 'days': np.array(['nat', 'nat'], - dtype=dtype)}, - columns=['entity_id', 'days']) + exp = pd.DataFrame( + {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, + columns=["entity_id", "days"], + ) tm.assert_frame_equal(result, exp) def test_overlapping_columns_error_message(self): - df = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9]}) - df2 = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9]}) - - df.columns = ['key', 'foo', 'foo'] - df2.columns = ['key', 'bar', 'bar'] - expected = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9], - 'v3': [4, 5, 6], - 'v4': [7, 8, 9]}) - expected.columns = ['key', 'foo', 'foo', 'bar', 'bar'] + df = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + df2 = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + + df.columns = ["key", "foo", "foo"] + df2.columns = ["key", "bar", "bar"] + expected = DataFrame( + { + "key": [1, 2, 3], + "v1": [4, 5, 6], + "v2": [7, 8, 9], + "v3": [4, 5, 6], + "v4": [7, 8, 9], + } + ) + expected.columns = ["key", "foo", "foo", "bar", "bar"] assert_frame_equal(merge(df, df2), expected) # #2649, #10639 - df2.columns = ['key1', 'foo', 'foo'] - msg = (r"Data columns not unique: Index\(\['foo', 'foo'\]," - r" dtype='object'\)") + df2.columns = ["key1", "foo", "foo"] + msg = r"Data columns not unique: Index\(\['foo', 'foo'\]," r" dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) def test_merge_on_datetime64tz(self): # GH11405 - left = pd.DataFrame({'key': pd.date_range('20151010', periods=2, - tz='US/Eastern'), - 'value': [1, 2]}) - right = pd.DataFrame({'key': pd.date_range('20151011', periods=3, - tz='US/Eastern'), - 'value': [1, 2, 3]}) - - expected = DataFrame({'key': pd.date_range('20151010', periods=4, - tz='US/Eastern'), - 'value_x': [1, 2, np.nan, np.nan], - 'value_y': [np.nan, 1, 2, 3]}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1, 2], + } + ) + right = pd.DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=4, tz="US/Eastern"), + 
"value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - left = pd.DataFrame({'key': [1, 2], - 'value': pd.date_range('20151010', periods=2, - tz='US/Eastern')}) - right = pd.DataFrame({'key': [2, 3], - 'value': pd.date_range('20151011', periods=2, - tz='US/Eastern')}) - expected = DataFrame({ - 'key': [1, 2, 3], - 'value_x': list(pd.date_range('20151010', periods=2, - tz='US/Eastern')) + [pd.NaT], - 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2, - tz='US/Eastern'))}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + { + "key": [1, 2], + "value": pd.date_range("20151010", periods=2, tz="US/Eastern"), + } + ) + right = pd.DataFrame( + { + "key": [2, 3], + "value": pd.date_range("20151011", periods=2, tz="US/Eastern"), + } + ) + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern")) + + [pd.NaT], + "value_y": [pd.NaT] + + list(pd.date_range("20151011", periods=2, tz="US/Eastern")), + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' - assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + assert result["value_x"].dtype == "datetime64[ns, US/Eastern]" + assert result["value_y"].dtype == "datetime64[ns, US/Eastern]" def test_merge_on_datetime64tz_empty(self): # https://github.com/pandas-dev/pandas/issues/25014 - dtz = pd.DatetimeTZDtype(tz='UTC') - right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)], - 'value': [4.0], - 'date2': [pd.Timestamp('2019', tz=dtz.tz)]}, - columns=['date', 'value', 'date2']) + dtz = pd.DatetimeTZDtype(tz="UTC") + right = pd.DataFrame( + { + "date": [pd.Timestamp("2018", tz=dtz.tz)], + "value": [4.0], + "date2": [pd.Timestamp("2019", tz=dtz.tz)], + }, + columns=["date", "value", "date2"], + ) left = right[:0] - result = left.merge(right, on='date') - expected = pd.DataFrame({ - 'value_x': pd.Series(dtype=float), - 'date2_x': pd.Series(dtype=dtz), - 'date': pd.Series(dtype=dtz), - 'value_y': pd.Series(dtype=float), - 'date2_y': pd.Series(dtype=dtz), - }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y']) + result = left.merge(right, on="date") + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=float), + "date2_x": pd.Series(dtype=dtz), + "date": pd.Series(dtype=dtz), + "value_y": pd.Series(dtype=float), + "date2_y": pd.Series(dtype=dtz), + }, + columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + ) tm.assert_frame_equal(result, expected) def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 - df1 = pd.DataFrame(pd.date_range( - '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), - columns=['date']) - df1['value'] = 1 - df2 = pd.DataFrame({ - 'date': pd.to_datetime([ - '2017-10-29 03:00:00', '2017-10-29 04:00:00', - '2017-10-29 05:00:00' - ]), - 'value': 2 - }) - df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( - 'Europe/Madrid') - result = pd.merge(df1, df2, how='outer', on='date') - expected = pd.DataFrame({ - 'date': pd.date_range( - '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'), - 'value_x': [1] * 4 + [np.nan] * 3, - 'value_y': [np.nan] * 4 + [2] * 3 - }) + df1 = pd.DataFrame( + pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + columns=["date"], + ) + df1["value"] = 1 + df2 = pd.DataFrame( + { + "date": 
pd.to_datetime( + [ + "2017-10-29 03:00:00", + "2017-10-29 04:00:00", + "2017-10-29 05:00:00", + ] + ), + "value": 2, + } + ) + df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid") + result = pd.merge(df1, df2, how="outer", on="date") + expected = pd.DataFrame( + { + "date": pd.date_range( + "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + ), + "value_x": [1] * 4 + [np.nan] * 3, + "value_y": [np.nan] * 4 + [2] * 3, + } + ) assert_frame_equal(result, expected) def test_merge_non_unique_period_index(self): # GH #16871 - index = pd.period_range('2016-01-01', periods=16, freq='M') - df = DataFrame([i for i in range(len(index))], - index=index, columns=['pnum']) + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) df2 = concat([df, df]) - result = df.merge(df2, left_index=True, right_index=True, how='inner') + result = df.merge(df2, left_index=True, right_index=True, how="inner") expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), - columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index) + columns=["pnum_x", "pnum_y"], + index=df2.sort_index().index, + ) tm.assert_frame_equal(result, expected) def test_merge_on_periods(self): - left = pd.DataFrame({'key': pd.period_range('20151010', periods=2, - freq='D'), - 'value': [1, 2]}) - right = pd.DataFrame({'key': pd.period_range('20151011', periods=3, - freq='D'), - 'value': [1, 2, 3]}) - - expected = DataFrame({'key': pd.period_range('20151010', periods=4, - freq='D'), - 'value_x': [1, 2, np.nan, np.nan], - 'value_y': [np.nan, 1, 2, 3]}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]} + ) + right = pd.DataFrame( + { + "key": pd.period_range("20151011", periods=3, freq="D"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.period_range("20151010", periods=4, freq="D"), + "value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - left = pd.DataFrame({'key': [1, 2], - 'value': pd.period_range('20151010', periods=2, - freq='D')}) - right = pd.DataFrame({'key': [2, 3], - 'value': pd.period_range('20151011', periods=2, - freq='D')}) - - exp_x = pd.period_range('20151010', periods=2, freq='D') - exp_y = pd.period_range('20151011', periods=2, freq='D') - expected = DataFrame({'key': [1, 2, 3], - 'value_x': list(exp_x) + [pd.NaT], - 'value_y': [pd.NaT] + list(exp_y)}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")} + ) + right = pd.DataFrame( + {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")} + ) + + exp_x = pd.period_range("20151010", periods=2, freq="D") + exp_y = pd.period_range("20151011", periods=2, freq="D") + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(exp_x) + [pd.NaT], + "value_y": [pd.NaT] + list(exp_y), + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - assert result['value_x'].dtype == 'Period[D]' - assert result['value_y'].dtype == 'Period[D]' + assert result["value_x"].dtype == "Period[D]" + assert result["value_y"].dtype == "Period[D]" def test_indicator(self): # PR #10054. xref #7412 and closes #8790. 
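
Editorial aside (not part of the patch): the reformatting of test_indicator that follows exercises merge's indicator argument. A short sketch of that behaviour, with made-up frames:

    import pandas as pd

    df1 = pd.DataFrame({"col1": [0, 1]})
    df2 = pd.DataFrame({"col1": [1, 2]})

    # indicator=True adds a categorical "_merge" column recording whether each
    # row came from the left frame only, the right frame only, or both.
    out = pd.merge(df1, df2, on="col1", how="outer", indicator=True)
    print(out["_merge"].tolist())   # ['left_only', 'both', 'right_only']

    # Passing a string instead of True uses that string as the column name.
    out2 = pd.merge(df1, df2, on="col1", how="outer", indicator="origin")
    print(out2.columns.tolist())    # ['col1', 'origin']
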
- df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2], - 'col_left': ['a', 'b']}) + df1 = DataFrame( + {"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]} + ) df1_copy = df1.copy() - df2 = DataFrame({'col1': [1, 2, 3, 4, 5], - 'col_conflict': [1, 2, 3, 4, 5], - 'col_right': [2, 2, 2, 2, 2]}) + df2 = DataFrame( + { + "col1": [1, 2, 3, 4, 5], + "col_conflict": [1, 2, 3, 4, 5], + "col_right": [2, 2, 2, 2, 2], + } + ) df2_copy = df2.copy() - df_result = DataFrame({ - 'col1': [0, 1, 2, 3, 4, 5], - 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan], - 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan], - 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5], - 'col_right': [np.nan, 2, 2, 2, 2, 2]}) - df_result['_merge'] = Categorical( - ['left_only', 'both', 'right_only', - 'right_only', 'right_only', 'right_only'], - categories=['left_only', 'right_only', 'both']) - - df_result = df_result[['col1', 'col_conflict_x', 'col_left', - 'col_conflict_y', 'col_right', '_merge']] - - test = merge(df1, df2, on='col1', how='outer', indicator=True) + df_result = DataFrame( + { + "col1": [0, 1, 2, 3, 4, 5], + "col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan], + "col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan], + "col_conflict_y": [np.nan, 1, 2, 3, 4, 5], + "col_right": [np.nan, 2, 2, 2, 2, 2], + } + ) + df_result["_merge"] = Categorical( + [ + "left_only", + "both", + "right_only", + "right_only", + "right_only", + "right_only", + ], + categories=["left_only", "right_only", "both"], + ) + + df_result = df_result[ + [ + "col1", + "col_conflict_x", + "col_left", + "col_conflict_y", + "col_right", + "_merge", + ] + ] + + test = merge(df1, df2, on="col1", how="outer", indicator=True) assert_frame_equal(test, df_result) - test = df1.merge(df2, on='col1', how='outer', indicator=True) + test = df1.merge(df2, on="col1", how="outer", indicator=True) assert_frame_equal(test, df_result) # No side effects @@ -814,258 +968,337 @@ def test_indicator(self): # Check with custom name df_result_custom_name = df_result df_result_custom_name = df_result_custom_name.rename( - columns={'_merge': 'custom_name'}) + columns={"_merge": "custom_name"} + ) test_custom_name = merge( - df1, df2, on='col1', how='outer', indicator='custom_name') + df1, df2, on="col1", how="outer", indicator="custom_name" + ) assert_frame_equal(test_custom_name, df_result_custom_name) test_custom_name = df1.merge( - df2, on='col1', how='outer', indicator='custom_name') + df2, on="col1", how="outer", indicator="custom_name" + ) assert_frame_equal(test_custom_name, df_result_custom_name) # Check only accepts strings and booleans msg = "indicator option can only accept boolean or string arguments" with pytest.raises(ValueError, match=msg): - merge(df1, df2, on='col1', how='outer', indicator=5) + merge(df1, df2, on="col1", how="outer", indicator=5) with pytest.raises(ValueError, match=msg): - df1.merge(df2, on='col1', how='outer', indicator=5) + df1.merge(df2, on="col1", how="outer", indicator=5) # Check result integrity - test2 = merge(df1, df2, on='col1', how='left', indicator=True) - assert (test2._merge != 'right_only').all() - test2 = df1.merge(df2, on='col1', how='left', indicator=True) - assert (test2._merge != 'right_only').all() + test2 = merge(df1, df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() + test2 = df1.merge(df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() - test3 = merge(df1, df2, on='col1', how='right', indicator=True) - assert 
(test3._merge != 'left_only').all() - test3 = df1.merge(df2, on='col1', how='right', indicator=True) - assert (test3._merge != 'left_only').all() + test3 = merge(df1, df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() + test3 = df1.merge(df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() - test4 = merge(df1, df2, on='col1', how='inner', indicator=True) - assert (test4._merge == 'both').all() - test4 = df1.merge(df2, on='col1', how='inner', indicator=True) - assert (test4._merge == 'both').all() + test4 = merge(df1, df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() + test4 = df1.merge(df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() # Check if working name in df - for i in ['_right_indicator', '_left_indicator', '_merge']: - df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]}) - - msg = ("Cannot use `indicator=True` option when data contains a" - " column named {}|" - "Cannot use name of an existing column for indicator" - " column").format(i) + for i in ["_right_indicator", "_left_indicator", "_merge"]: + df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) + + msg = ( + "Cannot use `indicator=True` option when data contains a" + " column named {}|" + "Cannot use name of an existing column for indicator" + " column" + ).format(i) with pytest.raises(ValueError, match=msg): - merge(df1, df_badcolumn, on='col1', - how='outer', indicator=True) + merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) with pytest.raises(ValueError, match=msg): - df1.merge(df_badcolumn, on='col1', how='outer', indicator=True) + df1.merge(df_badcolumn, on="col1", how="outer", indicator=True) # Check for name conflict with custom name - df_badcolumn = DataFrame( - {'col1': [1, 2], 'custom_column_name': [2, 2]}) + df_badcolumn = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]}) msg = "Cannot use name of an existing column for indicator column" with pytest.raises(ValueError, match=msg): - merge(df1, df_badcolumn, on='col1', how='outer', - indicator='custom_column_name') + merge( + df1, + df_badcolumn, + on="col1", + how="outer", + indicator="custom_column_name", + ) with pytest.raises(ValueError, match=msg): - df1.merge(df_badcolumn, on='col1', how='outer', - indicator='custom_column_name') + df1.merge( + df_badcolumn, on="col1", how="outer", indicator="custom_column_name" + ) # Merge on multiple columns - df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']}) + df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]}) - df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']}) + df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]}) - hand_coded_result = DataFrame({'col1': [0, 1, 1, 3], - 'col2': ['a', 'b', 'x', 'y']}) - hand_coded_result['_merge'] = Categorical( - ['left_only', 'both', 'right_only', 'right_only'], - categories=['left_only', 'right_only', 'both']) + hand_coded_result = DataFrame( + {"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]} + ) + hand_coded_result["_merge"] = Categorical( + ["left_only", "both", "right_only", "right_only"], + categories=["left_only", "right_only", "both"], + ) - test5 = merge(df3, df4, on=['col1', 'col2'], - how='outer', indicator=True) + test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True) assert_frame_equal(test5, hand_coded_result) - test5 = df3.merge(df4, on=['col1', 'col2'], - how='outer', indicator=True) + test5 = df3.merge(df4, on=["col1", "col2"], how="outer", 
indicator=True) assert_frame_equal(test5, hand_coded_result) def test_validation(self): - left = DataFrame({'a': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse']}, - index=range(4)) + left = DataFrame( + {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]}, + index=range(4), + ) - right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay', 'chirp']}, - index=range(5)) + right = DataFrame( + { + "a": ["a", "b", "c", "d", "e"], + "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"], + }, + index=range(5), + ) # Make sure no side effects. left_copy = left.copy() right_copy = right.copy() - result = merge(left, right, left_index=True, right_index=True, - validate='1:1') + result = merge(left, right, left_index=True, right_index=True, validate="1:1") assert_frame_equal(left, left_copy) assert_frame_equal(right, right_copy) # make sure merge still correct - expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse'], - 'a_y': ['a', 'b', 'c', 'd'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - index=range(4), - columns=['a_x', 'b', 'a_y', 'c']) - - result = merge(left, right, left_index=True, right_index=True, - validate='one_to_one') + expected = DataFrame( + { + "a_x": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "a_y": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + columns=["a_x", "b", "a_y", "c"], + ) + + result = merge( + left, right, left_index=True, right_index=True, validate="one_to_one" + ) assert_frame_equal(result, expected) - expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - index=range(4)) + expected_2 = DataFrame( + { + "a": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + ) - result = merge(left, right, on='a', validate='1:1') + result = merge(left, right, on="a", validate="1:1") assert_frame_equal(left, left_copy) assert_frame_equal(right, right_copy) assert_frame_equal(result, expected_2) - result = merge(left, right, on='a', validate='one_to_one') + result = merge(left, right, on="a", validate="one_to_one") assert_frame_equal(result, expected_2) # One index, one column - expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'], - 'a': ['a', 'b', 'c', 'd'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - columns=['b', 'a', 'c'], - index=range(4)) - - left_index_reset = left.set_index('a') - result = merge(left_index_reset, right, left_index=True, - right_on='a', validate='one_to_one') + expected_3 = DataFrame( + { + "b": ["cat", "dog", "weasel", "horse"], + "a": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... 
weasel noise?", "nay"], + }, + columns=["b", "a", "c"], + index=range(4), + ) + + left_index_reset = left.set_index("a") + result = merge( + left_index_reset, + right, + left_index=True, + right_on="a", + validate="one_to_one", + ) assert_frame_equal(result, expected_3) # Dups on right - right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']}, - index=[4])) - merge(left, right_w_dups, left_index=True, right_index=True, - validate='one_to_many') + right_w_dups = right.append(pd.DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])) + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_many", + ) - msg = ("Merge keys are not unique in right dataset; not a one-to-one" - " merge") + msg = "Merge keys are not unique in right dataset; not a one-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left, right_w_dups, left_index=True, right_index=True, - validate='one_to_one') + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_one", + ) with pytest.raises(MergeError, match=msg): - merge(left, right_w_dups, on='a', validate='one_to_one') + merge(left, right_w_dups, on="a", validate="one_to_one") # Dups on left - left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, - index=[3]), sort=True) - merge(left_w_dups, right, left_index=True, right_index=True, - validate='many_to_one') + left_w_dups = left.append( + pd.DataFrame({"a": ["a"], "c": ["cow"]}, index=[3]), sort=True + ) + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="many_to_one", + ) - msg = ("Merge keys are not unique in left dataset; not a one-to-one" - " merge") + msg = "Merge keys are not unique in left dataset; not a one-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right, left_index=True, right_index=True, - validate='one_to_one') + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="one_to_one", + ) with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right, on='a', validate='one_to_one') + merge(left_w_dups, right, on="a", validate="one_to_one") # Dups on both - merge(left_w_dups, right_w_dups, on='a', validate='many_to_many') + merge(left_w_dups, right_w_dups, on="a", validate="many_to_many") - msg = ("Merge keys are not unique in right dataset; not a many-to-one" - " merge") + msg = "Merge keys are not unique in right dataset; not a many-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right_w_dups, left_index=True, - right_index=True, validate='many_to_one') - - msg = ("Merge keys are not unique in left dataset; not a one-to-many" - " merge") + merge( + left_w_dups, + right_w_dups, + left_index=True, + right_index=True, + validate="many_to_one", + ) + + msg = "Merge keys are not unique in left dataset; not a one-to-many" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right_w_dups, on='a', - validate='one_to_many') + merge(left_w_dups, right_w_dups, on="a", validate="one_to_many") # Check invalid arguments msg = "Not a valid argument for validate" with pytest.raises(ValueError, match=msg): - merge(left, right, on='a', validate='jibberish') + merge(left, right, on="a", validate="jibberish") # Two column merge, dups in both, but jointly no dups. 
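
Editorial aside (not part of the patch): the two-column validation case below relies on keys that are duplicated column-by-column yet unique as a pair. A minimal sketch, with made-up data, of why the joint key passes a one-to-one check that a single duplicated column fails:

    import pandas as pd

    # "a" is duplicated on both sides, but the pair ("a", "b") is unique.
    left = pd.DataFrame({"a": ["x", "x"], "b": [0, 1], "c": ["c0", "c1"]})
    right = pd.DataFrame({"a": ["x", "x"], "b": [0, 1], "d": ["d0", "d1"]})

    # Jointly unique keys: the one-to-one validation passes.
    pd.merge(left, right, on=["a", "b"], validate="one_to_one")

    # Merging on "a" alone raises, since "a" is duplicated in both frames.
    try:
        pd.merge(left, right, on="a", validate="one_to_one")
    except pd.errors.MergeError as err:
        print(err)
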
- left = DataFrame({'a': ['a', 'a', 'b', 'b'], - 'b': [0, 1, 0, 1], - 'c': ['cat', 'dog', 'weasel', 'horse']}, - index=range(4)) - - right = DataFrame({'a': ['a', 'a', 'b'], - 'b': [0, 1, 0], - 'd': ['meow', 'bark', 'um... weasel noise?']}, - index=range(3)) - - expected_multi = DataFrame({'a': ['a', 'a', 'b'], - 'b': [0, 1, 0], - 'c': ['cat', 'dog', 'weasel'], - 'd': ['meow', 'bark', - 'um... weasel noise?']}, - index=range(3)) - - msg = ("Merge keys are not unique in either left or right dataset;" - " not a one-to-one merge") + left = DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": [0, 1, 0, 1], + "c": ["cat", "dog", "weasel", "horse"], + }, + index=range(4), + ) + + right = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ) + + expected_multi = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "c": ["cat", "dog", "weasel"], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ) + + msg = ( + "Merge keys are not unique in either left or right dataset;" + " not a one-to-one merge" + ) with pytest.raises(MergeError, match=msg): - merge(left, right, on='a', validate='1:1') + merge(left, right, on="a", validate="1:1") - result = merge(left, right, on=['a', 'b'], validate='1:1') + result = merge(left, right, on=["a", "b"], validate="1:1") assert_frame_equal(result, expected_multi) def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 - a = pd.DataFrame({'a': [], 'b': [], 'c': []}) - with np.errstate(divide='raise'): - merge(a, a, on=('a', 'b')) + a = pd.DataFrame({"a": [], "b": [], "c": []}) + with np.errstate(divide="raise"): + merge(a, a, on=("a", "b")) - @pytest.mark.parametrize('how', ['right', 'outer']) + @pytest.mark.parametrize("how", ["right", "outer"]) @pytest.mark.parametrize( - 'index,expected_index', - [(CategoricalIndex([1, 2, 4]), - CategoricalIndex([1, 2, 4, None, None, None])), - (DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']), - DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03', - pd.NaT, pd.NaT, pd.NaT])), - (Float64Index([1, 2, 3]), - Float64Index([1, 2, 3, None, None, None])), - (Int64Index([1, 2, 3]), - Float64Index([1, 2, 3, None, None, None])), - (IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), - IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4), - np.nan, np.nan, np.nan])), - (PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'), - PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03', - pd.NaT, pd.NaT, pd.NaT], freq='D')), - (TimedeltaIndex(['1d', '2d', '3d']), - TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))]) + "index,expected_index", + [ + ( + CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None]), + ), + ( + DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]), + DatetimeIndex( + ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT] + ), + ), + (Float64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + ( + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + IntervalIndex.from_tuples( + [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan] + ), + ), + ( + PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), + PeriodIndex( + ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT], + freq="D", + ), + ), + ( + TimedeltaIndex(["1d", "2d", "3d"]), + TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + ), + ], + ) def 
test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index) - df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) - result = df1.merge(df2, left_on='key', right_index=True, how=how) - expected = pd.DataFrame([[1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5]], - columns=['a', 'key', 'b']) + df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) + df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on="key", right_index=True, how=how) + expected = pd.DataFrame( + [ + [1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5], + ], + columns=["a", "key", "b"], + ) expected.set_index(expected_index, inplace=True) assert_frame_equal(result, expected) @@ -1073,294 +1306,313 @@ def test_merge_right_index_right(self): # Note: the expected output here is probably incorrect. # See https://github.com/pandas-dev/pandas/issues/17257 for more. # We include this as a regression test for GH-24897. - left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]}) - right = pd.DataFrame({'b': [1, 2, 3]}) - - expected = pd.DataFrame({'a': [1, 2, 3, None], - 'key': [0, 1, 1, 2], - 'b': [1, 2, 2, 3]}, - columns=['a', 'key', 'b'], - index=[0, 1, 2, np.nan]) - result = left.merge(right, left_on='key', right_index=True, - how='right') + left = pd.DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) + right = pd.DataFrame({"b": [1, 2, 3]}) + + expected = pd.DataFrame( + {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + columns=["a", "key", "b"], + index=[0, 1, 2, np.nan], + ) + result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 - left = pd.DataFrame({'a': [1, 2, 3], - 'key': pd.Categorical(['a', 'a', 'b'], - categories=list('abc'))}) - right = pd.DataFrame({'b': [1, 2, 3]}, - index=pd.CategoricalIndex(['a', 'b', 'c'])) - result = left.merge(right, left_on='key', - right_index=True, how='right') - expected = pd.DataFrame({'a': [1, 2, 3, None], - 'key': pd.Categorical(['a', 'a', 'b', 'c']), - 'b': [1, 1, 2, 3]}, - index=[0, 1, 2, np.nan]) - expected = expected.reindex(columns=['a', 'key', 'b']) + left = pd.DataFrame( + { + "a": [1, 2, 3], + "key": pd.Categorical(["a", "a", "b"], categories=list("abc")), + } + ) + right = pd.DataFrame( + {"b": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"]) + ) + result = left.merge(right, left_on="key", right_index=True, how="right") + expected = pd.DataFrame( + { + "a": [1, 2, 3, None], + "key": pd.Categorical(["a", "a", "b", "c"]), + "b": [1, 1, 2, 3], + }, + index=[0, 1, 2, np.nan], + ) + expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) def _check_merge(x, y): - for how in ['inner', 'left', 'outer']: + for how in ["inner", "left", "outer"]: result = x.join(y, how=how) - expected = merge(x.reset_index(), y.reset_index(), how=how, - sort=True) - expected = expected.set_index('index') + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) + expected = expected.set_index("index") # TODO check_names on merge? 
assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: - - @pytest.mark.parametrize('right_vals', [ - ['foo', 'bar'], - Series(['foo', 'bar']).astype('category'), - ]) + @pytest.mark.parametrize( + "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")] + ) def test_different(self, right_vals): - left = DataFrame({'A': ['foo', 'bar'], - 'B': Series(['foo', 'bar']).astype('category'), - 'C': [1, 2], - 'D': [1.0, 2.0], - 'E': Series([1, 2], dtype='uint64'), - 'F': Series([1, 2], dtype='int32')}) - right = DataFrame({'A': right_vals}) + left = DataFrame( + { + "A": ["foo", "bar"], + "B": Series(["foo", "bar"]).astype("category"), + "C": [1, 2], + "D": [1.0, 2.0], + "E": Series([1, 2], dtype="uint64"), + "F": Series([1, 2], dtype="int32"), + } + ) + right = DataFrame({"A": right_vals}) # GH 9780 # We allow merging on object and categorical cols and cast # categorical cols to object - result = pd.merge(left, right, on='A') + result = pd.merge(left, right, on="A") assert is_object_dtype(result.A.dtype) - @pytest.mark.parametrize('d1', [np.int64, np.int32, - np.int16, np.int8, np.uint8]) - @pytest.mark.parametrize('d2', [np.int64, np.float64, - np.float32, np.float16]) + @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8]) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes(self, d1, d2): dtype1 = np.dtype(d1) dtype2 = np.dtype(d2) - left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) + left = DataFrame( + { + "k1": np.array([0, 1, 2] * 8, dtype=dtype1), + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index) - result = left.join(right, on=['k1', 'k2']) + result = left.join(right, on=["k1", "k2"]) expected = left.copy() - if dtype2.kind == 'i': - dtype2 = np.dtype('float64') - expected['v2'] = np.array(np.nan, dtype=dtype2) - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + if dtype2.kind == "i": + dtype2 = np.dtype("float64") + expected["v2"] = np.array(np.nan, dtype=dtype2) + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result = left.join(right, on=['k1', 'k2'], sort=True) - expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) + result = left.join(right, on=["k1", "k2"], sort=True) + expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ - ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), - ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), - ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), - ]) + @pytest.mark.parametrize( + "int_vals, float_vals, exp_vals", + [ + ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 3.0], {"X": [1, 3], "Y": [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}), + ], + ) def test_merge_on_ints_floats(self, int_vals, 
float_vals, exp_vals): # GH 16572 # Check that float column is not cast to object if # merging on float and int columns - A = DataFrame({'X': int_vals}) - B = DataFrame({'Y': float_vals}) + A = DataFrame({"X": int_vals}) + B = DataFrame({"Y": float_vals}) expected = DataFrame(exp_vals) - result = A.merge(B, left_on='X', right_on='Y') + result = A.merge(B, left_on="X", right_on="Y") assert_frame_equal(result, expected) - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) def test_merge_on_ints_floats_warning(self): # GH 16572 # merge will produce a warning when merging on int and # float columns where the float values are not exactly # equal to their int representation - A = DataFrame({'X': [1, 2, 3]}) - B = DataFrame({'Y': [1.1, 2.5, 3.0]}) - expected = DataFrame({'X': [3], 'Y': [3.0]}) + A = DataFrame({"X": [1, 2, 3]}) + B = DataFrame({"Y": [1.1, 2.5, 3.0]}) + expected = DataFrame({"X": [3], "Y": [3.0]}) with tm.assert_produces_warning(UserWarning): - result = A.merge(B, left_on='X', right_on='Y') + result = A.merge(B, left_on="X", right_on="Y") assert_frame_equal(result, expected) with tm.assert_produces_warning(UserWarning): - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) # test no warning if float has NaNs - B = DataFrame({'Y': [np.nan, np.nan, 3.0]}) + B = DataFrame({"Y": [np.nan, np.nan, 3.0]}) with tm.assert_produces_warning(None): - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) def test_merge_incompat_infer_boolean_object(self): # GH21119: bool + object bool merge OK - df1 = DataFrame({'key': Series([True, False], dtype=object)}) - df2 = DataFrame({'key': [True, False]}) + df1 = DataFrame({"key": Series([True, False], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) - expected = DataFrame({'key': [True, False]}, dtype=object) - result = pd.merge(df1, df2, on='key') + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") assert_frame_equal(result, expected) - result = pd.merge(df2, df1, on='key') + result = pd.merge(df2, df1, on="key") assert_frame_equal(result, expected) # with missing value - df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) - df2 = DataFrame({'key': [True, False]}) + df1 = DataFrame({"key": Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) - expected = DataFrame({'key': [True, False]}, dtype=object) - result = pd.merge(df1, df2, on='key') + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") assert_frame_equal(result, expected) - result = pd.merge(df2, df1, on='key') + result = pd.merge(df2, df1, on="key") assert_frame_equal(result, expected) - @pytest.mark.parametrize('df1_vals, df2_vals', [ - - # merge on category coerces to object - ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), - ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), - - # no not infer - ([0, 1], pd.Series([False, True], dtype=object)), - ([0, 1], pd.Series([False, True], dtype=bool)), - ]) + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # merge on category coerces 
to object + ([0, 1, 2], Series(["a", "b", "a"]).astype("category")), + ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")), + # no not infer + ([0, 1], pd.Series([False, True], dtype=object)), + ([0, 1], pd.Series([False, True], dtype=bool)), + ], + ) def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): # these are explicitly allowed incompat merges, that pass thru # the result type is dependent on if the values on the rhs are # inferred, otherwise these will be coerced to object - df1 = DataFrame({'A': df1_vals}) - df2 = DataFrame({'A': df2_vals}) + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) - result = pd.merge(df1, df2, on=['A']) + result = pd.merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) - result = pd.merge(df2, df1, on=['A']) + result = pd.merge(df2, df1, on=["A"]) assert is_object_dtype(result.A.dtype) - @pytest.mark.parametrize('df1_vals, df2_vals', [ - # do not infer to numeric - - (Series([1, 2], dtype='uint64'), ["a", "b", "c"]), - (Series([1, 2], dtype='int32'), ["a", "b", "c"]), - ([0, 1, 2], ["0", "1", "2"]), - ([0.0, 1.0, 2.0], ["0", "1", "2"]), - ([0, 1, 2], ["0", "1", "2"]), - (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01', - '2011-01-02']), - (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]), - (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]), - (pd.date_range('20130101', periods=3), - pd.date_range('20130101', periods=3, tz='US/Eastern')), - ]) + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # do not infer to numeric + (Series([1, 2], dtype="uint64"), ["a", "b", "c"]), + (Series([1, 2], dtype="int32"), ["a", "b", "c"]), + ([0, 1, 2], ["0", "1", "2"]), + ([0.0, 1.0, 2.0], ["0", "1", "2"]), + ([0, 1, 2], ["0", "1", "2"]), + ( + pd.date_range("1/1/2011", periods=2, freq="D"), + ["2011-01-01", "2011-01-02"], + ), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]), + ( + pd.date_range("20130101", periods=3), + pd.date_range("20130101", periods=3, tz="US/Eastern"), + ), + ], + ) def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): # GH 9780, GH 15800 # Raise a ValueError when a user tries to merge on # dtypes that are incompatible (e.g., obj and int/float) - df1 = DataFrame({'A': df1_vals}) - df2 = DataFrame({'A': df2_vals}) + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=df1['A'].dtype, - rk_dtype=df2['A'].dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df1["A"].dtype, rk_dtype=df2["A"].dtype + ) + ) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, on=['A']) + pd.merge(df1, df2, on=["A"]) # Check that error still raised when swapping order of dataframes - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=df2['A'].dtype, - rk_dtype=df1['A'].dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. 
If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df2["A"].dtype, rk_dtype=df1["A"].dtype + ) + ) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): - pd.merge(df2, df1, on=['A']) + pd.merge(df2, df1, on=["A"]) @pytest.fixture def left(): np.random.seed(1234) return DataFrame( - {'X': Series(np.random.choice( - ['foo', 'bar'], - size=(10,))).astype(CDT(['foo', 'bar'])), - 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + { + "X": Series(np.random.choice(["foo", "bar"], size=(10,))).astype( + CDT(["foo", "bar"]) + ), + "Y": np.random.choice(["one", "two", "three"], size=(10,)), + } + ) @pytest.fixture def right(): np.random.seed(1234) return DataFrame( - {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])), - 'Z': [1, 2]}) + {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + ) class TestMergeCategorical: - def test_identical(self, left): # merging on the same, should preserve dtypes - merged = pd.merge(left, left, on='X') + merged = pd.merge(left, left, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - np.dtype('O')], - index=['X', 'Y_x', 'Y_y']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("O")], + index=["X", "Y_x", "Y_y"], + ) assert_series_equal(result, expected) def test_basic(self, left, right): # we have matching Categorical dtypes in X # so should preserve the merged column - merged = pd.merge(left, right, on='X') + merged = pd.merge(left, right, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - np.dtype('int64')], - index=['X', 'Y', 'Z']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("int64")], + index=["X", "Y", "Z"], + ) assert_series_equal(result, expected) def test_merge_categorical(self): # GH 9426 - right = DataFrame({'c': {0: 'a', - 1: 'b', - 2: 'c', - 3: 'd', - 4: 'e'}, - 'd': {0: 'null', - 1: 'null', - 2: 'null', - 3: 'null', - 4: 'null'}}) - left = DataFrame({'a': {0: 'f', - 1: 'f', - 2: 'f', - 3: 'f', - 4: 'f'}, - 'b': {0: 'g', - 1: 'g', - 2: 'g', - 3: 'g', - 4: 'g'}}) - df = pd.merge(left, right, how='left', left_on='b', right_on='c') + right = DataFrame( + { + "c": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}, + "d": {0: "null", 1: "null", 2: "null", 3: "null", 4: "null"}, + } + ) + left = DataFrame( + { + "a": {0: "f", 1: "f", 2: "f", 3: "f", 4: "f"}, + "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"}, + } + ) + df = pd.merge(left, right, how="left", left_on="b", right_on="c") # object-object expected = df.copy() @@ -1369,54 +1621,60 @@ def test_merge_categorical(self): # note that we propagate the category # because we don't have any matching rows cright = right.copy() - cright['d'] = cright['d'].astype('category') - result = pd.merge(left, cright, how='left', left_on='b', right_on='c') - expected['d'] = expected['d'].astype(CategoricalDtype(['null'])) + cright["d"] = cright["d"].astype("category") + result = pd.merge(left, cright, how="left", left_on="b", right_on="c") + expected["d"] = expected["d"].astype(CategoricalDtype(["null"])) tm.assert_frame_equal(result, expected) # cat-object cleft = left.copy() - cleft['b'] = cleft['b'].astype('category') - result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") tm.assert_frame_equal(result, expected) # cat-cat cright = right.copy() - cright['d'] = 
cright['d'].astype('category') + cright["d"] = cright["d"].astype("category") cleft = left.copy() - cleft['b'] = cleft['b'].astype('category') - result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") tm.assert_frame_equal(result, expected) def tests_merge_categorical_unordered_equal(self): # GH-19551 - df1 = DataFrame({ - 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']), - 'Left': ['A0', 'B0', 'C0'], - }) - - df2 = DataFrame({ - 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']), - 'Right': ['C1', 'B1', 'A1'], - }) - result = pd.merge(df1, df2, on=['Foo']) - expected = DataFrame({ - 'Foo': pd.Categorical(['A', 'B', 'C']), - 'Left': ['A0', 'B0', 'C0'], - 'Right': ['A1', 'B1', 'C1'], - }) + df1 = DataFrame( + { + "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + } + ) + + df2 = DataFrame( + { + "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]), + "Right": ["C1", "B1", "A1"], + } + ) + result = pd.merge(df1, df2, on=["Foo"]) + expected = DataFrame( + { + "Foo": pd.Categorical(["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + "Right": ["A1", "B1", "C1"], + } + ) assert_frame_equal(result, expected) def test_other_columns(self, left, right): # non-merge columns should preserve if possible - right = right.assign(Z=right.Z.astype('category')) + right = right.assign(Z=right.Z.astype("category")) - merged = pd.merge(left, right, on='X') + merged = pd.merge(left, right, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - CategoricalDtype()], - index=['X', 'Y', 'Z']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), CategoricalDtype()], + index=["X", "Y", "Z"], + ) assert_series_equal(result, expected) # categories are preserved @@ -1424,44 +1682,64 @@ def test_other_columns(self, left, right): assert right.Z.values.is_dtype_equal(merged.Z.values) @pytest.mark.parametrize( - 'change', [lambda x: x, - lambda x: x.astype(CDT(['foo', 'bar', 'bah'])), - lambda x: x.astype(CDT(ordered=True))]) + "change", + [ + lambda x: x, + lambda x: x.astype(CDT(["foo", "bar", "bah"])), + lambda x: x.astype(CDT(ordered=True)), + ], + ) def test_dtype_on_merged_different(self, change, join_type, left, right): # our merging columns, X now has 2 different dtypes # so we must be object as a result - X = change(right.X.astype('object')) + X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values) # assert not left.X.values.is_dtype_equal(right.X.values) - merged = pd.merge(left, right, on='X', how=join_type) + merged = pd.merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series([np.dtype('O'), - np.dtype('O'), - np.dtype('int64')], - index=['X', 'Y', 'Z']) + expected = Series( + [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] + ) assert_series_equal(result, expected) def test_self_join_multiple_categories(self): # GH 16767 # non-duplicates should work with multiple categories m = 5 - df = pd.DataFrame({ - 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m, - 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m, - 'c': [letter - for each in ['m', 'n', 'u', 'p', 'o'] - for letter in [each] * 2 * m], - 'd': [letter - for each in ['aa', 'bb', 'cc', 'dd', 'ee', - 'ff', 'gg', 'hh', 'ii', 'jj'] - for letter in [each] * m]}) + df = pd.DataFrame( 
+ { + "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m, + "b": ["t", "w", "x", "y", "z"] * 2 * m, + "c": [ + letter + for each in ["m", "n", "u", "p", "o"] + for letter in [each] * 2 * m + ], + "d": [ + letter + for each in [ + "aa", + "bb", + "cc", + "dd", + "ee", + "ff", + "gg", + "hh", + "ii", + "jj", + ] + for letter in [each] * m + ], + } + ) # change them all to categorical variables - df = df.apply(lambda x: x.astype('category')) + df = df.apply(lambda x: x.astype("category")) # self-join should equal ourselves result = pd.merge(df, df, on=list(df.columns)) @@ -1473,192 +1751,237 @@ def test_dtype_on_categorical_dates(self): # dates should not be coerced to ints df = pd.DataFrame( - [[date(2001, 1, 1), 1.1], - [date(2001, 1, 2), 1.3]], - columns=['date', 'num2'] + [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"] ) - df['date'] = df['date'].astype('category') + df["date"] = df["date"].astype("category") df2 = pd.DataFrame( - [[date(2001, 1, 1), 1.3], - [date(2001, 1, 3), 1.4]], - columns=['date', 'num4'] + [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"] ) - df2['date'] = df2['date'].astype('category') - - expected_outer = pd.DataFrame([ - [pd.Timestamp('2001-01-01'), 1.1, 1.3], - [pd.Timestamp('2001-01-02'), 1.3, np.nan], - [pd.Timestamp('2001-01-03'), np.nan, 1.4]], - columns=['date', 'num2', 'num4'] + df2["date"] = df2["date"].astype("category") + + expected_outer = pd.DataFrame( + [ + [pd.Timestamp("2001-01-01"), 1.1, 1.3], + [pd.Timestamp("2001-01-02"), 1.3, np.nan], + [pd.Timestamp("2001-01-03"), np.nan, 1.4], + ], + columns=["date", "num2", "num4"], ) - result_outer = pd.merge(df, df2, how='outer', on=['date']) + result_outer = pd.merge(df, df2, how="outer", on=["date"]) assert_frame_equal(result_outer, expected_outer) expected_inner = pd.DataFrame( - [[pd.Timestamp('2001-01-01'), 1.1, 1.3]], - columns=['date', 'num2', 'num4'] + [[pd.Timestamp("2001-01-01"), 1.1, 1.3]], columns=["date", "num2", "num4"] ) - result_inner = pd.merge(df, df2, how='inner', on=['date']) + result_inner = pd.merge(df, df2, how="inner", on=["date"]) assert_frame_equal(result_inner, expected_inner) - @pytest.mark.parametrize('ordered', [True, False]) - @pytest.mark.parametrize('category_column,categories,expected_categories', - [([False, True, True, False], [True, False], - [True, False]), - ([2, 1, 1, 2], [1, 2], [1, 2]), - (['False', 'True', 'True', 'False'], - ['True', 'False'], ['True', 'False'])]) - def test_merging_with_bool_or_int_cateorical_column(self, category_column, - categories, - expected_categories, - ordered): + @pytest.mark.parametrize("ordered", [True, False]) + @pytest.mark.parametrize( + "category_column,categories,expected_categories", + [ + ([False, True, True, False], [True, False], [True, False]), + ([2, 1, 1, 2], [1, 2], [1, 2]), + (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]), + ], + ) + def test_merging_with_bool_or_int_cateorical_column( + self, category_column, categories, expected_categories, ordered + ): # GH 17187 # merging with a boolean/int categorical column - df1 = pd.DataFrame({'id': [1, 2, 3, 4], - 'cat': category_column}) - df1['cat'] = df1['cat'].astype(CDT(categories, ordered=ordered)) - df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) + df1 = pd.DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) + df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df2 = pd.DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) - expected = 
pd.DataFrame({'id': [2, 4], 'cat': expected_categories, - 'num': [1, 9]}) - expected['cat'] = expected['cat'].astype( - CDT(categories, ordered=ordered)) + expected = pd.DataFrame( + {"id": [2, 4], "cat": expected_categories, "num": [1, 9]} + ) + expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) def test_merge_on_int_array(self): # GH 23020 - df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': 1}) - result = pd.merge(df, df, on='A') - expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B_x': 1, - 'B_y': 1}) + df = pd.DataFrame({"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B": 1}) + result = pd.merge(df, df, on="A") + expected = pd.DataFrame( + {"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1} + ) assert_frame_equal(result, expected) @pytest.fixture def left_df(): - return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture def right_df(): - return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) class TestMergeOnIndexes: - @pytest.mark.parametrize( "how, sort, expected", - [('inner', False, DataFrame({'a': [20, 10], - 'b': [200, 100]}, - index=[2, 1])), - ('inner', True, DataFrame({'a': [10, 20], - 'b': [100, 200]}, - index=[1, 2])), - ('left', False, DataFrame({'a': [20, 10, 0], - 'b': [200, 100, np.nan]}, - index=[2, 1, 0])), - ('left', True, DataFrame({'a': [0, 10, 20], - 'b': [np.nan, 100, 200]}, - index=[0, 1, 2])), - ('right', False, DataFrame({'a': [np.nan, 10, 20], - 'b': [300, 100, 200]}, - index=[3, 1, 2])), - ('right', True, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, - index=[1, 2, 3])), - ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3])), - ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]))]) + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame( + {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2] + ), + ), + ( + "right", + True, + DataFrame( + {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3] + ), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], + ) def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): - result = pd.merge(left_df, right_df, - left_index=True, - right_index=True, - how=how, - sort=sort) + result = pd.merge( + left_df, right_df, left_index=True, right_index=True, how=how, sort=sort + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - 'index', [ - CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'), - Float64Index([1.0, 2.0], name='index_col'), - Int64Index([1, 2], name='index_col'), - UInt64Index([1, 2], name='index_col'), - RangeIndex(start=0, stop=2, name='index_col'), - DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'), - ], 
ids=lambda x: type(x).__name__) + "index", + [ + CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"), + Float64Index([1.0, 2.0], name="index_col"), + Int64Index([1, 2], name="index_col"), + UInt64Index([1, 2], name="index_col"), + RangeIndex(start=0, stop=2, name="index_col"), + DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"), + ], + ids=lambda x: type(x).__name__, +) def test_merge_index_types(index): # gh-20777 # assert key access is consistent across index types left = DataFrame({"left_data": [1, 2]}, index=index) right = DataFrame({"right_data": [1.0, 2.0]}, index=index) - result = left.merge(right, on=['index_col']) + result = left.merge(right, on=["index_col"]) expected = DataFrame( - OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]), - index=index) + OrderedDict([("left_data", [1, 2]), ("right_data", [1.0, 2.0])]), index=index + ) assert_frame_equal(result, expected) -@pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nm", [ - (['outer', 'inner'], None, None, False, False, 'B'), - (None, None, None, True, True, 'B'), - (None, ['outer', 'inner'], None, False, True, 'B'), - (None, None, ['outer', 'inner'], True, False, 'B'), - (['outer', 'inner'], None, None, False, False, None), - (None, None, None, True, True, None), - (None, ['outer', 'inner'], None, False, True, None), - (None, None, ['outer', 'inner'], True, False, None)]) +@pytest.mark.parametrize( + "on,left_on,right_on,left_index,right_index,nm", + [ + (["outer", "inner"], None, None, False, False, "B"), + (None, None, None, True, True, "B"), + (None, ["outer", "inner"], None, False, True, "B"), + (None, None, ["outer", "inner"], True, False, "B"), + (["outer", "inner"], None, None, False, False, None), + (None, None, None, True, True, None), + (None, ["outer", "inner"], None, False, True, None), + (None, None, ["outer", "inner"], True, False, None), + ], +) def test_merge_series(on, left_on, right_on, left_index, right_index, nm): # GH 21220 - a = pd.DataFrame({"A": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]], - names=['outer', 'inner'])) - b = pd.Series([1, 2, 3, 4], - index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]], - names=['outer', 'inner']), name=nm) - expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]}, - index=pd.MultiIndex.from_product([['a', 'b'], [1]], - names=['outer', 'inner'])) + a = pd.DataFrame( + {"A": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product( + [["a", "b"], [0, 1]], names=["outer", "inner"] + ), + ) + b = pd.Series( + [1, 2, 3, 4], + index=pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=["outer", "inner"] + ), + name=nm, + ) + expected = pd.DataFrame( + {"A": [2, 4], "B": [1, 3]}, + index=pd.MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]), + ) if nm is not None: - result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index) + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) tm.assert_frame_equal(result, expected) else: msg = "Cannot merge a Series without a name" with pytest.raises(ValueError, match=msg): - result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index) - - -@pytest.mark.parametrize("col1, col2, kwargs, expected_cols", [ - (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), - (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), - (0, 0, dict(suffixes=("_x", "_y")), 
["0_x", "0_y"]), - ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), - (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), - ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), - ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), - ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), - ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), - (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), - ("a", "a", dict(), ["a_x", "a_y"]), - (0, 0, dict(), ["0_x", "0_y"]) -]) + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + + +@pytest.mark.parametrize( + "col1, col2, kwargs, expected_cols", + [ + (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), + (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), + (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), + (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), + ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), + ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), + ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), + ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), + (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(), ["a_x", "a_y"]), + (0, 0, dict(), ["0_x", "0_y"]), + ], +) def test_merge_suffix(col1, col2, kwargs, expected_cols): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [4, 5, 6]}) - expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], - columns=expected_cols) + expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) result = a.merge(b, left_index=True, right_index=True, **kwargs) tm.assert_frame_equal(result, expected) @@ -1667,13 +1990,16 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("col1, col2, suffixes", [ - ("a", "a", [None, None]), - ("a", "a", (None, None)), - ("a", "a", ("", None)), - (0, 0, [None, None]), - (0, 0, (None, "")) -]) +@pytest.mark.parametrize( + "col1, col2, suffixes", + [ + ("a", "a", [None, None]), + ("a", "a", (None, None)), + ("a", "a", ("", None)), + (0, 0, [None, None]), + (0, 0, (None, "")), + ], +) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) @@ -1685,10 +2011,7 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize("col1, col2, suffixes", [ - ("a", "a", None), - (0, 0, None) -]) +@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) def test_merge_suffix_none_error(col1, col2, suffixes): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) @@ -1709,10 +2032,9 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False), } - df1 = DataFrame({ - "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), - "left": [1, 2, 3], - }).set_index("foo") + df1 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), "left": [1, 2, 3]} + ).set_index("foo") data_foo = ["a", "b", "c"] data_right = [1, 2, 3] @@ -1721,18 +2043,19 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): data_foo.reverse() data_right.reverse() - df2 = DataFrame({ - "foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), - "right": data_right - }).set_index("foo") + df2 = DataFrame( + {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right} + ).set_index("foo") result = 
df1.merge(df2, left_index=True, right_index=True) - expected = DataFrame({ - "left": [1, 2, 3], - "right": [1, 2, 3], - "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), - }).set_index("foo") + expected = DataFrame( + { + "left": [1, 2, 3], + "right": [1, 2, 3], + "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), + } + ).set_index("foo") # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) @@ -1743,23 +2066,19 @@ def test_merge_equal_cat_dtypes2(): cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False) # Test Data - df1 = DataFrame({ - "foo": Series(["a", "b"]).astype(cat_dtype), - "left": [1, 2], - }).set_index("foo") + df1 = DataFrame( + {"foo": Series(["a", "b"]).astype(cat_dtype), "left": [1, 2]} + ).set_index("foo") - df2 = DataFrame({ - "foo": Series(["a", "b", "c"]).astype(cat_dtype), - "right": [3, 2, 1], - }).set_index("foo") + df2 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtype), "right": [3, 2, 1]} + ).set_index("foo") result = df1.merge(df2, left_index=True, right_index=True) - expected = DataFrame({ - "left": [1, 2], - "right": [3, 2], - "foo": Series(["a", "b"]).astype(cat_dtype), - }).set_index("foo") + expected = DataFrame( + {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} + ).set_index("foo") # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 684fba5867c007..e2e17397464fe7 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -9,143 +9,154 @@ class TestAsOfMerge: - def read_data(self, datapath, name, dedupe=False): - path = datapath('reshape', 'merge', 'data', name) + path = datapath("reshape", "merge", "data", name) x = read_csv(path) if dedupe: - x = (x.drop_duplicates(['time', 'ticker'], keep='last') - .reset_index(drop=True) - ) + x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index( + drop=True + ) x.time = to_datetime(x.time) return x @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.trades = self.read_data(datapath, 'trades.csv') - self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) - self.asof = self.read_data(datapath, 'asof.csv') - self.tolerance = self.read_data(datapath, 'tolerance.csv') - self.allow_exact_matches = self.read_data(datapath, - 'allow_exact_matches.csv') + self.trades = self.read_data(datapath, "trades.csv") + self.quotes = self.read_data(datapath, "quotes.csv", dedupe=True) + self.asof = self.read_data(datapath, "asof.csv") + self.tolerance = self.read_data(datapath, "tolerance.csv") + self.allow_exact_matches = self.read_data(datapath, "allow_exact_matches.csv") self.allow_exact_matches_and_tolerance = self.read_data( - datapath, 'allow_exact_matches_and_tolerance.csv') + datapath, "allow_exact_matches_and_tolerance.csv" + ) def test_examples1(self): """ doc-string examples """ - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 3, 7]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], 
"left_val": ["a", "b", "c"], "right_val": [1, 3, 7]} + ) - result = pd.merge_asof(left, right, on='a') + result = pd.merge_asof(left, right, on="a") assert_frame_equal(result, expected) def test_examples2(self): """ doc-string examples """ - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) - - pd.merge_asof(trades, quotes, - on='time', - by='ticker') - - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('2ms')) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.97, np.nan, - np.nan, np.nan], - 'ask': [np.nan, 51.98, np.nan, - np.nan, np.nan]}, - columns=['time', 'ticker', 'price', 'quantity', - 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": [ + "GOOG", + "MSFT", + "MSFT", + "MSFT", + "GOOG", + "AAPL", + "GOOG", + "MSFT", + ], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) + + pd.merge_asof(trades, quotes, on="time", by="ticker") + + pd.merge_asof( + trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.97, np.nan, np.nan, np.nan], + "ask": [np.nan, 51.98, np.nan, np.nan, 
np.nan], + }, + columns=["time", "ticker", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) assert_frame_equal(result, expected) def test_examples3(self): """ doc-string examples """ # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 6, np.nan]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, np.nan]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward') + result = pd.merge_asof(left, right, on="a", direction="forward") assert_frame_equal(result, expected) def test_examples4(self): """ doc-string examples """ # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 6, 7]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, 7]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest') + result = pd.merge_asof(left, right, on="a", direction="nearest") assert_frame_equal(result, expected) def test_basic(self): @@ -154,36 +165,31 @@ def test_basic(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_categorical(self): expected = self.asof trades = self.trades.copy() - trades.ticker = trades.ticker.astype('category') + trades.ticker = trades.ticker.astype("category") quotes = self.quotes.copy() - quotes.ticker = quotes.ticker.astype('category') - expected.ticker = expected.ticker.astype('category') + quotes.ticker = quotes.ticker.astype("category") + expected.ticker = expected.ticker.astype("category") - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_left_index(self): # GH14253 expected = self.asof - trades = self.trades.set_index('time') + trades = self.trades.set_index("time") quotes = self.quotes - result = merge_asof(trades, quotes, - left_index=True, - right_on='time', - by='ticker') + result = merge_asof( + trades, quotes, left_index=True, right_on="time", by="ticker" + ) # left-only index uses right's index, oddly expected.index = result.index # time column appears after left's columns @@ -194,61 +200,53 @@ def test_basic_right_index(self): expected = self.asof trades = self.trades - quotes = self.quotes.set_index('time') + quotes = self.quotes.set_index("time") - result = merge_asof(trades, quotes, - left_on='time', - right_index=True, - by='ticker') + result = merge_asof( + trades, quotes, left_on="time", right_index=True, by="ticker" + ) assert_frame_equal(result, expected) def test_basic_left_index_right_index(self): - expected = 
self.asof.set_index('time') - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + expected = self.asof.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") - result = merge_asof(trades, quotes, - left_index=True, - right_index=True, - by='ticker') + result = merge_asof( + trades, quotes, left_index=True, right_index=True, by="ticker" + ) assert_frame_equal(result, expected) def test_multi_index(self): # MultiIndex is prohibited - trades = self.trades.set_index(['time', 'price']) - quotes = self.quotes.set_index('time') + trades = self.trades.set_index(["time", "price"]) + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_index=True, - right_index=True) + merge_asof(trades, quotes, left_index=True, right_index=True) - trades = self.trades.set_index('time') - quotes = self.quotes.set_index(['time', 'bid']) + trades = self.trades.set_index("time") + quotes = self.quotes.set_index(["time", "bid"]) with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_index=True, - right_index=True) + merge_asof(trades, quotes, left_index=True, right_index=True) def test_on_and_index(self): # 'on' parameter and index together is prohibited - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_on='price', - left_index=True, - right_index=True) + merge_asof( + trades, quotes, left_on="price", left_index=True, right_index=True + ) - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - right_on='bid', - left_index=True, - right_index=True) + merge_asof( + trades, quotes, right_on="bid", left_index=True, right_index=True + ) def test_basic_left_by_right_by(self): @@ -257,10 +255,9 @@ def test_basic_left_by_right_by(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - left_by='ticker', - right_by='ticker') + result = merge_asof( + trades, quotes, on="time", left_by="ticker", right_by="ticker" + ) assert_frame_equal(result, expected) def test_missing_right_by(self): @@ -269,181 +266,211 @@ def test_missing_right_by(self): trades = self.trades quotes = self.quotes - q = quotes[quotes.ticker != 'MSFT'] - result = merge_asof(trades, q, - on='time', - by='ticker') - expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan + q = quotes[quotes.ticker != "MSFT"] + result = merge_asof(trades, q, on="time", by="ticker") + expected.loc[expected.ticker == "MSFT", ["bid", "ask"]] = np.nan assert_frame_equal(result, expected) def test_multiby(self): # GH13936 - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.045', - 
'20160525 13:30:00.049']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL'], - 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', - 'NSDQ', 'ARCA'], - 'bid': [720.51, 51.95, 51.97, 51.99, - 720.50, 97.99], - 'ask': [720.92, 51.96, 51.98, 52.00, - 720.93, 98.01]}, - columns=['time', 'ticker', 'exch', 'bid', 'ask']) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], - 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity', 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, on='time', - by=['ticker', 'exch']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL"], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) assert_frame_equal(result, expected) def test_multiby_heterogeneous_types(self): # GH13936 - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': [0, 0, 1, 1, 2], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.045', - '20160525 13:30:00.049']), - 'ticker': [1, 0, 0, 0, 1, 2], - 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', - 'NSDQ', 'ARCA'], - 'bid': [720.51, 51.95, 51.97, 51.99, - 720.50, 97.99], - 'ask': [720.92, 51.96, 51.98, 52.00, - 720.93, 
98.01]}, - columns=['time', 'ticker', 'exch', 'bid', 'ask']) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': [0, 0, 1, 1, 2], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], - 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity', 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, on='time', - by=['ticker', 'exch']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": [1, 0, 0, 0, 1, 2], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) assert_frame_equal(result, expected) def test_multiby_indexed(self): # GH15676 - left = pd.DataFrame([ - [pd.to_datetime('20160602'), 1, 'a'], - [pd.to_datetime('20160602'), 2, 'a'], - [pd.to_datetime('20160603'), 1, 'b'], - [pd.to_datetime('20160603'), 2, 'b']], - columns=['time', 'k1', 'k2']).set_index('time') - - right = pd.DataFrame([ - [pd.to_datetime('20160502'), 1, 'a', 1.0], - [pd.to_datetime('20160502'), 2, 'a', 2.0], - [pd.to_datetime('20160503'), 1, 'b', 3.0], - [pd.to_datetime('20160503'), 2, 'b', 4.0]], - columns=['time', 'k1', 'k2', 'value']).set_index('time') - - expected = pd.DataFrame([ - [pd.to_datetime('20160602'), 1, 'a', 1.0], - [pd.to_datetime('20160602'), 2, 'a', 2.0], - [pd.to_datetime('20160603'), 1, 'b', 3.0], - [pd.to_datetime('20160603'), 2, 'b', 4.0]], - columns=['time', 'k1', 'k2', 'value']).set_index('time') - - result = pd.merge_asof(left, - right, - left_index=True, - right_index=True, - by=['k1', 'k2']) + left = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a"], + [pd.to_datetime("20160602"), 2, "a"], + [pd.to_datetime("20160603"), 1, "b"], + [pd.to_datetime("20160603"), 2, "b"], + ], + columns=["time", "k1", "k2"], + ).set_index("time") + + right = pd.DataFrame( + [ + [pd.to_datetime("20160502"), 1, "a", 1.0], + 
[pd.to_datetime("20160502"), 2, "a", 2.0], + [pd.to_datetime("20160503"), 1, "b", 3.0], + [pd.to_datetime("20160503"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + expected = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a", 1.0], + [pd.to_datetime("20160602"), 2, "a", 2.0], + [pd.to_datetime("20160603"), 1, "b", 3.0], + [pd.to_datetime("20160603"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + result = pd.merge_asof( + left, right, left_index=True, right_index=True, by=["k1", "k2"] + ) assert_frame_equal(expected, result) with pytest.raises(MergeError): - pd.merge_asof(left, right, left_index=True, right_index=True, - left_by=['k1', 'k2'], right_by=['k1']) + pd.merge_asof( + left, + right, + left_index=True, + right_index=True, + left_by=["k1", "k2"], + right_by=["k1"], + ) def test_basic2(self, datapath): - expected = self.read_data(datapath, 'asof2.csv') - trades = self.read_data(datapath, 'trades2.csv') - quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True) + expected = self.read_data(datapath, "asof2.csv") + trades = self.read_data(datapath, "trades2.csv") + quotes = self.read_data(datapath, "quotes2.csv", dedupe=True) - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_no_by(self): - f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \ + f = ( + lambda x: x[x.ticker == "MSFT"] + .drop("ticker", axis=1) .reset_index(drop=True) + ) # just use a single ticker expected = f(self.asof) trades = f(self.trades) quotes = f(self.quotes) - result = merge_asof(trades, quotes, - on='time') + result = merge_asof(trades, quotes, on="time") assert_frame_equal(result, expected) def test_valid_join_keys(self): @@ -452,40 +479,33 @@ def test_valid_join_keys(self): quotes = self.quotes with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_on='time', - right_on='bid', - by='ticker') + merge_asof(trades, quotes, left_on="time", right_on="bid", by="ticker") with pytest.raises(MergeError): - merge_asof(trades, quotes, - on=['time', 'ticker'], - by='ticker') + merge_asof(trades, quotes, on=["time", "ticker"], by="ticker") with pytest.raises(MergeError): - merge_asof(trades, quotes, - by='ticker') + merge_asof(trades, quotes, by="ticker") def test_with_duplicates(self, datapath): - q = pd.concat([self.quotes, self.quotes]).sort_values( - ['time', 'ticker']).reset_index(drop=True) - result = merge_asof(self.trades, q, - on='time', - by='ticker') - expected = self.read_data(datapath, 'asof.csv') + q = ( + pd.concat([self.quotes, self.quotes]) + .sort_values(["time", "ticker"]) + .reset_index(drop=True) + ) + result = merge_asof(self.trades, q, on="time", by="ticker") + expected = self.read_data(datapath, "asof.csv") assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): - df1 = pd.DataFrame({'key': [1, 1, 3], - 'left_val': [1, 2, 3]}) - df2 = pd.DataFrame({'key': [1, 2, 2], - 'right_val': [1, 2, 3]}) - result = merge_asof(df1, df2, on='key') - expected = pd.DataFrame({'key': [1, 1, 3], - 'left_val': [1, 2, 3], - 'right_val': [1, 1, 3]}) + df1 = pd.DataFrame({"key": [1, 1, 3], "left_val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": [1, 2, 2], "right_val": [1, 2, 3]}) + result = merge_asof(df1, df2, on="key") + expected = pd.DataFrame( + {"key": [1, 1, 3], "left_val": [1, 2, 3], "right_val": [1, 1, 3]} + ) assert_frame_equal(result, expected) def 
test_valid_allow_exact_matches(self): @@ -494,10 +514,9 @@ def test_valid_allow_exact_matches(self): quotes = self.quotes with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - allow_exact_matches='foo') + merge_asof( + trades, quotes, on="time", by="ticker", allow_exact_matches="foo" + ) def test_valid_tolerance(self): @@ -505,403 +524,507 @@ def test_valid_tolerance(self): quotes = self.quotes # dti - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=Timedelta('1s')) + merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("1s")) # integer - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=1) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1, + ) # incompat with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=1) + merge_asof(trades, quotes, on="time", by="ticker", tolerance=1) # invalid with pytest.raises(MergeError): - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=1.0) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1.0, + ) # invalid negative with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=-Timedelta('1s')) + merge_asof( + trades, quotes, on="time", by="ticker", tolerance=-Timedelta("1s") + ) with pytest.raises(MergeError): - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=-1) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=-1, + ) def test_non_sorted(self): - trades = self.trades.sort_values('time', ascending=False) - quotes = self.quotes.sort_values('time', ascending=False) + trades = self.trades.sort_values("time", ascending=False) + quotes = self.quotes.sort_values("time", ascending=False) # we require that we are already sorted on time & quotes assert not trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): - merge_asof(trades, quotes, - on='time', - by='ticker') + merge_asof(trades, quotes, on="time", by="ticker") - trades = self.trades.sort_values('time') + trades = self.trades.sort_values("time") assert trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): - merge_asof(trades, quotes, - on='time', - by='ticker') + merge_asof(trades, quotes, on="time", by="ticker") - quotes = self.quotes.sort_values('time') + quotes = self.quotes.sort_values("time") assert trades.time.is_monotonic assert quotes.time.is_monotonic # ok, though has dupes - merge_asof(trades, self.quotes, - on='time', - by='ticker') + merge_asof(trades, self.quotes, on="time", by="ticker") def test_tolerance(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=Timedelta('1day')) + result = merge_asof( + trades, quotes, on="time", by="ticker", tolerance=Timedelta("1day") + ) expected = self.tolerance assert_frame_equal(result, expected) def test_tolerance_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = 
pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, np.nan, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward', - tolerance=1) + result = pd.merge_asof(left, right, on="a", direction="forward", tolerance=1) assert_frame_equal(result, expected) def test_tolerance_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, np.nan, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - tolerance=1) + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=1) assert_frame_equal(result, expected) def test_tolerance_tz(self): # GH 14844 left = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value1': np.arange(5)}) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + } + ) right = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-01'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value2': list("ABCDE")}) - result = pd.merge_asof(left, right, on='date', - tolerance=pd.Timedelta('1 day')) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-01"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value2": list("ABCDE"), + } + ) + result = pd.merge_asof(left, right, on="date", tolerance=pd.Timedelta("1 day")) expected = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value1': np.arange(5), - 'value2': list("BCDEE")}) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + "value2": list("BCDEE"), + } + ) assert_frame_equal(result, expected) def test_tolerance_float(self): # GH22981 - left = pd.DataFrame({'a': [1.1, 3.5, 10.9], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1.0, 2.5, 3.3, 7.5, 11.5], - 'right_val': [1.0, 2.5, 3.3, 7.5, 11.5]}) - - expected = pd.DataFrame({'a': [1.1, 3.5, 10.9], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 3.3, np.nan]}) + left = pd.DataFrame({"a": [1.1, 3.5, 10.9], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame( + {"a": [1.0, 2.5, 3.3, 7.5, 11.5], "right_val": [1.0, 2.5, 3.3, 7.5, 11.5]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - tolerance=0.5) + expected = pd.DataFrame( + { + "a": [1.1, 3.5, 10.9], + "left_val": ["a", "b", "c"], + "right_val": [1, 3.3, np.nan], + } + ) + + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=0.5) assert_frame_equal(result, expected) def test_index_tolerance(self): # GH 15135 - expected = self.tolerance.set_index('time') - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') - - result = pd.merge_asof(trades, quotes, - left_index=True, - right_index=True, - by='ticker', - 
tolerance=pd.Timedelta('1day')) + expected = self.tolerance.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + + result = pd.merge_asof( + trades, + quotes, + left_index=True, + right_index=True, + by="ticker", + tolerance=pd.Timedelta("1day"), + ) assert_frame_equal(result, expected) def test_allow_exact_matches(self): - result = merge_asof(self.trades, self.quotes, - on='time', - by='ticker', - allow_exact_matches=False) + result = merge_asof( + self.trades, self.quotes, on="time", by="ticker", allow_exact_matches=False + ) expected = self.allow_exact_matches assert_frame_equal(result, expected) def test_allow_exact_matches_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [2, 7, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 7, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward', - allow_exact_matches=False) + result = pd.merge_asof( + left, right, on="a", direction="forward", allow_exact_matches=False + ) assert_frame_equal(result, expected) def test_allow_exact_matches_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [2, 3, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 3, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - allow_exact_matches=False) + result = pd.merge_asof( + left, right, on="a", direction="nearest", allow_exact_matches=False + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance(self): - result = merge_asof(self.trades, self.quotes, - on='time', - by='ticker', - tolerance=Timedelta('100ms'), - allow_exact_matches=False) + result = merge_asof( + self.trades, + self.quotes, + on="time", + by="ticker", + tolerance=Timedelta("100ms"), + allow_exact_matches=False, + ) expected = self.allow_exact_matches_and_tolerance assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance2(self): # GH 13695 - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob']}) - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.000', - '2016-07-15 13:30:00.030']), - 'version': [1, 2]}) - - result = pd.merge_asof(df1, df2, on='time') - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [2]}) + df1 = pd.DataFrame( + {"time": pd.to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]} + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof(df1, df2, on="time") + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": 
[2], + } + ) assert_frame_equal(result, expected) - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [1]}) + result = pd.merge_asof(df1, df2, on="time", allow_exact_matches=False) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [1], + } + ) assert_frame_equal(result, expected) - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, - tolerance=pd.Timedelta('10ms')) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [np.nan]}) + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [np.nan], + } + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance3(self): # GH 13709 - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030', - '2016-07-15 13:30:00.030']), - 'username': ['bob', 'charlie']}) - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.000', - '2016-07-15 13:30:00.030']), - 'version': [1, 2]}) - - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, - tolerance=pd.Timedelta('10ms')) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030', - '2016-07-15 13:30:00.030']), - 'username': ['bob', 'charlie'], - 'version': [np.nan, np.nan]}) + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + } + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + "version": [np.nan, np.nan], + } + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 3, 4, 6, 11], - 'right_val': [1, 3, 4, 6, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 6, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [np.nan, 6, 11]}) - - result = pd.merge_asof(left, right, on='a', direction='forward', - allow_exact_matches=False, tolerance=1) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 6, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="forward", + allow_exact_matches=False, + tolerance=1, + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 3, 4, 6, 11], - 'right_val': [1, 3, 4, 7, 11]}) - - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [np.nan, 4, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", 
"c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 7, 11]}) - result = pd.merge_asof(left, right, on='a', direction='nearest', - allow_exact_matches=False, tolerance=1) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 4, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="nearest", + allow_exact_matches=False, + tolerance=1, + ) assert_frame_equal(result, expected) def test_forward_by(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Y', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e']}) - right = pd.DataFrame({'a': [1, 6, 11, 15, 16], - 'b': ['X', 'Z', 'Y', 'Z', 'Y'], - 'right_val': [1, 6, 11, 15, 16]}) - - expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Y', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e'], - 'right_val': [1, np.nan, 11, 15, 16]}) + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Y", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) - result = pd.merge_asof(left, right, on='a', by='b', - direction='forward') + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, np.nan, 11, 15, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="forward") assert_frame_equal(result, expected) def test_nearest_by(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Z', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e']}) - right = pd.DataFrame({'a': [1, 6, 11, 15, 16], - 'b': ['X', 'Z', 'Z', 'Z', 'Y'], - 'right_val': [1, 6, 11, 15, 16]}) - - expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Z', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e'], - 'right_val': [1, 1, 11, 11, 16]}) + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Z", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) - result = pd.merge_asof(left, right, on='a', by='b', - direction='nearest') + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, 1, 11, 11, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="nearest") assert_frame_equal(result, expected) def test_by_int(self): # we specialize by type, so test that this is correct - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.020', - '20160525 13:30:00.030', - '20160525 13:30:00.040', - '20160525 13:30:00.050', - '20160525 13:30:00.060']), - 'key': [1, 2, 1, 3, 2], - 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]}, - columns=['time', 'key', 'value1']) - - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.015', - '20160525 13:30:00.020', - '20160525 13:30:00.025', - '20160525 13:30:00.035', - '20160525 13:30:00.040', - '20160525 13:30:00.055', - '20160525 13:30:00.060', - '20160525 13:30:00.065']), - 'key': [2, 1, 1, 3, 2, 1, 2, 3], - 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]}, - columns=['time', 'key', 'value2']) - - result = pd.merge_asof(df1, df2, on='time', by='key') - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.020', - '20160525 
13:30:00.030', - '20160525 13:30:00.040', - '20160525 13:30:00.050', - '20160525 13:30:00.060']), - 'key': [1, 2, 1, 3, 2], - 'value1': [1.1, 1.2, 1.3, 1.4, 1.5], - 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]}, - columns=['time', 'key', 'value1', 'value2']) + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + }, + columns=["time", "key", "value1"], + ) + + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.015", + "20160525 13:30:00.020", + "20160525 13:30:00.025", + "20160525 13:30:00.035", + "20160525 13:30:00.040", + "20160525 13:30:00.055", + "20160525 13:30:00.060", + "20160525 13:30:00.065", + ] + ), + "key": [2, 1, 1, 3, 2, 1, 2, 3], + "value2": [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8], + }, + columns=["time", "key", "value2"], + ) + + result = pd.merge_asof(df1, df2, on="time", by="key") + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + "value2": [2.2, 2.1, 2.3, 2.4, 2.7], + }, + columns=["time", "key", "value1", "value2"], + ) assert_frame_equal(result, expected) def test_on_float(self): # mimics how to determine the minimum-price variation - df1 = pd.DataFrame({ - 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], - 'symbol': list("ABCDEFG")}, - columns=['symbol', 'price']) + df1 = pd.DataFrame( + { + "price": [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "price"], + ) - df2 = pd.DataFrame({ - 'price': [0.0, 1.0, 100.0], - 'mpv': [0.0001, 0.01, 0.05]}, - columns=['price', 'mpv']) + df2 = pd.DataFrame( + {"price": [0.0, 1.0, 100.0], "mpv": [0.0001, 0.01, 0.05]}, + columns=["price", "mpv"], + ) - df1 = df1.sort_values('price').reset_index(drop=True) + df1 = df1.sort_values("price").reset_index(drop=True) - result = pd.merge_asof(df1, df2, on='price') + result = pd.merge_asof(df1, df2, on="price") - expected = pd.DataFrame({ - 'symbol': list("BGACEDF"), - 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], - 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]}, - columns=['symbol', 'price', 'mpv']) + expected = pd.DataFrame( + { + "symbol": list("BGACEDF"), + "price": [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], + "mpv": [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05], + }, + columns=["symbol", "price", "mpv"], + ) assert_frame_equal(result, expected) @@ -909,26 +1032,29 @@ def test_on_specialized_type(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type - df1 = pd.DataFrame({ - "value": [5, 2, 25, 100, 78, 120, 79], - "symbol": list("ABCDEFG")}, - columns=["symbol", "value"]) + df1 = pd.DataFrame( + {"value": [5, 2, 25, 100, 78, 120, 79], "symbol": list("ABCDEFG")}, + columns=["symbol", "value"], + ) df1.value = dtype(df1.value) - df2 = pd.DataFrame({ - "value": [0, 80, 120, 125], - "result": list("xyzw")}, - columns=["value", "result"]) + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "result": list("xyzw")}, + columns=["value", "result"], + ) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value") expected = pd.DataFrame( - {"symbol": 
list("BACEGDF"), - "value": [2, 5, 25, 78, 79, 100, 120], - "result": list("xxxxxyz") - }, columns=["symbol", "value", "result"]) + { + "symbol": list("BACEGDF"), + "value": [2, 5, 25, 78, 79, 100, 120], + "result": list("xxxxxyz"), + }, + columns=["symbol", "value", "result"], + ) expected.value = dtype(expected.value) assert_frame_equal(result, expected) @@ -937,113 +1063,160 @@ def test_on_specialized_type_by_int(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type - df1 = pd.DataFrame({ - "value": [5, 2, 25, 100, 78, 120, 79], - "key": [1, 2, 3, 2, 3, 1, 2], - "symbol": list("ABCDEFG")}, - columns=["symbol", "key", "value"]) + df1 = pd.DataFrame( + { + "value": [5, 2, 25, 100, 78, 120, 79], + "key": [1, 2, 3, 2, 3, 1, 2], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "key", "value"], + ) df1.value = dtype(df1.value) - df2 = pd.DataFrame({ - "value": [0, 80, 120, 125], - "key": [1, 2, 2, 3], - "result": list("xyzw")}, - columns=["value", "key", "result"]) + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "key": [1, 2, 2, 3], "result": list("xyzw")}, + columns=["value", "key", "result"], + ) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value", by="key") - expected = pd.DataFrame({ - "symbol": list("BACEGDF"), - "key": [2, 1, 3, 3, 2, 2, 1], - "value": [2, 5, 25, 78, 79, 100, 120], - "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]}, - columns=["symbol", "key", "value", "result"]) + expected = pd.DataFrame( + { + "symbol": list("BACEGDF"), + "key": [2, 1, 3, 3, 2, 2, 1], + "value": [2, 5, 25, 78, 79, 100, 120], + "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"], + }, + columns=["symbol", "key", "value", "result"], + ) expected.value = dtype(expected.value) assert_frame_equal(result, expected) def test_on_float_by_int(self): # type specialize both "by" and "on" parameters - df1 = pd.DataFrame({ - 'symbol': list("AAABBBCCC"), - 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, - 12.5, 378.15, 378.2, 378.25]}, - columns=['symbol', 'exch', 'price']) - - df2 = pd.DataFrame({ - 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], - 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]}, - columns=['exch', 'price', 'mpv']) - - df1 = df1.sort_values('price').reset_index(drop=True) - df2 = df2.sort_values('price').reset_index(drop=True) - - result = pd.merge_asof(df1, df2, on='price', by='exch') - - expected = pd.DataFrame({ - 'symbol': list("AAABBBCCC"), - 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3], - 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, - 12.59, 378.15, 378.2, 378.25], - 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]}, - columns=['symbol', 'exch', 'price', 'mpv']) + df1 = pd.DataFrame( + { + "symbol": list("AAABBBCCC"), + "exch": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "price": [ + 3.26, + 3.2599, + 3.2598, + 12.58, + 12.59, + 12.5, + 378.15, + 378.2, + 378.25, + ], + }, + columns=["symbol", "exch", "price"], + ) + + df2 = pd.DataFrame( + { + "exch": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "price": [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], + "mpv": [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0], + }, + columns=["exch", "price", "mpv"], + ) + + df1 = df1.sort_values("price").reset_index(drop=True) + df2 = df2.sort_values("price").reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on="price", by="exch") + + expected = pd.DataFrame( + { + 
"symbol": list("AAABBBCCC"), + "exch": [3, 2, 1, 3, 1, 2, 1, 2, 3], + "price": [ + 3.2598, + 3.2599, + 3.26, + 12.5, + 12.58, + 12.59, + 378.15, + 378.2, + 378.25, + ], + "mpv": [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25], + }, + columns=["symbol", "exch", "price", "mpv"], + ) assert_frame_equal(result, expected) def test_merge_datatype_error_raises(self): - msg = r'incompatible merge keys \[0\] .*, must be the same type' + msg = r"incompatible merge keys \[0\] .*, must be the same type" - left = pd.DataFrame({'left_val': [1, 5, 10], - 'a': ['a', 'b', 'c']}) - right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) + right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) with pytest.raises(MergeError, match=msg): - merge_asof(left, right, on='a') + merge_asof(left, right, on="a") def test_merge_datatype_categorical_error_raises(self): - msg = (r'incompatible merge keys \[0\] .* both sides category, ' - 'but not equal ones') + msg = ( + r"incompatible merge keys \[0\] .* both sides category, " + "but not equal ones" + ) - left = pd.DataFrame({'left_val': [1, 5, 10], - 'a': pd.Categorical(['a', 'b', 'c'])}) - right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) + left = pd.DataFrame( + {"left_val": [1, 5, 10], "a": pd.Categorical(["a", "b", "c"])} + ) + right = pd.DataFrame( + { + "right_val": [1, 2, 3, 6, 7], + "a": pd.Categorical(["a", "X", "c", "X", "b"]), + } + ) with pytest.raises(MergeError, match=msg): - merge_asof(left, right, on='a') + merge_asof(left, right, on="a") - @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)], - ids=['numeric', 'datetime']) - @pytest.mark.parametrize('side', ['left', 'right']) + @pytest.mark.parametrize( + "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] + ) + @pytest.mark.parametrize("side", ["left", "right"]) def test_merge_on_nans(self, func, side): # GH 23189 msg = "Merge keys contain null values on {} side".format(side) nulls = func([1.0, 5.0, np.nan]) - non_nulls = func([1.0, 5.0, 10.]) - df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']}) - df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]}) + non_nulls = func([1.0, 5.0, 10.0]) + df_null = pd.DataFrame({"a": nulls, "left_val": ["a", "b", "c"]}) + df = pd.DataFrame({"a": non_nulls, "right_val": [1, 6, 11]}) with pytest.raises(ValueError, match=msg): - if side == 'left': - merge_asof(df_null, df, on='a') + if side == "left": + merge_asof(df_null, df, on="a") else: - merge_asof(df, df_null, on='a') + merge_asof(df, df_null, on="a") def test_merge_by_col_tz_aware(self): # GH 21184 left = pd.DataFrame( - {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), - 'on_col': [2], 'values': ['a']}) + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [2], + "values": ["a"], + } + ) right = pd.DataFrame( - {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), - 'on_col': [1], 'values': ['b']}) - result = pd.merge_asof(left, right, by='by_col', on='on_col') - expected = pd.DataFrame([ - [pd.Timestamp('2018-01-01', tz='UTC'), 2, 'a', 'b'] - ], columns=['by_col', 'on_col', 'values_x', 'values_y']) + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [1], + "values": ["b"], + } + ) + result = pd.merge_asof(left, right, by="by_col", on="on_col") + expected = pd.DataFrame( + 
[[pd.Timestamp("2018-01-01", tz="UTC"), 2, "a", "b"]], + columns=["by_col", "on_col", "values_x", "values_y"], + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 12d9483af87614..5e3bf03a0a4eca 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -7,21 +7,27 @@ @pytest.fixture def df1(): - return DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11))) + return DataFrame( + dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11), + ) + ) @pytest.fixture def df2(): - return DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12))) + return DataFrame( + dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12), + ) + ) -@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def left_df(request, df1): """ Construct left test DataFrame with specified levels (any of 'outer', 'inner', and 'v1')""" @@ -32,7 +38,7 @@ def left_df(request, df1): return df1 -@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def right_df(request, df2): """ Construct right test DataFrame with specified levels (any of 'outer', 'inner', and 'v2')""" @@ -44,8 +50,7 @@ def right_df(request, df2): return df2 -def compute_expected(df_left, df_right, - on=None, left_on=None, right_on=None, how=None): +def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None): """ Compute the expected merge result for the test case. 
@@ -85,8 +90,7 @@ def compute_expected(df_left, df_right, right_levels = [n for n in df_right.index.names if n is not None] # Compute output named index levels - output_levels = [i for i in left_on - if i in right_levels and i in left_levels] + output_levels = [i for i in left_on if i in right_levels and i in left_levels] # Drop index levels that aren't involved in the merge drop_left = [n for n in left_levels if n not in left_on] @@ -107,10 +111,7 @@ def compute_expected(df_left, df_right, df_right = df_right.reset_index(level=reset_right) # Perform merge - expected = df_left.merge(df_right, - left_on=left_on, - right_on=right_on, - how=how) + expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how) # Restore index levels if output_levels: @@ -119,11 +120,15 @@ def compute_expected(df_left, df_right, return expected -@pytest.mark.parametrize('on,how', - [(['outer'], 'inner'), - (['inner'], 'left'), - (['outer', 'inner'], 'right'), - (['inner', 'outer'], 'outer')]) +@pytest.mark.parametrize( + "on,how", + [ + (["outer"], "inner"), + (["inner"], "left"), + (["outer", "inner"], "right"), + (["inner", "outer"], "outer"), + ], +) def test_merge_indexes_and_columns_on(left_df, right_df, on, how): # Construct expected result @@ -134,44 +139,50 @@ def test_merge_indexes_and_columns_on(left_df, right_df, on, how): assert_frame_equal(result, expected, check_like=True) -@pytest.mark.parametrize('left_on,right_on,how', - [(['outer'], ['outer'], 'inner'), - (['inner'], ['inner'], 'right'), - (['outer', 'inner'], ['outer', 'inner'], 'left'), - (['inner', 'outer'], ['inner', 'outer'], 'outer')]) +@pytest.mark.parametrize( + "left_on,right_on,how", + [ + (["outer"], ["outer"], "inner"), + (["inner"], ["inner"], "right"), + (["outer", "inner"], ["outer", "inner"], "left"), + (["inner", "outer"], ["inner", "outer"], "outer"), + ], +) def test_merge_indexes_and_columns_lefton_righton( - left_df, right_df, left_on, right_on, how): + left_df, right_df, left_on, right_on, how +): # Construct expected result - expected = compute_expected(left_df, right_df, - left_on=left_on, - right_on=right_on, - how=how) + expected = compute_expected( + left_df, right_df, left_on=left_on, right_on=right_on, how=how + ) # Perform merge - result = left_df.merge(right_df, - left_on=left_on, right_on=right_on, how=how) + result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how) assert_frame_equal(result, expected, check_like=True) -@pytest.mark.parametrize('left_index', - ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]]) def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): # Construct left_df left_df = df1.set_index(left_index) # Construct right_df - right_df = df2.set_index(['outer', 'inner']) + right_df = df2.set_index(["outer", "inner"]) # Result - expected = (left_df.reset_index() - .join(right_df, on=['outer', 'inner'], how=join_type, - lsuffix='_x', rsuffix='_y') - .set_index(left_index)) + expected = ( + left_df.reset_index() + .join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) + .set_index(left_index) + ) # Perform join - result = left_df.join(right_df, on=['outer', 'inner'], how=join_type, - lsuffix='_x', rsuffix='_y') + result = left_df.join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) assert_frame_equal(result, expected, check_like=True) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py 
b/pandas/tests/reshape/merge/test_merge_ordered.py index da8ac0b470f770..2b79548be7b59d 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -7,60 +7,69 @@ class TestMergeOrdered: - def setup_method(self, method): - self.left = DataFrame({'key': ['a', 'c', 'e'], - 'lvalue': [1, 2., 3]}) + self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]}) - self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], - 'rvalue': [1, 2, 3., 4]}) + self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]}) def test_basic(self): - result = merge_ordered(self.left, self.right, on='key') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1, nan, 2, nan, 3, nan], - 'rvalue': [nan, 1, 2, 3, nan, 4]}) + result = merge_ordered(self.left, self.right, on="key") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1, nan, 2, nan, 3, nan], + "rvalue": [nan, 1, 2, 3, nan, 4], + } + ) assert_frame_equal(result, expected) def test_ffill(self): - result = merge_ordered( - self.left, self.right, on='key', fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1., 1, 2, 2, 3, 3.], - 'rvalue': [nan, 1, 2, 3, 3, 4]}) + result = merge_ordered(self.left, self.right, on="key", fill_method="ffill") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1.0, 1, 2, 2, 3, 3.0], + "rvalue": [nan, 1, 2, 3, 3, 4], + } + ) assert_frame_equal(result, expected) def test_multigroup(self): left = pd.concat([self.left, self.left], ignore_index=True) - left['group'] = ['a'] * 3 + ['b'] * 3 + left["group"] = ["a"] * 3 + ["b"] * 3 - result = merge_ordered(left, self.right, on='key', left_by='group', - fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, - 'lvalue': [1., 1, 2, 2, 3, 3.] 
* 2, - 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) - expected['group'] = ['a'] * 6 + ['b'] * 6 + result = merge_ordered( + left, self.right, on="key", left_by="group", fill_method="ffill" + ) + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"] * 2, + "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2, + "rvalue": [nan, 1, 2, 3, 3, 4] * 2, + } + ) + expected["group"] = ["a"] * 6 + ["b"] * 6 assert_frame_equal(result, expected.loc[:, result.columns]) - result2 = merge_ordered(self.right, left, on='key', right_by='group', - fill_method='ffill') + result2 = merge_ordered( + self.right, left, on="key", right_by="group", fill_method="ffill" + ) assert_frame_equal(result, result2.loc[:, result.columns]) - result = merge_ordered(left, self.right, on='key', left_by='group') - assert result['group'].notna().all() + result = merge_ordered(left, self.right, on="key", left_by="group") + assert result["group"].notna().all() def test_merge_type(self): class NotADataFrame(DataFrame): - @property def _constructor(self): return NotADataFrame nad = NotADataFrame(self.left) - result = nad.merge(self.right, on='key') + result = nad.merge(self.right, on="key") assert isinstance(result, NotADataFrame) @@ -73,7 +82,7 @@ def test_empty_sequence_concat(self): ([], empty_pat), ({}, empty_pat), ([None], none_pat), - ([None, None], none_pat) + ([None, None], none_pat), ] for df_seq, pattern in test_cases: with pytest.raises(ValueError, match=pattern): @@ -84,20 +93,25 @@ def test_empty_sequence_concat(self): pd.concat([pd.DataFrame(), None]) def test_doc_example(self): - left = DataFrame({'group': list('aaabbb'), - 'key': ['a', 'c', 'e', 'a', 'c', 'e'], - 'lvalue': [1, 2, 3] * 2, - }) - - right = DataFrame({'key': ['b', 'c', 'd'], - 'rvalue': [1, 2, 3]}) - - result = merge_ordered(left, right, fill_method='ffill', - left_by='group') - - expected = DataFrame({'group': list('aaaaabbbbb'), - 'key': ['a', 'b', 'c', 'd', 'e'] * 2, - 'lvalue': [1, 1, 2, 2, 3] * 2, - 'rvalue': [nan, 1, 2, 3, 3] * 2}) + left = DataFrame( + { + "group": list("aaabbb"), + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3] * 2, + } + ) + + right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + + result = merge_ordered(left, right, fill_method="ffill", left_by="group") + + expected = DataFrame( + { + "group": list("aaaaabbbbb"), + "key": ["a", "b", "c", "d", "e"] * 2, + "lvalue": [1, 1, 2, 2, 3] * 2, + "rvalue": [nan, 1, 2, 3, 3] * 2, + } + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 0bfc8ebbd28716..7aea85153d9083 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -16,161 +16,165 @@ def left(): """left dataframe (not multi-indexed) for multi-index join tests""" # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"] data = np.random.randn(len(key1)) - return DataFrame({'key1': key1, 'key2': key2, 'data': data}) + return DataFrame({"key1": key1, "key2": key2, "data": data}) @pytest.fixture def right(): """right dataframe (multi-indexed) for multi-index join tests""" - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 
1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['key1', 'key2']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["key1", "key2"], + ) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) + return DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) @pytest.fixture def left_multi(): - return ( - DataFrame( - dict(Origin=['A', 'A', 'B', 'B', 'C'], - Destination=['A', 'B', 'A', 'C', 'A'], - Period=['AM', 'AM', 'IP', 'AM', 'OP'], - TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], - Trips=[1987, 3647, 2470, 4296, 4444]), - columns=['Origin', 'Destination', 'Period', - 'TripPurp', 'Trips']) - .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C"], + Destination=["A", "B", "A", "C", "A"], + Period=["AM", "AM", "IP", "AM", "OP"], + TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"], + Trips=[1987, 3647, 2470, 4296, 4444], + ), + columns=["Origin", "Destination", "Period", "TripPurp", "Trips"], + ).set_index(["Origin", "Destination", "Period", "TripPurp"]) @pytest.fixture def right_multi(): - return ( - DataFrame( - dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], - Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], - Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'], - LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'], - Distance=[100, 80, 90, 80, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', - 'LinkType', 'Distance']) - .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C", "C", "E"], + Destination=["A", "B", "A", "B", "A", "B", "F"], + Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"], + LinkType=["a", "b", "c", "b", "a", "b", "a"], + Distance=[100, 80, 90, 80, 75, 35, 55], + ), + columns=["Origin", "Destination", "Period", "LinkType", "Distance"], + ).set_index(["Origin", "Destination", "Period", "LinkType"]) @pytest.fixture def on_cols_multi(): - return ['Origin', 'Destination', 'Period'] + return ["Origin", "Destination", "Period"] @pytest.fixture def idx_cols_multi(): - return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] class TestMergeMulti: - def setup_method(self): - self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, - columns=['j_one', 'j_two', 'j_three']) + self.index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + self.to_join = DataFrame( + np.random.randn(10, 3), + index=self.index, + columns=["j_one", "j_two", "j_three"], + ) # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] data = np.random.randn(len(key1)) - self.data = DataFrame({'key1': key1, 'key2': key2, - 
'data': data}) + self.data = DataFrame({"key1": key1, "key2": key2, "data": data}) def test_merge_on_multikey(self, left, right, join_type): - on_cols = ['key1', 'key2'] - result = (left.join(right, on=on_cols, how=join_type) - .reset_index(drop=True)) + on_cols = ["key1", "key2"] + result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True) - expected = pd.merge(left, right.reset_index(), - on=on_cols, how=join_type) + expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type) tm.assert_frame_equal(result, expected) - result = (left.join(right, on=on_cols, how=join_type, sort=True) - .reset_index(drop=True)) + result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index( + drop=True + ) - expected = pd.merge(left, right.reset_index(), - on=on_cols, how=join_type, sort=True) + expected = pd.merge( + left, right.reset_index(), on=on_cols, how=join_type, sort=True + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("sort", [False, True]) def test_left_join_multi_index(self, left, right, sort): - icols = ['1st', '2nd', '3rd'] + icols = ["1st", "2nd", "3rd"] def bind_cols(df): iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord('a') - return (f(df['1st']) + f(df['3rd']) * 1e2 + - df['2nd'].fillna(0) * 1e4) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4 def run_asserts(left, right, sort): - res = left.join(right, on=icols, how='left', sort=sort) + res = left.join(right, on=icols, how="left", sort=sort) assert len(left) < len(res) + 1 - assert not res['4th'].isna().any() - assert not res['5th'].isna().any() + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal( - res['4th'], - res['5th'], check_names=False) + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res['4th'], result, check_names=False) + tm.assert_series_equal(res["4th"], result, check_names=False) assert result.name is None if sort: - tm.assert_frame_equal( - res, res.sort_values(icols, kind='mergesort')) + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, - sort=sort, how='left') + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") res.index = np.arange(len(res)) tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) - left = DataFrame(np.random.choice(lc, (5000, 2)), - columns=['1st', '3rd']) - left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"]) + left.insert(1, "2nd", np.random.randint(0, 1000, len(left))) i = np.random.permutation(len(left)) right = left.iloc[i].copy() - left['4th'] = bind_cols(left) - right['5th'] = - bind_cols(right) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) right.set_index(icols, inplace=True) run_asserts(left, right, sort) # inject some nulls - left.loc[1::23, '1st'] = np.nan - left.loc[2::37, '2nd'] = np.nan - left.loc[3::43, '3rd'] = np.nan - left['4th'] = bind_cols(left) + left.loc[1::23, "1st"] = np.nan + left.loc[2::37, "2nd"] = np.nan + left.loc[3::43, "3rd"] = np.nan + left["4th"] = bind_cols(left) i = np.random.permutation(len(left)) right = left.iloc[i, :-1] - right['5th'] = - bind_cols(right) + right["5th"] = -bind_cols(right) 
right.set_index(icols, inplace=True) run_asserts(left, right, sort) @@ -178,14 +182,14 @@ def run_asserts(left, right, sort): @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): # compare left vs right merge with multikey - on_cols = ['key1', 'key2'] - merged_left_right = left.merge(right, - left_on=on_cols, right_index=True, - how='left', sort=sort) + on_cols = ["key1", "key2"] + merged_left_right = left.merge( + right, left_on=on_cols, right_index=True, how="left", sort=sort + ) - merge_right_left = right.merge(left, - right_on=on_cols, left_index=True, - how='right', sort=sort) + merge_right_left = right.merge( + left, right_on=on_cols, left_index=True, how="right", sort=sort + ) # Reorder columns merge_right_left = merge_right_left[merged_left_right.columns] @@ -199,210 +203,241 @@ def test_compress_group_combinations(self): key1 = np.tile(key1, 2) key2 = key1[::-1] - df = DataFrame({'key1': key1, 'key2': key2, - 'value1': np.random.randn(20000)}) + df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)}) - df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], - 'value2': np.random.randn(10000)}) + df2 = DataFrame( + {"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)} + ) # just to hit the label compression code path - merge(df, df2, how='outer') + merge(df, df2, how="outer") def test_left_join_index_preserve_order(self): - on_cols = ['k1', 'k2'] - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) + on_cols = ["k1", "k2"] + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result.sort_values(on_cols, kind='mergesort', inplace=True) + result.sort_values(on_cols, kind="mergesort", inplace=True) expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected) # test join with multi dtypes blocks - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), - 'v': np.array(np.arange(24), dtype=np.int32)}) + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "k3": np.array([0, 1, 2] * 8, dtype=np.float32), + "v": np.array(np.arange(24), dtype=np.int32), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + 
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result = result.sort_values(on_cols, kind='mergesort') + result = result.sort_values(on_cols, kind="mergesort") expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected) def test_left_join_index_multi_match_multiindex(self): - left = DataFrame([ - ['X', 'Y', 'C', 'a'], - ['W', 'Y', 'C', 'e'], - ['V', 'Q', 'A', 'h'], - ['V', 'R', 'D', 'i'], - ['X', 'Y', 'D', 'b'], - ['X', 'Y', 'A', 'c'], - ['W', 'Q', 'B', 'f'], - ['W', 'R', 'C', 'g'], - ['V', 'Y', 'C', 'j'], - ['X', 'Y', 'B', 'd']], - columns=['cola', 'colb', 'colc', 'tag'], - index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) - - right = (DataFrame([ - ['W', 'R', 'C', 0], - ['W', 'Q', 'B', 3], - ['W', 'Q', 'B', 8], - ['X', 'Y', 'A', 1], - ['X', 'Y', 'A', 4], - ['X', 'Y', 'B', 5], - ['X', 'Y', 'C', 6], - ['X', 'Y', 'C', 9], - ['X', 'Q', 'C', -6], - ['X', 'R', 'C', -9], - ['V', 'Y', 'C', 7], - ['V', 'R', 'D', 2], - ['V', 'R', 'D', -1], - ['V', 'Q', 'A', -3]], - columns=['col1', 'col2', 'col3', 'val']) - .set_index(['col1', 'col2', 'col3'])) - - result = left.join(right, on=['cola', 'colb', 'colc'], how='left') - - expected = DataFrame([ - ['X', 'Y', 'C', 'a', 6], - ['X', 'Y', 'C', 'a', 9], - ['W', 'Y', 'C', 'e', nan], - ['V', 'Q', 'A', 'h', -3], - ['V', 'R', 'D', 'i', 2], - ['V', 'R', 'D', 'i', -1], - ['X', 'Y', 'D', 'b', nan], - ['X', 'Y', 'A', 'c', 1], - ['X', 'Y', 'A', 'c', 4], - ['W', 'Q', 'B', 'f', 3], - ['W', 'Q', 'B', 'f', 8], - ['W', 'R', 'C', 'g', 0], - ['V', 'Y', 'C', 'j', 7], - ['X', 'Y', 'B', 'd', 5]], - columns=['cola', 'colb', 'colc', 'tag', 'val'], - index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + left = DataFrame( + [ + ["X", "Y", "C", "a"], + ["W", "Y", "C", "e"], + ["V", "Q", "A", "h"], + ["V", "R", "D", "i"], + ["X", "Y", "D", "b"], + ["X", "Y", "A", "c"], + ["W", "Q", "B", "f"], + ["W", "R", "C", "g"], + ["V", "Y", "C", "j"], + ["X", "Y", "B", "d"], + ], + columns=["cola", "colb", "colc", "tag"], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8], + ) + + right = DataFrame( + [ + ["W", "R", "C", 0], + ["W", "Q", "B", 3], + ["W", "Q", "B", 8], + ["X", "Y", "A", 1], + ["X", "Y", "A", 4], + ["X", "Y", "B", 5], + ["X", "Y", "C", 6], + ["X", "Y", "C", 9], + ["X", "Q", "C", -6], + ["X", "R", "C", -9], + ["V", "Y", "C", 7], + ["V", "R", "D", 2], + ["V", "R", "D", -1], + ["V", "Q", "A", -3], + ], + columns=["col1", "col2", "col3", "val"], + ).set_index(["col1", "col2", "col3"]) + + result = left.join(right, on=["cola", "colb", "colc"], how="left") + + expected = DataFrame( + [ + ["X", "Y", "C", "a", 6], + ["X", "Y", "C", "a", 9], + ["W", "Y", "C", "e", nan], + ["V", "Q", "A", "h", -3], + ["V", "R", "D", "i", 2], + ["V", "R", "D", "i", -1], + ["X", "Y", "D", "b", nan], + ["X", "Y", "A", "c", 1], + ["X", "Y", "A", "c", 4], + ["W", "Q", "B", "f", 3], + ["W", "Q", "B", "f", 8], + ["W", "R", "C", "g", 0], + ["V", "Y", "C", "j", 7], + ["X", "Y", "B", "d", 5], + ], + columns=["cola", "colb", "colc", "tag", "val"], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8], + ) tm.assert_frame_equal(result, expected) - result = left.join(right, on=['cola', 'colb', 'colc'], - how='left', sort=True) + result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True) - expected = expected.sort_values(['cola', 'colb', 'colc'], - kind='mergesort') + expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort") tm.assert_frame_equal(result, expected) def test_left_join_index_multi_match(self): - left = 
DataFrame([ - ['c', 0], - ['b', 1], - ['a', 2], - ['b', 3]], - columns=['tag', 'val'], - index=[2, 0, 1, 3]) - - right = (DataFrame([ - ['a', 'v'], - ['c', 'w'], - ['c', 'x'], - ['d', 'y'], - ['a', 'z'], - ['c', 'r'], - ['e', 'q'], - ['c', 's']], - columns=['tag', 'char']) - .set_index('tag')) - - result = left.join(right, on='tag', how='left') - - expected = DataFrame([ - ['c', 0, 'w'], - ['c', 0, 'x'], - ['c', 0, 'r'], - ['c', 0, 's'], - ['b', 1, nan], - ['a', 2, 'v'], - ['a', 2, 'z'], - ['b', 3, nan]], - columns=['tag', 'val', 'char'], - index=[2, 2, 2, 2, 0, 1, 1, 3]) + left = DataFrame( + [["c", 0], ["b", 1], ["a", 2], ["b", 3]], + columns=["tag", "val"], + index=[2, 0, 1, 3], + ) + + right = DataFrame( + [ + ["a", "v"], + ["c", "w"], + ["c", "x"], + ["d", "y"], + ["a", "z"], + ["c", "r"], + ["e", "q"], + ["c", "s"], + ], + columns=["tag", "char"], + ).set_index("tag") + + result = left.join(right, on="tag", how="left") + + expected = DataFrame( + [ + ["c", 0, "w"], + ["c", 0, "x"], + ["c", 0, "r"], + ["c", 0, "s"], + ["b", 1, nan], + ["a", 2, "v"], + ["a", 2, "z"], + ["b", 3, nan], + ], + columns=["tag", "val", "char"], + index=[2, 2, 2, 2, 0, 1, 1, 3], + ) tm.assert_frame_equal(result, expected) - result = left.join(right, on='tag', how='left', sort=True) - expected2 = expected.sort_values('tag', kind='mergesort') + result = left.join(right, on="tag", how="left", sort=True) + expected2 = expected.sort_values("tag", kind="mergesort") tm.assert_frame_equal(result, expected2) # GH7331 - maintain left frame order in left merge - result = merge(left, right.reset_index(), how='left', on='tag') + result = merge(left, right.reset_index(), how="left", on="tag") expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) def test_left_merge_na_buglet(self): - left = DataFrame({'id': list('abcde'), 'v1': randn(5), - 'v2': randn(5), 'dummy': list('abcde'), - 'v3': randn(5)}, - columns=['id', 'v1', 'v2', 'dummy', 'v3']) - right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], - 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) + left = DataFrame( + { + "id": list("abcde"), + "v1": randn(5), + "v2": randn(5), + "dummy": list("abcde"), + "v3": randn(5), + }, + columns=["id", "v1", "v2", "dummy", "v3"], + ) + right = DataFrame( + { + "id": ["a", "b", np.nan, np.nan, np.nan], + "sv3": [1.234, 5.678, np.nan, np.nan, np.nan], + } + ) - result = merge(left, right, on='id', how='left') + result = merge(left, right, on="id", how="left") - rdf = right.drop(['id'], axis=1) + rdf = right.drop(["id"], axis=1) expected = left.join(rdf) tm.assert_frame_equal(result, expected) def test_merge_na_keys(self): - data = [[1950, "A", 1.5], - [1950, "B", 1.5], - [1955, "B", 1.5], - [1960, "B", np.nan], - [1970, "B", 4.], - [1950, "C", 4.], - [1960, "C", np.nan], - [1965, "C", 3.], - [1970, "C", 4.]] + data = [ + [1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.0], + [1950, "C", 4.0], + [1960, "C", np.nan], + [1965, "C", 3.0], + [1970, "C", 4.0], + ] frame = DataFrame(data, columns=["year", "panel", "data"]) - other_data = [[1960, 'A', np.nan], - [1970, 'A', np.nan], - [1955, 'A', np.nan], - [1965, 'A', np.nan], - [1965, 'B', np.nan], - [1955, 'C', np.nan]] - other = DataFrame(other_data, columns=['year', 'panel', 'data']) + other_data = [ + [1960, "A", np.nan], + [1970, "A", np.nan], + [1955, "A", np.nan], + [1965, "A", np.nan], + [1965, "B", np.nan], + [1955, "C", np.nan], + ] + other = DataFrame(other_data, columns=["year", "panel", "data"]) - 
result = frame.merge(other, how='outer') + result = frame.merge(other, how="outer") - expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected = frame.fillna(-999).merge(other.fillna(-999), how="outer") expected = expected.replace(-999, np.nan) tm.assert_frame_equal(result, expected) @@ -410,9 +445,9 @@ def test_merge_na_keys(self): @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) def test_merge_datetime_index(self, klass): # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) df.index = pd.to_datetime(df.index) on_vector = df.index.year @@ -420,21 +455,16 @@ def test_merge_datetime_index(self, klass): on_vector = klass(on_vector) expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) + OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) ) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) + OrderedDict( + [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] + ) ) result = df.merge(df, on=[df.index.year], how="inner") @@ -444,166 +474,284 @@ def test_join_multi_levels(self): # GH 3662 # merge multi-levels - household = ( - DataFrame( - dict(household_id=[1, 2, 3], - male=[0, 1, 0], - wealth=[196087.3, 316478.7, 294750]), - columns=['household_id', 'male', 'wealth']) - .set_index('household_id')) - portfolio = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - name=["ABN Amro", "Robeco", "Royal Dutch Shell", - "Royal Dutch Shell", - "AAB Eastern Europe Equity Fund", - "Postbank BioTech Fonds", np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'name', 'share']) - .set_index(['household_id', 'asset_id'])) - result = household.join(portfolio, how='inner') + household = DataFrame( + dict( + household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750], + ), + columns=["household_id", "male", "wealth"], + ).set_index("household_id") + portfolio = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "name", "share"], + ).set_index(["household_id", "asset_id"]) + result = household.join(portfolio, how="inner") expected = ( DataFrame( - dict(male=[0, 1, 1, 0, 0, 0], - wealth=[196087.3, 316478.7, 316478.7, - 294750.0, 294750.0, 294750.0], - name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', - 'Royal Dutch Shell', - 'AAB Eastern Europe Equity Fund', - 'Postbank BioTech Fonds'], - share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], - household_id=[1, 2, 2, 3, 3, 3], - asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', - 'gb00b03mlx29', 'lu0197800237', - 'nl0000289965'])) - .set_index(['household_id', 'asset_id']) - .reindex(columns=['male', 'wealth', 'name', 'share'])) + dict( + male=[0, 1, 1, 0, 0, 0], + 
wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + ], + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + household_id=[1, 2, 2, 3, 3, 3], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + ], + ) + ) + .set_index(["household_id", "asset_id"]) + .reindex(columns=["male", "wealth", "name", "share"]) + ) tm.assert_frame_equal(result, expected) # equivalency - result = (merge(household.reset_index(), portfolio.reset_index(), - on=['household_id'], how='inner') - .set_index(['household_id', 'asset_id'])) + result = merge( + household.reset_index(), + portfolio.reset_index(), + on=["household_id"], + how="inner", + ).set_index(["household_id", "asset_id"]) tm.assert_frame_equal(result, expected) - result = household.join(portfolio, how='outer') - expected = (concat([ - expected, - (DataFrame( - dict(share=[1.00]), - index=MultiIndex.from_tuples( - [(4, np.nan)], - names=['household_id', 'asset_id']))) - ], axis=0, sort=True).reindex(columns=expected.columns)) + result = household.join(portfolio, how="outer") + expected = concat( + [ + expected, + ( + DataFrame( + dict(share=[1.00]), + index=MultiIndex.from_tuples( + [(4, np.nan)], names=["household_id", "asset_id"] + ), + ) + ), + ], + axis=0, + sort=True, + ).reindex(columns=expected.columns) tm.assert_frame_equal(result, expected) # invalid cases - household.index.name = 'foo' + household.index.name = "foo" with pytest.raises(ValueError): - household.join(portfolio, how='inner') + household.join(portfolio, how="inner") portfolio2 = portfolio.copy() - portfolio2.index.set_names(['household_id', 'foo']) + portfolio2.index.set_names(["household_id", "foo"]) with pytest.raises(ValueError): - portfolio2.join(portfolio, how='inner') + portfolio2.join(portfolio, how="inner") def test_join_multi_levels2(self): # some more advanced merges # GH6360 - household = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'share']) - .set_index(['household_id', 'asset_id'])) - - log_return = DataFrame(dict( - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 180, 181], - log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] - )).set_index(["asset_id", "t"]) + household = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "share"], + ).set_index(["household_id", "asset_id"]) + + log_return = DataFrame( + dict( + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 180, 181], + log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997], + ) + ).set_index(["asset_id", "t"]) expected = ( - DataFrame(dict( - household_id=[2, 2, 2, 3, 3, 3, 3, 3], - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 233, 234, 235, 180, 181], - 
share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], - log_return=[.09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997] - )) + DataFrame( + dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[ + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + ], + ) + ) .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) + .reindex(columns=["share", "log_return"]) + ) # this is the equivalency - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='inner') - .set_index(['household_id', 'asset_id', 't'])) + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="inner", + ).set_index(["household_id", "asset_id", "t"]) tm.assert_frame_equal(result, expected) expected = ( - DataFrame(dict( - household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237", - "nl0000289965", None], - t=[None, None, 233, 234, 235, 233, 234, - 235, 180, 181, None, None], - share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, - 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], - log_return=[None, None, .09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997, None, None] - )) + DataFrame( + dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + "nl0000289965", + None, + ], + t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None], + share=[ + 1.0, + 0.4, + 0.6, + 0.6, + 0.6, + 0.15, + 0.15, + 0.15, + 0.6, + 0.6, + 0.25, + 1.0, + ], + log_return=[ + None, + None, + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + None, + None, + ], + ) + ) .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) + .reindex(columns=["share", "log_return"]) + ) - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='outer') - .set_index(['household_id', 'asset_id', 't'])) + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="outer", + ).set_index(["household_id", "asset_id", "t"]) tm.assert_frame_equal(result, expected) class TestJoinMultiMulti: - - def test_join_multi_multi(self, left_multi, right_multi, join_type, - on_cols_multi, idx_cols_multi): + def test_join_multi_multi( + self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + ): # Multi-index join tests - expected = (pd.merge(left_multi.reset_index(), - right_multi.reset_index(), - how=join_type, on=on_cols_multi). 
- set_index(idx_cols_multi).sort_index()) + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) result = left_multi.join(right_multi, how=join_type).sort_index() tm.assert_frame_equal(result, expected) - def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, - on_cols_multi, idx_cols_multi): + def test_join_multi_empty_frames( + self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) - expected = (pd.merge(left_multi.reset_index(), - right_multi.reset_index(), - how=join_type, on=on_cols_multi) - .set_index(idx_cols_multi).sort_index()) + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) result = left_multi.join(right_multi, how=join_type).sort_index() tm.assert_frame_equal(result, expected) @@ -611,9 +759,9 @@ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, @pytest.mark.parametrize("box", [None, np.asarray, Series, Index]) def test_merge_datetime_index(self, box): # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) df.index = pd.to_datetime(df.index) on_vector = df.index.year @@ -621,46 +769,42 @@ def test_merge_datetime_index(self, box): on_vector = box(on_vector) expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) + OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) ) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) + OrderedDict( + [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] + ) ) result = df.merge(df, on=[df.index.year], how="inner") tm.assert_frame_equal(result, expected) def test_single_common_level(self): - index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) + index_left = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=index_left) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left + ) - index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) + index_right = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index_right) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, + index=index_right, + ) result = left.join(right) - expected = (pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner') - .set_index(['key', 'X', 'Y'])) + expected = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], how="inner" + ).set_index(["key", "X", "Y"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py 
index 031f3abf31b163..6366bf0521fbc2 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -15,8 +15,18 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - Timestamp, concat, date_range, isna, read_csv) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + read_csv, +) import pandas.core.common as com from pandas.tests.extension.decimal import to_decimal from pandas.util import testing as tm @@ -47,30 +57,40 @@ class TestConcatAppendCommon: def setup_method(self, method): - dt_data = [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')] - tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')] - - td_data = [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')] - - period_data = [pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M')] - - self.data = {'bool': [True, False, True], - 'int64': [1, 2, 3], - 'float64': [1.1, np.nan, 3.3], - 'category': pd.Categorical(['X', 'Y', 'Z']), - 'object': ['a', 'b', 'c'], - 'datetime64[ns]': dt_data, - 'datetime64[ns, US/Eastern]': tz_data, - 'timedelta64[ns]': td_data, - 'period[M]': period_data} + dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ] + tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ] + + td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ] + + period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + self.data = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": pd.Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, + } def _check_expected_dtype(self, obj, label): """ @@ -78,13 +98,13 @@ def _check_expected_dtype(self, obj, label): considering not-supported dtypes """ if isinstance(obj, pd.Index): - if label == 'bool': - assert obj.dtype == 'object' + if label == "bool": + assert obj.dtype == "object" else: assert obj.dtype == label elif isinstance(obj, pd.Series): - if label.startswith('period'): - assert obj.dtype == 'Period[M]' + if label.startswith("period"): + assert obj.dtype == "Period[M]" else: assert obj.dtype == label else: @@ -103,10 +123,9 @@ def test_concatlike_same_dtypes(self): vals2 = vals1 vals3 = vals1 - if typ1 == 'category': + if typ1 == "category": exp_data = pd.Categorical(list(vals1) + list(vals2)) - exp_data3 = pd.Categorical(list(vals1) + list(vals2) + - list(vals3)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) else: exp_data = vals1 + vals2 exp_data3 = vals1 + vals2 + vals3 @@ -124,52 +143,53 @@ def test_concatlike_same_dtypes(self): tm.assert_index_equal(res, exp) # index.append name mismatch - i1 = pd.Index(vals1, name='x') - i2 = pd.Index(vals2, name='y') + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="y") res = i1.append(i2) exp = pd.Index(exp_data) tm.assert_index_equal(res, exp) # index.append name match - i1 = pd.Index(vals1, name='x') - i2 = 
pd.Index(vals2, name='x') + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="x") res = i1.append(i2) - exp = pd.Index(exp_data, name='x') + exp = pd.Index(exp_data, name="x") tm.assert_index_equal(res, exp) # cannot append non-index - with pytest.raises(TypeError, match='all inputs must be Index'): + with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append(vals2) - with pytest.raises(TypeError, match='all inputs must be Index'): + with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append([pd.Index(vals2), vals3]) # ----- Series ----- # # series.append - res = pd.Series(vals1).append(pd.Series(vals2), - ignore_index=True) + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], - ignore_index=True) + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements - res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)], - ignore_index=True) + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) exp = pd.Series(exp_data3) tm.assert_series_equal(res, exp) - res = pd.concat([pd.Series(vals1), pd.Series(vals2), - pd.Series(vals3)], ignore_index=True) + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) tm.assert_series_equal(res, exp) # name mismatch - s1 = pd.Series(vals1, name='x') - s2 = pd.Series(vals2, name='y') + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="y") res = s1.append(s2, ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) @@ -178,18 +198,20 @@ def test_concatlike_same_dtypes(self): tm.assert_series_equal(res, exp, check_index_type=True) # name match - s1 = pd.Series(vals1, name='x') - s2 = pd.Series(vals2, name='x') + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="x") res = s1.append(s2, ignore_index=True) - exp = pd.Series(exp_data, name='x') + exp = pd.Series(exp_data, name="x") tm.assert_series_equal(res, exp, check_index_type=True) res = pd.concat([s1, s2], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # cannot append non-index - msg = (r"cannot concatenate object of type '.+';" - " only Series and DataFrame objs are valid") + msg = ( + r"cannot concatenate object of type '.+';" + " only Series and DataFrame objs are valid" + ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -216,21 +238,23 @@ def test_concatlike_dtypes_coercion(self): if typ1 == typ2: # same dtype is tested in test_concatlike_same_dtypes continue - elif typ1 == 'category' or typ2 == 'category': + elif typ1 == "category" or typ2 == "category": # ToDo: suspicious continue # specify expected dtype - if typ1 == 'bool' and typ2 in ('int64', 'float64'): + if typ1 == "bool" and typ2 in ("int64", "float64"): # series coerces to numeric based on numpy rule # index doesn't because bool is object dtype exp_series_dtype = typ2 - elif typ2 == 'bool' and typ1 in ('int64', 'float64'): + elif typ2 == "bool" and typ1 in ("int64", "float64"): exp_series_dtype = typ1 - elif (typ1 == 'datetime64[ns, US/Eastern]' or - typ2 == 'datetime64[ns, US/Eastern]' or - typ1 == 'timedelta64[ns]' or - typ2 == 'timedelta64[ns]'): + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == 
"datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): exp_index_dtype = object exp_series_dtype = object @@ -245,45 +269,48 @@ def test_concatlike_dtypes_coercion(self): tm.assert_index_equal(res, exp) # 3 elements - res = pd.Index(vals1).append([pd.Index(vals2), - pd.Index(vals3)]) + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) exp = pd.Index(exp_data3, dtype=exp_index_dtype) tm.assert_index_equal(res, exp) # ----- Series ----- # # series.append - res = pd.Series(vals1).append(pd.Series(vals2), - ignore_index=True) + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data, dtype=exp_series_dtype) tm.assert_series_equal(res, exp, check_index_type=True) # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], - ignore_index=True) + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements - res = pd.Series(vals1).append([pd.Series(vals2), - pd.Series(vals3)], - ignore_index=True) + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) exp = pd.Series(exp_data3, dtype=exp_series_dtype) tm.assert_series_equal(res, exp) - res = pd.concat([pd.Series(vals1), pd.Series(vals2), - pd.Series(vals3)], ignore_index=True) + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) tm.assert_series_equal(res, exp) def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 # result must be Timestamp/Timedelta, not datetime.datetime/timedelta - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02']) - tdi = pd.TimedeltaIndex(['1 days', '2 days']) - - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ] + ) res = dti.append(tdi) tm.assert_index_equal(res, exp) @@ -305,11 +332,12 @@ def test_concatlike_common_coerce_to_pandas_object(self): def test_concatlike_datetimetz(self, tz_aware_fixture): tz = tz_aware_fixture # GH 7795 - dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz) + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) - exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2012-01-01', '2012-01-02'], tz=tz) + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz + ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) @@ -322,20 +350,19 @@ def test_concatlike_datetimetz(self, tz_aware_fixture): res = pd.concat([dts1, dts2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - @pytest.mark.parametrize('tz', - ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT']) + @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) def test_concatlike_datetimetz_short(self, tz): # GH#7795 - ix1 = pd.date_range(start='2014-07-15', end='2014-07-17', - freq='D', tz=tz) - ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz) - df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B']) - df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B']) - - exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16', - '2014-07-17', 
'2014-07-11', - '2014-07-21'], tz=tz) - exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B']) + ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) + ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) + df1 = pd.DataFrame(0, index=ix1, columns=["A", "B"]) + df2 = pd.DataFrame(0, index=ix2, columns=["A", "B"]) + + exp_idx = pd.DatetimeIndex( + ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], + tz=tz, + ) + exp = pd.DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1.append(df2), exp) tm.assert_frame_equal(pd.concat([df1, df2]), exp) @@ -345,13 +372,18 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): # GH 13660 # different tz coerces to object - dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02']) - - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02')], dtype=object) + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + ], + dtype=object, + ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) @@ -365,14 +397,17 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], - tz='US/Pacific') - - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2012-01-01', tz='US/Pacific'), - pd.Timestamp('2012-01-02', tz='US/Pacific')], - dtype=object) + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01", tz="US/Pacific"), + pd.Timestamp("2012-01-02", tz="US/Pacific"), + ], + dtype=object, + ) res = dti1.append(dti3) # tm.assert_index_equal(res, exp) @@ -387,11 +422,10 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): def test_concatlike_common_period(self): # GH 13660 - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M') + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") - exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01', - '2012-02'], freq='M') + exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") res = pi1.append(pi2) tm.assert_index_equal(res, exp) @@ -406,13 +440,18 @@ def test_concatlike_common_period(self): def test_concatlike_common_period_diff_freq_to_object(self): # GH 13221 - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D') - - exp = pd.Index([pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2012-01-01', freq='D'), - pd.Period('2012-02-01', freq='D')], dtype=object) + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") + + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + dtype=object, + ) res = pi1.append(pi2) 
tm.assert_index_equal(res, exp) @@ -428,12 +467,17 @@ def test_concatlike_common_period_diff_freq_to_object(self): def test_concatlike_common_period_mixed_dt_to_object(self): # GH 13221 # different datetimelike - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - tdi = pd.TimedeltaIndex(['1 days', '2 days']) - exp = pd.Index([pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Timedelta('1 days'), - pd.Timedelta('2 days')], dtype=object) + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ], + dtype=object, + ) res = pi1.append(tdi) tm.assert_index_equal(res, exp) @@ -447,10 +491,15 @@ def test_concatlike_common_period_mixed_dt_to_object(self): tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # inverse - exp = pd.Index([pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M')], dtype=object) + exp = pd.Index( + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + ], + dtype=object, + ) res = tdi.append(pi1) tm.assert_index_equal(res, exp) @@ -467,56 +516,57 @@ def test_concat_categorical(self): # GH 13524 # same categories -> category - s1 = pd.Series([1, 2, np.nan], dtype='category') - s2 = pd.Series([2, 1, 2], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category') + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # partially different categories => not-category - s1 = pd.Series([3, 2], dtype='category') - s2 = pd.Series([2, 1], dtype='category') + s1 = pd.Series([3, 2], dtype="category") + s2 = pd.Series([2, 1], dtype="category") exp = pd.Series([3, 2, 2, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # completely different categories (same dtype) => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') - s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series([np.nan, 1, 3, 2], dtype="category") - exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype='object') + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 - a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])) - b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])) + a = pd.Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) + b = pd.Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) result = pd.concat([a, b], ignore_index=True) - expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'])) + expected = pd.Series( + Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) + ) tm.assert_series_equal(result, expected) def test_concat_categorical_coercion(self): # GH 13524 # 
category + not-category => not-category - s1 = pd.Series([1, 2, np.nan], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") s2 = pd.Series([2, 1, 2]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object') + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object') + exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all values are not in category => not-category - s1 = pd.Series([3, 2], dtype='category') + s1 = pd.Series([3, 2], dtype="category") s2 = pd.Series([2, 1]) exp = pd.Series([3, 2, 2, 1]) @@ -528,31 +578,31 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # completely different categories => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') + s1 = pd.Series([10, 11, np.nan], dtype="category") s2 = pd.Series([1, 3, 2]) - exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object') + exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object') + exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # different dtype => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') - s2 = pd.Series(['a', 'b', 'c']) + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series(["a", "b", "c"]) - exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c']) + exp = pd.Series([10, 11, np.nan, "a", "b", "c"]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan]) + exp = pd.Series(["a", "b", "c", 10, 11, np.nan]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # if normal series only contains NaN-likes => not-category - s1 = pd.Series([10, 11], dtype='category') + s1 = pd.Series([10, 11], dtype="category") s2 = pd.Series([np.nan, np.nan, np.nan]) exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) @@ -567,23 +617,21 @@ def test_concat_categorical_3elem_coercion(self): # GH 13524 # mixed dtypes => not-category - s1 = pd.Series([1, 2, np.nan], dtype='category') - s2 = pd.Series([2, 1, 2], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") s3 = pd.Series([1, 2, 1, 2, np.nan]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], - dtype='object') + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], - dtype='object') + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object") tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], 
ignore_index=True), exp) # values are all in either category => not-category - s1 = pd.Series([4, 5, 6], dtype='category') - s2 = pd.Series([1, 2, 3], dtype='category') + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([1, 3, 4]) exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) @@ -595,8 +643,8 @@ def test_concat_categorical_3elem_coercion(self): tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) # values are all in either category => not-category - s1 = pd.Series([4, 5, 6], dtype='category') - s2 = pd.Series([1, 2, 3], dtype='category') + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([10, 11, 12]) exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) @@ -610,12 +658,12 @@ def test_concat_categorical_3elem_coercion(self): def test_concat_categorical_multi_coercion(self): # GH 13524 - s1 = pd.Series([1, 3], dtype='category') - s2 = pd.Series([3, 4], dtype='category') + s1 = pd.Series([1, 3], dtype="category") + s2 = pd.Series([3, 4], dtype="category") s3 = pd.Series([2, 3]) - s4 = pd.Series([2, 2], dtype='category') + s4 = pd.Series([2, 2], dtype="category") s5 = pd.Series([1, np.nan]) - s6 = pd.Series([1, 3, 2], dtype='category') + s6 = pd.Series([1, 3, 2], dtype="category") # mixed dtype, values are all in categories => not-category exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) @@ -640,8 +688,9 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], - ordered=True)) + exp = pd.Series( + pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) + ) tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) @@ -650,23 +699,22 @@ def test_concat_categorical_coercion_nan(self): # some edge cases # category + not-category => not category - s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), - dtype='category') + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") s2 = pd.Series([np.nan, 1]) exp = pd.Series([np.nan, np.nan, np.nan, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - s1 = pd.Series([1, np.nan], dtype='category') + s1 = pd.Series([1, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object') + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # mixed dtype, all nan-likes => not-category - s1 = pd.Series([np.nan, np.nan], dtype='category') + s1 = pd.Series([np.nan, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) @@ -676,10 +724,10 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all category nan-likes => category - s1 = pd.Series([np.nan, np.nan], dtype='category') - s2 = pd.Series([np.nan, np.nan], dtype='category') + s1 = pd.Series([np.nan, np.nan], dtype="category") + s2 = pd.Series([np.nan, np.nan], dtype="category") - exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category') + exp = pd.Series([np.nan, 
np.nan, np.nan, np.nan], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) @@ -687,8 +735,8 @@ def test_concat_categorical_coercion_nan(self): def test_concat_categorical_empty(self): # GH 13524 - s1 = pd.Series([], dtype='category') - s2 = pd.Series([1, 2], dtype='category') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) @@ -696,14 +744,14 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') - s2 = pd.Series([], dtype='category') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') - s2 = pd.Series([], dtype='object') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="object") # different dtype => not-category tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) @@ -711,7 +759,7 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') + s1 = pd.Series([], dtype="category") s2 = pd.Series([np.nan, np.nan]) # empty Series is ignored @@ -724,30 +772,28 @@ def test_concat_categorical_empty(self): def test_concat_join_axes_deprecated(self, axis): # GH21951 - one = pd.DataFrame([[0., 1.], [2., 3.]], columns=list('ab')) - two = pd.DataFrame([[10., 11.], [12., 13.]], index=[1, 2], - columns=list('bc')) + one = pd.DataFrame([[0.0, 1.0], [2.0, 3.0]], columns=list("ab")) + two = pd.DataFrame( + [[10.0, 11.0], [12.0, 13.0]], index=[1, 2], columns=list("bc") + ) - expected = pd.concat([one, two], - axis=1, sort=False).reindex(index=two.index) + expected = pd.concat([one, two], axis=1, sort=False).reindex(index=two.index) with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], - axis=1, sort=False, join_axes=[two.index]) + result = pd.concat([one, two], axis=1, sort=False, join_axes=[two.index]) tm.assert_frame_equal(result, expected) - expected = pd.concat([one, two], - axis=0, sort=False).reindex(columns=two.columns) + expected = pd.concat([one, two], axis=0, sort=False).reindex( + columns=two.columns + ) with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], - axis=0, sort=False, join_axes=[two.columns]) + result = pd.concat([one, two], axis=0, sort=False, join_axes=[two.columns]) tm.assert_frame_equal(result, expected) class TestAppend: - def test_append(self, sort, float_frame): mixed_frame = float_frame.copy() - mixed_frame['foo'] = 'bar' + mixed_frame["foo"] = "bar" begin_index = float_frame.index[:5] end_index = float_frame.index[5:] @@ -756,14 +802,14 @@ def test_append(self, sort, float_frame): end_frame = float_frame.reindex(end_index) appended = begin_frame.append(end_frame) - tm.assert_almost_equal(appended['A'], float_frame['A']) + tm.assert_almost_equal(appended["A"], float_frame["A"]) - del end_frame['A'] + del end_frame["A"] partial_appended = begin_frame.append(end_frame, sort=sort) - assert 'A' in partial_appended + 
assert "A" in partial_appended partial_appended = end_frame.append(begin_frame, sort=sort) - assert 'A' in partial_appended + assert "A" in partial_appended # mixed type handling appended = mixed_frame[:5].append(mixed_frame[5:]) @@ -775,8 +821,9 @@ def test_append(self, sort, float_frame): # all equal except 'foo' column tm.assert_frame_equal( - mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), - mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + mixed_appended.reindex(columns=["A", "B", "C", "D"]), + mixed_appended2.reindex(columns=["A", "B", "C", "D"]), + ) def test_append_empty(self, float_frame): empty = DataFrame() @@ -796,29 +843,32 @@ def test_append_overlap_raises(self, float_frame): def test_append_new_columns(self): # see gh-6129: new columns - df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) - row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') - expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { - 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) + df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) + row = Series([5, 6, 7], index=["a", "b", "c"], name="z") + expected = DataFrame( + { + "a": {"x": 1, "y": 2, "z": 5}, + "b": {"x": 3, "y": 4, "z": 6}, + "c": {"z": 7}, + } + ) result = df.append(row) tm.assert_frame_equal(result, expected) def test_append_length0_frame(self, sort): - df = DataFrame(columns=['A', 'B', 'C']) - df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df = DataFrame(columns=["A", "B", "C"]) + df3 = DataFrame(index=[0, 1], columns=["A", "B"]) df5 = df.append(df3, sort=sort) - expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=('i4,f4,a10')) - arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] + arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=('i4,f4,a10')) - arr2[:] = [(3, 4., 'foo'), - (5, 6., "bar"), - (7., 8., 'baz')] + arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) df2 = DataFrame(arr2) @@ -829,15 +879,14 @@ def test_append_records(self): # rewrite sort fixture, since we also want to test default of None def test_append_sorts(self, sort_with_none): - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) - df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) if sort_with_none is None: # only warn if not explicitly specified # don't check stacklevel since its set for concat, and append # has an extra stack. 
- ctx = tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) + ctx = tm.assert_produces_warning(FutureWarning, check_stacklevel=False) else: ctx = tm.assert_produces_warning(None) @@ -845,74 +894,83 @@ def test_append_sorts(self, sort_with_none): result = df1.append(df2, sort=sort_with_none) # for None / True - expected = pd.DataFrame({"b": [1, 2, None, None], - "a": [1, 2, 1, 2], - "c": [None, None, 3, 4]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, + columns=["a", "b", "c"], + ) if sort_with_none is False: - expected = expected[['b', 'a', 'c']] + expected = expected[["b", "a", "c"]] tm.assert_frame_equal(result, expected) def test_append_different_columns(self, sort): - df = DataFrame({'bools': np.random.randn(10) > 0, - 'ints': np.random.randint(0, 10, 10), - 'floats': np.random.randn(10), - 'strings': ['foo', 'bar'] * 5}) + df = DataFrame( + { + "bools": np.random.randn(10) > 0, + "ints": np.random.randint(0, 10, 10), + "floats": np.random.randn(10), + "strings": ["foo", "bar"] * 5, + } + ) - a = df[:5].loc[:, ['bools', 'ints', 'floats']] - b = df[5:].loc[:, ['strings', 'ints', 'floats']] + a = df[:5].loc[:, ["bools", "ints", "floats"]] + b = df[5:].loc[:, ["strings", "ints", "floats"]] appended = a.append(b, sort=sort) - assert isna(appended['strings'][0:4]).all() - assert isna(appended['bools'][5:]).all() + assert isna(appended["strings"][0:4]).all() + assert isna(appended["bools"][5:]).all() def test_append_many(self, sort, float_frame): - chunks = [float_frame[:5], float_frame[5:10], - float_frame[10:15], float_frame[15:]] + chunks = [ + float_frame[:5], + float_frame[5:10], + float_frame[10:15], + float_frame[15:], + ] result = chunks[0].append(chunks[1:]) tm.assert_frame_equal(result, float_frame) chunks[-1] = chunks[-1].copy() - chunks[-1]['foo'] = 'bar' + chunks[-1]["foo"] = "bar" result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) - assert (result['foo'][15:] == 'bar').all() - assert result['foo'][:15].isna().all() + assert (result["foo"][15:] == "bar").all() + assert result["foo"][:15].isna().all() def test_append_preserve_index_name(self): # #980 - df1 = DataFrame(columns=['A', 'B', 'C']) - df1 = df1.set_index(['A']) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], - columns=['A', 'B', 'C']) - df2 = df2.set_index(['A']) + df1 = DataFrame(columns=["A", "B", "C"]) + df1 = df1.set_index(["A"]) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) + df2 = df2.set_index(["A"]) result = df1.append(df2) - assert result.index.name == 'A' + assert result.index.name == "A" indexes_can_append = [ pd.RangeIndex(3), pd.Index([4, 5, 6]), pd.Index([4.5, 5.5, 6.5]), - pd.Index(list('abc')), - pd.CategoricalIndex('A B C'.split()), - pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.Index(list("abc")), + pd.CategoricalIndex("A B C".split()), + pd.CategoricalIndex("D E F".split(), ordered=True), pd.IntervalIndex.from_breaks([7, 8, 9, 10]), - pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 3, 7, 12)]), + pd.DatetimeIndex( + [ + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12), + ] + ), ] indexes_cannot_append_with_other = [ - pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), + pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) ] all_indexes = 
indexes_can_append + indexes_cannot_append_with_other - @pytest.mark.parametrize("index", - all_indexes, - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: x.__class__.__name__) def test_append_same_columns_type(self, index): # GH18359 @@ -921,9 +979,9 @@ def test_append_same_columns_type(self, index): ser_index = index[:2] ser = pd.Series([7, 8], index=ser_index, name=2) result = df.append(ser) - expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], - index=[0, 1, 2], - columns=index) + expected = pd.DataFrame( + [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + ) assert_frame_equal(result, expected) # ser wider than df @@ -932,14 +990,18 @@ def test_append_same_columns_type(self, index): df = pd.DataFrame([[1, 2], [4, 5]], columns=index) ser = pd.Series([7, 8, 9], index=ser_index, name=2) result = df.append(ser) - expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], - index=[0, 1, 2], - columns=ser_index) + expected = pd.DataFrame( + [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index, + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize("df_columns, series_index", - combinations(indexes_can_append, r=2), - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize( + "df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: x.__class__.__name__, + ) def test_append_different_columns_types(self, df_columns, series_index): # GH18359 # See also test 'test_append_different_columns_types_raises' below @@ -951,20 +1013,28 @@ def test_append_different_columns_types(self, df_columns, series_index): result = df.append(ser) idx_diff = ser.index.difference(df_columns) combined_columns = Index(df_columns.tolist()).append(idx_diff) - expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan], - [4, 5, 6, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 7, 8, 9]], - index=[0, 1, 2], - columns=combined_columns) + expected = pd.DataFrame( + [ + [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9], + ], + index=[0, 1, 2], + columns=combined_columns, + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('index_can_append', indexes_can_append, - ids=lambda x: x.__class__.__name__) - @pytest.mark.parametrize('index_cannot_append_with_other', - indexes_cannot_append_with_other, - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize( + "index_can_append", indexes_can_append, ids=lambda x: x.__class__.__name__ + ) + @pytest.mark.parametrize( + "index_cannot_append_with_other", + indexes_cannot_append_with_other, + ids=lambda x: x.__class__.__name__, + ) def test_append_different_columns_types_raises( - self, index_can_append, index_cannot_append_with_other): + self, index_can_append, index_cannot_append_with_other + ): # GH18359 # Dataframe.append will raise if MultiIndex appends # or is appended to a different index type @@ -973,17 +1043,19 @@ def test_append_different_columns_types_raises( # appending without raising. 
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) - ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, - name=2) - msg = (r"Expected tuple, got (int|long|float|str|" - r"pandas._libs.interval.Interval)|" - r"object of type '(int|float|Timestamp|" - r"pandas._libs.interval.Interval)' has no len\(\)|") + ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) + msg = ( + r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|" + ) with pytest.raises(TypeError, match=msg): df.append(ser) - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], - columns=index_cannot_append_with_other) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other + ) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) with pytest.raises(TypeError, match=msg): @@ -994,69 +1066,87 @@ def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 - df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0)], - columns=['start_time']) - df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), - dt.datetime(2013, 1, 4, 7, 10)]], - columns=['start_time', 'end_time']) - - expected = concat([Series([pd.NaT, - pd.NaT, - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10)], - name='end_time'), - Series([dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], - axis=1, sort=sort) + df1 = DataFrame( + index=[1, 2], + data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], + columns=["start_time"], + ) + df2 = DataFrame( + index=[4, 5], + data=[ + [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], + ], + columns=["start_time", "end_time"], + ) + + expected = concat( + [ + Series( + [ + pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10), + ], + name="end_time", + ), + Series( + [ + dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0), + ], + name="start_time", + ), + ], + axis=1, + sort=sort, + ) result = df1.append(df2, ignore_index=True, sort=sort) if sort: - expected = expected[['end_time', 'start_time']] + expected = expected[["end_time", "start_time"]] else: - expected = expected[['start_time', 'end_time']] + expected = expected[["start_time", "end_time"]] assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self, sort): - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) - df2 = DataFrame({'B': np.array([True, False, True, False], - dtype=bool)}) + df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) + df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) appended = df1.append(df2, ignore_index=True, sort=sort) - assert appended['A'].dtype == 'f8' - assert appended['B'].dtype == 'O' + assert appended["A"].dtype == "f8" + assert appended["B"].dtype == "O" def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 - date = Timestamp('2018-10-24 07:30:00', tz=dateutil.tz.tzutc()) - s = Series({'date': date, 'a': 1.0, 'b': 2.0}) - df = DataFrame(columns=['c', 'd']) + date = 
Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) + s = Series({"date": date, "a": 1.0, "b": 2.0}) + df = DataFrame(columns=["c", "d"]) result = df.append(s, ignore_index=True) # n.b. it's not clear to me that expected is correct here. # It's possible that the `date` column should have # datetime64[ns, tz] dtype for both result and expected. # that would be more consistent with new columns having # their own dtype (float for a and b, datetime64ns, tz for date). - expected = DataFrame([[np.nan, np.nan, 1., 2., date]], - columns=['c', 'd', 'a', 'b', 'date'], - dtype=object) + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]], + columns=["c", "d", "a", "b", "date"], + dtype=object, + ) # These columns get cast to object after append - expected['a'] = expected['a'].astype(float) - expected['b'] = expected['b'].astype(float) + expected["a"] = expected["a"].astype(float) + expected["b"] = expected["b"].astype(float) assert_frame_equal(result, expected) class TestConcatenate: - def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) - df3 = DataFrame({5: 'foo'}, index=range(4)) + df3 = DataFrame({5: "foo"}, index=range(4)) # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) @@ -1095,17 +1185,15 @@ def test_concat_with_group_keys(self): df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1]) - exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], - [0, 1, 2, 0, 1, 2, 3]]) - expected = DataFrame(np.r_[df.values, df2.values], - index=exp_index) + exp_index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] + ) + expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1]) - exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2]]) - expected = DataFrame(np.r_[df.values, df.values], - index=exp_index2) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) tm.assert_frame_equal(result, expected) # axis=1 @@ -1113,53 +1201,55 @@ def test_concat_with_group_keys(self): df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df2.values], - columns=exp_index) + expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df.values], - columns=exp_index2) + expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) tm.assert_frame_equal(result, expected) def test_concat_keys_specific_levels(self): df = DataFrame(np.random.randn(10, 4)) pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] - level = ['three', 'two', 'one', 'zero'] - result = concat(pieces, axis=1, keys=['one', 'two', 'three'], - levels=[level], - names=['group_key']) + level = ["three", "two", "one", "zero"] + result = concat( + pieces, + axis=1, + keys=["one", "two", "three"], + levels=[level], + names=["group_key"], + ) - tm.assert_index_equal(result.columns.levels[0], - Index(level, name='group_key')) - assert result.columns.names[0] == 'group_key' + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) + assert result.columns.names[0] == "group_key" def test_concat_dataframe_keys_bug(self, sort): - t1 = DataFrame({ - 
'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], - name='id'))}) - t2 = DataFrame({ - 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) + t1 = DataFrame( + {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} + ) + t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort) - assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] + result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) + assert list(result.columns) == [("t1", "value"), ("t2", "value")] def test_concat_series_partial_columns_names(self): # GH10698 - foo = Series([1, 2], name='foo') + foo = Series([1, 2], name="foo") bar = Series([1, 2]) baz = Series([4, 5]) result = concat([foo, bar, baz], axis=1) - expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ - 4, 5]}, columns=['foo', 0, 1]) + expected = DataFrame( + {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] + ) tm.assert_frame_equal(result, expected) - result = concat([foo, bar, baz], axis=1, keys=[ - 'red', 'blue', 'yellow']) - expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ - 4, 5]}, columns=['red', 'blue', 'yellow']) + result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) + expected = DataFrame( + {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, + columns=["red", "blue", "yellow"], + ) tm.assert_frame_equal(result, expected) result = concat([foo, bar, baz], axis=1, ignore_index=True) @@ -1167,10 +1257,12 @@ def test_concat_series_partial_columns_names(self): tm.assert_frame_equal(result, expected) def test_concat_dict(self): - frames = {'foo': DataFrame(np.random.randn(4, 3)), - 'bar': DataFrame(np.random.randn(4, 3)), - 'baz': DataFrame(np.random.randn(4, 3)), - 'qux': DataFrame(np.random.randn(4, 3))} + frames = { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } sorted_keys = com.dict_keys_to_ordered_list(frames) @@ -1179,198 +1271,226 @@ def test_concat_dict(self): tm.assert_frame_equal(result, expected) result = concat(frames, axis=1) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, - axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) tm.assert_frame_equal(result, expected) - keys = ['baz', 'foo', 'bar'] + keys = ["baz", "foo", "bar"] result = concat(frames, keys=keys) expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) def test_concat_ignore_index(self, sort): - frame1 = DataFrame({"test1": ["a", "b", "c"], - "test2": [1, 2, 3], - "test3": [4.5, 3.2, 1.2]}) + frame1 = DataFrame( + {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} + ) frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, - ignore_index=True, sort=sort) + v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) nan = np.nan - expected = DataFrame([[nan, nan, nan, 4.3], - ['a', 1, 4.5, 5.2], - ['b', 2, 3.2, 2.2], - ['c', 3, 1.2, nan]], - index=Index(["q", "x", "y", "z"])) + expected = DataFrame( + [ + [nan, nan, nan, 4.3], + ["a", 1, 4.5, 5.2], + ["b", 2, 3.2, 2.2], + ["c", 3, 1.2, nan], + ], + index=Index(["q", "x", "y", "z"]), + ) if not sort: - expected = expected.loc[['x', 'y', 'z', 'q']] + expected = expected.loc[["x", "y", "z", 
"q"]] tm.assert_frame_equal(v1, expected) def test_concat_multiindex_with_keys(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - result = concat([frame, frame], keys=[0, 1], names=['iteration']) - - assert result.index.names == ('iteration',) + index.names + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + result = concat([frame, frame], keys=[0, 1], names=["iteration"]) + + assert result.index.names == ("iteration",) + index.names tm.assert_frame_equal(result.loc[0], frame) tm.assert_frame_equal(result.loc[1], frame) assert result.index.nlevels == 3 def test_concat_multiindex_with_tz(self): # GH 6606 - df = DataFrame({'dt': [datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3)], - 'b': ['A', 'B', 'C'], - 'c': [1, 2, 3], 'd': [4, 5, 6]}) - df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) - df = df.set_index(['dt', 'b']) - - exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', - '2014-01-03'] * 2, - tz='US/Pacific', name='dt') - exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') + df = DataFrame( + { + "dt": [ + datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3), + ], + "b": ["A", "B", "C"], + "c": [1, 2, 3], + "d": [4, 5, 6], + } + ) + df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + df = df.set_index(["dt", "b"]) + + exp_idx1 = DatetimeIndex( + ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" + ) + exp_idx2 = Index(["A", "B", "C"] * 2, name="b") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, - index=exp_idx, columns=['c', 'd']) + expected = DataFrame( + {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] + ) result = concat([df, df]) tm.assert_frame_equal(result, expected) def test_concat_multiindex_with_none_in_index_names(self): # GH 15787 - index = pd.MultiIndex.from_product([[1], range(5)], - names=['level1', None]) - df = pd.DataFrame({'col': range(5)}, index=index, dtype=np.int32) - - result = concat([df, df], keys=[1, 2], names=['level2']) - index = pd.MultiIndex.from_product([[1, 2], [1], range(5)], - names=['level2', 'level1', None]) - expected = pd.DataFrame({'col': list(range(5)) * 2}, - index=index, dtype=np.int32) + index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) + df = pd.DataFrame({"col": range(5)}, index=index, dtype=np.int32) + + result = concat([df, df], keys=[1, 2], names=["level2"]) + index = pd.MultiIndex.from_product( + [[1, 2], [1], range(5)], names=["level2", "level1", None] + ) + expected = pd.DataFrame( + {"col": list(range(5)) * 2}, index=index, dtype=np.int32 + ) assert_frame_equal(result, expected) - result = concat([df, df[:2]], keys=[1, 2], names=['level2']) + result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) level2 = [1] * 5 + [2] * 2 level1 = [1] * 7 no_name = list(range(5)) + list(range(2)) tuples = list(zip(level2, level1, no_name)) - index = pd.MultiIndex.from_tuples(tuples, - names=['level2', 'level1', None]) - expected = 
pd.DataFrame({'col': no_name}, index=index, - dtype=np.int32) + index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) + expected = pd.DataFrame({"col": no_name}, index=index, dtype=np.int32) assert_frame_equal(result, expected) def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3)) df2 = DataFrame(np.random.randn(1, 4)) - levels = [['foo', 'baz'], ['one', 'two']] - names = ['first', 'second'] - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels, - names=names) + levels = [["foo", "baz"], ["one", "two"]] + names = ["first", "second"] + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + names=names, + ) expected = concat([df, df2, df, df2]) - exp_index = MultiIndex(levels=levels + [[0]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1], - [0, 0, 0, 0]], - names=names + [None]) + exp_index = MultiIndex( + levels=levels + [[0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], + names=names + [None], + ) expected.index = exp_index tm.assert_frame_equal(result, expected) # no names - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels) + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + ) assert result.index.names == (None,) * 3 # no levels - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - names=['first', 'second']) - assert result.index.names == ('first', 'second') + (None,) - tm.assert_index_equal(result.index.levels[0], - Index(['baz', 'foo'], name='first')) + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + names=["first", "second"], + ) + assert result.index.names == ("first", "second") + (None,) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) def test_concat_keys_levels_no_overlap(self): # GH #1406 - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) msg = "Values not found in passed level" with pytest.raises(ValueError, match=msg): - concat([df, df], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) msg = "Key one not in level" with pytest.raises(ValueError, match=msg): - concat([df, df2], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) def test_concat_rename_index(self): - a = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_a')) - b = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_b')) + a = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_a"), + ) + b = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_b"), + ) - result = concat([a, b], keys=['key0', 'key1'], - names=['lvl0', 'lvl1']) + result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) - exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) 
names = list(exp.index.names) - names[1] = 'lvl1' + names[1] = "lvl1" exp.index.set_names(names, inplace=True) tm.assert_frame_equal(result, exp) assert result.index.names == exp.index.names def test_crossed_dtypes_weird_corner(self): - columns = ['A', 'B', 'C', 'D'] - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), - 'B': np.array([1, 2, 3, 4], dtype='i8'), - 'C': np.array([1, 2, 3, 4], dtype='f8'), - 'D': np.array([1, 2, 3, 4], dtype='i8')}, - columns=columns) - - df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), - 'B': np.array([1, 2, 3, 4], dtype='f8'), - 'C': np.array([1, 2, 3, 4], dtype='i8'), - 'D': np.array([1, 2, 3, 4], dtype='f8')}, - columns=columns) + columns = ["A", "B", "C", "D"] + df1 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="f8"), + "B": np.array([1, 2, 3, 4], dtype="i8"), + "C": np.array([1, 2, 3, 4], dtype="f8"), + "D": np.array([1, 2, 3, 4], dtype="i8"), + }, + columns=columns, + ) + + df2 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="i8"), + "B": np.array([1, 2, 3, 4], dtype="f8"), + "C": np.array([1, 2, 3, 4], dtype="i8"), + "D": np.array([1, 2, 3, 4], dtype="f8"), + }, + columns=columns, + ) appended = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), - columns=columns) + expected = DataFrame( + np.concatenate([df1.values, df2.values], axis=0), columns=columns + ) tm.assert_frame_equal(appended, expected) - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - result = concat( - [df, df2], keys=['one', 'two'], names=['first', 'second']) - assert result.index.names == ('first', 'second') + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) + assert result.index.names == ("first", "second") def test_dups_index(self): # GH 4771 # single dtypes - df = DataFrame(np.random.randint(0, 10, size=40).reshape( - 10, 4), columns=['A', 'A', 'C', 'C']) + df = DataFrame( + np.random.randint(0, 10, size=40).reshape(10, 4), + columns=["A", "A", "C", "C"], + ) result = concat([df, df], axis=1) assert_frame_equal(result.iloc[:, :4], df) @@ -1381,12 +1501,15 @@ def test_dups_index(self): assert_frame_equal(result.iloc[10:], df) # multi dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) result = concat([df, df], axis=1) assert_frame_equal(result.iloc[:, :6], df) @@ -1410,32 +1533,33 @@ def test_dups_index(self): def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly - df1 = DataFrame({'A': 'foo', ('B', 1): 'bar'}, index=range(2)) - df2 = DataFrame({'B': 'foo', ('B', 1): 'bar'}, index=range(2)) + df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) + df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) # it works concat([df1, df2], sort=sort) def test_handle_empty_objects(self, sort): - df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) baz = df[:5].copy() - baz['foo'] = 'bar' + baz["foo"] = "bar" empty = df[5:5] frames = [baz, empty, empty, df[5:]] concatted 
= concat(frames, axis=0, sort=sort) - expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) - expected['foo'] = expected['foo'].astype('O') - expected.loc[0:4, 'foo'] = 'bar' + expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) + expected["foo"] = expected["foo"].astype("O") + expected.loc[0:4, "foo"] = "bar" tm.assert_frame_equal(concatted, expected) # empty as first element with time series # GH3259 - df = DataFrame(dict(A=range(10000)), index=date_range( - '20130101', periods=10000, freq='s')) + df = DataFrame( + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") + ) empty = DataFrame() result = concat([df, empty], axis=1) assert_frame_equal(result, df) @@ -1453,54 +1577,62 @@ def test_concat_mixed_objs(self): # G2385 # axis 1 - index = date_range('01-Jan-2013', periods=10, freq='H') - arr = np.arange(10, dtype='int64') + index = date_range("01-Jan-2013", periods=10, freq="H") + arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) df = DataFrame(arr.reshape(-1, 1), index=index) - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 0]) + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] + ) result = concat([df, df], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 1]) + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] + ) result = concat([s1, s2], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) result = concat([s1, s2, s1], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), - index=index, columns=[0, 0, 1, 2, 3]) + expected = DataFrame( + np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] + ) result = concat([s1, df, s2, s2, s1], axis=1) assert_frame_equal(result, expected) # with names - s1.name = 'foo' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 0]) + s1.name = "foo" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] + ) result = concat([s1, df, s2], axis=1) assert_frame_equal(result, expected) - s2.name = 'bar' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 'bar']) + s2.name = "bar" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] + ) result = concat([s1, df, s2], axis=1) assert_frame_equal(result, expected) # ignore index - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) result = concat([s1, df, s2], axis=1, ignore_index=True) assert_frame_equal(result, expected) # axis 0 - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), - index=index.tolist() * 3, columns=[0]) + expected = DataFrame( + np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] + ) result = concat([s1, df, s2]) assert_frame_equal(result, expected) @@ -1516,8 +1648,8 @@ def test_empty_dtype_coerce(self): # see below # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) - df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + df1 = 
DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) + df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) result = concat([df1, df2]) expected = df1.dtypes tm.assert_series_equal(result.dtypes, expected) @@ -1525,28 +1657,29 @@ def test_empty_dtype_coerce(self): def test_dtype_coerceion(self): # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) + df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 12045 import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) def test_concat_series(self): ts = tm.makeTimeSeries() - ts.name = 'foo' + ts.name = "foo" pieces = [ts[:5], ts[5:15], ts[15:]] @@ -1557,12 +1690,10 @@ def test_concat_series(self): result = concat(pieces, keys=[0, 1, 2]) expected = ts.copy() - ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) - exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), - np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], - codes=exp_codes) + exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) expected.index = exp_index tm.assert_series_equal(result, expected) @@ -1575,52 +1706,51 @@ def test_concat_series_axis1(self, sort=sort): expected = DataFrame(pieces).T assert_frame_equal(result, expected) - result = concat(pieces, keys=['A', 'B', 'C'], axis=1) - expected = DataFrame(pieces, index=['A', 'B', 'C']).T + result = concat(pieces, keys=["A", "B", "C"], axis=1) + expected = DataFrame(pieces, index=["A", "B", "C"]).T assert_frame_equal(result, expected) # preserve series names, #2489 - s = Series(randn(5), name='A') - s2 = Series(randn(5), name='B') + s = Series(randn(5), name="A") + s2 = Series(randn(5), name="B") result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) + expected = DataFrame({"A": s, "B": s2}) assert_frame_equal(result, expected) s2.name = None result = concat([s, s2], axis=1) - tm.assert_index_equal(result.columns, - Index(['A', 0], dtype='object')) + tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) # must reindex, #2603 - s = Series(randn(3), index=['c', 'a', 'b'], name='A') - s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') + s = Series(randn(3), index=["c", "a", "b"], name="A") + s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") result = concat([s, s2], axis=1, sort=sort) - expected = DataFrame({'A': s, 'B': s2}) + expected = DataFrame({"A": s, "B": s2}) assert_frame_equal(result, expected) def test_concat_series_axis1_names_applied(self): # ensure names argument is not ignored on axis=1, #23490 s = Series([1, 2, 3]) s2 = Series([4, 5, 6]) - result = concat([s, s2], axis=1, keys=['a', 'b'], names=['A']) - expected = DataFrame([[1, 4], [2, 5], [3, 6]], - columns=pd.Index(['a', 'b'], name='A')) + result = concat([s, s2], 
axis=1, keys=["a", "b"], names=["A"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], columns=pd.Index(["a", "b"], name="A") + ) assert_frame_equal(result, expected) - result = concat([s, s2], axis=1, keys=[('a', 1), ('b', 2)], - names=['A', 'B']) - expected = DataFrame([[1, 4], [2, 5], [3, 6]], - columns=MultiIndex.from_tuples([('a', 1), - ('b', 2)], - names=['A', 'B'])) + result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], + columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), + ) assert_frame_equal(result, expected) def test_concat_single_with_key(self): df = DataFrame(np.random.randn(10, 4)) - result = concat([df], keys=['foo']) - expected = concat([df, df], keys=['foo', 'bar']) + result = concat([df], keys=["foo"]) + expected = concat([df, df], keys=["foo", "bar"]) tm.assert_frame_equal(result, expected[:10]) def test_concat_exclude_none(self): @@ -1635,24 +1765,24 @@ def test_concat_exclude_none(self): def test_concat_datetime64_block(self): from pandas.core.indexes.datetimes import date_range - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) - df = DataFrame({'time': rng}) + df = DataFrame({"time": rng}) result = concat([df, df]) - assert (result.iloc[:10]['time'] == rng).all() - assert (result.iloc[10:]['time'] == rng).all() + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() def test_concat_timedelta64_block(self): from pandas import to_timedelta - rng = to_timedelta(np.arange(10), unit='s') + rng = to_timedelta(np.arange(10), unit="s") - df = DataFrame({'time': rng}) + df = DataFrame({"time": rng}) result = concat([df, df]) - assert (result.iloc[:10]['time'] == rng).all() - assert (result.iloc[10:]['time'] == rng).all() + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() def test_concat_keys_with_none(self): # #1649 @@ -1662,10 +1792,10 @@ def test_concat_keys_with_none(self): expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) tm.assert_frame_equal(result, expected) - result = concat([None, df0, df0[:2], df0[:1], df0], - keys=['a', 'b', 'c', 'd', 'e']) - expected = concat([df0, df0[:2], df0[:1], df0], - keys=['b', 'c', 'd', 'e']) + result = concat( + [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] + ) + expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) tm.assert_frame_equal(result, expected) def test_concat_bug_1719(self): @@ -1674,33 +1804,43 @@ def test_concat_bug_1719(self): # to join with union # these two are of different length! 
- left = concat([ts1, ts2], join='outer', axis=1) - right = concat([ts2, ts1], join='outer', axis=1) + left = concat([ts1, ts2], join="outer", axis=1) + right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) def test_concat_bug_2972(self): ts0 = Series(np.zeros(5)) ts1 = Series(np.ones(5)) - ts0.name = ts1.name = 'same name' + ts0.name = ts1.name = "same name" result = concat([ts0, ts1], axis=1) expected = DataFrame({0: ts0, 1: ts1}) - expected.columns = ['same name', 'same name'] + expected.columns = ["same name", "same name"] assert_frame_equal(result, expected) def test_concat_bug_3602(self): # GH 3602, duplicate columns - df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6], - 'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']}) - df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4], - 'prc': [6, 6, 6, 6]}) - expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], - [0, 6, 'rrr', 10, 2, 6], - [0, 6, 'rrr', 11, 3, 6], - [0, 6, 'rrr', 12, 4, 6]]) - expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc'] + df1 = DataFrame( + { + "firmNo": [0, 0, 0, 0], + "prc": [6, 6, 6, 6], + "stringvar": ["rrr", "rrr", "rrr", "rrr"], + } + ) + df2 = DataFrame( + {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} + ) + expected = DataFrame( + [ + [0, 6, "rrr", 9, 1, 6], + [0, 6, "rrr", 10, 2, 6], + [0, 6, "rrr", 11, 3, 6], + [0, 6, "rrr", 12, 4, 6], + ] + ) + expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] result = concat([df1, df2], axis=1) assert_frame_equal(result, expected) @@ -1708,17 +1848,17 @@ def test_concat_bug_3602(self): def test_concat_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() - df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') - df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + df_expected = pd.DataFrame({"a": []}, index=[], dtype="int64") - for how, expected in [('inner', df_expected), ('outer', df_a)]: + for how, expected in [("inner", df_expected), ("outer", df_a)]: result = pd.concat([df_a, df_empty], axis=1, join=how) assert_frame_equal(result, expected) def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] - s1 = Series(randn(len(dates)), index=dates, name='value') - s2 = Series(randn(len(dates)), index=dates, name='value') + dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] + s1 = Series(randn(len(dates)), index=dates, name="value") + s2 = Series(randn(len(dates)), index=dates, name="value") result = concat([s1, s2], axis=1, ignore_index=True) expected = Index([0, 1]) @@ -1733,13 +1873,12 @@ def test_concat_iterables(self): expected = DataFrame([1, 2, 3, 4, 5, 6]) assert_frame_equal(concat((df1, df2), ignore_index=True), expected) assert_frame_equal(concat([df1, df2], ignore_index=True), expected) - assert_frame_equal(concat((df for df in (df1, df2)), - ignore_index=True), expected) assert_frame_equal( - concat(deque((df1, df2)), ignore_index=True), expected) + concat((df for df in (df1, df2)), ignore_index=True), expected + ) + assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: - def __len__(self): return 2 @@ -1748,23 +1887,24 @@ def __getitem__(self, index): return {0: df1, 1: df2}[index] except KeyError: raise IndexError - assert_frame_equal(pd.concat(CustomIterator1(), - ignore_index=True), expected) - class CustomIterator2(abc.Iterable): + 
assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) + class CustomIterator2(abc.Iterable): def __iter__(self): yield df1 yield df2 - assert_frame_equal(pd.concat(CustomIterator2(), - ignore_index=True), expected) + + assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = mkdf(10, 2) - msg = ("cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid") + msg = ( + "cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid" + ) for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): concat([df1, obj]) @@ -1772,8 +1912,10 @@ def test_concat_invalid(self): def test_concat_invalid_first_argument(self): df1 = mkdf(10, 2) df2 = mkdf(10, 2) - msg = ('first argument must be an iterable of pandas ' - 'objects, you passed an object of type "DataFrame"') + msg = ( + "first argument must be an iterable of pandas " + 'objects, you passed an object of type "DataFrame"' + ) with pytest.raises(TypeError, match=msg): concat(df1, df2) @@ -1799,25 +1941,23 @@ def test_concat_invalid_first_argument(self): def test_concat_NaT_series(self): # GH 11693 # test for merging NaT series with datetime series. - x = Series(date_range('20151124 08:00', '20151124 09:00', - freq='1h', tz='US/Eastern')) - y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + x = Series( + date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") + ) + y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") expected = Series([x[0], x[1], pd.NaT, pd.NaT]) result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # all NaT with tz - expected = Series(pd.NaT, index=range(4), - dtype='datetime64[ns, US/Eastern]') + expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") result = pd.concat([y, y], ignore_index=True) tm.assert_series_equal(result, expected) # without tz - x = pd.Series(pd.date_range('20151124 08:00', - '20151124 09:00', freq='1h')) - y = pd.Series(pd.date_range('20151124 10:00', - '20151124 11:00', freq='1h')) + x = pd.Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) + y = pd.Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) y[:] = pd.NaT expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) result = pd.concat([x, y], ignore_index=True) @@ -1825,15 +1965,18 @@ def test_concat_NaT_series(self): # all NaT without tz x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), - dtype='datetime64[ns]') + expected = pd.Series(pd.NaT, index=range(4), dtype="datetime64[ns]") result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), - B=pd.Timestamp('20130603', tz='CET')), - index=range(5)) + df2 = DataFrame( + dict( + A=pd.Timestamp("20130102", tz="US/Eastern"), + B=pd.Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) # concat df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) @@ -1841,250 +1984,270 @@ def test_concat_tz_frame(self): def test_concat_tz_series(self): # gh-11755: tz and no tz - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(date_range('2012-01-01', '2012-01-02')) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') + x = Series(date_range("20151124 08:00", 
"20151124 09:00", freq="1h", tz="UTC")) + y = Series(date_range("2012-01-01", "2012-01-02")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # gh-11887: concat tz and object - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(['a', 'b']) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(["a", "b"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # see gh-12217 and gh-12306 # Concatenating two UTC times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('UTC') + first[0] = first[0].dt.tz_localize("UTC") second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('UTC') + second[0] = second[0].dt.tz_localize("UTC") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, UTC]' + assert result[0].dtype == "datetime64[ns, UTC]" # Concatenating two London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" # Concatenating 2+1 London times first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" # Concat'ing 1+2 London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta - x = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-02-01', tz='US/Eastern')] - y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] + x = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-02-01", tz="US/Eastern"), + ] + y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) # tz and period - y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] + y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - 
tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) def test_concat_tz_series_tzlocal(self): # see gh-13583 - x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] - y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + x = [ + pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), + ] + y = [ + pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), + ] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y)) - assert result.dtype == 'datetime64[ns, tzlocal()]' + assert result.dtype == "datetime64[ns, tzlocal()]" - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) - @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')]) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): # GH 12396 # tz-naive first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( - lambda x: x.dt.tz_localize(tz1)) + lambda x: x.dt.tz_localize(tz1) + ) second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) result = pd.concat([first, second], axis=0) - expected = pd.DataFrame(pd.Series( - [pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): # GH 12396 first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) - second = pd.DataFrame(pd.Series( - [pd.NaT]).dt.tz_localize(tz2), columns=[1]) + second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize(tz2), columns=[1]) expected = pd.DataFrame( - {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), - 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)} + { + 0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), + 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), + } ) result = pd.concat([first, second], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): # GH 12396 # tz-naive first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) - second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)], - [pd.Timestamp('2016/01/01', tz=tz2)]], - index=[2, 3]) + second = pd.DataFrame( + [ + [pd.Timestamp("2015/01/01", tz=tz2)], + [pd.Timestamp("2016/01/01", tz=tz2)], + ], + index=[2, 3], + ) - expected = pd.DataFrame([pd.NaT, pd.NaT, - pd.Timestamp('2015/01/01', tz=tz2), - pd.Timestamp('2016/01/01', tz=tz2)]) + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz2), + pd.Timestamp("2016/01/01", tz=tz2), + ] + ) if tz1 != tz2: expected = 
expected.astype(object) result = pd.concat([first, second]) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_concat_NaT_dataframes(self, tz): # GH 12396 first = pd.DataFrame([[pd.NaT], [pd.NaT]]) first = first.apply(lambda x: x.dt.tz_localize(tz)) - second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)], - [pd.Timestamp('2016/01/01', tz=tz)]], - index=[2, 3]) - expected = pd.DataFrame([pd.NaT, pd.NaT, - pd.Timestamp('2015/01/01', tz=tz), - pd.Timestamp('2016/01/01', tz=tz)]) + second = pd.DataFrame( + [[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], + index=[2, 3], + ) + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz), + pd.Timestamp("2016/01/01", tz=tz), + ] + ) result = pd.concat([first, second], axis=0) assert_frame_equal(result, expected) def test_concat_period_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='Period[D]') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_period_multiple_freq_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" def test_concat_period_other_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" # non-period - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(['A', 'B']) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(["A", "B"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) 
tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" def test_concat_empty_series(self): # GH 11082 - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y") res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}, - index=pd.Index([0, 1, 2], dtype='O')) + exp = pd.DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype="O"), + ) tm.assert_frame_equal(res, exp) - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y") res = pd.concat([s1, s2], axis=0) # name will be reset exp = pd.Series([1, 2, 3]) tm.assert_series_equal(res, exp) # empty Series with no name - s1 = pd.Series([1, 2, 3], name='x') + s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series(name=None) res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0], - index=pd.Index([0, 1, 2], dtype='O')) + exp = pd.DataFrame( + {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=["x", 0], + index=pd.Index([0, 1, 2], dtype="O"), + ) tm.assert_frame_equal(res, exp) - @pytest.mark.parametrize('tz', [None, 'UTC']) - @pytest.mark.parametrize('values', [[], [1, 2, 3]]) + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("values", [[], [1, 2, 3]]) def test_concat_empty_series_timelike(self, tz, values): # GH 18447 - first = Series([], dtype='M8[ns]').dt.tz_localize(tz) + first = Series([], dtype="M8[ns]").dt.tz_localize(tz) second = Series(values) expected = DataFrame( - {0: pd.Series([pd.NaT] * len(values), - dtype='M8[ns]' - ).dt.tz_localize(tz), - 1: values}) + { + 0: pd.Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), + 1: values, + } + ) result = concat([first, second], axis=1) assert_frame_equal(result, expected) def test_default_index(self): # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series([4, 5, 6], name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series([4, 5, 6], name="y") res = pd.concat([s1, s2], axis=1, ignore_index=True) assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) # use check_index_type=True to check the result have # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_series and all inputs have no names s1 = pd.Series([1, 2, 3]) @@ -2093,23 +2256,19 @@ def test_default_index(self): assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_dataframe and ignore_index - df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) - df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + df1 = pd.DataFrame({"A": [1, 2], "B": [5, 6]}) + df2 = pd.DataFrame({"A": [3, 4], "B": [7, 8]}) res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], - columns=['A', 'B']) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) + tm.assert_frame_equal(res, exp, 
check_index_type=True, check_column_type=True) res = pd.concat([df1, df2], axis=1, ignore_index=True) exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) def test_concat_multiindex_rangeindex(self): # GH13542 @@ -2117,9 +2276,10 @@ def test_concat_multiindex_rangeindex(self): # there is a bug in concat with objects of len 1 df = DataFrame(np.random.randn(9, 2)) - df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)], - codes=[np.repeat(np.arange(3), 3), - np.tile(np.arange(3), 3)]) + df.index = MultiIndex( + levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], + ) res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) exp = df.iloc[[2, 3, 4, 5], :] @@ -2128,22 +2288,23 @@ def test_concat_multiindex_rangeindex(self): def test_concat_multiindex_dfs_with_deepcopy(self): # GH 9967 from copy import deepcopy - example_multiindex1 = pd.MultiIndex.from_product([['a'], ['b']]) + + example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) example_dataframe1 = pd.DataFrame([0], index=example_multiindex1) - example_multiindex2 = pd.MultiIndex.from_product([['a'], ['c']]) + example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) example_dataframe2 = pd.DataFrame([1], index=example_multiindex2) - example_dict = {'s1': example_dataframe1, 's2': example_dataframe2} - expected_index = pd.MultiIndex(levels=[['s1', 's2'], - ['a'], - ['b', 'c']], - codes=[[0, 1], [0, 0], [0, 1]], - names=['testname', None, None]) + example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} + expected_index = pd.MultiIndex( + levels=[["s1", "s2"], ["a"], ["b", "c"]], + codes=[[0, 1], [0, 0], [0, 1]], + names=["testname", None, None], + ) expected = pd.DataFrame([[0], [1]], index=expected_index) - result_copy = pd.concat(deepcopy(example_dict), names=['testname']) + result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) tm.assert_frame_equal(result_copy, expected) - result_no_copy = pd.concat(example_dict, names=['testname']) + result_no_copy = pd.concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) def test_categorical_concat_append(self): @@ -2152,8 +2313,7 @@ def test_categorical_concat_append(self): df = DataFrame({"cats": cat, "vals": vals}) cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) vals2 = [1, 2, 1, 2] - exp = DataFrame({"cats": cat2, "vals": vals2}, - index=Index([0, 1, 0, 1])) + exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) tm.assert_frame_equal(pd.concat([df, df]), exp) tm.assert_frame_equal(df.append(df), exp) @@ -2164,7 +2324,7 @@ def test_categorical_concat_append(self): df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) tm.assert_frame_equal(res, exp) res = df.append(df_different_categories, ignore_index=True) @@ -2173,115 +2333,132 @@ def test_categorical_concat_append(self): def test_categorical_concat_dtypes(self): # GH8143 - index = ['cat', 'obj', 'num'] - cat = Categorical(['a', 'b', 'c']) - obj = Series(['a', 'b', 'c']) + index = ["cat", "obj", "num"] + cat = Categorical(["a", "b", "c"]) + obj = Series(["a", "b", "c"]) num = Series([1, 2, 3]) df = 
pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == 'object' + result = df.dtypes == "object" expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) - result = df.dtypes == 'int64' + result = df.dtypes == "int64" expected = Series([False, False, True], index=index) tm.assert_series_equal(result, expected) - result = df.dtypes == 'category' + result = df.dtypes == "category" expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) def test_categorical_concat(self, sort): # See GH 10177 - df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3), - columns=["a", "b", "c"]) + df1 = DataFrame( + np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] + ) - df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2), - columns=["a", "c"]) + df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) cat_values = ["one", "one", "two", "one", "two", "two", "one"] - df2['h'] = Series(Categorical(cat_values)) + df2["h"] = Series(Categorical(cat_values)) res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) - exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], - 'h': [None] * 6 + cat_values}) + exp = DataFrame( + { + "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + "b": [ + 1, + 4, + 7, + 10, + 13, + 16, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + "h": [None] * 6 + cat_values, + } + ) tm.assert_frame_equal(res, exp) def test_categorical_concat_gh7864(self): # GH 7864 # make sure ordering is preserved - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')}) + df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) df["grade"] = Categorical(df["raw_grade"]) - df['grade'].cat.set_categories(['e', 'a', 'b']) + df["grade"].cat.set_categories(["e", "a", "b"]) df1 = df[0:3] df2 = df[3:] - tm.assert_index_equal(df['grade'].cat.categories, - df1['grade'].cat.categories) - tm.assert_index_equal(df['grade'].cat.categories, - df2['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) dfx = pd.concat([df1, df2]) - tm.assert_index_equal(df['grade'].cat.categories, - dfx['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) dfa = df1.append(df2) - tm.assert_index_equal(df['grade'].cat.categories, - dfa['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) def test_categorical_concat_preserve(self): # GH 8641 series concat not preserving category dtype # GH 13524 can concat different categories - s = Series(list('abc'), dtype='category') - s2 = Series(list('abd'), dtype='category') + s = Series(list("abc"), dtype="category") + s2 = Series(list("abd"), dtype="category") - exp = Series(list('abcabd')) + exp = Series(list("abcabd")) res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), dtype='category') + exp = Series(list("abcabc"), dtype="category") res = pd.concat([s, s], ignore_index=True) tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], - dtype='category') + exp = 
Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") res = pd.concat([s, s]) tm.assert_series_equal(res, exp) - a = Series(np.arange(6, dtype='int64')) - b = Series(list('aabbca')) + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) - df2 = DataFrame({'A': a, - 'B': b.astype(CategoricalDtype(list('cab')))}) + df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) res = pd.concat([df2, df2]) exp = DataFrame( - {'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))}) + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ) tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): - a = Series(np.arange(6, dtype='int64')) - b = Series(list('aabbca')) + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) - df2 = DataFrame({'A': a, - 'B': b.astype(CategoricalDtype(list('cab'))) - }).set_index('B') + df2 = DataFrame( + {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} + ).set_index("B") result = pd.concat([df2, df2]) expected = DataFrame( - {'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab'))) - }).set_index('B') + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") tm.assert_frame_equal(result, expected) # wrong categories - df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe')) - }).set_index('B') + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") msg = "categories must match existing categories when appending" with pytest.raises(TypeError, match=msg): pd.concat([df2, df3]) @@ -2290,28 +2467,30 @@ def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] - a = pd.Series(1, index=pd.CategoricalIndex([9, 0], - categories=categories)) - b = pd.Series(2, index=pd.CategoricalIndex([0, 1], - categories=categories)) - c = pd.Series(3, index=pd.CategoricalIndex([1, 2], - categories=categories)) + a = pd.Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = pd.Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = pd.Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) result = pd.concat([a, b, c], axis=1) exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) - exp = pd.DataFrame({0: [1, 1, np.nan, np.nan], - 1: [np.nan, 2, 2, np.nan], - 2: [np.nan, np.nan, 3, 3]}, - columns=[0, 1, 2], - index=exp_idx) + exp = pd.DataFrame( + { + 0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3], + }, + columns=[0, 1, 2], + index=exp_idx, + ) tm.assert_frame_equal(result, exp) def test_concat_order(self): # GH 17344 - dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] - dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) - for i in range(100)] + dfs = [pd.DataFrame(index=range(3), columns=["a", 1, None])] + dfs += [ + pd.DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100) + ] result = pd.concat(dfs, sort=True).columns expected = dfs[0].columns @@ -2319,49 +2498,69 @@ def test_concat_order(self): def test_concat_datetime_timezone(self): # GH 18523 - idx1 = pd.date_range('2011-01-01', periods=3, freq='H', - tz='Europe/Paris') - idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') - df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) - df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2) + idx1 = 
pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1) + df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2) result = pd.concat([df1, df2], axis=1) - exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00', - '2011-01-01 01:00:00+01:00', - '2011-01-01 02:00:00+01:00'], - freq='H' - ).tz_convert('UTC').tz_convert('Europe/Paris') + exp_idx = ( + DatetimeIndex( + [ + "2011-01-01 00:00:00+01:00", + "2011-01-01 01:00:00+01:00", + "2011-01-01 02:00:00+01:00", + ], + freq="H", + ) + .tz_convert("UTC") + .tz_convert("Europe/Paris") + ) - expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame( + [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] + ) tm.assert_frame_equal(result, expected) - idx3 = pd.date_range('2011-01-01', periods=3, - freq='H', tz='Asia/Tokyo') - df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) + idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + df3 = pd.DataFrame({"b": [1, 2, 3]}, index=idx3) result = pd.concat([df1, df3], axis=1) - exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00', - '2010-12-31 16:00:00+00:00', - '2010-12-31 17:00:00+00:00', - '2010-12-31 23:00:00+00:00', - '2011-01-01 00:00:00+00:00', - '2011-01-01 01:00:00+00:00'] - ) + exp_idx = DatetimeIndex( + [ + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", + ] + ) - expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3], - [1, np.nan], [2, np.nan], [3, np.nan]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame( + [ + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], + ], + index=exp_idx, + columns=["a", "b"], + ) tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample - result = pd.concat([df1.resample('H').mean(), - df2.resample('H').mean()], sort=True) - expected = pd.DataFrame({'a': [1, 2, 3] + [np.nan] * 3, - 'b': [np.nan] * 3 + [1, 2, 3]}, - index=idx1.append(idx1)) + result = pd.concat( + [df1.resample("H").mean(), df2.resample("H").mean()], sort=True + ) + expected = pd.DataFrame( + {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, + index=idx1.append(idx1), + ) tm.assert_frame_equal(result, expected) def test_concat_different_extension_dtypes_upcasts(self): @@ -2369,50 +2568,55 @@ def test_concat_different_extension_dtypes_upcasts(self): b = pd.Series(to_decimal([1, 2])) result = pd.concat([a, b], ignore_index=True) - expected = pd.Series([ - 1, 2, - Decimal(1), Decimal(2) - ], dtype=object) + expected = pd.Series([1, 2, Decimal(1), Decimal(2)], dtype=object) tm.assert_series_equal(result, expected) def test_concat_odered_dict(self): # GH 21510 - expected = pd.concat([pd.Series(range(3)), pd.Series(range(4))], - keys=['First', 'Another']) - result = pd.concat(OrderedDict([('First', pd.Series(range(3))), - ('Another', pd.Series(range(4)))])) + expected = pd.concat( + [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] + ) + result = pd.concat( + OrderedDict( + [("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))] + ) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize('dt', np.sctypes['float']) +@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) 
+@pytest.mark.parametrize("dt", np.sctypes["float"]) def test_concat_no_unnecessary_upcast(dt, pdt): # GH 13247 dims = pdt().ndim - dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], dtype=dt, ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims))] + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] x = pd.concat(dfs) assert x.values.dtype == dt -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize('dt', np.sctypes['int']) +@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["int"]) def test_concat_will_upcast(dt, pdt): with catch_warnings(record=True): dims = pdt().ndim - dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims))] + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] x = pd.concat(dfs) - assert x.values.dtype == 'float64' + assert x.values.dtype == "float64" def test_concat_empty_and_non_empty_frame_regression(): # GH 18178 regression test - df1 = pd.DataFrame({'foo': [1]}) - df2 = pd.DataFrame({'foo': []}) - expected = pd.DataFrame({'foo': [1.0]}) + df1 = pd.DataFrame({"foo": [1]}) + df2 = pd.DataFrame({"foo": []}) + expected = pd.DataFrame({"foo": [1.0]}) result = pd.concat([df1, df2]) assert_frame_equal(result, expected) @@ -2428,17 +2632,17 @@ def test_concat_empty_and_non_empty_series_regression(): def test_concat_sorts_columns(sort_with_none): # GH-4588 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) # for sort=True/None - expected = pd.DataFrame({"a": [1, 2, 3, 4], - "b": [1, 2, None, None], - "c": [None, None, 5, 6]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, + columns=["a", "b", "c"], + ) if sort_with_none is False: - expected = expected[['b', 'a', 'c']] + expected = expected[["b", "a", "c"]] if sort_with_none is None: # only warn if not explicitly specified @@ -2453,15 +2657,15 @@ def test_concat_sorts_columns(sort_with_none): def test_concat_sorts_index(sort_with_none): - df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) - df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) + df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) # For True/None - expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, - index=['a', 'b', 'c'], - columns=['a', 'b']) + expected = pd.DataFrame( + {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] + ) if sort_with_none is False: - expected = expected.loc[['c', 'a', 'b']] + expected = expected.loc[["c", "a", "b"]] if sort_with_none is None: # only warn if not explicitly specified @@ -2477,78 +2681,76 @@ def test_concat_sorts_index(sort_with_none): def test_concat_inner_sort(sort_with_none): # https://github.com/pandas-dev/pandas/pull/20613 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, - columns=['b', 'a', 'c']) - df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) + df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) with 
tm.assert_produces_warning(None): # unset sort should *not* warn for inner join # since that never sorted - result = pd.concat([df1, df2], sort=sort_with_none, - join='inner', - ignore_index=True) + result = pd.concat( + [df1, df2], sort=sort_with_none, join="inner", ignore_index=True + ) - expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, - columns=['b', 'a']) + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) if sort_with_none is True: - expected = expected[['a', 'b']] + expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort(): # GH-4588 - df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, - columns=['c', 'b', 'a']) + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) result = pd.concat([df, df], sort=True, ignore_index=True) - expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4], - 'c': [1, 2, 1, 2]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, + columns=["a", "b", "c"], + ) tm.assert_frame_equal(result, expected) - result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True, - ignore_index=True) - expected = expected[['b', 'c']] + result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) + expected = expected[["b", "c"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort_does_not_raise(): # GH-4588 # We catch TypeErrors from sorting internally and do not re-raise. - df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) - expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=[1, 'a']) + df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) + expected = pd.DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("s1name,s2name", [ - (np.int64(190), (43, 0)), (190, (43, 0))]) +@pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) def test_concat_series_name_npscalar_tuple(s1name, s2name): # GH21015 - s1 = pd.Series({'a': 1, 'b': 2}, name=s1name) - s2 = pd.Series({'c': 5, 'd': 6}, name=s2name) + s1 = pd.Series({"a": 1, "b": 2}, name=s1name) + s2 = pd.Series({"c": 5, "d": 6}, name=s2name) result = pd.concat([s1, s2]) - expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6}) + expected = pd.Series({"a": 1, "b": 2, "c": 5, "d": 6}) tm.assert_series_equal(result, expected) def test_concat_categorical_tz(): # GH-23816 - a = pd.Series(pd.date_range('2017-01-01', periods=2, tz='US/Pacific')) - b = pd.Series(['a', 'b'], dtype='category') + a = pd.Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) + b = pd.Series(["a", "b"], dtype="category") result = pd.concat([a, b], ignore_index=True) - expected = pd.Series([ - pd.Timestamp('2017-01-01', tz="US/Pacific"), - pd.Timestamp('2017-01-02', tz="US/Pacific"), - 'a', 'b' - ]) + expected = pd.Series( + [ + pd.Timestamp("2017-01-01", tz="US/Pacific"), + pd.Timestamp("2017-01-02", tz="US/Pacific"), + "a", + "b", + ] + ) tm.assert_series_equal(result, expected) def test_concat_datetimeindex_freq(): # GH 3232 # Monotonic index result - dr = pd.date_range('01-Jan-2013', periods=100, freq='50L', tz='UTC') + dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") data = list(range(100)) expected = pd.DataFrame(data, index=dr) result = pd.concat([expected[:50], expected[50:]]) @@ 
-2556,7 +2758,6 @@ def test_concat_datetimeindex_freq(): # Non-monotonic index result result = pd.concat([expected[50:], expected[:50]]) - expected = pd.DataFrame(data[50:] + data[:50], - index=dr[50:].append(dr[:50])) + expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) expected.index.freq = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index f71730fb4a3134..a2ebf2359f55fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -3,9 +3,22 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex, - Series, TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, - timedelta_range, to_datetime) + Categorical, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, + to_datetime, +) from pandas.api.types import CategoricalDtype as CDT import pandas.core.reshape.tile as tmod import pandas.util.testing as tm @@ -20,7 +33,7 @@ def test_simple(): def test_bins(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -28,12 +41,11 @@ def test_bins(): expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) - tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, - 6.53333333, 9.7])) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) def test_right(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -45,7 +57,7 @@ def test_right(): def test_no_right(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed="left") @@ -57,7 +69,7 @@ def test_no_right(): def test_array_like(): - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -65,8 +77,7 @@ def test_array_like(): expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) - tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, - 6.53333333, 9.7])) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) def test_bins_from_interval_index(): @@ -75,9 +86,9 @@ def test_bins_from_interval_index(): result = cut(range(5), bins=expected.categories) tm.assert_categorical_equal(result, expected) - expected = Categorical.from_codes(np.append(c.codes, -1), - categories=c.categories, - ordered=True) + expected = Categorical.from_codes( + np.append(c.codes, -1), categories=c.categories, ordered=True + ) result = cut(range(6), bins=expected.categories) tm.assert_categorical_equal(result, expected) @@ -91,8 +102,7 @@ def test_bins_from_interval_index_doc_example(): result = cut([25, 20, 50], bins=c.categories) tm.assert_index_equal(result.categories, expected) - tm.assert_numpy_array_equal(result.codes, - np.array([1, 1, 2], dtype="int8")) + tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8")) def 
test_bins_not_overlapping_from_interval_index(): @@ -106,35 +116,52 @@ def test_bins_not_overlapping_from_interval_index(): def test_bins_not_monotonic(): msg = "bins must increase monotonically" - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] with pytest.raises(ValueError, match=msg): cut(data, [0.1, 1.5, 1, 10]) -@pytest.mark.parametrize("x, bins, expected", [ - (date_range("2017-12-31", periods=3), - [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max], - IntervalIndex.from_tuples([ - (Timestamp.min, Timestamp('2018-01-01')), - (Timestamp('2018-01-01'), Timestamp.max)])), - - ([-1, 0, 1], - np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], - dtype="int64"), - IntervalIndex.from_tuples([ - (np.iinfo(np.int64).min, 0), - (0, np.iinfo(np.int64).max)])), - - ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], - np.array([ - np.timedelta64(-np.iinfo(np.int64).max), - np.timedelta64(0), - np.timedelta64(np.iinfo(np.int64).max)]), - IntervalIndex.from_tuples([ - (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), - (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])), -]) +@pytest.mark.parametrize( + "x, bins, expected", + [ + ( + date_range("2017-12-31", periods=3), + [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max], + IntervalIndex.from_tuples( + [ + (Timestamp.min, Timestamp("2018-01-01")), + (Timestamp("2018-01-01"), Timestamp.max), + ] + ), + ), + ( + [-1, 0, 1], + np.array( + [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64" + ), + IntervalIndex.from_tuples( + [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)] + ), + ), + ( + [np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + np.array( + [ + np.timedelta64(-np.iinfo(np.int64).max), + np.timedelta64(0), + np.timedelta64(np.iinfo(np.int64).max), + ] + ), + IntervalIndex.from_tuples( + [ + (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), + (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)), + ] + ), + ), + ], +) def test_bins_monotonic_not_overflowing(x, bins, expected): # GH 26045 result = cut(x, bins) @@ -143,16 +170,19 @@ def test_bins_monotonic_not_overflowing(x, bins, expected): def test_wrong_num_labels(): msg = "Bin labels must be one fewer than the number of bin edges" - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] with pytest.raises(ValueError, match=msg): cut(data, [0, 1, 10], labels=["foo", "bar", "baz"]) -@pytest.mark.parametrize("x,bins,msg", [ - ([], 2, "Cannot cut empty array"), - ([1, 2, 3], 0.5, "`bins` should be a positive integer") -]) +@pytest.mark.parametrize( + "x,bins,msg", + [ + ([], 2, "Cannot cut empty array"), + ([1, 2, 3], 0.5, "`bins` should be a positive integer"), + ], +) def test_cut_corner(x, bins, msg): with pytest.raises(ValueError, match=msg): cut(x, bins) @@ -166,13 +196,17 @@ def test_cut_not_1d_arg(arg, cut_func): cut_func(arg, 2) -@pytest.mark.parametrize('data', [ - [0, 1, 2, 3, 4, np.inf], - [-np.inf, 0, 1, 2, 3, 4], - [-np.inf, 0, 1, 2, 3, 4, np.inf]]) +@pytest.mark.parametrize( + "data", + [ + [0, 1, 2, 3, 4, np.inf], + [-np.inf, 0, 1, 2, 3, 4], + [-np.inf, 0, 1, 2, 3, 4, np.inf], + ], +) def test_int_bins_with_inf(data): # GH 24314 - msg = 'cannot specify integer `bins` when input data contains infinity' + msg = "cannot specify integer `bins` when input data contains infinity" with pytest.raises(ValueError, match=msg): cut(data, bins=3) @@ -188,10 +222,13 @@ def test_cut_out_of_range_more(): tm.assert_series_equal(ind, exp) 
-@pytest.mark.parametrize("right,breaks,closed", [ - (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"), - (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left") -]) +@pytest.mark.parametrize( + "right,breaks,closed", + [ + (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"), + (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"), + ], +) def test_labels(right, breaks, closed): arr = np.tile(np.arange(0, 1.01, 0.1), 4) @@ -254,14 +291,23 @@ def test_cut_out_of_bounds(): tm.assert_numpy_array_equal(mask, ex_mask) -@pytest.mark.parametrize("get_labels,get_expected", [ - (lambda labels: labels, - lambda labels: Categorical(["Medium"] + 4 * ["Small"] + - ["Medium", "Large"], - categories=labels, ordered=True)), - (lambda labels: Categorical.from_codes([0, 1, 2], labels), - lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)) -]) +@pytest.mark.parametrize( + "get_labels,get_expected", + [ + ( + lambda labels: labels, + lambda labels: Categorical( + ["Medium"] + 4 * ["Small"] + ["Medium", "Large"], + categories=labels, + ordered=True, + ), + ), + ( + lambda labels: Categorical.from_codes([0, 1, 2], labels), + lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels), + ), + ], +) def test_cut_pass_labels(get_labels, get_expected): bins = [0, 25, 50, 100] arr = [50, 5, 10, 15, 20, 30, 70] @@ -277,23 +323,25 @@ def test_cut_pass_labels_compat(): labels = ["Good", "Medium", "Bad"] result = cut(arr, 3, labels=labels) - exp = cut(arr, 3, labels=Categorical(labels, categories=labels, - ordered=True)) + exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True)) tm.assert_categorical_equal(result, exp) -@pytest.mark.parametrize("x", [np.arange(11.), np.arange(11.) / 1e10]) +@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10]) def test_round_frac_just_works(x): # It works. 
cut(x, 2) -@pytest.mark.parametrize("val,precision,expected", [ - (-117.9998, 3, -118), - (117.9998, 3, 118), - (117.9998, 2, 118), - (0.000123456, 2, 0.00012) -]) +@pytest.mark.parametrize( + "val,precision,expected", + [ + (-117.9998, 3, -118), + (117.9998, 3, 118), + (117.9998, 2, 118), + (0.000123456, 2, 0.00012), + ], +) def test_round_frac(val, precision, expected): # see gh-1979 result = tmod._round_frac(val, precision=precision) @@ -307,8 +355,11 @@ def test_cut_return_intervals(): exp_bins = np.linspace(0, 8, num=4).round(3) exp_bins[0] -= 0.008 - expected = Series(IntervalIndex.from_breaks(exp_bins, closed="right").take( - [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex.from_breaks(exp_bins, closed="right").take( + [0, 0, 0, 1, 1, 1, 2, 2, 2] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) @@ -317,17 +368,21 @@ def test_series_ret_bins(): ser = Series(np.arange(4)) result, bins = cut(ser, 2, retbins=True) - expected = Series(IntervalIndex.from_breaks( - [-0.003, 1.5, 3], closed="right").repeat(2)).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(duplicates="drop"), None), - (dict(), "Bin edges must be unique"), - (dict(duplicates="raise"), "Bin edges must be unique"), - (dict(duplicates="foo"), "invalid value for 'duplicates' parameter") -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) def test_cut_duplicates_bin(kwargs, msg): # see gh-20947 bins = [0, 2, 4, 6, 10, 10] @@ -354,8 +409,8 @@ def test_single_bin(data, length): @pytest.mark.parametrize( - "array_1_writeable,array_2_writeable", - [(True, True), (True, False), (False, False)]) + "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)] +) def test_cut_read_only(array_1_writeable, array_2_writeable): # issue 18773 array_1 = np.arange(0, 100, 10) @@ -365,58 +420,93 @@ def test_cut_read_only(array_1_writeable, array_2_writeable): array_2.flags.writeable = array_2_writeable hundred_elements = np.arange(100) - tm.assert_categorical_equal(cut(hundred_elements, array_1), - cut(hundred_elements, array_2)) + tm.assert_categorical_equal( + cut(hundred_elements, array_1), cut(hundred_elements, array_2) + ) -@pytest.mark.parametrize("conv", [ - lambda v: Timestamp(v), - lambda v: to_datetime(v), - lambda v: np.datetime64(v), - lambda v: Timestamp(v).to_pydatetime(), -]) +@pytest.mark.parametrize( + "conv", + [ + lambda v: Timestamp(v), + lambda v: to_datetime(v), + lambda v: np.datetime64(v), + lambda v: Timestamp(v).to_pydatetime(), + ], +) def test_datetime_bin(conv): data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")] bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"] - expected = Series(IntervalIndex([ - Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), - Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), + ] + ) + ).astype(CDT(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, 
bins=bins)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data", [ - to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), - [np.datetime64("2013-01-01"), np.datetime64("2013-01-02"), - np.datetime64("2013-01-03")], - np.array([np.datetime64("2013-01-01"), np.datetime64("2013-01-02"), - np.datetime64("2013-01-03")]), - DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]) -]) +@pytest.mark.parametrize( + "data", + [ + to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ], + np.array( + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ] + ), + DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]), + ], +) def test_datetime_cut(data): # see gh-14714 # # Testing time data when it comes in various collection types. result, _ = cut(data, 3, retbins=True) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:57:07.200000"), - Timestamp("2013-01-01 16:00:00")), - Interval(Timestamp("2013-01-01 16:00:00"), - Timestamp("2013-01-02 08:00:00")), - Interval(Timestamp("2013-01-02 08:00:00"), - Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000"), + Timestamp("2013-01-01 16:00:00"), + ), + Interval( + Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00") + ), + Interval( + Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00") + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize("bins", [ - 3, [Timestamp("2013-01-01 04:57:07.200000"), - Timestamp("2013-01-01 21:00:00"), - Timestamp("2013-01-02 13:00:00"), - Timestamp("2013-01-03 05:00:00")]]) +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000"), + Timestamp("2013-01-01 21:00:00"), + Timestamp("2013-01-02 13:00:00"), + Timestamp("2013-01-03 05:00:00"), + ], + ], +) @pytest.mark.parametrize("box", [list, np.array, Index, Series]) def test_datetime_tz_cut(bins, box): # see gh-19872 @@ -427,14 +517,24 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(s, bins) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz)), - Interval(Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz)), - Interval(Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) @@ -446,15 +546,15 @@ def test_datetime_nan_error(): def test_datetime_nan_mask(): - result = cut(date_range("20130102", periods=5), - bins=date_range("20130101", periods=2)) + result = cut( + date_range("20130102", periods=5), bins=date_range("20130101", periods=2) + ) mask = result.categories.isna() tm.assert_numpy_array_equal(mask, np.array([False])) mask = result.isna() - tm.assert_numpy_array_equal(mask, np.array([False, True, True, - True, True])) 
+ tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True])) @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) @@ -466,9 +566,9 @@ def test_datetime_cut_roundtrip(tz): expected = cut(ser, result_bins) tm.assert_series_equal(result, expected) - expected_bins = DatetimeIndex(["2017-12-31 23:57:07.200000", - "2018-01-02 00:00:00", - "2018-01-03 00:00:00"]) + expected_bins = DatetimeIndex( + ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"] + ) expected_bins = expected_bins.tz_localize(tz) tm.assert_index_equal(result_bins, expected_bins) @@ -481,7 +581,7 @@ def test_timedelta_cut_roundtrip(): expected = cut(ser, result_bins) tm.assert_series_equal(result, expected) - expected_bins = TimedeltaIndex(["0 days 23:57:07.200000", - "2 days 00:00:00", - "3 days 00:00:00"]) + expected_bins = TimedeltaIndex( + ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"] + ) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index fbec775bbf407f..56e83ada9eb992 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -8,232 +8,269 @@ class TestMelt: - def setup_method(self, method): self.df = tm.makeTimeDataFrame()[:10] - self.df['id1'] = (self.df['A'] > 0).astype(np.int64) - self.df['id2'] = (self.df['B'] > 0).astype(np.int64) - - self.var_name = 'var' - self.value_name = 'val' - - self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 - ], [-1.321405, 0.368915, -1.055342], - [-0.807333, 0.08298, -0.873361]]) - self.df1.columns = [list('ABC'), list('abc')] - self.df1.columns.names = ['CAP', 'low'] + self.df["id1"] = (self.df["A"] > 0).astype(np.int64) + self.df["id2"] = (self.df["B"] > 0).astype(np.int64) + + self.var_name = "var" + self.value_name = "val" + + self.df1 = pd.DataFrame( + [ + [1.067683, -1.110463, 0.20867], + [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298, -0.873361], + ] + ) + self.df1.columns = [list("ABC"), list("abc")] + self.df1.columns.names = ["CAP", "low"] def test_top_level_method(self): result = melt(self.df) - assert result.columns.tolist() == ['variable', 'value'] + assert result.columns.tolist() == ["variable", "value"] def test_method_signatures(self): - tm.assert_frame_equal(self.df.melt(), - melt(self.df)) + tm.assert_frame_equal(self.df.melt(), melt(self.df)) - tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], - value_vars=['A', 'B']), - melt(self.df, - id_vars=['id1', 'id2'], - value_vars=['A', 'B'])) + tm.assert_frame_equal( + self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]), + melt(self.df, id_vars=["id1", "id2"], value_vars=["A", "B"]), + ) - tm.assert_frame_equal(self.df.melt(var_name=self.var_name, - value_name=self.value_name), - melt(self.df, - var_name=self.var_name, - value_name=self.value_name)) + tm.assert_frame_equal( + self.df.melt(var_name=self.var_name, value_name=self.value_name), + melt(self.df, var_name=self.var_name, value_name=self.value_name), + ) - tm.assert_frame_equal(self.df1.melt(col_level=0), - melt(self.df1, col_level=0)) + tm.assert_frame_equal(self.df1.melt(col_level=0), melt(self.df1, col_level=0)) def test_default_col_names(self): result = self.df.melt() - assert result.columns.tolist() == ['variable', 'value'] + assert result.columns.tolist() == ["variable", "value"] - result1 = self.df.melt(id_vars=['id1']) - assert result1.columns.tolist() == ['id1', 'variable', 'value'] + result1 = self.df.melt(id_vars=["id1"]) + assert 
result1.columns.tolist() == ["id1", "variable", "value"] - result2 = self.df.melt(id_vars=['id1', 'id2']) - assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] + result2 = self.df.melt(id_vars=["id1", "id2"]) + assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] def test_value_vars(self): - result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') + result3 = self.df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 - result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) - expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) + result4 = self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]) + expected4 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) tm.assert_frame_equal(result4, expected4) def test_value_vars_types(self): # GH 15348 - expected = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) + expected = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) for type_ in (tuple, list, np.array): - result = self.df.melt(id_vars=['id1', 'id2'], - value_vars=type_(('A', 'B'))) + result = self.df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B"))) tm.assert_frame_equal(result, expected) def test_vars_work_with_multiindex(self): - expected = DataFrame({ - ('A', 'a'): self.df1[('A', 'a')], - 'CAP': ['B'] * len(self.df1), - 'low': ['b'] * len(self.df1), - 'value': self.df1[('B', 'b')], - }, columns=[('A', 'a'), 'CAP', 'low', 'value']) - - result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) + expected = DataFrame( + { + ("A", "a"): self.df1[("A", "a")], + "CAP": ["B"] * len(self.df1), + "low": ["b"] * len(self.df1), + "value": self.df1[("B", "b")], + }, + columns=[("A", "a"), "CAP", "low", "value"], + ) + + result = self.df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")]) tm.assert_frame_equal(result, expected) def test_single_vars_work_with_multiindex(self): - expected = DataFrame({ - 'A': {0: 1.067683, 1: -1.321405, 2: -0.807333}, - 'CAP': {0: 'B', 1: 'B', 2: 'B'}, - 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}}) - result = self.df1.melt(['A'], ['B'], col_level=0) + expected = DataFrame( + { + "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, + "CAP": {0: "B", 1: "B", 2: "B"}, + "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, + } + ) + result = self.df1.melt(["A"], ["B"], col_level=0) tm.assert_frame_equal(result, expected) def test_tuple_vars_fail_with_multiindex(self): # melt should fail with an informative error message if # the columns have a MultiIndex and a tuple is passed # for id_vars or value_vars. 
- tuple_a = ('A', 'a') + tuple_a = ("A", "a") list_a = [tuple_a] - tuple_b = ('B', 'b') + tuple_b = ("B", "b") list_b = [tuple_b] - msg = (r"(id|value)_vars must be a list of tuples when columns are" - " a MultiIndex") - for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), - (tuple_a, tuple_b)): + msg = ( + r"(id|value)_vars must be a list of tuples when columns are" " a MultiIndex" + ) + for id_vars, value_vars in ( + (tuple_a, list_b), + (list_a, tuple_b), + (tuple_a, tuple_b), + ): with pytest.raises(ValueError, match=msg): self.df1.melt(id_vars=id_vars, value_vars=value_vars) def test_custom_var_name(self): result5 = self.df.melt(var_name=self.var_name) - assert result5.columns.tolist() == ['var', 'value'] - - result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) - assert result6.columns.tolist() == ['id1', 'var', 'value'] - - result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) - assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name) - assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name) - expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, 'value']) + assert result5.columns.tolist() == ["var", "value"] + + result6 = self.df.melt(id_vars=["id1"], var_name=self.var_name) + assert result6.columns.tolist() == ["id1", "var", "value"] + + result7 = self.df.melt(id_vars=["id1", "id2"], var_name=self.var_name) + assert result7.columns.tolist() == ["id1", "id2", "var", "value"] + + result8 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", var_name=self.var_name + ) + assert result8.columns.tolist() == ["id1", "id2", "var", "value"] + + result9 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=self.var_name + ) + expected9 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, "value"], + ) tm.assert_frame_equal(result9, expected9) def test_custom_value_name(self): result10 = self.df.melt(value_name=self.value_name) - assert result10.columns.tolist() == ['variable', 'val'] - - result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) - assert result11.columns.tolist() == ['id1', 'variable', 'val'] - - result12 = self.df.melt(id_vars=['id1', 'id2'], - value_name=self.value_name) - assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - value_name=self.value_name) - assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - value_name=self.value_name) - expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', - self.value_name]) + assert result10.columns.tolist() == ["variable", "val"] + + result11 = self.df.melt(id_vars=["id1"], value_name=self.value_name) + assert 
result11.columns.tolist() == ["id1", "variable", "val"] + + result12 = self.df.melt(id_vars=["id1", "id2"], value_name=self.value_name) + assert result12.columns.tolist() == ["id1", "id2", "variable", "val"] + + result13 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", value_name=self.value_name + ) + assert result13.columns.tolist() == ["id1", "id2", "variable", "val"] + + result14 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=self.value_name + ) + expected14 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", self.value_name], + ) tm.assert_frame_equal(result14, expected14) def test_custom_var_and_value_name(self): - result15 = self.df.melt(var_name=self.var_name, - value_name=self.value_name) - assert result15.columns.tolist() == ['var', 'val'] - - result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, - value_name=self.value_name) - assert result16.columns.tolist() == ['id1', 'var', 'val'] - - result17 = self.df.melt(id_vars=['id1', 'id2'], - var_name=self.var_name, - value_name=self.value_name) - assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name, - value_name=self.value_name) - assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name, - value_name=self.value_name) - expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, - self.value_name]) + result15 = self.df.melt(var_name=self.var_name, value_name=self.value_name) + assert result15.columns.tolist() == ["var", "val"] + + result16 = self.df.melt( + id_vars=["id1"], var_name=self.var_name, value_name=self.value_name + ) + assert result16.columns.tolist() == ["id1", "var", "val"] + + result17 = self.df.melt( + id_vars=["id1", "id2"], var_name=self.var_name, value_name=self.value_name + ) + assert result17.columns.tolist() == ["id1", "id2", "var", "val"] + + result18 = self.df.melt( + id_vars=["id1", "id2"], + value_vars="A", + var_name=self.var_name, + value_name=self.value_name, + ) + assert result18.columns.tolist() == ["id1", "id2", "var", "val"] + + result19 = self.df.melt( + id_vars=["id1", "id2"], + value_vars=["A", "B"], + var_name=self.var_name, + value_name=self.value_name, + ) + expected19 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, self.value_name], + ) tm.assert_frame_equal(result19, expected19) df20 = self.df.copy() - df20.columns.name = 'foo' + df20.columns.name = "foo" result20 = df20.melt() - assert result20.columns.tolist() == ['foo', 'value'] + assert result20.columns.tolist() == ["foo", "value"] def test_col_level(self): res1 = self.df1.melt(col_level=0) - res2 = self.df1.melt(col_level='CAP') - assert res1.columns.tolist() == ['CAP', 'value'] - assert res2.columns.tolist() == ['CAP', 'value'] + res2 = self.df1.melt(col_level="CAP") + assert res1.columns.tolist() == ["CAP", 
"value"] + assert res2.columns.tolist() == ["CAP", "value"] def test_multiindex(self): res = self.df1.melt() - assert res.columns.tolist() == ['CAP', 'low', 'value'] - - @pytest.mark.parametrize("col", [ - pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')), - pd.Series(["a", "b", "c", "a", "d"], dtype="category"), - pd.Series([0, 1, 0, 0, 0])]) + assert res.columns.tolist() == ["CAP", "low", "value"] + + @pytest.mark.parametrize( + "col", + [ + pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")), + pd.Series(["a", "b", "c", "a", "d"], dtype="category"), + pd.Series([0, 1, 0, 0, 0]), + ], + ) def test_pandas_dtypes(self, col): # GH 15785 - df = DataFrame({'klass': range(5), - 'col': col, - 'attr1': [1, 0, 0, 0, 0], - 'attr2': col}) - expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], - ignore_index=True) - result = melt(df, id_vars=['klass', 'col'], var_name='attribute', - value_name='value') - expected = DataFrame({0: list(range(5)) * 2, - 1: pd.concat([col] * 2, ignore_index=True), - 2: ['attr1'] * 5 + ['attr2'] * 5, - 3: expected_value}) - expected.columns = ['klass', 'col', 'attribute', 'value'] + df = DataFrame( + {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col} + ) + expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True) + result = melt( + df, id_vars=["klass", "col"], var_name="attribute", value_name="value" + ) + expected = DataFrame( + { + 0: list(range(5)) * 2, + 1: pd.concat([col] * 2, ignore_index=True), + 2: ["attr1"] * 5 + ["attr2"] * 5, + 3: expected_value, + } + ) + expected.columns = ["klass", "col", "attribute", "value"] tm.assert_frame_equal(result, expected) def test_melt_missing_columns_raises(self): @@ -242,472 +279,688 @@ def test_melt_missing_columns_raises(self): # attempted with column names absent from the dataframe # Generate data - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) + df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises( - KeyError, - match=msg.format(Var='value_vars', Col="\\['C'\\]")): - df.melt(['a', 'b'], ['C', 'd']) + KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]") + ): + df.melt(["a", "b"], ["C", "d"]) # Try to melt with missing `id_vars` column name - with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', Col="\\['A'\\]")): - df.melt(['A', 'b'], ['c', 'd']) + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")): + df.melt(["A", "b"], ["c", "d"]) # Multiple missing with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', - Col="\\['not_here', 'or_there'\\]")): - df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) + KeyError, + match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"), + ): + df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) # Multiindex melt fails if column is missing from multilevel melt multi = df.copy() - multi.columns = [list('ABCD'), list('abcd')] - with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', - Col="\\['E'\\]")): - multi.melt([('E', 'a')], [('B', 'b')]) + multi.columns = [list("ABCD"), list("abcd")] + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")): + multi.melt([("E", "a")], [("B", "b")]) # Multiindex fails if column is missing from single level melt with pytest.raises( - KeyError, - match=msg.format(Var='value_vars', - Col="\\['F'\\]")): - 
multi.melt(['A'], ['F'], col_level=0) + KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]") + ): + multi.melt(["A"], ["F"], col_level=0) class TestLreshape: - def test_pairs(self): - data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009'], - 'visitdt2': - ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], - 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], - 'wt1': [1823, 3338, 1549, 3298, 4306], - 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], - 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [1766, 3301, 1454, 3139, 4133], + "id": [101, 102, 103, 104, 105], + "sex": ["Male", "Female", "Female", "Female", "Female"], + "visitdt1": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + ], + "visitdt2": ["21jan2009", nan, "22jan2009", "31dec2008", "03feb2009"], + "visitdt3": ["05feb2009", nan, nan, "02jan2009", "15feb2009"], + "wt1": [1823, 3338, 1549, 3298, 4306], + "wt2": [2011.0, nan, 1892.0, 3338.0, 4575.0], + "wt3": [2293.0, nan, nan, 3377.0, 4805.0], + } df = DataFrame(data) - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], - 'wt': ['wt%d' % i for i in range(1, 4)]} + spec = { + "visitdt": ["visitdt%d" % i for i in range(1, 4)], + "wt": ["wt%d" % i for i in range(1, 4)], + } result = lreshape(df, spec) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, - 4133, 1766, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, - 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Male', - 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', '02jan2009', '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, - 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 1454, + 3139, + 4133, + 1766, + 3139, + 4133, + ], + "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + 3377.0, + 4805.0, + ], + } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) result = lreshape(df, spec, dropna=False) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', 
'20dec2008', '30dec2008', - '21dec2008', '11jan2009', '08jan2009', '20dec2008', - '30dec2008', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, - 3139, 4133, 1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, - 101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', nan, - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', nan, nan, '02jan2009', - '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, - 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, - 4805.0]} + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + ], + "id": [ + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + ], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + nan, + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + nan, + nan, + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + nan, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + nan, + nan, + 3377.0, + 4805.0, + ], + } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], - 'wt': ['wt%d' % i for i in range(1, 4)]} + spec = { + "visitdt": ["visitdt%d" % i for i in range(1, 3)], + "wt": ["wt%d" % i for i in range(1, 4)], + } msg = "All column lists must be same length" with pytest.raises(ValueError, match=msg): lreshape(df, spec) class TestWideToLong: - def test_simple(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A1970": {0: "a", - 1: "b", - 2: "c"}, - "A1980": {0: "d", - 1: "e", - 2: "f"}, - "B1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_stubs(self): # GH9204 df = pd.DataFrame([[0, 1, 2, 3, 
8], [4, 5, 6, 7, 9]]) - df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] - stubs = ['inc', 'edu'] + df.columns = ["id", "inc1", "inc2", "edu1", "edu2"] + stubs = ["inc", "edu"] # TODO: unused? - df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa + df_long = pd.wide_to_long(df, stubs, i="id", j="age") # noqa - assert stubs == ['inc', 'edu'] + assert stubs == ["inc", "edu"] def test_separating_character(self): # GH14779 np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A.1970": {0: "a", - 1: "b", - 2: "c"}, - "A.1980": {0: "d", - 1: "e", - 2: "f"}, - "B.1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B.1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A.1970": {0: "a", 1: "b", 2: "c"}, + "A.1980": {0: "d", 1: "e", 2: "f"}, + "B.1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B.1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") tm.assert_frame_equal(result, expected) def test_escapable_characters(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A(quarterly)1970": {0: "a", - 1: "b", - 2: "c"}, - "A(quarterly)1980": {0: "d", - 1: "e", - 2: "f"}, - "B(quarterly)1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B(quarterly)1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A(quarterly)1970": {0: "a", 1: "b", 2: "c"}, + "A(quarterly)1980": {0: "d", 1: "e", 2: "f"}, + "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], - "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A(quarterly)": ["a", "b", "c", "d", "e", "f"], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index( - ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] - result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], - i="id", j="year") + expected = expected.set_index(["id", "year"])[ + ["X", "A(quarterly)", "B(quarterly)"] + ] + result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_unbalanced(self): # test that we can have a varying amount of time variables - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], - 'A': [1.0, 3.0, 2.0, 4.0], - 'B': [5.0, np.nan, 6.0, np.nan], - 'id': [0, 0, 1, 1], - 'year': [2010, 2011, 2010, 2011]} + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 
6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": ["X1", "X1", "X2", "X2"], + "A": [1.0, 3.0, 2.0, 4.0], + "B": [5.0, np.nan, 6.0, np.nan], + "id": [0, 0, 1, 1], + "year": [2010, 2011, 2010, 2011], + } expected = pd.DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] - result = wide_to_long(df, ['A', 'B'], i='id', j='year') + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_character_overlap(self): # Test we handle overlapping characters in both id_vars and value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'BBBX': [91, 92, 93], - 'BBBZ': [91, 92, 93] - }) - df['id'] = df.index - expected = pd.DataFrame({ - 'BBBX': [91, 92, 93, 91, 92, 93], - 'BBBZ': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': [11, 11, 11, 12, 12, 12]}) - expected = expected.set_index(['id', 'year'])[ - ['BBBX', 'BBBZ', 'A', 'B', 'BB']] - result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "BBBX": [91, 92, 93], + "BBBZ": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "BBBX": [91, 92, 93, 91, 92, 93], + "BBBZ": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_separator(self): # if an invalid separator is supplied a empty data frame is returned - sep = 'nope!' - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'A2010': [], - 'A2011': [], - 'B2010': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - expected = pd.DataFrame(exp_data).astype({'year': 'int'}) - expected = expected.set_index(['id', 'year'])[[ - 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + sep = "nope!" 
+ df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "A2010": [], + "A2011": [], + "B2010": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + expected = expected.set_index(["id", "year"])[ + ["X", "A2010", "A2011", "B2010", "A", "B"] + ] expected.index.set_levels([0, 1], level=0, inplace=True) - result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_num_string_disambiguation(self): # Test that we can disambiguate number value_vars from # string value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'Arating': [91, 92, 93], - 'Arating_old': [91, 92, 93] - }) - df['id'] = df.index - expected = pd.DataFrame({ - 'Arating': [91, 92, 93, 91, 92, 93], - 'Arating_old': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': [11, 11, 11, 12, 12, 12]}) - expected = expected.set_index(['id', 'year'])[ - ['Arating', 'Arating_old', 'A', 'B', 'BB']] - result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "Arating": [91, 92, 93], + "Arating_old": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "Arating": [91, 92, 93, 91, 92, 93], + "Arating_old": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[ + ["Arating", "Arating_old", "A", "B", "BB"] + ] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_suffixtype(self): # If all stubs names end with a string, but a numeric suffix is # assumed, an empty data frame is returned - df = pd.DataFrame({'Aone': [1.0, 2.0], - 'Atwo': [3.0, 4.0], - 'Bone': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'Aone': [], - 'Atwo': [], - 'Bone': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - expected = pd.DataFrame(exp_data).astype({'year': 'int'}) - - expected = expected.set_index(['id', 'year']) + df = pd.DataFrame( + { + "Aone": [1.0, 2.0], + "Atwo": [3.0, 4.0], + "Bone": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "Aone": [], + "Atwo": [], + "Bone": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + + expected = expected.set_index(["id", "year"]) expected.index.set_levels([0, 1], level=0, inplace=True) - result = wide_to_long(df, ['A', 'B'], i='id', j='year') - 
tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + result = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_multiple_id_columns(self): # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm - df = pd.DataFrame({ - 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - }) - expected = pd.DataFrame({ - 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, - 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], - 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], - 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], - 'age': [1, 2, 1, 2, 1, 2, 1, 2, 1, - 2, 1, 2, 1, 2, 1, 2, 1, 2] - }) - expected = expected.set_index(['famid', 'birth', 'age'])[['ht']] - result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + df = pd.DataFrame( + { + "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + } + ) + expected = pd.DataFrame( + { + "ht": [ + 2.8, + 3.4, + 2.9, + 3.8, + 2.2, + 2.9, + 2.0, + 3.2, + 1.8, + 2.8, + 1.9, + 2.4, + 2.2, + 3.3, + 2.3, + 3.4, + 2.1, + 2.9, + ], + "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], + } + ) + expected = expected.set_index(["famid", "birth", "age"])[["ht"]] + result = wide_to_long(df, "ht", i=["famid", "birth"], j="age") tm.assert_frame_equal(result, expected) def test_non_unique_idvars(self): # GH16382 # Raise an error message if non unique id vars (i) are passed - df = pd.DataFrame({ - 'A_A1': [1, 2, 3, 4, 5], - 'B_B1': [1, 2, 3, 4, 5], - 'x': [1, 1, 1, 1, 1] - }) + df = pd.DataFrame( + {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]} + ) msg = "the id variables need to uniquely identify each row" with pytest.raises(ValueError, match=msg): - wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') + wide_to_long(df, ["A_A", "B_B"], i="x", j="colname") def test_cast_j_int(self): - df = pd.DataFrame({ - 'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'], - 'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'], - 'actor_fb_likes_1': [1000.0, 40000.0, 11000.0], - 'actor_fb_likes_2': [936.0, 5000.0, 393.0], - 'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']}) - - expected = pd.DataFrame({ - 'actor': ['CCH Pounder', - 'Johnny Depp', - 'Christoph Waltz', - 'Joel David Moore', - 'Orlando Bloom', - 'Rory Kinnear'], - 'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], - 'num': [1, 1, 1, 2, 2, 2], - 'title': ['Avatar', - 'Pirates of the Caribbean', - 'Spectre', - 'Avatar', - 'Pirates of the Caribbean', - 'Spectre']}).set_index(['title', 'num']) - result = wide_to_long(df, ['actor', 'actor_fb_likes'], - i='title', j='num', sep='_') + df = pd.DataFrame( + { + "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"], + "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"], + "actor_fb_likes_1": [1000.0, 40000.0, 11000.0], + "actor_fb_likes_2": [936.0, 5000.0, 393.0], + "title": ["Avatar", "Pirates of the Caribbean", "Spectre"], + } + ) + + expected = pd.DataFrame( + { + "actor": [ + "CCH Pounder", + "Johnny Depp", + "Christoph 
Waltz", + "Joel David Moore", + "Orlando Bloom", + "Rory Kinnear", + ], + "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], + "num": [1, 1, 1, 2, 2, 2], + "title": [ + "Avatar", + "Pirates of the Caribbean", + "Spectre", + "Avatar", + "Pirates of the Caribbean", + "Spectre", + ], + } + ).set_index(["title", "num"]) + result = wide_to_long( + df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_" + ) tm.assert_frame_equal(result, expected) def test_identical_stubnames(self): - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'A': ['X1', 'X2']}) + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) msg = "stubname can't be identical to a column name" with pytest.raises(ValueError, match=msg): - wide_to_long(df, ['A', 'B'], i='A', j='colname') + wide_to_long(df, ["A", "B"], i="A", j="colname") def test_nonnumeric_suffix(self): - df = pd.DataFrame({'treatment_placebo': [1.0, 2.0], - 'treatment_test': [3.0, 4.0], - 'result_placebo': [5.0, 6.0], - 'A': ['X1', 'X2']}) - expected = pd.DataFrame({ - 'A': ['X1', 'X1', 'X2', 'X2'], - 'colname': ['placebo', 'test', 'placebo', 'test'], - 'result': [5.0, np.nan, 6.0, np.nan], - 'treatment': [1.0, 3.0, 2.0, 4.0]}) - expected = expected.set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 'treatment'], - i='A', j='colname', suffix='[a-z]+', sep='_') + df = pd.DataFrame( + { + "treatment_placebo": [1.0, 2.0], + "treatment_test": [3.0, 4.0], + "result_placebo": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X2", "X2"], + "colname": ["placebo", "test", "placebo", "test"], + "result": [5.0, np.nan, 6.0, np.nan], + "treatment": [1.0, 3.0, 2.0, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_mixed_type_suffix(self): - df = pd.DataFrame({ - 'A': ['X1', 'X2'], - 'result_1': [0, 9], - 'result_foo': [5.0, 6.0], - 'treatment_1': [1.0, 2.0], - 'treatment_foo': [3.0, 4.0]}) - expected = pd.DataFrame({ - 'A': ['X1', 'X2', 'X1', 'X2'], - 'colname': ['1', '1', 'foo', 'foo'], - 'result': [0.0, 9.0, 5.0, 6.0], - 'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 'treatment'], - i='A', j='colname', suffix='.+', sep='_') + df = pd.DataFrame( + { + "A": ["X1", "X2"], + "result_1": [0, 9], + "result_foo": [5.0, 6.0], + "treatment_1": [1.0, 2.0], + "treatment_foo": [3.0, 4.0], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X2", "X1", "X2"], + "colname": ["1", "1", "foo", "foo"], + "result": [0.0, 9.0, 5.0, 6.0], + "treatment": [1.0, 2.0, 3.0, 4.0], + } + ).set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_float_suffix(self): - df = pd.DataFrame({ - 'treatment_1.1': [1.0, 2.0], - 'treatment_2.1': [3.0, 4.0], - 'result_1.2': [5.0, 6.0], - 'result_1': [0, 9], - 'A': ['X1', 'X2']}) - expected = pd.DataFrame({ - 'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'], - 'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], - 'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], - 'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]}) - expected = expected.set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 
'treatment'], - i='A', j='colname', suffix='[0-9.]+', sep='_') + df = pd.DataFrame( + { + "treatment_1.1": [1.0, 2.0], + "treatment_2.1": [3.0, 4.0], + "result_1.2": [5.0, 6.0], + "result_1": [0, 9], + "A": ["X1", "X2"], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], + "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], + "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], + "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_col_substring_of_stubname(self): # GH22468 # Don't raise ValueError when a column name is a substring # of a stubname that's been passed as a string - wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, - 'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, - 'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, - 'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, - 'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67} - } + wide_data = { + "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, + "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, + "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, + "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}, + } wide_df = pd.DataFrame.from_dict(wide_data) - expected = pd.wide_to_long(wide_df, - stubnames=['PA'], - i=['node_id', 'A'], - j='time') - result = pd.wide_to_long(wide_df, - stubnames='PA', - i=['node_id', 'A'], - j='time') + expected = pd.wide_to_long( + wide_df, stubnames=["PA"], i=["node_id", "A"], j="time" + ) + result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7795c356bf43ec..b497f6c3aa9b44 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -7,8 +7,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Grouper, Index, MultiIndex, Series, concat, - date_range) + Categorical, + DataFrame, + Grouper, + Index, + MultiIndex, + Series, + concat, + date_range, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import crosstab, pivot_table import pandas.util.testing as tm @@ -26,33 +33,68 @@ def interval_values(request, closed): class TestPivotTable: - def setup_method(self, method): - self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + self.data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) def test_pivot_table(self, observed): - index = ['A', 'B'] - columns = 'C' - table = 
pivot_table(self.data, values='D', - index=index, columns=columns, observed=observed) + index = ["A", "B"] + columns = "C" + table = pivot_table( + self.data, values="D", index=index, columns=columns, observed=observed + ) table2 = self.data.pivot_table( - values='D', index=index, columns=columns, observed=observed) + values="D", index=index, columns=columns, observed=observed + ) tm.assert_frame_equal(table, table2) # this works - pivot_table(self.data, values='D', index=index, observed=observed) + pivot_table(self.data, values="D", index=index, observed=observed) if len(index) > 1: assert table.index.names == tuple(index) @@ -64,174 +106,199 @@ def test_pivot_table(self, observed): else: assert table.columns.name == columns[0] - expected = self.data.groupby( - index + [columns])['D'].agg(np.mean).unstack() + expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_categorical_observed_equal(self, observed): # issue #24923 - df = pd.DataFrame({'col1': list('abcde'), - 'col2': list('fghij'), - 'col3': [1, 2, 3, 4, 5]}) + df = pd.DataFrame( + {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} + ) - expected = df.pivot_table(index='col1', values='col3', - columns='col2', aggfunc=np.sum, - fill_value=0) + expected = df.pivot_table( + index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + ) - expected.index = expected.index.astype('category') - expected.columns = expected.columns.astype('category') + expected.index = expected.index.astype("category") + expected.columns = expected.columns.astype("category") - df.col1 = df.col1.astype('category') - df.col2 = df.col2.astype('category') + df.col1 = df.col1.astype("category") + df.col2 = df.col2.astype("category") - result = df.pivot_table(index='col1', values='col3', - columns='col2', aggfunc=np.sum, - fill_value=0, observed=observed) + result = df.pivot_table( + index="col1", + values="col3", + columns="col2", + aggfunc=np.sum, + fill_value=0, + observed=observed, + ) tm.assert_frame_equal(result, expected) def test_pivot_table_nocols(self): - df = DataFrame({'rows': ['a', 'b', 'c'], - 'cols': ['x', 'y', 'z'], - 'values': [1, 2, 3]}) - rs = df.pivot_table(columns='cols', aggfunc=np.sum) - xp = df.pivot_table(index='cols', aggfunc=np.sum).T + df = DataFrame( + {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} + ) + rs = df.pivot_table(columns="cols", aggfunc=np.sum) + xp = df.pivot_table(index="cols", aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) - rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'}) - xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T + rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) + xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T tm.assert_frame_equal(rs, xp) def test_pivot_table_dropna(self): - df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000}, - 'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'}, - 'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310}, - 'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'}, - 'quantity': {0: 2000000, 1: 500000, - 2: 1000000, 3: 1000000}}) - pv_col = df.pivot_table('quantity', 'month', [ - 'customer', 'product'], dropna=False) + df = DataFrame( + { + "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000}, + "customer": {0: "A", 1: "A", 2: "B", 3: "C"}, + "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310}, + "product": {0: "a", 1: "b", 2: "c", 3: "d"}, + "quantity": {0: 2000000, 1: 500000, 2: 
1000000, 3: 1000000}, + } + ) + pv_col = df.pivot_table( + "quantity", "month", ["customer", "product"], dropna=False + ) pv_ind = df.pivot_table( - 'quantity', ['customer', 'product'], 'month', dropna=False) + "quantity", ["customer", "product"], "month", dropna=False + ) - m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'), - ('A', 'd'), ('B', 'a'), ('B', 'b'), - ('B', 'c'), ('B', 'd'), ('C', 'a'), - ('C', 'b'), ('C', 'c'), ('C', 'd')], - names=['customer', 'product']) + m = MultiIndex.from_tuples( + [ + ("A", "a"), + ("A", "b"), + ("A", "c"), + ("A", "d"), + ("B", "a"), + ("B", "b"), + ("B", "c"), + ("B", "d"), + ("C", "a"), + ("C", "b"), + ("C", "c"), + ("C", "d"), + ], + names=["customer", "product"], + ) tm.assert_index_equal(pv_col.columns, m) tm.assert_index_equal(pv_ind.index, m) def test_pivot_table_categorical(self): - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True + ) + cat2 = Categorical( + ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True + ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=True) + result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) - exp_index = pd.MultiIndex.from_arrays( - [cat1, cat2], - names=['A', 'B']) - expected = DataFrame( - {'values': [1, 2, 3, 4]}, - index=exp_index) + exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) tm.assert_frame_equal(result, expected) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 - categories = ['a', 'b', 'c', 'd'] - - df = DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], - 'B': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'C': range(0, 9)}) - - df['A'] = df['A'].astype(CDT(categories, ordered=False)) - result = df.pivot_table(index='B', columns='A', values='C', - dropna=dropna) - expected_columns = Series(['a', 'b', 'c'], name='A') - expected_columns = expected_columns.astype( - CDT(categories, ordered=False)) - expected_index = Series([1, 2, 3], name='B') - expected = DataFrame([[0, 3, 6], - [1, 4, 7], - [2, 5, 8]], - index=expected_index, - columns=expected_columns,) + categories = ["a", "b", "c", "d"] + + df = DataFrame( + { + "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "C": range(0, 9), + } + ) + + df["A"] = df["A"].astype(CDT(categories, ordered=False)) + result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + expected_columns = Series(["a", "b", "c"], name="A") + expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_index = Series([1, 2, 3], name="B") + expected = DataFrame( + [[0, 3, 6], [1, 4, 7], [2, 5, 8]], + index=expected_index, + columns=expected_columns, + ) if not dropna: # add back the non observed to compare - expected = expected.reindex( - columns=Categorical(categories)).astype('float') + expected = expected.reindex(columns=Categorical(categories)).astype("float") tm.assert_frame_equal(result, expected) def test_pivot_with_non_observable_dropna(self, dropna): # gh-21133 df = pd.DataFrame( - {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], - categories=['low', 'high'], - ordered=True), - 'B': range(5)}) + { + "A": pd.Categorical( + [np.nan, "low", 
"high", "low", "high"], + categories=["low", "high"], + ordered=True, + ), + "B": range(5), + } + ) - result = df.pivot_table(index='A', values='B', dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( - {'B': [2, 3]}, + {"B": [2, 3]}, index=pd.Index( - pd.Categorical.from_codes([0, 1], - categories=['low', 'high'], - ordered=True), - name='A')) + pd.Categorical.from_codes( + [0, 1], categories=["low", "high"], ordered=True + ), + name="A", + ), + ) tm.assert_frame_equal(result, expected) # gh-21378 df = pd.DataFrame( - {'A': pd.Categorical(['left', 'low', 'high', 'low', 'high'], - categories=['low', 'high', 'left'], - ordered=True), - 'B': range(5)}) + { + "A": pd.Categorical( + ["left", "low", "high", "low", "high"], + categories=["low", "high", "left"], + ordered=True, + ), + "B": range(5), + } + ) - result = df.pivot_table(index='A', values='B', dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( - {'B': [2, 3, 0]}, + {"B": [2, 3, 0]}, index=pd.Index( - pd.Categorical.from_codes([0, 1, 2], - categories=['low', 'high', 'left'], - ordered=True), - name='A')) + pd.Categorical.from_codes( + [0, 1, 2], categories=["low", "high", "left"], ordered=True + ), + name="A", + ), + ) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 - df = DataFrame( - {'A': interval_values, - 'B': 1}) - result = df.pivot_table(index='A', values='B', dropna=dropna) - expected = DataFrame( - {'B': 1}, - index=Index(interval_values.unique(), - name='A')) + df = DataFrame({"A": interval_values, "B": 1}) + result = df.pivot_table(index="A", values="B", dropna=dropna) + expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) tm.assert_frame_equal(result, expected) def test_pass_array(self): - result = self.data.pivot_table( - 'D', index=self.data.A, columns=self.data.C) - expected = self.data.pivot_table('D', index='A', columns='C') + result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) + expected = self.data.pivot_table("D", index="A", columns="C") tm.assert_frame_equal(result, expected) def test_pass_function(self): - result = self.data.pivot_table('D', index=lambda x: x // 5, - columns=self.data.C) - expected = self.data.pivot_table('D', index=self.data.index // 5, - columns='C') + result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C) + expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C") tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): - index = ['A', 'B'] - columns = 'C' + index = ["A", "B"] + columns = "C" table = pivot_table(self.data, index=index, columns=columns) expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) @@ -239,417 +306,533 @@ def test_pivot_table_multiple(self): def test_pivot_dtypes(self): # can convert dtypes - f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [ - 1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b']}) - assert f.dtypes['v'] == 'int64' + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1, 2, 3, 4], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "int64" - z = pivot_table(f, values='v', index=['a'], columns=[ - 'i'], fill_value=0, aggfunc=np.sum) + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum + ) result = z.dtypes - expected = Series([np.dtype('int64')] * 2, - 
index=Index(list('ab'), name='i')) + expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) # cannot convert dtypes - f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [ - 1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b']}) - assert f.dtypes['v'] == 'float64' + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1.5, 2.5, 3.5, 4.5], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "float64" - z = pivot_table(f, values='v', index=['a'], columns=[ - 'i'], fill_value=0, aggfunc=np.mean) + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean + ) result = z.dtypes - expected = Series([np.dtype('float64')] * 2, - index=Index(list('ab'), name='i')) + expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('columns,values', - [('bool1', ['float1', 'float2']), - ('bool1', ['float1', 'float2', 'bool1']), - ('bool2', ['float1', 'float2', 'bool1'])]) + @pytest.mark.parametrize( + "columns,values", + [ + ("bool1", ["float1", "float2"]), + ("bool1", ["float1", "float2", "bool1"]), + ("bool2", ["float1", "float2", "bool1"]), + ], + ) def test_pivot_preserve_dtypes(self, columns, values): # GH 7142 regression test v = np.arange(5, dtype=np.float64) - df = DataFrame({'float1': v, 'float2': v + 2.0, - 'bool1': v <= 2, 'bool2': v <= 3}) + df = DataFrame( + {"float1": v, "float2": v + 2.0, "bool1": v <= 2, "bool2": v <= 3} + ) df_res = df.reset_index().pivot_table( - index='index', columns=columns, values=values) + index="index", columns=columns, values=values + ) result = dict(df_res.dtypes) - expected = {col: np.dtype('O') if col[0].startswith('b') - else np.dtype('float64') for col in df_res} + expected = { + col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") + for col in df_res + } assert result == expected def test_pivot_no_values(self): # GH 14380 - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02', - '2011-01-01', '2011-01-02']) - df = pd.DataFrame({'A': [1, 2, 3, 4, 5]}, - index=idx) + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"] + ) + df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) res = df.pivot_table(index=df.index.month, columns=df.index.day) - exp_columns = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)]) - exp = pd.DataFrame([[2.5, 4.0], [2.0, np.nan]], - index=[1, 2], columns=exp_columns) + exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)]) + exp = pd.DataFrame( + [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns + ) tm.assert_frame_equal(res, exp) - df = pd.DataFrame({'A': [1, 2, 3, 4, 5], - 'dt': pd.date_range('2011-01-01', freq='D', - periods=5)}, - index=idx) - res = df.pivot_table(index=df.index.month, - columns=pd.Grouper(key='dt', freq='M')) - exp_columns = pd.MultiIndex.from_tuples([('A', - pd.Timestamp('2011-01-31'))]) - exp_columns.names = [None, 'dt'] - exp = pd.DataFrame([3.25, 2.0], - index=[1, 2], columns=exp_columns) + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "dt": pd.date_range("2011-01-01", freq="D", periods=5), + }, + index=idx, + ) + res = df.pivot_table( + index=df.index.month, columns=pd.Grouper(key="dt", freq="M") + ) + exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) + exp_columns.names = [None, "dt"] + exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) tm.assert_frame_equal(res, exp) - res 
= df.pivot_table(index=pd.Grouper(freq='A'), - columns=pd.Grouper(key='dt', freq='M')) - exp = pd.DataFrame([3], - index=pd.DatetimeIndex(['2011-12-31']), - columns=exp_columns) + res = df.pivot_table( + index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M") + ) + exp = pd.DataFrame( + [3], index=pd.DatetimeIndex(["2011-12-31"]), columns=exp_columns + ) tm.assert_frame_equal(res, exp) def test_pivot_multi_values(self): - result = pivot_table(self.data, values=['D', 'E'], - index='A', columns=['B', 'C'], fill_value=0) - expected = pivot_table(self.data.drop(['F'], axis=1), - index='A', columns=['B', 'C'], fill_value=0) + result = pivot_table( + self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 + ) + expected = pivot_table( + self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 + ) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): - f = lambda func: pivot_table(self.data, values=['D', 'E'], - index=['A', 'B'], columns='C', - aggfunc=func) + f = lambda func: pivot_table( + self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func + ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) - expected = concat([means, stds], keys=['mean', 'std'], axis=1) + expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? - f = lambda func: pivot_table(self.data, values=['D', 'E'], - index=['A', 'B'], columns='C', - aggfunc=func, margins=True) + f = lambda func: pivot_table( + self.data, + values=["D", "E"], + index=["A", "B"], + columns="C", + aggfunc=func, + margins=True, + ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) - expected = concat([means, stds], keys=['mean', 'std'], axis=1) + expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_index_with_nan(self, method): # GH 3588 nan = np.nan - df = DataFrame({'a': ['R1', 'R2', nan, 'R4'], - 'b': ['C1', 'C2', 'C3', 'C4'], - 'c': [10, 15, 17, 20]}) + df = DataFrame( + { + "a": ["R1", "R2", nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, 17, 20], + } + ) if method: - result = df.pivot('a', 'b', 'c') + result = df.pivot("a", "b", "c") else: - result = pd.pivot(df, 'a', 'b', 'c') - expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan], - [nan, 15, nan, nan], [nan, nan, nan, 20]], - index=Index([nan, 'R1', 'R2', 'R4'], name='a'), - columns=Index(['C1', 'C2', 'C3', 'C4'], name='b')) + result = pd.pivot(df, "a", "b", "c") + expected = DataFrame( + [ + [nan, nan, 17, nan], + [10, nan, nan, nan], + [nan, 15, nan, nan], + [nan, nan, nan, 20], + ], + index=Index([nan, "R1", "R2", "R4"], name="a"), + columns=Index(["C1", "C2", "C3", "C4"], name="b"), + ) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T) + tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) # GH9491 - df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'), - 'c': 100 + np.arange(6)}) - df['b'] = df['a'] - pd.Timestamp('2014-02-02') - df.loc[1, 'a'] = df.loc[3, 'a'] = nan - df.loc[1, 'b'] = df.loc[4, 'b'] = nan + df = DataFrame( + { + "a": pd.date_range("2014-02-01", periods=6, freq="D"), + "c": 100 + np.arange(6), + } + ) + df["b"] = df["a"] - pd.Timestamp("2014-02-02") + df.loc[1, "a"] = df.loc[3, "a"] = nan + df.loc[1, "b"] = df.loc[4, 
"b"] = nan if method: - pv = df.pivot('a', 'b', 'c') + pv = df.pivot("a", "b", "c") else: - pv = pd.pivot(df, 'a', 'b', 'c') + pv = pd.pivot(df, "a", "b", "c") assert pv.notna().values.sum() == len(df) for _, row in df.iterrows(): - assert pv.loc[row['a'], row['b']] == row['c'] + assert pv.loc[row["a"], row["b"]] == row["c"] if method: - result = df.pivot('b', 'a', 'c') + result = df.pivot("b", "a", "c") else: - result = pd.pivot(df, 'b', 'a', 'c') + result = pd.pivot(df, "b", "a", "c") tm.assert_frame_equal(result, pv.T) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tz(self, method): # GH 5878 - df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0)], - 'dt2': [datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 2, 9, 0), - datetime(2014, 1, 2, 9, 0)], - 'data1': np.arange(4, dtype='int64'), - 'data2': np.arange(4, dtype='int64')}) - - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) - - exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) - exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', - '2014/01/02 09:00'] * 2, - name='dt2', tz='Asia/Tokyo') + df = DataFrame( + { + "dt1": [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + "dt2": [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ) exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', - '2013/01/02 09:00'], - name='dt1', - tz='US/Pacific'), - columns=exp_col) + expected = DataFrame( + [[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=exp_col, + ) if method: - pv = df.pivot(index='dt1', columns='dt2') + pv = df.pivot(index="dt1", columns="dt2") else: - pv = pd.pivot(df, index='dt1', columns='dt2') + pv = pd.pivot(df, index="dt1", columns="dt2") tm.assert_frame_equal(pv, expected) - expected = DataFrame([[0, 2], [1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', - '2013/01/02 09:00'], - name='dt1', - tz='US/Pacific'), - columns=pd.DatetimeIndex(['2014/01/01 09:00', - '2014/01/02 09:00'], - name='dt2', - tz='Asia/Tokyo')) + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ), + ) if method: - pv = df.pivot(index='dt1', columns='dt2', values='data1') + pv = df.pivot(index="dt1", columns="dt2", values="data1") else: - pv = pd.pivot(df, index='dt1', columns='dt2', values='data1') + pv = pd.pivot(df, index="dt1", columns="dt2", values="data1") tm.assert_frame_equal(pv, expected) def test_pivot_tz_in_values(self): # GH 
14948 - df = pd.DataFrame([{'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 13:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 08:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 14:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-25 11:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-25 13:00:00-0700', - tz='US/Pacific')}]) - - df = df.set_index('ts').reset_index() - mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, - second=0, microsecond=0)) - - result = pd.pivot_table(df.set_index('ts').reset_index(), - values='ts', index=['uid'], columns=[mins], - aggfunc=np.min) + df = pd.DataFrame( + [ + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), + }, + ] + ) + + df = df.set_index("ts").reset_index() + mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)) + + result = pd.pivot_table( + df.set_index("ts").reset_index(), + values="ts", + index=["uid"], + columns=[mins], + aggfunc=np.min, + ) expected = pd.DataFrame( [ - [pd.Timestamp('2016-08-12 08:00:00-0700', tz='US/Pacific'), - pd.Timestamp('2016-08-25 11:00:00-0700', tz='US/Pacific')] + [ + pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + ] ], - index=pd.Index(['aa'], name='uid'), + index=pd.Index(["aa"], name="uid"), columns=pd.DatetimeIndex( [ - pd.Timestamp('2016-08-12 00:00:00', tz='US/Pacific'), - pd.Timestamp('2016-08-25 00:00:00', tz='US/Pacific') + pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"), + pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"), ], - name='ts') + name="ts", + ), ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_periods(self, method): - df = DataFrame({'p1': [pd.Period('2013-01-01', 'D'), - pd.Period('2013-01-02', 'D'), - pd.Period('2013-01-01', 'D'), - pd.Period('2013-01-02', 'D')], - 'p2': [pd.Period('2013-01', 'M'), - pd.Period('2013-01', 'M'), - pd.Period('2013-02', 'M'), - pd.Period('2013-02', 'M')], - 'data1': np.arange(4, dtype='int64'), - 'data2': np.arange(4, dtype='int64')}) - - exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) - exp_col2 = pd.PeriodIndex(['2013-01', '2013-02'] * 2, - name='p2', freq='M') + df = DataFrame( + { + "p1": [ + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + ], + "p2": [ + pd.Period("2013-01", "M"), + pd.Period("2013-01", "M"), + pd.Period("2013-02", "M"), + pd.Period("2013-02", "M"), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M") exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.PeriodIndex(['2013-01-01', '2013-01-02'], - name='p1', freq='D'), - columns=exp_col) + expected = DataFrame( + [[0, 2, 
0, 2], [1, 3, 1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=exp_col, + ) if method: - pv = df.pivot(index='p1', columns='p2') + pv = df.pivot(index="p1", columns="p2") else: - pv = pd.pivot(df, index='p1', columns='p2') + pv = pd.pivot(df, index="p1", columns="p2") tm.assert_frame_equal(pv, expected) - expected = DataFrame([[0, 2], [1, 3]], - index=pd.PeriodIndex(['2013-01-01', '2013-01-02'], - name='p1', freq='D'), - columns=pd.PeriodIndex(['2013-01', '2013-02'], - name='p2', freq='M')) + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"), + ) if method: - pv = df.pivot(index='p1', columns='p2', values='data1') + pv = df.pivot(index="p1", columns="p2", values="data1") else: - pv = pd.pivot(df, index='p1', columns='p2', values='data1') + pv = pd.pivot(df, index="p1", columns="p2", values="data1") tm.assert_frame_equal(pv, expected) - @pytest.mark.parametrize('values', [ - ['baz', 'zoo'], np.array(['baz', 'zoo']), - pd.Series(['baz', 'zoo']), pd.Index(['baz', 'zoo']) - ]) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize( + "values", + [ + ["baz", "zoo"], + np.array(["baz", "zoo"]), + pd.Series(["baz", "zoo"]), + pd.Index(["baz", "zoo"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values(self, values, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) if method: - result = df.pivot(index='foo', columns='bar', values=values) + result = df.pivot(index="foo", columns="bar", values=values) else: - result = pd.pivot(df, index='foo', columns='bar', values=values) - - data = [[1, 2, 3, 'x', 'y', 'z'], - [4, 5, 6, 'q', 'w', 't']] - index = Index(data=['one', 'two'], name='foo') - columns = MultiIndex(levels=[['baz', 'zoo'], ['A', 'B', 'C']], - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], - names=[None, 'bar']) - expected = DataFrame(data=data, index=index, - columns=columns, dtype='object') + result = pd.pivot(df, index="foo", columns="bar", values=values) + + data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]] + index = Index(data=["one", "two"], name="foo") + columns = MultiIndex( + levels=[["baz", "zoo"], ["A", "B", "C"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + names=[None, "bar"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('values', [ - ['bar', 'baz'], np.array(['bar', 'baz']), - pd.Series(['bar', 'baz']), pd.Index(['bar', 'baz']) - ]) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize( + "values", + [ + ["bar", "baz"], + np.array(["bar", "baz"]), + pd.Series(["bar", "baz"]), + pd.Index(["bar", "baz"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values_nans(self, values, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = 
pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) if method: - result = df.pivot(index='zoo', columns='foo', values=values) + result = df.pivot(index="zoo", columns="foo", values=values) else: - result = pd.pivot(df, index='zoo', columns='foo', values=values) - - data = [[np.nan, 'A', np.nan, 4], - [np.nan, 'C', np.nan, 6], - [np.nan, 'B', np.nan, 5], - ['A', np.nan, 1, np.nan], - ['B', np.nan, 2, np.nan], - ['C', np.nan, 3, np.nan]] - index = Index(data=['q', 't', 'w', 'x', 'y', 'z'], name='zoo') - columns = MultiIndex(levels=[['bar', 'baz'], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=[None, 'foo']) - expected = DataFrame(data=data, index=index, - columns=columns, dtype='object') + result = pd.pivot(df, index="zoo", columns="foo", values=values) + + data = [ + [np.nan, "A", np.nan, 4], + [np.nan, "C", np.nan, 6], + [np.nan, "B", np.nan, 5], + ["A", np.nan, 1, np.nan], + ["B", np.nan, 2, np.nan], + ["C", np.nan, 3, np.nan], + ] + index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo") + columns = MultiIndex( + levels=[["bar", "baz"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=[None, "foo"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(reason='MultiIndexed unstack with tuple names fails' - 'with KeyError GH#19966') - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.xfail( + reason="MultiIndexed unstack with tuple names fails" "with KeyError GH#19966" + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_multiindex(self, method): # issue #17160 index = Index(data=[0, 1, 2, 3, 4, 5]) - data = [['one', 'A', 1, 'x'], - ['one', 'B', 2, 'y'], - ['one', 'C', 3, 'z'], - ['two', 'A', 4, 'q'], - ['two', 'B', 5, 'w'], - ['two', 'C', 6, 't']] - columns = MultiIndex(levels=[['bar', 'baz'], ['first', 'second']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - df = DataFrame(data=data, index=index, columns=columns, dtype='object') + data = [ + ["one", "A", 1, "x"], + ["one", "B", 2, "y"], + ["one", "C", 3, "z"], + ["two", "A", 4, "q"], + ["two", "B", 5, "w"], + ["two", "C", 6, "t"], + ] + columns = MultiIndex( + levels=[["bar", "baz"], ["first", "second"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + ) + df = DataFrame(data=data, index=index, columns=columns, dtype="object") if method: - result = df.pivot(index=('bar', 'first'), - columns=('bar', 'second'), - values=('baz', 'first')) + result = df.pivot( + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) else: - result = pd.pivot(df, - index=('bar', 'first'), - columns=('bar', 'second'), - values=('baz', 'first')) - - data = {'A': Series([1, 4], index=['one', 'two']), - 'B': Series([2, 5], index=['one', 'two']), - 'C': Series([3, 6], index=['one', 'two'])} + result = pd.pivot( + df, + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) + + data = { + "A": Series([1, 4], index=["one", "two"]), + "B": Series([2, 5], index=["one", "two"]), + "C": Series([3, 6], index=["one", "two"]), + } expected = DataFrame(data) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tuple_of_values(self, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 
'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"): # tuple is seen as a single column name if method: - df.pivot(index='zoo', columns='foo', values=('bar', 'baz')) + df.pivot(index="zoo", columns="foo", values=("bar", "baz")) else: - pd.pivot(df, index='zoo', columns='foo', values=('bar', 'baz')) + pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz")) def test_margins(self): - def _check_output(result, values_col, index=['A', 'B'], - columns=['C'], - margins_col='All'): + def _check_output( + result, values_col, index=["A", "B"], columns=["C"], margins_col="All" + ): col_margins = result.loc[result.index[:-1], margins_col] expected_col_margins = self.data.groupby(index)[values_col].mean() - tm.assert_series_equal(col_margins, expected_col_margins, - check_names=False) + tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) assert col_margins.name == margins_col result = result.sort_index() - index_margins = result.loc[(margins_col, '')].iloc[:-1] + index_margins = result.loc[(margins_col, "")].iloc[:-1] expected_ix_margins = self.data.groupby(columns)[values_col].mean() - tm.assert_series_equal(index_margins, expected_ix_margins, - check_names=False) - assert index_margins.name == (margins_col, '') + tm.assert_series_equal( + index_margins, expected_ix_margins, check_names=False + ) + assert index_margins.name == (margins_col, "") - grand_total_margins = result.loc[(margins_col, ''), margins_col] + grand_total_margins = result.loc[(margins_col, ""), margins_col] expected_total_margins = self.data[values_col].mean() assert grand_total_margins == expected_total_margins # column specified - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', - margins=True, aggfunc=np.mean) - _check_output(result, 'D') + result = self.data.pivot_table( + values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) + _check_output(result, "D") # Set a different margins_name (not 'All') - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', - margins=True, aggfunc=np.mean, - margins_name='Totals') - _check_output(result, 'D', margins_col='Totals') + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.mean, + margins_name="Totals", + ) + _check_output(result, "D", margins_col="Totals") # no column specified - table = self.data.pivot_table(index=['A', 'B'], columns='C', - margins=True, aggfunc=np.mean) + table = self.data.pivot_table( + index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) for value_col in table.columns.levels[0]: _check_output(table[value_col], value_col) @@ -657,55 +840,63 @@ def _check_output(result, values_col, index=['A', 'B'], # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=['AA', 'BB'], margins=True, - aggfunc=np.mean) + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) for value_col in table.columns: - totals = table.loc[('All', ''), value_col] + totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() # no rows - rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True, 
- aggfunc=np.mean) + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) assert isinstance(rtable, Series) - table = self.data.pivot_table(index=['AA', 'BB'], margins=True, - aggfunc='mean') - for item in ['DD', 'EE', 'FF']: - totals = table.loc[('All', ''), item] + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + for item in ["DD", "EE", "FF"]: + totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() def test_margins_dtype(self): # GH 17013 df = self.data.copy() - df[['D', 'E', 'F']] = np.arange(len(df) * 3).reshape(len(df), 3) + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) - mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')] - mi = MultiIndex.from_tuples(mi_val, names=('A', 'B')) - expected = DataFrame({'dull': [12, 21, 3, 9, 45], - 'shiny': [33, 0, 36, 51, 120]}, - index=mi).rename_axis('C', axis=1) - expected['All'] = expected['dull'] + expected['shiny'] + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] - result = df.pivot_table(values='D', index=['A', 'B'], - columns='C', margins=True, - aggfunc=np.sum, fill_value=0) + result = df.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.sum, + fill_value=0, + ) tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(reason='GH#17035 (len of floats is casted back to ' - 'floats)') + @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to " "floats)") def test_margins_dtype_len(self): - mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')] - mi = MultiIndex.from_tuples(mi_val, names=('A', 'B')) - expected = DataFrame({'dull': [1, 1, 2, 1, 5], - 'shiny': [2, 0, 2, 2, 6]}, - index=mi).rename_axis('C', axis=1) - expected['All'] = expected['dull'] + expected['shiny'] + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', margins=True, - aggfunc=len, fill_value=0) + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=len, + fill_value=0, + ) tm.assert_frame_equal(expected, result) @@ -713,407 +904,599 @@ def test_pivot_integer_columns(self): # caused by upstream bug in unstack d = date.min - data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], - [d + timedelta(i) - for i in range(20)], [1.0])) + data = list( + product( + ["foo", "bar"], + ["A", "B", "C"], + ["x1", "x2"], + [d + timedelta(i) for i in range(20)], + [1.0], + ) + ) df = DataFrame(data) table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) - table2 = df2.pivot_table( - values='4', index=['0', '1', '3'], columns=['2']) + table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) tm.assert_frame_equal(table, table2, check_names=False) def test_pivot_no_level_overlap(self): # GH #1181 - data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, - 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 
2, - 'c': (['foo'] * 4 + ['bar'] * 4) * 2, - 'value': np.random.randn(16)}) + data = DataFrame( + { + "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2, + "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2, + "c": (["foo"] * 4 + ["bar"] * 4) * 2, + "value": np.random.randn(16), + } + ) - table = data.pivot_table('value', index='a', columns=['b', 'c']) + table = data.pivot_table("value", index="a", columns=["b", "c"]) - grouped = data.groupby(['a', 'b', 'c'])['value'].mean() - expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') + grouped = data.groupby(["a", "b", "c"])["value"].mean() + expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all") tm.assert_frame_equal(table, expected) def test_pivot_columns_lexsorted(self): n = 10000 - dtype = np.dtype([ - ("Index", object), - ("Symbol", object), - ("Year", int), - ("Month", int), - ("Day", int), - ("Quantity", int), - ("Price", float), - ]) - - products = np.array([ - ('SP500', 'ADBE'), - ('SP500', 'NVDA'), - ('SP500', 'ORCL'), - ('NDQ100', 'AAPL'), - ('NDQ100', 'MSFT'), - ('NDQ100', 'GOOG'), - ('FTSE', 'DGE.L'), - ('FTSE', 'TSCO.L'), - ('FTSE', 'GSK.L'), - ], dtype=[('Index', object), ('Symbol', object)]) + dtype = np.dtype( + [ + ("Index", object), + ("Symbol", object), + ("Year", int), + ("Month", int), + ("Day", int), + ("Quantity", int), + ("Price", float), + ] + ) + + products = np.array( + [ + ("SP500", "ADBE"), + ("SP500", "NVDA"), + ("SP500", "ORCL"), + ("NDQ100", "AAPL"), + ("NDQ100", "MSFT"), + ("NDQ100", "GOOG"), + ("FTSE", "DGE.L"), + ("FTSE", "TSCO.L"), + ("FTSE", "GSK.L"), + ], + dtype=[("Index", object), ("Symbol", object)], + ) items = np.empty(n, dtype=dtype) iproduct = np.random.randint(0, len(products), n) - items['Index'] = products['Index'][iproduct] - items['Symbol'] = products['Symbol'][iproduct] - dr = pd.date_range(date(2000, 1, 1), - date(2010, 12, 31)) + items["Index"] = products["Index"][iproduct] + items["Symbol"] = products["Symbol"][iproduct] + dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] - items['Year'] = dates.year - items['Month'] = dates.month - items['Day'] = dates.day - items['Price'] = np.random.lognormal(4.0, 2.0, n) + items["Year"] = dates.year + items["Month"] = dates.month + items["Day"] = dates.day + items["Price"] = np.random.lognormal(4.0, 2.0, n) df = DataFrame(items) - pivoted = df.pivot_table('Price', index=['Month', 'Day'], - columns=['Index', 'Symbol', 'Year'], - aggfunc='mean') + pivoted = df.pivot_table( + "Price", + index=["Month", "Day"], + columns=["Index", "Symbol", "Year"], + aggfunc="mean", + ) assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): - f = OrderedDict([('D', ['std']), ('E', ['sum'])]) - expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') - result = self.data.pivot_table(index='A', columns='B', aggfunc=f) + f = OrderedDict([("D", ["std"]), ("E", ["sum"])]) + expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") + result = self.data.pivot_table(index="A", columns="B", aggfunc=f) tm.assert_frame_equal(result, expected) def test_margins_no_values_no_cols(self): # Regression test on pivot table: no values or cols passed. 
- result = self.data[['A', 'B']].pivot_table( - index=['A', 'B'], aggfunc=len, margins=True) + result = self.data[["A", "B"]].pivot_table( + index=["A", "B"], aggfunc=len, margins=True + ) result_list = result.tolist() assert sum(result_list[:-1]) == result_list[-1] def test_margins_no_values_two_rows(self): # Regression test on pivot table: no values passed but rows are a # multi-index - result = self.data[['A', 'B', 'C']].pivot_table( - index=['A', 'B'], columns='C', aggfunc=len, margins=True) + result = self.data[["A", "B", "C"]].pivot_table( + index=["A", "B"], columns="C", aggfunc=len, margins=True + ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] def test_margins_no_values_one_row_one_col(self): # Regression test on pivot table: no values passed but row and col # defined - result = self.data[['A', 'B']].pivot_table( - index='A', columns='B', aggfunc=len, margins=True) + result = self.data[["A", "B"]].pivot_table( + index="A", columns="B", aggfunc=len, margins=True + ) assert result.All.tolist() == [4.0, 7.0, 11.0] def test_margins_no_values_two_row_two_cols(self): # Regression test on pivot table: no values passed but rows and cols # are multi-indexed - self.data['D'] = ['a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k'] - result = self.data[['A', 'B', 'C', 'D']].pivot_table( - index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) + self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] + result = self.data[["A", "B", "C", "D"]].pivot_table( + index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True + ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] - @pytest.mark.parametrize( - 'margin_name', ['foo', 'one', 666, None, ['a', 'b']]) + @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) def test_pivot_table_with_margins_set_margin_name(self, margin_name): # see gh-3335 - msg = (r'Conflicting name "{}" in margins|' - "margins_name argument must be a string").format(margin_name) + msg = ( + r'Conflicting name "{}" in margins|' + "margins_name argument must be a string" + ).format(margin_name) with pytest.raises(ValueError, match=msg): # multi-index index - pivot_table(self.data, values='D', index=['A', 'B'], - columns=['C'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["A", "B"], + columns=["C"], + margins=True, + margins_name=margin_name, + ) with pytest.raises(ValueError, match=msg): # multi-index column - pivot_table(self.data, values='D', index=['C'], - columns=['A', 'B'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["C"], + columns=["A", "B"], + margins=True, + margins_name=margin_name, + ) with pytest.raises(ValueError, match=msg): # non-multi-index index/column - pivot_table(self.data, values='D', index=['A'], - columns=['B'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["A"], + columns=["B"], + margins=True, + margins_name=margin_name, + ) def test_pivot_timegrouper(self): - df = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [datetime(2013, 1, 1), - datetime(2013, 1, 1), - datetime(2013, 10, 1), - datetime(2013, 10, 2), - datetime(2013, 10, 1), - datetime(2013, 10, 2), - datetime(2013, 12, 2), - datetime(2013, 12, 2), ]}).set_index('Date') - - expected = DataFrame(np.array([10, 18, 3], dtype='int64') - .reshape(1, 3), - 
index=[datetime(2013, 12, 31)], - columns='Carl Joe Mark'.split()) - expected.index.name = 'Date' - expected.columns.name = 'Buyer' - - result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', - values='Quantity', aggfunc=np.sum) + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1), + datetime(2013, 1, 1), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 12, 2), + datetime(2013, 12, 2), + ], + } + ).set_index("Date") + + expected = DataFrame( + np.array([10, 18, 3], dtype="int64").reshape(1, 3), + index=[datetime(2013, 12, 31)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" + + result = pivot_table( + df, + index=Grouper(freq="A"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="A"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) - expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]) - .reshape(2, 3), - index=[datetime(2013, 1, 1), - datetime(2013, 7, 1)], - columns='Carl Joe Mark'.split()) - expected.index.name = 'Date' - expected.columns.name = 'Buyer' + expected = DataFrame( + np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), + index=[datetime(2013, 1, 1), datetime(2013, 7, 1)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" - result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="6MS"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() - result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), - columns='Buyer', - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="6MS", key="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', key='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) msg = "'The grouper name foo is not found'" with pytest.raises(KeyError, match=msg): - pivot_table(df, index=Grouper(freq='6MS', key='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index=Grouper(freq="6MS", key="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) with pytest.raises(KeyError, match=msg): - pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', key='foo'), - values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="foo"), + values="Quantity", + aggfunc=np.sum, + ) # passing the level - df 
= df.set_index('Date') - result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), - columns='Buyer', values='Quantity', - aggfunc=np.sum) + df = df.set_index("Date") + result = pivot_table( + df, + index=Grouper(freq="6MS", level="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', level='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) msg = "The level foo is not valid" with pytest.raises(ValueError, match=msg): - pivot_table(df, index=Grouper(freq='6MS', level='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index=Grouper(freq="6MS", level="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) with pytest.raises(ValueError, match=msg): - pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', level='foo'), - values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="foo"), + values="Quantity", + aggfunc=np.sum, + ) # double grouper - df = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 11, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 2, 12, 0), - datetime(2013, 12, 5, 14, 0)], - 'PayDay': [datetime(2013, 10, 4, 0, 0), - datetime(2013, 10, 15, 13, 5), - datetime(2013, 9, 5, 20, 0), - datetime(2013, 11, 2, 10, 0), - datetime(2013, 10, 7, 20, 0), - datetime(2013, 9, 5, 10, 0), - datetime(2013, 12, 30, 12, 0), - datetime(2013, 11, 20, 14, 0), ]}) - - result = pivot_table(df, index=Grouper(freq='M', key='Date'), - columns=Grouper(freq='M', key='PayDay'), - values='Quantity', aggfunc=np.sum) - expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, - 6, np.nan, 1, 9, - np.nan, 9, np.nan, np.nan, np.nan, - np.nan, 3, np.nan]).reshape(4, 4), - index=[datetime(2013, 9, 30), - datetime(2013, 10, 31), - datetime(2013, 11, 30), - datetime(2013, 12, 31)], - columns=[datetime(2013, 9, 30), - datetime(2013, 10, 31), - datetime(2013, 11, 30), - datetime(2013, 12, 31)]) - expected.index.name = 'Date' - expected.columns.name = 'PayDay' + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 11, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 11, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 12, 5, 14, 0), + ], + "PayDay": [ + datetime(2013, 10, 4, 0, 0), + datetime(2013, 10, 15, 13, 5), + datetime(2013, 9, 5, 20, 0), + datetime(2013, 11, 2, 10, 0), + datetime(2013, 10, 7, 20, 0), + datetime(2013, 9, 5, 10, 0), + datetime(2013, 12, 30, 12, 0), + datetime(2013, 11, 20, 14, 0), + ], + } + ) + + result = pivot_table( + df, + index=Grouper(freq="M", key="Date"), + columns=Grouper(freq="M", key="PayDay"), + values="Quantity", + aggfunc=np.sum, + ) + expected = DataFrame( + np.array( + [ + np.nan, + 3, + np.nan, + np.nan, + 6, + np.nan, + 1, + 9, + np.nan, + 9, + np.nan, + np.nan, + 
np.nan, + np.nan, + 3, + np.nan, + ] + ).reshape(4, 4), + index=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + columns=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + ) + expected.index.name = "Date" + expected.columns.name = "PayDay" tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), - columns=Grouper(freq='M', key='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="M", key="PayDay"), + columns=Grouper(freq="M", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) - tuples = [(datetime(2013, 9, 30), datetime(2013, 10, 31)), - (datetime(2013, 10, 31), - datetime(2013, 9, 30)), - (datetime(2013, 10, 31), - datetime(2013, 11, 30)), - (datetime(2013, 10, 31), - datetime(2013, 12, 31)), - (datetime(2013, 11, 30), - datetime(2013, 10, 31)), - (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] - idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) - expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, - 9, np.nan, 9, np.nan, - np.nan, 3]).reshape(6, 2), - index=idx, columns=['A', 'B']) - expected.columns.name = 'Branch' + tuples = [ + (datetime(2013, 9, 30), datetime(2013, 10, 31)), + (datetime(2013, 10, 31), datetime(2013, 9, 30)), + (datetime(2013, 10, 31), datetime(2013, 11, 30)), + (datetime(2013, 10, 31), datetime(2013, 12, 31)), + (datetime(2013, 11, 30), datetime(2013, 10, 31)), + (datetime(2013, 12, 31), datetime(2013, 11, 30)), + ] + idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]) + expected = DataFrame( + np.array( + [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3] + ).reshape(6, 2), + index=idx, + columns=["A", "B"], + ) + expected.columns.name = "Branch" result = pivot_table( - df, index=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], columns=['Branch'], - values='Quantity', aggfunc=np.sum) + df, + index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=["Branch"], + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=['Branch'], - columns=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=["Branch"], + columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', - '2013-02-01 15:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) - - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], - tz='US/Pacific', name='dt1') - exp_col1 = Index(['value1', 'value1']) - exp_col2 = Index(['a', 'b'], name='label') + dates1 = [ + "2011-07-19 07:00:00", + 
"2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Pacific", + name="dt1", + ) + exp_col1 = Index(["value1", "value1"]) + exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 3], [1, 4], [2, 5]], - index=exp_idx, columns=exp_col) - result = pivot_table(df, index=['dt1'], columns=[ - 'label'], values=['value1']) + expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) + result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) - exp_col1 = Index(['sum', 'sum', 'sum', 'sum', - 'mean', 'mean', 'mean', 'mean']) - exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) - exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', - '2013-02-01 15:00:00'] * 4, - tz='Asia/Tokyo', name='dt2') + exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"]) + exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) + exp_col3 = pd.DatetimeIndex( + ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, + tz="Asia/Tokyo", + name="dt2", + ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) - expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], - [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2]], - dtype='int64'), - index=exp_idx, - columns=exp_col) - - result = pivot_table(df, index=['dt1'], columns=['dt2'], - values=['value1', 'value2'], - aggfunc=[np.sum, np.mean]) + expected = DataFrame( + np.array( + [ + [0, 3, 1, 2, 0, 3, 1, 2], + [1, 4, 2, 1, 1, 4, 2, 1], + [2, 5, 1, 2, 2, 5, 1, 2], + ], + dtype="int64", + ), + index=exp_idx, + columns=exp_col, + ) + + result = pivot_table( + df, + index=["dt1"], + columns=["dt2"], + values=["value1", "value2"], + aggfunc=[np.sum, np.mean], + ) tm.assert_frame_equal(result, expected) def test_pivot_dtaccessor(self): # GH 8103 - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', - '2013-02-01 15:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) - - result = pivot_table(df, index='label', columns=df['dt1'].dt.hour, - values='value1') - - exp_idx = Index(['a', 'b'], name='label') - expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, - index=exp_idx, - columns=Index([7, 8, 9], name='dt1')) + dates1 = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", 
+ ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) + + result = pivot_table( + df, index="label", columns=df["dt1"].dt.hour, values="value1" + ) + + exp_idx = Index(["a", "b"], name="label") + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=exp_idx, + columns=Index([7, 8, 9], name="dt1"), + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.month, - columns=df['dt1'].dt.hour, - values='value1') + result = pivot_table( + df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1" + ) - expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, - index=Index([1, 2], name='dt2'), - columns=Index([7, 8, 9], name='dt1')) + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=Index([1, 2], name="dt2"), + columns=Index([7, 8, 9], name="dt1"), + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.year.values, - columns=[df['dt1'].dt.hour, df['dt2'].dt.month], - values='value1') + result = pivot_table( + df, + index=df["dt2"].dt.year.values, + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) exp_col = MultiIndex.from_arrays( - [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2']) - expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'), - index=[2013], columns=exp_col) + [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"] + ) + expected = DataFrame( + np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=np.array(['X', 'X', 'X', - 'X', 'Y', 'Y']), - columns=[df['dt1'].dt.hour, df['dt2'].dt.month], - values='value1') - expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], - [np.nan, np.nan, np.nan, - 4, np.nan, 5]]), - index=['X', 'Y'], columns=exp_col) + result = pivot_table( + df, + index=np.array(["X", "X", "X", "X", "Y", "Y"]), + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) + expected = DataFrame( + np.array( + [[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]] + ), + index=["X", "Y"], + columns=exp_col, + ) tm.assert_frame_equal(result, expected) def test_daily(self): - rng = date_range('1/1/2000', '12/31/2004', freq='D') + rng = date_range("1/1/2000", "12/31/2004", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_table(DataFrame(ts), index=ts.index.year, - columns=ts.index.dayofyear) + annual = pivot_table( + DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear + ) annual.columns = annual.columns.droplevel(0) doy = np.asarray(ts.index.dayofyear) @@ -1127,11 +1510,12 @@ def test_daily(self): assert result.name == i def test_monthly(self): - rng = date_range('1/1/2000', '12/31/2004', freq='M') + rng = date_range("1/1/2000", "12/31/2004", freq="M") ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_table(pd.DataFrame(ts), index=ts.index.year, - columns=ts.index.month) + annual = pivot_table( + pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month + ) annual.columns = annual.columns.droplevel(0) month = ts.index.month @@ -1144,120 
+1528,140 @@ def test_monthly(self): def test_pivot_table_with_iterator_values(self): # GH 12017 - aggs = {'D': 'sum', 'E': 'mean'} + aggs = {"D": "sum", "E": "mean"} pivot_values_list = pd.pivot_table( - self.data, index=['A'], values=list(aggs.keys()), aggfunc=aggs, + self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs ) pivot_values_keys = pd.pivot_table( - self.data, index=['A'], values=aggs.keys(), aggfunc=aggs, + self.data, index=["A"], values=aggs.keys(), aggfunc=aggs ) tm.assert_frame_equal(pivot_values_keys, pivot_values_list) agg_values_gen = (value for value in aggs.keys()) pivot_values_gen = pd.pivot_table( - self.data, index=['A'], values=agg_values_gen, aggfunc=aggs, + self.data, index=["A"], values=agg_values_gen, aggfunc=aggs ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) def test_pivot_table_margins_name_with_aggfunc_list(self): # GH 13354 - margins_name = 'Weekly' + margins_name = "Weekly" costs = pd.DataFrame( - {'item': ['bacon', 'cheese', 'bacon', 'cheese'], - 'cost': [2.5, 4.5, 3.2, 3.3], - 'day': ['M', 'M', 'T', 'T']} + { + "item": ["bacon", "cheese", "bacon", "cheese"], + "cost": [2.5, 4.5, 3.2, 3.3], + "day": ["M", "M", "T", "T"], + } ) table = costs.pivot_table( - index="item", columns="day", margins=True, - margins_name=margins_name, aggfunc=[np.mean, max] - ) - ix = pd.Index( - ['bacon', 'cheese', margins_name], dtype='object', name='item' + index="item", + columns="day", + margins=True, + margins_name=margins_name, + aggfunc=[np.mean, max], ) - tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'), - ('mean', 'cost', margins_name), ('max', 'cost', 'M'), - ('max', 'cost', 'T'), ('max', 'cost', margins_name)] - cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day']) + ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item") + tups = [ + ("mean", "cost", "M"), + ("mean", "cost", "T"), + ("mean", "cost", margins_name), + ("max", "cost", "M"), + ("max", "cost", "T"), + ("max", "cost", margins_name), + ] + cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"]) expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is casted back to ' - 'ints)') + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") def test_categorical_margins(self, observed): # GH 10989 - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") - table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is casted back to ' - 'ints)') + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") def test_categorical_margins_category(self, observed): - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = 
Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") - df.y = df.y.astype('category') - df.z = df.z.astype('category') - table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) + df.y = df.y.astype("category") + df.z = df.z.astype("category") + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) def test_categorical_aggfunc(self, observed): # GH 9534 - df = pd.DataFrame({"C1": ["A", "B", "C", "C"], - "C2": ["a", "a", "b", "b"], - "V": [1, 2, 3, 4]}) + df = pd.DataFrame( + {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} + ) df["C1"] = df["C1"].astype("category") - result = df.pivot_table("V", index="C1", columns="C2", - dropna=observed, aggfunc="count") - - expected_index = pd.CategoricalIndex(['A', 'B', 'C'], - categories=['A', 'B', 'C'], - ordered=False, - name='C1') - expected_columns = pd.Index(['a', 'b'], name='C2') - expected_data = np.array([[1., np.nan], - [1., np.nan], - [np.nan, 2.]]) - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_columns) + result = df.pivot_table( + "V", index="C1", columns="C2", dropna=observed, aggfunc="count" + ) + + expected_index = pd.CategoricalIndex( + ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" + ) + expected_columns = pd.Index(["a", "b"], name="C2") + expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) tm.assert_frame_equal(result, expected) def test_categorical_pivot_index_ordering(self, observed): # GH 8731 - df = pd.DataFrame({'Sales': [100, 120, 220], - 'Month': ['January', 'January', 'January'], - 'Year': [2013, 2014, 2013]}) - months = ['January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', - 'December'] - df['Month'] = df['Month'].astype('category').cat.set_categories(months) - result = df.pivot_table(values='Sales', - index='Month', - columns='Year', - dropna=observed, - aggfunc='sum') - expected_columns = pd.Int64Index([2013, 2014], name='Year') - expected_index = pd.CategoricalIndex(['January'], - categories=months, - ordered=False, - name='Month') - expected = pd.DataFrame([[320, 120]], - index=expected_index, - columns=expected_columns) + df = pd.DataFrame( + { + "Sales": [100, 120, 220], + "Month": ["January", "January", "January"], + "Year": [2013, 2014, 2013], + } + ) + months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + df["Month"] = df["Month"].astype("category").cat.set_categories(months) + result = df.pivot_table( + values="Sales", + index="Month", + columns="Year", + dropna=observed, + aggfunc="sum", + ) + expected_columns = pd.Int64Index([2013, 2014], name="Year") + expected_index = pd.CategoricalIndex( + ["January"], categories=months, ordered=False, name="Month" + ) + expected = pd.DataFrame( + [[320, 120]], index=expected_index, columns=expected_columns + ) if not observed: result = result.dropna().astype(np.int64) @@ -1268,111 +1672,142 @@ def test_pivot_table_not_series(self): # pivot_table always returns a DataFrame # when values is not list like and columns is None # and aggfunc is not instance of list - df = DataFrame({'col1': [3, 4, 5], - 'col2': ['C', 'D', 
'E'], - 'col3': [1, 3, 9]}) + df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]}) - result = df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) - m = MultiIndex.from_arrays([[1, 3, 9], - ['C', 'D', 'E']], - names=['col3', 'col2']) - expected = DataFrame([3, 4, 5], - index=m, columns=['col1']) + result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum) + m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"]) + expected = DataFrame([3, 4, 5], index=m, columns=["col1"]) tm.assert_frame_equal(result, expected) - result = df.pivot_table( - 'col1', index='col3', columns='col2', aggfunc=np.sum + result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum) + expected = DataFrame( + [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], + index=Index([1, 3, 9], name="col3"), + columns=Index(["C", "D", "E"], name="col2"), ) - expected = DataFrame([[3, np.NaN, np.NaN], - [np.NaN, 4, np.NaN], - [np.NaN, np.NaN, 5]], - index=Index([1, 3, 9], name='col3'), - columns=Index(['C', 'D', 'E'], name='col2')) tm.assert_frame_equal(result, expected) - result = df.pivot_table('col1', index='col3', aggfunc=[np.sum]) - m = MultiIndex.from_arrays([['sum'], - ['col1']]) - expected = DataFrame([3, 4, 5], - index=Index([1, 3, 9], name='col3'), - columns=m) + result = df.pivot_table("col1", index="col3", aggfunc=[np.sum]) + m = MultiIndex.from_arrays([["sum"], ["col1"]]) + expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m) tm.assert_frame_equal(result, expected) def test_pivot_margins_name_unicode(self): # issue #13292 - greek = '\u0394\u03bf\u03ba\u03b9\u03bc\u03ae' - frame = pd.DataFrame({'foo': [1, 2, 3]}) - table = pd.pivot_table(frame, index=['foo'], aggfunc=len, margins=True, - margins_name=greek) - index = pd.Index([1, 2, 3, greek], dtype='object', name='foo') + greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" + frame = pd.DataFrame({"foo": [1, 2, 3]}) + table = pd.pivot_table( + frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek + ) + index = pd.Index([1, 2, 3, greek], dtype="object", name="foo") expected = pd.DataFrame(index=index) tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 # for correctness purposes - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', - 'bar', 'bar', 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', - 'one', 'two', 'two', 'two', 'one'], - 'C': range(11)}) - - result = pivot_table(data, index='A', columns='B', aggfunc='sum') - mi = MultiIndex(levels=[['C'], ['one', 'two']], - codes=[[0, 0], [0, 1]], names=[None, 'B']) - expected = DataFrame({('C', 'one'): {'bar': 15, 'foo': 13}, - ('C', 'two'): {'bar': 7, 'foo': 20}}, - columns=mi).rename_axis('A') + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": range(11), + } + ) + + result = pivot_table(data, index="A", columns="B", aggfunc="sum") + mi = MultiIndex( + levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"] + ) + expected = DataFrame( + {("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}}, + columns=mi, + ).rename_axis("A") tm.assert_frame_equal(result, expected) - result = pivot_table(data, index='A', columns='B', - aggfunc=['sum', 'mean']) - mi = 
MultiIndex(levels=[['sum', 'mean'], ['C'], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], - names=[None, None, 'B']) - expected = DataFrame({('mean', 'C', 'one'): {'bar': 5.0, 'foo': 3.25}, - ('mean', 'C', 'two'): {'bar': 7.0, - 'foo': 6.666666666666667}, - ('sum', 'C', 'one'): {'bar': 15, 'foo': 13}, - ('sum', 'C', 'two'): {'bar': 7, 'foo': 20}}, - columns=mi).rename_axis('A') + result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"]) + mi = MultiIndex( + levels=[["sum", "mean"], ["C"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], + names=[None, None, "B"], + ) + expected = DataFrame( + { + ("mean", "C", "one"): {"bar": 5.0, "foo": 3.25}, + ("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667}, + ("sum", "C", "one"): {"bar": 15, "foo": 13}, + ("sum", "C", "two"): {"bar": 7, "foo": 20}, + }, + columns=mi, + ).rename_axis("A") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('f, f_numpy', - [('sum', np.sum), - ('mean', np.mean), - ('std', np.std), - (['sum', 'mean'], [np.sum, np.mean]), - (['sum', 'std'], [np.sum, np.std]), - (['std', 'mean'], [np.std, np.mean])]) + @pytest.mark.parametrize( + "f, f_numpy", + [ + ("sum", np.sum), + ("mean", np.mean), + ("std", np.std), + (["sum", "mean"], [np.sum, np.mean]), + (["sum", "std"], [np.sum, np.std]), + (["std", "mean"], [np.std, np.mean]), + ], + ) def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index='A', columns='B', aggfunc=f) - expected = pivot_table(self.data, index='A', columns='B', - aggfunc=f_numpy) + result = pivot_table(self.data, index="A", columns="B", aggfunc=f) + expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': 0}) + df = DataFrame( + {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} + ) msg = "Unstacked DataFrame is too big, causing int32 overflow" with pytest.raises(ValueError, match=msg): - df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + df.pivot_table( + index="ind1", columns="ind2", values="count", aggfunc="count" + ) def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159 - df = pd.DataFrame({'fruit': ['apple', 'peach', 'apple'], - 'size': [1, 1, 2], - 'taste': [7, 6, 6]}) + df = pd.DataFrame( + { + "fruit": ["apple", "peach", "apple"], + "size": [1, 1, 2], + "taste": [7, 6, 6], + } + ) def ret_one(x): return 1 @@ -1383,77 +1818,108 @@ def ret_sum(x): def ret_none(x): return np.nan - result = pd.pivot_table(df, columns='fruit', - aggfunc=[ret_sum, ret_none, ret_one], - dropna=dropna) + result = pd.pivot_table( + df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna + ) data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]] - col = pd.MultiIndex.from_product([['ret_sum', 'ret_none', 'ret_one'], - ['apple', 'peach']], - names=[None, 'fruit']) - expected = pd.DataFrame(data, index=['size', 'taste'], columns=col) + col = pd.MultiIndex.from_product( + [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]], + names=[None, "fruit"], + ) + expected = pd.DataFrame(data, index=["size", "taste"], columns=col) if dropna: - expected = expected.dropna(axis='columns') + expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, 
expected) def test_pivot_table_aggfunc_scalar_dropna(self, dropna): # GH 22159 - df = pd.DataFrame({'A': ['one', 'two', 'one'], - 'x': [3, np.nan, 2], - 'y': [1, np.nan, np.nan]}) + df = pd.DataFrame( + {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} + ) - result = pd.pivot_table(df, columns='A', - aggfunc=np.mean, - dropna=dropna) + result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] - col = pd.Index(['one', 'two'], name='A') - expected = pd.DataFrame(data, index=['x', 'y'], columns=col) + col = pd.Index(["one", "two"], name="A") + expected = pd.DataFrame(data, index=["x", "y"], columns=col) if dropna: - expected = expected.dropna(axis='columns') + expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, expected) class TestCrosstab: - def setup_method(self, method): - df = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) self.df = df.append(df, ignore_index=True) def test_crosstab_single(self): df = self.df - result = crosstab(df['A'], df['C']) - expected = df.groupby(['A', 'C']).size().unstack() + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) def test_crosstab_multiple(self): df = self.df - result = crosstab(df['A'], [df['B'], df['C']]) - expected = df.groupby(['A', 'B', 'C']).size() - expected = expected.unstack( - 'B').unstack('C').fillna(0).astype(np.int64) + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) tm.assert_frame_equal(result, expected) - result = crosstab([df['B'], df['C']], df['A']) - expected = df.groupby(['B', 'C', 'A']).size() - expected = expected.unstack('A').fillna(0).astype(np.int64) + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) tm.assert_frame_equal(result, expected) def test_crosstab_ndarray(self): @@ -1461,30 +1927,32 @@ def test_crosstab_ndarray(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 10, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) - expected = crosstab(df['a'], [df['b'], df['c']]) + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) tm.assert_frame_equal(result, expected) - result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) - expected = crosstab([df['b'], df['c']], df['a']) + result = crosstab([b, c], a, 
colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) tm.assert_frame_equal(result, expected) # assign arbitrary names - result = crosstab(self.df['A'].values, self.df['C'].values) - assert result.index.name == 'row_0' - assert result.columns.name == 'col_0' + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" def test_crosstab_non_aligned(self): # GH 17005 - a = pd.Series([0, 1, 1], index=['a', 'b', 'c']) - b = pd.Series([3, 4, 3, 4, 3], index=['a', 'b', 'c', 'd', 'f']) + a = pd.Series([0, 1, 1], index=["a", "b", "c"]) + b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) c = np.array([3, 4, 3]) - expected = pd.DataFrame([[1, 0], [1, 1]], - index=Index([0, 1], name='row_0'), - columns=Index([3, 4], name='col_0')) + expected = pd.DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) result = crosstab(a, b) tm.assert_frame_equal(result, expected) @@ -1497,27 +1965,26 @@ def test_crosstab_margins(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 5, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True) + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) - assert result.index.names == ('a',) - assert result.columns.names == ['b', 'c'] + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] - all_cols = result['All', ''] - exp_cols = df.groupby(['a']).size().astype('i8') + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") # to keep index.name - exp_margin = Series([len(df)], index=Index(['All'], name='a')) + exp_margin = Series([len(df)], index=Index(["All"], name="a")) exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ('All', '') + exp_cols.name = ("All", "") tm.assert_series_equal(all_cols, exp_cols) - all_rows = result.loc['All'] - exp_rows = df.groupby(['b', 'c']).size().astype('i8') - exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) - exp_rows.name = 'All' + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" exp_rows = exp_rows.reindex(all_rows.index) exp_rows = exp_rows.fillna(0).astype(np.int64) @@ -1529,37 +1996,49 @@ def test_crosstab_margins_set_margin_name(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 5, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True, margins_name='TOTAL') + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) - assert result.index.names == ('a',) - assert result.columns.names == ['b', 'c'] + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] - all_cols = result['TOTAL', ''] - exp_cols = df.groupby(['a']).size().astype('i8') + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") # to keep index.name - exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a')) + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ('TOTAL', '') + exp_cols.name = ("TOTAL", 
"") tm.assert_series_equal(all_cols, exp_cols) - all_rows = result.loc['TOTAL'] - exp_rows = df.groupby(['b', 'c']).size().astype('i8') - exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')])) - exp_rows.name = 'TOTAL' + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + exp_rows.name = "TOTAL" exp_rows = exp_rows.reindex(all_rows.index) exp_rows = exp_rows.fillna(0).astype(np.int64) tm.assert_series_equal(all_rows, exp_rows) msg = "margins_name argument must be a string" - for margins_name in [666, None, ['a', 'b']]: + for margins_name in [666, None, ["a", "b"]]: with pytest.raises(ValueError, match=msg): - crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True, margins_name=margins_name) + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) def test_crosstab_pass_values(self): a = np.random.randint(0, 7, size=100) @@ -1567,28 +2046,29 @@ def test_crosstab_pass_values(self): c = np.random.randint(0, 5, size=100) values = np.random.randn(100) - table = crosstab([a, b], c, values, aggfunc=np.sum, - rownames=['foo', 'bar'], colnames=['baz']) + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) - df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values}) + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) - expected = df.pivot_table('values', index=['foo', 'bar'], - columns='baz', aggfunc=np.sum) + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) tm.assert_frame_equal(table, expected) def test_crosstab_dropna(self): # GH 3820 - a = np.array(['foo', 'foo', 'foo', 'bar', - 'bar', 'foo', 'foo'], dtype=object) - b = np.array(['one', 'one', 'two', 'one', - 'two', 'two', 'two'], dtype=object) - c = np.array(['dull', 'dull', 'dull', 'dull', - 'dull', 'shiny', 'shiny'], dtype=object) - res = pd.crosstab(a, [b, c], rownames=['a'], - colnames=['b', 'c'], dropna=False) - m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), - ('two', 'dull'), ('two', 'shiny')], - names=['b', 'c']) + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) tm.assert_index_equal(res.columns, m) def test_crosstab_no_overlap(self): @@ -1607,302 +2087,319 @@ def test_margin_dropna(self): # pivot_table counts null into margin ('All') # when margins=true and dropna=true - df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], - 'b': [3, 3, 4, 4, 4, 4]}) + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], - 'b': [3, np.nan, 4, 4, 4, 4]}) + df = DataFrame( + {"a": 
[1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3.0, 4.0, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2], - 'b': [3, 3, 4, 4, 4, 4]}) + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) # GH 12642 # _add_margins raises KeyError: Level None not found # when margins=True and dropna=False - df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], - 'b': [3, 3, 4, 4, 4, 4]}) + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], - 'b': [3, np.nan, 4, 4, 4, 4]}) + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3.0, 4.0, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) - a = np.array(['foo', 'foo', 'foo', 'bar', - 'bar', 'foo', 'foo'], dtype=object) - b = np.array(['one', 'one', 'two', 'one', - 'two', np.nan, 'two'], dtype=object) - c = np.array(['dull', 'dull', 'dull', 'dull', - 'dull', 'shiny', 'shiny'], dtype=object) - - actual = pd.crosstab(a, [b, c], rownames=['a'], - colnames=['b', 'c'], margins=True, dropna=False) - m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'], - ['dull', 'shiny', 'dull', 'shiny', '']], - names=['b', 'c']) - expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], - [3, 0, 2, 1, 7]], columns=m) - expected.index = Index(['bar', 'foo', 'All'], name='a') + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") tm.assert_frame_equal(actual, expected) - 
actual = pd.crosstab([a, b], c, rownames=['a', 'b'], - colnames=['c'], margins=True, dropna=False) - m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'], - ['one', 'two', 'one', 'two', '']], - names=['a', 'b']) - expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], - [5, 2, 7]], index=m) - expected.columns = Index(['dull', 'shiny', 'All'], name='c') + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) - actual = pd.crosstab([a, b], c, rownames=['a', 'b'], - colnames=['c'], margins=True, dropna=True) - m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'], - ['one', 'two', 'one', 'two', '']], - names=['a', 'b']) - expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], - [5, 1, 6]], index=m) - expected.columns = Index(['dull', 'shiny', 'All'], name='c') + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) def test_crosstab_normalize(self): # Issue 12578 - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [1, 1, np.nan, 1, 1]}) - - rindex = pd.Index([1, 2], name='a') - cindex = pd.Index([3, 4], name='b') - full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], - index=rindex, columns=cindex) - row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]], - index=rindex, columns=cindex) - col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], - index=rindex, columns=cindex) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = pd.Index([1, 2], name="a") + cindex = pd.Index([3, 4], name="b") + full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = pd.DataFrame( + [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex + ) + col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) # Check all normalize args - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'), - full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), - full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), - row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), - col_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), - pd.crosstab(df.a, df.b, normalize='columns')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), - pd.crosstab(df.a, df.b, normalize='index')) - - row_normal_margins = pd.DataFrame([[1.0, 0], - [0.25, 0.75], - [0.4, 0.6]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4], name='b', - dtype='object')) - col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - - all_normal_margins = pd.DataFrame([[0.2, 0, 0.2], - [0.2, 0.6, 
0.8], - [0.4, 0.6, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', - margins=True), row_normal_margins) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', - margins=True), - col_normal_margins) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, - margins=True), all_normal_margins) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=1), + pd.crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=0), + pd.crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = pd.DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = pd.DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=pd.Index([1, 2], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = pd.DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize="columns", margins=True), + col_normal_margins, + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) # Test arrays - pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], - np.array([1, 2, 1, 2])) + pd.crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) # Test with aggfunc - norm_counts = pd.DataFrame([[0.25, 0, 0.25], - [0.25, 0.5, 0.75], - [0.5, 0.5, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b')) - test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count', - normalize='all', - margins=True) + norm_counts = pd.DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_counts) - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [0, 4, np.nan, 3, 3]}) - - norm_sum = pd.DataFrame([[0, 0, 0.], - [0.4, 0.6, 1], - [0.4, 0.6, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum, - normalize='all', - margins=True) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = pd.DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc=np.sum, 
normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_sum) def test_crosstab_with_empties(self): # Check handling of empties - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) - - empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], - index=pd.Index([1, 2], - name='a', - dtype='int64'), - columns=pd.Index([3, 4], name='b')) - - for i in [True, 'index', 'columns']: - calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', - normalize=i) + df = pd.DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = pd.DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=i + ) tm.assert_frame_equal(empty, calculated) - nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], - index=pd.Index([1, 2], - name='a', - dtype='int64'), - columns=pd.Index([3, 4], name='b')) + nans = pd.DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) - calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', - normalize=False) + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=False + ) tm.assert_frame_equal(nans, calculated) def test_crosstab_errors(self): # Issue 12578 - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [1, 1, np.nan, 1, 1]}) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) - error = 'values cannot be used without an aggfunc.' + error = "values cannot be used without an aggfunc." 
with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, values=df.c) - error = 'aggfunc cannot be used without values' + error = "aggfunc cannot be used without values" with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, aggfunc=np.mean) - error = 'Not a valid normalize argument' + error = "Not a valid normalize argument" with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize='42') + pd.crosstab(df.a, df.b, normalize="42") with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, normalize=42) - error = 'Not a valid margins argument' + error = "Not a valid margins argument" with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize='all', margins=42) + pd.crosstab(df.a, df.b, normalize="all", margins=42) def test_crosstab_with_categorial_columns(self): # GH 8860 - df = pd.DataFrame({'MAKE': ['Honda', 'Acura', 'Tesla', - 'Honda', 'Honda', 'Acura'], - 'MODEL': ['Sedan', 'Sedan', 'Electric', - 'Pickup', 'Sedan', 'Sedan']}) - categories = ['Sedan', 'Electric', 'Pickup'] - df['MODEL'] = (df['MODEL'].astype('category') - .cat.set_categories(categories)) - result = pd.crosstab(df['MAKE'], df['MODEL']) - - expected_index = pd.Index(['Acura', 'Honda', 'Tesla'], name='MAKE') - expected_columns = pd.CategoricalIndex(categories, - categories=categories, - ordered=False, - name='MODEL') + df = pd.DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = pd.crosstab(df["MAKE"], df["MODEL"]) + + expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = pd.CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_columns) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) tm.assert_frame_equal(result, expected) def test_crosstab_with_numpy_size(self): # GH 4003 - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, - 'B': ['A', 'B', 'C'] * 8, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D': np.random.randn(24), - 'E': np.random.randn(24)}) - result = pd.crosstab(index=[df['A'], df['B']], - columns=[df['C']], - margins=True, - aggfunc=np.size, - values=df['D']) - expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'], - ['', 'A', 'B', 'C']], - codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], - [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], - names=['A', 'B']) - expected_column = pd.Index(['bar', 'foo', 'All'], - dtype='object', - name='C') - expected_data = np.array([[2., 2., 4.], - [2., 2., 4.], - [2., 2., 4.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [12., 12., 24.]]) - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = pd.crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = pd.MultiIndex( + levels=[["All", "one", 
"three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): # GH 13279 - s = pd.Series(range(3), name='foo') + s = pd.Series(range(3), name="foo") result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name='foo') - expected = pd.DataFrame(np.eye(3, dtype=np.int64), - index=expected_index, - columns=expected_index) + expected_index = pd.Index(range(3), name="foo") + expected = pd.DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("names", [['a', ('b', 'c')], - [('a', 'b'), 'c']]) + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): s1 = pd.Series(range(3), name=names[0]) s2 = pd.Series(range(1, 4), name=names[1]) @@ -1914,13 +2411,13 @@ def test_crosstab_tuple_name(self, names): tm.assert_frame_equal(result, expected) def test_crosstab_unsorted_order(self): - df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]}, - index=['C', 'A', 'B']) + df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) result = pd.crosstab(df.index, [df.b, df.a]) - e_idx = pd.Index(['A', 'B', 'C'], name='row_0') - e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], - names=['b', 'a']) - expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], - index=e_idx, - columns=e_columns) + e_idx = pd.Index(["A", "B", "C"], name="row_0") + e_columns = pd.MultiIndex.from_tuples( + [(1, 4), (2, 6), (3, 5)], names=["b", "a"] + ) + expected = pd.DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index e66484822c625a..cb46918157e89c 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -4,8 +4,20 @@ import pytest from pandas import ( - Categorical, DatetimeIndex, Interval, IntervalIndex, NaT, Series, - TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, timedelta_range) + Categorical, + DatetimeIndex, + Interval, + IntervalIndex, + NaT, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.algorithms import quantile import pandas.util.testing as tm @@ -19,7 +31,7 @@ def test_qcut(): # We store the bins as Index that have been # rounded to comparisons are a bit tricky. 
labels, bins = qcut(arr, 4, retbins=True) - ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) + ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0]) result = labels.categories.left.values assert np.allclose(result, ex_bins[:-1], atol=1e-2) @@ -40,7 +52,7 @@ def test_qcut_bounds(): def test_qcut_specify_quantiles(): arr = np.random.randn(100) - factor = qcut(arr, [0, .25, .5, .75, 1.]) + factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0]) expected = qcut(arr, 4) tm.assert_categorical_equal(factor, expected) @@ -55,8 +67,14 @@ def test_qcut_include_lowest(): values = np.arange(10) ii = qcut(values, 4) - ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5), - Interval(4.5, 6.75), Interval(6.75, 9)]) + ex_levels = IntervalIndex( + [ + Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9), + ] + ) tm.assert_index_equal(ii.categories, ex_levels) @@ -93,8 +111,9 @@ def test_qcut_binning_issues(datapath): starts.append(float(s)) ends.append(float(e)) - for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]), - zip(ends[:-1], ends[1:])): + for (sp, sn), (ep, en) in zip( + zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:]) + ): assert sp < sn assert ep < en assert ep <= sn @@ -104,19 +123,22 @@ def test_qcut_return_intervals(): ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = qcut(ser, [0, 0.333, 0.666, 1]) - exp_levels = np.array([Interval(-0.001, 2.664), - Interval(2.664, 5.328), Interval(5.328, 8)]) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( - CDT(ordered=True)) + exp_levels = np.array( + [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] + ) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) tm.assert_series_equal(res, exp) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(duplicates="drop"), None), - (dict(), "Bin edges must be unique"), - (dict(duplicates="raise"), "Bin edges must be unique"), - (dict(duplicates="foo"), "invalid value for 'duplicates' parameter") -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) def test_qcut_duplicates_bin(kwargs, msg): # see gh-7751 values = [0, 0, 0, 0, 1, 2, 3] @@ -130,11 +152,9 @@ def test_qcut_duplicates_bin(kwargs, msg): tm.assert_index_equal(result.categories, expected) -@pytest.mark.parametrize("data,start,end", [ - (9.0, 8.999, 9.0), - (0.0, -0.001, 0.0), - (-9.0, -9.001, -9.0), -]) +@pytest.mark.parametrize( + "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)] +) @pytest.mark.parametrize("length", [1, 2]) @pytest.mark.parametrize("labels", [None, False]) def test_single_quantile(data, start, end, length, labels): @@ -143,8 +163,7 @@ def test_single_quantile(data, start, end, length, labels): result = qcut(ser, 1, labels=labels) if labels is None: - intervals = IntervalIndex([Interval(start, end)] * - length, closed="right") + intervals = IntervalIndex([Interval(start, end)] * length, closed="right") expected = Series(intervals).astype(CDT(ordered=True)) else: expected = Series([0] * length) @@ -152,15 +171,19 @@ def test_single_quantile(data, start, end, length, labels): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("ser", [ - Series(DatetimeIndex(["20180101", NaT, "20180103"])), - Series(TimedeltaIndex(["0 days", NaT, "2 days"]))], - ids=lambda x: str(x.dtype)) 
+@pytest.mark.parametrize( + "ser", + [ + Series(DatetimeIndex(["20180101", NaT, "20180103"])), + Series(TimedeltaIndex(["0 days", NaT, "2 days"])), + ], + ids=lambda x: str(x.dtype), +) def test_qcut_nat(ser): # see gh-19768 - intervals = IntervalIndex.from_tuples([ - (ser[0] - Nano(), ser[2] - Day()), - np.nan, (ser[2] - Day(), ser[2])]) + intervals = IntervalIndex.from_tuples( + [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] + ) expected = Series(Categorical(intervals, ordered=True)) result = qcut(ser, 2) @@ -174,22 +197,40 @@ def test_datetime_tz_qcut(bins): ser = Series(date_range("20130101", periods=3, tz=tz)) result = qcut(ser, bins) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz)), - Interval(Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz)), - Interval(Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:59:59.999999999", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("arg,expected_bins", [ - [timedelta_range("1day", periods=3), - TimedeltaIndex(["1 days", "2 days", "3 days"])], - [date_range("20180101", periods=3), - DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])]]) +@pytest.mark.parametrize( + "arg,expected_bins", + [ + [ + timedelta_range("1day", periods=3), + TimedeltaIndex(["1 days", "2 days", "3 days"]), + ], + [ + date_range("20180101", periods=3), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]), + ], + ], +) def test_date_like_qcut_bins(arg, expected_bins): # see gh-19891 ser = Series(arg) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index d0979fb86d36d5..1c9e3e57bc310d 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -15,22 +15,19 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestGetDummies: - @pytest.fixture def df(self): - return DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]}) - @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None]) + @pytest.fixture(params=["uint8", "i8", np.float64, bool, None]) def dtype(self, request): return np.dtype(request.param) - @pytest.fixture(params=['dense', 'sparse']) + @pytest.fixture(params=["dense", "sparse"]) def sparse(self, request): # params are strings to simplify reading test results, # e.g. 
TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] - return request.param == 'sparse' + return request.param == "sparse" def effective_dtype(self, dtype): if dtype is None: @@ -39,17 +36,17 @@ def effective_dtype(self, dtype): def test_raises_on_dtype_object(self, df): with pytest.raises(ValueError): - get_dummies(df, dtype='object') + get_dummies(df, dtype="object") def test_basic(self, sparse, dtype): - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype)) + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + ) if sparse: expected = expected.apply(pd.SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) @@ -58,23 +55,23 @@ def test_basic(self, sparse, dtype): result = get_dummies(s_series, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) - expected.index = list('ABC') + expected.index = list("ABC") result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def test_basic_types(self, sparse, dtype): # GH 10531 - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_df = DataFrame({'a': [0, 1, 0, 1, 2], - 'b': ['A', 'A', 'B', 'C', 'C'], - 'c': [2, 3, 3, 3, 2]}) - - expected = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype), - columns=list('abc')) + s_df = DataFrame( + {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]} + ) + + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + columns=list("abc"), + ) if sparse: if is_integer_dtype(dtype): fill_value = 0 @@ -90,12 +87,10 @@ def test_basic_types(self, sparse, dtype): result = get_dummies(s_series, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) - result = get_dummies(s_df, columns=s_df.columns, - sparse=sparse, dtype=dtype) + result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype) if sparse: - dtype_name = 'Sparse[{}, {}]'.format( - self.effective_dtype(dtype).name, - fill_value + dtype_name = "Sparse[{}, {}]".format( + self.effective_dtype(dtype).name, fill_value ) else: dtype_name = self.effective_dtype(dtype).name @@ -105,9 +100,9 @@ def test_basic_types(self, sparse, dtype): result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) - result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) + result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - expected_counts = {'int64': 1, 'object': 1} + expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) expected = Series(expected_counts).sort_index() @@ -119,7 +114,7 @@ def test_basic_types(self, sparse, dtype): def test_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) - just_na_series_index = Series(just_na_list, index=['A']) + just_na_series_index = Series(just_na_list, index=["A"]) res_list = get_dummies(just_na_list, sparse=sparse) res_series = get_dummies(just_na_series, sparse=sparse) @@ -131,66 +126,68 @@ def test_just_na(self, sparse): assert res_list.index.tolist() == [0] assert res_series.index.tolist() == [0] - assert res_series_index.index.tolist() == 
['A'] + assert res_series_index.index.tolist() == ["A"] def test_include_na(self, sparse, dtype): - s = ['a', 'b', np.nan] + s = ["a", "b", np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) - exp = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0]}, - dtype=self.effective_dtype(dtype)) + exp = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) + ) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) - exp_na = DataFrame({nan: [0, 0, 1], - 'a': [1, 0, 0], - 'b': [0, 1, 0]}, - dtype=self.effective_dtype(dtype)) - exp_na = exp_na.reindex(['a', 'b', nan], axis=1) + exp_na = DataFrame( + {nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]}, + dtype=self.effective_dtype(dtype), + ) + exp_na = exp_na.reindex(["a", "b", nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, - sparse=sparse, dtype=dtype) - exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=self.effective_dtype(dtype)) + res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype) + exp_just_na = DataFrame( + Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype) + ) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self, sparse): # See GH 6885 - get_dummies chokes on unicode values import unicodedata - e = 'e' - eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') + + e = "e" + eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE") s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=sparse) - exp = DataFrame({'letter_e': [1, 0, 0], - 'letter_%s' % eacute: [0, 1, 1]}, - dtype=np.uint8) + res = get_dummies(s, prefix="letter", sparse=sparse) + exp = DataFrame( + {"letter_e": [1, 0, 0], "letter_%s" % eacute: [0, 1, 1]}, dtype=np.uint8 + ) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): - df = df[['A', 'B']] + df = df[["A", "B"]] result = get_dummies(df, sparse=sparse) - expected = DataFrame({'A_a': [1, 0, 1], - 'A_b': [0, 1, 0], - 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame( + {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, + dtype=np.uint8, + ) if sparse: - expected = pd.DataFrame({ - "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'), - "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'), - "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'), - "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'), - }) + expected = pd.DataFrame( + { + "A_a": pd.SparseArray([1, 0, 1], dtype="uint8"), + "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"), + "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"), + "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"), + } + ) assert_frame_equal(result, expected) @@ -202,26 +199,34 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): else: arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3], - 'A_a': arr([1, 0, 1], dtype=typ), - 'A_b': arr([0, 1, 0], dtype=typ), - 'B_b': arr([1, 1, 0], dtype=typ), - 'B_c': arr([0, 0, 1], dtype=typ)}) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], 
dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + } + ) + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_list(self, df, sparse): - prefixes = ['from_A', 'from_B'] + prefixes = ["from_A", "from_B"] result = get_dummies(df, prefix=prefixes, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0], - 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}, - dtype=np.uint8) - expected[['C']] = df[['C']] - cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected = expected[['C'] + cols] + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] + expected = expected[["C"] + cols] typ = pd.SparseArray if sparse else pd.Series expected[cols] = expected[cols].apply(lambda x: typ(x)) @@ -229,98 +234,109 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - result = get_dummies(df, prefix='bad', sparse=sparse) - bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] - expected = DataFrame([[1, 1, 0, 1, 0], - [2, 0, 1, 1, 0], - [3, 1, 0, 0, 1]], - columns=['C'] + bad_columns, - dtype=np.uint8) + result = get_dummies(df, prefix="bad", sparse=sparse) + bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["C"] + bad_columns, + dtype=np.uint8, + ) expected = expected.astype({"C": np.int64}) if sparse: # work around astyping & assigning with duplicate columns # https://github.com/pandas-dev/pandas/issues/14427 - expected = pd.concat([ - pd.Series([1, 2, 3], name='C'), - pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'), - pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'), - pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'), - pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'), - ], axis=1) + expected = pd.concat( + [ + pd.Series([1, 2, 3], name="C"), + pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), + pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + ], + axis=1, + ) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): - result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=sparse) - expected = DataFrame({'B': ['b', 'b', 'c'], - 'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0]}, dtype=np.uint8) - expected[['C']] = df[['C']] + result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse) + expected = DataFrame( + { + "B": ["b", "b", "c"], + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] if sparse: - cols = ['from_A_a', 'from_A_b'] + cols = ["from_A_a", "from_A_b"] expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): - result = get_dummies(df, prefix_sep='..', sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'A..a': [1, 0, 1], - 'A..b': [0, 1, 0], - 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}, - 
dtype=np.uint8) - expected[['C']] = df[['C']] - expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + result = get_dummies(df, prefix_sep="..", sparse=sparse) + expected = DataFrame( + { + "C": [1, 2, 3], + "A..a": [1, 0, 1], + "A..b": [0, 1, 0], + "B..b": [1, 1, 0], + "B..c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: - cols = ['A..a', 'A..b', 'B..b', 'B..c'] + cols = ["A..a", "A..b", "B..b", "B..c"] expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse) - expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) + result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse) + expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, - sparse=sparse) + result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(df, prefix=['too few'], sparse=sparse) + get_dummies(df, prefix=["too few"], sparse=sparse) def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(df, prefix_sep=['bad'], sparse=sparse) + get_dummies(df, prefix_sep=["bad"], sparse=sparse) def test_dataframe_dummies_prefix_dict(self, sparse): - prefixes = {'A': 'from_A', 'B': 'from_B'} - df = DataFrame({'C': [1, 2, 3], - 'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c']}) + prefixes = {"A": "from_A", "B": "from_B"} + df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]}) result = get_dummies(df, prefix=prefixes, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0], - 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}) + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + } + ) - columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(np.uint8) if sparse: - expected[columns] = expected[columns].apply( - lambda x: pd.SparseSeries(x) - ) + expected[columns] = expected[columns].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self, df, sparse, dtype): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, - sparse=sparse, dtype=dtype).sort_index(axis=1) + result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index( + axis=1 + ) if sparse: arr = SparseArray @@ -329,23 +345,26 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype): arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_a': arr([1, 0, 1, 0], dtype=typ), - 'A_b': arr([0, 1, 0, 0], dtype=typ), - 'A_nan': arr([0, 0, 0, 1], dtype=typ), - 'B_b': arr([1, 1, 0, 0], dtype=typ), - 'B_c': arr([0, 0, 1, 0], dtype=typ), - 'B_nan': arr([0, 0, 0, 1], dtype=typ) - }).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_a": arr([1, 0, 1, 0], dtype=typ), + "A_b": arr([0, 1, 0, 0], dtype=typ), + "A_nan": arr([0, 0, 0, 1], dtype=typ), + "B_b": arr([1, 1, 0, 0], dtype=typ), + "B_c": arr([0, 0, 1, 0], dtype=typ), + 
"B_nan": arr([0, 0, 0, 1], dtype=typ), + } + ).sort_index(axis=1) assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): - df['cat'] = pd.Categorical(['x', 'y', 'y']) + df["cat"] = pd.Categorical(["x", "y", "y"]) result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray @@ -354,29 +373,41 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3], - 'A_a': arr([1, 0, 1], dtype=typ), - 'A_b': arr([0, 1, 0], dtype=typ), - 'B_b': arr([1, 1, 0], dtype=typ), - 'B_c': arr([0, 0, 1], dtype=typ), - 'cat_x': arr([1, 0, 0], dtype=typ), - 'cat_y': arr([0, 1, 1], dtype=typ) - }).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + "cat_x": arr([1, 0, 0], dtype=typ), + "cat_y": arr([0, 1, 1], dtype=typ), + } + ).sort_index(axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('get_dummies_kwargs,expected', [ - ({'data': pd.DataFrame(({'ä': ['a']}))}, - pd.DataFrame({'ä_a': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['ä']})}, - pd.DataFrame({'x_ä': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['a']}), 'prefix':'ä'}, - pd.DataFrame({'ä_a': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['a']}), 'prefix_sep':'ä'}, - pd.DataFrame({'xäa': [1]}, dtype=np.uint8))]) + @pytest.mark.parametrize( + "get_dummies_kwargs,expected", + [ + ( + {"data": pd.DataFrame(({"ä": ["a"]}))}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["ä"]})}, + pd.DataFrame({"x_ä": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, + pd.DataFrame({"xäa": [1]}, dtype=np.uint8), + ), + ], + ) def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): # GH22084 pd.get_dummies incorrectly encodes unicode characters # in dataframe column names @@ -386,13 +417,11 @@ def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): def test_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: @@ -402,15 +431,15 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - expected.index = list('ABC') + expected.index = list("ABC") result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) def test_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. 
- s_list = list('aaa') + s_list = list("aaa") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) expected = DataFrame(index=np.arange(3)) @@ -420,56 +449,49 @@ def test_basic_drop_first_one_level(self, sparse): result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - expected = DataFrame(index=list('ABC')) + expected = DataFrame(index=list("ABC")) result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) def test_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first - s_NA = ['a', 'b', np.nan] + s_NA = ["a", "b", np.nan] res = get_dummies(s_NA, drop_first=True, sparse=sparse) - exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8) + exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, - sparse=sparse) - exp_na = DataFrame( - {'b': [0, 1, 0], - nan: [0, 0, 1]}, - dtype=np.uint8).reindex(['b', nan], axis=1) + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) + exp_na = DataFrame({"b": [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8).reindex( + ["b", nan], axis=1 + ) if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, - sparse=sparse) + res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, sparse=sparse) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self, df, sparse): - df = df[['A', 'B']] + df = df[["A", "B"]] result = get_dummies(df, drop_first=True, sparse=sparse) - expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: expected = expected.apply(pd.SparseArray, fill_value=0) assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_categorical( - self, df, sparse, dtype): - df['cat'] = pd.Categorical(['x', 'y', 'y']) + def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): + df["cat"] = pd.Categorical(["x", "y", "y"]) result = get_dummies(df, drop_first=True, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'A_b': [0, 1, 0], - 'B_c': [0, 0, 1], - 'cat_y': [0, 1, 1]}) - cols = ['A_b', 'B_c', 'cat_y'] + expected = DataFrame( + {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} + ) + cols = ["A_b", "B_c", "cat_y"] expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'A_b', 'B_c', 'cat_y']] + expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: expected[col] = pd.SparseSeries(expected[col]) @@ -477,14 +499,19 @@ def test_dataframe_dummies_drop_first_with_categorical( def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, drop_first=True, - sparse=sparse).sort_index(axis=1) - expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_b': [0, 1, 0, 0], - 'A_nan': [0, 0, 0, 1], - 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}) - cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] + result = get_dummies( + df, dummy_na=True, drop_first=True, sparse=sparse + ).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_b": [0, 1, 0, 
0], + "A_nan": [0, 0, 0, 1], + "B_c": [0, 0, 1, 0], + "B_nan": [0, 0, 0, 1], + } + ) + cols = ["A_b", "A_nan", "B_c", "B_nan"] expected[cols] = expected[cols].astype(np.uint8) expected = expected.sort_index(axis=1) if sparse: @@ -493,85 +520,76 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, drop_first=True, - sparse=sparse) - expected = expected[['C', 'A_b', 'B_c']] + result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse) + expected = expected[["C", "A_b", "B_c"]] assert_frame_equal(result, expected) def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], - [0, 1], - [1, 0]], - columns=[1, 2], - dtype=np.uint8) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) - data = Series(pd.Categorical(['a', 'b', 'a'])) + data = Series(pd.Categorical(["a", "b", "a"])) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], - [0, 1], - [1, 0]], - columns=pd.Categorical(['a', 'b']), - dtype=np.uint8) + expected = DataFrame( + [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8 + ) tm.assert_frame_equal(result, expected) def test_int_df(self, dtype): data = DataFrame( - {'A': [1, 2, 1], - 'B': pd.Categorical(['a', 'b', 'a']), - 'C': [1, 2, 1], - 'D': [1., 2., 1.] - } + { + "A": [1, 2, 1], + "B": pd.Categorical(["a", "b", "a"]), + "C": [1, 2, 1], + "D": [1.0, 2.0, 1.0], + } + ) + columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"] + expected = DataFrame( + [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]], + columns=columns, ) - columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b'] - expected = DataFrame([ - [1, 1., 1, 0, 1, 0], - [2, 2., 0, 1, 0, 1], - [1, 1., 1, 0, 1, 0] - ], columns=columns) expected[columns[2:]] = expected[columns[2:]].astype(dtype) - result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype) + result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): # GH13854 for ordered in [False, True]: - cat = pd.Categorical(list("xy"), categories=list("xyz"), - ordered=ordered) + cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], - dtype=self.effective_dtype(dtype)) - cols = pd.CategoricalIndex(cat.categories, - categories=cat.categories, - ordered=ordered) - expected = DataFrame(data, columns=cols, - dtype=self.effective_dtype(dtype)) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) + cols = pd.CategoricalIndex( + cat.categories, categories=cat.categories, ordered=ordered + ) + expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('sparse', [True, False]) + @pytest.mark.parametrize("sparse", [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 - df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]), - ('Nation', ['AB', 'CD'])])) - df = get_dummies(df, columns=['Nation'], sparse=sparse) - df2 = df.reindex(columns=['GDP']) + df = DataFrame.from_dict( + OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])]) + ) + df = get_dummies(df, columns=["Nation"], sparse=sparse) + df2 = df.reindex(columns=["GDP"]) - tm.assert_frame_equal(df[['GDP']], df2) + 
tm.assert_frame_equal(df[["GDP"]], df2) def test_get_dummies_duplicate_columns(self, df): # GH20839 df.columns = ["A", "A", "A"] result = get_dummies(df).sort_index(axis=1) - expected = DataFrame([[1, 1, 0, 1, 0], - [2, 0, 1, 1, 0], - [3, 1, 0, 0, 1]], - columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'], - dtype=np.uint8).sort_index(axis=1) + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["A", "A_a", "A_b", "A_b", "A_c"], + dtype=np.uint8, + ).sort_index(axis=1) expected = expected.astype({"A": np.int64}) @@ -579,54 +597,54 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = pd.DataFrame({"A": [1, 2]}) - result = pd.get_dummies(df, columns=['A'], sparse=True) - dtype = SparseDtype('uint8', 0) - expected = pd.DataFrame({ - 'A_1': SparseArray([1, 0], dtype=dtype), - 'A_2': SparseArray([0, 1], dtype=dtype), - }) + result = pd.get_dummies(df, columns=["A"], sparse=True) + dtype = SparseDtype("uint8", 0) + expected = pd.DataFrame( + { + "A_1": SparseArray([1, 0], dtype=dtype), + "A_2": SparseArray([0, 1], dtype=dtype), + } + ) tm.assert_frame_equal(result, expected) class TestCategoricalReshape: - def test_reshaping_multi_index_categorical(self): - cols = ['ItemA', 'ItemB', 'ItemC'] + cols = ["ItemA", "ItemB", "ItemC"] data = {c: tm.makeTimeDataFrame() for c in cols} - df = pd.concat({c: data[c].stack() for c in data}, axis='columns') - df.index.names = ['major', 'minor'] - df['str'] = 'foo' + df = pd.concat({c: data[c].stack() for c in data}, axis="columns") + df.index.names = ["major", "minor"] + df["str"] = "foo" dti = df.index.levels[0] - df['category'] = df['str'].astype('category') - result = df['category'].unstack() + df["category"] = df["str"].astype("category") + result = df["category"].unstack() - c = Categorical(['foo'] * len(dti)) - expected = DataFrame({'A': c.copy(), - 'B': c.copy(), - 'C': c.copy(), - 'D': c.copy()}, - columns=Index(list('ABCD'), name='minor'), - index=dti) + c = Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=Index(list("ABCD"), name="minor"), + index=dti, + ) tm.assert_frame_equal(result, expected) class TestMakeAxisDummies: - def test_preserve_categorical_dtype(self): # GH13854 for ordered in [False, True]: cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) - midx = pd.MultiIndex(levels=[['a'], cidx], - codes=[[0, 0], [0, 1]]) + midx = pd.MultiIndex(levels=[["a"], cidx], codes=[[0, 0], [0, 1]]) df = DataFrame([[10, 11]], index=midx) - expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], - index=midx, columns=cidx) + expected = DataFrame( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx + ) from pandas.core.reshape.reshape import make_axis_dummies + result = make_axis_dummies(df) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 75dc2ccc54a83d..188f08777668e1 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -9,44 +9,47 @@ class TestUnionCategoricals: - def test_union_categorical(self): # GH 13361 data = [ - (list('abc'), list('abd'), list('abcabd')), + (list("abc"), list("abd"), list("abcabd")), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), - - (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], - ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), - - 
(pd.date_range('2014-01-01', '2014-01-05'), - pd.date_range('2014-01-06', '2014-01-07'), - pd.date_range('2014-01-01', '2014-01-07')), - - (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), - pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), - pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), - - (pd.period_range('2014-01-01', '2014-01-05'), - pd.period_range('2014-01-06', '2014-01-07'), - pd.period_range('2014-01-01', '2014-01-07')), + ( + ["b", "b", np.nan, "a"], + ["a", np.nan, "c"], + ["b", "b", np.nan, "a", "a", np.nan, "c"], + ), + ( + pd.date_range("2014-01-01", "2014-01-05"), + pd.date_range("2014-01-06", "2014-01-07"), + pd.date_range("2014-01-01", "2014-01-07"), + ), + ( + pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"), + pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"), + pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"), + ), + ( + pd.period_range("2014-01-01", "2014-01-05"), + pd.period_range("2014-01-06", "2014-01-07"), + pd.period_range("2014-01-01", "2014-01-07"), + ), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: - result = union_categoricals([box(Categorical(a)), - box(Categorical(b))]) + result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, - check_category_order=True) + tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance - s = Categorical(['x', 'y', 'z']) - s2 = Categorical(['a', 'b', 'c']) + s = Categorical(["x", "y", "z"]) + s2 = Categorical(["a", "b", "c"]) result = union_categoricals([s, s2]) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) @@ -58,60 +61,66 @@ def test_union_categorical(self): # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) - msg = 'dtype of categories must be the same' + msg = "dtype of categories must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([s, s2]) - msg = 'No Categoricals to union' + msg = "No Categoricals to union" with pytest.raises(ValueError, match=msg): union_categoricals([]) def test_union_categoricals_nan(self): # GH 13759 - res = union_categoricals([pd.Categorical([1, 2, np.nan]), - pd.Categorical([3, 2, np.nan])]) + res = union_categoricals( + [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])] + ) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([pd.Categorical(['A', 'B']), - pd.Categorical(['B', 'B', np.nan])]) - exp = Categorical(['A', 'B', 'B', 'B', np.nan]) + res = union_categoricals( + [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])] + ) + exp = Categorical(["A", "B", "B", "B", np.nan]) tm.assert_categorical_equal(res, exp) - val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), - pd.NaT] - val2 = [pd.NaT, pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-02-01')] + val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT] + val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) - exp = Categorical(val1 + val2, - categories=[pd.Timestamp('2011-01-01'), - 
pd.Timestamp('2011-03-01'), - pd.Timestamp('2011-02-01')]) + exp = Categorical( + val1 + val2, + categories=[ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-03-01"), + pd.Timestamp("2011-02-01"), + ], + ) tm.assert_categorical_equal(res, exp) # all NaN - res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan], - dtype=object)), - pd.Categorical(['X'])]) - exp = Categorical([np.nan, np.nan, 'X']) + res = union_categoricals( + [ + pd.Categorical(np.array([np.nan, np.nan], dtype=object)), + pd.Categorical(["X"]), + ] + ) + exp = Categorical([np.nan, np.nan, "X"]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([pd.Categorical([np.nan, np.nan]), - pd.Categorical([np.nan, np.nan])]) + res = union_categoricals( + [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])] + ) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) def test_union_categoricals_empty(self): # GH 13759 - res = union_categoricals([pd.Categorical([]), - pd.Categorical([])]) + res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([Categorical([]), - Categorical(['1'])]) - exp = Categorical(['1']) + res = union_categoricals([Categorical([]), Categorical(["1"])]) + exp = Categorical(["1"]) tm.assert_categorical_equal(res, exp) def test_union_categorical_same_category(self): @@ -119,31 +128,30 @@ def test_union_categorical_same_category(self): c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) - exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], - categories=[1, 2, 3, 4]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) - c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) - c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"]) + c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"]) res = union_categoricals([c1, c2]) - exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], - categories=['x', 'y', 'z']) + exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]) tm.assert_categorical_equal(res, exp) def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 - c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) - c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2]) - expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical( + ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) - msg = 'Categorical.ordered must be the same' + msg = "Categorical.ordered must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([c1, c2]) @@ -174,7 +182,7 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - msg = 'Categorical.ordered must be the same' + msg = "Categorical.ordered must be the same" with pytest.raises(TypeError, match=msg): 
union_categoricals([c1, c2], ignore_order=False) @@ -183,8 +191,7 @@ def test_union_categoricals_ignore_order(self): tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=False) - exp = Categorical([1, 2, 3, 1, 2, 3], - categories=[1, 2, 3], ordered=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) @@ -201,8 +208,7 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([c2, c1], ignore_order=True, - sort_categories=True) + res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) @@ -221,41 +227,38 @@ def test_union_categoricals_ignore_order(self): def test_union_categoricals_sort(self): # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'x', 'y', 'z']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"] + ) tm.assert_categorical_equal(result, expected) # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) - c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) + c1 = Categorical(["a", "b"], categories=["c", "a", "b"]) + c2 = Categorical(["b", "c"], categories=["c", "a", "b"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['b', 'x']) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) @@ -270,41 +273,39 @@ def test_union_categoricals_sort(self): expected = Categorical([]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['b', 'a'], categories=['b', 
'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) with pytest.raises(TypeError): union_categoricals([c1, c2], sort_categories=True) def test_union_categoricals_sort_false(self): # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['b', 'a', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['x', 'b']) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) @@ -319,19 +320,20 @@ def test_union_categoricals_sort_false(self): expected = Categorical([]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['b', 'a', 'a', 'c'], - categories=['b', 'a', 'c'], ordered=True) + expected = Categorical( + ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_union_categorical_unwrap(self): # GH 14173 - c1 = Categorical(['a', 'b']) - c2 = pd.Series(['b', 'c'], dtype='category') + c1 = Categorical(["a", "b"]) + c2 = pd.Series(["b", "c"], dtype="category") result = union_categoricals([c1, c2]) - expected = Categorical(['a', 'b', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"]) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) @@ -343,4 +345,4 @@ def test_union_categorical_unwrap(self): tm.assert_categorical_equal(result, expected) with 
pytest.raises(TypeError): - union_categoricals([c1, ['a', 'b', 'c']]) + union_categoricals([c1, ["a", "b", "c"]]) diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 92a3bb9e29219c..60c6d7ec3017b4 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -7,11 +7,10 @@ class TestCartesianProduct: - def test_simple(self): - x, y = list('ABC'), [1, 22] + x, y = list("ABC"), [1, 22] result1, result2 = cartesian_product([x, y]) - expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C']) + expected1 = np.array(["A", "A", "B", "B", "C", "C"]) expected2 = np.array([1, 22, 1, 22, 1, 22]) tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2) @@ -19,7 +18,7 @@ def test_simple(self): def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent - x = date_range('2000-01-01', periods=2) + x = date_range("2000-01-01", periods=2) result1, result2 = [Index(y).day for y in cartesian_product([x, x])] expected1 = Index([1, 1, 2, 2]) expected2 = Index([1, 2, 1, 2]) @@ -29,7 +28,7 @@ def test_datetimeindex(self): def test_empty(self): # product of empty factors X = [[], [0, 1], []] - Y = [[], [], ['a', 'b', 'c']] + Y = [[], [], ["a", "b", "c"]] for x, y in zip(X, Y): expected1 = np.array([], dtype=np.asarray(x).dtype) expected2 = np.array([], dtype=np.asarray(y).dtype) @@ -42,10 +41,9 @@ def test_empty(self): expected = [] assert result == expected - @pytest.mark.parametrize("X", [ - 1, [1], [1, 2], [[1], 2], - 'a', ['a'], ['a', 'b'], [['a'], 'b'] - ]) + @pytest.mark.parametrize( + "X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] + ) def test_invalid_input(self, X): msg = "Input must be a list-like of list-likes" diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index 66452443187765..e4987e4483fd94 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -11,9 +11,8 @@ def interval(): class TestInterval: - def test_properties(self, interval): - assert interval.closed == 'right' + assert interval.closed == "right" assert interval.left == 0 assert interval.right == 1 assert interval.mid == 0.5 @@ -22,7 +21,7 @@ def test_repr(self, interval): assert repr(interval) == "Interval(0, 1, closed='right')" assert str(interval) == "(0, 1]" - interval_left = Interval(0, 1, closed='left') + interval_left = Interval(0, 1, closed="left") assert repr(interval_left) == "Interval(0, 1, closed='left')" assert str(interval_left) == "[0, 1)" @@ -35,22 +34,22 @@ def test_contains(self, interval): with pytest.raises(TypeError, match=msg): interval in interval - interval_both = Interval(0, 1, closed='both') + interval_both = Interval(0, 1, closed="both") assert 0 in interval_both assert 1 in interval_both - interval_neither = Interval(0, 1, closed='neither') + interval_neither = Interval(0, 1, closed="neither") assert 0 not in interval_neither assert 0.5 in interval_neither assert 1 not in interval_neither def test_equal(self): - assert Interval(0, 1) == Interval(0, 1, closed='right') - assert Interval(0, 1) != Interval(0, 1, closed='left') + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, closed="left") assert Interval(0, 1) != 0 def test_comparison(self): - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): 
Interval(0, 1) < 2 assert Interval(0, 1) < Interval(1, 2) @@ -64,29 +63,37 @@ def test_hash(self, interval): # should not raise hash(interval) - @pytest.mark.parametrize('left, right, expected', [ - (0, 5, 5), - (-2, 5.5, 7.5), - (10, 10, 0), - (10, np.inf, np.inf), - (-np.inf, -5, np.inf), - (-np.inf, np.inf, np.inf), - (Timedelta('0 days'), Timedelta('5 days'), Timedelta('5 days')), - (Timedelta('10 days'), Timedelta('10 days'), Timedelta('0 days')), - (Timedelta('1H10M'), Timedelta('5H5M'), Timedelta('3H55M')), - (Timedelta('5S'), Timedelta('1H'), Timedelta('59M55S'))]) + @pytest.mark.parametrize( + "left, right, expected", + [ + (0, 5, 5), + (-2, 5.5, 7.5), + (10, 10, 0), + (10, np.inf, np.inf), + (-np.inf, -5, np.inf), + (-np.inf, np.inf, np.inf), + (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), + (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), + (Timedelta("1H10M"), Timedelta("5H5M"), Timedelta("3H55M")), + (Timedelta("5S"), Timedelta("1H"), Timedelta("59M55S")), + ], + ) def test_length(self, left, right, expected): # GH 18789 iv = Interval(left, right) result = iv.length assert result == expected - @pytest.mark.parametrize('left, right, expected', [ - ('2017-01-01', '2017-01-06', '5 days'), - ('2017-01-01', '2017-01-01 12:00:00', '12 hours'), - ('2017-01-01 12:00', '2017-01-01 12:00:00', '0 days'), - ('2017-01-01 12:01', '2017-01-05 17:31:00', '4 days 5 hours 30 min')]) - @pytest.mark.parametrize('tz', (None, 'UTC', 'CET', 'US/Eastern')) + @pytest.mark.parametrize( + "left, right, expected", + [ + ("2017-01-01", "2017-01-06", "5 days"), + ("2017-01-01", "2017-01-01 12:00:00", "12 hours"), + ("2017-01-01 12:00", "2017-01-01 12:00:00", "0 days"), + ("2017-01-01 12:01", "2017-01-05 17:31:00", "4 days 5 hours 30 min"), + ], + ) + @pytest.mark.parametrize("tz", (None, "UTC", "CET", "US/Eastern")) def test_length_timestamp(self, tz, left, right, expected): # GH 18789 iv = Interval(Timestamp(left, tz=tz), Timestamp(right, tz=tz)) @@ -94,12 +101,18 @@ def test_length_timestamp(self, tz, left, right, expected): expected = Timedelta(expected) assert result == expected - @pytest.mark.parametrize('left, right', [ - (0, 1), - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timestamp('2018-01-02')), - (Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-02', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + ), + ], + ) def test_is_empty(self, left, right, closed): # GH27219 # non-empty always return False @@ -109,16 +122,19 @@ def test_is_empty(self, left, right, closed): # same endpoint is empty except when closed='both' (contains one point) iv = Interval(left, left, closed) result = iv.is_empty - expected = closed != 'both' + expected = closed != "both" assert result is expected - @pytest.mark.parametrize('left, right', [ - ('a', 'z'), - (('a', 'b'), ('c', 'd')), - (list('AB'), list('ab')), - (Interval(0, 1), Interval(1, 2)), - (Period('2018Q1', freq='Q'), Period('2018Q1', freq='Q')) - ]) + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) def test_construct_errors(self, left, right): # GH 23013 msg = "Only numeric, Timestamp and 
Timedelta endpoints are allowed" @@ -144,7 +160,7 @@ def test_math_add(self, closed): interval + interval with pytest.raises(TypeError, match=msg): - interval + 'foo' + interval + "foo" def test_math_sub(self, closed): interval = Interval(0, 1, closed=closed) @@ -162,7 +178,7 @@ def test_math_sub(self, closed): interval - interval with pytest.raises(TypeError, match=msg): - interval - 'foo' + interval - "foo" def test_math_mult(self, closed): interval = Interval(0, 1, closed=closed) @@ -184,7 +200,7 @@ def test_math_mult(self, closed): msg = r"can\'t multiply sequence by non-int" with pytest.raises(TypeError, match=msg): - interval * 'foo' + interval * "foo" def test_math_div(self, closed): interval = Interval(0, 1, closed=closed) @@ -202,7 +218,7 @@ def test_math_div(self, closed): interval / interval with pytest.raises(TypeError, match=msg): - interval / 'foo' + interval / "foo" def test_math_floordiv(self, closed): interval = Interval(1, 2, closed=closed) @@ -220,23 +236,24 @@ def test_math_floordiv(self, closed): interval // interval with pytest.raises(TypeError, match=msg): - interval // 'foo' + interval // "foo" def test_constructor_errors(self): msg = "invalid option for 'closed': foo" with pytest.raises(ValueError, match=msg): - Interval(0, 1, closed='foo') + Interval(0, 1, closed="foo") - msg = 'left side of interval must be <= right side' + msg = "left side of interval must be <= right side" with pytest.raises(ValueError, match=msg): Interval(1, 0) - @pytest.mark.parametrize('tz_left, tz_right', [ - (None, 'UTC'), ('UTC', None), ('UTC', 'US/Eastern')]) + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) def test_constructor_errors_tz(self, tz_left, tz_right): # GH 18538 - left = Timestamp('2017-01-01', tz=tz_left) - right = Timestamp('2017-01-02', tz=tz_right) + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) error = TypeError if com._any_none(tz_left, tz_right) else ValueError with pytest.raises(error): Interval(left, right) diff --git a/pandas/tests/scalar/interval/test_ops.py b/pandas/tests/scalar/interval/test_ops.py index 963fe14d46dcd5..f560c42617260b 100644 --- a/pandas/tests/scalar/interval/test_ops.py +++ b/pandas/tests/scalar/interval/test_ops.py @@ -4,10 +4,14 @@ from pandas import Interval, Timedelta, Timestamp -@pytest.fixture(params=[ - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timedelta('1 day')), - (0, 1)], ids=lambda x: type(x[0]).__name__) +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) def start_shift(request): """ Fixture for generating intervals of types from a start value and a shift @@ -17,7 +21,6 @@ def start_shift(request): class TestOverlaps: - def test_overlaps_self(self, start_shift, closed): start, shift = start_shift interval = Interval(start, start + shift, closed) @@ -49,12 +52,15 @@ def test_overlaps_endpoint(self, start_shift, closed, other_closed): expected = interval1.closed_right and interval2.closed_left assert result == expected - @pytest.mark.parametrize('other', [ - 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) def test_overlaps_invalid_type(self, other): interval = Interval(0, 1) - msg = 
'`other` must be an Interval, got {other}'.format( - other=type(other).__name__) + msg = "`other` must be an Interval, got {other}".format( + other=type(other).__name__ + ) with pytest.raises(TypeError, match=msg): interval.overlaps(other) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index c6f649aeba12f8..ee0ff87e31aea3 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -1,7 +1,6 @@ import pytest -from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, _period_code_map) +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map from pandas.errors import OutOfBoundsDatetime from pandas import Period, offsets @@ -9,10 +8,11 @@ class TestFreqConversion: """Test frequency conversion of date objects""" - @pytest.mark.parametrize('freq', ['A', 'Q', 'M', 'W', 'B', 'D']) + + @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) def test_asfreq_near_zero(self, freq): # GH#19643, GH#19650 - per = Period('0001-01-01', freq=freq) + per = Period("0001-01-01", freq=freq) tup1 = (per.year, per.hour, per.day) prev = per - 1 @@ -22,673 +22,709 @@ def test_asfreq_near_zero(self, freq): def test_asfreq_near_zero_weekly(self): # GH#19834 - per1 = Period('0001-01-01', 'D') + 6 - per2 = Period('0001-01-01', 'D') - 6 - week1 = per1.asfreq('W') - week2 = per2.asfreq('W') + per1 = Period("0001-01-01", "D") + 6 + per2 = Period("0001-01-01", "D") - 6 + week1 = per1.asfreq("W") + week2 = per2.asfreq("W") assert week1 != week2 - assert week1.asfreq('D', 'E') >= per1 - assert week2.asfreq('D', 'S') <= per2 + assert week1.asfreq("D", "E") >= per1 + assert week2.asfreq("D", "S") <= per2 - @pytest.mark.xfail(reason='GH#19643 period_helper asfreq functions fail ' - 'to check for overflows') + @pytest.mark.xfail( + reason="GH#19643 period_helper asfreq functions fail " "to check for overflows" + ) def test_to_timestamp_out_of_bounds(self): # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848') - per = Period('0001-01-01', freq='B') + per = Period("0001-01-01", freq="B") with pytest.raises(OutOfBoundsDatetime): per.to_timestamp() def test_asfreq_corner(self): - val = Period(freq='A', year=2007) - result1 = val.asfreq('5t') - result2 = val.asfreq('t') - expected = Period('2007-12-31 23:59', freq='t') + val = Period(freq="A", year=2007) + result1 = val.asfreq("5t") + result2 = val.asfreq("t") + expected = Period("2007-12-31 23:59", freq="t") assert result1.ordinal == expected.ordinal - assert result1.freqstr == '5T' + assert result1.freqstr == "5T" assert result2.ordinal == expected.ordinal - assert result2.freqstr == 'T' + assert result2.freqstr == "T" def test_conv_annual(self): # frequency conversion tests: from Annual Frequency - ival_A = Period(freq='A', year=2007) + ival_A = Period(freq="A", year=2007) ival_AJAN = Period(freq="A-JAN", year=2007) ival_AJUN = Period(freq="A-JUN", year=2007) ival_ANOV = Period(freq="A-NOV", year=2007) - ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) - ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) - ival_A_to_M_start = Period(freq='M', year=2007, month=1) - ival_A_to_M_end = Period(freq='M', year=2007, month=12) - ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) - ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) - ival_A_to_D_start = 
Period(freq='D', year=2007, month=1, day=1) - ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - - ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) - ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) - ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - - assert ival_A.asfreq('Q', 'S') == ival_A_to_Q_start - assert ival_A.asfreq('Q', 'e') == ival_A_to_Q_end - assert ival_A.asfreq('M', 's') == ival_A_to_M_start - assert ival_A.asfreq('M', 'E') == ival_A_to_M_end - assert ival_A.asfreq('W', 'S') == ival_A_to_W_start - assert ival_A.asfreq('W', 'E') == ival_A_to_W_end - assert ival_A.asfreq('B', 'S') == ival_A_to_B_start - assert ival_A.asfreq('B', 'E') == ival_A_to_B_end - assert ival_A.asfreq('D', 'S') == ival_A_to_D_start - assert ival_A.asfreq('D', 'E') == ival_A_to_D_end - assert ival_A.asfreq('H', 'S') == ival_A_to_H_start - assert ival_A.asfreq('H', 'E') == ival_A_to_H_end - assert ival_A.asfreq('min', 'S') == ival_A_to_T_start - assert ival_A.asfreq('min', 'E') == ival_A_to_T_end - assert ival_A.asfreq('T', 'S') == ival_A_to_T_start - assert ival_A.asfreq('T', 'E') == ival_A_to_T_end - assert ival_A.asfreq('S', 'S') == ival_A_to_S_start - assert ival_A.asfreq('S', 'E') == ival_A_to_S_end - - assert ival_AJAN.asfreq('D', 'S') == ival_AJAN_to_D_start - assert ival_AJAN.asfreq('D', 'E') == ival_AJAN_to_D_end - - assert ival_AJUN.asfreq('D', 'S') == ival_AJUN_to_D_start - assert ival_AJUN.asfreq('D', 'E') == ival_AJUN_to_D_end - - assert ival_ANOV.asfreq('D', 'S') == ival_ANOV_to_D_start - assert ival_ANOV.asfreq('D', 'E') == ival_ANOV_to_D_end - - assert ival_A.asfreq('A') == ival_A + ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) + ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) + ival_A_to_M_start = Period(freq="M", year=2007, month=1) + ival_A_to_M_end = Period(freq="M", year=2007, month=12) + ival_A_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq="W", year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_A_to_T_end = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_A_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_A_to_S_end = Period( + freq="S", year=2007, month=12, day=31, hour=23, 
minute=59, second=59 + ) + + ival_AJAN_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq="D", year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq="D", year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq="D", year=2006, month=12, day=1) + + assert ival_A.asfreq("Q", "S") == ival_A_to_Q_start + assert ival_A.asfreq("Q", "e") == ival_A_to_Q_end + assert ival_A.asfreq("M", "s") == ival_A_to_M_start + assert ival_A.asfreq("M", "E") == ival_A_to_M_end + assert ival_A.asfreq("W", "S") == ival_A_to_W_start + assert ival_A.asfreq("W", "E") == ival_A_to_W_end + assert ival_A.asfreq("B", "S") == ival_A_to_B_start + assert ival_A.asfreq("B", "E") == ival_A_to_B_end + assert ival_A.asfreq("D", "S") == ival_A_to_D_start + assert ival_A.asfreq("D", "E") == ival_A_to_D_end + assert ival_A.asfreq("H", "S") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end + assert ival_A.asfreq("min", "S") == ival_A_to_T_start + assert ival_A.asfreq("min", "E") == ival_A_to_T_end + assert ival_A.asfreq("T", "S") == ival_A_to_T_start + assert ival_A.asfreq("T", "E") == ival_A_to_T_end + assert ival_A.asfreq("S", "S") == ival_A_to_S_start + assert ival_A.asfreq("S", "E") == ival_A_to_S_end + + assert ival_AJAN.asfreq("D", "S") == ival_AJAN_to_D_start + assert ival_AJAN.asfreq("D", "E") == ival_AJAN_to_D_end + + assert ival_AJUN.asfreq("D", "S") == ival_AJUN_to_D_start + assert ival_AJUN.asfreq("D", "E") == ival_AJUN_to_D_end + + assert ival_ANOV.asfreq("D", "S") == ival_ANOV_to_D_start + assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end + + assert ival_A.asfreq("A") == ival_A def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency - ival_Q = Period(freq='Q', year=2007, quarter=1) - ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + ival_Q = Period(freq="Q", year=2007, quarter=1) + ival_Q_end_of_year = Period(freq="Q", year=2007, quarter=4) ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - ival_Q_to_A = Period(freq='A', year=2007) - ival_Q_to_M_start = Period(freq='M', year=2007, month=1) - ival_Q_to_M_end = Period(freq='M', year=2007, month=3) - ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) - ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) - ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) - ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, - minute=59, second=59) - - ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) - - ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_QEJUN_to_D_end = Period(freq='D', 
year=2006, month=9, day=30) - - assert ival_Q.asfreq('A') == ival_Q_to_A - assert ival_Q_end_of_year.asfreq('A') == ival_Q_to_A - - assert ival_Q.asfreq('M', 'S') == ival_Q_to_M_start - assert ival_Q.asfreq('M', 'E') == ival_Q_to_M_end - assert ival_Q.asfreq('W', 'S') == ival_Q_to_W_start - assert ival_Q.asfreq('W', 'E') == ival_Q_to_W_end - assert ival_Q.asfreq('B', 'S') == ival_Q_to_B_start - assert ival_Q.asfreq('B', 'E') == ival_Q_to_B_end - assert ival_Q.asfreq('D', 'S') == ival_Q_to_D_start - assert ival_Q.asfreq('D', 'E') == ival_Q_to_D_end - assert ival_Q.asfreq('H', 'S') == ival_Q_to_H_start - assert ival_Q.asfreq('H', 'E') == ival_Q_to_H_end - assert ival_Q.asfreq('Min', 'S') == ival_Q_to_T_start - assert ival_Q.asfreq('Min', 'E') == ival_Q_to_T_end - assert ival_Q.asfreq('S', 'S') == ival_Q_to_S_start - assert ival_Q.asfreq('S', 'E') == ival_Q_to_S_end - - assert ival_QEJAN.asfreq('D', 'S') == ival_QEJAN_to_D_start - assert ival_QEJAN.asfreq('D', 'E') == ival_QEJAN_to_D_end - assert ival_QEJUN.asfreq('D', 'S') == ival_QEJUN_to_D_start - assert ival_QEJUN.asfreq('D', 'E') == ival_QEJUN_to_D_end - - assert ival_Q.asfreq('Q') == ival_Q + ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_M_start = Period(freq="M", year=2007, month=1) + ival_Q_to_M_end = Period(freq="M", year=2007, month=3) + ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq="W", year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_Q_to_T_end = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_Q_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_Q_to_S_end = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + + ival_QEJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq="D", year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) + + assert ival_Q.asfreq("A") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + + assert ival_Q.asfreq("M", "S") == ival_Q_to_M_start + assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end + assert ival_Q.asfreq("W", "S") == ival_Q_to_W_start + assert ival_Q.asfreq("W", "E") == ival_Q_to_W_end + assert ival_Q.asfreq("B", "S") == ival_Q_to_B_start + assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end + assert ival_Q.asfreq("D", "S") == ival_Q_to_D_start + assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end + assert ival_Q.asfreq("H", "S") == ival_Q_to_H_start + assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("Min", "S") == ival_Q_to_T_start + assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end + assert ival_Q.asfreq("S", "S") == ival_Q_to_S_start + assert ival_Q.asfreq("S", "E") == ival_Q_to_S_end + + assert ival_QEJAN.asfreq("D", "S") == ival_QEJAN_to_D_start + assert ival_QEJAN.asfreq("D", "E") == ival_QEJAN_to_D_end + assert ival_QEJUN.asfreq("D", "S") == 
ival_QEJUN_to_D_start + assert ival_QEJUN.asfreq("D", "E") == ival_QEJUN_to_D_end + + assert ival_Q.asfreq("Q") == ival_Q def test_conv_monthly(self): # frequency conversion tests: from Monthly Frequency - ival_M = Period(freq='M', year=2007, month=1) - ival_M_end_of_year = Period(freq='M', year=2007, month=12) - ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) - ival_M_to_A = Period(freq='A', year=2007) - ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) - ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) - ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) - ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, - minute=59, second=59) - - assert ival_M.asfreq('A') == ival_M_to_A - assert ival_M_end_of_year.asfreq('A') == ival_M_to_A - assert ival_M.asfreq('Q') == ival_M_to_Q - assert ival_M_end_of_quarter.asfreq('Q') == ival_M_to_Q - - assert ival_M.asfreq('W', 'S') == ival_M_to_W_start - assert ival_M.asfreq('W', 'E') == ival_M_to_W_end - assert ival_M.asfreq('B', 'S') == ival_M_to_B_start - assert ival_M.asfreq('B', 'E') == ival_M_to_B_end - assert ival_M.asfreq('D', 'S') == ival_M_to_D_start - assert ival_M.asfreq('D', 'E') == ival_M_to_D_end - assert ival_M.asfreq('H', 'S') == ival_M_to_H_start - assert ival_M.asfreq('H', 'E') == ival_M_to_H_end - assert ival_M.asfreq('Min', 'S') == ival_M_to_T_start - assert ival_M.asfreq('Min', 'E') == ival_M_to_T_end - assert ival_M.asfreq('S', 'S') == ival_M_to_S_start - assert ival_M.asfreq('S', 'E') == ival_M_to_S_end - - assert ival_M.asfreq('M') == ival_M + ival_M = Period(freq="M", year=2007, month=1) + ival_M_end_of_year = Period(freq="M", year=2007, month=12) + ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) + ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_M_to_T_end = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_M_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_M_to_S_end = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + + assert ival_M.asfreq("A") == ival_M_to_A + assert ival_M_end_of_year.asfreq("A") 
== ival_M_to_A + assert ival_M.asfreq("Q") == ival_M_to_Q + assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q + + assert ival_M.asfreq("W", "S") == ival_M_to_W_start + assert ival_M.asfreq("W", "E") == ival_M_to_W_end + assert ival_M.asfreq("B", "S") == ival_M_to_B_start + assert ival_M.asfreq("B", "E") == ival_M_to_B_end + assert ival_M.asfreq("D", "S") == ival_M_to_D_start + assert ival_M.asfreq("D", "E") == ival_M_to_D_end + assert ival_M.asfreq("H", "S") == ival_M_to_H_start + assert ival_M.asfreq("H", "E") == ival_M_to_H_end + assert ival_M.asfreq("Min", "S") == ival_M_to_T_start + assert ival_M.asfreq("Min", "E") == ival_M_to_T_end + assert ival_M.asfreq("S", "S") == ival_M_to_S_start + assert ival_M.asfreq("S", "E") == ival_M_to_S_end + + assert ival_M.asfreq("M") == ival_M def test_conv_weekly(self): # frequency conversion tests: from Weekly Frequency - ival_W = Period(freq='W', year=2007, month=1, day=1) - - ival_WSUN = Period(freq='W', year=2007, month=1, day=7) - ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) - ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) - ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) - ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) - ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) - ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) - ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) - ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) + ival_W = Period(freq="W", year=2007, month=1, day=1) + + ival_WSUN = Period(freq="W", year=2007, month=1, day=7) + ival_WSAT = Period(freq="W-SAT", year=2007, month=1, day=6) + ival_WFRI = Period(freq="W-FRI", year=2007, month=1, day=5) + ival_WTHU = Period(freq="W-THU", year=2007, month=1, day=4) + ival_WWED = Period(freq="W-WED", year=2007, month=1, day=3) + ival_WTUE = Period(freq="W-TUE", year=2007, month=1, day=2) + ival_WMON = Period(freq="W-MON", year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq="D", year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq="D", 
year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq="D", year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq="D", year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq="D", year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq="D", year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq="D", year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq="D", year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq="D", year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq="D", year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq="D", year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq="D", year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) + ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_W_to_M = Period(freq="M", year=2007, month=1) + + if Period(freq="D", year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq="A", year=2007) else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) + ival_W_to_A_end_of_year = Period(freq="A", year=2008) - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) + if Period(freq="D", year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=2) - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + if Period(freq="D", year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=1) else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - assert ival_W.asfreq('A') == ival_W_to_A - assert ival_W_end_of_year.asfreq('A') == ival_W_to_A_end_of_year - - assert ival_W.asfreq('Q') == ival_W_to_Q - assert ival_W_end_of_quarter.asfreq('Q') == ival_W_to_Q_end_of_quarter - - assert ival_W.asfreq('M') == ival_W_to_M - assert ival_W_end_of_month.asfreq('M') == ival_W_to_M_end_of_month - - assert ival_W.asfreq('B', 'S') == ival_W_to_B_start - assert ival_W.asfreq('B', 'E') == ival_W_to_B_end - - assert ival_W.asfreq('D', 'S') == ival_W_to_D_start - assert ival_W.asfreq('D', 'E') == ival_W_to_D_end - - assert ival_WSUN.asfreq('D', 'S') == ival_WSUN_to_D_start - assert ival_WSUN.asfreq('D', 'E') == ival_WSUN_to_D_end - assert 
ival_WSAT.asfreq('D', 'S') == ival_WSAT_to_D_start - assert ival_WSAT.asfreq('D', 'E') == ival_WSAT_to_D_end - assert ival_WFRI.asfreq('D', 'S') == ival_WFRI_to_D_start - assert ival_WFRI.asfreq('D', 'E') == ival_WFRI_to_D_end - assert ival_WTHU.asfreq('D', 'S') == ival_WTHU_to_D_start - assert ival_WTHU.asfreq('D', 'E') == ival_WTHU_to_D_end - assert ival_WWED.asfreq('D', 'S') == ival_WWED_to_D_start - assert ival_WWED.asfreq('D', 'E') == ival_WWED_to_D_end - assert ival_WTUE.asfreq('D', 'S') == ival_WTUE_to_D_start - assert ival_WTUE.asfreq('D', 'E') == ival_WTUE_to_D_end - assert ival_WMON.asfreq('D', 'S') == ival_WMON_to_D_start - assert ival_WMON.asfreq('D', 'E') == ival_WMON_to_D_end - - assert ival_W.asfreq('H', 'S') == ival_W_to_H_start - assert ival_W.asfreq('H', 'E') == ival_W_to_H_end - assert ival_W.asfreq('Min', 'S') == ival_W_to_T_start - assert ival_W.asfreq('Min', 'E') == ival_W_to_T_end - assert ival_W.asfreq('S', 'S') == ival_W_to_S_start - assert ival_W.asfreq('S', 'E') == ival_W_to_S_end - - assert ival_W.asfreq('W') == ival_W + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=2) + + ival_W_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_W_to_T_end = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_W_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_W_to_S_end = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + + assert ival_W.asfreq("A") == ival_W_to_A + assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + + assert ival_W.asfreq("Q") == ival_W_to_Q + assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter + + assert ival_W.asfreq("M") == ival_W_to_M + assert ival_W_end_of_month.asfreq("M") == ival_W_to_M_end_of_month + + assert ival_W.asfreq("B", "S") == ival_W_to_B_start + assert ival_W.asfreq("B", "E") == ival_W_to_B_end + + assert ival_W.asfreq("D", "S") == ival_W_to_D_start + assert ival_W.asfreq("D", "E") == ival_W_to_D_end + + assert ival_WSUN.asfreq("D", "S") == ival_WSUN_to_D_start + assert ival_WSUN.asfreq("D", "E") == ival_WSUN_to_D_end + assert ival_WSAT.asfreq("D", "S") == ival_WSAT_to_D_start + assert ival_WSAT.asfreq("D", "E") == ival_WSAT_to_D_end + assert ival_WFRI.asfreq("D", "S") == ival_WFRI_to_D_start + assert ival_WFRI.asfreq("D", "E") == ival_WFRI_to_D_end + assert ival_WTHU.asfreq("D", "S") == ival_WTHU_to_D_start + assert ival_WTHU.asfreq("D", "E") == ival_WTHU_to_D_end + assert ival_WWED.asfreq("D", "S") == ival_WWED_to_D_start + assert ival_WWED.asfreq("D", "E") == ival_WWED_to_D_end + assert ival_WTUE.asfreq("D", "S") == ival_WTUE_to_D_start + assert ival_WTUE.asfreq("D", "E") == ival_WTUE_to_D_end + assert ival_WMON.asfreq("D", "S") == ival_WMON_to_D_start + assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end + + assert ival_W.asfreq("H", "S") == ival_W_to_H_start + assert ival_W.asfreq("H", "E") == ival_W_to_H_end + assert ival_W.asfreq("Min", "S") == ival_W_to_T_start + assert ival_W.asfreq("Min", "E") == ival_W_to_T_end + assert 
ival_W.asfreq("S", "S") == ival_W_to_S_start + assert ival_W.asfreq("S", "E") == ival_W_to_S_end + + assert ival_W.asfreq("W") == ival_W msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - ival_W.asfreq('WK') + ival_W.asfreq("WK") def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - Period(freq='WK', year=2007, month=1, day=1) + Period(freq="WK", year=2007, month=1, day=1) with pytest.raises(ValueError, match=msg): - Period(freq='WK-SAT', year=2007, month=1, day=6) + Period(freq="WK-SAT", year=2007, month=1, day=6) with pytest.raises(ValueError, match=msg): - Period(freq='WK-FRI', year=2007, month=1, day=5) + Period(freq="WK-FRI", year=2007, month=1, day=5) with pytest.raises(ValueError, match=msg): - Period(freq='WK-THU', year=2007, month=1, day=4) + Period(freq="WK-THU", year=2007, month=1, day=4) with pytest.raises(ValueError, match=msg): - Period(freq='WK-WED', year=2007, month=1, day=3) + Period(freq="WK-WED", year=2007, month=1, day=3) with pytest.raises(ValueError, match=msg): - Period(freq='WK-TUE', year=2007, month=1, day=2) + Period(freq="WK-TUE", year=2007, month=1, day=2) with pytest.raises(ValueError, match=msg): - Period(freq='WK-MON', year=2007, month=1, day=1) + Period(freq="WK-MON", year=2007, month=1, day=1) def test_conv_business(self): # frequency conversion tests: from Business Frequency" - ival_B = Period(freq='B', year=2007, month=1, day=1) - ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) - ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) - ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) - ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) - - ival_B_to_A = Period(freq='A', year=2007) - ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_B_to_M = Period(freq='M', year=2007, month=1) - ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - assert ival_B.asfreq('A') == ival_B_to_A - assert ival_B_end_of_year.asfreq('A') == ival_B_to_A - assert ival_B.asfreq('Q') == ival_B_to_Q - assert ival_B_end_of_quarter.asfreq('Q') == ival_B_to_Q - assert ival_B.asfreq('M') == ival_B_to_M - assert ival_B_end_of_month.asfreq('M') == ival_B_to_M - assert ival_B.asfreq('W') == ival_B_to_W - assert ival_B_end_of_week.asfreq('W') == ival_B_to_W - - assert ival_B.asfreq('D') == ival_B_to_D - - assert ival_B.asfreq('H', 'S') == ival_B_to_H_start - assert ival_B.asfreq('H', 'E') == ival_B_to_H_end - assert ival_B.asfreq('Min', 'S') == ival_B_to_T_start - assert ival_B.asfreq('Min', 'E') == ival_B_to_T_end - assert ival_B.asfreq('S', 'S') == ival_B_to_S_start - assert ival_B.asfreq('S', 'E') == ival_B_to_S_end - - assert ival_B.asfreq('B') == ival_B + ival_B = Period(freq="B", year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq="B", year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq="B", 
year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) + + ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_B_to_M = Period(freq="M", year=2007, month=1) + ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_B_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_B_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_B_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_B_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_B.asfreq("A") == ival_B_to_A + assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Q") == ival_B_to_Q + assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q + assert ival_B.asfreq("M") == ival_B_to_M + assert ival_B_end_of_month.asfreq("M") == ival_B_to_M + assert ival_B.asfreq("W") == ival_B_to_W + assert ival_B_end_of_week.asfreq("W") == ival_B_to_W + + assert ival_B.asfreq("D") == ival_B_to_D + + assert ival_B.asfreq("H", "S") == ival_B_to_H_start + assert ival_B.asfreq("H", "E") == ival_B_to_H_end + assert ival_B.asfreq("Min", "S") == ival_B_to_T_start + assert ival_B.asfreq("Min", "E") == ival_B_to_T_end + assert ival_B.asfreq("S", "S") == ival_B_to_S_start + assert ival_B.asfreq("S", "E") == ival_B_to_S_end + + assert ival_B.asfreq("B") == ival_B def test_conv_daily(self): # frequency conversion tests: from Business Frequency" - ival_D = Period(freq='D', year=2007, month=1, day=1) - ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) - ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) - ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) - ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + ival_D = Period(freq="D", year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq="D", year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq="D", year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq="D", year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq="D", year=2007, month=1, day=7) - ival_D_friday = Period(freq='D', year=2007, month=1, day=5) - ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) - ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + ival_D_friday = Period(freq="D", year=2007, month=1, day=5) + ival_D_saturday = Period(freq="D", year=2007, month=1, day=6) + ival_D_sunday = Period(freq="D", year=2007, month=1, day=7) # TODO: unused? 
# ival_D_monday = Period(freq='D', year=2007, month=1, day=8) - ival_B_friday = Period(freq='B', year=2007, month=1, day=5) - ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + ival_B_friday = Period(freq="B", year=2007, month=1, day=5) + ival_B_monday = Period(freq="B", year=2007, month=1, day=8) - ival_D_to_A = Period(freq='A', year=2007) + ival_D_to_A = Period(freq="A", year=2007) - ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) - ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) - ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) - ival_D_to_M = Period(freq='M', year=2007, month=1) - ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) - - ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - assert ival_D.asfreq('A') == ival_D_to_A - - assert ival_D_end_of_quarter.asfreq('A-JAN') == ival_Deoq_to_AJAN - assert ival_D_end_of_quarter.asfreq('A-JUN') == ival_Deoq_to_AJUN - assert ival_D_end_of_quarter.asfreq('A-DEC') == ival_Deoq_to_ADEC - - assert ival_D_end_of_year.asfreq('A') == ival_D_to_A - assert ival_D_end_of_quarter.asfreq('Q') == ival_D_to_QEDEC + ival_D_to_M = Period(freq="M", year=2007, month=1) + ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_D_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_D_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_D_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_D.asfreq("A") == ival_D_to_A + + assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + + assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN assert ival_D.asfreq("Q-DEC") == ival_D_to_QEDEC - assert ival_D.asfreq('M') == ival_D_to_M - assert ival_D_end_of_month.asfreq('M') == ival_D_to_M - assert ival_D.asfreq('W') == ival_D_to_W - assert ival_D_end_of_week.asfreq('W') == ival_D_to_W - - assert ival_D_friday.asfreq('B') == ival_B_friday - assert ival_D_saturday.asfreq('B', 'S') == ival_B_friday - assert ival_D_saturday.asfreq('B', 'E') == ival_B_monday - assert ival_D_sunday.asfreq('B', 'S') == ival_B_friday - assert ival_D_sunday.asfreq('B', 'E') == ival_B_monday - 
- assert ival_D.asfreq('H', 'S') == ival_D_to_H_start - assert ival_D.asfreq('H', 'E') == ival_D_to_H_end - assert ival_D.asfreq('Min', 'S') == ival_D_to_T_start - assert ival_D.asfreq('Min', 'E') == ival_D_to_T_end - assert ival_D.asfreq('S', 'S') == ival_D_to_S_start - assert ival_D.asfreq('S', 'E') == ival_D_to_S_end - - assert ival_D.asfreq('D') == ival_D + assert ival_D.asfreq("M") == ival_D_to_M + assert ival_D_end_of_month.asfreq("M") == ival_D_to_M + assert ival_D.asfreq("W") == ival_D_to_W + assert ival_D_end_of_week.asfreq("W") == ival_D_to_W + + assert ival_D_friday.asfreq("B") == ival_B_friday + assert ival_D_saturday.asfreq("B", "S") == ival_B_friday + assert ival_D_saturday.asfreq("B", "E") == ival_B_monday + assert ival_D_sunday.asfreq("B", "S") == ival_B_friday + assert ival_D_sunday.asfreq("B", "E") == ival_B_monday + + assert ival_D.asfreq("H", "S") == ival_D_to_H_start + assert ival_D.asfreq("H", "E") == ival_D_to_H_end + assert ival_D.asfreq("Min", "S") == ival_D_to_T_start + assert ival_D.asfreq("Min", "E") == ival_D_to_T_end + assert ival_D.asfreq("S", "S") == ival_D_to_S_start + assert ival_D.asfreq("S", "E") == ival_D_to_S_end + + assert ival_D.asfreq("D") == ival_D def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" - ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, - hour=23) - ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, - hour=23) - ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, - hour=23) - ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, - hour=23) - ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, - hour=23) - - ival_H_to_A = Period(freq='A', year=2007) - ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_H_to_M = Period(freq='M', year=2007, month=1) - ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) - - ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=59) - ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=59, second=59) - - assert ival_H.asfreq('A') == ival_H_to_A - assert ival_H_end_of_year.asfreq('A') == ival_H_to_A - assert ival_H.asfreq('Q') == ival_H_to_Q - assert ival_H_end_of_quarter.asfreq('Q') == ival_H_to_Q - assert ival_H.asfreq('M') == ival_H_to_M - assert ival_H_end_of_month.asfreq('M') == ival_H_to_M - assert ival_H.asfreq('W') == ival_H_to_W - assert ival_H_end_of_week.asfreq('W') == ival_H_to_W - assert ival_H.asfreq('D') == ival_H_to_D - assert ival_H_end_of_day.asfreq('D') == ival_H_to_D - assert ival_H.asfreq('B') == ival_H_to_B - assert ival_H_end_of_bus.asfreq('B') == ival_H_to_B - - assert ival_H.asfreq('Min', 'S') == ival_H_to_T_start - assert ival_H.asfreq('Min', 'E') == ival_H_to_T_end - assert ival_H.asfreq('S', 'S') == ival_H_to_S_start - assert ival_H.asfreq('S', 'E') == ival_H_to_S_end - - assert ival_H.asfreq('H') == ival_H + ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = 
Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + + ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_H_to_M = Period(freq="M", year=2007, month=1) + ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_H_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_H_to_B = Period(freq="B", year=2007, month=1, day=1) + + ival_H_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_H_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + ival_H_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_H_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + + assert ival_H.asfreq("A") == ival_H_to_A + assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Q") == ival_H_to_Q + assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q + assert ival_H.asfreq("M") == ival_H_to_M + assert ival_H_end_of_month.asfreq("M") == ival_H_to_M + assert ival_H.asfreq("W") == ival_H_to_W + assert ival_H_end_of_week.asfreq("W") == ival_H_to_W + assert ival_H.asfreq("D") == ival_H_to_D + assert ival_H_end_of_day.asfreq("D") == ival_H_to_D + assert ival_H.asfreq("B") == ival_H_to_B + assert ival_H_end_of_bus.asfreq("B") == ival_H_to_B + + assert ival_H.asfreq("Min", "S") == ival_H_to_T_start + assert ival_H.asfreq("Min", "E") == ival_H_to_T_end + assert ival_H.asfreq("S", "S") == ival_H_to_S_start + assert ival_H.asfreq("S", "E") == ival_H_to_S_end + + assert ival_H.asfreq("H") == ival_H def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" - ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=59) - - ival_T_to_A = Period(freq='A', year=2007) - ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_T_to_M = Period(freq='M', year=2007, month=1) - ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - - ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=59) - - assert ival_T.asfreq('A') == ival_T_to_A - assert ival_T_end_of_year.asfreq('A') == ival_T_to_A - assert ival_T.asfreq('Q') == ival_T_to_Q - assert ival_T_end_of_quarter.asfreq('Q') == ival_T_to_Q - assert ival_T.asfreq('M') 
== ival_T_to_M - assert ival_T_end_of_month.asfreq('M') == ival_T_to_M - assert ival_T.asfreq('W') == ival_T_to_W - assert ival_T_end_of_week.asfreq('W') == ival_T_to_W - assert ival_T.asfreq('D') == ival_T_to_D - assert ival_T_end_of_day.asfreq('D') == ival_T_to_D - assert ival_T.asfreq('B') == ival_T_to_B - assert ival_T_end_of_bus.asfreq('B') == ival_T_to_B - assert ival_T.asfreq('H') == ival_T_to_H - assert ival_T_end_of_hour.asfreq('H') == ival_T_to_H - - assert ival_T.asfreq('S', 'S') == ival_T_to_S_start - assert ival_T.asfreq('S', 'E') == ival_T_to_S_end - - assert ival_T.asfreq('Min') == ival_T + ival_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + ival_T_end_of_year = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_T_end_of_quarter = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_T_end_of_month = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_T_end_of_week = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_T_end_of_day = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_bus = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_hour = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + + ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_T_to_M = Period(freq="M", year=2007, month=1) + ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_T_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + assert ival_T.asfreq("A") == ival_T_to_A + assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Q") == ival_T_to_Q + assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q + assert ival_T.asfreq("M") == ival_T_to_M + assert ival_T_end_of_month.asfreq("M") == ival_T_to_M + assert ival_T.asfreq("W") == ival_T_to_W + assert ival_T_end_of_week.asfreq("W") == ival_T_to_W + assert ival_T.asfreq("D") == ival_T_to_D + assert ival_T_end_of_day.asfreq("D") == ival_T_to_D + assert ival_T.asfreq("B") == ival_T_to_B + assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B + assert ival_T.asfreq("H") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + + assert ival_T.asfreq("S", "S") == ival_T_to_S_start + assert ival_T.asfreq("S", "E") == ival_T_to_S_end + + assert ival_T.asfreq("Min") == ival_T def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" - ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, - second=0) - ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, - hour=23, minute=59, second=59) - ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, - hour=23, 
minute=59, second=59) - ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=59, second=59) - ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=0, second=59) - - ival_S_to_A = Period(freq='A', year=2007) - ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_S_to_M = Period(freq='M', year=2007, month=1) - ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - - assert ival_S.asfreq('A') == ival_S_to_A - assert ival_S_end_of_year.asfreq('A') == ival_S_to_A - assert ival_S.asfreq('Q') == ival_S_to_Q - assert ival_S_end_of_quarter.asfreq('Q') == ival_S_to_Q - assert ival_S.asfreq('M') == ival_S_to_M - assert ival_S_end_of_month.asfreq('M') == ival_S_to_M - assert ival_S.asfreq('W') == ival_S_to_W - assert ival_S_end_of_week.asfreq('W') == ival_S_to_W - assert ival_S.asfreq('D') == ival_S_to_D - assert ival_S_end_of_day.asfreq('D') == ival_S_to_D - assert ival_S.asfreq('B') == ival_S_to_B - assert ival_S_end_of_bus.asfreq('B') == ival_S_to_B - assert ival_S.asfreq('H') == ival_S_to_H - assert ival_S_end_of_hour.asfreq('H') == ival_S_to_H - assert ival_S.asfreq('Min') == ival_S_to_T - assert ival_S_end_of_minute.asfreq('Min') == ival_S_to_T - - assert ival_S.asfreq('S') == ival_S + ival_S = Period(freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0) + ival_S_end_of_year = Period( + freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_quarter = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_month = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_week = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + ival_S_end_of_day = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_bus = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_hour = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + ival_S_end_of_minute = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_S_to_M = Period(freq="M", year=2007, month=1) + ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + + assert ival_S.asfreq("A") == ival_S_to_A + assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Q") == ival_S_to_Q + assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q + assert ival_S.asfreq("M") == ival_S_to_M + assert ival_S_end_of_month.asfreq("M") == ival_S_to_M + assert ival_S.asfreq("W") == ival_S_to_W + assert ival_S_end_of_week.asfreq("W") == ival_S_to_W + assert ival_S.asfreq("D") == ival_S_to_D + assert ival_S_end_of_day.asfreq("D") == ival_S_to_D + assert ival_S.asfreq("B") == ival_S_to_B + assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B + assert 
ival_S.asfreq("H") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("Min") == ival_S_to_T + assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T + + assert ival_S.asfreq("S") == ival_S def test_asfreq_mult(self): # normal freq to mult freq - p = Period(freq='A', year=2007) + p = Period(freq="A", year=2007) # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: + for freq in ["3A", offsets.YearEnd(3)]: result = p.asfreq(freq) - expected = Period('2007', freq='3A') + expected = Period("2007", freq="3A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='3A') + for freq in ["3A", offsets.YearEnd(3)]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="3A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # mult freq to normal freq - p = Period(freq='3A', year=2007) + p = Period(freq="3A", year=2007) # ordinal will change because how=E is the default - for freq in ['A', offsets.YearEnd()]: + for freq in ["A", offsets.YearEnd()]: result = p.asfreq(freq) - expected = Period('2009', freq='A') + expected = Period("2009", freq="A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='A') + for freq in ["A", offsets.YearEnd()]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq='A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: + p = Period(freq="A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) - expected = Period('2007-12', freq='2M') + expected = Period("2007-12", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq='3A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: + p = Period(freq="3A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) - expected = Period('2009-12', freq='2M') + expected = Period("2009-12", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") assert result == expected assert result.ordinal == expected.ordinal @@ -696,24 +732,24 @@ def test_asfreq_mult(self): def test_asfreq_combined(self): # normal freq to combined freq - p = Period('2007', freq='H') + p = Period("2007", freq="H") # ordinal will not change - expected = Period('2007', freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + expected = Period("2007", freq="25H") + for freq, 
how in zip(["1D1H", "1H1D"], ["E", "S"]): result = p.asfreq(freq, how=how) assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # combined freq to normal freq - p1 = Period(freq='1D1H', year=2007) - p2 = Period(freq='1H1D', year=2007) + p1 = Period(freq="1D1H", year=2007) + p2 = Period(freq="1H1D", year=2007) # ordinal will change because how=E is the default - result1 = p1.asfreq('H') - result2 = p2.asfreq('H') - expected = Period('2007-01-02', freq='H') + result1 = p1.asfreq("H") + result2 = p2.asfreq("H") + expected = Period("2007-01-02", freq="H") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -722,9 +758,9 @@ def test_asfreq_combined(self): assert result2.freq == expected.freq # ordinal will not change - result1 = p1.asfreq('H', how='S') - result2 = p2.asfreq('H', how='S') - expected = Period('2007-01-01', freq='H') + result1 = p1.asfreq("H", how="S") + result2 = p2.asfreq("H", how="S") + expected = Period("2007-01-01", freq="H") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -735,13 +771,13 @@ def test_asfreq_combined(self): def test_asfreq_MS(self): initial = Period("2013") - assert initial.asfreq(freq="M", how="S") == Period('2013-01', 'M') + assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") with pytest.raises(ValueError, match=msg): - Period('2013-01', 'MS') + Period("2013-01", "MS") assert _period_code_map.get("MS") is None diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 2a765086af4036..34d2fa6a9194ca 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -20,60 +20,58 @@ class TestPeriodConstruction: def test_construction(self): - i1 = Period('1/1/2005', freq='M') - i2 = Period('Jan 2005') + i1 = Period("1/1/2005", freq="M") + i2 = Period("Jan 2005") assert i1 == i2 - i1 = Period('2005', freq='A') - i2 = Period('2005') - i3 = Period('2005', freq='a') + i1 = Period("2005", freq="A") + i2 = Period("2005") + i3 = Period("2005", freq="a") assert i1 == i2 assert i1 == i3 - i4 = Period('2005', freq='M') - i5 = Period('2005', freq='m') + i4 = Period("2005", freq="M") + i5 = Period("2005", freq="m") msg = r"Input has different freq=M from Period\(freq=A-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): i1 != i4 assert i4 == i5 - i1 = Period.now('Q') - i2 = Period(datetime.now(), freq='Q') - i3 = Period.now('q') + i1 = Period.now("Q") + i2 = Period(datetime.now(), freq="Q") + i3 = Period.now("q") assert i1 == i2 assert i1 == i3 - i1 = Period('1982', freq='min') - i2 = Period('1982', freq='MIN') + i1 = Period("1982", freq="min") + i2 = Period("1982", freq="MIN") assert i1 == i2 - i2 = Period('1982', freq=('Min', 1)) + i2 = Period("1982", freq=("Min", 1)) assert i1 == i2 - i1 = Period(year=2005, month=3, day=1, freq='D') - i2 = Period('3/1/2005', freq='D') + i1 = Period(year=2005, month=3, day=1, freq="D") + i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq='d') + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), 
freq="L") assert i1 == expected - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") assert i1 == expected - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") assert i1 == expected msg = "Must supply freq for ordinal value" @@ -81,161 +79,158 @@ def test_construction(self): Period(ordinal=200701) with pytest.raises(ValueError, match="Invalid frequency: X"): - Period('2007-1-1', freq='X') + Period("2007-1-1", freq="X") def test_construction_bday(self): # Biz day construction, roll forward if non-weekday - i1 = Period('3/10/12', freq='B') - i2 = Period('3/10/12', freq='D') - assert i1 == i2.asfreq('B') - i2 = Period('3/11/12', freq='D') - assert i1 == i2.asfreq('B') - i2 = Period('3/12/12', freq='D') - assert i1 == i2.asfreq('B') - - i3 = Period('3/10/12', freq='b') + i1 = Period("3/10/12", freq="B") + i2 = Period("3/10/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/11/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/12/12", freq="D") + assert i1 == i2.asfreq("B") + + i3 = Period("3/10/12", freq="b") assert i1 == i3 - i1 = Period(year=2012, month=3, day=10, freq='B') - i2 = Period('3/12/12', freq='B') + i1 = Period(year=2012, month=3, day=10, freq="B") + i2 = Period("3/12/12", freq="B") assert i1 == i2 def test_construction_quarter(self): - i1 = Period(year=2005, quarter=1, freq='Q') - i2 = Period('1/1/2005', freq='Q') + i1 = Period(year=2005, quarter=1, freq="Q") + i2 = Period("1/1/2005", freq="Q") assert i1 == i2 - i1 = Period(year=2005, quarter=3, freq='Q') - i2 = Period('9/1/2005', freq='Q') + i1 = Period(year=2005, quarter=3, freq="Q") + i2 = Period("9/1/2005", freq="Q") assert i1 == i2 - i1 = Period('2005Q1') - i2 = Period(year=2005, quarter=1, freq='Q') - i3 = Period('2005q1') + i1 = Period("2005Q1") + i2 = Period(year=2005, quarter=1, freq="Q") + i3 = Period("2005q1") assert i1 == i2 assert i1 == i3 - i1 = Period('05Q1') + i1 = Period("05Q1") assert i1 == i2 - lower = Period('05q1') + lower = Period("05q1") assert i1 == lower - i1 = Period('1Q2005') + i1 = Period("1Q2005") assert i1 == i2 - lower = Period('1q2005') + lower = Period("1q2005") assert i1 == lower - i1 = Period('1Q05') + i1 = Period("1Q05") assert i1 == i2 - lower = Period('1q05') + lower = Period("1q05") assert i1 == lower - i1 = Period('4Q1984') + i1 = Period("4Q1984") assert i1.year == 1984 - lower = Period('4q1984') + lower = Period("4q1984") assert i1 == lower def test_construction_month(self): - expected = Period('2007-01', freq='M') - i1 = Period('200701', freq='M') + expected = Period("2007-01", freq="M") + i1 = Period("200701", freq="M") assert i1 == expected - i1 = Period('200701', freq='M') + i1 = Period("200701", freq="M") assert i1 == expected - i1 = Period(200701, freq='M') + i1 = Period(200701, freq="M") assert i1 == expected - i1 = Period(ordinal=200701, freq='M') + i1 = Period(ordinal=200701, freq="M") assert i1.year == 18695 - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', freq='M') + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") assert i1 == i2 - i1 = 
Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 assert i1 == i5 def test_period_constructor_offsets(self): - assert (Period('1/1/2005', freq=offsets.MonthEnd()) == - Period('1/1/2005', freq='M')) - assert (Period('2005', freq=offsets.YearEnd()) == - Period('2005', freq='A')) - assert (Period('2005', freq=offsets.MonthEnd()) == - Period('2005', freq='M')) - assert (Period('3/10/12', freq=offsets.BusinessDay()) == - Period('3/10/12', freq='B')) - assert (Period('3/10/12', freq=offsets.Day()) == - Period('3/10/12', freq='D')) - - assert (Period(year=2005, quarter=1, - freq=offsets.QuarterEnd(startingMonth=12)) == - Period(year=2005, quarter=1, freq='Q')) - assert (Period(year=2005, quarter=2, - freq=offsets.QuarterEnd(startingMonth=12)) == - Period(year=2005, quarter=2, freq='Q')) - - assert (Period(year=2005, month=3, day=1, freq=offsets.Day()) == - Period(year=2005, month=3, day=1, freq='D')) - assert (Period(year=2012, month=3, day=10, freq=offsets.BDay()) == - Period(year=2012, month=3, day=10, freq='B')) - - expected = Period('2005-03-01', freq='3D') - assert (Period(year=2005, month=3, day=1, - freq=offsets.Day(3)) == expected) - assert Period(year=2005, month=3, day=1, freq='3D') == expected - - assert (Period(year=2012, month=3, day=10, - freq=offsets.BDay(3)) == - Period(year=2012, month=3, day=10, freq='3B')) - - assert (Period(200701, freq=offsets.MonthEnd()) == - Period(200701, freq='M')) + assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( + "1/1/2005", freq="M" + ) + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") + assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( + "3/10/12", freq="B" + ) + assert Period("3/10/12", freq=offsets.Day()) == Period("3/10/12", freq="D") + + assert Period( + year=2005, quarter=1, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=1, freq="Q") + assert Period( + year=2005, quarter=2, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=2, freq="Q") + + assert Period(year=2005, month=3, day=1, freq=offsets.Day()) == Period( + year=2005, month=3, day=1, freq="D" + ) + assert Period(year=2012, month=3, day=10, freq=offsets.BDay()) == Period( + year=2012, month=3, day=10, freq="B" + ) + + expected = Period("2005-03-01", freq="3D") + assert Period(year=2005, month=3, day=1, freq=offsets.Day(3)) == expected + assert Period(year=2005, month=3, day=1, freq="3D") == expected + + assert Period(year=2012, month=3, day=10, freq=offsets.BDay(3)) == Period( + year=2012, month=3, day=10, freq="3B" + ) + + assert Period(200701, freq=offsets.MonthEnd()) == Period(200701, freq="M") i1 = Period(ordinal=200701, freq=offsets.MonthEnd()) - i2 = Period(ordinal=200701, freq='M') + i2 = Period(ordinal=200701, freq="M") assert i1 == i2 assert i1.year == 18695 assert i2.year == 18695 - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', 
freq='M') + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") assert i1 == i2 - i1 = Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 assert i1 == i5 - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") assert i1 == expected - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") assert i1 == expected - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") assert i1 == expected def test_invalid_arguments(self): @@ -245,62 +240,62 @@ def test_invalid_arguments(self): Period(datetime.now().date()) with pytest.raises(ValueError): - Period(1.6, freq='D') + Period(1.6, freq="D") with pytest.raises(ValueError): - Period(ordinal=1.6, freq='D') + Period(ordinal=1.6, freq="D") with pytest.raises(ValueError): - Period(ordinal=2, value=1, freq='D') + Period(ordinal=2, value=1, freq="D") with pytest.raises(ValueError): Period(month=1) with pytest.raises(ValueError): - Period('-2000', 'A') + Period("-2000", "A") with pytest.raises(DateParseError): - Period('0', 'A') + Period("0", "A") with pytest.raises(DateParseError): - Period('1/1/-2000', 'A') + Period("1/1/-2000", "A") def test_constructor_corner(self): - expected = Period('2007-01', freq='2M') - assert Period(year=2007, month=1, freq='2M') == expected + expected = Period("2007-01", freq="2M") + assert Period(year=2007, month=1, freq="2M") == expected assert Period(None) is NaT - p = Period('2007-01-01', freq='D') + p = Period("2007-01-01", freq="D") - result = Period(p, freq='A') - exp = Period('2007', freq='A') + result = Period(p, freq="A") + exp = Period("2007", freq="A") assert result == exp def test_constructor_infer_freq(self): - p = Period('2007-01-01') - assert p.freq == 'D' + p = Period("2007-01-01") + assert p.freq == "D" - p = Period('2007-01-01 07') - assert p.freq == 'H' + p = Period("2007-01-01 07") + assert p.freq == "H" - p = Period('2007-01-01 07:10') - assert p.freq == 'T' + p = Period("2007-01-01 07:10") + assert p.freq == "T" - p = Period('2007-01-01 07:10:15') - assert p.freq == 'S' + p = Period("2007-01-01 07:10:15") + assert p.freq == "S" - p = Period('2007-01-01 07:10:15.123') - assert p.freq == 'L' + p = Period("2007-01-01 07:10:15.123") + assert p.freq == "L" - p = Period('2007-01-01 07:10:15.123000') - assert p.freq == 'L' + p = Period("2007-01-01 07:10:15.123000") + assert p.freq == "L" - p = Period('2007-01-01 07:10:15.123400') - 
assert p.freq == 'U' + p = Period("2007-01-01 07:10:15.123400") + assert p.freq == "U" def test_multiples(self): - result1 = Period('1989', freq='2A') - result2 = Period('1989', freq='A') + result1 = Period("1989", freq="2A") + result2 = Period("1989", freq="A") assert result1.ordinal == result2.ordinal - assert result1.freqstr == '2A-DEC' - assert result2.freqstr == 'A-DEC' + assert result1.freqstr == "2A-DEC" + assert result2.freqstr == "A-DEC" assert result1.freq == offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() @@ -309,428 +304,428 @@ def test_multiples(self): assert (result1 - 1).ordinal == result2.ordinal - 2 assert (-1 + result1).ordinal == result2.ordinal - 2 - @pytest.mark.parametrize('month', MONTHS) + @pytest.mark.parametrize("month", MONTHS) def test_period_cons_quarterly(self, month): # bugs in scikits.timeseries - freq = 'Q-%s' % month - exp = Period('1989Q3', freq=freq) - assert '1989Q3' in str(exp) - stamp = exp.to_timestamp('D', how='end') + freq = "Q-%s" % month + exp = Period("1989Q3", freq=freq) + assert "1989Q3" in str(exp) + stamp = exp.to_timestamp("D", how="end") p = Period(stamp, freq=freq) assert p == exp - stamp = exp.to_timestamp('3D', how='end') + stamp = exp.to_timestamp("3D", how="end") p = Period(stamp, freq=freq) assert p == exp - @pytest.mark.parametrize('month', MONTHS) + @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = 'A-%s' % month - exp = Period('1989', freq=freq) - stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) + freq = "A-%s" % month + exp = Period("1989", freq=freq) + stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) assert p == exp + 1 assert isinstance(p, Period) - @pytest.mark.parametrize('day', DAYS) - @pytest.mark.parametrize('num', range(10, 17)) + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("num", range(10, 17)) def test_period_cons_weekly(self, num, day): - daystr = '2011-02-%d' % num - freq = 'W-%s' % day + daystr = "2011-02-%d" % num + freq = "W-%s" % day result = Period(daystr, freq=freq) - expected = Period(daystr, freq='D').asfreq(freq) + expected = Period(daystr, freq="D").asfreq(freq) assert result == expected assert isinstance(result, Period) def test_period_from_ordinal(self): - p = Period('2011-01', freq='M') - res = Period._from_ordinal(p.ordinal, freq='M') + p = Period("2011-01", freq="M") + res = Period._from_ordinal(p.ordinal, freq="M") assert p == res assert isinstance(res, Period) def test_period_cons_nat(self): - p = Period('NaT', freq='M') + p = Period("NaT", freq="M") assert p is NaT - p = Period('nat', freq='W-SUN') + p = Period("nat", freq="W-SUN") assert p is NaT - p = Period(iNaT, freq='D') + p = Period(iNaT, freq="D") assert p is NaT - p = Period(iNaT, freq='3D') + p = Period(iNaT, freq="3D") assert p is NaT - p = Period(iNaT, freq='1D1H') + p = Period(iNaT, freq="1D1H") assert p is NaT - p = Period('NaT') + p = Period("NaT") assert p is NaT p = Period(iNaT) assert p is NaT def test_period_cons_mult(self): - p1 = Period('2011-01', freq='3M') - p2 = Period('2011-01', freq='M') + p1 = Period("2011-01", freq="3M") + p2 = Period("2011-01", freq="M") assert p1.ordinal == p2.ordinal assert p1.freq == offsets.MonthEnd(3) - assert p1.freqstr == '3M' + assert p1.freqstr == "3M" assert p2.freq == offsets.MonthEnd() - assert p2.freqstr == 'M' + assert p2.freqstr == "M" result = p1 + 1 assert result.ordinal == (p2 + 3).ordinal assert result.freq == p1.freq - assert 
result.freqstr == '3M' + assert result.freqstr == "3M" result = p1 - 1 assert result.ordinal == (p2 - 3).ordinal assert result.freq == p1.freq - assert result.freqstr == '3M' + assert result.freqstr == "3M" - msg = ('Frequency must be positive, because it' - ' represents span: -3M') + msg = "Frequency must be positive, because it" " represents span: -3M" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-3M') + Period("2011-01", freq="-3M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='0M') + Period("2011-01", freq="0M") def test_period_cons_combined(self): - p = [(Period('2011-01', freq='1D1H'), - Period('2011-01', freq='1H1D'), - Period('2011-01', freq='H')), - (Period(ordinal=1, freq='1D1H'), - Period(ordinal=1, freq='1H1D'), - Period(ordinal=1, freq='H'))] + p = [ + ( + Period("2011-01", freq="1D1H"), + Period("2011-01", freq="1H1D"), + Period("2011-01", freq="H"), + ), + ( + Period(ordinal=1, freq="1D1H"), + Period(ordinal=1, freq="1H1D"), + Period(ordinal=1, freq="H"), + ), + ] for p1, p2, p3 in p: assert p1.ordinal == p3.ordinal assert p2.ordinal == p3.ordinal assert p1.freq == offsets.Hour(25) - assert p1.freqstr == '25H' + assert p1.freqstr == "25H" assert p2.freq == offsets.Hour(25) - assert p2.freqstr == '25H' + assert p2.freqstr == "25H" assert p3.freq == offsets.Hour() - assert p3.freqstr == 'H' + assert p3.freqstr == "H" result = p1 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p2 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p1 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p2 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" - msg = ('Frequency must be positive, because it' - ' represents span: -25H') + msg = "Frequency must be positive, because it" " represents span: -25H" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-1D1H') + Period("2011-01", freq="-1D1H") with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-1H1D') + Period("2011-01", freq="-1H1D") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='-1D1H') + Period(ordinal=1, freq="-1D1H") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='-1H1D') + Period(ordinal=1, freq="-1H1D") - msg = ('Frequency must be positive, because it' - ' represents span: 0D') + msg = "Frequency must be positive, because it" " represents span: 0D" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='0D0H') + Period("2011-01", freq="0D0H") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='0D0H') + Period(ordinal=1, freq="0D0H") # You can only combine together day and intraday offsets - msg = ('Invalid frequency: 1W1D') + msg = "Invalid frequency: 1W1D" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='1W1D') - msg = ('Invalid frequency: 1D1W') + Period("2011-01", freq="1W1D") + msg = "Invalid frequency: 1D1W" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='1D1W') + Period("2011-01", freq="1D1W") class 
TestPeriodMethods: def test_round_trip(self): - p = Period('2000Q1') + p = Period("2000Q1") new_p = tm.round_trip_pickle(p) assert new_p == p def test_hash(self): - assert (hash(Period('2011-01', freq='M')) == - hash(Period('2011-01', freq='M'))) + assert hash(Period("2011-01", freq="M")) == hash(Period("2011-01", freq="M")) - assert (hash(Period('2011-01-01', freq='D')) != - hash(Period('2011-01', freq='M'))) + assert hash(Period("2011-01-01", freq="D")) != hash(Period("2011-01", freq="M")) - assert (hash(Period('2011-01', freq='3M')) != - hash(Period('2011-01', freq='2M'))) + assert hash(Period("2011-01", freq="3M")) != hash(Period("2011-01", freq="2M")) - assert (hash(Period('2011-01', freq='M')) != - hash(Period('2011-02', freq='M'))) + assert hash(Period("2011-01", freq="M")) != hash(Period("2011-02", freq="M")) # -------------------------------------------------------------- # to_timestamp - @pytest.mark.parametrize('tzstr', ['Europe/Brussels', - 'Asia/Tokyo', 'US/Pacific']) + @pytest.mark.parametrize("tzstr", ["Europe/Brussels", "Asia/Tokyo", "US/Pacific"]) def test_to_timestamp_tz_arg(self, tzstr): - p = Period('1/1/2005', freq='M').to_timestamp(tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='3H').to_timestamp(tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="3H").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=tzstr) - exp = Timestamp('31/12/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="A").to_timestamp(freq="A", tz=tzstr) + exp = Timestamp("31/12/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="A").to_timestamp(freq="3H", tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - @pytest.mark.parametrize('tzstr', ['dateutil/Europe/Brussels', - 'dateutil/Asia/Tokyo', - 'dateutil/US/Pacific']) + @pytest.mark.parametrize( + "tzstr", + ["dateutil/Europe/Brussels", "dateutil/Asia/Tokyo", "dateutil/US/Pacific"], + ) def test_to_timestamp_tz_arg_dateutil(self, tzstr): tz = maybe_get_tz(tzstr) - p = Period('1/1/2005', freq='M').to_timestamp(tz=tz) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) assert p == exp - assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) assert p.tz == exp.tz - p = Period('1/1/2005', freq='M').to_timestamp(freq='3H', tz=tz) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(freq="3H", tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) assert p == exp - assert p.tz == 
dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) assert p.tz == exp.tz def test_to_timestamp_tz_arg_dateutil_from_string(self): - p = Period('1/1/2005', - freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - assert p.tz == dateutil_gettz('Europe/Brussels') + p = Period("1/1/2005", freq="M").to_timestamp(tz="dateutil/Europe/Brussels") + assert p.tz == dateutil_gettz("Europe/Brussels") def test_to_timestamp_mult(self): - p = Period('2011-01', freq='M') - assert p.to_timestamp(how='S') == Timestamp('2011-01-01') - expected = Timestamp('2011-02-01') - Timedelta(1, 'ns') - assert p.to_timestamp(how='E') == expected + p = Period("2011-01", freq="M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-02-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected - p = Period('2011-01', freq='3M') - assert p.to_timestamp(how='S') == Timestamp('2011-01-01') - expected = Timestamp('2011-04-01') - Timedelta(1, 'ns') - assert p.to_timestamp(how='E') == expected + p = Period("2011-01", freq="3M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-04-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected def test_to_timestamp(self): - p = Period('1982', freq='A') - start_ts = p.to_timestamp(how='S') - aliases = ['s', 'StarT', 'BEGIn'] + p = Period("1982", freq="A") + start_ts = p.to_timestamp(how="S") + aliases = ["s", "StarT", "BEGIn"] for a in aliases: - assert start_ts == p.to_timestamp('D', how=a) + assert start_ts == p.to_timestamp("D", how=a) # freq with mult should not affect to the result - assert start_ts == p.to_timestamp('3D', how=a) + assert start_ts == p.to_timestamp("3D", how=a) - end_ts = p.to_timestamp(how='E') - aliases = ['e', 'end', 'FINIsH'] + end_ts = p.to_timestamp(how="E") + aliases = ["e", "end", "FINIsH"] for a in aliases: - assert end_ts == p.to_timestamp('D', how=a) - assert end_ts == p.to_timestamp('3D', how=a) + assert end_ts == p.to_timestamp("D", how=a) + assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ['A', 'Q', 'M', 'W', 'B', 'D', 'H', 'Min', 'S'] + from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "S"] def _ex(p): return Timestamp((p + p.freq).start_time.value - 1) for i, fcode in enumerate(from_lst): - p = Period('1982', freq=fcode) + p = Period("1982", freq=fcode) result = p.to_timestamp().to_period(fcode) assert result == p - assert p.start_time == p.to_timestamp(how='S') + assert p.start_time == p.to_timestamp(how="S") assert p.end_time == _ex(p) # Frequency other than daily - p = Period('1985', freq='A') + p = Period("1985", freq="A") - result = p.to_timestamp('H', how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp("H", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp('3H', how='end') + result = p.to_timestamp("3H", how="end") assert result == expected - result = p.to_timestamp('T', how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp("T", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp('2T', how='end') + result = p.to_timestamp("2T", how="end") assert result == expected - result = p.to_timestamp(how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp(how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected 
expected = datetime(1985, 1, 1) - result = p.to_timestamp('H', how='start') + result = p.to_timestamp("H", how="start") assert result == expected - result = p.to_timestamp('T', how='start') + result = p.to_timestamp("T", how="start") assert result == expected - result = p.to_timestamp('S', how='start') + result = p.to_timestamp("S", how="start") assert result == expected - result = p.to_timestamp('3H', how='start') + result = p.to_timestamp("3H", how="start") assert result == expected - result = p.to_timestamp('5S', how='start') + result = p.to_timestamp("5S", how="start") assert result == expected # -------------------------------------------------------------- # Rendering: __repr__, strftime, etc def test_repr(self): - p = Period('Jan-2000') - assert '2000-01' in repr(p) + p = Period("Jan-2000") + assert "2000-01" in repr(p) - p = Period('2000-12-15') - assert '2000-12-15' in repr(p) + p = Period("2000-12-15") + assert "2000-12-15" in repr(p) def test_repr_nat(self): - p = Period('nat', freq='M') + p = Period("nat", freq="M") assert repr(NaT) in repr(p) def test_millisecond_repr(self): - p = Period('2000-01-01 12:15:02.123') + p = Period("2000-01-01 12:15:02.123") assert repr(p) == "Period('2000-01-01 12:15:02.123', 'L')" def test_microsecond_repr(self): - p = Period('2000-01-01 12:15:02.123567') + p = Period("2000-01-01 12:15:02.123567") assert repr(p) == "Period('2000-01-01 12:15:02.123567', 'U')" def test_strftime(self): # GH#3363 - p = Period('2000-1-1 12:34:12', freq='S') - res = p.strftime('%Y-%m-%d %H:%M:%S') - assert res == '2000-01-01 12:34:12' + p = Period("2000-1-1 12:34:12", freq="S") + res = p.strftime("%Y-%m-%d %H:%M:%S") + assert res == "2000-01-01 12:34:12" assert isinstance(res, str) class TestPeriodProperties: "Test properties such as year, month, weekday, etc...." 
- @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'H']) + @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) def test_is_leap_year(self, freq): # GH 13727 - p = Period('2000-01-01 00:00:00', freq=freq) + p = Period("2000-01-01 00:00:00", freq=freq) assert p.is_leap_year assert isinstance(p.is_leap_year, bool) - p = Period('1999-01-01 00:00:00', freq=freq) + p = Period("1999-01-01 00:00:00", freq=freq) assert not p.is_leap_year - p = Period('2004-01-01 00:00:00', freq=freq) + p = Period("2004-01-01 00:00:00", freq=freq) assert p.is_leap_year - p = Period('2100-01-01 00:00:00', freq=freq) + p = Period("2100-01-01 00:00:00", freq=freq) assert not p.is_leap_year def test_quarterly_negative_ordinals(self): - p = Period(ordinal=-1, freq='Q-DEC') + p = Period(ordinal=-1, freq="Q-DEC") assert p.year == 1969 assert p.quarter == 4 assert isinstance(p, Period) - p = Period(ordinal=-2, freq='Q-DEC') + p = Period(ordinal=-2, freq="Q-DEC") assert p.year == 1969 assert p.quarter == 3 assert isinstance(p, Period) - p = Period(ordinal=-2, freq='M') + p = Period(ordinal=-2, freq="M") assert p.year == 1969 assert p.month == 11 assert isinstance(p, Period) def test_freq_str(self): - i1 = Period('1982', freq='Min') + i1 = Period("1982", freq="Min") assert i1.freq == offsets.Minute() - assert i1.freqstr == 'T' + assert i1.freqstr == "T" def test_period_deprecated_freq(self): - cases = {"M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], - "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], - "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], - "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], - "T": ["minute", "MINUTE", "MINUTELY", "minutely"], - "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], - "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], - "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], - "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + cases = { + "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], + "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], + "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], + "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "T": ["minute", "MINUTE", "MINUTELY", "minutely"], + "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], + } msg = INVALID_FREQ_ERR_MSG for exp, freqs in cases.items(): for freq in freqs: with pytest.raises(ValueError, match=msg): - Period('2016-03-01 09:00', freq=freq) + Period("2016-03-01 09:00", freq=freq) with pytest.raises(ValueError, match=msg): Period(ordinal=1, freq=freq) # check supported freq-aliases still works - p1 = Period('2016-03-01 09:00', freq=exp) + p1 = Period("2016-03-01 09:00", freq=exp) p2 = Period(ordinal=1, freq=exp) assert isinstance(p1, Period) assert isinstance(p2, Period) def test_start_time(self): - freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] + freq_lst = ["A", "Q", "M", "D", "H", "T", "S"] xp = datetime(2012, 1, 1) for f in freq_lst: - p = Period('2012', freq=f) + p = Period("2012", freq=f) assert p.start_time == xp - assert Period('2012', freq='B').start_time == datetime(2012, 1, 2) - assert Period('2012', freq='W').start_time == datetime(2011, 12, 26) + assert Period("2012", freq="B").start_time == datetime(2012, 1, 2) + assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) def test_end_time(self): - p = Period('2012', freq='A') + p = Period("2012", freq="A") 
def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) @@ -738,40 +733,40 @@ def _ex(*args): xp = _ex(2013, 1, 1) assert xp == p.end_time - p = Period('2012', freq='Q') + p = Period("2012", freq="Q") xp = _ex(2012, 4, 1) assert xp == p.end_time - p = Period('2012', freq='M') + p = Period("2012", freq="M") xp = _ex(2012, 2, 1) assert xp == p.end_time - p = Period('2012', freq='D') + p = Period("2012", freq="D") xp = _ex(2012, 1, 2) assert xp == p.end_time - p = Period('2012', freq='H') + p = Period("2012", freq="H") xp = _ex(2012, 1, 1, 1) assert xp == p.end_time - p = Period('2012', freq='B') + p = Period("2012", freq="B") xp = _ex(2012, 1, 3) assert xp == p.end_time - p = Period('2012', freq='W') + p = Period("2012", freq="W") xp = _ex(2012, 1, 2) assert xp == p.end_time # Test for GH 11738 - p = Period('2012', freq='15D') + p = Period("2012", freq="15D") xp = _ex(2012, 1, 16) assert xp == p.end_time - p = Period('2012', freq='1D1H') + p = Period("2012", freq="1D1H") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time - p = Period('2012', freq='1H1D') + p = Period("2012", freq="1H1D") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time @@ -779,13 +774,13 @@ def test_anchor_week_end_time(self): def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) - p = Period('2013-1-1', 'W-SAT') + p = Period("2013-1-1", "W-SAT") xp = _ex(2013, 1, 6) assert p.end_time == xp def test_properties_annually(self): # Test properties on Periods with annually frequency. - a_date = Period(freq='A', year=2007) + a_date = Period(freq="A", year=2007) assert a_date.year == 2007 def test_properties_quarterly(self): @@ -801,7 +796,7 @@ def test_properties_quarterly(self): def test_properties_monthly(self): # Test properties on Periods with daily frequency. - m_date = Period(freq='M', year=2007, month=1) + m_date = Period(freq="M", year=2007, month=1) for x in range(11): m_ival_x = m_date + x assert m_ival_x.year == 2007 @@ -817,7 +812,7 @@ def test_properties_monthly(self): def test_properties_weekly(self): # Test properties on Periods with daily frequency. - w_date = Period(freq='W', year=2007, month=1, day=7) + w_date = Period(freq="W", year=2007, month=1, day=7) # assert w_date.year == 2007 assert w_date.quarter == 1 @@ -825,12 +820,11 @@ def test_properties_weekly(self): assert w_date.week == 1 assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 - assert Period(freq='W', year=2012, - month=2, day=1).days_in_month == 29 + assert Period(freq="W", year=2012, month=2, day=1).days_in_month == 29 def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. - w_date = Period(freq='W', year=2007, month=1, day=7) + w_date = Period(freq="W", year=2007, month=1, day=7) assert w_date.year == 2007 assert w_date.quarter == 1 assert w_date.month == 1 @@ -838,16 +832,16 @@ def test_properties_weekly_legacy(self): assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 - exp = Period(freq='W', year=2012, month=2, day=1) + exp = Period(freq="W", year=2012, month=2, day=1) assert exp.days_in_month == 29 msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - Period(freq='WK', year=2007, month=1, day=7) + Period(freq="WK", year=2007, month=1, day=7) def test_properties_daily(self): # Test properties on Periods with daily frequency. 
- b_date = Period(freq='B', year=2007, month=1, day=1) + b_date = Period(freq="B", year=2007, month=1, day=1) # assert b_date.year == 2007 assert b_date.quarter == 1 @@ -856,10 +850,9 @@ def test_properties_daily(self): assert b_date.weekday == 0 assert b_date.dayofyear == 1 assert b_date.days_in_month == 31 - assert Period(freq='B', year=2012, - month=2, day=1).days_in_month == 29 + assert Period(freq="B", year=2012, month=2, day=1).days_in_month == 29 - d_date = Period(freq='D', year=2007, month=1, day=1) + d_date = Period(freq="D", year=2007, month=1, day=1) assert d_date.year == 2007 assert d_date.quarter == 1 @@ -868,13 +861,12 @@ def test_properties_daily(self): assert d_date.weekday == 0 assert d_date.dayofyear == 1 assert d_date.days_in_month == 31 - assert Period(freq='D', year=2012, month=2, - day=1).days_in_month == 29 + assert Period(freq="D", year=2012, month=2, day=1).days_in_month == 29 def test_properties_hourly(self): # Test properties on Periods with hourly frequency. - h_date1 = Period(freq='H', year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0) + h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: assert h_date.year == 2007 @@ -885,13 +877,13 @@ def test_properties_hourly(self): assert h_date.dayofyear == 1 assert h_date.hour == 0 assert h_date.days_in_month == 31 - assert Period(freq='H', year=2012, month=2, day=1, - hour=0).days_in_month == 29 + assert ( + Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + ) def test_properties_minutely(self): # Test properties on Periods with minutely frequency. - t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) + t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) # assert t_date.quarter == 1 assert t_date.month == 1 @@ -901,13 +893,16 @@ def test_properties_minutely(self): assert t_date.hour == 0 assert t_date.minute == 0 assert t_date.days_in_month == 31 - assert Period(freq='D', year=2012, month=2, day=1, hour=0, - minute=0).days_in_month == 29 + assert ( + Period(freq="D", year=2012, month=2, day=1, hour=0, minute=0).days_in_month + == 29 + ) def test_properties_secondly(self): # Test properties on Periods with secondly frequency. 
- s_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0, second=0) + s_date = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) # assert s_date.year == 2007 assert s_date.quarter == 1 @@ -919,12 +914,15 @@ def test_properties_secondly(self): assert s_date.minute == 0 assert s_date.second == 0 assert s_date.days_in_month == 31 - assert Period(freq='Min', year=2012, month=2, day=1, hour=0, - minute=0, second=0).days_in_month == 29 + assert ( + Period( + freq="Min", year=2012, month=2, day=1, hour=0, minute=0, second=0 + ).days_in_month + == 29 + ) class TestPeriodField: - def test_get_period_field_array_raises_on_out_of_range(self): msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'" with pytest.raises(ValueError, match=msg): @@ -932,13 +930,12 @@ def test_get_period_field_array_raises_on_out_of_range(self): class TestComparisons: - def setup_method(self, method): - self.january1 = Period('2000-01', 'M') - self.january2 = Period('2000-01', 'M') - self.february = Period('2000-02', 'M') - self.march = Period('2000-03', 'M') - self.day = Period('2012-01-01', 'D') + self.january1 = Period("2000-01", "M") + self.january2 = Period("2000-01", "M") + self.february = Period("2000-02", "M") + self.march = Period("2000-03", "M") + self.day = Period("2012-01-01", "D") def test_equal(self): assert self.january1 == self.january2 @@ -1000,14 +997,20 @@ def test_sort(self): assert sorted(periods) == correctPeriods def test_period_nat_comp(self): - p_nat = Period('NaT', freq='D') - p = Period('2011-01-01', freq='D') + p_nat = Period("NaT", freq="D") + p = Period("2011-01-01", freq="D") - nat = Timestamp('NaT') - t = Timestamp('2011-01-01') + nat = Timestamp("NaT") + t = Timestamp("2011-01-01") # confirm Period('NaT') work identical with Timestamp('NaT') - for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t), - (t, nat), (nat, nat)]: + for left, right in [ + (p_nat, p), + (p, p_nat), + (p_nat, p_nat), + (nat, t), + (t, nat), + (nat, nat), + ]: assert not left < right assert not left > right assert not left == right @@ -1017,30 +1020,29 @@ def test_period_nat_comp(self): class TestArithmetic: - def test_sub_delta(self): - left, right = Period('2011', freq='A'), Period('2007', freq='A') + left, right = Period("2011", freq="A"), Period("2007", freq="A") result = left - right assert result == 4 * right.freq with pytest.raises(period.IncompatibleFrequency): - left - Period('2007-01', freq='M') + left - Period("2007-01", freq="M") def test_add_integer(self): - per1 = Period(freq='D', year=2008, month=1, day=1) - per2 = Period(freq='D', year=2008, month=1, day=2) + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) assert per1 + 1 == per2 assert 1 + per1 == per2 def test_add_sub_nat(self): # GH#13071 - p = Period('2011-01', freq='M') + p = Period("2011-01", freq="M") assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT assert NaT - p is NaT - p = Period('NaT', freq='M') + p = Period("NaT", freq="M") assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT @@ -1048,8 +1050,8 @@ def test_add_sub_nat(self): def test_add_invalid(self): # GH#4731 - per1 = Period(freq='D', year=2008, month=1, day=1) - per2 = Period(freq='D', year=2008, month=1, day=2) + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) msg = r"unsupported operand type\(s\)" with pytest.raises(TypeError, match=msg): @@ -1060,20 +1062,22 @@ def 
test_add_invalid(self): per1 + per2 boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] - ids = ['identity', 'Series', 'Index'] + ids = ["identity", "Series", "Index"] - @pytest.mark.parametrize('lbox', boxes, ids=ids) - @pytest.mark.parametrize('rbox', boxes, ids=ids) + @pytest.mark.parametrize("lbox", boxes, ids=ids) + @pytest.mark.parametrize("rbox", boxes, ids=ids) def test_add_timestamp_raises(self, rbox, lbox): # GH#17983 - ts = Timestamp('2017') - per = Period('2017', freq='M') + ts = Timestamp("2017") + per = Period("2017", freq="M") # We may get a different message depending on which class raises # the error. - msg = (r"cannot add|unsupported operand|" - r"can only operate on a|incompatible type|" - r"ufunc add cannot use operands") + msg = ( + r"cannot add|unsupported operand|" + r"can only operate on a|incompatible type|" + r"ufunc add cannot use operands" + ) with pytest.raises(TypeError, match=msg): lbox(ts) + rbox(per) @@ -1084,8 +1088,8 @@ def test_add_timestamp_raises(self, rbox, lbox): lbox(per) + rbox(per) def test_sub(self): - per1 = Period('2011-01-01', freq='D') - per2 = Period('2011-01-15', freq='D') + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") off = per1.freq assert per1 - per2 == -14 * off @@ -1093,51 +1097,60 @@ def test_sub(self): msg = r"Input has different freq=M from Period\(freq=D\)" with pytest.raises(period.IncompatibleFrequency, match=msg): - per1 - Period('2011-02', freq='M') + per1 - Period("2011-02", freq="M") - @pytest.mark.parametrize('n', [1, 2, 3, 4]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) def test_sub_n_gt_1_ticks(self, tick_classes, n): # GH 23878 - p1 = pd.Period('19910905', freq=tick_classes(n)) - p2 = pd.Period('19920406', freq=tick_classes(n)) + p1 = pd.Period("19910905", freq=tick_classes(n)) + p2 = pd.Period("19920406", freq=tick_classes(n)) - expected = (pd.Period(str(p2), freq=p2.freq.base) - - pd.Period(str(p1), freq=p1.freq.base)) + expected = pd.Period(str(p2), freq=p2.freq.base) - pd.Period( + str(p1), freq=p1.freq.base + ) assert (p2 - p1) == expected - @pytest.mark.parametrize('normalize', [True, False]) - @pytest.mark.parametrize('n', [1, 2, 3, 4]) - @pytest.mark.parametrize('offset, kwd_name', [ - (pd.offsets.YearEnd, 'month'), - (pd.offsets.QuarterEnd, 'startingMonth'), - (pd.offsets.MonthEnd, None), - (pd.offsets.Week, 'weekday') - ]) + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): # GH 23878 kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" p1 = pd.Period(p1_d, freq=offset(n, normalize, **kwds)) p2 = pd.Period(p2_d, freq=offset(n, normalize, **kwds)) - expected = (pd.Period(p2_d, freq=p2.freq.base) - - pd.Period(p1_d, freq=p1.freq.base)) + expected = pd.Period(p2_d, freq=p2.freq.base) - pd.Period( + p1_d, freq=p1.freq.base + ) assert (p2 - p1) == expected def test_add_offset(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('2011', freq=freq) - exp = Period('2013', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + exp = Period("2013", freq=freq) assert p + offsets.YearEnd(2) == exp assert offsets.YearEnd(2) + p == exp 
- for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1148,19 +1161,23 @@ def test_add_offset(self): with pytest.raises(period.IncompatibleFrequency): o + p - for freq in ['M', '2M', '3M']: - p = Period('2011-03', freq=freq) - exp = Period('2011-05', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) assert p + offsets.MonthEnd(2) == exp assert offsets.MonthEnd(2) + p == exp - exp = Period('2012-03', freq=freq) + exp = Period("2012-03", freq=freq) assert p + offsets.MonthEnd(12) == exp assert offsets.MonthEnd(12) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1172,38 +1189,42 @@ def test_add_offset(self): o + p # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('2011-04-01', freq=freq) + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) - exp = Period('2011-04-06', freq=freq) + exp = Period("2011-04-06", freq=freq) assert p + offsets.Day(5) == exp assert offsets.Day(5) + p == exp - exp = Period('2011-04-02', freq=freq) + exp = Period("2011-04-02", freq=freq) assert p + offsets.Hour(24) == exp assert offsets.Hour(24) + p == exp - exp = Period('2011-04-03', freq=freq) - assert p + np.timedelta64(2, 'D') == exp + exp = Period("2011-04-03", freq=freq) + assert p + np.timedelta64(2, "D") == exp with pytest.raises(TypeError): - np.timedelta64(2, 'D') + p + np.timedelta64(2, "D") + p - exp = Period('2011-04-02', freq=freq) - assert p + np.timedelta64(3600 * 24, 's') == exp + exp = Period("2011-04-02", freq=freq) + assert p + np.timedelta64(3600 * 24, "s") == exp with pytest.raises(TypeError): - np.timedelta64(3600 * 24, 's') + p + np.timedelta64(3600 * 24, "s") + p - exp = Period('2011-03-30', freq=freq) + exp = Period("2011-03-30", freq=freq) assert p + timedelta(-2) == exp assert timedelta(-2) + p == exp - exp = Period('2011-04-03', freq=freq) + exp = Period("2011-04-03", freq=freq) assert p + timedelta(hours=48) == exp assert timedelta(hours=48) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1214,38 +1235,42 @@ def test_add_offset(self): with pytest.raises(period.IncompatibleFrequency): o + p - for freq in ['H', '2H', '3H']: - p = Period('2011-04-01 09:00', freq=freq) + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) - exp = Period('2011-04-03 09:00', freq=freq) + exp = Period("2011-04-03 09:00", freq=freq) assert p + offsets.Day(2) == exp assert offsets.Day(2) + p == exp - exp = Period('2011-04-01 12:00', freq=freq) + exp = Period("2011-04-01 12:00", freq=freq) assert p + offsets.Hour(3) == exp assert offsets.Hour(3) + p == exp - exp = Period('2011-04-01 12:00', freq=freq) - assert p + np.timedelta64(3, 'h') == exp + exp = Period("2011-04-01 12:00", freq=freq) + 
assert p + np.timedelta64(3, "h") == exp with pytest.raises(TypeError): - np.timedelta64(3, 'h') + p + np.timedelta64(3, "h") + p - exp = Period('2011-04-01 10:00', freq=freq) - assert p + np.timedelta64(3600, 's') == exp + exp = Period("2011-04-01 10:00", freq=freq) + assert p + np.timedelta64(3600, "s") == exp with pytest.raises(TypeError): - np.timedelta64(3600, 's') + p + np.timedelta64(3600, "s") + p - exp = Period('2011-04-01 11:00', freq=freq) + exp = Period("2011-04-01 11:00", freq=freq) assert p + timedelta(minutes=120) == exp assert timedelta(minutes=120) + p == exp - exp = Period('2011-04-05 12:00', freq=freq) + exp = Period("2011-04-05 12:00", freq=freq) assert p + timedelta(days=4, minutes=180) == exp assert timedelta(days=4, minutes=180) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1258,15 +1283,19 @@ def test_add_offset(self): def test_add_offset_nat(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('NaT', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) for o in [offsets.YearEnd(2)]: assert p + o is NaT assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1275,8 +1304,8 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for freq in ['M', '2M', '3M']: - p = Period('NaT', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT @@ -1286,9 +1315,13 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1298,11 +1331,16 @@ def test_add_offset_nat(self): assert o + p is NaT # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), - np.timedelta64(3600 * 24, 's'), timedelta(-2), - timedelta(hours=48)]: + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1311,9 +1349,13 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1322,19 +1364,28 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for freq in ['H', '2H', '3H']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(2), 
offsets.Hour(3), np.timedelta64(3, 'h'), - np.timedelta64(3600, 's'), timedelta(minutes=120), - timedelta(days=4, minutes=180)]: + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: assert p + o is NaT if not isinstance(o, np.timedelta64): assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1345,128 +1396,168 @@ def test_add_offset_nat(self): def test_sub_offset(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('2011', freq=freq) - assert p - offsets.YearEnd(2) == Period('2009', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + assert p - offsets.YearEnd(2) == Period("2009", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p - o - for freq in ['M', '2M', '3M']: - p = Period('2011-03', freq=freq) - assert p - offsets.MonthEnd(2) == Period('2011-01', freq=freq) - assert p - offsets.MonthEnd(12) == Period('2010-03', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p - o # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('2011-04-01', freq=freq) - assert p - offsets.Day(5) == Period('2011-03-27', freq=freq) - assert p - offsets.Hour(24) == Period('2011-03-31', freq=freq) - assert p - np.timedelta64(2, 'D') == Period( - '2011-03-30', freq=freq) - assert p - np.timedelta64(3600 * 24, 's') == Period( - '2011-03-31', freq=freq) - assert p - timedelta(-2) == Period('2011-04-03', freq=freq) - assert p - timedelta(hours=48) == Period('2011-03-30', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) + assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) + assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) + assert p - timedelta(-2) == Period("2011-04-03", freq=freq) + assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: with pytest.raises(period.IncompatibleFrequency): p - o - for freq in ['H', '2H', '3H']: - p = Period('2011-04-01 09:00', freq=freq) - 
assert p - offsets.Day(2) == Period('2011-03-30 09:00', freq=freq) - assert p - offsets.Hour(3) == Period('2011-04-01 06:00', freq=freq) - assert p - np.timedelta64(3, 'h') == Period( - '2011-04-01 06:00', freq=freq) - assert p - np.timedelta64(3600, 's') == Period( - '2011-04-01 08:00', freq=freq) - assert p - timedelta(minutes=120) == Period( - '2011-04-01 07:00', freq=freq) + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) + assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) assert p - timedelta(days=4, minutes=180) == Period( - '2011-03-28 06:00', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + "2011-03-28 06:00", freq=freq + ) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: with pytest.raises(period.IncompatibleFrequency): p - o def test_sub_offset_nat(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('NaT', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) for o in [offsets.YearEnd(2)]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p - o is NaT - for freq in ['M', '2M', '3M']: - p = Period('NaT', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p - o is NaT # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), - np.timedelta64(3600 * 24, 's'), timedelta(-2), - timedelta(hours=48)]: + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: assert p - o is NaT - for freq in ['H', '2H', '3H']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), - np.timedelta64(3600, 's'), timedelta(minutes=120), - timedelta(days=4, minutes=180)]: + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: assert p - o is NaT - for o in [offsets.YearBegin(2), 
offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: assert p - o is NaT - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_nat_ops(self, freq): - p = Period('NaT', freq=freq) + p = Period("NaT", freq=freq) assert p + 1 is NaT assert 1 + p is NaT assert p - 1 is NaT - assert p - Period('2011-01', freq=freq) is NaT - assert Period('2011-01', freq=freq) - p is NaT + assert p - Period("2011-01", freq=freq) is NaT + assert Period("2011-01", freq=freq) - p is NaT def test_period_ops_offset(self): - p = Period('2011-04-01', freq='D') + p = Period("2011-04-01", freq="D") result = p + offsets.Day() - exp = Period('2011-04-02', freq='D') + exp = Period("2011-04-02", freq="D") assert result == exp result = p - offsets.Day(2) - exp = Period('2011-03-30', freq='D') + exp = Period("2011-03-30", freq="D") assert result == exp msg = r"Input cannot be converted to Period\(freq=D\)" @@ -1479,7 +1570,7 @@ def test_period_ops_offset(self): def test_period_immutable(): # see gh-17116 - per = Period('2014Q1') + per = Period("2014Q1") with pytest.raises(AttributeError): per.ordinal = 14 @@ -1489,9 +1580,8 @@ def test_period_immutable(): # TODO: This doesn't fail on all systems; track down which -@pytest.mark.xfail(reason="Parses as Jan 1, 0007 on some systems", - strict=False) +@pytest.mark.xfail(reason="Parses as Jan 1, 0007 on some systems", strict=False) def test_small_year_parsing(): - per1 = Period('0001-01-07', 'D') + per1 = Period("0001-01-07", "D") assert per1.year == 1 assert per1.day == 7 diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 19426c3bf3ffbb..f935a7fa880c70 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -8,15 +8,28 @@ import pandas.compat as compat from pandas import ( - DatetimeIndex, Index, NaT, Period, Series, Timedelta, TimedeltaIndex, - Timestamp, isna) + DatetimeIndex, + Index, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) from pandas.core.arrays import PeriodArray from pandas.util import testing as tm -@pytest.mark.parametrize("nat,idx", [(Timestamp("NaT"), DatetimeIndex), - (Timedelta("NaT"), TimedeltaIndex), - (Period("NaT", freq="M"), PeriodArray)]) +@pytest.mark.parametrize( + "nat,idx", + [ + (Timestamp("NaT"), DatetimeIndex), + (Timedelta("NaT"), TimedeltaIndex), + (Period("NaT", freq="M"), PeriodArray), + ], +) def test_nat_fields(nat, idx): for field in idx._field_ops: @@ -72,8 +85,7 @@ def test_nat_vector_field_access(): @pytest.mark.parametrize("klass", [Timestamp, Timedelta, Period]) -@pytest.mark.parametrize("value", [None, np.nan, iNaT, float("nan"), - NaT, "NaT", "nat"]) +@pytest.mark.parametrize("value", [None, np.nan, iNaT, float("nan"), NaT, "NaT", "nat"]) def test_identity(klass, value): assert klass(value) is NaT @@ -98,13 +110,31 @@ def test_round_nat(klass, method, freq): assert round_method(freq) is ts -@pytest.mark.parametrize("method", [ - "astimezone", "combine", "ctime", "dst", "fromordinal", - "fromtimestamp", "isocalendar", "strftime", "strptime", - "time", "timestamp", "timetuple", "timetz", "toordinal", - "tzname", "utcfromtimestamp", "utcnow", "utcoffset", - "utctimetuple", "timestamp" -]) +@pytest.mark.parametrize( + "method", + [ + "astimezone", + "combine", + "ctime", + "dst", 
+ "fromordinal", + "fromtimestamp", + "isocalendar", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "toordinal", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "timestamp", + ], +) def test_nat_methods_raise(method): # see gh-9513, gh-17329 msg = "NaTType does not support {method}".format(method=method) @@ -113,38 +143,46 @@ def test_nat_methods_raise(method): getattr(NaT, method)() -@pytest.mark.parametrize("method", [ - "weekday", "isoweekday" -]) +@pytest.mark.parametrize("method", ["weekday", "isoweekday"]) def test_nat_methods_nan(method): # see gh-9513, gh-17329 assert np.isnan(getattr(NaT, method)()) -@pytest.mark.parametrize("method", [ - "date", "now", "replace", "today", - "tz_convert", "tz_localize" -]) +@pytest.mark.parametrize( + "method", ["date", "now", "replace", "today", "tz_convert", "tz_localize"] +) def test_nat_methods_nat(method): # see gh-8254, gh-9513, gh-17329 assert getattr(NaT, method)() is NaT -@pytest.mark.parametrize("get_nat", [ - lambda x: NaT, - lambda x: Timedelta(x), - lambda x: Timestamp(x) -]) +@pytest.mark.parametrize( + "get_nat", [lambda x: NaT, lambda x: Timedelta(x), lambda x: Timestamp(x)] +) def test_nat_iso_format(get_nat): # see gh-12300 assert get_nat("NaT").isoformat() == "NaT" -@pytest.mark.parametrize("klass,expected", [ - (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), - (Timedelta, ["components", "delta", "is_populated", "resolution_string", - "to_pytimedelta", "to_timedelta64", "view"]) -]) +@pytest.mark.parametrize( + "klass,expected", + [ + (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), + ( + Timedelta, + [ + "components", + "delta", + "is_populated", + "resolution_string", + "to_pytimedelta", + "to_timedelta64", + "view", + ], + ), + ], +) def test_missing_public_nat_methods(klass, expected): # see gh-17327 # @@ -154,8 +192,7 @@ def test_missing_public_nat_methods(klass, expected): nat_names = dir(NaT) klass_names = dir(klass) - missing = [x for x in klass_names if x not in nat_names and - not x.startswith("_")] + missing = [x for x in klass_names if x not in nat_names and not x.startswith("_")] missing.sort() assert missing == expected @@ -179,9 +216,11 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): nat_names = dir(NaT) klass_names = dir(klass) - overlap = [x for x in nat_names if x in klass_names and - not x.startswith("_") and - callable(getattr(klass, x))] + overlap = [ + x + for x in nat_names + if x in klass_names and not x.startswith("_") and callable(getattr(klass, x)) + ] # Timestamp takes precedence over Timedelta in terms of overlap. 
if klass is Timedelta: @@ -195,18 +234,54 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): return overlap -@pytest.mark.parametrize("klass,expected", [ - (Timestamp, ["astimezone", "ceil", "combine", "ctime", "date", "day_name", - "dst", "floor", "fromisoformat", "fromordinal", - "fromtimestamp", "isocalendar", "isoformat", "isoweekday", - "month_name", "now", "replace", "round", "strftime", - "strptime", "time", "timestamp", "timetuple", "timetz", - "to_datetime64", "to_numpy", "to_pydatetime", "today", - "toordinal", "tz_convert", "tz_localize", "tzname", - "utcfromtimestamp", "utcnow", "utcoffset", "utctimetuple", - "weekday"]), - (Timedelta, ["total_seconds"]) -]) +@pytest.mark.parametrize( + "klass,expected", + [ + ( + Timestamp, + [ + "astimezone", + "ceil", + "combine", + "ctime", + "date", + "day_name", + "dst", + "floor", + "fromisoformat", + "fromordinal", + "fromtimestamp", + "isocalendar", + "isoformat", + "isoweekday", + "month_name", + "now", + "replace", + "round", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "to_datetime64", + "to_numpy", + "to_pydatetime", + "today", + "toordinal", + "tz_convert", + "tz_localize", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "weekday", + ], + ), + (Timedelta, ["total_seconds"]), + ], +) def test_overlap_public_nat_methods(klass, expected): # see gh-17327 # @@ -221,9 +296,12 @@ def test_overlap_public_nat_methods(klass, expected): assert _get_overlap_public_nat_methods(klass) == expected -@pytest.mark.parametrize("compare", ( - _get_overlap_public_nat_methods(Timestamp, True) + - _get_overlap_public_nat_methods(Timedelta, True)) +@pytest.mark.parametrize( + "compare", + ( + _get_overlap_public_nat_methods(Timestamp, True) + + _get_overlap_public_nat_methods(Timedelta, True) + ), ) def test_nat_doc_strings(compare): # see gh-17327 @@ -249,32 +327,42 @@ def test_nat_doc_strings(compare): @pytest.mark.parametrize("op_name", list(_ops.keys())) -@pytest.mark.parametrize("value,val_type", [ - (2, "scalar"), - (1.5, "scalar"), - (np.nan, "scalar"), - (timedelta(3600), "timedelta"), - (Timedelta("5s"), "timedelta"), - (datetime(2014, 1, 1), "timestamp"), - (Timestamp("2014-01-01"), "timestamp"), - (Timestamp("2014-01-01", tz="UTC"), "timestamp"), - (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), -]) +@pytest.mark.parametrize( + "value,val_type", + [ + (2, "scalar"), + (1.5, "scalar"), + (np.nan, "scalar"), + (timedelta(3600), "timedelta"), + (Timedelta("5s"), "timedelta"), + (datetime(2014, 1, 1), "timestamp"), + (Timestamp("2014-01-01"), "timestamp"), + (Timestamp("2014-01-01", tz="UTC"), "timestamp"), + (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), + (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + ], +) def test_nat_arithmetic_scalar(op_name, value, val_type): # see gh-6873 invalid_ops = { "scalar": {"right_div_left"}, "timedelta": {"left_times_right", "right_times_left"}, - "timestamp": {"left_times_right", "right_times_left", - "left_div_right", "right_div_left"} + "timestamp": { + "left_times_right", + "right_times_left", + "left_div_right", + "right_div_left", + }, } op = _ops[op_name] if op_name in invalid_ops.get(val_type, set()): - if (val_type == "timedelta" and "times" in op_name and - isinstance(value, Timedelta)): + if ( + val_type == "timedelta" + and "times" in op_name + and isinstance(value, Timedelta) + ): msg = "Cannot 
multiply" else: msg = "unsupported operand type" @@ -290,11 +378,9 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): assert op(NaT, value) is expected -@pytest.mark.parametrize("val,expected", [ - (np.nan, NaT), - (NaT, np.nan), - (np.timedelta64("NaT"), np.nan) -]) +@pytest.mark.parametrize( + "val,expected", [(np.nan, NaT), (NaT, np.nan), (np.timedelta64("NaT"), np.nan)] +) def test_nat_rfloordiv_timedelta(val, expected): # see gh-#18846 # @@ -303,15 +389,18 @@ def test_nat_rfloordiv_timedelta(val, expected): assert td // val is expected -@pytest.mark.parametrize("op_name", [ - "left_plus_right", "right_plus_left", - "left_minus_right", "right_minus_left" -]) -@pytest.mark.parametrize("value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - TimedeltaIndex(["1 day", "2 day"], name="x"), -]) +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", "left_minus_right", "right_minus_left"], +) +@pytest.mark.parametrize( + "value", + [ + DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), + TimedeltaIndex(["1 day", "2 day"], name="x"), + ], +) def test_nat_arithmetic_index(op_name, value): # see gh-11718 exp_name = "x" @@ -325,10 +414,10 @@ def test_nat_arithmetic_index(op_name, value): tm.assert_index_equal(_ops[op_name](NaT, value), expected) -@pytest.mark.parametrize("op_name", [ - "left_plus_right", "right_plus_left", - "left_minus_right", "right_minus_left" -]) +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", "left_minus_right", "right_minus_left"], +) @pytest.mark.parametrize("box", [TimedeltaIndex, Series]) def test_nat_arithmetic_td64_vector(op_name, box): # see gh-19124 @@ -350,9 +439,7 @@ def test_to_numpy_alias(): assert isna(expected) and isna(result) -@pytest.mark.parametrize("other", [ - Timedelta(0), Timestamp(0) -]) +@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)]) def test_nat_comparisons(compare_operators_no_eq_ne, other): # GH 26039 assert getattr(NaT, compare_operators_no_eq_ne)(other) is False diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 0fdbcf6ff46475..52f32d41a02ff9 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -20,17 +20,22 @@ class TestTimedeltaAdditionSubtraction: __add__, __radd__, __sub__, __rsub__ """ - @pytest.mark.parametrize('ten_seconds', [ - Timedelta(10, unit='s'), - timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]) + + @pytest.mark.parametrize( + "ten_seconds", + [ + Timedelta(10, unit="s"), + timedelta(seconds=10), + np.timedelta64(10, "s"), + np.timedelta64(10000000000, "ns"), + pd.offsets.Second(10), + ], + ) def test_td_add_sub_ten_seconds(self, ten_seconds): # GH#6808 - base = Timestamp('20130101 09:01:12.123456') - expected_add = Timestamp('20130101 09:01:22.123456') - expected_sub = Timestamp('20130101 09:01:02.123456') + base = Timestamp("20130101 09:01:12.123456") + expected_add = Timestamp("20130101 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") result = base + ten_seconds assert result == expected_add @@ -38,17 +43,21 @@ def test_td_add_sub_ten_seconds(self, ten_seconds): result = base - ten_seconds assert result == expected_sub - @pytest.mark.parametrize('one_day_ten_secs', [ - Timedelta('1 day, 
00:00:10'), - Timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]) + @pytest.mark.parametrize( + "one_day_ten_secs", + [ + Timedelta("1 day, 00:00:10"), + Timedelta("1 days, 00:00:10"), + timedelta(days=1, seconds=10), + np.timedelta64(1, "D") + np.timedelta64(10, "s"), + pd.offsets.Day() + pd.offsets.Second(10), + ], + ) def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): # GH#6808 - base = Timestamp('20130102 09:01:12.123456') - expected_add = Timestamp('20130103 09:01:22.123456') - expected_sub = Timestamp('20130101 09:01:02.123456') + base = Timestamp("20130102 09:01:12.123456") + expected_add = Timestamp("20130103 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") result = base + one_day_ten_secs assert result == expected_add @@ -56,10 +65,10 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): result = base - one_day_ten_secs assert result == expected_sub - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -68,57 +77,57 @@ def test_td_add_datetimelike_scalar(self, op): assert isinstance(result, Timestamp) assert result == Timestamp(2016, 1, 11) - result = op(td, Timestamp('2018-01-12 18:09')) + result = op(td, Timestamp("2018-01-12 18:09")) assert isinstance(result, Timestamp) - assert result == Timestamp('2018-01-22 18:09') + assert result == Timestamp("2018-01-22 18:09") - result = op(td, np.datetime64('2018-01-12')) + result = op(td, np.datetime64("2018-01-12")) assert isinstance(result, Timestamp) - assert result == Timestamp('2018-01-22') + assert result == Timestamp("2018-01-22") result = op(td, NaT) assert result is NaT - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) assert result == Timedelta(days=20) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit='d') - result = op(td, np.timedelta64(-4, 'D')) + td = Timedelta(10, unit="d") + result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, pd.offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == 
expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() assert isinstance(result, Timedelta) @@ -129,8 +138,8 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() assert isinstance(result, Timedelta) @@ -142,13 +151,13 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit='d') - td_nat = np.timedelta64('NaT') + td = Timedelta(10, unit="d") + td_nat = np.timedelta64("NaT") result = td - td_nat assert result is NaT @@ -157,13 +166,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td - pd.offsets.Hour(1) assert isinstance(result, Timedelta) - assert result == Timedelta(239, unit='h') + assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError): td + other @@ -175,67 +184,61 @@ def test_td_add_sub_numeric_raises(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = NaT - td assert result is NaT - result = np.datetime64('NaT') - td + result = np.datetime64("NaT") - td assert result is NaT def test_td_rsub_offset(self): - result = pd.offsets.Hour(1) - Timedelta(10, unit='d') + result = pd.offsets.Hour(1) - Timedelta(10, unit="d") assert isinstance(result, Timedelta) - assert result == Timedelta(-239, unit='h') + assert result == Timedelta(-239, unit="h") def test_td_sub_timedeltalike_object_dtype_array(self): # GH#21980 - arr = np.array([Timestamp('20130101 9:01'), - Timestamp('20121230 9:02')]) - exp = np.array([Timestamp('20121231 9:01'), - Timestamp('20121229 9:02')]) - res = arr - Timedelta('1D') + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20121231 9:01"), Timestamp("20121229 9:02")]) + res = arr - Timedelta("1D") tm.assert_numpy_array_equal(res, exp) def test_td_sub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D'), - np.timedelta64(2, 'h')]) - exp = np.array([now - Timedelta('1D'), - Timedelta('0D'), - np.timedelta64(2, 'h') - Timedelta('1D')]) - res = arr - Timedelta('1D') + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) + exp = np.array( + [ + now - Timedelta("1D"), + Timedelta("0D"), + np.timedelta64(2, "h") - Timedelta("1D"), + ] + ) + res = arr - Timedelta("1D") tm.assert_numpy_array_equal(res, exp) def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D'), - np.timedelta64(2, 'h')]) + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) with pytest.raises(TypeError): - Timedelta('1D') - arr + Timedelta("1D") - arr - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, 
ops.radd]) def test_td_add_timedeltalike_object_dtype_array(self, op): # GH#21980 - arr = np.array([Timestamp('20130101 9:01'), - Timestamp('20121230 9:02')]) - exp = np.array([Timestamp('20130102 9:01'), - Timestamp('20121231 9:02')]) - res = op(arr, Timedelta('1D')) + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20130102 9:01"), Timestamp("20121231 9:02")]) + res = op(arr, Timedelta("1D")) tm.assert_numpy_array_equal(res, exp) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_mixed_timedeltalike_object_dtype_array(self, op): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D')]) - exp = np.array([now + Timedelta('1D'), - Timedelta('2D')]) - res = op(arr, Timedelta('1D')) + arr = np.array([now, Timedelta("1D")]) + exp = np.array([now + Timedelta("1D"), Timedelta("2D")]) + res = op(arr, Timedelta("1D")) tm.assert_numpy_array_equal(res, exp) @@ -254,25 +257,25 @@ class TestTimedeltaMultiplicationDivision: # --------------------------------------------------------------- # Timedelta.__mul__, __rmul__ - @pytest.mark.parametrize('td_nat', [NaT, - np.timedelta64('NaT', 'ns'), - np.timedelta64('NaT')]) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize( + "td_nat", [NaT, np.timedelta64("NaT", "ns"), np.timedelta64("NaT")] + ) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") with pytest.raises(TypeError): op(td, td_nat) - @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')]) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, nan) assert result is NaT - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_scalar(self, op): # GH#19738 td = Timedelta(minutes=3) @@ -301,19 +304,19 @@ def test_td_mul_scalar(self, op): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / pd.offsets.Hour(1) assert result == 240 assert td / td == 1 - assert td / np.timedelta64(60, 'h') == 4 + assert td / np.timedelta64(60, "h") == 4 assert np.isnan(td / NaT) def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / 2 assert isinstance(result, Timedelta) @@ -323,10 +326,10 @@ def test_td_div_numeric_scalar(self): assert isinstance(result, Timedelta) assert result == Timedelta(days=2) - @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')]) + @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / nan assert result is NaT @@ -338,11 +341,11 @@ def test_td_div_nan(self, nan): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = pd.offsets.Hour(1) / td assert result == 1 / 
240.0 - assert np.timedelta64(60, 'h') / td == 0.25 + assert np.timedelta64(60, "h") / td == 0.25 # --------------------------------------------------------------- # Timedelta.__floordiv__ @@ -362,7 +365,7 @@ def test_td_floordiv_null_scalar(self): assert td // np.nan is NaT assert np.isnan(td // NaT) - assert np.isnan(td // np.timedelta64('NaT')) + assert np.isnan(td // np.timedelta64("NaT")) def test_td_floordiv_offsets(self): # GH#19738 @@ -375,7 +378,7 @@ def test_td_floordiv_invalid_scalar(self): td = Timedelta(hours=3, minutes=4) with pytest.raises(TypeError): - td // np.datetime64('2016-01-01', dtype='datetime64[us]') + td // np.datetime64("2016-01-01", dtype="datetime64[us]") def test_td_floordiv_numeric_scalar(self): # GH#18846 @@ -400,8 +403,7 @@ def test_td_floordiv_timedeltalike_array(self): expected = np.array([3], dtype=np.int64) tm.assert_numpy_array_equal(res, expected) - res = (10 * td) // np.array([scalar.to_timedelta64(), - np.timedelta64('NaT')]) + res = (10 * td) // np.array([scalar.to_timedelta64(), np.timedelta64("NaT")]) expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) @@ -410,7 +412,7 @@ def test_td_floordiv_numeric_series(self): td = Timedelta(hours=3, minutes=4) ser = pd.Series([1], dtype=np.int64) res = td // ser - assert res.dtype.kind == 'm' + assert res.dtype.kind == "m" # --------------------------------------------------------------- # Timedelta.__rfloordiv__ @@ -435,7 +437,7 @@ def test_td_rfloordiv_null_scalar(self): td = Timedelta(hours=3, minutes=3) assert np.isnan(td.__rfloordiv__(NaT)) - assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT'))) + assert np.isnan(td.__rfloordiv__(np.timedelta64("NaT"))) def test_td_rfloordiv_offsets(self): # GH#19738 @@ -445,7 +447,7 @@ def test_td_rfloordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=3) - dt64 = np.datetime64('2016-01-01', dtype='datetime64[us]') + dt64 = np.datetime64("2016-01-01", dtype="datetime64[us]") with pytest.raises(TypeError): td.__rfloordiv__(dt64) @@ -477,8 +479,7 @@ def test_td_rfloordiv_timedeltalike_array(self): expected = np.array([3], dtype=np.int64) tm.assert_numpy_array_equal(res, expected) - arr = np.array([(10 * scalar).to_timedelta64(), - np.timedelta64('NaT')]) + arr = np.array([(10 * scalar).to_timedelta64(), np.timedelta64("NaT")]) res = td.__rfloordiv__(arr) expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) @@ -516,14 +517,14 @@ def test_mod_timedelta64_nat(self): # GH#19365 td = Timedelta(hours=37) - result = td % np.timedelta64('NaT', 'ns') + result = td % np.timedelta64("NaT", "ns") assert result is NaT def test_mod_timedelta64(self): # GH#19365 td = Timedelta(hours=37) - result = td % np.timedelta64(2, 'h') + result = td % np.timedelta64(2, "h") assert isinstance(result, Timedelta) assert result == Timedelta(hours=1) @@ -557,7 +558,7 @@ def test_mod_invalid(self): td = Timedelta(hours=37) with pytest.raises(TypeError): - td % Timestamp('2018-01-22') + td % Timestamp("2018-01-22") with pytest.raises(TypeError): td % [] @@ -573,7 +574,7 @@ def test_rmod_pytimedelta(self): def test_rmod_timedelta64(self): # GH#19365 td = Timedelta(minutes=3) - result = np.timedelta64(5, 'm') % td + result = np.timedelta64(5, "m") % td assert isinstance(result, Timedelta) assert result == Timedelta(minutes=2) @@ -582,7 +583,7 @@ def test_rmod_invalid(self): td = Timedelta(minutes=3) with pytest.raises(TypeError): - Timestamp('2018-01-22') % td + Timestamp("2018-01-22") % td with pytest.raises(TypeError): 15 % td @@ 
-601,7 +602,7 @@ def test_divmod_numeric(self): td = Timedelta(days=2, hours=6) result = divmod(td, 53 * 3600 * 1e9) - assert result[0] == Timedelta(1, unit='ns') + assert result[0] == Timedelta(1, unit="ns") assert isinstance(result[1], Timedelta) assert result[1] == Timedelta(hours=1) @@ -642,7 +643,7 @@ def test_divmod_invalid(self): td = Timedelta(days=2, hours=6) with pytest.raises(TypeError): - divmod(td, Timestamp('2018-01-22')) + divmod(td, Timestamp("2018-01-22")) def test_rdivmod_pytimedelta(self): # GH#19365 @@ -662,7 +663,7 @@ def test_rdivmod_invalid(self): td = Timedelta(minutes=3) with pytest.raises(TypeError): - divmod(Timestamp('2018-01-22'), td) + divmod(Timestamp("2018-01-22"), td) with pytest.raises(TypeError): divmod(15, td) @@ -675,16 +676,16 @@ def test_rdivmod_invalid(self): # ---------------------------------------------------------------- - @pytest.mark.parametrize('op', [ - operator.mul, - ops.rmul, - operator.truediv, - ops.rdiv, - ops.rsub]) - @pytest.mark.parametrize('arr', [ - np.array([Timestamp('20130101 9:01'), Timestamp('20121230 9:02')]), - np.array([Timestamp.now(), Timedelta('1D')]) - ]) + @pytest.mark.parametrize( + "op", [operator.mul, ops.rmul, operator.truediv, ops.rdiv, ops.rsub] + ) + @pytest.mark.parametrize( + "arr", + [ + np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]), + np.array([Timestamp.now(), Timedelta("1D")]), + ], + ) def test_td_op_timedelta_timedeltalike_array(self, op, arr): with pytest.raises(TypeError): - op(arr, Timedelta('1D')) + op(arr, Timedelta("1D")) diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py index 4d24680ac52307..9917e8bc4c9ac9 100644 --- a/pandas/tests/scalar/timedelta/test_construction.py +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -7,102 +7,108 @@ def test_construction(): - expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') - assert Timedelta(10, unit='d').value == expected - assert Timedelta(10.0, unit='d').value == expected - assert Timedelta('10 days').value == expected + expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") + assert Timedelta(10, unit="d").value == expected + assert Timedelta(10.0, unit="d").value == expected + assert Timedelta("10 days").value == expected assert Timedelta(days=10).value == expected assert Timedelta(days=10.0).value == expected - expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') - assert Timedelta('10 days 00:00:10').value == expected + expected += np.timedelta64(10, "s").astype("m8[ns]").view("i8") + assert Timedelta("10 days 00:00:10").value == expected assert Timedelta(days=10, seconds=10).value == expected assert Timedelta(days=10, milliseconds=10 * 1000).value == expected - assert Timedelta(days=10, - microseconds=10 * 1000 * 1000).value == expected + assert Timedelta(days=10, microseconds=10 * 1000 * 1000).value == expected # rounding cases assert Timedelta(82739999850000).value == 82739999850000 - assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) + assert "0 days 22:58:59.999850" in str(Timedelta(82739999850000)) assert Timedelta(123072001000000).value == 123072001000000 - assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000))) + assert "1 days 10:11:12.001" in str(Timedelta(123072001000000)) # string conversion with/without leading zero # GH#9570 - assert Timedelta('0:00:00') == timedelta(hours=0) - assert Timedelta('00:00:00') == timedelta(hours=0) - assert Timedelta('-1:00:00') == -timedelta(hours=1) - 
assert Timedelta('-01:00:00') == -timedelta(hours=1) + assert Timedelta("0:00:00") == timedelta(hours=0) + assert Timedelta("00:00:00") == timedelta(hours=0) + assert Timedelta("-1:00:00") == -timedelta(hours=1) + assert Timedelta("-01:00:00") == -timedelta(hours=1) # more strings & abbrevs # GH#8190 - assert Timedelta('1 h') == timedelta(hours=1) - assert Timedelta('1 hour') == timedelta(hours=1) - assert Timedelta('1 hr') == timedelta(hours=1) - assert Timedelta('1 hours') == timedelta(hours=1) - assert Timedelta('-1 hours') == -timedelta(hours=1) - assert Timedelta('1 m') == timedelta(minutes=1) - assert Timedelta('1.5 m') == timedelta(seconds=90) - assert Timedelta('1 minute') == timedelta(minutes=1) - assert Timedelta('1 minutes') == timedelta(minutes=1) - assert Timedelta('1 s') == timedelta(seconds=1) - assert Timedelta('1 second') == timedelta(seconds=1) - assert Timedelta('1 seconds') == timedelta(seconds=1) - assert Timedelta('1 ms') == timedelta(milliseconds=1) - assert Timedelta('1 milli') == timedelta(milliseconds=1) - assert Timedelta('1 millisecond') == timedelta(milliseconds=1) - assert Timedelta('1 us') == timedelta(microseconds=1) - assert Timedelta('1 micros') == timedelta(microseconds=1) - assert Timedelta('1 microsecond') == timedelta(microseconds=1) - assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500') - assert Timedelta('1 ns') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nano') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001') + assert Timedelta("1 h") == timedelta(hours=1) + assert Timedelta("1 hour") == timedelta(hours=1) + assert Timedelta("1 hr") == timedelta(hours=1) + assert Timedelta("1 hours") == timedelta(hours=1) + assert Timedelta("-1 hours") == -timedelta(hours=1) + assert Timedelta("1 m") == timedelta(minutes=1) + assert Timedelta("1.5 m") == timedelta(seconds=90) + assert Timedelta("1 minute") == timedelta(minutes=1) + assert Timedelta("1 minutes") == timedelta(minutes=1) + assert Timedelta("1 s") == timedelta(seconds=1) + assert Timedelta("1 second") == timedelta(seconds=1) + assert Timedelta("1 seconds") == timedelta(seconds=1) + assert Timedelta("1 ms") == timedelta(milliseconds=1) + assert Timedelta("1 milli") == timedelta(milliseconds=1) + assert Timedelta("1 millisecond") == timedelta(milliseconds=1) + assert Timedelta("1 us") == timedelta(microseconds=1) + assert Timedelta("1 micros") == timedelta(microseconds=1) + assert Timedelta("1 microsecond") == timedelta(microseconds=1) + assert Timedelta("1.5 microsecond") == Timedelta("00:00:00.000001500") + assert Timedelta("1 ns") == Timedelta("00:00:00.000000001") + assert Timedelta("1 nano") == Timedelta("00:00:00.000000001") + assert Timedelta("1 nanosecond") == Timedelta("00:00:00.000000001") # combos - assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h 1m 1s') == timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta( - days=10, hours=1, minutes=1, seconds=1, microseconds=3) - assert Timedelta('-10 days 1 h 1.5m 1s 3us') == -timedelta( - days=10, hours=1, minutes=1, seconds=31, microseconds=3) + assert Timedelta("10 days 1 hour") == timedelta(days=10, 
hours=1) + assert Timedelta("10 days 1 h") == timedelta(days=10, hours=1) + assert Timedelta("10 days 1 h 1m 1s") == timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s") == -timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s") == -timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s 3us") == -timedelta( + days=10, hours=1, minutes=1, seconds=1, microseconds=3 + ) + assert Timedelta("-10 days 1 h 1.5m 1s 3us") == -timedelta( + days=10, hours=1, minutes=1, seconds=31, microseconds=3 + ) # Currently invalid as it has a - on the hh:mm:dd part # (only allowed on the days) with pytest.raises(ValueError): - Timedelta('-10 days -1 h 1.5m 1s 3us') + Timedelta("-10 days -1 h 1.5m 1s 3us") # only leading neg signs are allowed with pytest.raises(ValueError): - Timedelta('10 days -1 h 1.5m 1s 3us') + Timedelta("10 days -1 h 1.5m 1s 3us") # no units specified with pytest.raises(ValueError): - Timedelta('3.1415') + Timedelta("3.1415") # invalid construction with pytest.raises(ValueError, match="cannot construct a Timedelta"): Timedelta() with pytest.raises(ValueError, match="unit abbreviation w/o a number"): - Timedelta('foo') + Timedelta("foo") - msg = ("cannot construct a Timedelta from " - "the passed arguments, allowed keywords are ") + msg = ( + "cannot construct a Timedelta from " + "the passed arguments, allowed keywords are " + ) with pytest.raises(ValueError, match=msg): Timedelta(day=10) # floats - expected = np.timedelta64( - 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( - 500, 'ms').astype('m8[ns]').view('i8') - assert Timedelta(10.5, unit='s').value == expected + expected = np.timedelta64(10, "s").astype("m8[ns]").view("i8") + np.timedelta64( + 500, "ms" + ).astype("m8[ns]").view("i8") + assert Timedelta(10.5, unit="s").value == expected # offset assert to_timedelta(offsets.Hour(2)) == Timedelta(hours=2) @@ -110,35 +116,55 @@ def test_construction(): assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2) # GH#11995: unicode - expected = Timedelta('1H') - result = Timedelta('1H') + expected = Timedelta("1H") + result = Timedelta("1H") assert result == expected - assert to_timedelta(offsets.Hour(2)) == Timedelta('0 days, 02:00:00') + assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") with pytest.raises(ValueError): - Timedelta('foo bar') - - -@pytest.mark.parametrize('item', list({'days': 'D', - 'seconds': 's', - 'microseconds': 'us', - 'milliseconds': 'ms', - 'minutes': 'm', - 'hours': 'h', - 'weeks': 'W'}.items())) -@pytest.mark.parametrize('npdtype', [np.int64, np.int32, np.int16, - np.float64, np.float32, np.float16]) + Timedelta("foo bar") + + +@pytest.mark.parametrize( + "item", + list( + { + "days": "D", + "seconds": "s", + "microseconds": "us", + "milliseconds": "ms", + "minutes": "m", + "hours": "h", + "weeks": "W", + }.items() + ), +) +@pytest.mark.parametrize( + "npdtype", [np.int64, np.int32, np.int16, np.float64, np.float32, np.float16] +) def test_td_construction_with_np_dtypes(npdtype, item): # GH#8757: test construction with np dtypes pykwarg, npkwarg = item - expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8') + expected = np.timedelta64(1, npkwarg).astype("m8[ns]").view("i8") assert Timedelta(**{pykwarg: npdtype(1)}).value == expected -@pytest.mark.parametrize('val', [ - '1s', '-1s', '1us', '-1us', '1 day', '-1 day', - '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', - '1ns', 
'-23:59:59.999999999']) +@pytest.mark.parametrize( + "val", + [ + "1s", + "-1s", + "1us", + "-1us", + "1 day", + "-1 day", + "-23:59:59.999999", + "-1 days +23:59:59.999999", + "-1ns", + "1ns", + "-23:59:59.999999999", + ], +) def test_td_from_repr_roundtrip(val): # round-trip both for string and value td = Timedelta(val) @@ -147,58 +173,100 @@ def test_td_from_repr_roundtrip(val): # str does not normally display nanos if not td.nanoseconds: assert Timedelta(str(td)) == td - assert Timedelta(td._repr_base(format='all')) == td + assert Timedelta(td._repr_base(format="all")) == td def test_overflow_on_construction(): # GH#3374 - value = Timedelta('1day').value * 20169940 + value = Timedelta("1day").value * 20169940 with pytest.raises(OverflowError): Timedelta(value) # xref GH#17637 with pytest.raises(OverflowError): - Timedelta(7 * 19999, unit='D') + Timedelta(7 * 19999, unit="D") with pytest.raises(OverflowError): Timedelta(timedelta(days=13 * 19999)) -@pytest.mark.parametrize('fmt,exp', [ - ('P6DT0H50M3.010010012S', Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P-6DT0H50M3.010010012S', Timedelta(days=-6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P4DT12H30M5S', Timedelta(days=4, hours=12, minutes=30, seconds=5)), - ('P0DT0H0M0.000000123S', Timedelta(nanoseconds=123)), - ('P0DT0H0M0.00001S', Timedelta(microseconds=10)), - ('P0DT0H0M0.001S', Timedelta(milliseconds=1)), - ('P0DT0H1M0S', Timedelta(minutes=1)), - ('P1DT25H61M61S', Timedelta(days=1, hours=25, minutes=61, seconds=61)) -]) +@pytest.mark.parametrize( + "fmt,exp", + [ + ( + "P6DT0H50M3.010010012S", + Timedelta( + days=6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + ), + ( + "P-6DT0H50M3.010010012S", + Timedelta( + days=-6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + ), + ("P4DT12H30M5S", Timedelta(days=4, hours=12, minutes=30, seconds=5)), + ("P0DT0H0M0.000000123S", Timedelta(nanoseconds=123)), + ("P0DT0H0M0.00001S", Timedelta(microseconds=10)), + ("P0DT0H0M0.001S", Timedelta(milliseconds=1)), + ("P0DT0H1M0S", Timedelta(minutes=1)), + ("P1DT25H61M61S", Timedelta(days=1, hours=25, minutes=61, seconds=61)), + ], +) def test_iso_constructor(fmt, exp): assert Timedelta(fmt) == exp -@pytest.mark.parametrize('fmt', [ - 'PPPPPPPPPPPP', 'PDTHMS', 'P0DT999H999M999S', - 'P1DT0H0M0.0000000000000S', 'P1DT0H0M00000000000S', - 'P1DT0H0M0.S']) +@pytest.mark.parametrize( + "fmt", + [ + "PPPPPPPPPPPP", + "PDTHMS", + "P0DT999H999M999S", + "P1DT0H0M0.0000000000000S", + "P1DT0H0M00000000000S", + "P1DT0H0M0.S", + ], +) def test_iso_constructor_raises(fmt): - with pytest.raises(ValueError, match=('Invalid ISO 8601 Duration ' - 'format - {}'.format(fmt))): + with pytest.raises( + ValueError, match=("Invalid ISO 8601 Duration " "format - {}".format(fmt)) + ): Timedelta(fmt) -@pytest.mark.parametrize('constructed_td, conversion', [ - (Timedelta(nanoseconds=100), '100ns'), - (Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, milliseconds=1, - microseconds=1, nanoseconds=1), 694861001001001), - (Timedelta(microseconds=1) + Timedelta(nanoseconds=1), '1us1ns'), - (Timedelta(microseconds=1) - Timedelta(nanoseconds=1), '999ns'), - (Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2), '990ns')]) +@pytest.mark.parametrize( + "constructed_td, conversion", + [ + (Timedelta(nanoseconds=100), "100ns"), + ( + Timedelta( + days=1, + hours=1, + minutes=1, + weeks=1, + 
seconds=1, + milliseconds=1, + microseconds=1, + nanoseconds=1, + ), + 694861001001001, + ), + (Timedelta(microseconds=1) + Timedelta(nanoseconds=1), "1us1ns"), + (Timedelta(microseconds=1) - Timedelta(nanoseconds=1), "999ns"), + (Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2), "990ns"), + ], +) def test_td_constructor_on_nanoseconds(constructed_td, conversion): # GH#9273 assert constructed_td == Timedelta(conversion) @@ -206,4 +274,4 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion): def test_td_constructor_value_error(): with pytest.raises(TypeError): - Timedelta(nanoseconds='abc') + Timedelta(nanoseconds="abc") diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index 7db79f3f832f99..753186ee4b7382 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -3,25 +3,42 @@ from pandas import Timedelta -@pytest.mark.parametrize('td, expected_repr', [ - (Timedelta(10, unit='d'), "Timedelta('10 days 00:00:00')"), - (Timedelta(10, unit='s'), "Timedelta('0 days 00:00:10')"), - (Timedelta(10, unit='ms'), "Timedelta('0 days 00:00:00.010000')"), - (Timedelta(-10, unit='ms'), "Timedelta('-1 days +23:59:59.990000')")]) +@pytest.mark.parametrize( + "td, expected_repr", + [ + (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), + (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), + (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), + ], +) def test_repr(td, expected_repr): assert repr(td) == expected_repr -@pytest.mark.parametrize('td, expected_iso', [ - (Timedelta(days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, - nanoseconds=12), 'P6DT0H50M3.010010012S'), - (Timedelta(days=4, hours=12, minutes=30, seconds=5), 'P4DT12H30M5S'), - (Timedelta(nanoseconds=123), 'P0DT0H0M0.000000123S'), - # trim nano - (Timedelta(microseconds=10), 'P0DT0H0M0.00001S'), - # trim micro - (Timedelta(milliseconds=1), 'P0DT0H0M0.001S'), - # don't strip every 0 - (Timedelta(minutes=1), 'P0DT0H1M0S')]) +@pytest.mark.parametrize( + "td, expected_iso", + [ + ( + Timedelta( + days=6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + "P6DT0H50M3.010010012S", + ), + (Timedelta(days=4, hours=12, minutes=30, seconds=5), "P4DT12H30M5S"), + (Timedelta(nanoseconds=123), "P0DT0H0M0.000000123S"), + # trim nano + (Timedelta(microseconds=10), "P0DT0H0M0.00001S"), + # trim micro + (Timedelta(milliseconds=1), "P0DT0H0M0.001S"), + # don't strip every 0 + (Timedelta(minutes=1), "P0DT0H1M0S"), + ], +) def test_isoformat(td, expected_iso): assert td.isoformat() == expected_iso diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 469072970133d0..e4980be49d35fd 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -8,37 +8,36 @@ from pandas._libs.tslibs import NaT, iNaT import pandas as pd -from pandas import ( - Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta) +from pandas import Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta import pandas.util.testing as tm class TestTimedeltaArithmetic: - def test_arithmetic_overflow(self): with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') + pd.Timestamp("1700-01-01") + pd.Timedelta(13 * 19999, unit="D") with 
pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) + pd.Timestamp("1700-01-01") + timedelta(days=13 * 19999) def test_array_timedelta_floordiv(self): # https://github.com/pandas-dev/pandas/issues/19761 - ints = pd.date_range('2012-10-08', periods=4, freq='D').view('i8') + ints = pd.date_range("2012-10-08", periods=4, freq="D").view("i8") msg = r"Use 'array // timedelta.value'" with tm.assert_produces_warning(FutureWarning) as m: - result = ints // pd.Timedelta(1, unit='s') + result = ints // pd.Timedelta(1, unit="s") assert msg in str(m[0].message) - expected = np.array([1349654400, 1349740800, 1349827200, 1349913600], - dtype='i8') + expected = np.array( + [1349654400, 1349740800, 1349827200, 1349913600], dtype="i8" + ) tm.assert_numpy_array_equal(result, expected) def test_ops_error_str(self): # GH 13624 - td = Timedelta('1 day') + td = Timedelta("1 day") - for left, right in [(td, 'a'), ('a', td)]: + for left, right in [(td, "a"), ("a", td)]: with pytest.raises(TypeError): left + right @@ -55,7 +54,7 @@ class Other: other = Other() - td = Timedelta('1 day') + td = Timedelta("1 day") assert td.__add__(other) is NotImplemented assert td.__sub__(other) is NotImplemented assert td.__truediv__(other) is NotImplemented @@ -63,17 +62,17 @@ class Other: assert td.__floordiv__(other) is NotImplemented def test_unary_ops(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") # __neg__, __pos__ - assert -td == Timedelta(-10, unit='d') - assert -td == Timedelta('-10d') - assert +td == Timedelta(10, unit='d') + assert -td == Timedelta(-10, unit="d") + assert -td == Timedelta("-10d") + assert +td == Timedelta(10, unit="d") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta('10d') + assert abs(-td) == Timedelta("10d") class TestTimedeltaComparison: @@ -100,8 +99,8 @@ def test_compare_tick(self, tick_classes): def test_comparison_object_array(self): # analogous to GH#15183 - td = Timedelta('2 days') - other = Timedelta('3 hours') + td = Timedelta("2 days") + other = Timedelta("3 hours") arr = np.array([other, td], dtype=object) res = arr == td @@ -109,9 +108,7 @@ def test_comparison_object_array(self): assert (res == expected).all() # 2D case - arr = np.array([[other, td], - [td, other]], - dtype=object) + arr = np.array([[other, td], [td, other]], dtype=object) res = arr != td expected = np.array([[True, False], [False, True]], dtype=bool) assert res.shape == expected.shape @@ -119,7 +116,7 @@ def test_comparison_object_array(self): def test_compare_timedelta_ndarray(self): # GH11835 - periods = [Timedelta('0 days 01:00:00'), Timedelta('0 days 01:00:00')] + periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] arr = np.array(periods) result = arr[0] > arr expected = np.array([False, False]) @@ -131,8 +128,8 @@ def test_compare_custom_object(self): Make sure non supported operations on Timedelta returns NonImplemented and yields to other operand (GH#20829). 
""" - class CustomClass: + class CustomClass: def __init__(self, cmp_result=None): self.cmp_result = cmp_result @@ -148,7 +145,7 @@ def __eq__(self, other): def __gt__(self, other): return self.generic_result() - t = Timedelta('1s') + t = Timedelta("1s") assert not (t == "string") assert not (t == 1) @@ -163,7 +160,7 @@ def __gt__(self, other): @pytest.mark.parametrize("val", ["string", 1]) def test_compare_unknown_type(self, val): # GH20829 - t = Timedelta('1s') + t = Timedelta("1s") with pytest.raises(TypeError): t >= val with pytest.raises(TypeError): @@ -175,10 +172,14 @@ def test_compare_unknown_type(self, val): class TestTimedeltas: - - @pytest.mark.parametrize("unit, value, expected", [ - ('us', 9.999, 9999), ('ms', 9.999999, 9999999), - ('s', 9.999999999, 9999999999)]) + @pytest.mark.parametrize( + "unit, value, expected", + [ + ("us", 9.999, 9999), + ("ms", 9.999999, 9999999), + ("s", 9.999999999, 9999999999), + ], + ) def test_rounding_on_int_unit_construction(self, unit, value, expected): # GH 12690 result = Timedelta(value, unit=unit) @@ -188,8 +189,8 @@ def test_rounding_on_int_unit_construction(self, unit, value, expected): def test_total_seconds_scalar(self): # see gh-10939 - rng = Timedelta('1 days, 10:11:12.100123456') - expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9 + rng = Timedelta("1 days, 10:11:12.100123456") + expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9 tm.assert_almost_equal(rng.total_seconds(), expt) rng = Timedelta(np.nan) @@ -197,44 +198,42 @@ def test_total_seconds_scalar(self): def test_conversion(self): - for td in [Timedelta(10, unit='d'), - Timedelta('1 days, 10:11:12.012345')]: + for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt - assert (isinstance(pydt, timedelta) and not isinstance( - pydt, Timedelta)) + assert isinstance(pydt, timedelta) and not isinstance(pydt, Timedelta) - assert td == np.timedelta64(td.value, 'ns') + assert td == np.timedelta64(td.value, "ns") td64 = td.to_timedelta64() - assert td64 == np.timedelta64(td.value, 'ns') + assert td64 == np.timedelta64(td.value, "ns") assert td == td64 assert isinstance(td64, np.timedelta64) # this is NOT equal and cannot be roundtripped (because of the nanos) - td = Timedelta('1 days, 10:11:12.012345678') + td = Timedelta("1 days, 10:11:12.012345678") assert td != td.to_pytimedelta() def test_freq_conversion(self): # truediv - td = Timedelta('1 days 2 hours 3 ns') - result = td / np.timedelta64(1, 'D') + td = Timedelta("1 days 2 hours 3 ns") + result = td / np.timedelta64(1, "D") assert result == td.value / float(86400 * 1e9) - result = td / np.timedelta64(1, 's') + result = td / np.timedelta64(1, "s") assert result == td.value / float(1e9) - result = td / np.timedelta64(1, 'ns') + result = td / np.timedelta64(1, "ns") assert result == td.value # floordiv - td = Timedelta('1 days 2 hours 3 ns') - result = td // np.timedelta64(1, 'D') + td = Timedelta("1 days 2 hours 3 ns") + result = td // np.timedelta64(1, "D") assert result == 1 - result = td // np.timedelta64(1, 's') + result = td // np.timedelta64(1, "s") assert result == 93600 - result = td // np.timedelta64(1, 'ns') + result = td // np.timedelta64(1, "ns") assert result == td.value def test_fields(self): @@ -243,18 +242,18 @@ def check(value): assert isinstance(value, int) # compat to datetime.timedelta - rng = to_timedelta('1 days, 10:11:12') + rng = to_timedelta("1 days, 10:11:12") assert rng.days == 1 assert 
rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 0 assert rng.nanoseconds == 0 msg = "'Timedelta' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # GH 10050 @@ -263,28 +262,28 @@ def check(value): check(rng.microseconds) check(rng.nanoseconds) - td = Timedelta('-1 days, 10:11:12') - assert abs(td) == Timedelta('13:48:48') + td = Timedelta("-1 days, 10:11:12") + assert abs(td) == Timedelta("13:48:48") assert str(td) == "-1 days +10:11:12" - assert -td == Timedelta('0 days 13:48:48') - assert -Timedelta('-1 days, 10:11:12').value == 49728000000000 - assert Timedelta('-1 days, 10:11:12').value == -49728000000000 + assert -td == Timedelta("0 days 13:48:48") + assert -Timedelta("-1 days, 10:11:12").value == 49728000000000 + assert Timedelta("-1 days, 10:11:12").value == -49728000000000 - rng = to_timedelta('-1 days, 10:11:12.100123456') + rng = to_timedelta("-1 days, 10:11:12.100123456") assert rng.days == -1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 100 * 1000 + 123 assert rng.nanoseconds == 456 msg = "'Timedelta' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # components - tup = pd.to_timedelta(-1, 'us').components + tup = pd.to_timedelta(-1, "us").components assert tup.days == -1 assert tup.hours == 23 assert tup.minutes == 59 @@ -302,7 +301,7 @@ def check(value): check(tup.microseconds) check(tup.nanoseconds) - tup = Timedelta('-1 days 1 us').components + tup = Timedelta("-1 days 1 us").components assert tup.days == -2 assert tup.hours == 23 assert tup.minutes == 59 @@ -313,142 +312,189 @@ def check(value): def test_iso_conversion(self): # GH #21877 - expected = Timedelta(1, unit='s') - assert to_timedelta('P0DT0H0M1S') == expected + expected = Timedelta(1, unit="s") + assert to_timedelta("P0DT0H0M1S") == expected def test_nat_converters(self): - result = to_timedelta('nat').to_numpy() - assert result.dtype.kind == 'M' - assert result.astype('int64') == iNaT + result = to_timedelta("nat").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT - result = to_timedelta('nan').to_numpy() - assert result.dtype.kind == 'M' - assert result.astype('int64') == iNaT + result = to_timedelta("nan").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT @pytest.mark.filterwarnings("ignore:M and Y units are deprecated") - @pytest.mark.parametrize('units, np_unit', - [(['Y', 'y'], 'Y'), - (['M'], 'M'), - (['W', 'w'], 'W'), - (['D', 'd', 'days', 'day', 'Days', 'Day'], 'D'), - (['m', 'minute', 'min', 'minutes', 't', - 'Minute', 'Min', 'Minutes', 'T'], 'm'), - (['s', 'seconds', 'sec', 'second', - 'S', 'Seconds', 'Sec', 'Second'], 's'), - (['ms', 
'milliseconds', 'millisecond', 'milli', - 'millis', 'l', 'MS', 'Milliseconds', - 'Millisecond', 'Milli', 'Millis', 'L'], 'ms'), - (['us', 'microseconds', 'microsecond', 'micro', - 'micros', 'u', 'US', 'Microseconds', - 'Microsecond', 'Micro', 'Micros', 'U'], 'us'), - (['ns', 'nanoseconds', 'nanosecond', 'nano', - 'nanos', 'n', 'NS', 'Nanoseconds', - 'Nanosecond', 'Nano', 'Nanos', 'N'], 'ns')]) - @pytest.mark.parametrize('wrapper', [np.array, list, pd.Index]) + @pytest.mark.parametrize( + "units, np_unit", + [ + (["Y", "y"], "Y"), + (["M"], "M"), + (["W", "w"], "W"), + (["D", "d", "days", "day", "Days", "Day"], "D"), + ( + ["m", "minute", "min", "minutes", "t", "Minute", "Min", "Minutes", "T"], + "m", + ), + (["s", "seconds", "sec", "second", "S", "Seconds", "Sec", "Second"], "s"), + ( + [ + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "l", + "MS", + "Milliseconds", + "Millisecond", + "Milli", + "Millis", + "L", + ], + "ms", + ), + ( + [ + "us", + "microseconds", + "microsecond", + "micro", + "micros", + "u", + "US", + "Microseconds", + "Microsecond", + "Micro", + "Micros", + "U", + ], + "us", + ), + ( + [ + "ns", + "nanoseconds", + "nanosecond", + "nano", + "nanos", + "n", + "NS", + "Nanoseconds", + "Nanosecond", + "Nano", + "Nanos", + "N", + ], + "ns", + ), + ], + ) + @pytest.mark.parametrize("wrapper", [np.array, list, pd.Index]) def test_unit_parser(self, units, np_unit, wrapper): # validate all units, GH 6855, GH 21762 for unit in units: # array-likes - expected = TimedeltaIndex([np.timedelta64(i, np_unit) - for i in np.arange(5).tolist()]) + expected = TimedeltaIndex( + [np.timedelta64(i, np_unit) for i in np.arange(5).tolist()] + ) result = to_timedelta(wrapper(range(5)), unit=unit) tm.assert_index_equal(result, expected) result = TimedeltaIndex(wrapper(range(5)), unit=unit) tm.assert_index_equal(result, expected) - if unit == 'M': + if unit == "M": # M is treated as minutes in string repr - expected = TimedeltaIndex([np.timedelta64(i, 'm') - for i in np.arange(5).tolist()]) + expected = TimedeltaIndex( + [np.timedelta64(i, "m") for i in np.arange(5).tolist()] + ) - str_repr = ['{}{}'.format(x, unit) for x in np.arange(5)] + str_repr = ["{}{}".format(x, unit) for x in np.arange(5)] result = to_timedelta(wrapper(str_repr)) tm.assert_index_equal(result, expected) result = TimedeltaIndex(wrapper(str_repr)) tm.assert_index_equal(result, expected) # scalar - expected = Timedelta(np.timedelta64(2, np_unit).astype( - 'timedelta64[ns]')) + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) result = to_timedelta(2, unit=unit) assert result == expected result = Timedelta(2, unit=unit) assert result == expected - if unit == 'M': - expected = Timedelta(np.timedelta64(2, 'm').astype( - 'timedelta64[ns]')) + if unit == "M": + expected = Timedelta(np.timedelta64(2, "m").astype("timedelta64[ns]")) - result = to_timedelta('2{}'.format(unit)) + result = to_timedelta("2{}".format(unit)) assert result == expected - result = Timedelta('2{}'.format(unit)) + result = Timedelta("2{}".format(unit)) assert result == expected - @pytest.mark.parametrize('unit', ['Y', 'y', 'M']) + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_deprecated(self, unit): with tm.assert_produces_warning(FutureWarning) as w1: Timedelta(10, unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w1[0].message)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w2: + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w2: to_timedelta(10, unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w2[0].message)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w3: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w3: to_timedelta([1, 2], unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w3[0].message)) def test_numeric_conversions(self): - assert Timedelta(0) == np.timedelta64(0, 'ns') - assert Timedelta(10) == np.timedelta64(10, 'ns') - assert Timedelta(10, unit='ns') == np.timedelta64(10, 'ns') + assert Timedelta(0) == np.timedelta64(0, "ns") + assert Timedelta(10) == np.timedelta64(10, "ns") + assert Timedelta(10, unit="ns") == np.timedelta64(10, "ns") - assert Timedelta(10, unit='us') == np.timedelta64(10, 'us') - assert Timedelta(10, unit='ms') == np.timedelta64(10, 'ms') - assert Timedelta(10, unit='s') == np.timedelta64(10, 's') - assert Timedelta(10, unit='d') == np.timedelta64(10, 'D') + assert Timedelta(10, unit="us") == np.timedelta64(10, "us") + assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") + assert Timedelta(10, unit="s") == np.timedelta64(10, "s") + assert Timedelta(10, unit="d") == np.timedelta64(10, "D") def test_timedelta_conversions(self): - assert (Timedelta(timedelta(seconds=1)) == - np.timedelta64(1, 's').astype('m8[ns]')) - assert (Timedelta(timedelta(microseconds=1)) == - np.timedelta64(1, 'us').astype('m8[ns]')) - assert (Timedelta(timedelta(days=1)) == - np.timedelta64(1, 'D').astype('m8[ns]')) + assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(microseconds=1)) == np.timedelta64(1, "us").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(days=1)) == np.timedelta64(1, "D").astype("m8[ns]") def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars - td = Timedelta('10m7s') + td = Timedelta("10m7s") assert td.to_timedelta64() == td.to_numpy() def test_round(self): - t1 = Timedelta('1 days 02:34:56.789123456') - t2 = Timedelta('-1 days 02:34:56.789123456') - - for (freq, s1, s2) in [('N', t1, t2), - ('U', Timedelta('1 days 02:34:56.789123000'), - Timedelta('-1 days 02:34:56.789123000')), - ('L', Timedelta('1 days 02:34:56.789000000'), - Timedelta('-1 days 02:34:56.789000000')), - ('S', Timedelta('1 days 02:34:57'), - Timedelta('-1 days 02:34:57')), - ('2S', Timedelta('1 days 02:34:56'), - Timedelta('-1 days 02:34:56')), - ('5S', Timedelta('1 days 02:34:55'), - Timedelta('-1 days 02:34:55')), - ('T', Timedelta('1 days 02:35:00'), - Timedelta('-1 days 02:35:00')), - ('12T', Timedelta('1 days 02:36:00'), - Timedelta('-1 days 02:36:00')), - ('H', Timedelta('1 days 03:00:00'), - Timedelta('-1 days 03:00:00')), - ('d', Timedelta('1 days'), - Timedelta('-1 days'))]: + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") + + for (freq, s1, s2) in [ + ("N", t1, t2), + ( + "U", + Timedelta("1 days 02:34:56.789123000"), + Timedelta("-1 days 02:34:56.789123000"), + ), + ( + "L", + Timedelta("1 days 02:34:56.789000000"), + Timedelta("-1 days 02:34:56.789000000"), + ), + ("S", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2S", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5S", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("T", Timedelta("1 days 02:35:00"), Timedelta("-1 days 
02:35:00")), + ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("d", Timedelta("1 days"), Timedelta("-1 days")), + ]: r1 = t1.round(freq) assert r1 == s1 r2 = t2.round(freq) @@ -456,51 +502,60 @@ def test_round(self): # invalid for freq, msg in [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]: + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: with pytest.raises(ValueError, match=msg): t1.round(freq) - t1 = timedelta_range('1 days', periods=3, freq='1 min 2 s 3 us') + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") t2 = -1 * t1 - t1a = timedelta_range('1 days', periods=3, freq='1 min 2 s') - t1c = pd.TimedeltaIndex([1, 1, 1], unit='D') + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = pd.TimedeltaIndex([1, 1, 1], unit="D") # note that negative times round DOWN! so don't give whole numbers - for (freq, s1, s2) in [('N', t1, t2), - ('U', t1, t2), - ('L', t1a, - TimedeltaIndex(['-1 days +00:00:00', - '-2 days +23:58:58', - '-2 days +23:57:56'], - dtype='timedelta64[ns]', - freq=None) - ), - ('S', t1a, - TimedeltaIndex(['-1 days +00:00:00', - '-2 days +23:58:58', - '-2 days +23:57:56'], - dtype='timedelta64[ns]', - freq=None) - ), - ('12T', t1c, - TimedeltaIndex(['-1 days', - '-1 days', - '-1 days'], - dtype='timedelta64[ns]', - freq=None) - ), - ('H', t1c, - TimedeltaIndex(['-1 days', - '-1 days', - '-1 days'], - dtype='timedelta64[ns]', - freq=None) - ), - ('d', t1c, - pd.TimedeltaIndex([-1, -1, -1], unit='D') - )]: + for (freq, s1, s2) in [ + ("N", t1, t2), + ("U", t1, t2), + ( + "L", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "S", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "12T", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "H", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ("d", t1c, pd.TimedeltaIndex([-1, -1, -1], unit="D")), + ]: r1 = t1.round(freq) tm.assert_index_equal(r1, s1) @@ -509,114 +564,115 @@ def test_round(self): # invalid for freq, msg in [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]: + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: with pytest.raises(ValueError, match=msg): t1.round(freq) def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - for v in [pd.NaT, None, float('nan'), np.nan]: + td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + for v in [pd.NaT, None, float("nan"), np.nan]: assert not (v in td) td = to_timedelta([pd.NaT]) - for v in [pd.NaT, None, float('nan'), np.nan]: - assert (v in td) + for v in [pd.NaT, None, float("nan"), np.nan]: + assert v in td def test_identity(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) def test_short_format_converters(self): def conv(v): - return v.astype('m8[ns]') - - assert Timedelta('10') 
== np.timedelta64(10, 'ns') - assert Timedelta('10ns') == np.timedelta64(10, 'ns') - assert Timedelta('100') == np.timedelta64(100, 'ns') - assert Timedelta('100ns') == np.timedelta64(100, 'ns') - - assert Timedelta('1000') == np.timedelta64(1000, 'ns') - assert Timedelta('1000ns') == np.timedelta64(1000, 'ns') - assert Timedelta('1000NS') == np.timedelta64(1000, 'ns') - - assert Timedelta('10us') == np.timedelta64(10000, 'ns') - assert Timedelta('100us') == np.timedelta64(100000, 'ns') - assert Timedelta('1000us') == np.timedelta64(1000000, 'ns') - assert Timedelta('1000Us') == np.timedelta64(1000000, 'ns') - assert Timedelta('1000uS') == np.timedelta64(1000000, 'ns') - - assert Timedelta('1ms') == np.timedelta64(1000000, 'ns') - assert Timedelta('10ms') == np.timedelta64(10000000, 'ns') - assert Timedelta('100ms') == np.timedelta64(100000000, 'ns') - assert Timedelta('1000ms') == np.timedelta64(1000000000, 'ns') - - assert Timedelta('-1s') == -np.timedelta64(1000000000, 'ns') - assert Timedelta('1s') == np.timedelta64(1000000000, 'ns') - assert Timedelta('10s') == np.timedelta64(10000000000, 'ns') - assert Timedelta('100s') == np.timedelta64(100000000000, 'ns') - assert Timedelta('1000s') == np.timedelta64(1000000000000, 'ns') - - assert Timedelta('1d') == conv(np.timedelta64(1, 'D')) - assert Timedelta('-1d') == -conv(np.timedelta64(1, 'D')) - assert Timedelta('1D') == conv(np.timedelta64(1, 'D')) - assert Timedelta('10D') == conv(np.timedelta64(10, 'D')) - assert Timedelta('100D') == conv(np.timedelta64(100, 'D')) - assert Timedelta('1000D') == conv(np.timedelta64(1000, 'D')) - assert Timedelta('10000D') == conv(np.timedelta64(10000, 'D')) + return v.astype("m8[ns]") + + assert Timedelta("10") == np.timedelta64(10, "ns") + assert Timedelta("10ns") == np.timedelta64(10, "ns") + assert Timedelta("100") == np.timedelta64(100, "ns") + assert Timedelta("100ns") == np.timedelta64(100, "ns") + + assert Timedelta("1000") == np.timedelta64(1000, "ns") + assert Timedelta("1000ns") == np.timedelta64(1000, "ns") + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + assert Timedelta("10us") == np.timedelta64(10000, "ns") + assert Timedelta("100us") == np.timedelta64(100000, "ns") + assert Timedelta("1000us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000Us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000uS") == np.timedelta64(1000000, "ns") + + assert Timedelta("1ms") == np.timedelta64(1000000, "ns") + assert Timedelta("10ms") == np.timedelta64(10000000, "ns") + assert Timedelta("100ms") == np.timedelta64(100000000, "ns") + assert Timedelta("1000ms") == np.timedelta64(1000000000, "ns") + + assert Timedelta("-1s") == -np.timedelta64(1000000000, "ns") + assert Timedelta("1s") == np.timedelta64(1000000000, "ns") + assert Timedelta("10s") == np.timedelta64(10000000000, "ns") + assert Timedelta("100s") == np.timedelta64(100000000000, "ns") + assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") + + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + assert Timedelta("1D") == conv(np.timedelta64(1, "D")) + assert Timedelta("10D") == conv(np.timedelta64(10, "D")) + assert Timedelta("100D") == conv(np.timedelta64(100, "D")) + assert Timedelta("1000D") == conv(np.timedelta64(1000, "D")) + assert Timedelta("10000D") == conv(np.timedelta64(10000, "D")) # space - assert Timedelta(' 10000D ') == conv(np.timedelta64(10000, 'D')) - assert Timedelta(' - 10000D ') == -conv(np.timedelta64(10000, 'D')) + assert 
Timedelta(" 10000D ") == conv(np.timedelta64(10000, "D")) + assert Timedelta(" - 10000D ") == -conv(np.timedelta64(10000, "D")) # invalid with pytest.raises(ValueError): - Timedelta('1foo') + Timedelta("1foo") with pytest.raises(ValueError): - Timedelta('foo') + Timedelta("foo") def test_full_format_converters(self): def conv(v): - return v.astype('m8[ns]') + return v.astype("m8[ns]") - d1 = np.timedelta64(1, 'D') + d1 = np.timedelta64(1, "D") - assert Timedelta('1days') == conv(d1) - assert Timedelta('1days,') == conv(d1) - assert Timedelta('- 1days,') == -conv(d1) + assert Timedelta("1days") == conv(d1) + assert Timedelta("1days,") == conv(d1) + assert Timedelta("- 1days,") == -conv(d1) - assert Timedelta('00:00:01') == conv(np.timedelta64(1, 's')) - assert Timedelta('06:00:01') == conv(np.timedelta64(6 * 3600 + 1, 's')) - assert Timedelta('06:00:01.0') == conv( - np.timedelta64(6 * 3600 + 1, 's')) - assert Timedelta('06:00:01.01') == conv(np.timedelta64( - 1000 * (6 * 3600 + 1) + 10, 'ms')) + assert Timedelta("00:00:01") == conv(np.timedelta64(1, "s")) + assert Timedelta("06:00:01") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.0") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.01") == conv( + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) - assert (Timedelta('- 1days, 00:00:01') == - conv(-d1 + np.timedelta64(1, 's'))) - assert (Timedelta('1days, 06:00:01') == - conv(d1 + np.timedelta64(6 * 3600 + 1, 's'))) - assert (Timedelta('1days, 06:00:01.01') == - conv(d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, 'ms'))) + assert Timedelta("- 1days, 00:00:01") == conv(-d1 + np.timedelta64(1, "s")) + assert Timedelta("1days, 06:00:01") == conv( + d1 + np.timedelta64(6 * 3600 + 1, "s") + ) + assert Timedelta("1days, 06:00:01.01") == conv( + d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) # invalid with pytest.raises(ValueError): - Timedelta('- 1days, 00') + Timedelta("- 1days, 00") def test_overflow(self): # GH 9442 - s = Series(pd.date_range('20130101', periods=100000, freq='H')) - s[0] += pd.Timedelta('1s 1ms') + s = Series(pd.date_range("20130101", periods=100000, freq="H")) + s[0] += pd.Timedelta("1s 1ms") # mean result = (s - s.min()).mean() - expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s) - ).sum()) + expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) # the computation is converted to float so # might be some loss of precision @@ -634,24 +690,24 @@ def test_overflow(self): def test_pickle(self): - v = Timedelta('1 days 10:11:12.0123456') + v = Timedelta("1 days 10:11:12.0123456") v_p = tm.round_trip_pickle(v) assert v == v_p def test_timedelta_hash_equality(self): # GH 11129 - v = Timedelta(1, 'D') + v = Timedelta(1, "D") td = timedelta(days=1) assert hash(v) == hash(td) d = {td: 2} assert d[v] == 2 - tds = timedelta_range('1 second', periods=20) + tds = timedelta_range("1 second", periods=20) assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds) # python timedeltas drop ns resolution - ns_td = Timedelta(1, 'ns') + ns_td = Timedelta(1, "ns") assert hash(ns_td) != hash(ns_td.to_pytimedelta()) def test_implementation_limits(self): @@ -664,65 +720,65 @@ def test_implementation_limits(self): assert max_td.value == np.iinfo(np.int64).max # Beyond lower limit, a NAT before the Overflow - assert (min_td - Timedelta(1, 'ns')) is NaT + assert (min_td - Timedelta(1, "ns")) is NaT with pytest.raises(OverflowError): - min_td - Timedelta(2, 'ns') + min_td - Timedelta(2, 
"ns") with pytest.raises(OverflowError): - max_td + Timedelta(1, 'ns') + max_td + Timedelta(1, "ns") # Same tests using the internal nanosecond values - td = Timedelta(min_td.value - 1, 'ns') + td = Timedelta(min_td.value - 1, "ns") assert td is NaT with pytest.raises(OverflowError): - Timedelta(min_td.value - 2, 'ns') + Timedelta(min_td.value - 2, "ns") with pytest.raises(OverflowError): - Timedelta(max_td.value + 1, 'ns') + Timedelta(max_td.value + 1, "ns") def test_total_seconds_precision(self): # GH 19458 - assert Timedelta('30S').total_seconds() == 30.0 - assert Timedelta('0').total_seconds() == 0.0 - assert Timedelta('-2S').total_seconds() == -2.0 - assert Timedelta('5.324S').total_seconds() == 5.324 - assert (Timedelta('30S').total_seconds() - 30.0) < 1e-20 - assert (30.0 - Timedelta('30S').total_seconds()) < 1e-20 + assert Timedelta("30S").total_seconds() == 30.0 + assert Timedelta("0").total_seconds() == 0.0 + assert Timedelta("-2S").total_seconds() == -2.0 + assert Timedelta("5.324S").total_seconds() == 5.324 + assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 def test_timedelta_arithmetic(self): - data = pd.Series(['nat', '32 days'], dtype='timedelta64[ns]') - deltas = [timedelta(days=1), Timedelta(1, unit='D')] + data = pd.Series(["nat", "32 days"], dtype="timedelta64[ns]") + deltas = [timedelta(days=1), Timedelta(1, unit="D")] for delta in deltas: result_method = data.add(delta) result_operator = data + delta - expected = pd.Series(['nat', '33 days'], dtype='timedelta64[ns]') + expected = pd.Series(["nat", "33 days"], dtype="timedelta64[ns]") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) result_method = data.sub(delta) result_operator = data - delta - expected = pd.Series(['nat', '31 days'], dtype='timedelta64[ns]') + expected = pd.Series(["nat", "31 days"], dtype="timedelta64[ns]") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) # GH 9396 result_method = data.div(delta) result_operator = data / delta - expected = pd.Series([np.nan, 32.], dtype='float64') + expected = pd.Series([np.nan, 32.0], dtype="float64") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) def test_apply_to_timedelta(self): - timedelta_NaT = pd.to_timedelta('NaT') + timedelta_NaT = pd.to_timedelta("NaT") - list_of_valid_strings = ['00:00:01', '00:00:02'] + list_of_valid_strings = ["00:00:01", "00:00:02"] a = pd.to_timedelta(list_of_valid_strings) b = Series(list_of_valid_strings).apply(pd.to_timedelta) # Can't compare until apply on a Series gives the correct dtype # assert_series_equal(a, b) - list_of_strings = ['00:00:01', np.nan, pd.NaT, timedelta_NaT] + list_of_strings = ["00:00:01", np.nan, pd.NaT, timedelta_NaT] # TODO: unused? 
a = pd.to_timedelta(list_of_strings) # noqa @@ -731,7 +787,7 @@ def test_apply_to_timedelta(self): # assert_series_equal(a, b) def test_components(self): - rng = timedelta_range('1 days, 10:11:12', periods=2, freq='s') + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") rng.components # with nat @@ -743,13 +799,13 @@ def test_components(self): assert result.iloc[1].isna().all() def test_resolution_string(self): - assert Timedelta(days=1).resolution_string == 'D' - assert Timedelta(days=1, hours=6).resolution_string == 'H' - assert Timedelta(days=1, minutes=6).resolution_string == 'T' - assert Timedelta(days=1, seconds=6).resolution_string == 'S' - assert Timedelta(days=1, milliseconds=6).resolution_string == 'L' - assert Timedelta(days=1, microseconds=6).resolution_string == 'U' - assert Timedelta(days=1, nanoseconds=6).resolution_string == 'N' + assert Timedelta(days=1).resolution_string == "D" + assert Timedelta(days=1, hours=6).resolution_string == "H" + assert Timedelta(days=1, minutes=6).resolution_string == "T" + assert Timedelta(days=1, seconds=6).resolution_string == "S" + assert Timedelta(days=1, milliseconds=6).resolution_string == "L" + assert Timedelta(days=1, microseconds=6).resolution_string == "U" + assert Timedelta(days=1, nanoseconds=6).resolution_string == "N" def test_resolution_deprecated(self): # GH#21344 @@ -759,15 +815,18 @@ def test_resolution_deprecated(self): assert "Use Timedelta.resolution_string instead" in str(w[0].message) -@pytest.mark.parametrize('value, expected', [ - (Timedelta('10S'), True), - (Timedelta('-10S'), True), - (Timedelta(10, unit='ns'), True), - (Timedelta(0, unit='ns'), False), - (Timedelta(-10, unit='ns'), True), - (Timedelta(None), True), - (pd.NaT, True), -]) +@pytest.mark.parametrize( + "value, expected", + [ + (Timedelta("10S"), True), + (Timedelta("-10S"), True), + (Timedelta(10, unit="ns"), True), + (Timedelta(0, unit="ns"), False), + (Timedelta(-10, unit="ns"), True), + (Timedelta(None), True), + (pd.NaT, True), + ], +) def test_truthiness(value, expected): # https://github.com/pandas-dev/pandas/issues/21484 assert bool(value) is expected diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 4f20bdbd65ba13..58bd03129f2df0 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -29,11 +29,13 @@ def test_overflow_offset_raises(self): # xref https://github.com/statsmodels/statsmodels/issues/3374 # ends up multiplying really large numbers which overflow - stamp = Timestamp('2017-01-13 00:00:00', freq='D') + stamp = Timestamp("2017-01-13 00:00:00", freq="D") offset_overflow = 20169940 * offsets.Day(1) - msg = ("the add operation between " - r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " - "will overflow") + msg = ( + "the add operation between " + r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " + "will overflow" + ) with pytest.raises(OverflowError, match=msg): stamp + offset_overflow @@ -77,7 +79,7 @@ def test_addition_subtraction_types(self): td = timedelta(seconds=1) # build a timestamp with a frequency, since then it supports # addition/subtraction of integers - ts = Timestamp(dt, freq='D') + ts = Timestamp(dt, freq="D") with tm.assert_produces_warning(FutureWarning): # GH#22535 add/sub with integers is deprecated @@ -92,20 +94,23 @@ def test_addition_subtraction_types(self): # Timestamp +/- datetime64 not supported, so not tested (could possibly # assert error 
raised?) - td64 = np.timedelta64(1, 'D') + td64 = np.timedelta64(1, "D") assert type(ts + td64) == Timestamp assert type(ts - td64) == Timestamp - @pytest.mark.parametrize('freq, td, td64', [ - ('S', timedelta(seconds=1), np.timedelta64(1, 's')), - ('min', timedelta(minutes=1), np.timedelta64(1, 'm')), - ('H', timedelta(hours=1), np.timedelta64(1, 'h')), - ('D', timedelta(days=1), np.timedelta64(1, 'D')), - ('W', timedelta(weeks=1), np.timedelta64(1, 'W')), - ('M', None, np.timedelta64(1, 'M')) - ]) + @pytest.mark.parametrize( + "freq, td, td64", + [ + ("S", timedelta(seconds=1), np.timedelta64(1, "s")), + ("min", timedelta(minutes=1), np.timedelta64(1, "m")), + ("H", timedelta(hours=1), np.timedelta64(1, "h")), + ("D", timedelta(days=1), np.timedelta64(1, "D")), + ("W", timedelta(weeks=1), np.timedelta64(1, "W")), + ("M", None, np.timedelta64(1, "M")), + ], + ) def test_addition_subtraction_preserve_frequency(self, freq, td, td64): - ts = Timestamp('2014-03-05 00:00:00', freq=freq) + ts = Timestamp("2014-03-05 00:00:00", freq=freq) original_freq = ts.freq with tm.assert_produces_warning(FutureWarning): @@ -124,20 +129,23 @@ def test_addition_subtraction_preserve_frequency(self, freq, td, td64): assert (ts + td64).freq == original_freq assert (ts - td64).freq == original_freq - @pytest.mark.parametrize('td', [Timedelta(hours=3), - np.timedelta64(3, 'h'), - timedelta(hours=3)]) + @pytest.mark.parametrize( + "td", [Timedelta(hours=3), np.timedelta64(3, "h"), timedelta(hours=3)] + ) def test_radd_tdscalar(self, td): # GH#24775 timedelta64+Timestamp should not raise ts = Timestamp.now() assert td + ts == ts + td - @pytest.mark.parametrize('other,expected_difference', [ - (np.timedelta64(-123, 'ns'), -123), - (np.timedelta64(1234567898, 'ns'), 1234567898), - (np.timedelta64(-123, 'us'), -123000), - (np.timedelta64(-123, 'ms'), -123000000) - ]) + @pytest.mark.parametrize( + "other,expected_difference", + [ + (np.timedelta64(-123, "ns"), -123), + (np.timedelta64(1234567898, "ns"), 1234567898), + (np.timedelta64(-123, "us"), -123000), + (np.timedelta64(-123, "ms"), -123000000), + ], + ) def test_timestamp_add_timedelta64_unit(self, other, expected_difference): ts = Timestamp(datetime.utcnow()) result = ts + other diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index b572b4607108cd..4ff0f843278545 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -10,9 +10,9 @@ class TestTimestampComparison: def test_comparison_object_array(self): # GH#15183 - ts = Timestamp('2011-01-03 00:00:00-0500', tz='US/Eastern') - other = Timestamp('2011-01-01 00:00:00-0500', tz='US/Eastern') - naive = Timestamp('2011-01-01 00:00:00') + ts = Timestamp("2011-01-03 00:00:00-0500", tz="US/Eastern") + other = Timestamp("2011-01-01 00:00:00-0500", tz="US/Eastern") + naive = Timestamp("2011-01-01 00:00:00") arr = np.array([other, ts], dtype=object) res = arr == ts @@ -20,9 +20,7 @@ def test_comparison_object_array(self): assert (res == expected).all() # 2D case - arr = np.array([[other, ts], - [ts, other]], - dtype=object) + arr = np.array([[other, ts], [ts, other]], dtype=object) res = arr != ts expected = np.array([[True, False], [False, True]], dtype=bool) assert res.shape == expected.shape @@ -65,27 +63,27 @@ def test_comparison(self): def test_compare_invalid(self): # GH#8058 - val = Timestamp('20130101 12:01:02') - assert not val == 'foo' + val = Timestamp("20130101 12:01:02") + assert 
not val == "foo" assert not val == 10.0 assert not val == 1 assert not val == [] - assert not val == {'foo': 1} + assert not val == {"foo": 1} assert not val == np.float64(1) assert not val == np.int64(1) - assert val != 'foo' + assert val != "foo" assert val != 10.0 assert val != 1 assert val != [] - assert val != {'foo': 1} + assert val != {"foo": 1} assert val != np.float64(1) assert val != np.int64(1) def test_cant_compare_tz_naive_w_aware(self, utc_fixture): # see GH#1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=utc_fixture) + a = Timestamp("3/12/2012") + b = Timestamp("3/12/2012", tz=utc_fixture) with pytest.raises(TypeError): a == b @@ -119,15 +117,10 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): def test_timestamp_compare_scalars(self): # case where ndim == 0 lhs = np.datetime64(datetime(2013, 12, 6)) - rhs = Timestamp('now') - nat = Timestamp('nat') + rhs = Timestamp("now") + nat = Timestamp("nat") - ops = {'gt': 'lt', - 'lt': 'gt', - 'ge': 'le', - 'le': 'ge', - 'eq': 'eq', - 'ne': 'ne'} + ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"} for left, right in ops.items(): left_f = getattr(operator, left) @@ -143,7 +136,7 @@ def test_timestamp_compare_scalars(self): def test_timestamp_compare_with_early_datetime(self): # e.g. datetime.min - stamp = Timestamp('2012-01-01') + stamp = Timestamp("2012-01-01") assert not stamp == datetime.min assert not stamp == datetime(1600, 1, 1) @@ -159,7 +152,7 @@ def test_timestamp_compare_with_early_datetime(self): def test_compare_zerodim_array(self): # GH#26916 ts = Timestamp.now() - dt64 = np.datetime64('2016-01-01', 'ns') + dt64 = np.datetime64("2016-01-01", "ns") arr = np.array(dt64) assert arr.ndim == 0 @@ -190,7 +183,7 @@ def __eq__(self, o): return isinstance(o, Inf) inf = Inf() - timestamp = Timestamp('2018-11-30') + timestamp = Timestamp("2018-11-30") for left, right in [(inf, timestamp), (timestamp, inf)]: assert left > right or left < right diff --git a/pandas/tests/scalar/timestamp/test_rendering.py b/pandas/tests/scalar/timestamp/test_rendering.py index 69ea0a810c4ce3..6b64b230a0bb90 100644 --- a/pandas/tests/scalar/timestamp/test_rendering.py +++ b/pandas/tests/scalar/timestamp/test_rendering.py @@ -8,18 +8,18 @@ class TestTimestampRendering: - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific'] + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] - @pytest.mark.parametrize('tz', timezones) - @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) - @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001']) + @pytest.mark.parametrize("tz", timezones) + @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) + @pytest.mark.parametrize( + "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] + ) def test_repr(self, date, freq, tz): # avoid to match with timezone name freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') + if tz.startswith("dateutil"): + tz_repr = tz.replace("dateutil", "") else: tz_repr = tz @@ -50,28 +50,28 @@ def test_repr(self, date, freq, tz): def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. 
- date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) - assert '2014-03-13 00:00:00-0400' in repr(date_with_utc_offset) - assert 'tzoffset' not in repr(date_with_utc_offset) - assert 'pytz.FixedOffset(-240)' in repr(date_with_utc_offset) - expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", - 'pytz.FixedOffset(-240)') + date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) + assert "tzoffset" not in repr(date_with_utc_offset) + assert "pytz.FixedOffset(-240)" in repr(date_with_utc_offset) + expr = repr(date_with_utc_offset).replace( + "'pytz.FixedOffset(-240)'", "pytz.FixedOffset(-240)" + ) assert date_with_utc_offset == eval(expr) def test_timestamp_repr_pre1900(self): # pre-1900 - stamp = Timestamp('1850-01-01', tz='US/Eastern') + stamp = Timestamp("1850-01-01", tz="US/Eastern") repr(stamp) - iso8601 = '1850-01-01 01:23:45.012345' - stamp = Timestamp(iso8601, tz='US/Eastern') + iso8601 = "1850-01-01 01:23:45.012345" + stamp = Timestamp(iso8601, tz="US/Eastern") result = repr(stamp) assert iso8601 in result def test_pprint(self): # GH#12622 - nested_obj = {'foo': 1, - 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10} + nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10} result = pprint.pformat(nested_obj, width=50) expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index b9946796a4e1fb..7b0ff83aee5d43 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -25,21 +25,20 @@ class TestTimestampProperties: - def test_properties_business(self): - ts = Timestamp('2017-10-01', freq='B') - control = Timestamp('2017-10-01') + ts = Timestamp("2017-10-01", freq="B") + control = Timestamp("2017-10-01") assert ts.dayofweek == 6 - assert not ts.is_month_start # not a weekday + assert not ts.is_month_start # not a weekday assert not ts.is_quarter_start # not a weekday # Control case: non-business is month/qtr start assert control.is_month_start assert control.is_quarter_start - ts = Timestamp('2017-09-30', freq='B') - control = Timestamp('2017-09-30') + ts = Timestamp("2017-09-30", freq="B") + control = Timestamp("2017-09-30") assert ts.dayofweek == 5 - assert not ts.is_month_end # not a weekday + assert not ts.is_month_end # not a weekday assert not ts.is_quarter_end # not a weekday # Control case: non-business is month/qtr start assert control.is_month_end @@ -52,7 +51,7 @@ def check(value, equal): assert value == equal # GH 10050 - ts = Timestamp('2015-05-10 09:06:03.000100001') + ts = Timestamp("2015-05-10 09:06:03.000100001") check(ts.year, 2015) check(ts.month, 5) check(ts.day, 10) @@ -72,7 +71,7 @@ def check(value, equal): check(ts.daysinmonth, 31) # GH 13303 - ts = Timestamp('2014-12-31 23:59:00-05:00', tz='US/Eastern') + ts = Timestamp("2014-12-31 23:59:00-05:00", tz="US/Eastern") check(ts.year, 2014) check(ts.month, 12) check(ts.day, 31) @@ -90,30 +89,31 @@ def check(value, equal): check(ts.week, 1) check(ts.daysinmonth, 31) - ts = Timestamp('2014-01-01 00:00:00+01:00') - starts = ['is_month_start', 'is_quarter_start', 'is_year_start'] + ts = Timestamp("2014-01-01 00:00:00+01:00") + starts = ["is_month_start", "is_quarter_start", "is_year_start"] for start in starts: assert getattr(ts, start) - ts = 
Timestamp('2014-12-31 23:59:59+01:00') - ends = ['is_month_end', 'is_year_end', 'is_quarter_end'] + ts = Timestamp("2014-12-31 23:59:59+01:00") + ends = ["is_month_end", "is_year_end", "is_quarter_end"] for end in ends: assert getattr(ts, end) # GH 12806 - @pytest.mark.parametrize('data', - [Timestamp('2017-08-28 23:00:00'), - Timestamp('2017-08-28 23:00:00', tz='EST')]) - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "data", + [Timestamp("2017-08-28 23:00:00"), Timestamp("2017-08-28 23:00:00", tz="EST")], + ) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_names(self, data, time_locale): # GH 17354 # Test .weekday_name, .day_name(), .month_name - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert data.weekday_name == 'Monday' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert data.weekday_name == "Monday" if time_locale is None: - expected_day = 'Monday' - expected_month = 'August' + expected_day = "Monday" + expected_month = "August" else: with tm.set_locale(time_locale, locale.LC_TIME): expected_day = calendar.day_name[0].capitalize() @@ -127,7 +127,7 @@ def test_names(self, data, time_locale): expected_day = unicodedata.normalize("NFD", expected_day) expected_month = unicodedata.normalize("NFD", expected_month) - result_day = unicodedata.normalize("NFD", result_day,) + result_day = unicodedata.normalize("NFD", result_day) result_month = unicodedata.normalize("NFD", result_month) assert result_day == expected_day @@ -141,17 +141,17 @@ def test_names(self, data, time_locale): def test_is_leap_year(self, tz_naive_fixture): tz = tz_naive_fixture # GH 13727 - dt = Timestamp('2000-01-01 00:00:00', tz=tz) + dt = Timestamp("2000-01-01 00:00:00", tz=tz) assert dt.is_leap_year assert isinstance(dt.is_leap_year, bool) - dt = Timestamp('1999-01-01 00:00:00', tz=tz) + dt = Timestamp("1999-01-01 00:00:00", tz=tz) assert not dt.is_leap_year - dt = Timestamp('2004-01-01 00:00:00', tz=tz) + dt = Timestamp("2004-01-01 00:00:00", tz=tz) assert dt.is_leap_year - dt = Timestamp('2100-01-01 00:00:00', tz=tz) + dt = Timestamp("2100-01-01 00:00:00", tz=tz) assert not dt.is_leap_year def test_woy_boundary(self): @@ -181,43 +181,60 @@ def test_woy_boundary(self): expected = 53 # ISO standard assert result == expected - result = np.array([Timestamp(datetime(*args)).week - for args in [(2000, 1, 1), (2000, 1, 2), ( - 2005, 1, 1), (2005, 1, 2)]]) + result = np.array( + [ + Timestamp(datetime(*args)).week + for args in [(2000, 1, 1), (2000, 1, 2), (2005, 1, 1), (2005, 1, 2)] + ] + ) assert (result == [52, 52, 53, 53]).all() def test_resolution(self): # GH#21336, GH#21365 - dt = Timestamp('2100-01-01 00:00:00') + dt = Timestamp("2100-01-01 00:00:00") assert dt.resolution == Timedelta(nanoseconds=1) class TestTimestampConstructors: - def test_constructor(self): - base_str = '2014-07-01 09:00' + base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 9) base_expected = 1404205200000000000 # confirm base representation is correct import calendar - assert (calendar.timegm(base_dt.timetuple()) * 1000000000 == - base_expected) - - tests = [(base_str, base_dt, base_expected), - ('2014-07-01 10:00', datetime(2014, 7, 1, 10), - base_expected + 3600 * 1000000000), - ('2014-07-01 09:00:00.000008000', - datetime(2014, 7, 1, 9, 0, 0, 8), - base_expected + 8000), - ('2014-07-01 
09:00:00.000000005', - Timestamp('2014-07-01 09:00:00.000000005'), - base_expected + 5)] - - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] + + assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + + tests = [ + (base_str, base_dt, base_expected), + ( + "2014-07-01 10:00", + datetime(2014, 7, 1, 10), + base_expected + 3600 * 1000000000, + ), + ( + "2014-07-01 09:00:00.000008000", + datetime(2014, 7, 1, 9, 0, 0, 8), + base_expected + 8000, + ), + ( + "2014-07-01 09:00:00.000000005", + Timestamp("2014-07-01 09:00:00.000000005"), + base_expected + 5, + ), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] for date_str, date, expected in tests: for result in [Timestamp(date_str), Timestamp(date)]: @@ -232,8 +249,7 @@ def test_constructor(self): # with timezone for tz, offset in timezones: - for result in [Timestamp(date_str, tz=tz), Timestamp(date, - tz=tz)]: + for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: expected_tz = expected - offset * 3600 * 1000000000 assert result.value == expected_tz assert conversion.pydt_to_i8(result) == expected_tz @@ -245,34 +261,41 @@ def test_constructor(self): # should convert to UTC if tz is not None: - result = Timestamp(result).tz_convert('UTC') + result = Timestamp(result).tz_convert("UTC") else: - result = Timestamp(result, tz='UTC') + result = Timestamp(result, tz="UTC") expected_utc = expected - offset * 3600 * 1000000000 assert result.value == expected_utc assert conversion.pydt_to_i8(result) == expected_utc def test_constructor_with_stringoffset(self): # GH 7833 - base_str = '2014-07-01 11:00:00+02:00' + base_str = "2014-07-01 11:00:00+02:00" base_dt = datetime(2014, 7, 1, 9) base_expected = 1404205200000000000 # confirm base representation is correct import calendar - assert (calendar.timegm(base_dt.timetuple()) * 1000000000 == - base_expected) - tests = [(base_str, base_expected), - ('2014-07-01 12:00:00+02:00', - base_expected + 3600 * 1000000000), - ('2014-07-01 11:00:00.000008000+02:00', base_expected + 8000), - ('2014-07-01 11:00:00.000000005+02:00', base_expected + 5)] - - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] + assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + + tests = [ + (base_str, base_expected), + ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1000000000), + ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), + ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] for date_str, expected in tests: for result in [Timestamp(date_str)]: @@ -298,23 +321,23 @@ def test_constructor_with_stringoffset(self): assert conversion.pydt_to_i8(result) == expected_tz # should convert to UTC - result = Timestamp(result).tz_convert('UTC') + result = Timestamp(result).tz_convert("UTC") expected_utc = expected assert result.value == expected_utc assert conversion.pydt_to_i8(result) == 
expected_utc # This should be 2013-11-01 05:00 in UTC # converted to Chicago tz - result = Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago') - assert result.value == Timestamp('2013-11-01 05:00').value + result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") + assert result.value == Timestamp("2013-11-01 05:00").value expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa assert repr(result) == expected assert result == eval(repr(result)) # This should be 2013-11-01 05:00 in UTC # converted to Tokyo tz (+09:00) - result = Timestamp('2013-11-01 00:00:00-0500', tz='Asia/Tokyo') - assert result.value == Timestamp('2013-11-01 05:00').value + result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo") + assert result.value == Timestamp("2013-11-01 05:00").value expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" assert repr(result) == expected assert result == eval(repr(result)) @@ -337,41 +360,43 @@ def test_constructor_with_stringoffset(self): assert result == eval(repr(result)) def test_constructor_invalid(self): - with pytest.raises(TypeError, match='Cannot convert input'): + with pytest.raises(TypeError, match="Cannot convert input"): Timestamp(slice(2)) - with pytest.raises(ValueError, match='Cannot convert Period'): - Timestamp(Period('1000-01-01')) + with pytest.raises(ValueError, match="Cannot convert Period"): + Timestamp(Period("1000-01-01")) def test_constructor_invalid_tz(self): # GH#17690 - with pytest.raises(TypeError, match='must be a datetime.tzinfo'): - Timestamp('2017-10-22', tzinfo='US/Eastern') + with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + Timestamp("2017-10-22", tzinfo="US/Eastern") - with pytest.raises(ValueError, match='at most one of'): - Timestamp('2017-10-22', tzinfo=utc, tz='UTC') + with pytest.raises(ValueError, match="at most one of"): + Timestamp("2017-10-22", tzinfo=utc, tz="UTC") with pytest.raises(ValueError, match="Invalid frequency:"): # GH#5168 # case where user tries to pass tz as an arg, not kwarg, gets # interpreted as a `freq` - Timestamp('2012-01-01', 'US/Pacific') + Timestamp("2012-01-01", "US/Pacific") def test_constructor_strptime(self): # GH25016 # Test support for Timestamp.strptime - fmt = '%Y%m%d-%H%M%S-%f%z' - ts = '20190129-235348-000001+0000' + fmt = "%Y%m%d-%H%M%S-%f%z" + ts = "20190129-235348-000001+0000" with pytest.raises(NotImplementedError): Timestamp.strptime(ts, fmt) def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 - stamps = [Timestamp(year=2017, month=10, day=22, tz='UTC'), - Timestamp(year=2017, month=10, day=22, tzinfo=utc), - Timestamp(year=2017, month=10, day=22, tz=utc), - Timestamp(datetime(2017, 10, 22), tzinfo=utc), - Timestamp(datetime(2017, 10, 22), tz='UTC'), - Timestamp(datetime(2017, 10, 22), tz=utc)] + stamps = [ + Timestamp(year=2017, month=10, day=22, tz="UTC"), + Timestamp(year=2017, month=10, day=22, tzinfo=utc), + Timestamp(year=2017, month=10, day=22, tz=utc), + Timestamp(datetime(2017, 10, 22), tzinfo=utc), + Timestamp(datetime(2017, 10, 22), tz="UTC"), + Timestamp(datetime(2017, 10, 22), tz=utc), + ] assert all(ts == stamps[0] for ts in stamps) def test_constructor_positional(self): @@ -388,10 +413,10 @@ def test_constructor_positional(self): Timestamp(2000, 1, 32) # see gh-11630 - assert (repr(Timestamp(2015, 11, 12)) == - repr(Timestamp('20151112'))) - assert (repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == - repr(Timestamp('2015-11-12 01:02:03.999999'))) + assert repr(Timestamp(2015, 11, 12)) 
== repr(Timestamp("20151112")) + assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( + Timestamp("2015-11-12 01:02:03.999999") + ) def test_constructor_keyword(self): # GH 10758 @@ -406,23 +431,32 @@ def test_constructor_keyword(self): with pytest.raises(ValueError): Timestamp(year=2000, month=1, day=32) - assert (repr(Timestamp(year=2015, month=11, day=12)) == - repr(Timestamp('20151112'))) - - assert (repr(Timestamp(year=2015, month=11, day=12, hour=1, minute=2, - second=3, microsecond=999999)) == - repr(Timestamp('2015-11-12 01:02:03.999999'))) + assert repr(Timestamp(year=2015, month=11, day=12)) == repr( + Timestamp("20151112") + ) + + assert repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) + ) == repr(Timestamp("2015-11-12 01:02:03.999999")) def test_constructor_fromordinal(self): base = datetime(2000, 1, 1) - ts = Timestamp.fromordinal(base.toordinal(), freq='D') + ts = Timestamp.fromordinal(base.toordinal(), freq="D") assert base == ts - assert ts.freq == 'D' + assert ts.freq == "D" assert base.toordinal() == ts.toordinal() - ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') - assert Timestamp('2000-01-01', tz='US/Eastern') == ts + ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") + assert Timestamp("2000-01-01", tz="US/Eastern") == ts assert base.toordinal() == ts.toordinal() # GH#3042 @@ -431,37 +465,69 @@ def test_constructor_fromordinal(self): assert ts.to_pydatetime() == dt # with a tzinfo - stamp = Timestamp('2011-4-16', tz='US/Eastern') + stamp = Timestamp("2011-4-16", tz="US/Eastern") dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") assert ts.to_pydatetime() == dt_tz - @pytest.mark.parametrize('result', [ - Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), - Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, - microsecond=6, nanosecond=1), - Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, - microsecond=6, nanosecond=1, tz='UTC'), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC)]) + @pytest.mark.parametrize( + "result", + [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + ), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + tz="UTC", + ), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + ], + ) def test_constructor_nanosecond(self, result): # GH 18898 expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) expected = expected + Timedelta(nanoseconds=1) assert result == expected - @pytest.mark.parametrize('z', ['Z0', 'Z00']) + @pytest.mark.parametrize("z", ["Z0", "Z00"]) def test_constructor_invalid_Z0_isostring(self, z): # GH 8910 with pytest.raises(ValueError): - Timestamp('2014-11-02 01:00{}'.format(z)) - - @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute', - 'second', 'microsecond', 'nanosecond']) + Timestamp("2014-11-02 01:00{}".format(z)) + + @pytest.mark.parametrize( + "arg", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + ], + ) def test_invalid_date_kwarg_with_string_input(self, arg): kwarg = {arg: 1} with pytest.raises(ValueError): - 
Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + Timestamp("2010-10-10 12:59:59.999999999", **kwarg) def test_out_of_bounds_integer_value(self): # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError @@ -471,12 +537,12 @@ def test_out_of_bounds_integer_value(self): Timestamp(Timestamp.min.value * 2) def test_out_of_bounds_value(self): - one_us = np.timedelta64(1).astype('timedelta64[us]') + one_us = np.timedelta64(1).astype("timedelta64[us]") # By definition we can't go out of bounds in [ns], so we # convert the datetime64s to [us] so we can go out of bounds - min_ts_us = np.datetime64(Timestamp.min).astype('M8[us]') - max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]') + min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]") + max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]") # No error for the min/max datetimes Timestamp(min_ts_us) @@ -492,33 +558,33 @@ def test_out_of_bounds_value(self): def test_out_of_bounds_string(self): with pytest.raises(ValueError): - Timestamp('1676-01-01') + Timestamp("1676-01-01") with pytest.raises(ValueError): - Timestamp('2263-01-01') + Timestamp("2263-01-01") def test_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime with pytest.raises(OutOfBoundsDatetime): - Timestamp('2262-04-11 23:47:16.854775808') + Timestamp("2262-04-11 23:47:16.854775808") def test_bounds_with_different_units(self): - out_of_bounds_dates = ('1677-09-21', '2262-04-12') + out_of_bounds_dates = ("1677-09-21", "2262-04-12") - time_units = ('D', 'h', 'm', 's', 'ms', 'us') + time_units = ("D", "h", "m", "s", "ms", "us") for date_string in out_of_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype='M8[%s]' % unit) + dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) with pytest.raises(ValueError): Timestamp(dt64) - in_bounds_dates = ('1677-09-23', '2262-04-11') + in_bounds_dates = ("1677-09-23", "2262-04-11") for date_string in in_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype='M8[%s]' % unit) + dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) Timestamp(dt64) def test_min_valid(self): @@ -531,12 +597,12 @@ def test_max_valid(self): def test_now(self): # GH#9000 - ts_from_string = Timestamp('now') + ts_from_string = Timestamp("now") ts_from_method = Timestamp.now() ts_datetime = datetime.now() - ts_from_string_tz = Timestamp('now', tz='US/Eastern') - ts_from_method_tz = Timestamp.now(tz='US/Eastern') + ts_from_string_tz = Timestamp("now", tz="US/Eastern") + ts_from_method_tz = Timestamp.now(tz="US/Eastern") # Check that the delta between the times is less than 1s (arbitrarily # small) @@ -544,16 +610,21 @@ def test_now(self): assert abs(ts_from_method - ts_from_string) < delta assert abs(ts_datetime - ts_from_method) < delta assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert (abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) def test_today(self): - ts_from_string = Timestamp('today') + ts_from_string = Timestamp("today") ts_from_method = Timestamp.today() ts_datetime = datetime.today() - ts_from_string_tz = Timestamp('today', tz='US/Eastern') - ts_from_method_tz = Timestamp.today(tz='US/Eastern') + ts_from_string_tz = Timestamp("today", tz="US/Eastern") + ts_from_method_tz = Timestamp.today(tz="US/Eastern") # Check that the delta 
between the times is less than 1s (arbitrarily # small) @@ -561,26 +632,33 @@ def test_today(self): assert abs(ts_from_method - ts_from_string) < delta assert abs(ts_datetime - ts_from_method) < delta assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert (abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) - - @pytest.mark.parametrize('tz', [None, pytz.timezone('US/Pacific')]) + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH 3746 - ts = Timestamp('2010') + ts = Timestamp("2010") with pytest.raises(AttributeError): ts.tz = tz - @pytest.mark.parametrize('offset', ['+0300', '+0200']) + @pytest.mark.parametrize("offset", ["+0300", "+0200"]) def test_construct_timestamp_near_dst(self, offset): # GH 20854 - expected = Timestamp('2016-10-30 03:00:00{}'.format(offset), - tz='Europe/Helsinki') - result = Timestamp(expected).tz_convert('Europe/Helsinki') + expected = Timestamp( + "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki" + ) + result = Timestamp(expected).tz_convert("Europe/Helsinki") assert result == expected - @pytest.mark.parametrize('arg', [ - '2013/01/01 00:00:00+09:00', '2013-01-01 00:00:00+09:00']) + @pytest.mark.parametrize( + "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"] + ) def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) @@ -589,21 +667,21 @@ def test_construct_with_different_string_format(self, arg): def test_construct_timestamp_preserve_original_frequency(self): # GH 22311 - result = Timestamp(Timestamp('2010-08-08', freq='D')).freq + result = Timestamp(Timestamp("2010-08-08", freq="D")).freq expected = offsets.Day() assert result == expected def test_constructor_invalid_frequency(self): # GH 22311 with pytest.raises(ValueError, match="Invalid frequency:"): - Timestamp('2012-01-01', freq=[]) + Timestamp("2012-01-01", freq=[]) - @pytest.mark.parametrize('box', [datetime, Timestamp]) + @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_depreciate_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {'year': 2018, 'month': 1, 'day': 1, 'tzinfo': utc} + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc} with tm.assert_produces_warning(FutureWarning): - Timestamp(box(**kwargs), tz='US/Pacific') + Timestamp(box(**kwargs), tz="US/Pacific") def test_dont_convert_dateutil_utc_to_pytz_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) @@ -624,83 +702,92 @@ class SubDatetime(datetime): class TestTimestamp: - def test_tz(self): - tstr = '2014-02-01 09:00' + tstr = "2014-02-01 09:00" ts = Timestamp(tstr) - local = ts.tz_localize('Asia/Tokyo') + local = ts.tz_localize("Asia/Tokyo") assert local.hour == 9 - assert local == Timestamp(tstr, tz='Asia/Tokyo') - conv = local.tz_convert('US/Eastern') - assert conv == Timestamp('2014-01-31 19:00', tz='US/Eastern') + assert local == Timestamp(tstr, tz="Asia/Tokyo") + conv = local.tz_convert("US/Eastern") + assert conv == Timestamp("2014-01-31 19:00", tz="US/Eastern") assert conv.hour == 19 # preserves nanosecond ts = Timestamp(tstr) + offsets.Nano(5) - local = ts.tz_localize('Asia/Tokyo') + local = ts.tz_localize("Asia/Tokyo") assert local.hour == 9 assert local.nanosecond == 5 - conv = local.tz_convert('US/Eastern') + conv = local.tz_convert("US/Eastern") assert conv.nanosecond == 5 assert 
conv.hour == 19 def test_utc_z_designator(self): - assert get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo) is utc + assert get_timezone(Timestamp("2014-11-02 01:00Z").tzinfo) is utc def test_asm8(self): np.random.seed(7960929) ns = [Timestamp.min.value, Timestamp.max.value, 1000] for n in ns: - assert (Timestamp(n).asm8.view('i8') == - np.datetime64(n, 'ns').view('i8') == n) + assert ( + Timestamp(n).asm8.view("i8") == np.datetime64(n, "ns").view("i8") == n + ) - assert (Timestamp('nat').asm8.view('i8') == - np.datetime64('nat', 'ns').view('i8')) + assert Timestamp("nat").asm8.view("i8") == np.datetime64("nat", "ns").view("i8") def test_class_ops_pytz(self): def compare(x, y): - assert (int(Timestamp(x).value / 1e9) == - int(Timestamp(y).value / 1e9)) + assert int(Timestamp(x).value / 1e9) == int(Timestamp(y).value / 1e9) compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(timezone('UTC'))) + compare(Timestamp.now("UTC"), datetime.now(timezone("UTC"))) compare(Timestamp.utcnow(), datetime.utcnow()) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) date_component = datetime.utcnow() time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) def test_class_ops_dateutil(self): def compare(x, y): - assert (int(np.round(Timestamp(x).value / 1e9)) == - int(np.round(Timestamp(y).value / 1e9))) + assert int(np.round(Timestamp(x).value / 1e9)) == int( + np.round(Timestamp(y).value / 1e9) + ) compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(tzutc())) + compare(Timestamp.now("UTC"), datetime.now(tzutc())) compare(Timestamp.utcnow(), datetime.utcnow()) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) date_component = datetime.utcnow() time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) def test_basics_nanos(self): - val = np.int64(946684800000000000).view('M8[ns]') - stamp = Timestamp(val.view('i8') + 500) + val = np.int64(946684800000000000).view("M8[ns]") + stamp = Timestamp(val.view("i8") + 500) assert stamp.year == 2000 assert stamp.month == 1 assert stamp.microsecond == 0 @@ -715,26 +802,30 @@ def test_basics_nanos(self): assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - 
@pytest.mark.parametrize('value, check_kwargs', [ - [946688461000000000, {}], - [946688461000000000 / 1000, dict(unit='us')], - [946688461000000000 / 1000000, dict(unit='ms')], - [946688461000000000 / 1000000000, dict(unit='s')], - [10957, dict(unit='D', h=0)], - [(946688461000000000 + 500000) / 1000000000, - dict(unit='s', us=499, ns=964)], - [(946688461000000000 + 500000000) / 1000000000, - dict(unit='s', us=500000)], - [(946688461000000000 + 500000) / 1000000, dict(unit='ms', us=500)], - [(946688461000000000 + 500000) / 1000, dict(unit='us', us=500)], - [(946688461000000000 + 500000000) / 1000000, - dict(unit='ms', us=500000)], - [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)], - [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)], - [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)], - [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)], - [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)], - [10957 + 0.5, dict(unit='D', h=12)]]) + @pytest.mark.parametrize( + "value, check_kwargs", + [ + [946688461000000000, {}], + [946688461000000000 / 1000, dict(unit="us")], + [946688461000000000 / 1000000, dict(unit="ms")], + [946688461000000000 / 1000000000, dict(unit="s")], + [10957, dict(unit="D", h=0)], + [ + (946688461000000000 + 500000) / 1000000000, + dict(unit="s", us=499, ns=964), + ], + [(946688461000000000 + 500000000) / 1000000000, dict(unit="s", us=500000)], + [(946688461000000000 + 500000) / 1000000, dict(unit="ms", us=500)], + [(946688461000000000 + 500000) / 1000, dict(unit="us", us=500)], + [(946688461000000000 + 500000000) / 1000000, dict(unit="ms", us=500000)], + [946688461000000000 / 1000.0 + 5, dict(unit="us", us=5)], + [946688461000000000 / 1000.0 + 5000, dict(unit="us", us=5000)], + [946688461000000000 / 1000000.0 + 0.5, dict(unit="ms", us=500)], + [946688461000000000 / 1000000.0 + 0.005, dict(unit="ms", us=5, ns=5)], + [946688461000000000 / 1000000000.0 + 0.5, dict(unit="s", us=500000)], + [10957 + 0.5, dict(unit="D", h=12)], + ], + ) def test_unit(self, value, check_kwargs): def check(value, unit=None, h=1, s=1, us=0, ns=0): stamp = Timestamp(value, unit=unit) @@ -742,7 +833,7 @@ def check(value, unit=None, h=1, s=1, us=0, ns=0): assert stamp.month == 1 assert stamp.day == 1 assert stamp.hour == h - if unit != 'D': + if unit != "D": assert stamp.minute == 1 assert stamp.second == s assert stamp.microsecond == us @@ -758,26 +849,26 @@ def test_roundtrip(self): # test value to string and back conversions # further test accessors - base = Timestamp('20140101 00:00:00') + base = Timestamp("20140101 00:00:00") - result = Timestamp(base.value + Timedelta('5ms').value) + result = Timestamp(base.value + Timedelta("5ms").value) assert result == Timestamp(str(base) + ".005000") assert result.microsecond == 5000 - result = Timestamp(base.value + Timedelta('5us').value) + result = Timestamp(base.value + Timedelta("5us").value) assert result == Timestamp(str(base) + ".000005") assert result.microsecond == 5 - result = Timestamp(base.value + Timedelta('5ns').value) + result = Timestamp(base.value + Timedelta("5ns").value) assert result == Timestamp(str(base) + ".000000005") assert result.nanosecond == 5 assert result.microsecond == 0 - result = Timestamp(base.value + Timedelta('6ms 5us').value) + result = Timestamp(base.value + Timedelta("6ms 5us").value) assert result == Timestamp(str(base) + ".006005") assert result.microsecond == 5 + 6 * 1000 - result = Timestamp(base.value + Timedelta('200ms 5us').value) + result = 
Timestamp(base.value + Timedelta("200ms 5us").value) assert result == Timestamp(str(base) + ".200005") assert result.microsecond == 5 + 200 * 1000 @@ -788,43 +879,42 @@ def test_hash_equivalent(self): def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t1 = Timestamp('2019-01-01 10:00', freq='H') + t1 = Timestamp("2019-01-01 10:00", freq="H") assert t1.tz_localize(tz=tz_naive_fixture).freq == t1.freq - t2 = Timestamp('2019-01-02 12:00', tz='UTC', freq='T') - assert t2.tz_convert(tz='UTC').freq == t2.freq + t2 = Timestamp("2019-01-02 12:00", tz="UTC", freq="T") + assert t2.tz_convert(tz="UTC").freq == t2.freq class TestTimestampNsOperations: - def test_nanosecond_string_parsing(self): - ts = Timestamp('2013-05-01 07:15:45.123456789') + ts = Timestamp("2013-05-01 07:15:45.123456789") # GH 7878 - expected_repr = '2013-05-01 07:15:45.123456789' + expected_repr = "2013-05-01 07:15:45.123456789" expected_value = 1367392545123456789 assert ts.value == expected_value assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789+09:00', tz='Asia/Tokyo') + ts = Timestamp("2013-05-01 07:15:45.123456789+09:00", tz="Asia/Tokyo") assert ts.value == expected_value - 9 * 3600 * 1000000000 assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='UTC') + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="UTC") assert ts.value == expected_value assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='US/Eastern') + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="US/Eastern") assert ts.value == expected_value + 4 * 3600 * 1000000000 assert expected_repr in repr(ts) # GH 10041 - ts = Timestamp('20130501T071545.123456789') + ts = Timestamp("20130501T071545.123456789") assert ts.value == expected_value assert expected_repr in repr(ts) def test_nanosecond_timestamp(self): # GH 7610 expected = 1293840000000000005 - t = Timestamp('2011-01-01') + offsets.Nano(5) + t = Timestamp("2011-01-01") + offsets.Nano(5) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected assert t.nanosecond == 5 @@ -834,7 +924,7 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 5 - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000005Z')) + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000005Z")) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected assert t.nanosecond == 5 @@ -850,39 +940,38 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 10 - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000010Z')) + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000010Z")) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" assert t.value == expected assert t.nanosecond == 10 class TestTimestampToJulianDate: - def test_compare_1700(self): - r = Timestamp('1700-06-23').to_julian_date() + r = Timestamp("1700-06-23").to_julian_date() assert r == 2342145.5 def test_compare_2000(self): - r = Timestamp('2000-04-12').to_julian_date() + r = Timestamp("2000-04-12").to_julian_date() assert r == 2451646.5 def test_compare_2100(self): - r = Timestamp('2100-08-12').to_julian_date() + r = Timestamp("2100-08-12").to_julian_date() assert r == 2488292.5 def test_compare_hour01(self): - r = Timestamp('2000-08-12T01:00:00').to_julian_date() + r = Timestamp("2000-08-12T01:00:00").to_julian_date() assert r == 2451768.5416666666666666 def test_compare_hour13(self): - r = 
Timestamp('2000-08-12T13:00:00').to_julian_date() + r = Timestamp("2000-08-12T13:00:00").to_julian_date() assert r == 2451769.0416666666666666 class TestTimestampConversion: def test_conversion(self): # GH#9255 - ts = Timestamp('2000-01-01') + ts = Timestamp("2000-01-01") result = ts.to_pydatetime() expected = datetime(2000, 1, 1) @@ -890,42 +979,41 @@ def test_conversion(self): assert type(result) == type(expected) result = ts.to_datetime64() - expected = np.datetime64(ts.value, 'ns') + expected = np.datetime64(ts.value, "ns") assert result == expected assert type(result) == type(expected) assert result.dtype == expected.dtype def test_to_pydatetime_nonzero_nano(self): - ts = Timestamp('2011-01-01 9:00:00.123456789') + ts = Timestamp("2011-01-01 9:00:00.123456789") # Warn the user of data loss (nanoseconds). - with tm.assert_produces_warning(UserWarning, - check_stacklevel=False): + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): expected = datetime(2011, 1, 1, 9, 0, 0, 123456) result = ts.to_pydatetime() assert result == expected def test_timestamp_to_datetime(self): - stamp = Timestamp('20090415', tz='US/Eastern', freq='D') + stamp = Timestamp("20090415", tz="US/Eastern", freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_datetime_dateutil(self): - stamp = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D') + stamp = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_datetime_explicit_pytz(self): - stamp = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D') + stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo @td.skip_if_windows_python_3 def test_timestamp_to_datetime_explicit_dateutil(self): - stamp = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') + stamp = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo @@ -935,21 +1023,25 @@ def test_to_datetime_bijective(self): # by going from nanoseconds to microseconds. 
exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == - Timestamp.max.value / 1000) + assert ( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000 + == Timestamp.max.value / 1000 + ) exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == - Timestamp.min.value / 1000) + assert ( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000 + == Timestamp.min.value / 1000 + ) def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost - ts = Timestamp('2009-04-15 16:17:18', tz='US/Eastern') + ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - ts.to_period('D') + ts.to_period("D") def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 914423fcf5ba77..f64cf97acf8054 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -24,61 +24,63 @@ class TestTimestampTZOperations: def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK - pac = Timestamp.min.tz_localize('US/Pacific') + pac = Timestamp.min.tz_localize("US/Pacific") assert pac.value > Timestamp.min.value - pac.tz_convert('Asia/Tokyo') # tz_convert doesn't change value + pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime): - Timestamp.min.tz_localize('Asia/Tokyo') + Timestamp.min.tz_localize("Asia/Tokyo") # tz_localize that pushes away from the boundary is OK - tokyo = Timestamp.max.tz_localize('Asia/Tokyo') + tokyo = Timestamp.max.tz_localize("Asia/Tokyo") assert tokyo.value < Timestamp.max.value - tokyo.tz_convert('US/Pacific') # tz_convert doesn't change value + tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime): - Timestamp.max.tz_localize('US/Pacific') + Timestamp.max.tz_localize("US/Pacific") def test_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 - ts = Timestamp('2015-11-01 01:00:03') - expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") with pytest.raises(pytz.AmbiguousTimeError): - ts.tz_localize('US/Central') + ts.tz_localize("US/Central") - result = ts.tz_localize('US/Central', ambiguous=True) + result = ts.tz_localize("US/Central", ambiguous=True) assert result == expected0 - result = ts.tz_localize('US/Central', ambiguous=False) + result = ts.tz_localize("US/Central", ambiguous=False) assert result == expected1 def test_tz_localize_ambiguous(self): - ts = Timestamp('2014-11-02 01:00') - ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) - ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) + ts = Timestamp("2014-11-02 01:00") + ts_dst = ts.tz_localize("US/Eastern", ambiguous=True) + ts_no_dst = 
ts.tz_localize("US/Eastern", ambiguous=False) assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 with pytest.raises(ValueError): - ts.tz_localize('US/Eastern', ambiguous='infer') + ts.tz_localize("US/Eastern", ambiguous="infer") # GH#8025 - msg = ('Cannot localize tz-aware Timestamp, ' - 'use tz_convert for conversions') + msg = "Cannot localize tz-aware Timestamp, " "use tz_convert for conversions" with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') + Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") - msg = ('Cannot convert tz-naive Timestamp, ' - 'use tz_localize to localize') + msg = "Cannot convert tz-naive Timestamp, " "use tz_localize to localize" with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01').tz_convert('Asia/Tokyo') - - @pytest.mark.parametrize('stamp, tz', [ - ('2015-03-08 02:00', 'US/Eastern'), - ('2015-03-08 02:30', 'US/Pacific'), - ('2015-03-29 02:00', 'Europe/Paris'), - ('2015-03-29 02:30', 'Europe/Belgrade')]) - @pytest.mark.filterwarnings('ignore::FutureWarning') + Timestamp("2011-01-01").tz_convert("Asia/Tokyo") + + @pytest.mark.parametrize( + "stamp, tz", + [ + ("2015-03-08 02:00", "US/Eastern"), + ("2015-03-08 02:30", "US/Pacific"), + ("2015-03-29 02:00", "Europe/Paris"), + ("2015-03-29 02:30", "Europe/Belgrade"), + ], + ) + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) @@ -87,38 +89,45 @@ def test_tz_localize_nonexistent(self, stamp, tz): # GH 22644 with pytest.raises(NonExistentTimeError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors='raise') + ts.tz_localize(tz, errors="raise") with tm.assert_produces_warning(FutureWarning): - assert ts.tz_localize(tz, errors='coerce') is NaT + assert ts.tz_localize(tz, errors="coerce") is NaT def test_tz_localize_errors_ambiguous(self): # GH#13057 - ts = Timestamp('2015-11-1 01:00') + ts = Timestamp("2015-11-1 01:00") with pytest.raises(AmbiguousTimeError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize('US/Pacific', errors='coerce') + ts.tz_localize("US/Pacific", errors="coerce") - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_errors_invalid_arg(self): # GH 22644 - tz = 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:00:00') + tz = "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:00:00") with pytest.raises(ValueError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors='foo') + ts.tz_localize(tz, errors="foo") def test_tz_localize_errors_coerce(self): # GH 22644 # make sure errors='coerce' gets mapped correctly to nonexistent - tz = 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:00:00') + tz = "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:00:00") with tm.assert_produces_warning(FutureWarning): - result = ts.tz_localize(tz, errors='coerce') - expected = ts.tz_localize(tz, nonexistent='NaT') + result = ts.tz_localize(tz, errors="coerce") + expected = ts.tz_localize(tz, nonexistent="NaT") assert result is expected - @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']) + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): tz = tz_aware_fixture ts = Timestamp(stamp) @@ -135,10 +144,10 
@@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens - naive = Timestamp('2013-10-27 01:00:00') + naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = 'Europe/London' - dateutil_zone = 'dateutil/Europe/London' + pytz_zone = "Europe/London" + dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=0) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0) assert result_pytz.value == result_dateutil.value @@ -146,8 +155,8 @@ def test_tz_localize_ambiguous_compat(self): # fixed ambiguous behavior # see gh-14621 - assert result_pytz.to_pydatetime().tzname() == 'GMT' - assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert result_pytz.to_pydatetime().tzname() == "GMT" + assert result_dateutil.to_pydatetime().tzname() == "BST" assert str(result_pytz) != str(result_dateutil) # 1 hour difference @@ -158,99 +167,138 @@ def test_tz_localize_ambiguous_compat(self): # see gh-14621 assert str(result_pytz) == str(result_dateutil) - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + assert ( + result_pytz.to_pydatetime().tzname() + == result_dateutil.to_pydatetime().tzname() + ) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_tz_localize(self, tz): - stamp = Timestamp('3/11/2012 04:00') + stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) - expected = Timestamp('3/11/2012 04:00', tz=tz) + expected = Timestamp("3/11/2012 04:00", tz=tz) assert result.hour == expected.hour assert result == expected - @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [ - ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00', - 'forward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:59:59.999999999', 'backward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 03:20:00', timedelta(hours=1)], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:20:00', timedelta(hours=-1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00', - 'forward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999', - 'backward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00', - timedelta(hours=1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00', - timedelta(hours=-1)] - ]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_timestamp_tz_localize_nonexistent_shift(self, start_ts, tz, - end_ts, shift, - tz_type): + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 
02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): # GH 8917, 24466 tz = tz_type + tz if isinstance(shift, str): - shift = 'shift_' + shift + shift = "shift_" + shift ts = Timestamp(start_ts) result = ts.tz_localize(tz, nonexistent=shift) expected = Timestamp(end_ts).tz_localize(tz) assert result == expected - @pytest.mark.parametrize('offset', [-1, 1]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, - tz_type): + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): # GH 8917, 24466 - tz = tz_type + 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:20:00') + tz = tz_type + "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:20:00") msg = "The provided timedelta will relocalize on a nonexistent time" with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) def test_timestamp_tz_localize_nonexistent_NaT(self, tz): # GH 8917 - ts = Timestamp('2015-03-29 02:20:00') - result = ts.tz_localize(tz, nonexistent='NaT') + ts = Timestamp("2015-03-29 02:20:00") + result = ts.tz_localize(tz, nonexistent="NaT") assert result is NaT - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) def test_timestamp_tz_localize_nonexistent_raise(self, tz): # GH 8917 - ts = Timestamp('2015-03-29 02:20:00') + ts = Timestamp("2015-03-29 02:20:00") with pytest.raises(pytz.NonExistentTimeError): - ts.tz_localize(tz, nonexistent='raise') + ts.tz_localize(tz, nonexistent="raise") with pytest.raises(ValueError): - ts.tz_localize(tz, nonexistent='foo') + ts.tz_localize(tz, nonexistent="foo") # ------------------------------------------------------------------ # Timestamp.tz_convert - @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']) + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture): tz = tz_aware_fixture - ts = Timestamp(stamp, tz='UTC') + ts = Timestamp(stamp, tz="UTC") converted = ts.tz_convert(tz) reset = converted.tz_convert(None) assert reset == Timestamp(stamp) assert reset.tzinfo is None - assert reset == converted.tz_convert('UTC').tz_localize(None) + assert reset == converted.tz_convert("UTC").tz_localize(None) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_astimezone(self, tzstr): # astimezone is an alias for tz_convert, so keep it with # the tz_convert tests - utcdate = Timestamp('3/11/2012 22:00', tz='UTC') + utcdate = Timestamp("3/11/2012 22:00", tz="UTC") expected = utcdate.tz_convert(tzstr) result = utcdate.astimezone(tzstr) assert expected == result @@ -261,12 +309,12 @@ def test_tz_convert_utc_with_system_utc(self): from pandas._libs.tslibs.timezones import maybe_get_tz # from system utc to 
real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) @@ -274,16 +322,16 @@ def test_tz_convert_utc_with_system_utc(self): # Timestamp.__init__ with tz str or tzinfo def test_timestamp_constructor_tz_utc(self): - utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + utc_stamp = Timestamp("3/11/2012 05:00", tz="utc") assert utc_stamp.tzinfo is pytz.utc assert utc_stamp.hour == 5 - utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + utc_stamp = Timestamp("3/11/2012 05:00").tz_localize("utc") assert utc_stamp.hour == 5 def test_timestamp_to_datetime_tzoffset(self): tzinfo = tzoffset(None, 7200) - expected = Timestamp('3/11/2012 04:00', tz=tzinfo) + expected = Timestamp("3/11/2012 04:00", tz=tzinfo) result = Timestamp(expected.to_pydatetime()) assert expected == result @@ -292,85 +340,95 @@ def test_timestamp_constructor_near_dst_boundary(self): # Naive string timestamps were being localized incorrectly # with tz_convert_single instead of tz_localize_to_utc - for tz in ['Europe/Brussels', 'Europe/Prague']: - result = Timestamp('2015-10-25 01:00', tz=tz) - expected = Timestamp('2015-10-25 01:00').tz_localize(tz) + for tz in ["Europe/Brussels", "Europe/Prague"]: + result = Timestamp("2015-10-25 01:00", tz=tz) + expected = Timestamp("2015-10-25 01:00").tz_localize(tz) assert result == expected with pytest.raises(pytz.AmbiguousTimeError): - Timestamp('2015-10-25 02:00', tz=tz) + Timestamp("2015-10-25 02:00", tz=tz) - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") assert result == expected with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') + Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 - naive = Timestamp('2015-11-18 10:00:00') - result = naive.tz_localize('UTC').tz_convert('Asia/Kolkata') - expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') + naive = Timestamp("2015-11-18 10:00:00") + result = naive.tz_localize("UTC").tz_convert("Asia/Kolkata") + expected = Timestamp("2015-11-18 15:30:00+0530", tz="Asia/Kolkata") assert result == expected # GH#15823 - result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 00:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 00:00:00+0100", tz="Europe/Paris") assert result == expected - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") assert result == expected with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') + Timestamp("2017-03-26 02:00", tz="Europe/Paris") - result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 02:00:00+0100", 
tz="Europe/Paris") naive = Timestamp(result.value) - expected = naive.tz_localize('UTC').tz_convert('Europe/Paris') + expected = naive.tz_localize("UTC").tz_convert("Europe/Paris") assert result == expected - result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') + result = Timestamp("2017-03-26 03:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 03:00:00+0200", tz="Europe/Paris") assert result == expected - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly result = Timestamp(date(2012, 3, 11), tz=tz) - expected = Timestamp('3/11/2012', tz=tz) + expected = Timestamp("3/11/2012", tz=tz) assert result.hour == expected.hour assert result == expected - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=tz) + stamp = Timestamp("3/10/2012 22:00", tz=tz) result = stamp + timedelta(hours=6) # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=tz) + expected = Timestamp("3/11/2012 05:00", tz=tz) assert result == expected - def test_timestamp_timetz_equivalent_with_datetime_tz(self, - tz_naive_fixture): + def test_timestamp_timetz_equivalent_with_datetime_tz(self, tz_naive_fixture): # GH21358 tz = timezones.maybe_get_tz(tz_naive_fixture) - stamp = Timestamp('2018-06-04 10:20:30', tz=tz) - _datetime = datetime(2018, 6, 4, hour=10, - minute=20, second=30, tzinfo=tz) + stamp = Timestamp("2018-06-04 10:20:30", tz=tz) + _datetime = datetime(2018, 6, 4, hour=10, minute=20, second=30, tzinfo=tz) result = stamp.timetz() expected = _datetime.timetz() diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 8b13458050ce8d..dffb957b8f3b03 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -20,15 +20,18 @@ class TestTimestampUnaryOps: # -------------------------------------------------------------- # Timestamp.round - @pytest.mark.parametrize('timestamp, freq, expected', [ - ('20130101 09:10:11', 'D', '20130101'), - ('20130101 19:10:11', 'D', '20130102'), - ('20130201 12:00:00', 'D', '20130202'), - ('20130104 12:00:00', 'D', '20130105'), - ('2000-01-05 05:09:15.13', 'D', '2000-01-05 00:00:00'), - ('2000-01-05 05:09:15.13', 'H', '2000-01-05 05:00:00'), - ('2000-01-05 05:09:15.13', 'S', '2000-01-05 05:09:15') - ]) + @pytest.mark.parametrize( + "timestamp, freq, expected", + [ + ("20130101 09:10:11", "D", "20130101"), + ("20130101 19:10:11", "D", "20130102"), + ("20130201 12:00:00", "D", "20130202"), + ("20130104 12:00:00", "D", "20130105"), + ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), + ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "S", "2000-01-05 05:09:15"), + ], + ) def test_round_frequencies(self, timestamp, freq, 
expected): dt = Timestamp(timestamp) result = dt.round(freq) @@ -36,61 +39,72 @@ def test_round_frequencies(self, timestamp, freq, expected): assert result == expected def test_round_tzaware(self): - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('D') - expected = Timestamp('20130101', tz='US/Eastern') + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("D") + expected = Timestamp("20130101", tz="US/Eastern") assert result == expected - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('s') + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("s") assert result == dt def test_round_30min(self): # round - dt = Timestamp('20130104 12:32:00') - result = dt.round('30Min') - expected = Timestamp('20130104 12:30:00') + dt = Timestamp("20130104 12:32:00") + result = dt.round("30Min") + expected = Timestamp("20130104 12:30:00") assert result == expected def test_round_subsecond(self): # GH#14440 & GH#15578 - result = Timestamp('2016-10-17 12:00:00.0015').round('ms') - expected = Timestamp('2016-10-17 12:00:00.002000') + result = Timestamp("2016-10-17 12:00:00.0015").round("ms") + expected = Timestamp("2016-10-17 12:00:00.002000") assert result == expected - result = Timestamp('2016-10-17 12:00:00.00149').round('ms') - expected = Timestamp('2016-10-17 12:00:00.001000') + result = Timestamp("2016-10-17 12:00:00.00149").round("ms") + expected = Timestamp("2016-10-17 12:00:00.001000") assert result == expected - ts = Timestamp('2016-10-17 12:00:00.0015') - for freq in ['us', 'ns']: + ts = Timestamp("2016-10-17 12:00:00.0015") + for freq in ["us", "ns"]: assert ts == ts.round(freq) - result = Timestamp('2016-10-17 12:00:00.001501031').round('10ns') - expected = Timestamp('2016-10-17 12:00:00.001501030') + result = Timestamp("2016-10-17 12:00:00.001501031").round("10ns") + expected = Timestamp("2016-10-17 12:00:00.001501030") assert result == expected def test_round_nonstandard_freq(self): with tm.assert_produces_warning(False): - Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') + Timestamp("2016-10-17 12:00:00.001501031").round("1010ns") def test_round_invalid_arg(self): - stamp = Timestamp('2000-01-05 05:09:15.13') + stamp = Timestamp("2000-01-05 05:09:15.13") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - stamp.round('foo') - - @pytest.mark.parametrize('test_input, rounder, freq, expected', [ - ('2117-01-01 00:00:45', 'floor', '15s', '2117-01-01 00:00:45'), - ('2117-01-01 00:00:45', 'ceil', '15s', '2117-01-01 00:00:45'), - ('2117-01-01 00:00:45.000000012', 'floor', '10ns', - '2117-01-01 00:00:45.000000010'), - ('1823-01-01 00:00:01.000000012', 'ceil', '10ns', - '1823-01-01 00:00:01.000000020'), - ('1823-01-01 00:00:01', 'floor', '1s', '1823-01-01 00:00:01'), - ('1823-01-01 00:00:01', 'ceil', '1s', '1823-01-01 00:00:01'), - ('NaT', 'floor', '1s', 'NaT'), - ('NaT', 'ceil', '1s', 'NaT') - ]) + stamp.round("foo") + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + ("2117-01-01 00:00:45", "floor", "15s", "2117-01-01 00:00:45"), + ("2117-01-01 00:00:45", "ceil", "15s", "2117-01-01 00:00:45"), + ( + "2117-01-01 00:00:45.000000012", + "floor", + "10ns", + "2117-01-01 00:00:45.000000010", + ), + ( + "1823-01-01 00:00:01.000000012", + "ceil", + "10ns", + "1823-01-01 00:00:01.000000020", + ), + ("1823-01-01 00:00:01", "floor", "1s", "1823-01-01 00:00:01"), + ("1823-01-01 00:00:01", "ceil", "1s", "1823-01-01 00:00:01"), + ("NaT", "floor", "1s", "NaT"), + ("NaT", "ceil", 
"1s", "NaT"), + ], + ) def test_ceil_floor_edge(self, test_input, rounder, freq, expected): dt = Timestamp(test_input) func = getattr(dt, rounder) @@ -102,15 +116,18 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = Timestamp(expected) assert result == expected - @pytest.mark.parametrize('test_input, freq, expected', [ - ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), - ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), - ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), - ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'), - ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), - ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), - ]) - @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) + @pytest.mark.parametrize( + "test_input, freq, expected", + [ + ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), + ("2018-01-01 00:02:00", "2T", "2018-01-01 00:02:00"), + ("2018-01-01 00:04:00", "4T", "2018-01-01 00:04:00"), + ("2018-01-01 00:15:00", "15T", "2018-01-01 00:15:00"), + ("2018-01-01 00:20:00", "20T", "2018-01-01 00:20:00"), + ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), + ], + ) + @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) def test_round_minute_freq(self, test_input, freq, expected, rounder): # Ensure timestamps that shouldn't round dont! # GH#21262 @@ -122,72 +139,92 @@ def test_round_minute_freq(self, test_input, freq, expected, rounder): assert result == expected def test_ceil(self): - dt = Timestamp('20130101 09:10:11') - result = dt.ceil('D') - expected = Timestamp('20130102') + dt = Timestamp("20130101 09:10:11") + result = dt.ceil("D") + expected = Timestamp("20130102") assert result == expected def test_floor(self): - dt = Timestamp('20130101 09:10:11') - result = dt.floor('D') - expected = Timestamp('20130101') + dt = Timestamp("20130101 09:10:11") + result = dt.floor("D") + expected = Timestamp("20130101") assert result == expected - @pytest.mark.parametrize('method', ['ceil', 'round', 'floor']) + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) def test_round_dst_border_ambiguous(self, method): # GH 18946 round near "fall back" DST - ts = Timestamp('2017-10-29 00:00:00', tz='UTC').tz_convert( - 'Europe/Madrid' - ) + ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") # - result = getattr(ts, method)('H', ambiguous=True) + result = getattr(ts, method)("H", ambiguous=True) assert result == ts - result = getattr(ts, method)('H', ambiguous=False) - expected = Timestamp('2017-10-29 01:00:00', tz='UTC').tz_convert( - 'Europe/Madrid' + result = getattr(ts, method)("H", ambiguous=False) + expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( + "Europe/Madrid" ) assert result == expected - result = getattr(ts, method)('H', ambiguous='NaT') + result = getattr(ts, method)("H", ambiguous="NaT") assert result is NaT with pytest.raises(pytz.AmbiguousTimeError): - getattr(ts, method)('H', ambiguous='raise') - - @pytest.mark.parametrize('method, ts_str, freq', [ - ['ceil', '2018-03-11 01:59:00-0600', '5min'], - ['round', '2018-03-11 01:59:00-0600', '5min'], - ['floor', '2018-03-11 03:01:00-0500', '2H']]) + getattr(ts, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) def test_round_dst_border_nonexistent(self, method, ts_str, freq): # GH 23324 round near 
"spring forward" DST - ts = Timestamp(ts_str, tz='America/Chicago') - result = getattr(ts, method)(freq, nonexistent='shift_forward') - expected = Timestamp('2018-03-11 03:00:00', tz='America/Chicago') + ts = Timestamp(ts_str, tz="America/Chicago") + result = getattr(ts, method)(freq, nonexistent="shift_forward") + expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") assert result == expected - result = getattr(ts, method)(freq, nonexistent='NaT') + result = getattr(ts, method)(freq, nonexistent="NaT") assert result is NaT - with pytest.raises(pytz.NonExistentTimeError, - match='2018-03-11 02:00:00'): - getattr(ts, method)(freq, nonexistent='raise') - - @pytest.mark.parametrize('timestamp', [ - '2018-01-01 0:0:0.124999360', - '2018-01-01 0:0:0.125000367', - '2018-01-01 0:0:0.125500', - '2018-01-01 0:0:0.126500', - '2018-01-01 12:00:00', - '2019-01-01 12:00:00', - ]) - @pytest.mark.parametrize('freq', [ - '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', - '250ns', '500ns', '750ns', - '1us', '19us', '250us', '500us', '750us', - '1s', '2s', '3s', - '1D', - ]) + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(ts, method)(freq, nonexistent="raise") + + @pytest.mark.parametrize( + "timestamp", + [ + "2018-01-01 0:0:0.124999360", + "2018-01-01 0:0:0.125000367", + "2018-01-01 0:0:0.125500", + "2018-01-01 0:0:0.126500", + "2018-01-01 12:00:00", + "2019-01-01 12:00:00", + ], + ) + @pytest.mark.parametrize( + "freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "1D", + ], + ) def test_round_int64(self, timestamp, freq): """check that all rounding modes are accurate to int64 precision see GH#22591 @@ -218,26 +255,26 @@ def test_round_int64(self, timestamp, freq): def test_replace_naive(self): # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00') + ts = Timestamp("2016-01-01 09:00:00") result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00') + expected = Timestamp("2016-01-01 00:00:00") assert result == expected def test_replace_aware(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 # replacing datetime components with and w/o presence of a timezone - ts = Timestamp('2016-01-01 09:00:00', tz=tz) + ts = Timestamp("2016-01-01 09:00:00", tz=tz) result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00', tz=tz) + expected = Timestamp("2016-01-01 00:00:00", tz=tz) assert result == expected def test_replace_preserves_nanos(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz) + expected = Timestamp("2016-01-01 00:00:00.000000123", tz=tz) assert result == expected def test_replace_multiple(self, tz_aware_fixture): @@ -245,43 +282,51 @@ def test_replace_multiple(self, tz_aware_fixture): # GH#14621, GH#7825 # replacing datetime components with and w/o presence of a timezone # test all - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) - result = ts.replace(year=2015, month=2, day=2, hour=0, minute=5, - second=5, microsecond=5, nanosecond=5) - expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace( + year=2015, + month=2, + day=2, + hour=0, + minute=5, + second=5, + microsecond=5, + 
nanosecond=5, + ) + expected = Timestamp("2015-02-02 00:05:05.000005005", tz=tz) assert result == expected def test_replace_invalid_kwarg(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) with pytest.raises(TypeError): ts.replace(foo=5) def test_replace_integer_args(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) with pytest.raises(ValueError): ts.replace(hour=0.1) def test_replace_tzinfo_equiv_tz_localize_none(self): # GH#14621, GH#7825 # assert conversion to naive is the same as replacing tzinfo with None - ts = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern') + ts = Timestamp("2013-11-03 01:59:59.999999-0400", tz="US/Eastern") assert ts.tz_localize(None) == ts.replace(tzinfo=None) @td.skip_if_windows def test_replace_tzinfo(self): # GH#15683 dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): assert result_dt.timestamp() == result_pd.timestamp() assert result_dt == result_pd @@ -291,19 +336,23 @@ def test_replace_tzinfo(self): result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): assert result_dt.timestamp() == result_pd.timestamp() assert result_dt == result_pd assert result_dt == result_pd.to_pydatetime() - @pytest.mark.parametrize('tz, normalize', [ - (pytz.timezone('US/Eastern'), lambda x: x.tzinfo.normalize(x)), - (gettz('US/Eastern'), lambda x: x)]) + @pytest.mark.parametrize( + "tz, normalize", + [ + (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + (gettz("US/Eastern"), lambda x: x), + ], + ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization - ts_naive = Timestamp('2017-12-03 16:03:30') + ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) # Preliminary sanity-check @@ -321,14 +370,14 @@ def test_replace_across_dst(self, tz, normalize): def test_replace_dst_border(self): # Gh 7825 - t = Timestamp('2013-11-3', tz='America/Chicago') + t = Timestamp("2013-11-3", tz="America/Chicago") result = t.replace(hour=3) - expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago') + expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected - @pytest.mark.skipif(not PY36, reason='Fold not available until PY3.6') - @pytest.mark.parametrize('fold', [0, 1]) - @pytest.mark.parametrize('tz', ['dateutil/Europe/London', 'Europe/London']) + @pytest.mark.skipif(not PY36, reason="Fold not available until PY3.6") + @pytest.mark.parametrize("fold", [0, 1]) + @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) def test_replace_dst_fold(self, fold, tz): # GH 25017 d = datetime(2019, 10, 27, 2, 30) @@ -342,12 +391,12 @@ def test_replace_dst_fold(self, fold, tz): # -------------------------------------------------------------- # Timestamp.normalize - 
@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00']) + @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) def test_normalize(self, tz_naive_fixture, arg): tz = tz_naive_fixture ts = Timestamp(arg, tz=tz) result = ts.normalize() - expected = Timestamp('2013-11-30', tz=tz) + expected = Timestamp("2013-11-30", tz=tz) assert result == expected # -------------------------------------------------------------- @@ -360,14 +409,14 @@ def test_timestamp(self): uts = ts.replace(tzinfo=utc) assert ts.timestamp() == uts.timestamp() - tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') - utsc = tsc.tz_convert('UTC') + tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") + utsc = tsc.tz_convert("UTC") # utsc is a different representation of the same time assert tsc.timestamp() == utsc.timestamp() # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): # should agree with datetime.timestamp method dt = ts.to_pydatetime() assert dt.timestamp() == ts.timestamp() diff --git a/pandas/tests/series/common.py b/pandas/tests/series/common.py index 220bf20c81dc3c..38c62e89f18734 100644 --- a/pandas/tests/series/common.py +++ b/pandas/tests/series/common.py @@ -7,23 +7,22 @@ class TestData: - @cache_readonly def ts(self): ts = _ts.copy() - ts.name = 'ts' + ts.name = "ts" return ts @cache_readonly def series(self): series = tm.makeStringSeries() - series.name = 'series' + series.name = "series" return series @cache_readonly def objSeries(self): objSeries = tm.makeObjectSeries() - objSeries.name = 'objects' + objSeries.name = "objects" return objSeries @cache_readonly diff --git a/pandas/tests/series/conftest.py b/pandas/tests/series/conftest.py index 367e7a1baa7f31..f5b401398d6d65 100644 --- a/pandas/tests/series/conftest.py +++ b/pandas/tests/series/conftest.py @@ -9,7 +9,7 @@ def datetime_series(): Fixture for Series of floats with DatetimeIndex """ s = tm.makeTimeSeries() - s.name = 'ts' + s.name = "ts" return s @@ -19,7 +19,7 @@ def string_series(): Fixture for Series of floats with Index of unique strings """ s = tm.makeStringSeries() - s.name = 'series' + s.name = "series" return s @@ -29,5 +29,5 @@ def object_series(): Fixture for Series of dtype datetime64[ns] with Index of unique strings """ s = tm.makeObjectSeries() - s.name = 'objects' + s.name = "objects" return s diff --git a/pandas/tests/series/indexing/conftest.py b/pandas/tests/series/indexing/conftest.py index 0e06f6b8e4640a..9c7103c196d604 100644 --- a/pandas/tests/series/indexing/conftest.py +++ b/pandas/tests/series/indexing/conftest.py @@ -3,6 +3,6 @@ from pandas.tests.series.common import TestData -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def test_data(): return TestData() diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 2c1eb11d9b5150..31a1f43470f2c7 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -11,13 +11,15 @@ @pytest.mark.parametrize( - 'first_slice,second_slice', [ + "first_slice,second_slice", + [ [[2, None], [None, -5]], [[None, 0], [None, -5]], [[None, -5], [None, 0]], - [[None, 0], [None, 0]] - ]) -@pytest.mark.parametrize('fill', [None, -1]) + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) def test_align(test_data, first_slice, second_slice, join_type, fill): a = test_data.ts[slice(*first_slice)] b = 
test_data.ts[slice(*second_slice)] @@ -42,24 +44,26 @@ def test_align(test_data, first_slice, second_slice, join_type, fill): assert_series_equal(aa, ea) assert_series_equal(ab, eb) - assert aa.name == 'ts' - assert ea.name == 'ts' - assert ab.name == 'ts' - assert eb.name == 'ts' + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" @pytest.mark.parametrize( - 'first_slice,second_slice', [ + "first_slice,second_slice", + [ [[2, None], [None, -5]], [[None, 0], [None, -5]], [[None, -5], [None, 0]], - [[None, 0], [None, 0]] - ]) -@pytest.mark.parametrize('method', ['pad', 'bfill']) -@pytest.mark.parametrize('limit', [None, 1]) -def test_align_fill_method(test_data, - first_slice, second_slice, - join_type, method, limit): + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_align_fill_method( + test_data, first_slice, second_slice, join_type, method, limit +): a = test_data.ts[slice(*first_slice)] b = test_data.ts[slice(*second_slice)] @@ -81,27 +85,27 @@ def test_align_nocopy(test_data): # do copy a = test_data.ts.copy() - ra, _ = a.align(b, join='left') + ra, _ = a.align(b, join="left") ra[:5] = 5 assert not (a[:5] == 5).any() # do not copy a = test_data.ts.copy() - ra, _ = a.align(b, join='left', copy=False) + ra, _ = a.align(b, join="left", copy=False) ra[:5] = 5 assert (a[:5] == 5).all() # do copy a = test_data.ts.copy() b = test_data.ts[:5].copy() - _, rb = a.align(b, join='right') + _, rb = a.align(b, join="right") rb[:3] = 5 assert not (b[:3] == 5).any() # do not copy a = test_data.ts.copy() b = test_data.ts[:5].copy() - _, rb = a.align(b, join='right', copy=False) + _, rb = a.align(b, join="right", copy=False) rb[:2] = 5 assert (b[:2] == 5).all() @@ -119,15 +123,16 @@ def test_align_same_index(test_data): def test_align_multiindex(): # GH 10665 - midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], - names=('a', 'b', 'c')) - idx = pd.Index(range(2), name='b') - s1 = pd.Series(np.arange(12, dtype='int64'), index=midx) - s2 = pd.Series(np.arange(2, dtype='int64'), index=idx) + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) + s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) - res1l, res1r = s1.align(s2, join='left') - res2l, res2r = s2.align(s1, join='right') + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") expl = s1 tm.assert_series_equal(expl, res1l) @@ -136,11 +141,12 @@ def test_align_multiindex(): tm.assert_series_equal(expr, res1r) tm.assert_series_equal(expr, res2l) - res1l, res1r = s1.align(s2, join='right') - res2l, res2r = s2.align(s1, join='left') + res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") - exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], - names=('a', 'b', 'c')) + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) tm.assert_series_equal(expl, res1l) tm.assert_series_equal(expl, res2r) @@ -194,18 +200,18 @@ def test_reindex_nan(): i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2] assert_series_equal(ts.reindex(i), ts.iloc[j]) - ts.index = ts.index.astype('object') + ts.index = ts.index.astype("object") # reindex coerces 
index.dtype to float, loc/iloc doesn't assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) def test_reindex_series_add_nat(): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") series = Series(rng) result = series.reindex(range(15)) - assert np.issubdtype(result.dtype, np.dtype('M8[ns]')) + assert np.issubdtype(result.dtype, np.dtype("M8[ns]")) mask = result.isna() assert mask[-5:].all() @@ -213,7 +219,7 @@ def test_reindex_series_add_nat(): def test_reindex_with_datetimes(): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) result = ts.reindex(list(ts.index[5:10])) @@ -226,10 +232,10 @@ def test_reindex_with_datetimes(): def test_reindex_corner(test_data): # (don't forget to fix this) I think it's fixed - test_data.empty.reindex(test_data.ts.index, method='pad') # it works + test_data.empty.reindex(test_data.ts.index, method="pad") # it works # corner case: pad empty series - reindexed = test_data.empty.reindex(test_data.ts.index, method='pad') + reindexed = test_data.empty.reindex(test_data.ts.index, method="pad") # pass non-Index reindexed = test_data.ts.reindex(list(test_data.ts.index)) @@ -237,75 +243,75 @@ def test_reindex_corner(test_data): # bad fill method ts = test_data.ts[::2] - msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got foo") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. Got foo" + ) with pytest.raises(ValueError, match=msg): - ts.reindex(test_data.ts.index, method='foo') + ts.reindex(test_data.ts.index, method="foo") def test_reindex_pad(): - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") s2 = s[::2] - reindexed = s2.reindex(s.index, method='pad') - reindexed2 = s2.reindex(s.index, method='ffill') + reindexed = s2.reindex(s.index, method="pad") + reindexed2 = s2.reindex(s.index, method="ffill") assert_series_equal(reindexed, reindexed2) expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) assert_series_equal(reindexed, expected) # GH4604 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) - new_index = ['a', 'g', 'c', 'f'] + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + new_index = ["a", "g", "c", "f"] expected = Series([1, 1, 3, 3], index=new_index) # this changes dtype because the ffill happens after result = s.reindex(new_index).ffill() - assert_series_equal(result, expected.astype('float64')) + assert_series_equal(result, expected.astype("float64")) - result = s.reindex(new_index).ffill(downcast='infer') + result = s.reindex(new_index).ffill(downcast="infer") assert_series_equal(result, expected) expected = Series([1, 5, 3, 5], index=new_index) - result = s.reindex(new_index, method='ffill') + result = s.reindex(new_index, method="ffill") assert_series_equal(result, expected) # inference of new dtype - s = Series([True, False, False, True], index=list('abcd')) - new_index = 'agc' + s = Series([True, False, False, True], index=list("abcd")) + new_index = "agc" result = s.reindex(list(new_index)).ffill() expected = Series([True, True, False], index=list(new_index)) assert_series_equal(result, expected) # GH4618 shifted series downcasting s = Series(False, index=range(0, 5)) - result = s.shift(1).fillna(method='bfill') + result = s.shift(1).fillna(method="bfill") expected = Series(False, index=range(0, 5)) 
assert_series_equal(result, expected) def test_reindex_nearest(): - s = Series(np.arange(10, dtype='int64')) + s = Series(np.arange(10, dtype="int64")) target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method='nearest') - expected = Series(np.around(target).astype('int64'), target) + actual = s.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest') + actual = s.reindex_like(actual, method="nearest") assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest', tolerance=1) + actual = s.reindex_like(actual, method="nearest", tolerance=1) assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest', - tolerance=[1, 2, 3, 4]) + actual = s.reindex_like(actual, method="nearest", tolerance=[1, 2, 3, 4]) assert_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', tolerance=0.2) + actual = s.reindex(target, method="nearest", tolerance=0.2) expected = Series([0, 1, np.nan, 2], target) assert_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', - tolerance=[0.3, 0.01, 0.4, 3]) + actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = Series([0, np.nan, np.nan, 2], target) assert_series_equal(expected, actual) @@ -349,30 +355,29 @@ def test_reindex_bool_pad(test_data): # fail ts = test_data.ts[5:] bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) - filled_bool = bool_ts.reindex(test_data.ts.index, method='pad') + filled_bool = bool_ts.reindex(test_data.ts.index, method="pad") assert isna(filled_bool[:5]).all() def test_reindex_categorical(): - index = date_range('20000101', periods=3) + index = date_range("20000101", periods=3) # reindexing to an invalid Categorical - s = Series(['a', 'b', 'c'], dtype='category') + s = Series(["a", "b", "c"], dtype="category") result = s.reindex(index) - expected = Series(Categorical(values=[np.nan, np.nan, np.nan], - categories=['a', 'b', 'c'])) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) expected.index = index tm.assert_series_equal(result, expected) # partial reindexing - expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b', - 'c'])) + expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) expected.index = [1, 2] result = s.reindex([1, 2]) tm.assert_series_equal(result, expected) - expected = Series(Categorical( - values=['c', np.nan], categories=['a', 'b', 'c'])) + expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) expected.index = [2, 3] result = s.reindex([2, 3]) tm.assert_series_equal(result, expected) @@ -380,8 +385,9 @@ def test_reindex_categorical(): def test_reindex_like(test_data): other = test_data.ts[::2] - assert_series_equal(test_data.ts.reindex(other.index), - test_data.ts.reindex_like(other)) + assert_series_equal( + test_data.ts.reindex(other.index), test_data.ts.reindex_like(other) + ) # GH 7179 day1 = datetime(2013, 3, 5) @@ -391,7 +397,7 @@ def test_reindex_like(test_data): series1 = Series([5, None, None], [day1, day2, day3]) series2 = Series([None, None], [day1, day3]) - result = series1.reindex_like(series2, method='pad') + result = series1.reindex_like(series2, method="pad") expected = Series([5, np.nan], index=[day1, day3]) assert_series_equal(result, expected) @@ -399,13 +405,13 @@ def test_reindex_like(test_data): def 
test_reindex_fill_value(): # ----------------------------------------------------------- # floats - floats = Series([1., 2., 3.]) + floats = Series([1.0, 2.0, 3.0]) result = floats.reindex([1, 2, 3]) - expected = Series([2., 3., np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) assert_series_equal(result, expected) result = floats.reindex([1, 2, 3], fill_value=0) - expected = Series([2., 3., 0], index=[1, 2, 3]) + expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) assert_series_equal(result, expected) # ----------------------------------------------------------- @@ -413,7 +419,7 @@ def test_reindex_fill_value(): ints = Series([1, 2, 3]) result = ints.reindex([1, 2, 3]) - expected = Series([2., 3., np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) assert_series_equal(result, expected) # don't upcast @@ -430,8 +436,8 @@ def test_reindex_fill_value(): expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) assert_series_equal(result, expected) - result = objects.reindex([1, 2, 3], fill_value='foo') - expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) + result = objects.reindex([1, 2, 3], fill_value="foo") + expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) assert_series_equal(result, expected) # ------------------------------------------------------------ @@ -449,23 +455,23 @@ def test_reindex_fill_value(): def test_reindex_datetimeindexes_tz_naive_and_aware(): # GH 8306 - idx = date_range('20131101', tz='America/Chicago', periods=7) - newidx = date_range('20131103', periods=10, freq='H') + idx = date_range("20131101", tz="America/Chicago", periods=7) + newidx = date_range("20131103", periods=10, freq="H") s = Series(range(7), index=idx) with pytest.raises(TypeError): - s.reindex(newidx, method='ffill') + s.reindex(newidx, method="ffill") def test_reindex_empty_series_tz_dtype(): # GH 20869 - result = Series(dtype='datetime64[ns, UTC]').reindex([0, 1]) - expected = Series([pd.NaT] * 2, dtype='datetime64[ns, UTC]') + result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) + expected = Series([pd.NaT] * 2, dtype="datetime64[ns, UTC]") tm.assert_equal(result, expected) def test_rename(): # GH 17407 - s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) + s = Series(range(1, 6), index=pd.Index(range(2, 7), name="IntIndex")) result = s.rename(str) expected = s.rename(lambda i: str(i)) assert_series_equal(result, expected) @@ -474,26 +480,21 @@ def test_rename(): @pytest.mark.parametrize( - 'data, index, drop_labels,' - ' axis, expected_data, expected_index', + "data, index, drop_labels," " axis, expected_data, expected_index", [ # Unique Index - ([1, 2], ['one', 'two'], ['two'], - 0, [1], ['one']), - ([1, 2], ['one', 'two'], ['two'], - 'rows', [1], ['one']), - ([1, 1, 2], ['one', 'two', 'one'], ['two'], - 0, [1, 2], ['one', 'one']), - + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), # GH 5248 Non-Unique Index - ([1, 1, 2], ['one', 'two', 'one'], 'two', - 0, [1, 2], ['one', 'one']), - ([1, 1, 2], ['one', 'two', 'one'], ['one'], - 0, [1], ['two']), - ([1, 1, 2], ['one', 'two', 'one'], 'one', - 0, [1], ['two'])]) -def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, - expected_data, expected_index): + ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], 
["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): s = Series(data=data, index=index) result = s.drop(drop_labels, axis=axis) @@ -502,20 +503,16 @@ def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, @pytest.mark.parametrize( - 'data, index, drop_labels,' - ' axis, error_type, error_desc', + "data, index, drop_labels," " axis, error_type, error_desc", [ # single string/tuple-like - (range(3), list('abc'), 'bc', - 0, KeyError, 'not found in axis'), - + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), # bad axis - (range(3), list('abc'), ('a',), - 0, KeyError, 'not found in axis'), - (range(3), list('abc'), 'one', - 'columns', ValueError, 'No axis named columns')]) -def test_drop_exception_raised(data, index, drop_labels, - axis, error_type, error_desc): + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): with pytest.raises(error_type, match=error_desc): Series(data, index=index).drop(drop_labels, axis=axis) @@ -523,10 +520,10 @@ def test_drop_exception_raised(data, index, drop_labels, def test_drop_with_ignore_errors(): # errors='ignore' - s = Series(range(3), index=list('abc')) - result = s.drop('bc', errors='ignore') + s = Series(range(3), index=list("abc")) + result = s.drop("bc", errors="ignore") tm.assert_series_equal(result, s) - result = s.drop(['a', 'd'], errors='ignore') + result = s.drop(["a", "d"], errors="ignore") expected = s.iloc[1:] tm.assert_series_equal(result, expected) @@ -538,8 +535,8 @@ def test_drop_with_ignore_errors(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]]) -@pytest.mark.parametrize('drop_labels', [[], [1], [3]]) +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) def test_drop_empty_list(index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] @@ -547,12 +544,15 @@ def test_drop_empty_list(index, drop_labels): tm.assert_series_equal(series, pd.Series(index=expected_index)) -@pytest.mark.parametrize('data, index, drop_labels', [ - (None, [1, 2, 3], [1, 4]), - (None, [1, 2, 2], [1, 4]), - ([2, 3], [0, 1], [False, True]) -]) +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) def test_drop_non_empty_list(data, index, drop_labels): # GH 21494 and GH 16877 - with pytest.raises(KeyError, match='not found in axis'): + with pytest.raises(KeyError, match="not found in axis"): pd.Series(data=data, index=index).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index ef7312616250db..9b76ed026e580f 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -25,27 +25,29 @@ def test_getitem_boolean(test_data): def test_getitem_boolean_empty(): s = Series([], dtype=np.int64) - s.index.name = 'index_name' + s.index.name = "index_name" s = s[s.isna()] - assert s.index.name == 'index_name' + assert s.index.name == "index_name" assert s.dtype == np.int64 # GH5877 # indexing with empty series - s = Series(['A', 'B']) - expected = 
Series(np.nan, index=['C'], dtype=object) - result = s[Series(['C'], dtype=object)] + s = Series(["A", "B"]) + expected = Series(np.nan, index=["C"], dtype=object) + result = s[Series(["C"], dtype=object)] assert_series_equal(result, expected) - s = Series(['A', 'B']) - expected = Series(dtype=object, index=Index([], dtype='int64')) + s = Series(["A", "B"]) + expected = Series(dtype=object, index=Index([], dtype="int64")) result = s[Series([], dtype=object)] assert_series_equal(result, expected) # invalid because of the boolean indexer # that's empty or not-aligned - msg = (r"Unalignable boolean Series provided as indexer \(index of" - r" the boolean Series and of the indexed object do not match") + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) with pytest.raises(IndexingError, match=msg): s[Series([], dtype=bool)] @@ -87,8 +89,10 @@ def test_getitem_setitem_boolean_corner(test_data): # these used to raise...?? - msg = (r"Unalignable boolean Series provided as indexer \(index of" - r" the boolean Series and of the indexed object do not match") + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) with pytest.raises(IndexingError, match=msg): ts[mask_shifted] with pytest.raises(IndexingError, match=msg): @@ -156,14 +160,17 @@ def test_where_unsafe_float(float_dtype): assert_series_equal(s, expected) -@pytest.mark.parametrize("dtype,expected_dtype", [ - (np.int8, np.float64), - (np.int16, np.float64), - (np.int32, np.float64), - (np.int64, np.float64), - (np.float32, np.float32), - (np.float64, np.float64) -]) +@pytest.mark.parametrize( + "dtype,expected_dtype", + [ + (np.int8, np.float64), + (np.int16, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + (np.float32, np.float32), + (np.float64, np.float64), + ], +) def test_where_unsafe_upcast(dtype, expected_dtype): # see gh-9743 s = Series(np.arange(10), dtype=dtype) @@ -186,17 +193,17 @@ def test_where_unsafe(): assert_series_equal(s, expected) # see gh-3235 - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") mask = s < 5 s[mask] = range(2, 7) - expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype='int64') + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64") assert_series_equal(s, expected) assert s.dtype == expected.dtype - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") mask = s > 5 s[mask] = [0] * 4 - expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64") assert_series_equal(s, expected) s = Series(np.arange(10)) @@ -241,8 +248,8 @@ def test_where(): assert_series_equal(rs, s.abs()) rs = s.where(cond) - assert (s.shape == rs.shape) - assert (rs is not s) + assert s.shape == rs.shape + assert rs is not s # test alignment cond = Series([True, False, False, True, False], index=s.index) @@ -278,13 +285,15 @@ def test_where_error(): msg = "cannot assign mismatch length to masked array" with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] - msg = ("NumPy boolean array indexing assignment cannot assign 0 input" - " values to the 1 output values where the mask is true") + msg = ( + "NumPy boolean array indexing assignment cannot assign 0 input" + " values to the 1 output values where the mask is true" + ) with pytest.raises(ValueError, match=msg): s[[True, 
False]] = [] -@pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) +@pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where_array_like(klass): # see gh-15414 s = Series([1, 2, 3]) @@ -295,12 +304,15 @@ def test_where_array_like(klass): assert_series_equal(result, expected) -@pytest.mark.parametrize('cond', [ - [1, 0, 1], - Series([2, 5, 7]), - ["True", "False", "True"], - [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")] -]) +@pytest.mark.parametrize( + "cond", + [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")], + ], +) def test_where_invalid_input(cond): # see gh-15414: only boolean arrays accepted s = Series([1, 2, 3]) @@ -341,74 +353,69 @@ def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment - msg = ("cannot set using a {} indexer with a different length than" - " the value") + msg = "cannot set using a {} indexer with a different length than" " the value" # slice - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[0:3] = list(range(27)) s[0:3] = list(range(3)) expected = Series([0, 1, 2]) - assert_series_equal(s.astype(np.int64), expected, ) + assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list('abcdef')) + s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[0:4:2] = list(range(27)) - s = Series(list('abcdef')) + s = Series(list("abcdef")) s[0:4:2] = list(range(2)) - expected = Series([0, 'b', 1, 'd', 'e', 'f']) + expected = Series([0, "b", 1, "d", "e", "f"]) assert_series_equal(s, expected) # neg slices - s = Series(list('abcdef')) + s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[:-1] = list(range(27)) s[-3:-1] = list(range(2)) - expected = Series(['a', 'b', 'c', 0, 1, 'f']) + expected = Series(["a", "b", "c", 0, 1, "f"]) assert_series_equal(s, expected) # list - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('list-like')): + with pytest.raises(ValueError, match=msg.format("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('list-like')): + with pytest.raises(ValueError, match=msg.format("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list('abc')) + s = Series(list("abc")) s[0] = list(range(10)) - expected = Series([list(range(10)), 'b', 'c']) + expected = Series([list(range(10)), "b", "c"]) assert_series_equal(s, expected) -@pytest.mark.parametrize('size', range(2, 6)) -@pytest.mark.parametrize('mask', [ - [True, False, False, False, False], - [True, False], - [False] -]) -@pytest.mark.parametrize('item', [ - 2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min -]) +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] +) # Test numpy arrays, lists and tuples as the input to be # broadcast -@pytest.mark.parametrize('box', [ - lambda x: np.array([x]), - lambda x: [x], - lambda x: (x,) -]) 
+@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) def test_broadcast(size, mask, item, box): selection = np.resize(mask, size) @@ -416,8 +423,9 @@ def test_broadcast(size, mask, item, box): # Construct the expected series by taking the source # data or item based on the selection - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) + expected = Series( + [item if use_item else data[i] for i, use_item in enumerate(selection)] + ) s = Series(data) s[selection] = box(item) @@ -454,8 +462,7 @@ def test_where_dups(): s2 = Series(list(range(3))) comb = pd.concat([s1, s2]) result = comb.where(comb < 2) - expected = Series([0, 1, np.nan, 0, 1, np.nan], - index=[0, 1, 2, 0, 1, 2]) + expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2]) assert_series_equal(result, expected) # GH 4548 @@ -472,31 +479,31 @@ def test_where_dups(): def test_where_numeric_with_string(): # GH 9280 s = pd.Series([1, 2, 3]) - w = s.where(s > 1, 'X') + w = s.where(s > 1, "X") assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" - w = s.where(s > 1, ['X', 'Y', 'Z']) + w = s.where(s > 1, ["X", "Y", "Z"]) assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" - w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) + w = s.where(s > 1, np.array(["X", "Y", "Z"])) assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" def test_where_timedelta_coerce(): - s = Series([1, 2], dtype='timedelta64[ns]') + s = Series([1, 2], dtype="timedelta64[ns]") expected = Series([10, 10]) mask = np.array([False, False]) @@ -513,12 +520,12 @@ def test_where_timedelta_coerce(): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') + expected = Series([10, None], dtype="object") assert_series_equal(rs, expected) def test_where_datetime_conversion(): - s = Series(date_range('20130102', periods=2)) + s = Series(date_range("20130102", periods=2)) expected = Series([10, 10]) mask = np.array([False, False]) @@ -535,12 +542,11 @@ def test_where_datetime_conversion(): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') + expected = Series([10, None], dtype="object") assert_series_equal(rs, expected) # GH 15701 - timestamps = ['2016-12-31 12:00:04+00:00', - '2016-12-31 12:00:04.010000+00:00'] + timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] s = Series([pd.Timestamp(t) for t in timestamps]) rs = s.where(Series([False, True])) expected = Series([pd.NaT, s[1]]) @@ -548,14 +554,17 @@ def test_where_datetime_conversion(): def test_where_dt_tz_values(tz_naive_fixture): - ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture)) - ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'], - tz=tz_naive_fixture)) + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) + ) mask = pd.Series([True, True, False]) result = ser1.where(mask, ser2) - exp = pd.Series(pd.DatetimeIndex(['20150101', '20150102', 
'20160516'], - tz=tz_naive_fixture)) + exp = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) + ) assert_series_equal(exp, result) diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py index b6561375459039..2d879eed967e58 100644 --- a/pandas/tests/series/indexing/test_callable.py +++ b/pandas/tests/series/indexing/test_callable.py @@ -4,12 +4,12 @@ def test_getitem_callable(): # GH 12533 - s = pd.Series(4, index=list('ABCD')) - result = s[lambda x: 'A'] - assert result == s.loc['A'] + s = pd.Series(4, index=list("ABCD")) + result = s[lambda x: "A"] + assert result == s.loc["A"] - result = s[lambda x: ['A', 'B']] - tm.assert_series_equal(result, s.loc[['A', 'B']]) + result = s[lambda x: ["A", "B"]] + tm.assert_series_equal(result, s.loc[["A", "B"]]) result = s[lambda x: [True, False, True, True]] tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) @@ -17,9 +17,9 @@ def test_getitem_callable(): def test_setitem_callable(): # GH 12533 - s = pd.Series([1, 2, 3, 4], index=list('ABCD')) - s[lambda x: 'A'] = -1 - tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + s = pd.Series([1, 2, 3, 4], index=list("ABCD")) + s[lambda x: "A"] = -1 + tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list("ABCD"))) def test_setitem_other_callable(): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index a8120ec9c5c58b..721ea2b6e66324 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -10,7 +10,10 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) """ @@ -19,74 +22,86 @@ def test_fancy_getitem(): - dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) s = Series(np.arange(len(dti)), index=dti) assert s[48] == 48 - assert s['1/2/2009'] == 48 - assert s['2009-1-2'] == 48 + assert s["1/2/2009"] == 48 + assert s["2009-1-2"] == 48 assert s[datetime(2009, 1, 2)] == 48 assert s[Timestamp(datetime(2009, 1, 2))] == 48 with pytest.raises(KeyError, match=r"^'2009-1-3'$"): - s['2009-1-3'] - assert_series_equal(s['3/6/2009':'2009-06-05'], - s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + s["2009-1-3"] + assert_series_equal( + s["3/6/2009":"2009-06-05"], s[datetime(2009, 3, 6) : datetime(2009, 6, 5)] + ) def test_fancy_setitem(): - dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) s = Series(np.arange(len(dti)), index=dti) s[48] = -1 assert s[48] == -1 - s['1/2/2009'] = -2 + s["1/2/2009"] = -2 assert s[48] == -2 - s['1/2/2009':'2009-06-05'] = -3 + s["1/2/2009":"2009-06-05"] = -3 assert (s[48:54] == -3).all() @pytest.mark.filterwarnings("ignore::DeprecationWarning") -@pytest.mark.parametrize('tz', [None, 'Asia/Shanghai', 'Europe/Berlin']) -@pytest.mark.parametrize('name', [None, 'my_dti']) +@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) +@pytest.mark.parametrize("name", [None, "my_dti"]) def test_dti_snap(name, tz): - dti = DatetimeIndex(['1/1/2002', '1/2/2002', 
'1/3/2002', '1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], - name=name, tz=tz, freq='D') - - result = dti.snap(freq='W-MON') - expected = date_range('12/31/2001', '1/7/2002', - name=name, tz=tz, freq='w-mon') + dti = DatetimeIndex( + [ + "1/1/2002", + "1/2/2002", + "1/3/2002", + "1/4/2002", + "1/5/2002", + "1/6/2002", + "1/7/2002", + ], + name=name, + tz=tz, + freq="D", + ) + + result = dti.snap(freq="W-MON") + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) tm.assert_index_equal(result, expected) assert result.tz == expected.tz - result = dti.snap(freq='B') + result = dti.snap(freq="B") - expected = date_range('1/1/2002', '1/7/2002', - name=name, tz=tz, freq='b') + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) tm.assert_index_equal(result, expected) assert result.tz == expected.tz def test_dti_reset_index_round_trip(): - dti = date_range(start='1/1/2001', end='6/1/2001', freq='D') - d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + dti = date_range(start="1/1/2001", end="6/1/2001", freq="D") + d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti) d2 = d1.reset_index() - assert d2.dtypes[0] == np.dtype('M8[ns]') - d3 = d2.set_index('index') + assert d2.dtypes[0] == np.dtype("M8[ns]") + d3 = d2.set_index("index") assert_frame_equal(d1, d3, check_names=False) # #2329 stamp = datetime(2012, 11, 22) - df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) - df = df.set_index('Date') + df = DataFrame([[stamp, 12.1]], columns=["Date", "Value"]) + df = df.set_index("Date") assert df.index[0] == stamp - assert df.reset_index()['Date'][0] == stamp + assert df.reset_index()["Date"][0] == stamp def test_series_set_value(): @@ -95,14 +110,12 @@ def test_series_set_value(): dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] index = DatetimeIndex(dates) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - s = Series().set_value(dates[0], 1.) 
- with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + s = Series().set_value(dates[0], 1.0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s2 = s.set_value(dates[1], np.nan) - exp = Series([1., np.nan], index=index) + exp = Series([1.0, np.nan], index=index) assert_series_equal(s2, exp) @@ -113,59 +126,60 @@ def test_series_set_value(): @pytest.mark.slow def test_slice_locs_indexerror(): - times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) - for i in range(100000)] + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)] s = Series(range(100000), times) - s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] + s.loc[datetime(1900, 1, 1) : datetime(2100, 1, 1)] def test_slicing_datetimes(): # GH 7523 # unique - df = DataFrame(np.arange(4., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 3, 4]]) - result = df.loc[datetime(2001, 1, 1, 10):] + df = DataFrame( + np.arange(4.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]], + ) + result = df.loc[datetime(2001, 1, 1, 10) :] assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] + result = df.loc[: datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11) :] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.loc['20010101 11':] + result = df.loc["20010101 11":] assert_frame_equal(result, expected) # duplicates - df = pd.DataFrame(np.arange(5., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 2, 3, 4]]) + df = pd.DataFrame( + np.arange(5.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]], + ) - result = df.loc[datetime(2001, 1, 1, 10):] + result = df.loc[datetime(2001, 1, 1, 10) :] assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] + result = df.loc[: datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11) :] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.loc['20010101 11':] + result = df.loc["20010101 11":] assert_frame_equal(result, expected) def test_frame_datetime64_duplicated(): - dates = date_range('2010-07-01', end='2010-08-05') + dates = date_range("2010-07-01", end="2010-08-05") - tst = DataFrame({'symbol': 'AAA', 'date': dates}) - result = tst.duplicated(['date', 'symbol']) + tst = DataFrame({"symbol": "AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) assert (-result).all() - tst = DataFrame({'date': dates}) + tst = DataFrame({"date": dates}) result = tst.duplicated() assert (-result).all() @@ -176,7 +190,7 @@ def test_getitem_setitem_datetime_tz_pytz(): N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -192,14 +206,14 @@ def 
test_getitem_setitem_datetime_tz_pytz(): # repeat with datetimes result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] assert_series_equal(result, ts) result = ts.copy() # comparison dates with datetime MUST be localized! - date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) + date = tz("US/Central").localize(datetime(1990, 1, 1, 3)) result[date] = 0 result[date] = ts[4] assert_series_equal(result, ts) @@ -209,16 +223,16 @@ def test_getitem_setitem_datetime_tz_dateutil(): from dateutil.tz import tzutc from pandas._libs.tslibs.timezones import dateutil_gettz as gettz - tz = lambda x: tzutc() if x == 'UTC' else gettz( - x) # handle special case for utc in dateutil + tz = ( + lambda x: tzutc() if x == "UTC" else gettz(x) + ) # handle special case for utc in dateutil from pandas import date_range N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', - tz='America/New_York') + rng = date_range("1/1/1990", periods=N, freq="H", tz="America/New_York") ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -234,20 +248,20 @@ def test_getitem_setitem_datetime_tz_dateutil(): # repeat with datetimes result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] assert_series_equal(result, ts) result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = ts[4] assert_series_equal(result, ts) def test_getitem_setitem_datetimeindex(): N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") ts = Series(np.random.randn(N), index=rng) result = ts["1990-01-01 04:00:00"] @@ -291,13 +305,13 @@ def test_getitem_setitem_datetimeindex(): result[datetime(1990, 1, 1, 4)] = ts[4] assert_series_equal(result, ts) - result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + result = ts[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] expected = ts[4:8] assert_series_equal(result, expected) result = ts.copy() - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = ts[4:8] assert_series_equal(result, ts) lb = datetime(1990, 1, 1, 4) @@ -342,7 +356,7 @@ def test_getitem_setitem_periodindex(): from pandas import period_range N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') + rng = period_range("1/1/1990", periods=N, freq="H") ts = Series(np.random.randn(N), index=rng) result = ts["1990-01-01 04"] @@ -387,7 +401,7 @@ def test_getitem_setitem_periodindex(): # FutureWarning from NumPy. 
@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") def test_getitem_median_slice_bug(): - index = date_range('20090415', '20090519', freq='2B') + index = date_range("20090415", "20090519", freq="2B") s = Series(np.random.randn(13), index=index) indexer = [slice(6, 7, None)] @@ -399,11 +413,11 @@ def test_getitem_median_slice_bug(): def test_datetime_indexing(): from pandas import date_range - index = date_range('1/1/2000', '1/7/2000') + index = date_range("1/1/2000", "1/7/2000") index = index.repeat(3) s = Series(len(index), index=index) - stamp = Timestamp('1/8/2000') + stamp = Timestamp("1/8/2000") with pytest.raises(KeyError, match=r"^947289600000000000$"): s[stamp] @@ -427,11 +441,18 @@ def test_datetime_indexing(): @pytest.fixture def dups(): - dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), - datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 3), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 4), - datetime(2000, 1, 4), datetime(2000, 1, 5)] + dates = [ + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] return Series(np.random.randn(len(dates)), index=dates) @@ -447,22 +468,28 @@ def test_is_unique_monotonic(dups): def test_index_unique(dups): uniques = dups.index.unique() - expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 5)]) - assert uniques.dtype == 'M8[ns]' # sanity + expected = DatetimeIndex( + [ + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] + ) + assert uniques.dtype == "M8[ns]" # sanity tm.assert_index_equal(uniques, expected) assert dups.index.nunique() == 4 # #2563 assert isinstance(uniques, DatetimeIndex) - dups_local = dups.index.tz_localize('US/Eastern') - dups_local.name = 'foo' + dups_local = dups.index.tz_localize("US/Eastern") + dups_local.name = "foo" result = dups_local.unique() - expected = DatetimeIndex(expected, name='foo') - expected = expected.tz_localize('US/Eastern') + expected = DatetimeIndex(expected, name="foo") + expected = expected.tz_localize("US/Eastern") assert result.tz is not None - assert result.name == 'foo' + assert result.name == "foo" tm.assert_index_equal(result, expected) # NaT, note this is excluded @@ -472,8 +499,9 @@ def test_index_unique(dups): assert idx.nunique() == 20 assert idx.nunique(dropna=False) == 21 - arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) - for t in range(20)] + [NaT] + arr = [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) + ] + [NaT] idx = DatetimeIndex(arr * 3) tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) assert idx.nunique() == 20 @@ -515,16 +543,15 @@ def test_duplicate_dates_indexing(dups): def test_range_slice(): - idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', - '1/4/2000']) + idx = DatetimeIndex(["1/1/2000", "1/2/2000", "1/2/2000", "1/3/2000", "1/4/2000"]) ts = Series(np.random.randn(len(idx)), index=idx) - result = ts['1/2/2000':] + result = ts["1/2/2000":] expected = ts[1:] assert_series_equal(result, expected) - result = ts['1/2/2000':'1/3/2000'] + result = ts["1/2/2000":"1/3/2000"] expected = ts[1:4] assert_series_equal(result, expected) @@ -537,6 +564,7 @@ def test_groupby_average_dup_values(dups): def test_indexing_over_size_cutoff(): import datetime + # #1821 
old_cutoff = _index._SIZE_CUTOFF @@ -561,9 +589,9 @@ def test_indexing_over_size_cutoff(): for p in duplicate_positions: dates[p + 1] = dates[p] - df = DataFrame(np.random.randn(len(dates), 4), - index=dates, - columns=list('ABCD')) + df = DataFrame( + np.random.randn(len(dates), 4), index=dates, columns=list("ABCD") + ) pos = n * 3 timestamp = df.index[pos] @@ -578,7 +606,7 @@ def test_indexing_over_size_cutoff(): def test_indexing_unordered(): # GH 2437 - rng = date_range(start='2011-01-01', end='2011-01-15') + rng = date_range(start="2011-01-01", end="2011-01-15") ts = Series(np.random.rand(len(rng)), index=rng) ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) @@ -597,83 +625,84 @@ def compare(slobj): expected = ts[slobj] assert_series_equal(result, expected) - compare(slice('2011-01-01', '2011-01-15')) - compare(slice('2010-12-30', '2011-01-15')) - compare(slice('2011-01-01', '2011-01-16')) + compare(slice("2011-01-01", "2011-01-15")) + compare(slice("2010-12-30", "2011-01-15")) + compare(slice("2011-01-01", "2011-01-16")) # partial ranges - compare(slice('2011-01-01', '2011-01-6')) - compare(slice('2011-01-06', '2011-01-8')) - compare(slice('2011-01-06', '2011-01-12')) + compare(slice("2011-01-01", "2011-01-6")) + compare(slice("2011-01-06", "2011-01-8")) + compare(slice("2011-01-06", "2011-01-12")) # single values - result = ts2['2011'].sort_index() - expected = ts['2011'] + result = ts2["2011"].sort_index() + expected = ts["2011"] assert_series_equal(result, expected) # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + rng = date_range(datetime(2005, 1, 1), periods=20, freq="M") ts = Series(np.arange(len(rng)), index=rng) ts = ts.take(np.random.permutation(20)) - result = ts['2005'] + result = ts["2005"] for t in result.index: assert t.year == 2005 def test_indexing(): - idx = date_range("2001-1-1", periods=20, freq='M') + idx = date_range("2001-1-1", periods=20, freq="M") ts = Series(np.random.rand(len(idx)), index=idx) # getting # GH 3070, make sure semantics work on Series/Frame - expected = ts['2001'] - expected.name = 'A' + expected = ts["2001"] + expected.name = "A" df = DataFrame(dict(A=ts)) - result = df['2001']['A'] + result = df["2001"]["A"] assert_series_equal(expected, result) # setting - ts['2001'] = 1 - expected = ts['2001'] - expected.name = 'A' + ts["2001"] = 1 + expected = ts["2001"] + expected.name = "A" - df.loc['2001', 'A'] = 1 + df.loc["2001", "A"] = 1 - result = df['2001']['A'] + result = df["2001"]["A"] assert_series_equal(expected, result) # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', - freq='H') + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") ts = Series(range(len(idx)), index=idx) - expected = ts['2013-05'] + expected = ts["2013-05"] assert_series_equal(expected, ts) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', - freq='S') + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="S") ts = Series(range(len(idx)), index=idx) - expected = ts['2013-05'] + expected = ts["2013-05"] assert_series_equal(expected, ts) - idx = [Timestamp('2013-05-31 00:00'), - Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] + idx = [ + Timestamp("2013-05-31 00:00"), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999)), + ] ts = Series(range(len(idx)), index=idx) - expected = ts['2013'] + expected = ts["2013"] assert_series_equal(expected, ts) # GH14826, indexing with a seconds resolution string / datetime object - 
df = DataFrame(np.random.rand(5, 5), - columns=['open', 'high', 'low', 'close', 'volume'], - index=date_range('2012-01-02 18:01:00', - periods=5, tz='US/Central', freq='s')) + df = DataFrame( + np.random.rand(5, 5), + columns=["open", "high", "low", "close", "volume"], + index=date_range("2012-01-02 18:01:00", periods=5, tz="US/Central", freq="s"), + ) expected = df.loc[[df.index[2]]] # this is a single date, so will raise with pytest.raises(KeyError, match=r"^'2012-01-02 18:01:02'$"): - df['2012-01-02 18:01:02'] + df["2012-01-02 18:01:02"] msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central', freq='S'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] @@ -685,7 +714,7 @@ def test_indexing(): def test_set_none_nan(): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) series[3] = None assert series[3] is NaT @@ -701,15 +730,15 @@ def test_set_none_nan(): def test_nat_operations(): # GH 8617 - s = Series([0, pd.NaT], dtype='m8[ns]') + s = Series([0, pd.NaT], dtype="m8[ns]") exp = s[0] assert s.median() == exp assert s.min() == exp assert s.max() == exp -@pytest.mark.parametrize('method', ["round", "floor", "ceil"]) -@pytest.mark.parametrize('freq', ["s", "5s", "min", "5min", "h", "5h"]) +@pytest.mark.parametrize("method", ["round", "floor", "ceil"]) +@pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) def test_round_nat(method, freq): # GH14940 s = Series([pd.NaT]) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d794b4aca82e67..c8342c54e9b5db 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -8,8 +8,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp) +from pandas import Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -17,7 +16,7 @@ def test_basic_indexing(): - s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + s = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"]) msg = "index out of bounds" with pytest.raises(IndexError, match=msg): @@ -27,7 +26,7 @@ def test_basic_indexing(): s[5] = 0 with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] s = s.sort_index() @@ -46,36 +45,35 @@ def test_basic_getitem_with_labels(test_data): expected = test_data.ts.reindex(indices) assert_series_equal(result, expected) - result = test_data.ts[indices[0]:indices[2]] - expected = test_data.ts.loc[indices[0]:indices[2]] + result = test_data.ts[indices[0] : indices[2]] + expected = test_data.ts.loc[indices[0] : indices[2]] assert_series_equal(result, expected) # integer indexes, be careful s = Series(np.random.randn(10), index=list(range(0, 20, 2))) inds = [0, 2, 5, 7, 8] arr_inds = np.array([0, 2, 5, 7, 8]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s[inds] expected = s.reindex(inds) assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s[arr_inds] expected = s.reindex(arr_inds) assert_series_equal(result, expected) # GH12089 # with tz for values - s = Series(pd.date_range("2011-01-01", 
periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) - expected = Timestamp('2011-01-01', tz='US/Eastern') - result = s.loc['a'] + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) + expected = Timestamp("2011-01-01", tz="US/Eastern") + result = s.loc["a"] assert result == expected result = s.iloc[0] assert result == expected - result = s['a'] + result = s["a"] assert result == expected @@ -115,7 +113,7 @@ def test_getitem_get(test_data): # None # GH 5652 - for s in [Series(), Series(index=list('abc'))]: + for s in [Series(), Series(index=list("abc"))]: result = s.get(None) assert result is None @@ -144,25 +142,26 @@ def test_type_promotion(): s["a"] = pd.Timestamp("2016-01-01") s["b"] = 3.0 s["c"] = "foo" - expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], - index=["a", "b", "c"]) + expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) assert_series_equal(s, expected) @pytest.mark.parametrize( - 'result_1, duplicate_item, expected_1', + "result_1, duplicate_item, expected_1", [ [ - pd.Series({1: 12, 2: [1, 2, 2, 3]}), pd.Series({1: 313}), - pd.Series({1: 12, }, dtype=object), + pd.Series({1: 12, 2: [1, 2, 2, 3]}), + pd.Series({1: 313}), + pd.Series({1: 12}, dtype=object), ], [ pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), - pd.Series({1: [1, 2, 3]}), pd.Series({1: [1, 2, 3], }), + pd.Series({1: [1, 2, 3]}), + pd.Series({1: [1, 2, 3]}), ], - ]) -def test_getitem_with_duplicates_indices( - result_1, duplicate_item, expected_1): + ], +) +def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): # GH 17610 result = result_1.append(duplicate_item) expected = expected_1.append(duplicate_item) @@ -184,11 +183,11 @@ def test_getitem_out_of_bounds(test_data): def test_getitem_setitem_integers(): # caused bug without test - s = Series([1, 2, 3], ['a', 'b', 'c']) + s = Series([1, 2, 3], ["a", "b", "c"]) - assert s.iloc[0] == s['a'] + assert s.iloc[0] == s["a"] s.iloc[0] = 5 - tm.assert_almost_equal(s['a'], 5) + tm.assert_almost_equal(s["a"], 5) def test_getitem_box_float64(test_data): @@ -197,12 +196,9 @@ def test_getitem_box_float64(test_data): @pytest.mark.parametrize( - 'arr', - [ - np.random.randn(10), - tm.makeDateIndex(10, name='a').tz_localize( - tz='US/Eastern'), - ]) + "arr", + [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], +) def test_get(arr): # GH 21260 s = Series(arr, index=[2 * i for i in range(len(arr))]) @@ -219,14 +215,14 @@ def test_get(arr): assert s.get(-1) is None assert s.get(s.index.max() + 1) is None - s = Series(arr[:6], index=list('abcdef')) - assert s.get('c') == s.iloc[2] + s = Series(arr[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] - result = s.get(slice('b', 'd')) + result = s.get(slice("b", "d")) expected = s.iloc[[1, 2, 3]] tm.assert_series_equal(result, expected) - result = s.get('Z') + result = s.get("Z") assert result is None assert s.get(4) == s.iloc[4] @@ -240,12 +236,12 @@ def test_get(arr): def test_series_box_timestamp(): - rng = pd.date_range('20090415', '20090519', freq='B') + rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng) assert isinstance(ser[5], pd.Timestamp) - rng = pd.date_range('20090415', '20090519', freq='B') + rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng, index=rng) assert isinstance(ser[5], pd.Timestamp) @@ -261,29 +257,27 @@ def test_getitem_ambiguous_keyerror(): def test_getitem_unordered_dup(): - obj = Series(range(5), index=['c', 'a', 
'a', 'b', 'b']) - assert is_scalar(obj['c']) - assert obj['c'] == 0 + obj = Series(range(5), index=["c", "a", "a", "b", "b"]) + assert is_scalar(obj["c"]) + assert obj["c"] == 0 def test_getitem_dups_with_missing(): # breaks reindex, so need to use .loc internally # GH 4246 - s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = s.loc[['foo', 'bar', 'bah', 'bam']] - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[['foo', 'bar', 'bah', 'bam']] + s = Series([1, 2, 3, 4], ["foo", "bar", "foo", "bah"]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = s.loc[["foo", "bar", "bah", "bam"]] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s[["foo", "bar", "bah", "bam"]] assert_series_equal(result, expected) def test_getitem_dups(): - s = Series(range(5), index=['A', 'A', 'B', 'C', 'C'], dtype=np.int64) - expected = Series([3, 4], index=['C', 'C'], dtype=np.int64) - result = s['C'] + s = Series(range(5), index=["A", "A", "B", "C", "C"], dtype=np.int64) + expected = Series([3, 4], index=["C", "C"], dtype=np.int64) + result = s["C"] assert_series_equal(result, expected) @@ -306,8 +300,10 @@ def test_getitem_dataframe(): rng = list(range(10)) s = pd.Series(10, index=rng) df = pd.DataFrame(rng, index=rng) - msg = ("Indexing a Series with DataFrame is not supported," - " use the appropriate DataFrame column") + msg = ( + "Indexing a Series with DataFrame is not supported," + " use the appropriate DataFrame column" + ) with pytest.raises(TypeError, match=msg): s[df > 5] @@ -322,30 +318,29 @@ def test_setitem(test_data): assert not np.isnan(test_data.ts[2]) # caught this bug when writing tests - series = Series(tm.makeIntIndex(20).astype(float), - index=tm.makeIntIndex(20)) + series = Series(tm.makeIntIndex(20).astype(float), index=tm.makeIntIndex(20)) series[::2] = 0 assert (series[::2] == 0).all() # set item that's not contained s = test_data.series.copy() - s['foobar'] = 1 + s["foobar"] = 1 - app = Series([1], index=['foobar'], name='series') + app = Series([1], index=["foobar"], name="series") expected = test_data.series.append(app) assert_series_equal(s, expected) # Test for issue #10193 - key = pd.Timestamp('2012-01-01') + key = pd.Timestamp("2012-01-01") series = pd.Series() series[key] = 47 expected = pd.Series(47, [key]) assert_series_equal(series, expected) - series = pd.Series([], pd.DatetimeIndex([], freq='D')) + series = pd.Series([], pd.DatetimeIndex([], freq="D")) series[key] = 47 - expected = pd.Series(47, pd.DatetimeIndex([key], freq='D')) + expected = pd.Series(47, pd.DatetimeIndex([key], freq="D")) assert_series_equal(series, expected) @@ -377,25 +372,23 @@ def test_setitem_dtypes(): def test_set_value(test_data): idx = test_data.ts.index[10] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = test_data.ts.set_value(idx, 0) assert res is test_data.ts assert test_data.ts[idx] == 0 # equiv s = test_data.series.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = s.set_value('foobar', 0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = s.set_value("foobar", 0) assert res is s - assert res.index[-1] == 'foobar' - assert res['foobar'] == 0 + assert res.index[-1] == "foobar" + assert res["foobar"] == 0 s = 
test_data.series.copy() - s.loc['foobar'] = 0 - assert s.index[-1] == 'foobar' - assert s['foobar'] == 0 + s.loc["foobar"] = 0 + assert s.index[-1] == "foobar" + assert s["foobar"] == 0 def test_setslice(test_data): @@ -427,37 +420,46 @@ def test_basic_getitem_setitem_corner(test_data): test_data.ts[[5, slice(None, None)]] = 2 -@pytest.mark.parametrize('tz', ['US/Eastern', 'UTC', 'Asia/Tokyo']) +@pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(tz): - orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) # scalar s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2016-01-01 02:00', tz=tz)]) + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2016-01-01 02:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() @@ -471,36 +473,45 @@ def test_setitem_with_tz(tz): def test_setitem_with_tz_dst(): # GH XXX - tz = 'US/Eastern' - orig = pd.Series(pd.date_range('2016-11-06', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + tz = "US/Eastern" + orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) # scalar s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-11-06 00:00-04:00', tz=tz), - pd.Timestamp('2011-01-01 00:00-05:00', tz=tz), - pd.Timestamp('2016-11-06 01:00-05:00', tz=tz)]) + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00-04:00", tz=tz), + pd.Timestamp("2011-01-01 00:00-05:00", tz=tz), + pd.Timestamp("2016-11-06 01:00-05:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + 
vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-11-06 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() @@ -537,8 +548,7 @@ def test_categorial_assigning_ops(): s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"]), - index=["x", "y"]) + exp = Series(Categorical(["b", "a"], categories=["a", "b"]), index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -570,7 +580,7 @@ def test_slice(test_data): def test_slice_can_reorder_not_uniquely_indexed(): - s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + s = Series(1, index=["a", "a", "b", "b", "c"]) s[::-1] # it works! @@ -614,8 +624,7 @@ def test_setitem_na(): s[::2] = np.nan assert_series_equal(s, expected) - expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, - 9]) + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) s = Series(np.arange(10)) s[:5] = np.nan assert_series_equal(s, expected) @@ -624,65 +633,63 @@ def test_setitem_na(): def test_timedelta_assignment(): # GH 8209 s = Series([]) - s.loc['B'] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B'])) + s.loc["B"] = timedelta(1) + tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) - s = s.reindex(s.index.insert(0, 'A')) - tm.assert_series_equal(s, Series( - [np.nan, Timedelta('1 days')], index=['A', 'B'])) + s = s.reindex(s.index.insert(0, "A")) + tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) result = s.fillna(timedelta(1)) - expected = Series(Timedelta('1 days'), index=['A', 'B']) + expected = Series(Timedelta("1 days"), index=["A", "B"]) tm.assert_series_equal(result, expected) - s.loc['A'] = timedelta(1) + s.loc["A"] = timedelta(1) tm.assert_series_equal(s, expected) # GH 14155 - s = Series(10 * [np.timedelta64(10, 'm')]) - s.loc[[1, 2, 3]] = np.timedelta64(20, 'm') - expected = pd.Series(10 * [np.timedelta64(10, 'm')]) - expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) + s = Series(10 * [np.timedelta64(10, "m")]) + s.loc[[1, 2, 3]] = np.timedelta64(20, "m") + expected = pd.Series(10 * [np.timedelta64(10, "m")]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, "m")) tm.assert_series_equal(s, expected) def test_underlying_data_conversion(): # GH 4080 - df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']}) - df.set_index(['a', 'b', 'c'], inplace=True) + df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) + df.set_index(["a", "b", "c"], inplace=True) s = Series([1], index=[(2, 2, 2)]) - df['val'] = 0 + df["val"] = 0 df - df['val'].update(s) + df["val"].update(s) - expected = DataFrame( - dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) - expected.set_index(['a', 'b', 'c'], inplace=True) + expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) + expected.set_index(["a", "b", "c"], inplace=True) tm.assert_frame_equal(df, expected) # GH 3970 # these are chained assignments as well - pd.set_option('chained_assignment', None) + pd.set_option("chained_assignment", None) df = 
DataFrame({"aa": range(5), "bb": [2.2] * 5}) df["cc"] = 0.0 ck = [True] * len(df) - df["bb"].iloc[0] = .13 + df["bb"].iloc[0] = 0.13 # TODO: unused df_tmp = df.iloc[ck] # noqa - df["bb"].iloc[0] = .15 - assert df['bb'].iloc[0] == 0.15 - pd.set_option('chained_assignment', 'raise') + df["bb"].iloc[0] = 0.15 + assert df["bb"].iloc[0] == 0.15 + pd.set_option("chained_assignment", "raise") # GH 3217 df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) - df['c'] = np.nan - df['c'].update(pd.Series(['foo'], index=[0])) + df["c"] = np.nan + df["c"].update(pd.Series(["foo"], index=[0])) - expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=['foo', np.nan])) + expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan])) tm.assert_frame_equal(df, expected) @@ -696,9 +703,9 @@ def test_cast_on_putmask(): # GH 2746 # need to upcast - s = Series([1, 2], index=[1, 2], dtype='int64') - s[[True, False]] = Series([0], index=[1], dtype='int64') - expected = Series([0, 2], index=[1, 2], dtype='int64') + s = Series([1, 2], index=[1, 2], dtype="int64") + s[[True, False]] = Series([0], index=[1], dtype="int64") + expected = Series([0, 2], index=[1, 2], dtype="int64") assert_series_equal(s, expected) @@ -715,25 +722,25 @@ def test_type_promote_putmask(): mask = s > 0 s2 = s[mask].map(str) s[mask] = s2 - assert_series_equal(s, Series([0, '1', '2', 0])) + assert_series_equal(s, Series([0, "1", "2", 0])) - s = Series([0, 'foo', 'bar', 0]) + s = Series([0, "foo", "bar", 0]) mask = Series([False, True, True, False]) s2 = s[mask] s[mask] = s2 - assert_series_equal(s, Series([0, 'foo', 'bar', 0])) + assert_series_equal(s, Series([0, "foo", "bar", 0])) def test_multilevel_preserve_name(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - s = Series(np.random.randn(len(index)), index=index, name='sth') - - result = s['foo'] - result2 = s.loc['foo'] + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(np.random.randn(len(index)), index=index, name="sth") + + result = s["foo"] + result2 = s.loc["foo"] assert result.name == s.name assert result2.name == s.name @@ -774,13 +781,13 @@ def test_setitem_slice_into_readonly_backing_data(): def test_pop(): # GH 6600 - df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) + df = DataFrame({"A": 0, "B": np.arange(5, dtype="int64"), "C": 0}) k = df.iloc[4] - result = k.pop('B') + result = k.pop("B") assert result == 4 - expected = Series([0, 0], index=['A', 'C'], name=4) + expected = Series([0, 0], index=["A", "C"], name=4) assert_series_equal(k, expected) @@ -804,11 +811,11 @@ def test_take(): def test_take_categorical(): # https://github.com/pandas-dev/pandas/issues/20664 - s = Series(pd.Categorical(['a', 'b', 'c'])) + s = Series(pd.Categorical(["a", "b", "c"])) result = s.take([-2, -2, 0]) - expected = Series(pd.Categorical(['b', 'b', 'a'], - categories=['a', 'b', 'c']), - index=[1, 1, 0]) + expected = Series( + pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0] + ) assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index 8fefc19b4843f9..2f7807526a29d4 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -6,21 
+6,16 @@ from pandas.util.testing import assert_series_equal -@pytest.mark.parametrize("val,expected", [ - (2**63 - 1, 3), - (2**63, 4), -]) +@pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) def test_loc_uint64(val, expected): # see gh-19399 - s = Series({2**63 - 1: 3, 2**63: 4}) + s = Series({2 ** 63 - 1: 3, 2 ** 63: 4}) assert s.loc[val] == expected def test_loc_getitem(test_data): inds = test_data.series.index[[3, 4, 7]] - assert_series_equal( - test_data.series.loc[inds], - test_data.series.reindex(inds)) + assert_series_equal(test_data.series.loc[inds], test_data.series.reindex(inds)) assert_series_equal(test_data.series.iloc[5::2], test_data.series[5::2]) # slice with indices @@ -99,7 +94,7 @@ def test_loc_setitem_corner(test_data): test_data.series.loc[inds] = 5 msg = r"\['foo'\] not in index" with pytest.raises(KeyError, match=msg): - test_data.series.loc[inds + ['foo']] = 5 + test_data.series.loc[inds + ["foo"]] = 5 def test_basic_setitem_with_labels(test_data): @@ -113,8 +108,8 @@ def test_basic_setitem_with_labels(test_data): cp = test_data.ts.copy() exp = test_data.ts.copy() - cp[indices[0]:indices[2]] = 0 - exp.loc[indices[0]:indices[2]] = 0 + cp[indices[0] : indices[2]] = 0 + exp.loc[indices[0] : indices[2]] = 0 assert_series_equal(cp, exp) # integer indexes, be careful @@ -144,12 +139,13 @@ def test_basic_setitem_with_labels(test_data): # GH12089 # with tz for values - s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) s2 = s.copy() - expected = Timestamp('2011-01-03', tz='US/Eastern') - s2.loc['a'] = expected - result = s2.loc['a'] + expected = Timestamp("2011-01-03", tz="US/Eastern") + s2.loc["a"] = expected + result = s2.loc["a"] assert result == expected s2 = s.copy() @@ -158,6 +154,6 @@ def test_basic_setitem_with_labels(test_data): assert result == expected s2 = s.copy() - s2['a'] = expected - result = s2['a'] + s2["a"] = expected + result = s2["a"] assert result == expected diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index 480d185f18b8d3..b4996575b0a05f 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -9,21 +9,88 @@ def test_get(): # GH 6383 - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45, - 51, 39, 55, 43, 54, 52, 51, 54])) + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ) + ) result = s.get(25, 0) expected = 0 assert result == expected - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, - 45, 51, 39, 55, 43, 54, 52, 51, 54]), - index=pd.Float64Index( - [25.0, 36.0, 49.0, 64.0, 81.0, 100.0, - 121.0, 144.0, 169.0, 196.0, 1225.0, - 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, - 1681.0, 1764.0, 1849.0, 1936.0], - dtype='object')) + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ), + index=pd.Float64Index( + [ + 25.0, + 36.0, + 49.0, + 64.0, + 81.0, + 100.0, + 121.0, + 144.0, + 169.0, + 196.0, + 1225.0, + 1296.0, + 1369.0, + 1444.0, + 1521.0, + 1600.0, + 1681.0, + 1764.0, + 1849.0, + 1936.0, + ], + dtype="object", + ), + ) result = s.get(25, 0) expected = 43 @@ -31,24 +98,24 @@ def test_get(): # GH 7407 # with a boolean accessor - df = 
pd.DataFrame({'i': [0] * 3, 'b': [False] * 3}) + df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3}) vc = df.i.value_counts() - result = vc.get(99, default='Missing') - assert result == 'Missing' + result = vc.get(99, default="Missing") + assert result == "Missing" vc = df.b.value_counts() - result = vc.get(False, default='Missing') + result = vc.get(False, default="Missing") assert result == 3 - result = vc.get(True, default='Missing') - assert result == 'Missing' + result = vc.get(True, default="Missing") + assert result == "Missing" def test_get_nan(): # GH 8569 s = pd.Float64Index(range(10)).to_series() assert s.get(np.nan) is None - assert s.get(np.nan, default='Missing') == 'Missing' + assert s.get(np.nan, default="Missing") == "Missing" def test_get_nan_multiple(): @@ -59,20 +126,18 @@ def test_get_nan_multiple(): idx = [2, 30] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert_series_equal(s.get(idx), - Series([2, np.nan], index=idx)) + assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) idx = [2, np.nan] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert_series_equal(s.get(idx), - Series([2, np.nan], index=idx)) + assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) # GH 17295 - all missing keys idx = [20, 30] - assert(s.get(idx) is None) + assert s.get(idx) is None idx = [np.nan, np.nan] - assert(s.get(idx) is None) + assert s.get(idx) is None def test_delitem(): @@ -97,28 +162,24 @@ def test_delitem(): # only 1 left, del, add, del s = Series(1) del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) s[0] = 1 assert_series_equal(s, Series(1)) del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) # Index(dtype=object) - s = Series(1, index=['a']) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) - s['a'] = 1 - assert_series_equal(s, Series(1, index=['a'])) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) + s = Series(1, index=["a"]) + del s["a"] + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) + s["a"] = 1 + assert_series_equal(s, Series(1, index=["a"])) + del s["a"] + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) def test_slice_float64(): - values = np.arange(10., 50., 2) + values = np.arange(10.0, 50.0, 2) index = Index(values) start, end = values[[5, 15]] @@ -150,7 +211,7 @@ def test_getitem_negative_out_of_bounds(): s[-11] msg = "index -11 is out of bounds for axis 0 with size 10" with pytest.raises(IndexError, match=msg): - s[-11] = 'foo' + s[-11] = "foo" def test_getitem_regression(): @@ -192,19 +253,21 @@ def test_getitem_setitem_slice_integers(): def test_setitem_float_labels(): # note labels are floats - s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) + s = Series(["a", "b", "c"], index=[0, 0.5, 1]) tmp = s.copy() - s.loc[1] = 'zoo' - tmp.iloc[2] = 'zoo' + s.loc[1] = "zoo" + tmp.iloc[2] = "zoo" assert_series_equal(s, tmp) def test_slice_float_get_set(test_data): - msg = (r"cannot do slice indexing on with these indexers \[{key}\]" - r" of ") + msg = ( + r"cannot do slice indexing on with these indexers \[{key}\]" + r" of " + ) with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): 
test_data.ts[4.0:10.0] @@ -237,7 +300,7 @@ def test_int_indexing(): s[5] with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] # not monotonic s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) @@ -246,7 +309,7 @@ def test_int_indexing(): s[5] with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] def test_getitem_int64(test_data): diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index f734fe7fd58f4a..63baa6af7c02a6 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -8,17 +8,20 @@ class TestSeriesAlterAxes: - def test_setindex(self, string_series): # wrong type - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed" + ) with pytest.raises(TypeError, match=msg): string_series.index = None # wrong length - msg = ("Length mismatch: Expected axis has 30 elements, new" - " values have 29 elements") + msg = ( + "Length mismatch: Expected axis has 30 elements, new" + " values have 29 elements" + ) with pytest.raises(ValueError, match=msg): string_series.index = np.arange(len(string_series) - 1) @@ -30,7 +33,7 @@ def test_setindex(self, string_series): def test_rename(self, datetime_series): ts = datetime_series - renamer = lambda x: x.strftime('%Y%m%d') + renamer = lambda x: x.strftime("%Y%m%d") renamed = ts.rename(renamer) assert renamed.index[0] == renamer(ts.index[0]) @@ -40,53 +43,53 @@ def test_rename(self, datetime_series): tm.assert_series_equal(renamed, renamed2) # partial dict - s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') - renamed = s.rename({'b': 'foo', 'd': 'bar'}) - tm.assert_index_equal(renamed.index, Index(['a', 'foo', 'c', 'bar'])) + s = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") + renamed = s.rename({"b": "foo", "d": "bar"}) + tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"])) # index with name - renamer = Series(np.arange(4), - index=Index(['a', 'b', 'c', 'd'], name='name'), - dtype='int64') + renamer = Series( + np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64" + ) renamed = renamer.rename({}) assert renamed.index.name == renamer.index.name def test_rename_by_series(self): - s = Series(range(5), name='foo') + s = Series(range(5), name="foo") renamer = Series({1: 10, 2: 20}) result = s.rename(renamer) - expected = Series(range(5), index=[0, 10, 20, 3, 4], name='foo') + expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) def test_rename_set_name(self): - s = Series(range(4), index=list('abcd')) - for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]: + s = Series(range(4), index=list("abcd")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = s.rename(name) assert result.name == name tm.assert_numpy_array_equal(result.index.values, s.index.values) assert s.name is None def test_rename_set_name_inplace(self): - s = Series(range(3), index=list('abc')) - for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]: + s = Series(range(3), index=list("abc")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: s.rename(name, inplace=True) assert s.name == name - exp = np.array(['a', 'b', 'c'], dtype=np.object_) + exp = np.array(["a", "b", "c"], dtype=np.object_) tm.assert_numpy_array_equal(s.index.values, exp) def 
test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 s = Series(range(5)) s.rename({}, axis=0) - s.rename({}, axis='index') - with pytest.raises(ValueError, match='No axis named 5'): + s.rename({}, axis="index") + with pytest.raises(ValueError, match="No axis named 5"): s.rename({}, axis=5) def test_set_name_attribute(self): s = Series([1, 2, 3]) - s2 = Series([1, 2, 3], name='bar') - for name in [7, 7., 'name', datetime(2001, 1, 1), (1,), "\u05D0"]: + s2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: s.name = name assert s.name == name s2.name = name @@ -94,13 +97,13 @@ def test_set_name_attribute(self): def test_set_name(self): s = Series([1, 2, 3]) - s2 = s._set_name('foo') - assert s2.name == 'foo' + s2 = s._set_name("foo") + assert s2.name == "foo" assert s.name is None assert s is not s2 def test_rename_inplace(self, datetime_series): - renamer = lambda x: x.strftime('%Y%m%d') + renamer = lambda x: x.strftime("%Y%m%d") expected = renamer(datetime_series.index[0]) datetime_series.rename(renamer, inplace=True) @@ -116,14 +119,14 @@ def test_set_index_makes_timeseries(self): def test_reset_index(self): df = tm.makeDataFrame()[:5] ser = df.stack() - ser.index.names = ['hash', 'category'] + ser.index.names = ["hash", "category"] - ser.name = 'value' + ser.name = "value" df = ser.reset_index() - assert 'value' in df + assert "value" in df - df = ser.reset_index(name='value2') - assert 'value2' in df + df = ser.reset_index(name="value2") + assert "value2" in df # check inplace s = ser.reset_index(drop=True) @@ -132,9 +135,10 @@ def test_reset_index(self): tm.assert_series_equal(s, s2) # level - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 @@ -144,47 +148,45 @@ def test_reset_index(self): assert isinstance(rs, Series) def test_reset_index_name(self): - s = Series([1, 2, 3], index=Index(range(3), name='x')) + s = Series([1, 2, 3], index=Index(range(3), name="x")) assert s.reset_index().index.name is None assert s.reset_index(drop=True).index.name is None def test_reset_index_level(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]], - columns=['A', 'B', 'C']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - for levels in ['A', 'B'], [0, 1]: + for levels in ["A", "B"], [0, 1]: # With MultiIndex - s = df.set_index(['A', 'B'])['C'] + s = df.set_index(["A", "B"])["C"] result = s.reset_index(level=levels[0]) - tm.assert_frame_equal(result, df.set_index('B')) + tm.assert_frame_equal(result, df.set_index("B")) result = s.reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df.set_index('B')) + tm.assert_frame_equal(result, df.set_index("B")) result = s.reset_index(level=levels) tm.assert_frame_equal(result, df) - result = df.set_index(['A', 'B']).reset_index(level=levels, - drop=True) - tm.assert_frame_equal(result, df[['C']]) + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C"]]) - with pytest.raises(KeyError, match='Level E '): - s.reset_index(level=['A', 'E']) + with pytest.raises(KeyError, match="Level E "): + s.reset_index(level=["A", "E"]) # With single-level Index - s = 
df.set_index('A')['B'] + s = df.set_index("A")["B"] result = s.reset_index(level=levels[0]) - tm.assert_frame_equal(result, df[['A', 'B']]) + tm.assert_frame_equal(result, df[["A", "B"]]) result = s.reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df[['A', 'B']]) + tm.assert_frame_equal(result, df[["A", "B"]]) result = s.reset_index(level=levels[0], drop=True) - tm.assert_series_equal(result, df['B']) + tm.assert_series_equal(result, df["B"]) - with pytest.raises(IndexError, match='Too many levels'): + with pytest.raises(IndexError, match="Too many levels"): s.reset_index(level=[0, 1, 2]) # Check that .reset_index([],drop=True) doesn't fail @@ -194,19 +196,20 @@ def test_reset_index_level(self): def test_reset_index_range(self): # GH 12071 - s = Series(range(2), name='A', dtype='int64') + s = Series(range(2), name="A", dtype="int64") series_result = s.reset_index() assert isinstance(series_result.index, RangeIndex) - series_expected = DataFrame([[0, 0], [1, 1]], - columns=['index', 'A'], - index=RangeIndex(stop=2)) + series_expected = DataFrame( + [[0, 0], [1, 1]], columns=["index", "A"], index=RangeIndex(stop=2) + ) tm.assert_frame_equal(series_result, series_expected) def test_reorder_levels(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]], - names=['L0', 'L1', 'L2']) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) s = Series(np.arange(6), index=index) # no change, position @@ -214,49 +217,49 @@ def test_reorder_levels(self): tm.assert_series_equal(s, result) # no change, labels - result = s.reorder_levels(['L0', 'L1', 'L2']) + result = s.reorder_levels(["L0", "L1", "L2"]) tm.assert_series_equal(s, result) # rotate, position result = s.reorder_levels([1, 2, 0]) - e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], - names=['L1', 'L2', 'L0']) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) expected = Series(np.arange(6), index=e_idx) tm.assert_series_equal(result, expected) def test_rename_axis_mapper(self): # GH 19978 - mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['ll', 'nn']) + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) s = Series([i for i in range(len(mi))], index=mi) - result = s.rename_axis(index={'ll': 'foo'}) - assert result.index.names == ['foo', 'nn'] + result = s.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] result = s.rename_axis(index=str.upper, axis=0) - assert result.index.names == ['LL', 'NN'] + assert result.index.names == ["LL", "NN"] - result = s.rename_axis(index=['foo', 'goo']) - assert result.index.names == ['foo', 'goo'] + result = s.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] - with pytest.raises(TypeError, match='unexpected'): - s.rename_axis(columns='wrong') + with pytest.raises(TypeError, match="unexpected"): + s.rename_axis(columns="wrong") def test_rename_axis_inplace(self, datetime_series): # GH 15704 - expected = datetime_series.rename_axis('foo') + expected = datetime_series.rename_axis("foo") result = datetime_series - no_return = result.rename_axis('foo', inplace=True) + no_return = 
result.rename_axis("foo", inplace=True) assert no_return is None tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('kwargs', [{'mapper': None}, {'index': None}, {}]) + @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) def test_rename_axis_none(self, kwargs): # GH 25034 - index = Index(list('abc'), name='foo') + index = Index(list("abc"), name="foo") df = Series([1, 2, 3], index=index) result = df.rename_axis(**kwargs) @@ -266,52 +269,52 @@ def test_rename_axis_none(self, kwargs): def test_set_axis_inplace_axes(self, axis_series): # GH14636 - ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = ser.copy() - expected.index = list('abcd') + expected.index = list("abcd") # inplace=True # The FutureWarning comes from the fact that we would like to have # inplace default to False some day for inplace, warn in [(None, FutureWarning), (True, None)]: result = ser.copy() - kwargs = {'inplace': inplace} + kwargs = {"inplace": inplace} with tm.assert_produces_warning(warn): - result.set_axis(list('abcd'), axis=axis_series, **kwargs) + result.set_axis(list("abcd"), axis=axis_series, **kwargs) tm.assert_series_equal(result, expected) def test_set_axis_inplace(self): # GH14636 - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = s.copy() - expected.index = list('abcd') + expected.index = list("abcd") # inplace=False - result = s.set_axis(list('abcd'), axis=0, inplace=False) + result = s.set_axis(list("abcd"), axis=0, inplace=False) tm.assert_series_equal(expected, result) # omitting the "axis" parameter with tm.assert_produces_warning(None): - result = s.set_axis(list('abcd'), inplace=False) + result = s.set_axis(list("abcd"), inplace=False) tm.assert_series_equal(result, expected) # wrong values for the "axis" parameter - for axis in [2, 'foo']: - with pytest.raises(ValueError, match='No axis named'): - s.set_axis(list('abcd'), axis=axis, inplace=False) + for axis in [2, "foo"]: + with pytest.raises(ValueError, match="No axis named"): + s.set_axis(list("abcd"), axis=axis, inplace=False) def test_set_axis_prior_to_deprecation_signature(self): - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = s.copy() - expected.index = list('abcd') + expected.index = list("abcd") - for axis in [0, 'index']: + for axis in [0, "index"]: with tm.assert_produces_warning(FutureWarning): - result = s.set_axis(0, list('abcd'), inplace=False) + result = s.set_axis(0, list("abcd"), inplace=False) tm.assert_series_equal(result, expected) def test_reset_index_drop_errors(self): @@ -319,24 +322,25 @@ def test_reset_index_drop_errors(self): # KeyError raised for series index when passed level name is missing s = Series(range(4)) - with pytest.raises(KeyError, match='must be same as name'): - s.reset_index('wrong', drop=True) - with pytest.raises(KeyError, match='must be same as name'): - s.reset_index('wrong') + with pytest.raises(KeyError, match="must be same as name"): + s.reset_index("wrong", drop=True) + with pytest.raises(KeyError, match="must be same as name"): + s.reset_index("wrong") # KeyError raised for series when level to be dropped is missing s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2)) - with pytest.raises(KeyError, match='not found'): - s.reset_index('wrong', drop=True) + with pytest.raises(KeyError, 
match="not found"): + s.reset_index("wrong", drop=True) def test_droplevel(self): # GH20342 ser = Series([1, 2, 3, 4]) - ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)], - names=['a', 'b']) - expected = ser.reset_index('b', drop=True) - result = ser.droplevel('b', axis='index') + ser.index = MultiIndex.from_arrays( + [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] + ) + expected = ser.reset_index("b", drop=True) + result = ser.droplevel("b", axis="index") tm.assert_series_equal(result, expected) # test that droplevel raises ValueError on axis != 0 with pytest.raises(ValueError): - ser.droplevel(1, axis='columns') + ser.droplevel(1, axis="columns") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e48fd9ce11a7d4..89b411a284563d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -9,45 +9,60 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Series, date_range, isna, notna) + Categorical, + CategoricalIndex, + DataFrame, + Series, + date_range, + isna, + notna, +) from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) class TestSeriesAnalytics: - def test_describe(self): - s = Series([0, 1, 2, 3, 4], name='int_data') + s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() - expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4], - name='int_data', - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = Series( + [5, 2, s.std(), 0, 1, 2, 3, 4], + name="int_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_series_equal(result, expected) - s = Series([True, True, False, False, False], name='bool_data') + s = Series([True, True, False, False, False], name="bool_data") result = s.describe() - expected = Series([5, 2, False, 3], name='bool_data', - index=['count', 'unique', 'top', 'freq']) + expected = Series( + [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] + ) tm.assert_series_equal(result, expected) - s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data') + s = Series(["a", "a", "b", "c", "d"], name="str_data") result = s.describe() - expected = Series([5, 4, 'a', 2], name='str_data', - index=['count', 'unique', 'top', 'freq']) + expected = Series( + [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] + ) tm.assert_series_equal(result, expected) def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 s = pd.Series([None, None], dtype=object) result = s.describe() - expected = pd.Series([0, 0, np.nan, np.nan], dtype=object, - index=['count', 'unique', 'top', 'freq']) + expected = pd.Series( + [0, 0, np.nan, np.nan], + dtype=object, + index=["count", "unique", "top", "freq"], + ) tm.assert_series_equal(result, expected) result = s[:0].describe() @@ -65,61 +80,67 @@ def test_describe_with_tz(self, tz_naive_fixture): s = Series(date_range(start, end, tz=tz), name=name) result = s.describe() expected = Series( - [5, 5, s.value_counts().index[0], 1, start.tz_localize(tz), - end.tz_localize(tz) - ], + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + 
end.tz_localize(tz), + ], name=name, - index=['count', 'unique', 'top', 'freq', 'first', 'last'] + index=["count", "unique", "top", "freq", "first", "last"], ) tm.assert_series_equal(result, expected) def test_argsort(self, datetime_series): - self._check_accum_op('argsort', datetime_series, check_dtype=False) + self._check_accum_op("argsort", datetime_series, check_dtype=False) argsorted = datetime_series.argsort() assert issubclass(argsorted.dtype.type, np.integer) # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)]) - assert s.dtype == 'datetime64[ns]' + s = Series([Timestamp("201301%02d" % (i + 1)) for i in range(5)]) + assert s.dtype == "datetime64[ns]" shifted = s.shift(-1) - assert shifted.dtype == 'datetime64[ns]' + assert shifted.dtype == "datetime64[ns]" assert isna(shifted[4]) result = s.argsort() - expected = Series(range(5), dtype='int64') + expected = Series(range(5), dtype="int64") assert_series_equal(result, expected) result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype='int64') + expected = Series(list(range(4)) + [-1], dtype="int64") assert_series_equal(result, expected) def test_argsort_stable(self): s = Series(np.random.randint(0, 100, size=10000)) - mindexer = s.argsort(kind='mergesort') + mindexer = s.argsort(kind="mergesort") qindexer = s.argsort() - mexpected = np.argsort(s.values, kind='mergesort') - qexpected = np.argsort(s.values, kind='quicksort') + mexpected = np.argsort(s.values, kind="mergesort") + qexpected = np.argsort(s.values, kind="quicksort") - tm.assert_series_equal(mindexer, Series(mexpected), - check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), - check_dtype=False) - msg = (r"ndarray Expected type ," - r" found instead") + tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) + tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + msg = ( + r"ndarray Expected type ," + r" found instead" + ) with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(qindexer, mindexer) def test_cumsum(self, datetime_series): - self._check_accum_op('cumsum', datetime_series) + self._check_accum_op("cumsum", datetime_series) def test_cumprod(self, datetime_series): - self._check_accum_op('cumprod', datetime_series) + self._check_accum_op("cumprod", datetime_series) def test_cummin(self, datetime_series): - tm.assert_numpy_array_equal(datetime_series.cummin().values, - np.minimum - .accumulate(np.array(datetime_series))) + tm.assert_numpy_array_equal( + datetime_series.cummin().values, + np.minimum.accumulate(np.array(datetime_series)), + ) ts = datetime_series.copy() ts[::2] = np.NaN result = ts.cummin()[1::2] @@ -128,9 +149,10 @@ def test_cummin(self, datetime_series): tm.assert_series_equal(result, expected) def test_cummax(self, datetime_series): - tm.assert_numpy_array_equal(datetime_series.cummax().values, - np.maximum - .accumulate(np.array(datetime_series))) + tm.assert_numpy_array_equal( + datetime_series.cummax().values, + np.maximum.accumulate(np.array(datetime_series)), + ) ts = datetime_series.copy() ts[::2] = np.NaN result = ts.cummax()[1::2] @@ -139,90 +161,75 @@ def test_cummax(self, datetime_series): tm.assert_series_equal(result, expected) def test_cummin_datetime64(self): - s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1', - 'NaT', '2000-1-3'])) + s = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) + ) - expected = 
pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', - '2000-1-1', 'NaT', '2000-1-1'])) + expected = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"]) + ) result = s.cummin(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_datetime( - ['NaT', '2000-1-2', '2000-1-2', '2000-1-1', '2000-1-1', '2000-1-1' - ])) + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] + ) + ) result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) def test_cummax_datetime64(self): - s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1', - 'NaT', '2000-1-3'])) + s = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) + ) - expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', - '2000-1-2', 'NaT', '2000-1-3'])) + expected = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"]) + ) result = s.cummax(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_datetime( - ['NaT', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-3' - ])) + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] + ) + ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) def test_cummin_timedelta64(self): - s = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '3 min', ])) - - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '1 min', ])) + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]) + ) result = s.cummin(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - '2 min', - '1 min', - '1 min', - '1 min', ])) + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]) + ) result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) def test_cummax_timedelta64(self): - s = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '3 min', ])) - - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '2 min', - 'NaT', - '3 min', ])) + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]) + ) result = s.cummax(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - '2 min', - '2 min', - '2 min', - '3 min', ])) + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]) + ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) def test_npdiff(self): - pytest.skip("skipping due to Series no longer being an " - "ndarray") + pytest.skip("skipping due to Series no longer being an " "ndarray") # no longer works as the return type of np.diff is now nd.array s = Series(np.arange(5)) @@ -232,9 +239,11 @@ def test_npdiff(self): def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) - tm.assert_numpy_array_equal(func(datetime_series_).values, - func(np.array(datetime_series_)), - check_dtype=check_dtype) + tm.assert_numpy_array_equal( + func(datetime_series_).values, + func(np.array(datetime_series_)), + check_dtype=check_dtype, + 
) # with missing values ts = datetime_series_.copy() @@ -243,25 +252,20 @@ def _check_accum_op(self, name, datetime_series_, check_dtype=True): result = func(ts)[1::2] expected = func(np.array(ts.dropna())) - tm.assert_numpy_array_equal(result.values, expected, - check_dtype=False) + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) def test_compress(self): cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], - index=list('abcde'), name='foo') - expected = Series(s.values.compress(cond), - index=list('ac'), name='foo') + s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") + expected = Series(s.values.compress(cond), index=list("ac"), name="foo") with tm.assert_produces_warning(FutureWarning): result = s.compress(cond) tm.assert_series_equal(result, expected) def test_numpy_compress(self): cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], - index=list('abcde'), name='foo') - expected = Series(s.values.compress(cond), - index=list('ac'), name='foo') + s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") + expected = Series(s.values.compress(cond), index=list("ac"), name="foo") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(np.compress(cond, s), expected) @@ -277,8 +281,9 @@ def test_numpy_compress(self): def test_round(self, datetime_series): datetime_series.index.name = "index_name" result = datetime_series.round(2) - expected = Series(np.round(datetime_series.values, 2), - index=datetime_series.index, name='ts') + expected = Series( + np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" + ) assert_series_equal(result, expected) assert result.name == datetime_series.name @@ -286,7 +291,7 @@ def test_numpy_round(self): # See gh-12600 s = Series([1.53, 1.36, 0.06]) out = np.round(s, decimals=0) - expected = Series([2., 1., 0.]) + expected = Series([2.0, 1.0, 0.0]) assert_series_equal(out, expected) msg = "the 'out' parameter is not supported" @@ -298,13 +303,13 @@ def test_numpy_round_nan(self): s = Series([1.53, np.nan, 0.06]) with tm.assert_produces_warning(None): result = s.round() - expected = Series([2., np.nan, 0.]) + expected = Series([2.0, np.nan, 0.0]) assert_series_equal(result, expected) def test_built_in_round(self): s = Series([1.123, 2.123, 3.123], index=range(3)) result = round(s) - expected_rounded0 = Series([1., 2., 3.], index=range(3)) + expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3)) tm.assert_series_equal(result, expected_rounded0) decimals = 2 @@ -313,7 +318,7 @@ def test_built_in_round(self): tm.assert_series_equal(result, expected_rounded) def test_prod_numpy16_bug(self): - s = Series([1., 1., 1.], index=range(3)) + s = Series([1.0, 1.0, 1.0], index=range(3)) result = s.prod() assert not isinstance(result, Series) @@ -326,11 +331,9 @@ def test_corr(self, datetime_series): tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) # partial overlap - tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), - 1) + tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1) - assert isna(datetime_series[:15].corr(datetime_series[5:], - min_periods=12)) + assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12)) ts1 = datetime_series[:15].reindex(datetime_series.index) ts2 = datetime_series[5:].reindex(datetime_series.index) @@ -358,75 +361,100 @@ def test_corr_rank(self): A = tm.makeTimeSeries() B = tm.makeTimeSeries() A[-5:] = A[:5] - result = A.corr(B, 
method='kendall') + result = A.corr(B, method="kendall") expected = stats.kendalltau(A, B)[0] tm.assert_almost_equal(result, expected) - result = A.corr(B, method='spearman') + result = A.corr(B, method="spearman") expected = stats.spearmanr(A, B)[0] tm.assert_almost_equal(result, expected) # results from R A = Series( - [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310, - - 0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606]) + [ + -0.89926396, + 0.94209606, + -1.03289164, + -0.95445587, + 0.76910310, + -0.06430576, + -2.09704447, + 0.40660407, + -0.89926396, + 0.94209606, + ] + ) B = Series( - [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292, - 1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375]) + [ + -1.01270225, + -0.62210117, + -1.56895827, + 0.59592943, + -0.01680292, + 1.17258718, + -1.06009347, + -0.10222060, + -0.89076239, + 0.89372375, + ] + ) kexp = 0.4319297 sexp = 0.5853767 - tm.assert_almost_equal(A.corr(B, method='kendall'), kexp) - tm.assert_almost_equal(A.corr(B, method='spearman'), sexp) + tm.assert_almost_equal(A.corr(B, method="kendall"), kexp) + tm.assert_almost_equal(A.corr(B, method="spearman"), sexp) def test_corr_invalid_method(self): # GH PR #22298 s1 = pd.Series(np.random.randn(10)) s2 = pd.Series(np.random.randn(10)) - msg = ("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, ") + msg = ( + "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " + ) with pytest.raises(ValueError, match=msg): s1.corr(s2, method="____") def test_corr_callable_method(self, datetime_series): # simple correlation example # returns 1 if exact equality, 0 otherwise - my_corr = lambda a, b: 1. if (a == b).all() else 0. + my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0 # simple example s1 = Series([1, 2, 3, 4, 5]) s2 = Series([5, 4, 3, 2, 1]) expected = 0 - tm.assert_almost_equal( - s1.corr(s2, method=my_corr), - expected) + tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected) # full overlap - tm.assert_almost_equal(datetime_series.corr( - datetime_series, method=my_corr), 1.) + tm.assert_almost_equal( + datetime_series.corr(datetime_series, method=my_corr), 1.0 + ) # partial overlap - tm.assert_almost_equal(datetime_series[:15].corr( - datetime_series[5:], method=my_corr), 1.) 
+ tm.assert_almost_equal( + datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0 + ) # No overlap - assert np.isnan(datetime_series[::2].corr( - datetime_series[1::2], method=my_corr)) + assert np.isnan( + datetime_series[::2].corr(datetime_series[1::2], method=my_corr) + ) # dataframe example df = pd.DataFrame([s1, s2]) - expected = pd.DataFrame([ - {0: 1., 1: 0}, {0: 0, 1: 1.}]) - tm.assert_almost_equal( - df.transpose().corr(method=my_corr), expected) + expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) + tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) def test_cov(self, datetime_series): # full overlap - tm.assert_almost_equal(datetime_series.cov(datetime_series), - datetime_series.std() ** 2) + tm.assert_almost_equal( + datetime_series.cov(datetime_series), datetime_series.std() ** 2 + ) # partial overlap - tm.assert_almost_equal(datetime_series[:15].cov(datetime_series[5:]), - datetime_series[5:15].std() ** 2) + tm.assert_almost_equal( + datetime_series[:15].cov(datetime_series[5:]), + datetime_series[5:15].std() ** 2, + ) # No overlap assert np.isnan(datetime_series[::2].cov(datetime_series[1::2])) @@ -437,8 +465,7 @@ def test_cov(self, datetime_series): assert isna(cp.cov(cp)) # min_periods - assert isna(datetime_series[:15].cov(datetime_series[5:], - min_periods=12)) + assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12)) ts1 = datetime_series[:15].reindex(datetime_series.index) ts2 = datetime_series[5:].reindex(datetime_series.index) @@ -451,7 +478,7 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]]) + mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, nan, 1, 2]]) ts = Series(np.arange(len(mi)), index=mi) left = ts.count(level=1) @@ -462,12 +489,13 @@ def test_count(self, datetime_series): assert_series_equal(ts.count(level=1), right - 1) def test_dot(self): - a = Series(np.random.randn(4), index=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], - columns=['p', 'q', 'r', 's']).T + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T result = a.dot(b) - expected = Series(np.dot(a.values, b.values), index=['1', '2', '3']) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # Check index alignment @@ -478,11 +506,11 @@ def test_dot(self): # Check ndarray argument result = a.dot(b.values) assert np.all(result == expected.values) - assert_almost_equal(a.dot(b['2'].values), expected['2']) + assert_almost_equal(a.dot(b["2"].values), expected["2"]) # Check series argument - assert_almost_equal(a.dot(b['1']), expected['1']) - assert_almost_equal(a.dot(b2['1']), expected['1']) + assert_almost_equal(a.dot(b["1"]), expected["1"]) + assert_almost_equal(a.dot(b2["1"]), expected["1"]) msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" # exception raised is of type Exception @@ -494,19 +522,19 @@ def test_dot(self): def test_matmul(self): # matmul test is for GH #10259 - a = Series(np.random.randn(4), index=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], - columns=['p', 'q', 'r', 's']).T + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T # Series @ DataFrame 
-> Series result = operator.matmul(a, b) - expected = Series(np.dot(a.values, b.values), index=['1', '2', '3']) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # DataFrame @ Series -> Series result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # Series @ Series -> scalar @@ -539,17 +567,15 @@ def test_matmul(self): assert_almost_equal(result, expected) # mixed dtype DataFrame @ Series - a['p'] = int(a.p) + a["p"] = int(a.p) result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # different dtypes DataFrame @ Series a = a.astype(int) result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" @@ -578,9 +604,11 @@ def test_clip(self, datetime_series): def test_clip_types_and_nulls(self): - sers = [Series([np.nan, 1.0, 2.0, 3.0]), Series([None, 'a', 'b', 'c']), - Series(pd.to_datetime( - [np.nan, 1, 2, 3], unit='D'))] + sers = [ + Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, "a", "b", "c"]), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), + ] for s in sers: thresh = s[2] @@ -599,14 +627,11 @@ def test_clip_with_na_args(self): s = Series([1, 2, 3]) assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - assert_series_equal(s.clip(upper=np.nan, lower=np.nan), - Series([1, 2, 3])) + assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH #19992 - assert_series_equal(s.clip(lower=[0, 4, np.nan]), - Series([1, 4, np.nan])) - assert_series_equal(s.clip(upper=[1, np.nan, 1]), - Series([1, np.nan, 1])) + assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) + assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) def test_clip_against_series(self): # GH #6966 @@ -615,11 +640,9 @@ def test_clip_against_series(self): threshold = Series([1.0, 2.0, 3.0]) with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.clip_lower(threshold), - Series([1.0, 2.0, 4.0])) + assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.clip_upper(threshold), - Series([1.0, 1.0, 3.0])) + assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) @@ -644,20 +667,28 @@ def test_clip_with_datetimes(self): # GH 11838 # naive and tz-aware datetimes - t = Timestamp('2015-12-01 09:30:30') - s = Series([Timestamp('2015-12-01 09:30:00'), - Timestamp('2015-12-01 09:31:00')]) + t = Timestamp("2015-12-01 09:30:30") + s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) result = s.clip(upper=t) - expected = Series([Timestamp('2015-12-01 09:30:00'), - Timestamp('2015-12-01 09:30:30')]) + expected = Series( + [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] + ) assert_series_equal(result, expected) - t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern') - s = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'), - Timestamp('2015-12-01 
09:31:00', tz='US/Eastern')]) + t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") + s = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), + ] + ) result = s.clip(upper=t) - expected = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'), - Timestamp('2015-12-01 09:30:30', tz='US/Eastern')]) + expected = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), + ] + ) assert_series_equal(result, expected) def test_cummethods_bool(self): @@ -667,10 +698,12 @@ def test_cummethods_bool(self): b = ~a c = pd.Series([False] * len(b)) d = ~c - methods = {'cumsum': np.cumsum, - 'cumprod': np.cumprod, - 'cummin': np.minimum.accumulate, - 'cummax': np.maximum.accumulate} + methods = { + "cumsum": np.cumsum, + "cumprod": np.cumprod, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + } args = product((a, b, c, d), methods) for s, method in args: expected = Series(methods[method](s.values)) @@ -682,19 +715,16 @@ def test_cummethods_bool(self): cpe = pd.Series([False, 0, nan, 0]) cmin = pd.Series([False, False, nan, False]) cmax = pd.Series([False, True, nan, True]) - expecteds = {'cumsum': cse, - 'cumprod': cpe, - 'cummin': cmin, - 'cummax': cmax} + expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} for method in methods: res = getattr(e, method)() assert_series_equal(res, expecteds[method]) def test_isin(self): - s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - result = s.isin(['A', 'C']) + result = s.isin(["A", "C"]) expected = Series([True, False, True, False, False, False, True, True]) assert_series_equal(result, expected) @@ -702,25 +732,26 @@ def test_isin(self): # This specific issue has to have a series over 1e6 in len, but the # comparison array (in_list) must be large enough so that numpy doesn't # do a manual masking trick that will avoid this issue altogether - s = Series(list('abcdefghijk' * 10 ** 5)) + s = Series(list("abcdefghijk" * 10 ** 5)) # If numpy doesn't do the manual comparison/mask, these # unorderable mixed types are what cause the exception in numpy - in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', - 'K', 'E', 'S', 'I', 'R', 'R'] * 6 + in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 assert s.isin(in_list).sum() == 200000 def test_isin_with_string_scalar(self): # GH4763 - s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) - msg = (r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]") + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[str\]" + ) with pytest.raises(TypeError, match=msg): - s.isin('a') + s.isin("a") - s = Series(['aaa', 'b', 'c']) + s = Series(["aaa", "b", "c"]) with pytest.raises(TypeError, match=msg): - s.isin('aaa') + s.isin("aaa") def test_isin_with_i8(self): # GH 5021 @@ -729,7 +760,7 @@ def test_isin_with_i8(self): expected2 = Series([False, True, False, False, False]) # datetime64[ns] - s = Series(date_range('jan-01-2013', 'jan-05-2013')) + s = Series(date_range("jan-01-2013", "jan-05-2013")) result = s.isin(s[0:2]) assert_series_equal(result, expected) @@ -738,7 +769,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # fails on dtype conversion in the first place - result = s.isin(s[0:2].values.astype('datetime64[D]')) + result = 
s.isin(s[0:2].values.astype("datetime64[D]")) assert_series_equal(result, expected) result = s.isin([s[1]]) @@ -751,7 +782,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit='d')) + s = Series(pd.to_timedelta(range(5), unit="d")) result = s.isin(s[0:2]) assert_series_equal(result, expected) @@ -778,39 +809,35 @@ def test_ptp(self): assert s.ptp() == 13 assert pd.isna(s.ptp(skipna=False)) - mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]]) + mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]]) s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi) - expected = pd.Series([6, 2], index=['a', 'b'], dtype=np.float64) + expected = pd.Series([6, 2], index=["a", "b"], dtype=np.float64) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0), expected) - expected = pd.Series([np.nan, np.nan], index=['a', 'b']) + expected = pd.Series([np.nan, np.nan], index=["a", "b"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp(axis=1) - s = pd.Series(['a', 'b', 'c', 'd', 'e']) + s = pd.Series(["a", "b", "c", "d", "e"]) msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp() msg = r"Series\.ptp does not implement numeric_only\." 
with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp(numeric_only=True) def test_repeat(self): - s = Series(np.random.randn(3), index=['a', 'b', 'c']) + s = Series(np.random.randn(3), index=["a", "b", "c"]) reps = s.repeat(5) exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) @@ -818,14 +845,12 @@ def test_repeat(self): to_rep = [2, 3, 4] reps = s.repeat(to_rep) - exp = Series(s.values.repeat(to_rep), - index=s.index.values.repeat(to_rep)) + exp = Series(s.values.repeat(to_rep), index=s.index.values.repeat(to_rep)) assert_series_equal(reps, exp) def test_numpy_repeat(self): - s = Series(np.arange(3), name='x') - expected = Series(s.values.repeat(2), name='x', - index=s.index.values.repeat(2)) + s = Series(np.arange(3), name="x") + expected = Series(s.values.repeat(2), name="x", index=s.index.values.repeat(2)) assert_series_equal(np.repeat(s, 2), expected) msg = "the 'axis' parameter is not supported" @@ -835,11 +860,11 @@ def test_numpy_repeat(self): def test_searchsorted(self): s = Series([1, 2, 3]) - result = s.searchsorted(1, side='left') + result = s.searchsorted(1, side="left") assert is_scalar(result) assert result == 0 - result = s.searchsorted(1, side='right') + result = s.searchsorted(1, side="right") assert is_scalar(result) assert result == 1 @@ -860,15 +885,15 @@ def test_searchsorted_numeric_dtypes_vector(self): tm.assert_numpy_array_equal(r, e) def test_search_sorted_datetime64_scalar(self): - s = Series(pd.date_range('20120101', periods=10, freq='2D')) - v = pd.Timestamp('20120102') + s = Series(pd.date_range("20120101", periods=10, freq="2D")) + v = pd.Timestamp("20120102") r = s.searchsorted(v) assert is_scalar(r) assert r == 1 def test_search_sorted_datetime64_list(self): - s = Series(pd.date_range('20120101', periods=10, freq='2D')) - v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')] + s = Series(pd.date_range("20120101", periods=10, freq="2D")) + v = [pd.Timestamp("20120102"), pd.Timestamp("20120104")] r = s.searchsorted(v) e = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) @@ -890,7 +915,7 @@ def test_is_monotonic(self): s = Series(np.arange(1000, 0, -1)) assert s.is_monotonic_decreasing is True - s = Series(pd.date_range('20130101', periods=10)) + s = Series(pd.date_range("20130101", periods=10)) assert s.is_monotonic is True assert s.is_monotonic_increasing is True s = Series(list(reversed(s.tolist()))) @@ -898,38 +923,36 @@ def test_is_monotonic(self): assert s.is_monotonic_decreasing is True def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] - res = s.sort_index(level='A') + res = s.sort_index(level="A") assert_series_equal(backwards, res) - res = s.sort_index(level=['A', 'B']) + res = s.sort_index(level=["A", "B"]) assert_series_equal(backwards, res) - res = s.sort_index(level='A', sort_remaining=False) + res = s.sort_index(level="A", sort_remaining=False) assert_series_equal(s, res) - res = s.sort_index(level=['A', 'B'], sort_remaining=False) + res = s.sort_index(level=["A", "B"], sort_remaining=False) assert_series_equal(s, res) def test_apply_categorical(self): - values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'), - ordered=True) - s = pd.Series(values, name='XX', 
index=list('abcdefg')) + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = pd.Series(values, name="XX", index=list("abcdefg")) result = s.apply(lambda x: x.lower()) # should be categorical dtype when the number of categories are # the same - values = pd.Categorical(list('abbabcd'), categories=list('dcba'), - ordered=True) - exp = pd.Series(values, name='XX', index=list('abcdefg')) + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = pd.Series(values, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) tm.assert_categorical_equal(result.values, exp.values) - result = s.apply(lambda x: 'A') - exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg')) + result = s.apply(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) assert result.dtype == np.object @@ -941,7 +964,7 @@ def test_shift_int(self, datetime_series): def test_shift_categorical(self): # GH 9416 - s = pd.Series(['a', 'b', 'c', 'd'], dtype='category') + s = pd.Series(["a", "b", "c", "d"], dtype="category") assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) @@ -961,29 +984,35 @@ def test_shift_categorical(self): def test_unstack(self): from numpy import nan - index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], - codes=[[1, 1, 0, 0], [0, 1, 0, 2]]) + index = MultiIndex( + levels=[["bar", "foo"], ["one", "three", "two"]], + codes=[[1, 1, 0, 0], [0, 1, 0, 2]], + ) - s = Series(np.arange(4.), index=index) + s = Series(np.arange(4.0), index=index) unstacked = s.unstack() - expected = DataFrame([[2., nan, 3.], [0., 1., nan]], - index=['bar', 'foo'], - columns=['one', 'three', 'two']) + expected = DataFrame( + [[2.0, nan, 3.0], [0.0, 1.0, nan]], + index=["bar", "foo"], + columns=["one", "three", "two"], + ) assert_frame_equal(unstacked, expected) unstacked = s.unstack(level=0) assert_frame_equal(unstacked, expected.T) - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) s = Series(np.random.randn(6), index=index) - exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) - expected = DataFrame({'bar': s.values}, - index=exp_index).sort_index(level=0) + exp_index = MultiIndex( + levels=[["one", "two", "three"], [0, 1]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) unstacked = s.unstack(0).sort_index() assert_frame_equal(unstacked, expected) @@ -991,87 +1020,97 @@ def test_unstack(self): idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1, 2], index=idx) left = ts.unstack() - right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], - columns=[nan, 3.5]) + right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5]) assert_frame_equal(left, right) - idx = pd.MultiIndex.from_arrays([['cat', 'cat', 'cat', 'dog', 'dog' - ], ['a', 'a', 'b', 'a', 'b'], - [1, 2, 1, 1, np.nan]]) + idx = pd.MultiIndex.from_arrays( + [ + ["cat", "cat", "cat", "dog", "dog"], + ["a", "a", "b", "a", "b"], + [1, 2, 1, 1, np.nan], + ] + ) ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) - right = DataFrame([[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], - 
columns=['cat', 'dog']) - tpls = [('a', 1), ('a', 2), ('b', nan), ('b', 1)] + right = DataFrame( + [[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], columns=["cat", "dog"] + ) + tpls = [("a", 1), ("a", 2), ("b", nan), ("b", 1)] right.index = pd.MultiIndex.from_tuples(tpls) assert_frame_equal(ts.unstack(level=0), right) def test_value_counts_datetime(self): # most dtypes are tested in test_base.py - values = [pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00'), - pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 11:00')] - - exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00', - '2011-01-01 10:00']) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') - - s = pd.Series(values, name='xxx') + values = [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 11:00"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check DatetimeIndex outputs the same result - idx = pd.DatetimeIndex(values, name='xxx') + idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_datetime_tz(self): - values = [pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 10:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 11:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 11:00', tz='US/Eastern')] - - exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00', - '2011-01-01 10:00'], tz='US/Eastern') - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') - - s = pd.Series(values, name='xxx') + values = [ + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + tz="US/Eastern", + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) - idx = pd.DatetimeIndex(values, name='xxx') + idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_period(self): - values = [pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-01', freq='M'), - pd.Period('2011-01', freq='M'), - 
pd.Period('2011-03', freq='M')] + values = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-03", freq="M"), + ] - exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M') - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check DatetimeIndex outputs the same result - idx = pd.PeriodIndex(values, name='xxx') + idx = pd.PeriodIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -1079,72 +1118,71 @@ def test_value_counts_categorical_ordered(self): # most dtypes are tested in test_base.py values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], - ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name='xxx') + idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_categorical_not_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], - ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name='xxx') + idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @pytest.mark.parametrize("func", [np.any, np.all]) - @pytest.mark.parametrize("kwargs", [ - dict(keepdims=True), - dict(out=object()), - ]) + @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) @td.skip_if_np_lt("1.15") def test_validate_any_all_out_keepdims_raises(self, kwargs, func): s = pd.Series([1, 2]) param = list(kwargs)[0] name = func.__name__ - msg = (r"the '{arg}' parameter is not " - r"supported in 
the pandas " - r"implementation of {fname}\(\)").format(arg=param, fname=name) + msg = ( + r"the '{arg}' parameter is not " + r"supported in the pandas " + r"implementation of {fname}\(\)" + ).format(arg=param, fname=name) with pytest.raises(ValueError, match=msg): func(s, **kwargs) @td.skip_if_np_lt("1.15") def test_validate_sum_initial(self): s = pd.Series([1, 2]) - msg = (r"the 'initial' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)") + msg = ( + r"the 'initial' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) with pytest.raises(ValueError, match=msg): np.sum(s, initial=10) def test_validate_median_initial(self): s = pd.Series([1, 2]) - msg = (r"the 'overwrite_input' parameter is not " - r"supported in the pandas " - r"implementation of median\(\)") + msg = ( + r"the 'overwrite_input' parameter is not " + r"supported in the pandas " + r"implementation of median\(\)" + ) with pytest.raises(ValueError, match=msg): # It seems like np.median doesn't dispatch, so we use the # method instead of the ufunc. @@ -1153,36 +1191,38 @@ def test_validate_median_initial(self): @td.skip_if_np_lt("1.15") def test_validate_stat_keepdims(self): s = pd.Series([1, 2]) - msg = (r"the 'keepdims' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)") + msg = ( + r"the 'keepdims' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) def test_compound_deprecated(self): - s = Series([.1, .2, .3, .4]) + s = Series([0.1, 0.2, 0.3, 0.4]) with tm.assert_produces_warning(FutureWarning): s.compound() - df = pd.DataFrame({'s': s}) + df = pd.DataFrame({"s": s}) with tm.assert_produces_warning(FutureWarning): df.compound() main_dtypes = [ - 'datetime', - 'datetimetz', - 'timedelta', - 'int8', - 'int16', - 'int32', - 'int64', - 'float32', - 'float64', - 'uint8', - 'uint16', - 'uint32', - 'uint64' + "datetime", + "datetimetz", + "timedelta", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", ] @@ -1199,19 +1239,27 @@ def s_main_dtypes(): The columns are the name of the dtype. 
""" df = pd.DataFrame( - {'datetime': pd.to_datetime(['2003', '2002', - '2001', '2002', - '2005']), - 'datetimetz': pd.to_datetime( - ['2003', '2002', - '2001', '2002', - '2005']).tz_localize('US/Eastern'), - 'timedelta': pd.to_timedelta(['3d', '2d', '1d', - '2d', '5d'])}) - - for dtype in ['int8', 'int16', 'int32', 'int64', - 'float32', 'float64', - 'uint8', 'uint16', 'uint32', 'uint64']: + { + "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + "datetimetz": pd.to_datetime( + ["2003", "2002", "2001", "2002", "2005"] + ).tz_localize("US/Eastern"), + "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + } + ) + + for dtype in [ + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", + ]: df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) return df @@ -1227,25 +1275,27 @@ def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) result = getattr(s, method)(3) - expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] expected = s.loc[expected_idxr] tm.assert_series_equal(result, expected) class TestNLargestNSmallest: - @pytest.mark.parametrize( - "r", [Series([3., 2, 1, 2, '5'], dtype='object'), - Series([3., 2, 1, 2, 5], dtype='object'), - # not supported on some archs - # Series([3., 2, 1, 2, 5], dtype='complex256'), - Series([3., 2, 1, 2, 5], dtype='complex128'), - Series(list('abcde')), - Series(list('abcde'), dtype='category')]) + "r", + [ + Series([3.0, 2, 1, 2, "5"], dtype="object"), + Series([3.0, 2, 1, 2, 5], dtype="object"), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3.0, 2, 1, 2, 5], dtype="complex128"), + Series(list("abcde")), + Series(list("abcde"), dtype="category"), + ], + ) def test_error(self, r): dt = r.dtype - msg = ("Cannot use method 'n(larg|small)est' with " - "dtype {dt}".format(dt=dt)) + msg = "Cannot use method 'n(larg|small)est' with " "dtype {dt}".format(dt=dt) args = 2, len(r), 0, -1 methods = r.nlargest, r.nsmallest for method, arg in product(methods, args): @@ -1258,7 +1308,7 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split): s = s_main_dtypes_split assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) + assert_series_equal(s.nsmallest(2, keep="last"), s.iloc[[2, 3]]) empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) @@ -1269,20 +1319,19 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split): assert_series_equal(s.nsmallest(len(s)), s.sort_values()) assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - assert_series_equal(s.nlargest(len(s) + 1), - s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) def test_misc(self): - s = Series([3., np.nan, 1, 2, 5]) + s = Series([3.0, np.nan, 1, 2, 5]) assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) msg = 'keep must be either "first", "last"' with pytest.raises(ValueError, match=msg): - s.nsmallest(keep='invalid') + s.nsmallest(keep="invalid") with pytest.raises(ValueError, match=msg): - s.nlargest(keep='invalid') + s.nlargest(keep="invalid") # GH 15297 s = Series([1] * 5, index=[1, 2, 3, 4, 5]) @@ -1292,16 +1341,16 @@ def test_misc(self): result = s.nsmallest(3) 
assert_series_equal(result, expected_first) - result = s.nsmallest(3, keep='last') + result = s.nsmallest(3, keep="last") assert_series_equal(result, expected_last) result = s.nlargest(3) assert_series_equal(result, expected_first) - result = s.nlargest(3, keep='last') + result = s.nlargest(3, keep="last") assert_series_equal(result, expected_last) - @pytest.mark.parametrize('n', range(1, 5)) + @pytest.mark.parametrize("n", range(1, 5)) def test_n(self, n): # GH 13412 @@ -1325,17 +1374,16 @@ def test_boundary_float(self, nselect_method, float_dtype): # GH 21426 dtype_info = np.finfo(float_dtype) min_val, max_val = dtype_info.min, dtype_info.max - min_2nd, max_2nd = np.nextafter( - [min_val, max_val], 0, dtype=float_dtype) + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) vals = [min_val, min_2nd, max_2nd, max_val] assert_check_nselect_boundary(vals, float_dtype, nselect_method) - @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_boundary_datetimelike(self, nselect_method, dtype): # GH 21426 # use int64 bounds and +1 to min_val since true minimum is NaT # (include min_val/NaT at end to maintain same expected_idxr) - dtype_info = np.iinfo('int64') + dtype_info = np.iinfo("int64") min_val, max_val = dtype_info.min, dtype_info.max vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] assert_check_nselect_boundary(vals, dtype, nselect_method) @@ -1343,17 +1391,17 @@ def test_boundary_datetimelike(self, nselect_method, dtype): def test_duplicate_keep_all_ties(self): # see gh-16818 s = Series([10, 9, 8, 7, 7, 7, 7, 6]) - result = s.nlargest(4, keep='all') + result = s.nlargest(4, keep="all") expected = Series([10, 9, 8, 7, 7, 7, 7]) assert_series_equal(result, expected) - result = s.nsmallest(2, keep='all') + result = s.nsmallest(2, keep="all") expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) assert_series_equal(result, expected) - @pytest.mark.parametrize('data,expected', - [([True, False], [True]), - ([True, False, True, True], [True])]) + @pytest.mark.parametrize( + "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] + ) def test_boolean(self, data, expected): # GH 26154 : ensure True > False s = Series(data) @@ -1363,35 +1411,37 @@ def test_boolean(self, data, expected): class TestCategoricalSeriesAnalytics: - def test_count(self): - s = Series(Categorical([np.nan, 1, 2, np.nan], - categories=[5, 4, 3, 2, 1], ordered=True)) + s = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) result = s.count() assert result == 2 def test_value_counts(self): # GH 12835 - cats = Categorical(list('abcccb'), categories=list('cabd')) - s = Series(cats, name='xxx') + cats = Categorical(list("abcccb"), categories=list("cabd")) + s = Series(cats, name="xxx") res = s.value_counts(sort=False) - exp_index = CategoricalIndex(list('cabd'), categories=cats.categories) - exp = Series([3, 1, 2, 0], name='xxx', index=exp_index) + exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) tm.assert_series_equal(res, exp) res = s.value_counts(sort=True) - exp_index = CategoricalIndex(list('cbad'), categories=cats.categories) - exp = Series([3, 2, 1, 0], name='xxx', index=exp_index) + exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) 
tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in test_base.py) - s = Series(["a", "b", "c", "c", "c", "b"], name='xxx') + s = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = s.value_counts() - exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"]) + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) tm.assert_series_equal(res, exp) def test_value_counts_with_nan(self): @@ -1410,8 +1460,9 @@ def test_value_counts_with_nan(self): # same Series via two different constructions --> same behaviour series = [ Series(["a", "b", None, "a", None, None], dtype="category"), - Series(Categorical(["a", "b", None, "a", None, None], - categories=["a", "b"])) + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), ] for s in series: @@ -1433,18 +1484,23 @@ def test_value_counts_with_nan(self): @pytest.mark.parametrize( "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", - pytest.param("datetime64[D]", - marks=pytest.mark.xfail(reason="GH#7996", strict=False))] + [ + "int_", + "uint", + "float_", + "unicode_", + "timedelta64[h]", + pytest.param( + "datetime64[D]", marks=pytest.mark.xfail(reason="GH#7996", strict=False) + ), + ], ) - def test_drop_duplicates_categorical_non_bool(self, dtype, - ordered_fixture): + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) # Test case 1 input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, - ordered=ordered_fixture)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) expected = Series([False, False, False, True]) tm.assert_series_equal(tc1.duplicated(), expected) @@ -1454,11 +1510,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep='last'), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep='last'), - tc1[~expected]) + tm.assert_series_equal(tc1.duplicated(keep="last"), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) sc = tc1.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, True]) @@ -1470,9 +1525,7 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical( - input2, categories=cat_array, ordered=ordered_fixture) - ) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) @@ -1482,11 +1535,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep='last'), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep='last'), - tc2[~expected]) + tm.assert_series_equal(tc2.duplicated(keep="last"), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) sc = tc2.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc2[~expected]) expected = 
Series([False, True, True, False, True, True, False]) @@ -1497,9 +1549,13 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc2[~expected]) def test_drop_duplicates_categorical_bool(self, ordered_fixture): - tc = Series(Categorical([True, False, True, False], - categories=[True, False], - ordered=ordered_fixture)) + tc = Series( + Categorical( + [True, False, True, False], + categories=[True, False], + ordered=ordered_fixture, + ) + ) expected = Series([False, False, True, True]) tm.assert_series_equal(tc.duplicated(), expected) @@ -1509,10 +1565,10 @@ def test_drop_duplicates_categorical_bool(self, ordered_fixture): tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, False, False]) - tm.assert_series_equal(tc.duplicated(keep='last'), expected) - tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, True, True]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 71b0a2d9d74eb8..2097264ba5e785 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -7,8 +7,16 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, Series, TimedeltaIndex, - date_range, period_range, timedelta_range) + Categorical, + DataFrame, + DatetimeIndex, + Index, + Series, + TimedeltaIndex, + date_range, + period_range, + timedelta_range, +) from pandas.core.arrays import PeriodArray from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm @@ -26,6 +34,7 @@ class SharedWithSparse: In generic tests on this class, use ``self._assert_series_equal()`` which is implemented in sub-classes. 
""" + def _assert_series_equal(self, left, right): """Dispatch to series class dependent assertion""" raise NotImplementedError @@ -47,7 +56,7 @@ def test_copy_index_name_checking(self): assert self.ts is self.ts cp = self.ts.copy() - cp.index.name = 'foo' + cp.index.name = "foo" printing.pprint_thing(self.ts.index.name) assert self.ts.index.name is None @@ -67,14 +76,14 @@ def test_binop_maybe_preserve_name(self): # names don't match, don't preserve cp = self.ts.copy() - cp.name = 'something else' + cp.name = "something else" result = self.ts + cp assert result.name is None result = self.ts.add(cp) assert result.name is None - ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow'] - ops = ops + ['r' + op for op in ops] + ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] + ops = ops + ["r" + op for op in ops] for op in ops: # names match, preserve s = self.ts.copy() @@ -83,7 +92,7 @@ def test_binop_maybe_preserve_name(self): # names don't match, don't preserve cp = self.ts.copy() - cp.name = 'changed' + cp.name = "changed" result = getattr(s, op)(cp) assert result.name is None @@ -129,14 +138,13 @@ def test_to_sparse_pass_name(self): assert result.name == self.ts.name def test_constructor_dict(self): - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} result = self.series_klass(d) expected = self.series_klass(d, index=sorted(d.keys())) self._assert_series_equal(result, expected) - result = self.series_klass(d, index=['b', 'c', 'd', 'a']) - expected = self.series_klass([1, 2, np.nan, 0], - index=['b', 'c', 'd', 'a']) + result = self.series_klass(d, index=["b", "c", "d", "a"]) + expected = self.series_klass([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) self._assert_series_equal(result, expected) def test_constructor_subclass_dict(self): @@ -147,8 +155,7 @@ def test_constructor_subclass_dict(self): def test_constructor_ordereddict(self): # GH3283 - data = OrderedDict( - ('col%s' % i, np.random.random()) for i in range(12)) + data = OrderedDict(("col%s" % i, np.random.random()) for i in range(12)) series = self.series_klass(data) expected = self.series_klass(list(data.values()), list(data.keys())) @@ -162,20 +169,20 @@ class A(OrderedDict): self._assert_series_equal(series, expected) def test_constructor_dict_multiindex(self): - d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.} + d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} _d = sorted(d.items()) result = self.series_klass(d) expected = self.series_klass( - [x[1] for x in _d], - index=pd.MultiIndex.from_tuples([x[0] for x in _d])) + [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) + ) self._assert_series_equal(result, expected) - d['z'] = 111. 
- _d.insert(0, ('z', d['z'])) + d["z"] = 111.0 + _d.insert(0, ("z", d["z"])) result = self.series_klass(d) - expected = self.series_klass([x[1] for x in _d], - index=pd.Index([x[0] for x in _d], - tupleize_cols=False)) + expected = self.series_klass( + [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False) + ) result = result.reindex(index=expected.index) self._assert_series_equal(result, expected) @@ -184,15 +191,16 @@ def test_constructor_dict_timedelta_index(self): # construct Series from dict as data and TimedeltaIndex as index # will result NaN in result Series data expected = self.series_klass( - data=['A', 'B', 'C'], - index=pd.to_timedelta([0, 10, 20], unit='s') + data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") ) result = self.series_klass( - data={pd.to_timedelta(0, unit='s'): 'A', - pd.to_timedelta(10, unit='s'): 'B', - pd.to_timedelta(20, unit='s'): 'C'}, - index=pd.to_timedelta([0, 10, 20], unit='s') + data={ + pd.to_timedelta(0, unit="s"): "A", + pd.to_timedelta(10, unit="s"): "B", + pd.to_timedelta(20, unit="s"): "C", + }, + index=pd.to_timedelta([0, 10, 20], unit="s"), ) self._assert_series_equal(result, expected) @@ -200,8 +208,7 @@ def test_constructor_dict_timedelta_index(self): def test_from_array_deprecated(self): # multiple FutureWarnings, so can't assert stacklevel - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.series_klass.from_array([1, 2, 3]) def test_sparse_accessor_updates_on_inplace(self): @@ -218,71 +225,85 @@ class TestSeriesMisc(TestData, SharedWithSparse): def test_tab_completion(self): # GH 9910 - s = Series(list('abcd')) + s = Series(list("abcd")) # Series of str values should have .str but not .dt/.cat in __dir__ - assert 'str' in dir(s) - assert 'dt' not in dir(s) - assert 'cat' not in dir(s) + assert "str" in dir(s) + assert "dt" not in dir(s) + assert "cat" not in dir(s) # similarly for .dt - s = Series(date_range('1/1/2015', periods=5)) - assert 'dt' in dir(s) - assert 'str' not in dir(s) - assert 'cat' not in dir(s) + s = Series(date_range("1/1/2015", periods=5)) + assert "dt" in dir(s) + assert "str" not in dir(s) + assert "cat" not in dir(s) # Similarly for .cat, but with the twist that str and dt should be # there if the categories are of that type first cat and str. 
- s = Series(list('abbcd'), dtype="category") - assert 'cat' in dir(s) - assert 'str' in dir(s) # as it is a string categorical - assert 'dt' not in dir(s) + s = Series(list("abbcd"), dtype="category") + assert "cat" in dir(s) + assert "str" in dir(s) # as it is a string categorical + assert "dt" not in dir(s) # similar to cat and str - s = Series(date_range('1/1/2015', periods=5)).astype("category") - assert 'cat' in dir(s) - assert 'str' not in dir(s) - assert 'dt' in dir(s) # as it is a datetime categorical + s = Series(date_range("1/1/2015", periods=5)).astype("category") + assert "cat" in dir(s) + assert "str" not in dir(s) + assert "dt" in dir(s) # as it is a datetime categorical def test_tab_completion_with_categorical(self): # test the tab completion display - ok_for_cat = ['name', 'index', 'categorical', 'categories', 'codes', - 'ordered', 'set_categories', 'add_categories', - 'remove_categories', 'rename_categories', - 'reorder_categories', 'remove_unused_categories', - 'as_ordered', 'as_unordered'] + ok_for_cat = [ + "name", + "index", + "categorical", + "categories", + "codes", + "ordered", + "set_categories", + "add_categories", + "remove_categories", + "rename_categories", + "reorder_categories", + "remove_unused_categories", + "as_ordered", + "as_unordered", + ] def get_dir(s): - results = [r for r in s.cat.__dir__() if not r.startswith('_')] + results = [r for r in s.cat.__dir__() if not r.startswith("_")] return list(sorted(set(results))) - s = Series(list('aabbcde')).astype('category') + s = Series(list("aabbcde")).astype("category") results = get_dir(s) tm.assert_almost_equal(results, list(sorted(set(ok_for_cat)))) - @pytest.mark.parametrize("index", [ - tm.makeUnicodeIndex(10), - tm.makeStringIndex(10), - tm.makeCategoricalIndex(10), - Index(['foo', 'bar', 'baz'] * 2), - tm.makeDateIndex(10), - tm.makePeriodIndex(10), - tm.makeTimedeltaIndex(10), - tm.makeIntIndex(10), - tm.makeUIntIndex(10), - tm.makeIntIndex(10), - tm.makeFloatIndex(10), - Index([True, False]), - Index(['a{}'.format(i) for i in range(101)]), - pd.MultiIndex.from_tuples(zip('ABCD', 'EFGH')), - pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], 'EFGH')), ]) + @pytest.mark.parametrize( + "index", + [ + tm.makeUnicodeIndex(10), + tm.makeStringIndex(10), + tm.makeCategoricalIndex(10), + Index(["foo", "bar", "baz"] * 2), + tm.makeDateIndex(10), + tm.makePeriodIndex(10), + tm.makeTimedeltaIndex(10), + tm.makeIntIndex(10), + tm.makeUIntIndex(10), + tm.makeIntIndex(10), + tm.makeFloatIndex(10), + Index([True, False]), + Index(["a{}".format(i) for i in range(101)]), + pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")), + pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")), + ], + ) def test_index_tab_completion(self, index): # dir contains string-like values of the Index. 
s = pd.Series(index=index) dir_s = dir(s) for i, x in enumerate(s.index.unique(level=0)): if i < 100: - assert (not isinstance(x, str) or - not x.isidentifier() or x in dir_s) + assert not isinstance(x, str) or not x.isidentifier() or x in dir_s else: assert x not in dir_s @@ -322,7 +343,7 @@ def test_iteritems(self): assert val == self.ts[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.iteritems(), 'reverse') + assert not hasattr(self.series.iteritems(), "reverse") def test_items(self): for idx, val in self.series.items(): @@ -332,7 +353,7 @@ def test_items(self): assert val == self.ts[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.items(), 'reverse') + assert not hasattr(self.series.items(), "reverse") def test_raise_on_info(self): s = Series(np.random.randn(10)) @@ -343,7 +364,7 @@ def test_raise_on_info(self): def test_copy(self): for deep in [None, False, True]: - s = Series(np.arange(10), dtype='float64') + s = Series(np.arange(10), dtype="float64") # default deep is True if deep is None: @@ -365,19 +386,19 @@ def test_copy(self): def test_copy_tzaware(self): # GH#11794 # copy of tz-aware - expected = Series([Timestamp('2012/01/01', tz='UTC')]) - expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) + expected = Series([Timestamp("2012/01/01", tz="UTC")]) + expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) for deep in [None, False, True]: - s = Series([Timestamp('2012/01/01', tz='UTC')]) + s = Series([Timestamp("2012/01/01", tz="UTC")]) if deep is None: s2 = s.copy() else: s2 = s.copy(deep=deep) - s2[0] = pd.Timestamp('1999/01/01', tz='UTC') + s2[0] = pd.Timestamp("1999/01/01", tz="UTC") # default deep is True if deep is None or deep is True: @@ -391,10 +412,10 @@ def test_copy_tzaware(self): def test_axis_alias(self): s = Series([1, 2, np.nan]) - assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index')) - assert s.dropna().sum('rows') == 3 - assert s._get_axis_number('rows') == 0 - assert s._get_axis_name('rows') == 'index' + assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) + assert s.dropna().sum("rows") == 3 + assert s._get_axis_number("rows") == 0 + assert s._get_axis_name("rows") == "index" def test_class_axis(self): # https://github.com/pandas-dev/pandas/issues/18147 @@ -408,8 +429,11 @@ def test_numpy_unique(self): def test_ndarray_compat(self): # test numpy compat with Series as sub-class of NDFrame - tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=1000)) + tsdf = DataFrame( + np.random.randn(1000, 3), + columns=["A", "B", "C"], + index=date_range("1/1/2000", periods=1000), + ) def f(x): return x[x.idxmax()] @@ -428,53 +452,53 @@ def f(x): # using an ndarray like function s = Series(np.random.randn(10)) result = Series(np.ones_like(s)) - expected = Series(1, index=range(10), dtype='float64') + expected = Series(1, index=range(10), dtype="float64") tm.assert_series_equal(result, expected) # ravel s = Series(np.random.randn(10)) - tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F')) + tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) # compress # GH 6658 - s = Series([0, 1., -1], index=list('abc')) + s = Series([0, 1.0, -1], index=list("abc")) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.], index=['b'])) + tm.assert_series_equal(result, Series([1.0], 
index=["b"])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s < -1, s) # result empty Index(dtype=object) as the same as original - exp = Series([], dtype='float64', index=Index([], dtype='object')) + exp = Series([], dtype="float64", index=Index([], dtype="object")) tm.assert_series_equal(result, exp) - s = Series([0, 1., -1], index=[.1, .2, .3]) + s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.], index=[.2])) + tm.assert_series_equal(result, Series([1.0], index=[0.2])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s < -1, s) # result empty Float64Index as the same as original - exp = Series([], dtype='float64', index=Index([], dtype='float64')) + exp = Series([], dtype="float64", index=Index([], dtype="float64")) tm.assert_series_equal(result, exp) def test_str_accessor_updates_on_inplace(self): - s = pd.Series(list('abc')) + s = pd.Series(list("abc")) s.drop([0], inplace=True) assert len(s.str.lower()) == 2 def test_str_attribute(self): # GH9068 - methods = ['strip', 'rstrip', 'lstrip'] - s = Series([' jack', 'jill ', ' jesse ', 'frank']) + methods = ["strip", "rstrip", "lstrip"] + s = Series([" jack", "jill ", " jesse ", "frank"]) for method in methods: expected = Series([getattr(str, method)(x) for x in s.values]) assert_series_equal(getattr(Series.str, method)(s.str), expected) # str accessor only valid with string values s = Series(range(5)) - with pytest.raises(AttributeError, match='only use .str accessor'): + with pytest.raises(AttributeError, match="only use .str accessor"): s.str.repeat(2) def test_empty_method(self): @@ -486,14 +510,14 @@ def test_empty_method(self): def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; s = pd.Series()" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('s.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("s.", 1)) def test_integer_series_size(self): # GH 25580 @@ -510,7 +534,6 @@ def test_get_values_deprecation(self): class TestCategoricalSeries: - @pytest.mark.parametrize( "method", [ @@ -522,11 +545,12 @@ class TestCategoricalSeries: lambda x: x.cat.add_categories([4]), lambda x: x.cat.as_ordered(), lambda x: x.cat.as_unordered(), - ]) + ], + ) def test_getname_categorical_accessor(self, method): # GH 17509 - s = Series([1, 2, 3], name='A').astype('category') - expected = 'A' + s = Series([1, 2, 3], name="A").astype("category") + expected = "A" result = method(s).name assert result == expected @@ -549,24 +573,24 @@ def test_cat_accessor(self): def test_cat_accessor_api(self): # GH 9322 from pandas.core.arrays.categorical import CategoricalAccessor + assert Series.cat is CategoricalAccessor - s = Series(list('aabbcde')).astype('category') + s = Series(list("aabbcde")).astype("category") assert isinstance(s.cat, CategoricalAccessor) invalid = Series([1]) with pytest.raises(AttributeError, match="only use .cat accessor"): invalid.cat - assert not hasattr(invalid, 'cat') + assert not hasattr(invalid, "cat") def test_cat_accessor_no_new_attributes(self): # 
https://github.com/pandas-dev/pandas/issues/10673 - c = Series(list('aabbcde')).astype('category') - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + c = Series(list("aabbcde")).astype("category") + with pytest.raises(AttributeError, match="You cannot add any new attribute"): c.cat.xlabel = "a" def test_cat_accessor_updates_on_inplace(self): - s = Series(list('abc')).astype('category') + s = Series(list("abc")).astype("category") s.drop(0, inplace=True) s.cat.remove_unused_categories(inplace=True) assert len(s.cat.categories) == 2 @@ -580,11 +604,11 @@ def test_categorical_delegations(self): with pytest.raises(AttributeError, match=msg): Series([1, 2, 3]).cat() with pytest.raises(AttributeError, match=msg): - Series(['a', 'b', 'c']).cat + Series(["a", "b", "c"]).cat with pytest.raises(AttributeError, match=msg): - Series(np.arange(5.)).cat + Series(np.arange(5.0)).cat with pytest.raises(AttributeError, match=msg): - Series([Timestamp('20130101')]).cat + Series([Timestamp("20130101")]).cat # Series should delegate calls to '.categories', '.codes', '.ordered' # and the methods '.set_categories()' 'drop_unused_categories()' to the @@ -596,7 +620,7 @@ def test_categorical_delegations(self): exp_categories = Index([1, 2, 3]) tm.assert_index_equal(s.cat.categories, exp_categories) - exp_codes = Series([0, 1, 2, 0], dtype='int8') + exp_codes = Series([0, 1, 2, 0], dtype="int8") tm.assert_series_equal(s.cat.codes, exp_codes) assert s.cat.ordered @@ -615,8 +639,7 @@ def test_categorical_delegations(self): tm.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories - s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c" - ])) + s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"])) exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() @@ -635,22 +658,22 @@ def test_categorical_delegations(self): # GH18862 (let Series.cat.rename_categories take callables) s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) result = s.cat.rename_categories(lambda x: x.upper()) - expected = Series(Categorical(["A", "B", "C", "A"], - categories=["A", "B", "C"], - ordered=True)) + expected = Series( + Categorical(["A", "B", "C", "A"], categories=["A", "B", "C"], ordered=True) + ) tm.assert_series_equal(result, expected) def test_dt_accessor_api_for_categorical(self): # https://github.com/pandas-dev/pandas/issues/10661 from pandas.core.indexes.accessors import Properties - s_dr = Series(date_range('1/1/2015', periods=5, tz="MET")) + s_dr = Series(date_range("1/1/2015", periods=5, tz="MET")) c_dr = s_dr.astype("category") - s_pr = Series(period_range('1/1/2015', freq='D', periods=5)) + s_pr = Series(period_range("1/1/2015", freq="D", periods=5)) c_pr = s_pr.astype("category") - s_tdr = Series(timedelta_range('1 days', '10 days')) + s_tdr = Series(timedelta_range("1 days", "10 days")) c_tdr = s_tdr.astype("category") # only testing field (like .day) @@ -660,29 +683,36 @@ def test_dt_accessor_api_for_categorical(self): test_data = [ ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr), ("Period", get_ops(PeriodArray), s_pr, c_pr), - ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr)] + ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr), + ] assert isinstance(c_dr.dt, Properties) special_func_defs = [ - ('strftime', ("%Y-%m-%d",), {}), - ('tz_convert', ("EST",), {}), - ('round', ("D",), {}), - ('floor', ("D",), {}), - ('ceil', ("D",), {}), - 
('asfreq', ("D",), {}), + ("strftime", ("%Y-%m-%d",), {}), + ("tz_convert", ("EST",), {}), + ("round", ("D",), {}), + ("floor", ("D",), {}), + ("ceil", ("D",), {}), + ("asfreq", ("D",), {}), # ('tz_localize', ("UTC",), {}), ] _special_func_names = [f[0] for f in special_func_defs] # the series is already localized - _ignore_names = ['tz_localize', 'components'] + _ignore_names = ["tz_localize", "components"] for name, attr_names, s, c in test_data: - func_names = [f - for f in dir(s.dt) - if not (f.startswith("_") or f in attr_names or f in - _special_func_names or f in _ignore_names)] + func_names = [ + f + for f in dir(s.dt) + if not ( + f.startswith("_") + or f in attr_names + or f in _special_func_names + or f in _ignore_names + ) + ] func_defs = [(f, (), {}) for f in func_names] for f_def in special_func_defs: @@ -691,7 +721,7 @@ def test_dt_accessor_api_for_categorical(self): for func, args, kwargs in func_defs: with warnings.catch_warnings(): - if func == 'to_period': + if func == "to_period": # dropping TZ warnings.simplefilter("ignore", UserWarning) res = getattr(c.dt, func)(*args, **kwargs) @@ -719,9 +749,9 @@ def test_dt_accessor_api_for_categorical(self): else: tm.assert_almost_equal(res, exp) - invalid = Series([1, 2, 3]).astype('category') + invalid = Series([1, 2, 3]).astype("category") msg = "Can only use .dt accessor with datetimelike" with pytest.raises(AttributeError, match=msg): invalid.dt - assert not hasattr(invalid, 'str') + assert not hasattr(invalid, "str") diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 45514534994558..65a0822bbc55f7 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -12,19 +12,21 @@ class TestSeriesApply: - def test_apply(self, datetime_series): - with np.errstate(all='ignore'): - tm.assert_series_equal(datetime_series.apply(np.sqrt), - np.sqrt(datetime_series)) + with np.errstate(all="ignore"): + tm.assert_series_equal( + datetime_series.apply(np.sqrt), np.sqrt(datetime_series) + ) # element-wise apply import math - tm.assert_series_equal(datetime_series.apply(math.exp), - np.exp(datetime_series)) + + tm.assert_series_equal( + datetime_series.apply(math.exp), np.exp(datetime_series) + ) # empty series - s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) + s = Series(dtype=object, name="foo", index=pd.Index([], name="bar")) rs = s.apply(lambda x: x) tm.assert_series_equal(s, rs) @@ -61,21 +63,21 @@ def test_apply_dont_convert_dtype(self): def test_with_string_args(self, datetime_series): - for arg in ['sum', 'mean', 'min', 'max', 'std']: + for arg in ["sum", "mean", "min", "max", "std"]: result = datetime_series.apply(arg) expected = getattr(datetime_series, arg)() assert result == expected def test_apply_args(self): - s = Series(['foo,bar']) + s = Series(["foo,bar"]) - result = s.apply(str.split, args=(',', )) - assert result[0] == ['foo', 'bar'] + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] assert isinstance(result[0], list) def test_series_map_box_timestamps(self): # GH#2689, GH#2627 - ser = Series(pd.date_range('1/1/2000', periods=10)) + ser = Series(pd.date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) @@ -86,57 +88,57 @@ def func(x): def test_apply_box(self): # ufunc will not be boxed. 
Same test cases as the test_map_box - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' - res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) - exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) - assert s.dtype == 'Period[M]' - res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, - x.freqstr)) - exp = pd.Series(['Period_M', 'Period_M']) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) def test_apply_datetimetz(self): - values = pd.date_range('2011-01-01', '2011-01-02', - freq='H').tz_localize('Asia/Tokyo') - s = pd.Series(values, name='XX') + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") result = s.apply(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range('2011-01-02', '2011-01-03', - freq='H').tz_localize('Asia/Tokyo') - exp = pd.Series(exp_values, name='XX') + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") tm.assert_series_equal(result, exp) # change dtype # GH 14506 : Returned dtype changed from int32 to int64 result = s.apply(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) tm.assert_series_equal(result, exp) # not vectorized @@ -146,36 +148,35 @@ def f(x): return str(x.tz) result = s.map(f) - exp = pd.Series(['Asia/Tokyo'] * 25, name='XX') + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") tm.assert_series_equal(result, exp) def test_apply_dict_depr(self): - tsdf = 
pd.DataFrame(np.random.randn(10, 3), - columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) with tm.assert_produces_warning(FutureWarning): - tsdf.A.agg({'foo': ['sum', 'mean']}) + tsdf.A.agg({"foo": ["sum", "mean"]}) - @pytest.mark.parametrize('series', [ - ['1-1', '1-1', np.NaN], - ['1-1', '1-2', np.NaN]]) + @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) def test_apply_categorical_with_nan_values(self, series): # GH 20714 bug fixed in: GH 24275 - s = pd.Series(series, dtype='category') - result = s.apply(lambda x: x.split('-')[0]) + s = pd.Series(series, dtype="category") + result = s.apply(lambda x: x.split("-")[0]) result = result.astype(object) - expected = pd.Series(['1', '1', np.NaN], dtype='category') + expected = pd.Series(["1", "1", np.NaN], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) class TestSeriesAggregate: - def test_transform(self, string_series): # transforming functions - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): f_sqrt = np.sqrt(string_series) f_abs = np.abs(string_series) @@ -191,86 +192,89 @@ def test_transform(self, string_series): # list-like result = string_series.transform([np.sqrt]) expected = f_sqrt.to_frame().copy() - expected.columns = ['sqrt'] + expected.columns = ["sqrt"] assert_frame_equal(result, expected) result = string_series.transform([np.sqrt]) assert_frame_equal(result, expected) - result = string_series.transform(['sqrt']) + result = string_series.transform(["sqrt"]) assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both functions per # series and then concatting expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ['sqrt', 'absolute'] + expected.columns = ["sqrt", "absolute"] result = string_series.apply([np.sqrt, np.abs]) assert_frame_equal(result, expected) - result = string_series.transform(['sqrt', 'abs']) - expected.columns = ['sqrt', 'abs'] + result = string_series.transform(["sqrt", "abs"]) + expected.columns = ["sqrt", "abs"] assert_frame_equal(result, expected) # dict, provide renaming expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ['foo', 'bar'] - expected = expected.unstack().rename('series') + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") - result = string_series.apply({'foo': np.sqrt, 'bar': np.abs}) + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) assert_series_equal(result.reindex_like(expected), expected) def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator with pytest.raises(ValueError): - string_series.transform(['min', 'max']) + string_series.transform(["min", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.agg(['sqrt', 'max']) + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.transform(['sqrt', 'max']) + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.agg({'foo': np.sqrt, 'bar': 'sum'}) + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) def test_demo(self): # demonstration tests - s = Series(range(6), 
dtype='int64', name='series') + s = Series(range(6), dtype="int64", name="series") - result = s.agg(['min', 'max']) - expected = Series([0, 5], index=['min', 'max'], name='series') + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") tm.assert_series_equal(result, expected) - result = s.agg({'foo': 'min'}) - expected = Series([0], index=['foo'], name='series') + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") tm.assert_series_equal(result, expected) # nested renaming with tm.assert_produces_warning(FutureWarning): - result = s.agg({'foo': ['min', 'max']}) + result = s.agg({"foo": ["min", "max"]}) - expected = DataFrame( - {'foo': [0, 5]}, - index=['min', 'max']).unstack().rename('series') + expected = ( + DataFrame({"foo": [0, 5]}, index=["min", "max"]).unstack().rename("series") + ) tm.assert_series_equal(result, expected) def test_multiple_aggregators_with_dict_api(self): - s = Series(range(6), dtype='int64', name='series') + s = Series(range(6), dtype="int64", name="series") # nested renaming with tm.assert_produces_warning(FutureWarning): - result = s.agg({'foo': ['min', 'max'], 'bar': ['sum', 'mean']}) - - expected = DataFrame( - {'foo': [5.0, np.nan, 0.0, np.nan], - 'bar': [np.nan, 2.5, np.nan, 15.0]}, - columns=['foo', 'bar'], - index=['max', 'mean', - 'min', 'sum']).unstack().rename('series') + result = s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + + expected = ( + DataFrame( + {"foo": [5.0, np.nan, 0.0, np.nan], "bar": [np.nan, 2.5, np.nan, 15.0]}, + columns=["foo", "bar"], + index=["max", "mean", "min", "sum"], + ) + .unstack() + .rename("series") + ) tm.assert_series_equal(result.reindex_like(expected), expected) def test_agg_apply_evaluate_lambdas_the_same(self, string_series): @@ -287,37 +291,42 @@ def test_agg_apply_evaluate_lambdas_the_same(self, string_series): def test_with_nested_series(self, datetime_series): # GH 2316 # .agg with a reducer and a transform, what to do - result = datetime_series.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': datetime_series, - 'x^2': datetime_series ** 2}) + result = datetime_series.apply( + lambda x: Series([x, x ** 2], index=["x", "x^2"]) + ) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) tm.assert_frame_equal(result, expected) - result = datetime_series.agg(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) + result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) tm.assert_frame_equal(result, expected) def test_replicate_describe(self, string_series): # this also tests a result set that is all scalars expected = string_series.describe() - result = string_series.apply(OrderedDict( - [('count', 'count'), - ('mean', 'mean'), - ('std', 'std'), - ('min', 'min'), - ('25%', lambda x: x.quantile(0.25)), - ('50%', 'median'), - ('75%', lambda x: x.quantile(0.75)), - ('max', 'max')])) + result = string_series.apply( + OrderedDict( + [ + ("count", "count"), + ("mean", "mean"), + ("std", "std"), + ("min", "min"), + ("25%", lambda x: x.quantile(0.25)), + ("50%", "median"), + ("75%", lambda x: x.quantile(0.75)), + ("max", "max"), + ] + ) + ) assert_series_equal(result, expected) def test_reduce(self, string_series): # reductions with named functions - result = string_series.agg(['sum', 'mean']) - expected = Series([string_series.sum(), - string_series.mean()], - ['sum', 'mean'], - name=string_series.name) + result = string_series.agg(["sum", "mean"]) + expected 
= Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) assert_series_equal(result, expected) def test_non_callable_aggregates(self): @@ -325,50 +334,60 @@ def test_non_callable_aggregates(self): s = Series([1, 2, None]) # Calling agg w/ just a string arg same as calling s.arg - result = s.agg('size') + result = s.agg("size") expected = s.size assert result == expected # test when mixed w/ callable reducers - result = s.agg(['size', 'count', 'mean']) - expected = Series(OrderedDict([('size', 3.0), - ('count', 2.0), - ('mean', 1.5)])) + result = s.agg(["size", "count", "mean"]) + expected = Series(OrderedDict([("size", 3.0), ("count", 2.0), ("mean", 1.5)])) assert_series_equal(result[expected.index], expected) - @pytest.mark.parametrize("series, func, expected", chain( - _get_cython_table_params(Series(), [ - ('sum', 0), - ('max', np.nan), - ('min', np.nan), - ('all', True), - ('any', False), - ('mean', np.nan), - ('prod', 1), - ('std', np.nan), - ('var', np.nan), - ('median', np.nan), - ]), - _get_cython_table_params(Series([np.nan, 1, 2, 3]), [ - ('sum', 6), - ('max', 3), - ('min', 1), - ('all', True), - ('any', True), - ('mean', 2), - ('prod', 6), - ('std', 1), - ('var', 1), - ('median', 2), - ]), - _get_cython_table_params(Series('a b c'.split()), [ - ('sum', 'abc'), - ('max', 'c'), - ('min', 'a'), - ('all', 'c'), # see GH12863 - ('any', 'a'), - ]), - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + _get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", "c"), # see GH12863 + ("any", "a"), + ], + ), + ), + ) def test_agg_cython_table(self, series, func, expected): # GH21224 # test reducing functions in @@ -379,19 +398,25 @@ def test_agg_cython_table(self, series, func, expected): else: assert result == expected - @pytest.mark.parametrize("series, func, expected", chain( - _get_cython_table_params(Series(), [ - ('cumprod', Series([], Index([]))), - ('cumsum', Series([], Index([]))), - ]), - _get_cython_table_params(Series([np.nan, 1, 2, 3]), [ - ('cumprod', Series([np.nan, 1, 2, 6])), - ('cumsum', Series([np.nan, 1, 3, 6])), - ]), - _get_cython_table_params(Series('a b c'.split()), [ - ('cumsum', Series(['a', 'ab', 'abc'])), - ]), - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(), + [("cumprod", Series([], Index([]))), ("cumsum", Series([], Index([])))], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], + ), + _get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), + ) def test_agg_cython_table_transform(self, series, func, expected): # GH21224 # test transforming functions in @@ -399,16 +424,22 @@ def test_agg_cython_table_transform(self, series, func, expected): result = series.agg(func) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("series, func, expected", chain( - 
_get_cython_table_params(Series('a b c'.split()), [ - ('mean', TypeError), # mean raises TypeError - ('prod', TypeError), - ('std', TypeError), - ('var', TypeError), - ('median', TypeError), - ('cumprod', TypeError), - ]) - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), + ) def test_agg_cython_table_raises(self, series, func, expected): # GH21224 with pytest.raises(expected): @@ -417,12 +448,11 @@ def test_agg_cython_table_raises(self, series, func, expected): class TestSeriesMap: - def test_map(self, datetime_series): index, data = tm.getMixedTypeDict() - source = Series(data['B'], index=data['C']) - target = Series(data['C'][:4], index=data['D'][:4]) + source = Series(data["B"], index=data["C"]) + target = Series(data["C"][:4], index=data["D"][:4]) merged = target.map(source) @@ -449,25 +479,28 @@ def test_map(self, datetime_series): exp = Series(["odd", "even", "odd", np.nan]) tm.assert_series_equal(a.map(c), exp) - a = Series(['a', 'b', 'c', 'd']) - b = Series([1, 2, 3, 4], - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e'])) + a = Series(["a", "b", "c", "d"]) + b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) + c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(b), exp) exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(c), exp) - a = Series(['a', 'b', 'c', 'd']) - b = Series(['B', 'C', 'D', 'E'], dtype='category', - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - - exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'], - categories=['B', 'C', 'D', 'E'])) + a = Series(["a", "b", "c", "d"]) + b = Series( + ["B", "C", "D", "E"], + dtype="category", + index=pd.CategoricalIndex(["b", "c", "d", "e"]), + ) + c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) + + exp = Series( + pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) + ) tm.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, 'B', 'C', 'D']) + exp = Series([np.nan, "B", "C", "D"]) tm.assert_series_equal(a.map(c), exp) @pytest.mark.parametrize("index", tm.all_index_generator(10)) @@ -481,12 +514,12 @@ def test_map_empty(self, index): def test_map_compat(self): # related GH 8024 s = Series([True, True, False], index=[1, 2, 3]) - result = s.map({True: 'foo', False: 'bar'}) - expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3]) + result = s.map({True: "foo", False: "bar"}) + expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) assert_series_equal(result, expected) def test_map_int(self): - left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) + left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) right = Series({1: 11, 2: 22, 3: 33}) assert left.dtype == np.float_ @@ -494,8 +527,8 @@ def test_map_int(self): merged = left.map(right) assert merged.dtype == np.float_ - assert isna(merged['d']) - assert not isna(merged['c']) + assert isna(merged["d"]) + assert not isna(merged["c"]) def test_map_type_inference(self): s = Series(range(3)) @@ -512,7 +545,7 @@ def test_map_decimal(self, string_series): def test_map_na_exclusion(self): s = Series([1.5, np.nan, 3, np.nan, 5]) - result = s.map(lambda x: x * 
2, na_action='ignore') + result = s.map(lambda x: x * 2, na_action="ignore") exp = s * 2 assert_series_equal(result, exp) @@ -524,30 +557,29 @@ def test_map_dict_with_tuple_keys(self): from being mapped properly. """ # GH 18496 - df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) - label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df = pd.DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - df['labels'] = df['a'].map(label_mappings) - df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index) # All labels should be filled now - tm.assert_series_equal(df['labels'], df['expected_labels'], - check_names=False) + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) def test_map_counter(self): - s = Series(['a', 'b', 'c'], index=[1, 2, 3]) + s = Series(["a", "b", "c"], index=[1, 2, 3]) counter = Counter() - counter['b'] = 5 - counter['c'] += 1 + counter["b"] = 5 + counter["c"] += 1 result = s.map(counter) expected = Series([0, 5, 1], index=[1, 2, 3]) assert_series_equal(result, expected) def test_map_defaultdict(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - default_dict = defaultdict(lambda: 'blank') - default_dict[1] = 'stuff' + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" result = s.map(default_dict) - expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c']) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) assert_series_equal(result, expected) def test_map_dict_subclass_with_missing(self): @@ -555,101 +587,104 @@ def test_map_dict_subclass_with_missing(self): Test Series.map with a dictionary subclass that defines __missing__, i.e. sets a default value (GH #15999). 
""" + class DictWithMissing(dict): def __missing__(self, key): - return 'missing' + return "missing" + s = Series([1, 2, 3]) - dictionary = DictWithMissing({3: 'three'}) + dictionary = DictWithMissing({3: "three"}) result = s.map(dictionary) - expected = Series(['missing', 'missing', 'three']) + expected = Series(["missing", "missing", "three"]) assert_series_equal(result, expected) def test_map_dict_subclass_without_missing(self): class DictWithoutMissing(dict): pass + s = Series([1, 2, 3]) - dictionary = DictWithoutMissing({3: 'three'}) + dictionary = DictWithoutMissing({3: "three"}) result = s.map(dictionary) - expected = Series([np.nan, np.nan, 'three']) + expected = Series([np.nan, np.nan, "three"]) assert_series_equal(result, expected) def test_map_box(self): - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' - res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) - exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + assert s.dtype == "timedelta64[ns]" + res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) - assert s.dtype == 'Period[M]' - res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, - x.freqstr)) - exp = pd.Series(['Period_M', 'Period_M']) + assert s.dtype == "Period[M]" + res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) def test_map_categorical(self): - values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'), - ordered=True) - s = pd.Series(values, name='XX', index=list('abcdefg')) + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = pd.Series(values, name="XX", index=list("abcdefg")) result = s.map(lambda x: x.lower()) - exp_values = pd.Categorical(list('abbabcd'), categories=list('dcba'), - ordered=True) - exp = pd.Series(exp_values, name='XX', 
index=list('abcdefg')) + exp_values = pd.Categorical( + list("abbabcd"), categories=list("dcba"), ordered=True + ) + exp = pd.Series(exp_values, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) tm.assert_categorical_equal(result.values, exp_values) - result = s.map(lambda x: 'A') - exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg')) + result = s.map(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) assert result.dtype == np.object with pytest.raises(NotImplementedError): - s.map(lambda x: x, na_action='ignore') + s.map(lambda x: x, na_action="ignore") def test_map_datetimetz(self): - values = pd.date_range('2011-01-01', '2011-01-02', - freq='H').tz_localize('Asia/Tokyo') - s = pd.Series(values, name='XX') + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") # keep tz result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range('2011-01-02', '2011-01-03', - freq='H').tz_localize('Asia/Tokyo') - exp = pd.Series(exp_values, name='XX') + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") tm.assert_series_equal(result, exp) # change dtype # GH 14506 : Returned dtype changed from int32 to int64 result = s.map(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) tm.assert_series_equal(result, exp) with pytest.raises(NotImplementedError): - s.map(lambda x: x, na_action='ignore') + s.map(lambda x: x, na_action="ignore") # not vectorized def f(x): @@ -658,13 +693,17 @@ def f(x): return str(x.tz) result = s.map(f) - exp = pd.Series(['Asia/Tokyo'] * 25, name='XX') + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("vals,mapping,exp", [ - (list('abc'), {np.nan: 'not NaN'}, [np.nan] * 3 + ['not NaN']), - (list('abc'), {'a': 'a letter'}, ['a letter'] + [np.nan] * 3), - (list(range(3)), {0: 42}, [42] + [np.nan] * 3)]) + @pytest.mark.parametrize( + "vals,mapping,exp", + [ + (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]), + (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3), + (list(range(3)), {0: 42}, [42] + [np.nan] * 3), + ], + ) def test_map_missing_mixed(self, vals, mapping, exp): # GH20495 s = pd.Series(vals + [np.nan]) @@ -672,22 +711,29 @@ def test_map_missing_mixed(self, vals, mapping, exp): tm.assert_series_equal(result, pd.Series(exp)) - @pytest.mark.parametrize("dti,exp", [ - (Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), - DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype='int64')), - (tm.makeTimeSeries(nper=30), - DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype='int64')) - ]) + @pytest.mark.parametrize( + "dti,exp", + [ + ( + Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), + DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"), + ), + ( + tm.makeTimeSeries(nper=30), + DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"), + ), + ], + ) def test_apply_series_on_date_time_index_aware_series(self, dti, exp): # GH 25959 # Calling apply on a localized time series should not cause an error - index = dti.tz_localize('UTC').index + index = dti.tz_localize("UTC").index result = pd.Series(index).apply(lambda x: pd.Series([1, 2])) assert_frame_equal(result, exp) def test_apply_scaler_on_date_time_index_aware_series(self): 
# GH 25959 # Calling apply on a localized time series should not cause an error - series = tm.makeTimeSeries(nper=30).tz_localize('UTC') + series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) - assert_series_equal(result, pd.Series(np.ones(30), dtype='int64')) + assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 2cc2ad080eb4ce..5b57b5ba2dbaec 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -15,31 +15,31 @@ def _permute(obj): class TestSeriesFlexArithmetic: @pytest.mark.parametrize( - 'ts', + "ts", [ (lambda x: x, lambda x: x * 2, False), (lambda x: x, lambda x: x[::2], False), (lambda x: x, lambda x: 5, True), - (lambda x: tm.makeFloatSeries(), - lambda x: tm.makeFloatSeries(), - True) - ]) - @pytest.mark.parametrize('opname', ['add', 'sub', 'mul', 'floordiv', - 'truediv', 'div', 'pow']) + (lambda x: tm.makeFloatSeries(), lambda x: tm.makeFloatSeries(), True), + ], + ) + @pytest.mark.parametrize( + "opname", ["add", "sub", "mul", "floordiv", "truediv", "div", "pow"] + ) def test_flex_method_equivalence(self, opname, ts): # check that Series.{opname} behaves like Series.__{opname}__, - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") series = ts[0](tser) other = ts[1](tser) check_reverse = ts[2] - if opname == 'div': - pytest.skip('div test only for Py3') + if opname == "div": + pytest.skip("div test only for Py3") op = getattr(Series, opname) - if op == 'div': + if op == "div": alt = operator.truediv else: alt = getattr(operator, opname) @@ -58,7 +58,7 @@ class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted def test_add_series_with_period_index(self): - rng = pd.period_range('1/1/2000', '1/1/2010', freq='A') + rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") ts = Series(np.random.randn(len(rng)), index=rng) result = ts + ts[::2] @@ -71,12 +71,13 @@ def test_add_series_with_period_index(self): msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): - ts + ts.asfreq('D', how="end") + ts + ts.asfreq("D", how="end") # ------------------------------------------------------------------ # Comparisons + class TestSeriesFlexComparison: def test_comparison_flex_basic(self): left = pd.Series(np.random.randn(10)) @@ -90,7 +91,7 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.ge(right), left >= right) # axis - for axis in [0, None, 'index']: + for axis in [0, None, "index"]: tm.assert_series_equal(left.eq(right, axis=axis), left == right) tm.assert_series_equal(left.ne(right, axis=axis), left != right) tm.assert_series_equal(left.le(right, axis=axis), left < right) @@ -99,16 +100,16 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.ge(right, axis=axis), left >= right) # - msg = 'No axis named 1 for object type' - for op in ['eq', 'ne', 'le', 'le', 'gt', 'ge']: + msg = "No axis named 1 for object type" + for op in ["eq", "ne", "le", "le", "gt", "ge"]: with pytest.raises(ValueError, match=msg): getattr(left, op)(right, axis=1) class TestSeriesComparison: def test_comparison_different_length(self): - a = Series(['a', 'b', 'c']) - b = Series(['b', 'a']) + a = Series(["a", "b", "c"]) + b = Series(["b", "a"]) with pytest.raises(ValueError): a < b @@ -117,41 +118,41 @@ def 
test_comparison_different_length(self): with pytest.raises(ValueError): a == b - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_ser_flex_cmp_return_dtypes(self, opname): # GH#15115 ser = Series([1, 3, 2], index=range(3)) const = 2 result = getattr(ser, opname)(const).dtypes - expected = np.dtype('bool') + expected = np.dtype("bool") assert result == expected - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_ser_flex_cmp_return_dtypes_empty(self, opname): # GH#15115 empty Series case ser = Series([1, 3, 2], index=range(3)) empty = ser.iloc[:0] const = 2 result = getattr(empty, opname)(const).dtypes - expected = np.dtype('bool') + expected = np.dtype("bool") assert result == expected - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.le, operator.lt, - operator.ge, operator.gt]) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('baz', 'baz', 'baz')]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.le, operator.lt, operator.ge, operator.gt], + ) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("baz", "baz", "baz")] + ) def test_ser_cmp_result_names(self, names, op): # datetime64 dtype - dti = pd.date_range('1949-06-07 03:00:00', - freq='H', periods=5, name=names[0]) + dti = pd.date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] # datetime64tz dtype - dti = dti.tz_localize('US/Central') + dti = dti.tz_localize("US/Central") ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] @@ -165,7 +166,7 @@ def test_ser_cmp_result_names(self, names, op): # categorical if op in [operator.eq, operator.ne]: # categorical dtype comparisons raise for inequalities - cidx = tdi.astype('category') + cidx = tdi.astype("category") ser = Series(cidx).rename(names[1]) result = op(ser, cidx) assert result.name == names[2] diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py index ee94b32717d1ad..8bc9e9c38d83af 100644 --- a/pandas/tests/series/test_asof.py +++ b/pandas/tests/series/test_asof.py @@ -6,15 +6,14 @@ class TestSeriesAsof: - def test_basic(self): # array or list or dates N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.random.randn(N), index=rng) ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = ts.asof(dates) assert notna(result).all() @@ -36,7 +35,7 @@ def test_basic(self): def test_scalar(self): N = 30 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.arange(N), index=rng) ts[5:10] = np.NaN ts[15:20] = np.NaN @@ -61,35 +60,42 @@ def test_scalar(self): def test_with_nan(self): # basic asof test - rng = date_range('1/1/2000', '1/2/2000', freq='4h') + rng = date_range("1/1/2000", "1/2/2000", freq="4h") s = Series(np.arange(len(rng)), index=rng) - r = s.resample('2h').mean() + r = s.resample("2h").mean() result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) r.iloc[3:5] = np.nan result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) r.iloc[-3:] = np.nan result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) def test_periodindex(self): from pandas import period_range, PeriodIndex + # array or list or dates N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') + rng = period_range("1/1/1990", periods=N, freq="H") ts = Series(np.random.randn(N), index=rng) ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='37min') + dates = date_range("1/1/1990", periods=N * 3, freq="37min") result = ts.asof(dates) assert notna(result).all() @@ -101,7 +107,7 @@ def test_periodindex(self): lb = ts.index[14] ub = ts.index[30] - pix = PeriodIndex(result.index.values, freq='H') + pix = PeriodIndex(result.index.values, freq="H") mask = (pix >= lb) & (pix < ub) rs = result[mask] assert (rs == ts[lb]).all() @@ -128,10 +134,10 @@ def test_periodindex(self): def test_errors(self): - s = Series([1, 2, 3], - index=[Timestamp('20130101'), - Timestamp('20130103'), - Timestamp('20130102')]) + s = Series( + [1, 2, 3], + index=[Timestamp("20130101"), Timestamp("20130103"), Timestamp("20130102")], + ) # non-monotonic assert not s.index.is_monotonic @@ -140,10 +146,10 @@ def test_errors(self): # subset with Series N = 10 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") s = Series(np.random.randn(N), index=rng) with pytest.raises(ValueError): - s.asof(s.index[0], subset='foo') + s.asof(s.index[0], subset="foo") def test_all_nans(self): # GH 15713 @@ -154,19 +160,19 @@ def test_all_nans(self): # testing non-default indexes N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = Series(np.nan, index=rng).asof(dates) expected = Series(np.nan, index=dates) tm.assert_series_equal(result, expected) # testing scalar input - date = date_range('1/1/1990', periods=N * 3, freq='25s')[0] + date = date_range("1/1/1990", periods=N * 3, freq="25s")[0] result = Series(np.nan, index=rng).asof(date) assert isna(result) # test name is propagated - result = Series(np.nan, index=[1, 2, 3, 4], name='test').asof([4, 5]) - expected = Series(np.nan, index=[4, 5], name='test') + result = Series(np.nan, index=[1, 2, 3, 4], name="test").asof([4, 5]) + expected = Series(np.nan, index=[4, 5], name="test") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py index a18f1e1f444b73..18e75c3be5bcc8 100644 --- a/pandas/tests/series/test_block_internals.py +++ b/pandas/tests/series/test_block_internals.py @@ -5,17 +5,16 @@ class TestSeriesBlockInternals: - def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 
altering a datetime64tz Series inplace invalidates the # `freq` attribute on the underlying DatetimeIndex - dti = pd.date_range('20130101', periods=3, tz='US/Eastern') + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") ts = dti[1] ser = pd.Series(dti) assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base - assert dti.freq == 'D' + assert dti.freq == "D" ser.iloc[1] = pd.NaT assert ser._values.freq is None @@ -23,18 +22,17 @@ def test_setitem_invalidates_datetime_index_freq(self): assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base assert dti[1] == ts - assert dti.freq == 'D' + assert dti.freq == "D" def test_dt64tz_setitem_does_not_mutate_dti(self): # GH#21907, GH#24096 - dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') + dti = pd.date_range("2016-01-01", periods=10, tz="US/Pacific") ts = dti[0] ser = pd.Series(dti) assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base assert ser._data.blocks[0].values is not dti - assert (ser._data.blocks[0].values._data.base - is not dti._data._data.base) + assert ser._data.blocks[0].values._data.base is not dti._data._data.base ser[::3] = pd.NaT assert ser[0] is pd.NaT diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index d03c29ad79469c..bf527bae297d9a 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -11,7 +11,6 @@ class TestSeriesCombine: - def test_append(self, datetime_series, string_series, object_series): appendedSeries = string_series.append(object_series) for idx, value in appendedSeries.items(): @@ -27,8 +26,7 @@ def test_append(self, datetime_series, string_series, object_series): datetime_series.append(datetime_series, verify_integrity=True) def test_append_many(self, datetime_series): - pieces = [datetime_series[:5], datetime_series[5:10], - datetime_series[10:]] + pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] result = pieces[0].append(pieces[1:]) assert_series_equal(result, datetime_series) @@ -43,12 +41,14 @@ def test_append_duplicates(self): # the result must have RangeIndex exp = pd.Series([1, 2, 3, 4, 5, 6]) - tm.assert_series_equal(s1.append(s2, ignore_index=True), - exp, check_index_type=True) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), - exp, check_index_type=True) + tm.assert_series_equal( + s1.append(s2, ignore_index=True), exp, check_index_type=True + ) + tm.assert_series_equal( + pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True + ) - msg = 'Indexes have overlapping values:' + msg = "Indexes have overlapping values:" with pytest.raises(ValueError, match=msg): s1.append(s2, verify_integrity=True) with pytest.raises(ValueError, match=msg): @@ -94,54 +94,57 @@ def test_combine_first(self): combined = strings.combine_first(floats) tm.assert_series_equal(strings, combined.loc[index[::2]]) - tm.assert_series_equal(floats[1::2].astype(object), - combined.loc[index[1::2]]) + tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]]) # corner case - s = Series([1., 2, 3], index=[0, 1, 2]) + s = Series([1.0, 2, 3], index=[0, 1, 2]) result = s.combine_first(Series([], index=[])) - s.index = s.index.astype('O') + s.index = s.index.astype("O") assert_series_equal(s, result) def test_update(self): - s = Series([1.5, nan, 3., 4., nan]) - s2 = Series([nan, 3.5, nan, 5.]) + s = Series([1.5, nan, 3.0, 4.0, nan]) + s2 = 
Series([nan, 3.5, nan, 5.0]) s.update(s2) - expected = Series([1.5, 3.5, 3., 5., np.nan]) + expected = Series([1.5, 3.5, 3.0, 5.0, np.nan]) assert_series_equal(s, expected) # GH 3217 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) - df['c'] = np.nan + df["c"] = np.nan - df['c'].update(Series(['foo'], index=[0])) - expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]], - columns=['a', 'b', 'c']) + df["c"].update(Series(["foo"], index=[0])) + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) assert_frame_equal(df, expected) - @pytest.mark.parametrize('other, dtype, expected', [ - # other is int - ([61, 63], 'int32', pd.Series([10, 61, 12], dtype='int32')), - ([61, 63], 'int64', pd.Series([10, 61, 12])), - ([61, 63], float, pd.Series([10., 61., 12.])), - ([61, 63], object, pd.Series([10, 61, 12], dtype=object)), - # other is float, but can be cast to int - ([61., 63.], 'int32', pd.Series([10, 61, 12], dtype='int32')), - ([61., 63.], 'int64', pd.Series([10, 61, 12])), - ([61., 63.], float, pd.Series([10., 61., 12.])), - ([61., 63.], object, pd.Series([10, 61., 12], dtype=object)), - # others is float, cannot be cast to int - ([61.1, 63.1], 'int32', pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], 'int64', pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], float, pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)), - # other is object, cannot be cast - ([(61,), (63,)], 'int32', pd.Series([10, (61,), 12])), - ([(61,), (63,)], 'int64', pd.Series([10, (61,), 12])), - ([(61,), (63,)], float, pd.Series([10., (61,), 12.])), - ([(61,), (63,)], object, pd.Series([10, (61,), 12])) - ]) + @pytest.mark.parametrize( + "other, dtype, expected", + [ + # other is int + ([61, 63], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61, 63], "int64", pd.Series([10, 61, 12])), + ([61, 63], float, pd.Series([10.0, 61.0, 12.0])), + ([61, 63], object, pd.Series([10, 61, 12], dtype=object)), + # other is float, but can be cast to int + ([61.0, 63.0], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61.0, 63.0], "int64", pd.Series([10, 61, 12])), + ([61.0, 63.0], float, pd.Series([10.0, 61.0, 12.0])), + ([61.0, 63.0], object, pd.Series([10, 61.0, 12], dtype=object)), + # others is float, cannot be cast to int + ([61.1, 63.1], "int32", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], "int64", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], float, pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)), + # other is object, cannot be cast + ([(61,), (63,)], "int32", pd.Series([10, (61,), 12])), + ([(61,), (63,)], "int64", pd.Series([10, (61,), 12])), + ([(61,), (63,)], float, pd.Series([10.0, (61,), 12.0])), + ([(61,), (63,)], object, pd.Series([10, (61,), 12])), + ], + ) def test_update_dtypes(self, other, dtype, expected): s = Series([10, 11, 12], dtype=dtype) @@ -153,29 +156,30 @@ def test_update_dtypes(self, other, dtype, expected): def test_concat_empty_series_dtypes_roundtrips(self): # round-tripping with self & like self - dtypes = map(np.dtype, ['float64', 'int8', 'uint8', 'bool', 'm8[ns]', - 'M8[ns]']) + dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) for dtype in dtypes: assert pd.concat([Series(dtype=dtype)]).dtype == dtype - assert pd.concat([Series(dtype=dtype), - Series(dtype=dtype)]).dtype == dtype + assert pd.concat([Series(dtype=dtype), Series(dtype=dtype)]).dtype == dtype def int_result_type(dtype, dtype2): typs = {dtype.kind, dtype2.kind} 
- if not len(typs - {'i', 'u', 'b'}) and (dtype.kind == 'i' or - dtype2.kind == 'i'): - return 'i' - elif not len(typs - {'u', 'b'}) and (dtype.kind == 'u' or - dtype2.kind == 'u'): - return 'u' + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" return None def float_result_type(dtype, dtype2): typs = {dtype.kind, dtype2.kind} - if not len(typs - {'f', 'i', 'u'}) and (dtype.kind == 'f' or - dtype2.kind == 'f'): - return 'f' + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" return None def get_result_type(dtype, dtype2): @@ -185,7 +189,7 @@ def get_result_type(dtype, dtype2): result = int_result_type(dtype, dtype2) if result is not None: return result - return 'O' + return "O" for dtype in dtypes: for dtype2 in dtypes: @@ -193,22 +197,25 @@ def get_result_type(dtype, dtype2): continue expected = get_result_type(dtype, dtype2) - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2) - ]).dtype + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype assert result.kind == expected def test_combine_first_dt_tz_values(self, tz_naive_fixture): - ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture), - name='ser1') - ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'], - tz=tz_naive_fixture), - index=[2, 3, 4], name='ser2') + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + name="ser1", + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture), + index=[2, 3, 4], + name="ser2", + ) result = ser1.combine_first(ser2) - exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103', - '20160515', '20160516'], - tz=tz_naive_fixture) - exp = pd.Series(exp_vals, name='ser1') + exp_vals = pd.DatetimeIndex( + ["20150101", "20150102", "20150103", "20160515", "20160516"], + tz=tz_naive_fixture, + ) + exp = pd.Series(exp_vals, name="ser1") assert_series_equal(exp, result) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @@ -216,82 +223,108 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): def test_concat_empty_series_dtypes(self): # booleans - assert pd.concat([Series(dtype=np.bool_), - Series(dtype=np.int32)]).dtype == np.int32 - assert pd.concat([Series(dtype=np.bool_), - Series(dtype=np.float32)]).dtype == np.object_ + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.int32)]).dtype + == np.int32 + ) + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.float32)]).dtype + == np.object_ + ) # datetime-like - assert pd.concat([Series(dtype='m8[ns]'), - Series(dtype=np.bool)]).dtype == np.object_ - assert pd.concat([Series(dtype='m8[ns]'), - Series(dtype=np.int64)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.bool)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.int64)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.bool_), - Series(dtype=np.int64)]).dtype == np.object_ + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + 
pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] + ).dtype + == np.object_ + ) # categorical - assert pd.concat([Series(dtype='category'), - Series(dtype='category')]).dtype == 'category' + assert ( + pd.concat([Series(dtype="category"), Series(dtype="category")]).dtype + == "category" + ) # GH 18515 - assert pd.concat([Series(np.array([]), dtype='category'), - Series(dtype='float64')]).dtype == 'float64' - assert pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype == 'object' + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + assert ( + pd.concat([Series(dtype="category"), Series(dtype="object")]).dtype + == "object" + ) # sparse # TODO: move? - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='float64').to_sparse()]) - assert result.dtype == 'Sparse[float64]' + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="float64").to_sparse()] + ) + assert result.dtype == "Sparse[float64]" # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'float64:sparse' + assert result.ftype == "float64:sparse" - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='float64')]) + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="float64")] + ) # TODO: release-note: concat sparse dtype expected = pd.core.sparse.api.SparseDtype(np.float64) assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'float64:sparse' + assert result.ftype == "float64:sparse" - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='object')]) + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="object")] + ) # TODO: release-note: concat sparse dtype - expected = pd.core.sparse.api.SparseDtype('object') + expected = pd.core.sparse.api.SparseDtype("object") assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'object:sparse' + assert result.ftype == "object:sparse" def test_combine_first_dt64(self): from pandas.core.tools.datetimes import to_datetime + s0 = to_datetime(Series(["2010", np.NaN])) s1 = to_datetime(Series([np.NaN, "2011"])) rs = s0.combine_first(s1) - xp = to_datetime(Series(['2010', '2011'])) + xp = to_datetime(Series(["2010", "2011"])) assert_series_equal(rs, xp) s0 = to_datetime(Series(["2010", np.NaN])) s1 = Series([np.NaN, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), '2011']) + xp = Series([datetime(2010, 1, 1), "2011"]) assert_series_equal(rs, xp) class TestTimeseries: - def test_append_concat(self): - rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + rng = date_range("5/8/2012 1:45", periods=10, freq="5T") ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) @@ -311,19 +344,16 @@ def test_append_concat(self): # different index names rng1 = rng.copy() rng2 = rng.copy() - rng1.name = 'foo' - rng2.name = 'bar' - assert rng1.append(rng1).name == 'foo' + rng1.name = "foo" + rng2.name = "bar" + assert rng1.append(rng1).name == "foo" assert rng1.append(rng2).name is None def test_append_concat_tz(self): # see gh-2938 - rng = date_range('5/8/2012 1:45', 
periods=10, freq='5T', - tz='US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='US/Eastern') + rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") + rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") + rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) @@ -341,12 +371,15 @@ def test_append_concat_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz=timezone('US/Eastern')) + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") + ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) @@ -362,12 +395,15 @@ def test_append_concat_tz_explicit_pytz(self): def test_append_concat_tz_dateutil(self): # see gh-2938 - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='dateutil/US/Eastern') + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" + ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 663d5ae5053030..2f09d777e719cb 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -10,52 +10,63 @@ from pandas._libs.tslib import iNaT from pandas.compat import PY36 -from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import ( - Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, - Timestamp, date_range, isna, period_range, timedelta_range) + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, + period_range, + timedelta_range, +) from pandas.core.arrays import period_array import pandas.util.testing as tm from pandas.util.testing import assert_series_equal class TestSeriesConstructors: - - @pytest.mark.parametrize('constructor,check_index_type', [ - # NOTE: some overlap with test_constructor_empty but that test does not - # test for None or an empty generator. - # test_constructor_pass_none tests None but only with the index also - # passed. 
- (lambda: Series(), True), - (lambda: Series(None), True), - (lambda: Series({}), True), - (lambda: Series(()), False), # creates a RangeIndex - (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series((x for x in [])), False), # creates a RangeIndex - (lambda: Series(data=None), True), - (lambda: Series(data={}), True), - (lambda: Series(data=()), False), # creates a RangeIndex - (lambda: Series(data=[]), False), # creates a RangeIndex - (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex - ]) + @pytest.mark.parametrize( + "constructor,check_index_type", + [ + # NOTE: some overlap with test_constructor_empty but that test does not + # test for None or an empty generator. + # test_constructor_pass_none tests None but only with the index also + # passed. + (lambda: Series(), True), + (lambda: Series(None), True), + (lambda: Series({}), True), + (lambda: Series(()), False), # creates a RangeIndex + (lambda: Series([]), False), # creates a RangeIndex + (lambda: Series((x for x in [])), False), # creates a RangeIndex + (lambda: Series(data=None), True), + (lambda: Series(data={}), True), + (lambda: Series(data=()), False), # creates a RangeIndex + (lambda: Series(data=[]), False), # creates a RangeIndex + (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex + ], + ) def test_empty_constructor(self, constructor, check_index_type): expected = Series() result = constructor() assert len(result.index) == 0 - tm.assert_series_equal(result, expected, - check_index_type=check_index_type) + tm.assert_series_equal(result, expected, check_index_type=check_index_type) def test_invalid_dtype(self): # GH15520 - msg = 'not understood' - invalid_list = [pd.Timestamp, 'pd.Timestamp', list] + msg = "not understood" + invalid_list = [pd.Timestamp, "pd.Timestamp", list] for dtype in invalid_list: with pytest.raises(TypeError, match=msg): - Series([], name='time', dtype=dtype) + Series([], name="time", dtype=dtype) def test_scalar_conversion(self): @@ -64,8 +75,8 @@ def test_scalar_conversion(self): assert not isinstance(scalar, float) # Coercion - assert float(Series([1.])) == 1.0 - assert int(Series([1.])) == 1 + assert float(Series([1.0])) == 1.0 + assert int(Series([1.0])) == 1 def test_constructor(self, datetime_series): empty_series = Series() @@ -81,7 +92,7 @@ def test_constructor(self, datetime_series): assert id(datetime_series.index) == id(derived.index) # Mixed type Series - mixed = Series(['hello', np.NaN], index=[0, 1]) + mixed = Series(["hello", np.NaN], index=[0, 1]) assert mixed.dtype == np.object_ assert mixed[1] is np.NaN @@ -92,9 +103,9 @@ def test_constructor(self, datetime_series): with pytest.raises(Exception, match="Data must be 1-dimensional"): Series(np.random.randn(3, 3), index=np.arange(3)) - mixed.name = 'Series' + mixed.name = "Series" rs = Series(mixed).name - xp = 'Series' + xp = "Series" assert rs == xp # raise on MultiIndex GH4187 @@ -103,7 +114,7 @@ def test_constructor(self, datetime_series): with pytest.raises(NotImplementedError, match=msg): Series(m) - @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) + @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): empty = Series() empty2 = Series(input_class()) @@ -113,13 +124,13 @@ def test_constructor_empty(self, input_class): assert_series_equal(empty, empty2, check_index_type=False) # With explicit dtype: - empty = Series(dtype='float64') - empty2 = Series(input_class(), dtype='float64') + empty = 
Series(dtype="float64") + empty2 = Series(input_class(), dtype="float64") assert_series_equal(empty, empty2, check_index_type=False) # GH 18515 : with dtype=category: - empty = Series(dtype='category') - empty2 = Series(input_class(), dtype='category') + empty = Series(dtype="category") + empty2 = Series(input_class(), dtype="category") assert_series_equal(empty, empty2, check_index_type=False) if input_class is not list: @@ -130,26 +141,26 @@ def test_constructor_empty(self, input_class): # With index and dtype float64: empty = Series(np.nan, index=range(10)) - empty2 = Series(input_class(), index=range(10), dtype='float64') + empty2 = Series(input_class(), index=range(10), dtype="float64") assert_series_equal(empty, empty2) # GH 19853 : with empty string, index and dtype str - empty = Series('', dtype=str, index=range(3)) - empty2 = Series('', index=range(3)) + empty = Series("", dtype=str, index=range(3)) + empty2 = Series("", index=range(3)) assert_series_equal(empty, empty2) - @pytest.mark.parametrize('input_arg', [np.nan, float('nan')]) + @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) def test_constructor_nan(self, input_arg): - empty = Series(dtype='float64', index=range(10)) + empty = Series(dtype="float64", index=range(10)) empty2 = Series(input_arg, index=range(10)) assert_series_equal(empty, empty2, check_index_type=False) - @pytest.mark.parametrize('dtype', [ - 'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object', - 'datetime64[ns, UTC]', - ]) - @pytest.mark.parametrize('index', [None, pd.Index([])]) + @pytest.mark.parametrize( + "dtype", + ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"], + ) + @pytest.mark.parametrize("index", [None, pd.Index([])]) def test_constructor_dtype_only(self, dtype, index): # GH-20865 result = pd.Series(dtype=dtype, index=index) @@ -157,15 +168,15 @@ def test_constructor_dtype_only(self, dtype, index): assert len(result) == 0 def test_constructor_no_data_index_order(self): - result = pd.Series(index=['b', 'a', 'c']) - assert result.index.tolist() == ['b', 'a', 'c'] + result = pd.Series(index=["b", "a", "c"]) + assert result.index.tolist() == ["b", "a", "c"] def test_constructor_no_data_string_type(self): # GH 22477 result = pd.Series(index=[1], dtype=str) assert np.isnan(result.iloc[0]) - @pytest.mark.parametrize('item', ['entry', 'ѐ', 13]) + @pytest.mark.parametrize("item", ["entry", "ѐ", 13]) def test_constructor_string_element_string_type(self, item): # GH 22477 result = pd.Series(item, index=[1], dtype=str) @@ -173,17 +184,17 @@ def test_constructor_string_element_string_type(self, item): def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - ser = Series(['x', None], dtype=string_dtype) + ser = Series(["x", None], dtype=string_dtype) result = ser.isna() expected = Series([False, True]) tm.assert_series_equal(result, expected) assert ser.iloc[1] is None - ser = Series(['x', np.nan], dtype=string_dtype) + ser = Series(["x", np.nan], dtype=string_dtype) assert np.isnan(ser.iloc[1]) def test_constructor_series(self): - index1 = ['d', 'b', 'a', 'c'] + index1 = ["d", "b", "a", "c"] index2 = sorted(index1) s1 = Series([4, 7, -5, 3], index=index1) s2 = Series(s1, index=index2) @@ -197,20 +208,20 @@ def __iter__(self): for i in range(10): yield i - expected = Series(list(range(10)), dtype='int64') - result = Series(Iter(), dtype='int64') + expected = Series(list(range(10)), dtype="int64") + result = Series(Iter(), dtype="int64") 
assert_series_equal(result, expected) def test_constructor_sequence(self): # GH 21987 - expected = Series(list(range(10)), dtype='int64') - result = Series(range(10), dtype='int64') + expected = Series(list(range(10)), dtype="int64") + result = Series(range(10), dtype="int64") assert_series_equal(result, expected) def test_constructor_single_str(self): # GH 21987 - expected = Series(['abc']) - result = Series('abc') + expected = Series(["abc"]) + result = Series("abc") assert_series_equal(result, expected) def test_constructor_list_like(self): @@ -218,27 +229,28 @@ def test_constructor_list_like(self): # make sure that we are coercing different # list-likes to standard dtypes and not # platform specific - expected = Series([1, 2, 3], dtype='int64') - for obj in [[1, 2, 3], (1, 2, 3), - np.array([1, 2, 3], dtype='int64')]: + expected = Series([1, 2, 3], dtype="int64") + for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]: result = Series(obj, index=[0, 1, 2]) assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', ['bool', 'int32', 'int64', 'float64']) + @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"]) def test_constructor_index_dtype(self, dtype): # GH 17088 s = Series(Index([0, 2, 4]), dtype=dtype) assert s.dtype == dtype - @pytest.mark.parametrize('input_vals', [ - ([1, 2]), - (['1', '2']), - (list(pd.date_range('1/1/2011', periods=2, freq='H'))), - (list(pd.date_range('1/1/2011', periods=2, freq='H', - tz='US/Eastern'))), - ([pd.Interval(left=0, right=5)]), - ]) + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings @@ -249,7 +261,7 @@ def test_constructor_list_str(self, input_vals, string_dtype): def test_constructor_list_str_na(self, string_dtype): result = Series([1.0, 2.0, np.nan], dtype=string_dtype) - expected = Series(['1.0', '2.0', np.nan], dtype=object) + expected = Series(["1.0", "2.0", np.nan], dtype=object) assert_series_equal(result, expected) assert np.isnan(result[2]) @@ -279,37 +291,35 @@ def test_constructor_map(self): assert_series_equal(result, exp) def test_constructor_categorical(self): - cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], - fastpath=True) + cat = pd.Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True) res = Series(cat) tm.assert_categorical_equal(res.values, cat) # can cast to a new dtype - result = Series(pd.Categorical([1, 2, 3]), - dtype='int64') - expected = pd.Series([1, 2, 3], dtype='int64') + result = Series(pd.Categorical([1, 2, 3]), dtype="int64") + expected = pd.Series([1, 2, 3], dtype="int64") tm.assert_series_equal(result, expected) # GH12574 - cat = Series(pd.Categorical([1, 2, 3]), dtype='category') + cat = Series(pd.Categorical([1, 2, 3]), dtype="category") assert is_categorical_dtype(cat) assert is_categorical_dtype(cat.dtype) - s = Series([1, 2, 3], dtype='category') + s = Series([1, 2, 3], dtype="category") assert is_categorical_dtype(s) assert is_categorical_dtype(s.dtype) def test_constructor_categorical_with_coercion(self): - factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) # test basic creation / coercion of categoricals - s = Series(factor, 
name='A') - assert s.dtype == 'category' + s = Series(factor, name="A") + assert s.dtype == "category" assert len(s) == len(factor) str(s.values) str(s) # in a frame - df = DataFrame({'A': factor}) - result = df['A'] + df = DataFrame({"A": factor}) + result = df["A"] tm.assert_series_equal(result, s) result = df.iloc[:, 0] tm.assert_series_equal(result, s) @@ -317,30 +327,30 @@ def test_constructor_categorical_with_coercion(self): str(df.values) str(df) - df = DataFrame({'A': s}) - result = df['A'] + df = DataFrame({"A": s}) + result = df["A"] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) # multiples - df = DataFrame({'A': s, 'B': s, 'C': 1}) - result1 = df['A'] - result2 = df['B'] + df = DataFrame({"A": s, "B": s, "C": 1}) + result1 = df["A"] + result2 = df["B"] tm.assert_series_equal(result1, s) tm.assert_series_equal(result2, s, check_names=False) - assert result2.name == 'B' + assert result2.name == "B" assert len(df) == len(factor) str(df.values) str(df) # GH8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name - ) # doing this breaks transform + x = DataFrame( + [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) # doing this breaks transform expected = x.iloc[0].person_name result = x.person_name.iloc[0] @@ -353,47 +363,50 @@ def test_constructor_categorical_with_coercion(self): assert result == expected def test_constructor_categorical_dtype(self): - result = pd.Series(['a', 'b'], - dtype=CategoricalDtype(['a', 'b', 'c'], - ordered=True)) + result = pd.Series( + ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True) + ) assert is_categorical_dtype(result) is True - tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) + tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) assert result.cat.ordered - result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a'])) + result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) assert is_categorical_dtype(result) - tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) + tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) assert result.cat.ordered is False # GH 19565 - Check broadcasting of scalar with Categorical dtype - result = Series('a', index=[0, 1], - dtype=CategoricalDtype(['a', 'b'], ordered=True)) - expected = Series(['a', 'a'], index=[0, 1], - dtype=CategoricalDtype(['a', 'b'], ordered=True)) + result = Series( + "a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) + expected = Series( + ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) tm.assert_series_equal(result, expected, check_categorical=True) def test_constructor_categorical_string(self): # GH 26336: the string 'category' maintains existing CategoricalDtype - cdt = CategoricalDtype(categories=list('dabc'), ordered=True) - expected = Series(list('abcabc'), dtype=cdt) + cdt = CategoricalDtype(categories=list("dabc"), ordered=True) + expected = Series(list("abcabc"), dtype=cdt) # Series(Categorical, dtype='category') keeps existing dtype - cat = Categorical(list('abcabc'), dtype=cdt) - result = Series(cat, dtype='category') + cat = Categorical(list("abcabc"), dtype=cdt) + result = Series(cat, dtype="category") tm.assert_series_equal(result, expected) # Series(Series[Categorical], 
dtype='category') keeps existing dtype - result = Series(result, dtype='category') + result = Series(result, dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_categorical_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - cat = Categorical(list('abcdaba'), dtype=cdt1) + cat = Categorical(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning, check_stacklevel=False): Series(cat, dtype=cdt2) @@ -436,21 +449,19 @@ def test_categorical_sideeffects_free(self): tm.assert_numpy_array_equal(cat.__array__(), exp_s2) def test_unordered_compare_equal(self): - left = pd.Series(['a', 'b', 'c'], - dtype=CategoricalDtype(['a', 'b'])) - right = pd.Series(pd.Categorical(['a', 'b', np.nan], - categories=['a', 'b'])) + left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) + right = pd.Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) tm.assert_series_equal(left, right) def test_constructor_maskedarray(self): - data = ma.masked_all((3, ), dtype=float) + data = ma.masked_all((3,), dtype=float) result = Series(data) expected = Series([nan, nan, nan]) assert_series_equal(result, expected) data[0] = 0.0 data[2] = 2.0 - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([0.0, nan, 2.0], index=index) assert_series_equal(result, expected) @@ -460,14 +471,14 @@ def test_constructor_maskedarray(self): expected = Series([0.0, 1.0, 2.0], index=index) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype=int) + data = ma.masked_all((3,), dtype=int) result = Series(data) expected = Series([nan, nan, nan], dtype=float) assert_series_equal(result, expected) data[0] = 0 data[2] = 2 - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([0, nan, 2], index=index, dtype=float) assert_series_equal(result, expected) @@ -477,14 +488,14 @@ def test_constructor_maskedarray(self): expected = Series([0, 1, 2], index=index, dtype=int) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype=bool) + data = ma.masked_all((3,), dtype=bool) result = Series(data) expected = Series([nan, nan, nan], dtype=object) assert_series_equal(result, expected) data[0] = True data[2] = False - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([True, nan, False], index=index, dtype=object) assert_series_equal(result, expected) @@ -494,34 +505,40 @@ def test_constructor_maskedarray(self): expected = Series([True, True, False], index=index, dtype=bool) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype='M8[ns]') + data = ma.masked_all((3,), dtype="M8[ns]") result = Series(data) - expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]') + expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]") assert_series_equal(result, expected) data[0] = datetime(2001, 1, 1) data[2] = datetime(2001, 1, 3) - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) - 
expected = Series([datetime(2001, 1, 1), iNaT, - datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + expected = Series( + [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) assert_series_equal(result, expected) data[1] = datetime(2001, 1, 2) result = Series(data, index=index) - expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2), - datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + expected = Series( + [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) assert_series_equal(result, expected) def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 - data = ma.masked_all((3, ), dtype=float).harden_mask() + data = ma.masked_all((3,), dtype=float).harden_mask() result = pd.Series(data) expected = pd.Series([nan, nan, nan]) tm.assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): - rng = date_range('20090415', '20090519', freq='B') + rng = date_range("20090415", "20090519", freq="B") data = {k: 1 for k in rng} result = Series(data, index=rng) @@ -531,17 +548,22 @@ def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) - @pytest.mark.parametrize('input', [[1, 2, 3], - (1, 2, 3), - list(range(3)), - pd.Categorical(['a', 'b', 'a']), - (i for i in range(3)), - map(lambda x: x, range(3))]) + @pytest.mark.parametrize( + "input", + [ + [1, 2, 3], + (1, 2, 3), + list(range(3)), + pd.Categorical(["a", "b", "a"]), + (i for i in range(3)), + map(lambda x: x, range(3)), + ], + ) def test_constructor_index_mismatch(self, input): # GH 19342 # test that construction of a Series with an index of different length # raises an error - msg = 'Length of passed values is 3, index implies 4' + msg = "Length of passed values is 3, index implies 4" with pytest.raises(ValueError, match=msg): Series(input, index=np.arange(4)) @@ -549,8 +571,8 @@ def test_constructor_numpy_scalar(self): # GH 19342 # construction with a numpy scalar # should not raise - result = Series(np.array(100), index=np.arange(4), dtype='int64') - expected = Series(100, index=np.arange(4), dtype='int64') + result = Series(np.array(100), index=np.arange(4), dtype="int64") + expected = Series(100, index=np.arange(4), dtype="int64") tm.assert_series_equal(result, expected) def test_constructor_broadcast_list(self): @@ -559,7 +581,7 @@ def test_constructor_broadcast_list(self): # should raise msg = "Length of passed values is 1, index implies 3" with pytest.raises(ValueError, match=msg): - Series(['foo'], index=['a', 'b', 'c']) + Series(["foo"], index=["a", "b", "c"]) def test_constructor_corner(self): df = tm.makeTimeDataFrame() @@ -568,16 +590,16 @@ def test_constructor_corner(self): assert isinstance(s, Series) def test_constructor_sanitize(self): - s = Series(np.array([1., 1., 8.]), dtype='i8') - assert s.dtype == np.dtype('i8') + s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") + assert s.dtype == np.dtype("i8") - s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') - assert s.dtype == np.dtype('f8') + s = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") + assert s.dtype == np.dtype("f8") def test_constructor_copy(self): # GH15125 # test dtype parameter has no side effects on copy=True - for data in [[1.], np.array([1.])]: + for data in [[1.0], np.array([1.0])]: x = Series(data) y = pd.Series(x, copy=True, dtype=float) @@ -585,22 +607,24 @@ def test_constructor_copy(self): 
tm.assert_series_equal(x, y) # changes to origin of copy does not affect the copy - x[0] = 2. + x[0] = 2.0 assert not x.equals(y) - assert x[0] == 2. - assert y[0] == 1. + assert x[0] == 2.0 + assert y[0] == 1.0 @pytest.mark.parametrize( "index", [ - pd.date_range('20170101', periods=3, tz='US/Eastern'), - pd.date_range('20170101', periods=3), - pd.timedelta_range('1 day', periods=3), - pd.period_range('2012Q1', periods=3, freq='Q'), - pd.Index(list('abc')), + pd.date_range("20170101", periods=3, tz="US/Eastern"), + pd.date_range("20170101", periods=3), + pd.timedelta_range("1 day", periods=3), + pd.period_range("2012Q1", periods=3, freq="Q"), + pd.Index(list("abc")), pd.Int64Index([1, 2, 3]), - pd.RangeIndex(0, 3)], - ids=lambda x: type(x).__name__) + pd.RangeIndex(0, 3), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input @@ -630,7 +654,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) - assert exp.dtype == 'datetime64[ns]' + assert exp.dtype == "datetime64[ns]" tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) @@ -647,7 +671,7 @@ def test_constructor_cast(self): def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 - msg = 'Trying to coerce negative values to unsigned integers' + msg = "Trying to coerce negative values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) @@ -675,38 +699,36 @@ def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified - s = Series([Timestamp('20130101'), 'NOV'], dtype=object) - assert s.iloc[0] == Timestamp('20130101') - assert s.iloc[1] == 'NOV' + s = Series([Timestamp("20130101"), "NOV"], dtype=object) + assert s.iloc[0] == Timestamp("20130101") + assert s.iloc[1] == "NOV" assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed - belly = '216 3T19'.split() - wing1 = '2T15 4H19'.split() - wing2 = '416 4T20'.split() - mat = pd.to_datetime('2016-01-22 2019-09-07'.split()) - df = pd.DataFrame( - {'wing1': wing1, - 'wing2': wing2, - 'mat': mat}, index=belly) - - result = df.loc['3T19'] + belly = "216 3T19".split() + wing1 = "2T15 4H19".split() + wing2 = "416 4T20".split() + mat = pd.to_datetime("2016-01-22 2019-09-07".split()) + df = pd.DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly) + + result = df.loc["3T19"] assert result.dtype == object - result = df.loc['216'] + result = df.loc["216"] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 - for arr in [np.array([None, None, None, None, - datetime.now(), None]), - np.array([None, None, datetime.now(), None])]: + for arr in [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + ]: result = Series(arr) - assert result.dtype == 'M8[ns]' + assert result.dtype == "M8[ns]" def test_constructor_dtype_datetime64(self): - s = Series(iNaT, dtype='M8[ns]', index=range(5)) + s = Series(iNaT, dtype="M8[ns]", index=range(5)) assert isna(s).all() # in theory this should be all nulls, but since @@ -714,16 +736,16 @@ def test_constructor_dtype_datetime64(self): s = Series(iNaT, index=range(5)) assert not isna(s).all() - s = Series(nan, dtype='M8[ns]', 
index=range(5)) + s = Series(nan, dtype="M8[ns]", index=range(5)) assert isna(s).all() - s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]') + s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]") assert isna(s[1]) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" - s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') + s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]") assert isna(s[1]) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" # GH3416 dates = [ @@ -733,43 +755,38 @@ def test_constructor_dtype_datetime64(self): ] s = Series(dates) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" s.iloc[0] = np.nan - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" # GH3414 related - expected = Series([ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - ], dtype='datetime64[ns]') + expected = Series( + [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)], + dtype="datetime64[ns]", + ) - result = Series( - Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]') + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) - result = Series(dates, dtype='datetime64[ns]') + result = Series(dates, dtype="datetime64[ns]") tm.assert_series_equal(result, expected) - expected = Series([ - pd.NaT, - datetime(2013, 1, 2), - datetime(2013, 1, 3), - ], dtype='datetime64[ns]') - result = Series([np.nan] + dates[1:], dtype='datetime64[ns]') + expected = Series( + [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]" + ) + result = Series([np.nan] + dates[1:], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) - dts = Series(dates, dtype='datetime64[ns]') + dts = Series(dates, dtype="datetime64[ns]") # valid astype - dts.astype('int64') + dts.astype("int64") # invalid casting - msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[int32\]") + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): - dts.astype('int32') + dts.astype("int32") # ints are ok # we test with np.int64 to get similar results on @@ -786,101 +803,102 @@ def test_constructor_dtype_datetime64(self): assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types - result = Series([Timestamp('20130101'), 1], index=['a', 'b']) - assert result['a'] == Timestamp('20130101') - assert result['b'] == 1 + result = Series([Timestamp("20130101"), 1], index=["a", "b"]) + assert result["a"] == Timestamp("20130101") + assert result["b"] == 1 # GH6529 # coerce datetime64 non-ns properly - dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') - values2 = dates.view(np.ndarray).astype('datetime64[ns]') + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) - for dtype in ['s', 'D', 'ms', 'us', 'ns']: - values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) - for dtype in ['s', 'D', 'ms', 'us', 'ns']: - values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = 
dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, index=dates, dtype=object) assert_series_equal(result, expected) # leave datetime.date alone - dates2 = np.array([d.date() for d in dates.to_pydatetime()], - dtype=object) + dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime - s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' + s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" # tz-aware (UTC and other tz's) # GH 8411 - dr = date_range('20130101', periods=3) + dr = date_range("20130101", periods=3) assert Series(dr).iloc[0].tz is None - dr = date_range('20130101', periods=3, tz='UTC') - assert str(Series(dr).iloc[0].tz) == 'UTC' - dr = date_range('20130101', periods=3, tz='US/Eastern') - assert str(Series(dr).iloc[0].tz) == 'US/Eastern' + dr = date_range("20130101", periods=3, tz="UTC") + assert str(Series(dr).iloc[0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + assert str(Series(dr).iloc[0].tz) == "US/Eastern" # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is pd.NaT - assert 'NaT' in str(s) + assert "NaT" in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is pd.NaT - assert 'NaT' in str(s) + assert "NaT" in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is np.nan - assert 'NaN' in str(s) + assert "NaN" in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz - dr = date_range('20130101', periods=3, tz='US/Eastern') + dr = date_range("20130101", periods=3, tz="US/Eastern") s = Series(dr) - assert s.dtype.name == 'datetime64[ns, US/Eastern]' - assert s.dtype == 'datetime64[ns, US/Eastern]' + assert s.dtype.name == "datetime64[ns, US/Eastern]" + assert s.dtype == "datetime64[ns, US/Eastern]" assert is_datetime64tz_dtype(s.dtype) - assert 'datetime64[ns, US/Eastern]' in str(s) + assert "datetime64[ns, US/Eastern]" in str(s) # export result = s.values assert isinstance(result, np.ndarray) - assert result.dtype == 'datetime64[ns]' + assert result.dtype == "datetime64[ns]" exp = pd.DatetimeIndex(result) - exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) + exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] - assert result == Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', freq='D') + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) 
result = s[0] - assert result == Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', freq='D') + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) @@ -893,39 +911,47 @@ def test_constructor_with_datetime_tz(self): assert_series_equal(result, s) # short str - assert 'datetime64[ns, US/Eastern]' in str(s) + assert "datetime64[ns, US/Eastern]" in str(s) # formatting with NaT result = s.shift() - assert 'datetime64[ns, US/Eastern]' in str(result) - assert 'NaT' in str(result) + assert "datetime64[ns, US/Eastern]" in str(result) + assert "NaT" in str(result) # long str - t = Series(date_range('20130101', periods=1000, tz='US/Eastern')) - assert 'datetime64[ns, US/Eastern]' in str(t) + t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) + assert "datetime64[ns, US/Eastern]" in str(t) - result = pd.DatetimeIndex(s, freq='infer') + result = pd.DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) # inference - s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) - assert s.dtype == 'datetime64[ns, US/Pacific]' - assert lib.infer_dtype(s, skipna=True) == 'datetime64' - - s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) - assert s.dtype == 'object' - assert lib.infer_dtype(s, skipna=True) == 'datetime' + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] + ) + assert s.dtype == "datetime64[ns, US/Pacific]" + assert lib.infer_dtype(s, skipna=True) == "datetime64" + + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), + ] + ) + assert s.dtype == "object" + assert lib.infer_dtype(s, skipna=True) == "datetime" # with all NaT - s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') - expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) + s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 @@ -937,17 +963,16 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('arg', - ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) + @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string - result = Series([arg], dtype='datetime64[ns, CET]') - expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') + result = Series([arg], dtype="datetime64[ns, CET]") + expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals - index = IntervalIndex.from_breaks(np.arange(3), closed='right') + index = 
IntervalIndex.from_breaks(np.arange(3), closed="right") result = Series(index) repr(result) str(result) @@ -960,30 +985,30 @@ def test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 - s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + s = Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) - result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) + result = Series(s.dt.tz_convert("UTC"), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_infer_period(self): - data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None] + data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] result = pd.Series(data) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" data = np.asarray(data, dtype=object) tm.assert_series_equal(result, expected) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" def test_constructor_period_incompatible_frequency(self): - data = [pd.Period('2000', 'D'), pd.Period('2001', 'A')] + data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) assert result.dtype == object assert result.tolist() == data @@ -992,16 +1017,16 @@ def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series - pi = period_range('20130101', periods=5, freq='D') + pi = period_range("20130101", periods=5, freq="D") s = Series(pi) - assert s.dtype == 'Period[D]' + assert s.dtype == "Period[D]" expected = Series(pi.astype(object)) assert_series_equal(s, expected) def test_constructor_dict(self): - d = {'a': 0., 'b': 1., 'c': 2.} - result = Series(d, index=['b', 'c', 'd', 'a']) - expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) + d = {"a": 0.0, "b": 1.0, "c": 2.0} + result = Series(d, index=["b", "c", "d", "a"]) + expected = Series([1, 2, nan, 0], index=["b", "c", "d", "a"]) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) @@ -1016,40 +1041,41 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} result = Series(d) if PY36: - expected = Series([1, 0, 2], index=list('bac')) + expected = Series([1, 0, 2], index=list("bac")) else: - expected = Series([0, 1, 2], index=list('abc')) + expected = Series([0, 1, 2], index=list("abc")) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18480 - d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} + d = {1: "a", value: "b", float("nan"): "c", 4: "d"} result = Series(d).sort_values() - expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) + expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4]) assert_series_equal(result, expected) # MultiIndex: - d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} + d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"} result = Series(d).sort_values() - expected = Series(['a', 'b', 'c'], - index=Index([(1, 1), (2, np.nan), (3, value)])) + expected = Series( + ["a", "b", "c"], index=Index([(1, 1), (2, np.nan), (3, value)]) + ) 
assert_series_equal(result, expected) def test_constructor_dict_datetime64_index(self): # GH 9456 - dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] values = [42544017.198965244, 1234565, 40512335.181958228, -1] def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values)) data_datetime64 = create_data(np.datetime64) - data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) data_Timestamp = create_data(Timestamp) expected = Series(values, (Timestamp(x) for x in dates_as_str)) @@ -1073,11 +1099,9 @@ def test_constructor_tuple_of_tuples(self): assert tuple(s) == data def test_constructor_dict_of_tuples(self): - data = {(1, 2): 3, - (None, 5): 6} + data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() - expected = Series([3, 6], - index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected) def test_constructor_set(self): @@ -1091,20 +1115,20 @@ def test_constructor_set(self): # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") def test_fromDict(self): - data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} + data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) tm.assert_is_sorted(series.index) - data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} + data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()} series = Series(data) assert series.dtype == np.object_ - data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'} + data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) assert series.dtype == np.object_ - data = {'a': '0', 'b': '1'} + data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 @@ -1114,20 +1138,20 @@ def test_fromValue(self, datetime_series): assert nans.dtype == np.float_ assert len(nans) == len(datetime_series) - strings = Series('foo', index=datetime_series.index) + strings = Series("foo", index=datetime_series.index) assert strings.dtype == np.object_ assert len(strings) == len(datetime_series) d = datetime.now() dates = Series(d, index=datetime_series.index) - assert dates.dtype == 'M8[ns]' + assert dates.dtype == "M8[ns]" assert len(dates) == len(datetime_series) # GH12336 # Test construction of categorical series from value categorical = Series(0, index=datetime_series.index, dtype="category") expected = Series(0, index=datetime_series.index).astype("category") - assert categorical.dtype == 'category' + assert categorical.dtype == "category" assert len(categorical) == len(datetime_series) tm.assert_series_equal(categorical, expected) @@ -1135,43 +1159,42 @@ def test_constructor_dtype_timedelta64(self): # basic td = Series([timedelta(days=i) for i in range(3)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" td = Series([timedelta(days=1)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" - td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64( - 1, 's')]) + td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # mixed with NaT - td = Series([timedelta(days=1), NaT], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = 
Series([timedelta(days=1), NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" - td = Series([timedelta(days=1), np.nan], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = Series([timedelta(days=1), np.nan], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" - td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" # improved inference # GH5689 td = Series([np.timedelta64(300000000), NaT]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), iNaT]) - assert td.dtype == 'object' + assert td.dtype == "object" td = Series([np.timedelta64(300000000), np.nan]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" td = Series([pd.NaT, np.timedelta64(300000000)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" - td = Series([np.timedelta64(1, 's')]) - assert td.dtype == 'timedelta64[ns]' + td = Series([np.timedelta64(1, "s")]) + assert td.dtype == "timedelta64[ns]" # these are frequency conversion astypes # for t in ['s', 'D', 'us', 'ms']: @@ -1179,44 +1202,43 @@ def test_constructor_dtype_timedelta64(self): # td.astype('m8[%s]' % t) # valid astype - td.astype('int64') + td.astype("int64") # invalid casting - msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[int32\]") + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): - td.astype('int32') + td.astype("int32") # this is an invalid casting msg = "Could not convert object to NumPy timedelta" with pytest.raises(ValueError, match=msg): - Series([timedelta(days=1), 'foo'], dtype='m8[ns]') + Series([timedelta(days=1), "foo"], dtype="m8[ns]") # leave as object here - td = Series([timedelta(days=i) for i in range(3)] + ['foo']) - assert td.dtype == 'object' + td = Series([timedelta(days=i) for i in range(3)] + ["foo"]) + assert td.dtype == "object" # these will correctly infer a timedelta - s = Series([None, pd.NaT, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([np.nan, pd.NaT, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([pd.NaT, None, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([pd.NaT, np.nan, '1 Day']) - assert s.dtype == 'timedelta64[ns]' + s = Series([None, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([np.nan, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, None, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, np.nan, "1 Day"]) + assert s.dtype == "timedelta64[ns]" # GH 16406 def test_constructor_mixed_tz(self): - s = Series([Timestamp('20130101'), - Timestamp('20130101', tz='US/Eastern')]) - expected = Series([Timestamp('20130101'), - Timestamp('20130101', tz='US/Eastern')], - dtype='object') + s = Series([Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")]) + expected = Series( + [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")], + dtype="object", + ) assert_series_equal(s, expected) def test_NaT_scalar(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") val = series[3] assert isna(val) @@ -1226,42 +1248,42 @@ def test_NaT_scalar(self): def test_NaT_cast(self): # GH10747 - result = 
Series([np.nan]).astype('M8[ns]') + result = Series([np.nan]).astype("M8[ns]") expected = Series([NaT]) assert_series_equal(result, expected) def test_constructor_name_hashable(self): - for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), "\u05D0"]: - for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]: + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05D0"]: + for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]: s = Series(data, name=n) assert s.name == n def test_constructor_name_unhashable(self): msg = r"Series\.name must be a hashable type" - for n in [['name_list'], np.ones(2), {1: 2}]: - for data in [['name_list'], np.ones(2), {1: 2}]: + for n in [["name_list"], np.ones(2), {1: 2}]: + for data in [["name_list"], np.ones(2), {1: 2}]: with pytest.raises(TypeError, match=msg): Series(data, name=n) def test_auto_conversion(self): - series = Series(list(date_range('1/1/2000', periods=10))) - assert series.dtype == 'M8[ns]' + series = Series(list(date_range("1/1/2000", periods=10))) + assert series.dtype == "M8[ns]" def test_convert_non_ns(self): # convert from a numpy array of non-ns timedelta64 - arr = np.array([1, 2, 3], dtype='timedelta64[s]') + arr = np.array([1, 2, 3], dtype="timedelta64[s]") s = Series(arr) - expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s')) + expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s")) assert_series_equal(s, expected) # convert from a numpy array of non-ns datetime64 # note that creating a numpy datetime64 is in LOCAL time!!!! # seems to work for M8[D], but not for M8[s] - s = Series(np.array(['2013-01-01', '2013-01-02', - '2013-01-03'], dtype='datetime64[D]')) - assert_series_equal(s, Series(date_range('20130101', periods=3, - freq='D'))) + s = Series( + np.array(["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]") + ) + assert_series_equal(s, Series(date_range("20130101", periods=3, freq="D"))) # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) @@ -1272,10 +1294,12 @@ def test_convert_non_ns(self): @pytest.mark.parametrize( "index", [ - date_range('1/1/2000', periods=10), - timedelta_range('1 day', periods=10), - period_range('2000-Q1', periods=10, freq='Q')], - ids=lambda x: type(x).__name__) + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_cant_cast_datetimelike(self, index): # floats are not ok @@ -1298,10 +1322,12 @@ def test_constructor_cant_cast_datetimelike(self, index): @pytest.mark.parametrize( "index", [ - date_range('1/1/2000', periods=10), - timedelta_range('1 day', periods=10), - period_range('2000-Q1', periods=10, freq='Q')], - ids=lambda x: type(x).__name__) + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_cast_object(self, index): s = Series(index, dtype=object) exp = Series(index).astype(object) @@ -1315,10 +1341,7 @@ def test_constructor_cast_object(self, index): exp = Series(index).astype(object) tm.assert_series_equal(s, exp) - @pytest.mark.parametrize("dtype", [ - np.datetime64, - np.timedelta64, - ]) + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) def test_constructor_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 msg = "dtype has no unit. 
Please pass in" @@ -1326,27 +1349,32 @@ def test_constructor_generic_timestamp_no_frequency(self, dtype): with pytest.raises(ValueError, match=msg): Series([], dtype=dtype) - @pytest.mark.parametrize("dtype,msg", [ - ("m8[ps]", "cannot convert timedeltalike"), - ("M8[ps]", "cannot convert datetimelike"), - ]) + @pytest.mark.parametrize( + "dtype,msg", + [ + ("m8[ps]", "cannot convert timedeltalike"), + ("M8[ps]", "cannot convert datetimelike"), + ], + ) def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg): # see gh-15524, gh-15987 with pytest.raises(TypeError, match=msg): Series([], dtype=dtype) - @pytest.mark.parametrize('dtype', [None, 'uint8', 'category']) + @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) def test_constructor_range_dtype(self, dtype): # GH 16804 - expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64') + expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64") result = Series(range(5), dtype=dtype) tm.assert_series_equal(result, expected) def test_constructor_tz_mixed_data(self): # GH 13051 - dt_list = [Timestamp('2016-05-01 02:03:37'), - Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')] + dt_list = [ + Timestamp("2016-05-01 02:03:37"), + Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"), + ] result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 86164f53c55155..c9092917cb0c69 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -13,8 +13,17 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, Series, TimedeltaIndex, - bdate_range, date_range, period_range, timedelta_range) + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Series, + TimedeltaIndex, + bdate_range, + date_range, + period_range, + timedelta_range, +) from pandas.core.arrays import PeriodArray import pandas.core.common as com import pandas.util.testing as tm @@ -22,27 +31,42 @@ class TestSeriesDatetimeValues: - def test_dt_namespace_accessor(self): # GH 7207, 11128 # test .dt namespace accessor ok_for_period = PeriodArray._datetimelike_ops - ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq'] + ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] ok_for_dt = DatetimeIndex._datetimelike_ops - ok_for_dt_methods = ['to_period', 'to_pydatetime', 'tz_localize', - 'tz_convert', 'normalize', 'strftime', 'round', - 'floor', 'ceil', 'day_name', 'month_name'] + ok_for_dt_methods = [ + "to_period", + "to_pydatetime", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "day_name", + "month_name", + ] ok_for_td = TimedeltaIndex._datetimelike_ops - ok_for_td_methods = ['components', 'to_pytimedelta', 'total_seconds', - 'round', 'floor', 'ceil'] + ok_for_td_methods = [ + "components", + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): - result = result.astype('int64') + result = result.astype("int64") elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) @@ -56,15 +80,15 @@ def compare(s, name): tm.assert_series_equal(a, b) # datetimeindex - cases = [Series(date_range('20130101', periods=5), name='xxx'), - Series(date_range('20130101', periods=5, freq='s'), - 
name='xxx'), - Series(date_range('20130101 00:00:00', periods=5, freq='ms'), - name='xxx')] + cases = [ + Series(date_range("20130101", periods=5), name="xxx"), + Series(date_range("20130101", periods=5, freq="s"), name="xxx"), + Series(date_range("20130101 00:00:00", periods=5, freq="ms"), name="xxx"), + ] for s in cases: for prop in ok_for_dt: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_dt_methods: @@ -74,30 +98,30 @@ def compare(s, name): assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.tz_localize('US/Eastern') - exp_values = DatetimeIndex(s.values).tz_localize('US/Eastern') - expected = Series(exp_values, index=s.index, name='xxx') + result = s.dt.tz_localize("US/Eastern") + exp_values = DatetimeIndex(s.values).tz_localize("US/Eastern") + expected = Series(exp_values, index=s.index, name="xxx") tm.assert_series_equal(result, expected) tz_result = result.dt.tz - assert str(tz_result) == 'US/Eastern' + assert str(tz_result) == "US/Eastern" freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq='infer').freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq # let's localize, then convert - result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') - exp_values = (DatetimeIndex(s.values).tz_localize('UTC') - .tz_convert('US/Eastern')) - expected = Series(exp_values, index=s.index, name='xxx') + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") + exp_values = ( + DatetimeIndex(s.values).tz_localize("UTC").tz_convert("US/Eastern") + ) + expected = Series(exp_values, index=s.index, name="xxx") tm.assert_series_equal(result, expected) # datetimeindex with tz - s = Series(date_range('20130101', periods=5, tz='US/Eastern'), - name='xxx') + s = Series(date_range("20130101", periods=5, tz="US/Eastern"), name="xxx") for prop in ok_for_dt: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_dt_methods: @@ -107,27 +131,30 @@ def compare(s, name): assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.tz_convert('CET') - expected = Series(s._values.tz_convert('CET'), - index=s.index, name='xxx') + result = s.dt.tz_convert("CET") + expected = Series(s._values.tz_convert("CET"), index=s.index, name="xxx") tm.assert_series_equal(result, expected) tz_result = result.dt.tz - assert str(tz_result) == 'CET' + assert str(tz_result) == "CET" freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq='infer').freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq # timedelta index - cases = [Series(timedelta_range('1 day', periods=5), - index=list('abcde'), name='xxx'), - Series(timedelta_range('1 day 01:23:45', periods=5, - freq='s'), name='xxx'), - Series(timedelta_range('2 days 01:23:45.012345', periods=5, - freq='ms'), name='xxx')] + cases = [ + Series( + timedelta_range("1 day", periods=5), index=list("abcde"), name="xxx" + ), + Series(timedelta_range("1 day 01:23:45", periods=5, freq="s"), name="xxx"), + Series( + timedelta_range("2 days 01:23:45.012345", periods=5, freq="ms"), + name="xxx", + ), + ] for s in cases: for prop in ok_for_td: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_td_methods: @@ -143,37 +170,34 @@ def compare(s, name): result = s.dt.total_seconds() assert isinstance(result, pd.Series) - assert result.dtype == 'float64' + assert result.dtype == "float64" freq_result = s.dt.freq - 
assert freq_result == TimedeltaIndex(s.values, freq='infer').freq + assert freq_result == TimedeltaIndex(s.values, freq="infer").freq # both - index = date_range('20130101', periods=3, freq='D') - s = Series(date_range('20140204', periods=3, freq='s'), - index=index, name='xxx') - exp = Series(np.array([2014, 2014, 2014], dtype='int64'), - index=index, name='xxx') + index = date_range("20130101", periods=3, freq="D") + s = Series(date_range("20140204", periods=3, freq="s"), index=index, name="xxx") + exp = Series( + np.array([2014, 2014, 2014], dtype="int64"), index=index, name="xxx" + ) tm.assert_series_equal(s.dt.year, exp) - exp = Series(np.array([2, 2, 2], dtype='int64'), - index=index, name='xxx') + exp = Series(np.array([2, 2, 2], dtype="int64"), index=index, name="xxx") tm.assert_series_equal(s.dt.month, exp) - exp = Series(np.array([0, 1, 2], dtype='int64'), - index=index, name='xxx') + exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx") tm.assert_series_equal(s.dt.second, exp) - exp = pd.Series([s[0]] * 3, index=index, name='xxx') + exp = pd.Series([s[0]] * 3, index=index, name="xxx") tm.assert_series_equal(s.dt.normalize(), exp) # periodindex - cases = [Series(period_range('20130101', periods=5, freq='D'), - name='xxx')] + cases = [Series(period_range("20130101", periods=5, freq="D"), name="xxx")] for s in cases: for prop in ok_for_period: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_period_methods: @@ -184,168 +208,203 @@ def compare(s, name): # test limited display api def get_dir(s): - results = [r for r in s.dt.__dir__() if not r.startswith('_')] + results = [r for r in s.dt.__dir__() if not r.startswith("_")] return list(sorted(set(results))) - s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) + results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) + ) - s = Series(period_range('20130101', periods=5, - freq='D', name='xxx').astype(object)) + s = Series( + period_range("20130101", periods=5, freq="D", name="xxx").astype(object) + ) results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_period + ok_for_period_methods)))) + results, list(sorted(set(ok_for_period + ok_for_period_methods))) + ) # 11295 # ambiguous time error on the conversions - s = Series(pd.date_range('2015-01-01', '2016-01-01', - freq='T'), name='xxx') - s = s.dt.tz_localize('UTC').dt.tz_convert('America/Chicago') + s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) - exp_values = pd.date_range('2015-01-01', '2016-01-01', freq='T', - tz='UTC').tz_convert('America/Chicago') - expected = Series(exp_values, name='xxx') + results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) + ) + exp_values = pd.date_range( + "2015-01-01", "2016-01-01", freq="T", tz="UTC" + ).tz_convert("America/Chicago") + expected = Series(exp_values, name="xxx") tm.assert_series_equal(s, expected) # no setting allowed - s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): s.dt.hour = 5 # trying to set a copy - with 
pd.option_context('chained_assignment', 'raise'): + with pd.option_context("chained_assignment", "raise"): with pytest.raises(com.SettingWithCopyError): s.dt.hour[0] = 5 - @pytest.mark.parametrize('method, dates', [ - ['round', ['2012-01-02', '2012-01-02', '2012-01-01']], - ['floor', ['2012-01-01', '2012-01-01', '2012-01-01']], - ['ceil', ['2012-01-02', '2012-01-02', '2012-01-02']] - ]) + @pytest.mark.parametrize( + "method, dates", + [ + ["round", ["2012-01-02", "2012-01-02", "2012-01-01"]], + ["floor", ["2012-01-01", "2012-01-01", "2012-01-01"]], + ["ceil", ["2012-01-02", "2012-01-02", "2012-01-02"]], + ], + ) def test_dt_round(self, method, dates): # round - s = Series(pd.to_datetime(['2012-01-01 13:00:00', - '2012-01-01 12:01:00', - '2012-01-01 08:00:00']), name='xxx') - result = getattr(s.dt, method)('D') - expected = Series(pd.to_datetime(dates), name='xxx') + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = getattr(s.dt, method)("D") + expected = Series(pd.to_datetime(dates), name="xxx") tm.assert_series_equal(result, expected) def test_dt_round_tz(self): - s = Series(pd.to_datetime(['2012-01-01 13:00:00', - '2012-01-01 12:01:00', - '2012-01-01 08:00:00']), name='xxx') - result = (s.dt.tz_localize('UTC') - .dt.tz_convert('US/Eastern') - .dt.round('D')) - - exp_values = pd.to_datetime(['2012-01-01', '2012-01-01', - '2012-01-01']).tz_localize('US/Eastern') - expected = Series(exp_values, name='xxx') + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern").dt.round("D") + + exp_values = pd.to_datetime( + ["2012-01-01", "2012-01-01", "2012-01-01"] + ).tz_localize("US/Eastern") + expected = Series(exp_values, name="xxx") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('method', ['ceil', 'round', 'floor']) + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) def test_dt_round_tz_ambiguous(self, method): # GH 18946 round near "fall back" DST - df1 = pd.DataFrame([ - pd.to_datetime('2017-10-29 02:00:00+02:00', utc=True), - pd.to_datetime('2017-10-29 02:00:00+01:00', utc=True), - pd.to_datetime('2017-10-29 03:00:00+01:00', utc=True) - ], - columns=['date']) - df1['date'] = df1['date'].dt.tz_convert('Europe/Madrid') + df1 = pd.DataFrame( + [ + pd.to_datetime("2017-10-29 02:00:00+02:00", utc=True), + pd.to_datetime("2017-10-29 02:00:00+01:00", utc=True), + pd.to_datetime("2017-10-29 03:00:00+01:00", utc=True), + ], + columns=["date"], + ) + df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") # infer - result = getattr(df1.date.dt, method)('H', ambiguous='infer') - expected = df1['date'] + result = getattr(df1.date.dt, method)("H", ambiguous="infer") + expected = df1["date"] tm.assert_series_equal(result, expected) # bool-array - result = getattr(df1.date.dt, method)( - 'H', ambiguous=[True, False, False] - ) + result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) tm.assert_series_equal(result, expected) # NaT - result = getattr(df1.date.dt, method)('H', ambiguous='NaT') - expected = df1['date'].copy() + result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + expected = df1["date"].copy() expected.iloc[0:2] = pd.NaT tm.assert_series_equal(result, expected) # raise with pytest.raises(pytz.AmbiguousTimeError): - getattr(df1.date.dt, method)('H', ambiguous='raise') - - @pytest.mark.parametrize('method, ts_str, freq', [ 
- ['ceil', '2018-03-11 01:59:00-0600', '5min'], - ['round', '2018-03-11 01:59:00-0600', '5min'], - ['floor', '2018-03-11 03:01:00-0500', '2H']]) + getattr(df1.date.dt, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): # GH 23324 round near "spring forward" DST - s = Series([pd.Timestamp(ts_str, tz='America/Chicago')]) - result = getattr(s.dt, method)(freq, nonexistent='shift_forward') - expected = Series( - [pd.Timestamp('2018-03-11 03:00:00', tz='America/Chicago')] - ) + s = Series([pd.Timestamp(ts_str, tz="America/Chicago")]) + result = getattr(s.dt, method)(freq, nonexistent="shift_forward") + expected = Series([pd.Timestamp("2018-03-11 03:00:00", tz="America/Chicago")]) tm.assert_series_equal(result, expected) - result = getattr(s.dt, method)(freq, nonexistent='NaT') + result = getattr(s.dt, method)(freq, nonexistent="NaT") expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) tm.assert_series_equal(result, expected) - with pytest.raises(pytz.NonExistentTimeError, - match='2018-03-11 02:00:00'): - getattr(s.dt, method)(freq, nonexistent='raise') + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(s.dt, method)(freq, nonexistent="raise") def test_dt_namespace_accessor_categorical(self): # GH 19468 - dti = DatetimeIndex(['20171111', '20181212']).repeat(2) - s = Series(pd.Categorical(dti), name='foo') + dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) + s = Series(pd.Categorical(dti), name="foo") result = s.dt.year - expected = Series([2017, 2017, 2018, 2018], name='foo') + expected = Series([2017, 2017, 2018, 2018], name="foo") tm.assert_series_equal(result, expected) def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(date_range('20130101', periods=5, freq='D')) - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + s = Series(date_range("20130101", periods=5, freq="D")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.dt.xlabel = "a" - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes - expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - expected_months = ['January', 'February', 'March', 'April', 'May', - 'June', 'July', 'August', 'September', - 'October', 'November', 'December'] + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] - s = Series(date_range(freq='D', start=datetime(1998, 1, 1), - periods=365)) - english_days = ['Monday', 'Tuesday', 
'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - for day, name, eng_name in zip(range(4, 11), - expected_days, - english_days): + s = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) - s = Series(date_range(freq='M', start='2012', end='2013')) + s = Series(date_range(freq="M", start="2012", end="2013")) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) @@ -369,71 +428,98 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): def test_strftime(self): # GH 10086 - s = Series(date_range('20130101', periods=5)) - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['2013/01/01', '2013/01/02', '2013/01/03', - '2013/01/04', '2013/01/05']) + s = Series(date_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - s = Series(date_range('2015-02-03 11:22:33.4567', periods=5)) - result = s.dt.strftime('%Y/%m/%d %H-%M-%S') - expected = Series(['2015/02/03 11-22-33', '2015/02/04 11-22-33', - '2015/02/05 11-22-33', '2015/02/06 11-22-33', - '2015/02/07 11-22-33']) + s = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/04 11-22-33", + "2015/02/05 11-22-33", + "2015/02/06 11-22-33", + "2015/02/07 11-22-33", + ] + ) tm.assert_series_equal(result, expected) - s = Series(period_range('20130101', periods=5)) - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['2013/01/01', '2013/01/02', '2013/01/03', - '2013/01/04', '2013/01/05']) + s = Series(period_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - s = Series(period_range( - '2015-02-03 11:22:33.4567', periods=5, freq='s')) - result = s.dt.strftime('%Y/%m/%d %H-%M-%S') - expected = Series(['2015/02/03 11-22-33', '2015/02/03 11-22-34', - '2015/02/03 11-22-35', '2015/02/03 11-22-36', - '2015/02/03 11-22-37']) + s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/03 11-22-34", + "2015/02/03 11-22-35", + "2015/02/03 11-22-36", + "2015/02/03 11-22-37", + ] + ) tm.assert_series_equal(result, expected) - s = Series(date_range('20130101', periods=5)) + s = Series(date_range("20130101", periods=5)) s.iloc[0] = pd.NaT - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['NaT', '2013/01/02', '2013/01/03', '2013/01/04', - '2013/01/05']) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - datetime_index = date_range('20150301', periods=5) + datetime_index = date_range("20150301", periods=5) result = datetime_index.strftime("%Y/%m/%d") - expected = Index(['2015/03/01', '2015/03/02', 
'2015/03/03', - '2015/03/04', '2015/03/05'], dtype=np.object_) + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype=np.object_, + ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - period_index = period_range('20150301', periods=5) + period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") - expected = Index(['2015/03/01', '2015/03/02', '2015/03/03', - '2015/03/04', '2015/03/05'], dtype='=U10') + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype="=U10", + ) tm.assert_index_equal(result, expected) - s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, - 32, 1)]) - result = s.dt.strftime('%Y-%m-%d %H:%M:%S') + s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) + result = s.dt.strftime("%Y-%m-%d %H:%M:%S") expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"]) tm.assert_series_equal(result, expected) - s = Series(period_range('20130101', periods=4, freq='H')) - result = s.dt.strftime('%Y/%m/%d %H:%M:%S') - expected = Series(["2013/01/01 00:00:00", "2013/01/01 01:00:00", - "2013/01/01 02:00:00", "2013/01/01 03:00:00"]) - - s = Series(period_range('20130101', periods=4, freq='L')) - result = s.dt.strftime('%Y/%m/%d %H:%M:%S.%l') - expected = Series(["2013/01/01 00:00:00.000", - "2013/01/01 00:00:00.001", - "2013/01/01 00:00:00.002", - "2013/01/01 00:00:00.003"]) + s = Series(period_range("20130101", periods=4, freq="H")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S") + expected = Series( + [ + "2013/01/01 00:00:00", + "2013/01/01 01:00:00", + "2013/01/01 02:00:00", + "2013/01/01 03:00:00", + ] + ) + + s = Series(period_range("20130101", periods=4, freq="L")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l") + expected = Series( + [ + "2013/01/01 00:00:00.000", + "2013/01/01 00:00:00.001", + "2013/01/01 00:00:00.002", + "2013/01/01 00:00:00.003", + ] + ) tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): @@ -441,11 +527,10 @@ def test_valid_dt_with_missing_values(self): from datetime import date, time # GH 8689 - s = Series(date_range('20130101', periods=5, freq='D')) + s = Series(date_range("20130101", periods=5, freq="D")) s.iloc[2] = pd.NaT - for attr in ['microsecond', 'nanosecond', 'second', 'minute', 'hour', - 'day']: + for attr in ["microsecond", "nanosecond", "second", "minute", "hour", "day"]: expected = getattr(s.dt, attr).copy() expected.iloc[2] = np.nan result = getattr(s.dt, attr) @@ -453,42 +538,51 @@ def test_valid_dt_with_missing_values(self): result = s.dt.date expected = Series( - [date(2013, 1, 1), date(2013, 1, 2), np.nan, date(2013, 1, 4), - date(2013, 1, 5)], dtype='object') + [ + date(2013, 1, 1), + date(2013, 1, 2), + np.nan, + date(2013, 1, 4), + date(2013, 1, 5), + ], + dtype="object", + ) tm.assert_series_equal(result, expected) result = s.dt.time - expected = Series( - [time(0), time(0), np.nan, time(0), time(0)], dtype='object') + expected = Series([time(0), time(0), np.nan, time(0), time(0)], dtype="object") tm.assert_series_equal(result, expected) def test_dt_accessor_api(self): # GH 9322 from pandas.core.indexes.accessors import ( - CombinedDatetimelikeProperties, DatetimeProperties) + CombinedDatetimelikeProperties, + DatetimeProperties, + ) + assert Series.dt is CombinedDatetimelikeProperties - s = Series(date_range('2000-01-01', periods=3)) + s = Series(date_range("2000-01-01", periods=3)) 
assert isinstance(s.dt, DatetimeProperties) - @pytest.mark.parametrize('ser', [Series(np.arange(5)), - Series(list('abcde')), - Series(np.random.randn(5))]) + @pytest.mark.parametrize( + "ser", [Series(np.arange(5)), Series(list("abcde")), Series(np.random.randn(5))] + ) def test_dt_accessor_invalid(self, ser): # GH#9322 check that series with incorrect dtypes don't have attr with pytest.raises(AttributeError, match="only use .dt accessor"): ser.dt - assert not hasattr(ser, 'dt') + assert not hasattr(ser, "dt") def test_dt_accessor_updates_on_inplace(self): - s = Series(pd.date_range('2018-01-01', periods=10)) + s = Series(pd.date_range("2018-01-01", periods=10)) s[2] = None - s.fillna(pd.Timestamp('2018-01-01'), inplace=True) + s.fillna(pd.Timestamp("2018-01-01"), inplace=True) result = s.dt.date assert result[0] == result[2] def test_between(self): - s = Series(bdate_range('1/1/2000', periods=20).astype(object)) + s = Series(bdate_range("1/1/2000", periods=20).astype(object)) s[::2] = np.nan result = s[s.between(s[3], s[17])] @@ -501,13 +595,12 @@ def test_between(self): def test_date_tz(self): # GH11757 - rng = pd.DatetimeIndex(['2014-04-04 23:56', - '2014-07-18 21:24', - '2015-11-22 22:14'], tz="US/Eastern") + rng = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], + tz="US/Eastern", + ) s = Series(rng) - expected = Series([date(2014, 4, 4), - date(2014, 7, 18), - date(2015, 11, 22)]) + expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) assert_series_equal(s.dt.date, expected) assert_series_equal(s.apply(lambda x: x.date()), expected) @@ -517,35 +610,39 @@ def test_datetime_understood(self): series = pd.Series(pd.date_range("2012-01-01", periods=3)) offset = pd.offsets.DateOffset(days=6) result = series - offset - expected = pd.Series(pd.to_datetime([ - '2011-12-26', '2011-12-27', '2011-12-28'])) + expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) def test_dt_timetz_accessor(self, tz_naive_fixture): # GH21358 tz = maybe_get_tz(tz_naive_fixture) - dtindex = pd.DatetimeIndex(['2014-04-04 23:56', '2014-07-18 21:24', - '2015-11-22 22:14'], tz=tz) + dtindex = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], tz=tz + ) s = Series(dtindex) - expected = Series([time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), - time(22, 14, tzinfo=tz)]) + expected = Series( + [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)] + ) result = s.dt.timetz tm.assert_series_equal(result, expected) def test_setitem_with_string_index(self): # GH 23451 - x = pd.Series([1, 2, 3], index=['Date', 'b', 'other']) - x['Date'] = date.today() + x = pd.Series([1, 2, 3], index=["Date", "b", "other"]) + x["Date"] = date.today() assert x.Date == date.today() - assert x['Date'] == date.today() + assert x["Date"] == date.today() def test_setitem_with_different_tz(self): # GH#24024 - ser = pd.Series(pd.date_range('2000', periods=2, tz="US/Central")) - ser[0] = pd.Timestamp("2000", tz='US/Eastern') - expected = pd.Series([ - pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), - pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), - ], dtype=object) + ser = pd.Series(pd.date_range("2000", periods=2, tz="US/Central")) + ser[0] = pd.Timestamp("2000", tz="US/Eastern") + expected = pd.Series( + [ + pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), + pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), + ], + dtype=object, + 
) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b17f24fef825eb..9be79bf93ece77 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -12,61 +12,67 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, Series, Timedelta, Timestamp, date_range) + Categorical, + DataFrame, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) import pandas.util.testing as tm class TestSeriesDtypes: - def test_dt64_series_astype_object(self): - dt64ser = Series(date_range('20130101', periods=3)) + dt64ser = Series(date_range("20130101", periods=3)) result = dt64ser.astype(object) assert isinstance(result.iloc[0], datetime) assert result.dtype == np.object_ def test_td64_series_astype_object(self): - tdser = Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]') + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") result = tdser.astype(object) assert isinstance(result.iloc[0], timedelta) assert result.dtype == np.object_ - @pytest.mark.parametrize("dtype", ["float32", "float64", - "int64", "int32"]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) def test_astype(self, dtype): - s = Series(np.random.randn(5), name='foo') + s = Series(np.random.randn(5), name="foo") as_typed = s.astype(dtype) assert as_typed.dtype == dtype assert as_typed.name == s.name def test_asobject_deprecated(self): - s = Series(np.random.randn(5), name='foo') + s = Series(np.random.randn(5), name="foo") with tm.assert_produces_warning(FutureWarning): o = s.asobject assert isinstance(o, np.ndarray) def test_dtype(self, datetime_series): - assert datetime_series.dtype == np.dtype('float64') - assert datetime_series.dtypes == np.dtype('float64') + assert datetime_series.dtype == np.dtype("float64") + assert datetime_series.dtypes == np.dtype("float64") # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftype == 'float64:dense' + assert datetime_series.ftype == "float64:dense" # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftypes == 'float64:dense' + assert datetime_series.ftypes == "float64:dense" # GH18243 - Assert .get_ftype_counts is deprecated with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(datetime_series.get_ftype_counts(), - Series(1, ['float64:dense'])) + tm.assert_series_equal( + datetime_series.get_ftype_counts(), Series(1, ["float64:dense"]) + ) @pytest.mark.parametrize("value", [np.nan, np.inf]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) def test_astype_cast_nan_inf_int(self, dtype, value): # gh-14265: check NaN and inf raise error when converting to int - msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer' + msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" s = Series([value]) with pytest.raises(ValueError, match=msg): @@ -80,39 +86,39 @@ def test_astype_cast_object_int_fail(self, dtype): arr.astype(dtype) def test_astype_cast_object_int(self): - arr = Series(['1', '2', '3', '4'], dtype=object) + arr = Series(["1", "2", "3", "4"], dtype=object) result = arr.astype(int) tm.assert_series_equal(result, Series(np.arange(1, 5))) def test_astype_datetime(self): - s = Series(iNaT, dtype='M8[ns]', index=range(5)) + s = Series(iNaT, dtype="M8[ns]", index=range(5)) - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ s 
= Series([datetime(2001, 1, 2, 0, 0)]) - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) s[1] = np.nan - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ def test_astype_datetime64tz(self): - s = Series(date_range('20130101', periods=3, tz='US/Eastern')) + s = Series(date_range("20130101", periods=3, tz="US/Eastern")) # astype result = s.astype(object) expected = Series(s.astype(object), dtype=object) tm.assert_series_equal(result, expected) - result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz) + result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz) tm.assert_series_equal(result, s) # astype - object, preserves on construction @@ -121,24 +127,24 @@ def test_astype_datetime64tz(self): tm.assert_series_equal(result, expected) # astype - datetime64[ns, tz] - result = Series(s.values).astype('datetime64[ns, US/Eastern]') + result = Series(s.values).astype("datetime64[ns, US/Eastern]") tm.assert_series_equal(result, s) result = Series(s.values).astype(s.dtype) tm.assert_series_equal(result, s) - result = s.astype('datetime64[ns, CET]') - expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET')) + result = s.astype("datetime64[ns, CET]") + expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, np.str_]) - @pytest.mark.parametrize("series", [Series([string.digits * 10, - tm.rands(63), - tm.rands(64), - tm.rands(1000)]), - Series([string.digits * 10, - tm.rands(63), - tm.rands(64), np.nan, 1.0])]) + @pytest.mark.parametrize( + "series", + [ + Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), + Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]), + ], + ) def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) @@ -147,22 +153,22 @@ def test_astype_str_map(self, dtype, series): def test_astype_str_cast(self): # see gh-9757 - ts = Series([Timestamp('2010-01-04 00:00:00')]) + ts = Series([Timestamp("2010-01-04 00:00:00")]) s = ts.astype(str) - expected = Series([str('2010-01-04')]) + expected = Series([str("2010-01-04")]) tm.assert_series_equal(s, expected) - ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')]) + ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) s = ts.astype(str) - expected = Series([str('2010-01-04 00:00:00-05:00')]) + expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) - td = Series([Timedelta(1, unit='d')]) + td = Series([Timedelta(1, unit="d")]) s = td.astype(str) - expected = Series([str('1 days 00:00:00.000000000')]) + expected = Series([str("1 days 00:00:00.000000000")]) tm.assert_series_equal(s, expected) def test_astype_unicode(self): @@ -171,14 +177,13 @@ def test_astype_unicode(self): digits = string.digits test_series = [ Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), - Series(['データーサイエンス、お前はもう死んでいる']), + Series(["データーサイエンス、お前はもう死んでいる"]), ] former_encoding = None if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(['野菜食べないとやばい' - .encode("utf-8")])) + test_series.append(Series(["野菜食べないとやばい".encode("utf-8")])) for s in test_series: res = s.astype("unicode") @@ -193,22 +198,23 @@ def test_astype_unicode(self): @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, 
dtype_class): # see gh-7271 - s = Series(range(0, 10, 2), name='abc') + s = Series(range(0, 10, 2), name="abc") - dt1 = dtype_class({'abc': str}) + dt1 = dtype_class({"abc": str}) result = s.astype(dt1) - expected = Series(['0', '2', '4', '6', '8'], name='abc') + expected = Series(["0", "2", "4", "6", "8"], name="abc") tm.assert_series_equal(result, expected) - dt2 = dtype_class({'abc': 'float64'}) + dt2 = dtype_class({"abc": "float64"}) result = s.astype(dt2) - expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', - name='abc') + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc") tm.assert_series_equal(result, expected) - dt3 = dtype_class({'abc': str, 'def': str}) - msg = ("Only the Series name can be used for the key in Series dtype" - r" mappings\.") + dt3 = dtype_class({"abc": str, "def": str}) + msg = ( + "Only the Series name can be used for the key in Series dtype" + r" mappings\." + ) with pytest.raises(KeyError, match=msg): s.astype(dt3) @@ -225,17 +231,18 @@ def test_astype_dict_like(self, dtype_class): def test_astype_categories_deprecation_raises(self): # deprecated 17636 - s = Series(['a', 'b', 'a']) + s = Series(["a", "b", "a"]) with pytest.raises(ValueError, match="Got an unexpected"): - s.astype('category', categories=['a', 'b'], ordered=True) + s.astype("category", categories=["a", "b"], ordered=True) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_astype_category_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) - s = Series(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) + s = Series(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning, check_stacklevel=False): s.astype(cdt2) @@ -243,25 +250,25 @@ def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) exp = Series(Categorical(items)) - res = s.astype('category') + res = s.astype("category") tm.assert_series_equal(res, exp) items = [1, 2, 3, 1] s = Series(items) exp = Series(Categorical(items)) - res = s.astype('category') + res = s.astype("category") tm.assert_series_equal(res, exp) - df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], - "vals": [1, 2, 3, 4, 5, 6]}) + df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]}) cats = Categorical([1, 2, 3, 4, 5, 6]) exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) - df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'], - "vals": [1, 2, 3, 4, 5, 6]}) - cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd']) + df = DataFrame( + {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]} + ) + cats = Categorical(["a", "b", "b", "a", "a", "d"]) exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) @@ -273,57 +280,56 @@ def test_astype_from_categorical(self): res = s.astype(CategoricalDtype(None, ordered=True)) tm.assert_series_equal(res, exp) - exp = Series(Categorical(lst, categories=list('abcdef'), ordered=True)) - res = s.astype(CategoricalDtype(list('abcdef'), 
ordered=True)) + exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True)) + res = s.astype(CategoricalDtype(list("abcdef"), ordered=True)) tm.assert_series_equal(res, exp) def test_astype_categorical_to_other(self): value = np.random.RandomState(0).randint(0, 10000, 100) - df = DataFrame({'value': value}) + df = DataFrame({"value": value}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) - s = df['value_group'] + s = df["value_group"] expected = s - tm.assert_series_equal(s.astype('category'), expected) + tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = (r"could not convert string to float|" - r"invalid literal for float\(\)") + msg = r"could not convert string to float|" r"invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): - s.astype('float64') + s.astype("float64") - cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) - exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) - tm.assert_series_equal(cat.astype('str'), exp) - s2 = Series(Categorical(['1', '2', '3', '4'])) + cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_series_equal(cat.astype("str"), exp) + s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype(int) - tm.assert_series_equal(s2.astype('int'), exp2) + tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same # values def cmp(a, b): - tm.assert_almost_equal( - np.sort(np.unique(a)), np.sort(np.unique(b))) + tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) - expected = Series(np.array(s.values), name='value_group') - cmp(s.astype('object'), expected) + expected = Series(np.array(s.values), name="value_group") + cmp(s.astype("object"), expected) cmp(s.astype(np.object_), expected) # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) # valid conversion - for valid in [lambda x: x.astype('category'), - lambda x: x.astype(CategoricalDtype()), - lambda x: x.astype('object').astype('category'), - lambda x: x.astype('object').astype( - CategoricalDtype()) - ]: + for valid in [ + lambda x: x.astype("category"), + lambda x: x.astype(CategoricalDtype()), + lambda x: x.astype("object").astype("category"), + lambda x: x.astype("object").astype(CategoricalDtype()), + ]: result = valid(s) # compare series values @@ -331,21 +337,26 @@ def cmp(a, b): tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) - msg = (r"invalid type for astype") - for invalid in [lambda x: x.astype(Categorical), - lambda x: x.astype('object').astype(Categorical)]: + msg = ( + r"invalid type for astype" + ) + for invalid in [ + lambda x: x.astype(Categorical), + lambda x: x.astype("object").astype(Categorical), + ]: with pytest.raises(TypeError, match=msg): invalid(s) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('series_ordered', [True, False]) - def 
test_astype_categorical_to_categorical(self, name, dtype_ordered, - series_ordered): + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("series_ordered", [True, False]) + def test_astype_categorical_to_categorical( + self, name, dtype_ordered, series_ordered + ): # GH 10696/18593 - s_data = list('abcaacbab') - s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered) + s_data = list("abcaacbab") + s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) s = Series(s_data, dtype=s_dtype, name=name) # unspecified categories @@ -356,7 +367,7 @@ def test_astype_categorical_to_categorical(self, name, dtype_ordered, tm.assert_series_equal(result, expected) # different categories - dtype = CategoricalDtype(list('adc'), dtype_ordered) + dtype = CategoricalDtype(list("adc"), dtype_ordered) result = s.astype(dtype) expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) @@ -364,54 +375,54 @@ def test_astype_categorical_to_categorical(self, name, dtype_ordered, if dtype_ordered is False: # not specifying ordered, so only test once expected = s - result = s.astype('category') + result = s.astype("category") tm.assert_series_equal(result, expected) def test_astype_categoricaldtype(self): - s = Series(['a', 'b', 'a']) - result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) - expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) + s = Series(["a", "b", "a"]) + result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) + expected = Series(Categorical(["a", "b", "a"], ordered=True)) tm.assert_series_equal(result, expected) - result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) - expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) + result = s.astype(CategoricalDtype(["a", "b"], ordered=False)) + expected = Series(Categorical(["a", "b", "a"], ordered=False)) tm.assert_series_equal(result, expected) - result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) - expected = Series(Categorical(['a', 'b', 'a'], - categories=['a', 'b', 'c'], - ordered=False)) + result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) + expected = Series( + Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) + ) tm.assert_series_equal(result, expected) - tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) + tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) - @pytest.mark.parametrize("dtype", [ - np.datetime64, - np.timedelta64, - ]) + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) def test_astype_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 data = [1] s = Series(data) - msg = ((r"The '{dtype}' dtype has no unit\. " - r"Please pass in '{dtype}\[ns\]' instead.") - .format(dtype=dtype.__name__)) + msg = ( + r"The '{dtype}' dtype has no unit\. " + r"Please pass in '{dtype}\[ns\]' instead." + ).format(dtype=dtype.__name__) with pytest.raises(ValueError, match=msg): s.astype(dtype) - @pytest.mark.parametrize("dtype", np.typecodes['All']) + @pytest.mark.parametrize("dtype", np.typecodes["All"]) def test_astype_empty_constructor_equality(self, dtype): # see gh-15524 if dtype not in ( - "S", "V", # poor support (if any) currently - "M", "m" # Generic timestamps raise a ValueError. Already tested. + "S", + "V", # poor support (if any) currently + "M", + "m", # Generic timestamps raise a ValueError. Already tested. 
): init_empty = Series([], dtype=dtype) as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_complex(self): # see gh-4819: complex access for ndarray compat a = np.arange(5, dtype=np.float64) @@ -435,59 +446,60 @@ def test_arg_for_errors_in_astype(self): # see gh-14878 s = Series([1, 2, 3]) - msg = (r"Expected value of kwarg 'errors' to be one of \['raise'," - r" 'ignore'\]\. Supplied value is 'False'") + msg = ( + r"Expected value of kwarg 'errors' to be one of \['raise'," + r" 'ignore'\]\. Supplied value is 'False'" + ) with pytest.raises(ValueError, match=msg): s.astype(np.float64, errors=False) - s.astype(np.int8, errors='raise') + s.astype(np.int8, errors="raise") def test_intercept_astype_object(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) # This test no longer makes sense, as # Series is by default already M8[ns]. - expected = series.astype('object') + expected = series.astype("object") - df = DataFrame({'a': series, - 'b': np.random.randn(len(series))}) - exp_dtypes = Series([np.dtype('datetime64[ns]'), - np.dtype('float64')], index=['a', 'b']) + df = DataFrame({"a": series, "b": np.random.randn(len(series))}) + exp_dtypes = Series( + [np.dtype("datetime64[ns]"), np.dtype("float64")], index=["a", "b"] + ) tm.assert_series_equal(df.dtypes, exp_dtypes) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - df = DataFrame({'a': series, 'b': ['foo'] * len(series)}) + df = DataFrame({"a": series, "b": ["foo"] * len(series)}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() def test_series_to_categorical(self): # see gh-16524: test conversion of Series to Categorical - series = Series(['a', 'b', 'c']) + series = Series(["a", "b", "c"]) - result = Series(series, dtype='category') - expected = Series(['a', 'b', 'c'], dtype='category') + result = Series(series, dtype="category") + expected = Series(["a", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) def test_infer_objects_series(self): # GH 11221 - actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects() + actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects() expected = Series([1, 2, 3]) tm.assert_series_equal(actual, expected) - actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects() - expected = Series([1., 2., 3., np.nan]) + actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects() + expected = Series([1.0, 2.0, 3.0, np.nan]) tm.assert_series_equal(actual, expected) # only soft conversions, unconvertable pass thru unchanged - actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) - .infer_objects()) - expected = Series([1, 2, 3, None, 'a']) + actual = Series(np.array([1, 2, 3, None, "a"], dtype="O")).infer_objects() + expected = Series([1, 2, 3, None, "a"]) - assert actual.dtype == 'object' + assert actual.dtype == "object" tm.assert_series_equal(actual, expected) def test_is_homogeneous_type(self): @@ -495,10 +507,13 @@ def test_is_homogeneous_type(self): assert Series([1, 2])._is_homogeneous_type assert Series(pd.Categorical([1, 2]))._is_homogeneous_type - @pytest.mark.parametrize("data", [ - pd.period_range("2000", periods=4), - pd.IntervalIndex.from_breaks([1, 2, 3, 4]) - ]) + @pytest.mark.parametrize( + "data", + [ + pd.period_range("2000", periods=4), + pd.IntervalIndex.from_breaks([1, 2, 
3, 4]), + ], + ) def test_values_compatibility(self, data): # https://github.com/pandas-dev/pandas/issues/23995 result = pd.Series(data).values diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 392aa48e200a2f..4a914e4fb0f2c1 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -27,13 +27,13 @@ def test_unique(): result = s.unique() assert len(result) == 2 - s = Series([1.2345] * 100, dtype='f4') + s = Series([1.2345] * 100, dtype="f4") s[::2] = np.nan result = s.unique() assert len(result) == 2 # NAs in object arrays #714 - s = Series(['foo'] * 100, dtype='O') + s = Series(["foo"] * 100, dtype="O") s[::2] = np.nan result = s.unique() assert len(result) == 2 @@ -48,8 +48,7 @@ def test_unique(): s = Series(Categorical([])) tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), - check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) def test_unique_data_ownership(): @@ -57,14 +56,18 @@ def test_unique_data_ownership(): Series(Series(["a", "c", "b"]).unique()).sort_values() -@pytest.mark.parametrize('data, expected', [ - (np.random.randint(0, 10, size=1000), False), - (np.arange(1000), True), - ([], True), - ([np.nan], True), - (['foo', 'bar', np.nan], True), - (['foo', 'foo', np.nan], False), - (['foo', 'bar', np.nan, np.nan], False)]) +@pytest.mark.parametrize( + "data, expected", + [ + (np.random.randint(0, 10, size=1000), False), + (np.arange(1000), True), + ([], True), + ([np.nan], True), + (["foo", "bar", np.nan], True), + (["foo", "foo", np.nan], False), + (["foo", "bar", np.nan, np.nan], False), + ], +) def test_is_unique(data, expected): # GH11946 / GH25180 s = Series(data) @@ -89,17 +92,18 @@ def __ne__(self, other): @pytest.mark.parametrize( - 'keep, expected', + "keep, expected", [ - ('first', Series([False, False, False, False, True, True, False])), - ('last', Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])) - ]) + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) def test_drop_duplicates(any_numpy_dtype, keep, expected): tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) - if tc.dtype == 'bool': - pytest.skip('tested separately in test_drop_duplicates_bool') + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) @@ -108,10 +112,14 @@ def test_drop_duplicates(any_numpy_dtype, keep, expected): tm.assert_series_equal(sc, tc[~expected]) -@pytest.mark.parametrize('keep, expected', - [('first', Series([False, False, True, True])), - ('last', Series([True, True, False, False])), - (False, Series([True, True, True, True]))]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) @@ -122,23 +130,29 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(sc, tc[~expected]) 
-@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True], name='name')), - ('last', Series([True, True, False, False, False], name='name')), - (False, Series([True, True, True, False, True], name='name')) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) def test_duplicated_keep(keep, expected): - s = Series(['a', 'b', 'b', 'c', 'a'], name='name') + s = Series(["a", "b", "b", "c", "a"], name="name") result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_nan_none(keep, expected): s = Series([np.nan, 3, 3, None, np.nan], dtype=object) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 0b62624ad2696b..d35198ca70f377 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -16,7 +16,7 @@ class TestSeriesInternals: def test_convert(self): # Tests: All to nans, coerce, true # Test coercion returns correct type - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) results = s._convert(datetime=True, coerce=True) expected = Series([NaT] * 3) assert_series_equal(results, expected) @@ -25,7 +25,7 @@ def test_convert(self): expected = Series([np.nan] * 3) assert_series_equal(results, expected) - expected = Series([NaT] * 3, dtype=np.dtype('m8[ns]')) + expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) results = s._convert(timedelta=True, coerce=True) assert_series_equal(results, expected) @@ -33,7 +33,7 @@ def test_convert(self): td = dt - datetime(2000, 1, 1, 0, 0) # Test coercion with mixed types - s = Series(['a', '3.1415', dt, td]) + s = Series(["a", "3.1415", dt, td]) results = s._convert(datetime=True, coerce=True) expected = Series([NaT, NaT, dt, NaT]) assert_series_equal(results, expected) @@ -43,8 +43,7 @@ def test_convert(self): assert_series_equal(results, expected) results = s._convert(timedelta=True, coerce=True) - expected = Series([NaT, NaT, NaT, td], - dtype=np.dtype('m8[ns]')) + expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) assert_series_equal(results, expected) # Test standard conversion returns original @@ -57,94 +56,117 @@ def test_convert(self): assert_series_equal(results, s) # test pass-through and non-conversion when other types selected - s = Series(['1.0', '2.0', '3.0']) + s = Series(["1.0", "2.0", "3.0"]) results = s._convert(datetime=True, numeric=True, timedelta=True) expected = Series([1.0, 2.0, 3.0]) assert_series_equal(results, expected) results = s._convert(True, False, True) assert_series_equal(results, s) - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], - dtype='O') + s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O") results = s._convert(datetime=True, numeric=True, timedelta=True) - expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, - 0)]) + expected = 
Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)]) assert_series_equal(results, expected) results = s._convert(datetime=False, numeric=True, timedelta=True) assert_series_equal(results, s) td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) - s = Series([td, td], dtype='O') + s = Series([td, td], dtype="O") results = s._convert(datetime=True, numeric=True, timedelta=True) expected = Series([td, td]) assert_series_equal(results, expected) results = s._convert(True, True, False) assert_series_equal(results, s) - s = Series([1., 2, 3], index=['a', 'b', 'c']) + s = Series([1.0, 2, 3], index=["a", "b", "c"]) result = s._convert(numeric=True) assert_series_equal(result, s) # force numeric conversion - r = s.copy().astype('O') - r['a'] = '1' + r = s.copy().astype("O") + r["a"] = "1" result = r._convert(numeric=True) assert_series_equal(result, s) - r = s.copy().astype('O') - r['a'] = '1.' + r = s.copy().astype("O") + r["a"] = "1." result = r._convert(numeric=True) assert_series_equal(result, s) - r = s.copy().astype('O') - r['a'] = 'garbled' + r = s.copy().astype("O") + r["a"] = "garbled" result = r._convert(numeric=True) expected = s.copy() - expected['a'] = np.nan + expected["a"] = np.nan assert_series_equal(result, expected) # GH 4119, not converting a mixed type (e.g.floats and object) - s = Series([1, 'na', 3, 4]) + s = Series([1, "na", 3, 4]) result = s._convert(datetime=True, numeric=True) expected = Series([1, np.nan, 3, 4]) assert_series_equal(result, expected) - s = Series([1, '', 3, 4]) + s = Series([1, "", 3, 4]) result = s._convert(datetime=True, numeric=True) assert_series_equal(result, expected) # dates - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0)]) - s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1, - Timestamp('20010104'), '20010105'], dtype='O') + s = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + ] + ) + s2 = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + "foo", + 1.0, + 1, + Timestamp("20010104"), + "20010105", + ], + dtype="O", + ) result = s._convert(datetime=True) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103')], dtype='M8[ns]') + expected = Series( + [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], + dtype="M8[ns]", + ) assert_series_equal(result, expected) result = s._convert(datetime=True, coerce=True) assert_series_equal(result, expected) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103'), NaT, NaT, NaT, - Timestamp('20010104'), Timestamp('20010105')], - dtype='M8[ns]') - result = s2._convert(datetime=True, numeric=False, timedelta=False, - coerce=True) + expected = Series( + [ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20010103"), + NaT, + NaT, + NaT, + Timestamp("20010104"), + Timestamp("20010105"), + ], + dtype="M8[ns]", + ) + result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) assert_series_equal(result, expected) result = s2._convert(datetime=True, coerce=True) assert_series_equal(result, expected) - s = Series(['foo', 'bar', 1, 1.0], dtype='O') + s = Series(["foo", "bar", 1, 1.0], dtype="O") result = s._convert(datetime=True, coerce=True) expected = Series([NaT] * 2 + [Timestamp(1)] * 2) assert_series_equal(result, expected) # preserver if non-object - s = 
Series([1], dtype='float32') + s = Series([1], dtype="float32") result = s._convert(datetime=True, coerce=True) assert_series_equal(result, s) @@ -155,7 +177,7 @@ def test_convert(self): # dateutil parses some single letters into today's value as a date expected = Series([NaT]) - for x in 'abcdefghijklmnopqrstuvwxyz': + for x in "abcdefghijklmnopqrstuvwxyz": s = Series([x]) result = s._convert(datetime=True, coerce=True) assert_series_equal(result, expected) @@ -164,7 +186,7 @@ def test_convert(self): assert_series_equal(result, expected) def test_convert_no_arg_error(self): - s = Series(['1.0', '2']) + s = Series(["1.0", "2"]) msg = r"At least one of datetime, numeric or timedelta must be True\." with pytest.raises(ValueError, match=msg): s._convert() @@ -172,7 +194,7 @@ def test_convert_no_arg_error(self): def test_convert_preserve_bool(self): s = Series([1, True, 3, 5], dtype=object) r = s._convert(datetime=True, numeric=True) - e = Series([1, 1, 3, 5], dtype='i8') + e = Series([1, 1, 3, 5], dtype="i8") tm.assert_series_equal(r, e) def test_convert_preserve_all_bool(self): @@ -196,17 +218,17 @@ def test_astype_no_pandas_dtype(self): tm.assert_series_equal(result, ser) def test_from_array(self): - result = pd.Series(pd.array(['1H', '2H'], dtype='timedelta64[ns]')) + result = pd.Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) assert result._data.blocks[0].is_extension is False - result = pd.Series(pd.array(['2015'], dtype='datetime64[ns]')) + result = pd.Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._data.blocks[0].is_extension is False def test_from_list_dtype(self): - result = pd.Series(['1H', '2H'], dtype='timedelta64[ns]') + result = pd.Series(["1H", "2H"], dtype="timedelta64[ns]") assert result._data.blocks[0].is_extension is False - result = pd.Series(['2015'], dtype='datetime64[ns]') + result = pd.Series(["2015"], dtype="datetime64[ns]") assert result._data.blocks[0].is_extension is False @@ -214,10 +236,10 @@ def test_hasnans_unchached_for_series(): # GH#19700 idx = pd.Index([0, 1]) assert idx.hasnans is False - assert 'hasnans' in idx._cache + assert "hasnans" in idx._cache ser = idx.to_series() assert ser.hasnans is False - assert not hasattr(ser, '_cache') + assert not hasattr(ser, "_cache") ser.iloc[-1] = np.nan assert ser.hasnans is True assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 39c217e7d95b13..0238314122462c 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -9,16 +9,18 @@ from pandas import DataFrame, Series import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + ensure_clean, +) from pandas.io.common import _get_handle class TestSeriesToCSV: - def read_csv(self, path, **kwargs): - params = dict(squeeze=True, index_col=0, - header=None, parse_dates=True) + params = dict(squeeze=True, index_col=0, header=None, parse_dates=True) params.update(**kwargs) header = params.get("header") @@ -77,8 +79,9 @@ def test_from_csv(self, datetime_series, string_series): outfile.write("1998-01-01|1.0\n1999-01-01|2.0") series = self.read_csv(path, sep="|") - check_series = Series({datetime(1998, 1, 1): 1.0, - datetime(1999, 1, 1): 2.0}) + check_series = Series( + {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} + ) assert_series_equal(check_series, series) series = 
self.read_csv(path, sep="|", parse_dates=False) @@ -93,7 +96,7 @@ def test_to_csv(self, datetime_series): with io.open(path, newline=None) as f: lines = f.readlines() - assert (lines[1] != '\n') + assert lines[1] != "\n" datetime_series.to_csv(path, index=False, header=False) arr = np.loadtxt(path) @@ -120,9 +123,9 @@ def test_to_csv_float_format(self): assert_series_equal(rs, xp) def test_to_csv_list_entries(self): - s = Series(['jack and jill', 'jesse and frank']) + s = Series(["jack and jill", "jesse and frank"]) - split = s.str.split(r'\s+and\s+') + split = s.str.split(r"\s+and\s+") buf = StringIO() split.to_csv(buf, header=False) @@ -135,75 +138,91 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path_or_buf=None, header=False) assert isinstance(csv_str, str) - @pytest.mark.parametrize('s,encoding', [ - (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X'), None), - # GH 21241, 21118 - (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'), - (Series(["123", "你好", "世界"], name="中文"), 'gb2312'), - (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), 'cp737') - ]) + @pytest.mark.parametrize( + "s,encoding", + [ + ( + Series([0.123456, 0.234567, 0.567567], index=["A", "B", "C"], name="X"), + None, + ), + # GH 21241, 21118 + (Series(["abc", "def", "ghi"], name="X"), "ascii"), + (Series(["123", "你好", "世界"], name="中文"), "gb2312"), + (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), "cp737"), + ], + ) def test_to_csv_compression(self, s, encoding, compression): with ensure_clean() as filename: - s.to_csv(filename, compression=compression, encoding=encoding, - header=True) + s.to_csv(filename, compression=compression, encoding=encoding, header=True) # test the round trip - to_csv -> read_csv - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle(filename, 'w', compression=compression, - encoding=encoding) + f, _handles = _get_handle( + filename, "w", compression=compression, encoding=encoding + ) with f: s.to_csv(f, encoding=encoding, header=True) - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_series_equal(s, result) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or 'utf8') + text = fh.read().decode(encoding or "utf8") assert s.name in text with tm.decompress_file(filename, compression) as fh: - assert_series_equal(s, pd.read_csv(fh, - index_col=0, - squeeze=True, - encoding=encoding)) + assert_series_equal( + s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) + ) class TestSeriesIO: - def test_to_frame(self, datetime_series): datetime_series.name = None rs = datetime_series.to_frame() xp = pd.DataFrame(datetime_series.values, index=datetime_series.index) assert_frame_equal(rs, xp) - datetime_series.name = 'testname' + datetime_series.name = "testname" rs = datetime_series.to_frame() - xp = pd.DataFrame(dict(testname=datetime_series.values), - index=datetime_series.index) + xp = pd.DataFrame( + dict(testname=datetime_series.values), index=datetime_series.index + ) assert_frame_equal(rs, xp) - rs = 
datetime_series.to_frame(name='testdifferent') - xp = pd.DataFrame(dict(testdifferent=datetime_series.values), - index=datetime_series.index) + rs = datetime_series.to_frame(name="testdifferent") + xp = pd.DataFrame( + dict(testdifferent=datetime_series.values), index=datetime_series.index + ) assert_frame_equal(rs, xp) def test_timeseries_periodindex(self): # GH2891 from pandas import period_range - prng = period_range('1/1/2011', '1/1/2012', freq='M') + + prng = period_range("1/1/2011", "1/1/2012", freq="M") ts = Series(np.random.randn(len(prng)), prng) new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == 'M' + assert new_ts.index.freq == "M" def test_pickle_preserve_name(self): - for n in [777, 777., 'name', datetime(2001, 11, 11), (1, 2)]: + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, 2)]: unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) assert unpickled.name == n @@ -218,7 +237,6 @@ def test_to_frame_expanddim(self): # GH 9762 class SubclassedSeries(Series): - @property def _constructor_expanddim(self): return SubclassedFrame @@ -226,22 +244,20 @@ def _constructor_expanddim(self): class SubclassedFrame(DataFrame): pass - s = SubclassedSeries([1, 2, 3], name='X') + s = SubclassedSeries([1, 2, 3], name="X") result = s.to_frame() assert isinstance(result, SubclassedFrame) - expected = SubclassedFrame({'X': [1, 2, 3]}) + expected = SubclassedFrame({"X": [1, 2, 3]}) assert_frame_equal(result, expected) - @pytest.mark.parametrize('mapping', ( - dict, - collections.defaultdict(list), - collections.OrderedDict)) + @pytest.mark.parametrize( + "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) + ) def test_to_dict(self, mapping, datetime_series): # GH16122 tm.assert_series_equal( - Series(datetime_series.to_dict(mapping), name='ts'), - datetime_series) + Series(datetime_series.to_dict(mapping), name="ts"), datetime_series + ) from_method = Series(datetime_series.to_dict(collections.Counter)) - from_constructor = Series(collections - .Counter(datetime_series.iteritems())) + from_constructor = Series(collections.Counter(datetime_series.iteritems())) tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 94050f75264445..6012f3986e955f 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -11,8 +11,17 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, - Timestamp, date_range, isna) + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, +) from pandas.core.series import remove_na import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -23,7 +32,8 @@ def _skip_if_no_pchip(): from scipy.interpolate import pchip_interpolate # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.pchip missing') + + pytest.skip("scipy.interpolate.pchip missing") def _skip_if_no_akima(): @@ -31,16 +41,16 @@ def _skip_if_no_akima(): from scipy.interpolate import Akima1DInterpolator # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.Akima1DInterpolator missing') + + pytest.skip("scipy.interpolate.Akima1DInterpolator missing") -def _simple_ts(start, end, freq='D'): +def _simple_ts(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) class 
TestSeriesMissingData: - def test_remove_na_deprecation(self): # see gh-16971 with tm.assert_produces_warning(FutureWarning): @@ -48,40 +58,74 @@ def test_remove_na_deprecation(self): def test_timedelta_fillna(self): # GH 3371 - s = Series([Timestamp('20130101'), Timestamp('20130101'), - Timestamp('20130102'), Timestamp('20130103 9:01:01')]) + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) td = s.diff() # reg fillna with tm.assert_produces_warning(FutureWarning): result = td.fillna(0) - expected = Series([timedelta(0), timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(0), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) # interpreted as seconds, deprecated with tm.assert_produces_warning(FutureWarning): result = td.fillna(1) - expected = Series([timedelta(seconds=1), - timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(timedelta(days=1, seconds=1)) - expected = Series([timedelta(days=1, seconds=1), timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(days=1, seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(np.timedelta64(int(1e9))) - expected = Series([timedelta(seconds=1), timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(NaT) - expected = Series([NaT, timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)], - dtype='m8[ns]') + expected = Series( + [ + NaT, + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ], + dtype="m8[ns]", + ) assert_series_equal(result, expected) # ffill @@ -102,14 +146,26 @@ def test_timedelta_fillna(self): def test_datetime64_fillna(self): - s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( - '20130102'), Timestamp('20130103 9:01:01')]) + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) s[2] = np.nan # reg fillna - result = s.fillna(Timestamp('20130104')) - expected = Series([Timestamp('20130101'), Timestamp( - '20130101'), Timestamp('20130104'), Timestamp('20130103 9:01:01')]) + result = s.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) result = s.fillna(NaT) @@ -118,260 +174,367 @@ def test_datetime64_fillna(self): # ffill result = s.ffill() - expected = Series([Timestamp('20130101'), Timestamp( - '20130101'), Timestamp('20130101'), Timestamp('20130103 9:01:01')]) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) # bfill result = s.bfill() - expected = Series([Timestamp('20130101'), Timestamp('20130101'), 
- Timestamp('20130103 9:01:01'), Timestamp( - '20130103 9:01:01')]) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) # GH 6587 # make sure that we are treating as integer when filling # this also tests inference of a datetime-like with NaT's - s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001']) + s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"]) expected = Series( - ['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', - '2013-08-05 15:30:00.000001'], dtype='M8[ns]') - result = s.fillna(method='backfill') + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = s.fillna(method="backfill") assert_series_equal(result, expected) def test_datetime64_tz_fillna(self): - for tz in ['US/Eastern', 'Asia/Tokyo']: + for tz in ["US/Eastern", "Asia/Tokyo"]: # DatetimeBlock - s = Series([Timestamp('2011-01-01 10:00'), pd.NaT, - Timestamp('2011-01-03 10:00'), pd.NaT]) + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) null_loc = pd.Series([False, True, False, True]) - result = s.fillna(pd.Timestamp('2011-01-02 10:00')) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-02 10:00')]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) tm.assert_series_equal(expected, result) # check s is not changed tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz)) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-02 10:00', tz=tz)]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna('AAA') - expected = Series([Timestamp('2011-01-01 10:00'), 'AAA', - Timestamp('2011-01-03 10:00'), 'AAA'], - dtype=object) + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00'), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00'), - 
Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00"), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # DatetimeBlockTZ - idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT, - '2011-01-03 10:00', pd.NaT], tz=tz) + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz + ) s = pd.Series(idx) - assert s.dtype == 'datetime64[ns, {0}]'.format(tz) + assert s.dtype == "datetime64[ns, {0}]".format(tz) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-02 10:00')]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz)) - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00', - '2011-01-03 10:00', '2011-01-02 10:00'], - tz=tz) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', - tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00', - '2011-01-03 10:00', '2011-01-02 10:00'], - tz=tz) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna('AAA') - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), 'AAA', - Timestamp('2011-01-03 10:00', tz=tz), 'AAA'], - dtype=object) + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: 
pd.Timestamp('2011-01-04 10:00', tz=tz)}) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-04 10:00', tz=tz)]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp('20130101')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2013-01-01'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2013-01-01')]) + result = s.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(Timestamp('20130101', tz='US/Pacific')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2013-01-01', tz='US/Pacific'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2013-01-01', tz='US/Pacific')]) + result = s.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # with timezone # GH 15855 - df = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]) - exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]) - assert_series_equal(df.fillna(method='pad'), exp) - - df = pd.Series([pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')]) - exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]) - assert_series_equal(df.fillna(method='bfill'), exp) + df = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + assert_series_equal(df.fillna(method="pad"), exp) + + df = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + assert_series_equal(df.fillna(method="bfill"), exp) def test_fillna_consistency(self): # GH 16402 # fillna with a tz aware to a tz-naive, should result in object - s = Series([Timestamp('20130101'), pd.NaT]) + s = Series([Timestamp("20130101"), pd.NaT]) - result = s.fillna(Timestamp('20130101', tz='US/Eastern')) - expected = Series([Timestamp('20130101'), - Timestamp('2013-01-01', tz='US/Eastern')], - dtype='object') + result = s.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) assert_series_equal(result, expected) # where (we ignore the errors=) - result = s.where([True, False], - Timestamp('20130101', tz='US/Eastern'), - errors='ignore') + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) 
assert_series_equal(result, expected) - result = s.where([True, False], - Timestamp('20130101', tz='US/Eastern'), - errors='ignore') + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) assert_series_equal(result, expected) # with a non-datetime - result = s.fillna('foo') - expected = Series([Timestamp('20130101'), - 'foo']) + result = s.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) assert_series_equal(result, expected) # assignment s2 = s.copy() - s2[1] = 'foo' + s2[1] = "foo" assert_series_equal(s2, expected) def test_datetime64tz_fillna_round_issue(self): # GH 14872 - data = pd.Series([pd.NaT, pd.NaT, - datetime(2016, 12, 12, 22, 24, 6, 100001, - tzinfo=pytz.utc)]) + data = pd.Series( + [pd.NaT, pd.NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) - filled = data.fillna(method='bfill') + filled = data.fillna(method="bfill") - expected = pd.Series([datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc)]) + expected = pd.Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) assert_series_equal(filled, expected) def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 - s = pd.Series([1., np.nan]) - result = s.fillna(0, downcast='infer') + s = pd.Series([1.0, np.nan]) + result = s.fillna(0, downcast="infer") expected = pd.Series([1, 0]) assert_series_equal(result, expected) # infer int64 from float64 when fillna value is a dict - s = pd.Series([1., np.nan]) - result = s.fillna({1: 0}, downcast='infer') + s = pd.Series([1.0, np.nan]) + result = s.fillna({1: 0}, downcast="infer") expected = pd.Series([1, 0]) assert_series_equal(result, expected) def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) - s.fillna(method='ffill', inplace=True) - assert_series_equal(s.fillna(method='ffill', inplace=False), s) + s.fillna(method="ffill", inplace=True) + assert_series_equal(s.fillna(method="ffill", inplace=False), s) def test_fillna_raise(self): s = Series(np.random.randint(-100, 100, 50)) - msg = ('"value" parameter must be a scalar or dict, but you passed a' - ' "list"') + msg = '"value" parameter must be a scalar or dict, but you passed a' ' "list"' with pytest.raises(TypeError, match=msg): s.fillna([1, 2]) - msg = ('"value" parameter must be a scalar or dict, but you passed a' - ' "tuple"') + msg = '"value" parameter must be a scalar or dict, but you passed a' ' "tuple"' with pytest.raises(TypeError, match=msg): s.fillna((1, 2)) # related GH 9217, make sure limit is an int and greater than 0 s = Series([1, 2, 3, None]) - msg = (r"Cannot specify both 'value' and 'method'\.|" - r"Limit must be greater than 0|" - "Limit must be an integer") - for limit in [-1, 0, 1., 2.]: - for method in ['backfill', 'bfill', 'pad', 'ffill', None]: + msg = ( + r"Cannot specify both 'value' and 'method'\.|" + r"Limit must be greater than 0|" + "Limit must be an integer" + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: with pytest.raises(ValueError, match=msg): s.fillna(1, limit=limit, method=method) def test_categorical_nan_equality(self): cat = Series(Categorical(["a", "b", "c", np.nan])) exp = Series([True, True, True, False]) - res = (cat == cat) + res = cat == cat 
tm.assert_series_equal(res, exp) def test_categorical_nan_handling(self): @@ -379,62 +542,63 @@ def test_categorical_nan_handling(self): # NaNs are represented as -1 in labels s = Series(Categorical(["a", "b", np.nan, "a"])) tm.assert_index_equal(s.cat.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(s.values.codes, - np.array([0, 1, -1, 0], dtype=np.int8)) - - @pytest.mark.parametrize('fill_value, expected_output', [ - ('a', ['a', 'a', 'b', 'a', 'a']), - ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), - ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), - ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), - (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), - (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), - (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), - (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) - ]) + tm.assert_numpy_array_equal( + s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) + ) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) def test_fillna_categorical(self, fill_value, expected_output): # GH 17033 # Test fillna for a Categorical series - data = ['a', np.nan, 'b', np.nan, np.nan] - s = Series(Categorical(data, categories=['a', 'b'])) - exp = Series(Categorical(expected_output, categories=['a', 'b'])) + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) tm.assert_series_equal(s.fillna(fill_value), exp) def test_fillna_categorical_raise(self): - data = ['a', np.nan, 'b', np.nan, np.nan] - s = Series(Categorical(data, categories=['a', 'b'])) + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna('d') + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna("d") - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna(Series('d')) + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna(Series("d")) - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna({1: 'd', 3: 'a'}) + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna({1: "d", 3: "a"}) - msg = ('"value" parameter must be a scalar or ' - 'dict, but you passed a "list"') + msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): - s.fillna(['a', 'b']) + s.fillna(["a", "b"]) - msg = ('"value" parameter must be a scalar or ' - 'dict, but you passed a "tuple"') + msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): - s.fillna(('a', 'b')) + s.fillna(("a", "b")) - msg = ('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a "DataFrame"') + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) with pytest.raises(TypeError, 
match=msg): - s.fillna(DataFrame({1: ['a'], 3: ['b']})) + s.fillna(DataFrame({1: ["a"], 3: ["b"]})) def test_fillna_nat(self): - series = Series([0, 1, 2, iNaT], dtype='M8[ns]') + series = Series([0, 1, 2, iNaT], dtype="M8[ns]") - filled = series.fillna(method='pad') + filled = series.fillna(method="pad") filled2 = series.fillna(value=series.values[2]) expected = series.copy() @@ -443,16 +607,16 @@ def test_fillna_nat(self): assert_series_equal(filled, expected) assert_series_equal(filled2, expected) - df = DataFrame({'A': series}) - filled = df.fillna(method='pad') + df = DataFrame({"A": series}) + filled = df.fillna(method="pad") filled2 = df.fillna(value=series.values[2]) - expected = DataFrame({'A': expected}) + expected = DataFrame({"A": expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected) - series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') + series = Series([iNaT, 0, 1, 2], dtype="M8[ns]") - filled = series.fillna(method='bfill') + filled = series.fillna(method="bfill") filled2 = series.fillna(value=series[1]) expected = series.copy() @@ -461,49 +625,49 @@ def test_fillna_nat(self): assert_series_equal(filled, expected) assert_series_equal(filled2, expected) - df = DataFrame({'A': series}) - filled = df.fillna(method='bfill') + df = DataFrame({"A": series}) + filled = df.fillna(method="bfill") filled2 = df.fillna(value=series[1]) - expected = DataFrame({'A': expected}) + expected = DataFrame({"A": expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected) def test_isna_for_inf(self): - s = Series(['a', np.inf, np.nan, 1.0]) - with pd.option_context('mode.use_inf_as_na', True): + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_na", True): r = s.isna() dr = s.dropna() e = Series([False, True, True, False]) - de = Series(['a', 1.0], index=[0, 3]) + de = Series(["a", 1.0], index=[0, 3]) tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) def test_isnull_for_inf_deprecated(self): # gh-17115 - s = Series(['a', np.inf, np.nan, 1.0]) - with pd.option_context('mode.use_inf_as_null', True): + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_null", True): r = s.isna() dr = s.dropna() e = Series([False, True, True, False]) - de = Series(['a', 1.0], index=[0, 3]) + de = Series(["a", 1.0], index=[0, 3]) tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) def test_fillna(self, datetime_series): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - tm.assert_series_equal(ts, ts.fillna(method='ffill')) + tm.assert_series_equal(ts, ts.fillna(method="ffill")) ts[2] = np.NaN - exp = Series([0., 1., 1., 3., 4.], index=ts.index) - tm.assert_series_equal(ts.fillna(method='ffill'), exp) + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) - exp = Series([0., 1., 3., 3., 4.], index=ts.index) - tm.assert_series_equal(ts.fillna(method='backfill'), exp) + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) - exp = Series([0., 1., 5., 3., 4.], index=ts.index) + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(value=5), exp) msg = "Must specify a fill 'value' or 'method'" @@ -512,13 +676,13 @@ def test_fillna(self, datetime_series): msg = "Cannot specify both 'value' and 'method'" with pytest.raises(ValueError, match=msg): - 
datetime_series.fillna(value=0, method='ffill') + datetime_series.fillna(value=0, method="ffill") # GH 5703 s1 = Series([np.nan]) s2 = Series([1]) result = s1.fillna(s2) - expected = Series([1.]) + expected = Series([1.0]) assert_series_equal(result, expected) result = s1.fillna({}) assert_series_equal(result, s1) @@ -537,10 +701,10 @@ def test_fillna(self, datetime_series): result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) assert_series_equal(result, s1) - s1 = Series([0, 1, 2], list('abc')) - s2 = Series([0, np.nan, 2], list('bac')) + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) result = s2.fillna(s1) - expected = Series([0, 0, 2.], list('bac')) + expected = Series([0, 0, 2.0], list("bac")) assert_series_equal(result, expected) # limit @@ -556,25 +720,25 @@ def test_fillna(self, datetime_series): # GH 9043 # make sure a string representation of int/float values can be filled # correctly without raising errors or being converted - vals = ['0', '1.5', '-0.3'] + vals = ["0", "1.5", "-0.3"] for val in vals: - s = Series([0, 1, np.nan, np.nan, 4], dtype='float64') + s = Series([0, 1, np.nan, np.nan, 4], dtype="float64") result = s.fillna(val) - expected = Series([0, 1, val, val, 4], dtype='object') + expected = Series([0, 1, val, val, 4], dtype="object") assert_series_equal(result, expected) def test_fillna_bug(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) - filled = x.fillna(method='ffill') - expected = Series([nan, 1., 1., 3., 3.], x.index) + x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) + filled = x.fillna(method="ffill") + expected = Series([nan, 1.0, 1.0, 3.0, 3.0], x.index) assert_series_equal(filled, expected) - filled = x.fillna(method='bfill') - expected = Series([1., 1., 3., 3., nan], x.index) + filled = x.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, nan], x.index) assert_series_equal(filled, expected) def test_fillna_inplace(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) y = x.copy() y.fillna(value=0, inplace=True) @@ -584,14 +748,14 @@ def test_fillna_inplace(self): def test_fillna_invalid_method(self, datetime_series): try: - datetime_series.fillna(method='ffil') + datetime_series.fillna(method="ffil") except ValueError as inst: - assert 'ffil' in str(inst) + assert "ffil" in str(inst) def test_ffill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) ts[2] = np.NaN - assert_series_equal(ts.ffill(), ts.fillna(method='ffill')) + assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) def test_ffill_mixed_dtypes_without_missing_data(self): # GH14956 @@ -600,9 +764,9 @@ def test_ffill_mixed_dtypes_without_missing_data(self): assert_series_equal(series, result) def test_bfill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) ts[2] = np.NaN - assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) + assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) def test_timedelta64_nan(self): @@ -649,36 +813,45 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): s.dropna(axis=1) def test_datetime64_tz_dropna(self): # DatetimeBlock - s = Series([Timestamp('2011-01-01 
10:00'), pd.NaT, Timestamp( - '2011-01-03 10:00'), pd.NaT]) + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) result = s.dropna() - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-03 10:00')], index=[0, 2]) + expected = Series( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] + ) tm.assert_series_equal(result, expected) # DatetimeBlockTZ - idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT, - '2011-01-03 10:00', pd.NaT], - tz='Asia/Tokyo') + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz="Asia/Tokyo" + ) s = pd.Series(idx) - assert s.dtype == 'datetime64[ns, Asia/Tokyo]' + assert s.dtype == "datetime64[ns, Asia/Tokyo]" result = s.dropna() - expected = Series([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-03 10:00', tz='Asia/Tokyo')], - index=[0, 2]) - assert result.dtype == 'datetime64[ns, Asia/Tokyo]' + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), + ], + index=[0, 2], + ) + assert result.dtype == "datetime64[ns, Asia/Tokyo]" tm.assert_series_equal(result, expected) def test_dropna_no_nan(self): - for s in [Series([1, 2, 3], name='x'), Series( - [False, True, False], name='x')]: + for s in [Series([1, 2, 3], name="x"), Series([False, True, False], name="x")]: result = s.dropna() tm.assert_series_equal(result, s) @@ -689,9 +862,10 @@ def test_dropna_no_nan(self): tm.assert_series_equal(s2, s) def test_dropna_intervals(self): - s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( - [np.nan, 0, 1, 2], - [np.nan, 1, 2, 3])) + s = Series( + [np.nan, 1, 2, 3], + IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]), + ) result = s.dropna() expected = s.iloc[1:] @@ -725,25 +899,27 @@ def test_notna(self): tm.assert_series_equal(ser.notna(), expected) def test_pad_nan(self): - x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c', 'd'], - dtype=float) + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) - x.fillna(method='pad', inplace=True) + x.fillna(method="pad", inplace=True) - expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], - ['z', 'a', 'b', 'c', 'd'], dtype=float) + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) assert_series_equal(x[1:], expected[1:]) assert np.isnan(x[0]), np.isnan(expected[0]) def test_pad_require_monotonicity(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') + rng = date_range("1/1/2000", "3/1/2000", freq="B") # neither monotonic increasing or decreasing rng2 = rng[[1, 0, 2]] msg = "index must be monotonic increasing or decreasing" with pytest.raises(ValueError, match=msg): - rng2.get_indexer(rng, method='pad') + rng2.get_indexer(rng, method="pad") def test_dropna_preserve_name(self, datetime_series): datetime_series[:5] = np.nan @@ -767,16 +943,16 @@ def test_series_fillna_limit(self): s = Series(np.random.randn(10), index=index) result = s[:2].reindex(index) - result = result.fillna(method='pad', limit=5) + result = result.fillna(method="pad", limit=5) - expected = s[:2].reindex(index).fillna(method='pad') + expected = s[:2].reindex(index).fillna(method="pad") expected[-3:] = np.nan assert_series_equal(result, expected) result = s[-2:].reindex(index) - result = result.fillna(method='bfill', limit=5) + result = result.fillna(method="bfill", limit=5) - expected = 
s[-2:].reindex(index).fillna(method='backfill') + expected = s[-2:].reindex(index).fillna(method="backfill") expected[:3] = np.nan assert_series_equal(result, expected) @@ -789,21 +965,23 @@ def test_sparse_series_fillna_limit(self): ss = s[:2].reindex(index).to_sparse() # TODO: what is this test doing? why are result an expected # the same call to fillna? - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): # TODO: release-note fillna performance warning - result = ss.fillna(method='pad', limit=5) - expected = ss.fillna(method='pad', limit=5) + result = ss.fillna(method="pad", limit=5) + expected = ss.fillna(method="pad", limit=5) expected = expected.to_dense() expected[-3:] = np.nan expected = expected.to_sparse() assert_series_equal(result, expected) ss = s[-2:].reindex(index).to_sparse() - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = ss.fillna(method='backfill', limit=5) - expected = ss.fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = ss.fillna(method="backfill", limit=5) + expected = ss.fillna(method="backfill") expected = expected.to_dense() expected[:3] = np.nan expected = expected.to_sparse() @@ -816,19 +994,21 @@ def test_sparse_series_pad_backfill_limit(self): s = Series(np.random.randn(10), index=index) s = s.to_sparse() - result = s[:2].reindex(index, method='pad', limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = s[:2].reindex(index).fillna(method='pad') + result = s[:2].reindex(index, method="pad", limit=5) + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = s[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected[-3:] = np.nan expected = expected.to_sparse() assert_series_equal(result, expected) - result = s[-2:].reindex(index, method='backfill', limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = s[-2:].reindex(index).fillna(method='backfill') + result = s[-2:].reindex(index, method="backfill", limit=5) + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = s[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected[:3] = np.nan expected = expected.to_sparse() @@ -839,23 +1019,39 @@ def test_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) - result = s[:2].reindex(index, method='pad', limit=5) + result = s[:2].reindex(index, method="pad", limit=5) - expected = s[:2].reindex(index).fillna(method='pad') + expected = s[:2].reindex(index).fillna(method="pad") expected[-3:] = np.nan assert_series_equal(result, expected) - result = s[-2:].reindex(index, method='backfill', limit=5) + result = s[-2:].reindex(index, method="backfill", limit=5) - expected = s[-2:].reindex(index).fillna(method='backfill') + expected = s[-2:].reindex(index).fillna(method="backfill") expected[:3] = np.nan assert_series_equal(result, expected) -@pytest.fixture(params=['linear', 'index', 'values', 'nearest', 'slinear', - 'zero', 'quadratic', 'cubic', 'barycentric', 'krogh', - 'polynomial', 'spline', 'piecewise_polynomial', - 'from_derivatives', 'pchip', 'akima', ]) +@pytest.fixture( + params=[ + "linear", + "index", + "values", + 
"nearest", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) def nontemporal_method(request): """ Fixture that returns an (method name, required kwargs) pair. @@ -864,14 +1060,27 @@ def nontemporal_method(request): separately from these non-temporal methods. """ method = request.param - kwargs = dict(order=1) if method in ('spline', 'polynomial') else dict() + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs -@pytest.fixture(params=['linear', 'slinear', 'zero', 'quadratic', 'cubic', - 'barycentric', 'krogh', 'polynomial', 'spline', - 'piecewise_polynomial', 'from_derivatives', 'pchip', - 'akima', ]) +@pytest.fixture( + params=[ + "linear", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) def interp_methods_ind(request): """ Fixture that returns a (method name, required kwargs) pair to be tested for various Index types. @@ -880,38 +1089,39 @@ def interp_methods_ind(request): 'values' as a parameterization """ method = request.param - kwargs = dict(order=1) if method in ('spline', 'polynomial') else dict() + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs class TestSeriesInterpolateData: def test_interpolate(self, datetime_series, string_series): - ts = Series(np.arange(len(datetime_series), dtype=float), - datetime_series.index) + ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() ts_copy[5:10] = np.NaN - linear_interp = ts_copy.interpolate(method='linear') + linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) - ord_ts = Series([d.toordinal() for d in datetime_series.index], - index=datetime_series.index).astype(float) + ord_ts = Series( + [d.toordinal() for d in datetime_series.index], index=datetime_series.index + ).astype(float) ord_ts_copy = ord_ts.copy() ord_ts_copy[5:10] = np.NaN - time_interp = ord_ts_copy.interpolate(method='time') + time_interp = ord_ts_copy.interpolate(method="time") tm.assert_series_equal(time_interp, ord_ts) def test_interpolate_time_raises_for_non_timeseries(self): # When method='time' is used on a non-TimeSeries that contains a null # value, a ValueError should be raised. 
non_ts = Series([0, 1, 2, np.NaN]) - msg = ("time-weighted interpolation only works on Series.* " - "with a DatetimeIndex") + msg = ( + "time-weighted interpolation only works on Series.* " "with a DatetimeIndex" + ) with pytest.raises(ValueError, match=msg): - non_ts.interpolate(method='time') + non_ts.interpolate(method="time") @td.skip_if_no_scipy def test_interpolate_pchip(self): @@ -920,9 +1130,10 @@ def test_interpolate_pchip(self): ser = Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, - 50.75])).astype(float) - interp_s = ser.reindex(new_index).interpolate(method='pchip') + new_index = ser.index.union( + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + ).astype(float) + interp_s = ser.reindex(new_index).interpolate(method="pchip") # does not blow up, GH5977 interp_s[49:51] @@ -932,54 +1143,56 @@ def test_interpolate_akima(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate(method='akima') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="akima") assert_series_equal(interp_s[1:3], expected) @td.skip_if_no_scipy def test_interpolate_piecewise_polynomial(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate( - method='piecewise_polynomial') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="piecewise_polynomial") assert_series_equal(interp_s[1:3], expected) @td.skip_if_no_scipy def test_interpolate_from_derivatives(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate( - method='from_derivatives') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="from_derivatives") assert_series_equal(interp_s[1:3], expected) - @pytest.mark.parametrize("kwargs", [ - {}, - pytest.param({'method': 'polynomial', 'order': 1}, - marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", 
"order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) def test_interpolate_corners(self, kwargs): s = Series([np.nan, np.nan]) assert_series_equal(s.interpolate(**kwargs), s) @@ -993,107 +1206,112 @@ def test_interpolate_index_values(self): vals = s.index.values.astype(float) - result = s.interpolate(method='index') + result = s.interpolate(method="index") expected = s.copy() bad = isna(expected.values) good = ~bad - expected = Series(np.interp(vals[bad], vals[good], - s.values[good]), - index=s.index[bad]) + expected = Series( + np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad] + ) assert_series_equal(result[bad], expected) # 'values' is synonymous with 'index' for the method kwarg - other_result = s.interpolate(method='values') + other_result = s.interpolate(method="values") assert_series_equal(other_result, result) assert_series_equal(other_result[bad], expected) def test_interpolate_non_ts(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - msg = ("time-weighted interpolation only works on Series or DataFrames" - " with a DatetimeIndex") + msg = ( + "time-weighted interpolation only works on Series or DataFrames" + " with a DatetimeIndex" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='time') + s.interpolate(method="time") - @pytest.mark.parametrize("kwargs", [ - {}, - pytest.param({'method': 'polynomial', 'order': 1}, - marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) def test_nan_interpolate(self, kwargs): s = Series([0, 1, np.nan, 3]) result = s.interpolate(**kwargs) - expected = Series([0., 1., 2., 3.]) + expected = Series([0.0, 1.0, 2.0, 3.0]) assert_series_equal(result, expected) def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) assert_series_equal(result, expected) def test_nan_str_index(self): - s = Series([0, 1, 2, np.nan], index=list('abcd')) + s = Series([0, 1, 2, np.nan], index=list("abcd")) result = s.interpolate() - expected = Series([0., 1., 2., 2.], index=list('abcd')) + expected = Series([0.0, 1.0, 2.0, 2.0], index=list("abcd")) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_quad(self): sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) - result = sq.interpolate(method='quadratic') - expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) + result = sq.interpolate(method="quadratic") + expected = Series([1.0, 4.0, 9.0, 16.0], index=[1, 2, 3, 4]) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_scipy_basic(self): s = Series([1, 3, np.nan, 12, np.nan, 25]) # slinear - expected = Series([1., 3., 7.5, 12., 18.5, 25.]) - result = s.interpolate(method='slinear') + expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) + result = s.interpolate(method="slinear") assert_series_equal(result, expected) - result = s.interpolate(method='slinear', downcast='infer') + result = s.interpolate(method="slinear", downcast="infer") assert_series_equal(result, expected) # nearest expected = Series([1, 3, 3, 12, 12, 25]) - result = s.interpolate(method='nearest') - assert_series_equal(result, expected.astype('float')) + result = s.interpolate(method="nearest") + assert_series_equal(result, expected.astype("float")) - result = s.interpolate(method='nearest', downcast='infer') + result = 
s.interpolate(method="nearest", downcast="infer") assert_series_equal(result, expected) # zero expected = Series([1, 3, 3, 12, 12, 25]) - result = s.interpolate(method='zero') - assert_series_equal(result, expected.astype('float')) + result = s.interpolate(method="zero") + assert_series_equal(result, expected.astype("float")) - result = s.interpolate(method='zero', downcast='infer') + result = s.interpolate(method="zero", downcast="infer") assert_series_equal(result, expected) # quadratic # GH #15662. - expected = Series([1, 3., 6.823529, 12., 18.058824, 25.]) - result = s.interpolate(method='quadratic') + expected = Series([1, 3.0, 6.823529, 12.0, 18.058824, 25.0]) + result = s.interpolate(method="quadratic") assert_series_equal(result, expected) - result = s.interpolate(method='quadratic', downcast='infer') + result = s.interpolate(method="quadratic", downcast="infer") assert_series_equal(result, expected) # cubic - expected = Series([1., 3., 6.8, 12., 18.2, 25.]) - result = s.interpolate(method='cubic') + expected = Series([1.0, 3.0, 6.8, 12.0, 18.2, 25.0]) + result = s.interpolate(method="cubic") assert_series_equal(result, expected) def test_interp_limit(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - expected = Series([1., 3., 5., 7., np.nan, 11.]) - result = s.interpolate(method='linear', limit=2) + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + result = s.interpolate(method="linear", limit=2) assert_series_equal(result, expected) @pytest.mark.parametrize("limit", [-1, 0]) - def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, - limit): + def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, limit): # GH 9217: make sure limit is greater than zero. s = pd.Series([1, 2, np.nan, 4]) method, kwargs = nontemporal_method @@ -1108,7 +1326,7 @@ def test_interpolate_invalid_float_limit(self, nontemporal_method): with pytest.raises(ValueError, match="Limit must be an integer"): s.interpolate(limit=limit, method=method, **kwargs) - @pytest.mark.parametrize("invalid_method", [None, 'nonexistent_method']) + @pytest.mark.parametrize("invalid_method", [None, "nonexistent_method"]) def test_interp_invalid_method(self, invalid_method): s = Series([1, 3, np.nan, 12, np.nan, 25]) @@ -1125,164 +1343,154 @@ def test_interp_limit_forward(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) # Provide 'forward' (the default) explicitly here. 
- expected = Series([1., 3., 5., 7., np.nan, 11.]) + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) - result = s.interpolate(method='linear', limit=2, - limit_direction='forward') + result = s.interpolate(method="linear", limit=2, limit_direction="forward") assert_series_equal(result, expected) - result = s.interpolate(method='linear', limit=2, - limit_direction='FORWARD') + result = s.interpolate(method="linear", limit=2, limit_direction="FORWARD") assert_series_equal(result, expected) def test_interp_unlimited(self): # these test are for issue #16282 default Limit=None is unlimited - s = Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan]) - expected = Series([1., 1., 3., 5., 7., 9., 11., 11.]) - result = s.interpolate(method='linear', - limit_direction='both') + s = Series([np.nan, 1.0, 3.0, np.nan, np.nan, np.nan, 11.0, np.nan]) + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="both") assert_series_equal(result, expected) - expected = Series([np.nan, 1., 3., 5., 7., 9., 11., 11.]) - result = s.interpolate(method='linear', - limit_direction='forward') + expected = Series([np.nan, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="forward") assert_series_equal(result, expected) - expected = Series([1., 1., 3., 5., 7., 9., 11., np.nan]) - result = s.interpolate(method='linear', - limit_direction='backward') + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, np.nan]) + result = s.interpolate(method="linear", limit_direction="backward") assert_series_equal(result, expected) def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - msg = (r"Invalid limit_direction: expecting one of \['forward'," - r" 'backward', 'both'\], got 'abc'") + msg = ( + r"Invalid limit_direction: expecting one of \['forward'," + r" 'backward', 'both'\], got 'abc'" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit=2, limit_direction='abc') + s.interpolate(method="linear", limit=2, limit_direction="abc") # raises an error even if no limit is specified. with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit_direction='abc') + s.interpolate(method="linear", limit_direction="abc") # limit_area introduced GH #16284 def test_interp_limit_area(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan]) - expected = Series([nan, nan, 3., 4., 5., 6., 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside') + expected = Series([nan, nan, 3.0, 4.0, 5.0, 6.0, 7.0, nan, nan]) + result = s.interpolate(method="linear", limit_area="inside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3., 4., nan, nan, 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside', - limit=1) + expected = Series([nan, nan, 3.0, 4.0, nan, nan, 7.0, nan, nan]) + result = s.interpolate(method="linear", limit_area="inside", limit=1) - expected = Series([nan, nan, 3., 4., nan, 6., 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside', - limit_direction='both', limit=1) + expected = Series([nan, nan, 3.0, 4.0, nan, 6.0, 7.0, nan, nan]) + result = s.interpolate( + method="linear", limit_area="inside", limit_direction="both", limit=1 + ) assert_series_equal(result, expected) - expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., 7.]) - result = s.interpolate(method='linear', limit_area='outside') + expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, 7.0]) + result = s.interpolate(method="linear", limit_area="outside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., nan]) - result = s.interpolate(method='linear', limit_area='outside', - limit=1) + expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + result = s.interpolate(method="linear", limit_area="outside", limit=1) - expected = Series([nan, 3., 3., nan, nan, nan, 7., 7., nan]) - result = s.interpolate(method='linear', limit_area='outside', - limit_direction='both', limit=1) + expected = Series([nan, 3.0, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + result = s.interpolate( + method="linear", limit_area="outside", limit_direction="both", limit=1 + ) assert_series_equal(result, expected) - expected = Series([3., 3., 3., nan, nan, nan, 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='outside', - direction='backward') + expected = Series([3.0, 3.0, 3.0, nan, nan, nan, 7.0, nan, nan]) + result = s.interpolate( + method="linear", limit_area="outside", direction="backward" + ) # raises an error even if limit type is wrong. - msg = (r"Invalid limit_area: expecting one of \['inside', 'outside'\]," - " got abc") + msg = ( + r"Invalid limit_area: expecting one of \['inside', 'outside'\]," " got abc" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit_area='abc') + s.interpolate(method="linear", limit_area="abc") def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - expected = Series([1., 3., np.nan, 7., 9., 11.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='backward') + expected = Series([1.0, 3.0, np.nan, 7.0, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([1., 3., 5., np.nan, 9., 11.]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series([1.0, 3.0, 5.0, np.nan, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) # Check that this works on a longer series of nans. 
- s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, - np.nan]) + s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan]) - expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='both') + expected = Series([1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") assert_series_equal(result, expected) - expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series( + [1.0, 3.0, 4.0, np.nan, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0] + ) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) def test_interp_limit_to_ends(self): # These test are for issue #10420 -- flow back to beginning. s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) - expected = Series([5., 5., 5., 7., 9., np.nan]) - result = s.interpolate(method='linear', limit=2, - limit_direction='backward') + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, np.nan]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([5., 5., 5., 7., 9., 9.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='both') + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, 9.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") assert_series_equal(result, expected) def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) - expected = Series([np.nan, np.nan, 5., 7., 7., np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='forward') + expected = Series([np.nan, np.nan, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="forward") assert_series_equal(result, expected) - expected = Series([np.nan, 5., 5., 7., np.nan, np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='backward') + expected = Series([np.nan, 5.0, 5.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([np.nan, 5., 5., 7., 7., np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series([np.nan, 5.0, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_all_good(self): s = Series([1, 2, 3]) - result = s.interpolate(method='polynomial', order=1) + result = s.interpolate(method="polynomial", order=1) assert_series_equal(result, s) # non-scipy result = s.interpolate() assert_series_equal(result, s) - @pytest.mark.parametrize("check_scipy", [ - False, - pytest.param(True, marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) def test_interp_multiIndex(self, check_scipy): - idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) s = Series([1, 2, np.nan], index=idx) expected = s.copy() @@ -1293,32 +1501,31 @@ def test_interp_multiIndex(self, check_scipy): msg = "Only `method=linear` interpolation is supported on MultiIndexes" if 
check_scipy: with pytest.raises(ValueError, match=msg): - s.interpolate(method='polynomial', order=1) + s.interpolate(method="polynomial", order=1) @td.skip_if_no_scipy def test_interp_nonmono_raise(self): s = Series([1, np.nan, 3], index=[0, 2, 1]) msg = "krogh interpolation requires that the index be monotonic" with pytest.raises(ValueError, match=msg): - s.interpolate(method='krogh') + s.interpolate(method="krogh") @td.skip_if_no_scipy def test_interp_datetime64(self): - df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) - result = df.interpolate(method='nearest') - expected = Series([1., 1., 3.], - index=date_range('1/1/2000', periods=3)) + df = Series([1, np.nan, 3], index=date_range("1/1/2000", periods=3)) + result = df.interpolate(method="nearest") + expected = Series([1.0, 1.0, 3.0], index=date_range("1/1/2000", periods=3)) assert_series_equal(result, expected) def test_interp_limit_no_nans(self): # GH 7173 - s = pd.Series([1., 2., 3.]) + s = pd.Series([1.0, 2.0, 3.0]) result = s.interpolate(limit=1) expected = s assert_series_equal(result, expected) @td.skip_if_no_scipy - @pytest.mark.parametrize("method", ['polynomial', 'spline']) + @pytest.mark.parametrize("method", ["polynomial", "spline"]) def test_no_order(self, method): # see GH-10633, GH-24014 s = Series([0, 1, np.nan, 3]) @@ -1327,83 +1534,81 @@ def test_no_order(self, method): s.interpolate(method=method) @td.skip_if_no_scipy - @pytest.mark.parametrize('order', [-1, -1.0, 0, 0.0, np.nan]) + @pytest.mark.parametrize("order", [-1, -1.0, 0, 0.0, np.nan]) def test_interpolate_spline_invalid_order(self, order): s = Series([0, 1, np.nan, 3]) msg = "order needs to be specified and greater than 0" with pytest.raises(ValueError, match=msg): - s.interpolate(method='spline', order=order) + s.interpolate(method="spline", order=order) @td.skip_if_no_scipy def test_spline(self): s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) - result = s.interpolate(method='spline', order=1) - expected = Series([1., 2., 3., 4., 5., 6., 7.]) + result = s.interpolate(method="spline", order=1) + expected = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_spline_extrapolate(self): s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) - result3 = s.interpolate(method='spline', order=1, ext=3) - expected3 = Series([1., 2., 3., 4., 5., 6., 6.]) + result3 = s.interpolate(method="spline", order=1, ext=3) + expected3 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0]) assert_series_equal(result3, expected3) - result1 = s.interpolate(method='spline', order=1, ext=0) - expected1 = Series([1., 2., 3., 4., 5., 6., 7.]) + result1 = s.interpolate(method="spline", order=1, ext=0) + expected1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) assert_series_equal(result1, expected1) @td.skip_if_no_scipy def test_spline_smooth(self): s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7]) - assert (s.interpolate(method='spline', order=3, s=0)[5] != - s.interpolate(method='spline', order=3)[5]) + assert ( + s.interpolate(method="spline", order=3, s=0)[5] + != s.interpolate(method="spline", order=3)[5] + ) @td.skip_if_no_scipy def test_spline_interpolation(self): s = Series(np.arange(10) ** 2) s[np.random.randint(0, 9, 3)] = np.nan - result1 = s.interpolate(method='spline', order=1) - expected1 = s.interpolate(method='spline', order=1) + result1 = s.interpolate(method="spline", order=1) + expected1 = s.interpolate(method="spline", order=1) assert_series_equal(result1, expected1) def test_interp_timedelta64(self): # GH 6424 - df 
= Series([1, np.nan, 3], - index=pd.to_timedelta([1, 2, 3])) - result = df.interpolate(method='time') - expected = Series([1., 2., 3.], - index=pd.to_timedelta([1, 2, 3])) + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 3])) + result = df.interpolate(method="time") + expected = Series([1.0, 2.0, 3.0], index=pd.to_timedelta([1, 2, 3])) assert_series_equal(result, expected) # test for non uniform spacing - df = Series([1, np.nan, 3], - index=pd.to_timedelta([1, 2, 4])) - result = df.interpolate(method='time') - expected = Series([1., 1.666667, 3.], - index=pd.to_timedelta([1, 2, 4])) + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 4])) + result = df.interpolate(method="time") + expected = Series([1.0, 1.666667, 3.0], index=pd.to_timedelta([1, 2, 4])) assert_series_equal(result, expected) def test_series_interpolate_method_values(self): # #1646 - ts = _simple_ts('1/1/2000', '1/20/2000') + ts = _simple_ts("1/1/2000", "1/20/2000") ts[::2] = np.nan - result = ts.interpolate(method='values') + result = ts.interpolate(method="values") exp = ts.interpolate() assert_series_equal(result, exp) def test_series_interpolate_intraday(self): # #1698 - index = pd.date_range('1/1/2012', periods=4, freq='12D') + index = pd.date_range("1/1/2012", periods=4, freq="12D") ts = pd.Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(days=1)).sort_values() - exp = ts.reindex(new_index).interpolate(method='time') + exp = ts.reindex(new_index).interpolate(method="time") - index = pd.date_range('1/1/2012', periods=4, freq='12H') + index = pd.date_range("1/1/2012", periods=4, freq="12H") ts = pd.Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() - result = ts.reindex(new_index).interpolate(method='time') + result = ts.reindex(new_index).interpolate(method="time") tm.assert_numpy_array_equal(result.values, exp.values) @@ -1416,10 +1621,11 @@ def test_nonzero_warning(self): @pytest.mark.parametrize( "ind", [ - ['a', 'b', 'c', 'd'], + ["a", "b", "c", "d"], pd.period_range(start="2019-01-01", periods=4), pd.interval_range(start=0, end=4), - ]) + ], + ) def test_interp_non_timedelta_index(self, interp_methods_ind, ind): # gh 21662 df = pd.DataFrame([0, 1, np.nan, 3], index=ind) @@ -1437,7 +1643,8 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): "Index column must be numeric or datetime type when " "using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method)) + "interpolating.".format(method=method) + ) with pytest.raises(ValueError, match=expected_error): df[0].interpolate(method=method, **kwargs) @@ -1461,6 +1668,5 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): assert_series_equal(result, expected) else: pytest.skip( - "This interpolation method is not supported for " - "Timedelta Index yet." + "This interpolation method is not supported for " "Timedelta Index yet." 
) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 215fa9f22277e9..aada5cca9fdc75 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -5,25 +5,26 @@ import pytest import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, Series, bdate_range, date_range, isna) +from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) from .common import TestData class TestSeriesLogicalOps: - @pytest.mark.parametrize('bool_op', [operator.and_, - operator.or_, operator.xor]) + @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs - ser = Series(bdate_range('1/1/2000', periods=10), dtype=object) + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) ser[::2] = np.nan mask = ser.isna() @@ -37,7 +38,7 @@ def test_bool_operators_with_nas(self, bool_op): def test_operators_bitwise(self): # GH#9016: support bitwise op for integer types - index = list('bca') + index = list("bca") s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) @@ -47,7 +48,7 @@ def test_operators_bitwise(self): # TODO: unused # s_0101 = Series([0, 1, 0, 1]) - s_0123 = Series(range(4), dtype='int64') + s_0123 = Series(range(4), dtype="int64") s_3333 = Series([3] * 4) s_4444 = Series([4] * 4) @@ -60,21 +61,21 @@ def test_operators_bitwise(self): assert_series_equal(res, expected) res = s_0123 & s_3333 - expected = Series(range(4), dtype='int64') + expected = Series(range(4), dtype="int64") assert_series_equal(res, expected) res = s_0123 | s_4444 - expected = Series(range(4, 8), dtype='int64') + expected = Series(range(4, 8), dtype="int64") assert_series_equal(res, expected) - s_a0b1c0 = Series([1], list('b')) + s_a0b1c0 = Series([1], list("b")) res = s_tft & s_a0b1c0 - expected = s_tff.reindex(list('abc')) + expected = s_tff.reindex(list("abc")) assert_series_equal(res, expected) res = s_tft | s_a0b1c0 - expected = s_tft.reindex(list('abc')) + expected = s_tft.reindex(list("abc")) assert_series_equal(res, expected) n0 = 0 @@ -95,19 +96,19 @@ def test_operators_bitwise(self): expected = Series([0, 1, 0, 1]) assert_series_equal(res, expected) - s_1111 = Series([1] * 4, dtype='int8') + s_1111 = Series([1] * 4, dtype="int8") res = s_0123 & s_1111 - expected = Series([0, 1, 0, 1], dtype='int64') + expected = Series([0, 1, 0, 1], dtype="int64") assert_series_equal(res, expected) res = s_0123.astype(np.int16) | s_1111.astype(np.int32) - expected = Series([1, 1, 3, 3], dtype='int32') + expected = Series([1, 1, 3, 3], dtype="int32") assert_series_equal(res, expected) with pytest.raises(TypeError): - s_1111 & 'a' + s_1111 & "a" with pytest.raises(TypeError): - s_1111 & ['a', 'b', 'c', 'd'] + s_1111 & ["a", "b", "c", "d"] with pytest.raises(TypeError): s_0123 & np.NaN with pytest.raises(TypeError): @@ -116,24 +117,25 @@ def test_operators_bitwise(self): s_0123 & [0.1, 4, 3.14, 2] # s_0123 will be all false now because of reindexing like s_tft - exp = 
Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) assert_series_equal(s_tft & s_0123, exp) # s_tft will be all false now because of reindexing like s_0123 - exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) assert_series_equal(s_0123 & s_tft, exp) assert_series_equal(s_0123 & False, Series([False] * 4)) assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) assert_series_equal(s_0123 & [False], Series([False] * 4)) assert_series_equal(s_0123 & (False), Series([False] * 4)) - assert_series_equal(s_0123 & Series([False, np.NaN, False, False]), - Series([False] * 4)) + assert_series_equal( + s_0123 & Series([False, np.NaN, False, False]), Series([False] * 4) + ) s_ftft = Series([False, True, False, True]) assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft) - s_abNd = Series(['a', 'b', np.NaN, 'd']) + s_abNd = Series(["a", "b", np.NaN, "d"]) res = s_0123 & s_abNd expected = s_ftft assert_series_equal(res, expected) @@ -152,7 +154,7 @@ def test_scalar_na_logical_ops_corners(self): result = s & list(s) assert_series_equal(result, expected) - d = DataFrame({'A': s}) + d = DataFrame({"A": s}) # TODO: Fix this exception - needs to be fixed! (see GH5035) # (previously this was a TypeError because series returned # NotImplemented @@ -161,7 +163,7 @@ def test_scalar_na_logical_ops_corners(self): # https://github.com/pandas-dev/pandas/issues/5284 with pytest.raises(TypeError): - d.__and__(s, axis='columns') + d.__and__(s, axis="columns") with pytest.raises(TypeError): s & d @@ -169,11 +171,7 @@ def test_scalar_na_logical_ops_corners(self): # this is wrong as its not a boolean result # result = d.__and__(s,axis='index') - @pytest.mark.parametrize('op', [ - operator.and_, - operator.or_, - operator.xor, - ]) + @pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) @@ -185,26 +183,33 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx1) assert_series_equal(result, expected) - expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], - dtype=bool) + expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], dtype=bool) result = op(ser, idx2) assert_series_equal(result, expected) - @pytest.mark.parametrize('op', [ - pytest.param(ops.rand_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation returns " - "Index", - raises=AssertionError, - strict=True)), - pytest.param(ops.ror_, - marks=pytest.mark.xfail(reason="Index.get_indexer " - "with non unique index", - raises=InvalidIndexError, - strict=True)), - ops.rxor, - ]) + @pytest.mark.parametrize( + "op", + [ + pytest.param( + ops.rand_, + marks=pytest.mark.xfail( + reason="GH#22092 Index " "implementation returns " "Index", + raises=AssertionError, + strict=True, + ), + ), + pytest.param( + ops.ror_, + marks=pytest.mark.xfail( + reason="Index.get_indexer " "with non unique index", + raises=InvalidIndexError, + strict=True, + ), + ), + ops.rxor, + ], + ) def test_reversed_logical_ops_with_index(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) @@ -222,11 +227,14 @@ def test_reversed_logical_ops_with_index(self, op): result = op(ser, idx2) assert_index_equal(result, expected) - @pytest.mark.parametrize("op, expected", [ - (ops.rand_, pd.Index([False, True])), - (ops.ror_, pd.Index([False, True])), - 
(ops.rxor, pd.Index([])), - ]) + @pytest.mark.parametrize( + "op, expected", + [ + (ops.rand_, pd.Index([False, True])), + (ops.ror_, pd.Index([False, True])), + (ops.rxor, pd.Index([])), + ], + ) def test_reverse_ops_with_index(self, op, expected): # https://github.com/pandas-dev/pandas/pull/23628 # multi-set Index ops are buggy, so let's avoid duplicates... @@ -239,30 +247,30 @@ def test_logical_ops_label_based(self): # GH#4947 # logical ops should be label based - a = Series([True, False, True], list('bca')) - b = Series([False, True, False], list('abc')) + a = Series([True, False, True], list("bca")) + b = Series([False, True, False], list("abc")) - expected = Series([False, True, False], list('abc')) + expected = Series([False, True, False], list("abc")) result = a & b assert_series_equal(result, expected) - expected = Series([True, True, False], list('abc')) + expected = Series([True, True, False], list("abc")) result = a | b assert_series_equal(result, expected) - expected = Series([True, False, False], list('abc')) + expected = Series([True, False, False], list("abc")) result = a ^ b assert_series_equal(result, expected) # rhs is bigger - a = Series([True, False, True], list('bca')) - b = Series([False, True, False, True], list('abcd')) + a = Series([True, False, True], list("bca")) + b = Series([False, True, False, True], list("abcd")) - expected = Series([False, True, False, False], list('abcd')) + expected = Series([False, True, False, False], list("abcd")) result = a & b assert_series_equal(result, expected) - expected = Series([True, True, False, False], list('abcd')) + expected = Series([True, True, False, False], list("abcd")) result = a | b assert_series_equal(result, expected) @@ -270,35 +278,39 @@ def test_logical_ops_label_based(self): # vs empty result = a & Series([]) - expected = Series([False, False, False], list('bca')) + expected = Series([False, False, False], list("bca")) assert_series_equal(result, expected) result = a | Series([]) - expected = Series([True, False, True], list('bca')) + expected = Series([True, False, True], list("bca")) assert_series_equal(result, expected) # vs non-matching - result = a & Series([1], ['z']) - expected = Series([False, False, False, False], list('abcz')) + result = a & Series([1], ["z"]) + expected = Series([False, False, False, False], list("abcz")) assert_series_equal(result, expected) - result = a | Series([1], ['z']) - expected = Series([True, True, False, False], list('abcz')) + result = a | Series([1], ["z"]) + expected = Series([True, True, False, False], list("abcz")) assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - for e in [Series([]), Series([1], ['z']), - Series(np.nan, b.index), Series(np.nan, a.index)]: + for e in [ + Series([]), + Series([1], ["z"]), + Series(np.nan, b.index), + Series(np.nan, a.index), + ]: result = a[a | e] assert_series_equal(result, a[a]) - for e in [Series(['z'])]: + for e in [Series(["z"])]: result = a[a | e] assert_series_equal(result, a[a]) # vs scalars - index = list('bca') + index = list("bca") t = Series([True, False, True]) for v in [True, 1, 2]: @@ -306,7 +318,7 @@ def test_logical_ops_label_based(self): expected = Series([True, True, True], index=index) assert_series_equal(result, expected) - for v in [np.nan, 'foo']: + for v in [np.nan, "foo"]: with pytest.raises(TypeError): t | v @@ -330,59 +342,49 @@ def test_logical_ops_label_based(self): def test_logical_ops_df_compat(self): # GH#1134 - s1 = pd.Series([True, 
False, True], index=list('ABC'), name='x') - s2 = pd.Series([True, True, False], index=list('ABD'), name='x') + s1 = pd.Series([True, False, True], index=list("ABC"), name="x") + s2 = pd.Series([True, True, False], index=list("ABD"), name="x") - exp = pd.Series([True, False, False, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, False, False, False], index=list("ABCD"), name="x") assert_series_equal(s1 & s2, exp) assert_series_equal(s2 & s1, exp) # True | np.nan => True - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, False], index=list("ABCD"), name="x") assert_series_equal(s1 | s2, exp) # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, False, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, False, False], index=list("ABCD"), name="x") assert_series_equal(s2 | s1, exp) # DataFrame doesn't fill nan with False - exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, False, np.nan, np.nan]}, index=list("ABCD")) assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) - exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD")) assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) # different length - s3 = pd.Series([True, False, True], index=list('ABC'), name='x') - s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') + s3 = pd.Series([True, False, True], index=list("ABC"), name="x") + s4 = pd.Series([True, True, True, True], index=list("ABCD"), name="x") - exp = pd.Series([True, False, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, False, True, False], index=list("ABCD"), name="x") assert_series_equal(s3 & s4, exp) assert_series_equal(s4 & s3, exp) # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, False], index=list("ABCD"), name="x") assert_series_equal(s3 | s4, exp) # True | np.nan => True - exp = pd.Series([True, True, True, True], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, True], index=list("ABCD"), name="x") assert_series_equal(s4 | s3, exp) - exp = pd.DataFrame({'x': [True, False, True, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, False, True, np.nan]}, index=list("ABCD")) assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) - exp = pd.DataFrame({'x': [True, True, True, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, True, True, np.nan]}, index=list("ABCD")) assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) @@ -394,13 +396,13 @@ def test_comparisons(self): left[:3] = np.nan result = nanops.nangt(left, right) - with np.errstate(invalid='ignore'): - expected = (left > right).astype('O') + with np.errstate(invalid="ignore"): + expected = (left > right).astype("O") expected[:3] = np.nan assert_almost_equal(result, expected) - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! 
@@ -411,21 +413,21 @@ def test_comparisons(self): def test_categorical_comparisons(self): # GH 8938 # allow equality comparisons - a = Series(list('abc'), dtype="category") - b = Series(list('abc'), dtype="object") - c = Series(['a', 'b', 'cc'], dtype="object") - d = Series(list('acb'), dtype="object") - e = Categorical(list('abc')) - f = Categorical(list('acb')) + a = Series(list("abc"), dtype="category") + b = Series(list("abc"), dtype="object") + c = Series(["a", "b", "cc"], dtype="object") + d = Series(list("acb"), dtype="object") + e = Categorical(list("abc")) + f = Categorical(list("acb")) # vs scalar - assert not (a == 'a').all() - assert ((a != 'a') == ~(a == 'a')).all() + assert not (a == "a").all() + assert ((a != "a") == ~(a == "a")).all() - assert not ('a' == a).all() - assert (a == 'a')[0] - assert ('a' == a)[0] - assert not ('a' != a)[0] + assert not ("a" == a).all() + assert (a == "a")[0] + assert ("a" == a)[0] + assert not ("a" != a)[0] # vs list-like assert (a == a).all() @@ -448,10 +450,10 @@ def test_categorical_comparisons(self): assert not (a == f).all() assert not (f == a).all() - assert ((~(a == e) == (a != e)).all()) - assert ((~(e == a) == (e != a)).all()) - assert ((~(a == f) == (a != f)).all()) - assert ((~(f == a) == (f != a)).all()) + assert (~(a == e) == (a != e)).all() + assert (~(e == a) == (e != a)).all() + assert (~(a == f) == (a != f)).all() + assert (~(f == a) == (f != a)).all() # non-equality is not comparable with pytest.raises(TypeError): @@ -501,11 +503,11 @@ def test_comparison_tuples(self): assert_series_equal(result, expected) def test_comparison_operators_with_nas(self): - ser = Series(bdate_range('1/1/2000', periods=10), dtype=object) + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) ser[::2] = np.nan # test that comparisons work - ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + ops = ["lt", "le", "gt", "ge", "eq", "ne"] for op in ops: val = ser[5] @@ -514,7 +516,7 @@ def test_comparison_operators_with_nas(self): expected = f(ser.dropna(), val).reindex(ser.index) - if op == 'ne': + if op == "ne": expected = expected.fillna(True).astype(bool) else: expected = expected.fillna(False).astype(bool) @@ -561,11 +563,11 @@ def test_ne(self): def test_comp_ops_df_compat(self): # GH 1134 - s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: @@ -591,70 +593,68 @@ def test_comp_ops_df_compat(self): def test_compare_series_interval_keyword(self): # GH 25338 - s = Series(['IntervalA', 'IntervalB', 'IntervalC']) - result = s == 'IntervalA' + s = Series(["IntervalA", "IntervalB", "IntervalC"]) + result = s == "IntervalA" expected = Series([True, False, False]) assert_series_equal(result, expected) class TestSeriesFlexComparisonOps: - def test_comparison_flex_alignment(self): - left = Series([1, 3, 2], index=list('abc')) - right = Series([2, 2, 2], index=list('bcd')) + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) - exp = pd.Series([False, False, True, False], index=list('abcd')) + exp = pd.Series([False, False, True, False], 
index=list("abcd")) assert_series_equal(left.eq(right), exp) - exp = pd.Series([True, True, False, True], index=list('abcd')) + exp = pd.Series([True, True, False, True], index=list("abcd")) assert_series_equal(left.ne(right), exp) - exp = pd.Series([False, False, True, False], index=list('abcd')) + exp = pd.Series([False, False, True, False], index=list("abcd")) assert_series_equal(left.le(right), exp) - exp = pd.Series([False, False, False, False], index=list('abcd')) + exp = pd.Series([False, False, False, False], index=list("abcd")) assert_series_equal(left.lt(right), exp) - exp = pd.Series([False, True, True, False], index=list('abcd')) + exp = pd.Series([False, True, True, False], index=list("abcd")) assert_series_equal(left.ge(right), exp) - exp = pd.Series([False, True, False, False], index=list('abcd')) + exp = pd.Series([False, True, False, False], index=list("abcd")) assert_series_equal(left.gt(right), exp) def test_comparison_flex_alignment_fill(self): - left = Series([1, 3, 2], index=list('abc')) - right = Series([2, 2, 2], index=list('bcd')) + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) - exp = pd.Series([False, False, True, True], index=list('abcd')) + exp = pd.Series([False, False, True, True], index=list("abcd")) assert_series_equal(left.eq(right, fill_value=2), exp) - exp = pd.Series([True, True, False, False], index=list('abcd')) + exp = pd.Series([True, True, False, False], index=list("abcd")) assert_series_equal(left.ne(right, fill_value=2), exp) - exp = pd.Series([False, False, True, True], index=list('abcd')) + exp = pd.Series([False, False, True, True], index=list("abcd")) assert_series_equal(left.le(right, fill_value=0), exp) - exp = pd.Series([False, False, False, True], index=list('abcd')) + exp = pd.Series([False, False, False, True], index=list("abcd")) assert_series_equal(left.lt(right, fill_value=0), exp) - exp = pd.Series([True, True, True, False], index=list('abcd')) + exp = pd.Series([True, True, True, False], index=list("abcd")) assert_series_equal(left.ge(right, fill_value=0), exp) - exp = pd.Series([True, True, False, False], index=list('abcd')) + exp = pd.Series([True, True, False, False], index=list("abcd")) assert_series_equal(left.gt(right, fill_value=0), exp) class TestSeriesOperators(TestData): - def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) - s2 = Series({'x': 0.}) - assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) + s2 = Series({"x": 0.0}) + assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) def test_ops_datetimelike_align(self): # GH 7500 # datetimelike ops need to align - dt = Series(date_range('2012-1-1', periods=3, freq='D')) + dt = Series(date_range("2012-1-1", periods=3, freq="D")) dt.iloc[2] = np.nan dt2 = dt[::-1] @@ -687,23 +687,26 @@ def test_operators_corner(self): # float + int int_ts = self.ts.astype(int)[:-5] added = self.ts + int_ts - expected = Series(self.ts.values[:-5] + int_ts.values, - index=self.ts.index[:-5], name='ts') + expected = Series( + self.ts.values[:-5] + int_ts.values, index=self.ts.index[:-5], name="ts" + ) tm.assert_series_equal(added[:-5], expected) - pairings = [(Series.div, operator.truediv, 1), - (Series.rdiv, lambda x, y: operator.truediv(y, x), 1)] - for op in ['add', 'sub', 'mul', 'pow', 'truediv', 'floordiv']: + pairings = [ + (Series.div, operator.truediv, 1), + (Series.rdiv, lambda x, y: operator.truediv(y, x), 1), + ] + for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: fv = 0 lop = 
getattr(Series, op) lequiv = getattr(operator, op) - rop = getattr(Series, 'r' + op) + rop = getattr(Series, "r" + op) # bind op at definition time... requiv = lambda x, y, op=op: getattr(operator, op)(y, x) pairings.append((lop, lequiv, fv)) pairings.append((rop, requiv, fv)) - @pytest.mark.parametrize('op, equiv_op, fv', pairings) + @pytest.mark.parametrize("op, equiv_op, fv", pairings) def test_operators_combine(self, op, equiv_op, fv): def _check_fill(meth, op, a, b, fill_value=0): exp_index = a.index.union(b.index) @@ -715,7 +718,7 @@ def _check_fill(meth, op, a, b, fill_value=0): exp_values = [] for i in range(len(exp_index)): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if amask[i]: if bmask[i]: exp_values.append(np.nan) @@ -733,8 +736,8 @@ def _check_fill(meth, op, a, b, fill_value=0): expected = Series(exp_values, exp_index) assert_series_equal(result, expected) - a = Series([np.nan, 1., 2., 3., np.nan], index=np.arange(5)) - b = Series([np.nan, 1, np.nan, 3, np.nan, 4.], index=np.arange(6)) + a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) result = op(a, b) exp = equiv_op(a, b) @@ -746,8 +749,10 @@ def _check_fill(meth, op, a, b, fill_value=0): def test_operators_na_handling(self): from decimal import Decimal from datetime import date - s = Series([Decimal('1.3'), Decimal('2.3')], - index=[date(2012, 1, 1), date(2012, 1, 2)]) + + s = Series( + [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] + ) result = s + s.shift(1) result2 = s.shift(1) + s @@ -764,8 +769,8 @@ def test_op_duplicate_index(self): def test_divmod(self): # GH25557 - a = Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) - b = Series([2, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) + a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) result = a.divmod(b) expected = divmod(a, b) @@ -783,10 +788,10 @@ class TestSeriesUnaryOps: def test_neg(self): ser = tm.makeStringSeries() - ser.name = 'series' + ser.name = "series" assert_series_equal(-ser, -1 * ser) def test_invert(self): ser = tm.makeStringSeries() - ser.name = 'series' + ser.name = "series" assert_series_equal(-(ser < 0), ~(ser < 0)) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 6b0edf670e03ec..9b34b52bf39b93 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -8,52 +8,49 @@ class TestSeriesPeriod: - def setup_method(self, method): - self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) def test_auto_conversion(self): - series = Series(list(period_range('2000-01-01', periods=10, freq='D'))) - assert series.dtype == 'Period[D]' + series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) + assert series.dtype == "Period[D]" - series = pd.Series([pd.Period('2011-01-01', freq='D'), - pd.Period('2011-02-01', freq='D')]) - assert series.dtype == 'Period[D]' + series = pd.Series( + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")] + ) + assert series.dtype == "Period[D]" def test_getitem(self): - assert self.series[1] == pd.Period('2000-01-02', freq='D') + assert self.series[1] == pd.Period("2000-01-02", freq="D") result = self.series[[2, 4]] - exp = pd.Series([pd.Period('2000-01-03', freq='D'), - pd.Period('2000-01-05', freq='D')], - index=[2, 4], 
dtype='Period[D]') + exp = pd.Series( + [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], + index=[2, 4], + dtype="Period[D]", + ) tm.assert_series_equal(result, exp) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" def test_isna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) tm.assert_series_equal(s.isna(), Series([False, True])) tm.assert_series_equal(s.notna(), Series([True, False])) def test_fillna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) - res = s.fillna(pd.Period('2012-01', freq='M')) - exp = Series([pd.Period('2011-01', freq='M'), - pd.Period('2012-01', freq='M')]) + res = s.fillna(pd.Period("2012-01", freq="M")) + exp = Series([pd.Period("2011-01", freq="M"), pd.Period("2012-01", freq="M")]) tm.assert_series_equal(res, exp) - assert res.dtype == 'Period[M]' + assert res.dtype == "Period[M]" def test_dropna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) - tm.assert_series_equal(s.dropna(), - Series([pd.Period('2011-01', freq='M')])) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) + tm.assert_series_equal(s.dropna(), Series([pd.Period("2011-01", freq="M")])) def test_between(self): left, right = self.series[[2, 7]] @@ -66,7 +63,7 @@ def test_between(self): @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") def test_NaT_scalar(self): - series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]') + series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]") val = series[3] assert pd.isna(val) @@ -76,7 +73,7 @@ def test_NaT_scalar(self): @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") def test_NaT_cast(self): - result = Series([np.nan]).astype('period[D]') + result = Series([np.nan]).astype("period[D]") expected = Series([pd.NaT]) tm.assert_series_equal(result, expected) @@ -96,64 +93,65 @@ def test_set_nan(self): assert self.series[6] is pd.NaT def test_intercept_astype_object(self): - expected = self.series.astype('object') + expected = self.series.astype("object") - df = DataFrame({'a': self.series, - 'b': np.random.randn(len(self.series))}) + df = DataFrame({"a": self.series, "b": np.random.randn(len(self.series))}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) + df = DataFrame({"a": self.series, "b": ["foo"] * len(self.series)}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() def test_align_series(self, join_type): - rng = period_range('1/1/2000', '1/1/2010', freq='A') + rng = period_range("1/1/2000", "1/1/2010", freq="A") ts = Series(np.random.randn(len(rng)), index=rng) ts.align(ts[::2], join=join_type) def test_truncate(self): # GH 17717 - idx1 = pd.PeriodIndex([ - pd.Period('2017-09-02'), - pd.Period('2017-09-02'), - pd.Period('2017-09-03') - ]) + idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) series1 = pd.Series([1, 2, 3], index=idx1) - result1 = series1.truncate(after='2017-09-02') + result1 = series1.truncate(after="2017-09-02") - expected_idx1 = pd.PeriodIndex([ - pd.Period('2017-09-02'), - pd.Period('2017-09-02') - ]) + expected_idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), 
pd.Period("2017-09-02")] + ) tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1)) - idx2 = pd.PeriodIndex([ - pd.Period('2017-09-03'), - pd.Period('2017-09-02'), - pd.Period('2017-09-03') - ]) + idx2 = pd.PeriodIndex( + [pd.Period("2017-09-03"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) series2 = pd.Series([1, 2, 3], index=idx2) - result2 = series2.sort_index().truncate(after='2017-09-02') + result2 = series2.sort_index().truncate(after="2017-09-02") - expected_idx2 = pd.PeriodIndex([ - pd.Period('2017-09-02') - ]) + expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2)) - @pytest.mark.parametrize('input_vals', [ - [Period('2016-01', freq='M'), Period('2016-02', freq='M')], - [Period('2016-01-01', freq='D'), Period('2016-01-02', freq='D')], - [Period('2016-01-01 00:00:00', freq='H'), - Period('2016-01-01 01:00:00', freq='H')], - [Period('2016-01-01 00:00:00', freq='M'), - Period('2016-01-01 00:01:00', freq='M')], - [Period('2016-01-01 00:00:00', freq='S'), - Period('2016-01-01 00:00:01', freq='S')] - ]) + @pytest.mark.parametrize( + "input_vals", + [ + [Period("2016-01", freq="M"), Period("2016-02", freq="M")], + [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], + [ + Period("2016-01-01 00:00:00", freq="H"), + Period("2016-01-01 01:00:00", freq="H"), + ], + [ + Period("2016-01-01 00:00:00", freq="M"), + Period("2016-01-01 00:01:00", freq="M"), + ], + [ + Period("2016-01-01 00:00:00", freq="S"), + Period("2016-01-01 00:00:01", freq="S"), + ], + ], + ) def test_end_time_timevalues(self, input_vals): # GH 17157 # Check that the time part of the Period is adjusted by end_time @@ -165,11 +163,9 @@ def test_end_time_timevalues(self, input_vals): expected = s.apply(lambda x: x.end_time) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('input_vals', [ - ('2001'), ('NaT') - ]) + @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) def test_to_period(self, input_vals): # GH 21205 - expected = Series([input_vals], dtype='Period[D]') - result = Series([input_vals], dtype='datetime64[ns]').dt.to_period('D') + expected = Series([input_vals], dtype="Period[D]") + result = Series([input_vals], dtype="datetime64[ns]").dt.to_period("D") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 18bc7b119c97ad..b001312fa37f39 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -12,7 +12,6 @@ class TestSeriesQuantile(TestData): - def test_quantile(self): q = self.ts.quantile(0.1) @@ -27,50 +26,53 @@ def test_quantile(self): # datetime64[ns] dtype dts = self.ts.index.to_series() - q = dts.quantile(.2) - assert q == Timestamp('2000-01-10 19:12:00') + q = dts.quantile(0.2) + assert q == Timestamp("2000-01-10 19:12:00") # timedelta64[ns] dtype tds = dts.diff() - q = tds.quantile(.25) - assert q == pd.to_timedelta('24:00:00') + q = tds.quantile(0.25) + assert q == pd.to_timedelta("24:00:00") # GH7661 - result = Series([np.timedelta64('NaT')]).sum() + result = Series([np.timedelta64("NaT")]).sum() assert result == pd.Timedelta(0) - msg = 'percentiles should all be in the interval \\[0, 1\\]' + msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): self.ts.quantile(invalid) def test_quantile_multi(self): - qs = [.1, .9] + qs = [0.1, 0.9] result = 
self.ts.quantile(qs) - expected = pd.Series([np.percentile(self.ts.dropna(), 10), - np.percentile(self.ts.dropna(), 90)], - index=qs, name=self.ts.name) + expected = pd.Series( + [np.percentile(self.ts.dropna(), 10), np.percentile(self.ts.dropna(), 90)], + index=qs, + name=self.ts.name, + ) tm.assert_series_equal(result, expected) dts = self.ts.index.to_series() - dts.name = 'xxx' - result = dts.quantile((.2, .2)) - expected = Series([Timestamp('2000-01-10 19:12:00'), - Timestamp('2000-01-10 19:12:00')], - index=[.2, .2], name='xxx') + dts.name = "xxx" + result = dts.quantile((0.2, 0.2)) + expected = Series( + [Timestamp("2000-01-10 19:12:00"), Timestamp("2000-01-10 19:12:00")], + index=[0.2, 0.2], + name="xxx", + ) tm.assert_series_equal(result, expected) result = self.ts.quantile([]) - expected = pd.Series([], name=self.ts.name, index=Index( - [], dtype=float)) + expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float)) tm.assert_series_equal(result, expected) def test_quantile_interpolation(self): # see gh-10174 # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation='linear') + q = self.ts.quantile(0.1, interpolation="linear") assert q == np.percentile(self.ts.dropna(), 10) q1 = self.ts.quantile(0.1) assert q1 == np.percentile(self.ts.dropna(), 10) @@ -82,11 +84,11 @@ def test_quantile_interpolation_dtype(self): # GH #10174 # interpolation = linear (default case) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="lower") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="higher") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) @@ -109,38 +111,56 @@ def test_quantile_nan(self): tm.assert_series_equal(res, pd.Series([np.nan], index=[0.5])) res = s.quantile([0.2, 0.3]) - tm.assert_series_equal(res, pd.Series([np.nan, np.nan], - index=[0.2, 0.3])) - - @pytest.mark.parametrize('case', [ - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - [pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days')], - # NaT - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), pd.NaT], - [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern'), pd.NaT], - [pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days'), pd.NaT]]) + tm.assert_series_equal(res, pd.Series([np.nan, np.nan], index=[0.2, 0.3])) + + @pytest.mark.parametrize( + "case", + [ + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + # NaT + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.NaT, + ], + [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + pd.NaT, + ], + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), 
+ pd.NaT, + ], + ], + ) def test_quantile_box(self, case): - s = pd.Series(case, name='XXX') + s = pd.Series(case, name="XXX") res = s.quantile(0.5) assert res == case[1] res = s.quantile([0.5]) - exp = pd.Series([case[1]], index=[0.5], name='XXX') + exp = pd.Series([case[1]], index=[0.5], name="XXX") tm.assert_series_equal(res, exp) def test_datetime_timedelta_quantiles(self): # covers #9694 - assert pd.isna(Series([], dtype='M8[ns]').quantile(.5)) - assert pd.isna(Series([], dtype='m8[ns]').quantile(.5)) + assert pd.isna(Series([], dtype="M8[ns]").quantile(0.5)) + assert pd.isna(Series([], dtype="m8[ns]").quantile(0.5)) def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile(0.5) @@ -149,10 +169,10 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) - @pytest.mark.parametrize('values, dtype', [ - ([0, 0, 0, 1, 2, 3], 'Sparse[int]'), - ([0., None, 1., 2.], 'Sparse[float]'), - ]) + @pytest.mark.parametrize( + "values, dtype", + [([0, 0, 0, 1, 2, 3], "Sparse[int]"), ([0.0, None, 1.0, 2.0], "Sparse[float]")], + ) def test_quantile_sparse(self, values, dtype): ser = pd.Series(values, dtype=dtype) result = ser.quantile([0.5]) @@ -162,7 +182,7 @@ def test_quantile_sparse(self, values, dtype): def test_quantile_empty(self): # floats - s = Series([], dtype='float64') + s = Series([], dtype="float64") res = s.quantile(0.5) assert np.isnan(res) @@ -172,7 +192,7 @@ def test_quantile_empty(self): tm.assert_series_equal(res, exp) # int - s = Series([], dtype='int64') + s = Series([], dtype="int64") res = s.quantile(0.5) assert np.isnan(res) @@ -182,7 +202,7 @@ def test_quantile_empty(self): tm.assert_series_equal(res, exp) # datetime - s = Series([], dtype='datetime64[ns]') + s = Series([], dtype="datetime64[ns]") res = s.quantile(0.5) assert res is pd.NaT diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 05ea9df5452113..f93e1651c8b107 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -19,23 +19,22 @@ class TestSeriesRank(TestData): s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, - 3.5, 1.5, 8.0, nan, 5.5]), - 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), - 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + "average": np.array([1.5, 5.5, 7.0, 3.5, nan, 3.5, 1.5, 8.0, nan, 5.5]), + "min": np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + "max": np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + "first": np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + "dense": np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } def test_rank(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") self.ts[::2] = np.nan - self.ts[:10][::3] = 4. 
+ self.ts[:10][::3] = 4.0 ranks = self.ts.rank() - oranks = self.ts.astype('O').rank() + oranks = self.ts.astype("O").rank() assert_series_equal(ranks, oranks) @@ -43,7 +42,7 @@ def test_rank(self): filled = self.ts.fillna(np.inf) # rankdata returns a ndarray - exp = Series(rankdata(filled), index=filled.index, name='ts') + exp = Series(rankdata(filled), index=filled.index, name="ts") exp[mask] = np.nan tm.assert_series_equal(ranks, exp) @@ -87,7 +86,7 @@ def test_rank(self): iranks = iseries.rank(pct=True) assert_series_equal(iranks, exp) - rng = date_range('1/1/1990', periods=5) + rng = date_range("1/1/1990", periods=5) iseries = Series(np.arange(5), rng) + 1 iseries.iloc[4] = np.nan exp = iseries / 4.0 @@ -100,18 +99,18 @@ def test_rank(self): assert_series_equal(iranks, exp) # GH 5968 - iseries = Series(['3 day', '1 day 10m', '-2 day', NaT], - dtype='m8[ns]') + iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]") exp = Series([3, 2, 1, np.nan]) iranks = iseries.rank() assert_series_equal(iranks, exp) values = np.array( - [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40 - ], dtype='float64') + [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], + dtype="float64", + ) random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) - exp = Series(random_order + 1.0, dtype='float64') + exp = Series(random_order + 1.0, dtype="float64") iranks = iseries.rank() assert_series_equal(iranks, exp) @@ -119,125 +118,171 @@ def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories # Test ascending/descending ranking for ordered categoricals - exp = Series([1., 2., 3., 4., 5., 6.]) - exp_desc = Series([6., 5., 4., 3., 2., 1.]) + exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) ordered = Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ).astype(CategoricalDtype(categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=True)) + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=True, + ) + ) assert_series_equal(ordered.rank(), exp) assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects - unordered = Series(['first', 'second', 'third', 'fourth', - 'fifth', 'sixth']).astype( - CategoricalDtype(categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=False)) - exp_unordered = Series([2., 4., 6., 3., 1., 5.]) + unordered = Series( + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=False, + ) + ) + exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0]) res = unordered.rank() assert_series_equal(res, exp_unordered) - unordered1 = Series( - [1, 2, 3, 4, 5, 6], - ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False)) - exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) + unordered1 = Series([1, 2, 3, 4, 5, 6]).astype( + CategoricalDtype([1, 2, 3, 4, 5, 6], False) + ) + exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) res1 = unordered1.rank() assert_series_equal(res1, exp_unordered1) # Test na_option for rank data na_ser = Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - ).astype(CategoricalDtype(['first', 'second', 'third', 'fourth', - 'fifth', 'sixth', 'seventh'], 
True)) + ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN] + ).astype( + CategoricalDtype( + ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], + True, + ) + ) - exp_top = Series([2., 3., 4., 5., 6., 7., 1.]) - exp_bot = Series([1., 2., 3., 4., 5., 6., 7.]) - exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN]) + exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) + exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN]) - assert_series_equal(na_ser.rank(na_option='top'), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + assert_series_equal(na_ser.rank(na_option="top"), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep"), exp_keep) # Test na_option for rank data with ascending False - exp_top = Series([7., 6., 5., 4., 3., 2., 1.]) - exp_bot = Series([6., 5., 4., 3., 2., 1., 7.]) - exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN]) + exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) + exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) + exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN]) - assert_series_equal( - na_ser.rank(na_option='top', ascending=False), - exp_top - ) - assert_series_equal( - na_ser.rank(na_option='bottom', ascending=False), - exp_bot - ) - assert_series_equal( - na_ser.rank(na_option='keep', ascending=False), - exp_keep - ) + assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom", ascending=False), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep) # Test invalid values for na_option msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - na_ser.rank(na_option='bad', ascending=False) + na_ser.rank(na_option="bad", ascending=False) # invalid type with pytest.raises(ValueError, match=msg): na_ser.rank(na_option=True, ascending=False) # Test with pct=True - na_ser = Series(['first', 'second', 'third', 'fourth', np.NaN]).astype( - CategoricalDtype(['first', 'second', 'third', 'fourth'], True)) - exp_top = Series([0.4, 0.6, 0.8, 1., 0.2]) - exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.]) - exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN]) + na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype( + CategoricalDtype(["first", "second", "third", "fourth"], True) + ) + exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) + exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) + exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN]) - assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep) def test_rank_signature(self): s = Series([0, 1]) - s.rank(method='average') - msg = ("No axis named average for object type" - " ") + s.rank(method="average") + msg = ( + "No axis named average for object type" + " " + ) with pytest.raises(ValueError, match=msg): - s.rank('average') - - @pytest.mark.parametrize('contents,dtype', [ - ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, - 2, 40, 
np.inf], - 'float64'), - ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10, - 2, 40, np.inf], - 'float32'), - ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], - 'uint8'), - pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000, - 1e10, np.iinfo(np.int64).max], - 'int64', - marks=pytest.mark.xfail( - reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue GH#16674")), - ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()], - 'object') - ]) + s.rank("average") + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + pytest.param( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + marks=pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ), + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ], + ) def test_rank_inf(self, contents, dtype): dtype_na_map = { - 'float64': np.nan, - 'float32': np.nan, - 'int64': iNaT, - 'object': None + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, } # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly # This is for testing whether rank calculation is affected # when values are interwined with nan values. values = np.array(contents, dtype=dtype) - exp_order = np.array(range(len(values)), dtype='float64') + 1.0 + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 if dtype in dtype_na_map: na_value = dtype_na_map[dtype] nan_indices = np.random.choice(range(len(values)), 5) @@ -246,19 +291,19 @@ def test_rank_inf(self, contents, dtype): # shuffle the testing array and expected results in the same way random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) - exp = Series(exp_order[random_order], dtype='float64') + exp = Series(exp_order[random_order], dtype="float64") iranks = iseries.rank() assert_series_equal(iranks, exp) def test_rank_tie_methods(self): s = self.s - def _check(s, expected, method='average'): + def _check(s, expected, method="average"): result = s.rank(method=method) tm.assert_series_equal(result, Series(expected)) dtypes = [None, object] - disabled = {(object, 'first')} + disabled = {(object, "first")} results = self.results for method, dtype in product(results, dtypes): @@ -268,36 +313,36 @@ def _check(s, expected, method='average'): _check(series, results[method], method=method) @td.skip_if_no_scipy - @pytest.mark.parametrize('ascending', [True, False]) - @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first', - 'dense']) - @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep']) + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) + @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending): - dtypes = [('object', None, Infinity(), NegInfinity()), - ('float64', np.nan, np.inf, -np.inf)] + dtypes = [ + ("object", None, Infinity(), NegInfinity()), + 
("float64", np.nan, np.inf, -np.inf), + ] chunk = 3 - disabled = {('object', 'first')} + disabled = {("object", "first")} def _check(s, method, na_option, ascending): exp_ranks = { - 'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]), - 'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]), - 'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]), - 'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]), - 'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3]) + "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]), + "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]), + "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]), + "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]), + "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]), } ranks = exp_ranks[method] - if na_option == 'top': + if na_option == "top": order = [ranks[1], ranks[0], ranks[2]] - elif na_option == 'bottom': + elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] expected = list(chain.from_iterable(expected)) - result = s.rank(method=method, na_option=na_option, - ascending=ascending) - tm.assert_series_equal(result, Series(expected, dtype='float64')) + result = s.rank(method=method, na_option=na_option, ascending=ascending) + tm.assert_series_equal(result, Series(expected, dtype="float64")) for dtype, na_value, pos_inf, neg_inf in dtypes: in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk @@ -311,51 +356,53 @@ def test_rank_desc_mix_nans_infs(self): # check descending ranking when mix nans and infs iseries = Series([1, np.nan, np.inf, -np.inf, 25]) result = iseries.rank(ascending=False) - exp = Series([3, np.nan, 1, 4, 2], dtype='float64') + exp = Series([3, np.nan, 1, 4, 2], dtype="float64") tm.assert_series_equal(result, exp) def test_rank_methods_series(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates np.random.shuffle(xs) - index = [chr(ord('a') + i) for i in range(len(xs))] + index = [chr(ord("a") + i) for i in range(len(xs))] for vals in [xs, xs + 1e6, xs * 1e-6]: ts = Series(vals, index=index) - for m in ['average', 'min', 'max', 'first', 'dense']: + for m in ["average", "min", "max", "first", "dense"]: result = ts.rank(method=m) - sprank = rankdata(vals, m if m != 'first' else 'ordinal') - expected = Series(sprank, index=index).astype('float64') + sprank = rankdata(vals, m if m != "first" else "ordinal") + expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected) def test_rank_dense_method(self): - dtypes = ['O', 'f8', 'i8'] - in_out = [([1], [1]), - ([2], [1]), - ([0], [1]), - ([2, 2], [1, 1]), - ([1, 2, 3], [1, 2, 3]), - ([4, 2, 1], [3, 2, 1],), - ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), - ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] + dtypes = ["O", "f8", "i8"] + in_out = [ + ([1], [1]), + ([2], [1]), + ([0], [1]), + ([2, 2], [1, 1]), + ([1, 2, 3], [1, 2, 3]), + ([4, 2, 1], [3, 2, 1]), + ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), + ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]), + ] for ser, exp in in_out: for dtype in dtypes: s = Series(ser).astype(dtype) - result = s.rank(method='dense') + result = s.rank(method="dense") expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] + dtypes = ["O", "f8", "i8"] for dtype, method in 
product(dtypes, self.results): - if 'i' in dtype: + if "i" in dtype: s = self.s.dropna() else: s = self.s.astype(dtype) @@ -364,7 +411,7 @@ def test_rank_descending(self): expected = (s.max() - s).rank() assert_series_equal(res, expected) - if method == 'first' and dtype == 'O': + if method == "first" and dtype == "O": continue expected = (s.max() - s).rank(method=method) @@ -372,7 +419,7 @@ def test_rank_descending(self): assert_series_equal(res2, expected) def test_rank_int(self): - s = self.s.dropna().astype('i8') + s = self.s.dropna().astype("i8") for method, res in self.results.items(): result = s.rank(method=method) @@ -390,7 +437,7 @@ def test_rank_object_bug(self): def test_rank_modify_inplace(self): # GH 18521 # Check rank does not mutate series - s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) + s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT]) expected = s.copy() s.rank() @@ -400,93 +447,113 @@ def test_rank_modify_inplace(self): # GH15630, pct should be on 100% basis when method='dense' -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1., 1.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), - ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) + +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 2, 2.0 / 2, 2.0 / 2]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 3, 1.0 / 3, 3.0 / 3, 3.0 / 3, 2.0 / 3]), + ([1, 1, 3, 3, 5, 5], [1.0 / 3, 1.0 / 3, 2.0 / 3, 2.0 / 3, 3.0 / 3, 3.0 / 3]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_dense_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='dense', pct=True) + result = s.rank(method="dense", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1. / 2, 1. / 2]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. 
/ 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 1.0 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 2.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 1.0 / 5, 4.0 / 5, 4.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 1.0 / 6, 3.0 / 6, 3.0 / 6, 5.0 / 6, 5.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_min_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='min', pct=True) + result = s.rank(method="min", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1., 1.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 3.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [2.0 / 5, 2.0 / 5, 5.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [2.0 / 6, 2.0 / 6, 4.0 / 6, 4.0 / 6, 6.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_max_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='max', pct=True) + result = s.rank(method="max", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1.5 / 2, 1.5 / 2]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], - [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_average_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='average', pct=True) + result = s.rank(method="average", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1. / 2, 2. / 2.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. 
/ 3]), - ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 2.0 / 2.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_first_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='first', pct=True) + result = s.rank(method="first", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) @@ -495,6 +562,6 @@ def test_rank_first_pct(dtype, ser, exp): @pytest.mark.high_memory def test_pct_max_many_rows(): # GH 18271 - s = Series(np.arange(2**24 + 1)) + s = Series(np.arange(2 ** 24 + 1)) result = s.rank(pct=True).max() assert result == 1 diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 92096b3c95670b..06a859963cf93a 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -20,18 +20,17 @@ def test_replace(self): exp = ser.fillna(-1) tm.assert_series_equal(ser, exp) - rs = ser.replace(0., np.nan) - ser[ser == 0.] = np.nan + rs = ser.replace(0.0, np.nan) + ser[ser == 0.0] = np.nan tm.assert_series_equal(rs, ser) - ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), - dtype=object) + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan - ser[6:10] = 'foo' - ser[20:30] = 'bar' + ser[6:10] = "foo" + ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, 'foo', 'bar'], -1) + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -39,7 +38,7 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -47,11 +46,11 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -60,7 +59,7 @@ def test_replace(self): ser = pd.Series([np.nan, 0, np.inf]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) - ser = pd.Series([np.nan, 0, 'foo', 'bar', np.inf, None, pd.NaT]) + ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) filled = ser.copy() filled[4] = 0 @@ -76,7 +75,7 @@ def test_replace(self): # make sure that we aren't just masking a TypeError because bools don't # implement indexing - with pytest.raises(TypeError, match='Cannot compare types .+'): + with 
pytest.raises(TypeError, match="Cannot compare types .+"): ser.replace([1, 2], [np.nan, 0]) ser = pd.Series([0, 1, 2, 3, 4]) @@ -96,19 +95,17 @@ def test_replace_gh5319(self): result = ser.replace(np.nan) tm.assert_series_equal(result, expected) # GH 5797 - ser = pd.Series(pd.date_range('20130101', periods=5)) + ser = pd.Series(pd.date_range("20130101", periods=5)) expected = ser.copy() - expected.loc[2] = pd.Timestamp('20120101') - result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + expected.loc[2] = pd.Timestamp("20120101") + result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")}) tm.assert_series_equal(result, expected) - result = ser.replace(pd.Timestamp('20130103'), - pd.Timestamp('20120101')) + result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) tm.assert_series_equal(result, expected) # GH 11792: Test with replacing NaT in a list with tz data - ts = pd.Timestamp('2015/01/01', tz='UTC') - s = pd.Series([pd.NaT, pd.Timestamp('2015/01/01', tz='UTC')]) + ts = pd.Timestamp("2015/01/01", tz="UTC") + s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) expected = pd.Series([pd.Timestamp.min, ts], dtype=object) tm.assert_series_equal(expected, result) @@ -124,10 +121,12 @@ def test_replace_with_single_list(self): # make sure things don't get corrupted when fillna call fails s = ser.copy() - msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill" - r" \(bfill\)\. Got crash_cymbal") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\) or backfill" + r" \(bfill\)\. Got crash_cymbal" + ) with pytest.raises(ValueError, match=msg): - s.replace([1, 2, 3], inplace=True, method='crash_cymbal') + s.replace([1, 2, 3], inplace=True, method="crash_cymbal") tm.assert_series_equal(s, ser) def test_replace_with_empty_list(self): @@ -141,10 +140,10 @@ def test_replace_with_empty_list(self): with pytest.raises(ValueError, match="cannot assign mismatch"): s.replace({np.nan: []}) with pytest.raises(ValueError, match="cannot assign mismatch"): - s.replace({np.nan: ['dummy', 'alt']}) + s.replace({np.nan: ["dummy", "alt"]}) def test_replace_mixed_types(self): - s = pd.Series(np.arange(5), dtype='int64') + s = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): sc = s.copy() @@ -154,7 +153,7 @@ def check_replace(to_rep, val, expected): tm.assert_series_equal(expected, sc) # MUST upcast to float - e = pd.Series([0., 1., 2., 3., 4.]) + e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) tr, v = [3], [3.0] check_replace(tr, v, e) @@ -164,38 +163,40 @@ def check_replace(to_rep, val, expected): check_replace(tr, v, e) # casts to object - e = pd.Series([0, 1, 2, 3.5, 'a']) - tr, v = [3, 4], [3.5, 'a'] + e = pd.Series([0, 1, 2, 3.5, "a"]) + tr, v = [3, 4], [3.5, "a"] check_replace(tr, v, e) # again casts to object - e = pd.Series([0, 1, 2, 3.5, pd.Timestamp('20130101')]) - tr, v = [3, 4], [3.5, pd.Timestamp('20130101')] + e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")]) + tr, v = [3, 4], [3.5, pd.Timestamp("20130101")] check_replace(tr, v, e) # casts to object - e = pd.Series([0, 1, 2, 3.5, True], dtype='object') + e = pd.Series([0, 1, 2, 3.5, True], dtype="object") tr, v = [3, 4], [3.5, True] check_replace(tr, v, e) # test an object with dates + floats + integers + strings - dr = pd.date_range('1/1/2001', '1/10/2001', - freq='D').to_series().reset_index(drop=True) - result = dr.astype(object).replace( - [dr[0], dr[1], dr[2]], [1.0, 2, 'a']) - 
expected = pd.Series([1.0, 2, 'a'] + dr[3:].tolist(), dtype=object) + dr = ( + pd.date_range("1/1/2001", "1/10/2001", freq="D") + .to_series() + .reset_index(drop=True) + ) + result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"]) + expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object) tm.assert_series_equal(result, expected) def test_replace_bool_with_string_no_op(self): s = pd.Series([True, False, True]) - result = s.replace('fun', 'in-the-sun') + result = s.replace("fun", "in-the-sun") tm.assert_series_equal(s, result) def test_replace_bool_with_string(self): # nonexistent elements s = pd.Series([True, False, True]) - result = s.replace(True, '2u') - expected = pd.Series(['2u', False, '2u']) + result = s.replace(True, "2u") + expected = pd.Series(["2u", False, "2u"]) tm.assert_series_equal(expected, result) def test_replace_bool_with_bool(self): @@ -206,19 +207,18 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) - with pytest.raises(TypeError, match='Cannot compare types .+'): - s.replace({'asdf': 'asdb', True: 'yes'}) + with pytest.raises(TypeError, match="Cannot compare types .+"): + s.replace({"asdf": "asdb", True: "yes"}) def test_replace2(self): N = 100 - ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), - dtype=object) + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan - ser[6:10] = 'foo' - ser[20:30] = 'bar' + ser[6:10] = "foo" + ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, 'foo', 'bar'], -1) + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -226,7 +226,7 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -234,61 +234,64 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() def test_replace_with_empty_dictlike(self): # GH 15289 - s = pd.Series(list('abcd')) + s = pd.Series(list("abcd")) tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) def test_replace_string_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) - result = s.replace('2', np.nan) + result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def test_replace_replacer_equals_replacement(self): # GH 20656 # make sure all replacers are matching against original values - s = pd.Series(['a', 'b']) - expected = pd.Series(['b', 'a']) - result = s.replace({'a': 'b', 'b': 'a'}) + s = pd.Series(["a", "b"]) + expected = pd.Series(["b", "a"]) + result = s.replace({"a": "b", "b": "a"}) tm.assert_series_equal(expected, result) def test_replace_unicode_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) - result = s.replace('2', np.nan) + result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def 
test_replace_mixed_types_with_string(self): # Testing mixed - s = pd.Series([1, 2, 3, '4', 4, 5]) - result = s.replace([2, '4'], np.nan) + s = pd.Series([1, 2, 3, "4", 4, 5]) + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.parametrize("categorical, numeric", [ - (pd.Categorical('A', categories=['A', 'B']), [1]), - (pd.Categorical(('A', ), categories=['A', 'B']), [1]), - (pd.Categorical(('A', 'B'), categories=['A', 'B']), [1, 2]), - ]) + @pytest.mark.parametrize( + "categorical, numeric", + [ + (pd.Categorical("A", categories=["A", "B"]), [1]), + (pd.Categorical(("A",), categories=["A", "B"]), [1]), + (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + ], + ) def test_replace_categorical(self, categorical, numeric): # GH 24971 # Do not check if dtypes are equal due to a known issue that # Categorical.replace sometimes coerces to object (GH 23305) s = pd.Series(categorical) - result = s.replace({'A': 1, 'B': 2}) + result = s.replace({"A": 1, "B": 2}) expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) @@ -296,12 +299,11 @@ def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError s = pd.Series([0, 1, 2, 3, 4]) - result = s.replace([3], ['100000000000000000000']) - expected = pd.Series([0, 1, 2, '100000000000000000000', 4]) + result = s.replace([3], ["100000000000000000000"]) + expected = pd.Series([0, 1, 2, "100000000000000000000", 4]) tm.assert_series_equal(result, expected) - s = pd.Series([0, '100000000000000000000', - '100000000000000000001']) - result = s.replace(['100000000000000000000'], [1]) - expected = pd.Series([0, 1, '100000000000000000001']) + s = pd.Series([0, "100000000000000000000", "100000000000000000001"]) + result = s.replace(["100000000000000000000"], [1]) + expected = pd.Series([0, 1, "100000000000000000001"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 92b6fb06109796..3e8f653c474246 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, Series, date_range, option_context, - period_range, timedelta_range) + Categorical, + DataFrame, + Index, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) from pandas.core.base import StringMixin from pandas.core.index import MultiIndex import pandas.util.testing as tm @@ -15,20 +22,27 @@ class TestSeriesRepr(TestData): - def test_multilevel_name_print(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - s = Series(range(len(index)), index=index, name='sth') - expected = ["first second", "foo one 0", - " two 1", " three 2", - "bar one 3", " two 4", - "baz two 5", " three 6", - "qux one 7", " two 8", - " three 9", "Name: sth, dtype: int64"] + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + expected = [ + "first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: 
sth, dtype: int64", + ] expected = "\n".join(expected) assert repr(s) == expected @@ -51,7 +65,7 @@ def test_name_printing(self): s.name = None assert "Name:" not in repr(s) - s = Series(index=date_range('20010101', '20020101'), name='test') + s = Series(index=date_range("20010101", "20020101"), name="test") assert "Name: test" in repr(s) def test_repr(self): @@ -71,21 +85,30 @@ def test_repr(self): str(self.series) # with Nones - ots = self.ts.astype('O') + ots = self.ts.astype("O") ots[::2] = None repr(ots) # various names - for name in ['', 1, 1.2, 'foo', '\u03B1\u03B2\u03B3', - 'loooooooooooooooooooooooooooooooooooooooooooooooooooong', - ('foo', 'bar', 'baz'), (1, 2), ('foo', 1, 2.3), - ('\u03B1', '\u03B2', '\u03B3'), - ('\u03B1', 'bar')]: + for name in [ + "", + 1, + 1.2, + "foo", + "\u03B1\u03B2\u03B3", + "loooooooooooooooooooooooooooooooooooooooooooooooooooong", + ("foo", "bar", "baz"), + (1, 2), + ("foo", 1, 2.3), + ("\u03B1", "\u03B2", "\u03B3"), + ("\u03B1", "bar"), + ]: self.series.name = name repr(self.series) - biggie = Series(tm.randn(1000), index=np.arange(1000), - name=('foo', 'bar', 'baz')) + biggie = Series( + tm.randn(1000), index=np.arange(1000), name=("foo", "bar", "baz") + ) repr(biggie) # 0 as name @@ -104,15 +127,15 @@ def test_repr(self): assert "a\n" not in repr(ser) # with empty series (#4651) - s = Series([], dtype=np.int64, name='foo') - assert repr(s) == 'Series([], Name: foo, dtype: int64)' + s = Series([], dtype=np.int64, name="foo") + assert repr(s) == "Series([], Name: foo, dtype: int64)" s = Series([], dtype=np.int64, name=None) - assert repr(s) == 'Series([], dtype: int64)' + assert repr(s) == "Series([], dtype: int64)" def test_tidy_repr(self): a = Series(["\u05d0"] * 1000) - a.name = 'title1' + a.name = "title1" repr(a) # should not raise exception def test_repr_bool_fails(self, capsys): @@ -122,7 +145,7 @@ def test_repr_bool_fails(self, capsys): repr(s) captured = capsys.readouterr() - assert captured.err == '' + assert captured.err == "" def test_repr_name_iterable_indexable(self): s = Series([1, 2, 3], name=np.int64(3)) @@ -130,7 +153,7 @@ def test_repr_name_iterable_indexable(self): # it works! 
repr(s) - s.name = ("\u05d0", ) * 2 + s.name = ("\u05d0",) * 2 repr(s) def test_repr_should_return_str(self): @@ -146,7 +169,7 @@ def test_repr_should_return_str(self): def test_repr_max_rows(self): # GH 6863 - with pd.option_context('max_rows', None): + with pd.option_context("max_rows", None): str(Series(range(1001))) # should not raise exception def test_unicode_string_with_unicode(self): @@ -161,13 +184,14 @@ def test_str_to_bytes_raises(self): bytes(df) def test_timeseries_repr_object_dtype(self): - index = Index([datetime(2000, 1, 1) + timedelta(i) - for i in range(1000)], dtype=object) + index = Index( + [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object + ) ts = Series(np.random.randn(len(index)), index) repr(ts) ts = tm.makeTimeSeries(1000) - assert repr(ts).splitlines()[-1].startswith('Freq:') + assert repr(ts).splitlines()[-1].startswith("Freq:") ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] repr(ts2).splitlines()[-1] @@ -183,9 +207,8 @@ def test_latex_repr(self): \bottomrule \end{tabular} """ - with option_context('display.latex.escape', False, - 'display.latex.repr', True): - s = Series([r'$\alpha$', 'b', 'c']) + with option_context("display.latex.escape", False, "display.latex.repr", True): + s = Series([r"$\alpha$", "b", "c"]) assert result == s._repr_latex_() assert s._repr_latex_() is None @@ -200,16 +223,15 @@ def test_index_repr_in_frame_with_nan(self): class TestCategoricalRepr: - def test_categorical_repr_unicode(self): # see gh-21002 class County(StringMixin): - name = 'San Sebastián' - state = 'PR' + name = "San Sebastián" + state = "PR" def __str__(self): - return self.name + ', ' + self.state + return self.name + ", " + self.state cat = pd.Categorical([County() for _ in range(61)]) idx = pd.Index(cat) @@ -220,21 +242,29 @@ def __str__(self): def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) - exp = ("0 1\n1 2\n2 3\n3 4\n" + - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) - exp = ("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + - "Length: 50, dtype: category\nCategories (2, object): [a, b]") + exp = ( + "0 a\n1 b\n" + + " ..\n" + + "48 a\n49 b\n" + + "Length: 50, dtype: category\nCategories (2, object): [a, b]" + ) with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ("0 a\n1 b\n" + "dtype: category\n" - "Categories (26, object): [a < b < c < d ... w < x < y < z]") + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): [a < b < c < d ... 
w < x < y < z]" + ) assert exp == a.__str__() def test_categorical_series_repr(self): @@ -290,7 +320,7 @@ def test_categorical_series_repr_ordered(self): assert repr(s) == exp def test_categorical_series_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -303,8 +333,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -319,7 +348,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp def test_categorical_series_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -332,8 +361,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -348,7 +376,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp def test_categorical_series_repr_period(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -361,7 +389,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01 1 2011-02 @@ -374,7 +402,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp def test_categorical_series_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -387,7 +415,7 @@ def test_categorical_series_repr_period_ordered(self): assert repr(s) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01 1 2011-02 @@ -400,7 +428,7 @@ def test_categorical_series_repr_period_ordered(self): assert repr(s) == exp def test_categorical_series_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) s = Series(Categorical(idx)) exp = """0 1 days 1 2 days @@ -412,7 +440,7 @@ def test_categorical_series_repr_timedelta(self): assert repr(s) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) s = Series(Categorical(idx)) exp = """0 0 days 01:00:00 1 1 days 01:00:00 @@ -432,7 +460,7 @@ def test_categorical_series_repr_timedelta(self): assert repr(s) == exp def test_categorical_series_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) s = Series(Categorical(idx, 
ordered=True)) exp = """0 1 days 1 2 days @@ -444,7 +472,7 @@ def test_categorical_series_repr_timedelta_ordered(self): assert repr(s) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) s = Series(Categorical(idx, ordered=True)) exp = """0 0 days 01:00:00 1 1 days 01:00:00 diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 0d95a1014f4a84..0ae2194543b44d 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -11,12 +11,11 @@ class TestSeriesSorting(TestData): - def test_sort_values(self): # check indexes are reordered corresponding with the values - ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D']) - expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C']) + ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) + expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"]) result = ser.sort_values() tm.assert_series_equal(expected, result) @@ -29,12 +28,12 @@ def test_sort_values(self): tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) # na_position - result = ts.sort_values(na_position='first') + result = ts.sort_values(na_position="first") assert np.isnan(result[:5]).all() tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) # something object-type - ser = Series(['A', 'B'], [1, 2]) + ser = Series(["A", "B"], [1, 2]) # no failure ser.sort_values() @@ -42,15 +41,15 @@ def test_sort_values(self): ordered = ts.sort_values(ascending=False) expected = np.sort(ts.dropna().values)[::-1] assert_almost_equal(expected, ordered.dropna().values) - ordered = ts.sort_values(ascending=False, na_position='first') + ordered = ts.sort_values(ascending=False, na_position="first") assert_almost_equal(expected, ordered.dropna().values) # ascending=[False] should behave the same as ascending=False ordered = ts.sort_values(ascending=[False]) expected = ts.sort_values(ascending=False) assert_series_equal(expected, ordered) - ordered = ts.sort_values(ascending=[False], na_position='first') - expected = ts.sort_values(ascending=False, na_position='first') + ordered = ts.sort_values(ascending=[False], na_position="first") + expected = ts.sort_values(ascending=False, na_position="first") assert_series_equal(expected, ordered) msg = "ascending must be boolean" @@ -67,22 +66,23 @@ def test_sort_values(self): ts.sort_values(ascending=[False, False]) msg = "ascending must be boolean" with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending='foobar') + ts.sort_values(ascending="foobar") # inplace=True ts = self.ts.copy() ts.sort_values(ascending=False, inplace=True) tm.assert_series_equal(ts, self.ts.sort_values(ascending=False)) - tm.assert_index_equal(ts.index, - self.ts.sort_values(ascending=False).index) + tm.assert_index_equal(ts.index, self.ts.sort_values(ascending=False).index) # GH 5856/5853 # Series.sort_values operating on a view df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0] - msg = ("This Series is a view of some other array, to sort in-place" - " you must create a copy") + msg = ( + "This Series is a view of some other array, to sort in-place" + " you must create a copy" + ) with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) @@ -96,8 +96,7 @@ def test_sort_index(self): # descending sorted_series = random_order.sort_index(ascending=False) - assert_series_equal(sorted_series, - self.ts.reindex(self.ts.index[::-1])) + assert_series_equal(sorted_series, self.ts.reindex(self.ts.index[::-1])) # compat on level sorted_series = 
random_order.sort_index(level=0) @@ -107,8 +106,7 @@ def test_sort_index(self): sorted_series = random_order.sort_index(axis=0) assert_series_equal(sorted_series, self.ts) - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) @@ -129,8 +127,7 @@ def test_sort_index_inplace(self): result = random_order.sort_index(ascending=False, inplace=True) assert result is None - tm.assert_series_equal(random_order, self.ts.reindex( - self.ts.index[::-1])) + tm.assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) # ascending random_order = self.ts.reindex(rindex) @@ -139,10 +136,10 @@ def test_sort_index_inplace(self): assert result is None tm.assert_series_equal(random_order, self.ts) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 def test_sort_index_multiindex(self, level): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] @@ -160,39 +157,39 @@ def test_sort_index_kind(self): series = Series(index=[3, 2, 1, 4, 3]) expected_series = Series(index=[1, 2, 3, 3, 4]) - index_sorted_series = series.sort_index(kind='mergesort') + index_sorted_series = series.sort_index(kind="mergesort") assert_series_equal(expected_series, index_sorted_series) - index_sorted_series = series.sort_index(kind='quicksort') + index_sorted_series = series.sort_index(kind="quicksort") assert_series_equal(expected_series, index_sorted_series) - index_sorted_series = series.sort_index(kind='heapsort') + index_sorted_series = series.sort_index(kind="heapsort") assert_series_equal(expected_series, index_sorted_series) def test_sort_index_na_position(self): series = Series(index=[3, 2, 1, 4, 3, np.nan]) expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4]) - index_sorted_series = series.sort_index(na_position='first') + index_sorted_series = series.sort_index(na_position="first") assert_series_equal(expected_series_first, index_sorted_series) expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan]) - index_sorted_series = series.sort_index(na_position='last') + index_sorted_series = series.sort_index(na_position="last") assert_series_equal(expected_series_last, index_sorted_series) def test_sort_index_intervals(self): - s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( - [0, 1, 2, 3], - [1, 2, 3, 4])) + s = Series( + [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) + ) result = s.sort_index() expected = s assert_series_equal(result, expected) result = s.sort_index(ascending=False) - expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays( - [3, 2, 1, 0], - [4, 3, 2, 1])) + expected = Series( + [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) + ) assert_series_equal(result, expected) def test_sort_values_categorical(self): @@ -202,8 +199,8 @@ def test_sort_values_categorical(self): # sort in the categories order expected = Series( - Categorical(["a", "a", "b", "b"], - ordered=False), index=[0, 3, 1, 2]) + Categorical(["a", "a", "b", "b"], ordered=False), index=[0, 3, 1, 2] + ) result = cat.sort_values() tm.assert_series_equal(result, expected) @@ -212,8 +209,11 @@ def test_sort_values_categorical(self): exp = np.array(["a", "b", "c", "d"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) - cat = 
Series(Categorical(["a", "c", "b", "d"], categories=[ - "a", "b", "c", "d"], ordered=True)) + cat = Series( + Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + ) res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) @@ -222,15 +222,16 @@ def test_sort_values_categorical(self): exp = np.array(["d", "c", "b", "a"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) - raw_cat1 = Categorical(["a", "b", "c", "d"], - categories=["a", "b", "c", "d"], ordered=False) - raw_cat2 = Categorical(["a", "b", "c", "d"], - categories=["d", "c", "b", "a"], ordered=True) + raw_cat1 = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + raw_cat2 = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) s = ["a", "b", "c", "d"] - df = DataFrame({"unsort": raw_cat1, - "sort": raw_cat2, - "string": s, - "values": [1, 2, 3, 4]}) + df = DataFrame( + {"unsort": raw_cat1, "sort": raw_cat2, "string": s, "values": [1, 2, 3, 4]} + ) # Cats must be sorted in a dataframe res = df.sort_values(by=["string"], ascending=False) @@ -249,17 +250,18 @@ def test_sort_values_categorical(self): # multi-columns sort # GH 7848 - df = DataFrame({"id": [6, 5, 4, 3, 2, 1], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) + df = DataFrame( + {"id": [6, 5, 4, 3, 2, 1], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) df["grade"] = Categorical(df["raw_grade"], ordered=True) - df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a']) + df["grade"] = df["grade"].cat.set_categories(["b", "e", "a"]) # sorts 'grade' according to the order of the categories - result = df.sort_values(by=['grade']) + result = df.sort_values(by=["grade"]) expected = df.iloc[[1, 2, 5, 0, 3, 4]] tm.assert_frame_equal(result, expected) # multi - result = df.sort_values(by=['grade', 'id']) + result = df.sort_values(by=["grade", "id"]) expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index b47d339f5a5f20..450fdc3f4dd6fe 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -7,45 +7,41 @@ class TestSeriesSubclassing: - def test_indexing_sliced(self): - s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd')) - res = s.loc[['a', 'b']] - exp = tm.SubclassedSeries([1, 2], index=list('ab')) + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd")) + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) tm.assert_series_equal(res, exp) res = s.iloc[[2, 3]] - exp = tm.SubclassedSeries([3, 4], index=list('cd')) + exp = tm.SubclassedSeries([3, 4], index=list("cd")) tm.assert_series_equal(res, exp) - res = s.loc[['a', 'b']] - exp = tm.SubclassedSeries([1, 2], index=list('ab')) + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) tm.assert_series_equal(res, exp) def test_to_frame(self): - s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'), name='xxx') + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd"), name="xxx") res = s.to_frame() - exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd')) + exp = tm.SubclassedDataFrame({"xxx": [1, 2, 3, 4]}, index=list("abcd")) tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): # GH 15564 - s = tm.SubclassedSeries( - [1, 2, 3, 4], index=[list('aabb'), list('xyxy')]) + s = 
tm.SubclassedSeries([1, 2, 3, 4], index=[list("aabb"), list("xyxy")]) res = s.unstack() - exp = tm.SubclassedDataFrame( - {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b']) + exp = tm.SubclassedDataFrame({"x": [1, 3], "y": [2, 4]}, index=["a", "b"]) tm.assert_frame_equal(res, exp) def test_subclass_empty_repr(self): - assert 'SubclassedSeries' in repr(tm.SubclassedSeries()) + assert "SubclassedSeries" in repr(tm.SubclassedSeries()) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesSubclassing: - def test_subclass_sparse_slice(self): # int64 s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) @@ -62,16 +58,16 @@ def test_subclass_sparse_slice(self): assert s[1:3].dtype == SparseDtype(np.int64) # float64 - s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.]) - exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3]) + s = tm.SubclassedSparseSeries([1.0, 2.0, 3.0, 4.0, 5.0]) + exp = tm.SubclassedSparseSeries([2.0, 3.0, 4.0], index=[1, 2, 3]) tm.assert_sp_series_equal(s.loc[1:3], exp) assert s.loc[1:3].dtype == SparseDtype(np.float64) - exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) tm.assert_sp_series_equal(s.iloc[1:3], exp) assert s.iloc[1:3].dtype == SparseDtype(np.float64) - exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) tm.assert_sp_series_equal(s[1:3], exp) assert s[1:3].dtype == SparseDtype(np.float64) @@ -83,29 +79,26 @@ def test_subclass_sparse_addition(self): s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0]) s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0]) - exp = tm.SubclassedSparseSeries([5., 7., 9.]) + exp = tm.SubclassedSparseSeries([5.0, 7.0, 9.0]) tm.assert_sp_series_equal(s1 + s2, exp) def test_subclass_sparse_to_frame(self): - s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx') + s = tm.SubclassedSparseSeries([1, 2], index=list("ab"), name="xxx") res = s.to_frame() - exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block', - fill_value=0) - exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, - index=list('ab'), - default_fill_value=0) + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind="block", fill_value=0) + exp = tm.SubclassedSparseDataFrame( + {"xxx": exp_arr}, index=list("ab"), default_fill_value=0 + ) tm.assert_sp_frame_equal(res, exp) # create from int dict - res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, - index=list('ab'), - default_fill_value=0) + res = tm.SubclassedSparseDataFrame( + {"xxx": [1, 2]}, index=list("ab"), default_fill_value=0 + ) tm.assert_sp_frame_equal(res, exp) - s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'), - name='xxx') + s = tm.SubclassedSparseSeries([1.1, 2.1], index=list("ab"), name="xxx") res = s.to_frame() - exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]}, - index=list('ab')) + exp = tm.SubclassedSparseDataFrame({"xxx": [1.1, 2.1]}, index=list("ab")) tm.assert_sp_frame_equal(res, exp) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 97f1cd1cc77893..6be1b9a9143bf3 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -12,39 +12,49 @@ import pandas as pd from pandas import ( - DataFrame, Index, NaT, Series, Timestamp, concat, date_range, offsets, - timedelta_range, to_datetime) + DataFrame, + Index, + NaT, + Series, + Timestamp, + concat, + date_range, + offsets, + timedelta_range, + to_datetime, +) from pandas.core.indexes.datetimes import 
DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tests.series.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.tseries.offsets import BDay, BMonthEnd -def _simple_ts(start, end, freq='D'): +def _simple_ts(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) def assert_range_equal(left, right): - assert (left.equals(right)) - assert (left.freq == right.freq) - assert (left.tz == right.tz) + assert left.equals(right) + assert left.freq == right.freq + assert left.tz == right.tz class TestTimeSeries(TestData): - def test_shift(self): shifted = self.ts.shift(1) unshifted = shifted.shift(-1) tm.assert_index_equal(shifted.index, self.ts.index) tm.assert_index_equal(unshifted.index, self.ts.index) - tm.assert_numpy_array_equal(unshifted.dropna().values, - self.ts.values[:-1]) + tm.assert_numpy_array_equal(unshifted.dropna().values, self.ts.values[:-1]) offset = BDay() shifted = self.ts.shift(1, freq=offset) @@ -55,8 +65,8 @@ def test_shift(self): unshifted = self.ts.shift(0, freq=offset) assert_series_equal(unshifted, self.ts) - shifted = self.ts.shift(1, freq='B') - unshifted = shifted.shift(-1, freq='B') + shifted = self.ts.shift(1, freq="B") + unshifted = shifted.shift(-1, freq="B") assert_series_equal(unshifted, self.ts) @@ -72,17 +82,17 @@ def test_shift(self): tm.assert_index_equal(unshifted.index, ps.index) tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) - shifted2 = ps.shift(1, 'B') + shifted2 = ps.shift(1, "B") shifted3 = ps.shift(1, BDay()) assert_series_equal(shifted2, shifted3) - assert_series_equal(ps, shifted2.shift(-1, 'B')) + assert_series_equal(ps, shifted2.shift(-1, "B")) msg = "Given freq D does not match PeriodIndex freq B" with pytest.raises(ValueError, match=msg): - ps.shift(freq='D') + ps.shift(freq="D") # legacy support - shifted4 = ps.shift(1, freq='B') + shifted4 = ps.shift(1, freq="B") assert_series_equal(shifted2, shifted4) shifted5 = ps.shift(1, freq=BDay()) @@ -90,8 +100,8 @@ def test_shift(self): # 32-bit taking # GH 8129 - index = date_range('2000-01-01', periods=5) - for dtype in ['int32', 'int64']: + index = date_range("2000-01-01", periods=5) + for dtype in ["int32", "int64"]: s1 = Series(np.arange(5, dtype=dtype), index=index) p = s1.iloc[1] result = s1.shift(periods=p) @@ -100,52 +110,57 @@ def test_shift(self): # xref 8260 # with tz - s = Series(date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) result = s - s.shift() - exp = Series(TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo') + exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") assert_series_equal(result, exp) # incompat tz - s2 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz='CET'), name='foo') - msg = ("DatetimeArray subtraction must have the same timezones or no" - " timezones") + s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") + msg = ( + "DatetimeArray subtraction must have the same timezones or no" " timezones" + ) with pytest.raises(TypeError, match=msg): s - s2 def test_shift2(self): - ts = Series(np.random.randn(5), - index=date_range('1/1/2000', periods=5, freq='H')) + ts = Series( + np.random.randn(5), 
index=date_range("1/1/2000", periods=5, freq="H") + ) - result = ts.shift(1, freq='5T') - exp_index = ts.index.shift(1, freq='5T') + result = ts.shift(1, freq="5T") + exp_index = ts.index.shift(1, freq="5T") tm.assert_index_equal(result.index, exp_index) # GH #1063, multiple of same base - result = ts.shift(1, freq='4H') + result = ts.shift(1, freq="4H") exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) msg = "Cannot shift with no freq" with pytest.raises(NullFrequencyError, match=msg): idx.shift(1) def test_shift_fill_value(self): # GH #24128 - ts = Series([1.0, 2.0, 3.0, 4.0, 5.0], - index=date_range('1/1/2000', periods=5, freq='H')) + ts = Series( + [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") + ) - exp = Series([0.0, 1.0, 2.0, 3.0, 4.0], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") + ) # check that fill value works result = ts.shift(1, fill_value=0.0) tm.assert_series_equal(result, exp) - exp = Series([0.0, 0.0, 1.0, 2.0, 3.0], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = Series( + [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") + ) result = ts.shift(2, fill_value=0.0) tm.assert_series_equal(result, exp) @@ -154,44 +169,46 @@ def test_shift_fill_value(self): assert res.dtype == ts.dtype def test_categorical_shift_fill_value(self): - ts = pd.Series(['a', 'b', 'c', 'd'], dtype="category") - res = ts.shift(1, fill_value='a') - expected = pd.Series(pd.Categorical(['a', 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - ordered=False)) + ts = pd.Series(["a", "b", "c", "d"], dtype="category") + res = ts.shift(1, fill_value="a") + expected = pd.Series( + pd.Categorical( + ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + ) tm.assert_equal(res, expected) # check for incorrect fill_value msg = "'fill_value=f' is not present in this Categorical's categories" with pytest.raises(ValueError, match=msg): - ts.shift(1, fill_value='f') + ts.shift(1, fill_value="f") def test_shift_dst(self): # GH 13926 - dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern') + dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") s = Series(dates) res = s.shift(0) tm.assert_series_equal(res, s) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" res = s.shift(1) exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] exp = Series(exp_vals) tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" res = s.shift(-2) exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] exp = Series(exp_vals) tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" for ex in [10, -10, 20, -20]: res = s.shift(ex) - exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]') + exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" def test_tshift(self): # PeriodIndex @@ -201,7 +218,7 @@ def test_tshift(self): assert_series_equal(unshifted, ps) - shifted2 = ps.tshift(freq='B') + 
shifted2 = ps.tshift(freq="B") assert_series_equal(shifted, shifted2) shifted3 = ps.tshift(freq=BDay()) @@ -209,7 +226,7 @@ def test_tshift(self): msg = "Given freq M does not match PeriodIndex freq B" with pytest.raises(ValueError, match=msg): - ps.tshift(freq='M') + ps.tshift(freq="M") # DatetimeIndex shifted = self.ts.tshift(1) @@ -220,8 +237,9 @@ def test_tshift(self): shifted2 = self.ts.tshift(freq=self.ts.index.freq) assert_series_equal(shifted, shifted2) - inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index)), - name='ts') + inferred_ts = Series( + self.ts.values, Index(np.asarray(self.ts.index)), name="ts" + ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) assert_series_equal(shifted, self.ts.tshift(1)) @@ -273,66 +291,70 @@ def test_truncate(self): # corner case, empty series returned truncated = ts.truncate(after=self.ts.index[0] - offset) - assert (len(truncated) == 0) + assert len(truncated) == 0 truncated = ts.truncate(before=self.ts.index[-1] + offset) - assert (len(truncated) == 0) + assert len(truncated) == 0 msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" with pytest.raises(ValueError, match=msg): - ts.truncate(before=self.ts.index[-1] + offset, - after=self.ts.index[0] - offset) + ts.truncate( + before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset + ) def test_truncate_nonsortedindex(self): # GH 17935 - s = pd.Series(['a', 'b', 'c', 'd', 'e'], - index=[5, 3, 2, 9, 0]) - msg = 'truncate requires a sorted index' + s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): s.truncate(before=3, after=9) - rng = pd.date_range('2011-01-01', '2012-01-01', freq='W') + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts = pd.Series(np.random.randn(len(rng)), index=rng) - msg = 'truncate requires a sorted index' + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending=False).truncate(before='2011-11', - after='2011-12') + ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") def test_asfreq(self): - ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( - 2009, 11, 30), datetime(2009, 12, 31)]) - - daily_ts = ts.asfreq('B') - monthly_ts = daily_ts.asfreq('BM') + ts = Series( + [0.0, 1.0, 2.0], + index=[ + datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31), + ], + ) + + daily_ts = ts.asfreq("B") + monthly_ts = daily_ts.asfreq("BM") tm.assert_series_equal(monthly_ts, ts) - daily_ts = ts.asfreq('B', method='pad') - monthly_ts = daily_ts.asfreq('BM') + daily_ts = ts.asfreq("B", method="pad") + monthly_ts = daily_ts.asfreq("BM") tm.assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq(BDay()) monthly_ts = daily_ts.asfreq(BMonthEnd()) tm.assert_series_equal(monthly_ts, ts) - result = ts[:0].asfreq('M') + result = ts[:0].asfreq("M") assert len(result) == 0 assert result is not ts - daily_ts = ts.asfreq('D', fill_value=-1) + daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series([60, 1, 1, 1], - index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty_series(self): # GH 14320 - expected = Series(index=pd.DatetimeIndex( - ["2016-09-29 11:00"])).asfreq('H') - result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]), - 
data=[3]).asfreq('H') + expected = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"])).asfreq("H") + result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]), data=[3]).asfreq( + "H" + ) tm.assert_index_equal(expected.index, result.index) def test_diff(self): @@ -358,7 +380,7 @@ def test_diff(self): assert_series_equal(rs, xp) # datetime diff (GH3100) - s = Series(date_range('20130102', periods=5)) + s = Series(date_range("20130102", periods=5)) rs = s - s.shift(1) xp = s.diff() assert_series_equal(rs, xp) @@ -370,61 +392,58 @@ def test_diff(self): # with tz s = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) result = s.diff() - assert_series_equal(result, Series( - TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo')) + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) rs = self.ts.pct_change(2) - filled = self.ts.fillna(method='pad') + filled = self.ts.fillna(method="pad") assert_series_equal(rs, filled / filled.shift(2) - 1) - rs = self.ts.pct_change(fill_method='bfill', limit=1) - filled = self.ts.fillna(method='bfill', limit=1) + rs = self.ts.pct_change(fill_method="bfill", limit=1) + filled = self.ts.fillna(method="bfill", limit=1) assert_series_equal(rs, filled / filled.shift(1) - 1) - rs = self.ts.pct_change(freq='5D') - filled = self.ts.fillna(method='pad') - assert_series_equal(rs, - (filled / filled.shift(freq='5D') - 1) - .reindex_like(filled)) + rs = self.ts.pct_change(freq="5D") + filled = self.ts.fillna(method="pad") + assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) def test_pct_change_shift_over_nas(self): - s = Series([1., 1.5, np.nan, 2.5, 3.]) + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) chg = s.pct_change() - expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) assert_series_equal(chg, expected) - @pytest.mark.parametrize("freq, periods, fill_method, limit", - [('5B', 5, None, None), - ('3B', 3, None, None), - ('3B', 3, 'bfill', None), - ('7B', 7, 'pad', 1), - ('7B', 7, 'bfill', 3), - ('14B', 14, None, None)]) + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = self.ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = self.ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = self.ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_series_equal(rs_freq, rs_periods) empty_ts = Series(index=self.ts.index) - rs_freq = empty_ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = empty_ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_series_equal(rs_freq, rs_periods) def test_autocorr(self): @@ -499,7 +518,7 @@ def test_timeseries_coercion(self): assert 
isinstance(ser.index, DatetimeIndex) def test_contiguous_boolean_preserve_freq(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') + rng = date_range("1/1/2000", "3/1/2000", freq="B") mask = np.zeros(len(rng), dtype=bool) mask[10:20] = True @@ -517,95 +536,112 @@ def test_to_datetime_unit(self): epoch = 1370745748 s = Series([epoch + t for t in range(20)]) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)] + [iNaT]) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) # GH13834 - s = Series([epoch + t for t in np.arange(0, 2, .25)] + - [iNaT]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in np.arange(0, 2, .25)] + [NaT]) + s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in np.arange(0, 2, 0.25) + ] + + [NaT] + ) assert_series_equal(result, expected) - s = concat([Series([epoch + t for t in range(20)] - ).astype(float), Series([np.nan])], - ignore_index=True) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + s = concat( + [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], + ignore_index=True, + ) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) - result = to_datetime([1, 2, 'NaT', pd.NaT, np.nan], unit='D') - expected = DatetimeIndex([Timestamp('1970-01-02'), - Timestamp('1970-01-03')] + ['NaT'] * 3) + result = to_datetime([1, 2, "NaT", pd.NaT, np.nan], unit="D") + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + ) tm.assert_index_equal(result, expected) msg = "non convertible value foo with the unit 'D'" with pytest.raises(ValueError, match=msg): - to_datetime([1, 2, 'foo'], unit='D') + to_datetime([1, 2, "foo"], unit="D") msg = "cannot convert input 111111111 with the unit 'D'" with 
pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime([1, 2, 111111111], unit='D') + to_datetime([1, 2, 111111111], unit="D") # coerce we can process - expected = DatetimeIndex([Timestamp('1970-01-02'), - Timestamp('1970-01-03')] + ['NaT'] * 1) - result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce') + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + ) + result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") tm.assert_index_equal(result, expected) - result = to_datetime([1, 2, 111111111], unit='D', errors='coerce') + result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") tm.assert_index_equal(result, expected) def test_series_ctor_datetime64(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) series = Series(dates) - assert np.issubdtype(series.dtype, np.dtype('M8[ns]')) + assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) def test_series_repr_nat(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") result = repr(series) - expected = ('0 1970-01-01 00:00:00.000000\n' - '1 1970-01-01 00:00:00.000001\n' - '2 1970-01-01 00:00:00.000002\n' - '3 NaT\n' - 'dtype: datetime64[ns]') + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) assert result == expected def test_asfreq_keep_index_name(self): # GH #9854 - index_name = 'bar' - index = pd.date_range('20130101', periods=20, name=index_name) - df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index) + index_name = "bar" + index = pd.date_range("20130101", periods=20, name=index_name) + df = pd.DataFrame([x for x in range(20)], columns=["foo"], index=index) assert index_name == df.index.name - assert index_name == df.asfreq('10D').index.name + assert index_name == df.asfreq("10D").index.name def test_promote_datetime_date(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) ts_slice = ts[5:] @@ -619,8 +655,8 @@ def test_promote_datetime_date(self): assert_series_equal(result2, expected) # test asfreq - result = ts2.asfreq('4H', method='ffill') - expected = ts[5:].asfreq('4H', method='ffill') + result = ts2.asfreq("4H", method="ffill") + expected = ts[5:].asfreq("4H", method="ffill") assert_series_equal(result, expected) result = rng.get_indexer(ts2.index) @@ -628,13 +664,13 @@ def test_promote_datetime_date(self): tm.assert_numpy_array_equal(result, expected) def test_asfreq_normalize(self): - rng = date_range('1/1/2000 09:30', periods=20) - norm = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000 09:30", periods=20) + norm = date_range("1/1/2000", periods=20) vals = np.random.randn(20) ts = Series(vals, index=rng) - result = ts.asfreq('D', normalize=True) - norm = date_range('1/1/2000', periods=20) + result = ts.asfreq("D", normalize=True) + norm = date_range("1/1/2000", periods=20) expected = Series(vals, index=norm) assert_series_equal(result, expected) @@ -642,80 +678,80 @@ def test_asfreq_normalize(self): vals = np.random.randn(20, 3) ts = DataFrame(vals, index=rng) - result = ts.asfreq('D', normalize=True) + result = ts.asfreq("D", normalize=True) expected = DataFrame(vals, index=norm) assert_frame_equal(result, expected) def test_first_subset(self): - 
ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') - result = ts.first('10d') + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.first("10d") assert len(result) == 20 - ts = _simple_ts('1/1/2000', '1/1/2010') - result = ts.first('10d') + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.first("10d") assert len(result) == 10 - result = ts.first('3M') - expected = ts[:'3/31/2000'] + result = ts.first("3M") + expected = ts[:"3/31/2000"] assert_series_equal(result, expected) - result = ts.first('21D') + result = ts.first("21D") expected = ts[:21] assert_series_equal(result, expected) - result = ts[:0].first('3M') + result = ts[:0].first("3M") assert_series_equal(result, ts[:0]) def test_first_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "'first' only supports a DatetimeIndex index" with pytest.raises(TypeError, match=msg): - ser.first('1D') + ser.first("1D") def test_last_subset(self): - ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') - result = ts.last('10d') + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.last("10d") assert len(result) == 20 - ts = _simple_ts('1/1/2000', '1/1/2010') - result = ts.last('10d') + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.last("10d") assert len(result) == 10 - result = ts.last('21D') - expected = ts['12/12/2009':] + result = ts.last("21D") + expected = ts["12/12/2009":] assert_series_equal(result, expected) - result = ts.last('21D') + result = ts.last("21D") expected = ts[-21:] assert_series_equal(result, expected) - result = ts[:0].last('3M') + result = ts[:0].last("3M") assert_series_equal(result, ts[:0]) def test_last_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "'last' only supports a DatetimeIndex index" with pytest.raises(TypeError, match=msg): - ser.last('1D') + ser.last("1D") def test_format_pre_1900_dates(self): - rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC') + rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") rng.format() ts = Series(1, index=rng) repr(ts) def test_at_time(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) rs = ts.at_time(rng[1]) assert (rs.index.hour == rng[1].hour).all() assert (rs.index.minute == rng[1].minute).all() assert (rs.index.second == rng[1].second).all() - result = ts.at_time('9:30') + result = ts.at_time("9:30") expected = ts.at_time(time(9, 30)) assert_series_equal(result, expected) @@ -731,33 +767,33 @@ def test_at_time(self): assert_series_equal(result, expected) tm.assert_frame_equal(result_df, exp_df) - chunk = df.loc['1/4/2000':] + chunk = df.loc["1/4/2000":] result = chunk.loc[time(9, 30)] expected = result_df[-1:] tm.assert_frame_equal(result, expected) # midnight, everything - rng = date_range('1/1/2000', '1/31/2000') + rng = date_range("1/1/2000", "1/31/2000") ts = Series(np.random.randn(len(rng)), index=rng) result = ts.at_time(time(0, 0)) assert_series_equal(result, ts) # time doesn't exist - rng = date_range('1/1/2012', freq='23Min', periods=384) + rng = date_range("1/1/2012", freq="23Min", periods=384) ts = Series(np.random.randn(len(rng)), rng) - rs = ts.at_time('16:00') + rs = ts.at_time("16:00") assert len(rs) == 0 def test_at_time_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): - 
ser.at_time('00:00') + ser.at_time("00:00") def test_between(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) left, right = series[[2, 7]] result = series.between(left, right) @@ -765,7 +801,7 @@ def test_between(self): assert_series_equal(result, expected) def test_between_time(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) stime = time(0, 0) etime = time(1, 0) @@ -792,12 +828,12 @@ def test_between_time(self): else: assert t < etime - result = ts.between_time('00:00', '01:00') + result = ts.between_time("00:00", "01:00") expected = ts.between_time(stime, etime) assert_series_equal(result, expected) # across midnight - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) stime = time(22, 0) etime = time(9, 0) @@ -826,40 +862,45 @@ def test_between_time(self): def test_between_time_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): - ser.between_time(start_time='00:00', end_time='12:00') + ser.between_time(start_time="00:00", end_time="12:00") def test_between_time_types(self): # GH11818 - rng = date_range('1/1/2000', '1/5/2000', freq='5min') - msg = (r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" - " to a time") + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + msg = ( + r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" + " to a time" + ) with pytest.raises(ValueError, match=msg): - rng.indexer_between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) - frame = DataFrame({'A': 0}, index=rng) + frame = DataFrame({"A": 0}, index=rng) with pytest.raises(ValueError, match=msg): - frame.between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + frame.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) series = Series(0, index=rng) with pytest.raises(ValueError, match=msg): - series.between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + series.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) @td.skip_if_has_locale def test_between_time_formats(self): # GH11818 - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - strings = [("2:00", "2:30"), ("0200", "0230"), ("2:00am", "2:30am"), - ("0200am", "0230am"), ("2:00:00", "2:30:00"), - ("020000", "023000"), ("2:00:00am", "2:30:00am"), - ("020000am", "023000am")] + strings = [ + ("2:00", "2:30"), + ("0200", "0230"), + ("2:00am", "2:30am"), + ("0200am", "0230am"), + ("2:00:00", "2:30:00"), + ("020000", "023000"), + ("2:00:00am", "2:30:00am"), + ("020000am", "023000am"), + ] expected_length = 28 for time_string in strings: @@ -867,38 +908,37 @@ def test_between_time_formats(self): def test_between_time_axis(self): # issue 8839 - rng = date_range('1/1/2000', periods=100, freq='10min') + rng = date_range("1/1/2000", periods=100, freq="10min") ts = Series(np.random.randn(len(rng)), index=rng) - stime, etime = ('08:00:00', '09:00:00') + stime, etime = ("08:00:00", "09:00:00") expected_length = 7 assert len(ts.between_time(stime, etime)) == expected_length assert 
len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): ts.between_time(stime, etime, axis=1) def test_to_period(self): from pandas.core.indexes.period import period_range - ts = _simple_ts('1/1/2000', '1/1/2001') + ts = _simple_ts("1/1/2000", "1/1/2001") pts = ts.to_period() exp = ts.copy() - exp.index = period_range('1/1/2000', '1/1/2001') + exp.index = period_range("1/1/2000", "1/1/2001") assert_series_equal(pts, exp) - pts = ts.to_period('M') - exp.index = exp.index.asfreq('M') - tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + pts = ts.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) assert_series_equal(pts, exp) # GH 7606 without freq - idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04']) - exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04'], freq='D') + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = pd.PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) s = Series(np.random.randn(4), index=idx) expected = s.copy() @@ -915,7 +955,7 @@ def test_to_period(self): assert_frame_equal(df.to_period(axis=1), expected) def test_groupby_count_dateparseerror(self): - dr = date_range(start='1/1/2012', freq='5min', periods=10) + dr = date_range(start="1/1/2012", freq="5min", periods=10) # BAD Example, datetimes first s = Series(np.arange(10), index=[dr, np.arange(10)]) @@ -929,17 +969,17 @@ def test_groupby_count_dateparseerror(self): assert_series_equal(result, expected) def test_to_csv_numpy_16_bug(self): - frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) buf = StringIO() frame.to_csv(buf) result = buf.getvalue() - assert '2000-01-01' in result + assert "2000-01-01" in result def test_series_map_box_timedelta(self): # GH 11349 - s = Series(timedelta_range('1 day 1 s', periods=5, freq='h')) + s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) def f(x): return x.total_seconds() @@ -951,19 +991,20 @@ def f(x): def test_asfreq_resample_set_correct_freq(self): # GH5613 # we test if .asfreq() and .resample() set the correct value for .freq - df = pd.DataFrame({'date': ["2012-01-01", "2012-01-02", "2012-01-03"], - 'col': [1, 2, 3]}) + df = pd.DataFrame( + {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} + ) df = df.set_index(pd.to_datetime(df.date)) # testing the settings before calling .asfreq() and .resample() assert df.index.freq is None - assert df.index.inferred_freq == 'D' + assert df.index.inferred_freq == "D" # does .asfreq() set .freq correctly? - assert df.asfreq('D').index.freq == 'D' + assert df.asfreq("D").index.freq == "D" # does .resample() set .freq correctly? 
- assert df.resample('D').asfreq().index.freq == 'D' + assert df.resample("D").asfreq().index.freq == "D" def test_pickle(self): @@ -971,7 +1012,7 @@ def test_pickle(self): p = tm.round_trip_pickle(NaT) assert p is NaT - idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) + idx = pd.to_datetime(["2013-01-01", NaT, "2014-01-06"]) idx_p = tm.round_trip_pickle(idx) assert idx_p[0] == idx[0] assert idx_p[1] is NaT @@ -979,13 +1020,13 @@ def test_pickle(self): # GH11002 # don't infer freq - idx = date_range('1750-1-1', '2050-1-1', freq='7D') + idx = date_range("1750-1-1", "2050-1-1", freq="7D") idx_p = tm.round_trip_pickle(idx) tm.assert_index_equal(idx, idx_p) - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo', 'US/Eastern']) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) def test_setops_preserve_freq(self, tz): - rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) + rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) result = rng[:50].union(rng[50:100]) assert result.name == rng.name @@ -1004,10 +1045,10 @@ def test_setops_preserve_freq(self, tz): result = rng[:50].intersection(rng[25:75]) assert result.name == rng.name - assert result.freqstr == 'D' + assert result.freqstr == "D" assert result.tz == rng.tz - nofreq = DatetimeIndex(list(rng[25:75]), name='other') + nofreq = DatetimeIndex(list(rng[25:75]), name="other") result = rng[:50].union(nofreq) assert result.name is None assert result.freq == rng.freq @@ -1020,25 +1061,24 @@ def test_setops_preserve_freq(self, tz): def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] - arr = np.array(dates, - dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) + arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")]) df = DataFrame(arr) - assert df['Date'][0] == dates[0][0] - assert df['Forecasting'][0] == dates[0][1] + assert df["Date"][0] == dates[0][0] + assert df["Forecasting"][0] == dates[0][1] - s = Series(arr['Date']) + s = Series(arr["Date"]) assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = Series.from_array(arr['Date'], Index([0])) + s = Series.from_array(arr["Date"], Index([0])) assert s[0] == dates[0][0] def test_get_level_values_box(self): from pandas import MultiIndex - dates = date_range('1/1/2000', periods=4) + dates = date_range("1/1/2000", periods=4) levels = [dates, [0, 1]] codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] @@ -1048,18 +1088,22 @@ def test_get_level_values_box(self): def test_view_tz(self): # GH#24024 - ser = pd.Series(pd.date_range('2000', periods=4, tz='US/Central')) + ser = pd.Series(pd.date_range("2000", periods=4, tz="US/Central")) result = ser.view("i8") - expected = pd.Series([946706400000000000, - 946792800000000000, - 946879200000000000, - 946965600000000000]) + expected = pd.Series( + [ + 946706400000000000, + 946792800000000000, + 946879200000000000, + 946965600000000000, + ] + ) tm.assert_series_equal(result, expected) def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
- ser = pd.Series(pd.date_range('2000', periods=2)) - expected = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]') + ser = pd.Series(pd.date_range("2000", periods=2)) + expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") with tm.assert_produces_warning(None): result = np.asarray(ser) @@ -1069,14 +1113,13 @@ def test_asarray_tz_naive(self): with tm.assert_produces_warning(None): result = np.asarray(ser, dtype=object) - expected = np.array([pd.Timestamp('2000-01-01'), - pd.Timestamp('2000-01-02')]) + expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) def test_asarray_tz_aware(self): - tz = 'US/Central' - ser = pd.Series(pd.date_range('2000', periods=2, tz=tz)) - expected = np.array(['2000-01-01T06', '2000-01-02T06'], dtype='M8[ns]') + tz = "US/Central" + ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) + expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") # We warn by default and return an ndarray[M8[ns]] with tm.assert_produces_warning(FutureWarning): result = np.asarray(ser) @@ -1090,8 +1133,9 @@ def test_asarray_tz_aware(self): tm.assert_numpy_array_equal(result, expected) # Future behavior with no warning - expected = np.array([pd.Timestamp("2000-01-01", tz=tz), - pd.Timestamp("2000-01-02", tz=tz)]) + expected = np.array( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) with tm.assert_produces_warning(None): result = np.asarray(ser, dtype=object) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index 6ff02b31600203..c16e2864b131f0 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -20,77 +20,80 @@ class TestSeriesTimezones: # Series.tz_localize def test_series_tz_localize(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") ts = Series(1, index=rng) - result = ts.tz_localize('utc') - assert result.index.tz.zone == 'UTC' + result = ts.tz_localize("utc") + assert result.index.tz.zone == "UTC" # Can't localize if already tz-aware - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") ts = Series(1, index=rng) - with pytest.raises(TypeError, match='Already tz-aware'): - ts.tz_localize('US/Eastern') + with pytest.raises(TypeError, match="Already tz-aware"): + ts.tz_localize("US/Eastern") - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_errors_deprecation(self): # GH 22644 - tz = 'Europe/Warsaw' + tz = "Europe/Warsaw" n = 60 - rng = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + rng = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") ts = Series(rng) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(ValueError): - ts.dt.tz_localize(tz, errors='foo') + ts.dt.tz_localize(tz, errors="foo") # make sure errors='coerce' gets mapped correctly to nonexistent - result = ts.dt.tz_localize(tz, errors='coerce') - expected = ts.dt.tz_localize(tz, nonexistent='NaT') + result = ts.dt.tz_localize(tz, errors="coerce") + expected = ts.dt.tz_localize(tz, nonexistent="NaT") tm.assert_series_equal(result, expected) def test_series_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 - ts = Timestamp('2015-11-01 01:00:03') - expected0 = 
Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") ser = Series([ts]) expected0 = Series([expected0]) expected1 = Series([expected1]) with pytest.raises(pytz.AmbiguousTimeError): - ser.dt.tz_localize('US/Central') + ser.dt.tz_localize("US/Central") - result = ser.dt.tz_localize('US/Central', ambiguous=True) + result = ser.dt.tz_localize("US/Central", ambiguous=True) tm.assert_series_equal(result, expected0) - result = ser.dt.tz_localize('US/Central', ambiguous=[True]) + result = ser.dt.tz_localize("US/Central", ambiguous=[True]) tm.assert_series_equal(result, expected0) - result = ser.dt.tz_localize('US/Central', ambiguous=False) + result = ser.dt.tz_localize("US/Central", ambiguous=False) tm.assert_series_equal(result, expected1) - result = ser.dt.tz_localize('US/Central', ambiguous=[False]) + result = ser.dt.tz_localize("US/Central", ambiguous=[False]) tm.assert_series_equal(result, expected1) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) - @pytest.mark.parametrize('method, exp', [ - ['shift_forward', '2015-03-29 03:00:00'], - ['NaT', NaT], - ['raise', None], - ['foo', 'invalid'] - ]) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", + [ + ["shift_forward", "2015-03-29 03:00:00"], + ["NaT", NaT], + ["raise", None], + ["foo", "invalid"], + ], + ) def test_series_tz_localize_nonexistent(self, tz, method, exp): # GH 8917 n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") s = Series(1, dti) - if method == 'raise': + if method == "raise": with pytest.raises(pytz.NonExistentTimeError): s.tz_localize(tz, nonexistent=method) - elif exp == 'invalid': + elif exp == "invalid": with pytest.raises(ValueError): dti.tz_localize(tz, nonexistent=method) else: @@ -98,12 +101,12 @@ def test_series_tz_localize_nonexistent(self, tz, method, exp): expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz)) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_series_tz_localize_empty(self, tzstr): # GH#2248 ser = Series() - ser2 = ser.tz_localize('utc') + ser2 = ser.tz_localize("utc") assert ser2.index.tz == pytz.utc ser2 = ser.tz_localize(tzstr) @@ -113,24 +116,23 @@ def test_series_tz_localize_empty(self, tzstr): # Series.tz_convert def test_series_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") ts = Series(1, index=rng) - result = ts.tz_convert('Europe/Berlin') - assert result.index.tz.zone == 'Europe/Berlin' + result = ts.tz_convert("Europe/Berlin") + assert result.index.tz.zone == "Europe/Berlin" # can't convert tz-naive - rng = date_range('1/1/2011', periods=200, freq='D') + rng = date_range("1/1/2011", periods=200, freq="D") ts = Series(1, index=rng) with pytest.raises(TypeError, match="Cannot convert tz-naive"): - ts.tz_convert('US/Eastern') + ts.tz_convert("US/Eastern") def test_series_tz_convert_to_utc(self): - base = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='UTC') - idx1 = 
base.tz_convert('Asia/Tokyo')[:2] - idx2 = base.tz_convert('US/Eastern')[1:] + base = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) tm.assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) @@ -139,28 +141,26 @@ def test_series_tz_convert_to_utc(self): # Series.append def test_series_append_aware(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='US/Eastern') + exp_index = DatetimeIndex( + ["2011-01-01 01:00", "2011-01-01 02:00"], tz="US/Eastern" + ) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="UTC") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="UTC") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='UTC') + exp_index = DatetimeIndex(["2011-01-01 01:00", "2011-01-01 02:00"], tz="UTC") exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) utc = rng1.tz @@ -168,22 +168,23 @@ def test_series_append_aware(self): # GH#7795 # different tz coerces to object dtype, not UTC - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Central') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Central") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), - Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp_index = Index( + [ + Timestamp("1/1/2011 01:00", tz="US/Eastern"), + Timestamp("1/1/2011 02:00", tz="US/Central"), + ] + ) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) def test_series_append_aware_naive(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) @@ -192,7 +193,7 @@ def test_series_append_aware_naive(self): assert ts_result.index.equals(expected) # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) @@ -202,18 +203,23 @@ def test_series_append_aware_naive(self): assert ts_result.index.equals(expected) def 
test_series_append_dst(self): - rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") + rng2 = date_range("8/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', - '2016-01-01 03:00', '2016-08-01 01:00', - '2016-08-01 02:00', '2016-08-01 03:00'], - tz='US/Eastern') + exp_index = DatetimeIndex( + [ + "2016-01-01 01:00", + "2016-01-01 02:00", + "2016-01-01 03:00", + "2016-08-01 01:00", + "2016-08-01 02:00", + "2016-08-01 03:00", + ], + tz="US/Eastern", + ) exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz @@ -223,8 +229,10 @@ def test_series_append_dst(self): def test_dateutil_tzoffset_support(self): values = [188.5, 328.25] tzinfo = tzoffset(None, 7200) - index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), - datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + index = [ + datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo), + ] series = Series(data=values, index=index) assert series.index.tz == tzinfo @@ -232,29 +240,29 @@ def test_dateutil_tzoffset_support(self): # it works! #2443 repr(series.index[0]) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_aware_asfreq(self, tz): - dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) + dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz) ser = Series(np.random.randn(len(dr)), index=dr) # it works! 
- ser.asfreq('T') + ser.asfreq("T") - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_string_index_alias_tz_aware(self, tz): - rng = date_range('1/1/2000', periods=10, tz=tz) + rng = date_range("1/1/2000", periods=10, tz=tz) ser = Series(np.random.randn(len(rng)), index=rng) - result = ser['1/3/2000'] + result = ser["1/3/2000"] tm.assert_almost_equal(result, ser[2]) # TODO: De-duplicate with test below def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") ser = Series(np.random.randn(len(rng)), index=rng) - ts_moscow = ser.tz_convert('Europe/Moscow') + ts_moscow = ser.tz_convert("Europe/Moscow") result = ser + ts_moscow assert result.index.tz is pytz.utc @@ -263,30 +271,32 @@ def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): assert result.index.tz is pytz.utc def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") perm = np.random.permutation(100)[:90] - ser1 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('US/Eastern')) + ser1 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("US/Eastern") + ) perm = np.random.permutation(100)[:90] - ser2 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('Europe/Berlin')) + ser2 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("Europe/Berlin") + ) result = ser1 + ser2 - uts1 = ser1.tz_convert('utc') - uts2 = ser2.tz_convert('utc') + uts1 = ser1.tz_convert("utc") + uts2 = ser2.tz_convert("utc") expected = uts1 + uts2 assert result.index.tz == pytz.UTC tm.assert_series_equal(result, expected) def test_series_add_aware_naive_raises(self): - rng = date_range('1/1/2011', periods=10, freq='H') + rng = date_range("1/1/2011", periods=10, freq="H") ser = Series(np.random.randn(len(rng)), index=rng) - ser_utc = ser.tz_localize('utc') + ser_utc = ser.tz_localize("utc") with pytest.raises(Exception): ser + ser_utc @@ -295,21 +305,22 @@ def test_series_add_aware_naive_raises(self): ser_utc + ser def test_series_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert('US/Central') + ser_central = ser.tz_convert("US/Central") # # different timezones convert to UTC new1, new2 = ser.align(ser_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_localized_at_time_between_time(self, tzstr): from datetime import time + tz = timezones.maybe_get_tz(tzstr) - rng = date_range('4/16/2012', '5/1/2012', freq='H') + rng = date_range("4/16/2012", "5/1/2012", freq="H") ts = Series(np.random.randn(len(rng)), index=rng) ts_local = ts.tz_localize(tzstr) @@ -325,15 +336,15 @@ def test_localized_at_time_between_time(self, tzstr): tm.assert_series_equal(result, expected) assert timezones.tz_compare(result.index.tz, tz) - @pytest.mark.parametrize('tzstr', ['Europe/Berlin', - 'dateutil/Europe/Berlin']) + @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) def 
test_getitem_pydatetime_tz(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', - freq='H', tz=tzstr) + index = date_range( + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + ) ts = Series(index=index, data=index.hour) - time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr) + time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) dt = datetime(2012, 12, 24, 17, 0) time_datetime = conversion.localize_pydatetime(dt, tz) @@ -341,32 +352,30 @@ def test_getitem_pydatetime_tz(self, tzstr): def test_series_truncate_datetimeindex_tz(self): # GH 9243 - idx = date_range('4/1/2005', '4/30/2005', freq='D', tz='US/Pacific') + idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific") s = Series(range(len(idx)), index=idx) result = s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4)) expected = Series([1, 2, 3], index=idx[1:4]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) - @pytest.mark.parametrize('method, tz', [ - ['tz_localize', None], - ['tz_convert', 'Europe/Berlin'] - ]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): # GH 6326 - result = Series(np.arange(0, 5), - index=date_range('20131027', periods=5, freq='1H', - tz=tz)) - getattr(result, method)('UTC', copy=copy) - expected = Series(np.arange(0, 5), - index=date_range('20131027', periods=5, freq='1H', - tz=tz)) + result = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) tm.assert_series_equal(result, expected) def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): # GH 25843 tz = tz_aware_fixture - result = Series([Timestamp('2019', tz=tz)], - dtype='datetime64[ns]') - expected = Series([Timestamp('2019')]) + result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") + expected = Series([Timestamp("2019")]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 183aa6e3933553..c024e9caba1566 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -8,19 +8,10 @@ import pandas.util.testing as tm UNARY_UFUNCS = [np.positive, np.floor, np.exp] -BINARY_UFUNCS = [ - np.add, # dunder op - np.logaddexp, -] -SPARSE = [ - True, - False -] -SPARSE_IDS = ['sparse', 'dense'] -SHUFFLE = [ - True, - False -] +BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op +SPARSE = [True, False] +SPARSE_IDS = ["sparse", "dense"] +SHUFFLE = [True, False] @pytest.fixture @@ -28,8 +19,8 @@ def arrays_for_binary_ufunc(): """ A pair of random, length-100 integer-dtype arrays, that are mostly 0. 
""" - a1 = np.random.randint(0, 10, 100, dtype='int64') - a2 = np.random.randint(0, 10, 100, dtype='int64') + a1 = np.random.randint(0, 10, 100, dtype="int64") + a2 = np.random.randint(0, 10, 100, dtype="int64") a1[::3] = 0 a2[::4] = 0 return a1, a2 @@ -39,10 +30,10 @@ def arrays_for_binary_ufunc(): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) def test_unary_ufunc(ufunc, sparse): # Test that ufunc(Series) == Series(ufunc) - array = np.random.randint(0, 10, 10, dtype='int64') + array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype('int64', 0)) + array = pd.SparseArray(array, dtype=pd.SparseDtype("int64", 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -55,20 +46,20 @@ def test_unary_ufunc(ufunc, sparse): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) other = a2 array_args = (a1, a2) - series_args = (series, other) # ufunc(series, array) + series_args = (series, other) # ufunc(series, array) if flip: array_args = reversed(array_args) @@ -81,22 +72,22 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) other = pd.Index(a2, name=name).astype("int64") array_args = (a1, a2) - series_args = (series, other) # ufunc(series, array) + series_args = (series, other) # ufunc(series, array) if flip: array_args = reversed(array_args) @@ -109,18 +100,18 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("shuffle", [True, False], ids=['unaligned', - 'aligned']) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) -def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, - arrays_for_binary_ufunc): +@pytest.mark.parametrize("shuffle", [True, False], ids=["unaligned", "aligned"]) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) +def test_binary_ufunc_with_series( + flip, shuffle, sparse, ufunc, arrays_for_binary_ufunc +): # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -179,8 +170,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("shuffle", SHUFFLE) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") -def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, - arrays_for_binary_ufunc): +def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): # Test that # the same conditions from binary_ufunc_scalar apply to # ufuncs with multiple outputs. @@ -193,8 +183,8 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -234,12 +224,11 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) -def test_binary_ufunc_drops_series_name(ufunc, sparse, - arrays_for_binary_ufunc): +def test_binary_ufunc_drops_series_name(ufunc, sparse, arrays_for_binary_ufunc): # Drop the names when they differ. a1, a2 = arrays_for_binary_ufunc - s1 = pd.Series(a1, name='a') - s2 = pd.Series(a2, name='b') + s1 = pd.Series(a1, name="a") + s2 = pd.Series(a2, name="b") result = ufunc(s1, s2) assert result.name is None @@ -256,55 +245,51 @@ def __add__(self, other): arr = np.array([Dummy(0), Dummy(1)]) ser = pd.Series(arr) tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr))) - tm.assert_series_equal(np.add(ser, Dummy(1)), - pd.Series(np.add(ser, Dummy(1)))) - - -@pytest.mark.parametrize('values', [ - pd.array([1, 3, 2]), - pytest.param( - pd.array([1, 10, 0], dtype='Sparse[int]'), - marks=pytest.mark.xfail(resason='GH-27080. 
Bug in SparseArray') - ), - pd.to_datetime(['2000', '2010', '2001']), - pd.to_datetime(['2000', '2010', '2001']).tz_localize("CET"), - pd.to_datetime(['2000', '2010', '2001']).to_period(freq="D"), - -]) + tm.assert_series_equal(np.add(ser, Dummy(1)), pd.Series(np.add(ser, Dummy(1)))) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 3, 2]), + pytest.param( + pd.array([1, 10, 0], dtype="Sparse[int]"), + marks=pytest.mark.xfail(resason="GH-27080. Bug in SparseArray"), + ), + pd.to_datetime(["2000", "2010", "2001"]), + pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), + pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), + ], +) def test_reduce(values): a = pd.Series(values) assert np.maximum.reduce(a) == values[1] -@pytest.mark.parametrize('type_', [ - list, - deque, - tuple, -]) +@pytest.mark.parametrize("type_", [list, deque, tuple]) def test_binary_ufunc_other_types(type_): - a = pd.Series([1, 2, 3], name='name') + a = pd.Series([1, 2, 3], name="name") b = type_([3, 4, 5]) result = np.add(a, b) - expected = pd.Series(np.add(a.to_numpy(), b), name='name') + expected = pd.Series(np.add(a.to_numpy(), b), name="name") tm.assert_series_equal(result, expected) def test_object_dtype_ok(): - class Thing: def __init__(self, value): self.value = value def __add__(self, other): - other = getattr(other, 'value', other) + other = getattr(other, "value", other) return type(self)(self.value + other) def __eq__(self, other): return type(other) is Thing and self.value == other.value def __repr__(self): - return 'Thing({})'.format(self.value) + return "Thing({})".format(self.value) s = pd.Series([Thing(1), Thing(2)]) result = np.add(s, Thing(1)) @@ -319,9 +304,5 @@ def test_outer(): with tm.assert_produces_warning(FutureWarning): result = np.subtract.outer(s, o) - expected = np.array([ - [0, -1, -2], - [1, 0, -1], - [2, 1, 0] - ], dtype=np.dtype('int64')) + expected = np.array([[0, -1, -2], [1, 0, -1], [2, 1, 0]], dtype=np.dtype("int64")) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_validate.py b/pandas/tests/series/test_validate.py index cef38d5ce3f233..c4311f507f7ee5 100644 --- a/pandas/tests/series/test_validate.py +++ b/pandas/tests/series/test_validate.py @@ -4,12 +4,13 @@ class TestSeriesValidate: """Tests for error handling related to data types of method arguments.""" - @pytest.mark.parametrize("func", ["reset_index", "_set_name", - "sort_values", "sort_index", - "rename", "dropna"]) + @pytest.mark.parametrize( + "func", + ["reset_index", "_set_name", "sort_values", "sort_index", "rename", "dropna"], + ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, string_series, func, inplace): - msg = "For argument \"inplace\" expected type bool" + msg = 'For argument "inplace" expected type bool' kwargs = dict(inplace=inplace) if func == "_set_name": diff --git a/pandas/tests/sparse/frame/conftest.py b/pandas/tests/sparse/frame/conftest.py index 3423260c1720ad..989b58419c2cd6 100644 --- a/pandas/tests/sparse/frame/conftest.py +++ b/pandas/tests/sparse/frame/conftest.py @@ -3,16 +3,19 @@ from pandas import DataFrame, SparseArray, SparseDataFrame, bdate_range -data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], - 'C': np.arange(10, dtype=np.float64), - 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} -dates = bdate_range('1/1/2011', periods=10) +data = { + "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 
2, np.nan, np.nan, np.nan, 3, 4, 5, 6], + "C": np.arange(10, dtype=np.float64), + "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan], +} +dates = bdate_range("1/1/2011", periods=10) # fixture names must be compatible with the tests in # tests/frame/test_api.SharedWithSparse + @pytest.fixture def float_frame_dense(): """ @@ -31,7 +34,7 @@ def float_frame(): Columns are ['A', 'B', 'C', 'D']; some entries are missing """ # default_kind='block' is the default - return SparseDataFrame(data, index=dates, default_kind='block') + return SparseDataFrame(data, index=dates, default_kind="block") @pytest.fixture @@ -42,7 +45,7 @@ def float_frame_int_kind(): Columns are ['A', 'B', 'C', 'D'] and default_kind='integer'. Some entries are missing. """ - return SparseDataFrame(data, index=dates, default_kind='integer') + return SparseDataFrame(data, index=dates, default_kind="integer") @pytest.fixture @@ -53,7 +56,7 @@ def float_string_frame(): Columns are ['A', 'B', 'C', 'D', 'foo']; some entries are missing """ sdf = SparseDataFrame(data, index=dates) - sdf['foo'] = SparseArray(['bar'] * len(dates)) + sdf["foo"] = SparseArray(["bar"] * len(dates)) return sdf @@ -66,7 +69,7 @@ def float_frame_fill0_dense(): """ values = SparseDataFrame(data).values values[np.isnan(values)] = 0 - return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates) + return DataFrame(values, columns=["A", "B", "C", "D"], index=dates) @pytest.fixture @@ -78,8 +81,9 @@ def float_frame_fill0(): """ values = SparseDataFrame(data).values values[np.isnan(values)] = 0 - return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], - default_fill_value=0, index=dates) + return SparseDataFrame( + values, columns=["A", "B", "C", "D"], default_fill_value=0, index=dates + ) @pytest.fixture @@ -91,7 +95,7 @@ def float_frame_fill2_dense(): """ values = SparseDataFrame(data).values values[np.isnan(values)] = 2 - return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates) + return DataFrame(values, columns=["A", "B", "C", "D"], index=dates) @pytest.fixture @@ -103,8 +107,9 @@ def float_frame_fill2(): """ values = SparseDataFrame(data).values values[np.isnan(values)] = 2 - return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], - default_fill_value=2, index=dates) + return SparseDataFrame( + values, columns=["A", "B", "C", "D"], default_fill_value=2, index=dates + ) @pytest.fixture diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index ae97682f297ad4..fae879b3d33b59 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -6,7 +6,7 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_quantile(): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]] @@ -24,7 +24,7 @@ def test_quantile(): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_quantile_multi(): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]] diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 4e677f5055e797..d8158db32d8f08 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -9,7 +9,7 @@ @pytest.fixture def 
dates(): - return bdate_range('1/1/2011', periods=10) + return bdate_range("1/1/2011", periods=10) @pytest.fixture @@ -19,10 +19,12 @@ def empty(): @pytest.fixture def frame(dates): - data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], - 'C': np.arange(10, dtype=np.float64), - 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} + data = { + "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], + "C": np.arange(10, dtype=np.float64), + "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan], + } return SparseDataFrame(data, index=dates) @@ -32,9 +34,9 @@ def fill_frame(frame): values = frame.values.copy() values[np.isnan(values)] = 2 - return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], - default_fill_value=2, - index=frame.index) + return SparseDataFrame( + values, columns=["A", "B", "C", "D"], default_fill_value=2, index=frame.index + ) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @@ -46,25 +48,22 @@ def test_apply(frame): # agg / broadcast # two FutureWarnings, so we can't check stacklevel properly. - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): broadcasted = frame.apply(np.sum, broadcast=True) assert isinstance(broadcasted, SparseDataFrame) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): exp = frame.to_dense().apply(np.sum, broadcast=True) tm.assert_frame_equal(broadcasted.to_dense(), exp) applied = frame.apply(np.sum) - tm.assert_series_equal(applied, - frame.to_dense().apply(nanops.nansum).to_sparse()) + tm.assert_series_equal(applied, frame.to_dense().apply(nanops.nansum).to_sparse()) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_fill(fill_frame): applied = fill_frame.apply(np.sqrt) - assert applied['A'].fill_value == np.sqrt(2) + assert applied["A"].fill_value == np.sqrt(2) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @@ -75,8 +74,7 @@ def test_apply_empty(empty): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_apply_nonuq(): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'a', 'c']) + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) sparse = orig.to_sparse() res = sparse.apply(lambda s: s[0], axis=1) exp = orig.apply(lambda s: s[0], axis=1) @@ -107,8 +105,11 @@ def test_applymap(frame): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_apply_keep_sparse_dtype(): # GH 23744 - sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['b', 'a', 'c'], default_fill_value=1) + sdf = SparseDataFrame( + np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=["b", "a", "c"], + default_fill_value=1, + ) df = DataFrame(sdf) expected = sdf.apply(np.exp) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index d3e2e1357f9d7c..96e3c4640d2f6f 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -12,7 +12,11 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.sparse import frame as spf from pandas.core.sparse.api import ( - SparseArray, SparseDataFrame, SparseDtype, SparseSeries) + SparseArray, + SparseDataFrame, + SparseDtype, 
+ SparseSeries, +) from pandas.tests.frame.test_api import SharedWithSparse from pandas.util import testing as tm @@ -53,8 +57,8 @@ def test_itertuples(self, float_frame): def test_fill_value_when_combine_const(self): # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') - df = SparseDataFrame({'foo': dat}, index=range(6)) + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = SparseDataFrame({"foo": dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) @@ -81,23 +85,23 @@ def test_copy(self, float_frame): # this is now identical (but not is_a ) assert cp.index.identical(float_frame.index) - def test_constructor(self, float_frame, float_frame_int_kind, - float_frame_fill0): + def test_constructor(self, float_frame, float_frame_int_kind, float_frame_fill0): for col, series in float_frame.items(): assert isinstance(series, SparseSeries) - assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex) + assert isinstance(float_frame_int_kind["A"].sp_index, IntIndex) # constructed zframe from matrix above - assert float_frame_fill0['A'].fill_value == 0 + assert float_frame_fill0["A"].fill_value == 0 # XXX: changed asarray - expected = pd.SparseArray([0, 0, 0, 0, 1., 2., 3., 4., 5., 6.], - fill_value=0, kind='block') - tm.assert_sp_array_equal(expected, - float_frame_fill0['A'].values) - tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., - 3., 4., 5., 6.]), - float_frame_fill0['A'].to_dense().values) + expected = pd.SparseArray( + [0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], fill_value=0, kind="block" + ) + tm.assert_sp_array_equal(expected, float_frame_fill0["A"].values) + tm.assert_numpy_array_equal( + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), + float_frame_fill0["A"].to_dense().values, + ) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) @@ -115,9 +119,13 @@ def test_constructor(self, float_frame, float_frame_int_kind, # init dict with different index idx = float_frame.index[:5] cons = SparseDataFrame( - float_frame, index=idx, columns=float_frame.columns, + float_frame, + index=idx, + columns=float_frame.columns, default_fill_value=float_frame.default_fill_value, - default_kind=float_frame.default_kind, copy=True) + default_kind=float_frame.default_kind, + copy=True, + ) reindexed = float_frame.reindex(idx) tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) @@ -132,12 +140,12 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': [2, 3], 'a': [0, 1]} + d = {"b": [2, 3], "a": [0, 1]} frame = SparseDataFrame(data=d) if compat.PY36: - expected = SparseDataFrame(data=d, columns=list('ba')) + expected = SparseDataFrame(data=d, columns=list("ba")) else: - expected = SparseDataFrame(data=d, columns=list('ab')) + expected = SparseDataFrame(data=d, columns=list("ab")) tm.assert_sp_frame_equal(frame, expected) def test_constructor_ndarray(self, float_frame): @@ -145,22 +153,22 @@ def test_constructor_ndarray(self, float_frame): sp = SparseDataFrame(float_frame.values) # 1d - sp = SparseDataFrame(float_frame['A'].values, index=float_frame.index, - columns=['A']) - tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=['A'])) + sp = SparseDataFrame( + float_frame["A"].values, index=float_frame.index, columns=["A"] + ) + tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=["A"])) # raise on level argument msg = "Reindex by level not supported for sparse" with 
pytest.raises(TypeError, match=msg): - float_frame.reindex(columns=['A'], level=1) + float_frame.reindex(columns=["A"], level=1) # wrong length index / columns with pytest.raises(ValueError, match="^Index length"): SparseDataFrame(float_frame.values, index=float_frame.index[:-1]) with pytest.raises(ValueError, match="^Column length"): - SparseDataFrame(float_frame.values, - columns=float_frame.columns[:-1]) + SparseDataFrame(float_frame.values, columns=float_frame.columns[:-1]) # GH 9272 def test_constructor_empty(self): @@ -181,14 +189,14 @@ def test_constructor_convert_index_once(self): def test_constructor_from_series(self): # GH 2873 - x = Series(np.random.randn(10000), name='a') + x = Series(np.random.randn(10000), name="a") x = x.to_sparse(fill_value=0) assert isinstance(x, SparseSeries) df = SparseDataFrame(x) assert isinstance(df, SparseDataFrame) - x = Series(np.random.randn(10000), name='a') - y = Series(np.random.randn(10000), name='b') + x = Series(np.random.randn(10000), name="a") + y = Series(np.random.randn(10000), name="b") x2 = x.astype(float) x2.loc[:9998] = np.NaN # TODO: x_sparse is unused...fix @@ -206,7 +214,7 @@ def test_constructor_from_series(self): def test_constructor_from_dense_series(self): # GH 19393 # series with name - x = Series(np.random.randn(10000), name='a') + x = Series(np.random.randn(10000), name="a") result = SparseDataFrame(x) expected = x.to_frame().to_sparse() tm.assert_sp_frame_equal(result, expected) @@ -221,9 +229,14 @@ def test_constructor_from_unknown_type(self): # GH 19393 class Unknown: pass - with pytest.raises(TypeError, - match=('SparseDataFrame called with unknown type ' - '"Unknown" for data argument')): + + with pytest.raises( + TypeError, + match=( + "SparseDataFrame called with unknown type " + '"Unknown" for data argument' + ), + ): SparseDataFrame(Unknown()) def test_constructor_preserve_attr(self): @@ -232,58 +245,63 @@ def test_constructor_preserve_attr(self): assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - df = pd.SparseDataFrame({'x': arr}) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + df = pd.SparseDataFrame({"x": arr}) + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 - s = pd.SparseSeries(arr, name='x') + s = pd.SparseSeries(arr, name="x") assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 df = pd.SparseDataFrame(s) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 - df = pd.SparseDataFrame({'x': s}) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + df = pd.SparseDataFrame({"x": s}) + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 def test_constructor_nan_dataframe(self): # GH 10079 trains = np.arange(100) thresholds = [10, 20, 30, 40, 50, 60] tuples = [(i, j) for i in trains for j in thresholds] - index = pd.MultiIndex.from_tuples(tuples, - names=['trains', 'thresholds']) + index = pd.MultiIndex.from_tuples(tuples, names=["trains", "thresholds"]) matrix = np.empty((len(index), len(trains))) matrix.fill(np.nan) df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) result = df.to_sparse() - expected = pd.SparseDataFrame(matrix, index=index, columns=trains, - dtype=float) + expected = pd.SparseDataFrame(matrix, index=index, columns=trains, dtype=float) tm.assert_sp_frame_equal(result, expected) def 
test_type_coercion_at_construction(self): # GH 15682 result = pd.SparseDataFrame( - {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype='uint8', - default_fill_value=0) + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype="uint8", + default_fill_value=0, + ) expected = pd.SparseDataFrame( - {'a': pd.SparseSeries([1, 0, 0], dtype='uint8'), - 'b': pd.SparseSeries([0, 1, 0], dtype='uint8'), - 'c': pd.SparseSeries([0, 0, 1], dtype='uint8')}, - default_fill_value=0) + { + "a": pd.SparseSeries([1, 0, 0], dtype="uint8"), + "b": pd.SparseSeries([0, 1, 0], dtype="uint8"), + "c": pd.SparseSeries([0, 0, 1], dtype="uint8"), + }, + default_fill_value=0, + ) tm.assert_sp_frame_equal(result, expected) def test_default_dtype(self): - result = pd.SparseDataFrame(columns=list('ab'), index=range(2)) - expected = pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], - columns=list('ab'), index=range(2)) + result = pd.SparseDataFrame(columns=list("ab"), index=range(2)) + expected = pd.SparseDataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) + ) tm.assert_sp_frame_equal(result, expected) def test_nan_data_with_int_dtype_raises_error(self): - sdf = pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], - columns=list('ab'), index=range(2)) + sdf = pd.SparseDataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) + ) msg = "Cannot convert non-finite values" with pytest.raises(ValueError, match=msg): pd.SparseDataFrame(sdf, dtype=np.int64) @@ -293,11 +311,12 @@ def test_dtypes(self): df.loc[:9998] = np.nan sdf = df.to_sparse() result = sdf.dtypes - expected = Series(['Sparse[float64, nan]'] * 4) + expected = Series(["Sparse[float64, nan]"] * 4) tm.assert_series_equal(result, expected) - def test_shape(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + def test_shape( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): # see gh-10452 assert float_frame.shape == (10, 4) assert float_frame_int_kind.shape == (10, 4) @@ -316,10 +335,16 @@ def test_array_interface(self, float_frame): dres = np.sqrt(float_frame.to_dense()) tm.assert_frame_equal(res.to_dense(), dres) - def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_pickle( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _test_roundtrip(frame, orig): result = tm.round_trip_pickle(frame) tm.assert_sp_frame_equal(frame, result) @@ -332,19 +357,17 @@ def _test_roundtrip(frame, orig): _test_roundtrip(float_frame_fill2, float_frame_fill2_dense) def test_dense_to_sparse(self): - df = DataFrame({'A': [nan, nan, nan, 1, 2], - 'B': [1, 2, nan, nan, nan]}) + df = DataFrame({"A": [nan, nan, nan, 1, 2], "B": [1, 2, nan, nan, nan]}) sdf = df.to_sparse() assert isinstance(sdf, SparseDataFrame) assert np.isnan(sdf.default_fill_value) - assert isinstance(sdf['A'].sp_index, BlockIndex) + assert isinstance(sdf["A"].sp_index, BlockIndex) tm.assert_frame_equal(sdf.to_dense(), df) - sdf = df.to_sparse(kind='integer') - assert isinstance(sdf['A'].sp_index, IntIndex) + sdf = df.to_sparse(kind="integer") + assert isinstance(sdf["A"].sp_index, IntIndex) - df = DataFrame({'A': [0, 0, 0, 1, 2], - 'B': [1, 2, 0, 0, 0]}, dtype=float) + df = DataFrame({"A": [0, 0, 0, 1, 2], "B": [1, 2, 0, 0, 0]}, 
dtype=float) sdf = df.to_sparse(fill_value=0) assert sdf.default_fill_value == 0 tm.assert_frame_equal(sdf.to_dense(), df) @@ -356,8 +379,7 @@ def test_deprecated_dense_to_sparse(self): df = pd.DataFrame({"A": [1, np.nan, 3]}) sparse_df = pd.SparseDataFrame({"A": [1, np.nan, 3]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.to_sparse() tm.assert_frame_equal(result, sparse_df) @@ -365,10 +387,14 @@ def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) assert df.density == 0.7 - df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) + df = SparseDataFrame( + { + "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C": np.arange(10), + "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } + ) assert df.density == 0.75 @@ -388,41 +414,50 @@ def test_sparse_series_ops_fill(self, float_frame_fill2): self._check_frame_ops(float_frame_fill2) def _check_frame_ops(self, frame): - def _compare_to_dense(a, b, da, db, op): sparse_result = op(a, b) dense_result = op(da, db) fill = sparse_result.default_fill_value dense_result = dense_result.to_sparse(fill_value=fill) - tm.assert_sp_frame_equal(sparse_result, dense_result, - exact_indices=False) + tm.assert_sp_frame_equal(sparse_result, dense_result, exact_indices=False) if isinstance(a, DataFrame) and isinstance(db, DataFrame): mixed_result = op(a, db) assert isinstance(mixed_result, SparseDataFrame) - tm.assert_sp_frame_equal(mixed_result, sparse_result, - exact_indices=False) + tm.assert_sp_frame_equal( + mixed_result, sparse_result, exact_indices=False + ) - opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + opnames = ["add", "sub", "mul", "truediv", "floordiv"] ops = [getattr(operator, name) for name in opnames] fidx = frame.index # time series operations - series = [frame['A'], frame['B'], frame['C'], frame['D'], - frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]), - SparseSeries( - [], index=[])] + series = [ + frame["A"], + frame["B"], + frame["C"], + frame["D"], + frame["A"].reindex(fidx[:7]), + frame["A"].reindex(fidx[::2]), + SparseSeries([], index=[]), + ] for op in opnames: - _compare_to_dense(frame, frame[::2], frame.to_dense(), - frame[::2].to_dense(), getattr(operator, op)) + _compare_to_dense( + frame, + frame[::2], + frame.to_dense(), + frame[::2].to_dense(), + getattr(operator, op), + ) # 2304, no auto-broadcasting for i, s in enumerate(series): - f = lambda a, b: getattr(a, op)(b, axis='index') + f = lambda a, b: getattr(a, op)(b, axis="index") _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f) # rops are not implemented @@ -430,8 +465,13 @@ def _compare_to_dense(a, b, da, db, op): # frame.to_dense(), f) # cross-sectional operations - series = [frame.xs(fidx[0]), frame.xs(fidx[3]), frame.xs(fidx[5]), - frame.xs(fidx[7]), frame.xs(fidx[5])[:2]] + series = [ + frame.xs(fidx[0]), + frame.xs(fidx[3]), + frame.xs(fidx[5]), + frame.xs(fidx[7]), + frame.xs(fidx[5])[:2], + ] for op in ops: for s in series: @@ -439,7 +479,7 @@ def _compare_to_dense(a, b, da, db, op): _compare_to_dense(s, frame, s, frame.to_dense(), op) # it works! 
- result = frame + frame.loc[:, ['A', 'B']] # noqa + result = frame + frame.loc[:, ["A", "B"]] # noqa def test_op_corners(self, float_frame, empty_frame): empty = empty_frame + empty_frame @@ -457,58 +497,53 @@ def test_scalar_ops(self): def test_getitem(self): # 1585 select multiple columns - sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c']) + sdf = SparseDataFrame(index=[0, 1, 2], columns=["a", "b", "c"]) - result = sdf[['a', 'b']] - exp = sdf.reindex(columns=['a', 'b']) + result = sdf[["a", "b"]] + exp = sdf.reindex(columns=["a", "b"]) tm.assert_sp_frame_equal(result, exp) with pytest.raises(KeyError, match=r"\['d'\] not in index"): - sdf[['a', 'd']] + sdf[["a", "d"]] def test_iloc(self, float_frame): # GH 2227 result = float_frame.iloc[:, 0] assert isinstance(result, SparseSeries) - tm.assert_sp_series_equal(result, float_frame['A']) + tm.assert_sp_series_equal(result, float_frame["A"]) # preserve sparse index type. #2251 - data = {'A': [0, 1]} - iframe = SparseDataFrame(data, default_kind='integer') - tm.assert_class_equal(iframe['A'].sp_index, - iframe.iloc[:, 0].sp_index) + data = {"A": [0, 1]} + iframe = SparseDataFrame(data, default_kind="integer") + tm.assert_class_equal(iframe["A"].sp_index, iframe.iloc[:, 0].sp_index) def test_set_value(self, float_frame): # ok, as the index gets converted to object frame = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = frame.set_value('foobar', 'B', 1.5) - assert res.index.dtype == 'object' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = frame.set_value("foobar", "B", 1.5) + assert res.index.dtype == "object" res = float_frame res.index = res.index.astype(object) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = float_frame.set_value('foobar', 'B', 1.5) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = float_frame.set_value("foobar", "B", 1.5) assert res is not float_frame - assert res.index[-1] == 'foobar' - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res.get_value('foobar', 'B') == 1.5 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res2 = res.set_value('foobar', 'qux', 1.5) + assert res.index[-1] == "foobar" + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res.get_value("foobar", "B") == 1.5 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res2 = res.set_value("foobar", "qux", 1.5) assert res2 is not res - tm.assert_index_equal(res2.columns, - pd.Index(list(float_frame.columns) + ['qux'])) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res2.get_value('foobar', 'qux') == 1.5 + tm.assert_index_equal( + res2.columns, pd.Index(list(float_frame.columns) + ["qux"]) + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res2.get_value("foobar", "qux") == 1.5 def test_fancy_index_misc(self, float_frame): # axis = 0 @@ -524,8 +559,7 @@ def test_fancy_index_misc(self, float_frame): def test_getitem_overload(self, float_frame): # slicing sl = float_frame[:20] - tm.assert_sp_frame_equal(sl, - float_frame.reindex(float_frame.index[:20])) + tm.assert_sp_frame_equal(sl, float_frame.reindex(float_frame.index[:20])) # boolean indexing d = float_frame.index[5] @@ -539,136 +573,146 @@ def test_getitem_overload(self, float_frame): with pytest.raises(ValueError, match=msg): 
float_frame[indexer[:-1]] - def test_setitem(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_setitem( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check_frame(frame, orig): N = len(frame) # insert SparseSeries - frame['E'] = frame['A'] - assert isinstance(frame['E'], SparseSeries) - tm.assert_sp_series_equal(frame['E'], frame['A'], - check_names=False) + frame["E"] = frame["A"] + assert isinstance(frame["E"], SparseSeries) + tm.assert_sp_series_equal(frame["E"], frame["A"], check_names=False) # insert SparseSeries differently-indexed - to_insert = frame['A'][::2] - frame['E'] = to_insert + to_insert = frame["A"][::2] + frame["E"] = to_insert expected = to_insert.to_dense().reindex(frame.index) - result = frame['E'].to_dense() + result = frame["E"].to_dense() tm.assert_series_equal(result, expected, check_names=False) - assert result.name == 'E' + assert result.name == "E" # insert Series - frame['F'] = frame['A'].to_dense() - assert isinstance(frame['F'], SparseSeries) - tm.assert_sp_series_equal(frame['F'], frame['A'], - check_names=False) + frame["F"] = frame["A"].to_dense() + assert isinstance(frame["F"], SparseSeries) + tm.assert_sp_series_equal(frame["F"], frame["A"], check_names=False) # insert Series differently-indexed - to_insert = frame['A'].to_dense()[::2] - frame['G'] = to_insert + to_insert = frame["A"].to_dense()[::2] + frame["G"] = to_insert expected = to_insert.reindex(frame.index) - expected.name = 'G' - tm.assert_series_equal(frame['G'].to_dense(), expected) + expected.name = "G" + tm.assert_series_equal(frame["G"].to_dense(), expected) # insert ndarray - frame['H'] = np.random.randn(N) - assert isinstance(frame['H'], SparseSeries) + frame["H"] = np.random.randn(N) + assert isinstance(frame["H"], SparseSeries) to_sparsify = np.random.randn(N) - to_sparsify[N // 2:] = frame.default_fill_value - frame['I'] = to_sparsify - assert len(frame['I'].sp_values) == N // 2 + to_sparsify[N // 2 :] = frame.default_fill_value + frame["I"] = to_sparsify + assert len(frame["I"].sp_values) == N // 2 # insert ndarray wrong size # GH 25484 - msg = 'Length of values does not match length of index' + msg = "Length of values does not match length of index" with pytest.raises(ValueError, match=msg): - frame['foo'] = np.random.randn(N - 1) + frame["foo"] = np.random.randn(N - 1) # scalar value - frame['J'] = 5 - assert len(frame['J'].sp_values) == N - assert (frame['J'].sp_values == 5).all() + frame["J"] = 5 + assert len(frame["J"].sp_values) == N + assert (frame["J"].sp_values == 5).all() - frame['K'] = frame.default_fill_value - assert len(frame['K'].sp_values) == 0 + frame["K"] = frame.default_fill_value + assert len(frame["K"].sp_values) == 0 _check_frame(float_frame, float_frame_dense) _check_frame(float_frame_int_kind, float_frame_dense) _check_frame(float_frame_fill0, float_frame_fill0_dense) _check_frame(float_frame_fill2, float_frame_fill2_dense) - @pytest.mark.parametrize('values', [ - [True, False], - [0, 1], - [1, None], - ['a', 'b'], - [pd.Timestamp('2017'), pd.NaT], - [pd.Timedelta('10s'), pd.NaT], - ]) + @pytest.mark.parametrize( + "values", + [ + [True, False], + [0, 1], + [1, None], + ["a", "b"], + [pd.Timestamp("2017"), pd.NaT], + [pd.Timedelta("10s"), pd.NaT], + ], + ) def test_setitem_more(self, values): df = pd.DataFrame({"A": 
values}) - df['A'] = pd.SparseArray(values) - expected = pd.DataFrame({'A': pd.SparseArray(values)}) + df["A"] = pd.SparseArray(values) + expected = pd.DataFrame({"A": pd.SparseArray(values)}) tm.assert_frame_equal(df, expected) def test_setitem_corner(self, float_frame): - float_frame['a'] = float_frame['B'] - tm.assert_sp_series_equal(float_frame['a'], float_frame['B'], - check_names=False) + float_frame["a"] = float_frame["B"] + tm.assert_sp_series_equal(float_frame["a"], float_frame["B"], check_names=False) def test_setitem_array(self, float_frame): - arr = float_frame['B'] + arr = float_frame["B"] - float_frame['E'] = arr - tm.assert_sp_series_equal(float_frame['E'], float_frame['B'], - check_names=False) + float_frame["E"] = arr + tm.assert_sp_series_equal(float_frame["E"], float_frame["B"], check_names=False) - float_frame['F'] = arr[:-1] + float_frame["F"] = arr[:-1] index = float_frame.index[:-1] - tm.assert_sp_series_equal(float_frame['E'].reindex(index), - float_frame['F'].reindex(index), - check_names=False) + tm.assert_sp_series_equal( + float_frame["E"].reindex(index), + float_frame["F"].reindex(index), + check_names=False, + ) def test_setitem_chained_no_consolidate(self): # https://github.com/pandas-dev/pandas/pull/19268 # issuecomment-361696418 # chained setitem used to cause consolidation sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): sdf[0][1] = 2 assert len(sdf._data.blocks) == 2 def test_delitem(self, float_frame): - A = float_frame['A'] - C = float_frame['C'] + A = float_frame["A"] + C = float_frame["C"] - del float_frame['B'] - assert 'B' not in float_frame - tm.assert_sp_series_equal(float_frame['A'], A) - tm.assert_sp_series_equal(float_frame['C'], C) + del float_frame["B"] + assert "B" not in float_frame + tm.assert_sp_series_equal(float_frame["A"], A) + tm.assert_sp_series_equal(float_frame["C"], C) - del float_frame['D'] - assert 'D' not in float_frame + del float_frame["D"] + assert "D" not in float_frame - del float_frame['A'] - assert 'A' not in float_frame + del float_frame["A"] + assert "A" not in float_frame def test_set_columns(self, float_frame): float_frame.columns = float_frame.columns - msg = ("Length mismatch: Expected axis has 4 elements, new values have" - " 3 elements") + msg = ( + "Length mismatch: Expected axis has 4 elements, new values have" + " 3 elements" + ) with pytest.raises(ValueError, match=msg): float_frame.columns = float_frame.columns[:-1] def test_set_index(self, float_frame): float_frame.index = float_frame.index - msg = ("Length mismatch: Expected axis has 10 elements, new values" - " have 9 elements") + msg = ( + "Length mismatch: Expected axis has 10 elements, new values" + " have 9 elements" + ) with pytest.raises(ValueError, match=msg): float_frame.index = float_frame.index[:-1] @@ -687,22 +731,28 @@ def test_append(self, float_frame): a = float_frame.iloc[:5, :3] b = float_frame.iloc[5:] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): # Stacklevel is set for pd.concat, not append appended = a.append(b) - tm.assert_sp_frame_equal(appended.iloc[:, :3], float_frame.iloc[:, :3], - exact_indices=False) + tm.assert_sp_frame_equal( + appended.iloc[:, :3], float_frame.iloc[:, :3], exact_indices=False + ) - a = a[['B', 'C', 'A']].head(2) + a = 
a[["B", "C", "A"]].head(2) b = b.head(2) - expected = pd.SparseDataFrame({ - "B": [0., 1, None, 3], - "C": [0., 1, 5, 6], - "A": [None, None, 2, 3], - "D": [None, None, 5, None], - }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) + expected = pd.SparseDataFrame( + { + "B": [0.0, 1, None, 3], + "C": [0.0, 1, 5, 6], + "A": [None, None, 2, 3], + "D": [None, None, 5, None], + }, + index=a.index | b.index, + columns=["B", "C", "A", "D"], + ) with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=False) @@ -711,77 +761,97 @@ def test_append(self, float_frame): with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=True) - tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']], - consolidate_block_indices=True, - check_kind=False) + tm.assert_sp_frame_equal( + appended, + expected[["A", "B", "C", "D"]], + consolidate_block_indices=True, + check_kind=False, + ) def test_astype(self): - sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], - dtype=np.int64), - 'B': SparseArray([4, 5, 6, 7], - dtype=np.int64)}) - assert sparse['A'].dtype == SparseDtype(np.int64) - assert sparse['B'].dtype == SparseDtype(np.int64) + sparse = pd.SparseDataFrame( + { + "A": SparseArray([1, 2, 3, 4], dtype=np.int64), + "B": SparseArray([4, 5, 6, 7], dtype=np.int64), + } + ) + assert sparse["A"].dtype == SparseDtype(np.int64) + assert sparse["B"].dtype == SparseDtype(np.int64) # retain fill_value res = sparse.astype(np.float64) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], - fill_value=0, - kind='integer'), - 'B': SparseArray([4., 5., 6., 7.], - fill_value=0, - kind='integer')}, - default_fill_value=np.nan) + exp = pd.SparseDataFrame( + { + "A": SparseArray([1.0, 2.0, 3.0, 4.0], fill_value=0, kind="integer"), + "B": SparseArray([4.0, 5.0, 6.0, 7.0], fill_value=0, kind="integer"), + }, + default_fill_value=np.nan, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.float64, 0) - assert res['B'].dtype == SparseDtype(np.float64, 0) + assert res["A"].dtype == SparseDtype(np.float64, 0) + assert res["B"].dtype == SparseDtype(np.float64, 0) # update fill_value res = sparse.astype(SparseDtype(np.float64, np.nan)) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], - fill_value=np.nan, - kind='integer'), - 'B': SparseArray([4., 5., 6., 7.], - fill_value=np.nan, - kind='integer')}, - default_fill_value=np.nan) + exp = pd.SparseDataFrame( + { + "A": SparseArray( + [1.0, 2.0, 3.0, 4.0], fill_value=np.nan, kind="integer" + ), + "B": SparseArray( + [4.0, 5.0, 6.0, 7.0], fill_value=np.nan, kind="integer" + ), + }, + default_fill_value=np.nan, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.float64, np.nan) - assert res['B'].dtype == SparseDtype(np.float64, np.nan) + assert res["A"].dtype == SparseDtype(np.float64, np.nan) + assert res["B"].dtype == SparseDtype(np.float64, np.nan) def test_astype_bool(self): - sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], - fill_value=0, - dtype=np.int64), - 'B': SparseArray([0, 5, 0, 7], - fill_value=0, - dtype=np.int64)}, - default_fill_value=0) - assert sparse['A'].dtype == SparseDtype(np.int64) - assert sparse['B'].dtype == SparseDtype(np.int64) + sparse = pd.SparseDataFrame( + { + "A": SparseArray([0, 2, 0, 4], fill_value=0, dtype=np.int64), + "B": SparseArray([0, 5, 0, 7], fill_value=0, dtype=np.int64), + }, + default_fill_value=0, + ) + assert sparse["A"].dtype == 
SparseDtype(np.int64) + assert sparse["B"].dtype == SparseDtype(np.int64) res = sparse.astype(SparseDtype(bool, False)) - exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], - dtype=np.bool, - fill_value=False, - kind='integer'), - 'B': SparseArray([False, True, False, True], - dtype=np.bool, - fill_value=False, - kind='integer')}, - default_fill_value=False) + exp = pd.SparseDataFrame( + { + "A": SparseArray( + [False, True, False, True], + dtype=np.bool, + fill_value=False, + kind="integer", + ), + "B": SparseArray( + [False, True, False, True], + dtype=np.bool, + fill_value=False, + kind="integer", + ), + }, + default_fill_value=False, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.bool) - assert res['B'].dtype == SparseDtype(np.bool) + assert res["A"].dtype == SparseDtype(np.bool) + assert res["B"].dtype == SparseDtype(np.bool) def test_astype_object(self): # This may change in GH-23125 - df = pd.DataFrame({"A": SparseArray([0, 1]), - "B": SparseArray([0, 1])}) + df = pd.DataFrame({"A": SparseArray([0, 1]), "B": SparseArray([0, 1])}) result = df.astype(object) dtype = SparseDtype(object, 0) - expected = pd.DataFrame({"A": SparseArray([0, 1], dtype=dtype), - "B": SparseArray([0, 1], dtype=dtype)}) + expected = pd.DataFrame( + { + "A": SparseArray([0, 1], dtype=dtype), + "B": SparseArray([0, 1], dtype=dtype), + } + ) tm.assert_frame_equal(result, expected) def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): @@ -790,58 +860,64 @@ def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): result = df.fillna(0) expected = dense.fillna(0) - tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), - exact_indices=False) + tm.assert_sp_frame_equal( + result, expected.to_sparse(fill_value=0), exact_indices=False + ) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() result.fillna(0, inplace=True) expected = dense.fillna(0) - tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), - exact_indices=False) + tm.assert_sp_frame_equal( + result, expected.to_sparse(fill_value=0), exact_indices=False + ) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() - result = df['A'] + result = df["A"] result.fillna(0, inplace=True) - expected = dense['A'].fillna(0) + expected = dense["A"].fillna(0) # this changes internal SparseArray repr # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0)) tm.assert_series_equal(result.to_dense(), expected) def test_fillna_fill_value(self): - df = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]}) + df = pd.DataFrame({"A": [1, 0, 0], "B": [np.nan, np.nan, 4]}) sparse = pd.SparseDataFrame(df) - tm.assert_frame_equal(sparse.fillna(-1).to_dense(), - df.fillna(-1), check_dtype=False) + tm.assert_frame_equal( + sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False + ) sparse = pd.SparseDataFrame(df, default_fill_value=0) - tm.assert_frame_equal(sparse.fillna(-1).to_dense(), - df.fillna(-1), check_dtype=False) + tm.assert_frame_equal( + sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False + ) def test_sparse_frame_pad_backfill_limit(self): index = np.arange(10) df = DataFrame(np.random.randn(10, 4), index=index) sdf = df.to_sparse() - result = sdf[:2].reindex(index, method='pad', limit=5) + result = sdf[:2].reindex(index, method="pad", limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[:2].reindex(index).fillna(method='pad') + with 
tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) - result = sdf[-2:].reindex(index, method='backfill', limit=5) + result = sdf[-2:].reindex(index, method="backfill", limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[-2:].reindex(index).fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() @@ -853,26 +929,30 @@ def test_sparse_frame_fillna_limit(self): sdf = df.to_sparse() result = sdf[:2].reindex(index) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = result.fillna(method='pad', limit=5) - - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[:2].reindex(index).fillna(method='pad') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = result.fillna(method="pad", limit=5) + + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) result = sdf[-2:].reindex(index) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = result.fillna(method='backfill', limit=5) - - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[-2:].reindex(index).fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = result.fillna(method="backfill", limit=5) + + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() @@ -880,17 +960,20 @@ def test_sparse_frame_fillna_limit(self): def test_rename(self, float_frame): result = float_frame.rename(index=str) - expected = SparseDataFrame(float_frame.values, - index=float_frame.index.strftime( - "%Y-%m-%d %H:%M:%S"), - columns=list('ABCD')) + expected = SparseDataFrame( + float_frame.values, + index=float_frame.index.strftime("%Y-%m-%d %H:%M:%S"), + columns=list("ABCD"), + ) tm.assert_sp_frame_equal(result, expected) - result = float_frame.rename(columns=lambda x: '%s%d' % (x, 1)) - data = {'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C1': np.arange(10, dtype=np.float64), - 'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + result = float_frame.rename(columns=lambda x: "%s%d" % (x, 1)) + data = { + "A1": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B1": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C1": np.arange(10, dtype=np.float64), + "D1": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } expected = SparseDataFrame(data, index=float_frame.index) tm.assert_sp_frame_equal(result, expected) @@ -900,30 +983,33 @@ def test_corr(self, float_frame): tm.assert_frame_equal(res, float_frame.to_dense().corr().to_sparse()) def 
test_describe(self, float_frame): - float_frame['foo'] = np.nan + float_frame["foo"] = np.nan float_frame.dtypes.value_counts() str(float_frame) desc = float_frame.describe() # noqa def test_join(self, float_frame): - left = float_frame.loc[:, ['A', 'B']] - right = float_frame.loc[:, ['C', 'D']] + left = float_frame.loc[:, ["A", "B"]] + right = float_frame.loc[:, ["C", "D"]] joined = left.join(right) tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False) - right = float_frame.loc[:, ['B', 'D']] - msg = (r"columns overlap but no suffix specified: Index\(\['B'\]," - r" dtype='object'\)") + right = float_frame.loc[:, ["B", "D"]] + msg = ( + r"columns overlap but no suffix specified: Index\(\['B'\]," + r" dtype='object'\)" + ) with pytest.raises(ValueError, match=msg): left.join(right) - with pytest.raises(ValueError, match='Other Series must have a name'): - float_frame.join(Series( - np.random.randn(len(float_frame)), index=float_frame.index)) - - def test_reindex(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + with pytest.raises(ValueError, match="Other Series must have a name"): + float_frame.join( + Series(np.random.randn(len(float_frame)), index=float_frame.index) + ) + def test_reindex( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): def _check_frame(frame): index = frame.index sidx = index[::2] @@ -933,37 +1019,35 @@ def _check_frame(frame): dense_result = frame.to_dense().reindex(sidx) tm.assert_frame_equal(sparse_result.to_dense(), dense_result) - tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), - dense_result) + tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), dense_result) sparse_result2 = sparse_result.reindex(index) dense_result2 = dense_result.reindex(index) tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2) # propagate CORRECT fill value - tm.assert_almost_equal(sparse_result.default_fill_value, - frame.default_fill_value) - tm.assert_almost_equal(sparse_result['A'].fill_value, - frame['A'].fill_value) + tm.assert_almost_equal( + sparse_result.default_fill_value, frame.default_fill_value + ) + tm.assert_almost_equal(sparse_result["A"].fill_value, frame["A"].fill_value) # length zero length_zero = frame.reindex([]) assert len(length_zero) == 0 assert len(length_zero.columns) == len(frame.columns) - assert len(length_zero['A']) == 0 + assert len(length_zero["A"]) == 0 # frame being reindexed has length zero length_n = length_zero.reindex(index) assert len(length_n) == len(frame) assert len(length_n.columns) == len(frame.columns) - assert len(length_n['A']) == len(frame) + assert len(length_n["A"]) == len(frame) # reindex columns - reindexed = frame.reindex(columns=['A', 'B', 'Z']) + reindexed = frame.reindex(columns=["A", "B", "Z"]) assert len(reindexed.columns) == 3 - tm.assert_almost_equal(reindexed['Z'].fill_value, - frame.default_fill_value) - assert np.isnan(reindexed['Z'].sp_values).all() + tm.assert_almost_equal(reindexed["Z"].fill_value, frame.default_fill_value) + assert np.isnan(reindexed["Z"].sp_values).all() _check_frame(float_frame) _check_frame(float_frame_int_kind) @@ -972,16 +1056,15 @@ def _check_frame(frame): # with copy=False reindexed = float_frame.reindex(float_frame.index, copy=False) - reindexed['F'] = reindexed['A'] - assert 'F' in float_frame + reindexed["F"] = reindexed["A"] + assert "F" in float_frame reindexed = float_frame.reindex(float_frame.index) - reindexed['G'] = reindexed['A'] - assert 'G' not in float_frame + reindexed["G"] = 
reindexed["A"] + assert "G" not in float_frame - def test_reindex_fill_value(self, float_frame_fill0, - float_frame_fill0_dense): - rng = bdate_range('20110110', periods=20) + def test_reindex_fill_value(self, float_frame_fill0, float_frame_fill0_dense): + rng = bdate_range("20110110", periods=20) result = float_frame_fill0.reindex(rng, fill_value=0) exp = float_frame_fill0_dense.reindex(rng, fill_value=0) @@ -990,83 +1073,105 @@ def test_reindex_fill_value(self, float_frame_fill0, def test_reindex_method(self): - sparse = SparseDataFrame(data=[[11., 12., 14.], - [21., 22., 24.], - [41., 42., 44.]], - index=[1, 2, 4], - columns=[1, 2, 4], - dtype=float) + sparse = SparseDataFrame( + data=[[11.0, 12.0, 14.0], [21.0, 22.0, 24.0], [41.0, 42.0, 44.0]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) # Over indices # default method result = sparse.reindex(index=range(6)) - expected = SparseDataFrame(data=[[nan, nan, nan], - [11., 12., 14.], - [21., 22., 24.], - [nan, nan, nan], - [41., 42., 44.], - [nan, nan, nan]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + expected = SparseDataFrame( + data=[ + [nan, nan, nan], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [nan, nan, nan], + [41.0, 42.0, 44.0], + [nan, nan, nan], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='bfill' - result = sparse.reindex(index=range(6), method='bfill') - expected = SparseDataFrame(data=[[11., 12., 14.], - [11., 12., 14.], - [21., 22., 24.], - [41., 42., 44.], - [41., 42., 44.], - [nan, nan, nan]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + result = sparse.reindex(index=range(6), method="bfill") + expected = SparseDataFrame( + data=[ + [11.0, 12.0, 14.0], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [41.0, 42.0, 44.0], + [41.0, 42.0, 44.0], + [nan, nan, nan], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='ffill' - result = sparse.reindex(index=range(6), method='ffill') - expected = SparseDataFrame(data=[[nan, nan, nan], - [11., 12., 14.], - [21., 22., 24.], - [21., 22., 24.], - [41., 42., 44.], - [41., 42., 44.]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + result = sparse.reindex(index=range(6), method="ffill") + expected = SparseDataFrame( + data=[ + [nan, nan, nan], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [21.0, 22.0, 24.0], + [41.0, 42.0, 44.0], + [41.0, 42.0, 44.0], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # Over columns # default method result = sparse.reindex(columns=range(6)) - expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan], - [nan, 21., 22., nan, 24., nan], - [nan, 41., 42., nan, 44., nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + expected = SparseDataFrame( + data=[ + [nan, 11.0, 12.0, nan, 14.0, nan], + [nan, 21.0, 22.0, nan, 24.0, nan], + [nan, 41.0, 42.0, nan, 44.0, nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='bfill' with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method='bfill') + sparse.reindex(columns=range(6), method="bfill") # method='ffill' with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method='ffill') + sparse.reindex(columns=range(6), method="ffill") def test_take(self, float_frame): result = float_frame.take([1, 0, 2], axis=1) - expected = float_frame.reindex(columns=['B', 'A', 'C']) 
+ expected = float_frame.reindex(columns=["B", "A", "C"]) tm.assert_sp_frame_equal(result, expected) - def test_to_dense(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): + def test_to_dense( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): dense_dm = frame.to_dense() # Sparse[float] != float @@ -1078,8 +1183,9 @@ def _check(frame, orig): _check(float_frame_fill0, float_frame_fill0_dense) _check(float_frame_fill2, float_frame_fill2_dense) - def test_stack_sparse_frame(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + def test_stack_sparse_frame( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): def _check(frame): dense_frame = frame.to_dense() # noqa @@ -1087,8 +1193,7 @@ def _check(frame): from_sparse_lp = spf.stack_sparse_frame(frame) - tm.assert_numpy_array_equal(from_dense_lp.values, - from_sparse_lp.values) + tm.assert_numpy_array_equal(from_dense_lp.values, from_sparse_lp.values) _check(float_frame) _check(float_frame_int_kind) @@ -1100,11 +1205,16 @@ def _check(frame): with pytest.raises(TypeError, match=msg): _check(float_frame_fill2) - def test_transpose(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_transpose( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): transposed = frame.T untransposed = transposed.T @@ -1119,10 +1229,16 @@ def _check(frame, orig): _check(float_frame_fill0, float_frame_fill0_dense) _check(float_frame_fill2, float_frame_fill2_dense) - def test_shift(self, float_frame, float_frame_int_kind, float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_shift( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) @@ -1136,16 +1252,14 @@ def _check(frame, orig): exp = orig.shift(-2) tm.assert_frame_equal(shifted.to_dense(), exp) - shifted = frame.shift(2, freq='B') - exp = orig.shift(2, freq='B') - exp = exp.to_sparse(frame.default_fill_value, - kind=frame.default_kind) + shifted = frame.shift(2, freq="B") + exp = orig.shift(2, freq="B") + exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) shifted = frame.shift(2, freq=BDay()) exp = orig.shift(2, freq=BDay()) - exp = exp.to_sparse(frame.default_fill_value, - kind=frame.default_kind) + exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) _check(float_frame, float_frame_dense) @@ -1172,7 +1286,7 @@ def test_count(self, float_frame): tm.assert_series_equal(result, dense_result, check_dtype=False) def test_numpy_transpose(self): - sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a']) + sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=["a"]) result = np.transpose(np.transpose(sdf)) tm.assert_sp_frame_equal(result, sdf) @@ -1206,47 +1320,46 @@ def 
test_combine_first_with_dense(self): def test_combine_add(self, float_frame): df = float_frame.to_dense() df2 = df.copy() - df2['C'][:3] = np.nan - df['A'][:3] = 5.7 + df2["C"][:3] = np.nan + df["A"][:3] = 5.7 result = df.to_sparse().add(df2.to_sparse(), fill_value=0) expected = df.add(df2, fill_value=0).to_sparse() tm.assert_sp_frame_equal(result, expected) def test_isin(self): - sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.) - xp = sparse_df[sparse_df.flag == 1.] - rs = sparse_df[sparse_df.flag.isin([1.])] + sparse_df = DataFrame({"flag": [1.0, 0.0, 1.0]}).to_sparse(fill_value=0.0) + xp = sparse_df[sparse_df.flag == 1.0] + rs = sparse_df[sparse_df.flag.isin([1.0])] tm.assert_frame_equal(xp, rs) def test_sparse_pow_issue(self): # 2220 - df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + df = SparseDataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) # note : no error without nan - df = SparseDataFrame({'A': [nan, 0, 1]}) + df = SparseDataFrame({"A": [nan, 0, 1]}) # note that 2 ** df works fine, also df ** 1 result = 1 ** df - r1 = result.take([0], 1)['A'] - r2 = result['A'] + r1 = result.take([0], 1)["A"] + r2 = result["A"] assert len(r2.sp_values) == len(r1.sp_values) def test_as_blocks(self): - df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, - dtype='float64') + df = SparseDataFrame({"A": [1.1, 3.3], "B": [nan, -3.9]}, dtype="float64") # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df_blocks = df.blocks - assert list(df_blocks.keys()) == ['Sparse[float64, nan]'] - tm.assert_frame_equal(df_blocks['Sparse[float64, nan]'], df) + assert list(df_blocks.keys()) == ["Sparse[float64, nan]"] + tm.assert_frame_equal(df_blocks["Sparse[float64, nan]"], df) - @pytest.mark.xfail(reason='nan column names in _init_dict problematic ' - '(GH#16894)') + @pytest.mark.xfail( + reason="nan column names in _init_dict problematic " "(GH#16894)" + ) def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) @@ -1255,76 +1368,105 @@ def test_nan_columnname(self): def test_isna(self): # GH 8276 - df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], - 'B': [0, np.nan, np.nan, 2, np.nan]}) + df = pd.SparseDataFrame( + {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} + ) res = df.isna() - exp = pd.SparseDataFrame({'A': [True, True, False, False, True], - 'B': [False, True, True, False, True]}, - default_fill_value=True) + exp = pd.SparseDataFrame( + { + "A": [True, True, False, False, True], + "B": [False, True, True, False, True], + }, + default_fill_value=True, + ) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], - 'B': [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.) 
+ df = pd.SparseDataFrame( + {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.0, + ) res = df.isna() assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame({'A': [False, False, False, False, True], - 'B': [False, True, False, False, True]}) + exp = pd.DataFrame( + { + "A": [False, False, False, False, True], + "B": [False, True, False, False, True], + } + ) tm.assert_frame_equal(res.to_dense(), exp) def test_notna(self): # GH 8276 - df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], - 'B': [0, np.nan, np.nan, 2, np.nan]}) + df = pd.SparseDataFrame( + {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} + ) res = df.notna() - exp = pd.SparseDataFrame({'A': [False, False, True, True, False], - 'B': [True, False, False, True, False]}, - default_fill_value=False) + exp = pd.SparseDataFrame( + { + "A": [False, False, True, True, False], + "B": [True, False, False, True, False], + }, + default_fill_value=False, + ) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], - 'B': [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.) + df = pd.SparseDataFrame( + {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.0, + ) res = df.notna() assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame({'A': [True, True, True, True, False], - 'B': [True, False, True, True, False]}) + exp = pd.DataFrame( + { + "A": [True, True, True, True, False], + "B": [True, False, True, True, False], + } + ) tm.assert_frame_equal(res.to_dense(), exp) def test_default_fill_value_with_no_data(self): # GH 16807 - expected = pd.SparseDataFrame([[1.0, 1.0], [1.0, 1.0]], - columns=list('ab'), index=range(2)) - result = pd.SparseDataFrame(columns=list('ab'), index=range(2), - default_fill_value=1.0) + expected = pd.SparseDataFrame( + [[1.0, 1.0], [1.0, 1.0]], columns=list("ab"), index=range(2) + ) + result = pd.SparseDataFrame( + columns=list("ab"), index=range(2), default_fill_value=1.0 + ) tm.assert_frame_equal(expected, result) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameArithmetic: - def test_numeric_op_scalar(self): - df = pd.DataFrame({'A': [nan, nan, 0, 1, ], - 'B': [0, 1, 2, nan], - 'C': [1., 2., 3., 4.], - 'D': [nan, nan, nan, nan]}) + df = pd.DataFrame( + { + "A": [nan, nan, 0, 1], + "B": [0, 1, 2, nan], + "C": [1.0, 2.0, 3.0, 4.0], + "D": [nan, nan, nan, nan], + } + ) sparse = df.to_sparse() tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) def test_comparison_op_scalar(self): # GH 13001 - df = pd.DataFrame({'A': [nan, nan, 0, 1, ], - 'B': [0, 1, 2, nan], - 'C': [1., 2., 3., 4.], - 'D': [nan, nan, nan, nan]}) + df = pd.DataFrame( + { + "A": [nan, nan, 0, 1], + "B": [0, 1, 2, nan], + "C": [1.0, 2.0, 3.0, 4.0], + "D": [nan, nan, nan, nan], + } + ) sparse = df.to_sparse() # comparison changes internal repr, compare with dense @@ -1340,7 +1482,6 @@ def test_comparison_op_scalar(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameAnalytics: - def test_cumsum(self, float_frame): expected = SparseDataFrame(float_frame.to_dense().cumsum()) @@ -1369,13 +1510,11 @@ def test_numpy_cumsum(self, float_frame): def test_numpy_func_call(self, float_frame): # no 
exception should be raised even though # numpy passes in 'axis=None' or `axis=-1' - funcs = ['sum', 'cumsum', 'var', - 'mean', 'prod', 'cumprod', - 'std', 'min', 'max'] + funcs = ["sum", "cumsum", "var", "mean", "prod", "cumprod", "std", "min", "max"] for func in funcs: getattr(np, func)(float_frame) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)') + @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") def test_quantile(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] @@ -1391,7 +1530,7 @@ def test_quantile(self): tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)') + @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") def test_quantile_multi(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] @@ -1424,7 +1563,7 @@ def test_dropna(self, inplace, how): # Tests regression #21172. expected = pd.SparseDataFrame({"F2": [0, 1]}) input_df = pd.SparseDataFrame( - {"F1": [float('nan'), float('nan')], "F2": [0, 1]} + {"F1": [float("nan"), float("nan")], "F2": [0, 1]} ) result_df = input_df.dropna(axis=1, inplace=inplace, how=how) if inplace: diff --git a/pandas/tests/sparse/frame/test_indexing.py b/pandas/tests/sparse/frame/test_indexing.py index 2d2a7ac278dd6a..c93e9d1e0e8d10 100644 --- a/pandas/tests/sparse/frame/test_indexing.py +++ b/pandas/tests/sparse/frame/test_indexing.py @@ -7,18 +7,21 @@ pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") -@pytest.mark.parametrize('data', [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], +@pytest.mark.parametrize( + "data", [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan] - ] -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], + [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], + [ + [1.0, 1.0 + 1.0j], + [2.0 + 2.0j, 2.0], + [3.0, 3.0 + 3.0j], + [4.0 + 4.0j, 4.0], + [np.nan, np.nan], + ], + ], +) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data(data): # GH 17386 lower_bound = 1.5 @@ -34,24 +37,22 @@ def test_where_with_numeric_data(data): tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.parametrize('data', [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], +@pytest.mark.parametrize( + "data", [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan] - ] -]) -@pytest.mark.parametrize('other', [ - True, - -100, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], + [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], + [ + [1.0, 1.0 + 1.0j], + [2.0 + 2.0j, 2.0], + [3.0, 3.0 + 3.0j], + [4.0 + 4.0j, 4.0], + [np.nan, np.nan], + ], + ], +) +@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data_and_other(data, other): # GH 17386 lower_bound = 1.5 @@ -61,14 +62,13 @@ def test_where_with_numeric_data_and_other(data, other): dense = DataFrame(data) dense_expected = dense.where(dense > lower_bound, other) - 
sparse_expected = SparseDataFrame(dense_expected, - default_fill_value=other) + sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data(): # GH 17386 data = [[False, False], [True, True], [False, False]] @@ -85,13 +85,8 @@ def test_where_with_bool_data(): tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.parametrize('other', [ - True, - 0, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data_and_other(other): # GH 17386 data = [[False, False], [True, True], [False, False]] @@ -102,8 +97,7 @@ def test_where_with_bool_data_and_other(other): dense = DataFrame(data) dense_expected = dense.where(dense == cond, other) - sparse_expected = SparseDataFrame(dense_expected, - default_fill_value=other) + sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py index 41d7bfabed44aa..4ba4fba7391d46 100644 --- a/pandas/tests/sparse/frame/test_to_csv.py +++ b/pandas/tests/sparse/frame/test_to_csv.py @@ -10,13 +10,14 @@ class TestSparseDataFrameToCsv: fill_values = [np.nan, 0, None, 1] - @pytest.mark.parametrize('fill_value', fill_values) + @pytest.mark.parametrize("fill_value", fill_values) def test_to_csv_sparse_dataframe(self, fill_value): # GH19384 - sdf = SparseDataFrame({'a': type(self).fill_values}, - default_fill_value=fill_value) + sdf = SparseDataFrame( + {"a": type(self).fill_values}, default_fill_value=fill_value + ) - with tm.ensure_clean('sparse_df.csv') as path: + with tm.ensure_clean("sparse_df.csv") as path: sdf.to_csv(path, index=False) df = read_csv(path, skip_blank_lines=False) diff --git a/pandas/tests/sparse/frame/test_to_from_scipy.py b/pandas/tests/sparse/frame/test_to_from_scipy.py index 881d8d31e51627..9d1ccc62146ab4 100644 --- a/pandas/tests/sparse/frame/test_to_from_scipy.py +++ b/pandas/tests/sparse/frame/test_to_from_scipy.py @@ -8,16 +8,16 @@ from pandas.core.sparse.api import SparseDtype from pandas.util import testing as tm -scipy = pytest.importorskip('scipy') +scipy = pytest.importorskip("scipy") ignore_matrix_warning = pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning" ) -@pytest.mark.parametrize('index', [None, list('abc')]) # noqa: F811 -@pytest.mark.parametrize('columns', [None, list('def')]) -@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) -@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) +@pytest.mark.parametrize("index", [None, list("abc")]) # noqa: F811 +@pytest.mark.parametrize("columns", [None, list("def")]) +@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) +@pytest.mark.parametrize("dtype", [bool, int, float, np.uint16]) @ignore_matrix_warning @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): @@ -36,8 +36,9 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # can just skip 
testing it thoroughly return - sdf = SparseDataFrame(spm, index=index, columns=columns, - default_fill_value=fill_value) + sdf = SparseDataFrame( + spm, index=index, columns=columns, default_fill_value=fill_value + ) # Expected result construction is kind of tricky for all # dtype-fill_value combinations; easiest to cast to something generic @@ -45,7 +46,8 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): rarr = arr.astype(object) rarr[arr == 0] = np.nan expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan) + fill_value if fill_value is not None else np.nan + ) # Assert frame is as expected sdf_obj = sdf.astype(object) @@ -58,24 +60,25 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # Ensure dtype is preserved if possible # XXX: verify this res_dtype = bool if is_bool_dtype(dtype) else dtype - tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype), - {np.dtype(res_dtype)}) + tm.assert_contains_all( + sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} + ) assert sdf.to_coo().dtype == res_dtype # However, adding a str column results in an upcast to object - sdf['strings'] = np.arange(len(sdf)).astype(str) + sdf["strings"] = np.arange(len(sdf)).astype(str) assert sdf.to_coo().dtype == np.object_ -@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) # noqa: F811 +@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) # noqa: F811 @ignore_matrix_warning @pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning") @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy_object(spmatrix, fill_value): # GH 4343 dtype = object - columns = list('cd') - index = list('ab') + columns = list("cd") + index = list("ab") if spmatrix is scipy.sparse.dok_matrix: pytest.skip("dok_matrix from object does not work in SciPy") @@ -92,8 +95,9 @@ def test_from_to_scipy_object(spmatrix, fill_value): # can just skip testing it thoroughly return - sdf = SparseDataFrame(spm, index=index, columns=columns, - default_fill_value=fill_value) + sdf = SparseDataFrame( + spm, index=index, columns=columns, default_fill_value=fill_value + ) # Expected result construction is kind of tricky for all # dtype-fill_value combinations; easiest to cast to something generic @@ -101,7 +105,8 @@ def test_from_to_scipy_object(spmatrix, fill_value): rarr = arr.astype(object) rarr[arr == 0] = np.nan expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan) + fill_value if fill_value is not None else np.nan + ) # Assert frame is as expected sdf_obj = sdf.astype(SparseDtype(object, fill_value)) @@ -113,8 +118,9 @@ def test_from_to_scipy_object(spmatrix, fill_value): # Ensure dtype is preserved if possible res_dtype = object - tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype), - {np.dtype(res_dtype)}) + tm.assert_contains_all( + sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} + ) assert sdf.to_coo().dtype == res_dtype @@ -157,11 +163,14 @@ def test_from_scipy_fillna(spmatrix): sdf = SparseDataFrame(spm).fillna(-1.0) # Returning frame should fill all nan values with -1.0 - expected = SparseDataFrame({ - 0: SparseSeries([1., -1, -1]), - 1: SparseSeries([np.nan, 1, np.nan]), - 2: SparseSeries([np.nan, np.nan, 1]), - }, default_fill_value=-1) + expected = SparseDataFrame( + { + 0: SparseSeries([1.0, -1, -1]), + 1: SparseSeries([np.nan, 1, np.nan]), + 2: 
SparseSeries([np.nan, np.nan, 1]), + }, + default_fill_value=-1, + ) # fill_value is expected to be what .fillna() above was called with # We don't use -1 as initial fill_value in expected SparseSeries @@ -179,8 +188,7 @@ def test_index_names_multiple_nones(): # https://github.com/pandas-dev/pandas/pull/24092 sparse = pytest.importorskip("scipy.sparse") - s = (pd.Series(1, index=pd.MultiIndex.from_product([['A', 'B'], [0, 1]])) - .to_sparse()) + s = pd.Series(1, index=pd.MultiIndex.from_product([["A", "B"], [0, 1]])).to_sparse() result, _, _ = s.to_coo() assert isinstance(result, sparse.coo_matrix) result = result.toarray() diff --git a/pandas/tests/sparse/series/test_indexing.py b/pandas/tests/sparse/series/test_indexing.py index 0f4235d7cc3fec..525b0487a93768 100644 --- a/pandas/tests/sparse/series/test_indexing.py +++ b/pandas/tests/sparse/series/test_indexing.py @@ -7,18 +7,26 @@ pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") -@pytest.mark.parametrize('data', [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], +@pytest.mark.parametrize( + "data", [ - 1.0, 1.0 + 1.0j, - 2.0 + 2.0j, 2.0, - 3.0, 3.0 + 3.0j, - 4.0 + 4.0j, 4.0, - np.nan, np.nan - ] -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], + [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], + [ + 1.0, + 1.0 + 1.0j, + 2.0 + 2.0j, + 2.0, + 3.0, + 3.0 + 3.0j, + 4.0 + 4.0j, + 4.0, + np.nan, + np.nan, + ], + ], +) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data(data): # GH 17386 lower_bound = 1.5 @@ -34,26 +42,27 @@ def test_where_with_numeric_data(data): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.parametrize('data', [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], +@pytest.mark.parametrize( + "data", [ - 1.0, 1.0 + 1.0j, - 2.0 + 2.0j, 2.0, - 3.0, 3.0 + 3.0j, - 4.0 + 4.0j, 4.0, - np.nan, np.nan - ] -]) -@pytest.mark.parametrize('other', [ - True, - -100, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') + [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], + [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], + [ + 1.0, + 1.0 + 1.0j, + 2.0 + 2.0j, + 2.0, + 3.0, + 3.0 + 3.0j, + 4.0 + 4.0j, + 4.0, + np.nan, + np.nan, + ], + ], +) +@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) +@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") def test_where_with_numeric_data_and_other(data, other): # GH 17386 lower_bound = 1.5 @@ -69,7 +78,7 @@ def test_where_with_numeric_data_and_other(data, other): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data(): # GH 17386 data = [False, False, True, True, False, False] @@ -86,15 +95,8 @@ def test_where_with_bool_data(): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.parametrize('other', [ - True, - 0, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') +@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) +@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") def test_where_with_bool_data_and_other(other): # GH 17386 data 
= [False, False, True, True, False, False] diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2abd63281c4fee..8895544958d7a8 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -11,8 +11,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna) +from pandas import DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf from pandas.tests.series.test_api import SharedWithSparse @@ -71,36 +70,32 @@ class TestSparseSeries(SharedWithSparse): def setup_method(self, method): arr, index = _test_data1() - date_index = bdate_range('1/1/2011', periods=len(index)) + date_index = bdate_range("1/1/2011", periods=len(index)) - self.bseries = SparseSeries(arr, index=index, kind='block', - name='bseries') + self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") self.ts = self.bseries - self.btseries = SparseSeries(arr, index=date_index, kind='block') + self.btseries = SparseSeries(arr, index=date_index, kind="block") - self.iseries = SparseSeries(arr, index=index, kind='integer', - name='iseries') + self.iseries = SparseSeries(arr, index=index, kind="integer", name="iseries") arr, index = _test_data2() - self.bseries2 = SparseSeries(arr, index=index, kind='block') - self.iseries2 = SparseSeries(arr, index=index, kind='integer') + self.bseries2 = SparseSeries(arr, index=index, kind="block") + self.iseries2 = SparseSeries(arr, index=index, kind="integer") arr, index = _test_data1_zero() - self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0, name='zbseries') - self.ziseries = SparseSeries(arr, index=index, kind='integer', - fill_value=0) + self.zbseries = SparseSeries( + arr, index=index, kind="block", fill_value=0, name="zbseries" + ) + self.ziseries = SparseSeries(arr, index=index, kind="integer", fill_value=0) arr, index = _test_data2_zero() - self.zbseries2 = SparseSeries(arr, index=index, kind='block', - fill_value=0) - self.ziseries2 = SparseSeries(arr, index=index, kind='integer', - fill_value=0) + self.zbseries2 = SparseSeries(arr, index=index, kind="block", fill_value=0) + self.ziseries2 = SparseSeries(arr, index=index, kind="integer", fill_value=0) def test_constructor_dict_input(self): # gh-16905 - constructor_dict = {1: 1.} + constructor_dict = {1: 1.0} index = [0, 1, 2] # Series with index passed in @@ -120,12 +115,12 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} result = SparseSeries(d) if PY36: - expected = SparseSeries([1, 0, 2], index=list('bac')) + expected = SparseSeries([1, 0, 2], index=list("bac")) else: - expected = SparseSeries([0, 1, 2], index=list('abc')) + expected = SparseSeries([0, 1, 2], index=list("abc")) tm.assert_sp_series_equal(result, expected) def test_constructor_dtype(self): @@ -155,16 +150,16 @@ def test_iteration_and_str(self): def test_construct_DataFrame_with_sp_series(self): # it works! 
- df = DataFrame({'col': self.bseries}) + df = DataFrame({"col": self.bseries}) # printing & access df.iloc[:1] - df['col'] + df["col"] df.dtypes str(df) # blocking - expected = Series({'col': 'float64:sparse'}) + expected = Series({"col": "float64:sparse"}) # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): @@ -176,7 +171,7 @@ def test_constructor_preserve_attr(self): assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - s = pd.SparseSeries(arr, name='x') + s = pd.SparseSeries(arr, name="x") assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 @@ -191,14 +186,14 @@ def test_series_density(self): def test_sparse_to_dense(self): arr, index = _test_data1() series = self.bseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='bseries')) + tm.assert_series_equal(series, Series(arr, name="bseries")) series = self.iseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='iseries')) + tm.assert_series_equal(series, Series(arr, name="iseries")) arr, index = _test_data1_zero() series = self.zbseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='zbseries')) + tm.assert_series_equal(series, Series(arr, name="zbseries")) series = self.ziseries.to_dense() tm.assert_series_equal(series, Series(arr)) @@ -228,8 +223,8 @@ def test_to_dense_fill_value(self): def test_dense_to_sparse(self): series = self.bseries.to_dense() - bseries = series.to_sparse(kind='block') - iseries = series.to_sparse(kind='integer') + bseries = series.to_sparse(kind="block") + iseries = series.to_sparse(kind="integer") tm.assert_sp_series_equal(bseries, self.bseries) tm.assert_sp_series_equal(iseries, self.iseries, check_names=False) assert iseries.name == self.bseries.name @@ -241,8 +236,8 @@ def test_dense_to_sparse(self): # non-NaN fill value series = self.zbseries.to_dense() - zbseries = series.to_sparse(kind='block', fill_value=0) - ziseries = series.to_sparse(kind='integer', fill_value=0) + zbseries = series.to_sparse(kind="block", fill_value=0) + ziseries = series.to_sparse(kind="integer", fill_value=0) tm.assert_sp_series_equal(zbseries, self.zbseries) tm.assert_sp_series_equal(ziseries, self.ziseries, check_names=False) assert ziseries.name == self.zbseries.name @@ -253,7 +248,7 @@ def test_dense_to_sparse(self): assert series.shape == ziseries.shape def test_to_dense_preserve_name(self): - assert (self.bseries.name is not None) + assert self.bseries.name is not None result = self.bseries.to_dense() assert result.name == self.bseries.name @@ -265,8 +260,9 @@ def test_constructor(self): assert isinstance(self.iseries.sp_index, IntIndex) assert self.zbseries.fill_value == 0 - tm.assert_numpy_array_equal(self.zbseries.values.to_dense(), - self.bseries.to_dense().fillna(0).values) + tm.assert_numpy_array_equal( + self.zbseries.values.to_dense(), self.bseries.to_dense().fillna(0).values + ) # pass SparseSeries def _check_const(sparse, name): @@ -277,16 +273,16 @@ def _check_const(sparse, name): assert result.name == name # use passed name - result = SparseSeries(sparse, name='x') + result = SparseSeries(sparse, name="x") tm.assert_sp_series_equal(result, sparse, check_names=False) - assert result.name == 'x' + assert result.name == "x" - _check_const(self.bseries, 'bseries') - _check_const(self.iseries, 'iseries') - _check_const(self.zbseries, 'zbseries') + _check_const(self.bseries, "bseries") + _check_const(self.iseries, "iseries") + _check_const(self.zbseries, "zbseries") # Sparse time series works - date_index = 
bdate_range('1/1/2000', periods=len(self.bseries)) + date_index = bdate_range("1/1/2000", periods=len(self.bseries)) s5 = SparseSeries(self.bseries, index=date_index) assert isinstance(s5, SparseSeries) @@ -303,16 +299,15 @@ def _check_const(sparse, name): assert values[0] == 97 assert len(sp) == 20 - assert sp.shape == (20, ) + assert sp.shape == (20,) # but can make it copy! - sp = SparseSeries(values, sparse_index=self.bseries.sp_index, - copy=True) + sp = SparseSeries(values, sparse_index=self.bseries.sp_index, copy=True) sp.sp_values[:5] = 100 assert values[0] == 97 assert len(sp) == 20 - assert sp.shape == (20, ) + assert sp.shape == (20,) def test_constructor_scalar(self): data = 5 @@ -324,7 +319,7 @@ def test_constructor_scalar(self): data = np.nan sp = SparseSeries(data, np.arange(100)) assert len(sp) == 100 - assert sp.shape == (100, ) + assert sp.shape == (100,) def test_constructor_ndarray(self): pass @@ -334,13 +329,13 @@ def test_constructor_nonnan(self): sp_series = SparseSeries(arr, fill_value=0) tm.assert_numpy_array_equal(sp_series.values.to_dense(), np.array(arr)) assert len(sp_series) == 5 - assert sp_series.shape == (5, ) + assert sp_series.shape == (5,) def test_constructor_empty(self): # see gh-9272 sp = SparseSeries() assert len(sp.index) == 0 - assert sp.shape == (0, ) + assert sp.shape == (0,) def test_copy_astype(self): cop = self.bseries.astype(np.float64) @@ -372,30 +367,28 @@ def test_copy_astype(self): def test_shape(self): # see gh-10452 - assert self.bseries.shape == (20, ) - assert self.btseries.shape == (20, ) - assert self.iseries.shape == (20, ) + assert self.bseries.shape == (20,) + assert self.btseries.shape == (20,) + assert self.iseries.shape == (20,) - assert self.bseries2.shape == (15, ) - assert self.iseries2.shape == (15, ) + assert self.bseries2.shape == (15,) + assert self.iseries2.shape == (15,) - assert self.zbseries2.shape == (15, ) - assert self.ziseries2.shape == (15, ) + assert self.zbseries2.shape == (15,) + assert self.ziseries2.shape == (15,) def test_astype(self): result = self.bseries.astype(SparseDtype(np.int64, 0)) - expected = (self.bseries.to_dense() - .fillna(0) - .astype(np.int64) - .to_sparse(fill_value=0)) + expected = ( + self.bseries.to_dense().fillna(0).astype(np.int64).to_sparse(fill_value=0) + ) tm.assert_sp_series_equal(result, expected) def test_astype_all(self): orig = pd.Series(np.array([1, 2, 3])) s = SparseSeries(orig) - types = [np.float64, np.float32, np.int64, - np.int32, np.int16, np.int8] + types = [np.float64, np.float32, np.int64, np.int32, np.int16, np.int8] for typ in types: dtype = SparseDtype(typ) res = s.astype(dtype) @@ -403,25 +396,24 @@ def test_astype_all(self): tm.assert_series_equal(res.to_dense(), orig.astype(typ)) def test_kind(self): - assert self.bseries.kind == 'block' - assert self.iseries.kind == 'integer' + assert self.bseries.kind == "block" + assert self.iseries.kind == "integer" def test_to_frame(self): # GH 9850 - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x') - exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]}) + s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x") + exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}) tm.assert_sp_frame_equal(s.to_frame(), exp) - exp = pd.SparseDataFrame({'y': [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_sp_frame_equal(s.to_frame(name='y'), exp) + exp = pd.SparseDataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) + tm.assert_sp_frame_equal(s.to_frame(name="y"), exp) - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x', 
fill_value=0) - exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]}, - default_fill_value=0) + s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x", fill_value=0) + exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}, default_fill_value=0) tm.assert_sp_frame_equal(s.to_frame(), exp) - exp = pd.DataFrame({'y': [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_frame_equal(s.to_frame(name='y').to_dense(), exp) + exp = pd.DataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) + tm.assert_frame_equal(s.to_frame(name="y").to_dense(), exp) def test_pickle(self): def _test_roundtrip(series): @@ -477,24 +469,20 @@ def test_get_get_value(self): expected = self.btseries.to_dense()[dt] tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - tm.assert_almost_equal( - self.bseries.get_value(10), self.bseries[10]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + tm.assert_almost_equal(self.bseries.get_value(10), self.bseries[10]) def test_set_value(self): idx = self.btseries.index[7] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.btseries.set_value(idx, 0) assert self.btseries[idx] == 0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.iseries.set_value('foobar', 0) - assert self.iseries.index[-1] == 'foobar' - assert self.iseries['foobar'] == 0 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.iseries.set_value("foobar", 0) + assert self.iseries.index[-1] == "foobar" + assert self.iseries["foobar"] == 0 def test_getitem_slice(self): idx = self.bseries.index @@ -523,10 +511,9 @@ def _compare(idx): dense_result = dense.take(idx).values sparse_result = sp.take(idx) assert isinstance(sparse_result, SparseSeries) - tm.assert_almost_equal(dense_result, - sparse_result.values.to_dense()) + tm.assert_almost_equal(dense_result, sparse_result.values.to_dense()) - _compare([1., 2., 3., 4., 5., 0.]) + _compare([1.0, 2.0, 3.0, 4.0, 5.0, 0.0]) _compare([7, 2, 9, 0, 4]) _compare([3, 6, 3, 4, 7]) @@ -546,8 +533,10 @@ def test_numpy_take(self): sp = SparseSeries([1.0, 2.0, 3.0]) indices = [1, 2] - tm.assert_series_equal(np.take(sp, indices, axis=0).to_dense(), - np.take(sp.to_dense(), indices, axis=0)) + tm.assert_series_equal( + np.take(sp, indices, axis=0).to_dense(), + np.take(sp.to_dense(), indices, axis=0), + ) msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): @@ -555,20 +544,20 @@ def test_numpy_take(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.take(sp, indices, out=None, mode='clip') + np.take(sp, indices, out=None, mode="clip") def test_setitem(self): - self.bseries[5] = 7. - assert self.bseries[5] == 7. + self.bseries[5] = 7.0 + assert self.bseries[5] == 7.0 def test_setslice(self): - self.bseries[5:10] = 7. 
- tm.assert_series_equal(self.bseries[5:10].to_dense(), - Series(7., index=range(5, 10), - name=self.bseries.name)) + self.bseries[5:10] = 7.0 + tm.assert_series_equal( + self.bseries[5:10].to_dense(), + Series(7.0, index=range(5, 10), name=self.bseries.name), + ) def test_operators(self): - def _check_op(a, b, op): sp_result = op(a, b) adense = a.to_dense() if isinstance(a, SparseSeries) else a @@ -617,6 +606,7 @@ def test_binary_operators(self): # skipping for now ##### import pytest + pytest.skip("skipping sparse binary operators test") def _check_inplace_op(iop, op): @@ -626,19 +616,21 @@ def _check_inplace_op(iop, op): iop(tmp, self.bseries) tm.assert_sp_series_equal(tmp, expected) - inplace_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow'] + inplace_ops = ["add", "sub", "mul", "truediv", "floordiv", "pow"] for op in inplace_ops: - _check_inplace_op(getattr(operator, "i%s" % op), - getattr(operator, op)) - - @pytest.mark.parametrize("values, op, fill_value", [ - ([True, False, False, True], operator.invert, True), - ([True, False, False, True], operator.invert, False), - ([0, 1, 2, 3], operator.pos, 0), - ([0, 1, 2, 3], operator.neg, 0), - ([0, np.nan, 2, 3], operator.pos, np.nan), - ([0, np.nan, 2, 3], operator.neg, np.nan), - ]) + _check_inplace_op(getattr(operator, "i%s" % op), getattr(operator, op)) + + @pytest.mark.parametrize( + "values, op, fill_value", + [ + ([True, False, False, True], operator.invert, True), + ([True, False, False, True], operator.invert, False), + ([0, 1, 2, 3], operator.pos, 0), + ([0, 1, 2, 3], operator.neg, 0), + ([0, np.nan, 2, 3], operator.pos, np.nan), + ([0, np.nan, 2, 3], operator.neg, np.nan), + ], + ) def test_unary_operators(self, values, op, fill_value): # https://github.com/pandas-dev/pandas/issues/22835 values = np.asarray(values) @@ -646,46 +638,48 @@ def test_unary_operators(self, values, op, fill_value): new_fill_value = not fill_value else: new_fill_value = op(fill_value) - s = SparseSeries(values, - fill_value=fill_value, - index=['a', 'b', 'c', 'd'], - name='name') + s = SparseSeries( + values, fill_value=fill_value, index=["a", "b", "c", "d"], name="name" + ) result = op(s) - expected = SparseSeries(op(values), - fill_value=new_fill_value, - index=['a', 'b', 'c', 'd'], - name='name') + expected = SparseSeries( + op(values), + fill_value=new_fill_value, + index=["a", "b", "c", "d"], + name="name", + ) tm.assert_sp_series_equal(result, expected) def test_abs(self): - s = SparseSeries([1, 2, -3], name='x') - expected = SparseSeries([1, 2, 3], name='x') + s = SparseSeries([1, 2, -3], name="x") + expected = SparseSeries([1, 2, 3], name="x") result = s.abs() tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = np.abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" - s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x') - expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index, - fill_value=2, name='x') + s = SparseSeries([1, -2, 2, -3], fill_value=-2, name="x") + expected = SparseSeries( + [1, 2, 3], sparse_index=s.sp_index, fill_value=2, name="x" + ) result = s.abs() tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = np.abs(s) 
tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" def test_reindex(self): def _compare_with_series(sps, new_index): @@ -720,19 +714,20 @@ def _compare_with_series(sps, new_index): # with copy=False reindexed = self.bseries.reindex(self.bseries.index, copy=True) - reindexed.sp_values[:] = 1. - assert (self.bseries.sp_values != 1.).all() + reindexed.sp_values[:] = 1.0 + assert (self.bseries.sp_values != 1.0).all() reindexed = self.bseries.reindex(self.bseries.index, copy=False) - reindexed.sp_values[:] = 1. - tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1., 10)) + reindexed.sp_values[:] = 1.0 + tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1.0, 10)) def test_sparse_reindex(self): length = 10 def _check(values, index1, index2, fill_value): - first_series = SparseSeries(values, sparse_index=index1, - fill_value=fill_value) + first_series = SparseSeries( + values, sparse_index=index1, fill_value=fill_value + ) reindexed = first_series.sparse_reindex(index2) assert reindexed.sp_index is index2 @@ -762,7 +757,7 @@ def _check_all(values, first, second): _check_with_fill_value(values, first, second, fill_value=0) index1 = [2, 4, 5, 6, 8, 9] - values1 = np.arange(6.) + values1 = np.arange(6.0) _check_all(values1, index1, [2, 4, 5]) _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) @@ -770,11 +765,10 @@ def _check_all(values, first, second): _check_all(values1, index1, [0, 1, 7, 8, 9]) _check_all(values1, index1, []) - first_series = SparseSeries(values1, - sparse_index=IntIndex(length, index1), - fill_value=nan) - with pytest.raises(TypeError, - match='new index must be a SparseIndex'): + first_series = SparseSeries( + values1, sparse_index=IntIndex(length, index1), fill_value=nan + ) + with pytest.raises(TypeError, match="new index must be a SparseIndex"): first_series.sparse_reindex(0) def test_repr(self): @@ -801,7 +795,7 @@ def _compare_with_dense(obj, op): dense_result = getattr(series, op)() assert sparse_result == dense_result - to_compare = ['count', 'sum', 'mean', 'std', 'var', 'skew'] + to_compare = ["count", "sum", "mean", "std", "var", "skew"] def _compare_all(obj): for op in to_compare: @@ -833,7 +827,7 @@ def test_dropna(self): expected = sp.to_dense().dropna() expected = expected[expected != 0] - exp_arr = pd.SparseArray(expected.values, fill_value=0, kind='block') + exp_arr = pd.SparseArray(expected.values, fill_value=0, kind="block") tm.assert_sp_array_equal(sp_valid.values, exp_arr) tm.assert_index_equal(sp_valid.index, expected.index) assert len(sp_valid.sp_values) == 2 @@ -845,18 +839,24 @@ def test_dropna(self): def test_homogenize(self): def _check_matches(indices, expected): - data = {i: SparseSeries(idx.to_int_index().indices, - sparse_index=idx, fill_value=np.nan) - for i, idx in enumerate(indices)} + data = { + i: SparseSeries( + idx.to_int_index().indices, sparse_index=idx, fill_value=np.nan + ) + for i, idx in enumerate(indices) + } # homogenized is only valid with NaN fill values homogenized = spf.homogenize(data) for k, v in homogenized.items(): - assert (v.sp_index.equals(expected)) + assert v.sp_index.equals(expected) - indices1 = [BlockIndex(10, [2], [7]), BlockIndex(10, [1, 6], [3, 4]), - BlockIndex(10, [0], [10])] + indices1 = [ + BlockIndex(10, [2], [7]), + BlockIndex(10, [1, 6], [3, 4]), + BlockIndex(10, [0], [10]), + ] expected1 = BlockIndex(10, [2, 6], [2, 3]) _check_matches(indices1, expected1) @@ -865,8 +865,7 @@ def _check_matches(indices, expected): 
_check_matches(indices2, expected2) # must have NaN fill value - data = {'a': SparseSeries(np.arange(7), sparse_index=expected2, - fill_value=0)} + data = {"a": SparseSeries(np.arange(7), sparse_index=expected2, fill_value=0)} with pytest.raises(TypeError, match="NaN fill value"): spf.homogenize(data) @@ -892,7 +891,7 @@ def test_fill_value_when_combine_const(self): tm.assert_series_equal(res, exp) def test_shift(self): - series = SparseSeries([nan, 1., 2., 3., nan, nan], index=np.arange(6)) + series = SparseSeries([nan, 1.0, 2.0, 3.0, nan, nan], index=np.arange(6)) shifted = series.shift(0) # assert shifted is not series @@ -904,9 +903,10 @@ def test_shift(self): f = lambda s: s.shift(-2) _dense_series_compare(series, f) - series = SparseSeries([nan, 1., 2., 3., nan, nan], - index=bdate_range('1/1/2000', periods=6)) - f = lambda s: s.shift(2, freq='B') + series = SparseSeries( + [nan, 1.0, 2.0, 3.0, nan, nan], index=bdate_range("1/1/2000", periods=6) + ) + f = lambda s: s.shift(2, freq="B") _dense_series_compare(series, f) f = lambda s: s.shift(2, freq=BDay()) @@ -917,14 +917,18 @@ def test_shift_nan(self): orig = pd.Series([np.nan, 2, np.nan, 4, 0, np.nan, 0]) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(2), orig.shift(2).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(3), orig.shift(3).to_sparse(), - check_kind=False) + tm.assert_sp_series_equal( + sparse.shift(0), orig.shift(0).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(1), orig.shift(1).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(2), orig.shift(2).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(3), orig.shift(3).to_sparse(), check_kind=False + ) tm.assert_sp_series_equal(sparse.shift(-1), orig.shift(-1).to_sparse()) tm.assert_sp_series_equal(sparse.shift(-2), orig.shift(-2).to_sparse()) @@ -933,31 +937,30 @@ def test_shift_nan(self): sparse = orig.to_sparse(fill_value=0) tm.assert_sp_series_equal( - sparse.shift(0), - orig.shift(0).to_sparse(fill_value=sparse.fill_value) + sparse.shift(0), orig.shift(0).to_sparse(fill_value=sparse.fill_value) + ) + tm.assert_sp_series_equal( + sparse.shift(1), orig.shift(1).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(2), orig.shift(2).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(3), orig.shift(3).to_sparse(fill_value=0), check_kind=False + ) + + tm.assert_sp_series_equal( + sparse.shift(-1), orig.shift(-1).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-2), orig.shift(-2).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-3), orig.shift(-3).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-4), orig.shift(-4).to_sparse(fill_value=0), check_kind=False ) - tm.assert_sp_series_equal(sparse.shift(1), - orig.shift(1).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(2), - orig.shift(2).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(3), - orig.shift(3).to_sparse(fill_value=0), - check_kind=False) - - tm.assert_sp_series_equal(sparse.shift(-1), - orig.shift(-1).to_sparse(fill_value=0), - 
check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-2), - orig.shift(-2).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-3), - orig.shift(-3).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-4), - orig.shift(-4).to_sparse(fill_value=0), - check_kind=False) def test_shift_dtype(self): # GH 12908 @@ -967,50 +970,50 @@ def test_shift_dtype(self): tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) sparse = orig.to_sparse(fill_value=np.nan) - tm.assert_sp_series_equal(sparse.shift(0), - orig.shift(0).to_sparse(fill_value=np.nan)) + tm.assert_sp_series_equal( + sparse.shift(0), orig.shift(0).to_sparse(fill_value=np.nan) + ) # shift(1) or more span changes dtype to float64 # XXX: SparseSeries doesn't need to shift dtype here. # Do we want to astype in shift, for backwards compat? # If not, document it. - tm.assert_sp_series_equal(sparse.shift(1).astype('f8'), - orig.shift(1).to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.shift(2).astype('f8'), - orig.shift(2).to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.shift(3).astype('f8'), - orig.shift(3).to_sparse(kind='integer')) - - tm.assert_sp_series_equal(sparse.shift(-1).astype('f8'), - orig.shift(-1).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-2).astype('f8'), - orig.shift(-2).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-3).astype('f8'), - orig.shift(-3).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-4).astype('f8'), - orig.shift(-4).to_sparse(), - check_kind=False) - - @pytest.mark.parametrize("fill_value", [ - 0, - 1, - np.nan - ]) + tm.assert_sp_series_equal( + sparse.shift(1).astype("f8"), orig.shift(1).to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.shift(2).astype("f8"), orig.shift(2).to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.shift(3).astype("f8"), orig.shift(3).to_sparse(kind="integer") + ) + + tm.assert_sp_series_equal( + sparse.shift(-1).astype("f8"), orig.shift(-1).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-2).astype("f8"), orig.shift(-2).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-3).astype("f8"), orig.shift(-3).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-4).astype("f8"), orig.shift(-4).to_sparse(), check_kind=False + ) + + @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) @pytest.mark.parametrize("periods", [0, 1, 2, 3, -1, -2, -3, -4]) def test_shift_dtype_fill_value(self, fill_value, periods): # GH 12908 - orig = pd.Series([1, 0, 0, 4], dtype=np.dtype('int64')) + orig = pd.Series([1, 0, 0, 4], dtype=np.dtype("int64")) sparse = orig.to_sparse(fill_value=fill_value) result = sparse.shift(periods) expected = orig.shift(periods).to_sparse(fill_value=fill_value) - tm.assert_sp_series_equal(result, expected, - check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_series_equal( + result, expected, check_kind=False, consolidate_block_indices=True + ) def test_combine_first(self): s = self.bseries @@ -1024,8 +1027,8 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) - @pytest.mark.parametrize('deep', [True, False]) - @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + @pytest.mark.parametrize("deep", [True, False]) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan, None]) def 
test_memory_usage_deep(self, deep, fill_value): values = [1.0] + [fill_value] * 20 sparse_series = SparseSeries(values, fill_value=fill_value) @@ -1039,35 +1042,38 @@ def test_memory_usage_deep(self, deep, fill_value): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseHandlingMultiIndexes: - def setup_method(self, method): miindex = pd.MultiIndex.from_product( - [["x", "y"], ["10", "20"]], names=['row-foo', 'row-bar']) + [["x", "y"], ["10", "20"]], names=["row-foo", "row-bar"] + ) micol = pd.MultiIndex.from_product( - [['a', 'b', 'c'], ["1", "2"]], names=['col-foo', 'col-bar']) - dense_multiindex_frame = pd.DataFrame( - index=miindex, columns=micol).sort_index().sort_index(axis=1) + [["a", "b", "c"], ["1", "2"]], names=["col-foo", "col-bar"] + ) + dense_multiindex_frame = ( + pd.DataFrame(index=miindex, columns=micol).sort_index().sort_index(axis=1) + ) self.dense_multiindex_frame = dense_multiindex_frame.fillna(value=3.14) def test_to_sparse_preserve_multiindex_names_columns(self): sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() sparse_multiindex_frame = sparse_multiindex_frame.copy() - tm.assert_index_equal(sparse_multiindex_frame.columns, - self.dense_multiindex_frame.columns) + tm.assert_index_equal( + sparse_multiindex_frame.columns, self.dense_multiindex_frame.columns + ) def test_round_trip_preserve_multiindex_names(self): sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() round_trip_multiindex_frame = sparse_multiindex_frame.to_dense() - tm.assert_frame_equal(self.dense_multiindex_frame, - round_trip_multiindex_frame, - check_column_type=True, - check_names=True) + tm.assert_frame_equal( + self.dense_multiindex_frame, + round_trip_multiindex_frame, + check_column_type=True, + check_names=True, + ) @td.skip_if_no_scipy -@pytest.mark.filterwarnings( - "ignore:the matrix subclass:PendingDeprecationWarning" -) +@pytest.mark.filterwarnings("ignore:the matrix subclass:PendingDeprecationWarning") @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesScipyInteraction: @@ -1075,72 +1081,89 @@ class TestSparseSeriesScipyInteraction: def setup_method(self, method): import scipy.sparse + # SparseSeries inputs used in tests, the tests rely on the order self.sparse_series = [] s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) - s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) + s.index = pd.MultiIndex.from_tuples( + [ + (1, 2, "a", 0), + (1, 2, "a", 1), + (1, 1, "b", 0), + (1, 1, "b", 1), + (2, 1, "b", 0), + (2, 1, "b", 1), + ], + names=["A", "B", "C", "D"], + ) self.sparse_series.append(s.to_sparse()) ss = self.sparse_series[0].copy() ss.index.names = [3, 0, 1, 2] self.sparse_series.append(ss) - ss = pd.Series([ - nan - ] * 12, index=cartesian_product((range(3), range(4)))).to_sparse() + ss = pd.Series( + [nan] * 12, index=cartesian_product((range(3), range(4))) + ).to_sparse() for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): ss[k] = v self.sparse_series.append(ss) # results used in tests self.coo_matrices = [] - self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4))) - self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))) - 
self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2))) - self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)], - [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b')]] - self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)], [0, 1]] + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4) + ) + ) + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) + ) + ) + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2) + ) + ) + self.ils = [ + [(1, 2), (1, 1), (2, 1)], + [(1, 1), (1, 2), (2, 1)], + [(1, 2, "a"), (1, 1, "b"), (2, 1, "b")], + ] + self.jls = [[("a", 0), ("a", 1), ("b", 0), ("b", 1)], [0, 1]] def test_to_coo_text_names_integer_row_levels_nosort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': [0, 1], 'column_levels': [2, 3]} + kwargs = {"row_levels": [0, 1], "column_levels": [2, 3]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_integer_row_levels_sort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': [0, 1], - 'column_levels': [2, 3], - 'sort_labels': True} + kwargs = {"row_levels": [0, 1], "column_levels": [2, 3], "sort_labels": True} result = (self.coo_matrices[1], self.ils[1], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_text_row_levels_nosort_col_level_single(self): ss = self.sparse_series[0] - kwargs = {'row_levels': ['A', 'B', 'C'], - 'column_levels': ['D'], - 'sort_labels': False} + kwargs = { + "row_levels": ["A", "B", "C"], + "column_levels": ["D"], + "sort_labels": False, + } result = (self.coo_matrices[2], self.ils[2], self.jls[1]) self._run_test(ss, kwargs, result) def test_to_coo_integer_names_integer_row_levels_nosort(self): ss = self.sparse_series[1] - kwargs = {'row_levels': [3, 0], 'column_levels': [1, 2]} + kwargs = {"row_levels": [3, 0], "column_levels": [1, 2]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_text_row_levels_nosort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': ['A', 'B'], 'column_levels': ['C', 'D']} + kwargs = {"row_levels": ["A", "B"], "column_levels": ["C", "D"]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) @@ -1148,13 +1171,13 @@ def test_to_coo_bad_partition_nonnull_intersection(self): ss = self.sparse_series[0] msg = "Is not a partition because intersection is not null" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A', 'B', 'C'], ['C', 'D']) + ss.to_coo(["A", "B", "C"], ["C", "D"]) def test_to_coo_bad_partition_small_union(self): ss = self.sparse_series[0] msg = "Is not a partition because union is not the whole" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A'], ['C', 'D']) + ss.to_coo(["A"], ["C", "D"]) def test_to_coo_nlevels_less_than_two(self): ss = self.sparse_series[0] @@ -1166,15 +1189,13 @@ def test_to_coo_nlevels_less_than_two(self): def test_to_coo_bad_ilevel(self): ss = self.sparse_series[0] with pytest.raises(KeyError, match="Level E not found"): - ss.to_coo(['A', 'B'], ['C', 'D', 'E']) + ss.to_coo(["A", "B"], ["C", "D", "E"]) def test_to_coo_duplicate_index_entries(self): - ss = pd.concat([self.sparse_series[0], - self.sparse_series[0]]).to_sparse() - msg = ("Duplicate index entries are not allowed in to_coo" - " transformation") + ss 
= pd.concat([self.sparse_series[0], self.sparse_series[0]]).to_sparse() + msg = "Duplicate index entries are not allowed in to_coo" " transformation" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A', 'B'], ['C', 'D']) + ss.to_coo(["A", "B"], ["C", "D"]) def test_from_coo_dense_index(self): ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True) @@ -1201,8 +1222,8 @@ def _run_test(self, ss, kwargs, check): # for every test, also test symmetry property (transpose), switch # row_levels and column_levels d = kwargs.copy() - d['row_levels'] = kwargs['column_levels'] - d['column_levels'] = kwargs['row_levels'] + d["row_levels"] = kwargs["column_levels"] + d["column_levels"] = kwargs["row_levels"] results = ss.to_coo(**d) results = (results[0].T, results[2], results[1]) self._check_results_to_coo(results, check) @@ -1222,34 +1243,32 @@ def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind) + sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, fill_value=0, kind=kind) - tm.assert_sp_series_equal(res, exp, - consolidate_block_indices=True) + tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) def test_concat_axis1(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y') + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y") res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) exp = pd.SparseDataFrame(exp) tm.assert_sp_frame_equal(res, exp) @@ -1257,19 +1276,21 @@ def test_concat_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = 
pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) exp = pd.SparseSeries(exp, kind=kind, fill_value=0) @@ -1279,12 +1300,11 @@ def test_concat_axis1_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) @@ -1292,21 +1312,23 @@ def test_concat_different_kind(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind='integer') - sparse2 = pd.SparseSeries(val2, name='y', kind='block', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x", kind="integer") + sparse2 = pd.SparseSeries(val2, name="y", kind="block", fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind='integer') + exp = pd.SparseSeries(exp, kind="integer") tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind='block', fill_value=0) + exp = pd.SparseSeries(exp, kind="block", fill_value=0) tm.assert_sp_series_equal(res, exp) def test_concat_sparse_dense(self): @@ -1314,9 +1336,9 @@ def test_concat_sparse_dense(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse = pd.SparseSeries(val1, name='x', kind=kind) - dense = pd.Series(val2, name='y') + for kind in ["integer", "block"]: + sparse = pd.SparseSeries(val1, name="x", kind=kind) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.concat([pd.Series(val1), dense]) @@ -1328,8 +1350,8 @@ def test_concat_sparse_dense(self): exp = exp.astype("Sparse") tm.assert_series_equal(res, exp) - sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.concat([pd.Series(val1), dense]) @@ -1343,96 +1365,96 @@ def test_concat_sparse_dense(self): def test_value_counts(self): vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') + dense = pd.Series(vals, name="xx") - sparse = pd.SparseSeries(vals, name='xx') - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx") + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), 
dense.value_counts(dropna=False) + ) - sparse = pd.SparseSeries(vals, name='xx', fill_value=0) - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx", fill_value=0) + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_value_counts_dup(self): vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] # numeric op may cause sp_values to include the same value as # fill_value - dense = pd.Series(vals, name='xx') / 0. - sparse = pd.SparseSeries(vals, name='xx') / 0. - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + dense = pd.Series(vals, name="xx") / 0.0 + sparse = pd.SparseSeries(vals, name="xx") / 0.0 + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) vals = [1, 2, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') * 0. - sparse = pd.SparseSeries(vals, name='xx') * 0. - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + dense = pd.Series(vals, name="xx") * 0.0 + sparse = pd.SparseSeries(vals, name="xx") * 0.0 + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_value_counts_int(self): vals = [1, 2, 0, 1, 2, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') + dense = pd.Series(vals, name="xx") # fill_value is np.nan, but should not be included in the result - sparse = pd.SparseSeries(vals, name='xx') - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) - - sparse = pd.SparseSeries(vals, name='xx', fill_value=0) - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx") + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) + + sparse = pd.SparseSeries(vals, name="xx", fill_value=0) + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_isna(self): # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") res = s.isna() - exp = pd.SparseSeries([True, True, False, False, True], name='xxx', - fill_value=True) + exp = pd.SparseSeries( + [True, True, False, False, True], name="xxx", fill_value=True + ) tm.assert_sp_series_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', - fill_value=0.) 
+ s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) res = s.isna() assert isinstance(res, pd.SparseSeries) - exp = pd.Series([True, False, False, False, False], name='xxx') + exp = pd.Series([True, False, False, False, False], name="xxx") tm.assert_series_equal(res.to_dense(), exp) def test_notna(self): # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") res = s.notna() - exp = pd.SparseSeries([False, False, True, True, False], name='xxx', - fill_value=False) + exp = pd.SparseSeries( + [False, False, True, True, False], name="xxx", fill_value=False + ) tm.assert_sp_series_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', - fill_value=0.) + s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) res = s.notna() assert isinstance(res, pd.SparseSeries) - exp = pd.Series([False, True, True, True, True], name='xxx') + exp = pd.Series([False, True, True, True, True], name="xxx") tm.assert_series_equal(res.to_dense(), exp) def _dense_series_compare(s, f): result = f(s) - assert (isinstance(result, SparseSeries)) + assert isinstance(result, SparseSeries) dense_result = f(s.to_dense()) tm.assert_series_equal(result.to_dense(), dense_result) @@ -1440,15 +1462,14 @@ def _dense_series_compare(s, f): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesAnalytics: - def setup_method(self, method): arr, index = _test_data1() - self.bseries = SparseSeries(arr, index=index, kind='block', - name='bseries') + self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") arr, index = _test_data1_zero() - self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0, name='zbseries') + self.zbseries = SparseSeries( + arr, index=index, kind="block", fill_value=0, name="zbseries" + ) def test_cumsum(self): result = self.bseries.cumsum() @@ -1484,38 +1505,47 @@ def test_numpy_cumsum(self): def test_numpy_func_call(self): # no exception should be raised even though # numpy passes in 'axis=None' or `axis=-1' - funcs = ['sum', 'cumsum', 'var', 'mean', - 'prod', 'cumprod', 'std', 'argsort', - 'min', 'max'] + funcs = [ + "sum", + "cumsum", + "var", + "mean", + "prod", + "cumprod", + "std", + "argsort", + "min", + "max", + ] for func in funcs: - for series in ('bseries', 'zbseries'): + for series in ("bseries", "zbseries"): getattr(np, func)(getattr(self, series)) def test_deprecated_numpy_func_call(self): # NOTE: These should be add to the 'test_numpy_func_call' test above # once the behavior of argmin/argmax is corrected. 
- funcs = ['argmin', 'argmax'] + funcs = ["argmin", "argmax"] for func in funcs: - for series in ('bseries', 'zbseries'): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + for series in ("bseries", "zbseries"): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): getattr(np, func)(getattr(self, series)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): getattr(getattr(self, series), func)() @pytest.mark.parametrize( - 'datetime_type', (np.datetime64, - pd.Timestamp, - lambda x: datetime.strptime(x, '%Y-%m-%d'))) + "datetime_type", + (np.datetime64, pd.Timestamp, lambda x: datetime.strptime(x, "%Y-%m-%d")), +) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_constructor_dict_datetime64_index(datetime_type): # GH 9456 - dates = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] values = [42544017.198965244, 1234565, 40512335.181958228, -1] result = SparseSeries(dict(zip(map(datetime_type, dates), values))) @@ -1542,8 +1572,7 @@ def test_deprecated_to_sparse(): ser = Series([1, np.nan, 3]) sparse_ser = pd.SparseSeries([1, np.nan, 3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.to_sparse() tm.assert_series_equal(result, sparse_ser) diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 4fed878a10ca64..d7295c4bfe5f03 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -10,7 +10,7 @@ class TestSparseArrayConcat: - @pytest.mark.parametrize('kind', ['integer', 'block']) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_basic(self, kind): a = pd.SparseArray([1, 0, 0, 2], kind=kind) b = pd.SparseArray([1, 0, 2, 2], kind=kind) @@ -19,43 +19,39 @@ def test_basic(self, kind): # Can't make any assertions about the sparse index itself # since we aren't don't merge sparse blocs across arrays # in to_concat - expected = np.array([1, 2, 1, 2, 2], dtype='int64') + expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind - @pytest.mark.parametrize('kind', ['integer', 'block']) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_uses_first_kind(self, kind): - other = 'integer' if kind == 'block' else 'block' + other = "integer" if kind == "block" else "block" a = pd.SparseArray([1, 0, 0, 2], kind=kind) b = pd.SparseArray([1, 0, 2, 2], kind=other) result = pd.SparseArray._concat_same_type([a, b]) - expected = np.array([1, 2, 1, 2, 2], dtype='int64') + expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesConcat: - - @pytest.mark.parametrize('kind', [ - 'integer', - 'block', - ]) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_concat(self, kind): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', 
kind=kind) + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind) + sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) @@ -66,12 +62,11 @@ def test_concat_axis1(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y') + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y") res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) exp = pd.SparseDataFrame(exp) tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) @@ -79,20 +74,22 @@ def test_concat_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) @@ -103,12 +100,11 @@ def test_concat_axis1_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) @@ -116,8 +112,8 @@ def test_concat_different_kind(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind='integer') - sparse2 = pd.SparseSeries(val2, name='y', kind='block') + sparse1 = pd.SparseSeries(val1, name="x", kind="integer") + sparse2 = pd.SparseSeries(val2, name="y", kind="block") res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) @@ -129,17 +125,14 @@ def 
test_concat_different_kind(self): exp = pd.SparseSeries(exp, kind=sparse2.kind) tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - @pytest.mark.parametrize('kind', [ - 'integer', - 'block', - ]) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_concat_sparse_dense(self, kind): # use first input's fill_value val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse = pd.SparseSeries(val1, name='x', kind=kind) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind) @@ -148,23 +141,17 @@ def test_concat_sparse_dense(self, kind): res = pd.concat([dense, sparse, dense]) exp = pd.concat([dense, pd.Series(val1), dense]) # XXX: changed from SparseSeries to Series[sparse] - exp = pd.Series( - pd.SparseArray(exp, kind=kind), - index=exp.index, - name=exp.name, - ) + exp = pd.Series(pd.SparseArray(exp, kind=kind), index=exp.index, name=exp.name) tm.assert_series_equal(res, exp) - sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) # XXX: changed from SparseSeries to Series[sparse] exp = pd.concat([pd.Series(val1), dense]) exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), - index=exp.index, - name=exp.name, + pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name ) tm.assert_series_equal(res, exp) @@ -172,9 +159,7 @@ def test_concat_sparse_dense(self, kind): exp = pd.concat([dense, pd.Series(val1), dense]) # XXX: changed from SparseSeries to Series[sparse] exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), - index=exp.index, - name=exp.name, + pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name ) tm.assert_series_equal(res, exp) @@ -182,23 +167,34 @@ def test_concat_sparse_dense(self, kind): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameConcat: - def setup_method(self, method): - self.dense1 = pd.DataFrame({'A': [0., 1., 2., np.nan], - 'B': [0., 0., 0., 0.], - 'C': [np.nan, np.nan, np.nan, np.nan], - 'D': [1., 2., 3., 4.]}) + self.dense1 = pd.DataFrame( + { + "A": [0.0, 1.0, 2.0, np.nan], + "B": [0.0, 0.0, 0.0, 0.0], + "C": [np.nan, np.nan, np.nan, np.nan], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) - self.dense2 = pd.DataFrame({'A': [5., 6., 7., 8.], - 'B': [np.nan, 0., 7., 8.], - 'C': [5., 6., np.nan, np.nan], - 'D': [np.nan, np.nan, np.nan, np.nan]}) + self.dense2 = pd.DataFrame( + { + "A": [5.0, 6.0, 7.0, 8.0], + "B": [np.nan, 0.0, 7.0, 8.0], + "C": [5.0, 6.0, np.nan, np.nan], + "D": [np.nan, np.nan, np.nan, np.nan], + } + ) - self.dense3 = pd.DataFrame({'E': [5., 6., 7., 8.], - 'F': [np.nan, 0., 7., 8.], - 'G': [5., 6., np.nan, np.nan], - 'H': [np.nan, np.nan, np.nan, np.nan]}) + self.dense3 = pd.DataFrame( + { + "E": [5.0, 6.0, 7.0, 8.0], + "F": [np.nan, 0.0, 7.0, 8.0], + "G": [5.0, 6.0, np.nan, np.nan], + "H": [np.nan, np.nan, np.nan, np.nan], + } + ) def test_concat(self): # fill_value = np.nan @@ -250,14 +246,16 @@ def test_concat_different_fill_value(self): sparse = self.dense1.to_sparse() sparse2 = self.dense2.to_sparse(fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, 
- raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse, sparse2]) exp = pd.concat([self.dense1, self.dense2]).to_sparse() tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse]) exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) exp._default_fill_value = np.nan @@ -269,13 +267,13 @@ def test_concat_different_columns_sort_warns(self): # stacklevel is wrong since we have two FutureWarnings, # one for depr, one for sorting. - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): res = pd.concat([sparse, sparse3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False,): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): exp = pd.concat([self.dense1, self.dense3]) exp = exp.to_sparse() @@ -297,12 +295,13 @@ def test_concat_different_columns(self): def test_concat_bug(self): from pandas.core.sparse.api import SparseDtype - x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan], - fill_value=0)}) + + x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan], fill_value=0)}) y = pd.SparseDataFrame({"B": []}) - res = pd.concat([x, y], sort=False)[['A']] - exp = pd.DataFrame({"A": pd.SparseArray([np.nan, np.nan], - dtype=SparseDtype(float, 0))}) + res = pd.concat([x, y], sort=False)[["A"]] + exp = pd.DataFrame( + {"A": pd.SparseArray([np.nan, np.nan], dtype=SparseDtype(float, 0))} + ) tm.assert_frame_equal(res, exp) def test_concat_different_columns_buggy(self): @@ -310,19 +309,19 @@ def test_concat_different_columns_buggy(self): sparse3 = self.dense3.to_sparse(fill_value=0) res = pd.concat([sparse, sparse3], sort=True) - exp = (pd.concat([self.dense1, self.dense3], sort=True) - .to_sparse(fill_value=0)) + exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) res = pd.concat([sparse3, sparse], sort=True) - exp = (pd.concat([self.dense3, self.dense1], sort=True) - .to_sparse(fill_value=0)) + exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) # different fill values sparse = self.dense1.to_sparse() @@ -343,7 +342,7 @@ def test_concat_series(self): sparse = self.dense1.to_sparse() sparse2 = self.dense2.to_sparse() - for col in ['A', 'D']: + for col in ["A", "D"]: res = pd.concat([sparse, sparse2[col]]) exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse() tm.assert_sp_frame_equal(res, exp, check_kind=False) @@ -356,21 +355,21 @@ def test_concat_series(self): sparse = self.dense1.to_sparse(fill_value=0) sparse2 = self.dense2.to_sparse(fill_value=0) - for col in ['C', 'D']: + for col in ["C", "D"]: res = 
pd.concat([sparse, sparse2[col]]) - exp = pd.concat([self.dense1, - self.dense2[col]]).to_sparse(fill_value=0) + exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) res = pd.concat([sparse2[col], sparse]) - exp = pd.concat([self.dense2[col], - self.dense1]).to_sparse(fill_value=0) - exp['C'] = res['C'] + exp = pd.concat([self.dense2[col], self.dense1]).to_sparse(fill_value=0) + exp["C"] = res["C"] exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True, - check_kind=False) + tm.assert_sp_frame_equal( + res, exp, consolidate_block_indices=True, check_kind=False + ) def test_concat_axis1(self): # fill_value = np.nan @@ -391,14 +390,12 @@ def test_concat_axis1(self): sparse3 = self.dense3.to_sparse(fill_value=0) res = pd.concat([sparse, sparse3], axis=1) - exp = pd.concat([self.dense1, self.dense3], - axis=1).to_sparse(fill_value=0) + exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse(fill_value=0) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], - axis=1).to_sparse(fill_value=0) + exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse(fill_value=0) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -416,14 +413,16 @@ def test_concat_axis1(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', - itertools.product([None, 0, 1, np.nan], - [0, 1], - [1, 0])) + @pytest.mark.parametrize( + "fill_value,sparse_idx,dense_idx", + itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), + ) def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): frames = [self.dense1, self.dense2] - sparse_frame = [frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value)] + sparse_frame = [ + frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value), + ] dense_frame = [frames[dense_idx], frames[sparse_idx]] # This will try both directions sparse + dense and dense + sparse @@ -437,20 +436,21 @@ def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): sparse_frame = sparse_frame[::-1] dense_frame = dense_frame[::-1] - @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', - itertools.product([None, 0, 1, np.nan], - [0, 1], - [1, 0])) - @pytest.mark.xfail(reason="The iloc fails and I can't make expected", - strict=False) + @pytest.mark.parametrize( + "fill_value,sparse_idx,dense_idx", + itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), + ) + @pytest.mark.xfail(reason="The iloc fails and I can't make expected", strict=False) def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx): # See GH16874, GH18914 and #18686 for why this should be a DataFrame from pandas.core.dtypes.common import is_sparse frames = [self.dense1, self.dense3] - sparse_frame = [frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value)] + sparse_frame = [ + frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value), + ] dense_frame = [frames[dense_idx], frames[sparse_idx]] # This will try both directions sparse + dense and dense + sparse diff --git a/pandas/tests/sparse/test_format.py 
b/pandas/tests/sparse/test_format.py index 805f77eb21c2f3..cf8734910cd195 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -15,19 +15,20 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesFormatting: - @property def dtype_format_for_platform(self): - return '' if use_32bit_repr else ', dtype=int32' + return "" if use_32bit_repr else ", dtype=int32" def test_sparse_max_row(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() result = repr(s) dfm = self.dtype_format_for_platform - exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" - "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparsea_max_row_truncated(self): @@ -37,55 +38,63 @@ def test_sparsea_max_row_truncated(self): with option_context("display.max_rows", 3): # GH 10560 result = repr(s) - exp = ("0 1.0\n ... \n4 NaN\n" - "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "0 1.0\n ... \n4 NaN\n" + "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparse_mi_max_row(self): - idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), - ('C', 0), ('C', 1), ('C', 2)]) - s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], - index=idx).to_sparse() + idx = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1), ("C", 2)] + ) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], index=idx).to_sparse() result = repr(s) dfm = self.dtype_format_for_platform - exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" - "C 0 3.0\n 1 NaN\n 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp - with option_context("display.max_rows", 3, - "display.show_dimensions", False): + with option_context("display.max_rows", 3, "display.show_dimensions", False): # GH 13144 result = repr(s) - exp = ("A 0 1.0\n ... \nC 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "A 0 1.0\n ... 
\nC 2 NaN\n" + "dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparse_bool(self): # GH 13110 - s = pd.SparseSeries([True, False, False, True, False, False], - fill_value=False) + s = pd.SparseSeries([True, False, False, True, False, False], fill_value=False) result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 True\n1 False\n2 False\n" - "3 True\n4 False\n5 False\n" - "dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + dtype = "" if use_32bit_repr else ", dtype=int32" + exp = ( + "0 True\n1 False\n2 False\n" + "3 True\n4 False\n5 False\n" + "dtype: Sparse[bool, False]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp with option_context("display.max_rows", 3): result = repr(s) - exp = ("0 True\n ... \n5 False\n" - "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + exp = ( + "0 True\n ... \n5 False\n" + "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp def test_sparse_int(self): @@ -93,33 +102,39 @@ def test_sparse_int(self): s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n" - "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + dtype = "" if use_32bit_repr else ", dtype=int32" + exp = ( + "0 0\n1 1\n2 0\n3 0\n4 1\n" + "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp - with option_context("display.max_rows", 3, - "display.show_dimensions", False): + with option_context("display.max_rows", 3, "display.show_dimensions", False): result = repr(s) - exp = ("0 0\n ..\n5 0\n" - "dtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + exp = ( + "0 0\n ..\n5 0\n" + "dtype: Sparse[int64, False]\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameFormatting: - def test_sparse_frame(self): # GH 13110 - df = pd.DataFrame({'A': [True, False, True, False, True], - 'B': [True, False, True, False, True], - 'C': [0, 0, 3, 0, 5], - 'D': [np.nan, np.nan, np.nan, 1, 2]}) + df = pd.DataFrame( + { + "A": [True, False, True, False, True], + "B": [True, False, True, False, True], + "C": [0, 0, 3, 0, 5], + "D": [np.nan, np.nan, np.nan, 1, 2], + } + ) sparse = df.to_sparse() assert repr(sparse) == repr(df) @@ -132,7 +147,7 @@ def test_sparse_repr_after_set(self): res = sdf.copy() # Ignore the warning - with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): sdf[0][1] = 2 # This line triggers the bug repr(sdf) @@ -143,7 +158,7 @@ def test_repr_no_warning(): with warnings.catch_warnings(): warnings.simplefilter("ignore", 
FutureWarning) df = pd.SparseDataFrame({"A": [1, 2]}) - s = df['A'] + s = df["A"] with tm.assert_produces_warning(None): repr(df) diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index bf6055bc127259..04e49a272a77aa 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -8,22 +8,22 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseGroupBy: - def setup_method(self, method): - self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': [np.nan, np.nan, 1, 2, - np.nan, 1, np.nan, np.nan]}) + self.dense = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], + } + ) self.sparse = self.dense.to_sparse() def test_first_last_nth(self): # tests for first / last / nth - sparse_grouped = self.sparse.groupby('A') - dense_grouped = self.dense.groupby('A') + sparse_grouped = self.sparse.groupby("A") + dense_grouped = self.dense.groupby("A") sparse_grouped_first = sparse_grouped.first() sparse_grouped_last = sparse_grouped.last() @@ -33,16 +33,13 @@ def test_first_last_nth(self): dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - tm.assert_frame_equal(sparse_grouped_first, - dense_grouped_first) - tm.assert_frame_equal(sparse_grouped_last, - dense_grouped_last) - tm.assert_frame_equal(sparse_grouped_nth, - dense_grouped_nth) + tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) + tm.assert_frame_equal(sparse_grouped_last, dense_grouped_last) + tm.assert_frame_equal(sparse_grouped_nth, dense_grouped_nth) def test_aggfuncs(self): - sparse_grouped = self.sparse.groupby('A') - dense_grouped = self.dense.groupby('A') + sparse_grouped = self.sparse.groupby("A") + dense_grouped = self.dense.groupby("A") result = sparse_grouped.mean().to_sparse() expected = dense_grouped.mean().to_sparse() @@ -64,10 +61,13 @@ def test_aggfuncs(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_groupby_includes_fill_value(fill_value): # https://github.com/pandas-dev/pandas/issues/5078 - df = pd.DataFrame({'a': [fill_value, 1, fill_value, fill_value], - 'b': [fill_value, 1, fill_value, fill_value]}) + df = pd.DataFrame( + { + "a": [fill_value, 1, fill_value, fill_value], + "b": [fill_value, 1, fill_value, fill_value], + } + ) sdf = df.to_sparse(fill_value=fill_value) - result = sdf.groupby('a').sum() - expected = pd.DataFrame(df.groupby('a').sum().to_sparse( - fill_value=fill_value)) + result = sdf.groupby("a").sum() + expected = pd.DataFrame(df.groupby("a").sum().to_sparse(fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index df59f1dfe7b135..5cfacaf16cffe5 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -9,7 +9,6 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesIndexing: - def 
setup_method(self, method): self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) self.sparse = self.orig.to_sparse() @@ -51,16 +50,17 @@ def test_getitem_slice(self): def test_getitem_int_dtype(self): # GH 8292 - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx') + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name="xxx") res = s[::2] - exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx') + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name="xxx") tm.assert_sp_series_equal(res, exp) assert res.dtype == SparseDtype(np.int64) - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx') + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name="xxx") res = s[::2] - exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], - fill_value=0, name='xxx') + exp = pd.SparseSeries( + [0, 2, 4, 6], index=[0, 2, 4, 6], fill_value=0, name="xxx" + ) tm.assert_sp_series_equal(res, exp) assert res.dtype == SparseDtype(np.int64) @@ -102,14 +102,10 @@ def test_getitem_ellipsis(self): def test_getitem_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse[:2], - orig[:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[4:2], - orig[4:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[::2], - orig[::2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[-5:], - orig[-5:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse(fill_value=0)) def test_loc(self): orig = self.orig @@ -144,14 +140,14 @@ def test_loc(self): tm.assert_sp_series_equal(result, exp) def test_loc_index(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - assert sparse.loc['A'] == 1 - assert np.isnan(sparse.loc['B']) + assert sparse.loc["A"] == 1 + assert np.isnan(sparse.loc["B"]) - result = sparse.loc[['A', 'C', 'D']] - exp = orig.loc[['A', 'C', 'D']].to_sparse() + result = sparse.loc[["A", "C", "D"]] + exp = orig.loc[["A", "C", "D"]].to_sparse() tm.assert_sp_series_equal(result, exp) # dense array @@ -169,14 +165,14 @@ def test_loc_index(self): tm.assert_sp_series_equal(result, exp) def test_loc_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - assert sparse.loc['A'] == 1 - assert np.isnan(sparse.loc['B']) + assert sparse.loc["A"] == 1 + assert np.isnan(sparse.loc["B"]) - result = sparse.loc[['A', 'C', 'D']] - exp = orig.loc[['A', 'C', 'D']].to_sparse(fill_value=0) + result = sparse.loc[["A", "C", "D"]] + exp = orig.loc[["A", "C", "D"]].to_sparse(fill_value=0) tm.assert_sp_series_equal(result, exp) # dense array @@ -195,17 +191,17 @@ def test_loc_slice(self): tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) def test_loc_slice_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.loc['C':], - orig.loc['C':].to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.loc["C":], 
orig.loc["C":].to_sparse(fill_value=0) + ) def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.loc[2:], - orig.loc[2:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): orig = self.orig @@ -245,8 +241,9 @@ def test_iloc_slice(self): def test_iloc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.iloc[2:], - orig.iloc[2:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.iloc[2:], orig.iloc[2:].to_sparse(fill_value=0) + ) def test_at(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) @@ -257,24 +254,22 @@ def test_at(self): assert sparse.at[3] == orig.at[3] assert np.isnan(sparse.at[4]) - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('abcde')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("abcde")) sparse = orig.to_sparse() - assert sparse.at['a'] == orig.at['a'] - assert np.isnan(sparse.at['b']) - assert np.isnan(sparse.at['c']) - assert sparse.at['d'] == orig.at['d'] - assert np.isnan(sparse.at['e']) + assert sparse.at["a"] == orig.at["a"] + assert np.isnan(sparse.at["b"]) + assert np.isnan(sparse.at["c"]) + assert sparse.at["d"] == orig.at["d"] + assert np.isnan(sparse.at["e"]) def test_at_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], - index=list('abcde')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("abcde")) sparse = orig.to_sparse(fill_value=0) - assert sparse.at['a'] == orig.at['a'] - assert np.isnan(sparse.at['b']) - assert sparse.at['c'] == orig.at['c'] - assert sparse.at['d'] == orig.at['d'] - assert sparse.at['e'] == orig.at['e'] + assert sparse.at["a"] == orig.at["a"] + assert np.isnan(sparse.at["b"]) + assert sparse.at["c"] == orig.at["c"] + assert sparse.at["d"] == orig.at["d"] + assert sparse.at["e"] == orig.at["e"] def test_iat(self): orig = self.orig @@ -307,38 +302,37 @@ def test_get(self): assert np.isnan(s.get(1)) assert s.get(5) is None - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE')) - assert s.get('A') == 1 - assert np.isnan(s.get('B')) - assert s.get('C') == 0 - assert s.get('XX') is None + s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE")) + assert s.get("A") == 1 + assert np.isnan(s.get("B")) + assert s.get("C") == 0 + assert s.get("XX") is None - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'), - fill_value=0) - assert s.get('A') == 1 - assert np.isnan(s.get('B')) - assert s.get('C') == 0 - assert s.get('XX') is None + s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE"), fill_value=0) + assert s.get("A") == 1 + assert np.isnan(s.get("B")) + assert s.get("C") == 0 + assert s.get("XX") is None def test_take(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.take([0]), - orig.take([0]).to_sparse()) - tm.assert_sp_series_equal(sparse.take([0, 1, 3]), - orig.take([0, 1, 3]).to_sparse()) - tm.assert_sp_series_equal(sparse.take([-1, -2]), - orig.take([-1, -2]).to_sparse()) + tm.assert_sp_series_equal(sparse.take([0]), orig.take([0]).to_sparse()) + tm.assert_sp_series_equal( + sparse.take([0, 1, 3]), orig.take([0, 1, 3]).to_sparse() + ) + tm.assert_sp_series_equal( + sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse() + ) def 
test_take_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.take([0]), - orig.take([0]).to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.take([0]), orig.take([0]).to_sparse(fill_value=0) + ) exp = orig.take([0, 1, 3]).to_sparse(fill_value=0) tm.assert_sp_series_equal(sparse.take([0, 1, 3]), exp) @@ -347,81 +341,76 @@ def test_take_fill_value(self): tm.assert_sp_series_equal(sparse.take([-1, -2]), exp) def test_reindex(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse() + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse() tm.assert_sp_series_equal(res, exp) # all missing & fill_value - res = sparse.reindex(['B', 'E', 'C']) - exp = orig.reindex(['B', 'E', 'C']).to_sparse() + res = sparse.reindex(["B", "E", "C"]) + exp = orig.reindex(["B", "E", "C"]).to_sparse() tm.assert_sp_series_equal(res, exp) - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], - index=list('ABCDE')) + orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse() + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse() tm.assert_sp_series_equal(res, exp) def test_fill_value_reindex(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # includes missing and fill_value - res = sparse.reindex(['A', 'B', 'C']) - exp = orig.reindex(['A', 'B', 'C']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "B", "C"]) + exp = orig.reindex(["A", "B", "C"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # all missing - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], - index=list('ABCDE')) + orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # all fill_value - orig = pd.Series([0., 0., 0., 0., 0.], - index=list('ABCDE')) + orig = pd.Series([0.0, 0.0, 0.0, 0.0, 0.0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) def test_fill_value_reindex_coerces_float_int(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, 
exp) def test_reindex_fill_value(self): - floats = pd.Series([1., 2., 3.]).to_sparse() + floats = pd.Series([1.0, 2.0, 3.0]).to_sparse() result = floats.reindex([1, 2, 3], fill_value=0) - expected = pd.Series([2., 3., 0], index=[1, 2, 3]).to_sparse() + expected = pd.Series([2.0, 3.0, 0], index=[1, 2, 3]).to_sparse() tm.assert_sp_series_equal(result, expected) def test_reindex_nearest(self): - s = pd.Series(np.arange(10, dtype='float64')).to_sparse() + s = pd.Series(np.arange(10, dtype="float64")).to_sparse() target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method='nearest') + actual = s.reindex(target, method="nearest") expected = pd.Series(np.around(target), target).to_sparse() tm.assert_sp_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', tolerance=0.2) + actual = s.reindex(target, method="nearest", tolerance=0.2) expected = pd.Series([0, 1, np.nan, 2], target).to_sparse() tm.assert_sp_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', - tolerance=[0.3, 0.01, 0.4, 3]) + actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = pd.Series([0, np.nan, np.nan, 2], target).to_sparse() tm.assert_sp_series_equal(expected, actual) @@ -430,17 +419,19 @@ def test_reindex_nearest(self): def tests_indexing_with_sparse(self, kind, fill): # see gh-13985 arr = pd.SparseArray([1, 2, 3], kind=kind) - indexer = pd.SparseArray([True, False, True], - fill_value=fill, - dtype=bool) + indexer = pd.SparseArray([True, False, True], fill_value=fill, dtype=bool) expected = arr[indexer] result = pd.SparseArray([1, 3], kind=kind) tm.assert_sp_array_equal(result, expected) s = pd.SparseSeries(arr, index=["a", "b", "c"], dtype=np.float64) - expected = pd.SparseSeries([1, 3], index=["a", "c"], kind=kind, - dtype=SparseDtype(np.float64, s.fill_value)) + expected = pd.SparseSeries( + [1, 3], + index=["a", "c"], + kind=kind, + dtype=SparseDtype(np.float64, s.fill_value), + ) tm.assert_sp_series_equal(s[indexer], expected) tm.assert_sp_series_equal(s.loc[indexer], expected) @@ -450,19 +441,18 @@ def tests_indexing_with_sparse(self, kind, fill): tm.assert_sp_series_equal(s[indexer], expected) tm.assert_sp_series_equal(s.loc[indexer], expected) - msg = ("iLocation based boolean indexing cannot " - "use an indexable as a mask") + msg = "iLocation based boolean indexing cannot " "use an indexable as a mask" with pytest.raises(ValueError, match=msg): s.iloc[indexer] @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): - def setup_method(self, method): # Mi with duplicated values - idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), - ('C', 0), ('C', 1)]) + idx = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1)] + ) self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) self.sparse = self.orig.to_sparse() @@ -474,8 +464,8 @@ def test_getitem_multi(self): assert np.isnan(sparse[1]) assert sparse[3] == orig[3] - tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse()) - tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse()) + tm.assert_sp_series_equal(sparse["A"], orig["A"].to_sparse()) + tm.assert_sp_series_equal(sparse["B"], orig["B"].to_sparse()) result = sparse[[1, 3, 4]] exp = orig[[1, 3, 4]].to_sparse() @@ -499,31 +489,28 @@ def test_getitem_multi_tuple(self): orig = self.orig sparse = self.sparse - assert sparse['C', 0] == orig['C', 0] - assert np.isnan(sparse['A', 1]) - assert np.isnan(sparse['B', 
0]) + assert sparse["C", 0] == orig["C", 0] + assert np.isnan(sparse["A", 1]) + assert np.isnan(sparse["B", 0]) def test_getitems_slice_multi(self): orig = self.orig sparse = self.sparse tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['A':'B'], - orig.loc['A':'B'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) def test_loc(self): # need to be override to use different label orig = self.orig sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc['A'], - orig.loc['A'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B'], - orig.loc['B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A"], orig.loc["A"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B"], orig.loc["B"].to_sparse()) result = sparse.loc[[1, 3, 4]] exp = orig.loc[[1, 3, 4]].to_sparse() @@ -535,8 +522,8 @@ def test_loc(self): tm.assert_sp_series_equal(result, exp) # single element list (GH 15447) - result = sparse.loc[['A']] - exp = orig.loc[['A']].to_sparse() + result = sparse.loc[["A"]] + exp = orig.loc[["A"]].to_sparse() tm.assert_sp_series_equal(result, exp) # dense array @@ -557,43 +544,42 @@ def test_loc_multi_tuple(self): orig = self.orig sparse = self.sparse - assert sparse.loc['C', 0] == orig.loc['C', 0] - assert np.isnan(sparse.loc['A', 1]) - assert np.isnan(sparse.loc['B', 0]) + assert sparse.loc["C", 0] == orig.loc["C", 0] + assert np.isnan(sparse.loc["A", 1]) + assert np.isnan(sparse.loc["B", 0]) def test_loc_slice(self): orig = self.orig sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":], orig.loc["A":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['A':'B'], - orig.loc['A':'B'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) def test_reindex(self): # GH 15447 orig = self.orig sparse = self.sparse - res = sparse.reindex([('A', 0), ('C', 1)]) - exp = orig.reindex([('A', 0), ('C', 1)]).to_sparse() + res = sparse.reindex([("A", 0), ("C", 1)]) + exp = orig.reindex([("A", 0), ("C", 1)]).to_sparse() tm.assert_sp_series_equal(res, exp) # On specific level: - res = sparse.reindex(['A', 'C', 'B'], level=0) - exp = orig.reindex(['A', 'C', 'B'], level=0).to_sparse() + res = sparse.reindex(["A", "C", "B"], level=0) + exp = orig.reindex(["A", "C", "B"], level=0).to_sparse() tm.assert_sp_series_equal(res, exp) # single element list (GH 15447) - res = sparse.reindex(['A'], level=0) - exp = orig.reindex(['A'], level=0).to_sparse() + res = sparse.reindex(["A"], level=0) + exp = orig.reindex(["A"], 
level=0).to_sparse() tm.assert_sp_series_equal(res, exp) with pytest.raises(TypeError): # Incomplete keys are not accepted for reindexing: - sparse.reindex(['A', 'C']) + sparse.reindex(["A", "C"]) # "copy" argument: res = sparse.reindex(sparse.index, copy=True) @@ -606,48 +592,44 @@ def test_reindex(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseDataFrameIndexing: - def test_getitem(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse['x'], orig['x'].to_sparse()) - tm.assert_sp_frame_equal(sparse[['x']], orig[['x']].to_sparse()) - tm.assert_sp_frame_equal(sparse[['z', 'x']], - orig[['z', 'x']].to_sparse()) + tm.assert_sp_series_equal(sparse["x"], orig["x"].to_sparse()) + tm.assert_sp_frame_equal(sparse[["x"]], orig[["x"]].to_sparse()) + tm.assert_sp_frame_equal(sparse[["z", "x"]], orig[["z", "x"]].to_sparse()) - tm.assert_sp_frame_equal(sparse[[True, False, True, True]], - orig[[True, False, True, True]].to_sparse()) + tm.assert_sp_frame_equal( + sparse[[True, False, True, True]], + orig[[True, False, True, True]].to_sparse(), + ) - tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], - orig.iloc[[1, 2]].to_sparse()) + tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], orig.iloc[[1, 2]].to_sparse()) def test_getitem_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - result = sparse[['z']] - expected = orig[['z']].to_sparse(fill_value=0) + result = sparse[["z"]] + expected = orig[["z"]].to_sparse(fill_value=0) tm.assert_sp_frame_equal(result, expected, check_fill_value=False) - tm.assert_sp_series_equal(sparse['y'], - orig['y'].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse["y"], orig["y"].to_sparse(fill_value=0)) - exp = orig[['x']].to_sparse(fill_value=0) + exp = orig[["x"]].to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[['x']], exp) + tm.assert_sp_frame_equal(sparse[["x"]], exp) - exp = orig[['z', 'x']].to_sparse(fill_value=0) + exp = orig[["z", "x"]].to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[['z', 'x']], exp) + tm.assert_sp_frame_equal(sparse[["z", "x"]], exp) indexer = [True, False, True, True] exp = orig[indexer].to_sparse(fill_value=0) @@ -659,31 +641,29 @@ def test_getitem_fill_value(self): tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp) def test_loc(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.loc[0, 'x'] == 1 - assert np.isnan(sparse.loc[1, 'z']) - assert sparse.loc[2, 'z'] == 4 + assert sparse.loc[0, "x"] == 1 + assert np.isnan(sparse.loc[1, "z"]) + assert sparse.loc[2, "z"] == 4 # have to specify `kind='integer'`, since we construct a # new SparseArray here, and the default sparse type is # integer there, but block in SparseSeries - 
tm.assert_sp_series_equal(sparse.loc[0], - orig.loc[0].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[1], - orig.loc[1].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[2, :], - orig.loc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[2, :], - orig.loc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[:, 'y'], - orig.loc[:, 'y'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, 'y'], - orig.loc[:, 'y'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse(kind="integer")) + tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse(kind="integer")) + tm.assert_sp_series_equal( + sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) result = sparse.loc[[1, 2]] exp = orig.loc[[1, 2]].to_sparse() @@ -693,12 +673,12 @@ def test_loc(self): exp = orig.loc[[1, 2], :].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[:, ['x', 'z']] - exp = orig.loc[:, ['x', 'z']].to_sparse() + result = sparse.loc[:, ["x", "z"]] + exp = orig.loc[:, ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[[0, 2], ['x', 'z']] - exp = orig.loc[[0, 2], ['x', 'z']].to_sparse() + result = sparse.loc[[0, 2], ["x", "z"]] + exp = orig.loc[[0, 2], ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) # exceeds the bounds @@ -721,44 +701,47 @@ def test_loc(self): tm.assert_sp_frame_equal(result, exp) def test_loc_index(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - index=list('abc'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + index=list("abc"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.loc['a', 'x'] == 1 - assert np.isnan(sparse.loc['b', 'z']) - assert sparse.loc['c', 'z'] == 4 - - tm.assert_sp_series_equal(sparse.loc['a'], - orig.loc['a'].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b'], - orig.loc['b'].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b', :], - orig.loc['b', :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b', :], - orig.loc['b', :].to_sparse(kind='integer')) - - tm.assert_sp_series_equal(sparse.loc[:, 'z'], - orig.loc[:, 'z'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, 'z'], - orig.loc[:, 'z'].to_sparse()) - - result = sparse.loc[['a', 'b']] - exp = orig.loc[['a', 'b']].to_sparse() + assert sparse.loc["a", "x"] == 1 + assert np.isnan(sparse.loc["b", "z"]) + assert sparse.loc["c", "z"] == 4 + + tm.assert_sp_series_equal( + sparse.loc["a"], orig.loc["a"].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b"], orig.loc["b"].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") + ) + + tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) + + result = sparse.loc[["a", "b"]] + exp = orig.loc[["a", "b"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[['a', 'b'], :] - exp = 
orig.loc[['a', 'b'], :].to_sparse() + result = sparse.loc[["a", "b"], :] + exp = orig.loc[["a", "b"], :].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[:, ['x', 'z']] - exp = orig.loc[:, ['x', 'z']].to_sparse() + result = sparse.loc[:, ["x", "z"]] + exp = orig.loc[:, ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[['c', 'a'], ['x', 'z']] - exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse() + result = sparse.loc[["c", "a"], ["x", "z"]] + exp = orig.loc[["c", "a"], ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) # dense array @@ -776,34 +759,30 @@ def test_loc_index(self): tm.assert_sp_frame_equal(result, exp) def test_loc_slice(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) def test_iloc(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]]) + orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]]) sparse = orig.to_sparse() assert sparse.iloc[1, 1] == 3 assert np.isnan(sparse.iloc[2, 0]) - tm.assert_sp_series_equal(sparse.iloc[0], - orig.loc[0].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[1], - orig.loc[1].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[2, :], - orig.iloc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[2, :], - orig.iloc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[:, 1], - orig.iloc[:, 1].to_sparse()) - tm.assert_sp_series_equal(sparse.iloc[:, 1], - orig.iloc[:, 1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse(kind="integer")) + tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse(kind="integer")) + tm.assert_sp_series_equal( + sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) result = sparse.iloc[[1, 2]] exp = orig.iloc[[1, 2]].to_sparse() @@ -825,43 +804,43 @@ def test_iloc(self): sparse.iloc[[1, 3, 5]] def test_iloc_slice(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) def test_at(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.at['A', 'x'] == orig.at['A', 'x'] - assert np.isnan(sparse.at['B', 'z']) - assert np.isnan(sparse.at['C', 'y']) - assert sparse.at['D', 'x'] == orig.at['D', 'x'] + assert sparse.at["A", "x"] == orig.at["A", "x"] + assert np.isnan(sparse.at["B", "z"]) + assert np.isnan(sparse.at["C", "y"]) + assert sparse.at["D", "x"] == orig.at["D", "x"] def test_at_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - 
[2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - assert sparse.at['A', 'x'] == orig.at['A', 'x'] - assert np.isnan(sparse.at['B', 'z']) - assert np.isnan(sparse.at['C', 'y']) - assert sparse.at['D', 'x'] == orig.at['D', 'x'] + assert sparse.at["A", "x"] == orig.at["A", "x"] + assert np.isnan(sparse.at["B", "z"]) + assert np.isnan(sparse.at["C", "y"]) + assert sparse.at["D", "x"] == orig.at["D", "x"] def test_iat(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() assert sparse.iat[0, 0] == orig.iat[0, 0] assert np.isnan(sparse.iat[1, 2]) @@ -872,11 +851,11 @@ def test_iat(self): assert sparse.iat[-1, -1] == orig.iat[-1, -1] def test_iat_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) assert sparse.iat[0, 0] == orig.iat[0, 0] assert np.isnan(sparse.iat[1, 2]) @@ -887,26 +866,21 @@ def test_iat_fill_value(self): assert sparse.iat[-1, -1] == orig.iat[-1, -1] def test_take(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - tm.assert_sp_frame_equal(sparse.take([0]), - orig.take([0]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([0, 1]), - orig.take([0, 1]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([-1, -2]), - orig.take([-1, -2]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([0]), orig.take([0]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([0, 1]), orig.take([0, 1]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse()) def test_take_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) exp = orig.take([0]).to_sparse(fill_value=0) @@ -922,142 +896,163 @@ def test_take_fill_value(self): tm.assert_sp_frame_equal(sparse.take([-1, -2]), exp) def test_reindex(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse() + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse() tm.assert_sp_frame_equal(res, exp) - orig = pd.DataFrame([[np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan]], - index=list('ABCD'), 
columns=list('xyz')) + orig = pd.DataFrame( + [ + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse() + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse() tm.assert_sp_frame_equal(res, exp) def test_reindex_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) # all missing - orig = pd.DataFrame([[np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [ + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) # all fill_value - orig = pd.DataFrame([[0, 0, 0], - [0, 0, 0], - [0, 0, 0], - [0, 0, 0]], - index=list('ABCD'), columns=list('xyz'), - dtype=np.int) + orig = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=list("ABCD"), + columns=list("xyz"), + dtype=np.int, + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestMultitype: - def setup_method(self, method): - self.cols = ['string', 'int', 'float', 'object'] + self.cols = ["string", "int", "float", "object"] - self.string_series = pd.SparseSeries(['a', 'b', 'c']) + self.string_series = pd.SparseSeries(["a", "b", "c"]) self.int_series = pd.SparseSeries([1, 2, 3]) self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) self.object_series = pd.SparseSeries([[], {}, set()]) - self.sdf = pd.SparseDataFrame({ - 'string': self.string_series, - 'int': self.int_series, - 'float': self.float_series, - 'object': self.object_series, - }) + self.sdf = pd.SparseDataFrame( + { + "string": self.string_series, + "int": self.int_series, + "float": self.float_series, + "object": self.object_series, + } + ) self.sdf = self.sdf[self.cols] - self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols) + self.ss = pd.SparseSeries(["a", 1, 1.1, []], index=self.cols) def test_frame_basic_dtypes(self): for _, row in self.sdf.iterrows(): assert row.dtype == SparseDtype(object) - tm.assert_sp_series_equal(self.sdf['string'], self.string_series, - check_names=False) - tm.assert_sp_series_equal(self.sdf['int'], self.int_series, - check_names=False) - 
tm.assert_sp_series_equal(self.sdf['float'], self.float_series, - check_names=False) - tm.assert_sp_series_equal(self.sdf['object'], self.object_series, - check_names=False) + tm.assert_sp_series_equal( + self.sdf["string"], self.string_series, check_names=False + ) + tm.assert_sp_series_equal(self.sdf["int"], self.int_series, check_names=False) + tm.assert_sp_series_equal( + self.sdf["float"], self.float_series, check_names=False + ) + tm.assert_sp_series_equal( + self.sdf["object"], self.object_series, check_names=False + ) def test_frame_indexing_single(self): - tm.assert_sp_series_equal(self.sdf.iloc[0], - pd.SparseSeries(['a', 1, 1.1, []], - index=self.cols), - check_names=False) - tm.assert_sp_series_equal(self.sdf.iloc[1], - pd.SparseSeries(['b', 2, 1.2, {}], - index=self.cols), - check_names=False) - tm.assert_sp_series_equal(self.sdf.iloc[2], - pd.SparseSeries(['c', 3, 1.3, set()], - index=self.cols), - check_names=False) + tm.assert_sp_series_equal( + self.sdf.iloc[0], + pd.SparseSeries(["a", 1, 1.1, []], index=self.cols), + check_names=False, + ) + tm.assert_sp_series_equal( + self.sdf.iloc[1], + pd.SparseSeries(["b", 2, 1.2, {}], index=self.cols), + check_names=False, + ) + tm.assert_sp_series_equal( + self.sdf.iloc[2], + pd.SparseSeries(["c", 3, 1.3, set()], index=self.cols), + check_names=False, + ) def test_frame_indexing_multiple(self): tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) - tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], - pd.SparseDataFrame({ - 'string': self.string_series.iloc[[1, 2]], - 'int': self.int_series.iloc[[1, 2]], - 'float': self.float_series.iloc[[1, 2]], - 'object': self.object_series.iloc[[1, 2]] - }, index=[1, 2])[self.cols]) - tm.assert_sp_frame_equal(self.sdf[['int', 'string']], - pd.SparseDataFrame({ - 'int': self.int_series, - 'string': self.string_series, - })) + tm.assert_sp_frame_equal( + self.sdf.iloc[[1, 2]], + pd.SparseDataFrame( + { + "string": self.string_series.iloc[[1, 2]], + "int": self.int_series.iloc[[1, 2]], + "float": self.float_series.iloc[[1, 2]], + "object": self.object_series.iloc[[1, 2]], + }, + index=[1, 2], + )[self.cols], + ) + tm.assert_sp_frame_equal( + self.sdf[["int", "string"]], + pd.SparseDataFrame({"int": self.int_series, "string": self.string_series}), + ) def test_series_indexing_single(self): for i, idx in enumerate(self.cols): assert self.ss.iloc[i] == self.ss[idx] - tm.assert_class_equal(self.ss.iloc[i], self.ss[idx], - obj="series index") + tm.assert_class_equal(self.ss.iloc[i], self.ss[idx], obj="series index") - assert self.ss['string'] == 'a' - assert self.ss['int'] == 1 - assert self.ss['float'] == 1.1 - assert self.ss['object'] == [] + assert self.ss["string"] == "a" + assert self.ss["int"] == 1 + assert self.ss["float"] == 1.1 + assert self.ss["object"] == [] def test_series_indexing_multiple(self): - tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], - pd.SparseSeries(['a', 1], - index=['string', 'int'])) - tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], - pd.SparseSeries(['a', []], - index=['string', 'object'])) + tm.assert_sp_series_equal( + self.ss.loc[["string", "int"]], + pd.SparseSeries(["a", 1], index=["string", "int"]), + ) + tm.assert_sp_series_equal( + self.ss.loc[["string", "object"]], + pd.SparseSeries(["a", []], index=["string", "object"]), + ) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 8f98117f20208f..85b899dfe76d59 100644 --- a/pandas/tests/sparse/test_pivot.py +++ 
b/pandas/tests/sparse/test_pivot.py @@ -10,52 +10,56 @@ @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestPivotTable: - def setup_method(self, method): rs = np.random.RandomState(0) - self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': rs.randn(8), - 'D': rs.randn(8), - 'E': [np.nan, np.nan, 1, 2, - np.nan, 1, np.nan, np.nan]}) + self.dense = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": rs.randn(8), + "D": rs.randn(8), + "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], + } + ) self.sparse = self.dense.to_sparse() def test_pivot_table(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='C') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='C') + res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="C") + res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="C") tm.assert_frame_equal(res_sparse, res_dense) - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E') + res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="E") + res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="E") tm.assert_frame_equal(res_sparse, res_dense) - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E', aggfunc='mean') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E', aggfunc='mean') + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values="E", aggfunc="mean" + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values="E", aggfunc="mean" + ) tm.assert_frame_equal(res_sparse, res_dense) def test_pivot_table_with_nans(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E', aggfunc='sum') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E', aggfunc='sum') + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values="E", aggfunc="sum" + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values="E", aggfunc="sum" + ) tm.assert_frame_equal(res_sparse, res_dense) - @pytest.mark.xfail(not _np_version_under1p17, - reason="failing occasionally on numpy > 1.17", - strict=False) + @pytest.mark.xfail( + not _np_version_under1p17, + reason="failing occasionally on numpy > 1.17", + strict=False, + ) def test_pivot_table_multi(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values=['D', 'E']) - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values=['D', 'E']) + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values=["D", "E"] + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values=["D", "E"] + ) res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py index 37ec0bba2621dc..bb5232f065a049 100644 --- a/pandas/tests/sparse/test_reshape.py +++ b/pandas/tests/sparse/test_reshape.py @@ -26,9 +26,7 @@ def test_sparse_frame_stack(sparse_df, multi_index3): def 
test_sparse_frame_unstack(sparse_df): mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) sparse_df.index = mi - arr = np.array([[1, np.nan, np.nan], - [np.nan, 1, np.nan], - [np.nan, np.nan, 1]]) + arr = np.array([[1, np.nan, np.nan], [np.nan, 1, np.nan], [np.nan, np.nan, 1]]) unstacked_df = pd.DataFrame(arr, index=mi).unstack() unstacked_sdf = sparse_df.unstack() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 565c98ffad77ba..c0d73821020b53 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -7,8 +7,7 @@ from numpy.random import RandomState import pytest -from pandas._libs import ( - algos as libalgos, groupby as libgroupby, hashtable as ht) +from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -16,8 +15,15 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DatetimeIndex, Index, IntervalIndex, Series, - Timestamp, compat) + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timestamp, + compat, +) import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com @@ -27,7 +33,6 @@ class TestMatch: - def test_ints(self): values = np.array([0, 2, 1]) to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0]) @@ -50,8 +55,8 @@ def test_ints(self): tm.assert_series_equal(result, expected) def test_strings(self): - values = ['foo', 'bar', 'baz'] - to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux'] + values = ["foo", "bar", "baz"] + to_match = ["bar", "foo", "qux", "foo", "bar", "baz", "qux"] result = algos.match(to_match, values) expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64) @@ -63,19 +68,17 @@ def test_strings(self): class TestFactorize: - def test_basic(self): - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', - 'c']) - tm.assert_numpy_array_equal( - uniques, np.array(['a', 'b', 'c'], dtype=object)) + labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], sort=True) + labels, uniques = algos.factorize( + ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True + ) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array(['a', 'b', 'c'], dtype=object) + exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5)))) @@ -91,41 +94,40 @@ def test_basic(self): exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.))), - sort=True) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64) + exp = np.array([0.0, 1.0, 
2.0, 3.0, 4.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) def test_mixed(self): # doc example reshaping.rst - x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) labels, uniques = algos.factorize(x) exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index(['A', 'B', 3.14, np.inf]) + exp = Index(["A", "B", 3.14, np.inf]) tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index([3.14, np.inf, 'A', 'B']) + exp = Index([3.14, np.inf, "A", "B"]) tm.assert_index_equal(uniques, exp) def test_datelike(self): # M8 - v1 = Timestamp('20130101 09:00:00.00004') - v2 = Timestamp('20130101') + v1 = Timestamp("20130101 09:00:00.00004") + v2 = Timestamp("20130101") x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) @@ -141,8 +143,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, exp) # period - v1 = pd.Period('201302', freq='M') - v2 = pd.Period('201303', freq='M') + v1 = pd.Period("201302", freq="M") + v2 = pd.Period("201303", freq="M") x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index @@ -157,8 +159,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 - v1 = pd.to_timedelta('1 day 1 min') - v2 = pd.to_timedelta('1 day') + v1 = pd.to_timedelta("1 day 1 min") + v2 = pd.to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) @@ -174,61 +176,59 @@ def test_factorize_nan(self): # nan should map to na_sentinel, not reverse_indexer[na_sentinel] # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer - key = np.array([1, 2, 1, np.nan], dtype='O') + key = np.array([1, 2, 1, np.nan], dtype="O") rizer = ht.Factorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) - expected = np.array([0, 1, 0, na_sentinel], dtype='int32') + expected = np.array([0, 1, 0, na_sentinel], dtype="int32") assert len(set(key)) == len(set(expected)) - tm.assert_numpy_array_equal(pd.isna(key), - expected == na_sentinel) + tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) # nan still maps to na_sentinel when sort=False - key = np.array([0, np.nan, 1], dtype='O') + key = np.array([0, np.nan, 1], dtype="O") na_sentinel = -1 # TODO(wesm): unused? 
ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa - expected = np.array([2, -1, 0], dtype='int32') + expected = np.array([2, -1, 0], dtype="int32") assert len(set(key)) == len(set(expected)) tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) - @pytest.mark.parametrize("data,expected_label,expected_level", [ - ( - [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), 'nonsense'] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), (1, 2, 3)] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2)], - [0, 1, 2, 1], - [(1, 1), (1, 2), (0, 0)] - ) - ]) + @pytest.mark.parametrize( + "data,expected_label,expected_level", + [ + ( + [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), "nonsense"], + ), + ( + [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), (1, 2, 3)], + ), + ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), + ], + ) def test_factorize_tuple_list(self, data, expected_label, expected_level): # GH9454 result = pd.factorize(data) - tm.assert_numpy_array_equal(result[0], - np.array(expected_label, dtype=np.intp)) + tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp)) - expected_level_array = com.asarray_tuplesafe(expected_level, - dtype=object) + expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object) tm.assert_numpy_array_equal(result[1], expected_level_array) def test_complex_sorting(self): # gh 12666 - check no segfault x17 = np.array([complex(i) for i in range(17)], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) with pytest.raises(TypeError, match=msg): algos.factorize(x17[::-1], sort=True) @@ -243,42 +243,40 @@ def test_float64_factorize(self, writable): tm.assert_numpy_array_equal(uniques, exp_uniques) def test_uint64_factorize(self, writable): - data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64) + data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64) + exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_int64_factorize(self, writable): - data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64) + exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_string_factorize(self, writable): - data = np.array(['a', 'c', 'a', 'b', 'c'], - dtype=object) + data = np.array(["a", "c", "a", "b", "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) - exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, 
uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_object_factorize(self, writable): - data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], - dtype=object) + data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) - exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) @@ -287,17 +285,20 @@ def test_object_factorize(self, writable): def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. - data = np.array([2**63, 1, 2**63], dtype=np.uint64) + data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) with tm.assert_produces_warning(expected_warning=FutureWarning): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) - @pytest.mark.parametrize('data', [ - np.array([0, 1, 0], dtype='u8'), - np.array([-2**63, 1, -2**63], dtype='i8'), - np.array(['__nan__', 'foo', '__nan__'], dtype='object'), - ]) + @pytest.mark.parametrize( + "data", + [ + np.array([0, 1, 0], dtype="u8"), + np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array(["__nan__", "foo", "__nan__"], dtype="object"), + ], + ) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. l, u = algos.factorize(data) @@ -306,16 +307,18 @@ def test_parametrized_factorize_na_value_default(self, data): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) - @pytest.mark.parametrize('data, na_value', [ - (np.array([0, 1, 0, 2], dtype='u8'), 0), - (np.array([1, 0, 1, 2], dtype='u8'), 1), - (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), - (np.array([1, -2**63, 1, 0], dtype='i8'), 1), - (np.array(['a', '', 'a', 'b'], dtype=object), 'a'), - (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()), - (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object), - ('a', 1)), - ]) + @pytest.mark.parametrize( + "data, na_value", + [ + (np.array([0, 1, 0, 2], dtype="u8"), 0), + (np.array([1, 0, 1, 2], dtype="u8"), 1), + (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), + (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array(["a", "", "a", "b"], dtype=object), "a"), + (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), + (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), + ], + ) def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] @@ -323,17 +326,24 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) - @pytest.mark.parametrize('sort', [True, False]) - @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) - @pytest.mark.parametrize('data, uniques', [ - (np.array(['b', 'a', None, 'b'], dtype=object), - np.array(['b', 'a'], dtype=object)), - (pd.array([2, 1, np.nan, 2], dtype='Int64'), - pd.array([2, 1], dtype='Int64'))], - ids=['numpy_array', 'extension_array']) + @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) + 
@pytest.mark.parametrize( + "data, uniques", + [ + ( + np.array(["b", "a", None, "b"], dtype=object), + np.array(["b", "a"], dtype=object), + ), + ( + pd.array([2, 1, np.nan, 2], dtype="Int64"), + pd.array([2, 1], dtype="Int64"), + ), + ], + ids=["numpy_array", "extension_array"], + ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - labels, uniques = algos.factorize(data, sort=sort, - na_sentinel=na_sentinel) + labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = safe_sort(uniques) @@ -348,7 +358,6 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): class TestUnique: - def test_ints(self): arr = np.random.randint(0, 100, size=50) @@ -356,20 +365,21 @@ def test_ints(self): assert isinstance(result, np.ndarray) def test_objects(self): - arr = np.random.randint(0, 100, size=50).astype('O') + arr = np.random.randint(0, 100, size=50).astype("O") result = algos.unique(arr) assert isinstance(result, np.ndarray) def test_object_refcount_bug(self): - lst = ['A', 'B', 'C', 'D', 'E'] + lst = ["A", "B", "C", "D", "E"] for i in range(1000): len(algos.unique(lst)) def test_on_index_object(self): - mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( - np.arange(5), 5)]) + mindex = pd.MultiIndex.from_arrays( + [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] + ) expected = mindex.values expected.sort() @@ -383,13 +393,20 @@ def test_on_index_object(self): def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( - ['2015-01-03T00:00:00.000000000+0000', - '2015-01-01T00:00:00.000000000+0000'], - dtype='M8[ns]') + [ + "2015-01-03T00:00:00.000000000+0000", + "2015-01-01T00:00:00.000000000+0000", + ], + dtype="M8[ns]", + ) - dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000', - '2015-01-01T00:00:00.000000000', - '2015-01-01T00:00:00.000000000']) + dt_index = pd.to_datetime( + [ + "2015-01-03T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + ] + ) result = algos.unique(dt_index) tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype @@ -406,7 +423,7 @@ def test_datetime64_dtype_array_returned(self): def test_timedelta64_dtype_array_returned(self): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype='m8[ns]') + expected = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) @@ -424,36 +441,35 @@ def test_timedelta64_dtype_array_returned(self): assert result.dtype == expected.dtype def test_uint64_overflow(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(algos.unique(s), exp) def test_nan_in_object_array(self): - duplicated_items = ['a', np.nan, 'c', 'c'] + duplicated_items = ["a", np.nan, "c", "c"] result = pd.unique(duplicated_items) - expected = np.array(['a', np.nan, 'c'], dtype=object) + expected = np.array(["a", np.nan, "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_categorical(self): # we are expecting to return in the order # of appearance - expected = Categorical(list('bac'), categories=list('bac')) + expected = Categorical(list("bac"), categories=list("bac")) # we are expecting to return 
in the order # of the categories - expected_o = Categorical( - list('bac'), categories=list('abc'), ordered=True) + expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True) # GH 15939 - c = Categorical(list('baabc')) + c = Categorical(list("baabc")) result = c.unique() tm.assert_categorical_equal(result, expected) result = algos.unique(c) tm.assert_categorical_equal(result, expected) - c = Categorical(list('baabc'), ordered=True) + c = Categorical(list("baabc"), ordered=True) result = c.unique() tm.assert_categorical_equal(result, expected_o) @@ -461,7 +477,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected_o) # Series of categorical dtype - s = Series(Categorical(list('baabc')), name='foo') + s = Series(Categorical(list("baabc")), name="foo") result = s.unique() tm.assert_categorical_equal(result, expected) @@ -469,8 +485,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected) # CI -> return CI - ci = CategoricalIndex(Categorical(list('baabc'), - categories=list('bac'))) + ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) expected = CategoricalIndex(expected) result = ci.unique() tm.assert_index_equal(result, expected) @@ -482,31 +497,55 @@ def test_datetime64tz_aware(self): # GH 15939 result = Series( - Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern") - ])) + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ).unique() + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) + ) tm.assert_extension_array_equal(result, expected) - result = Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]).unique() - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) + result = Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ).unique() + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) result = pd.unique( - Series(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]))) - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01', tz="US/Eastern"), - ])) + Series( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + ) + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01", tz="US/Eastern")]) + ) tm.assert_extension_array_equal(result, expected) - result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) def test_order_of_appearance(self): @@ -514,39 +553,43 @@ def test_order_of_appearance(self): # light testing of guarantee of order of appearance # these also are the doc-examples result = pd.unique(Series([2, 1, 3, 3])) - tm.assert_numpy_array_equal(result, - 
np.array([2, 1, 3], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64")) result = pd.unique(Series([2] + [1] * 5)) - tm.assert_numpy_array_equal(result, - np.array([2, 1], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) - result = pd.unique(Series([Timestamp('20160101'), - Timestamp('20160101')])) - expected = np.array(['2016-01-01T00:00:00.000000000'], - dtype='datetime64[ns]') + result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")])) + expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Index( - [Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) - result = pd.unique(list('aabc')) - expected = np.array(['a', 'b', 'c'], dtype=object) + result = pd.unique(list("aabc")) + expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Series(Categorical(list('aabc')))) - expected = Categorical(list('abc')) + result = pd.unique(Series(Categorical(list("aabc")))) + expected = Categorical(list("abc")) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize("arg ,expected", [ - (('1', '1', '2'), np.array(['1', '2'], dtype=object)), - (('foo',), np.array(['foo'], dtype=object)) - ]) + @pytest.mark.parametrize( + "arg ,expected", + [ + (("1", "1", "2"), np.array(["1", "2"], dtype=object)), + (("foo",), np.array(["foo"], dtype=object)), + ], + ) def test_tuple_with_strings(self, arg, expected): # see GH 17108 result = pd.unique(arg) @@ -554,9 +597,9 @@ def test_tuple_with_strings(self, arg, expected): def test_obj_none_preservation(self): # GH 20866 - arr = np.array(['foo', None], dtype=object) + arr = np.array(["foo", None], dtype=object) result = pd.unique(arr) - expected = np.array(['foo', None], dtype=object) + expected = np.array(["foo", None], dtype=object) tm.assert_numpy_array_equal(result, expected, strict_nan=True) @@ -570,8 +613,8 @@ def test_signed_zero(self): def test_different_nans(self): # GH 21866 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent @@ -582,8 +625,8 @@ def test_different_nans(self): def test_first_nan_kept(self): # GH 22295 # create different nans from bit-patterns: - bits_for_nan1 = 0xfff8000000000001 - bits_for_nan2 = 0x7ff8000000000001 + bits_for_nan1 = 0xFFF8000000000001 + bits_for_nan2 = 0x7FF8000000000001 NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0] NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] assert NAN1 != NAN1 @@ -593,17 +636,14 @@ def test_first_nan_kept(self): result = pd.unique(a) assert result.size == 1 # use bit patterns to identify which nan was kept: - result_nan_bits = struct.unpack("=Q", - 
struct.pack("d", result[0]))[0] + result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0] assert result_nan_bits == bits_for_nan1 - def test_do_not_mangle_na_values(self, unique_nulls_fixture, - unique_nulls_fixture2): + def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2): # GH 22295 if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values not unique - a = np.array([unique_nulls_fixture, - unique_nulls_fixture2], dtype=np.object) + a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) result = pd.unique(a) assert result.size == 2 assert a[0] is unique_nulls_fixture @@ -611,11 +651,12 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, class TestIsin: - def test_invalid(self): - msg = (r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[int\]") + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[int\]" + ) with pytest.raises(TypeError, match=msg): algos.isin(1, 1) with pytest.raises(TypeError, match=msg): @@ -645,25 +686,25 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], ['a']) + result = algos.isin(["a", "b"], ["a"]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), Series(['a'])) + result = algos.isin(Series(["a", "b"]), Series(["a"])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), {'a'}) + result = algos.isin(Series(["a", "b"]), {"a"}) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], [1]) + result = algos.isin(["a", "b"], [1]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) def test_i8(self): - arr = pd.date_range('20130101', periods=3).values + arr = pd.date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -676,7 +717,7 @@ def test_i8(self): expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) - arr = pd.timedelta_range('1 day', periods=3).values + arr = pd.timedelta_range("1 day", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -691,7 +732,7 @@ def test_i8(self): def test_large(self): - s = pd.date_range('20000101', periods=2000000, freq='s').values + s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) expected[0] = True @@ -701,7 +742,7 @@ def test_large(self): def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) - cats = ['a', 'b', 'c'] + cats = ["a", "b", "c"] Sd = Series(Categorical(1).from_codes(vals, cats)) St = Series(Categorical(1).from_codes(np.array([0, 1]), cats)) expected = np.array([True, True, False, True]) @@ -743,8 +784,8 @@ def test_different_nans(self): # GH 22160 # all nans are handled as equivalent - comps = [float('nan')] - values = [float('nan')] + comps = [float("nan")] + values = [float("nan")] assert comps[0] is not values[0] # different nan-objects # as list of python-objects: @@ -752,20 +793,22 @@ def test_different_nans(self): tm.assert_numpy_array_equal(np.array([True]), result) # as object-array: - 
result = algos.isin(np.asarray(comps, dtype=np.object), - np.asarray(values, dtype=np.object)) + result = algos.isin( + np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object) + ) tm.assert_numpy_array_equal(np.array([True]), result) # as float64-array: - result = algos.isin(np.asarray(comps, dtype=np.float64), - np.asarray(values, dtype=np.float64)) + result = algos.isin( + np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64) + ) tm.assert_numpy_array_equal(np.array([True]), result) def test_no_cast(self): # GH 22160 # ensure 42 is not casted to a string - comps = ['ss', 42] - values = ['42'] + comps = ["ss", 42] + values = ["42"] expected = np.array([False, False]) result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) @@ -781,8 +824,8 @@ def test_empty(self, empty): def test_different_nan_objects(self): # GH 22119 - comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object) - vals = np.array([float('nan')], dtype=np.object) + comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object) + vals = np.array([float("nan")], dtype=np.object) expected = np.array([False, False, True]) result = algos.isin(comps, vals) tm.assert_numpy_array_equal(expected, result) @@ -792,8 +835,8 @@ def test_different_nans_as_float64(self): # create different nans from bit-patterns, # these nans will land in different buckets in the hash-table # if no special care is taken - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 @@ -811,7 +854,6 @@ def test_different_nans_as_float64(self): class TestValueCounts: - def test_value_counts(self): np.random.seed(1234) from pandas.core.reshape.tile import cut @@ -829,33 +871,32 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series([4], - index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - expected = Series([2, 2], - index=IntervalIndex.from_tuples([(0.996, 2.5), - (2.5, 4.0)])) + expected = Series( + [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + ) tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): - result = algos.value_counts([1, 1.]) + result = algos.value_counts([1, 1.0]) assert len(result) == 1 - result = algos.value_counts([1, 1.], bins=1) + result = algos.value_counts([1, 1.0], bins=1) assert len(result) == 1 - result = algos.value_counts(Series([1, 1., '1'])) # object + result = algos.value_counts(Series([1, 1.0, "1"])) # object assert len(result) == 2 msg = "bins argument only works with numeric data" with pytest.raises(TypeError, match=msg): - algos.value_counts(['1', 1], bins=1) + algos.value_counts(["1", 1], bins=1) def test_value_counts_nat(self): - td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]') - dt = pd.to_datetime(['NaT', '2014-01-01']) + td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]") + dt = pd.to_datetime(["NaT", "2014-01-01"]) for s in [td, dt]: vc = algos.value_counts(s) @@ -863,32 +904,40 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) 
== 2 - exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) def test_value_counts_datetime_outofbounds(self): # GH 13663 - s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1), - datetime(3000, 1, 1), datetime(3000, 1, 1)]) + s = Series( + [ + datetime(3000, 1, 1), + datetime(5000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + datetime(3000, 1, 1), + datetime(3000, 1, 1), + ] + ) res = s.value_counts() - exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(6000, 1, 1)], dtype=object) + exp_index = Index( + [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], + dtype=object, + ) exp = Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) # GH 12424 - res = pd.to_datetime(Series(['2362-01-01', np.nan]), - errors='ignore') - exp = Series(['2362-01-01', np.nan], dtype=object) + res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) def test_categorical(self): - s = Series(Categorical(list('aaabbc'))) + s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c'])) + expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) tm.assert_series_equal(result, expected, check_index_type=True) @@ -899,39 +948,51 @@ def test_categorical(self): tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_nans(self): - s = Series(Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) + s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['a', 'b', 'c'])) + expected = Series( + [4, 3, 2], + index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([ - 4, 3, 2, 1 - ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) + expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order - s = Series(Categorical( - list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c'])) + s = Series( + Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"]) + ) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2], + index=CategoricalIndex( + ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex( - ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2, 1], + index=CategoricalIndex( + ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_zeroes(self): # keep the `d` category with 0 - s = Series(Categorical( - list('bbbaac'), categories=list('abcd'), ordered=True)) + s = Series(Categorical(list("bbbaac"), categories=list("abcd"), 
ordered=True)) result = s.value_counts() - expected = Series([3, 2, 1, 0], index=Categorical( - ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True)) + expected = Series( + [3, 2, 1, 0], + index=Categorical( + ["b", "a", "c", "d"], categories=list("abcd"), ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) def test_dropna(self): @@ -939,59 +1000,66 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan])) + Series([2, 1, 1], index=[True, False, np.nan]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=False), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=False), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5., None]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): - result = Series([10.3, 5., 5., None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) + result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) - dtypes = (np.float64, np.object, 'M8[ns]') + dtypes = (np.float64, np.object, "M8[ns]") for t in dtypes: s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) - expected = Series([0.6, 0.2, 0.2], - index=Series([np.nan, 2.0, 1.0], dtype=t)) + expected = Series( + [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) + ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], - index=Series([2.0, 1.0], dtype=t)) + expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): - arr = np.array([2**63], dtype=np.uint64) - expected = Series([1], index=[2**63]) + arr = np.array([2 ** 63], dtype=np.uint64) + expected = Series([1], index=[2 ** 63]) result = algos.value_counts(arr) tm.assert_series_equal(result, expected) - arr = np.array([-1, 2**63], dtype=object) - expected = Series([1, 1], index=[-1, 2**63]) + arr = np.array([-1, 2 ** 63], dtype=object) + expected = Series([1, 1], index=[-1, 2 ** 63]) result = algos.value_counts(arr) # 32-bit linux has a different ordering @@ -1000,7 +1068,6 @@ def test_value_counts_uint64(self): class TestDuplicated: - def test_duplicated_with_nas(self): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) @@ 
-1008,11 +1075,11 @@ def test_duplicated_with_nas(self): expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='first') + result = algos.duplicated(keys, keep="first") expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array([True, False, True, False, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -1021,8 +1088,9 @@ def test_duplicated_with_nas(self): tm.assert_numpy_array_equal(result, expected) keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): + for i, t in enumerate( + zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2) + ): keys[i] = t result = algos.duplicated(keys) @@ -1031,7 +1099,7 @@ def test_duplicated_with_nas(self): expected = np.array(falses + trues) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array(trues + falses) tm.assert_numpy_array_equal(result, expected) @@ -1039,51 +1107,66 @@ def test_duplicated_with_nas(self): expected = np.array(trues + trues) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('case', [ - np.array([1, 2, 1, 5, 3, - 2, 4, 1, 5, 6]), - np.array([1.1, 2.2, 1.1, np.nan, 3.3, - 2.2, 4.4, 1.1, np.nan, 6.6]), - np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, - 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), - np.array(['a', 'b', 'a', 'e', 'c', - 'b', 'd', 'a', 'e', 'f'], dtype=object), - np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], - dtype=np.uint64), - ]) + @pytest.mark.parametrize( + "case", + [ + np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array( + [ + 1 + 1j, + 2 + 2j, + 1 + 1j, + 5 + 5j, + 3 + 3j, + 2 + 2j, + 4 + 4j, + 1 + 1j, + 5 + 5j, + 6 + 6j, + ] + ), + np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object), + np.array( + [1, 2 ** 63, 1, 3 ** 5, 10, 2 ** 63, 39, 1, 3 ** 5, 7], dtype=np.uint64 + ), + ], + ) def test_numeric_object_likes(self, case): - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category')]: - res_first = idx.duplicated(keep='first') + for idx in [Index(case), Index(case, dtype="category")]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) 
tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category')]: - res_first = s.duplicated(keep='first') + for s in [Series(case), Series(case, dtype="category")]: + res_first = s.duplicated(keep="first") tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1091,52 +1174,82 @@ def test_numeric_object_likes(self, case): def test_datetime_likes(self): - dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', - '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] - td = ['1 days', '2 days', '1 days', 'NaT', '3 days', - '2 days', '4 days', '1 days', 'NaT', '6 days'] - - cases = [np.array([Timestamp(d) for d in dt]), - np.array([Timestamp(d, tz='US/Eastern') for d in dt]), - np.array([pd.Period(d, freq='D') for d in dt]), - np.array([np.datetime64(d) for d in dt]), - np.array([pd.Timedelta(d) for d in td])] - - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + dt = [ + "2011-01-01", + "2011-01-02", + "2011-01-01", + "NaT", + "2011-01-03", + "2011-01-02", + "2011-01-04", + "2011-01-01", + "NaT", + "2011-01-06", + ] + td = [ + "1 days", + "2 days", + "1 days", + "NaT", + "3 days", + "2 days", + "4 days", + "1 days", + "NaT", + "6 days", + ] + + cases = [ + np.array([Timestamp(d) for d in dt]), + np.array([Timestamp(d, tz="US/Eastern") for d in dt]), + np.array([pd.Period(d, freq="D") for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td]), + ] + + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last for case in cases: - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category'), - Index(case, dtype=object)]: - res_first = idx.duplicated(keep='first') + for idx in [ + Index(case), + Index(case, dtype="category"), + Index(case, dtype=object), + ]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category'), - Series(case, dtype=object)]: - res_first = s.duplicated(keep='first') + for s in [ + Series(case), + Series(case, dtype="category"), + Series(case, dtype=object), + ]: + res_first = s.duplicated(keep="first") tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1146,17 +1259,24 @@ def test_unique_index(self): cases = [Index([1, 2, 3]), 
pd.RangeIndex(0, 3)] for case in cases: assert case.is_unique is True - tm.assert_numpy_array_equal(case.duplicated(), - np.array([False, False, False])) - - @pytest.mark.parametrize('arr, unique', [ - ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], - [(0, 0), (0, 1), (1, 0), (1, 1)]), - ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')], - [('b', 'c'), ('a', 'b')]), - ([('a', 1), ('b', 2), ('a', 3), ('a', 1)], - [('a', 1), ('b', 2), ('a', 3)]), - ]) + tm.assert_numpy_array_equal( + case.duplicated(), np.array([False, False, False]) + ) + + @pytest.mark.parametrize( + "arr, unique", + [ + ( + [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], + [(0, 0), (0, 1), (1, 0), (1, 1)], + ), + ( + [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")], + [("b", "c"), ("a", "b")], + ), + ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]), + ], + ) def test_unique_tuples(self, arr, unique): # https://github.com/pandas-dev/pandas/issues/16519 expected = np.empty(len(unique), dtype=object) @@ -1167,18 +1287,17 @@ def test_unique_tuples(self, arr, unique): class GroupVarTestMixin: - def test_group_var_generic_1d(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 1))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(15, 1).astype(self.dtype) - labels = np.tile(np.arange(5), (3, )).astype('int64') + labels = np.tile(np.arange(5), (3,)).astype("int64") - expected_out = (np.squeeze(values) - .reshape((5, 3), order='F') - .std(axis=1, ddof=1) ** 2)[:, np.newaxis] + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] expected_counts = counts + 3 self.algo(out, counts, values, labels) @@ -1189,9 +1308,9 @@ def test_group_var_generic_1d_flat_labels(self): prng = RandomState(1234) out = (np.nan * np.ones((1, 1))).astype(self.dtype) - counts = np.zeros(1, dtype='int64') + counts = np.zeros(1, dtype="int64") values = 10 * prng.rand(5, 1).astype(self.dtype) - labels = np.zeros(5, dtype='int64') + labels = np.zeros(5, dtype="int64") expected_out = np.array([[values.std(ddof=1) ** 2]]) expected_counts = counts + 5 @@ -1205,9 +1324,9 @@ def test_group_var_generic_2d_all_finite(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) - labels = np.tile(np.arange(5), (2, )).astype('int64') + labels = np.tile(np.arange(5), (2,)).astype("int64") expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 expected_counts = counts + 2 @@ -1220,15 +1339,17 @@ def test_group_var_generic_2d_some_nan(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) values[:, 1] = np.nan - labels = np.tile(np.arange(5), (2, )).astype('int64') - - expected_out = np.vstack([values[:, 0] - .reshape(5, 2, order='F') - .std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5)]).T.astype(self.dtype) + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) expected_counts = counts + 2 self.algo(out, counts, values, labels) @@ -1239,9 +1360,9 @@ def test_group_var_constant(self): # Regression test from GH 10448. 
out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) - labels = np.zeros(3, dtype='int64') + labels = np.zeros(3, dtype="int64") self.algo(out, counts, values, labels) @@ -1262,10 +1383,10 @@ def test_group_var_large_inputs(self): prng = RandomState(1234) out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) values.shape = (10 ** 6, 1) - labels = np.zeros(10 ** 6, dtype='int64') + labels = np.zeros(10 ** 6, dtype="int64") self.algo(out, counts, values, labels) @@ -1282,15 +1403,13 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable: - def test_lookup_nan(self, writable): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.Float64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_add_signed_zeros(self): # GH 21866 inconsistent hash-function for float64 @@ -1306,8 +1425,8 @@ def test_add_signed_zeros(self): def test_add_different_nans(self): # GH 21866 inconsistent hash-function for float64 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 # default hash function would lead to different hash-buckets @@ -1318,28 +1437,32 @@ def test_add_different_nans(self): assert len(m) == 1 # NAN1 and NAN2 are equivalent def test_lookup_overflow(self, writable): - xs = np.array([1, 2, 2**63], dtype=np.uint64) + xs = np.array([1, 2, 2 ** 63], dtype=np.uint64) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.UInt64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_get_unique(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(s.unique(), exp) - @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case - @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ - (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), - (ht.StringHashTable, ht.ObjectVector, 'object', True), - (ht.Float64HashTable, ht.Float64Vector, 'float64', False), - (ht.Int64HashTable, ht.Int64Vector, 'int64', False), - (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]) - def test_vector_resize(self, writable, htable, uniques, dtype, - safely_resizes, nvals): + @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize( + "htable, uniques, dtype, safely_resizes", + [ + (ht.PyObjectHashTable, ht.ObjectVector, "object", False), + (ht.StringHashTable, ht.ObjectVector, "object", True), + (ht.Float64HashTable, ht.Float64Vector, "float64", False), + 
(ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), + ], + ) + def test_vector_resize( + self, writable, htable, uniques, dtype, safely_resizes, nvals + ): # Test for memory errors after internal vector # reallocations (GH 7157) vals = np.array(np.random.randn(1000), dtype=dtype) @@ -1364,21 +1487,25 @@ def test_vector_resize(self, writable, htable, uniques, dtype, if safely_resizes: htable.get_labels(vals, uniques, 0, -1) else: - with pytest.raises(ValueError, match='external reference.*'): + with pytest.raises(ValueError, match="external reference.*"): htable.get_labels(vals, uniques, 0, -1) - uniques.to_array() # should not raise here + uniques.to_array() # should not raise here assert tmp.shape == oldshape - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_unique(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1393,27 +1520,32 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() - expected_unique = s_duplicated.drop_duplicates(keep='first').values + expected_unique = s_duplicated.drop_duplicates(keep="first").values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) # test return_inverse=True # reconstruction can only succeed if the inverse is correct - result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse=True) + result_unique, result_inverse = htable().unique( + s_duplicated.values, return_inverse=True + ) tm.assert_numpy_array_equal(result_unique, expected_unique) reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_factorize(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1441,42 +1573,46 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) - @pytest.mark.parametrize('hashtable', [ - ht.PyObjectHashTable, ht.StringHashTable, - ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) + 
@pytest.mark.parametrize( + "hashtable", + [ + ht.PyObjectHashTable, + ht.StringHashTable, + ht.Float64HashTable, + ht.Int64HashTable, + ht.UInt64HashTable, + ], + ) def test_hashtable_large_sizehint(self, hashtable): # GH 22729 size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) # noqa + tbl = hashtable(size_hint=size_hint) # noqa def test_quantile(): s = Series(np.random.randn(100)) - result = algos.quantile(s, [0, .25, .5, .75, 1.]) - expected = algos.quantile(s.values, [0, .25, .5, .75, 1.]) + result = algos.quantile(s, [0, 0.25, 0.5, 0.75, 1.0]) + expected = algos.quantile(s.values, [0, 0.25, 0.5, 0.75, 1.0]) tm.assert_almost_equal(result, expected) def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8') + a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) a[np.random.choice(len(a), 10)] = -1 left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) class TestRank: - @td.skip_if_no_scipy def test_scipy_compat(self): from scipy.stats import rankdata @@ -1490,13 +1626,13 @@ def _check(arr): exp[mask] = nan assert_almost_equal(result, exp) - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + _check(np.array([nan, nan, 5.0, 5.0, 5.0, nan, 1, 2, 3, nan])) + _check(np.array([4.0, nan, 5.0, 5.0, 5.0, nan, 1, 2, 4.0, nan])) def test_basic(self): exp = np.array([1, 2], dtype=np.float64) - for dtype in np.typecodes['AllInteger']: + for dtype in np.typecodes["AllInteger"]: s = Series([1, 100], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) @@ -1504,7 +1640,7 @@ def test_uint64_overflow(self): exp = np.array([1, 2], dtype=np.float64) for dtype in [np.float64, np.uint64]: - s = Series([1, 2**63], dtype=dtype) + s = Series([1, 2 ** 63], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) def test_too_many_ndims(self): @@ -1516,10 +1652,11 @@ def test_too_many_ndims(self): @pytest.mark.single @pytest.mark.high_memory - @pytest.mark.parametrize('values', [ - np.arange(2**24 + 1), - np.arange(2**25 + 2).reshape(2**24 + 1, 2)], - ids=['1d', '2d']) + @pytest.mark.parametrize( + "values", + [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)], + ids=["1d", "2d"], + ) def test_pct_max_many_rows(self, values): # GH 18271 result = algos.rank(values, pct=True).max() @@ -1528,8 +1665,8 @@ def test_pct_max_many_rows(self, values): def test_pad_backfill_object_segfault(): - old = np.array([], dtype='O') - new = np.array([datetime(2010, 12, 31)], dtype='O') + old = np.array([], dtype="O") + new = np.array([datetime(2010, 12, 31)], dtype="O") result = libalgos.pad["object"](old, new) expected = np.array([-1], dtype=np.int64) @@ -1549,13 +1686,12 @@ def test_pad_backfill_object_segfault(): def test_arrmap(): - values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') - result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar']) - assert (result.dtype == np.bool_) + values = np.array(["foo", "foo", "bar", "bar", "baz", "qux"], dtype="O") + result = libalgos.arrmap_object(values, lambda x: x in ["foo", "bar"]) + assert result.dtype == np.bool_ class TestTseriesUtil: 
- def test_combineFunc(self): pass @@ -1577,8 +1713,7 @@ def test_backfill(self): filler = libalgos.backfill["int64_t"](old.values, new.values) - expect_filler = np.array([0, 0, 1, 1, 1, 1, - 2, 2, 2, 2, 2, -1], dtype=np.int64) + expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.int64) tm.assert_numpy_array_equal(filler, expect_filler) # corner case @@ -1595,8 +1730,7 @@ def test_pad(self): filler = libalgos.pad["int64_t"](old.values, new.values) - expect_filler = np.array([-1, 0, 0, 0, 0, 1, - 1, 1, 1, 1, 2, 2], dtype=np.int64) + expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.int64) tm.assert_numpy_array_equal(filler, expect_filler) # corner case @@ -1609,32 +1743,267 @@ def test_pad(self): def test_is_lexsorted(): failure = [ - np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, - 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'), - np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, - 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, - 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, - 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 6, 5, - 4, 3, 2, 1, 0], dtype='int64')] - - assert (not libalgos.is_lexsorted(failure)) + np.array( + [ + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype="int64", + ), + np.array( + [ + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + ], + dtype="int64", + ), + ] + + assert not libalgos.is_lexsorted(failure) def test_groupsort_indexer(): @@ -1646,7 +2015,7 @@ def test_groupsort_indexer(): # need to use a stable sort # np.argsort returns int, groupsort_indexer # always returns int64 - expected = np.argsort(a, kind='mergesort') + expected = np.argsort(a, kind="mergesort") expected = expected.astype(np.int64) 
tm.assert_numpy_array_equal(result, expected) @@ -1718,7 +2087,7 @@ def test_ensure_platform_int(): arr = np.arange(100, dtype=np.intp) result = libalgos.ensure_platform_int(arr) - assert (result is arr) + assert result is arr def test_int64_add_overflow(): @@ -1738,34 +2107,42 @@ def test_int64_add_overflow(): with pytest.raises(OverflowError, match=msg): algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) + ) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) + ) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([False, True]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True]), + ) with pytest.raises(OverflowError, match=msg): with tm.assert_produces_warning(RuntimeWarning): - algos.checked_add_with_arr(np.array([m, m]), - np.array([np.nan, m])) + algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) # Check that the nan boolean arrays override whether or not # the addition overflows. We don't check the result but just # the fact that an OverflowError is not raised. - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([True, True])) - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - b_mask=np.array([True, True])) - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([True, False]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True]), + ) class TestMode: - def test_no_mode(self): exp = Series([], dtype=np.float64) tm.assert_series_equal(algos.mode([]), exp) @@ -1778,7 +2155,7 @@ def test_mode_single(self): exp_multi = [1] data_multi = [1, 1] - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: s = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) tm.assert_series_equal(algos.mode(s), exp) @@ -1790,8 +2167,8 @@ def test_mode_single(self): exp = Series([1], dtype=np.int) tm.assert_series_equal(algos.mode([1]), exp) - exp = Series(['a', 'b', 'c'], dtype=np.object) - tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp) + exp = Series(["a", "b", "c"], dtype=np.object) + tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp) def test_number_mode(self): exp_single = [1] @@ -1800,7 +2177,7 @@ def test_number_mode(self): exp_multi = [1, 3] data_multi = [1] * 5 + [2] * 3 + [3] * 5 - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: s = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) tm.assert_series_equal(algos.mode(s), exp) @@ -1810,15 +2187,15 @@ def 
test_number_mode(self): tm.assert_series_equal(algos.mode(s), exp) def test_strobj_mode(self): - exp = ['b'] - data = ['a'] * 2 + ['b'] * 3 + exp = ["b"] + data = ["a"] * 2 + ["b"] * 3 - s = Series(data, dtype='c') - exp = Series(exp, dtype='c') + s = Series(data, dtype="c") + exp = Series(exp, dtype="c") tm.assert_series_equal(algos.mode(s), exp) - exp = ['bar'] - data = ['foo'] * 2 + ['bar'] * 3 + exp = ["bar"] + data = ["foo"] * 2 + ["bar"] * 3 for dt in [str, object]: s = Series(data, dtype=dt) @@ -1826,41 +2203,41 @@ def test_strobj_mode(self): tm.assert_series_equal(algos.mode(s), exp) def test_datelike_mode(self): - exp = Series(['1900-05-03', '2011-01-03', - '2013-01-02'], dtype="M8[ns]") - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03'], dtype='M8[ns]') + exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]") tm.assert_series_equal(algos.mode(s), exp) - exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]') - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02'], dtype='M8[ns]') + exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"], + dtype="M8[ns]", + ) tm.assert_series_equal(algos.mode(s), exp) def test_timedelta_mode(self): - exp = Series(['-1 days', '0 days', '1 days'], - dtype='timedelta64[ns]') - s = Series(['1 days', '-1 days', '0 days'], - dtype='timedelta64[ns]') + exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]") + s = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]") tm.assert_series_equal(algos.mode(s), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + s = Series( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(s), exp) def test_mixed_dtype(self): - exp = Series(['foo']) - s = Series([1, 'foo', 'foo']) + exp = Series(["foo"]) + s = Series([1, "foo", "foo"]) tm.assert_series_equal(algos.mode(s), exp) def test_uint64_overflow(self): - exp = Series([2**63], dtype=np.uint64) - s = Series([1, 2**63, 2**63], dtype=np.uint64) + exp = Series([2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) - exp = Series([1, 2**63], dtype=np.uint64) - s = Series([1, 2**63], dtype=np.uint64) + exp = Series([1, 2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) def test_categorical(self): @@ -1869,8 +2246,8 @@ def test_categorical(self): tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) - c = Categorical([1, 'a', 'a']) - exp = Categorical(['a'], categories=[1, 'a']) + c = Categorical([1, "a", "a"]) + exp = Categorical(["a"], categories=[1, "a"]) tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) @@ -1884,15 +2261,17 @@ def test_index(self): exp = Series([1, 2, 3], dtype=np.int64) tm.assert_series_equal(algos.mode(idx), exp) - idx = Index([1, 'a', 'a']) - exp = Series(['a'], dtype=object) + idx = Index([1, "a", "a"]) + exp = Series(["a"], dtype=object) tm.assert_series_equal(algos.mode(idx), exp) idx = Index([1, 1, 2, 3, 3]) exp = Series([1, 3], dtype=np.int64) 
tm.assert_series_equal(algos.mode(idx), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + idx = Index( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(idx), exp) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index f9a1bb97cc48cd..279d6dd84d92bb 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -11,14 +11,28 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_datetime64tz_dtype, is_object_dtype, - is_timedelta64_dtype, needs_i8_conversion) + is_datetime64_dtype, + is_datetime64tz_dtype, + is_object_dtype, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex, - PeriodIndex, Series, Timedelta, TimedeltaIndex, Timestamp) + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, +) from pandas.core.accessor import PandasDelegate from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray from pandas.core.base import NoNewAttributesMixin, PandasObject @@ -27,21 +41,20 @@ class CheckStringMixin: - def test_string_methods_dont_fail(self): repr(self.container) str(self.container) bytes(self.container) def test_tricky_container(self): - if not hasattr(self, 'unicode_container'): - pytest.skip('Need unicode_container to test with this') + if not hasattr(self, "unicode_container"): + pytest.skip("Need unicode_container to test with this") repr(self.unicode_container) str(self.unicode_container) class CheckImmutable: - mutable_regex = re.compile('does not support mutable operations') + mutable_regex = re.compile("does not support mutable operations") def check_mutable_error(self, *args, **kwargs): # Pass whatever function you normally would to pytest.raises @@ -86,10 +99,9 @@ def check_result(self, result, expected, klass=None): class TestPandasDelegate: - class Delegator: - _properties = ['foo'] - _methods = ['bar'] + _properties = ["foo"] + _methods = ["bar"] def _set_foo(self, value): self.foo = value @@ -104,7 +116,6 @@ def bar(self, *args, **kwargs): pass class Delegate(PandasDelegate, PandasObject): - def __init__(self, obj): self.obj = obj @@ -119,12 +130,10 @@ def test_invalid_delegation(self): self.Delegate._add_delegate_accessors( delegate=self.Delegator, accessors=self.Delegator._properties, - typ='property' + typ="property", ) self.Delegate._add_delegate_accessors( - delegate=self.Delegator, - accessors=self.Delegator._methods, - typ='method' + delegate=self.Delegator, accessors=self.Delegator._methods, typ="method" ) delegate = self.Delegate(self.Delegator()) @@ -148,40 +157,36 @@ def test_memory_usage(self): class Ops: - def _allow_na_ops(self, obj): """Whether to skip test cases including NaN""" - if (isinstance(obj, Index) and - (obj.is_boolean() or not obj._can_hold_na)): + if isinstance(obj, Index) and (obj.is_boolean() or not obj._can_hold_na): # don't test boolean / int64 index return False return True def setup_method(self, method): - self.bool_index = tm.makeBoolIndex(10, name='a') - self.int_index = tm.makeIntIndex(10, 
name='a') - self.float_index = tm.makeFloatIndex(10, name='a') - self.dt_index = tm.makeDateIndex(10, name='a') - self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize( - tz='US/Eastern') - self.period_index = tm.makePeriodIndex(10, name='a') - self.string_index = tm.makeStringIndex(10, name='a') - self.unicode_index = tm.makeUnicodeIndex(10, name='a') + self.bool_index = tm.makeBoolIndex(10, name="a") + self.int_index = tm.makeIntIndex(10, name="a") + self.float_index = tm.makeFloatIndex(10, name="a") + self.dt_index = tm.makeDateIndex(10, name="a") + self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern") + self.period_index = tm.makePeriodIndex(10, name="a") + self.string_index = tm.makeStringIndex(10, name="a") + self.unicode_index = tm.makeUnicodeIndex(10, name="a") arr = np.random.randn(10) - self.bool_series = Series(arr, index=self.bool_index, name='a') - self.int_series = Series(arr, index=self.int_index, name='a') - self.float_series = Series(arr, index=self.float_index, name='a') - self.dt_series = Series(arr, index=self.dt_index, name='a') + self.bool_series = Series(arr, index=self.bool_index, name="a") + self.int_series = Series(arr, index=self.int_index, name="a") + self.float_series = Series(arr, index=self.float_index, name="a") + self.dt_series = Series(arr, index=self.dt_index, name="a") self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) - self.period_series = Series(arr, index=self.period_index, name='a') - self.string_series = Series(arr, index=self.string_index, name='a') - self.unicode_series = Series(arr, index=self.unicode_index, name='a') - - types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string', - 'unicode'] - self.indexes = [getattr(self, '{}_index'.format(t)) for t in types] - self.series = [getattr(self, '{}_series'.format(t)) for t in types] + self.period_series = Series(arr, index=self.period_index, name="a") + self.string_series = Series(arr, index=self.string_index, name="a") + self.unicode_series = Series(arr, index=self.unicode_index, name="a") + + types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] + self.indexes = [getattr(self, "{}_index".format(t)) for t in types] + self.series = [getattr(self, "{}_series".format(t)) for t in types] self.objs = self.indexes + self.series def check_ops_properties(self, props, filter=None, ignore_failures=False): @@ -196,8 +201,7 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): try: if isinstance(o, Series): - expected = Series( - getattr(o.index, op), index=o.index, name='a') + expected = Series(getattr(o.index, op), index=o.index, name="a") else: expected = getattr(o, op) except (AttributeError): @@ -211,8 +215,9 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): tm.assert_series_equal(result, expected) elif isinstance(result, Index) and isinstance(expected, Index): tm.assert_index_equal(result, expected) - elif isinstance(result, np.ndarray) and isinstance(expected, - np.ndarray): + elif isinstance(result, np.ndarray) and isinstance( + expected, np.ndarray + ): tm.assert_numpy_array_equal(result, expected) else: assert result == expected @@ -231,29 +236,30 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): with pytest.raises(err): getattr(o, op) - @pytest.mark.parametrize('klass', [Series, DataFrame]) + @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_binary_ops_docs(self, klass): - op_map = {'add': '+', - 'sub': '-', - 'mul': '*', - 'mod': 
'%', - 'pow': '**', - 'truediv': '/', - 'floordiv': '//'} + op_map = { + "add": "+", + "sub": "-", + "mul": "*", + "mod": "%", + "pow": "**", + "truediv": "/", + "floordiv": "//", + } for op_name in op_map: operand1 = klass.__name__.lower() - operand2 = 'other' + operand2 = "other" op = op_map[op_name] - expected_str = ' '.join([operand1, op, operand2]) + expected_str = " ".join([operand1, op, operand2]) assert expected_str in getattr(klass, op_name).__doc__ # reverse version of the binary ops - expected_str = ' '.join([operand2, op, operand1]) - assert expected_str in getattr(klass, 'r' + op_name).__doc__ + expected_str = " ".join([operand2, op, operand1]) + assert expected_str in getattr(klass, "r" + op_name).__doc__ class TestIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) self.is_valid_objs = self.objs @@ -286,7 +292,7 @@ def test_none_comparison(self): assert result.iat[0] assert result.iat[1] - if (is_datetime64_dtype(o) or is_datetime64tz_dtype(o)): + if is_datetime64_dtype(o) or is_datetime64tz_dtype(o): # Following DatetimeIndex (and Timestamp) convention, # inequality comparisons with Series[datetime64] raise with pytest.raises(TypeError): @@ -306,16 +312,16 @@ def test_ndarray_compat_properties(self): for o in self.objs: # Check that we work. - for p in ['shape', 'dtype', 'T', 'nbytes']: + for p in ["shape", "dtype", "T", "nbytes"]: assert getattr(o, p, None) is not None # deprecated properties - for p in ['flags', 'strides', 'itemsize']: + for p in ["flags", "strides", "itemsize"]: with tm.assert_produces_warning(FutureWarning): assert getattr(o, p, None) is not None with tm.assert_produces_warning(FutureWarning): - assert hasattr(o, 'base') + assert hasattr(o, "base") # If we have a datetime-like dtype then needs a view to work # but the user is responsible for that @@ -354,25 +360,26 @@ def test_value_counts_unique_nunique(self): expected_index = Index(o[::-1]) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + o.name = "a" else: expected_index = Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) # take-based repeat indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1)) rep = values.take(indices) - o = klass(rep, index=idx, name='a') + o = klass(rep, index=idx, name="a") # check values has the same dtype as the original assert o.dtype == orig.dtype - expected_s = Series(range(10, 0, -1), index=expected_index, - dtype='int64', name='a') + expected_s = Series( + range(10, 0, -1), index=expected_index, dtype="int64", name="a" + ) result = o.value_counts() tm.assert_series_equal(result, expected_s) assert result.index.name is None - assert result.name == 'a' + assert result.name == "a" result = o.unique() if isinstance(o, Index): @@ -385,14 +392,14 @@ def test_value_counts_unique_nunique(self): assert isinstance(r, Timestamp) tm.assert_numpy_array_equal( - result.astype(object), - orig._values.astype(object)) + result.astype(object), orig._values.astype(object) + ) else: tm.assert_numpy_array_equal(result, orig.values) assert o.nunique() == len(np.unique(o.values)) - @pytest.mark.parametrize('null_obj', [np.nan, None]) + @pytest.mark.parametrize("null_obj", [np.nan, None]) def test_value_counts_unique_nunique_null(self, null_obj): for orig in self.objs: @@ -431,7 +438,7 @@ def test_value_counts_unique_nunique_null(self, null_obj): # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) - o.name = 'a' + o.name = "a" else: if isinstance(o, DatetimeIndex): expected_index = 
orig._values._shallow_copy(values) @@ -439,7 +446,7 @@ def test_value_counts_unique_nunique_null(self, null_obj): expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + o.name = "a" # check values has the same dtype as the original assert o.dtype == orig.dtype @@ -449,29 +456,34 @@ def test_value_counts_unique_nunique_null(self, null_obj): if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: - exp = Series(nanloc, o.index, name='a') + exp = Series(nanloc, o.index, name="a") tm.assert_series_equal(pd.isna(o), exp) - expected_s_na = Series(list(range(10, 2, -1)) + [3], - index=expected_index[9:0:-1], - dtype='int64', name='a') - expected_s = Series(list(range(10, 2, -1)), - index=expected_index[9:1:-1], - dtype='int64', name='a') + expected_s_na = Series( + list(range(10, 2, -1)) + [3], + index=expected_index[9:0:-1], + dtype="int64", + name="a", + ) + expected_s = Series( + list(range(10, 2, -1)), + index=expected_index[9:1:-1], + dtype="int64", + name="a", + ) result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None - assert result_s_na.name == 'a' + assert result_s_na.name == "a" result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None - assert result_s.name == 'a' + assert result_s.name == "a" result = o.unique() if isinstance(o, Index): - tm.assert_index_equal(result, - Index(values[1:], name='a')) + tm.assert_index_equal(result, Index(values[1:], name="a")) elif is_datetime64tz_dtype(o): # unable to compare NaT / nan tm.assert_extension_array_equal(result[1:], values[2:]) @@ -485,11 +497,11 @@ def test_value_counts_unique_nunique_null(self, null_obj): assert o.nunique() == 8 assert o.nunique(dropna=False) == 9 - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_inferred(self, klass): - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -503,22 +515,22 @@ def test_value_counts_inferred(self, klass): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list('cdab')) + expected = Series([1, 2, 3, 4], index=list("cdab")) tm.assert_series_equal(hist, expected) # relative histogram. 
hist = s.value_counts(normalize=True) - expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_bins(self, klass): - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) # bins @@ -553,63 +565,66 @@ def test_value_counts_bins(self, klass): tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], - index=intervals.take([0, 3, 1, 2])) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly - s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, - 'd', 'd', 'a', 'a', 'b'] + s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=['b', 'a', 'd']) + expected = Series([4, 3, 2], index=["b", "a", "d"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): - exp = Index(['a', 'b', np.nan, 'd']) + exp = Index(["a", "b", np.nan, "d"]) tm.assert_index_equal(s.unique(), exp) else: - exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) + exp = np.array(["a", "b", np.nan, "d"], dtype=object) tm.assert_numpy_array_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, - check_index_type=False) + tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) assert s.nunique() == 0 - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_datetime64(self, klass): # GH 3002, datetime64[ns] # don't test names though - txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', - 'xxyyzz20100101EGG', 'xxyyww20090101EGG', - 'foofoo20080909PIE', 'foofoo20080909GUM']) + txt = "\n".join( + [ + "xxyyzz20100101PIE", + "xxyyzz20100101GUM", + "xxyyzz20100101EGG", + "xxyyww20090101EGG", + "foofoo20080909PIE", + "foofoo20080909GUM", + ] + ) f = StringIO(txt) - df = pd.read_fwf(f, widths=[6, 8, 3], - names=["person_id", "dt", "food"], - parse_dates=["dt"]) + df = pd.read_fwf( + f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] + ) - s = klass(df['dt'].copy()) + s = klass(df["dt"].copy()) s.name = None - idx = pd.to_datetime(['2010-01-01 00:00:00', - '2008-09-09 00:00:00', - '2009-01-01 00:00:00']) + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] + ) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) - expected = np_array_datetime64_compat(['2010-01-01 00:00:00', - '2009-01-01 00:00:00', - '2008-09-09 00:00:00'], - dtype='datetime64[ns]') + expected = np_array_datetime64_compat( + ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], + dtype="datetime64[ns]", + ) if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: @@ -618,11 +633,11 
@@ def test_value_counts_datetime64(self, klass): assert s.nunique() == 3 # with NaT - s = df['dt'].copy() + s = df["dt"].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() - assert result.index.dtype == 'datetime64[ns]' + assert result.index.dtype == "datetime64[ns]" tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) @@ -630,7 +645,7 @@ def test_value_counts_datetime64(self, klass): tm.assert_series_equal(result, expected_s) unique = s.unique() - assert unique.dtype == 'datetime64[ns]' + assert unique.dtype == "datetime64[ns]" # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): @@ -645,20 +660,20 @@ def test_value_counts_datetime64(self, klass): # timedelta64[ns] td = df.dt - df.dt + timedelta(1) - td = klass(td, name='dt') + td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta('1day')], name='dt') + expected_s = Series([6], index=[Timedelta("1day")], name="dt") tm.assert_series_equal(result, expected_s) - expected = TimedeltaIndex(['1 days'], name='dt') + expected = TimedeltaIndex(["1 days"], name="dt") if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) - td2 = klass(td2, name='dt') + td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) @@ -677,12 +692,10 @@ def test_factorize(self): tm.assert_numpy_array_equal(labels, exp_arr) if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig), - check_names=False) + tm.assert_index_equal(uniques, Index(orig), check_names=False) else: # factorize explicitly resets name - tm.assert_index_equal(uniques, exp_uniques, - check_names=False) + tm.assert_index_equal(uniques, exp_uniques, check_names=False) def test_factorize_repeated(self): for orig in self.objs: @@ -701,19 +714,20 @@ def test_factorize_repeated(self): o = o.take(indexer) n = o[5:].append(o) - exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - dtype=np.intp) + exp_arr = np.array( + [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp + ) labels, uniques = n.factorize(sort=True) tm.assert_numpy_array_equal(labels, exp_arr) if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig).sort_values(), - check_names=False) + tm.assert_index_equal( + uniques, Index(orig).sort_values(), check_names=False + ) else: tm.assert_index_equal(uniques, o, check_names=False) - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], - np.intp) + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) labels, uniques = n.factorize(sort=False) tm.assert_numpy_array_equal(labels, exp_arr) @@ -732,7 +746,7 @@ def test_duplicated_drop_duplicates_index(self): # special case if original.is_boolean(): result = original.drop_duplicates() - expected = Index([False, True], name='a') + expected = Index([False, True], name="a") tm.assert_index_equal(result, expected) continue @@ -750,8 +764,7 @@ def test_duplicated_drop_duplicates_index(self): # create repeated values, 3rd and 5th values are duplicated idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], - dtype=bool) + expected = np.array([False] * len(original) + [True, True], dtype=bool) duplicated = idx.duplicated() tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool @@ -762,10 +775,10 @@ def 
test_duplicated_drop_duplicates_index(self): base[5] = True expected = np.array(base) - duplicated = idx.duplicated(keep='last') + duplicated = idx.duplicated(keep="last") tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool - result = idx.drop_duplicates(keep='last') + result = idx.drop_duplicates(keep="last") tm.assert_index_equal(result, idx[~expected]) base = [False] * len(original) + [True, True] @@ -779,14 +792,18 @@ def test_duplicated_drop_duplicates_index(self): result = idx.drop_duplicates(keep=False) tm.assert_index_equal(result, idx[~expected]) - with pytest.raises(TypeError, - match=(r"drop_duplicates\(\) got an " - r"unexpected keyword argument")): + with pytest.raises( + TypeError, + match=( + r"drop_duplicates\(\) got an " r"unexpected keyword argument" + ), + ): idx.drop_duplicates(inplace=True) else: - expected = Series([False] * len(original), - index=original.index, name='a') + expected = Series( + [False] * len(original), index=original.index, name="a" + ) tm.assert_series_equal(original.duplicated(), expected) result = original.drop_duplicates() tm.assert_series_equal(result, original) @@ -794,45 +811,56 @@ def test_duplicated_drop_duplicates_index(self): idx = original.index[list(range(len(original))) + [5, 3]] values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name='a') + s = Series(values, index=idx, name="a") - expected = Series([False] * len(original) + [True, True], - index=idx, name='a') + expected = Series( + [False] * len(original) + [True, True], index=idx, name="a" + ) tm.assert_series_equal(s.duplicated(), expected) tm.assert_series_equal(s.drop_duplicates(), original) base = [False] * len(idx) base[3] = True base[5] = True - expected = Series(base, index=idx, name='a') + expected = Series(base, index=idx, name="a") - tm.assert_series_equal(s.duplicated(keep='last'), expected) - tm.assert_series_equal(s.drop_duplicates(keep='last'), - s[~np.array(base)]) + tm.assert_series_equal(s.duplicated(keep="last"), expected) + tm.assert_series_equal( + s.drop_duplicates(keep="last"), s[~np.array(base)] + ) base = [False] * len(original) + [True, True] base[3] = True base[5] = True - expected = Series(base, index=idx, name='a') + expected = Series(base, index=idx, name="a") tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal(s.drop_duplicates(keep=False), - s[~np.array(base)]) + tm.assert_series_equal( + s.drop_duplicates(keep=False), s[~np.array(base)] + ) s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) def test_drop_duplicates_series_vs_dataframe(self): # GH 14192 - df = pd.DataFrame({'a': [1, 1, 1, 'one', 'one'], - 'b': [2, 2, np.nan, np.nan, np.nan], - 'c': [3, 3, np.nan, np.nan, 'three'], - 'd': [1, 2, 3, 4, 4], - 'e': [datetime(2015, 1, 1), datetime(2015, 1, 1), - datetime(2015, 2, 1), pd.NaT, pd.NaT] - }) + df = pd.DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + pd.NaT, + pd.NaT, + ], + } + ) for column in df.columns: - for keep in ['first', 'last', False]: + for keep in ["first", "last", False]: dropped_frame = df[[column]].drop_duplicates(keep=keep) dropped_series = df[column].drop_duplicates(keep=keep) tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) @@ -896,17 +924,18 @@ def test_memory_usage(self): res = o.memory_usage() res_deep = 
o.memory_usage(deep=True) - if (is_object_dtype(o) or (isinstance(o, Series) and - is_object_dtype(o.index))): + if is_object_dtype(o) or ( + isinstance(o, Series) and is_object_dtype(o.index) + ): # if there are objects, only deep will pick them up assert res_deep > res else: assert res == res_deep if isinstance(o, Series): - assert ((o.memory_usage(index=False) + - o.index.memory_usage()) == - o.memory_usage(index=True)) + assert ( + o.memory_usage(index=False) + o.index.memory_usage() + ) == o.memory_usage(index=True) # sys.getsizeof will call the .memory_usage with # deep=True, and add on some GC overhead @@ -944,10 +973,15 @@ def test_getitem(self): with pytest.raises(IndexError): s.iloc[20] - @pytest.mark.parametrize('indexer_klass', [list, pd.Index]) - @pytest.mark.parametrize('indexer', [[True] * 10, [False] * 10, - [True, False, True, True, False, - False, True, True, False, True]]) + @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) + @pytest.mark.parametrize( + "indexer", + [ + [True] * 10, + [False] * 10, + [True, False, True, True, False, False, True, True, False, True], + ], + ) def test_bool_indexing(self, indexer_klass, indexer): # GH 22533 for idx in self.indexes: @@ -980,7 +1014,6 @@ def test_numpy_transpose(self): class TestNoNewAttributesMixin: - def test_mixin(self): class T(NoNewAttributesMixin): pass @@ -1005,32 +1038,34 @@ class TestToIterable: # test that we convert an iterable to python types dtypes = [ - ('int8', int), - ('int16', int), - ('int32', int), - ('int64', int), - ('uint8', int), - ('uint16', int), - ('uint32', int), - ('uint64', int), - ('float16', float), - ('float32', float), - ('float64', float), - ('datetime64[ns]', Timestamp), - ('datetime64[ns, US/Eastern]', Timestamp), - ('timedelta64[ns]', Timedelta)] - - @pytest.mark.parametrize( - 'dtype, rdtype', dtypes) + ("int8", int), + ("int16", int), + ("int32", int), + ("int64", int), + ("uint8", int), + ("uint16", int), + ("uint32", int), + ("uint64", int), + ("float16", float), + ("float32", float), + ("float64", float), + ("datetime64[ns]", Timestamp), + ("datetime64[ns, US/Eastern]", Timestamp), + ("timedelta64[ns]", Timedelta), + ] + + @pytest.mark.parametrize("dtype, rdtype", dtypes) @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) - @pytest.mark.parametrize('typ', [Series, Index]) + ], + ids=["tolist", "to_list", "list", "iter"], + ) + @pytest.mark.parametrize("typ", [Series, Index]) @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") # TODO(GH-24559): Remove the filterwarnings def test_iterable(self, typ, method, dtype, rdtype): @@ -1042,23 +1077,26 @@ def test_iterable(self, typ, method, dtype, rdtype): assert isinstance(result, rdtype) @pytest.mark.parametrize( - 'dtype, rdtype, obj', + "dtype, rdtype, obj", [ - ('object', object, 'a'), - ('object', int, 1), - ('category', object, 'a'), - ('category', int, 1)]) + ("object", object, "a"), + ("object", int, 1), + ("category", object, "a"), + ("category", int, 1), + ], + ) @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) - @pytest.mark.parametrize('typ', [Series, Index]) - def test_iterable_object_and_category(self, typ, method, - dtype, rdtype, obj): + ], + ids=["tolist", "to_list", "list", "iter"], + ) + @pytest.mark.parametrize("typ", 
[Series, Index]) + def test_iterable_object_and_category(self, typ, method, dtype, rdtype, obj): # gh-10904 # gh-13258 # coerce iteration to underlying python / pandas types @@ -1066,8 +1104,7 @@ def test_iterable_object_and_category(self, typ, method, result = method(s)[0] assert isinstance(result, rdtype) - @pytest.mark.parametrize( - 'dtype, rdtype', dtypes) + @pytest.mark.parametrize("dtype, rdtype", dtypes) def test_iterable_items(self, dtype, rdtype): # gh-13258 # test items / iteritems yields the correct boxed scalars @@ -1080,11 +1117,9 @@ def test_iterable_items(self, dtype, rdtype): assert isinstance(result, rdtype) @pytest.mark.parametrize( - 'dtype, rdtype', - dtypes + [ - ('object', int), - ('category', int)]) - @pytest.mark.parametrize('typ', [Series, Index]) + "dtype, rdtype", dtypes + [("object", int), ("category", int)] + ) + @pytest.mark.parametrize("typ", [Series, Index]) @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") # TODO(GH-24559): Remove the filterwarnings def test_iterable_map(self, typ, dtype, rdtype): @@ -1097,89 +1132,104 @@ def test_iterable_map(self, typ, dtype, rdtype): assert result in rdtype @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) + ], + ids=["tolist", "to_list", "list", "iter"], + ) def test_categorial_datetimelike(self, method): - i = CategoricalIndex([Timestamp('1999-12-31'), - Timestamp('2000-12-31')]) + i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")]) result = method(i)[0] assert isinstance(result, Timestamp) def test_iter_box(self): - vals = [Timestamp('2011-01-01'), Timestamp('2011-01-02')] + vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] s = Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" for res, exp in zip(s, vals): assert isinstance(res, Timestamp) assert res.tz is None assert res == exp - vals = [Timestamp('2011-01-01', tz='US/Eastern'), - Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ] s = Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' + assert s.dtype == "datetime64[ns, US/Eastern]" for res, exp in zip(s, vals): assert isinstance(res, Timestamp) assert res.tz == exp.tz assert res == exp # timedelta - vals = [Timedelta('1 days'), Timedelta('2 days')] + vals = [Timedelta("1 days"), Timedelta("2 days")] s = Series(vals) - assert s.dtype == 'timedelta64[ns]' + assert s.dtype == "timedelta64[ns]" for res, exp in zip(s, vals): assert isinstance(res, Timedelta) assert res == exp # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) - assert s.dtype == 'Period[M]' + assert s.dtype == "Period[M]" for res, exp in zip(s, vals): assert isinstance(res, pd.Period) - assert res.freq == 'M' + assert res.freq == "M" assert res == exp -@pytest.mark.parametrize('array, expected_type, dtype', [ - (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), - (np.array(['a', 'b']), np.ndarray, 'object'), - (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), - (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray, - 'datetime64[ns, US/Central]'), - - (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC")), - 
(pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, - 'interval'), - - # This test is currently failing for datetime64[ns] and timedelta64[ns]. - # The NumPy type system is sufficient for representing these types, so - # we just use NumPy for Series / DataFrame columns of these types (so - # we get consolidation and so on). - # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray - # abstraction to for code reuse. - # At the moment, we've judged that allowing this test to fail is more - # practical that overriding Series._values to special case - # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. - pytest.param( - pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]', - marks=[pytest.mark.xfail(reason="datetime _values", strict=True)] - ), - pytest.param( - pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]', - marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)] - ), - -]) +@pytest.mark.parametrize( + "array, expected_type, dtype", + [ + (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"), + (np.array(["a", "b"]), np.ndarray, "object"), + (pd.Categorical(["a", "b"]), pd.Categorical, "category"), + ( + pd.DatetimeIndex(["2017", "2018"], tz="US/Central"), + DatetimeArray, + "datetime64[ns, US/Central]", + ), + ( + pd.PeriodIndex([2018, 2019], freq="A"), + pd.core.arrays.PeriodArray, + pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + ), + ( + pd.IntervalIndex.from_breaks([0, 1, 2]), + pd.core.arrays.IntervalArray, + "interval", + ), + # This test is currently failing for datetime64[ns] and timedelta64[ns]. + # The NumPy type system is sufficient for representing these types, so + # we just use NumPy for Series / DataFrame columns of these types (so + # we get consolidation and so on). + # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray + # abstraction to for code reuse. + # At the moment, we've judged that allowing this test to fail is more + # practical that overriding Series._values to special case + # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. 
+ pytest.param( + pd.DatetimeIndex(["2017", "2018"]), + np.ndarray, + "datetime64[ns]", + marks=[pytest.mark.xfail(reason="datetime _values", strict=True)], + ), + pytest.param( + pd.TimedeltaIndex([10 ** 10]), + np.ndarray, + "m8[ns]", + marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)], + ), + ], +) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values r_values = pd.Index(array)._values @@ -1189,18 +1239,27 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize('array, expected', [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), - (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), - (pd.DatetimeIndex(['2017-01-01T00:00:00']), - np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), - (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), - np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), - (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), - (pd.PeriodIndex(['2017', '2018'], freq='D'), - np.array([17167, 17532], dtype=np.int64)), -]) +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), + (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), + (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"]), + np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), + ), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), + np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), + ), + (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), + ( + pd.PeriodIndex(["2017", "2018"], freq="D"), + np.array([17167, 17532], dtype=np.int64), + ), + ], +) def test_ndarray_values(array, expected): l_values = pd.Series(array)._ndarray_values r_values = pd.Index(array)._ndarray_values @@ -1208,9 +1267,7 @@ def test_ndarray_values(array, expected): tm.assert_numpy_array_equal(l_values, expected) -@pytest.mark.parametrize("arr", [ - np.array([1, 2, 3]), -]) +@pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) result = ser.array @@ -1229,23 +1286,30 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): assert isinstance(result, PandasArray) -@pytest.mark.parametrize("array, attr", [ - (pd.Categorical(['a', 'b']), '_codes'), - (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'), - (pd.core.arrays.integer_array([0, np.nan]), '_data'), - (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'), - (pd.SparseArray([0, 1]), '_sparse_values'), - (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), - # tz-aware Datetime - (DatetimeArray(np.array(['2000-01-01T12:00:00', - '2000-01-02T12:00:00'], - dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz="US/Central")), - '_data'), -]) -@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +@pytest.mark.parametrize( + "array, attr", + [ + (pd.Categorical(["a", "b"]), "_codes"), + (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), + (pd.core.arrays.integer_array([0, np.nan]), "_data"), + (pd.core.arrays.IntervalArray.from_breaks([0, 1]), "_left"), + (pd.SparseArray([0, 1]), "_sparse_values"), + (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), + # tz-aware Datetime + ( + DatetimeArray( + np.array( + ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" + ), + 
dtype=DatetimeTZDtype(tz="US/Central"), + ), + "_data", + ), + ], +) +@pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_array(array, attr, box): - if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: pytest.skip("No index type for {}".format(array.dtype)) result = box(array, copy=False).array @@ -1257,43 +1321,61 @@ def test_array(array, attr, box): def test_array_multiindex_raises(): - idx = pd.MultiIndex.from_product([['A'], ['a', 'b']]) - with pytest.raises(ValueError, match='MultiIndex'): + idx = pd.MultiIndex.from_product([["A"], ["a", "b"]]) + with pytest.raises(ValueError, match="MultiIndex"): idx.array -@pytest.mark.parametrize('array, expected', [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(['a', 'b']), np.array(['a', 'b'], dtype=object)), - (pd.core.arrays.period_array(['2000', '2001'], freq='D'), - np.array([pd.Period('2000', freq="D"), pd.Period('2001', freq='D')])), - (pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object)), - (pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), - np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object)), - (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), - - # tz-naive datetime - (DatetimeArray(np.array(['2000', '2001'], dtype='M8[ns]')), - np.array(['2000', '2001'], dtype='M8[ns]')), - - # tz-aware stays tz`-aware - (DatetimeArray(np.array(['2000-01-01T06:00:00', - '2000-01-02T06:00:00'], - dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz='US/Central')), - np.array([pd.Timestamp('2000-01-01', tz='US/Central'), - pd.Timestamp('2000-01-02', tz='US/Central')])), - - # Timedelta - (TimedeltaArray(np.array([0, 3600000000000], dtype='i8'), freq='H'), - np.array([0, 3600000000000], dtype='m8[ns]')), -]) -@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + ( + pd.core.arrays.period_array(["2000", "2001"], freq="D"), + np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + ), + ( + pd.core.arrays.integer_array([0, np.nan]), + np.array([0, np.nan], dtype=object), + ), + ( + pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), + np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + ), + (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + # tz-naive datetime + ( + DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), + np.array(["2000", "2001"], dtype="M8[ns]"), + ), + # tz-aware stays tz`-aware + ( + DatetimeArray( + np.array( + ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" + ), + dtype=DatetimeTZDtype(tz="US/Central"), + ), + np.array( + [ + pd.Timestamp("2000-01-01", tz="US/Central"), + pd.Timestamp("2000-01-02", tz="US/Central"), + ] + ), + ), + # Timedelta + ( + TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + np.array([0, 3600000000000], dtype="m8[ns]"), + ), + ], +) +@pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_to_numpy(array, expected, box): thing = box(array) - if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: pytest.skip("No index type for {}".format(array.dtype)) result = thing.to_numpy() @@ -1301,10 +1383,9 @@ def test_to_numpy(array, expected, box): 
@pytest.mark.parametrize("as_series", [True, False]) -@pytest.mark.parametrize("arr", [ - np.array([1, 2, 3], dtype="int64"), - np.array(['a', 'b', 'c'], dtype=object), -]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] +) def test_to_numpy_copy(arr, as_series): obj = pd.Index(arr, copy=False) if as_series: @@ -1325,23 +1406,22 @@ def test_to_numpy_copy(arr, as_series): @pytest.mark.parametrize("as_series", [True, False]) def test_to_numpy_dtype(as_series): tz = "US/Eastern" - obj = pd.DatetimeIndex(['2000', '2001'], tz=tz) + obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: obj = pd.Series(obj) # preserve tz by default result = obj.to_numpy() - expected = np.array([pd.Timestamp('2000', tz=tz), - pd.Timestamp('2001', tz=tz)], - dtype=object) + expected = np.array( + [pd.Timestamp("2000", tz=tz), pd.Timestamp("2001", tz=tz)], dtype=object + ) tm.assert_numpy_array_equal(result, expected) result = obj.to_numpy(dtype="object") tm.assert_numpy_array_equal(result, expected) result = obj.to_numpy(dtype="M8[ns]") - expected = np.array(['2000-01-01T05', '2001-01-01T05'], - dtype='M8[ns]') + expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -1349,35 +1429,47 @@ class TestConstruction: # test certain constructor behaviours on dtype inference across Series, # Index and DataFrame - @pytest.mark.parametrize("klass", [ - Series, - lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], - pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], - marks=pytest.mark.xfail), - Index, - ]) - @pytest.mark.parametrize("a", [ - np.array(['2263-01-01'], dtype='datetime64[D]'), - np.array([datetime(2263, 1, 1)], dtype=object), - np.array([np.datetime64('2263-01-01', 'D')], dtype=object), - np.array(["2263-01-01"], dtype=object) - ], ids=['datetime64[D]', 'object-datetime.datetime', - 'object-numpy-scalar', 'object-string']) + @pytest.mark.parametrize( + "klass", + [ + Series, + lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], + pytest.param( + lambda x, **kwargs: DataFrame(x, **kwargs)[0], marks=pytest.mark.xfail + ), + Index, + ], + ) + @pytest.mark.parametrize( + "a", + [ + np.array(["2263-01-01"], dtype="datetime64[D]"), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64("2263-01-01", "D")], dtype=object), + np.array(["2263-01-01"], dtype=object), + ], + ids=[ + "datetime64[D]", + "object-datetime.datetime", + "object-numpy-scalar", + "object-string", + ], + ) def test_constructor_datetime_outofbound(self, a, klass): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == 'M': + if a.dtype.kind == "M": with pytest.raises(pd.errors.OutOfBoundsDatetime): klass(a) else: result = klass(a) - assert result.dtype == 'object' + assert result.dtype == "object" tm.assert_numpy_array_equal(result.to_numpy(), a) # Explicit dtype specified # Forced conversion fails for all -> all cases raise error with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a, dtype='datetime64[ns]') + klass(a, dtype="datetime64[ns]") diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index de8d28eeb41a6f..d96f806bc383f3 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -21,31 +21,31 @@ def fn(x): part2 = partial(part1) class somecall: - def __call__(self): 
return x # noqa - assert getname(fn) == 'fn' + assert getname(fn) == "fn" assert getname(lambda_) - assert getname(part1) == 'fn' - assert getname(part2) == 'fn' - assert getname(somecall()) == 'somecall' + assert getname(part1) == "fn" + assert getname(part2) == "fn" + assert getname(somecall()) == "somecall" assert getname(1) is None def test_any_none(): - assert (com._any_none(1, 2, 3, None)) - assert (not com._any_none(1, 2, 3, 4)) + assert com._any_none(1, 2, 3, None) + assert not com._any_none(1, 2, 3, 4) def test_all_not_none(): - assert (com._all_not_none(1, 2, 3, 4)) - assert (not com._all_not_none(1, 2, 3, None)) - assert (not com._all_not_none(None, None, None, None)) + assert com._all_not_none(1, 2, 3, 4) + assert not com._all_not_none(1, 2, 3, None) + assert not com._all_not_none(None, None, None, None) def test_random_state(): import numpy.random as npr + # Check with seed state = com.random_state(5) assert state.uniform() == npr.RandomState(5).uniform() @@ -59,31 +59,34 @@ def test_random_state(): # Error for floats or strings with pytest.raises(ValueError): - com.random_state('test') + com.random_state("test") with pytest.raises(ValueError): com.random_state(5.5) -@pytest.mark.parametrize('left, right, expected', [ - (Series([1], name='x'), Series([2], name='x'), 'x'), - (Series([1], name='x'), Series([2], name='y'), None), - (Series([1]), Series([2], name='x'), None), - (Series([1], name='x'), Series([2]), None), - (Series([1], name='x'), [2], 'x'), - ([1], Series([2], name='y'), 'y')]) +@pytest.mark.parametrize( + "left, right, expected", + [ + (Series([1], name="x"), Series([2], name="x"), "x"), + (Series([1], name="x"), Series([2], name="y"), None), + (Series([1]), Series([2], name="x"), None), + (Series([1], name="x"), Series([2]), None), + (Series([1], name="x"), [2], "x"), + ([1], Series([2], name="y"), "y"), + ], +) def test_maybe_match_name(left, right, expected): assert ops._maybe_match_name(left, right) == expected def test_dict_compat(): - data_datetime64 = {np.datetime64('1990-03-15'): 1, - np.datetime64('2015-03-15'): 2} + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} data_unchanged = {1: 2, 3: 4, 5: 6} - expected = {Timestamp('1990-3-15'): 1, Timestamp('2015-03-15'): 2} - assert (com.dict_compat(data_datetime64) == expected) - assert (com.dict_compat(expected) == expected) - assert (com.dict_compat(data_unchanged) == data_unchanged) + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert com.dict_compat(data_datetime64) == expected + assert com.dict_compat(expected) == expected + assert com.dict_compat(data_unchanged) == data_unchanged def test_standardize_mapping(): @@ -99,11 +102,11 @@ def test_standardize_mapping(): with pytest.raises(TypeError): com.standardize_mapping(list) - fill = {'bad': 'data'} - assert (com.standardize_mapping(fill) == dict) + fill = {"bad": "data"} + assert com.standardize_mapping(fill) == dict # Convert instance to type - assert (com.standardize_mapping({}) == dict) + assert com.standardize_mapping({}) == dict dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index bb662e99664e2c..d644c002fbdfb6 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -35,13 +35,13 @@ def import_module(name): @pytest.fixture def df(): - return DataFrame({'A': [1, 2, 3]}) + return DataFrame({"A": [1, 2, 3]}) def test_dask(df): - toolz = 
import_module('toolz') # noqa - dask = import_module('dask') # noqa + toolz = import_module("toolz") # noqa + dask = import_module("dask") # noqa import dask.dataframe as dd @@ -52,7 +52,7 @@ def test_dask(df): def test_xarray(df): - xarray = import_module('xarray') # noqa + xarray = import_module("xarray") # noqa assert df.to_xarray() is not None @@ -67,22 +67,23 @@ def test_oo_optimizable(): @pytest.mark.filterwarnings("ignore:can't:ImportWarning") def test_statsmodels(): - statsmodels = import_module('statsmodels') # noqa + statsmodels = import_module("statsmodels") # noqa import statsmodels.api as sm import statsmodels.formula.api as smf + df = sm.datasets.get_rdataset("Guerry", "HistData").data - smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit() + smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit() # Cython import warning @pytest.mark.filterwarnings("ignore:can't:ImportWarning") def test_scikit_learn(df): - sklearn = import_module('sklearn') # noqa + sklearn = import_module("sklearn") # noqa from sklearn import svm, datasets digits = datasets.load_digits() - clf = svm.SVC(gamma=0.001, C=100.) + clf = svm.SVC(gamma=0.001, C=100.0) clf.fit(digits.data[:-1], digits.target[:-1]) clf.predict(digits.data[-1:]) @@ -92,23 +93,22 @@ def test_scikit_learn(df): @pytest.mark.filterwarnings("ignore") def test_seaborn(): - seaborn = import_module('seaborn') + seaborn = import_module("seaborn") tips = seaborn.load_dataset("tips") seaborn.stripplot(x="day", y="total_bill", data=tips) def test_pandas_gbq(df): - pandas_gbq = import_module('pandas_gbq') # noqa + pandas_gbq = import_module("pandas_gbq") # noqa @pytest.mark.xfail(reason="0.7.0 pending") @tm.network def test_pandas_datareader(): - pandas_datareader = import_module('pandas_datareader') # noqa - pandas_datareader.DataReader( - 'F', 'quandl', '2017-01-01', '2017-02-01') + pandas_datareader = import_module("pandas_datareader") # noqa + pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") # importing from pandas, Cython import warning @@ -118,8 +118,8 @@ def test_pandas_datareader(): @pytest.mark.skip(reason="gh-25778: geopandas stack issue") def test_geopandas(): - geopandas = import_module('geopandas') # noqa - fp = geopandas.datasets.get_path('naturalearth_lowres') + geopandas = import_module("geopandas") # noqa + fp = geopandas.datasets.get_path("naturalearth_lowres") assert geopandas.read_file(fp) is not None @@ -127,7 +127,7 @@ def test_geopandas(): @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_pyarrow(df): - pyarrow = import_module('pyarrow') # noqa + pyarrow = import_module("pyarrow") # noqa table = pyarrow.Table.from_pandas(df) result = table.to_pandas() tm.assert_frame_equal(result, df) @@ -142,11 +142,11 @@ def test_missing_required_dependency(): # -E : disable PYTHON* env vars, especially PYTHONPATH # And, that's apparently not enough, so we give up. 
# https://github.com/MacPython/pandas-wheels/pull/50 - call = ['python', '-sSE', '-c', 'import pandas'] + call = ["python", "-sSE", "-c", "import pandas"] with pytest.raises(subprocess.CalledProcessError) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() - for name in ['numpy', 'pytz', 'dateutil']: + for name in ["numpy", "pytz", "dateutil"]: assert name in output diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index 899b985f247d42..531c511e8c02d4 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -6,12 +6,22 @@ @pytest.mark.parametrize( - "exc", ['UnsupportedFunctionCall', 'UnsortedIndexError', - 'OutOfBoundsDatetime', - 'ParserError', 'PerformanceWarning', 'DtypeWarning', - 'EmptyDataError', 'ParserWarning', 'MergeError']) + "exc", + [ + "UnsupportedFunctionCall", + "UnsortedIndexError", + "OutOfBoundsDatetime", + "ParserError", + "PerformanceWarning", + "DtypeWarning", + "EmptyDataError", + "ParserWarning", + "MergeError", + ], +) def test_exception_importable(exc): from pandas import errors + e = getattr(errors, exc) assert e is not None @@ -24,7 +34,7 @@ def test_catch_oob(): from pandas import errors try: - pd.Timestamp('15000101') + pd.Timestamp("15000101") except errors.OutOfBoundsDatetime: pass @@ -48,11 +58,11 @@ def test_error_rename(): class Foo: @classmethod def classmethod(cls): - raise AbstractMethodError(cls, methodtype='classmethod') + raise AbstractMethodError(cls, methodtype="classmethod") @property def property(self): - raise AbstractMethodError(self, methodtype='property') + raise AbstractMethodError(self, methodtype="property") def method(self): raise AbstractMethodError(self) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 1f3f5a251ef174..a7281e002cc5ca 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -9,31 +9,41 @@ from pandas.core.computation import expressions as expr import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.io.formats.printing import pprint_thing -_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') -_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') -_mixed = DataFrame({'A': _frame['A'].copy(), - 'B': _frame['B'].astype('float32'), - 'C': _frame['C'].astype('int64'), - 'D': _frame['D'].astype('int32')}) -_mixed2 = DataFrame({'A': _frame2['A'].copy(), - 'B': _frame2['B'].astype('float32'), - 'C': _frame2['C'].astype('int64'), - 'D': _frame2['D'].astype('int32')}) +_frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") +_frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64") +_mixed = DataFrame( + { + "A": _frame["A"].copy(), + "B": _frame["B"].astype("float32"), + "C": _frame["C"].astype("int64"), + "D": _frame["D"].astype("int32"), + } +) +_mixed2 = DataFrame( + { + "A": _frame2["A"].copy(), + "B": _frame2["B"].astype("float32"), + "C": _frame2["C"].astype("int64"), + "D": _frame2["D"].astype("int32"), + } +) _integer = DataFrame( - np.random.randint(1, 100, - size=(10001, 4)), - columns=list('ABCD'), dtype='int64') -_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), - columns=list('ABCD'), dtype='int64') + np.random.randint(1, 100, size=(10001, 4)), columns=list("ABCD"), dtype="int64" +) +_integer2 = DataFrame( + 
np.random.randint(1, 100, size=(101, 4)), columns=list("ABCD"), dtype="int64" +) -@pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr') +@pytest.mark.skipif(not expr._USE_NUMEXPR, reason="not using numexpr") class TestExpressions: - def setup_method(self, method): self.frame = _frame.copy() @@ -46,15 +56,14 @@ def setup_method(self, method): def teardown_method(self, method): expr._MIN_ELEMENTS = self._MIN_ELEMENTS - def run_arithmetic(self, df, other, assert_func, check_dtype=False, - test_flex=True): + def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=True): expr._MIN_ELEMENTS = 0 - operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv'] + operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"] for arith in operations: operator_name = arith - if arith == 'div': - operator_name = 'truediv' + if arith == "div": + operator_name = "truediv" if test_flex: op = lambda x, y: getattr(df, arith)(y) @@ -68,22 +77,30 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, result = op(df, other) try: if check_dtype: - if arith == 'truediv': - assert expected.dtype.kind == 'f' + if arith == "truediv": + assert expected.dtype.kind == "f" assert_func(expected, result) except Exception: pprint_thing("Failed test with operator %r" % op.__name__) raise def test_integer_arithmetic(self): - self.run_arithmetic(self.integer, self.integer, - assert_frame_equal) - self.run_arithmetic(self.integer.iloc[:, 0], - self.integer.iloc[:, 0], assert_series_equal, - check_dtype=True) - - def run_binary(self, df, other, assert_func, test_flex=False, - numexpr_ops={'gt', 'lt', 'ge', 'le', 'eq', 'ne'}): + self.run_arithmetic(self.integer, self.integer, assert_frame_equal) + self.run_arithmetic( + self.integer.iloc[:, 0], + self.integer.iloc[:, 0], + assert_series_equal, + check_dtype=True, + ) + + def run_binary( + self, + df, + other, + assert_func, + test_flex=False, + numexpr_ops={"gt", "lt", "ge", "le", "eq", "ne"}, + ): """ tests solely that the result is the same whether or not numexpr is enabled. 
Need to test whether the function does the correct thing @@ -91,7 +108,7 @@ def run_binary(self, df, other, assert_func, test_flex=False, """ expr._MIN_ELEMENTS = 0 expr.set_test_mode(True) - operations = ['gt', 'lt', 'ge', 'le', 'eq', 'ne'] + operations = ["gt", "lt", "ge", "le", "eq", "ne"] for arith in operations: if test_flex: @@ -116,27 +133,24 @@ def run_binary(self, df, other, assert_func, test_flex=False, pprint_thing("test_flex was %r" % test_flex) raise - def run_frame(self, df, other, binary_comp=None, run_binary=True, - **kwargs): - self.run_arithmetic(df, other, assert_frame_equal, - test_flex=False, **kwargs) - self.run_arithmetic(df, other, assert_frame_equal, test_flex=True, - **kwargs) + def run_frame(self, df, other, binary_comp=None, run_binary=True, **kwargs): + self.run_arithmetic(df, other, assert_frame_equal, test_flex=False, **kwargs) + self.run_arithmetic(df, other, assert_frame_equal, test_flex=True, **kwargs) if run_binary: if binary_comp is None: expr.set_use_numexpr(False) binary_comp = other + 1 expr.set_use_numexpr(True) - self.run_binary(df, binary_comp, assert_frame_equal, - test_flex=False, **kwargs) - self.run_binary(df, binary_comp, assert_frame_equal, - test_flex=True, **kwargs) + self.run_binary( + df, binary_comp, assert_frame_equal, test_flex=False, **kwargs + ) + self.run_binary( + df, binary_comp, assert_frame_equal, test_flex=True, **kwargs + ) def run_series(self, ser, other, binary_comp=None, **kwargs): - self.run_arithmetic(ser, other, assert_series_equal, - test_flex=False, **kwargs) - self.run_arithmetic(ser, other, assert_almost_equal, - test_flex=True, **kwargs) + self.run_arithmetic(ser, other, assert_series_equal, test_flex=False, **kwargs) + self.run_arithmetic(ser, other, assert_almost_equal, test_flex=True, **kwargs) # series doesn't uses vec_compare instead of numexpr... 
# if binary_comp is None: # binary_comp = other + 1 @@ -169,78 +183,84 @@ def test_mixed_arithmetic_series(self): def test_float_arithemtic(self): self.run_arithmetic(self.frame, self.frame, assert_frame_equal) - self.run_arithmetic(self.frame.iloc[:, 0], self.frame.iloc[:, 0], - assert_series_equal, check_dtype=True) + self.run_arithmetic( + self.frame.iloc[:, 0], + self.frame.iloc[:, 0], + assert_series_equal, + check_dtype=True, + ) def test_mixed_arithmetic(self): self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal) for col in self.mixed.columns: - self.run_arithmetic(self.mixed[col], self.mixed[col], - assert_series_equal) + self.run_arithmetic(self.mixed[col], self.mixed[col], assert_series_equal) def test_integer_with_zeros(self): self.integer *= np.random.randint(0, 2, size=np.shape(self.integer)) - self.run_arithmetic(self.integer, self.integer, - assert_frame_equal) - self.run_arithmetic(self.integer.iloc[:, 0], - self.integer.iloc[:, 0], assert_series_equal) + self.run_arithmetic(self.integer, self.integer, assert_frame_equal) + self.run_arithmetic( + self.integer.iloc[:, 0], self.integer.iloc[:, 0], assert_series_equal + ) def test_invalid(self): # no op - result = expr._can_use_numexpr(operator.add, None, self.frame, - self.frame, 'evaluate') + result = expr._can_use_numexpr( + operator.add, None, self.frame, self.frame, "evaluate" + ) assert not result # mixed - result = expr._can_use_numexpr(operator.add, '+', self.mixed, - self.frame, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.mixed, self.frame, "evaluate" + ) assert not result # min elements - result = expr._can_use_numexpr(operator.add, '+', self.frame2, - self.frame2, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.frame2, self.frame2, "evaluate" + ) assert not result # ok, we only check on first part of expression - result = expr._can_use_numexpr(operator.add, '+', self.frame, - self.frame2, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.frame, self.frame2, "evaluate" + ) assert result def test_binary_ops(self): def testit(): - for f, f2 in [(self.frame, self.frame2), - (self.mixed, self.mixed2)]: + for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]: - for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), - ('div', '/'), ('pow', '**')]: + for op, op_str in [ + ("add", "+"), + ("sub", "-"), + ("mul", "*"), + ("div", "/"), + ("pow", "**"), + ]: - if op == 'pow': + if op == "pow": continue - if op == 'div': - op = getattr(operator, 'truediv', None) + if op == "div": + op = getattr(operator, "truediv", None) else: op = getattr(operator, op, None) if op is not None: - result = expr._can_use_numexpr(op, op_str, f, f, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f, f, "evaluate") assert result != f._is_mixed_type - result = expr.evaluate(op, op_str, f, f, - use_numexpr=True) - expected = expr.evaluate(op, op_str, f, f, - use_numexpr=False) + result = expr.evaluate(op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) else: - tm.assert_numpy_array_equal(result, - expected.values) + tm.assert_numpy_array_equal(result, expected.values) - result = expr._can_use_numexpr(op, op_str, f2, f2, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f2, f2, "evaluate") assert not result expr.set_use_numexpr(False) @@ -253,8 +273,7 @@ def testit(): def test_boolean_ops(self): def testit(): - 
for f, f2 in [(self.frame, self.frame2), - (self.mixed, self.mixed2)]: + for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]: f11 = f f12 = f + 1 @@ -262,26 +281,28 @@ def testit(): f21 = f2 f22 = f2 + 1 - for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='), - ('le', '<='), ('eq', '=='), ('ne', '!=')]: + for op, op_str in [ + ("gt", ">"), + ("lt", "<"), + ("ge", ">="), + ("le", "<="), + ("eq", "=="), + ("ne", "!="), + ]: op = getattr(operator, op) - result = expr._can_use_numexpr(op, op_str, f11, f12, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f11, f12, "evaluate") assert result != f11._is_mixed_type - result = expr.evaluate(op, op_str, f11, f12, - use_numexpr=True) - expected = expr.evaluate(op, op_str, f11, f12, - use_numexpr=False) + result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) else: tm.assert_numpy_array_equal(result, expected.values) - result = expr._can_use_numexpr(op, op_str, f21, f22, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f21, f22, "evaluate") assert not result expr.set_use_numexpr(False) @@ -313,11 +334,10 @@ def testit(): testit() def test_bool_ops_raise_on_arithmetic(self): - df = DataFrame({'a': np.random.rand(10) > 0.5, - 'b': np.random.rand(10) > 0.5}) - names = 'truediv', 'floordiv', 'pow' - ops = '/', '//', '**' - msg = 'operator %r not implemented for bool dtypes' + df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5}) + names = "truediv", "floordiv", "pow" + ops = "/", "//", "**" + msg = "operator %r not implemented for bool dtypes" for op, name in zip(ops, names): f = getattr(operator, name) err_msg = re.escape(msg % op) @@ -342,17 +362,16 @@ def test_bool_ops_raise_on_arithmetic(self): def test_bool_ops_warn_on_arithmetic(self): n = 10 - df = DataFrame({'a': np.random.rand(n) > 0.5, - 'b': np.random.rand(n) > 0.5}) - names = 'add', 'mul', 'sub' - ops = '+', '*', '-' - subs = {'+': '|', '*': '&', '-': '^'} - sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'} + df = DataFrame({"a": np.random.rand(n) > 0.5, "b": np.random.rand(n) > 0.5}) + names = "add", "mul", "sub" + ops = "+", "*", "-" + subs = {"+": "|", "*": "&", "-": "^"} + sub_funcs = {"|": "or_", "&": "and_", "^": "xor"} for op, name in zip(ops, names): f = getattr(operator, name) fe = getattr(operator, sub_funcs[subs[op]]) - if op == '-': + if op == "-": # raises TypeError continue @@ -387,18 +406,28 @@ def test_bool_ops_warn_on_arithmetic(self): e = fe(df, True) tm.assert_frame_equal(r, e) - @pytest.mark.parametrize("test_input,expected", [ - (DataFrame([[0, 1, 2, 'aa'], [0, 1, 2, 'aa']], - columns=['a', 'b', 'c', 'dtype']), - DataFrame([[False, False], [False, False]], - columns=['a', 'dtype'])), - (DataFrame([[0, 3, 2, 'aa'], [0, 4, 2, 'aa'], [0, 1, 1, 'bb']], - columns=['a', 'b', 'c', 'dtype']), - DataFrame([[False, False], [False, False], - [False, False]], columns=['a', 'dtype'])), - ]) + @pytest.mark.parametrize( + "test_input,expected", + [ + ( + DataFrame( + [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"] + ), + DataFrame([[False, False], [False, False]], columns=["a", "dtype"]), + ), + ( + DataFrame( + [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]], + columns=["a", "b", "c", "dtype"], + ), + DataFrame( + [[False, False], [False, False], [False, False]], + columns=["a", "dtype"], + ), + ), + ], + ) def test_bool_ops_column_name_dtype(self, 
test_input, expected): # GH 22383 - .ne fails if columns containing column name 'dtype' - result = test_input.loc[:, ['a', 'dtype']].ne( - test_input.loc[:, ['a', 'dtype']]) + result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]]) assert_frame_equal(result, expected) diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 455981506efca8..e750193abb71ad 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -8,13 +8,14 @@ class TestIndexer: - def test_outer_join_indexer(self): - typemap = [('int32', _join.outer_join_indexer_int32), - ('int64', _join.outer_join_indexer_int64), - ('float32', _join.outer_join_indexer_float32), - ('float64', _join.outer_join_indexer_float64), - ('object', _join.outer_join_indexer_object)] + typemap = [ + ("int32", _join.outer_join_indexer_int32), + ("int64", _join.outer_join_indexer_int64), + ("float32", _join.outer_join_indexer_float32), + ("float64", _join.outer_join_indexer_float64), + ("object", _join.outer_join_indexer_object), + ] for dtype, indexer in typemap: left = np.arange(3, dtype=dtype) @@ -56,12 +57,111 @@ def test_left_join_indexer_unique(): def test_left_outer_join_bug(): - left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) + left = np.array( + [ + 0, + 1, + 0, + 1, + 1, + 2, + 3, + 1, + 0, + 2, + 1, + 2, + 0, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 1, + 1, + 3, + 0, + 3, + 2, + 3, + 0, + 0, + 2, + 3, + 2, + 0, + 3, + 1, + 3, + 0, + 1, + 3, + 0, + 0, + 1, + 0, + 3, + 1, + 0, + 1, + 0, + 1, + 1, + 0, + 2, + 2, + 2, + 2, + 2, + 0, + 3, + 1, + 2, + 0, + 0, + 3, + 1, + 3, + 2, + 2, + 0, + 1, + 3, + 0, + 2, + 3, + 2, + 3, + 3, + 2, + 3, + 3, + 1, + 3, + 2, + 0, + 0, + 3, + 1, + 1, + 1, + 0, + 2, + 3, + 3, + 1, + 2, + 0, + 3, + 1, + 2, + 0, + 2, + ], + dtype=np.int64, + ) right = np.array([3, 1], dtype=np.int64) max_groups = 4 @@ -196,39 +296,55 @@ def test_inner_join_indexer2(): def test_merge_join_categorical_multiindex(): # From issue 16627 - a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], - ['a', 'b', 'c']), - 'Int1': [0, 1, 0, 1, 0, 0]} + a = { + "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), + "Int1": [0, 1, 0, 1, 0, 0], + } a = DataFrame(a) - b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - ['a', 'b', 'c']), - 'Int': [0, 0, 0, 1, 1, 1], - 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} - b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - - expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - result = a.join(b, on=['Cat1', 'Int1']) - expected = expected.drop(['Cat', 'Int'], axis=1) + b = { + "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + result = a.join(b, on=["Cat1", "Int1"]) + expected = expected.drop(["Cat", "Int"], axis=1) assert_frame_equal(expected, result) # Same test, but with ordered categorical - a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], - ['b', 'a', 'c'], - ordered=True), - 'Int1': [0, 1, 0, 1, 0, 0]} + a = { + 
"Cat1": Categorical( + ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True + ), + "Int1": [0, 1, 0, 1, 0, 0], + } a = DataFrame(a) - b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - ['b', 'a', 'c'], - ordered=True), - 'Int': [0, 0, 0, 1, 1, 1], - 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} - b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - - expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - result = a.join(b, on=['Cat1', 'Int1']) - expected = expected.drop(['Cat', 'Int'], axis=1) + b = { + "Cat": Categorical( + ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True + ), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + result = a.join(b, on=["Cat1", "Int1"]) + expected = expected.drop(["Cat", "Int"], axis=1) assert_frame_equal(expected, result) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 66b0d8869940a5..77841f0bb9f0d2 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -8,40 +8,38 @@ class TestMisc: - def test_max_len_string_array(self): - arr = a = np.array(['foo', 'b', np.nan], dtype='object') + arr = a = np.array(["foo", "b", np.nan], dtype="object") assert libwriters.max_len_string_array(arr) == 3 # unicode - arr = a.astype('U').astype(object) + arr = a.astype("U").astype(object) assert libwriters.max_len_string_array(arr) == 3 # bytes for python3 - arr = a.astype('S').astype(object) + arr = a.astype("S").astype(object) assert libwriters.max_len_string_array(arr) == 3 # raises with pytest.raises(TypeError): - libwriters.max_len_string_array(arr.astype('U')) + libwriters.max_len_string_array(arr.astype("U")) def test_fast_unique_multiple_list_gen_sort(self): - keys = [['p', 'a'], ['n', 'd'], ['a', 's']] + keys = [["p", "a"], ["n", "d"], ["a", "s"]] gen = (key for key in keys) - expected = np.array(['a', 'd', 'n', 'p', 's']) + expected = np.array(["a", "d", "n", "p", "s"]) out = lib.fast_unique_multiple_list_gen(gen, sort=True) tm.assert_numpy_array_equal(np.array(out), expected) gen = (key for key in keys) - expected = np.array(['p', 'a', 'n', 'd', 's']) + expected = np.array(["p", "a", "n", "d", "s"]) out = lib.fast_unique_multiple_list_gen(gen, sort=False) tm.assert_numpy_array_equal(np.array(out), expected) class TestIndexing: - def test_maybe_indices_to_slice_left_edge(self): target = np.arange(100) @@ -58,20 +56,17 @@ def test_maybe_indices_to_slice_left_edge(self): maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice - for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], - [2, 0, -2]]: + for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]: indices = np.array(case, dtype=np.int64) maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) @@ -89,16 +84,14 @@ def test_maybe_indices_to_slice_right_edge(self): maybe_slice = 
lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice indices = np.array([97, 98, 99, 100], dtype=np.int64) @@ -165,16 +158,14 @@ def test_maybe_indices_to_slice_middle(self): maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice for case in [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index aa9c9bb05f8774..a76f2bb04a5420 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -16,28 +16,44 @@ from pandas.core.index import Index, MultiIndex import pandas.util.testing as tm -AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', - 'std', 'var', 'sem'] +AGG_FUNCTIONS = [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "mad", + "std", + "var", + "sem", +] class Base: - def setup_method(self, method): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - - self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + self.frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + + self.single_level = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) # create test series object - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) @@ -45,32 +61,31 @@ def setup_method(self, method): self.series = s self.tdf = tm.makeTimeDataFrame(100) - self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]).sum() + self.ymd = self.tdf.groupby( + [lambda x: x.year, lambda x: x.month, lambda x: x.day] + ).sum() # use Int64Index, to make sure things work - self.ymd.index.set_levels([lev.astype('i8') - for lev in self.ymd.index.levels], - inplace=True) - self.ymd.index.set_names(['year', 'month', 'day'], inplace=True) + 
self.ymd.index.set_levels( + [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True + ) + self.ymd.index.set_names(["year", "month", "day"], inplace=True) class TestMultiLevel(Base): - def test_append(self): a, b = self.frame[:5], self.frame[5:] result = a.append(b) tm.assert_frame_equal(result, self.frame) - result = a['A'].append(b['A']) - tm.assert_series_equal(result, self.frame['A']) + result = a["A"].append(b["A"]) + tm.assert_series_equal(result, self.frame["A"]) def test_append_index(self): idx1 = Index([1.1, 1.2, 1.3]) - idx2 = pd.date_range('2011-01-01', freq='D', periods=3, - tz='Asia/Tokyo') - idx3 = Index(['A', 'B', 'C']) + idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo") + idx3 = Index(["A", "B", "C"]) midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) @@ -78,10 +93,12 @@ def test_append_index(self): result = idx1.append(midx_lv2) # see gh-7112 - tz = pytz.timezone('Asia/Tokyo') - expected_tuples = [(1.1, tz.localize(datetime.datetime(2011, 1, 1))), - (1.2, tz.localize(datetime.datetime(2011, 1, 2))), - (1.3, tz.localize(datetime.datetime(2011, 1, 3)))] + tz = pytz.timezone("Asia/Tokyo") + expected_tuples = [ + (1.1, tz.localize(datetime.datetime(2011, 1, 1))), + (1.2, tz.localize(datetime.datetime(2011, 1, 2))), + (1.3, tz.localize(datetime.datetime(2011, 1, 3))), + ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -90,8 +107,7 @@ def test_append_index(self): tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv2) - expected = MultiIndex.from_arrays([idx1.append(idx1), - idx2.append(idx2)]) + expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv3) @@ -99,78 +115,84 @@ def test_append_index(self): result = midx_lv3.append(midx_lv2) expected = Index._simple_new( - np.array([(1.1, tz.localize(datetime.datetime(2011, 1, 1)), 'A'), - (1.2, tz.localize(datetime.datetime(2011, 1, 2)), 'B'), - (1.3, tz.localize(datetime.datetime(2011, 1, 3)), 'C')] + - expected_tuples), None) + np.array( + [ + (1.1, tz.localize(datetime.datetime(2011, 1, 1)), "A"), + (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"), + (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"), + ] + + expected_tuples + ), + None, + ) tm.assert_index_equal(result, expected) def test_dataframe_constructor(self): - multi = DataFrame(np.random.randn(4, 4), - index=[np.array(['a', 'a', 'b', 'b']), - np.array(['x', 'y', 'x', 'y'])]) + multi = DataFrame( + np.random.randn(4, 4), + index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], + ) assert isinstance(multi.index, MultiIndex) assert not isinstance(multi.columns, MultiIndex) - multi = DataFrame(np.random.randn(4, 4), - columns=[['a', 'a', 'b', 'b'], - ['x', 'y', 'x', 'y']]) + multi = DataFrame( + np.random.randn(4, 4), columns=[["a", "a", "b", "b"], ["x", "y", "x", "y"]] + ) assert isinstance(multi.columns, MultiIndex) def test_series_constructor(self): - multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), np.array( - ['x', 'y', 'x', 'y'])]) + multi = Series( + 1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])] + ) assert isinstance(multi.index, MultiIndex) - multi = Series(1., index=[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + multi = Series(1.0, index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) assert isinstance(multi.index, MultiIndex) - multi = Series(range(4), 
index=[['a', 'a', 'b', 'b'], - ['x', 'y', 'x', 'y']]) + multi = Series(range(4), index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) assert isinstance(multi.index, MultiIndex) def test_reindex_level(self): # axis=0 - month_sums = self.ymd.sum(level='month') + month_sums = self.ymd.sum(level="month") result = month_sums.reindex(self.ymd.index, level=1) - expected = self.ymd.groupby(level='month').transform(np.sum) + expected = self.ymd.groupby(level="month").transform(np.sum) tm.assert_frame_equal(result, expected) # Series - result = month_sums['A'].reindex(self.ymd.index, level=1) - expected = self.ymd['A'].groupby(level='month').transform(np.sum) + result = month_sums["A"].reindex(self.ymd.index, level=1) + expected = self.ymd["A"].groupby(level="month").transform(np.sum) tm.assert_series_equal(result, expected, check_names=False) # axis=1 - month_sums = self.ymd.T.sum(axis=1, level='month') + month_sums = self.ymd.T.sum(axis=1, level="month") result = month_sums.reindex(columns=self.ymd.index, level=1) - expected = self.ymd.groupby(level='month').transform(np.sum).T + expected = self.ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) def test_binops_level(self): def _check_op(opname): op = getattr(DataFrame, opname) - month_sums = self.ymd.sum(level='month') - result = op(self.ymd, month_sums, level='month') + month_sums = self.ymd.sum(level="month") + result = op(self.ymd, month_sums, level="month") - broadcasted = self.ymd.groupby(level='month').transform(np.sum) + broadcasted = self.ymd.groupby(level="month").transform(np.sum) expected = op(self.ymd, broadcasted) tm.assert_frame_equal(result, expected) # Series op = getattr(Series, opname) - result = op(self.ymd['A'], month_sums['A'], level='month') - broadcasted = self.ymd['A'].groupby(level='month').transform( - np.sum) - expected = op(self.ymd['A'], broadcasted) - expected.name = 'A' + result = op(self.ymd["A"], month_sums["A"], level="month") + broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum) + expected = op(self.ymd["A"], broadcasted) + expected.name = "A" tm.assert_series_equal(result, expected) - _check_op('sub') - _check_op('add') - _check_op('mul') - _check_op('div') + _check_op("sub") + _check_op("add") + _check_op("mul") + _check_op("div") def test_pickle(self): def _test_roundtrip(frame): @@ -184,12 +206,12 @@ def _test_roundtrip(frame): def test_reindex(self): expected = self.frame.iloc[[0, 3]] - reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]] + reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] + reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): @@ -225,24 +247,22 @@ def test_repr_to_string(self): self.ymd.T.to_string(buf=buf) def test_repr_name_coincide(self): - index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')], - names=['a', 'b', 'c']) + index = MultiIndex.from_tuples( + [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"] + ) - df = DataFrame({'value': [0, 1]}, index=index) + df = DataFrame({"value": [0, 1]}, index=index) - lines = repr(df).split('\n') - assert lines[2].startswith('a 0 foo') + lines = repr(df).split("\n") + assert lines[2].startswith("a 0 foo") def test_delevel_infer_dtype(self): - tuples = [tuple - for tuple in product( - ['foo', 
'bar'], [10, 20], [1.0, 1.1])] - index = MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2']) - df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], - index=index) + tuples = [tuple for tuple in product(["foo", "bar"], [10, 20], [1.0, 1.1])] + index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) + df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() - assert is_integer_dtype(deleveled['prm1']) - assert is_float_dtype(deleveled['prm2']) + assert is_integer_dtype(deleveled["prm1"]) + assert is_float_dtype(deleveled["prm2"]) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) @@ -264,7 +284,7 @@ def _check_counts(frame, axis=0): for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count() - expected = expected.reindex_like(result).astype('i8') + expected = expected.reindex_like(result).astype("i8") tm.assert_frame_equal(result, expected) self.frame.iloc[1, [1, 2]] = np.nan @@ -279,40 +299,46 @@ def _check_counts(frame, axis=0): # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() - with pytest.raises(TypeError, match='hierarchical'): + with pytest.raises(TypeError, match="hierarchical"): df.count(level=0) - self.frame['D'] = 'foo' + self.frame["D"] = "foo" result = self.frame.count(level=0, numeric_only=True) - tm.assert_index_equal(result.columns, Index(list('ABC'), name='exp')) + tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) def test_count_level_series(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz'], ['one', 'two', - 'three', 'four']], - codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]) + index = MultiIndex( + levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], + codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], + ) s = Series(np.random.randn(len(index)), index=index) result = s.count(level=0) expected = s.groupby(level=0).count() tm.assert_series_equal( - result.astype('f8'), expected.reindex(result.index).fillna(0)) + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) result = s.count(level=1) expected = s.groupby(level=1).count() tm.assert_series_equal( - result.astype('f8'), expected.reindex(result.index).fillna(0)) + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) def test_count_level_corner(self): - s = self.frame['A'][:0] + s = self.frame["A"][:0] result = s.count(level=0) - expected = Series(0, index=s.index.levels[0], name='A') + expected = Series(0, index=s.index.levels[0], name="A") tm.assert_series_equal(result, expected) df = self.frame[:0] result = df.count(level=0) - expected = DataFrame(index=s.index.levels[0], - columns=df.columns).fillna(0).astype(np.int64) + expected = ( + DataFrame(index=s.index.levels[0], columns=df.columns) + .fillna(0) + .astype(np.int64) + ) tm.assert_frame_equal(result, expected) def test_get_level_number_out_of_bounds(self): @@ -333,13 +359,14 @@ def test_unstack(self): self.ymd.astype(np.int32).unstack() def test_unstack_multiple_no_empty_columns(self): - index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), ( - 1, 'baz', 1), (1, 'qux', 1)]) + index = MultiIndex.from_tuples( + [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)] + ) s = Series(np.random.randn(4), index=index) unstacked = s.unstack([1, 2]) - expected = unstacked.dropna(axis=1, how='all') + expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) def test_stack(self): @@ 
-402,33 +429,39 @@ def check(left, right): li, ri = left.index, right.index tm.assert_index_equal(li, ri) - df = DataFrame(np.arange(12).reshape(4, 3), - index=list('abab'), - columns=['1st', '2nd', '3rd']) + df = DataFrame( + np.arange(12).reshape(4, 3), + index=list("abab"), + columns=["1st", "2nd", "3rd"], + ) - mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']], - codes=[np.tile( - np.arange(2).repeat(3), 2), np.tile( - np.arange(3), 4)]) + mi = MultiIndex( + levels=[["a", "b"], ["1st", "2nd", "3rd"]], + codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) - df.columns = ['1st', '2nd', '1st'] - mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], codes=[np.tile( - np.arange(2).repeat(3), 2), np.tile( - [0, 1, 0], 4)]) + df.columns = ["1st", "2nd", "1st"] + mi = MultiIndex( + levels=[["a", "b"], ["1st", "2nd"]], + codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) - tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2) + tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2) df.index = MultiIndex.from_tuples(tpls) - mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']], - codes=[np.tile( - np.arange(2).repeat(3), 2), np.repeat( - [1, 0, 1], [3, 6, 3]), np.tile( - [0, 1, 0], 4)]) + mi = MultiIndex( + levels=[["a", "b"], [1, 2], ["1st", "2nd"]], + codes=[ + np.tile(np.arange(2).repeat(3), 2), + np.repeat([1, 0, 1], [3, 6, 3]), + np.tile([0, 1, 0], 4), + ], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) @@ -447,7 +480,7 @@ def test_unstack_odd_failure(self): Thur,Lunch,No,117.32,44 Thur,Lunch,Yes,51.51,17""" - df = pd.read_csv(StringIO(data)).set_index(['day', 'time', 'smoker']) + df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"]) # it works, #2100 result = df.unstack(2) @@ -457,65 +490,67 @@ def test_unstack_odd_failure(self): def test_stack_mixed_dtype(self): df = self.frame.T - df['foo', 'four'] = 'foo' + df["foo", "four"] = "foo" df = df.sort_index(level=1, axis=1) stacked = df.stack() - result = df['foo'].stack().sort_index() - tm.assert_series_equal(stacked['foo'], result, check_names=False) + result = df["foo"].stack().sort_index() + tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None - assert stacked['bar'].dtype == np.float_ + assert stacked["bar"].dtype == np.float_ def test_unstack_bug(self): - df = DataFrame({'state': ['naive', 'naive', 'naive', 'activ', 'activ', - 'activ'], - 'exp': ['a', 'b', 'b', 'b', 'a', 'a'], - 'barcode': [1, 2, 3, 4, 1, 3], - 'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'], - 'extra': np.arange(6.)}) + df = DataFrame( + { + "state": ["naive", "naive", "naive", "activ", "activ", "activ"], + "exp": ["a", "b", "b", "b", "a", "a"], + "barcode": [1, 2, 3, 4, 1, 3], + "v": ["hi", "hi", "bye", "bye", "bye", "peace"], + "extra": np.arange(6.0), + } + ) - result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len) + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack() - tm.assert_series_equal( - restacked, result.reindex(restacked.index).astype(float)) + tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() - assert unstacked.index.name == 'first' - assert unstacked.columns.names == ['exp', 'second'] + 
assert unstacked.index.name == "first" + assert unstacked.columns.names == ["exp", "second"] restacked = unstacked.stack() assert restacked.index.names == self.frame.index.names def test_unstack_level_name(self): - result = self.frame.unstack('second') + result = self.frame.unstack("second") expected = self.frame.unstack(level=1) tm.assert_frame_equal(result, expected) def test_stack_level_name(self): - unstacked = self.frame.unstack('second') - result = unstacked.stack('exp') + unstacked = self.frame.unstack("second") + result = unstacked.stack("exp") expected = self.frame.unstack().stack(0) tm.assert_frame_equal(result, expected) - result = self.frame.stack('exp') + result = self.frame.stack("exp") expected = self.frame.stack() tm.assert_series_equal(result, expected) def test_stack_unstack_multiple(self): - unstacked = self.ymd.unstack(['year', 'month']) - expected = self.ymd.unstack('year').unstack('month') + unstacked = self.ymd.unstack(["year", "month"]) + expected = self.ymd.unstack("year").unstack("month") tm.assert_frame_equal(unstacked, expected) assert unstacked.columns.names == expected.columns.names # series - s = self.ymd['A'] - s_unstacked = s.unstack(['year', 'month']) - tm.assert_frame_equal(s_unstacked, expected['A']) + s = self.ymd["A"] + s_unstacked = s.unstack(["year", "month"]) + tm.assert_frame_equal(s_unstacked, expected["A"]) - restacked = unstacked.stack(['year', 'month']) + restacked = unstacked.stack(["year", "month"]) restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sort_index(level=0) @@ -524,23 +559,23 @@ def test_stack_unstack_multiple(self): # GH #451 unstacked = self.ymd.unstack([1, 2]) - expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all') + expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) unstacked = self.ymd.unstack([2, 1]) - expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') + expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers(self): - unstacked = self.ymd.unstack(['year', 'month']) + unstacked = self.ymd.unstack(["year", "month"]) # Can't use mixture of names and numbers to stack with pytest.raises(ValueError, match="level should contain"): - unstacked.stack([0, 'month']) + unstacked.stack([0, "month"]) def test_stack_multiple_out_of_bounds(self): # nlevels == 3 - unstacked = self.ymd.unstack(['year', 'month']) + unstacked = self.ymd.unstack(["year", "month"]) with pytest.raises(IndexError, match="Too many levels"): unstacked.stack([2, 3]) @@ -549,9 +584,12 @@ def test_stack_multiple_out_of_bounds(self): def test_unstack_period_series(self): # GH 4342 - idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', - '2013-03', '2013-03'], freq='M', name='period') - idx2 = Index(['A', 'B'] * 3, name='str') + idx1 = pd.PeriodIndex( + ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], + freq="M", + name="period", + ) + idx2 = Index(["A", "B"] * 3, name="str") value = [1, 2, 3, 4, 5, 6] idx = MultiIndex.from_arrays([idx1, idx2]) @@ -562,20 +600,28 @@ def test_unstack_period_series(self): result3 = s.unstack(level=0) e_idx = pd.PeriodIndex( - ['2013-01', '2013-02', '2013-03'], freq='M', name='period') - expected = DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=e_idx, - columns=['A', 'B']) - expected.columns.name = 'str' + ["2013-01", "2013-02", "2013-03"], freq="M", name="period" + ) + expected = 
DataFrame( + {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"] + ) + expected.columns.name = "str" tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected.T) - idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', - '2013-03', '2013-03'], freq='M', name='period1') - - idx2 = pd.PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', - '2013-08', '2013-07'], freq='M', name='period2') + idx1 = pd.PeriodIndex( + ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], + freq="M", + name="period1", + ) + + idx2 = pd.PeriodIndex( + ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"], + freq="M", + name="period2", + ) idx = MultiIndex.from_arrays([idx1, idx2]) s = Series(value, index=idx) @@ -584,14 +630,22 @@ def test_unstack_period_series(self): result3 = s.unstack(level=0) e_idx = pd.PeriodIndex( - ['2013-01', '2013-02', '2013-03'], freq='M', name='period1') - e_cols = pd.PeriodIndex(['2013-07', '2013-08', '2013-09', '2013-10', - '2013-11', '2013-12'], - freq='M', name='period2') - expected = DataFrame([[np.nan, np.nan, np.nan, np.nan, 2, 1], - [np.nan, np.nan, 4, 3, np.nan, np.nan], - [6, 5, np.nan, np.nan, np.nan, np.nan]], - index=e_idx, columns=e_cols) + ["2013-01", "2013-02", "2013-03"], freq="M", name="period1" + ) + e_cols = pd.PeriodIndex( + ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"], + freq="M", + name="period2", + ) + expected = DataFrame( + [ + [np.nan, np.nan, np.nan, np.nan, 2, 1], + [np.nan, np.nan, 4, 3, np.nan, np.nan], + [6, 5, np.nan, np.nan, np.nan, np.nan], + ], + index=e_idx, + columns=e_cols, + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @@ -599,13 +653,17 @@ def test_unstack_period_series(self): def test_unstack_period_frame(self): # GH 4342 - idx1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-02', '2014-02', - '2014-01', '2014-01'], - freq='M', name='period1') - idx2 = pd.PeriodIndex(['2013-12', '2013-12', '2014-02', '2013-10', - '2013-10', '2014-02'], - freq='M', name='period2') - value = {'A': [1, 2, 3, 4, 5, 6], 'B': [6, 5, 4, 3, 2, 1]} + idx1 = pd.PeriodIndex( + ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"], + freq="M", + name="period1", + ) + idx2 = pd.PeriodIndex( + ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"], + freq="M", + name="period2", + ) + value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]} idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame(value, index=idx) @@ -613,48 +671,55 @@ def test_unstack_period_frame(self): result2 = df.unstack(level=1) result3 = df.unstack(level=0) - e_1 = pd.PeriodIndex(['2014-01', '2014-02'], freq='M', name='period1') - e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02', '2013-10', - '2013-12', '2014-02'], freq='M', name='period2') - e_cols = MultiIndex.from_arrays(['A A A B B B'.split(), e_2]) - expected = DataFrame([[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], - index=e_1, columns=e_cols) + e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1") + e_2 = pd.PeriodIndex( + ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"], + freq="M", + name="period2", + ) + e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2]) + expected = DataFrame( + [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) - e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01', - 
'2014-02'], freq='M', name='period1') + e_1 = pd.PeriodIndex( + ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1" + ) e_2 = pd.PeriodIndex( - ['2013-10', '2013-12', '2014-02'], freq='M', name='period2') - e_cols = MultiIndex.from_arrays(['A A B B'.split(), e_1]) - expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], - index=e_2, columns=e_cols) + ["2013-10", "2013-12", "2014-02"], freq="M", name="period2" + ) + e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1]) + expected = DataFrame( + [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols + ) tm.assert_frame_equal(result3, expected) def test_stack_multiple_bug(self): """ bug when some uniques are not present in the data #3170""" id_col = ([1] * 3) + ([2] * 3) - name = (['a'] * 3) + (['b'] * 3) - date = pd.to_datetime(['2013-01-03', '2013-01-04', '2013-01-05'] * 2) + name = (["a"] * 3) + (["b"] * 3) + date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2) var1 = np.random.randint(0, 100, 6) df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1)) - multi = df.set_index(['DATE', 'ID']) - multi.columns.name = 'Params' - unst = multi.unstack('ID') - down = unst.resample('W-THU').mean() + multi = df.set_index(["DATE", "ID"]) + multi.columns.name = "Params" + unst = multi.unstack("ID") + down = unst.resample("W-THU").mean() - rs = down.stack('ID') - xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID') - xp.columns.name = 'Params' + rs = down.stack("ID") + xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") + xp.columns.name = "Params" tm.assert_frame_equal(rs, xp) def test_stack_dropna(self): # GH #3997 - df = DataFrame({'A': ['a1', 'a2'], 'B': ['b1', 'b2'], 'C': [1, 1]}) - df = df.set_index(['A', 'B']) + df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) + df = df.set_index(["A", "B"]) stacked = df.unstack().stack(dropna=False) assert len(stacked) > len(stacked.dropna()) @@ -663,19 +728,23 @@ def test_stack_dropna(self): tm.assert_frame_equal(stacked, stacked.dropna()) def test_unstack_multiple_hierarchical(self): - df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1 - ]], - columns=[[0, 0, 1, 1], [0, 1, 0, 1]]) + df = DataFrame( + index=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + columns=[[0, 0, 1, 1], [0, 1, 0, 1]], + ) - df.index.names = ['a', 'b', 'c'] - df.columns.names = ['d', 'e'] + df.index.names = ["a", "b", "c"] + df.columns.names = ["d", "e"] # it works! 
- df.unstack(['b', 'c']) + df.unstack(["b", "c"]) def test_groupby_transform(self): - s = self.frame['A'] + s = self.frame["A"] grouper = s.index.get_level_values(0) grouped = s.groupby(grouper) @@ -690,17 +759,21 @@ def test_unstack_sparse_keyspace(self): # Generate Long File & Test Pivot NUM_ROWS = 1000 - df = DataFrame({'A': np.random.randint(100, size=NUM_ROWS), - 'B': np.random.randint(300, size=NUM_ROWS), - 'C': np.random.randint(-7, 7, size=NUM_ROWS), - 'D': np.random.randint(-19, 19, size=NUM_ROWS), - 'E': np.random.randint(3000, size=NUM_ROWS), - 'F': np.random.randn(NUM_ROWS)}) - - idf = df.set_index(['A', 'B', 'C', 'D', 'E']) + df = DataFrame( + { + "A": np.random.randint(100, size=NUM_ROWS), + "B": np.random.randint(300, size=NUM_ROWS), + "C": np.random.randint(-7, 7, size=NUM_ROWS), + "D": np.random.randint(-19, 19, size=NUM_ROWS), + "E": np.random.randint(3000, size=NUM_ROWS), + "F": np.random.randn(NUM_ROWS), + } + ) + + idf = df.set_index(["A", "B", "C", "D", "E"]) # it works! is sufficient - idf.unstack('E') + idf.unstack("E") def test_unstack_unobserved_keys(self): # related to #2278 refactoring @@ -720,69 +793,79 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame(np.random.randn(2 ** 16, 2), - index=[np.arange(2 ** 16), np.arange(2 ** 16)]) - with pytest.raises(ValueError, match='int32 overflow'): + df = DataFrame( + np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)] + ) + with pytest.raises(ValueError, match="int32 overflow"): df.unstack() def test_stack_order_with_unsorted_levels(self): # GH 16323 def manual_compare_stacked(df, df_stacked, lev0, lev1): - assert all(df.loc[row, col] == - df_stacked.loc[(row, col[lev0]), col[lev1]] - for row in df.index for col in df.columns) + assert all( + df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]] + for row in df.index + for col in df.columns + ) # deep check for 1-row case for width in [2, 3]: levels_poss = itertools.product( - itertools.permutations([0, 1, 2], width), - repeat=2) + itertools.permutations([0, 1, 2], width), repeat=2 + ) for levels in levels_poss: - columns = MultiIndex(levels=levels, - codes=[[0, 0, 1, 1], - [0, 1, 0, 1]]) + columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) for stack_lev in range(2): df_stacked = df.stack(stack_lev) - manual_compare_stacked(df, df_stacked, - stack_lev, 1 - stack_lev) + manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev) # check multi-row case - mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]], - codes=[np.repeat(range(3), 3), np.tile(range(3), 3)]) - df = DataFrame(columns=mi, index=range(5), - data=np.arange(5 * len(mi)).reshape(5, -1)) + mi = MultiIndex( + levels=[["A", "C", "B"], ["B", "A", "C"]], + codes=[np.repeat(range(3), 3), np.tile(range(3), 3)], + ) + df = DataFrame( + columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1) + ) manual_compare_stacked(df, df.stack(0), 0, 1) def test_groupby_corner(self): - midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], - codes=[[0], [0], [0]], - names=['one', 'two', 'three']) - df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'], - index=midx) + midx = MultiIndex( + levels=[["foo"], ["bar"], ["baz"]], + codes=[[0], [0], [0]], + names=["one", "two", "three"], + ) + df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx) # should work - 
df.groupby(level='three') + df.groupby(level="three") def test_groupby_level_no_obs(self): # #1697 - midx = MultiIndex.from_tuples([('f1', 's1'), ('f1', 's2'), ( - 'f2', 's1'), ('f2', 's2'), ('f3', 's1'), ('f3', 's2')]) - df = DataFrame( - [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) - df1 = df.loc(axis=1)[df.columns.map( - lambda u: u[0] in ['f2', 'f3'])] + midx = MultiIndex.from_tuples( + [ + ("f1", "s1"), + ("f1", "s2"), + ("f2", "s1"), + ("f2", "s2"), + ("f3", "s1"), + ("f3", "s2"), + ] + ) + df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) + df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])] grouped = df1.groupby(axis=1, level=0) result = grouped.sum() - assert (result.columns == ['f2', 'f3']).all() + assert (result.columns == ["f2", "f3"]).all() def test_join(self): - a = self.frame.loc[self.frame.index[:5], ['A']] - b = self.frame.loc[self.frame.index[2:], ['B', 'C']] + a = self.frame.loc[self.frame.index[:5], ["A"]] + b = self.frame.loc[self.frame.index[2:], ["B", "C"]] - joined = a.join(b, how='outer').reindex(self.frame.index) + joined = a.join(b, how="outer").reindex(self.frame.index) expected = self.frame.copy() expected.values[np.isnan(joined.values)] = np.nan @@ -792,10 +875,10 @@ def test_join(self): tm.assert_frame_equal(joined, expected, check_names=False) def test_swaplevel(self): - swapped = self.frame['A'].swaplevel() - swapped2 = self.frame['A'].swaplevel(0) - swapped3 = self.frame['A'].swaplevel(0, 1) - swapped4 = self.frame['A'].swaplevel('first', 'second') + swapped = self.frame["A"].swaplevel() + swapped2 = self.frame["A"].swaplevel(0) + swapped3 = self.frame["A"].swaplevel(0, 1) + swapped4 = self.frame["A"].swaplevel("first", "second") assert not swapped.index.equals(self.frame.index) tm.assert_series_equal(swapped, swapped2) tm.assert_series_equal(swapped, swapped3) @@ -804,34 +887,34 @@ def test_swaplevel(self): back = swapped.swaplevel() back2 = swapped.swaplevel(0) back3 = swapped.swaplevel(0, 1) - back4 = swapped.swaplevel('second', 'first') + back4 = swapped.swaplevel("second", "first") assert back.index.equals(self.frame.index) tm.assert_series_equal(back, back2) tm.assert_series_equal(back, back3) tm.assert_series_equal(back, back4) ft = self.frame.T - swapped = ft.swaplevel('first', 'second', axis=1) - exp = self.frame.swaplevel('first', 'second').T + swapped = ft.swaplevel("first", "second", axis=1) + exp = self.frame.swaplevel("first", "second").T tm.assert_frame_equal(swapped, exp) def test_reorder_levels(self): - result = self.ymd.reorder_levels(['month', 'day', 'year']) + result = self.ymd.reorder_levels(["month", "day", "year"]) expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) tm.assert_frame_equal(result, expected) - result = self.ymd['A'].reorder_levels(['month', 'day', 'year']) - expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2) + result = self.ymd["A"].reorder_levels(["month", "day", "year"]) + expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2) tm.assert_series_equal(result, expected) - result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1) + result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1) expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError, match='hierarchical axis'): + with pytest.raises(TypeError, match="hierarchical axis"): self.ymd.reorder_levels([1, 2], axis=1) - with pytest.raises(IndexError, match='Too many levels'): + with 
pytest.raises(IndexError, match="Too many levels"): self.ymd.index.reorder_levels([1, 2, 3]) def test_insert_index(self): @@ -841,11 +924,13 @@ def test_insert_index(self): assert (df[2000, 1, 10] == df[2000, 1, 7]).all() def test_alignment(self): - x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ( - "A", 2), ("B", 3)])) + x = Series( + data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]) + ) - y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ( - "Z", 2), ("B", 3)])) + y = Series( + data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]) + ) res = x - y exp_index = x.index.union(y.index) @@ -860,39 +945,39 @@ def test_alignment(self): def test_count(self): frame = self.frame.copy() - frame.index.names = ['a', 'b'] + frame.index.names = ["a", "b"] - result = frame.count(level='b') + result = frame.count(level="b") expect = self.frame.count(level=1) tm.assert_frame_equal(result, expect, check_names=False) - result = frame.count(level='a') + result = frame.count(level="a") expect = self.frame.count(level=0) tm.assert_frame_equal(result, expect, check_names=False) series = self.series.copy() - series.index.names = ['a', 'b'] + series.index.names = ["a", "b"] - result = series.count(level='b') + result = series.count(level="b") expect = self.series.count(level=1) tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == 'b' + assert result.index.name == "b" - result = series.count(level='a') + result = series.count(level="a") expect = self.series.count(level=0) tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == 'a' + assert result.index.name == "a" msg = "Level x not found" with pytest.raises(KeyError, match=msg): - series.count('x') + series.count("x") with pytest.raises(KeyError, match=msg): - frame.count(level='x') + frame.count(level="x") - @pytest.mark.parametrize('op', AGG_FUNCTIONS) - @pytest.mark.parametrize('level', [0, 1]) - @pytest.mark.parametrize('skipna', [True, False]) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("op", AGG_FUNCTIONS) + @pytest.mark.parametrize("level", [0, 1]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_series_group_min_max(self, op, level, skipna, sort): # GH 17537 grouped = self.series.groupby(level=level, sort=sort) @@ -903,11 +988,11 @@ def test_series_group_min_max(self, op, level, skipna, sort): rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) - @pytest.mark.parametrize('op', AGG_FUNCTIONS) - @pytest.mark.parametrize('level', [0, 1]) - @pytest.mark.parametrize('axis', [0, 1]) - @pytest.mark.parametrize('skipna', [True, False]) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("op", AGG_FUNCTIONS) + @pytest.mark.parametrize("level", [0, 1]) + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_frame_group_ops(self, op, level, axis, skipna, sort): # GH 17537 self.frame.iloc[1, [1, 2]] = np.nan @@ -927,8 +1012,7 @@ def aggf(x): return getattr(x, op)(skipna=skipna, axis=axis) leftside = grouped.agg(aggf) - rightside = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) + rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) if sort: rightside = rightside.sort_index(level=level, axis=axis) frame = frame.sort_index(level=level, 
axis=axis) @@ -950,25 +1034,28 @@ def test_stat_op_corner(self): def test_frame_any_all_group(self): df = DataFrame( - {'data': [False, False, True, False, True, False, True]}, + {"data": [False, False, True, False, True, False, True]}, index=[ - ['one', 'one', 'two', 'one', 'two', 'two', 'two'], - [0, 1, 0, 2, 1, 2, 3]]) + ["one", "one", "two", "one", "two", "two", "two"], + [0, 1, 0, 2, 1, 2, 3], + ], + ) result = df.any(level=0) - ex = DataFrame({'data': [False, True]}, index=['one', 'two']) + ex = DataFrame({"data": [False, True]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) result = df.all(level=0) - ex = DataFrame({'data': [False, False]}, index=['one', 'two']) + ex = DataFrame({"data": [False, False]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) def test_std_var_pass_ddof(self): - index = MultiIndex.from_arrays([np.arange(5).repeat(10), np.tile( - np.arange(10), 5)]) + index = MultiIndex.from_arrays( + [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] + ) df = DataFrame(np.random.randn(len(index), 5), index=index) - for meth in ['var', 'std']: + for meth in ["var", "std"]: ddof = 4 alt = lambda x: getattr(x, meth)(ddof=ddof) @@ -981,12 +1068,12 @@ def test_std_var_pass_ddof(self): tm.assert_frame_equal(result, expected) def test_frame_series_agg_multiple_levels(self): - result = self.ymd.sum(level=['year', 'month']) - expected = self.ymd.groupby(level=['year', 'month']).sum() + result = self.ymd.sum(level=["year", "month"]) + expected = self.ymd.groupby(level=["year", "month"]).sum() tm.assert_frame_equal(result, expected) - result = self.ymd['A'].sum(level=['year', 'month']) - expected = self.ymd['A'].groupby(level=['year', 'month']).sum() + result = self.ymd["A"].sum(level=["year", "month"]) + expected = self.ymd["A"].groupby(level=["year", "month"]).sum() tm.assert_series_equal(result, expected) def test_groupby_multilevel(self): @@ -1008,39 +1095,42 @@ def test_groupby_multilevel_with_transform(self): pass def test_multilevel_consolidate(self): - index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), ( - 'bar', 'one'), ('bar', 'two')]) + index = MultiIndex.from_tuples( + [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")] + ) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) - df['Totals', ''] = df.sum(1) + df["Totals", ""] = df.sum(1) df = df._consolidate() def test_loc_preserve_names(self): result = self.ymd.loc[2000] - result2 = self.ymd['A'].loc[2000] + result2 = self.ymd["A"].loc[2000] assert result.index.names == self.ymd.index.names[1:] assert result2.index.names == self.ymd.index.names[1:] result = self.ymd.loc[2000, 2] - result2 = self.ymd['A'].loc[2000, 2] + result2 = self.ymd["A"].loc[2000, 2] assert result.index.name == self.ymd.index.names[2] assert result2.index.name == self.ymd.index.names[2] def test_unstack_preserve_types(self): # GH #403 - self.ymd['E'] = 'foo' - self.ymd['F'] = 2 + self.ymd["E"] = "foo" + self.ymd["F"] = 2 - unstacked = self.ymd.unstack('month') - assert unstacked['A', 1].dtype == np.float64 - assert unstacked['E', 1].dtype == np.object_ - assert unstacked['F', 1].dtype == np.float64 + unstacked = self.ymd.unstack("month") + assert unstacked["A", 1].dtype == np.float64 + assert unstacked["E", 1].dtype == np.object_ + assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self): codes = np.tile(np.arange(500), 2) level = np.arange(500) - index = MultiIndex(levels=[level] * 8 + [[0, 1]], - codes=[codes] * 8 + [np.arange(2).repeat(500)]) + index = 
MultiIndex( + levels=[level] * 8 + [[0, 1]], + codes=[codes] * 8 + [np.arange(2).repeat(500)], + ) s = Series(np.arange(1000), index=index) result = s.unstack() @@ -1051,17 +1141,20 @@ def test_unstack_group_index_overflow(self): tm.assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning - index = MultiIndex(levels=[[0, 1]] + [level] * 8, - codes=[np.arange(2).repeat(500)] + [codes] * 8) + index = MultiIndex( + levels=[[0, 1]] + [level] * 8, + codes=[np.arange(2).repeat(500)] + [codes] * 8, + ) s = Series(np.arange(1000), index=index) result = s.unstack(0) assert result.shape == (500, 2) # put it in middle - index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4, - codes=([codes] * 4 + [np.arange(2).repeat(500)] + - [codes] * 4)) + index = MultiIndex( + levels=[level] * 4 + [[0, 1]] + [level] * 4, + codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4), + ) s = Series(np.arange(1000), index=index) result = s.unstack(4) @@ -1072,12 +1165,17 @@ def test_pyint_engine(self): # bits, the index underlying the MultiIndex engine works with Python # integers, rather than uint64. N = 5 - keys = [tuple(l) for l in [[0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N]] + keys = [ + tuple(l) + for l in [ + [0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N, + ] + ] # Each level contains 4 elements (including NaN), so it is represented # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a # 64 bit engine and truncating the first levels, the fourth and fifth @@ -1100,93 +1198,106 @@ def test_pyint_engine(self): tm.assert_numpy_array_equal(result, expected) def test_to_html(self): - self.ymd.columns.name = 'foo' + self.ymd.columns.name = "foo" self.ymd.to_html() self.ymd.T.to_html() def test_level_with_tuples(self): - index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), ( - 'foo', 'qux', 0)], [0, 1]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) - result = series[('foo', 'bar', 0)] - result2 = series.loc[('foo', 'bar', 0)] + result = series[("foo", "bar", 0)] + result2 = series.loc[("foo", "bar", 0)] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"): - series[('foo', 'bar', 0), 2] + series[("foo", "bar", 0), 2] - result = frame.loc[('foo', 'bar', 0)] - result2 = frame.xs(('foo', 'bar', 0)) + result = frame.loc[("foo", "bar", 0)] + result2 = frame.xs(("foo", "bar", 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), ( - 'foo', 'qux')], [0, 1]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) - result = series[('foo', 'bar')] - result2 = 
series.loc[('foo', 'bar')] + result = series[("foo", "bar")] + result2 = series.loc[("foo", "bar")] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) - result = frame.loc[('foo', 'bar')] - result2 = frame.xs(('foo', 'bar')) + result = frame.loc[("foo", "bar")] + result2 = frame.xs(("foo", "bar")) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) def test_mixed_depth_drop(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) - result = df.drop('a', axis=1) - expected = df.drop([('a', '', '')], axis=1) + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) tm.assert_frame_equal(expected, result) - result = df.drop(['top'], axis=1) - expected = df.drop([('top', 'OD', 'wx')], axis=1) - expected = expected.drop([('top', 'OD', 'wy')], axis=1) + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) tm.assert_frame_equal(expected, result) - result = df.drop(('top', 'OD', 'wx'), axis=1) - expected = df.drop([('top', 'OD', 'wx')], axis=1) + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) tm.assert_frame_equal(expected, result) - expected = df.drop([('top', 'OD', 'wy')], axis=1) - expected = df.drop('top', axis=1) + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) - result = df.drop('result1', level=1, axis=1) - expected = df.drop([('routine1', 'result1', ''), - ('routine2', 'result1', '')], axis=1) + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) tm.assert_frame_equal(expected, result) def test_drop_nonunique(self): - df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2], - ["z-c", "z", "c", 3.1], ["x-a", "x", "a", 4.1], - ["x-b", "x", "b", 5.1], ["x-b", "x", "b", 4.1], - ["x-b", "x", "b", 2.2], - ["y-a", "y", "a", 1.2], ["z-b", "z", "b", 2.1]], - columns=["var1", "var2", "var3", "var4"]) + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) grp_size = df.groupby("var1").size() drop_idx = grp_size.loc[grp_size == 1] @@ -1202,9 +1313,11 @@ def test_drop_nonunique(self): tm.assert_frame_equal(result, expected) def test_mixed_depth_pop(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) @@ -1212,97 +1325,99 @@ def test_mixed_depth_pop(self): df1 = df.copy() 
df2 = df.copy() - result = df1.pop('a') - expected = df2.pop(('a', '', '')) + result = df1.pop("a") + expected = df2.pop(("a", "", "")) tm.assert_series_equal(expected, result, check_names=False) tm.assert_frame_equal(df1, df2) - assert result.name == 'a' + assert result.name == "a" - expected = df1['top'] - df1 = df1.drop(['top'], axis=1) - result = df2.pop('top') + expected = df1["top"] + df1 = df1.drop(["top"], axis=1) + result = df2.pop("top") tm.assert_frame_equal(expected, result) tm.assert_frame_equal(df1, df2) def test_reindex_level_partial_selection(self): - result = self.frame.reindex(['foo', 'qux'], level=0) + result = self.frame.reindex(["foo", "qux"], level=0) expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]] tm.assert_frame_equal(result, expected) - result = self.frame.T.reindex(['foo', 'qux'], axis=1, level=0) + result = self.frame.T.reindex(["foo", "qux"], axis=1, level=0) tm.assert_frame_equal(result, expected.T) - result = self.frame.loc[['foo', 'qux']] + result = self.frame.loc[["foo", "qux"]] tm.assert_frame_equal(result, expected) - result = self.frame['A'].loc[['foo', 'qux']] - tm.assert_series_equal(result, expected['A']) + result = self.frame["A"].loc[["foo", "qux"]] + tm.assert_series_equal(result, expected["A"]) - result = self.frame.T.loc[:, ['foo', 'qux']] + result = self.frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) def test_drop_level(self): - result = self.frame.drop(['bar', 'qux'], level='first') + result = self.frame.drop(["bar", "qux"], level="first") expected = self.frame.iloc[[0, 1, 2, 5, 6]] tm.assert_frame_equal(result, expected) - result = self.frame.drop(['two'], level='second') + result = self.frame.drop(["two"], level="second") expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] tm.assert_frame_equal(result, expected) - result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') + result = self.frame.T.drop(["bar", "qux"], axis=1, level="first") expected = self.frame.iloc[[0, 1, 2, 5, 6]].T tm.assert_frame_equal(result, expected) - result = self.frame.T.drop(['two'], axis=1, level='second') + result = self.frame.T.drop(["two"], axis=1, level="second") expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T tm.assert_frame_equal(result, expected) def test_drop_level_nonunique_datetime(self): # GH 12701 - idx = Index([2, 3, 4, 4, 5], name='id') - idxdt = pd.to_datetime(['201603231400', - '201603231500', - '201603231600', - '201603231600', - '201603231700']) - df = DataFrame(np.arange(10).reshape(5, 2), - columns=list('ab'), index=idx) - df['tstamp'] = idxdt - df = df.set_index('tstamp', append=True) - ts = Timestamp('201603231600') + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "201603231400", + "201603231500", + "201603231600", + "201603231600", + "201603231700", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") assert df.index.is_unique is False - result = df.drop(ts, level='tstamp') + result = df.drop(ts, level="tstamp") expected = df.loc[idx != 4] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('box', [Series, DataFrame]) + @pytest.mark.parametrize("box", [Series, DataFrame]) def test_drop_tz_aware_timestamp_across_dst(self, box): # GH 21761 - start = Timestamp('2017-10-29', tz='Europe/Berlin') - end = Timestamp('2017-10-29 04:00:00', tz='Europe/Berlin') - index = pd.date_range(start, end, freq='15min') + start = Timestamp("2017-10-29", 
tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") data = box(data=[1] * len(index), index=index) result = data.drop(start) - expected_start = Timestamp('2017-10-29 00:15:00', tz='Europe/Berlin') - expected_idx = pd.date_range(expected_start, end, freq='15min') + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") expected = box(data=[1] * len(expected_idx), index=expected_idx) tm.assert_equal(result, expected) def test_drop_preserve_names(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [1, 2, 3, 1, 2, 3]], - names=['one', 'two']) + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) df = DataFrame(np.random.randn(6, 3), index=index) result = df.drop([(0, 2)]) - assert result.index.names == ('one', 'two') + assert result.index.names == ("one", "two") def test_unicode_repr_issues(self): - levels = [Index(['a/\u03c3', 'b/\u03c3', 'c/\u03c3']), - Index([0, 1])] + levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] index = MultiIndex(levels=levels, codes=codes) @@ -1312,8 +1427,7 @@ def test_unicode_repr_issues(self): # repr(index.get_level_values(1)) def test_unicode_repr_level_names(self): - index = MultiIndex.from_tuples([(0, 0), (1, 1)], - names=['\u0394', 'i1']) + index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) s = Series(range(2), index=index) df = DataFrame(np.random.randn(2, 4), index=index) @@ -1322,48 +1436,51 @@ def test_unicode_repr_level_names(self): def test_join_segfault(self): # 1532 - df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) - df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]}) - df1 = df1.set_index(['a', 'b']) - df2 = df2.set_index(['a', 'b']) + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) # it works! - for how in ['left', 'right', 'outer']: + for how in ["left", "right", "outer"]: df1.join(df2, how=how) def test_frame_dict_constructor_empty_series(self): - s1 = Series([ - 1, 2, 3, 4 - ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)])) - s2 = Series([ - 1, 2, 3, 4 - ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])) + s1 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) + ) + s2 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) + ) s3 = Series() # it works! 
- DataFrame({'foo': s1, 'bar': s2, 'baz': s3}) - DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2}) + DataFrame({"foo": s1, "bar": s2, "baz": s3}) + DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) def test_multiindex_na_repr(self): # only an issue with long columns from numpy import nan - df3 = DataFrame({ - 'A' * 30: {('A', 'A0006000', 'nuit'): 'A0006000'}, - 'B' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'C' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'D' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'E' * 30: {('A', 'A0006000', 'nuit'): 'A'}, - 'F' * 30: {('A', 'A0006000', 'nuit'): nan}, - }) - - idf = df3.set_index(['A' * 30, 'C' * 30]) + + df3 = DataFrame( + { + "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, + "B" * 30: {("A", "A0006000", "nuit"): nan}, + "C" * 30: {("A", "A0006000", "nuit"): nan}, + "D" * 30: {("A", "A0006000", "nuit"): nan}, + "E" * 30: {("A", "A0006000", "nuit"): "A"}, + "F" * 30: {("A", "A0006000", "nuit"): nan}, + } + ) + + idf = df3.set_index(["A" * 30, "C" * 30]) repr(idf) def test_assign_index_sequences(self): # #2200 - df = DataFrame({"a": [1, 2, 3], - "b": [4, 5, 6], - "c": [7, 8, 9]}).set_index(["a", "b"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) index = list(df.index) index[0] = ("faz", "boo") df.index = index @@ -1375,19 +1492,24 @@ def test_assign_index_sequences(self): repr(df) def test_tuples_have_na(self): - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, -1, 0, 0, 0], - [0, 1, 2, 3, 0, 1, 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) assert isna(index[4][0]) assert isna(index.values[4][0]) def test_duplicate_groupby_issues(self): - idx_tp = [('600809', '20061231'), ('600809', '20070331'), - ('600809', '20070630'), ('600809', '20070331')] - dt = ['demo', 'demo', 'demo', 'demo'] - - idx = MultiIndex.from_tuples(idx_tp, names=['STK_ID', 'RPT_Date']) + idx_tp = [ + ("600809", "20061231"), + ("600809", "20070331"), + ("600809", "20070630"), + ("600809", "20070331"), + ] + dt = ["demo", "demo", "demo", "demo"] + + idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"]) s = Series(dt, index=idx) result = s.groupby(s.index).first() @@ -1395,25 +1517,31 @@ def test_duplicate_groupby_issues(self): def test_duplicate_mi(self): # GH 4516 - df = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2], - ['bah', 'bam', 3.0, 3], - ['bah', 'bam', 4.0, 4], ['foo', 'bar', 5.0, 5], - ['bah', 'bam', 6.0, 6]], - columns=list('ABCD')) - df = df.set_index(['A', 'B']) + df = DataFrame( + [ + ["foo", "bar", 1.0, 1], + ["foo", "bar", 2.0, 2], + ["bah", "bam", 3.0, 3], + ["bah", "bam", 4.0, 4], + ["foo", "bar", 5.0, 5], + ["bah", "bam", 6.0, 6], + ], + columns=list("ABCD"), + ) + df = df.set_index(["A", "B"]) df = df.sort_index(level=0) - expected = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2], - ['foo', 'bar', 5.0, 5]], - columns=list('ABCD')).set_index(['A', 'B']) - result = df.loc[('foo', 'bar')] + expected = DataFrame( + [["foo", "bar", 1.0, 1], ["foo", "bar", 2.0, 2], ["foo", "bar", 5.0, 5]], + columns=list("ABCD"), + ).set_index(["A", "B"]) + result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) def test_duplicated_drop_duplicates(self): # GH 4060 idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2])) - expected = np.array( - [False, False, False, True, False, False], dtype=bool) + expected = np.array([False, False, False, True, False, False], 
dtype=bool) duplicated = idx.duplicated() tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool @@ -1421,11 +1549,11 @@ def test_duplicated_drop_duplicates(self): tm.assert_index_equal(idx.drop_duplicates(), expected) expected = np.array([True, False, False, False, False, False]) - duplicated = idx.duplicated(keep='last') + duplicated = idx.duplicated(keep="last") tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) - tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected) + tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) expected = np.array([True, False, False, True, False, False]) duplicated = idx.duplicated(keep=False) @@ -1436,25 +1564,26 @@ def test_duplicated_drop_duplicates(self): def test_multiindex_set_index(self): # segfault in #3308 - d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]} + d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} df = DataFrame(d) tuples = [(0, 1), (0, 2), (1, 2)] - df['tuples'] = tuples + df["tuples"] = tuples - index = MultiIndex.from_tuples(df['tuples']) + index = MultiIndex.from_tuples(df["tuples"]) # it works! df.set_index(index) def test_datetimeindex(self): idx1 = pd.DatetimeIndex( - ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00' - ] * 2, tz='Asia/Tokyo') - idx2 = pd.date_range('2010/01/01', periods=6, freq='M', - tz='US/Eastern') + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, + tz="Asia/Tokyo", + ) + idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) - expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', - '2013-04-03 9:00'], tz='Asia/Tokyo') + expected1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" + ) tm.assert_index_equal(idx.levels[0], expected1) tm.assert_index_equal(idx.levels[1], idx2) @@ -1465,18 +1594,19 @@ def test_datetimeindex(self): date2 = datetime.datetime.today() date3 = Timestamp.today() - for d1, d2 in itertools.product( - [date1, date2, date3], [date1, date2, date3]): + for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): index = MultiIndex.from_product([[d1], [d2]]) assert isinstance(index.levels[0], pd.DatetimeIndex) assert isinstance(index.levels[1], pd.DatetimeIndex) def test_constructor_with_tz(self): - index = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], - name='dt1', tz='US/Pacific') - columns = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], - name='dt2', tz='Asia/Tokyo') + index = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ) + columns = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ) result = MultiIndex.from_arrays([index, columns]) tm.assert_index_equal(result.levels[0], index) @@ -1489,49 +1619,72 @@ def test_constructor_with_tz(self): def test_set_index_datetime(self): # GH 3950 df = DataFrame( - {'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'datetime': ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'], - 'value': range(6)}) - df.index = pd.to_datetime(df.pop('datetime'), utc=True) - df.index = df.index.tz_convert('US/Pacific') - - expected = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], name='datetime') - expected = 
expected.tz_localize('UTC').tz_convert('US/Pacific') - - df = df.set_index('label', append=True) + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + "value": range(6), + } + ) + df.index = pd.to_datetime(df.pop("datetime"), utc=True) + df.index = df.index.tz_convert("US/Pacific") + + expected = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", + ) + expected = expected.tz_localize("UTC").tz_convert("US/Pacific") + + df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], - Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], - Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) df = DataFrame(np.random.random(6)) - idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'], - tz='US/Eastern') - idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00', - '2012-04-01 09:00', '2012-04-02 09:00', - '2012-04-02 09:00', '2012-04-02 09:00'], - tz='US/Eastern') - idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo') + idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Eastern", + ) + idx2 = pd.DatetimeIndex( + [ + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + ], + tz="US/Eastern", + ) + idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) - expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='US/Eastern') - expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], - tz='US/Eastern') + expected1 = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Eastern", + ) + expected2 = pd.DatetimeIndex( + ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" + ) tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -1544,157 +1697,190 @@ def test_set_index_datetime(self): def test_reset_index_datetime(self): # GH 3950 - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: - idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, - name='idx1') - idx2 = Index(range(5), name='idx2', dtype='int64') + for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]: + idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame( - {'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) - - expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - datetime.datetime(2011, 1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], - 'idx2': np.arange(5, dtype='int64'), 
- 'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx1', 'idx2', 'a', 'b']) - expected['idx1'] = expected['idx1'].apply( - lambda d: Timestamp(d, tz=tz)) + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) - idx3 = pd.date_range('1/1/2012', periods=5, freq='MS', - tz='Europe/Paris', name='idx3') + idx3 = pd.date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) idx = MultiIndex.from_arrays([idx1, idx2, idx3]) df = DataFrame( - {'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) - - expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - datetime.datetime(2011, 1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], - 'idx2': np.arange(5, dtype='int64'), - 'idx3': [datetime.datetime(2012, 1, 1), - datetime.datetime(2012, 2, 1), - datetime.datetime(2012, 3, 1), - datetime.datetime(2012, 4, 1), - datetime.datetime(2012, 5, 1)], - 'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx1', 'idx2', 'idx3', 'a', 'b']) - expected['idx1'] = expected['idx1'].apply( - lambda d: Timestamp(d, tz=tz)) - expected['idx3'] = expected['idx3'].apply( - lambda d: Timestamp(d, tz='Europe/Paris')) + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "idx3": [ + datetime.datetime(2012, 1, 1), + datetime.datetime(2012, 2, 1), + datetime.datetime(2012, 3, 1), + datetime.datetime(2012, 4, 1), + datetime.datetime(2012, 5, 1), + ], + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + expected["idx3"] = expected["idx3"].apply( + lambda d: Timestamp(d, tz="Europe/Paris") + ) tm.assert_frame_equal(df.reset_index(), expected) # GH 7793 - idx = MultiIndex.from_product([['a', 'b'], pd.date_range( - '20130101', periods=3, tz=tz)]) + idx = MultiIndex.from_product( + [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)] + ) df = DataFrame( - np.arange(6, dtype='int64').reshape( - 6, 1), columns=['a'], index=idx) - - expected = DataFrame({'level_0': 'a a a b b b'.split(), - 'level_1': [ - datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 2), - datetime.datetime(2013, 1, 3)] * 2, - 'a': np.arange(6, dtype='int64')}, - columns=['level_0', 'level_1', 'a']) - expected['level_1'] = expected['level_1'].apply( - lambda d: Timestamp(d, freq='D', tz=tz)) + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": [ + datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 2), + datetime.datetime(2013, 1, 
3), + ] + * 2, + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + expected["level_1"] = expected["level_1"].apply( + lambda d: Timestamp(d, freq="D", tz=tz) + ) tm.assert_frame_equal(df.reset_index(), expected) def test_reset_index_period(self): # GH 7746 idx = MultiIndex.from_product( - [pd.period_range('20130101', periods=3, freq='M'), list('abc')], - names=['month', 'feature']) - - df = DataFrame(np.arange(9, dtype='int64').reshape(-1, 1), - index=idx, columns=['a']) - expected = DataFrame({ - 'month': ([pd.Period('2013-01', freq='M')] * 3 + - [pd.Period('2013-02', freq='M')] * 3 + - [pd.Period('2013-03', freq='M')] * 3), - 'feature': ['a', 'b', 'c'] * 3, - 'a': np.arange(9, dtype='int64') - }, columns=['month', 'feature', 'a']) + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + + [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", "c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + columns=["month", "feature", "a"], + ) tm.assert_frame_equal(df.reset_index(), expected) def test_reset_index_multiindex_columns(self): - levels = [['A', ''], ['B', 'b']] - df = DataFrame([[0, 2], [1, 3]], - columns=MultiIndex.from_tuples(levels)) - result = df[['B']].rename_axis('A').reset_index() + levels = [["A", ""], ["B", "b"]] + df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + result = df[["B"]].rename_axis("A").reset_index() tm.assert_frame_equal(result, df) # gh-16120: already existing column - with pytest.raises(ValueError, - match=(r"cannot insert \('A', ''\), " - "already exists")): - df.rename_axis('A').reset_index() + with pytest.raises( + ValueError, match=(r"cannot insert \('A', ''\), " "already exists") + ): + df.rename_axis("A").reset_index() # gh-16164: multiindex (tuple) full key - result = df.set_index([('A', '')]).reset_index() + result = df.set_index([("A", "")]).reset_index() tm.assert_frame_equal(result, df) # with additional (unnamed) index level - idx_col = DataFrame([[0], [1]], - columns=MultiIndex.from_tuples([('level_0', '')])) - expected = pd.concat([idx_col, df[[('B', 'b'), ('A', '')]]], axis=1) - result = df.set_index([('B', 'b')], append=True).reset_index() + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) + ) + expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) + result = df.set_index([("B", "b")], append=True).reset_index() tm.assert_frame_equal(result, expected) # with index name which is a too long tuple... - with pytest.raises(ValueError, - match=("Item must have length equal " - "to number of levels.")): - df.rename_axis([('C', 'c', 'i')]).reset_index() + with pytest.raises( + ValueError, match=("Item must have length equal " "to number of levels.") + ): + df.rename_axis([("C", "c", "i")]).reset_index() # or too short... 
- levels = [['A', 'a', ''], ['B', 'b', 'i']] - df2 = DataFrame([[0, 2], [1, 3]], - columns=MultiIndex.from_tuples(levels)) - idx_col = DataFrame([[0], [1]], - columns=MultiIndex.from_tuples([('C', 'c', 'ii')])) + levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) expected = pd.concat([idx_col, df2], axis=1) - result = df2.rename_axis([('C', 'c')]).reset_index(col_fill='ii') + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") tm.assert_frame_equal(result, expected) # ... which is incompatible with col_fill=None - with pytest.raises(ValueError, - match=("col_fill=None is incompatible with " - r"incomplete column name \('C', 'c'\)")): - df2.rename_axis([('C', 'c')]).reset_index(col_fill=None) + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) # with col_level != 0 - result = df2.rename_axis([('c', 'ii')]).reset_index(col_level=1, - col_fill='C') + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") tm.assert_frame_equal(result, expected) def test_set_index_period(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = pd.period_range('2011-01-01', periods=3, freq='M') + idx1 = pd.period_range("2011-01-01", periods=3, freq="M") idx1 = idx1.append(idx1) - idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") idx2 = idx2.append(idx2).append(idx2) - idx3 = pd.period_range('2005', periods=6, freq='A') + idx3 = pd.period_range("2005", periods=6, freq="A") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) - expected1 = pd.period_range('2011-01-01', periods=3, freq='M') - expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + expected1 = pd.period_range("2011-01-01", periods=3, freq="M") + expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -1708,9 +1894,9 @@ def test_repeat(self): # GH 9361 # fixed by # GH 7891 m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) - data = ['a', 'b', 'c', 'd'] + data = ["a", "b", "c", "d"] m_df = Series(data, index=m_idx) - assert m_df.repeat(3).shape == (3 * len(data), ) + assert m_df.repeat(3).shape == (3 * len(data),) class TestSorted(Base): @@ -1725,22 +1911,38 @@ def test_sorting_repr_8017(self): np.random.seed(0) data = np.random.randn(3, 4) - for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), - ([Timestamp('20130101'), Timestamp('20130103'), - Timestamp('20130102'), Timestamp('20130105')], - Timestamp('20130104')), - (['1one', '3one', '2one', '5one'], '4one')]: - columns = MultiIndex.from_tuples([('red', i) for i in gen]) - df = DataFrame(data, index=list('def'), columns=columns) - df2 = pd.concat([df, - DataFrame('world', index=list('def'), - columns=MultiIndex.from_tuples( - [('red', extra)]))], axis=1) + for gen, extra in [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ]: + columns = MultiIndex.from_tuples([("red", i) for i in gen]) 
+ df = DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) # check that the repr is good # make sure that we have a correct sparsified repr # e.g. only 1 header of read - assert str(df2).splitlines()[0].split() == ['red'] + assert str(df2).splitlines()[0].split() == ["red"] # GH 8017 # sorting fails after columns added @@ -1756,7 +1958,7 @@ def test_sorting_repr_8017(self): # setitem then sort result = df.copy() - result[('red', extra)] = 'world' + result[("red", extra)] = "world" result = result.sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1768,7 +1970,7 @@ def test_sort_index_level(self): # axis=1 # series - a_sorted = self.frame['A'].sort_index(level=0) + a_sorted = self.frame["A"].sort_index(level=0) # preserve names assert a_sorted.index.names == self.frame.index.names @@ -1798,8 +2000,8 @@ def test_sort_index_level_large_cardinality(self): assert result.index.lexsort_depth == 3 def test_sort_index_level_by_name(self): - self.frame.index.names = ['first', 'second'] - result = self.frame.sort_index(level='second') + self.frame.index.names = ["first", "second"] + result = self.frame.sort_index(level="second") expected = self.frame.sort_index(level=1) tm.assert_frame_equal(result, expected) @@ -1807,32 +2009,36 @@ def test_sort_index_level_mixed(self): sorted_before = self.frame.sort_index(level=1) df = self.frame.copy() - df['foo'] = 'bar' + df["foo"] = "bar" sorted_after = df.sort_index(level=1) - tm.assert_frame_equal(sorted_before, - sorted_after.drop(['foo'], axis=1)) + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) dft = self.frame.T sorted_before = dft.sort_index(level=1, axis=1) - dft['foo', 'three'] = 'bar' + dft["foo", "three"] = "bar" sorted_after = dft.sort_index(level=1, axis=1) - tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), - sorted_after.drop([('foo', 'three')], axis=1)) + tm.assert_frame_equal( + sorted_before.drop([("foo", "three")], axis=1), + sorted_after.drop([("foo", "three")], axis=1), + ) def test_is_lexsorted(self): levels = [[0, 1], [0, 1, 2]] - index = MultiIndex(levels=levels, - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + ) assert index.is_lexsorted() - index = MultiIndex(levels=levels, - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] + ) assert not index.is_lexsorted() - index = MultiIndex(levels=levels, - codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] + ) assert not index.is_lexsorted() assert index.lexsort_depth == 0 @@ -1842,17 +2048,19 @@ def test_sort_index_and_reconstruction(self): # lexsortedness should be identical # across MultiIndex construction methods - df = DataFrame([[1, 1], [2, 2]], index=list('ab')) - expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex.from_tuples([(0.5, 'a'), - (0.5, 'b'), - (0.8, 'a'), - (0.8, 'b')])) + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) + expected = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples( + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] + ), + ) assert expected.index.is_lexsorted() result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], - 
index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), + ) result = result.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic @@ -1861,8 +2069,10 @@ def test_sort_index_and_reconstruction(self): result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + index=MultiIndex( + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) result = result.sort_index() assert result.index.is_lexsorted() @@ -1877,15 +2087,17 @@ def test_sort_index_and_reconstruction(self): tm.assert_frame_equal(result, expected) # 14015 - df = DataFrame([[1, 2], [6, 7]], - columns=MultiIndex.from_tuples( - [(0, '20160811 12:00:00'), - (0, '20160809 12:00:00')], - names=['l1', 'Date'])) - - df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), - level=1, - inplace=True) + df = DataFrame( + [[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], + names=["l1", "Date"], + ), + ) + + df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + ) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic result = df.sort_index(axis=1) @@ -1897,18 +2109,22 @@ def test_sort_index_and_reconstruction(self): def test_sort_index_and_reconstruction_doc_example(self): # doc example - df = DataFrame({'value': [1, 2, 3, 4]}, - index=MultiIndex( - levels=[['a', 'b'], ['bb', 'aa']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex( + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) assert df.index.is_lexsorted() assert not df.index.is_monotonic # sort it - expected = DataFrame({'value': [2, 1, 4, 3]}, - index=MultiIndex( - levels=[['a', 'b'], ['aa', 'bb']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + expected = DataFrame( + {"value": [2, 1, 4, 3]}, + index=MultiIndex( + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) result = df.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic @@ -1928,20 +2144,22 @@ def test_sort_index_reorder_on_ops(self): df = DataFrame( np.random.randn(8, 2), index=MultiIndex.from_product( - [['a', 'b'], ['big', 'small'], ['red', 'blu']], - names=['letter', 'size', 'color']), - columns=['near', 'far']) + [["a", "b"], ["big", "small"], ["red", "blu"]], + names=["letter", "size", "color"], + ), + columns=["near", "far"], + ) df = df.sort_index() def my_func(group): - group.index = ['newz', 'newa'] + group.index = ["newz", "newa"] return group - result = df.groupby(level=['letter', 'size']).apply( - my_func).sort_index() + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() expected = MultiIndex.from_product( - [['a', 'b'], ['big', 'small'], ['newa', 'newz']], - names=['letter', 'size', None]) + [["a", "b"], ["big", "small"], ["newa", "newz"]], + names=["letter", "size", None], + ) tm.assert_index_equal(result.index, expected) @@ -1949,14 +2167,11 @@ def test_sort_non_lexsorted(self): # degenerate case where we sort but don't # have a satisfying result :< # GH 15797 - idx = MultiIndex([['A', 'B', 'C'], - ['c', 'b', 'a']], - [[0, 1, 2, 0, 1, 2], - [0, 2, 1, 1, 0, 2]]) - - df = DataFrame({'col': range(len(idx))}, - index=idx, - dtype='int64') + idx = MultiIndex( + [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]] + ) + + df = 
DataFrame({"col": range(len(idx))}, index=idx, dtype="int64") assert df.index.is_lexsorted() is False assert df.index.is_monotonic is False @@ -1965,11 +2180,13 @@ def test_sort_non_lexsorted(self): assert sorted.index.is_monotonic is True expected = DataFrame( - {'col': [1, 4, 5, 2]}, - index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'), - ('C', 'a'), ('C', 'b')]), - dtype='int64') - result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :] + {"col": [1, 4, 5, 2]}, + index=MultiIndex.from_tuples( + [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")] + ), + dtype="int64", + ) + result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] tm.assert_frame_equal(result, expected) def test_sort_index_nan(self): @@ -1978,21 +2195,45 @@ def test_sort_index_nan(self): tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] mi = MultiIndex.from_tuples(tuples) - df = DataFrame(np.arange(16).reshape(4, 4), - index=mi, columns=list('ABCD')) + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) s = Series(np.arange(4), index=mi) - df2 = DataFrame({ - 'date': pd.to_datetime([ - '20121002', '20121007', '20130130', '20130202', '20130305', - '20121002', '20121207', '20130130', '20130202', '20130305', - '20130202', '20130305' - ]), - 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], - 'whole_cost': [1790, np.nan, 280, 259, np.nan, 623, 90, 312, - np.nan, 301, 359, 801], - 'cost': [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12] - }).set_index(['date', 'user_id']) + df2 = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + np.nan, + 280, + 259, + np.nan, + 623, + 90, + 312, + np.nan, + 301, + 359, + 801, + ], + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], + } + ).set_index(["date", "user_id"]) # sorting frame, default nan position is last result = df.sort_index() @@ -2000,12 +2241,12 @@ def test_sort_index_nan(self): tm.assert_frame_equal(result, expected) # sorting frame, nan position last - result = df.sort_index(na_position='last') + result = df.sort_index(na_position="last") expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position first - result = df.sort_index(na_position='first') + result = df.sort_index(na_position="first") expected = df.iloc[[1, 2, 3, 0], :] tm.assert_frame_equal(result, expected) @@ -2020,12 +2261,12 @@ def test_sort_index_nan(self): tm.assert_series_equal(result, expected) # sorting series, nan position last - result = s.sort_index(na_position='last') + result = s.sort_index(na_position="last") expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position first - result = s.sort_index(na_position='first') + result = s.sort_index(na_position="first") expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected) @@ -2033,20 +2274,21 @@ def test_sort_ascending_list(self): # GH: 16934 # Set up a Series with a three level MultiIndex - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], - [4, 3, 2, 1, 4, 3, 2, 1]] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + [4, 3, 2, 1, 4, 3, 2, 1], + ] tuples = zip(*arrays) - mi = MultiIndex.from_tuples(tuples, 
names=['first', 'second', 'third']) + mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = Series(range(8), index=mi) # Sort with boolean ascending - result = s.sort_index(level=['third', 'first'], ascending=False) + result = s.sort_index(level=["third", "first"], ascending=False) expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]] tm.assert_series_equal(result, expected) # Sort with list of boolean ascending - result = s.sort_index(level=['third', 'first'], - ascending=[False, True]) + result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 6e7b34a0632ad8..f6e936630f6be8 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -18,7 +18,6 @@ class TestnanopsDataFrame: - def setup_method(self, method): np.random.seed(11235) nanops._USE_BOTTLENECK = False @@ -30,12 +29,10 @@ def setup_method(self, method): self.arr_complex = self.arr_float + self.arr_float1 * 1j self.arr_int = np.random.randint(-10, 10, arr_shape) self.arr_bool = np.random.randint(0, 2, arr_shape) == 0 - self.arr_str = np.abs(self.arr_float).astype('S') - self.arr_utf = np.abs(self.arr_float).astype('U') - self.arr_date = np.random.randint(0, 20000, - arr_shape).astype('M8[ns]') - self.arr_tdelta = np.random.randint(0, 20000, - arr_shape).astype('m8[ns]') + self.arr_str = np.abs(self.arr_float).astype("S") + self.arr_utf = np.abs(self.arr_float).astype("U") + self.arr_date = np.random.randint(0, 20000, arr_shape).astype("M8[ns]") + self.arr_tdelta = np.random.randint(0, 20000, arr_shape).astype("m8[ns]") self.arr_nan = np.tile(np.nan, arr_shape) self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan]) @@ -47,29 +44,27 @@ def setup_method(self, method): self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf]) self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf]) - self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, - self.arr_inf]) - self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, - self.arr_inf]) - self.arr_obj = np.vstack([ - self.arr_float.astype('O'), - self.arr_int.astype('O'), - self.arr_bool.astype('O'), - self.arr_complex.astype('O'), - self.arr_str.astype('O'), - self.arr_utf.astype('O'), - self.arr_date.astype('O'), - self.arr_tdelta.astype('O') - ]) - - with np.errstate(invalid='ignore'): + self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf]) + self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf]) + self.arr_obj = np.vstack( + [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + self.arr_complex.astype("O"), + self.arr_str.astype("O"), + self.arr_utf.astype("O"), + self.arr_date.astype("O"), + self.arr_tdelta.astype("O"), + ] + ) + + with np.errstate(invalid="ignore"): self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j - self.arr_complex_nan = np.vstack([self.arr_complex, - self.arr_nan_nanj]) + self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) self.arr_nan_infj = self.arr_inf * 1j - self.arr_complex_nan_infj = np.vstack([self.arr_complex, - self.arr_nan_infj]) + self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj]) self.arr_float_2d = self.arr_float[:, :, 0] self.arr_float1_2d = self.arr_float1[:, :, 0] @@ -91,22 +86,26 @@ def teardown_method(self, method): nanops._USE_BOTTLENECK = use_bn def check_results(self, targ, res, 
axis, check_dtype=True): - res = getattr(res, 'asm8', res) - res = getattr(res, 'values', res) + res = getattr(res, "asm8", res) + res = getattr(res, "values", res) # timedeltas are a beast here def _coerce_tds(targ, res): - if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]': + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": if len(targ) == 1: targ = targ[0].item() res = res.item() else: - targ = targ.view('i8') + targ = targ.view("i8") return targ, res try: - if axis != 0 and hasattr( - targ, 'shape') and targ.ndim and targ.shape != res.shape: + if ( + axis != 0 + and hasattr(targ, "shape") + and targ.ndim + and targ.shape != res.shape + ): res = np.split(res, [targ.shape[0]], axis=0)[0] except (ValueError, IndexError): targ, res = _coerce_tds(targ, res) @@ -116,7 +115,7 @@ def _coerce_tds(targ, res): except AssertionError: # handle timedelta dtypes - if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]': + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": targ, res = _coerce_tds(targ, res) tm.assert_almost_equal(targ, res, check_dtype=check_dtype) return @@ -124,34 +123,40 @@ def _coerce_tds(targ, res): # There are sometimes rounding errors with # complex and object dtypes. # If it isn't one of those, re-raise the error. - if not hasattr(res, 'dtype') or res.dtype.kind not in ['c', 'O']: + if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]: raise # convert object dtypes to something that can be split into # real and imaginary parts - if res.dtype.kind == 'O': - if targ.dtype.kind != 'O': + if res.dtype.kind == "O": + if targ.dtype.kind != "O": res = res.astype(targ.dtype) else: try: - res = res.astype('c16') + res = res.astype("c16") except RuntimeError: - res = res.astype('f8') + res = res.astype("f8") try: - targ = targ.astype('c16') + targ = targ.astype("c16") except RuntimeError: - targ = targ.astype('f8') + targ = targ.astype("f8") # there should never be a case where numpy returns an object # but nanops doesn't, so make that an exception - elif targ.dtype.kind == 'O': + elif targ.dtype.kind == "O": raise - tm.assert_almost_equal(np.real(targ), np.real(res), - check_dtype=check_dtype) - tm.assert_almost_equal(np.imag(targ), np.imag(res), - check_dtype=check_dtype) - - def check_fun_data(self, testfunc, targfunc, testarval, targarval, - targarnanval, check_dtype=True, empty_targfunc=None, - **kwargs): + tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype) + tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype) + + def check_fun_data( + self, + testfunc, + targfunc, + testarval, + targarval, + targarnanval, + check_dtype=True, + empty_targfunc=None, + **kwargs + ): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval @@ -161,25 +166,23 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, targ = targfunc(targartempval, axis=axis, **kwargs) try: - res = testfunc(testarval, axis=axis, skipna=skipna, - **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + self.check_results(targ, res, axis, check_dtype=check_dtype) if axis is None: res = testfunc(testarval, skipna=skipna, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + 
self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna and axis is None: res = testfunc(testarval, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: - exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1), - 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) + exc.args += ( + "axis: %s of %s" % (axis, testarval.ndim - 1), + "skipna: %s" % skipna, + "kwargs: %s" % kwargs, + ) raise if testarval.ndim <= 1: @@ -191,12 +194,27 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return - self.check_fun_data(testfunc, targfunc, testarval2, targarval2, - targarnanval2, check_dtype=check_dtype, - empty_targfunc=empty_targfunc, **kwargs) - - def check_fun(self, testfunc, targfunc, testar, targar=None, - targarnan=None, empty_targfunc=None, **kwargs): + self.check_fun_data( + testfunc, + targfunc, + testarval2, + targarval2, + targarnanval2, + check_dtype=check_dtype, + empty_targfunc=empty_targfunc, + **kwargs + ) + + def check_fun( + self, + testfunc, + targfunc, + testar, + targar=None, + targarnan=None, + empty_targfunc=None, + **kwargs + ): if targar is None: targar = testar if targarnan is None: @@ -205,40 +223,61 @@ def check_fun(self, testfunc, targfunc, testar, targar=None, targarval = getattr(self, targar) targarnanval = getattr(self, targarnan) try: - self.check_fun_data(testfunc, targfunc, testarval, targarval, - targarnanval, empty_targfunc=empty_targfunc, - **kwargs) + self.check_fun_data( + testfunc, + targfunc, + testarval, + targarval, + targarnanval, + empty_targfunc=empty_targfunc, + **kwargs + ) except BaseException as exc: - exc.args += ('testar: %s' % testar, 'targar: %s' % targar, - 'targarnan: %s' % targarnan) + exc.args += ( + "testar: %s" % testar, + "targar: %s" % targar, + "targarnan: %s" % targarnan, + ) raise - def check_funs(self, testfunc, targfunc, allow_complex=True, - allow_all_nan=True, allow_str=True, allow_date=True, - allow_tdelta=True, allow_obj=True, **kwargs): - self.check_fun(testfunc, targfunc, 'arr_float', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_float_nan', 'arr_float', - **kwargs) - self.check_fun(testfunc, targfunc, 'arr_int', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_bool', **kwargs) - objs = [self.arr_float.astype('O'), self.arr_int.astype('O'), - self.arr_bool.astype('O')] + def check_funs( + self, + testfunc, + targfunc, + allow_complex=True, + allow_all_nan=True, + allow_str=True, + allow_date=True, + allow_tdelta=True, + allow_obj=True, + **kwargs + ): + self.check_fun(testfunc, targfunc, "arr_float", **kwargs) + self.check_fun(testfunc, targfunc, "arr_float_nan", "arr_float", **kwargs) + self.check_fun(testfunc, targfunc, "arr_int", **kwargs) + self.check_fun(testfunc, targfunc, "arr_bool", **kwargs) + objs = [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + ] if allow_all_nan: - self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan", **kwargs) if allow_complex: - self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_complex_nan', - 'arr_complex', **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex", **kwargs) + self.check_fun( + testfunc, targfunc, "arr_complex_nan", "arr_complex", **kwargs + ) if allow_all_nan: - self.check_fun(testfunc, targfunc, 'arr_nan_nanj', 
**kwargs) - objs += [self.arr_complex.astype('O')] + self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs) + objs += [self.arr_complex.astype("O")] if allow_str: - self.check_fun(testfunc, targfunc, 'arr_str', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_utf', **kwargs) - objs += [self.arr_str.astype('O'), self.arr_utf.astype('O')] + self.check_fun(testfunc, targfunc, "arr_str", **kwargs) + self.check_fun(testfunc, targfunc, "arr_utf", **kwargs) + objs += [self.arr_str.astype("O"), self.arr_utf.astype("O")] if allow_date: try: @@ -246,8 +285,8 @@ def check_funs(self, testfunc, targfunc, allow_complex=True, except TypeError: pass else: - self.check_fun(testfunc, targfunc, 'arr_date', **kwargs) - objs += [self.arr_date.astype('O')] + self.check_fun(testfunc, targfunc, "arr_date", **kwargs) + objs += [self.arr_date.astype("O")] if allow_tdelta: try: @@ -255,44 +294,69 @@ def check_funs(self, testfunc, targfunc, allow_complex=True, except TypeError: pass else: - self.check_fun(testfunc, targfunc, 'arr_tdelta', **kwargs) - objs += [self.arr_tdelta.astype('O')] + self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs) + objs += [self.arr_tdelta.astype("O")] if allow_obj: self.arr_obj = np.vstack(objs) # some nanops handle object dtypes better than their numpy # counterparts, so the numpy functions need to be given something # else - if allow_obj == 'convert': - targfunc = partial(self._badobj_wrap, func=targfunc, - allow_complex=allow_complex) - self.check_fun(testfunc, targfunc, 'arr_obj', **kwargs) + if allow_obj == "convert": + targfunc = partial( + self._badobj_wrap, func=targfunc, allow_complex=allow_complex + ) + self.check_fun(testfunc, targfunc, "arr_obj", **kwargs) def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): - if value.dtype.kind == 'O': + if value.dtype.kind == "O": if allow_complex: - value = value.astype('c16') + value = value.astype("c16") else: - value = value.astype('f8') + value = value.astype("f8") return func(value, **kwargs) def test_nanany(self): - self.check_funs(nanops.nanany, np.any, allow_all_nan=False, - allow_str=False, allow_date=False, allow_tdelta=False) + self.check_funs( + nanops.nanany, + np.any, + allow_all_nan=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nanall(self): - self.check_funs(nanops.nanall, np.all, allow_all_nan=False, - allow_str=False, allow_date=False, allow_tdelta=False) + self.check_funs( + nanops.nanall, + np.all, + allow_all_nan=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nansum(self): - self.check_funs(nanops.nansum, np.sum, allow_str=False, - allow_date=False, allow_tdelta=True, check_dtype=False, - empty_targfunc=np.nansum) + self.check_funs( + nanops.nansum, + np.sum, + allow_str=False, + allow_date=False, + allow_tdelta=True, + check_dtype=False, + empty_targfunc=np.nansum, + ) def test_nanmean(self): - self.check_funs(nanops.nanmean, np.mean, allow_complex=False, - allow_obj=False, allow_str=False, allow_date=False, - allow_tdelta=True) + self.check_funs( + nanops.nanmean, + np.mean, + allow_complex=False, + allow_obj=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + ) def test_nanmean_overflow(self): # GH 10155 @@ -310,13 +374,13 @@ def test_nanmean_overflow(self): def test_returned_dtype(self): dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64] - if hasattr(np, 'float128'): + if hasattr(np, "float128"): dtypes.append(np.float128) for dtype in dtypes: s = Series(range(10), dtype=dtype) - group_a 
= ['mean', 'std', 'var', 'skew', 'kurt'] - group_b = ['min', 'max'] + group_a = ["mean", "std", "var", "skew", "kurt"] + group_b = ["min", "max"] for method in group_a + group_b: result = getattr(s, method)() if is_integer_dtype(dtype) and method in group_a: @@ -327,36 +391,64 @@ def test_returned_dtype(self): def test_nanmedian(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - self.check_funs(nanops.nanmedian, np.median, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert') - - @pytest.mark.parametrize('ddof', range(3)) + self.check_funs( + nanops.nanmedian, + np.median, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ) + + @pytest.mark.parametrize("ddof", range(3)) def test_nanvar(self, ddof): - self.check_funs(nanops.nanvar, np.var, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert', ddof=ddof) - - @pytest.mark.parametrize('ddof', range(3)) + self.check_funs( + nanops.nanvar, + np.var, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ddof=ddof, + ) + + @pytest.mark.parametrize("ddof", range(3)) def test_nanstd(self, ddof): - self.check_funs(nanops.nanstd, np.std, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert', ddof=ddof) + self.check_funs( + nanops.nanstd, + np.std, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ddof=ddof, + ) @td.skip_if_no_scipy - @pytest.mark.parametrize('ddof', range(3)) + @pytest.mark.parametrize("ddof", range(3)) def test_nansem(self, ddof): from scipy.stats import sem - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nansem, sem, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False, allow_obj='convert', ddof=ddof) + + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nansem, + sem, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + allow_obj="convert", + ddof=ddof, + ) def _minmax_wrap(self, value, axis=None, func=None): # numpy warns if all nan res = func(value, axis) - if res.dtype.kind == 'm': + if res.dtype.kind == "m": res = np.atleast_1d(res) return res @@ -364,15 +456,13 @@ def test_nanmin(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._minmax_wrap, func=np.min) - self.check_funs(nanops.nanmin, func, - allow_str=False, allow_obj=False) + self.check_funs(nanops.nanmin, func, allow_str=False, allow_obj=False) def test_nanmax(self): with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._minmax_wrap, func=np.max) - self.check_funs(nanops.nanmax, func, - allow_str=False, allow_obj=False) + self.check_funs(nanops.nanmax, func, allow_str=False, allow_obj=False) def _argminmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -380,8 +470,12 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif (hasattr(nullnan, 'all') and nullnan.all() or - not hasattr(nullnan, 'all') and nullnan): + elif ( + hasattr(nullnan, "all") + and nullnan.all() + or not hasattr(nullnan, "all") + and nullnan + ): res = -1 return res @@ -389,64 +483,92 @@ def test_nanargmax(self): with warnings.catch_warnings(record=True): 
warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmax) - self.check_funs(nanops.nanargmax, func, - allow_str=False, allow_obj=False, - allow_date=True, allow_tdelta=True) + self.check_funs( + nanops.nanargmax, + func, + allow_str=False, + allow_obj=False, + allow_date=True, + allow_tdelta=True, + ) def test_nanargmin(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmin) - self.check_funs(nanops.nanargmin, func, allow_str=False, - allow_obj=False) + self.check_funs(nanops.nanargmin, func, allow_str=False, allow_obj=False) def _skew_kurt_wrap(self, values, axis=None, func=None): if not isinstance(values.dtype.type, np.floating): - values = values.astype('f8') + values = values.astype("f8") result = func(values, axis=axis, bias=False) # fix for handling cases where all elements in an axis are the same if isinstance(result, np.ndarray): result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0 return result elif np.max(values) == np.min(values): - return 0. + return 0.0 return result @td.skip_if_no_scipy def test_nanskew(self): from scipy.stats import skew + func = partial(self._skew_kurt_wrap, func=skew) - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nanskew, func, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nanskew, + func, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) @td.skip_if_no_scipy def test_nankurt(self): from scipy.stats import kurtosis + func1 = partial(kurtosis, fisher=True) func = partial(self._skew_kurt_wrap, func=func1) - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nankurt, func, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nankurt, + func, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nanprod(self): - self.check_funs(nanops.nanprod, np.prod, allow_str=False, - allow_date=False, allow_tdelta=False, - empty_targfunc=np.nanprod) + self.check_funs( + nanops.nanprod, + np.prod, + allow_str=False, + allow_date=False, + allow_tdelta=False, + empty_targfunc=np.nanprod, + ) def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) - res01 = checkfun(self.arr_float_2d, self.arr_float1_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) + res01 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) - res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, - **kwargs) - res11 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) + res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs) + res11 = checkfun( + self.arr_float_nan_2d, + self.arr_float1_nan_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -454,12 +576,19 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs) res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs) res22 = 
checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs) - res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, - **kwargs) - res24 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) - res25 = checkfun(self.arr_float_2d, self.arr_float1_2d, - min_periods=len(self.arr_float_2d) + 1, **kwargs) + res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs) + res24 = checkfun( + self.arr_float_nan_2d, + self.arr_nan_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) + res25 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) + 1, + **kwargs + ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) tm.assert_almost_equal(targ2, res22) @@ -469,15 +598,22 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs) - res01 = checkfun(self.arr_float_1d, self.arr_float1_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) + res01 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) - res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, - **kwargs) - res11 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) + res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs) + res11 = checkfun( + self.arr_float_nan_1d, + self.arr_float1_nan_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -485,12 +621,19 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs) res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs) res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs) - res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, - **kwargs) - res24 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) - res25 = checkfun(self.arr_float_1d, self.arr_float1_1d, - min_periods=len(self.arr_float_1d) + 1, **kwargs) + res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs) + res24 = checkfun( + self.arr_float_nan_1d, + self.arr_nan_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) + res25 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) + 1, + **kwargs + ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) tm.assert_almost_equal(targ2, res22) @@ -500,50 +643,41 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): def test_nancorr(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] - targ1 = np.corrcoef(self.arr_float_2d.flat, - self.arr_float1_2d.flat)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1) targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] - targ1 = np.corrcoef(self.arr_float_1d.flat, - self.arr_float1_1d.flat)[0, 1] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + 
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") def test_nancorr_pearson(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] - targ1 = np.corrcoef(self.arr_float_2d.flat, - self.arr_float1_2d.flat)[0, 1] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson") targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] - targ1 = np.corrcoef(self.arr_float_1d.flat, - self.arr_float1_1d.flat)[0, 1] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") @td.skip_if_no_scipy def test_nancorr_kendall(self): from scipy.stats import kendalltau + targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0] targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='kendall') + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall") targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0] targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='kendall') + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall") @td.skip_if_no_scipy def test_nancorr_spearman(self): from scipy.stats import spearmanr + targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0] targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='spearman') + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman") targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0] targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='spearman') + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") def test_nancov(self): targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] @@ -578,7 +712,7 @@ def check_nancomp(self, checkfun, targ0): res2 = checkfun(arr_float_nan, arr_nan_float1) tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) except Exception as exc: - exc.args += ('ndim: %s' % arr_float.ndim, ) + exc.args += ("ndim: %s" % arr_float.ndim,) raise try: @@ -618,7 +752,7 @@ def test_nanne(self): self.check_nancomp(nanops.nanne, targ0) def check_bool(self, func, value, correct, *args, **kwargs): - while getattr(value, 'ndim', True): + while getattr(value, "ndim", True): try: res0 = func(value, *args, **kwargs) if correct: @@ -626,9 +760,9 @@ def check_bool(self, func, value, correct, *args, **kwargs): else: assert not res0 except BaseException as exc: - exc.args += ('dim: %s' % getattr(value, 'ndim', value), ) + exc.args += ("dim: %s" % getattr(value, "ndim", value),) raise - if not hasattr(value, 'ndim'): + if not hasattr(value, "ndim"): break try: value = np.take(value, 0, axis=-1) @@ -636,46 +770,72 @@ def check_bool(self, func, value, correct, *args, **kwargs): break def test__has_infs(self): - pairs = [('arr_complex', False), ('arr_int', False), - ('arr_bool', False), ('arr_str', False), ('arr_utf', False), - ('arr_complex', False), ('arr_complex_nan', False), - ('arr_nan_nanj', False), 
('arr_nan_infj', True), - ('arr_complex_nan_infj', True)] - pairs_float = [('arr_float', False), ('arr_nan', False), - ('arr_float_nan', False), ('arr_nan_nan', False), - ('arr_float_inf', True), ('arr_inf', True), - ('arr_nan_inf', True), ('arr_float_nan_inf', True), - ('arr_nan_nan_inf', True)] + pairs = [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", False), + ("arr_nan_nanj", False), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ] + pairs_float = [ + ("arr_float", False), + ("arr_nan", False), + ("arr_float_nan", False), + ("arr_nan_nan", False), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ] for arr, correct in pairs: val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise for arr, correct in pairs_float: val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) - self.check_bool(nanops._has_infs, val.astype('f4'), correct) - self.check_bool(nanops._has_infs, val.astype('f2'), correct) + self.check_bool(nanops._has_infs, val.astype("f4"), correct) + self.check_bool(nanops._has_infs, val.astype("f2"), correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise def test__isfinite(self): - pairs = [('arr_complex', False), ('arr_int', False), - ('arr_bool', False), ('arr_str', False), ('arr_utf', False), - ('arr_complex', False), ('arr_complex_nan', True), - ('arr_nan_nanj', True), ('arr_nan_infj', True), - ('arr_complex_nan_infj', True)] - pairs_float = [('arr_float', False), ('arr_nan', True), - ('arr_float_nan', True), ('arr_nan_nan', True), - ('arr_float_inf', True), ('arr_inf', True), - ('arr_nan_inf', True), ('arr_float_nan_inf', True), - ('arr_nan_nan_inf', True)] + pairs = [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", True), + ("arr_nan_nanj", True), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ] + pairs_float = [ + ("arr_float", False), + ("arr_nan", True), + ("arr_float_nan", True), + ("arr_nan_nan", True), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ] func1 = lambda x: np.any(nanops._isfinite(x).ravel()) @@ -687,33 +847,32 @@ def test__isfinite(self): try: self.check_bool(func1, val, correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise for arr, correct in pairs_float: val = getattr(self, arr) try: self.check_bool(func1, val, correct) - self.check_bool(func1, val.astype('f4'), correct) - self.check_bool(func1, val.astype('f2'), correct) + self.check_bool(func1, val.astype("f4"), correct) + self.check_bool(func1, val.astype("f2"), correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise def test__bn_ok_dtype(self): - assert nanops._bn_ok_dtype(self.arr_float.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_complex.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_int.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_bool.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_str.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_utf.dtype, 'test') - assert not nanops._bn_ok_dtype(self.arr_date.dtype, 'test') - assert not 
nanops._bn_ok_dtype(self.arr_tdelta.dtype, 'test') - assert not nanops._bn_ok_dtype(self.arr_obj.dtype, 'test') + assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_int.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_str.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test") class TestEnsureNumeric: - def test_numeric_values(self): # Test integer assert nanops._ensure_numeric(1) == 1 @@ -734,24 +893,24 @@ def test_ndarray(self): assert np.allclose(nanops._ensure_numeric(o_values), values) # Test convertible string ndarray - s_values = np.array(['1', '2', '3'], dtype=object) + s_values = np.array(["1", "2", "3"], dtype=object) assert np.allclose(nanops._ensure_numeric(s_values), values) # Test non-convertible string ndarray - s_values = np.array(['foo', 'bar', 'baz'], dtype=object) + s_values = np.array(["foo", "bar", "baz"], dtype=object) msg = r"could not convert string to float: '(foo|baz)'" with pytest.raises(ValueError, match=msg): nanops._ensure_numeric(s_values) def test_convertable_values(self): - assert np.allclose(nanops._ensure_numeric('1'), 1.0) - assert np.allclose(nanops._ensure_numeric('1.1'), 1.1) - assert np.allclose(nanops._ensure_numeric('1+1j'), 1 + 1j) + assert np.allclose(nanops._ensure_numeric("1"), 1.0) + assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) + assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) def test_non_convertable_values(self): msg = "Could not convert foo to numeric" with pytest.raises(TypeError, match=msg): - nanops._ensure_numeric('foo') + nanops._ensure_numeric("foo") msg = "Could not convert {} to numeric" with pytest.raises(TypeError, match=msg): nanops._ensure_numeric({}) @@ -772,16 +931,14 @@ def setup_method(self, method): def test_nanvar_all_finite(self): samples = self.samples actual_variance = nanops.nanvar(samples) - tm.assert_almost_equal(actual_variance, self.variance, - check_less_precise=2) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) def test_nanvar_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_variance = nanops.nanvar(samples, skipna=True) - tm.assert_almost_equal(actual_variance, self.variance, - check_less_precise=2) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) actual_variance = nanops.nanvar(samples, skipna=False) tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2) @@ -791,12 +948,10 @@ def test_nanstd_nans(self): samples[::2] = self.samples actual_std = nanops.nanstd(samples, skipna=True) - tm.assert_almost_equal(actual_std, self.variance ** 0.5, - check_less_precise=2) + tm.assert_almost_equal(actual_std, self.variance ** 0.5, check_less_precise=2) actual_std = nanops.nanvar(samples, skipna=False) - tm.assert_almost_equal(actual_std, np.nan, - check_less_precise=2) + tm.assert_almost_equal(actual_std, np.nan, check_less_precise=2) def test_nanvar_axis(self): # Generate some sample data. 
@@ -805,8 +960,9 @@ def test_nanvar_axis(self): samples = np.vstack([samples_norm, samples_unif]) actual_variance = nanops.nanvar(samples, axis=1) - tm.assert_almost_equal(actual_variance, np.array( - [self.variance, 1.0 / 12]), check_less_precise=2) + tm.assert_almost_equal( + actual_variance, np.array([self.variance, 1.0 / 12]), check_less_precise=2 + ) def test_nanvar_ddof(self): n = 5 @@ -819,32 +975,43 @@ def test_nanvar_ddof(self): # The unbiased estimate. var = 1.0 / 12 - tm.assert_almost_equal(variance_1, var, - check_less_precise=2) + tm.assert_almost_equal(variance_1, var, check_less_precise=2) # The underestimated variance. - tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, - check_less_precise=2) + tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, check_less_precise=2) # The overestimated variance. - tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, - check_less_precise=2) + tm.assert_almost_equal( + variance_2, (n - 1.0) / (n - 2.0) * var, check_less_precise=2 + ) def test_ground_truth(self): # Test against values that were precomputed with Numpy. samples = np.empty((4, 4)) - samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287 - ], [0.72980153, 0.03109364, 0.99155171], - [0.09317602, 0.60078248, 0.15871292]]) + samples[:3, :3] = np.array( + [ + [0.97303362, 0.21869576, 0.55560287], + [0.72980153, 0.03109364, 0.99155171], + [0.09317602, 0.60078248, 0.15871292], + ] + ) samples[3] = samples[:, 3] = np.nan # Actual variances along axis=0, 1 for ddof=0, 1, 2 - variance = np.array([[[0.13762259, 0.05619224, 0.11568816 - ], [0.20643388, 0.08428837, 0.17353224], - [0.41286776, 0.16857673, 0.34706449]], - [[0.09519783, 0.16435395, 0.05082054 - ], [0.14279674, 0.24653093, 0.07623082], - [0.28559348, 0.49306186, 0.15246163]]]) + variance = np.array( + [ + [ + [0.13762259, 0.05619224, 0.11568816], + [0.20643388, 0.08428837, 0.17353224], + [0.41286776, 0.16857673, 0.34706449], + ], + [ + [0.09519783, 0.16435395, 0.05082054], + [0.14279674, 0.24653093, 0.07623082], + [0.28559348, 0.49306186, 0.15246163], + ], + ] + ) # Test nanvar. 
for axis in range(2): @@ -903,8 +1070,7 @@ def test_ground_truth(self): tm.assert_almost_equal(skew, self.actual_skew) def test_axis(self): - samples = np.vstack([self.samples, - np.nan * np.ones(len(self.samples))]) + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) skew = nanops.nanskew(samples, axis=1) tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan])) @@ -953,8 +1119,7 @@ def test_ground_truth(self): tm.assert_almost_equal(kurt, self.actual_kurt) def test_axis(self): - samples = np.vstack([self.samples, - np.nan * np.ones(len(self.samples))]) + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) kurt = nanops.nankurt(samples, axis=1) tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan])) @@ -974,12 +1139,12 @@ def prng(self): class TestDatetime64NaNOps: - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) @pytest.mark.xfail(reason="disabled") # Enabling mean changes the behavior of DataFrame.mean # See https://github.com/pandas-dev/pandas/issues/24752 def test_nanmean(self, tz): - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) expected = dti[1] for obj in [dti, DatetimeArray(dti), Series(dti)]: @@ -997,50 +1162,56 @@ def test_use_bottleneck(): if nanops._BOTTLENECK_INSTALLED: - pd.set_option('use_bottleneck', True) - assert pd.get_option('use_bottleneck') - - pd.set_option('use_bottleneck', False) - assert not pd.get_option('use_bottleneck') - - pd.set_option('use_bottleneck', use_bn) - - -@pytest.mark.parametrize("numpy_op, expected", [ - (np.sum, 10), - (np.nansum, 10), - (np.mean, 2.5), - (np.nanmean, 2.5), - (np.median, 2.5), - (np.nanmedian, 2.5), - (np.min, 1), - (np.max, 4), - (np.nanmin, 1), - (np.nanmax, 4) -]) + pd.set_option("use_bottleneck", True) + assert pd.get_option("use_bottleneck") + + pd.set_option("use_bottleneck", False) + assert not pd.get_option("use_bottleneck") + + pd.set_option("use_bottleneck", use_bn) + + +@pytest.mark.parametrize( + "numpy_op, expected", + [ + (np.sum, 10), + (np.nansum, 10), + (np.mean, 2.5), + (np.nanmean, 2.5), + (np.median, 2.5), + (np.nanmedian, 2.5), + (np.min, 1), + (np.max, 4), + (np.nanmin, 1), + (np.nanmax, 4), + ], +) def test_numpy_ops(numpy_op, expected): # GH8383 result = numpy_op(pd.Series([1, 2, 3, 4])) assert result == expected -@pytest.mark.parametrize("operation", [ - nanops.nanany, - nanops.nanall, - nanops.nansum, - nanops.nanmean, - nanops.nanmedian, - nanops.nanstd, - nanops.nanvar, - nanops.nansem, - nanops.nanargmax, - nanops.nanargmin, - nanops.nanmax, - nanops.nanmin, - nanops.nanskew, - nanops.nankurt, - nanops.nanprod, -]) +@pytest.mark.parametrize( + "operation", + [ + nanops.nanany, + nanops.nanall, + nanops.nansum, + nanops.nanmean, + nanops.nanmedian, + nanops.nanstd, + nanops.nanvar, + nanops.nansem, + nanops.nanargmax, + nanops.nanargmin, + nanops.nanmax, + nanops.nanmin, + nanops.nanskew, + nanops.nankurt, + nanops.nanprod, + ], +) def test_nanops_independent_of_mask_param(operation): # GH22764 s = pd.Series([1, 2, np.nan, 3, np.nan, 4]) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 3916bedb8e44bc..cd154ed5fe570e 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -18,16 +18,16 @@ def test_import_optional(): def test_xlrd_version_fallback(): - pytest.importorskip('xlrd') + pytest.importorskip("xlrd") 
import_optional_dependency("xlrd") def test_bad_version(): - name = 'fakemodule' + name = "fakemodule" module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - VERSIONS[name] = '1.0.0' + VERSIONS[name] = "1.0.0" match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): @@ -43,10 +43,10 @@ def test_bad_version(): def test_no_version_raises(): - name = 'fakemodule' + name = "fakemodule" module = types.ModuleType(name) sys.modules[name] = module - VERSIONS[name] = '1.0.0' + VERSIONS[name] = "1.0.0" with pytest.raises(ImportError, match="Can't determine .* fakemodule"): import_optional_dependency(name) diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index e79ec56c819c1b..97086f8ab1e85d 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -21,10 +21,9 @@ def ensure_removed(obj, attr): class MyAccessor: - def __init__(self, obj): self.obj = obj - self.item = 'item' + self.item = "item" @property def prop(self): @@ -34,30 +33,33 @@ def method(self): return self.item -@pytest.mark.parametrize('obj, registrar', [ - (pd.Series, pd.api.extensions.register_series_accessor), - (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), - (pd.Index, pd.api.extensions.register_index_accessor) -]) +@pytest.mark.parametrize( + "obj, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) def test_register(obj, registrar): - with ensure_removed(obj, 'mine'): + with ensure_removed(obj, "mine"): before = set(dir(obj)) - registrar('mine')(MyAccessor) - assert obj([]).mine.prop == 'item' + registrar("mine")(MyAccessor) + assert obj([]).mine.prop == "item" after = set(dir(obj)) - assert (before ^ after) == {'mine'} - assert 'mine' in obj._accessors + assert (before ^ after) == {"mine"} + assert "mine" in obj._accessors def test_accessor_works(): - with ensure_removed(pd.Series, 'mine'): - pd.api.extensions.register_series_accessor('mine')(MyAccessor) + with ensure_removed(pd.Series, "mine"): + pd.api.extensions.register_series_accessor("mine")(MyAccessor) s = pd.Series([1, 2]) assert s.mine.obj is s - assert s.mine.prop == 'item' - assert s.mine.method() == 'item' + assert s.mine.prop == "item" + assert s.mine.method() == "item" def test_overwrite_warns(): @@ -65,20 +67,20 @@ def test_overwrite_warns(): mean = pd.Series.mean try: with tm.assert_produces_warning(UserWarning) as w: - pd.api.extensions.register_series_accessor('mean')(MyAccessor) + pd.api.extensions.register_series_accessor("mean")(MyAccessor) s = pd.Series([1, 2]) - assert s.mean.prop == 'item' + assert s.mean.prop == "item" msg = str(w[0].message) - assert 'mean' in msg - assert 'MyAccessor' in msg - assert 'Series' in msg + assert "mean" in msg + assert "MyAccessor" in msg + assert "Series" in msg finally: pd.Series.mean = mean def test_raises_attribute_error(): - with ensure_removed(pd.Series, 'bad'): + with ensure_removed(pd.Series, "bad"): @pd.api.extensions.register_series_accessor("bad") class Bad: diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index f198fb6ae57b1e..f64ad8edafbd74 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -9,34 +9,42 @@ from pandas import DataFrame, MultiIndex, Series, array, concat, merge from pandas.core import common as com from 
pandas.core.sorting import ( - decons_group_index, get_group_index, is_int64_overflow_possible, - lexsort_indexer, nargsort, safe_sort) + decons_group_index, + get_group_index, + is_int64_overflow_possible, + lexsort_indexer, + nargsort, + safe_sort, +) from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestSorting: - @pytest.mark.slow def test_int64_overflow(self): B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) A = np.arange(2500) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': A, - 'F': B, - 'G': A, - 'H': B, - 'values': np.random.randn(2500)}) - - lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) - rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) - - left = lg.sum()['values'] - right = rg.sum()['values'] + df = DataFrame( + { + "A": A, + "B": B, + "C": A, + "D": B, + "E": A, + "F": B, + "G": A, + "H": B, + "values": np.random.randn(2500), + } + ) + + lg = df.groupby(["A", "B", "C", "D", "E", "F", "G", "H"]) + rg = df.groupby(["H", "G", "F", "E", "D", "C", "B", "A"]) + + left = lg.sum()["values"] + right = rg.sum()["values"] exp_index, _ = left.index.sortlevel() tm.assert_index_equal(left.index, exp_index) @@ -44,11 +52,10 @@ def test_int64_overflow(self): exp_index, _ = right.index.sortlevel(0) tm.assert_index_equal(right.index, exp_index) - tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' - ]].values)) + tups = list(map(tuple, df[["A", "B", "C", "D", "E", "F", "G", "H"]].values)) tups = com.asarray_tuplesafe(tups) - expected = df.groupby(tups).sum()['values'] + expected = df.groupby(tups).sum()["values"] for k, v in expected.items(): assert left[k] == right[k[::-1]] @@ -59,9 +66,8 @@ def test_int64_overflow_moar(self): # GH9096 values = range(55109) - data = DataFrame.from_dict( - {'a': values, 'b': values, 'c': values, 'd': values}) - grouped = data.groupby(['a', 'b', 'c', 'd']) + data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values}) + grouped = data.groupby(["a", "b", "c", "d"]) assert len(grouped) == len(values) arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) @@ -71,26 +77,26 @@ def test_int64_overflow_moar(self): i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows - df = DataFrame(arr, columns=list('abcde')) - df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 - gr = df.groupby(list('abcde')) + df = DataFrame(arr, columns=list("abcde")) + df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 + gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! 
assert is_int64_overflow_possible(gr.grouper.shape) # manually compute groupings jim, joe = defaultdict(list), defaultdict(list) - for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): + for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): jim[key].append(a) joe[key].append(b) assert len(gr) == len(jim) - mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) + mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype='f8') + f = lambda a: np.fromiter(map(func, a), dtype="f8") arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=['jim', 'joe'], index=mi) + res = DataFrame(arr, columns=["jim", "joe"], index=mi) return res.sort_index() assert_frame_equal(gr.mean(), aggr(np.mean)) @@ -99,22 +105,22 @@ def aggr(func): def test_lexsort_indexer(self): keys = [[nan] * 5 + list(range(100)) + [nan] * 5] # orders=True, na_position='last' - result = lexsort_indexer(keys, orders=True, na_position='last') + result = lexsort_indexer(keys, orders=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=True, na_position='first' - result = lexsort_indexer(keys, orders=True, na_position='first') + result = lexsort_indexer(keys, orders=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='last' - result = lexsort_indexer(keys, orders=False, na_position='last') + result = lexsort_indexer(keys, orders=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='first' - result = lexsort_indexer(keys, orders=False, na_position='first') + result = lexsort_indexer(keys, orders=False, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) @@ -122,7 +128,7 @@ def test_nargsort(self): # np.argsort(items) places NaNs last items = [nan] * 5 + list(range(100)) + [nan] * 5 # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype='O') + items2 = np.array(items, dtype="O") # mergesort is the most difficult to get right because we want it to be # stable. 
@@ -133,104 +139,96 @@ def test_nargsort(self): # arrays.""" # mergesort, ascending=True, na_position='last' - result = nargsort(items, kind='mergesort', ascending=True, - na_position='last') + result = nargsort(items, kind="mergesort", ascending=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' - result = nargsort(items, kind='mergesort', ascending=True, - na_position='first') + result = nargsort(items, kind="mergesort", ascending=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' - result = nargsort(items, kind='mergesort', ascending=False, - na_position='last') + result = nargsort(items, kind="mergesort", ascending=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' - result = nargsort(items, kind='mergesort', ascending=False, - na_position='first') + result = nargsort(items, kind="mergesort", ascending=False, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='last' - result = nargsort(items2, kind='mergesort', ascending=True, - na_position='last') + result = nargsort(items2, kind="mergesort", ascending=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' - result = nargsort(items2, kind='mergesort', ascending=True, - na_position='first') + result = nargsort(items2, kind="mergesort", ascending=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' - result = nargsort(items2, kind='mergesort', ascending=False, - na_position='last') + result = nargsort(items2, kind="mergesort", ascending=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' - result = nargsort(items2, kind='mergesort', ascending=False, - na_position='first') + result = nargsort( + items2, kind="mergesort", ascending=False, na_position="first" + ) exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) class TestMerge: - @pytest.mark.slow def test_int64_overflow_issues(self): # #2690, combinatorial explosion - df1 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G1']) - df2 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G2']) + df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) + df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) # it works! 
- result = merge(df1, df2, how='outer') + result = merge(df1, df2, how="outer") assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - left['left'] = left.sum(axis=1) + left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) + left["left"] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ['right'] + right.columns = right.columns[:-1].tolist() + ["right"] right.index = np.arange(len(right)) - right['right'] *= -1 + right["right"] *= -1 - out = merge(left, right, how='outer') + out = merge(left, right, how="outer") assert len(out) == len(left) - assert_series_equal(out['left'], - out['right'], check_names=False) + assert_series_equal(out["left"], -out["right"], check_names=False) result = out.iloc[:, :-2].sum(axis=1) - assert_series_equal(out['left'], result, check_names=False) + assert_series_equal(out["left"], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) - for how in ['left', 'right', 'outer', 'inner']: + for how in ["left", "right", "outer", "inner"]: assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order - out = merge(left, right, how='left', sort=False) + out = merge(left, right, how="left", sort=False) assert_frame_equal(left, out[left.columns.tolist()]) - out = merge(right, left, how='left', sort=False) + out = merge(right, left, how="left", sort=False) assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 - left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), - columns=list('ABCDEFG')) + left = DataFrame( + np.random.randint(low, high, (n, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values @@ -239,16 +237,17 @@ def test_int64_overflow_issues(self): # add duplicates to left frame left = concat([left, left], ignore_index=True) - right = DataFrame(np.random.randint(low, high, (n // 2, 7)) - .astype('int64'), - columns=list('ABCDEFG')) + right = DataFrame( + np.random.randint(low, high, (n // 2, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) - left['left'] = np.random.randn(len(left)) - right['right'] = np.random.randn(len(right)) + left["left"] = np.random.randn(len(left)) + right["right"] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) @@ -262,11 +261,11 @@ def test_int64_overflow_issues(self): # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) - for idx, row in left.set_index(list('ABCDEFG')).iterrows(): - ldict[idx].append(row['left']) + for idx, row in left.set_index(list("ABCDEFG")).iterrows(): + ldict[idx].append(row["left"]) - for idx, row in right.set_index(list('ABCDEFG')).iterrows(): - rdict[idx].append(row['right']) + for idx, row in right.set_index(list("ABCDEFG")).iterrows(): + rdict[idx].append(row["right"]) vals = [] for k, lval in ldict.items(): @@ -285,22 +284,25 @@ def align(df): return df def verify_order(df): - kcols = list('ABCDEFG') - assert_frame_equal(df[kcols].copy(), - 
df[kcols].sort_values(kcols, kind='mergesort')) + kcols = list("ABCDEFG") + assert_frame_equal( + df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort") + ) - out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) + out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) - jmask = {'left': out['left'].notna(), - 'right': out['right'].notna(), - 'inner': out['left'].notna() & out['right'].notna(), - 'outer': np.ones(len(out), dtype='bool')} + jmask = { + "left": out["left"].notna(), + "right": out["right"].notna(), + "inner": out["left"].notna() & out["right"].notna(), + "outer": np.ones(len(out), dtype="bool"), + } - for how in 'left', 'right', 'outer', 'inner': + for how in "left", "right", "outer", "inner": mask = jmask[how] frame = align(out[mask].copy()) - assert mask.all() ^ mask.any() or how == 'outer' + assert mask.all() ^ mask.any() or how == "outer" for sort in [False, True]: res = merge(left, right, how=how, sort=sort) @@ -308,12 +310,12 @@ def verify_order(df): verify_order(res) # as in GH9092 dtypes break with outer/right join - assert_frame_equal(frame, align(res), - check_dtype=how not in ('right', 'outer')) + assert_frame_equal( + frame, align(res), check_dtype=how not in ("right", "outer") + ) def test_decons(): - def testit(label_list, shape): group_index = get_group_index(label_list, shape, sort=True, xnull=True) label_list2 = decons_group_index(group_index, shape) @@ -322,19 +324,22 @@ def testit(label_list, shape): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)] + label_list = [ + np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), + ] testit(label_list, shape) shape = (10000, 10000) - label_list = [np.tile(np.arange(10000, dtype=np.int64), 5), - np.tile(np.arange(10000, dtype=np.int64), 5)] + label_list = [ + np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5), + ] testit(label_list, shape) class TestSafeSort: - def test_basic_sort(self): values = [3, 1, 2, 0, 4] result = safe_sort(values) @@ -343,7 +348,7 @@ def test_basic_sort(self): values = list("baaacb") result = safe_sort(values) - expected = np.array(list("aaabbc"), dtype='object') + expected = np.array(list("aaabbc"), dtype="object") tm.assert_numpy_array_equal(result, expected) values = [] @@ -351,7 +356,7 @@ def test_basic_sort(self): expected = np.array([]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('verify', [True, False]) + @pytest.mark.parametrize("verify", [True, False]) def test_labels(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) @@ -364,8 +369,7 @@ def test_labels(self, verify): # na_sentinel labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, - verify=verify) + result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -376,78 +380,77 @@ def test_labels(self, verify): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) - 
@pytest.mark.parametrize('na_sentinel', [-1, 99]) + @pytest.mark.parametrize("na_sentinel", [-1, 99]) def test_labels_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort( - values, labels, na_sentinel=na_sentinel) + result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) expected_labels = np.array( - [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], - dtype=np.intp) + [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp + ) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) def test_mixed_integer(self): - values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) result = safe_sort(values) - expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) - values = np.array(['b', 1, 0, 'a'], dtype=object) + values = np.array(["b", 1, 0, "a"], dtype=object) labels = [0, 1, 2, 3, 0, -1, 1] result, result_labels = safe_sort(values, labels) - expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected = np.array([0, 1, "a", "b"], dtype=object) expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) def test_mixed_integer_from_list(self): - values = ['b', 1, 0, 'a', 0, 'b'] + values = ["b", 1, 0, "a", 0, "b"] result = safe_sort(values) - expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) with pytest.raises(TypeError, match=msg): safe_sort(arr) def test_exceptions(self): - with pytest.raises(TypeError, - match="Only list-like objects are allowed"): + with pytest.raises(TypeError, match="Only list-like objects are allowed"): safe_sort(values=1) - with pytest.raises(TypeError, - match="Only list-like objects or None"): + with pytest.raises(TypeError, match="Only list-like objects or None"): safe_sort(values=[0, 1, 2], labels=1) - with pytest.raises(ValueError, - match="values should be unique"): + with pytest.raises(ValueError, match="values should be unique"): safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') - a = array([1, 3, 2], dtype='Int64') + a = array([1, 3, 2], dtype="Int64") result = safe_sort(a) # expected = array([1, 2, 3, np.nan], dtype='Int64') - expected = array([1, 2, 3], dtype='Int64') + expected = array([1, 2, 3], dtype="Int64") tm.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('verify', [True, False]) - @pytest.mark.parametrize('na_sentinel', [-1, 99]) + @pytest.mark.parametrize("verify", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, 99]) def test_extension_array_labels(self, verify, na_sentinel): - a = array([1, 3, 2], dtype='Int64') - result, labels = safe_sort(a, 
[0, 1, na_sentinel, 2], - na_sentinel=na_sentinel, verify=verify) - expected_values = array([1, 2, 3], dtype='Int64') + a = array([1, 3, 2], dtype="Int64") + result, labels = safe_sort( + a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify + ) + expected_values = array([1, 2, 3], dtype="Int64") expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(labels, expected_labels) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 955554f60aa1f2..d70614fcd2700d 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -20,64 +20,89 @@ def assert_series_or_index_equal(left, right): _any_string_method = [ - ('cat', (), {'sep': ','}), # noqa: E241 - ('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241 - 'join': 'left'}), - ('center', (10,), {}), # noqa: E241 - ('contains', ('a',), {}), # noqa: E241 - ('count', ('a',), {}), # noqa: E241 - ('decode', ('UTF-8',), {}), # noqa: E241 - ('encode', ('UTF-8',), {}), # noqa: E241 - ('endswith', ('a',), {}), # noqa: E241 - ('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241 - ('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241 - ('extractall', ('([a-z]*)',), {}), # noqa: E241 - ('find', ('a',), {}), # noqa: E241 - ('findall', ('a',), {}), # noqa: E241 - ('get', (0,), {}), # noqa: E241 + ("cat", (), {"sep": ","}), # noqa: E241 + ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), # noqa: E241 + ("center", (10,), {}), # noqa: E241 + ("contains", ("a",), {}), # noqa: E241 + ("count", ("a",), {}), # noqa: E241 + ("decode", ("UTF-8",), {}), # noqa: E241 + ("encode", ("UTF-8",), {}), # noqa: E241 + ("endswith", ("a",), {}), # noqa: E241 + ("extract", ("([a-z]*)",), {"expand": False}), # noqa: E241 + ("extract", ("([a-z]*)",), {"expand": True}), # noqa: E241 + ("extractall", ("([a-z]*)",), {}), # noqa: E241 + ("find", ("a",), {}), # noqa: E241 + ("findall", ("a",), {}), # noqa: E241 + ("get", (0,), {}), # noqa: E241 # because "index" (and "rindex") fail intentionally # if the string is not found, search only for empty string - ('index', ('',), {}), # noqa: E241 - ('join', (',',), {}), # noqa: E241 - ('ljust', (10,), {}), # noqa: E241 - ('match', ('a',), {}), # noqa: E241 - ('normalize', ('NFC',), {}), # noqa: E241 - ('pad', (10,), {}), # noqa: E241 - ('partition', (' ',), {'expand': False}), # noqa: E241 - ('partition', (' ',), {'expand': True}), # noqa: E241 - ('repeat', (3,), {}), # noqa: E241 - ('replace', ('a', 'z',), {}), # noqa: E241 - ('rfind', ('a',), {}), # noqa: E241 - ('rindex', ('',), {}), # noqa: E241 - ('rjust', (10,), {}), # noqa: E241 - ('rpartition', (' ',), {'expand': False}), # noqa: E241 - ('rpartition', (' ',), {'expand': True}), # noqa: E241 - ('slice', (0, 1,), {}), # noqa: E241 - ('slice_replace', (0, 1, 'z',), {}), # noqa: E241 - ('split', (' ',), {'expand': False}), # noqa: E241 - ('split', (' ',), {'expand': True}), # noqa: E241 - ('startswith', ('a',), {}), # noqa: E241 + ("index", ("",), {}), # noqa: E241 + ("join", (",",), {}), # noqa: E241 + ("ljust", (10,), {}), # noqa: E241 + ("match", ("a",), {}), # noqa: E241 + ("normalize", ("NFC",), {}), # noqa: E241 + ("pad", (10,), {}), # noqa: E241 + ("partition", (" ",), {"expand": False}), # noqa: E241 + ("partition", (" ",), {"expand": True}), # noqa: E241 + ("repeat", (3,), {}), # noqa: E241 + ("replace", ("a", "z"), {}), # noqa: E241 + ("rfind", ("a",), {}), # noqa: E241 + ("rindex", ("",), {}), # noqa: 
E241 + ("rjust", (10,), {}), # noqa: E241 + ("rpartition", (" ",), {"expand": False}), # noqa: E241 + ("rpartition", (" ",), {"expand": True}), # noqa: E241 + ("slice", (0, 1), {}), # noqa: E241 + ("slice_replace", (0, 1, "z"), {}), # noqa: E241 + ("split", (" ",), {"expand": False}), # noqa: E241 + ("split", (" ",), {"expand": True}), # noqa: E241 + ("startswith", ("a",), {}), # noqa: E241 # translating unicode points of "a" to "d" - ('translate', ({97: 100},), {}), # noqa: E241 - ('wrap', (2,), {}), # noqa: E241 - ('zfill', (10,), {}) # noqa: E241 -] + list(zip([ - # methods without positional arguments: zip with empty tuple and empty dict - 'capitalize', 'cat', 'get_dummies', - 'isalnum', 'isalpha', 'isdecimal', - 'isdigit', 'islower', 'isnumeric', - 'isspace', 'istitle', 'isupper', - 'len', 'lower', 'lstrip', 'partition', - 'rpartition', 'rsplit', 'rstrip', - 'slice', 'slice_replace', 'split', - 'strip', 'swapcase', 'title', 'upper', 'casefold' -], [()] * 100, [{}] * 100)) + ("translate", ({97: 100},), {}), # noqa: E241 + ("wrap", (2,), {}), # noqa: E241 + ("zfill", (10,), {}), # noqa: E241 +] + list( + zip( + [ + # methods without positional arguments: zip with empty tuple and empty dict + "capitalize", + "cat", + "get_dummies", + "isalnum", + "isalpha", + "isdecimal", + "isdigit", + "islower", + "isnumeric", + "isspace", + "istitle", + "isupper", + "len", + "lower", + "lstrip", + "partition", + "rpartition", + "rsplit", + "rstrip", + "slice", + "slice_replace", + "split", + "strip", + "swapcase", + "title", + "upper", + "casefold", + ], + [()] * 100, + [{}] * 100, + ) +) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id # test that the above list captures all methods of StringMethods -missing_methods = {f for f in dir(strings.StringMethods) - if not f.startswith('_')} - set(ids) +missing_methods = { + f for f in dir(strings.StringMethods) if not f.startswith("_") +} - set(ids) assert not missing_methods @@ -113,11 +138,11 @@ def any_string_method(request): # subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ - ('string', ['a', np.nan, 'c']), - ('bytes', [b'a', np.nan, b'c']), - ('empty', [np.nan, np.nan, np.nan]), - ('empty', []), - ('mixed-integer', ['a', np.nan, 2]) + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), ] ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id @@ -162,23 +187,24 @@ def any_allowed_skipna_inferred_dtype(request): class TestStringMethods: - def test_api(self): # GH 6106, GH 9322 assert Series.str is strings.StringMethods - assert isinstance(Series(['']).str, strings.StringMethods) + assert isinstance(Series([""]).str, strings.StringMethods) def test_api_mi_raises(self): # GH 23679 - mi = MultiIndex.from_arrays([['a', 'b', 'c']]) - with pytest.raises(AttributeError, match='Can only use .str accessor ' - 'with Index, not MultiIndex'): + mi = MultiIndex.from_arrays([["a", "b", "c"]]) + with pytest.raises( + AttributeError, + match="Can only use .str accessor " "with Index, not MultiIndex", + ): mi.str - assert not hasattr(mi, 'str') + assert not hasattr(mi, "str") - @pytest.mark.parametrize('dtype', [object, 'category']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("dtype", [object, "category"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): # one instance of 
parametrized fixture inferred_dtype, values = any_skipna_inferred_dtype @@ -186,28 +212,38 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): t = box(values, dtype=dtype) # explicit dtype to avoid casting # TODO: get rid of these xfails - if dtype == 'category' and inferred_dtype in ['period', 'interval']: - pytest.xfail(reason='Conversion to numpy array fails because ' - 'the ._values-attribute is not a numpy array for ' - 'PeriodArray/IntervalArray; see GH 23553') - - types_passing_constructor = ['string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer'] + if dtype == "category" and inferred_dtype in ["period", "interval"]: + pytest.xfail( + reason="Conversion to numpy array fails because " + "the ._values-attribute is not a numpy array for " + "PeriodArray/IntervalArray; see GH 23553" + ) + + types_passing_constructor = [ + "string", + "unicode", + "empty", + "bytes", + "mixed", + "mixed-integer", + ] if inferred_dtype in types_passing_constructor: # GH 6106 assert isinstance(t.str, strings.StringMethods) else: # GH 9184, GH 23011, GH 23163 - with pytest.raises(AttributeError, match='Can only use .str ' - 'accessor with string values.*'): + with pytest.raises( + AttributeError, + match="Can only use .str " "accessor with string values.*", + ): t.str - assert not hasattr(t, 'str') + assert not hasattr(t, "str") - @pytest.mark.parametrize('dtype', [object, 'category']) - @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_method(self, box, dtype, - any_allowed_skipna_inferred_dtype, - any_string_method): + @pytest.mark.parametrize("dtype", [object, "category"]) + @pytest.mark.parametrize("box", [Series, Index]) + def test_api_per_method( + self, box, dtype, any_allowed_skipna_inferred_dtype, any_string_method + ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others @@ -217,47 +253,62 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name in ['partition', 'rpartition'] and box == Index - and inferred_dtype == 'empty'): - pytest.xfail(reason='Method cannot deal with empty Index') - if (method_name == 'split' and box == Index and values.size == 0 - and kwargs.get('expand', None) is not None): - pytest.xfail(reason='Split fails on empty Series when expand=True') - if (method_name == 'get_dummies' and box == Index - and inferred_dtype == 'empty' and (dtype == object - or values.size == 0)): - pytest.xfail(reason='Need to fortify get_dummies corner cases') + if ( + method_name in ["partition", "rpartition"] + and box == Index + and inferred_dtype == "empty" + ): + pytest.xfail(reason="Method cannot deal with empty Index") + if ( + method_name == "split" + and box == Index + and values.size == 0 + and kwargs.get("expand", None) is not None + ): + pytest.xfail(reason="Split fails on empty Series when expand=True") + if ( + method_name == "get_dummies" + and box == Index + and inferred_dtype == "empty" + and (dtype == object or values.size == 0) + ): + pytest.xfail(reason="Need to fortify get_dummies corner cases") t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] + bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries 
that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, # see discussion in GH 13877 - mixed_allowed = method_name not in ['cat'] + mixed_allowed = method_name not in ["cat"] - allowed_types = (['string', 'unicode', 'empty'] - + ['bytes'] * bytes_allowed - + ['mixed', 'mixed-integer'] * mixed_allowed) + allowed_types = ( + ["string", "unicode", "empty"] + + ["bytes"] * bytes_allowed + + ["mixed", "mixed-integer"] * mixed_allowed + ) if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 method(*args, **kwargs) # works! else: # GH 23011, GH 23163 - msg = ('Cannot use .str.{name} with values of inferred dtype ' - '{inferred_dtype!r}.'.format(name=method_name, - inferred_dtype=inferred_dtype)) + msg = ( + "Cannot use .str.{name} with values of inferred dtype " + "{inferred_dtype!r}.".format( + name=method_name, inferred_dtype=inferred_dtype + ) + ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) def test_api_for_categorical(self, any_string_method): # https://github.com/pandas-dev/pandas/issues/10661 - s = Series(list('aabb')) + s = Series(list("aabb")) s = s + " " + s - c = s.astype('category') + c = s.astype("category") assert isinstance(c.str, strings.StringMethods) method_name, args, kwargs = any_string_method @@ -275,7 +326,7 @@ def test_api_for_categorical(self, any_string_method): def test_iter(self): # GH3638 - strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' + strs = "google", "wikimedia", "wikipedia", "wikitravel" ds = Series(strs) for s in ds.str: @@ -293,7 +344,7 @@ def test_iter(self): # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in # this case since 'wikitravel' is the longest string - assert s.dropna().values.item() == 'l' + assert s.dropna().values.item() == "l" def test_iter_empty(self): ds = Series([], dtype=object) @@ -309,7 +360,7 @@ def test_iter_empty(self): assert s == 1 def test_iter_single_element(self): - ds = Series(['a']) + ds = Series(["a"]) for i, s in enumerate(ds.str): pass @@ -318,62 +369,61 @@ def test_iter_single_element(self): assert_series_equal(ds, s) def test_iter_object_try_string(self): - ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range( - 4)]) + ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)]) - i, s = 100, 'h' + i, s = 100, "h" for i, s in enumerate(ds.str): pass assert i == 100 - assert s == 'h' + assert s == "h" - @pytest.mark.parametrize('box', [Series, Index]) - @pytest.mark.parametrize('other', [None, Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize("other", [None, Series, Index]) def test_str_cat_name(self, box, other): # GH 21053 - values = ['a', 'b'] + values = ["a", "b"] if other: other = other(values) else: other = values - result = box(values, name='name').str.cat(other, sep=',', join='left') - assert result.name == 'name' + result = box(values, name="name").str.cat(other, sep=",", join="left") + assert result.name == "name" - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat(self, box): # test_cat above tests "str_cat" from ndarray; # here testing "str.cat" from Series/Indext to ndarray/list - s = box(['a', 'a', 'b', 'b', 'c', np.nan]) + s = box(["a", "a", "b", "b", "c", np.nan]) # single array result = s.str.cat() - expected = 'aabbc' + expected = "aabbc" assert result == expected - result = s.str.cat(na_rep='-') - expected = 'aabbc-' 
+ result = s.str.cat(na_rep="-") + expected = "aabbc-" assert result == expected - result = s.str.cat(sep='_', na_rep='NA') - expected = 'a_a_b_b_c_NA' + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" assert result == expected - t = np.array(['a', np.nan, 'b', 'd', 'foo', np.nan], dtype=object) - expected = box(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) # Series/Index with array - result = s.str.cat(t, na_rep='-') + result = s.str.cat(t, na_rep="-") assert_series_or_index_equal(result, expected) # Series/Index with list - result = s.str.cat(list(t), na_rep='-') + result = s.str.cat(list(t), na_rep="-") assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = 'All arrays must be same length, except those having an index.*' - z = Series(['1', '2', '3']) + rgx = "All arrays must be same length, except those having an index.*" + z = Series(["1", "2", "3"]) with pytest.raises(ValueError, match=rgx): s.str.cat(z) @@ -384,26 +434,26 @@ def test_str_cat(self, box): with pytest.raises(ValueError, match=rgx): s.str.cat(list(z)) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_raises_intuitive_error(self, box): # GH 11334 - s = box(['a', 'b', 'c', 'd']) + s = box(["a", "b", "c", "d"]) message = "Did you mean to supply a `sep` keyword?" with pytest.raises(ValueError, match=message): - s.str.cat('|') + s.str.cat("|") with pytest.raises(ValueError, match=message): - s.str.cat(' ') + s.str.cat(" ") - @pytest.mark.parametrize('sep', ['', None]) - @pytest.mark.parametrize('dtype_target', ['object', 'category']) - @pytest.mark.parametrize('dtype_caller', ['object', 'category']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("sep", ["", None]) + @pytest.mark.parametrize("dtype_target", ["object", "category"]) + @pytest.mark.parametrize("dtype_caller", ["object", "category"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): - s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller) + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) s = s if box == Index else Series(s, index=s) - t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) - expected = Index(['ab', 'aa', 'bb', 'ac']) + expected = Index(["ab", "aa", "bb", "ac"]) expected = expected if box == Index else Series(expected, index=s) # Series/Index with unaligned Index @@ -429,33 +479,37 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): assert_series_or_index_equal(result, expected) # test integer/float dtypes (inferred by constructor) and mixed - @pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']], - ids=['integers', 'floats', 'mixed']) + @pytest.mark.parametrize( + "data", + [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]], + ids=["integers", "floats", "mixed"], + ) # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] - @pytest.mark.parametrize('box', [Series, Index, list, - lambda x: np.array(x, dtype=object)], - ids=['Series', 'Index', 'list', 'np.array']) + @pytest.mark.parametrize( + "box", + [Series, Index, list, lambda x: np.array(x, dtype=object)], + ids=["Series", "Index", "list", "np.array"], + ) def test_str_cat_wrong_dtype_raises(self, box, data): # GH 22722 - s = Series(['a', 'b', 'c']) + 
s = Series(["a", "b", "c"]) t = box(data) - msg = 'Concatenation requires list-likes containing only strings.*' + msg = "Concatenation requires list-likes containing only strings.*" with pytest.raises(TypeError, match=msg): # need to use outer and na_rep, as otherwise Index would not raise - s.str.cat(t, join='outer', na_rep='-') + s.str.cat(t, join="outer", na_rep="-") - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_mixed_inputs(self, box): - s = Index(['a', 'b', 'c', 'd']) + s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) - t = Series(['A', 'B', 'C', 'D'], index=s.values) + t = Series(["A", "B", "C", "D"], index=s.values) d = concat([t, Series(s, index=s)], axis=1) - expected = Index(['aAa', 'bBb', 'cCc', 'dDd']) - expected = expected if box == Index else Series(expected.values, - index=s.values) + expected = Index(["aAa", "bBb", "cCc", "dDd"]) + expected = expected if box == Index else Series(expected.values, index=s.values) # Series/Index with DataFrame result = s.str.cat(d) @@ -480,7 +534,7 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # Series/Index with list of Series; different indexes - t.index = ['b', 'c', 'd', 'a'] + t.index = ["b", "c", "d", "a"] with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default result = s.str.cat([t, s]) @@ -493,7 +547,7 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # Series/Index with DataFrame; different indexes - d.index = ['b', 'c', 'd', 'a'] + d.index = ["b", "c", "d", "a"] with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default result = s.str.cat(d) @@ -506,8 +560,8 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = 'All arrays must be same length, except those having an index.*' - z = Series(['1', '2', '3']) + rgx = "All arrays must be same length, except those having an index.*" + z = Series(["1", "2", "3"]) e = concat([z, z], axis=1) # DataFrame @@ -531,13 +585,13 @@ def test_str_cat_mixed_inputs(self, box): s.str.cat([z.values, s]) # errors for incorrect arguments in list-like - rgx = 'others must be Series, Index, DataFrame,.*' + rgx = "others must be Series, Index, DataFrame,.*" # make sure None/NaN do not crash checks in _get_series_list - u = Series(['a', np.nan, 'c', None]) + u = Series(["a", np.nan, "c", None]) # mix of string and Series with pytest.raises(TypeError, match=rgx): - s.str.cat([u, 'u']) + s.str.cat([u, "u"]) # DataFrame in list with pytest.raises(TypeError, match=rgx): @@ -565,59 +619,59 @@ def test_str_cat_mixed_inputs(self, box): with pytest.raises(TypeError, match=rgx): s.str.cat(1) - @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_align_indexed(self, box, join): # https://github.com/pandas-dev/pandas/issues/18657 - s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) - t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) + s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) + t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) sa, ta = s.align(t, join=join) # result after manual alignment of 
inputs - expected = sa.str.cat(ta, na_rep='-') + expected = sa.str.cat(ta, na_rep="-") if box == Index: s = Index(s) sa = Index(sa) expected = Index(expected) - result = s.str.cat(t, join=join, na_rep='-') + result = s.str.cat(t, join=join, na_rep="-") assert_series_or_index_equal(result, expected) - @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) def test_str_cat_align_mixed_inputs(self, join): - s = Series(['a', 'b', 'c', 'd']) - t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) d = concat([t, t], axis=1) - expected_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee']) + expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) expected = expected_outer.loc[s.index.join(t.index, how=join)] # list of Series - result = s.str.cat([t, t], join=join, na_rep='-') + result = s.str.cat([t, t], join=join, na_rep="-") tm.assert_series_equal(result, expected) # DataFrame - result = s.str.cat(d, join=join, na_rep='-') + result = s.str.cat(d, join=join, na_rep="-") tm.assert_series_equal(result, expected) # mixed list of indexed/unindexed - u = np.array(['A', 'B', 'C', 'D']) - expected_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) + u = np.array(["A", "B", "C", "D"]) + expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) # joint index of rhs [t, u]; u will be forced have index of s - rhs_idx = t.index & s.index if join == 'inner' else t.index | s.index + rhs_idx = t.index & s.index if join == "inner" else t.index | s.index expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] - result = s.str.cat([t, u], join=join, na_rep='-') + result = s.str.cat([t, u], join=join, na_rep="-") tm.assert_series_equal(result, expected) with tm.assert_produces_warning(expected_warning=FutureWarning): # nested list-likes will be deprecated - result = s.str.cat([t, list(u)], join=join, na_rep='-') + result = s.str.cat([t, list(u)], join=join, na_rep="-") tm.assert_series_equal(result, expected) # errors for incorrect lengths - rgx = r'If `others` contains arrays or lists \(or other list-likes.*' - z = Series(['1', '2', '3']).values + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]).values # unindexed object of wrong length with pytest.raises(ValueError, match=rgx): @@ -627,13 +681,13 @@ def test_str_cat_align_mixed_inputs(self, join): with pytest.raises(ValueError, match=rgx): s.str.cat([t, z], join=join) - @pytest.mark.parametrize('box', [Series, Index]) - @pytest.mark.parametrize('other', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize("other", [Series, Index]) def test_str_cat_all_na(self, box, other): # GH 24044 # check that all NaNs in caller / target work - s = Index(['a', 'b', 'c', 'd']) + s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) t = other([np.nan] * 4, dtype=object) # add index of s for alignment @@ -644,74 +698,77 @@ def test_str_cat_all_na(self, box, other): expected = Series([np.nan] * 4, index=s.index, dtype=object) else: # box == Index expected = Index([np.nan] * 4, dtype=object) - result = s.str.cat(t, join='left') + result = s.str.cat(t, join="left") assert_series_or_index_equal(result, expected) # all-NA caller (only for Series) if other == Series: expected = Series([np.nan] * 4, dtype=object, index=t.index) - result = t.str.cat(s, join='left') + result = 
t.str.cat(s, join="left") tm.assert_series_equal(result, expected) def test_str_cat_special_cases(self): - s = Series(['a', 'b', 'c', 'd']) - t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) # iterator of elements with different types - expected = Series(['aaa', 'bbb', 'c-c', 'ddd', '-e-']) - result = s.str.cat(iter([t, s.values]), join='outer', na_rep='-') + expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"]) + result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-") tm.assert_series_equal(result, expected) # right-align with different indexes in others - expected = Series(['aa-', 'd-d'], index=[0, 3]) - result = s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-') + expected = Series(["aa-", "d-d"], index=[0, 3]) + result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-") tm.assert_series_equal(result, expected) def test_cat_on_filtered_index(self): - df = DataFrame(index=MultiIndex.from_product( - [[2011, 2012], [1, 2, 3]], names=['year', 'month'])) + df = DataFrame( + index=MultiIndex.from_product( + [[2011, 2012], [1, 2, 3]], names=["year", "month"] + ) + ) df = df.reset_index() df = df[df.month > 1] - str_year = df.year.astype('str') - str_month = df.month.astype('str') - str_both = str_year.str.cat(str_month, sep=' ') + str_year = df.year.astype("str") + str_month = df.month.astype("str") + str_both = str_year.str.cat(str_month, sep=" ") - assert str_both.loc[1] == '2011 2' + assert str_both.loc[1] == "2011 2" - str_multiple = str_year.str.cat([str_month, str_month], sep=' ') + str_multiple = str_year.str.cat([str_month, str_month], sep=" ") - assert str_multiple.loc[1] == '2011 2 2' + assert str_multiple.loc[1] == "2011 2 2" def test_count(self): - values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], - dtype=np.object_) + values = np.array(["foo", "foofoo", NA, "foooofooofommmfoo"], dtype=np.object_) - result = strings.str_count(values, 'f[o]+') + result = strings.str_count(values, "f[o]+") exp = np.array([1, 2, NA, 4]) tm.assert_numpy_array_equal(result, exp) - result = Series(values).str.count('f[o]+') + result = Series(values).str.count("f[o]+") exp = Series([1, 2, NA, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
- rs = strings.str_count(mixed, 'a') + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_count(mixed, "a") xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA]) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.count('a') + rs = Series(mixed).str.count("a") xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_contains(self): - values = np.array(['foo', NA, 'fooommm__foo', - 'mmm_', 'foommm[_]+bar'], dtype=np.object_) - pat = 'mmm[_]+' + values = np.array( + ["foo", NA, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ + ) + pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, NA, True, True, False], dtype=np.object_) @@ -721,39 +778,37 @@ def test_contains(self): expected = np.array([False, NA, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + values = ["foo", "xyz", "fooommm__foo", "mmm_"] result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] - result = strings.str_contains(values, 'FOO|mmm', case=False) + values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) # case insensitive without regex - result = strings.str_contains(values, 'foo', regex=False, case=False) + result = strings.str_contains(values, "foo", regex=False, case=False) expected = np.array([True, False, True, False]) tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
- rs = strings.str_contains(mixed, 'o') - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], - dtype=np.object_) + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_contains(mixed, "o") + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.contains('o') + rs = Series(mixed).str.contains("o") xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode - values = np.array(['foo', NA, 'fooommm__foo', 'mmm_'], - dtype=np.object_) - pat = 'mmm[_]+' + values = np.array(["foo", NA, "fooommm__foo", "mmm_"], dtype=np.object_) + pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, np.nan, True, True], dtype=np.object_) @@ -763,8 +818,7 @@ def test_contains(self): expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) - values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'], - dtype=np.object_) + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ @@ -775,65 +829,65 @@ def test_contains_for_object_category(self): # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains('a', na=True) + result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains('a', na=True) + result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) def test_startswith(self): - values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"]) - result = values.str.startswith('foo') + result = values.str.startswith("foo") exp = Series([False, NA, True, False, False, NA, True]) tm.assert_series_equal(result, exp) - result = values.str.startswith('foo', na=True) + result = values.str.startswith("foo", na=True) tm.assert_series_equal(result, exp.fillna(True).astype(bool)) # mixed - mixed = np.array(['a', NA, 'b', True, datetime.today(), - 'foo', None, 1, 2.], dtype=np.object_) - rs = strings.str_startswith(mixed, 'f') - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], - dtype=np.object_) + mixed = np.array( + ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=np.object_, + ) + rs = strings.str_startswith(mixed, "f") + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.startswith('f') + rs = Series(mixed).str.startswith("f") assert isinstance(rs, Series) xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assert_series_equal(rs, xp) def test_endswith(self): - values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + values = Series(["om", NA, "foo_nom", 
"nom", "bar_foo", NA, "foo"]) - result = values.str.endswith('foo') + result = values.str.endswith("foo") exp = Series([False, NA, False, False, True, NA, True]) tm.assert_series_equal(result, exp) - result = values.str.endswith('foo', na=False) + result = values.str.endswith("foo", na=False) tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] - rs = strings.str_endswith(mixed, 'f') - xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], - dtype=np.object_) + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_endswith(mixed, "f") + xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.endswith('f') + rs = Series(mixed).str.endswith("f") xp = Series([False, NA, False, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) @@ -846,28 +900,26 @@ def test_title(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) mixed = mixed.str.title() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) def test_lower_upper(self): - values = Series(['om', NA, 'nom', 'nom']) + values = Series(["om", NA, "nom", "nom"]) result = values.str.upper() - exp = Series(['OM', NA, 'NOM', 'NOM']) + exp = Series(["OM", NA, "NOM", "NOM"]) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, - 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) mixed = mixed.str.upper() rs = Series(mixed).str.lower() - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) @@ -878,8 +930,7 @@ def test_capitalize(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) mixed = mixed.str.capitalize() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) @@ -891,14 +942,13 @@ def test_swapcase(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) mixed = mixed.str.swapcase() exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) def test_casemethods(self): - values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE'] + values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"] s = Series(values) assert s.str.lower().tolist() == [v.lower() for v in values] assert s.str.upper().tolist() == [v.upper() for v in values] @@ -907,22 +957,23 @@ def test_casemethods(self): assert s.str.swapcase().tolist() == [v.swapcase() for v in values] def test_replace(self): - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) - result = values.str.replace('BAD[_]*', '') - exp = Series(['foobar', NA]) + result = values.str.replace("BAD[_]*", "") + exp = Series(["foobar", NA]) tm.assert_series_equal(result, exp) - result = 
values.str.replace('BAD[_]*', '', n=1) - exp = Series(['foobarBAD', NA]) + result = values.str.replace("BAD[_]*", "", n=1) + exp = Series(["foobarBAD", NA]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', - None, 1, 2.]) + mixed = Series( + ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) - rs = Series(mixed).str.replace('BAD[_]*', '') - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.replace("BAD[_]*", "") + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -935,66 +986,69 @@ def test_replace(self): # GH 13438 msg = "repl must be a string or callable" for klass in (Series, Index): - for repl in (None, 3, {'a': 'b'}): - for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']): + for repl in (None, 3, {"a": "b"}): + for data in (["a", "b", None], ["a", "b", "c", "ad"]): values = klass(data) with pytest.raises(TypeError, match=msg): - values.str.replace('a', repl) + values.str.replace("a", repl) def test_replace_callable(self): # GH 15055 - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) # test with callable repl = lambda m: m.group(0).swapcase() - result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) - exp = Series(['foObaD__baRbaD', NA]) + result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) + exp = Series(["foObaD__baRbaD", NA]) tm.assert_series_equal(result, exp) # test with wrong number of arguments, raising an error - p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' - r'(?(3)required )positional arguments?') + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) repl = lambda: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) repl = lambda m, x: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) repl = lambda m, x, y=None: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) # test regex named groups - values = Series(['Foo Bar Baz', NA]) + values = Series(["Foo Bar Baz", NA]) pat = r"(?P\w+) (?P\w+) (?P\w+)" - repl = lambda m: m.group('middle').swapcase() + repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl) - exp = Series(['bAR', NA]) + exp = Series(["bAR", NA]) tm.assert_series_equal(result, exp) def test_replace_compiled_regex(self): # GH 15446 - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) # test with compiled regex - pat = re.compile(r'BAD[_]*') - result = values.str.replace(pat, '') - exp = Series(['foobar', NA]) + pat = re.compile(r"BAD[_]*") + result = values.str.replace(pat, "") + exp = Series(["foobar", NA]) tm.assert_series_equal(result, exp) - result = values.str.replace(pat, '', n=1) - exp = Series(['foobarBAD', NA]) + result = values.str.replace(pat, "", n=1) + exp = Series(["foobarBAD", NA]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', - None, 1, 2.]) + mixed = Series( + ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) - rs = Series(mixed).str.replace(pat, '') - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.replace(pat, "") + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) 
tm.assert_almost_equal(rs, xp) @@ -1007,410 +1061,399 @@ def test_replace_compiled_regex(self): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(['fooBAD__barBAD__bad', NA]) - pat = re.compile(r'BAD[_]*') + values = Series(["fooBAD__barBAD__bad", NA]) + pat = re.compile(r"BAD[_]*") - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', flags=re.IGNORECASE) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", flags=re.IGNORECASE) - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', case=False) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=False) - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', case=True) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=True) # test with callable - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) repl = lambda m: m.group(0).swapcase() - pat = re.compile('[a-z][A-Z]{2}') + pat = re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) - exp = Series(['foObaD__baRbaD', NA]) + exp = Series(["foObaD__baRbaD", NA]) tm.assert_series_equal(result, exp) def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) - values = Series(['f.o', 'foo', NA]) - exp = Series(['bao', 'bao', NA]) - result = values.str.replace('f.', 'ba') + values = Series(["f.o", "foo", NA]) + exp = Series(["bao", "bao", NA]) + result = values.str.replace("f.", "ba") tm.assert_series_equal(result, exp) - exp = Series(['bao', 'foo', NA]) - result = values.str.replace('f.', 'ba', regex=False) + exp = Series(["bao", "foo", NA]) + result = values.str.replace("f.", "ba", regex=False) tm.assert_series_equal(result, exp) # Cannot do a literal replace if given a callable repl or compiled # pattern callable_repl = lambda m: m.group(0).swapcase() - compiled_pat = re.compile('[a-z][A-Z]{2}') + compiled_pat = re.compile("[a-z][A-Z]{2}") msg = "Cannot use a callable replacement when regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace('abc', callable_repl, regex=False) + values.str.replace("abc", callable_repl, regex=False) - msg = ("Cannot use a compiled regex as replacement pattern with" - " regex=False") + msg = "Cannot use a compiled regex as replacement pattern with" " regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace(compiled_pat, '', regex=False) + values.str.replace(compiled_pat, "", regex=False) def test_repeat(self): - values = Series(['a', 'b', NA, 'c', NA, 'd']) + values = Series(["a", "b", NA, "c", NA, "d"]) result = values.str.repeat(3) - exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd']) + exp = Series(["aaa", "bbb", NA, "ccc", NA, "ddd"]) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) - exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd']) + exp = Series(["a", "bb", NA, "cccc", NA, "dddddd"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, - 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) rs = Series(mixed).str.repeat(3) - xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA]) + xp = Series(["aaa", NA, "bbb", NA, NA, 
"foofoofoo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_match(self): # New match behavior introduced in 0.13 - values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*(BAD[_]+).*(BAD)') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result = values.str.match(".*(BAD[_]+).*(BAD)") exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) - values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*BAD[_]+.*BAD') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result = values.str.match(".*BAD[_]+.*BAD") exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) - rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) + rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") xp = Series([True, NA, True, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # na GH #6609 - res = Series(['a', 0, np.nan]).str.match('a', na=False) + res = Series(["a", 0, np.nan]).str.match("a", na=False) exp = Series([True, False, False]) assert_series_equal(exp, res) - res = Series(['a', 0, np.nan]).str.match('a') + res = Series(["a", 0, np.nan]).str.match("a") exp = Series([True, np.nan, np.nan]) assert_series_equal(exp, res) def test_extract_expand_None(self): - values = Series(['fooBAD__barBAD', NA, 'foo']) - with pytest.raises(ValueError, - match='expand must be True or False'): - values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) + values = Series(["fooBAD__barBAD", NA, "foo"]) + with pytest.raises(ValueError, match="expand must be True or False"): + values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) def test_extract_expand_unspecified(self): - values = Series(['fooBAD__barBAD', NA, 'foo']) - result_unspecified = values.str.extract('.*(BAD[_]+).*') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result_unspecified = values.str.extract(".*(BAD[_]+).*") assert isinstance(result_unspecified, DataFrame) - result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + result_true = values.str.extract(".*(BAD[_]+).*", expand=True) tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. 
- values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) er = [NA, NA] # empty row - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er, - er, er]) + rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # unicode - values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # GH9980 # Index only works with one regex group since # multi-group would expand to a frame - idx = Index(['A1', 'A2', 'A3', 'A4', 'B5']) + idx = Index(["A1", "A2", "A3", "A4", "B5"]) with pytest.raises(ValueError, match="supported"): - idx.str.extract('([AB])([123])', expand=False) + idx.str.extract("([AB])([123])", expand=False) # these should work for both Series and Index for klass in [Series, Index]: # no groups - s_or_idx = klass(['A1', 'B2', 'C3']) + s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('[ABC][123]', expand=False) + s_or_idx.str.extract("[ABC][123]", expand=False) # only non-capturing groups with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('(?:[AB]).*', expand=False) + s_or_idx.str.extract("(?:[AB]).*", expand=False) # single group renames series/index properly - s_or_idx = klass(['A1', 'A2']) - result = s_or_idx.str.extract(r'(?PA)\d', expand=False) - assert result.name == 'uno' + s_or_idx = klass(["A1", "A2"]) + result = s_or_idx.str.extract(r"(?PA)\d", expand=False) + assert result.name == "uno" - exp = klass(['A', 'A'], name='uno') + exp = klass(["A", "A"], name="uno") if klass == Series: tm.assert_series_equal(result, exp) else: tm.assert_index_equal(result, exp) - s = Series(['A1', 'B2', 'C3']) + s = Series(["A1", "B2", "C3"]) # one group, no matches - result = s.str.extract('(_)', expand=False) + result = s.str.extract("(_)", expand=False) exp = Series([NA, NA, NA], dtype=object) tm.assert_series_equal(result, exp) # two groups, no matches - result = s.str.extract('(_)(_)', expand=False) + result = s.str.extract("(_)(_)", expand=False) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches - result = s.str.extract('([AB])[123]', expand=False) - exp = Series(['A', 'B', NA]) + result = s.str.extract("([AB])[123]", expand=False) + exp = Series(["A", "B", NA]) tm.assert_series_equal(result, exp) # two groups, some matches - result = s.str.extract('([AB])([123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = s.str.extract("([AB])([123])", expand=False) + exp = DataFrame([["A", "1"], ["B", "2"], 
[NA, NA]]) tm.assert_frame_equal(result, exp) # one named group - result = s.str.extract('(?P<letter>[AB])', expand=False) - exp = Series(['A', 'B', NA], name='letter') + result = s.str.extract("(?P<letter>[AB])", expand=False) + exp = Series(["A", "B", NA], name="letter") tm.assert_series_equal(result, exp) # two named groups - result = s.str.extract('(?P<letter>[AB])(?P<number>[123])', - expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], - columns=['letter', 'number']) + result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False) + exp = DataFrame( + [["A", "1"], ["B", "2"], [NA, NA]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # mix named and unnamed groups - result = s.str.extract('([AB])(?P<number>[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], - columns=[0, 'number']) + result = s.str.extract("([AB])(?P<number>[123])", expand=False) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]], columns=[0, "number"]) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group - result = s.str.extract('([AB])(?:[123])', expand=False) - exp = Series(['A', 'B', NA]) + result = s.str.extract("([AB])(?:[123])", expand=False) + exp = Series(["A", "B", NA]) tm.assert_series_equal(result, exp) # two normal groups, one non-capturing group - result = Series(['A11', 'B22', 'C33']).str.extract( - '([AB])([123])(?:[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = Series(["A11", "B22", "C33"]).str.extract( + "([AB])([123])(?:[123])", expand=False + ) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group - result = Series(['A1', 'B2', '3']).str.extract( - '(?P<letter>[AB])?(?P<number>[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']], - columns=['letter', 'number']) + result = Series(["A1", "B2", "3"]).str.extract( + "(?P<letter>[AB])?(?P<number>[123])", expand=False + ) + exp = DataFrame( + [["A", "1"], ["B", "2"], [NA, "3"]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group - result = Series(['A1', 'B2', 'C']).str.extract( - '(?P<letter>[ABC])(?P<number>[123])?', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], - columns=['letter', 'number']) + result = Series(["A1", "B2", "C"]).str.extract( + "(?P<letter>[ABC])(?P<number>[123])?", expand=False + ) + exp = DataFrame( + [["A", "1"], ["B", "2"], ["C", NA]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): - data = ['A1', 'B2', 'C'] - index = index[:len(data)] + data = ["A1", "B2", "C"] + index = index[: len(data)] s = Series(data, index=index) - result = s.str.extract(r'(\d)', expand=False) - exp = Series(['1', '2', NA], index=index) + result = s.str.extract(r"(\d)", expand=False) + exp = Series(["1", "2", NA], index=index) tm.assert_series_equal(result, exp) result = Series(data, index=index).str.extract( - r'(?P<letter>\D)(?P<number>\d)?', expand=False) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number'], index=index) + r"(?P<letter>\D)(?P<number>\d)?", expand=False + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ - tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, +
tm.makePeriodIndex, + tm.makeRangeIndex, ] for index in i_funs: check_index(index()) # single_series_name_is_preserved. - s = Series(['a3', 'b3', 'c2'], name='bob') - r = s.str.extract(r'(?P[a-z])', expand=False) - e = Series(['a', 'b', 'c'], name='sue') + s = Series(["a3", "b3", "c2"], name="bob") + r = s.str.extract(r"(?P[a-z])", expand=False) + e = Series(["a", "b", "c"], name="sue") tm.assert_series_equal(r, e) assert r.name == e.name def test_extract_expand_True(self): # Contains tests like those in test_match and some others. - values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) er = [NA, NA] # empty row - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True) - exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, - er, er, er, er]) + rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True) + exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # these should work for both Series and Index for klass in [Series, Index]: # no groups - s_or_idx = klass(['A1', 'B2', 'C3']) + s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('[ABC][123]', expand=True) + s_or_idx.str.extract("[ABC][123]", expand=True) # only non-capturing groups with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('(?:[AB]).*', expand=True) + s_or_idx.str.extract("(?:[AB]).*", expand=True) # single group renames series/index properly - s_or_idx = klass(['A1', 'A2']) - result_df = s_or_idx.str.extract(r'(?PA)\d', expand=True) + s_or_idx = klass(["A1", "A2"]) + result_df = s_or_idx.str.extract(r"(?PA)\d", expand=True) assert isinstance(result_df, DataFrame) - result_series = result_df['uno'] - assert_series_equal(result_series, Series(['A', 'A'], name='uno')) + result_series = result_df["uno"] + assert_series_equal(result_series, Series(["A", "A"], name="uno")) def test_extract_series(self): # extract should give the same result whether or not the # series has a name. 
for series_name in None, "series_name": - s = Series(['A1', 'B2', 'C3'], name=series_name) + s = Series(["A1", "B2", "C3"], name=series_name) # one group, no matches - result = s.str.extract('(_)', expand=True) + result = s.str.extract("(_)", expand=True) exp = DataFrame([NA, NA, NA], dtype=object) tm.assert_frame_equal(result, exp) # two groups, no matches - result = s.str.extract('(_)(_)', expand=True) + result = s.str.extract("(_)(_)", expand=True) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches - result = s.str.extract('([AB])[123]', expand=True) - exp = DataFrame(['A', 'B', NA]) + result = s.str.extract("([AB])[123]", expand=True) + exp = DataFrame(["A", "B", NA]) tm.assert_frame_equal(result, exp) # two groups, some matches - result = s.str.extract('([AB])([123])', expand=True) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = s.str.extract("([AB])([123])", expand=True) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one named group - result = s.str.extract('(?P<letter>[AB])', expand=True) - exp = DataFrame({"letter": ['A', 'B', NA]}) + result = s.str.extract("(?P<letter>[AB])", expand=True) + exp = DataFrame({"letter": ["A", "B", NA]}) tm.assert_frame_equal(result, exp) # two named groups - result = s.str.extract( - '(?P<letter>[AB])(?P<number>[123])', - expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - [NA, NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True) + e_list = [["A", "1"], ["B", "2"], [NA, NA]] + exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # mix named and unnamed groups - result = s.str.extract('([AB])(?P<number>[123])', expand=True) - exp = DataFrame(e_list, columns=[0, 'number']) + result = s.str.extract("([AB])(?P<number>[123])", expand=True) + exp = DataFrame(e_list, columns=[0, "number"]) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group - result = s.str.extract('([AB])(?:[123])', expand=True) - exp = DataFrame(['A', 'B', NA]) + result = s.str.extract("([AB])(?:[123])", expand=True) + exp = DataFrame(["A", "B", NA]) tm.assert_frame_equal(result, exp) def test_extract_optional_groups(self): # two normal groups, one non-capturing group - result = Series(['A11', 'B22', 'C33']).str.extract( - '([AB])([123])(?:[123])', expand=True) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = Series(["A11", "B22", "C33"]).str.extract( + "([AB])([123])(?:[123])", expand=True + ) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group - result = Series(['A1', 'B2', '3']).str.extract( - '(?P<letter>[AB])?(?P<number>[123])', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - [NA, '3'] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = Series(["A1", "B2", "3"]).str.extract( + "(?P<letter>[AB])?(?P<number>[123])", expand=True + ) + e_list = [["A", "1"], ["B", "2"], [NA, "3"]] + exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group - result = Series(['A1', 'B2', 'C']).str.extract( - '(?P<letter>[ABC])(?P<number>[123])?', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = Series(["A1", "B2", "C"]).str.extract( + "(?P<letter>[ABC])(?P<number>[123])?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp =
DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): - data = ['A1', 'B2', 'C'] - index = index[:len(data)] - result = Series(data, index=index).str.extract( - r'(\d)', expand=True) - exp = DataFrame(['1', '2', NA], index=index) + data = ["A1", "B2", "C"] + index = index[: len(data)] + result = Series(data, index=index).str.extract(r"(\d)", expand=True) + exp = DataFrame(["1", "2", NA], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( - r'(?P\D)(?P\d)?', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number'], index=index) + r"(?P\D)(?P\d)?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ - tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + tm.makeRangeIndex, ] for index in i_funs: check_index(index()) @@ -1419,18 +1462,18 @@ def test_extract_single_group_returns_frame(self): # GH11386 extract should always return DataFrame, even when # there is only one group. Prior to v0.18.0, extract returned # Series when there was only one group in the regex. - s = Series(['a3', 'b3', 'c2'], name='series_name') - r = s.str.extract(r'(?P[a-z])', expand=True) - e = DataFrame({"letter": ['a', 'b', 'c']}) + s = Series(["a3", "b3", "c2"], name="series_name") + r = s.str.extract(r"(?P[a-z])", expand=True) + e = DataFrame({"letter": ["a", "b", "c"]}) tm.assert_frame_equal(r, e) def test_extractall(self): subject_list = [ - 'dave@google.com', - 'tdhock5@gmail.com', - 'maudelaperriere@gmail.com', - 'rob@gmail.com some text steve@gmail.com', - 'a@b.com some text c@d.com and e@f.com', + "dave@google.com", + "tdhock5@gmail.com", + "maudelaperriere@gmail.com", + "rob@gmail.com some text steve@gmail.com", + "a@b.com some text c@d.com and e@f.com", np.nan, "", ] @@ -1438,8 +1481,11 @@ def test_extractall(self): ("dave", "google", "com"), ("tdhock5", "gmail", "com"), ("maudelaperriere", "gmail", "com"), - ("rob", "gmail", "com"), ("steve", "gmail", "com"), - ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"), + ("rob", "gmail", "com"), + ("steve", "gmail", "com"), + ("a", "b", "com"), + ("c", "d", "com"), + ("e", "f", "com"), ] named_pattern = r""" (?P[a-z0-9]+) @@ -1452,45 +1498,42 @@ def test_extractall(self): S = Series(subject_list) # extractall should return a DataFrame with one row for each # match, indexed by the subject from which the match came. 
- expected_index = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (3, 0), - (3, 1), - (4, 0), - (4, 1), - (4, 2), - ], names=(None, "match")) - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_index = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], + names=(None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = S.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # The index of the input Series should be used to construct # the index of the output DataFrame: - series_index = MultiIndex.from_tuples([ - ("single", "Dave"), - ("single", "Toby"), - ("single", "Maude"), - ("multiple", "robAndSteve"), - ("multiple", "abcdef"), - ("none", "missing"), - ("none", "empty"), - ]) + series_index = MultiIndex.from_tuples( + [ + ("single", "Dave"), + ("single", "Toby"), + ("single", "Maude"), + ("multiple", "robAndSteve"), + ("multiple", "abcdef"), + ("none", "missing"), + ("none", "empty"), + ] + ) Si = Series(subject_list, series_index) - expected_index = MultiIndex.from_tuples([ - ("single", "Dave", 0), - ("single", "Toby", 0), - ("single", "Maude", 0), - ("multiple", "robAndSteve", 0), - ("multiple", "robAndSteve", 1), - ("multiple", "abcdef", 0), - ("multiple", "abcdef", 1), - ("multiple", "abcdef", 2), - ], names=(None, None, "match")) - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_index = MultiIndex.from_tuples( + [ + ("single", "Dave", 0), + ("single", "Toby", 0), + ("single", "Maude", 0), + ("multiple", "robAndSteve", 0), + ("multiple", "robAndSteve", 1), + ("multiple", "abcdef", 0), + ("multiple", "abcdef", 1), + ("multiple", "abcdef", 2), + ], + names=(None, None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Si.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) @@ -1498,81 +1541,73 @@ def test_extractall(self): Sn = Series(subject_list, series_index) Sn.index.names = ("matches", "description") expected_index.names = ("matches", "description", "match") - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # optional groups. - subject_list = ['', 'A1', '32'] - named_pattern = '(?P[AB])?(?P[123])' + subject_list = ["", "A1", "32"] + named_pattern = "(?P[AB])?(?P[123])" computed_df = Series(subject_list).str.extractall(named_pattern) - expected_index = MultiIndex.from_tuples([ - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - expected_df = DataFrame([ - ('A', '1'), - (NA, '3'), - (NA, '2'), - ], expected_index, columns=['letter', 'number']) + expected_index = MultiIndex.from_tuples( + [(1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + expected_df = DataFrame( + [("A", "1"), (NA, "3"), (NA, "2")], + expected_index, + columns=["letter", "number"], + ) tm.assert_frame_equal(computed_df, expected_df) # only one of two groups has a name. 
- pattern = '([AB])?(?P[123])' + pattern = "([AB])?(?P[123])" computed_df = Series(subject_list).str.extractall(pattern) - expected_df = DataFrame([ - ('A', '1'), - (NA, '3'), - (NA, '2'), - ], expected_index, columns=[0, 'number']) + expected_df = DataFrame( + [("A", "1"), (NA, "3"), (NA, "2")], expected_index, columns=[0, "number"] + ) tm.assert_frame_equal(computed_df, expected_df) def test_extractall_single_group(self): # extractall(one named group) returns DataFrame with one named # column. - s = Series(['a3', 'b3', 'd4c2'], name='series_name') - r = s.str.extractall(r'(?P[a-z])') - i = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i) + s = Series(["a3", "b3", "d4c2"], name="series_name") + r = s.str.extractall(r"(?P[a-z])") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame({"letter": ["a", "b", "d", "c"]}, i) tm.assert_frame_equal(r, e) # extractall(one un-named group) returns DataFrame with one # un-named column. - r = s.str.extractall(r'([a-z])') - e = DataFrame(['a', 'b', 'd', 'c'], i) + r = s.str.extractall(r"([a-z])") + e = DataFrame(["a", "b", "d", "c"], i) tm.assert_frame_equal(r, e) def test_extractall_single_group_with_quantifier(self): # extractall(one un-named group with quantifier) returns # DataFrame with one un-named column (GH13382). - s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name') - r = s.str.extractall(r'([a-z]+)') - i = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - e = DataFrame(['ab', 'abc', 'd', 'cd'], i) + s = Series(["ab3", "abc3", "d4cd2"], name="series_name") + r = s.str.extractall(r"([a-z]+)") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame(["ab", "abc", "d", "cd"], i) tm.assert_frame_equal(r, e) - @pytest.mark.parametrize('data, names', [ - ([], (None, )), - ([], ('i1', )), - ([], (None, 'i2')), - ([], ('i1', 'i2')), - (['a3', 'b3', 'd4c2'], (None, )), - (['a3', 'b3', 'd4c2'], ('i1', 'i2')), - (['a3', 'b3', 'd4c2'], (None, 'i2')), - (['a3', 'b3', 'd4c2'], ('i1', 'i2')), - ]) + @pytest.mark.parametrize( + "data, names", + [ + ([], (None,)), + ([], ("i1",)), + ([], (None, "i2")), + ([], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None,)), + (["a3", "b3", "d4c2"], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None, "i2")), + (["a3", "b3", "d4c2"], ("i1", "i2")), + ], + ) def test_extractall_no_matches(self, data, names): # GH19075 extractall with no matches should return a valid MultiIndex n = len(data) @@ -1581,108 +1616,109 @@ def test_extractall_no_matches(self, data, names): else: a = (tuple([i] * (n - 1)) for i in range(n)) i = MultiIndex.from_tuples(a, names=names) - s = Series(data, name='series_name', index=i, dtype='object') - ei = MultiIndex.from_tuples([], names=(names + ('match',))) + s = Series(data, name="series_name", index=i, dtype="object") + ei = MultiIndex.from_tuples([], names=(names + ("match",))) # one un-named group. - r = s.str.extractall('(z)') + r = s.str.extractall("(z)") e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) # two un-named groups. - r = s.str.extractall('(z)(z)') + r = s.str.extractall("(z)(z)") e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) # one named group. - r = s.str.extractall('(?Pz)') + r = s.str.extractall("(?Pz)") e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) # two named groups. 
- r = s.str.extractall('(?Pz)(?Pz)') + r = s.str.extractall("(?Pz)(?Pz)") e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) # one named, one un-named. - r = s.str.extractall('(z)(?Pz)') + r = s.str.extractall("(z)(?Pz)") e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): - s = Series(["a1a2", "b1", "c1"], name='xxx') + s = Series(["a1a2", "b1", "c1"], name="xxx") res = s.str.extractall(r"[ab](?P\d)") - exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], - names=[None, 'match']) - exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + exp_idx = MultiIndex.from_tuples( + [(0, 0), (0, 1), (1, 0)], names=[None, "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) # index should return the same result as the default index without name # thus index.name doesn't affect to the result - for idx in [Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name='xxx')]: + for idx in [ + Index(["a1a2", "b1", "c1"]), + Index(["a1a2", "b1", "c1"], name="xxx"), + ]: res = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(res, exp) - s = Series(["a1a2", "b1", "c1"], name='s_name', - index=Index(["XX", "yy", "zz"], name='idx_name')) + s = Series( + ["a1a2", "b1", "c1"], + name="s_name", + index=Index(["XX", "yy", "zz"], name="idx_name"), + ) res = s.str.extractall(r"[ab](?P\d)") - exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], - names=["idx_name", 'match']) - exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + exp_idx = MultiIndex.from_tuples( + [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for # each capture group) - s = Series(['a3', 'b3', 'd4c2'], name='series_name') + s = Series(["a3", "b3", "d4c2"], name="series_name") with pytest.raises(ValueError, match="no capture groups"): - s.str.extractall(r'[a-z]') + s.str.extractall(r"[a-z]") def test_extract_index_one_two_groups(self): - s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], - name='series_name') - r = s.index.str.extract(r'([A-Z])', expand=True) - e = DataFrame(['A', "B", "D"]) + s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name") + r = s.index.str.extract(r"([A-Z])", expand=True) + e = DataFrame(["A", "B", "D"]) tm.assert_frame_equal(r, e) # Prior to v0.18.0, index.str.extract(regex with one group) # returned Index. With more than one group, extract raised an # error (GH9980). Now extract always returns DataFrame. 
- r = s.index.str.extract( - r'(?P[A-Z])(?P[0-9])', expand=True) - e_list = [ - ("A", "3"), - ("B", "3"), - ("D", "4"), - ] + r = s.index.str.extract(r"(?P[A-Z])(?P[0-9])", expand=True) + e_list = [("A", "3"), ("B", "3"), ("D", "4")] e = DataFrame(e_list, columns=["letter", "digit"]) tm.assert_frame_equal(r, e) def test_extractall_same_as_extract(self): - s = Series(['a3', 'b3', 'c2'], name='series_name') + s = Series(["a3", "b3", "c2"], name="series_name") - pattern_two_noname = r'([a-z])([0-9])' + pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_multi_index = s.str.extractall(pattern_two_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_multi_index) - pattern_two_named = r'(?P[a-z])(?P[0-9])' + pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_multi_index = s.str.extractall(pattern_two_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_multi_index) - pattern_one_named = r'(?P[a-z])' + pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_multi_index = s.str.extractall(pattern_one_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_multi_index) - pattern_one_noname = r'([a-z])' + pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_multi_index = s.str.extractall(pattern_one_noname) no_multi_index = has_multi_index.xs(0, level="match") @@ -1690,32 +1726,31 @@ def test_extractall_same_as_extract(self): def test_extractall_same_as_extract_subject_index(self): # same as above tests, but s has an MultiIndex. 
- i = MultiIndex.from_tuples([ - ("A", "first"), - ("B", "second"), - ("C", "third"), - ], names=("capital", "ordinal")) - s = Series(['a3', 'b3', 'c2'], i, name='series_name') - - pattern_two_noname = r'([a-z])([0-9])' + i = MultiIndex.from_tuples( + [("A", "first"), ("B", "second"), ("C", "third")], + names=("capital", "ordinal"), + ) + s = Series(["a3", "b3", "c2"], i, name="series_name") + + pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_match_index = s.str.extractall(pattern_two_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_match_index) - pattern_two_named = r'(?P[a-z])(?P[0-9])' + pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_match_index = s.str.extractall(pattern_two_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_match_index) - pattern_one_named = r'(?P[a-z])' + pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_match_index = s.str.extractall(pattern_one_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_match_index) - pattern_one_noname = r'([a-z])' + pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") @@ -1731,43 +1766,40 @@ def test_empty_str_methods(self): # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) - assert '' == empty.str.cat() + assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) - tm.assert_series_equal(empty_int, empty.str.count('a')) - tm.assert_series_equal(empty_bool, empty.str.contains('a')) - tm.assert_series_equal(empty_bool, empty.str.startswith('a')) - tm.assert_series_equal(empty_bool, empty.str.endswith('a')) + tm.assert_series_equal(empty_int, empty.str.count("a")) + tm.assert_series_equal(empty_bool, empty.str.contains("a")) + tm.assert_series_equal(empty_bool, empty.str.startswith("a")) + tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) - tm.assert_series_equal(empty_str, empty.str.replace('a', 'b')) + tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) - tm.assert_series_equal(empty_bool, empty.str.match('^a')) + tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( - DataFrame(columns=[0], dtype=str), - empty.str.extract('()', expand=True)) + DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True) + ) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=str), - empty.str.extract('()()', expand=True)) - tm.assert_series_equal( - empty_str, - empty.str.extract('()', expand=False)) + DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True) + ) + tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), - empty.str.extract('()()', expand=False)) + empty.str.extract("()()", expand=False), + ) tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) - tm.assert_series_equal(empty_str, empty_str.str.join('')) + tm.assert_series_equal(empty_str, 
empty_str.str.join("")) tm.assert_series_equal(empty_int, empty.str.len()) - tm.assert_series_equal(empty_str, empty_str.str.findall('a')) - tm.assert_series_equal(empty_int, empty.str.find('a')) - tm.assert_series_equal(empty_int, empty.str.rfind('a')) + tm.assert_series_equal(empty_str, empty_str.str.findall("a")) + tm.assert_series_equal(empty_int, empty.str.find("a")) + tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) - tm.assert_series_equal(empty_str, empty.str.split('a')) - tm.assert_series_equal(empty_str, empty.str.rsplit('a')) - tm.assert_series_equal(empty_str, - empty.str.partition('a', expand=False)) - tm.assert_series_equal(empty_str, - empty.str.rpartition('a', expand=False)) + tm.assert_series_equal(empty_str, empty.str.split("a")) + tm.assert_series_equal(empty_str, empty.str.rsplit("a")) + tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) + tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) @@ -1775,8 +1807,8 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii')) - tm.assert_series_equal(empty_bytes, empty.str.encode('ascii')) + tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) tm.assert_series_equal(empty_str, empty.str.isalnum()) tm.assert_series_equal(empty_str, empty.str.isalpha()) tm.assert_series_equal(empty_str, empty.str.isdigit()) @@ -1788,39 +1820,42 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) - tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) + tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) - table = str.maketrans('a', 'b') + table = str.maketrans("a", "b") tm.assert_series_equal(empty_str, empty.str.translate(table)) def test_empty_str_methods_to_frame(self): empty = Series(dtype=str) empty_df = DataFrame() - tm.assert_frame_equal(empty_df, empty.str.partition('a')) - tm.assert_frame_equal(empty_df, empty.str.rpartition('a')) + tm.assert_frame_equal(empty_df, empty.str.partition("a")) + tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) def test_ismethods(self): - values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] + values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] str_s = Series(values) - alnum_e = [True, True, True, True, True, False, True, True, False, - False] - alpha_e = [True, True, True, False, False, False, True, False, False, - False] - digit_e = [False, False, False, True, False, False, False, True, False, - False] + alnum_e = [True, True, True, True, True, False, True, True, False, False] + alpha_e = [True, True, True, False, False, False, True, False, False, False] + digit_e = [False, False, False, True, False, False, False, True, False, False] # TODO: unused - num_e = [False, False, False, True, False, False, # noqa - False, True, False, False] - - space_e = [False, False, False, False, False, False, False, 
False, - False, True] - lower_e = [False, True, False, False, False, False, False, False, - False, False] - upper_e = [True, False, False, False, True, False, True, False, False, - False] - title_e = [True, False, True, False, True, False, False, False, False, - False] + num_e = [ + False, + False, + False, + True, + False, + False, # noqa + False, + True, + False, + False, + ] + + space_e = [False, False, False, False, False, False, False, False, False, True] + lower_e = [False, True, False, False, False, False, False, False, False, False] + upper_e = [True, False, False, False, True, False, True, False, False, False] + title_e = [True, False, True, False, True, False, False, False, False, False] tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) @@ -1843,18 +1878,18 @@ def test_isnumeric(self): # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 - values = ['A', '3', '¼', '★', '፸', '3', 'four'] + values = ["A", "3", "¼", "★", "፸", "3", "four"] s = Series(values) numeric_e = [False, True, True, False, True, True, False] decimal_e = [False, True, False, False, False, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) - unicodes = ['A', '3', '¼', '★', '፸', '3', 'four'] + unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] - values = ['A', np.nan, '¼', '★', np.nan, '3', 'four'] + values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] s = Series(values) numeric_e = [False, np.nan, True, False, np.nan, True, False] decimal_e = [False, np.nan, False, False, np.nan, True, False] @@ -1862,68 +1897,70 @@ def test_isnumeric(self): tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) def test_get_dummies(self): - s = Series(['a|b', 'a|c', np.nan]) - result = s.str.get_dummies('|') - expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list('abc')) + s = Series(["a|b", "a|c", np.nan]) + result = s.str.get_dummies("|") + expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) tm.assert_frame_equal(result, expected) - s = Series(['a;b', 'a', 7]) - result = s.str.get_dummies(';') - expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], - columns=list('7ab')) + s = Series(["a;b", "a", 7]) + result = s.str.get_dummies(";") + expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) tm.assert_frame_equal(result, expected) # GH9980, GH8028 - idx = Index(['a|b', 'a|c', 'b|c']) - result = idx.str.get_dummies('|') + idx = Index(["a|b", "a|c", "b|c"]) + result = idx.str.get_dummies("|") - expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1), - (0, 1, 1)], names=('a', 'b', 'c')) + expected = MultiIndex.from_tuples( + [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c") + ) tm.assert_index_equal(result, expected) def test_get_dummies_with_name_dummy(self): # GH 12180 # Dummies named 'name' should work as expected - s = Series(['a', 'b,name', 'b']) - result = s.str.get_dummies(',') - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], - columns=['a', 'b', 'name']) + s = Series(["a", "b,name", "b"]) + result = s.str.get_dummies(",") + expected = DataFrame( + [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"] + ) tm.assert_frame_equal(result, expected) - idx = Index(['a|b', 'name|c', 'b|name']) - result = 
idx.str.get_dummies('|') + idx = Index(["a|b", "name|c", "b|name"]) + result = idx.str.get_dummies("|") - expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1), - (0, 1, 0, 1)], - names=('a', 'b', 'c', 'name')) + expected = MultiIndex.from_tuples( + [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + ) tm.assert_index_equal(result, expected) def test_join(self): - values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) - result = values.str.split('_').str.join('_') + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = values.str.split("_").str.join("_") tm.assert_series_equal(values, result) # mixed - mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.split('_').str.join('_') - xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.split("_").str.join("_") + xp = Series(["a_b", NA, "asdf_cas_asdf", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_len(self): - values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) + values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) result = values.str.len() exp = values.map(lambda x: len(x) if notna(x) else NA) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + ) rs = Series(mixed).str.len() xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) @@ -1932,90 +1969,84 @@ def test_len(self): tm.assert_almost_equal(rs, xp) def test_findall(self): - values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD']) + values = Series(["fooBAD__barBAD", NA, "foo", "BAD"]) - result = values.str.findall('BAD[_]*') - exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']]) + result = values.str.findall("BAD[_]*") + exp = Series([["BAD__", "BAD"], NA, [], ["BAD"]]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(), - 'BAD', None, 1, 2.]) + mixed = Series( + ["fooBAD__barBAD", NA, "foo", True, datetime.today(), "BAD", None, 1, 2.0] + ) - rs = Series(mixed).str.findall('BAD[_]*') - xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA]) + rs = Series(mixed).str.findall("BAD[_]*") + xp = Series([["BAD__", "BAD"], NA, [], NA, NA, ["BAD"], NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_find(self): - values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX']) - result = values.str.find('EF') + values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"]) + result = values.str.find("EF") tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) - expected = np.array([v.find('EF') for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF') + result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind('EF') for v in values.values], - dtype=np.int64) + expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.find('EF', 3) + result = values.str.find("EF", 3) tm.assert_series_equal(result, 
Series([4, 3, 7, 4, -1])) - expected = np.array([v.find('EF', 3) for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF', 3) + result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind('EF', 3) for v in values.values], - dtype=np.int64) + expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.find('EF', 3, 6) + result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array([v.find('EF', 3, 6) for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF', 3, 6) + result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array([v.rfind('EF', 3, 6) for v in values.values], - dtype=np.int64) + expected = np.array( + [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64 + ) tm.assert_numpy_array_equal(result.values, expected) - with pytest.raises(TypeError, - match="expected a string object, not int"): + with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.find(0) - with pytest.raises(TypeError, - match="expected a string object, not int"): + with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.rfind(0) def test_find_nan(self): - values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX']) - result = values.str.find('EF') + values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"]) + result = values.str.find("EF") tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) - result = values.str.rfind('EF') + result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.find('EF', 3) + result = values.str.find("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.rfind('EF', 3) + result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.find('EF', 3, 6) + result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) - result = values.str.rfind('EF', 3, 6) + result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) def test_index(self): - def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) @@ -2023,136 +2054,126 @@ def _check(result, expected): tm.assert_index_equal(result, expected) for klass in [Series, Index]: - s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF']) + s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) - result = s.str.index('EF') + result = s.str.index("EF") _check(result, klass([4, 3, 1, 0])) - expected = np.array([v.index('EF') for v in s.values], - dtype=np.int64) + expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('EF') + result = s.str.rindex("EF") _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex('EF') for v in s.values], - dtype=np.int64) + expected = 
np.array([v.rindex("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index('EF', 3) + result = s.str.index("EF", 3) _check(result, klass([4, 3, 7, 4])) - expected = np.array([v.index('EF', 3) for v in s.values], - dtype=np.int64) + expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('EF', 3) + result = s.str.rindex("EF", 3) _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex('EF', 3) for v in s.values], - dtype=np.int64) + expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index('E', 4, 8) + result = s.str.index("E", 4, 8) _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.index('E', 4, 8) for v in s.values], - dtype=np.int64) + expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('E', 0, 5) + result = s.str.rindex("E", 0, 5) _check(result, klass([4, 3, 1, 4])) - expected = np.array([v.rindex('E', 0, 5) for v in s.values], - dtype=np.int64) + expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) with pytest.raises(ValueError, match="substring not found"): - result = s.str.index('DE') + result = s.str.index("DE") msg = "expected a string object, not int" with pytest.raises(TypeError, match=msg): result = s.str.index(0) # test with nan - s = Series(['abcb', 'ab', 'bcbe', np.nan]) - result = s.str.index('b') + s = Series(["abcb", "ab", "bcbe", np.nan]) + result = s.str.index("b") tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) - result = s.str.rindex('b') + result = s.str.rindex("b") tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) - result = values.str.pad(5, side='left') - exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) + result = values.str.pad(5, side="left") + exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='right') - exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + result = values.str.pad(5, side="right") + exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='both') - exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + result = values.str.pad(5, side="both") + exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. - ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='left') - xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="left") + xp = Series([" a", NA, " b", NA, NA, " ee", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. 
- ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='right') - xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="right") + xp = Series(["a ", NA, "b ", NA, NA, "ee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. - ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='both') - xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="both") + xp = Series([" a ", NA, " b ", NA, NA, " ee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_pad_fillchar(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) - result = values.str.pad(5, side='left', fillchar='X') - exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee']) + result = values.str.pad(5, side="left", fillchar="X") + exp = Series(["XXXXa", "XXXXb", NA, "XXXXc", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='right', fillchar='X') - exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee']) + result = values.str.pad(5, side="right", fillchar="X") + exp = Series(["aXXXX", "bXXXX", NA, "cXXXX", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='both', fillchar='X') - exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee']) + result = values.str.pad(5, side="both", fillchar="X") + exp = Series(["XXaXX", "XXbXX", NA, "XXcXX", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) msg = "fillchar must be a character, not str" with pytest.raises(TypeError, match=msg): - result = values.str.pad(5, fillchar='XY') + result = values.str.pad(5, fillchar="XY") msg = "fillchar must be a character, not int" with pytest.raises(TypeError, match=msg): result = values.str.pad(5, fillchar=5) - @pytest.mark.parametrize("f", ['center', 'ljust', 'rjust', 'zfill', 'pad']) + @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"]) def test_pad_width(self, f): # see gh-13598 - s = Series(['1', '22', 'a', 'bb']) + s = Series(["1", "22", "a", "bb"]) msg = "width must be of integer type, not*" with pytest.raises(TypeError, match=msg): - getattr(s.str, f)('f') + getattr(s.str, f)("f") def test_translate(self): - def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) @@ -2160,77 +2181,70 @@ def _check(result, expected): tm.assert_index_equal(result, expected) for klass in [Series, Index]: - s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) - table = str.maketrans('abc', 'cde') + s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"]) + table = str.maketrans("abc", "cde") result = s.str.translate(table) - expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) + expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"]) _check(result, expected) # Series with non-string values - s = Series(['a', 'b', 'c', 1.2]) - expected = Series(['c', 'd', 'e', np.nan]) + s = Series(["a", "b", "c", 1.2]) + expected = Series(["c", "d", "e", np.nan]) result = s.str.translate(table) tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) result = values.str.center(5) - exp = Series([' a ', ' b ', NA, ' c ', NA, 
'eeeeee']) + exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) - exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) - exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) + exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None, - 1, 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "c", "eee", None, 1, 2.0]) rs = Series(mixed).str.center(5) - xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA - ]) + xp = Series([" a ", NA, " b ", NA, NA, " c ", " eee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.ljust(5) - xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA - ]) + xp = Series(["a ", NA, "b ", NA, NA, "c ", "eee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rjust(5) - xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA - ]) + xp = Series([" a", NA, " b", NA, NA, " c", " eee", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_center_ljust_rjust_fillchar(self): - values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee']) + values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"]) - result = values.str.center(5, fillchar='X') - expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee']) + result = values.str.center(5, fillchar="X") + expected = Series(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.center(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.center(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.ljust(5, fillchar='X') - expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee']) + result = values.str.ljust(5, fillchar="X") + expected = Series(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.ljust(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.ljust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rjust(5, fillchar='X') - expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee']) + result = values.str.rjust(5, fillchar="X") + expected = Series(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.rjust(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.rjust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) # If fillchar is not a charatter, normal str raises TypeError @@ -2239,13 +2253,13 @@ def test_center_ljust_rjust_fillchar(self): template = "fillchar must be a character, not {dtype}" with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.center(5, fillchar='XY') + values.str.center(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.ljust(5, fillchar='XY') + values.str.ljust(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.rjust(5, fillchar='XY') + values.str.rjust(5, fillchar="XY") with 
pytest.raises(TypeError, match=template.format(dtype="int")): values.str.center(5, fillchar=1) @@ -2257,125 +2271,125 @@ def test_center_ljust_rjust_fillchar(self): values.str.rjust(5, fillchar=1) def test_zfill(self): - values = Series(['1', '22', 'aaa', '333', '45678']) + values = Series(["1", "22", "aaa", "333", "45678"]) result = values.str.zfill(5) - expected = Series(['00001', '00022', '00aaa', '00333', '45678']) + expected = Series(["00001", "00022", "00aaa", "00333", "45678"]) tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(5) for v in values.values], - dtype=np.object_) + expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.zfill(3) - expected = Series(['001', '022', 'aaa', '333', '45678']) + expected = Series(["001", "022", "aaa", "333", "45678"]) tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(3) for v in values.values], - dtype=np.object_) + expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - values = Series(['1', np.nan, 'aaa', np.nan, '45678']) + values = Series(["1", np.nan, "aaa", np.nan, "45678"]) result = values.str.zfill(5) - expected = Series(['00001', np.nan, '00aaa', np.nan, '45678']) + expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"]) tm.assert_series_equal(result, expected) def test_split(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) - result = values.str.split('_') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + result = values.str.split("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) - result = values.str.split('__') + values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + result = values.str.split("__") tm.assert_series_equal(result, exp) - result = values.str.split('__', expand=False) + result = values.str.split("__", expand=False) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, - 2.]) - result = mixed.str.split('_') - exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA - ]) + mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.split("_") + exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - result = mixed.str.split('_', expand=False) + result = mixed.str.split("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split - values = Series(['a,b_c', 'c_d,e', NA, 'f,g,h']) - result = values.str.split('[,_]') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + result = values.str.split("[,_]") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) def test_rsplit(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) - result = values.str.rsplit('_') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + result = values.str.rsplit("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # 
more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) - result = values.str.rsplit('__') + values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + result = values.str.rsplit("__") tm.assert_series_equal(result, exp) - result = values.str.rsplit('__', expand=False) + result = values.str.rsplit("__", expand=False) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, - 2.]) - result = mixed.str.rsplit('_') - exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA - ]) + mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.rsplit("_") + exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - result = mixed.str.rsplit('_', expand=False) + result = mixed.str.rsplit("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split is not supported by rsplit - values = Series(['a,b_c', 'c_d,e', NA, 'f,g,h']) - result = values.str.rsplit('[,_]') - exp = Series([['a,b_c'], ['c_d,e'], NA, ['f,g,h']]) + values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + result = values.str.rsplit("[,_]") + exp = Series([["a,b_c"], ["c_d,e"], NA, ["f,g,h"]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) - result = values.str.rsplit('_', n=1) - exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + result = values.str.rsplit("_", n=1) + exp = Series([["a_b", "c"], ["c_d", "e"], NA, ["f_g", "h"]]) tm.assert_series_equal(result, exp) def test_split_blank_string(self): # expand blank split GH 20067 - values = Series([''], name='test') + values = Series([""], name="test") result = values.str.split(expand=True) exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame tm.assert_frame_equal(result, exp) - values = Series(['a b c', 'a b', '', ' '], name='test') + values = Series(["a b c", "a b", "", " "], name="test") result = values.str.split(expand=True) - exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan], - [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]]) + exp = DataFrame( + [ + ["a", "b", "c"], + ["a", "b", np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ] + ) tm.assert_frame_equal(result, exp) def test_split_noargs(self): # #1859 - s = Series(['Wes McKinney', 'Travis Oliphant']) + s = Series(["Wes McKinney", "Travis Oliphant"]) result = s.str.split() - expected = ['Travis', 'Oliphant'] + expected = ["Travis", "Oliphant"] assert result[1] == expected result = s.str.rsplit() assert result[1] == expected def test_split_maxsplit(self): # re.split 0, str.split -1 - s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk']) + s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) result = s.str.split(n=-1) xp = s.str.split() @@ -2384,134 +2398,142 @@ def test_split_maxsplit(self): result = s.str.split(n=0) tm.assert_series_equal(result, xp) - xp = s.str.split('asdf') - result = s.str.split('asdf', n=0) + xp = s.str.split("asdf") + result = s.str.split("asdf", n=0) tm.assert_series_equal(result, xp) - result = s.str.split('asdf', n=-1) + result = s.str.split("asdf", n=-1) tm.assert_series_equal(result, xp) def test_split_no_pat_with_nonzero_n(self): - s = Series(['split once', 'split once too!']) + s = Series(["split once", "split once too!"]) result = s.str.split(n=1) - expected = 
Series({0: ['split', 'once'], 1: ['split', 'once too!']}) + expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) tm.assert_series_equal(expected, result, check_index_type=False) def test_split_to_dataframe(self): - s = Series(['nosplit', 'alsonosplit']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + s = Series(["nosplit", "alsonosplit"]) + result = s.str.split("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) - s = Series(['some_equal_splits', 'with_no_nans']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'one'], - 1: ['unequal', 'of'], - 2: ['splits', 'these'], - 3: [NA, 'things'], - 4: [NA, 'is'], - 5: [NA, 'not']}) + s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + { + 0: ["some", "one"], + 1: ["unequal", "of"], + 2: ["splits", "these"], + 3: [NA, "things"], + 4: [NA, "is"], + 5: [NA, "not"], + } + ) tm.assert_frame_equal(result, exp) - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) tm.assert_frame_equal(result, exp) with pytest.raises(ValueError, match="expand must be"): - s.str.split('_', expand="not_a_boolean") + s.str.split("_", expand="not_a_boolean") def test_split_to_multiindex_expand(self): # https://github.com/pandas-dev/pandas/issues/23677 - idx = Index(['nosplit', 'alsonosplit', np.nan]) - result = idx.str.split('_', expand=True) + idx = Index(["nosplit", "alsonosplit", np.nan]) + result = idx.str.split("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None]) - result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), - ('with', 'no', 'nans'), - [np.nan, np.nan, np.nan], - [None, None, None]]) + idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "equal", "splits"), + ("with", "no", "nans"), + [np.nan, np.nan, np.nan], + [None, None, None], + ] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_unequal_splits', - 'one_of_these_things_is_not', - np.nan, None]) - result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', - NA, NA, NA), - ('one', 'of', 'these', - 'things', 'is', 'not'), - (np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan), - (None, None, None, - None, None, None)]) + idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "unequal", 
"splits", NA, NA, NA), + ("one", "of", "these", "things", "is", "not"), + (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), + (None, None, None, None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 6 with pytest.raises(ValueError, match="expand must be"): - idx.str.split('_', expand="not_a_boolean") + idx.str.split("_", expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): - s = Series(['nosplit', 'alsonosplit']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + s = Series(["nosplit", "alsonosplit"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) - s = Series(['some_equal_splits', 'with_no_nans']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - result = s.str.rsplit('_', expand=True, n=2) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + result = s.str.rsplit("_", expand=True, n=2) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - result = s.str.rsplit('_', expand=True, n=1) - exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']}) + result = s.str.rsplit("_", expand=True, n=1) + exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) tm.assert_frame_equal(result, exp) - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) tm.assert_frame_equal(result, exp) def test_rsplit_to_multiindex_expand(self): - idx = Index(['nosplit', 'alsonosplit']) - result = idx.str.rsplit('_', expand=True) + idx = Index(["nosplit", "alsonosplit"]) + result = idx.str.rsplit("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans']) - result = idx.str.rsplit('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( - 'with', 'no', 'nans')]) + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True) + exp = MultiIndex.from_tuples( + [("some", "equal", "splits"), ("with", "no", "nans")] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_equal_splits', 'with_no_nans']) - result = idx.str.rsplit('_', expand=True, n=1) - exp = MultiIndex.from_tuples([('some_equal', 'splits'), - ('with_no', 'nans')]) + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True, n=1) + exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) tm.assert_index_equal(result, exp) assert result.nlevels == 2 @@ -2531,126 +2553,146 @@ def test_split_with_name(self): # GH 12617 # should preserve name - s = Series(['a,b', 'c,d'], name='xxx') - res = s.str.split(',') - exp = Series([['a', 'b'], ['c', 'd']], name='xxx') + s = 
Series(["a,b", "c,d"], name="xxx") + res = s.str.split(",") + exp = Series([["a", "b"], ["c", "d"]], name="xxx") tm.assert_series_equal(res, exp) - res = s.str.split(',', expand=True) - exp = DataFrame([['a', 'b'], ['c', 'd']]) + res = s.str.split(",", expand=True) + exp = DataFrame([["a", "b"], ["c", "d"]]) tm.assert_frame_equal(res, exp) - idx = Index(['a,b', 'c,d'], name='xxx') - res = idx.str.split(',') - exp = Index([['a', 'b'], ['c', 'd']], name='xxx') + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.split(",") + exp = Index([["a", "b"], ["c", "d"]], name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) - res = idx.str.split(',', expand=True) - exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')]) + res = idx.str.split(",", expand=True) + exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")]) assert res.nlevels == 2 tm.assert_index_equal(res, exp) def test_partition_series(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) - result = values.str.partition('_', expand=False) - exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, - ('f', '_', 'g_h'), None]) + result = values.str.partition("_", expand=False) + exp = Series( + [("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h"), None] + ) tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, - ('f_g', '_', 'h'), None]) + result = values.str.rpartition("_", expand=False) + exp = Series( + [("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h"), None] + ) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None]) - result = values.str.partition('__', expand=False) - exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, - ('f', '__', 'g__h'), None]) + values = Series(["a__b__c", "c__d__e", NA, "f__g__h", None]) + result = values.str.partition("__", expand=False) + exp = Series( + [("a", "__", "b__c"), ("c", "__", "d__e"), NA, ("f", "__", "g__h"), None] + ) tm.assert_series_equal(result, exp) - result = values.str.rpartition('__', expand=False) - exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, - ('f__g', '__', 'h'), None]) + result = values.str.rpartition("__", expand=False) + exp = Series( + [("a__b", "__", "c"), ("c__d", "__", "e"), NA, ("f__g", "__", "h"), None] + ) tm.assert_series_equal(result, exp) # None - values = Series(['a b c', 'c d e', NA, 'f g h', None]) + values = Series(["a b c", "c d e", NA, "f g h", None]) result = values.str.partition(expand=False) - exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, - ('f', ' ', 'g h'), None]) + exp = Series( + [("a", " ", "b c"), ("c", " ", "d e"), NA, ("f", " ", "g h"), None] + ) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) - exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, - ('f g', ' ', 'h'), None]) + exp = Series( + [("a b", " ", "c"), ("c d", " ", "e"), NA, ("f g", " ", "h"), None] + ) tm.assert_series_equal(result, exp) # Not split - values = Series(['abc', 'cde', NA, 'fgh', None]) - result = values.str.partition('_', expand=False) - exp = Series([('abc', '', ''), ('cde', '', ''), NA, - ('fgh', '', ''), None]) + values = Series(["abc", "cde", NA, "fgh", None]) + result = values.str.partition("_", expand=False) + exp = Series([("abc", "", ""), ("cde", "", ""), NA, ("fgh", "", ""), None]) 
tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, - ('', '', 'fgh'), None]) + result = values.str.rpartition("_", expand=False) + exp = Series([("", "", "abc"), ("", "", "cde"), NA, ("", "", "fgh"), None]) tm.assert_series_equal(result, exp) # unicode - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) - result = values.str.partition('_', expand=False) - exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), - NA, ('f', '_', 'g_h')]) + result = values.str.partition("_", expand=False) + exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h")]) tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), - NA, ('f_g', '_', 'h')]) + result = values.str.rpartition("_", expand=False) + exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h")]) tm.assert_series_equal(result, exp) # compare to standard lib - values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF']) - result = values.str.partition('_', expand=False).tolist() - assert result == [v.partition('_') for v in values] - result = values.str.rpartition('_', expand=False).tolist() - assert result == [v.rpartition('_') for v in values] + values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"]) + result = values.str.partition("_", expand=False).tolist() + assert result == [v.partition("_") for v in values] + result = values.str.rpartition("_", expand=False).tolist() + assert result == [v.rpartition("_") for v in values] def test_partition_index(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None]) + values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) - result = values.str.partition('_', expand=False) - exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), np.nan, None])) + result = values.str.partition("_", expand=False) + exp = Index( + np.array( + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None] + ) + ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 - result = values.str.rpartition('_', expand=False) - exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), np.nan, None])) + result = values.str.rpartition("_", expand=False) + exp = Index( + np.array( + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None] + ) + ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 - result = values.str.partition('_') - exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), (np.nan, np.nan, np.nan), - (None, None, None)]) + result = values.str.partition("_") + exp = Index( + [ + ("a", "_", "b_c"), + ("c", "_", "d_e"), + ("f", "_", "g_h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 - result = values.str.rpartition('_') - exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), (np.nan, np.nan, np.nan), - (None, None, None)]) + result = values.str.rpartition("_") + exp = Index( + [ + ("a_b", "_", "c"), + ("c_d", "_", "e"), + ("f_g", "_", "h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 @@ -2658,209 +2700,222 @@ def test_partition_index(self): def 
test_partition_to_dataframe(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) - result = values.str.partition('_') - exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + result = values.str.partition("_") + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) tm.assert_frame_equal(result, exp) - result = values.str.rpartition('_') - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['c', 'e', np.nan, 'h', None]}) + result = values.str.rpartition("_") + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) tm.assert_frame_equal(result, exp) - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) - result = values.str.partition('_', expand=True) - exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + result = values.str.partition("_", expand=True) + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) tm.assert_frame_equal(result, exp) - result = values.str.rpartition('_', expand=True) - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['c', 'e', np.nan, 'h', None]}) + result = values.str.rpartition("_", expand=True) + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) tm.assert_frame_equal(result, exp) def test_partition_with_name(self): # GH 12617 - s = Series(['a,b', 'c,d'], name='xxx') - res = s.str.partition(',') - exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']}) + s = Series(["a,b", "c,d"], name="xxx") + res = s.str.partition(",") + exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}) tm.assert_frame_equal(res, exp) # should preserve name - res = s.str.partition(',', expand=False) - exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx') + res = s.str.partition(",", expand=False) + exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") tm.assert_series_equal(res, exp) - idx = Index(['a,b', 'c,d'], name='xxx') - res = idx.str.partition(',') - exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')]) + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.partition(",") + exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")]) assert res.nlevels == 3 tm.assert_index_equal(res, exp) # should preserve name - res = idx.str.partition(',', expand=False) - exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx') + res = idx.str.partition(",", expand=False) + exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) def test_partition_deprecation(self): # GH 22676; depr kwarg "pat" in favor of "sep" - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) # str.partition # using sep -> no warning - expected = values.str.partition(sep='_') + expected = values.str.partition(sep="_") with tm.assert_produces_warning(FutureWarning): - result = 
values.str.partition(pat='_') + result = values.str.partition(pat="_") tm.assert_frame_equal(result, expected) # str.rpartition # using sep -> no warning - expected = values.str.rpartition(sep='_') + expected = values.str.rpartition(sep="_") with tm.assert_produces_warning(FutureWarning): - result = values.str.rpartition(pat='_') + result = values.str.rpartition(pat="_") tm.assert_frame_equal(result, expected) def test_pipe_failures(self): # #2119 - s = Series(['A|B|C']) + s = Series(["A|B|C"]) - result = s.str.split('|') - exp = Series([['A', 'B', 'C']]) + result = s.str.split("|") + exp = Series([["A", "B", "C"]]) tm.assert_series_equal(result, exp) - result = s.str.replace('|', ' ') - exp = Series(['A B C']) + result = s.str.replace("|", " ") + exp = Series(["A B C"]) tm.assert_series_equal(result, exp) def test_slice(self): - values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux']) + values = Series(["aafootwo", "aabartwo", NA, "aabazqux"]) result = values.str.slice(2, 5) - exp = Series(['foo', 'bar', NA, 'baz']) + exp = Series(["foo", "bar", NA, "baz"]) tm.assert_series_equal(result, exp) - for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), - (3, 0, -1)]: + for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: try: result = values.str.slice(start, stop, step) - expected = Series([s[start:stop:step] if not isna(s) else NA - for s in values]) + expected = Series( + [s[start:stop:step] if not isna(s) else NA for s in values] + ) tm.assert_series_equal(result, expected) except IndexError: - print('failed on %s:%s:%s' % (start, stop, step)) + print("failed on %s:%s:%s" % (start, stop, step)) raise # mixed - mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(), - None, 1, 2.]) + mixed = Series( + ["aafootwo", NA, "aabartwo", True, datetime.today(), None, 1, 2.0] + ) rs = Series(mixed).str.slice(2, 5) - xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA]) + xp = Series(["foo", NA, "bar", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.slice(2, 5, -1) - xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA]) + xp = Series(["oof", NA, "rab", NA, NA, NA, NA, NA]) def test_slice_replace(self): - values = Series(['short', 'a bit longer', 'evenlongerthanthat', '', NA - ]) + values = Series(["short", "a bit longer", "evenlongerthanthat", "", NA]) - exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA]) + exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", NA]) result = values.str.slice_replace(2, 3) tm.assert_series_equal(result, exp) - exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA]) - result = values.str.slice_replace(2, 3, 'z') + exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 3, "z") tm.assert_series_equal(result, exp) - exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA - ]) - result = values.str.slice_replace(2, 2, 'z') + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 2, "z") tm.assert_series_equal(result, exp) - exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA - ]) - result = values.str.slice_replace(2, 1, 'z') + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 1, "z") tm.assert_series_equal(result, exp) - exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA]) - result = 
values.str.slice_replace(-1, None, 'z') + exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", NA]) + result = values.str.slice_replace(-1, None, "z") tm.assert_series_equal(result, exp) - exp = Series(['zrt', 'zer', 'zat', 'z', NA]) - result = values.str.slice_replace(None, -2, 'z') + exp = Series(["zrt", "zer", "zat", "z", NA]) + result = values.str.slice_replace(None, -2, "z") tm.assert_series_equal(result, exp) - exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA]) - result = values.str.slice_replace(6, 8, 'z') + exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", NA]) + result = values.str.slice_replace(6, 8, "z") tm.assert_series_equal(result, exp) - exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA]) - result = values.str.slice_replace(-10, 3, 'z') + exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", NA]) + result = values.str.slice_replace(-10, 3, "z") tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip(self): - values = Series([' aa ', ' bb \n', NA, 'cc ']) + values = Series([" aa ", " bb \n", NA, "cc "]) result = values.str.strip() - exp = Series(['aa', 'bb', NA, 'cc']) + exp = Series(["aa", "bb", NA, "cc"]) tm.assert_series_equal(result, exp) result = values.str.lstrip() - exp = Series(['aa ', 'bb \n', NA, 'cc ']) + exp = Series(["aa ", "bb \n", NA, "cc "]) tm.assert_series_equal(result, exp) result = values.str.rstrip() - exp = Series([' aa', ' bb', NA, 'cc']) + exp = Series([" aa", " bb", NA, "cc"]) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_mixed(self): # mixed - mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None, - 1, 2.]) + mixed = Series([" aa ", NA, " bb \t\n", True, datetime.today(), None, 1, 2.0]) rs = Series(mixed).str.strip() - xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA]) + xp = Series(["aa", NA, "bb", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.lstrip() - xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA]) + xp = Series(["aa ", NA, "bb \t\n", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rstrip() - xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA]) + xp = Series([" aa", NA, " bb", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_strip_lstrip_rstrip_args(self): - values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx']) + values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"]) - rs = values.str.strip('x') - xp = Series(['ABC', ' BNSD', 'LDFJH ']) + rs = values.str.strip("x") + xp = Series(["ABC", " BNSD", "LDFJH "]) assert_series_equal(rs, xp) - rs = values.str.lstrip('x') - xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) + rs = values.str.lstrip("x") + xp = Series(["ABCxx", " BNSD", "LDFJH xx"]) assert_series_equal(rs, xp) - rs = values.str.rstrip('x') - xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) + rs = values.str.rstrip("x") + xp = Series(["xxABC", "xx BNSD", "LDFJH "]) assert_series_equal(rs, xp) def test_wrap(self): @@ -2868,69 +2923,87 @@ def test_wrap(self): # two words greater than width, one word less than width, one word # equal to width, one word greater than width, multiple tokens with # trailing whitespace equal to width - values = Series(['hello world', 'hello world!', 'hello world!!', - 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdefa', - 'ab ab ab ab ', 'ab ab ab ab a', '\t']) + values = Series( + [ + "hello world", + "hello world!", + "hello world!!", + "abcdefabcde", + 
"abcdefabcdef", + "abcdefabcdefa", + "ab ab ab ab ", + "ab ab ab ab a", + "\t", + ] + ) # expected values - xp = Series(['hello world', 'hello world!', 'hello\nworld!!', - 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdef\na', - 'ab ab ab ab', 'ab ab ab ab\na', '']) + xp = Series( + [ + "hello world", + "hello world!", + "hello\nworld!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdef\na", + "ab ab ab ab", + "ab ab ab ab\na", + "", + ] + ) rs = values.str.wrap(12, break_long_words=True) assert_series_equal(rs, xp) # test with pre and post whitespace (non-unicode), NaN, and non-ascii # Unicode - values = Series([' pre ', np.nan, '\xac\u20ac\U00008000 abadcafe']) - xp = Series([' pre', NA, '\xac\u20ac\U00008000 ab\nadcafe']) + values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]) + xp = Series([" pre", NA, "\xac\u20ac\U00008000 ab\nadcafe"]) rs = values.str.wrap(6) assert_series_equal(rs, xp) def test_get(self): - values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.split('_').str.get(1) - expected = Series(['b', 'd', np.nan, 'g']) + result = values.str.split("_").str.get(1) + expected = Series(["b", "d", np.nan, "g"]) tm.assert_series_equal(result, expected) # mixed - mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1, - 2.]) + mixed = Series(["a_b_c", NA, "c_d_e", True, datetime.today(), None, 1, 2.0]) - rs = Series(mixed).str.split('_').str.get(1) - xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA]) + rs = Series(mixed).str.split("_").str.get(1) + xp = Series(["b", NA, "d", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # bounds testing - values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12']) + values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) # positive index - result = values.str.split('_').str.get(2) - expected = Series(['3', '8', np.nan]) + result = values.str.split("_").str.get(2) + expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) # negative index - result = values.str.split('_').str.get(-3) - expected = Series(['3', '8', np.nan]) + result = values.str.split("_").str.get(-3) + expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) def test_get_complex(self): # GH 20671, getting value not in dict raising `KeyError` - values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, - {1: 'a', 2: 'b', 3: 'c'}]) + values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) result = values.str.get(1) - expected = Series([2, 2, np.nan, 'a']) + expected = Series([2, 2, np.nan, "a"]) tm.assert_series_equal(result, expected) result = values.str.get(-1) expected = Series([3, 3, np.nan, np.nan]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('to_type', [tuple, list, np.array]) + @pytest.mark.parametrize("to_type", [tuple, list, np.array]) def test_get_complex_nested(self, to_type): values = Series([to_type([to_type([1, 2])])]) @@ -2944,77 +3017,106 @@ def test_get_complex_nested(self, to_type): def test_contains_moar(self): # PR #1179 - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, - 'CABA', 'dog', 'cat']) + s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) - result = s.str.contains('a') - expected = Series([False, False, False, True, True, False, np.nan, - False, False, True]) + result = s.str.contains("a") + expected = Series( + [False, False, False, True, True, False, np.nan, False, False, True] + ) assert_series_equal(result, expected) - result 
= s.str.contains('a', case=False) - expected = Series([True, False, False, True, True, False, np.nan, True, - False, True]) + result = s.str.contains("a", case=False) + expected = Series( + [True, False, False, True, True, False, np.nan, True, False, True] + ) assert_series_equal(result, expected) - result = s.str.contains('Aa') - expected = Series([False, False, False, True, False, False, np.nan, - False, False, False]) + result = s.str.contains("Aa") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) assert_series_equal(result, expected) - result = s.str.contains('ba') - expected = Series([False, False, False, True, False, False, np.nan, - False, False, False]) + result = s.str.contains("ba") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) assert_series_equal(result, expected) - result = s.str.contains('ba', case=False) - expected = Series([False, False, False, True, True, False, np.nan, - True, False, False]) + result = s.str.contains("ba", case=False) + expected = Series( + [False, False, False, True, True, False, np.nan, True, False, False] + ) assert_series_equal(result, expected) def test_contains_nan(self): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=np.object_) - result = s.str.contains('foo', na=False) + result = s.str.contains("foo", na=False) expected = Series([False, False, False], dtype=np.bool_) assert_series_equal(result, expected) - result = s.str.contains('foo', na=True) + result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=np.bool_) assert_series_equal(result, expected) - result = s.str.contains('foo', na="foo") + result = s.str.contains("foo", na="foo") expected = Series(["foo", "foo", "foo"], dtype=np.object_) assert_series_equal(result, expected) - result = s.str.contains('foo') + result = s.str.contains("foo") expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) assert_series_equal(result, expected) def test_replace_moar(self): # PR #1179 - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', - 'dog', 'cat']) + s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) - result = s.str.replace('A', 'YYY') - expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, - 'CYYYBYYY', 'dog', 'cat']) + result = s.str.replace("A", "YYY") + expected = Series( + ["YYY", "B", "C", "YYYaba", "Baca", "", NA, "CYYYBYYY", "dog", "cat"] + ) assert_series_equal(result, expected) - result = s.str.replace('A', 'YYY', case=False) - expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, - 'CYYYBYYY', 'dog', 'cYYYt']) + result = s.str.replace("A", "YYY", case=False) + expected = Series( + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + "", + NA, + "CYYYBYYY", + "dog", + "cYYYt", + ] + ) assert_series_equal(result, expected) - result = s.str.replace('^.a|dog', 'XX-XX ', case=False) - expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, - 'XX-XX BA', 'XX-XX ', 'XX-XX t']) + result = s.str.replace("^.a|dog", "XX-XX ", case=False) + expected = Series( + [ + "A", + "B", + "C", + "XX-XX ba", + "XX-XX ca", + "", + NA, + "XX-XX BA", + "XX-XX ", + "XX-XX t", + ] + ) assert_series_equal(result, expected) def test_string_slice_get_syntax(self): - s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY', - 'dog', 'cYYYt']) + s = Series( + ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", NA, "CYYYBYYY", "dog", "cYYYt"] + ) result = s.str[0] expected = s.str.get(0) @@ -3029,35 +3131,37 @@ 
def test_string_slice_get_syntax(self): assert_series_equal(result, expected) def test_string_slice_out_of_bounds(self): - s = Series([(1, 2), (1, ), (3, 4, 5)]) + s = Series([(1, 2), (1,), (3, 4, 5)]) result = s.str[1] expected = Series([2, np.nan, 4]) assert_series_equal(result, expected) - s = Series(['foo', 'b', 'ba']) + s = Series(["foo", "b", "ba"]) result = s.str[1] - expected = Series(['o', np.nan, 'a']) + expected = Series(["o", np.nan, "a"]) assert_series_equal(result, expected) def test_match_findall_flags(self): - data = {'Dave': 'dave@google.com', - 'Steve': 'steve@gmail.com', - 'Rob': 'rob@gmail.com', - 'Wes': np.nan} + data = { + "Dave": "dave@google.com", + "Steve": "steve@gmail.com", + "Rob": "rob@gmail.com", + "Wes": np.nan, + } data = Series(data) - pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' + pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) - assert result.iloc[0].tolist() == ['dave', 'google', 'com'] + assert result.iloc[0].tolist() == ["dave", "google", "com"] result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] result = data.str.findall(pat, flags=re.IGNORECASE) - assert result[0][0] == ('dave', 'google', 'com') + assert result[0][0] == ("dave", "google", "com") result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 @@ -3067,73 +3171,80 @@ def test_match_findall_flags(self): assert result[0] def test_encode_decode(self): - base = Series(['a', 'b', 'a\xe4']) - series = base.str.encode('utf-8') + base = Series(["a", "b", "a\xe4"]) + series = base.str.encode("utf-8") - f = lambda x: x.decode('utf-8') - result = series.str.decode('utf-8') + f = lambda x: x.decode("utf-8") + result = series.str.decode("utf-8") exp = series.map(f) tm.assert_series_equal(result, exp) def test_encode_decode_errors(self): - encodeBase = Series(['a', 'b', 'a\x9d']) + encodeBase = Series(["a", "b", "a\x9d"]) - msg = (r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to ") + msg = ( + r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to " + ) with pytest.raises(UnicodeEncodeError, match=msg): - encodeBase.str.encode('cp1252') + encodeBase.str.encode("cp1252") - f = lambda x: x.encode('cp1252', 'ignore') - result = encodeBase.str.encode('cp1252', 'ignore') + f = lambda x: x.encode("cp1252", "ignore") + result = encodeBase.str.encode("cp1252", "ignore") exp = encodeBase.map(f) tm.assert_series_equal(result, exp) - decodeBase = Series([b'a', b'b', b'a\x9d']) + decodeBase = Series([b"a", b"b", b"a\x9d"]) - msg = ("'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to ") + msg = ( + "'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to " + ) with pytest.raises(UnicodeDecodeError, match=msg): - decodeBase.str.decode('cp1252') + decodeBase.str.decode("cp1252") - f = lambda x: x.decode('cp1252', 'ignore') - result = decodeBase.str.decode('cp1252', 'ignore') + f = lambda x: x.decode("cp1252", "ignore") + result = decodeBase.str.decode("cp1252", "ignore") exp = decodeBase.map(f) tm.assert_series_equal(result, exp) def test_normalize(self): - values = ['ABC', 'ABC', '123', np.nan, 'アイエ'] - s = Series(values, index=['a', 'b', 'c', 'd', 'e']) + values = ["ABC", "ABC", "123", np.nan, "アイエ"] + s = Series(values, index=["a", "b", "c", "d", "e"]) - normed = ['ABC', 'ABC', '123', np.nan, 'アイエ'] - expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) + normed = ["ABC", "ABC", "123", 
np.nan, "アイエ"] + expected = Series(normed, index=["a", "b", "c", "d", "e"]) - result = s.str.normalize('NFKC') + result = s.str.normalize("NFKC") tm.assert_series_equal(result, expected) - expected = Series(['ABC', 'ABC', '123', np.nan, 'アイエ'], - index=['a', 'b', 'c', 'd', 'e']) + expected = Series( + ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] + ) - result = s.str.normalize('NFC') + result = s.str.normalize("NFC") tm.assert_series_equal(result, expected) with pytest.raises(ValueError, match="invalid normalization form"): - s.str.normalize('xxx') + s.str.normalize("xxx") - s = Index(['ABC', '123', 'アイエ']) - expected = Index(['ABC', '123', 'アイエ']) - result = s.str.normalize('NFKC') + s = Index(["ABC", "123", "アイエ"]) + expected = Index(["ABC", "123", "アイエ"]) + result = s.str.normalize("NFKC") tm.assert_index_equal(result, expected) def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods - cases = [(['a', 'b'], 'string'), - (['a', 'b', 1], 'mixed-integer'), - (['a', 'b', 1.3], 'mixed'), - (['a', 'b', 1.3, 1], 'mixed-integer'), - (['aa', datetime(2011, 1, 1)], 'mixed')] + cases = [ + (["a", "b"], "string"), + (["a", "b", 1], "mixed-integer"), + (["a", "b", 1.3], "mixed"), + (["a", "b", 1.3, 1], "mixed-integer"), + (["aa", datetime(2011, 1, 1)], "mixed"), + ] for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) @@ -3146,12 +3257,14 @@ def test_index_str_accessor_visibility(self): assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp - cases = [([1, np.nan], 'floating'), - ([datetime(2011, 1, 1)], 'datetime64'), - ([timedelta(1)], 'timedelta64')] + cases = [ + ([1, np.nan], "floating"), + ([datetime(2011, 1, 1)], "datetime64"), + ([timedelta(1)], "timedelta64"), + ] for values, tp in cases: idx = Index(values) - message = 'Can only use .str accessor with string values' + message = "Can only use .str accessor with string values" with pytest.raises(AttributeError, match=message): Series(values).str with pytest.raises(AttributeError, match=message): @@ -3159,30 +3272,28 @@ def test_index_str_accessor_visibility(self): assert idx.inferred_type == tp # MultiIndex has mixed dtype, but not allow to use accessor - idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')]) - assert idx.inferred_type == 'mixed' - message = 'Can only use .str accessor with Index, not MultiIndex' + idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) + assert idx.inferred_type == "mixed" + message = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=message): idx.str def test_str_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(list('aabbcde')) - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + s = Series(list("aabbcde")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.str.xlabel = "a" def test_method_on_bytes(self): - lhs = Series(np.array(list('abc'), 'S1').astype(object)) - rhs = Series(np.array(list('def'), 'S1').astype(object)) - with pytest.raises(TypeError, - match="Cannot use .str.cat with values of.*"): + lhs = Series(np.array(list("abc"), "S1").astype(object)) + rhs = Series(np.array(list("def"), "S1").astype(object)) + with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self): # GH25405 - expected = Series(['ss', NA, 'case', 'ssd']) - s = Series(['ß', NA, 'case', 'ßd']) + 
expected = Series(["ss", NA, "case", "ssd"]) + s = Series(["ß", NA, "case", "ßd"]) result = s.str.casefold() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index afcc90a1c8e74b..d2a9e1dc94bb52 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -18,48 +18,52 @@ def writeable(request): # Check that take_nd works both with writeable arrays # (in which case fast typed memory-views implementation) # and read-only arrays alike. -@pytest.fixture(params=[ - (np.float64, True), - (np.float32, True), - (np.uint64, False), - (np.uint32, False), - (np.uint16, False), - (np.uint8, False), - (np.int64, False), - (np.int32, False), - (np.int16, False), - (np.int8, False), - (np.object_, True), - (np.bool, False), -]) +@pytest.fixture( + params=[ + (np.float64, True), + (np.float32, True), + (np.uint64, False), + (np.uint32, False), + (np.uint16, False), + (np.uint8, False), + (np.int64, False), + (np.int32, False), + (np.int16, False), + (np.int8, False), + (np.object_, True), + (np.bool, False), + ] +) def dtype_can_hold_na(request): return request.param -@pytest.fixture(params=[ - (np.int8, np.int16(127), np.int8), - (np.int8, np.int16(128), np.int16), - (np.int32, 1, np.int32), - (np.int32, 2.0, np.float64), - (np.int32, 3.0 + 4.0j, np.complex128), - (np.int32, True, np.object_), - (np.int32, "", np.object_), - (np.float64, 1, np.float64), - (np.float64, 2.0, np.float64), - (np.float64, 3.0 + 4.0j, np.complex128), - (np.float64, True, np.object_), - (np.float64, "", np.object_), - (np.complex128, 1, np.complex128), - (np.complex128, 2.0, np.complex128), - (np.complex128, 3.0 + 4.0j, np.complex128), - (np.complex128, True, np.object_), - (np.complex128, "", np.object_), - (np.bool_, 1, np.object_), - (np.bool_, 2.0, np.object_), - (np.bool_, 3.0 + 4.0j, np.object_), - (np.bool_, True, np.bool_), - (np.bool_, '', np.object_), -]) +@pytest.fixture( + params=[ + (np.int8, np.int16(127), np.int8), + (np.int8, np.int16(128), np.int16), + (np.int32, 1, np.int32), + (np.int32, 2.0, np.float64), + (np.int32, 3.0 + 4.0j, np.complex128), + (np.int32, True, np.object_), + (np.int32, "", np.object_), + (np.float64, 1, np.float64), + (np.float64, 2.0, np.float64), + (np.float64, 3.0 + 4.0j, np.complex128), + (np.float64, True, np.object_), + (np.float64, "", np.object_), + (np.complex128, 1, np.complex128), + (np.complex128, 2.0, np.complex128), + (np.complex128, 3.0 + 4.0j, np.complex128), + (np.complex128, True, np.object_), + (np.complex128, "", np.object_), + (np.bool_, 1, np.object_), + (np.bool_, 2.0, np.object_), + (np.bool_, 3.0 + 4.0j, np.object_), + (np.bool_, True, np.bool_), + (np.bool_, "", np.object_), + ] +) def dtype_fill_out_dtype(request): return request.param @@ -102,15 +106,15 @@ def test_1d_fill_nonna(self, dtype_fill_out_dtype): indexer = [2, 1, 0, -1] result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) - assert (result[3] == fill_value) - assert (result.dtype == out_dtype) + assert (result[[0, 1, 2]] == data[[2, 1, 0]]).all() + assert result[3] == fill_value + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2, 3]] == data[indexer]).all()) - assert (result.dtype == dtype) + assert (result[[0, 1, 2, 3]] == data[indexer]).all() + assert result.dtype == dtype def test_2d_with_out(self, dtype_can_hold_na, writeable): dtype, can_hold_na = dtype_can_hold_na @@ 
-157,28 +161,24 @@ def test_2d_fill_nonna(self, dtype_fill_out_dtype): data = np.random.randint(0, 2, (5, 3)).astype(dtype) indexer = [2, 1, 0, -1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()) - assert ((result[3, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :] == data[[2, 1, 0], :]).all() + assert (result[3, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) - assert ((result[:, 3] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all() + assert (result[:, 3] == fill_value).all() + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2, 3], :] == data[indexer, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3]] == data[:, indexer]).all() + assert result.dtype == dtype def test_3d_with_out(self, dtype_can_hold_na): dtype, can_hold_na = dtype_can_hold_na @@ -237,39 +237,33 @@ def test_3d_fill_nonna(self, dtype_fill_out_dtype): data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) indexer = [2, 1, 0, -1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) - assert ((result[3, :, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all() + assert (result[3, :, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) - assert ((result[:, 3, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all() + assert (result[:, 3, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) - assert ((result[:, :, 3] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all() + assert (result[:, :, 3] == fill_value).all() + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert 
(result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all() + assert result.dtype == dtype def test_1d_other_dtypes(self): arr = np.random.randn(10).astype(np.float32) @@ -336,7 +330,7 @@ def test_2d_float32(self): tm.assert_almost_equal(result, expected) # this now accepts a float32! # test with float64 out buffer - out = np.empty((len(indexer), arr.shape[1]), dtype='float32') + out = np.empty((len(indexer), arr.shape[1]), dtype="float32") algos.take_nd(arr, indexer, out=out) # it works! # axis=1 @@ -352,7 +346,7 @@ def test_2d_float32(self): def test_2d_datetime64(self): # 2005/01/01 - 2006/01/01 arr = np.random.randint(11045376, 11360736, (5, 3)) * 100000000000 - arr = arr.view(dtype='datetime64[ns]') + arr = arr.view(dtype="datetime64[ns]") indexer = [0, 2, -1, 1, -1] # axis=0 @@ -365,11 +359,11 @@ def test_2d_datetime64(self): expected.view(np.int64)[[2, 4], :] = iNaT tm.assert_almost_equal(result, expected) - result = algos.take_nd(arr, indexer, axis=0, - fill_value=datetime(2007, 1, 1)) + result = algos.take_nd(arr, indexer, axis=0, fill_value=datetime(2007, 1, 1)) result2 = np.empty_like(result) - algos.take_nd(arr, indexer, out=result2, axis=0, - fill_value=datetime(2007, 1, 1)) + algos.take_nd( + arr, indexer, out=result2, axis=0, fill_value=datetime(2007, 1, 1) + ) tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=0) @@ -386,11 +380,11 @@ def test_2d_datetime64(self): expected.view(np.int64)[:, [2, 4]] = iNaT tm.assert_almost_equal(result, expected) - result = algos.take_nd(arr, indexer, axis=1, - fill_value=datetime(2007, 1, 1)) + result = algos.take_nd(arr, indexer, axis=1, fill_value=datetime(2007, 1, 1)) result2 = np.empty_like(result) - algos.take_nd(arr, indexer, out=result2, axis=1, - fill_value=datetime(2007, 1, 1)) + algos.take_nd( + arr, indexer, out=result2, axis=1, fill_value=datetime(2007, 1, 1) + ) tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=1) @@ -415,15 +409,13 @@ def test_take_axis_1(self): tm.assert_numpy_array_equal(result, expected) # allow_fill=True - result = algos.take(arr, [0, -1], axis=1, allow_fill=True, - fill_value=0) + result = algos.take(arr, [0, -1], axis=1, allow_fill=True, fill_value=0) expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]]) tm.assert_numpy_array_equal(result, expected) # GH#26976 make sure we validate along the correct axis with pytest.raises(IndexError, match="indices are out-of-bounds"): - algos.take(arr, [0, 3], axis=1, allow_fill=True, - fill_value=0) + algos.take(arr, [0, 3], axis=1, allow_fill=True, fill_value=0) class TestExtensionTake: @@ -447,7 +439,7 @@ def test_bounds_check_small(self): expected = np.array([1, 3, 2], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('allow_fill', [True, False]) + @pytest.mark.parametrize("allow_fill", [True, False]) def 
test_take_empty(self, allow_fill): arr = np.array([], dtype=np.int64) # empty take is ok @@ -458,9 +450,8 @@ def test_take_empty(self, allow_fill): algos.take(arr, [0], allow_fill=allow_fill) def test_take_na_empty(self): - result = algos.take(np.array([]), [-1, -1], allow_fill=True, - fill_value=0.0) - expected = np.array([0., 0.]) + result = algos.take(np.array([]), [-1, -1], allow_fill=True, fill_value=0.0) + expected = np.array([0.0, 0.0]) tm.assert_numpy_array_equal(result, expected) def test_take_coerces_list(self): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 8604acb1bd2b2c..27700d778df191 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -12,8 +12,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna) +from pandas import DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna from pandas.core.base import DataError, SpecificationError from pandas.core.sorting import safe_sort import pandas.core.window as rwindow @@ -36,20 +35,30 @@ def raw(request): return request.param -@pytest.fixture(params=['triang', 'blackman', 'hamming', 'bartlett', 'bohman', - 'blackmanharris', 'nuttall', 'barthann']) +@pytest.fixture( + params=[ + "triang", + "blackman", + "hamming", + "bartlett", + "bohman", + "blackmanharris", + "nuttall", + "barthann", + ] +) def win_types(request): return request.param -@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', - 'exponential']) +@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"]) def win_types_special(request): return request.param -@pytest.fixture(params=["sum", "mean", "median", "max", "min", - "var", "std", "kurt", "skew"]) +@pytest.fixture( + params=["sum", "mean", "median", "max", "min", "var", "std", "kurt", "skew"] +) def arithmetic_win_operators(request): return request.param @@ -66,12 +75,10 @@ def _create_data(self): self.arr = arr self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) - self.frame = DataFrame(randn(N, K), index=self.rng, - columns=np.arange(K)) + self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) class TestApi(Base): - def setup_method(self, method): self._create_data() @@ -85,205 +92,237 @@ def test_getitem(self): # technically this is allowed r = self.frame.rolling(window=5)[1, 3] - tm.assert_index_equal(r._selected_obj.columns, - self.frame.columns[[1, 3]]) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) r = self.frame.rolling(window=5)[[1, 3]] - tm.assert_index_equal(r._selected_obj.columns, - self.frame.columns[[1, 3]]) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) + df = DataFrame([[1, 2]], columns=["A", "B"]) g = df.rolling(window=5) with pytest.raises(KeyError, match="Columns not found: 'C'"): - g[['C']] - with pytest.raises(KeyError, match='^[^A]+$'): + g[["C"]] + with pytest.raises(KeyError, match="^[^A]+$"): # A should not be referenced as a bad column... # will have to rethink regex if you change message! 
- g[['A', 'C']] + g[["A", "C"]] def test_attribute_access(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) + df = DataFrame([[1, 2]], columns=["A", "B"]) r = df.rolling(window=5) - tm.assert_series_equal(r.A.sum(), r['A'].sum()) + tm.assert_series_equal(r.A.sum(), r["A"].sum()) msg = "'Rolling' object has no attribute 'F'" with pytest.raises(AttributeError, match=msg): r.F def tests_skip_nuisance(self): - df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'}) + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) - result = r[['A', 'B']].sum() - expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], - 'B': [np.nan, np.nan, 18, 21, 24]}, - columns=list('AB')) + result = r[["A", "B"]].sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) tm.assert_frame_equal(result, expected) def test_skip_sum_object_raises(self): - df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'}) + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) result = r.sum() - expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], - 'B': [np.nan, np.nan, 18, 21, 24]}, - columns=list('AB')) + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) tm.assert_frame_equal(result, expected) def test_agg(self): - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - a_mean = r['A'].mean() - a_std = r['A'].std() - a_sum = r['A'].sum() - b_mean = r['B'].mean() - b_std = r['B'].std() - b_sum = r['B'].sum() + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() result = r.aggregate([np.mean, np.std]) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', - 'std']]) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) - result = r.aggregate({'A': np.mean, 'B': np.std}) + result = r.aggregate({"A": np.mean, "B": np.std}) expected = concat([a_mean, b_std], axis=1) tm.assert_frame_equal(result, expected, check_like=True) - result = r.aggregate({'A': ['mean', 'std']}) + result = r.aggregate({"A": ["mean", "std"]}) expected = concat([a_mean, a_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', - 'std')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) tm.assert_frame_equal(result, expected) - result = r['A'].aggregate(['mean', 'sum']) + result = r["A"].aggregate(["mean", "sum"]) expected = concat([a_mean, a_sum], axis=1) - expected.columns = ['mean', 'sum'] + expected.columns = ["mean", "sum"] tm.assert_frame_equal(result, expected) with catch_warnings(record=True): # using a dict with renaming warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) tm.assert_frame_equal(result, expected, check_like=True) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({'A': {'mean': 'mean', - 'sum': 'sum'}, 
- 'B': {'mean2': 'mean', - 'sum2': 'sum'}}) + result = r.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')] + exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] expected.columns = pd.MultiIndex.from_tuples(exp_cols) tm.assert_frame_equal(result, expected, check_like=True) - result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) + result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')] + exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] expected.columns = pd.MultiIndex.from_tuples(exp_cols) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_apply(self, raw): # passed lambda - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - a_sum = r['A'].sum() + a_sum = r["A"].sum() - result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) - rcustom = r['B'].apply(lambda x: np.std(x, ddof=1), raw=raw) + result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw) expected = concat([a_sum, rcustom], axis=1) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_consistency(self): - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = pd.MultiIndex.from_product([list('AB'), ['sum', 'mean']]) + expected = pd.MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) - result = r['A'].agg([np.sum, np.mean]).columns - expected = Index(['sum', 'mean']) + result = r["A"].agg([np.sum, np.mean]).columns + expected = Index(["sum", "mean"]) tm.assert_index_equal(result, expected) - result = r.agg({'A': [np.sum, np.mean]}).columns - expected = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'mean')]) + result = r.agg({"A": [np.sum, np.mean]}).columns + expected = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "mean")]) tm.assert_index_equal(result, expected) def test_agg_nested_dicts(self): # API change for disallowing these types of nested dicts - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" with pytest.raises(SpecificationError, match=msg): - r.aggregate({'r1': {'A': ['mean', 'sum']}, - 'r2': {'B': ['mean', 'sum']}}) - - expected = concat([r['A'].mean(), r['A'].std(), - r['B'].mean(), r['B'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) + + expected = concat( + [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + result = r[["A", "B"]].agg( + {"A": {"ra": ["mean", 
"std"]}, "B": {"rb": ["mean", "std"]}} + ) tm.assert_frame_equal(result, expected, check_like=True) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) - expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), ( - 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) + result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("A", "ra", "mean"), + ("A", "ra", "std"), + ("B", "rb", "mean"), + ("B", "rb", "std"), + ] + ) tm.assert_frame_equal(result, expected, check_like=True) def test_count_nonnumeric_types(self): # GH12541 - cols = ['int', 'float', 'string', 'datetime', 'timedelta', 'periods', - 'fl_inf', 'fl_nan', 'str_nan', 'dt_nat', 'periods_nat'] + cols = [ + "int", + "float", + "string", + "datetime", + "timedelta", + "periods", + "fl_inf", + "fl_nan", + "str_nan", + "dt_nat", + "periods_nat", + ] df = DataFrame( - {'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'datetime': pd.date_range('20170101', periods=3), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s'), - 'periods': [pd.Period('2012-01'), pd.Period('2012-02'), - pd.Period('2012-03')], - 'fl_inf': [1., 2., np.Inf], - 'fl_nan': [1., 2., np.NaN], - 'str_nan': ['aa', 'bb', np.NaN], - 'dt_nat': [Timestamp('20170101'), Timestamp('20170203'), - Timestamp(None)], - 'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'), - pd.Period(None)]}, - columns=cols) + { + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "datetime": pd.date_range("20170101", periods=3), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + "periods": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period("2012-03"), + ], + "fl_inf": [1.0, 2.0, np.Inf], + "fl_nan": [1.0, 2.0, np.NaN], + "str_nan": ["aa", "bb", np.NaN], + "dt_nat": [ + Timestamp("20170101"), + Timestamp("20170203"), + Timestamp(None), + ], + "periods_nat": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period(None), + ], + }, + columns=cols, + ) expected = DataFrame( - {'int': [1., 2., 2.], - 'float': [1., 2., 2.], - 'string': [1., 2., 2.], - 'datetime': [1., 2., 2.], - 'timedelta': [1., 2., 2.], - 'periods': [1., 2., 2.], - 'fl_inf': [1., 2., 2.], - 'fl_nan': [1., 2., 1.], - 'str_nan': [1., 2., 1.], - 'dt_nat': [1., 2., 1.], - 'periods_nat': [1., 2., 1.]}, - columns=cols) + { + "int": [1.0, 2.0, 2.0], + "float": [1.0, 2.0, 2.0], + "string": [1.0, 2.0, 2.0], + "datetime": [1.0, 2.0, 2.0], + "timedelta": [1.0, 2.0, 2.0], + "periods": [1.0, 2.0, 2.0], + "fl_inf": [1.0, 2.0, 2.0], + "fl_nan": [1.0, 2.0, 1.0], + "str_nan": [1.0, 2.0, 1.0], + "dt_nat": [1.0, 2.0, 1.0], + "periods_nat": [1.0, 2.0, 1.0], + }, + columns=cols, + ) result = df.rolling(window=2).count() tm.assert_frame_equal(result, expected) @@ -296,12 +335,12 @@ def test_count_nonnumeric_types(self): @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_window_with_args(self): # make sure that we are aggregating window functions correctly with arg - r = Series(np.random.randn(100)).rolling(window=10, min_periods=1, - win_type='gaussian') - expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1) - expected.columns = ['', ''] - result = r.aggregate([lambda x: x.mean(std=10), - lambda x: x.mean(std=.01)]) + r = Series(np.random.randn(100)).rolling( + window=10, min_periods=1, win_type="gaussian" + ) + expected = 
concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["", ""] + result = r.aggregate([lambda x: x.mean(std=10), lambda x: x.mean(std=0.01)]) tm.assert_frame_equal(result, expected) def a(x): @@ -310,77 +349,95 @@ def a(x): def b(x): return x.mean(std=0.01) - expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1) - expected.columns = ['a', 'b'] + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["a", "b"] result = r.aggregate([a, b]) tm.assert_frame_equal(result, expected) def test_preserve_metadata(self): # GH 10565 - s = Series(np.arange(100), name='foo') + s = Series(np.arange(100), name="foo") s2 = s.rolling(30).sum() s3 = s.rolling(20).sum() - assert s2.name == 'foo' - assert s3.name == 'foo' - - @pytest.mark.parametrize("func,window_size,expected_vals", [ - ('rolling', 2, [[np.nan, np.nan, np.nan, np.nan], - [15., 20., 25., 20.], - [25., 30., 35., 30.], - [np.nan, np.nan, np.nan, np.nan], - [20., 30., 35., 30.], - [35., 40., 60., 40.], - [60., 80., 85., 80]]), - ('expanding', None, [[10., 10., 20., 20.], - [15., 20., 25., 20.], - [20., 30., 30., 20.], - [10., 10., 30., 30.], - [20., 30., 35., 30.], - [26.666667, 40., 50., 30.], - [40., 80., 60., 30.]])]) + assert s2.name == "foo" + assert s3.name == "foo" + + @pytest.mark.parametrize( + "func,window_size,expected_vals", + [ + ( + "rolling", + 2, + [ + [np.nan, np.nan, np.nan, np.nan], + [15.0, 20.0, 25.0, 20.0], + [25.0, 30.0, 35.0, 30.0], + [np.nan, np.nan, np.nan, np.nan], + [20.0, 30.0, 35.0, 30.0], + [35.0, 40.0, 60.0, 40.0], + [60.0, 80.0, 85.0, 80], + ], + ), + ( + "expanding", + None, + [ + [10.0, 10.0, 20.0, 20.0], + [15.0, 20.0, 25.0, 20.0], + [20.0, 30.0, 30.0, 20.0], + [10.0, 10.0, 30.0, 30.0], + [20.0, 30.0, 35.0, 30.0], + [26.666667, 40.0, 50.0, 30.0], + [40.0, 80.0, 60.0, 30.0], + ], + ), + ], + ) def test_multiple_agg_funcs(self, func, window_size, expected_vals): # GH 15072 - df = pd.DataFrame([ - ['A', 10, 20], - ['A', 20, 30], - ['A', 30, 40], - ['B', 10, 30], - ['B', 30, 40], - ['B', 40, 80], - ['B', 80, 90]], columns=['stock', 'low', 'high']) - - f = getattr(df.groupby('stock'), func) + df = pd.DataFrame( + [ + ["A", 10, 20], + ["A", 20, 30], + ["A", 30, 40], + ["B", 10, 30], + ["B", 30, 40], + ["B", 40, 80], + ["B", 80, 90], + ], + columns=["stock", "low", "high"], + ) + + f = getattr(df.groupby("stock"), func) if window_size: window = f(window_size) else: window = f() - index = pd.MultiIndex.from_tuples([ - ('A', 0), ('A', 1), ('A', 2), - ('B', 3), ('B', 4), ('B', 5), ('B', 6)], names=['stock', None]) - columns = pd.MultiIndex.from_tuples([ - ('low', 'mean'), ('low', 'max'), ('high', 'mean'), - ('high', 'min')]) + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)], + names=["stock", None], + ) + columns = pd.MultiIndex.from_tuples( + [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] + ) expected = pd.DataFrame(expected_vals, index=index, columns=columns) - result = window.agg(OrderedDict(( - ('low', ['mean', 'max']), - ('high', ['mean', 'min']), - ))) + result = window.agg( + OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) + ) tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestWindow(Base): - def setup_method(self, method): self._create_data() @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def 
test_constructor(self, which): # GH 12669 @@ -388,32 +445,30 @@ def test_constructor(self, which): c = o.rolling # valid - c(win_type='boxcar', window=2, min_periods=1) - c(win_type='boxcar', window=2, min_periods=1, center=True) - c(win_type='boxcar', window=2, min_periods=1, center=False) + c(win_type="boxcar", window=2, min_periods=1) + c(win_type="boxcar", window=2, min_periods=1, center=True) + c(win_type="boxcar", window=2, min_periods=1, center=False) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=w) + c(win_type="boxcar", window=2, min_periods=w) with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=1, center=w) + c(win_type="boxcar", window=2, min_periods=1, center=w) - for wt in ['foobar', 1]: + for wt in ["foobar", 1]: with pytest.raises(ValueError): c(win_type=wt, window=2) @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor_with_win_type(self, which, win_types): # GH 12669 o = getattr(self, which) c = o.rolling c(win_type=win_types, window=2) - @pytest.mark.parametrize( - 'method', ['sum', 'mean']) + @pytest.mark.parametrize("method", ["sum", "mean"]) def test_numpy_compat(self, method): # see gh-12811 w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) @@ -427,19 +482,17 @@ def test_numpy_compat(self, method): class TestRolling(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.rolling(2).sum() df.rolling(2, min_periods=1).sum() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): # GH 12669 @@ -458,7 +511,7 @@ def test_constructor(self, which): c(-1) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): c(window=w) with pytest.raises(ValueError): @@ -467,50 +520,53 @@ def test_constructor(self, which): c(window=2, min_periods=1, center=w) @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling with pytest.raises(ValueError): - c(-1, win_type='boxcar') + c(-1, win_type="boxcar") - @pytest.mark.parametrize( - 'window', [timedelta(days=3), pd.Timedelta(days=3)]) + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) def test_constructor_with_timedelta_window(self, window): # GH 15440 n = 10 - df = DataFrame({'value': np.arange(n)}, - index=pd.date_range('2015-12-24', periods=n, freq="D")) - expected_data = np.append([0., 1.], np.arange(3., 27., 3)) + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) result = df.rolling(window=window).sum() - expected = DataFrame({'value': expected_data}, - index=pd.date_range('2015-12-24', periods=n, - freq="D")) + expected = DataFrame( + {"value": expected_data}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) tm.assert_frame_equal(result, expected) - expected = df.rolling('3D').sum() + expected = df.rolling("3D").sum() tm.assert_frame_equal(result, expected) - 
@pytest.mark.parametrize( - 'window', [timedelta(days=3), pd.Timedelta(days=3), '3D']) + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3), "3D"]) def test_constructor_timedelta_window_and_minperiods(self, window, raw): # GH 15305 n = 10 - df = DataFrame({'value': np.arange(n)}, - index=pd.date_range('2017-08-08', periods=n, freq="D")) + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) expected = DataFrame( - {'value': np.append([np.NaN, 1.], np.arange(3., 27., 3))}, - index=pd.date_range('2017-08-08', periods=n, freq="D")) + {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) result_roll_sum = df.rolling(window=window, min_periods=2).sum() - result_roll_generic = df.rolling(window=window, - min_periods=2).apply(sum, raw=raw) + result_roll_generic = df.rolling(window=window, min_periods=2).apply( + sum, raw=raw + ) tm.assert_frame_equal(result_roll_sum, expected) tm.assert_frame_equal(result_roll_generic, expected) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) @@ -523,108 +579,116 @@ def test_numpy_compat(self, method): getattr(r, method)(dtype=np.float64) def test_closed(self): - df = DataFrame({'A': [0, 1, 2, 3, 4]}) + df = DataFrame({"A": [0, 1, 2, 3, 4]}) # closed only allowed for datetimelike with pytest.raises(ValueError): - df.rolling(window=3, closed='neither') + df.rolling(window=3, closed="neither") @pytest.mark.parametrize("closed", ["neither", "left"]) def test_closed_empty(self, closed, arithmetic_win_operators): # GH 26005 func_name = arithmetic_win_operators - ser = pd.Series(data=np.arange(5), - index=pd.date_range("2000", periods=5, freq="2D")) + ser = pd.Series( + data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") + ) roll = ser.rolling("1D", closed=closed) result = getattr(roll, func_name)() expected = pd.Series([np.nan] * 5, index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("func", ['min', 'max']) + @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry(self, func): # GH24718 - ser = pd.Series(data=[2], index=pd.date_range('2000', periods=1)) - result = getattr(ser.rolling('10D', closed='left'), func)() + ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) + result = getattr(ser.rolling("10D", closed="left"), func)() tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) - @pytest.mark.parametrize("func", ['min', 'max']) + @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry_groupby(self, func): # GH24718 - ser = pd.DataFrame(data={'A': [1, 1, 2], 'B': [3, 2, 1]}, - index=pd.date_range('2000', periods=3)) + ser = pd.DataFrame( + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=pd.date_range("2000", periods=3), + ) result = getattr( - ser.groupby('A', sort=False)['B'].rolling('10D', closed='left'), - func)() - exp_idx = pd.MultiIndex.from_arrays(arrays=[[1, 1, 2], ser.index], - names=('A', None)) - expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name='B') + ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func + )() + exp_idx = pd.MultiIndex.from_arrays( + arrays=[[1, 1, 2], ser.index], names=("A", None) + ) + expected = pd.Series(data=[np.nan, 3, 
np.nan], index=exp_idx, name="B") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("input_dtype", ['int', 'float']) - @pytest.mark.parametrize("func,closed,expected", [ - ('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), - ('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]) - ]) - def test_closed_min_max_datetime(self, input_dtype, - func, closed, - expected): + @pytest.mark.parametrize("input_dtype", ["int", "float"]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ], + ) + def test_closed_min_max_datetime(self, input_dtype, func, closed, expected): # see gh-21704 - ser = pd.Series(data=np.arange(10).astype(input_dtype), - index=pd.date_range('2000', periods=10)) + ser = pd.Series( + data=np.arange(10).astype(input_dtype), + index=pd.date_range("2000", periods=10), + ) - result = getattr(ser.rolling('3D', closed=closed), func)() + result = getattr(ser.rolling("3D", closed=closed), func)() expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) def test_closed_uneven(self): # see gh-21704 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) # uneven ser = ser.drop(index=ser.index[[1, 5]]) - result = ser.rolling('3D', closed='left').min() - expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], - index=ser.index) + result = ser.rolling("3D", closed="left").min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("func,closed,expected", [ - ('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), - ('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), - ('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), - ('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]) - ]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]), + ], + ) def 
test_closed_min_max_minp(self, func, closed, expected): # see gh-21704 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) ser[ser.index[-3:]] = np.nan - result = getattr(ser.rolling('3D', min_periods=2, closed=closed), - func)() + result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("closed,expected", [ - ('right', [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), - ('both', [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ('neither', [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ('left', [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]) - ]) + @pytest.mark.parametrize( + "closed,expected", + [ + ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]), + ], + ) def test_closed_median_quantile(self, closed, expected): # GH 26005 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) - roll = ser.rolling('3D', closed=closed) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + roll = ser.rolling("3D", closed=closed) expected = pd.Series(expected, index=ser.index) result = roll.median() @@ -633,7 +697,7 @@ def test_closed_median_quantile(self, closed, expected): result = roll.quantile(0.5) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('roller', ['1s', 1]) + @pytest.mark.parametrize("roller", ["1s", 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be # applied to empty DataFrames @@ -673,9 +737,12 @@ def test_missing_minp_zero(self): def test_missing_minp_zero_variable(self): # https://github.com/pandas-dev/pandas/pull/18921 - x = pd.Series([np.nan] * 4, - index=pd.DatetimeIndex(['2017-01-01', '2017-01-04', - '2017-01-06', '2017-01-07'])) + x = pd.Series( + [np.nan] * 4, + index=pd.DatetimeIndex( + ["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"] + ), + ) result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() expected = pd.Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -683,15 +750,16 @@ def test_missing_minp_zero_variable(self): def test_multi_index_names(self): # GH 16789, 16825 - cols = pd.MultiIndex.from_product([['A', 'B'], ['C', 'D', 'E']], - names=['1', '2']) + cols = pd.MultiIndex.from_product( + [["A", "B"], ["C", "D", "E"]], names=["1", "2"] + ) df = DataFrame(np.ones((10, 6)), columns=cols) result = df.rolling(3).cov() tm.assert_index_equal(result.columns, df.columns) - assert result.index.names == [None, '1', '2'] + assert result.index.names == [None, "1", "2"] - @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window @@ -705,47 +773,40 @@ def test_rolling_axis_sum(self, axis_frame): axis = df._get_axis_number(axis_frame) if axis == 0: - expected = DataFrame({ - i: [np.nan] * 2 + [3.0] * 8 - for i in range(20) - }) + expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) else: # axis == 1 - expected = DataFrame([ - [np.nan] * 2 + [3.0] * 18 - ] * 10) + expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) result = 
df.rolling(3, axis=axis_frame).sum() tm.assert_frame_equal(result, expected) def test_rolling_axis_count(self, axis_frame): # see gh-26055 - df = DataFrame({'x': range(3), 'y': range(3)}) + df = DataFrame({"x": range(3), "y": range(3)}) axis = df._get_axis_number(axis_frame) - if axis in [0, 'index']: - expected = DataFrame({'x': [1.0, 2.0, 2.0], 'y': [1.0, 2.0, 2.0]}) + if axis in [0, "index"]: + expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) else: - expected = DataFrame({'x': [1.0, 1.0, 1.0], 'y': [2.0, 2.0, 2.0]}) + expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) result = df.rolling(2, axis=axis_frame).count() tm.assert_frame_equal(result, expected) class TestExpanding(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.expanding(2).sum() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): # GH 12669 @@ -758,14 +819,13 @@ def test_constructor(self, which): c(min_periods=1, center=False) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): c(min_periods=w) with pytest.raises(ValueError): c(min_periods=1, center=w) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 e = rwindow.Expanding(Series([2, 4, 6]), window=2) @@ -778,10 +838,17 @@ def test_numpy_compat(self, method): getattr(e, method)(dtype=np.float64) @pytest.mark.parametrize( - 'expander', - [1, pytest.param('ls', marks=pytest.mark.xfail( - reason='GH#16425 expanding with ' - 'offset not supported'))]) + "expander", + [ + 1, + pytest.param( + "ls", + marks=pytest.mark.xfail( + reason="GH#16425 expanding with " "offset not supported" + ), + ), + ], + ) def test_empty_df_expanding(self, expander): # GH 15819 Verifies that datetime and integer expanding windows can be # applied to empty DataFrames @@ -793,8 +860,7 @@ def test_empty_df_expanding(self, expander): # Verifies that datetime and integer expanding windows can be applied # to empty DataFrames with datetime index expected = DataFrame(index=pd.DatetimeIndex([])) - result = DataFrame( - index=pd.DatetimeIndex([])).expanding(expander).sum() + result = DataFrame(index=pd.DatetimeIndex([])).expanding(expander).sum() tm.assert_frame_equal(result, expected) def test_missing_minp_zero(self): @@ -810,7 +876,7 @@ def test_missing_minp_zero(self): expected = pd.Series([np.nan]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window @@ -824,33 +890,28 @@ def test_expanding_axis(self, axis_frame): axis = df._get_axis_number(axis_frame) if axis == 0: - expected = DataFrame({ - i: [np.nan] * 2 + [float(j) for j in range(3, 11)] - for i in range(20) - }) + expected = DataFrame( + {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} + ) else: # axis == 1 - expected = DataFrame([ - [np.nan] * 2 + [float(i) for i in range(3, 21)] - ] * 10) + expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) result = df.expanding(3, 
axis=axis_frame).sum() tm.assert_frame_equal(result, expected) class TestEWM(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.ewm(com=0.5).mean() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): o = getattr(self, which) c = o.ewm @@ -889,8 +950,7 @@ def test_constructor(self, which): with pytest.raises(ValueError): c(alpha=alpha) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "var"]) def test_numpy_compat(self, method): # see gh-12811 e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) @@ -915,64 +975,78 @@ class Dtype: window = 2 funcs = { - 'count': lambda v: v.count(), - 'max': lambda v: v.max(), - 'min': lambda v: v.min(), - 'sum': lambda v: v.sum(), - 'mean': lambda v: v.mean(), - 'std': lambda v: v.std(), - 'var': lambda v: v.var(), - 'median': lambda v: v.median() + "count": lambda v: v.count(), + "max": lambda v: v.max(), + "min": lambda v: v.min(), + "sum": lambda v: v.sum(), + "mean": lambda v: v.mean(), + "std": lambda v: v.std(), + "var": lambda v: v.var(), + "median": lambda v: v.median(), } def get_expects(self): expects = { - 'sr1': { - 'count': Series([1, 2, 2, 2, 2], dtype='float64'), - 'max': Series([np.nan, 1, 2, 3, 4], dtype='float64'), - 'min': Series([np.nan, 0, 1, 2, 3], dtype='float64'), - 'sum': Series([np.nan, 1, 3, 5, 7], dtype='float64'), - 'mean': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64'), - 'std': Series([np.nan] + [np.sqrt(.5)] * 4, dtype='float64'), - 'var': Series([np.nan, .5, .5, .5, .5], dtype='float64'), - 'median': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64') + "sr1": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), + "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), + "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), + "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), }, - 'sr2': { - 'count': Series([1, 2, 2, 2, 2], dtype='float64'), - 'max': Series([np.nan, 10, 8, 6, 4], dtype='float64'), - 'min': Series([np.nan, 8, 6, 4, 2], dtype='float64'), - 'sum': Series([np.nan, 18, 14, 10, 6], dtype='float64'), - 'mean': Series([np.nan, 9, 7, 5, 3], dtype='float64'), - 'std': Series([np.nan] + [np.sqrt(2)] * 4, dtype='float64'), - 'var': Series([np.nan, 2, 2, 2, 2], dtype='float64'), - 'median': Series([np.nan, 9, 7, 5, 3], dtype='float64') + "sr2": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), + "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), + "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), + "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), + "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + }, + "df": { + "count": DataFrame( + {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, + dtype="float64", + ), + "max": DataFrame( + {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, + dtype="float64", + ), + "min": DataFrame( + {0: 
Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, + dtype="float64", + ), + "sum": DataFrame( + { + 0: Series([np.nan, 2, 6, 10, 14]), + 1: Series([np.nan, 4, 8, 12, 16]), + }, + dtype="float64", + ), + "mean": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + "std": DataFrame( + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), + }, + dtype="float64", + ), + "var": DataFrame( + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + dtype="float64", + ), + "median": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), }, - 'df': { - 'count': DataFrame({0: Series([1, 2, 2, 2, 2]), - 1: Series([1, 2, 2, 2, 2])}, - dtype='float64'), - 'max': DataFrame({0: Series([np.nan, 2, 4, 6, 8]), - 1: Series([np.nan, 3, 5, 7, 9])}, - dtype='float64'), - 'min': DataFrame({0: Series([np.nan, 0, 2, 4, 6]), - 1: Series([np.nan, 1, 3, 5, 7])}, - dtype='float64'), - 'sum': DataFrame({0: Series([np.nan, 2, 6, 10, 14]), - 1: Series([np.nan, 4, 8, 12, 16])}, - dtype='float64'), - 'mean': DataFrame({0: Series([np.nan, 1, 3, 5, 7]), - 1: Series([np.nan, 2, 4, 6, 8])}, - dtype='float64'), - 'std': DataFrame({0: Series([np.nan] + [np.sqrt(2)] * 4), - 1: Series([np.nan] + [np.sqrt(2)] * 4)}, - dtype='float64'), - 'var': DataFrame({0: Series([np.nan, 2, 2, 2, 2]), - 1: Series([np.nan, 2, 2, 2, 2])}, - dtype='float64'), - 'median': DataFrame({0: Series([np.nan, 1, 3, 5, 7]), - 1: Series([np.nan, 2, 4, 6, 8])}, - dtype='float64'), - } } return expects @@ -981,11 +1055,7 @@ def _create_dtype_data(self, dtype): sr2 = Series(np.arange(10, 0, -2), dtype=dtype) df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - data = { - 'sr1': sr1, - 'sr2': sr2, - 'df': df - } + data = {"sr1": sr1, "sr2": sr2, "df": df} return data @@ -1069,27 +1139,23 @@ class TestDtype_float64(Dtype_float): class TestDtype_category(Dtype): - dtype = 'category' + dtype = "category" include_df = False def _create_dtype_data(self, dtype): sr1 = Series(range(5), dtype=dtype) sr2 = Series(range(10, 0, -2), dtype=dtype) - data = { - 'sr1': sr1, - 'sr2': sr2 - } + data = {"sr1": sr1, "sr2": sr2} return data class DatetimeLike(Dtype): - def check_dtypes(self, f, f_name, d, d_name, exp): roll = d.rolling(window=self.window) - if f_name == 'count': + if f_name == "count": result = f(roll) tm.assert_almost_equal(result, exp) @@ -1099,24 +1165,25 @@ def check_dtypes(self, f, f_name, d, d_name, exp): class TestDtype_timedelta(DatetimeLike): - dtype = np.dtype('m8[ns]') + dtype = np.dtype("m8[ns]") class TestDtype_datetime(DatetimeLike): - dtype = np.dtype('M8[ns]') + dtype = np.dtype("M8[ns]") class TestDtype_datetime64UTC(DatetimeLike): - dtype = 'datetime64[ns, UTC]' + dtype = "datetime64[ns, UTC]" def _create_data(self): - pytest.skip("direct creation of extension dtype " - "datetime64[ns, UTC] is not supported ATM") + pytest.skip( + "direct creation of extension dtype " + "datetime64[ns, UTC] is not supported ATM" + ) @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestMoments(Base): - def setup_method(self, method): self._create_data() @@ -1130,46 +1197,69 @@ def test_centered_axis_validation(self): Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() # ok ok - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, - axis=0).mean() - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, - axis=1).mean() + DataFrame(np.ones((10, 
10))).rolling(window=3, center=True, axis=0).mean() + DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() # bad axis with pytest.raises(ValueError): - (DataFrame(np.ones((10, 10))) - .rolling(window=3, center=True, axis=2).mean()) + (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) def test_rolling_sum(self): - self._check_moment_func(np.nansum, name='sum', - zero_min_periods_equal=False) + self._check_moment_func(np.nansum, name="sum", zero_min_periods_equal=False) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() - self._check_moment_func(counter, name='count', has_min_periods=False, - fill_value=0) + self._check_moment_func( + counter, name="count", has_min_periods=False, fill_value=0 + ) def test_rolling_mean(self): - self._check_moment_func(np.mean, name='mean') + self._check_moment_func(np.mean, name="mean") @td.skip_if_no_scipy def test_cmov_mean(self): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) result = Series(vals).rolling(5, center=True).mean() - expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, - 12.818, 12.952, np.nan, np.nan]) + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) tm.assert_series_equal(expected, result) @td.skip_if_no_scipy def test_cmov_window(self): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) - result = Series(vals).rolling(5, win_type='boxcar', center=True).mean() - expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, - 12.818, 12.952, np.nan, np.nan]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + result = Series(vals).rolling(5, win_type="boxcar", center=True).mean() + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) tm.assert_series_equal(expected, result) @td.skip_if_no_scipy @@ -1177,49 +1267,78 @@ def test_cmov_window_corner(self): # GH 8238 # all nan vals = pd.Series([np.nan] * 10) - result = vals.rolling(5, center=True, win_type='boxcar').mean() + result = vals.rolling(5, center=True, win_type="boxcar").mean() assert np.isnan(result).all() # empty vals = pd.Series([]) - result = vals.rolling(5, center=True, win_type='boxcar').mean() + result = vals.rolling(5, center=True, win_type="boxcar").mean() assert len(result) == 0 # shorter than window vals = pd.Series(np.random.randn(5)) - result = vals.rolling(10, win_type='boxcar').mean() + result = vals.rolling(10, win_type="boxcar").mean() assert np.isnan(result).all() assert len(result) == 5 @td.skip_if_no_scipy def test_cmov_window_frame(self): # Gh 8238 - vals = np.array([[12.18, 3.64], [10.18, 9.16], [13.24, 14.61], - [4.51, 8.11], [6.15, 11.44], [9.14, 6.21], - [11.31, 10.67], [2.94, 6.51], [9.42, 8.39], [12.44, - 7.34]]) - - xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [9.252, 9.392], - [8.644, 9.906], [8.87, 10.208], [6.81, 8.588], - [7.792, 8.644], [9.05, 7.824], [np.nan, np.nan - ], [np.nan, np.nan]]) + vals = np.array( + [ + [12.18, 3.64], + [10.18, 9.16], + [13.24, 14.61], + [4.51, 8.11], + [6.15, 11.44], + [9.14, 6.21], + [11.31, 10.67], + [2.94, 6.51], + [9.42, 8.39], + [12.44, 7.34], + ] + ) + + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [9.252, 9.392], + 
[8.644, 9.906], + [8.87, 10.208], + [6.81, 8.588], + [7.792, 8.644], + [9.05, 7.824], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) # DataFrame - rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean() + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() tm.assert_frame_equal(DataFrame(xp), rs) # invalid method with pytest.raises(AttributeError): - (DataFrame(vals).rolling(5, win_type='boxcar', center=True) - .std()) + (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) # sum - xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [46.26, 46.96], - [43.22, 49.53], [44.35, 51.04], [34.05, 42.94], - [38.96, 43.22], [45.25, 39.12], [np.nan, np.nan - ], [np.nan, np.nan]]) - - rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum() + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [46.26, 46.96], + [43.22, 49.53], + [44.35, 51.04], + [34.05, 42.94], + [38.96, 43.22], + [45.25, 39.12], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) + + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() tm.assert_frame_equal(DataFrame(xp), rs) @td.skip_if_no_scipy @@ -1230,32 +1349,112 @@ def test_cmov_window_na_min_periods(self): vals[8] = np.nan xp = vals.rolling(5, min_periods=4, center=True).mean() - rs = vals.rolling(5, win_type='boxcar', min_periods=4, - center=True).mean() + rs = vals.rolling(5, win_type="boxcar", min_periods=4, center=True).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_regular(self, win_types): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) xps = { - 'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, 14.03687, - 13.8567, 11.81473, np.nan, np.nan], - 'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, 13.33889, - 13.38, 12.33667, np.nan, np.nan], - 'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675, - 14.0825, 11.5675, np.nan, np.nan], - 'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, 14.17267, - 14.65923, 11.10401, np.nan, np.nan], - 'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052, - 14.02156, 15.10512, 10.74574, np.nan, np.nan], - 'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, 14.03559, - 15.05657, 10.78514, np.nan, np.nan], - 'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, 14.20036, - 14.57726, 11.16988, np.nan, np.nan], - 'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675, - 14.0825, 11.5675, np.nan, np.nan] + "hamming": [ + np.nan, + np.nan, + 8.71384, + 9.56348, + 12.38009, + 14.03687, + 13.8567, + 11.81473, + np.nan, + np.nan, + ], + "triang": [ + np.nan, + np.nan, + 9.28667, + 10.34667, + 12.00556, + 13.33889, + 13.38, + 12.33667, + np.nan, + np.nan, + ], + "barthann": [ + np.nan, + np.nan, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], + "bohman": [ + np.nan, + np.nan, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 14.65923, + 11.10401, + np.nan, + np.nan, + ], + "blackmanharris": [ + np.nan, + np.nan, + 6.97691, + 9.16438, + 13.05052, + 14.02156, + 15.10512, + 10.74574, + np.nan, + np.nan, + ], + "nuttall": [ + np.nan, + np.nan, + 7.04618, + 9.16786, + 13.02671, + 14.03559, + 15.05657, + 10.78514, + np.nan, + np.nan, + ], + "blackman": [ + np.nan, + np.nan, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 14.57726, + 11.16988, + np.nan, + np.nan, + ], + "bartlett": [ + np.nan, + np.nan, + 8.4425, + 
9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], } xp = Series(xps[win_types]) @@ -1277,25 +1476,106 @@ def test_cmov_window_regular_linear_range(self, win_types): @td.skip_if_no_scipy def test_cmov_window_regular_missing_data(self, win_types): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] + ) xps = { - 'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925, - 12.5575, 14.3675, 15.61667, 13.655], - 'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, 9.17869, - 12.79607, 14.20036, 15.8706, 13.655], - 'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925, - 12.5575, 14.3675, 15.61667, 13.655], - 'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, 9.1764, - 12.83559, 14.17267, 15.90976, 13.655], - 'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, 9.56348, - 12.38009, 14.20565, 15.24694, 13.69758], - 'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, 9.16786, - 13.02671, 14.03673, 16.08759, 13.65553], - 'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, 10.34667, - 12.00556, 13.82125, 14.49429, 13.765], - 'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691, - 9.16438, 13.05052, 14.02175, 16.1098, 13.65509] + "bartlett": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "blackman": [ + np.nan, + np.nan, + 9.04582, + 11.41536, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 15.8706, + 13.655, + ], + "barthann": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "bohman": [ + np.nan, + np.nan, + 8.9444, + 11.56327, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 15.90976, + 13.655, + ], + "hamming": [ + np.nan, + np.nan, + 9.59321, + 10.29694, + 8.71384, + 9.56348, + 12.38009, + 14.20565, + 15.24694, + 13.69758, + ], + "nuttall": [ + np.nan, + np.nan, + 8.47693, + 12.2821, + 7.04618, + 9.16786, + 13.02671, + 14.03673, + 16.08759, + 13.65553, + ], + "triang": [ + np.nan, + np.nan, + 9.33167, + 9.76125, + 9.28667, + 10.34667, + 12.00556, + 13.82125, + 14.49429, + 13.765, + ], + "blackmanharris": [ + np.nan, + np.nan, + 8.42526, + 12.36824, + 6.97691, + 9.16438, + 13.05052, + 14.02175, + 16.1098, + 13.65509, + ], } xp = Series(xps[win_types]) @@ -1306,40 +1586,85 @@ def test_cmov_window_regular_missing_data(self, win_types): def test_cmov_window_special(self, win_types_special): # GH 8238 kwds = { - 'kaiser': {'beta': 1.}, - 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'exponential': {'tau': 10}} + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "exponential": {"tau": 10}, + } - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) xps = { - 'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, 13.89053, - 13.65671, 12.01002, np.nan, np.nan], - 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161, - 13.08516, 12.95111, 12.74577, np.nan, np.nan], - 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129, - 12.90702, 12.83757, np.nan, np.nan], - 'exponential': [np.nan, np.nan, 9.83364, 11.10472, 11.64551, - 12.66138, 12.92379, 12.83770, np.nan, np.nan], + "gaussian": [ + np.nan, + np.nan, + 8.97297, + 9.76077, + 12.24763, + 
13.89053, + 13.65671, + 12.01002, + np.nan, + np.nan, + ], + "general_gaussian": [ + np.nan, + np.nan, + 9.85011, + 10.71589, + 11.73161, + 13.08516, + 12.95111, + 12.74577, + np.nan, + np.nan, + ], + "kaiser": [ + np.nan, + np.nan, + 9.86851, + 11.02969, + 11.65161, + 12.75129, + 12.90702, + 12.83757, + np.nan, + np.nan, + ], + "exponential": [ + np.nan, + np.nan, + 9.83364, + 11.10472, + 11.64551, + 12.66138, + 12.92379, + 12.83770, + np.nan, + np.nan, + ], } xp = Series(xps[win_types_special]) - rs = Series(vals).rolling( - 5, win_type=win_types_special, center=True).mean( - **kwds[win_types_special]) + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_special_linear_range(self, win_types_special): # GH 8238 kwds = { - 'kaiser': {'beta': 1.}, - 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'slepian': {'width': 0.5}, - 'exponential': {'tau': 10}} + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "slepian": {"width": 0.5}, + "exponential": {"tau": 10}, + } vals = np.array(range(10), dtype=np.float) xp = vals.copy() @@ -1347,16 +1672,18 @@ def test_cmov_window_special_linear_range(self, win_types_special): xp[-2:] = np.nan xp = Series(xp) - rs = Series(vals).rolling( - 5, win_type=win_types_special, center=True).mean( - **kwds[win_types_special]) + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) tm.assert_series_equal(xp, rs) def test_rolling_median(self): - self._check_moment_func(np.median, name='median') + self._check_moment_func(np.median, name="median") def test_rolling_min(self): - self._check_moment_func(np.min, name='min') + self._check_moment_func(np.min, name="min") a = pd.Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1).min() @@ -1367,7 +1694,7 @@ def test_rolling_min(self): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() def test_rolling_max(self): - self._check_moment_func(np.max, name='max') + self._check_moment_func(np.max, name="max") a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() @@ -1376,13 +1703,12 @@ def test_rolling_max(self): with pytest.raises(ValueError): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() - @pytest.mark.parametrize('q', [0.0, .1, .5, .9, 1.0]) + @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) def test_rolling_quantile(self, q): - def scoreatpercentile(a, per): values = np.sort(a, axis=0) - idx = int(per / 1. 
* (values.shape[0] - 1)) + idx = int(per / 1.0 * (values.shape[0] - 1)) if idx == values.shape[0] - 1: retval = values[-1] @@ -1399,15 +1725,14 @@ def scoreatpercentile(a, per): def quantile_func(x): return scoreatpercentile(x, q) - self._check_moment_func(quantile_func, name='quantile', - quantile=q) + self._check_moment_func(quantile_func, name="quantile", quantile=q) def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile row = 10 col = 5 - idx = pd.date_range('20100101', periods=row, freq='B') + idx = pd.date_range("20100101", periods=row, freq="B") df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx) df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) @@ -1415,24 +1740,31 @@ def test_rolling_quantile_np_percentile(self): tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) - @pytest.mark.parametrize('quantile', [0.0, 0.1, 0.45, 0.5, 1]) - @pytest.mark.parametrize('interpolation', ['linear', 'lower', 'higher', - 'nearest', 'midpoint']) - @pytest.mark.parametrize('data', [[1., 2., 3., 4., 5., 6., 7.], - [8., 1., 3., 4., 5., 2., 6., 7.], - [0., np.nan, 0.2, np.nan, 0.4], - [np.nan, np.nan, np.nan, np.nan], - [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], - [0.5], [np.nan, 0.7, 0.6]]) - def test_rolling_quantile_interpolation_options(self, quantile, - interpolation, data): + @pytest.mark.parametrize("quantile", [0.0, 0.1, 0.45, 0.5, 1]) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] + ) + @pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + [8.0, 1.0, 3.0, 4.0, 5.0, 2.0, 6.0, 7.0], + [0.0, np.nan, 0.2, np.nan, 0.4], + [np.nan, np.nan, np.nan, np.nan], + [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], + [0.5], + [np.nan, 0.7, 0.6], + ], + ) + def test_rolling_quantile_interpolation_options( + self, quantile, interpolation, data + ): # Tests that rolling window's quantile behavior is analogous to # Series' quantile for each interpolation option s = Series(data) q1 = s.quantile(quantile, interpolation) - q2 = s.expanding(min_periods=1).quantile( - quantile, interpolation).iloc[-1] + q2 = s.expanding(min_periods=1).quantile(quantile, interpolation).iloc[-1] if np.isnan(q1): assert np.isnan(q2) @@ -1443,13 +1775,13 @@ def test_invalid_quantile_value(self): data = np.arange(5) s = Series(data) - with pytest.raises(ValueError, match="Interpolation 'invalid'" - " is not supported"): - s.rolling(len(data), min_periods=1).quantile( - 0.5, interpolation='invalid') + with pytest.raises( + ValueError, match="Interpolation 'invalid'" " is not supported" + ): + s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid") def test_rolling_quantile_param(self): - ser = Series([0.0, .1, .5, .9, 1.0]) + ser = Series([0.0, 0.1, 0.5, 0.9, 1.0]) with pytest.raises(ValueError): ser.rolling(3).quantile(-0.1) @@ -1458,21 +1790,23 @@ def test_rolling_quantile_param(self): ser.rolling(3).quantile(10.0) with pytest.raises(TypeError): - ser.rolling(3).quantile('foo') + ser.rolling(3).quantile("foo") def test_rolling_apply(self, raw): # suppress warnings about empty slices, as we are deliberately testing # with a 0-length Series with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) def f(x): return x[np.isfinite(x)].mean() - 
self._check_moment_func(np.mean, name='apply', func=f, raw=raw) + self._check_moment_func(np.mean, name="apply", func=f, raw=raw) expected = Series([]) result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) @@ -1481,15 +1815,16 @@ def f(x): # gh-8080 s = Series([None, None, None]) result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1., 2., 2.]) + expected = Series([1.0, 2.0, 2.0]) tm.assert_series_equal(result, expected) result = s.rolling(2, min_periods=0).apply(len, raw=raw) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('klass', [Series, DataFrame]) + @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize( - 'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()]) + "method", [lambda x: x.rolling(window=2), lambda x: x.expanding()] + ) def test_apply_future_warning(self, klass, method): # gh-5071 @@ -1509,12 +1844,13 @@ def test_rolling_apply_out_of_bounds(self, raw): expected = pd.Series([1, 3, 6, 10], dtype=float) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize('window', [2, '2s']) + @pytest.mark.parametrize("window", [2, "2s"]) def test_rolling_apply_with_pandas_objects(self, window): # 5071 - df = pd.DataFrame({'A': np.random.randn(5), - 'B': np.random.randint(0, 10, size=5)}, - index=pd.date_range('20130101', periods=5, freq='s')) + df = pd.DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=pd.date_range("20130101", periods=5, freq="s"), + ) # we have an equal spaced timeseries index # so simulate removing the first period @@ -1531,24 +1867,21 @@ def f(x): df.rolling(window).apply(f, raw=True) def test_rolling_std(self): - self._check_moment_func(lambda x: np.std(x, ddof=1), - name='std') - self._check_moment_func(lambda x: np.std(x, ddof=0), - name='std', ddof=0) + self._check_moment_func(lambda x: np.std(x, ddof=1), name="std") + self._check_moment_func(lambda x: np.std(x, ddof=0), name="std", ddof=0) def test_rolling_std_1obs(self): - vals = pd.Series([1., 2., 3., 4., 5.]) + vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) result = vals.rolling(1, min_periods=1).std() expected = pd.Series([np.nan] * 5) tm.assert_series_equal(result, expected) result = vals.rolling(1, min_periods=1).std(ddof=0) - expected = pd.Series([0.] * 5) + expected = pd.Series([0.0] * 5) tm.assert_series_equal(result, expected) - result = (pd.Series([np.nan, np.nan, 3, 4, 5]) - .rolling(3, min_periods=2).std()) + result = pd.Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() assert np.isnan(result[2]) def test_rolling_std_neg_sqrt(self): @@ -1556,9 +1889,15 @@ def test_rolling_std_neg_sqrt(self): # Test move_nanstd for neg sqrt. 
- a = pd.Series([0.0011448196318903589, 0.00028718669878572767, - 0.00028718669878572767, 0.00028718669878572767, - 0.00028718669878572767]) + a = pd.Series( + [ + 0.0011448196318903589, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + ] + ) b = a.rolling(window=3).std() assert np.isfinite(b[2:]).all() @@ -1566,56 +1905,58 @@ def test_rolling_std_neg_sqrt(self): assert np.isfinite(b[2:]).all() def test_rolling_var(self): - self._check_moment_func(lambda x: np.var(x, ddof=1), - name='var') - self._check_moment_func(lambda x: np.var(x, ddof=0), - name='var', ddof=0) + self._check_moment_func(lambda x: np.var(x, ddof=1), name="var") + self._check_moment_func(lambda x: np.var(x, ddof=0), name="var", ddof=0) @td.skip_if_no_scipy def test_rolling_skew(self): from scipy.stats import skew - self._check_moment_func(lambda x: skew(x, bias=False), name='skew') + + self._check_moment_func(lambda x: skew(x, bias=False), name="skew") @td.skip_if_no_scipy def test_rolling_kurt(self): from scipy.stats import kurtosis - self._check_moment_func(lambda x: kurtosis(x, bias=False), - name='kurt') - - def _check_moment_func(self, static_comp, name, has_min_periods=True, - has_center=True, has_time_rule=True, - fill_value=None, zero_min_periods_equal=True, - **kwargs): + self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt") + + def _check_moment_func( + self, + static_comp, + name, + has_min_periods=True, + has_center=True, + has_time_rule=True, + fill_value=None, + zero_min_periods_equal=True, + **kwargs + ): def get_result(obj, window, min_periods=None, center=False): - r = obj.rolling(window=window, min_periods=min_periods, - center=center) + r = obj.rolling(window=window, min_periods=min_periods, center=center) return getattr(r, name)(**kwargs) series_result = get_result(self.series, window=50) assert isinstance(series_result, Series) - tm.assert_almost_equal(series_result.iloc[-1], - static_comp(self.series[-50:])) + tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:])) frame_result = get_result(self.frame, window=50) assert isinstance(frame_result, DataFrame) tm.assert_series_equal( frame_result.iloc[-1, :], self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), - check_names=False) + check_names=False, + ) # check time_rule works if has_time_rule: win = 25 minp = 10 - series = self.series[::2].resample('B').mean() - frame = self.frame[::2].resample('B').mean() + series = self.series[::2].resample("B").mean() + frame = self.frame[::2].resample("B").mean() if has_min_periods: - series_result = get_result(series, window=win, - min_periods=minp) - frame_result = get_result(frame, window=win, - min_periods=minp) + series_result = get_result(series, window=win, min_periods=minp) + frame_result = get_result(frame, window=win, min_periods=minp) else: series_result = get_result(series, window=win) frame_result = get_result(frame, window=win) @@ -1626,12 +1967,13 @@ def get_result(obj, window, min_periods=None, center=False): trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) - tm.assert_almost_equal(series_result[-1], - static_comp(trunc_series)) + tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) - tm.assert_series_equal(frame_result.xs(last_date), - trunc_frame.apply(static_comp, raw=raw), - check_names=False) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(static_comp, raw=raw), + 
check_names=False, + ) # excluding NaNs correctly obj = Series(randn(50)) @@ -1666,16 +2008,13 @@ def get_result(obj, window, min_periods=None, center=False): # window larger than series length (#7297) if has_min_periods: for minp in (0, len(self.series) - 1, len(self.series)): - result = get_result(self.series, len(self.series) + 1, - min_periods=minp) - expected = get_result(self.series, len(self.series), - min_periods=minp) + result = get_result(self.series, len(self.series) + 1, min_periods=minp) + expected = get_result(self.series, len(self.series), min_periods=minp) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], - expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) else: result = get_result(self.series, len(self.series) + 1) expected = get_result(self.series, len(self.series)) @@ -1690,43 +2029,63 @@ def get_result(obj, window, min_periods=None, center=False): if has_min_periods: result = get_result(obj, 20, min_periods=15, center=True) expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), 20, - min_periods=15)[9:].reset_index(drop=True) + pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 + )[9:].reset_index(drop=True) else: result = get_result(obj, 20, center=True) - expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), - 20)[9:].reset_index(drop=True) + expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ + 9: + ].reset_index(drop=True) tm.assert_series_equal(result, expected) # shifter index - s = ['x%d' % x for x in range(12)] + s = ["x%d" % x for x in range(12)] if has_min_periods: minp = 10 - series_xp = get_result( - self.series.reindex(list(self.series.index) + s), - window=25, - min_periods=minp).shift(-12).reindex(self.series.index) - frame_xp = get_result( - self.frame.reindex(list(self.frame.index) + s), - window=25, - min_periods=minp).shift(-12).reindex(self.frame.index) + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.frame.index) + ) - series_rs = get_result(self.series, window=25, - min_periods=minp, center=True) - frame_rs = get_result(self.frame, window=25, min_periods=minp, - center=True) + series_rs = get_result( + self.series, window=25, min_periods=minp, center=True + ) + frame_rs = get_result( + self.frame, window=25, min_periods=minp, center=True + ) else: - series_xp = get_result( - self.series.reindex(list(self.series.index) + s), - window=25).shift(-12).reindex(self.series.index) - frame_xp = get_result( - self.frame.reindex(list(self.frame.index) + s), - window=25).shift(-12).reindex(self.frame.index) + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), window=25 + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), window=25 + ) + .shift(-12) + .reindex(self.frame.index) + ) series_rs = get_result(self.series, window=25, center=True) frame_rs = get_result(self.frame, window=25, center=True) @@ -1738,15 +2097,15 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_frame_equal(frame_xp, frame_rs) def test_ewma(self): - self._check_ew(name='mean') + self._check_ew(name="mean") vals = pd.Series(np.zeros(1000)) 
vals[5] = 1 result = vals.ewm(span=100, adjust=False).mean().sum() assert np.abs(result - 1) < 1e-2 - @pytest.mark.parametrize('adjust', [True, False]) - @pytest.mark.parametrize('ignore_na', [True, False]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) def test_ewma_cases(self, adjust, ignore_na): # try adjust/ignore_na args matrix @@ -1761,51 +2120,68 @@ def test_ewma_cases(self, adjust, ignore_na): tm.assert_series_equal(result, expected) def test_ewma_nan_handling(self): - s = Series([1.] + [np.nan] * 5 + [1.]) + s = Series([1.0] + [np.nan] * 5 + [1.0]) result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.] * len(s))) + tm.assert_series_equal(result, Series([1.0] * len(s))) - s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.]) + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.] * 4)) + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) # GH 7603 - s0 = Series([np.nan, 1., 101.]) - s1 = Series([1., np.nan, 101.]) - s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan]) - s3 = Series([1., np.nan, 101., 50.]) - com = 2. - alpha = 1. / (1. + com) + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill') + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1. - alpha), 1.]), - (s0, True, True, [np.nan, (1. - alpha), 1.]), - (s0, False, False, [np.nan, (1. - alpha), alpha]), - (s0, False, True, [np.nan, (1. - alpha), alpha]), - (s1, True, False, [(1. - alpha) ** 2, np.nan, 1.]), - (s1, True, True, [(1. - alpha), np.nan, 1.]), - (s1, False, False, [(1. - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1. - alpha), np.nan, alpha]), - (s2, True, False, [np.nan, (1. - alpha) ** - 3, np.nan, np.nan, 1., np.nan]), - (s2, True, True, [np.nan, (1. - alpha), - np.nan, np.nan, 1., np.nan]), - (s2, False, False, [np.nan, (1. - alpha) ** - 3, np.nan, np.nan, alpha, np.nan]), - (s2, False, True, [np.nan, (1. - alpha), - np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1. - alpha) ** - 3, np.nan, (1. - alpha), 1.]), - (s3, True, True, [(1. - alpha) ** - 2, np.nan, (1. - alpha), 1.]), - (s3, False, False, [(1. - alpha) ** 3, np.nan, - (1. - alpha) * alpha, - alpha * ((1. - alpha) ** 2 + alpha)]), - (s3, False, True, [(1. - alpha) ** 2, - np.nan, (1. 
- alpha) * alpha, alpha])]: + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() @@ -1816,10 +2192,10 @@ def simple_wma(s, w): tm.assert_series_equal(result, expected) def test_ewmvar(self): - self._check_ew(name='var') + self._check_ew(name="var") def test_ewmvol(self): - self._check_ew(name='vol') + self._check_ew(name="vol") def test_ewma_span_com_args(self): A = self.series.ewm(com=9.5).mean() @@ -1904,7 +2280,7 @@ def test_ewm_domain_checks(self): with pytest.raises(ValueError, match=msg): s.ewm(alpha=1.1) - @pytest.mark.parametrize('method', ['mean', 'vol', 'var']) + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) def test_ew_empty_series(self, method): vals = pd.Series([], dtype=np.float64) @@ -1937,7 +2313,7 @@ def _check_ew(self, name=None, preserve_nan=False): for min_periods in (0, 1): result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == 'mean': + if name == "mean": assert result[:10].isna().all() assert not result[10:].isna().any() else: @@ -1947,15 +2323,13 @@ def _check_ew(self, name=None, preserve_nan=False): assert not result[11:].isna().any() # check series of length 0 - result = getattr(Series().ewm(com=50, min_periods=min_periods), - name)() + result = getattr(Series().ewm(com=50, min_periods=min_periods), name)() tm.assert_series_equal(result, Series()) # check series of length 1 - result = getattr(Series([1.]).ewm(50, min_periods=min_periods), - name)() - if name == 'mean': - tm.assert_series_equal(result, Series([1.])) + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) else: # ewm.std, ewm.vol, ewm.var with bias=False require at least # two values @@ -1969,22 +2343,22 @@ def _check_ew(self, name=None, preserve_nan=False): class TestPairwise: # GH 7738 - df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], - columns=['C', 'C']), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], - columns=[1, 
0.]), - DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], - columns=[0, 1.]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], - columns=[1., 'X']), ] - df2 = DataFrame([[None, 1, 1], [None, 1, 2], - [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) + df1s = [ + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), + DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), + DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), + ] + df2 = DataFrame( + [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], + columns=["Y", "Z", "X"], + ) s = Series([1, 1, 3, 8]) def compare(self, result, expected): @@ -1996,7 +2370,7 @@ def compare(self, result, expected): tm.assert_numpy_array_equal(result, expected, check_dtype=False) - @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) + @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): # DataFrame methods (which do not call _flex_binary_moment()) @@ -2010,12 +2384,16 @@ def test_no_flex(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x: x.expanding().cov(pairwise=True), - lambda x: x.expanding().corr(pairwise=True), - lambda x: x.rolling(window=3).cov(pairwise=True), - lambda x: x.rolling(window=3).corr(pairwise=True), - lambda x: x.ewm(com=3).cov(pairwise=True), - lambda x: x.ewm(com=3).corr(pairwise=True)]) + "f", + [ + lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True), + ], + ) def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True @@ -2024,11 +2402,10 @@ def test_pairwise_with_self(self, f): results = [] for i, df in enumerate(self.df1s): result = f(df) - tm.assert_index_equal(result.index.levels[0], - df.index, - check_names=False) - tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), - safe_sort(df.columns.unique())) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) + ) tm.assert_index_equal(result.columns, df.columns) results.append(df) @@ -2037,12 +2414,16 @@ def test_pairwise_with_self(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x: x.expanding().cov(pairwise=False), - lambda x: x.expanding().corr(pairwise=False), - lambda x: x.rolling(window=3).cov(pairwise=False), - lambda x: x.rolling(window=3).corr(pairwise=False), - lambda x: x.ewm(com=3).cov(pairwise=False), - lambda x: x.ewm(com=3).corr(pairwise=False), ]) + "f", + [ + lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), + ], + 
) def test_no_pairwise_with_self(self, f): # DataFrame with itself, pairwise=False @@ -2055,38 +2436,46 @@ def test_no_pairwise_with_self(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y, pairwise=True), - lambda x, y: x.expanding().corr(y, pairwise=True), - lambda x, y: x.rolling(window=3).cov(y, pairwise=True), - lambda x, y: x.rolling(window=3).corr(y, pairwise=True), - lambda x, y: x.ewm(com=3).cov(y, pairwise=True), - lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]) + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), + ], + ) def test_pairwise_with_other(self, f): # DataFrame with another DataFrame, pairwise=True results = [f(df, self.df2) for df in self.df1s] for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index.levels[0], - df.index, - check_names=False) - tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), - safe_sort(self.df2.columns.unique())) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) + ) for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y, pairwise=False), - lambda x, y: x.expanding().corr(y, pairwise=False), - lambda x, y: x.rolling(window=3).cov(y, pairwise=False), - lambda x, y: x.rolling(window=3).corr(y, pairwise=False), - lambda x, y: x.ewm(com=3).cov(y, pairwise=False), - lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]) + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), + ], + ) def test_no_pairwise_with_other(self, f): # DataFrame with another DataFrame, pairwise=False - results = [f(df, self.df2) if df.columns.is_unique else None - for df in self.df1s] + results = [ + f(df, self.df2) if df.columns.is_unique else None for df in self.df1s + ] for (df, result) in zip(self.df1s, results): if result is not None: with catch_warnings(record=True): @@ -2097,25 +2486,28 @@ def test_no_pairwise_with_other(self, f): tm.assert_index_equal(result.index, expected_index) tm.assert_index_equal(result.columns, expected_columns) else: - with pytest.raises(ValueError, - match="'arg1' columns are not unique"): + with pytest.raises(ValueError, match="'arg1' columns are not unique"): f(df, self.df2) - with pytest.raises(ValueError, - match="'arg2' columns are not unique"): + with pytest.raises(ValueError, match="'arg2' columns are not unique"): f(self.df2, df) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y), - lambda x, y: x.expanding().corr(y), - lambda x, y: x.rolling(window=3).cov(y), - lambda x, y: x.rolling(window=3).corr(y), - lambda x, y: x.ewm(com=3).cov(y), - lambda x, y: x.ewm(com=3).corr(y), ]) + "f", + [ + lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: 
x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), + ], + ) def test_pairwise_with_series(self, f): # DataFrame with a Series - results = ([f(df, self.s) for df in self.df1s] + - [f(self.s, df) for df in self.df1s]) + results = [f(df, self.s) for df in self.df1s] + [ + f(self.s, df) for df in self.df1s + ] for (df, result) in zip(self.df1s, results): tm.assert_index_equal(result.index, df.index) tm.assert_index_equal(result.columns, df.columns) @@ -2127,42 +2519,135 @@ def test_pairwise_with_series(self, f): # create the data only once as we are not setting it def _create_consistency_data(): def create_series(): - return [Series(), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.]), - Series([np.nan, 3.]), - Series([3., np.nan]), - Series([1., 3.]), - Series([2., 2.]), - Series([3., 1.]), - Series([5., 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan, - np.nan]), - Series([np.nan, 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., - np.nan, np.nan]), - Series([np.nan, np.nan, 5., 5., np.nan, np.nan, np.nan, 5., 5., - np.nan, np.nan]), - Series([np.nan, 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., - 12., 13., 14., 15.]), - Series([np.nan, 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., - 12., 13., 14., 15.]), - Series([2., 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., - 12., 13., 14., 15.]), - Series([2., 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., - 12., 13., 14., 15.]), - Series(range(10)), - Series(range(20, 0, -2)), ] + return [ + Series(), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] def create_dataframes(): - return ([DataFrame(), - DataFrame(columns=['a']), - DataFrame(columns=['a', 'a']), - DataFrame(columns=['a', 'b']), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), - columns=['a', 'b', 99, 'd', 'd'])] + - [DataFrame(s) for s in create_series()]) + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] def is_constant(x): values = x.values.ravel() @@ -2191,40 +2676,37 @@ def _rolling_consistency_cases(): class TestMomentsConsistency(Base): base_functions = [ - (lambda v: Series(v).count(), None, 'count'), - (lambda v: 
Series(v).max(), None, 'max'), - (lambda v: Series(v).min(), None, 'min'), - (lambda v: Series(v).sum(), None, 'sum'), - (lambda v: Series(v).mean(), None, 'mean'), - (lambda v: Series(v).std(), 1, 'std'), - (lambda v: Series(v).cov(Series(v)), None, 'cov'), - (lambda v: Series(v).corr(Series(v)), None, 'corr'), - (lambda v: Series(v).var(), 1, 'var'), - + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), # restore once GH 8086 is fixed # lambda v: Series(v).skew(), 3, 'skew'), # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed # lambda v: Series(v).quantile(0.3), None, 'quantile'), - - (lambda v: Series(v).median(), None, 'median'), - (np.nanmax, 1, 'max'), - (np.nanmin, 1, 'min'), - (np.nansum, 1, 'sum'), - (np.nanmean, 1, 'mean'), - (lambda v: np.nanstd(v, ddof=1), 1, 'std'), - (lambda v: np.nanvar(v, ddof=1), 1, 'var'), - (np.nanmedian, 1, 'median'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), ] no_nan_functions = [ - (np.max, None, 'max'), - (np.min, None, 'min'), - (np.sum, None, 'sum'), - (np.mean, None, 'mean'), - (lambda v: np.std(v, ddof=1), 1, 'std'), - (lambda v: np.var(v, ddof=1), 1, 'var'), - (np.median, None, 'median'), + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), ] def _create_data(self): @@ -2234,11 +2716,21 @@ def _create_data(self): def setup_method(self, method): self._create_data() - def _test_moments_consistency(self, min_periods, count, mean, mock_mean, - corr, var_unbiased=None, std_unbiased=None, - cov_unbiased=None, var_biased=None, - std_biased=None, cov_biased=None, - var_debiasing_factors=None): + def _test_moments_consistency( + self, + min_periods, + count, + mean, + mock_mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + var_debiasing_factors=None, + ): def _non_null_values(x): values = x.values.ravel() return set(values[notna(values)].tolist()) @@ -2250,7 +2742,7 @@ def _non_null_values(x): if mock_mean: # check that mean equals mock_mean expected = mock_mean(x) - assert_equal(mean_x, expected.astype('float64')) + assert_equal(mean_x, expected.astype("float64")) # check that correlation of a series with itself is either 1 or NaN corr_x_x = corr(x, x) @@ -2275,12 +2767,12 @@ def _non_null_values(x): var_unbiased_x = var_unbiased(x) var_biased_x = var_biased(x) var_debiasing_factors_x = var_debiasing_factors(x) - assert_equal(var_unbiased_x, var_biased_x * - var_debiasing_factors_x) + assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - for (std, var, cov) in [(std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased) - ]: + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: # check that 
var(x), std(x), and cov(x) are all >= 0 var_x = var(x) @@ -2306,7 +2798,7 @@ def _non_null_values(x): # check that variance of constant series is identically 0 assert not (var_x > 0).any().any() expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0. + expected[count_x >= max(min_periods, 1)] = 0.0 if var is var_unbiased: expected[count_x < 2] = np.nan assert_equal(var_x, expected) @@ -2333,8 +2825,7 @@ def _non_null_values(x): # var(y)) / 2 var_x_plus_y = var(x + y) var_y = var(y) - assert_equal(cov_x_y, 0.5 * - (var_x_plus_y - var_x - var_y)) + assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) # check that corr(x, y) == cov(x, y) / (std(x) * # std(y)) @@ -2346,146 +2837,160 @@ def _non_null_values(x): # mean(x)*mean(y) mean_y = mean(y) mean_x_times_y = mean(x * y) - assert_equal(cov_x_y, mean_x_times_y - - (mean_x * mean_y)) + assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) @pytest.mark.slow - @pytest.mark.parametrize('min_periods', [0, 1, 2, 3, 4]) - @pytest.mark.parametrize('adjust', [True, False]) - @pytest.mark.parametrize('ignore_na', [True, False]) + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) def test_ewm_consistency(self, min_periods, adjust, ignore_na): def _weights(s, com, adjust, ignore_na): if isinstance(s, DataFrame): if not len(s.columns): return DataFrame(index=s.index, columns=s.columns) - w = concat([ - _weights(s.iloc[:, i], com=com, adjust=adjust, - ignore_na=ignore_na) - for i, _ in enumerate(s.columns)], axis=1) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) w.index = s.index w.columns = s.columns return w w = Series(np.nan, index=s.index) - alpha = 1. / (1. + com) + alpha = 1.0 / (1.0 + com) if ignore_na: - w[s.notna()] = _weights(s[s.notna()], com=com, - adjust=adjust, ignore_na=False) + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) elif adjust: for i in range(len(s)): if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1. / (1. - alpha), i) + w.iat[i] = pow(1.0 / (1.0 - alpha), i) else: - sum_wts = 0. + sum_wts = 0.0 prev_i = -1 for i in range(len(s)): if s.iat[i] == s.iat[i]: if prev_i == -1: - w.iat[i] = 1. + w.iat[i] = 1.0 else: - w.iat[i] = alpha * sum_wts / pow(1. - alpha, - i - prev_i) + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) sum_wts += w.iat[i] prev_i = i return w def _variance_debiasing_factors(s, com, adjust, ignore_na): weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method='ffill') - cum_sum_sq = (weights * weights).cumsum().fillna(method='ffill') + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") numerator = cum_sum * cum_sum denominator = numerator - cum_sum_sq - denominator[denominator <= 0.] 
= np.nan + denominator[denominator <= 0.0] = np.nan return numerator / denominator def _ewma(s, com, min_periods, adjust, ignore_na): weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = s.multiply(weights).cumsum().divide(weights.cumsum( - )).fillna(method='ffill') - result[s.expanding().count() < (max(min_periods, 1) if min_periods - else 1)] = np.nan + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan return result - com = 3. + com = 3.0 # test consistency between different ewm* moments self._test_moments_consistency( min_periods=min_periods, count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).mean(), - mock_mean=lambda x: _ewma(x, com=com, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na), - corr=lambda x, y: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).corr(y), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), var_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).var(bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), std_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .std(bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), cov_unbiased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), var_biased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .var(bias=True)), - std_biased=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).std(bias=True), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), cov_biased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=True)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), var_debiasing_factors=lambda x: ( - _variance_debiasing_factors(x, com=com, adjust=adjust, - ignore_na=ignore_na))) + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) @pytest.mark.slow - @pytest.mark.parametrize( - 'min_periods', [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) def test_expanding_consistency(self, min_periods): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + 
category=RuntimeWarning, + ) # test consistency between different expanding_* moments self._test_moments_consistency( min_periods=min_periods, count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding( - min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding( - min_periods=min_periods).sum() / x.expanding().count(), - corr=lambda x, y: x.expanding( - min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding( - min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding( - min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding( - min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding( - min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y, ddof=0), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), var_debiasing_factors=lambda x: ( - x.expanding().count() / - (x.expanding().count() - 1.) - .replace(0., np.nan))) + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) # test consistency between expanding_xyz() and either (a) # expanding_apply of Series.xyz(), or (b) expanding_apply of @@ -2497,92 +3002,111 @@ def test_expanding_consistency(self, min_periods): if no_nans: functions = self.base_functions + self.no_nan_functions for (f, require_min_periods, name) in functions: - expanding_f = getattr( - x.expanding(min_periods=min_periods), name) + expanding_f = getattr(x.expanding(min_periods=min_periods), name) - if (require_min_periods and - (min_periods is not None) and - (min_periods < require_min_periods)): + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): continue - if name == 'count': + if name == "count": expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=0).apply(func=f, raw=True) + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) else: - if name in ['cov', 'corr']: - expanding_f_result = expanding_f( - pairwise=False) + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) else: expanding_f_result = expanding_f() expanding_apply_f_result = x.expanding( - min_periods=min_periods).apply(func=f, raw=True) + min_periods=min_periods + ).apply(func=f, raw=True) # GH 9422 - if name in ['sum', 'prod']: - assert_equal(expanding_f_result, - expanding_apply_f_result) + if name in ["sum", "prod"]: + assert_equal(expanding_f_result, expanding_apply_f_result) @pytest.mark.slow @pytest.mark.parametrize( - 'window,min_periods,center', list(_rolling_consistency_cases())) + "window,min_periods,center", list(_rolling_consistency_cases()) + ) def test_rolling_consistency(self, window, min_periods, center): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames 
with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) # test consistency between different rolling_* moments self._test_moments_consistency( min_periods=min_periods, - count=lambda x: ( - x.rolling(window=window, center=center) - .count()), + count=lambda x: (x.rolling(window=window, center=center).count()), mean=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).mean()), + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), mock_mean=lambda x: ( - x.rolling(window=window, - min_periods=min_periods, - center=center).sum() - .divide(x.rolling(window=window, - min_periods=min_periods, - center=center).count())), + x.rolling(window=window, min_periods=min_periods, center=center) + .sum() + .divide( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + ) + ), corr=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).corr(y)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), var_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var()), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), std_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std()), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).std() + ), cov_unbiased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y) + ), var_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var(ddof=0)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), std_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std(ddof=0)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=0) + ), cov_biased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y, ddof=0)), + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y, ddof=0) + ), var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center).count() - .divide((x.rolling(window=window, center=center) - .count() - 1.) 
- .replace(0., np.nan)))) + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) # test consistency between rolling_xyz() and either (a) # rolling_apply of Series.xyz(), or (b) rolling_apply of @@ -2595,33 +3119,36 @@ def test_rolling_consistency(self, window, min_periods, center): functions = self.base_functions + self.no_nan_functions for (f, require_min_periods, name) in functions: rolling_f = getattr( - x.rolling(window=window, center=center, - min_periods=min_periods), name) + x.rolling( + window=window, center=center, min_periods=min_periods + ), + name, + ) - if require_min_periods and ( - min_periods is not None) and ( - min_periods < require_min_periods): + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): continue - if name == 'count': + if name == "count": rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( - window=window, min_periods=0, - center=center).apply(func=f, raw=True) + window=window, min_periods=0, center=center + ).apply(func=f, raw=True) else: - if name in ['cov', 'corr']: - rolling_f_result = rolling_f( - pairwise=False) + if name in ["cov", "corr"]: + rolling_f_result = rolling_f(pairwise=False) else: rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, - center=center).apply(func=f, raw=True) + window=window, min_periods=min_periods, center=center + ).apply(func=f, raw=True) # GH 9422 - if name in ['sum', 'prod']: - assert_equal(rolling_f_result, - rolling_apply_f_result) + if name in ["sum", "prod"]: + assert_equal(rolling_f_result, rolling_apply_f_result) # binary moments def test_rolling_cov(self): @@ -2632,7 +3159,7 @@ def test_rolling_cov(self): tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_cov_pairwise(self): - self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5) + self._check_pairwise_moment("rolling", "cov", window=10, min_periods=5) def test_rolling_corr(self): A = self.series @@ -2651,10 +3178,9 @@ def test_rolling_corr(self): tm.assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): - self._check_pairwise_moment('rolling', 'corr', window=10, - min_periods=5) + self._check_pairwise_moment("rolling", "corr", window=10, min_periods=5) - @pytest.mark.parametrize('window', range(7)) + @pytest.mark.parametrize("window", range(7)) def test_rolling_corr_with_zero_variance(self, window): # GH 18430 s = pd.Series(np.zeros(20)) @@ -2675,17 +3201,27 @@ def get_result(obj, obj2=None): def test_flex_binary_moment(self): # GH3155 # don't blow the stack - msg = ("arguments to moment function must be of type" - " np.ndarray/Series/DataFrame") + msg = ( + "arguments to moment function must be of type" + " np.ndarray/Series/DataFrame" + ) with pytest.raises(TypeError, match=msg): rwindow._flex_binary_moment(5, 6, None) def test_corr_sanity(self): # GH 3155 - df = DataFrame(np.array( - [[0.87024726, 0.18505595], [0.64355431, 0.3091617], - [0.92372966, 0.50552513], [0.00203756, 0.04520709], - [0.84780328, 0.33394331], [0.78369152, 0.63919667]])) + df = DataFrame( + np.array( + [ + [0.87024726, 0.18505595], + [0.64355431, 0.3091617], + [0.92372966, 0.50552513], + [0.00203756, 0.04520709], + [0.84780328, 0.33394331], + [0.78369152, 0.63919667], + ] + ) + ) res = df[0].rolling(5, center=True).corr(df[1]) assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) @@ 
-2699,14 +3235,13 @@ def test_corr_sanity(self): except AssertionError: print(res) - @pytest.mark.parametrize('method', ['corr', 'cov']) + @pytest.mark.parametrize("method", ["corr", "cov"]) def test_flex_binary_frame(self, method): series = self.frame[1] res = getattr(series.rolling(window=10), method)(self.frame) res2 = getattr(self.frame.rolling(window=10), method)(series) - exp = self.frame.apply(lambda x: getattr( - series.rolling(window=10), method)(x)) + exp = self.frame.apply(lambda x: getattr(series.rolling(window=10), method)(x)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) @@ -2715,21 +3250,25 @@ def test_flex_binary_frame(self, method): frame2.values[:] = np.random.randn(*frame2.shape) res3 = getattr(self.frame.rolling(window=10), method)(frame2) - exp = DataFrame({k: getattr(self.frame[k].rolling( - window=10), method)(frame2[k]) for k in self.frame}) + exp = DataFrame( + { + k: getattr(self.frame[k].rolling(window=10), method)(frame2[k]) + for k in self.frame + } + ) tm.assert_frame_equal(res3, exp) def test_ewmcov(self): - self._check_binary_ew('cov') + self._check_binary_ew("cov") def test_ewmcov_pairwise(self): - self._check_pairwise_moment('ewm', 'cov', span=10, min_periods=5) + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) def test_ewmcorr(self): - self._check_binary_ew('corr') + self._check_binary_ew("corr") def test_ewmcorr_pairwise(self): - self._check_pairwise_moment('ewm', 'corr', span=10, min_periods=5) + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) def _check_binary_ew(self, name): def func(A, B, com, **kwargs): @@ -2758,8 +3297,7 @@ def func(A, B, com, **kwargs): tm.assert_series_equal(result, Series([])) # check series of length 1 - result = func( - Series([1.]), Series([1.]), 50, min_periods=min_periods) + result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) tm.assert_series_equal(result, Series([np.NaN])) msg = "Input arrays must be of the same type!" @@ -2768,22 +3306,17 @@ def func(A, B, com, **kwargs): func(A, randn(50), 20, min_periods=5) def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): return np.mean(x) + const df = DataFrame(np.random.rand(20, 3)) - expected = df.expanding().apply(np.mean, raw=raw) + 20. 
+ expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - result = df.expanding().apply(mean_w_arg, - raw=raw, - args=(20, )) + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) tm.assert_frame_equal(result, expected) - result = df.expanding().apply(mean_w_arg, - raw=raw, - kwargs={'const': 20}) + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) tm.assert_frame_equal(result, expected) def test_expanding_corr(self): @@ -2798,14 +3331,16 @@ def test_expanding_corr(self): def test_expanding_count(self): result = self.series.expanding().count() - tm.assert_almost_equal(result, self.series.rolling( - window=len(self.series)).count()) + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) def test_expanding_quantile(self): result = self.series.expanding().quantile(0.5) - rolling_result = self.series.rolling(window=len(self.series), - min_periods=1).quantile(0.5) + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) tm.assert_almost_equal(result, rolling_result) @@ -2822,16 +3357,18 @@ def test_expanding_cov(self): def test_expanding_cov_pairwise(self): result = self.frame.expanding().corr() - rolling_result = self.frame.rolling(window=len(self.frame), - min_periods=1).corr() + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() tm.assert_frame_equal(result, rolling_result) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() - rolling_result = self.frame.rolling(window=len(self.frame), - min_periods=1).corr() + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() tm.assert_frame_equal(result, rolling_result) def test_expanding_cov_diff_index(self): @@ -2867,7 +3404,7 @@ def test_expanding_corr_diff_index(self): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.]) + expected = Series([None, None, None, 1.0]) tm.assert_series_equal(result, expected) def test_rolling_cov_diff_length(self): @@ -2895,12 +3432,10 @@ def test_rolling_corr_diff_length(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - 'f', + "f", [ - lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2909,20 +3444,18 @@ def test_rolling_corr_diff_length(self): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling( - window=10, min_periods=5).quantile(quantile=0.5), + lambda x: x.rolling(window=10, min_periods=5).quantile(quantile=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(win_type='boxcar', - window=10, min_periods=5).mean()]) + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda 
x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ], + ) def test_rolling_functions_window_non_shrinkage(self, f): # GH 7764 s = Series(range(4)) s_expected = Series(np.nan, index=s.index) - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) try: @@ -2939,18 +3472,22 @@ def test_rolling_functions_window_non_shrinkage(self, f): def test_rolling_functions_window_non_shrinkage_binary(self): # corr/cov return a MI DataFrame - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], - columns=Index(['A', 'B'], name='foo'), - index=Index(range(4), name='bar')) + df = DataFrame( + [[1, 5], [3, 2], [3, 9], [-1, 0]], + columns=Index(["A", "B"], name="foo"), + index=Index(range(4), name="bar"), + ) df_expected = DataFrame( - columns=Index(['A', 'B'], name='foo'), - index=pd.MultiIndex.from_product([df.index, df.columns], - names=['bar', 'foo']), - dtype='float64') - functions = [lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=True))] + columns=Index(["A", "B"], name="foo"), + index=pd.MultiIndex.from_product( + [df.index, df.columns], names=["bar", "foo"] + ), + dtype="float64", + ) + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] for f in functions: df_result = f(df) tm.assert_frame_equal(df_result, df_expected) @@ -2961,52 +3498,43 @@ def test_moment_functions_zero_length(self): s_expected = s df1 = DataFrame() df1_expected = df1 - df2 = DataFrame(columns=['a']) - df2['a'] = df2['a'].astype('float64') + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") df2_expected = df2 - functions = [lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov( - x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr( - x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply( - sum, raw=False), - lambda x: x.expanding(min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov( - x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr( - x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling( - window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=False), - lambda x: x.rolling(window=10, 
min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(win_type='boxcar', - window=10, min_periods=5).mean(), - ] + functions = [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] for f in functions: try: s_result = f(s) @@ -3026,28 +3554,27 @@ def test_moment_functions_zero_length_pairwise(self): df1 = DataFrame() df1_expected = df1 - df2 = DataFrame(columns=Index(['a'], name='foo'), - index=Index([], name='bar')) - df2['a'] = df2['a'].astype('float64') + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") df1_expected = DataFrame( index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([])) + columns=Index([]), + ) df2_expected = DataFrame( - index=pd.MultiIndex.from_product([df2.index, df2.columns], - names=['bar', 'foo']), - columns=Index(['a'], name='foo'), - dtype='float64') - - functions = [lambda x: (x.expanding(min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5) - .corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=True)), - ] + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] for f in functions: df1_result = f(df1) tm.assert_frame_equal(df1_result, df1_expected) @@ -3057,25 +3584,27 @@ def test_moment_functions_zero_length_pairwise(self): def test_expanding_cov_pairwise_diff_length(self): # GH 7512 - df1 = 
DataFrame([[1, 5], [3, 2], [3, 9]], - columns=Index(['A', 'B'], name='foo')) - df1a = DataFrame([[1, 5], [3, 9]], - index=[0, 2], - columns=Index(['A', 'B'], name='foo')) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], - columns=Index(['X', 'Y'], name='foo')) - df2a = DataFrame([[5, 6], [2, 1]], - index=[0, 2], - columns=Index(['X', 'Y'], name='foo')) + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) # TODO: xref gh-15826 # .loc is not preserving the names result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(['A', 'B'], name='foo'), - index=Index(['X', 'Y'], name='foo')) + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -3083,25 +3612,29 @@ def test_expanding_cov_pairwise_diff_length(self): def test_expanding_corr_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 2], [3, 2], [3, 4]], - columns=['A', 'B'], - index=Index(range(3), name='bar')) - df1a = DataFrame([[1, 2], [3, 4]], - index=Index([0, 2], name='bar'), - columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], - columns=['X', 'Y'], - index=Index(range(3), name='bar')) - df2a = DataFrame([[5, 6], [2, 1]], - index=Index([0, 2], name='bar'), - columns=['X', 'Y']) + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) result1 = df1.expanding().corr(df2, pairwise=True).loc[2] result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], - columns=['A', 'B'], - index=Index(['X', 'Y'])) + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -3122,8 +3655,7 @@ def test_rolling_skew_edge_cases(self): tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] - d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 - ]) + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) x = d.rolling(window=4).skew() tm.assert_series_equal(expected, x) @@ -3143,8 +3675,7 @@ def test_rolling_kurt_edge_cases(self): tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] - d = Series([-1.50837035, -0.1297039, 0.19501095, 
1.73508164, 0.41941401 - ]) + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) x = d.rolling(window=4).kurt() tm.assert_series_equal(expected, x) @@ -3159,20 +3690,19 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize('func,static_comp', [('sum', np.sum), - ('mean', np.mean), - ('max', np.max), - ('min', np.min)], - ids=['sum', 'mean', 'max', 'min']) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) def test_expanding_func(self, func, static_comp): def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, - center=center, axis=axis) + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) return getattr(exp, func)() + self._check_expanding(expanding_func, static_comp, preserve_nan=False) def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): exp = x.expanding(min_periods=min_periods) @@ -3184,17 +3714,22 @@ def expanding_mean(x, min_periods=1): self._check_expanding(expanding_mean, np.mean, preserve_nan=False) ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply( - lambda x: x.mean(), raw=raw)) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) # GH 8080 s = Series([None, None, None]) result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1., 2., 3.]) + expected = Series([1.0, 2.0, 3.0]) tm.assert_series_equal(result, expected) - def _check_expanding(self, func, static_comp, has_min_periods=True, - has_time_rule=True, preserve_nan=True): + def _check_expanding( + self, + func, + static_comp, + has_min_periods=True, + has_time_rule=True, + preserve_nan=True, + ): series_result = func(self.series) assert isinstance(series_result, Series) @@ -3244,9 +3779,11 @@ def test_rolling_max_gh6297(self): # Sort chronologically series = series.sort_index() - expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').max().rolling(window=1).max() + expected = Series( + [1.0, 2.0, 6.0, 4.0, 5.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_max_resample(self): @@ -3262,22 +3799,28 @@ def test_rolling_max_resample(self): series = series.sort_index() # Default how should be max - expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').max().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 20.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify median (10.0) - expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').median().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0 + 10.0 + 20.0) / 3.0 - expected = Series([0.0, 1.0, 2.0, 3.0, v], - index=[datetime(1975, 1, 
i, 0) for i in range(1, 6)]) - x = series.resample('D').mean().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, v], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").mean().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_min_resample(self): @@ -3293,9 +3836,11 @@ def test_rolling_min_resample(self): series = series.sort_index() # Default how should be min - expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - r = series.resample('D').min().rolling(window=1) + expected = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + r = series.resample("D").min().rolling(window=1) tm.assert_series_equal(expected, r.min()) def test_rolling_median_resample(self): @@ -3311,9 +3856,11 @@ def test_rolling_median_resample(self): series = series.sort_index() # Default how should be median - expected = Series([0.0, 1.0, 2.0, 3.0, 10], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').median().rolling(window=1).median() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).median() tm.assert_series_equal(expected, x) def test_rolling_median_memory_error(self): @@ -3326,41 +3873,42 @@ def test_rolling_min_max_numeric_types(self): # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] - types_test.extend([np.dtype("{}{}".format(sign, width)) - for width in [1, 2, 4, 8] for sign in "ui"]) + types_test.extend( + [ + np.dtype("{}{}".format(sign, width)) + for width in [1, 2, 4, 8] + for sign in "ui" + ] + ) for data_type in types_test: # Just testing that these don't throw exceptions and that # the return type is float64. 
Other tests will cover quantitative # correctness - result = (DataFrame(np.arange(20, dtype=data_type)) - .rolling(window=5).max()) + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max() assert result.dtypes[0] == np.dtype("f8") - result = (DataFrame(np.arange(20, dtype=data_type)) - .rolling(window=5).min()) + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") class TestGrouperGrouping: - def setup_method(self, method): self.series = Series(np.arange(10)) - self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}) + self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) def test_mutated(self): msg = r"group\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): - self.frame.groupby('A', foo=1) + self.frame.groupby("A", foo=1) - g = self.frame.groupby('A') + g = self.frame.groupby("A") assert not g.mutated - g = self.frame.groupby('A', mutated=True) + g = self.frame.groupby("A", mutated=True) assert g.mutated def test_getitem(self): - g = self.frame.groupby('A') - g_mutated = self.frame.groupby('A', mutated=True) + g = self.frame.groupby("A") + g_mutated = self.frame.groupby("A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) @@ -3379,9 +3927,9 @@ def test_getitem(self): def test_getitem_multiple(self): # GH 13174 - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(2) - g_mutated = self.frame.groupby('A', mutated=True) + g_mutated = self.frame.groupby("A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() @@ -3391,16 +3939,16 @@ def test_getitem_multiple(self): tm.assert_series_equal(result, expected) def test_rolling(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=1) expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -3410,14 +3958,15 @@ def test_rolling(self): tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ['corr', 'cov']: + for f in ["corr", "cov"]: result = getattr(r, f)(self.frame) def func(x): return getattr(x.rolling(4), f)(self.frame) + expected = g.apply(func) tm.assert_frame_equal(result, expected) @@ -3425,30 +3974,31 @@ def func(x): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) + expected = g.apply(func) tm.assert_series_equal(result, expected) def test_rolling_apply(self, raw): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply( - lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) def test_rolling_apply_mutability(self): # GH 14013 - df = pd.DataFrame({'A': ['foo'] * 3 + ['bar'] * 3, 'B': [1] * 6}) - g = df.groupby('A') + df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) + g = df.groupby("A") - mi = 
pd.MultiIndex.from_tuples([('bar', 3), ('bar', 4), ('bar', 5), - ('foo', 0), ('foo', 1), ('foo', 2)]) + mi = pd.MultiIndex.from_tuples( + [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)] + ) - mi.names = ['A', None] + mi.names = ["A", None] # Grouped column should not be a part of the output - expected = pd.DataFrame([np.nan, 2., 2.] * 2, columns=['B'], index=mi) + expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) result = g.rolling(window=2).sum() tm.assert_frame_equal(result, expected) @@ -3461,16 +4011,16 @@ def test_rolling_apply_mutability(self): tm.assert_frame_equal(result, expected) def test_expanding(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() - for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=0) expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) tm.assert_frame_equal(result, expected) @@ -3480,14 +4030,15 @@ def test_expanding(self): tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() - for f in ['corr', 'cov']: + for f in ["corr", "cov"]: result = getattr(r, f)(self.frame) def func(x): return getattr(x.expanding(), f)(self.frame) + expected = g.apply(func) tm.assert_frame_equal(result, expected) @@ -3495,17 +4046,17 @@ def func(x): def func(x): return getattr(x.B.expanding(), f)(pairwise=True) + expected = g.apply(func) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) @@ -3516,28 +4067,33 @@ class TestRollingTS: def setup_method(self, method): - self.regular = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': range(5)}).set_index('A') + self.regular = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") - self.ragged = DataFrame({'B': range(5)}) - self.ragged.index = [Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')] + self.ragged = DataFrame({"B": range(5)}) + self.ragged.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=[Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')]) + df = DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=[ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ], + ) df - df.rolling('2s').sum() + df.rolling("2s").sum() def test_valid(self): @@ -3545,28 +4101,28 @@ def test_valid(self): 
# not a valid freq with pytest.raises(ValueError): - df.rolling(window='foobar') + df.rolling(window="foobar") # not a datetimelike index with pytest.raises(ValueError): - df.reset_index().rolling(window='foobar') + df.reset_index().rolling(window="foobar") # non-fixed freqs - for freq in ['2MS', pd.offsets.MonthBegin(2)]: + for freq in ["2MS", pd.offsets.MonthBegin(2)]: with pytest.raises(ValueError): df.rolling(window=freq) - for freq in ['1D', pd.offsets.Day(2), '2ms']: + for freq in ["1D", pd.offsets.Day(2), "2ms"]: df.rolling(window=freq) # non-integer min_periods - for minp in [1.0, 'foo', np.array([1, 2, 3])]: + for minp in [1.0, "foo", np.array([1, 2, 3])]: with pytest.raises(ValueError): - df.rolling(window='1D', min_periods=minp) + df.rolling(window="1D", min_periods=minp) # center is not implemented with pytest.raises(NotImplementedError): - df.rolling(window='1D', center=True) + df.rolling(window="1D", center=True) def test_on(self): @@ -3574,71 +4130,66 @@ def test_on(self): # not a valid column with pytest.raises(ValueError): - df.rolling(window='2s', on='foobar') + df.rolling(window="2s", on="foobar") # column is valid df = df.copy() - df['C'] = pd.date_range('20130101', periods=len(df)) - df.rolling(window='2d', on='C').sum() + df["C"] = pd.date_range("20130101", periods=len(df)) + df.rolling(window="2d", on="C").sum() # invalid columns with pytest.raises(ValueError): - df.rolling(window='2d', on='B') + df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window='2d', on='C').B.sum() + df.rolling(window="2d", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': range(5)}) + df = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) assert df.A.is_monotonic - df.rolling('2s', on='A').sum() + df.rolling("2s", on="A").sum() - df = df.set_index('A') + df = df.set_index("A") assert df.index.is_monotonic - df.rolling('2s').sum() + df.rolling("2s").sum() # non-monotonic df.index = reversed(df.index.tolist()) assert not df.index.is_monotonic with pytest.raises(ValueError): - df.rolling('2s').sum() + df.rolling("2s").sum() df = df.reset_index() with pytest.raises(ValueError): - df.rolling('2s', on='A').sum() + df.rolling("2s", on="A").sum() def test_frame_on(self): - df = DataFrame({'B': range(5), - 'C': pd.date_range('20130101 09:00:00', - periods=5, - freq='3s')}) + df = DataFrame( + { + "B": range(5), + "C": pd.date_range("20130101 09:00:00", periods=5, freq="3s"), + } + ) - df['A'] = [Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')] + df["A"] = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] # we are doing simulating using 'on' - expected = (df.set_index('A') - .rolling('2s') - .B - .sum() - .reset_index(drop=True) - ) + expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True) - result = (df.rolling('2s', on='A') - .B - .sum() - ) + result = df.rolling("2s", on="A").B.sum() tm.assert_series_equal(result, expected) # test as a frame @@ -3647,66 +4198,70 @@ def test_frame_on(self): # so the columns need to be switched compared # to the actual result where they are ordered as in the # original - expected = (df.set_index('A') - .rolling('2s')[['B']] - .sum() - 
.reset_index()[['B', 'A']] - ) + expected = ( + df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]] + ) - result = (df.rolling('2s', on='A')[['B']] - .sum() - ) + result = df.rolling("2s", on="A")[["B"]].sum() tm.assert_frame_equal(result, expected) def test_frame_on2(self): # using multiple aggregation columns - df = DataFrame({'A': [0, 1, 2, 3, 4], - 'B': [0, 1, 2, np.nan, 4], - 'C': Index([Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')])}, - columns=['A', 'C', 'B']) - - expected1 = DataFrame({'A': [0., 1, 3, 3, 7], - 'B': [0, 1, 3, np.nan, 4], - 'C': df['C']}, - columns=['A', 'C', 'B']) - - result = df.rolling('2s', on='C').sum() + df = DataFrame( + { + "A": [0, 1, 2, 3, 4], + "B": [0, 1, 2, np.nan, 4], + "C": Index( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + ), + }, + columns=["A", "C", "B"], + ) + + expected1 = DataFrame( + {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]}, + columns=["A", "C", "B"], + ) + + result = df.rolling("2s", on="C").sum() expected = expected1 tm.assert_frame_equal(result, expected) - expected = Series([0, 1, 3, np.nan, 4], name='B') - result = df.rolling('2s', on='C').B.sum() + expected = Series([0, 1, 3, np.nan, 4], name="B") + result = df.rolling("2s", on="C").B.sum() tm.assert_series_equal(result, expected) - expected = expected1[['A', 'B', 'C']] - result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + expected = expected1[["A", "B", "C"]] + result = df.rolling("2s", on="C")[["A", "B", "C"]].sum() tm.assert_frame_equal(result, expected) def test_basic_regular(self): df = self.regular.copy() - df.index = pd.date_range('20130101', periods=5, freq='D') + df.index = pd.date_range("20130101", periods=5, freq="D") expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='1D').sum() + result = df.rolling(window="1D").sum() tm.assert_frame_equal(result, expected) - df.index = pd.date_range('20130101', periods=5, freq='2D') + df.index = pd.date_range("20130101", periods=5, freq="2D") expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='2D', min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() tm.assert_frame_equal(result, expected) expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='2D', min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() tm.assert_frame_equal(result, expected) expected = df.rolling(window=1).sum() - result = df.rolling(window='2D').sum() + result = df.rolling(window="2D").sum() tm.assert_frame_equal(result, expected) def test_min_periods(self): @@ -3716,307 +4271,312 @@ def test_min_periods(self): # these slightly different expected = df.rolling(2, min_periods=1).sum() - result = df.rolling('2s').sum() + result = df.rolling("2s").sum() tm.assert_frame_equal(result, expected) expected = df.rolling(2, min_periods=1).sum() - result = df.rolling('2s', min_periods=1).sum() + result = df.rolling("2s", min_periods=1).sum() tm.assert_frame_equal(result, expected) def test_closed(self): # xref GH13965 - df = DataFrame({'A': [1] * 5}, - index=[Timestamp('20130101 09:00:01'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:04'), - Timestamp('20130101 09:00:06')]) + df = DataFrame( + {"A": 
[1] * 5}, + index=[ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:04"), + Timestamp("20130101 09:00:06"), + ], + ) # closed must be 'right', 'left', 'both', 'neither' with pytest.raises(ValueError): - self.regular.rolling(window='2s', closed="blabla") + self.regular.rolling(window="2s", closed="blabla") expected = df.copy() expected["A"] = [1.0, 2, 2, 2, 1] - result = df.rolling('2s', closed='right').sum() + result = df.rolling("2s", closed="right").sum() tm.assert_frame_equal(result, expected) # default should be 'right' - result = df.rolling('2s').sum() + result = df.rolling("2s").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [1.0, 2, 3, 3, 2] - result = df.rolling('2s', closed='both').sum() + result = df.rolling("2s", closed="both").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [np.nan, 1.0, 2, 2, 1] - result = df.rolling('2s', closed='left').sum() + result = df.rolling("2s", closed="left").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [np.nan, 1.0, 1, 1, np.nan] - result = df.rolling('2s', closed='neither').sum() + result = df.rolling("2s", closed="neither").sum() tm.assert_frame_equal(result, expected) def test_ragged_sum(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).sum() + result = df.rolling(window="1s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).sum() + result = df.rolling(window="2s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 3, 7] + expected["B"] = [0.0, 1, 3, 3, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=2).sum() + result = df.rolling(window="2s", min_periods=2).sum() expected = df.copy() - expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + expected["B"] = [np.nan, np.nan, 3, np.nan, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).sum() + result = df.rolling(window="3s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 5, 7] + expected["B"] = [0.0, 1, 3, 5, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s').sum() + result = df.rolling(window="3s").sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 5, 7] + expected["B"] = [0.0, 1, 3, 5, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='4s', min_periods=1).sum() + result = df.rolling(window="4s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 6, 9] + expected["B"] = [0.0, 1, 3, 6, 9] tm.assert_frame_equal(result, expected) - result = df.rolling(window='4s', min_periods=3).sum() + result = df.rolling(window="4s", min_periods=3).sum() expected = df.copy() - expected['B'] = [np.nan, np.nan, 3, 6, 9] + expected["B"] = [np.nan, np.nan, 3, 6, 9] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).sum() + result = df.rolling(window="5s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 6, 10] + expected["B"] = [0.0, 1, 3, 6, 10] tm.assert_frame_equal(result, expected) def test_ragged_mean(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).mean() + result = df.rolling(window="1s", min_periods=1).mean() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] 
+ expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).mean() + result = df.rolling(window="2s", min_periods=1).mean() expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_median(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).median() + result = df.rolling(window="1s", min_periods=1).median() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).median() + result = df.rolling(window="2s", min_periods=1).median() expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_quantile(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).quantile(0.5) + result = df.rolling(window="1s", min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).quantile(0.5) + result = df.rolling(window="2s", min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_std(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).std(ddof=0) + result = df.rolling(window="1s", min_periods=1).std(ddof=0) expected = df.copy() - expected['B'] = [0.0] * 5 + expected["B"] = [0.0] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='1s', min_periods=1).std(ddof=1) + result = df.rolling(window="1s", min_periods=1).std(ddof=1) expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).std(ddof=0) + result = df.rolling(window="3s", min_periods=1).std(ddof=0) expected = df.copy() - expected['B'] = [0.0] + [0.5] * 4 + expected["B"] = [0.0] + [0.5] * 4 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).std(ddof=1) + result = df.rolling(window="5s", min_periods=1).std(ddof=1) expected = df.copy() - expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] tm.assert_frame_equal(result, expected) def test_ragged_var(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).var(ddof=0) + result = df.rolling(window="1s", min_periods=1).var(ddof=0) expected = df.copy() - expected['B'] = [0.0] * 5 + expected["B"] = [0.0] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='1s', min_periods=1).var(ddof=1) + result = df.rolling(window="1s", min_periods=1).var(ddof=1) expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).var(ddof=0) + result = df.rolling(window="3s", min_periods=1).var(ddof=0) expected = df.copy() - expected['B'] = [0.0] + [0.25] * 4 + expected["B"] = [0.0] + [0.25] * 4 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).var(ddof=1) + result = df.rolling(window="5s", min_periods=1).var(ddof=1) expected = df.copy() - expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 
3.] + expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0] tm.assert_frame_equal(result, expected) def test_ragged_skew(self): df = self.ragged - result = df.rolling(window='3s', min_periods=1).skew() + result = df.rolling(window="3s", min_periods=1).skew() expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).skew() + result = df.rolling(window="5s", min_periods=1).skew() expected = df.copy() - expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0] tm.assert_frame_equal(result, expected) def test_ragged_kurt(self): df = self.ragged - result = df.rolling(window='3s', min_periods=1).kurt() + result = df.rolling(window="3s", min_periods=1).kurt() expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).kurt() + result = df.rolling(window="5s", min_periods=1).kurt() expected = df.copy() - expected['B'] = [np.nan] * 4 + [-1.2] + expected["B"] = [np.nan] * 4 + [-1.2] tm.assert_frame_equal(result, expected) def test_ragged_count(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).count() + result = df.rolling(window="1s", min_periods=1).count() expected = df.copy() - expected['B'] = [1.0, 1, 1, 1, 1] + expected["B"] = [1.0, 1, 1, 1, 1] tm.assert_frame_equal(result, expected) df = self.ragged - result = df.rolling(window='1s').count() + result = df.rolling(window="1s").count() tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).count() + result = df.rolling(window="2s", min_periods=1).count() expected = df.copy() - expected['B'] = [1.0, 1, 2, 1, 2] + expected["B"] = [1.0, 1, 2, 1, 2] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=2).count() + result = df.rolling(window="2s", min_periods=2).count() expected = df.copy() - expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + expected["B"] = [np.nan, np.nan, 2, np.nan, 2] tm.assert_frame_equal(result, expected) def test_regular_min(self): - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': [0.0, 1, 2, 3, 4]}).set_index('A') - result = df.rolling('1s').min() + df = DataFrame( + { + "A": pd.date_range("20130101", periods=5, freq="s"), + "B": [0.0, 1, 2, 3, 4], + } + ).set_index("A") + result = df.rolling("1s").min() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': [5, 4, 3, 4, 5]}).set_index('A') + df = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]} + ).set_index("A") tm.assert_frame_equal(result, expected) - result = df.rolling('2s').min() + result = df.rolling("2s").min() expected = df.copy() - expected['B'] = [5.0, 4, 3, 3, 4] + expected["B"] = [5.0, 4, 3, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling('5s').min() + result = df.rolling("5s").min() expected = df.copy() - expected['B'] = [5.0, 4, 3, 3, 3] + expected["B"] = [5.0, 4, 3, 3, 3] tm.assert_frame_equal(result, expected) def test_ragged_min(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).min() + result = df.rolling(window="1s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] 
tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).min() + result = df.rolling(window="2s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 1, 1, 3, 3] + expected["B"] = [0.0, 1, 1, 3, 3] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).min() + result = df.rolling(window="5s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 0, 0, 1, 1] + expected["B"] = [0.0, 0, 0, 1, 1] tm.assert_frame_equal(result, expected) def test_perf_min(self): N = 10000 - dfp = DataFrame({'B': np.random.randn(N)}, - index=pd.date_range('20130101', - periods=N, - freq='s')) + dfp = DataFrame( + {"B": np.random.randn(N)}, + index=pd.date_range("20130101", periods=N, freq="s"), + ) expected = dfp.rolling(2, min_periods=1).min() - result = dfp.rolling('2s').min() + result = dfp.rolling("2s").min() assert ((result - expected) < 0.01).all().bool() expected = dfp.rolling(200, min_periods=1).min() - result = dfp.rolling('200s').min() + result = dfp.rolling("200s").min() assert ((result - expected) < 0.01).all().bool() def test_ragged_max(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).max() + result = df.rolling(window="1s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).max() + result = df.rolling(window="2s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).max() + result = df.rolling(window="5s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) def test_ragged_apply(self, raw): @@ -4024,19 +4584,19 @@ def test_ragged_apply(self, raw): df = self.ragged f = lambda x: 1 - result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. + expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. + expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. 
+ expected["B"] = 1.0 tm.assert_frame_equal(result, expected) def test_all(self): @@ -4044,10 +4604,20 @@ def test_all(self): # simple comparison of integer vs time-based windowing df = self.regular * 2 er = df.rolling(window=1) - r = df.rolling(window='1s') - - for f in ['sum', 'mean', 'count', 'median', 'std', - 'var', 'kurt', 'skew', 'min', 'max']: + r = df.rolling(window="1s") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: result = getattr(r, f)() expected = getattr(er, f)() @@ -4061,7 +4631,7 @@ def test_all_apply(self, raw): df = self.regular * 2 er = df.rolling(window=1) - r = df.rolling(window='1s') + r = df.rolling(window="1s") result = r.apply(lambda x: 1, raw=raw) expected = er.apply(lambda x: 1, raw=raw) @@ -4071,17 +4641,26 @@ def test_all2(self): # more sophisticated comparison of integer vs. # time-based windowing - df = DataFrame({'B': np.arange(50)}, - index=pd.date_range('20130101', - periods=50, freq='H') - ) + df = DataFrame( + {"B": np.arange(50)}, index=pd.date_range("20130101", periods=50, freq="H") + ) # in-range data dft = df.between_time("09:00", "16:00") - r = dft.rolling(window='5H') - - for f in ['sum', 'mean', 'count', 'median', 'std', - 'var', 'kurt', 'skew', 'min', 'max']: + r = dft.rolling(window="5H") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: result = getattr(r, f)() @@ -4092,8 +4671,12 @@ def test_all2(self): def agg_by_day(x): x = x.between_time("09:00", "16:00") return getattr(x.rolling(5, min_periods=1), f)() - expected = df.groupby(df.index.day).apply( - agg_by_day).reset_index(level=0, drop=True) + + expected = ( + df.groupby(df.index.day) + .apply(agg_by_day) + .reset_index(level=0, drop=True) + ) tm.assert_frame_equal(result, expected) @@ -4103,52 +4686,65 @@ def test_groupby_monotonic(self): # we don't need to validate monotonicity when grouping data = [ - ['David', '1/1/2015', 100], ['David', '1/5/2015', 500], - ['David', '5/30/2015', 50], ['David', '7/25/2015', 50], - ['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500], - ['Ryan', '3/31/2016', 50], ['Joe', '7/1/2015', 100], - ['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]] - - df = DataFrame(data=data, columns=['name', 'date', 'amount']) - df['date'] = pd.to_datetime(df['date']) - - expected = df.set_index('date').groupby('name').apply( - lambda x: x.rolling('180D')['amount'].sum()) - result = df.groupby('name').rolling('180D', on='date')['amount'].sum() + ["David", "1/1/2015", 100], + ["David", "1/5/2015", 500], + ["David", "5/30/2015", 50], + ["David", "7/25/2015", 50], + ["Ryan", "1/4/2014", 100], + ["Ryan", "1/19/2015", 500], + ["Ryan", "3/31/2016", 50], + ["Joe", "7/1/2015", 100], + ["Joe", "9/9/2015", 500], + ["Joe", "10/15/2015", 50], + ] + + df = DataFrame(data=data, columns=["name", "date", "amount"]) + df["date"] = pd.to_datetime(df["date"]) + + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) - dates = pd.date_range(start='2016-01-01 09:30:00', - periods=20, freq='s') - df = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.concatenate((dates, dates)), - 'C': np.arange(40)}) + dates = pd.date_range(start="2016-01-01 09:30:00", periods=20, freq="s") + df = DataFrame( + { + 
"A": [1] * 20 + [2] * 12 + [3] * 8, + "B": np.concatenate((dates, dates)), + "C": np.arange(40), + } + ) - result = df.groupby('A').rolling('4s', on='B').C.mean() - expected = df.set_index('B').groupby('A').apply( - lambda x: x.rolling('4s')['C'].mean()) + result = df.groupby("A").rolling("4s", on="B").C.mean() + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) tm.assert_series_equal(result, expected) - df2 = df.sort_values('B') - result = df2.groupby('A').rolling('4s', on='B').C.mean() + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) def test_rolling_cov_offset(self): # GH16058 - idx = pd.date_range('2017-01-01', periods=24, freq='1h') + idx = pd.date_range("2017-01-01", periods=24, freq="1h") ss = Series(np.arange(len(idx)), index=idx) - result = ss.rolling('2h').cov() + result = ss.rolling("2h").cov() expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) tm.assert_series_equal(result, expected) expected2 = ss.rolling(2, min_periods=1).cov() tm.assert_series_equal(result, expected2) - result = ss.rolling('3h').cov() + result = ss.rolling("3h").cov() expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 6e3e768f9360f6..bc1eee2a0aaf2a 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -24,10 +24,7 @@ def transform(request): return request.param -@pytest.fixture(params=[ - 47393996303418497800, - 100000000000000000000 -]) +@pytest.fixture(params=[47393996303418497800, 100000000000000000000]) def large_val(request): return request.param @@ -37,19 +34,24 @@ def multiple_elts(request): return request.param -@pytest.fixture(params=[ - (lambda x: Index(x, name="idx"), tm.assert_index_equal), - (lambda x: Series(x, name="ser"), tm.assert_series_equal), - (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal) -]) +@pytest.fixture( + params=[ + (lambda x: Index(x, name="idx"), tm.assert_index_equal), + (lambda x: Series(x, name="ser"), tm.assert_series_equal), + (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal), + ] +) def transform_assert_equal(request): return request.param -@pytest.mark.parametrize("input_kwargs,result_kwargs", [ - (dict(), dict(dtype=np.int64)), - (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)) -]) +@pytest.mark.parametrize( + "input_kwargs,result_kwargs", + [ + (dict(), dict(dtype=np.int64)), + (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)), + ], +) def test_empty(input_kwargs, result_kwargs): # see gh-16302 ser = Series([], dtype=object) @@ -68,13 +70,15 @@ def test_series(last_val): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data", [ - [1, 3, 4, 5], - [1., 3., 4., 5.], - - # Bool is regarded as numeric. - [True, False, True, True] -]) +@pytest.mark.parametrize( + "data", + [ + [1, 3, 4, 5], + [1.0, 3.0, 4.0, 5.0], + # Bool is regarded as numeric. 
+ [True, False, True, True], + ], +) def test_series_numeric(data): ser = Series(data, index=list("ABCD"), name="EFG") @@ -82,12 +86,16 @@ def test_series_numeric(data): tm.assert_series_equal(result, ser) -@pytest.mark.parametrize("data,msg", [ - ([1, -3.14, "apple"], - 'Unable to parse string "apple" at position 2'), - (["orange", 1, -3.14, "apple"], - 'Unable to parse string "orange" at position 0') -]) +@pytest.mark.parametrize( + "data,msg", + [ + ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'), + ( + ["orange", 1, -3.14, "apple"], + 'Unable to parse string "orange" at position 0', + ), + ], +) def test_error(data, msg): ser = Series(data) @@ -95,10 +103,9 @@ def test_error(data, msg): to_numeric(ser, errors="raise") -@pytest.mark.parametrize("errors,exp_data", [ - ("ignore", [1, -3.14, "apple"]), - ("coerce", [1, -3.14, np.nan]) -]) +@pytest.mark.parametrize( + "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])] +) def test_ignore_error(errors, exp_data): ser = Series([1, -3.14, "apple"]) result = to_numeric(ser, errors=errors) @@ -107,13 +114,15 @@ def test_ignore_error(errors, exp_data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("errors,exp", [ - ("raise", 'Unable to parse string "apple" at position 2'), - ("ignore", [True, False, "apple"]), - - # Coerces to float. - ("coerce", [1., 0., np.nan]) -]) +@pytest.mark.parametrize( + "errors,exp", + [ + ("raise", 'Unable to parse string "apple" at position 2'), + ("ignore", [True, False, "apple"]), + # Coerces to float. + ("coerce", [1.0, 0.0, np.nan]), + ], +) def test_bool_handling(errors, exp): ser = Series([True, False, "apple"]) @@ -135,22 +144,22 @@ def test_list(): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("data,arr_kwargs", [ - ([1, 3, 4, 5], dict(dtype=np.int64)), - ([1., 3., 4., 5.], dict()), - - # Boolean is regarded as numeric. - ([True, False, True, True], dict()) -]) +@pytest.mark.parametrize( + "data,arr_kwargs", + [ + ([1, 3, 4, 5], dict(dtype=np.int64)), + ([1.0, 3.0, 4.0, 5.0], dict()), + # Boolean is regarded as numeric. + ([True, False, True, True], dict()), + ], +) def test_list_numeric(data, arr_kwargs): result = to_numeric(data) expected = np.array(data, **arr_kwargs) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(dtype="O"), dict() -]) +@pytest.mark.parametrize("kwargs", [dict(dtype="O"), dict()]) def test_numeric(kwargs): data = [1, -3.14, 7] @@ -161,24 +170,25 @@ def test_numeric(kwargs): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("columns", [ - # One column. - "a", - - # Multiple columns. - ["a", "b"] -]) +@pytest.mark.parametrize( + "columns", + [ + # One column. + "a", + # Multiple columns. 
+ ["a", "b"], + ], +) def test_numeric_df_columns(columns): # see gh-14827 - df = DataFrame(dict( - a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], - b=[1.0, 2.0, 3.0, 4.0], - )) + df = DataFrame( + dict( + a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + b=[1.0, 2.0, 3.0, 4.0], + ) + ) - expected = DataFrame(dict( - a=[1.2, 3.14, np.inf, 0.1], - b=[1.0, 2.0, 3.0, 4.0], - )) + expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0])) df_copy = df.copy() df_copy[columns] = df_copy[columns].apply(to_numeric) @@ -186,12 +196,16 @@ def test_numeric_df_columns(columns): tm.assert_frame_equal(df_copy, expected) -@pytest.mark.parametrize("data,exp_data", [ - ([[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], - [[3.14, 1.0], 1.6, 0.1]), - ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], - [[3.14, 1.0], 0.1]) -]) +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[3.14, 1.0], 1.6, 0.1], + ), + ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ], +) def test_numeric_embedded_arr_likes(data, exp_data): # Test to_numeric with embedded lists and arrays df = DataFrame(dict(a=data)) @@ -238,13 +252,11 @@ def test_really_large_scalar(large_val, signed, transform, errors): with pytest.raises(ValueError, match=msg): to_numeric(val, **kwargs) else: - expected = float(val) if (errors == "coerce" and - val_is_string) else val + expected = float(val) if (errors == "coerce" and val_is_string) else val tm.assert_almost_equal(to_numeric(val, **kwargs), expected) -def test_really_large_in_arr(large_val, signed, transform, - multiple_elts, errors): +def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): # see gh-24910 kwargs = dict(errors=errors) if errors is not None else dict() val = -large_val if signed else large_val @@ -283,8 +295,7 @@ def test_really_large_in_arr(large_val, signed, transform, tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) -def test_really_large_in_arr_consistent(large_val, signed, - multiple_elts, errors): +def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors): # see gh-24910 # # Even if we discover that we have to hold float, does not mean @@ -314,11 +325,14 @@ def test_really_large_in_arr_consistent(large_val, signed, tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) -@pytest.mark.parametrize("errors,checker", [ - ("raise", 'Unable to parse string "fail" at position 0'), - ("ignore", lambda x: x == "fail"), - ("coerce", lambda x: np.isnan(x)) -]) +@pytest.mark.parametrize( + "errors,checker", + [ + ("raise", 'Unable to parse string "fail" at position 0'), + ("ignore", lambda x: x == "fail"), + ("coerce", lambda x: np.isnan(x)), + ], +) def test_scalar_fail(errors, checker): scalar = "fail" @@ -329,10 +343,7 @@ def test_scalar_fail(errors, checker): assert checker(to_numeric(scalar, errors=errors)) -@pytest.mark.parametrize("data", [ - [1, 2, 3], - [1., np.nan, 3, np.nan] -]) +@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]]) def test_numeric_dtypes(data, transform_assert_equal): transform, assert_equal = transform_assert_equal data = transform(data) @@ -341,10 +352,13 @@ def test_numeric_dtypes(data, transform_assert_equal): assert_equal(result, data) -@pytest.mark.parametrize("data,exp", [ - (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")), - (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])) -]) 
+@pytest.mark.parametrize( + "data,exp", + [ + (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")), + (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])), + ], +) def test_str(data, exp, transform_assert_equal): transform, assert_equal = transform_assert_equal result = to_numeric(transform(data)) @@ -386,11 +400,14 @@ def test_period(transform_assert_equal): pytest.skip("Missing PeriodDtype support in to_numeric") -@pytest.mark.parametrize("errors,expected", [ - ("raise", "Invalid object type at position 0"), - ("ignore", Series([[10.0, 2], 1.0, "apple"])), - ("coerce", Series([np.nan, 1.0, np.nan])) -]) +@pytest.mark.parametrize( + "errors,expected", + [ + ("raise", "Invalid object type at position 0"), + ("ignore", Series([[10.0, 2], 1.0, "apple"])), + ("coerce", Series([np.nan, 1.0, np.nan])), + ], +) def test_non_hashable(errors, expected): # see gh-13324 ser = Series([[10.0, 2], 1.0, "apple"]) @@ -423,23 +440,26 @@ def test_errors_invalid_value(): to_numeric(data, errors=invalid_error_value) -@pytest.mark.parametrize("data", [ - ["1", 2, 3], - [1, 2, 3], - np.array(["1970-01-02", "1970-01-03", - "1970-01-04"], dtype="datetime64[D]") -]) -@pytest.mark.parametrize("kwargs,exp_dtype", [ - # Basic function tests. - (dict(), np.int64), - (dict(downcast=None), np.int64), - - # Support below np.float32 is rare and far between. - (dict(downcast="float"), np.dtype(np.float32).char), - - # Basic dtype support. - (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])) -]) +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) +@pytest.mark.parametrize( + "kwargs,exp_dtype", + [ + # Basic function tests. + (dict(), np.int64), + (dict(downcast=None), np.int64), + # Support below np.float32 is rare and far between. + (dict(downcast="float"), np.dtype(np.float32).char), + # Basic dtype support. 
+ (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])), + ], +) def test_downcast_basic(data, kwargs, exp_dtype): # see gh-13352 result = to_numeric(data, **kwargs) @@ -448,12 +468,14 @@ def test_downcast_basic(data, kwargs, exp_dtype): @pytest.mark.parametrize("signed_downcast", ["integer", "signed"]) -@pytest.mark.parametrize("data", [ - ["1", 2, 3], - [1, 2, 3], - np.array(["1970-01-02", "1970-01-03", - "1970-01-04"], dtype="datetime64[D]") -]) +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) def test_signed_downcast(data, signed_downcast): # see gh-13352 smallest_int_dtype = np.dtype(np.typecodes["Integer"][0]) @@ -470,8 +492,7 @@ def test_ignore_downcast_invalid_data(): data = ["foo", 2, 3] expected = np.array(data, dtype=object) - res = to_numeric(data, errors="ignore", - downcast="unsigned") + res = to_numeric(data, errors="ignore", downcast="unsigned") tm.assert_numpy_array_equal(res, expected) @@ -486,13 +507,18 @@ def test_ignore_downcast_neg_to_unsigned(): @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) -@pytest.mark.parametrize("data,expected", [ - (["1.1", 2, 3], - np.array([1.1, 2, 3], dtype=np.float64)), - ([10000.0, 20000, 3000, 40000.36, 50000, 50000.00], - np.array([10000.0, 20000, 3000, - 40000.36, 50000, 50000.00], dtype=np.float64)) -]) +@pytest.mark.parametrize( + "data,expected", + [ + (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)), + ( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], + np.array( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64 + ), + ), + ], +) def test_ignore_downcast_cannot_convert_float(data, expected, downcast): # Cannot cast to an integer (signed or unsigned) # because we have a float number. 
@@ -500,11 +526,10 @@ def test_ignore_downcast_cannot_convert_float(data, expected, downcast): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("downcast,expected_dtype", [ - ("integer", np.int16), - ("signed", np.int16), - ("unsigned", np.uint16) -]) +@pytest.mark.parametrize( + "downcast,expected_dtype", + [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)], +) def test_downcast_not8bit(downcast, expected_dtype): # the smallest integer dtype need not be np.(u)int8 data = ["256", 257, 258] @@ -514,54 +539,47 @@ def test_downcast_not8bit(downcast, expected_dtype): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("dtype,downcast,min_max", [ - ("int8", "integer", [iinfo(np.int8).min, - iinfo(np.int8).max]), - ("int16", "integer", [iinfo(np.int16).min, - iinfo(np.int16).max]), - ("int32", "integer", [iinfo(np.int32).min, - iinfo(np.int32).max]), - ("int64", "integer", [iinfo(np.int64).min, - iinfo(np.int64).max]), - ("uint8", "unsigned", [iinfo(np.uint8).min, - iinfo(np.uint8).max]), - ("uint16", "unsigned", [iinfo(np.uint16).min, - iinfo(np.uint16).max]), - ("uint32", "unsigned", [iinfo(np.uint32).min, - iinfo(np.uint32).max]), - ("uint64", "unsigned", [iinfo(np.uint64).min, - iinfo(np.uint64).max]), - ("int16", "integer", [iinfo(np.int8).min, - iinfo(np.int8).max + 1]), - ("int32", "integer", [iinfo(np.int16).min, - iinfo(np.int16).max + 1]), - ("int64", "integer", [iinfo(np.int32).min, - iinfo(np.int32).max + 1]), - ("int16", "integer", [iinfo(np.int8).min - 1, - iinfo(np.int16).max]), - ("int32", "integer", [iinfo(np.int16).min - 1, - iinfo(np.int32).max]), - ("int64", "integer", [iinfo(np.int32).min - 1, - iinfo(np.int64).max]), - ("uint16", "unsigned", [iinfo(np.uint8).min, - iinfo(np.uint8).max + 1]), - ("uint32", "unsigned", [iinfo(np.uint16).min, - iinfo(np.uint16).max + 1]), - ("uint64", "unsigned", [iinfo(np.uint32).min, - iinfo(np.uint32).max + 1]) -]) +@pytest.mark.parametrize( + "dtype,downcast,min_max", + [ + ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]), + ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]), + ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]), + ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]), + ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]), + ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]), + ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]), + ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]), + ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]), + ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]), + ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), + ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), + ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]), + ], +) def test_downcast_limits(dtype, downcast, min_max): # see gh-14404: test the limits of each downcast. 
series = to_numeric(Series(min_max), downcast=downcast) assert series.dtype == dtype -@pytest.mark.parametrize("data,exp_data", [ - ([200, 300, "", "NaN", 30000000000000000000], - [200, 300, np.nan, np.nan, 30000000000000000000]), - (["12345678901234567890", "1234567890", "ITEM"], - [12345678901234567890, 1234567890, np.nan]) -]) +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [200, 300, "", "NaN", 30000000000000000000], + [200, 300, np.nan, np.nan, 30000000000000000000], + ), + ( + ["12345678901234567890", "1234567890", "ITEM"], + [12345678901234567890, 1234567890, np.nan], + ), + ], +) def test_coerce_uint64_conflict(data, exp_data): # see gh-17007 and gh-17125 # @@ -572,10 +590,13 @@ def test_coerce_uint64_conflict(data, exp_data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("errors,exp", [ - ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), - ("raise", "Unable to parse string") -]) +@pytest.mark.parametrize( + "errors,exp", + [ + ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), + ("raise", "Unable to parse string"), + ], +) def test_non_coerce_uint64_conflict(errors, exp): # see gh-17007 and gh-17125 # diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 7de1e8117289ed..be07f829dbae85 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -2,7 +2,11 @@ from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.frequencies import ( - FreqGroup, _period_code_map, get_freq, get_freq_code) + FreqGroup, + _period_code_map, + get_freq, + get_freq_code, +) import pandas.tseries.offsets as offsets @@ -12,11 +16,20 @@ def period_code_item(request): return request.param -@pytest.mark.parametrize("freqstr,expected", [ - ("A", 1000), ("3A", 1000), ("-1A", 1000), - ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), - ("W", 4000), ("W-MON", 4001), ("W-FRI", 4005) -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("W", 4000), + ("W-MON", 4001), + ("W-FRI", 4005), + ], +) def test_freq_code(freqstr, expected): assert get_freq(freqstr) == expected @@ -26,15 +39,31 @@ def test_freq_code_match(period_code_item): assert get_freq(freqstr) == code -@pytest.mark.parametrize("freqstr,expected", [ - ("A", 1000), ("3A", 1000), ("-1A", 1000), ("A-JAN", 1000), - ("A-MAY", 1000), ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), - ("Y-JAN", 1000), ("Y-MAY", 1000), (offsets.YearEnd(), 1000), - (offsets.YearEnd(month=1), 1000), (offsets.YearEnd(month=5), 1000), - ("W", 4000), ("W-MON", 4000), ("W-FRI", 4000), (offsets.Week(), 4000), - (offsets.Week(weekday=1), 4000), (offsets.Week(weekday=5), 4000), - ("T", FreqGroup.FR_MIN), -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("A-JAN", 1000), + ("A-MAY", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("Y-JAN", 1000), + ("Y-MAY", 1000), + (offsets.YearEnd(), 1000), + (offsets.YearEnd(month=1), 1000), + (offsets.YearEnd(month=5), 1000), + ("W", 4000), + ("W-MON", 4000), + ("W-FRI", 4000), + (offsets.Week(), 4000), + (offsets.Week(weekday=1), 4000), + (offsets.Week(weekday=5), 4000), + ("T", FreqGroup.FR_MIN), + ], +) def test_freq_group(freqstr, expected): assert resolution.get_freq_group(freqstr) == expected @@ -48,10 +77,10 @@ def test_freq_group_match(period_code_item): 
assert str_group == code_group == code // 1000 * 1000 -@pytest.mark.parametrize("freqstr,exp_freqstr", [ - ("D", "D"), ("W", "D"), ("M", "D"), - ("S", "S"), ("T", "S"), ("H", "S") -]) +@pytest.mark.parametrize( + "freqstr,exp_freqstr", + [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], +) def test_get_to_timestamp_base(freqstr, exp_freqstr): tsb = libfrequencies.get_to_timestamp_base @@ -61,18 +90,26 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): _reso = resolution.Resolution -@pytest.mark.parametrize("freqstr,expected", [ - ("A", "year"), ("Q", "quarter"), ("M", "month"), - ("D", "day"), ("H", "hour"), ("T", "minute"), - ("S", "second"), ("L", "millisecond"), - ("U", "microsecond"), ("N", "nanosecond") -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("H", "hour"), + ("T", "minute"), + ("S", "second"), + ("L", "millisecond"), + ("U", "microsecond"), + ("N", "nanosecond"), + ], +) def test_get_str_from_freq(freqstr, expected): assert _reso.get_str_from_freq(freqstr) == expected -@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", - "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", "T", "S", "L", "U", "N"]) def test_get_freq_roundtrip(freq): result = _reso.get_freq(_reso.get_str_from_freq(freq)) assert freq == result @@ -84,22 +121,30 @@ def test_get_freq_roundtrip2(freq): assert freq == result -@pytest.mark.parametrize("args,expected", [ - ((1.5, "T"), (90, "S")), ((62.4, "T"), (3744, "S")), - ((1.04, "H"), (3744, "S")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "U")), ((1.2345, "D"), (106660800, "L")) -]) +@pytest.mark.parametrize( + "args,expected", + [ + ((1.5, "T"), (90, "S")), + ((62.4, "T"), (3744, "S")), + ((1.04, "H"), (3744, "S")), + ((1, "D"), (1, "D")), + ((0.342931, "H"), (1234551600, "U")), + ((1.2345, "D"), (106660800, "L")), + ], +) def test_resolution_bumping(args, expected): # see gh-14378 assert _reso.get_stride_from_decimal(*args) == expected -@pytest.mark.parametrize("args", [ - (0.5, "N"), - - # Too much precision in the input can prevent. - (0.3429324798798269273987982, "H") -]) +@pytest.mark.parametrize( + "args", + [ + (0.5, "N"), + # Too much precision in the input can prevent. + (0.3429324798798269273987982, "H"), + ], +) def test_cat(args): msg = "Could not convert to integer offset at any resolution" @@ -107,38 +152,37 @@ def test_cat(args): _reso.get_stride_from_decimal(*args) -@pytest.mark.parametrize("freq_input,expected", [ - # Frequency string. - ("A", (get_freq("A"), 1)), - ("3D", (get_freq("D"), 3)), - ("-2M", (get_freq("M"), -2)), - - # Tuple. - (("D", 1), (get_freq("D"), 1)), - (("A", 3), (get_freq("A"), 3)), - (("M", -2), (get_freq("M"), -2)), - ((5, "T"), (FreqGroup.FR_MIN, 5)), - - # Numeric Tuple. - ((1000, 1), (1000, 1)), - - # Offsets. - (offsets.Day(), (get_freq("D"), 1)), - (offsets.Day(3), (get_freq("D"), 3)), - (offsets.Day(-2), (get_freq("D"), -2)), - (offsets.MonthEnd(), (get_freq("M"), 1)), - (offsets.MonthEnd(3), (get_freq("M"), 3)), - (offsets.MonthEnd(-2), (get_freq("M"), -2)), - (offsets.Week(), (get_freq("W"), 1)), - (offsets.Week(3), (get_freq("W"), 3)), - (offsets.Week(-2), (get_freq("W"), -2)), - (offsets.Hour(), (FreqGroup.FR_HR, 1)), - - # Monday is weekday=0. 
- (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), - (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), - (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), -]) +@pytest.mark.parametrize( + "freq_input,expected", + [ + # Frequency string. + ("A", (get_freq("A"), 1)), + ("3D", (get_freq("D"), 3)), + ("-2M", (get_freq("M"), -2)), + # Tuple. + (("D", 1), (get_freq("D"), 1)), + (("A", 3), (get_freq("A"), 3)), + (("M", -2), (get_freq("M"), -2)), + ((5, "T"), (FreqGroup.FR_MIN, 5)), + # Numeric Tuple. + ((1000, 1), (1000, 1)), + # Offsets. + (offsets.Day(), (get_freq("D"), 1)), + (offsets.Day(3), (get_freq("D"), 3)), + (offsets.Day(-2), (get_freq("D"), -2)), + (offsets.MonthEnd(), (get_freq("M"), 1)), + (offsets.MonthEnd(3), (get_freq("M"), 3)), + (offsets.MonthEnd(-2), (get_freq("M"), -2)), + (offsets.Week(), (get_freq("W"), 1)), + (offsets.Week(3), (get_freq("W"), 3)), + (offsets.Week(-2), (get_freq("W"), -2)), + (offsets.Hour(), (FreqGroup.FR_HR, 1)), + # Monday is weekday=0. + (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), + (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), + (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), + ], +) def test_get_freq_code(freq_input, expected): assert get_freq_code(freq_input) == expected diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index fb65ec1eb99619..4c8f6253cdf7ba 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -7,8 +7,7 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas.compat import is_platform_windows -from pandas import ( - DatetimeIndex, Index, Series, Timestamp, date_range, period_range) +from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range from pandas.core.tools.datetimes import to_datetime import pandas.util.testing as tm @@ -39,21 +38,38 @@ def _check_generated_range(start, periods, freq): else: inf_freq = frequencies.infer_freq(index) is_dec_range = inf_freq == "Q-DEC" and gen.freqstr in ( - "Q", "Q-DEC", "Q-SEP", "Q-JUN", "Q-MAR") + "Q", + "Q-DEC", + "Q-SEP", + "Q-JUN", + "Q-MAR", + ) is_nov_range = inf_freq == "Q-NOV" and gen.freqstr in ( - "Q-NOV", "Q-AUG", "Q-MAY", "Q-FEB") + "Q-NOV", + "Q-AUG", + "Q-MAY", + "Q-FEB", + ) is_oct_range = inf_freq == "Q-OCT" and gen.freqstr in ( - "Q-OCT", "Q-JUL", "Q-APR", "Q-JAN") + "Q-OCT", + "Q-JUL", + "Q-APR", + "Q-JAN", + ) assert is_dec_range or is_nov_range or is_oct_range -@pytest.fixture(params=[(timedelta(1), "D"), - (timedelta(hours=1), "H"), - (timedelta(minutes=1), "T"), - (timedelta(seconds=1), "S"), - (np.timedelta64(1, "ns"), "N"), - (timedelta(microseconds=1), "U"), - (timedelta(microseconds=1000), "L")]) +@pytest.fixture( + params=[ + (timedelta(1), "D"), + (timedelta(hours=1), "H"), + (timedelta(minutes=1), "T"), + (timedelta(seconds=1), "S"), + (np.timedelta64(1, "ns"), "N"), + (timedelta(microseconds=1), "U"), + (timedelta(microseconds=1000), "L"), + ] +) def base_delta_code_pair(request): return request.param @@ -128,8 +144,7 @@ def test_fifth_week_of_month_infer(): def test_week_of_month_fake(): # All of these dates are on same day # of week and are 4 or 5 weeks apart. 
- index = DatetimeIndex(["2013-08-27", "2013-10-01", - "2013-10-29", "2013-11-26"]) + index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29", "2013-11-26"]) assert frequencies.infer_freq(index) != "WOM-4TUE" @@ -137,8 +152,10 @@ def test_fifth_week_of_month(): # see gh-9425 # # Only supports freq up to WOM-4. - msg = ("Of the four parameters: start, end, periods, " - "and freq, exactly three must be specified") + msg = ( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): date_range("2014-01-01", freq="WOM-5MON") @@ -165,12 +182,17 @@ def test_infer_freq_delta(base_delta_code_pair, count): assert frequencies.infer_freq(index) == exp_freq -@pytest.mark.parametrize("constructor", [ - lambda now, delta: DatetimeIndex([now + delta * 7] + - [now + delta * j for j in range(3)]), - lambda now, delta: DatetimeIndex([now + delta * j for j in range(3)] + - [now + delta * 7]) -]) +@pytest.mark.parametrize( + "constructor", + [ + lambda now, delta: DatetimeIndex( + [now + delta * 7] + [now + delta * j for j in range(3)] + ), + lambda now, delta: DatetimeIndex( + [now + delta * j for j in range(3)] + [now + delta * 7] + ), + ], +) def test_infer_freq_custom(base_delta_code_pair, constructor): b = Timestamp(datetime.now()) base_delta, _ = base_delta_code_pair @@ -184,8 +206,9 @@ def test_weekly_infer(periods, day): def test_week_of_month_infer(periods, day, count): - _check_generated_range("1/1/2000", periods, - "WOM-{count}{day}".format(count=count, day=day)) + _check_generated_range( + "1/1/2000", periods, "WOM-{count}{day}".format(count=count, day=day) + ) @pytest.mark.parametrize("freq", ["M", "BM", "BMS"]) @@ -194,20 +217,19 @@ def test_monthly_infer(periods, freq): def test_quarterly_infer(month, periods): - _check_generated_range("1/1/2000", periods, - "Q-{month}".format(month=month)) + _check_generated_range("1/1/2000", periods, "Q-{month}".format(month=month)) @pytest.mark.parametrize("annual", ["A", "BA"]) def test_annually_infer(month, periods, annual): - _check_generated_range("1/1/2000", periods, - "{annual}-{month}".format(annual=annual, - month=month)) + _check_generated_range( + "1/1/2000", periods, "{annual}-{month}".format(annual=annual, month=month) + ) -@pytest.mark.parametrize("freq,expected", [ - ("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT") -]) +@pytest.mark.parametrize( + "freq,expected", [("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT")] +) def test_infer_freq_index(freq, expected): rng = period_range("1959Q2", "2009Q3", freq=freq) rng = Index(rng.to_timestamp("D", how="e").astype(object)) @@ -218,13 +240,20 @@ def test_infer_freq_index(freq, expected): @pytest.mark.parametrize( "expected,dates", list( - {"AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], - "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], - "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], - "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], - "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], - "H": ["2011-12-31 22:00", "2011-12-31 23:00", - "2012-01-01 00:00", "2012-01-01 01:00"]}.items()) + { + "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], + "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], + "D": ["2011-01-01", "2011-01-02", "2011-01-03", 
"2011-01-04"], + "H": [ + "2011-12-31 22:00", + "2011-12-31 23:00", + "2012-01-01 00:00", + "2012-01-01 01:00", + ], + }.items() + ), ) def test_infer_freq_tz(tz_naive_fixture, expected, dates): # see gh-7310 @@ -233,14 +262,17 @@ def test_infer_freq_tz(tz_naive_fixture, expected, dates): assert idx.inferred_freq == expected -@pytest.mark.parametrize("date_pair", [ - ["2013-11-02", "2013-11-5"], # Fall DST - ["2014-03-08", "2014-03-11"], # Spring DST - ["2014-01-01", "2014-01-03"] # Regular Time -]) -@pytest.mark.parametrize("freq", [ - "3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N" -]) +@pytest.mark.parametrize( + "date_pair", + [ + ["2013-11-02", "2013-11-5"], # Fall DST + ["2014-03-08", "2014-03-11"], # Spring DST + ["2014-01-01", "2014-01-03"], # Regular Time + ], +) +@pytest.mark.parametrize( + "freq", ["3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N"] +) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 tz = tz_naive_fixture @@ -249,33 +281,90 @@ def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): def test_infer_freq_tz_transition_custom(): - index = date_range("2013-11-03", periods=5, - freq="3H").tz_localize("America/Chicago") + index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + "America/Chicago" + ) assert index.inferred_freq is None -@pytest.mark.parametrize("data,expected", [ - # Hourly freq in a day must result in "H" - (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", - "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00"], "H"), - - (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", - "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00", - "2014-07-01 15:00", "2014-07-01 16:00", "2014-07-02 09:00", - "2014-07-02 10:00", "2014-07-02 11:00"], "BH"), - (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", - "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", - "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", - "2014-07-07 10:00", "2014-07-07 11:00"], "BH"), - (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", - "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", - "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", - "2014-07-07 10:00", "2014-07-07 11:00", "2014-07-07 12:00", - "2014-07-07 13:00", "2014-07-07 14:00", "2014-07-07 15:00", - "2014-07-07 16:00", "2014-07-08 09:00", "2014-07-08 10:00", - "2014-07-08 11:00", "2014-07-08 12:00", "2014-07-08 13:00", - "2014-07-08 14:00", "2014-07-08 15:00", "2014-07-08 16:00"], "BH"), -]) +@pytest.mark.parametrize( + "data,expected", + [ + # Hourly freq in a day must result in "H" + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + ], + "H", + ), + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + "2014-07-01 15:00", + "2014-07-01 16:00", + "2014-07-02 09:00", + "2014-07-02 10:00", + "2014-07-02 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + 
"2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + "BH", + ), + ], +) def test_infer_freq_business_hour(data, expected): # see gh-7905 idx = DatetimeIndex(data) @@ -297,21 +386,21 @@ def test_non_datetime_index2(): assert result == rng.inferred_freq -@pytest.mark.parametrize("idx", [ - tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10) -]) +@pytest.mark.parametrize( + "idx", [tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10)] +) def test_invalid_index_types(idx): - msg = ("(cannot infer freq from a non-convertible)|" - "(Check the `freq` attribute instead of using infer_freq)") + msg = ( + "(cannot infer freq from a non-convertible)|" + "(Check the `freq` attribute instead of using infer_freq)" + ) with pytest.raises(TypeError, match=msg): frequencies.infer_freq(idx) -@pytest.mark.skipif(is_platform_windows(), - reason="see gh-10822: Windows issue") -@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), - tm.makeUnicodeIndex(10)]) +@pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue") +@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]) def test_invalid_index_types_unicode(idx): # see gh-10822 # @@ -339,7 +428,7 @@ def test_series(): assert inferred == "D" -@pytest.mark.parametrize("end", [10, 10.]) +@pytest.mark.parametrize("end", [10, 10.0]) def test_series_invalid_type(end): # see gh-6407 msg = "cannot infer freq from a non-convertible dtype on a Series" @@ -376,22 +465,63 @@ def test_series_datetime_index(freq): assert inferred == freq -@pytest.mark.parametrize("offset_func", [ - frequencies.get_offset, - lambda freq: date_range("2011-01-01", periods=5, freq=freq) -]) -@pytest.mark.parametrize("freq", [ - "WEEKDAY", "EOM", "W@MON", "W@TUE", "W@WED", "W@THU", - "W@FRI", "W@SAT", "W@SUN", "Q@JAN", "Q@FEB", "Q@MAR", - "A@JAN", "A@FEB", "A@MAR", "A@APR", "A@MAY", "A@JUN", - "A@JUL", "A@AUG", "A@SEP", "A@OCT", "A@NOV", "A@DEC", - "Y@JAN", "WOM@1MON", "WOM@2MON", "WOM@3MON", - "WOM@4MON", "WOM@1TUE", "WOM@2TUE", "WOM@3TUE", - "WOM@4TUE", "WOM@1WED", "WOM@2WED", "WOM@3WED", - "WOM@4WED", "WOM@1THU", "WOM@2THU", "WOM@3THU", - "WOM@4THU", "WOM@1FRI", "WOM@2FRI", "WOM@3FRI", - "WOM@4FRI" -]) +@pytest.mark.parametrize( + "offset_func", + [ + frequencies.get_offset, + lambda freq: date_range("2011-01-01", periods=5, freq=freq), + ], +) +@pytest.mark.parametrize( + "freq", + [ + "WEEKDAY", + "EOM", + "W@MON", + "W@TUE", + "W@WED", + "W@THU", + "W@FRI", + "W@SAT", + "W@SUN", + "Q@JAN", + "Q@FEB", + "Q@MAR", + "A@JAN", + "A@FEB", + "A@MAR", + "A@APR", + "A@MAY", + "A@JUN", + "A@JUL", + "A@AUG", + "A@SEP", + "A@OCT", + "A@NOV", + "A@DEC", + "Y@JAN", + "WOM@1MON", + "WOM@2MON", + "WOM@3MON", + "WOM@4MON", + "WOM@1TUE", + "WOM@2TUE", + "WOM@3TUE", + "WOM@4TUE", + "WOM@1WED", + "WOM@2WED", + "WOM@3WED", + "WOM@4WED", + "WOM@1THU", + "WOM@2THU", + "WOM@3THU", + "WOM@4THU", + "WOM@1FRI", + "WOM@2FRI", + "WOM@3FRI", + "WOM@4FRI", + ], +) def test_legacy_offset_warnings(offset_func, freq): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): offset_func(freq) diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tseries/frequencies/test_to_offset.py index 
c9c35b47f34753..b6069c446160df 100644 --- a/pandas/tests/tseries/frequencies/test_to_offset.py +++ b/pandas/tests/tseries/frequencies/test_to_offset.py @@ -8,54 +8,79 @@ import pandas.tseries.offsets as offsets -@pytest.mark.parametrize("freq_input,expected", [ - (frequencies.to_offset("10us"), offsets.Micro(10)), - (offsets.Hour(), offsets.Hour()), - ((5, "T"), offsets.Minute(5)), - ("2h30min", offsets.Minute(150)), - ("2h 30min", offsets.Minute(150)), - ("2h30min15s", offsets.Second(150 * 60 + 15)), - ("2h 60min", offsets.Hour(3)), - ("2h 20.5min", offsets.Second(8430)), - ("1.5min", offsets.Second(90)), - ("0.5S", offsets.Milli(500)), - ("15l500u", offsets.Micro(15500)), - ("10s75L", offsets.Milli(10075)), - ("1s0.25ms", offsets.Micro(1000250)), - ("1s0.25L", offsets.Micro(1000250)), - ("2800N", offsets.Nano(2800)), - ("2SM", offsets.SemiMonthEnd(2)), - ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), - ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), - ("2SMS-15", offsets.SemiMonthBegin(2)), -]) +@pytest.mark.parametrize( + "freq_input,expected", + [ + (frequencies.to_offset("10us"), offsets.Micro(10)), + (offsets.Hour(), offsets.Hour()), + ((5, "T"), offsets.Minute(5)), + ("2h30min", offsets.Minute(150)), + ("2h 30min", offsets.Minute(150)), + ("2h30min15s", offsets.Second(150 * 60 + 15)), + ("2h 60min", offsets.Hour(3)), + ("2h 20.5min", offsets.Second(8430)), + ("1.5min", offsets.Second(90)), + ("0.5S", offsets.Milli(500)), + ("15l500u", offsets.Micro(15500)), + ("10s75L", offsets.Milli(10075)), + ("1s0.25ms", offsets.Micro(1000250)), + ("1s0.25L", offsets.Micro(1000250)), + ("2800N", offsets.Nano(2800)), + ("2SM", offsets.SemiMonthEnd(2)), + ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), + ("2SMS-15", offsets.SemiMonthBegin(2)), + ], +) def test_to_offset(freq_input, expected): result = frequencies.to_offset(freq_input) assert result == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("-1S", -1), - ("-2SM", -2), - ("-1SMS", -1), - ("-5min10s", -310), -]) +@pytest.mark.parametrize( + "freqstr,expected", [("-1S", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] +) def test_to_offset_negative(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("freqstr", [ - "2h20m", "U1", "-U", "3U1", "-2-3U", "-2D:3H", - "1.5.0S", "2SMS-15-15", "2SMS-15D", "100foo", - - # Invalid leading +/- signs. - "+-1d", "-+1h", "+1", "-7", "+d", "-m", - - # Invalid shortcut anchors. - "SM-0", "SM-28", "SM-29", "SM-FOO", "BSM", "SM--1", "SMS-1", - "SMS-28", "SMS-30", "SMS-BAR", "SMS-BYR", "BSMS", "SMS--2" -]) +@pytest.mark.parametrize( + "freqstr", + [ + "2h20m", + "U1", + "-U", + "3U1", + "-2-3U", + "-2D:3H", + "1.5.0S", + "2SMS-15-15", + "2SMS-15D", + "100foo", + # Invalid leading +/- signs. + "+-1d", + "-+1h", + "+1", + "-7", + "+d", + "-m", + # Invalid shortcut anchors. 
+ "SM-0", + "SM-28", + "SM-29", + "SM-FOO", + "BSM", + "SM--1", + "SMS-1", + "SMS-28", + "SMS-30", + "SMS-BAR", + "SMS-BYR", + "BSMS", + "SMS--2", + ], +) def test_to_offset_invalid(freqstr): # see gh-13930 @@ -71,47 +96,49 @@ def test_to_offset_no_evaluate(): frequencies.to_offset(("", "")) -@pytest.mark.parametrize("freqstr,expected", [ - ("2D 3H", offsets.Hour(51)), - ("2 D3 H", offsets.Hour(51)), - ("2 D 3 H", offsets.Hour(51)), - (" 2 D 3 H ", offsets.Hour(51)), - (" H ", offsets.Hour()), - (" 3 H ", offsets.Hour(3)), -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("2D 3H", offsets.Hour(51)), + ("2 D3 H", offsets.Hour(51)), + ("2 D 3 H", offsets.Hour(51)), + (" 2 D 3 H ", offsets.Hour(51)), + (" H ", offsets.Hour()), + (" 3 H ", offsets.Hour(3)), + ], +) def test_to_offset_whitespace(freqstr, expected): result = frequencies.to_offset(freqstr) assert result == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("00H 00T 01S", 1), - ("-00H 03T 14S", -194), -]) +@pytest.mark.parametrize( + "freqstr,expected", [("00H 00T 01S", 1), ("-00H 03T 14S", -194)] +) def test_to_offset_leading_zero(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("+1d", 1), - ("+2h30min", 150), -]) +@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) def test_to_offset_leading_plus(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("kwargs,expected", [ - (dict(days=1, seconds=1), offsets.Second(86401)), - (dict(days=-1, seconds=1), offsets.Second(-86399)), - (dict(hours=1, minutes=10), offsets.Minute(70)), - (dict(hours=1, minutes=-10), offsets.Minute(50)), - (dict(weeks=1), offsets.Day(7)), - (dict(hours=1), offsets.Hour(1)), - (dict(hours=1), frequencies.to_offset("60min")), - (dict(microseconds=1), offsets.Micro(1)) -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(days=1, seconds=1), offsets.Second(86401)), + (dict(days=-1, seconds=1), offsets.Second(-86399)), + (dict(hours=1, minutes=10), offsets.Minute(70)), + (dict(hours=1, minutes=-10), offsets.Minute(50)), + (dict(weeks=1), offsets.Day(7)), + (dict(hours=1), offsets.Hour(1)), + (dict(hours=1), frequencies.to_offset("60min")), + (dict(microseconds=1), offsets.Micro(1)), + ], +) def test_to_offset_pd_timedelta(kwargs, expected): # see gh-9064 td = Timedelta(**kwargs) @@ -128,19 +155,22 @@ def test_to_offset_pd_timedelta_invalid(): frequencies.to_offset(td) -@pytest.mark.parametrize("shortcut,expected", [ - ("W", offsets.Week(weekday=6)), - ("W-SUN", offsets.Week(weekday=6)), - ("Q", offsets.QuarterEnd(startingMonth=12)), - ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), - ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), - ("SM", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), - ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), - ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), - ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), -]) +@pytest.mark.parametrize( + "shortcut,expected", + [ + ("W", offsets.Week(weekday=6)), + ("W-SUN", offsets.Week(weekday=6)), + ("Q", offsets.QuarterEnd(startingMonth=12)), + ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), + ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), + ("SM", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), + 
("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), + ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), + ], +) def test_anchored_shortcuts(shortcut, expected): result = frequencies.to_offset(shortcut) assert result == expected diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 407e83de5e6e9d..79c28942769f0e 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -6,15 +6,18 @@ import pandas.util.testing as tm from pandas.tseries.holiday import ( - AbstractHolidayCalendar, Holiday, Timestamp, USFederalHolidayCalendar, - USThanksgivingDay, get_calendar) - - -@pytest.mark.parametrize("transform", [ - lambda x: x, - lambda x: x.strftime("%Y-%m-%d"), - lambda x: Timestamp(x) -]) + AbstractHolidayCalendar, + Holiday, + Timestamp, + USFederalHolidayCalendar, + USThanksgivingDay, + get_calendar, +) + + +@pytest.mark.parametrize( + "transform", [lambda x: x, lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) def test_calendar(transform): start_date = datetime(2012, 1, 1) end_date = datetime(2012, 12, 31) @@ -32,7 +35,7 @@ def test_calendar(transform): datetime(2012, 10, 8), datetime(2012, 11, 12), datetime(2012, 11, 22), - datetime(2012, 12, 25) + datetime(2012, 12, 25), ] assert list(holidays.to_pydatetime()) == expected @@ -56,12 +59,15 @@ def __init__(self, name=None, rules=None): def test_calendar_observance_dates(): # see gh-11477 us_fed_cal = get_calendar("USFederalHolidayCalendar") - holidays0 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 3)) # <-- same start and end dates - holidays1 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 6)) # <-- different start and end dates - holidays2 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 3)) # <-- same start and end dates + holidays0 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates + holidays1 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 6) + ) # <-- different start and end dates + holidays2 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates # These should all produce the same result. # diff --git a/pandas/tests/tseries/holiday/test_federal.py b/pandas/tests/tseries/holiday/test_federal.py index 62b5ab2b849ae9..64c60d4e365e6b 100644 --- a/pandas/tests/tseries/holiday/test_federal.py +++ b/pandas/tests/tseries/holiday/test_federal.py @@ -1,7 +1,10 @@ from datetime import datetime from pandas.tseries.holiday import ( - AbstractHolidayCalendar, USMartinLutherKingJr, USMemorialDay) + AbstractHolidayCalendar, + USMartinLutherKingJr, + USMemorialDay, +) def test_no_mlk_before_1986(): @@ -9,28 +12,27 @@ def test_no_mlk_before_1986(): class MLKCalendar(AbstractHolidayCalendar): rules = [USMartinLutherKingJr] - holidays = MLKCalendar().holidays(start="1984", - end="1988").to_pydatetime().tolist() + holidays = MLKCalendar().holidays(start="1984", end="1988").to_pydatetime().tolist() # Testing to make sure holiday is not incorrectly observed before 1986. 
- assert holidays == [datetime(1986, 1, 20, 0, 0), - datetime(1987, 1, 19, 0, 0)] + assert holidays == [datetime(1986, 1, 20, 0, 0), datetime(1987, 1, 19, 0, 0)] def test_memorial_day(): class MemorialDay(AbstractHolidayCalendar): rules = [USMemorialDay] - holidays = MemorialDay().holidays(start="1971", - end="1980").to_pydatetime().tolist() + holidays = MemorialDay().holidays(start="1971", end="1980").to_pydatetime().tolist() # Fixes 5/31 error and checked manually against Wikipedia. - assert holidays == [datetime(1971, 5, 31, 0, 0), - datetime(1972, 5, 29, 0, 0), - datetime(1973, 5, 28, 0, 0), - datetime(1974, 5, 27, 0, 0), - datetime(1975, 5, 26, 0, 0), - datetime(1976, 5, 31, 0, 0), - datetime(1977, 5, 30, 0, 0), - datetime(1978, 5, 29, 0, 0), - datetime(1979, 5, 28, 0, 0)] + assert holidays == [ + datetime(1971, 5, 31, 0, 0), + datetime(1972, 5, 29, 0, 0), + datetime(1973, 5, 28, 0, 0), + datetime(1974, 5, 27, 0, 0), + datetime(1975, 5, 26, 0, 0), + datetime(1976, 5, 31, 0, 0), + datetime(1977, 5, 30, 0, 0), + datetime(1978, 5, 29, 0, 0), + datetime(1979, 5, 28, 0, 0), + ] diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 27bba1cc89deed..06869fcd7a4f87 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -6,10 +6,24 @@ import pandas.util.testing as tm from pandas.tseries.holiday import ( - MO, SA, AbstractHolidayCalendar, DateOffset, EasterMonday, GoodFriday, - Holiday, HolidayCalendarFactory, Timestamp, USColumbusDay, USLaborDay, - USMartinLutherKingJr, USMemorialDay, USPresidentsDay, USThanksgivingDay, - get_calendar, next_monday) + MO, + SA, + AbstractHolidayCalendar, + DateOffset, + EasterMonday, + GoodFriday, + Holiday, + HolidayCalendarFactory, + Timestamp, + USColumbusDay, + USLaborDay, + USMartinLutherKingJr, + USMemorialDay, + USPresidentsDay, + USThanksgivingDay, + get_calendar, + next_monday, +) def _check_holiday_results(holiday, start, end, expected): @@ -30,92 +44,141 @@ def _check_holiday_results(holiday, start, end, expected): assert list(holiday.dates(start, end)) == expected # Verify that timezone info is preserved. 
- assert (list(holiday.dates(utc.localize(Timestamp(start)), - utc.localize(Timestamp(end)))) == - [utc.localize(dt) for dt in expected]) - - -@pytest.mark.parametrize("holiday,start_date,end_date,expected", [ - (USMemorialDay, datetime(2011, 1, 1), datetime(2020, 12, 31), - [datetime(2011, 5, 30), datetime(2012, 5, 28), datetime(2013, 5, 27), - datetime(2014, 5, 26), datetime(2015, 5, 25), datetime(2016, 5, 30), - datetime(2017, 5, 29), datetime(2018, 5, 28), datetime(2019, 5, 27), - datetime(2020, 5, 25)]), - - (Holiday("July 4th Eve", month=7, day=3), "2001-01-01", "2003-03-03", - [Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00")]), - (Holiday("July 4th Eve", month=7, day=3, days_of_week=(0, 1, 2, 3)), - "2001-01-01", "2008-03-03", [ - Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00"), - Timestamp("2003-07-03 00:00:00"), Timestamp("2006-07-03 00:00:00"), - Timestamp("2007-07-03 00:00:00")]), - - (EasterMonday, datetime(2011, 1, 1), datetime(2020, 12, 31), - [Timestamp("2011-04-25 00:00:00"), Timestamp("2012-04-09 00:00:00"), - Timestamp("2013-04-01 00:00:00"), Timestamp("2014-04-21 00:00:00"), - Timestamp("2015-04-06 00:00:00"), Timestamp("2016-03-28 00:00:00"), - Timestamp("2017-04-17 00:00:00"), Timestamp("2018-04-02 00:00:00"), - Timestamp("2019-04-22 00:00:00"), Timestamp("2020-04-13 00:00:00")]), - (GoodFriday, datetime(2011, 1, 1), datetime(2020, 12, 31), - [Timestamp("2011-04-22 00:00:00"), Timestamp("2012-04-06 00:00:00"), - Timestamp("2013-03-29 00:00:00"), Timestamp("2014-04-18 00:00:00"), - Timestamp("2015-04-03 00:00:00"), Timestamp("2016-03-25 00:00:00"), - Timestamp("2017-04-14 00:00:00"), Timestamp("2018-03-30 00:00:00"), - Timestamp("2019-04-19 00:00:00"), Timestamp("2020-04-10 00:00:00")]), - - (USThanksgivingDay, datetime(2011, 1, 1), datetime(2020, 12, 31), - [datetime(2011, 11, 24), datetime(2012, 11, 22), datetime(2013, 11, 28), - datetime(2014, 11, 27), datetime(2015, 11, 26), datetime(2016, 11, 24), - datetime(2017, 11, 23), datetime(2018, 11, 22), datetime(2019, 11, 28), - datetime(2020, 11, 26)]) -]) + assert list( + holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(end))) + ) == [utc.localize(dt) for dt in expected] + + +@pytest.mark.parametrize( + "holiday,start_date,end_date,expected", + [ + ( + USMemorialDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ], + ), + ( + Holiday("July 4th Eve", month=7, day=3), + "2001-01-01", + "2003-03-03", + [Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00")], + ), + ( + Holiday("July 4th Eve", month=7, day=3, days_of_week=(0, 1, 2, 3)), + "2001-01-01", + "2008-03-03", + [ + Timestamp("2001-07-03 00:00:00"), + Timestamp("2002-07-03 00:00:00"), + Timestamp("2003-07-03 00:00:00"), + Timestamp("2006-07-03 00:00:00"), + Timestamp("2007-07-03 00:00:00"), + ], + ), + ( + EasterMonday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-25 00:00:00"), + Timestamp("2012-04-09 00:00:00"), + Timestamp("2013-04-01 00:00:00"), + Timestamp("2014-04-21 00:00:00"), + Timestamp("2015-04-06 00:00:00"), + Timestamp("2016-03-28 00:00:00"), + Timestamp("2017-04-17 00:00:00"), + Timestamp("2018-04-02 00:00:00"), + Timestamp("2019-04-22 00:00:00"), + Timestamp("2020-04-13 00:00:00"), + ], + ), + ( + 
GoodFriday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-22 00:00:00"), + Timestamp("2012-04-06 00:00:00"), + Timestamp("2013-03-29 00:00:00"), + Timestamp("2014-04-18 00:00:00"), + Timestamp("2015-04-03 00:00:00"), + Timestamp("2016-03-25 00:00:00"), + Timestamp("2017-04-14 00:00:00"), + Timestamp("2018-03-30 00:00:00"), + Timestamp("2019-04-19 00:00:00"), + Timestamp("2020-04-10 00:00:00"), + ], + ), + ( + USThanksgivingDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 11, 24), + datetime(2012, 11, 22), + datetime(2013, 11, 28), + datetime(2014, 11, 27), + datetime(2015, 11, 26), + datetime(2016, 11, 24), + datetime(2017, 11, 23), + datetime(2018, 11, 22), + datetime(2019, 11, 28), + datetime(2020, 11, 26), + ], + ), + ], +) def test_holiday_dates(holiday, start_date, end_date, expected): _check_holiday_results(holiday, start_date, end_date, expected) -@pytest.mark.parametrize("holiday,start,expected", [ - (USMemorialDay, datetime(2015, 7, 1), []), - (USMemorialDay, "2015-05-25", "2015-05-25"), - - (USLaborDay, datetime(2015, 7, 1), []), - (USLaborDay, "2015-09-07", "2015-09-07"), - - (USColumbusDay, datetime(2015, 7, 1), []), - (USColumbusDay, "2015-10-12", "2015-10-12"), - - (USThanksgivingDay, datetime(2015, 7, 1), []), - (USThanksgivingDay, "2015-11-26", "2015-11-26"), - - (USMartinLutherKingJr, datetime(2015, 7, 1), []), - (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), - - (USPresidentsDay, datetime(2015, 7, 1), []), - (USPresidentsDay, "2015-02-16", "2015-02-16"), - - (GoodFriday, datetime(2015, 7, 1), []), - (GoodFriday, "2015-04-03", "2015-04-03"), - - (EasterMonday, "2015-04-06", "2015-04-06"), - (EasterMonday, datetime(2015, 7, 1), []), - (EasterMonday, "2015-04-05", []), - - ("New Years Day", "2015-01-01", "2015-01-01"), - ("New Years Day", "2010-12-31", "2010-12-31"), - ("New Years Day", datetime(2015, 7, 1), []), - ("New Years Day", "2011-01-01", []), - - ("July 4th", "2015-07-03", "2015-07-03"), - ("July 4th", datetime(2015, 7, 1), []), - ("July 4th", "2015-07-04", []), - - ("Veterans Day", "2012-11-12", "2012-11-12"), - ("Veterans Day", datetime(2015, 7, 1), []), - ("Veterans Day", "2012-11-11", []), - - ("Christmas", "2011-12-26", "2011-12-26"), - ("Christmas", datetime(2015, 7, 1), []), - ("Christmas", "2011-12-25", []), -]) +@pytest.mark.parametrize( + "holiday,start,expected", + [ + (USMemorialDay, datetime(2015, 7, 1), []), + (USMemorialDay, "2015-05-25", "2015-05-25"), + (USLaborDay, datetime(2015, 7, 1), []), + (USLaborDay, "2015-09-07", "2015-09-07"), + (USColumbusDay, datetime(2015, 7, 1), []), + (USColumbusDay, "2015-10-12", "2015-10-12"), + (USThanksgivingDay, datetime(2015, 7, 1), []), + (USThanksgivingDay, "2015-11-26", "2015-11-26"), + (USMartinLutherKingJr, datetime(2015, 7, 1), []), + (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), + (USPresidentsDay, datetime(2015, 7, 1), []), + (USPresidentsDay, "2015-02-16", "2015-02-16"), + (GoodFriday, datetime(2015, 7, 1), []), + (GoodFriday, "2015-04-03", "2015-04-03"), + (EasterMonday, "2015-04-06", "2015-04-06"), + (EasterMonday, datetime(2015, 7, 1), []), + (EasterMonday, "2015-04-05", []), + ("New Years Day", "2015-01-01", "2015-01-01"), + ("New Years Day", "2010-12-31", "2010-12-31"), + ("New Years Day", datetime(2015, 7, 1), []), + ("New Years Day", "2011-01-01", []), + ("July 4th", "2015-07-03", "2015-07-03"), + ("July 4th", datetime(2015, 7, 1), []), + ("July 4th", "2015-07-04", []), + ("Veterans Day", "2012-11-12", "2012-11-12"), + ("Veterans 
Day", datetime(2015, 7, 1), []), + ("Veterans Day", "2012-11-11", []), + ("Christmas", "2011-12-26", "2011-12-26"), + ("Christmas", datetime(2015, 7, 1), []), + ("Christmas", "2011-12-25", []), + ], +) def test_holidays_within_dates(holiday, start, expected): # see gh-11477 # @@ -132,26 +195,34 @@ def test_holidays_within_dates(holiday, start, expected): _check_holiday_results(holiday, start, start, expected) -@pytest.mark.parametrize("transform", [ - lambda x: x.strftime("%Y-%m-%d"), - lambda x: Timestamp(x) -]) +@pytest.mark.parametrize( + "transform", [lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) def test_argument_types(transform): start_date = datetime(2011, 1, 1) end_date = datetime(2020, 12, 31) holidays = USThanksgivingDay.dates(start_date, end_date) - holidays2 = USThanksgivingDay.dates( - transform(start_date), transform(end_date)) + holidays2 = USThanksgivingDay.dates(transform(start_date), transform(end_date)) tm.assert_index_equal(holidays, holidays2) -@pytest.mark.parametrize("name,kwargs", [ - ("One-Time", dict(year=2012, month=5, day=28)), - ("Range", dict(month=5, day=28, start_date=datetime(2012, 1, 1), - end_date=datetime(2012, 12, 31), - offset=DateOffset(weekday=MO(1)))) -]) +@pytest.mark.parametrize( + "name,kwargs", + [ + ("One-Time", dict(year=2012, month=5, day=28)), + ( + "Range", + dict( + month=5, + day=28, + start_date=datetime(2012, 1, 1), + end_date=datetime(2012, 12, 31), + offset=DateOffset(weekday=MO(1)), + ), + ), + ], +) def test_special_holidays(name, kwargs): base_date = [datetime(2012, 5, 28)] holiday = Holiday(name, **kwargs) @@ -171,12 +242,12 @@ class TestCalendar(AbstractHolidayCalendar): def test_factory(): - class_1 = HolidayCalendarFactory("MemorialDay", - AbstractHolidayCalendar, - USMemorialDay) - class_2 = HolidayCalendarFactory("Thanksgiving", - AbstractHolidayCalendar, - USThanksgivingDay) + class_1 = HolidayCalendarFactory( + "MemorialDay", AbstractHolidayCalendar, USMemorialDay + ) + class_2 = HolidayCalendarFactory( + "Thanksgiving", AbstractHolidayCalendar, USThanksgivingDay + ) class_3 = HolidayCalendarFactory("Combined", class_1, class_2) assert len(class_1.rules) == 1 @@ -188,6 +259,10 @@ def test_both_offset_observance_raises(): # see gh-10217 msg = "Cannot use both offset and observance" with pytest.raises(NotImplementedError, match=msg): - Holiday("Cyber Monday", month=11, day=1, - offset=[DateOffset(weekday=SA(4))], - observance=next_monday) + Holiday( + "Cyber Monday", + month=11, + day=1, + offset=[DateOffset(weekday=SA(4))], + observance=next_monday, + ) diff --git a/pandas/tests/tseries/holiday/test_observance.py b/pandas/tests/tseries/holiday/test_observance.py index 1c22918b2efd89..9ee63d2a365560 100644 --- a/pandas/tests/tseries/holiday/test_observance.py +++ b/pandas/tests/tseries/holiday/test_observance.py @@ -3,9 +3,17 @@ import pytest from pandas.tseries.holiday import ( - after_nearest_workday, before_nearest_workday, nearest_workday, - next_monday, next_monday_or_tuesday, next_workday, previous_friday, - previous_workday, sunday_to_monday, weekend_to_monday) + after_nearest_workday, + before_nearest_workday, + nearest_workday, + next_monday, + next_monday_or_tuesday, + next_workday, + previous_friday, + previous_workday, + sunday_to_monday, + weekend_to_monday, +) _WEDNESDAY = datetime(2014, 4, 9) _THURSDAY = datetime(2014, 4, 10) @@ -21,11 +29,9 @@ def test_next_monday(day): assert next_monday(day) == _MONDAY -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _TUESDAY), - 
(_MONDAY, _TUESDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_MONDAY, _TUESDAY)] +) def test_next_monday_or_tuesday(day, expected): assert next_monday_or_tuesday(day) == expected @@ -39,55 +45,43 @@ def test_sunday_to_monday(): assert sunday_to_monday(_SUNDAY) == _MONDAY -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _FRIDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) def test_nearest_workday(day, expected): assert nearest_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) def test_weekend_to_monday(day, expected): assert weekend_to_monday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _TUESDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _TUESDAY)] +) def test_next_workday(day, expected): assert next_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _FRIDAY), - (_SUNDAY, _FRIDAY), - (_TUESDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) def test_previous_workday(day, expected): assert previous_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _THURSDAY), - (_SUNDAY, _FRIDAY), - (_TUESDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _THURSDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) def test_before_nearest_workday(day, expected): assert before_nearest_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _TUESDAY), - (_FRIDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_FRIDAY, _MONDAY)] +) def test_after_nearest_workday(day, expected): assert after_nearest_workday(day) == expected diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index e6177fa4aaa4bd..079fcc36ff3eed 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -12,13 +12,15 @@ def assert_offset_equal(offset, base, expected): assert actual_swapped == expected assert actual_apply == expected except AssertionError: - raise AssertionError("\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % - (expected, actual, offset, base)) + raise AssertionError( + "\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % (expected, actual, offset, base) + ) def assert_onOffset(offset, date, expected): actual = offset.onOffset(date) - assert actual == expected, ("\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % - (expected, actual, offset, date)) + assert actual == expected, ( + "\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % (expected, actual, offset, date) + ) diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index c192a56b205cac..2f6868f56c073c 100644 --- a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -11,9 +11,13 @@ def offset_types(request): return request.param -@pytest.fixture(params=[getattr(offsets, o) for o in 
offsets.__all__ if - issubclass(getattr(offsets, o), offsets.MonthOffset) - and o != 'MonthOffset']) +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), offsets.MonthOffset) and o != "MonthOffset" + ] +) def month_classes(request): """ Fixture for month based datetime offsets available for a time series. diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index b61f5f3b013e4f..c24d917a5e454f 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -34,49 +34,61 @@ def makeFY5253LastOfMonth(*args, **kwds): def test_get_offset_name(): - assert (makeFY5253LastOfMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4") - assert (makeFY5253NearestEndMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3") + assert ( + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ).freqstr + == "REQ-L-MAR-TUE-4" + ) + assert ( + makeFY5253NearestEndMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=3 + ).freqstr + == "REQ-N-MAR-TUE-3" + ) def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('gibberish') + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('QS-JAN-B') + get_offset("QS-JAN-B") pairs = [ - ("RE-N-DEC-MON", - makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), - ("RE-L-DEC-TUE", - makeFY5253LastOfMonth(weekday=1, startingMonth=12)), - ("REQ-L-MAR-TUE-4", - makeFY5253LastOfMonthQuarter(weekday=1, - startingMonth=3, - qtr_with_extra_week=4)), - ("REQ-L-DEC-MON-3", - makeFY5253LastOfMonthQuarter(weekday=0, - startingMonth=12, - qtr_with_extra_week=3)), - ("REQ-N-DEC-MON-3", - makeFY5253NearestEndMonthQuarter(weekday=0, - startingMonth=12, - qtr_with_extra_week=3))] + ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), + ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), + ( + "REQ-L-MAR-TUE-4", + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ), + ), + ( + "REQ-L-DEC-MON-3", + makeFY5253LastOfMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ( + "REQ-N-DEC-MON-3", + makeFY5253NearestEndMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ] for name, expected in pairs: offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( + name, + expected, + offset, + ) class TestFY5253LastOfMonth(Base): - offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, - weekday=WeekDay.SAT) + offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, weekday=WeekDay.SAT) + offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, weekday=WeekDay.SAT) on_offset_cases = [ # From Wikipedia (see: @@ -95,14 +107,12 @@ class TestFY5253LastOfMonth(Base): (offset_lom_sat_aug, datetime(2017, 8, 26), True), (offset_lom_sat_aug, datetime(2018, 8, 25), True), (offset_lom_sat_aug, datetime(2019, 8, 31), True), - (offset_lom_sat_aug, datetime(2006, 8, 27), False), (offset_lom_sat_aug, datetime(2007, 8, 28), False), (offset_lom_sat_aug, datetime(2008, 8, 31), False), 
(offset_lom_sat_aug, datetime(2009, 8, 30), False), (offset_lom_sat_aug, datetime(2010, 8, 29), False), (offset_lom_sat_aug, datetime(2011, 8, 28), False), - (offset_lom_sat_aug, datetime(2006, 8, 25), False), (offset_lom_sat_aug, datetime(2007, 8, 24), False), (offset_lom_sat_aug, datetime(2008, 8, 29), False), @@ -110,42 +120,48 @@ class TestFY5253LastOfMonth(Base): (offset_lom_sat_aug, datetime(2010, 8, 27), False), (offset_lom_sat_aug, datetime(2011, 8, 26), False), (offset_lom_sat_aug, datetime(2019, 8, 30), False), - # From GMCR (see for example: # http://yahoo.brand.edgar-online.com/Default.aspx? # companyid=3184&formtypeID=7) (offset_lom_sat_sep, datetime(2010, 9, 25), True), (offset_lom_sat_sep, datetime(2011, 9, 24), True), - (offset_lom_sat_sep, datetime(2012, 9, 29), True)] + (offset_lom_sat_sep, datetime(2012, 9, 29), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_apply(self): - offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT) - - date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25), - datetime(2008, 8, 30), datetime(2009, 8, 29), - datetime(2010, 8, 28), datetime(2011, 8, 27), - datetime(2012, 8, 25), datetime(2013, 8, 31), - datetime(2014, 8, 30), datetime(2015, 8, 29), - datetime(2016, 8, 27)] + offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) + offset_lom_aug_sat_1 = makeFY5253LastOfMonth( + n=1, startingMonth=8, weekday=WeekDay.SAT + ) + + date_seq_lom_aug_sat = [ + datetime(2006, 8, 26), + datetime(2007, 8, 25), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 8, 27), + datetime(2012, 8, 25), + datetime(2013, 8, 31), + datetime(2014, 8, 30), + datetime(2015, 8, 29), + datetime(2016, 8, 27), + ] tests = [ (offset_lom_aug_sat, date_seq_lom_aug_sat), (offset_lom_aug_sat_1, date_seq_lom_aug_sat), - (offset_lom_aug_sat, [ - datetime(2006, 8, 25)] + date_seq_lom_aug_sat), - (offset_lom_aug_sat_1, [ - datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), - (makeFY5253LastOfMonth(n=-1, startingMonth=8, - weekday=WeekDay.SAT), - list(reversed(date_seq_lom_aug_sat))), + (offset_lom_aug_sat, [datetime(2006, 8, 25)] + date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, [datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), + ( + makeFY5253LastOfMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_lom_aug_sat)), + ), ] for test in tests: offset, data = test @@ -156,44 +172,36 @@ def test_apply(self): class TestFY5253NearestEndMonth(Base): - def test_get_year_end(self): - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.SAT).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 8, 31)) - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.SUN).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 9, 1)) - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.FRI).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 8, 30)) - - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") - assert (offset_n.get_year_end(datetime(2012, 1, 1)) == - datetime(2013, 1, 1)) - assert (offset_n.get_year_end(datetime(2012, 1, 10)) == - datetime(2013, 1, 1)) - - assert (offset_n.get_year_end(datetime(2013, 1, 1)) == - 
datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 2)) == - datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 3)) == - datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 10)) == - datetime(2013, 12, 31)) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SAT + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 31) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SUN + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 9, 1) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.FRI + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 30) + + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") + assert offset_n.get_year_end(datetime(2012, 1, 1)) == datetime(2013, 1, 1) + assert offset_n.get_year_end(datetime(2012, 1, 10)) == datetime(2013, 1, 1) + + assert offset_n.get_year_end(datetime(2013, 1, 1)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 2)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 3)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 10)) == datetime(2013, 12, 31) JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest") - assert (JNJ.get_year_end(datetime(2006, 1, 1)) == - datetime(2006, 12, 31)) + assert JNJ.get_year_end(datetime(2006, 1, 1)) == datetime(2006, 12, 31) - offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8, - weekday=WeekDay.THU) - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") + offset_lom_aug_sat = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.SAT + ) + offset_lom_aug_thu = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.THU + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") on_offset_cases = [ # From Wikipedia (see: @@ -219,19 +227,16 @@ def test_get_year_end(self): (offset_lom_aug_sat, datetime(2009, 8, 29), True), (offset_lom_aug_sat, datetime(2010, 8, 28), True), (offset_lom_aug_sat, datetime(2011, 9, 3), True), - (offset_lom_aug_sat, datetime(2016, 9, 3), True), (offset_lom_aug_sat, datetime(2017, 9, 2), True), (offset_lom_aug_sat, datetime(2018, 9, 1), True), (offset_lom_aug_sat, datetime(2019, 8, 31), True), - (offset_lom_aug_sat, datetime(2006, 8, 27), False), (offset_lom_aug_sat, datetime(2007, 8, 28), False), (offset_lom_aug_sat, datetime(2008, 8, 31), False), (offset_lom_aug_sat, datetime(2009, 8, 30), False), (offset_lom_aug_sat, datetime(2010, 8, 29), False), (offset_lom_aug_sat, datetime(2011, 8, 28), False), - (offset_lom_aug_sat, datetime(2006, 8, 25), False), (offset_lom_aug_sat, datetime(2007, 8, 24), False), (offset_lom_aug_sat, datetime(2008, 8, 29), False), @@ -239,62 +244,82 @@ def test_get_year_end(self): (offset_lom_aug_sat, datetime(2010, 8, 27), False), (offset_lom_aug_sat, datetime(2011, 8, 26), False), (offset_lom_aug_sat, datetime(2019, 8, 30), False), - # From Micron, see: # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 (offset_lom_aug_thu, datetime(2012, 8, 30), True), (offset_lom_aug_thu, datetime(2011, 9, 1), True), - (offset_n, datetime(2012, 12, 31), False), (offset_n, datetime(2013, 1, 1), True), - (offset_n, datetime(2013, 1, 2), False)] + (offset_n, datetime(2013, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + 
@pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_apply(self): - date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1), - datetime(2008, 8, 30), datetime(2009, 8, 29), - datetime(2010, 8, 28), datetime(2011, 9, 3)] + date_seq_nem_8_sat = [ + datetime(2006, 9, 2), + datetime(2007, 9, 1), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 9, 3), + ] - JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1), - datetime(2006, 12, 31), datetime(2007, 12, 30), - datetime(2008, 12, 28), datetime(2010, 1, 3), - datetime(2011, 1, 2), datetime(2012, 1, 1), - datetime(2012, 12, 30)] + JNJ = [ + datetime(2005, 1, 2), + datetime(2006, 1, 1), + datetime(2006, 12, 31), + datetime(2007, 12, 30), + datetime(2008, 12, 28), + datetime(2010, 1, 3), + datetime(2011, 1, 2), + datetime(2012, 1, 1), + datetime(2012, 12, 30), + ] - DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, - variation="nearest") + DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, variation="nearest") tests = [ - (makeFY5253NearestEndMonth(startingMonth=8, - weekday=WeekDay.SAT), - date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT), - date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), - [datetime(2006, 9, 1)] + date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT), - [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]), - (makeFY5253NearestEndMonth(n=-1, startingMonth=8, - weekday=WeekDay.SAT), - list(reversed(date_seq_nem_8_sat))), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), JNJ), - (makeFY5253NearestEndMonth(n=-1, startingMonth=12, - weekday=WeekDay.SUN), - list(reversed(JNJ))), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), - [datetime(2005, 1, 2), datetime(2006, 1, 1)]), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), - [datetime(2006, 1, 2), datetime(2006, 12, 31)]), - (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]) + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 1)] + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:], + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_nem_8_sat)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + JNJ, + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=12, weekday=WeekDay.SUN), + list(reversed(JNJ)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2005, 1, 2), datetime(2006, 1, 1)], + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2006, 1, 2), datetime(2006, 12, 31)], + ), + (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]), ] for test in tests: offset, data = test @@ -305,70 +330,82 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter(Base): - def test_isAnchored(self): assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4).isAnchored() + 
startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).isAnchored() assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, - qtr_with_extra_week=4).isAnchored() + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).isAnchored() assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4).isAnchored() + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).isAnchored() def test_equality(self): - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4)) - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4)) - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( - startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) == makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) def test_offset(self): - offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - - offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - - GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26), - datetime(2010, 9, 25), datetime(2010, 12, 25), - datetime(2011, 3, 26), datetime(2011, 6, 25), - datetime(2011, 9, 24), datetime(2011, 12, 24), - datetime(2012, 3, 24), datetime(2012, 6, 23), - datetime(2012, 9, 29), datetime(2012, 12, 29), - datetime(2013, 3, 30), datetime(2013, 6, 29)] + offset = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset2 = makeFY5253LastOfMonthQuarter( + 2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset4 = makeFY5253LastOfMonthQuarter( + 4, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + offset_neg1 = makeFY5253LastOfMonthQuarter( + -1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset_neg2 = makeFY5253LastOfMonthQuarter( + -2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + GMCR = [ + datetime(2010, 3, 27), + datetime(2010, 6, 26), + datetime(2010, 9, 25), + datetime(2010, 12, 25), + datetime(2011, 3, 26), + datetime(2011, 6, 25), + datetime(2011, 9, 24), + datetime(2011, 12, 24), + datetime(2012, 3, 24), + datetime(2012, 6, 23), + datetime(2012, 9, 29), + datetime(2012, 12, 29), + datetime(2013, 3, 30), + 
datetime(2013, 6, 29), + ] assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1]) - assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1), - expected=GMCR[0]) + assert_offset_equal( + offset, base=GMCR[0] + relativedelta(days=-1), expected=GMCR[0] + ) assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2]) assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2]) assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4]) assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2]) - assert_offset_equal(offset_neg1, - base=GMCR[-1] + relativedelta(days=+1), - expected=GMCR[-1]) + assert_offset_equal( + offset_neg1, base=GMCR[-1] + relativedelta(days=+1), expected=GMCR[-1] + ) assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3]) date = GMCR[0] + relativedelta(days=-1) @@ -381,12 +418,12 @@ def test_offset(self): assert_offset_equal(offset_neg1, date, expected) date = date + offset_neg1 - lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) + lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) on_offset_cases = [ # From Wikipedia @@ -397,14 +434,12 @@ def test_offset(self): (lomq_aug_sat_4, datetime(2010, 8, 28), True), (lomq_aug_sat_4, datetime(2011, 8, 27), True), (lomq_aug_sat_4, datetime(2019, 8, 31), True), - (lomq_aug_sat_4, datetime(2006, 8, 27), False), (lomq_aug_sat_4, datetime(2007, 8, 28), False), (lomq_aug_sat_4, datetime(2008, 8, 31), False), (lomq_aug_sat_4, datetime(2009, 8, 30), False), (lomq_aug_sat_4, datetime(2010, 8, 29), False), (lomq_aug_sat_4, datetime(2011, 8, 28), False), - (lomq_aug_sat_4, datetime(2006, 8, 25), False), (lomq_aug_sat_4, datetime(2007, 8, 24), False), (lomq_aug_sat_4, datetime(2008, 8, 29), False), @@ -412,44 +447,51 @@ def test_offset(self): (lomq_aug_sat_4, datetime(2010, 8, 27), False), (lomq_aug_sat_4, datetime(2011, 8, 26), False), (lomq_aug_sat_4, datetime(2019, 8, 30), False), - # From GMCR (lomq_sep_sat_4, datetime(2010, 9, 25), True), (lomq_sep_sat_4, datetime(2011, 9, 24), True), (lomq_sep_sat_4, datetime(2012, 9, 29), True), - (lomq_sep_sat_4, datetime(2013, 6, 29), True), (lomq_sep_sat_4, datetime(2012, 6, 23), True), (lomq_sep_sat_4, datetime(2012, 6, 30), False), - (lomq_sep_sat_4, datetime(2013, 3, 30), True), (lomq_sep_sat_4, datetime(2012, 3, 24), True), - (lomq_sep_sat_4, datetime(2012, 12, 29), True), (lomq_sep_sat_4, datetime(2011, 12, 24), True), - # INTC (extra week in Q1) # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844 - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2011, 4, 2), True), - + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 4, 2), + True, + ), # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7 - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2012, 12, 29), True), - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2011, 12, 31), True), - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - 
qtr_with_extra_week=1), - datetime(2010, 12, 25), True)] - - @pytest.mark.parametrize('case', on_offset_cases) + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2012, 12, 29), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 12, 31), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2010, 12, 25), + True, + ), + ] + + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -457,47 +499,46 @@ def test_onOffset(self, case): def test_year_has_extra_week(self): # End of long Q1 assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2011, 4, 2)) # Start of long Q1 assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 26)) # End of year before year with long Q1 assert not makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 25)) - for year in [x - for x in range(1994, 2011 + 1) - if x not in [2011, 2005, 2000, 1994]]: + for year in [ + x for x in range(1994, 2011 + 1) if x not in [2011, 2005, 2000, 1994] + ]: assert not makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week( - datetime(year, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(year, 4, 2)) # Other long years assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2005, 4, 2)) assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2000, 4, 2)) assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(1994, 4, 2)) def test_get_weeks(self): - sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1) - sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) + sat_dec_1 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ) + sat_dec_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13] assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14] @@ -507,13 +548,12 @@ def 
test_get_weeks(self): class TestFY5253NearestEndMonthQuarter(Base): offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter( - 1, startingMonth=8, weekday=WeekDay.SAT, - qtr_with_extra_week=4) + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter( - 1, startingMonth=8, weekday=WeekDay.THU, - qtr_with_extra_week=4) - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") on_offset_cases = [ # From Wikipedia @@ -523,19 +563,16 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_sat_aug_4, datetime(2009, 8, 29), True), (offset_nem_sat_aug_4, datetime(2010, 8, 28), True), (offset_nem_sat_aug_4, datetime(2011, 9, 3), True), - (offset_nem_sat_aug_4, datetime(2016, 9, 3), True), (offset_nem_sat_aug_4, datetime(2017, 9, 2), True), (offset_nem_sat_aug_4, datetime(2018, 9, 1), True), (offset_nem_sat_aug_4, datetime(2019, 8, 31), True), - (offset_nem_sat_aug_4, datetime(2006, 8, 27), False), (offset_nem_sat_aug_4, datetime(2007, 8, 28), False), (offset_nem_sat_aug_4, datetime(2008, 8, 31), False), (offset_nem_sat_aug_4, datetime(2009, 8, 30), False), (offset_nem_sat_aug_4, datetime(2010, 8, 29), False), (offset_nem_sat_aug_4, datetime(2011, 8, 28), False), - (offset_nem_sat_aug_4, datetime(2006, 8, 25), False), (offset_nem_sat_aug_4, datetime(2007, 8, 24), False), (offset_nem_sat_aug_4, datetime(2008, 8, 29), False), @@ -543,12 +580,10 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_sat_aug_4, datetime(2010, 8, 27), False), (offset_nem_sat_aug_4, datetime(2011, 8, 26), False), (offset_nem_sat_aug_4, datetime(2019, 8, 30), False), - # From Micron, see: # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 (offset_nem_thu_aug_4, datetime(2012, 8, 30), True), (offset_nem_thu_aug_4, datetime(2011, 9, 1), True), - # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13 (offset_nem_thu_aug_4, datetime(2013, 5, 30), True), (offset_nem_thu_aug_4, datetime(2013, 2, 28), True), @@ -556,70 +591,68 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_thu_aug_4, datetime(2012, 5, 31), True), (offset_nem_thu_aug_4, datetime(2007, 3, 1), True), (offset_nem_thu_aug_4, datetime(1994, 3, 3), True), - (offset_n, datetime(2012, 12, 31), False), (offset_n, datetime(2013, 1, 1), True), - (offset_n, datetime(2013, 1, 2), False)] + (offset_n, datetime(2013, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_offset(self): - offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8, - weekday=WeekDay.THU, - qtr_with_extra_week=4) - - MU = [datetime(2012, 5, 31), - datetime(2012, 8, 30), datetime(2012, 11, 29), - datetime(2013, 2, 28), datetime(2013, 5, 30)] + offset = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + + MU = [ + datetime(2012, 5, 31), + datetime(2012, 8, 30), + datetime(2012, 11, 29), + datetime(2013, 2, 28), + datetime(2013, 5, 30), + ] date = MU[0] + relativedelta(days=-1) for expected in MU: assert_offset_equal(offset, date, expected) date = date + offset - assert_offset_equal(offset, - datetime(2012, 5, 31), - datetime(2012, 8, 30)) - assert_offset_equal(offset, - datetime(2012, 5, 30), - 
datetime(2012, 5, 31)) + assert_offset_equal(offset, datetime(2012, 5, 31), datetime(2012, 8, 30)) + assert_offset_equal(offset, datetime(2012, 5, 30), datetime(2012, 5, 31)) - offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last", - qtr_with_extra_week=4) + offset2 = FY5253Quarter( + weekday=5, startingMonth=12, variation="last", qtr_with_extra_week=4 + ) - assert_offset_equal(offset2, - datetime(2013, 1, 15), - datetime(2013, 3, 30)) + assert_offset_equal(offset2, datetime(2013, 1, 15), datetime(2013, 3, 30)) def test_bunched_yearends(): # GH#14774 cases with two fiscal year-ends in the same calendar-year - fy = FY5253(n=1, weekday=5, startingMonth=12, variation='nearest') - dt = Timestamp('2004-01-01') - assert fy.rollback(dt) == Timestamp('2002-12-28') - assert (-fy).apply(dt) == Timestamp('2002-12-28') - assert dt - fy == Timestamp('2002-12-28') + fy = FY5253(n=1, weekday=5, startingMonth=12, variation="nearest") + dt = Timestamp("2004-01-01") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") - assert fy.rollforward(dt) == Timestamp('2004-01-03') - assert fy.apply(dt) == Timestamp('2004-01-03') - assert fy + dt == Timestamp('2004-01-03') - assert dt + fy == Timestamp('2004-01-03') + assert fy.rollforward(dt) == Timestamp("2004-01-03") + assert fy.apply(dt) == Timestamp("2004-01-03") + assert fy + dt == Timestamp("2004-01-03") + assert dt + fy == Timestamp("2004-01-03") # Same thing, but starting from a Timestamp in the previous year. - dt = Timestamp('2003-12-31') - assert fy.rollback(dt) == Timestamp('2002-12-28') - assert (-fy).apply(dt) == Timestamp('2002-12-28') - assert dt - fy == Timestamp('2002-12-28') + dt = Timestamp("2003-12-31") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") def test_fy5253_last_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0) - ts = Timestamp('1984-05-28 06:29:43.955911354+0200', - tz='Europe/San_Marino') + ts = Timestamp("1984-05-28 06:29:43.955911354+0200", tz="Europe/San_Marino") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -628,7 +661,7 @@ def test_fy5253_last_onoffset(): def test_fy5253_nearest_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2) - ts = Timestamp('2032-07-28 00:12:59.035729419+0000', tz='Africa/Dakar') + ts = Timestamp("2032-07-28 00:12:59.035729419+0000", tz="Africa/Dakar") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -636,10 +669,10 @@ def test_fy5253_nearest_onoffset(): def test_fy5253qtr_onoffset_nearest(): # GH#19036 - ts = Timestamp('1985-09-02 23:57:46.232550356-0300', - tz='Atlantic/Bermuda') - offset = FY5253Quarter(n=3, qtr_with_extra_week=1, startingMonth=2, - variation="nearest", weekday=0) + ts = Timestamp("1985-09-02 23:57:46.232550356-0300", tz="Atlantic/Bermuda") + offset = FY5253Quarter( + n=3, qtr_with_extra_week=1, startingMonth=2, variation="nearest", weekday=0 + ) fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -647,10 +680,10 @@ def test_fy5253qtr_onoffset_nearest(): def test_fy5253qtr_onoffset_last(): # GH#19036 - offset = FY5253Quarter(n=-2, qtr_with_extra_week=1, - startingMonth=7, 
variation="last", weekday=2) - ts = Timestamp('2011-01-26 19:03:40.331096129+0200', - tz='Africa/Windhoek') + offset = FY5253Quarter( + n=-2, qtr_with_extra_week=1, startingMonth=7, variation="last", weekday=2 + ) + ts = Timestamp("2011-01-26 19:03:40.331096129+0200", tz="Africa/Windhoek") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index c0021b1eade782..822e97b21f0da6 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -4,9 +4,17 @@ import pytest from pandas._libs.tslibs import ( - NaT, OutOfBoundsDatetime, Timestamp, conversion, timezones) + NaT, + OutOfBoundsDatetime, + Timestamp, + conversion, + timezones, +) from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, get_freq_code, get_freq_str) + INVALID_FREQ_ERR_MSG, + get_freq_code, + get_freq_str, +) import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import ApplyTypeError import pandas.compat as compat @@ -21,11 +29,37 @@ from pandas.tseries.holiday import USFederalHolidayCalendar import pandas.tseries.offsets as offsets from pandas.tseries.offsets import ( - FY5253, BDay, BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, - BusinessHour, BYearBegin, BYearEnd, CBMonthBegin, CBMonthEnd, CDay, - CustomBusinessHour, DateOffset, Day, Easter, FY5253Quarter, - LastWeekOfMonth, MonthBegin, MonthEnd, Nano, QuarterBegin, QuarterEnd, - SemiMonthBegin, SemiMonthEnd, Tick, Week, WeekOfMonth, YearBegin, YearEnd) + FY5253, + BDay, + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BusinessHour, + BYearBegin, + BYearEnd, + CBMonthBegin, + CBMonthEnd, + CDay, + CustomBusinessHour, + DateOffset, + Day, + Easter, + FY5253Quarter, + LastWeekOfMonth, + MonthBegin, + MonthEnd, + Nano, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Tick, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) from .common import assert_offset_equal, assert_onOffset @@ -61,18 +95,34 @@ class Base: _offset = None d = Timestamp(datetime(2008, 1, 2)) - timezones = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific'] + timezones = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Tokyo", + "dateutil/US/Pacific", + ] def _get_offset(self, klass, value=1, normalize=False): # create instance from offset class if klass is FY5253: - klass = klass(n=value, startingMonth=1, weekday=1, - variation='last', normalize=normalize) + klass = klass( + n=value, + startingMonth=1, + weekday=1, + variation="last", + normalize=normalize, + ) elif klass is FY5253Quarter: - klass = klass(n=value, startingMonth=1, weekday=1, - qtr_with_extra_week=1, variation='last', - normalize=normalize) + klass = klass( + n=value, + startingMonth=1, + weekday=1, + qtr_with_extra_week=1, + variation="last", + normalize=normalize, + ) elif klass is LastWeekOfMonth: klass = klass(n=value, weekday=5, normalize=normalize) elif klass is WeekOfMonth: @@ -103,12 +153,12 @@ def test_apply_out_of_range(self, tz_naive_fixture): else: offset = self._get_offset(self._offset, value=10000) - result = Timestamp('20080101') + offset + result = Timestamp("20080101") + offset assert isinstance(result, datetime) assert result.tzinfo is None # Check tz is preserved - t = Timestamp('20080101', tz=tz) + t = Timestamp("20080101", tz=tz) result = t + offset assert isinstance(result, datetime) assert t.tzinfo == 
result.tzinfo @@ -189,41 +239,42 @@ class TestCommon(Base): # exected value created by Base._get_offset # are applied to 2011/01/01 09:00 (Saturday) # used for .apply and .rollforward - expecteds = {'Day': Timestamp('2011-01-02 09:00:00'), - 'DateOffset': Timestamp('2011-01-02 09:00:00'), - 'BusinessDay': Timestamp('2011-01-03 09:00:00'), - 'CustomBusinessDay': Timestamp('2011-01-03 09:00:00'), - 'CustomBusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), - 'CustomBusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), - 'MonthBegin': Timestamp('2011-02-01 09:00:00'), - 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), - 'MonthEnd': Timestamp('2011-01-31 09:00:00'), - 'SemiMonthEnd': Timestamp('2011-01-15 09:00:00'), - 'SemiMonthBegin': Timestamp('2011-01-15 09:00:00'), - 'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), - 'YearBegin': Timestamp('2012-01-01 09:00:00'), - 'BYearBegin': Timestamp('2011-01-03 09:00:00'), - 'YearEnd': Timestamp('2011-12-31 09:00:00'), - 'BYearEnd': Timestamp('2011-12-30 09:00:00'), - 'QuarterBegin': Timestamp('2011-03-01 09:00:00'), - 'BQuarterBegin': Timestamp('2011-03-01 09:00:00'), - 'QuarterEnd': Timestamp('2011-03-31 09:00:00'), - 'BQuarterEnd': Timestamp('2011-03-31 09:00:00'), - 'BusinessHour': Timestamp('2011-01-03 10:00:00'), - 'CustomBusinessHour': Timestamp('2011-01-03 10:00:00'), - 'WeekOfMonth': Timestamp('2011-01-08 09:00:00'), - 'LastWeekOfMonth': Timestamp('2011-01-29 09:00:00'), - 'FY5253Quarter': Timestamp('2011-01-25 09:00:00'), - 'FY5253': Timestamp('2011-01-25 09:00:00'), - 'Week': Timestamp('2011-01-08 09:00:00'), - 'Easter': Timestamp('2011-04-24 09:00:00'), - 'Hour': Timestamp('2011-01-01 10:00:00'), - 'Minute': Timestamp('2011-01-01 09:01:00'), - 'Second': Timestamp('2011-01-01 09:00:01'), - 'Milli': Timestamp('2011-01-01 09:00:00.001000'), - 'Micro': Timestamp('2011-01-01 09:00:00.000001'), - 'Nano': Timestamp(np_datetime64_compat( - '2011-01-01T09:00:00.000000001Z'))} + expecteds = { + "Day": Timestamp("2011-01-02 09:00:00"), + "DateOffset": Timestamp("2011-01-02 09:00:00"), + "BusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthBegin": Timestamp("2011-02-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthEnd": Timestamp("2011-01-31 09:00:00"), + "SemiMonthEnd": Timestamp("2011-01-15 09:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 09:00:00"), + "BusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "YearBegin": Timestamp("2012-01-01 09:00:00"), + "BYearBegin": Timestamp("2011-01-03 09:00:00"), + "YearEnd": Timestamp("2011-12-31 09:00:00"), + "BYearEnd": Timestamp("2011-12-30 09:00:00"), + "QuarterBegin": Timestamp("2011-03-01 09:00:00"), + "BQuarterBegin": Timestamp("2011-03-01 09:00:00"), + "QuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BQuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BusinessHour": Timestamp("2011-01-03 10:00:00"), + "CustomBusinessHour": Timestamp("2011-01-03 10:00:00"), + "WeekOfMonth": Timestamp("2011-01-08 09:00:00"), + "LastWeekOfMonth": Timestamp("2011-01-29 09:00:00"), + "FY5253Quarter": Timestamp("2011-01-25 09:00:00"), + "FY5253": Timestamp("2011-01-25 09:00:00"), + "Week": Timestamp("2011-01-08 09:00:00"), + "Easter": Timestamp("2011-04-24 09:00:00"), + "Hour": Timestamp("2011-01-01 10:00:00"), + "Minute": Timestamp("2011-01-01 09:01:00"), + "Second": 
Timestamp("2011-01-01 09:00:01"), + "Milli": Timestamp("2011-01-01 09:00:00.001000"), + "Micro": Timestamp("2011-01-01 09:00:00.000001"), + "Nano": Timestamp(np_datetime64_compat("2011-01-01T09:00:00.000000001Z")), + } def test_immutable(self, offset_types): # GH#21341 check that __setattr__ raises @@ -237,7 +288,7 @@ def test_return_type(self, offset_types): offset = self._get_offset(offset_types) # make sure that we are returning a Timestamp - result = Timestamp('20080101') + offset + result = Timestamp("20080101") + offset assert isinstance(result, Timestamp) # make sure that we are returning NaT @@ -262,7 +313,7 @@ def test_offset_timedelta64_arg(self, offset_types): # object off = self._get_offset(offset_types) - td64 = np.timedelta64(4567, 's') + td64 = np.timedelta64(4567, "s") with pytest.raises(TypeError, match="argument must be an integer"): type(off)(n=td64, **off.kwds) @@ -281,14 +332,11 @@ def test_offset_freqstr(self, offset_types): offset = self._get_offset(offset_types) freqstr = offset.freqstr - if freqstr not in ('', - "", - 'LWOM-SAT', ): + if freqstr not in ("", "", "LWOM-SAT"): code = get_offset(freqstr) assert offset.rule_code == code - def _check_offsetfunc_works(self, offset, funcname, dt, expected, - normalize=False): + def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): if normalize and issubclass(offset, Tick): # normalize=True disallowed for Tick subclasses GH#21427 @@ -309,14 +357,15 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, exp_warning = None ts = Timestamp(dt) + Nano(5) - if (offset_s.__class__.__name__ == 'DateOffset' and - (funcname == 'apply' or normalize) and - ts.nanosecond > 0): + if ( + offset_s.__class__.__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): exp_warning = UserWarning # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning, - check_stacklevel=False): + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -345,14 +394,15 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, exp_warning = None ts = Timestamp(dt, tz=tz) + Nano(5) - if (offset_s.__class__.__name__ == 'DateOffset' and - (funcname == 'apply' or normalize) and - ts.nanosecond > 0): + if ( + offset_s.__class__.__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): exp_warning = UserWarning # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning, - check_stacklevel=False): + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -362,119 +412,146 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, def test_apply(self, offset_types): sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = self.expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'apply', dt, expected) + self._check_offsetfunc_works(offset_types, "apply", dt, expected) expected = Timestamp(expected.date()) - self._check_offsetfunc_works(offset_types, 'apply', dt, expected, - normalize=True) + self._check_offsetfunc_works( + offset_types, "apply", dt, expected, normalize=True + ) def test_rollforward(self, offset_types): expecteds = self.expecteds.copy() # result will not be changed 
if the target is on the offset - no_changes = ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', - 'Week', 'Hour', 'Minute', 'Second', 'Milli', 'Micro', - 'Nano', 'DateOffset'] + no_changes = [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", + ] for n in no_changes: - expecteds[n] = Timestamp('2011/01/01 09:00') + expecteds[n] = Timestamp("2011/01/01 09:00") - expecteds['BusinessHour'] = Timestamp('2011-01-03 09:00:00') - expecteds['CustomBusinessHour'] = Timestamp('2011-01-03 09:00:00') + expecteds["BusinessHour"] = Timestamp("2011-01-03 09:00:00") + expecteds["CustomBusinessHour"] = Timestamp("2011-01-03 09:00:00") # but be changed when normalize=True norm_expected = expecteds.copy() for k in norm_expected: norm_expected[k] = Timestamp(norm_expected[k].date()) - normalized = {'Day': Timestamp('2011-01-02 00:00:00'), - 'DateOffset': Timestamp('2011-01-02 00:00:00'), - 'MonthBegin': Timestamp('2011-02-01 00:00:00'), - 'SemiMonthBegin': Timestamp('2011-01-15 00:00:00'), - 'YearBegin': Timestamp('2012-01-01 00:00:00'), - 'Week': Timestamp('2011-01-08 00:00:00'), - 'Hour': Timestamp('2011-01-01 00:00:00'), - 'Minute': Timestamp('2011-01-01 00:00:00'), - 'Second': Timestamp('2011-01-01 00:00:00'), - 'Milli': Timestamp('2011-01-01 00:00:00'), - 'Micro': Timestamp('2011-01-01 00:00:00')} + normalized = { + "Day": Timestamp("2011-01-02 00:00:00"), + "DateOffset": Timestamp("2011-01-02 00:00:00"), + "MonthBegin": Timestamp("2011-02-01 00:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 00:00:00"), + "YearBegin": Timestamp("2012-01-01 00:00:00"), + "Week": Timestamp("2011-01-08 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 00:00:00"), + } norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollforward', dt, - expected) + self._check_offsetfunc_works(offset_types, "rollforward", dt, expected) expected = norm_expected[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollforward', dt, - expected, normalize=True) + self._check_offsetfunc_works( + offset_types, "rollforward", dt, expected, normalize=True + ) def test_rollback(self, offset_types): - expecteds = {'BusinessDay': Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessDay': Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessMonthEnd': - Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessMonthBegin': - Timestamp('2010-12-01 09:00:00'), - 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), - 'MonthEnd': Timestamp('2010-12-31 09:00:00'), - 'SemiMonthEnd': Timestamp('2010-12-31 09:00:00'), - 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), - 'BYearBegin': Timestamp('2010-01-01 09:00:00'), - 'YearEnd': Timestamp('2010-12-31 09:00:00'), - 'BYearEnd': Timestamp('2010-12-31 09:00:00'), - 'QuarterBegin': Timestamp('2010-12-01 09:00:00'), - 'BQuarterBegin': Timestamp('2010-12-01 09:00:00'), - 'QuarterEnd': Timestamp('2010-12-31 09:00:00'), - 'BQuarterEnd': Timestamp('2010-12-31 09:00:00'), - 'BusinessHour': Timestamp('2010-12-31 17:00:00'), - 'CustomBusinessHour': Timestamp('2010-12-31 17:00:00'), - 'WeekOfMonth': 
Timestamp('2010-12-11 09:00:00'), - 'LastWeekOfMonth': Timestamp('2010-12-25 09:00:00'), - 'FY5253Quarter': Timestamp('2010-10-26 09:00:00'), - 'FY5253': Timestamp('2010-01-26 09:00:00'), - 'Easter': Timestamp('2010-04-04 09:00:00')} + expecteds = { + "BusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "MonthEnd": Timestamp("2010-12-31 09:00:00"), + "SemiMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BYearBegin": Timestamp("2010-01-01 09:00:00"), + "YearEnd": Timestamp("2010-12-31 09:00:00"), + "BYearEnd": Timestamp("2010-12-31 09:00:00"), + "QuarterBegin": Timestamp("2010-12-01 09:00:00"), + "BQuarterBegin": Timestamp("2010-12-01 09:00:00"), + "QuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BQuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessHour": Timestamp("2010-12-31 17:00:00"), + "CustomBusinessHour": Timestamp("2010-12-31 17:00:00"), + "WeekOfMonth": Timestamp("2010-12-11 09:00:00"), + "LastWeekOfMonth": Timestamp("2010-12-25 09:00:00"), + "FY5253Quarter": Timestamp("2010-10-26 09:00:00"), + "FY5253": Timestamp("2010-01-26 09:00:00"), + "Easter": Timestamp("2010-04-04 09:00:00"), + } # result will not be changed if the target is on the offset - for n in ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', 'Week', - 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset']: - expecteds[n] = Timestamp('2011/01/01 09:00') + for n in [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", + ]: + expecteds[n] = Timestamp("2011/01/01 09:00") # but be changed when normalize=True norm_expected = expecteds.copy() for k in norm_expected: norm_expected[k] = Timestamp(norm_expected[k].date()) - normalized = {'Day': Timestamp('2010-12-31 00:00:00'), - 'DateOffset': Timestamp('2010-12-31 00:00:00'), - 'MonthBegin': Timestamp('2010-12-01 00:00:00'), - 'SemiMonthBegin': Timestamp('2010-12-15 00:00:00'), - 'YearBegin': Timestamp('2010-01-01 00:00:00'), - 'Week': Timestamp('2010-12-25 00:00:00'), - 'Hour': Timestamp('2011-01-01 00:00:00'), - 'Minute': Timestamp('2011-01-01 00:00:00'), - 'Second': Timestamp('2011-01-01 00:00:00'), - 'Milli': Timestamp('2011-01-01 00:00:00'), - 'Micro': Timestamp('2011-01-01 00:00:00')} + normalized = { + "Day": Timestamp("2010-12-31 00:00:00"), + "DateOffset": Timestamp("2010-12-31 00:00:00"), + "MonthBegin": Timestamp("2010-12-01 00:00:00"), + "SemiMonthBegin": Timestamp("2010-12-15 00:00:00"), + "YearBegin": Timestamp("2010-01-01 00:00:00"), + "Week": Timestamp("2010-12-25 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 00:00:00"), + } norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollback', dt, - expected) + self._check_offsetfunc_works(offset_types, "rollback", dt, expected) expected = norm_expected[offset_types.__name__] - 
self._check_offsetfunc_works(offset_types, 'rollback', dt, - expected, normalize=True) + self._check_offsetfunc_works( + offset_types, "rollback", dt, expected, normalize=True + ) def test_onOffset(self, offset_types): dt = self.expecteds[offset_types.__name__] @@ -531,14 +608,15 @@ def test_add(self, offset_types, tz_naive_fixture): assert result == expected_localize def test_pickle_v0_15_2(self, datapath): - offsets = {'DateOffset': DateOffset(years=1), - 'MonthBegin': MonthBegin(1), - 'Day': Day(1), - 'YearBegin': YearBegin(1), - 'Week': Week(1)} - - pickle_path = datapath('tseries', 'offsets', 'data', - 'dateoffset_0_15_2.pickle') + offsets = { + "DateOffset": DateOffset(years=1), + "MonthBegin": MonthBegin(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "Week": Week(1), + } + + pickle_path = datapath("tseries", "offsets", "data", "dateoffset_0_15_2.pickle") # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -546,7 +624,6 @@ def test_pickle_v0_15_2(self, datapath): class TestDateOffset(Base): - def setup_method(self, method): self.d = Timestamp(datetime(2008, 1, 2)) _offset_map.clear() @@ -563,19 +640,19 @@ def test_mul(self): def test_constructor(self): - assert ((self.d + DateOffset(months=2)) == datetime(2008, 3, 2)) - assert ((self.d - DateOffset(months=2)) == datetime(2007, 11, 2)) + assert (self.d + DateOffset(months=2)) == datetime(2008, 3, 2) + assert (self.d - DateOffset(months=2)) == datetime(2007, 11, 2) - assert ((self.d + DateOffset(2)) == datetime(2008, 1, 4)) + assert (self.d + DateOffset(2)) == datetime(2008, 1, 4) assert not DateOffset(2).isAnchored() assert DateOffset(1).isAnchored() d = datetime(2008, 1, 31) - assert ((d + DateOffset(months=1)) == datetime(2008, 2, 29)) + assert (d + DateOffset(months=1)) == datetime(2008, 2, 29) def test_copy(self): - assert (DateOffset(months=2).copy() == DateOffset(months=2)) + assert DateOffset(months=2).copy() == DateOffset(months=2) def test_eq(self): offset1 = DateOffset(days=1) @@ -601,13 +678,13 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '' - assert repr(self.offset2) == '<2 * BusinessDays>' + assert repr(self.offset) == "" + assert repr(self.offset2) == "<2 * BusinessDays>" if compat.PY37: - expected = '' + expected = "" else: - expected = '' + expected = "" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -631,15 +708,13 @@ def testRollback1(self): assert BDay(10).rollback(self.d) == self.d def testRollback2(self): - assert (BDay(10).rollback(datetime(2008, 1, 5)) == - datetime(2008, 1, 4)) + assert BDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4) def testRollforward1(self): assert BDay(10).rollforward(self.d) == self.d def testRollforward2(self): - assert (BDay(10).rollforward(datetime(2008, 1, 5)) == - datetime(2008, 1, 7)) + assert BDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7) def test_roll_date_object(self): offset = BDay() @@ -660,52 +735,84 @@ def test_roll_date_object(self): assert result == datetime(2012, 9, 15) def test_onOffset(self): - tests = [(BDay(), datetime(2008, 1, 1), True), - (BDay(), datetime(2008, 1, 5), False)] + tests = [ + (BDay(), datetime(2008, 1, 1), True), + (BDay(), datetime(2008, 1, 5), False), + ] for offset, d, expected in tests: assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((BDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 2), - 
datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - apply_cases.append((2 * BDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - apply_cases.append((-BDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - apply_cases.append((-2 * BDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) - - apply_cases.append((BDay(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), + }, + ) + ) + + apply_cases.append( + ( + 2 * BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + BDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -736,8 +843,7 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - msg = ("Only know how to combine business day with datetime or" - " timedelta") + msg = "Only know how to combine business day with datetime or" " timedelta" with pytest.raises(ApplyTypeError, match=msg): BDay().apply(BMonthEnd()) @@ -755,69 +861,56 @@ def setup_method(self, method): 
self.offset4 = BusinessHour(n=-4) from datetime import time as dt_time + self.offset5 = BusinessHour(start=dt_time(11, 0), end=dt_time(14, 30)) - self.offset6 = BusinessHour(start='20:00', end='05:00') - self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), - end=dt_time(6, 30)) - self.offset8 = BusinessHour(start=['09:00', '13:00'], - end=['12:00', '17:00']) - self.offset9 = BusinessHour(n=3, start=['09:00', '22:00'], - end=['13:00', '03:00']) - self.offset10 = BusinessHour(n=-1, start=['23:00', '13:00'], - end=['02:00', '17:00']) - - @pytest.mark.parametrize("start,end,match", [ - ( - dt_time(11, 0, 5), - '17:00', - "time data must be specified only with hour and minute" - ), - ( - 'AAA', - '17:00', - "time data must match '%H:%M' format" - ), - ( - '14:00:05', - '17:00', - "time data must match '%H:%M' format" - ), - ( - [], - '17:00', - "Must include at least 1 start time" - ), - ( - '09:00', - [], - "Must include at least 1 end time" - ), - ( - ['09:00', '11:00'], - '17:00', - "number of starting time and ending time must be the same" - ), - ( - ['09:00', '11:00'], - ['10:00'], - "number of starting time and ending time must be the same" - ), - ( - ['09:00', '11:00'], - ['12:00', '20:00'], - r"invalid starting and ending time\(s\): opening hours should not " - "touch or overlap with one another" - ), - ( - ['12:00', '20:00'], - ['09:00', '11:00'], - r"invalid starting and ending time\(s\): opening hours should not " - "touch or overlap with one another" - ), - ]) + self.offset6 = BusinessHour(start="20:00", end="05:00") + self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), end=dt_time(6, 30)) + self.offset8 = BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]) + self.offset9 = BusinessHour( + n=3, start=["09:00", "22:00"], end=["13:00", "03:00"] + ) + self.offset10 = BusinessHour( + n=-1, start=["23:00", "13:00"], end=["02:00", "17:00"] + ) + + @pytest.mark.parametrize( + "start,end,match", + [ + ( + dt_time(11, 0, 5), + "17:00", + "time data must be specified only with hour and minute", + ), + ("AAA", "17:00", "time data must match '%H:%M' format"), + ("14:00:05", "17:00", "time data must match '%H:%M' format"), + ([], "17:00", "Must include at least 1 start time"), + ("09:00", [], "Must include at least 1 end time"), + ( + ["09:00", "11:00"], + "17:00", + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["10:00"], + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["12:00", "20:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ( + ["12:00", "20:00"], + ["09:00", "11:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ], + ) def test_constructor_errors(self, start, end, match): - with pytest.raises(ValueError, - match=match): + with pytest.raises(ValueError, match=match): BusinessHour(start=start, end=end) def test_different_normalize_equals(self): @@ -827,68 +920,67 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset1) == '' - assert repr(self.offset2) == '<3 * BusinessHours: BH=09:00-17:00>' - assert repr(self.offset3) == '<-1 * BusinessHour: BH=09:00-17:00>' - assert repr(self.offset4) == '<-4 * BusinessHours: BH=09:00-17:00>' - - assert repr(self.offset5) == '' - assert repr(self.offset6) == '' - assert repr(self.offset7) == '<-2 * BusinessHours: BH=21:30-06:30>' - 
assert (repr(self.offset8) == - '') - assert (repr(self.offset9) == - '<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>') - assert (repr(self.offset10) == - '<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>') + assert repr(self.offset1) == "" + assert repr(self.offset2) == "<3 * BusinessHours: BH=09:00-17:00>" + assert repr(self.offset3) == "<-1 * BusinessHour: BH=09:00-17:00>" + assert repr(self.offset4) == "<-4 * BusinessHours: BH=09:00-17:00>" + + assert repr(self.offset5) == "" + assert repr(self.offset6) == "" + assert repr(self.offset7) == "<-2 * BusinessHours: BH=21:30-06:30>" + assert repr(self.offset8) == "" + assert repr(self.offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>" + assert repr(self.offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>" def test_with_offset(self): - expected = Timestamp('2014-07-01 13:00') + expected = Timestamp("2014-07-01 13:00") assert self.d + BusinessHour() * 3 == expected assert self.d + BusinessHour(n=3) == expected - @pytest.mark.parametrize("offset_name", [ - "offset1", - "offset2", - "offset3", - "offset4", - "offset8", - "offset9", - "offset10" - ]) + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) def test_eq_attribute(self, offset_name): offset = getattr(self, offset_name) assert offset == offset - @pytest.mark.parametrize("offset1,offset2", [ - (BusinessHour(start='09:00'), BusinessHour()), - (BusinessHour(start=['23:00', '13:00'], end=['12:00', '17:00']), - BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), - ]) + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(start="09:00"), BusinessHour()), + ( + BusinessHour(start=["23:00", "13:00"], end=["12:00", "17:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) def test_eq(self, offset1, offset2): assert offset1 == offset2 - @pytest.mark.parametrize("offset1,offset2", [ - (BusinessHour(), BusinessHour(-1)), - (BusinessHour(start='09:00'), BusinessHour(start='09:01')), - (BusinessHour(start='09:00', end='17:00'), - BusinessHour(start='17:00', end='09:01')), - (BusinessHour(start=['13:00', '23:00'], end=['18:00', '07:00']), - BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), - ]) + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(), BusinessHour(-1)), + (BusinessHour(start="09:00"), BusinessHour(start="09:01")), + ( + BusinessHour(start="09:00", end="17:00"), + BusinessHour(start="17:00", end="09:01"), + ), + ( + BusinessHour(start=["13:00", "23:00"], end=["18:00", "07:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) def test_neq(self, offset1, offset2): assert offset1 != offset2 - @pytest.mark.parametrize("offset_name", [ - "offset1", - "offset2", - "offset3", - "offset4", - "offset8", - "offset9", - "offset10" - ]) + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) def test_hash(self, offset_name): offset = getattr(self, offset_name) assert offset == offset @@ -940,20 +1032,18 @@ def testRollback1(self): assert self._offset(5).rollback(self.d) == self.d def testRollback2(self): - assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == - datetime(2014, 7, 4, 17, 0)) + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) def testRollforward1(self): assert self.offset1.rollforward(self.d) == self.d assert 
self.offset2.rollforward(self.d) == self.d assert self.offset3.rollforward(self.d) == self.d assert self.offset4.rollforward(self.d) == self.d - assert (self.offset5.rollforward(self.d) == - datetime(2014, 7, 1, 11, 0)) - assert (self.offset6.rollforward(self.d) == - datetime(2014, 7, 1, 20, 0)) - assert (self.offset7.rollforward(self.d) == - datetime(2014, 7, 1, 21, 30)) + assert self.offset5.rollforward(self.d) == datetime(2014, 7, 1, 11, 0) + assert self.offset6.rollforward(self.d) == datetime(2014, 7, 1, 20, 0) + assert self.offset7.rollforward(self.d) == datetime(2014, 7, 1, 21, 30) assert self.offset8.rollforward(self.d) == self.d assert self.offset9.rollforward(self.d) == self.d assert self.offset10.rollforward(self.d) == datetime(2014, 7, 1, 13) @@ -973,8 +1063,9 @@ def testRollforward1(self): assert self._offset(5).rollforward(self.d) == self.d def testRollforward2(self): - assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == - datetime(2014, 7, 7, 9)) + assert self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) def test_roll_date_object(self): offset = BusinessHour() @@ -988,111 +1079,156 @@ def test_roll_date_object(self): assert result == datetime(2014, 7, 7, 9) normalize_cases = [] - normalize_cases.append((BusinessHour(normalize=True), { - datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - normalize_cases.append((BusinessHour(-1, normalize=True), { - datetime(2014, 7, 1, 8): datetime(2014, 6, 30), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30), - datetime(2014, 7, 1, 0): datetime(2014, 6, 30), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - normalize_cases.append((BusinessHour(1, normalize=True, start='17:00', - end='04:00'), { - datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 2), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - @pytest.mark.parametrize('case', normalize_cases) + normalize_cases.append( + ( + BusinessHour(normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + 
normalize_cases.append( + ( + BusinessHour(-1, normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + BusinessHour(1, normalize=True, start="17:00", end="04:00"), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", normalize_cases) def test_normalize(self, case): offset, cases = case for dt, expected in cases.items(): assert offset.apply(dt) == expected on_offset_cases = [] - on_offset_cases.append((BusinessHour(), { - datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False})) - - on_offset_cases.append((BusinessHour(start='10:00', end='15:00'), { - datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) - - on_offset_cases.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False})) - - on_offset_cases.append((BusinessHour(start=['09:00', '13:00'], - end=['12:00', '17:00']), { - datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False, - datetime(2014, 7, 1, 12, 30): False})) - - on_offset_cases.append((BusinessHour(start=['19:00', '23:00'], - end=['21:00', '05:00']), { - datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False, - datetime(2014, 7, 4, 22): False})) - - @pytest.mark.parametrize('case', on_offset_cases) + 
on_offset_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="10:00", end="15:00"), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + datetime(2014, 7, 1, 12, 30): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["19:00", "23:00"], end=["21:00", "05:00"]), + { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + datetime(2014, 7, 4, 22): False, + }, + ) + ) + + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, cases = case for dt, expected in cases.items(): @@ -1101,233 +1237,462 @@ def test_onOffset(self, case): opening_time_cases = [] # opening time should be affected by sign of n, not by n's value and # end - opening_time_cases.append(([BusinessHour(), BusinessHour(n=2), - BusinessHour(n=4), BusinessHour(end='10:00'), - BusinessHour(n=2, end='4:00'), - BusinessHour(n=4, end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - # if timestamp is on opening time, next opening time is - # as it is - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), - datetime(2014, 7, 2, 9)), - # 2014-07-05 is saturday - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 
7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), - datetime(2014, 7, 7, 9))})) - - opening_time_cases.append(([BusinessHour(start='11:15'), - BusinessHour(n=2, start='11:15'), - BusinessHour(n=3, start='11:15'), - BusinessHour(start='11:15', end='10:00'), - BusinessHour(n=2, start='11:15', end='4:00'), - BusinessHour(n=3, start='11:15', - end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 11, 15)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 11, 15)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15))})) - - opening_time_cases.append(([BusinessHour(-1), BusinessHour(n=-2), - BusinessHour(n=-4), - BusinessHour(n=-1, end='10:00'), - BusinessHour(n=-2, end='4:00'), - BusinessHour(n=-4, end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 3, 9)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 8, 9))})) - - opening_time_cases.append(([BusinessHour(start='17:00', end='05:00'), - BusinessHour(n=3, start='17:00', - end='03:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), - datetime(2014, 6, 30, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 
7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 3, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17), - datetime(2014, 7, 7, 17)), })) - - opening_time_cases.append(([BusinessHour(-1, start='17:00', end='05:00'), - BusinessHour(n=-2, start='17:00', - end='03:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 17))})) - - opening_time_cases.append(([BusinessHour(start=['11:15', '15:00'], - end=['13:00', '20:00']), - BusinessHour(n=3, start=['11:15', '15:00'], - end=['12:00', '20:00']), - BusinessHour(start=['11:15', '15:00'], - end=['13:00', '17:00']), - BusinessHour(n=2, start=['11:15', '15:00'], - end=['12:00', '03:00']), - BusinessHour(n=3, start=['11:15', '15:00'], - end=['13:00', '16:00'])], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 15)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 15)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 12): (datetime(2014, 7, 7, 15), - 
datetime(2014, 7, 7, 11, 15))})) - - opening_time_cases.append(([BusinessHour(n=-1, start=['17:00', '08:00'], - end=['05:00', '10:00']), - BusinessHour(n=-2, start=['08:00', '17:00'], - end=['10:00', '03:00'])], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 8), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 8), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 8))})) - - @pytest.mark.parametrize('case', opening_time_cases) + opening_time_cases.append( + ( + [ + BusinessHour(), + BusinessHour(n=2), + BusinessHour(n=4), + BusinessHour(end="10:00"), + BusinessHour(n=2, end="4:00"), + BusinessHour(n=4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9), + ), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="11:15"), + BusinessHour(n=2, start="11:15"), + BusinessHour(n=3, start="11:15"), + BusinessHour(start="11:15", end="10:00"), + BusinessHour(n=2, start="11:15", end="4:00"), + BusinessHour(n=3, start="11:15", end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 
2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1), + BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end="10:00"), + BusinessHour(n=-2, end="4:00"), + BusinessHour(n=-4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="17:00", end="05:00"), + BusinessHour(n=3, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 4, 17): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 17, 1): ( + datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1, start="17:00", end="05:00"), + BusinessHour(n=-2, start="17:00", 
end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start=["11:15", "15:00"], end=["13:00", "20:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["12:00", "20:00"]), + BusinessHour(start=["11:15", "15:00"], end=["13:00", "17:00"]), + BusinessHour(n=2, start=["11:15", "15:00"], end=["12:00", "03:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["13:00", "16:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 12): ( + datetime(2014, 7, 7, 15), + datetime(2014, 7, 7, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(n=-1, start=["17:00", "08:00"], end=["05:00", "10:00"]), + BusinessHour(n=-2, start=["08:00", "17:00"], end=["10:00", "03:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 8), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 9): ( 
+ datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 8), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 8), + ), + }, + ) + ) + + @pytest.mark.parametrize("case", opening_time_cases) def test_opening_time(self, case): _offsets, cases = case for offset in _offsets: @@ -1336,251 +1701,343 @@ def test_opening_time(self, case): assert offset._prev_opening_time(dt) == exp_prev apply_cases = [] - apply_cases.append((BusinessHour(), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append((BusinessHour(4), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) - - apply_cases.append((BusinessHour(-1), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), - datetime(2014, 7, 2, 23): 
datetime(2014, 7, 2, 16), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)})) - - apply_cases.append((BusinessHour(-4), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)})) - - apply_cases.append((BusinessHour(start='13:00', end='16:00'), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) - - apply_cases.append((BusinessHour(n=2, start='13:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), - datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start='13:00', end='16:00'), { - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), - datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) - - apply_cases.append((BusinessHour(n=-3, start='10:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 12, 30): 
datetime(2014, 7, 3, 15, 30), - datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)})) - - apply_cases.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), - datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), - datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), - datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), - datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)})) + apply_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(4), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 
7, 12, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(-1), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(-4), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=2, start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="13:00", end="16:00"), + { + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + 
datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-3, start="10:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30), + }, + ) + ) # long business hours (see gh-26381) - apply_cases.append((BusinessHour(n=4, start='00:00', end='23:00'), { - datetime(2014, 7, 3, 22): datetime(2014, 7, 4, 3), - datetime(2014, 7, 4, 22): datetime(2014, 7, 7, 3), - datetime(2014, 7, 3, 22, 30): datetime(2014, 7, 4, 3, 30), - datetime(2014, 7, 3, 22, 20): datetime(2014, 7, 4, 3, 20), - datetime(2014, 7, 4, 22, 30, 30): datetime(2014, 7, 7, 3, 30, 30), - datetime(2014, 7, 4, 22, 30, 20): datetime(2014, 7, 7, 3, 30, 20)})) - - apply_cases.append((BusinessHour(n=-4, start='00:00', end='23:00'), { - datetime(2014, 7, 4, 3): datetime(2014, 7, 3, 22), - datetime(2014, 7, 7, 3): datetime(2014, 7, 4, 22), - datetime(2014, 7, 4, 3, 30): datetime(2014, 7, 3, 22, 30), - datetime(2014, 7, 4, 3, 20): datetime(2014, 7, 3, 22, 20), - datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), - 
datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20)})) + apply_cases.append( + ( + BusinessHour(n=4, start="00:00", end="23:00"), + { + datetime(2014, 7, 3, 22): datetime(2014, 7, 4, 3), + datetime(2014, 7, 4, 22): datetime(2014, 7, 7, 3), + datetime(2014, 7, 3, 22, 30): datetime(2014, 7, 4, 3, 30), + datetime(2014, 7, 3, 22, 20): datetime(2014, 7, 4, 3, 20), + datetime(2014, 7, 4, 22, 30, 30): datetime(2014, 7, 7, 3, 30, 30), + datetime(2014, 7, 4, 22, 30, 20): datetime(2014, 7, 7, 3, 30, 20), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start="00:00", end="23:00"), + { + datetime(2014, 7, 4, 3): datetime(2014, 7, 3, 22), + datetime(2014, 7, 7, 3): datetime(2014, 7, 4, 22), + datetime(2014, 7, 4, 3, 30): datetime(2014, 7, 3, 22, 30), + datetime(2014, 7, 4, 3, 20): datetime(2014, 7, 3, 22, 20), + datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), + datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20), + }, + ) + ) # multiple business hours - apply_cases.append((BusinessHour(start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), - # out of business hours - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append((BusinessHour(n=4, start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30)})) - - apply_cases.append((BusinessHour(n=-4, start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 
12), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), - datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start=['19:00', '03:00'], - end=['01:00', '05:00']), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), - datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + BusinessHour(start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), + # out of business hours + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), + 
datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 12), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start=["19:00", "03:00"], end=["01:00", "05:00"]), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), + datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -1588,91 +2045,119 @@ def test_apply(self, case): apply_large_n_cases = [] # A week later - apply_large_n_cases.append((BusinessHour(40), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), - datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), - datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), - datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), - datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), - datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), - datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), - datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(40), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30), + }, + ) + ) # 3 days and 1 hour before - apply_large_n_cases.append((BusinessHour(-25), { - 
datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), - datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(-25), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) # 5 days and 3 hours later - apply_large_n_cases.append((BusinessHour(28, start='21:00', end='02:00'), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), - datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), - datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(28, start="21:00", end="02:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) # large n for multiple opening hours (3 days and 1 hour before) - apply_large_n_cases.append((BusinessHour(n=-25, start=['09:00', '14:00'], - end=['12:00', '19:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), - datetime(2014, 
7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(n=-25, start=["09:00", "14:00"], end=["12:00", "19:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) # 5 days and 3 hours later - apply_large_n_cases.append((BusinessHour(28, start=['21:00', '03:00'], - end=['01:00', '04:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), - datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), - datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) - - @pytest.mark.parametrize('case', apply_large_n_cases) + apply_large_n_cases.append( + ( + BusinessHour(28, start=["21:00", "03:00"], end=["01:00", "04:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), + datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_large_n_cases) def test_apply_large_n(self, case): offset, cases = case for base, expected in cases.items(): @@ -1681,57 +2166,89 @@ def test_apply_large_n(self, case): def test_apply_nanoseconds(self): tests = [] - 
tests.append((BusinessHour(), - {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp( - '2014-07-04 16:00') + Nano(5), - Timestamp('2014-07-04 16:00') + Nano(5): Timestamp( - '2014-07-07 09:00') + Nano(5), - Timestamp('2014-07-04 16:00') - Nano(5): Timestamp( - '2014-07-04 17:00') - Nano(5)})) - - tests.append((BusinessHour(-1), - {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp( - '2014-07-04 14:00') + Nano(5), - Timestamp('2014-07-04 10:00') + Nano(5): Timestamp( - '2014-07-04 09:00') + Nano(5), - Timestamp('2014-07-04 10:00') - Nano(5): Timestamp( - '2014-07-03 17:00') - Nano(5), })) + tests.append( + ( + BusinessHour(), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 16:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + + Nano(5): Timestamp("2014-07-07 09:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + - Nano(5): Timestamp("2014-07-04 17:00") + - Nano(5), + }, + ) + ) + + tests.append( + ( + BusinessHour(-1), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 14:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + + Nano(5): Timestamp("2014-07-04 09:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + - Nano(5): Timestamp("2014-07-03 17:00") + - Nano(5), + }, + ) + ) for offset, cases in tests: for base, expected in cases.items(): assert_offset_equal(offset, base, expected) def test_datetimeindex(self): - idx1 = date_range(start='2014-07-04 15:00', end='2014-07-08 10:00', - freq='BH') - idx2 = date_range(start='2014-07-04 15:00', periods=12, freq='BH') - idx3 = date_range(end='2014-07-08 10:00', periods=12, freq='BH') - expected = DatetimeIndex(['2014-07-04 15:00', '2014-07-04 16:00', - '2014-07-07 09:00', - '2014-07-07 10:00', '2014-07-07 11:00', - '2014-07-07 12:00', - '2014-07-07 13:00', '2014-07-07 14:00', - '2014-07-07 15:00', - '2014-07-07 16:00', '2014-07-08 09:00', - '2014-07-08 10:00'], - freq='BH') + idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH") + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH") + expected = DatetimeIndex( + [ + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + ], + freq="BH", + ) for idx in [idx1, idx2, idx3]: tm.assert_index_equal(idx, expected) - idx1 = date_range(start='2014-07-04 15:45', end='2014-07-08 10:45', - freq='BH') - idx2 = date_range(start='2014-07-04 15:45', periods=12, freq='BH') - idx3 = date_range(end='2014-07-08 10:45', periods=12, freq='BH') - - expected = DatetimeIndex(['2014-07-04 15:45', '2014-07-04 16:45', - '2014-07-07 09:45', - '2014-07-07 10:45', '2014-07-07 11:45', - '2014-07-07 12:45', - '2014-07-07 13:45', '2014-07-07 14:45', - '2014-07-07 15:45', - '2014-07-07 16:45', '2014-07-08 09:45', - '2014-07-08 10:45'], - freq='BH') + idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH") + idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH") + + expected = DatetimeIndex( + [ + "2014-07-04 15:45", + "2014-07-04 16:45", + "2014-07-07 09:45", + "2014-07-07 10:45", + "2014-07-07 11:45", + "2014-07-07 12:45", + "2014-07-07 13:45", + "2014-07-07 14:45", + "2014-07-07 15:45", + "2014-07-07 16:45", + "2014-07-08 09:45", + "2014-07-08 10:45", + ], + freq="BH", + ) 
expected = idx1 for idx in [idx1, idx2, idx3]: tm.assert_index_equal(idx, expected) @@ -1739,8 +2256,7 @@ def test_datetimeindex(self): class TestCustomBusinessHour(Base): _offset = CustomBusinessHour - holidays = ['2014-06-27', datetime(2014, 6, 30), - np.datetime64('2014-07-02')] + holidays = ["2014-06-27", datetime(2014, 6, 30), np.datetime64("2014-07-02")] def setup_method(self, method): # 2014 Calendar to check custom holidays @@ -1749,18 +2265,19 @@ def setup_method(self, method): # 29 30 7/1 2 3 4 5 # 6 7 8 9 10 11 12 self.d = datetime(2014, 7, 1, 10, 00) - self.offset1 = CustomBusinessHour(weekmask='Tue Wed Thu Fri') + self.offset1 = CustomBusinessHour(weekmask="Tue Wed Thu Fri") self.offset2 = CustomBusinessHour(holidays=self.holidays) def test_constructor_errors(self): from datetime import time as dt_time + with pytest.raises(ValueError): CustomBusinessHour(start=dt_time(11, 0, 5)) with pytest.raises(ValueError): - CustomBusinessHour(start='AAA') + CustomBusinessHour(start="AAA") with pytest.raises(ValueError): - CustomBusinessHour(start='14:00:05') + CustomBusinessHour(start="14:00:05") def test_different_normalize_equals(self): # GH#21404 changed __eq__ to return False when `normalize` doesnt match @@ -1769,11 +2286,11 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset1) == '' - assert repr(self.offset2) == '' + assert repr(self.offset1) == "" + assert repr(self.offset2) == "" def test_with_offset(self): - expected = Timestamp('2014-07-01 13:00') + expected = Timestamp("2014-07-01 13:00") assert self.d + CustomBusinessHour() * 3 == expected assert self.d + CustomBusinessHour(n=3) == expected @@ -1783,17 +2300,18 @@ def test_eq(self): assert offset == offset assert CustomBusinessHour() != CustomBusinessHour(-1) - assert (CustomBusinessHour(start='09:00') == - CustomBusinessHour()) - assert (CustomBusinessHour(start='09:00') != - CustomBusinessHour(start='09:01')) - assert (CustomBusinessHour(start='09:00', end='17:00') != - CustomBusinessHour(start='17:00', end='09:01')) - - assert (CustomBusinessHour(weekmask='Tue Wed Thu Fri') != - CustomBusinessHour(weekmask='Mon Tue Wed Thu Fri')) - assert (CustomBusinessHour(holidays=['2014-06-27']) != - CustomBusinessHour(holidays=['2014-06-28'])) + assert CustomBusinessHour(start="09:00") == CustomBusinessHour() + assert CustomBusinessHour(start="09:00") != CustomBusinessHour(start="09:01") + assert CustomBusinessHour(start="09:00", end="17:00") != CustomBusinessHour( + start="17:00", end="09:01" + ) + + assert CustomBusinessHour(weekmask="Tue Wed Thu Fri") != CustomBusinessHour( + weekmask="Mon Tue Wed Thu Fri" + ) + assert CustomBusinessHour(holidays=["2014-06-27"]) != CustomBusinessHour( + holidays=["2014-06-28"] + ) def test_sub(self): # override the Base.test_sub implementation because self.offset2 is @@ -1821,8 +2339,9 @@ def testRollback1(self): assert self.offset2.rollback(d) == datetime(2014, 6, 26, 17) def testRollback2(self): - assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == - datetime(2014, 7, 4, 17, 0)) + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) def testRollforward1(self): assert self.offset1.rollforward(self.d) == self.d @@ -1833,8 +2352,9 @@ def testRollforward1(self): assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9) def testRollforward2(self): - assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == - datetime(2014, 7, 7, 9)) + assert 
self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) def test_roll_date_object(self): offset = BusinessHour() @@ -1848,46 +2368,61 @@ def test_roll_date_object(self): assert result == datetime(2014, 7, 7, 9) normalize_cases = [] - normalize_cases.append(( - CustomBusinessHour(normalize=True, holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3), - datetime(2014, 7, 1, 23): datetime(2014, 7, 3), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - normalize_cases.append(( - CustomBusinessHour(-1, normalize=True, holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 6, 26), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 26), - datetime(2014, 7, 1, 0): datetime(2014, 6, 26), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - normalize_cases.append(( - CustomBusinessHour(1, normalize=True, - start='17:00', end='04:00', - holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 3), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - @pytest.mark.parametrize('norm_cases', normalize_cases) + normalize_cases.append( + ( + CustomBusinessHour(normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 3), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour(-1, normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 26), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 26), + datetime(2014, 7, 1, 0): datetime(2014, 6, 26), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour( + 1, normalize=True, start="17:00", end="04:00", holidays=holidays + ), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 
7, 2, 3): datetime(2014, 7, 3), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("norm_cases", normalize_cases) def test_normalize(self, norm_cases): offset, cases = norm_cases for dt, expected in cases.items(): @@ -1896,59 +2431,74 @@ def test_normalize(self, norm_cases): def test_onOffset(self): tests = [] - tests.append((CustomBusinessHour(start='10:00', end='15:00', - holidays=self.holidays), - {datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) + tests.append( + ( + CustomBusinessHour(start="10:00", end="15:00", holidays=self.holidays), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) for offset, cases in tests: for dt, expected in cases.items(): assert offset.onOffset(dt) == expected apply_cases = [] - apply_cases.append(( - CustomBusinessHour(holidays=holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append(( - CustomBusinessHour(4, holidays=holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) - - @pytest.mark.parametrize('apply_case', apply_cases) + apply_cases.append( + ( + CustomBusinessHour(holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 
30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + CustomBusinessHour(4, holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("apply_case", apply_cases) def test_apply(self, apply_case): offset, cases = apply_case for base, expected in cases.items(): @@ -1956,24 +2506,40 @@ def test_apply(self, apply_case): nano_cases = [] nano_cases.append( - (CustomBusinessHour(holidays=holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): - Timestamp('2014-07-01 16:00') + Nano(5), - Timestamp('2014-07-01 16:00') + Nano(5): - Timestamp('2014-07-03 09:00') + Nano(5), - Timestamp('2014-07-01 16:00') - Nano(5): - Timestamp('2014-07-01 17:00') - Nano(5)})) + ( + CustomBusinessHour(holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 16:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + + Nano(5): Timestamp("2014-07-03 09:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + - Nano(5): Timestamp("2014-07-01 17:00") + - Nano(5), + }, + ) + ) nano_cases.append( - (CustomBusinessHour(-1, holidays=holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): - Timestamp('2014-07-01 14:00') + Nano(5), - Timestamp('2014-07-01 10:00') + Nano(5): - Timestamp('2014-07-01 09:00') + Nano(5), - Timestamp('2014-07-01 10:00') - Nano(5): - Timestamp('2014-06-26 17:00') - Nano(5)})) - - @pytest.mark.parametrize('nano_case', nano_cases) + ( + CustomBusinessHour(-1, holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 14:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + + Nano(5): Timestamp("2014-07-01 09:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + - Nano(5): Timestamp("2014-06-26 17:00") + - Nano(5), + }, + ) + ) + + @pytest.mark.parametrize("nano_case", nano_cases) def test_apply_nanoseconds(self, nano_case): offset, cases = nano_case for base, expected in cases.items(): @@ -1985,7 +2551,7 @@ class TestCustomBusinessDay(Base): def setup_method(self, method): self.d = datetime(2008, 1, 1) - self.nd = np_datetime64_compat('2008-01-01 00:00:00Z') + self.nd = np_datetime64_compat("2008-01-01 00:00:00Z") self.offset = CDay() self.offset1 = self.offset @@ -1998,13 +2564,13 @@ def 
test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '' - assert repr(self.offset2) == '<2 * CustomBusinessDays>' + assert repr(self.offset) == "" + assert repr(self.offset2) == "<2 * CustomBusinessDays>" if compat.PY37: - expected = '' + expected = "" else: - expected = '' + expected = "" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -2029,15 +2595,13 @@ def testRollback1(self): assert CDay(10).rollback(self.d) == self.d def testRollback2(self): - assert (CDay(10).rollback(datetime(2008, 1, 5)) == - datetime(2008, 1, 4)) + assert CDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4) def testRollforward1(self): assert CDay(10).rollforward(self.d) == self.d def testRollforward2(self): - assert (CDay(10).rollforward(datetime(2008, 1, 5)) == - datetime(2008, 1, 7)) + assert CDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7) def test_roll_date_object(self): offset = CDay() @@ -2057,54 +2621,86 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CDay(), datetime(2008, 1, 1), True), - (CDay(), datetime(2008, 1, 5), False)] + on_offset_cases = [ + (CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - apply_cases.append((2 * CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - apply_cases.append((-CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - apply_cases.append((-2 * CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) - - apply_cases.append((CDay(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), + }, + ) + ) + + apply_cases.append( + ( + 2 * CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 
8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + CDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2130,15 +2726,16 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - msg = ("Only know how to combine trading day with datetime, datetime64" - " or timedelta") + msg = ( + "Only know how to combine trading day with datetime, datetime64" + " or timedelta" + ) with pytest.raises(ApplyTypeError, match=msg): CDay().apply(BMonthEnd()) def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-05-01', datetime(2013, 5, 1), - np.datetime64('2014-05-01')] + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] tday = CDay(holidays=holidays) for year in range(2012, 2015): dt = datetime(year, 4, 30) @@ -2147,8 +2744,8 @@ def test_holidays(self): assert rs == xp def test_weekmask(self): - weekmask_saudi = 'Sat Sun Mon Tue Wed' # Thu-Fri Weekend - weekmask_uae = '1111001' # Fri-Sat Weekend + weekmask_saudi = "Sat Sun Mon Tue Wed" # Thu-Fri Weekend + weekmask_uae = "1111001" # Fri-Sat Weekend weekmask_egypt = [1, 1, 1, 1, 0, 0, 1] # Fri-Sat Weekend bday_saudi = CDay(weekmask=weekmask_saudi) bday_uae = CDay(weekmask=weekmask_uae) @@ -2166,9 +2763,8 @@ def test_weekmask(self): assert xp2 == dt + 2 * bday_egypt def test_weekmask_and_holidays(self): - weekmask_egypt = 'Sun Mon Tue Wed Thu' # Fri-Sat Weekend - holidays = ['2012-05-01', datetime(2013, 5, 1), - np.datetime64('2014-05-01')] + weekmask_egypt = "Sun Mon Tue Wed Thu" # Fri-Sat Weekend + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime(2013, 4, 30) xp_egypt = datetime(2013, 5, 5) @@ -2191,14 +2787,13 @@ def _check_roundtrip(obj): def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + pth = datapath("tseries", "offsets", "data", "cday-0.14.1.pickle") cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 class CustomBusinessMonthBase: - def setup_method(self, method): self.d = datetime(2008, 1, 1) @@ -2226,7 +2821,7 @@ def _check_roundtrip(obj): def test_copy(self): # GH 17452 - off = self._offset(weekmask='Mon Wed Fri') + off = self._offset(weekmask="Mon Wed Fri") assert off == off.copy() @@ -2240,15 +2835,14 @@ 
def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '' - assert repr(self.offset2) == '<2 * CustomBusinessMonthEnds>' + assert repr(self.offset) == "" + assert repr(self.offset2) == "<2 * CustomBusinessMonthEnds>" def testCall(self): assert self.offset2(self.d) == datetime(2008, 2, 29) def testRollback1(self): - assert (CDay(10).rollback(datetime(2007, 12, 31)) == - datetime(2007, 12, 31)) + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) def testRollback2(self): assert CBMonthEnd(10).rollback(self.d) == datetime(2007, 12, 31) @@ -2274,36 +2868,68 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CBMonthEnd(), datetime(2008, 1, 31), True), - (CBMonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) - - apply_cases.append((2 * CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 2, 7): datetime(2008, 3, 31)})) - - apply_cases.append((-CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 2, 8): datetime(2008, 1, 31)})) - - apply_cases.append((-2 * CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2007, 11, 30), - datetime(2008, 2, 9): datetime(2007, 12, 31)})) - - apply_cases.append((CBMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31), + }, + ) + ) + + apply_cases.append( + ( + CBMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2330,8 +2956,7 @@ def test_apply_large_n(self): def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-01-31', datetime(2012, 2, 28), - np.datetime64('2012-02-29')] + holidays = ["2012-01-31", datetime(2012, 2, 28), np.datetime64("2012-02-29")] bm_offset = CBMonthEnd(holidays=holidays) dt = datetime(2012, 1, 1) assert dt + bm_offset == datetime(2012, 1, 30) @@ -2340,11 +2965,13 @@ def test_holidays(self): @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") def test_datetimeindex(self): from pandas.tseries.holiday import USFederalHolidayCalendar + hcal = USFederalHolidayCalendar() freq = CBMonthEnd(calendar=hcal) - assert 
(date_range(start='20120101', end='20130101', - freq=freq).tolist()[0] == datetime(2012, 1, 31)) + assert date_range(start="20120101", end="20130101", freq=freq).tolist()[ + 0 + ] == datetime(2012, 1, 31) class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): @@ -2357,15 +2984,14 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '' - assert repr(self.offset2) == '<2 * CustomBusinessMonthBegins>' + assert repr(self.offset) == "" + assert repr(self.offset2) == "<2 * CustomBusinessMonthBegins>" def testCall(self): assert self.offset2(self.d) == datetime(2008, 3, 3) def testRollback1(self): - assert (CDay(10).rollback(datetime(2007, 12, 31)) == - datetime(2007, 12, 31)) + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) def testRollback2(self): assert CBMonthBegin(10).rollback(self.d) == datetime(2008, 1, 1) @@ -2391,36 +3017,68 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CBMonthBegin(), datetime(2008, 1, 1), True), - (CBMonthBegin(), datetime(2008, 1, 31), False)] + on_offset_cases = [ + (CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) apply_cases = [] - apply_cases.append((CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 2, 7): datetime(2008, 3, 3)})) - - apply_cases.append((2 * CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 2, 7): datetime(2008, 4, 1)})) - - apply_cases.append((-CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2007, 12, 3), - datetime(2008, 2, 8): datetime(2008, 2, 1)})) - - apply_cases.append((-2 * CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2007, 11, 1), - datetime(2008, 2, 9): datetime(2008, 1, 1)})) - - apply_cases.append((CBMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 7): datetime(2008, 2, 1)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 2, 7): datetime(2008, 4, 1), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 12, 3), + datetime(2008, 2, 8): datetime(2008, 2, 1), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 11, 1), + datetime(2008, 2, 9): datetime(2008, 1, 1), + }, + ) + ) + + apply_cases.append( + ( + CBMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 7): datetime(2008, 2, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2448,8 +3106,7 @@ def test_apply_large_n(self): def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-02-01', datetime(2012, 2, 2), - np.datetime64('2012-03-01')] + holidays = ["2012-02-01", datetime(2012, 2, 2), np.datetime64("2012-03-01")] bm_offset = CBMonthBegin(holidays=holidays) dt = datetime(2012, 1, 1) @@ -2460,8 +3117,9 @@ def 
test_holidays(self): def test_datetimeindex(self): hcal = USFederalHolidayCalendar() cbmb = CBMonthBegin(calendar=hcal) - assert (date_range(start='20120101', end='20130101', - freq=cbmb).tolist()[0] == datetime(2012, 1, 3)) + assert date_range(start="20120101", end="20130101", freq=cbmb).tolist()[ + 0 + ] == datetime(2012, 1, 3) class TestWeek(Base): @@ -2490,42 +3148,66 @@ def test_isAnchored(self): offset_cases = [] # not business week - offset_cases.append((Week(), { - datetime(2008, 1, 1): datetime(2008, 1, 8), - datetime(2008, 1, 4): datetime(2008, 1, 11), - datetime(2008, 1, 5): datetime(2008, 1, 12), - datetime(2008, 1, 6): datetime(2008, 1, 13), - datetime(2008, 1, 7): datetime(2008, 1, 14)})) + offset_cases.append( + ( + Week(), + { + datetime(2008, 1, 1): datetime(2008, 1, 8), + datetime(2008, 1, 4): datetime(2008, 1, 11), + datetime(2008, 1, 5): datetime(2008, 1, 12), + datetime(2008, 1, 6): datetime(2008, 1, 13), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) # Mon - offset_cases.append((Week(weekday=0), { - datetime(2007, 12, 31): datetime(2008, 1, 7), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 14)})) + offset_cases.append( + ( + Week(weekday=0), + { + datetime(2007, 12, 31): datetime(2008, 1, 7), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) # n=0 -> roll forward. Mon - offset_cases.append((Week(0, weekday=0), { - datetime(2007, 12, 31): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + offset_cases.append( + ( + Week(0, weekday=0), + { + datetime(2007, 12, 31): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) # n=0 -> roll forward. 
Mon - offset_cases.append((Week(-2, weekday=1), { - datetime(2010, 4, 6): datetime(2010, 3, 23), - datetime(2010, 4, 8): datetime(2010, 3, 30), - datetime(2010, 4, 5): datetime(2010, 3, 23)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + Week(-2, weekday=1), + { + datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('weekday', range(7)) + @pytest.mark.parametrize("weekday", range(7)) def test_onOffset(self, weekday): offset = Week(weekday=weekday) @@ -2558,8 +3240,9 @@ def test_constructor(self): WeekOfMonth(n=1, week=0, weekday=-7) def test_repr(self): - assert (repr(WeekOfMonth(weekday=1, week=2)) == - "") + assert ( + repr(WeekOfMonth(weekday=1, week=2)) == "" + ) def test_offset(self): date1 = datetime(2011, 1, 4) # 1st Tuesday of Month @@ -2573,12 +3256,10 @@ def test_offset(self): (-2, 2, 1, date2, datetime(2010, 11, 16)), (-2, 2, 1, date3, datetime(2010, 11, 16)), (-2, 2, 1, date4, datetime(2010, 12, 21)), - (-1, 2, 1, date1, datetime(2010, 12, 21)), (-1, 2, 1, date2, datetime(2010, 12, 21)), (-1, 2, 1, date3, datetime(2010, 12, 21)), (-1, 2, 1, date4, datetime(2011, 1, 18)), - (0, 0, 1, date1, datetime(2011, 1, 4)), (0, 0, 1, date2, datetime(2011, 2, 1)), (0, 0, 1, date3, datetime(2011, 2, 1)), @@ -2591,7 +3272,6 @@ def test_offset(self): (0, 1, 1, date2, datetime(2011, 1, 11)), (0, 2, 1, date3, datetime(2011, 1, 18)), (0, 3, 1, date4, datetime(2011, 1, 25)), - (1, 0, 0, date1, datetime(2011, 2, 7)), (1, 0, 0, date2, datetime(2011, 2, 7)), (1, 0, 0, date3, datetime(2011, 2, 7)), @@ -2604,16 +3284,15 @@ def test_offset(self): (1, 0, 2, date2, datetime(2011, 2, 2)), (1, 0, 2, date3, datetime(2011, 2, 2)), (1, 0, 2, date4, datetime(2011, 2, 2)), - (1, 2, 1, date1, datetime(2011, 1, 18)), (1, 2, 1, date2, datetime(2011, 1, 18)), (1, 2, 1, date3, datetime(2011, 2, 15)), (1, 2, 1, date4, datetime(2011, 2, 15)), - (2, 2, 1, date1, datetime(2011, 2, 15)), (2, 2, 1, date2, datetime(2011, 2, 15)), (2, 2, 1, date3, datetime(2011, 3, 15)), - (2, 2, 1, date4, datetime(2011, 3, 15))] + (2, 2, 1, date4, datetime(2011, 3, 15)), + ] for n, week, weekday, dt, expected in test_cases: offset = WeekOfMonth(n, week=week, weekday=weekday) @@ -2626,14 +3305,16 @@ def test_offset(self): result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) assert result == datetime(2011, 2, 2) - on_offset_cases = [(0, 0, datetime(2011, 2, 7), True), - (0, 0, datetime(2011, 2, 6), False), - (0, 0, datetime(2011, 2, 14), False), - (1, 0, datetime(2011, 2, 14), True), - (0, 1, datetime(2011, 2, 1), True), - (0, 1, datetime(2011, 2, 8), False)] + on_offset_cases = [ + (0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): week, weekday, dt, expected = case offset = WeekOfMonth(week=week, weekday=weekday) @@ -2661,10 +3342,10 @@ def test_offset(self): next_sat = datetime(2013, 9, 28) offset_sat = LastWeekOfMonth(n=1, weekday=5) - one_day_before = (last_sat + 
timedelta(days=-1)) + one_day_before = last_sat + timedelta(days=-1) assert one_day_before + offset_sat == last_sat - one_day_after = (last_sat + timedelta(days=+1)) + one_day_after = last_sat + timedelta(days=+1) assert one_day_after + offset_sat == next_sat # Test On that day @@ -2701,16 +3382,16 @@ def test_offset(self): (WeekDay.SUN, datetime(2013, 2, 25), False), # Not a SUN (WeekDay.MON, datetime(2013, 2, 25), True), (WeekDay.SAT, datetime(2013, 11, 30), True), - (WeekDay.SAT, datetime(2006, 8, 26), True), (WeekDay.SAT, datetime(2007, 8, 25), True), (WeekDay.SAT, datetime(2008, 8, 30), True), (WeekDay.SAT, datetime(2009, 8, 29), True), (WeekDay.SAT, datetime(2010, 8, 28), True), (WeekDay.SAT, datetime(2011, 8, 27), True), - (WeekDay.SAT, datetime(2019, 8, 31), True)] + (WeekDay.SAT, datetime(2019, 8, 31), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): weekday, dt, expected = case offset = LastWeekOfMonth(weekday=weekday) @@ -2723,31 +3404,33 @@ class TestSemiMonthEnd(Base): offset2 = _offset(2) def test_offset_whole_year(self): - dates = (datetime(2007, 12, 31), - datetime(2008, 1, 15), - datetime(2008, 1, 31), - datetime(2008, 2, 15), - datetime(2008, 2, 29), - datetime(2008, 3, 15), - datetime(2008, 3, 31), - datetime(2008, 4, 15), - datetime(2008, 4, 30), - datetime(2008, 5, 15), - datetime(2008, 5, 31), - datetime(2008, 6, 15), - datetime(2008, 6, 30), - datetime(2008, 7, 15), - datetime(2008, 7, 31), - datetime(2008, 8, 15), - datetime(2008, 8, 31), - datetime(2008, 9, 15), - datetime(2008, 9, 30), - datetime(2008, 10, 15), - datetime(2008, 10, 31), - datetime(2008, 11, 15), - datetime(2008, 11, 30), - datetime(2008, 12, 15), - datetime(2008, 12, 31)) + dates = ( + datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31), + ) for base, exp_date in zip(dates[:-1], dates[1:]): assert_offset_equal(SemiMonthEnd(), base, exp_date) @@ -2763,94 +3446,142 @@ def test_offset_whole_year(self): tm.assert_index_equal(result, exp) # ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq='SM') + result = date_range(start=dates[0], end=dates[-1], freq="SM") exp = DatetimeIndex(dates) tm.assert_index_equal(result, exp) offset_cases = [] - offset_cases.append((SemiMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 15): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 15), - datetime(2006, 12, 14): datetime(2006, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 15), - datetime(2007, 1, 1): datetime(2007, 1, 15), - datetime(2006, 12, 1): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(day_of_month=20), { - datetime(2008, 1, 1): datetime(2008, 1, 20), - datetime(2008, 1, 15): datetime(2008, 1, 20), - 
datetime(2008, 1, 21): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 20), - datetime(2006, 12, 14): datetime(2006, 12, 20), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 20), - datetime(2007, 1, 1): datetime(2007, 1, 20), - datetime(2006, 12, 1): datetime(2006, 12, 20), - datetime(2006, 12, 15): datetime(2006, 12, 20)})) - - offset_cases.append((SemiMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 16): datetime(2008, 1, 31), - datetime(2008, 1, 15): datetime(2008, 1, 15), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 15)})) - - offset_cases.append((SemiMonthEnd(0, day_of_month=16), { - datetime(2008, 1, 1): datetime(2008, 1, 16), - datetime(2008, 1, 16): datetime(2008, 1, 16), - datetime(2008, 1, 15): datetime(2008, 1, 16), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 16)})) - - offset_cases.append((SemiMonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2007, 1, 15), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2007, 1, 16): datetime(2007, 2, 15), - datetime(2006, 11, 1): datetime(2006, 11, 30)})) - - offset_cases.append((SemiMonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 6, 15), - datetime(2008, 12, 31): datetime(2008, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 15), - datetime(2006, 12, 30): datetime(2006, 12, 15), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(-1, day_of_month=4), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2007, 1, 4): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 6, 4), - datetime(2008, 12, 31): datetime(2008, 12, 4), - datetime(2006, 12, 5): datetime(2006, 12, 4), - datetime(2006, 12, 30): datetime(2006, 12, 4), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(-2), { - datetime(2007, 1, 1): datetime(2006, 12, 15), - datetime(2008, 6, 30): datetime(2008, 5, 31), - datetime(2008, 3, 15): datetime(2008, 2, 15), - datetime(2008, 12, 31): datetime(2008, 11, 30), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 14): datetime(2006, 11, 15), - datetime(2007, 1, 1): datetime(2006, 12, 15)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + SemiMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 20), + datetime(2006, 12, 14): datetime(2006, 12, 20), + 
datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 20), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 16): datetime(2008, 1, 31), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 16), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 16), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 11, 30), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 30): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2007, 1, 4): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 2, 15), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 14): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('case', offset_cases) + @pytest.mark.parametrize("case", offset_cases) def test_apply_index(self, case): offset, cases = case s = DatetimeIndex(cases.keys()) @@ -2862,21 +3593,28 @@ def test_apply_index(self, case): exp = DatetimeIndex(cases.values()) tm.assert_index_equal(result, exp) - on_offset_cases = [(datetime(2007, 12, 31), True), - (datetime(2007, 12, 15), True), - (datetime(2007, 12, 14), False), - (datetime(2007, 12, 1), False), - (datetime(2008, 2, 29), True)] + on_offset_cases = [ + (datetime(2007, 12, 31), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 1), False), + (datetime(2008, 2, 29), True), + ] - 
@pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): dt, expected = case assert_onOffset(SemiMonthEnd(), dt, expected) - @pytest.mark.parametrize('klass', [Series, DatetimeIndex]) + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding @@ -2884,13 +3622,23 @@ def test_vectorized_offset_addition(self, klass): result = s + SemiMonthEnd() result2 = SemiMonthEnd() + s - exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), - Timestamp('2000-02-01', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding @@ -2898,8 +3646,13 @@ def test_vectorized_offset_addition(self, klass): result = s + SemiMonthEnd() result2 = SemiMonthEnd() + s - exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -2910,31 +3663,33 @@ class TestSemiMonthBegin(Base): offset2 = _offset(2) def test_offset_whole_year(self): - dates = (datetime(2007, 12, 15), - datetime(2008, 1, 1), - datetime(2008, 1, 15), - datetime(2008, 2, 1), - datetime(2008, 2, 15), - datetime(2008, 3, 1), - datetime(2008, 3, 15), - datetime(2008, 4, 1), - datetime(2008, 4, 15), - datetime(2008, 5, 1), - datetime(2008, 5, 15), - datetime(2008, 6, 1), - datetime(2008, 6, 15), - datetime(2008, 7, 1), - datetime(2008, 7, 15), - datetime(2008, 8, 1), - datetime(2008, 8, 15), - datetime(2008, 9, 1), - datetime(2008, 9, 15), - datetime(2008, 10, 1), - datetime(2008, 10, 15), - datetime(2008, 11, 1), - datetime(2008, 11, 15), - datetime(2008, 12, 1), - datetime(2008, 12, 15)) + dates = ( + datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15), + ) for base, exp_date in zip(dates[:-1], dates[1:]): assert_offset_equal(SemiMonthBegin(), base, exp_date) @@ -2950,98 +3705,146 @@ def test_offset_whole_year(self): tm.assert_index_equal(result, exp) # 
ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq='SMS') + result = date_range(start=dates[0], end=dates[-1], freq="SMS") exp = DatetimeIndex(dates) tm.assert_index_equal(result, exp) offset_cases = [] - offset_cases.append((SemiMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 14): datetime(2006, 12, 15), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 1): datetime(2007, 1, 15), - datetime(2006, 12, 1): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(day_of_month=20), { - datetime(2008, 1, 1): datetime(2008, 1, 20), - datetime(2008, 1, 15): datetime(2008, 1, 20), - datetime(2008, 1, 21): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 14): datetime(2006, 12, 20), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 1): datetime(2007, 1, 20), - datetime(2006, 12, 1): datetime(2006, 12, 20), - datetime(2006, 12, 15): datetime(2006, 12, 20)})) - - offset_cases.append((SemiMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 16): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 1, 15), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 2): datetime(2006, 12, 15), - datetime(2007, 1, 1): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(0, day_of_month=16), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 16): datetime(2008, 1, 16), - datetime(2008, 1, 15): datetime(2008, 1, 16), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 5): datetime(2007, 1, 16), - datetime(2007, 1, 1): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 15), - datetime(2006, 12, 1): datetime(2007, 1, 1), - datetime(2006, 12, 29): datetime(2007, 1, 15), - datetime(2006, 12, 15): datetime(2007, 1, 15), - datetime(2007, 1, 1): datetime(2007, 2, 1), - datetime(2007, 1, 16): datetime(2007, 2, 15), - datetime(2006, 11, 1): datetime(2006, 12, 1)})) - - offset_cases.append((SemiMonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 15), - datetime(2008, 6, 30): datetime(2008, 6, 15), - datetime(2008, 6, 14): datetime(2008, 6, 1), - datetime(2008, 12, 31): datetime(2008, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 15)})) - - offset_cases.append((SemiMonthBegin(-1, day_of_month=4), { - datetime(2007, 1, 1): datetime(2006, 12, 4), - datetime(2007, 1, 4): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 6, 4), - datetime(2008, 12, 31): datetime(2008, 12, 4), - datetime(2006, 12, 5): datetime(2006, 12, 4), - datetime(2006, 12, 30): datetime(2006, 12, 4), - datetime(2006, 12, 2): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 4)})) - - offset_cases.append((SemiMonthBegin(-2), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 6, 30): datetime(2008, 6, 1), - datetime(2008, 6, 14): 
datetime(2008, 5, 15), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 12, 15): datetime(2006, 11, 15), - datetime(2007, 1, 1): datetime(2006, 12, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + SemiMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 2): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 5): datetime(2007, 1, 16), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 15): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 12, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 6, 14): datetime(2008, 6, 1), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 4), + datetime(2007, 1, 4): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2006, 12, 2): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 4), + }, + ) + ) + + offset_cases.append( + ( + 
SemiMonthBegin(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 1), + datetime(2008, 6, 14): datetime(2008, 5, 15), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 15): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('case', offset_cases) + @pytest.mark.parametrize("case", offset_cases) def test_apply_index(self, case): offset, cases = case s = DatetimeIndex(cases.keys()) @@ -3054,42 +3857,64 @@ def test_apply_index(self, case): exp = DatetimeIndex(cases.values()) tm.assert_index_equal(result, exp) - on_offset_cases = [(datetime(2007, 12, 1), True), - (datetime(2007, 12, 15), True), - (datetime(2007, 12, 14), False), - (datetime(2007, 12, 31), False), - (datetime(2008, 2, 15), True)] + on_offset_cases = [ + (datetime(2007, 12, 1), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 31), False), + (datetime(2008, 2, 15), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): dt, expected = case assert_onOffset(SemiMonthBegin(), dt, expected) - @pytest.mark.parametrize('klass', [Series, DatetimeIndex]) + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding # an integer array to PeriodIndex result = s + SemiMonthBegin() result2 = SemiMonthBegin() + s - exp = klass([Timestamp('2000-02-01 00:15:00', tz='US/Central'), - Timestamp('2000-03-01', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-02-01 00:15:00", tz="US/Central"), + Timestamp("2000-03-01", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), - Timestamp('2000-02-01', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding # an integer array to PeriodIndex result = s + SemiMonthBegin() result2 = SemiMonthBegin() + s - exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -3104,58 +3929,61 @@ def test_Easter(): assert_offset_equal(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)) assert_offset_equal(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)) - assert_offset_equal(-Easter(2), - datetime(2011, 1, 1), - datetime(2009, 4, 12)) + assert_offset_equal(-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)) assert_offset_equal(-Easter(), 
datetime(2010, 4, 4), datetime(2009, 4, 12)) - assert_offset_equal(-Easter(2), - datetime(2010, 4, 4), - datetime(2008, 3, 23)) + assert_offset_equal(-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)) class TestOffsetNames: - def test_get_offset_name(self): - assert BDay().freqstr == 'B' - assert BDay(2).freqstr == '2B' - assert BMonthEnd().freqstr == 'BM' - assert Week(weekday=0).freqstr == 'W-MON' - assert Week(weekday=1).freqstr == 'W-TUE' - assert Week(weekday=2).freqstr == 'W-WED' - assert Week(weekday=3).freqstr == 'W-THU' - assert Week(weekday=4).freqstr == 'W-FRI' + assert BDay().freqstr == "B" + assert BDay(2).freqstr == "2B" + assert BMonthEnd().freqstr == "BM" + assert Week(weekday=0).freqstr == "W-MON" + assert Week(weekday=1).freqstr == "W-TUE" + assert Week(weekday=2).freqstr == "W-WED" + assert Week(weekday=3).freqstr == "W-THU" + assert Week(weekday=4).freqstr == "W-FRI" assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN" def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('gibberish') + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('QS-JAN-B') + get_offset("QS-JAN-B") pairs = [ - ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), - ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), - ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)), - ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4))] + ("B", BDay()), + ("b", BDay()), + ("bm", BMonthEnd()), + ("Bm", BMonthEnd()), + ("W-MON", Week(weekday=0)), + ("W-TUE", Week(weekday=1)), + ("W-WED", Week(weekday=2)), + ("W-THU", Week(weekday=3)), + ("W-FRI", Week(weekday=4)), + ] for name, expected in pairs: offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( + name, + expected, + offset, + ) def test_get_offset_legacy(): - pairs = [('w@Sat', Week(weekday=5))] + pairs = [("w@Sat", Week(weekday=5))] for name, expected in pairs: with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): get_offset(name) class TestOffsetAliases: - def setup_method(self, method): _offset_map.clear() @@ -3166,32 +3994,44 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ['M', 'MS', 'BM', 'BMS', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] for k in lst: assert k == get_offset(k).rule_code # should be cached - this is kind of an internals test... 
assert k in _offset_map assert k == (get_offset(k) * 3).rule_code - suffix_lst = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - base = 'W' + suffix_lst = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + base = "W" for v in suffix_lst: - alias = '-'.join([base, v]) + alias = "-".join([base, v]) assert alias == get_offset(alias).rule_code assert alias == (get_offset(alias) * 5).rule_code - suffix_lst = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', - 'SEP', 'OCT', 'NOV', 'DEC'] - base_lst = ['A', 'AS', 'BA', 'BAS', 'Q', 'QS', 'BQ', 'BQS'] + suffix_lst = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: - alias = '-'.join([base, v]) + alias = "-".join([base, v]) assert alias == get_offset(alias).rule_code assert alias == (get_offset(alias) * 5).rule_code - lst = ['M', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + lst = ["M", "D", "B", "H", "T", "S", "L", "U"] for k in lst: - code, stride = get_freq_code('3' + k) + code, stride = get_freq_code("3" + k) assert isinstance(code, int) assert stride == 3 assert k == get_freq_str(code) @@ -3202,30 +4042,42 @@ def test_dateoffset_misc(): # it works oset.freqstr - assert (not offsets.DateOffset(months=2) == 2) + assert not offsets.DateOffset(months=2) == 2 def test_freq_offsets(): off = BDay(1, offset=timedelta(0, 1800)) - assert (off.freqstr == 'B+30Min') + assert off.freqstr == "B+30Min" off = BDay(1, offset=timedelta(0, -1800)) - assert (off.freqstr == 'B-30Min') + assert off.freqstr == "B-30Min" class TestReprNames: - def test_str_for_named_is_name(self): # look at all the amazing combinations! - month_prefixes = ['A', 'AS', 'BA', 'BAS', 'Q', 'BQ', 'BQS', 'QS'] - names = [prefix + '-' + month - for prefix in month_prefixes - for month in ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']] - days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - names += ['W-' + day for day in days] - names += ['WOM-' + week + day - for week in ('1', '2', '3', '4') for day in days] + month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + names = [ + prefix + "-" + month + for prefix in month_prefixes + for month in [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + ] + days = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + names += ["W-" + day for day in days] + names += ["WOM-" + week + day for week in ("1", "2", "3", "4") for day in days] _offset_map.clear() for name in names: offset = get_offset(name) @@ -3242,29 +4094,40 @@ class TestDST: """ test DateOffset additions over Daylight Savings Time """ + # one microsecond before the DST transition ts_pre_fallback = "2013-11-03 01:59:59.999999" ts_pre_springfwd = "2013-03-10 01:59:59.999999" # test both basic names and dateutil timezones timezone_utc_offsets = { - 'US/Eastern': dict(utc_offset_daylight=-4, - utc_offset_standard=-5, ), - 'dateutil/US/Pacific': dict(utc_offset_daylight=-7, - utc_offset_standard=-8, ) + "US/Eastern": dict(utc_offset_daylight=-4, utc_offset_standard=-5), + "dateutil/US/Pacific": dict(utc_offset_daylight=-7, utc_offset_standard=-8), } valid_date_offsets_singular = [ - 'weekday', 'day', 'hour', 'minute', 'second', 'microsecond' + "weekday", + "day", + "hour", + "minute", + "second", + "microsecond", ] valid_date_offsets_plural = [ - 'weeks', 'days', - 'hours', 'minutes', 'seconds', - 
'milliseconds', 'microseconds' + "weeks", + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", ] def _test_all_offsets(self, n, **kwds): - valid_offsets = self.valid_date_offsets_plural if n > 1 \ + valid_offsets = ( + self.valid_date_offsets_plural + if n > 1 else self.valid_date_offsets_singular + ) for name in valid_offsets: self._test_offset(offset_name=name, offset_n=n, **kwds) @@ -3276,90 +4139,101 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): if expected_utc_offset is not None: assert get_utc_offset_hours(t) == expected_utc_offset - if offset_name == 'weeks': + if offset_name == "weeks": # dates should match - assert t.date() == timedelta(days=7 * offset.kwds[ - 'weeks']) + tstart.date() + assert t.date() == timedelta(days=7 * offset.kwds["weeks"]) + tstart.date() # expect the same day of week, hour of day, minute, second, ... - assert (t.dayofweek == tstart.dayofweek and - t.hour == tstart.hour and - t.minute == tstart.minute and - t.second == tstart.second) - elif offset_name == 'days': + assert ( + t.dayofweek == tstart.dayofweek + and t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) + elif offset_name == "days": # dates should match - assert timedelta(offset.kwds['days']) + tstart.date() == t.date() + assert timedelta(offset.kwds["days"]) + tstart.date() == t.date() # expect the same hour of day, minute, second, ... - assert (t.hour == tstart.hour and - t.minute == tstart.minute and - t.second == tstart.second) + assert ( + t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) elif offset_name in self.valid_date_offsets_singular: # expect the singular offset value to match between tstart and t - datepart_offset = getattr(t, offset_name - if offset_name != 'weekday' else - 'dayofweek') + datepart_offset = getattr( + t, offset_name if offset_name != "weekday" else "dayofweek" + ) assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert (t == (tstart.tz_convert('UTC') + offset) - .tz_convert('US/Pacific')) + assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: - offset_string = '{hrs:02d}00'.format(hrs=hrs_offset) + offset_string = "{hrs:02d}00".format(hrs=hrs_offset) else: - offset_string = '-{hrs:02d}00'.format(hrs=-1 * hrs_offset) + offset_string = "-{hrs:02d}00".format(hrs=-1 * hrs_offset) return Timestamp(string + offset_string).tz_convert(tz) def test_springforward_plural(self): # test moving from standard to daylight savings for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - hrs_post = utc_offsets['utc_offset_daylight'] + hrs_pre = utc_offsets["utc_offset_standard"] + hrs_post = utc_offsets["utc_offset_daylight"] self._test_all_offsets( - n=3, tstart=self._make_timestamp(self.ts_pre_springfwd, - hrs_pre, tz), - expected_utc_offset=hrs_post) + n=3, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=hrs_post, + ) def test_fallback_singular(self): # in the case of singular offsets, we don't necessarily know which utc # offset the new Timestamp will wind up in (the tz for 1 month may be # different from 1 second) so we don't specify an expected_utc_offset for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - self._test_all_offsets(n=1, 
tstart=self._make_timestamp( - self.ts_pre_fallback, hrs_pre, tz), expected_utc_offset=None) + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), + expected_utc_offset=None, + ) def test_springforward_singular(self): for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - self._test_all_offsets(n=1, tstart=self._make_timestamp( - self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None) - - offset_classes = {MonthBegin: ['11/2/2012', '12/1/2012'], - MonthEnd: ['11/2/2012', '11/30/2012'], - BMonthBegin: ['11/2/2012', '12/3/2012'], - BMonthEnd: ['11/2/2012', '11/30/2012'], - CBMonthBegin: ['11/2/2012', '12/3/2012'], - CBMonthEnd: ['11/2/2012', '11/30/2012'], - SemiMonthBegin: ['11/2/2012', '11/15/2012'], - SemiMonthEnd: ['11/2/2012', '11/15/2012'], - Week: ['11/2/2012', '11/9/2012'], - YearBegin: ['11/2/2012', '1/1/2013'], - YearEnd: ['11/2/2012', '12/31/2012'], - BYearBegin: ['11/2/2012', '1/1/2013'], - BYearEnd: ['11/2/2012', '12/31/2012'], - QuarterBegin: ['11/2/2012', '12/1/2012'], - QuarterEnd: ['11/2/2012', '12/31/2012'], - BQuarterBegin: ['11/2/2012', '12/3/2012'], - BQuarterEnd: ['11/2/2012', '12/31/2012'], - Day: ['11/4/2012', '11/4/2012 23:00']}.items() - - @pytest.mark.parametrize('tup', offset_classes) + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=None, + ) + + offset_classes = { + MonthBegin: ["11/2/2012", "12/1/2012"], + MonthEnd: ["11/2/2012", "11/30/2012"], + BMonthBegin: ["11/2/2012", "12/3/2012"], + BMonthEnd: ["11/2/2012", "11/30/2012"], + CBMonthBegin: ["11/2/2012", "12/3/2012"], + CBMonthEnd: ["11/2/2012", "11/30/2012"], + SemiMonthBegin: ["11/2/2012", "11/15/2012"], + SemiMonthEnd: ["11/2/2012", "11/15/2012"], + Week: ["11/2/2012", "11/9/2012"], + YearBegin: ["11/2/2012", "1/1/2013"], + YearEnd: ["11/2/2012", "12/31/2012"], + BYearBegin: ["11/2/2012", "1/1/2013"], + BYearEnd: ["11/2/2012", "12/31/2012"], + QuarterBegin: ["11/2/2012", "12/1/2012"], + QuarterEnd: ["11/2/2012", "12/31/2012"], + BQuarterBegin: ["11/2/2012", "12/3/2012"], + BQuarterEnd: ["11/2/2012", "12/31/2012"], + Day: ["11/4/2012", "11/4/2012 23:00"], + }.items() + + @pytest.mark.parametrize("tup", offset_classes) def test_all_offset_classes(self, tup): offset, test_values = tup - first = Timestamp(test_values[0], tz='US/Eastern') + offset() - second = Timestamp(test_values[1], tz='US/Eastern') + first = Timestamp(test_values[0], tz="US/Eastern") + offset() + second = Timestamp(test_values[1], tz="US/Eastern") assert first == second @@ -3379,7 +4253,7 @@ def test_valid_default_arguments(offset_types): cls() -@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_month_attributes(kwd, month_classes): # GH#18226 cls = month_classes @@ -3388,14 +4262,14 @@ def test_valid_month_attributes(kwd, month_classes): cls(**{kwd: 3}) -@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_relativedelta_kwargs(kwd): # Check that all the arguments specified in liboffsets.relativedelta_kwds # are in fact valid relativedelta keyword args DateOffset(**{kwd: 1}) -@pytest.mark.parametrize('kwd', 
sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_tick_attributes(kwd, tick_classes): # GH#18226 cls = tick_classes @@ -3406,7 +4280,7 @@ def test_valid_tick_attributes(kwd, tick_classes): def test_validate_n_error(): with pytest.raises(TypeError): - DateOffset(n='Doh!') + DateOffset(n="Doh!") with pytest.raises(TypeError): MonthBegin(n=timedelta(1)) @@ -3433,14 +4307,14 @@ def test_weeks_onoffset(): # GH#18510 Week with weekday = None, normalize = False should always # be onOffset offset = Week(n=2, weekday=None) - ts = Timestamp('1862-01-13 09:03:34.873477378+0210', tz='Africa/Lusaka') + ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = Week(n=2, weekday=None) - ts = Timestamp('1856-10-24 16:18:36.556360110-0717', tz='Pacific/Easter') + ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -3450,14 +4324,14 @@ def test_weekofmonth_onoffset(): # GH#18864 # Make sure that nanoseconds don't trip up onOffset (and with it apply) offset = WeekOfMonth(n=2, week=2, weekday=0) - ts = Timestamp('1916-05-15 01:14:49.583410462+0422', tz='Asia/Qyzylorda') + ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = WeekOfMonth(n=-3, week=1, weekday=0) - ts = Timestamp('1980-12-08 03:38:52.878321185+0500', tz='Asia/Oral') + ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -3466,16 +4340,14 @@ def test_weekofmonth_onoffset(): def test_last_week_of_month_on_offset(): # GH#19036, GH#18977 _adjust_dst was incorrect for LastWeekOfMonth offset = LastWeekOfMonth(n=4, weekday=6) - ts = Timestamp('1917-05-27 20:55:27.084284178+0200', - tz='Europe/Warsaw') + ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow # negative n offset = LastWeekOfMonth(n=-4, weekday=5) - ts = Timestamp('2005-08-27 05:01:42.799392561-0500', - tz='America/Rainy_River') + ts = Timestamp("2005-08-27 05:01:42.799392561-0500", tz="America/Rainy_River") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 271f4ceef5f49a..880ff1f1375200 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -17,42 +17,67 @@ import pandas as pd from pandas.tseries.offsets import ( - BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd, - MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd) + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) # ---------------------------------------------------------------- # Helpers for generating random data with warnings.catch_warnings(): - warnings.simplefilter('ignore') - min_dt = pd.Timestamp(1900, 1, 1).to_pydatetime(), - max_dt = pd.Timestamp(1900, 1, 1).to_pydatetime(), + 
warnings.simplefilter("ignore") + min_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) + max_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) gen_date_range = st.builds( pd.date_range, start=st.datetimes( # TODO: Choose the min/max values more systematically min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), - max_value=pd.Timestamp(2100, 1, 1).to_pydatetime() + max_value=pd.Timestamp(2100, 1, 1).to_pydatetime(), ), periods=st.integers(min_value=2, max_value=100), - freq=st.sampled_from('Y Q M D H T s ms us ns'.split()), + freq=st.sampled_from("Y Q M D H T s ms us ns".split()), tz=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), ) gen_random_datetime = st.datetimes( min_value=min_dt, max_value=max_dt, - timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()) + timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), ) # The strategy for each type is registered in conftest.py, as they don't carry # enough runtime information (e.g. type hints) to infer how to build them. -gen_yqm_offset = st.one_of(*map(st.from_type, [ - MonthBegin, MonthEnd, BMonthBegin, BMonthEnd, - QuarterBegin, QuarterEnd, BQuarterBegin, BQuarterEnd, - YearBegin, YearEnd, BYearBegin, BYearEnd -])) +gen_yqm_offset = st.one_of( + *map( + st.from_type, + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], + ) +) # ---------------------------------------------------------------- @@ -60,7 +85,7 @@ # Based on CI runs: Always passes on OSX, fails on Linux, sometimes on Windows -@pytest.mark.xfail(strict=False, reason='inconsistent between OSs, Pythons') +@pytest.mark.xfail(strict=False, reason="inconsistent between OSs, Pythons") @given(gen_random_datetime, gen_yqm_offset) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) @@ -71,10 +96,12 @@ def test_on_offset_implementations(dt, offset): assert offset.onOffset(dt) == (compare == dt) -@pytest.mark.xfail(reason="res_v2 below is incorrect, needs to use the " - "commented-out version with tz_localize. " - "But with that fix in place, hypothesis then " - "has errors in timezone generation.") +@pytest.mark.xfail( + reason="res_v2 below is incorrect, needs to use the " + "commented-out version with tz_localize. " + "But with that fix in place, hypothesis then " + "has errors in timezone generation." 
+) @given(gen_yqm_offset, gen_date_range) def test_apply_index_implementations(offset, rng): # offset.apply_index(dti)[i] should match dti[i] + offset @@ -103,8 +130,9 @@ def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization # Note that dti includes a transition across DST boundary - dti = pd.date_range(start='2017-10-30 12:00:00', end='2017-11-06', - freq='D', tz='US/Eastern') + dti = pd.date_range( + start="2017-10-30 12:00:00", end="2017-11-06", freq="D", tz="US/Eastern" + ) assert (dti.hour == 12).all() # we haven't screwed up yet res = dti + offset diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index f2f6aed097d0ce..98a3631c8e63a9 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -27,21 +27,21 @@ def test_apply_ticks(): result = offsets.Hour(3).apply(offsets.Hour(4)) exp = offsets.Hour(7) - assert (result == exp) + assert result == exp def test_delta_to_tick(): delta = timedelta(3) tick = offsets._delta_to_tick(delta) - assert (tick == offsets.Day(3)) + assert tick == offsets.Day(3) td = Timedelta(nanoseconds=5) tick = offsets._delta_to_tick(td) assert tick == Nano(5) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) @settings(deadline=None) # GH 24641 @example(n=2, m=3) @example(n=800, m=300) @@ -62,7 +62,7 @@ def test_tick_add_sub(cls, n, m): assert left - right == expected -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) @settings(deadline=None) @example(n=2, m=3) @given(n=st.integers(-999, 999), m=st.integers(-999, 999)) @@ -86,14 +86,10 @@ def test_tick_equality(cls, n, m): def test_Hour(): - assert_offset_equal(Hour(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) - assert_offset_equal(Hour(-1), - datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) - assert_offset_equal(2 * Hour(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) - assert_offset_equal(-1 * Hour(), - datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assert_offset_equal(Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) + assert_offset_equal(Hour(-1), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) + assert_offset_equal(-1 * Hour(), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) assert Hour(3) + Hour(2) == Hour(5) assert Hour(3) - Hour(2) == Hour() @@ -102,14 +98,10 @@ def test_Hour(): def test_Minute(): - assert_offset_equal(Minute(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) - assert_offset_equal(Minute(-1), - datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) - assert_offset_equal(2 * Minute(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) - assert_offset_equal(-1 * Minute(), - datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal(Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) + assert_offset_equal(Minute(-1), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) + assert_offset_equal(-1 * Minute(), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) assert Minute(3) + Minute(2) == Minute(5) assert Minute(3) - Minute(2) == Minute() @@ -117,67 +109,61 @@ def test_Minute(): def test_Second(): - assert_offset_equal(Second(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 1)) - 
assert_offset_equal(Second(-1), - datetime(2010, 1, 1, 0, 0, 1), - datetime(2010, 1, 1)) - assert_offset_equal(2 * Second(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 2)) - assert_offset_equal(-1 * Second(), - datetime(2010, 1, 1, 0, 0, 1), - datetime(2010, 1, 1)) + assert_offset_equal(Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 1)) + assert_offset_equal(Second(-1), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal( + 2 * Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 2) + ) + assert_offset_equal( + -1 * Second(), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1) + ) assert Second(3) + Second(2) == Second(5) assert Second(3) - Second(2) == Second() def test_Millisecond(): - assert_offset_equal(Milli(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 1000)) - assert_offset_equal(Milli(-1), - datetime(2010, 1, 1, 0, 0, 0, 1000), - datetime(2010, 1, 1)) - assert_offset_equal(Milli(2), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2000)) - assert_offset_equal(2 * Milli(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2000)) - assert_offset_equal(-1 * Milli(), - datetime(2010, 1, 1, 0, 0, 0, 1000), - datetime(2010, 1, 1)) + assert_offset_equal( + Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1000) + ) + assert_offset_equal( + Milli(-1), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) + assert_offset_equal( + Milli(2), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + 2 * Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + -1 * Milli(), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) assert Milli(3) + Milli(2) == Milli(5) assert Milli(3) - Milli(2) == Milli() def test_MillisecondTimestampArithmetic(): - assert_offset_equal(Milli(), - Timestamp('2010-01-01'), - Timestamp('2010-01-01 00:00:00.001')) - assert_offset_equal(Milli(-1), - Timestamp('2010-01-01 00:00:00.001'), - Timestamp('2010-01-01')) + assert_offset_equal( + Milli(), Timestamp("2010-01-01"), Timestamp("2010-01-01 00:00:00.001") + ) + assert_offset_equal( + Milli(-1), Timestamp("2010-01-01 00:00:00.001"), Timestamp("2010-01-01") + ) def test_Microsecond(): - assert_offset_equal(Micro(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 1)) - assert_offset_equal(Micro(-1), - datetime(2010, 1, 1, 0, 0, 0, 1), - datetime(2010, 1, 1)) - - assert_offset_equal(2 * Micro(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2)) - assert_offset_equal(-1 * Micro(), - datetime(2010, 1, 1, 0, 0, 0, 1), - datetime(2010, 1, 1)) + assert_offset_equal(Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1)) + assert_offset_equal( + Micro(-1), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) + + assert_offset_equal( + 2 * Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2) + ) + assert_offset_equal( + -1 * Micro(), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) assert Micro(3) + Micro(2) == Micro(5) assert Micro(3) - Micro(2) == Micro() @@ -196,18 +182,10 @@ def test_NanosecondGeneric(): def test_Nanosecond(): timestamp = Timestamp(datetime(2010, 1, 1)) - assert_offset_equal(Nano(), - timestamp, - timestamp + np.timedelta64(1, 'ns')) - assert_offset_equal(Nano(-1), - timestamp + np.timedelta64(1, 'ns'), - timestamp) - assert_offset_equal(2 * Nano(), - timestamp, - timestamp + np.timedelta64(2, 'ns')) - assert_offset_equal(-1 * Nano(), - timestamp + np.timedelta64(1, 'ns'), - timestamp) 
+ assert_offset_equal(Nano(), timestamp, timestamp + np.timedelta64(1, "ns")) + assert_offset_equal(Nano(-1), timestamp + np.timedelta64(1, "ns"), timestamp) + assert_offset_equal(2 * Nano(), timestamp, timestamp + np.timedelta64(2, "ns")) + assert_offset_equal(-1 * Nano(), timestamp + np.timedelta64(1, "ns"), timestamp) assert Nano(3) + Nano(2) == Nano(5) assert Nano(3) - Nano(2) == Nano() @@ -218,13 +196,17 @@ def test_Nanosecond(): assert Micro(5) + Nano(1) == Nano(5001) -@pytest.mark.parametrize('kls, expected', - [(Hour, Timedelta(hours=5)), - (Minute, Timedelta(hours=2, minutes=3)), - (Second, Timedelta(hours=2, seconds=3)), - (Milli, Timedelta(hours=2, milliseconds=3)), - (Micro, Timedelta(hours=2, microseconds=3)), - (Nano, Timedelta(hours=2, nanoseconds=3))]) +@pytest.mark.parametrize( + "kls, expected", + [ + (Hour, Timedelta(hours=5)), + (Minute, Timedelta(hours=2, minutes=3)), + (Second, Timedelta(hours=2, seconds=3)), + (Milli, Timedelta(hours=2, milliseconds=3)), + (Micro, Timedelta(hours=2, microseconds=3)), + (Nano, Timedelta(hours=2, nanoseconds=3)), + ], +) def test_tick_addition(kls, expected): offset = kls(3) result = offset + Timedelta(hours=2) @@ -232,7 +214,7 @@ def test_tick_addition(kls, expected): assert result == expected -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) @@ -254,13 +236,13 @@ def test_tick_division(cls): if cls._inc < Timedelta(seconds=1): # Case where we end up with a bigger class - result = off / .001 + result = off / 0.001 assert isinstance(result, offsets.Tick) assert not isinstance(result, cls) - assert result.delta == off.delta / .001 + assert result.delta == off.delta / 0.001 -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_rdiv(cls): off = cls(10) delta = off.delta @@ -278,12 +260,12 @@ def test_tick_rdiv(cls): assert (delta.to_pytimedelta() * 2) / off == 2 result = np.array([2 * td64, td64]) / off - expected = np.array([2., 1.]) + expected = np.array([2.0, 1.0]) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize('cls1', tick_classes) -@pytest.mark.parametrize('cls2', tick_classes) +@pytest.mark.parametrize("cls1", tick_classes) +@pytest.mark.parametrize("cls2", tick_classes) def test_tick_zero(cls1, cls2): assert cls1(0) == cls2(0) assert cls1(0) + cls2(0) == cls1(0) @@ -295,17 +277,17 @@ def test_tick_zero(cls1, cls2): assert cls1(2) + Nano(0) == cls1(2) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_equalities(cls): assert cls() == cls(1) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): assert not cls().isAnchored() -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_compare_ticks(cls): three = cls(3) four = cls(4) @@ -318,7 +300,7 @@ def test_compare_ticks(cls): assert cls(3) != cls(4) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_compare_ticks_to_strs(cls): # GH#23524 off = cls(19) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index bcfe997583b023..12a524d82fcf5a 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -9,8 +9,19 @@ from pandas import Timestamp from pandas.tseries.offsets import ( - 
BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd, - MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd) + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) from .common import assert_offset_equal, assert_onOffset from .test_offsets import Base @@ -26,19 +37,30 @@ def test_quarterly_dont_normalize(): for klass in offsets: result = date + klass() - assert (result.time() == date.time()) - - -@pytest.mark.parametrize('n', [-2, 1]) -@pytest.mark.parametrize('cls', [MonthBegin, MonthEnd, - BMonthBegin, BMonthEnd, - QuarterBegin, QuarterEnd, - BQuarterBegin, BQuarterEnd, - YearBegin, YearEnd, - BYearBegin, BYearEnd]) + assert result.time() == date.time() + + +@pytest.mark.parametrize("n", [-2, 1]) +@pytest.mark.parametrize( + "cls", + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], +) def test_apply_index(cls, n): offset = cls(n=n) - rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') + rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") ser = pd.Series(rng) res = rng + offset @@ -52,12 +74,16 @@ def test_apply_index(cls, n): assert res2.iloc[-1] == ser.iloc[-1] + offset -@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(), - BQuarterBegin(), BQuarterEnd()]) +@pytest.mark.parametrize( + "offset", [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()] +) def test_on_offset(offset): - dates = [datetime(2016, m, d) - for m in [10, 11, 12] - for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)] + dates = [ + datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31) + ] for date in dates: res = offset.onOffset(date) slow_version = date == (date + offset) - offset @@ -67,41 +93,66 @@ def test_on_offset(offset): # -------------------------------------------------------------------- # Months + class TestMonthBegin(Base): _offset = MonthBegin offset_cases = [] # NOTE: I'm not entirely happy with the logic here for Begin -ss # see thread 'offset conventions' on the ML - offset_cases.append((MonthBegin(), { - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 2, 1): datetime(2008, 3, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 12, 1): datetime(2007, 1, 1), - datetime(2007, 1, 31): datetime(2007, 2, 1)})) - - offset_cases.append((MonthBegin(0), { - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2006, 12, 3): datetime(2007, 1, 1), - datetime(2007, 1, 31): datetime(2007, 2, 1)})) - - offset_cases.append((MonthBegin(2), { - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 1, 31): datetime(2008, 3, 1), - datetime(2006, 12, 31): datetime(2007, 2, 1), - datetime(2007, 12, 28): datetime(2008, 2, 1), - datetime(2007, 1, 1): datetime(2007, 3, 1), - datetime(2006, 11, 1): datetime(2007, 1, 1)})) - - offset_cases.append((MonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 5, 31): datetime(2008, 5, 1), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 1, 2): datetime(2006, 1, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + MonthBegin(), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + 
datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(0), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(2), + { + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -129,47 +180,73 @@ def test_normalize(self): assert result == expected offset_cases = [] - offset_cases.append((MonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2006, 12, 1): datetime(2006, 12, 31)})) - - offset_cases.append((MonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31)})) - - offset_cases.append((MonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 3, 31), - datetime(2006, 12, 29): datetime(2007, 1, 31), - datetime(2006, 12, 31): datetime(2007, 2, 28), - datetime(2007, 1, 1): datetime(2007, 2, 28), - datetime(2006, 11, 1): datetime(2006, 12, 31)})) - - offset_cases.append((MonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 5, 31), - datetime(2008, 12, 31): datetime(2008, 11, 30), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 30): datetime(2006, 11, 30), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + MonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + 
datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(MonthEnd(), datetime(2007, 12, 31), True), - (MonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -185,53 +262,79 @@ def test_offsets_compare_equal(self): assert not offset1 != offset2 offset_cases = [] - offset_cases.append((BMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 1): datetime(2006, 10, 2), - datetime(2007, 1, 1): datetime(2007, 2, 1), - datetime(2006, 12, 1): datetime(2007, 1, 1)})) - - offset_cases.append((BMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2006, 10, 2): datetime(2006, 10, 2), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 15): datetime(2006, 10, 2)})) - - offset_cases.append((BMonthBegin(2), { - datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 1, 15): datetime(2008, 3, 3), - datetime(2006, 12, 29): datetime(2007, 2, 1), - datetime(2006, 12, 31): datetime(2007, 2, 1), - datetime(2007, 1, 1): datetime(2007, 3, 1), - datetime(2006, 11, 1): datetime(2007, 1, 1)})) - - offset_cases.append((BMonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 6, 30): datetime(2008, 6, 2), - datetime(2008, 6, 1): datetime(2008, 5, 1), - datetime(2008, 3, 10): datetime(2008, 3, 3), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 12, 30): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): 
datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BMonthBegin(), datetime(2007, 12, 31), False), - (BMonthBegin(), datetime(2008, 1, 1), True), - (BMonthBegin(), datetime(2001, 4, 2), True), - (BMonthBegin(), datetime(2008, 3, 3), True)] + on_offset_cases = [ + (BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -254,57 +357,83 @@ def test_offsets_compare_equal(self): assert not offset1 != offset2 offset_cases = [] - offset_cases.append((BMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2007, 1, 31), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2006, 12, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 29), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31)})) - - offset_cases.append((BMonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 3, 31), - datetime(2006, 12, 29): datetime(2007, 2, 28), - datetime(2006, 12, 31): datetime(2007, 2, 28), - datetime(2007, 1, 1): datetime(2007, 2, 28), - datetime(2006, 11, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BMonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2008, 5, 30), - datetime(2008, 12, 31): datetime(2008, 11, 28), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 
31), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BMonthEnd(), datetime(2007, 12, 31), True), - (BMonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) + # -------------------------------------------------------------------- # Quarters class TestQuarterBegin(Base): - def test_repr(self): expected = "" assert repr(QuarterBegin()) == expected @@ -324,58 +453,88 @@ def test_offset_corner_case(self): assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) offset_cases = [] - offset_cases.append((QuarterBegin(startingMonth=1), { - datetime(2007, 12, 1): datetime(2008, 1, 1), - datetime(2008, 1, 1): datetime(2008, 4, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2008, 4, 1): datetime(2008, 7, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 2, 29): datetime(2008, 5, 1), - datetime(2008, 3, 15): datetime(2008, 5, 1), - datetime(2008, 3, 31): datetime(2008, 5, 1), - datetime(2008, 4, 15): datetime(2008, 5, 1), - datetime(2008, 4, 30): datetime(2008, 5, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 12, 1): datetime(2009, 1, 1), - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2008, 4, 30): datetime(2008, 7, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 1), - datetime(2008, 1, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 1, 1), - datetime(2008, 2, 29): datetime(2008, 1, 1), - datetime(2008, 3, 15): datetime(2008, 1, 1), - datetime(2008, 3, 31): datetime(2008, 1, 1), - datetime(2008, 4, 15): datetime(2008, 4, 1), - datetime(2008, 4, 30): datetime(2008, 4, 1), - datetime(2008, 7, 1): datetime(2008, 4, 1)})) - - 
offset_cases.append((QuarterBegin(startingMonth=1, n=2), { - datetime(2008, 1, 1): datetime(2008, 7, 1), - datetime(2008, 2, 15): datetime(2008, 7, 1), - datetime(2008, 2, 29): datetime(2008, 7, 1), - datetime(2008, 3, 15): datetime(2008, 7, 1), - datetime(2008, 3, 31): datetime(2008, 7, 1), - datetime(2008, 4, 15): datetime(2008, 10, 1), - datetime(2008, 4, 1): datetime(2008, 10, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + QuarterBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -404,57 +563,87 @@ def test_offset_corner_case(self): assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) offset_cases = [] - offset_cases.append((QuarterEnd(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 4, 30), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 7, 31)})) - - offset_cases.append((QuarterEnd(startingMonth=2), { - 
datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2008, 2, 15): datetime(2008, 2, 29), - datetime(2008, 2, 29): datetime(2008, 5, 31), - datetime(2008, 3, 15): datetime(2008, 5, 31), - datetime(2008, 3, 31): datetime(2008, 5, 31), - datetime(2008, 4, 15): datetime(2008, 5, 31), - datetime(2008, 4, 30): datetime(2008, 5, 31)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 4, 30)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 31), - datetime(2008, 1, 31): datetime(2007, 10, 31), - datetime(2008, 2, 15): datetime(2008, 1, 31), - datetime(2008, 2, 29): datetime(2008, 1, 31), - datetime(2008, 3, 15): datetime(2008, 1, 31), - datetime(2008, 3, 31): datetime(2008, 1, 31), - datetime(2008, 4, 15): datetime(2008, 1, 31), - datetime(2008, 4, 30): datetime(2008, 1, 31), - datetime(2008, 7, 1): datetime(2008, 4, 30)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=2), { - datetime(2008, 1, 31): datetime(2008, 7, 31), - datetime(2008, 2, 15): datetime(2008, 7, 31), - datetime(2008, 2, 29): datetime(2008, 7, 31), - datetime(2008, 3, 15): datetime(2008, 7, 31), - datetime(2008, 3, 31): datetime(2008, 7, 31), - datetime(2008, 4, 15): datetime(2008, 7, 31), - datetime(2008, 4, 30): datetime(2008, 10, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + QuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): 
datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -490,9 +679,10 @@ def test_offset(self, case): (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False), (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False), - (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)] + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -520,73 +710,103 @@ def test_offset_corner_case(self): assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2) offset_cases = [] - offset_cases.append((BQuarterBegin(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 4, 1), - datetime(2008, 1, 31): datetime(2008, 4, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2007, 3, 15): datetime(2007, 4, 2), - datetime(2007, 2, 28): datetime(2007, 4, 2), - datetime(2007, 1, 1): datetime(2007, 4, 2), - datetime(2007, 4, 15): datetime(2007, 7, 2), - datetime(2007, 7, 1): datetime(2007, 7, 2), - datetime(2007, 4, 1): datetime(2007, 4, 2), - datetime(2007, 4, 2): datetime(2007, 7, 2), - datetime(2008, 4, 30): datetime(2008, 7, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 2, 29): datetime(2008, 5, 1), - datetime(2008, 3, 15): datetime(2008, 5, 1), - datetime(2008, 3, 31): datetime(2008, 5, 1), - datetime(2008, 4, 15): datetime(2008, 5, 1), - datetime(2008, 8, 15): datetime(2008, 11, 3), - datetime(2008, 9, 15): datetime(2008, 11, 3), - datetime(2008, 11, 1): datetime(2008, 11, 3), - datetime(2008, 4, 30): datetime(2008, 5, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2007, 12, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 1, 15): datetime(2008, 4, 1), - datetime(2008, 2, 27): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2007, 4, 1): datetime(2007, 4, 2), - datetime(2007, 4, 2): datetime(2007, 4, 2), - datetime(2007, 7, 1): datetime(2007, 7, 2), - datetime(2007, 4, 15): datetime(2007, 7, 2), - datetime(2007, 7, 2): datetime(2007, 7, 2)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 1), - datetime(2008, 1, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): 
datetime(2008, 1, 1), - datetime(2008, 2, 29): datetime(2008, 1, 1), - datetime(2008, 3, 15): datetime(2008, 1, 1), - datetime(2008, 3, 31): datetime(2008, 1, 1), - datetime(2008, 4, 15): datetime(2008, 4, 1), - datetime(2007, 7, 3): datetime(2007, 7, 2), - datetime(2007, 4, 3): datetime(2007, 4, 2), - datetime(2007, 7, 2): datetime(2007, 4, 2), - datetime(2008, 4, 1): datetime(2008, 1, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=2), { - datetime(2008, 1, 1): datetime(2008, 7, 1), - datetime(2008, 1, 15): datetime(2008, 7, 1), - datetime(2008, 2, 29): datetime(2008, 7, 1), - datetime(2008, 3, 15): datetime(2008, 7, 1), - datetime(2007, 3, 31): datetime(2007, 7, 2), - datetime(2007, 4, 15): datetime(2007, 10, 1), - datetime(2008, 4, 30): datetime(2008, 10, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BQuarterBegin(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 
1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -615,56 +835,86 @@ def test_offset_corner_case(self): assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29) offset_cases = [] - offset_cases.append((BQuarterEnd(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 4, 30), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 7, 31)})) - - offset_cases.append((BQuarterEnd(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2008, 2, 15): datetime(2008, 2, 29), - datetime(2008, 2, 29): datetime(2008, 5, 30), - datetime(2008, 3, 15): datetime(2008, 5, 30), - datetime(2008, 3, 31): datetime(2008, 5, 30), - datetime(2008, 4, 15): datetime(2008, 5, 30), - datetime(2008, 4, 30): datetime(2008, 5, 30)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 4, 30)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 31), - datetime(2008, 1, 31): datetime(2007, 10, 31), - datetime(2008, 2, 15): datetime(2008, 1, 31), - datetime(2008, 2, 29): datetime(2008, 1, 31), - datetime(2008, 3, 15): datetime(2008, 1, 31), - datetime(2008, 3, 31): datetime(2008, 1, 31), - datetime(2008, 4, 15): datetime(2008, 1, 31), - datetime(2008, 4, 30): datetime(2008, 1, 31)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=2), { - datetime(2008, 1, 31): datetime(2008, 7, 31), - datetime(2008, 2, 15): datetime(2008, 7, 31), - datetime(2008, 2, 29): datetime(2008, 7, 31), - datetime(2008, 3, 15): datetime(2008, 7, 31), - datetime(2008, 3, 31): datetime(2008, 7, 31), - datetime(2008, 4, 15): datetime(2008, 7, 31), - datetime(2008, 4, 30): datetime(2008, 10, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BQuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + 
datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -697,13 +947,15 @@ def test_offset(self, case): (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), - (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)] + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) + # -------------------------------------------------------------------- # Years @@ -716,85 +968,147 @@ def test_misspecified(self): YearBegin(month=13) offset_cases = [] - offset_cases.append((YearBegin(), { - datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(3), { - datetime(2008, 1, 1): datetime(2011, 1, 1), - datetime(2008, 6, 30): datetime(2011, 1, 1), - datetime(2008, 12, 31): datetime(2011, 1, 1), - datetime(2005, 12, 30): datetime(2008, 1, 1), - datetime(2005, 12, 31): datetime(2008, 1, 1)})) - - offset_cases.append((YearBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 1, 1), - datetime(2007, 1, 15): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): 
datetime(2006, 1, 1), - datetime(2006, 12, 30): datetime(2006, 1, 1), - datetime(2007, 1, 1): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(-2), { - datetime(2007, 1, 1): datetime(2005, 1, 1), - datetime(2008, 6, 30): datetime(2007, 1, 1), - datetime(2008, 12, 31): datetime(2007, 1, 1)})) - - offset_cases.append((YearBegin(month=4), { - datetime(2007, 4, 1): datetime(2008, 4, 1), - datetime(2007, 4, 15): datetime(2008, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1)})) - - offset_cases.append((YearBegin(0, month=4), { - datetime(2007, 4, 1): datetime(2007, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1)})) - - offset_cases.append((YearBegin(4, month=4), { - datetime(2007, 4, 1): datetime(2011, 4, 1), - datetime(2007, 4, 15): datetime(2011, 4, 1), - datetime(2007, 3, 1): datetime(2010, 4, 1), - datetime(2007, 12, 15): datetime(2011, 4, 1), - datetime(2012, 1, 31): datetime(2015, 4, 1)})) - - offset_cases.append((YearBegin(-1, month=4), { - datetime(2007, 4, 1): datetime(2006, 4, 1), - datetime(2007, 3, 1): datetime(2006, 4, 1), - datetime(2007, 12, 15): datetime(2007, 4, 1), - datetime(2012, 1, 31): datetime(2011, 4, 1)})) - - offset_cases.append((YearBegin(-3, month=4), { - datetime(2007, 4, 1): datetime(2004, 4, 1), - datetime(2007, 3, 1): datetime(2004, 4, 1), - datetime(2007, 12, 15): datetime(2005, 4, 1), - datetime(2012, 1, 31): datetime(2009, 4, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(3), + { + datetime(2008, 1, 1): datetime(2011, 1, 1), + datetime(2008, 6, 30): datetime(2011, 1, 1), + datetime(2008, 12, 31): datetime(2011, 1, 1), + datetime(2005, 12, 30): datetime(2008, 1, 1), + datetime(2005, 12, 31): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(month=4), + { + datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0, month=4), + { + datetime(2007, 4, 1): datetime(2007, 4, 1), + 
datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(4, month=4), + { + datetime(2007, 4, 1): datetime(2011, 4, 1), + datetime(2007, 4, 15): datetime(2011, 4, 1), + datetime(2007, 3, 1): datetime(2010, 4, 1), + datetime(2007, 12, 15): datetime(2011, 4, 1), + datetime(2012, 1, 31): datetime(2015, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1, month=4), + { + datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-3, month=4), + { + datetime(2007, 4, 1): datetime(2004, 4, 1), + datetime(2007, 3, 1): datetime(2004, 4, 1), + datetime(2007, 12, 15): datetime(2005, 4, 1), + datetime(2012, 1, 31): datetime(2009, 4, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearBegin(), datetime(2007, 1, 3), False), - (YearBegin(), datetime(2008, 1, 1), True), - (YearBegin(), datetime(2006, 12, 31), False), - (YearBegin(), datetime(2006, 1, 2), False)] + on_offset_cases = [ + (YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -808,44 +1122,70 @@ def test_misspecified(self): YearEnd(month=13) offset_cases = [] - offset_cases.append((YearEnd(), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 31)})) - - offset_cases.append((YearEnd(0), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31)})) - - offset_cases.append((YearEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 31), - datetime(2006, 12, 30): datetime(2005, 12, 31), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((YearEnd(-2), { - datetime(2007, 1, 1): datetime(2005, 12, 31), - datetime(2008, 6, 30): datetime(2006, 12, 31), - datetime(2008, 12, 31): datetime(2006, 12, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + 
YearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), - (YearEnd(), datetime(2008, 1, 1), False), - (YearEnd(), datetime(2006, 12, 31), True), - (YearEnd(), datetime(2006, 12, 29), False)] + on_offset_cases = [ + (YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -853,45 +1193,71 @@ def test_onOffset(self, case): class TestYearEndDiffMonth(Base): offset_cases = [] - offset_cases.append((YearEnd(month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 15): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2009, 3, 31), - datetime(2008, 3, 30): datetime(2008, 3, 31), - datetime(2005, 3, 31): datetime(2006, 3, 31), - datetime(2006, 7, 30): datetime(2007, 3, 31)})) - - offset_cases.append((YearEnd(0, month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 28): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2008, 3, 31), - datetime(2005, 3, 30): datetime(2005, 3, 31)})) - - offset_cases.append((YearEnd(-1, month=3), - {datetime(2007, 1, 1): datetime(2006, 3, 31), - datetime(2008, 2, 28): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2007, 3, 31), - datetime(2006, 3, 29): datetime(2005, 3, 31), - datetime(2006, 3, 30): datetime(2005, 3, 31), - datetime(2007, 3, 1): datetime(2006, 3, 31)})) - - offset_cases.append((YearEnd(-2, month=3), - {datetime(2007, 1, 1): datetime(2005, 3, 31), - datetime(2008, 6, 30): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2006, 3, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearEnd(month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0, month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-1, month=3), + { + datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31), + }, + ) + ) + + 
offset_cases.append( + ( + YearEnd(-2, month=3), + { + datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), - (YearEnd(month=3), datetime(2008, 1, 1), False), - (YearEnd(month=3), datetime(2006, 3, 31), True), - (YearEnd(month=3), datetime(2006, 3, 29), False)] + on_offset_cases = [ + (YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -908,38 +1274,62 @@ def test_misspecified(self): BYearEnd(month=13) offset_cases = [] - offset_cases.append((BYearBegin(), { - datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2011, 1, 1): datetime(2011, 1, 3), - datetime(2011, 1, 3): datetime(2012, 1, 2), - datetime(2005, 12, 30): datetime(2006, 1, 2), - datetime(2005, 12, 31): datetime(2006, 1, 2)})) - - offset_cases.append((BYearBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 2), - datetime(2005, 12, 31): datetime(2006, 1, 2)})) - - offset_cases.append((BYearBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 1, 2), - datetime(2009, 1, 4): datetime(2009, 1, 1), - datetime(2009, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): datetime(2006, 1, 2), - datetime(2006, 12, 30): datetime(2006, 1, 2), - datetime(2006, 1, 1): datetime(2005, 1, 3)})) - - offset_cases.append((BYearBegin(-2), { - datetime(2007, 1, 1): datetime(2005, 1, 3), - datetime(2007, 6, 30): datetime(2006, 1, 2), - datetime(2008, 12, 31): datetime(2007, 1, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 
3), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -950,44 +1340,70 @@ class TestBYearEnd(Base): _offset = BYearEnd offset_cases = [] - offset_cases.append((BYearEnd(), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2006, 12, 29), - datetime(2005, 12, 31): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(0), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(-2), { - datetime(2007, 1, 1): datetime(2005, 12, 30), - datetime(2008, 6, 30): datetime(2006, 12, 29), - datetime(2008, 12, 31): datetime(2006, 12, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), - (BYearEnd(), datetime(2008, 1, 1), False), - (BYearEnd(), datetime(2006, 12, 31), False), - (BYearEnd(), datetime(2006, 12, 29), True)] + on_offset_cases = [ + (BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -1004,15 +1420,27 @@ def test_bad_month_fail(self): BYearEnd(month=0) 
offset_cases = [] - offset_cases.append((BYearEnd(month=6), { - datetime(2008, 1, 1): datetime(2008, 6, 30), - datetime(2007, 6, 30): datetime(2008, 6, 30)})) - - offset_cases.append((BYearEnd(n=-1, month=6), { - datetime(2008, 1, 1): datetime(2007, 6, 29), - datetime(2007, 6, 30): datetime(2007, 6, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearEnd(month=6), + { + datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(n=-1, month=6), + { + datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -1025,10 +1453,12 @@ def test_roll(self): assert offset.rollforward(date) == datetime(2010, 6, 30) assert offset.rollback(date) == datetime(2009, 6, 30) - on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), - (BYearEnd(month=6), datetime(2007, 6, 30), False)] + on_offset_cases = [ + (BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 8474cb17fc28ae..47e398dfe3d167 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -5,38 +5,42 @@ def test_namespace(): - submodules = ['c_timestamp', - 'ccalendar', - 'conversion', - 'fields', - 'frequencies', - 'nattype', - 'np_datetime', - 'offsets', - 'parsing', - 'period', - 'resolution', - 'strptime', - 'timedeltas', - 'timestamps', - 'timezones', - 'tzconversion'] + submodules = [ + "c_timestamp", + "ccalendar", + "conversion", + "fields", + "frequencies", + "nattype", + "np_datetime", + "offsets", + "parsing", + "period", + "resolution", + "strptime", + "timedeltas", + "timestamps", + "timezones", + "tzconversion", + ] - api = ['NaT', - 'NaTType', - 'iNaT', - 'is_null_datetimelike', - 'OutOfBoundsDatetime', - 'Period', - 'IncompatibleFrequency', - 'Timedelta', - 'Timestamp', - 'delta_to_nanoseconds', - 'ints_to_pytimedelta', - 'localize_pydatetime', - 'normalize_date', - 'tz_convert_single'] + api = [ + "NaT", + "NaTType", + "iNaT", + "is_null_datetimelike", + "OutOfBoundsDatetime", + "Period", + "IncompatibleFrequency", + "Timedelta", + "Timestamp", + "delta_to_nanoseconds", + "ints_to_pytimedelta", + "localize_pydatetime", + "normalize_date", + "tz_convert_single", + ] expected = set(submodules + api) - names = [x for x in dir(tslibs) if not x.startswith('__')] + names = [x for x in dir(tslibs) if not x.startswith("__")] assert set(names) == expected diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 680be445e657e3..5cf2165993cd7e 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -12,14 +12,25 @@ import pandas.util.testing as tm -@pytest.mark.parametrize("data,expected", [ - (["01-01-2013", "01-02-2013"], - ["2013-01-01T00:00:00.000000000-0000", - "2013-01-02T00:00:00.000000000-0000"]), - (["Mon Sep 16 2013", "Tue Sep 17 2013"], - ["2013-09-16T00:00:00.000000000-0000", - "2013-09-17T00:00:00.000000000-0000"]) -]) 
+@pytest.mark.parametrize( + "data,expected", + [ + ( + ["01-01-2013", "01-02-2013"], + [ + "2013-01-01T00:00:00.000000000-0000", + "2013-01-02T00:00:00.000000000-0000", + ], + ), + ( + ["Mon Sep 16 2013", "Tue Sep 17 2013"], + [ + "2013-09-16T00:00:00.000000000-0000", + "2013-09-17T00:00:00.000000000-0000", + ], + ), + ], +) def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) @@ -28,12 +39,15 @@ def test_parsing_valid_dates(data, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dt_string, expected_tz", [ - ["01-01-2013 08:00:00+08:00", 480], - ["2013-01-01T08:00:00.000000000+0800", 480], - ["2012-12-31T16:00:00.000000000-0800", -480], - ["12-31-2012 23:00:00-01:00", -60] -]) +@pytest.mark.parametrize( + "dt_string, expected_tz", + [ + ["01-01-2013 08:00:00+08:00", 480], + ["2013-01-01T08:00:00.000000000+0800", 480], + ["2012-12-31T16:00:00.000000000-0800", -480], + ["12-31-2012 23:00:00-01:00", -60], + ], +) def test_parsing_timezone_offsets(dt_string, expected_tz): # All of these datetime strings with offsets are equivalent # to the same datetime after the timezone offset is added. @@ -64,20 +78,21 @@ def test_parsing_different_timezone_offsets(): data = np.array(data, dtype=object) result, result_tz = tslib.array_to_datetime(data) - expected = np.array([datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 23400))], - dtype=object) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 23400)), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) assert result_tz is None -@pytest.mark.parametrize("data", [ - ["-352.737091", "183.575577"], - ["1", "2", "3", "4", "5"] -]) +@pytest.mark.parametrize( + "data", [["-352.737091", "183.575577"], ["1", "2", "3", "4", "5"]] +) def test_number_looking_strings_not_into_datetime(data): # see gh-4601 # @@ -89,12 +104,16 @@ def test_number_looking_strings_not_into_datetime(data): tm.assert_numpy_array_equal(result, arr) -@pytest.mark.parametrize("invalid_date", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01")]) +@pytest.mark.parametrize( + "invalid_date", + [ + date(1000, 1, 1), + datetime(1000, 1, 1), + "1000-01-01", + "Jan 1, 1000", + np.datetime64("1000-01-01"), + ], +) @pytest.mark.parametrize("errors", ["coerce", "raise"]) def test_coerce_outside_ns_bounds(invalid_date, errors): arr = np.array([invalid_date], dtype="object") @@ -135,13 +154,11 @@ def test_coerce_of_invalid_datetimes(errors): else: # coerce. 
# With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = ["2013-01-01T00:00:00.000000000-0000", - iNaT, - iNaT] + expected = ["2013-01-01T00:00:00.000000000-0000", iNaT, iNaT] tm.assert_numpy_array_equal( - result, - np_array_datetime64_compat(expected, dtype="M8[ns]")) + result, np_array_datetime64_compat(expected, dtype="M8[ns]") + ) def test_to_datetime_barely_out_of_bounds(): @@ -160,14 +177,14 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("data,expected", [ - ([SubDatetime(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]), - ([datetime(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]), - ([Timestamp(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ], +) def test_datetime_subclass(data, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index f09dca7fb355e8..6f6e32411a7848 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -6,12 +6,15 @@ from pandas._libs.tslibs import ccalendar -@pytest.mark.parametrize("date_tuple,expected", [ - ((2001, 3, 1), 60), - ((2004, 3, 1), 61), - ((1907, 12, 31), 365), # End-of-year, non-leap year. - ((2004, 12, 31), 366), # End-of-year, leap year. -]) +@pytest.mark.parametrize( + "date_tuple,expected", + [ + ((2001, 3, 1), 60), + ((2004, 3, 1), 61), + ((1907, 12, 31), 365), # End-of-year, non-leap year. + ((2004, 12, 31), 366), # End-of-year, leap year. 
+ ], +) def test_get_day_of_year_numeric(date_tuple, expected): assert ccalendar.get_day_of_year(*date_tuple) == expected diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 9e6516ffeee9c1..6c30e2b6c7a1c9 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -50,13 +50,17 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): _compare_local_to_utc(tz_didx, utc_didx) -@pytest.mark.parametrize("arr", [ - pytest.param(np.array([], dtype=np.int64), id="empty"), - pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat")]) +@pytest.mark.parametrize( + "arr", + [ + pytest.param(np.array([], dtype=np.int64), id="empty"), + pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat"), + ], +) def test_tz_convert_corner(arr): - result = tzconversion.tz_convert(arr, - timezones.maybe_get_tz("US/Eastern"), - timezones.maybe_get_tz("Asia/Tokyo")) + result = tzconversion.tz_convert( + arr, timezones.maybe_get_tz("US/Eastern"), timezones.maybe_get_tz("Asia/Tokyo") + ) tm.assert_numpy_array_equal(result, arr) @@ -72,15 +76,22 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("dt, expected", [ - pytest.param(Timestamp("2000-01-01"), - Timestamp("2000-01-01", tz=UTC), id="timestamp"), - pytest.param(datetime(2000, 1, 1), - datetime(2000, 1, 1, tzinfo=UTC), - id="datetime"), - pytest.param(SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), - id="subclassed_datetime")]) +@pytest.mark.parametrize( + "dt, expected", + [ + pytest.param( + Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + ), + pytest.param( + datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + ), + pytest.param( + SubDatetime(2000, 1, 1), + SubDatetime(2000, 1, 1, tzinfo=UTC), + id="subclassed_datetime", + ), + ], +) def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 279e8edebaf31d..5810c7e52abca7 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -1,94 +1,100 @@ import pytest from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, _period_str_to_code, get_rule_month, is_subperiod, - is_superperiod) + INVALID_FREQ_ERR_MSG, + _period_str_to_code, + get_rule_month, + is_subperiod, + is_superperiod, +) from pandas.tseries import offsets -@pytest.mark.parametrize("obj,expected", [ - ("W", "DEC"), - (offsets.Week(), "DEC"), - - ("D", "DEC"), - (offsets.Day(), "DEC"), - - ("Q", "DEC"), - (offsets.QuarterEnd(startingMonth=12), "DEC"), - - ("Q-JAN", "JAN"), - (offsets.QuarterEnd(startingMonth=1), "JAN"), - - ("A-DEC", "DEC"), - ("Y-DEC", "DEC"), - (offsets.YearEnd(), "DEC"), - - ("A-MAY", "MAY"), - ("Y-MAY", "MAY"), - (offsets.YearEnd(month=5), "MAY") -]) +@pytest.mark.parametrize( + "obj,expected", + [ + ("W", "DEC"), + (offsets.Week(), "DEC"), + ("D", "DEC"), + (offsets.Day(), "DEC"), + ("Q", "DEC"), + (offsets.QuarterEnd(startingMonth=12), "DEC"), + ("Q-JAN", "JAN"), + (offsets.QuarterEnd(startingMonth=1), "JAN"), + ("A-DEC", "DEC"), + ("Y-DEC", "DEC"), + (offsets.YearEnd(), "DEC"), + ("A-MAY", "MAY"), + ("Y-MAY", "MAY"), + (offsets.YearEnd(month=5), "MAY"), + ], +) def test_get_rule_month(obj, expected): result = get_rule_month(obj) assert result == expected -@pytest.mark.parametrize("obj,expected", [ - ("A", 1000), - ("A-DEC", 1000), 
- ("A-JAN", 1001), - - ("Y", 1000), - ("Y-DEC", 1000), - ("Y-JAN", 1001), - - ("Q", 2000), - ("Q-DEC", 2000), - ("Q-FEB", 2002), - - ("W", 4000), - ("W-SUN", 4000), - ("W-FRI", 4005), - - ("Min", 8000), - ("ms", 10000), - ("US", 11000), - ("NS", 12000) -]) +@pytest.mark.parametrize( + "obj,expected", + [ + ("A", 1000), + ("A-DEC", 1000), + ("A-JAN", 1001), + ("Y", 1000), + ("Y-DEC", 1000), + ("Y-JAN", 1001), + ("Q", 2000), + ("Q-DEC", 2000), + ("Q-FEB", 2002), + ("W", 4000), + ("W-SUN", 4000), + ("W-FRI", 4005), + ("Min", 8000), + ("ms", 10000), + ("US", 11000), + ("NS", 12000), + ], +) def test_period_str_to_code(obj, expected): assert _period_str_to_code(obj) == expected -@pytest.mark.parametrize("p1,p2,expected", [ - # Input validation. - (offsets.MonthEnd(), None, False), - (offsets.YearEnd(), None, False), - (None, offsets.YearEnd(), False), - (None, offsets.MonthEnd(), False), - (None, None, False), - - (offsets.YearEnd(), offsets.MonthEnd(), True), - (offsets.Hour(), offsets.Minute(), True), - (offsets.Second(), offsets.Milli(), True), - (offsets.Milli(), offsets.Micro(), True), - (offsets.Micro(), offsets.Nano(), True) -]) +@pytest.mark.parametrize( + "p1,p2,expected", + [ + # Input validation. + (offsets.MonthEnd(), None, False), + (offsets.YearEnd(), None, False), + (None, offsets.YearEnd(), False), + (None, offsets.MonthEnd(), False), + (None, None, False), + (offsets.YearEnd(), offsets.MonthEnd(), True), + (offsets.Hour(), offsets.Minute(), True), + (offsets.Second(), offsets.Milli(), True), + (offsets.Milli(), offsets.Micro(), True), + (offsets.Micro(), offsets.Nano(), True), + ], +) def test_super_sub_symmetry(p1, p2, expected): assert is_superperiod(p1, p2) is expected assert is_subperiod(p2, p1) is expected -@pytest.mark.parametrize("freq,expected,aliases", [ - ("D", 6000, ["DAY", "DLY", "DAILY"]), - ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), - ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), - ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), - ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), - ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]), - ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), - ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), - ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), -]) +@pytest.mark.parametrize( + "freq,expected,aliases", + [ + ("D", 6000, ["DAY", "DLY", "DAILY"]), + ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), + ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), + ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), + ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), + ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]), + ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), + ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), + ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), + ], +) def test_assert_aliases_deprecated(freq, expected, aliases): assert isinstance(aliases, list) assert _period_str_to_code(freq) == expected diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py index ca6402d6151e03..6ff2ae669c8df9 100644 --- a/pandas/tests/tslibs/test_liboffsets.py +++ b/pandas/tests/tslibs/test_liboffsets.py @@ -16,43 +16,55 @@ def day_opt(request): return request.param -@pytest.mark.parametrize("dt,exp_week_day,exp_last_day", [ - (datetime(2017, 11, 30), 3, 30), # Business day. - (datetime(1993, 10, 31), 6, 29) # Non-business day. -]) +@pytest.mark.parametrize( + "dt,exp_week_day,exp_last_day", + [ + (datetime(2017, 11, 30), 3, 30), # Business day. + (datetime(1993, 10, 31), 6, 29), # Non-business day. 
+ ], +) def test_get_last_bday(dt, exp_week_day, exp_last_day): assert dt.weekday() == exp_week_day assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day -@pytest.mark.parametrize("dt,exp_week_day,exp_first_day", [ - (datetime(2017, 4, 1), 5, 3), # Non-weekday. - (datetime(1993, 10, 1), 4, 1) # Business day. -]) +@pytest.mark.parametrize( + "dt,exp_week_day,exp_first_day", + [ + (datetime(2017, 4, 1), 5, 3), # Non-weekday. + (datetime(1993, 10, 1), 4, 1), # Business day. + ], +) def test_get_first_bday(dt, exp_week_day, exp_first_day): assert dt.weekday() == exp_week_day assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day -@pytest.mark.parametrize("months,day_opt,expected", [ - (0, 15, datetime(2017, 11, 15)), - (0, None, datetime(2017, 11, 30)), - (1, "start", datetime(2017, 12, 1)), - (-145, "end", datetime(2005, 10, 31)), - (0, "business_end", datetime(2017, 11, 30)), - (0, "business_start", datetime(2017, 11, 1)) -]) +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (0, 15, datetime(2017, 11, 15)), + (0, None, datetime(2017, 11, 30)), + (1, "start", datetime(2017, 12, 1)), + (-145, "end", datetime(2005, 10, 31)), + (0, "business_end", datetime(2017, 11, 30)), + (0, "business_start", datetime(2017, 11, 1)), + ], +) def test_shift_month_dt(months, day_opt, expected): dt = datetime(2017, 11, 30) assert liboffsets.shift_month(dt, months, day_opt=day_opt) == expected -@pytest.mark.parametrize("months,day_opt,expected", [ - (1, "start", Timestamp("1929-06-01")), - (-3, "end", Timestamp("1929-02-28")), - (25, None, Timestamp("1931-06-5")), - (-1, 31, Timestamp("1929-04-30")) -]) +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (1, "start", Timestamp("1929-06-01")), + (-3, "end", Timestamp("1929-02-28")), + (25, None, Timestamp("1931-06-5")), + (-1, 31, Timestamp("1929-04-30")), + ], +) def test_shift_month_ts(months, day_opt, expected): ts = Timestamp("1929-05-05") assert liboffsets.shift_month(ts, months, day_opt=day_opt) == expected @@ -66,13 +78,15 @@ def test_shift_month_error(): liboffsets.shift_month(dt, 3, day_opt=day_opt) -@pytest.mark.parametrize("other,expected", [ - # Before March 1. - (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}), - - # After March 1. - (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1}) -]) +@pytest.mark.parametrize( + "other,expected", + [ + # Before March 1. + (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}), + # After March 1. + (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1}), + ], +) @pytest.mark.parametrize("n", [2, -7, 0]) def test_roll_yearday(other, expected, n): month = 3 @@ -81,13 +95,15 @@ def test_roll_yearday(other, expected, n): assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] -@pytest.mark.parametrize("other,expected", [ - # Before June 30. - (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}), - - # After June 30. - (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1}) -]) +@pytest.mark.parametrize( + "other,expected", + [ + # Before June 30. + (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}), + # After June 30. 
+ (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1}), + ], +) @pytest.mark.parametrize("n", [5, -7, 0]) def test_roll_yearday2(other, expected, n): month = 6 @@ -107,56 +123,37 @@ def test_get_day_of_month_error(): liboffsets.roll_yearday(dt, n=3, month=11, day_opt=day_opt) -@pytest.mark.parametrize("month", [ - 3, # (other.month % 3) < (month % 3) - 5 # (other.month % 3) > (month % 3) -]) +@pytest.mark.parametrize( + "month", + [3, 5], # (other.month % 3) < (month % 3) # (other.month % 3) > (month % 3) +) @pytest.mark.parametrize("n", [4, -3]) def test_roll_qtr_day_not_mod_unequal(day_opt, month, n): - expected = { - 3: { - -3: -2, - 4: 4 - }, - 5: { - -3: -3, - 4: 3 - } - } + expected = {3: {-3: -2, 4: 4}, 5: {-3: -3, 4: 3}} other = Timestamp(2072, 10, 1, 6, 17, 18) # Saturday. assert roll_qtrday(other, n, month, day_opt, modby=3) == expected[month][n] -@pytest.mark.parametrize("other,month,exp_dict", [ - # Monday. - (datetime(1999, 5, 31), 2, { - -1: { - "start": 0, - "business_start": 0 - } - }), - - # Saturday. - (Timestamp(2072, 10, 1, 6, 17, 18), 4, { - 2: { - "end": 1, - "business_end": 1, - "business_start": 1 - } - }), - - # First business day. - (Timestamp(2072, 10, 3, 6, 17, 18), 4, { - 2: { - "end": 1, - "business_end": 1 - }, - -1: { - "start": 0 - } - }) -]) +@pytest.mark.parametrize( + "other,month,exp_dict", + [ + # Monday. + (datetime(1999, 5, 31), 2, {-1: {"start": 0, "business_start": 0}}), + # Saturday. + ( + Timestamp(2072, 10, 1, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1, "business_start": 1}}, + ), + # First business day. + ( + Timestamp(2072, 10, 3, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1}, -1: {"start": 0}}, + ), + ], +) @pytest.mark.parametrize("n", [2, -1]) def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt): # All cases have (other.month % 3) == (month % 3). 
@@ -164,10 +161,9 @@ def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt): assert roll_qtrday(other, n, month, day_opt, modby=3) == expected -@pytest.mark.parametrize("n,expected", [ - (42, {29: 42, 1: 42, 31: 41}), - (-4, {29: -4, 1: -3, 31: -4}) -]) +@pytest.mark.parametrize( + "n,expected", [(42, {29: 42, 1: 42, 31: 41}), (-4, {29: -4, 1: -3, 31: -4})] +) @pytest.mark.parametrize("compare", [29, 1, 31]) def test_roll_convention(n, expected, compare): assert liboffsets.roll_convention(29, n, compare) == expected[compare] diff --git a/pandas/tests/tslibs/test_normalize_date.py b/pandas/tests/tslibs/test_normalize_date.py index e169b1c7aa505d..2a41836f456ecd 100644 --- a/pandas/tests/tslibs/test_normalize_date.py +++ b/pandas/tests/tslibs/test_normalize_date.py @@ -8,11 +8,14 @@ from pandas._libs.tslibs.timestamps import Timestamp -@pytest.mark.parametrize("value,expected", [ - (date(2012, 9, 7), datetime(2012, 9, 7)), - (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)), - (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1)) -]) +@pytest.mark.parametrize( + "value,expected", + [ + (date(2012, 9, 7), datetime(2012, 9, 7)), + (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)), + (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1)), + ], +) def test_normalize_date(value, expected): result = tslibs.normalize_date(value) assert result == expected @@ -22,10 +25,14 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("dt, expected", [ - (Timestamp(2000, 1, 1, 1), Timestamp(2000, 1, 1, 0)), - (datetime(2000, 1, 1, 1), datetime(2000, 1, 1, 0)), - (SubDatetime(2000, 1, 1, 1), SubDatetime(2000, 1, 1, 0))]) +@pytest.mark.parametrize( + "dt, expected", + [ + (Timestamp(2000, 1, 1, 1), Timestamp(2000, 1, 1, 0)), + (datetime(2000, 1, 1, 1), datetime(2000, 1, 1, 0)), + (SubDatetime(2000, 1, 1, 1), SubDatetime(2000, 1, 1, 0)), + ], +) def test_normalize_date_sub_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index 8c995f243a9931..a6e7aee46b485b 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -5,17 +5,21 @@ from pandas._libs import tslib -@pytest.mark.parametrize("date_str, exp", [ - ("2011-01-02", datetime(2011, 1, 2)), - ("2011-1-2", datetime(2011, 1, 2)), - ("2011-01", datetime(2011, 1, 1)), - ("2011-1", datetime(2011, 1, 1)), - ("2011 01 02", datetime(2011, 1, 2)), - ("2011.01.02", datetime(2011, 1, 2)), - ("2011/01/02", datetime(2011, 1, 2)), - ("2011\\01\\02", datetime(2011, 1, 2)), - ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)), - ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30))]) +@pytest.mark.parametrize( + "date_str, exp", + [ + ("2011-01-02", datetime(2011, 1, 2)), + ("2011-1-2", datetime(2011, 1, 2)), + ("2011-01", datetime(2011, 1, 1)), + ("2011-1", datetime(2011, 1, 1)), + ("2011 01 02", datetime(2011, 1, 2)), + ("2011.01.02", datetime(2011, 1, 2)), + ("2011/01/02", datetime(2011, 1, 2)), + ("2011\\01\\02", datetime(2011, 1, 2)), + ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)), + ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)), + ], +) def test_parsers_iso8601(date_str, exp): # see gh-12060 # @@ -25,28 +29,29 @@ def test_parsers_iso8601(date_str, exp): assert actual == exp -@pytest.mark.parametrize("date_str", [ - "2011-01/02", - "2011=11=11", - "201401", - "201111", - "200101", - - # Mixed separated and unseparated. 
- "2005-0101", - "200501-01", - "20010101 12:3456", - "20010101 1234:56", - - # HHMMSS must have two digits in - # each component if unseparated. - "20010101 1", - "20010101 123", - "20010101 12345", - "20010101 12345Z", -]) +@pytest.mark.parametrize( + "date_str", + [ + "2011-01/02", + "2011=11=11", + "201401", + "201111", + "200101", + # Mixed separated and unseparated. + "2005-0101", + "200501-01", + "20010101 12:3456", + "20010101 1234:56", + # HHMMSS must have two digits in + # each component if unseparated. + "20010101 1", + "20010101 123", + "20010101 12345", + "20010101 12345Z", + ], +) def test_parsers_iso8601_invalid(date_str): - msg = "Error parsing datetime string \"{s}\"".format(s=date_str) + msg = 'Error parsing datetime string "{s}"'.format(s=date_str) with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) @@ -54,8 +59,9 @@ def test_parsers_iso8601_invalid(date_str): def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = ("Timezone hours offset out of range " - "in datetime string \"{s}\"".format(s=date_str)) + msg = "Timezone hours offset out of range " 'in datetime string "{s}"'.format( + s=date_str + ) with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) @@ -64,5 +70,5 @@ def test_parsers_iso8601_invalid_offset_invalid(): def test_parsers_iso8601_leading_space(): # GH#25895 make sure isoparser doesn't overflow with long input date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) - actual = tslib._test_parse_iso8601(' ' * 200 + date_str) + actual = tslib._test_parse_iso8601(" " * 200 + date_str) assert actual == expected diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 6d9b72b67d4c7e..700fee2d89f3ca 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -23,10 +23,9 @@ def test_parse_time_string(): assert parsed == parsed_lower -@pytest.mark.parametrize("dashed,normal", [ - ("1988-Q2", "1988Q2"), - ("2Q-1988", "2Q1988") -]) +@pytest.mark.parametrize( + "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")] +) def test_parse_time_quarter_with_dash(dashed, normal): # see gh-9688 (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) @@ -37,140 +36,171 @@ def test_parse_time_quarter_with_dash(dashed, normal): assert reso_dash == reso -@pytest.mark.parametrize("dashed", [ - "-2Q1992", "2-Q1992", "4-4Q1992" -]) +@pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"]) def test_parse_time_quarter_with_dash_error(dashed): - msg = ("Unknown datetime string format, " - "unable to parse: {dashed}".format(dashed=dashed)) + msg = "Unknown datetime string format, " "unable to parse: {dashed}".format( + dashed=dashed + ) with pytest.raises(parsing.DateParseError, match=msg): parse_time_string(dashed) -@pytest.mark.parametrize("date_string,expected", [ - ("123.1234", False), - ("-50000", False), - ("999", False), - ("m", False), - ("T", False), - - ("Mon Sep 16, 2013", True), - ("2012-01-01", True), - ("01/01/2012", True), - ("01012012", True), - ("0101", True), - ("1-1", True) -]) +@pytest.mark.parametrize( + "date_string,expected", + [ + ("123.1234", False), + ("-50000", False), + ("999", False), + ("m", False), + ("T", False), + ("Mon Sep 16, 2013", True), + ("2012-01-01", True), + ("01/01/2012", True), + ("01012012", True), + ("0101", True), + ("1-1", True), + ], +) def test_does_not_convert_mixed_integer(date_string, expected): assert 
parsing._does_string_look_like_datetime(date_string) is expected -@pytest.mark.parametrize("date_str,kwargs,msg", [ - ("2013Q5", dict(), - ("Incorrect quarterly string is given, " - "quarter must be between 1 and 4: 2013Q5")), - - # see gh-5418 - ("2013Q1", dict(freq="INVLD-L-DEC-SAT"), - ("Unable to retrieve month information " - "from given freq: INVLD-L-DEC-SAT")) -]) +@pytest.mark.parametrize( + "date_str,kwargs,msg", + [ + ( + "2013Q5", + dict(), + ( + "Incorrect quarterly string is given, " + "quarter must be between 1 and 4: 2013Q5" + ), + ), + # see gh-5418 + ( + "2013Q1", + dict(freq="INVLD-L-DEC-SAT"), + ( + "Unable to retrieve month information " + "from given freq: INVLD-L-DEC-SAT" + ), + ), + ], +) def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): with pytest.raises(parsing.DateParseError, match=msg): parsing.parse_time_string(date_str, **kwargs) -@pytest.mark.parametrize("date_str,freq,expected", [ - ("2013Q2", None, datetime(2013, 4, 1)), - ("2013Q2", "A-APR", datetime(2012, 8, 1)), - ("2013-Q2", "A-DEC", datetime(2013, 4, 1)) -]) +@pytest.mark.parametrize( + "date_str,freq,expected", + [ + ("2013Q2", None, datetime(2013, 4, 1)), + ("2013Q2", "A-APR", datetime(2012, 8, 1)), + ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ], +) def test_parsers_quarterly_with_freq(date_str, freq, expected): result, _, _ = parsing.parse_time_string(date_str, freq=freq) assert result == expected -@pytest.mark.parametrize("date_str", [ - "2Q 2005", "2Q-200A", "2Q-200", - "22Q2005", "2Q200.", "6Q-20" -]) +@pytest.mark.parametrize( + "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] +) def test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": - msg = ("Incorrect quarterly string is given, quarter " - "must be between 1 and 4: {date_str}".format(date_str=date_str)) + msg = ( + "Incorrect quarterly string is given, quarter " + "must be between 1 and 4: {date_str}".format(date_str=date_str) + ) else: - msg = ("Unknown datetime string format, unable " - "to parse: {date_str}".format(date_str=date_str)) + msg = "Unknown datetime string format, unable " "to parse: {date_str}".format( + date_str=date_str + ) with pytest.raises(ValueError, match=msg): parsing.parse_time_string(date_str) -@pytest.mark.parametrize("date_str,expected", [ - ("201101", datetime(2011, 1, 1, 0, 0)), - ("200005", datetime(2000, 5, 1, 0, 0)) -]) +@pytest.mark.parametrize( + "date_str,expected", + [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], +) def test_parsers_month_freq(date_str, expected): result, _, _ = parsing.parse_time_string(date_str, freq="M") assert result == expected @td.skip_if_not_us_locale -@pytest.mark.parametrize("string,fmt", [ - ("20111230", "%Y%m%d"), - ("2011-12-30", "%Y-%m-%d"), - ("30-12-2011", "%d-%m-%Y"), - ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), - ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), - ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("20111230", "%Y%m%d"), + ("2011-12-30", "%Y-%m-%d"), + ("30-12-2011", "%d-%m-%Y"), + ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), + ], +) def test_guess_datetime_format_with_parseable_formats(string, fmt): result = parsing._guess_datetime_format(string) assert result == fmt -@pytest.mark.parametrize("dayfirst,expected", [ - (True, "%d/%m/%Y"), - (False, "%m/%d/%Y") -]) 
+@pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")]) def test_guess_datetime_format_with_dayfirst(dayfirst, expected): ambiguous_string = "01/01/2011" - result = parsing._guess_datetime_format(ambiguous_string, - dayfirst=dayfirst) + result = parsing._guess_datetime_format(ambiguous_string, dayfirst=dayfirst) assert result == expected @td.skip_if_has_locale -@pytest.mark.parametrize("string,fmt", [ - ("30/Dec/2011", "%d/%b/%Y"), - ("30/December/2011", "%d/%B/%Y"), - ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("30/Dec/2011", "%d/%b/%Y"), + ("30/December/2011", "%d/%B/%Y"), + ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S"), + ], +) def test_guess_datetime_format_with_locale_specific_formats(string, fmt): result = parsing._guess_datetime_format(string) assert result == fmt -@pytest.mark.parametrize("invalid_dt", [ - "2013", "01/2013", "12:00:00", "1/1/1/1", - "this_is_not_a_datetime", "51a", 9, - datetime(2011, 1, 1) -]) +@pytest.mark.parametrize( + "invalid_dt", + [ + "2013", + "01/2013", + "12:00:00", + "1/1/1/1", + "this_is_not_a_datetime", + "51a", + 9, + datetime(2011, 1, 1), + ], +) def test_guess_datetime_format_invalid_inputs(invalid_dt): # A datetime string must include a year, month and a day for it to be # guessable, in addition to being a string that looks like a datetime. assert parsing._guess_datetime_format(invalid_dt) is None -@pytest.mark.parametrize("string,fmt", [ - ("2011-1-1", "%Y-%m-%d"), - ("1/1/2011", "%m/%d/%Y"), - ("30-1-2011", "%d-%m-%Y"), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("2011-1-1", "%Y-%m-%d"), + ("1/1/2011", "%m/%d/%Y"), + ("30-1-2011", "%d-%m-%Y"), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ], +) def test_guess_datetime_format_no_padding(string, fmt): # see gh-11142 result = parsing._guess_datetime_format(string) diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index a86f1e873893d8..5497cb65c53734 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -4,82 +4,75 @@ from pandas._libs.tslibs.period import period_asfreq, period_ordinal -@pytest.mark.parametrize("freq1,freq2,expected", [ - ("D", "H", 24), - ("D", "T", 1440), - ("D", "S", 86400), - ("D", "L", 86400000), - ("D", "U", 86400000000), - ("D", "N", 86400000000000), - - ("H", "T", 60), - ("H", "S", 3600), - ("H", "L", 3600000), - ("H", "U", 3600000000), - ("H", "N", 3600000000000), - - ("T", "S", 60), - ("T", "L", 60000), - ("T", "U", 60000000), - ("T", "N", 60000000000), - - ("S", "L", 1000), - ("S", "U", 1000000), - ("S", "N", 1000000000), - - ("L", "U", 1000), - ("L", "N", 1000000), - - ("U", "N", 1000) -]) +@pytest.mark.parametrize( + "freq1,freq2,expected", + [ + ("D", "H", 24), + ("D", "T", 1440), + ("D", "S", 86400), + ("D", "L", 86400000), + ("D", "U", 86400000000), + ("D", "N", 86400000000000), + ("H", "T", 60), + ("H", "S", 3600), + ("H", "L", 3600000), + ("H", "U", 3600000000), + ("H", "N", 3600000000000), + ("T", "S", 60), + ("T", "L", 60000), + ("T", "U", 60000000), + ("T", "N", 60000000000), + ("S", "L", 1000), + ("S", "U", 1000000), + ("S", "N", 1000000000), + ("L", "U", 1000), + ("L", "N", 1000000), + ("U", "N", 1000), + ], +) def 
test_intra_day_conversion_factors(freq1, freq2, expected): - assert period_asfreq(1, get_freq(freq1), - get_freq(freq2), False) == expected + assert period_asfreq(1, get_freq(freq1), get_freq(freq2), False) == expected -@pytest.mark.parametrize("freq,expected", [ - ("A", 0), - ("M", 0), - ("W", 1), - ("D", 0), - ("B", 0) -]) +@pytest.mark.parametrize( + "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] +) def test_period_ordinal_start_values(freq, expected): # information for Jan. 1, 1970. - assert period_ordinal(1970, 1, 1, 0, 0, 0, - 0, 0, get_freq(freq)) == expected - - -@pytest.mark.parametrize("dt,expected", [ - ((1970, 1, 4, 0, 0, 0, 0, 0), 1), - ((1970, 1, 5, 0, 0, 0, 0, 0), 2), - ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), - ((2013, 10, 7, 0, 0, 0, 0, 0), 2285) -]) + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq(freq)) == expected + + +@pytest.mark.parametrize( + "dt,expected", + [ + ((1970, 1, 4, 0, 0, 0, 0, 0), 1), + ((1970, 1, 5, 0, 0, 0, 0, 0), 2), + ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), + ((2013, 10, 7, 0, 0, 0, 0, 0), 2285), + ], +) def test_period_ordinal_week(dt, expected): args = dt + (get_freq("W"),) assert period_ordinal(*args) == expected -@pytest.mark.parametrize("day,expected", [ - # Thursday (Oct. 3, 2013). - (3, 11415), - - # Friday (Oct. 4, 2013). - (4, 11416), - - # Saturday (Oct. 5, 2013). - (5, 11417), - - # Sunday (Oct. 6, 2013). - (6, 11417), - - # Monday (Oct. 7, 2013). - (7, 11417), - - # Tuesday (Oct. 8, 2013). - (8, 11418) -]) +@pytest.mark.parametrize( + "day,expected", + [ + # Thursday (Oct. 3, 2013). + (3, 11415), + # Friday (Oct. 4, 2013). + (4, 11416), + # Saturday (Oct. 5, 2013). + (5, 11417), + # Sunday (Oct. 6, 2013). + (6, 11417), + # Monday (Oct. 7, 2013). + (7, 11417), + # Tuesday (Oct. 8, 2013). 
+ (8, 11418), + ], +) def test_period_ordinal_business_day(day, expected): args = (2013, 10, day, 0, 0, 0, 0, 0, get_freq("B")) assert period_ordinal(*args) == expected diff --git a/pandas/tests/tslibs/test_timedeltas.py b/pandas/tests/tslibs/test_timedeltas.py index 65ae9d6ed90ec8..d4bd7c2d5486ce 100644 --- a/pandas/tests/tslibs/test_timedeltas.py +++ b/pandas/tests/tslibs/test_timedeltas.py @@ -7,15 +7,18 @@ from pandas import Timedelta -@pytest.mark.parametrize("obj,expected", [ - (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9), - (Timedelta(minutes=-7), -7 * 60 * 1e9), - (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9), - (pd.offsets.Nano(125), 125), - (1, 1), - (np.int64(2), 2), - (np.int32(3), 3) -]) +@pytest.mark.parametrize( + "obj,expected", + [ + (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9), + (Timedelta(minutes=-7), -7 * 60 * 1e9), + (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9), + (pd.offsets.Nano(125), 125), + (1, 1), + (np.int64(2), 2), + (np.int32(3), 3), + ], +) def test_delta_to_nanoseconds(obj, expected): result = delta_to_nanoseconds(obj) assert result == expected diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index e3fb6ecfb9c1c9..03cc8fcb6e904f 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -32,7 +32,7 @@ def test_tzlocal_repr(): def test_tzlocal_maybe_get_tz(): # see gh-13583 - tz = timezones.maybe_get_tz('tzlocal()') + tz = timezones.maybe_get_tz("tzlocal()") assert tz == dateutil.tz.tzlocal() @@ -48,10 +48,12 @@ def test_tzlocal_offset(): assert ts.value + offset == Timestamp("2011-01-01").value -@pytest.fixture(params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), - (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)) -]) +@pytest.fixture( + params=[ + (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), + ] +) def infer_setup(request): eastern, localize = request.param @@ -67,12 +69,18 @@ def infer_setup(request): def test_infer_tz_compat(infer_setup): eastern, _, start, end, start_naive, end_naive = infer_setup - assert (timezones.infer_tzinfo(start, end) is - conversion.localize_pydatetime(start_naive, eastern).tzinfo) - assert (timezones.infer_tzinfo(start, None) is - conversion.localize_pydatetime(start_naive, eastern).tzinfo) - assert (timezones.infer_tzinfo(None, end) is - conversion.localize_pydatetime(end_naive, eastern).tzinfo) + assert ( + timezones.infer_tzinfo(start, end) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(start, None) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(None, end) + is conversion.localize_pydatetime(end_naive, eastern).tzinfo + ) def test_infer_tz_utc_localize(infer_setup): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index 96fc64a2a7b9c2..1583420053fde7 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -39,8 +39,9 @@ def _assert_not_almost_equal(a, b, **kwargs): """ try: assert_almost_equal(a, b, **kwargs) - msg = ("{a} and {b} were approximately equal " - "when they shouldn't have been").format(a=a, b=b) + msg = ( + "{a} and {b} were approximately equal " "when they shouldn't have been" + ).format(a=a, b=b) pytest.fail(msg=msg) except AssertionError: pass @@ 
-65,62 +66,69 @@ def _assert_not_almost_equal_both(a, b, **kwargs): _assert_not_almost_equal(b, a, **kwargs) -@pytest.mark.parametrize("a,b", [ - (1.1, 1.1), (1.1, 1.100001), (np.int16(1), 1.000001), - (np.float64(1.1), 1.1), (np.uint32(5), 5), -]) +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1.1), + (1.1, 1.100001), + (np.int16(1), 1.000001), + (np.float64(1.1), 1.1), + (np.uint32(5), 5), + ], +) def test_assert_almost_equal_numbers(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1)), -]) +@pytest.mark.parametrize("a,b", [(1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1))]) def test_assert_not_almost_equal_numbers(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0), -]) +@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0)]) def test_assert_almost_equal_numbers_with_zeros(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (0.001, 0), (1, 0), -]) +@pytest.mark.parametrize("a,b", [(0.001, 0), (1, 0)]) def test_assert_not_almost_equal_numbers_with_zeros(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (1, "abc"), (1, [1, ]), (1, object()), -]) +@pytest.mark.parametrize("a,b", [(1, "abc"), (1, [1]), (1, object())]) def test_assert_not_almost_equal_numbers_with_mixed(a, b): _assert_not_almost_equal_both(a, b) @pytest.mark.parametrize( - "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) + "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) @pytest.mark.parametrize( - "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) + "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) def test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype): # Empty compare. 
- _assert_almost_equal_both(np.array([], dtype=left_dtype), - np.array([], dtype=right_dtype), - check_dtype=False) + _assert_almost_equal_both( + np.array([], dtype=left_dtype), + np.array([], dtype=right_dtype), + check_dtype=False, + ) def test_assert_almost_equal_dicts(): _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2}) -@pytest.mark.parametrize("a,b", [ - ({"a": 1, "b": 2}, {"a": 1, "b": 3}), - ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), - ({"a": 1}, 1), ({"a": 1}, "abc"), ({"a": 1}, [1, ]), -]) +@pytest.mark.parametrize( + "a,b", + [ + ({"a": 1, "b": 2}, {"a": 1, "b": 3}), + ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), + ({"a": 1}, 1), + ({"a": 1}, "abc"), + ({"a": 1}, [1]), + ], +) def test_assert_not_almost_equal_dicts(a, b): _assert_not_almost_equal_both(a, b) @@ -132,14 +140,15 @@ def test_assert_almost_equal_dict_like_object(val): class DictLikeObj: def keys(self): - return "a", + return ("a",) def __getitem__(self, item): if item == "a": return dict_val - func = (_assert_almost_equal_both if val == dict_val - else _assert_not_almost_equal_both) + func = ( + _assert_almost_equal_both if val == dict_val else _assert_not_almost_equal_both + ) func(real_dict, DictLikeObj(), check_dtype=False) @@ -147,31 +156,34 @@ def test_assert_almost_equal_strings(): _assert_almost_equal_both("abc", "abc") -@pytest.mark.parametrize("a,b", [ - ("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1, ]), -]) +@pytest.mark.parametrize( + "a,b", [("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1])] +) def test_assert_not_almost_equal_strings(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - ([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3])), -]) +@pytest.mark.parametrize( + "a,b", [([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3]))] +) def test_assert_almost_equal_iterables(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - # Class is different. - (np.array([1, 2, 3]), [1, 2, 3]), - - # Dtype is different. - (np.array([1, 2, 3]), np.array([1., 2., 3.])), - - # Can't compare generators. - (iter([1, 2, 3]), [1, 2, 3]), ([1, 2, 3], [1, 2, 4]), - ([1, 2, 3], [1, 2, 3, 4]), ([1, 2, 3], 1), -]) +@pytest.mark.parametrize( + "a,b", + [ + # Class is different. + (np.array([1, 2, 3]), [1, 2, 3]), + # Dtype is different. + (np.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])), + # Can't compare generators. 
+ (iter([1, 2, 3]), [1, 2, 3]), + ([1, 2, 3], [1, 2, 4]), + ([1, 2, 3], [1, 2, 3, 4]), + ([1, 2, 3], 1), + ], +) def test_assert_not_almost_equal_iterables(a, b): _assert_not_almost_equal(a, b) @@ -180,20 +192,23 @@ def test_assert_almost_equal_null(): _assert_almost_equal_both(None, None) -@pytest.mark.parametrize("a,b", [ - (None, np.NaN), (None, 0), (np.NaN, 0), -]) +@pytest.mark.parametrize("a,b", [(None, np.NaN), (None, 0), (np.NaN, 0)]) def test_assert_not_almost_equal_null(a, b): _assert_not_almost_equal(a, b) -@pytest.mark.parametrize("a,b", [ - (np.inf, np.inf), (np.inf, float("inf")), - (np.array([np.inf, np.nan, -np.inf]), - np.array([np.inf, np.nan, -np.inf])), - (np.array([np.inf, None, -np.inf], dtype=np.object_), - np.array([np.inf, np.nan, -np.inf], dtype=np.object_)), -]) +@pytest.mark.parametrize( + "a,b", + [ + (np.inf, np.inf), + (np.inf, float("inf")), + (np.array([np.inf, np.nan, -np.inf]), np.array([np.inf, np.nan, -np.inf])), + ( + np.array([np.inf, None, -np.inf], dtype=np.object_), + np.array([np.inf, np.nan, -np.inf], dtype=np.object_), + ), + ], +) def test_assert_almost_equal_inf(a, b): _assert_almost_equal_both(a, b) @@ -202,12 +217,15 @@ def test_assert_not_almost_equal_inf(): _assert_not_almost_equal_both(np.inf, 0) -@pytest.mark.parametrize("a,b", [ - (Index([1., 1.1]), Index([1., 1.100001])), - (Series([1., 1.1]), Series([1., 1.100001])), - (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), - (DataFrame({"a": [1., 1.1]}), DataFrame({"a": [1., 1.100001]})) -]) +@pytest.mark.parametrize( + "a,b", + [ + (Index([1.0, 1.1]), Index([1.0, 1.100001])), + (Series([1.0, 1.1]), Series([1.0, 1.100001])), + (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), + (DataFrame({"a": [1.0, 1.1]}), DataFrame({"a": [1.0, 1.100001]})), + ], +) def test_assert_almost_equal_pandas(a, b): _assert_almost_equal_both(a, b) @@ -225,16 +243,18 @@ def test_assert_almost_equal_value_mismatch(): assert_almost_equal(1, 2) -@pytest.mark.parametrize("a,b,klass1,klass2", [ - (np.array([1]), 1, "ndarray", "int"), - (1, np.array([1]), "int", "ndarray"), -]) +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): msg = """numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) +\\[right\\]: {klass2}""".format( + klass1=klass1, klass2=klass2 + ) with pytest.raises(AssertionError, match=msg): assert_almost_equal(a, b) @@ -248,8 +268,7 @@ def test_assert_almost_equal_value_mismatch1(): \\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) + assert_almost_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) def test_assert_almost_equal_value_mismatch2(): @@ -271,8 +290,9 @@ def test_assert_almost_equal_value_mismatch3(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_almost_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_almost_equal_value_mismatch4(): @@ -283,8 +303,7 @@ def test_assert_almost_equal_value_mismatch4(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([[1, 2], [3, 
4]]), - np.array([[1, 3], [3, 4]])) + assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) def test_assert_almost_equal_shape_mismatch_override(): @@ -294,9 +313,7 @@ def test_assert_almost_equal_shape_mismatch_override(): \\[left\\]: \\(2L*,\\) \\[right\\]: \\(3L*,\\)""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([1, 2]), - np.array([3, 4, 5]), - obj="Index") + assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") def test_assert_almost_equal_unicode(): @@ -308,8 +325,7 @@ def test_assert_almost_equal_unicode(): \\[right\\]: \\[á, à, å\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array(["á", "à", "ä"]), - np.array(["á", "à", "å"])) + assert_almost_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])) def test_assert_almost_equal_timestamp(): diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 139755d4510b54..d51dd8b36751ac 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -4,10 +4,10 @@ from pandas.util.testing import assert_categorical_equal -@pytest.mark.parametrize("c", [ - Categorical([1, 2, 3, 4]), - Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5]), -]) +@pytest.mark.parametrize( + "c", + [Categorical([1, 2, 3, 4]), Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5])], +) def test_categorical_equal(c): assert_categorical_equal(c, c) @@ -81,7 +81,9 @@ def test_categorical_equal_object_override(obj): Attribute "ordered" are different \\[left\\]: False -\\[right\\]: True""".format(obj=obj) +\\[right\\]: True""".format( + obj=obj + ) c1 = Categorical(data, ordered=False) c2 = Categorical(data, ordered=True) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index 782b88be150579..43a474da2bbdac 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -5,10 +5,14 @@ from pandas.util.testing import assert_extension_array_equal -@pytest.mark.parametrize("kwargs", [ - dict(), # Default is check_exact=False - dict(check_exact=False), dict(check_exact=True) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(), # Default is check_exact=False + dict(check_exact=False), + dict(check_exact=True), + ], +) def test_assert_extension_array_equal_not_exact(kwargs): # see gh-23709 arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936]) @@ -28,9 +32,9 @@ def test_assert_extension_array_equal_not_exact(kwargs): assert_extension_array_equal(arr1, arr2, **kwargs) -@pytest.mark.parametrize("check_less_precise", [ - True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 -]) +@pytest.mark.parametrize( + "check_less_precise", [True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] +) def test_assert_extension_array_equal_less_precise(check_less_precise): arr1 = SparseArray([0.5, 0.123456]) arr2 = SparseArray([0.5, 0.123457]) @@ -93,8 +97,11 @@ def test_assert_extension_array_equal_non_extension_array(side): extension_array = SparseArray(numpy_array) msg = "{side} is not an ExtensionArray".format(side=side) - args = ((numpy_array, extension_array) if side == "left" - else (extension_array, numpy_array)) + args = ( + (numpy_array, extension_array) + if side == "left" + else (extension_array, numpy_array) + ) with pytest.raises(AssertionError, match=msg): assert_extension_array_equal(*args) diff --git 
a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 735d16f7ad0dbd..9571e8027ccf72 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -9,7 +9,7 @@ def by_blocks_fixture(request): return request.param -@pytest.fixture(params=['DataFrame', 'Series']) +@pytest.fixture(params=["DataFrame", "Series"]) def obj_fixture(request): return request.param @@ -76,29 +76,24 @@ def _assert_not_frame_equal_both(a, b, **kwargs): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_row_order_mismatch(check_like, obj_fixture): - df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, - index=["c", "b", "a"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. msg = "{obj}.index are different".format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - check_like=check_like, - obj=obj_fixture) + assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) else: - _assert_frame_equal_both(df1, - df2, - check_like=check_like, - obj=obj_fixture) + _assert_frame_equal_both(df1, df2, check_like=check_like, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2", [ - (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), - (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), -]) +@pytest.mark.parametrize( + "df1,df2", + [ + (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), + (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), + ], +) def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): msg = "{obj} are different".format(obj=obj_fixture) @@ -106,21 +101,27 @@ def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): assert_frame_equal(df1, df2, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2,msg", [ - # Index - (DataFrame.from_records({"a": [1, 2], - "c": ["l1", "l2"]}, index=["a"]), - DataFrame.from_records({"a": [1.0, 2.0], - "c": ["l1", "l2"]}, index=["a"]), - "DataFrame\\.index are different"), - - # MultiIndex - (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]), - DataFrame.from_records({"a": [1.0, 2.0], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]), - "MultiIndex level \\[0\\] are different") -]) +@pytest.mark.parametrize( + "df1,df2,msg", + [ + # Index + ( + DataFrame.from_records({"a": [1, 2], "c": ["l1", "l2"]}, index=["a"]), + DataFrame.from_records({"a": [1.0, 2.0], "c": ["l1", "l2"]}, index=["a"]), + "DataFrame\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + "MultiIndex level \\[0\\] are different", + ), + ], +) def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type): kwargs = dict(check_index_type=check_index_type) @@ -153,12 +154,11 @@ def test_frame_equal_index_mismatch(obj_fixture): {obj}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) \\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( - obj=obj_fixture) + obj=obj_fixture + ) - df1 = DataFrame({"A": [1, 2, 3], "B": 
[4, 5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "d"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, obj=obj_fixture) @@ -170,12 +170,11 @@ def test_frame_equal_columns_mismatch(obj_fixture): {obj}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) \\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( - obj=obj_fixture) + obj=obj_fixture + ) - df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, - index=["a", "b", "c"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, obj=obj_fixture) @@ -186,34 +185,40 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): {obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""".format(obj=obj_fixture) +\\[right\\]: \\[4, 5, 7\\]""".format( + obj=obj_fixture + ) df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - by_blocks=by_blocks_fixture, - obj=obj_fixture) + assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2,msg", [ - (DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), - """{obj}\\.iloc\\[:, 1\\] are different +@pytest.mark.parametrize( + "df1,df2,msg", + [ + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), + """{obj}\\.iloc\\[:, 1\\] are different {obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] -\\[right\\]: \\[é, è, e̊\\]"""), - (DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """{obj}\\.iloc\\[:, 0\\] are different +\\[right\\]: \\[é, è, e̊\\]""", + ), + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), + """{obj}\\.iloc\\[:, 0\\] are different {obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] -\\[right\\]: \\[a, a, a\\]"""), -]) +\\[right\\]: \\[a, a, a\\]""", + ), + ], +) def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): # see gh-20503 # @@ -221,7 +226,4 @@ def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): # when comparing DataFrames containing differing unicode objects. 
msg = msg.format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - by_blocks=by_blocks_fixture, - obj=obj_fixture) + assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 445d9c4e482b09..270f765caebd02 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -17,8 +17,7 @@ def test_index_equal_levels_mismatch(): \\)""" idx1 = Index([1, 2, 3]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) with pytest.raises(AssertionError, match=msg): assert_index_equal(idx1, idx2, exact=False) @@ -31,10 +30,8 @@ def test_index_equal_values_mismatch(check_exact): \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), - ("B", 3), ("B", 4)]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) with pytest.raises(AssertionError, match=msg): assert_index_equal(idx1, idx2, check_exact=check_exact) @@ -69,7 +66,7 @@ def test_index_equal_class_mismatch(check_exact): def test_index_equal_values_close(check_exact): - idx1 = Index([1, 2, 3.]) + idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0000000001]) if check_exact: @@ -86,10 +83,9 @@ def test_index_equal_values_close(check_exact): def test_index_equal_values_less_close(check_exact, check_less_precise): - idx1 = Index([1, 2, 3.]) + idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0001]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) if check_exact or not check_less_precise: msg = """Index are different @@ -107,8 +103,7 @@ def test_index_equal_values_less_close(check_exact, check_less_precise): def test_index_equal_values_too_far(check_exact, check_less_precise): idx1 = Index([1, 2, 3]) idx2 = Index([1, 2, 4]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) msg = """Index are different @@ -121,12 +116,9 @@ def test_index_equal_values_too_far(check_exact, check_less_precise): def test_index_equal_level_values_mismatch(check_exact, check_less_precise): - idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), - ("B", 3), ("B", 4)]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) msg = """MultiIndex level \\[1\\] are different @@ -138,9 +130,10 @@ def test_index_equal_level_values_mismatch(check_exact, check_less_precise): assert_index_equal(idx1, idx2, **kwargs) -@pytest.mark.parametrize("name1,name2", [ - (None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT) -]) +@pytest.mark.parametrize( + "name1,name2", + [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], 
+) def test_index_equal_names(name1, name2): msg = """Index are different diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index d2dd7912388b06..e4435b5f008e87 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -4,11 +4,14 @@ from pandas.util.testing import assert_interval_array_equal -@pytest.mark.parametrize("kwargs", [ - dict(start=0, periods=4), - dict(start=1, periods=5), - dict(start=5, end=10, closed="left"), -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(start=0, periods=4), + dict(start=1, periods=5), + dict(start=5, end=10, closed="left"), + ], +) def test_interval_array_equal(kwargs): arr = interval_range(**kwargs).values assert_interval_array_equal(arr, arr) diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index 447b1ac172202c..59f77d18a89292 100644 --- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -23,16 +23,18 @@ def test_assert_numpy_array_equal_bad_type(): assert_numpy_array_equal(1, 2) -@pytest.mark.parametrize("a,b,klass1,klass2", [ - (np.array([1]), 1, "ndarray", "int"), - (1, np.array([1]), "int", "ndarray"), -]) +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): msg = """numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) +\\[right\\]: {klass2}""".format( + klass1=klass1, klass2=klass2 + ) with pytest.raises(AssertionError, match=msg): assert_numpy_array_equal(a, b) @@ -46,8 +48,7 @@ def test_assert_numpy_array_equal_value_mismatch1(): \\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) + assert_numpy_array_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) def test_assert_numpy_array_equal_value_mismatch2(): @@ -69,8 +70,9 @@ def test_assert_numpy_array_equal_value_mismatch3(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_numpy_array_equal_value_mismatch4(): @@ -81,8 +83,7 @@ def test_assert_numpy_array_equal_value_mismatch4(): \\[right\\]: \\[1\\.1, 2.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([1.1, 2.000001]), - np.array([1.1, 2.0])) + assert_numpy_array_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) def test_assert_numpy_array_equal_value_mismatch5(): @@ -93,8 +94,9 @@ def test_assert_numpy_array_equal_value_mismatch5(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_numpy_array_equal_value_mismatch6(): @@ -105,8 +107,7 @@ def test_assert_numpy_array_equal_value_mismatch6(): \\[right\\]: \\[\\[1, 3\\], \\[3, 
4\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), - np.array([[1, 3], [3, 4]])) + assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) def test_assert_numpy_array_equal_shape_mismatch_override(): @@ -117,9 +118,7 @@ def test_assert_numpy_array_equal_shape_mismatch_override(): \\[right\\]: \\(3L*,\\)""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([1, 2]), - np.array([3, 4, 5]), - obj="Index") + assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") def test_numpy_array_equal_unicode(): @@ -134,8 +133,7 @@ def test_numpy_array_equal_unicode(): \\[right\\]: \\[á, à, å\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array(["á", "à", "ä"]), - np.array(["á", "à", "å"])) + assert_numpy_array_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])) def test_numpy_array_equal_object(): @@ -164,9 +162,11 @@ def test_numpy_array_equal_copy_flag(other_type, check_same): other = a.copy() if check_same != other_type: - msg = (r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" - if check_same == "same" - else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)") + msg = ( + r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" + if check_same == "same" + else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)" + ) if msg is not None: with pytest.raises(AssertionError, match=msg): diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index 79b2a565376139..c681817896903f 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -6,11 +6,11 @@ def f(): - warnings.warn('f1', FutureWarning) - warnings.warn('f2', RuntimeWarning) + warnings.warn("f1", FutureWarning) + warnings.warn("f2", RuntimeWarning) -@pytest.mark.filterwarnings('ignore:f1:FutureWarning') +@pytest.mark.filterwarnings("ignore:f1:FutureWarning") def test_assert_produces_warning_honors_filter(): # Raise by default. 
msg = r"Caused unexpected warning\(s\)" @@ -18,6 +18,5 @@ def test_assert_produces_warning_honors_filter(): with tm.assert_produces_warning(RuntimeWarning): f() - with tm.assert_produces_warning(RuntimeWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning(RuntimeWarning, raise_on_extra_warnings=False): f() diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 61cabcf3f4aae0..a12d9386eb159c 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -64,29 +64,33 @@ def _assert_not_series_equal_both(a, b, **kwargs): _assert_not_series_equal(b, a, **kwargs) -@pytest.mark.parametrize("data", [ - range(3), list("abc"), list("áàä"), -]) +@pytest.mark.parametrize("data", [range(3), list("abc"), list("áàä")]) def test_series_equal(data): _assert_series_equal_both(Series(data), Series(data)) -@pytest.mark.parametrize("data1,data2", [ - (range(3), range(1, 4)), - (list("abc"), list("xyz")), - (list("áàä"), list("éèë")), - (list("áàä"), list(b"aaa")), - (range(3), range(4)), -]) +@pytest.mark.parametrize( + "data1,data2", + [ + (range(3), range(1, 4)), + (list("abc"), list("xyz")), + (list("áàä"), list("éèë")), + (list("áàä"), list(b"aaa")), + (range(3), range(4)), + ], +) def test_series_not_equal_value_mismatch(data1, data2): _assert_not_series_equal_both(Series(data1), Series(data2)) -@pytest.mark.parametrize("kwargs", [ - dict(dtype="float64"), # dtype mismatch - dict(index=[1, 2, 4]), # index mismatch - dict(name="foo"), # name mismatch -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(dtype="float64"), # dtype mismatch + dict(index=[1, 2, 4]), # index mismatch + dict(name="foo"), # name mismatch + ], +) def test_series_not_equal_metadata_mismatch(kwargs): data = range(3) s1 = Series(data) @@ -104,9 +108,10 @@ def test_less_precise(data1, data2, dtype, check_less_precise): kwargs = dict(check_less_precise=check_less_precise) - if ((check_less_precise is False or check_less_precise == 10) or - ((check_less_precise is True or check_less_precise >= 3) and - abs(data1 - data2) >= 0.0001)): + if (check_less_precise is False or check_less_precise == 10) or ( + (check_less_precise is True or check_less_precise >= 3) + and abs(data1 - data2) >= 0.0001 + ): msg = "Series values are different" with pytest.raises(AssertionError, match=msg): assert_series_equal(s1, s2, **kwargs) @@ -114,19 +119,27 @@ def test_less_precise(data1, data2, dtype, check_less_precise): _assert_series_equal_both(s1, s2, **kwargs) -@pytest.mark.parametrize("s1,s2,msg", [ - # Index - (Series(["l1", "l2"], index=[1, 2]), - Series(["l1", "l2"], index=[1., 2.]), - "Series\\.index are different"), - - # MultiIndex - (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]).c, - DataFrame.from_records({"a": [1., 2.], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]).c, - "MultiIndex level \\[0\\] are different") -]) +@pytest.mark.parametrize( + "s1,s2,msg", + [ + # Index + ( + Series(["l1", "l2"], index=[1, 2]), + Series(["l1", "l2"], index=[1.0, 2.0]), + "Series\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + "MultiIndex level \\[0\\] are different", + ), + ], +) def test_series_equal_index_dtype(s1, s2, msg, check_index_type): kwargs = 
dict(check_index_type=check_index_type) diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index 7fa7989eff6902..e7b38bb2b700a7 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -13,16 +13,16 @@ def new_func(): This is the extended summary. The deprecate directive goes before this. """ - return 'new_func called' + return "new_func called" def new_func_no_docstring(): - return 'new_func_no_docstring called' + return "new_func_no_docstring called" def new_func_wrong_docstring(): """Summary should be in the next line.""" - return 'new_func_wrong_docstring called' + return "new_func_wrong_docstring called" def new_func_with_deprecation(): @@ -38,26 +38,28 @@ def new_func_with_deprecation(): def test_deprecate_ok(): - depr_func = deprecate('depr_func', new_func, '1.0', - msg='Use new_func instead.') + depr_func = deprecate("depr_func", new_func, "1.0", msg="Use new_func instead.") with tm.assert_produces_warning(FutureWarning): result = depr_func() - assert result == 'new_func called' + assert result == "new_func called" assert depr_func.__doc__ == dedent(new_func_with_deprecation.__doc__) def test_deprecate_no_docstring(): - depr_func = deprecate('depr_func', new_func_no_docstring, '1.0', - msg='Use new_func instead.') + depr_func = deprecate( + "depr_func", new_func_no_docstring, "1.0", msg="Use new_func instead." + ) with tm.assert_produces_warning(FutureWarning): result = depr_func() - assert result == 'new_func_no_docstring called' + assert result == "new_func_no_docstring called" def test_deprecate_wrong_docstring(): - with pytest.raises(AssertionError, match='deprecate needs a correctly ' - 'formatted docstring'): - deprecate('depr_func', new_func_wrong_docstring, '1.0', - msg='Use new_func instead.') + with pytest.raises( + AssertionError, match="deprecate needs a correctly " "formatted docstring" + ): + deprecate( + "depr_func", new_func_wrong_docstring, "1.0", msg="Use new_func instead." + ) diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py index b6e2f8e184a8df..c17c48197ccf75 100644 --- a/pandas/tests/util/test_deprecate_kwarg.py +++ b/pandas/tests/util/test_deprecate_kwarg.py @@ -27,10 +27,7 @@ def _f3(new=0): return new -@pytest.mark.parametrize("key,klass", [ - ("old", FutureWarning), - ("new", None) -]) +@pytest.mark.parametrize("key,klass", [("old", FutureWarning), ("new", None)]) def test_deprecate_kwarg(key, klass): x = 78 @@ -67,6 +64,7 @@ def test_bad_deprecate_kwarg(): msg = "mapping from old to new argument values must be dict or callable!" 
with pytest.raises(TypeError, match=msg): + @deprecate_kwarg("old", "new", 0) def f4(new=None): return new diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 31468a40f72dd3..27a23180b269ad 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -10,15 +10,18 @@ import pandas.util.testing as tm -@pytest.fixture(params=[ - Series([1, 2, 3] * 3, dtype="int32"), - Series([None, 2.5, 3.5] * 3, dtype="float32"), - Series(["a", "b", "c"] * 3, dtype="category"), - Series(["d", "e", "f"] * 3), - Series([True, False, True] * 3), - Series(pd.date_range("20130101", periods=9)), - Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), - Series(pd.timedelta_range("2000", periods=9))]) +@pytest.fixture( + params=[ + Series([1, 2, 3] * 3, dtype="int32"), + Series([None, 2.5, 3.5] * 3, dtype="float32"), + Series(["a", "b", "c"] * 3, dtype="category"), + Series(["d", "e", "f"] * 3), + Series([True, False, True] * 3), + Series(pd.date_range("20130101", periods=9)), + Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), + Series(pd.timedelta_range("2000", periods=9)), + ] +) def series(request): return request.param @@ -65,9 +68,13 @@ def test_consistency(): # Check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth. result = hash_pandas_object(Index(["foo", "bar", "baz"])) - expected = Series(np.array([3600424527151052760, 1374399572096150070, - 477881037637427054], dtype="uint64"), - index=["foo", "bar", "baz"]) + expected = Series( + np.array( + [3600424527151052760, 1374399572096150070, 477881037637427054], + dtype="uint64", + ), + index=["foo", "bar", "baz"], + ) tm.assert_series_equal(result, expected) @@ -76,10 +83,9 @@ def test_hash_array(series): tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) -@pytest.mark.parametrize("arr2", [ - np.array([3, 4, "All"]), - np.array([3, 4, "All"], dtype=object), -]) +@pytest.mark.parametrize( + "arr2", [np.array([3, 4, "All"]), np.array([3, 4, "All"], dtype=object)] +) def test_hash_array_mixed(arr2): result1 = hash_array(np.array(["3", "4", "All"])) result2 = hash_array(arr2) @@ -105,9 +111,10 @@ def test_hash_tuples(): assert result == expected[0] -@pytest.mark.parametrize("tup", [ - (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), - ("A", pd.Timestamp("2012-01-01"))]) +@pytest.mark.parametrize( + "tup", + [(1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), ("A", pd.Timestamp("2012-01-01"))], +) def test_hash_tuple(tup): # Test equivalence between # hash_tuples and hash_tuple. 
@@ -117,14 +124,26 @@ def test_hash_tuple(tup): assert result == expected -@pytest.mark.parametrize("val", [ - 1, 1.4, "A", b"A", pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz="Europe/Brussels"), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), - pd.Timedelta("1 days"), datetime.timedelta(1), - pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1), - np.nan, pd.NaT, None]) +@pytest.mark.parametrize( + "val", + [ + 1, + 1.4, + "A", + b"A", + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz="Europe/Brussels"), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), + pd.Timedelta("1 days"), + datetime.timedelta(1), + pd.Period("2012-01-01", freq="D"), + pd.Interval(0, 1), + np.nan, + pd.NaT, + None, + ], +) def test_hash_scalar(val): result = _hash_scalar(val) expected = hash_array(np.array([val], dtype=object), categorize=True) @@ -140,8 +159,7 @@ def test_hash_tuples_err(val): def test_multiindex_unique(): - mi = MultiIndex.from_tuples([(118, 472), (236, 118), - (51, 204), (102, 51)]) + mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) assert mi.is_unique is True result = hash_pandas_object(mi) @@ -149,9 +167,11 @@ def test_multiindex_unique(): def test_multiindex_objects(): - mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]], - codes=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=["col1", "col2"]) + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) recons = mi._sort_levels_monotonic() # These are equal. @@ -176,32 +196,36 @@ def test_multiindex_objects(): tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) -@pytest.mark.parametrize("obj", [ - Series([1, 2, 3]), - Series([1.0, 1.5, 3.2]), - Series([1.0, 1.5, np.nan]), - Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), - Series(["a", "b", "c"]), - Series(["a", np.nan, "c"]), - Series(["a", None, "c"]), - Series([True, False, True]), - Series(), - Index([1, 2, 3]), - Index([True, False, True]), - DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), - DataFrame(), - tm.makeMissingDataframe(), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), - Series(tm.makePeriodIndex()), - Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), - MultiIndex.from_product([range(5), ["foo", "bar", "baz"], - pd.date_range("20130101", periods=2)]), - MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]) -]) +@pytest.mark.parametrize( + "obj", + [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + Series(), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + MultiIndex.from_product( + [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] + ), + MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]), + ], +) def test_hash_pandas_object(obj, index): _check_equal(obj, index=index) _check_not_equal_with_index(obj) @@ -212,18 +236,23 
@@ def test_hash_pandas_object2(series, index): _check_not_equal_with_index(series) -@pytest.mark.parametrize("obj", [ - Series([], dtype="float64"), Series([], dtype="object"), Index([])]) +@pytest.mark.parametrize( + "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])] +) def test_hash_pandas_empty_object(obj, index): # These are by-definition the same with # or without the index as the data is empty. _check_equal(obj, index=index) -@pytest.mark.parametrize("s1", [ - Series(["a", "b", "c", "d"]), - Series([1000, 2000, 3000, 4000]), - Series(pd.date_range(0, periods=4))]) +@pytest.mark.parametrize( + "s1", + [ + Series(["a", "b", "c", "d"]), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4)), + ], +) @pytest.mark.parametrize("categorize", [True, False]) def test_categorical_consistency(s1, categorize): # see gh-15143 @@ -244,13 +273,11 @@ def test_categorical_consistency(s1, categorize): def test_categorical_with_nan_consistency(): c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], - categories=pd.date_range("2012-01-01", periods=5, name="B")) + [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") + ) expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes( - [-1, 0], - categories=[pd.Timestamp("2012-01-01")]) + c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) result = hash_array(c, categorize=False) assert result[0] in expected @@ -299,7 +326,7 @@ def test_alternate_encoding(index): @pytest.mark.parametrize("l_exp", range(8)) @pytest.mark.parametrize("l_add", [0, 1]) def test_same_len_hash_collisions(l_exp, l_add): - length = 2**(l_exp + 8) + l_add + length = 2 ** (l_exp + 8) + l_add s = tm.rands_array(length, 2) result = hash_array(s, "utf8") @@ -310,8 +337,10 @@ def test_hash_collisions(): # Hash collisions are bad. 
# # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 - hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa - "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa + hashes = [ + "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", + ] # noqa # These should be different. 
result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") @@ -323,5 +352,4 @@ def test_hash_collisions(): tm.assert_numpy_array_equal(result2, expected2) result = hash_array(np.asarray(hashes, dtype=object), "utf8") - tm.assert_numpy_array_equal(result, np.concatenate([expected1, - expected2], axis=0)) + tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) diff --git a/pandas/tests/util/test_move.py b/pandas/tests/util/test_move.py index 0987600574b8db..0e28dd2dd9d71d 100644 --- a/pandas/tests/util/test_move.py +++ b/pandas/tests/util/test_move.py @@ -19,6 +19,7 @@ def test_more_than_one_ref(): b = b"testing" with pytest.raises(BadMove, match="testing") as e: + def handle_success(type_, value, tb): assert value.args[0] is b return type(e).handle_success(e, type_, value, tb) # super diff --git a/pandas/tests/util/test_safe_import.py b/pandas/tests/util/test_safe_import.py index eef3657af65626..bd07bea934ed3b 100644 --- a/pandas/tests/util/test_safe_import.py +++ b/pandas/tests/util/test_safe_import.py @@ -15,21 +15,16 @@ def test_safe_import_exists(): assert td.safe_import("pandas") -@pytest.mark.parametrize("min_version,valid", [ - ("0.0.0", True), - ("99.99.99", False) -]) +@pytest.mark.parametrize("min_version,valid", [("0.0.0", True), ("99.99.99", False)]) def test_safe_import_versions(min_version, valid): result = td.safe_import("pandas", min_version=min_version) result = result if valid else not result assert result -@pytest.mark.parametrize("min_version,valid", [ - (None, False), - ("1.0", True), - ("2.0", False) -]) +@pytest.mark.parametrize( + "min_version,valid", [(None, False), ("1.0", True), ("2.0", False)] +) def test_safe_import_dummy(monkeypatch, min_version, valid): mod_name = "hello123" diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 88ce48245dc70c..83d9be1ad235f6 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -27,8 +27,7 @@ def test_rands_array_2d(): def test_numpy_err_state_is_default(): - expected = {"over": "warn", "divide": "warn", - "invalid": "warn", "under": "ignore"} + expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"} import numpy as np # The error state should be unchanged after that import. 
@@ -81,7 +80,7 @@ def test_assert_raises_regex_deprecated(): assert 1 == 2, msg -@pytest.mark.parametrize('strict_data_files', [True, False]) +@pytest.mark.parametrize("strict_data_files", [True, False]) def test_datapath_missing(datapath): with pytest.raises(ValueError, match="Could not find file"): datapath("not_a_file") diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index 581c394401223e..1f1365d62c64e8 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -21,10 +21,12 @@ def test_bad_arg_length_max_value_single(): min_fname_arg_count = 0 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): validate_args(_fname, args, min_fname_arg_count, compat_args) @@ -37,10 +39,12 @@ def test_bad_arg_length_max_value_multiple(): min_fname_arg_count = 2 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): validate_args(_fname, args, min_fname_arg_count, compat_args) @@ -49,9 +53,10 @@ def test_bad_arg_length_max_value_multiple(): @pytest.mark.parametrize("i", range(1, 3)) def test_not_all_defaults(i): bad_arg = "foo" - msg = ("the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + "the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) compat_args = OrderedDict() compat_args["foo"] = 2 diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 1d75de18723192..396056466bb811 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -16,15 +16,15 @@ def test_invalid_total_length_max_length_one(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_invalid_total_length_max_length_multiple(): @@ -36,21 +36,18 @@ def test_invalid_total_length_max_length_multiple(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) -@pytest.mark.parametrize("args,kwargs", [ - ((), {"foo": -5, "bar": 2}), - ((-5, 2), {}) -]) +@pytest.mark.parametrize("args,kwargs", [((), {"foo": -5, "bar": 2}), ((-5, 2), {})]) def test_missing_args_or_kwargs(args, kwargs): bad_arg = "bar" min_fname_arg_count = 2 @@ -59,13 +56,13 @@ def test_missing_args_or_kwargs(args, kwargs): compat_args["foo"] = -5 compat_args[bad_arg] = 1 - msg = (r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) with pytest.raises(ValueError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_duplicate_argument(): @@ -79,13 +76,12 @@ def test_duplicate_argument(): kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" - msg = (r"{fname}\(\) got multiple values for keyword " - r"argument '{arg}'".format(fname=_fname, arg="foo")) + msg = r"{fname}\(\) got multiple values for keyword " r"argument '{arg}'".format( + fname=_fname, arg="foo" + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_validation(): @@ -99,6 +95,4 @@ def test_validation(): args = (1, None) min_fname_arg_count = 2 - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index 1cf9736eaef921..ec9f3948403de1 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -16,8 +16,9 @@ def test_bad_kwarg(): compat_args[bad_arg + "o"] = "bar" kwargs = {good_arg: "foo", bad_arg: "bar"} - msg = (r"{fname}\(\) got an unexpected " - r"keyword argument '{arg}'".format(fname=_fname, arg=bad_arg)) + msg = r"{fname}\(\) got an unexpected " r"keyword argument '{arg}'".format( + fname=_fname, arg=bad_arg + ) with pytest.raises(TypeError, match=msg): validate_kwargs(_fname, kwargs, compat_args) @@ -26,9 +27,10 @@ def test_bad_kwarg(): @pytest.mark.parametrize("i", range(1, 3)) def test_not_all_none(i): bad_arg = "foo" - msg = (r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) compat_args = OrderedDict() compat_args["foo"] = 1 @@ -58,8 +60,10 @@ def test_validation(): @pytest.mark.parametrize("name", ["inplace", "copy"]) @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): - msg = ("For argument \"%s\" expected type bool, received type %s" % - (name, type(value).__name__)) + msg = 'For argument "%s" expected type bool, received type %s' % ( + name, + type(value).__name__, + ) with pytest.raises(ValueError, match=msg): validate_bool_kwarg(value, name) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index e1e4dd4cf4b8a7..c2b76188ad36be 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -6,15 +6,27 @@ # in `pandas.plotting`, or remove from here (I guess they are here for # legacy reasons from pandas.plotting._matplotlib.converter import ( - DatetimeConverter, MilliSecondLocator, PandasAutoDateFormatter, - PandasAutoDateLocator, PeriodConverter, TimeConverter, TimeFormatter, - TimeSeries_DateFormatter, TimeSeries_DateLocator, get_datevalue, - get_finder, time2num) + DatetimeConverter, + MilliSecondLocator, + PandasAutoDateFormatter, + PandasAutoDateLocator, + PeriodConverter, + TimeConverter, + TimeFormatter, + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + get_datevalue, + get_finder, + time2num, +) def register(): from pandas.plotting import register_matplotlib_converters - msg = ("'pandas.tseries.converter.register' has been moved and renamed to " - "'pandas.plotting.register_matplotlib_converters'. ") + + msg = ( + "'pandas.tseries.converter.register' has been moved and renamed to " + "'pandas.plotting.register_matplotlib_converters'. 
" + ) warnings.warn(msg, FutureWarning, stacklevel=2) register_matplotlib_converters() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4069a2004476b2..dfe91b514bbe1f 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -18,20 +18,32 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_period_arraylike, is_timedelta64_dtype) + is_datetime64_dtype, + is_period_arraylike, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.algorithms import unique from pandas.tseries.offsets import ( - DateOffset, Day, Hour, Micro, Milli, Minute, Nano, Second, prefix_mapping) + DateOffset, + Day, + Hour, + Micro, + Milli, + Minute, + Nano, + Second, + prefix_mapping, +) _ONE_MICRO = 1000 -_ONE_MILLI = (_ONE_MICRO * 1000) -_ONE_SECOND = (_ONE_MILLI * 1000) -_ONE_MINUTE = (60 * _ONE_SECOND) -_ONE_HOUR = (60 * _ONE_MINUTE) -_ONE_DAY = (24 * _ONE_HOUR) +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR # --------------------------------------------------------------------- # Offset names ("time rules") and related functions @@ -45,13 +57,15 @@ def get_period_alias(offset_str): return _offset_to_period_map.get(offset_str, None) -_name_to_offset_map = {'days': Day(1), - 'hours': Hour(1), - 'minutes': Minute(1), - 'seconds': Second(1), - 'milliseconds': Milli(1), - 'microseconds': Micro(1), - 'nanoseconds': Nano(1)} +_name_to_offset_map = { + "days": Day(1), + "hours": Hour(1), + "minutes": Minute(1), + "seconds": Second(1), + "milliseconds": Milli(1), + "microseconds": Micro(1), + "nanoseconds": Nano(1), +} def to_offset(freq): @@ -132,16 +146,17 @@ def to_offset(freq): stride_sign = None try: splitted = re.split(libfreqs.opattern, freq) - if splitted[-1] != '' and not splitted[-1].isspace(): + if splitted[-1] != "" and not splitted[-1].isspace(): # the last element must be blank - raise ValueError('last element must be blank') - for sep, stride, name in zip(splitted[0::4], splitted[1::4], - splitted[2::4]): - if sep != '' and not sep.isspace(): - raise ValueError('separator must be spaces') + raise ValueError("last element must be blank") + for sep, stride, name in zip( + splitted[0::4], splitted[1::4], splitted[2::4] + ): + if sep != "" and not sep.isspace(): + raise ValueError("separator must be spaces") prefix = libfreqs._lite_rule_alias.get(name) or name if stride_sign is None: - stride_sign = -1 if stride.startswith('-') else 1 + stride_sign = -1 if stride.startswith("-") else 1 if not stride: stride = 1 if prefix in Resolution._reso_str_bump_map.keys(): @@ -181,7 +196,7 @@ def get_offset(name): if name not in _offset_map: try: - split = name.split('-') + split = name.split("-") klass = prefix_mapping[split[0]] # handles case where there's no suffix (and will TypeError if too # many '-') @@ -221,16 +236,22 @@ def infer_freq(index, warn=True): if isinstance(index, ABCSeries): values = index._values - if not (is_datetime64_dtype(values) or - is_timedelta64_dtype(values) or - values.dtype == object): - raise TypeError("cannot infer freq from a non-convertible dtype " - "on a Series of {dtype}".format(dtype=index.dtype)) + if not ( + is_datetime64_dtype(values) + or is_timedelta64_dtype(values) + or values.dtype == object + ): + raise TypeError( + "cannot infer freq from a non-convertible dtype " + "on a Series of 
{dtype}".format(dtype=index.dtype) + ) index = values if is_period_arraylike(index): - raise TypeError("PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq.") + raise TypeError( + "PeriodIndex given. Check the `freq` attribute " + "instead of using infer_freq." + ) elif is_timedelta64_dtype(index): # Allow TimedeltaIndex and TimedeltaArray inferer = _TimedeltaFrequencyInferer(index, warn=warn) @@ -238,8 +259,10 @@ def infer_freq(index, warn=True): if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): - raise TypeError("cannot infer freq from a non-convertible index " - "type {type}".format(type=type(index))) + raise TypeError( + "cannot infer freq from a non-convertible index " + "type {type}".format(type=type(index)) + ) index = index.values if not isinstance(index, pd.DatetimeIndex): @@ -263,17 +286,18 @@ def __init__(self, index, warn=True): # This moves the values, which are implicitly in UTC, to the # the timezone so they are in local time - if hasattr(index, 'tz'): + if hasattr(index, "tz"): if index.tz is not None: self.values = tz_convert(self.values, UTC, index.tz) self.warn = warn if len(index) < 3: - raise ValueError('Need at least 3 dates to infer frequency') + raise ValueError("Need at least 3 dates to infer frequency") - self.is_monotonic = (self.index._is_monotonic_increasing or - self.index._is_monotonic_decreasing) + self.is_monotonic = ( + self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing + ) @cache_readonly def deltas(self): @@ -309,7 +333,7 @@ def get_freq(self): # Business hourly, maybe. 17: one day / 65: one weekend if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return 'BH' + return "BH" # Possibly intraday frequency. Here we use the # original .asi8 values as the modified values # will not work around DST transitions. 
See #8772 @@ -319,22 +343,22 @@ def get_freq(self): delta = self.deltas_asi8[0] if _is_multiple(delta, _ONE_HOUR): # Hours - return _maybe_add_count('H', delta / _ONE_HOUR) + return _maybe_add_count("H", delta / _ONE_HOUR) elif _is_multiple(delta, _ONE_MINUTE): # Minutes - return _maybe_add_count('T', delta / _ONE_MINUTE) + return _maybe_add_count("T", delta / _ONE_MINUTE) elif _is_multiple(delta, _ONE_SECOND): # Seconds - return _maybe_add_count('S', delta / _ONE_SECOND) + return _maybe_add_count("S", delta / _ONE_SECOND) elif _is_multiple(delta, _ONE_MILLI): # Milliseconds - return _maybe_add_count('L', delta / _ONE_MILLI) + return _maybe_add_count("L", delta / _ONE_MILLI) elif _is_multiple(delta, _ONE_MICRO): # Microseconds - return _maybe_add_count('U', delta / _ONE_MICRO) + return _maybe_add_count("U", delta / _ONE_MICRO) else: # Nanoseconds - return _maybe_add_count('N', delta) + return _maybe_add_count("N", delta) @cache_readonly def day_deltas(self): @@ -353,24 +377,23 @@ def rep_stamp(self): return Timestamp(self.values[0]) def month_position_check(self): - return libresolution.month_position_check(self.fields, - self.index.dayofweek) + return libresolution.month_position_check(self.fields, self.index.dayofweek) @cache_readonly def mdiffs(self): - nmonths = self.fields['Y'] * 12 + self.fields['M'] - return unique_deltas(nmonths.astype('i8')) + nmonths = self.fields["Y"] * 12 + self.fields["M"] + return unique_deltas(nmonths.astype("i8")) @cache_readonly def ydiffs(self): - return unique_deltas(self.fields['Y'].astype('i8')) + return unique_deltas(self.fields["Y"].astype("i8")) def _infer_daily_rule(self): annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] month = MONTH_ALIASES[self.rep_stamp.month] - alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) + alias = "{prefix}-{month}".format(prefix=annual_rule, month=month) return _maybe_add_count(alias, nyears) quarterly_rule = self._get_quarterly_rule() @@ -378,8 +401,7 @@ def _infer_daily_rule(self): nquarters = self.mdiffs[0] / 3 mod_dict = {0: 12, 2: 11, 1: 10} month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] - alias = '{prefix}-{month}'.format(prefix=quarterly_rule, - month=month) + alias = "{prefix}-{month}".format(prefix=quarterly_rule, month=month) return _maybe_add_count(alias, nquarters) monthly_rule = self._get_monthly_rule() @@ -391,13 +413,12 @@ def _infer_daily_rule(self): if days % 7 == 0: # Weekly day = int_to_weekday[self.rep_stamp.weekday()] - return _maybe_add_count( - 'W-{day}'.format(day=day), days / 7) + return _maybe_add_count("W-{day}".format(day=day), days / 7) else: - return _maybe_add_count('D', days) + return _maybe_add_count("D", days) if self._is_business_daily(): - return 'B' + return "B" wom_rule = self._get_wom_rule() if wom_rule: @@ -407,12 +428,11 @@ def _get_annual_rule(self): if len(self.ydiffs) > 1: return None - if len(unique(self.fields['M'])) > 1: + if len(unique(self.fields["M"])) > 1: return None pos_check = self.month_position_check() - return {'cs': 'AS', 'bs': 'BAS', - 'ce': 'A', 'be': 'BA'}.get(pos_check) + return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) def _get_quarterly_rule(self): if len(self.mdiffs) > 1: @@ -422,15 +442,13 @@ def _get_quarterly_rule(self): return None pos_check = self.month_position_check() - return {'cs': 'QS', 'bs': 'BQS', - 'ce': 'Q', 'be': 'BQ'}.get(pos_check) + return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check) def _get_monthly_rule(self): if len(self.mdiffs) > 1: 
return None pos_check = self.month_position_check() - return {'cs': 'MS', 'bs': 'BMS', - 'ce': 'M', 'be': 'BM'}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) def _is_business_daily(self): # quick check: cannot be business daily @@ -442,8 +460,10 @@ def _is_business_daily(self): shifts = np.diff(self.index.asi8) shifts = np.floor_divide(shifts, _ONE_DAY) weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) - return np.all(((weekdays == 0) & (shifts == 3)) | - ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) + return np.all( + ((weekdays == 0) & (shifts == 3)) + | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)) + ) def _get_wom_rule(self): # wdiffs = unique(np.diff(self.index.week)) @@ -465,21 +485,20 @@ def _get_wom_rule(self): week = week_of_months[0] + 1 wd = int_to_weekday[weekdays[0]] - return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) + return "WOM-{week}{weekday}".format(week=week, weekday=wd) class _TimedeltaFrequencyInferer(_FrequencyInferer): - def _infer_daily_rule(self): if self.is_unique: days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly wd = int_to_weekday[self.rep_stamp.weekday()] - alias = 'W-{weekday}'.format(weekday=wd) + alias = "W-{weekday}".format(weekday=wd) return _maybe_add_count(alias, days / 7) else: - return _maybe_add_count('D', days) + return _maybe_add_count("D", days) def _is_multiple(us, mult): @@ -490,6 +509,6 @@ def _maybe_add_count(base, count): if count != 1: assert count == int(count) count = int(count) - return '{count}{base}'.format(count=count, base=base) + return "{count}{base}".format(count=count, base=base) else: return base diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 7171a6a182bdc3..1654163d2a9e0d 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -127,9 +127,18 @@ class Holiday: for observance. 
""" - def __init__(self, name, year=None, month=None, day=None, offset=None, - observance=None, start_date=None, end_date=None, - days_of_week=None): + def __init__( + self, + name, + year=None, + month=None, + day=None, + offset=None, + observance=None, + start_date=None, + end_date=None, + days_of_week=None, + ): """ Parameters ---------- @@ -166,27 +175,27 @@ class from pandas.tseries.offsets self.month = month self.day = day self.offset = offset - self.start_date = Timestamp( - start_date) if start_date is not None else start_date - self.end_date = Timestamp( - end_date) if end_date is not None else end_date + self.start_date = ( + Timestamp(start_date) if start_date is not None else start_date + ) + self.end_date = Timestamp(end_date) if end_date is not None else end_date self.observance = observance - assert (days_of_week is None or type(days_of_week) == tuple) + assert days_of_week is None or type(days_of_week) == tuple self.days_of_week = days_of_week def __repr__(self): - info = '' + info = "" if self.year is not None: - info += 'year={year}, '.format(year=self.year) - info += 'month={mon}, day={day}, '.format(mon=self.month, day=self.day) + info += "year={year}, ".format(year=self.year) + info += "month={mon}, day={day}, ".format(mon=self.month, day=self.day) if self.offset is not None: - info += 'offset={offset}'.format(offset=self.offset) + info += "offset={offset}".format(offset=self.offset) if self.observance is not None: - info += 'observance={obs}'.format(obs=self.observance) + info += "observance={obs}".format(obs=self.observance) - repr = 'Holiday: {name} ({info})'.format(name=self.name, info=info) + repr = "Holiday: {name} ({info})".format(name=self.name, info=info) return repr def dates(self, start_date, end_date, return_name=False): @@ -217,17 +226,21 @@ def dates(self, start_date, end_date, return_name=False): dates = self._reference_dates(start_date, end_date) holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: - holiday_dates = holiday_dates[np.in1d(holiday_dates.dayofweek, - self.days_of_week)] + holiday_dates = holiday_dates[ + np.in1d(holiday_dates.dayofweek, self.days_of_week) + ] if self.start_date is not None: - filter_start_date = max(self.start_date.tz_localize( - filter_start_date.tz), filter_start_date) + filter_start_date = max( + self.start_date.tz_localize(filter_start_date.tz), filter_start_date + ) if self.end_date is not None: - filter_end_date = min(self.end_date.tz_localize( - filter_end_date.tz), filter_end_date) - holiday_dates = holiday_dates[(holiday_dates >= filter_start_date) & - (holiday_dates <= filter_end_date)] + filter_end_date = min( + self.end_date.tz_localize(filter_end_date.tz), filter_end_date + ) + holiday_dates = holiday_dates[ + (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date) + ] if return_name: return Series(self.name, index=holiday_dates) return holiday_dates @@ -249,14 +262,19 @@ def _reference_dates(self, start_date, end_date): year_offset = DateOffset(years=1) reference_start_date = Timestamp( - datetime(start_date.year - 1, self.month, self.day)) + datetime(start_date.year - 1, self.month, self.day) + ) reference_end_date = Timestamp( - datetime(end_date.year + 1, self.month, self.day)) + datetime(end_date.year + 1, self.month, self.day) + ) # Don't process unnecessary holidays - dates = date_range(start=reference_start_date, - end=reference_end_date, - freq=year_offset, tz=start_date.tz) + dates = date_range( + start=reference_start_date, + end=reference_end_date, + 
freq=year_offset, + tz=start_date.tz, + ) return dates @@ -315,7 +333,6 @@ def get_calendar(name): class HolidayCalendarMetaClass(type): - def __new__(cls, clsname, bases, attrs): calendar_class = super().__new__(cls, clsname, bases, attrs) register(calendar_class) @@ -326,6 +343,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): """ Abstract interface to create holidays following certain rules. """ + rules = [] # type: List[Holiday] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2030, 12, 31)) @@ -375,8 +393,10 @@ def holidays(self, start=None, end=None, return_name=False): DatetimeIndex of holidays """ if self.rules is None: - raise Exception('Holiday Calendar {name} does not have any ' - 'rules specified'.format(name=self.name)) + raise Exception( + "Holiday Calendar {name} does not have any " + "rules specified".format(name=self.name) + ) if start is None: start = AbstractHolidayCalendar.start_date @@ -390,8 +410,7 @@ def holidays(self, start=None, end=None, return_name=False): holidays = None # If we don't have a cache or the dates are outside the prior cache, we # get them again - if (self._cache is None or start < self._cache[0] or - end > self._cache[1]): + if self._cache is None or start < self._cache[0] or end > self._cache[1]: for rule in self.rules: rule_holidays = rule.dates(start, end, return_name=True) @@ -464,23 +483,29 @@ def merge(self, other, inplace=False): return holidays -USMemorialDay = Holiday('Memorial Day', month=5, day=31, - offset=DateOffset(weekday=MO(-1))) -USLaborDay = Holiday('Labor Day', month=9, day=1, - offset=DateOffset(weekday=MO(1))) -USColumbusDay = Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))) -USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1, - offset=DateOffset(weekday=TH(4))) -USMartinLutherKingJr = Holiday('Martin Luther King Jr. Day', - start_date=datetime(1986, 1, 1), month=1, day=1, - offset=DateOffset(weekday=MO(3))) -USPresidentsDay = Holiday('Presidents Day', month=2, day=1, - offset=DateOffset(weekday=MO(3))) +USMemorialDay = Holiday( + "Memorial Day", month=5, day=31, offset=DateOffset(weekday=MO(-1)) +) +USLaborDay = Holiday("Labor Day", month=9, day=1, offset=DateOffset(weekday=MO(1))) +USColumbusDay = Holiday( + "Columbus Day", month=10, day=1, offset=DateOffset(weekday=MO(2)) +) +USThanksgivingDay = Holiday( + "Thanksgiving", month=11, day=1, offset=DateOffset(weekday=TH(4)) +) +USMartinLutherKingJr = Holiday( + "Martin Luther King Jr. 
Day", + start_date=datetime(1986, 1, 1), + month=1, + day=1, + offset=DateOffset(weekday=MO(3)), +) +USPresidentsDay = Holiday( + "Presidents Day", month=2, day=1, offset=DateOffset(weekday=MO(3)) +) GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)]) -EasterMonday = Holiday("Easter Monday", month=1, day=1, - offset=[Easter(), Day(1)]) +EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)]) class USFederalHolidayCalendar(AbstractHolidayCalendar): @@ -489,22 +514,22 @@ class USFederalHolidayCalendar(AbstractHolidayCalendar): https://www.opm.gov/policy-data-oversight/ snow-dismissal-procedures/federal-holidays/ """ + rules = [ - Holiday('New Years Day', month=1, day=1, observance=nearest_workday), + Holiday("New Years Day", month=1, day=1, observance=nearest_workday), USMartinLutherKingJr, USPresidentsDay, USMemorialDay, - Holiday('July 4th', month=7, day=4, observance=nearest_workday), + Holiday("July 4th", month=7, day=4, observance=nearest_workday), USLaborDay, USColumbusDay, - Holiday('Veterans Day', month=11, day=11, observance=nearest_workday), + Holiday("Veterans Day", month=11, day=11, observance=nearest_workday), USThanksgivingDay, - Holiday('Christmas', month=12, day=25, observance=nearest_workday) + Holiday("Christmas", month=12, day=25, observance=nearest_workday), ] -def HolidayCalendarFactory(name, base, other, - base_class=AbstractHolidayCalendar): +def HolidayCalendarFactory(name, base, other, base_class=AbstractHolidayCalendar): rules = AbstractHolidayCalendar.merge_class(base, other) calendar_class = type(name, (base_class,), {"rules": rules, "name": name}) return calendar_class diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 087c05574090ca..ac3e92c772517a 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -7,12 +7,29 @@ import numpy as np from pandas._libs.tslibs import ( - NaT, OutOfBoundsDatetime, Timedelta, Timestamp, ccalendar, conversion, - delta_to_nanoseconds, frequencies as libfrequencies, normalize_date, - offsets as liboffsets, timezones) + NaT, + OutOfBoundsDatetime, + Timedelta, + Timestamp, + ccalendar, + conversion, + delta_to_nanoseconds, + frequencies as libfrequencies, + normalize_date, + offsets as liboffsets, + timezones, +) from pandas._libs.tslibs.offsets import ( - ApplyTypeError, BaseOffset, _get_calendar, _is_normalized, _to_dt64, - apply_index_wraps, as_datetime, roll_yearday, shift_month) + ApplyTypeError, + BaseOffset, + _get_calendar, + _is_normalized, + _to_dt64, + apply_index_wraps, + as_datetime, + roll_yearday, + shift_month, +) from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -21,17 +38,44 @@ from pandas.core.tools.datetimes import to_datetime -__all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', - 'CBMonthEnd', 'CBMonthBegin', - 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'BusinessHour', 'CustomBusinessHour', - 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', - 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', - 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253', - 'Week', 'WeekOfMonth', 'Easter', - 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset'] +__all__ = [ + "Day", + "BusinessDay", + "BDay", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "MonthBegin", + "BMonthBegin", + "MonthEnd", + "BMonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + 
"BusinessHour", + "CustomBusinessHour", + "YearBegin", + "BYearBegin", + "YearEnd", + "BYearEnd", + "QuarterBegin", + "BQuarterBegin", + "QuarterEnd", + "BQuarterEnd", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + "Week", + "WeekOfMonth", + "Easter", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", +] # convert to/from datetime/timestamp to allow invalid Timestamp ranges to # pass thru @@ -58,8 +102,8 @@ def wrapper(self, other): elif isinstance(other, (np.datetime64, datetime, date)): other = as_timestamp(other) - tz = getattr(other, 'tzinfo', None) - nano = getattr(other, 'nanosecond', 0) + tz = getattr(other, "tzinfo", None) + nano = getattr(other, "nanosecond", 0) try: if self._adjust_dst and isinstance(other, Timestamp): @@ -80,7 +124,8 @@ def wrapper(self, other): if result.tz is not None: # convert to UTC value = conversion.tz_convert_single( - result.value, timezones.UTC, result.tz) + result.value, timezones.UTC, result.tz + ) else: value = result.value result = Timestamp(value + nano) @@ -101,6 +146,7 @@ def wrapper(self, other): result = Timestamp(result) return result + return wrapper @@ -199,11 +245,11 @@ def __add__(date): >>> ts + DateOffset(month=3) Timestamp('2017-03-01 09:10:11') """ + _params = cache_readonly(BaseOffset._params.fget) _use_relativedelta = False _adjust_dst = False - _attributes = frozenset(['n', 'normalize'] + - list(liboffsets.relativedelta_kwds)) + _attributes = frozenset(["n", "normalize"] + list(liboffsets.relativedelta_kwds)) # default for prior pickles normalize = False @@ -224,7 +270,7 @@ def apply(self, other): other = as_datetime(other) if len(self.kwds) > 0: - tzinfo = getattr(other, 'tzinfo', None) + tzinfo = getattr(other, "tzinfo", None) if tzinfo is not None and self._use_relativedelta: # perform calculation in UTC other = other.replace(tzinfo=None) @@ -261,72 +307,83 @@ def apply_index(self, i): """ if type(self) is not DateOffset: - raise NotImplementedError("DateOffset subclass {name} " - "does not have a vectorized " - "implementation".format( - name=self.__class__.__name__)) + raise NotImplementedError( + "DateOffset subclass {name} " + "does not have a vectorized " + "implementation".format(name=self.__class__.__name__) + ) kwds = self.kwds - relativedelta_fast = {'years', 'months', 'weeks', 'days', 'hours', - 'minutes', 'seconds', 'microseconds'} + relativedelta_fast = { + "years", + "months", + "weeks", + "days", + "hours", + "minutes", + "seconds", + "microseconds", + } # relativedelta/_offset path only valid for base DateOffset - if (self._use_relativedelta and - set(kwds).issubset(relativedelta_fast)): + if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): - months = ((kwds.get('years', 0) * 12 + - kwds.get('months', 0)) * self.n) + months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n if months: shifted = liboffsets.shift_months(i.asi8, months) i = type(i)(shifted, dtype=i.dtype) - weeks = (kwds.get('weeks', 0)) * self.n + weeks = (kwds.get("weeks", 0)) * self.n if weeks: # integer addition on PeriodIndex is deprecated, # so we directly use _time_shift instead - asper = i.to_period('W') + asper = i.to_period("W") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data shifted = asper._time_shift(weeks) - i = shifted.to_timestamp() + i.to_perioddelta('W') + i = shifted.to_timestamp() + i.to_perioddelta("W") - timedelta_kwds = {k: v for k, v in kwds.items() - if k in ['days', 'hours', 'minutes', - 'seconds', 
'microseconds']} + timedelta_kwds = { + k: v + for k, v in kwds.items() + if k in ["days", "hours", "minutes", "seconds", "microseconds"] + } if timedelta_kwds: delta = Timedelta(**timedelta_kwds) i = i + (self.n * delta) return i - elif not self._use_relativedelta and hasattr(self, '_offset'): + elif not self._use_relativedelta and hasattr(self, "_offset"): # timedelta return i + (self._offset * self.n) else: # relativedelta with other keywords kwd = set(kwds) - relativedelta_fast - raise NotImplementedError("DateOffset with relativedelta " - "keyword(s) {kwd} not able to be " - "applied vectorized".format(kwd=kwd)) + raise NotImplementedError( + "DateOffset with relativedelta " + "keyword(s) {kwd} not able to be " + "applied vectorized".format(kwd=kwd) + ) def isAnchored(self): # TODO: Does this make sense for the general case? It would help # if there were a canonical docstring for what isAnchored means. - return (self.n == 1) + return self.n == 1 # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` def _repr_attrs(self): - exclude = {'n', 'inc', 'normalize'} + exclude = {"n", "inc", "normalize"} attrs = [] for attr in sorted(self.__dict__): - if attr.startswith('_') or attr == 'kwds': + if attr.startswith("_") or attr == "kwds": continue elif attr not in exclude: value = getattr(self, attr) - attrs.append('{attr}={value}'.format(attr=attr, value=value)) + attrs.append("{attr}={value}".format(attr=attr, value=value)) - out = '' + out = "" if attrs: - out += ': ' + ', '.join(attrs) + out += ": " + ", ".join(attrs) return out @property @@ -372,13 +429,13 @@ def onOffset(self, dt): # date range generated by this offset. Subclasses may have this # re-implemented in a nicer way. a = dt - b = ((dt + self) - self) + b = (dt + self) - self return a == b # way to get around weirdness with rule_code @property def _prefix(self): - raise NotImplementedError('Prefix not defined') + raise NotImplementedError("Prefix not defined") @property def rule_code(self): @@ -392,7 +449,7 @@ def freqstr(self): return repr(self) if self.n != 1: - fstr = '{n}{code}'.format(n=self.n, code=code) + fstr = "{n}{code}".format(n=self.n, code=code) else: fstr = code @@ -406,7 +463,7 @@ def freqstr(self): return fstr def _offset_str(self): - return '' + return "" @property def nanos(self): @@ -427,10 +484,11 @@ class _CustomMixin: Mixin for classes that define and validate calendar, holidays, and weekdays attributes. """ + def __init__(self, weekmask, holidays, calendar): - calendar, holidays = _get_calendar(weekmask=weekmask, - holidays=holidays, - calendar=calendar) + calendar, holidays = _get_calendar( + weekmask=weekmask, holidays=holidays, calendar=calendar + ) # Custom offset instances are identified by the # following two attributes. See DateOffset._params() # holidays, weekmask @@ -455,12 +513,12 @@ def offset(self): def _repr_attrs(self): if self.offset: - attrs = ['offset={offset!r}'.format(offset=self.offset)] + attrs = ["offset={offset!r}".format(offset=self.offset)] else: attrs = None - out = '' + out = "" if attrs: - out += ': ' + ', '.join(attrs) + out += ": " + ", ".join(attrs) return out @@ -468,9 +526,10 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n business days. 
""" - _prefix = 'B' + + _prefix = "B" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'offset']) + _attributes = frozenset(["n", "normalize", "offset"]) def __init__(self, n=1, normalize=False, offset=timedelta(0)): BaseOffset.__init__(self, n, normalize) @@ -478,34 +537,34 @@ def __init__(self, n=1, normalize=False, offset=timedelta(0)): def _offset_str(self): def get_str(td): - off_str = '' + off_str = "" if td.days > 0: - off_str += str(td.days) + 'D' + off_str += str(td.days) + "D" if td.seconds > 0: s = td.seconds hrs = int(s / 3600) if hrs != 0: - off_str += str(hrs) + 'H' + off_str += str(hrs) + "H" s -= hrs * 3600 mts = int(s / 60) if mts != 0: - off_str += str(mts) + 'Min' + off_str += str(mts) + "Min" s -= mts * 60 if s != 0: - off_str += str(s) + 's' + off_str += str(s) + "s" if td.microseconds > 0: - off_str += str(td.microseconds) + 'us' + off_str += str(td.microseconds) + "us" return off_str if isinstance(self.offset, timedelta): zero = timedelta(0, 0, 0) if self.offset >= zero: - off_str = '+' + get_str(self.offset) + off_str = "+" + get_str(self.offset) else: - off_str = '-' + get_str(-self.offset) + off_str = "-" + get_str(-self.offset) return off_str else: - return '+' + repr(self.offset) + return "+" + repr(self.offset) @apply_wraps def apply(self, other): @@ -541,24 +600,24 @@ def apply(self, other): return result elif isinstance(other, (timedelta, Tick)): - return BDay(self.n, offset=self.offset + other, - normalize=self.normalize) + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) else: - raise ApplyTypeError('Only know how to combine business day with ' - 'datetime or timedelta.') + raise ApplyTypeError( + "Only know how to combine business day with " "datetime or timedelta." + ) @apply_index_wraps def apply_index(self, i): - time = i.to_perioddelta('D') + time = i.to_perioddelta("D") # to_period rolls forward to next BDay; track and # reduce n where it does when rolling forward - asper = i.to_period('B') + asper = i.to_period("B") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data if self.n > 0: - shifted = (i.to_perioddelta('B') - time).asi8 != 0 + shifted = (i.to_perioddelta("B") - time).asi8 != 0 # Integer-array addition is deprecated, so we use # _time_shift directly @@ -579,27 +638,26 @@ def onOffset(self, dt): class BusinessHourMixin(BusinessMixin): - - def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): + def __init__(self, start="09:00", end="17:00", offset=timedelta(0)): # must be validated here to equality check if not is_list_like(start): start = [start] if not len(start): - raise ValueError('Must include at least 1 start time') + raise ValueError("Must include at least 1 start time") if not is_list_like(end): end = [end] if not len(end): - raise ValueError('Must include at least 1 end time') + raise ValueError("Must include at least 1 end time") - start = np.array([liboffsets._validate_business_time(x) - for x in start]) + start = np.array([liboffsets._validate_business_time(x) for x in start]) end = np.array([liboffsets._validate_business_time(x) for x in end]) # Validation of input if len(start) != len(end): - raise ValueError('number of starting time and ending time ' - 'must be the same') + raise ValueError( + "number of starting time and ending time " "must be the same" + ) num_openings = len(start) # sort starting and ending time by starting time @@ -613,11 +671,14 @@ def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): for i 
in range(num_openings): total_secs += self._get_business_hours_by_sec(start[i], end[i]) total_secs += self._get_business_hours_by_sec( - end[i], start[(i + 1) % num_openings]) + end[i], start[(i + 1) % num_openings] + ) if total_secs != 24 * 60 * 60: - raise ValueError('invalid starting and ending time(s): ' - 'opening hours should not touch or overlap with ' - 'one another') + raise ValueError( + "invalid starting and ending time(s): " + "opening hours should not touch or overlap with " + "one another" + ) object.__setattr__(self, "start", start) object.__setattr__(self, "end", end) @@ -632,12 +693,14 @@ def next_bday(self): nb_offset = 1 else: nb_offset = -1 - if self._prefix.startswith('C'): + if self._prefix.startswith("C"): # CustomBusinessHour - return CustomBusinessDay(n=nb_offset, - weekmask=self.weekmask, - holidays=self.holidays, - calendar=self.calendar) + return CustomBusinessDay( + n=nb_offset, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) else: return BusinessDay(n=nb_offset) @@ -771,25 +834,31 @@ def _get_closing_time(self, dt): for i, st in enumerate(self.start): if st.hour == dt.hour and st.minute == dt.minute: return dt + timedelta( - seconds=self._get_business_hours_by_sec(st, self.end[i])) + seconds=self._get_business_hours_by_sec(st, self.end[i]) + ) assert False @apply_wraps def apply(self, other): if isinstance(other, datetime): # used for detecting edge condition - nanosecond = getattr(other, 'nanosecond', 0) + nanosecond = getattr(other, "nanosecond", 0) # reset timezone and nanosecond # other may be a Timestamp, thus not use replace - other = datetime(other.year, other.month, other.day, - other.hour, other.minute, - other.second, other.microsecond) + other = datetime( + other.year, + other.month, + other.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) n = self.n # adjust other to reduce number of cases to handle if n >= 0: - if (other.time() in self.end or - not self._onOffset(other)): + if other.time() in self.end or not self._onOffset(other): other = self._next_opening_time(other) else: if other.time() in self.start: @@ -800,8 +869,10 @@ def apply(self, other): other = self._get_closing_time(other) # get total business hours by sec in one business day - businesshours = sum(self._get_business_hours_by_sec(st, en) - for st, en in zip(self.start, self.end)) + businesshours = sum( + self._get_business_hours_by_sec(st, en) + for st, en in zip(self.start, self.end) + ) bd, r = divmod(abs(n * 60), businesshours // 60) if n < 0: @@ -824,8 +895,9 @@ def apply(self, other): if n >= 0: while bhour_remain != timedelta(0): # business hour left in this business time interval - bhour = self._get_closing_time( - self._prev_opening_time(other)) - other + bhour = ( + self._get_closing_time(self._prev_opening_time(other)) - other + ) if bhour_remain < bhour: # finish adjusting if possible other += bhour_remain @@ -838,8 +910,11 @@ def apply(self, other): while bhour_remain != timedelta(0): # business hour left in this business time interval bhour = self._next_opening_time(other) - other - if (bhour_remain > bhour or - bhour_remain == bhour and nanosecond != 0): + if ( + bhour_remain > bhour + or bhour_remain == bhour + and nanosecond != 0 + ): # finish adjusting if possible other += bhour_remain bhour_remain = timedelta(0) @@ -848,20 +923,22 @@ def apply(self, other): bhour_remain -= bhour other = self._get_closing_time( self._next_opening_time( - other + bhour - timedelta(seconds=1))) + other + bhour - 
timedelta(seconds=1) + ) + ) return other else: - raise ApplyTypeError( - 'Only know how to combine business hour with datetime') + raise ApplyTypeError("Only know how to combine business hour with datetime") def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False if dt.tzinfo is not None: - dt = datetime(dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second, dt.microsecond) + dt = datetime( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond + ) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time return self._onOffset(dt) @@ -882,8 +959,7 @@ def _onOffset(self, dt): businesshours = 0 for i, st in enumerate(self.start): if op.hour == st.hour and op.minute == st.minute: - businesshours = self._get_business_hours_by_sec( - st, self.end[i]) + businesshours = self._get_business_hours_by_sec(st, self.end[i]) if span <= businesshours: return True else: @@ -891,11 +967,12 @@ def _onOffset(self, dt): def _repr_attrs(self): out = super()._repr_attrs() - hours = ','.join('{}-{}'.format( - st.strftime('%H:%M'), en.strftime('%H:%M')) - for st, en in zip(self.start, self.end)) - attrs = ['{prefix}={hours}'.format(prefix=self._prefix, hours=hours)] - out += ': ' + ', '.join(attrs) + hours = ",".join( + "{}-{}".format(st.strftime("%H:%M"), en.strftime("%H:%M")) + for st, en in zip(self.start, self.end) + ) + attrs = ["{prefix}={hours}".format(prefix=self._prefix, hours=hours)] + out += ": " + ", ".join(attrs) return out @@ -905,12 +982,14 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): .. versionadded:: 0.16.1 """ - _prefix = 'BH' + + _prefix = "BH" _anchor = 0 - _attributes = frozenset(['n', 'normalize', 'start', 'end', 'offset']) + _attributes = frozenset(["n", "normalize", "start", "end", "offset"]) - def __init__(self, n=1, normalize=False, start='09:00', - end='17:00', offset=timedelta(0)): + def __init__( + self, n=1, normalize=False, start="09:00", end="17:00", offset=timedelta(0) + ): BaseOffset.__init__(self, n, normalize) super().__init__(start=start, end=end, offset=offset) @@ -933,12 +1012,21 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): calendar : pd.HolidayCalendar or np.busdaycalendar offset : timedelta, default timedelta(0) """ - _prefix = 'C' - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', 'offset']) - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): + _prefix = "C" + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -947,16 +1035,17 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', @apply_wraps def apply(self, other): if self.n <= 0: - roll = 'forward' + roll = "forward" else: - roll = 'backward' + roll = "backward" if isinstance(other, datetime): date_in = other np_dt = np.datetime64(date_in.date()) - np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll, - busdaycal=self.calendar) + np_incr_dt = np.busday_offset( + np_dt, self.n, roll=roll, busdaycal=self.calendar + ) dt_date = np_incr_dt.astype(datetime) result = datetime.combine(dt_date, date_in.time()) @@ -966,11 +1055,12 @@ def apply(self, other): return result elif 
isinstance(other, (timedelta, Tick)): - return BDay(self.n, offset=self.offset + other, - normalize=self.normalize) + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) else: - raise ApplyTypeError('Only know how to combine trading day with ' - 'datetime, datetime64 or timedelta.') + raise ApplyTypeError( + "Only know how to combine trading day with " + "datetime, datetime64 or timedelta." + ) def apply_index(self, i): raise NotImplementedError @@ -978,26 +1068,34 @@ def apply_index(self, i): def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False - day64 = _to_dt64(dt, 'datetime64[D]') + day64 = _to_dt64(dt, "datetime64[D]") return np.is_busday(day64, busdaycal=self.calendar) -class CustomBusinessHour(_CustomMixin, BusinessHourMixin, - SingleConstructorOffset): +class CustomBusinessHour(_CustomMixin, BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n custom business days. .. versionadded:: 0.18.1 """ - _prefix = 'CBH' - _anchor = 0 - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', - 'start', 'end', 'offset']) - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, - start='09:00', end='17:00', offset=timedelta(0)): + _prefix = "CBH" + _anchor = 0 + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + start="09:00", + end="17:00", + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -1011,7 +1109,7 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', class MonthOffset(SingleConstructorOffset): _adjust_dst = True - _attributes = frozenset(['n', 'normalize']) + _attributes = frozenset(["n", "normalize"]) __init__ = BaseOffset.__init__ @@ -1021,8 +1119,7 @@ def name(self): return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] - return "{code}-{month}".format(code=self.rule_code, - month=month) + return "{code}-{month}".format(code=self.rule_code, month=month) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1047,32 +1144,36 @@ class MonthEnd(MonthOffset): """ DateOffset of one month end. """ - _prefix = 'M' - _day_opt = 'end' + + _prefix = "M" + _day_opt = "end" class MonthBegin(MonthOffset): """ DateOffset of one month at beginning. """ - _prefix = 'MS' - _day_opt = 'start' + + _prefix = "MS" + _day_opt = "start" class BusinessMonthEnd(MonthOffset): """ DateOffset increments between business EOM dates. """ - _prefix = 'BM' - _day_opt = 'business_end' + + _prefix = "BM" + _day_opt = "business_end" class BusinessMonthBegin(MonthOffset): """ DateOffset of one business month at beginning. """ - _prefix = 'BMS' - _day_opt = 'business_start' + + _prefix = "BMS" + _day_opt = "business_start" class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): @@ -1097,14 +1198,23 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): offset : timedelta, default timedelta(0) Time offset to apply. 
""" - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', 'offset']) - onOffset = DateOffset.onOffset # override MonthOffset method + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + onOffset = DateOffset.onOffset # override MonthOffset method apply_index = DateOffset.apply_index # override MonthOffset method - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -1117,7 +1227,7 @@ def cbday_roll(self): """ cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds) - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin roll_func = cbday.rollforward else: @@ -1127,7 +1237,7 @@ def cbday_roll(self): @cache_readonly def m_offset(self): - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin moff = MonthBegin(n=1, normalize=False) else: @@ -1140,7 +1250,7 @@ def month_roll(self): """ Define default roll function to be called in apply method. """ - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin roll_func = self.m_offset.rollback else: @@ -1165,36 +1275,37 @@ def apply(self, other): @Substitution(bound="end") @Appender(_CustomBusinessMonth.__doc__) class CustomBusinessMonthEnd(_CustomBusinessMonth): - _prefix = 'CBM' + _prefix = "CBM" @Substitution(bound="beginning") @Appender(_CustomBusinessMonth.__doc__) class CustomBusinessMonthBegin(_CustomBusinessMonth): - _prefix = 'CBMS' + _prefix = "CBMS" # --------------------------------------------------------------------- # Semi-Month Based Offset Classes + class SemiMonthOffset(DateOffset): _adjust_dst = True _default_day_of_month = 15 _min_day_of_month = 2 - _attributes = frozenset(['n', 'normalize', 'day_of_month']) + _attributes = frozenset(["n", "normalize", "day_of_month"]) def __init__(self, n=1, normalize=False, day_of_month=None): BaseOffset.__init__(self, n, normalize) if day_of_month is None: - object.__setattr__(self, "day_of_month", - self._default_day_of_month) + object.__setattr__(self, "day_of_month", self._default_day_of_month) else: object.__setattr__(self, "day_of_month", int(day_of_month)) if not self._min_day_of_month <= self.day_of_month <= 27: - msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}' - raise ValueError(msg.format(min=self._min_day_of_month, - day=self.day_of_month)) + msg = "day_of_month must be {min}<=day_of_month<=27, got {day}" + raise ValueError( + msg.format(min=self._min_day_of_month, day=self.day_of_month) + ) @classmethod def _from_name(cls, suffix=None): @@ -1202,7 +1313,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - suffix = '-{day_of_month}'.format(day_of_month=self.day_of_month) + suffix = "-{day_of_month}".format(day_of_month=self.day_of_month) return self._prefix + suffix @apply_wraps @@ -1219,8 +1330,7 @@ def apply(self, other): # initially positive. 
if type(self) is SemiMonthBegin and (self.n <= 0 and other.day == 1): n -= 1 - elif type(self) is SemiMonthEnd and (self.n > 0 and - other.day == days_in_month): + elif type(self) is SemiMonthEnd and (self.n > 0 and other.day == days_in_month): n += 1 return self._apply(n, other) @@ -1235,7 +1345,7 @@ def _apply(self, n, other): def apply_index(self, i): # determine how many days away from the 1st of the month we are dti = i - days_from_start = i.to_perioddelta('M').asi8 + days_from_start = i.to_perioddelta("M").asi8 delta = Timedelta(days=self.day_of_month - 1).value # get boolean array for each element before the day_of_month @@ -1248,13 +1358,13 @@ def apply_index(self, i): roll = self._get_roll(i, before_day_of_month, after_day_of_month) # isolate the time since it will be striped away one the next line - time = i.to_perioddelta('D') + time = i.to_perioddelta("D") # apply the correct number of months # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly - asper = i.to_period('M') + asper = i.to_period("M") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data @@ -1296,7 +1406,8 @@ class SemiMonthEnd(SemiMonthOffset): normalize : bool, default False day_of_month : int, {1, 3,...,27}, default 15 """ - _prefix = 'SM' + + _prefix = "SM" _min_day_of_month = 1 def onOffset(self, dt): @@ -1339,7 +1450,7 @@ def _apply_index_days(self, i, roll): result : DatetimeIndex """ nanos = (roll % 2) * Timedelta(days=self.day_of_month).value - i += nanos.astype('timedelta64[ns]') + i += nanos.astype("timedelta64[ns]") return i + Timedelta(days=-1) @@ -1356,7 +1467,8 @@ class SemiMonthBegin(SemiMonthOffset): normalize : bool, default False day_of_month : int, {2, 3,...,27}, default 15 """ - _prefix = 'SMS' + + _prefix = "SMS" def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1397,12 +1509,13 @@ def _apply_index_days(self, i, roll): result : DatetimeIndex """ nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value - return i + nanos.astype('timedelta64[ns]') + return i + nanos.astype("timedelta64[ns]") # --------------------------------------------------------------------- # Week-Based Offset Classes + class Week(DateOffset): """ Weekly offset. @@ -1412,10 +1525,11 @@ class Week(DateOffset): weekday : int, default None Always generate specific day of week. 
0 for Monday """ + _adjust_dst = True _inc = timedelta(weeks=1) - _prefix = 'W' - _attributes = frozenset(['n', 'normalize', 'weekday']) + _prefix = "W" + _attributes = frozenset(["n", "normalize", "weekday"]) def __init__(self, n=1, normalize=False, weekday=None): BaseOffset.__init__(self, n, normalize) @@ -1423,11 +1537,12 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday is not None: if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) def isAnchored(self): - return (self.n == 1 and self.weekday is not None) + return self.n == 1 and self.weekday is not None @apply_wraps def apply(self, other): @@ -1448,13 +1563,13 @@ def apply_index(self, i): if self.weekday is None: # integer addition on PeriodIndex is deprecated, # so we use _time_shift directly - asper = i.to_period('W') + asper = i.to_period("W") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data shifted = asper._time_shift(self.n) - return shifted.to_timestamp() + i.to_perioddelta('W') + return shifted.to_timestamp() + i.to_perioddelta("W") else: return self._end_apply_index(i) @@ -1471,7 +1586,7 @@ def _end_apply_index(self, dtindex): ------- result : DatetimeIndex """ - off = dtindex.to_perioddelta('D') + off = dtindex.to_perioddelta("D") base, mult = libfrequencies.get_freq_code(self.freqstr) base_period = dtindex.to_period(base) @@ -1481,20 +1596,21 @@ def _end_apply_index(self, dtindex): if self.n > 0: # when adding, dates on end roll to next - normed = dtindex - off + Timedelta(1, 'D') - Timedelta(1, 'ns') - roll = np.where(base_period.to_timestamp(how='end') == normed, - self.n, self.n - 1) + normed = dtindex - off + Timedelta(1, "D") - Timedelta(1, "ns") + roll = np.where( + base_period.to_timestamp(how="end") == normed, self.n, self.n - 1 + ) # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly shifted = base_period._addsub_int_array(roll, operator.add) - base = shifted.to_timestamp(how='end') + base = shifted.to_timestamp(how="end") else: # integer addition on PeriodIndex is deprecated, # so we use _time_shift directly roll = self.n - base = base_period._time_shift(roll).to_timestamp(how='end') + base = base_period._time_shift(roll).to_timestamp(how="end") - return base + off + Timedelta(1, 'ns') - Timedelta(1, 'D') + return base + off + Timedelta(1, "ns") - Timedelta(1, "D") def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1505,10 +1621,10 @@ def onOffset(self, dt): @property def rule_code(self): - suffix = '' + suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] - suffix = '-{weekday}'.format(weekday=weekday) + suffix = "-{weekday}".format(weekday=weekday) return self._prefix + suffix @classmethod @@ -1524,6 +1640,7 @@ class _WeekOfMonthMixin: """ Mixin for methods common to WeekOfMonth and LastWeekOfMonth. 
""" + @apply_wraps def apply(self, other): compare_day = self._get_offset_day(other) @@ -1534,7 +1651,7 @@ def apply(self, other): elif months <= 0 and compare_day < other.day: months += 1 - shifted = shift_month(other, months, 'start') + shifted = shift_month(other, months, "start") to_day = self._get_offset_day(shifted) return liboffsets.shift_day(shifted, to_day - shifted.day) @@ -1562,9 +1679,10 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): 5: Saturdays 6: Sundays """ - _prefix = 'WOM' + + _prefix = "WOM" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'week', 'weekday']) + _attributes = frozenset(["n", "normalize", "week", "weekday"]) def __init__(self, n=1, normalize=False, week=0, weekday=0): BaseOffset.__init__(self, n, normalize) @@ -1572,11 +1690,13 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): object.__setattr__(self, "week", week) if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) if self.week < 0 or self.week > 3: - raise ValueError('Week must be 0<=week<=3, got {week}' - .format(week=self.week)) + raise ValueError( + "Week must be 0<=week<=3, got {week}".format(week=self.week) + ) def _get_offset_day(self, other): """ @@ -1598,16 +1718,17 @@ def _get_offset_day(self, other): @property def rule_code(self): - weekday = ccalendar.int_to_weekday.get(self.weekday, '') - return '{prefix}-{week}{weekday}'.format(prefix=self._prefix, - week=self.week + 1, - weekday=weekday) + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return "{prefix}-{week}{weekday}".format( + prefix=self._prefix, week=self.week + 1, weekday=weekday + ) @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix {prefix!r} requires a suffix." - .format(prefix=cls._prefix)) + raise ValueError( + "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) + ) # TODO: handle n here... # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) week = int(suffix[0]) - 1 @@ -1632,20 +1753,22 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): 5: Saturdays 6: Sundays """ - _prefix = 'LWOM' + + _prefix = "LWOM" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'weekday']) + _attributes = frozenset(["n", "normalize", "weekday"]) def __init__(self, n=1, normalize=False, weekday=0): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "weekday", weekday) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) def _get_offset_day(self, other): """ @@ -1668,19 +1791,20 @@ def _get_offset_day(self, other): @property def rule_code(self): - weekday = ccalendar.int_to_weekday.get(self.weekday, '') - return '{prefix}-{weekday}'.format(prefix=self._prefix, - weekday=weekday) + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return "{prefix}-{weekday}".format(prefix=self._prefix, weekday=weekday) @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix {prefix!r} requires a suffix." - .format(prefix=cls._prefix)) + raise ValueError( + "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) + ) # TODO: handle n here... 
weekday = ccalendar.weekday_to_int[suffix] return cls(weekday=weekday) + # --------------------------------------------------------------------- # Quarter-Based Offset Classes @@ -1689,10 +1813,11 @@ class QuarterOffset(DateOffset): """ Quarter representation - doesn't call super. """ + _default_startingMonth = None # type: Optional[int] - _from_name_startingMonth = None # type: Optional[int] + _from_name_startingMonth = None # type: Optional[int] _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'startingMonth']) + _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some # point. Also apply_index, onOffset, rule_code if # startingMonth vs month attr names are resolved @@ -1705,22 +1830,22 @@ def __init__(self, n=1, normalize=False, startingMonth=None): object.__setattr__(self, "startingMonth", startingMonth) def isAnchored(self): - return (self.n == 1 and self.startingMonth is not None) + return self.n == 1 and self.startingMonth is not None @classmethod def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['startingMonth'] = ccalendar.MONTH_TO_CAL_NUM[suffix] + kwargs["startingMonth"] = ccalendar.MONTH_TO_CAL_NUM[suffix] else: if cls._from_name_startingMonth is not None: - kwargs['startingMonth'] = cls._from_name_startingMonth + kwargs["startingMonth"] = cls._from_name_startingMonth return cls(**kwargs) @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.startingMonth] - return '{prefix}-{month}'.format(prefix=self._prefix, month=month) + return "{prefix}-{month}".format(prefix=self._prefix, month=month) @apply_wraps def apply(self, other): @@ -1730,8 +1855,9 @@ def apply(self, other): # self. `months_since` is the number of months to shift other.month # to get to this on-offset month. months_since = other.month % 3 - self.startingMonth % 3 - qtrs = liboffsets.roll_qtrday(other, self.n, self.startingMonth, - day_opt=self._day_opt, modby=3) + qtrs = liboffsets.roll_qtrday( + other, self.n, self.startingMonth, day_opt=self._day_opt, modby=3 + ) months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) @@ -1743,12 +1869,14 @@ def onOffset(self, dt): @apply_index_wraps def apply_index(self, dtindex): - shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, - self.startingMonth, self._day_opt) + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.startingMonth, self._day_opt + ) # TODO: going through __new__ raises on call to _validate_frequency; # are we passing incorrect freq? - return type(dtindex)._simple_new(shifted, freq=dtindex.freq, - dtype=dtindex.dtype) + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) class BQuarterEnd(QuarterOffset): @@ -1759,11 +1887,12 @@ class BQuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... """ - _outputName = 'BusinessQuarterEnd' + + _outputName = "BusinessQuarterEnd" _default_startingMonth = 3 _from_name_startingMonth = 12 - _prefix = 'BQ' - _day_opt = 'business_end' + _prefix = "BQ" + _day_opt = "business_end" # TODO: This is basically the same as BQuarterEnd @@ -1772,8 +1901,8 @@ class BQuarterBegin(QuarterOffset): # I suspect this is wrong for *all* of them. 
_default_startingMonth = 3 _from_name_startingMonth = 1 - _prefix = 'BQS' - _day_opt = 'business_start' + _prefix = "BQS" + _day_opt = "business_start" class QuarterEnd(QuarterOffset): @@ -1784,35 +1913,39 @@ class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... """ - _outputName = 'QuarterEnd' + + _outputName = "QuarterEnd" _default_startingMonth = 3 - _prefix = 'Q' - _day_opt = 'end' + _prefix = "Q" + _day_opt = "end" class QuarterBegin(QuarterOffset): - _outputName = 'QuarterBegin' + _outputName = "QuarterBegin" _default_startingMonth = 3 _from_name_startingMonth = 1 - _prefix = 'QS' - _day_opt = 'start' + _prefix = "QS" + _day_opt = "start" # --------------------------------------------------------------------- # Year-Based Offset Classes + class YearOffset(DateOffset): """ DateOffset that just needs a month. """ + _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'month']) + _attributes = frozenset(["n", "normalize", "month"]) def _get_offset_day(self, other): # override BaseOffset method to use self.month instead of other.month # TODO: there may be a more performant way to do this - return liboffsets.get_day_of_month(other.replace(month=self.month), - self._day_opt) + return liboffsets.get_day_of_month( + other.replace(month=self.month), self._day_opt + ) @apply_wraps def apply(self, other): @@ -1822,13 +1955,14 @@ def apply(self, other): @apply_index_wraps def apply_index(self, dtindex): - shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, - self.month, self._day_opt, - modby=12) + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.month, self._day_opt, modby=12 + ) # TODO: going through __new__ raises on call to _validate_frequency; # are we passing incorrect freq? - return type(dtindex)._simple_new(shifted, freq=dtindex.freq, - dtype=dtindex.dtype) + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1842,62 +1976,67 @@ def __init__(self, n=1, normalize=False, month=None): object.__setattr__(self, "month", month) if self.month < 1 or self.month > 12: - raise ValueError('Month must go from 1 to 12') + raise ValueError("Month must go from 1 to 12") @classmethod def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['month'] = ccalendar.MONTH_TO_CAL_NUM[suffix] + kwargs["month"] = ccalendar.MONTH_TO_CAL_NUM[suffix] return cls(**kwargs) @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.month] - return '{prefix}-{month}'.format(prefix=self._prefix, month=month) + return "{prefix}-{month}".format(prefix=self._prefix, month=month) class BYearEnd(YearOffset): """ DateOffset increments between business EOM dates. """ - _outputName = 'BusinessYearEnd' + + _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = 'BA' - _day_opt = 'business_end' + _prefix = "BA" + _day_opt = "business_end" class BYearBegin(YearOffset): """ DateOffset increments between business year begin dates. """ - _outputName = 'BusinessYearBegin' + + _outputName = "BusinessYearBegin" _default_month = 1 - _prefix = 'BAS' - _day_opt = 'business_start' + _prefix = "BAS" + _day_opt = "business_start" class YearEnd(YearOffset): """ DateOffset increments between calendar year ends. 
""" + _default_month = 12 - _prefix = 'A' - _day_opt = 'end' + _prefix = "A" + _day_opt = "end" class YearBegin(YearOffset): """ DateOffset increments between calendar year begin dates. """ + _default_month = 1 - _prefix = 'AS' - _day_opt = 'start' + _prefix = "AS" + _day_opt = "start" # --------------------------------------------------------------------- # Special Offset Classes + class FY5253(DateOffset): """ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. @@ -1934,12 +2073,14 @@ class FY5253(DateOffset): variation : str {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" """ - _prefix = 'RE' + + _prefix = "RE" _adjust_dst = True - _attributes = frozenset(['weekday', 'startingMonth', 'variation']) + _attributes = frozenset(["weekday", "startingMonth", "variation"]) - def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, - variation="nearest"): + def __init__( + self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest" + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "startingMonth", startingMonth) object.__setattr__(self, "weekday", weekday) @@ -1947,16 +2088,17 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, object.__setattr__(self, "variation", variation) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") if self.variation not in ["nearest", "last"]: - raise ValueError('{variation} is not a valid variation' - .format(variation=self.variation)) + raise ValueError( + "{variation} is not a valid variation".format(variation=self.variation) + ) def isAnchored(self): - return (self.n == 1 and - self.startingMonth is not None and - self.weekday is not None) + return ( + self.n == 1 and self.startingMonth is not None and self.weekday is not None + ) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1966,8 +2108,7 @@ def onOffset(self, dt): if self.variation == "nearest": # We have to check the year end of "this" cal year AND the previous - return (year_end == dt or - self.get_year_end(shift_month(dt, -1, None)) == dt) + return year_end == dt or self.get_year_end(shift_month(dt, -1, None)) == dt else: return year_end == dt @@ -1976,12 +2117,9 @@ def apply(self, other): norm = Timestamp(other).normalize() n = self.n - prev_year = self.get_year_end( - datetime(other.year - 1, self.startingMonth, 1)) - cur_year = self.get_year_end( - datetime(other.year, self.startingMonth, 1)) - next_year = self.get_year_end( - datetime(other.year + 1, self.startingMonth, 1)) + prev_year = self.get_year_end(datetime(other.year - 1, self.startingMonth, 1)) + cur_year = self.get_year_end(datetime(other.year, self.startingMonth, 1)) + next_year = self.get_year_end(datetime(other.year + 1, self.startingMonth, 1)) prev_year = conversion.localize_pydatetime(prev_year, other.tzinfo) cur_year = conversion.localize_pydatetime(cur_year, other.tzinfo) @@ -2005,8 +2143,11 @@ def apply(self, other): n += 1 elif prev_year < norm < cur_year: pass - elif (norm.year == prev_year.year and norm < prev_year and - prev_year - norm <= timedelta(6)): + elif ( + norm.year == prev_year.year + and norm < prev_year + and prev_year - norm <= timedelta(6) + ): # GH#14774, error when next_year.year == cur_year.year # e.g. 
prev_year == datetime(2004, 1, 3), # other == datetime(2004, 1, 1) @@ -2016,9 +2157,15 @@ def apply(self, other): shifted = datetime(other.year + n, self.startingMonth, 1) result = self.get_year_end(shifted) - result = datetime(result.year, result.month, result.day, - other.hour, other.minute, other.second, - other.microsecond) + result = datetime( + result.year, + result.month, + result.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) return result def get_year_end(self, dt): @@ -2055,16 +2202,17 @@ def rule_code(self): def _get_suffix_prefix(self): if self.variation == "nearest": - return 'N' + return "N" else: - return 'L' + return "L" def get_rule_code_suffix(self): prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] - return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month, - weekday=weekday) + return "{prefix}-{month}-{weekday}".format( + prefix=prefix, month=month, weekday=weekday + ) @classmethod def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): @@ -2073,15 +2221,18 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): elif varion_code == "L": variation = "last" else: - raise ValueError("Unable to parse varion_code: " - "{code}".format(code=varion_code)) + raise ValueError( + "Unable to parse varion_code: " "{code}".format(code=varion_code) + ) startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] weekday = ccalendar.weekday_to_int[weekday_code] - return {"weekday": weekday, - "startingMonth": startingMonth, - "variation": variation} + return { + "weekday": weekday, + "startingMonth": startingMonth, + "variation": variation, + } @classmethod def _from_name(cls, *args): @@ -2132,13 +2283,21 @@ class FY5253Quarter(DateOffset): {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" """ - _prefix = 'REQ' + _prefix = "REQ" _adjust_dst = True - _attributes = frozenset(['weekday', 'startingMonth', 'qtr_with_extra_week', - 'variation']) - - def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, - qtr_with_extra_week=1, variation="nearest"): + _attributes = frozenset( + ["weekday", "startingMonth", "qtr_with_extra_week", "variation"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekday=0, + startingMonth=1, + qtr_with_extra_week=1, + variation="nearest", + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "startingMonth", startingMonth) @@ -2147,13 +2306,15 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, object.__setattr__(self, "variation", variation) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") @cache_readonly def _offset(self): - return FY5253(startingMonth=self.startingMonth, - weekday=self.weekday, - variation=self.variation) + return FY5253( + startingMonth=self.startingMonth, + weekday=self.weekday, + variation=self.variation, + ) def isAnchored(self): return self.n == 1 and self._offset.isAnchored() @@ -2275,13 +2436,15 @@ def onOffset(self, dt): def rule_code(self): suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week - return "{prefix}-{suffix}-{qtr}".format(prefix=self._prefix, - suffix=suffix, qtr=qtr) + return "{prefix}-{suffix}-{qtr}".format( + prefix=self._prefix, suffix=suffix, qtr=qtr + ) @classmethod def _from_name(cls, *args): - return cls(**dict(FY5253._parse_suffix(*args[:-1]), - qtr_with_extra_week=int(args[-1]))) + return cls( + 
**dict(FY5253._parse_suffix(*args[:-1]), qtr_with_extra_week=int(args[-1])) + ) class Easter(DateOffset): @@ -2290,18 +2453,19 @@ class Easter(DateOffset): Right now uses the revised method which is valid in years 1583-4099. """ + _adjust_dst = True - _attributes = frozenset(['n', 'normalize']) + _attributes = frozenset(["n", "normalize"]) __init__ = BaseOffset.__init__ @apply_wraps def apply(self, other): current_easter = easter(other.year) - current_easter = datetime(current_easter.year, - current_easter.month, current_easter.day) - current_easter = conversion.localize_pydatetime(current_easter, - other.tzinfo) + current_easter = datetime( + current_easter.year, current_easter.month, current_easter.day + ) + current_easter = conversion.localize_pydatetime(current_easter, other.tzinfo) n = self.n if n >= 0 and other < current_easter: @@ -2313,8 +2477,15 @@ def apply(self, other): # NOTE: easter returns a datetime.date so we have to convert to type of # other new = easter(other.year + n) - new = datetime(new.year, new.month, new.day, other.hour, - other.minute, other.second, other.microsecond) + new = datetime( + new.year, + new.month, + new.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) return new def onOffset(self, dt): @@ -2322,6 +2493,7 @@ def onOffset(self, dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) + # --------------------------------------------------------------------- # Ticks @@ -2334,24 +2506,27 @@ def f(self, other): return op(self.delta, other.delta) except AttributeError: # comparing with a non-Tick object - raise TypeError("Invalid comparison between {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "Invalid comparison between {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) - f.__name__ = '__{opname}__'.format(opname=op.__name__) + f.__name__ = "__{opname}__".format(opname=op.__name__) return f class Tick(liboffsets._Tick, SingleConstructorOffset): _inc = Timedelta(microseconds=1000) - _prefix = 'undefined' - _attributes = frozenset(['n', 'normalize']) + _prefix = "undefined" + _attributes = frozenset(["n", "normalize"]) def __init__(self, n=1, normalize=False): BaseOffset.__init__(self, n, normalize) if normalize: - raise ValueError("Tick offset with `normalize=True` are not " - "allowed.") # GH#21427 + raise ValueError( + "Tick offset with `normalize=True` are not " "allowed." 
+ ) # GH#21427 __gt__ = _tick_comp(operator.gt) __ge__ = _tick_comp(operator.ge) @@ -2371,12 +2546,15 @@ def __add__(self, other): except ApplyTypeError: return NotImplemented except OverflowError: - raise OverflowError("the add operation between {self} and {other} " - "will overflow".format(self=self, other=other)) + raise OverflowError( + "the add operation between {self} and {other} " + "will overflow".format(self=self, other=other) + ) def __eq__(self, other): if isinstance(other, str): from pandas.tseries.frequencies import to_offset + try: # GH#23524 if to_offset fails, we are dealing with an # incomparable type so == is False and != is True @@ -2398,6 +2576,7 @@ def __hash__(self): def __ne__(self, other): if isinstance(other, str): from pandas.tseries.frequencies import to_offset + try: # GH#23524 if to_offset fails, we are dealing with an # incomparable type so == is False and != is True @@ -2441,8 +2620,9 @@ def apply(self, other): elif isinstance(other, type(self)): return type(self)(self.n + other.n) - raise ApplyTypeError('Unhandled type: {type_str}' - .format(type_str=type(other).__name__)) + raise ApplyTypeError( + "Unhandled type: {type_str}".format(type_str=type(other).__name__) + ) def isAnchored(self): return False @@ -2473,37 +2653,37 @@ def _delta_to_tick(delta): class Day(Tick): _inc = Timedelta(days=1) - _prefix = 'D' + _prefix = "D" class Hour(Tick): _inc = Timedelta(hours=1) - _prefix = 'H' + _prefix = "H" class Minute(Tick): _inc = Timedelta(minutes=1) - _prefix = 'T' + _prefix = "T" class Second(Tick): _inc = Timedelta(seconds=1) - _prefix = 'S' + _prefix = "S" class Milli(Tick): _inc = Timedelta(milliseconds=1) - _prefix = 'L' + _prefix = "L" class Micro(Tick): _inc = Timedelta(microseconds=1) - _prefix = 'U' + _prefix = "U" class Nano(Tick): _inc = Timedelta(nanoseconds=1) - _prefix = 'N' + _prefix = "N" BDay = BusinessDay @@ -2541,6 +2721,7 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): dates : generator object """ from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) start = to_datetime(start) @@ -2575,8 +2756,9 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: - raise ValueError('Offset {offset} did not increment date' - .format(offset=offset)) + raise ValueError( + "Offset {offset} did not increment date".format(offset=offset) + ) cur = next_date else: while cur >= end: @@ -2590,41 +2772,45 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: - raise ValueError('Offset {offset} did not decrement date' - .format(offset=offset)) + raise ValueError( + "Offset {offset} did not decrement date".format(offset=offset) + ) cur = next_date -prefix_mapping = {offset._prefix: offset for offset in [ - YearBegin, # 'AS' - YearEnd, # 'A' - BYearBegin, # 'BAS' - BYearEnd, # 'BA' - BusinessDay, # 'B' - BusinessMonthBegin, # 'BMS' - BusinessMonthEnd, # 'BM' - BQuarterEnd, # 'BQ' - BQuarterBegin, # 'BQS' - BusinessHour, # 'BH' - CustomBusinessDay, # 'C' - CustomBusinessMonthEnd, # 'CBM' - CustomBusinessMonthBegin, # 'CBMS' - CustomBusinessHour, # 'CBH' - MonthEnd, # 'M' - MonthBegin, # 'MS' - Nano, # 'N' - SemiMonthEnd, # 'SM' - SemiMonthBegin, # 'SMS' - Week, # 'W' - Second, # 'S' - Minute, # 'T' - Micro, # 'U' - QuarterEnd, # 'Q' - QuarterBegin, # 'QS' - Milli, # 'L' - Hour, # 'H' - Day, # 'D' - WeekOfMonth, # 'WOM' - FY5253, 
- FY5253Quarter -]} +prefix_mapping = { + offset._prefix: offset + for offset in [ + YearBegin, # 'AS' + YearEnd, # 'A' + BYearBegin, # 'BAS' + BYearEnd, # 'BA' + BusinessDay, # 'B' + BusinessMonthBegin, # 'BMS' + BusinessMonthEnd, # 'BM' + BQuarterEnd, # 'BQ' + BQuarterBegin, # 'BQS' + BusinessHour, # 'BH' + CustomBusinessDay, # 'C' + CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthBegin, # 'CBMS' + CustomBusinessHour, # 'CBH' + MonthEnd, # 'M' + MonthBegin, # 'MS' + Nano, # 'N' + SemiMonthEnd, # 'SM' + SemiMonthBegin, # 'SMS' + Week, # 'W' + Second, # 'S' + Minute, # 'T' + Micro, # 'U' + QuarterEnd, # 'Q' + QuarterBegin, # 'QS' + Milli, # 'L' + Hour, # 'H' + Day, # 'D' + WeekOfMonth, # 'WOM' + FY5253, + FY5253Quarter, + ] +} diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index 9600109f015345..d906c0371d2071 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,4 +1,3 @@ -from pandas.util._decorators import ( # noqa - Appender, Substitution, cache_readonly) +from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index cdda02324ba066..f39020f4165dfa 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,8 +6,9 @@ from pandas._libs.properties import cache_readonly # noqa -def deprecate(name, alternative, version, alt_name=None, - klass=None, stacklevel=2, msg=None): +def deprecate( + name, alternative, version, alt_name=None, klass=None, stacklevel=2, msg=None +): """ Return a new function that emits a deprecation warning on use. @@ -36,8 +37,7 @@ def deprecate(name, alternative, version, alt_name=None, alt_name = alt_name or alternative.__name__ klass = klass or FutureWarning - warning_msg = msg or '{} is deprecated, use {} instead'.format(name, - alt_name) + warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) def wrapper(*args, **kwargs): @@ -45,31 +45,37 @@ def wrapper(*args, **kwargs): return alternative(*args, **kwargs) # adding deprecated directive to the docstring - msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name) - doc_error_msg = ('deprecate needs a correctly formatted docstring in ' - 'the target function (should have a one liner short ' - 'summary, and opening quotes should be in their own ' - 'line). Found:\n{}'.format(alternative.__doc__)) + msg = msg or "Use `{alt_name}` instead.".format(alt_name=alt_name) + doc_error_msg = ( + "deprecate needs a correctly formatted docstring in " + "the target function (should have a one liner short " + "summary, and opening quotes should be in their own " + "line). Found:\n{}".format(alternative.__doc__) + ) # when python is running in optimized mode (i.e. `-OO`), docstrings are # removed, so we check that a docstring with correct formatting is used # but we allow empty docstrings if alternative.__doc__: - if alternative.__doc__.count('\n') < 3: + if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) - empty1, summary, empty2, doc = alternative.__doc__.split('\n', 3) + empty1, summary, empty2, doc = alternative.__doc__.split("\n", 3) if empty1 or empty2 and not summary: raise AssertionError(doc_error_msg) - wrapper.__doc__ = dedent(""" + wrapper.__doc__ = dedent( + """ {summary} .. 
deprecated:: {depr_version} {depr_msg} - {rest_of_docstring}""").format(summary=summary.strip(), - depr_version=version, - depr_msg=msg, - rest_of_docstring=dedent(doc)) + {rest_of_docstring}""" + ).format( + summary=summary.strip(), + depr_version=version, + depr_msg=msg, + rest_of_docstring=dedent(doc), + ) return wrapper @@ -137,10 +143,10 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): should raise warning """ - if mapping is not None and not hasattr(mapping, 'get') and \ - not callable(mapping): - raise TypeError("mapping from old to new argument values " - "must be dict or callable!") + if mapping is not None and not hasattr(mapping, "get") and not callable(mapping): + raise TypeError( + "mapping from old to new argument values " "must be dict or callable!" + ) def _deprecate_kwarg(func): @wraps(func) @@ -159,34 +165,38 @@ def wrapper(*args, **kwargs): if old_arg_value is not None: if mapping is not None: - if hasattr(mapping, 'get'): - new_arg_value = mapping.get(old_arg_value, - old_arg_value) + if hasattr(mapping, "get"): + new_arg_value = mapping.get(old_arg_value, old_arg_value) else: new_arg_value = mapping(old_arg_value) - msg = ("the {old_name}={old_val!r} keyword is deprecated, " - "use {new_name}={new_val!r} instead" - ).format(old_name=old_arg_name, - old_val=old_arg_value, - new_name=new_arg_name, - new_val=new_arg_value) + msg = ( + "the {old_name}={old_val!r} keyword is deprecated, " + "use {new_name}={new_val!r} instead" + ).format( + old_name=old_arg_name, + old_val=old_arg_value, + new_name=new_arg_name, + new_val=new_arg_value, + ) else: new_arg_value = old_arg_value - msg = ("the '{old_name}' keyword is deprecated, " - "use '{new_name}' instead" - ).format(old_name=old_arg_name, - new_name=new_arg_name) + msg = ( + "the '{old_name}' keyword is deprecated, " + "use '{new_name}' instead" + ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) if kwargs.get(new_arg_name, None) is not None: - msg = ("Can only specify '{old_name}' or '{new_name}', " - "not both").format(old_name=old_arg_name, - new_name=new_arg_name) + msg = ( + "Can only specify '{old_name}' or '{new_name}', " "not both" + ).format(old_name=old_arg_name, new_name=new_arg_name) raise TypeError(msg) else: kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) + return wrapper + return _deprecate_kwarg @@ -198,11 +208,11 @@ def wrapper(*args, **kwargs): kind = inspect.Parameter.POSITIONAL_OR_KEYWORD params = [ - inspect.Parameter('self', kind), + inspect.Parameter("self", kind), inspect.Parameter(name, kind, default=None), - inspect.Parameter('index', kind, default=None), - inspect.Parameter('columns', kind, default=None), - inspect.Parameter('axis', kind, default=None), + inspect.Parameter("index", kind, default=None), + inspect.Parameter("columns", kind, default=None), + inspect.Parameter("axis", kind, default=None), ] for pname, default in extra_params: @@ -212,8 +222,10 @@ def wrapper(*args, **kwargs): func.__signature__ = sig return wrapper + return decorate + # Substitution and Appender are derived from matplotlib.docstring (1.1.0) # module http://matplotlib.org/users/license.html @@ -248,7 +260,7 @@ def some_function(x): """ def __init__(self, *args, **kwargs): - if (args and kwargs): + if args and kwargs: raise AssertionError("Only positional or keyword args are allowed") self.params = args or kwargs @@ -299,7 +311,7 @@ def my_dog(has='fleas'): pass """ - def __init__(self, addendum, join='', 
indents=0): + def __init__(self, addendum, join="", indents=0): if indents > 0: self.addendum = indent(addendum, indents=indents) else: @@ -307,8 +319,8 @@ def __init__(self, addendum, join='', indents=0): self.join = join def __call__(self, func): - func.__doc__ = func.__doc__ if func.__doc__ else '' - self.addendum = self.addendum if self.addendum else '' + func.__doc__ = func.__doc__ if func.__doc__ else "" + self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] func.__doc__ = dedent(self.join.join(docitems)) return func @@ -316,6 +328,6 @@ def __call__(self, func): def indent(text, indents=1): if not text or not isinstance(text, str): - return '' - jointext = ''.join(['\n'] + [' '] * indents) - return jointext.join(text.split('\n')) + return "" + jointext = "".join(["\n"] + [" "] * indents) + return jointext.join(text.split("\n")) diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 714ea1ce8086fe..54f090ede3fc41 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -23,8 +23,7 @@ class _DeprecatedModule: objects """ - def __init__(self, deprmod, deprmodto=None, removals=None, - moved=None): + def __init__(self, deprmod, deprmodto=None, removals=None, moved=None): self.deprmod = deprmod self.deprmodto = deprmodto self.removals = removals @@ -64,23 +63,27 @@ def __getattr__(self, name): warnings.warn( "{deprmod}.{name} is deprecated and will be removed in " "a future version.".format(deprmod=self.deprmod, name=name), - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) elif self.moved is not None and name in self.moved: warnings.warn( "{deprmod} is deprecated and will be removed in " "a future version.\nYou can access {name} as {moved}".format( - deprmod=self.deprmod, - name=name, - moved=self.moved[name]), - FutureWarning, stacklevel=2) + deprmod=self.deprmod, name=name, moved=self.moved[name] + ), + FutureWarning, + stacklevel=2, + ) else: deprmodto = self.deprmodto if deprmodto is False: warnings.warn( "{deprmod}.{name} is deprecated and will be removed in " - "a future version.".format( - deprmod=self.deprmod, name=name), - FutureWarning, stacklevel=2) + "a future version.".format(deprmod=self.deprmod, name=name), + FutureWarning, + stacklevel=2, + ) else: if deprmodto is None: deprmodto = obj.__module__ @@ -88,8 +91,11 @@ def __getattr__(self, name): warnings.warn( "{deprmod}.{name} is deprecated. 
Please use " "{deprmodto}.{name} instead.".format( - deprmod=self.deprmod, name=name, deprmodto=deprmodto), - FutureWarning, stacklevel=2) + deprmod=self.deprmod, name=name, deprmodto=deprmodto + ), + FutureWarning, + stacklevel=2, + ) return obj @@ -98,6 +104,6 @@ def _import_deprmod(self, mod=None): mod = self.deprmod with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=FutureWarning) + warnings.filterwarnings("ignore", category=FutureWarning) deprmodule = importlib.import_module(mod) return deprmodule diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 3a64f86a6668d2..11156bc9728576 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -29,15 +29,11 @@ def _get_cells(self, left, right, vertical): if vertical: # calculate required number of cells - vcells = max(sum(self._shape(l)[0] for l in left), - self._shape(right)[0]) - hcells = (max(self._shape(l)[1] for l in left) + - self._shape(right)[1]) + vcells = max(sum(self._shape(l)[0] for l in left), self._shape(right)[0]) + hcells = max(self._shape(l)[1] for l in left) + self._shape(right)[1] else: - vcells = max([self._shape(l)[0] for l in left] + - [self._shape(right)[0]]) - hcells = sum([self._shape(l)[1] for l in left] + - [self._shape(right)[1]]) + vcells = max([self._shape(l)[0] for l in left] + [self._shape(right)[0]]) + hcells = sum([self._shape(l)[1] for l in left] + [self._shape(right)[1]]) return hcells, vcells def plot(self, left, right, labels=None, vertical=True): @@ -76,11 +72,10 @@ def plot(self, left, right, labels=None, vertical=True): max_left_rows = max(self._shape(l)[0] for l in left) for i, (l, label) in enumerate(zip(left, labels)): ax = fig.add_subplot(gs[i, 0:max_left_cols]) - self._make_table(ax, l, title=label, - height=1.0 / max_left_rows) + self._make_table(ax, l, title=label, height=1.0 / max_left_rows) # right ax = plt.subplot(gs[:, max_left_cols:]) - self._make_table(ax, right, title='Result', height=1.05 / vcells) + self._make_table(ax, right, title="Result", height=1.05 / vcells) fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95) else: max_rows = max(self._shape(df)[0] for df in left + [right]) @@ -90,12 +85,12 @@ def plot(self, left, right, labels=None, vertical=True): i = 0 for l, label in zip(left, labels): sp = self._shape(l) - ax = fig.add_subplot(gs[0, i:i + sp[1]]) + ax = fig.add_subplot(gs[0, i : i + sp[1]]) self._make_table(ax, l, title=label, height=height) i += sp[1] # right ax = plt.subplot(gs[0, i:]) - self._make_table(ax, right, title='Result', height=height) + self._make_table(ax, right, title="Result", height=height) fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95) return fig @@ -104,10 +99,10 @@ def _conv(self, data): """Convert each input to appropriate for table outplot""" if isinstance(data, pd.Series): if data.name is None: - data = data.to_frame(name='') + data = data.to_frame(name="") else: data = data.to_frame() - data = data.fillna('NaN') + data = data.fillna("NaN") return data def _insert_index(self, data): @@ -115,17 +110,17 @@ def _insert_index(self, data): data = data.copy() idx_nlevels = data.index.nlevels if idx_nlevels == 1: - data.insert(0, 'Index', data.index) + data.insert(0, "Index", data.index) else: for i in range(idx_nlevels): - data.insert(i, 'Index{0}'.format(i), - data.index._get_level_values(i)) + data.insert(i, "Index{0}".format(i), data.index._get_level_values(i)) col_nlevels = data.columns.nlevels if col_nlevels > 1: col = data.columns._get_level_values(0) - values = 
[data.columns._get_level_values(i).values - for i in range(1, col_nlevels)] + values = [ + data.columns._get_level_values(i).values for i in range(1, col_nlevels) + ] col_df = pd.DataFrame(values) data.columns = col_df.columns data = pd.concat([col_df, data]) @@ -150,17 +145,17 @@ def _make_table(self, ax, df, title, height=None): height = 1.0 / (len(df) + 1) props = tb.properties() - for (r, c), cell in props['celld'].items(): + for (r, c), cell in props["celld"].items(): if c == -1: cell.set_visible(False) elif r < col_nlevels and c < idx_nlevels: cell.set_visible(False) elif r < col_nlevels or c < idx_nlevels: - cell.set_facecolor('#AAAAAA') + cell.set_facecolor("#AAAAAA") cell.set_height(height) ax.set_title(title, size=self.font_size) - ax.axis('off') + ax.axis("off") if __name__ == "__main__": @@ -168,29 +163,24 @@ def _make_table(self, ax, df, title, height=None): p = TablePlotter() - df1 = pd.DataFrame({'A': [10, 11, 12], - 'B': [20, 21, 22], - 'C': [30, 31, 32]}) - df2 = pd.DataFrame({'A': [10, 12], - 'C': [30, 32]}) + df1 = pd.DataFrame({"A": [10, 11, 12], "B": [20, 21, 22], "C": [30, 31, 32]}) + df2 = pd.DataFrame({"A": [10, 12], "C": [30, 32]}) - p.plot([df1, df2], pd.concat([df1, df2]), - labels=['df1', 'df2'], vertical=True) + p.plot([df1, df2], pd.concat([df1, df2]), labels=["df1", "df2"], vertical=True) plt.show() - df3 = pd.DataFrame({'X': [10, 12], - 'Z': [30, 32]}) + df3 = pd.DataFrame({"X": [10, 12], "Z": [30, 32]}) - p.plot([df1, df3], pd.concat([df1, df3], axis=1), - labels=['df1', 'df2'], vertical=False) + p.plot( + [df1, df3], pd.concat([df1, df3], axis=1), labels=["df1", "df2"], vertical=False + ) plt.show() - idx = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B'), (1, 'C'), - (2, 'A'), (2, 'B'), (2, 'C')]) - col = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B')]) - df3 = pd.DataFrame({'v1': [1, 2, 3, 4, 5, 6], - 'v2': [5, 6, 7, 8, 9, 10]}, - index=idx) + idx = pd.MultiIndex.from_tuples( + [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")] + ) + col = pd.MultiIndex.from_tuples([(1, "A"), (1, "B")]) + df3 = pd.DataFrame({"v1": [1, 2, 3, 4, 5, 6], "v2": [5, 6, 7, 8, 9, 10]}, index=idx) df3.columns = col - p.plot(df3, df3, labels=['df3']) + p.plot(df3, df3, labels=["df3"]) plt.show() diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 5e2e013c4afcc3..21d09c06940ca6 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -6,8 +6,7 @@ import subprocess import sys -from pandas.compat._optional import ( - VERSIONS, _get_version, import_optional_dependency) +from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency def get_sys_info(): @@ -19,9 +18,11 @@ def get_sys_info(): commit = None if os.path.isdir(".git") and os.path.isdir("pandas"): try: - pipe = subprocess.Popen('git log --format="%H" -n 1'.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + pipe = subprocess.Popen( + 'git log --format="%H" -n 1'.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) so, serr = pipe.communicate() except (OSError, ValueError): pass @@ -29,29 +30,30 @@ def get_sys_info(): if pipe.returncode == 0: commit = so try: - commit = so.decode('utf-8') + commit = so.decode("utf-8") except ValueError: pass commit = commit.strip().strip('"') - blob.append(('commit', commit)) + blob.append(("commit", commit)) try: - (sysname, nodename, release, - version, machine, processor) = platform.uname() - blob.extend([ - ("python", '.'.join(map(str, sys.version_info))), - ("python-bits", 
struct.calcsize("P") * 8), - ("OS", "{sysname}".format(sysname=sysname)), - ("OS-release", "{release}".format(release=release)), - # ("Version", "{version}".format(version=version)), - ("machine", "{machine}".format(machine=machine)), - ("processor", "{processor}".format(processor=processor)), - ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), - ("LC_ALL", "{lc}".format(lc=os.environ.get('LC_ALL', "None"))), - ("LANG", "{lang}".format(lang=os.environ.get('LANG', "None"))), - ("LOCALE", '.'.join(map(str, locale.getlocale()))), - ]) + (sysname, nodename, release, version, machine, processor) = platform.uname() + blob.extend( + [ + ("python", ".".join(map(str, sys.version_info))), + ("python-bits", struct.calcsize("P") * 8), + ("OS", "{sysname}".format(sysname=sysname)), + ("OS-release", "{release}".format(release=release)), + # ("Version", "{version}".format(version=version)), + ("machine", "{machine}".format(machine=machine)), + ("processor", "{processor}".format(processor=processor)), + ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), + ("LC_ALL", "{lc}".format(lc=os.environ.get("LC_ALL", "None"))), + ("LANG", "{lang}".format(lang=os.environ.get("LANG", "None"))), + ("LOCALE", ".".join(map(str, locale.getlocale()))), + ] + ) except (KeyError, ValueError): pass @@ -61,18 +63,18 @@ def get_sys_info(): def show_versions(as_json=False): sys_info = get_sys_info() deps = [ - 'pandas', + "pandas", # required - 'numpy', - 'pytz', - 'dateutil', + "numpy", + "pytz", + "dateutil", # install / build, - 'pip', - 'setuptools', - 'Cython', + "pip", + "setuptools", + "Cython", # test - 'pytest', - 'hypothesis', + "pytest", + "hypothesis", # docs "sphinx", # Other, need a min version @@ -93,9 +95,9 @@ def show_versions(as_json=False): deps_blob = [] for modname in deps: - mod = import_optional_dependency(modname, - raise_on_missing=False, - on_version="ignore") + mod = import_optional_dependency( + modname, raise_on_missing=False, on_version="ignore" + ) if mod: ver = _get_version(mod) else: @@ -113,12 +115,12 @@ def show_versions(as_json=False): if as_json is True: print(j) else: - with codecs.open(as_json, "wb", encoding='utf8') as f: + with codecs.open(as_json, "wb", encoding="utf8") as f: json.dump(j, f, indent=2) else: maxlen = max(len(x) for x in deps) - tpl = '{{k:<{maxlen}}}: {{stat}}'.format(maxlen=maxlen) + tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen) print("\nINSTALLED VERSIONS") print("------------------") for k, stat in sys_info: @@ -130,10 +132,15 @@ def show_versions(as_json=False): def main(): from optparse import OptionParser + parser = OptionParser() - parser.add_option("-j", "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, pass in " - "'-' to output to stdout") + parser.add_option( + "-j", + "--json", + metavar="FILE", + nargs=1, + help="Save output as JSON into file, pass in " "'-' to output to stdout", + ) (options, args) = parser.parse_args() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index ab22539f4530f5..3de4e5d66d5774 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -33,8 +33,7 @@ def test_foo(): from pandas.compat import is_platform_32bit, is_platform_windows from pandas.compat.numpy import _np_version -from pandas.core.computation.expressions import ( - _NUMEXPR_INSTALLED, _USE_NUMEXPR) +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR def safe_import(mod_name, min_version=None): @@ -60,13 +59,15 @@ def 
safe_import(mod_name, min_version=None): return mod else: import sys + try: - version = getattr(sys.modules[mod_name], '__version__') + version = getattr(sys.modules[mod_name], "__version__") except AttributeError: # xlrd uses a capitalized attribute name - version = getattr(sys.modules[mod_name], '__VERSION__') + version = getattr(sys.modules[mod_name], "__VERSION__") if version: from distutils.version import LooseVersion + if LooseVersion(version) >= LooseVersion(min_version): return mod @@ -89,20 +90,20 @@ def _skip_if_has_locale(): def _skip_if_not_us_locale(): lang, _ = locale.getlocale() - if lang != 'en_US': + if lang != "en_US": return True def _skip_if_no_scipy(): - return not (safe_import('scipy.stats') and - safe_import('scipy.sparse') and - safe_import('scipy.interpolate') and - safe_import('scipy.signal')) + return not ( + safe_import("scipy.stats") + and safe_import("scipy.sparse") + and safe_import("scipy.interpolate") + and safe_import("scipy.signal") + ) -def skip_if_installed( - package: str, -) -> MarkDecorator: +def skip_if_installed(package: str,) -> MarkDecorator: """ Skip a test if a package is installed. @@ -112,15 +113,11 @@ def skip_if_installed( The name of the package. """ return pytest.mark.skipif( - safe_import(package), - reason="Skipping because {} is installed.".format(package) + safe_import(package), reason="Skipping because {} is installed.".format(package) ) -def skip_if_no( - package: str, - min_version: Optional[str] = None -) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: """ Generic function to help skip tests when required packages are not present on the testing system. @@ -158,37 +155,39 @@ def skip_if_no( ) -skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), - reason="Missing matplotlib dependency") -skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), - reason="matplotlib is present") -skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), - reason="skipping for 32 bit") -skip_if_windows = pytest.mark.skipif(is_platform_windows(), - reason="Running on Windows") -skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows(), - reason="not used on win32") -skip_if_has_locale = pytest.mark.skipif(_skip_if_has_locale(), - reason="Specific locale is set {lang}" - .format(lang=locale.getlocale()[0])) -skip_if_not_us_locale = pytest.mark.skipif(_skip_if_not_us_locale(), - reason="Specific locale is set " - "{lang}".format( - lang=locale.getlocale()[0])) -skip_if_no_scipy = pytest.mark.skipif(_skip_if_no_scipy(), - reason="Missing SciPy requirement") -skip_if_no_ne = pytest.mark.skipif(not _USE_NUMEXPR, - reason="numexpr enabled->{enabled}, " - "installed->{installed}".format( - enabled=_USE_NUMEXPR, - installed=_NUMEXPR_INSTALLED)) +skip_if_no_mpl = pytest.mark.skipif( + _skip_if_no_mpl(), reason="Missing matplotlib dependency" +) +skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") +skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit") +skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") +skip_if_windows_python_3 = pytest.mark.skipif( + is_platform_windows(), reason="not used on win32" +) +skip_if_has_locale = pytest.mark.skipif( + _skip_if_has_locale(), + reason="Specific locale is set {lang}".format(lang=locale.getlocale()[0]), +) +skip_if_not_us_locale = pytest.mark.skipif( + _skip_if_not_us_locale(), + reason="Specific locale is set " "{lang}".format(lang=locale.getlocale()[0]), 
+) +skip_if_no_scipy = pytest.mark.skipif( + _skip_if_no_scipy(), reason="Missing SciPy requirement" +) +skip_if_no_ne = pytest.mark.skipif( + not _USE_NUMEXPR, + reason="numexpr enabled->{enabled}, " + "installed->{installed}".format(enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED), +) def skip_if_np_lt(ver_str, reason=None, *args, **kwds): if reason is None: reason = "NumPy %s or greater required" % ver_str - return pytest.mark.skipif(_np_version < LooseVersion(ver_str), - reason=reason, *args, **kwds) + return pytest.mark.skipif( + _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds + ) def parametrize_fixture_doc(*args): @@ -210,7 +209,9 @@ def parametrize_fixture_doc(*args): The decorated function wrapped within a pytest ``parametrize_fixture_doc`` mark """ + def documented_fixture(fixture): fixture.__doc__ = fixture.__doc__.format(*args) return fixture + return documented_fixture diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 19b1cc700261c9..0f5324c8d02baf 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -16,14 +16,14 @@ def test(extra_args=None): import hypothesis # noqa except ImportError: raise ImportError("Need hypothesis>=3.58 to run tests") - cmd = ['--skip-slow', '--skip-network', '--skip-db'] + cmd = ["--skip-slow", "--skip-network", "--skip-db"] if extra_args: if not isinstance(extra_args, list): extra_args = [extra_args] cmd = extra_args cmd += [PKG] - print("running: pytest {}".format(' '.join(cmd))) + print("running: pytest {}".format(" ".join(cmd))) sys.exit(pytest.main(cmd)) -__all__ = ['test'] +__all__ = ["test"] diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 41faaf68d7f403..8d5f9f77496829 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -20,13 +20,17 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): if len(args) > len(compat_args): max_arg_count = len(compat_args) + max_fname_arg_count actual_arg_count = len(args) + max_fname_arg_count - argument = 'argument' if max_arg_count == 1 else 'arguments' + argument = "argument" if max_arg_count == 1 else "arguments" raise TypeError( "{fname}() takes at most {max_arg} {argument} " "({given_arg} given)".format( - fname=fname, max_arg=max_arg_count, - argument=argument, given_arg=actual_arg_count)) + fname=fname, + max_arg=max_arg_count, + argument=argument, + given_arg=actual_arg_count, + ) + ) def _check_for_default_values(fname, arg_val_dict, compat_args): @@ -48,11 +52,10 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): # check for None-ness otherwise we could end up # comparing a numpy array vs None - if (v1 is not None and v2 is None) or \ - (v1 is None and v2 is not None): + if (v1 is not None and v2 is None) or (v1 is None and v2 is not None): match = False else: - match = (v1 == v2) + match = v1 == v2 if not is_bool(match): raise ValueError("'match' is not a boolean") @@ -60,13 +63,16 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): # could not compare them directly, so try comparison # using the 'is' operator except ValueError: - match = (arg_val_dict[key] is compat_args[key]) + match = arg_val_dict[key] is compat_args[key] if not match: - raise ValueError(("the '{arg}' parameter is not " - "supported in the pandas " - "implementation of {fname}()". 
- format(fname=fname, arg=key))) + raise ValueError( + ( + "the '{arg}' parameter is not " + "supported in the pandas " + "implementation of {fname}()".format(fname=fname, arg=key) + ) + ) def validate_args(fname, args, max_fname_arg_count, compat_args): @@ -122,9 +128,12 @@ def _check_for_invalid_keys(fname, kwargs, compat_args): if diff: bad_arg = list(diff)[0] - raise TypeError(("{fname}() got an unexpected " - "keyword argument '{arg}'". - format(fname=fname, arg=bad_arg))) + raise TypeError( + ( + "{fname}() got an unexpected " + "keyword argument '{arg}'".format(fname=fname, arg=bad_arg) + ) + ) def validate_kwargs(fname, kwargs, compat_args): @@ -157,9 +166,7 @@ def validate_kwargs(fname, kwargs, compat_args): _check_for_default_values(fname, kwds, compat_args) -def validate_args_and_kwargs(fname, args, kwargs, - max_fname_arg_count, - compat_args): +def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_args): """ Checks whether parameters passed to the *args and **kwargs argument in a function `fname` are valid parameters as specified in `*compat_args` @@ -202,8 +209,9 @@ def validate_args_and_kwargs(fname, args, kwargs, """ # Check that the total number of arguments passed in (i.e. # args and kwargs) does not exceed the length of compat_args - _check_arg_length(fname, args + tuple(kwargs.values()), - max_fname_arg_count, compat_args) + _check_arg_length( + fname, args + tuple(kwargs.values()), max_fname_arg_count, compat_args + ) # Check there is no overlap with the positional and keyword # arguments, similar to what is done in actual Python functions @@ -211,8 +219,10 @@ def validate_args_and_kwargs(fname, args, kwargs, for key in args_dict: if key in kwargs: - raise TypeError("{fname}() got multiple values for keyword " - "argument '{arg}'".format(fname=fname, arg=key)) + raise TypeError( + "{fname}() got multiple values for keyword " + "argument '{arg}'".format(fname=fname, arg=key) + ) kwargs.update(args_dict) validate_kwargs(fname, kwargs, compat_args) @@ -221,9 +231,10 @@ def validate_args_and_kwargs(fname, args, kwargs, def validate_bool_kwarg(value, arg_name): """ Ensures that argument passed in arg_name is of type bool. """ if not (is_bool(value) or value is None): - raise ValueError('For argument "{arg}" expected type bool, received ' - 'type {typ}.'.format(arg=arg_name, - typ=type(value).__name__)) + raise ValueError( + 'For argument "{arg}" expected type bool, received ' + "type {typ}.".format(arg=arg_name, typ=type(value).__name__) + ) return value @@ -268,18 +279,19 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): # like out = {'index': foo, 'columns': bar} # Start by validating for consistency - if 'axis' in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS): + if "axis" in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS): msg = "Cannot specify both 'axis' and any of 'index' or 'columns'." raise TypeError(msg) # First fill with explicit values provided by the user... 
if arg_name in kwargs: if args: - msg = ("{} got multiple values for argument " - "'{}'".format(method_name, arg_name)) + msg = "{} got multiple values for argument " "'{}'".format( + method_name, arg_name + ) raise TypeError(msg) - axis = data._get_axis_name(kwargs.get('axis', 0)) + axis = data._get_axis_name(kwargs.get("axis", 0)) out[axis] = kwargs[arg_name] # More user-provided arguments, now from kwargs @@ -298,22 +310,22 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): if len(args) == 0: pass # It's up to the function to decide if this is valid elif len(args) == 1: - axis = data._get_axis_name(kwargs.get('axis', 0)) + axis = data._get_axis_name(kwargs.get("axis", 0)) out[axis] = args[0] elif len(args) == 2: - if 'axis' in kwargs: + if "axis" in kwargs: # Unambiguously wrong - msg = ("Cannot specify both 'axis' and any of 'index' " - "or 'columns'") + msg = "Cannot specify both 'axis' and any of 'index' " "or 'columns'" raise TypeError(msg) - msg = ("Interpreting call\n\t'.{method_name}(a, b)' as " - "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " - "arguments to remove any ambiguity. In the future, using " - "positional arguments for 'index' or 'columns' will raise " - " a 'TypeError'.") - warnings.warn(msg.format(method_name=method_name,), FutureWarning, - stacklevel=4) + msg = ( + "Interpreting call\n\t'.{method_name}(a, b)' as " + "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " + "arguments to remove any ambiguity. In the future, using " + "positional arguments for 'index' or 'columns' will raise " + " a 'TypeError'." + ) + warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) out[data._AXIS_NAMES[0]] = args[0] out[data._AXIS_NAMES[1]] = args[1] else: @@ -349,8 +361,10 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): elif value is not None and method is None: if validate_scalar_dict_value and isinstance(value, (list, tuple)): - raise TypeError('"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__)) + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__) + ) elif value is not None and method is not None: raise ValueError("Cannot specify both 'value' and 'method'.") diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cec9416e5d2c5c..037c885e4733f1 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -20,26 +20,54 @@ from numpy.random import rand, randn from pandas._config.localization import ( # noqa:F401 - can_set_locale, get_locales, set_locale) + can_set_locale, + get_locales, + set_locale, +) import pandas._libs.testing as _testing from pandas.compat import raise_with_traceback from pandas.core.dtypes.common import ( - is_bool, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetimelike_v_numeric, is_datetimelike_v_object, - is_extension_array_dtype, is_interval_dtype, is_list_like, is_number, - is_period_dtype, is_sequence, is_timedelta64_dtype, needs_i8_conversion) + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike_v_numeric, + is_datetimelike_v_object, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import array_equivalent import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, 
DatetimeIndex, Index, - IntervalIndex, MultiIndex, RangeIndex, Series, bdate_range) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) from pandas.core.algorithms import take_1d from pandas.core.arrays import ( - DatetimeArray, ExtensionArray, IntervalArray, PeriodArray, TimedeltaArray, - period_array) + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + period_array, +) from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing @@ -54,16 +82,16 @@ def set_testing_mode(): # set the testing mode filters - testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') - if 'deprecate' in testing_mode: - warnings.simplefilter('always', _testing_mode_warnings) + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) def reset_testing_mode(): # reset the testing mode filters - testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') - if 'deprecate' in testing_mode: - warnings.simplefilter('ignore', _testing_mode_warnings) + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) set_testing_mode() @@ -74,7 +102,7 @@ def reset_display_options(): Reset the display options for printing and representing objects. """ - pd.reset_option('^display.', silent=True) + pd.reset_option("^display.", silent=True) def round_trip_pickle(obj, path=None): @@ -95,7 +123,7 @@ def round_trip_pickle(obj, path=None): """ if path is None: - path = '__{random_bytes}__.pickle'.format(random_bytes=rands(10)) + path = "__{random_bytes}__.pickle".format(random_bytes=rands(10)) with ensure_clean(path) as path: pd.to_pickle(obj, path) return pd.read_pickle(path) @@ -121,9 +149,10 @@ def round_trip_pathlib(writer, reader, path=None): """ import pytest - Path = pytest.importorskip('pathlib').Path + + Path = pytest.importorskip("pathlib").Path if path is None: - path = '___pathlib___' + path = "___pathlib___" with ensure_clean(path) as path: writer(Path(path)) obj = reader(Path(path)) @@ -149,9 +178,10 @@ def round_trip_localpath(writer, reader, path=None): The original object that was serialized and then re-read. """ import pytest - LocalPath = pytest.importorskip('py.path').local + + LocalPath = pytest.importorskip("py.path").local if path is None: - path = '___localpath___' + path = "___localpath___" with ensure_clean(path) as path: writer(LocalPath(path)) obj = reader(LocalPath(path)) @@ -177,23 +207,22 @@ def decompress_file(path, compression): """ if compression is None: - f = open(path, 'rb') - elif compression == 'gzip': - f = gzip.open(path, 'rb') - elif compression == 'bz2': - f = bz2.BZ2File(path, 'rb') - elif compression == 'xz': - f = lzma.LZMAFile(path, 'rb') - elif compression == 'zip': + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = lzma.LZMAFile(path, "rb") + elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() if len(zip_names) == 1: f = zip_file.open(zip_names.pop()) else: - raise ValueError('ZIP file {} error. Only one file per ZIP.' - .format(path)) + raise ValueError("ZIP file {} error. 
Only one file per ZIP.".format(path)) else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) try: @@ -226,15 +255,19 @@ def write_to_compressed(compression, path, data, dest="test"): if compression == "zip": import zipfile + compress_method = zipfile.ZipFile elif compression == "gzip": import gzip + compress_method = gzip.GzipFile elif compression == "bz2": import bz2 + compress_method = bz2.BZ2File elif compression == "xz": import lzma + compress_method = lzma.LZMAFile else: msg = "Unrecognized compression type: {}".format(compression) @@ -253,8 +286,9 @@ def write_to_compressed(compression, path, data, dest="test"): getattr(f, method)(*args) -def assert_almost_equal(left, right, check_dtype="equiv", - check_less_precise=False, **kwargs): +def assert_almost_equal( + left, right, check_dtype="equiv", check_less_precise=False, **kwargs +): """ Check that the left and right objects are approximately equal. @@ -282,25 +316,34 @@ def assert_almost_equal(left, right, check_dtype="equiv", """ if isinstance(left, pd.Index): - assert_index_equal(left, right, - check_exact=False, - exact=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) elif isinstance(left, pd.Series): - assert_series_equal(left, right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) elif isinstance(left, pd.DataFrame): - assert_frame_equal(left, right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) else: # Other sequences. @@ -312,17 +355,18 @@ def assert_almost_equal(left, right, check_dtype="equiv", # Do not compare bool classes, like np.bool_ and bool. 
pass else: - if (isinstance(left, np.ndarray) or - isinstance(right, np.ndarray)): + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): obj = "numpy array" else: obj = "Input" assert_class_equal(left, right, obj=obj) _testing.assert_almost_equal( - left, right, + left, + right, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs) + **kwargs + ) def _check_isinstance(left, right, cls): @@ -346,11 +390,13 @@ def _check_isinstance(left, right, cls): cls_name = cls.__name__ if not isinstance(left, cls): - raise AssertionError(err_msg.format(name=cls_name, exp_type=cls, - act_type=type(left))) + raise AssertionError( + err_msg.format(name=cls_name, exp_type=cls, act_type=type(left)) + ) if not isinstance(right, cls): - raise AssertionError(err_msg.format(name=cls_name, exp_type=cls, - act_type=type(right))) + raise AssertionError( + err_msg.format(name=cls_name, exp_type=cls, act_type=type(right)) + ) def assert_dict_equal(left, right, compare_keys=True): @@ -363,26 +409,33 @@ def randbool(size=(), p=0.5): return rand(*size) <= p -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), - dtype=(np.str_, 1)) -RANDU_CHARS = np.array(list("".join(map(chr, range(1488, 1488 + 26))) + - string.digits), dtype=(np.unicode_, 1)) +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) -def rands_array(nchars, size, dtype='O'): +def rands_array(nchars, size, dtype="O"): """Generate an array of byte strings.""" - retval = (np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) - .view((np.str_, nchars)).reshape(size)) + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) if dtype is None: return retval else: return retval.astype(dtype) -def randu_array(nchars, size, dtype='O'): +def randu_array(nchars, size, dtype="O"): """Generate an array of unicode strings.""" - retval = (np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)).reshape(size)) + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) if dtype is None: return retval else: @@ -396,7 +449,7 @@ def rands(nchars): See `rands_array` if you want to create an array of random strings. """ - return ''.join(np.random.choice(RANDS_CHARS, nchars)) + return "".join(np.random.choice(RANDS_CHARS, nchars)) def randu(nchars): @@ -406,7 +459,7 @@ def randu(nchars): See `randu_array` if you want to create an array of random unicode strings. """ - return ''.join(np.random.choice(RANDU_CHARS, nchars)) + return "".join(np.random.choice(RANDU_CHARS, nchars)) def close(fignum=None): @@ -436,7 +489,7 @@ def ensure_clean(filename=None, return_filelike=False): if True, returns a file-like which is *always* cleaned. Necessary for savefig and other functions which want to append extensions. 
""" - filename = filename or '' + filename = filename or "" fd = None if return_filelike: @@ -454,7 +507,8 @@ def ensure_clean(filename=None, return_filelike=False): fd, filename = tempfile.mkstemp(suffix=filename) except UnicodeEncodeError: import pytest - pytest.skip('no unicode file names on this system') + + pytest.skip("no unicode file names on this system") try: yield filename @@ -462,8 +516,11 @@ def ensure_clean(filename=None, return_filelike=False): try: os.close(fd) except Exception: - print("Couldn't close file descriptor: {fdesc} (file: {fname})" - .format(fdesc=fd, fname=filename)) + print( + "Couldn't close file descriptor: {fdesc} (file: {fname})".format( + fdesc=fd, fname=filename + ) + ) try: if os.path.exists(filename): os.remove(filename) @@ -480,7 +537,7 @@ def ensure_clean_dir(): ------ Temporary directory path """ - directory_name = tempfile.mkdtemp(suffix='') + directory_name = tempfile.mkdtemp(suffix="") try: yield directory_name finally: @@ -516,14 +573,16 @@ def equalContents(arr1, arr2): return frozenset(arr1) == frozenset(arr2) -def assert_index_equal(left: Index, - right: Index, - exact: Union[bool, str] = 'equiv', - check_names: bool = True, - check_less_precise: Union[bool, int] = False, - check_exact: bool = True, - check_categorical: bool = True, - obj: str = 'Index') -> None: +def assert_index_equal( + left: Index, + right: Index, + exact: Union[bool, str] = "equiv", + check_names: bool = True, + check_less_precise: Union[bool, int] = False, + check_exact: bool = True, + check_categorical: bool = True, + obj: str = "Index", +) -> None: """Check that left and right Index are equal. Parameters @@ -550,19 +609,19 @@ def assert_index_equal(left: Index, """ __tracebackhide__ = True - def _check_types(l, r, obj='Index'): + def _check_types(l, r, obj="Index"): if exact: assert_class_equal(l, r, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal('dtype', l, r, obj=obj) + assert_attr_equal("dtype", l, r, obj=obj) # allow string-like to have different inferred_types - if l.inferred_type in ('string', 'unicode'): - assert r.inferred_type in ('string', 'unicode') + if l.inferred_type in ("string", "unicode"): + assert r.inferred_type in ("string", "unicode") else: - assert_attr_equal('inferred_type', l, r, obj=obj) + assert_attr_equal("inferred_type", l, r, obj=obj) def _get_ilevel_values(index, level): # accept level number only @@ -580,16 +639,16 @@ def _get_ilevel_values(index, level): # level comparison if left.nlevels != right.nlevels: - msg1 = '{obj} levels are different'.format(obj=obj) - msg2 = '{nlevels}, {left}'.format(nlevels=left.nlevels, left=left) - msg3 = '{nlevels}, {right}'.format(nlevels=right.nlevels, right=right) + msg1 = "{obj} levels are different".format(obj=obj) + msg2 = "{nlevels}, {left}".format(nlevels=left.nlevels, left=left) + msg3 = "{nlevels}, {right}".format(nlevels=right.nlevels, right=right) raise_assert_detail(obj, msg1, msg2, msg3) # length comparison if len(left) != len(right): - msg1 = '{obj} length are different'.format(obj=obj) - msg2 = '{length}, {left}'.format(length=len(left), left=left) - msg3 = '{length}, {right}'.format(length=len(right), right=right) + msg1 = "{obj} length are different".format(obj=obj) + msg2 = "{length}, {left}".format(length=len(left), left=left) + msg3 = "{length}, {right}".format(length=len(right), right=right) raise_assert_detail(obj, msg1, msg2, msg3) # MultiIndex special comparison for little-friendly error messages @@ 
-602,44 +661,54 @@ def _get_ilevel_values(index, level): llevel = _get_ilevel_values(left, level) rlevel = _get_ilevel_values(right, level) - lobj = 'MultiIndex level [{level}]'.format(level=level) - assert_index_equal(llevel, rlevel, - exact=exact, check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, obj=lobj) + lobj = "MultiIndex level [{level}]".format(level=level) + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=lobj, + ) # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): - diff = np.sum((left.values != right.values) - .astype(int)) * 100.0 / len(left) - msg = '{obj} values are different ({pct} %)'.format( - obj=obj, pct=np.round(diff, 5)) + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = "{obj} values are different ({pct} %)".format( + obj=obj, pct=np.round(diff, 5) + ) raise_assert_detail(obj, msg, left, right) else: - _testing.assert_almost_equal(left.values, right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, lobj=left, robj=right) + _testing.assert_almost_equal( + left.values, + right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, + lobj=left, + robj=right, + ) # metadata comparison if check_names: - assert_attr_equal('names', left, right, obj=obj) + assert_attr_equal("names", left, right, obj=obj) if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): - assert_attr_equal('freq', left, right, obj=obj) - if (isinstance(left, pd.IntervalIndex) or - isinstance(right, pd.IntervalIndex)): + assert_attr_equal("freq", left, right, obj=obj) + if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): assert_interval_array_equal(left.values, right.values) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, - obj='{obj} category'.format(obj=obj)) + assert_categorical_equal( + left.values, right.values, obj="{obj} category".format(obj=obj) + ) -def assert_class_equal(left, right, exact=True, obj='Input'): +def assert_class_equal(left, right, exact=True, obj="Input"): """checks classes are equal.""" __tracebackhide__ = True @@ -653,22 +722,20 @@ def repr_class(x): except AttributeError: return repr(type(x)) - if exact == 'equiv': + if exact == "equiv": if type(left) != type(right): # allow equivalence of Int64Index/RangeIndex types = {type(left).__name__, type(right).__name__} - if len(types - {'Int64Index', 'RangeIndex'}): - msg = '{obj} classes are not equivalent'.format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), - repr_class(right)) + if len(types - {"Int64Index", "RangeIndex"}): + msg = "{obj} classes are not equivalent".format(obj=obj) + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) elif exact: if type(left) != type(right): - msg = '{obj} classes are different'.format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), - repr_class(right)) + msg = "{obj} classes are different".format(obj=obj) + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) -def assert_attr_equal(attr, left, right, obj='Attributes'): +def assert_attr_equal(attr, left, right, obj="Attributes"): """checks attributes 
are equal. Both objects must have attribute. Parameters @@ -688,8 +755,12 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): if left_attr is right_attr: return True - elif (is_number(left_attr) and np.isnan(left_attr) and - is_number(right_attr) and np.isnan(right_attr)): + elif ( + is_number(left_attr) + and np.isnan(left_attr) + and is_number(right_attr) + and np.isnan(right_attr) + ): # np.nan return True @@ -710,20 +781,25 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): def assert_is_valid_plot_return_object(objs): import matplotlib.pyplot as plt + if isinstance(objs, (pd.Series, np.ndarray)): for el in objs.ravel(): - msg = ("one of 'objs' is not a matplotlib Axes instance, type " - "encountered {name!r}").format(name=el.__class__.__name__) + msg = ( + "one of 'objs' is not a matplotlib Axes instance, type " + "encountered {name!r}" + ).format(name=el.__class__.__name__) assert isinstance(el, (plt.Axes, dict)), msg else: assert isinstance(objs, (plt.Artist, tuple, dict)), ( - 'objs is neither an ndarray of Artist instances nor a ' - 'single Artist instance, tuple, or dict, "objs" is a {name!r}' - .format(name=objs.__class__.__name__)) + "objs is neither an ndarray of Artist instances nor a " + 'single Artist instance, tuple, or dict, "objs" is a {name!r}'.format( + name=objs.__class__.__name__ + ) + ) def isiterable(obj): - return hasattr(obj, '__iter__') + return hasattr(obj, "__iter__") def assert_is_sorted(seq): @@ -734,8 +810,9 @@ def assert_is_sorted(seq): assert_numpy_array_equal(seq, np.sort(np.array(seq))) -def assert_categorical_equal(left, right, check_dtype=True, - check_category_order=True, obj='Categorical'): +def assert_categorical_equal( + left, right, check_dtype=True, check_category_order=True, obj="Categorical" +): """Test that Categoricals are equivalent. Parameters @@ -756,24 +833,31 @@ def assert_categorical_equal(left, right, check_dtype=True, _check_isinstance(left, right, Categorical) if check_category_order: - assert_index_equal(left.categories, right.categories, - obj='{obj}.categories'.format(obj=obj)) - assert_numpy_array_equal(left.codes, right.codes, - check_dtype=check_dtype, - obj='{obj}.codes'.format(obj=obj)) + assert_index_equal( + left.categories, right.categories, obj="{obj}.categories".format(obj=obj) + ) + assert_numpy_array_equal( + left.codes, + right.codes, + check_dtype=check_dtype, + obj="{obj}.codes".format(obj=obj), + ) else: - assert_index_equal(left.categories.sort_values(), - right.categories.sort_values(), - obj='{obj}.categories'.format(obj=obj)) - assert_index_equal(left.categories.take(left.codes), - right.categories.take(right.codes), - obj='{obj}.values'.format(obj=obj)) + assert_index_equal( + left.categories.sort_values(), + right.categories.sort_values(), + obj="{obj}.categories".format(obj=obj), + ) + assert_index_equal( + left.categories.take(left.codes), + right.categories.take(right.codes), + obj="{obj}.values".format(obj=obj), + ) - assert_attr_equal('ordered', left, right, obj=obj) + assert_attr_equal("ordered", left, right, obj=obj) -def assert_interval_array_equal(left, right, exact='equiv', - obj='IntervalArray'): +def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): """Test that two IntervalArrays are equivalent. 
Parameters @@ -790,37 +874,38 @@ def assert_interval_array_equal(left, right, exact='equiv', """ _check_isinstance(left, right, IntervalArray) - assert_index_equal(left.left, right.left, exact=exact, - obj='{obj}.left'.format(obj=obj)) - assert_index_equal(left.right, right.right, exact=exact, - obj='{obj}.left'.format(obj=obj)) - assert_attr_equal('closed', left, right, obj=obj) + assert_index_equal( + left.left, right.left, exact=exact, obj="{obj}.left".format(obj=obj) + ) + assert_index_equal( + left.right, right.right, exact=exact, obj="{obj}.left".format(obj=obj) + ) + assert_attr_equal("closed", left, right, obj=obj) -def assert_period_array_equal(left, right, obj='PeriodArray'): +def assert_period_array_equal(left, right, obj="PeriodArray"): _check_isinstance(left, right, PeriodArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}.values'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) + assert_numpy_array_equal( + left._data, right._data, obj="{obj}.values".format(obj=obj) + ) + assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj='DatetimeArray'): +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}._data'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) - assert_attr_equal('tz', left, right, obj=obj) + assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) + assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj='TimedeltaArray'): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}._data'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) + assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None): @@ -840,7 +925,9 @@ def raise_assert_detail(obj, message, left, right, diff=None): {message} [left]: {left} -[right]: {right}""".format(obj=obj, message=message, left=left, right=right) +[right]: {right}""".format( + obj=obj, message=message, left=left, right=right + ) if diff is not None: msg += "\n[diff]: {diff}".format(diff=diff) @@ -848,9 +935,15 @@ def raise_assert_detail(obj, message, left, right, diff=None): raise AssertionError(msg) -def assert_numpy_array_equal(left, right, strict_nan=False, - check_dtype=True, err_msg=None, - check_same=None, obj='numpy array'): +def assert_numpy_array_equal( + left, + right, + strict_nan=False, + check_dtype=True, + err_msg=None, + check_same=None, + obj="numpy array", +): """ Checks that 'np.ndarray' is equivalent Parameters @@ -878,27 +971,29 @@ def assert_numpy_array_equal(left, right, strict_nan=False, _check_isinstance(left, right, np.ndarray) def _get_base(obj): - return obj.base if getattr(obj, 'base', None) is not None else obj + return obj.base if getattr(obj, "base", None) is not None else obj left_base = _get_base(left) right_base = _get_base(right) - if check_same == 'same': + if check_same == "same": if left_base is not right_base: - msg = "{left!r} is not {right!r}".format( - left=left_base, right=right_base) + msg = "{left!r} is not 
{right!r}".format(left=left_base, right=right_base) raise AssertionError(msg) - elif check_same == 'copy': + elif check_same == "copy": if left_base is right_base: - msg = "{left!r} is {right!r}".format( - left=left_base, right=right_base) + msg = "{left!r} is {right!r}".format(left=left_base, right=right_base) raise AssertionError(msg) def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: - raise_assert_detail(obj, '{obj} shapes are different' - .format(obj=obj), left.shape, right.shape) + raise_assert_detail( + obj, + "{obj} shapes are different".format(obj=obj), + left.shape, + right.shape, + ) diff = 0 for l, r in zip(left, right): @@ -907,8 +1002,9 @@ def _raise(left, right, err_msg): diff += 1 diff = diff * 100.0 / left.size - msg = '{obj} values are different ({pct} %)'.format( - obj=obj, pct=np.round(diff, 5)) + msg = "{obj} values are different ({pct} %)".format( + obj=obj, pct=np.round(diff, 5) + ) raise_assert_detail(obj, msg, left, right) raise AssertionError(err_msg) @@ -919,12 +1015,12 @@ def _raise(left, right, err_msg): if check_dtype: if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - assert_attr_equal('dtype', left, right, obj=obj) + assert_attr_equal("dtype", left, right, obj=obj) -def assert_extension_array_equal(left, right, check_dtype=True, - check_less_precise=False, - check_exact=False): +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): """Check that left and right ExtensionArrays are equal. Parameters @@ -946,10 +1042,10 @@ def assert_extension_array_equal(left, right, check_dtype=True, A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. 
""" - assert isinstance(left, ExtensionArray), 'left is not an ExtensionArray' - assert isinstance(right, ExtensionArray), 'right is not an ExtensionArray' + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: - assert_attr_equal('dtype', left, right, obj='ExtensionArray') + assert_attr_equal("dtype", left, right, obj="ExtensionArray") if hasattr(left, "asi8") and type(right) == type(left): # Avoid slow object-dtype comparisons @@ -958,29 +1054,36 @@ def assert_extension_array_equal(left, right, check_dtype=True, left_na = np.asarray(left.isna()) right_na = np.asarray(right.isna()) - assert_numpy_array_equal(left_na, right_na, obj='ExtensionArray NA mask') + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") left_valid = np.asarray(left[~left_na].astype(object)) right_valid = np.asarray(right[~right_na].astype(object)) if check_exact: - assert_numpy_array_equal(left_valid, right_valid, obj='ExtensionArray') + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") else: - _testing.assert_almost_equal(left_valid, right_valid, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - obj='ExtensionArray') + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) # This could be refactored to use the NDFrame.equals method -def assert_series_equal(left, right, check_dtype=True, - check_index_type='equiv', - check_series_type=True, - check_less_precise=False, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - obj='Series'): +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): """Check that left and right Series are equal. Parameters @@ -1029,93 +1132,117 @@ def assert_series_equal(left, right, check_dtype=True, # length comparison if len(left) != len(right): - msg1 = '{len}, {left}'.format(len=len(left), left=left.index) - msg2 = '{len}, {right}'.format(len=len(right), right=right.index) - raise_assert_detail(obj, 'Series length are different', msg1, msg2) + msg1 = "{len}, {left}".format(len=len(left), left=left.index) + msg2 = "{len}, {right}".format(len=len(right), right=right.index) + raise_assert_detail(obj, "Series length are different", msg1, msg2) # index comparison - assert_index_equal(left.index, right.index, exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.index".format(obj=obj), + ) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. 
We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` - if (is_categorical_dtype(left) and is_categorical_dtype(right) and - not check_categorical): + if ( + is_categorical_dtype(left) + and is_categorical_dtype(right) + and not check_categorical + ): pass else: - assert_attr_equal('dtype', left, right) + assert_attr_equal("dtype", left, right) if check_exact: - assert_numpy_array_equal(left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - obj='{obj}'.format(obj=obj),) + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + obj="{obj}".format(obj=obj), + ) elif check_datetimelike_compat: # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case - if (is_datetimelike_v_numeric(left, right) or - is_datetimelike_v_object(left, right) or - needs_i8_conversion(left) or - needs_i8_conversion(right)): + if ( + is_datetimelike_v_numeric(left, right) + or is_datetimelike_v_object(left, right) + or needs_i8_conversion(left) + or needs_i8_conversion(right) + ): # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left.values).equals(Index(right.values)): - msg = ('[datetimelike_compat=True] {left} is not equal to ' - '{right}.').format(left=left.values, right=right.values) + msg = ( + "[datetimelike_compat=True] {left} is not equal to " "{right}." + ).format(left=left.values, right=right.values) raise AssertionError(msg) else: - assert_numpy_array_equal(left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype) + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + ) elif is_interval_dtype(left) or is_interval_dtype(right): assert_interval_array_equal(left.array, right.array) - elif (is_extension_array_dtype(left.dtype) and - is_datetime64tz_dtype(left.dtype)): + elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): # .values is an ndarray, but ._values is the ExtensionArray. 
# TODO: Use .array assert is_extension_array_dtype(right.dtype) assert_extension_array_equal(left._values, right._values) - elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and - is_extension_array_dtype(right) and not is_categorical_dtype(right)): + elif ( + is_extension_array_dtype(left) + and not is_categorical_dtype(left) + and is_extension_array_dtype(right) + and not is_categorical_dtype(right) + ): assert_extension_array_equal(left.array, right.array) else: - _testing.assert_almost_equal(left._internal_get_values(), - right._internal_get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj='{obj}'.format(obj=obj)) + _testing.assert_almost_equal( + left._internal_get_values(), + right._internal_get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj="{obj}".format(obj=obj), + ) # metadata comparison if check_names: - assert_attr_equal('name', left, right, obj=obj) + assert_attr_equal("name", left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, - obj='{obj} category'.format(obj=obj)) + assert_categorical_equal( + left.values, right.values, obj="{obj} category".format(obj=obj) + ) # This could be refactored to use the NDFrame.equals method -def assert_frame_equal(left, right, check_dtype=True, - check_index_type='equiv', - check_column_type='equiv', - check_frame_type=True, - check_less_precise=False, - check_names=True, - by_blocks=False, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_like=False, - obj='DataFrame'): +def assert_frame_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_column_type="equiv", + check_frame_type=True, + check_less_precise=False, + check_names=True, + by_blocks=False, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_like=False, + obj="DataFrame", +): """ Check that left and right DataFrame are equal. 
@@ -1219,29 +1346,39 @@ def assert_frame_equal(left, right, check_dtype=True, # shape comparison if left.shape != right.shape: - raise_assert_detail(obj, - '{obj} shape mismatch'.format(obj=obj), - '{shape!r}'.format(shape=left.shape), - '{shape!r}'.format(shape=right.shape)) + raise_assert_detail( + obj, + "{obj} shape mismatch".format(obj=obj), + "{shape!r}".format(shape=left.shape), + "{shape!r}".format(shape=right.shape), + ) if check_like: left, right = left.reindex_like(right), right # index comparison - assert_index_equal(left.index, right.index, exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.index".format(obj=obj), + ) # column comparison - assert_index_equal(left.columns, right.columns, exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.columns'.format(obj=obj)) + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.columns".format(obj=obj), + ) # compare by blocks if by_blocks: @@ -1250,8 +1387,9 @@ def assert_frame_equal(left, right, check_dtype=True, for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks - assert_frame_equal(lblocks[dtype], rblocks[dtype], - check_dtype=check_dtype, obj=obj) + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) # compare by columns else: @@ -1260,13 +1398,17 @@ def assert_frame_equal(left, right, check_dtype=True, lcol = left.iloc[:, i] rcol = right.iloc[:, i] assert_series_equal( - lcol, rcol, check_dtype=check_dtype, + lcol, + rcol, + check_dtype=check_dtype, check_index_type=check_index_type, check_less_precise=check_less_precise, - check_exact=check_exact, check_names=check_names, + check_exact=check_exact, + check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, - obj='{obj}.iloc[:, {idx}]'.format(obj=obj, idx=i)) + obj="{obj}.iloc[:, {idx}]".format(obj=obj, idx=i), + ) def assert_equal(left, right, **kwargs): @@ -1359,9 +1501,14 @@ def to_array(obj): # Sparse -def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, - check_fill_value=True, - consolidate_block_indices=False): +def assert_sp_array_equal( + left, + right, + check_dtype=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, +): """Check that the left and right SparseArray are equal. 
Parameters @@ -1384,8 +1531,7 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, _check_isinstance(left, right, pd.SparseArray) - assert_numpy_array_equal(left.sp_values, right.sp_values, - check_dtype=check_dtype) + assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) # SparseIndex comparison assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) @@ -1398,32 +1544,38 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, left_index = left.sp_index right_index = right.sp_index - if consolidate_block_indices and left.kind == 'block': + if consolidate_block_indices and left.kind == "block": # we'll probably remove this hack... left_index = left_index.to_int_index().to_block_index() right_index = right_index.to_int_index().to_block_index() if not left_index.equals(right_index): - raise_assert_detail('SparseArray.index', 'index are not equal', - left_index, right_index) + raise_assert_detail( + "SparseArray.index", "index are not equal", left_index, right_index + ) else: # Just ensure a pass if check_fill_value: - assert_attr_equal('fill_value', left, right) + assert_attr_equal("fill_value", left, right) if check_dtype: - assert_attr_equal('dtype', left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), - check_dtype=check_dtype) - - -def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, - check_series_type=True, check_names=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj='SparseSeries'): + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + + +def assert_sp_series_equal( + left, + right, + check_dtype=True, + exact_indices=True, + check_series_type=True, + check_names=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, + obj="SparseSeries", +): """Check that the left and right SparseSeries are equal. Parameters @@ -1456,28 +1608,35 @@ def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, if check_series_type: assert_class_equal(left, right, obj=obj) - assert_index_equal(left.index, right.index, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) - assert_sp_array_equal(left.values, right.values, - check_kind=check_kind, - check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices) + assert_sp_array_equal( + left.values, + right.values, + check_kind=check_kind, + check_fill_value=check_fill_value, + consolidate_block_indices=consolidate_block_indices, + ) if check_names: - assert_attr_equal('name', left, right) + assert_attr_equal("name", left, right) if check_dtype: - assert_attr_equal('dtype', left, right) - - assert_numpy_array_equal(np.asarray(left.values), - np.asarray(right.values)) - - -def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, - check_frame_type=True, check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj='SparseDataFrame'): + assert_attr_equal("dtype", left, right) + + assert_numpy_array_equal(np.asarray(left.values), np.asarray(right.values)) + + +def assert_sp_frame_equal( + left, + right, + check_dtype=True, + exact_indices=True, + check_frame_type=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, + obj="SparseDataFrame", +): """Check that the left and right SparseDataFrame are equal. 
Parameters @@ -1510,35 +1669,36 @@ def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, if check_frame_type: assert_class_equal(left, right, obj=obj) - assert_index_equal(left.index, right.index, - obj='{obj}.index'.format(obj=obj)) - assert_index_equal(left.columns, right.columns, - obj='{obj}.columns'.format(obj=obj)) + assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) + assert_index_equal(left.columns, right.columns, obj="{obj}.columns".format(obj=obj)) if check_fill_value: - assert_attr_equal('default_fill_value', left, right, obj=obj) + assert_attr_equal("default_fill_value", left, right, obj=obj) for col, series in left.items(): - assert (col in right) + assert col in right # trade-off? if exact_indices: assert_sp_series_equal( - series, right[col], + series, + right[col], check_dtype=check_dtype, check_kind=check_kind, check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices + consolidate_block_indices=consolidate_block_indices, ) else: - assert_series_equal(series.to_dense(), right[col].to_dense(), - check_dtype=check_dtype) + assert_series_equal( + series.to_dense(), right[col].to_dense(), check_dtype=check_dtype + ) # do I care? # assert(left.default_kind == right.default_kind) for col in right: - assert (col in left) + assert col in left + # ----------------------------------------------------------------------------- # Others @@ -1560,9 +1720,10 @@ def assert_copy(iter1, iter2, **eql_kwargs): """ for elem1, elem2 in zip(iter1, iter2): assert_almost_equal(elem1, elem2, **eql_kwargs) - msg = ("Expected object {obj1!r} and object {obj2!r} to be " - "different objects, but they were the same object." - ).format(obj1=type(elem1), obj2=type(elem2)) + msg = ( + "Expected object {obj1!r} and object {obj2!r} to be " + "different objects, but they were the same object." 
+ ).format(obj1=type(elem1), obj2=type(elem2)) assert elem1 is not elem2, msg @@ -1604,7 +1765,7 @@ def makeIntIndex(k=10, name=None): def makeUIntIndex(k=10, name=None): - return Index([2**63 + i for i in range(k)], name=name) + return Index([2 ** 63 + i for i in range(k)], name=name) def makeRangeIndex(k=10, name=None, **kwargs): @@ -1616,26 +1777,24 @@ def makeFloatIndex(k=10, name=None): return Index(values * (10 ** np.random.randint(0, 9)), name=name) -def makeDateIndex(k=10, freq='B', name=None, **kwargs): +def makeDateIndex(k=10, freq="B", name=None, **kwargs): dt = datetime(2000, 1, 1) dr = bdate_range(dt, periods=k, freq=freq, name=name) return DatetimeIndex(dr, name=name, **kwargs) -def makeTimedeltaIndex(k=10, freq='D', name=None, **kwargs): - return pd.timedelta_range(start='1 day', periods=k, freq=freq, - name=name, **kwargs) +def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): + return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq='B', name=name, **kwargs) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) return dr def makeMultiIndex(k=10, names=None, **kwargs): - return MultiIndex.from_product( - (('foo', 'bar'), (1, 2)), names=names, **kwargs) + return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) def all_index_generator(k=10): @@ -1646,21 +1805,32 @@ def all_index_generator(k=10): ---------- k: length of each of the index instances """ - all_make_index_funcs = [makeIntIndex, makeFloatIndex, makeStringIndex, - makeUnicodeIndex, makeDateIndex, makePeriodIndex, - makeTimedeltaIndex, makeBoolIndex, makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex] + all_make_index_funcs = [ + makeIntIndex, + makeFloatIndex, + makeStringIndex, + makeUnicodeIndex, + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeBoolIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + ] for make_index_func in all_make_index_funcs: yield make_index_func(k=k) def index_subclass_makers_generator(): make_index_funcs = [ - makeDateIndex, makePeriodIndex, - makeTimedeltaIndex, makeRangeIndex, - makeIntervalIndex, makeCategoricalIndex, - makeMultiIndex + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + makeMultiIndex, ] for make_index_func in make_index_funcs: yield make_index_func @@ -1702,7 +1872,7 @@ def getSeriesData(): return {c: Series(randn(N), index=index) for c in getCols(K)} -def makeTimeSeries(nper=None, freq='B', name=None): +def makeTimeSeries(nper=None, freq="B", name=None): if nper is None: nper = N return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) @@ -1714,7 +1884,7 @@ def makePeriodSeries(nper=None, name=None): return Series(randn(nper), index=makePeriodIndex(nper), name=name) -def getTimeSeriesData(nper=None, freq='B'): +def getTimeSeriesData(nper=None, freq="B"): return {c: makeTimeSeries(nper, freq) for c in getCols(K)} @@ -1723,7 +1893,7 @@ def getPeriodData(nper=None): # make frame -def makeTimeDataFrame(nper=None, freq='B'): +def makeTimeDataFrame(nper=None, freq="B"): data = getTimeSeriesData(nper, freq) return DataFrame(data) @@ -1734,13 +1904,13 @@ def makeDataFrame(): def getMixedTypeDict(): - index = Index(['a', 'b', 'c', 'd', 'e']) + index = Index(["a", "b", "c", "d", "e"]) data = { - 'A': [0., 1., 2., 3., 4.], - 'B': [0., 1., 0., 1., 0.], 
- 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': bdate_range('1/1/2009', periods=5) + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), } return index, data @@ -1755,8 +1925,9 @@ def makePeriodFrame(nper=None): return DataFrame(data) -def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, - idx_type=None): +def makeCustomIndex( + nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None +): """Create an index/multindex with given dimensions, levels, names, etc' nentries - number of entries in index @@ -1781,12 +1952,11 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, if ndupe_l is None: ndupe_l = [1] * nlevels - assert (is_sequence(ndupe_l) and len(ndupe_l) <= nlevels) - assert (names is None or names is False or - names is True or len(names) is nlevels) - assert idx_type is None or (idx_type in ('i', 'f', 's', 'u', - 'dt', 'p', 'td') - and nlevels == 1) + assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels + assert names is None or names is False or names is True or len(names) is nlevels + assert idx_type is None or ( + idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 + ) if names is True: # build default names @@ -1800,10 +1970,15 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, names = [names] # specific 1D index type requested? - idx_func = dict(i=makeIntIndex, f=makeFloatIndex, - s=makeStringIndex, u=makeUnicodeIndex, - dt=makeDateIndex, td=makeTimedeltaIndex, - p=makePeriodIndex).get(idx_type) + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) if idx_func: idx = idx_func(nentries) # but we need to fill in the name @@ -1811,9 +1986,10 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, idx.name = names[0] return idx elif idx_type is not None: - raise ValueError('"{idx_type}" is not a legal value for `idx_type`, ' - 'use "i"/"f"/"s"/"u"/"dt/"p"/"td".' 
- .format(idx_type=idx_type)) + raise ValueError( + '"{idx_type}" is not a legal value for `idx_type`, ' + 'use "i"/"f"/"s"/"u"/"dt/"p"/"td".'.format(idx_type=idx_type) + ) if len(ndupe_l) < nlevels: ndupe_l.extend([1] * (nlevels - len(ndupe_l))) @@ -1823,8 +1999,10 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, tuples = [] for i in range(nlevels): + def keyfunc(x): import re + numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") return [int(num) for num in numeric_tuple] @@ -1832,7 +2010,7 @@ def keyfunc(x): div_factor = nentries // ndupe_l[i] + 1 cnt = Counter() for j in range(div_factor): - label = '{prefix}_l{i}_g{j}'.format(prefix=prefix, i=i, j=j) + label = "{prefix}_l{i}_g{j}".format(prefix=prefix, i=i, j=j) cnt[label] = ndupe_l[i] # cute Counter trick result = list(sorted(cnt.elements(), key=keyfunc))[:nentries] @@ -1852,10 +2030,20 @@ def keyfunc(x): return index -def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, - c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None, - c_ndupe_l=None, r_ndupe_l=None, dtype=None, - c_idx_type=None, r_idx_type=None): +def makeCustomDataframe( + nrows, + ncols, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): """ nrows, ncols - number of data rows/cols c_idx_names, idx_names - False/True/list of strings, yields No names , @@ -1913,19 +2101,29 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, assert c_idx_nlevels > 0 assert r_idx_nlevels > 0 - assert r_idx_type is None or (r_idx_type in ('i', 'f', 's', - 'u', 'dt', 'p', 'td') - and r_idx_nlevels == 1) - assert c_idx_type is None or (c_idx_type in ('i', 'f', 's', - 'u', 'dt', 'p', 'td') - and c_idx_nlevels == 1) - - columns = makeCustomIndex(ncols, nlevels=c_idx_nlevels, prefix='C', - names=c_idx_names, ndupe_l=c_ndupe_l, - idx_type=c_idx_type) - index = makeCustomIndex(nrows, nlevels=r_idx_nlevels, prefix='R', - names=r_idx_names, ndupe_l=r_ndupe_l, - idx_type=r_idx_type) + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) # by default, generate data based on location if data_gen_f is None: @@ -1958,17 +2156,27 @@ def _gen_unique_rand(rng, _extra_size): extra_size *= 1.05 ind = _gen_unique_rand(random_state, extra_size) - j = np.floor(ind * 1. 
/ nrows).astype(int) + j = np.floor(ind * 1.0 / nrows).astype(int) i = (ind - j * nrows).astype(int) return i.tolist(), j.tolist() -def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, - c_idx_names=True, r_idx_names=True, - c_idx_nlevels=1, r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, r_ndupe_l=None, dtype=None, - c_idx_type=None, r_idx_type=None): +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): """ Parameters ---------- @@ -1980,30 +2188,34 @@ def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, See makeCustomDataframe for descriptions of the rest of the parameters. """ - df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l, - dtype=dtype, c_idx_type=c_idx_type, - r_idx_type=r_idx_type) + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) i, j = _create_missing_idx(nrows, ncols, density, random_state) df.values[i, j] = np.nan return df -def makeMissingDataframe(density=.9, random_state=None): +def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, - random_state=random_state) + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) df.values[i, j] = np.nan return df class TestSubDict(dict): - def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) @@ -2039,19 +2251,19 @@ def dec(f): # 'urlopen error timed out', # 'timeout: timed out', # 'socket.timeout: timed out', - 'timed out', - 'Server Hangup', - 'HTTP Error 503: Service Unavailable', - '502: Proxy Error', - 'HTTP Error 502: internal error', - 'HTTP Error 502', - 'HTTP Error 503', - 'HTTP Error 403', - 'HTTP Error 400', - 'Temporary failure in name resolution', - 'Name or service not known', - 'Connection refused', - 'certificate verify', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", ) # or this e.errno/e.reason.errno @@ -2060,8 +2272,8 @@ def dec(f): 111, # Connection refused 110, # Connection timed out 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out ) # Both of the above shouldn't mask real issues such as 404's @@ -2098,13 +2310,15 @@ def can_connect(url, error_classes=_network_error_classes): @optional_args -def network(t, url="http://www.google.com", - raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, - check_before_test=False, - error_classes=_network_error_classes, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, - ): +def network( + t, + 
url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=_network_error_classes, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): """ Label a test as requiring network connection and, if an error is encountered, only raise if it does not find a network connection. @@ -2188,6 +2402,7 @@ def network(t, url="http://www.google.com", Errors not related to networking will always be raised. """ from pytest import skip + t.network = True @wraps(t) @@ -2198,13 +2413,15 @@ def wrapper(*args, **kwargs): try: return t(*args, **kwargs) except Exception as e: - errno = getattr(e, 'errno', None) + errno = getattr(e, "errno", None) if not errno and hasattr(errno, "reason"): - errno = getattr(e.reason, 'errno', None) + errno = getattr(e.reason, "errno", None) if errno in skip_errnos: - skip("Skipping test due to known errno" - " and error {error}".format(error=e)) + skip( + "Skipping test due to known errno" + " and error {error}".format(error=e) + ) try: e_str = traceback.format_exc(e) @@ -2212,8 +2429,10 @@ def wrapper(*args, **kwargs): e_str = str(e) if any(m.lower() in e_str.lower() for m in _skip_on_messages): - skip("Skipping test because exception " - "message is known and error {error}".format(error=e)) + skip( + "Skipping test because exception " + "message is known and error {error}".format(error=e) + ) if not isinstance(e, error_classes): raise @@ -2221,8 +2440,10 @@ def wrapper(*args, **kwargs): if raise_on_error or can_connect(url, error_classes): raise else: - skip("Skipping test due to lack of connectivity" - " and error {error}".format(error=e)) + skip( + "Skipping test due to lack of connectivity" + " and error {error}".format(error=e) + ) return wrapper @@ -2230,8 +2451,7 @@ def wrapper(*args, **kwargs): with_connectivity_check = network -def assert_raises_regex(_exception, _regexp, _callable=None, - *args, **kwargs): +def assert_raises_regex(_exception, _regexp, _callable=None, *args, **kwargs): r""" Check that the specified Exception is raised and that the error message matches a given regular expression pattern. This may be a regular @@ -2271,9 +2491,15 @@ def assert_raises_regex(_exception, _regexp, _callable=None, AssertionError: "banana" does not match "'str' object does not support \ item assignment" """ - warnings.warn(("assert_raises_regex has been deprecated and will " - "be removed in the next release. Please use " - "`pytest.raises` instead."), FutureWarning, stacklevel=2) + warnings.warn( + ( + "assert_raises_regex has been deprecated and will " + "be removed in the next release. Please use " + "`pytest.raises` instead." 
+ ), + FutureWarning, + stacklevel=2, + ) manager = _AssertRaisesContextmanager(exception=_exception, regexp=_regexp) if _callable is not None: @@ -2351,7 +2577,8 @@ def exception_matches(self, exc_type, exc_value, trace_back): if not self.regexp.search(val): msg = '"{pat}" does not match "{val}"'.format( - pat=self.regexp.pattern, val=val) + pat=self.regexp.pattern, val=val + ) e = AssertionError(msg) raise_with_traceback(e, trace_back) @@ -2362,9 +2589,13 @@ def exception_matches(self, exc_type, exc_value, trace_back): @contextmanager -def assert_produces_warning(expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True, - raise_on_extra_warnings=True): +def assert_produces_warning( + expected_warning=Warning, + filter_level="always", + clear=None, + check_stacklevel=True, + raise_on_extra_warnings=True, +): """ Context manager for running code expected to either raise a specific warning, or not raise any warnings. Verifies that the code raises the @@ -2447,30 +2678,40 @@ class for all warnings. To check that no warning is returned, extra_warnings = [] for actual_warning in w: - if (expected_warning and issubclass(actual_warning.category, - expected_warning)): + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): saw_warning = True - if check_stacklevel and issubclass(actual_warning.category, - (FutureWarning, - DeprecationWarning)): + if check_stacklevel and issubclass( + actual_warning.category, (FutureWarning, DeprecationWarning) + ): from inspect import getframeinfo, stack + caller = getframeinfo(stack()[2][0]) - msg = ("Warning not set with correct stacklevel. " - "File where warning is raised: {actual} != " - "{caller}. Warning message: {message}" - ).format(actual=actual_warning.filename, - caller=caller.filename, - message=actual_warning.message) + msg = ( + "Warning not set with correct stacklevel. " + "File where warning is raised: {actual} != " + "{caller}. Warning message: {message}" + ).format( + actual=actual_warning.filename, + caller=caller.filename, + message=actual_warning.message, + ) assert actual_warning.filename == caller.filename, msg else: - extra_warnings.append((actual_warning.category.__name__, - actual_warning.message, - actual_warning.filename, - actual_warning.lineno)) + extra_warnings.append( + ( + actual_warning.category.__name__, + actual_warning.message, + actual_warning.filename, + actual_warning.lineno, + ) + ) if expected_warning: msg = "Did not see expected warning of class {name!r}.".format( - name=expected_warning.__name__) + name=expected_warning.__name__ + ) assert saw_warning, msg if raise_on_extra_warnings and extra_warnings: raise AssertionError( @@ -2529,6 +2770,7 @@ def with_csv_dialect(name, **kwargs): csv : Python's CSV library. 
""" import csv + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} if name in _BUILTIN_DIALECTS: @@ -2542,6 +2784,7 @@ def with_csv_dialect(name, **kwargs): @contextmanager def use_numexpr(use, min_elements=None): from pandas.core.computation import expressions as expr + if min_elements is None: min_elements = expr._MIN_ELEMENTS @@ -2590,19 +2833,20 @@ def inner(*args, **kwargs): threads = [] for i in range(num_threads): updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, - kwargs=updated_kwargs) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) threads.append(thread) for thread in threads: thread.start() for thread in threads: thread.join() + return inner + return wrapper class SubclassedSeries(Series): - _metadata = ['testattr', 'name'] + _metadata = ["testattr", "name"] @property def _constructor(self): @@ -2614,7 +2858,7 @@ def _constructor_expanddim(self): class SubclassedDataFrame(DataFrame): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2626,7 +2870,7 @@ def _constructor_sliced(self): class SubclassedSparseSeries(pd.SparseSeries): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2638,7 +2882,7 @@ def _constructor_expanddim(self): class SubclassedSparseDataFrame(pd.SparseDataFrame): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2650,7 +2894,6 @@ def _constructor_sliced(self): class SubclassedCategorical(Categorical): - @property def _constructor(self): return SubclassedCategorical @@ -2685,14 +2928,14 @@ def set_timezone(tz): def setTZ(tz): if tz is None: try: - del os.environ['TZ'] + del os.environ["TZ"] except KeyError: pass else: - os.environ['TZ'] = tz + os.environ["TZ"] = tz time.tzset() - orig_tz = os.environ.get('TZ') + orig_tz = os.environ.get("TZ") setTZ(tz) try: yield @@ -2716,9 +2959,12 @@ def _make_skipna_wrapper(alternative, skipna_alternative=None): skipna_wrapper : function """ if skipna_alternative: + def skipna_wrapper(x): return skipna_alternative(x.values) + else: + def skipna_wrapper(x): nona = x.dropna() if len(nona) == 0: diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py index f5cdbbe36d90dd..4ca13543211349 100644 --- a/scripts/download_wheels.py +++ b/scripts/download_wheels.py @@ -16,16 +16,18 @@ def parse_args(args=None): def fetch(version): - base = 'http://wheels.scipy.org' + base = "http://wheels.scipy.org" tree = html.parse(base) root = tree.getroot() - dest = pathlib.Path('dist') + dest = pathlib.Path("dist") dest.mkdir(exist_ok=True) - files = [x for x in root.xpath("//a/text()") - if x.startswith('pandas-{}'.format(version)) - and not dest.joinpath(x).exists()] + files = [ + x + for x in root.xpath("//a/text()") + if x.startswith("pandas-{}".format(version)) and not dest.joinpath(x).exists() + ] N = len(files) @@ -33,9 +35,9 @@ def fetch(version): out = str(dest.joinpath(filename)) link = urllib.request.urljoin(base, filename) urllib.request.urlretrieve(link, out) - print("Downloaded {link} to {out} [{i}/{N}]".format( - link=link, out=out, i=i, N=N - )) + print( + "Downloaded {link} to {out} [{i}/{N}]".format(link=link, out=out, i=i, N=N) + ) def main(args=None): @@ -43,5 +45,5 @@ def main(args=None): fetch(args.version) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 959623f4e2b652..1075a257d42705 100755 --- 
a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -28,32 +28,50 @@ Find all commits touching a specified function across the codebase. """.strip() argparser = argparse.ArgumentParser(description=desc) -argparser.add_argument('funcname', metavar='FUNCNAME', - help='Name of function/method to search for changes on') -argparser.add_argument('-f', '--file-masks', metavar='f_re(,f_re)*', - default=[r"\.py.?$"], - help='comma separated list of regexes to match ' - 'filenames against\ndefaults all .py? files') -argparser.add_argument('-d', '--dir-masks', metavar='d_re(,d_re)*', - default=[], - help='comma separated list of regexes to match base ' - 'path against') -argparser.add_argument('-p', '--path-masks', metavar='p_re(,p_re)*', - default=[], - help='comma separated list of regexes to match full ' - 'file path against') -argparser.add_argument('-y', '--saw-the-warning', - action='store_true', default=False, - help='must specify this to run, acknowledge you ' - 'realize this will erase untracked files') -argparser.add_argument('--debug-level', - default="CRITICAL", - help='debug level of messages (DEBUG, INFO, etc...)') +argparser.add_argument( + "funcname", + metavar="FUNCNAME", + help="Name of function/method to search for changes on", +) +argparser.add_argument( + "-f", + "--file-masks", + metavar="f_re(,f_re)*", + default=[r"\.py.?$"], + help="comma separated list of regexes to match " + "filenames against\ndefaults all .py? files", +) +argparser.add_argument( + "-d", + "--dir-masks", + metavar="d_re(,d_re)*", + default=[], + help="comma separated list of regexes to match base " "path against", +) +argparser.add_argument( + "-p", + "--path-masks", + metavar="p_re(,p_re)*", + default=[], + help="comma separated list of regexes to match full " "file path against", +) +argparser.add_argument( + "-y", + "--saw-the-warning", + action="store_true", + default=False, + help="must specify this to run, acknowledge you " + "realize this will erase untracked files", +) +argparser.add_argument( + "--debug-level", + default="CRITICAL", + help="debug level of messages (DEBUG, INFO, etc...)", +) args = argparser.parse_args() -lfmt = logging.Formatter(fmt='%(levelname)-8s %(message)s', - datefmt='%m-%d %H:%M:%S') +lfmt = logging.Formatter(fmt="%(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") shh = logging.StreamHandler() shh.setFormatter(lfmt) logger = logging.getLogger("findit") @@ -70,19 +88,21 @@ def clean_checkout(comm): s = s.split("\n")[0] logger.info("CO: %s %s" % (comm, s)) - sh.git('checkout', comm, _tty_out=False) - sh.git('clean', '-f') + sh.git("checkout", comm, _tty_out=False) + sh.git("clean", "-f") def get_hits(defname, files=()): cs = set() for f in files: try: - r = sh.git('blame', - '-L', - r'/def\s*{start}/,/def/'.format(start=defname), - f, - _tty_out=False) + r = sh.git( + "blame", + "-L", + r"/def\s*{start}/,/def/".format(start=defname), + f, + _tty_out=False, + ) except sh.ErrorReturnCode_128: logger.debug("no matches in %s" % f) continue @@ -96,31 +116,33 @@ def get_hits(defname, files=()): return cs -def get_commit_info(c, fmt, sep='\t'): - r = sh.git('log', - "--format={}".format(fmt), - '{}^..{}'.format(c, c), - "-n", - "1", - _tty_out=False) +def get_commit_info(c, fmt, sep="\t"): + r = sh.git( + "log", + "--format={}".format(fmt), + "{}^..{}".format(c, c), + "-n", + "1", + _tty_out=False, + ) return str(r).split(sep) def get_commit_vitals(c, hlen=HASH_LEN): - h, s, d = get_commit_info(c, '%H\t%s\t%ci', "\t") + h, s, d = 
get_commit_info(c, "%H\t%s\t%ci", "\t") return h[:hlen], s, parse(d) def file_filter(state, dirname, fnames): - if (args.dir_masks and - not any(re.search(x, dirname) for x in args.dir_masks)): + if args.dir_masks and not any(re.search(x, dirname) for x in args.dir_masks): return for f in fnames: p = os.path.abspath(os.path.join(os.path.realpath(dirname), f)) - if (any(re.search(x, f) for x in args.file_masks) or - any(re.search(x, p) for x in args.path_masks)): + if any(re.search(x, f) for x in args.file_masks) or any( + re.search(x, p) for x in args.path_masks + ): if os.path.isfile(p): - state['files'].append(p) + state["files"].append(p) def search(defname, head_commit="HEAD"): @@ -130,7 +152,7 @@ def search(defname, head_commit="HEAD"): # allhits = set() files = [] state = dict(files=files) - os.walk('.', file_filter, state) + os.walk(".", file_filter, state) # files now holds a list of paths to files # seed with hits from q @@ -172,8 +194,10 @@ def sorter(i): h, s, d = get_commit_vitals(hits[i].commit) return hits[i].path, d - print(('\nThese commits touched the %s method in these files ' - 'on these dates:\n') % args.funcname) + print( + ("\nThese commits touched the %s method in these files " "on these dates:\n") + % args.funcname + ) for i in sorted(range(len(hits)), key=sorter): hit = hits[i] h, s, d = get_commit_vitals(hit.commit) @@ -181,7 +205,7 @@ def sorter(i): fmt = "{:%d} {:10} {:<%d} {:<%d}" % (HASH_LEN, SUBJ_LEN, PATH_LEN) if len(s) > SUBJ_LEN: - s = s[:SUBJ_LEN - 5] + " ..." + s = s[: SUBJ_LEN - 5] + " ..." print(fmt.format(h[:HASH_LEN], d.isoformat()[:10], s, p[-20:])) print("\n") @@ -190,21 +214,23 @@ def sorter(i): def main(): if not args.saw_the_warning: argparser.print_help() - print(""" + print( + """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WARNING: this script uses git clean -f, running it on a repo with untracked files. It's recommended that you make a fresh clone and run from its root directory. You must specify the -y argument to ignore this warning. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-""") +""" + ) return if isinstance(args.file_masks, str): - args.file_masks = args.file_masks.split(',') + args.file_masks = args.file_masks.split(",") if isinstance(args.path_masks, str): - args.path_masks = args.path_masks.split(',') + args.path_masks = args.path_masks.split(",") if isinstance(args.dir_masks, str): - args.dir_masks = args.dir_masks.split(',') + args.dir_masks = args.dir_masks.split(",") logger.setLevel(getattr(logging, args.debug_level)) @@ -214,4 +240,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 7b6eb1f9a32b55..ac73859b22598d 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -19,8 +19,8 @@ import yaml -EXCLUDE = {'python=3'} -RENAME = {'pytables': 'tables'} +EXCLUDE = {"python=3"} +RENAME = {"pytables": "tables"} def conda_package_to_pip(package): @@ -36,15 +36,15 @@ def conda_package_to_pip(package): if package in EXCLUDE: return - package = re.sub('(?<=[^<>])=', '==', package).strip() - for compare in ('<=', '>=', '=='): + package = re.sub("(?<=[^<>])=", "==", package).strip() + for compare in ("<=", ">=", "=="): if compare not in package: continue pkg, version = package.split(compare) if pkg in RENAME: - return ''.join((RENAME[pkg], compare, version)) + return "".join((RENAME[pkg], compare, version)) break @@ -73,7 +73,7 @@ def main(conda_fname, pip_fname, compare=False): True if the comparison fails, False otherwise """ with open(conda_fname) as conda_fd: - deps = yaml.safe_load(conda_fd)['dependencies'] + deps = yaml.safe_load(conda_fd)["dependencies"] pip_deps = [] for dep in deps: @@ -81,42 +81,51 @@ def main(conda_fname, pip_fname, compare=False): conda_dep = conda_package_to_pip(dep) if conda_dep: pip_deps.append(conda_dep) - elif isinstance(dep, dict) and len(dep) == 1 and 'pip' in dep: - pip_deps += dep['pip'] + elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: + pip_deps += dep["pip"] else: - raise ValueError('Unexpected dependency {}'.format(dep)) + raise ValueError("Unexpected dependency {}".format(dep)) - pip_content = '\n'.join(pip_deps) + pip_content = "\n".join(pip_deps) if compare: with open(pip_fname) as pip_fd: return pip_content != pip_fd.read() else: - with open(pip_fname, 'w') as pip_fd: + with open(pip_fname, "w") as pip_fd: pip_fd.write(pip_content) return False -if __name__ == '__main__': +if __name__ == "__main__": argparser = argparse.ArgumentParser( - description='convert (or compare) conda file to pip') - argparser.add_argument('--compare', - action='store_true', - help='compare whether the two files are equivalent') - argparser.add_argument('--azure', - action='store_true', - help='show the output in azure-pipelines format') + description="convert (or compare) conda file to pip" + ) + argparser.add_argument( + "--compare", + action="store_true", + help="compare whether the two files are equivalent", + ) + argparser.add_argument( + "--azure", action="store_true", help="show the output in azure-pipelines format" + ) args = argparser.parse_args() repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - res = main(os.path.join(repo_path, 'environment.yml'), - os.path.join(repo_path, 'requirements-dev.txt'), - compare=args.compare) + res = main( + os.path.join(repo_path, "environment.yml"), + os.path.join(repo_path, "requirements-dev.txt"), + compare=args.compare, + ) if res: - msg = ('`requirements-dev.txt` has to be generated with 
`{}` after ' - '`environment.yml` is modified.\n'.format(sys.argv[0])) + msg = ( + "`requirements-dev.txt` has to be generated with `{}` after " + "`environment.yml` is modified.\n".format(sys.argv[0]) + ) if args.azure: - msg = ('##vso[task.logissue type=error;' - 'sourcepath=requirements-dev.txt]{}'.format(msg)) + msg = ( + "##vso[task.logissue type=error;" + "sourcepath=requirements-dev.txt]{}".format(msg) + ) sys.stderr.write(msg) sys.exit(res) diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 5c665faac5976a..95352751a23c6b 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -30,8 +30,8 @@ import sys import textwrap -PANDAS_HOME = '.' -PROJECT_NAME = 'pandas' +PANDAS_HOME = "." +PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) # Remote name with the PR @@ -51,10 +51,12 @@ auth_required = False if auth_required: - GITHUB_USERNAME = os.environ['GITHUB_USER'] + GITHUB_USERNAME = os.environ["GITHUB_USER"] import getpass - GITHUB_PASSWORD = getpass.getpass('Enter github.com password for %s:' - % GITHUB_USERNAME) + + GITHUB_PASSWORD = getpass.getpass( + "Enter github.com password for %s:" % GITHUB_USERNAME + ) def get_json_auth(url): auth = HTTPBasicAuth(GITHUB_USERNAME, GITHUB_PASSWORD) @@ -63,6 +65,7 @@ def get_json_auth(url): get_json = get_json_auth else: + def get_json_no_auth(url): req = requests.get(url) return req.json() @@ -78,12 +81,12 @@ def fail(msg): def run_cmd(cmd): if isinstance(cmd, str): - cmd = cmd.split(' ') + cmd = cmd.split(" ") output = check_output(cmd) if isinstance(output, bytes): - output = output.decode('utf-8') + output = output.decode("utf-8") return output @@ -119,40 +122,44 @@ def clean_up(): def merge_pr(pr_num, target_ref): pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, - target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, - pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, - target_branch_name)) + target_branch_name = "%s_MERGE_PR_%s_%s" % ( + BRANCH_PREFIX, + pr_num, + target_ref.upper(), + ) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) run_cmd("git checkout %s" % target_branch_name) had_conflicts = False try: - run_cmd(['git', 'merge', pr_branch_name, '--squash']) + run_cmd(["git", "merge", pr_branch_name, "--squash"]) except Exception as e: - msg = ("Error merging: %s\nWould you like to manually fix-up " - "this merge?" % e) + msg = "Error merging: %s\nWould you like to manually fix-up " "this merge?" % e continue_maybe(msg) - msg = ("Okay, please fix any conflicts and 'git add' " - "conflicting files... Finished?") + msg = ( + "Okay, please fix any conflicts and 'git add' " + "conflicting files... Finished?" 
+ ) continue_maybe(msg) had_conflicts = True - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), - reverse=True) + commit_authors = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%an <%ae>"] + ).split("\n") + distinct_authors = sorted( + set(commit_authors), key=lambda x: commit_authors.count(x), reverse=True + ) primary_author = distinct_authors[0] - commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%h [%an] %s']).split("\n\n") + commits = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%h [%an] %s"] + ).split("\n\n") merge_message_flags = [] merge_message_flags += ["-m", title] if body is not None: - merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))] + merge_message_flags += ["-m", "\n".join(textwrap.wrap(body))] authors = "\n".join("Author: %s" % a for a in distinct_authors) @@ -161,9 +168,10 @@ def merge_pr(pr_num, target_ref): if had_conflicts: committer_name = run_cmd("git config --get user.name").strip() committer_email = run_cmd("git config --get user.email").strip() - message = ("This patch had conflicts when merged, " - "resolved by\nCommitter: %s <%s>" - % (committer_name, committer_email)) + message = ( + "This patch had conflicts when merged, " + "resolved by\nCommitter: %s <%s>" % (committer_name, committer_email) + ) merge_message_flags += ["-m", message] # The string "Closes #%s" string is required for GitHub to correctly close @@ -171,19 +179,22 @@ def merge_pr(pr_num, target_ref): merge_message_flags += [ "-m", "Closes #%s from %s and squashes the following commits:" - % (pr_num, pr_repo_desc)] + % (pr_num, pr_repo_desc), + ] for c in commits: merge_message_flags += ["-m", c] - run_cmd(['git', 'commit', '--author="%s"' % primary_author] + - merge_message_flags) + run_cmd(["git", "commit", '--author="%s"' % primary_author] + merge_message_flags) - continue_maybe("Merge complete (local ref %s). Push to %s?" % ( - target_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Merge complete (local ref %s). Push to %s?" + % (target_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, - target_ref)) + run_cmd( + "git push %s %s:%s" % (PUSH_REMOTE_NAME, target_branch_name, target_ref) + ) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -199,25 +210,26 @@ def update_pr(pr_num, user_login, base_ref): pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, - pr_branch_name)) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) run_cmd("git checkout %s" % pr_branch_name) - continue_maybe("Update ready (local ref %s)? Push to %s/%s?" % ( - pr_branch_name, user_login, base_ref)) + continue_maybe( + "Update ready (local ref %s)? Push to %s/%s?" 
+ % (pr_branch_name, user_login, base_ref) + ) push_user_remote = "https://github.com/%s/pandas.git" % user_login try: - run_cmd('git push %s %s:%s' % (push_user_remote, pr_branch_name, - base_ref)) + run_cmd("git push %s %s:%s" % (push_user_remote, pr_branch_name, base_ref)) except Exception as e: if continue_maybe2("Force push?"): try: run_cmd( - 'git push -f %s %s:%s' % (push_user_remote, pr_branch_name, - base_ref)) + "git push -f %s %s:%s" + % (push_user_remote, pr_branch_name, base_ref) + ) except Exception as e: fail("Exception while pushing: %s" % e) clean_up() @@ -234,20 +246,19 @@ def cherry_pick(pr_num, merge_hash, default_branch): if pick_ref == "": pick_ref = default_branch - pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, - pick_ref.upper()) + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, - pick_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) run_cmd("git checkout %s" % pick_branch_name) run_cmd("git cherry-pick -sx %s" % merge_hash) - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Pick complete (local ref %s). Push to %s?" + % (pick_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, - pick_ref)) + run_cmd("git push %s %s:%s" % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -282,35 +293,44 @@ def fix_version_from_branch(branch, versions): pr_repo_desc = "%s/%s" % (user_login, base_ref) if pr["merged"] is True: - print("Pull request {0} has already been merged, please backport manually" - .format(pr_num)) + print( + "Pull request {0} has already been merged, please backport manually".format( + pr_num + ) + ) sys.exit(0) if not bool(pr["mergeable"]): - msg = ("Pull request {0} is not mergeable in its current form.\n" - "Continue? (experts only!)".format(pr_num)) + msg = ( + "Pull request {0} is not mergeable in its current form.\n" + "Continue? (experts only!)".format(pr_num) + ) continue_maybe(msg) print("\n=== Pull Request #%s ===" % pr_num) # we may have un-printable unicode in our title try: - title = title.encode('raw_unicode_escape') + title = title.encode("raw_unicode_escape") except Exception: pass -print("title\t{title}\nsource\t{source}\ntarget\t{target}\nurl\t{url}".format( - title=title, source=pr_repo_desc, target=target_ref, url=url)) +print( + "title\t{title}\nsource\t{source}\ntarget\t{target}\nurl\t{url}".format( + title=title, source=pr_repo_desc, target=target_ref, url=url + ) +) merged_refs = [target_ref] print("\nProceed with updating or merging pull request #%s?" % pr_num) -update = input("Update PR and push to remote (r), merge locally (l), " - "or do nothing (n) ?") +update = input( + "Update PR and push to remote (r), merge locally (l), " "or do nothing (n) ?" +) update = update.lower() -if update == 'r': +if update == "r": merge_hash = update_pr(pr_num, user_login, base_ref) -elif update == 'l': +elif update == "l": merge_hash = merge_pr(pr_num, target_ref) diff --git a/scripts/tests/conftest.py b/scripts/tests/conftest.py index f8318b8d402af0..496a5195bfc841 100644 --- a/scripts/tests/conftest.py +++ b/scripts/tests/conftest.py @@ -1,3 +1,6 @@ def pytest_addoption(parser): - parser.addoption("--strict-data-files", action="store_true", - help="Unused. 
For compat with setup.cfg.") + parser.addoption( + "--strict-data-files", + action="store_true", + help="Unused. For compat with setup.cfg.", + ) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 34395435bd8c55..f3364e6725a200 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -7,6 +7,7 @@ import pandas as pd import validate_docstrings + validate_one = validate_docstrings.validate_one @@ -18,7 +19,7 @@ class GoodDocStrings: script without any errors. """ - def plot(self, kind, color='blue', **kwargs): + def plot(self, kind, color="blue", **kwargs): """ Generate a plot. @@ -244,8 +245,10 @@ def empty_returns(self): Since this function never returns a value, this docstring doesn't need a return section. """ + def say_hello(): return "Hello World!" + say_hello() if True: return @@ -446,7 +449,6 @@ def method_wo_docstrings(self): class BadSummaries: - def wrong_line(self): """Exists on the wrong line""" pass @@ -612,7 +614,6 @@ def list_incorrect_parameter_type(self, kind): class BadReturns: - def return_not_documented(self): """ Lacks section for Returns @@ -695,7 +696,6 @@ def no_period_multi(self): class BadSeeAlso: - def desc_no_period(self): """ Return the first 5 elements of the Series. @@ -733,7 +733,6 @@ def prefix_pandas(self): class BadExamples: - def unused_import(self): """ Examples @@ -771,7 +770,6 @@ def missing_whitespace_after_comma(self): class TestValidator: - def _import_path(self, klass=None, func=None): """ Build the required import path for tests in this module. @@ -799,162 +797,314 @@ def _import_path(self, klass=None, func=None): return base_path def test_good_class(self, capsys): - errors = validate_one(self._import_path( - klass='GoodDocStrings'))['errors'] + errors = validate_one(self._import_path(klass="GoodDocStrings"))["errors"] assert isinstance(errors, list) assert not errors - @pytest.mark.parametrize("func", [ - 'plot', 'sample', 'random_letters', 'sample_values', 'head', 'head1', - 'contains', 'mode', 'good_imports', 'no_returns', 'empty_returns']) + @pytest.mark.parametrize( + "func", + [ + "plot", + "sample", + "random_letters", + "sample_values", + "head", + "head1", + "contains", + "mode", + "good_imports", + "no_returns", + "empty_returns", + ], + ) def test_good_functions(self, capsys, func): - errors = validate_one(self._import_path( - klass='GoodDocStrings', func=func))['errors'] + errors = validate_one(self._import_path(klass="GoodDocStrings", func=func))[ + "errors" + ] assert isinstance(errors, list) assert not errors def test_bad_class(self, capsys): - errors = validate_one(self._import_path( - klass='BadGenericDocStrings'))['errors'] + errors = validate_one(self._import_path(klass="BadGenericDocStrings"))["errors"] assert isinstance(errors, list) assert errors - @pytest.mark.parametrize("func", [ - 'func', 'astype', 'astype1', 'astype2', 'astype3', 'plot', 'method', - 'private_classes', - ]) + @pytest.mark.parametrize( + "func", + [ + "func", + "astype", + "astype1", + "astype2", + "astype3", + "plot", + "method", + "private_classes", + ], + ) def test_bad_generic_functions(self, capsys, func): - errors = validate_one(self._import_path( # noqa:F821 - klass='BadGenericDocStrings', func=func))['errors'] + errors = validate_one( + self._import_path(klass="BadGenericDocStrings", func=func) # noqa:F821 + )["errors"] assert isinstance(errors, list) assert errors - @pytest.mark.parametrize("klass,func,msgs", [ - # See Also tests - 
('BadGenericDocStrings', 'private_classes', - ("Private classes (NDFrame) should not be mentioned in public " - 'docstrings',)), - ('BadGenericDocStrings', 'unknown_section', - ('Found unknown section "Unknown Section".',)), - ('BadGenericDocStrings', 'sections_in_wrong_order', - ('Sections are in the wrong order. Correct order is: Parameters, ' - 'See Also, Examples',)), - ('BadGenericDocStrings', 'deprecation_in_wrong_order', - ('Deprecation warning should precede extended summary',)), - ('BadSeeAlso', 'desc_no_period', - ('Missing period at end of description for See Also "Series.iloc"',)), - ('BadSeeAlso', 'desc_first_letter_lowercase', - ('should be capitalized for See Also "Series.tail"',)), - # Summary tests - ('BadSummaries', 'wrong_line', - ('should start in the line immediately after the opening quotes',)), - ('BadSummaries', 'no_punctuation', - ('Summary does not end with a period',)), - ('BadSummaries', 'no_capitalization', - ('Summary does not start with a capital letter',)), - ('BadSummaries', 'no_capitalization', - ('Summary must start with infinitive verb',)), - ('BadSummaries', 'multi_line', - ('Summary should fit in a single line',)), - ('BadSummaries', 'two_paragraph_multi_line', - ('Summary should fit in a single line',)), - # Parameters tests - ('BadParameters', 'missing_params', - ('Parameters {**kwargs} not documented',)), - ('BadParameters', 'bad_colon_spacing', - ('Parameter "kind" requires a space before the colon ' - 'separating the parameter name and type',)), - ('BadParameters', 'no_description_period', - ('Parameter "kind" description should finish with "."',)), - ('BadParameters', 'no_description_period_with_directive', - ('Parameter "kind" description should finish with "."',)), - ('BadParameters', 'parameter_capitalization', - ('Parameter "kind" description should start with a capital letter',)), - ('BadParameters', 'integer_parameter', - ('Parameter "kind" type should use "int" instead of "integer"',)), - ('BadParameters', 'string_parameter', - ('Parameter "kind" type should use "str" instead of "string"',)), - ('BadParameters', 'boolean_parameter', - ('Parameter "kind" type should use "bool" instead of "boolean"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "bool" instead of "boolean"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "int" instead of "integer"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "str" instead of "string"',)), - pytest.param('BadParameters', 'blank_lines', ('No error yet?',), - marks=pytest.mark.xfail), - # Returns tests - ('BadReturns', 'return_not_documented', ('No Returns section found',)), - ('BadReturns', 'yield_not_documented', ('No Yields section found',)), - pytest.param('BadReturns', 'no_type', ('foo',), - marks=pytest.mark.xfail), - ('BadReturns', 'no_description', - ('Return value has no description',)), - ('BadReturns', 'no_punctuation', - ('Return value description should finish with "."',)), - ('BadReturns', 'named_single_return', - ('The first line of the Returns section should contain only the ' - 'type, unless multiple values are being returned',)), - ('BadReturns', 'no_capitalization', - ('Return value description should start with a capital ' - 'letter',)), - ('BadReturns', 'no_period_multi', - ('Return value description should finish with "."',)), - # Examples tests - ('BadGenericDocStrings', 'method', - ('Do not import numpy, as it is imported automatically',)), - 
('BadGenericDocStrings', 'method', - ('Do not import pandas, as it is imported automatically',)), - ('BadGenericDocStrings', 'method_wo_docstrings', - ("The object does not have a docstring",)), - # See Also tests - ('BadSeeAlso', 'prefix_pandas', - ('pandas.Series.rename in `See Also` section ' - 'does not need `pandas` prefix',)), - # Examples tests - ('BadExamples', 'unused_import', - ("flake8 error: F401 'pandas as pdf' imported but unused",)), - ('BadExamples', 'indentation_is_not_a_multiple_of_four', - ('flake8 error: E111 indentation is not a multiple of four',)), - ('BadExamples', 'missing_whitespace_around_arithmetic_operator', - ('flake8 error: ' - 'E226 missing whitespace around arithmetic operator',)), - ('BadExamples', 'missing_whitespace_after_comma', - ("flake8 error: E231 missing whitespace after ',' (3 times)",)), - ('BadGenericDocStrings', 'two_linebreaks_between_sections', - ('Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings',)), - ('BadGenericDocStrings', 'linebreak_at_end_of_docstring', - ('Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings',)), - ]) + @pytest.mark.parametrize( + "klass,func,msgs", + [ + # See Also tests + ( + "BadGenericDocStrings", + "private_classes", + ( + "Private classes (NDFrame) should not be mentioned in public " + "docstrings", + ), + ), + ( + "BadGenericDocStrings", + "unknown_section", + ('Found unknown section "Unknown Section".',), + ), + ( + "BadGenericDocStrings", + "sections_in_wrong_order", + ( + "Sections are in the wrong order. Correct order is: Parameters, " + "See Also, Examples", + ), + ), + ( + "BadGenericDocStrings", + "deprecation_in_wrong_order", + ("Deprecation warning should precede extended summary",), + ), + ( + "BadSeeAlso", + "desc_no_period", + ('Missing period at end of description for See Also "Series.iloc"',), + ), + ( + "BadSeeAlso", + "desc_first_letter_lowercase", + ('should be capitalized for See Also "Series.tail"',), + ), + # Summary tests + ( + "BadSummaries", + "wrong_line", + ("should start in the line immediately after the opening quotes",), + ), + ("BadSummaries", "no_punctuation", ("Summary does not end with a period",)), + ( + "BadSummaries", + "no_capitalization", + ("Summary does not start with a capital letter",), + ), + ( + "BadSummaries", + "no_capitalization", + ("Summary must start with infinitive verb",), + ), + ("BadSummaries", "multi_line", ("Summary should fit in a single line",)), + ( + "BadSummaries", + "two_paragraph_multi_line", + ("Summary should fit in a single line",), + ), + # Parameters tests + ( + "BadParameters", + "missing_params", + ("Parameters {**kwargs} not documented",), + ), + ( + "BadParameters", + "bad_colon_spacing", + ( + 'Parameter "kind" requires a space before the colon ' + "separating the parameter name and type", + ), + ), + ( + "BadParameters", + "no_description_period", + ('Parameter "kind" description should finish with "."',), + ), + ( + "BadParameters", + "no_description_period_with_directive", + ('Parameter "kind" description should finish with "."',), + ), + ( + "BadParameters", + "parameter_capitalization", + ('Parameter "kind" description should start with a capital letter',), + ), + ( + "BadParameters", + "integer_parameter", + ('Parameter "kind" type should use "int" instead of "integer"',), + ), + ( + "BadParameters", + "string_parameter", + 
('Parameter "kind" type should use "str" instead of "string"',), + ), + ( + "BadParameters", + "boolean_parameter", + ('Parameter "kind" type should use "bool" instead of "boolean"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "bool" instead of "boolean"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "int" instead of "integer"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "str" instead of "string"',), + ), + pytest.param( + "BadParameters", + "blank_lines", + ("No error yet?",), + marks=pytest.mark.xfail, + ), + # Returns tests + ("BadReturns", "return_not_documented", ("No Returns section found",)), + ("BadReturns", "yield_not_documented", ("No Yields section found",)), + pytest.param("BadReturns", "no_type", ("foo",), marks=pytest.mark.xfail), + ("BadReturns", "no_description", ("Return value has no description",)), + ( + "BadReturns", + "no_punctuation", + ('Return value description should finish with "."',), + ), + ( + "BadReturns", + "named_single_return", + ( + "The first line of the Returns section should contain only the " + "type, unless multiple values are being returned", + ), + ), + ( + "BadReturns", + "no_capitalization", + ("Return value description should start with a capital " "letter",), + ), + ( + "BadReturns", + "no_period_multi", + ('Return value description should finish with "."',), + ), + # Examples tests + ( + "BadGenericDocStrings", + "method", + ("Do not import numpy, as it is imported automatically",), + ), + ( + "BadGenericDocStrings", + "method", + ("Do not import pandas, as it is imported automatically",), + ), + ( + "BadGenericDocStrings", + "method_wo_docstrings", + ("The object does not have a docstring",), + ), + # See Also tests + ( + "BadSeeAlso", + "prefix_pandas", + ( + "pandas.Series.rename in `See Also` section " + "does not need `pandas` prefix", + ), + ), + # Examples tests + ( + "BadExamples", + "unused_import", + ("flake8 error: F401 'pandas as pdf' imported but unused",), + ), + ( + "BadExamples", + "indentation_is_not_a_multiple_of_four", + ("flake8 error: E111 indentation is not a multiple of four",), + ), + ( + "BadExamples", + "missing_whitespace_around_arithmetic_operator", + ( + "flake8 error: " + "E226 missing whitespace around arithmetic operator", + ), + ), + ( + "BadExamples", + "missing_whitespace_after_comma", + ("flake8 error: E231 missing whitespace after ',' (3 times)",), + ), + ( + "BadGenericDocStrings", + "two_linebreaks_between_sections", + ( + "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + ), + ), + ( + "BadGenericDocStrings", + "linebreak_at_end_of_docstring", + ( + "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + ), + ), + ], + ) def test_bad_docstrings(self, capsys, klass, func, msgs): result = validate_one(self._import_path(klass=klass, func=func)) for msg in msgs: - assert msg in ' '.join(err[1] for err in result['errors']) + assert msg in " ".join(err[1] for err in result["errors"]) def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_one', lambda func_name: { - 'docstring': 'docstring1', - 'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err 
desc')], - 'warnings': [], - 'examples_errors': '', - 'deprecated': True}) - result = validate_docstrings.validate_all(prefix=None, - ignore_deprecated=True) + validate_docstrings, + "validate_one", + lambda func_name: { + "docstring": "docstring1", + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "warnings": [], + "examples_errors": "", + "deprecated": True, + }, + ) + result = validate_docstrings.validate_all(prefix=None, ignore_deprecated=True) assert len(result) == 0 class TestApiItems: @property def api_doc(self): - return io.StringIO(textwrap.dedent(''' + return io.StringIO( + textwrap.dedent( + """ .. currentmodule:: itertools Itertools @@ -987,73 +1137,88 @@ def api_doc(self): seed randint - ''')) - - @pytest.mark.parametrize('idx,name', [(0, 'itertools.cycle'), - (1, 'itertools.count'), - (2, 'itertools.chain'), - (3, 'random.seed'), - (4, 'random.randint')]) + """ + ) + ) + + @pytest.mark.parametrize( + "idx,name", + [ + (0, "itertools.cycle"), + (1, "itertools.count"), + (2, "itertools.chain"), + (3, "random.seed"), + (4, "random.randint"), + ], + ) def test_item_name(self, idx, name): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][0] == name - @pytest.mark.parametrize('idx,func', [(0, 'cycle'), - (1, 'count'), - (2, 'chain'), - (3, 'seed'), - (4, 'randint')]) + @pytest.mark.parametrize( + "idx,func", + [(0, "cycle"), (1, "count"), (2, "chain"), (3, "seed"), (4, "randint")], + ) def test_item_function(self, idx, func): result = list(validate_docstrings.get_api_items(self.api_doc)) assert callable(result[idx][1]) assert result[idx][1].__name__ == func - @pytest.mark.parametrize('idx,section', [(0, 'Itertools'), - (1, 'Itertools'), - (2, 'Itertools'), - (3, 'Random'), - (4, 'Random')]) + @pytest.mark.parametrize( + "idx,section", + [ + (0, "Itertools"), + (1, "Itertools"), + (2, "Itertools"), + (3, "Random"), + (4, "Random"), + ], + ) def test_item_section(self, idx, section): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][2] == section - @pytest.mark.parametrize('idx,subsection', [(0, 'Infinite'), - (1, 'Infinite'), - (2, 'Finite'), - (3, 'All'), - (4, 'All')]) + @pytest.mark.parametrize( + "idx,subsection", + [(0, "Infinite"), (1, "Infinite"), (2, "Finite"), (3, "All"), (4, "All")], + ) def test_item_subsection(self, idx, subsection): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][3] == subsection class TestDocstringClass: - @pytest.mark.parametrize('name, expected_obj', - [('pandas.isnull', pd.isnull), - ('pandas.DataFrame', pd.DataFrame), - ('pandas.Series.sum', pd.Series.sum)]) + @pytest.mark.parametrize( + "name, expected_obj", + [ + ("pandas.isnull", pd.isnull), + ("pandas.DataFrame", pd.DataFrame), + ("pandas.Series.sum", pd.Series.sum), + ], + ) def test_resolves_class_name(self, name, expected_obj): d = validate_docstrings.Docstring(name) assert d.obj is expected_obj - @pytest.mark.parametrize('invalid_name', ['panda', 'panda.DataFrame']) + @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"]) def test_raises_for_invalid_module_name(self, invalid_name): msg = 'No module can be imported from "{}"'.format(invalid_name) with pytest.raises(ImportError, match=msg): validate_docstrings.Docstring(invalid_name) - @pytest.mark.parametrize('invalid_name', - ['pandas.BadClassName', - 'pandas.Series.bad_method_name']) + @pytest.mark.parametrize( + "invalid_name", ["pandas.BadClassName", "pandas.Series.bad_method_name"] 
+ ) def test_raises_for_invalid_attribute_name(self, invalid_name): - name_components = invalid_name.split('.') + name_components = invalid_name.split(".") obj_name, invalid_attr_name = name_components[-2], name_components[-1] msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name) with pytest.raises(AttributeError, match=msg): validate_docstrings.Docstring(invalid_name) - @pytest.mark.parametrize('name', ['pandas.Series.str.isdecimal', - 'pandas.Series.str.islower']) + @pytest.mark.parametrize( + "name", ["pandas.Series.str.isdecimal", "pandas.Series.str.islower"] + ) def test_encode_content_write_to_file(self, name): # GH25466 docstr = validate_docstrings.Docstring(name).validate_pep8() @@ -1064,97 +1229,141 @@ def test_encode_content_write_to_file(self, name): class TestMainFunction: def test_exit_status_for_validate_one(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_one', lambda func_name: { - 'docstring': 'docstring1', - 'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'warnings': [], - 'examples_errors': ''}) - exit_status = validate_docstrings.main(func_name='docstring1', - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + validate_docstrings, + "validate_one", + lambda func_name: { + "docstring": "docstring1", + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "warnings": [], + "examples_errors": "", + }, + ) + exit_status = validate_docstrings.main( + func_name="docstring1", + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 0 def test_exit_status_errors_for_validate_all(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'file': 'module1.py', - 'file_line': 23}, - 'docstring2': {'errors': [('ER04', 'err desc'), - ('ER05', 'err desc')], - 'file': 'module2.py', - 'file_line': 925}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + "docstring1": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "file": "module1.py", + "file_line": 23, + }, + "docstring2": { + "errors": [("ER04", "err desc"), ("ER05", "err desc")], + "file": "module2.py", + "file_line": 925, + }, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 5 def test_no_exit_status_noerrors_for_validate_all(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [], - 'warnings': [('WN01', 'warn desc')]}, - 'docstring2': {'errors': []}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + "docstring1": {"errors": [], "warnings": [("WN01", "warn desc")]}, + "docstring2": {"errors": []}, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 0 def test_exit_status_for_validate_all_json(self, monkeypatch): - 
print('EXECUTED') + print("EXECUTED") monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')]}, - 'docstring2': {'errors': [('ER04', 'err desc'), - ('ER05', 'err desc')]}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='json', - ignore_deprecated=False) + "docstring1": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ] + }, + "docstring2": {"errors": [("ER04", "err desc"), ("ER05", "err desc")]}, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="json", + ignore_deprecated=False, + ) assert exit_status == 0 def test_errors_param_filters_errors(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'Series.foo': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'file': 'series.py', - 'file_line': 142}, - 'DataFrame.bar': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc')], - 'file': 'frame.py', - 'file_line': 598}, - 'Series.foobar': {'errors': [('ER01', 'err desc')], - 'file': 'series.py', - 'file_line': 279}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=['ER01'], - output_format='default', - ignore_deprecated=False) + "Series.foo": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "file": "series.py", + "file_line": 142, + }, + "DataFrame.bar": { + "errors": [("ER01", "err desc"), ("ER02", "err desc")], + "file": "frame.py", + "file_line": 598, + }, + "Series.foobar": { + "errors": [("ER01", "err desc")], + "file": "series.py", + "file_line": 279, + }, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=["ER01"], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 3 - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=['ER03'], - output_format='default', - ignore_deprecated=False) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=["ER03"], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 1 diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index dddd5eb1f1eab7..37623d32db6859 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -40,9 +40,10 @@ # to avoid that plot windows are open from the doctests while running the # script. Setting here before matplotlib is loaded. 
# We don't warn for the number of open plots, as none is actually being opened -os.environ['MPLBACKEND'] = 'Template' +os.environ["MPLBACKEND"] = "Template" import matplotlib -matplotlib.rc('figure', max_open_warning=10000) + +matplotlib.rc("figure", max_open_warning=10000) import numpy @@ -51,80 +52,90 @@ sys.path.insert(0, os.path.join(BASE_PATH)) import pandas -sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext')) +sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) from numpydoc.docscrape import NumpyDocString from pandas.io.formats.printing import pprint_thing -PRIVATE_CLASSES = ['NDFrame', 'IndexOpsMixin'] -DIRECTIVES = ['versionadded', 'versionchanged', 'deprecated'] -ALLOWED_SECTIONS = ['Parameters', 'Attributes', 'Methods', 'Returns', 'Yields', - 'Other Parameters', 'Raises', 'Warns', 'See Also', 'Notes', - 'References', 'Examples'] +PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] +DIRECTIVES = ["versionadded", "versionchanged", "deprecated"] +ALLOWED_SECTIONS = [ + "Parameters", + "Attributes", + "Methods", + "Returns", + "Yields", + "Other Parameters", + "Raises", + "Warns", + "See Also", + "Notes", + "References", + "Examples", +] ERROR_MSGS = { - 'GL01': 'Docstring text (summary) should start in the line immediately ' - 'after the opening quotes (not in the same line, or leaving a ' - 'blank line in between)', - 'GL02': 'Closing quotes should be placed in the line after the last text ' - 'in the docstring (do not close the quotes in the same line as ' - 'the text, or leave a blank line between the last text and the ' - 'quotes)', - 'GL03': 'Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings', - 'GL04': 'Private classes ({mentioned_private_classes}) should not be ' - 'mentioned in public docstrings', - 'GL05': 'Tabs found at the start of line "{line_with_tabs}", please use ' - 'whitespace only', - 'GL06': 'Found unknown section "{section}". Allowed sections are: ' - '{allowed_sections}', - 'GL07': 'Sections are in the wrong order. Correct order is: ' - '{correct_sections}', - 'GL08': 'The object does not have a docstring', - 'GL09': 'Deprecation warning should precede extended summary', - 'SS01': 'No summary found (a short summary in a single line should be ' - 'present at the beginning of the docstring)', - 'SS02': 'Summary does not start with a capital letter', - 'SS03': 'Summary does not end with a period', - 'SS04': 'Summary contains heading whitespaces', - 'SS05': 'Summary must start with infinitive verb, not third person ' - '(e.g. use "Generate" instead of "Generates")', - 'SS06': 'Summary should fit in a single line', - 'ES01': 'No extended summary found', - 'PR01': 'Parameters {missing_params} not documented', - 'PR02': 'Unknown parameters {unknown_params}', - 'PR03': 'Wrong parameters order. Actual: {actual_params}. 
' - 'Documented: {documented_params}', - 'PR04': 'Parameter "{param_name}" has no type', - 'PR05': 'Parameter "{param_name}" type should not finish with "."', - 'PR06': 'Parameter "{param_name}" type should use "{right_type}" instead ' - 'of "{wrong_type}"', - 'PR07': 'Parameter "{param_name}" has no description', - 'PR08': 'Parameter "{param_name}" description should start with a ' - 'capital letter', - 'PR09': 'Parameter "{param_name}" description should finish with "."', - 'PR10': 'Parameter "{param_name}" requires a space before the colon ' - 'separating the parameter name and type', - 'RT01': 'No Returns section found', - 'RT02': 'The first line of the Returns section should contain only the ' - 'type, unless multiple values are being returned', - 'RT03': 'Return value has no description', - 'RT04': 'Return value description should start with a capital letter', - 'RT05': 'Return value description should finish with "."', - 'YD01': 'No Yields section found', - 'SA01': 'See Also section not found', - 'SA02': 'Missing period at end of description for See Also ' - '"{reference_name}" reference', - 'SA03': 'Description should be capitalized for See Also ' - '"{reference_name}" reference', - 'SA04': 'Missing description for See Also "{reference_name}" reference', - 'SA05': '{reference_name} in `See Also` section does not need `pandas` ' - 'prefix, use {right_reference} instead.', - 'EX01': 'No examples section found', - 'EX02': 'Examples do not pass tests:\n{doctest_log}', - 'EX03': 'flake8 error: {error_code} {error_message}{times_happening}', - 'EX04': 'Do not import {imported_library}, as it is imported ' - 'automatically for the examples (numpy as np, pandas as pd)', + "GL01": "Docstring text (summary) should start in the line immediately " + "after the opening quotes (not in the same line, or leaving a " + "blank line in between)", + "GL02": "Closing quotes should be placed in the line after the last text " + "in the docstring (do not close the quotes in the same line as " + "the text, or leave a blank line between the last text and the " + "quotes)", + "GL03": "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + "GL04": "Private classes ({mentioned_private_classes}) should not be " + "mentioned in public docstrings", + "GL05": 'Tabs found at the start of line "{line_with_tabs}", please use ' + "whitespace only", + "GL06": 'Found unknown section "{section}". Allowed sections are: ' + "{allowed_sections}", + "GL07": "Sections are in the wrong order. Correct order is: " "{correct_sections}", + "GL08": "The object does not have a docstring", + "GL09": "Deprecation warning should precede extended summary", + "SS01": "No summary found (a short summary in a single line should be " + "present at the beginning of the docstring)", + "SS02": "Summary does not start with a capital letter", + "SS03": "Summary does not end with a period", + "SS04": "Summary contains heading whitespaces", + "SS05": "Summary must start with infinitive verb, not third person " + '(e.g. use "Generate" instead of "Generates")', + "SS06": "Summary should fit in a single line", + "ES01": "No extended summary found", + "PR01": "Parameters {missing_params} not documented", + "PR02": "Unknown parameters {unknown_params}", + "PR03": "Wrong parameters order. Actual: {actual_params}. 
" + "Documented: {documented_params}", + "PR04": 'Parameter "{param_name}" has no type', + "PR05": 'Parameter "{param_name}" type should not finish with "."', + "PR06": 'Parameter "{param_name}" type should use "{right_type}" instead ' + 'of "{wrong_type}"', + "PR07": 'Parameter "{param_name}" has no description', + "PR08": 'Parameter "{param_name}" description should start with a ' + "capital letter", + "PR09": 'Parameter "{param_name}" description should finish with "."', + "PR10": 'Parameter "{param_name}" requires a space before the colon ' + "separating the parameter name and type", + "RT01": "No Returns section found", + "RT02": "The first line of the Returns section should contain only the " + "type, unless multiple values are being returned", + "RT03": "Return value has no description", + "RT04": "Return value description should start with a capital letter", + "RT05": 'Return value description should finish with "."', + "YD01": "No Yields section found", + "SA01": "See Also section not found", + "SA02": "Missing period at end of description for See Also " + '"{reference_name}" reference', + "SA03": "Description should be capitalized for See Also " + '"{reference_name}" reference', + "SA04": 'Missing description for See Also "{reference_name}" reference', + "SA05": "{reference_name} in `See Also` section does not need `pandas` " + "prefix, use {right_reference} instead.", + "EX01": "No examples section found", + "EX02": "Examples do not pass tests:\n{doctest_log}", + "EX03": "flake8 error: {error_code} {error_message}{times_happening}", + "EX04": "Do not import {imported_library}, as it is imported " + "automatically for the examples (numpy as np, pandas as pd)", } @@ -182,43 +193,47 @@ def get_api_items(api_doc_fd): The name of the subsection in the API page where the object item is located. """ - current_module = 'pandas' - previous_line = current_section = current_subsection = '' + current_module = "pandas" + previous_line = current_section = current_subsection = "" position = None for line in api_doc_fd: line = line.strip() if len(line) == len(previous_line): - if set(line) == set('-'): + if set(line) == set("-"): current_section = previous_line continue - if set(line) == set('~'): + if set(line) == set("~"): current_subsection = previous_line continue - if line.startswith('.. currentmodule::'): - current_module = line.replace('.. currentmodule::', '').strip() + if line.startswith(".. currentmodule::"): + current_module = line.replace(".. currentmodule::", "").strip() continue - if line == '.. autosummary::': - position = 'autosummary' + if line == ".. 
autosummary::": + position = "autosummary" continue - if position == 'autosummary': - if line == '': - position = 'items' + if position == "autosummary": + if line == "": + position = "items" continue - if position == 'items': - if line == '': + if position == "items": + if line == "": position = None continue item = line.strip() func = importlib.import_module(current_module) - for part in item.split('.'): + for part in item.split("."): func = getattr(func, part) - yield ('.'.join([current_module, item]), func, - current_section, current_subsection) + yield ( + ".".join([current_module, item]), + func, + current_section, + current_subsection, + ) previous_line = line @@ -229,7 +244,7 @@ def __init__(self, name): obj = self._load_obj(name) self.obj = obj self.code_obj = self._to_original_callable(obj) - self.raw_doc = obj.__doc__ or '' + self.raw_doc = obj.__doc__ or "" self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) @@ -256,9 +271,9 @@ def _load_obj(name): >>> Docstring._load_obj('pandas.Series') """ - for maxsplit in range(1, name.count('.') + 1): + for maxsplit in range(1, name.count(".") + 1): # TODO when py3 only replace by: module, *func_parts = ... - func_name_split = name.rsplit('.', maxsplit) + func_name_split = name.rsplit(".", maxsplit) module = func_name_split[0] func_parts = func_name_split[1:] try: @@ -268,9 +283,8 @@ def _load_obj(name): else: continue - if 'obj' not in locals(): - raise ImportError('No module can be imported ' - 'from "{}"'.format(name)) + if "obj" not in locals(): + raise ImportError("No module can be imported " 'from "{}"'.format(name)) for part in func_parts: obj = getattr(obj, part) @@ -288,7 +302,7 @@ def _to_original_callable(obj): while True: if inspect.isfunction(obj) or inspect.isclass(obj): f = inspect.getfile(obj) - if f.startswith('<') and f.endswith('>'): + if f.startswith("<") and f.endswith(">"): return None return obj if inspect.ismethod(obj): @@ -307,8 +321,7 @@ def type(self): @property def is_function_or_method(self): # TODO(py27): remove ismethod - return (inspect.isfunction(self.obj) - or inspect.ismethod(self.obj)) + return inspect.isfunction(self.obj) or inspect.ismethod(self.obj) @property def source_file_name(self): @@ -342,16 +355,15 @@ def source_file_def_line(self): @property def github_url(self): - url = 'https://github.com/pandas-dev/pandas/blob/master/' - url += '{}#L{}'.format(self.source_file_name, - self.source_file_def_line) + url = "https://github.com/pandas-dev/pandas/blob/master/" + url += "{}#L{}".format(self.source_file_name, self.source_file_def_line) return url @property def start_blank_lines(self): i = None if self.raw_doc: - for i, row in enumerate(self.raw_doc.split('\n')): + for i, row in enumerate(self.raw_doc.split("\n")): if row.strip(): break return i @@ -360,7 +372,7 @@ def start_blank_lines(self): def end_blank_lines(self): i = None if self.raw_doc: - for i, row in enumerate(reversed(self.raw_doc.split('\n'))): + for i, row in enumerate(reversed(self.raw_doc.split("\n"))): if row.strip(): break return i @@ -368,7 +380,7 @@ def end_blank_lines(self): @property def double_blank_lines(self): prev = True - for row in self.raw_doc.split('\n'): + for row in self.raw_doc.split("\n"): if not prev and not row.strip(): return True prev = row.strip() @@ -380,25 +392,27 @@ def section_titles(self): self.doc._doc.reset() while not self.doc._doc.eof(): content = self.doc._read_to_next_section() - if (len(content) > 1 - and len(content[0]) == len(content[1]) - and set(content[1]) == {'-'}): + 
if ( + len(content) > 1 + and len(content[0]) == len(content[1]) + and set(content[1]) == {"-"} + ): sections.append(content[0]) return sections @property def summary(self): - return ' '.join(self.doc['Summary']) + return " ".join(self.doc["Summary"]) @property def num_summary_lines(self): - return len(self.doc['Summary']) + return len(self.doc["Summary"]) @property def extended_summary(self): - if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: - return ' '.join(self.doc['Summary']) - return ' '.join(self.doc['Extended Summary']) + if not self.doc["Extended Summary"] and len(self.doc["Summary"]) > 1: + return " ".join(self.doc["Summary"]) + return " ".join(self.doc["Extended Summary"]) @property def needs_summary(self): @@ -406,16 +420,17 @@ def needs_summary(self): @property def doc_parameters(self): - return collections.OrderedDict((name, (type_, ''.join(desc))) - for name, type_, desc - in self.doc['Parameters']) + return collections.OrderedDict( + (name, (type_, "".join(desc))) + for name, type_, desc in self.doc["Parameters"] + ) @property def signature_parameters(self): if inspect.isclass(self.obj): - if hasattr(self.obj, '_accessors') and ( - self.name.split('.')[-1] in - self.obj._accessors): + if hasattr(self.obj, "_accessors") and ( + self.name.split(".")[-1] in self.obj._accessors + ): # accessor classes have a signature but don't want to show this return tuple() try: @@ -430,7 +445,7 @@ def signature_parameters(self): if sig.varkw: params.append("**" + sig.varkw) params = tuple(params) - if params and params[0] in ('self', 'cls'): + if params and params[0] in ("self", "cls"): return params[1:] return params @@ -441,15 +456,21 @@ def parameter_mismatches(self): doc_params = tuple(self.doc_parameters) missing = set(signature_params) - set(doc_params) if missing: - errs.append(error('PR01', missing_params=pprint_thing(missing))) + errs.append(error("PR01", missing_params=pprint_thing(missing))) extra = set(doc_params) - set(signature_params) if extra: - errs.append(error('PR02', unknown_params=pprint_thing(extra))) - if (not missing and not extra and signature_params != doc_params - and not (not signature_params and not doc_params)): - errs.append(error('PR03', - actual_params=signature_params, - documented_params=doc_params)) + errs.append(error("PR02", unknown_params=pprint_thing(extra))) + if ( + not missing + and not extra + and signature_params != doc_params + and not (not signature_params and not doc_params) + ): + errs.append( + error( + "PR03", actual_params=signature_params, documented_params=doc_params + ) + ) return errs @@ -464,44 +485,44 @@ def parameter_desc(self, param): desc = self.doc_parameters[param][1] # Find and strip out any sphinx directives for directive in DIRECTIVES: - full_directive = '.. {}'.format(directive) + full_directive = ".. 
{}".format(directive) if full_directive in desc: # Only retain any description before the directive - desc = desc[:desc.index(full_directive)] + desc = desc[: desc.index(full_directive)] return desc @property def see_also(self): result = collections.OrderedDict() - for funcs, desc in self.doc['See Also']: + for funcs, desc in self.doc["See Also"]: for func, _ in funcs: - result[func] = ''.join(desc) + result[func] = "".join(desc) return result @property def examples(self): - return self.doc['Examples'] + return self.doc["Examples"] @property def returns(self): - return self.doc['Returns'] + return self.doc["Returns"] @property def yields(self): - return self.doc['Yields'] + return self.doc["Yields"] @property def method_source(self): try: source = inspect.getsource(self.obj) except TypeError: - return '' + return "" return textwrap.dedent(source) @property def method_returns_something(self): - ''' + """ Check if the docstrings method can return something. Bare returns, returns valued None and returns from nested functions are @@ -511,7 +532,7 @@ def method_returns_something(self): ------- bool Whether the docstrings method can return something. - ''' + """ def get_returns_not_on_nested_functions(node): returns = [node] if isinstance(node, ast.Return) else [] @@ -537,11 +558,11 @@ def get_returns_not_on_nested_functions(node): @property def first_line_ends_in_dot(self): if self.doc: - return self.doc.split('\n')[0][-1] == '.' + return self.doc.split("\n")[0][-1] == "." @property def deprecated(self): - return '.. deprecated:: ' in (self.summary + self.extended_summary) + return ".. deprecated:: " in (self.summary + self.extended_summary) @property def mentioned_private_classes(self): @@ -552,8 +573,8 @@ def examples_errors(self): flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL finder = doctest.DocTestFinder() runner = doctest.DocTestRunner(optionflags=flags) - context = {'np': numpy, 'pd': pandas} - error_msgs = '' + context = {"np": numpy, "pd": pandas} + error_msgs = "" for test in finder.find(self.raw_doc, self.name, globs=context): f = StringIO() runner.run(test, out=f.write) @@ -571,14 +592,18 @@ def validate_pep8(self): # F401 is needed to not generate flake8 errors in examples # that do not user numpy or pandas - content = ''.join(('import numpy as np # noqa: F401\n', - 'import pandas as pd # noqa: F401\n', - *self.examples_source_code)) + content = "".join( + ( + "import numpy as np # noqa: F401\n", + "import pandas as pd # noqa: F401\n", + *self.examples_source_code, + ) + ) application = flake8.main.application.Application() application.initialize(["--quiet"]) - with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as file: + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as file: file.write(content) file.flush() application.run_checks([file.name]) @@ -588,7 +613,7 @@ def validate_pep8(self): application.formatter.write = lambda line, source: None application.report() - yield from application.guide.stats.statistics_for('') + yield from application.guide.stats.statistics_for("") def get_validation_data(doc): @@ -639,57 +664,55 @@ def get_validation_data(doc): errs = [] wrns = [] if not doc.raw_doc: - errs.append(error('GL08')) - return errs, wrns, '' + errs.append(error("GL08")) + return errs, wrns, "" if doc.start_blank_lines != 1: - errs.append(error('GL01')) + errs.append(error("GL01")) if doc.end_blank_lines != 1: - errs.append(error('GL02')) + errs.append(error("GL02")) if doc.double_blank_lines: - errs.append(error('GL03')) + 
errs.append(error("GL03")) mentioned_errs = doc.mentioned_private_classes if mentioned_errs: - errs.append(error('GL04', - mentioned_private_classes=', '.join(mentioned_errs))) + errs.append(error("GL04", mentioned_private_classes=", ".join(mentioned_errs))) for line in doc.raw_doc.splitlines(): if re.match("^ *\t", line): - errs.append(error('GL05', line_with_tabs=line.lstrip())) + errs.append(error("GL05", line_with_tabs=line.lstrip())) - unexpected_sections = [section for section in doc.section_titles - if section not in ALLOWED_SECTIONS] + unexpected_sections = [ + section for section in doc.section_titles if section not in ALLOWED_SECTIONS + ] for section in unexpected_sections: - errs.append(error('GL06', - section=section, - allowed_sections=', '.join(ALLOWED_SECTIONS))) + errs.append( + error("GL06", section=section, allowed_sections=", ".join(ALLOWED_SECTIONS)) + ) - correct_order = [section for section in ALLOWED_SECTIONS - if section in doc.section_titles] + correct_order = [ + section for section in ALLOWED_SECTIONS if section in doc.section_titles + ] if correct_order != doc.section_titles: - errs.append(error('GL07', - correct_sections=', '.join(correct_order))) + errs.append(error("GL07", correct_sections=", ".join(correct_order))) - if (doc.deprecated - and not doc.extended_summary.startswith('.. deprecated:: ')): - errs.append(error('GL09')) + if doc.deprecated and not doc.extended_summary.startswith(".. deprecated:: "): + errs.append(error("GL09")) if not doc.summary: - errs.append(error('SS01')) + errs.append(error("SS01")) else: if not doc.summary[0].isupper(): - errs.append(error('SS02')) - if doc.summary[-1] != '.': - errs.append(error('SS03')) + errs.append(error("SS02")) + if doc.summary[-1] != ".": + errs.append(error("SS03")) if doc.summary != doc.summary.lstrip(): - errs.append(error('SS04')) - elif (doc.is_function_or_method - and doc.summary.split(' ')[0][-1] == 's'): - errs.append(error('SS05')) + errs.append(error("SS04")) + elif doc.is_function_or_method and doc.summary.split(" ")[0][-1] == "s": + errs.append(error("SS05")) if doc.num_summary_lines > 1: - errs.append(error('SS06')) + errs.append(error("SS06")) if not doc.extended_summary: - wrns.append(('ES01', 'No extended summary found')) + wrns.append(("ES01", "No extended summary found")) # PR01: Parameters not documented # PR02: Unknown parameters @@ -699,84 +722,98 @@ def get_validation_data(doc): for param in doc.doc_parameters: if not param.startswith("*"): # Check can ignore var / kwargs if not doc.parameter_type(param): - if ':' in param: - errs.append(error('PR10', - param_name=param.split(':')[0])) + if ":" in param: + errs.append(error("PR10", param_name=param.split(":")[0])) else: - errs.append(error('PR04', param_name=param)) + errs.append(error("PR04", param_name=param)) else: - if doc.parameter_type(param)[-1] == '.': - errs.append(error('PR05', param_name=param)) - common_type_errors = [('integer', 'int'), - ('boolean', 'bool'), - ('string', 'str')] + if doc.parameter_type(param)[-1] == ".": + errs.append(error("PR05", param_name=param)) + common_type_errors = [ + ("integer", "int"), + ("boolean", "bool"), + ("string", "str"), + ] for wrong_type, right_type in common_type_errors: if wrong_type in doc.parameter_type(param): - errs.append(error('PR06', - param_name=param, - right_type=right_type, - wrong_type=wrong_type)) + errs.append( + error( + "PR06", + param_name=param, + right_type=right_type, + wrong_type=wrong_type, + ) + ) if not doc.parameter_desc(param): - errs.append(error('PR07', 
param_name=param)) + errs.append(error("PR07", param_name=param)) else: if not doc.parameter_desc(param)[0].isupper(): - errs.append(error('PR08', param_name=param)) - if doc.parameter_desc(param)[-1] != '.': - errs.append(error('PR09', param_name=param)) + errs.append(error("PR08", param_name=param)) + if doc.parameter_desc(param)[-1] != ".": + errs.append(error("PR09", param_name=param)) if doc.is_function_or_method: if not doc.returns: if doc.method_returns_something: - errs.append(error('RT01')) + errs.append(error("RT01")) else: if len(doc.returns) == 1 and doc.returns[0].name: - errs.append(error('RT02')) + errs.append(error("RT02")) for name_or_type, type_, desc in doc.returns: if not desc: - errs.append(error('RT03')) + errs.append(error("RT03")) else: - desc = ' '.join(desc) + desc = " ".join(desc) if not desc[0].isupper(): - errs.append(error('RT04')) - if not desc.endswith('.'): - errs.append(error('RT05')) + errs.append(error("RT04")) + if not desc.endswith("."): + errs.append(error("RT05")) - if not doc.yields and 'yield' in doc.method_source: - errs.append(error('YD01')) + if not doc.yields and "yield" in doc.method_source: + errs.append(error("YD01")) if not doc.see_also: - wrns.append(error('SA01')) + wrns.append(error("SA01")) else: for rel_name, rel_desc in doc.see_also.items(): if rel_desc: - if not rel_desc.endswith('.'): - errs.append(error('SA02', reference_name=rel_name)) + if not rel_desc.endswith("."): + errs.append(error("SA02", reference_name=rel_name)) if not rel_desc[0].isupper(): - errs.append(error('SA03', reference_name=rel_name)) + errs.append(error("SA03", reference_name=rel_name)) else: - errs.append(error('SA04', reference_name=rel_name)) - if rel_name.startswith('pandas.'): - errs.append(error('SA05', - reference_name=rel_name, - right_reference=rel_name[len('pandas.'):])) - - examples_errs = '' + errs.append(error("SA04", reference_name=rel_name)) + if rel_name.startswith("pandas."): + errs.append( + error( + "SA05", + reference_name=rel_name, + right_reference=rel_name[len("pandas.") :], + ) + ) + + examples_errs = "" if not doc.examples: - wrns.append(error('EX01')) + wrns.append(error("EX01")) else: examples_errs = doc.examples_errors if examples_errs: - errs.append(error('EX02', doctest_log=examples_errs)) + errs.append(error("EX02", doctest_log=examples_errs)) for err in doc.validate_pep8(): - errs.append(error('EX03', - error_code=err.error_code, - error_message=err.message, - times_happening=' ({} times)'.format(err.count) - if err.count > 1 else '')) - examples_source_code = ''.join(doc.examples_source_code) - for wrong_import in ('numpy', 'pandas'): - if 'import {}'.format(wrong_import) in examples_source_code: - errs.append(error('EX04', imported_library=wrong_import)) + errs.append( + error( + "EX03", + error_code=err.error_code, + error_message=err.message, + times_happening=" ({} times)".format(err.count) + if err.count > 1 + else "", + ) + ) + examples_source_code = "".join(doc.examples_source_code) + for wrong_import in ("numpy", "pandas"): + if "import {}".format(wrong_import) in examples_source_code: + errs.append(error("EX04", imported_library=wrong_import)) return errs, wrns, examples_errs @@ -797,15 +834,17 @@ def validate_one(func_name): """ doc = Docstring(func_name) errs, wrns, examples_errs = get_validation_data(doc) - return {'type': doc.type, - 'docstring': doc.clean_doc, - 'deprecated': doc.deprecated, - 'file': doc.source_file_name, - 'file_line': doc.source_file_def_line, - 'github_link': doc.github_url, - 'errors': errs, - 
'warnings': wrns, - 'examples_errors': examples_errs} + return { + "type": doc.type, + "docstring": doc.clean_doc, + "deprecated": doc.deprecated, + "file": doc.source_file_name, + "file_line": doc.source_file_def_line, + "github_link": doc.github_url, + "errors": errs, + "warnings": wrns, + "examples_errors": examples_errs, + } def validate_all(prefix, ignore_deprecated=False): @@ -831,8 +870,7 @@ def validate_all(prefix, ignore_deprecated=False): seen = {} # functions from the API docs - api_doc_fnames = os.path.join( - BASE_PATH, 'doc', 'source', 'reference', '*.rst') + api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") api_items = [] for api_doc_fname in glob.glob(api_doc_fnames): with open(api_doc_fname) as f: @@ -841,16 +879,20 @@ def validate_all(prefix, ignore_deprecated=False): if prefix and not func_name.startswith(prefix): continue doc_info = validate_one(func_name) - if ignore_deprecated and doc_info['deprecated']: + if ignore_deprecated and doc_info["deprecated"]: continue result[func_name] = doc_info - shared_code_key = doc_info['file'], doc_info['file_line'] - shared_code = seen.get(shared_code_key, '') - result[func_name].update({'in_api': True, - 'section': section, - 'subsection': subsection, - 'shared_code_with': shared_code}) + shared_code_key = doc_info["file"], doc_info["file_line"] + shared_code = seen.get(shared_code_key, "") + result[func_name].update( + { + "in_api": True, + "section": section, + "subsection": subsection, + "shared_code_with": shared_code, + } + ) seen[shared_code_key] = func_name @@ -858,54 +900,55 @@ def validate_all(prefix, ignore_deprecated=False): api_item_names = set(list(zip(*api_items))[0]) for class_ in (pandas.Series, pandas.DataFrame): for member in inspect.getmembers(class_): - func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) - if (not member[0].startswith('_') - and func_name not in api_item_names): + func_name = "pandas.{}.{}".format(class_.__name__, member[0]) + if not member[0].startswith("_") and func_name not in api_item_names: if prefix and not func_name.startswith(prefix): continue doc_info = validate_one(func_name) - if ignore_deprecated and doc_info['deprecated']: + if ignore_deprecated and doc_info["deprecated"]: continue result[func_name] = doc_info - result[func_name]['in_api'] = False + result[func_name]["in_api"] = False return result def main(func_name, prefix, errors, output_format, ignore_deprecated): - def header(title, width=80, char='#'): + def header(title, width=80, char="#"): full_line = char * width side_len = (width - len(title) - 2) // 2 - adj = '' if len(title) % 2 == 0 else ' ' - title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, - title=title, - adj=adj) + adj = "" if len(title) % 2 == 0 else " " + title_line = "{side} {title}{adj} {side}".format( + side=char * side_len, title=title, adj=adj + ) - return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( - full_line=full_line, title_line=title_line) + return "\n{full_line}\n{title_line}\n{full_line}\n\n".format( + full_line=full_line, title_line=title_line + ) exit_status = 0 if func_name is None: result = validate_all(prefix, ignore_deprecated) - if output_format == 'json': + if output_format == "json": output = json.dumps(result) else: - if output_format == 'default': - output_format = '{text}\n' - elif output_format == 'azure': - output_format = ('##vso[task.logissue type=error;' - 'sourcepath={path};' - 'linenumber={row};' - 'code={code};' - ']{text}\n') + if output_format == "default": 
+ output_format = "{text}\n" + elif output_format == "azure": + output_format = ( + "##vso[task.logissue type=error;" + "sourcepath={path};" + "linenumber={row};" + "code={code};" + "]{text}\n" + ) else: - raise ValueError('Unknown output_format "{}"'.format( - output_format)) + raise ValueError('Unknown output_format "{}"'.format(output_format)) - output = '' + output = "" for name, res in result.items(): - for err_code, err_desc in res['errors']: + for err_code, err_desc in res["errors"]: # The script would be faster if instead of filtering the # errors after validating them, it didn't validate them # initially. But that would complicate the code too much @@ -914,76 +957,93 @@ def header(title, width=80, char='#'): exit_status += 1 output += output_format.format( name=name, - path=res['file'], - row=res['file_line'], + path=res["file"], + row=res["file_line"], code=err_code, - text='{}: {}'.format(name, err_desc)) + text="{}: {}".format(name, err_desc), + ) sys.stdout.write(output) else: result = validate_one(func_name) - sys.stderr.write(header('Docstring ({})'.format(func_name))) - sys.stderr.write('{}\n'.format(result['docstring'])) - sys.stderr.write(header('Validation')) - if result['errors']: - sys.stderr.write('{} Errors found:\n'.format( - len(result['errors']))) - for err_code, err_desc in result['errors']: + sys.stderr.write(header("Docstring ({})".format(func_name))) + sys.stderr.write("{}\n".format(result["docstring"])) + sys.stderr.write(header("Validation")) + if result["errors"]: + sys.stderr.write("{} Errors found:\n".format(len(result["errors"]))) + for err_code, err_desc in result["errors"]: # Failing examples are printed at the end - if err_code == 'EX02': - sys.stderr.write('\tExamples do not pass tests\n') + if err_code == "EX02": + sys.stderr.write("\tExamples do not pass tests\n") continue - sys.stderr.write('\t{}\n'.format(err_desc)) - if result['warnings']: - sys.stderr.write('{} Warnings found:\n'.format( - len(result['warnings']))) - for wrn_code, wrn_desc in result['warnings']: - sys.stderr.write('\t{}\n'.format(wrn_desc)) + sys.stderr.write("\t{}\n".format(err_desc)) + if result["warnings"]: + sys.stderr.write("{} Warnings found:\n".format(len(result["warnings"]))) + for wrn_code, wrn_desc in result["warnings"]: + sys.stderr.write("\t{}\n".format(wrn_desc)) - if not result['errors']: - sys.stderr.write('Docstring for "{}" correct. :)\n'.format( - func_name)) + if not result["errors"]: + sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) - if result['examples_errors']: - sys.stderr.write(header('Doctests')) - sys.stderr.write(result['examples_errors']) + if result["examples_errors"]: + sys.stderr.write(header("Doctests")) + sys.stderr.write(result["examples_errors"]) return exit_status -if __name__ == '__main__': - format_opts = 'default', 'json', 'azure' - func_help = ('function or method to validate (e.g. pandas.DataFrame.head) ' - 'if not provided, all docstrings are validated and returned ' - 'as JSON') - argparser = argparse.ArgumentParser( - description='validate pandas docstrings') - argparser.add_argument('function', - nargs='?', - default=None, - help=func_help) - argparser.add_argument('--format', default='default', choices=format_opts, - help='format of the output when validating ' - 'multiple docstrings (ignored when validating one).' - 'It can be {}'.format(str(format_opts)[1:-1])) - argparser.add_argument('--prefix', default=None, help='pattern for the ' - 'docstring names, in order to decide which ones ' - 'will be validated. 
A prefix "pandas.Series.str.' - 'will make the script validate all the docstrings' - 'of methods starting by this pattern. It is ' - 'ignored if parameter function is provided') - argparser.add_argument('--errors', default=None, help='comma separated ' - 'list of error codes to validate. By default it ' - 'validates all errors (ignored when validating ' - 'a single docstring)') - argparser.add_argument('--ignore_deprecated', default=False, - action='store_true', help='if this flag is set, ' - 'deprecated objects are ignored when validating ' - 'all docstrings') +if __name__ == "__main__": + format_opts = "default", "json", "azure" + func_help = ( + "function or method to validate (e.g. pandas.DataFrame.head) " + "if not provided, all docstrings are validated and returned " + "as JSON" + ) + argparser = argparse.ArgumentParser(description="validate pandas docstrings") + argparser.add_argument("function", nargs="?", default=None, help=func_help) + argparser.add_argument( + "--format", + default="default", + choices=format_opts, + help="format of the output when validating " + "multiple docstrings (ignored when validating one)." + "It can be {}".format(str(format_opts)[1:-1]), + ) + argparser.add_argument( + "--prefix", + default=None, + help="pattern for the " + "docstring names, in order to decide which ones " + 'will be validated. A prefix "pandas.Series.str.' + "will make the script validate all the docstrings" + "of methods starting by this pattern. It is " + "ignored if parameter function is provided", + ) + argparser.add_argument( + "--errors", + default=None, + help="comma separated " + "list of error codes to validate. By default it " + "validates all errors (ignored when validating " + "a single docstring)", + ) + argparser.add_argument( + "--ignore_deprecated", + default=False, + action="store_true", + help="if this flag is set, " + "deprecated objects are ignored when validating " + "all docstrings", + ) args = argparser.parse_args() - sys.exit(main(args.function, args.prefix, - args.errors.split(',') if args.errors else None, - args.format, - args.ignore_deprecated)) + sys.exit( + main( + args.function, + args.prefix, + args.errors.split(",") if args.errors else None, + args.format, + args.ignore_deprecated, + ) + ) diff --git a/setup.py b/setup.py index 19c22fc25733d8..53e12da53cdebe 100755 --- a/setup.py +++ b/setup.py @@ -19,34 +19,37 @@ # versioning import versioneer + cmdclass = versioneer.get_cmdclass() def is_platform_windows(): - return sys.platform == 'win32' or sys.platform == 'cygwin' + return sys.platform == "win32" or sys.platform == "cygwin" def is_platform_mac(): - return sys.platform == 'darwin' + return sys.platform == "darwin" -min_numpy_ver = '1.13.3' +min_numpy_ver = "1.13.3" setuptools_kwargs = { - 'install_requires': [ - 'python-dateutil >= 2.6.1', - 'pytz >= 2017.2', - 'numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver), + "install_requires": [ + "python-dateutil >= 2.6.1", + "pytz >= 2017.2", + "numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver), ], - 'setup_requires': ['numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver)], - 'zip_safe': False, + "setup_requires": ["numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver)], + "zip_safe": False, } -min_cython_ver = '0.28.2' +min_cython_ver = "0.28.2" try: import Cython + ver = Cython.__version__ from Cython.Build import cythonize + _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver) except ImportError: _CYTHON_INSTALLED = False @@ -60,11 +63,13 @@ def is_platform_mac(): try: if not 
_CYTHON_INSTALLED: - raise ImportError('No supported version of Cython installed.') + raise ImportError("No supported version of Cython installed.") from Cython.Distutils.old_build_ext import old_build_ext as _build_ext + cython = True except ImportError: from distutils.command.build_ext import build_ext as _build_ext + cython = False else: try: @@ -73,25 +78,29 @@ def is_platform_mac(): except ImportError: import tempita except ImportError: - raise ImportError('Building pandas requires Tempita: ' - 'pip install Tempita') + raise ImportError("Building pandas requires Tempita: " "pip install Tempita") _pxi_dep_template = { - 'algos': ['_libs/algos_common_helper.pxi.in', - '_libs/algos_take_helper.pxi.in', - '_libs/algos_rank_helper.pxi.in'], - 'groupby': ['_libs/groupby_helper.pxi.in'], - 'hashtable': ['_libs/hashtable_class_helper.pxi.in', - '_libs/hashtable_func_helper.pxi.in'], - 'index': ['_libs/index_class_helper.pxi.in'], - 'sparse': ['_libs/sparse_op_helper.pxi.in'], - 'interval': ['_libs/intervaltree.pxi.in']} + "algos": [ + "_libs/algos_common_helper.pxi.in", + "_libs/algos_take_helper.pxi.in", + "_libs/algos_rank_helper.pxi.in", + ], + "groupby": ["_libs/groupby_helper.pxi.in"], + "hashtable": [ + "_libs/hashtable_class_helper.pxi.in", + "_libs/hashtable_func_helper.pxi.in", + ], + "index": ["_libs/index_class_helper.pxi.in"], + "sparse": ["_libs/sparse_op_helper.pxi.in"], + "interval": ["_libs/intervaltree.pxi.in"], +} _pxifiles = [] _pxi_dep = {} for module, files in _pxi_dep_template.items(): - pxi_files = [pjoin('pandas', x) for x in files] + pxi_files = [pjoin("pandas", x) for x in files] _pxifiles.extend(pxi_files) _pxi_dep[module] = pxi_files @@ -101,11 +110,13 @@ class build_ext(_build_ext): def render_templates(cls, pxifiles): for pxifile in pxifiles: # build pxifiles first, template extension must be .pxi.in - assert pxifile.endswith('.pxi.in') + assert pxifile.endswith(".pxi.in") outfile = pxifile[:-3] - if (os.path.exists(outfile) and - os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime): + if ( + os.path.exists(outfile) + and os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime + ): # if .pxi.in is not updated, no need to output .pxi continue @@ -122,17 +133,17 @@ def build_extensions(self): if cython: self.render_templates(_pxifiles) - numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") for ext in self.extensions: - if (hasattr(ext, 'include_dirs') and - numpy_incl not in ext.include_dirs): + if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) _build_ext.build_extensions(self) -DESCRIPTION = ("Powerful data structures for data analysis, time series, " - "and statistics") +DESCRIPTION = ( + "Powerful data structures for data analysis, time series, " "and statistics" +) LONG_DESCRIPTION = """ **pandas** is a Python package providing fast, flexible, and expressive data structures designed to make working with structured (tabular, multidimensional, @@ -197,29 +208,30 @@ def build_extensions(self): the ideal tool for all of these tasks. 
""" -DISTNAME = 'pandas' -LICENSE = 'BSD' +DISTNAME = "pandas" +LICENSE = "BSD" AUTHOR = "The PyData Development Team" EMAIL = "pydata@googlegroups.com" URL = "http://pandas.pydata.org" -DOWNLOAD_URL = '' +DOWNLOAD_URL = "" PROJECT_URLS = { - 'Bug Tracker': 'https://github.com/pandas-dev/pandas/issues', - 'Documentation': 'http://pandas.pydata.org/pandas-docs/stable/', - 'Source Code': 'https://github.com/pandas-dev/pandas' + "Bug Tracker": "https://github.com/pandas-dev/pandas/issues", + "Documentation": "http://pandas.pydata.org/pandas-docs/stable/", + "Source Code": "https://github.com/pandas-dev/pandas", } CLASSIFIERS = [ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Operating System :: OS Independent', - 'Intended Audience :: Science/Research', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Cython', - 'Topic :: Scientific/Engineering'] + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Cython", + "Topic :: Scientific/Engineering", +] class CleanCommand(Command): @@ -232,37 +244,44 @@ def initialize_options(self): self._clean_me = [] self._clean_trees = [] - base = pjoin('pandas', '_libs', 'src') - tsbase = pjoin('pandas', '_libs', 'tslibs', 'src') - dt = pjoin(tsbase, 'datetime') - util = pjoin('pandas', 'util') - parser = pjoin(base, 'parser') - ujson_python = pjoin(base, 'ujson', 'python') - ujson_lib = pjoin(base, 'ujson', 'lib') - self._clean_exclude = [pjoin(dt, 'np_datetime.c'), - pjoin(dt, 'np_datetime_strings.c'), - pjoin(parser, 'tokenizer.c'), - pjoin(parser, 'io.c'), - pjoin(ujson_python, 'ujson.c'), - pjoin(ujson_python, 'objToJSON.c'), - pjoin(ujson_python, 'JSONtoObj.c'), - pjoin(ujson_lib, 'ultrajsonenc.c'), - pjoin(ujson_lib, 'ultrajsondec.c'), - pjoin(util, 'move.c'), - ] - - for root, dirs, files in os.walk('pandas'): + base = pjoin("pandas", "_libs", "src") + tsbase = pjoin("pandas", "_libs", "tslibs", "src") + dt = pjoin(tsbase, "datetime") + util = pjoin("pandas", "util") + parser = pjoin(base, "parser") + ujson_python = pjoin(base, "ujson", "python") + ujson_lib = pjoin(base, "ujson", "lib") + self._clean_exclude = [ + pjoin(dt, "np_datetime.c"), + pjoin(dt, "np_datetime_strings.c"), + pjoin(parser, "tokenizer.c"), + pjoin(parser, "io.c"), + pjoin(ujson_python, "ujson.c"), + pjoin(ujson_python, "objToJSON.c"), + pjoin(ujson_python, "JSONtoObj.c"), + pjoin(ujson_lib, "ultrajsonenc.c"), + pjoin(ujson_lib, "ultrajsondec.c"), + pjoin(util, "move.c"), + ] + + for root, dirs, files in os.walk("pandas"): for f in files: filepath = pjoin(root, f) if filepath in self._clean_exclude: continue - if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o', - '.pyo', - '.pyd', '.c', '.orig'): + if os.path.splitext(f)[-1] in ( + ".pyc", + ".so", + ".o", + ".pyo", + ".pyd", + ".c", + ".orig", + ): self._clean_me.append(filepath) for d in dirs: - if d == '__pycache__': + if d == "__pycache__": self._clean_trees.append(pjoin(root, d)) # clean the generated pxi files @@ -270,7 +289,7 @@ def initialize_options(self): pxifile = 
pxifile.replace(".pxi.in", ".pxi") self._clean_me.append(pxifile) - for d in ('build', 'dist'): + for d in ("build", "dist"): if os.path.exists(d): self._clean_trees.append(d) @@ -292,68 +311,75 @@ def run(self): # we need to inherit from the versioneer # class as it encodes the version info -sdist_class = cmdclass['sdist'] +sdist_class = cmdclass["sdist"] class CheckSDist(sdist_class): """Custom sdist that ensures Cython has compiled all pyx files to c.""" - _pyxfiles = ['pandas/_libs/lib.pyx', - 'pandas/_libs/hashtable.pyx', - 'pandas/_libs/tslib.pyx', - 'pandas/_libs/index.pyx', - 'pandas/_libs/internals.pyx', - 'pandas/_libs/algos.pyx', - 'pandas/_libs/join.pyx', - 'pandas/_libs/indexing.pyx', - 'pandas/_libs/interval.pyx', - 'pandas/_libs/hashing.pyx', - 'pandas/_libs/missing.pyx', - 'pandas/_libs/reduction.pyx', - 'pandas/_libs/testing.pyx', - 'pandas/_libs/skiplist.pyx', - 'pandas/_libs/sparse.pyx', - 'pandas/_libs/ops.pyx', - 'pandas/_libs/parsers.pyx', - 'pandas/_libs/tslibs/c_timestamp.pyx', - 'pandas/_libs/tslibs/ccalendar.pyx', - 'pandas/_libs/tslibs/period.pyx', - 'pandas/_libs/tslibs/strptime.pyx', - 'pandas/_libs/tslibs/np_datetime.pyx', - 'pandas/_libs/tslibs/timedeltas.pyx', - 'pandas/_libs/tslibs/timestamps.pyx', - 'pandas/_libs/tslibs/timezones.pyx', - 'pandas/_libs/tslibs/conversion.pyx', - 'pandas/_libs/tslibs/fields.pyx', - 'pandas/_libs/tslibs/offsets.pyx', - 'pandas/_libs/tslibs/frequencies.pyx', - 'pandas/_libs/tslibs/resolution.pyx', - 'pandas/_libs/tslibs/parsing.pyx', - 'pandas/_libs/tslibs/tzconversion.pyx', - 'pandas/_libs/writers.pyx', - 'pandas/io/sas/sas.pyx'] - - _cpp_pyxfiles = ['pandas/_libs/window.pyx', - 'pandas/io/msgpack/_packer.pyx', - 'pandas/io/msgpack/_unpacker.pyx'] + _pyxfiles = [ + "pandas/_libs/lib.pyx", + "pandas/_libs/hashtable.pyx", + "pandas/_libs/tslib.pyx", + "pandas/_libs/index.pyx", + "pandas/_libs/internals.pyx", + "pandas/_libs/algos.pyx", + "pandas/_libs/join.pyx", + "pandas/_libs/indexing.pyx", + "pandas/_libs/interval.pyx", + "pandas/_libs/hashing.pyx", + "pandas/_libs/missing.pyx", + "pandas/_libs/reduction.pyx", + "pandas/_libs/testing.pyx", + "pandas/_libs/skiplist.pyx", + "pandas/_libs/sparse.pyx", + "pandas/_libs/ops.pyx", + "pandas/_libs/parsers.pyx", + "pandas/_libs/tslibs/c_timestamp.pyx", + "pandas/_libs/tslibs/ccalendar.pyx", + "pandas/_libs/tslibs/period.pyx", + "pandas/_libs/tslibs/strptime.pyx", + "pandas/_libs/tslibs/np_datetime.pyx", + "pandas/_libs/tslibs/timedeltas.pyx", + "pandas/_libs/tslibs/timestamps.pyx", + "pandas/_libs/tslibs/timezones.pyx", + "pandas/_libs/tslibs/conversion.pyx", + "pandas/_libs/tslibs/fields.pyx", + "pandas/_libs/tslibs/offsets.pyx", + "pandas/_libs/tslibs/frequencies.pyx", + "pandas/_libs/tslibs/resolution.pyx", + "pandas/_libs/tslibs/parsing.pyx", + "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/writers.pyx", + "pandas/io/sas/sas.pyx", + ] + + _cpp_pyxfiles = [ + "pandas/_libs/window.pyx", + "pandas/io/msgpack/_packer.pyx", + "pandas/io/msgpack/_unpacker.pyx", + ] def initialize_options(self): sdist_class.initialize_options(self) def run(self): - if 'cython' in cmdclass: - self.run_command('cython') + if "cython" in cmdclass: + self.run_command("cython") else: # If we are not running cython then # compile the extensions correctly - pyx_files = [(self._pyxfiles, 'c'), (self._cpp_pyxfiles, 'cpp')] + pyx_files = [(self._pyxfiles, "c"), (self._cpp_pyxfiles, "cpp")] for pyxfiles, extension in pyx_files: for pyxfile in pyxfiles: sourcefile = pyxfile[:-3] + extension - msg = 
("{extension}-source file '{source}' not found.\n" - "Run 'setup.py cython' before sdist.".format( - source=sourcefile, extension=extension)) + msg = ( + "{extension}-source file '{source}' not found.\n" + "Run 'setup.py cython' before sdist.".format( + source=sourcefile, extension=extension + ) + ) assert os.path.isfile(sourcefile), msg sdist_class.run(self) @@ -368,10 +394,14 @@ def check_cython_extensions(self, extensions): for src in ext.sources: if not os.path.exists(src): print("{}: -> [{}]".format(ext.name, ext.sources)) - raise Exception("""Cython-generated file '{src}' not found. + raise Exception( + """Cython-generated file '{src}' not found. Cython is required to compile pandas from a development branch. Please install Cython or download a release package of pandas. - """.format(src=src)) + """.format( + src=src + ) + ) def build_extensions(self): self.check_cython_extensions(self.extensions) @@ -384,6 +414,7 @@ class CythonCommand(build_ext): to compile pyx->c, and stop there. All this does is override the C-compile method build_extension() with a no-op. """ + def build_extension(self, ext): pass @@ -391,6 +422,7 @@ def build_extension(self, ext): class DummyBuildSrc(Command): """ numpy's build_src command interferes with Cython's build_ext. """ + user_options = [] def initialize_options(self): @@ -403,81 +435,82 @@ def run(self): pass -cmdclass.update({'clean': CleanCommand, - 'build': build}) +cmdclass.update({"clean": CleanCommand, "build": build}) if cython: - suffix = '.pyx' - cmdclass['build_ext'] = CheckingBuildExt - cmdclass['cython'] = CythonCommand + suffix = ".pyx" + cmdclass["build_ext"] = CheckingBuildExt + cmdclass["cython"] = CythonCommand else: - suffix = '.c' - cmdclass['build_src'] = DummyBuildSrc - cmdclass['build_ext'] = CheckingBuildExt + suffix = ".c" + cmdclass["build_src"] = DummyBuildSrc + cmdclass["build_ext"] = CheckingBuildExt # ---------------------------------------------------------------------- # Preparation of compiler arguments -debugging_symbols_requested = '--with-debugging-symbols' in sys.argv +debugging_symbols_requested = "--with-debugging-symbols" in sys.argv if debugging_symbols_requested: - sys.argv.remove('--with-debugging-symbols') + sys.argv.remove("--with-debugging-symbols") -if sys.byteorder == 'big': - endian_macro = [('__BIG_ENDIAN__', '1')] +if sys.byteorder == "big": + endian_macro = [("__BIG_ENDIAN__", "1")] else: - endian_macro = [('__LITTLE_ENDIAN__', '1')] + endian_macro = [("__LITTLE_ENDIAN__", "1")] if is_platform_windows(): extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: - extra_compile_args.append('/Z7') - extra_link_args.append('/DEBUG') + extra_compile_args.append("/Z7") + extra_link_args.append("/DEBUG") else: # args to ignore warnings - extra_compile_args = ['-Wno-unused-function'] + extra_compile_args = ["-Wno-unused-function"] extra_link_args = [] if debugging_symbols_requested: - extra_compile_args.append('-g') + extra_compile_args.append("-g") # Build for at least macOS 10.9 when compiling on a 10.9 system or above, # overriding CPython distuitls behaviour which is to target the version that # python was built for. 
This may be overridden by setting # MACOSX_DEPLOYMENT_TARGET before calling setup.py if is_platform_mac(): - if 'MACOSX_DEPLOYMENT_TARGET' not in os.environ: + if "MACOSX_DEPLOYMENT_TARGET" not in os.environ: current_system = platform.mac_ver()[0] - python_target = get_config_vars().get('MACOSX_DEPLOYMENT_TARGET', - current_system) - if (LooseVersion(python_target) < '10.9' and - LooseVersion(current_system) >= '10.9'): - os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' + python_target = get_config_vars().get( + "MACOSX_DEPLOYMENT_TARGET", current_system + ) + if ( + LooseVersion(python_target) < "10.9" + and LooseVersion(current_system) >= "10.9" + ): + os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled -linetrace = os.environ.get('PANDAS_CYTHON_COVERAGE', False) -if '--with-cython-coverage' in sys.argv: +linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) +if "--with-cython-coverage" in sys.argv: linetrace = True - sys.argv.remove('--with-cython-coverage') + sys.argv.remove("--with-cython-coverage") # Note: if not using `cythonize`, coverage can be enabled by # pinning `ext.cython_directives = directives` to each ext in extensions. # github.com/cython/cython/wiki/enhancements-compilerdirectives#in-setuppy -directives = {'linetrace': False, - 'language_level': 3} +directives = {"linetrace": False, "language_level": 3} macros = [] if linetrace: # https://pypkg.com/pypi/pytest-cython/f/tests/example-project/setup.py - directives['linetrace'] = True - macros = [('CYTHON_TRACE', '1'), ('CYTHON_TRACE_NOGIL', '1')] + directives["linetrace"] = True + macros = [("CYTHON_TRACE", "1"), ("CYTHON_TRACE_NOGIL", "1")] # in numpy>=1.16.0, silence build warnings about deprecated API usage # we can't do anything about these warnings because they stem from # cython+numpy version mismatches. -macros.append(('NPY_NO_DEPRECATED_API', '0')) +macros.append(("NPY_NO_DEPRECATED_API", "0")) # ---------------------------------------------------------------------- @@ -489,7 +522,7 @@ def maybe_cythonize(extensions, *args, **kwargs): """ Render tempita templates before calling cythonize """ - if len(sys.argv) > 1 and 'clean' in sys.argv: + if len(sys.argv) > 1 and "clean" in sys.argv: # Avoid running cythonize on `python setup.py clean` # See https://github.com/cython/cython/issues/1495 return extensions @@ -499,253 +532,255 @@ def maybe_cythonize(extensions, *args, **kwargs): # TODO: See if this can be removed after pyproject.toml added. return extensions - numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") # TODO: Is this really necessary here? 
for ext in extensions: - if (hasattr(ext, 'include_dirs') and - numpy_incl not in ext.include_dirs): + if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) build_ext.render_templates(_pxifiles) return cythonize(extensions, *args, **kwargs) -def srcpath(name=None, suffix='.pyx', subdir='src'): - return pjoin('pandas', subdir, name + suffix) +def srcpath(name=None, suffix=".pyx", subdir="src"): + return pjoin("pandas", subdir, name + suffix) -common_include = ['pandas/_libs/src/klib', 'pandas/_libs/src'] -ts_include = ['pandas/_libs/tslibs/src', 'pandas/_libs/tslibs'] +common_include = ["pandas/_libs/src/klib", "pandas/_libs/src"] +ts_include = ["pandas/_libs/tslibs/src", "pandas/_libs/tslibs"] -lib_depends = ['pandas/_libs/src/parse_helper.h', - 'pandas/_libs/src/compat_helper.h'] +lib_depends = ["pandas/_libs/src/parse_helper.h", "pandas/_libs/src/compat_helper.h"] np_datetime_headers = [ - 'pandas/_libs/tslibs/src/datetime/np_datetime.h', - 'pandas/_libs/tslibs/src/datetime/np_datetime_strings.h'] + "pandas/_libs/tslibs/src/datetime/np_datetime.h", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.h", +] np_datetime_sources = [ - 'pandas/_libs/tslibs/src/datetime/np_datetime.c', - 'pandas/_libs/tslibs/src/datetime/np_datetime_strings.c'] + "pandas/_libs/tslibs/src/datetime/np_datetime.c", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.c", +] tseries_depends = np_datetime_headers ext_data = { - '_libs.algos': { - 'pyxfile': '_libs/algos', - 'depends': _pxi_dep['algos']}, - '_libs.groupby': { - 'pyxfile': '_libs/groupby', - 'depends': _pxi_dep['groupby']}, - '_libs.hashing': { - 'pyxfile': '_libs/hashing', - 'include': [], - 'depends': []}, - '_libs.hashtable': { - 'pyxfile': '_libs/hashtable', - 'depends': (['pandas/_libs/src/klib/khash_python.h'] + - _pxi_dep['hashtable'])}, - '_libs.index': { - 'pyxfile': '_libs/index', - 'include': common_include + ts_include, - 'depends': _pxi_dep['index'], - 'sources': np_datetime_sources}, - '_libs.indexing': { - 'pyxfile': '_libs/indexing'}, - '_libs.internals': { - 'pyxfile': '_libs/internals'}, - '_libs.interval': { - 'pyxfile': '_libs/interval', - 'depends': _pxi_dep['interval']}, - '_libs.join': { - 'pyxfile': '_libs/join'}, - '_libs.lib': { - 'pyxfile': '_libs/lib', - 'include': common_include + ts_include, - 'depends': lib_depends + tseries_depends, - 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, - '_libs.missing': { - 'pyxfile': '_libs/missing', - 'include': common_include + ts_include, - 'depends': tseries_depends}, - '_libs.parsers': { - 'pyxfile': '_libs/parsers', - 'depends': ['pandas/_libs/src/parser/tokenizer.h', - 'pandas/_libs/src/parser/io.h'], - 'sources': ['pandas/_libs/src/parser/tokenizer.c', - 'pandas/_libs/src/parser/io.c']}, - '_libs.reduction': { - 'pyxfile': '_libs/reduction'}, - '_libs.ops': { - 'pyxfile': '_libs/ops'}, - '_libs.properties': { - 'pyxfile': '_libs/properties', - 'include': []}, - '_libs.reshape': { - 'pyxfile': '_libs/reshape', - 'depends': []}, - '_libs.skiplist': { - 'pyxfile': '_libs/skiplist', - 'depends': ['pandas/_libs/src/skiplist.h']}, - '_libs.sparse': { - 'pyxfile': '_libs/sparse', - 'depends': _pxi_dep['sparse']}, - '_libs.tslib': { - 'pyxfile': '_libs/tslib', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.c_timestamp': { - 'pyxfile': '_libs/tslibs/c_timestamp', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - 
'_libs.tslibs.ccalendar': { - 'pyxfile': '_libs/tslibs/ccalendar', - 'include': []}, - '_libs.tslibs.conversion': { - 'pyxfile': '_libs/tslibs/conversion', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.fields': { - 'pyxfile': '_libs/tslibs/fields', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.frequencies': { - 'pyxfile': '_libs/tslibs/frequencies', - 'include': []}, - '_libs.tslibs.nattype': { - 'pyxfile': '_libs/tslibs/nattype', - 'include': []}, - '_libs.tslibs.np_datetime': { - 'pyxfile': '_libs/tslibs/np_datetime', - 'include': ts_include, - 'depends': np_datetime_headers, - 'sources': np_datetime_sources}, - '_libs.tslibs.offsets': { - 'pyxfile': '_libs/tslibs/offsets', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.parsing': { - 'pyxfile': '_libs/tslibs/parsing', - 'depends': ['pandas/_libs/src/parser/tokenizer.h'], - 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, - '_libs.tslibs.period': { - 'pyxfile': '_libs/tslibs/period', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.resolution': { - 'pyxfile': '_libs/tslibs/resolution', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.strptime': { - 'pyxfile': '_libs/tslibs/strptime', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.timedeltas': { - 'pyxfile': '_libs/tslibs/timedeltas', - 'include': ts_include, - 'depends': np_datetime_headers, - 'sources': np_datetime_sources}, - '_libs.tslibs.timestamps': { - 'pyxfile': '_libs/tslibs/timestamps', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.timezones': { - 'pyxfile': '_libs/tslibs/timezones', - 'include': []}, - '_libs.tslibs.tzconversion': { - 'pyxfile': '_libs/tslibs/tzconversion', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.testing': { - 'pyxfile': '_libs/testing'}, - '_libs.window': { - 'pyxfile': '_libs/window', - 'language': 'c++', - 'suffix': '.cpp'}, - '_libs.writers': { - 'pyxfile': '_libs/writers'}, - 'io.sas._sas': { - 'pyxfile': 'io/sas/sas'}, - 'io.msgpack._packer': { - 'macros': endian_macro + macros, - 'depends': ['pandas/_libs/src/msgpack/pack.h', - 'pandas/_libs/src/msgpack/pack_template.h'], - 'include': ['pandas/_libs/src/msgpack'] + common_include, - 'language': 'c++', - 'suffix': '.cpp', - 'pyxfile': 'io/msgpack/_packer', - 'subdir': 'io/msgpack'}, - 'io.msgpack._unpacker': { - 'depends': ['pandas/_libs/src/msgpack/unpack.h', - 'pandas/_libs/src/msgpack/unpack_define.h', - 'pandas/_libs/src/msgpack/unpack_template.h'], - 'macros': endian_macro + macros, - 'include': ['pandas/_libs/src/msgpack'] + common_include, - 'language': 'c++', - 'suffix': '.cpp', - 'pyxfile': 'io/msgpack/_unpacker', - 'subdir': 'io/msgpack' - } + "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, + "_libs.hashtable": { + "pyxfile": "_libs/hashtable", + "depends": (["pandas/_libs/src/klib/khash_python.h"] + _pxi_dep["hashtable"]), + }, + "_libs.index": { + "pyxfile": "_libs/index", + "include": common_include + ts_include, + "depends": _pxi_dep["index"], + 
"sources": np_datetime_sources, + }, + "_libs.indexing": {"pyxfile": "_libs/indexing"}, + "_libs.internals": {"pyxfile": "_libs/internals"}, + "_libs.interval": {"pyxfile": "_libs/interval", "depends": _pxi_dep["interval"]}, + "_libs.join": {"pyxfile": "_libs/join"}, + "_libs.lib": { + "pyxfile": "_libs/lib", + "include": common_include + ts_include, + "depends": lib_depends + tseries_depends, + "sources": ["pandas/_libs/src/parser/tokenizer.c"], + }, + "_libs.missing": { + "pyxfile": "_libs/missing", + "include": common_include + ts_include, + "depends": tseries_depends, + }, + "_libs.parsers": { + "pyxfile": "_libs/parsers", + "depends": [ + "pandas/_libs/src/parser/tokenizer.h", + "pandas/_libs/src/parser/io.h", + ], + "sources": [ + "pandas/_libs/src/parser/tokenizer.c", + "pandas/_libs/src/parser/io.c", + ], + }, + "_libs.reduction": {"pyxfile": "_libs/reduction"}, + "_libs.ops": {"pyxfile": "_libs/ops"}, + "_libs.properties": {"pyxfile": "_libs/properties", "include": []}, + "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []}, + "_libs.skiplist": { + "pyxfile": "_libs/skiplist", + "depends": ["pandas/_libs/src/skiplist.h"], + }, + "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]}, + "_libs.tslib": { + "pyxfile": "_libs/tslib", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.c_timestamp": { + "pyxfile": "_libs/tslibs/c_timestamp", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.ccalendar": {"pyxfile": "_libs/tslibs/ccalendar", "include": []}, + "_libs.tslibs.conversion": { + "pyxfile": "_libs/tslibs/conversion", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.fields": { + "pyxfile": "_libs/tslibs/fields", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.frequencies": {"pyxfile": "_libs/tslibs/frequencies", "include": []}, + "_libs.tslibs.nattype": {"pyxfile": "_libs/tslibs/nattype", "include": []}, + "_libs.tslibs.np_datetime": { + "pyxfile": "_libs/tslibs/np_datetime", + "include": ts_include, + "depends": np_datetime_headers, + "sources": np_datetime_sources, + }, + "_libs.tslibs.offsets": { + "pyxfile": "_libs/tslibs/offsets", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.parsing": { + "pyxfile": "_libs/tslibs/parsing", + "depends": ["pandas/_libs/src/parser/tokenizer.h"], + "sources": ["pandas/_libs/src/parser/tokenizer.c"], + }, + "_libs.tslibs.period": { + "pyxfile": "_libs/tslibs/period", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.resolution": { + "pyxfile": "_libs/tslibs/resolution", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.strptime": { + "pyxfile": "_libs/tslibs/strptime", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timedeltas": { + "pyxfile": "_libs/tslibs/timedeltas", + "include": ts_include, + "depends": np_datetime_headers, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timestamps": { + "pyxfile": "_libs/tslibs/timestamps", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timezones": {"pyxfile": "_libs/tslibs/timezones", "include": []}, + "_libs.tslibs.tzconversion": 
{ + "pyxfile": "_libs/tslibs/tzconversion", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.testing": {"pyxfile": "_libs/testing"}, + "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, + "_libs.writers": {"pyxfile": "_libs/writers"}, + "io.sas._sas": {"pyxfile": "io/sas/sas"}, + "io.msgpack._packer": { + "macros": endian_macro + macros, + "depends": [ + "pandas/_libs/src/msgpack/pack.h", + "pandas/_libs/src/msgpack/pack_template.h", + ], + "include": ["pandas/_libs/src/msgpack"] + common_include, + "language": "c++", + "suffix": ".cpp", + "pyxfile": "io/msgpack/_packer", + "subdir": "io/msgpack", + }, + "io.msgpack._unpacker": { + "depends": [ + "pandas/_libs/src/msgpack/unpack.h", + "pandas/_libs/src/msgpack/unpack_define.h", + "pandas/_libs/src/msgpack/unpack_template.h", + ], + "macros": endian_macro + macros, + "include": ["pandas/_libs/src/msgpack"] + common_include, + "language": "c++", + "suffix": ".cpp", + "pyxfile": "io/msgpack/_unpacker", + "subdir": "io/msgpack", + }, } extensions = [] for name, data in ext_data.items(): - source_suffix = suffix if suffix == '.pyx' else data.get('suffix', '.c') + source_suffix = suffix if suffix == ".pyx" else data.get("suffix", ".c") - sources = [srcpath(data['pyxfile'], suffix=source_suffix, subdir='')] + sources = [srcpath(data["pyxfile"], suffix=source_suffix, subdir="")] - sources.extend(data.get('sources', [])) + sources.extend(data.get("sources", [])) - include = data.get('include', common_include) + include = data.get("include", common_include) - obj = Extension('pandas.{name}'.format(name=name), - sources=sources, - depends=data.get('depends', []), - include_dirs=include, - language=data.get('language', 'c'), - define_macros=data.get('macros', macros), - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args) + obj = Extension( + "pandas.{name}".format(name=name), + sources=sources, + depends=data.get("depends", []), + include_dirs=include, + language=data.get("language", "c"), + define_macros=data.get("macros", macros), + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) extensions.append(obj) # ---------------------------------------------------------------------- # ujson -if suffix == '.pyx': +if suffix == ".pyx": # undo dumb setuptools bug clobbering .pyx sources back to .c for ext in extensions: - if ext.sources[0].endswith(('.c', '.cpp')): + if ext.sources[0].endswith((".c", ".cpp")): root, _ = os.path.splitext(ext.sources[0]) ext.sources[0] = root + suffix -ujson_ext = Extension('pandas._libs.json', - depends=['pandas/_libs/src/ujson/lib/ultrajson.h'], - sources=(['pandas/_libs/src/ujson/python/ujson.c', - 'pandas/_libs/src/ujson/python/objToJSON.c', - 'pandas/_libs/src/ujson/python/JSONtoObj.c', - 'pandas/_libs/src/ujson/lib/ultrajsonenc.c', - 'pandas/_libs/src/ujson/lib/ultrajsondec.c'] + - np_datetime_sources), - include_dirs=['pandas/_libs/src/ujson/python', - 'pandas/_libs/src/ujson/lib', - 'pandas/_libs/src/datetime'], - extra_compile_args=(['-D_GNU_SOURCE'] + - extra_compile_args), - extra_link_args=extra_link_args, - define_macros=macros) +ujson_ext = Extension( + "pandas._libs.json", + depends=["pandas/_libs/src/ujson/lib/ultrajson.h"], + sources=( + [ + "pandas/_libs/src/ujson/python/ujson.c", + "pandas/_libs/src/ujson/python/objToJSON.c", + "pandas/_libs/src/ujson/python/JSONtoObj.c", + "pandas/_libs/src/ujson/lib/ultrajsonenc.c", + "pandas/_libs/src/ujson/lib/ultrajsondec.c", + ] + + 
np_datetime_sources + ), + include_dirs=[ + "pandas/_libs/src/ujson/python", + "pandas/_libs/src/ujson/lib", + "pandas/_libs/src/datetime", + ], + extra_compile_args=(["-D_GNU_SOURCE"] + extra_compile_args), + extra_link_args=extra_link_args, + define_macros=macros, +) extensions.append(ujson_ext) @@ -753,12 +788,14 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): # ---------------------------------------------------------------------- # util # extension for pseudo-safely moving bytes into mutable buffers -_move_ext = Extension('pandas.util._move', - depends=[], - sources=['pandas/util/move.c'], - define_macros=macros, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args) +_move_ext = Extension( + "pandas.util._move", + depends=[], + sources=["pandas/util/move.c"], + define_macros=macros, + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, +) extensions.append(_move_ext) # ---------------------------------------------------------------------- @@ -767,29 +804,31 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): # The build cache system does string matching below this point. # if you change something, be careful. -setup(name=DISTNAME, - maintainer=AUTHOR, - version=versioneer.get_version(), - packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['templates/*', '_libs/*.dll']}, - ext_modules=maybe_cythonize(extensions, compiler_directives=directives), - maintainer_email=EMAIL, - description=DESCRIPTION, - license=LICENSE, - cmdclass=cmdclass, - url=URL, - download_url=DOWNLOAD_URL, - project_urls=PROJECT_URLS, - long_description=LONG_DESCRIPTION, - classifiers=CLASSIFIERS, - platforms='any', - python_requires='>=3.5.3', - extras_require={ - 'test': [ - # sync with setup.cfg minversion & install.rst - 'pytest>=4.0.2', - 'pytest-xdist', - 'hypothesis>=3.58', - ] - }, - **setuptools_kwargs) +setup( + name=DISTNAME, + maintainer=AUTHOR, + version=versioneer.get_version(), + packages=find_packages(include=["pandas", "pandas.*"]), + package_data={"": ["templates/*", "_libs/*.dll"]}, + ext_modules=maybe_cythonize(extensions, compiler_directives=directives), + maintainer_email=EMAIL, + description=DESCRIPTION, + license=LICENSE, + cmdclass=cmdclass, + url=URL, + download_url=DOWNLOAD_URL, + project_urls=PROJECT_URLS, + long_description=LONG_DESCRIPTION, + classifiers=CLASSIFIERS, + platforms="any", + python_requires=">=3.5.3", + extras_require={ + "test": [ + # sync with setup.cfg minversion & install.rst + "pytest>=4.0.2", + "pytest-xdist", + "hypothesis>=3.58", + ] + }, + **setuptools_kwargs +) diff --git a/versioneer.py b/versioneer.py index 865dc10f09216d..24d8105c307c0e 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.15 """ @@ -367,11 +366,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. 
" + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -382,8 +383,10 @@ def get_root(): # versioneer.py was first imported, even in later projects. me = os.path.realpath(os.path.abspath(__file__)) if os.path.splitext(me)[0] != os.path.splitext(versioneer_py)[0]: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -404,6 +407,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -418,6 +422,7 @@ def get(parser, name): class NotThisMethod(Exception): pass + # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} @@ -429,6 +434,7 @@ def decorate(f): HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -439,9 +445,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -465,7 +474,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): return stdout -LONG_VERSION_PY['git'] = r''' +LONG_VERSION_PY[ + "git" +] = r""" # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -925,7 +936,7 @@ def get_versions(): return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version"} -''' +""" @register_vcs_handler("git", "get_keywords") @@ -965,7 +976,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -974,27 +985,32 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) + print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1014,9 +1030,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): GITS = ["git.cmd", "git.exe"] # if there is a tag, this yields TAG-NUM-gHEX[-dirty] # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) + describe_out = run_command( + GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1039,17 +1055,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1058,10 +1073,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1072,8 +1089,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces @@ -1118,12 +1134,18 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with " - "prefix '%s'" % (root, dirname, parentdir_prefix)) + print( + "guessing rootdir is '%s', but '%s' doesn't start with " + "prefix '%s'" % (root, dirname, parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + } + SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.15) from @@ -1152,8 +1174,9 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1161,8 +1184,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1192,8 +1214,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1300,10 +1321,12 @@ def render_git_describe_long(pieces): def render(pieces, style): if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + } if not style or style == "default": style = "pep440" # the default @@ -1323,8 +1346,12 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" 
% style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + } class VersioneerBadRootError(Exception): @@ -1345,8 +1372,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1400,8 +1428,12 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + } def get_version(): @@ -1447,6 +1479,7 @@ def run(self): print(" dirty: %s" % vers.get("dirty")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1470,10 +1503,10 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
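# The nearby hunks reformat calls that fill versioneer's _version.py template
# via %-style substitution with a dict.  A minimal sketch of that pattern
# (the template string here is illustrative, not versioneer's real one):
template = 'style = "%(STYLE)s"\ntag_prefix = "%(TAG_PREFIX)s"\n'
print(template % {"STYLE": "pep440", "TAG_PREFIX": "v"})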
@@ -1492,13 +1525,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] @@ -1526,8 +1563,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1581,11 +1620,13 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1594,15 +1635,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1644,8 +1688,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: @@ -1693,6 +1739,7 @@ def scan_setup_py(): errors += 1 return errors + if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup": From 8ea102acdb45bb70cb30ea77108a50054c28c24d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Jun 2019 12:19:01 -0500 Subject: [PATCH 161/238] Fix-up black formatting: fix noqa comments + type annotations --- pandas/api/extensions/__init__.py | 3 +-- pandas/compat/numpy/function.py | 12 ++++-------- pandas/core/indexes/datetimelike.py | 8 ++++---- pandas/tests/arrays/categorical/test_constructors.py | 4 ++-- pandas/tests/arrays/sparse/test_array.py | 8 ++++---- pandas/tests/frame/test_alter_axes.py | 5 ++--- pandas/tests/frame/test_analytics.py | 4 ++-- pandas/tests/frame/test_query_eval.py | 4 ++-- 
pandas/tests/indexing/test_callable.py | 12 ++++++------ pandas/tests/test_strings.py | 4 ++-- 10 files changed, 29 insertions(+), 35 deletions(-) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 431dd2b1968aee..573d700dac43d2 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -5,10 +5,9 @@ ) from pandas.core.accessor import ( # noqa: F401 + register_dataframe_accessor, register_index_accessor, register_series_accessor, ) from pandas.core.algorithms import take # noqa: F401 from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 - -from pandas.core.accessor import register_dataframe_accessor # noqa: F401; noqa: F401 diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 840dec2489a52a..89f7d71e21e9d4 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -108,8 +108,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() -# type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -125,8 +124,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() -# type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND = OrderedDict() # type: OrderedDict[str, Optional[int]] ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -243,8 +241,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS = OrderedDict() -# type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None @@ -278,8 +275,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() -# type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f2e6f631ae9ee2..731ab9c4163453 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -73,14 +73,14 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): # properties there. 
They can be made into cache_readonly for Index # subclasses bc they are immutable inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget - ) # type: ignore + DatetimeLikeArrayMixin.inferred_freq.fget # type: ignore + ) _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget - ) # type: ignore + DatetimeLikeArrayMixin._resolution.fget # type: ignore + ) resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 4bf31a52dcda8f..704f9c94463e6e 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -224,8 +224,8 @@ def test_constructor(self): # this is a legitimate constructor with tm.assert_produces_warning(None): - c = Categorical( - np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True # noqa + c = Categorical( # noqa + np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True ) def test_constructor_with_existing_categories(self): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c76b4d96005269..b94e2a16d217a0 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -689,13 +689,13 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:,] - exp = SparseArray(dense[4:,]) + res = sparse[4:,] # noqa: E231 + exp = SparseArray(dense[4:,]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:,] - exp = SparseArray(dense[4:,], fill_value=0) + res = sparse[4:,] # noqa: E231 + exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) with pytest.raises(IndexError): diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 229713a5af11a5..2ce65bd15387ee 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -238,9 +238,8 @@ def test_set_index_pass_arrays_duplicate( # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers first_drop = ( - False if (keys[0] is "A" and keys[1] is "A") else drop - ) # noqa: F632 - + False if (keys[0] is "A" and keys[1] is "A") else drop # noqa: F632 + ) # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise # box = list would be interpreted as keys diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 8c1534aa515e8c..13ffa8d17d47c2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2135,8 +2135,8 @@ def test_round(self): nan_round_Series = Series({"col1": np.nan, "col2": 1}) # TODO(wesm): unused? 
- expected_nan_round = DataFrame( - {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} # noqa + expected_nan_round = DataFrame( # noqa + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} ) with pytest.raises(TypeError): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 70c58471dd0d46..0781e20a71940d 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -102,8 +102,8 @@ def test_ops(self): ("/", "__truediv__", "__rtruediv__"), ]: - base = DataFrame( - np.tile(m.values, n).reshape(n, -1), columns=list("abcd") # noqa + base = DataFrame( # noqa + np.tile(m.values, n).reshape(n, -1), columns=list("abcd") ) expected = eval("base{op}df".format(op=op_str)) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 78aaf80b532fb0..aa73bd728595f4 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,11 +17,11 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2,] - tm.assert_frame_equal(res, df.loc[df.A > 2,]) + res = df.loc[lambda x: x.A > 2,] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 - res = df.loc[lambda x: x.A > 2,] - tm.assert_frame_equal(res, df.loc[df.A > 2,]) + res = df.loc[lambda x: x.A > 2,] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] tm.assert_frame_equal(res, df.loc[df.B == "b", :]) @@ -90,8 +90,8 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ["A", "C"],] - tm.assert_frame_equal(res, df.loc[["A", "C"],]) + res = df.loc[lambda x: ["A", "C"],] # noqa: E231 + tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] tm.assert_frame_equal(res, df.loc[["A", "C"], :]) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d70614fcd2700d..6833757c69eaaf 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1839,13 +1839,13 @@ def test_ismethods(self): digit_e = [False, False, False, True, False, False, False, True, False, False] # TODO: unused - num_e = [ + num_e = [ # noqa False, False, False, True, False, - False, # noqa + False, False, True, False, From 2efb60717bda9fc64344c5f6647d58564930808e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 23:03:46 -0500 Subject: [PATCH 162/238] RLS: 0.25.0rc0 From a61218d9ed92eeb31c83fa6517a740c54d907f5d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 5 Jul 2019 09:32:03 -0400 Subject: [PATCH 163/238] STYLE: add black makefile & skip some dirs (#27231) --- Makefile | 10 +++++----- ci/code_checks.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 956ff52338839f..a02fe145c5f0e6 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,4 @@ -tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx - python setup.py build_ext --inplace - -.PHONY : develop build clean clean_pyc tseries doc +.PHONY : develop build clean clean_pyc doc lint-diff black clean: -python setup.py clean @@ -15,8 +12,11 @@ build: clean_pyc lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 +black: + black . 
--exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist)' + develop: build - -python setup.py develop + python setup.py develop doc: -rm -rf doc/build doc/source/generated diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1494452ca136ba..fec2a882922806 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -56,7 +56,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then black --version MSG='Checking black formatting' ; echo $MSG - black . --check + black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist)' RET=$(($RET + $?)) ; echo $MSG "DONE" # `setup.cfg` contains the list of error codes that are being ignored in flake8 From c95027f33e553556268280bb0c9cc4977f4bd531 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 5 Jul 2019 09:28:35 -0500 Subject: [PATCH 164/238] REF: make ops a directory (#27238) --- pandas/core/{ops.py => ops/__init__.py} | 76 +++++-------------------- pandas/core/ops/roperator.py | 61 ++++++++++++++++++++ 2 files changed, 76 insertions(+), 61 deletions(-) rename pandas/core/{ops.py => ops/__init__.py} (98%) create mode 100644 pandas/core/ops/roperator.py diff --git a/pandas/core/ops.py b/pandas/core/ops/__init__.py similarity index 98% rename from pandas/core/ops.py rename to pandas/core/ops/__init__.py index 5c58a1433ba3cc..6fd53c4b244710 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops/__init__.py @@ -51,6 +51,21 @@ import pandas.core.common as com import pandas.core.missing as missing +from .roperator import ( # noqa:F401 + radd, + rand_, + rdiv, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + # ----------------------------------------------------------------------------- # Ops Wrapping Utilities @@ -151,67 +166,6 @@ def maybe_upcast_for_op(obj): return obj -# ----------------------------------------------------------------------------- -# Reversed Operations not available in the stdlib operator module. -# Defining these instead of using lambdas allows us to reference them by name. - - -def radd(left, right): - return right + left - - -def rsub(left, right): - return right - left - - -def rmul(left, right): - return right * left - - -def rdiv(left, right): - return right / left - - -def rtruediv(left, right): - return right / left - - -def rfloordiv(left, right): - return right // left - - -def rmod(left, right): - # check if right is a string as % is the string - # formatting operation; this is a TypeError - # otherwise perform the op - if isinstance(right, str): - raise TypeError( - "{typ} cannot perform the operation mod".format(typ=type(left).__name__) - ) - - return right % left - - -def rdivmod(left, right): - return divmod(right, left) - - -def rpow(left, right): - return right ** left - - -def rand_(left, right): - return operator.and_(right, left) - - -def ror_(left, right): - return operator.or_(right, left) - - -def rxor(left, right): - return operator.xor(right, left) - - # ----------------------------------------------------------------------------- diff --git a/pandas/core/ops/roperator.py b/pandas/core/ops/roperator.py new file mode 100644 index 00000000000000..4cb02238aea163 --- /dev/null +++ b/pandas/core/ops/roperator.py @@ -0,0 +1,61 @@ +""" +Reversed Operations not available in the stdlib operator module. +Defining these instead of using lambdas allows us to reference them by name. 
+""" +import operator + + +def radd(left, right): + return right + left + + +def rsub(left, right): + return right - left + + +def rmul(left, right): + return right * left + + +def rdiv(left, right): + return right / left + + +def rtruediv(left, right): + return right / left + + +def rfloordiv(left, right): + return right // left + + +def rmod(left, right): + # check if right is a string as % is the string + # formatting operation; this is a TypeError + # otherwise perform the op + if isinstance(right, str): + raise TypeError( + "{typ} cannot perform the operation mod".format(typ=type(left).__name__) + ) + + return right % left + + +def rdivmod(left, right): + return divmod(right, left) + + +def rpow(left, right): + return right ** left + + +def rand_(left, right): + return operator.and_(right, left) + + +def ror_(left, right): + return operator.or_(right, left) + + +def rxor(left, right): + return operator.xor(right, left) From 55f0666681008198e6c2d3c934b25cf19fa6e73b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 5 Jul 2019 07:29:34 -0700 Subject: [PATCH 165/238] BUG: merge_asof with multiple by columns with tz (#27243) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/reshape/merge.py | 3 ++ pandas/tests/reshape/merge/test_merge_asof.py | 32 +++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ab242ece981817..101addfa097f8b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1151,6 +1151,7 @@ Reshaping - Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`) - Bug in :meth:`DataFrame.transpose` where transposing a DataFrame with a timezone-aware datetime column would incorrectly raise ``ValueError`` (:issue:`26825`) - Bug in :func:`pivot_table` when pivoting a timezone aware column as the ``values`` would remove timezone information (:issue:`14948`) +- Bug in :func:`merge_asof` when specifying multiple ``by`` columns where one is ``datetime64[ns, tz]`` dtype (:issue:`26649`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4f910f6a278ad8..c1a07c129f7cda 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1686,6 +1686,9 @@ def _get_join_indexers(self): def flip(xs): """ unlike np.transpose, this returns an array of tuples """ + xs = [ + x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs + ] labels = list(string.ascii_lowercase[: len(xs)]) dtypes = [x.dtype for x in xs] labeled_dtypes = list(zip(labels, dtypes)) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index e2e17397464fe7..6b66386bafc5e6 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -190,9 +190,9 @@ def test_basic_left_index(self): result = merge_asof( trades, quotes, left_index=True, right_on="time", by="ticker" ) - # left-only index uses right's index, oddly + # left-only index uses right"s index, oddly expected.index = result.index - # time column appears after left's columns + # time column appears after left"s columns expected = expected[result.columns] assert_frame_equal(result, expected) @@ -233,7 +233,7 @@ def test_multi_index(self): def test_on_and_index(self): - # 'on' parameter and index together is prohibited + # "on" parameter and index together is prohibited trades = 
self.trades.set_index("time") quotes = self.quotes.set_index("time") with pytest.raises(MergeError): @@ -1220,3 +1220,29 @@ def test_merge_by_col_tz_aware(self): columns=["by_col", "on_col", "values_x", "values_y"], ) assert_frame_equal(result, expected) + + def test_by_mixed_tz_aware(self): + # GH 26649 + left = pd.DataFrame( + { + "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "by_col2": ["HELLO"], + "on_col": [2], + "value": ["a"], + } + ) + right = pd.DataFrame( + { + "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "by_col2": ["WORLD"], + "on_col": [1], + "value": ["b"], + } + ) + result = pd.merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col") + expected = pd.DataFrame( + [[pd.Timestamp("2018-01-01", tz="UTC"), "HELLO", 2, "a"]], + columns=["by_col1", "by_col2", "on_col", "value_x"], + ) + expected["value_y"] = np.array([np.nan], dtype=object) + assert_frame_equal(result, expected) From de006f807ac4eb704ae7ce22fc105b879dd9177d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 5 Jul 2019 08:21:46 -0700 Subject: [PATCH 166/238] remove never-used branch (#27236) --- pandas/core/ops/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 6fd53c4b244710..4692ec45df0adf 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -11,7 +11,7 @@ import numpy as np -from pandas._libs import algos as libalgos, lib, ops as libops +from pandas._libs import lib, ops as libops from pandas.errors import NullFrequencyError from pandas.util._decorators import Appender @@ -1667,10 +1667,6 @@ def na_op(x, y): result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) except TypeError: result = masked_arith_op(x, y, op) - except Exception: # TODO: more specific? - if is_object_dtype(x): - return libalgos.arrmap_object(x, lambda val: op(val, y)) - raise if isinstance(result, tuple): # e.g. divmod From 1219b0fb0d8e07b5018381a0bd8e0b4b89029dc7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 5 Jul 2019 09:24:08 -0700 Subject: [PATCH 167/238] CLN: Move code outside of try/except blocks (#27223) --- pandas/core/indexing.py | 9 ++++- pandas/core/internals/blocks.py | 65 ++++++++++++++++----------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ccc3a027af70d8..0bcaa83c496283 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -118,10 +118,15 @@ def __getitem__(self, key): key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: values = self.obj._get_value(*key) + except (KeyError, TypeError): + # TypeError occurs here if the key has non-hashable entries, + # generally slice or list. + # TODO(ix): most/all of the TypeError cases here are for ix, + # so this check can be removed once ix is removed. 
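# This hunk applies the try/except/else idiom used throughout the commit:
# only the risky call stays inside ``try`` and the follow-up work moves to
# ``else``.  A generic, hedged sketch of the idiom (names are illustrative,
# not pandas internals):
def parse_or_default(text, default=0):
    try:
        value = int(text)      # only the conversion is guarded
    except ValueError:
        pass                   # invalid input: fall through to the default
    else:
        return value * 2       # success path runs outside the try body
    return default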
+ pass + else: if is_scalar(values): return values - except Exception: - pass return self._getitem_tuple(key) else: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 34186b60de27c3..022d855d9a15b5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -413,12 +413,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): try: # Note: we only call try_coerce_args to let it raise self._try_coerce_args(value) - - blocks = self.putmask(mask, value, inplace=inplace) - blocks = [ - b.make_block(values=self._try_coerce_result(b.values)) for b in blocks - ] - return self._maybe_downcast(blocks, downcast) except (TypeError, ValueError): # we can't process the value, but nothing to do @@ -435,6 +429,12 @@ def f(m, v, i): return block.fillna(value, limit=limit, inplace=inplace, downcast=None) return self.split_and_operate(mask, f, inplace) + else: + blocks = self.putmask(mask, value, inplace=inplace) + blocks = [ + b.make_block(values=self._try_coerce_result(b.values)) for b in blocks + ] + return self._maybe_downcast(blocks, downcast) def split_and_operate(self, mask, f, inplace): """ @@ -615,10 +615,9 @@ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): return self.copy() return self - try: - # force the copy here - if values is None: - + if values is None: + try: + # force the copy here if self.is_extension: values = self.values.astype(dtype) else: @@ -644,10 +643,12 @@ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): if isinstance(values, np.ndarray): values = values.reshape(self.shape) - except Exception: # noqa: E722 - if errors == "raise": - raise - newb = self.copy() if copy else self + except Exception: # noqa: E722 + if errors == "raise": + raise + newb = self.copy() if copy else self + else: + newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) else: newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) @@ -861,13 +862,6 @@ def setitem(self, indexer, value): values = self.values try: value = self._try_coerce_args(value) - values = self._coerce_values(values) - # can keep its own dtype - if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): - dtype = self.dtype - else: - dtype = "infer" - except (TypeError, ValueError): # current dtype cannot store value, coerce to common dtype find_dtype = False @@ -891,6 +885,13 @@ def setitem(self, indexer, value): if not is_dtype_equal(self.dtype, dtype): b = self.astype(dtype) return b.setitem(indexer, value) + else: + values = self._coerce_values(values) + # can keep its own dtype + if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): + dtype = self.dtype + else: + dtype = "infer" # value must be storeable at this moment arr_value = np.array(value) @@ -2041,13 +2042,14 @@ def where( else: dtype = self.dtype + result = self.values.copy() + icond = ~cond + if lib.is_scalar(other): + set_other = other + else: + set_other = other[icond] try: - result = self.values.copy() - icond = ~cond - if lib.is_scalar(other): - result[icond] = other - else: - result[icond] = other[icond] + result[icond] = set_other except (NotImplementedError, TypeError): # NotImplementedError for class not implementing `__setitem__` # TypeError for SparseArray, which implements just to raise @@ -2314,10 +2316,7 @@ def _try_coerce_args(self, other): ------- base-type other """ - - if isinstance(other, bool): - raise TypeError - elif is_null_datetimelike(other): + if 
is_null_datetimelike(other): other = tslibs.iNaT elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) @@ -2689,9 +2688,7 @@ def _try_coerce_args(self, other): base-type other """ - if isinstance(other, bool): - raise TypeError - elif is_null_datetimelike(other): + if is_null_datetimelike(other): other = tslibs.iNaT elif isinstance(other, (timedelta, np.timedelta64)): other = Timedelta(other).value From 8794516cf428c6f778aa6a2e55c12cd83a84c40b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 6 Jul 2019 12:43:36 -0500 Subject: [PATCH 168/238] REF: ops.missing (#27257) --- pandas/core/indexes/base.py | 3 +- pandas/core/missing.py | 138 --------------------- pandas/core/ops/__init__.py | 2 +- pandas/core/ops/missing.py | 165 +++++++++++++++++++++++++ pandas/tests/indexing/test_coercion.py | 3 +- 5 files changed, 169 insertions(+), 142 deletions(-) create mode 100644 pandas/core/ops/missing.py diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 973a022cfc3f15..d3837617d231a0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -69,6 +69,7 @@ from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name, make_invalid_op +from pandas.core.ops.missing import dispatch_missing import pandas.core.sorting as sorting from pandas.core.strings import StringMethods @@ -154,7 +155,7 @@ def index_arithmetic_method(self, other): with np.errstate(all="ignore"): result = op(values, other) - result = missing.dispatch_missing(op, values, other, result) + result = dispatch_missing(op, values, other, result) attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ad4b5e45238067..8f0abc91f7aef0 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,8 +1,6 @@ """ Routines for filling missing data. """ -import operator - import numpy as np from pandas._libs import algos, lib @@ -13,7 +11,6 @@ ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, - is_float_dtype, is_integer, is_integer_dtype, is_numeric_v_string_like, @@ -578,141 +575,6 @@ def clean_reindex_fill_method(method): return clean_fill_method(method, allow_nearest=True) -def fill_zeros(result, x, y, name, fill): - """ - If this is a reversed op, then flip x,y - - If we have an integer value (or array in y) - and we have 0's, fill them with the fill, - return the result. - - Mask the nan's from x. 
- """ - if fill is None or is_float_dtype(result): - return result - - if name.startswith(("r", "__r")): - x, y = y, x - - is_variable_type = hasattr(y, "dtype") or hasattr(y, "type") - is_scalar_type = is_scalar(y) - - if not is_variable_type and not is_scalar_type: - return result - - if is_scalar_type: - y = np.array(y) - - if is_integer_dtype(y): - - if (y == 0).any(): - - # GH 7325, mask and nans must be broadcastable (also: PR 9308) - # Raveling and then reshaping makes np.putmask faster - mask = ((y == 0) & ~np.isnan(result)).ravel() - - shape = result.shape - result = result.astype("float64", copy=False).ravel() - - np.putmask(result, mask, fill) - - # if we have a fill of inf, then sign it correctly - # (GH 6178 and PR 9308) - if np.isinf(fill): - signs = y if name.startswith(("r", "__r")) else x - signs = np.sign(signs.astype("float", copy=False)) - negative_inf_mask = (signs.ravel() < 0) & mask - np.putmask(result, negative_inf_mask, -fill) - - if "floordiv" in name: # (PR 9308) - nan_mask = ((y == 0) & (x == 0)).ravel() - np.putmask(result, nan_mask, np.nan) - - result = result.reshape(shape) - - return result - - -def mask_zero_div_zero(x, y, result, copy=False): - """ - Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes - of the numerator or the denominator. - - Parameters - ---------- - x : ndarray - y : ndarray - result : ndarray - copy : bool (default False) - Whether to always create a new array or try to fill in the existing - array if possible. - - Returns - ------- - filled_result : ndarray - - Examples - -------- - >>> x = np.array([1, 0, -1], dtype=np.int64) - >>> y = 0 # int 0; numpy behavior is different with float - >>> result = x / y - >>> result # raw numpy result does not fill division by zero - array([0, 0, 0]) - >>> mask_zero_div_zero(x, y, result) - array([ inf, nan, -inf]) - """ - if is_scalar(y): - y = np.array(y) - - zmask = y == 0 - if zmask.any(): - shape = result.shape - - nan_mask = (zmask & (x == 0)).ravel() - neginf_mask = (zmask & (x < 0)).ravel() - posinf_mask = (zmask & (x > 0)).ravel() - - if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): - # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN - result = result.astype("float64", copy=copy).ravel() - - np.putmask(result, nan_mask, np.nan) - np.putmask(result, posinf_mask, np.inf) - np.putmask(result, neginf_mask, -np.inf) - - result = result.reshape(shape) - - return result - - -def dispatch_missing(op, left, right, result): - """ - Fill nulls caused by division by zero, casting to a different dtype - if necessary. - - Parameters - ---------- - op : function (operator.add, operator.div, ...) 
- left : object (Index for non-reversed ops) - right : object (Index fof reversed ops) - result : ndarray - - Returns - ------- - result : ndarray - """ - opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__") - if op in [operator.truediv, operator.floordiv, getattr(operator, "div", None)]: - result = mask_zero_div_zero(left, right, result) - elif op is operator.mod: - result = fill_zeros(result, left, right, opstr, np.nan) - elif op is divmod: - res0 = mask_zero_div_zero(left, right, result[0]) - res1 = fill_zeros(result[1], left, right, opstr, np.nan) - result = (res0, res1) - return result - - def _interp_limit(invalid, fw_limit, bw_limit): """ Get indexers of values that won't be filled diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 4692ec45df0adf..3ce6da6891a7ff 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -49,8 +49,8 @@ import pandas as pd from pandas._typing import ArrayLike import pandas.core.common as com -import pandas.core.missing as missing +from . import missing from .roperator import ( # noqa:F401 radd, rand_, diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py new file mode 100644 index 00000000000000..947dfc68ac7c3a --- /dev/null +++ b/pandas/core/ops/missing.py @@ -0,0 +1,165 @@ +""" +Missing data handling for arithmetic operations. + +In particular, pandas conventions regarding divison by zero differ +from numpy in the following ways: + 1) np.array([-1, 0, 1], dtype=dtype1) // np.array([0, 0, 0], dtype=dtype2) + gives [nan, nan, nan] for most dtype combinations, and [0, 0, 0] for + the remaining pairs + (the remaining being dtype1==dtype2==intN and dtype==dtype2==uintN). + + pandas convention is to return [-inf, nan, inf] for all dtype + combinations. + + Note: the numpy behavior described here is py3-specific. + + 2) np.array([-1, 0, 1], dtype=dtype1) % np.array([0, 0, 0], dtype=dtype2) + gives precisely the same results as the // operation. + + pandas convention is to return [nan, nan, nan] for all dtype + combinations. + + 3) divmod behavior consistent with 1) and 2). +""" +import operator + +import numpy as np + +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_scalar + + +def fill_zeros(result, x, y, name, fill): + """ + If this is a reversed op, then flip x,y + + If we have an integer value (or array in y) + and we have 0's, fill them with the fill, + return the result. + + Mask the nan's from x. 
+ """ + if fill is None or is_float_dtype(result): + return result + + if name.startswith(("r", "__r")): + x, y = y, x + + is_variable_type = hasattr(y, "dtype") or hasattr(y, "type") + is_scalar_type = is_scalar(y) + + if not is_variable_type and not is_scalar_type: + return result + + if is_scalar_type: + y = np.array(y) + + if is_integer_dtype(y): + + if (y == 0).any(): + + # GH#7325, mask and nans must be broadcastable (also: GH#9308) + # Raveling and then reshaping makes np.putmask faster + mask = ((y == 0) & ~np.isnan(result)).ravel() + + shape = result.shape + result = result.astype("float64", copy=False).ravel() + + np.putmask(result, mask, fill) + + # if we have a fill of inf, then sign it correctly + # (GH#6178 and GH#9308) + if np.isinf(fill): + signs = y if name.startswith(("r", "__r")) else x + signs = np.sign(signs.astype("float", copy=False)) + negative_inf_mask = (signs.ravel() < 0) & mask + np.putmask(result, negative_inf_mask, -fill) + + if "floordiv" in name: # (GH#9308) + nan_mask = ((y == 0) & (x == 0)).ravel() + np.putmask(result, nan_mask, np.nan) + + result = result.reshape(shape) + + return result + + +def mask_zero_div_zero(x, y, result, copy=False): + """ + Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes + of the numerator or the denominator. + + Parameters + ---------- + x : ndarray + y : ndarray + result : ndarray + copy : bool (default False) + Whether to always create a new array or try to fill in the existing + array if possible. + + Returns + ------- + filled_result : ndarray + + Examples + -------- + >>> x = np.array([1, 0, -1], dtype=np.int64) + >>> y = 0 # int 0; numpy behavior is different with float + >>> result = x / y + >>> result # raw numpy result does not fill division by zero + array([0, 0, 0]) + >>> mask_zero_div_zero(x, y, result) + array([ inf, nan, -inf]) + """ + if is_scalar(y): + y = np.array(y) + + zmask = y == 0 + if zmask.any(): + shape = result.shape + + nan_mask = (zmask & (x == 0)).ravel() + neginf_mask = (zmask & (x < 0)).ravel() + posinf_mask = (zmask & (x > 0)).ravel() + + if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): + # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN + result = result.astype("float64", copy=copy).ravel() + + np.putmask(result, nan_mask, np.nan) + np.putmask(result, posinf_mask, np.inf) + np.putmask(result, neginf_mask, -np.inf) + + result = result.reshape(shape) + + return result + + +def dispatch_missing(op, left, right, result): + """ + Fill nulls caused by division by zero, casting to a different dtype + if necessary. + + Parameters + ---------- + op : function (operator.add, operator.div, ...) + left : object (Index for non-reversed ops) + right : object (Index fof reversed ops) + result : ndarray + + Returns + ------- + result : ndarray + """ + opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__") + if op is operator.floordiv: + # Note: no need to do this for truediv; in py3 numpy behaves the way + # we want. 
+ result = mask_zero_div_zero(left, right, result) + elif op is operator.mod: + result = fill_zeros(result, left, right, opstr, np.nan) + elif op is divmod: + res0 = mask_zero_div_zero(left, right, result[0]) + res1 = fill_zeros(result[1], left, right, opstr, np.nan) + result = (res0, res1) + return result diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index f46fbcdb504e91..a18f8380f80c1d 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1063,8 +1063,7 @@ def test_replace_series_datetime_tz(self): # TODO(jreback) commented out to only have a single xfail printed @pytest.mark.xfail( - reason="different tz, " "currently mask_missing raises SystemError", - strict=False, + reason="different tz, currently mask_missing raises SystemError", strict=False ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', [ From 679dbd021eccc238e422057009365e2ee1c04b25 Mon Sep 17 00:00:00 2001 From: Kane Date: Sat, 6 Jul 2019 13:44:55 -0400 Subject: [PATCH 169/238] Misleading error for pd.read_msgpack (#27201) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 6 +++--- pandas/io/packers.py | 18 ++++++++++-------- pandas/tests/io/test_common.py | 4 ++-- pandas/tests/io/test_packers.py | 3 +++ 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 101addfa097f8b..241e445bf66863 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1094,6 +1094,7 @@ I/O - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). - Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. `PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) +- Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) Plotting ^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4e9f74162ae787..ec89208e3bbb8e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2560,7 +2560,7 @@ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): Parameters ---------- path : string File path, buffer-like, or None - if None, return generated string + if None, return generated bytes append : bool whether to append to an existing msgpack (default is False) compress : type of compressor (zlib or blosc), default to None (no @@ -2568,9 +2568,9 @@ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): Returns ------- - None or str + None or bytes If path_or_buf is None, returns the resulting msgpack format as a - string. Otherwise returns None. + byte string. Otherwise returns None. 
""" from pandas.io import packers diff --git a/pandas/io/packers.py b/pandas/io/packers.py index b0ce7a4ccb12af..2e411fb07885fc 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -108,7 +108,7 @@ def to_msgpack(path_or_buf, *args, **kwargs): Parameters ---------- path_or_buf : string File path, buffer-like, or None - if None, return generated string + if None, return generated bytes args : an object or objects to serialize encoding : encoding for unicode objects append : boolean whether to append to an existing msgpack @@ -139,8 +139,12 @@ def writer(fh): path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - with open(path_or_buf, mode) as fh: - writer(fh) + try: + with open(path_or_buf, mode) as fh: + writer(fh) + except FileNotFoundError: + msg = "File b'{}' does not exist".format(path_or_buf) + raise FileNotFoundError(msg) elif path_or_buf is None: buf = BytesIO() writer(buf) @@ -204,13 +208,11 @@ def read(fh): # see if we have an actual file if isinstance(path_or_buf, str): try: - exists = os.path.exists(path_or_buf) - except (TypeError, ValueError): - exists = False - - if exists: with open(path_or_buf, "rb") as fh: return read(fh) + except FileNotFoundError: + msg = "File b'{}' does not exist".format(path_or_buf) + raise FileNotFoundError(msg) if isinstance(path_or_buf, bytes): # treat as a binary-like diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 426698bfa1e940..8e09e96fbd4713 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -142,7 +142,7 @@ def test_iterator(self): (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), (pd.read_json, "os", ValueError, "json"), - (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_msgpack, "os", FileNotFoundError, "mp"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) @@ -177,7 +177,7 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), (pd.read_json, "os", ValueError, "json"), - (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_msgpack, "os", FileNotFoundError, "mp"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 83c11cd9ab996e..fb1f657905be71 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -165,12 +165,15 @@ def __init__(self): self.read = 0 msg = "Invalid file path or buffer object type: " + invalid_path = os.path.join("nonexistent_dir", "df.msgpack") with pytest.raises(ValueError, match=msg.format("NoneType")): read_msgpack(path_or_buf=None) with pytest.raises(ValueError, match=msg.format("dict")): read_msgpack(path_or_buf={}) with pytest.raises(ValueError, match=msg.format(r".*\.A")): read_msgpack(path_or_buf=A()) + with pytest.raises(FileNotFoundError, match="does not exist"): + read_msgpack(path_or_buf=invalid_path) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") From f4752fcafb67298b5d28abc8a096d787bf117d27 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 6 Jul 2019 13:12:12 -0500 Subject: [PATCH 170/238] CLN: remove unnecessary fastpath, transpose kwargs in internals (#27260) --- pandas/core/generic.py | 1 - pandas/core/internals/blocks.py | 72 ++++++------------------ pandas/core/internals/managers.py | 12 ++-- pandas/tests/internals/test_internals.py | 34 +---------- 4 files changed, 23 
insertions(+), 96 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ec89208e3bbb8e..b79bde9cc3cb14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9175,7 +9175,6 @@ def _where( errors=errors, try_cast=try_cast, axis=block_axis, - transpose=self._AXIS_REVERSED, ) return self._constructor(new_data).__finalize__(self) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 022d855d9a15b5..bf6ebf1abe7605 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -143,7 +143,7 @@ def _check_ndim(self, values, ndim): ndim = values.ndim if self._validate_ndim and values.ndim != ndim: - msg = "Wrong number of dimensions. values.ndim != ndim " "[{} != {}]" + msg = "Wrong number of dimensions. values.ndim != ndim [{} != {}]" raise ValueError(msg.format(values.ndim, ndim)) return ndim @@ -259,7 +259,7 @@ def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): if dtype is not None: # issue 19431 fastparquet is passing this warnings.warn( - "dtype argument is deprecated, will be removed " "in a future release.", + "dtype argument is deprecated, will be removed in a future release.", FutureWarning, ) if placement is None: @@ -399,7 +399,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): raise ValueError("Limit must be greater than 0") if self.ndim > 2: raise NotImplementedError( - "number of dimensions for 'fillna' " "is currently limited to 2" + "number of dimensions for 'fillna' is currently limited to 2" ) mask[mask.cumsum(self.ndim - 1) > limit] = False @@ -533,7 +533,7 @@ def downcast(self, dtypes=None): if not (dtypes == "infer" or isinstance(dtypes, dict)): raise ValueError( - "downcast must have a dictionary or 'infer' as " "its argument" + "downcast must have a dictionary or 'infer' as its argument" ) # operate column-by-column @@ -1025,7 +1025,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) or mask[mask].shape[-1] == len(new) or len(new) == 1 ): - raise ValueError("cannot assign mismatch " "length to masked array") + raise ValueError("cannot assign mismatch length to masked array") np.putmask(new_values, mask, new) @@ -1381,16 +1381,7 @@ def shift(self, periods, axis=0, fill_value=None): return [self.make_block(new_values)] - def where( - self, - other, - cond, - align=True, - errors="raise", - try_cast=False, - axis=0, - transpose=False, - ): + def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): """ evaluate the block; return result block(s) from the result @@ -1402,10 +1393,7 @@ def where( errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. 
On error return original object - axis : int - transpose : boolean - Set to True if self is stored with axes reversed Returns ------- @@ -1414,6 +1402,7 @@ def where( import pandas.core.computation.expressions as expressions assert errors in ["raise", "ignore"] + transpose = self.ndim == 2 values = self.values orig_other = other @@ -1432,7 +1421,7 @@ def where( cond = cond.T if not hasattr(cond, "shape"): - raise ValueError("where must have a condition that is ndarray " "like") + raise ValueError("where must have a condition that is ndarray like") # our where function def func(cond, values, other): @@ -1473,7 +1462,6 @@ def func(cond, values, other): errors=errors, try_cast=try_cast, axis=axis, - transpose=transpose, ) return self._maybe_downcast(blocks, "infer") @@ -1917,7 +1905,7 @@ def _slice(self, slicer): if isinstance(slicer, tuple) and len(slicer) == 2: if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " "categorical") + raise AssertionError("invalid slicing for a 1-ndim categorical") slicer = slicer[1] return self.values[slicer] @@ -2004,16 +1992,7 @@ def shift( ) ] - def where( - self, - other, - cond, - align=True, - errors="raise", - try_cast=False, - axis=0, - transpose=False, - ): + def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. @@ -2321,9 +2300,7 @@ def _try_coerce_args(self, other): elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) if getattr(other, "tz") is not None: - raise TypeError( - "cannot coerce a Timestamp with a tz on a " "naive Block" - ) + raise TypeError("cannot coerce a Timestamp with a tz on a naive Block") other = other.asm8.view("i8") elif hasattr(other, "dtype") and is_datetime64_dtype(other): other = other.astype("i8", copy=False).view("i8") @@ -2997,7 +2974,7 @@ def _replace_single( # only one will survive if to_rep_re and regex_re: raise AssertionError( - "only one of to_replace and regex can be " "regex compilable" + "only one of to_replace and regex can be regex compilable" ) # if regex was passed as something that can be a regex (rather than a @@ -3181,16 +3158,7 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1), ndim=self.ndim ) - def where( - self, - other, - cond, - align=True, - errors="raise", - try_cast=False, - axis=0, - transpose=False, - ): + def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. @@ -3205,19 +3173,11 @@ def where( ) try: # Attempt to do preserve categorical dtype. 
- result = super().where( - other, cond, align, errors, try_cast, axis, transpose - ) + result = super().where(other, cond, align, errors, try_cast, axis) except (TypeError, ValueError): warnings.warn(object_msg, FutureWarning, stacklevel=6) result = self.astype(object).where( - other, - cond, - align=align, - errors=errors, - try_cast=try_cast, - axis=axis, - transpose=transpose, + other, cond, align=align, errors=errors, try_cast=try_cast, axis=axis ) return result @@ -3286,7 +3246,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=No if fastpath is not None: # GH#19265 pyarrow is passing this warnings.warn( - "fastpath argument is deprecated, will be removed " "in a future release.", + "fastpath argument is deprecated, will be removed in a future release.", FutureWarning, ) if klass is None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c5254aaa4af5fa..b3c74aaaa5701a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -936,7 +936,7 @@ def _consolidate_inplace(self): self._known_consolidated = True self._rebuild_blknos_and_blklocs() - def get(self, item, fastpath=True): + def get(self, item): """ Return values for selected item (ndarray or BlockManager). """ @@ -954,7 +954,7 @@ def get(self, item, fastpath=True): else: raise ValueError("cannot label index with a null key") - return self.iget(loc, fastpath=fastpath) + return self.iget(loc) else: if isna(item): @@ -965,18 +965,18 @@ def get(self, item, fastpath=True): new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True ) - def iget(self, i, fastpath=True): + def iget(self, i): """ - Return the data as a SingleBlockManager if fastpath=True and possible + Return the data as a SingleBlockManager if possible Otherwise return as a ndarray """ block = self.blocks[self._blknos[i]] values = block.iget(self._blklocs[i]) - if not fastpath or values.ndim != 1: + if values.ndim != 1: return values - # fastpath shortcut for select a single-dim from a 2-dim BM + # shortcut for select a single-dim from a 2-dim BM return SingleBlockManager( [ block.make_block_same_class( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 9ce1062a6ec26b..6beb847da3eb49 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -418,9 +418,6 @@ def test_get(self): block = make_block(values=values.copy(), placement=np.arange(3)) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) - assert_almost_equal(mgr.get("a", fastpath=False), values[0]) - assert_almost_equal(mgr.get("b", fastpath=False), values[1]) - assert_almost_equal(mgr.get("c", fastpath=False), values[2]) assert_almost_equal(mgr.get("a").internal_values(), values[0]) assert_almost_equal(mgr.get("b").internal_values(), values[1]) assert_almost_equal(mgr.get("c").internal_values(), values[2]) @@ -701,6 +698,7 @@ def test_consolidate_ordering_issues(self, mgr): ) def test_reindex_index(self): + # TODO: should this be pytest.skip? 
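# Minimal sketch mirroring test_get above (assuming the same construction):
# BlockManager.get/iget no longer accept a ``fastpath`` argument and return a
# SingleBlockManager whose ndarray is reachable via ``.internal_values()``.
import numpy as np
import pandas as pd
from pandas.core.internals import BlockManager, make_block

cols = pd.Index(list("abc"))
values = np.random.rand(3, 3)
block = make_block(values=values.copy(), placement=np.arange(3))
mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

np.testing.assert_allclose(mgr.get("a").internal_values(), values[0])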
pass def test_reindex_items(self): @@ -710,18 +708,6 @@ def test_reindex_items(self): reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 2 tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) - assert_almost_equal( - mgr.get("g", fastpath=False), reindexed.get("g", fastpath=False) - ) - assert_almost_equal( - mgr.get("c", fastpath=False), reindexed.get("c", fastpath=False) - ) - assert_almost_equal( - mgr.get("a", fastpath=False), reindexed.get("a", fastpath=False) - ) - assert_almost_equal( - mgr.get("d", fastpath=False), reindexed.get("d", fastpath=False) - ) assert_almost_equal( mgr.get("g").internal_values(), reindexed.get("g").internal_values() ) @@ -747,18 +733,12 @@ def test_get_numeric_data(self): tm.assert_index_equal( numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) - assert_almost_equal( - mgr.get("float", fastpath=False), numeric.get("float", fastpath=False) - ) assert_almost_equal( mgr.get("float").internal_values(), numeric.get("float").internal_values() ) # Check sharing numeric.set("float", np.array([100.0, 200.0, 300.0])) - assert_almost_equal( - mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) - ) assert_almost_equal( mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) ) @@ -768,9 +748,6 @@ def test_get_numeric_data(self): numeric.items, pd.Index(["int", "float", "complex", "bool"]) ) numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) - assert_almost_equal( - mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) - ) assert_almost_equal( mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) ) @@ -785,17 +762,11 @@ def test_get_bool_data(self): bools = mgr.get_bool_data() tm.assert_index_equal(bools.items, pd.Index(["bool"])) - assert_almost_equal( - mgr.get("bool", fastpath=False), bools.get("bool", fastpath=False) - ) assert_almost_equal( mgr.get("bool").internal_values(), bools.get("bool").internal_values() ) bools.set("bool", np.array([True, False, True])) - tm.assert_numpy_array_equal( - mgr.get("bool", fastpath=False), np.array([True, False, True]) - ) tm.assert_numpy_array_equal( mgr.get("bool").internal_values(), np.array([True, False, True]) ) @@ -803,9 +774,6 @@ def test_get_bool_data(self): # Check sharing bools2 = mgr.get_bool_data(copy=True) bools2.set("bool", np.array([False, True, False])) - tm.assert_numpy_array_equal( - mgr.get("bool", fastpath=False), np.array([True, False, True]) - ) tm.assert_numpy_array_equal( mgr.get("bool").internal_values(), np.array([True, False, True]) ) From 20a85c392eaf92772b240a9d7c00abc7a3982b15 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 6 Jul 2019 16:45:46 -0500 Subject: [PATCH 171/238] BUG: Fix divmod fill value, closes #26987 (#27239) --- doc/source/whatsnew/v0.25.0.rst | 2 +- doc/source/whatsnew/v0.25.1.rst | 1 - pandas/core/ops/__init__.py | 14 ++---- pandas/core/ops/missing.py | 23 +++++++++ pandas/tests/arithmetic/test_numeric.py | 57 ++++++++++++++++------- pandas/tests/sparse/series/test_series.py | 1 + 6 files changed, 67 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 241e445bf66863..9f59be73e501cb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1007,7 +1007,7 @@ Numeric - Raises a helpful exception when a non-numeric index is sent to :meth:`interpolate` with methods which require numeric index. 
(:issue:`21662`) - Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) - Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`) -- +- Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`) - Conversion diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 8690e1974330b8..6234bc0f7bd35d 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -56,7 +56,6 @@ Timezones Numeric ^^^^^^^ - - - - diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 3ce6da6891a7ff..df2907bf591ddf 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -249,7 +249,7 @@ def _gen_fill_zeros(name): """ name = name.strip("__") if "div" in name: - # truediv, floordiv, div, and reversed variants + # truediv, floordiv, and reversed variants fill_value = np.inf elif "mod" in name: # mod, rmod @@ -1668,14 +1668,7 @@ def na_op(x, y): except TypeError: result = masked_arith_op(x, y, op) - if isinstance(result, tuple): - # e.g. divmod - result = tuple( - missing.fill_zeros(r, x, y, op_name, fill_zeros) for r in result - ) - else: - result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result + return missing.dispatch_fill_zeros(op, x, y, result, fill_zeros) def wrapper(left, right): if isinstance(right, ABCDataFrame): @@ -2157,8 +2150,7 @@ def na_op(x, y): except TypeError: result = masked_arith_op(x, y, op) - result = missing.fill_zeros(result, x, y, op_name, fill_zeros) - return result + return missing.dispatch_fill_zeros(op, x, y, result, fill_zeros) if op_name in _op_descriptions: # i.e. include "add" but not "__add__" diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 947dfc68ac7c3a..4ca1861baf237b 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -27,6 +27,8 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_scalar +from .roperator import rdivmod + def fill_zeros(result, x, y, name, fill): """ @@ -163,3 +165,24 @@ def dispatch_missing(op, left, right, result): res1 = fill_zeros(result[1], left, right, opstr, np.nan) result = (res0, res1) return result + + +# FIXME: de-duplicate with dispatch_missing +def dispatch_fill_zeros(op, left, right, result, fill_value): + """ + Call fill_zeros with the appropriate fill value depending on the operation, + with special logic for divmod and rdivmod. 
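# Minimal sketch of the behaviour exercised by the tests added below (GH 26987):
# divmod on a Series containing zeros now fills the zero-division entries
# instead of raising AttributeError.
import pandas as pd

left = pd.Series([1, 1])
right = pd.Series([0, 2])

quotient, remainder = divmod(left, right)
# quotient matches left // right -> [inf, 0.0]
# remainder matches left % right -> [NaN, 1.0]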
+ """ + if op is divmod: + result = ( + fill_zeros(result[0], left, right, "__floordiv__", np.inf), + fill_zeros(result[1], left, right, "__mod__", np.nan), + ) + elif op is rdivmod: + result = ( + fill_zeros(result[0], left, right, "__rfloordiv__", np.inf), + fill_zeros(result[1], left, right, "__rmod__", np.nan), + ) + else: + result = fill_zeros(result, left, right, op.__name__, fill_value) + return result diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 7dcd0cc820061b..f582bf8b13975b 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -265,25 +265,11 @@ def test_divmod_zero(self, zero, numeric_idx): # ------------------------------------------------------------------ - @pytest.mark.parametrize( - "dtype2", - [ - np.int64, - np.int32, - np.int16, - np.int8, - np.float64, - np.float32, - np.float16, - np.uint64, - np.uint32, - np.uint16, - np.uint8, - ], - ) @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) - def test_ser_div_ser(self, dtype1, dtype2): + def test_ser_div_ser(self, dtype1, any_real_dtype): # no longer do integer div for any ops, but deal with the 0's + dtype2 = any_real_dtype + first = Series([3, 4, 5, 8], name="first").astype(dtype1) second = Series([0, 0, 0, 3], name="second").astype(dtype2) @@ -299,6 +285,39 @@ def test_ser_div_ser(self, dtype1, dtype2): tm.assert_series_equal(result, expected) assert not result.equals(second / first) + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) + def test_ser_divmod_zero(self, dtype1, any_real_dtype): + # GH#26987 + dtype2 = any_real_dtype + left = pd.Series([1, 1]).astype(dtype1) + right = pd.Series([0, 2]).astype(dtype2) + + expected = left // right, left % right + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + def test_ser_divmod_inf(self): + left = pd.Series([np.inf, 1.0]) + right = pd.Series([np.inf, 2.0]) + + expected = left // right, left % right + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + def test_rdiv_zero_compat(self): # GH#8674 zero_array = np.array([0] * 5) @@ -662,7 +681,9 @@ def test_modulo2(self): result2 = p["second"] % p["first"] assert not result.equals(result2) - # GH#9144 + def test_modulo_zero_int(self): + # GH#9144 + with np.errstate(all="ignore"): s = Series([0, 1]) result = s % 0 diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 8895544958d7a8..5619a0a11fb116 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -578,6 +578,7 @@ def check(a, b): _check_op(a, b, lambda x, y: operator.floordiv(y, x)) _check_op(a, b, lambda x, y: operator.mul(y, x)) + # FIXME: don't leave commented-out # NaN ** 0 = 1 in C? 
# _check_op(a, b, operator.pow) # _check_op(a, b, lambda x, y: operator.pow(y, x)) From af5b2a25793794ad76b91027146e4c9f81ee1224 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Sat, 6 Jul 2019 15:11:47 -0700 Subject: [PATCH 172/238] PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex. (#27136) --- asv_bench/benchmarks/categoricals.py | 14 +++++++--- asv_bench/benchmarks/indexing.py | 23 ++++++++++----- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/core/indexes/base.py | 1 - pandas/core/indexes/period.py | 28 +++++++++++++++++-- pandas/tests/series/indexing/test_datetime.py | 24 ++++++++++++++++ 6 files changed, 78 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 933946b1ca1acc..8097118a79d20d 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pandas.util.testing as tm +import warnings try: from pandas.api.types import union_categoricals @@ -122,11 +123,16 @@ def setup(self): ncats = 100 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) - self.s_str_cat = self.s_str.astype("category") - self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered() + self.s_str_cat = pd.Series(self.s_str, dtype="category") + with warnings.catch_warnings(record=True): + str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True) + self.s_str_cat_ordered = self.s_str.astype(str_cat_type) + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) - self.s_int_cat = self.s_int.astype("category") - self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered() + self.s_int_cat = pd.Series(self.s_int, dtype="category") + with warnings.catch_warnings(record=True): + int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True) + self.s_int_cat_ordered = self.s_int.astype(int_cat_type) def time_rank_string(self): self.s_str.rank() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 489e5c4cd63ea3..eb730f91b10b31 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -15,6 +15,7 @@ concat, date_range, option_context, + period_range, ) @@ -93,22 +94,30 @@ def time_loc_slice(self, index, index_structure): class NonNumericSeriesIndexing: params = [ - ("string", "datetime"), - ("unique_monotonic_inc", "nonunique_monotonic_inc"), + ("string", "datetime", "period"), + ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"), ] param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): N = 10 ** 6 - indexes = { - "string": tm.makeStringIndex(N), - "datetime": date_range("1900", periods=N, freq="s"), - } - index = indexes[index] + if index == "string": + index = tm.makeStringIndex(N) + elif index == "datetime": + index = date_range("1900", periods=N, freq="s") + elif index == "period": + index = period_range("1900", periods=N, freq="s") + index = index.sort_values() + assert index.is_unique and index.is_monotonic_increasing if index_structure == "nonunique_monotonic_inc": index = index.insert(item=index[2], loc=2)[:-1] + elif index_structure == "non_monotonic": + index = index[::2].append(index[1::2]) + assert len(index) == N self.s = Series(np.random.rand(N), index=index) self.lbl = index[80000] + # warm up index mapping + self.s[self.lbl] def time_getitem_label_slice(self, index, index_structure): self.s[: self.lbl] diff --git a/doc/source/whatsnew/v0.25.0.rst 
b/doc/source/whatsnew/v0.25.0.rst index 9f59be73e501cb..77fa12ea95a486 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -938,6 +938,8 @@ Performance improvements - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`) - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`) - For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`) +- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`). + .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d3837617d231a0..96ce408a0ff8c6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4792,7 +4792,6 @@ def get_indexer_non_unique(self, target): return pself.get_indexer_non_unique(ptarget) if self.is_all_dates: - self = Index(self.asi8) tgt_values = target.asi8 else: tgt_values = target._ndarray_values diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0013df44614e86..47cf0f26f9ca5f 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,6 +9,7 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( + ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_float, @@ -618,7 +619,7 @@ def get_value(self, series, key): elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal return com.maybe_box( - self, self._engine.get_value(s, key), series, key + self, self._int64index.get_value(s, key), series, key ) else: raise KeyError(key) @@ -627,7 +628,7 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._engine.get_value(s, key), series, key) + return com.maybe_box(self, self._int64index.get_value(s, key), series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): @@ -648,6 +649,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): tolerance = self._convert_tolerance(tolerance, target) return Index.get_indexer(self._int64index, target, method, limit, tolerance) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + + if isinstance(target, PeriodIndex): + target = target.asi8 + if hasattr(target, "freq") and target.freq != self.freq: + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=target.freqstr, + ) + raise IncompatibleFrequency(msg) + + indexer, missing = self._int64index.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + def _get_unique_index(self, dropna=False): """ wrap Index._get_unique_index to handle NaT @@ -954,6 +972,12 @@ def base(self): ) return np.asarray(self._data) + def memory_usage(self, deep=False): + result = super().memory_usage(deep=deep) + if hasattr(self, "_cache") and "_int64index" in self._cache: + result += self._int64index.memory_usage(deep=deep) + return result + PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 
721ea2b6e66324..61a9909926efea 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -604,6 +604,30 @@ def test_indexing_over_size_cutoff(): _index._SIZE_CUTOFF = old_cutoff +def test_indexing_over_size_cutoff_period_index(): + # GH 27136 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + n = 1100 + idx = pd.period_range("1/1/2000", freq="T", periods=n) + assert idx._engine.over_size_threshold + + s = pd.Series(np.random.randn(len(idx)), index=idx) + + pos = n - 1 + timestamp = idx[pos] + assert timestamp in s.index + + # it works! + s[timestamp] + assert len(s.loc[[timestamp]]) > 0 + finally: + _index._SIZE_CUTOFF = old_cutoff + + def test_indexing_unordered(): # GH 2437 rng = date_range(start="2011-01-01", end="2011-01-15") From 6bc1cf4b5518b4cd909ea1f3b1663bf35ed2ec0f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jul 2019 09:13:39 -0500 Subject: [PATCH 173/238] Remove unused SparseArray code (#27269) --- pandas/core/arrays/sparse.py | 1 - pandas/core/ops/__init__.py | 47 ------------------------------------ 2 files changed, 48 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 2332da46574c5f..65976021f5053d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1774,7 +1774,6 @@ def sparse_arithmetic_method(self, other): else: other = np.asarray(other) with np.errstate(all="ignore"): - # TODO: delete sparse stuff in core/ops.py # TODO: look into _wrap_result if len(self) != len(other): raise AssertionError( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index df2907bf591ddf..545f98a02439ac 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -1401,12 +1401,6 @@ def _get_method_wrappers(cls): arith_special = _arith_method_SERIES comp_special = _comp_method_SERIES bool_special = _bool_method_SERIES - elif issubclass(cls, ABCSparseArray): - arith_flex = None - comp_flex = None - arith_special = _arith_method_SPARSE_ARRAY - comp_special = _arith_method_SPARSE_ARRAY - bool_special = _arith_method_SPARSE_ARRAY elif issubclass(cls, ABCDataFrame): # Same for DataFrame and SparseDataFrame arith_flex = _arith_method_FRAME @@ -2336,47 +2330,6 @@ def _sparse_series_op(left, right, op, name): return left._constructor(result, index=new_index, name=new_name) -def _arith_method_SPARSE_ARRAY(cls, op, special): - """ - Wrapper function for Series arithmetic operations, to avoid - code duplication. - """ - op_name = _get_op_name(op, special) - - def wrapper(self, other): - from pandas.core.arrays.sparse.array import ( - SparseArray, - _sparse_array_op, - _wrap_result, - _get_fill, - ) - - if isinstance(other, np.ndarray): - if len(self) != len(other): - raise AssertionError( - "length mismatch: {self} vs. 
{other}".format( - self=len(self), other=len(other) - ) - ) - if not isinstance(other, SparseArray): - dtype = getattr(other, "dtype", None) - other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) - return _sparse_array_op(self, other, op, op_name) - elif is_scalar(other): - with np.errstate(all="ignore"): - fill = op(_get_fill(self), np.asarray(other)) - result = op(self.sp_values, other) - - return _wrap_result(op_name, result, self.sp_index, fill) - else: # pragma: no cover - raise TypeError( - "operation with {other} not supported".format(other=type(other)) - ) - - wrapper.__name__ = op_name - return wrapper - - def maybe_dispatch_ufunc_to_dunder_op( self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any ): From acccdcc6c85d2331f1dd3ae14977ea861437f2d8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Jul 2019 16:44:49 +0100 Subject: [PATCH 174/238] TST/CLN: remove try block from test_indexing_over_size_cutoff_period_index (#27276) --- pandas/tests/series/indexing/test_datetime.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 61a9909926efea..e2f40c62674933 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -604,28 +604,24 @@ def test_indexing_over_size_cutoff(): _index._SIZE_CUTOFF = old_cutoff -def test_indexing_over_size_cutoff_period_index(): +def test_indexing_over_size_cutoff_period_index(monkeypatch): # GH 27136 - old_cutoff = _index._SIZE_CUTOFF - try: - _index._SIZE_CUTOFF = 1000 + monkeypatch.setattr(_index, "_SIZE_CUTOFF", 1000) - n = 1100 - idx = pd.period_range("1/1/2000", freq="T", periods=n) - assert idx._engine.over_size_threshold + n = 1100 + idx = pd.period_range("1/1/2000", freq="T", periods=n) + assert idx._engine.over_size_threshold - s = pd.Series(np.random.randn(len(idx)), index=idx) + s = pd.Series(np.random.randn(len(idx)), index=idx) - pos = n - 1 - timestamp = idx[pos] - assert timestamp in s.index + pos = n - 1 + timestamp = idx[pos] + assert timestamp in s.index - # it works! - s[timestamp] - assert len(s.loc[[timestamp]]) > 0 - finally: - _index._SIZE_CUTOFF = old_cutoff + # it works! 
+ s[timestamp] + assert len(s.loc[[timestamp]]) > 0 def test_indexing_unordered(): From 4dc2b69804221834bc2a48958733d1430fef216d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Jul 2019 02:08:59 +0100 Subject: [PATCH 175/238] TST/STYLE: concatenate string literals post black reformatting (#27281) --- pandas/tests/arithmetic/test_numeric.py | 2 +- pandas/tests/arithmetic/test_period.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 8 ++-- .../arrays/categorical/test_operators.py | 2 +- pandas/tests/arrays/test_integer.py | 2 +- pandas/tests/arrays/test_period.py | 2 +- pandas/tests/computation/test_eval.py | 6 +-- pandas/tests/extension/arrow/bool.py | 2 +- pandas/tests/extension/decimal/array.py | 4 +- .../tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/json/array.py | 2 +- pandas/tests/frame/test_alter_axes.py | 2 +- pandas/tests/frame/test_analytics.py | 10 ++--- pandas/tests/frame/test_api.py | 2 +- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_dtypes.py | 10 ++--- pandas/tests/frame/test_missing.py | 4 +- pandas/tests/frame/test_quantile.py | 4 +- pandas/tests/frame/test_sorting.py | 2 +- pandas/tests/frame/test_timeseries.py | 2 +- pandas/tests/generic/test_generic.py | 4 +- pandas/tests/groupby/test_function.py | 8 ++-- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_transform.py | 2 +- pandas/tests/groupby/test_whitelist.py | 4 +- .../indexes/datetimes/test_construction.py | 2 +- .../tests/indexes/datetimes/test_formats.py | 16 ++++---- pandas/tests/indexes/datetimes/test_tools.py | 2 +- .../tests/indexes/interval/test_interval.py | 4 +- pandas/tests/indexes/multi/test_analytics.py | 4 +- .../tests/indexes/multi/test_constructor.py | 2 +- pandas/tests/indexes/multi/test_contains.py | 2 +- pandas/tests/indexes/multi/test_indexing.py | 6 +-- .../tests/indexes/period/test_arithmetic.py | 6 +-- .../tests/indexes/period/test_construction.py | 6 +-- pandas/tests/indexes/period/test_formats.py | 16 +++----- pandas/tests/indexes/period/test_setops.py | 2 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/indexes/test_base.py | 8 ++-- pandas/tests/indexes/test_common.py | 2 +- pandas/tests/indexes/test_numeric.py | 6 +-- .../indexes/timedeltas/test_arithmetic.py | 6 +-- .../tests/indexes/timedeltas/test_formats.py | 24 ++++++------ pandas/tests/indexing/test_categorical.py | 6 +-- .../indexing/test_chaining_and_caching.py | 2 +- pandas/tests/indexing/test_coercion.py | 6 +-- pandas/tests/indexing/test_iloc.py | 6 +-- pandas/tests/indexing/test_scalar.py | 2 +- pandas/tests/internals/test_internals.py | 6 +-- pandas/tests/io/excel/test_style.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- .../tests/io/formats/test_eng_formatting.py | 8 +--- pandas/tests/io/formats/test_format.py | 38 +++++++++---------- pandas/tests/io/formats/test_style.py | 4 +- pandas/tests/io/formats/test_to_csv.py | 2 +- .../tests/io/json/test_json_table_schema.py | 2 +- pandas/tests/io/json/test_normalize.py | 4 +- pandas/tests/io/json/test_pandas.py | 4 +- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_compression.py | 4 +- pandas/tests/io/parser/test_header.py | 4 +- pandas/tests/io/parser/test_index_col.py | 5 +-- pandas/tests/io/parser/test_parse_dates.py | 4 +- pandas/tests/io/parser/test_textreader.py | 8 ++-- pandas/tests/io/parser/test_unsupported.py | 2 +- pandas/tests/io/parser/test_usecols.py | 6 +-- 
pandas/tests/io/pytables/test_pytables.py | 8 ++-- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_gbq.py | 4 +- pandas/tests/io/test_html.py | 4 +- pandas/tests/io/test_parquet.py | 4 +- pandas/tests/io/test_sql.py | 6 +-- pandas/tests/io/test_stata.py | 2 +- pandas/tests/plotting/test_datetimelike.py | 4 +- pandas/tests/plotting/test_frame.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 8 ++-- pandas/tests/reshape/test_melt.py | 4 +- pandas/tests/reshape/test_pivot.py | 8 ++-- pandas/tests/scalar/period/test_asfreq.py | 2 +- pandas/tests/scalar/period/test_period.py | 8 ++-- .../scalar/timedelta/test_construction.py | 5 +-- .../tests/scalar/timestamp/test_timezones.py | 4 +- .../tests/series/indexing/test_alter_index.py | 4 +- pandas/tests/series/indexing/test_boolean.py | 2 +- pandas/tests/series/test_analytics.py | 10 ++--- pandas/tests/series/test_missing.py | 12 ++---- pandas/tests/series/test_operators.py | 4 +- pandas/tests/series/test_sorting.py | 2 +- pandas/tests/series/test_timeseries.py | 6 +-- pandas/tests/sparse/frame/test_frame.py | 4 +- pandas/tests/sparse/series/test_indexing.py | 4 +- pandas/tests/sparse/series/test_series.py | 2 +- pandas/tests/sparse/test_indexing.py | 2 +- pandas/tests/test_multilevel.py | 10 ++--- pandas/tests/test_strings.py | 14 +++---- pandas/tests/test_window.py | 7 ++-- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tests/tslibs/test_parsing.py | 14 +++---- pandas/tests/util/test_assert_almost_equal.py | 2 +- pandas/tests/util/test_deprecate.py | 5 +-- 101 files changed, 227 insertions(+), 295 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f582bf8b13975b..8179ab08895da5 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -469,7 +469,7 @@ class TestMultiplicationDivision: pytest.param( pd.Index, marks=pytest.mark.xfail( - reason="Index.__div__ always " "raises", raises=TypeError + reason="Index.__div__ always raises", raises=TypeError ), ), pd.Series, diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index bd21335a7f9c76..e54c16c7a27a4a 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -960,7 +960,7 @@ def test_add_iadd_timedeltalike_annual(self): def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq rng = pd.period_range("2014", "2024", freq="A") - msg = "Input has different freq(=.+)? " "from Period.*?\\(freq=A-DEC\\)" + msg = "Input has different freq(=.+)? 
from Period.*?\\(freq=A-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 06c4a6ece4bcce..326c565308124a 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -318,19 +318,19 @@ def _check(result, expected): _check(result, expected) # tz mismatches - msg = "Timestamp subtraction must have the same timezones or no" " timezones" + msg = "Timestamp subtraction must have the same timezones or no timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt_tz - dt - msg = "Timestamp subtraction must have the same timezones or no" " timezones" + msg = "Timestamp subtraction must have the same timezones or no timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts_tz2 msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt - dt_tz - msg = "Timestamp subtraction must have the same timezones or no" " timezones" + msg = "Timestamp subtraction must have the same timezones or no timezones" with pytest.raises(TypeError, match=msg): ts - dt_tz with pytest.raises(TypeError, match=msg): @@ -1771,7 +1771,7 @@ def test_td64arr_floordiv_int(self, box_with_array): result = idx // 1 tm.assert_equal(result, idx) - pattern = "floor_divide cannot use operands|" "Cannot divide int by Timedelta*" + pattern = "floor_divide cannot use operands|Cannot divide int by Timedelta*" with pytest.raises(TypeError, match=pattern): 1 // idx diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index cd8ec7fcb787d8..697ee483db6d96 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -313,7 +313,7 @@ def test_unordered_different_categories_raises(self): c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False) c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False) - with pytest.raises(TypeError, match=("Categoricals can " "only be compared")): + with pytest.raises(TypeError, match=("Categoricals can only be compared")): c1 == c2 def test_compare_different_lengths(self): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index c01b52456ff877..dfdb08fa78cbc9 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -88,7 +88,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n" "[1, NaN, 3]\n" "Length: 3, dtype: Int64" + expected = "\n[1, NaN, 3]\nLength: 3, dtype: Int64" assert result == expected diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index fab59d312fb9d2..252f278242fcc3 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -259,7 +259,7 @@ def test_repr_small(): arr = period_array(["2000", "2001"], freq="D") result = str(arr) expected = ( - "\n" "['2000-01-01', '2001-01-01']\n" "Length: 2, dtype: period[D]" + "\n['2000-01-01', '2001-01-01']\nLength: 2, dtype: period[D]" ) assert result == expected diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 2fd7c8f04c8bec..37a885e33847f1 100644 --- 
a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -92,7 +92,7 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): return c(lhs, rhs) except ValueError as e: if str(e).startswith( - "negative number cannot be " "raised to a fractional power" + "negative number cannot be raised to a fractional power" ): return np.nan raise @@ -362,7 +362,7 @@ def get_expected_pow_result(self, lhs, rhs): expected = _eval_single_bin(lhs, "**", rhs, self.engine) except ValueError as e: if str(e).startswith( - "negative number cannot be " "raised to a fractional power" + "negative number cannot be raised to a fractional power" ): if self.engine == "python": pytest.skip(str(e)) @@ -1944,7 +1944,7 @@ def test_empty_string_raises(engine, parser): def test_more_than_one_expression_raises(engine, parser): - with pytest.raises(SyntaxError, match=("only a single expression " "is allowed")): + with pytest.raises(SyntaxError, match=("only a single expression is allowed")): pd.eval("1 + 1; 2 + 2", engine=engine, parser=parser) diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index ee043a6bb837c5..eb75d6d968073b 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/bool.py @@ -33,7 +33,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) @classmethod def construct_array_type(cls): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 90e6a91fbb91af..c28ff956a33a46 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -40,7 +40,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) @property def _is_numeric(self): @@ -172,7 +172,7 @@ def _reduce(self, name, skipna=True, **kwargs): op = getattr(self.data, name) except AttributeError: raise NotImplementedError( - "decimal does not support " "the {} operation".format(name) + "decimal does not support the {} operation".format(name) ) return op(axis=0) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 272936f6ec9f01..9dec023f4073aa 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -207,7 +207,7 @@ def test_series_repr(self, data): # TODO(extension) @pytest.mark.xfail( reason=( - "raising AssertionError as this is not implemented, " "though easy enough to do" + "raising AssertionError as this is not implemented, though easy enough to do" ) ) def test_series_constructor_coerce_data_to_extension_dtype_raises(): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ece1924b1b2281..21c4ac8f055a25 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -44,7 +44,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) class JSONArray(ExtensionArray): diff --git a/pandas/tests/frame/test_alter_axes.py 
b/pandas/tests/frame/test_alter_axes.py index 2ce65bd15387ee..912e8b5fba2336 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1304,7 +1304,7 @@ def test_rename_axis_style_raises(self): df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"]) # Named target and axis - over_spec_msg = "Cannot specify both 'axis' and " "any of 'index' or 'columns'" + over_spec_msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): df.rename(index=str.lower, axis=1) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 13ffa8d17d47c2..d5c66f0c1dd641 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -363,9 +363,7 @@ def test_corr_cov_independent_index_column(self): def test_corr_invalid_method(self): # GH 22298 df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = ( - "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " - ) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " with pytest.raises(ValueError, match=msg): df.corr(method="____") @@ -1441,7 +1439,7 @@ def test_mean_datetimelike(self): tm.assert_series_equal(result, expected) @pytest.mark.xfail( - reason="casts to object-dtype and then tries to " "add timestamps", + reason="casts to object-dtype and then tries to add timestamps", raises=TypeError, strict=True, ) @@ -1643,7 +1641,7 @@ def test_idxmin(self, float_frame, int_frame): expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type" " " + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) @@ -1658,7 +1656,7 @@ def test_idxmax(self, float_frame, int_frame): expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type" " " + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 93508d7ddc50bd..76a210e129eb3b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -149,7 +149,7 @@ def test_not_hashable(self): empty_frame = DataFrame() df = self.klass([1]) - msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be" " hashed" + msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be hashed" with pytest.raises(TypeError, match=msg): hash(df) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 37b0d61ee31d9b..1b6ee91317996a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -286,7 +286,7 @@ def f(dtype): data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9)) return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype) - msg = "compound dtypes are not implemented in the DataFrame" " constructor" + msg = "compound dtypes are not implemented in the DataFrame constructor" with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a16ca7045cfddd..349e2d9c578be3 100644 --- a/pandas/tests/frame/test_constructors.py +++ 
b/pandas/tests/frame/test_constructors.py @@ -474,7 +474,7 @@ def test_constructor_error_msgs(self): with pytest.raises(ValueError, match=msg): DataFrame((range(10), range(10, 20)), columns=("ones", "twos")) - msg = "If using all scalar " "values, you must pass " "an index" + msg = "If using all scalar values, you must pass an index" with pytest.raises(ValueError, match=msg): DataFrame({"a": False, "b": True}) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ba6a9d2aa6ee9f..00be13b1c0e72a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -686,9 +686,7 @@ def test_astype_dict_like(self, dtype_class): # in the keys of the dtype dict dt4 = dtype_class({"b": str, 2: str}) dt5 = dtype_class({"e": str}) - msg = ( - "Only a column name can be used for the key in a dtype mappings" " argument" - ) + msg = "Only a column name can be used for the key in a dtype mappings argument" with pytest.raises(KeyError, match=msg): df.astype(dt4) with pytest.raises(KeyError, match=msg): @@ -1194,11 +1192,11 @@ def test_astype_str(self, timezone_frame): with option_context("display.max_columns", 20): result = str(timezone_frame) assert ( - "0 2013-01-01 2013-01-01 00:00:00-05:00 " "2013-01-01 00:00:00+01:00" + "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" ) in result assert ( - "1 2013-01-02 " "NaT NaT" + "1 2013-01-02 NaT NaT" ) in result assert ( - "2 2013-01-03 2013-01-03 00:00:00-05:00 " "2013-01-03 00:00:00+01:00" + "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index c63a5ba64495f5..94667ecfa837d0 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -126,7 +126,7 @@ def test_dropna(self): assert_frame_equal(dropped, expected) # bad input - msg = "No axis named 3 for object type" " " + msg = "No axis named 3 for object type " with pytest.raises(ValueError, match=msg): df.dropna(axis=3) @@ -362,7 +362,7 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - with pytest.raises(ValueError, match=("fill value must " "be in categories")): + with pytest.raises(ValueError, match=("fill value must be in categories")): df.fillna(value={"cats": 4, "vals": "c"}) res = df.fillna(method="pad") diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index bbb3395fb23afd..236cadf67735d2 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -90,9 +90,7 @@ def test_quantile_axis_parameter(self): result = df.quantile(0.5, axis="columns") assert_series_equal(result, expected) - msg = ( - "No axis named -1 for object type" " " - ) + msg = "No axis named -1 for object type " with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) msg = ( diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b6442d89388436..24833f8c02df09 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -55,7 +55,7 @@ def test_sort_values(self): sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) assert_frame_equal(sorted_df, expected) - msg = "No axis named 2 for object type" " " + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): frame.sort_values(by=["A", "B"], axis=2, inplace=True) diff --git 
a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 92801b02dee224..1ca8333154c130 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -901,7 +901,7 @@ def test_frame_to_period(self): pts = df.to_period("M", axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) - msg = "No axis named 2 for object type" " " + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): df.to_period(axis=2) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index aef6c3fe8070c4..b2b38980d0ceb2 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -724,10 +724,10 @@ def test_squeeze(self): tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): df.squeeze(axis=2) - msg = "No axis named x for object type " + msg = "No axis named x for object type " with pytest.raises(ValueError, match=msg): df.squeeze(axis="x") diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 68e3db3a1ccb04..efc3142b25b829 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -92,7 +92,7 @@ def test_builtins_apply(keys, f): result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = "invalid frame shape: {} " "(expected ({}, 3))".format( + assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( result.shape, ngroups ) assert result.shape == (ngroups, 3), assert_msg @@ -1220,7 +1220,7 @@ def test_size_groupby_all_null(): def test_quantile(interpolation, a_vals, b_vals, q): if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: pytest.skip( - "Unclear numpy expectation for nearest result with " "equidistant data" + "Unclear numpy expectation for nearest result with equidistant data" ) a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) @@ -1243,9 +1243,7 @@ def test_quantile_raises(): [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] ) - with pytest.raises( - TypeError, match="cannot be performed against " "'object' dtypes" - ): + with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): df.groupby("key").quantile() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 1fd67caadf2e4e..72099f2fa3f11d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -378,7 +378,7 @@ def test_groupby_grouper_f_sanity_checked(self): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): - msg = "Grouper for ''" " not 1-dimensional" + msg = "Grouper for '' not 1-dimensional" with pytest.raises(ValueError, match=msg): Grouping(df.index, df[["A", "A"]]) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 705e4080cf34e3..1eab3ba253f4dd 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -768,7 +768,7 @@ def test_transform_with_non_scalar_group(): @pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) def test_transform_numeric_ret(cols, exp, comp_func, agg_func): if agg_func == "size" and isinstance(cols, 
list): - pytest.xfail("'size' transformation not supported with " "NDFrameGroupy") + pytest.xfail("'size' transformation not supported with NDFrameGroupy") # GH 19200 df = pd.DataFrame( diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 03e10ff44c2990..ee380c6108c388 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -232,9 +232,7 @@ def test_groupby_blacklist(df_letters): blacklist.extend(to_methods) # e.g., to_csv - defined_but_not_allowed = ( - "(?:^Cannot.+{0!r}.+{1!r}.+try using the " "'apply' method$)" - ) + defined_but_not_allowed = "(?:^Cannot.+{0!r}.+{1!r}.+try using the 'apply' method$)" # e.g., query, eval not_defined = "(?:^{1!r} object has no attribute {0!r}$)" diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index f22c820253ee58..6708feda7dd1e8 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -663,7 +663,7 @@ def test_constructor_dtype(self): DatetimeIndex(idx, dtype="datetime64[ns]") # this is effectively trying to convert tz's - msg = "data is already tz-aware US/Eastern, unable to set specified" " tz: CET" + msg = "data is already tz-aware US/Eastern, unable to set specified tz: CET" with pytest.raises(TypeError, match=msg): DatetimeIndex(idx, dtype="datetime64[ns, CET]") msg = "cannot supply both a tz and a dtype with a tz" diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index f0aae748092e32..33a744cc25ca19 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -83,8 +83,8 @@ def test_dti_representation(self, method): ) exp = [] - exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") - exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', " "freq='D')") + exp.append("DatetimeIndex([], dtype='datetime64[ns]', freq='D')") + exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D')") exp.append( "DatetimeIndex(['2011-01-01', '2011-01-02'], " "dtype='datetime64[ns]', freq='D')" @@ -132,9 +132,9 @@ def test_dti_representation_to_series(self): exp1 = """Series([], dtype: datetime64[ns])""" - exp2 = "0 2011-01-01\n" "dtype: datetime64[ns]" + exp2 = "0 2011-01-01\ndtype: datetime64[ns]" - exp3 = "0 2011-01-01\n" "1 2011-01-02\n" "dtype: datetime64[ns]" + exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" exp4 = ( "0 2011-01-01\n" @@ -186,13 +186,13 @@ def test_dti_summary(self): ["2011-01-01 09:00", "2011-01-01 10:00", pd.NaT], tz="US/Eastern" ) - exp1 = "DatetimeIndex: 0 entries\n" "Freq: D" + exp1 = "DatetimeIndex: 0 entries\nFreq: D" - exp2 = "DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\n" "Freq: D" + exp2 = "DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\nFreq: D" - exp3 = "DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\n" "Freq: D" + exp3 = "DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\nFreq: D" - exp4 = "DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\n" "Freq: D" + exp4 = "DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\nFreq: D" exp5 = ( "DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index ec4310dbc8396b..10d422e8aa52cf 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ 
b/pandas/tests/indexes/datetimes/test_tools.py @@ -352,7 +352,7 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): [ "ISO year directive '%G' must be used with the ISO week directive " "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", - "1999 " "Monday", + "1999 Monday", "%G %A", ], [ diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index dfe3a97ec9b903..962ed2b1cf8ed3 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -389,7 +389,7 @@ def test_frame_repr(self): {"A": [1, 2, 3, 4]}, index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]) ) result = repr(df) - expected = " A\n" "(0, 1] 1\n" "(1, 2] 2\n" "(2, 3] 3\n" "(3, 4] 4" + expected = " A\n(0, 1] 1\n(1, 2] 2\n(2, 3] 3\n(3, 4] 4" assert result == expected @pytest.mark.parametrize( @@ -406,7 +406,7 @@ def test_frame_repr(self): ), ( pd.DataFrame, - (" 0\n" "(0.0, 1.0] a\n" "NaN b\n" "(2.0, 3.0] c"), + (" 0\n(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c"), ), ], ) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 7f5d57db8da886..36152bc4b60cda 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -212,9 +212,7 @@ def test_take_fill_value(): expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) - msg = ( - "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" - ) + msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1" with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 1b6177ede30ec9..86c9ee3455d0bb 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -454,7 +454,7 @@ def test_from_product_empty_three_levels(N): "invalid_input", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] ) def test_from_product_invalid_input(invalid_input): - msg = r"Input must be a list / sequence of iterables|" "Input must be list-like" + msg = r"Input must be a list / sequence of iterables|Input must be list-like" with pytest.raises(TypeError, match=msg): MultiIndex.from_product(iterables=invalid_input) diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 21b71613f00f0d..64d2859cd13db4 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -81,7 +81,7 @@ def test_isin_level_kwarg(): msg = "Too many levels: Index has only 2 levels, not 6" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=5) - msg = "Too many levels: Index has only 2 levels, -5 is not a valid level" " number" + msg = "Too many levels: Index has only 2 levels, -5 is not a valid level number" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=-5) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 75dea68eadbf7d..d366dbd8bc0a8c 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -187,9 +187,7 @@ def test_get_indexer(): def test_get_indexer_nearest(): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) - msg = ( - 
"method='nearest' not implemented yet for MultiIndex; see GitHub" " issue 9365" - ) + msg = "method='nearest' not implemented yet for MultiIndex; see GitHub issue 9365" with pytest.raises(NotImplementedError, match=msg): midx.get_indexer(["a"], method="nearest") msg = "tolerance not implemented yet for MultiIndex" @@ -275,7 +273,7 @@ def test_get_loc(idx): with pytest.raises(KeyError, match=r"^'quux'$"): idx.get_loc("quux") - msg = "only the default get_loc method is currently supported for" " MultiIndex" + msg = "only the default get_loc method is currently supported for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.get_loc("foo", method="nearest") diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 1057ca7bbd6629..80e4b1fe1e4300 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -72,19 +72,19 @@ def test_shift_corner_cases(self): tm.assert_index_equal(idx.shift(3), idx) idx = pd.PeriodIndex( - ["2011-01-01 10:00", "2011-01-01 11:00" "2011-01-01 12:00"], + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", freq="H", ) tm.assert_index_equal(idx.shift(0), idx) exp = pd.PeriodIndex( - ["2011-01-01 13:00", "2011-01-01 14:00" "2011-01-01 15:00"], + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", freq="H", ) tm.assert_index_equal(idx.shift(3), exp) exp = pd.PeriodIndex( - ["2011-01-01 07:00", "2011-01-01 08:00" "2011-01-01 09:00"], + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", freq="H", ) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 7c10239faad420..eab55b91b3e606 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -394,15 +394,15 @@ def test_constructor_freq_mult(self, func, warning): ) tm.assert_index_equal(pidx, expected) - msg = "Frequency must be positive, because it" " represents span: -1M" + msg = "Frequency must be positive, because it represents span: -1M" with pytest.raises(ValueError, match=msg): PeriodIndex(["2011-01"], freq="-1M") - msg = "Frequency must be positive, because it" " represents span: 0M" + msg = "Frequency must be positive, because it represents span: 0M" with pytest.raises(ValueError, match=msg): PeriodIndex(["2011-01"], freq="0M") - msg = "Frequency must be positive, because it" " represents span: 0M" + msg = "Frequency must be positive, because it represents span: 0M" with pytest.raises(ValueError, match=msg): period_range("2011-01", periods=3, freq="0M") diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index c5566f74af11e4..2a88b79f381c47 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -48,7 +48,7 @@ class TestPeriodIndexRendering: def test_frame_repr(self): df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.date_range("2000", periods=3)) result = repr(df) - expected = " A\n" "2000-01-01 1\n" "2000-01-02 2\n" "2000-01-03 3" + expected = " A\n2000-01-01 1\n2000-01-02 2\n2000-01-03 3" assert result == expected @pytest.mark.parametrize("method", ["__repr__", "__str__"]) @@ -65,13 +65,11 @@ def test_representation(self, method): idx9 = pd.period_range("2013Q1", periods=3, freq="Q") idx10 = PeriodIndex(["2011-01-01", "2011-02-01"], freq="3D") - exp1 = """PeriodIndex([], dtype='period[D]', 
freq='D')""" + exp1 = "PeriodIndex([], dtype='period[D]', freq='D')" - exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" + exp2 = "PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')" - exp3 = ( - "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " "freq='D')" - ) + exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', freq='D')" exp4 = ( "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " @@ -88,11 +86,9 @@ def test_representation(self, method): "dtype='period[H]', freq='H')" ) - exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " "freq='Q-DEC')" + exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', freq='Q-DEC')" - exp8 = ( - "PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " "freq='Q-DEC')" - ) + exp8 = "PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', freq='Q-DEC')" exp9 = ( "PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 94b061330002fd..03e4bd5834166f 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -91,7 +91,7 @@ def test_union(self, sort): ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="T" ) other5 = pd.PeriodIndex( - ["2000-01-01 09:01", "2000-01-01 09:05" "2000-01-01 09:08"], freq="T" + ["2000-01-01 09:01", "2000-01-01 09:05", "2000-01-01 09:08"], freq="T" ) expected5 = pd.PeriodIndex( [ diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index e52954a1145788..1db2c5c3a8dac3 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -270,7 +270,7 @@ def test_to_timestamp_pi_nat(self): tm.assert_index_equal(result3, exp) assert result3.freqstr == "3M" - msg = "Frequency must be positive, because it" " represents span: -2A" + msg = "Frequency must be positive, because it represents span: -2A" with pytest.raises(ValueError, match=msg): result.to_period(freq="-2A") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f0382a040e0631..e75d80bec1fdfb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -564,7 +564,7 @@ def test_constructor_overflow_int64(self): with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") - @pytest.mark.xfail(reason="see GH#21311: Index " "doesn't enforce dtype argument") + @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument") def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): @@ -728,9 +728,7 @@ def test_nanosecond_index_access(self): # this does not yet work, as parsing strings is done via dateutil # assert first_value == x['2013-01-01 00:00:00.000000050+0000'] - expected_ts = np_datetime64_compat( - "2013-01-01 00:00:00.000000050+" "0000", "ns" - ) + expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self): @@ -2361,7 +2359,7 @@ def test_string_index_repr(self, index, expected): # short ( pd.Index(["あ", "いい", "ううう"]), - ("Index(['あ', 'いい', 'ううう'], " "dtype='object')"), + ("Index(['あ', 'いい', 'ううう'], dtype='object')"), ), # multiple lines ( diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 465b7f5e23bb8a..b9bdaf40f85890 100644 --- 
a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -348,7 +348,7 @@ def test_has_duplicates(self, indices): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates. # RangeIndex is unique by definition. - pytest.skip("Skip check for empty Index, MultiIndex, " "and RangeIndex") + pytest.skip("Skip check for empty Index, MultiIndex, and RangeIndex") idx = holder([indices[0]] * 5) assert idx.is_unique is False diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 1feb82a923b197..f246307e63e3b2 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -172,7 +172,7 @@ def test_constructor_invalid(self): ) with pytest.raises(TypeError, match=msg): Float64Index(["a", "b", 0.0]) - msg = r"float\(\) argument must be a string or a number, not" " 'Timestamp'" + msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) @@ -569,9 +569,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = ("Unable to fill values because " "{name} cannot contain NA").format( - name=name - ) + msg = "Unable to fill values because {name} cannot contain NA".format(name=name) # fill_value=True with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 0f51a6333ab2d4..4544657f79af77 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -215,9 +215,7 @@ def test_ops_ndarray(self): msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td + np.array([1]) - msg = ( - r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" " 'Timedelta'" - ) + msg = r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'" with pytest.raises(TypeError, match=msg): np.array([1]) + td @@ -227,7 +225,7 @@ def test_ops_ndarray(self): msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td - np.array([1]) - msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" " 'Timedelta'" + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timedelta'" with pytest.raises(TypeError, match=msg): np.array([1]) - td diff --git a/pandas/tests/indexes/timedeltas/test_formats.py b/pandas/tests/indexes/timedeltas/test_formats.py index ebc5f720d46fb1..1dfc5b5305008a 100644 --- a/pandas/tests/indexes/timedeltas/test_formats.py +++ b/pandas/tests/indexes/timedeltas/test_formats.py @@ -13,13 +13,11 @@ def test_representation(self, method): idx4 = TimedeltaIndex(["1 days", "2 days", "3 days"], freq="D") idx5 = TimedeltaIndex(["1 days 00:00:01", "2 days", "3 days"]) - exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" + exp1 = "TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')" - exp2 = "TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " "freq='D')" + exp2 = "TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', freq='D')" - exp3 = ( - "TimedeltaIndex(['1 days', '2 days'], " "dtype='timedelta64[ns]', freq='D')" - ) + exp3 = "TimedeltaIndex(['1 days', '2 days'], dtype='timedelta64[ns]', freq='D')" exp4 = ( "TimedeltaIndex(['1 days', '2 days', '3 days'], " @@ -47,11 +45,11 @@ def test_representation_to_series(self): exp1 = """Series([], 
dtype: timedelta64[ns])""" - exp2 = "0 1 days\n" "dtype: timedelta64[ns]" + exp2 = "0 1 days\ndtype: timedelta64[ns]" - exp3 = "0 1 days\n" "1 2 days\n" "dtype: timedelta64[ns]" + exp3 = "0 1 days\n1 2 days\ndtype: timedelta64[ns]" - exp4 = "0 1 days\n" "1 2 days\n" "2 3 days\n" "dtype: timedelta64[ns]" + exp4 = "0 1 days\n1 2 days\n2 3 days\ndtype: timedelta64[ns]" exp5 = ( "0 1 days 00:00:01\n" @@ -75,15 +73,15 @@ def test_summary(self): idx4 = TimedeltaIndex(["1 days", "2 days", "3 days"], freq="D") idx5 = TimedeltaIndex(["1 days 00:00:01", "2 days", "3 days"]) - exp1 = "TimedeltaIndex: 0 entries\n" "Freq: D" + exp1 = "TimedeltaIndex: 0 entries\nFreq: D" - exp2 = "TimedeltaIndex: 1 entries, 1 days to 1 days\n" "Freq: D" + exp2 = "TimedeltaIndex: 1 entries, 1 days to 1 days\nFreq: D" - exp3 = "TimedeltaIndex: 2 entries, 1 days to 2 days\n" "Freq: D" + exp3 = "TimedeltaIndex: 2 entries, 1 days to 2 days\nFreq: D" - exp4 = "TimedeltaIndex: 3 entries, 1 days to 3 days\n" "Freq: D" + exp4 = "TimedeltaIndex: 3 entries, 1 days to 3 days\nFreq: D" - exp5 = "TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " "00:00:00" + exp5 = "TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days 00:00:00" for idx, expected in zip( [idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5] diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 3549d81623e107..0dccf023c66f8d 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -378,7 +378,7 @@ def test_loc_listlike_dtypes(self): exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include " "values that are in the categories" + msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] @@ -401,7 +401,7 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values " "that are in the categories" + msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] @@ -431,7 +431,7 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values " "that are in the categories" + msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 7d47063623d87b..702bf0b15dec98 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -307,7 +307,7 @@ def test_setting_with_copy_bug(self): ) mask = pd.isna(df.c) - msg = "A value is trying to be set on a copy of a slice from a" " DataFrame" + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): df[["c"]][mask] = df[["b"]][mask] diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index a18f8380f80c1d..dea1d5114f1b94 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -694,7 +694,7 @@ def test_where_index_datetime(self): assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, 
False]) - msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" + msg = "Index\\(\\.\\.\\.\\) must be called with a collection of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) @@ -725,7 +725,7 @@ def test_where_index_datetimetz(self): assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, False]) - msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" + msg = "Index\\(\\.\\.\\.\\) must be called with a collection of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) @@ -1031,7 +1031,7 @@ def test_replace_series(self, how, to_key, from_key): # TODO(jbrockmendel) commented out to only have a single xfail printed @pytest.mark.xfail( - reason="GH #18376, tzawareness-compat bug " "in BlockManager.replace_list" + reason="GH #18376, tzawareness-compat bug in BlockManager.replace_list" ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', ['timedelta64[ns]', 'bool', 'object', diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 760d8c70b94342..60a6a509c0912e 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -668,11 +668,11 @@ def test_iloc_mask(self): # GH 3631, iloc with a mask (of a series) should raise df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) mask = df.a % 2 == 0 - msg = "iLocation based boolean indexing cannot use an indexable as" " a mask" + msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] mask.index = range(len(mask)) - msg = "iLocation based boolean indexing on an integer type is not" " available" + msg = "iLocation based boolean indexing on an integer type is not available" with pytest.raises(NotImplementedError, match=msg): df.iloc[mask] @@ -693,7 +693,7 @@ def test_iloc_mask(self): ("index", ""): "0b11", ("index", ".loc"): "0b11", ("index", ".iloc"): ( - "iLocation based boolean indexing " "cannot use an indexable as a mask" + "iLocation based boolean indexing cannot use an indexable as a mask" ), ("locs", ""): "Unalignable boolean Series provided as indexer " "(index of the boolean Series and of the indexed " diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index a6e1273a229dc7..e6ccee684b76b6 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -147,7 +147,7 @@ def test_at_to_fail(self): s = Series([1, 2, 3], index=[3, 2, 1]) result = s.at[1] assert result == 3 - msg = "At based indexing on an integer index can only have integer" " indexers" + msg = "At based indexing on an integer index can only have integer indexers" with pytest.raises(ValueError, match=msg): s.at["a"] diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 6beb847da3eb49..2d4fb87d0c6bfb 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -537,7 +537,7 @@ def test_astype(self): assert tmgr.get("e").dtype.type == t # mixed - mgr = create_mgr("a,b: object; c: bool; d: datetime;" "e: f4; f: f2; g: f8") + mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") for t in ["float16", "float32", "float64", "int32", "int64"]: t = np.dtype(t) tmgr = mgr.astype(t, errors="ignore") @@ -599,7 +599,7 @@ def _compare(old_mgr, new_mgr): assert new_mgr.get("g").dtype == np.float64 mgr = 
create_mgr( - "a,b,foo: object; f: i4; bool: bool; dt: datetime;" "i: i8; g: f8; h: f2" + "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2" ) mgr.set("a", np.array(["1"] * N, dtype=np.object_)) mgr.set("b", np.array(["2."] * N, dtype=np.object_)) @@ -703,7 +703,7 @@ def test_reindex_index(self): def test_reindex_items(self): # mgr is not consolidated, f8 & f8-2 blocks - mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8;" "f: bool; g: f8-2") + mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2") reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 2 diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 76b27bce11b085..7ee84077a53341 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -14,7 +14,7 @@ pytest.param( "xlwt", marks=pytest.mark.xfail( - reason="xlwt does not support " "openpyxl-compatible " "style dicts" + reason="xlwt does not support openpyxl-compatible style dicts" ), ), "xlsxwriter", diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index cf26b20e5d0042..0908ed885a6ca9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -172,7 +172,7 @@ def test_excel_multindex_roundtrip( with ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: pytest.skip( - "Column index name cannot be " "serialized unless it's a MultiIndex" + "Column index name cannot be serialized unless it's a MultiIndex" ) # Empty name case current read in as diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index b122e4f6c3f33e..d2a2d0a6a97069 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -24,16 +24,12 @@ def test_eng_float_formatter(self): fmt.set_eng_float_format(use_eng_prefix=True) result = df.to_string() - expected = ( - " A\n" "0 1.410\n" "1 141.000\n" "2 14.100k\n" "3 1.410M" - ) + expected = " A\n0 1.410\n1 141.000\n2 14.100k\n3 1.410M" assert result == expected fmt.set_eng_float_format(accuracy=0) result = df.to_string() - expected = ( - " A\n" "0 1E+00\n" "1 141E+00\n" "2 14E+03\n" "3 1E+06" - ) + expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06" assert result == expected tm.reset_display_options() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index af862b11c756c4..818bbc566aca89 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -374,7 +374,7 @@ def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: pytest.skip( - "terminal size too small, " "{0} x {1}".format(term_width, term_height) + "terminal size too small, {0} x {1}".format(term_width, term_height) ) def mkframe(n): @@ -1409,11 +1409,11 @@ def test_to_string_no_index(self): df_s = df.to_string(index=False) # Leading space is expected for positive numbers. 
- expected = " x y z\n" " 11 33 AAA\n" " 22 -44 " + expected = " x y z\n 11 33 AAA\n 22 -44 " assert df_s == expected df_s = df[["y", "x", "z"]].to_string(index=False) - expected = " y x z\n" " 33 11 AAA\n" "-44 22 " + expected = " y x z\n 33 11 AAA\n-44 22 " assert df_s == expected def test_to_string_line_width_no_index(self): @@ -1475,7 +1475,7 @@ def test_to_string_float_formatting(self): df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string() - expected = " x\n" "0 3234.000\n" "1 0.253" + expected = " x\n0 3234.000\n1 0.253" assert df_s == expected tm.reset_display_options() @@ -1485,9 +1485,9 @@ def test_to_string_float_formatting(self): df_s = df.to_string() if _three_digit_exp(): - expected = " x\n" "0 1.000000e+009\n" "1 2.512000e-001" + expected = " x\n0 1.000000e+009\n1 2.512000e-001" else: - expected = " x\n" "0 1.000000e+09\n" "1 2.512000e-01" + expected = " x\n0 1.000000e+09\n1 2.512000e-01" assert df_s == expected def test_to_string_float_format_no_fixed_width(self): @@ -1526,14 +1526,14 @@ def test_to_string_small_float_values(self): # but not all exactly zero df = df * 0 result = df.to_string() - expected = " 0\n" "0 0\n" "1 0\n" "2 -0" + expected = " 0\n0 0\n1 0\n2 -0" def test_to_string_float_index(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.arange(5), index=index) result = df.to_string() - expected = " 0\n" "1.5 0\n" "2.0 1\n" "3.0 2\n" "4.0 3\n" "5.0 4" + expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" assert result == expected def test_to_string_complex_float_formatting(self): @@ -1562,7 +1562,7 @@ def test_to_string_ascii_error(self): "0 ", " .gitignore ", " 5 ", - " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80" "\xa2\xe2\x80\xa2\xe2\x80\xa2", + " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2", ) ] df = DataFrame(data) @@ -1575,7 +1575,7 @@ def test_to_string_int_formatting(self): assert issubclass(df["x"].dtype.type, np.integer) output = df.to_string() - expected = " x\n" "0 -15\n" "1 20\n" "2 25\n" "3 -35" + expected = " x\n0 -15\n1 20\n2 25\n3 -35" assert output == expected def test_to_string_index_formatter(self): @@ -1596,7 +1596,7 @@ def test_to_string_left_justify_cols(self): tm.reset_display_options() df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string(justify="left") - expected = " x \n" "0 3234.000\n" "1 0.253" + expected = " x \n0 3234.000\n1 0.253" assert df_s == expected def test_to_string_format_na(self): @@ -2077,7 +2077,7 @@ def test_to_string(self): result = cp.to_string(length=True, name=True, dtype=True) last_line = result.split("\n")[-1].strip() assert last_line == ( - "Freq: B, Name: foo, " "Length: {cp}, dtype: float64".format(cp=len(cp)) + "Freq: B, Name: foo, Length: {cp}, dtype: float64".format(cp=len(cp)) ) def test_freq_name_separation(self): @@ -2136,22 +2136,18 @@ def test_east_asian_unicode_series(self): # unicode index s = Series(["a", "bb", "CCC", "D"], index=["あ", "いい", "ううう", "ええええ"]) - expected = ( - "あ a\nいい bb\nううう CCC\n" "ええええ D\ndtype: object" - ) + expected = "あ a\nいい bb\nううう CCC\nええええ D\ndtype: object" assert repr(s) == expected # unicode values s = Series(["あ", "いい", "ううう", "ええええ"], index=["a", "bb", "c", "ddd"]) - expected = ( - "a あ\nbb いい\nc ううう\n" "ddd ええええ\ndtype: object" - ) + expected = "a あ\nbb いい\nc ううう\nddd ええええ\ndtype: object" assert repr(s) == expected # both s = Series(["あ", "いい", "ううう", "ええええ"], index=["ああ", "いいいい", "う", "えええ"]) expected = ( - "ああ あ\nいいいい いい\nう ううう\n" "えええ ええええ\ndtype: object" + "ああ あ\nいいいい いい\nう ううう\nえええ ええええ\ndtype: object" ) assert repr(s) == expected @@ -2181,7 
+2177,7 @@ def test_east_asian_unicode_series(self): # object dtype, shorter than unicode repr s = Series([1, 22, 3333, 44444], index=[1, "AB", np.nan, "あああ"]) expected = ( - "1 1\nAB 22\nNaN 3333\n" "あああ 44444\ndtype: int64" + "1 1\nAB 22\nNaN 3333\nあああ 44444\ndtype: int64" ) assert repr(s) == expected @@ -2559,7 +2555,7 @@ def test_format_explicit(self): exp = "0 a\n1 a\n ..\n98 a\n99 a\ndtype: object" assert exp == res res = repr(test_sers["twol"]) - exp = "0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype:" " object" + exp = "0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype: object" assert exp == res res = repr(test_sers["asc"]) exp = ( diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 7bd27b2ad9be32..f2fb54796f177c 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -484,12 +484,12 @@ def test_bar_align_left(self): (1, 0): [ "width: 10em", " height: 80%", - "background: linear-gradient(" "90deg,red 25.0%, transparent 25.0%)", + "background: linear-gradient(90deg,red 25.0%, transparent 25.0%)", ], (2, 0): [ "width: 10em", " height: 80%", - "background: linear-gradient(" "90deg,red 50.0%, transparent 50.0%)", + "background: linear-gradient(90deg,red 50.0%, transparent 50.0%)", ], } assert result == expected diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 7b493266144b06..c6485ff21bcfbf 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -12,7 +12,7 @@ class TestToCSV: @pytest.mark.xfail( (3, 6, 5) > sys.version_info >= (3, 5), - reason=("Python csv library bug " "(see https://bugs.python.org/issue32255)"), + reason=("Python csv library bug (see https://bugs.python.org/issue32255)"), ) def test_to_csv_with_single_column(self): # see gh-18676, https://bugs.python.org/issue32255 diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 28c8837731ec16..c2753d23966c63 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -508,7 +508,7 @@ def test_convert_json_field_to_pandas_type(self, inp, exp): def test_convert_json_field_to_pandas_type_raises(self, inp): field = {"type": inp} with pytest.raises( - ValueError, match=("Unsupported or invalid field " "type: {}".format(inp)) + ValueError, match=("Unsupported or invalid field type: {}".format(inp)) ): convert_json_field_to_pandas_type(field) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 82cd00c2d121da..412e5014c8d234 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -297,7 +297,7 @@ def test_meta_name_conflict(self): } ] - msg = r"Conflicting metadata name (foo|bar)," " need distinguishing prefix" + msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix" with pytest.raises(ValueError, match=msg): json_normalize(data, "data", meta=["foo", "bar"]) @@ -491,7 +491,7 @@ def test_json_normalize_errors(self, missing_metadata): # If meta keys are not always present a new option to set # errors='ignore' has been implemented - msg = "Try running with errors='ignore' as key 'name'" " is not always present" + msg = "Try running with errors='ignore' as key 'name' is not always present" with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 
a0686b53b83a44..970fd465fd4eca 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1434,7 +1434,7 @@ def test_to_jsonl(self): def test_latin_encoding(self): # GH 13774 - pytest.skip("encoding not implemented in .to_json(), " "xref #13774") + pytest.skip("encoding not implemented in .to_json(), xref #13774") values = [ [b"E\xc9, 17", b"", b"a", b"b", b"c"], @@ -1589,7 +1589,7 @@ def test_index_false_error_to_json(self, orient): df = pd.DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) - msg = "'index=False' is only valid when " "'orient' is 'split' or 'table'" + msg = "'index=False' is only valid when 'orient' is 'split' or 'table'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient, index=False) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 7d5bf9ec850bcf..d469d3c2e51de3 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1243,7 +1243,7 @@ def test_catch_too_many_names(all_parsers): 10,11,12\n""" parser = all_parsers msg = ( - "Too many columns specified: " "expected 4 and found 3" + "Too many columns specified: expected 4 and found 3" if parser.engine == "c" else "Number of passed names did not match " "number of header fields in the file" diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 06ae2c0fef1b9e..9d0eab0b9a9072 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -89,7 +89,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: - pytest.skip("Cannot deduce compression from " "buffer of compressed data.") + pytest.skip("Cannot deduce compression from buffer of compressed data.") with tm.ensure_clean(filename=filename) as path: tm.write_to_compressed(compress_type, path, data) @@ -144,7 +144,7 @@ def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) - msg = "Unrecognized compression " "type: {compression}".format(**compress_kwargs) + msg = "Unrecognized compression type: {compression}".format(**compress_kwargs) with pytest.raises(ValueError, match=msg): parser.read_csv("test_file.zip", **compress_kwargs) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index ff1dd10bdd0d9e..99e0181741998c 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -124,11 +124,11 @@ def test_header_multi_index(all_parsers): ), ( dict(index_col=[0, 1], names=["foo", "bar"]), - ("cannot specify names " "when specifying a " "multi-index header"), + ("cannot specify names when specifying a multi-index header"), ), ( dict(index_col=[0, 1], usecols=["foo", "bar"]), - ("cannot specify " "usecols when " "specifying a " "multi-index header"), + ("cannot specify usecols when specifying a multi-index header"), ), ], ) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 8199d632223c1f..4dfb8d3bd2dc8a 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -62,9 +62,8 @@ def test_index_col_is_true(all_parsers): data = "a,b\n1,2" parser = all_parsers - with pytest.raises( - ValueError, match="The value of index_col " "couldn't be 'True'" - ): + msg = "The value of index_col 
couldn't be 'True'" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), index_col=True) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 99e4e5c022ecb4..5d79f6e281ef12 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -560,7 +560,7 @@ def test_multiple_date_cols_with_header(all_parsers): KORD1,19990127, 19:00:00 KORD2,19990127, 20:00:00""", [[1, 2]], - ("New date column already " "in dict date_NominalTime"), + ("New date column already in dict date_NominalTime"), ), ( """\ @@ -1272,7 +1272,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): def test_parse_date_fields(all_parsers): parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n" "2001,02,1,11." + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." result = parser.read_csv( StringIO(data), header=0, diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 57096a2652b883..73638fe8ab7c85 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -56,7 +56,7 @@ def test_string_factorize(self): assert len(set(map(id, result[0]))) == 2 def test_skipinitialspace(self): - data = "a, b\n" "a, b\n" "a, b\n" "a, b" + data = "a, b\na, b\na, b\na, b" reader = TextReader(StringIO(data), skipinitialspace=True, header=None) result = reader.read() @@ -129,10 +129,10 @@ def test_integer_thousands_alt(self): def test_skip_bad_lines(self, capsys): # too many lines, see #2430 for why - data = "a:b:c\n" "d:e:f\n" "g:h:i\n" "j:k:l:m\n" "l:m:n\n" "o:p:q:r" + data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" reader = TextReader(StringIO(data), delimiter=":", header=None) - msg = r"Error tokenizing data\. C error: Expected 3 fields in" " line 4, saw 4" + msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4" with pytest.raises(parser.ParserError, match=msg): reader.read() @@ -165,7 +165,7 @@ def test_skip_bad_lines(self, capsys): assert "Skipping line 6" in captured.err def test_header_not_enough_lines(self): - data = "skip this\n" "skip this\n" "a,b,c\n" "1,2,3\n" "4,5,6" + data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" reader = TextReader(StringIO(data), delimiter=",", header=2) header = reader.header diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index f135fac65f56a7..8bdf53c3caf61b 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -95,7 +95,7 @@ def test_python_engine(self, python_engine): 1,2,3,4,""" for default in py_unsupported: - msg = "The %r option is not supported " "with the %r engine" % ( + msg = "The %r option is not supported with the %r engine" % ( default, python_engine, ) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index b449e848a0b5a0..47c4f93fbf59c4 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -18,7 +18,7 @@ "integers or a callable." 
) _msg_validate_usecols_names = ( - "Usecols do not match columns, columns " "expected but not found: {0}" + "Usecols do not match columns, columns expected but not found: {0}" ) @@ -124,7 +124,7 @@ def test_usecols_name_length_conflict(all_parsers): 10,11,12""" parser = all_parsers msg = ( - "Number of passed names did not " "match number of header fields in the file" + "Number of passed names did not match number of header fields in the file" if parser.engine == "python" else "Passed header names mismatches usecols" ) @@ -501,7 +501,7 @@ def test_incomplete_first_row(all_parsers, usecols): ), # see gh-9549 ( - ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" "1,2,3,,,1,\n1,2,3\n5,6,7"), + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"], dict(), DataFrame( diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index fee7e1cb2ba5ff..b9f4defb4edf83 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -1622,7 +1622,7 @@ def check_col(key, name, size): _maybe_remove(store, "df") store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) result = store.select( - "df", "string='foo' and string2='foo'" " and A>0 and B<0" + "df", "string='foo' and string2='foo' and A>0 and B<0" ) expected = df_new[ (df_new.string == "foo") @@ -3726,7 +3726,7 @@ def test_append_to_multiple_dropna(self): tm.assert_index_equal(store.select("df1").index, store.select("df2").index) @pytest.mark.xfail( - run=False, reason="append_to_multiple_dropna_false " "is not raising as failed" + run=False, reason="append_to_multiple_dropna_false is not raising as failed" ) def test_append_to_multiple_dropna_false(self): df1 = tm.makeTimeDataFrame() @@ -3817,9 +3817,7 @@ def test_select_as_multiple(self): @pytest.mark.skipif( LooseVersion(tables.__version__) < LooseVersion("3.1.0"), - reason=( - "tables version does not support fix for nan selection " "bug: GH 4858" - ), + reason=("tables version does not support fix for nan selection bug: GH 4858"), ) def test_nan_selection_bug_4858(self): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index fa63f102580fff..87a2405a10dd5c 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -109,7 +109,7 @@ def test_unsupported_other(self): def test_rw_nthreads(self): df = pd.DataFrame({"A": np.arange(100000)}) expected_warning = ( - "the 'nthreads' keyword is deprecated, " "use 'use_threads' instead" + "the 'nthreads' keyword is deprecated, use 'use_threads' instead" ) # TODO: make the warning work with check_stacklevel=True with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 6ca6da01a6d6f6..52147f4e1afc73 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -33,9 +33,7 @@ def _skip_if_no_project_id(): def _skip_if_no_private_key_path(): if not _get_private_key_path(): - pytest.skip( - "Cannot run integration tests without a " "private key json file path" - ) + pytest.skip("Cannot run integration tests without a private key json file path") def _in_travis_environment(): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index d3d05b6281d5b5..9752b4c62aff7d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -220,9 +220,7 @@ def test_skiprows_ndarray(self): assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - with 
pytest.raises( - TypeError, match=("is not a valid type " "for skipping rows") - ): + with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): self.read_html(self.spam_data, ".*Water.*", skiprows="asdf") def test_index(self): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f3e045be2e790f..6ac2e9cd65a271 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -40,13 +40,13 @@ pytest.param( "fastparquet", marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET, reason="fastparquet is " "not installed" + not _HAVE_FASTPARQUET, reason="fastparquet is not installed" ), ), pytest.param( "pyarrow", marks=pytest.mark.skipif( - not _HAVE_PYARROW, reason="pyarrow is " "not installed" + not _HAVE_PYARROW, reason="pyarrow is not installed" ), ), ] diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4fc90ea41718dd..347e280234f91e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -824,7 +824,7 @@ def test_to_sql_index_label_multiindex(self): frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) assert frame.columns[:2].tolist() == ["C", "D"] - msg = "Length of 'index_label' should match number of levels, which" " is 2" + msg = "Length of 'index_label' should match number of levels, which is 2" with pytest.raises(ValueError, match=msg): sql.to_sql( temp_frame, @@ -1408,7 +1408,7 @@ def check(col): else: raise AssertionError( - "DateCol loaded with incorrect type " "-> {0}".format(col.dtype) + "DateCol loaded with incorrect type -> {0}".format(col.dtype) ) # GH11216 @@ -2566,7 +2566,7 @@ def clean_up(test_table_to_drop): @pytest.mark.single @pytest.mark.db @pytest.mark.skip( - reason="gh-13611: there is no support for MySQL " "if SQLAlchemy is not installed" + reason="gh-13611: there is no support for MySQL if SQLAlchemy is not installed" ) class TestXMySQL(MySQLMixIn): @pytest.fixture(autouse=True, scope="class") diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 715c7e370210fd..1e7d568602656e 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -915,7 +915,7 @@ def test_drop_column(self): columns = ["byte_", "byte_"] read_stata(self.dta15_117, convert_dates=True, columns=columns) - msg = "The following columns were not found in the Stata data set:" " not_found" + msg = "The following columns were not found in the Stata data set: not_found" with pytest.raises(ValueError, match=msg): columns = ["byte_", "int_", "long_", "not_found"] read_stata(self.dta15_117, convert_dates=True, columns=columns) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index ecd575020eca60..5ae29dc640dc96 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -179,7 +179,7 @@ def check_format_of_first_point(ax, expected_string): assert expected_string == ax.format_coord(first_x, first_y) except (ValueError): pytest.skip( - "skipping test because issue forming " "test comparison GH7664" + "skipping test because issue forming test comparison GH7664" ) annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) @@ -1501,7 +1501,7 @@ def test_overlapping_datetime(self): s2.plot(ax=ax) s1.plot(ax=ax) - @pytest.mark.xfail(reason="GH9053 matplotlib does not use" " ax.xaxis.converter") + @pytest.mark.xfail(reason="GH9053 matplotlib does not use ax.xaxis.converter") def test_add_matplotlib_datetime64(self): # 
GH9053 - ensure that a plot with PeriodConverter still understands # datetime64 data. This still fails because matplotlib overrides the diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 0215b79cb993d5..65815bcedebfc4 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -547,9 +547,7 @@ def test_subplots_timeseries_y_axis(self): with pytest.raises(TypeError, match=msg): testdata.plot(y="text") - @pytest.mark.xfail( - reason="not support for period, categorical, " "datetime_mixed_tz" - ) + @pytest.mark.xfail(reason="not support for period, categorical, datetime_mixed_tz") def test_subplots_timeseries_y_axis_not_supported(self): """ This test will fail for: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 80365e34fa87af..b6c6f967333a8f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1141,7 +1141,7 @@ def test_validation(self): validate="one_to_many", ) - msg = "Merge keys are not unique in right dataset; not a one-to-one" " merge" + msg = "Merge keys are not unique in right dataset; not a one-to-one merge" with pytest.raises(MergeError, match=msg): merge( left, @@ -1166,7 +1166,7 @@ def test_validation(self): validate="many_to_one", ) - msg = "Merge keys are not unique in left dataset; not a one-to-one" " merge" + msg = "Merge keys are not unique in left dataset; not a one-to-one merge" with pytest.raises(MergeError, match=msg): merge( left_w_dups, @@ -1182,7 +1182,7 @@ def test_validation(self): # Dups on both merge(left_w_dups, right_w_dups, on="a", validate="many_to_many") - msg = "Merge keys are not unique in right dataset; not a many-to-one" " merge" + msg = "Merge keys are not unique in right dataset; not a many-to-one merge" with pytest.raises(MergeError, match=msg): merge( left_w_dups, @@ -1192,7 +1192,7 @@ def test_validation(self): validate="many_to_one", ) - msg = "Merge keys are not unique in left dataset; not a one-to-many" " merge" + msg = "Merge keys are not unique in left dataset; not a one-to-many merge" with pytest.raises(MergeError, match=msg): merge(left_w_dups, right_w_dups, on="a", validate="one_to_many") diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 56e83ada9eb992..1b067c08d2e40c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -121,9 +121,7 @@ def test_tuple_vars_fail_with_multiindex(self): tuple_b = ("B", "b") list_b = [tuple_b] - msg = ( - r"(id|value)_vars must be a list of tuples when columns are" " a MultiIndex" - ) + msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex" for id_vars, value_vars in ( (tuple_a, list_b), (list_a, tuple_b), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b497f6c3aa9b44..d3300ffb01c3ac 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -731,7 +731,7 @@ def test_pivot_with_list_like_values_nans(self, values, method): tm.assert_frame_equal(result, expected) @pytest.mark.xfail( - reason="MultiIndexed unstack with tuple names fails" "with KeyError GH#19966" + reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_multiindex(self, method): @@ -880,7 +880,7 @@ def test_margins_dtype(self): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(reason="GH#17035 
(len of floats is casted back to " "floats)") + @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)") def test_margins_dtype_len(self): mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) @@ -1575,7 +1575,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins(self, observed): # GH 10989 df = pd.DataFrame( @@ -1589,7 +1589,7 @@ def test_categorical_margins(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins_category(self, observed): df = pd.DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index ee0ff87e31aea3..4cff061cabc409 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -31,7 +31,7 @@ def test_asfreq_near_zero_weekly(self): assert week2.asfreq("D", "S") <= per2 @pytest.mark.xfail( - reason="GH#19643 period_helper asfreq functions fail " "to check for overflows" + reason="GH#19643 period_helper asfreq functions fail to check for overflows" ) def test_to_timestamp_out_of_bounds(self): # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848') diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 34d2fa6a9194ca..771a67dfceaa8d 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -390,11 +390,11 @@ def test_period_cons_mult(self): assert result.freq == p1.freq assert result.freqstr == "3M" - msg = "Frequency must be positive, because it" " represents span: -3M" + msg = "Frequency must be positive, because it represents span: -3M" with pytest.raises(ValueError, match=msg): Period("2011-01", freq="-3M") - msg = "Frequency must be positive, because it" " represents span: 0M" + msg = "Frequency must be positive, because it represents span: 0M" with pytest.raises(ValueError, match=msg): Period("2011-01", freq="0M") @@ -445,7 +445,7 @@ def test_period_cons_combined(self): assert result.freq == p2.freq assert result.freqstr == "25H" - msg = "Frequency must be positive, because it" " represents span: -25H" + msg = "Frequency must be positive, because it represents span: -25H" with pytest.raises(ValueError, match=msg): Period("2011-01", freq="-1D1H") with pytest.raises(ValueError, match=msg): @@ -455,7 +455,7 @@ def test_period_cons_combined(self): with pytest.raises(ValueError, match=msg): Period(ordinal=1, freq="-1H1D") - msg = "Frequency must be positive, because it" " represents span: 0D" + msg = "Frequency must be positive, because it represents span: 0D" with pytest.raises(ValueError, match=msg): Period("2011-01", freq="0D0H") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py index 9917e8bc4c9ac9..ae1e84576c092d 100644 --- 
a/pandas/tests/scalar/timedelta/test_construction.py +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -239,9 +239,8 @@ def test_iso_constructor(fmt, exp): ], ) def test_iso_constructor_raises(fmt): - with pytest.raises( - ValueError, match=("Invalid ISO 8601 Duration " "format - {}".format(fmt)) - ): + msg = "Invalid ISO 8601 Duration format - {}".format(fmt) + with pytest.raises(ValueError, match=msg): Timedelta(fmt) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index f64cf97acf8054..424b0c9abdef85 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -63,11 +63,11 @@ def test_tz_localize_ambiguous(self): ts.tz_localize("US/Eastern", ambiguous="infer") # GH#8025 - msg = "Cannot localize tz-aware Timestamp, " "use tz_convert for conversions" + msg = "Cannot localize tz-aware Timestamp, use tz_convert for conversions" with pytest.raises(TypeError, match=msg): Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") - msg = "Cannot convert tz-naive Timestamp, " "use tz_localize to localize" + msg = "Cannot convert tz-naive Timestamp, use tz_localize to localize" with pytest.raises(TypeError, match=msg): Timestamp("2011-01-01").tz_convert("Asia/Tokyo") diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 31a1f43470f2c7..c93a000f5e7ce2 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -480,7 +480,7 @@ def test_rename(): @pytest.mark.parametrize( - "data, index, drop_labels," " axis, expected_data, expected_index", + "data, index, drop_labels, axis, expected_data, expected_index", [ # Unique Index ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), @@ -503,7 +503,7 @@ def test_drop_unique_and_non_unique_index( @pytest.mark.parametrize( - "data, index, drop_labels," " axis, error_type, error_desc", + "data, index, drop_labels, axis, error_type, error_desc", [ # single string/tuple-like (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 9b76ed026e580f..01b4a3c84a5653 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -353,7 +353,7 @@ def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment - msg = "cannot set using a {} indexer with a different length than" " the value" + msg = "cannot set using a {} indexer with a different length than the value" # slice s = Series(list("abc")) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 89b411a284563d..67373686d67284 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -229,7 +229,7 @@ def test_cummax_timedelta64(self): tm.assert_series_equal(expected, result) def test_npdiff(self): - pytest.skip("skipping due to Series no longer being an " "ndarray") + pytest.skip("skipping due to Series no longer being an ndarray") # no longer works as the return type of np.diff is now nd.array s = Series(np.arange(5)) @@ -407,9 +407,7 @@ def test_corr_invalid_method(self): # GH PR #22298 s1 = pd.Series(np.random.randn(10)) s2 = pd.Series(np.random.randn(10)) - msg = ( - "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " - ) + msg = 
"method must be either 'pearson', 'spearman', 'kendall', or a callable, " with pytest.raises(ValueError, match=msg): s1.corr(s2, method="____") @@ -820,7 +818,7 @@ def test_ptp(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - msg = "No axis named 1 for object type" " " + msg = "No axis named 1 for object type " with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp(axis=1) @@ -1295,7 +1293,7 @@ class TestNLargestNSmallest: ) def test_error(self, r): dt = r.dtype - msg = "Cannot use method 'n(larg|small)est' with " "dtype {dt}".format(dt=dt) + msg = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(dt=dt) args = 2, len(r), 0, -1 methods = r.nlargest, r.nsmallest for method, arg in product(methods, args): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 6012f3986e955f..f8a44b7f5639e9 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -813,7 +813,7 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = "No axis named 1 for object type" " " + msg = "No axis named 1 for object type " with pytest.raises(ValueError, match=msg): s.dropna(axis=1) @@ -1117,9 +1117,7 @@ def test_interpolate_time_raises_for_non_timeseries(self): # When method='time' is used on a non-TimeSeries that contains a null # value, a ValueError should be raised. non_ts = Series([0, 1, 2, np.NaN]) - msg = ( - "time-weighted interpolation only works on Series.* " "with a DatetimeIndex" - ) + msg = "time-weighted interpolation only works on Series.* with a DatetimeIndex" with pytest.raises(ValueError, match=msg): non_ts.interpolate(method="time") @@ -1417,9 +1415,7 @@ def test_interp_limit_area(self): ) # raises an error even if limit type is wrong. - msg = ( - r"Invalid limit_area: expecting one of \['inside', 'outside'\]," " got abc" - ) + msg = r"Invalid limit_area: expecting one of \['inside', 'outside'\], got abc" with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit_area="abc") @@ -1668,5 +1664,5 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): assert_series_equal(result, expected) else: pytest.skip( - "This interpolation method is not supported for " "Timedelta Index yet." + "This interpolation method is not supported for Timedelta Index yet." 
) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index aada5cca9fdc75..0c25df79974699 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -194,7 +194,7 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.rand_, marks=pytest.mark.xfail( - reason="GH#22092 Index " "implementation returns " "Index", + reason="GH#22092 Index implementation returns Index", raises=AssertionError, strict=True, ), @@ -202,7 +202,7 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.ror_, marks=pytest.mark.xfail( - reason="Index.get_indexer " "with non unique index", + reason="Index.get_indexer with non unique index", raises=InvalidIndexError, strict=True, ), diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 0ae2194543b44d..125f516ab6b090 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -106,7 +106,7 @@ def test_sort_index(self): sorted_series = random_order.sort_index(axis=0) assert_series_equal(sorted_series, self.ts) - msg = "No axis named 1 for object type" " " + msg = "No axis named 1 for object type " with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 6be1b9a9143bf3..d0ca5d82c6b33b 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -120,9 +120,7 @@ def test_shift(self): # incompat tz s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") - msg = ( - "DatetimeArray subtraction must have the same timezones or no" " timezones" - ) + msg = "DatetimeArray subtraction must have the same timezones or no timezones" with pytest.raises(TypeError, match=msg): s - s2 @@ -915,7 +913,7 @@ def test_between_time_axis(self): assert len(ts.between_time(stime, etime)) == expected_length assert len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = "No axis named 1 for object type" " " + msg = "No axis named 1 for object type " with pytest.raises(ValueError, match=msg): ts.between_time(stime, etime, axis=1) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 96e3c4640d2f6f..6527d41eac841a 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1357,9 +1357,7 @@ def test_as_blocks(self): assert list(df_blocks.keys()) == ["Sparse[float64, nan]"] tm.assert_frame_equal(df_blocks["Sparse[float64, nan]"], df) - @pytest.mark.xfail( - reason="nan column names in _init_dict problematic " "(GH#16894)" - ) + @pytest.mark.xfail(reason="nan column names in _init_dict problematic (GH#16894)") def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) diff --git a/pandas/tests/sparse/series/test_indexing.py b/pandas/tests/sparse/series/test_indexing.py index 525b0487a93768..c75f3b2134f91e 100644 --- a/pandas/tests/sparse/series/test_indexing.py +++ b/pandas/tests/sparse/series/test_indexing.py @@ -62,7 +62,7 @@ def test_where_with_numeric_data(data): ], ) @pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) -@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") +@pytest.mark.skip(reason="Wrong SparseBlock initialization (Segfault) (GH 17386)") def test_where_with_numeric_data_and_other(data, other): # GH 17386 lower_bound = 1.5 @@ -96,7 +96,7 @@ 
def test_where_with_bool_data(): @pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) -@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") +@pytest.mark.skip(reason="Wrong SparseBlock initialization (Segfault) (GH 17386)") def test_where_with_bool_data_and_other(other): # GH 17386 data = [False, False, True, True, False, False] diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 5619a0a11fb116..eb217283c7a83a 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -1194,7 +1194,7 @@ def test_to_coo_bad_ilevel(self): def test_to_coo_duplicate_index_entries(self): ss = pd.concat([self.sparse_series[0], self.sparse_series[0]]).to_sparse() - msg = "Duplicate index entries are not allowed in to_coo" " transformation" + msg = "Duplicate index entries are not allowed in to_coo transformation" with pytest.raises(ValueError, match=msg): ss.to_coo(["A", "B"], ["C", "D"]) diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index 5cfacaf16cffe5..ea5e939b57566b 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -441,7 +441,7 @@ def tests_indexing_with_sparse(self, kind, fill): tm.assert_sp_series_equal(s[indexer], expected) tm.assert_sp_series_equal(s.loc[indexer], expected) - msg = "iLocation based boolean indexing cannot " "use an indexable as a mask" + msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): s.iloc[indexer] diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a76f2bb04a5420..c97c69c323b566 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1819,9 +1819,8 @@ def test_reset_index_multiindex_columns(self): tm.assert_frame_equal(result, df) # gh-16120: already existing column - with pytest.raises( - ValueError, match=(r"cannot insert \('A', ''\), " "already exists") - ): + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): df.rename_axis("A").reset_index() # gh-16164: multiindex (tuple) full key @@ -1837,9 +1836,8 @@ def test_reset_index_multiindex_columns(self): tm.assert_frame_equal(result, expected) # with index name which is a too long tuple... - with pytest.raises( - ValueError, match=("Item must have length equal " "to number of levels.") - ): + msg = "Item must have length equal to number of levels." + with pytest.raises(ValueError, match=msg): df.rename_axis([("C", "c", "i")]).reset_index() # or too short... 
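A note on the ``pytest.raises(..., match=...)`` idiom that the string-concatenation cleanups above keep rewriting: ``match`` is interpreted as a regular expression and searched against the text of the raised exception, which is why several of the consolidated messages escape brackets and parentheses (for example ``r"cannot insert \('A', ''\), already exists"``). The following is a minimal, self-contained sketch of that idiom only; the ``divide`` helper and its error message are hypothetical and not part of pandas::

    import re

    import pytest


    def divide(a, b):
        # Hypothetical helper used only to illustrate the match= pattern.
        if b == 0:
            raise ValueError("division by zero (denominator was 0)")
        return a / b


    def test_divide_by_zero_message():
        # pytest.raises treats ``match`` as a regex and re.search()es it
        # against str(excinfo.value), so literal parentheses in the expected
        # message must be escaped before being used as the pattern.
        msg = re.escape("division by zero (denominator was 0)")
        with pytest.raises(ValueError, match=msg):
            divide(1, 0)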
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6833757c69eaaf..950d6a9595f9ee 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -196,10 +196,8 @@ def test_api(self): def test_api_mi_raises(self): # GH 23679 mi = MultiIndex.from_arrays([["a", "b", "c"]]) - with pytest.raises( - AttributeError, - match="Can only use .str accessor " "with Index, not MultiIndex", - ): + msg = "Can only use .str accessor with Index, not MultiIndex" + with pytest.raises(AttributeError, match=msg): mi.str assert not hasattr(mi, "str") @@ -232,10 +230,8 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): assert isinstance(t.str, strings.StringMethods) else: # GH 9184, GH 23011, GH 23163 - with pytest.raises( - AttributeError, - match="Can only use .str " "accessor with string values.*", - ): + msg = "Can only use .str accessor with string values.*" + with pytest.raises(AttributeError, match=msg): t.str assert not hasattr(t, "str") @@ -1101,7 +1097,7 @@ def test_replace_literal(self): with pytest.raises(ValueError, match=msg): values.str.replace("abc", callable_repl, regex=False) - msg = "Cannot use a compiled regex as replacement pattern with" " regex=False" + msg = "Cannot use a compiled regex as replacement pattern with regex=False" with pytest.raises(ValueError, match=msg): values.str.replace(compiled_pat, "", regex=False) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 27700d778df191..2df5460a059532 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -844,7 +844,7 @@ def test_numpy_compat(self, method): pytest.param( "ls", marks=pytest.mark.xfail( - reason="GH#16425 expanding with " "offset not supported" + reason="GH#16425 expanding with offset not supported" ), ), ], @@ -1775,9 +1775,8 @@ def test_invalid_quantile_value(self): data = np.arange(5) s = Series(data) - with pytest.raises( - ValueError, match="Interpolation 'invalid'" " is not supported" - ): + msg = "Interpolation 'invalid' is not supported" + with pytest.raises(ValueError, match=msg): s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid") def test_rolling_quantile_param(self): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 822e97b21f0da6..2654d83ee0c525 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -843,7 +843,7 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - msg = "Only know how to combine business day with datetime or" " timedelta" + msg = "Only know how to combine business day with datetime or timedelta" with pytest.raises(ApplyTypeError, match=msg): BDay().apply(BMonthEnd()) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 700fee2d89f3ca..126a1bd12ad597 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -38,11 +38,9 @@ def test_parse_time_quarter_with_dash(dashed, normal): @pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"]) def test_parse_time_quarter_with_dash_error(dashed): - msg = "Unknown datetime string format, " "unable to parse: {dashed}".format( - dashed=dashed - ) + msg = "Unknown datetime string format, unable to parse: {dashed}" - with pytest.raises(parsing.DateParseError, match=msg): + with pytest.raises(parsing.DateParseError, match=msg.format(dashed=dashed)): parse_time_string(dashed) @@ -113,14 +111,12 @@ def 
test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": msg = ( "Incorrect quarterly string is given, quarter " - "must be between 1 and 4: {date_str}".format(date_str=date_str) + "must be between 1 and 4: {date_str}" ) else: - msg = "Unknown datetime string format, unable " "to parse: {date_str}".format( - date_str=date_str - ) + msg = "Unknown datetime string format, unable to parse: {date_str}" - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match=msg.format(date_str=date_str)): parsing.parse_time_string(date_str) diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index 1583420053fde7..5a677d629e72d9 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -40,7 +40,7 @@ def _assert_not_almost_equal(a, b, **kwargs): try: assert_almost_equal(a, b, **kwargs) msg = ( - "{a} and {b} were approximately equal " "when they shouldn't have been" + "{a} and {b} were approximately equal when they shouldn't have been" ).format(a=a, b=b) pytest.fail(msg=msg) except AssertionError: diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index e7b38bb2b700a7..8fbc8037ed7c50 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -57,9 +57,8 @@ def test_deprecate_no_docstring(): def test_deprecate_wrong_docstring(): - with pytest.raises( - AssertionError, match="deprecate needs a correctly " "formatted docstring" - ): + msg = "deprecate needs a correctly formatted docstring" + with pytest.raises(AssertionError, match=msg): deprecate( "depr_func", new_func_wrong_docstring, "1.0", msg="Use new_func instead." ) From b377a7875b74e45797f7f0ff26200f287823f3e4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Jul 2019 02:13:32 +0100 Subject: [PATCH 176/238] TST/CLN: remove try block from tests/test_strings.py::TestStringMethods::test_slice (#27277) --- pandas/tests/test_strings.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 950d6a9595f9ee..bc848a528f2fdf 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2795,23 +2795,20 @@ def test_pipe_failures(self): tm.assert_series_equal(result, exp) - def test_slice(self): + @pytest.mark.parametrize( + "start, stop, step, expected", + [ + (2, 5, None, Series(["foo", "bar", NA, "baz"])), + (0, 3, -1, Series(["", "", NA, ""])), + (None, None, -1, Series(["owtoofaa", "owtrabaa", NA, "xuqzabaa"])), + (3, 10, 2, Series(["oto", "ato", NA, "aqx"])), + (3, 0, -1, Series(["ofa", "aba", NA, "aba"])), + ], + ) + def test_slice(self, start, stop, step, expected): values = Series(["aafootwo", "aabartwo", NA, "aabazqux"]) - - result = values.str.slice(2, 5) - exp = Series(["foo", "bar", NA, "baz"]) - tm.assert_series_equal(result, exp) - - for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: - try: - result = values.str.slice(start, stop, step) - expected = Series( - [s[start:stop:step] if not isna(s) else NA for s in values] - ) - tm.assert_series_equal(result, expected) - except IndexError: - print("failed on %s:%s:%s" % (start, stop, step)) - raise + result = values.str.slice(start, stop, step) + tm.assert_series_equal(result, expected) # mixed mixed = Series( From 1d92c77c77fcdf964c7c30180045c8fd5ef9c5c1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jul 2019 18:14:25 
-0700 Subject: [PATCH 177/238] REF: Avoid dispatching Series ops to pd.Index (#27268) --- pandas/core/ops/__init__.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 545f98a02439ac..f9112dbb1e4ab3 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -43,6 +43,7 @@ ABCSeries, ABCSparseArray, ABCSparseSeries, + ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, notna @@ -1703,10 +1704,30 @@ def wrapper(left, right): # Note: we cannot use dispatch_to_index_op because # that may incorrectly raise TypeError when we # should get NullFrequencyError - result = op(pd.Index(left), right) - return construct_result( - left, result, index=left.index, name=res_name, dtype=result.dtype - ) + orig_right = right + if is_scalar(right): + # broadcast and wrap in a TimedeltaIndex + assert np.isnat(right) + right = np.broadcast_to(right, left.shape) + right = pd.TimedeltaIndex(right) + + assert isinstance(right, (pd.TimedeltaIndex, ABCTimedeltaArray, ABCSeries)) + try: + result = op(left._values, right) + except NullFrequencyError: + if orig_right is not right: + # i.e. scalar timedelta64('NaT') + # We get a NullFrequencyError because we broadcast to + # TimedeltaIndex, but this should be TypeError. + raise TypeError( + "incompatible type for a datetime/timedelta " + "operation [{name}]".format(name=op.__name__) + ) + raise + + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. + return construct_result(left, result, index=left.index, name=res_name) lvalues = left.values rvalues = right From e607c3431afc882effe4784a42b0c636ed4099e0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 7 Jul 2019 21:15:43 -0400 Subject: [PATCH 178/238] COMPAT: catch InvalidIndexError in base Indexer getitem (#27259) --- pandas/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0bcaa83c496283..c30885291ffc9c 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, InvalidIndexError, MultiIndex # the supported indexers @@ -118,7 +118,7 @@ def __getitem__(self, key): key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: values = self.obj._get_value(*key) - except (KeyError, TypeError): + except (KeyError, TypeError, InvalidIndexError): # TypeError occurs here if the key has non-hashable entries, # generally slice or list. 
# TODO(ix): most/all of the TypeError cases here are for ix, From 5422807194d83e47270c2b49b3c79453b437425a Mon Sep 17 00:00:00 2001 From: mazayo <45595210+mazayo@users.noreply.github.com> Date: Mon, 8 Jul 2019 10:26:49 +0900 Subject: [PATCH 179/238] BUG: from_dict ignored order of OrderedDict (#8425) (#26875) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/internals/construction.py | 8 +++++++- pandas/tests/frame/test_constructors.py | 20 +++++++++++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 77fa12ea95a486..193a0edee5e967 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1152,6 +1152,7 @@ Reshaping - Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`) - Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) - Bug in :func:`DataFrame.pivot_table` with a :class:`IntervalIndex` as pivot index would raise ``TypeError`` (:issue:`25814`) +- Bug in which :meth:`DataFrame.from_dict` ignored order of ``OrderedDict`` when ``orient='index'`` (:issue:`8425`). - Bug in :meth:`DataFrame.transpose` where transposing a DataFrame with a timezone-aware datetime column would incorrectly raise ``ValueError`` (:issue:`26825`) - Bug in :func:`pivot_table` when pivoting a timezone aware column as the ``values`` would remove timezone information (:issue:`14948`) - Bug in :func:`merge_asof` when specifying multiple ``by`` columns where one is ``datetime64[ns, tz]`` dtype (:issue:`26649`) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4d64be34e624f8..b4752039cf5b1d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,6 +9,7 @@ from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime +import pandas.compat as compat from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( @@ -338,6 +339,7 @@ def extract_index(data): have_raw_arrays = False have_series = False have_dicts = False + have_ordered = False for val in data: if isinstance(val, ABCSeries): @@ -345,6 +347,8 @@ def extract_index(data): indexes.append(val.index) elif isinstance(val, dict): have_dicts = True + if isinstance(val, OrderedDict): + have_ordered = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True @@ -353,8 +357,10 @@ def extract_index(data): if not indexes and not raw_lengths: raise ValueError("If using all scalar values, you must pass" " an index") - if have_series or have_dicts: + if have_series: index = _union_indexes(indexes) + elif have_dicts: + index = _union_indexes(indexes, sort=not (compat.PY36 or have_ordered)) if have_raw_arrays: lengths = list(set(raw_lengths)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 349e2d9c578be3..eca827f82e2969 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -517,7 +517,8 @@ def test_constructor_subclass_dict(self, float_frame): dct.update(v.to_dict()) data[k] = dct frame = DataFrame(data) - tm.assert_frame_equal(float_frame.sort_index(), frame) + expected = frame.reindex(index=float_frame.index) + tm.assert_frame_equal(float_frame, expected) def 
test_constructor_dict_block(self): expected = np.array([[4.0, 3.0, 2.0, 1.0]]) @@ -1203,7 +1204,7 @@ def test_constructor_list_of_series(self): sdict = OrderedDict(zip(["x", "Unnamed 0"], data)) expected = DataFrame.from_dict(sdict, orient="index") - tm.assert_frame_equal(result.sort_index(), expected) + tm.assert_frame_equal(result, expected) # none named data = [ @@ -1342,7 +1343,7 @@ def test_constructor_list_of_namedtuples(self): def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") - expected = float_string_frame.sort_index() + expected = float_string_frame.reindex(index=recons.index) tm.assert_frame_equal(recons, expected) # dict of sequence @@ -1351,6 +1352,19 @@ def test_constructor_orient(self, float_string_frame): xp = DataFrame.from_dict(a).T.reindex(list(a.keys())) tm.assert_frame_equal(rs, xp) + def test_constructor_from_ordered_dict(self): + # GH8425 + a = OrderedDict( + [ + ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), + ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), + ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), + ] + ) + expected = DataFrame.from_dict(a, orient="columns").T + result = DataFrame.from_dict(a, orient="index") + tm.assert_frame_equal(result, expected) + def test_from_dict_columns_parameter(self): # GH 18529 # Test new columns parameter for from_dict that was added to make From 845d4b89e98d5d1678acb2007211fcf687001d12 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 8 Jul 2019 02:36:04 +0100 Subject: [PATCH 180/238] Typing: Support New Mypy Semantic Analyzer (#27070) --- pandas/core/arrays/interval.py | 1 - pandas/core/indexes/timedeltas.py | 1 - pandas/io/json/__init__.py | 15 +++++++++++---- pandas/io/json/{json.py => _json.py} | 6 +++--- pandas/io/json/{normalize.py => _normalize.py} | 2 +- .../io/json/{table_schema.py => _table_schema.py} | 0 pandas/tests/io/json/test_json_table_schema.py | 2 +- pandas/tests/io/json/test_normalize.py | 2 +- pandas/tests/io/json/test_readlines.py | 2 +- 9 files changed, 18 insertions(+), 13 deletions(-) rename pandas/io/json/{json.py => _json.py} (99%) rename pandas/io/json/{normalize.py => _normalize.py} (99%) rename pandas/io/json/{table_schema.py => _table_schema.py} (100%) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f9fbd7ada376e9..a0319fe96896ad 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -139,7 +139,6 @@ ) ) class IntervalArray(IntervalMixin, ExtensionArray): - dtype = IntervalDtype() ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 29ed3c6b973181..ecadd11894bfb3 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -176,7 +176,6 @@ def _join_i8_wrapper(joinf, **kwargs): _freq = None - _box_func = TimedeltaArray._box_func _bool_ops = TimedeltaArray._bool_ops _object_ops = TimedeltaArray._object_ops _field_ops = TimedeltaArray._field_ops diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index cbb4e37fae6a10..2382d993df96b2 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -1,5 +1,12 @@ -from .json import dumps, loads, read_json, to_json # noqa -from .normalize import json_normalize # noqa -from .table_schema import build_table_schema # noqa +from pandas.io.json._json import dumps, loads, read_json, to_json +from 
pandas.io.json._normalize import json_normalize +from pandas.io.json._table_schema import build_table_schema -del json, normalize, table_schema # noqa +__all__ = [ + "dumps", + "loads", + "read_json", + "to_json", + "json_normalize", + "build_table_schema", +] diff --git a/pandas/io/json/json.py b/pandas/io/json/_json.py similarity index 99% rename from pandas/io/json/json.py rename to pandas/io/json/_json.py index f3f0f417acaabc..1f0728ee96469a 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/_json.py @@ -23,8 +23,8 @@ from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import _validate_integer -from .normalize import _convert_to_line_delimits -from .table_schema import build_table_schema, parse_table_schema +from ._normalize import convert_to_line_delimits +from ._table_schema import build_table_schema, parse_table_schema loads = json.loads dumps = json.dumps @@ -79,7 +79,7 @@ def to_json( ).write() if lines: - s = _convert_to_line_delimits(s) + s = convert_to_line_delimits(s) if isinstance(path_or_buf, str): fh, handles = _get_handle(path_or_buf, "w", compression=compression) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/_normalize.py similarity index 99% rename from pandas/io/json/normalize.py rename to pandas/io/json/_normalize.py index c09dc177ccbd1c..a6fde86297a3d7 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/_normalize.py @@ -12,7 +12,7 @@ from pandas import DataFrame -def _convert_to_line_delimits(s): +def convert_to_line_delimits(s): """ Helper function that converts JSON lists to line delimited JSON. """ diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/_table_schema.py similarity index 100% rename from pandas/io/json/table_schema.py rename to pandas/io/json/_table_schema.py diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index c2753d23966c63..b2fc9ec217ca61 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -11,7 +11,7 @@ from pandas import DataFrame import pandas.util.testing as tm -from pandas.io.json.table_schema import ( +from pandas.io.json._table_schema import ( as_json_table_type, build_table_schema, convert_json_field_to_pandas_type, diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 412e5014c8d234..a32103d7b29b95 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -7,7 +7,7 @@ import pandas.util.testing as tm from pandas.io.json import json_normalize -from pandas.io.json.normalize import nested_to_record +from pandas.io.json._normalize import nested_to_record @pytest.fixture diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index a99488509f3360..821bf0287c6f12 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -7,7 +7,7 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal, ensure_clean -from pandas.io.json.json import JsonReader +from pandas.io.json._json import JsonReader @pytest.fixture From 65e123c3c236e8ea72b8311af0ce4196ed20054a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 7 Jul 2019 21:36:42 -0400 Subject: [PATCH 181/238] DOC: update contributing guidelines for black (#27233) * DOC: update contributing guidelines for black * add pre-commit-config file * update pre-commit hook for pandas * correct name for config 
file --- .github/PULL_REQUEST_TEMPLATE.md | 1 + .pre-commit-config.yaml | 16 ++++++++++ doc/source/development/contributing.rst | 39 +++++++++++++++++-------- 3 files changed, 44 insertions(+), 12 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4e1e9ce0174087..7c3870470f0742 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ - [ ] closes #xxxx - [ ] tests added / passed +- [ ] passes `black pandas` - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` - [ ] whatsnew entry diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000000000..5f7143ef518bb5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.7 + - repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.7 + hooks: + - id: flake8 + language: python_venv + - repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.20 + hooks: + - id: isort + language: python_venv diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index dde1db7e693de3..92d7cf1a79d8c4 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -562,23 +562,38 @@ many errors as possible, but it may not correct *all* of them. Thus, it is recommended that you run ``cpplint`` to double check and make any other style fixes manually. -Python (PEP8) -~~~~~~~~~~~~~ - -*pandas* uses the `PEP8 `_ standard. -There are several tools to ensure you abide by this standard. Here are *some* of -the more common ``PEP8`` issues: +Python (PEP8 / black) +~~~~~~~~~~~~~~~~~~~~~ -* we restrict line-length to 79 characters to promote readability -* passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` +*pandas* follows the `PEP8 `_ standard +and uses `Black `_ and +`Flake8 `_ to ensure a consistent code +format throughout the project. -:ref:`Continuous Integration ` will run -the `flake8 `_ tool -and report any stylistic errors in your code. Therefore, it is helpful before -submitting code to run the check yourself on the diff:: +:ref:`Continuous Integration ` will run those tools and +report any stylistic errors in your code. Therefore, it is helpful before +submitting code to run the check yourself:: + black pandas git diff upstream/master -u -- "*.py" | flake8 --diff +to auto-format your code. Additionally, many editors have plugins that will +apply ``black`` as you edit files. + +Optionally, you may wish to setup `pre-commit hooks `_ +to automatically run ``black`` and ``flake8`` when you make a git commit. This +can be done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the pandas repository. Now ``black`` and ``flake8`` will be run +each time you commit changes. You can skip these checks with +``git commit --no-verify``. + This command will catch any stylistic errors in your changes specifically, but be beware it may not catch all of them. 
For example, if you delete the only usage of an imported function, it is stylistically incorrect to import an From a359a99595a27b9fb36daa8e299a27e2621ba743 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jul 2019 05:43:10 -0700 Subject: [PATCH 182/238] BUG: Fix+test division by negative zero (#27278) --- pandas/core/arrays/integer.py | 5 +- pandas/core/ops/missing.py | 8 ++- pandas/tests/arithmetic/conftest.py | 6 +- pandas/tests/arithmetic/test_numeric.py | 67 +++++++++++++++++++++-- pandas/tests/io/pytables/test_pytables.py | 1 + setup.cfg | 2 + 6 files changed, 78 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c999c4db232e6b..867122964fe592 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,4 @@ import numbers -import sys from typing import Type import warnings @@ -675,7 +674,7 @@ def _maybe_mask_result(self, result, mask, other, op_name): # a float result # or our op is a divide if (is_float_dtype(other) or is_float(other)) or ( - op_name in ["rtruediv", "truediv", "rdiv", "div"] + op_name in ["rtruediv", "truediv"] ): result[mask] = np.nan return result @@ -747,8 +746,6 @@ def integer_arithmetic_method(self, other): IntegerArray._add_comparison_ops() -module = sys.modules[__name__] - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 4ca1861baf237b..608c2550994f19 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -120,9 +120,13 @@ def mask_zero_div_zero(x, y, result, copy=False): if zmask.any(): shape = result.shape + # Flip sign if necessary for -0.0 + zneg_mask = zmask & np.signbit(y) + zpos_mask = zmask & ~zneg_mask + nan_mask = (zmask & (x == 0)).ravel() - neginf_mask = (zmask & (x < 0)).ravel() - posinf_mask = (zmask & (x > 0)).ravel() + neginf_mask = ((zpos_mask & (x < 0)) | (zneg_mask & (x > 0))).ravel() + posinf_mask = ((zpos_mask & (x > 0)) | (zneg_mask & (x < 0))).ravel() if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index c67a67bb31d625..f047154f2c6362 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -30,8 +30,12 @@ def one(request): for box_cls in [pd.Index, np.array] for dtype in [np.int64, np.uint64, np.float64] ] +zeros.extend( + [box_cls([-0.0] * 5, dtype=np.float64) for box_cls in [pd.Index, np.array]] +) zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) -zeros.extend([0, 0.0]) +zeros.extend([np.array(-0.0, dtype=np.float64)]) +zeros.extend([0, 0.0, -0.0]) @pytest.fixture(params=zeros) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 8179ab08895da5..1fbecbab469e40 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -14,6 +14,22 @@ from pandas.core import ops import pandas.util.testing as tm + +def adjust_negative_zero(zero, expected): + """ + Helper to adjust the expected result if we are dividing by -0.0 + as opposed to 0.0 + """ + if np.signbit(np.array(zero)).any(): + # All entries in the `zero` fixture should be either + # all-negative or no-negative. 
+ assert np.signbit(np.array(zero)).all() + + expected *= -1 + + return expected + + # ------------------------------------------------------------------ # Comparisons @@ -229,20 +245,27 @@ def test_div_zero(self, zero, numeric_idx): idx = numeric_idx expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. + expected2 = adjust_negative_zero(zero, expected) + result = idx / zero - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected2) ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8") - tm.assert_series_equal(ser_compat, Series(result)) + tm.assert_series_equal(ser_compat, Series(expected)) def test_floordiv_zero(self, zero, numeric_idx): idx = numeric_idx expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. + expected2 = adjust_negative_zero(zero, expected) result = idx // zero - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected2) ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8") - tm.assert_series_equal(ser_compat, Series(result)) + tm.assert_series_equal(ser_compat, Series(expected)) def test_mod_zero(self, zero, numeric_idx): idx = numeric_idx @@ -258,11 +281,27 @@ def test_divmod_zero(self, zero, numeric_idx): exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + exleft = adjust_negative_zero(zero, exleft) result = divmod(idx, zero) tm.assert_index_equal(result[0], exleft) tm.assert_index_equal(result[1], exright) + @pytest.mark.parametrize("op", [operator.truediv, operator.floordiv]) + def test_div_negative_zero(self, zero, numeric_idx, op): + # Check that -1 / -0.0 returns np.inf, not -np.inf + if isinstance(numeric_idx, pd.UInt64Index): + return + idx = numeric_idx - 3 + + expected = pd.Index( + [-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64 + ) + expected = adjust_negative_zero(zero, expected) + + result = op(idx, zero) + tm.assert_index_equal(result, expected) + # ------------------------------------------------------------------ @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) @@ -896,6 +935,26 @@ def check(series, other): check(tser, tser[::2]) check(tser, 5) + @pytest.mark.xfail( + reason="Series division does not yet fill 1/0 consistently; Index does." + ) + def test_series_divmod_zero(self): + # Check that divmod uses pandas convention for division by zero, + # which does not match numpy. 
+ # pandas convention has + # 1/0 == np.inf + # -1/0 == -np.inf + # 1/-0.0 == -np.inf + # -1/-0.0 == np.inf + tser = tm.makeTimeSeries().rename("ts") + other = tser * 0 + + result = divmod(tser, other) + exp1 = pd.Series([np.inf] * len(tser), index=tser.index) + exp2 = pd.Series([np.nan] * len(tser), index=tser.index) + tm.assert_series_equal(result[0], exp1) + tm.assert_series_equal(result[1], exp2) + class TestUFuncCompat: @pytest.mark.parametrize( diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index b9f4defb4edf83..fb87749ea62e07 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -4337,6 +4337,7 @@ def test_store_datetime_mixed(self): df["d"] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) + # FIXME: don't leave commented-out code # def test_cant_write_multiindex_table(self): # # for now, #1848 # df = DataFrame(np.random.randn(10, 4), diff --git a/setup.cfg b/setup.cfg index fee0ab60f25b53..7549bfe2e325d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,8 @@ plugins = Cython.Coverage [coverage:report] ignore_errors = False show_missing = True +omit = + pandas/_version.py # Regexes for lines to exclude from consideration exclude_lines = # Have to re-enable the standard pragma From fd2146ff36cc916a2bdb88051bc57376fd5d367d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 8 Jul 2019 14:44:36 +0200 Subject: [PATCH 183/238] BUG: fix KeyError with list of a single, missing, element (#27154) closes #27148 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexing.py | 2 +- pandas/tests/indexing/multiindex/test_loc.py | 57 ++++++++----------- .../tests/indexing/multiindex/test_slice.py | 10 ++-- 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 193a0edee5e967..68ecb4c487a1e9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1046,6 +1046,7 @@ Indexing - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) +- Fixed a ``KeyError`` when indexing a :class:`MultiIndex`` level with a list containing exactly one label, which is missing (:issue:`27148`) - Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) - Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c30885291ffc9c..3cb89777d6b719 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1861,7 +1861,7 @@ def _getitem_axis(self, key, axis=None): if ( not isinstance(key, tuple) - and len(key) > 1 + and len(key) and not isinstance(key[0], tuple) ): key = tuple([key]) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 9188adc7d6e93d..d92cc00af6fce8 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -130,6 +130,19 @@ def test_loc_multiindex_missing_label_raises(self): with pytest.raises(KeyError, match=r"^2$"): df.loc[2] + @pytest.mark.parametrize("key, pos", [([2, 4], [0, 1]), ([2], []), ([2, 3], [])]) + def test_loc_multiindex_list_missing_label(self, key, pos): + # GH 27148 - lists with missing labels do not raise: + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + expected = df.iloc[pos] + result = df.loc[key] + tm.assert_frame_equal(result, expected) + def test_loc_multiindex_too_many_dims_raises(self): # GH 14885 s = Series( @@ -280,47 +293,27 @@ def convert_nested_indexer(indexer_type, keys): @pytest.mark.parametrize( - "indexer, is_level1, expected_error", + "indexer, pos", [ - ([], False, None), # empty ok - (["A"], False, None), - (["A", "D"], False, None), - (["D"], False, r"\['D'\] not in index"), # not any values found - (pd.IndexSlice[:, ["foo"]], True, None), - (pd.IndexSlice[:, ["foo", "bah"]], True, None), + ([], []), # empty ok + (["A"], slice(3)), + (["A", "D"], slice(3)), + (["D", "E"], []), # no values found - fine + (["D"], []), # same, with single item list: GH 27148 + (pd.IndexSlice[:, ["foo"]], slice(2, None, 3)), + (pd.IndexSlice[:, ["foo", "bah"]], slice(2, None, 3)), ], ) -def test_loc_getitem_duplicates_multiindex_missing_indexers( - indexer, is_level1, expected_error -): +def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos): # GH 7866 # multi-index slicing with missing indexers idx = MultiIndex.from_product( [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] ) s = Series(np.arange(9, dtype="int64"), index=idx).sort_index() - - if indexer == []: - expected = s.iloc[[]] - elif is_level1: - expected = Series( - [0, 3, 6], - index=MultiIndex.from_product( - [["A", "B", "C"], ["foo"]], names=["one", "two"] - ), - ).sort_index() - else: - exp_idx = MultiIndex.from_product( - [["A"], ["foo", "bar", "baz"]], names=["one", "two"] - ) - 
expected = Series(np.arange(3, dtype="int64"), index=exp_idx).sort_index() - - if expected_error is not None: - with pytest.raises(KeyError, match=expected_error): - s.loc[indexer] - else: - result = s.loc[indexer] - tm.assert_series_equal(result, expected) + expected = s.iloc[pos] + result = s.loc[indexer] + tm.assert_series_equal(result, expected) def test_series_loc_getitem_fancy(multiindex_year_month_day_dataframe_random_data): diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 421ca71428bcc7..692a86aa1a3382 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -117,11 +117,11 @@ def test_per_axis_per_level_getitem(self): with pytest.raises(ValueError): df.loc[(slice(None), np.array([True, False])), :] - # ambiguous cases - # these can be multiply interpreted (e.g. in this case - # as df.loc[slice(None),[1]] as well - with pytest.raises(KeyError, match=r"'\[1\] not in index'"): - df.loc[slice(None), [1]] + # ambiguous notation + # this is interpreted as slicing on both axes (GH #16396) + result = df.loc[slice(None), [1]] + expected = df.iloc[:, []] + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] From 3c78b2fa78e77d711722481646b6a3e0ce1b12cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Mon, 8 Jul 2019 15:47:07 +0300 Subject: [PATCH 184/238] DOC: Clarify column type for 'on' parameter of rolling (#27265) --- pandas/core/window.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 27588249b1b3c7..0c1f6a1a6dacee 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -513,8 +513,10 @@ class Window(_Window): Provide a window type. If ``None``, all points are evenly weighted. See the notes below for further information. on : str, optional - For a DataFrame, column on which to calculate - the rolling window, rather than the index. + For a DataFrame, a datetime-like column on which to calculate the rolling + window, rather than the DataFrame's index. Provided integer column is + ignored and excluded from result since an integer index is not used to + calculate the rolling window. axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or From c64c9cb44222a42f7b02d4d6007919cd0645f1be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Jul 2019 09:31:03 -0400 Subject: [PATCH 185/238] TST: Add comment and test for geopandas compat fix (GH27259) (#27287) --- pandas/core/indexing.py | 3 +++ pandas/tests/test_downstream.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3cb89777d6b719..e27f85eb6d0a4b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -123,6 +123,9 @@ def __getitem__(self, key): # generally slice or list. # TODO(ix): most/all of the TypeError cases here are for ix, # so this check can be removed once ix is removed. 
+ # The InvalidIndexError is only catched for compatibility + # with geopandas, see + # https://github.com/pandas-dev/pandas/issues/27258 pass else: if is_scalar(values): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index d644c002fbdfb6..93baafddedeb48 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -10,7 +10,7 @@ from pandas.compat import PY36 -from pandas import DataFrame +from pandas import DataFrame, Series from pandas.util import testing as tm @@ -123,6 +123,26 @@ def test_geopandas(): assert geopandas.read_file(fp) is not None +def test_geopandas_coordinate_indexer(): + # this test is included to have coverage of one case in the indexing.py + # code that is only kept for compatibility with geopandas, see + # https://github.com/pandas-dev/pandas/issues/27258 + # We should be able to remove this after some time when its usage is + # removed in geopandas + from pandas.core.indexing import _NDFrameIndexer + + class _CoordinateIndexer(_NDFrameIndexer): + def _getitem_tuple(self, tup): + obj = self.obj + xs, ys = tup + return obj[xs][ys] + + Series._create_indexer("cx", _CoordinateIndexer) + s = Series(range(5)) + res = s.cx[:, :] + tm.assert_series_equal(s, res) + + # Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_pyarrow(df): From 9240439d3828d5a56c1a320a8fe3995f3c520b59 Mon Sep 17 00:00:00 2001 From: Shorokhov Sergey Date: Tue, 9 Jul 2019 23:42:58 +0300 Subject: [PATCH 186/238] ENH: maybe_convert_objects seen NaT speed-up (#27300) --- asv_bench/benchmarks/algorithms.py | 15 +++++++++++++++ doc/source/whatsnew/v0.25.0.rst | 1 - pandas/_libs/lib.pyx | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 436093ef195ef7..7d97f2c740acb9 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -2,6 +2,8 @@ import numpy as np +from pandas._libs import lib + import pandas as pd from pandas.util import testing as tm @@ -13,6 +15,19 @@ pass +class MaybeConvertObjects: + def setup(self): + N = 10 ** 5 + + data = list(range(N)) + data[0] = pd.NaT + data = np.array(data) + self.data = data + + def time_maybe_convert_objects(self): + lib.maybe_convert_objects(self.data) + + class Factorize: params = [[True, False], ["int", "uint", "float", "string"]] diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 68ecb4c487a1e9..8c472cb3121d25 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -940,7 +940,6 @@ Performance improvements - For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`) - Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`). - .. 
_whatsnew_0250.bug_fixes: Bug fixes diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1df220029def62..1936404b756025 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1955,6 +1955,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen.timedelta_ = 1 if not (convert_datetime or convert_timedelta): seen.object_ = 1 + break elif util.is_bool_object(val): seen.bool_ = 1 bools[i] = val From 8efbe12148dc4ee6906f0e908f3bc0f9ed1c64eb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jul 2019 13:46:05 -0700 Subject: [PATCH 187/238] CLN: assorted cleanups (#27301) --- pandas/core/computation/expressions.py | 7 +++---- pandas/core/internals/blocks.py | 14 +++++++------- pandas/core/internals/managers.py | 6 ++++-- pandas/core/ops/__init__.py | 12 ++++++------ pandas/tests/computation/test_eval.py | 2 +- pandas/tests/io/json/test_pandas.py | 1 + pandas/tests/series/test_arithmetic.py | 11 ++--------- 7 files changed, 24 insertions(+), 29 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index dc4e6e85f6e7d5..ea614670802916 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -12,9 +12,10 @@ from pandas._config import get_option +from pandas._libs.lib import values_from_object + from pandas.core.dtypes.generic import ABCDataFrame -import pandas.core.common as com from pandas.core.computation.check import _NUMEXPR_INSTALLED if _NUMEXPR_INSTALLED: @@ -129,9 +130,7 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwa def _where_standard(cond, a, b): return np.where( - com.values_from_object(cond), - com.values_from_object(a), - com.values_from_object(b), + values_from_object(cond), values_from_object(a), values_from_object(b) ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bf6ebf1abe7605..5785dbfbd6cac9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -397,10 +397,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): raise ValueError("Limit must be an integer") if limit < 1: raise ValueError("Limit must be greater than 0") - if self.ndim > 2: - raise NotImplementedError( - "number of dimensions for 'fillna' is currently limited to 2" - ) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -853,6 +849,8 @@ def setitem(self, indexer, value): `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. 
""" + transpose = self.ndim == 2 + # coerce None values, if appropriate if value is None: if self.is_numeric: @@ -901,8 +899,8 @@ def setitem(self, indexer, value): dtype, _ = maybe_promote(arr_value.dtype) values = values.astype(dtype) - transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) - values = transf(values) + if transpose: + values = values.T # length checking check_setitem_lengths(indexer, value, values) @@ -961,7 +959,9 @@ def _is_empty_indexer(indexer): # coerce and try to infer the dtypes of the result values = self._try_coerce_and_cast_result(values, dtype) - block = self.make_block(transf(values)) + if transpose: + values = values.T + block = self.make_block(values) return block def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b3c74aaaa5701a..cd678a235cfc12 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -583,8 +583,9 @@ def astype(self, dtype, **kwargs): def convert(self, **kwargs): return self.apply("convert", **kwargs) - def replace(self, **kwargs): - return self.apply("replace", **kwargs) + def replace(self, value, **kwargs): + assert np.ndim(value) == 0, value + return self.apply("replace", value=value, **kwargs) def replace_list(self, src_list, dest_list, inplace=False, regex=False): """ do a list replace """ @@ -617,6 +618,7 @@ def comp(s, regex=False): # replace ALWAYS will return a list rb = [blk if inplace else blk.copy()] for i, (s, d) in enumerate(zip(src_list, dest_list)): + # TODO: assert/validate that `d` is always a scalar? new_rb = [] for b in rb: m = masks[i][b.mgr_locs.indexer] diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f9112dbb1e4ab3..d735ab3ad25353 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -11,7 +11,7 @@ import numpy as np -from pandas._libs import lib, ops as libops +from pandas._libs import Timedelta, Timestamp, lib, ops as libops from pandas.errors import NullFrequencyError from pandas.util._decorators import Appender @@ -87,7 +87,7 @@ def get_op_result_name(left, right): Usually a string """ # `left` is always a pd.Series when called from within ops - if isinstance(right, (ABCSeries, pd.Index)): + if isinstance(right, (ABCSeries, ABCIndexClass)): name = _maybe_match_name(left, right) else: name = left.name @@ -151,14 +151,14 @@ def maybe_upcast_for_op(obj): # GH#22390 cast up to Timedelta to rely on Timedelta # implementation; otherwise operation against numeric-dtype # raises TypeError - return pd.Timedelta(obj) + return Timedelta(obj) elif isinstance(obj, np.timedelta64) and not isna(obj): # In particular non-nanosecond timedelta64 needs to be cast to # nanoseconds, or else we get undesired behavior like # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') # The isna check is to avoid casting timedelta64("NaT"), which would # return NaT and incorrectly be treated as a datetime-NaT. 
- return pd.Timedelta(obj) + return Timedelta(obj) elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to @@ -1864,7 +1864,7 @@ def wrapper(self, other, axis=None): ) msg = "\n".join(textwrap.wrap(msg.format(future=future))) warnings.warn(msg, FutureWarning, stacklevel=2) - other = pd.Timestamp(other) + other = Timestamp(other) res_values = dispatch_to_index_op(op, self, other, pd.DatetimeIndex) @@ -1890,7 +1890,7 @@ def wrapper(self, other, axis=None): res_values, index=self.index, name=res_name ).rename(res_name) - elif isinstance(other, (np.ndarray, pd.Index)): + elif isinstance(other, (np.ndarray, ABCIndexClass)): # do not check length of zerodim array # as it will broadcast if other.ndim != 0 and len(self) != len(other): diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 37a885e33847f1..49d11f58ebe082 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -411,7 +411,7 @@ def check_single_invert_op(self, lhs, cmp1, rhs): ) def check_compound_invert_op(self, lhs, cmp1, rhs): - skip_these = "in", "not in" + skip_these = ["in", "not in"] ex = "~(lhs {0} rhs)".format(cmp1) msg = ( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 970fd465fd4eca..9c687f036aa684 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1432,6 +1432,7 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) + # TODO: there is a near-identical test for pytables; can we share? def test_latin_encoding(self): # GH 13774 pytest.skip("encoding not implemented in .to_json(), xref #13774") diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5b57b5ba2dbaec..89557445cafb42 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -24,7 +24,7 @@ class TestSeriesFlexArithmetic: ], ) @pytest.mark.parametrize( - "opname", ["add", "sub", "mul", "floordiv", "truediv", "div", "pow"] + "opname", ["add", "sub", "mul", "floordiv", "truediv", "pow"] ) def test_flex_method_equivalence(self, opname, ts): # check that Series.{opname} behaves like Series.__{opname}__, @@ -34,15 +34,8 @@ def test_flex_method_equivalence(self, opname, ts): other = ts[1](tser) check_reverse = ts[2] - if opname == "div": - pytest.skip("div test only for Py3") - op = getattr(Series, opname) - - if op == "div": - alt = operator.truediv - else: - alt = getattr(operator, opname) + alt = getattr(operator, opname) result = op(series, other) expected = alt(series, other) From e62b62baf34d37b0bd28cb000b09921b2c14c5f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jul 2019 13:48:31 -0700 Subject: [PATCH 188/238] CLN: checks instead of try/except (#27296) --- pandas/core/internals/blocks.py | 42 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5785dbfbd6cac9..36074e19240bd1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -3376,36 +3376,36 @@ def _putmask_smart(v, m, n): # will work in the current dtype try: nn = n[m] - + except TypeError: + # TypeError: only integer scalar arrays can be converted to a scalar index + pass + else: # make sure that we have a nullable 
type # if we have nulls if not _isna_compat(v, nn[0]): - raise ValueError - - # we ignore ComplexWarning here - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", np.ComplexWarning) - nn_at = nn.astype(v.dtype) - - # avoid invalid dtype comparisons - # between numbers & strings - - # only compare integers/floats - # don't compare integers to datetimelikes - if not is_numeric_v_string_like(nn, nn_at) and ( - is_float_dtype(nn.dtype) - or is_integer_dtype(nn.dtype) - and is_float_dtype(nn_at.dtype) - or is_integer_dtype(nn_at.dtype) - ): + pass + elif is_numeric_v_string_like(nn, v): + # avoid invalid dtype comparisons + # between numbers & strings + pass + elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): + # only compare integers/floats + pass + elif not (is_float_dtype(v.dtype) or is_integer_dtype(v.dtype)): + # only compare integers/floats + pass + else: + + # we ignore ComplexWarning here + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", np.ComplexWarning) + nn_at = nn.astype(v.dtype) comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() nv[m] = nn_at return nv - except (ValueError, IndexError, TypeError, OverflowError): - pass n = np.asarray(n) From e94639b8d9b45d1519f7b02fec7b7648a2458138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Tue, 9 Jul 2019 23:49:29 +0300 Subject: [PATCH 189/238] CLN: Remove unused vars in roll_window (#27294) --- pandas/_libs/window.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 6203577e450d95..46e4b17b8164cb 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1683,7 +1683,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] values, """ cdef: ndarray[float64_t] output, tot_wgt, counts - Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k + Py_ssize_t in_i, win_i, win_n, in_n float64_t val_in, val_win, c, w in_n = len(values) From 1e4260069ac1435c71e0acb30dfe06ea046fe97c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Jul 2019 16:50:35 -0400 Subject: [PATCH 190/238] STYLE: fix line length check of flake8 (#27307) --- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexing.py | 8 ++++---- pandas/io/parsers.py | 4 ++-- pandas/tests/io/json/test_normalize.py | 2 +- pandas/tests/util/test_hashing.py | 6 +++--- setup.cfg | 1 - 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 818d844ca79947..3cf358261e685c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -284,7 +284,7 @@ def __init__( if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = index._get_grouper_for_level( + self.grouper, self._labels, self._group_index = index._get_grouper_for_level( # noqa: E501 self.grouper, level ) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e27f85eb6d0a4b..612a857897a0c5 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1230,8 +1230,8 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): KeyError in the future, you can use .reindex() as an alternative. 
See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""" - ) # noqa + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""" # noqa: E501 + ) if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) @@ -1379,8 +1379,8 @@ class _IXIndexer(_NDFrameIndexer): .iloc for positional indexing See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""" - ) # noqa + http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""" # noqa: E501 + ) def __init__(self, name, obj): warnings.warn(self._ix_deprecation_warning, FutureWarning, stacklevel=2) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 78440939ebc01f..356934d457cc9c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1913,7 +1913,7 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( # noqa: E501 self._reader.header, self.index_names, self.col_names, passed_names ) else: @@ -2308,7 +2308,7 @@ def __init__(self, f, **kwds): # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( # noqa: E501 self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. 
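# (illustrative sketch, not from the pandas source) The convention this STYLE patch
# adopts: with E501 removed from the global flake8 ignore list (see the setup.cfg
# hunk later in this patch), a line that deliberately exceeds the 88-character
# limit carries an explicit per-line waiver instead of relying on a blanket ignore:
DOC_URL = "https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"  # noqa: E501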
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a32103d7b29b95..a625c912d1d8e2 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -684,7 +684,7 @@ def test_with_large_max_level(self): "CreatedBy.user.family_tree.father.name": "Father001", "CreatedBy.user.family_tree.father.father.Name": "Father002", "CreatedBy.user.family_tree.father.father.father.name": "Father003", - "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", + "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501 } ] output = nested_to_record(input_data, max_level=max_level) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 27a23180b269ad..df3c7fe9c9936e 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -338,9 +338,9 @@ def test_hash_collisions(): # # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 hashes = [ - "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa - "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", - ] # noqa + "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa: E501 + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", # noqa: E501 + ] # These should be different. 
result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") diff --git a/setup.cfg b/setup.cfg index 7549bfe2e325d2..e559ece2a759a5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,6 @@ parentdir_prefix = pandas- [flake8] max-line-length = 88 ignore = - E501, # longer line length E203, # space before : (needed for how black formats slicing) W503, # line break before binary operator W504, # line break after binary operator From afe4a1eeefa2f157d10bfd06f4c4e94b2b1e69e2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 9 Jul 2019 21:57:37 +0100 Subject: [PATCH 191/238] BUG: Incorrect Message in KeyError with MultiIndex (#27291) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/multi.py | 5 ++++- pandas/tests/indexing/multiindex/test_getitem.py | 6 +++--- pandas/tests/indexing/multiindex/test_loc.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8c472cb3121d25..23eba7fb5b73c7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1039,6 +1039,7 @@ Indexing - Improved exception message when calling :meth:`DataFrame.iloc` with a list of non-numeric objects (:issue:`25753`). - Improved exception message when calling ``.iloc`` or ``.loc`` with a boolean indexer with different length (:issue:`26658`). +- Bug in ``KeyError`` exception message when indexing a :class:`MultiIndex` with a non-existant key not displaying the original key (:issue:`27250`). - Bug in ``.iloc`` and ``.loc`` with a boolean indexer not raising an ``IndexError`` when too few items are passed (:issue:`26658`). - Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`). - Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). 
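# (illustrative sketch, not from the pandas source; index values made up) What the
# whatsnew entry above means in practice when a complete but missing key is looked
# up in a unique MultiIndex:
import pandas as pd

s = pd.Series(
    range(3),
    index=pd.MultiIndex.from_tuples([(2000, 3, 1), (2000, 3, 2), (2000, 3, 3)]),
)
s.loc[(2000, 3, 4)]
# after this patch:  KeyError: (2000, 3, 4)
# before:            the message showed an internal lookup value (e.g. "356" in the
#                    tests below) rather than the key that was passed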
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 71b551adaf3ef1..ff0bffacd37ad3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2810,7 +2810,10 @@ def partial_selection(key, indexer=None): if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc - return (self._engine.get_loc(key), None) + try: + return (self._engine.get_loc(key), None) + except KeyError as e: + raise KeyError(key) from e else: return partial_selection(key) else: diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 0c61644eb46aee..145bfe168390e0 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -83,9 +83,9 @@ def test_series_getitem_returns_scalar( @pytest.mark.parametrize( "indexer,expected_error,expected_error_msg", [ - (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356$"), - (lambda s: s[(2000, 3, 4)], KeyError, r"^356$"), - (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^356$"), + (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), (lambda s: s[len(s)], IndexError, "index out of bounds"), diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index d92cc00af6fce8..a08b2b4c66af2d 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -382,7 +382,7 @@ def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data # test setup - check key not in dataframe - with pytest.raises(KeyError, match=r"^11$"): + with pytest.raises(KeyError, match=r"^\('bar', 'three'\)$"): df.loc[("bar", "three"), "B"] # in theory should be inserting in a sorted space???? 
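# (illustrative sketch, not from the pandas source; the helper name is hypothetical)
# The shape of the fix above: re-raise with the caller's key so the message is
# meaningful, while "from e" keeps the low-level engine error chained as __cause__.
def get_loc_friendly(mapping, key):
    try:
        return mapping[key]  # stand-in for the low-level engine lookup
    except KeyError as e:
        raise KeyError(key) from e

get_loc_friendly({("bar", "one"): 0}, ("bar", "three"))  # KeyError: ('bar', 'three')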
From 967bd957195fa0c1331e4bd24d56f3c0042f304a Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 9 Jul 2019 14:00:11 -0700 Subject: [PATCH 192/238] PERF: Suppress ix warnings benchmarks (#27304) --- asv_bench/benchmarks/frame_methods.py | 18 ++++++---- asv_bench/benchmarks/indexing.py | 48 ++++++++++++++++----------- asv_bench/benchmarks/io/msgpack.py | 4 ++- pandas/core/generic.py | 1 + 4 files changed, 44 insertions(+), 27 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index af4741f94d2943..5008b77d9fb28b 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,3 +1,4 @@ +import warnings import string import numpy as np @@ -320,9 +321,10 @@ class Dropna: def setup(self, how, axis): self.df = DataFrame(np.random.randn(10000, 1000)) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan + with warnings.catch_warnings(record=True): + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed["foo"] = "bar" @@ -340,9 +342,10 @@ class Count: def setup(self, axis): self.df = DataFrame(np.random.randn(10000, 1000)) - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan + with warnings.catch_warnings(record=True): + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed["foo"] = "bar" @@ -561,7 +564,8 @@ def setup(self): self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): - self.df.get_dtype_counts() + with warnings.catch_warnings(record=True): + self.df.get_dtype_counts() def time_info(self): self.df.info() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index eb730f91b10b31..720bd0245be417 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -67,16 +67,20 @@ def time_iloc_slice(self, index, index_structure): self.data.iloc[:800000] def time_ix_array(self, index, index_structure): - self.data.ix[self.array] + with warnings.catch_warnings(record=True): + self.data.ix[self.array] def time_ix_list_like(self, index, index_structure): - self.data.ix[[800000]] + with warnings.catch_warnings(record=True): + self.data.ix[[800000]] def time_ix_scalar(self, index, index_structure): - self.data.ix[800000] + with warnings.catch_warnings(record=True): + self.data.ix[800000] def time_ix_slice(self, index, index_structure): - self.data.ix[:800000] + with warnings.catch_warnings(record=True): + self.data.ix[:800000] def time_loc_array(self, index, index_structure): self.data.loc[self.array] @@ -140,7 +144,8 @@ class DataFrameStringIndexing: def setup(self): index = tm.makeStringIndex(1000) columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) + with warnings.catch_warnings(record=True): + self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) self.idx_scalar = index[100] self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 @@ -151,7 +156,8 @@ def time_get_value(self): self.df.get_value(self.idx_scalar, self.col_scalar) def time_ix(self): - self.df.ix[self.idx_scalar, self.col_scalar] + with warnings.catch_warnings(record=True): + self.df.ix[self.idx_scalar, self.col_scalar] def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ 
-215,24 +221,27 @@ def setup(self): self.df = DataFrame(self.s) n = 100000 - self.mdt = DataFrame( - { - "A": np.random.choice(range(10000, 45000, 1000), n), - "B": np.random.choice(range(10, 400), n), - "C": np.random.choice(range(1, 150), n), - "D": np.random.choice(range(10000, 45000), n), - "x": np.random.choice(range(400), n), - "y": np.random.choice(range(25), n), - } - ) + with warnings.catch_warnings(record=True): + self.mdt = DataFrame( + { + "A": np.random.choice(range(10000, 45000, 1000), n), + "B": np.random.choice(range(10, 400), n), + "C": np.random.choice(range(1, 150), n), + "D": np.random.choice(range(10000, 45000), n), + "x": np.random.choice(range(400), n), + "y": np.random.choice(range(25), n), + } + ) self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() def time_series_ix(self): - self.s.ix[999] + with warnings.catch_warnings(record=True): + self.s.ix[999] def time_frame_ix(self): - self.df.ix[999] + with warnings.catch_warnings(record=True): + self.df.ix[999] def time_index_slice(self): self.mdt.loc[self.idx, :] @@ -309,7 +318,8 @@ def time_lookup_iloc(self, s): s.iloc def time_lookup_ix(self, s): - s.ix + with warnings.catch_warnings(record=True): + s.ix def time_lookup_loc(self, s): s.loc diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index c43df7c2e91eda..d97b4ae13f0bd5 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -1,3 +1,4 @@ +import warnings import numpy as np from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm @@ -16,7 +17,8 @@ def setup(self): index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) - self.df.to_msgpack(self.fname) + with warnings.catch_warnings(record=True): + self.df.to_msgpack(self.fname) def time_read_msgpack(self): read_msgpack(self.fname) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b79bde9cc3cb14..5db06d32880ccc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5603,6 +5603,7 @@ def get_dtype_counts(self): FutureWarning, stacklevel=2, ) + from pandas import Series return Series(self._data.get_dtype_counts()) From fc86f8deb4c3e35d32e5767ae3993d5b6c54193c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jul 2019 14:13:27 -0700 Subject: [PATCH 193/238] BUG: Fix CategoricalIndex.__contains__ with non-hashable, closes #21729 (#27284) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/categorical.py | 2 +- pandas/core/frame.py | 9 +++------ pandas/core/indexes/category.py | 2 +- pandas/tests/arrays/categorical/test_operators.py | 12 ++++++++++++ pandas/tests/indexes/test_category.py | 12 ++++++++++++ 6 files changed, 30 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 23eba7fb5b73c7..4908bf6495d61b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1050,6 +1050,7 @@ Indexing - Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) - Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly 
returning a scalar instead of a :class:`Series` (:issue:`27110`) +- Bug in :class:`CategoricalIndex` and :class:`Categorical` incorrectly raising ``ValueError`` instead of ``TypeError`` when a list is passed using the ``in`` operator (``__contains__``) (:issue:`21729`) - Missing diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c4f7d6dbe32fa6..df5cd12a479f02 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2020,7 +2020,7 @@ def __contains__(self, key): Returns True if `key` is in this Categorical. """ # if key is a NaN, check if any NaN is in self. - if isna(key): + if is_scalar(key) and isna(key): return self.isna().any() return contains(self, key, container=self._codes) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1989fd62b6ee0..ce1b99b3159362 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -60,6 +60,7 @@ is_extension_array_dtype, is_extension_type, is_float_dtype, + is_hashable, is_integer, is_integer_dtype, is_iterator, @@ -2954,16 +2955,12 @@ def __getitem__(self, key): key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) - # shortcut if the key is in columns - try: + if is_hashable(key): + # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: if self.columns.nlevels > 1: return self._getitem_multilevel(key) return self._get_item_cache(key) - except (TypeError, ValueError): - # The TypeError correctly catches non hashable "key" (e.g. list) - # The ValueError can be removed once GH #21729 is fixed - pass # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9550d68f1d32bf..8f605e487ecf49 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -407,7 +407,7 @@ def _reverse_indexer(self): @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): # if key is a NaN, check if any NaN is in self. 
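# (illustrative sketch, not from the pandas source) Effect of the is_scalar() guard
# added just below, matching the new tests in this patch: a non-hashable probe now
# fails with TypeError instead of the ValueError that isna() used to trigger.
import pandas as pd

cat = pd.Categorical([1, 2, 3])
"a" in cat          # False
["a", "b"] in cat   # TypeError: unhashable type: 'list'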
- if isna(key): + if is_scalar(key) and isna(key): return self.hasnans return contains(self, key, container=self._engine) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 697ee483db6d96..9a09ea8422b1fc 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -417,3 +417,15 @@ def test_contains_interval(self, item, expected): cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) result = item in cat assert result is expected + + def test_contains_list(self): + # GH#21729 + cat = Categorical([1, 2, 3]) + + assert "a" not in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in cat diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d52bc818c95aaa..2b9632acd83cac 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -276,6 +276,18 @@ def test_contains_interval(self, item, expected): result = item in ci assert result is expected + def test_contains_list(self): + # GH#21729 + idx = pd.CategoricalIndex([1, 2, 3]) + + assert "a" not in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in idx + def test_map(self): ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) result = ci.map(lambda x: x.lower()) From dc5a848b9d26c787a4eb6a5e1de9cc4cba21feb4 Mon Sep 17 00:00:00 2001 From: Wenhuan Date: Wed, 10 Jul 2019 05:29:22 +0800 Subject: [PATCH 194/238] TST: add tests to validate margin results for pivot (#25815) (#27245) thanks @peterpanmj --- pandas/tests/reshape/test_pivot.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d3300ffb01c3ac..be82e7f595f8cc 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -286,6 +286,32 @@ def test_pivot_with_interval_index(self, interval_values, dropna): expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) tm.assert_frame_equal(result, expected) + def test_pivot_with_interval_index_margins(self): + # GH 25815 + ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2]) + df = DataFrame( + { + "A": np.arange(4, 0, -1, dtype=np.intp), + "B": ["a", "b", "a", "b"], + "C": pd.Categorical(ordered_cat, ordered=True).sort_values( + ascending=False + ), + } + ) + + pivot_tab = pd.pivot_table( + df, index="C", columns="B", values="A", aggfunc="sum", margins=True + ) + + result = pivot_tab["All"] + expected = Series( + [3, 7, 10], + index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"), + name="All", + dtype=np.intp, + ) + tm.assert_series_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) expected = self.data.pivot_table("D", index="A", columns="C") From f38020f33052ea9029b410d7fae79bc8f249c0ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jul 2019 14:54:25 -0700 Subject: [PATCH 195/238] TST: parametrize sparse array arithmetic tests (#27271) --- pandas/conftest.py | 30 ++ .../tests/arrays/sparse/test_arithmetics.py | 472 ++++++++---------- 2 files changed, 249 insertions(+), 253 deletions(-) diff --git a/pandas/conftest.py 
b/pandas/conftest.py index 29833ab2fc0fa5..ef2758d263e1aa 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,5 +1,6 @@ from datetime import date, time, timedelta, timezone from decimal import Decimal +import operator import os from dateutil.tz import tzlocal, tzutc @@ -13,6 +14,7 @@ import pandas as pd from pandas import DataFrame +from pandas.core import ops import pandas.util.testing as tm hypothesis.settings.register_profile( @@ -163,6 +165,34 @@ def all_arithmetic_operators(request): return request.param +@pytest.fixture( + params=[ + operator.add, + ops.radd, + operator.sub, + ops.rsub, + operator.mul, + ops.rmul, + operator.truediv, + ops.rtruediv, + operator.floordiv, + ops.rfloordiv, + operator.mod, + ops.rmod, + operator.pow, + ops.rpow, + ] +) +def all_arithmetic_functions(request): + """ + Fixture for operator and roperator arithmetic functions. + + Note: This includes divmod and rdivmod, whereas all_arithmetic_operators + does not. + """ + return request.param + + _all_numeric_reductions = [ "sum", "max", diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 7bfedff2177197..0f8f3d261c3b36 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -4,10 +4,23 @@ import pytest import pandas as pd +from pandas.core import ops from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm +@pytest.fixture(params=["integer", "block"]) +def kind(request): + """kind kwarg to pass to SparseArray/SparseSeries""" + return request.param + + +@pytest.fixture(params=[True, False]) +def mix(request): + # whether to operate op(sparse, dense) instead of op(sparse, sparse) + return request.param + + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseArrayArithmetics: @@ -18,60 +31,25 @@ class TestSparseArrayArithmetics: def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) - def _check_numeric_ops(self, a, b, a_dense, b_dense): + def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op): with np.errstate(invalid="ignore", divide="ignore"): - # Unfortunately, trying to wrap the computation of each expected - # value is with np.errstate() is too tedious. 
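# (illustrative sketch; only the fixture itself is taken from the diff above) How the
# new conftest fixtures drive the tests: pytest expands each fixture param into its
# own collected test case, which is what allows the explicit for-loops further below
# to be deleted.
import pytest

@pytest.fixture(params=["integer", "block"])
def kind(request):
    """kind kwarg to pass to SparseArray/SparseSeries"""
    return request.param

def test_example(kind):
    # collected twice: test_example[integer] and test_example[block]
    assert kind in ("integer", "block")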
- - # sparse & sparse - self._assert((a + b).to_dense(), a_dense + b_dense) - self._assert((b + a).to_dense(), b_dense + a_dense) - - self._assert((a - b).to_dense(), a_dense - b_dense) - self._assert((b - a).to_dense(), b_dense - a_dense) - - self._assert((a * b).to_dense(), a_dense * b_dense) - self._assert((b * a).to_dense(), b_dense * a_dense) - - # pandas uses future division - self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense) - self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) - - # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): - self._assert((a // b).to_dense(), a_dense // b_dense) - self._assert((b // a).to_dense(), b_dense // a_dense) - - self._assert((a % b).to_dense(), a_dense % b_dense) - self._assert((b % a).to_dense(), b_dense % a_dense) - - self._assert((a ** b).to_dense(), a_dense ** b_dense) - self._assert((b ** a).to_dense(), b_dense ** a_dense) - - # sparse & dense - self._assert((a + b_dense).to_dense(), a_dense + b_dense) - self._assert((b_dense + a).to_dense(), b_dense + a_dense) - - self._assert((a - b_dense).to_dense(), a_dense - b_dense) - self._assert((b_dense - a).to_dense(), b_dense - a_dense) - - self._assert((a * b_dense).to_dense(), a_dense * b_dense) - self._assert((b_dense * a).to_dense(), b_dense * a_dense) + if op in [operator.floordiv, ops.rfloordiv]: + # FIXME: GH#13843 + if self._base == pd.Series and a.dtype.subtype == np.dtype("int64"): + pytest.xfail("Not defined/working. See GH#13843") - # pandas uses future division - self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense) - self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) + if mix: + result = op(a, b_dense).to_dense() + else: + result = op(a, b).to_dense() - # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): - self._assert((a // b_dense).to_dense(), a_dense // b_dense) - self._assert((b_dense // a).to_dense(), b_dense // a_dense) + if op in [operator.truediv, ops.rtruediv]: + # pandas uses future division + expected = op(a_dense * 1.0, b_dense) + else: + expected = op(a_dense, b_dense) - self._assert((a % b_dense).to_dense(), a_dense % b_dense) - self._assert((b_dense % a).to_dense(), b_dense % a_dense) - - self._assert((a ** b_dense).to_dense(), a_dense ** b_dense) - self._assert((b_dense ** a).to_dense(), b_dense ** a_dense) + self._assert(result, expected) def _check_bool_result(self, res): assert isinstance(res, self._klass) @@ -136,289 +114,275 @@ def _check_logical_ops(self, a, b, a_dense, b_dense): self._check_bool_result(a | b_dense) self._assert((a | b_dense).to_dense(), a_dense | b_dense) - def test_float_scalar(self): + @pytest.mark.parametrize("scalar", [0, 1, 3]) + @pytest.mark.parametrize("fill_value", [None, 0, 2]) + def test_float_scalar( + self, kind, mix, all_arithmetic_functions, fill_value, scalar + ): + op = all_arithmetic_functions values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = self._klass(values, kind=kind, fill_value=0) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = self._klass(values, kind=kind, fill_value=2) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - 
self._check_numeric_ops(a, 3, values, 3) + a = self._klass(values, kind=kind, fill_value=fill_value) + self._check_numeric_ops(a, scalar, values, scalar, mix, op) - def test_float_scalar_comparison(self): + def test_float_scalar_comparison(self, kind): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) + a = self._klass(values, kind=kind) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) - a = self._klass(values, kind=kind, fill_value=0) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) + a = self._klass(values, kind=kind, fill_value=0) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) - a = self._klass(values, kind=kind, fill_value=2) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) + a = self._klass(values, kind=kind, fill_value=2) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) - def test_float_same_index(self): + def test_float_same_index(self, kind, mix, all_arithmetic_functions): # when sp_index are the same - for kind in ["integer", "block"]: - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + op = all_arithmetic_functions + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) - rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_float_same_index_comparison(self): + def test_float_same_index_comparison(self, kind): # when sp_index are the same - for kind in ["integer", "block"]: - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) - values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 
0.0, 1.0, 2.0, 1.0, 0.0]) - rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + def test_float_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions - def test_float_array(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) + def test_float_array_different_kind(self, mix, all_arithmetic_functions): + op = all_arithmetic_functions - def test_float_array_different_kind(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) a = self._klass(values, kind="integer") b = self._klass(rvalues, kind="block") - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) a = self._klass(values, kind="integer", fill_value=0) b = self._klass(rvalues, kind="block") - self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b, values, rvalues, mix, op) a = self._klass(values, kind="integer", fill_value=0) b = self._klass(rvalues, kind="block", fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b, values, rvalues, mix, op) a = self._klass(values, kind="integer", fill_value=1) b = self._klass(rvalues, kind="block", fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_float_array_comparison(self): + def test_float_array_comparison(self, kind): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = 
self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_int_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions - def test_int_array(self): # have to specify dtype explicitly until fixing GH 667 dtype = np.int64 values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ["integer", "block"]: - a = self._klass(values, dtype=dtype, kind=kind) - assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, dtype=dtype, kind=kind) - assert b.dtype == SparseDtype(dtype) - - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) + a = self._klass(values, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) - a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) - assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, dtype=dtype, kind=kind) - assert b.dtype == SparseDtype(dtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) - a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) - assert a.dtype == SparseDtype(dtype) - b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) - assert b.dtype == SparseDtype(dtype) - self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) - assert a.dtype == SparseDtype(dtype, fill_value=1) - b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) - assert b.dtype == SparseDtype(dtype, fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) + assert b.dtype == 
SparseDtype(dtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_int_array_comparison(self): + a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype, fill_value=1) + b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + def test_int_array_comparison(self, kind): + dtype = "int64" # int32 NI ATM - for dtype in ["int64"]: - values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ["integer", "block"]: - a = self._klass(values, dtype=dtype, kind=kind) - b = self._klass(rvalues, dtype=dtype, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + a = self._klass(values, dtype=dtype, kind=kind) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) - b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) - def test_bool_same_index(self): + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_same_index(self, kind, fill_value): # GH 14000 # when sp_index are the same - for kind in ["integer", "block"]: - values = self._base([True, False, True, True], dtype=np.bool) - rvalues = self._base([True, False, True, True], dtype=np.bool) - - for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) - b = self._klass( - rvalues, kind=kind, dtype=np.bool, fill_value=fill_value - ) - self._check_logical_ops(a, b, values, rvalues) - - def test_bool_array_logical(self): + values = self._base([True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, True, True], dtype=np.bool) + + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_array_logical(self, kind, fill_value): # GH 14000 # when sp_index are the same - for kind in ["integer", "block"]: - values = self._base([True, False, True, False, True, True], dtype=np.bool) 
- rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) + values = self._base([True, False, True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) - for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) - b = self._klass( - rvalues, kind=kind, dtype=np.bool, fill_value=fill_value - ) - self._check_logical_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) - def test_mixed_array_float_int(self): + def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions - for rdtype in ["int64"]: - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + rdtype = "int64" - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - assert b.dtype == SparseDtype(rdtype) + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) - assert b.dtype == SparseDtype(rdtype) - self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - assert b.dtype == SparseDtype(rdtype) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) - assert b.dtype == SparseDtype(rdtype, fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_mixed_array_comparison(self): + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + def test_mixed_array_comparison(self, kind): + rdtype = "int64" # int32 NI ATM - for rdtype in ["int64"]: - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - for kind in ["integer", "block"]: - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - assert b.dtype == SparseDtype(rdtype) + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) - self._check_comparison_ops(a, 
b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind) - assert b.dtype == SparseDtype(rdtype) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=0) - b = self._klass(rvalues, kind=kind, fill_value=0) - assert b.dtype == SparseDtype(rdtype) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) - a = self._klass(values, kind=kind, fill_value=1) - b = self._klass(rvalues, kind=kind, fill_value=2) - assert b.dtype == SparseDtype(rdtype, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): @@ -429,7 +393,9 @@ class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): def _assert(self, a, b): tm.assert_series_equal(a, b) - def test_alignment(self): + def test_alignment(self, mix, all_arithmetic_functions): + op = all_arithmetic_functions + da = pd.Series(np.arange(4)) db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) @@ -437,13 +403,13 @@ def test_alignment(self): sb = pd.SparseSeries( np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=0 ) - self._check_numeric_ops(sa, sb, da, db) + self._check_numeric_ops(sa, sb, da, db, mix, op) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) sb = pd.SparseSeries( np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=np.nan ) - self._check_numeric_ops(sa, sb, da, db) + self._check_numeric_ops(sa, sb, da, db, mix, op) da = pd.Series(np.arange(4)) db = pd.Series(np.arange(4), index=[10, 11, 12, 13]) @@ -452,13 +418,13 @@ def test_alignment(self): sb = pd.SparseSeries( np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=0 ) - self._check_numeric_ops(sa, sb, da, db) + self._check_numeric_ops(sa, sb, da, db, mix, op) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) sb = pd.SparseSeries( np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=np.nan ) - self._check_numeric_ops(sa, sb, da, db) + self._check_numeric_ops(sa, sb, da, db, mix, op) @pytest.mark.parametrize("op", [operator.eq, operator.add]) From c74a853add15425cf44e6c6943ade28eb3240d19 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 9 Jul 2019 22:29:52 -0600 Subject: [PATCH 196/238] DOC: Small whatsnew fixes (#27289) --- doc/source/whatsnew/v0.25.0.rst | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4908bf6495d61b..daca08d69346d6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -190,7 +190,7 @@ Other enhancements - Added support for ISO week year format ('%G-%V-%u') when parsing datetimes using :meth:`to_datetime` (:issue:`16607`) - Indexing of 
``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) -- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) +- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :class:`datetime.time` objects with timezones (:issue:`24043`) - :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. (:issue:`24923`) - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) @@ -406,9 +406,8 @@ Previously, columns that were categorical, but not the groupby key(s) would be c .. ipython:: python - df = pd.DataFrame( - {'payload': [-1, -2, -1, -2], - 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + cat = pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True) + df = pd.DataFrame({'payload': [-1, -2, -1, -2], 'col': cat}) df df.dtypes @@ -879,7 +878,7 @@ Other deprecations :meth:`SparseArray.get_values` and :meth:`Categorical.get_values` methods are deprecated. One of ``np.asarray(..)`` or :meth:`~Series.to_numpy` can be used instead (:issue:`19617`). - The 'outer' method on NumPy ufuncs, e.g. ``np.subtract.outer`` has been deprecated on :class:`Series` objects. Convert the input to an array with :attr:`Series.array` first (:issue:`27186`) -- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`timedelta.resolution` (:issue:`21344`) +- :meth:`Timedelta.resolution` is deprecated and replaced with :meth:`Timedelta.resolution_string`. In a future version, :meth:`Timedelta.resolution` will be changed to behave like the standard library :attr:`datetime.timedelta.resolution` (:issue:`21344`) - :func:`read_table` has been undeprecated. (:issue:`25220`) - :attr:`Index.dtype_str` is deprecated. (:issue:`18262`) - :attr:`Series.imag` and :attr:`Series.real` are deprecated. 
(:issue:`18262`) @@ -902,7 +901,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``convert_objects`` (:issue:`11221`) - Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`) - Removed the previously deprecated behavior of :class:`Series` treated as list-like in :meth:`~Series.cat.rename_categories` (:issue:`17982`) -- Removed the previously deprecated ``DataFrame.reindex_axis`` and ``Series.reindex_axis``` (:issue:`17842`) +- Removed the previously deprecated ``DataFrame.reindex_axis`` and ``Series.reindex_axis`` (:issue:`17842`) - Removed the previously deprecated behavior of altering column or index labels with :meth:`Series.rename_axis` or :meth:`DataFrame.rename_axis` (:issue:`17842`) - Removed the previously deprecated ``tupleize_cols`` keyword argument in :meth:`read_html`, :meth:`read_csv`, and :meth:`DataFrame.to_csv` (:issue:`17877`, :issue:`17820`) - Removed the previously deprecated ``DataFrame.from.csv`` and ``Series.from_csv`` (:issue:`17812`) @@ -910,7 +909,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``ordered`` and ``categories`` keyword arguments in ``astype`` (:issue:`17742`) - Removed the previously deprecated ``cdate_range`` (:issue:`17691`) - Removed the previously deprecated ``True`` option for the ``dropna`` keyword argument in :func:`SeriesGroupBy.nth` (:issue:`17493`) -- Removed the previously deprecated ``convert`` keyword argument in :meth:`Series.take` and :meth:`DataFrame.take`(:issue:`17352`) +- Removed the previously deprecated ``convert`` keyword argument in :meth:`Series.take` and :meth:`DataFrame.take` (:issue:`17352`) .. _whatsnew_0250.performance: @@ -1134,7 +1133,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) - Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) -- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window`` (:issue:`26005`) +- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) Reshaping ^^^^^^^^^ From 4edf938aedf55b9e6fbfb3199f70f857e8ec7e41 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 10 Jul 2019 17:17:54 +0100 Subject: [PATCH 197/238] TST/CLN: replace %s formatting syntax with .format in tests (#27324) --- .../tests/arrays/categorical/test_dtypes.py | 10 ++- pandas/tests/arrays/sparse/test_libsparse.py | 2 +- pandas/tests/computation/test_eval.py | 6 +- pandas/tests/dtypes/test_inference.py | 2 +- pandas/tests/frame/test_alter_axes.py | 4 +- pandas/tests/frame/test_api.py | 8 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_query_eval.py | 6 +- pandas/tests/frame/test_repr_info.py | 2 +- pandas/tests/frame/test_timeseries.py | 4 +- pandas/tests/frame/test_to_csv.py | 2 +- 
pandas/tests/groupby/aggregate/test_other.py | 3 +- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_bin_groupby.py | 2 +- pandas/tests/groupby/test_counting.py | 7 +- pandas/tests/indexes/datetimelike.py | 4 +- .../tests/indexes/datetimes/test_datetime.py | 2 +- pandas/tests/indexes/multi/test_integrity.py | 4 +- .../tests/indexes/period/test_construction.py | 2 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/indexes/test_common.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/indexing/common.py | 6 +- pandas/tests/indexing/test_iloc.py | 4 +- pandas/tests/indexing/test_ix.py | 8 +- pandas/tests/internals/test_internals.py | 4 +- pandas/tests/io/excel/test_readers.py | 2 +- pandas/tests/io/excel/test_style.py | 4 +- pandas/tests/io/formats/test_style.py | 2 +- pandas/tests/io/formats/test_to_latex.py | 4 +- .../tests/io/generate_legacy_storage_files.py | 4 +- pandas/tests/io/msgpack/test_case.py | 6 +- pandas/tests/io/msgpack/test_extension.py | 2 +- pandas/tests/io/parser/test_common.py | 6 +- pandas/tests/io/parser/test_multi_thread.py | 4 +- pandas/tests/io/parser/test_parse_dates.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 8 +- pandas/tests/io/pytables/test_pytables.py | 86 +++++++++++-------- pandas/tests/io/test_html.py | 4 +- pandas/tests/io/test_packers.py | 2 +- pandas/tests/io/test_pickle.py | 8 +- pandas/tests/io/test_sql.py | 50 +++++++---- pandas/tests/plotting/test_series.py | 4 +- pandas/tests/resample/test_period_index.py | 12 ++- pandas/tests/reshape/merge/test_join.py | 8 +- pandas/tests/reshape/test_melt.py | 8 +- pandas/tests/reshape/test_reshape.py | 3 +- pandas/tests/scalar/period/test_period.py | 8 +- .../tests/scalar/timestamp/test_timestamp.py | 4 +- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_api.py | 4 +- pandas/tests/sparse/frame/test_frame.py | 2 +- pandas/tests/sparse/series/test_series.py | 4 +- pandas/tests/test_expressions.py | 6 +- pandas/tests/test_nanops.py | 16 ++-- pandas/tests/test_window.py | 2 +- .../tseries/frequencies/test_inference.py | 2 +- pandas/tests/tseries/offsets/common.py | 12 ++- pandas/tests/tseries/offsets/test_fiscal.py | 8 +- pandas/tests/tseries/offsets/test_offsets.py | 8 +- 61 files changed, 239 insertions(+), 172 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index be64b1f28c733d..c08ad1da386718 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -92,20 +92,22 @@ def test_codes_dtypes(self): result = Categorical(["foo", "bar", "baz"]) assert result.codes.dtype == "int8" - result = Categorical(["foo%05d" % i for i in range(400)]) + result = Categorical(["foo{i:05d}".format(i=i) for i in range(400)]) assert result.codes.dtype == "int16" - result = Categorical(["foo%05d" % i for i in range(40000)]) + result = Categorical(["foo{i:05d}".format(i=i) for i in range(40000)]) assert result.codes.dtype == "int32" # adding cats result = Categorical(["foo", "bar", "baz"]) assert result.codes.dtype == "int8" - result = result.add_categories(["foo%05d" % i for i in range(400)]) + result = result.add_categories(["foo{i:05d}".format(i=i) for i in range(400)]) assert result.codes.dtype == "int16" # removing cats - result = result.remove_categories(["foo%05d" % i for i in range(300)]) + result = result.remove_categories( + ["foo{i:05d}".format(i=i) for i in range(300)] + 
) assert result.codes.dtype == "int8" @pytest.mark.parametrize("ordered", [True, False]) diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 183eaada16452e..a6836c58348b3a 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -596,6 +596,6 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"]) def test_op(self, opname): - sparse_op = getattr(splib, "sparse_%s_float64" % opname) + sparse_op = getattr(splib, "sparse_{opname}_float64".format(opname=opname)) python_op = getattr(operator, opname) self._op_tests(sparse_op, python_op) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 49d11f58ebe082..8c0930c044838f 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -736,16 +736,16 @@ def test_float_truncation(self): df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) cutoff = 1000000000.0006 - result = df.query("A < %.4f" % cutoff) + result = df.query("A < {cutoff:.4f}".format(cutoff=cutoff)) assert result.empty cutoff = 1000000000.0010 - result = df.query("A > %.4f" % cutoff) + result = df.query("A > {cutoff:.4f}".format(cutoff=cutoff)) expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = df.query("A == %.4f" % exact) + result = df.query("A == {exact:.4f}".format(exact=exact)) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 6824266c9282ba..0b440e0186fbca 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1349,7 +1349,7 @@ def test_is_scalar_pandas_containers(self): def test_datetimeindex_from_empty_datetime64_array(): for unit in ["ms", "us", "ns"]: - idx = DatetimeIndex(np.array([], dtype="datetime64[%s]" % unit)) + idx = DatetimeIndex(np.array([], dtype="datetime64[{unit}]".format(unit=unit))) assert len(idx) == 0 diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 912e8b5fba2336..c57b2a6964f39e 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -342,7 +342,7 @@ def __init__(self, name, color): self.color = color def __str__(self): - return "<Thing %r>" % (self.name,) + return "<Thing {self.name!r}>".format(self=self) # necessary for pretty KeyError __repr__ = __str__ @@ -419,7 +419,7 @@ def __init__(self, name, color): self.color = color def __str__(self): - return "<Thing %r>" % (self.name,) + return "<Thing {self.name!r}>".format(self=self) thing1 = Thing("One", "red") thing2 = Thing("Two", "blue") diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 76a210e129eb3b..fe59f0574fb75a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -74,19 +74,19 @@ def test_get_value(self, float_frame): def test_add_prefix_suffix(self, float_frame): with_prefix = float_frame.add_prefix("foo#") - expected = pd.Index(["foo#%s" % c for c in float_frame.columns]) + expected = pd.Index(["foo#{c}".format(c=c) for c in float_frame.columns]) tm.assert_index_equal(with_prefix.columns, expected) with_suffix = float_frame.add_suffix("#foo") - expected = pd.Index(["%s#foo" % c for c in float_frame.columns]) + expected = pd.Index(["{c}#foo".format(c=c) for c in float_frame.columns])
tm.assert_index_equal(with_suffix.columns, expected) with_pct_prefix = float_frame.add_prefix("%") - expected = pd.Index(["%{}".format(c) for c in float_frame.columns]) + expected = pd.Index(["%{c}".format(c=c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_prefix.columns, expected) with_pct_suffix = float_frame.add_suffix("%") - expected = pd.Index(["{}%".format(c) for c in float_frame.columns]) + expected = pd.Index(["{c}%".format(c=c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_suffix.columns, expected) def test_get_axis(self, float_frame): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index eca827f82e2969..a3817d3c226f53 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -264,7 +264,7 @@ def test_constructor_ordereddict(self): nitems = 100 nums = list(range(nitems)) random.shuffle(nums) - expected = ["A%d" % i for i in nums] + expected = ["A{i:d}".format(i=i) for i in nums] df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) assert expected == list(df.columns) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0781e20a71940d..82c197ac054f0b 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -453,7 +453,9 @@ def test_date_query_with_non_date(self): for op in ["<", ">", "<=", ">="]: with pytest.raises(TypeError): - df.query("dates %s nondate" % op, parser=parser, engine=engine) + df.query( + "dates {op} nondate".format(op=op), parser=parser, engine=engine + ) def test_query_syntax_error(self): engine, parser = self.engine, self.parser @@ -688,7 +690,7 @@ def test_inf(self): ops = "==", "!=" d = dict(zip(ops, (operator.eq, operator.ne))) for op, f in d.items(): - q = "a %s inf" % op + q = "a {op} inf".format(op=op) expected = df[f(df.a, np.inf)] result = df.query(q, engine=self.engine, parser=self.parser) assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index c33b758d2d62c3..48f42b5f101cef 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -285,7 +285,7 @@ def test_info_shows_column_dtypes(self): df.info(buf=buf) res = buf.getvalue() for i, dtype in enumerate(dtypes): - name = "%d %d non-null %s" % (i, n, dtype) + name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype) assert name in res def test_info_max_cols(self): diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 1ca8333154c130..b8708e6ca1871b 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -223,7 +223,7 @@ def test_frame_append_datetime64_col_other_units(self): ns_dtype = np.dtype("M8[ns]") for unit in units: - dtype = np.dtype("M8[%s]" % unit) + dtype = np.dtype("M8[{unit}]".format(unit=unit)) vals = np.arange(n, dtype=np.int64).view(dtype) df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) @@ -239,7 +239,7 @@ def test_frame_append_datetime64_col_other_units(self): df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) for unit in units: - dtype = np.dtype("M8[%s]" % unit) + dtype = np.dtype("M8[{unit}]".format(unit=unit)) vals = np.arange(n, dtype=np.int64).view(dtype) tmp = df.copy() diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 33f29c6f8acb55..28051d9b7f3b90 100644 --- a/pandas/tests/frame/test_to_csv.py +++ 
b/pandas/tests/frame/test_to_csv.py @@ -718,7 +718,7 @@ def test_to_csv_withcommas(self): def test_to_csv_mixed(self): def create_cols(name): - return ["%s%03d" % (name, i) for i in range(5)] + return ["{name}{i:03d}".format(name=name, i=i) for i in range(5)] df_float = DataFrame( np.random.randn(100, 5), dtype="float64", columns=create_cols("float") diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 7905575a4a1a8e..103ebf514b7021 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -473,7 +473,8 @@ def test_agg_timezone_round_trip(): assert result3 == ts dates = [ - pd.Timestamp("2016-01-0%d 12:00:00" % i, tz="US/Pacific") for i in range(1, 5) + pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific") + for i in range(1, 5) ] df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) grouped = df.groupby("A") diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 44a583bf661e89..76588549532b1e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -265,7 +265,7 @@ def desc3(group): result = group.describe() # names are different - result.index.name = "stat_%d" % len(group) + result.index.name = "stat_{:d}".format(len(group)) result = result[: len(group)] # weirdo diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 7c12b490f46d2f..b240876de92b1a 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -95,7 +95,7 @@ def _check(dtype): counts = np.zeros(len(out), dtype=np.int64) labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, "group_ohlc_%s" % dtype) + func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype)) func(out, counts, obj[:, None], labels) def _ohlc(group): diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 5a864b3ab8cb48..7e5180a5c7b2ba 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -197,8 +197,11 @@ def test_ngroup_respects_groupby_order(self): @pytest.mark.parametrize( "datetimelike", [ - [Timestamp("2016-05-%02d 20:09:25+00:00" % i) for i in range(1, 4)], - [Timestamp("2016-05-%02d 20:09:25" % i) for i in range(1, 4)], + [ + Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=i)) + for i in range(1, 4) + ], + [Timestamp("2016-05-{i:02d} 20:09:25".format(i=i)) for i in range(1, 4)], [Timedelta(x, unit="h") for x in range(1, 4)], [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], ], diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 1b3c4e65d252b3..7523b250ea2918 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -36,7 +36,7 @@ def test_str(self): # test the string repr idx = self.create_index() idx.name = "foo" - assert not "length=%s" % len(idx) in str(idx) + assert not "length={}".format(len(idx)) in str(idx) assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) @@ -44,7 +44,7 @@ def test_str(self): if idx.tz is not None: assert idx.tz in str(idx) if hasattr(idx, "freq"): - assert "freq='%s'" % idx.freqstr in str(idx) + assert "freq='{idx.freqstr}'".format(idx=idx) in str(idx) def test_view(self): i = self.create_index() diff --git a/pandas/tests/indexes/datetimes/test_datetime.py 
b/pandas/tests/indexes/datetimes/test_datetime.py index aeff489861f5dd..bb3fe7a136204f 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -90,7 +90,7 @@ def test_week_of_month_frequency(self): def test_hash_error(self): index = date_range("20010101", periods=10) with pytest.raises( - TypeError, match=("unhashable type: %r" % type(index).__name__) + TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) ): hash(index) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index dba75b6247a20e..472a404c2a8eff 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -253,7 +253,9 @@ def test_rangeindex_fallback_coercion_bug(): def test_hash_error(indices): index = indices - with pytest.raises(TypeError, match=("unhashable type: %r" % type(index).__name__)): + with pytest.raises( + TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + ): hash(indices) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index eab55b91b3e606..8c75fbbae7de3c 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -363,7 +363,7 @@ def test_constructor_year_and_quarter(self): year = pd.Series([2001, 2002, 2003]) quarter = year - 2000 idx = PeriodIndex(year=year, quarter=quarter) - strs = ["%dQ%d" % t for t in zip(quarter, year)] + strs = ["{t[0]:d}Q{t[1]:d}".format(t=t) for t in zip(quarter, year)] lops = list(map(Period, strs)) p = PeriodIndex(lops) tm.assert_index_equal(p, idx) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 1db2c5c3a8dac3..a9c0ecd1a30417 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -161,7 +161,7 @@ def test_dti_to_period(self): @pytest.mark.parametrize("month", MONTHS) def test_to_period_quarterly(self, month): # make sure we can make the round trip - freq = "Q-%s" % month + freq = "Q-{month}".format(month=month) rng = period_range("1989Q3", "1991Q3", freq=freq) stamps = rng.to_timestamp() result = stamps.to_period(freq) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index b9bdaf40f85890..0400b7810ecc9e 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -163,7 +163,7 @@ def test_dtype_str(self, indices): def test_hash_error(self, indices): index = indices with pytest.raises( - TypeError, match=("unhashable type: %r" % type(index).__name__) + TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) ): hash(indices) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 018ccfb2439dc1..e790a913fcac28 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -239,7 +239,7 @@ def test_pickle(self): def test_hash_error(self): index = timedelta_range("1 days", periods=10) with pytest.raises( - TypeError, match=("unhashable type: %r" % type(index).__name__) + TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) ): hash(index) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 9f1ab82ec904b2..9ceeb06b6fd861 100644 --- a/pandas/tests/indexing/common.py +++ 
b/pandas/tests/indexing/common.py @@ -16,7 +16,7 @@ def _mklbl(prefix, n): - return ["%s%s" % (prefix, i) for i in range(n)] + return ["{prefix}{i}".format(prefix=prefix, i=i) for i in range(n)] def _axify(obj, key, axis): @@ -105,7 +105,7 @@ def setup_method(self, method): d = dict() for t in self._typs: - d[t] = getattr(self, "%s_%s" % (o, t), None) + d[t] = getattr(self, "{o}_{t}".format(o=o, t=t), None) setattr(self, o, d) @@ -247,7 +247,7 @@ def _print(result, error=None): # if we are in fails, the ok, otherwise raise it if fails is not None: if isinstance(detail, fails): - result = "ok (%s)" % type(detail).__name__ + result = "ok ({0.__name__})".format(type(detail)) _print(result) return diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 60a6a509c0912e..85eab91af3c480 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -729,7 +729,9 @@ def test_iloc_mask(self): r = expected.get(key) if r != ans: raise AssertionError( - "[%s] does not match [%s], received [%s]" % (key, ans, r) + "[{key}] does not match [{ans}], received [{r}]".format( + key=key, ans=ans, r=r + ) ) def test_iloc_non_unique_indexing(self): diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index ee62c91ad9698c..45ccd8d1b8fb38 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -292,8 +292,8 @@ def test_ix_slicing_strings(self): def test_ix_setitem_out_of_bounds_axis_0(self): df = DataFrame( np.random.randn(2, 5), - index=["row%s" % i for i in range(2)], - columns=["col%s" % i for i in range(5)], + index=["row{i}".format(i=i) for i in range(2)], + columns=["col{i}".format(i=i) for i in range(5)], ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" @@ -303,8 +303,8 @@ def test_ix_setitem_out_of_bounds_axis_0(self): def test_ix_setitem_out_of_bounds_axis_1(self): df = DataFrame( np.random.randn(5, 2), - index=["row%s" % i for i in range(5)], - columns=["col%s" % i for i in range(2)], + index=["row{i}".format(i=i) for i in range(5)], + columns=["col{i}".format(i=i) for i in range(2)], ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2d4fb87d0c6bfb..655e484bc34d17 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -110,7 +110,9 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): elif typestr in ("complex", "c16", "c8"): values = 1.0j * (mat.astype(typestr) + num_offset) elif typestr in ("object", "string", "O"): - values = np.reshape(["A%d" % i for i in mat.ravel() + num_offset], shape) + values = np.reshape( + ["A{i:d}".format(i=i) for i in mat.ravel() + num_offset], shape + ) elif typestr in ("b", "bool"): values = np.ones(shape, dtype=np.bool_) elif typestr in ("datetime", "dt", "M8[ns]"): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cd8848828f6c4a..a39cface0e0157 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -537,7 +537,7 @@ def test_read_from_file_url(self, read_ext, datapath): # fails on some systems import platform - pytest.skip("failing on %s" % " ".join(platform.uname()).strip()) + pytest.skip("failing on {}".format(" ".join(platform.uname()).strip())) tm.assert_frame_equal(url_table, local_table) diff --git 
a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 7ee84077a53341..8862f85ae9ab4a 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -108,7 +108,7 @@ def custom_converter(css): for col1, col2 in zip(wb["frame"].columns, wb["styled"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = "%s%d" % (cell2.column, cell2.row) + ref = "{cell2.column}{cell2.row:d}".format(cell2=cell2) # XXX: this isn't as strong a test as ideal; we should # confirm that differences are exclusive if ref == "B2": @@ -156,7 +156,7 @@ def custom_converter(css): for col1, col2 in zip(wb["frame"].columns, wb["custom"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = "%s%d" % (cell2.column, cell2.row) + ref = "{cell2.column}{cell2.row:d}".format(cell2=cell2) if ref in ("B2", "C3", "D4", "B5", "C6", "D7", "B8", "B9"): assert not cell1.font.bold assert cell2.font.bold diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index f2fb54796f177c..61c163d2cdaacd 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -362,7 +362,7 @@ def color_negative_red(val): strings, black otherwise. """ color = "red" if val < 0 else "black" - return "color: %s" % color + return "color: {color}".format(color=color) dic = { ("a", "d"): [-1.12, 2.11], diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index a8a6a96f60d606..924b2a19e85046 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -610,7 +610,9 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): idx_names = tuple(n or "{}" for n in names) idx_names_row = ( - "%s & %s & & & & \\\\\n" % idx_names + "{idx_names[0]} & {idx_names[1]} & & & & \\\\\n".format( + idx_names=idx_names + ) if (0 in axes and any(names)) else "" ) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 3ccb29f07dc837..2d2938697bd800 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -352,7 +352,7 @@ def write_legacy_pickles(output_dir): pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) fh.close() - print("created pickle file: %s" % pth) + print("created pickle file: {pth}".format(pth=pth)) def write_legacy_msgpack(output_dir, compress): @@ -369,7 +369,7 @@ def write_legacy_msgpack(output_dir, compress): pth = "{0}.msgpack".format(platform_name()) to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(), compress=compress) - print("created msgpack file: %s" % pth) + print("created msgpack file: {pth}".format(pth=pth)) def write_legacy_file(): diff --git a/pandas/tests/io/msgpack/test_case.py b/pandas/tests/io/msgpack/test_case.py index 15b7090c11badc..a868da69d54592 100644 --- a/pandas/tests/io/msgpack/test_case.py +++ b/pandas/tests/io/msgpack/test_case.py @@ -5,7 +5,11 @@ def check(length, obj): v = packb(obj) - assert len(v) == length, "%r length should be %r but get %r" % (obj, length, len(v)) + assert ( + len(v) == length + ), "{obj!r} length should be {length!r} but get {got!r}".format( + obj=obj, length=length, got=len(v) + ) assert unpackb(v, use_list=0) == obj diff --git a/pandas/tests/io/msgpack/test_extension.py b/pandas/tests/io/msgpack/test_extension.py index 12f27459f5afe5..85ed43fa010795 100644 ---
a/pandas/tests/io/msgpack/test_extension.py +++ b/pandas/tests/io/msgpack/test_extension.py @@ -48,7 +48,7 @@ def default(obj): typecode = 123 # application specific typecode data = tobytes(obj) return ExtType(typecode, data) - raise TypeError("Unknown type object %r" % (obj,)) + raise TypeError("Unknown type object {obj!r}".format(obj=obj)) def ext_hook(code, data): print("ext_hook called", code, data) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index d469d3c2e51de3..b94d5cd497ccff 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -979,7 +979,7 @@ def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError parser = all_parsers - path = "%s.csv" % tm.rands(10) + path = "{}.csv".format(tm.rands(10)) msg = "does not exist" if parser.engine == "c" else r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: @@ -1078,7 +1078,7 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): 4,5,6""".replace( ",", sep ) - path = "__%s__.csv" % tm.rands(10) + path = "__{}__.csv".format(tm.rands(10)) kwargs = dict(sep=sep, skiprows=2) utf8 = "utf-8" @@ -1982,7 +1982,7 @@ def test_internal_eof_byte_to_file(all_parsers): parser = all_parsers data = b'c1,c2\r\n"test \x1a test", test\r\n' expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) - path = "__%s__.csv" % tm.rands(10) + path = "__{}__.csv".format(tm.rands(10)) with tm.ensure_clean(path) as path: with open(path, "wb") as f: diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 392628ee74ba2f..c94adf9da0bf3b 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -41,7 +41,9 @@ def test_multi_thread_string_io_read_csv(all_parsers): num_files = 100 bytes_to_df = [ - "\n".join(["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]).encode() + "\n".join( + ["{i:d},{i:d},{i:d}".format(i=i) for i in range(max_row_range)] + ).encode() for _ in range(num_files) ] files = [BytesIO(b) for b in bytes_to_df] diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 5d79f6e281ef12..36391e19a102ef 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1101,7 +1101,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers - s = StringIO(("%s,\n" % value) * 50000) + s = StringIO(("{value},\n".format(value=value)) * 50000) parser.read_csv( s, diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 72885315e06bcd..9ddaccc4d38b7b 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -260,7 +260,7 @@ def test_fwf_regression(): # Turns out "T060" is parsable as a datetime slice!
tz_list = [1, 10, 20, 30, 60, 80, 100] widths = [16] + [8] * len(tz_list) - names = ["SST"] + ["T%03d" % z for z in tz_list[1:]] + names = ["SST"] + ["T{z:03d}".format(z=z) for z in tz_list[1:]] data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 8bdf53c3caf61b..b23ddf5bd92923 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -95,10 +95,10 @@ def test_python_engine(self, python_engine): 1,2,3,4,""" for default in py_unsupported: - msg = "The %r option is not supported with the %r engine" % ( - default, - python_engine, - ) + msg = ( + "The {default!r} option is not supported with the {python_engine!r}" + " engine" + ).format(default=default, python_engine=python_engine) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index fb87749ea62e07..946334b5df05e0 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -168,7 +168,7 @@ def teardown_class(cls): tm.set_testing_mode() def setup_method(self, method): - self.path = "tmp.__%s__.h5" % tm.rands(10) + self.path = "tmp.__{}__.h5".format(tm.rands(10)) def teardown_method(self, method): pass @@ -736,7 +736,7 @@ def test_getattr(self): # not stores for x in ["mode", "path", "handle", "complib"]: - getattr(store, "_%s" % x) + getattr(store, "_{x}".format(x=x)) def test_put(self): @@ -773,7 +773,9 @@ def test_put_string_index(self): with ensure_clean_store(self.path) as store: - index = Index(["I am a very long string index: %s" % i for i in range(20)]) + index = Index( + ["I am a very long string index: {i}".format(i=i) for i in range(20)] + ) s = Series(np.arange(20), index=index) df = DataFrame({"A": s, "B": s}) @@ -786,7 +788,7 @@ def test_put_string_index(self): # mixed length index = Index( ["abcdefghijklmnopqrstuvwxyz1234567890"] - + ["I am a very long string index: %s" % i for i in range(20)] + + ["I am a very long string index: {i}".format(i=i) for i in range(20)] ) s = Series(np.arange(21), index=index) df = DataFrame({"A": s, "B": s}) @@ -2109,7 +2111,7 @@ def test_unimplemented_dtypes_table_columns(self): df = tm.makeDataFrame() df[n] = f with pytest.raises(TypeError): - store.append("df1_%s" % n, df) + store.append("df1_{n}".format(n=n), df) # frame df = tm.makeDataFrame() @@ -2802,14 +2804,14 @@ def test_select_dtypes(self): expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa for v in [True, "true", 1]: result = store.select( - "df", "boolv == %s" % str(v), columns=["A", "boolv"] + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] ) tm.assert_frame_equal(expected, result) expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa for v in [False, "false", 0]: result = store.select( - "df", "boolv == %s" % str(v), columns=["A", "boolv"] + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] ) tm.assert_frame_equal(expected, result) @@ -2896,7 +2898,7 @@ def test_select_with_many_inputs(self): users=["a"] * 50 + ["b"] * 50 + ["c"] * 100 - + ["a%03d" % i for i in range(100)], + + ["a{i:03d}".format(i=i) for i in range(100)], ) ) _maybe_remove(store, "df") @@ -2917,7 +2919,7 @@ def test_select_with_many_inputs(self): tm.assert_frame_equal(expected, result) # big selector along 
the columns - selector = ["a", "b", "c"] + ["a%03d" % i for i in range(60)] + selector = ["a", "b", "c"] + ["a{i:03d}".format(i=i) for i in range(60)] result = store.select( "df", "ts>=Timestamp('2012-02-01') and users=selector" ) @@ -2990,7 +2992,7 @@ def test_select_iterator(self): df1 = tm.makeTimeDataFrame(500) store.append("df1", df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) df2["foo"] = "bar" store.append("df2", df2) @@ -3029,19 +3031,21 @@ def test_select_iterator_complete_8014(self): # select w/o iterator and where clause, single term, begin # of range, works - where = "index >= '%s'" % beg_dt + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, end # of range, works - where = "index <= '%s'" % end_dt + where = "index <= '{end_dt}'".format(end_dt=end_dt) result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works - where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) result = store.select("df", where=where) tm.assert_frame_equal(expected, result) @@ -3061,19 +3065,21 @@ def test_select_iterator_complete_8014(self): tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range - where = "index >= '%s'" % beg_dt + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range - where = "index <= '%s'" % end_dt + where = "index <= '{end_dt}'".format(end_dt=end_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range - where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) @@ -3095,21 +3101,23 @@ def test_select_iterator_non_complete_8014(self): end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range - where = "index >= '%s'" % beg_dt + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range - where = "index <= '%s'" % end_dt + where = "index <= '{end_dt}'".format(end_dt=end_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range - where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = 
concat(results) rexpected = expected[ @@ -3127,7 +3135,7 @@ def test_select_iterator_non_complete_8014(self): end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range - where = "index > '%s'" % end_dt + where = "index > '{end_dt}'".format(end_dt=end_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert 0 == len(results) @@ -3149,14 +3157,14 @@ def test_select_iterator_many_empty_frames(self): end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range - where = "index >= '%s'" % beg_dt + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range - where = "index <= '%s'" % end_dt + where = "index <= '{end_dt}'".format(end_dt=end_dt) results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert len(results) == 1 @@ -3165,7 +3173,9 @@ def test_select_iterator_many_empty_frames(self): tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range - where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be 1, is 10 @@ -3183,7 +3193,9 @@ def test_select_iterator_many_empty_frames(self): # return [] e.g. `for e in []: print True` never prints # True. - where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt) + where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be [] @@ -3608,7 +3620,7 @@ def test_coordinates(self): _maybe_remove(store, "df1") _maybe_remove(store, "df2") df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) store.append("df1", df1, data_columns=["A", "B"]) store.append("df2", df2) @@ -3680,7 +3692,7 @@ def test_coordinates(self): def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df2["foo"] = "bar" df = concat([df1, df2], axis=1) @@ -3710,7 +3722,7 @@ def test_append_to_multiple(self): def test_append_to_multiple_dropna(self): df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) @@ -3730,7 +3742,7 @@ def test_append_to_multiple_dropna(self): ) def test_append_to_multiple_dropna_false(self): df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) @@ -3749,7 +3761,7 @@ def test_append_to_multiple_dropna_false(self): def test_select_as_multiple(self): df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) df2["foo"] = "bar" with 
ensure_clean_store(self.path) as store: @@ -3920,8 +3932,8 @@ def test_start_stop_fixed(self): def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) - df.index = ["%.3d" % c for c in df.index] - df.columns = ["%.3d" % c for c in df.columns] + df.index = ["{c:3d}".format(c=c) for c in df.index] + df.columns = ["{c:3d}".format(c=c) for c in df.columns] with ensure_clean_store(self.path) as store: store.put("frame", df, format="table") @@ -4355,7 +4367,7 @@ def test_append_with_diff_col_name_types_raises_value_error(self): df5 = DataFrame({("1", 2, object): np.random.randn(10)}) with ensure_clean_store(self.path) as store: - name = "df_%s" % tm.rands(10) + name = "df_{}".format(tm.rands(10)) store.append(name, df) for d in (df2, df3, df4, df5): @@ -4775,16 +4787,16 @@ def test_query_long_float_literal(self): store.append("test", df, format="table", data_columns=True) cutoff = 1000000000.0006 - result = store.select("test", "A < %.4f" % cutoff) + result = store.select("test", "A < {cutoff:.4f}".format(cutoff=cutoff)) assert result.empty cutoff = 1000000000.0010 - result = store.select("test", "A > %.4f" % cutoff) + result = store.select("test", "A > {cutoff:.4f}".format(cutoff=cutoff)) expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = store.select("test", "A == %.4f" % exact) + result = store.select("test", "A == {exact:.4f}".format(exact=exact)) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) @@ -5084,7 +5096,9 @@ def _compare_with_tz(self, a, b): a_e = a.loc[i, c] b_e = b.loc[i, c] if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError("invalid tz comparison [%s] [%s]" % (a_e, b_e)) + raise AssertionError( + "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e) + ) def test_append_with_timezones_dateutil(self): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9752b4c62aff7d..6d06113dfc9eca 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -902,8 +902,8 @@ def test_computer_sales_page(self, datapath): def test_wikipedia_states_table(self, datapath): data = datapath("io", "data", "wikipedia_states.html") - assert os.path.isfile(data), "%r is not a file" % data - assert os.path.getsize(data), "%r is an empty file" % data + assert os.path.isfile(data), "{data!r} is not a file".format(data=data) + assert os.path.getsize(data), "{data!r} is an empty file".format(data=data) result = self.read_html(data, "Arizona", header=1)[0] assert result["sq mi"].dtype == np.dtype("float64") diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index fb1f657905be71..33a11087f622d0 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -101,7 +101,7 @@ def check_arbitrary(a, b): @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestPackers: def setup_method(self, method): - self.path = "__%s__.msg" % tm.rands(10) + self.path = "__{}__.msg".format(tm.rands(10)) def teardown_method(self, method): pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 7aba2a3677f84c..076d0c9f947c76 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -48,7 +48,7 @@ def compare_element(result, expected, typ, version=None): return if typ.startswith("sp_"): - comparator = getattr(tm, "assert_%s_equal" % typ) + comparator = getattr(tm, "assert_{typ}_equal".format(typ=typ)) comparator(result, expected, exact_indices=False) elif 
typ == "timestamp": if expected is pd.NaT: @@ -57,7 +57,9 @@ def compare_element(result, expected, typ, version=None): assert result == expected assert result.freq == expected.freq else: - comparator = getattr(tm, "assert_%s_equal" % typ, tm.assert_almost_equal) + comparator = getattr( + tm, "assert_{typ}_equal".format(typ=typ), tm.assert_almost_equal + ) comparator(result, expected) @@ -242,7 +244,7 @@ def test_pickle_path_localpath(): @pytest.fixture def get_random_path(): - return "__%s__.pickle" % tm.rands(10) + return "__{}__.pickle".format(tm.rands(10)) class TestCompression: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 347e280234f91e..d8465a427eaea5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -217,7 +217,9 @@ def teardown_method(self, method): class MySQLMixIn(MixInBase): def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute("DROP TABLE IF EXISTS %s" % sql._get_valid_mysql_name(table_name)) + cur.execute( + "DROP TABLE IF EXISTS {}".format(sql._get_valid_mysql_name(table_name)) + ) self.conn.commit() def _get_all_tables(self): @@ -237,7 +239,7 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): def drop_table(self, table_name): self.conn.execute( - "DROP TABLE IF EXISTS %s" % sql._get_valid_sqlite_name(table_name) + "DROP TABLE IF EXISTS {}".format(sql._get_valid_sqlite_name(table_name)) ) self.conn.commit() @@ -405,7 +407,11 @@ def _load_raw_sql(self): def _count_rows(self, table_name): result = ( self._get_exec() - .execute("SELECT count(*) AS count_1 FROM %s" % table_name) + .execute( + "SELECT count(*) AS count_1 FROM {table_name}".format( + table_name=table_name + ) + ) .fetchone() ) return result[0] @@ -1201,7 +1207,7 @@ def _get_sqlite_column_type(self, schema, column): for col in schema.split("\n"): if col.split()[0].strip('""') == column: return col.split()[1] - raise ValueError("Column %s not found" % (column)) + raise ValueError("Column {column} not found".format(column=column)) def test_sqlite_type_mapping(self): @@ -2193,12 +2199,14 @@ def test_datetime_time(self): def _get_index_columns(self, tbl_name): ixs = sql.read_sql_query( "SELECT * FROM sqlite_master WHERE type = 'index' " - + "AND tbl_name = '%s'" % tbl_name, + + "AND tbl_name = '{tbl_name}'".format(tbl_name=tbl_name), self.conn, ) ix_cols = [] for ix_name in ixs.name: - ix_info = sql.read_sql_query("PRAGMA index_info(%s)" % ix_name, self.conn) + ix_info = sql.read_sql_query( + "PRAGMA index_info({ix_name})".format(ix_name=ix_name), self.conn + ) ix_cols.append(ix_info.name.tolist()) return ix_cols @@ -2211,11 +2219,15 @@ def test_transactions(self): self._transaction_test() def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute("PRAGMA table_info(%s)" % table) + recs = self.conn.execute("PRAGMA table_info({table})".format(table=table)) for cid, name, ctype, not_null, default, pk in recs: if name == column: return ctype - raise ValueError("Table %s, column %s not found" % (table, column)) + raise ValueError( + "Table {table}, column {column} not found".format( + table=table, column=column + ) + ) def test_dtype(self): if self.flavor == "mysql": @@ -2285,7 +2297,7 @@ def test_illegal_names(self): sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = "test_weird_col_name%d" % ndx + c_tbl = "test_weird_col_name{ndx:d}".format(ndx=ndx) df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) @@ -2300,15 +2312,15 @@ def date_format(dt): 
_formatters = { - datetime: lambda dt: "'%s'" % date_format(dt), - str: lambda x: "'%s'" % x, - np.str_: lambda x: "'%s'" % x, - bytes: lambda x: "'%s'" % x, - float: lambda x: "%.8f" % x, - int: lambda x: "%s" % x, + datetime: "'{}'".format, + str: "'{}'".format, + np.str_: "'{}'".format, + bytes: "'{}'".format, + float: "{:.8f}".format, + int: "{:d}".format, type(None): lambda x: "NULL", - np.float64: lambda x: "%.10f" % x, - bool: lambda x: "'%s'" % x, + np.float64: "{:.10f}".format, + bool: "'{!s}'".format, } @@ -2490,7 +2502,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM %s" % table_name + sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) def clean_up(test_table_to_drop): """ @@ -2778,7 +2790,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM %s" % table_name + sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) def clean_up(test_table_to_drop): """ diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 4c5b1e66d00751..8b4a78e9195b5b 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -856,10 +856,10 @@ def test_time_series_plot_color_with_empty_kwargs(self): def test_xticklabels(self): # GH11529 - s = Series(np.arange(10), index=["P%02d" % i for i in range(10)]) + s = Series(np.arange(10), index=["P{i:02d}".format(i=i) for i in range(10)]) _, ax = self.plt.subplots() ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) - exp = ["P%02d" % i for i in [0, 3, 5, 9]] + exp = ["P{i:02d}".format(i=i) for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) def test_custom_business_day_freq(self): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 2ced955652c213..30febe3d2cc830 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -101,7 +101,9 @@ def test_selection(self, index, freq, kind, kwargs): def test_annual_upsample_cases( self, targ, conv, meth, month, simple_period_range_series ): - ts = simple_period_range_series("1/1/1990", "12/31/1991", freq="A-%s" % month) + ts = simple_period_range_series( + "1/1/1990", "12/31/1991", freq="A-{month}".format(month=month) + ) result = getattr(ts.resample(targ, convention=conv), meth)() expected = result.to_timestamp(targ, how=conv) @@ -370,14 +372,16 @@ def test_resample_to_timestamps(self, simple_period_range_series): def test_resample_to_quarterly(self, simple_period_range_series): for month in MONTHS: - ts = simple_period_range_series("1990", "1992", freq="A-%s" % month) - quar_ts = ts.resample("Q-%s" % month).ffill() + ts = simple_period_range_series( + "1990", "1992", freq="A-{month}".format(month=month) + ) + quar_ts = ts.resample("Q-{month}".format(month=month)).ffill() stamps = ts.to_timestamp("D", how="start") qdates = period_range( ts.index[0].asfreq("D", "start"), ts.index[-1].asfreq("D", "end"), - freq="Q-%s" % month, + freq="Q-{month}".format(month=month), ) expected = stamps.reindex(qdates.to_timestamp("D", "s"), method="ffill") diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 
16cfe3a469b340..305d7b97816341 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -790,7 +790,9 @@ def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix except KeyError: if how in ("left", "inner"): raise AssertionError( - "key %s should not have been in the join" % str(group_key) + "key {group_key!s} should not have been in the join".format( + group_key=group_key + ) ) _assert_all_na(l_joined, left.columns, join_col) @@ -802,7 +804,9 @@ def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix except KeyError: if how in ("right", "inner"): raise AssertionError( - "key %s should not have been in the join" % str(group_key) + "key {group_key!s} should not have been in the join".format( + group_key=group_key + ) ) _assert_all_na(r_joined, right.columns, join_col) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 1b067c08d2e40c..5b1f151daf2192 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -339,8 +339,8 @@ def test_pairs(self): df = DataFrame(data) spec = { - "visitdt": ["visitdt%d" % i for i in range(1, 4)], - "wt": ["wt%d" % i for i in range(1, 4)], + "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 4)], + "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)], } result = lreshape(df, spec) @@ -529,8 +529,8 @@ def test_pairs(self): tm.assert_frame_equal(result, exp) spec = { - "visitdt": ["visitdt%d" % i for i in range(1, 3)], - "wt": ["wt%d" % i for i in range(1, 4)], + "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 3)], + "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)], } msg = "All column lists must be same length" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 1c9e3e57bc310d..149930059d868d 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -166,7 +166,8 @@ def test_unicode(self, sparse): s = [e, eacute, eacute] res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], "letter_%s" % eacute: [0, 1, 1]}, dtype=np.uint8 + {"letter_e": [1, 0, 0], "letter_{eacute}".format(eacute=eacute): [0, 1, 1]}, + dtype=np.uint8, ) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 771a67dfceaa8d..4404b93e86218d 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -307,7 +307,7 @@ def test_multiples(self): @pytest.mark.parametrize("month", MONTHS) def test_period_cons_quarterly(self, month): # bugs in scikits.timeseries - freq = "Q-%s" % month + freq = "Q-{month}".format(month=month) exp = Period("1989Q3", freq=freq) assert "1989Q3" in str(exp) stamp = exp.to_timestamp("D", how="end") @@ -321,7 +321,7 @@ def test_period_cons_quarterly(self, month): @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = "A-%s" % month + freq = "A-{month}".format(month=month) exp = Period("1989", freq=freq) stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) @@ -332,8 +332,8 @@ def test_period_cons_annual(self, month): @pytest.mark.parametrize("day", DAYS) @pytest.mark.parametrize("num", range(10, 17)) def test_period_cons_weekly(self, num, day): - daystr = "2011-02-%d" % num - freq = "W-%s" 
% day + daystr = "2011-02-{num}".format(num=num) + freq = "W-{day}".format(day=day) result = Period(daystr, freq=freq) expected = Period(daystr, freq="D").asfreq(freq) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 7b0ff83aee5d43..401fc285424fe3 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -576,7 +576,7 @@ def test_bounds_with_different_units(self): for date_string in out_of_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) + dt64 = np.datetime64(date_string, dtype="M8[{unit}]".format(unit=unit)) with pytest.raises(ValueError): Timestamp(dt64) @@ -584,7 +584,7 @@ def test_bounds_with_different_units(self): for date_string in in_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) + dt64 = np.datetime64(date_string, dtype="M8[{unit}]".format(unit=unit)) Timestamp(dt64) def test_min_valid(self): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 67373686d67284..32d32a5d14fb24 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -99,7 +99,7 @@ def test_argsort(self, datetime_series): assert issubclass(argsorted.dtype.type, np.integer) # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp("201301%02d" % (i + 1)) for i in range(5)]) + s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) assert s.dtype == "datetime64[ns]" shifted = s.shift(-1) assert shifted.dtype == "datetime64[ns]" diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 2097264ba5e785..2870677e42d50e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -155,7 +155,9 @@ def test_constructor_subclass_dict(self): def test_constructor_ordereddict(self): # GH3283 - data = OrderedDict(("col%s" % i, np.random.random()) for i in range(12)) + data = OrderedDict( + ("col{i}".format(i=i), np.random.random()) for i in range(12) + ) series = self.series_klass(data) expected = self.series_klass(list(data.values()), list(data.keys())) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 6527d41eac841a..55a37da6b663ff 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -967,7 +967,7 @@ def test_rename(self, float_frame): ) tm.assert_sp_frame_equal(result, expected) - result = float_frame.rename(columns=lambda x: "%s%d" % (x, 1)) + result = float_frame.rename(columns="{}1".format) data = { "A1": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], "B1": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index eb217283c7a83a..ad4c898b004ac4 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -619,7 +619,9 @@ def _check_inplace_op(iop, op): inplace_ops = ["add", "sub", "mul", "truediv", "floordiv", "pow"] for op in inplace_ops: - _check_inplace_op(getattr(operator, "i%s" % op), getattr(operator, op)) + _check_inplace_op( + getattr(operator, "i{op}".format(op=op)), getattr(operator, op) + ) @pytest.mark.parametrize( "values, op, fill_value", diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index a7281e002cc5ca..4070624985068f 100644 --- a/pandas/tests/test_expressions.py +++ 
b/pandas/tests/test_expressions.py @@ -81,7 +81,7 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=Tr assert expected.dtype.kind == "f" assert_func(expected, result) except Exception: - pprint_thing("Failed test with operator %r" % op.__name__) + pprint_thing("Failed test with operator {op.__name__!r}".format(op=op)) raise def test_integer_arithmetic(self): @@ -129,8 +129,8 @@ def run_binary( assert not used_numexpr, "Used numexpr unexpectedly." assert_func(expected, result) except Exception: - pprint_thing("Failed test with operation %r" % arith) - pprint_thing("test_flex was %r" % test_flex) + pprint_thing("Failed test with operation {arith!r}".format(arith=arith)) + pprint_thing("test_flex was {test_flex!r}".format(test_flex=test_flex)) raise def run_frame(self, df, other, binary_comp=None, run_binary=True, **kwargs): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f6e936630f6be8..21ab28c94c978a 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -179,9 +179,9 @@ def check_fun_data( self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: exc.args += ( - "axis: %s of %s" % (axis, testarval.ndim - 1), - "skipna: %s" % skipna, - "kwargs: %s" % kwargs, + "axis: {axis} of {of}".format(axis=axis, of=testarval.ndim - 1), + "skipna: {skipna}".format(skipna=skipna), + "kwargs: {kwargs}".format(kwargs=kwargs), ) raise @@ -234,9 +234,9 @@ def check_fun( ) except BaseException as exc: exc.args += ( - "testar: %s" % testar, - "targar: %s" % targar, - "targarnan: %s" % targarnan, + "testar: {testar}".format(testar=testar), + "targar: {targar}".format(targar=targar), + "targarnan: {targarnan}".format(targarnan=targarnan), ) raise @@ -712,7 +712,7 @@ def check_nancomp(self, checkfun, targ0): res2 = checkfun(arr_float_nan, arr_nan_float1) tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) except Exception as exc: - exc.args += ("ndim: %s" % arr_float.ndim,) + exc.args += ("ndim: {arr_float.ndim}".format(arr_float=arr_float),) raise try: @@ -760,7 +760,7 @@ def check_bool(self, func, value, correct, *args, **kwargs): else: assert not res0 except BaseException as exc: - exc.args += ("dim: %s" % getattr(value, "ndim", value),) + exc.args += ("dim: {}".format(getattr(value, "ndim", value)),) raise if not hasattr(value, "ndim"): break diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2df5460a059532..fca88ff3ce8ceb 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2039,7 +2039,7 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_series_equal(result, expected) # shifter index - s = ["x%d" % x for x in range(12)] + s = ["x{x:d}".format(x=x) for x in range(12)] if has_min_periods: minp = 10 diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 4c8f6253cdf7ba..50844aabb2c889 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -178,7 +178,7 @@ def test_infer_freq_delta(base_delta_code_pair, count): inc = base_delta * count index = DatetimeIndex([b + inc * j for j in range(3)]) - exp_freq = "%d%s" % (count, code) if count > 1 else code + exp_freq = "{count:d}{code}".format(count=count, code=code) if count > 1 else code assert frequencies.infer_freq(index) == exp_freq diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index 
079fcc36ff3eed..fbf4454109ec09 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -13,14 +13,18 @@ def assert_offset_equal(offset, base, expected): assert actual_apply == expected except AssertionError: raise AssertionError( - "\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % (expected, actual, offset, base) + "\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + "\nAt Date: {base}".format( + expected=expected, actual=actual, offset=offset, base=base + ) ) def assert_onOffset(offset, date, expected): actual = offset.onOffset(date) assert actual == expected, ( - "\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % (expected, actual, offset, date) + "\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + "\nAt Date: {date}".format( + expected=expected, actual=actual, offset=offset, date=date + ) ) diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index c24d917a5e454f..8b1aaafb94e0b7 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -79,10 +79,10 @@ def test_get_offset(): for name, expected in pairs: offset = get_offset(name) - assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( - name, - expected, - offset, + assert ( + offset == expected + ), "Expected {name!r} to yield {expected!r} (actual: {offset!r})".format( + name=name, expected=expected, offset=offset ) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 2654d83ee0c525..1abc8aece5ec93 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -3969,10 +3969,10 @@ def test_get_offset(): for name, expected in pairs: offset = get_offset(name) - assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( - name, - expected, - offset, + assert ( + offset == expected + ), "Expected {name!r} to yield {expected!r} (actual: {offset!r})".format( + name=name, expected=expected, offset=offset ) From 823af7639b94a39609c06a2228ba9e0f31155724 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 10 Jul 2019 09:20:04 -0700 Subject: [PATCH 198/238] REF: check can_hold_element instead of try/except (#27298) --- pandas/core/internals/blocks.py | 50 ++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 36074e19240bd1..652a44609f2c51 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import lib, tslib, tslibs +from pandas._libs import NaT, lib, tslib, tslibs import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion, is_null_datetimelike from pandas.util._validators import validate_bool_kwarg @@ -405,33 +405,29 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): else: return self.copy() - # fillna, but if we cannot coerce, then try again as an ObjectBlock - try: - # Note: we only call try_coerce_args to let it raise - self._try_coerce_args(value) - except (TypeError, ValueError): - - # we can't process the value, but nothing to do - if not mask.any(): - return self if inplace else self.copy() - - # operate column-by-column - def f(m, v, i): - block = self.coerce_to_target_dtype(value) - - # slice out our block - if i is not None: - block = 
block.getitem_block(slice(i, i + 1)) - return block.fillna(value, limit=limit, inplace=inplace, downcast=None) - - return self.split_and_operate(mask, f, inplace) - else: + if self._can_hold_element(value): + # equivalent: self._try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) blocks = [ b.make_block(values=self._try_coerce_result(b.values)) for b in blocks ] return self._maybe_downcast(blocks, downcast) + # we can't process the value, but nothing to do + if not mask.any(): + return self if inplace else self.copy() + + # operate column-by-column + def f(m, v, i): + block = self.coerce_to_target_dtype(value) + + # slice out our block + if i is not None: + block = block.getitem_block(slice(i, i + 1)) + return block.fillna(value, limit=limit, inplace=inplace, downcast=None) + + return self.split_and_operate(mask, f, inplace) + def split_and_operate(self, mask, f, inplace): """ split the block per-column, and apply the callable f @@ -2275,7 +2271,13 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return tipo == _NS_DTYPE or tipo == np.int64 - return is_integer(element) or isinstance(element, datetime) or isna(element) + if isinstance(element, datetime): + return element.tzinfo is None + if is_integer(element): + return element == tslibs.iNaT + + # TODO: shouldnt we exclude timedelta64("NaT")? See GH#27297 + return isna(element) def _coerce_values(self, values): return values.view("i8") @@ -2627,6 +2629,8 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.timedelta64, np.int64)) + if element is NaT: + return True return is_integer(element) or isinstance( element, (timedelta, np.timedelta64, np.int64) ) From 298c7ccccfdab48d8e08f0de6cff07a48dbc40ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 10 Jul 2019 09:23:05 -0700 Subject: [PATCH 199/238] REF: separate indexer utilities from indexing.py (#27229) --- pandas/core/algorithms.py | 3 +- pandas/core/indexers.py | 225 ++++++++++++++++++++ pandas/core/indexes/base.py | 2 +- pandas/core/indexing.py | 276 ++++--------------------- pandas/core/internals/blocks.py | 42 +--- pandas/core/internals/managers.py | 2 +- pandas/core/series.py | 3 +- pandas/tests/indexing/test_indexing.py | 7 +- 8 files changed, 285 insertions(+), 275 deletions(-) create mode 100644 pandas/core/indexers.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 79f205de118789..2c38e071d3d442 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -50,6 +50,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com +from pandas.core.indexers import validate_indices _shared_docs = {} # type: Dict[str, str] @@ -1587,8 +1588,6 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): ... fill_value=-10) array([ 10, 10, -10]) """ - from pandas.core.indexing import validate_indices - if not is_array_like(arr): arr = np.asarray(arr) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py new file mode 100644 index 00000000000000..7b0030b91e4dc4 --- /dev/null +++ b/pandas/core/indexers.py @@ -0,0 +1,225 @@ +""" +Low-dependency indexing utilities. 
+""" +import numpy as np + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +# ----------------------------------------------------------- +# Indexer Identification + + +def is_list_like_indexer(key) -> bool: + """ + Check if we have a list-like indexer that is *not* a NamedTuple. + + Parameters + ---------- + key : object + + Returns + ------- + bool + """ + # allow a list_like, but exclude NamedTuples which can be indexers + return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) + + +def is_scalar_indexer(indexer, arr_value) -> bool: + # return True if we are all scalar indexers + + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + return False + + +def is_empty_indexer(indexer, arr_value) -> bool: + # return a boolean if we have an empty indexer + + if is_list_like(indexer) and not len(indexer): + return True + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + return False + + +# ----------------------------------------------------------- +# Indexer Validation + + +def check_setitem_lengths(indexer, value, values) -> None: + """ + Validate that value and indexer are the same length. + + An special-case is allowed for when the indexer is a boolean array + and the number of true values equals the length of ``value``. In + this case, no exception is raised. + + Parameters + ---------- + indexer : sequence + The key for the setitem + value : array-like + The value for the setitem + values : array-like + The values being set into + + Returns + ------- + None + + Raises + ------ + ValueError + When the indexer is an ndarray or list and the lengths don't + match. + """ + # boolean with truth values == len of the value is ok too + if isinstance(indexer, (np.ndarray, list)): + if is_list_like(value) and len(indexer) != len(value): + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) + + elif isinstance(indexer, slice): + # slice + if is_list_like(value) and len(values): + if len(value) != length_of_indexer(indexer, values): + raise ValueError( + "cannot set using a slice indexer with a " + "different length than the value" + ) + + +def validate_indices(indices: np.ndarray, n: int) -> None: + """ + Perform bounds-checking for an indexer. + + -1 is allowed for indicating missing values. 
+ + Parameters + ---------- + indices : ndarray + n : int + length of the array being indexed + + Raises + ------ + ValueError + + Examples + -------- + >>> validate_indices([1, 2], 3) + # OK + >>> validate_indices([1, -2], 3) + ValueError + >>> validate_indices([1, 2, 3], 3) + IndexError + >>> validate_indices([-1, -1], 0) + # OK + >>> validate_indices([0, 1], 0) + IndexError + """ + if len(indices): + min_idx = indices.min() + if min_idx < -1: + msg = "'indices' contains values less than allowed ({} < {})".format( + min_idx, -1 + ) + raise ValueError(msg) + + max_idx = indices.max() + if max_idx >= n: + raise IndexError("indices are out-of-bounds") + + +# ----------------------------------------------------------- +# Indexer Conversion + + +def maybe_convert_indices(indices, n: int): + """ + Attempt to convert indices into valid, positive indices. + + If we have negative indices, translate to positive here. + If we have indices that are out-of-bounds, raise an IndexError. + + Parameters + ---------- + indices : array-like + The array of indices that we are to convert. + n : int + The number of elements in the array that we are indexing. + + Returns + ------- + valid_indices : array-like + An array-like of positive indices that correspond to the ones + that were passed in initially to this function. + + Raises + ------ + IndexError : one of the converted indices either exceeded the number + of elements (specified by `n`) OR was still negative. + """ + + if isinstance(indices, list): + indices = np.array(indices) + if len(indices) == 0: + # If list is empty, np.array will return float and cause indexing + # errors. + return np.empty(0, dtype=np.intp) + + mask = indices < 0 + if mask.any(): + indices = indices.copy() + indices[mask] += n + + mask = (indices >= n) | (indices < 0) + if mask.any(): + raise IndexError("indices are out-of-bounds") + return indices + + +# ----------------------------------------------------------- +# Unsorted + + +def length_of_indexer(indexer, target=None) -> int: + """ + return the length of a single non-tuple indexer which could be a slice + """ + if target is not None and isinstance(indexer, slice): + target_len = len(target) + start = indexer.start + stop = indexer.stop + step = indexer.step + if start is None: + start = 0 + elif start < 0: + start += target_len + if stop is None or stop > target_len: + stop = target_len + elif stop < 0: + stop += target_len + if step is None: + step = 1 + elif step < 0: + step = -step + return (stop - start + step - 1) // step + elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)): + return len(indexer) + elif not is_list_like_indexer(indexer): + return 1 + raise AssertionError("cannot find the length of the indexer") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 96ce408a0ff8c6..abe20ee0a91ce6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -66,6 +66,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com +from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name, make_invalid_op @@ -3318,7 +3319,6 @@ def _convert_list_indexer(self, keyarr, kind=None): # values outside the range of indices so as to trigger an # IndexError in maybe_convert_indices indexer[indexer < 0] = len(self) - from pandas.core.indexing import 
maybe_convert_indices return maybe_convert_indices(indexer, len(self)) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 612a857897a0c5..1f25be8b9e31e8 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -25,6 +25,7 @@ import pandas.core.common as com from pandas.core.index import Index, InvalidIndexError, MultiIndex +from pandas.core.indexers import is_list_like_indexer, length_of_indexer # the supported indexers @@ -115,6 +116,8 @@ def __iter__(self): def __getitem__(self, key): if type(key) is tuple: + # Note: we check the type exactly instead of with isinstance + # because NamedTuple is checked separately. key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: values = self.obj._get_value(*key) @@ -558,27 +561,6 @@ def setter(item, v): # reset the sliced object if unique self.obj[item] = s - def can_do_equal_len(): - """ return True if we have an equal len settable """ - if ( - not len(labels) == 1 - or not np.iterable(value) - or is_scalar(plane_indexer[0]) - ): - return False - - item = labels[0] - index = self.obj[item].index - - values_len = len(value) - # equal len list/ndarray - if len(index) == values_len: - return True - elif lplane_indexer == values_len: - return True - - return False - # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: @@ -622,7 +604,9 @@ def can_do_equal_len(): setter(item, value[:, i].tolist()) # we have an equal len list/ndarray - elif can_do_equal_len(): + elif _can_do_equal_len( + labels, value, plane_indexer, lplane_indexer, self.obj + ): setter(labels[0], value) # per label values @@ -1126,7 +1110,7 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): ) return keyarr, indexer - def _getitem_iterable(self, key, axis=None): + def _getitem_iterable(self, key, axis: int): """ Index current object with an an iterable key (which can be a boolean indexer, or a collection of keys). 
@@ -1135,7 +1119,7 @@ def _getitem_iterable(self, key, axis=None): ---------- key : iterable Target labels, or boolean indexer - axis: int, default None + axis: int Dimension on which the indexing is being made Raises @@ -1151,10 +1135,7 @@ def _getitem_iterable(self, key, axis=None): ------- scalar, DataFrame, or Series: indexed value(s), """ - - if axis is None: - axis = self.axis or 0 - + # caller is responsible for ensuring non-None axis self._validate_key(key, axis) labels = self.obj._get_axis(axis) @@ -1282,7 +1263,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=Fal raise except TypeError: pass - except (ValueError): + except ValueError: if not is_int_positional: raise @@ -1332,20 +1313,15 @@ def _tuplify(self, loc): tup[0] = loc return tuple(tup) - def _get_slice_axis(self, slice_obj, axis=None): + def _get_slice_axis(self, slice_obj: slice, axis: int): + # caller is responsible for ensuring non-None axis obj = self.obj - if axis is None: - axis = self.axis or 0 - if not need_slice(slice_obj): return obj.copy(deep=False) - indexer = self._convert_slice_indexer(slice_obj, axis) - if isinstance(indexer, slice): - return self._slice(indexer, axis=axis, kind="iloc") - else: - return self.obj._take(indexer, axis=axis) + indexer = self._convert_slice_indexer(slice_obj, axis) + return self._slice(indexer, axis=axis, kind="iloc") class _IXIndexer(_NDFrameIndexer): @@ -1453,11 +1429,11 @@ class _LocationIndexer(_NDFrameIndexer): def __getitem__(self, key): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) - try: - if self._is_scalar_access(key): + if self._is_scalar_access(key): + try: return self._getitem_scalar(key) - except (KeyError, IndexError, AttributeError): - pass + except (KeyError, IndexError, AttributeError): + pass return self._getitem_tuple(key) else: # we by definition only have the 0th axis @@ -1475,9 +1451,8 @@ def _getitem_scalar(self, key): def _getitem_axis(self, key, axis=None): raise NotImplementedError() - def _getbool_axis(self, key, axis=None): - if axis is None: - axis = self.axis or 0 + def _getbool_axis(self, key, axis: int): + # caller is responsible for ensuring non-None axis labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -1486,11 +1461,9 @@ def _getbool_axis(self, key, axis=None): except Exception as detail: raise self._exception(detail) - def _get_slice_axis(self, slice_obj, axis=None): + def _get_slice_axis(self, slice_obj: slice, axis: int): """ this is pretty simple as we just have to deal with labels """ - if axis is None: - axis = self.axis or 0 - + # caller is responsible for ensuring non-None axis obj = self.obj if not need_slice(slice_obj): return obj.copy(deep=False) @@ -1503,6 +1476,8 @@ def _get_slice_axis(self, slice_obj, axis=None): if isinstance(indexer, slice): return self._slice(indexer, axis=axis, kind="iloc") else: + # DatetimeIndex overrides Index.slice_indexer and may + # return a DatetimeIndex instead of a slice object. 
return self.obj._take(indexer, axis=axis) @@ -2030,6 +2005,7 @@ class _iLocIndexer(_LocationIndexer): "point is EXCLUDED), listlike of integers, boolean array" ) _exception = IndexError + _get_slice_axis = _NDFrameIndexer._get_slice_axis def _validate_key(self, key, axis): if com.is_bool_indexer(key): @@ -2157,20 +2133,6 @@ def _getitem_tuple(self, tup): return retval - def _get_slice_axis(self, slice_obj, axis=None): - if axis is None: - axis = self.axis or 0 - obj = self.obj - - if not need_slice(slice_obj): - return obj.copy(deep=False) - - slice_obj = self._convert_slice_indexer(slice_obj, axis) - if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, kind="iloc") - else: - return self.obj._take(slice_obj, axis=axis) - def _get_list_axis(self, key, axis=None): """ Return Series values by list or array of integers @@ -2413,35 +2375,6 @@ def _convert_key(self, key, is_setter=False): return key -def length_of_indexer(indexer, target=None): - """ - return the length of a single non-tuple indexer which could be a slice - """ - if target is not None and isinstance(indexer, slice): - target_len = len(target) - start = indexer.start - stop = indexer.stop - step = indexer.step - if start is None: - start = 0 - elif start < 0: - start += target_len - if stop is None or stop > target_len: - stop = target_len - elif stop < 0: - stop += target_len - if step is None: - step = 1 - elif step < 0: - step = -step - return (stop - start + step - 1) // step - elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)): - return len(indexer) - elif not is_list_like_indexer(indexer): - return 1 - raise AssertionError("cannot find the length of the indexer") - - def convert_to_index_sliceable(obj, key): """ if we are index sliceable, then return my slicer, otherwise return None @@ -2520,56 +2453,6 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: return result -def check_setitem_lengths(indexer, value, values): - """ - Validate that value and indexer are the same length. - - An special-case is allowed for when the indexer is a boolean array - and the number of true values equals the length of ``value``. In - this case, no exception is raised. - - Parameters - ---------- - indexer : sequence - The key for the setitem - value : array-like - The value for the setitem - values : array-like - The values being set into - - Returns - ------- - None - - Raises - ------ - ValueError - When the indexer is an ndarray or list and the lengths don't - match. - """ - # boolean with truth values == len of the value is ok too - if isinstance(indexer, (np.ndarray, list)): - if is_list_like(value) and len(indexer) != len(value): - if not ( - isinstance(indexer, np.ndarray) - and indexer.dtype == np.bool_ - and len(indexer[indexer]) == len(value) - ): - raise ValueError( - "cannot set using a list-like indexer " - "with a different length than the value" - ) - # slice - elif isinstance(indexer, slice): - - if is_list_like(value) and len(values): - if len(value) != length_of_indexer(indexer, values): - raise ValueError( - "cannot set using a slice indexer with a " - "different length than the value" - ) - - def convert_missing_indexer(indexer): """ reverse convert a missing indexer, which is a dict @@ -2599,92 +2482,6 @@ def get_indexer(_i, _idx): return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)) -def maybe_convert_indices(indices, n): - """ - Attempt to convert indices into valid, positive indices. - - If we have negative indices, translate to positive here. 
- If we have indices that are out-of-bounds, raise an IndexError. - - Parameters - ---------- - indices : array-like - The array of indices that we are to convert. - n : int - The number of elements in the array that we are indexing. - - Returns - ------- - valid_indices : array-like - An array-like of positive indices that correspond to the ones - that were passed in initially to this function. - - Raises - ------ - IndexError : one of the converted indices either exceeded the number - of elements (specified by `n`) OR was still negative. - """ - - if isinstance(indices, list): - indices = np.array(indices) - if len(indices) == 0: - # If list is empty, np.array will return float and cause indexing - # errors. - return np.empty(0, dtype=np.intp) - - mask = indices < 0 - if mask.any(): - indices = indices.copy() - indices[mask] += n - - mask = (indices >= n) | (indices < 0) - if mask.any(): - raise IndexError("indices are out-of-bounds") - return indices - - -def validate_indices(indices, n): - """ - Perform bounds-checking for an indexer. - - -1 is allowed for indicating missing values. - - Parameters - ---------- - indices : ndarray - n : int - length of the array being indexed - - Raises - ------ - ValueError - - Examples - -------- - >>> validate_indices([1, 2], 3) - # OK - >>> validate_indices([1, -2], 3) - ValueError - >>> validate_indices([1, 2, 3], 3) - IndexError - >>> validate_indices([-1, -1], 0) - # OK - >>> validate_indices([0, 1], 0) - IndexError - """ - if len(indices): - min_idx = indices.min() - if min_idx < -1: - msg = "'indices' contains values less than allowed ({} < {})".format( - min_idx, -1 - ) - raise ValueError(msg) - - max_idx = indices.max() - if max_idx >= n: - raise IndexError("indices are out-of-bounds") - - def maybe_convert_ix(*args): """ We likely want to take the cross-product @@ -2714,11 +2511,6 @@ def is_nested_tuple(tup, labels): return False -def is_list_like_indexer(key): - # allow a list_like, but exclude NamedTuples which can be indexers - return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) - - def is_label_like(key): # select a label or row return not isinstance(key, slice) and not is_list_like_indexer(key) @@ -2794,3 +2586,21 @@ def _maybe_numeric_slice(df, slice_, include_bool=False): dtypes.append(bool) slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns] return slice_ + + +def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj): + """ return True if we have an equal len settable """ + if not len(labels) == 1 or not np.iterable(value) or is_scalar(plane_indexer[0]): + return False + + item = labels[0] + index = obj[item].index + + values_len = len(value) + # equal len list/ndarray + if len(index) == values_len: + return True + elif lplane_indexer == values_len: + return True + + return False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 652a44609f2c51..1e84437f5c2fc0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -72,7 +72,11 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.indexing import check_setitem_lengths +from pandas.core.indexers import ( + check_setitem_lengths, + is_empty_indexer, + is_scalar_indexer, +) from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing from pandas.core.nanops import nanpercentile @@ -901,39 +905,13 @@ def setitem(self, indexer, value): # length checking check_setitem_lengths(indexer, value, 
values) - def _is_scalar_indexer(indexer): - # return True if we are all scalar indexers - - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any( - isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer - ) - return False - - def _is_empty_indexer(indexer): - # return a boolean if we have an empty indexer - - if is_list_like(indexer) and not len(indexer): - return True - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any( - isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer - ) - return False - - # empty indexers - # 8669 (empty) - if _is_empty_indexer(indexer): + if is_empty_indexer(indexer, arr_value): + # GH#8669 empty indexers pass - # setting a single element for each dim and with a rhs that could - # be say a list - # GH 6043 - elif _is_scalar_indexer(indexer): + elif is_scalar_indexer(indexer, arr_value): + # setting a single element for each dim and with a rhs that could + # be e.g. a list; see GH#6043 values[indexer] = value # if we are an exact match (ex-broadcasting), diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cd678a235cfc12..44cc61d163b4d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -34,7 +34,7 @@ import pandas.core.algorithms as algos from pandas.core.base import PandasObject from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.indexing import maybe_convert_indices +from pandas.core.indexers import maybe_convert_indices from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/series.py b/pandas/core/series.py index b3a7f38aef8ef5..1943b66818b95b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -66,12 +66,13 @@ MultiIndex, ensure_index, ) +from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.indexing import check_bool_indexer, maybe_convert_indices +from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager from pandas.core.internals.construction import sanitize_array from pandas.core.strings import StringMethods diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e06047b52ac15f..ba144909724cfb 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -12,11 +12,8 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series from pandas.core.generic import NDFrame -from pandas.core.indexing import ( - _maybe_numeric_slice, - _non_reducing_slice, - validate_indices, -) +from pandas.core.indexers import validate_indices +from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice from pandas.tests.indexing.common import Base, _mklbl import pandas.util.testing as tm From 134bec4d2aca2d42b76ee40d12084384526e5aaf Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 10 Jul 2019 10:39:21 -0600 Subject: [PATCH 200/238] BUG: Preserve CategoricalDtype._ordered_from_sentinel with pickle (#27317) --- pandas/core/dtypes/dtypes.py | 3 ++- pandas/tests/dtypes/test_dtypes.py | 13 +++++++++++++ pandas/tests/series/test_io.py | 9 +++++++++ 3 files changed, 24 
insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1cf452b4a6c2c6..7721c90c9b4b42 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -219,7 +219,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): kind = "O" # type: str_type str = "|O08" base = np.dtype("O") - _metadata = ("categories", "ordered") + _metadata = ("categories", "ordered", "_ordered_from_sentinel") _cache = {} # type: Dict[str_type, PandasExtensionDtype] def __init__(self, categories=None, ordered: OrderedType = ordered_sentinel): @@ -356,6 +356,7 @@ def __setstate__(self, state: Dict[str_type, Any]) -> None: # pickle -> need to set the settable private ones here (see GH26067) self._categories = state.pop("categories", None) self._ordered = state.pop("ordered", False) + self._ordered_from_sentinel = state.pop("_ordered_from_sentinel", False) def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a81c57537408ce..d3f0d7c43ee6ba 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -903,6 +903,19 @@ def test_ordered_none_default_deprecated(self, ordered): with tm.assert_produces_warning(warning): dtype.ordered + @pytest.mark.parametrize("ordered", [True, False, None, ordered_sentinel]) + def test_pickle_ordered_from_sentinel(self, ordered): + # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) + dtype = CategoricalDtype(categories=list("abc"), ordered=ordered) + + warning = FutureWarning if ordered is ordered_sentinel else None + with tm.assert_produces_warning(warning, check_stacklevel=False): + dtype_from_pickle = tm.round_trip_pickle(dtype) + + result = dtype_from_pickle._ordered_from_sentinel + expected = ordered is ordered_sentinel + assert result is expected + @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0238314122462c..5389390501b32f 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -226,6 +226,15 @@ def test_pickle_preserve_name(self): unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) assert unpickled.name == n + def test_pickle_categorical_ordered_from_sentinel(self): + # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) + s = Series(["a", "b", "c", "a"], dtype="category") + result = tm.round_trip_pickle(s) + result = result.astype("category") + + tm.assert_series_equal(result, s) + assert result.dtype._ordered_from_sentinel is False + def _pickle_roundtrip_name(self, obj): with ensure_clean() as path: From c0c1c9a2f9ace9d9a681d47a0b9761dbd8747355 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 10 Jul 2019 09:55:54 -0700 Subject: [PATCH 201/238] check early for non-scalar default_fill_value (#27302) --- pandas/core/sparse/frame.py | 5 ++++- pandas/tests/sparse/frame/test_frame.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f195e4b5f4e373..60060a4a2d1fae 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -6,6 +6,7 @@ import numpy as np +from pandas._libs.lib import is_scalar, item_from_zerodim from pandas._libs.sparse import BlockIndex, get_blocks from pandas.compat.numpy import function as nv from 
pandas.util._decorators import Appender @@ -74,6 +75,8 @@ def __init__( dtype=None, copy=False, ): + if not is_scalar(default_fill_value): + raise ValueError("'default_fill_value' must be a scalar") warnings.warn(depr_msg, FutureWarning, stacklevel=2) # pick up the defaults from the Sparse structures @@ -666,7 +669,7 @@ def _get_op_result_fill_value(self, other, func): fill_value = np.nan else: fill_value = func(np.float64(own_default), np.float64(other.fill_value)) - + fill_value = item_from_zerodim(fill_value) else: raise NotImplementedError(type(other)) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 55a37da6b663ff..64c81a8c109856 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -136,6 +136,12 @@ def test_constructor(self, float_frame, float_frame_int_kind, float_frame_fill0) repr(float_frame) + def test_constructor_fill_value_not_scalar_raises(self): + d = {"b": [2, 3], "a": [0, 1]} + fill_value = np.array(np.nan) + with pytest.raises(ValueError, match="must be a scalar"): + SparseDataFrame(data=d, default_fill_value=fill_value) + def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else From 50fb400fdf2f5a9fb24f86267d899532061b8a78 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 10 Jul 2019 11:35:59 -0700 Subject: [PATCH 202/238] CLN: Split test_window.py (#27305) --- pandas/tests/window/__init__.py | 0 pandas/tests/window/conftest.py | 49 + pandas/tests/window/test_dtypes.py | 228 +++ pandas/tests/window/test_pairwise.py | 183 +++ pandas/tests/window/test_timeseries_window.py | 692 ++++++++++ pandas/tests/{ => window}/test_window.py | 1218 +---------------- 6 files changed, 1201 insertions(+), 1169 deletions(-) create mode 100644 pandas/tests/window/__init__.py create mode 100644 pandas/tests/window/conftest.py create mode 100644 pandas/tests/window/test_dtypes.py create mode 100644 pandas/tests/window/test_pairwise.py create mode 100644 pandas/tests/window/test_timeseries_window.py rename pandas/tests/{ => window}/test_window.py (76%) diff --git a/pandas/tests/window/__init__.py b/pandas/tests/window/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py new file mode 100644 index 00000000000000..7ea4be25ca2a67 --- /dev/null +++ b/pandas/tests/window/conftest.py @@ -0,0 +1,49 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def raw(request): + return request.param + + +@pytest.fixture( + params=[ + "triang", + "blackman", + "hamming", + "bartlett", + "bohman", + "blackmanharris", + "nuttall", + "barthann", + ] +) +def win_types(request): + return request.param + + +@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"]) +def win_types_special(request): + return request.param + + +@pytest.fixture( + params=["sum", "mean", "median", "max", "min", "var", "std", "kurt", "skew"] +) +def arithmetic_win_operators(request): + return request.param + + +@pytest.fixture(params=["right", "left", "both", "neither"]) +def closed(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def center(request): + return request.param + + +@pytest.fixture(params=[None, 1]) +def min_periods(request): + return request.param diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py new file mode 100644 index 00000000000000..ab2915a333afd8 --- /dev/null +++ 
b/pandas/tests/window/test_dtypes.py @@ -0,0 +1,228 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas import DataFrame, Series +from pandas.core.base import DataError +import pandas.util.testing as tm + +# gh-12373 : rolling functions error on float32 data +# make sure rolling functions works for different dtypes +# +# NOTE that these are yielded tests and so _create_data +# is explicitly called. +# +# further note that we are only checking rolling for fully dtype +# compliance (though both expanding and ewm inherit) + + +class Dtype: + window = 2 + + funcs = { + "count": lambda v: v.count(), + "max": lambda v: v.max(), + "min": lambda v: v.min(), + "sum": lambda v: v.sum(), + "mean": lambda v: v.mean(), + "std": lambda v: v.std(), + "var": lambda v: v.var(), + "median": lambda v: v.median(), + } + + def get_expects(self): + expects = { + "sr1": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), + "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), + "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), + "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + }, + "sr2": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), + "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), + "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), + "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), + "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + }, + "df": { + "count": DataFrame( + {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, + dtype="float64", + ), + "max": DataFrame( + {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, + dtype="float64", + ), + "min": DataFrame( + {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, + dtype="float64", + ), + "sum": DataFrame( + { + 0: Series([np.nan, 2, 6, 10, 14]), + 1: Series([np.nan, 4, 8, 12, 16]), + }, + dtype="float64", + ), + "mean": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + "std": DataFrame( + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), + }, + dtype="float64", + ), + "var": DataFrame( + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + dtype="float64", + ), + "median": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + }, + } + return expects + + def _create_dtype_data(self, dtype): + sr1 = Series(np.arange(5), dtype=dtype) + sr2 = Series(np.arange(10, 0, -2), dtype=dtype) + df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) + + data = {"sr1": sr1, "sr2": sr2, "df": df} + + return data + + def _create_data(self): + self.data = self._create_dtype_data(self.dtype) + self.expects = self.get_expects() + + def test_dtypes(self): + self._create_data() + for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + + f = self.funcs[f_name] + d = self.data[d_name] + exp = self.expects[d_name][f_name] + self.check_dtypes(f, f_name, d, d_name, exp) + + def check_dtypes(self, f, f_name, d, d_name, exp): + roll = 
d.rolling(window=self.window) + result = f(roll) + tm.assert_almost_equal(result, exp) + + +class TestDtype_object(Dtype): + dtype = object + + +class Dtype_integer(Dtype): + pass + + +class TestDtype_int8(Dtype_integer): + dtype = np.int8 + + +class TestDtype_int16(Dtype_integer): + dtype = np.int16 + + +class TestDtype_int32(Dtype_integer): + dtype = np.int32 + + +class TestDtype_int64(Dtype_integer): + dtype = np.int64 + + +class Dtype_uinteger(Dtype): + pass + + +class TestDtype_uint8(Dtype_uinteger): + dtype = np.uint8 + + +class TestDtype_uint16(Dtype_uinteger): + dtype = np.uint16 + + +class TestDtype_uint32(Dtype_uinteger): + dtype = np.uint32 + + +class TestDtype_uint64(Dtype_uinteger): + dtype = np.uint64 + + +class Dtype_float(Dtype): + pass + + +class TestDtype_float16(Dtype_float): + dtype = np.float16 + + +class TestDtype_float32(Dtype_float): + dtype = np.float32 + + +class TestDtype_float64(Dtype_float): + dtype = np.float64 + + +class TestDtype_category(Dtype): + dtype = "category" + include_df = False + + def _create_dtype_data(self, dtype): + sr1 = Series(range(5), dtype=dtype) + sr2 = Series(range(10, 0, -2), dtype=dtype) + + data = {"sr1": sr1, "sr2": sr2} + + return data + + +class DatetimeLike(Dtype): + def check_dtypes(self, f, f_name, d, d_name, exp): + + roll = d.rolling(window=self.window) + if f_name == "count": + result = f(roll) + tm.assert_almost_equal(result, exp) + + else: + with pytest.raises(DataError): + f(roll) + + +class TestDtype_timedelta(DatetimeLike): + dtype = np.dtype("m8[ns]") + + +class TestDtype_datetime(DatetimeLike): + dtype = np.dtype("M8[ns]") + + +class TestDtype_datetime64UTC(DatetimeLike): + dtype = "datetime64[ns, UTC]" + + def _create_data(self): + pytest.skip( + "direct creation of extension dtype " + "datetime64[ns, UTC] is not supported ATM" + ) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py new file mode 100644 index 00000000000000..56d89e15c418cb --- /dev/null +++ b/pandas/tests/window/test_pairwise.py @@ -0,0 +1,183 @@ +import warnings + +import pytest + +from pandas import DataFrame, Series +from pandas.core.sorting import safe_sort +import pandas.util.testing as tm + + +class TestPairwise: + + # GH 7738 + df1s = [ + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), + DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), + DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), + ] + df2 = DataFrame( + [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], + columns=["Y", "Z", "X"], + ) + s = Series([1, 1, 3, 8]) + + def compare(self, result, expected): + + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) + def test_no_flex(self, f): + + # DataFrame methods (which do not call _flex_binary_moment()) + + results = [f(df) for df in self.df1s] + for (df, 
result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True), + ], + ) + def test_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=True + # note that we may construct the 1st level of the MI + # in a non-monotonic way, so compare accordingly + results = [] + for i, df in enumerate(self.df1s): + result = f(df) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) + ) + tm.assert_index_equal(result.columns, df.columns) + results.append(df) + + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), + ], + ) + def test_no_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=False + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), + ], + ) + def test_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=True + results = [f(df, self.df2) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) + ) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), + ], + ) + def test_no_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=False + results = [ + f(df, self.df2) if df.columns.is_unique else None for df in self.df1s + ] + for (df, result) in zip(self.df1s, results): + if result is not None: + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + # we can have int and str columns + expected_index = df.index.union(self.df2.index) + expected_columns = 
df.columns.union(self.df2.columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) + else: + with pytest.raises(ValueError, match="'arg1' columns are not unique"): + f(df, self.df2) + with pytest.raises(ValueError, match="'arg2' columns are not unique"): + f(self.df2, df) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), + ], + ) + def test_pairwise_with_series(self, f): + + # DataFrame with a Series + results = [f(df, self.s) for df in self.df1s] + [ + f(self.s, df) for df in self.df1s + ] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py new file mode 100644 index 00000000000000..e057eadae9da86 --- /dev/null +++ b/pandas/tests/window/test_timeseries_window.py @@ -0,0 +1,692 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series, Timestamp, date_range, to_datetime +import pandas.util.testing as tm + +import pandas.tseries.offsets as offsets + + +class TestRollingTS: + + # rolling time-series friendly + # xref GH13327 + + def setup_method(self, method): + + self.regular = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + + self.ragged = DataFrame({"B": range(5)}) + self.ragged.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + def test_doc_string(self): + + df = DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=[ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ], + ) + df + df.rolling("2s").sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with pytest.raises(ValueError): + df.rolling(window="foobar") + + # not a datetimelike index + with pytest.raises(ValueError): + df.reset_index().rolling(window="foobar") + + # non-fixed freqs + for freq in ["2MS", offsets.MonthBegin(2)]: + with pytest.raises(ValueError): + df.rolling(window=freq) + + for freq in ["1D", offsets.Day(2), "2ms"]: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, "foo", np.array([1, 2, 3])]: + with pytest.raises(ValueError): + df.rolling(window="1D", min_periods=minp) + + # center is not implemented + with pytest.raises(NotImplementedError): + df.rolling(window="1D", center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with pytest.raises(ValueError): + df.rolling(window="2s", on="foobar") + + # column is valid + df = df.copy() + df["C"] = date_range("20130101", periods=len(df)) + df.rolling(window="2d", on="C").sum() + + # invalid columns + with pytest.raises(ValueError): + df.rolling(window="2d", on="B") + + # ok even though on non-selected + df.rolling(window="2d", on="C").B.sum() + + def test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) 
+ + assert df.A.is_monotonic + df.rolling("2s", on="A").sum() + + df = df.set_index("A") + assert df.index.is_monotonic + df.rolling("2s").sum() + + # non-monotonic + df.index = reversed(df.index.tolist()) + assert not df.index.is_monotonic + + with pytest.raises(ValueError): + df.rolling("2s").sum() + + df = df.reset_index() + with pytest.raises(ValueError): + df.rolling("2s", on="A").sum() + + def test_frame_on(self): + + df = DataFrame( + {"B": range(5), "C": date_range("20130101 09:00:00", periods=5, freq="3s")} + ) + + df["A"] = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + # we are doing simulating using 'on' + expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True) + + result = df.rolling("2s", on="A").B.sum() + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and resetting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = ( + df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]] + ) + + result = df.rolling("2s", on="A")[["B"]].sum() + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame( + { + "A": [0, 1, 2, 3, 4], + "B": [0, 1, 2, np.nan, 4], + "C": Index( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + ), + }, + columns=["A", "C", "B"], + ) + + expected1 = DataFrame( + {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]}, + columns=["A", "C", "B"], + ) + + result = df.rolling("2s", on="C").sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name="B") + result = df.rolling("2s", on="C").B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[["A", "B", "C"]] + result = df.rolling("2s", on="C")[["A", "B", "C"]].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = date_range("20130101", periods=5, freq="D") + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="1D").sum() + tm.assert_frame_equal(result, expected) + + df.index = date_range("20130101", periods=5, freq="2D") + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window="2D").sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling("2s").sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling("2s", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_closed(self): + + # xref GH13965 + + df = DataFrame( + {"A": [1] * 5}, + index=[ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 
09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:04"), + Timestamp("20130101 09:00:06"), + ], + ) + + # closed must be 'right', 'left', 'both', 'neither' + with pytest.raises(ValueError): + self.regular.rolling(window="2s", closed="blabla") + + expected = df.copy() + expected["A"] = [1.0, 2, 2, 2, 1] + result = df.rolling("2s", closed="right").sum() + tm.assert_frame_equal(result, expected) + + # default should be 'right' + result = df.rolling("2s").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [1.0, 2, 3, 3, 2] + result = df.rolling("2s", closed="both").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [np.nan, 1.0, 2, 2, 1] + result = df.rolling("2s", closed="left").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [np.nan, 1.0, 1, 1, np.nan] + result = df.rolling("2s", closed="neither").sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=2).sum() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s").sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="4s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="4s", min_periods=3).sum() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).mean() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).mean() + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).median() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).median() + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).quantile(0.5) + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).quantile(0.5) + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = 
df.rolling(window="1s", min_periods=1).std(ddof=0) + expected = df.copy() + expected["B"] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="1s", min_periods=1).std(ddof=1) + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).std(ddof=0) + expected = df.copy() + expected["B"] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).std(ddof=1) + expected = df.copy() + expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).var(ddof=0) + expected = df.copy() + expected["B"] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="1s", min_periods=1).var(ddof=1) + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).var(ddof=0) + expected = df.copy() + expected["B"] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).var(ddof=1) + expected = df.copy() + expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window="3s", min_periods=1).skew() + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).skew() + expected = df.copy() + expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window="3s", min_periods=1).kurt() + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).kurt() + expected = df.copy() + expected["B"] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).count() + expected = df.copy() + expected["B"] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window="1s").count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).count() + expected = df.copy() + expected["B"] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=2).count() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": [0.0, 1, 2, 3, 4]} + ).set_index("A") + result = df.rolling("1s").min() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]} + ).set_index("A") + + tm.assert_frame_equal(result, expected) + result = df.rolling("2s").min() + expected = df.copy() + expected["B"] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling("5s").min() + expected = df.copy() + expected["B"] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = 
df.rolling(window="1s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame( + {"B": np.random.randn(N)}, index=date_range("20130101", periods=N, freq="s") + ) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling("2s").min() + assert ((result - expected) < 0.01).all().bool() + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling("200s").min() + assert ((result - expected) < 0.01).all().bool() + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window="1s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_ragged_apply(self, raw): + + df = self.ragged + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + def test_all(self): + + # simple comparison of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window="1s") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + def test_all_apply(self, raw): + + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, raw=raw) + expected = er.apply(lambda x: 1, raw=raw) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparison of integer vs. 
+ # time-based windowing + df = DataFrame( + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window="5H") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + + expected = ( + df.groupby(df.index.day) + .apply(agg_by_day) + .reset_index(level=0, drop=True) + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_monotonic(self): + + # GH 15130 + # we don't need to validate monotonicity when grouping + + data = [ + ["David", "1/1/2015", 100], + ["David", "1/5/2015", 500], + ["David", "5/30/2015", 50], + ["David", "7/25/2015", 50], + ["Ryan", "1/4/2014", 100], + ["Ryan", "1/19/2015", 500], + ["Ryan", "3/31/2016", 50], + ["Joe", "7/1/2015", 100], + ["Joe", "9/9/2015", 500], + ["Joe", "10/15/2015", 50], + ] + + df = DataFrame(data=data, columns=["name", "date", "amount"]) + df["date"] = to_datetime(df["date"]) + + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() + tm.assert_series_equal(result, expected) + + def test_non_monotonic(self): + # GH 13966 (similar to #15130, closed by #15175) + + dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") + df = DataFrame( + { + "A": [1] * 20 + [2] * 12 + [3] * 8, + "B": np.concatenate((dates, dates)), + "C": np.arange(40), + } + ) + + result = df.groupby("A").rolling("4s", on="B").C.mean() + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) + tm.assert_series_equal(result, expected) + + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() + tm.assert_series_equal(result, expected) + + def test_rolling_cov_offset(self): + # GH16058 + + idx = date_range("2017-01-01", periods=24, freq="1h") + ss = Series(np.arange(len(idx)), index=idx) + + result = ss.rolling("2h").cov() + expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) + tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(2, min_periods=1).cov() + tm.assert_series_equal(result, expected2) + + result = ss.rolling("3h").cov() + expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) + tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(3, min_periods=1).cov() + tm.assert_series_equal(result, expected2) diff --git a/pandas/tests/test_window.py b/pandas/tests/window/test_window.py similarity index 76% rename from pandas/tests/test_window.py rename to pandas/tests/window/test_window.py index fca88ff3ce8ceb..d85e22de1d176a 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/window/test_window.py @@ -1,6 +1,5 @@ from collections import OrderedDict from datetime import datetime, timedelta -from itertools import product import warnings from warnings import catch_warnings @@ -13,8 +12,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna -from pandas.core.base import DataError, SpecificationError -from pandas.core.sorting import safe_sort +from pandas.core.base import 
SpecificationError import pandas.core.window as rwindow import pandas.util.testing as tm @@ -23,46 +21,6 @@ N, K = 100, 10 -def assert_equal(left, right): - if isinstance(left, Series): - tm.assert_series_equal(left, right) - else: - tm.assert_frame_equal(left, right) - - -@pytest.fixture(params=[True, False]) -def raw(request): - return request.param - - -@pytest.fixture( - params=[ - "triang", - "blackman", - "hamming", - "bartlett", - "bohman", - "blackmanharris", - "nuttall", - "barthann", - ] -) -def win_types(request): - return request.param - - -@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"]) -def win_types_special(request): - return request.param - - -@pytest.fixture( - params=["sum", "mean", "median", "max", "min", "var", "std", "kurt", "skew"] -) -def arithmetic_win_operators(request): - return request.param - - class Base: _nan_locs = np.arange(20, 40) @@ -963,225 +921,6 @@ def test_numpy_compat(self, method): getattr(e, method)(dtype=np.float64) -# gh-12373 : rolling functions error on float32 data -# make sure rolling functions works for different dtypes -# -# NOTE that these are yielded tests and so _create_data -# is explicitly called. -# -# further note that we are only checking rolling for fully dtype -# compliance (though both expanding and ewm inherit) -class Dtype: - window = 2 - - funcs = { - "count": lambda v: v.count(), - "max": lambda v: v.max(), - "min": lambda v: v.min(), - "sum": lambda v: v.sum(), - "mean": lambda v: v.mean(), - "std": lambda v: v.std(), - "var": lambda v: v.var(), - "median": lambda v: v.median(), - } - - def get_expects(self): - expects = { - "sr1": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), - "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), - "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), - "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), - "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), - "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), - }, - "sr2": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), - "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), - "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), - "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), - "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), - "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - }, - "df": { - "count": DataFrame( - {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, - dtype="float64", - ), - "max": DataFrame( - {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, - dtype="float64", - ), - "min": DataFrame( - {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, - dtype="float64", - ), - "sum": DataFrame( - { - 0: Series([np.nan, 2, 6, 10, 14]), - 1: Series([np.nan, 4, 8, 12, 16]), - }, - dtype="float64", - ), - "mean": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - dtype="float64", - ), - "std": DataFrame( - { - 0: Series([np.nan] + [np.sqrt(2)] * 4), - 1: Series([np.nan] + [np.sqrt(2)] * 4), - }, - dtype="float64", - ), - "var": DataFrame( - {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, - dtype="float64", - ), - "median": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - 
dtype="float64", - ), - }, - } - return expects - - def _create_dtype_data(self, dtype): - sr1 = Series(np.arange(5), dtype=dtype) - sr2 = Series(np.arange(10, 0, -2), dtype=dtype) - df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - - data = {"sr1": sr1, "sr2": sr2, "df": df} - - return data - - def _create_data(self): - self.data = self._create_dtype_data(self.dtype) - self.expects = self.get_expects() - - def test_dtypes(self): - self._create_data() - for f_name, d_name in product(self.funcs.keys(), self.data.keys()): - - f = self.funcs[f_name] - d = self.data[d_name] - exp = self.expects[d_name][f_name] - self.check_dtypes(f, f_name, d, d_name, exp) - - def check_dtypes(self, f, f_name, d, d_name, exp): - roll = d.rolling(window=self.window) - result = f(roll) - tm.assert_almost_equal(result, exp) - - -class TestDtype_object(Dtype): - dtype = object - - -class Dtype_integer(Dtype): - pass - - -class TestDtype_int8(Dtype_integer): - dtype = np.int8 - - -class TestDtype_int16(Dtype_integer): - dtype = np.int16 - - -class TestDtype_int32(Dtype_integer): - dtype = np.int32 - - -class TestDtype_int64(Dtype_integer): - dtype = np.int64 - - -class Dtype_uinteger(Dtype): - pass - - -class TestDtype_uint8(Dtype_uinteger): - dtype = np.uint8 - - -class TestDtype_uint16(Dtype_uinteger): - dtype = np.uint16 - - -class TestDtype_uint32(Dtype_uinteger): - dtype = np.uint32 - - -class TestDtype_uint64(Dtype_uinteger): - dtype = np.uint64 - - -class Dtype_float(Dtype): - pass - - -class TestDtype_float16(Dtype_float): - dtype = np.float16 - - -class TestDtype_float32(Dtype_float): - dtype = np.float32 - - -class TestDtype_float64(Dtype_float): - dtype = np.float64 - - -class TestDtype_category(Dtype): - dtype = "category" - include_df = False - - def _create_dtype_data(self, dtype): - sr1 = Series(range(5), dtype=dtype) - sr2 = Series(range(10, 0, -2), dtype=dtype) - - data = {"sr1": sr1, "sr2": sr2} - - return data - - -class DatetimeLike(Dtype): - def check_dtypes(self, f, f_name, d, d_name, exp): - - roll = d.rolling(window=self.window) - if f_name == "count": - result = f(roll) - tm.assert_almost_equal(result, exp) - - else: - with pytest.raises(DataError): - f(roll) - - -class TestDtype_timedelta(DatetimeLike): - dtype = np.dtype("m8[ns]") - - -class TestDtype_datetime(DatetimeLike): - dtype = np.dtype("M8[ns]") - - -class TestDtype_datetime64UTC(DatetimeLike): - dtype = "datetime64[ns, UTC]" - - def _create_data(self): - pytest.skip( - "direct creation of extension dtype " - "datetime64[ns, UTC] is not supported ATM" - ) - - @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestMoments(Base): def setup_method(self, method): @@ -1204,17 +943,19 @@ def test_centered_axis_validation(self): with pytest.raises(ValueError): (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) - def test_rolling_sum(self): - self._check_moment_func(np.nansum, name="sum", zero_min_periods_equal=False) + def test_rolling_sum(self, raw): + self._check_moment_func( + np.nansum, name="sum", zero_min_periods_equal=False, raw=raw + ) - def test_rolling_count(self): + def test_rolling_count(self, raw): counter = lambda x: np.isfinite(x).astype(float).sum() self._check_moment_func( - counter, name="count", has_min_periods=False, fill_value=0 + counter, name="count", has_min_periods=False, fill_value=0, raw=raw ) - def test_rolling_mean(self): - self._check_moment_func(np.mean, name="mean") + def test_rolling_mean(self, raw): + self._check_moment_func(np.mean, 
name="mean", raw=raw) @td.skip_if_no_scipy def test_cmov_mean(self): @@ -1679,11 +1420,11 @@ def test_cmov_window_special_linear_range(self, win_types_special): ) tm.assert_series_equal(xp, rs) - def test_rolling_median(self): - self._check_moment_func(np.median, name="median") + def test_rolling_median(self, raw): + self._check_moment_func(np.median, name="median", raw=raw) - def test_rolling_min(self): - self._check_moment_func(np.min, name="min") + def test_rolling_min(self, raw): + self._check_moment_func(np.min, name="min", raw=raw) a = pd.Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1).min() @@ -1693,8 +1434,8 @@ def test_rolling_min(self): with pytest.raises(ValueError): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() - def test_rolling_max(self): - self._check_moment_func(np.max, name="max") + def test_rolling_max(self, raw): + self._check_moment_func(np.max, name="max", raw=raw) a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() @@ -1704,7 +1445,7 @@ def test_rolling_max(self): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) - def test_rolling_quantile(self, q): + def test_rolling_quantile(self, q, raw): def scoreatpercentile(a, per): values = np.sort(a, axis=0) @@ -1725,7 +1466,7 @@ def scoreatpercentile(a, per): def quantile_func(x): return scoreatpercentile(x, q) - self._check_moment_func(quantile_func, name="quantile", quantile=q) + self._check_moment_func(quantile_func, name="quantile", quantile=q, raw=raw) def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior @@ -1865,9 +1606,11 @@ def f(x): with pytest.raises(AttributeError): df.rolling(window).apply(f, raw=True) - def test_rolling_std(self): - self._check_moment_func(lambda x: np.std(x, ddof=1), name="std") - self._check_moment_func(lambda x: np.std(x, ddof=0), name="std", ddof=0) + def test_rolling_std(self, raw): + self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) + self._check_moment_func( + lambda x: np.std(x, ddof=0), name="std", ddof=0, raw=raw + ) def test_rolling_std_1obs(self): vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) @@ -1903,26 +1646,29 @@ def test_rolling_std_neg_sqrt(self): b = a.ewm(span=3).std() assert np.isfinite(b[2:]).all() - def test_rolling_var(self): - self._check_moment_func(lambda x: np.var(x, ddof=1), name="var") - self._check_moment_func(lambda x: np.var(x, ddof=0), name="var", ddof=0) + def test_rolling_var(self, raw): + self._check_moment_func(lambda x: np.var(x, ddof=1), name="var", raw=raw) + self._check_moment_func( + lambda x: np.var(x, ddof=0), name="var", ddof=0, raw=raw + ) @td.skip_if_no_scipy - def test_rolling_skew(self): + def test_rolling_skew(self, raw): from scipy.stats import skew - self._check_moment_func(lambda x: skew(x, bias=False), name="skew") + self._check_moment_func(lambda x: skew(x, bias=False), name="skew", raw=raw) @td.skip_if_no_scipy - def test_rolling_kurt(self): + def test_rolling_kurt(self, raw): from scipy.stats import kurtosis - self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt") + self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt", raw=raw) def _check_moment_func( self, static_comp, name, + raw, has_min_periods=True, has_center=True, has_time_rule=True, @@ -2339,182 +2085,6 @@ def _check_ew(self, name=None, preserve_nan=False): assert result2.dtype == np.float_ -class TestPairwise: - - # GH 7738 
- df1s = [ - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), - DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), - DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), - ] - df2 = DataFrame( - [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], - columns=["Y", "Z", "X"], - ) - s = Series([1, 1, 3, 8]) - - def compare(self, result, expected): - - # since we have sorted the results - # we can only compare non-nans - result = result.dropna().values - expected = expected.dropna().values - - tm.assert_numpy_array_equal(result, expected, check_dtype=False) - - @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) - def test_no_flex(self, f): - - # DataFrame methods (which do not call _flex_binary_moment()) - - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.columns) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - - @pytest.mark.parametrize( - "f", - [ - lambda x: x.expanding().cov(pairwise=True), - lambda x: x.expanding().corr(pairwise=True), - lambda x: x.rolling(window=3).cov(pairwise=True), - lambda x: x.rolling(window=3).corr(pairwise=True), - lambda x: x.ewm(com=3).cov(pairwise=True), - lambda x: x.ewm(com=3).corr(pairwise=True), - ], - ) - def test_pairwise_with_self(self, f): - - # DataFrame with itself, pairwise=True - # note that we may construct the 1st level of the MI - # in a non-monotonic way, so compare accordingly - results = [] - for i, df in enumerate(self.df1s): - result = f(df) - tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_numpy_array_equal( - safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) - ) - tm.assert_index_equal(result.columns, df.columns) - results.append(df) - - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - - @pytest.mark.parametrize( - "f", - [ - lambda x: x.expanding().cov(pairwise=False), - lambda x: x.expanding().corr(pairwise=False), - lambda x: x.rolling(window=3).cov(pairwise=False), - lambda x: x.rolling(window=3).corr(pairwise=False), - lambda x: x.ewm(com=3).cov(pairwise=False), - lambda x: x.ewm(com=3).corr(pairwise=False), - ], - ) - def test_no_pairwise_with_self(self, f): - - # DataFrame with itself, pairwise=False - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - - @pytest.mark.parametrize( - "f", - [ - lambda x, y: x.expanding().cov(y, pairwise=True), - lambda x, y: x.expanding().corr(y, pairwise=True), - lambda x, y: x.rolling(window=3).cov(y, pairwise=True), - lambda x, y: x.rolling(window=3).corr(y, pairwise=True), - lambda x, y: x.ewm(com=3).cov(y, pairwise=True), - lambda x, y: x.ewm(com=3).corr(y, pairwise=True), - ], - ) - def 
test_pairwise_with_other(self, f): - - # DataFrame with another DataFrame, pairwise=True - results = [f(df, self.df2) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_numpy_array_equal( - safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) - ) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - - @pytest.mark.parametrize( - "f", - [ - lambda x, y: x.expanding().cov(y, pairwise=False), - lambda x, y: x.expanding().corr(y, pairwise=False), - lambda x, y: x.rolling(window=3).cov(y, pairwise=False), - lambda x, y: x.rolling(window=3).corr(y, pairwise=False), - lambda x, y: x.ewm(com=3).cov(y, pairwise=False), - lambda x, y: x.ewm(com=3).corr(y, pairwise=False), - ], - ) - def test_no_pairwise_with_other(self, f): - - # DataFrame with another DataFrame, pairwise=False - results = [ - f(df, self.df2) if df.columns.is_unique else None for df in self.df1s - ] - for (df, result) in zip(self.df1s, results): - if result is not None: - with catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - # we can have int and str columns - expected_index = df.index.union(self.df2.index) - expected_columns = df.columns.union(self.df2.columns) - tm.assert_index_equal(result.index, expected_index) - tm.assert_index_equal(result.columns, expected_columns) - else: - with pytest.raises(ValueError, match="'arg1' columns are not unique"): - f(df, self.df2) - with pytest.raises(ValueError, match="'arg2' columns are not unique"): - f(self.df2, df) - - @pytest.mark.parametrize( - "f", - [ - lambda x, y: x.expanding().cov(y), - lambda x, y: x.expanding().corr(y), - lambda x, y: x.rolling(window=3).cov(y), - lambda x, y: x.rolling(window=3).corr(y), - lambda x, y: x.ewm(com=3).cov(y), - lambda x, y: x.ewm(com=3).corr(y), - ], - ) - def test_pairwise_with_series(self, f): - - # DataFrame with a Series - results = [f(df, self.s) for df in self.df1s] + [ - f(self.s, df) for df in self.df1s - ] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - - # create the data only once as we are not setting it def _create_consistency_data(): def create_series(): @@ -2741,7 +2311,7 @@ def _non_null_values(x): if mock_mean: # check that mean equals mock_mean expected = mock_mean(x) - assert_equal(mean_x, expected.astype("float64")) + tm.assert_equal(mean_x, expected.astype("float64")) # check that correlation of a series with itself is either 1 or NaN corr_x_x = corr(x, x) @@ -2755,18 +2325,18 @@ def _non_null_values(x): # check mean of constant series expected = x * np.nan expected[count_x >= max(min_periods, 1)] = exp - assert_equal(mean_x, expected) + tm.assert_equal(mean_x, expected) # check correlation of constant series with itself is NaN expected[:] = np.nan - assert_equal(corr_x_x, expected) + tm.assert_equal(corr_x_x, expected) if var_unbiased and var_biased and var_debiasing_factors: # check variance debiasing factors var_unbiased_x = var_unbiased(x) var_biased_x = var_biased(x) var_debiasing_factors_x = var_debiasing_factors(x) - assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) for (std, var, cov) in [ (std_biased, var_biased, cov_biased), @@ -2783,15 +2353,15 @@ def 
_non_null_values(x): assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) - assert_equal(var_x, cov_x_x) + tm.assert_equal(var_x, cov_x_x) # check that var(x) == std(x)^2 - assert_equal(var_x, std_x * std_x) + tm.assert_equal(var_x, std_x * std_x) if var is var_biased: # check that biased var(x) == mean(x^2) - mean(x)^2 mean_x2 = mean(x * x) - assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) if is_constant: # check that variance of constant series is identically 0 @@ -2800,7 +2370,7 @@ def _non_null_values(x): expected[count_x >= max(min_periods, 1)] = 0.0 if var is var_unbiased: expected[count_x < 2] = np.nan - assert_equal(var_x, expected) + tm.assert_equal(var_x, expected) if isinstance(x, Series): for (y, is_constant, no_nans) in self.data: @@ -2812,31 +2382,33 @@ def _non_null_values(x): # check that cor(x, y) is symmetric corr_x_y = corr(x, y) corr_y_x = corr(y, x) - assert_equal(corr_x_y, corr_y_x) + tm.assert_equal(corr_x_y, corr_y_x) if cov: # check that cov(x, y) is symmetric cov_x_y = cov(x, y) cov_y_x = cov(y, x) - assert_equal(cov_x_y, cov_y_x) + tm.assert_equal(cov_x_y, cov_y_x) # check that cov(x, y) == (var(x+y) - var(x) - # var(y)) / 2 var_x_plus_y = var(x + y) var_y = var(y) - assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) # check that corr(x, y) == cov(x, y) / (std(x) * # std(y)) std_y = std(y) - assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) if cov is cov_biased: # check that biased cov(x, y) == mean(x*y) - # mean(x)*mean(y) mean_y = mean(y) mean_x_times_y = mean(x * y) - assert_equal( + tm.assert_equal( cov_x_y, mean_x_times_y - (mean_x * mean_y) ) @@ -3026,7 +2598,7 @@ def test_expanding_consistency(self, min_periods): # GH 9422 if name in ["sum", "prod"]: - assert_equal(expanding_f_result, expanding_apply_f_result) + tm.assert_equal(expanding_f_result, expanding_apply_f_result) @pytest.mark.slow @pytest.mark.parametrize( @@ -3147,7 +2719,7 @@ def test_rolling_consistency(self, window, min_periods, center): # GH 9422 if name in ["sum", "prod"]: - assert_equal(rolling_f_result, rolling_apply_f_result) + tm.assert_equal(rolling_f_result, rolling_apply_f_result) # binary moments def test_rolling_cov(self): @@ -4057,695 +3629,3 @@ def test_expanding_apply(self, raw): result = r.apply(lambda x: x.sum(), raw=raw) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) - - -class TestRollingTS: - - # rolling time-series friendly - # xref GH13327 - - def setup_method(self, method): - - self.regular = DataFrame( - {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} - ).set_index("A") - - self.ragged = DataFrame({"B": range(5)}) - self.ragged.index = [ - Timestamp("20130101 09:00:00"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:05"), - Timestamp("20130101 09:00:06"), - ] - - def test_doc_string(self): - - df = DataFrame( - {"B": [0, 1, 2, np.nan, 4]}, - index=[ - Timestamp("20130101 09:00:00"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:05"), - Timestamp("20130101 09:00:06"), - ], - ) - df - df.rolling("2s").sum() - - def test_valid(self): - - df = self.regular - - # not a valid freq - with pytest.raises(ValueError): - df.rolling(window="foobar") - - # not a datetimelike index - with 
pytest.raises(ValueError): - df.reset_index().rolling(window="foobar") - - # non-fixed freqs - for freq in ["2MS", pd.offsets.MonthBegin(2)]: - with pytest.raises(ValueError): - df.rolling(window=freq) - - for freq in ["1D", pd.offsets.Day(2), "2ms"]: - df.rolling(window=freq) - - # non-integer min_periods - for minp in [1.0, "foo", np.array([1, 2, 3])]: - with pytest.raises(ValueError): - df.rolling(window="1D", min_periods=minp) - - # center is not implemented - with pytest.raises(NotImplementedError): - df.rolling(window="1D", center=True) - - def test_on(self): - - df = self.regular - - # not a valid column - with pytest.raises(ValueError): - df.rolling(window="2s", on="foobar") - - # column is valid - df = df.copy() - df["C"] = pd.date_range("20130101", periods=len(df)) - df.rolling(window="2d", on="C").sum() - - # invalid columns - with pytest.raises(ValueError): - df.rolling(window="2d", on="B") - - # ok even though on non-selected - df.rolling(window="2d", on="C").B.sum() - - def test_monotonic_on(self): - - # on/index must be monotonic - df = DataFrame( - {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} - ) - - assert df.A.is_monotonic - df.rolling("2s", on="A").sum() - - df = df.set_index("A") - assert df.index.is_monotonic - df.rolling("2s").sum() - - # non-monotonic - df.index = reversed(df.index.tolist()) - assert not df.index.is_monotonic - - with pytest.raises(ValueError): - df.rolling("2s").sum() - - df = df.reset_index() - with pytest.raises(ValueError): - df.rolling("2s", on="A").sum() - - def test_frame_on(self): - - df = DataFrame( - { - "B": range(5), - "C": pd.date_range("20130101 09:00:00", periods=5, freq="3s"), - } - ) - - df["A"] = [ - Timestamp("20130101 09:00:00"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:05"), - Timestamp("20130101 09:00:06"), - ] - - # we are doing simulating using 'on' - expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True) - - result = df.rolling("2s", on="A").B.sum() - tm.assert_series_equal(result, expected) - - # test as a frame - # we should be ignoring the 'on' as an aggregation column - # note that the expected is setting, computing, and resetting - # so the columns need to be switched compared - # to the actual result where they are ordered as in the - # original - expected = ( - df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]] - ) - - result = df.rolling("2s", on="A")[["B"]].sum() - tm.assert_frame_equal(result, expected) - - def test_frame_on2(self): - - # using multiple aggregation columns - df = DataFrame( - { - "A": [0, 1, 2, 3, 4], - "B": [0, 1, 2, np.nan, 4], - "C": Index( - [ - Timestamp("20130101 09:00:00"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:05"), - Timestamp("20130101 09:00:06"), - ] - ), - }, - columns=["A", "C", "B"], - ) - - expected1 = DataFrame( - {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]}, - columns=["A", "C", "B"], - ) - - result = df.rolling("2s", on="C").sum() - expected = expected1 - tm.assert_frame_equal(result, expected) - - expected = Series([0, 1, 3, np.nan, 4], name="B") - result = df.rolling("2s", on="C").B.sum() - tm.assert_series_equal(result, expected) - - expected = expected1[["A", "B", "C"]] - result = df.rolling("2s", on="C")[["A", "B", "C"]].sum() - tm.assert_frame_equal(result, expected) - - def test_basic_regular(self): - - df = self.regular.copy() - - df.index = pd.date_range("20130101", periods=5, 
freq="D") - expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window="1D").sum() - tm.assert_frame_equal(result, expected) - - df.index = pd.date_range("20130101", periods=5, freq="2D") - expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window="2D", min_periods=1).sum() - tm.assert_frame_equal(result, expected) - - expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window="2D", min_periods=1).sum() - tm.assert_frame_equal(result, expected) - - expected = df.rolling(window=1).sum() - result = df.rolling(window="2D").sum() - tm.assert_frame_equal(result, expected) - - def test_min_periods(self): - - # compare for min_periods - df = self.regular - - # these slightly different - expected = df.rolling(2, min_periods=1).sum() - result = df.rolling("2s").sum() - tm.assert_frame_equal(result, expected) - - expected = df.rolling(2, min_periods=1).sum() - result = df.rolling("2s", min_periods=1).sum() - tm.assert_frame_equal(result, expected) - - def test_closed(self): - - # xref GH13965 - - df = DataFrame( - {"A": [1] * 5}, - index=[ - Timestamp("20130101 09:00:01"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:04"), - Timestamp("20130101 09:00:06"), - ], - ) - - # closed must be 'right', 'left', 'both', 'neither' - with pytest.raises(ValueError): - self.regular.rolling(window="2s", closed="blabla") - - expected = df.copy() - expected["A"] = [1.0, 2, 2, 2, 1] - result = df.rolling("2s", closed="right").sum() - tm.assert_frame_equal(result, expected) - - # default should be 'right' - result = df.rolling("2s").sum() - tm.assert_frame_equal(result, expected) - - expected = df.copy() - expected["A"] = [1.0, 2, 3, 3, 2] - result = df.rolling("2s", closed="both").sum() - tm.assert_frame_equal(result, expected) - - expected = df.copy() - expected["A"] = [np.nan, 1.0, 2, 2, 1] - result = df.rolling("2s", closed="left").sum() - tm.assert_frame_equal(result, expected) - - expected = df.copy() - expected["A"] = [np.nan, 1.0, 1, 1, np.nan] - result = df.rolling("2s", closed="neither").sum() - tm.assert_frame_equal(result, expected) - - def test_ragged_sum(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).sum() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).sum() - expected = df.copy() - expected["B"] = [0.0, 1, 3, 3, 7] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=2).sum() - expected = df.copy() - expected["B"] = [np.nan, np.nan, 3, np.nan, 7] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="3s", min_periods=1).sum() - expected = df.copy() - expected["B"] = [0.0, 1, 3, 5, 7] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="3s").sum() - expected = df.copy() - expected["B"] = [0.0, 1, 3, 5, 7] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="4s", min_periods=1).sum() - expected = df.copy() - expected["B"] = [0.0, 1, 3, 6, 9] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="4s", min_periods=3).sum() - expected = df.copy() - expected["B"] = [np.nan, np.nan, 3, 6, 9] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).sum() - expected = df.copy() - expected["B"] = [0.0, 1, 3, 6, 10] - tm.assert_frame_equal(result, expected) - - def test_ragged_mean(self): - - df = self.ragged 
- result = df.rolling(window="1s", min_periods=1).mean() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).mean() - expected = df.copy() - expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] - tm.assert_frame_equal(result, expected) - - def test_ragged_median(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).median() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).median() - expected = df.copy() - expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] - tm.assert_frame_equal(result, expected) - - def test_ragged_quantile(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).quantile(0.5) - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).quantile(0.5) - expected = df.copy() - expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] - tm.assert_frame_equal(result, expected) - - def test_ragged_std(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).std(ddof=0) - expected = df.copy() - expected["B"] = [0.0] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="1s", min_periods=1).std(ddof=1) - expected = df.copy() - expected["B"] = [np.nan] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="3s", min_periods=1).std(ddof=0) - expected = df.copy() - expected["B"] = [0.0] + [0.5] * 4 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).std(ddof=1) - expected = df.copy() - expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] - tm.assert_frame_equal(result, expected) - - def test_ragged_var(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).var(ddof=0) - expected = df.copy() - expected["B"] = [0.0] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="1s", min_periods=1).var(ddof=1) - expected = df.copy() - expected["B"] = [np.nan] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="3s", min_periods=1).var(ddof=0) - expected = df.copy() - expected["B"] = [0.0] + [0.25] * 4 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).var(ddof=1) - expected = df.copy() - expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0] - tm.assert_frame_equal(result, expected) - - def test_ragged_skew(self): - - df = self.ragged - result = df.rolling(window="3s", min_periods=1).skew() - expected = df.copy() - expected["B"] = [np.nan] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).skew() - expected = df.copy() - expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0] - tm.assert_frame_equal(result, expected) - - def test_ragged_kurt(self): - - df = self.ragged - result = df.rolling(window="3s", min_periods=1).kurt() - expected = df.copy() - expected["B"] = [np.nan] * 5 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).kurt() - expected = df.copy() - expected["B"] = [np.nan] * 4 + [-1.2] - tm.assert_frame_equal(result, expected) - - def test_ragged_count(self): - - df = self.ragged - result = df.rolling(window="1s", min_periods=1).count() - expected = df.copy() - expected["B"] = [1.0, 1, 1, 1, 1] - tm.assert_frame_equal(result, expected) - - df = self.ragged - result = 
df.rolling(window="1s").count() - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).count() - expected = df.copy() - expected["B"] = [1.0, 1, 2, 1, 2] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=2).count() - expected = df.copy() - expected["B"] = [np.nan, np.nan, 2, np.nan, 2] - tm.assert_frame_equal(result, expected) - - def test_regular_min(self): - - df = DataFrame( - { - "A": pd.date_range("20130101", periods=5, freq="s"), - "B": [0.0, 1, 2, 3, 4], - } - ).set_index("A") - result = df.rolling("1s").min() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - df = DataFrame( - {"A": pd.date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]} - ).set_index("A") - - tm.assert_frame_equal(result, expected) - result = df.rolling("2s").min() - expected = df.copy() - expected["B"] = [5.0, 4, 3, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling("5s").min() - expected = df.copy() - expected["B"] = [5.0, 4, 3, 3, 3] - tm.assert_frame_equal(result, expected) - - def test_ragged_min(self): - - df = self.ragged - - result = df.rolling(window="1s", min_periods=1).min() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).min() - expected = df.copy() - expected["B"] = [0.0, 1, 1, 3, 3] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).min() - expected = df.copy() - expected["B"] = [0.0, 0, 0, 1, 1] - tm.assert_frame_equal(result, expected) - - def test_perf_min(self): - - N = 10000 - - dfp = DataFrame( - {"B": np.random.randn(N)}, - index=pd.date_range("20130101", periods=N, freq="s"), - ) - expected = dfp.rolling(2, min_periods=1).min() - result = dfp.rolling("2s").min() - assert ((result - expected) < 0.01).all().bool() - - expected = dfp.rolling(200, min_periods=1).min() - result = dfp.rolling("200s").min() - assert ((result - expected) < 0.01).all().bool() - - def test_ragged_max(self): - - df = self.ragged - - result = df.rolling(window="1s", min_periods=1).max() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).max() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).max() - expected = df.copy() - expected["B"] = [0.0, 1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - def test_ragged_apply(self, raw): - - df = self.ragged - - f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - def test_all(self): - - # simple comparison of integer vs time-based windowing - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - for f in [ - "sum", - "mean", - "count", - "median", - "std", - "var", - "kurt", - "skew", - "min", - "max", - ]: - - result = getattr(r, f)() - expected = getattr(er, f)() - tm.assert_frame_equal(result, 
expected) - - result = r.quantile(0.5) - expected = er.quantile(0.5) - tm.assert_frame_equal(result, expected) - - def test_all_apply(self, raw): - - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - result = r.apply(lambda x: 1, raw=raw) - expected = er.apply(lambda x: 1, raw=raw) - tm.assert_frame_equal(result, expected) - - def test_all2(self): - - # more sophisticated comparison of integer vs. - # time-based windowing - df = DataFrame( - {"B": np.arange(50)}, index=pd.date_range("20130101", periods=50, freq="H") - ) - # in-range data - dft = df.between_time("09:00", "16:00") - - r = dft.rolling(window="5H") - - for f in [ - "sum", - "mean", - "count", - "median", - "std", - "var", - "kurt", - "skew", - "min", - "max", - ]: - - result = getattr(r, f)() - - # we need to roll the days separately - # to compare with a time-based roll - # finally groupby-apply will return a multi-index - # so we need to drop the day - def agg_by_day(x): - x = x.between_time("09:00", "16:00") - return getattr(x.rolling(5, min_periods=1), f)() - - expected = ( - df.groupby(df.index.day) - .apply(agg_by_day) - .reset_index(level=0, drop=True) - ) - - tm.assert_frame_equal(result, expected) - - def test_groupby_monotonic(self): - - # GH 15130 - # we don't need to validate monotonicity when grouping - - data = [ - ["David", "1/1/2015", 100], - ["David", "1/5/2015", 500], - ["David", "5/30/2015", 50], - ["David", "7/25/2015", 50], - ["Ryan", "1/4/2014", 100], - ["Ryan", "1/19/2015", 500], - ["Ryan", "3/31/2016", 50], - ["Joe", "7/1/2015", 100], - ["Joe", "9/9/2015", 500], - ["Joe", "10/15/2015", 50], - ] - - df = DataFrame(data=data, columns=["name", "date", "amount"]) - df["date"] = pd.to_datetime(df["date"]) - - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) - result = df.groupby("name").rolling("180D", on="date")["amount"].sum() - tm.assert_series_equal(result, expected) - - def test_non_monotonic(self): - # GH 13966 (similar to #15130, closed by #15175) - - dates = pd.date_range(start="2016-01-01 09:30:00", periods=20, freq="s") - df = DataFrame( - { - "A": [1] * 20 + [2] * 12 + [3] * 8, - "B": np.concatenate((dates, dates)), - "C": np.arange(40), - } - ) - - result = df.groupby("A").rolling("4s", on="B").C.mean() - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) - tm.assert_series_equal(result, expected) - - df2 = df.sort_values("B") - result = df2.groupby("A").rolling("4s", on="B").C.mean() - tm.assert_series_equal(result, expected) - - def test_rolling_cov_offset(self): - # GH16058 - - idx = pd.date_range("2017-01-01", periods=24, freq="1h") - ss = Series(np.arange(len(idx)), index=idx) - - result = ss.rolling("2h").cov() - expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) - tm.assert_series_equal(result, expected) - - expected2 = ss.rolling(2, min_periods=1).cov() - tm.assert_series_equal(result, expected2) - - result = ss.rolling("3h").cov() - expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) - tm.assert_series_equal(result, expected) - - expected2 = ss.rolling(3, min_periods=1).cov() - tm.assert_series_equal(result, expected2) From 15df8a439ca2517c20c0ff850026682308fef7bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 10 Jul 2019 11:37:47 -0700 Subject: [PATCH 203/238] BUG: appending a Timedelta to Series incorrectly casts to integer (#27303) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexing.py | 9 
++++---- pandas/tests/series/indexing/test_indexing.py | 23 +++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index daca08d69346d6..237d2fec825205 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1050,6 +1050,7 @@ Indexing - Bug in :class:`Categorical` and :class:`CategoricalIndex` with :class:`Interval` values when using the ``in`` operator (``__contains``) with objects that are not comparable to the values in the ``Interval`` (:issue:`23705`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`) - Bug in :class:`CategoricalIndex` and :class:`Categorical` incorrectly raising ``ValueError`` instead of ``TypeError`` when a list is passed using the ``in`` operator (``__contains__``) (:issue:`21729`) +- Bug in setting a new value in a :class:`Series` with a :class:`Timedelta` object incorrectly casting the value to an integer (:issue:`22717`) - Missing diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1f25be8b9e31e8..c31d6538ad2c30 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -20,6 +20,7 @@ is_sequence, is_sparse, ) +from pandas.core.dtypes.concat import _concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna @@ -432,11 +433,9 @@ def _setitem_with_indexer(self, indexer, value): # this preserves dtype of the value new_values = Series([value])._values if len(self.obj._values): - try: - new_values = np.concatenate([self.obj._values, new_values]) - except TypeError: - as_obj = self.obj.astype(object) - new_values = np.concatenate([as_obj, new_values]) + # GH#22717 handle casting compatibility that np.concatenate + # does incorrectly + new_values = _concat_compat([self.obj._values, new_values]) self.obj._data = self.obj._constructor( new_values, index=new_index, name=self.obj.name )._data diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c8342c54e9b5db..1fb1dd3bb998af 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -654,6 +654,29 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) +@pytest.mark.parametrize( + "td", + [ + pd.Timedelta("9 days"), + pd.Timedelta("9 days").to_timedelta64(), + pd.Timedelta("9 days").to_pytimedelta(), + ], +) +def test_append_timedelta_does_not_cast(td): + # GH#22717 inserting a Timedelta should _not_ cast to int64 + expected = pd.Series(["x", td], index=[0, "td"], dtype=object) + + ser = pd.Series(["x"]) + ser["td"] = td + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], pd.Timedelta) + + ser = pd.Series(["x"]) + ser.loc["td"] = pd.Timedelta("9 days") + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], pd.Timedelta) + + def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) From 472af55268b59635e526e2d02105da04e812a27b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 10 Jul 2019 21:16:54 +0200 Subject: [PATCH 204/238] CLN: replace usage internally of .iteritems with .items (#26114) --- asv_bench/benchmarks/frame_methods.py | 8 ++--- .../development/contributing_docstring.rst | 2 +- 
doc/source/getting_started/basics.rst | 10 +++--- doc/source/reference/frame.rst | 2 +- doc/source/reference/series.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/frame.py | 31 +++++++++++-------- pandas/core/generic.py | 21 ++++++++----- pandas/core/indexes/multi.py | 2 +- pandas/core/reshape/pivot.py | 2 +- pandas/core/reshape/reshape.py | 8 ++--- pandas/core/series.py | 13 ++++---- pandas/core/sparse/frame.py | 2 +- pandas/core/strings.py | 2 +- pandas/core/util/hashing.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/io/json/_table_schema.py | 2 +- pandas/io/msgpack/_packer.pyx | 2 +- pandas/io/sql.py | 2 +- pandas/io/stata.py | 4 +-- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/frame/test_api.py | 4 +-- pandas/tests/frame/test_indexing.py | 2 +- pandas/tests/frame/test_operators.py | 4 +-- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_scalar.py | 4 +-- pandas/tests/io/test_html.py | 4 +-- pandas/tests/series/test_api.py | 4 +-- pandas/tests/series/test_io.py | 2 +- pandas/tests/test_base.py | 4 +-- 31 files changed, 84 insertions(+), 70 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5008b77d9fb28b..e2f6764c76eef8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -115,15 +115,15 @@ def setup(self): ) self.df4 = DataFrame(np.random.randn(N * 1000, 10)) - def time_iteritems(self): + def time_items(self): # (monitor no-copying behaviour) if hasattr(self.df, "_item_cache"): self.df._item_cache.clear() - for name, col in self.df.iteritems(): + for name, col in self.df.items(): pass - def time_iteritems_cached(self): - for name, col in self.df.iteritems(): + def time_items_cached(self): + for name, col in self.df.items(): pass def time_iteritems_indexing(self): diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 62216f168af3cd..34bc5f44eb0c01 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -522,7 +522,7 @@ examples: * ``loc`` and ``iloc``, as they do the same, but in one case providing indices and in the other positions * ``max`` and ``min``, as they do the opposite -* ``iterrows``, ``itertuples`` and ``iteritems``, as it is easy that a user +* ``iterrows``, ``itertuples`` and ``items``, as it is easy that a user looking for the method to iterate over columns ends up in the method to iterate over rows, and vice-versa * ``fillna`` and ``dropna``, as both methods are used to handle missing values diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 682d6c1ef8301f..bc3b7b4c70fd13 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1475,7 +1475,7 @@ Thus, for example, iterating over a DataFrame gives you the column names: print(col) -Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to +Pandas objects also have the dict-like :meth:`~DataFrame.items` method to iterate over the (key, value) pairs. 
To iterate over the rows of a DataFrame, you can use the following methods: @@ -1524,10 +1524,10 @@ To iterate over the rows of a DataFrame, you can use the following methods: df -iteritems -~~~~~~~~~ +items +~~~~~ -Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates +Consistent with the dict-like interface, :meth:`~DataFrame.items` iterates through key-value pairs: * **Series**: (index, scalar value) pairs @@ -1537,7 +1537,7 @@ For example: .. ipython:: python - for label, ser in df.iteritems(): + for label, ser in df.items(): print(label) print(ser) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 1a316c2f25ec63..c0b58fd2d99f50 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -67,8 +67,8 @@ Indexing, iteration DataFrame.insert DataFrame.__iter__ DataFrame.items - DataFrame.keys DataFrame.iteritems + DataFrame.keys DataFrame.iterrows DataFrame.itertuples DataFrame.lookup diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index e8e2f64e22cb51..8d2a764c33a43b 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -76,8 +76,8 @@ Indexing, iteration Series.loc Series.iloc Series.__iter__ - Series.iteritems Series.items + Series.iteritems Series.keys Series.pop Series.item diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 237d2fec825205..042c97a0c98b1a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -889,6 +889,7 @@ Other deprecations - :meth:`DataFrame.get_dtype_counts` is deprecated. (:issue:`18262`) - :meth:`Categorical.ravel` will return a :class:`Categorical` instead of a ``np.ndarray`` (:issue:`27199`) + .. _whatsnew_0250.prior_deprecations: Removal of prior version deprecations/changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ce1b99b3159362..55a9eb6a0810a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -771,15 +771,15 @@ def style(self): return Styler(self) - def iteritems(self): - r""" + _shared_docs[ + "items" + ] = r""" Iterator over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. - Yields - ------ + %s label : object The column names for the DataFrame being iterated over. content : Series @@ -802,7 +802,7 @@ def iteritems(self): panda bear 1864 polar bear 22000 koala marsupial 80000 - >>> for label, content in df.iteritems(): + >>> for label, content in df.items(): ... print('label:', label) ... print('content:', content, sep='\n') ... @@ -819,6 +819,9 @@ def iteritems(self): koala 80000 Name: population, dtype: int64 """ + + @Appender(_shared_docs["items"] % "Yields\n ------") + def items(self): if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) @@ -826,6 +829,10 @@ def iteritems(self): for i, k in enumerate(self.columns): yield k, self._ixs(i, axis=1) + @Appender(_shared_docs["items"] % "Returns\n -------") + def iteritems(self): + return self.items() + def iterrows(self): """ Iterate over DataFrame rows as (index, Series) pairs. @@ -843,7 +850,7 @@ def iterrows(self): See Also -------- itertuples : Iterate over DataFrame rows as namedtuples of the values. - iteritems : Iterate over (column name, Series) pairs. + items : Iterate over (column name, Series) pairs. 
Notes ----- @@ -901,7 +908,7 @@ def itertuples(self, index=True, name="Pandas"): -------- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. - DataFrame.iteritems : Iterate over (column name, Series) pairs. + DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- @@ -958,8 +965,6 @@ def itertuples(self, index=True, name="Pandas"): # fallback to regular tuples return zip(*arrays) - items = iteritems - def __len__(self): """ Returns length of info axis, but here we use the index. @@ -2634,7 +2639,7 @@ def memory_usage(self, index=True, deep=False): 5216 """ result = Series( - [c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], + [c.memory_usage(index=False, deep=deep) for col, c in self.items()], index=self.columns, ) if index: @@ -4955,7 +4960,7 @@ def f(vals): if not diff.empty: raise KeyError(diff) - vals = (col.values for name, col in self.iteritems() if name in subset) + vals = (col.values for name, col in self.items() if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) @@ -7343,7 +7348,7 @@ def round(self, decimals=0, *args, **kwargs): from pandas.core.reshape.concat import concat def _dict_round(df, decimals): - for col, vals in df.iteritems(): + for col, vals in df.items(): try: yield _series_round(vals, decimals[col]) except KeyError: @@ -7363,7 +7368,7 @@ def _series_round(s, decimals): new_cols = [col for col in _dict_round(self, decimals)] elif is_integer(decimals): # Dispatch to Series.round - new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] + new_cols = [_series_round(v, decimals) for _, v in self.items()] else: raise TypeError("decimals must be an integer, a dict-like or a " "Series") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5db06d32880ccc..4e05dfca43e786 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -494,7 +494,7 @@ def _get_space_character_free_column_resolvers(self): """ from pandas.core.computation.common import _remove_spaces_column_name - return {_remove_spaces_column_name(k): v for k, v in self.iteritems()} + return {_remove_spaces_column_name(k): v for k, v in self.items()} @property def _info_axis(self): @@ -1936,15 +1936,22 @@ def keys(self): """ return self._info_axis - def iteritems(self): - """ - Iterate over (label, values) on info axis + def items(self): + """Iterate over (label, values) on info axis - This is index for Series, columns for DataFrame and so on. + This is index for Series and columns for DataFrame. + + Returns + ------- + Generator """ for h in self._info_axis: yield h, self[h] + @Appender(items.__doc__) + def iteritems(self): + return self.items() + def __len__(self): """Returns length of info axis""" return len(self._info_axis) @@ -5912,7 +5919,7 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): "key in a dtype mappings argument." 
) results = [] - for col_name, col in self.iteritems(): + for col_name, col in self.items(): if col_name in dtype: results.append( col.astype( @@ -10328,7 +10335,7 @@ def describe_1d(data): else: data = self.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s) for _, s in data.iteritems()] + ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows names = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ff0bffacd37ad3..670a4666a34405 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -601,7 +601,7 @@ def from_frame(cls, df, sortorder=None, names=None): if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") - column_names, columns = zip(*df.iteritems()) + column_names, columns = zip(*df.items()) names = column_names if names is None else names return cls.from_arrays(columns, sortorder=sortorder, names=names) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 188f2edd96590a..23bf89b2bc1ac8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -272,7 +272,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name="All"): if values: grand_margin = {} - for k, v in data[values].iteritems(): + for k, v in data[values].items(): try: if isinstance(aggfunc, str): grand_margin[k] = getattr(v, aggfunc)() diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5d932d7ded9b8c..540a06caec2203 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -478,7 +478,7 @@ def _unstack_extension_series(series, level, fill_value): out = [] values = extract_array(series, extract_numpy=False) - for col, indices in result.iteritems(): + for col, indices in result.items(): out.append( Series( values.take(indices.values, allow_fill=True, fill_value=fill_value), @@ -544,7 +544,7 @@ def factorize(index): if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type( - [col._values for _, col in frame.iteritems()] + [col._values for _, col in frame.items()] ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: @@ -695,7 +695,7 @@ def _convert_level_number(level_num, columns): subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( - [x._values for _, x in subset.iteritems()] + [x._values for _, x in subset.items()] ) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() @@ -909,7 +909,7 @@ def check_len(item, name): # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] - for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): + for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): # col is (column_name, column), use just column data here dummy = _get_dummies_1d( col[1], diff --git a/pandas/core/series.py b/pandas/core/series.py index 1943b66818b95b..4b78907e661067 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1693,13 +1693,12 @@ def to_string( # ---------------------------------------------------------------------- - def iteritems(self): + def items(self): """ Lazily iterate over (index, value) tuples. This method returns an iterable tuple (index, value). This is - convenient if you want to create a lazy iterator. Note that the - methods Series.items and Series.iteritems are the same methods. 
+ convenient if you want to create a lazy iterator. Returns ------- @@ -1709,12 +1708,12 @@ def iteritems(self): See Also -------- - DataFrame.iteritems : Equivalent to Series.iteritems for DataFrame. + DataFrame.items : Equivalent to Series.items for DataFrame. Examples -------- >>> s = pd.Series(['A', 'B', 'C']) - >>> for index, value in s.iteritems(): + >>> for index, value in s.items(): ... print("Index : {}, Value : {}".format(index, value)) Index : 0, Value : A Index : 1, Value : B @@ -1722,7 +1721,9 @@ def iteritems(self): """ return zip(iter(self.index), iter(self)) - items = iteritems + @Appender(items.__doc__) + def iteritems(self): + return self.items() # ---------------------------------------------------------------------- # Misc public methods diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 60060a4a2d1fae..54998eb66e69d2 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -698,7 +698,7 @@ def _reindex_index( need_mask = mask.any() new_series = {} - for col, series in self.iteritems(): + for col, series in self.items(): if mask.all(): continue diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 70700653c47957..7c293ca4e50b07 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -998,7 +998,7 @@ def str_extractall(arr, pat, flags=0): index_list = [] is_mi = arr.index.nlevels > 1 - for subject_key, subject in arr.iteritems(): + for subject_key, subject in arr.items(): if isinstance(subject, str): if not is_mi: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index f07133baed4359..f5ab81ad9089ec 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -113,7 +113,7 @@ def hash_pandas_object( h = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): - hashes = (hash_array(series.values) for _, series in obj.iteritems()) + hashes = (hash_array(series.values) for _, series in obj.items()) num_items = len(obj.columns) if index: index_hash_generator = ( diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index e7aa5d22995c66..98349fe1e4792e 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -538,7 +538,7 @@ def _update_ctx(self, attrs): matter. 
""" for row_label, v in attrs.iterrows(): - for col_label, col in v.iteritems(): + for col_label, col in v.items(): i = self.index.get_indexer([row_label])[0] j = self.columns.get_indexer([col_label])[0] for pair in col.rstrip(";").split(";"): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 1f0728ee96469a..f3c966bb1a476d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1105,7 +1105,7 @@ def _process_converter(self, f, filt=None): needs_new_obj = False new_obj = dict() - for i, (col, c) in enumerate(self.obj.iteritems()): + for i, (col, c) in enumerate(self.obj.items()): if filt(col, c): new_data, result = f(col, c) if result: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 045127c63af5c2..1e7cd54d9f4a00 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -249,7 +249,7 @@ def build_table_schema(data, index=True, primary_key=None, version=True): fields.append(convert_pandas_type_to_json_field(data.index)) if data.ndim > 1: - for column, s in data.iteritems(): + for column, s in data.items(): fields.append(convert_pandas_type_to_json_field(s)) else: fields.append(convert_pandas_type_to_json_field(data)) diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index a0d2b013c8e9df..0ed188074f3d98 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -194,7 +194,7 @@ cdef class Packer: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: - for k, v in d.iteritems(): + for k, v in d.items(): ret = self._pack(k, nest_limit - 1) if ret != 0: break ret = self._pack(v, nest_limit - 1) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 211571c7dbaa13..6fe34e4e9705aa 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -108,7 +108,7 @@ def _parse_date_columns(data_frame, parse_dates): # we want to coerce datetime64_tz dtypes for now to UTC # we could in theory do a 'nice' conversion from a FixedOffset tz # GH11216 - for col_name, df_col in data_frame.iteritems(): + for col_name, df_col in data_frame.items(): if is_datetime64tz_dtype(df_col) or col_name in parse_dates: try: fmt = parse_dates[col_name] diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7087d2ee963cbd..29cb2a5dc0f0e0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2302,7 +2302,7 @@ def _check_column_names(self, data): def _set_formats_and_types(self, data, dtypes): self.typlist = [] self.fmtlist = [] - for col, dtype in dtypes.iteritems(): + for col, dtype in dtypes.items(): self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) self.typlist.append(_dtype_to_stata_type(dtype, data[col])) @@ -3168,7 +3168,7 @@ def _convert_strls(self, data): def _set_formats_and_types(self, data, dtypes): self.typlist = [] self.fmtlist = [] - for col, dtype in dtypes.iteritems(): + for col, dtype in dtypes.items(): force_strl = col in self._convert_strl fmt = _dtype_to_default_stata_fmt( dtype, data[col], dta_version=117, force_strl=force_strl diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index d25715e6d167b3..519465802085bb 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -258,7 +258,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): # else: # columns = data.columns - for col, values in data.iteritems(): + for col, values in data.items(): if keep_index is True: yield col, values else: diff --git 
a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index fe59f0574fb75a..b4b081cfe8d76d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -319,7 +319,7 @@ def test_sequence_like_with_categorical(self): for row, s in df.iterrows(): str(s) - for c, col in df.iteritems(): + for c, col in df.items(): str(s) def test_len(self, float_frame): @@ -430,7 +430,7 @@ def test_repr_with_mi_nat(self, float_string_frame): expected = " X\nNaT a 1\n2013-01-01 b 2" assert result == expected - def test_iteritems_names(self, float_string_frame): + def test_items_names(self, float_string_frame): for k, v in float_string_frame.items(): assert v.name == k diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index c2d38b2938fca2..3c102f49c6cbf7 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2712,7 +2712,7 @@ def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) rs = df.where(cond, other1) rs2 = df.where(cond.values, other1) - for k, v in rs.iteritems(): + for k, v in rs.items(): exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) assert_series_equal(v, exp, check_names=False) assert_frame_equal(rs, rs2) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 67482ddf657fb2..bffdf17a497506 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -281,7 +281,7 @@ def test_binary_ops_align(self): result = getattr(df, op)(x, level="third", axis=0) expected = pd.concat( - [opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()] + [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()] ).sort_index() assert_frame_equal(result, expected) @@ -289,7 +289,7 @@ def test_binary_ops_align(self): result = getattr(df, op)(x, level="second", axis=0) expected = ( - pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]) + pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()]) .reindex_like(df) .sort_index() ) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index ba144909724cfb..a2a22bf60e0e40 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -839,7 +839,7 @@ def test_float_index_non_scalar_assignment(self): def test_float_index_at_iat(self): s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) - for el, item in s.iteritems(): + for el, item in s.items(): assert s.at[el] == item for i in range(len(s)): assert s.iat[i] == i + 1 diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index e6ccee684b76b6..38b4897e55c844 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -198,7 +198,7 @@ def test_series_set_tz_timestamp(self, tz_naive_fixture): def test_mixed_index_at_iat_loc_iloc_series(self): # GH 19860 s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) - for el, item in s.iteritems(): + for el, item in s.items(): assert s.at[el] == s.loc[el] == item for i in range(len(s)): assert s.iat[i] == s.iloc[i] == i + 1 @@ -214,7 +214,7 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], columns=["a", "b", "c", 1, 2] ) for rowIdx, row in df.iterrows(): - for el, item in row.iteritems(): + for el, item in row.items(): assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item for row in range(2): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 
6d06113dfc9eca..615e2735cd288f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -380,7 +380,7 @@ def test_thousands_macau_stats(self, datapath): dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isna().any() for _, s in df.items()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -389,7 +389,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isna().any() for _, s in df.items()) def test_empty_tables(self): """ diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 2870677e42d50e..d204d7d2a1d7ca 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -338,10 +338,10 @@ def test_values(self): tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False) def test_iteritems(self): - for idx, val in self.series.items(): + for idx, val in self.series.iteritems(): assert val == self.series[idx] - for idx, val in self.ts.items(): + for idx, val in self.ts.iteritems(): assert val == self.ts[idx] # assert is lazy (genrators don't define reverse, lists do) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 5389390501b32f..0686b397cbd811 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -268,5 +268,5 @@ def test_to_dict(self, mapping, datetime_series): Series(datetime_series.to_dict(mapping), name="ts"), datetime_series ) from_method = Series(datetime_series.to_dict(collections.Counter)) - from_constructor = Series(collections.Counter(datetime_series.iteritems())) + from_constructor = Series(collections.Counter(datetime_series.items())) tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 279d6dd84d92bb..d75016824d6cf3 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1107,13 +1107,13 @@ def test_iterable_object_and_category(self, typ, method, dtype, rdtype, obj): @pytest.mark.parametrize("dtype, rdtype", dtypes) def test_iterable_items(self, dtype, rdtype): # gh-13258 - # test items / iteritems yields the correct boxed scalars + # test if items yields the correct boxed scalars # this only applies to series s = Series([1], dtype=dtype) _, result = list(s.items())[0] assert isinstance(result, rdtype) - _, result = list(s.iteritems())[0] + _, result = list(s.items())[0] assert isinstance(result, rdtype) @pytest.mark.parametrize( From 2d0b20b719452dd2539fbd4ad2c5e48e8c4c95d1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 Jul 2019 19:18:33 -0500 Subject: [PATCH 205/238] TST: suppress rolling warnings correctly for raw= (#27330) --- pandas/tests/window/test_window.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index d85e22de1d176a..2f3b83e172795f 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import copy from datetime import datetime, timedelta import warnings from warnings import catch_warnings @@ -1536,21 +1537,20 @@ def test_rolling_apply(self, raw): # suppress warnings about empty slices, 
as we are deliberately testing # with a 0-length Series - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - def f(x): + def f(x): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) return x[np.isfinite(x)].mean() - self._check_moment_func(np.mean, name="apply", func=f, raw=raw) + self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - expected = Series([]) - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) + expected = Series([]) + result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) + tm.assert_series_equal(result, expected) # gh-8080 s = Series([None, None, None]) @@ -1676,6 +1676,12 @@ def _check_moment_func( zero_min_periods_equal=True, **kwargs ): + + # inject raw + if name == "apply": + kwargs = copy.copy(kwargs) + kwargs["raw"] = raw + def get_result(obj, window, min_periods=None, center=False): r = obj.rolling(window=window, min_periods=min_periods, center=center) return getattr(r, name)(**kwargs) From 38855759e7ca30f6f0fb50be3dfe73f4ff6d42ab Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Jul 2019 07:32:14 -0700 Subject: [PATCH 206/238] BUG: Consistent division by zero behavior for Index/Series (#27321) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/ops/__init__.py | 32 +------------ pandas/core/ops/missing.py | 48 ++++++++++++------- pandas/tests/arithmetic/test_numeric.py | 28 +++++------ .../tests/arrays/sparse/test_arithmetics.py | 6 +++ pandas/tests/arrays/test_integer.py | 4 ++ pandas/tests/sparse/frame/test_frame.py | 16 +++++-- pandas/tests/sparse/series/test_series.py | 15 ++++-- 8 files changed, 82 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 042c97a0c98b1a..ebe8b4770f6aa2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1009,6 +1009,7 @@ Numeric - Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) - Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`) - Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`) +- Inconsistency in :class:`Series` floor-division (`//`) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) - Conversion diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d735ab3ad25353..ee5c670364485a 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -234,32 +234,6 @@ def _gen_eval_kwargs(name): return kwargs -def _gen_fill_zeros(name): - """ - Find the appropriate fill value to use when filling in undefined values - in the results of the given operation caused by operating on - (generally dividing by) zero. 
- - Parameters - ---------- - name : str - - Returns - ------- - fill_value : {None, np.nan, np.inf} - """ - name = name.strip("__") - if "div" in name: - # truediv, floordiv, and reversed variants - fill_value = np.inf - elif "mod" in name: - # mod, rmod - fill_value = np.nan - else: - fill_value = None - return fill_value - - def _get_frame_op_default_axis(name): """ Only DataFrame cares about default_axis, specifically: @@ -1632,7 +1606,6 @@ def _arith_method_SERIES(cls, op, special): str_rep = _get_opstr(op, cls) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) - fill_zeros = _gen_fill_zeros(op_name) construct_result = ( _construct_divmod_result if op in [divmod, rdivmod] else _construct_result ) @@ -1663,7 +1636,7 @@ def na_op(x, y): except TypeError: result = masked_arith_op(x, y, op) - return missing.dispatch_fill_zeros(op, x, y, result, fill_zeros) + return missing.dispatch_fill_zeros(op, x, y, result) def wrapper(left, right): if isinstance(right, ABCDataFrame): @@ -2154,7 +2127,6 @@ def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op, cls) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) - fill_zeros = _gen_fill_zeros(op_name) default_axis = _get_frame_op_default_axis(op_name) def na_op(x, y): @@ -2165,7 +2137,7 @@ def na_op(x, y): except TypeError: result = masked_arith_op(x, y, op) - return missing.dispatch_fill_zeros(op, x, y, result, fill_zeros) + return missing.dispatch_fill_zeros(op, x, y, result) if op_name in _op_descriptions: # i.e. include "add" but not "__add__" diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 608c2550994f19..36989582615558 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -27,7 +27,7 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_scalar -from .roperator import rdivmod +from .roperator import rdivmod, rfloordiv, rmod def fill_zeros(result, x, y, name, fill): @@ -85,7 +85,7 @@ def fill_zeros(result, x, y, name, fill): return result -def mask_zero_div_zero(x, y, result, copy=False): +def mask_zero_div_zero(x, y, result): """ Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes of the numerator or the denominator. @@ -95,9 +95,6 @@ def mask_zero_div_zero(x, y, result, copy=False): x : ndarray y : ndarray result : ndarray - copy : bool (default False) - Whether to always create a new array or try to fill in the existing - array if possible. 
Returns ------- @@ -113,10 +110,19 @@ def mask_zero_div_zero(x, y, result, copy=False): >>> mask_zero_div_zero(x, y, result) array([ inf, nan, -inf]) """ + if not isinstance(result, np.ndarray): + # FIXME: SparseArray would raise TypeError with np.putmask + return result + if is_scalar(y): y = np.array(y) zmask = y == 0 + + if isinstance(zmask, bool): + # FIXME: numpy did not evaluate pointwise, seen in docs build + return result + if zmask.any(): shape = result.shape @@ -125,12 +131,13 @@ def mask_zero_div_zero(x, y, result, copy=False): zpos_mask = zmask & ~zneg_mask nan_mask = (zmask & (x == 0)).ravel() - neginf_mask = ((zpos_mask & (x < 0)) | (zneg_mask & (x > 0))).ravel() - posinf_mask = ((zpos_mask & (x > 0)) | (zneg_mask & (x < 0))).ravel() + with np.errstate(invalid="ignore"): + neginf_mask = ((zpos_mask & (x < 0)) | (zneg_mask & (x > 0))).ravel() + posinf_mask = ((zpos_mask & (x > 0)) | (zneg_mask & (x < 0))).ravel() if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN - result = result.astype("float64", copy=copy).ravel() + result = result.astype("float64", copy=False).ravel() np.putmask(result, nan_mask, np.nan) np.putmask(result, posinf_mask, np.inf) @@ -157,36 +164,45 @@ def dispatch_missing(op, left, right, result): ------- result : ndarray """ - opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__") if op is operator.floordiv: # Note: no need to do this for truediv; in py3 numpy behaves the way # we want. result = mask_zero_div_zero(left, right, result) elif op is operator.mod: - result = fill_zeros(result, left, right, opstr, np.nan) + result = fill_zeros(result, left, right, "__mod__", np.nan) elif op is divmod: res0 = mask_zero_div_zero(left, right, result[0]) - res1 = fill_zeros(result[1], left, right, opstr, np.nan) + res1 = fill_zeros(result[1], left, right, "__divmod__", np.nan) result = (res0, res1) return result # FIXME: de-duplicate with dispatch_missing -def dispatch_fill_zeros(op, left, right, result, fill_value): +def dispatch_fill_zeros(op, left, right, result): """ Call fill_zeros with the appropriate fill value depending on the operation, with special logic for divmod and rdivmod. """ if op is divmod: result = ( - fill_zeros(result[0], left, right, "__floordiv__", np.inf), + mask_zero_div_zero(left, right, result[0]), fill_zeros(result[1], left, right, "__mod__", np.nan), ) elif op is rdivmod: result = ( - fill_zeros(result[0], left, right, "__rfloordiv__", np.inf), + mask_zero_div_zero(right, left, result[0]), fill_zeros(result[1], left, right, "__rmod__", np.nan), ) - else: - result = fill_zeros(result, left, right, op.__name__, fill_value) + elif op is operator.floordiv: + # Note: no need to do this for truediv; in py3 numpy behaves the way + # we want. + result = mask_zero_div_zero(left, right, result) + elif op is op is rfloordiv: + # Note: no need to do this for rtruediv; in py3 numpy behaves the way + # we want. 
+ result = mask_zero_div_zero(right, left, result) + elif op is operator.mod: + result = fill_zeros(result, left, right, "__mod__", np.nan) + elif op is rmod: + result = fill_zeros(result, left, right, "__rmod__", np.nan) return result diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 1fbecbab469e40..2b23790e4ccd32 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -331,7 +331,12 @@ def test_ser_divmod_zero(self, dtype1, any_real_dtype): left = pd.Series([1, 1]).astype(dtype1) right = pd.Series([0, 2]).astype(dtype2) + # GH#27321 pandas convention is to set 1 // 0 to np.inf, as opposed + # to numpy which sets to np.nan; patch `expected[0]` below expected = left // right, left % right + expected = list(expected) + expected[0] = expected[0].astype(np.float64) + expected[0][0] = np.inf result = divmod(left, right) tm.assert_series_equal(result[0], expected[0]) @@ -881,17 +886,16 @@ def check(series, other): _check_op(series, other, operator.pow, pos_only=True) - _check_op(series, other, lambda x, y: operator.add(y, x)) - _check_op(series, other, lambda x, y: operator.sub(y, x)) - _check_op(series, other, lambda x, y: operator.truediv(y, x)) - _check_op(series, other, lambda x, y: operator.floordiv(y, x)) - _check_op(series, other, lambda x, y: operator.mul(y, x)) - _check_op(series, other, lambda x, y: operator.pow(y, x), pos_only=True) - _check_op(series, other, lambda x, y: operator.mod(y, x)) + _check_op(series, other, ops.radd) + _check_op(series, other, ops.rsub) + _check_op(series, other, ops.rtruediv) + _check_op(series, other, ops.rfloordiv) + _check_op(series, other, ops.rmul) + _check_op(series, other, ops.rpow, pos_only=True) + _check_op(series, other, ops.rmod) tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) - check(tser, tser * 0) check(tser, tser[::2]) check(tser, 5) @@ -931,13 +935,9 @@ def check(series, other): tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) - check(tser, tser * 0) check(tser, tser[::2]) check(tser, 5) - @pytest.mark.xfail( - reason="Series division does not yet fill 1/0 consistently; Index does." - ) def test_series_divmod_zero(self): # Check that divmod uses pandas convention for division by zero, # which does not match numpy. 
@@ -950,8 +950,8 @@ def test_series_divmod_zero(self): other = tser * 0 result = divmod(tser, other) - exp1 = pd.Series([np.inf] * len(tser), index=tser.index) - exp2 = pd.Series([np.nan] * len(tser), index=tser.index) + exp1 = pd.Series([np.inf] * len(tser), index=tser.index, name="ts") + exp2 = pd.Series([np.nan] * len(tser), index=tser.index, name="ts") tm.assert_series_equal(result[0], exp1) tm.assert_series_equal(result[1], exp2) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 0f8f3d261c3b36..57e5a35d99e482 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -49,6 +49,12 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op): else: expected = op(a_dense, b_dense) + if op in [operator.floordiv, ops.rfloordiv]: + # Series sets 1//0 to np.inf, which SparseArray does not do (yet) + mask = np.isinf(expected) + if mask.any(): + expected[mask] = np.nan + self._assert(result, expected) def _check_bool_result(self, res): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index dfdb08fa78cbc9..8fbfb4c12f4b25 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -179,6 +179,10 @@ def _check_op_float(self, result, expected, mask, s, op_name, other): # check comparisons that are resulting in float dtypes expected[mask] = np.nan + if "floordiv" in op_name: + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + mask2 = np.isinf(expected) & np.isnan(result) + expected[mask2] = np.nan tm.assert_series_equal(result, expected) def _check_op_integer(self, result, expected, mask, s, op_name, other): diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 64c81a8c109856..5682c74a8b692b 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1,4 +1,5 @@ import operator +from types import LambdaType import numpy as np from numpy import nan @@ -9,6 +10,7 @@ import pandas as pd from pandas import DataFrame, Series, bdate_range, compat +from pandas.core import ops from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.sparse import frame as spf from pandas.core.sparse.api import ( @@ -424,6 +426,13 @@ def _compare_to_dense(a, b, da, db, op): sparse_result = op(a, b) dense_result = op(da, db) + # catch lambdas but not non-lambdas e.g. 
operator.add + if op in [operator.floordiv, ops.rfloordiv] or isinstance(op, LambdaType): + # GH#27231 Series sets 1//0 to np.inf, which SparseArray + # does not do (yet) + mask = np.isinf(dense_result) & ~np.isinf(sparse_result.to_dense()) + dense_result[mask] = np.nan + fill = sparse_result.default_fill_value dense_result = dense_result.to_sparse(fill_value=fill) tm.assert_sp_frame_equal(sparse_result, dense_result, exact_indices=False) @@ -436,7 +445,6 @@ def _compare_to_dense(a, b, da, db, op): ) opnames = ["add", "sub", "mul", "truediv", "floordiv"] - ops = [getattr(operator, name) for name in opnames] fidx = frame.index @@ -466,6 +474,7 @@ def _compare_to_dense(a, b, da, db, op): f = lambda a, b: getattr(a, op)(b, axis="index") _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f) + # FIXME: dont leave commented-out # rops are not implemented # _compare_to_dense(s, frame, s.to_dense(), # frame.to_dense(), f) @@ -479,13 +488,14 @@ def _compare_to_dense(a, b, da, db, op): frame.xs(fidx[5])[:2], ] - for op in ops: + for name in opnames: + op = getattr(operator, name) for s in series: _compare_to_dense(frame, s, frame.to_dense(), s, op) _compare_to_dense(s, frame, s, frame.to_dense(), op) # it works! - result = frame + frame.loc[:, ["A", "B"]] # noqa + frame + frame.loc[:, ["A", "B"]] def test_op_corners(self, float_frame, empty_frame): empty = empty_frame + empty_frame diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index ad4c898b004ac4..fb668f3d0e76d6 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -12,6 +12,7 @@ import pandas as pd from pandas import DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna +from pandas.core import ops from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf from pandas.tests.series.test_api import SharedWithSparse @@ -563,6 +564,10 @@ def _check_op(a, b, op): adense = a.to_dense() if isinstance(a, SparseSeries) else a bdense = b.to_dense() if isinstance(b, SparseSeries) else b dense_result = op(adense, bdense) + if "floordiv" in op.__name__: + # Series sets 1//0 to np.inf, which SparseSeries does not do (yet) + mask = np.isinf(dense_result) + dense_result[mask] = np.nan tm.assert_almost_equal(sp_result.to_dense(), dense_result) def check(a, b): @@ -572,11 +577,11 @@ def check(a, b): _check_op(a, b, operator.floordiv) _check_op(a, b, operator.mul) - _check_op(a, b, lambda x, y: operator.add(y, x)) - _check_op(a, b, lambda x, y: operator.sub(y, x)) - _check_op(a, b, lambda x, y: operator.truediv(y, x)) - _check_op(a, b, lambda x, y: operator.floordiv(y, x)) - _check_op(a, b, lambda x, y: operator.mul(y, x)) + _check_op(a, b, ops.radd) + _check_op(a, b, ops.rsub) + _check_op(a, b, ops.rtruediv) + _check_op(a, b, ops.rfloordiv) + _check_op(a, b, ops.rmul) # FIXME: don't leave commented-out # NaN ** 0 = 1 in C? 
From 8887b1e7ed7cdd8d7a44d0eef621b9f3a1b5f14f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Jul 2019 09:28:01 -0700 Subject: [PATCH 207/238] CLN: requested follow-ups (#27332) --- pandas/core/generic.py | 24 -------- pandas/core/indexers.py | 14 ++++- pandas/core/indexing.py | 73 ++++++++++------------- pandas/core/internals/blocks.py | 6 +- pandas/io/pytables.py | 1 + pandas/tests/frame/test_indexing.py | 1 + pandas/tests/series/test_operators.py | 5 +- pandas/tests/sparse/series/test_series.py | 2 +- 8 files changed, 51 insertions(+), 75 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4e05dfca43e786..0e2253aed1c88c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -274,7 +274,6 @@ def _setup_axes( info_axis=None, stat_axis=None, aliases=None, - slicers=None, axes_are_reversed=False, build_axes=True, ns=None, @@ -288,7 +287,6 @@ def _setup_axes( info_axis_num : the axis of the selector dimension (int) stat_axis_num : the number of axis for the default stats (int) aliases : other names for a single axis (dict) - slicers : how axes slice to others (dict) axes_are_reversed : boolean whether to treat passed axes as reversed (DataFrame) build_axes : setup the axis properties (default True) @@ -300,7 +298,6 @@ def _setup_axes( cls._AXIS_ALIASES = aliases or dict() cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()} cls._AXIS_NAMES = dict(enumerate(axes)) - cls._AXIS_SLICEMAP = slicers or None cls._AXIS_REVERSED = axes_are_reversed # typ @@ -347,15 +344,6 @@ def _construct_axes_dict_from(self, axes, **kwargs): d.update(kwargs) return d - def _construct_axes_dict_for_slice(self, axes=None, **kwargs): - """Return an axes dictionary for myself.""" - d = { - self._AXIS_SLICEMAP[a]: self._get_axis(a) - for a in (axes or self._AXIS_ORDERS) - } - d.update(kwargs) - return d - def _construct_axes_from_arguments( self, args, kwargs, require_all=False, sentinel=None ): @@ -577,18 +565,6 @@ def _obj_with_exclusions(self): """ internal compat with SelectionMixin """ return self - def _expand_axes(self, key): - new_axes = [] - for k, ax in zip(key, self.axes): - if k not in ax: - if type(k) != ax.dtype.type: - ax = ax.astype("O") - new_axes.append(ax.insert(len(ax), k)) - else: - new_axes.append(ax) - - return new_axes - def set_axis(self, labels, axis=0, inplace=None): """ Assign desired index to given axis. diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 7b0030b91e4dc4..70c48e969172f5 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -36,9 +36,19 @@ def is_scalar_indexer(indexer, arr_value) -> bool: return False -def is_empty_indexer(indexer, arr_value) -> bool: - # return a boolean if we have an empty indexer +def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: + """ + Check if we have an empty indexer. 
+ + Parameters + ---------- + indexer : object + arr_value : np.ndarray + Returns + ------- + bool + """ if is_list_like(indexer) and not len(indexer): return True if arr_value.ndim == 1: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c31d6538ad2c30..01f338a021cec5 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -158,9 +158,7 @@ def _get_label(self, label, axis=None): return self.obj._xs(label, axis=axis) - def _get_loc(self, key, axis=None): - if axis is None: - axis = self.axis + def _get_loc(self, key, axis: int): return self.obj._ixs(key, axis=axis) def _slice(self, obj, axis=None, kind=None): @@ -172,11 +170,11 @@ def _get_setitem_indexer(self, key): if self.axis is not None: return self._convert_tuple(key, is_setter=True) - axis = self.obj._get_axis(0) + ax = self.obj._get_axis(0) - if isinstance(axis, MultiIndex) and self.name != "iloc": + if isinstance(ax, MultiIndex) and self.name != "iloc": try: - return axis.get_loc(key) + return ax.get_loc(key) except Exception: pass @@ -189,8 +187,9 @@ def _get_setitem_indexer(self, key): if isinstance(key, range): return self._convert_range(key, is_setter=True) + axis = self.axis or 0 try: - return self._convert_to_indexer(key, is_setter=True) + return self._convert_to_indexer(key, axis=axis, is_setter=True) except TypeError as e: # invalid indexer type vs 'other' indexing errors @@ -206,7 +205,7 @@ def __setitem__(self, key, value): indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) - def _validate_key(self, key, axis): + def _validate_key(self, key, axis: int): """ Ensure that key is valid for current indexer. @@ -214,7 +213,6 @@ def _validate_key(self, key, axis): ---------- key : scalar, slice or list-like The key requested - axis : int Dimension on which the indexing is being made @@ -222,14 +220,12 @@ def _validate_key(self, key, axis): ------ TypeError If the key (or some element of it) has wrong type - IndexError If the key (or some element of it) is out of bounds - KeyError If the key was not found """ - raise AbstractMethodError() + raise AbstractMethodError(self) def _has_valid_tuple(self, key): """ check the key for valid keys across my indexer """ @@ -249,7 +245,7 @@ def _is_nested_tuple_indexer(self, tup): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False - def _convert_tuple(self, key, is_setter=False): + def _convert_tuple(self, key, is_setter: bool = False): keyidx = [] if self.axis is not None: axis = self.obj._get_axis_number(self.axis) @@ -268,19 +264,17 @@ def _convert_tuple(self, key, is_setter=False): keyidx.append(idx) return tuple(keyidx) - def _convert_range(self, key, is_setter=False): + def _convert_range(self, key, is_setter: bool = False): """ convert a range argument """ return list(key) - def _convert_scalar_indexer(self, key, axis): + def _convert_scalar_indexer(self, key, axis: int): # if we are accessing via lowered dim, use the last dim - if axis is None: - axis = 0 ax = self.obj._get_axis(min(axis, self.ndim - 1)) # a scalar return ax._convert_scalar_indexer(key, kind=self.name) - def _convert_slice_indexer(self, key, axis): + def _convert_slice_indexer(self, key, axis: int): # if we are accessing via lowered dim, use the last dim ax = self.obj._get_axis(min(axis, self.ndim - 1)) return ax._convert_slice_indexer(key, kind=self.name) @@ -883,7 +877,7 @@ def _multi_take(self, tup): } return o._reindex_with_indexers(d, copy=True, allow_dups=True) - def _convert_for_reindex(self, key, axis=None): + def 
_convert_for_reindex(self, key, axis: int): return key def _handle_lowerdim_multi_index_axis0(self, tup): @@ -1055,7 +1049,7 @@ def _getitem_axis(self, key, axis=None): return self._get_label(key, axis=axis) - def _get_listlike_indexer(self, key, axis, raise_missing=False): + def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): """ Transform a list-like of keys into a new index and an indexer. @@ -1151,7 +1145,9 @@ def _getitem_iterable(self, key, axis: int): {axis: [keyarr, indexer]}, copy=True, allow_dups=True ) - def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): + def _validate_read_indexer( + self, key, indexer, axis: int, raise_missing: bool = False + ): """ Check that indexer can be used to return a result (e.g. at least one element was found, unless the list of keys was actually empty). @@ -1216,7 +1212,9 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) - def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=False): + def _convert_to_indexer( + self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False + ): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1231,9 +1229,6 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=Fal raise AmbiguousIndexError with integer labels? - No, prefer label-based indexing """ - if axis is None: - axis = self.axis or 0 - labels = self.obj._get_axis(axis) if isinstance(obj, slice): @@ -1362,7 +1357,7 @@ def __init__(self, name, obj): super().__init__(name, obj) @Appender(_NDFrameIndexer._validate_key.__doc__) - def _validate_key(self, key, axis): + def _validate_key(self, key, axis: int): if isinstance(key, slice): return True @@ -1378,7 +1373,7 @@ def _validate_key(self, key, axis): return True - def _convert_for_reindex(self, key, axis=None): + def _convert_for_reindex(self, key, axis: int): """ Transform a list of keys into a new array ready to be used as axis of the object we return (e.g. including NaNs). 
@@ -1394,9 +1389,6 @@ def _convert_for_reindex(self, key, axis=None): ------- list-like of labels """ - - if axis is None: - axis = self.axis or 0 labels = self.obj._get_axis(axis) if com.is_bool_indexer(key): @@ -1726,7 +1718,7 @@ class _LocIndexer(_LocationIndexer): _exception = KeyError @Appender(_NDFrameIndexer._validate_key.__doc__) - def _validate_key(self, key, axis): + def _validate_key(self, key, axis: int): # valid for a collection of labels (we check their presence later) # slice of labels (where start-end in labels) @@ -2006,7 +1998,7 @@ class _iLocIndexer(_LocationIndexer): _exception = IndexError _get_slice_axis = _NDFrameIndexer._get_slice_axis - def _validate_key(self, key, axis): + def _validate_key(self, key, axis: int): if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": @@ -2132,7 +2124,7 @@ def _getitem_tuple(self, tup): return retval - def _get_list_axis(self, key, axis=None): + def _get_list_axis(self, key, axis: int): """ Return Series values by list or array of integers @@ -2145,8 +2137,6 @@ def _get_list_axis(self, key, axis=None): ------- Series object """ - if axis is None: - axis = self.axis or 0 try: return self.obj._take(key, axis=axis) except IndexError: @@ -2184,10 +2174,11 @@ def _getitem_axis(self, key, axis=None): return self._get_loc(key, axis=axis) - def _convert_to_indexer(self, obj, axis=None, is_setter=False): + # raise_missing is included for compat with the parent class signature + def _convert_to_indexer( + self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False + ): """ much simpler as we only have to deal with our valid types """ - if axis is None: - axis = self.axis or 0 # make need to convert a float key if isinstance(obj, slice): @@ -2209,7 +2200,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): class _ScalarAccessIndexer(_NDFrameIndexer): """ access scalars quickly """ - def _convert_key(self, key, is_setter=False): + def _convert_key(self, key, is_setter: bool = False): return list(key) def __getitem__(self, key): @@ -2289,7 +2280,7 @@ class _AtIndexer(_ScalarAccessIndexer): _takeable = False - def _convert_key(self, key, is_setter=False): + def _convert_key(self, key, is_setter: bool = False): """ require they keys to be the same type as the index (so we don't fallback) """ @@ -2366,7 +2357,7 @@ class _iAtIndexer(_ScalarAccessIndexer): def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) - def _convert_key(self, key, is_setter=False): + def _convert_key(self, key, is_setter: bool = False): """ require integer args (and convert to label arguments) """ for a, i in zip(self.obj.axes, key): if not is_integer(i): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1e84437f5c2fc0..f931df25c4fd53 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2249,9 +2249,9 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return tipo == _NS_DTYPE or tipo == np.int64 - if isinstance(element, datetime): + elif isinstance(element, datetime): return element.tzinfo is None - if is_integer(element): + elif is_integer(element): return element == tslibs.iNaT # TODO: shouldnt we exclude timedelta64("NaT")? 
See GH#27297 @@ -2607,7 +2607,7 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.timedelta64, np.int64)) - if element is NaT: + elif element is NaT: return True return is_integer(element) or isinstance( element, (timedelta, np.timedelta64, np.int64) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9206463e18fb31..1db177d792401b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3974,6 +3974,7 @@ def process_filter(field, filt): for axis_name in obj._AXIS_NAMES.values(): axis_number = obj._get_axis_number(axis_name) axis_values = obj._get_axis(axis_name) + assert axis_number is not None # see if the field is the name of an axis if field == axis_name: diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 3c102f49c6cbf7..ae24ad65d2c56e 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -1680,6 +1680,7 @@ def test_setitem_single_column_mixed_datetime(self): df.loc["d", :] = np.nan assert not isna(df.loc["c", :]).all() + # FIXME: don't leave commented-out # as of GH 3216 this will now work! # try to set with a list like item # pytest.raises( diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 0c25df79974699..062c07cb6242aa 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -692,10 +692,7 @@ def test_operators_corner(self): ) tm.assert_series_equal(added[:-5], expected) - pairings = [ - (Series.div, operator.truediv, 1), - (Series.rdiv, lambda x, y: operator.truediv(y, x), 1), - ] + pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: fv = 0 lop = getattr(Series, op) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index fb668f3d0e76d6..a9c3d157dd69b3 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -586,7 +586,7 @@ def check(a, b): # FIXME: don't leave commented-out # NaN ** 0 = 1 in C? 
# _check_op(a, b, operator.pow) - # _check_op(a, b, lambda x, y: operator.pow(y, x)) + # _check_op(a, b, ops.rpow) check(self.bseries, self.bseries) check(self.iseries, self.iseries) From 0cefff07d553bae5e5a36c60b5509d151917978c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Jul 2019 09:39:33 -0700 Subject: [PATCH 208/238] BUG: fix inserting tz-aware datetime to Series, closes #12862 (#27322) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/series.py | 7 ++++++ pandas/tests/indexing/test_loc.py | 22 ++++++++++++++++++- pandas/tests/series/indexing/test_indexing.py | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ebe8b4770f6aa2..2e7d5e95349be8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1053,6 +1053,7 @@ Indexing - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` on a :class:`DataFrame` with a single timezone-aware datetime64[ns] column incorrectly returning a scalar instead of a :class:`Series` (:issue:`27110`) - Bug in :class:`CategoricalIndex` and :class:`Categorical` incorrectly raising ``ValueError`` instead of ``TypeError`` when a list is passed using the ``in`` operator (``__contains__``) (:issue:`21729`) - Bug in setting a new value in a :class:`Series` with a :class:`Timedelta` object incorrectly casting the value to an integer (:issue:`22717`) +- Bug in :class:`Series` setting a new key (``__setitem__``) with a timezone-aware datetime incorrectly raising ``ValueError`` (:issue:`12862`) - Missing diff --git a/pandas/core/series.py b/pandas/core/series.py index 4b78907e661067..acb08269535083 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1268,6 +1268,13 @@ def _set_with(self, key, value): except Exception: pass + if is_scalar(key) and not is_integer(key) and key not in self.index: + # GH#12862 adding an new key to the Series + # Note: have to exclude integers because that is ambiguously + # position-based + self.loc[key] = value + return + if is_scalar(key): key = [key] elif not isinstance(key, (list, Series, np.ndarray)): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d749e697c8282b..90d1b0b1e01983 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -802,7 +802,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" - def test_loc_coerceion(self): + def test_loc_coercion(self): # 12411 df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) @@ -838,6 +838,26 @@ def test_loc_coerceion(self): result = df.iloc[3:] tm.assert_series_equal(result.dtypes, expected) + def test_setitem_new_key_tz(self): + # GH#12862 should not raise on assigning the second value + vals = [ + pd.to_datetime(42).tz_localize("UTC"), + pd.to_datetime(666).tz_localize("UTC"), + ] + expected = pd.Series(vals, index=["foo", "bar"]) + + ser = pd.Series() + ser["foo"] = vals[0] + ser["bar"] = vals[1] + + tm.assert_series_equal(ser, expected) + + ser = pd.Series() + ser.loc["foo"] = vals[0] + ser.loc["bar"] = vals[1] + + tm.assert_series_equal(ser, expected) + def test_loc_non_unique(self): # GH3659 # non-unique indexer with loc slice diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 1fb1dd3bb998af..6ff878f07da84b 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ 
-523,7 +523,7 @@ def test_setitem_with_tz_dst(): tm.assert_series_equal(s, exp) -def test_categorial_assigning_ops(): +def test_categorical_assigning_ops(): orig = Series(Categorical(["b", "b"], categories=["a", "b"])) s = orig.copy() s[:] = "a" From d3e84b7ce8ab10361fce2d5d6184328bc243d9d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Thu, 11 Jul 2019 19:44:37 +0300 Subject: [PATCH 209/238] ENH: Raise ValueError for unsupported Window functions (#27275) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/base.py | 11 +++++++++-- pandas/tests/window/test_window.py | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2e7d5e95349be8..72b71404365700 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1138,6 +1138,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) - Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) - Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) +- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index 15baf1bed0ecdf..9480e2e425f79a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -314,9 +314,16 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): f = getattr(np, arg, None) if f is not None: - return f(self, *args, **kwargs) + try: + return f(self, *args, **kwargs) + + except (AttributeError, TypeError): + pass - raise ValueError("{arg} is an unknown string function".format(arg=arg)) + raise AttributeError( + "'{arg}' is not a valid function for " + "'{cls}' object".format(arg=arg, cls=type(self).__name__) + ) def _aggregate(self, arg, *args, **kwargs): """ diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 2f3b83e172795f..3945a8aaa8b87d 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -439,6 +439,22 @@ def test_numpy_compat(self, method): with pytest.raises(UnsupportedFunctionCall, match=msg): getattr(w, method)(dtype=np.float64) + @td.skip_if_no_scipy + @pytest.mark.parametrize("arg", ["median", "var", "std", "kurt", "skew"]) + def test_agg_function_support(self, arg): + df = pd.DataFrame({"A": np.arange(5)}) + roll = df.rolling(2, win_type="triang") + + msg = "'{arg}' is not a valid function for " "'Window' object".format(arg=arg) + with pytest.raises(AttributeError, match=msg): + roll.agg(arg) + + with pytest.raises(AttributeError, match=msg): + roll.agg([arg]) + + with pytest.raises(AttributeError, match=msg): + roll.agg({"A": arg}) + class TestRolling(Base): def setup_method(self, method): From eeff07f9a0103bc643c0402a7eb7c3e27c33e749 Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Thu, 11 Jul 2019 15:14:49 -0400 Subject: [PATCH 210/238] Accept empty dataframes in DataFrame.to_parquet 
(#27341) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/parquet.py | 2 +- pandas/tests/io/test_parquet.py | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 72b71404365700..eeaafd7ad7d51a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1102,6 +1102,7 @@ I/O - Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. `PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) - Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) +- Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`) Plotting ^^^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 3db05b94e5dce4..a2502df45169f1 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -55,7 +55,7 @@ def validate_dataframe(df): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names (strings only) - if df.columns.inferred_type not in {"string", "unicode"}: + if df.columns.inferred_type not in {"string", "unicode", "empty"}: raise ValueError("parquet must have string column names") # index level names must be strings diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6ac2e9cd65a271..a04fb9fd502577 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -471,6 +471,11 @@ def test_partition_cols_supported(self, pa, df_full): assert len(dataset.partitions.partition_names) == 2 assert dataset.partitions.partition_names == set(partition_cols) + def test_empty_dataframe(self, pa): + # GH #27339 + df = pd.DataFrame() + check_round_trip(df, pa) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.2.1") @@ -566,3 +571,10 @@ def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full): partition_on=partition_cols, partition_cols=partition_cols, ) + + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) From 76cca0ee5a9ad6d91b7c5c39bc8ce5917dac00b6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Jul 2019 13:15:03 -0700 Subject: [PATCH 211/238] BUG: fix+test assigning invalid NAT-like to DTA/TDA/PA (#27331) --- pandas/core/arrays/datetimelike.py | 7 +++- pandas/core/dtypes/missing.py | 24 ++++++++++++ pandas/tests/arrays/test_datetimelike.py | 48 ++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 540442b7eaed40..df173888561171 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -36,7 +36,7 @@ ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas._typing import DatetimeLikeScalar from pandas.core import missing, nanops @@ -492,7 +492,10 @@ def __setitem__( elif isinstance(value, self._scalar_type): 
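# A minimal illustrative sketch (not taken from the patches themselves) of the
# DataFrame.to_parquet change earlier in this series (GH 27339): an empty
# DataFrame, whose columns.inferred_type is "empty", is now accepted instead of
# raising "parquet must have string column names".  Assumes pyarrow is
# installed; the file name is arbitrary.
import pandas as pd

df = pd.DataFrame()
df.to_parquet("empty.parquet")            # previously raised ValueError
result = pd.read_parquet("empty.parquet")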
self._check_compatible_with(value) value = self._unbox_scalar(value) - elif isna(value) or value == iNaT: + elif is_valid_nat_for_dtype(value, self.dtype): + value = iNaT + elif not isna(value) and lib.is_integer(value) and value == iNaT: + # exclude misc e.g. object() and any NAs not allowed above value = iNaT else: msg = ( diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f540e9297738a3..6a681954fd9022 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -559,3 +559,27 @@ def remove_na_arraylike(arr): return arr[notna(arr)] else: return arr[notna(lib.values_from_object(arr))] + + +def is_valid_nat_for_dtype(obj, dtype): + """ + isna check that excludes incompatible dtypes + + Parameters + ---------- + obj : object + dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype + + Returns + ------- + bool + """ + if not isna(obj): + return False + if dtype.kind == "M": + return not isinstance(obj, np.timedelta64) + if dtype.kind == "m": + return not isinstance(obj, np.datetime64) + + # must be PeriodDType + return not isinstance(obj, (np.datetime64, np.timedelta64)) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 34fae1f4b1ab4d..d9646feaf661e1 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -651,3 +651,51 @@ def test_array_interface(self, period_index): result = np.asarray(arr, dtype="S20") expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "array,casting_nats", + [ + ( + pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data, + (pd.NaT, np.timedelta64("NaT", "ns")), + ), + ( + pd.date_range("2000-01-01", periods=3, freq="D")._data, + (pd.NaT, np.datetime64("NaT", "ns")), + ), + (pd.period_range("2000-01-01", periods=3, freq="D")._data, (pd.NaT,)), + ], + ids=lambda x: type(x).__name__, +) +def test_casting_nat_setitem_array(array, casting_nats): + expected = type(array)._from_sequence([pd.NaT, array[1], array[2]]) + + for nat in casting_nats: + arr = array.copy() + arr[0] = nat + tm.assert_equal(arr, expected) + + +@pytest.mark.parametrize( + "array,non_casting_nats", + [ + ( + pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data, + (np.datetime64("NaT", "ns"),), + ), + ( + pd.date_range("2000-01-01", periods=3, freq="D")._data, + (np.timedelta64("NaT", "ns"),), + ), + ( + pd.period_range("2000-01-01", periods=3, freq="D")._data, + (np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")), + ), + ], + ids=lambda x: type(x).__name__, +) +def test_invalid_nat_setitem_array(array, non_casting_nats): + for nat in non_casting_nats: + with pytest.raises(TypeError): + array[0] = nat From 5a7a8e1decf7154f396adf42860780b15221df4b Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Thu, 11 Jul 2019 21:00:59 +0000 Subject: [PATCH 212/238] CLN: remove Hypothesis warning during test (#27336) --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index ef2758d263e1aa..2cf7bf6a6df41c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -25,7 +25,6 @@ # if it really is slow add `@settings(deadline=...)` with a working value, # or `deadline=None` to entirely disable timeouts for that test. 
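# A minimal illustrative sketch (not part of the patch) of what the
# is_valid_nat_for_dtype check added in GH 27331 above enables: pd.NaT or a
# dtype-matching numpy NaT can be assigned into a DatetimeArray, while a
# mismatched NaT now raises TypeError.  Uses the private ._data accessor the
# new tests above use; assumes a pandas version containing this change.
import numpy as np
import pandas as pd

dta = pd.date_range("2000-01-01", periods=3)._data   # DatetimeArray
dta[0] = pd.NaT                                       # accepted
dta[0] = np.datetime64("NaT", "ns")                   # accepted, matching kind
try:
    dta[0] = np.timedelta64("NaT", "ns")              # wrong kind of NaT
except TypeError:
    pass                                              # expected after this fix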
deadline=500, - timeout=hypothesis.unlimited, suppress_health_check=(hypothesis.HealthCheck.too_slow,), ) hypothesis.settings.load_profile("ci") From baba98c9c20a8d0f6b930334765526a624ff19de Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jul 2019 15:12:49 +0100 Subject: [PATCH 213/238] add type annotations to io\formats\html.py (#27355) --- pandas/io/formats/format.py | 1 - pandas/io/formats/html.py | 98 ++++++++++++++++++++++--------------- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c4e3dd1c755cf2..0e8ed7b25d665a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -400,7 +400,6 @@ def _get_adjustment(): class TableFormatter: - is_truncated = False show_dimensions = None @property diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index e6aae44baa69b9..c2f4ee2c4a68b6 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -4,15 +4,20 @@ from collections import OrderedDict from textwrap import dedent +from typing import Dict, List, Optional, Tuple, Union from pandas._config import get_option -from pandas.core.dtypes.generic import ABCMultiIndex +from pandas.core.dtypes.generic import ABCIndex, ABCMultiIndex from pandas import option_context from pandas.io.common import _is_url -from pandas.io.formats.format import TableFormatter, get_level_lengths +from pandas.io.formats.format import ( + DataFrameFormatter, + TableFormatter, + get_level_lengths, +) from pandas.io.formats.printing import pprint_thing @@ -28,13 +33,18 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 - def __init__(self, formatter, classes=None, border=None): + def __init__( + self, + formatter: DataFrameFormatter, + classes: Optional[Union[str, List, Tuple]] = None, + border: Optional[bool] = None, + ) -> None: self.fmt = formatter self.classes = classes self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns - self.elements = [] + self.elements = [] # type: List[str] self.bold_rows = self.fmt.kwds.get("bold_rows", False) self.escape = self.fmt.kwds.get("escape", True) self.show_dimensions = self.fmt.show_dimensions @@ -47,15 +57,15 @@ def __init__(self, formatter, classes=None, border=None): self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space) @property - def show_row_idx_names(self): + def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names @property - def show_col_idx_names(self): + def show_col_idx_names(self) -> bool: return self.fmt.show_col_idx_names @property - def row_levels(self): + def row_levels(self) -> int: if self.fmt.index: # showing (row) index return self.frame.index.nlevels @@ -69,22 +79,24 @@ def row_levels(self): # not showing (row) index return 0 - def _get_columns_formatted_values(self): + def _get_columns_formatted_values(self) -> ABCIndex: return self.columns @property - def is_truncated(self): + def is_truncated(self) -> bool: return self.fmt.is_truncated @property - def ncols(self): + def ncols(self) -> int: return len(self.fmt.tr_frame.columns) - def write(self, s, indent=0): + def write(self, s: str, indent: int = 0) -> None: rs = pprint_thing(s) self.elements.append(" " * indent + rs) - def write_th(self, s, header=False, indent=0, tags=None): + def write_th( + self, s: str, header: bool = False, indent: int = 0, tags: Optional[str] = None + ) -> None: """ Method for writting a formatted cell. 
@@ -111,12 +123,14 @@ def write_th(self, s, header=False, indent=0, tags=None): tags = tags or "" tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space) - return self._write_cell(s, kind="th", indent=indent, tags=tags) + self._write_cell(s, kind="th", indent=indent, tags=tags) - def write_td(self, s, indent=0, tags=None): - return self._write_cell(s, kind="td", indent=indent, tags=tags) + def write_td(self, s: str, indent: int = 0, tags: Optional[str] = None) -> None: + self._write_cell(s, kind="td", indent=indent, tags=tags) - def _write_cell(self, s, kind="td", indent=0, tags=None): + def _write_cell( + self, s: str, kind: str = "td", indent: int = 0, tags: Optional[str] = None + ) -> None: if tags is not None: start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags) else: @@ -124,7 +138,9 @@ def _write_cell(self, s, kind="td", indent=0, tags=None): if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) + esc = OrderedDict( + [("&", r"&"), ("<", r"<"), (">", r">")] + ) # type: Union[OrderedDict[str, str], Dict] else: esc = {} @@ -146,14 +162,14 @@ def _write_cell(self, s, kind="td", indent=0, tags=None): def write_tr( self, - line, - indent=0, - indent_delta=0, - header=False, - align=None, - tags=None, - nindex_levels=0, - ): + line: List[str], + indent: int = 0, + indent_delta: int = 0, + header: bool = False, + align: Optional[str] = None, + tags: Optional[Dict[int, str]] = None, + nindex_levels: int = 0, + ) -> None: if tags is None: tags = {} @@ -173,7 +189,7 @@ def write_tr( indent -= indent_delta self.write("", indent) - def render(self): + def render(self) -> List[str]: self._write_table() if self.should_show_dimensions: @@ -186,7 +202,7 @@ def render(self): return self.elements - def _write_table(self, indent=0): + def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. 
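# A minimal illustrative sketch (hypothetical class, not from the patch) of the
# annotation style GH 27355 applies throughout html.py: signatures get real
# annotations, while instance attributes use "type comment" annotations,
# presumably to stay compatible with Python 3.5, which lacks variable
# annotations.
from typing import List, Optional

class SketchWriter:
    def __init__(self, classes: Optional[List[str]] = None) -> None:
        self.classes = classes
        self.elements = []  # type: List[str]

    def write(self, s: str, indent: int = 0) -> None:
        self.elements.append(" " * indent + s)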
use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: @@ -220,7 +236,7 @@ def _write_table(self, indent=0): self.write("", indent) - def _write_col_header(self, indent): + def _write_col_header(self, indent: int) -> None: truncate_h = self.fmt.truncate_h if isinstance(self.columns, ABCMultiIndex): template = 'colspan="{span:d}" halign="left"' @@ -337,14 +353,14 @@ def _write_col_header(self, indent): self.write_tr(row, indent, self.indent_delta, header=True, align=align) - def _write_row_header(self, indent): + def _write_row_header(self, indent: int) -> None: truncate_h = self.fmt.truncate_h row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( self.ncols + (1 if truncate_h else 0) ) self.write_tr(row, indent, self.indent_delta, header=True) - def _write_header(self, indent): + def _write_header(self, indent: int) -> None: self.write("", indent) if self.fmt.header: @@ -355,12 +371,12 @@ def _write_header(self, indent): self.write("", indent) - def _get_formatted_values(self): + def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", 999999): fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} return fmt_values - def _write_body(self, indent): + def _write_body(self, indent: int) -> None: self.write("", indent) fmt_values = self._get_formatted_values() @@ -372,7 +388,9 @@ def _write_body(self, indent): self.write("", indent) - def _write_regular_rows(self, fmt_values, indent): + def _write_regular_rows( + self, fmt_values: Dict[int, List[str]], indent: int + ) -> None: truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v @@ -385,7 +403,7 @@ def _write_regular_rows(self, fmt_values, indent): else: index_values = self.fmt.tr_frame.index.format() - row = [] + row = [] # type: List[str] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): @@ -416,7 +434,9 @@ def _write_regular_rows(self, fmt_values, indent): row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels ) - def _write_hierarchical_rows(self, fmt_values, indent): + def _write_hierarchical_rows( + self, fmt_values: Dict[int, List[str]], indent: int + ) -> None: template = 'rowspan="{span}" valign="top"' truncate_h = self.fmt.truncate_h @@ -546,13 +566,13 @@ class NotebookFormatter(HTMLFormatter): DataFrame._repr_html_() and DataFrame.to_html(notebook=True) """ - def _get_formatted_values(self): + def _get_formatted_values(self) -> Dict[int, List[str]]: return {i: self.fmt._format_col(i) for i in range(self.ncols)} - def _get_columns_formatted_values(self): + def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() - def write_style(self): + def write_style(self) -> None: # We use the "scoped" attribute here so that the desired # style properties for the data frame are not then applied # throughout the entire notebook. @@ -580,7 +600,7 @@ def write_style(self): template = dedent("\n".join((template_first, template_mid, template_last))) self.write(template) - def render(self): + def render(self) -> List[str]: self.write("
") self.write_style() super().render() From 45368531e3d0387fddd890ee8bdaf3bac2b08963 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jul 2019 15:14:34 +0100 Subject: [PATCH 214/238] TST/CLN: Add message checks to raises KeyError tests (#27354) --- .../tests/frame/test_axis_select_reindex.py | 6 ++- pandas/tests/frame/test_duplicates.py | 7 ++- pandas/tests/frame/test_indexing.py | 26 +++++++--- pandas/tests/frame/test_mutate_columns.py | 10 ++-- pandas/tests/generic/test_generic.py | 4 +- pandas/tests/groupby/test_timegrouper.py | 2 +- .../tests/indexes/datetimes/test_indexing.py | 8 +-- .../tests/indexes/interval/test_interval.py | 11 +++- .../indexes/interval/test_interval_new.py | 51 ++++++++++++++++--- .../indexes/interval/test_interval_tree.py | 12 +++-- .../indexes/multi/test_partial_indexing.py | 4 +- pandas/tests/indexes/multi/test_sorting.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/indexes/test_category.py | 6 +-- pandas/tests/indexes/test_common.py | 4 +- pandas/tests/indexes/test_range.py | 6 +-- .../tests/indexing/interval/test_interval.py | 18 +++---- .../indexing/interval/test_interval_new.py | 40 +++++++++------ pandas/tests/indexing/test_categorical.py | 6 ++- pandas/tests/indexing/test_indexing.py | 22 +++++--- pandas/tests/indexing/test_loc.py | 8 ++- pandas/tests/indexing/test_scalar.py | 8 +-- pandas/tests/io/excel/test_writers.py | 4 +- pandas/tests/io/pytables/test_pytables.py | 26 ++++++---- 24 files changed, 198 insertions(+), 95 deletions(-) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 77be952506964c..1ef10ea5857d05 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1,4 +1,5 @@ from datetime import datetime +import re import numpy as np import pytest @@ -1120,9 +1121,10 @@ def test_raise_on_drop_duplicate_index(self, actual): # issue 19186 level = 0 if isinstance(actual.index, MultiIndex) else None - with pytest.raises(KeyError): + msg = re.escape("\"['c'] not found in axis\"") + with pytest.raises(KeyError, match=msg): actual.drop("c", level=level, axis=0) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): actual.T.drop("c", level=level, axis=1) expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") assert_frame_equal(expected_no_err, actual) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 0ea24777ae1f55..d2a1fc43d20466 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -9,11 +11,12 @@ def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): df.duplicated(subset) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index ae24ad65d2c56e..0cb7db0e471236 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -1,4 +1,5 @@ from datetime import date, datetime, time, timedelta +import re from warnings import catch_warnings, simplefilter import numpy as np @@ -59,7 +60,7 @@ def test_getitem(self, float_frame): ad = 
np.random.randn(len(df)) df["@awesome_domain"] = ad - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("'df[\"$10\"]'")): df.__getitem__('df["$10"]') res = df["@awesome_domain"] @@ -67,7 +68,8 @@ def test_getitem(self, float_frame): def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - with pytest.raises(KeyError): + msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] def test_get(self, float_frame): @@ -446,14 +448,16 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index + msg = "\"None of [Int64Index([-1], dtype='int64')] are in the [index]\"" with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape(msg)): df.ix[[-1]] + msg = "\"None of [Int64Index([-1], dtype='int64')] are in the [columns]\"" with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape(msg)): df.ix[:, [-1]] # #1942 @@ -497,7 +501,11 @@ def test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - with pytest.raises(KeyError): + msg = ( + r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the" + r" \[columns\]\"" + ) + with pytest.raises(KeyError, match=msg): float_frame[np.random.randn(len(float_frame) + 1)] = 1 # set ndarray @@ -1885,10 +1893,10 @@ def test_lookup_bool(self): assert df["mask"].dtype == np.bool_ def test_lookup_raises(self, float_frame): - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'One or more row labels was not found'"): float_frame.lookup(["xyz"], ["A"]) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'One or more column labels was not found'"): float_frame.lookup([float_frame.index[0]], ["xyz"]) with pytest.raises(ValueError, match="same size"): @@ -2544,7 +2552,9 @@ def test_xs(self, float_frame, datetime_frame): assert xs["A"] == 1 assert xs["B"] == "1" - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00', freq='B')") + ): datetime_frame.xs(datetime_frame.index[0] - BDay()) # xs get column diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index ed9eeb594f7f67..7ad5abca82b290 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -88,9 +90,9 @@ def test_assign_dependent_old_python(self): df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # Key C does not exist at definition time of df - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^'C'$"): df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^'C'$"): df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) @pytest.mark.skipif( @@ -219,14 +221,14 @@ def test_delitem_multiindex(self): # A still in the levels, BUT get a KeyError if trying # to delete assert ("A",) not in df.columns - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("('A',)")): del df[("A",)] # behavior of dropped/deleted MultiIndex levels changed from # GH 2770 to GH 
19027: MultiIndex no longer '.__contains__' # levels which are dropped/deleted assert "A" not in df.columns - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("('A',)")): del df["A"] def test_pop(self, float_frame): diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index b2b38980d0ceb2..7b9e50ebbf3427 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -618,7 +618,9 @@ def test_sample(sel): df.sample(n=1, weights="weight_column", axis=1) # Check weighting key error - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'String passed to weights not a valid column'" + ): df.sample(n=3, weights="not_a_real_column_name") # Check that re-normalizes weights that don't sum to one. diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d201b887739ec9..e1e35d8eb7d18b 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -206,7 +206,7 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() assert_frame_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'The grouper name foo is not found'"): df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum() # passing the level diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 2a5bbdbb131edf..cd5efc86320c22 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -614,7 +614,7 @@ def test_get_loc(self): ) with pytest.raises(ValueError, match="unit abbreviation w/o a number"): idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2000-01-01T03'"): idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") with pytest.raises( ValueError, match="tolerance size must match target index size" @@ -634,12 +634,12 @@ def test_get_loc(self): assert idx.get_loc("1999", method="nearest") == 0 assert idx.get_loc("2001", method="nearest") == 2 - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'1999'"): idx.get_loc("1999", method="pad") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2001'"): idx.get_loc("2001", method="backfill") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'foobar'"): idx.get_loc("foobar") with pytest.raises(TypeError): idx.get_loc(slice(2)) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 962ed2b1cf8ed3..c61af1ce70aed0 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -445,7 +445,7 @@ def test_get_loc_length_one_scalar(self, scalar, closed): result = index.get_loc(scalar) assert result == 0 else: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(scalar)): index.get_loc(scalar) @pytest.mark.parametrize("other_closed", ["left", "right", "both", "neither"]) @@ -458,7 +458,14 @@ def test_get_loc_length_one_interval(self, left, right, closed, other_closed): result = index.get_loc(interval) assert result == 0 else: - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + "Interval({left}, {right}, closed='{other_closed}')".format( + left=left, right=right, 
other_closed=other_closed + ) + ), + ): index.get_loc(interval) # Make consistent with test_interval_new.py (see #16316, #16386) diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index ab9f7ef1c3e262..d92559d2e3e49b 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -15,16 +17,21 @@ def test_get_loc_interval(self, closed, side): for bound in [[0, 1], [1, 2], [2, 3], [3, 4], [0, 2], [2.5, 3], [-1, 4]]: # if get_loc is supplied an interval, it should only search # for exact matches, not overlaps or covers, else KeyError. + msg = re.escape( + "Interval({bound[0]}, {bound[1]}, closed='{side}')".format( + bound=bound, side=side + ) + ) if closed == side: if bound == [0, 1]: assert idx.get_loc(Interval(0, 1, closed=side)) == 0 elif bound == [2, 3]: assert idx.get_loc(Interval(2, 3, closed=side)) == 1 else: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): idx.get_loc(Interval(*bound, closed=side)) else: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): idx.get_loc(Interval(*bound, closed=side)) @pytest.mark.parametrize("scalar", [-0.5, 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]) @@ -81,18 +88,42 @@ def test_slice_locs_with_interval(self): # unsorted duplicates index = IntervalIndex.from_tuples([(0, 2), (2, 4), (0, 2)]) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + '"Cannot get left slice bound for non-unique label:' + " Interval(0, 2, closed='right')\"" + ), + ): index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + '"Cannot get left slice bound for non-unique label:' + " Interval(0, 2, closed='right')\"" + ), + ): index.slice_locs(start=Interval(0, 2)) assert index.slice_locs(end=Interval(2, 4)) == (0, 2) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + '"Cannot get right slice bound for non-unique label:' + " Interval(0, 2, closed='right')\"" + ), + ): index.slice_locs(end=Interval(0, 2)) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + '"Cannot get right slice bound for non-unique label:' + " Interval(0, 2, closed='right')\"" + ), + ): index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) # another unsorted duplicates @@ -139,7 +170,13 @@ def test_slice_locs_with_ints_and_floats_succeeds(self): def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): start, stop = query index = IntervalIndex.from_tuples(tuples) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=( + "'can only get slices from an IntervalIndex if bounds are" + " non-overlapping and all monotonic increasing or decreasing'" + ), + ): index.slice_locs(start, stop) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index b7104242b5ccc6..87f9eaa209277c 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -62,7 +62,7 @@ def test_get_loc(self, tree): expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="-1"): tree.get_loc(-1) def test_get_indexer(self, tree): @@ -70,7 +70,9 @@ def 
test_get_indexer(self, tree): expected = np.array([0, 4, -1], dtype="intp") tm.assert_numpy_array_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): tree.get_indexer(np.array([3.0])) def test_get_indexer_non_unique(self, tree): @@ -100,7 +102,9 @@ def test_duplicates(self, dtype): expected = np.array([0, 1, 2], dtype="intp") tm.assert_numpy_array_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): tree.get_indexer(np.array([0.5])) indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) @@ -116,7 +120,7 @@ def test_get_loc_closed(self, closed): tree = IntervalTree([0], [1], closed=closed) for p, errors in [(0, tree.open_left), (1, tree.open_right)]: if errors: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(p)): tree.get_loc(p) else: result = tree.get_loc(p) diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index d6799e86683a9e..5db1296d828ca0 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -54,7 +54,7 @@ def test_partial_string_timestamp_multiindex(): # ambiguous and we don't want to extend this behavior forward to work # in multi-indexes. This would amount to selecting a scalar from a # column. - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2016-01-01'"): df["2016-01-01"] # partial string match on year only @@ -83,7 +83,7 @@ def test_partial_string_timestamp_multiindex(): tm.assert_frame_equal(result, expected) # Slicing date on first level should break (of course) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2016-01-01'"): df_swap.loc["2016-01-01"] # GH12685 (partial string with daily resolution or below) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index c62bc80cfb53fd..3dee1dbecf3bab 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -115,7 +115,7 @@ def test_unsortedindex(): df.sort_index(inplace=True) assert len(df.loc(axis=0)["z", :]) == 2 - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'q'"): df.loc(axis=0)["q", :] diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 3f66891caddc33..cf03e2c7847f0e 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -200,7 +200,7 @@ def test_getitem_day(self): invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=v): s[v] diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 2b9632acd83cac..e79991f6521540 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -697,7 +697,7 @@ def test_get_loc(self): assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique @@ -716,7 +716,7 @@ def test_get_loc(self): assert res == 4 for i in [cidx2, idx2]: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique, sliceable @@ 
-733,7 +733,7 @@ def test_get_loc(self): assert res == slice(2, 5, None) for i in [cidx3, idx3]: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'c'"): i.get_loc("c") def test_repr_roundtrip(self): diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 0400b7810ecc9e..605df9971a567c 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -34,7 +34,9 @@ def test_droplevel(self, indices): indices.droplevel(level) for level in "wrong", ["wrong"]: - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match=re.escape("'Level wrong must be same as name (None)'") + ): indices.droplevel(level) def test_constructor_non_hashable_name(self, indices): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 213d9c65052291..58b98297f00f34 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -311,7 +311,7 @@ def test_cached_data(self): df.loc[50] assert idx._cached_data is None - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="51"): df.loc[51] assert idx._cached_data is None @@ -1027,13 +1027,13 @@ def test_engineless_lookup(self): tm.assert_numpy_array_equal( idx.get_indexer([2, 8]), ensure_platform_int(np.array([0, 2])) ) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="3"): idx.get_loc(3) assert "_engine" not in idx._cache # The engine is still required for lookup of a different dtype scalar: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'a'"): assert idx.get_loc("a") == -1 assert "_engine" in idx._cache diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 1bdb665101d416..7ae42782774db6 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -41,9 +41,9 @@ def test_nonoverlapping_monotonic(self, direction, closed): assert s[key] == expected assert s.loc[key] == expected else: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(key)): s[key] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(key)): s.loc[key] for key, expected in zip(idx.right, s): @@ -51,9 +51,9 @@ def test_nonoverlapping_monotonic(self, direction, closed): assert s[key] == expected assert s.loc[key] == expected else: - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(key)): s[key] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=str(key)): s.loc[key] for key, expected in zip(idx.mid, s): @@ -65,10 +65,10 @@ def test_non_matching(self): # this is a departure from our current # indexin scheme, but simpler - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): s.loc[[-1, 3, 4, 5]] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): s.loc[[-1, 3]] def test_large_series(self): @@ -93,7 +93,7 @@ def test_loc_getitem_frame(self): expected = df.iloc[4:6] tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="10"): df.loc[10] # single list-like @@ -106,9 +106,9 @@ def test_loc_getitem_frame(self): expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): df.loc[[10]] # partial missing - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): df.loc[[10, 4]] diff --git 
a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 92c71bbc6eb327..a86a9d16d3f9ff 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -30,31 +32,35 @@ def test_loc_with_interval(self): tm.assert_series_equal(expected, result) # missing or not exact - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")): s.loc[Interval(3, 5, closed="left")] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")): s[Interval(3, 5, closed="left")] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s[Interval(3, 5)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s.loc[Interval(3, 5)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s[Interval(3, 5)] - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match=re.escape("Interval(-2, 0, closed='right')") + ): s.loc[Interval(-2, 0)] - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match=re.escape("Interval(-2, 0, closed='right')") + ): s[Interval(-2, 0)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")): s.loc[Interval(5, 6)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")): s[Interval(5, 6)] def test_loc_with_scalar(self): @@ -175,16 +181,16 @@ def test_loc_with_overlap(self): result = s[[Interval(1, 5), Interval(3, 7)]] tm.assert_series_equal(expected, result) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s.loc[Interval(3, 5)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): s.loc[[Interval(3, 5)]] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s[Interval(3, 5)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^$"): s[[Interval(3, 5)]] # slices with interval (only exact matches) @@ -195,15 +201,17 @@ def test_loc_with_overlap(self): result = s[Interval(1, 5) : Interval(3, 7)] tm.assert_series_equal(expected, result) - with pytest.raises(KeyError): + msg = "'can only get slices from an IntervalIndex if bounds are" + " non-overlapping and all monotonic increasing or decreasing'" + with pytest.raises(KeyError, match=msg): s.loc[Interval(1, 6) : Interval(3, 8)] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): s[Interval(1, 6) : Interval(3, 8)] # slices with scalar raise for overlapping intervals # TODO KeyError is the appropriate error? 
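# A minimal illustrative sketch (mirroring a test from this patch, GH 27354) of
# the pattern applied throughout: pytest.raises(..., match=...) treats the
# message as a regular expression, so messages containing brackets or
# parentheses are wrapped in re.escape(), and "^...$" anchors are used when an
# exact short message is expected.  The exact message assumes the pandas
# version these patches target.
import re

import numpy as np
import pytest

import pandas as pd

dfnu = pd.DataFrame(np.random.randn(5, 3), index=list("AABCD"))
msg = re.escape("\"None of [Index(['E'], dtype='object')] are in the [index]\"")
with pytest.raises(KeyError, match=msg):
    dfnu.loc[["E"]]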
- with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): s.loc[1:4] def test_non_unique(self): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 0dccf023c66f8d..c365c985eb4b64 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -355,7 +355,11 @@ def test_loc_listlike(self): assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match="'a list-indexer must only include values that are in the" + " categories'", + ): self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a2a22bf60e0e40..77052de5e80e60 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1,6 +1,7 @@ """ test fancy indexing & misc """ from datetime import datetime +import re from warnings import catch_warnings, simplefilter import weakref @@ -336,7 +337,12 @@ def test_dups_fancy_indexing(self): # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='object')] are in the [index]\"" + ), + ): dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 @@ -425,7 +431,7 @@ def test_multitype_list_index_access(self): # GH 10610 df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=re.escape("'[-8, 26] not in index'")): df[[22, 26, -8]] assert df[21].shape[0] == df.shape[0] @@ -641,18 +647,18 @@ def test_string_slice(self): # dtype should properly raises KeyError df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) assert df.index.is_all_dates - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2011'"): df["2011"] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2011'"): df.loc["2011", 0] df = DataFrame() assert not df.index.is_all_dates - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2011'"): df["2011"] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'2011'"): df.loc["2011", 0] def test_astype_assignment(self): @@ -855,9 +861,9 @@ def test_mixed_index_assignment(self): def test_mixed_index_no_fallback(self): # GH 19860 s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^0$"): s.at[0] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^4$"): s.at[4] def test_rhs_alignment(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 90d1b0b1e01983..06d71d1b1e3899 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,6 @@ """ test label based indexing with loc """ from io import StringIO +import re from warnings import catch_warnings, filterwarnings import numpy as np @@ -425,7 +426,12 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - with pytest.raises(KeyError): + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Int64Index([3], dtype='int64')] are in the [index]\"" + ), + ): s.loc[[3]] # a non-match and a match diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py 
index 38b4897e55c844..0b8f3af760f1d2 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -203,9 +203,9 @@ def test_mixed_index_at_iat_loc_iloc_series(self): for i in range(len(s)): assert s.iat[i] == s.iloc[i] == i + 1 - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^4$"): s.at[4] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^4$"): s.loc[4] def test_mixed_index_at_iat_loc_iloc_dataframe(self): @@ -221,9 +221,9 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): for i in range(5): assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^3$"): df.at[0, 3] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="^3$"): df.loc[0, 3] def test_iat_setter_incompatible_assignment(self): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 0908ed885a6ca9..8ad09549f3cbe5 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1024,7 +1024,9 @@ def test_invalid_columns(self, engine, ext): read_frame = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(expected, read_frame) - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'passes columns are not ALL present dataframe'" + ): write_frame.to_excel(self.path, "test1", columns=["C", "D"]) def test_comment_arg(self, engine, ext): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 946334b5df05e0..d67f2c3b7bd66e 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -4,6 +4,7 @@ from distutils.version import LooseVersion from io import BytesIO import os +import re import tempfile from warnings import catch_warnings, simplefilter @@ -648,7 +649,7 @@ def test_get(self): right = store["/a"] tm.assert_series_equal(left, right) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'No object named b in the file'"): store.get("b") @pytest.mark.parametrize( @@ -1300,7 +1301,7 @@ def test_read_missing_key_close_store(self): df = pd.DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'No object named k2 in the file'"): pd.read_hdf(path, "k2") # smoke test to test that file is properly closed after @@ -1953,7 +1954,7 @@ def check(obj, comparator): # 0 len df_empty = DataFrame(columns=list("ABC")) store.append("df", df_empty) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'No object named df in the file'"): store.select("df") # repeated append of 0/non-zero frames @@ -2237,7 +2238,9 @@ def test_remove(self): assert len(store) == 0 # nonexistence - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'No object named a_nonexistent_store in the file'" + ): store.remove("a_nonexistent_store") # pathing @@ -3530,7 +3533,9 @@ def test_read_column(self): store.append("df", df) # error - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match=re.escape("'column [foo] not found in the table'") + ): store.select_column("df", "foo") with pytest.raises(Exception): @@ -3780,15 +3785,16 @@ def test_select_as_multiple(self): with pytest.raises(Exception): store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") - with pytest.raises(KeyError): + msg = "'No object named df3 in the file'" + with 
pytest.raises(KeyError, match=msg): store.select_as_multiple( ["df1", "df3"], where=["A>0", "B>0"], selector="df1" ) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="'No object named df4 in the file'"): store.select_as_multiple( ["df1", "df2"], where=["A>0", "B>0"], selector="df4" ) @@ -4502,7 +4508,9 @@ def test_categorical(self): assert result is not None store.remove("df3") - with pytest.raises(KeyError): + with pytest.raises( + KeyError, match="'No object named df3/meta/s/meta in the file'" + ): store.select("df3/meta/s/meta") def test_categorical_conversion(self): From 423ca86ccae5688d5cbae13d316be4a3fcd79797 Mon Sep 17 00:00:00 2001 From: endenis Date: Fri, 12 Jul 2019 16:26:54 +0200 Subject: [PATCH 215/238] CLN: Collapse private ._take implementation into the public take method #27174 (#27349) --- pandas/core/frame.py | 6 +-- pandas/core/generic.py | 71 +++++++++------------------------- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/groupby/ops.py | 4 +- pandas/core/indexing.py | 8 ++-- pandas/core/series.py | 5 ++- 7 files changed, 33 insertions(+), 65 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 55a9eb6a0810a8..263c4013de281d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2937,7 +2937,7 @@ def _ixs(self, i, axis=0): return self.loc[:, lab_slice] else: if isinstance(label, Index): - return self._take(i, axis=1) + return self.take(i, axis=1) index_len = len(self.index) @@ -2999,7 +2999,7 @@ def __getitem__(self, key): if getattr(indexer, "dtype", None) == bool: indexer = np.where(indexer)[0] - data = self._take(indexer, axis=1) + data = self.take(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? @@ -3032,7 +3032,7 @@ def _getitem_bool_array(self, key): # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - return self._take(indexer, axis=0) + return self.take(indexer, axis=0) def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e2253aed1c88c..e19b1f70ce2f7e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3294,7 +3294,7 @@ def _iget_item_cache(self, item): if ax.is_unique: lower = self._get_item_cache(ax[item]) else: - lower = self._take(item, axis=self._info_axis_number) + lower = self.take(item, axis=self._info_axis_number) return lower def _box_item_values(self, key, values): @@ -3522,52 +3522,6 @@ def __delitem__(self, key): except KeyError: pass - def _take(self, indices, axis=0, is_copy=True): - """ - Return the elements in the given *positional* indices along an axis. - - This means that we are not indexing according to actual values in - the index attribute of the object. We are indexing according to the - actual position of the element in the object. - - This is the internal version of ``.take()`` and will contain a wider - selection of parameters useful for internal use but not as suitable - for public usage. - - Parameters - ---------- - indices : array-like - An array of ints indicating which positions to take. - axis : int, default 0 - The axis on which to select elements. "0" means that we are - selecting rows, "1" means that we are selecting columns, etc. 
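# A minimal illustrative usage sketch (not from the patch) of the public
# .take() that now absorbs the private ._take() being removed here (GH 27174):
# purely positional selection along either axis.
import pandas as pd

df = pd.DataFrame({"name": ["falcon", "parrot", "lion"],
                   "class": ["bird", "bird", "mammal"]})
df.take([2, 0])         # rows 2 and 0, by position
df.take([0], axis=1)    # first column, by position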
- is_copy : bool, default True - Whether to return a copy of the original object or not. - - Returns - ------- - taken : same type as caller - An array-like containing the elements taken from the object. - - See Also - -------- - numpy.ndarray.take - numpy.take - """ - self._consolidate_inplace() - - new_data = self._data.take( - indices, axis=self._get_block_manager_axis(axis), verify=True - ) - result = self._constructor(new_data).__finalize__(self) - - # Maybe set copy if we didn't actually change the index. - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) - - return result - def take(self, indices, axis=0, is_copy=True, **kwargs): """ Return the elements in the given *positional* indices along an axis. @@ -3644,7 +3598,20 @@ class max_speed 3 lion mammal 80.5 """ nv.validate_take(tuple(), kwargs) - return self._take(indices, axis=axis, is_copy=is_copy) + + self._consolidate_inplace() + + new_data = self._data.take( + indices, axis=self._get_block_manager_axis(axis), verify=True + ) + result = self._constructor(new_data).__finalize__(self) + + # Maybe set copy if we didn't actually change the index. + if is_copy: + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + + return result def xs(self, key, axis=0, level=None, drop_level=True): """ @@ -3773,9 +3740,9 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: inds, = loc.nonzero() - return self._take(inds, axis=axis) + return self.take(inds, axis=axis) else: - return self._take(loc, axis=axis) + return self.take(loc, axis=axis) if not is_scalar(loc): new_index = self.index[loc] @@ -8091,7 +8058,7 @@ def at_time(self, time, asof=False, axis=None): except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self._take(indexer, axis=axis) + return self.take(indexer, axis=axis) def between_time( self, start_time, end_time, include_start=True, include_end=True, axis=None @@ -8168,7 +8135,7 @@ def between_time( except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self._take(indexer, axis=axis) + return self.take(indexer, axis=axis) def resample( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa71fd68086fb6..9aba9723e0546d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -679,7 +679,7 @@ def get_group(self, name, obj=None): if not len(inds): raise KeyError(name) - return obj._take(inds, axis=self.axis) + return obj.take(inds, axis=self.axis) def __iter__(self): """ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3cf358261e685c..a127d092b7b1aa 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -194,7 +194,7 @@ def _set_grouper(self, obj, sort=False): # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) - obj = obj._take(indexer, axis=self.axis, is_copy=False) + obj = obj.take(indexer, axis=self.axis, is_copy=False) self.obj = obj self.grouper = ax diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 33341a489866bb..e341a66bb74597 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -675,7 +675,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)) indexer = get_group_index_sorter(group_index, ngroups) - obj = obj._take(indexer) + obj = 
obj.take(indexer) group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() @@ -915,7 +915,7 @@ def __iter__(self): yield i, self._chop(sdata, slice(start, end)) def _get_sorted_data(self): - return self.data._take(self.sort_idx, axis=self.axis) + return self.data.take(self.sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj): return sdata.iloc[slice_obj] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 01f338a021cec5..6040385acbe402 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1137,7 +1137,7 @@ def _getitem_iterable(self, key, axis: int): # A boolean indexer key = check_bool_indexer(labels, key) inds, = key.nonzero() - return self.obj._take(inds, axis=axis) + return self.obj.take(inds, axis=axis) else: # A collection of keys keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) @@ -1448,7 +1448,7 @@ def _getbool_axis(self, key, axis: int): key = check_bool_indexer(labels, key) inds, = key.nonzero() try: - return self.obj._take(inds, axis=axis) + return self.obj.take(inds, axis=axis) except Exception as detail: raise self._exception(detail) @@ -1469,7 +1469,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): else: # DatetimeIndex overrides Index.slice_indexer and may # return a DatetimeIndex instead of a slice object. - return self.obj._take(indexer, axis=axis) + return self.obj.take(indexer, axis=axis) class _LocIndexer(_LocationIndexer): @@ -2138,7 +2138,7 @@ def _get_list_axis(self, key, axis: int): Series object """ try: - return self.obj._take(key, axis=axis) + return self.obj.take(key, axis=axis) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") diff --git a/pandas/core/series.py b/pandas/core/series.py index acb08269535083..73a71a2a41f4c8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4371,8 +4371,9 @@ def memory_usage(self, index=True, deep=False): v += self.index.memory_usage(deep=deep) return v - @Appender(generic.NDFrame._take.__doc__) - def _take(self, indices, axis=0, is_copy=False): + @Appender(generic.NDFrame.take.__doc__) + def take(self, indices, axis=0, is_copy=False, **kwargs): + nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) new_index = self.index.take(indices) From 40e3c7bab64e057175f5c7c8325af67ce2bb302b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abdullah=20=C4=B0hsan=20Se=C3=A7er?= Date: Fri, 12 Jul 2019 17:29:37 +0300 Subject: [PATCH 216/238] CLN: Split test_window.py further (#27348) --- pandas/tests/window/common.py | 23 + pandas/tests/window/test_api.py | 367 +++ pandas/tests/window/test_ewm.py | 70 + pandas/tests/window/test_expanding.py | 115 + pandas/tests/window/test_grouper.py | 176 ++ pandas/tests/window/test_moments.py | 2562 ++++++++++++++++++ pandas/tests/window/test_rolling.py | 328 +++ pandas/tests/window/test_window.py | 3581 +------------------------ 8 files changed, 3643 insertions(+), 3579 deletions(-) create mode 100644 pandas/tests/window/common.py create mode 100644 pandas/tests/window/test_api.py create mode 100644 pandas/tests/window/test_ewm.py create mode 100644 pandas/tests/window/test_expanding.py create mode 100644 pandas/tests/window/test_grouper.py create mode 100644 pandas/tests/window/test_moments.py create mode 100644 pandas/tests/window/test_rolling.py diff --git a/pandas/tests/window/common.py 
b/pandas/tests/window/common.py new file mode 100644 index 00000000000000..1dfc0f34b2b8d8 --- /dev/null +++ b/pandas/tests/window/common.py @@ -0,0 +1,23 @@ +from datetime import datetime + +import numpy as np +from numpy.random import randn + +from pandas import DataFrame, Series, bdate_range + +N, K = 100, 10 + + +class Base: + + _nan_locs = np.arange(20, 40) + _inf_locs = np.array([]) + + def _create_data(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = bdate_range(datetime(2009, 1, 1), periods=N) + self.series = Series(arr.copy(), index=self.rng) + self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py new file mode 100644 index 00000000000000..11527efa4c39fb --- /dev/null +++ b/pandas/tests/window/test_api.py @@ -0,0 +1,367 @@ +from collections import OrderedDict +import warnings +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, concat +from pandas.core.base import SpecificationError +from pandas.tests.window.common import Base +import pandas.util.testing as tm + + +class TestApi(Base): + def setup_method(self, method): + self._create_data() + + def test_getitem(self): + + r = self.frame.rolling(window=5) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns) + + r = self.frame.rolling(window=5)[1] + assert r._selected_obj.name == self.frame.columns[1] + + # technically this is allowed + r = self.frame.rolling(window=5)[1, 3] + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) + + r = self.frame.rolling(window=5)[[1, 3]] + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) + + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=["A", "B"]) + g = df.rolling(window=5) + with pytest.raises(KeyError, match="Columns not found: 'C'"): + g[["C"]] + with pytest.raises(KeyError, match="^[^A]+$"): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! 
+ g[["A", "C"]] + + def test_attribute_access(self): + + df = DataFrame([[1, 2]], columns=["A", "B"]) + r = df.rolling(window=5) + tm.assert_series_equal(r.A.sum(), r["A"].sum()) + msg = "'Rolling' object has no attribute 'F'" + with pytest.raises(AttributeError, match=msg): + r.F + + def tests_skip_nuisance(self): + + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) + r = df.rolling(window=3) + result = r[["A", "B"]].sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) + tm.assert_frame_equal(result, expected) + + def test_skip_sum_object_raises(self): + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) + r = df.rolling(window=3) + result = r.sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) + tm.assert_frame_equal(result, expected) + + def test_agg(self): + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + + r = df.rolling(window=3) + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() + + result = r.aggregate([np.mean, np.std]) + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + tm.assert_frame_equal(result, expected) + + result = r.aggregate({"A": np.mean, "B": np.std}) + + expected = concat([a_mean, b_std], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + result = r.aggregate({"A": ["mean", "std"]}) + expected = concat([a_mean, a_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) + tm.assert_frame_equal(result, expected) + + result = r["A"].aggregate(["mean", "sum"]) + expected = concat([a_mean, a_sum], axis=1) + expected.columns = ["mean", "sum"] + tm.assert_frame_equal(result, expected) + + with catch_warnings(record=True): + # using a dict with renaming + warnings.simplefilter("ignore", FutureWarning) + result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) + expected = concat([a_mean, a_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) + tm.assert_frame_equal(result, expected, check_like=True) + + with catch_warnings(record=True): + warnings.simplefilter("ignore", FutureWarning) + result = r.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) + expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) + exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) + + result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + + exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_agg_apply(self, raw): + + # passed lambda + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + + r = df.rolling(window=3) + a_sum = r["A"].sum() + + result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw) + expected = concat([a_sum, rcustom], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_agg_consistency(self): + + df = DataFrame({"A": range(5), "B": 
range(0, 10, 2)}) + r = df.rolling(window=3) + + result = r.agg([np.sum, np.mean]).columns + expected = pd.MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + tm.assert_index_equal(result, expected) + + result = r["A"].agg([np.sum, np.mean]).columns + expected = Index(["sum", "mean"]) + tm.assert_index_equal(result, expected) + + result = r.agg({"A": [np.sum, np.mean]}).columns + expected = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "mean")]) + tm.assert_index_equal(result, expected) + + def test_agg_nested_dicts(self): + + # API change for disallowing these types of nested dicts + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + r = df.rolling(window=3) + + msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" + with pytest.raises(SpecificationError, match=msg): + r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) + + expected = concat( + [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) + with catch_warnings(record=True): + warnings.simplefilter("ignore", FutureWarning) + result = r[["A", "B"]].agg( + {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} + ) + tm.assert_frame_equal(result, expected, check_like=True) + + with catch_warnings(record=True): + warnings.simplefilter("ignore", FutureWarning) + result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("A", "ra", "mean"), + ("A", "ra", "std"), + ("B", "rb", "mean"), + ("B", "rb", "std"), + ] + ) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_count_nonnumeric_types(self): + # GH12541 + cols = [ + "int", + "float", + "string", + "datetime", + "timedelta", + "periods", + "fl_inf", + "fl_nan", + "str_nan", + "dt_nat", + "periods_nat", + ] + + df = DataFrame( + { + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "datetime": pd.date_range("20170101", periods=3), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + "periods": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period("2012-03"), + ], + "fl_inf": [1.0, 2.0, np.Inf], + "fl_nan": [1.0, 2.0, np.NaN], + "str_nan": ["aa", "bb", np.NaN], + "dt_nat": [ + Timestamp("20170101"), + Timestamp("20170203"), + Timestamp(None), + ], + "periods_nat": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period(None), + ], + }, + columns=cols, + ) + + expected = DataFrame( + { + "int": [1.0, 2.0, 2.0], + "float": [1.0, 2.0, 2.0], + "string": [1.0, 2.0, 2.0], + "datetime": [1.0, 2.0, 2.0], + "timedelta": [1.0, 2.0, 2.0], + "periods": [1.0, 2.0, 2.0], + "fl_inf": [1.0, 2.0, 2.0], + "fl_nan": [1.0, 2.0, 1.0], + "str_nan": [1.0, 2.0, 1.0], + "dt_nat": [1.0, 2.0, 1.0], + "periods_nat": [1.0, 2.0, 1.0], + }, + columns=cols, + ) + + result = df.rolling(window=2).count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(1).count() + expected = df.notna().astype(float) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") + def test_window_with_args(self): + # make sure that we are aggregating window functions correctly with arg + r = Series(np.random.randn(100)).rolling( + window=10, min_periods=1, win_type="gaussian" + ) + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["", ""] + result = r.aggregate([lambda x: x.mean(std=10), 
lambda x: x.mean(std=0.01)]) + tm.assert_frame_equal(result, expected) + + def a(x): + return x.mean(std=10) + + def b(x): + return x.mean(std=0.01) + + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["a", "b"] + result = r.aggregate([a, b]) + tm.assert_frame_equal(result, expected) + + def test_preserve_metadata(self): + # GH 10565 + s = Series(np.arange(100), name="foo") + + s2 = s.rolling(30).sum() + s3 = s.rolling(20).sum() + assert s2.name == "foo" + assert s3.name == "foo" + + @pytest.mark.parametrize( + "func,window_size,expected_vals", + [ + ( + "rolling", + 2, + [ + [np.nan, np.nan, np.nan, np.nan], + [15.0, 20.0, 25.0, 20.0], + [25.0, 30.0, 35.0, 30.0], + [np.nan, np.nan, np.nan, np.nan], + [20.0, 30.0, 35.0, 30.0], + [35.0, 40.0, 60.0, 40.0], + [60.0, 80.0, 85.0, 80], + ], + ), + ( + "expanding", + None, + [ + [10.0, 10.0, 20.0, 20.0], + [15.0, 20.0, 25.0, 20.0], + [20.0, 30.0, 30.0, 20.0], + [10.0, 10.0, 30.0, 30.0], + [20.0, 30.0, 35.0, 30.0], + [26.666667, 40.0, 50.0, 30.0], + [40.0, 80.0, 60.0, 30.0], + ], + ), + ], + ) + def test_multiple_agg_funcs(self, func, window_size, expected_vals): + # GH 15072 + df = pd.DataFrame( + [ + ["A", 10, 20], + ["A", 20, 30], + ["A", 30, 40], + ["B", 10, 30], + ["B", 30, 40], + ["B", 40, 80], + ["B", 80, 90], + ], + columns=["stock", "low", "high"], + ) + + f = getattr(df.groupby("stock"), func) + if window_size: + window = f(window_size) + else: + window = f() + + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)], + names=["stock", None], + ) + columns = pd.MultiIndex.from_tuples( + [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] + ) + expected = pd.DataFrame(expected_vals, index=index, columns=columns) + + result = window.agg( + OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py new file mode 100644 index 00000000000000..a05b567adad7a9 --- /dev/null +++ b/pandas/tests/window/test_ewm.py @@ -0,0 +1,70 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall + +from pandas import DataFrame, Series +import pandas.core.window as rwindow +from pandas.tests.window.common import Base + + +class TestEWM(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + o = getattr(self, which) + c = o.ewm + + # valid + c(com=0.5) + c(span=1.5) + c(alpha=0.5) + c(halflife=0.75) + c(com=0.5, span=None) + c(alpha=0.5, com=None) + c(halflife=0.75, alpha=None) + + # not valid: mutually exclusive + with pytest.raises(ValueError): + c(com=0.5, alpha=0.5) + with pytest.raises(ValueError): + c(span=1.5, halflife=0.75) + with pytest.raises(ValueError): + c(alpha=0.5, span=1.5) + + # not valid: com < 0 + with pytest.raises(ValueError): + c(com=-0.5) + + # not valid: span < 1 + with pytest.raises(ValueError): + c(span=0.5) + + # not valid: halflife <= 0 + with pytest.raises(ValueError): + c(halflife=0) + + # not valid: alpha <= 0 or alpha > 1 + for alpha in (-0.5, 1.5): + with pytest.raises(ValueError): + c(alpha=alpha) + + @pytest.mark.parametrize("method", ["std", "mean", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + e = rwindow.EWM(Series([2, 4, 6]), 
alpha=0.5) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(dtype=np.float64) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py new file mode 100644 index 00000000000000..1e92c981964c5f --- /dev/null +++ b/pandas/tests/window/test_expanding.py @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall + +import pandas as pd +from pandas import DataFrame, Series +import pandas.core.window as rwindow +from pandas.tests.window.common import Base +import pandas.util.testing as tm + + +class TestExpanding(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + # GH 12669 + + o = getattr(self, which) + c = o.expanding + + # valid + c(min_periods=1) + c(min_periods=1, center=True) + c(min_periods=1, center=False) + + # not valid + for w in [2.0, "foo", np.array([2])]: + with pytest.raises(ValueError): + c(min_periods=w) + with pytest.raises(ValueError): + c(min_periods=1, center=w) + + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + e = rwindow.Expanding(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(dtype=np.float64) + + @pytest.mark.parametrize( + "expander", + [ + 1, + pytest.param( + "ls", + marks=pytest.mark.xfail( + reason="GH#16425 expanding with offset not supported" + ), + ), + ], + ) + def test_empty_df_expanding(self, expander): + # GH 15819 Verifies that datetime and integer expanding windows can be + # applied to empty DataFrames + + expected = DataFrame() + result = DataFrame().expanding(expander).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer expanding windows can be applied + # to empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).expanding(expander).sum() + tm.assert_frame_equal(result, expected) + + def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 + # minp=0 + x = pd.Series([np.nan]) + result = x.expanding(min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.expanding(min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) + def test_iter_raises(self, klass): + # https://github.com/pandas-dev/pandas/issues/11704 + # Iteration over a Window + obj = klass([1, 2, 3, 4]) + with pytest.raises(NotImplementedError): + iter(obj.expanding(2)) + + def test_expanding_axis(self, axis_frame): + # see gh-23372. 
+ df = DataFrame(np.ones((10, 20))) + axis = df._get_axis_number(axis_frame) + + if axis == 0: + expected = DataFrame( + {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} + ) + else: + # axis == 1 + expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) + + result = df.expanding(3, axis=axis_frame).sum() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py new file mode 100644 index 00000000000000..b726bd3e3c8a74 --- /dev/null +++ b/pandas/tests/window/test_grouper.py @@ -0,0 +1,176 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas.util.testing as tm + + +class TestGrouperGrouping: + def setup_method(self, method): + self.series = Series(np.arange(10)) + self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) + + def test_mutated(self): + + msg = r"group\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + self.frame.groupby("A", foo=1) + + g = self.frame.groupby("A") + assert not g.mutated + g = self.frame.groupby("A", mutated=True) + assert g.mutated + + def test_getitem(self): + g = self.frame.groupby("A") + g_mutated = self.frame.groupby("A", mutated=True) + + expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) + + result = g.rolling(2).mean().B + tm.assert_series_equal(result, expected) + + result = g.rolling(2).B.mean() + tm.assert_series_equal(result, expected) + + result = g.B.rolling(2).mean() + tm.assert_series_equal(result, expected) + + result = self.frame.B.groupby(self.frame.A).rolling(2).mean() + tm.assert_series_equal(result, expected) + + def test_getitem_multiple(self): + + # GH 13174 + g = self.frame.groupby("A") + r = g.rolling(2) + g_mutated = self.frame.groupby("A", mutated=True) + expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + + result = r.B.count() + tm.assert_series_equal(result, expected) + + result = r.B.count() + tm.assert_series_equal(result, expected) + + def test_rolling(self): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: + + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + tm.assert_frame_equal(result, expected) + + for f in ["std", "var"]: + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) + tm.assert_frame_equal(result, expected) + + def test_rolling_corr_cov(self): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + for f in ["corr", "cov"]: + result = getattr(r, f)(self.frame) + + def func(x): + return getattr(x.rolling(4), f)(self.frame) + + expected = g.apply(func) + tm.assert_frame_equal(result, expected) + + result = getattr(r.B, f)(pairwise=True) + + def func(x): + return getattr(x.B.rolling(4), f)(pairwise=True) + + expected = g.apply(func) + tm.assert_series_equal(result, expected) + + def test_rolling_apply(self, raw): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + # reduction + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + tm.assert_frame_equal(result, expected) + + def test_rolling_apply_mutability(self): + # GH 14013 + df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) + g = 
df.groupby("A") + + mi = pd.MultiIndex.from_tuples( + [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)] + ) + + mi.names = ["A", None] + # Grouped column should not be a part of the output + expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) + + result = g.rolling(window=2).sum() + tm.assert_frame_equal(result, expected) + + # Call an arbitrary function on the groupby + g.sum() + + # Make sure nothing has been mutated + result = g.rolling(window=2).sum() + tm.assert_frame_equal(result, expected) + + def test_expanding(self): + g = self.frame.groupby("A") + r = g.expanding() + + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: + + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.expanding(), f)()) + tm.assert_frame_equal(result, expected) + + for f in ["std", "var"]: + result = getattr(r, f)(ddof=0) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = g.apply(lambda x: x.expanding().quantile(0.5)) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr_cov(self): + g = self.frame.groupby("A") + r = g.expanding() + + for f in ["corr", "cov"]: + result = getattr(r, f)(self.frame) + + def func(x): + return getattr(x.expanding(), f)(self.frame) + + expected = g.apply(func) + tm.assert_frame_equal(result, expected) + + result = getattr(r.B, f)(pairwise=True) + + def func(x): + return getattr(x.B.expanding(), f)(pairwise=True) + + expected = g.apply(func) + tm.assert_series_equal(result, expected) + + def test_expanding_apply(self, raw): + g = self.frame.groupby("A") + r = g.expanding() + + # reduction + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py new file mode 100644 index 00000000000000..d860859958254c --- /dev/null +++ b/pandas/tests/window/test_moments.py @@ -0,0 +1,2562 @@ +import copy +from datetime import datetime +import warnings + +import numpy as np +from numpy.random import randn +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, Series, concat, isna, notna +import pandas.core.window as rwindow +from pandas.tests.window.common import Base +import pandas.util.testing as tm + +import pandas.tseries.offsets as offsets + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestMoments(Base): + def setup_method(self, method): + self._create_data() + + def test_centered_axis_validation(self): + + # ok + Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean() + + # bad axis + with pytest.raises(ValueError): + Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() + + # ok ok + DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=0).mean() + DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() + + # bad axis + with pytest.raises(ValueError): + (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) + + def test_rolling_sum(self, raw): + self._check_moment_func( + np.nansum, name="sum", zero_min_periods_equal=False, raw=raw + ) + + def test_rolling_count(self, raw): + counter = lambda x: np.isfinite(x).astype(float).sum() + self._check_moment_func( + counter, name="count", has_min_periods=False, fill_value=0, raw=raw + ) + + def 
test_rolling_mean(self, raw): + self._check_moment_func(np.mean, name="mean", raw=raw) + + @td.skip_if_no_scipy + def test_cmov_mean(self): + # GH 8238 + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + result = Series(vals).rolling(5, center=True).mean() + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) + tm.assert_series_equal(expected, result) + + @td.skip_if_no_scipy + def test_cmov_window(self): + # GH 8238 + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + result = Series(vals).rolling(5, win_type="boxcar", center=True).mean() + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) + tm.assert_series_equal(expected, result) + + @td.skip_if_no_scipy + def test_cmov_window_corner(self): + # GH 8238 + # all nan + vals = pd.Series([np.nan] * 10) + result = vals.rolling(5, center=True, win_type="boxcar").mean() + assert np.isnan(result).all() + + # empty + vals = pd.Series([]) + result = vals.rolling(5, center=True, win_type="boxcar").mean() + assert len(result) == 0 + + # shorter than window + vals = pd.Series(np.random.randn(5)) + result = vals.rolling(10, win_type="boxcar").mean() + assert np.isnan(result).all() + assert len(result) == 5 + + @td.skip_if_no_scipy + def test_cmov_window_frame(self): + # Gh 8238 + vals = np.array( + [ + [12.18, 3.64], + [10.18, 9.16], + [13.24, 14.61], + [4.51, 8.11], + [6.15, 11.44], + [9.14, 6.21], + [11.31, 10.67], + [2.94, 6.51], + [9.42, 8.39], + [12.44, 7.34], + ] + ) + + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [9.252, 9.392], + [8.644, 9.906], + [8.87, 10.208], + [6.81, 8.588], + [7.792, 8.644], + [9.05, 7.824], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) + + # DataFrame + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() + tm.assert_frame_equal(DataFrame(xp), rs) + + # invalid method + with pytest.raises(AttributeError): + (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) + + # sum + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [46.26, 46.96], + [43.22, 49.53], + [44.35, 51.04], + [34.05, 42.94], + [38.96, 43.22], + [45.25, 39.12], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) + + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() + tm.assert_frame_equal(DataFrame(xp), rs) + + @td.skip_if_no_scipy + def test_cmov_window_na_min_periods(self): + # min_periods + vals = Series(np.random.randn(10)) + vals[4] = np.nan + vals[8] = np.nan + + xp = vals.rolling(5, min_periods=4, center=True).mean() + rs = vals.rolling(5, win_type="boxcar", min_periods=4, center=True).mean() + tm.assert_series_equal(xp, rs) + + @td.skip_if_no_scipy + def test_cmov_window_regular(self, win_types): + # GH 8238 + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + xps = { + "hamming": [ + np.nan, + np.nan, + 8.71384, + 9.56348, + 12.38009, + 14.03687, + 13.8567, + 11.81473, + np.nan, + np.nan, + ], + "triang": [ + np.nan, + np.nan, + 9.28667, + 10.34667, + 12.00556, + 13.33889, + 13.38, + 12.33667, + np.nan, + np.nan, + ], + "barthann": [ + np.nan, + np.nan, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], + "bohman": [ + np.nan, + np.nan, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 14.65923, + 11.10401, + np.nan, + np.nan, + ], + "blackmanharris": [ + np.nan, 
+ np.nan, + 6.97691, + 9.16438, + 13.05052, + 14.02156, + 15.10512, + 10.74574, + np.nan, + np.nan, + ], + "nuttall": [ + np.nan, + np.nan, + 7.04618, + 9.16786, + 13.02671, + 14.03559, + 15.05657, + 10.78514, + np.nan, + np.nan, + ], + "blackman": [ + np.nan, + np.nan, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 14.57726, + 11.16988, + np.nan, + np.nan, + ], + "bartlett": [ + np.nan, + np.nan, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], + } + + xp = Series(xps[win_types]) + rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + tm.assert_series_equal(xp, rs) + + @td.skip_if_no_scipy + def test_cmov_window_regular_linear_range(self, win_types): + # GH 8238 + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() + tm.assert_series_equal(xp, rs) + + @td.skip_if_no_scipy + def test_cmov_window_regular_missing_data(self, win_types): + # GH 8238 + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] + ) + xps = { + "bartlett": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "blackman": [ + np.nan, + np.nan, + 9.04582, + 11.41536, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 15.8706, + 13.655, + ], + "barthann": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "bohman": [ + np.nan, + np.nan, + 8.9444, + 11.56327, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 15.90976, + 13.655, + ], + "hamming": [ + np.nan, + np.nan, + 9.59321, + 10.29694, + 8.71384, + 9.56348, + 12.38009, + 14.20565, + 15.24694, + 13.69758, + ], + "nuttall": [ + np.nan, + np.nan, + 8.47693, + 12.2821, + 7.04618, + 9.16786, + 13.02671, + 14.03673, + 16.08759, + 13.65553, + ], + "triang": [ + np.nan, + np.nan, + 9.33167, + 9.76125, + 9.28667, + 10.34667, + 12.00556, + 13.82125, + 14.49429, + 13.765, + ], + "blackmanharris": [ + np.nan, + np.nan, + 8.42526, + 12.36824, + 6.97691, + 9.16438, + 13.05052, + 14.02175, + 16.1098, + 13.65509, + ], + } + + xp = Series(xps[win_types]) + rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean() + tm.assert_series_equal(xp, rs) + + @td.skip_if_no_scipy + def test_cmov_window_special(self, win_types_special): + # GH 8238 + kwds = { + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "exponential": {"tau": 10}, + } + + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + + xps = { + "gaussian": [ + np.nan, + np.nan, + 8.97297, + 9.76077, + 12.24763, + 13.89053, + 13.65671, + 12.01002, + np.nan, + np.nan, + ], + "general_gaussian": [ + np.nan, + np.nan, + 9.85011, + 10.71589, + 11.73161, + 13.08516, + 12.95111, + 12.74577, + np.nan, + np.nan, + ], + "kaiser": [ + np.nan, + np.nan, + 9.86851, + 11.02969, + 11.65161, + 12.75129, + 12.90702, + 12.83757, + np.nan, + np.nan, + ], + "exponential": [ + np.nan, + np.nan, + 9.83364, + 11.10472, + 11.64551, + 12.66138, + 12.92379, + 12.83770, + np.nan, + np.nan, + ], + } + + xp = Series(xps[win_types_special]) + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) + tm.assert_series_equal(xp, rs) + + @td.skip_if_no_scipy + def test_cmov_window_special_linear_range(self, win_types_special): + # GH 8238 + kwds = { + 
"kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "slepian": {"width": 0.5}, + "exponential": {"tau": 10}, + } + + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) + tm.assert_series_equal(xp, rs) + + def test_rolling_median(self, raw): + self._check_moment_func(np.median, name="median", raw=raw) + + def test_rolling_min(self, raw): + self._check_moment_func(np.min, name="min", raw=raw) + + a = pd.Series([1, 2, 3, 4, 5]) + result = a.rolling(window=100, min_periods=1).min() + expected = pd.Series(np.ones(len(a))) + tm.assert_series_equal(result, expected) + + with pytest.raises(ValueError): + pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() + + def test_rolling_max(self, raw): + self._check_moment_func(np.max, name="max", raw=raw) + + a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) + b = a.rolling(window=100, min_periods=1).max() + tm.assert_almost_equal(a, b) + + with pytest.raises(ValueError): + pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() + + @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) + def test_rolling_quantile(self, q, raw): + def scoreatpercentile(a, per): + values = np.sort(a, axis=0) + + idx = int(per / 1.0 * (values.shape[0] - 1)) + + if idx == values.shape[0] - 1: + retval = values[-1] + + else: + qlow = float(idx) / float(values.shape[0] - 1) + qhig = float(idx + 1) / float(values.shape[0] - 1) + vlow = values[idx] + vhig = values[idx + 1] + retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) + + return retval + + def quantile_func(x): + return scoreatpercentile(x, q) + + self._check_moment_func(quantile_func, name="quantile", quantile=q, raw=raw) + + def test_rolling_quantile_np_percentile(self): + # #9413: Tests that rolling window's quantile default behavior + # is analogous to Numpy's percentile + row = 10 + col = 5 + idx = pd.date_range("20100101", periods=row, freq="B") + df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx) + + df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) + np_percentile = np.percentile(df, [25, 50, 75], axis=0) + + tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) + + @pytest.mark.parametrize("quantile", [0.0, 0.1, 0.45, 0.5, 1]) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] + ) + @pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + [8.0, 1.0, 3.0, 4.0, 5.0, 2.0, 6.0, 7.0], + [0.0, np.nan, 0.2, np.nan, 0.4], + [np.nan, np.nan, np.nan, np.nan], + [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], + [0.5], + [np.nan, 0.7, 0.6], + ], + ) + def test_rolling_quantile_interpolation_options( + self, quantile, interpolation, data + ): + # Tests that rolling window's quantile behavior is analogous to + # Series' quantile for each interpolation option + s = Series(data) + + q1 = s.quantile(quantile, interpolation) + q2 = s.expanding(min_periods=1).quantile(quantile, interpolation).iloc[-1] + + if np.isnan(q1): + assert np.isnan(q2) + else: + assert q1 == q2 + + def test_invalid_quantile_value(self): + data = np.arange(5) + s = Series(data) + + msg = "Interpolation 'invalid' is not supported" + with pytest.raises(ValueError, match=msg): + s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid") + + def test_rolling_quantile_param(self): + ser = 
Series([0.0, 0.1, 0.5, 0.9, 1.0]) + + with pytest.raises(ValueError): + ser.rolling(3).quantile(-0.1) + + with pytest.raises(ValueError): + ser.rolling(3).quantile(10.0) + + with pytest.raises(TypeError): + ser.rolling(3).quantile("foo") + + def test_rolling_apply(self, raw): + # suppress warnings about empty slices, as we are deliberately testing + # with a 0-length Series + + def f(x): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + return x[np.isfinite(x)].mean() + + self._check_moment_func(np.mean, name="apply", func=f, raw=raw) + + expected = Series([]) + result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) + expected = Series([1.0, 2.0, 2.0]) + tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, raw=raw) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + @pytest.mark.parametrize( + "method", [lambda x: x.rolling(window=2), lambda x: x.expanding()] + ) + def test_apply_future_warning(self, klass, method): + + # gh-5071 + s = klass(np.arange(3)) + + with tm.assert_produces_warning(FutureWarning): + method(s).apply(lambda x: len(x)) + + def test_rolling_apply_out_of_bounds(self, raw): + # gh-1850 + vals = pd.Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) + expected = pd.Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("window", [2, "2s"]) + def test_rolling_apply_with_pandas_objects(self, window): + # 5071 + df = pd.DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=pd.date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + def test_rolling_std(self, raw): + self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) + self._check_moment_func( + lambda x: np.std(x, ddof=0), name="std", ddof=0, raw=raw + ) + + def test_rolling_std_1obs(self): + vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) + + result = vals.rolling(1, min_periods=1).std() + expected = pd.Series([np.nan] * 5) + tm.assert_series_equal(result, expected) + + result = vals.rolling(1, min_periods=1).std(ddof=0) + expected = pd.Series([0.0] * 5) + tm.assert_series_equal(result, expected) + + result = pd.Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() + assert np.isnan(result[2]) + + def test_rolling_std_neg_sqrt(self): + # unit test from Bottleneck + + # Test move_nanstd for neg sqrt. 
+ + a = pd.Series( + [ + 0.0011448196318903589, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + ] + ) + b = a.rolling(window=3).std() + assert np.isfinite(b[2:]).all() + + b = a.ewm(span=3).std() + assert np.isfinite(b[2:]).all() + + def test_rolling_var(self, raw): + self._check_moment_func(lambda x: np.var(x, ddof=1), name="var", raw=raw) + self._check_moment_func( + lambda x: np.var(x, ddof=0), name="var", ddof=0, raw=raw + ) + + @td.skip_if_no_scipy + def test_rolling_skew(self, raw): + from scipy.stats import skew + + self._check_moment_func(lambda x: skew(x, bias=False), name="skew", raw=raw) + + @td.skip_if_no_scipy + def test_rolling_kurt(self, raw): + from scipy.stats import kurtosis + + self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt", raw=raw) + + def _check_moment_func( + self, + static_comp, + name, + raw, + has_min_periods=True, + has_center=True, + has_time_rule=True, + fill_value=None, + zero_min_periods_equal=True, + **kwargs + ): + + # inject raw + if name == "apply": + kwargs = copy.copy(kwargs) + kwargs["raw"] = raw + + def get_result(obj, window, min_periods=None, center=False): + r = obj.rolling(window=window, min_periods=min_periods, center=center) + return getattr(r, name)(**kwargs) + + series_result = get_result(self.series, window=50) + assert isinstance(series_result, Series) + tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:])) + + frame_result = get_result(self.frame, window=50) + assert isinstance(frame_result, DataFrame) + tm.assert_series_equal( + frame_result.iloc[-1, :], + self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), + check_names=False, + ) + + # check time_rule works + if has_time_rule: + win = 25 + minp = 10 + series = self.series[::2].resample("B").mean() + frame = self.frame[::2].resample("B").mean() + + if has_min_periods: + series_result = get_result(series, window=win, min_periods=minp) + frame_result = get_result(frame, window=win, min_periods=minp) + else: + series_result = get_result(series, window=win) + frame_result = get_result(frame, window=win) + + last_date = series_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_series = self.series[::2].truncate(prev_date, last_date) + trunc_frame = self.frame[::2].truncate(prev_date, last_date) + + tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) + + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(static_comp, raw=raw), + check_names=False, + ) + + # excluding NaNs correctly + obj = Series(randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + if has_min_periods: + result = get_result(obj, 50, min_periods=30) + tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) + + # min_periods is working correctly + result = get_result(obj, 20, min_periods=15) + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(randn(20)) + result = get_result(obj2, 10, min_periods=5) + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + if zero_min_periods_equal: + # min_periods=0 may be equivalent to min_periods=1 + result0 = get_result(obj, 20, min_periods=0) + result1 = get_result(obj, 20, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = get_result(obj, 50) + tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) + + # window larger than series length (#7297) + if 
has_min_periods: + for minp in (0, len(self.series) - 1, len(self.series)): + result = get_result(self.series, len(self.series) + 1, min_periods=minp) + expected = get_result(self.series, len(self.series), min_periods=minp) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + else: + result = get_result(self.series, len(self.series) + 1) + expected = get_result(self.series, len(self.series)) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + # check center=True + if has_center: + if has_min_periods: + result = get_result(obj, 20, min_periods=15, center=True) + expected = get_result( + pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 + )[9:].reset_index(drop=True) + else: + result = get_result(obj, 20, center=True) + expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ + 9: + ].reset_index(drop=True) + + tm.assert_series_equal(result, expected) + + # shifter index + s = ["x{x:d}".format(x=x) for x in range(12)] + + if has_min_periods: + minp = 10 + + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.frame.index) + ) + + series_rs = get_result( + self.series, window=25, min_periods=minp, center=True + ) + frame_rs = get_result( + self.frame, window=25, min_periods=minp, center=True + ) + + else: + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), window=25 + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), window=25 + ) + .shift(-12) + .reindex(self.frame.index) + ) + + series_rs = get_result(self.series, window=25, center=True) + frame_rs = get_result(self.frame, window=25, center=True) + + if fill_value is not None: + series_xp = series_xp.fillna(fill_value) + frame_xp = frame_xp.fillna(fill_value) + tm.assert_series_equal(series_xp, series_rs) + tm.assert_frame_equal(frame_xp, frame_rs) + + def test_ewma(self): + self._check_ew(name="mean") + + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = vals.ewm(span=100, adjust=False).mean().sum() + assert np.abs(result - 1) < 1e-2 + + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + + s = Series([1.0, 2.0, 4.0, 8.0]) + + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) + + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + + def test_ewma_nan_handling(self): + s = Series([1.0] + [np.nan] * 5 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([1.0] * len(s))) + + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) + + # GH 7603 + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha 
= 1.0 / (1.0 + com) + + def simple_wma(s, w): + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") + + for (s, adjust, ignore_na, w) in [ + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: + expected = simple_wma(s, Series(w)) + result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=com, adjust=adjust).mean() + tm.assert_series_equal(result, expected) + + def test_ewmvar(self): + self._check_ew(name="var") + + def test_ewmvol(self): + self._check_ew(name="vol") + + def test_ewma_span_com_args(self): + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() + + def test_ewma_halflife_arg(self): + A = self.series.ewm(com=13.932726172912965).mean() + B = self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() + + def test_ewm_alpha(self): + # GH 10789 + s = Series(self.arr) + a = s.ewm(alpha=0.61722699889169674).mean() + b = s.ewm(com=0.62014947789973052).mean() + c = s.ewm(span=2.240298955799461).mean() + d = s.ewm(halflife=0.721792864318).mean() + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) + + def test_ewm_alpha_arg(self): + # GH 10789 + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) + + def test_ewm_domain_checks(self): + # GH 12492 + s = Series(self.arr) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) + s.ewm(com=0.0) + s.ewm(com=0.1) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) + 
s.ewm(span=1.0) + s.ewm(span=1.1) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) + s.ewm(halflife=0.1) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) + s.ewm(alpha=0.1) + s.ewm(alpha=1.0) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) + + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) + def test_ew_empty_series(self, method): + vals = pd.Series([], dtype=np.float64) + + ewm = vals.ewm(3) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) + + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == DataFrame + + result = getattr(self.series.ewm(com=10), name)() + if preserve_nan: + assert result[self._nan_locs].isna().all() + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + s = Series(arr) + + # check min_periods + # GH 7898 + result = getattr(s.ewm(com=50, min_periods=2), name)() + assert result[:11].isna().all() + assert not result[11:].isna().any() + + for min_periods in (0, 1): + result = getattr(s.ewm(com=50, min_periods=min_periods), name)() + if name == "mean": + assert result[:10].isna().all() + assert not result[10:].isna().any() + else: + # ewm.std, ewm.vol, ewm.var (with bias=False) require at least + # two values + assert result[:11].isna().all() + assert not result[11:].isna().any() + + # check series of length 0 + result = getattr(Series().ewm(com=50, min_periods=min_periods), name)() + tm.assert_series_equal(result, Series()) + + # check series of length 1 + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) + else: + # ewm.std, ewm.vol, ewm.var with bias=False require at least + # two values + tm.assert_series_equal(result, Series([np.NaN])) + + # pass in ints + result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() + assert result2.dtype == np.float_ + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + 
Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # data is a tuple(object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +def _rolling_consistency_cases(): + for window in [1, 2, 3, 10, 20]: + for min_periods in {0, 1, 2, 3, 4, window}: + if min_periods and (min_periods > window): + continue + for center in [False, True]: + yield window, min_periods, center + + +class TestMomentsConsistency(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def setup_method(self, method): + self._create_data() + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + mock_mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + var_debiasing_factors=None, + ): + def _non_null_values(x): + values = x.values.ravel() + return set(values[notna(values)].tolist()) + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + # assert _non_null_values(corr_x_x).issubset(set([1.])) + # restore once rolling_cov(x, x) is identically equal to var(x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else x.max().max() + + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + if var_unbiased and var_biased and 
var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewm_consistency(self, min_periods, adjust, ignore_na): + def _weights(s, com, adjust, ignore_na): + if isinstance(s, DataFrame): + if not len(s.columns): + return DataFrame(index=s.index, columns=s.columns) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) + w.index = s.index + w.columns = s.columns + return w + + w = Series(np.nan, index=s.index) + alpha = 1.0 / (1.0 + com) + if ignore_na: + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) + elif adjust: + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + w.iat[i] = pow(1.0 / (1.0 - alpha), i) + else: + sum_wts = 0.0 + prev_i = -1 + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + if prev_i == -1: + w.iat[i] = 1.0 + else: + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) + sum_wts += w.iat[i] + prev_i = i + return w + + def _variance_debiasing_factors(s, com, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") + 
numerator = cum_sum * cum_sum + denominator = numerator - cum_sum_sq + denominator[denominator <= 0.0] = np.nan + return numerator / denominator + + def _ewma(s, com, min_periods, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan + return result + + com = 3.0 + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + std_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), + cov_unbiased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), + var_debiasing_factors=lambda x: ( + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different expanding_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), + var_debiasing_factors=lambda x: ( + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for 
(f, require_min_periods, name) in functions: + expanding_f = getattr(x.expanding(min_periods=min_periods), name) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) + else: + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) + else: + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(expanding_f_result, expanding_apply_f_result) + + @pytest.mark.slow + @pytest.mark.parametrize( + "window,min_periods,center", list(_rolling_consistency_cases()) + ) + def test_rolling_consistency(self, window, min_periods, center): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different rolling_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), + mock_mean=lambda x: ( + x.rolling(window=window, min_periods=min_periods, center=center) + .sum() + .divide( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + ) + ), + corr=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), + var_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), + std_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).std() + ), + cov_unbiased=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y) + ), + var_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), + std_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=0) + ), + cov_biased=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y, ddof=0) + ), + var_debiasing_factors=lambda x: ( + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) + + # test consistency between rolling_xyz() and either (a) + # rolling_apply of Series.xyz(), or (b) rolling_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + rolling_f = getattr( + x.rolling( + window=window, center=center, min_periods=min_periods + ), + name, + ) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + rolling_f_result = rolling_f() + rolling_apply_f_result = x.rolling( + window=window, min_periods=0, center=center + ).apply(func=f, raw=True) + else: + if name in ["cov", "corr"]: + rolling_f_result = rolling_f(pairwise=False) + else: + rolling_f_result = rolling_f() + 
rolling_apply_f_result = x.rolling( + window=window, min_periods=min_periods, center=center + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(rolling_f_result, rolling_apply_f_result) + + # binary moments + def test_rolling_cov(self): + A = self.series + B = A + randn(len(A)) + + result = A.rolling(window=50, min_periods=25).cov(B) + tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + + def test_rolling_cov_pairwise(self): + self._check_pairwise_moment("rolling", "cov", window=10, min_periods=5) + + def test_rolling_corr(self): + A = self.series + B = A + randn(len(A)) + + result = A.rolling(window=50, min_periods=25).corr(B) + tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + + # test for correct bias correction + a = tm.makeTimeSeries() + b = tm.makeTimeSeries() + a[:5] = np.nan + b[:10] = np.nan + + result = a.rolling(window=len(a), min_periods=1).corr(b) + tm.assert_almost_equal(result[-1], a.corr(b)) + + def test_rolling_corr_pairwise(self): + self._check_pairwise_moment("rolling", "corr", window=10, min_periods=5) + + @pytest.mark.parametrize("window", range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) + + def test_flex_binary_moment(self): + # GH3155 + # don't blow the stack + msg = ( + "arguments to moment function must be of type" + " np.ndarray/Series/DataFrame" + ) + with pytest.raises(TypeError, match=msg): + rwindow._flex_binary_moment(5, 6, None) + + def test_corr_sanity(self): + # GH 3155 + df = DataFrame( + np.array( + [ + [0.87024726, 0.18505595], + [0.64355431, 0.3091617], + [0.92372966, 0.50552513], + [0.00203756, 0.04520709], + [0.84780328, 0.33394331], + [0.78369152, 0.63919667], + ] + ) + ) + + res = df[0].rolling(5, center=True).corr(df[1]) + assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) + + # and some fuzzing + for _ in range(10): + df = DataFrame(np.random.rand(30, 2)) + res = df[0].rolling(5, center=True).corr(df[1]) + try: + assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) + except AssertionError: + print(res) + + @pytest.mark.parametrize("method", ["corr", "cov"]) + def test_flex_binary_frame(self, method): + series = self.frame[1] + + res = getattr(series.rolling(window=10), method)(self.frame) + res2 = getattr(self.frame.rolling(window=10), method)(series) + exp = self.frame.apply(lambda x: getattr(series.rolling(window=10), method)(x)) + + tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(res2, exp) + + frame2 = self.frame.copy() + frame2.values[:] = np.random.randn(*frame2.shape) + + res3 = getattr(self.frame.rolling(window=10), method)(frame2) + exp = DataFrame( + { + k: getattr(self.frame[k].rolling(window=10), method)(frame2[k]) + for k in self.frame + } + ) + tm.assert_frame_equal(res3, exp) + + def test_ewmcov(self): + self._check_binary_ew("cov") + + def test_ewmcov_pairwise(self): + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) + + def test_ewmcorr(self): + 
self._check_binary_ew("corr") + + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) + + def _check_binary_ew(self, name): + def func(A, B, com, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + + result = func(A, B, 20, min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + # GH 7898 + for min_periods in (0, 1, 2): + result = func(A, B, 20, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + result = func(Series([]), Series([]), 50, min_periods=min_periods) + tm.assert_series_equal(result, Series([])) + + # check series of length 1 + result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) + tm.assert_series_equal(result, Series([np.NaN])) + + msg = "Input arrays must be of the same type!" + # exception raised is Exception + with pytest.raises(Exception, match=msg): + func(A, randn(50), 20, min_periods=5) + + def test_expanding_apply_args_kwargs(self, raw): + def mean_w_arg(x, const): + return np.mean(x) + const + + df = DataFrame(np.random.rand(20, 3)) + + expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = A.expanding().corr(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = self.series.expanding().count() + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) + + def test_expanding_quantile(self): + result = self.series.expanding().quantile(0.5) + + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) + + tm.assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = A.expanding().cov(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_cov_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_corr_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().cov(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().cov(s2) + expected = 
Series([None, None, None, 4.5]) + tm.assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().corr(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().corr(s2) + expected = Series([None, None, None, 1.0]) + tm.assert_series_equal(result, expected) + + def test_rolling_cov_diff_length(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.rolling(window=3, min_periods=2).cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.rolling(window=3, min_periods=2).cov(s2a) + tm.assert_series_equal(result, expected) + + def test_rolling_corr_diff_length(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.rolling(window=3, min_periods=2).corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.rolling(window=3, min_periods=2).corr(s2a) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "f", + [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(quantile=0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ], + ) + def test_rolling_functions_window_non_shrinkage(self, f): + # GH 7764 + s = Series(range(4)) + s_expected = Series(np.nan, index=s.index) + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) + df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) + + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) + except (ImportError): + + # scipy needed for rolling_window + pytest.skip("scipy not available") + + def test_rolling_functions_window_non_shrinkage_binary(self): + + # corr/cov return a MI DataFrame + df = DataFrame( + [[1, 5], [3, 2], [3, 9], [-1, 0]], + columns=Index(["A", "B"], name="foo"), + index=Index(range(4), name="bar"), + ) + df_expected = DataFrame( + columns=Index(["A", "B"], name="foo"), + index=pd.MultiIndex.from_product( + [df.index, df.columns], names=["bar", "foo"] + ), + dtype="float64", + ) + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: 
(x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + for f in functions: + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) + + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series() + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + functions = [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] + for f in functions: + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([]), + ) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + for f in functions: + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], 
columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_rolling_skew_edge_cases(self): + + all_nan = Series([np.NaN] * 5) + + # yields all NaN (0 variance) + d = Series([1] * 5) + x = d.rolling(window=5).skew() + tm.assert_series_equal(all_nan, x) + + # yields all NaN (window too small) + d = Series(np.random.randn(5)) + x = d.rolling(window=2).skew() + tm.assert_series_equal(all_nan, x) + + # yields [NaN, NaN, NaN, 0.177994, 1.548824] + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) + expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) + x = d.rolling(window=4).skew() + tm.assert_series_equal(expected, x) + + def test_rolling_kurt_edge_cases(self): + + all_nan = Series([np.NaN] * 5) + + # yields all NaN (0 variance) + d = Series([1] * 5) + x = d.rolling(window=5).kurt() + tm.assert_series_equal(all_nan, x) + + # yields all NaN (window too small) + d = Series(np.random.randn(5)) + x = d.rolling(window=3).kurt() + tm.assert_series_equal(all_nan, x) + + # yields [NaN, NaN, NaN, 1.224307, 2.671499] + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) + expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) + x = d.rolling(window=4).kurt() + tm.assert_series_equal(expected, x) + + def test_rolling_skew_eq_value_fperr(self): + # #18804 all rolling skew for all equal values should return Nan + a = Series([1.1] * 15).rolling(window=10).skew() + assert np.isnan(a).all() + + def test_rolling_kurt_eq_value_fperr(self): + # #18804 all rolling kurt for all equal values should return Nan + a = Series([1.1] * 15).rolling(window=10).kurt() + assert np.isnan(a).all() + 
+ @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) + def test_expanding_func(self, func, static_comp): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + return getattr(exp, func)() + + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + + def test_expanding_apply(self, raw): + def expanding_mean(x, min_periods=1): + + exp = x.expanding(min_periods=min_periods) + result = exp.apply(lambda x: x.mean(), raw=raw) + return result + + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) + + ser = Series([]) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + + # GH 8080 + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def _check_expanding( + self, + func, + static_comp, + has_min_periods=True, + has_time_rule=True, + preserve_nan=True, + ): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) + + if preserve_nan: + assert result.iloc[self._nan_locs].isna().all() + + ser = Series(randn(50)) + + if has_min_periods: + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + # min_periods is working correctly + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) + + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) + assert isna(result[3]) + assert notna(result[4]) + + # min_periods=0 + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + def test_rolling_max_gh6297(self): + """Replicate result expected in GH #6297""" + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 2 datapoints on one of the days + indices.append(datetime(1975, 1, 3, 6, 0)) + series = Series(range(1, 7), index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + expected = Series( + [1.0, 2.0, 6.0, 4.0, 5.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() + tm.assert_series_equal(expected, x) + + def test_rolling_max_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be max + expected = Series( + [0.0, 1.0, 2.0, 3.0, 20.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() + tm.assert_series_equal(expected, 
x) + + # Now specify median (10.0) + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).max() + tm.assert_series_equal(expected, x) + + # Now specify mean (4+10+20)/3 + v = (4.0 + 10.0 + 20.0) / 3.0 + expected = Series( + [0.0, 1.0, 2.0, 3.0, v], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").mean().rolling(window=1).max() + tm.assert_series_equal(expected, x) + + def test_rolling_min_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be min + expected = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + r = series.resample("D").min().rolling(window=1) + tm.assert_series_equal(expected, r.min()) + + def test_rolling_median_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be median + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).median() + tm.assert_series_equal(expected, x) + + def test_rolling_median_memory_error(self): + # GH11722 + n = 20000 + Series(np.random.randn(n)).rolling(window=2, center=False).median() + Series(np.random.randn(n)).rolling(window=2, center=False).median() + + def test_rolling_min_max_numeric_types(self): + + # GH12373 + types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] + types_test.extend( + [ + np.dtype("{}{}".format(sign, width)) + for width in [1, 2, 4, 8] + for sign in "ui" + ] + ) + for data_type in types_test: + # Just testing that these don't throw exceptions and that + # the return type is float64. 
Other tests will cover quantitative + # correctness + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max() + assert result.dtypes[0] == np.dtype("f8") + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() + assert result.dtypes[0] == np.dtype("f8") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py new file mode 100644 index 00000000000000..c7177e1d3914fd --- /dev/null +++ b/pandas/tests/window/test_rolling.py @@ -0,0 +1,328 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series +import pandas.core.window as rwindow +from pandas.tests.window.common import Base +import pandas.util.testing as tm + + +class TestRolling(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + # GH 12669 + + o = getattr(self, which) + c = o.rolling + + # valid + c(window=2) + c(window=2, min_periods=1) + c(window=2, min_periods=1, center=True) + c(window=2, min_periods=1, center=False) + + # GH 13383 + with pytest.raises(ValueError): + c(0) + c(-1) + + # not valid + for w in [2.0, "foo", np.array([2])]: + with pytest.raises(ValueError): + c(window=w) + with pytest.raises(ValueError): + c(window=2, min_periods=w) + with pytest.raises(ValueError): + c(window=2, min_periods=1, center=w) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor_with_win_type(self, which): + # GH 13383 + o = getattr(self, which) + c = o.rolling + with pytest.raises(ValueError): + c(-1, win_type="boxcar") + + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) + def test_constructor_with_timedelta_window(self, window): + # GH 15440 + n = 10 + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) + + result = df.rolling(window=window).sum() + expected = DataFrame( + {"value": expected_data}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + tm.assert_frame_equal(result, expected) + expected = df.rolling("3D").sum() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3), "3D"]) + def test_constructor_timedelta_window_and_minperiods(self, window, raw): + # GH 15305 + n = 10 + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) + expected = DataFrame( + {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) + result_roll_sum = df.rolling(window=window, min_periods=2).sum() + result_roll_generic = df.rolling(window=window, min_periods=2).apply( + sum, raw=raw + ) + tm.assert_frame_equal(result_roll_sum, expected) + tm.assert_frame_equal(result_roll_generic, expected) + + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + r = rwindow.Rolling(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + with 
pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, method)(dtype=np.float64) + + def test_closed(self): + df = DataFrame({"A": [0, 1, 2, 3, 4]}) + # closed only allowed for datetimelike + with pytest.raises(ValueError): + df.rolling(window=3, closed="neither") + + @pytest.mark.parametrize("closed", ["neither", "left"]) + def test_closed_empty(self, closed, arithmetic_win_operators): + # GH 26005 + func_name = arithmetic_win_operators + ser = pd.Series( + data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") + ) + roll = ser.rolling("1D", closed=closed) + + result = getattr(roll, func_name)() + expected = pd.Series([np.nan] * 5, index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["min", "max"]) + def test_closed_one_entry(self, func): + # GH24718 + ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) + result = getattr(ser.rolling("10D", closed="left"), func)() + tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) + + @pytest.mark.parametrize("func", ["min", "max"]) + def test_closed_one_entry_groupby(self, func): + # GH24718 + ser = pd.DataFrame( + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=pd.date_range("2000", periods=3), + ) + result = getattr( + ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func + )() + exp_idx = pd.MultiIndex.from_arrays( + arrays=[[1, 1, 2], ser.index], names=("A", None) + ) + expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_dtype", ["int", "float"]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ], + ) + def test_closed_min_max_datetime(self, input_dtype, func, closed, expected): + # see gh-21704 + ser = pd.Series( + data=np.arange(10).astype(input_dtype), + index=pd.date_range("2000", periods=10), + ) + + result = getattr(ser.rolling("3D", closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + def test_closed_uneven(self): + # see gh-21704 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + + # uneven + ser = ser.drop(index=ser.index[[1, 5]]) + result = ser.rolling("3D", closed="left").min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]), + ], + ) + def 
test_closed_min_max_minp(self, func, closed, expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + ser[ser.index[-3:]] = np.nan + result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "closed,expected", + [ + ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]), + ], + ) + def test_closed_median_quantile(self, closed, expected): + # GH 26005 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + roll = ser.rolling("3D", closed=closed) + expected = pd.Series(expected, index=ser.index) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.5) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("roller", ["1s", 1]) + def tests_empty_df_rolling(self, roller): + # GH 15819 Verifies that datetime and integer rolling windows can be + # applied to empty DataFrames + expected = DataFrame() + result = DataFrame().rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer rolling windows can be applied to + # empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + def test_empty_window_median_quantile(self): + # GH 26005 + expected = pd.Series([np.nan, np.nan, np.nan]) + roll = pd.Series(np.arange(3)).rolling(0) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.1) + tm.assert_series_equal(result, expected) + + def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 + # minp=0 + x = pd.Series([np.nan]) + result = x.rolling(1, min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.rolling(1, min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + + def test_missing_minp_zero_variable(self): + # https://github.com/pandas-dev/pandas/pull/18921 + x = pd.Series( + [np.nan] * 4, + index=pd.DatetimeIndex( + ["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"] + ), + ) + result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() + expected = pd.Series(0.0, index=x.index) + tm.assert_series_equal(result, expected) + + def test_multi_index_names(self): + + # GH 16789, 16825 + cols = pd.MultiIndex.from_product( + [["A", "B"], ["C", "D", "E"]], names=["1", "2"] + ) + df = DataFrame(np.ones((10, 6)), columns=cols) + result = df.rolling(3).cov() + + tm.assert_index_equal(result.columns, df.columns) + assert result.index.names == [None, "1", "2"] + + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) + def test_iter_raises(self, klass): + # https://github.com/pandas-dev/pandas/issues/11704 + # Iteration over a Window + obj = klass([1, 2, 3, 4]) + with pytest.raises(NotImplementedError): + iter(obj.rolling(2)) + + def test_rolling_axis_sum(self, axis_frame): + # see gh-23372. 
+ df = DataFrame(np.ones((10, 20))) + axis = df._get_axis_number(axis_frame) + + if axis == 0: + expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) + else: + # axis == 1 + expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) + + result = df.rolling(3, axis=axis_frame).sum() + tm.assert_frame_equal(result, expected) + + def test_rolling_axis_count(self, axis_frame): + # see gh-26055 + df = DataFrame({"x": range(3), "y": range(3)}) + + axis = df._get_axis_number(axis_frame) + + if axis in [0, "index"]: + expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) + else: + expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) + + result = df.rolling(2, axis=axis_frame).count() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 3945a8aaa8b87d..a6a56c98a93776 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -1,393 +1,13 @@ -from collections import OrderedDict -import copy -from datetime import datetime, timedelta -import warnings -from warnings import catch_warnings - import numpy as np -from numpy.random import randn import pytest from pandas.errors import UnsupportedFunctionCall import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna -from pandas.core.base import SpecificationError +from pandas import Series import pandas.core.window as rwindow -import pandas.util.testing as tm - -import pandas.tseries.offsets as offsets - -N, K = 100, 10 - - -class Base: - - _nan_locs = np.arange(20, 40) - _inf_locs = np.array([]) - - def _create_data(self): - arr = randn(N) - arr[self._nan_locs] = np.NaN - - self.arr = arr - self.rng = bdate_range(datetime(2009, 1, 1), periods=N) - self.series = Series(arr.copy(), index=self.rng) - self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) - - -class TestApi(Base): - def setup_method(self, method): - self._create_data() - - def test_getitem(self): - - r = self.frame.rolling(window=5) - tm.assert_index_equal(r._selected_obj.columns, self.frame.columns) - - r = self.frame.rolling(window=5)[1] - assert r._selected_obj.name == self.frame.columns[1] - - # technically this is allowed - r = self.frame.rolling(window=5)[1, 3] - tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) - - r = self.frame.rolling(window=5)[[1, 3]] - tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) - - def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=["A", "B"]) - g = df.rolling(window=5) - with pytest.raises(KeyError, match="Columns not found: 'C'"): - g[["C"]] - with pytest.raises(KeyError, match="^[^A]+$"): - # A should not be referenced as a bad column... - # will have to rethink regex if you change message! 
- g[["A", "C"]] - - def test_attribute_access(self): - - df = DataFrame([[1, 2]], columns=["A", "B"]) - r = df.rolling(window=5) - tm.assert_series_equal(r.A.sum(), r["A"].sum()) - msg = "'Rolling' object has no attribute 'F'" - with pytest.raises(AttributeError, match=msg): - r.F - - def tests_skip_nuisance(self): - - df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) - r = df.rolling(window=3) - result = r[["A", "B"]].sum() - expected = DataFrame( - {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, - columns=list("AB"), - ) - tm.assert_frame_equal(result, expected) - - def test_skip_sum_object_raises(self): - df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) - r = df.rolling(window=3) - result = r.sum() - expected = DataFrame( - {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, - columns=list("AB"), - ) - tm.assert_frame_equal(result, expected) - - def test_agg(self): - df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) - - r = df.rolling(window=3) - a_mean = r["A"].mean() - a_std = r["A"].std() - a_sum = r["A"].sum() - b_mean = r["B"].mean() - b_std = r["B"].std() - b_sum = r["B"].sum() - - result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) - tm.assert_frame_equal(result, expected) - - result = r.aggregate({"A": np.mean, "B": np.std}) - - expected = concat([a_mean, b_std], axis=1) - tm.assert_frame_equal(result, expected, check_like=True) - - result = r.aggregate({"A": ["mean", "std"]}) - expected = concat([a_mean, a_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) - tm.assert_frame_equal(result, expected) - - result = r["A"].aggregate(["mean", "sum"]) - expected = concat([a_mean, a_sum], axis=1) - expected.columns = ["mean", "sum"] - tm.assert_frame_equal(result, expected) - - with catch_warnings(record=True): - # using a dict with renaming - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) - tm.assert_frame_equal(result, expected, check_like=True) - - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate( - { - "A": {"mean": "mean", "sum": "sum"}, - "B": {"mean2": "mean", "sum2": "sum"}, - } - ) - expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - expected.columns = pd.MultiIndex.from_tuples(exp_cols) - tm.assert_frame_equal(result, expected, check_like=True) - - result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - - exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] - expected.columns = pd.MultiIndex.from_tuples(exp_cols) - tm.assert_frame_equal(result, expected, check_like=True) - - def test_agg_apply(self, raw): - - # passed lambda - df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) - - r = df.rolling(window=3) - a_sum = r["A"].sum() - - result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) - rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw) - expected = concat([a_sum, rcustom], axis=1) - tm.assert_frame_equal(result, expected, check_like=True) - - def test_agg_consistency(self): - - df = DataFrame({"A": range(5), "B": 
range(0, 10, 2)}) - r = df.rolling(window=3) - - result = r.agg([np.sum, np.mean]).columns - expected = pd.MultiIndex.from_product([list("AB"), ["sum", "mean"]]) - tm.assert_index_equal(result, expected) - - result = r["A"].agg([np.sum, np.mean]).columns - expected = Index(["sum", "mean"]) - tm.assert_index_equal(result, expected) - - result = r.agg({"A": [np.sum, np.mean]}).columns - expected = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "mean")]) - tm.assert_index_equal(result, expected) - - def test_agg_nested_dicts(self): - - # API change for disallowing these types of nested dicts - df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) - r = df.rolling(window=3) - - msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" - with pytest.raises(SpecificationError, match=msg): - r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) - - expected = concat( - [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r[["A", "B"]].agg( - {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} - ) - tm.assert_frame_equal(result, expected, check_like=True) - - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("A", "ra", "mean"), - ("A", "ra", "std"), - ("B", "rb", "mean"), - ("B", "rb", "std"), - ] - ) - tm.assert_frame_equal(result, expected, check_like=True) - - def test_count_nonnumeric_types(self): - # GH12541 - cols = [ - "int", - "float", - "string", - "datetime", - "timedelta", - "periods", - "fl_inf", - "fl_nan", - "str_nan", - "dt_nat", - "periods_nat", - ] - - df = DataFrame( - { - "int": [1, 2, 3], - "float": [4.0, 5.0, 6.0], - "string": list("abc"), - "datetime": pd.date_range("20170101", periods=3), - "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), - "periods": [ - pd.Period("2012-01"), - pd.Period("2012-02"), - pd.Period("2012-03"), - ], - "fl_inf": [1.0, 2.0, np.Inf], - "fl_nan": [1.0, 2.0, np.NaN], - "str_nan": ["aa", "bb", np.NaN], - "dt_nat": [ - Timestamp("20170101"), - Timestamp("20170203"), - Timestamp(None), - ], - "periods_nat": [ - pd.Period("2012-01"), - pd.Period("2012-02"), - pd.Period(None), - ], - }, - columns=cols, - ) - - expected = DataFrame( - { - "int": [1.0, 2.0, 2.0], - "float": [1.0, 2.0, 2.0], - "string": [1.0, 2.0, 2.0], - "datetime": [1.0, 2.0, 2.0], - "timedelta": [1.0, 2.0, 2.0], - "periods": [1.0, 2.0, 2.0], - "fl_inf": [1.0, 2.0, 2.0], - "fl_nan": [1.0, 2.0, 1.0], - "str_nan": [1.0, 2.0, 1.0], - "dt_nat": [1.0, 2.0, 1.0], - "periods_nat": [1.0, 2.0, 1.0], - }, - columns=cols, - ) - - result = df.rolling(window=2).count() - tm.assert_frame_equal(result, expected) - - result = df.rolling(1).count() - expected = df.notna().astype(float) - tm.assert_frame_equal(result, expected) - - @td.skip_if_no_scipy - @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") - def test_window_with_args(self): - # make sure that we are aggregating window functions correctly with arg - r = Series(np.random.randn(100)).rolling( - window=10, min_periods=1, win_type="gaussian" - ) - expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) - expected.columns = ["", ""] - result = r.aggregate([lambda x: x.mean(std=10), 
lambda x: x.mean(std=0.01)]) - tm.assert_frame_equal(result, expected) - - def a(x): - return x.mean(std=10) - - def b(x): - return x.mean(std=0.01) - - expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) - expected.columns = ["a", "b"] - result = r.aggregate([a, b]) - tm.assert_frame_equal(result, expected) - - def test_preserve_metadata(self): - # GH 10565 - s = Series(np.arange(100), name="foo") - - s2 = s.rolling(30).sum() - s3 = s.rolling(20).sum() - assert s2.name == "foo" - assert s3.name == "foo" - - @pytest.mark.parametrize( - "func,window_size,expected_vals", - [ - ( - "rolling", - 2, - [ - [np.nan, np.nan, np.nan, np.nan], - [15.0, 20.0, 25.0, 20.0], - [25.0, 30.0, 35.0, 30.0], - [np.nan, np.nan, np.nan, np.nan], - [20.0, 30.0, 35.0, 30.0], - [35.0, 40.0, 60.0, 40.0], - [60.0, 80.0, 85.0, 80], - ], - ), - ( - "expanding", - None, - [ - [10.0, 10.0, 20.0, 20.0], - [15.0, 20.0, 25.0, 20.0], - [20.0, 30.0, 30.0, 20.0], - [10.0, 10.0, 30.0, 30.0], - [20.0, 30.0, 35.0, 30.0], - [26.666667, 40.0, 50.0, 30.0], - [40.0, 80.0, 60.0, 30.0], - ], - ), - ], - ) - def test_multiple_agg_funcs(self, func, window_size, expected_vals): - # GH 15072 - df = pd.DataFrame( - [ - ["A", 10, 20], - ["A", 20, 30], - ["A", 30, 40], - ["B", 10, 30], - ["B", 30, 40], - ["B", 40, 80], - ["B", 80, 90], - ], - columns=["stock", "low", "high"], - ) - - f = getattr(df.groupby("stock"), func) - if window_size: - window = f(window_size) - else: - window = f() - - index = pd.MultiIndex.from_tuples( - [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)], - names=["stock", None], - ) - columns = pd.MultiIndex.from_tuples( - [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] - ) - expected = pd.DataFrame(expected_vals, index=index, columns=columns) - - result = window.agg( - OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) - ) - - tm.assert_frame_equal(result, expected) +from pandas.tests.window.common import Base @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") @@ -454,3200 +74,3 @@ def test_agg_function_support(self, arg): with pytest.raises(AttributeError, match=msg): roll.agg({"A": arg}) - - -class TestRolling(Base): - def setup_method(self, method): - self._create_data() - - def test_doc_string(self): - - df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) - df - df.rolling(2).sum() - df.rolling(2, min_periods=1).sum() - - @pytest.mark.parametrize("which", ["series", "frame"]) - def test_constructor(self, which): - # GH 12669 - - o = getattr(self, which) - c = o.rolling - - # valid - c(window=2) - c(window=2, min_periods=1) - c(window=2, min_periods=1, center=True) - c(window=2, min_periods=1, center=False) - - # GH 13383 - with pytest.raises(ValueError): - c(0) - c(-1) - - # not valid - for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError): - c(window=w) - with pytest.raises(ValueError): - c(window=2, min_periods=w) - with pytest.raises(ValueError): - c(window=2, min_periods=1, center=w) - - @td.skip_if_no_scipy - @pytest.mark.parametrize("which", ["series", "frame"]) - def test_constructor_with_win_type(self, which): - # GH 13383 - o = getattr(self, which) - c = o.rolling - with pytest.raises(ValueError): - c(-1, win_type="boxcar") - - @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) - def test_constructor_with_timedelta_window(self, window): - # GH 15440 - n = 10 - df = DataFrame( - {"value": np.arange(n)}, - index=pd.date_range("2015-12-24", periods=n, freq="D"), - ) - expected_data 
= np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) - - result = df.rolling(window=window).sum() - expected = DataFrame( - {"value": expected_data}, - index=pd.date_range("2015-12-24", periods=n, freq="D"), - ) - tm.assert_frame_equal(result, expected) - expected = df.rolling("3D").sum() - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3), "3D"]) - def test_constructor_timedelta_window_and_minperiods(self, window, raw): - # GH 15305 - n = 10 - df = DataFrame( - {"value": np.arange(n)}, - index=pd.date_range("2017-08-08", periods=n, freq="D"), - ) - expected = DataFrame( - {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, - index=pd.date_range("2017-08-08", periods=n, freq="D"), - ) - result_roll_sum = df.rolling(window=window, min_periods=2).sum() - result_roll_generic = df.rolling(window=window, min_periods=2).apply( - sum, raw=raw - ) - tm.assert_frame_equal(result_roll_sum, expected) - tm.assert_frame_equal(result_roll_generic, expected) - - @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) - def test_numpy_compat(self, method): - # see gh-12811 - r = rwindow.Rolling(Series([2, 4, 6]), window=2) - - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, method)(dtype=np.float64) - - def test_closed(self): - df = DataFrame({"A": [0, 1, 2, 3, 4]}) - # closed only allowed for datetimelike - with pytest.raises(ValueError): - df.rolling(window=3, closed="neither") - - @pytest.mark.parametrize("closed", ["neither", "left"]) - def test_closed_empty(self, closed, arithmetic_win_operators): - # GH 26005 - func_name = arithmetic_win_operators - ser = pd.Series( - data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") - ) - roll = ser.rolling("1D", closed=closed) - - result = getattr(roll, func_name)() - expected = pd.Series([np.nan] * 5, index=ser.index) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("func", ["min", "max"]) - def test_closed_one_entry(self, func): - # GH24718 - ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) - result = getattr(ser.rolling("10D", closed="left"), func)() - tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) - - @pytest.mark.parametrize("func", ["min", "max"]) - def test_closed_one_entry_groupby(self, func): - # GH24718 - ser = pd.DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, - index=pd.date_range("2000", periods=3), - ) - result = getattr( - ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func - )() - exp_idx = pd.MultiIndex.from_arrays( - arrays=[[1, 1, 2], ser.index], names=("A", None) - ) - expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("input_dtype", ["int", "float"]) - @pytest.mark.parametrize( - "func,closed,expected", - [ - ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), - ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), - ], - ) - def 
test_closed_min_max_datetime(self, input_dtype, func, closed, expected): - # see gh-21704 - ser = pd.Series( - data=np.arange(10).astype(input_dtype), - index=pd.date_range("2000", periods=10), - ) - - result = getattr(ser.rolling("3D", closed=closed), func)() - expected = pd.Series(expected, index=ser.index) - tm.assert_series_equal(result, expected) - - def test_closed_uneven(self): - # see gh-21704 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) - - # uneven - ser = ser.drop(index=ser.index[[1, 5]]) - result = ser.rolling("3D", closed="left").min() - expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "func,closed,expected", - [ - ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), - ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), - ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), - ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]), - ], - ) - def test_closed_min_max_minp(self, func, closed, expected): - # see gh-21704 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) - ser[ser.index[-3:]] = np.nan - result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() - expected = pd.Series(expected, index=ser.index) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "closed,expected", - [ - ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), - ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]), - ], - ) - def test_closed_median_quantile(self, closed, expected): - # GH 26005 - ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) - roll = ser.rolling("3D", closed=closed) - expected = pd.Series(expected, index=ser.index) - - result = roll.median() - tm.assert_series_equal(result, expected) - - result = roll.quantile(0.5) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("roller", ["1s", 1]) - def tests_empty_df_rolling(self, roller): - # GH 15819 Verifies that datetime and integer rolling windows can be - # applied to empty DataFrames - expected = DataFrame() - result = DataFrame().rolling(roller).sum() - tm.assert_frame_equal(result, expected) - - # Verifies that datetime and integer rolling windows can be applied to - # empty DataFrames with datetime index - expected = DataFrame(index=pd.DatetimeIndex([])) - result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() - tm.assert_frame_equal(result, expected) - - def test_empty_window_median_quantile(self): - # GH 26005 - expected = pd.Series([np.nan, np.nan, np.nan]) - roll = pd.Series(np.arange(3)).rolling(0) - - result = roll.median() - tm.assert_series_equal(result, expected) - - result = roll.quantile(0.1) - tm.assert_series_equal(result, expected) - - def test_missing_minp_zero(self): - # https://github.com/pandas-dev/pandas/pull/18921 - # minp=0 - x = pd.Series([np.nan]) - result = x.rolling(1, min_periods=0).sum() - expected = pd.Series([0.0]) - tm.assert_series_equal(result, expected) - - # minp=1 - result = x.rolling(1, min_periods=1).sum() - expected = 
pd.Series([np.nan]) - tm.assert_series_equal(result, expected) - - def test_missing_minp_zero_variable(self): - # https://github.com/pandas-dev/pandas/pull/18921 - x = pd.Series( - [np.nan] * 4, - index=pd.DatetimeIndex( - ["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"] - ), - ) - result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() - expected = pd.Series(0.0, index=x.index) - tm.assert_series_equal(result, expected) - - def test_multi_index_names(self): - - # GH 16789, 16825 - cols = pd.MultiIndex.from_product( - [["A", "B"], ["C", "D", "E"]], names=["1", "2"] - ) - df = DataFrame(np.ones((10, 6)), columns=cols) - result = df.rolling(3).cov() - - tm.assert_index_equal(result.columns, df.columns) - assert result.index.names == [None, "1", "2"] - - @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) - def test_iter_raises(self, klass): - # https://github.com/pandas-dev/pandas/issues/11704 - # Iteration over a Window - obj = klass([1, 2, 3, 4]) - with pytest.raises(NotImplementedError): - iter(obj.rolling(2)) - - def test_rolling_axis_sum(self, axis_frame): - # see gh-23372. - df = DataFrame(np.ones((10, 20))) - axis = df._get_axis_number(axis_frame) - - if axis == 0: - expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) - else: - # axis == 1 - expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) - - result = df.rolling(3, axis=axis_frame).sum() - tm.assert_frame_equal(result, expected) - - def test_rolling_axis_count(self, axis_frame): - # see gh-26055 - df = DataFrame({"x": range(3), "y": range(3)}) - - axis = df._get_axis_number(axis_frame) - - if axis in [0, "index"]: - expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) - else: - expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) - - result = df.rolling(2, axis=axis_frame).count() - tm.assert_frame_equal(result, expected) - - -class TestExpanding(Base): - def setup_method(self, method): - self._create_data() - - def test_doc_string(self): - - df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) - df - df.expanding(2).sum() - - @pytest.mark.parametrize("which", ["series", "frame"]) - def test_constructor(self, which): - # GH 12669 - - o = getattr(self, which) - c = o.expanding - - # valid - c(min_periods=1) - c(min_periods=1, center=True) - c(min_periods=1, center=False) - - # not valid - for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError): - c(min_periods=w) - with pytest.raises(ValueError): - c(min_periods=1, center=w) - - @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) - def test_numpy_compat(self, method): - # see gh-12811 - e = rwindow.Expanding(Series([2, 4, 6]), window=2) - - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(dtype=np.float64) - - @pytest.mark.parametrize( - "expander", - [ - 1, - pytest.param( - "ls", - marks=pytest.mark.xfail( - reason="GH#16425 expanding with offset not supported" - ), - ), - ], - ) - def test_empty_df_expanding(self, expander): - # GH 15819 Verifies that datetime and integer expanding windows can be - # applied to empty DataFrames - - expected = DataFrame() - result = DataFrame().expanding(expander).sum() - tm.assert_frame_equal(result, expected) - - # Verifies that datetime and integer expanding windows can be applied - # to empty DataFrames with datetime index - expected = 
DataFrame(index=pd.DatetimeIndex([])) - result = DataFrame(index=pd.DatetimeIndex([])).expanding(expander).sum() - tm.assert_frame_equal(result, expected) - - def test_missing_minp_zero(self): - # https://github.com/pandas-dev/pandas/pull/18921 - # minp=0 - x = pd.Series([np.nan]) - result = x.expanding(min_periods=0).sum() - expected = pd.Series([0.0]) - tm.assert_series_equal(result, expected) - - # minp=1 - result = x.expanding(min_periods=1).sum() - expected = pd.Series([np.nan]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) - def test_iter_raises(self, klass): - # https://github.com/pandas-dev/pandas/issues/11704 - # Iteration over a Window - obj = klass([1, 2, 3, 4]) - with pytest.raises(NotImplementedError): - iter(obj.expanding(2)) - - def test_expanding_axis(self, axis_frame): - # see gh-23372. - df = DataFrame(np.ones((10, 20))) - axis = df._get_axis_number(axis_frame) - - if axis == 0: - expected = DataFrame( - {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} - ) - else: - # axis == 1 - expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) - - result = df.expanding(3, axis=axis_frame).sum() - tm.assert_frame_equal(result, expected) - - -class TestEWM(Base): - def setup_method(self, method): - self._create_data() - - def test_doc_string(self): - - df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) - df - df.ewm(com=0.5).mean() - - @pytest.mark.parametrize("which", ["series", "frame"]) - def test_constructor(self, which): - o = getattr(self, which) - c = o.ewm - - # valid - c(com=0.5) - c(span=1.5) - c(alpha=0.5) - c(halflife=0.75) - c(com=0.5, span=None) - c(alpha=0.5, com=None) - c(halflife=0.75, alpha=None) - - # not valid: mutually exclusive - with pytest.raises(ValueError): - c(com=0.5, alpha=0.5) - with pytest.raises(ValueError): - c(span=1.5, halflife=0.75) - with pytest.raises(ValueError): - c(alpha=0.5, span=1.5) - - # not valid: com < 0 - with pytest.raises(ValueError): - c(com=-0.5) - - # not valid: span < 1 - with pytest.raises(ValueError): - c(span=0.5) - - # not valid: halflife <= 0 - with pytest.raises(ValueError): - c(halflife=0) - - # not valid: alpha <= 0 or alpha > 1 - for alpha in (-0.5, 1.5): - with pytest.raises(ValueError): - c(alpha=alpha) - - @pytest.mark.parametrize("method", ["std", "mean", "var"]) - def test_numpy_compat(self, method): - # see gh-12811 - e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) - - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(dtype=np.float64) - - -@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") -class TestMoments(Base): - def setup_method(self, method): - self._create_data() - - def test_centered_axis_validation(self): - - # ok - Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean() - - # bad axis - with pytest.raises(ValueError): - Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() - - # ok ok - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=0).mean() - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() - - # bad axis - with pytest.raises(ValueError): - (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) - - def test_rolling_sum(self, raw): - self._check_moment_func( - np.nansum, name="sum", zero_min_periods_equal=False, raw=raw - ) 
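
Note on the behaviour exercised by the EWM constructor tests removed above: com, span, halflife and alpha are mutually exclusive ways of writing the same smoothing factor, roughly alpha = 1/(1 + com) = 2/(span + 1) = 1 - exp(log(0.5)/halflife), and supplying more than one of them raises ValueError. A minimal sketch of that equivalence using only the public pandas API (illustrative only, not taken from this diff):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(50, dtype="float64"))

    # com=9.5 and span=20 both describe alpha = 1/10.5, so the means agree
    a = s.ewm(com=9.5).mean()
    b = s.ewm(span=20).mean()
    pd.testing.assert_series_equal(a, b)

    # the decay parameters are mutually exclusive: passing two raises
    try:
        s.ewm(com=9.5, span=20)
    except ValueError:
        pass
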
- - def test_rolling_count(self, raw): - counter = lambda x: np.isfinite(x).astype(float).sum() - self._check_moment_func( - counter, name="count", has_min_periods=False, fill_value=0, raw=raw - ) - - def test_rolling_mean(self, raw): - self._check_moment_func(np.mean, name="mean", raw=raw) - - @td.skip_if_no_scipy - def test_cmov_mean(self): - # GH 8238 - vals = np.array( - [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] - ) - result = Series(vals).rolling(5, center=True).mean() - expected = Series( - [ - np.nan, - np.nan, - 9.962, - 11.27, - 11.564, - 12.516, - 12.818, - 12.952, - np.nan, - np.nan, - ] - ) - tm.assert_series_equal(expected, result) - - @td.skip_if_no_scipy - def test_cmov_window(self): - # GH 8238 - vals = np.array( - [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] - ) - result = Series(vals).rolling(5, win_type="boxcar", center=True).mean() - expected = Series( - [ - np.nan, - np.nan, - 9.962, - 11.27, - 11.564, - 12.516, - 12.818, - 12.952, - np.nan, - np.nan, - ] - ) - tm.assert_series_equal(expected, result) - - @td.skip_if_no_scipy - def test_cmov_window_corner(self): - # GH 8238 - # all nan - vals = pd.Series([np.nan] * 10) - result = vals.rolling(5, center=True, win_type="boxcar").mean() - assert np.isnan(result).all() - - # empty - vals = pd.Series([]) - result = vals.rolling(5, center=True, win_type="boxcar").mean() - assert len(result) == 0 - - # shorter than window - vals = pd.Series(np.random.randn(5)) - result = vals.rolling(10, win_type="boxcar").mean() - assert np.isnan(result).all() - assert len(result) == 5 - - @td.skip_if_no_scipy - def test_cmov_window_frame(self): - # Gh 8238 - vals = np.array( - [ - [12.18, 3.64], - [10.18, 9.16], - [13.24, 14.61], - [4.51, 8.11], - [6.15, 11.44], - [9.14, 6.21], - [11.31, 10.67], - [2.94, 6.51], - [9.42, 8.39], - [12.44, 7.34], - ] - ) - - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [9.252, 9.392], - [8.644, 9.906], - [8.87, 10.208], - [6.81, 8.588], - [7.792, 8.644], - [9.05, 7.824], - [np.nan, np.nan], - [np.nan, np.nan], - ] - ) - - # DataFrame - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() - tm.assert_frame_equal(DataFrame(xp), rs) - - # invalid method - with pytest.raises(AttributeError): - (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) - - # sum - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [46.26, 46.96], - [43.22, 49.53], - [44.35, 51.04], - [34.05, 42.94], - [38.96, 43.22], - [45.25, 39.12], - [np.nan, np.nan], - [np.nan, np.nan], - ] - ) - - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() - tm.assert_frame_equal(DataFrame(xp), rs) - - @td.skip_if_no_scipy - def test_cmov_window_na_min_periods(self): - # min_periods - vals = Series(np.random.randn(10)) - vals[4] = np.nan - vals[8] = np.nan - - xp = vals.rolling(5, min_periods=4, center=True).mean() - rs = vals.rolling(5, win_type="boxcar", min_periods=4, center=True).mean() - tm.assert_series_equal(xp, rs) - - @td.skip_if_no_scipy - def test_cmov_window_regular(self, win_types): - # GH 8238 - vals = np.array( - [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] - ) - xps = { - "hamming": [ - np.nan, - np.nan, - 8.71384, - 9.56348, - 12.38009, - 14.03687, - 13.8567, - 11.81473, - np.nan, - np.nan, - ], - "triang": [ - np.nan, - np.nan, - 9.28667, - 10.34667, - 12.00556, - 13.33889, - 13.38, - 12.33667, - np.nan, - np.nan, - ], - "barthann": [ - np.nan, - np.nan, - 8.4425, - 9.1925, - 12.5575, - 14.3675, - 
14.0825, - 11.5675, - np.nan, - np.nan, - ], - "bohman": [ - np.nan, - np.nan, - 7.61599, - 9.1764, - 12.83559, - 14.17267, - 14.65923, - 11.10401, - np.nan, - np.nan, - ], - "blackmanharris": [ - np.nan, - np.nan, - 6.97691, - 9.16438, - 13.05052, - 14.02156, - 15.10512, - 10.74574, - np.nan, - np.nan, - ], - "nuttall": [ - np.nan, - np.nan, - 7.04618, - 9.16786, - 13.02671, - 14.03559, - 15.05657, - 10.78514, - np.nan, - np.nan, - ], - "blackman": [ - np.nan, - np.nan, - 7.73345, - 9.17869, - 12.79607, - 14.20036, - 14.57726, - 11.16988, - np.nan, - np.nan, - ], - "bartlett": [ - np.nan, - np.nan, - 8.4425, - 9.1925, - 12.5575, - 14.3675, - 14.0825, - 11.5675, - np.nan, - np.nan, - ], - } - - xp = Series(xps[win_types]) - rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() - tm.assert_series_equal(xp, rs) - - @td.skip_if_no_scipy - def test_cmov_window_regular_linear_range(self, win_types): - # GH 8238 - vals = np.array(range(10), dtype=np.float) - xp = vals.copy() - xp[:2] = np.nan - xp[-2:] = np.nan - xp = Series(xp) - - rs = Series(vals).rolling(5, win_type=win_types, center=True).mean() - tm.assert_series_equal(xp, rs) - - @td.skip_if_no_scipy - def test_cmov_window_regular_missing_data(self, win_types): - # GH 8238 - vals = np.array( - [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] - ) - xps = { - "bartlett": [ - np.nan, - np.nan, - 9.70333, - 10.5225, - 8.4425, - 9.1925, - 12.5575, - 14.3675, - 15.61667, - 13.655, - ], - "blackman": [ - np.nan, - np.nan, - 9.04582, - 11.41536, - 7.73345, - 9.17869, - 12.79607, - 14.20036, - 15.8706, - 13.655, - ], - "barthann": [ - np.nan, - np.nan, - 9.70333, - 10.5225, - 8.4425, - 9.1925, - 12.5575, - 14.3675, - 15.61667, - 13.655, - ], - "bohman": [ - np.nan, - np.nan, - 8.9444, - 11.56327, - 7.61599, - 9.1764, - 12.83559, - 14.17267, - 15.90976, - 13.655, - ], - "hamming": [ - np.nan, - np.nan, - 9.59321, - 10.29694, - 8.71384, - 9.56348, - 12.38009, - 14.20565, - 15.24694, - 13.69758, - ], - "nuttall": [ - np.nan, - np.nan, - 8.47693, - 12.2821, - 7.04618, - 9.16786, - 13.02671, - 14.03673, - 16.08759, - 13.65553, - ], - "triang": [ - np.nan, - np.nan, - 9.33167, - 9.76125, - 9.28667, - 10.34667, - 12.00556, - 13.82125, - 14.49429, - 13.765, - ], - "blackmanharris": [ - np.nan, - np.nan, - 8.42526, - 12.36824, - 6.97691, - 9.16438, - 13.05052, - 14.02175, - 16.1098, - 13.65509, - ], - } - - xp = Series(xps[win_types]) - rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean() - tm.assert_series_equal(xp, rs) - - @td.skip_if_no_scipy - def test_cmov_window_special(self, win_types_special): - # GH 8238 - kwds = { - "kaiser": {"beta": 1.0}, - "gaussian": {"std": 1.0}, - "general_gaussian": {"power": 2.0, "width": 2.0}, - "exponential": {"tau": 10}, - } - - vals = np.array( - [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] - ) - - xps = { - "gaussian": [ - np.nan, - np.nan, - 8.97297, - 9.76077, - 12.24763, - 13.89053, - 13.65671, - 12.01002, - np.nan, - np.nan, - ], - "general_gaussian": [ - np.nan, - np.nan, - 9.85011, - 10.71589, - 11.73161, - 13.08516, - 12.95111, - 12.74577, - np.nan, - np.nan, - ], - "kaiser": [ - np.nan, - np.nan, - 9.86851, - 11.02969, - 11.65161, - 12.75129, - 12.90702, - 12.83757, - np.nan, - np.nan, - ], - "exponential": [ - np.nan, - np.nan, - 9.83364, - 11.10472, - 11.64551, - 12.66138, - 12.92379, - 12.83770, - np.nan, - np.nan, - ], - } - - xp = Series(xps[win_types_special]) - rs = ( - Series(vals) - .rolling(5, win_type=win_types_special, 
center=True) - .mean(**kwds[win_types_special]) - ) - tm.assert_series_equal(xp, rs) - - @td.skip_if_no_scipy - def test_cmov_window_special_linear_range(self, win_types_special): - # GH 8238 - kwds = { - "kaiser": {"beta": 1.0}, - "gaussian": {"std": 1.0}, - "general_gaussian": {"power": 2.0, "width": 2.0}, - "slepian": {"width": 0.5}, - "exponential": {"tau": 10}, - } - - vals = np.array(range(10), dtype=np.float) - xp = vals.copy() - xp[:2] = np.nan - xp[-2:] = np.nan - xp = Series(xp) - - rs = ( - Series(vals) - .rolling(5, win_type=win_types_special, center=True) - .mean(**kwds[win_types_special]) - ) - tm.assert_series_equal(xp, rs) - - def test_rolling_median(self, raw): - self._check_moment_func(np.median, name="median", raw=raw) - - def test_rolling_min(self, raw): - self._check_moment_func(np.min, name="min", raw=raw) - - a = pd.Series([1, 2, 3, 4, 5]) - result = a.rolling(window=100, min_periods=1).min() - expected = pd.Series(np.ones(len(a))) - tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError): - pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() - - def test_rolling_max(self, raw): - self._check_moment_func(np.max, name="max", raw=raw) - - a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) - b = a.rolling(window=100, min_periods=1).max() - tm.assert_almost_equal(a, b) - - with pytest.raises(ValueError): - pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() - - @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) - def test_rolling_quantile(self, q, raw): - def scoreatpercentile(a, per): - values = np.sort(a, axis=0) - - idx = int(per / 1.0 * (values.shape[0] - 1)) - - if idx == values.shape[0] - 1: - retval = values[-1] - - else: - qlow = float(idx) / float(values.shape[0] - 1) - qhig = float(idx + 1) / float(values.shape[0] - 1) - vlow = values[idx] - vhig = values[idx + 1] - retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) - - return retval - - def quantile_func(x): - return scoreatpercentile(x, q) - - self._check_moment_func(quantile_func, name="quantile", quantile=q, raw=raw) - - def test_rolling_quantile_np_percentile(self): - # #9413: Tests that rolling window's quantile default behavior - # is analogous to Numpy's percentile - row = 10 - col = 5 - idx = pd.date_range("20100101", periods=row, freq="B") - df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx) - - df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) - np_percentile = np.percentile(df, [25, 50, 75], axis=0) - - tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) - - @pytest.mark.parametrize("quantile", [0.0, 0.1, 0.45, 0.5, 1]) - @pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] - ) - @pytest.mark.parametrize( - "data", - [ - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], - [8.0, 1.0, 3.0, 4.0, 5.0, 2.0, 6.0, 7.0], - [0.0, np.nan, 0.2, np.nan, 0.4], - [np.nan, np.nan, np.nan, np.nan], - [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], - [0.5], - [np.nan, 0.7, 0.6], - ], - ) - def test_rolling_quantile_interpolation_options( - self, quantile, interpolation, data - ): - # Tests that rolling window's quantile behavior is analogous to - # Series' quantile for each interpolation option - s = Series(data) - - q1 = s.quantile(quantile, interpolation) - q2 = s.expanding(min_periods=1).quantile(quantile, interpolation).iloc[-1] - - if np.isnan(q1): - assert np.isnan(q2) - else: - assert q1 == q2 - - def test_invalid_quantile_value(self): - data = np.arange(5) - s = Series(data) - - msg = 
"Interpolation 'invalid' is not supported" - with pytest.raises(ValueError, match=msg): - s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid") - - def test_rolling_quantile_param(self): - ser = Series([0.0, 0.1, 0.5, 0.9, 1.0]) - - with pytest.raises(ValueError): - ser.rolling(3).quantile(-0.1) - - with pytest.raises(ValueError): - ser.rolling(3).quantile(10.0) - - with pytest.raises(TypeError): - ser.rolling(3).quantile("foo") - - def test_rolling_apply(self, raw): - # suppress warnings about empty slices, as we are deliberately testing - # with a 0-length Series - - def f(x): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - return x[np.isfinite(x)].mean() - - self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - - expected = Series([]) - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) - - # gh-8080 - s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 2.0]) - tm.assert_series_equal(result, expected) - - result = s.rolling(2, min_periods=0).apply(len, raw=raw) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("klass", [Series, DataFrame]) - @pytest.mark.parametrize( - "method", [lambda x: x.rolling(window=2), lambda x: x.expanding()] - ) - def test_apply_future_warning(self, klass, method): - - # gh-5071 - s = klass(np.arange(3)) - - with tm.assert_produces_warning(FutureWarning): - method(s).apply(lambda x: len(x)) - - def test_rolling_apply_out_of_bounds(self, raw): - # gh-1850 - vals = pd.Series([1, 2, 3, 4]) - - result = vals.rolling(10).apply(np.sum, raw=raw) - assert result.isna().all() - - result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) - expected = pd.Series([1, 3, 6, 10], dtype=float) - tm.assert_almost_equal(result, expected) - - @pytest.mark.parametrize("window", [2, "2s"]) - def test_rolling_apply_with_pandas_objects(self, window): - # 5071 - df = pd.DataFrame( - {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, - index=pd.date_range("20130101", periods=5, freq="s"), - ) - - # we have an equal spaced timeseries index - # so simulate removing the first period - def f(x): - if x.index[0] == df.index[0]: - return np.nan - return x.iloc[-1] - - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) - tm.assert_frame_equal(result, expected) - - with pytest.raises(AttributeError): - df.rolling(window).apply(f, raw=True) - - def test_rolling_std(self, raw): - self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) - self._check_moment_func( - lambda x: np.std(x, ddof=0), name="std", ddof=0, raw=raw - ) - - def test_rolling_std_1obs(self): - vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) - - result = vals.rolling(1, min_periods=1).std() - expected = pd.Series([np.nan] * 5) - tm.assert_series_equal(result, expected) - - result = vals.rolling(1, min_periods=1).std(ddof=0) - expected = pd.Series([0.0] * 5) - tm.assert_series_equal(result, expected) - - result = pd.Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() - assert np.isnan(result[2]) - - def test_rolling_std_neg_sqrt(self): - # unit test from Bottleneck - - # Test move_nanstd for neg sqrt. 
- - a = pd.Series( - [ - 0.0011448196318903589, - 0.00028718669878572767, - 0.00028718669878572767, - 0.00028718669878572767, - 0.00028718669878572767, - ] - ) - b = a.rolling(window=3).std() - assert np.isfinite(b[2:]).all() - - b = a.ewm(span=3).std() - assert np.isfinite(b[2:]).all() - - def test_rolling_var(self, raw): - self._check_moment_func(lambda x: np.var(x, ddof=1), name="var", raw=raw) - self._check_moment_func( - lambda x: np.var(x, ddof=0), name="var", ddof=0, raw=raw - ) - - @td.skip_if_no_scipy - def test_rolling_skew(self, raw): - from scipy.stats import skew - - self._check_moment_func(lambda x: skew(x, bias=False), name="skew", raw=raw) - - @td.skip_if_no_scipy - def test_rolling_kurt(self, raw): - from scipy.stats import kurtosis - - self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt", raw=raw) - - def _check_moment_func( - self, - static_comp, - name, - raw, - has_min_periods=True, - has_center=True, - has_time_rule=True, - fill_value=None, - zero_min_periods_equal=True, - **kwargs - ): - - # inject raw - if name == "apply": - kwargs = copy.copy(kwargs) - kwargs["raw"] = raw - - def get_result(obj, window, min_periods=None, center=False): - r = obj.rolling(window=window, min_periods=min_periods, center=center) - return getattr(r, name)(**kwargs) - - series_result = get_result(self.series, window=50) - assert isinstance(series_result, Series) - tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:])) - - frame_result = get_result(self.frame, window=50) - assert isinstance(frame_result, DataFrame) - tm.assert_series_equal( - frame_result.iloc[-1, :], - self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), - check_names=False, - ) - - # check time_rule works - if has_time_rule: - win = 25 - minp = 10 - series = self.series[::2].resample("B").mean() - frame = self.frame[::2].resample("B").mean() - - if has_min_periods: - series_result = get_result(series, window=win, min_periods=minp) - frame_result = get_result(frame, window=win, min_periods=minp) - else: - series_result = get_result(series, window=win) - frame_result = get_result(frame, window=win) - - last_date = series_result.index[-1] - prev_date = last_date - 24 * offsets.BDay() - - trunc_series = self.series[::2].truncate(prev_date, last_date) - trunc_frame = self.frame[::2].truncate(prev_date, last_date) - - tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) - - tm.assert_series_equal( - frame_result.xs(last_date), - trunc_frame.apply(static_comp, raw=raw), - check_names=False, - ) - - # excluding NaNs correctly - obj = Series(randn(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN - if has_min_periods: - result = get_result(obj, 50, min_periods=30) - tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) - - # min_periods is working correctly - result = get_result(obj, 20, min_periods=15) - assert isna(result.iloc[23]) - assert not isna(result.iloc[24]) - - assert not isna(result.iloc[-6]) - assert isna(result.iloc[-5]) - - obj2 = Series(randn(20)) - result = get_result(obj2, 10, min_periods=5) - assert isna(result.iloc[3]) - assert notna(result.iloc[4]) - - if zero_min_periods_equal: - # min_periods=0 may be equivalent to min_periods=1 - result0 = get_result(obj, 20, min_periods=0) - result1 = get_result(obj, 20, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = get_result(obj, 50) - tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) - - # window larger than series length (#7297) - if 
has_min_periods: - for minp in (0, len(self.series) - 1, len(self.series)): - result = get_result(self.series, len(self.series) + 1, min_periods=minp) - expected = get_result(self.series, len(self.series), min_periods=minp) - nan_mask = isna(result) - tm.assert_series_equal(nan_mask, isna(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) - else: - result = get_result(self.series, len(self.series) + 1) - expected = get_result(self.series, len(self.series)) - nan_mask = isna(result) - tm.assert_series_equal(nan_mask, isna(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) - - # check center=True - if has_center: - if has_min_periods: - result = get_result(obj, 20, min_periods=15, center=True) - expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 - )[9:].reset_index(drop=True) - else: - result = get_result(obj, 20, center=True) - expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ - 9: - ].reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - # shifter index - s = ["x{x:d}".format(x=x) for x in range(12)] - - if has_min_periods: - minp = 10 - - series_xp = ( - get_result( - self.series.reindex(list(self.series.index) + s), - window=25, - min_periods=minp, - ) - .shift(-12) - .reindex(self.series.index) - ) - frame_xp = ( - get_result( - self.frame.reindex(list(self.frame.index) + s), - window=25, - min_periods=minp, - ) - .shift(-12) - .reindex(self.frame.index) - ) - - series_rs = get_result( - self.series, window=25, min_periods=minp, center=True - ) - frame_rs = get_result( - self.frame, window=25, min_periods=minp, center=True - ) - - else: - series_xp = ( - get_result( - self.series.reindex(list(self.series.index) + s), window=25 - ) - .shift(-12) - .reindex(self.series.index) - ) - frame_xp = ( - get_result( - self.frame.reindex(list(self.frame.index) + s), window=25 - ) - .shift(-12) - .reindex(self.frame.index) - ) - - series_rs = get_result(self.series, window=25, center=True) - frame_rs = get_result(self.frame, window=25, center=True) - - if fill_value is not None: - series_xp = series_xp.fillna(fill_value) - frame_xp = frame_xp.fillna(fill_value) - tm.assert_series_equal(series_xp, series_rs) - tm.assert_frame_equal(frame_xp, frame_rs) - - def test_ewma(self): - self._check_ew(name="mean") - - vals = pd.Series(np.zeros(1000)) - vals[5] = 1 - result = vals.ewm(span=100, adjust=False).mean().sum() - assert np.abs(result - 1) < 1e-2 - - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewma_cases(self, adjust, ignore_na): - # try adjust/ignore_na args matrix - - s = Series([1.0, 2.0, 4.0, 8.0]) - - if adjust: - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - else: - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - - result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() - tm.assert_series_equal(result, expected) - - def test_ewma_nan_handling(self): - s = Series([1.0] + [np.nan] * 5 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.0] * len(s))) - - s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha 
= 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - ( - s2, - True, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], - ), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), - ( - s2, - False, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], - ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), - ( - s3, - False, - False, - [ - (1.0 - alpha) ** 3, - np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), - ], - ), - ( - s3, - False, - True, - [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], - ), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - - tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - def test_ewmvar(self): - self._check_ew(name="var") - - def test_ewmvol(self): - self._check_ew(name="vol") - - def test_ewma_span_com_args(self): - A = self.series.ewm(com=9.5).mean() - B = self.series.ewm(span=20).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): - self.series.ewm().mean() - - def test_ewma_halflife_arg(self): - A = self.series.ewm(com=13.932726172912965).mean() - B = self.series.ewm(halflife=10.0).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm() - - def test_ewm_alpha(self): - # GH 10789 - s = Series(self.arr) - a = s.ewm(alpha=0.61722699889169674).mean() - b = s.ewm(com=0.62014947789973052).mean() - c = s.ewm(span=2.240298955799461).mean() - d = s.ewm(halflife=0.721792864318).mean() - tm.assert_series_equal(a, b) - tm.assert_series_equal(a, c) - tm.assert_series_equal(a, d) - - def test_ewm_alpha_arg(self): - # GH 10789 - s = self.series - with pytest.raises(ValueError): - s.ewm() - with pytest.raises(ValueError): - s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(halflife=10.0, alpha=0.5) - - def test_ewm_domain_checks(self): - # GH 12492 - s = Series(self.arr) - msg = "comass must satisfy: comass >= 0" - with pytest.raises(ValueError, match=msg): - s.ewm(com=-0.1) - s.ewm(com=0.0) - s.ewm(com=0.1) - - msg = "span must satisfy: span >= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(span=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.0) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.9) - 
s.ewm(span=1.0) - s.ewm(span=1.1) - - msg = "halflife must satisfy: halflife > 0" - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=0.0) - s.ewm(halflife=0.1) - - msg = "alpha must satisfy: 0 < alpha <= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=0.0) - s.ewm(alpha=0.1) - s.ewm(alpha=1.0) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=1.1) - - @pytest.mark.parametrize("method", ["mean", "vol", "var"]) - def test_ew_empty_series(self, method): - vals = pd.Series([], dtype=np.float64) - - ewm = vals.ewm(3) - result = getattr(ewm, method)() - tm.assert_almost_equal(result, vals) - - def _check_ew(self, name=None, preserve_nan=False): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(self.series.ewm(com=10), name)() - if preserve_nan: - assert result[self._nan_locs].isna().all() - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - s = Series(arr) - - # check min_periods - # GH 7898 - result = getattr(s.ewm(com=50, min_periods=2), name)() - assert result[:11].isna().all() - assert not result[11:].isna().any() - - for min_periods in (0, 1): - result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == "mean": - assert result[:10].isna().all() - assert not result[10:].isna().any() - else: - # ewm.std, ewm.vol, ewm.var (with bias=False) require at least - # two values - assert result[:11].isna().all() - assert not result[11:].isna().any() - - # check series of length 0 - result = getattr(Series().ewm(com=50, min_periods=min_periods), name)() - tm.assert_series_equal(result, Series()) - - # check series of length 1 - result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() - if name == "mean": - tm.assert_series_equal(result, Series([1.0])) - else: - # ewm.std, ewm.vol, ewm.var with bias=False require at least - # two values - tm.assert_series_equal(result, Series([np.NaN])) - - # pass in ints - result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ - - -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.0]), - Series([np.nan, 3.0]), - Series([3.0, np.nan]), - Series([1.0, 3.0]), - Series([2.0, 2.0]), - Series([3.0, 1.0]), - Series( - [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] - ), - Series( - [ - np.nan, - 5.0, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - np.nan, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series(range(10)), - 
Series(range(20, 0, -2)), - ] - - def create_dataframes(): - return [ - DataFrame(), - DataFrame(columns=["a"]), - DataFrame(columns=["a", "a"]), - DataFrame(columns=["a", "b"]), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel() - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - - # data is a tuple(object, is_constant, no_nans) - data = create_series() + create_dataframes() - - return [(x, is_constant(x), no_nans(x)) for x in data] - - -_consistency_data = _create_consistency_data() - - -def _rolling_consistency_cases(): - for window in [1, 2, 3, 10, 20]: - for min_periods in {0, 1, 2, 3, 4, window}: - if min_periods and (min_periods > window): - continue - for center in [False, True]: - yield window, min_periods, center - - -class TestMomentsConsistency(Base): - base_functions = [ - (lambda v: Series(v).count(), None, "count"), - (lambda v: Series(v).max(), None, "max"), - (lambda v: Series(v).min(), None, "min"), - (lambda v: Series(v).sum(), None, "sum"), - (lambda v: Series(v).mean(), None, "mean"), - (lambda v: Series(v).std(), 1, "std"), - (lambda v: Series(v).cov(Series(v)), None, "cov"), - (lambda v: Series(v).corr(Series(v)), None, "corr"), - (lambda v: Series(v).var(), 1, "var"), - # restore once GH 8086 is fixed - # lambda v: Series(v).skew(), 3, 'skew'), - # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed - # lambda v: Series(v).quantile(0.3), None, 'quantile'), - (lambda v: Series(v).median(), None, "median"), - (np.nanmax, 1, "max"), - (np.nanmin, 1, "min"), - (np.nansum, 1, "sum"), - (np.nanmean, 1, "mean"), - (lambda v: np.nanstd(v, ddof=1), 1, "std"), - (lambda v: np.nanvar(v, ddof=1), 1, "var"), - (np.nanmedian, 1, "median"), - ] - no_nan_functions = [ - (np.max, None, "max"), - (np.min, None, "min"), - (np.sum, None, "sum"), - (np.mean, None, "mean"), - (lambda v: np.std(v, ddof=1), 1, "std"), - (lambda v: np.var(v, ddof=1), 1, "var"), - (np.median, None, "median"), - ] - - def _create_data(self): - super()._create_data() - self.data = _consistency_data - - def setup_method(self, method): - self._create_data() - - def _test_moments_consistency( - self, - min_periods, - count, - mean, - mock_mean, - corr, - var_unbiased=None, - std_unbiased=None, - cov_unbiased=None, - var_biased=None, - std_biased=None, - cov_biased=None, - var_debiasing_factors=None, - ): - def _non_null_values(x): - values = x.values.ravel() - return set(values[notna(values)].tolist()) - - for (x, is_constant, no_nans) in self.data: - count_x = count(x) - mean_x = mean(x) - - if mock_mean: - # check that mean equals mock_mean - expected = mock_mean(x) - tm.assert_equal(mean_x, expected.astype("float64")) - - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = corr(x, x) - - # assert _non_null_values(corr_x_x).issubset(set([1.])) - # restore once rolling_cov(x, x) is identically equal to var(x) - - if is_constant: - exp = x.max() if isinstance(x, Series) else x.max().max() - - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) - - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) - - if var_unbiased and var_biased and 
var_debiasing_factors: - # check variance debiasing factors - var_unbiased_x = var_unbiased(x) - var_biased_x = var_biased(x) - var_debiasing_factors_x = var_debiasing_factors(x) - tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - - for (std, var, cov) in [ - (std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased), - ]: - - # check that var(x), std(x), and cov(x) are all >= 0 - var_x = var(x) - std_x = std(x) - assert not (var_x < 0).any().any() - assert not (std_x < 0).any().any() - if cov: - cov_x_x = cov(x, x) - assert not (cov_x_x < 0).any().any() - - # check that var(x) == cov(x, x) - tm.assert_equal(var_x, cov_x_x) - - # check that var(x) == std(x)^2 - tm.assert_equal(var_x, std_x * std_x) - - if var is var_biased: - # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = mean(x * x) - tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) - - if is_constant: - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if var is var_unbiased: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) - - if isinstance(x, Series): - for (y, is_constant, no_nans) in self.data: - if not x.isna().equals(y.isna()): - # can only easily test two Series with similar - # structure - continue - - # check that cor(x, y) is symmetric - corr_x_y = corr(x, y) - corr_y_x = corr(y, x) - tm.assert_equal(corr_x_y, corr_y_x) - - if cov: - # check that cov(x, y) is symmetric - cov_x_y = cov(x, y) - cov_y_x = cov(y, x) - tm.assert_equal(cov_x_y, cov_y_x) - - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - var_x_plus_y = var(x + y) - var_y = var(y) - tm.assert_equal( - cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) - ) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - std_y = std(y) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if cov is cov_biased: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_y = mean(y) - mean_x_times_y = mean(x * y) - tm.assert_equal( - cov_x_y, mean_x_times_y - (mean_x * mean_y) - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewm_consistency(self, min_periods, adjust, ignore_na): - def _weights(s, com, adjust, ignore_na): - if isinstance(s, DataFrame): - if not len(s.columns): - return DataFrame(index=s.index, columns=s.columns) - w = concat( - [ - _weights( - s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na - ) - for i, _ in enumerate(s.columns) - ], - axis=1, - ) - w.index = s.index - w.columns = s.columns - return w - - w = Series(np.nan, index=s.index) - alpha = 1.0 / (1.0 + com) - if ignore_na: - w[s.notna()] = _weights( - s[s.notna()], com=com, adjust=adjust, ignore_na=False - ) - elif adjust: - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1.0 / (1.0 - alpha), i) - else: - sum_wts = 0.0 - prev_i = -1 - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - if prev_i == -1: - w.iat[i] = 1.0 - else: - w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) - sum_wts += w.iat[i] - prev_i = i - return w - - def _variance_debiasing_factors(s, com, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method="ffill") - cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") - 
numerator = cum_sum * cum_sum - denominator = numerator - cum_sum_sq - denominator[denominator <= 0.0] = np.nan - return numerator / denominator - - def _ewma(s, com, min_periods, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = ( - s.multiply(weights) - .cumsum() - .divide(weights.cumsum()) - .fillna(method="ffill") - ) - result[ - s.expanding().count() < (max(min_periods, 1) if min_periods else 1) - ] = np.nan - return result - - com = 3.0 - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean(), - mock_mean=lambda x: _ewma( - x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ), - corr=lambda x, y: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(y), - var_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=False) - ), - std_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=False) - ), - cov_unbiased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=False) - ), - var_biased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=True) - ), - std_biased=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=True) - ), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors( - x, com=com, adjust=adjust, ignore_na=ignore_na - ) - ), - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - def test_expanding_consistency(self, min_periods): - - # suppress warnings about empty slices, as we are deliberately testing - # with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding(min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() - / x.expanding().count(), - corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( - y, ddof=0 - ), - var_debiasing_factors=lambda x: ( - x.expanding().count() - / (x.expanding().count() - 1.0).replace(0.0, np.nan) - ), - ) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for 
(f, require_min_periods, name) in functions: - expanding_f = getattr(x.expanding(min_periods=min_periods), name) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding(min_periods=0).apply( - func=f, raw=True - ) - else: - if name in ["cov", "corr"]: - expanding_f_result = expanding_f(pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(expanding_f_result, expanding_apply_f_result) - - @pytest.mark.slow - @pytest.mark.parametrize( - "window,min_periods,center", list(_rolling_consistency_cases()) - ) - def test_rolling_consistency(self, window, min_periods, center): - - # suppress warnings about empty slices, as we are deliberately testing - # with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), - mean=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).mean() - ), - mock_mean=lambda x: ( - x.rolling(window=window, min_periods=min_periods, center=center) - .sum() - .divide( - x.rolling( - window=window, min_periods=min_periods, center=center - ).count() - ) - ), - corr=lambda x, y: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).corr(y) - ), - var_unbiased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).var() - ), - std_unbiased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).std() - ), - cov_unbiased=lambda x, y: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).cov(y) - ), - var_biased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).var(ddof=0) - ), - std_biased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).std(ddof=0) - ), - cov_biased=lambda x, y: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).cov(y, ddof=0) - ), - var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center) - .count() - .divide( - (x.rolling(window=window, center=center).count() - 1.0).replace( - 0.0, np.nan - ) - ) - ), - ) - - # test consistency between rolling_xyz() and either (a) - # rolling_apply of Series.xyz(), or (b) rolling_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - rolling_f = getattr( - x.rolling( - window=window, center=center, min_periods=min_periods - ), - name, - ) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - rolling_f_result = rolling_f() - rolling_apply_f_result = x.rolling( - window=window, min_periods=0, center=center - ).apply(func=f, raw=True) - else: - if name in ["cov", "corr"]: - rolling_f_result = rolling_f(pairwise=False) - else: - rolling_f_result = rolling_f() - 
rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, center=center - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(rolling_f_result, rolling_apply_f_result) - - # binary moments - def test_rolling_cov(self): - A = self.series - B = A + randn(len(A)) - - result = A.rolling(window=50, min_periods=25).cov(B) - tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) - - def test_rolling_cov_pairwise(self): - self._check_pairwise_moment("rolling", "cov", window=10, min_periods=5) - - def test_rolling_corr(self): - A = self.series - B = A + randn(len(A)) - - result = A.rolling(window=50, min_periods=25).corr(B) - tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) - - # test for correct bias correction - a = tm.makeTimeSeries() - b = tm.makeTimeSeries() - a[:5] = np.nan - b[:10] = np.nan - - result = a.rolling(window=len(a), min_periods=1).corr(b) - tm.assert_almost_equal(result[-1], a.corr(b)) - - def test_rolling_corr_pairwise(self): - self._check_pairwise_moment("rolling", "corr", window=10, min_periods=5) - - @pytest.mark.parametrize("window", range(7)) - def test_rolling_corr_with_zero_variance(self, window): - # GH 18430 - s = pd.Series(np.zeros(20)) - other = pd.Series(np.arange(20)) - - assert s.rolling(window=window).corr(other=other).isna().all() - - def _check_pairwise_moment(self, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(self.frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(result, expected, check_names=False) - - def test_flex_binary_moment(self): - # GH3155 - # don't blow the stack - msg = ( - "arguments to moment function must be of type" - " np.ndarray/Series/DataFrame" - ) - with pytest.raises(TypeError, match=msg): - rwindow._flex_binary_moment(5, 6, None) - - def test_corr_sanity(self): - # GH 3155 - df = DataFrame( - np.array( - [ - [0.87024726, 0.18505595], - [0.64355431, 0.3091617], - [0.92372966, 0.50552513], - [0.00203756, 0.04520709], - [0.84780328, 0.33394331], - [0.78369152, 0.63919667], - ] - ) - ) - - res = df[0].rolling(5, center=True).corr(df[1]) - assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) - - # and some fuzzing - for _ in range(10): - df = DataFrame(np.random.rand(30, 2)) - res = df[0].rolling(5, center=True).corr(df[1]) - try: - assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) - except AssertionError: - print(res) - - @pytest.mark.parametrize("method", ["corr", "cov"]) - def test_flex_binary_frame(self, method): - series = self.frame[1] - - res = getattr(series.rolling(window=10), method)(self.frame) - res2 = getattr(self.frame.rolling(window=10), method)(series) - exp = self.frame.apply(lambda x: getattr(series.rolling(window=10), method)(x)) - - tm.assert_frame_equal(res, exp) - tm.assert_frame_equal(res2, exp) - - frame2 = self.frame.copy() - frame2.values[:] = np.random.randn(*frame2.shape) - - res3 = getattr(self.frame.rolling(window=10), method)(frame2) - exp = DataFrame( - { - k: getattr(self.frame[k].rolling(window=10), method)(frame2[k]) - for k in self.frame - } - ) - tm.assert_frame_equal(res3, exp) - - def test_ewmcov(self): - self._check_binary_ew("cov") - - def test_ewmcov_pairwise(self): - self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) - - def test_ewmcorr(self): - 
self._check_binary_ew("corr") - - def test_ewmcorr_pairwise(self): - self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) - - def _check_binary_ew(self, name): - def func(A, B, com, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - A = Series(randn(50), index=np.arange(50)) - B = A[2:] + randn(48) - - A[:10] = np.NaN - B[-10:] = np.NaN - - result = func(A, B, 20, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not np.isnan(result.values[14:]).any() - - # GH 7898 - for min_periods in (0, 1, 2): - result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - result = func(Series([]), Series([]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([])) - - # check series of length 1 - result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([np.NaN])) - - msg = "Input arrays must be of the same type!" - # exception raised is Exception - with pytest.raises(Exception, match=msg): - func(A, randn(50), 20, min_periods=5) - - def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): - return np.mean(x) + const - - df = DataFrame(np.random.rand(20, 3)) - - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) - tm.assert_frame_equal(result, expected) - - result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr(self): - A = self.series.dropna() - B = (A + randn(len(A)))[:-5] - - result = A.expanding().corr(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_count(self): - result = self.series.expanding().count() - tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() - ) - - def test_expanding_quantile(self): - result = self.series.expanding().quantile(0.5) - - rolling_result = self.series.rolling( - window=len(self.series), min_periods=1 - ).quantile(0.5) - - tm.assert_almost_equal(result, rolling_result) - - def test_expanding_cov(self): - A = self.series - B = (A + randn(len(A)))[:-5] - - result = A.expanding().cov(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_cov_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_corr_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_cov_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().cov(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().cov(s2) - expected = 
Series([None, None, None, 4.5]) - tm.assert_series_equal(result, expected) - - def test_expanding_corr_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().corr(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) - tm.assert_series_equal(result, expected) - - def test_rolling_cov_diff_length(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.rolling(window=3, min_periods=2).cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.rolling(window=3, min_periods=2).cov(s2a) - tm.assert_series_equal(result, expected) - - def test_rolling_corr_diff_length(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.rolling(window=3, min_periods=2).corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.rolling(window=3, min_periods=2).corr(s2a) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "f", - [ - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(quantile=0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ], - ) - def test_rolling_functions_window_non_shrinkage(self, f): - # GH 7764 - s = Series(range(4)) - s_expected = Series(np.nan, index=s.index) - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) - df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df_result = f(df) - tm.assert_frame_equal(df_result, df_expected) - except (ImportError): - - # scipy needed for rolling_window - pytest.skip("scipy not available") - - def test_rolling_functions_window_non_shrinkage_binary(self): - - # corr/cov return a MI DataFrame - df = DataFrame( - [[1, 5], [3, 2], [3, 9], [-1, 0]], - columns=Index(["A", "B"], name="foo"), - index=Index(range(4), name="bar"), - ) - df_expected = DataFrame( - columns=Index(["A", "B"], name="foo"), - index=pd.MultiIndex.from_product( - [df.index, df.columns], names=["bar", "foo"] - ), - dtype="float64", - ) - functions = [ - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: 
(x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df_result = f(df) - tm.assert_frame_equal(df_result, df_expected) - - def test_moment_functions_zero_length(self): - # GH 8056 - s = Series() - s_expected = s - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=["a"]) - df2["a"] = df2["a"].astype("float64") - df2_expected = df2 - - functions = [ - lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum, raw=False), - lambda x: x.expanding(min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - except (ImportError): - - # scipy needed for rolling_window - continue - - def test_moment_functions_zero_length_pairwise(self): - - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) - df2["a"] = df2["a"].astype("float64") - - df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([]), - ) - df2_expected = DataFrame( - index=pd.MultiIndex.from_product( - [df2.index, df2.columns], names=["bar", "foo"] - ), - columns=Index(["a"], name="foo"), - dtype="float64", - ) - - functions = [ - lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - - def test_expanding_cov_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], 
columns=Index(["A", "B"], name="foo")) - df1a = DataFrame( - [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") - ) - # TODO: xref gh-15826 - # .loc is not preserving the names - result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] - result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(["A", "B"], name="foo"), - index=Index(["X", "Y"], name="foo"), - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_expanding_corr_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], - columns=["A", "B"], - index=Index(range(3), name="bar"), - ) - df1a = DataFrame( - [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], - columns=["X", "Y"], - index=Index(range(3), name="bar"), - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] - ) - result1 = df1.expanding().corr(df2, pairwise=True).loc[2] - result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] - result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_rolling_skew_edge_cases(self): - - all_nan = Series([np.NaN] * 5) - - # yields all NaN (0 variance) - d = Series([1] * 5) - x = d.rolling(window=5).skew() - tm.assert_series_equal(all_nan, x) - - # yields all NaN (window too small) - d = Series(np.random.randn(5)) - x = d.rolling(window=2).skew() - tm.assert_series_equal(all_nan, x) - - # yields [NaN, NaN, NaN, 0.177994, 1.548824] - d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) - x = d.rolling(window=4).skew() - tm.assert_series_equal(expected, x) - - def test_rolling_kurt_edge_cases(self): - - all_nan = Series([np.NaN] * 5) - - # yields all NaN (0 variance) - d = Series([1] * 5) - x = d.rolling(window=5).kurt() - tm.assert_series_equal(all_nan, x) - - # yields all NaN (window too small) - d = Series(np.random.randn(5)) - x = d.rolling(window=3).kurt() - tm.assert_series_equal(all_nan, x) - - # yields [NaN, NaN, NaN, 1.224307, 2.671499] - d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) - x = d.rolling(window=4).kurt() - tm.assert_series_equal(expected, x) - - def test_rolling_skew_eq_value_fperr(self): - # #18804 all rolling skew for all equal values should return Nan - a = Series([1.1] * 15).rolling(window=10).skew() - assert np.isnan(a).all() - - def test_rolling_kurt_eq_value_fperr(self): - # #18804 all rolling kurt for all equal values should return Nan - a = Series([1.1] * 15).rolling(window=10).kurt() - assert np.isnan(a).all() - 
- @pytest.mark.parametrize( - "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], - ids=["sum", "mean", "max", "min"], - ) - def test_expanding_func(self, func, static_comp): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, center=center, axis=axis) - return getattr(exp, func)() - - self._check_expanding(expanding_func, static_comp, preserve_nan=False) - - def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): - - exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) - return result - - # TODO(jreback), needed to add preserve_nan=False - # here to make this pass - self._check_expanding(expanding_mean, np.mean, preserve_nan=False) - - ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) - - # GH 8080 - s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(result, expected) - - def _check_expanding( - self, - func, - static_comp, - has_min_periods=True, - has_time_rule=True, - preserve_nan=True, - ): - - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert isinstance(frame_result, DataFrame) - - result = func(self.series) - tm.assert_almost_equal(result[10], static_comp(self.series[:11])) - - if preserve_nan: - assert result.iloc[self._nan_locs].isna().all() - - ser = Series(randn(50)) - - if has_min_periods: - result = func(ser, min_periods=30) - assert result[:29].isna().all() - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - # min_periods is working correctly - result = func(ser, min_periods=15) - assert isna(result.iloc[13]) - assert notna(result.iloc[14]) - - ser2 = Series(randn(20)) - result = func(ser2, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - # min_periods=0 - result0 = func(ser, min_periods=0) - result1 = func(ser, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = func(ser) - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - def test_rolling_max_gh6297(self): - """Replicate result expected in GH #6297""" - - indices = [datetime(1975, 1, i) for i in range(1, 6)] - # So that we can have 2 datapoints on one of the days - indices.append(datetime(1975, 1, 3, 6, 0)) - series = Series(range(1, 7), index=indices) - # Use floats instead of ints as values - series = series.map(lambda x: float(x)) - # Sort chronologically - series = series.sort_index() - - expected = Series( - [1.0, 2.0, 6.0, 4.0, 5.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - x = series.resample("D").max().rolling(window=1).max() - tm.assert_series_equal(expected, x) - - def test_rolling_max_resample(self): - - indices = [datetime(1975, 1, i) for i in range(1, 6)] - # So that we can have 3 datapoints on last day (4, 10, and 20) - indices.append(datetime(1975, 1, 5, 1)) - indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) - # Use floats instead of ints as values - series = series.map(lambda x: float(x)) - # Sort chronologically - series = series.sort_index() - - # Default how should be max - expected = Series( - [0.0, 1.0, 2.0, 3.0, 20.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - x = series.resample("D").max().rolling(window=1).max() - tm.assert_series_equal(expected, 
x) - - # Now specify median (10.0) - expected = Series( - [0.0, 1.0, 2.0, 3.0, 10.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - x = series.resample("D").median().rolling(window=1).max() - tm.assert_series_equal(expected, x) - - # Now specify mean (4+10+20)/3 - v = (4.0 + 10.0 + 20.0) / 3.0 - expected = Series( - [0.0, 1.0, 2.0, 3.0, v], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - x = series.resample("D").mean().rolling(window=1).max() - tm.assert_series_equal(expected, x) - - def test_rolling_min_resample(self): - - indices = [datetime(1975, 1, i) for i in range(1, 6)] - # So that we can have 3 datapoints on last day (4, 10, and 20) - indices.append(datetime(1975, 1, 5, 1)) - indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) - # Use floats instead of ints as values - series = series.map(lambda x: float(x)) - # Sort chronologically - series = series.sort_index() - - # Default how should be min - expected = Series( - [0.0, 1.0, 2.0, 3.0, 4.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - r = series.resample("D").min().rolling(window=1) - tm.assert_series_equal(expected, r.min()) - - def test_rolling_median_resample(self): - - indices = [datetime(1975, 1, i) for i in range(1, 6)] - # So that we can have 3 datapoints on last day (4, 10, and 20) - indices.append(datetime(1975, 1, 5, 1)) - indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) - # Use floats instead of ints as values - series = series.map(lambda x: float(x)) - # Sort chronologically - series = series.sort_index() - - # Default how should be median - expected = Series( - [0.0, 1.0, 2.0, 3.0, 10], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)], - ) - x = series.resample("D").median().rolling(window=1).median() - tm.assert_series_equal(expected, x) - - def test_rolling_median_memory_error(self): - # GH11722 - n = 20000 - Series(np.random.randn(n)).rolling(window=2, center=False).median() - Series(np.random.randn(n)).rolling(window=2, center=False).median() - - def test_rolling_min_max_numeric_types(self): - - # GH12373 - types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] - types_test.extend( - [ - np.dtype("{}{}".format(sign, width)) - for width in [1, 2, 4, 8] - for sign in "ui" - ] - ) - for data_type in types_test: - # Just testing that these don't throw exceptions and that - # the return type is float64. 
Other tests will cover quantitative - # correctness - result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max() - assert result.dtypes[0] == np.dtype("f8") - result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() - assert result.dtypes[0] == np.dtype("f8") - - -class TestGrouperGrouping: - def setup_method(self, method): - self.series = Series(np.arange(10)) - self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) - - def test_mutated(self): - - msg = r"group\(\) got an unexpected keyword argument 'foo'" - with pytest.raises(TypeError, match=msg): - self.frame.groupby("A", foo=1) - - g = self.frame.groupby("A") - assert not g.mutated - g = self.frame.groupby("A", mutated=True) - assert g.mutated - - def test_getitem(self): - g = self.frame.groupby("A") - g_mutated = self.frame.groupby("A", mutated=True) - - expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) - - result = g.rolling(2).mean().B - tm.assert_series_equal(result, expected) - - result = g.rolling(2).B.mean() - tm.assert_series_equal(result, expected) - - result = g.B.rolling(2).mean() - tm.assert_series_equal(result, expected) - - result = self.frame.B.groupby(self.frame.A).rolling(2).mean() - tm.assert_series_equal(result, expected) - - def test_getitem_multiple(self): - - # GH 13174 - g = self.frame.groupby("A") - r = g.rolling(2) - g_mutated = self.frame.groupby("A", mutated=True) - expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) - - result = r.B.count() - tm.assert_series_equal(result, expected) - - result = r.B.count() - tm.assert_series_equal(result, expected) - - def test_rolling(self): - g = self.frame.groupby("A") - r = g.rolling(window=4) - - for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - tm.assert_frame_equal(result, expected) - - for f in ["std", "var"]: - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - tm.assert_frame_equal(result, expected) - - result = r.quantile(0.5) - expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) - tm.assert_frame_equal(result, expected) - - def test_rolling_corr_cov(self): - g = self.frame.groupby("A") - r = g.rolling(window=4) - - for f in ["corr", "cov"]: - result = getattr(r, f)(self.frame) - - def func(x): - return getattr(x.rolling(4), f)(self.frame) - - expected = g.apply(func) - tm.assert_frame_equal(result, expected) - - result = getattr(r.B, f)(pairwise=True) - - def func(x): - return getattr(x.B.rolling(4), f)(pairwise=True) - - expected = g.apply(func) - tm.assert_series_equal(result, expected) - - def test_rolling_apply(self, raw): - g = self.frame.groupby("A") - r = g.rolling(window=4) - - # reduction - result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - tm.assert_frame_equal(result, expected) - - def test_rolling_apply_mutability(self): - # GH 14013 - df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) - g = df.groupby("A") - - mi = pd.MultiIndex.from_tuples( - [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)] - ) - - mi.names = ["A", None] - # Grouped column should not be a part of the output - expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) - - result = g.rolling(window=2).sum() - tm.assert_frame_equal(result, expected) - - # Call an arbitrary function on the groupby - g.sum() - - # Make sure 
nothing has been mutated - result = g.rolling(window=2).sum() - tm.assert_frame_equal(result, expected) - - def test_expanding(self): - g = self.frame.groupby("A") - r = g.expanding() - - for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - tm.assert_frame_equal(result, expected) - - for f in ["std", "var"]: - result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - tm.assert_frame_equal(result, expected) - - result = r.quantile(0.5) - expected = g.apply(lambda x: x.expanding().quantile(0.5)) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr_cov(self): - g = self.frame.groupby("A") - r = g.expanding() - - for f in ["corr", "cov"]: - result = getattr(r, f)(self.frame) - - def func(x): - return getattr(x.expanding(), f)(self.frame) - - expected = g.apply(func) - tm.assert_frame_equal(result, expected) - - result = getattr(r.B, f)(pairwise=True) - - def func(x): - return getattr(x.B.expanding(), f)(pairwise=True) - - expected = g.apply(func) - tm.assert_series_equal(result, expected) - - def test_expanding_apply(self, raw): - g = self.frame.groupby("A") - r = g.expanding() - - # reduction - result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) - tm.assert_frame_equal(result, expected) From 07d04882889a7016029d652501024fbfeaf10fff Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Fri, 12 Jul 2019 14:43:46 +0000 Subject: [PATCH 217/238] CLN: revisit build warnings in cython templates (#27346) --- pandas/_libs/groupby_helper.pxi.in | 12 ++++++++++-- pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 8e351244b7f43a..000689f6345451 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -76,7 +76,11 @@ def group_last_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if val == val and val != {{nan_val}}: + if ( + {{if not name.startswith("int")}} + val == val and + {{endif}} + val != {{nan_val}}): nobs[lab, j] += 1 resx[lab, j] = val @@ -133,7 +137,11 @@ def group_nth_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if val == val and val != {{nan_val}}: + if ( + {{if not name.startswith("int")}} + val == val and + {{endif}} + val != {{nan_val}}): nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bf2189a8c1fd72..17f1d011af01b5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -424,8 +424,12 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (val != val - or (use_na_value and val == na_value2)): + if ignore_na and ( + {{if not name.lower().startswith(("uint", "int"))}} + val != val or + {{endif}} + (use_na_value and val == na_value2) + ): # if missing values do not count as unique values (i.e. 
if # ignore_na is True), skip the hashtable entry for them, # and replace the corresponding label with na_sentinel From 23a66844eb0dd502c9ff5caf9c59aecf6bb02b18 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jul 2019 17:01:02 +0100 Subject: [PATCH 218/238] TST: add test for multiindex partial indexing both axis (#27359) --- pandas/tests/indexing/multiindex/test_partial.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 3c65f1b8abddb8..b1519d82e1aa74 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -188,3 +188,14 @@ def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): expected.loc["foo"] = 0 expected.loc["bar"] = 0 tm.assert_series_equal(result, expected) + + +def test_loc_getitem_partial_both_axis(): + # gh-12660 + iterables = [["a", "b"], [2, 1]] + columns = MultiIndex.from_product(iterables, names=["col1", "col2"]) + rows = MultiIndex.from_product(iterables, names=["row1", "row2"]) + df = DataFrame(np.random.randn(4, 4), index=rows, columns=columns) + expected = df.iloc[:2, 2:].droplevel("row1").droplevel("col1", axis=1) + result = df.loc["a", "b"] + tm.assert_frame_equal(result, expected) From 84136d58024ff377e3505bde62eb2cb85249e5ef Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Jul 2019 09:01:53 -0700 Subject: [PATCH 219/238] Dispatch Index ops to Series (#27352) --- pandas/core/indexes/base.py | 30 +++++++++-------------- pandas/core/ops/__init__.py | 9 +++++-- pandas/core/ops/missing.py | 34 ++++++++------------------ pandas/tests/arithmetic/test_object.py | 24 ------------------ pandas/tests/indexes/test_common.py | 1 + 5 files changed, 30 insertions(+), 68 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index abe20ee0a91ce6..e084f99ec5a2cf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -70,7 +70,6 @@ from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name, make_invalid_op -from pandas.core.ops.missing import dispatch_missing import pandas.core.sorting as sorting from pandas.core.strings import StringMethods @@ -144,27 +143,18 @@ def index_arithmetic_method(self, other): out = op(self, other) return Index(out, name=self.name) - other = self._validate_for_numeric_binop(other, op) - # handle time-based others if isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): return self._evaluate_with_timedelta_like(other, op) - elif isinstance(other, (datetime, np.datetime64)): - return self._evaluate_with_datetime_like(other, op) - values = self.values - with np.errstate(all="ignore"): - result = op(values, other) + other = self._validate_for_numeric_binop(other, op) - result = dispatch_missing(op, values, other, result) + from pandas import Series - attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) - if op is divmod: - result = (Index(result[0], **attrs), Index(result[1], **attrs)) - else: - result = Index(result, **attrs) - return result + result = op(Series(self), other) + if isinstance(result, tuple): + return (Index(result[0]), Index(result[1])) + return Index(result) name = "__{name}__".format(name=op.__name__) # TODO: docstring? 
@@ -2361,10 +2351,14 @@ def _get_unique_index(self, dropna=False): def __add__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented - return Index(np.array(self) + other) + from pandas import Series + + return Index(Series(self) + other) def __radd__(self, other): - return Index(other + np.array(self)) + from pandas import Series + + return Index(other + Series(self)) def __iadd__(self, other): # alias for __add__ diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index ee5c670364485a..43fe8f1a8698f9 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -38,6 +38,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCDatetimeArray, ABCIndex, ABCIndexClass, ABCSeries, @@ -1702,10 +1703,14 @@ def wrapper(left, right): # does inference in the case where `result` has object-dtype. return construct_result(left, result, index=left.index, name=res_name) + elif isinstance(right, (ABCDatetimeArray, pd.DatetimeIndex)): + result = op(left._values, right) + return construct_result(left, result, index=left.index, name=res_name) + lvalues = left.values rvalues = right - if isinstance(rvalues, ABCSeries): - rvalues = rvalues.values + if isinstance(rvalues, (ABCSeries, ABCIndexClass)): + rvalues = rvalues._values with np.errstate(all="ignore"): result = na_op(lvalues, rvalues) diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 36989582615558..01bc345a40b83c 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -148,40 +148,26 @@ def mask_zero_div_zero(x, y, result): return result -def dispatch_missing(op, left, right, result): +def dispatch_fill_zeros(op, left, right, result): """ - Fill nulls caused by division by zero, casting to a different dtype - if necessary. + Call fill_zeros with the appropriate fill value depending on the operation, + with special logic for divmod and rdivmod. Parameters ---------- op : function (operator.add, operator.div, ...) - left : object (Index for non-reversed ops) - right : object (Index fof reversed ops) + left : object (np.ndarray for non-reversed ops) + right : object (np.ndarray for reversed ops) result : ndarray Returns ------- - result : ndarray - """ - if op is operator.floordiv: - # Note: no need to do this for truediv; in py3 numpy behaves the way - # we want. - result = mask_zero_div_zero(left, right, result) - elif op is operator.mod: - result = fill_zeros(result, left, right, "__mod__", np.nan) - elif op is divmod: - res0 = mask_zero_div_zero(left, right, result[0]) - res1 = fill_zeros(result[1], left, right, "__divmod__", np.nan) - result = (res0, res1) - return result - + result : np.ndarray -# FIXME: de-duplicate with dispatch_missing -def dispatch_fill_zeros(op, left, right, result): - """ - Call fill_zeros with the appropriate fill value depending on the operation, - with special logic for divmod and rdivmod. + Notes + ----- + For divmod and rdivmod, the `result` parameter and returned `result` + is a 2-tuple of ndarray objects. 
""" if op is divmod: result = ( diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index f7f6ba8b114e75..fd9db806713603 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -103,18 +103,6 @@ def test_add_extension_scalar(self, other, box, op): result = op(arr, other) tm.assert_equal(result, expected) - @pytest.mark.parametrize( - "box", - [ - pytest.param( - pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), - ), - pd.Series, - pd.DataFrame, - ], - ids=lambda x: x.__name__, - ) def test_objarr_add_str(self, box): ser = pd.Series(["x", np.nan, "x"]) expected = pd.Series(["xa", np.nan, "xa"]) @@ -125,18 +113,6 @@ def test_objarr_add_str(self, box): result = ser + "a" tm.assert_equal(result, expected) - @pytest.mark.parametrize( - "box", - [ - pytest.param( - pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), - ), - pd.Series, - pd.DataFrame, - ], - ids=lambda x: x.__name__, - ) def test_objarr_radd_str(self, box): ser = pd.Series(["x", np.nan, "x"]) expected = pd.Series(["ax", np.nan, "ax"]) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 605df9971a567c..0e9aa07a4c05a8 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -144,6 +144,7 @@ def test_set_name_methods(self, indices): assert res is None assert indices.name == new_name assert indices.names == [new_name] + # FIXME: dont leave commented-out # with pytest.raises(TypeError, match="list-like"): # # should still fail even if it would be the right length # ind.set_names("a") From c6335792f187b2904c2fdd77662d1048dde631cc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Jul 2019 09:09:00 -0700 Subject: [PATCH 220/238] CLN/REF: indexing typing, prune unreachable branches (#27351) --- pandas/core/frame.py | 75 ++++++---------- pandas/core/generic.py | 5 +- pandas/core/indexing.py | 188 ++++++++++++++++------------------------ pandas/core/series.py | 29 ++----- 4 files changed, 114 insertions(+), 183 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 263c4013de281d..53cb0cedc208bc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2889,11 +2889,11 @@ def _set_value(self, index, col, value, takeable=False): _set_value.__doc__ = set_value.__doc__ - def _ixs(self, i, axis=0): + def _ixs(self, i: int, axis: int = 0): """ Parameters ---------- - i : int, slice, or sequence of integers + i : int axis : int Notes @@ -2902,59 +2902,40 @@ def _ixs(self, i, axis=0): """ # irow if axis == 0: - if isinstance(i, slice): - return self[i] - else: - label = self.index[i] - if isinstance(label, Index): - # a location index by definition - result = self.take(i, axis=axis) - copy = True - else: - new_values = self._data.fast_xs(i) - if is_scalar(new_values): - return new_values - - # if we are a copy, mark as such - copy = ( - isinstance(new_values, np.ndarray) and new_values.base is None - ) - result = self._constructor_sliced( - new_values, - index=self.columns, - name=self.index[i], - dtype=new_values.dtype, - ) - result._set_is_copy(self, copy=copy) - return result + label = self.index[i] + new_values = self._data.fast_xs(i) + if is_scalar(new_values): + return new_values + + # if we are a copy, mark as such + copy = isinstance(new_values, np.ndarray) and new_values.base is None + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[i], 
+ dtype=new_values.dtype, + ) + result._set_is_copy(self, copy=copy) + return result # icol else: label = self.columns[i] - if isinstance(i, slice): - # need to return view - lab_slice = slice(label[0], label[-1]) - return self.loc[:, lab_slice] - else: - if isinstance(label, Index): - return self.take(i, axis=1) - index_len = len(self.index) + # if the values returned are not the same length + # as the index (iow a not found value), iget returns + # a 0-len ndarray. This is effectively catching + # a numpy error (as numpy should really raise) + values = self._data.iget(i) - # if the values returned are not the same length - # as the index (iow a not found value), iget returns - # a 0-len ndarray. This is effectively catching - # a numpy error (as numpy should really raise) - values = self._data.iget(i) + if len(self.index) and not len(values): + values = np.array([np.nan] * len(self.index), dtype=object) + result = self._box_col_values(values, label) - if index_len and not len(values): - values = np.array([np.nan] * index_len, dtype=object) - result = self._box_col_values(values, label) + # this is a cached value, mark it so + result._set_as_cached(label, self) - # this is a cached value, mark it so - result._set_as_cached(label, self) - - return result + return result def __getitem__(self, key): key = lib.item_from_zerodim(key) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e19b1f70ce2f7e..f28f58b0703687 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3495,7 +3495,7 @@ def __delitem__(self, key): deleted = False maybe_shortcut = False - if hasattr(self, "columns") and isinstance(self.columns, MultiIndex): + if self.ndim == 2 and isinstance(self.columns, MultiIndex): try: maybe_shortcut = key not in self.columns._engine except TypeError: @@ -5231,9 +5231,6 @@ def _dir_additions(self): } return super()._dir_additions().union(additions) - # ---------------------------------------------------------------------- - # Getting and setting elements - # ---------------------------------------------------------------------- # Consolidation of internals diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6040385acbe402..482e9c365420cd 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -143,10 +143,7 @@ def __getitem__(self, key): key = com.apply_if_callable(key, self.obj) return self._getitem_axis(key, axis=axis) - def _get_label(self, label, axis=None): - if axis is None: - axis = self.axis or 0 - + def _get_label(self, label, axis: int): if self.ndim == 1: # for perf reasons we want to try _xs first # as its basically direct indexing @@ -158,12 +155,10 @@ def _get_label(self, label, axis=None): return self.obj._xs(label, axis=axis) - def _get_loc(self, key, axis: int): + def _get_loc(self, key: int, axis: int): return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=None, kind=None): - if axis is None: - axis = self.axis + def _slice(self, obj, axis: int, kind=None): return self.obj._slice(obj, axis=axis, kind=kind) def _get_setitem_indexer(self, key): @@ -330,19 +325,6 @@ def _setitem_with_indexer(self, indexer, value): val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) - if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): - - for i, ax in zip(indexer, self.obj.axes): - - # if we have any multi-indexes that have non-trivial slices - # (not null slices) then we must take the split path, xref - # GH 10360 - if isinstance(ax, 
MultiIndex) and not ( - is_integer(i) or com.is_null_slice(i) - ): - take_split_path = True - break - if isinstance(indexer, tuple): nindexer = [] for i, idx in enumerate(indexer): @@ -406,71 +388,16 @@ def _setitem_with_indexer(self, indexer, value): indexer, missing = convert_missing_indexer(indexer) if missing: - - # reindex the axis to the new value - # and set inplace - if self.ndim == 1: - index = self.obj.index - new_index = index.insert(len(index), indexer) - - # we have a coerced indexer, e.g. a float - # that matches in an Int64Index, so - # we will not create a duplicate index, rather - # index to that element - # e.g. 0.0 -> 0 - # GH12246 - if index.is_unique: - new_indexer = index.get_indexer([new_index[-1]]) - if (new_indexer != -1).any(): - return self._setitem_with_indexer(new_indexer, value) - - # this preserves dtype of the value - new_values = Series([value])._values - if len(self.obj._values): - # GH#22717 handle casting compatibility that np.concatenate - # does incorrectly - new_values = _concat_compat([self.obj._values, new_values]) - self.obj._data = self.obj._constructor( - new_values, index=new_index, name=self.obj.name - )._data - self.obj._maybe_update_cacher(clear=True) - return self.obj - - elif self.ndim == 2: - - # no columns and scalar - if not len(self.obj.columns): - raise ValueError( - "cannot set a frame with no defined " "columns" - ) - - # append a Series - if isinstance(value, Series): - - value = value.reindex(index=self.obj.columns, copy=True) - value.name = indexer - - # a list-list - else: - - # must have conforming columns - if is_list_like_indexer(value): - if len(value) != len(self.obj.columns): - raise ValueError( - "cannot set a row with " "mismatched columns" - ) - - value = Series(value, index=self.obj.columns, name=indexer) - - self.obj._data = self.obj.append(value)._data - self.obj._maybe_update_cacher(clear=True) - return self.obj + return self._setitem_with_indexer_missing(indexer, value) # set item_labels = self.obj._get_axis(info_axis) # align and set the values if take_split_path: + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 + assert info_axis == 1 if not isinstance(indexer, tuple): indexer = self._tuplify(indexer) @@ -524,11 +451,8 @@ def _setitem_with_indexer(self, indexer, value): # non-mi else: plane_indexer = indexer[:info_axis] + indexer[info_axis + 1 :] - if info_axis > 0: - plane_axis = self.obj.axes[:info_axis][0] - lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis) - else: - lplane_indexer = 0 + plane_axis = self.obj.axes[:info_axis][0] + lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis) def setter(item, v): s = self.obj[item] @@ -578,9 +502,7 @@ def setter(item, v): # hasattr first, to avoid coercing to ndarray without reason. # But we may be relying on the ndarray coercion to check ndim. # Why not just convert to an ndarray earlier on if needed? - elif (hasattr(value, "ndim") and value.ndim == 2) or ( - not hasattr(value, "ndim") and np.array(value).ndim - ) == 2: + elif np.ndim(value) == 2: # note that this coerces the dtype if we are mixed # GH 7551 @@ -656,6 +578,65 @@ def setter(item, v): self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) self.obj._maybe_update_cacher(clear=True) + def _setitem_with_indexer_missing(self, indexer, value): + """ + Insert new row(s) or column(s) into the Series or DataFrame. 
+ """ + from pandas import Series + + # reindex the axis to the new value + # and set inplace + if self.ndim == 1: + index = self.obj.index + new_index = index.insert(len(index), indexer) + + # we have a coerced indexer, e.g. a float + # that matches in an Int64Index, so + # we will not create a duplicate index, rather + # index to that element + # e.g. 0.0 -> 0 + # GH#12246 + if index.is_unique: + new_indexer = index.get_indexer([new_index[-1]]) + if (new_indexer != -1).any(): + return self._setitem_with_indexer(new_indexer, value) + + # this preserves dtype of the value + new_values = Series([value])._values + if len(self.obj._values): + # GH#22717 handle casting compatibility that np.concatenate + # does incorrectly + new_values = _concat_compat([self.obj._values, new_values]) + self.obj._data = self.obj._constructor( + new_values, index=new_index, name=self.obj.name + )._data + self.obj._maybe_update_cacher(clear=True) + return self.obj + + elif self.ndim == 2: + + if not len(self.obj.columns): + # no columns and scalar + raise ValueError("cannot set a frame with no defined columns") + + if isinstance(value, ABCSeries): + # append a Series + value = value.reindex(index=self.obj.columns, copy=True) + value.name = indexer + + else: + # a list-list + if is_list_like_indexer(value): + # must have conforming columns + if len(value) != len(self.obj.columns): + raise ValueError("cannot set a row with mismatched columns") + + value = Series(value, index=self.obj.columns, name=indexer) + + self.obj._data = self.obj.append(value)._data + self.obj._maybe_update_cacher(clear=True) + return self.obj + def _align_series(self, indexer, ser, multiindex_indexer=False): """ Parameters @@ -820,9 +801,6 @@ def _getitem_tuple(self, tup): # no shortcut needed retval = self.obj for i, key in enumerate(tup): - if i >= self.obj.ndim: - raise IndexingError("Too many indexers") - if com.is_null_slice(key): continue @@ -882,10 +860,10 @@ def _convert_for_reindex(self, key, axis: int): def _handle_lowerdim_multi_index_axis0(self, tup): # we have an axis0 multi-index, handle or raise - + axis = self.axis or 0 try: # fast path for series or for tup devoid of slices - return self._get_label(tup, axis=self.axis) + return self._get_label(tup, axis=axis) except TypeError: # slices are unhashable pass @@ -983,7 +961,8 @@ def _getitem_nested_tuple(self, tup): # this is a series with a multi-index specified a tuple of # selectors - return self._getitem_axis(tup, axis=self.axis) + axis = self.axis or 0 + return self._getitem_axis(tup, axis=axis) # handle the multi-axis by taking sections and reducing # this is iterative @@ -1010,11 +989,7 @@ def _getitem_nested_tuple(self, tup): return obj - def _getitem_axis(self, key, axis=None): - - if axis is None: - axis = self.axis or 0 - + def _getitem_axis(self, key, axis: int): if is_iterator(key): key = list(key) self._validate_key(key, axis) @@ -1439,7 +1414,7 @@ def _is_scalar_access(self, key): def _getitem_scalar(self, key): raise NotImplementedError() - def _getitem_axis(self, key, axis=None): + def _getitem_axis(self, key, axis: int): raise NotImplementedError() def _getbool_axis(self, key, axis: int): @@ -1786,10 +1761,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key - def _getitem_axis(self, key, axis=None): - if axis is None: - axis = self.axis or 0 - + def _getitem_axis(self, key, axis: int): key = item_from_zerodim(key) if is_iterator(key): key = list(key) @@ -2106,9 +2078,6 @@ def _getitem_tuple(self, tup): retval = self.obj axis = 0 for 
i, key in enumerate(tup): - if i >= self.obj.ndim: - raise IndexingError("Too many indexers") - if com.is_null_slice(key): axis += 1 continue @@ -2143,10 +2112,7 @@ def _get_list_axis(self, key, axis: int): # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") - def _getitem_axis(self, key, axis=None): - if axis is None: - axis = self.axis or 0 - + def _getitem_axis(self, key, axis: int): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index 73a71a2a41f4c8..6a58b1ea6f82d0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1028,38 +1028,25 @@ def axes(self): """ return [self.index] - def _ixs(self, i, axis=0): + def _ixs(self, i: int, axis: int = 0): """ Return the i-th value or values in the Series by location. Parameters ---------- - i : int, slice, or sequence of integers + i : int Returns ------- scalar (int) or Series (slice, sequence) """ - try: - # dispatch to the values if we need - values = self._values - if isinstance(values, np.ndarray): - return libindex.get_value_at(values, i) - else: - return values[i] - except IndexError: - raise - except Exception: - if isinstance(i, slice): - indexer = self.index._convert_slice_indexer(i, kind="iloc") - return self._get_values(indexer) - else: - label = self.index[i] - if isinstance(label, Index): - return self.take(i, axis=axis, convert=True) - else: - return libindex.get_value_at(self, i) + # dispatch to the values if we need + values = self._values + if isinstance(values, np.ndarray): + return libindex.get_value_at(values, i) + else: + return values[i] @property def _is_mixed_type(self): From 0437f6899eee472e01d5c2f7a9c8d3e9ee2bed8c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jul 2019 17:28:38 +0100 Subject: [PATCH 221/238] TST: add test for series list indexing with missing values (#27362) --- pandas/tests/indexing/test_loc.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 06d71d1b1e3899..19c288a4b63aeb 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1069,3 +1069,15 @@ def test_series_indexing_zerodim_np_array(self): s = Series([1, 2]) result = s.loc[np.array(0)] assert result == 1 + + +def test_series_loc_getitem_label_list_missing_values(): + # gh-11428 + key = np.array( + ["2001-01-04", "2001-01-02", "2001-01-04", "2001-01-14"], dtype="datetime64" + ) + s = Series([2, 5, 8, 11], date_range("2001-01-01", freq="D", periods=4)) + expected = Series([11.0, 5.0, 11.0, np.nan], index=key) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s.loc[key] + tm.assert_series_equal(result, expected) From 8913b7e84ac75b1093bf708e0d1a9a0789e8cfcd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Jul 2019 09:34:52 -0700 Subject: [PATCH 222/238] REF: separate out docstrings.py (#27361) --- pandas/core/ops/__init__.py | 682 +--------------------------------- pandas/core/ops/docstrings.py | 675 +++++++++++++++++++++++++++++++++ 2 files changed, 682 insertions(+), 675 deletions(-) create mode 100644 pandas/core/ops/docstrings.py diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 43fe8f1a8698f9..230abd6b301a6d 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -6,7 +6,7 @@ import datetime import operator import textwrap -from typing import Any, Callable, Dict, Optional +from 
typing import Any, Callable import warnings import numpy as np @@ -53,6 +53,12 @@ import pandas.core.common as com from . import missing +from .docstrings import ( + _arith_doc_FRAME, + _flex_comp_doc_FRAME, + _make_flex_doc, + _op_descriptions, +) from .roperator import ( # noqa:F401 radd, rand_, @@ -334,680 +340,6 @@ def _get_op_name(op, special): return opname -# ----------------------------------------------------------------------------- -# Docstring Generation and Templates - -_add_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.add(b, fill_value=0) -a 2.0 -b 1.0 -c 1.0 -d 1.0 -e NaN -dtype: float64 -""" - -_sub_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.subtract(b, fill_value=0) -a 0.0 -b 1.0 -c 1.0 -d -1.0 -e NaN -dtype: float64 -""" - -_mul_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.multiply(b, fill_value=0) -a 1.0 -b 0.0 -c 0.0 -d 0.0 -e NaN -dtype: float64 -""" - -_div_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.divide(b, fill_value=0) -a 1.0 -b inf -c inf -d 0.0 -e NaN -dtype: float64 -""" - -_floordiv_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.floordiv(b, fill_value=0) -a 1.0 -b NaN -c NaN -d 0.0 -e NaN -dtype: float64 -""" - -_mod_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.mod(b, fill_value=0) -a 0.0 -b NaN -c NaN -d 0.0 -e NaN -dtype: float64 -""" -_pow_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.pow(b, fill_value=0) -a 1.0 -b 1.0 -c 1.0 -d 0.0 -e NaN -dtype: float64 -""" - -_op_descriptions = { - # Arithmetic Operators - "add": { - "op": "+", - "desc": "Addition", - "reverse": "radd", - "series_examples": _add_example_SERIES, - }, - "sub": { - "op": "-", - "desc": "Subtraction", - "reverse": "rsub", - "series_examples": _sub_example_SERIES, - }, - "mul": { - "op": "*", - "desc": "Multiplication", - "reverse": "rmul", - "series_examples": _mul_example_SERIES, - "df_examples": None, - }, - "mod": { - "op": 
"%", - "desc": "Modulo", - "reverse": "rmod", - "series_examples": _mod_example_SERIES, - }, - "pow": { - "op": "**", - "desc": "Exponential power", - "reverse": "rpow", - "series_examples": _pow_example_SERIES, - "df_examples": None, - }, - "truediv": { - "op": "/", - "desc": "Floating division", - "reverse": "rtruediv", - "series_examples": _div_example_SERIES, - "df_examples": None, - }, - "floordiv": { - "op": "//", - "desc": "Integer division", - "reverse": "rfloordiv", - "series_examples": _floordiv_example_SERIES, - "df_examples": None, - }, - "divmod": { - "op": "divmod", - "desc": "Integer division and modulo", - "reverse": "rdivmod", - "series_examples": None, - "df_examples": None, - }, - # Comparison Operators - "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None}, - "ne": { - "op": "!=", - "desc": "Not equal to", - "reverse": None, - "series_examples": None, - }, - "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None}, - "le": { - "op": "<=", - "desc": "Less than or equal to", - "reverse": None, - "series_examples": None, - }, - "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None}, - "ge": { - "op": ">=", - "desc": "Greater than or equal to", - "reverse": None, - "series_examples": None, - }, -} # type: Dict[str, Dict[str, Optional[str]]] - -_op_names = list(_op_descriptions.keys()) -for key in _op_names: - reverse_op = _op_descriptions[key]["reverse"] - if reverse_op is not None: - _op_descriptions[reverse_op] = _op_descriptions[key].copy() - _op_descriptions[reverse_op]["reverse"] = key - -_flex_doc_SERIES = """ -Return {desc} of series and other, element-wise (binary operator `{op_name}`). - -Equivalent to ``{equiv}``, but with support to substitute a fill_value for -missing data in one of the inputs. - -Parameters ----------- -other : Series or scalar value -fill_value : None or float value, default None (NaN) - Fill existing missing (NaN) values, and any new element needed for - successful Series alignment, with this value before computation. - If data in both corresponding Series locations is missing - the result will be missing. -level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level. - -Returns -------- -Series - The result of the operation. - -See Also --------- -Series.{reverse} -""" - -_arith_doc_FRAME = """ -Binary operator %s with support to substitute a fill_value for missing data in -one of the inputs - -Parameters ----------- -other : Series, DataFrame, or constant -axis : {0, 1, 'index', 'columns'} - For Series input, axis to match Series index on -fill_value : None or float value, default None - Fill existing missing (NaN) values, and any new element needed for - successful DataFrame alignment, with this value before computation. - If data in both corresponding DataFrame locations is missing - the result will be missing -level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - -Returns -------- -result : DataFrame - -Notes ------ -Mismatched indices will be unioned together -""" - -_flex_doc_FRAME = """ -Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). - -Equivalent to ``{equiv}``, but with support to substitute a fill_value -for missing data in one of the inputs. With reverse version, `{reverse}`. - -Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to -arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. 
- -Parameters ----------- -other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}} - Whether to compare by the index (0 or 'index') or columns - (1 or 'columns'). For Series input, axis to match Series index on. -level : int or label - Broadcast across a level, matching Index values on the - passed MultiIndex level. -fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed for - successful DataFrame alignment, with this value before computation. - If data in both corresponding DataFrame locations is missing - the result will be missing. - -Returns -------- -DataFrame - Result of the arithmetic operation. - -See Also --------- -DataFrame.add : Add DataFrames. -DataFrame.sub : Subtract DataFrames. -DataFrame.mul : Multiply DataFrames. -DataFrame.div : Divide DataFrames (float division). -DataFrame.truediv : Divide DataFrames (float division). -DataFrame.floordiv : Divide DataFrames (integer division). -DataFrame.mod : Calculate modulo (remainder after division). -DataFrame.pow : Calculate exponential power. - -Notes ------ -Mismatched indices will be unioned together. - -Examples --------- ->>> df = pd.DataFrame({{'angles': [0, 3, 4], -... 'degrees': [360, 180, 360]}}, -... index=['circle', 'triangle', 'rectangle']) ->>> df - angles degrees -circle 0 360 -triangle 3 180 -rectangle 4 360 - -Add a scalar with operator version which return the same -results. - ->>> df + 1 - angles degrees -circle 1 361 -triangle 4 181 -rectangle 5 361 - ->>> df.add(1) - angles degrees -circle 1 361 -triangle 4 181 -rectangle 5 361 - -Divide by constant with reverse version. - ->>> df.div(10) - angles degrees -circle 0.0 36.0 -triangle 0.3 18.0 -rectangle 0.4 36.0 - ->>> df.rdiv(10) - angles degrees -circle inf 0.027778 -triangle 3.333333 0.055556 -rectangle 2.500000 0.027778 - -Subtract a list and Series by axis with operator version. - ->>> df - [1, 2] - angles degrees -circle -1 358 -triangle 2 178 -rectangle 3 358 - ->>> df.sub([1, 2], axis='columns') - angles degrees -circle -1 358 -triangle 2 178 -rectangle 3 358 - ->>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), -... axis='index') - angles degrees -circle -1 359 -triangle 2 179 -rectangle 3 359 - -Multiply a DataFrame of different shape with operator version. - ->>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, -... index=['circle', 'triangle', 'rectangle']) ->>> other - angles -circle 0 -triangle 3 -rectangle 4 - ->>> df * other - angles degrees -circle 0 NaN -triangle 9 NaN -rectangle 16 NaN - ->>> df.mul(other, fill_value=0) - angles degrees -circle 0 0.0 -triangle 9 0.0 -rectangle 16 0.0 - -Divide by a MultiIndex by level. - ->>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], -... 'degrees': [360, 180, 360, 360, 540, 720]}}, -... index=[['A', 'A', 'A', 'B', 'B', 'B'], -... ['circle', 'triangle', 'rectangle', -... 'square', 'pentagon', 'hexagon']]) ->>> df_multindex - angles degrees -A circle 0 360 - triangle 3 180 - rectangle 4 360 -B square 4 360 - pentagon 5 540 - hexagon 6 720 - ->>> df.div(df_multindex, level=1, fill_value=0) - angles degrees -A circle NaN 1.0 - triangle 1.0 1.0 - rectangle 1.0 1.0 -B square 0.0 0.0 - pentagon 0.0 0.0 - hexagon 0.0 0.0 -""" - -_flex_comp_doc_FRAME = """ -Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). - -Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison -operators. 
- -Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis -(rows or columns) and level for comparison. - -Parameters ----------- -other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' - Whether to compare by the index (0 or 'index') or columns - (1 or 'columns'). -level : int or label - Broadcast across a level, matching Index values on the passed - MultiIndex level. - -Returns -------- -DataFrame of bool - Result of the comparison. - -See Also --------- -DataFrame.eq : Compare DataFrames for equality elementwise. -DataFrame.ne : Compare DataFrames for inequality elementwise. -DataFrame.le : Compare DataFrames for less than inequality - or equality elementwise. -DataFrame.lt : Compare DataFrames for strictly less than - inequality elementwise. -DataFrame.ge : Compare DataFrames for greater than inequality - or equality elementwise. -DataFrame.gt : Compare DataFrames for strictly greater than - inequality elementwise. - -Notes ------ -Mismatched indices will be unioned together. -`NaN` values are considered different (i.e. `NaN` != `NaN`). - -Examples --------- ->>> df = pd.DataFrame({{'cost': [250, 150, 100], -... 'revenue': [100, 250, 300]}}, -... index=['A', 'B', 'C']) ->>> df - cost revenue -A 250 100 -B 150 250 -C 100 300 - -Comparison with a scalar, using either the operator or method: - ->>> df == 100 - cost revenue -A False True -B False False -C True False - ->>> df.eq(100) - cost revenue -A False True -B False False -C True False - -When `other` is a :class:`Series`, the columns of a DataFrame are aligned -with the index of `other` and broadcast: - ->>> df != pd.Series([100, 250], index=["cost", "revenue"]) - cost revenue -A True True -B True False -C False True - -Use the method to control the broadcast axis: - ->>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index') - cost revenue -A True False -B True True -C True True -D True True - -When comparing to an arbitrary sequence, the number of columns must -match the number elements in `other`: - ->>> df == [250, 100] - cost revenue -A True True -B False False -C False False - -Use the method to control the axis: - ->>> df.eq([250, 250, 100], axis='index') - cost revenue -A True False -B False True -C True False - -Compare to a DataFrame of different shape. - ->>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, -... index=['A', 'B', 'C', 'D']) ->>> other - revenue -A 300 -B 250 -C 100 -D 150 - ->>> df.gt(other) - cost revenue -A False False -B False False -C False True -D False False - -Compare to a MultiIndex by level. - ->>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], -... 'revenue': [100, 250, 300, 200, 175, 225]}}, -... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], -... ['A', 'B', 'C', 'A', 'B', 'C']]) ->>> df_multindex - cost revenue -Q1 A 250 100 - B 150 250 - C 100 300 -Q2 A 150 200 - B 300 175 - C 220 225 - ->>> df.le(df_multindex, level=1) - cost revenue -Q1 A True True - B True True - C True True -Q2 A False True - B True False - C True False -""" - - -def _make_flex_doc(op_name, typ): - """ - Make the appropriate substitutions for the given operation and class-typ - into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring - to attach to a generated method. - - Parameters - ---------- - op_name : str {'__add__', '__sub__', ... 
'__eq__', '__ne__', ...} - typ : str {series, 'dataframe']} - - Returns - ------- - doc : str - """ - op_name = op_name.replace("__", "") - op_desc = _op_descriptions[op_name] - - if op_name.startswith("r"): - equiv = "other " + op_desc["op"] + " " + typ - else: - equiv = typ + " " + op_desc["op"] + " other" - - if typ == "series": - base_doc = _flex_doc_SERIES - doc_no_examples = base_doc.format( - desc=op_desc["desc"], - op_name=op_name, - equiv=equiv, - reverse=op_desc["reverse"], - ) - if op_desc["series_examples"]: - doc = doc_no_examples + op_desc["series_examples"] - else: - doc = doc_no_examples - elif typ == "dataframe": - base_doc = _flex_doc_FRAME - doc = base_doc.format( - desc=op_desc["desc"], - op_name=op_name, - equiv=equiv, - reverse=op_desc["reverse"], - ) - else: - raise AssertionError("Invalid typ argument.") - return doc - - # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py new file mode 100644 index 00000000000000..93f197366cf32a --- /dev/null +++ b/pandas/core/ops/docstrings.py @@ -0,0 +1,675 @@ +""" +Templating for ops docstrings +""" +from typing import Dict, Optional + + +def _make_flex_doc(op_name, typ): + """ + Make the appropriate substitutions for the given operation and class-typ + into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring + to attach to a generated method. + + Parameters + ---------- + op_name : str {'__add__', '__sub__', ... '__eq__', '__ne__', ...} + typ : str {series, 'dataframe']} + + Returns + ------- + doc : str + """ + op_name = op_name.replace("__", "") + op_desc = _op_descriptions[op_name] + + if op_name.startswith("r"): + equiv = "other " + op_desc["op"] + " " + typ + else: + equiv = typ + " " + op_desc["op"] + " other" + + if typ == "series": + base_doc = _flex_doc_SERIES + doc_no_examples = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + reverse=op_desc["reverse"], + ) + if op_desc["series_examples"]: + doc = doc_no_examples + op_desc["series_examples"] + else: + doc = doc_no_examples + elif typ == "dataframe": + base_doc = _flex_doc_FRAME + doc = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + reverse=op_desc["reverse"], + ) + else: + raise AssertionError("Invalid typ argument.") + return doc + + +_add_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" + +_sub_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.subtract(b, fill_value=0) +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" + +_mul_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.multiply(b, fill_value=0) +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: 
float64 +""" + +_div_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.divide(b, fill_value=0) +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" + +_floordiv_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.floordiv(b, fill_value=0) +a 1.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + +_mod_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.mod(b, fill_value=0) +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" +_pow_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.pow(b, fill_value=0) +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" + +_op_descriptions = { + # Arithmetic Operators + "add": { + "op": "+", + "desc": "Addition", + "reverse": "radd", + "series_examples": _add_example_SERIES, + }, + "sub": { + "op": "-", + "desc": "Subtraction", + "reverse": "rsub", + "series_examples": _sub_example_SERIES, + }, + "mul": { + "op": "*", + "desc": "Multiplication", + "reverse": "rmul", + "series_examples": _mul_example_SERIES, + "df_examples": None, + }, + "mod": { + "op": "%", + "desc": "Modulo", + "reverse": "rmod", + "series_examples": _mod_example_SERIES, + }, + "pow": { + "op": "**", + "desc": "Exponential power", + "reverse": "rpow", + "series_examples": _pow_example_SERIES, + "df_examples": None, + }, + "truediv": { + "op": "/", + "desc": "Floating division", + "reverse": "rtruediv", + "series_examples": _div_example_SERIES, + "df_examples": None, + }, + "floordiv": { + "op": "//", + "desc": "Integer division", + "reverse": "rfloordiv", + "series_examples": _floordiv_example_SERIES, + "df_examples": None, + }, + "divmod": { + "op": "divmod", + "desc": "Integer division and modulo", + "reverse": "rdivmod", + "series_examples": None, + "df_examples": None, + }, + # Comparison Operators + "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None}, + "ne": { + "op": "!=", + "desc": "Not equal to", + "reverse": None, + "series_examples": None, + }, + "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None}, + "le": { + "op": "<=", + "desc": "Less than or equal to", + "reverse": None, + "series_examples": None, + }, + "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None}, + "ge": { + "op": ">=", + "desc": "Greater than or equal to", + "reverse": None, + "series_examples": None, + }, +} # type: Dict[str, Dict[str, Optional[str]]] + +_op_names = list(_op_descriptions.keys()) +for key in _op_names: + reverse_op = _op_descriptions[key]["reverse"] + if reverse_op is not None: + _op_descriptions[reverse_op] = _op_descriptions[key].copy() + 
_op_descriptions[reverse_op]["reverse"] = key + +_flex_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. + +See Also +-------- +Series.{reverse} +""" + +_arith_doc_FRAME = """ +Binary operator %s with support to substitute a fill_value for missing data in +one of the inputs + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Returns +------- +result : DataFrame + +Notes +----- +Mismatched indices will be unioned together +""" + +_flex_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. + +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. +level : int or label + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing. + +Returns +------- +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. + +Notes +----- +Mismatched indices will be unioned together. + +Examples +-------- +>>> df = pd.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. 
+ +>>> df + 1 + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1) + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10) + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10) + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> df - [1, 2] + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns') + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index') + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other + angles +circle 0 +triangle 3 +rectangle 4 + +>>> df * other + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0) + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0) + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_comp_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +(rows or columns) and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +----- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... 
index=['A', 'B', 'C']) +>>> df + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Comparison with a scalar, using either the operator or method: + +>>> df == 100 + cost revenue +A False True +B False False +C True False + +>>> df.eq(100) + cost revenue +A False True +B False False +C True False + +When `other` is a :class:`Series`, the columns of a DataFrame are aligned +with the index of `other` and broadcast: + +>>> df != pd.Series([100, 250], index=["cost", "revenue"]) + cost revenue +A True True +B True False +C False True + +Use the method to control the broadcast axis: + +>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index') + cost revenue +A True False +B True True +C True True +D True True + +When comparing to an arbitrary sequence, the number of columns must +match the number elements in `other`: + +>>> df == [250, 100] + cost revenue +A True True +B False False +C False False + +Use the method to control the axis: + +>>> df.eq([250, 250, 100], axis='index') + cost revenue +A True False +B False True +C True False + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other) + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B', 'C']]) +>>> df_multindex + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1) + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False +""" From 51860f12bb17516b5dcf3ca2497131ef9439b309 Mon Sep 17 00:00:00 2001 From: Guillaume Gay Date: Fri, 12 Jul 2019 21:38:24 +0200 Subject: [PATCH 223/238] adds non regression test for GH27358 (#27360) --- pandas/tests/frame/test_constructors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a3817d3c226f53..7e6b707f01acfd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2397,6 +2397,13 @@ def test_from_records_len0_with_columns(self): assert result.index.name == "foo" tm.assert_index_equal(result.columns, expected) + def test_from_records_series_list_dict(self): + # GH27358 + expected = DataFrame([[{"a": 1, "b": 2}, {"a": 3, "b": 4}]]).T + data = Series([[{"a": 1, "b": 2}], [{"a": 3, "b": 4}]]) + result = DataFrame.from_records(data) + tm.assert_frame_equal(result, expected) + def test_to_frame_with_falsey_names(self): # GH 16114 result = Series(name=0).to_frame().dtypes From 269d36814e471880627109a5e5ea63d4a5fdc5a6 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Fri, 12 Jul 2019 18:19:37 -0400 Subject: [PATCH 224/238] Add small docs about xlsxwriter in user guide (#27366) --- doc/source/user_guide/io.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3050a630153926..eac86dda31507b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3231,6 +3231,10 @@ The look and feel of Excel worksheets created from pandas can be modified using * ``float_format`` : Format string for floating point numbers (default ``None``). 
* ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). +Using the `Xlsxwriter`_ engine provides many options for controlling the +format of an Excel worksheet created with the ``to_excel`` method. Excellent examples can be found in the +`Xlsxwriter`_ documentation here: https://xlsxwriter.readthedocs.io/working_with_pandas.html + .. _io.ods: OpenDocument Spreadsheets From 208bbe8d0b55c533011265d9168a4d0f4f8f20aa Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 15 Jul 2019 05:10:24 -0700 Subject: [PATCH 225/238] Added annotations to util._decorators (#27393) --- pandas/util/_decorators.py | 46 ++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index f39020f4165dfa..5c7d481ff2586e 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,14 +1,21 @@ from functools import wraps import inspect from textwrap import dedent +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import warnings from pandas._libs.properties import cache_readonly # noqa def deprecate( - name, alternative, version, alt_name=None, klass=None, stacklevel=2, msg=None -): + name: str, + alternative: Callable, + version: str, + alt_name: Optional[str] = None, + klass: Optional[Type[Warning]] = None, + stacklevel: int = 2, + msg: Optional[str] = None, +) -> Callable: """ Return a new function that emits a deprecation warning on use. @@ -80,7 +87,12 @@ def wrapper(*args, **kwargs): return wrapper -def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): +def deprecate_kwarg( + old_arg_name: str, + new_arg_name: Optional[str], + mapping: Optional[Union[Dict, Callable[[Any], Any]]] = None, + stacklevel: int = 2, +) -> Callable: """ Decorator to deprecate a keyword argument of a function. @@ -200,7 +212,9 @@ def wrapper(*args, **kwargs): return _deprecate_kwarg -def rewrite_axis_style_signature(name, extra_params): +def rewrite_axis_style_signature( + name: str, extra_params: List[Tuple[str, Any]] +) -> Callable: def decorate(func): @wraps(func) def wrapper(*args, **kwargs): @@ -265,11 +279,11 @@ def __init__(self, *args, **kwargs): self.params = args or kwargs - def __call__(self, func): + def __call__(self, func: Callable) -> Callable: func.__doc__ = func.__doc__ and func.__doc__ % self.params return func - def update(self, *args, **kwargs): + def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. @@ -278,18 +292,6 @@ def update(self, *args, **kwargs): self.params.update(*args, **kwargs) - @classmethod - def from_params(cls, params): - """ - In the case where the params is a mutable sequence (list or dictionary) - and it may change before this class is called, one may explicitly use a - reference to the params rather than using *args or **kwargs which will - copy the values and not reference them. 
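
A minimal usage sketch (not part of the diff) of the ``deprecate_kwarg`` decorator whose annotations are added above, assuming only the behaviour visible in the diff; the ``select`` function and the ``cols``/``columns`` keyword names are invented for illustration:

    import warnings

    from pandas.util._decorators import deprecate_kwarg

    @deprecate_kwarg(old_arg_name="cols", new_arg_name="columns")
    def select(columns=None):
        # hypothetical function: the wrapper rewrites a deprecated
        # ``cols=`` keyword to ``columns=`` before calling it
        return columns

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        assert select(cols=["a"]) == ["a"]      # old keyword still accepted
    assert issubclass(caught[-1].category, FutureWarning)  # but it warns
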
- """ - result = cls() - result.params = params - return result - class Appender: """ @@ -311,14 +313,14 @@ def my_dog(has='fleas'): pass """ - def __init__(self, addendum, join="", indents=0): + def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): if indents > 0: - self.addendum = indent(addendum, indents=indents) + self.addendum = indent(addendum, indents=indents) # type: Optional[str] else: self.addendum = addendum self.join = join - def __call__(self, func): + def __call__(self, func: Callable) -> Callable: func.__doc__ = func.__doc__ if func.__doc__ else "" self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] @@ -326,7 +328,7 @@ def __call__(self, func): return func -def indent(text, indents=1): +def indent(text: Optional[str], indents: int = 1) -> str: if not text or not isinstance(text, str): return "" jointext = "".join(["\n"] + [" "] * indents) From b9d6433a5776ff8be223c2ef3aeec27d53d60aff Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Mon, 15 Jul 2019 12:21:58 +0000 Subject: [PATCH 226/238] Fix compilation warnings (#27371) --- pandas/_libs/internals.pyx | 2 +- pandas/_libs/tslibs/c_timestamp.pyx | 2 +- pandas/_libs/window.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 0d45897de859ae..54ee4753ba3326 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -382,7 +382,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): object blkno list group_order - dict group_slices + dict group_dict int64_t[:] res_view n = blknos.shape[0] diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index f9d1a906207fe1..2d3ea3e14775ee 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -213,7 +213,7 @@ cdef class _Timestamp(datetime): def __add__(self, other): cdef: - int64_t other_int, nanos + int64_t other_int, nanos = 0 if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 46e4b17b8164cb..0a986942d2a09d 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1682,7 +1682,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] values, Assume len(weights) << len(values) """ cdef: - ndarray[float64_t] output, tot_wgt, counts + float64_t[:] output, tot_wgt, counts Py_ssize_t in_i, win_i, win_n, in_n float64_t val_in, val_win, c, w From 0f725bf472e0d1f5c0dddd7f98c00e0473dc8b4d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jul 2019 05:25:26 -0700 Subject: [PATCH 227/238] BUG: Fix take with read-only indexer, closes #17192 (#27375) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/algos_take_helper.pxi.in | 8 ++++---- pandas/tests/indexing/test_indexing.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index eeaafd7ad7d51a..7397ae8fda80cc 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1054,6 +1054,7 @@ Indexing - Bug in :class:`CategoricalIndex` and :class:`Categorical` incorrectly raising ``ValueError`` instead of ``TypeError`` when a list is passed using the ``in`` operator (``__contains__``) (:issue:`21729`) - Bug in setting a new value in a :class:`Series` with a :class:`Timedelta` object incorrectly casting the value to an 
integer (:issue:`22717`) - Bug in :class:`Series` setting a new key (``__setitem__``) with a timezone-aware datetime incorrectly raising ``ValueError`` (:issue:`12862`) +- Bug in :meth:`DataFrame.iloc` when indexing with a read-only indexer (:issue:`17192`) - Missing diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 2fea8b17fd9d7f..3a3adc71875ed2 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -148,7 +148,7 @@ def get_dispatch(dtypes): @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, - int64_t[:] indexer, + const int64_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): @@ -159,7 +159,7 @@ cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, @cython.wraparound(False) @cython.boundscheck(False) def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, - int64_t[:] indexer, + const int64_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): @@ -178,7 +178,7 @@ def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - int64_t[:] indexer, + const int64_t[:] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): {{inner_take_2d_axis0}} @@ -205,7 +205,7 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - int64_t[:] indexer, + const int64_t[:] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): {{inner_take_2d_axis1}} diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 77052de5e80e60..c29b0d644601a0 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1244,3 +1244,18 @@ def test_ndframe_indexing_raises(idxr, error, error_message): frame = NDFrame(np.random.randint(5, size=(2, 2, 2))) with pytest.raises(error, match=error_message): idxr(frame)[0] + + +def test_readonly_indices(): + # GH#17192 iloc with read-only array raising TypeError + df = pd.DataFrame({"data": np.ones(100, dtype="float64")}) + indices = np.array([1, 3, 6]) + indices.flags.writeable = False + + result = df.iloc[indices] + expected = df.loc[[1, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df["data"].iloc[indices] + expected = df["data"].loc[[1, 3, 6]] + tm.assert_series_equal(result, expected) From c104a0c9ce4c1dd0d63df121f0fa50657b6d6447 Mon Sep 17 00:00:00 2001 From: Enrico Rotundo Date: Mon, 15 Jul 2019 15:35:19 +0200 Subject: [PATCH 228/238] DOC: update file path descriptions in IO docstrings (#25164) --- pandas/io/excel/_base.py | 14 ++++++++++---- pandas/io/feather_format.py | 21 ++++++++++++++++----- pandas/io/html.py | 2 +- pandas/io/json/_json.py | 17 ++++++++++++----- pandas/io/packers.py | 14 ++++++++++++-- pandas/io/parquet.py | 14 ++++++++++++-- pandas/io/parsers.py | 14 +++++++------- pandas/io/pytables.py | 18 +++++++++++++----- pandas/io/sas/sasreader.py | 14 ++++++++++++-- pandas/io/stata.py | 12 ++++++++++-- 10 files changed, 105 insertions(+), 35 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index fae8f4203e9a0a..763b12949ba0a5 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -41,10 +41,16 @@ Parameters ---------- -io : str, file 
descriptor, pathlib.Path, ExcelFile or xlrd.Book - The string could be a URL. Valid URL schemes include http, ftp, s3, - gcs, and file. For file URLs, a host is expected. For instance, a local - file could be /path/to/workbook.xlsx. +io : str, ExcelFile, xlrd.Book, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.xlsx``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. sheet_name : str, int, list, or None, default 0 Strings are used for sheet names. Integers are used in zero-indexed sheet positions. Lists of strings/integers are used to request diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 05608f69c0d9da..296b1eef68d7d1 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -69,24 +69,35 @@ def to_feather(df, path): @deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") def read_feather(path, columns=None, use_threads=True): """ - Load a feather-format object from the file path + Load a feather-format object from the file path. .. versionadded 0.20.0 Parameters ---------- - path : string file path, or file-like object + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.feather``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. columns : sequence, default None - If not provided, all columns are read + If not provided, all columns are read. .. versionadded 0.24.0 nthreads : int, default 1 - Number of CPU threads to use when reading to pandas.DataFrame + Number of CPU threads to use when reading to pandas.DataFrame. .. versionadded 0.21.0 .. deprecated 0.24.0 use_threads : bool, default True - Whether to parallelize reading using multiple threads + Whether to parallelize reading using multiple threads. .. versionadded 0.24.0 diff --git a/pandas/io/html.py b/pandas/io/html.py index 91f5e5a949ac32..12c8ec4214b381 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -941,7 +941,7 @@ def read_html( Parameters ---------- - io : str or file-like + io : str, path object or file-like object A URL, a file-like object, or a raw string containing HTML. Note that lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f3c966bb1a476d..e2022490c3749a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -352,11 +352,18 @@ def read_json( Parameters ---------- - path_or_buf : a valid JSON string or file-like, default: None - The string could be a URL. Valid URL schemes include http, ftp, s3, - gcs, and file. For file URLs, a host is expected. 
For instance, a local - file could be ``file://localhost/path/to/table.json`` - + path_or_buf : a valid JSON str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.json``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. orient : string, Indication of expected JSON string format. Compatible JSON strings can be produced by ``to_json()`` with a diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 2e411fb07885fc..04e49708ff082b 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -156,7 +156,7 @@ def writer(fh): def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): """ Load msgpack pandas object from the specified - file path + file path. .. deprecated:: 0.25.0 @@ -166,7 +166,17 @@ def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): Parameters ---------- - path_or_buf : string File path, BytesIO like or string + path_or_buf : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) or + ``StringIO``. encoding : Encoding for decoding msgpack str type iterator : boolean, if True, return an iterator to the unpacker (default is False) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a2502df45169f1..617f4f44ae8afc 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -261,8 +261,18 @@ def read_parquet(path, engine="auto", columns=None, **kwargs): Parameters ---------- - path : string - File path + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 356934d457cc9c..6cc47b984914a1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -85,13 +85,12 @@ Parameters ---------- -filepath_or_buffer : str, path object, or file-like object +filepath_or_buffer : str, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: file://localhost/path/to/table.csv. - If you want to pass in a path object, pandas accepts either - ``pathlib.Path`` or ``py._path.local.LocalPath``. + If you want to pass in a path object, pandas accepts any ``os.PathLike``. 
By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. @@ -728,13 +727,14 @@ def read_fwf( Parameters ---------- - filepath_or_buffer : str, path object, or file-like object + filepath_or_buffer : str, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. + expected. A local file could be: + ``file://localhost/path/to/table.csv``. - If you want to pass in a path object, pandas accepts either - ``pathlib.Path`` or ``py._path.local.LocalPath``. + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1db177d792401b..3433d256092552 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -289,11 +289,19 @@ def read_hdf(path_or_buf, key=None, mode="r", **kwargs): Parameters ---------- - path_or_buf : string, buffer or path object - Path to the file to open, or an open :class:`pandas.HDFStore` object. - Supports any object implementing the ``__fspath__`` protocol. - This includes :class:`pathlib.Path` and py._path.local.LocalPath - objects. + path_or_buf : str, path object, pandas.HDFStore or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.h5``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. .. versionadded:: 0.19.0 support for pathlib, py.path. .. versionadded:: 0.21.0 support for __fspath__ protocol. diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 680425f421eec5..571c544d48b294 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -17,8 +17,18 @@ def read_sas( Parameters ---------- - filepath_or_buffer : string or file-like object - Path to the SAS file. + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.sas``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. format : string {'xport', 'sas7bdat'} or None If None, file format is inferred from file extension. If 'xport' or 'sas7bdat', uses the corresponding format. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 29cb2a5dc0f0e0..8dbcee829ee1e3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -94,8 +94,16 @@ Parameters ---------- -filepath_or_buffer : string or file-like object - Path to .dta file or object implementing a binary read() functions. +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. 
The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.dta``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. %s %s %s From 2ed1f28cfa8e5bd0a6a1723c2952da7a946b1c33 Mon Sep 17 00:00:00 2001 From: tadeja Date: Mon, 15 Jul 2019 19:11:31 +0200 Subject: [PATCH 229/238] DOC: Explicitly include "private" ExtensionArray methods in API docs (#27279) --- doc/source/reference/extensions.rst | 36 ++++++++++++++++- pandas/core/arrays/base.py | 62 ++++++++++++++++++++++++----- pandas/core/indexes/category.py | 4 +- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 34f76642119c8d..407aab4bb1f1b7 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -18,10 +18,44 @@ objects. api.extensions.register_series_accessor api.extensions.register_index_accessor api.extensions.ExtensionDtype - api.extensions.ExtensionArray .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst + api.extensions.ExtensionArray arrays.PandasArray + +.. We need this autosummary so that methods and attributes are generated. +.. Separate block, since they aren't classes. + + .. autosummary:: + :toctree: api/ + + api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._formatter + api.extensions.ExtensionArray._formatting_values + api.extensions.ExtensionArray._from_factorized + api.extensions.ExtensionArray._from_sequence + api.extensions.ExtensionArray._from_sequence_of_strings + api.extensions.ExtensionArray._ndarray_values + api.extensions.ExtensionArray._reduce + api.extensions.ExtensionArray._values_for_argsort + api.extensions.ExtensionArray._values_for_factorize + api.extensions.ExtensionArray.argsort + api.extensions.ExtensionArray.astype + api.extensions.ExtensionArray.copy + api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.factorize + api.extensions.ExtensionArray.fillna + api.extensions.ExtensionArray.isna + api.extensions.ExtensionArray.ravel + api.extensions.ExtensionArray.repeat + api.extensions.ExtensionArray.searchsorted + api.extensions.ExtensionArray.shift + api.extensions.ExtensionArray.take + api.extensions.ExtensionArray.unique + api.extensions.ExtensionArray.dtype + api.extensions.ExtensionArray.nbytes + api.extensions.ExtensionArray.ndim + api.extensions.ExtensionArray.shape diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2a5556ff6d357a..ee796f9896b52d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -39,6 +39,39 @@ class ExtensionArray: .. 
versionadded:: 0.23.0 + Attributes + ---------- + dtype + nbytes + ndim + shape + + Methods + ------- + argsort + astype + copy + dropna + factorize + fillna + isna + ravel + repeat + searchsorted + shift + take + unique + _concat_same_type + _formatter + _formatting_values + _from_factorized + _from_sequence + _from_sequence_of_strings + _ndarray_values + _reduce + _values_for_argsort + _values_for_factorize + Notes ----- The interface includes the following abstract methods that must be @@ -170,7 +203,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): Returns ------- ExtensionArray - """ raise AbstractMethodError(cls) @@ -188,7 +220,7 @@ def _from_factorized(cls, values, original): See Also -------- - pandas.factorize + factorize ExtensionArray.factorize """ raise AbstractMethodError(cls) @@ -654,7 +686,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra See Also -------- - pandas.factorize : Top-level factorize method that dispatches here. + factorize : Top-level factorize method that dispatches here. Notes ----- @@ -778,6 +810,11 @@ def take( When `indices` contains negative values other than ``-1`` and `allow_fill` is True. + See Also + -------- + numpy.take + api.extensions.take + Notes ----- ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, @@ -785,11 +822,6 @@ def take( it's called by :meth:`Series.reindex`, or any other method that causes realignment, with a `fill_value`. - See Also - -------- - numpy.take - pandas.api.extensions.take - Examples -------- Here's an example implementation, which relies on casting the @@ -862,7 +894,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: Parameters ---------- - boxed: bool, default False + boxed : bool, default False An indicated for whether or not your array is being printed within a Series, DataFrame, or Index (True), or just by itself (False). This may be useful if you want scalar values @@ -889,6 +921,10 @@ def _formatting_values(self) -> np.ndarray: .. deprecated:: 0.24.0 Use :meth:`ExtensionArray._formatter` instead. + + Returns + ------- + array : ndarray """ return np.array(self) @@ -904,6 +940,10 @@ def ravel(self, order="C") -> ABCExtensionArray: ---------- order : {None, 'C', 'F', 'A', 'K'}, default 'C' + Returns + ------- + ExtensionArray + Notes ----- - Because ExtensionArrays are 1D-only, this is a no-op. @@ -944,6 +984,10 @@ def _ndarray_values(self) -> np.ndarray: The expectation is that this is cheap to compute, and is primarily used for interacting with our indexers. + + Returns + ------- + array : ndarray """ return np.array(self) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 8f605e487ecf49..e14bf7f86c0bee 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -306,12 +306,12 @@ def _is_dtype_compat(self, other): def equals(self, other): """ - Determine if two CategorialIndex objects contain the same elements. + Determine if two CategoricalIndex objects contain the same elements. Returns ------- bool - If two CategorialIndex objects have equal elements True, + If two CategoricalIndex objects have equal elements True, otherwise False. 
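
A one-line illustration (not part of the diff) of the ``equals`` behaviour the corrected docstring describes, element-wise comparison of two indexes; the values are invented:

    import pandas as pd

    ci = pd.CategoricalIndex(["a", "b", "a"])
    assert ci.equals(pd.CategoricalIndex(["a", "b", "a"]))      # same elements
    assert not ci.equals(pd.CategoricalIndex(["a", "a", "b"]))  # same values, different order
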
""" if self.is_(other): From 1fbc16d0ec31b82be5bd6db31edc011f3a5f82a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jul 2019 11:15:42 -0700 Subject: [PATCH 230/238] disallow np.timedelta64 in is_integer (#27401) --- pandas/_libs/tslibs/util.pxd | 3 ++- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/internals/blocks.py | 6 ++---- pandas/tests/dtypes/test_inference.py | 4 +--- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index dc32dcd5e0b21a..07c2805dd0ef61 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -76,7 +76,8 @@ cdef inline bint is_integer_object(object obj) nogil: ----- This counts np.timedelta64 objects as integers. """ - return not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj) + return (not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj) + and not is_timedelta64_object(obj)) cdef inline bint is_float_object(object obj) nogil: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ecadd11894bfb3..5a2dece98150fd 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -593,7 +593,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): return lbound else: return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") - elif (is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label): + elif is_integer(label) or is_float(label): self._invalid_indexer("slice", label) return label diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f931df25c4fd53..24e0a7fbad0a5f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2609,15 +2609,13 @@ def _can_hold_element(self, element): return issubclass(tipo.type, (np.timedelta64, np.int64)) elif element is NaT: return True - return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64, np.int64) - ) + return is_integer(element) or isinstance(element, (timedelta, np.timedelta64)) def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as nanoseconds - if is_integer(value) and not isinstance(value, np.timedelta64): + if is_integer(value): # Deprecation GH#24694, GH#19233 warnings.warn( "Passing integers to fillna is deprecated, will " diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0b440e0186fbca..4d688976cd50b1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1203,9 +1203,7 @@ def test_is_integer(self): assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_integer(timedelta(1000)) assert not is_integer(Timedelta("1 days")) - - # questionable - assert is_integer(np.timedelta64(1, "D")) + assert not is_integer(np.timedelta64(1, "D")) def test_is_float(self): assert is_float(1.1) From 568809601e727de3946584fba3365ea41c41b0a4 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 15 Jul 2019 11:17:37 -0700 Subject: [PATCH 231/238] Easy warning fixups for mypy (#27402) --- pandas/core/computation/pytables.py | 6 ++---- pandas/io/json/_normalize.py | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index e4e005c024345a..8ba01670bd8794 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -90,9 +90,7 @@ def pr(left, right): k = klass if isinstance(left, ConditionBinOp): 
- if isinstance(left, ConditionBinOp) and isinstance( - right, ConditionBinOp - ): + if isinstance(right, ConditionBinOp): k = JointConditionBinOp elif isinstance(left, k): return left @@ -100,7 +98,7 @@ def pr(left, right): return right elif isinstance(left, FilterBinOp): - if isinstance(left, FilterBinOp) and isinstance(right, FilterBinOp): + if isinstance(right, FilterBinOp): k = JointFilterBinOp elif isinstance(left, k): return left diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index a6fde86297a3d7..24a255c78f3c07 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -112,7 +112,7 @@ def nested_to_record( def json_normalize( - data: List[Dict], + data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, meta: Optional[Union[str, List]] = None, meta_prefix: Optional[str] = None, @@ -280,8 +280,6 @@ def _pull_field(js, spec): lengths = [] meta_vals = defaultdict(list) # type: DefaultDict - if not isinstance(sep, str): - sep = str(sep) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): From d4e2734ffb7a13fdf10f0ed8ad59521841e8ed07 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jul 2019 14:11:13 -0600 Subject: [PATCH 232/238] Added 32-bit build (#27274) --- ci/azure/posix.yml | 6 ++++++ ci/deps/azure-36-32bit.yaml | 20 ++++++++++++++++++++ ci/setup_env.sh | 6 ++++++ pandas/tests/groupby/test_grouping.py | 2 +- 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 ci/deps/azure-36-32bit.yaml diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index c5676e0a2a6a0f..39f862290e7207 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -33,6 +33,12 @@ jobs: PATTERN: "not slow and not network" LOCALE_OVERRIDE: "it_IT.UTF-8" + py36_32bit: + ENV_FILE: ci/deps/azure-36-32bit.yaml + CONDA_PY: "36" + PATTERN: "not slow and not network" + BITS32: "yes" + py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml new file mode 100644 index 00000000000000..43bf0ecdd6c3e6 --- /dev/null +++ b/ci/deps/azure-36-32bit.yaml @@ -0,0 +1,20 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - gcc_linux-32 + - gcc_linux-32 + - gxx_linux-32 + - cython=0.28.2 + - numpy=1.14.* + - python-dateutil + - python=3.6.* + - pytz=2017.2 + # universal + - pytest>=4.0.2,<5.0.0 + - pytest-xdist + - pytest-mock + - pytest-azurepipelines + - hypothesis>=3.58.0 + - pip diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 8f73bb228e2bd7..88742e0483c7e2 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -94,6 +94,12 @@ echo echo "conda env create -q --file=${ENV_FILE}" time conda env create -q --file="${ENV_FILE}" + +if [[ "$BITS32" == "yes" ]]; then + # activate 32-bit compiler + export CONDA_BUILD=1 +fi + echo "activate pandas-dev" source activate pandas-dev diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 72099f2fa3f11d..403f5f11ee7686 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -616,7 +616,7 @@ def test_groupby_empty(self): # check group properties assert len(gr.grouper.groupings) == 1 tm.assert_numpy_array_equal( - gr.grouper.group_info[0], np.array([], dtype=np.dtype("intp")) + gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64")) ) tm.assert_numpy_array_equal( From 7b61952c908bd94066f1786d8b05b5c7bbc65ed3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: 
Mon, 15 Jul 2019 13:13:02 -0700 Subject: [PATCH 233/238] Fix _can_hold_element for datetimelike blocks (#27347) --- pandas/core/dtypes/missing.py | 2 +- pandas/core/internals/blocks.py | 42 ++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 6a681954fd9022..bea73d72b91c9d 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -574,7 +574,7 @@ def is_valid_nat_for_dtype(obj, dtype): ------- bool """ - if not isna(obj): + if not lib.is_scalar(obj) or not isna(obj): return False if dtype.kind == "M": return not isinstance(obj, np.timedelta64) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 24e0a7fbad0a5f..e02fecf0ef1140 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -9,7 +9,8 @@ from pandas._libs import NaT, lib, tslib, tslibs import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion, is_null_datetimelike +from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -60,7 +61,13 @@ ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.missing import _isna_compat, array_equivalent, isna, notna +from pandas.core.dtypes.missing import ( + _isna_compat, + array_equivalent, + is_valid_nat_for_dtype, + isna, + notna, +) import pandas.core.algorithms as algos from pandas.core.arrays import ( @@ -2248,14 +2255,17 @@ def _astype(self, dtype, **kwargs): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return tipo == _NS_DTYPE or tipo == np.int64 + return is_dtype_equal(tipo, self.dtype) + elif element is NaT: + return True elif isinstance(element, datetime): + if self.is_datetimetz: + return tz_compare(element.tzinfo, self.dtype.tz) return element.tzinfo is None elif is_integer(element): return element == tslibs.iNaT - # TODO: shouldnt we exclude timedelta64("NaT")? See GH#27297 - return isna(element) + return is_valid_nat_for_dtype(element, self.dtype) def _coerce_values(self, values): return values.view("i8") @@ -2275,8 +2285,10 @@ def _try_coerce_args(self, other): ------- base-type other """ - if is_null_datetimelike(other): + if is_valid_nat_for_dtype(other, self.dtype): other = tslibs.iNaT + elif is_integer(other) and other == tslibs.iNaT: + pass elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) if getattr(other, "tz") is not None: @@ -2359,6 +2371,8 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_datetimetz = True is_extension = True + _can_hold_element = DatetimeBlock._can_hold_element + @property def _holder(self): return DatetimeArray @@ -2465,8 +2479,10 @@ def _try_coerce_args(self, other): # add the tz back other = self._holder(other, dtype=self.dtype) - elif is_null_datetimelike(other): + elif is_valid_nat_for_dtype(other, self.dtype): other = tslibs.iNaT + elif is_integer(other) and other == tslibs.iNaT: + pass elif isinstance(other, self._holder): if other.tz != self.values.tz: raise ValueError("incompatible or non tz-aware value") @@ -2606,10 +2622,16 @@ def _box_func(self): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: + # TODO: remove the np.int64 support once coerce_values and + # _try_coerce_args both coerce to m8[ns] and not i8. 
return issubclass(tipo.type, (np.timedelta64, np.int64)) elif element is NaT: return True - return is_integer(element) or isinstance(element, (timedelta, np.timedelta64)) + elif isinstance(element, (timedelta, np.timedelta64)): + return True + elif is_integer(element): + return element == tslibs.iNaT + return is_valid_nat_for_dtype(element, self.dtype) def fillna(self, value, **kwargs): @@ -2645,8 +2667,10 @@ def _try_coerce_args(self, other): base-type other """ - if is_null_datetimelike(other): + if is_valid_nat_for_dtype(other, self.dtype): other = tslibs.iNaT + elif is_integer(other) and other == tslibs.iNaT: + pass elif isinstance(other, (timedelta, np.timedelta64)): other = Timedelta(other).value elif hasattr(other, "dtype") and is_timedelta64_dtype(other): From 4dd2e3f5834076a1911c624ef0a5e123b3b72721 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jul 2019 13:19:27 -0700 Subject: [PATCH 234/238] Assorted cleanups (#27376) --- pandas/_libs/src/klib/khash_python.h | 2 +- pandas/core/indexing.py | 27 +++++++------------ pandas/core/internals/blocks.py | 39 +++++++++++---------------- pandas/core/internals/construction.py | 4 +-- pandas/core/internals/managers.py | 14 +++++----- 5 files changed, 34 insertions(+), 52 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index a81f9785ebe64b..82251744915a52 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -106,7 +106,7 @@ khint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, return result; } -khint_t PANDAS_INLINE kh_get_str_starts_item(kh_str_starts_t* table, char* key) { +khint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) { unsigned char ch = *key; if (table->starts[ch]) { if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 482e9c365420cd..5aee37bc3b833e 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,4 +1,5 @@ import textwrap +from typing import Tuple import warnings import numpy as np @@ -936,7 +937,7 @@ def _getitem_lowerdim(self, tup): new_key = b, a if len(new_key) == 1: - new_key, = new_key + new_key = new_key[0] # Slices should return views, but calling iloc/loc with a null # slice returns a new object. 
@@ -1250,7 +1251,7 @@ def _convert_to_indexer( # a positional if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): raise ValueError( - "cannot set by positional indexing with " "enlargement" + "cannot set by positional indexing with enlargement" ) return obj @@ -1408,7 +1409,7 @@ def __getitem__(self, key): maybe_callable = com.apply_if_callable(key, self.obj) return self._getitem_axis(maybe_callable, axis=axis) - def _is_scalar_access(self, key): + def _is_scalar_access(self, key: Tuple): raise NotImplementedError() def _getitem_scalar(self, key): @@ -1709,14 +1710,11 @@ def _validate_key(self, key, axis: int): if not is_list_like_indexer(key): self._convert_scalar_indexer(key, axis) - def _is_scalar_access(self, key): + def _is_scalar_access(self, key: Tuple): # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, "__len__"): - return False - if len(key) != self.ndim: return False @@ -2000,7 +1998,7 @@ def _validate_key(self, key, axis: int): # check that the key has a numeric dtype if not is_numeric_dtype(arr.dtype): raise IndexError( - ".iloc requires numeric indexers, got " "{arr}".format(arr=arr) + ".iloc requires numeric indexers, got {arr}".format(arr=arr) ) # check that the key does not exceed the maximum size of the index @@ -2015,14 +2013,11 @@ def _validate_key(self, key, axis: int): def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) - def _is_scalar_access(self, key): + def _is_scalar_access(self, key: Tuple): # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, "__len__"): - return False - if len(key) != self.ndim: return False @@ -2131,9 +2126,7 @@ def _getitem_axis(self, key, axis: int): else: key = item_from_zerodim(key) if not is_integer(key): - raise TypeError( - "Cannot index by location index with a " "non-integer key" - ) + raise TypeError("Cannot index by location index with a non-integer key") # validate the location self._validate_integer(key, axis) @@ -2191,7 +2184,7 @@ def __setitem__(self, key, value): if not isinstance(key, tuple): key = self._tuplify(key) if len(key) != self.obj.ndim: - raise ValueError("Not enough indexers for scalar access " "(setting)!") + raise ValueError("Not enough indexers for scalar access (setting)!") key = list(self._convert_key(key, is_setter=True)) key.append(value) self.obj._set_value(*key, takeable=self._takeable) @@ -2327,7 +2320,7 @@ def _convert_key(self, key, is_setter: bool = False): """ require integer args (and convert to label arguments) """ for a, i in zip(self.obj.axes, key): if not is_integer(i): - raise ValueError("iAt based indexing can only have integer " "indexers") + raise ValueError("iAt based indexing can only have integer indexers") return key diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e02fecf0ef1140..897a82f9a19687 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -42,7 +42,6 @@ is_integer_dtype, is_interval_dtype, is_list_like, - is_numeric_v_string_like, is_object_dtype, is_period_dtype, is_re, @@ -1304,24 +1303,20 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): if fill_tuple is None: fill_value = 
self.fill_value - new_values = algos.take_nd( - values, indexer, axis=axis, allow_fill=False, fill_value=fill_value - ) + allow_fill = False else: fill_value = fill_tuple[0] - new_values = algos.take_nd( - values, indexer, axis=axis, allow_fill=True, fill_value=fill_value - ) + allow_fill = True + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value + ) + + # Called from three places in managers, all of which satisfy + # this assertion + assert not (axis == 0 and new_mgr_locs is None) if new_mgr_locs is None: - if axis == 0: - slc = libinternals.indexer_as_slice(indexer) - if slc is not None: - new_mgr_locs = self.mgr_locs[slc] - else: - new_mgr_locs = self.mgr_locs[indexer] - else: - new_mgr_locs = self.mgr_locs + new_mgr_locs = self.mgr_locs if not is_dtype_equal(new_values.dtype, self.dtype): return self.make_block(new_values, new_mgr_locs) @@ -1865,11 +1860,11 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # if its REALLY axis 0, then this will be a reindex and not a take new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) - if self.ndim == 1 and new_mgr_locs is None: - new_mgr_locs = [0] - else: - if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs + # Called from three places in managers, all of which satisfy + # this assertion + assert not (self.ndim == 1 and new_mgr_locs is None) + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs return self.make_block_same_class(new_values, new_mgr_locs) @@ -3388,10 +3383,6 @@ def _putmask_smart(v, m, n): # if we have nulls if not _isna_compat(v, nn[0]): pass - elif is_numeric_v_string_like(nn, v): - # avoid invalid dtype comparisons - # between numbers & strings - pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats pass diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b4752039cf5b1d..dbb01fc055d5cd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -355,7 +355,7 @@ def extract_index(data): raw_lengths.append(len(val)) if not indexes and not raw_lengths: - raise ValueError("If using all scalar values, you must pass" " an index") + raise ValueError("If using all scalar values, you must pass an index") if have_series: index = _union_indexes(indexes) @@ -369,7 +369,7 @@ def extract_index(data): if have_dicts: raise ValueError( - "Mixing dicts with non-Series may lead to " "ambiguous ordering." + "Mixing dicts with non-Series may lead to ambiguous ordering." ) if have_series: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 44cc61d163b4d9..2e7280eeae0e2d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1061,7 +1061,7 @@ def value_getitem(placement): if value.shape[1:] != self.shape[1:]: raise AssertionError( - "Shape of new values must be compatible " "with manager shape" + "Shape of new values must be compatible with manager shape" ) try: @@ -1154,7 +1154,7 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert(self, loc, item, value, allow_duplicates=False): + def insert(self, loc: int, item, value, allow_duplicates: bool = False): """ Insert item at selected position. 
@@ -1389,9 +1389,7 @@ def take(self, indexer, axis=1, verify=True, convert=True): if verify: if ((indexer == -1) | (indexer >= n)).any(): - raise Exception( - "Indices must be nonzero and less than " "the axis length" - ) + raise Exception("Indices must be nonzero and less than the axis length") new_labels = self.axes[axis].take(indexer) return self.reindex_indexer( @@ -1478,7 +1476,7 @@ def __init__( if isinstance(axis, list): if len(axis) != 1: raise ValueError( - "cannot create SingleBlockManager with more " "than 1 axis" + "cannot create SingleBlockManager with more than 1 axis" ) axis = axis[0] @@ -1492,7 +1490,7 @@ def __init__( block = [np.array([])] elif len(block) != 1: raise ValueError( - "Cannot create SingleBlockManager with " "more than 1 block" + "Cannot create SingleBlockManager with more than 1 block" ) block = block[0] else: @@ -1509,7 +1507,7 @@ def __init__( if len(block) != 1: raise ValueError( - "Cannot create SingleBlockManager with " "more than 1 block" + "Cannot create SingleBlockManager with more than 1 block" ) block = block[0] From 24bd67ec70cc7ab7e6a8747609dc932be8c21f00 Mon Sep 17 00:00:00 2001 From: Tilen Kusterle Date: Mon, 15 Jul 2019 20:20:19 +0000 Subject: [PATCH 235/238] TST: add NaN tests for all data types (#12535) (#27378) --- pandas/tests/window/test_dtypes.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index ab2915a333afd8..9d023034c570ac 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -53,6 +53,18 @@ def get_expects(self): "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), }, + "sr3": { + "count": Series([1, 2, 2, 1, 1], dtype="float64"), + "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), + "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), + "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), + "std": Series( + [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, dtype="float64" + ), + "var": Series([np.nan, 0.5, 0.5, np.nan, np.nan], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), + }, "df": { "count": DataFrame( {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, @@ -99,9 +111,11 @@ def get_expects(self): def _create_dtype_data(self, dtype): sr1 = Series(np.arange(5), dtype=dtype) sr2 = Series(np.arange(10, 0, -2), dtype=dtype) + sr3 = sr1.copy() + sr3[3] = np.NaN df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - data = {"sr1": sr1, "sr2": sr2, "df": df} + data = {"sr1": sr1, "sr2": sr2, "sr3": sr3, "df": df} return data From 5614dadf244ab0516d5712353e28a7a296dede80 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Tue, 16 Jul 2019 15:25:39 +0000 Subject: [PATCH 236/238] CLN: docstring (#27410) --- pandas/core/groupby/grouper.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index a127d092b7b1aa..f8417c3f01eac8 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -41,9 +41,8 @@ class Grouper: level and/or axis parameters are given, a level of the index of the target object. 
- These are local specifications and will override 'global' settings, - that is the parameters axis and level which are passed to the groupby - itself. + If `axis` and/or `level` are passed as keywords to both `Grouper` and + `groupby`, the values passed to `Grouper` take precedence. Parameters ---------- From 7259371c3038724154b3f13ecd906df5d2ad86d2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 Jul 2019 15:19:29 -0400 Subject: [PATCH 237/238] CI: limit pytest version on 3.6 (#27416) --- ci/deps/azure-37-numpydev.yaml | 3 ++- ci/deps/azure-macos-35.yaml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index c56dc819a90b1e..5cf897c98da100 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -17,4 +17,5 @@ dependencies: - "--pre" - "numpy" - "scipy" - - pytest-azurepipelines + # https://github.com/pandas-dev/pandas/issues/27421 + - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 0b96dd9762ef5d..98859b596ab2a3 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -29,4 +29,6 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 - - pytest-azurepipelines + # https://github.com/pandas-dev/pandas/issues/27421 + - pytest-azurepipelines<1.0.0 + From 26bd34df233e3f103922fe11e238c1532f3e58a0 Mon Sep 17 00:00:00 2001 From: pilkibun <51503352+pilkibun@users.noreply.github.com> Date: Tue, 16 Jul 2019 19:42:49 +0000 Subject: [PATCH 238/238] CLN: fix compiler warnings in tzconversion.pyx (#27412) --- pandas/_libs/tslibs/tzconversion.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 26a64c13f6de1d..dd0c6fc75b06f3 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -96,6 +96,8 @@ timedelta-like} result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=True) return result + # silence false-positive compiler warning + ambiguous_array = np.empty(0, dtype=bool) if isinstance(ambiguous, str): if ambiguous == 'infer': infer_dst = True @@ -159,6 +161,8 @@ timedelta-like} if v_right + deltas[pos_right] == val: result_b[i] = v_right + # silence false-positive compiler warning + dst_hours = np.empty(0, dtype=np.int64) if infer_dst: dst_hours = np.empty(n, dtype=np.int64) dst_hours[:] = NPY_NAT

diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle
deleted file mode 100644
index 75ea95ff402c4e9f0c93ef80e6c04baf7f0a70d7..0000000000000000000000000000000000000000
GIT binary patch
zDKQnM#x$4~(_wndfEh6pX2vX-6|-S>%z-&E7v@G6%!7F`ALhpb=!*Y`rFx7NZ3hr2 zyjAUeu5H`4ZQDNAwr!tl+qP}nwr$OunbkjE(xge#rd=5@BWA+Pm<6+9Hq4GWFem21 z+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R z+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfF zFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^Ya zCeFgyI0xtAJe-dUa3LSeN zC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{U zCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=n zVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WUR zVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qc zVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwd zurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgy zI0xtAJe-dUa3LSeNC+@=C zxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^ zcn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG z2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1| z1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n z4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz* z{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtA zJe-dUa3LSeNC+@=CxCi&* zKHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y| zJ-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih z!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9% z!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L z!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>! z;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dU za3LSeNC+@=CxCi&*KHQH7 z@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+( z@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*a zlVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYt zi()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JH zn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1 zLvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(- z!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J z$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPV zjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y> zjwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F; zjx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb# z!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb z;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F z;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuD zroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@p zmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0 zTVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{Imr zN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmq zC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}X zFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i z2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l z2FqeOERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0 zjcu?kw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_P zjbm^uj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym z!qa#L&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4 z!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3X zOph5bBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeO 
zERPkiB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?k zw!`+=0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^u zj>GXd0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L z&*C{ej~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l) z-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5b zBWA+Pm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPki zB38o6SOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+= z0Xt$R?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd z0Vm=loQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{e zj~DPFUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!b zk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+P zm<6+9Hq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6 zSOu$MHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R z?2KKoD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=l zoQz>O1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPF zUc$?G1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9 zHq4GWFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$M zHLQ*`uqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKo zD|W-~*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O z1*hUPoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G z1+U^YypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nch zFeb*r*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GW zFem21+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*` zuqM{R+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~ z*aLfFFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUP zoQ^YaCeFgyI0xtAJe-dUa3LSeNC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^Y zypA{UCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00nchFeb*r z*cb=nVmyqG2{0ih!o-*alVUPVjwvuDroz;i2Ge3XOph5bBWA+Pm<6+9Hq4GWFem21 z+?WURVm{1|1+X9%!opYti()Y>jwP@pmcr6l2FqeOERPkiB38o6SOu$MHLQ*`uqM{R z+E@qcVm+*n4X`0L!p7JHn_@F;jx8`0TVgA0jcu?kw!`+=0Xt$R?2KKoD|W-~*aLfF zFYJwdurKz*{x|>!;vgK1LvSb#!{ImrN8%_Pjbm^uj>GXd0Vm=loQz>O1*hUPoQ^Ya zCeFgyI0xtAJe-dUa3LSeN zC+@=CxCi&*KHQH7@E{(-!*~Rb;xRmqC-5Ym!qa#L&*C{ej~DPFUc$?G1+U^YypA{U zCf>r^cn9y|J-m+(@F70J$M^)F;xl}XFYqP4!q@l)-{L!bk00r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb z2jgNqjE@O0Atu7am;{qzGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V z2lHY+%#Q`IAQr;HSOkk=F)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY z2kT-ztd9+_AvVIs*aVwmGi;76Fce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_ zjeW2$_QU=-00-hA9E?M7C=SEnI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx( z!r3?n=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN z!rizB_u@X>j|cD|9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj z!rOQU@8UhYj}P!6KElWN1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61 z!r%A@|KdNi64QTl&_xdgN>r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNq zjE@O0Atu7am;{qzGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+ z%#Q`IAQr;HSOkk=F)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-z ztd9+_AvVIs*aVwmGi;76Fce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$ z_QU=-00-hA9E?M7C=SEnI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx(!r3?n z=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB z_u@X>j|cD|9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU z@8UhYj}P!6KElWN1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@ z|KdNilF)y2&_xdgN>r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0 zAtu7am;{qzGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`I 
zAQr;HSOkk=F)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_ zAvVIs*aVwmGi;76Fce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$_QU=- z00-hA9E?M7C=SEnI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx(!r3?n=i)q^ zj|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X> zj|cD|9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhY zj}P!6KElWN1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|KdNi zlG1;4&_xdgN>r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0Atu7a zm;{qzGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`IAQr;H zSOkk=F)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs z*aVwmGi;76Fce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$_QU=-00-hA z9E?M7C=SEnI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx(!r3?n=i)q^j|*@i zF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD| z9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6 zKElWN1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|KdNilF@&3 z&_xdgN>r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0Atu7am;{qz zGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`IAQr;HSOkk= zF)WTHuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs*aVwm zGi;76Fce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$_QU=-00-hA9E?M7 zC=SEnI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ z1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T z1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN z1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|KdNilGA^5&_xdg zN>r%PppOBDU^onq5ilY~!pIl}qhd6SjxjJM#=_Vb2jgNqjE@O0Atu7am;{qzGE9ys zFeRqK)R+d-VmeHZ889Pe!pxWjvtl;PjyW(V=EB^V2lHY+%#Q`IAQr;HSOkk=F)WTH zuq2kk(pUz|VmU026|f>!!pc|$t70{*jy13**23CY2kT-ztd9+_AvVIs*aVwmGi;76 zFce#2D{PHzur0R3_SgYCVkhj3U9c;5!|vDvdtxu_jeW2$_QU=-00-hA9E?M7C=SEn zI08rFC>)Jra4e3)@i+k|;v}4mVK@b+;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ1efA6 zT#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T1drk| zJdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN1fSwF ze2y>hCBDMf_y*tNJA98H@FRZ0&-ewu;y3(`Kkz61!r%A@|KdNiQqX^N&_xdgN>r%P zppOBDU^onq5iqPvSmPt$k%beM|KAn1wB3Y5GC{|P}nibuO zVa2p!S+T7+R$MEd72ir=CA1P*iLE47QY)F2{J(QZX{EAKTWPGcRyr%amBGqrWwJ6` zS*)y9HY>Z8!^&ypvT|E_th`n}E5B91Drgn53R^|2qE<1hxK+X`X_c}{TV<@WRynJ@ zRl%xgRkA8uRjjI3HLJQ+!>Vc3vT9p(th!b`tG?C1YG^gG8e2`QrdBhnxz)l7wOU%O ztkzZ=tF6_}YHxM0I$E8q&Q=$ztJTfwZuPKwTD`2^Rv)Xc)z9j04X_4UgRH^U5NoJ4 z%o=Wuutr*=tkKpOYpgZS8gEUoCR&rM$yS&(#hPkOv!+`!teMsX&t+Uo!8?24iCTp{`#oB6Zv$k71tew^_Yqzz> z+H38z_FD(6gVrJIuyw>bY8|tVTPLiO)+y_>b;detowLqc7p#lcCF`$Y{rx@+CD?pqJ6ht?zOvGv4yYCW@_TQ97a)+_6^^~QQ@y|dn1AFPkoC+oBI z#rkS}v%XtDte@5|>$mmC`fL6B@A_@qv0dAFSD21E9{l_DtooP#$Ic$v)9`j?2Yy&d$Ya8-fC~Nx7$1H zo%Sw!x4p;SYwxr7+Xw7}_96SQeZ)R$AG43!C+w5@*Uj&Xb^a6+7LPIxDR z6VZv}M0TP$QJrW`bSH)r(~0H8cH%g3op?@sCxMgDN#rDUk~m47WKMD?g_F`r<)n7f zIBA`9PI@PUlhMiKWOlMRS)FW7b|;6E)5+!JcJerRoqSGyr+`z?DdZG(ia14`Voq_V zgj3Qf<&<{HIAxu3PI;$-Q_-p9RCcO3Rh?>1b*F|?)2ZducIr5FoqA4vr-9SZY2-9^ znmA3JW=?abg%j$ubXqyBoiELv9Iys%4E>2gco73It;q-KRIlY}ePG6^= z)885340HxLgPkGHP-mDk+!^7FbVfO&oiWZ>XPh(Mncz%xCOMOxFlUN0)tTl@cV;*< zomtLoXO1)1ndi)R7B~x?Mb2VpiL=yM<}7zsI4hl1&T40kv({PXtammz8=XzgW@n4D z)!F83cXl{Con6juXOFYj+2`zc4mby$L(XC6h;!6A<{WoUI47M`&S~e2bJjWMoOdob z7oAJaW#@`>)w$+ecWyX0om9ykx3N6usCiSyKX<~(;^I4_-7&THq5 z^VWIiymvl0ADvImXXlIa)%oUpcYZiOonOvx=a2K(`S;(~Z@Z4`x}Gas=_*&d#`WF6 z4ROP{;oS&sL^qNf*^S~xb)&h_-5736HD>%&MmLk2+0Ei+b+ftI-5hRCHe~p>$&ya z25v*Qk=xj9;x=`gxy{`cZm8SRZRNIh+qiArc5ZvOgWJ*ViFsc89n_-C^!t2pS#~Z;2v}jxrf~&?os!cd)z(Yo^(&Sr`WybU(SD-7oG}_nZ6O{o(#}f4RTiKki@m-+$q^J;!rB&l8^X zl&3x8`Cj0Kc;US8UIZ_q7s-q4Me(9~(Y)wh3@@e^%Zu&B@#1>%y!c)MFQJ#nOY9}_ zl6uL!7P 
zub@}RE9@2Vih9Mo;$8`_q*ux-?UnJ$dgZ+GUInkBSIMjFRq?8N)x7Fn4X>tG%d73x z@#=c@y!u`Puc6n-YwR`gntIK==3Wag)NARr@>+XsytZCDuf5m7>*#gzI(uEbu3k5< zyVt|(>Gkq@dwsmVUO%tDH^3X{4e|ziL%gBhFmJdw!W-$0@~4bo9`{~7J7@k#oiKcskh8q?yc}vdaJzE-WqSM zx6WJdZSXdFo4n257H_M!&D-wn@OFB;yxra&Z?Ct{+wUFl4tj^Y!`>0^sCUde?w#;X zdZ)b8-Wl(#cg{QSUGOe?m%Pi~74NEd&Aaa1@NRmyyxZO#@2+>xyYD^l9(s?w$KDg~ zsrSr#?!E9{dau0K-W%_&_s)CoeegbdpS;iB7w@b0&HL{C@P2x~yx-m*@2~gIvxF@i z;R;U(A%zlJ7~zXRgotn=yoewoibx`|h$5njXd=3ZA!3SHBDRPl;)-}8zDOVvibNu@ zNFtJoWFompAySG|BDF{((u#BR@%8GKLyr>{5ib|rgs3NM0YNEQRA!>?RqPD0b>WX@z zzGxsCibkTbXd;@5W}>-hAwoq<(Mq%yZA4qqPP7*tL`Ts{bQWDiSJ6#$7d=Ez(M$9e zeMDc;PxKc9#6U4f3>HJgP%%sl7bC<-F-nXUW5if7PK*~5#6&SkOcr5cikK>t7*aiR0pgI4Mqv)8dRcE6$1Y;)1v+E{V(H zinuDSiRT59*M`|iFhiWiRa>lcqv|q*W!(ME8dCs;)D1o zK8erbi})(OiSOcv_$hvg-{Oz>EB*;f+R~A(^rVndDygNBz6@lD3@5|O2r{CKBqPfx zGOCOwqstgFri>+H%Q!Nwj3?vE1TvvaBooUdGO0`^lgkt`rA#GL%QP~rOefRJ3^Jq4 zBs0q_GONrcv&$SZr_3dD%RDl#%qR290p%7l!N48IYbVX!{l%|LXMQ9 zy*T}VUom?+B$c=K7+$^`qt#X^(E_cYCa+lmK_sG3+pWH7G$b<5bJS>mM zqw<(ME>Fml@{~L+&&ad#oIEct$cyrlyezNCtMZz>E^o-2@|L_U@5sCIp1dy~$cOTg zd@P^Hr}CM6E?>x(@|Aoo-^jP}oqR7p$dB@q{4Br7ukxGxE`P|M@|XNA|H!}cpR|;% z9OWuc2_=@ilJhvSSq%PqvEP~D!xjf5~@Th zu}Y$ns$?p;N}*D!R4TPfqtdE$D!s~}GOA1}v&y2fs%$E|%AsWDh3j;Z77ggU8CsnhC=I;+m9^Xh`Ss4l6? z>WaFmuBq$lhPtV4soUy~x~uM~`|5#us2-`u>WO-)o~h^Rg?g!8sn_a_daK^4_v(ZC zs6MIB>WliSzNzo(hx)00so(04`m6paOWWGfuJ*LhQY)>s(Y_9Jhz_U2>j*lcj-(^& zC_1W+rlac^I;M`LW9v9Nu8ybU>jXNXPNWm-Bs!^1rjzRwI;BpfQ|mN3txl)Y>kK-h z&ZINzEIO;srnBoDI;YO1bL%`hug<6Q>jJuk7J}uB0pLD!Qt!rmO23x~8tBYwJ3?uCAx+>jt`^ZloLQCc3F^rkm>)I#jpRt#oVM zMz_`NbbH-FchsG9XWd12)!lS=-9z`(y>xHgNB7nJbbmcS57dM7U_C?+)x-2~JwlJv zqx5J!Mvv9w^msi%Pt=q2WF4lb=&5>|o~~!;nR=F@^udY+!I7wCn0kzTBq=%sp@ zUanW@m3oz4t=H(adYxXcH|ULelisYi=&gF2-mZ7(oqCtvt@r4?dY|5}59ovXkUp%B z=%f0WKCVybllqiCt}{t>5Uk`kj8SKj@G8lm4u~=&$;l{;q%MpZb^nt^er1`k%Ip zZ5-no&j=%pGTIp9o4|yaa3;KoU?Q4GCbEfQqMB$Xx`|<8nph^biDTlLcqYC{U=o@{ zCb3CklA2^Dxk+JCnp7sWNn_HQbSAyYU^1FaCbP+6vYKoryUAg4np`Hg$z$@Gd?vps zU<#T-rm!huikf1kxG7;uno_2;DPzi-a;ChgU@DqQrn0GGs+wx1x~XAmnp&o|sblJz zdZxZcf6rm<;anwn;&xoKfSO-s|tv^H%_Thq?8Hyunz)5&x;T})Tg&2%?COi$Cx z^frA=U(?U@Hv`N-Gsp}!L(EV!%nUap%t$lJj5cG;SToLyHxtZ6Gs#RgVP=Y%YNna# zW`>z*W|`S$j+txbnfYdcS!foS#b$|FYL=PhW`$X4R+-gijah5fne}Fa*=RPI&1Q?) 
zYPOl}W{25ncA4E~kJ)SXnf>N~IcN@?iS)`pNv{ehNROpUO||r}5MJ z>HPG520x>p$=*Hi`o;X>ehI&% zU&=4-m+{N`<^1w~1;3(S$*=5J@vHjP{OW!UzouWyukF|I>-zQl`hEkyq2I`F>^Je7 z`px|2ehWX;Z|S%4Tl;PNwthRmz2Cv_=y&ov`(6C5emB3n-^1_e_wsxDef++DKfk{} zz#r%j@(24v{Gt9Zf4D!wAL)cUA_kFy$U&4KY7i}m9>fS@2C;(JL7X6N5HE-yBnT1)iGsvIk|1f2EJz-t2vP>A zg498pAZ?H?NFQVfG6tD~%t4kQYmhC-9^?pe2DyUVL7pIQkT1v|6bK3ig@VFCk)UW$ zEGQn72ucQ}g3>{mplnbsC?8Y^Dh8E;%0ZQ&YEUhx9@GeG2DO6PL7kv(P%o$-Gzc07 zje^EOlb~tPENC9I2ttFFL93v3&?aabvSRJeh)&}c>^}&Wu8 zVnT7Dgiul_C6pG*2xWzGLV2NrP*JEPR2HfTRfTFob)kk(Q>Z1>7U~Ffg?d7Lp@Gm) zXe2Zing~sWW^6PCJB>;DZ*4?nlN3M zA0&h3Acqi!d>B>a9?;JJQN-YkA)|~Q{kELTzDb86kZ9h zg*U=m;hpea_#k`~J_(T;Ks81b~AOL?8yF04YH#kQ$@` zX+b)W9%KL+K_-wHWC2-0Hjo|U069S}kQ?Lyc|ksq9~1xuK_O5W6ahs+F;EHlQtN2ik)Upd;u6I)g5tE9eHggC3wK=mmO%KAREYgArgP7zIXyF<>kh2gZX5U?P|VCW9$pDwqbQgBf5Zm<48oIbbfB2j+tXU?Erp z7K0^VDOd)UgB4&USOr#tHDE1R2iAiPU?bQBHiIo-E7%6MgB@Te*adcjJzy``2lj&l z;2<~z4ud1$C^!a=gA?E+I0a6FGvF*Z2hM{F;3BvLE`uxJD!2x&gB#!`xCL&5JK!$3 z2kwIh;30Sf9)l;~DR>5+gBRc>cm-aAH{dOJ2i}7Z;3N11K7%jdEBFS!gCF20_yvB0 zKj1I;2jalEFdmE#6TpNp5ljq|z@#u4Ob!JIAcP2t5JL&dP=N%hP=gfe(0~k@(1JE} zpbI_d!vJy^!U)E|6fh-B1yjQ`FfB|6)58ofBg_Od!z?f>%m%Z=955%$1#`nZFfYsp z^TPtLAS?t6!y>RKEC!3i60jsJ1xv#+uq-SG%fkw=BCG@}!z!>UtOl#Y8n7m;1#81P zur90z>%#`HA#4O2!zQpPYzCXdSl9x#gsos}*ao(R?O=P@0d|C)U}x9`c7@$wci02= zguP&I*a!B7{a}AM01kwM;9xie4u!+ua5w^vgrneSI0lY|)?900d9nw;AXf5 zZiU<6cDMuXguCEwxCicq``~_f03L*g;9+rcn98v_uzf_06v6|;A8j%K84TVbNB+jgs929-tSPW+G#o~Retr zm1q@Ojn<&GXdPOQHlU4Y6WWZnpsi>d+KzUhooE-@jrO3uXdl{-4xoeR5IT&Gprhy* zI*v}DljsyWjn1I6=o~taE})C(61t48psVN_x{hw3o9Gt0jqaek=pMR{9-xQl5qgZC zpr_~=dX8S8m*^FGjozTQ=pA~GKA?~26Z(w4ps(l~`i_2}pXe9*jsBp&=pTwB#uej< z@x=sULNSq;SWF@&6_bg{ML`526p<*3Sd>IrR74`Gq9#&N7Y&h#rf7+_=!mZ9iM|+! zTnxoXj1g0aDaBM`YB7zNR!k?R7c+<%#Y|#mF^ia0%qC_RbBH;`Tw-o9kC<1?C*~Im zhy}$$Vqvj}SX3+~78gs1CB;%=X|ar0RxBr$7b}Pr#Y$pjv5Ht#tR_|$Ylt<)T4HUn zj#yW$C)O7mhz-R?Vq>w1*i>vLHWy>X7Gg`WmDpNrBeoUWiS5M>Vn?x)*jel%b``sc z-NhbaPqCNSTkIqD75j<(#R1|#agaDz93l=Chl#_*5#mU3lsH-(BaRiviQ~ly;zV(h zI9Z$`P8Fw#)5RI$OmUVtTbv`#73Ycb#RcL*agn%KTp}(Nmx;^872-;9mAG15Bd!(K ziR;A;;zn_kxLMpHZWXtQ+r=H?PH~sGTihe=759nz#RK9&@sM~}JR%+ykBP^{6XHqn zlz3V^Bc2t{iRZ-&;zjY2cv-w6UKOv2*Toy+P4Sj^Tf8IQ74M1n#RuX;@sapgd?G#- zpNY@K7vf9tmH1kGBfb^iiSNY^;z#k5_*wiSeigro-^Cx|Pw|)dTl^#b75|BGa9kV@ z$Hxh9LYxRE#z}BeoD3(&0tOgjghh<8gk`K?f>o?xigj#YhD~f?8#~y=9`Mh>6X(LYaUPr(=fnAN0bCFl!i8}W zTof0>#c>H-5|_fIaT#0|m&4_81zZtV!j*9qToqTt)o~466W7AEaUEP2*TeO31Kbcd z!i{kg+!Qy%&2cPlfm`BMxHWEr+v0Y(J??-z;!e0T?t;7GZn!({fqUX!xHs;D`{I7M zKOTSw;z4*Y9)gGBVR$$mfk)y|cr+e^$Kr8#Jf46j;z@Wio`R?1X?QxGfoI}bcs8Dc z=i+&IK3;$q;zf8dUV@k6Wq3JWfmh;Hcr{*w*Wz_}J>Gyf;!Sun-h#K{ZFoE0fp_9v zcsJgI_u_qcKR$pD;zRf_K7xa@fq&v(_&5H8|Kfi*jucmlC&iZ%NC~AxQer8IlvGM4C6@#VNKitOC}ByGWJ!^T zq)M7ZC0#NkCYh2Y*^(one zRg@}8m8B|DRjHa(U8*6~lxj(}r8-hwsh(6{Y9KX~8cB_%CQ?(Wnbcg0m0CzGrB+gF zsg2ZDYA3aqI!GO*PEu#7i_}%>CUuv3NIj)qQg5k`)K}^!^_K=n1EoRIU}=anR2n7? 
zmqthh8YhjHCP)*dNz!C#iZoT4CQX-SNHe8b(rjstG*_A@&6gHP3#CQU zVrhxAR9Yr2msUtCrB%{uX^pg2S|_cSHb@(#P10s*i?mhRCT*8?NIRun(r#&wv{%|E z?UxQn2c<*OVd;o;R5~Udmrh70rBl*r>5Oz%Iwzf%E=U)pOVVZOigZ=FCS8|qNH?Wh z(rxLEbXU43-IpFn52Z)aW9fomtIIOrB~8x>5cSOdMCY?K1d&>Pts@Ui}Y3c zCViKFNI#`t(r@XH^jG>P#gXI6@#Oe&0y&|aNKPy#k(0{F*T9E7z0j%MIj)awEC1+(d3FH|?uW zUMw$>m&(iJg zOTI1Nk?+d)Sf04h+ z-{kM|5BaD3Oa3kYk^jp7GZnC54hwNu{J#(kN+_bV_<9gOX9nq-0jI zC|Q+kN_HiOl2gg0Kebq*PX_C{>keN_C}%Qd6m=)K=;!b(MNbeWiiYP-&zzR+=bHm1atFC01#nv{YIt zt(7)PTcw@SUg@B8R5~f0l`cwGrJK@S>7n#gdMUk?K1yGupVD6$pbS(7DT9?E%1~vP zGF%y>E^Ub&!LR4yr(l`G0s<(hI`xuM)t zZYj5wJIY<r0A!SK9Ql3;G6-gyhnN%TF zNi|ZP)F3rUEmE7*A$3VTQlB&+4M`)?m^2|xNi))%#F7@IC22)klQyI+X-C?V4x}UL zL^_i$q$}x0x|1HHC+S6clRl&`=|}pL0c0Q7}JWGPujmXj4^C0RvQlQm>5 zSx45B4P+zPL^hKxWGmT5wv!!XC)q`IlRacF*+=%11LPn%L=KZ9m!lRM-txkv7k2jn4nL>`kTqrS3@;YW7HIC zN;Q?5T1}&-Rnw{I)eLGzHItfI&7x*iv#HtD9BNKAmzrD6qvlofsrl6cYC*M-T39Wj z7FCO>#nlpONwt((S}miNRm-X6)e34wwUSy{t)f;{tEtu18fs0omReh_qt;dHsrA(c zYD2Y=+E{I(HdULc&DB`7h1ybWrM6bvsBP7DYJ0VV+EMMKc2>KnUDa-CceRJwQ|+bp zR{N-Z)qZM!b$~ih9i$Fchp0oUed6I#HdZPFAO=Q`Kqe zbajS0Q=O&GR_Ca5)p_cCb%DB2U8F8nm#9nCW$JQug}PE*rLI=jsB6`A>Uwp9x>4Pv zZdSLbTh(pqc6EokQ{AQRR`;lT)qU!I^?-U%J)|C1kElo0W9o7BgnCjvrJh#LsAtu4 z>Us5odQrWkURJNDSJi9kb@hgNQ@y3$R_~~H)qCoF^?~|OeWX5CpQumOXXU;Hr`ceI)epbJzU)68wclC$*Q~jm>R{yAf)qiRnEv^<%i?1cn5^9OG z#99(9sg_Jjt_d2@poTP2!t+du!8?CL@PHV4q&^l_Jw9Z-=t*h2e>#p_CdTPD2 z-dZ26uhviNuMN-!YJ;@F+7NB1HcT6?jnGDFqqNc57;UUJP8+XH&?ah=w8`2OZK^g+ zo372!W@@vv+1ea!t~O7buPx9PYKyeR+7fN4woF^Dty@aermt8-`XGTul7%iL*vqTG(Js06VgO9 zF-<~~(quF_6)2#PA}Uf$B`Q;e5~@;-QmRvfGHOzb+SH*g^{7t+%4tX=8bedilr$Ah zP1De{G#yP(Gti7Q6U|Js(5y5Y%}#UBoHQ5BP4m#aG#|}R3($hJ5G_oL(4w>$Elx|& zlC%^pP0P@-v>Yu@E6|Fx60JvTb8_C589LVqP=M!+L!jD{pkQYkPf1Q z=@2@U4x_{A2s)CEqNC{;I+l*3E~Cro3c8Z6qO0i|x|Xh^>*)r%k#3@!=@z<`Zll}j4!V=>qPyuHx|i;w z`{@CCkRGCk=@ELA9;3(U33`&AqNnK@dX}D}=jjD{kzS&g=@ojFUZdCP4SJK_qPOWC zdY9g#_vr)rkUpZ1=@a^tKBLd+3;L42qOa*2`j)<<@978nk$$3|=@8n)(bMYb^z?cLJ)@pU&#Y(Bv+CLO?0OD8 zr=CmCt>@A6>iP8idI7zlUPv#j7txFA#q{EO3B9CVN-wRK(aY-P^zwQIy`o-8udG+m ztLoMC>Us^mrd~_0t=G}(>h<*cdIPuvP5dON+n z-a+rEchWoSUG%PcH@&;wL+`2g(tGQD^uBsOy}v#{AE*z~2kS%hq53d=xIRK3sgKe} z>tpn>`Z#^OK0%+TPtqsrQ}n6&G<~{0L!YV7(r4>)^tt*xeZIaxU#KtA7wb#(rTQ{` zxxPYQsjt#k>udD2`Z|5RzCqupZ_+pGTlB5^HhsIkL*J?I(s%27^u78%eZPJ{Kd2wl z59>$tqxv!ZxPC%Esh`qM>u2<{`Z@i)enG#eU(zq@SM;m;HT}ANL%*rt(r@c`^t<{! z{l5M{f2cpwAL~!_r}{Jfx&A_bslU=+>u>b8`aAu-{z3n!f6_neU-Yl~H~qW*L;tD& z(tqoJ^uPK)J&qCAh-bt%5*P`ML`GsGiILPuW+XQR0~pXihG<|zGGs$Bh@l#qK@Ht7 z3}%>yW!Q#exQ1u=MqqFwG$JF$NMWQjQW>d@G)7t@osr(iU}Q8h8JUeNMph%6k=@8) z_xs5zVUL&88-zZ=dGzuAojUq--qnJ_LC}ET|N*Se%GDcaWoKfDWU{o|J8I_GH zMpdJlQQfFv)HG@twT(JPU89~+-)LYoG#VL=jV4A@qnXj%h&5UmEsa)2Yom?P)@Wz6 zH#!&{jZQ{qql?ki=w@^`dKf*8UPf=DkI~oYXY@A)7z2$##$aQJG1M4l3^zs?BaKnU zXk&~q));4uHzpVpjY-C2V~R1=m}X2jW*9S#S;lN*jxpDmXUsPi7z>R>#$scMvD8>* zEH_pdD~(mgYGaMD)>vn(H#Qg>jZMa8V~erX*k)`ub{IR2UB+%>kFnR-XY4l)7zd3* z#$n@#anv|w95+rFCyi6aY2%D>);MRJH!c_#jZ4O5TgGkUj&awx zXWTa)7!Qp{#$)4&@zi)`JU3n#FO65mYvYaa)_7;UH$E62jZemBC143zB9@pXVM$psmYfL;Fvt)S8Dr{KR*h9>HCRnn zi`8azSY1|+)n^S@L)M5jW=&XA){Hf0v8)Aa$y%}2tPN|++OhVm1MA2-vCgau>&m*Z z?yLvv$$GKgtPktU`mz3O02{~#vB7Kz8_I^U;cNsO$wsl!Yz!OA#1+m@$!4+HYz~{t=CS!~0b9rxvBhi&TgsNP*0J?$1KY?p zvCV7?+sd}F?Q93z$#${bY!BPZ_Obo!06WMIvBT^LJIao++A-*$!@XR><+uj?y>vq0ei?EvB&HQd&-`%=j;W0$zHM7><|0P{;@b_Tr-{--%Ma8G!vPL%_L@0Gntv( z6ii@36PcokP05r^#U!R`Y9=*x(=eH7nwDvsj_I17>6?Mc&Cra@7&C>L(oAKhHq)4C z&2(mZGlQAY%w%RZvzS@UY-V;dhndsNW#%^Xn0d{7W`47PS zP0eOzb2HX#VYW0|nXSz>W?QqJ+1~76b~HPgoy{(0SF@Yh-RxoZG<%u7%|2#dv!B`D z9AFMK2bqJ-A?8qXm^s`WVU9FMnWN1y=2&x_Io_OLPBbT(lg%mSRCAg+-JD_0G-sK! 
z%{k^=bDlZhTwpFV7nzIACFW9dnYr9tVXib+nXAn;=2~-|x!&AhZZtQUo6RleR&$%V z-P~dBGwXE7!9jmTY&#G@Vuo_y8tj1Oo ztEtt@YHr0^Ev%MSE338D#%gP|v)Wr7td3SEtFzU`>S}efx?4T0o>nibx7Ek$YxT4G zTLY|t)*x%JHN+Zf4YP(@Bdn3uC~LGe#u{slv&LH!tclhnYqB-Pnrcn6rdud1##(Ewv({T1tc}(tYqPb*+G=gH zwp%-_oz^aEx3$OGYwff4TL-Lz)*#}vlx@ujsu3I;(o7OGswspt4Yu&T%TMw*<)+6h&^~8E=J+q!$FRYi=E9$CO6`f7c%zFR-6pVlwyxAn*RYyGq0*m3Q6c6>X5ozPBXC$^K=N$q5I za$B&04Q*tLHnt^OwiTP$s;$}7)@{RPwrN|oZ9BGWd$w-}Hn&4NvSaKNc1k;yo!U-g zr?u1B>Fo@5Mmv+8+0J5TwX@mT?HqPaJC~i?&SU4b^V#|B0(L>WkX_g=Vi&cG*~RS= zc1gRGUD_^Vm$l2;)G|~26jWck=@vC zVmGy$+0E@(yM^7-Ze_Q&+t_XGc6NKagWb{YWOuf^*j?>zc6Ymn-P7)6_qO}keeHgB ze|vyE&>mzDwujh5?P2zCdxSmG9%YZV$Jk@-arSt7f<4imWKXuI*i-Fk_H=uOJ=30L z&$j2-bM1Nde0zbt&|YLOwwKsT?Pd0IdxgEyUS+Sg*Vt?Ab@qCDgT2w-WN)^&*jw#w z_I7)Rz0=-h@3!~Yd+mMpe*1uZ&^}}zwvX6H?PK4x zWM8(g*jMdq_I3M)ebc^W-?s1AckO%jefxp^(0*h;wx8Hf?PvCL`-T0|er3P5-`H>M zclLYxgZ89nbNdz~N5lL{5y8!b$0*a#A~K zoU~3lC%u!w$>?NqGCNtEtWGv3yOYDo>Ev>9J9(VEPCh5UQ@|Lic>C|#+J9V78PCci-)4*xyG;$g{ zO`N7qGpD%|>$GrMI<1`6P8+AK)6Qw{bZ|O4ot(~27pJS!&FSv+aC$nuoZe0!r?1n` z>F*4120DYB!Ojq8s58tN?u>9oI-{J?&KPH`GtL?BOmHSTlbp%U6lbb4&6)1ZaArEQ zoY~GCXRb5PneQxc7CMWZ#m*9Esk6*k?yPWDI;))3&KhT}v(8!XY;ZO@o1D$g7H6xo z&DrkkaCSPooZZeIXRou*+3y^14myXN!_E=ssB_FY?woK=I;Wh|&Kc*dbIv*MTyQQr zmz>Ma73ZpR&AIN}aBe!coZHSF=dN?lx$iu19y*Vl$IcVysq@Tv?!0hbIZe_QMTh*=RR(ET-HQicnZMTkF*RAK)cN@43-9~O> zw~5=-ZRR$2W8D^ROShHV+HK>ub=$e^-41R?x0Bo1?c#QIySd%n9&S&!m)qOz-QsR_ zx4GNh9qvwdm%H2DF$~-4E_Z_mlhC{o;Ogzq#MtAMQ{0m;2lO#mnkt^Rjz6yqsPxFSnP+%j@Oy@_Plmf?gr7uvf$@>J{^f zdnLS*UMa7%SH>&rmGjDb6}*aGC9kqq#jEO7^QwC_yqaDuueMjmtLxSC>U#~mhF&AD zvDd_F>NWG4d$C>%ucg<@Ywfl1+IsE0_Ff0Cqu0sn>~-`n2edegk=-VAT1 zH_Myt&GF`X^St@q0&k(W$Xo0!@s@hayye~sZ>6`&TkWm!)_Uu__1*?=qqoW1>}~P3 zdfUA1-VSf4x69k@?eX?{`@H?$0q>x9$UE#E@s4`OyyM;p@1%FiJMEqE&U)v(^WFvT zqIb!=>|OD$de^+`-VN`jcgwr&-SO^v_q_Yw1Mi{t$b0NP@t%6myyxBv@1^(3d+ojP z-g@u6_udEZqxZ@C?0xaRdf&Y7-Vg7m_sje3{qg>K{{@)&as7CHd_RGo&`;zi_LKNY z{bYV}U+{qsedLQi_9b8T6`%O3uldy1eZyzI>07?-JHG3CzV8P<_d`GOWBe3;NJ z-`H>BH}#wO&HY%vh2PR|<+t|R_-*}metW-z-_h^nclNvZUHxu;cfW_<)9>Z?_WSsK z{eFIbe}F&GALI}AhxkMNVg7J`gg??B<&XBq_+$NX{&;_aKhdA$PxhzyQ~hcFbbp3F z)1T$f_UHI>{dxX;e}TWyU*s?Lm-tKlW&U!1g}>5Y<*)YF_-p-j{(66dztP|1Z}zwN zTm5bRc7KPz)8FOq_V@UE{eAv^|A2qcKja_wkN8LZWBzgfgn!aM<)8M?_-Fld{(1j` zf6>3>U-qx~SN&`Lb^nHc)4%24_V4(2{d@j>|AGI|f8;;*pZHJxXZ~~lh5yoj<-hjd z_;3Ap{(Jv}|Iz>CfA+ulU;S_XcmId~)BolF_W$^Q{r>`Ng1AAvAbyY_NEjpv5(i0w zq(QPEc_0KJfB_1`00&Ya2TDK!HP8YY=z$TizznRw4xGRZyuc5FfCpg^1u;R2AZ3s$ zNFAgJ(gx{*^g)IoV~{Dx9ApWy2HAq_L5?72kSoX?@ju$_C|v@v1 zC}Fd`Tkj0#2vV}h~4xL|xRA($9U3ML0rf~mo@V0thk zm>J9pW(RYExxu_(ey|`|7%U1F2TOvb!Lnd^up(F)tO`~KYl5}Gx?p{VSIORGwIO8U_xXm5za*z8w;GBm%;xRl0Psvm9)I1GO z%hU1nJOj_jGx5wk3(v~4@$5VY&&hM~+&mA@%k%O4yZ|rA3-Q9d2rtTu@#4G$FUd>s z(!2~W%ggcdyaKPtEAh&_3a`qm@#?$=ugPoi+Pn_0%j@y_ya8{>8}Y`x32(}q@#Z|1 zx8N;#E8d#7;ca<4-kx{h9eF3-nRnq`c{kpj_uxHwFW#H?;eB~O-k%TP1Nk66m=EDY z`7l15kKiNuC_b8x;bZwYKAunD6Zs@QnNQ(U`7}PA&)_rpEIymh;dA*sKA$h(3;80x zm@nZ=`7*wouiz{BD!!Vp;cNLizMgO38~G-_nQ!4+`8K|t@8CQ6F20-Z;d}W$zMmi9 z2l*j>m>=Os`7wT+pWr9?DSn!t;b-|dex6_87x^WAnP1^o`89r>-{3d-Eq0l;cxjn{+@r}ANeQ#nSbG5`8WQZ|KLCQFaDeV z;eYu*9w&?&#tY+z3BrV7qA+ooBupA63zLUJ2tpX5Pz-S>g>tBbBveB!q@f-fAq&mW z3hmGd-OvmDFbH`VhEW(3rU+ApslwD@nlNpcE=(V02s4J6!pvcoFl(4C%pT?lbB4LX z++m(DZ ztRB_~YlgML+F_lrZdfm@A2tXZhK<6;VUw_F*eq-w#)d7zmSL-~b=W3s8@3DEhaJL> zVW+Tj*d^>5b_=_QJ;I)0udsL6C+r*c3;Txy!hzwSaBw&z92yP_hleA=k>RLtbT}p) z8;%RdhZDky;iPbKI3=7KP79}pGs2nStZ;TXC!8D33+IOm!iC|YaB;XKTpBJ5mxn9D zmEo#zb+{&68?Fo2ha19;;ihnNxFy^gZVR`EJHnmeu5fp_C)^wE3-^Zy!h_+V@Njq} zJQ^MgkB2A1li{iGba*B_8=ec#hZn+&;id3$cqP0VUJI{>H^Q6Yt?+huC%hZp3-5;y 
z!iV9b@NxJgd>TFrpNB8Pm*K1Mb@(QH8@>zQhabX^;ivF(_$B-reha^cKf<5kukd&H zC;S`!3*$s_qj*vLC_$7kN)#oIl0-?PWKr@+h(H7*6p0azq)3jGh(v0nMKsbQBVv&m zS&g?N)x4x(naZ`3{l1?Q<_= zMb)DkQO&4UR6D8@)s5;!^`iz+!>CczIBF6#jhaQxqu8iL)G}%nwT{|EZKHNk`=~?I zG3pd`j=Dr$qi#|6s7KT@>J{~l`b2%Beo_BuKr}EK6b+7sL_?!t(eP+QG%^|$jgH1d zW214=_-H~jF`5)jj;2IYqiNCfXht+Mnib8C=0tO&dC~l6L9{Sh6fKUHL`$P((eh|T zv@%*1t&Y}2Yom41`e;M6G1?Suj>osP~#XQOk``RGD)F}f68j;=&kqifOi=tguix)t4y?nHN^d(r*q zLG&RS2Nf?tTCUH!Xn4~euVv@%QF(3xUpqT%$mg~}}OUqsfzvJJ((s4$|FIu_Ee|nWB z|IfSFCI7EP?DCWYCjX}&yQ*~D*wrIq*YuejyDoNP>A0o;^NRbw>sv;2Y4pDw2^zKQ z9Q(h34K@``G3+yL+2KF>|DOl{3pF}f!A>n2rmxbjRgZR)%Z~5+KWL@?AG9uA()}L+ C__U?~ diff --git a/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle b/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_2.7.14.pickle deleted file mode 100644 index 963e533c4d2b4bc670b33e10422c5d9b2ac658d0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 132857 zcmc$^1$5j>%rI((F|)m8PMMh*(x$lOHcisBn>0{wlJb_BnVFfHnVFfHnc;aiX>WVs z|Gx9igUvaXN2Ae58c8!6rzhgG;UG6271uhllkGInAUD=JEHTo?BHD+=wGK9}}rY*C)l#j4aWt;N#Ow- zgxQJ*_c%hTK^`i^Lx*^jh`6NK_MMYW5}Op07!~IAI)XX;BI4r`61_ zK(DO_L;c)vxHG>I3(b_lGXV9X7d3JD3R@x%EA9>I{$ zu~J-uGt~J@?4KAu$Xy45_CLH#oRc9)>k{ww!`2Ko*ALfBo%q@hkvzG}pLby09lJ*O z;mkS>ulwPPTaR;HaA28)9$o!#7^WV+ASXIH$A!g4+OCkK{`Krso@AKXjYh>K+G&0+ z%OB1V9+nW9><209pY~tL9Gf|783C%tB^D`^LYdBIrw{Td9?kQYaWW(u$K%)|>Zd(2 z1_h)4v`2~spui7K1`v`0(G-ZKKs*H!DUeKoR0^b1U}AWR&Y9dqdq?2-)^=vkziX7- z4eA7_QZ&lqv$F?Z! zQ;d_*S$ix+RXha}DUeKoR0^b1AV`5s3S?6tmjZujo-4^srUaaw+h^wqa+6IW!@Q2z zcHW;$`}e?1ApmwhN3Z-oyFfs%6gB?Pub>-`^;^L%6wErRI#DLn3`SNNai z@lyf2qBBt?pIzCRXrLQUNDT8P+ExBcV!*EICX$`BUp>2;8;x|9qq@gI3cTNS9b^r^ zc)>xQDxPYd>Yf@&jzoR|lXG58Cs{^zN(>G6+oqPsuUd%DuKkBV|)KwHxyb-PmE)#Ai1R$cTkT z#YNc79F52njhZ{^dP;=+TwkeTQbJ;U>`(LiSGmQ1@&5-Yuv8p67>a|GsC( z2E%U5{A<5uBB_V1hD?Q}Dq)6@u_E3l6FrPjA9}GwQ$&gJEyFz5!s5pD% zf3f|Y3fQ9@=A(V~m;m!X^`ZR%7}_AoS*)=he;AJQ+2aFrzX#w1|58nKaxCmmw6Nb8 zh~xEGAoET7FCmgM-eiaA6rVlyADT`3n`ZSRW21g8_VoW^`a2b{XE@Af`s`T&=8lZB z-K^to&gn2T;Pv*LU^BtvJbo@8<5&0R<}>%d=J<^W*z=qz=KJgg{wds;KT+5V11@1N z3i9~hN(5s`{>RTbKtPG)1zP-HQ~X5(>?O_&OMUjTe^_YwpB5sjG$~x5a6x;;e=$w& z2lh&b;VPfK`lnUa1bO-gla7ppo9+%RHF#Qc*E*jpUtTYdJnpUk&A zD@Oa5js9s;+JERs=DOp*xc(vm_D+Z4E}y+Sg<*ipA19MNoats26Q^rX)|m7?Qc z=h4%DxLd7{{`%xhlIN(;KI;^T;qlH%?wrS&yFl@R#qt#`^4HY)K!E9apMBvUOfUYy zG@^ay#4aGz(cw~%8IYi&1NAbnUpS_C z>9b!sXZO&Aq;RLiv|qcq}q^Or%Jtu2dKDm*?cv&{(rCc(kQlOXu)f8x^z@QYEqg-Ou9v`8=@2AkoVVNv72pBEd0AAP;$kFt- zBG-!qWQ2o}fap<$oFlChS6=l0BzmAyWt}OVLhW~6*e_nu{_%s{l)tw17j=owWrG)Y zaQ*v-hqdq$&YLH#=cCU{`iJvC6g4c9jJx)}3E$7AObve%<&uh1w5)%G2n-Uk^ zRDRO89>>$Ysr_UvylMOx|0R$&ZD5@L9>`xpdDA&fpErE~`B#nT&ETj1t@`q2bP&$z zHu)yWn<+q_a+&nUT4wh5I4&&&{Nm?*kT**J{lB^r>I6Y2(jF@RPJ*DcD_e~8!`Sk# zulwQF)BSV5cOcU`RX0EUXk2Rh!GTp5#jf|m@g)%aqXX|98DHNI3$@5O+Yd8HkA{76 zVEfs=tA4nnpq%rw1NnXPJNsdah9wVvmVRCzc(XdfCWo~*oAc^8UXUN7d47ehH+w)I zf8<8{qjiw`|AnxO{+F=JkuopR?`NnuP6Yi+Oy&Fs;U6)T%jr&zsoc(6vS;M+4+-25 z20SCL)AV`s1(3gaMt(p2@19Y>LHy+z1q1Z|<{5?jJps>%_Itx1_y7M3UD%N$Wuc2W zuTJ79>VW9vgi*|S4T|=sifDiIL?>s7;?AqjTO!cs&kfOkB!ZHDj6VTH`yC`X2Soop z?)`Tl;fV<;5njqq?T`8BCY2vN}fpOT*1%tzjn<(_DgTYWNwxGd|G%b2iX1lbzk>rI~2*zT+=_`FK4dhG=1KX0P;6yuI;D)-I?n+h`*eWdA0)oQ9Z z6@MKglJgb#V>|FKkl#-5&wPdXueJZS|2O#ukv~@j_Wq_$my@UdkiFUeN3u6hndi@3 z>%V^goBS>QLFiY~e{Ubqq2<5m5SlWC_-X%q4)t%p|JnZkXS=sb;s2N2{mlO+ci2CO zl9h8#G~xdub3_WcWC_P@X{R&kPmy7aM<8a?($<0|L3Ge3&z3;FFTKorC+uCdNPH z&rcoRSU)b=_fy!!{bW<&FE;VNvuXb?Z0vuq@&07vc`M@n6^^{ew-1pKL1r z#ipZ!i}ZOrIUr>l?Cc+xybbzuf66x4#gFxQy9PRc-v^ygcdXsbPZ`(;{fQ>Y=k@)K 
z&?y`Ogx!A;`lH$*?D023G#CpI_WVWY4;_cF*WU<{U^GD3`xl|#Uy~yJ$MC@Z>Zbjr zV)FLa$LVr&p{*m^ggIY7L)%5gwf6S)M zal)M{dy3OaX>{>U{n=;}AMc&^vtcK7@=kXefg?k-Uo7Vo7(+V^aJH~tGI*nxhel`94=lL@Gf>5Ka2Cgg^5EK**4z!2ptv!oT&Zv?p@+g zI!C^x0UpZ&Jc!VU_}0#fEO%Oe?8z(q-NA4zFz*}f2MAbKEyGyv)YK0 z-!TY>2#ACzh=u^fIOPWi@sI$CkOaw)0;!M&>5u`LkOjG*R8VRt4U`s22c?HHKpCM- zP-Z9#loiSb1wq-N98gXu7nB>y1LcMCLHVHqP(i2=R2V7(6@`jH#i0^VNvIT58Y%;o zg~~zYp$bq%s1j5essdGoszKGE8jicygla({P;ICVR2QlT)rT5DZm1#D2x<&9fto_i zpyp5us3jB%wSvN+a3}(54Mjq2ptg_)ih`n{c2Eoy3&lb4PI!v(d{B3&2hf+ zp>fc7XaY15ngmUTra)7nY0z|N1~e0z1@!DXalqn+5~Newm@5xoA=mK;Rx&&Q@u0U6zYtVJ*26Pj;1>J`3KzE^g(0%9u z^bmRkJ%*k@PoZbfbLa*15_$!_hTcGLp?A=G=mYc-`UHK3zCd50Z_sz>hd(pG2#mrQ zjKc&>!W2xy0A^qo=3pKcU=fyJ8CGBw)?ghrU=y}r7n}-C4X1(A!s+1ja0WOdoC(eh zXMwZA+29~JJDdZ~3Fm@y!+GGma6ULcTmUWz7lI4JMc|@vF}OHf0xk)cf=k0?;IeQz zxIA0|t_W9xE5lXbs&F;9I$XoqaB9M};1IYrTnDZT*MsZB4V*7I4dF&`W4H<26mAAL zhg-lc;ZV3090rHO5pZib5^e*xg*|W-91XXFW8hdg4vvT0!#3=N6W~NR3GM)Qgge2V z;Vy7jxEt()yTd);o^UU?H{1vA3-^Qj!vo-f@E~|FJOmyJ4}*upBjAznD0nnH1|AEK zgU7=Y;EC`gcrrW%o(fNcr^7SgneZ%lHarKO3(te+!wcYr@FI9IyaZkfFN2rEE8vyz zDtI-#23`xVgV)0w;EnJmcr&~O-U@Gnx5GQ&o$xMrH@pYl3-5#X!w2Al@FDmxd;~rU zAA^s>C*YItDfl#e20ja)gU`bk;EV7j_%eJ2z6xK1ufsRsoA538Hhc%Z3*Uq9!w=wx z@FVy!`~-dqKZBpcFW{H(EBH1127U{_gWtm+;E(Vp_%r+k{tADCzr#PA;CFTg1Vu0e zM+k&OD1=4;!XPZdAv_`=A|fF&q97`wAv$6pCSoBjBo&exNrR+C(jn=Q3`j;K6OtLp zf@DRqAwfuXBnOfc$%W)b@*sJUd`Nzz08$VsgcL@KAVra4NO7bDQW7bJlt#)RWs!17 zd87hT5vhb!MyeoHk!nbFqy`d<)I@3_AxLec4pJAXhtx+JAa0}~(gddX^li8ZIHHz2Z=(Wk#4o%0`XGIgen@|005T96gbYT8AVZO1$Z%u?G7=euj7G*FW07&l zcw_=H5t)QcMy4QBk!i?uWCk)5nT5~0CEsHgd9eWAV-m7 z$Z_NZauPX(oJP(dXOVNrdE^3e5xIn1My?=Nk!#3xx%LUW^e(7b3qG(TDZEr=FE3!_EQqG&O+I9dWNiIzf3qh-*tXgRbz zS^=$yRzfSIRnV$vHMBZf0}V!NqP5Tvv^H7?t&7$}>!S@&H`)+wgf>Q-piR+cXmhj$ z+7b;#TcKfSI2wVrMkCQSXj{~SMxoJYJ2VE3MdQ$Tv^{F0UNiwsM3c}CXh*aY+8OPF zc163PKD0a91MP|SLVKfq(7tFtv_Cok9f%G>2ctvKq3AGlI649yiH<@?qhrvq=s0vd zIsu)CPC_T6Q_!jCG;}&T1D%P^LT96M(7EV5bUwNOU5GA17o$tirRXwrIl2N}iLOFd zqifK$=sI*gx&hsYZbCPsThOiOHgr3>1Ko-4LU*Hk(7otBbU%6kJ%}Dc52HuWqv$d8 zIC=s-iJn4Fqi4{w=sEN}dI7zNUP3RUSJ12IHS{`q1HFmfLT{sY(7Wh8^gj9keTY6n zAEQsur|2{EIr;*9iM~Q#qi@i+=sWa1`T_ljenLN^U(m1UH}pID!`b*@48c$g!*Gni zNQ}a03}6h#VjRX}0w!V-CSwYwVj8An24-Rw=E728sj)OzS}Yxw9?O7b#4=%-u`F0t zEE^VtWyf-0Ik8+=ZY&R$7t4p`#|mHtu|im3tO!;VD~1)vN?;|iQdnuM3|1B^hn2@F zU=^`SSY@mVRu!v;RmW;z!B|bK78Zil#_C{ov3gj2tO4f68e)yG##j@qDb@^Yj$g27wd=h#|B^nu|e2iYzQ_K8-@+XMqnecQP^l~3^o=UhmFT3 zU=y)P*ko)9HWizOO~+mVb*k)`CwiVlkZO3+CJF#8ZZfp;>7u$#J#|~fzu|wEl>eRU>C7V*k$Yrb``sZUB_-U>~th*k|ku_7(eveaC)a5Dw!Aj^Y@O;{;CP z6i(v+XK)tha2^+M5tncoS8x^Aa2+>r6Sr^|o(fNmr@_G1S;20SC43D1mY!L#Do z@E|-po&(Q`=fZR2dGNe=K0H5O056Ca!VBX?@S=Dzyf|J0FNv4JOXFqmvUoYXJYE5> zh*!cZ<5lpgcs0B_UIP!tYvQ%=5WF^C2d|6Q!|US>a5vr%Z-h6-o8V3HW_WYF1>O=5 z#arQFcsL$`x5gv!Hh5dygGb@fcso1>kHzEgc)UGs<6b-gPsEe(4tPhr6W$r`f_KHc z;Xb@O-UIK6_riPQeek|`KfFIa03V1C!Uy9+@S*rHd^kP=ABm5`N8@AgvG_QAJU#)R zh)=>N<5Tdd_%wVvJ_DbL&%$TpbMU$NJbXUB0AGkN!WZL9@TK@Nd^x@XUx}~6SL18& zwfH)GJ-z|oh;PC-<6H2p_%?hyz60Nh@4|QEd+@#ZK72oZ06&Ny!Vlv|@T2%K{5XCB zKZ&2hPvd9sv-mmuJbnSch+o1l<5%#j_%-}GegnUW-@cksLTJ^ViY0Dp)-!XM*L z@Td4Q{5k#te~G`sU*m7^xA;5!J^lgzh=0OA<6rQv_&5AJ{sV^ym_P`WzzCcm2$G-( zng9etumnf&gg}UdM973fsDwu7gh7~uMYxDmL~0@pk(NkDq$e^E8Hr3pW+DrbmB>Z} z5!s0xL{1_Xk(3PeSs z5>c6`LR2NH5!HzrL@-g4s6~VjwTU`JU7{XQpJ+h1iH1ZYqA}5gXi79AniDOEmP9Dg ziU=dZi3p-K5lOTm+7ccjiijrK5ivw85l6%m?FpOk5(z{ikwkPLIuf0T&O{fYE76Vc z5#5O%L{Fj@(VOT)^d5C~C592hi4nv|ViYl&7(wAC6*D(i50|3VimEPSVOEO z))DK84a7!b6S0}tLTn|r5!;C!#7<%tv76XK>?QUQ`-ua@LE;c`m^eZlC5{oti4(+0 z;uLY3I76Hz&JpK{3&cg@5^^$kJpPvMgDSEKgP-E0UGS 
z%48L?Dp`%JPSzlU$(m#>GK8#6)*9$tGk|vKiT&Y(cgpL&;WT z7#U7Rkgdr`vJKgm^pH_xG}(@fA!ErnGM;Qt+N76EAQQ_m1ZyO3SUZlsUw zPWB*slD){@WFN9G*^lf`4j>1TgUG?;5OOFvj2upmAV-p;$kF5&ax6KH98XRlCz6xM z$>bDrDmjguPR<}_lC#L!~5^^cIj9gBxAXk#B$kpT;axJ-z zTu*KwH@-6v}d{2HLKa!uw&*T^KEBTH5PW~Vv3Z@VWr7#Mo2#TaAilzX? zP%On!JS9*fB~db^P%5QSI%QBMWl=6F6_uJwL#3tCQR%4+R7NTjm6^&yWu>xFK~#1s z2bGh`MdhaQPGh-yqVp_)?7sOD4)swEXlwW7kP za4LdoO+`{|sJ4`cilU;ac2o=%OT|&~RC~&%yi@{}NF`AnsE$-8sx#Gv>PmH^d{lR; z2i246MfIloP<^R>RDWs!HIN!a4W@=rL#biZaB2iKk{U&grp8cXsd3bJY63NpnnX>e zrchIP;;qy)O>0IwUAmwEvA-GOQ~hla%u&&l3GQrrq)nvsddzP zY6G>A+C*)pwoqHCZPa#Z2ep&hMeU~cPH>926dCVMctI3zW`b2%EzEEGOZ`60{2L;hEjnF8K(Kt=eBu&vY4QPgD zX^!S;ffi|rmT84nX^qxtgEnc4cG0Qm)N~p;EuD@|PiLSr(wXSYbQU@*osABnv(q`~ zoOCWaH=T#hOXs8W(*@{)bRoJhU4$-57o&^QCFqiLDY`UWhAvB&qs!A3=!$eDx-wmb zu1Z&b{({<>&bUnI0-GFw}4e3U7W4a05lx{{hr(4i1=}@{A9Y%-K z5p-)hl5Rt{r9E^M9Zk2RW9V2qj*h3>(>Cp;6X--biS9smq&v}_=`M6vx*P4IyVE`B zo^&s|H{FNsOZTJu(*x*%^dNdLJ%k=g52J_EBj}OzD0(zKh8|0gqsP+|=!x_sdNMtQ zo=Q)nr_(d≠4rHa&-)OV6X{(+lW@^dfpOy@Xy$FQb>!E9jNh zhCWN5qtDY9=!^6v`Z9fmzDi%CuhTc^oAfREHhqV_OW&jK(+}u}^dtH){e*r>Kck=1 zFX)%_EBZD4hJH)Gqu<5 zXa{0IEQkZ~!1?z*-~|aF5hQ^Qpd;u6I)g5tE9eG%pgZURdV*e{H|PWUf_|Vs7yt%> zL0~W#0)~QNU^o~7MuJgbG#CTMf^lFxm;fe%NnkRV0;Ym#U^NPHFyKwf_LCO_y9hFPvA570=|N8;5+yM zAO>a-24ye?X9$L5D28SL!!RtvF+3wMA|o*}qcAF?F*;)~CSx%!CKZ#KNyDUN(lP0o z3`|BQ6O)WtnnJd8PtWk*UN~W~wk%nQBaRrUnzt)MRQgAxv$i4pWz@$JA#UFm9$H(}-!z zG+~-D&6ws)3#KI#%Cus_m~bY7Y0X43ZJ4%BaPB`Y?T&eoTL605gyo#0+MJFhiMP%y4D|Gm;s_ zjAq6#W0`TxcxD1Kk(tCyW~MMxnQ6>)W(G5pnZ?Xz<}h=adCYuf0ke=<#4Ki(FiV+b z%yMQ0vyxfGtY+3QYngS-dS(N&k=ev-X0|X}nQhE=W(TvA*~RQ;_Aq;yeawF50CSKz z#2jXhFh`kV%yH%fbCNm5oMz52XPI-%dFBFhk-5ZNX09+-nQP2-<_2?`YX1*|AnQzQ@<_80@ zFpID#i?KLMup~>dGz(aUWm%5pS%DQQ7*wkzqHZ7ZuP0wav zGqRc3%xo4mE1QiCVzaY3*qm%GHaDAx&CBLv^Rormf@~qSFk6Hz$`)gbvnAM)Y$>)h zTZS#mmSfAa71)YwCAKnKg{{g~W2>_@*kHCMTZ;{0YqNFOx@?n3L zJBA(0j$_BO6WEFDBz7`8g`LVyW2dt-*qQ7sb~ZbQoy*Q+=d%mgh3q1BF}s9a$}VG< zvn$w@>?(FOyM|rMu4C7;8`zEPCU!Hsh26?-W4E(A*q!Vyb~n3+-OKJ{_p=AsgX|&p zFnfeO${u5nvnSY->?!s%dxkyBo@39m7ubvJCH69Vg}usNW3RI}*qiJv_BMNmz02NX z@3RlshwLNvG5dsl%06SCvoF|}>?`&)`-XkXzGL6BAJ~uVC-yV@h5gEYW52UMScro; zghM%u!#RQ@If|n>z%d-laU9PHoXAO>%qg78X`Id(oXJ_7i%Z3&=F)I!xpZ85E(4d5 z%fw~ovT#|sY+MkRoy)=HT&hC2ArE~$Ti{`b4|FW zTr;jY*Me)wg>tR9FfN>n;97H$TpO+}=i#EbXs#U>!^LuOTs+sFvpFx9z$J1?TnDZr z*NN-Qb>X^l-8diDo$JB%N*8^8_Z262PAA>2@I7&n|7!Hwibaih60 z+*ocLH=dioP2?tVlesC}RBjqKotweU}4snOM zBivE$7bzUF0rtm$@t4Rqh&hox8!^84j-r`++Dn2!zhEL0<6oe zJ~N+%&&p@xgZS)x4n8NJi_gvH;q&tO`22hUz93(SFU%L=i}JZ{fG{+xYGL4t^)Ui{H)f;rH_U`2G9={vdydKg=KD zkMhU(IfYz8ZXu75SI8&i7YYakg+fAMp@>jaC?*saN(d!|QbK8=j8Il6 zCzKZ|2o;4&LS>F@3ZX(PAxsDtB81jLq|ioaD|mz`AzElB#0arMoDeUx7i_^RBnXK@lF&iuD0C7! z3tfb+LN~!DbQgLEJ%wIEZ=sLSSLi477X}Ceg+an#VTdqP7$yuCMhGK?QNn0pj4)Oh zCyW;+2or@#!en8JFjbf)Oc!PdGlf~gY+;TtSC}Wv7ZwN$g+;<*VTrI*SSBnNRtPJF zRl;gvjj&c&C#)AX2pfe>!e(KMuvOS5Y!`M2JB3}sZefqGSJ)@)7Y+yqg+sz&;fQcl zI3^qyP6#K3Q^INCjBr*sC!7~92p5G*!e!x#a84KQkO+&2h>Dnq zi-bsult_y}WJFfvL|zm`QItekR76$OL|rsQQ?x{vm`Y47rV-PM>BRJ61~H?UNz5!} z5wnWf#2_)dm_y7d<`Q#@dBnV8J~6*oKrAR05(|q(#G+y`vA9@5EGd=}ON(X1vSK;0 zyjVf3C{_|Hi&ey`Vl}b4SVIgJYl^kR5V5vcN31K>6YGl&M7P*bY$P@on}|)tW@2-( zh1gOI6`vEn#! 
zyf{IeC{7Y5i&Mm@;xuu(I76H%&Jt&fbHusgJaN9bKwKy;5*Le0#HHdgak;ocTq&*+ zSBq=Jwc7v*J1Nym&#pC|(jTi&w;};x+NQctgA?-V$$%cf`BmJ@LNyKzt}Z z5+93C#HZpj@wxayd?~&XUyEN!g_wQcfwClv~Op<(2YD`K1Cl2lo$B2|^DN!6tqQm|B0swIU;wWT^zU8$Z_Uuqz^rG`=?sj<{VYAQ98noBLD zmQtwHN(z(0r3k6D6e+cl+DaZNN{W`+NikBa6eqMN!_I$QctOu)LZH!^_BWb{iOlYKxvRPSQ;V?m4->fr4iCdX_Pct8Y7LB#!2I) z3DQJqk~CSGB2AU1Nz6CO@IwPHx&PnH`3(`gDl5|bX&S3-IeZ1_oWBYL+O$9 zSb8Eom7Yn@r5Dmm>6P?adLzA+-bwGJ57I~Jlk{2oB7K#&v8 zHaSSnF6WSQ%DLp+avnLaoKMa#7my3eh2+9=5xJ;bOfD{$kW0#?e!g$gSl_xsBXb_Q+9kwA@aPkz?gJIbLor+p?av!;`+)wT=50D4SgXF>T5P7IPOdc+ekVnd+%CqFz@*H`tJWrl4FOV0?i{!=f5_ze-OkOUpkXOp9D5|0~-4m2ygXrGipX zsiag^swh>JYD#sbh7zpQRB9<9N^PZ%Qdg;`)K?lPZl$5pNNKDzQJN~vl;%ncrKJ+8 zv{J&9a3w-%twbtql(vdTiBh7Kc1nyAtHdesN_)jtyh?(Ss3a*Jl#WU#rL)pS>8f;7 zd`fqvhtgB&rSw+%D1DWFN`Ga5GEf<$3|59HLzQ95aAkxtQW>R;R>mk}m2t{=Wr8wM znWRisrYKXDY07kEhB8x`rOa05D07v0%6w&kvQSy1ELN5%OO<8Ha%F|GQdy;}R@NwM zm37K`WrMO&*`#b%wkTVbZOV3Khq6=IrR-MrD0`KC%6{d5a!@&>99E7fN0npBapi<^ zQaPoZR?aAAm2=8@<$`iixujfHt|(WPYsz)yhH_K6rQBBTD0h{6%6;X5@=$rCJXW44 zPnBoNbLEBdQhBAkR^BLYm3PW}<%9B3`J{YSz9?UnZ_0P&hXSdvim0fHskln0q)Mr@ z3RFgARZitqK^0X=l~qMmRZZ1ZLp4=Pb*ZV;)M^?vt(s0vuVzp)s+rWxY8ExCnoSK- zv#UAOoN6vLx0*-ItL9Vls|D18Y9Y0-T0||X7E_C>CDf8?DYdj(MlGwBQ_HIr)QV~) zwX#}8t*Ta2tE)BCV6~=NOAS$Lt98`6YCW~S+CX)y4b?_!W3`FeRBfgL7KnIz%0+4pWD#Bh-=VD0Q?tMjfk;Q^%_l)QRdO zb+S4|ovKb#r>is6nd&TcwmL_htIkvBs|(bH>LPWqxoAQE7XLK;8dPF^{9#fC2C)AVb zDfP5^Mm?*ZQ_rgx)QjpR^|E?Jy{cYQud6rIo9Zp~wt7dstKL)Zs}IzN>Lc~B`b2%I zK2x8oFVvUnEA_SdMt!TkQ{Sr})Q{>X^|Sg#{i=RbzpFn~NP{&*Lp4mpH9{jbN~1NP zF&e9J8m|eOs7acvDVnNjnywj|sacv!OQogO(r9V5bXs~XgO*Xtq-EB!Xj!#vT9B4q z%c14ea%s7>JX&5YpO#-MpcT{#X@#{ST2ZZ-R$MEgmDEaUrL{6zS*@H_UaO#0)GBF} zwJKUwt(sO{tDyyJHMLq=h*n#xqt(^wY4x=Rnp!J13dTG72K3ZR`pVnU+pbgXpX@j*P+E8tnHe4H_jnqbIqqQ;GSZ$m(UYnpz)Fx?@ zwJF+EZJIV+o1x9rW@)pvIoe!po;F`wpe@uEX^XWb+EQ(qwp?4Gt<+X&tF<-ST5X-S zUfZB;)HZ3GwJq9KZJV}T+oA2$c4@n{J=$JvpSE8+pdHi>X@|8V+EMM8c3eB5ozzZg zr?oTMS?!#5Ub~=O)Gle4wJX|H?V5I7yP@6GZfUo*JKA0Co_1e*pgq(cX^*uh+EeYB z_FQ|Rz0_W5ueCSYTkW0pUi+YZ)IMpSwJ+LN?VI*p`=LQPtRp(AV>+%AI;m4StplCW zS)J2)UC>2c(q&!IRbA6{-Ox?l(p`EgJ++=jPphZX)9V@ZjCv+Lvz|rIs%O)K^z3>L zJ*S>a&#mXt^XmEZ{CWYspk7EXtQXOX>c#ZpdI`OxUP>>mm(k1W<@EA;1-+tPNw2I| z(W~m!^y+#IJy@@)*V04u+Ik(mu3k^CuQ$-$dPBXD-dJykJIDz_PVWm^#napPtrT+9raFnXT6KwRqv+z^zM2O zy{Fzw@2&UI`|ADl{`vrYpgu?+tPjzL>cjNm`UribK1v_0kI~2KdW-y`U-uezDi%MuhG})>-6>d z27RNxN#Cq*(YNZ`^zHf%eW$)l->vV__v-uf{rUm@pnga{tRK;j>c{ls`U(A{eo8;B zpV80i=k)XX1^uFaNx!UL(XZ;)^y~T!{ic3PzpdZV@9OvT`}za@q5epJtUuA8>d*A& z`V0M~{z`wXztP|7@AUWj2mPb|N&l>W(ZA~7^zZr)9Wr18F;D|DaDy;NgED9X7>vOh zoWUD{AsUh)8;YSCnxPwpVH%dKJv6dPaStf#Eh98jXy`MiZl{(adOWv@lv4p++ks%m_Ck zjMhe^(Z*P!b(ZT3wbTT>{U5u_qH^XOiH+mR7 zjb27?qmR+o=x6jd1{ed4LB?QXh%wX{W(+q*7$c2Q#%N=VG1eGoj5j726OBp6WMhgk z)tF{XH)a?!jakNQV~#P`m}ks478nbSMaE)diLumJW-K>W7%Poc#%g1YvDR2;tT#3o z8;woIW@C%7)!1fiH+C31ja|lWV~?@d*k|lF4j2cGL&jm_h;h_7W*j$87$=QW#%be> zan?9zoHs5Q7mZ8CW#fu*)wpI{H*Od=ja$ZTcvzpn=ATzs}!^~;s zGIN`G%)DkkGrw8DENB)o3!6pEqGmC(xLLw1X_hien`O+hW;wIGS;4GmRx&G_Rm`ep zHM6=|!wfcSnzhUjv$k2stZUXY>zfTsx7pBaWHvUNm`%-QW^=QJ+0qO(TbW^IxEW!# zHY3e8W?R!^Mw!uOJ2S?NHRH^9v%P7XUNgZ=G?UB@W=FG=+1c!3b~U@1KC`>o!|ZAH zGJBhS%)Vwnv%fjO9B2+Q2b)98q2@4ixH-ZcX^t{Sn`6we<~Vb_Il-K0PBJH(Q_QL6 zG;_K+!<=c(GH07}%(><~bH2I2Txc#b7n@7WrRFkoxw*nzX|6I?n`_Lq<~nn|xxw6M zZZbEUTg`p8lvUa)W0keaS>>$?Rz<6lRoSXyRkf;F z)vX#`XR;bm=3bVqk2&=Uf zX|=K1S{^IPiniKWF;=Vvq<)yUNWI$|BQj#tq0aa>yh=?dSX4bo>|YW7uHMb zmG#->n}OG+(o!Z7v-W|z{R*&7w6(# zf=hHsF4?8HRF~$`U53kaSuU3=l`FL?jVrAyoh!X7gDayelPj|;iz}-un=8nb-Ic?Y z)0NAW+m*+a*Okwe-&Md>&{fD)*j2<;)K$z?+*QI=(pAb;+EvC?)>Y0`-c`X>(N)P+ 
z*;U0=)m6<^-BrUC?5gRi<@&!^yQg4FmM&bwZQEVSs;tUfZQHhO+qS*hwr$(CZQC|Z z@BP2~=G>k$Zk`-9Bj!~_jhZndzEQ>~Ym_s}8x@R-MkS-NQN^fgR5Pj@HH?}@Eu*$k z$Ea)6GwK@+jD|)dqp{J%XlgVwnj0;QmPRY1wb90CYqT@k8y$>}Mkk}Q(Z%R$bThge zJ&c}4FQd27$LMSHGx{3?jDf}=W3VyA7-|eNLXF|Z2xFu%${1~oF~%C>jPb?BbCWrZLNyZOk#|8uN_##sXuZvB+3#EHRcE%Z%m53S*_Q%2;izG1eOE zjP=F_W23Rj*lcVuwi?@v?Zyscr?Jb}ZR|1j8vBg>#sTA?amYAq95Id>$Bg5~3FD-3 z$~bMDG0qz2jPu3?&6Y^rg6)-ZQL>L8uyI*#slM_@yK{=JTaac z&y45B3*)8n%6M(OG2R;QjQ7R|D(_-uSJz8c?*@5T?~r}4}9ZTvC9m|@LuW_UA# z8PSYnMmD3EQO#&(bTft-(~M=tHshFa&3I;fGl7}VOk^fDlbA_O!!%9Hv`xnprfYhp zG<`ELm8s1TGntv(Okt)pQ<zzsncd7`<}`Dexy?Lg zUNfJW-z;DjGz*!9%_3$|vzS@jEMb;3OPQt3GGvGAS>3E* z)--FGwaq$aU9+B9-)vwuG#i(V0JV+nVrop zW>>SD+1>16_B4B$z0E#mU$dXt-yC2LGzXc3%^~JcbC?-w4mU@bBh69fXmgA?)*NS! zHz$}A%}M5DbBa0DoMuipXP7h1S>|kWjyczyXU;blm}XYMx-m